diff options
Diffstat (limited to 'tools/assistant/index.cpp')
-rw-r--r-- | tools/assistant/index.cpp | 526 |
1 files changed, 526 insertions, 0 deletions
diff --git a/tools/assistant/index.cpp b/tools/assistant/index.cpp new file mode 100644 index 000000000..e542b627e --- /dev/null +++ b/tools/assistant/index.cpp @@ -0,0 +1,526 @@ +/********************************************************************** +** Copyright (C) 2000-2008 Trolltech ASA. All rights reserved. +** +** This file is part of the TQt Assistant. +** +** This file may be used under the terms of the GNU General +** Public License versions 2.0 or 3.0 as published by the Free +** Software Foundation and appearing in the files LICENSE.GPL2 +** and LICENSE.GPL3 included in the packaging of this file. +** Alternatively you may (at your option) use any later version +** of the GNU General Public License if such license has been +** publicly approved by Trolltech ASA (or its successors, if any) +** and the KDE Free TQt Foundation. +** +** Please review the following information to ensure GNU General +** Public Licensing retquirements will be met: +** http://trolltech.com/products/qt/licenses/licensing/opensource/. +** If you are unsure which license is appropriate for your use, please +** review the following information: +** http://trolltech.com/products/qt/licenses/licensing/licensingoverview +** or contact the sales department at sales@trolltech.com. +** +** Licensees holding valid TQt Commercial licenses may use this file in +** accordance with the TQt Commercial License Agreement provided with +** the Software. +** +** This file is provided "AS IS" with NO WARRANTY OF ANY KIND, +** INCLUDING THE WARRANTIES OF DESIGN, MERCHANTABILITY AND FITNESS FOR +** A PARTICULAR PURPOSE. Trolltech reserves all rights not granted +** herein. +** +**********************************************************************/ + +#include "index.h" + +#include <qfile.h> +#include <qdir.h> +#include <qstringlist.h> +#include <qdict.h> +#include <qapplication.h> + +#include <ctype.h> + +int TermList::compareItems( TQPtrCollection::Item i1, TQPtrCollection::Item i2 ) +{ + if( ( (Term*)i1 )->frequency == ( (Term*)i2 )->frequency ) + return 0; + if( ( (Term*)i1 )->frequency < ( (Term*)i2 )->frequency ) + return -1; + return 1; +} + +TQDataStream &operator>>( TQDataStream &s, Document &l ) +{ + s >> l.docNumber; + s >> l.frequency; + return s; +} + +TQDataStream &operator<<( TQDataStream &s, const Document &l ) +{ + s << (Q_INT16)l.docNumber; + s << (Q_INT16)l.frequency; + return s; +} + +Index::Index( const TQString &dp, const TQString &hp ) + : TQObject( 0, 0 ), dict( 8999 ), docPath( dp ) +{ + alreadyHaveDocList = FALSE; + lastWindowClosed = FALSE; + connect( qApp, SIGNAL( lastWindowClosed() ), + this, SLOT( setLastWinClosed() ) ); +} + +Index::Index( const TQStringList &dl, const TQString &hp ) + : TQObject( 0, 0 ), dict( 8999 ) +{ + docList = dl; + alreadyHaveDocList = TRUE; + lastWindowClosed = FALSE; + connect( qApp, SIGNAL( lastWindowClosed() ), + this, SLOT( setLastWinClosed() ) ); +} + +void Index::setLastWinClosed() +{ + lastWindowClosed = TRUE; +} + +void Index::setDictionaryFile( const TQString &f ) +{ + dictFile = f; +} + +void Index::setDocListFile( const TQString &f ) +{ + docListFile = f; +} + +void Index::setDocList( const TQStringList &lst ) +{ + docList = lst; +} + +int Index::makeIndex() +{ + if ( !alreadyHaveDocList ) + setupDocumentList(); + if ( docList.isEmpty() ) + return 1; + TQStringList::Iterator it = docList.begin(); + int steps = docList.count() / 100; + if ( !steps ) + steps++; + int prog = 0; + for ( int i = 0; it != docList.end(); ++it, ++i ) { + if ( lastWindowClosed ) { + return -1; + } + parseDocument( *it, i ); + if ( i%steps == 0 ) { + prog++; + emit indexingProgress( prog ); + } + } + return 0; +} + +void Index::setupDocumentList() +{ + TQDir d( docPath ); + TQStringList lst = d.entryList( "*.html" ); + TQStringList::ConstIterator it = lst.begin(); + for ( ; it != lst.end(); ++it ) + docList.append( docPath + "/" + *it ); +} + +void Index::insertInDict( const TQString &str, int docNum ) +{ + if ( strcmp( str, "amp" ) == 0 || strcmp( str, "nbsp" ) == 0 ) + return; + Entry *e = 0; + if ( dict.count() ) + e = dict[ str ]; + + if ( e ) { + if ( e->documents.first().docNumber != docNum ) + e->documents.prepend( Document( docNum, 1 ) ); + else + e->documents.first().frequency++; + } else { + dict.insert( str, new Entry( docNum ) ); + } +} + +void Index::parseDocument( const TQString &filename, int docNum ) +{ + TQFile file( filename ); + if ( !file.open( IO_ReadOnly ) ) { + qWarning( "can not open file " + filename ); + return; + } + + TQTextStream s( &file ); + TQString text = s.read(); + if (text.isNull()) + return; + + bool valid = TRUE; + const TQChar *buf = text.unicode(); + TQChar str[64]; + TQChar c = buf[0]; + int j = 0; + int i = 0; + while ( (uint)j < text.length() ) { + if ( c == '<' || c == '&' ) { + valid = FALSE; + if ( i > 1 ) + insertInDict( TQString(str,i), docNum ); + i = 0; + c = buf[++j]; + continue; + } + if ( ( c == '>' || c == ';' ) && !valid ) { + valid = TRUE; + c = buf[++j]; + continue; + } + if ( !valid ) { + c = buf[++j]; + continue; + } + if ( ( c.isLetterOrNumber() || c == '_' ) && i < 63 ) { + str[i] = c.lower(); + ++i; + } else { + if ( i > 1 ) + insertInDict( TQString(str,i), docNum ); + i = 0; + } + c = buf[++j]; + } + if ( i > 1 ) + insertInDict( TQString(str,i), docNum ); + file.close(); +} + +void Index::writeDict() +{ + TQDictIterator<Entry> it( dict ); + TQFile f( dictFile ); + if ( !f.open( IO_WriteOnly ) ) + return; + TQDataStream s( &f ); + for( ; it.current(); ++it ) { + Entry *e = it.current(); + s << it.currentKey(); + s << e->documents; + } + f.close(); + writeDocumentList(); +} + +void Index::writeDocumentList() +{ + TQFile f( docListFile ); + if ( !f.open( IO_WriteOnly ) ) + return; + TQDataStream s( &f ); + s << docList; +} + +void Index::readDict() +{ + TQFile f( dictFile ); + if ( !f.open( IO_ReadOnly ) ) + return; + + dict.clear(); + TQDataStream s( &f ); + TQString key; + TQValueList<Document> docs; + while ( !s.atEnd() ) { + s >> key; + s >> docs; + dict.insert( key, new Entry( docs ) ); + } + f.close(); + readDocumentList(); +} + +void Index::readDocumentList() +{ + TQFile f( docListFile ); + if ( !f.open( IO_ReadOnly ) ) + return; + TQDataStream s( &f ); + s >> docList; +} + +TQStringList Index::query( const TQStringList &terms, const TQStringList &termSeq, const TQStringList &seqWords ) +{ + TermList termList; + + TQStringList::ConstIterator it = terms.begin(); + for ( it = terms.begin(); it != terms.end(); ++it ) { + Entry *e = 0; + if ( (*it).contains( '*' ) ) { + TQValueList<Document> wcts = setupDummyTerm( getWildcardTerms( *it ) ); + termList.append( new Term( "dummy", wcts.count(), wcts ) ); + } else if ( dict[ *it ] ) { + e = dict[ *it ]; + termList.append( new Term( *it, e->documents.count(), e->documents ) ); + } else { + return TQStringList(); + } + } + termList.sort(); + + Term *minTerm = termList.first(); + if ( !termList.count() ) + return TQStringList(); + termList.removeFirst(); + + TQValueList<Document> minDocs = minTerm->documents; + TQValueList<Document>::iterator C; + TQValueList<Document>::ConstIterator It; + Term *t = termList.first(); + for ( ; t; t = termList.next() ) { + TQValueList<Document> docs = t->documents; + C = minDocs.begin(); + while ( C != minDocs.end() ) { + bool found = FALSE; + for ( It = docs.begin(); It != docs.end(); ++It ) { + if ( (*C).docNumber == (*It).docNumber ) { + (*C).frequency += (*It).frequency; + found = TRUE; + break; + } + } + if ( !found ) + C = minDocs.remove( C ); + else + ++C; + } + } + + TQStringList results; + qHeapSort( minDocs ); + if ( termSeq.isEmpty() ) { + for ( C = minDocs.begin(); C != minDocs.end(); ++C ) + results << docList[ (int)(*C).docNumber ]; + return results; + } + + TQString fileName; + for ( C = minDocs.begin(); C != minDocs.end(); ++C ) { + fileName = docList[ (int)(*C).docNumber ]; + if ( searchForPattern( termSeq, seqWords, fileName ) ) + results << fileName; + } + return results; +} + +TQString Index::getDocumentTitle( const TQString &fileName ) +{ + TQFile file( fileName ); + if ( !file.open( IO_ReadOnly ) ) { + qWarning( "cannot open file " + fileName ); + return fileName; + } + TQTextStream s( &file ); + TQString text = s.read(); + + int start = text.find( "<title>", 0, FALSE ) + 7; + int end = text.find( "</title>", 0, FALSE ); + + TQString title = ( end - start <= 0 ? tr("Untitled") : text.mid( start, end - start ) ); + return title; +} + +TQStringList Index::getWildcardTerms( const TQString &term ) +{ + TQStringList lst; + TQStringList terms = split( term ); + TQValueList<TQString>::iterator iter; + + TQDictIterator<Entry> it( dict ); + for( ; it.current(); ++it ) { + int index = 0; + bool found = FALSE; + TQString text( it.currentKey() ); + for ( iter = terms.begin(); iter != terms.end(); ++iter ) { + if ( *iter == "*" ) { + found = TRUE; + continue; + } + if ( iter == terms.begin() && (*iter)[0] != text[0] ) { + found = FALSE; + break; + } + index = text.find( *iter, index ); + if ( *iter == terms.last() && index != (int)text.length()-1 ) { + index = text.findRev( *iter ); + if ( index != (int)text.length() - (int)(*iter).length() ) { + found = FALSE; + break; + } + } + if ( index != -1 ) { + found = TRUE; + index += (*iter).length(); + continue; + } else { + found = FALSE; + break; + } + } + if ( found ) + lst << text; + } + + return lst; +} + +TQStringList Index::split( const TQString &str ) +{ + TQStringList lst; + int j = 0; + int i = str.find( '*', j ); + + while ( i != -1 ) { + if ( i > j && i <= (int)str.length() ) { + lst << str.mid( j, i - j ); + lst << "*"; + } + j = i + 1; + i = str.find( '*', j ); + } + + int l = str.length() - 1; + if ( str.mid( j, l - j + 1 ).length() > 0 ) + lst << str.mid( j, l - j + 1 ); + + return lst; +} + +TQValueList<Document> Index::setupDummyTerm( const TQStringList &terms ) +{ + TermList termList; + TQStringList::ConstIterator it = terms.begin(); + for ( ; it != terms.end(); ++it ) { + Entry *e = 0; + if ( dict[ *it ] ) { + e = dict[ *it ]; + termList.append( new Term( *it, e->documents.count(), e->documents ) ); + } + } + termList.sort(); + + TQValueList<Document> maxList; + + if ( !termList.count() ) + return maxList; + maxList = termList.last()->documents; + termList.removeLast(); + + TQValueList<Document>::iterator docIt; + Term *t = termList.first(); + while ( t ) { + TQValueList<Document> docs = t->documents; + for ( docIt = docs.begin(); docIt != docs.end(); ++docIt ) { + if ( maxList.findIndex( *docIt ) == -1 ) + maxList.append( *docIt ); + } + t = termList.next(); + } + return maxList; +} + +void Index::buildMiniDict( const TQString &str ) +{ + if ( miniDict[ str ] ) + miniDict[ str ]->positions.append( wordNum ); + ++wordNum; +} + +bool Index::searchForPattern( const TQStringList &patterns, const TQStringList &words, const TQString &fileName ) +{ + TQFile file( fileName ); + if ( !file.open( IO_ReadOnly ) ) { + qWarning( "cannot open file " + fileName ); + return FALSE; + } + + wordNum = 3; + miniDict.clear(); + TQStringList::ConstIterator cIt = words.begin(); + for ( ; cIt != words.end(); ++cIt ) + miniDict.insert( *cIt, new PosEntry( 0 ) ); + + TQTextStream s( &file ); + TQString text = s.read(); + bool valid = TRUE; + const TQChar *buf = text.unicode(); + TQChar str[64]; + TQChar c = buf[0]; + int j = 0; + int i = 0; + while ( (uint)j < text.length() ) { + if ( c == '<' || c == '&' ) { + valid = FALSE; + if ( i > 1 ) + buildMiniDict( TQString(str,i) ); + i = 0; + c = buf[++j]; + continue; + } + if ( ( c == '>' || c == ';' ) && !valid ) { + valid = TRUE; + c = buf[++j]; + continue; + } + if ( !valid ) { + c = buf[++j]; + continue; + } + if ( ( c.isLetterOrNumber() || c == '_' ) && i < 63 ) { + str[i] = c.lower(); + ++i; + } else { + if ( i > 1 ) + buildMiniDict( TQString(str,i) ); + i = 0; + } + c = buf[++j]; + } + if ( i > 1 ) + buildMiniDict( TQString(str,i) ); + file.close(); + + TQStringList::ConstIterator patIt = patterns.begin(); + TQStringList wordLst; + TQValueList<uint> a, b; + TQValueList<uint>::iterator aIt; + for ( ; patIt != patterns.end(); ++patIt ) { + wordLst = TQStringList::split( ' ', *patIt ); + a = miniDict[ wordLst[0] ]->positions; + for ( int j = 1; j < (int)wordLst.count(); ++j ) { + b = miniDict[ wordLst[j] ]->positions; + aIt = a.begin(); + while ( aIt != a.end() ) { + if ( b.find( *aIt + 1 ) != b.end() ) { + (*aIt)++; + ++aIt; + } else { + aIt = a.remove( aIt ); + } + } + } + } + if ( a.count() ) + return TRUE; + return FALSE; +} |