summary refs log tree commit diff stats
path: root/src/modules/help/index.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/modules/help/index.cpp')
-rwxr-xr-x src/modules/help/index.cpp 854
1 files changed, 854 insertions, 0 deletions
diff --git a/src/modules/help/index.cpp b/src/modules/help/index.cpp
new file mode 100755
index 00000000..659ff44c
--- /dev/null
+++ b/src/modules/help/index.cpp
@@ -0,0 +1,854 @@
+#include "index.h"
+
+#include "kvi_file.h"
+#include <qdir.h>
+#include <qstringlist.h>
+#include "kvi_pointerhashtable.h"
+#include <qapplication.h>
+#include <qtextstream.h>
+#include <ctype.h>
+
+
+// Three-way comparison of two terms by their frequency.
+// Returns -1 when p1 has the lower frequency, 1 when it has the higher one
+// and 0 when both frequencies are equal.
+int kvi_compare(const Term * p1,const Term * p2)
+{
+    if(p1->frequency < p2->frequency)
+        return -1;
+    return (p1->frequency == p2->frequency) ? 0 : 1;
+}
+
+// Reads a Document from the stream: first its docNumber, then its frequency
+// (the same order operator<< writes them in).
+QDataStream &operator>>( QDataStream &s, Document &l )
+{
+    s >> l.docNumber >> l.frequency;
+    return s;
+}
+
+// Writes a Document to the stream as two Q_INT16 values:
+// docNumber followed by frequency.
+QDataStream &operator<<( QDataStream &s, const Document &l )
+{
+    const Q_INT16 number = (Q_INT16)l.docNumber;
+    const Q_INT16 count = (Q_INT16)l.frequency;
+    s << number << count;
+    return s;
+}
+
+// Creates an index over the documents found in the directory dp.
+// The document list is not built here: makeIndex() collects it on demand
+// because alreadyHaveDocList is left FALSE.
+// The hp argument is not used by this constructor.
+Index::Index( const QString &dp, const QString &hp )
+    : QObject( 0, 0 ), dict( 8999 ), docPath( dp )
+{
+    // setLastWinClosed() raises the flag that lets makeIndex() abort
+    connect( qApp, SIGNAL( lastWindowClosed() ), this, SLOT( setLastWinClosed() ) );
+    lastWindowClosed = FALSE;
+    alreadyHaveDocList = FALSE;
+}
+
+
+
+// Creates an index over an explicit list of documents (dl).
+// alreadyHaveDocList is set, so makeIndex() will not rescan a directory.
+// The hp argument is not used by this constructor.
+Index::Index( const QStringList &dl, const QString &hp )
+    : QObject( 0, 0 ), dict( 8999 )
+{
+    // setLastWinClosed() raises the flag that lets makeIndex() abort
+    connect( qApp, SIGNAL( lastWindowClosed() ), this, SLOT( setLastWinClosed() ) );
+    lastWindowClosed = FALSE;
+    alreadyHaveDocList = TRUE;
+    docList = dl;
+}
+
+
+
+// Slot connected to qApp's lastWindowClosed() signal: records that the last
+// window has been closed so that makeIndex() can stop early.
+void Index::setLastWinClosed()
+{
+    lastWindowClosed = TRUE;
+}
+
+
+
+// Sets the path of the file used by writeDict() and readDict().
+void Index::setDictionaryFile( const QString &f )
+{
+    dictFile = f;
+}
+
+
+
+// Sets the path of the document list file; the titles are stored next to it
+// in a file with the additional ".titles" suffix.
+void Index::setDocListFile( const QString &f )
+{
+    docListFile = f;
+}
+
+
+
+// Rebuilds the dictionary by parsing every document in docList.
+// Returns 1 when there is nothing to index, -1 when indexing was interrupted
+// because the last window was closed, and 0 when all documents were parsed.
+// Emits indexingProgress() with an increasing counter while working.
+int Index::makeIndex()
+{
+    if ( !alreadyHaveDocList )
+        setupDocumentList();
+    if ( docList.isEmpty() )
+        return 1;
+
+    dict.clear();
+    QStringList::Iterator doc = docList.begin();
+
+    // Emit a progress notification once every "interval" documents:
+    // a hundredth of the list, but never less than one document.
+    int interval = docList.count() / 100;
+    if ( interval == 0 )
+        interval = 1;
+
+    int progress = 0;
+    int docNum = 0;
+    while ( doc != docList.end() ) {
+        if ( lastWindowClosed )
+            return -1;
+        parseDocument( *doc, docNum );
+        if ( docNum % interval == 0 )
+            emit indexingProgress( ++progress );
+        ++doc;
+        ++docNum;
+    }
+    return 0;
+}
+
+
+
+// Refills docList with the "*.html" files found in docPath and titleList
+// with the title of each of them (same order in both lists).
+void Index::setupDocumentList()
+{
+    docList.clear();
+    titleList.clear();
+
+    QDir dir( docPath );
+    QStringList htmlFiles = dir.entryList( "*.html" );
+    for ( QStringList::ConstIterator name = htmlFiles.begin(); name != htmlFiles.end(); ++name )
+    {
+        // entryList() gives bare file names: prepend the directory
+        QString fullPath = docPath + "/" + *name;
+        docList.append( fullPath );
+        titleList.append( getDocumentTitle( fullPath ) );
+    }
+}
+
+
+
+// Records one occurrence of the word str in document number docNum.
+//
+// The words "amp" and "nbsp" are never indexed (presumably because they are
+// left-overs of HTML entities - TODO confirm).
+// A word that is not in the dictionary yet gets a new Entry. Otherwise the
+// first Document of its entry is checked: when it refers to docNum its
+// frequency is incremented, else a new Document( docNum, 1 ) is prepended.
+void Index::insertInDict( const QString &str, int docNum )
+{
+    // Compare as QStrings instead of calling strcmp() on an implicit
+    // QString -> const char * conversion
+    if ( str == "amp" || str == "nbsp" )
+        return;
+
+    Entry *e = 0;
+    if ( dict.count() )
+        e = dict[ str ];
+
+    if ( !e ) {
+        dict.insert( str, new Entry( docNum ) );
+        return;
+    }
+
+    // Documents are parsed one after another, so only the most recently
+    // added Document (the first one) can belong to docNum
+    if ( e->documents.first().docNumber != docNum )
+        e->documents.prepend( Document( docNum, 1 ) );
+    else
+        e->documents.first().frequency++;
+}
+
+
+
+// Reads the file filename, splits its text into words and adds each of them
+// to the dictionary as occurring in document number docNum.
+//
+// Everything between '<' or '&' and the following '>' or ';' is skipped.
+// A word is a run of letters, digits and underscores, converted to lowercase;
+// words shorter than two characters are ignored and a word is cut once it
+// reaches 63 characters (the character that hits the limit is dropped).
+// When the file cannot be opened a warning is printed and nothing is indexed.
+void Index::parseDocument( const QString &filename, int docNum )
+{
+    KviFile file( filename );
+    if ( !file.openForReading() ) {
+        // The file name is passed as an argument: it must not be interpreted
+        // as a printf-style format string
+        qWarning( "can not open file %s", filename.ascii() );
+        return;
+    }
+    QTextStream s( &file );
+    QString text = s.read();
+    if ( text.isNull() )
+        return;
+
+    const QChar *buf = text.unicode();
+    const int textLen = (int)text.length();
+    QChar str[64];
+    int wordLen = 0;
+    bool inMarkup = FALSE;
+
+    // Each character is fetched only while pos is inside the buffer, so
+    // nothing is ever read past the end of the text
+    for ( int pos = 0; pos < textLen; ++pos ) {
+        const QChar c = buf[pos];
+
+        if ( c == '<' || c == '&' ) {
+            // A tag or an entity begins: flush the pending word
+            inMarkup = TRUE;
+            if ( wordLen > 1 )
+                insertInDict( QString(str,wordLen), docNum );
+            wordLen = 0;
+            continue;
+        }
+
+        if ( inMarkup ) {
+            if ( c == '>' || c == ';' )
+                inMarkup = FALSE;
+            continue;
+        }
+
+        if ( ( c.isLetterOrNumber() || c == '_' ) && wordLen < 63 ) {
+            str[wordLen] = c.lower();
+            ++wordLen;
+        } else {
+            if ( wordLen > 1 )
+                insertInDict( QString(str,wordLen), docNum );
+            wordLen = 0;
+        }
+    }
+
+    // The text may end in the middle of a word
+    if ( wordLen > 1 )
+        insertInDict( QString(str,wordLen), docNum );
+    file.close();
+}
+
+
+
+// Saves the dictionary to dictFile as a sequence of (word, document list)
+// pairs, then saves the document list by calling writeDocumentList().
+// Does nothing when dictFile cannot be opened for writing.
+void Index::writeDict()
+{
+    KviPointerHashTableIterator<QString,Entry> entry( dict );
+    KviFile out( dictFile );
+    if ( !out.openForWriting() )
+        return;
+
+    QDataStream stream( &out );
+    while ( entry.current() ) {
+        stream << entry.currentKey();
+        stream << entry.current()->documents;
+        ++entry;
+    }
+    out.close();
+
+    writeDocumentList();
+}
+
+
+
+// Saves docList to docListFile and titleList to docListFile + ".titles".
+// In both files the items are joined with the "[#item#]" separator.
+// Stops silently at the first file that cannot be opened for writing.
+void Index::writeDocumentList()
+{
+    //writing docs
+    KviFile docsFile( docListFile );
+    if ( !docsFile.openForWriting() )
+        return;
+    QTextStream docsStream( &docsFile );
+    docsStream << docList.join("[#item#]");
+
+    //writing titles
+    KviFile titlesFile( docListFile+".titles" );
+    if ( !titlesFile.openForWriting() )
+        return;
+    QTextStream titlesStream( &titlesFile );
+    titlesStream << titleList.join("[#item#]");
+}
+
+
+
+// Replaces the dictionary with the (word, document list) pairs stored in
+// dictFile, then loads the document list by calling readDocumentList().
+// Does nothing (the current dictionary is kept) when dictFile cannot be
+// opened for reading.
+void Index::readDict()
+{
+    KviFile in( dictFile );
+    if ( !in.openForReading() )
+        return;
+
+    dict.clear();
+
+    QDataStream stream( &in );
+    QString word;
+    KviValueList<Document> documents;
+    for ( ; !stream.atEnd(); ) {
+        stream >> word;
+        stream >> documents;
+        dict.insert( word, new Entry( documents ) );
+    }
+    in.close();
+
+    readDocumentList();
+}
+
+
+
+// Loads docList from docListFile and titleList from docListFile + ".titles";
+// both files hold their items separated by "[#item#]".
+// Stops silently at the first file that cannot be opened for reading.
+void Index::readDocumentList()
+{
+    //reading docs
+    KviFile docsFile( docListFile );
+    if ( !docsFile.openForReading() )
+        return;
+    QTextStream docsStream( &docsFile );
+    docList = QStringList::split("[#item#]",docsStream.read());
+
+    //reading titles
+    KviFile titlesFile( docListFile+".titles" );
+    if ( !titlesFile.openForReading() )
+        return;
+    QTextStream titlesStream( &titlesFile );
+    titleList = QStringList::split("[#item#]",titlesStream.read());
+// debug(titleList);
+}
+
+
+
+// Returns the file names of the documents that contain all of the given terms.
+//
+// terms    - the words to look for; a word containing '*' is expanded to
+//            every dictionary word that matches it
+// termSeq  - word sequences (phrases) that must also appear in the document;
+//            when empty no phrase check is done
+// seqWords - the single words the phrases are made of (handed over to
+//            searchForPattern())
+//
+// An empty list is returned as soon as a non-wildcard term is missing from
+// the dictionary, or when terms is empty.
+QStringList Index::query( const QStringList &terms, const QStringList &termSeq, const QStringList &seqWords )
+{
+    TermList termList;
+
+    for ( QStringList::ConstIterator it = terms.begin(); it != terms.end(); ++it ) {
+        if ( (*it).contains( '*' ) ) {
+            KviValueList<Document> wcts = setupDummyTerm( getWildcardTerms( *it ) );
+            termList.append( new Term( "dummy", wcts.count(), wcts ) );
+            continue;
+        }
+        // Single dictionary lookup per term
+        Entry *e = dict[ *it ];
+        if ( !e )
+            return QStringList();
+        termList.append( new Term( *it, e->documents.count(), e->documents ) );
+    }
+
+    if ( !termList.count() )
+        return QStringList();
+
+    termList.sort();
+
+    // Start from the documents of the first term after sorting.
+    // The list is copied BEFORE the term is removed: if the TermList owns its
+    // items, removeFirst() destroys the term and it must not be touched later.
+    KviValueList<Document> minDocs = termList.first()->documents;
+    termList.removeFirst();
+
+    // Intersect with the documents of every other term, accumulating the
+    // frequencies of the documents that survive
+    KviValueList<Document>::iterator C;
+    KviValueList<Document>::ConstIterator It;
+    for ( Term *t = termList.first(); t; t = termList.next() ) {
+        KviValueList<Document> docs = t->documents;
+        C = minDocs.begin();
+        while ( C != minDocs.end() ) {
+            bool found = FALSE;
+            for ( It = docs.begin(); It != docs.end(); ++It ) {
+                if ( (*C).docNumber == (*It).docNumber ) {
+                    (*C).frequency += (*It).frequency;
+                    found = TRUE;
+                    break;
+                }
+            }
+            if ( !found )
+                C = minDocs.remove( C );
+            else
+                ++C;
+        }
+    }
+
+    QStringList results;
+#ifndef COMPILE_USE_QT4
+    qHeapSort( minDocs );
+#endif
+
+    // Without phrases every surviving document is a hit, otherwise the
+    // document must also contain the requested word sequences
+    const bool checkPhrases = !termSeq.isEmpty();
+    QString fileName;
+    for ( C = minDocs.begin(); C != minDocs.end(); ++C ) {
+        fileName = docList[ (int)(*C).docNumber ];
+        if ( !checkPhrases || searchForPattern( termSeq, seqWords, fileName ) )
+            results << fileName;
+    }
+    return results;
+}
+
+
+
+// Returns the text found between "<title>" and "</title>" (searched case
+// insensitively) in the file fileName.
+// Returns the translated string "Untitled" when the document has no usable
+// title, and fileName itself when the file cannot be opened.
+QString Index::getDocumentTitle( const QString &fileName )
+{
+    KviFile file( fileName );
+    if ( !file.openForReading() ) {
+        // The file name is passed as an argument: it must not be interpreted
+        // as a printf-style format string
+        qWarning( "cannot open file %s", fileName.ascii() );
+        return fileName;
+    }
+    QTextStream s( &file );
+    QString text = s.read();
+
+    // Without an opening tag there is no title: do not compute a bogus
+    // start offset from the -1 returned by find()
+    const int tagPos = text.find( "<title>", 0, FALSE );
+    if ( tagPos == -1 )
+        return tr("Untitled");
+
+    const int start = tagPos + 7; // 7 == length of "<title>"
+    const int end = text.find( "</title>", 0, FALSE );
+
+    // Covers a missing closing tag (end == -1), a closing tag that precedes
+    // the opening one and an empty title
+    if ( end - start <= 0 )
+        return tr("Untitled");
+    return text.mid( start, end - start );
+}
+
+
+
+// Returns the dictionary words that match the wildcard expression term.
+// The expression is broken by split() into literal parts and "*" items; each
+// dictionary word is then checked against the parts from left to right.
+QStringList Index::getWildcardTerms( const QString &term )
+{
+    QStringList matches;
+    QStringList parts = split( term );
+#ifdef COMPILE_USE_QT4
+    QStringList::Iterator part;
+#else
+    KviValueList<QString>::iterator part;
+#endif
+
+    KviPointerHashTableIterator<QString,Entry> entry( dict );
+    while ( entry.current() ) {
+        QString word( entry.currentKey() );
+        int pos = 0;
+        bool matched = FALSE;
+
+        for ( part = parts.begin(); part != parts.end(); ++part ) {
+            if ( *part == "*" ) {
+                matched = TRUE;
+                continue;
+            }
+
+            // A literal part: the word is rejected unless the checks below pass
+            matched = FALSE;
+
+            // The first part has to begin with the first character of the word
+            if ( part == parts.begin() && (*part)[0] != word[0] )
+                break;
+
+            pos = word.find( *part, pos );
+
+            // A part equal to the last one is searched from the end of the
+            // word and has to be located exactly at its end
+            if ( *part == parts.last() && pos != (int)word.length()-1 ) {
+                pos = word.findRev( *part );
+                if ( pos != (int)word.length() - (int)(*part).length() )
+                    break;
+            }
+
+            if ( pos == -1 )
+                break;
+
+            // Found: go on searching after this part
+            matched = TRUE;
+            pos += (*part).length();
+        }
+
+        if ( matched )
+            matches << word;
+        ++entry;
+    }
+
+    return matches;
+}
+
+
+
+// Breaks a wildcard expression into its pieces.
+// Every non-empty text piece that precedes a '*' is added to the result
+// followed by a "*" item; a '*' with no text before it adds nothing.
+// The text that follows the last '*' (if any) is added as the last item.
+QStringList Index::split( const QString &str )
+{
+    QStringList pieces;
+    int from = 0;
+
+    for ( int star = str.find( '*', from ); star != -1; star = str.find( '*', from ) ) {
+        if ( star > from ) {
+            pieces << str.mid( from, star - from );
+            pieces << "*";
+        }
+        from = star + 1;
+    }
+
+    // Whatever is left after the last '*'
+    const QString tail = str.mid( from, (int)str.length() - from );
+    if ( tail.length() > 0 )
+        pieces << tail;
+
+    return pieces;
+}
+
+
+
+// Builds the document list of a wildcard term: the union of the documents of
+// all the given words that are present in the dictionary.
+// Returns an empty list when none of the words is in the dictionary.
+KviValueList<Document> Index::setupDummyTerm( const QStringList &terms )
+{
+    TermList termList;
+    for ( QStringList::ConstIterator word = terms.begin(); word != terms.end(); ++word ) {
+        Entry *e = dict[ *word ];
+        if ( e )
+            termList.append( new Term( *word, e->documents.count(), e->documents ) );
+    }
+    termList.sort();
+
+    KviValueList<Document> maxList;
+    if ( !termList.count() )
+        return maxList;
+
+    // Start from the documents of the last term after sorting...
+    maxList = termList.last()->documents;
+    termList.removeLast();
+
+    // ...and add the documents of the other terms that are not listed yet
+    KviValueList<Document>::iterator docIt;
+    for ( Term *t = termList.first(); t; t = termList.next() ) {
+        KviValueList<Document> docs = t->documents;
+        for ( docIt = docs.begin(); docIt != docs.end(); ++docIt ) {
+            if ( maxList.findIndex( *docIt ) == -1 )
+                maxList.append( *docIt );
+        }
+    }
+    return maxList;
+}
+
+
+
+// Called for every word of the document scanned by searchForPattern().
+// When the word is one of those stored in miniDict the current word number
+// is appended to its positions; the word counter is advanced in any case.
+void Index::buildMiniDict( const QString &str )
+{
+    PosEntry *entry = miniDict[ str ];
+    if ( entry )
+        entry->positions.append( wordNum );
+    ++wordNum;
+}
+
+
+
+// Checks whether the file fileName contains a given sequence of words.
+//
+// patterns - the word sequences to look for, each one a space separated list
+//            of words
+// words    - the single words the sequences are made of; only the positions
+//            of these words are recorded while the file is scanned
+// fileName - the document to scan
+//
+// The file is tokenized in the same way as in parseDocument(): text between
+// '<' or '&' and the following '>' or ';' is skipped, words are lowercased
+// runs of letters, digits and underscores, at least two characters long.
+// Returns FALSE when the file cannot be opened.
+//
+// NOTE(review): the candidate positions are reset for every pattern, so the
+// returned value depends on the last pattern only - confirm this is intended.
+bool Index::searchForPattern( const QStringList &patterns, const QStringList &words, const QString &fileName )
+{
+    KviFile file( fileName );
+    if ( !file.openForReading() ) {
+        // The file name is passed as an argument: it must not be interpreted
+        // as a printf-style format string
+        qWarning( "cannot open file %s", fileName.ascii() );
+        return FALSE;
+    }
+
+    // Word numbering starts from 3 (reason not visible here - TODO confirm)
+    wordNum = 3;
+    miniDict.clear();
+    QStringList::ConstIterator cIt = words.begin();
+    for ( ; cIt != words.end(); ++cIt )
+        miniDict.insert( *cIt, new PosEntry( 0 ) );
+
+    QTextStream s( &file );
+    QString text = s.read();
+
+    const QChar *buf = text.unicode();
+    const int textLen = (int)text.length();
+    QChar str[64];
+    int wordLen = 0;
+    bool inMarkup = FALSE;
+
+    // Each character is fetched only while pos is inside the buffer, so
+    // nothing is ever read past the end of the text
+    for ( int pos = 0; pos < textLen; ++pos ) {
+        const QChar c = buf[pos];
+
+        if ( c == '<' || c == '&' ) {
+            // A tag or an entity begins: flush the pending word
+            inMarkup = TRUE;
+            if ( wordLen > 1 )
+                buildMiniDict( QString(str,wordLen) );
+            wordLen = 0;
+            continue;
+        }
+
+        if ( inMarkup ) {
+            if ( c == '>' || c == ';' )
+                inMarkup = FALSE;
+            continue;
+        }
+
+        if ( ( c.isLetterOrNumber() || c == '_' ) && wordLen < 63 ) {
+            str[wordLen] = c.lower();
+            ++wordLen;
+        } else {
+            if ( wordLen > 1 )
+                buildMiniDict( QString(str,wordLen) );
+            wordLen = 0;
+        }
+    }
+
+    // The text may end in the middle of a word
+    if ( wordLen > 1 )
+        buildMiniDict( QString(str,wordLen) );
+    file.close();
+
+    QStringList wordLst;
+    KviValueList<uint> a, b;
+    KviValueList<uint>::iterator aIt;
+
+    for ( QStringList::ConstIterator patIt = patterns.begin(); patIt != patterns.end(); ++patIt ) {
+        wordLst = QStringList::split( ' ', *patIt );
+
+        // a: positions where the sequence matched so far. A word that was
+        // not registered in miniDict (or an empty pattern) has no positions:
+        // this must not dereference a null entry.
+        a.clear();
+        PosEntry *head = 0;
+        if ( !wordLst.isEmpty() )
+            head = miniDict[ wordLst[0] ];
+        if ( head )
+            a = head->positions;
+
+        for ( int w = 1; w < (int)wordLst.count(); ++w ) {
+            b.clear();
+            PosEntry *next = miniDict[ wordLst[w] ];
+            if ( next )
+                b = next->positions;
+
+            // Keep only the candidates that are immediately followed by the
+            // next word of the sequence, moving them one word forward
+            aIt = a.begin();
+            while ( aIt != a.end() ) {
+                if ( b.find( *aIt + 1 ) != b.end() ) {
+                    (*aIt)++;
+                    ++aIt;
+                } else {
+                    aIt = a.remove( aIt );
+                }
+            }
+        }
+    }
+
+    if ( a.count() )
+        return TRUE;
+    return FALSE;
+}
+
+
+
+#include "index.moc" \ No newline at end of file