summaryrefslogtreecommitdiffstats
path: root/lib/libchmfile/libchmfile_search.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'lib/libchmfile/libchmfile_search.cpp')
-rw-r--r--lib/libchmfile/libchmfile_search.cpp289
1 files changed, 289 insertions, 0 deletions
diff --git a/lib/libchmfile/libchmfile_search.cpp b/lib/libchmfile/libchmfile_search.cpp
new file mode 100644
index 0000000..41acfb3
--- /dev/null
+++ b/lib/libchmfile/libchmfile_search.cpp
@@ -0,0 +1,289 @@
+/***************************************************************************
+ * Copyright (C) 2004-2007 by Georgy Yunaev, gyunaev@ulduzsoft.com *
+ * Please do not use email address above for bug reports; see *
+ * the README file *
+ * *
+ * This program is free software; you can redistribute it and/or modify *
+ * it under the terms of the GNU General Public License as published by *
+ * the Free Software Foundation; either version 2 of the License, or *
+ * (at your option) any later version. *
+ * *
+ * This program is distributed in the hope that it will be useful, *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
+ * GNU General Public License for more details. *
+ * *
+ * You should have received a copy of the GNU General Public License *
+ * along with this program; if not, write to the *
+ * Free Software Foundation, Inc., *
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. *
+ ***************************************************************************/
+
+#include <qregexp.h>
+
+#include "libchmfile.h"
+#include "libchmfileimpl.h"
+
+//#define DEBUG_SEARCH(A) qDebug A
+#define DEBUG_SEARCH(A)
+
+
+static inline void validateWord ( QString & word, bool & query_valid )
+{
+ QRegExp rxvalid ("[^\\d\\w_\\.]+");
+
+ QString orig = word;
+ word.remove ( rxvalid );
+
+ if ( word != orig )
+ query_valid = false;
+}
+
+static inline void validateWords ( QStringList & wordlist, bool & query_valid )
+{
+ QRegExp rxvalid ("[^\\d\\w_\\.]+");
+
+ for ( unsigned int i = 0; i < wordlist.size(); i++ )
+ validateWord ( wordlist[i], query_valid );
+}
+
+
+inline static void mergeResults ( LCHMSearchProgressResults & results, const LCHMSearchProgressResults & src, bool add )
+{
+ if ( results.empty() && add )
+ {
+ results = src;
+ return;
+ }
+
+ for ( unsigned int s1 = 0; s1 < results.size(); s1++ )
+ {
+ bool found = false;
+
+ for ( unsigned int s2 = 0; s2 < src.size(); s2++ )
+ {
+ if ( results[s1].urloff == src[s2].urloff )
+ {
+ found = true;
+ break;
+ }
+ }
+
+ // If we're adding, we only add the items found (i.e. any item, which is not found, is removed.
+ // But if we're removing, we only remove the items found.
+ if ( (found && !add) || (!found && add) )
+ {
+ results.erase ( results.begin() + s1 );
+ s1--;
+ }
+ }
+}
+
+
+static inline void findNextWords ( QT34VECTOR<u_int64_t> & src, const QT34VECTOR<u_int64_t> & needle )
+{
+ for ( unsigned int s1 = 0; s1 < src.size(); s1++ )
+ {
+ bool found = false;
+ u_int64_t target_offset = src[s1] + 1;
+
+ DEBUG_SEARCH (("Offset loop: offset at %u is %u, target %u", (unsigned int) s1,
+ (unsigned int) src[s1], (unsigned int) target_offset));
+
+ // Search in the offsets list in attempt to find next word
+ for ( unsigned int s2 = 0; s2 < needle.size(); s2++ )
+ {
+ if ( needle[s2] == target_offset )
+ {
+ found = true;
+ break;
+ }
+ }
+
+ if ( !found )
+ {
+ // Remove this offset, we don't need it anymore
+ DEBUG_SEARCH (("Offset loop failed: offset %u not found", (unsigned int) target_offset));
+ src.erase ( src.begin() + s1 );
+ s1--;
+ }
+ else
+ {
+ DEBUG_SEARCH (("Offset loop succeed: offset %u found", (unsigned int) target_offset));
+ src[s1]++;
+ }
+ }
+}
+
+
+inline bool searchPhrase( LCHMFileImpl * impl, const QStringList & phrase, LCHMSearchProgressResults & results )
+{
+ // Accumulate the phrase data here.
+ LCHMSearchProgressResults phrasekeeper;
+
+ // On the first word, just fill the phrasekeeper with every occupence of the first word
+ DEBUG_SEARCH (("Search word(0): '%s'", phrase[0].ascii()));
+ if ( !impl->searchWord ( phrase[0], true, false, phrasekeeper, true ) )
+ return false; // the word not found, so the whole phrase is not found either.
+
+ for ( unsigned int i = 1; i < phrase.size(); i++ )
+ {
+ LCHMSearchProgressResults srchtmp;
+
+ DEBUG_SEARCH (("Search word(%d): '%s'", i, phrase[i].ascii()));
+ if ( !impl->searchWord ( phrase[i], true, false, srchtmp, true ) )
+ return false; // the ith word not found, so the whole phrase is not found either.
+
+ // Iterate the both arrays, and remove every word in phrasekeeper, which is not found
+ // in the srchtmp, or is found on a different position.
+ for ( unsigned int p1 = 0; p1 < phrasekeeper.size(); p1++ )
+ {
+ bool found = false;
+
+ DEBUG_SEARCH (("Ext loop (it %d): urloff %d", p1, phrasekeeper[p1].urloff));
+
+ for ( unsigned int p2 = 0; p2 < srchtmp.size(); p2++ )
+ {
+ // look up for words on the the same page
+ if ( srchtmp[p2].urloff != phrasekeeper[p1].urloff )
+ continue;
+
+ // Now check every offset to find the one which is 1 bigger than the
+ findNextWords ( phrasekeeper[p1].offsets, srchtmp[p2].offsets );
+
+ // If at least one next word is found, we leave the block intact, otherwise remove it.
+ if ( !phrasekeeper[p1].offsets.empty() )
+ found = true;
+ }
+
+ if ( !found )
+ {
+ DEBUG_SEARCH (("Ext loop: this word not found on %d, remove it", phrasekeeper[p1].urloff));
+ phrasekeeper.erase ( phrasekeeper.begin() + p1 );
+ p1--;
+ }
+ }
+ }
+
+ for ( unsigned int o = 0; o < phrasekeeper.size(); o++ )
+ results.push_back ( LCHMSearchProgressResult (phrasekeeper[o].titleoff, phrasekeeper[o].urloff) );
+
+ return !results.empty();
+}
+
+
+
+bool LCHMFile::searchQuery( const QString& inquery, QStringList * searchresults, unsigned int limit )
+{
+ QStringList words_must_exist, words_must_not_exist, words_highlight;
+ QT34VECTOR<QStringList> phrases_must_exist;
+ QString query = inquery;
+ bool query_valid = true;
+ LCHMSearchProgressResults results;
+ int pos;
+ unsigned int i;
+
+ /*
+ * Parse the search query with a simple state machine.
+ * Query should consist of one of more words separated by a space with a possible prefix.
+ * A prefix may be:
+ * + indicates that the word is required; any page without this word is excluded from the result.
+ * - indicates that the word is required to be absent; any page with this word is excluded from
+ * the result.
+ * "." indicates a phrase. Anything between quotes indicates a phrase, which is set of space-separated
+ * words. Will be in result only if the words in phrase are in page in the same sequence, and
+ * follow each other.
+ * If there is no prefix, the word considered as required.
+ */
+
+ QRegExp rxphrase( "\"(.*)\"" );
+ QRegExp rxword( "([^\\s]+)" );
+ rxphrase.setMinimal( TRUE );
+
+ // First, get the phrase queries
+ while ( (pos = rxphrase.search (query, 0)) != -1 )
+ {
+ // A phrase query found. Locate its boundaries, and parse it.
+ QStringList plist = QStringList::split ( QRegExp ("\\s+"), rxphrase.cap ( 1 ));
+
+ validateWords ( plist, query_valid );
+
+ if ( plist.size() > 0 )
+ phrases_must_exist.push_back( plist );
+
+ query.remove (pos, rxphrase.matchedLength());
+ }
+
+ // Then, parse the rest query
+ while ( (pos = rxword.search (query, 0)) != -1 )
+ {
+ // A phrase query found. Locate its boundaries, and parse it.
+ QString word = rxword.cap ( 1 );
+ QChar type = '+';
+
+ if ( word[0] == '-' || word[0] == '+' )
+ {
+ type = word[0];
+ word.remove (0, 1);
+ }
+
+ validateWord ( word, query_valid );
+
+ if ( type == '-' )
+ words_must_not_exist.push_back ( word );
+ else
+ words_must_exist.push_back ( word );
+
+ query.remove (pos, rxword.matchedLength());
+ }
+
+#if defined (DUMP_SEARCH_QUERY)
+ // Dump the search query
+ QString qdump;
+ for ( i = 0; i < phrases_must_exist.size(); i++ )
+ qdump += QString(" \"") + phrases_must_exist[i].join (" ") + QString ("\"");
+
+ for ( i = 0; i < words_must_not_exist.size(); i++ )
+ qdump += QString (" -") + words_must_not_exist[i];
+
+ for ( i = 0; i < words_must_exist.size(); i++ )
+ qdump += QString (" +") + words_must_exist[i];
+
+ qDebug ("Search query dump: %s", qdump.ascii());
+#endif
+
+ // First search for phrases
+ if ( phrases_must_exist.size() > 0 )
+ {
+ LCHMSearchProgressResults tempres;
+
+ for ( i = 0; i < phrases_must_exist.size(); i++ )
+ {
+ if ( !searchPhrase ( impl(), phrases_must_exist[i], tempres ) )
+ return false;
+
+ mergeResults ( results, tempres, true );
+ }
+ }
+
+ for ( i = 0; i < words_must_exist.size(); i++ )
+ {
+ LCHMSearchProgressResults tempres;
+
+ if ( !m_impl->searchWord ( words_must_exist[i], true, false, tempres, false ) )
+ return false;
+
+ mergeResults ( results, tempres, true );
+ }
+
+ for ( i = 0; i < words_must_not_exist.size(); i++ )
+ {
+ LCHMSearchProgressResults tempres;
+
+ m_impl->searchWord ( words_must_not_exist[i], true, false, tempres, false );
+ mergeResults ( results, tempres, false );
+ }
+
+ m_impl->getSearchResults( results, searchresults, limit );
+ return true;
+}