diff options
Diffstat (limited to 'debian/htdig/htdig-3.2.0b6/libhtdig/libhtdig_htdig.cc')
-rw-r--r-- | debian/htdig/htdig-3.2.0b6/libhtdig/libhtdig_htdig.cc | 1058 |
1 file changed, 1058 insertions, 0 deletions
diff --git a/debian/htdig/htdig-3.2.0b6/libhtdig/libhtdig_htdig.cc b/debian/htdig/htdig-3.2.0b6/libhtdig/libhtdig_htdig.cc new file mode 100644 index 00000000..8a610d36 --- /dev/null +++ b/debian/htdig/htdig-3.2.0b6/libhtdig/libhtdig_htdig.cc @@ -0,0 +1,1058 @@ +//------------------------------------------------------------- +// +// libhtdig_htdig.cc +// +// 1/25/2002 created from htdig.cc +// +// Neal Richter nealr@rightnow.com +// +// libhtdig_htdig.cc +// +// htdig: Indexes the web sites specified in the config file +// generating several databases to be used by htmerge +// +// Part of the ht://Dig package <http://www.htdig.org/> +// Copyright (c) 1995-2004 The ht://Dig Group +// For copyright details, see the file COPYING in your distribution +// or the GNU Library General Public License (LGPL) version 2 or later or later +// <http://www.gnu.org/copyleft/lgpl.html> +// +// $Id: libhtdig_htdig.cc,v 1.5 2004/05/28 13:15:29 lha Exp $ +// +//------------------------------------------------------------- + +#ifdef HAVE_CONFIG_H +#include "htconfig.h" +#endif /* HAVE_CONFIG_H */ + +#ifdef HAVE_STD +#include <iostream> +#ifdef HAVE_NAMESPACES +using namespace std; +#endif +#else +#include <iostream.h> +#endif /* HAVE_STD */ + +extern "C" { +#include "libhtdig_api.h" +} + +#include "libhtdig_log.h" + +#include "BasicDocument.h" +#include "Document.h" +#include "TextCollector.h" +#include "Retriever.h" +#include "StringList.h" +#include "htdig.h" +#include "defaults.h" +#include "HtURLCodec.h" +#include "WordContext.h" +#include "HtDateTime.h" +#include "HtURLRewriter.h" +#include "URL.h" +#include "Server.h" + +//////////////////////////// +// For cookie jar +//////////////////////////// +#include "HtCookieJar.h" +#include "HtCookieMemJar.h" +#include "HtHTTP.h" +//////////////////////////// + +// If we have this, we probably want it. 
+//#ifdef HAVE_GETOPT_H +//#include <getopt.h> +//#endif + + + +//Global Variables for Library + +int debug = 0; +HtRegexList limits; +HtRegexList limitsn; +String configFile = DEFAULT_CONFIG_FILE; +FILE *urls_seen = NULL; +FILE *images_seen = NULL; +DocumentDB docs; + + +// +// Global variables for this file +// +static int report_statistics = 0; +static String minimalFile = 0; +static HtDateTime StartTime; +static HtDateTime EndTime; + +//static char *max_hops = NULL; +static String credentials; +static HtCookieJar *_cookie_jar = NULL; +static HtConfiguration * config = NULL; +static WordContext * wc = NULL; + +static int create_text_database = 0; +static int alt_work_area = 0; +static int initial = 0; + +int htdig_index_open_flag = FALSE; + + +//new. URLs from 'command-line' +#define URL_SEPCHARS " ," +static char *myURL = NULL; + + +BasicDocument *a_basicdoc; +TextCollector *Indexer; + +BasicDocument the_basicdoc; +//TextCollector the_Indexer; + +/******************************************************* + * + * LIBHTDIG API FUNCTION + * + * int htdig_index_open(...) + * + * + * opens/creates document indexes and initializes variables + * for indexing. + * + * + * see libhtdig_api.h headerfile for definition of + * htdig_parameters_struct + * + * + * TODO Examine external function calls for error return + * codes + * + *******************************************************/ + +int htdig_index_open(htdig_parameters_struct * htdig_parms) +{ + int ret = -1; + + if(htdig_index_open_flag != FALSE) + return(FALSE); + + //load 'comand-line' parameters + + if (htdig_parms->configFile[0] != 0) + configFile = htdig_parms->configFile; + + if (htdig_parms->URL[0] != 0) + { + myURL = strdup(htdig_parms->URL); + } + + debug = htdig_parms->debug; + if(debug != 0) + { + ret = logOpen(htdig_parms->logFile); + + if(ret == FALSE) + { + reportError (form ("[HTDIG] Error opening log file [%s] . 
Error:[%d], %s\n", + htdig_parms->logFile, errno, strerror(errno)) ); + return(HTDIG_ERROR_LOGFILE_OPEN); + } + } + + initial = htdig_parms->initial; + create_text_database = htdig_parms->create_text_database; + //max_hops = strdup(htdig_parms->max_hops); + report_statistics = htdig_parms->report_statistics; + credentials = htdig_parms->credentials; + alt_work_area = htdig_parms->alt_work_area; + minimalFile = htdig_parms->minimalFile; + + + if(htdig_parms->use_cookies == TRUE) + { + // Cookie jar dynamic creation. + + _cookie_jar = new HtCookieMemJar (); // new cookie jar + if (_cookie_jar) + HtHTTP::SetCookieJar (_cookie_jar); + } + + // + // First set all the defaults and then read the specified config + // file to override the defaults. + // + + config = HtConfiguration::config (); + + config->Defaults (&defaults[0]); + if (access ((char *) configFile, R_OK) < 0) + { + reportError (form ("[HTDIG] Unable to find configuration file '%s'", + configFile.get ())); + return(HTDIG_ERROR_CONFIG_READ); + } + config->Read (configFile); + + //------- Now override config settings ------------ + + //------- override database path ------------ + if(strlen(htdig_parms->DBpath) > 0) + { + config->Add("database_dir", htdig_parms->DBpath); + } + + //------- custom filters from htdig_parms ---------- + + if(strlen(htdig_parms->locale) > 0) + { + config->Add("locale", htdig_parms->locale); + } + + if (config->Find ("locale").empty () && debug > 0) + logEntry("Warning: unknown locale!\n"); + + if (strlen(htdig_parms->max_hops) > 0) + { + config->Add ("max_hop_count", htdig_parms->max_hops); + } + + if(strlen(htdig_parms->limit_urls_to) > 0) + { + config->Add("limit_urls_to", htdig_parms->limit_urls_to); + } + + if(strlen(htdig_parms->limit_normalized) > 0) + { + config->Add("limit_normalized", htdig_parms->limit_normalized); + } + + if(strlen(htdig_parms->exclude_urls) > 0) + { + config->Add("exclude_urls", htdig_parms->exclude_urls); + } + + 
if(strlen(htdig_parms->url_rewrite_rules) > 0) + { + config->Add("url_rewrite_rules", htdig_parms->url_rewrite_rules); + } + + if(strlen(htdig_parms->bad_querystr) > 0) + { + config->Add("bad_querystr", htdig_parms->bad_querystr); + } + + if(strlen(htdig_parms->locale) > 0) + { + config->Add("locale", htdig_parms->locale); + } + + if(strlen(htdig_parms->meta_description_factor) > 0) + { + config->Add("meta_description_factor", htdig_parms->meta_description_factor); + } + + if(strlen(htdig_parms->title_factor) > 0) + { + config->Add("title_factor", htdig_parms->title_factor); + } + + if(strlen(htdig_parms->text_factor) > 0) + { + config->Add("text_factor", htdig_parms->text_factor); + } + + if(strlen(htdig_parms->URL) > 0) + { + config->Add("start_url", htdig_parms->URL); + free(myURL); + myURL=NULL; + } + + //------- end custom filters from htdig_parms ---------- + + // Set up credentials for this run + if (credentials.length ()) + config->Add ("authorization", credentials); + + // + // Check url_part_aliases and common_url_parts for + // errors. + String url_part_errors = HtURLCodec::instance ()->ErrMsg (); + + if (url_part_errors.length () != 0) + { + reportError (form("[HTDIG] Invalid url_part_aliases or common_url_parts: %s", + url_part_errors.get ())); + return(HTDIG_ERROR_URL_PART); + } + // + // Check url_rewrite_rules for errors. 
+ String url_rewrite_rules = HtURLRewriter::instance ()->ErrMsg (); + + if (url_rewrite_rules.length () != 0) + { + reportError (form ("[HTDIG] Invalid url_rewrite_rules: %s", + url_rewrite_rules.get ())); + return(HTDIG_ERROR_URL_REWRITE); + } + + // + // If indicated, change the database file names to have the .work + // extension + // + if (alt_work_area != 0) + { + String configValue = config->Find ("doc_db"); + + if (configValue.length () != 0) + { + configValue << ".work"; + config->Add ("doc_db", configValue); + } + + configValue = config->Find ("word_db"); + if (configValue.length () != 0) + { + configValue << ".work"; + config->Add ("word_db", configValue); + } + + configValue = config->Find ("doc_index"); + if (configValue.length () != 0) + { + configValue << ".work"; + config->Add ("doc_index", configValue); + } + + configValue = config->Find ("doc_excerpt"); + if (configValue.length () != 0) + { + configValue << ".work"; + config->Add ("doc_excerpt", configValue); + } + + configValue = config->Find ("md5_db"); + if (configValue.length () != 0) + { + configValue << ".work"; + config->Add ("md5_db", configValue); + } + } + + // + // If needed, we will create a list of every URL we come across. + //TODO put document-index log file stuff here + + if (config->Boolean ("create_url_list")) + { + const String filename = config->Find ("url_list"); + urls_seen = fopen (filename, initial ? "w" : "a"); + if (urls_seen == 0) + { + reportError (form ("[HTDIG] Unable to create URL file '%s'", + filename.get ())); + return(HTDIG_ERROR_URL_CREATE_FILE); + } + } + + // + // If needed, we will create a list of every image we come across. + // + if (config->Boolean ("create_image_list")) + { + const String filename = config->Find ("image_list"); + images_seen = fopen (filename, initial ? 
"w" : "a"); + if (images_seen == 0) + { + reportError (form ("[HTDIG] Unable to create images file '%s'", + filename.get ())); + return(HTDIG_ERROR_IMAGE_CREATE_FILE); + } + } + + // + // Set up the limits list + // + StringList l (config->Find ("limit_urls_to"), " \t"); + limits.setEscaped (l, config->Boolean ("case_sensitive")); + l.Destroy (); + + l.Create (config->Find ("limit_normalized"), " \t"); + limitsn.setEscaped (l, config->Boolean ("case_sensitive")); + l.Destroy (); + + // + // Open the document database + // + const String filename = config->Find ("doc_db"); + if (initial) + unlink (filename); + + const String index_filename = config->Find ("doc_index"); + if (initial) + unlink (index_filename); + + const String head_filename = config->Find ("doc_excerpt"); + if (initial) + unlink (head_filename); + + if (docs.Open (filename, index_filename, head_filename) < 0) + { + reportError (form ("[HTDIG] Unable to open/create document database '%s'", + filename.get ())); + return(HTDIG_ERROR_OPEN_CREATE_DOCDB); + } + + const String word_filename = config->Find ("word_db"); + if (initial) + unlink (word_filename); + + // Initialize htword + wc = new WordContext; + wc->Initialize(*config); + + + //a_basicdoc = new BasicDocument; + Indexer = new TextCollector; + + a_basicdoc = &the_basicdoc; + a_basicdoc->Reset(); + + //Indexer = &the_Indexer; + + if ((a_basicdoc == NULL) || (Indexer == NULL)) + return(FALSE); + + + htdig_index_open_flag = TRUE; + + return(TRUE); + +} + +/******************************************************* + * + * LIBHTDIG API FUNCTION + * + * int htdig_index_simple_doc(...) 
+ * + * + * indexes a simple document supplied by parameter + * + * see libhtdig_api.h headerfile for definition of + * htdig_simple_doc_struct + * + * TODO Examine external function calls for error return + * codes + * + *******************************************************/ +int htdig_index_simple_doc(htdig_simple_doc_struct * a_simple_doc) +{ + int index_error = 0; + //int ret = 0; + + // Reset the document to clean out any old data + a_basicdoc->Reset(); + + a_basicdoc->ModTime(a_simple_doc->doc_time); + a_basicdoc->Location(a_simple_doc->location); + a_basicdoc->DocumentID(a_simple_doc->documentid); + a_basicdoc->Title(a_simple_doc->title); + a_basicdoc->MetaContent(a_simple_doc->meta); + a_basicdoc->Contents(a_simple_doc->contents); //MUST ALLOCATE & FREE!!! + a_basicdoc->ContentType(a_simple_doc->content_type); //MIME-ISH string + a_basicdoc->Length(); + + + //TODO What is this error? + index_error = Indexer->IndexDoc(*a_basicdoc); + + return(TRUE); +} + +/******************************************************* + * + * LIBHTDIG API FUNCTION + * + * int htdig_index_urls(...) + * + * Starts fetch & index of URL supplied in config file + * OR supplied in htdig_index_open parameter + * + * TODO Examine external function calls for error return + * codes + * TODO Blank/empty URL error? + *******************************************************/ +int htdig_index_urls(void) +{ + + char * temp_URL_list = NULL; + char * temp_url = NULL; + + // Create the Retriever object which we will use to parse all the + // HTML files. + // In case this is just an update dig, we will add all existing + // URLs? + // + Retriever retriever (Retriever_logUrl); + if (minimalFile.length () == 0) + { + List *list = docs.URLs (); + retriever.Initial (*list); + delete list; + + // Add start_url to the initial list of the retriever. + // Don't check a URL twice! 
+ // Beware order is important, if this bugs you could change + // previous line retriever.Initial(*list, 0) to Initial(*list,1) + retriever.Initial (config->Find ("start_url"), 1); + } + + // Handle list of URLs given on 'command-line' + if (myURL != NULL) + { + String str; + temp_URL_list = strdup(myURL); + temp_url = strtok(temp_URL_list, URL_SEPCHARS); + while (temp_url != NULL) + { + str = temp_url; + str.chop ("\r\n"); + if (str.length () > 0) + retriever.Initial (str, 1); + + temp_url = strtok(NULL, URL_SEPCHARS); + } + free(temp_URL_list); + } + else if (minimalFile.length () != 0) + { + FILE *input = fopen (minimalFile.get (), "r"); + char buffer[1000]; + + if (input) + { + while (fgets (buffer, sizeof (buffer), input)) + { + String str (buffer); + str.chop ("\r\n\t "); + if (str.length () > 0) + retriever.Initial (str, 1); + } + fclose (input); + } + } + + // + // Go do it! + // + retriever.Start (); + + // + // All done with parsing. + // + + // + // If the user so wants, create a text version of the document database. + // + + if (create_text_database) + { + const String doc_list = config->Find ("doc_list"); + if (initial) + unlink (doc_list); + docs.DumpDB (doc_list); + const String word_dump = config->Find ("word_dump"); + if (initial) + unlink (word_dump); + HtWordList words (*config); + if (words.Open (config->Find ("word_db"), O_RDONLY) == OK) + { + words.Dump (word_dump); + } + } + + // + // Cleanup + // + if (images_seen) + fclose (images_seen); + + // + // If needed, report some statistics + // + if (report_statistics) + { + retriever.ReportStatistics ("htdig"); + } + + return(TRUE); +} + + +/******************************************************* + * + * LIBHTDIG API FUNCTION + * + * int htdig_index_close(...) 
+ * + * Closes the database and destroys various objects + * + * TODO Examine external function calls for error return + * codes + * + *******************************************************/ +int htdig_index_close(void) +{ + int ret = -1; + + if(htdig_index_open_flag == TRUE) + { + //delete a_basicdoc; + //delete Indexer; + + Indexer->FlushWordDB(); + + if (_cookie_jar) + delete _cookie_jar; + + //if (max_hops != NULL) + // free(max_hops); + + if (myURL != NULL) + free(myURL); + + //call destructors here + docs.~DocumentDB(); + //config->~HtConfiguration(); + + if (debug != 0) + { + ret = logClose(); + + if (ret == FALSE) + { + reportError (form ("[HTDIG] Error closing log file . Error:[%d], %s\n", + errno, strerror(errno)) ); + return(HTDIG_ERROR_LOGFILE_CLOSE); + } + } + + /* + if(config) { + WordContext::Finish(); + } + */ + + if (wc) + delete wc; + + if (urls_seen) + fclose (urls_seen); + + htdig_index_open_flag = FALSE; + } + + return(TRUE); +} + +/******************************************************* + * + * LIBHTDIG API FUNCTION + * + * int htdig_index_reset(...) + * + * + * TODO Examine external function calls for error return + * codes + * + *******************************************************/ + +int htdig_index_reset(void) +{ + Indexer->FlushWordDB(); + a_basicdoc->Reset(); + + return(TRUE); +} + +/******************************************************* + * + * LIBHTDIG API FUNCTION + * + * int htdig_get_max_head_length(...) + * + * + * Returns size of maximum document storage length + * for db.excerpts [htdig.conf:max_head_length] + * + * This represents the maximum amount of the document + * That will be available for excerpting. 
+ * + * + *******************************************************/ + +int htdig_get_max_head_length() +{ + int ret = -1; + + if(config != NULL) + ret = config->Value("max_head_length"); + + return(ret); +} + +/******************************************************* + * + * LIBHTDIG API FUNCTION + * + * int htdig_index_test_url(...) + * + * + * Test a URL for filter Pass/Fail + * + * Pass = return(TRUE) + * Fail = return(XXX) [Negative Value] + * + * + * + * + * + *******************************************************/ + + +//int htdig_index_test_url(htdig_parameters_struct *htdig_parms) +int htdig_index_test_url(htdig_parameters_struct *htdig_parms) +{ + //int ret = FALSE; + String the_URL(htdig_parms->URL); + HtConfiguration* config= HtConfiguration::config(); + Dictionary invalids; + Dictionary valids; + URL aUrl(the_URL); + String rewritten_url(the_URL); + StringList tmpList; + HtRegex limitTo; + HtRegex excludeFrom; + + //initalize outgoing-parameter rewritten_URL + htdig_parms->rewritten_URL[0] = 0; + +#ifdef DEBUG + //output relevant config variables + cout << " bad_extensions = " << config->Find("bad_extensions") << endl; + cout << " valid_extensions = " << config->Find("valid_extensions") << endl; + cout << " exclude_urls = " << config->Find("exclude_urls") << endl; + cout << " bad_querystr = " << config->Find("bad_querystr") << endl; + cout << " limit_urls_to = " << config->Find("limit_urls_to") << endl; + cout << " limit_normalized = " << config->Find("limit_normalized") << endl; + cout << " restrict = " << config->Find("restrict") << endl; + cout << " exclude = " << config->Find("exclude") << endl; +#endif + + //------------ read the config file if it is given --------------- + if (htdig_parms->configFile[0] != 0) + configFile = htdig_parms->configFile; + + config = HtConfiguration::config (); + + config->Defaults (&defaults[0]); + if (access ((char *) configFile, R_OK) < 0) + { + reportError (form ("[HTDIG] Unable to find configuration file '%s'", + 
configFile.get ())); + return(HTDIG_ERROR_CONFIG_READ); + } + config->Read (configFile); + + //---------- Now override config settings ----------------- + + //------- override database path ------------ + if(strlen(htdig_parms->DBpath) > 0) + { + config->Add("database_dir", htdig_parms->DBpath); + } + + //------- custom filters from htdig_parms ---------- + + if(strlen(htdig_parms->locale) > 0) + { + config->Add("locale", htdig_parms->locale); + } + + if (config->Find ("locale").empty () && debug > 0) + logEntry("Warning: unknown locale!\n"); + + if (strlen(htdig_parms->max_hops) > 0) + { + config->Add ("max_hop_count", htdig_parms->max_hops); + } + + if(strlen(htdig_parms->limit_urls_to) > 0) + { + config->Add("limit_urls_to", htdig_parms->limit_urls_to); + } + + if(strlen(htdig_parms->limit_normalized) > 0) + { + config->Add("limit_normalized", htdig_parms->limit_normalized); + } + + if(strlen(htdig_parms->exclude_urls) > 0) + { + config->Add("exclude_urls", htdig_parms->exclude_urls); + } + + if(strlen(htdig_parms->url_rewrite_rules) > 0) + { + config->Add("url_rewrite_rules", htdig_parms->url_rewrite_rules); + } + + if(strlen(htdig_parms->bad_querystr) > 0) + { + config->Add("bad_querystr", htdig_parms->bad_querystr); + } + + if(strlen(htdig_parms->locale) > 0) + { + config->Add("locale", htdig_parms->locale); + } + + if(strlen(htdig_parms->meta_description_factor) > 0) + { + config->Add("meta_description_factor", htdig_parms->meta_description_factor); + } + + if(strlen(htdig_parms->title_factor) > 0) + { + config->Add("title_factor", htdig_parms->title_factor); + } + + if(strlen(htdig_parms->text_factor) > 0) + { + config->Add("text_factor", htdig_parms->text_factor); + } + + //------------------------------------------------------------------- + +#ifdef DEBUG + //output relevant config variables + cout << " bad_extensions = " << config->Find("bad_extensions") << endl; + cout << " valid_extensions = " << config->Find("valid_extensions") << endl; + cout << " 
exclude_urls = " << config->Find("exclude_urls") << endl; + cout << " bad_querystr = " << config->Find("bad_querystr") << endl; + cout << " limit_urls_to = " << config->Find("limit_urls_to") << endl; + cout << " limit_normalized = " << config->Find("limit_normalized") << endl; + cout << " restrict = " << config->Find("restrict") << endl; + cout << " exclude = " << config->Find("exclude") << endl; +#endif + + + //------ bad_extensions ----------------------------------------------- + //A list of bad extensions, separated by spaces or tabs + + String t = config->Find("bad_extensions"); + String lowerp; + char *p = strtok(t, " \t"); + while (p) + { + // Extensions are case insensitive + lowerp = p; + lowerp.lowercase(); + invalids.Add(lowerp, 0); + p = strtok(0, " \t"); + } + + + //------ valid_extensions ------------------------------------------------ + // Valid extensions are performed similarly + // A list of valid extensions, separated by spaces or tabs + + t = config->Find("valid_extensions"); + p = strtok(t, " \t"); + while (p) + { + // Extensions are case insensitive + lowerp = p; + lowerp.lowercase(); + valids.Add(lowerp, 0); + p = strtok(0, " \t"); + } + + //----- rewrite the URL------------------------------------------ + aUrl.rewrite(); + rewritten_url = aUrl.get(); + + if(rewritten_url.length() <= 0) + { + //Rejected: empty rewritten URL + String temp = config->Find("url_rewrite_rules"); + strcpy(htdig_parms->rewritten_URL, temp.get()); + system(form("echo \"%s\" > /tmp/neal", temp.get())); + + return(HTDIG_ERROR_TESTURL_REWRITE_EMPTY); + } + + //cout << form("TestURL: org=[%s]\n", the_URL.get()); + //cout << form(" rewritten[%s]\n", rewritten_url.get()); + + //copy the rewritten URL for outgoing parm pass + strcpy(htdig_parms->rewritten_URL, rewritten_url.get()); + + //---- exclude_urls --------------------------------------------- + // If the URL contains any of the patterns in the exclude list, + // mark it as invalid + + 
/*if(strlen(htdig_parms->exclude_urls) > 0) + tmpList.Create(htdig_parms->exclude_urls," \t"); + else*/ + tmpList.Create(config->Find("exclude_urls")," \t"); + + HtRegexList excludes; + excludes.setEscaped(tmpList, config->Boolean("case_sensitive")); + if (excludes.match(rewritten_url, 0, 0) != 0) + { + //Rejected: item in exclude list + return(HTDIG_ERROR_TESTURL_EXCLUDE); + } + + //---- bad_querystr ------------------------------------------- + // If the URL has a query string and it is in the bad query list + // mark it as invalid + + tmpList.Destroy(); + + /*if(strlen(htdig_parms->bad_querystr) > 0) + tmpList.Create(htdig_parms->bad_querystr, " \t"); + else*/ + tmpList.Create(config->Find("bad_querystr"), " \t"); + + HtRegexList badquerystr; + badquerystr.setEscaped(tmpList, config->Boolean("case_sensitive")); + char *ext = strrchr((char*)rewritten_url, '?'); + if (ext && badquerystr.match(ext, 0, 0) != 0) + { + //if (debug > 2) + // cout << endl << " Rejected: item in bad query list "; + return(HTDIG_ERROR_TESTURL_BADQUERY); + } + + //------ invalid_extensions #2 ------ + // See if the file extension is in the list of invalid ones + + ext = strrchr((char*)rewritten_url, '.'); + String lowerext; + if (ext && strchr(ext,'/')) // Ignore a dot if it's not in the + ext = NULL; // final component of the path. + if(ext) + { + lowerext.set(ext); + int parm = lowerext.indexOf('?'); // chop off URL parameter + if (parm >= 0) + lowerext.chop(lowerext.length() - parm); + lowerext.lowercase(); + if (invalids.Exists(lowerext)) + { + //Rejected: Extension is invalid! + return(HTDIG_ERROR_TESTURL_EXTENSION); + } + } + + //------ valid_extensions #2 ------ + // Or NOT in the list of valid ones + + if (ext && valids.Count() > 0 && !valids.Exists(lowerext)) + { + //Rejected: Extension is not valid! 
+ return(HTDIG_ERROR_TESTURL_EXTENSION2); + } + + //----- limit_urls_to & limit_normalized ------------------------------ + // Set up the limits list + + StringList l; + /*if(strlen(htdig_parms->limit_urls_to) > 0) + l.Create(htdig_parms->limit_urls_to, " \t"); + else*/ + l.Create(config->Find ("limit_urls_to"), " \t"); + + limits.setEscaped (l, config->Boolean ("case_sensitive")); + + l.Destroy (); + + /*if(strlen(htdig_parms->limit_normalized) > 0) + l.Create (htdig_parms->limit_normalized, " \t"); + else*/ + l.Create (config->Find ("limit_normalized"), " \t"); + + limitsn.setEscaped (l, config->Boolean ("case_sensitive")); + l.Destroy (); + + // If any of the limits are met, we allow the URL + if (limits.match(rewritten_url, 1, 0) == 0) + { + //Rejected: URL not in the limits!; + return(HTDIG_ERROR_TESTURL_LIMITS); + } + + + // or not in list of normalized urls + // Warning! should be last in checks because of aUrl normalization + aUrl.normalize(); + if (limitsn.match(rewritten_url.get(), 1, 0) == 0) + { + //Rejected: not in "limit_normalized" list! 
+ return(HTDIG_ERROR_TESTURL_LIMITSNORM); + } + + //----- restrict & exclude ---------------------------------- + //Search-Time Filters + + String temp; + + /*if(strlen(htdig_parms->search_restrict) > 0) + temp = htdig_parms->search_restrict; + else*/ + temp = config->Find("restrict"); + + if (temp.length()) + { + // Create a temporary list from either the configuration + // file or the input parameter + StringList l(temp, " \t\r\n\001|"); + limitTo.setEscaped(l); + } + + /*if(strlen(htdig_parms->search_exclude) > 0) + temp = htdig_parms->search_exclude; + else*/ + temp = config->Find("exclude"); + + if (temp.length()) + { + // Create a temporary list from either the configuration + // file or the input parameter + StringList l(temp, " \t\r\n\001|"); + excludeFrom.setEscaped(l); + } + + //Restrict Test + if (limitTo.match(rewritten_url, 1, 0) == 0) + { + //Rejected URL Not in SearchTime Restrict List + return(HTDIG_ERROR_TESTURL_SRCH_RESTRICT); + } + //Exclude Test + if (excludeFrom.match(rewritten_url, 0, 0) != 0) + { + //Rejected URL in SearchTime Exclude List + return(HTDIG_ERROR_TESTURL_SRCH_EXCLUDE); + } + + + //Success! + return TRUE; +} |