summaryrefslogtreecommitdiffstats
path: root/debian/htdig/htdig-3.2.0b6/libhtdig/libhtdig_htdig.cc
diff options
context:
space:
mode:
Diffstat (limited to 'debian/htdig/htdig-3.2.0b6/libhtdig/libhtdig_htdig.cc')
-rw-r--r--debian/htdig/htdig-3.2.0b6/libhtdig/libhtdig_htdig.cc1058
1 files changed, 1058 insertions, 0 deletions
diff --git a/debian/htdig/htdig-3.2.0b6/libhtdig/libhtdig_htdig.cc b/debian/htdig/htdig-3.2.0b6/libhtdig/libhtdig_htdig.cc
new file mode 100644
index 00000000..8a610d36
--- /dev/null
+++ b/debian/htdig/htdig-3.2.0b6/libhtdig/libhtdig_htdig.cc
@@ -0,0 +1,1058 @@
+//-------------------------------------------------------------
+//
+// libhtdig_htdig.cc
+//
+// 1/25/2002 created from htdig.cc
+//
+// Neal Richter nealr@rightnow.com
+//
+// libhtdig_htdig.cc
+//
+// htdig: Indexes the web sites specified in the config file
+// generating several databases to be used by htmerge
+//
+// Part of the ht://Dig package <http://www.htdig.org/>
+// Copyright (c) 1995-2004 The ht://Dig Group
+// For copyright details, see the file COPYING in your distribution
+// or the GNU Library General Public License (LGPL) version 2 or later or later
+// <http://www.gnu.org/copyleft/lgpl.html>
+//
+// $Id: libhtdig_htdig.cc,v 1.5 2004/05/28 13:15:29 lha Exp $
+//
+//-------------------------------------------------------------
+
+#ifdef HAVE_CONFIG_H
+#include "htconfig.h"
+#endif /* HAVE_CONFIG_H */
+
+#ifdef HAVE_STD
+#include <iostream>
+#ifdef HAVE_NAMESPACES
+using namespace std;
+#endif
+#else
+#include <iostream.h>
+#endif /* HAVE_STD */
+
+extern "C" {
+#include "libhtdig_api.h"
+}
+
+#include "libhtdig_log.h"
+
+#include "BasicDocument.h"
+#include "Document.h"
+#include "TextCollector.h"
+#include "Retriever.h"
+#include "StringList.h"
+#include "htdig.h"
+#include "defaults.h"
+#include "HtURLCodec.h"
+#include "WordContext.h"
+#include "HtDateTime.h"
+#include "HtURLRewriter.h"
+#include "URL.h"
+#include "Server.h"
+
+////////////////////////////
+// For cookie jar
+////////////////////////////
+#include "HtCookieJar.h"
+#include "HtCookieMemJar.h"
+#include "HtHTTP.h"
+////////////////////////////
+
+// If we have this, we probably want it.
+//#ifdef HAVE_GETOPT_H
+//#include <getopt.h>
+//#endif
+
+
+
+//Global Variables for Library
+
+int debug = 0;
+HtRegexList limits;
+HtRegexList limitsn;
+String configFile = DEFAULT_CONFIG_FILE;
+FILE *urls_seen = NULL;
+FILE *images_seen = NULL;
+DocumentDB docs;
+
+
+//
+// Global variables for this file
+//
+static int report_statistics = 0;
+static String minimalFile = 0;
+static HtDateTime StartTime;
+static HtDateTime EndTime;
+
+//static char *max_hops = NULL;
+static String credentials;
+static HtCookieJar *_cookie_jar = NULL;
+static HtConfiguration * config = NULL;
+static WordContext * wc = NULL;
+
+static int create_text_database = 0;
+static int alt_work_area = 0;
+static int initial = 0;
+
+int htdig_index_open_flag = FALSE;
+
+
+//new. URLs from 'command-line'
+#define URL_SEPCHARS " ,"
+static char *myURL = NULL;
+
+
+BasicDocument *a_basicdoc;
+TextCollector *Indexer;
+
+BasicDocument the_basicdoc;
+//TextCollector the_Indexer;
+
+/*******************************************************
+ *
+ * LIBHTDIG API FUNCTION
+ *
+ * int htdig_index_open(...)
+ *
+ *
+ * opens/creates document indexes and initializes variables
+ * for indexing.
+ *
+ *
+ * see libhtdig_api.h headerfile for definition of
+ * htdig_parameters_struct
+ *
+ *
+ * TODO Examine external function calls for error return
+ * codes
+ *
+ *******************************************************/
+
+int htdig_index_open(htdig_parameters_struct * htdig_parms)
+{
+ int ret = -1;
+
+ if(htdig_index_open_flag != FALSE)
+ return(FALSE);
+
+ //load 'comand-line' parameters
+
+ if (htdig_parms->configFile[0] != 0)
+ configFile = htdig_parms->configFile;
+
+ if (htdig_parms->URL[0] != 0)
+ {
+ myURL = strdup(htdig_parms->URL);
+ }
+
+ debug = htdig_parms->debug;
+ if(debug != 0)
+ {
+ ret = logOpen(htdig_parms->logFile);
+
+ if(ret == FALSE)
+ {
+ reportError (form ("[HTDIG] Error opening log file [%s] . Error:[%d], %s\n",
+ htdig_parms->logFile, errno, strerror(errno)) );
+ return(HTDIG_ERROR_LOGFILE_OPEN);
+ }
+ }
+
+ initial = htdig_parms->initial;
+ create_text_database = htdig_parms->create_text_database;
+ //max_hops = strdup(htdig_parms->max_hops);
+ report_statistics = htdig_parms->report_statistics;
+ credentials = htdig_parms->credentials;
+ alt_work_area = htdig_parms->alt_work_area;
+ minimalFile = htdig_parms->minimalFile;
+
+
+ if(htdig_parms->use_cookies == TRUE)
+ {
+ // Cookie jar dynamic creation.
+
+ _cookie_jar = new HtCookieMemJar (); // new cookie jar
+ if (_cookie_jar)
+ HtHTTP::SetCookieJar (_cookie_jar);
+ }
+
+ //
+ // First set all the defaults and then read the specified config
+ // file to override the defaults.
+ //
+
+ config = HtConfiguration::config ();
+
+ config->Defaults (&defaults[0]);
+ if (access ((char *) configFile, R_OK) < 0)
+ {
+ reportError (form ("[HTDIG] Unable to find configuration file '%s'",
+ configFile.get ()));
+ return(HTDIG_ERROR_CONFIG_READ);
+ }
+ config->Read (configFile);
+
+ //------- Now override config settings ------------
+
+ //------- override database path ------------
+ if(strlen(htdig_parms->DBpath) > 0)
+ {
+ config->Add("database_dir", htdig_parms->DBpath);
+ }
+
+ //------- custom filters from htdig_parms ----------
+
+ if(strlen(htdig_parms->locale) > 0)
+ {
+ config->Add("locale", htdig_parms->locale);
+ }
+
+ if (config->Find ("locale").empty () && debug > 0)
+ logEntry("Warning: unknown locale!\n");
+
+ if (strlen(htdig_parms->max_hops) > 0)
+ {
+ config->Add ("max_hop_count", htdig_parms->max_hops);
+ }
+
+ if(strlen(htdig_parms->limit_urls_to) > 0)
+ {
+ config->Add("limit_urls_to", htdig_parms->limit_urls_to);
+ }
+
+ if(strlen(htdig_parms->limit_normalized) > 0)
+ {
+ config->Add("limit_normalized", htdig_parms->limit_normalized);
+ }
+
+ if(strlen(htdig_parms->exclude_urls) > 0)
+ {
+ config->Add("exclude_urls", htdig_parms->exclude_urls);
+ }
+
+ if(strlen(htdig_parms->url_rewrite_rules) > 0)
+ {
+ config->Add("url_rewrite_rules", htdig_parms->url_rewrite_rules);
+ }
+
+ if(strlen(htdig_parms->bad_querystr) > 0)
+ {
+ config->Add("bad_querystr", htdig_parms->bad_querystr);
+ }
+
+ if(strlen(htdig_parms->locale) > 0)
+ {
+ config->Add("locale", htdig_parms->locale);
+ }
+
+ if(strlen(htdig_parms->meta_description_factor) > 0)
+ {
+ config->Add("meta_description_factor", htdig_parms->meta_description_factor);
+ }
+
+ if(strlen(htdig_parms->title_factor) > 0)
+ {
+ config->Add("title_factor", htdig_parms->title_factor);
+ }
+
+ if(strlen(htdig_parms->text_factor) > 0)
+ {
+ config->Add("text_factor", htdig_parms->text_factor);
+ }
+
+ if(strlen(htdig_parms->URL) > 0)
+ {
+ config->Add("start_url", htdig_parms->URL);
+ free(myURL);
+ myURL=NULL;
+ }
+
+ //------- end custom filters from htdig_parms ----------
+
+ // Set up credentials for this run
+ if (credentials.length ())
+ config->Add ("authorization", credentials);
+
+ //
+ // Check url_part_aliases and common_url_parts for
+ // errors.
+ String url_part_errors = HtURLCodec::instance ()->ErrMsg ();
+
+ if (url_part_errors.length () != 0)
+ {
+ reportError (form("[HTDIG] Invalid url_part_aliases or common_url_parts: %s",
+ url_part_errors.get ()));
+ return(HTDIG_ERROR_URL_PART);
+ }
+ //
+ // Check url_rewrite_rules for errors.
+ String url_rewrite_rules = HtURLRewriter::instance ()->ErrMsg ();
+
+ if (url_rewrite_rules.length () != 0)
+ {
+ reportError (form ("[HTDIG] Invalid url_rewrite_rules: %s",
+ url_rewrite_rules.get ()));
+ return(HTDIG_ERROR_URL_REWRITE);
+ }
+
+ //
+ // If indicated, change the database file names to have the .work
+ // extension
+ //
+ if (alt_work_area != 0)
+ {
+ String configValue = config->Find ("doc_db");
+
+ if (configValue.length () != 0)
+ {
+ configValue << ".work";
+ config->Add ("doc_db", configValue);
+ }
+
+ configValue = config->Find ("word_db");
+ if (configValue.length () != 0)
+ {
+ configValue << ".work";
+ config->Add ("word_db", configValue);
+ }
+
+ configValue = config->Find ("doc_index");
+ if (configValue.length () != 0)
+ {
+ configValue << ".work";
+ config->Add ("doc_index", configValue);
+ }
+
+ configValue = config->Find ("doc_excerpt");
+ if (configValue.length () != 0)
+ {
+ configValue << ".work";
+ config->Add ("doc_excerpt", configValue);
+ }
+
+ configValue = config->Find ("md5_db");
+ if (configValue.length () != 0)
+ {
+ configValue << ".work";
+ config->Add ("md5_db", configValue);
+ }
+ }
+
+ //
+ // If needed, we will create a list of every URL we come across.
+ //TODO put document-index log file stuff here
+
+ if (config->Boolean ("create_url_list"))
+ {
+ const String filename = config->Find ("url_list");
+ urls_seen = fopen (filename, initial ? "w" : "a");
+ if (urls_seen == 0)
+ {
+ reportError (form ("[HTDIG] Unable to create URL file '%s'",
+ filename.get ()));
+ return(HTDIG_ERROR_URL_CREATE_FILE);
+ }
+ }
+
+ //
+ // If needed, we will create a list of every image we come across.
+ //
+ if (config->Boolean ("create_image_list"))
+ {
+ const String filename = config->Find ("image_list");
+ images_seen = fopen (filename, initial ? "w" : "a");
+ if (images_seen == 0)
+ {
+ reportError (form ("[HTDIG] Unable to create images file '%s'",
+ filename.get ()));
+ return(HTDIG_ERROR_IMAGE_CREATE_FILE);
+ }
+ }
+
+ //
+ // Set up the limits list
+ //
+ StringList l (config->Find ("limit_urls_to"), " \t");
+ limits.setEscaped (l, config->Boolean ("case_sensitive"));
+ l.Destroy ();
+
+ l.Create (config->Find ("limit_normalized"), " \t");
+ limitsn.setEscaped (l, config->Boolean ("case_sensitive"));
+ l.Destroy ();
+
+ //
+ // Open the document database
+ //
+ const String filename = config->Find ("doc_db");
+ if (initial)
+ unlink (filename);
+
+ const String index_filename = config->Find ("doc_index");
+ if (initial)
+ unlink (index_filename);
+
+ const String head_filename = config->Find ("doc_excerpt");
+ if (initial)
+ unlink (head_filename);
+
+ if (docs.Open (filename, index_filename, head_filename) < 0)
+ {
+ reportError (form ("[HTDIG] Unable to open/create document database '%s'",
+ filename.get ()));
+ return(HTDIG_ERROR_OPEN_CREATE_DOCDB);
+ }
+
+ const String word_filename = config->Find ("word_db");
+ if (initial)
+ unlink (word_filename);
+
+ // Initialize htword
+ wc = new WordContext;
+ wc->Initialize(*config);
+
+
+ //a_basicdoc = new BasicDocument;
+ Indexer = new TextCollector;
+
+ a_basicdoc = &the_basicdoc;
+ a_basicdoc->Reset();
+
+ //Indexer = &the_Indexer;
+
+ if ((a_basicdoc == NULL) || (Indexer == NULL))
+ return(FALSE);
+
+
+ htdig_index_open_flag = TRUE;
+
+ return(TRUE);
+
+}
+
+/*******************************************************
+ *
+ * LIBHTDIG API FUNCTION
+ *
+ * int htdig_index_simple_doc(...)
+ *
+ *
+ * indexes a simple document supplied by parameter
+ *
+ * see libhtdig_api.h headerfile for definition of
+ * htdig_simple_doc_struct
+ *
+ * TODO Examine external function calls for error return
+ * codes
+ *
+ *******************************************************/
+int htdig_index_simple_doc(htdig_simple_doc_struct * a_simple_doc)
+{
+ int index_error = 0;
+ //int ret = 0;
+
+ // Reset the document to clean out any old data
+ a_basicdoc->Reset();
+
+ a_basicdoc->ModTime(a_simple_doc->doc_time);
+ a_basicdoc->Location(a_simple_doc->location);
+ a_basicdoc->DocumentID(a_simple_doc->documentid);
+ a_basicdoc->Title(a_simple_doc->title);
+ a_basicdoc->MetaContent(a_simple_doc->meta);
+ a_basicdoc->Contents(a_simple_doc->contents); //MUST ALLOCATE & FREE!!!
+ a_basicdoc->ContentType(a_simple_doc->content_type); //MIME-ISH string
+ a_basicdoc->Length();
+
+
+ //TODO What is this error?
+ index_error = Indexer->IndexDoc(*a_basicdoc);
+
+ return(TRUE);
+}
+
+/*******************************************************
+ *
+ * LIBHTDIG API FUNCTION
+ *
+ * int htdig_index_urls(...)
+ *
+ * Starts fetch & index of URL supplied in config file
+ * OR supplied in htdig_index_open parameter
+ *
+ * TODO Examine external function calls for error return
+ * codes
+ * TODO Blank/empty URL error?
+ *******************************************************/
+int htdig_index_urls(void)
+{
+
+ char * temp_URL_list = NULL;
+ char * temp_url = NULL;
+
+ // Create the Retriever object which we will use to parse all the
+ // HTML files.
+ // In case this is just an update dig, we will add all existing
+ // URLs?
+ //
+ Retriever retriever (Retriever_logUrl);
+ if (minimalFile.length () == 0)
+ {
+ List *list = docs.URLs ();
+ retriever.Initial (*list);
+ delete list;
+
+ // Add start_url to the initial list of the retriever.
+ // Don't check a URL twice!
+ // Beware order is important, if this bugs you could change
+ // previous line retriever.Initial(*list, 0) to Initial(*list,1)
+ retriever.Initial (config->Find ("start_url"), 1);
+ }
+
+ // Handle list of URLs given on 'command-line'
+ if (myURL != NULL)
+ {
+ String str;
+ temp_URL_list = strdup(myURL);
+ temp_url = strtok(temp_URL_list, URL_SEPCHARS);
+ while (temp_url != NULL)
+ {
+ str = temp_url;
+ str.chop ("\r\n");
+ if (str.length () > 0)
+ retriever.Initial (str, 1);
+
+ temp_url = strtok(NULL, URL_SEPCHARS);
+ }
+ free(temp_URL_list);
+ }
+ else if (minimalFile.length () != 0)
+ {
+ FILE *input = fopen (minimalFile.get (), "r");
+ char buffer[1000];
+
+ if (input)
+ {
+ while (fgets (buffer, sizeof (buffer), input))
+ {
+ String str (buffer);
+ str.chop ("\r\n\t ");
+ if (str.length () > 0)
+ retriever.Initial (str, 1);
+ }
+ fclose (input);
+ }
+ }
+
+ //
+ // Go do it!
+ //
+ retriever.Start ();
+
+ //
+ // All done with parsing.
+ //
+
+ //
+ // If the user so wants, create a text version of the document database.
+ //
+
+ if (create_text_database)
+ {
+ const String doc_list = config->Find ("doc_list");
+ if (initial)
+ unlink (doc_list);
+ docs.DumpDB (doc_list);
+ const String word_dump = config->Find ("word_dump");
+ if (initial)
+ unlink (word_dump);
+ HtWordList words (*config);
+ if (words.Open (config->Find ("word_db"), O_RDONLY) == OK)
+ {
+ words.Dump (word_dump);
+ }
+ }
+
+ //
+ // Cleanup
+ //
+ if (images_seen)
+ fclose (images_seen);
+
+ //
+ // If needed, report some statistics
+ //
+ if (report_statistics)
+ {
+ retriever.ReportStatistics ("htdig");
+ }
+
+ return(TRUE);
+}
+
+
+/*******************************************************
+ *
+ * LIBHTDIG API FUNCTION
+ *
+ * int htdig_index_close(...)
+ *
+ * Closes the database and destroys various objects
+ *
+ * TODO Examine external function calls for error return
+ * codes
+ *
+ *******************************************************/
+int htdig_index_close(void)
+{
+ int ret = -1;
+
+ if(htdig_index_open_flag == TRUE)
+ {
+ //delete a_basicdoc;
+ //delete Indexer;
+
+ Indexer->FlushWordDB();
+
+ if (_cookie_jar)
+ delete _cookie_jar;
+
+ //if (max_hops != NULL)
+ // free(max_hops);
+
+ if (myURL != NULL)
+ free(myURL);
+
+ //call destructors here
+ docs.~DocumentDB();
+ //config->~HtConfiguration();
+
+ if (debug != 0)
+ {
+ ret = logClose();
+
+ if (ret == FALSE)
+ {
+ reportError (form ("[HTDIG] Error closing log file . Error:[%d], %s\n",
+ errno, strerror(errno)) );
+ return(HTDIG_ERROR_LOGFILE_CLOSE);
+ }
+ }
+
+ /*
+ if(config) {
+ WordContext::Finish();
+ }
+ */
+
+ if (wc)
+ delete wc;
+
+ if (urls_seen)
+ fclose (urls_seen);
+
+ htdig_index_open_flag = FALSE;
+ }
+
+ return(TRUE);
+}
+
+/*******************************************************
+ *
+ * LIBHTDIG API FUNCTION
+ *
+ * int htdig_index_reset(...)
+ *
+ *
+ * TODO Examine external function calls for error return
+ * codes
+ *
+ *******************************************************/
+
+int htdig_index_reset(void)
+{
+ Indexer->FlushWordDB();
+ a_basicdoc->Reset();
+
+ return(TRUE);
+}
+
+/*******************************************************
+ *
+ * LIBHTDIG API FUNCTION
+ *
+ * int htdig_get_max_head_length(...)
+ *
+ *
+ * Returns size of maximum document storage length
+ * for db.excerpts [htdig.conf:max_head_length]
+ *
+ * This represents the maximum amount of the document
+ * That will be available for excerpting.
+ *
+ *
+ *******************************************************/
+
+int htdig_get_max_head_length()
+{
+ int ret = -1;
+
+ if(config != NULL)
+ ret = config->Value("max_head_length");
+
+ return(ret);
+}
+
+/*******************************************************
+ *
+ * LIBHTDIG API FUNCTION
+ *
+ * int htdig_index_test_url(...)
+ *
+ *
+ * Test a URL for filter Pass/Fail
+ *
+ * Pass = return(TRUE)
+ * Fail = return(XXX) [Negative Value]
+ *
+ *
+ *
+ *
+ *
+ *******************************************************/
+
+
+//int htdig_index_test_url(htdig_parameters_struct *htdig_parms)
+int htdig_index_test_url(htdig_parameters_struct *htdig_parms)
+{
+ //int ret = FALSE;
+ String the_URL(htdig_parms->URL);
+ HtConfiguration* config= HtConfiguration::config();
+ Dictionary invalids;
+ Dictionary valids;
+ URL aUrl(the_URL);
+ String rewritten_url(the_URL);
+ StringList tmpList;
+ HtRegex limitTo;
+ HtRegex excludeFrom;
+
+ //initalize outgoing-parameter rewritten_URL
+ htdig_parms->rewritten_URL[0] = 0;
+
+#ifdef DEBUG
+ //output relevant config variables
+ cout << " bad_extensions = " << config->Find("bad_extensions") << endl;
+ cout << " valid_extensions = " << config->Find("valid_extensions") << endl;
+ cout << " exclude_urls = " << config->Find("exclude_urls") << endl;
+ cout << " bad_querystr = " << config->Find("bad_querystr") << endl;
+ cout << " limit_urls_to = " << config->Find("limit_urls_to") << endl;
+ cout << " limit_normalized = " << config->Find("limit_normalized") << endl;
+ cout << " restrict = " << config->Find("restrict") << endl;
+ cout << " exclude = " << config->Find("exclude") << endl;
+#endif
+
+ //------------ read the config file if it is given ---------------
+ if (htdig_parms->configFile[0] != 0)
+ configFile = htdig_parms->configFile;
+
+ config = HtConfiguration::config ();
+
+ config->Defaults (&defaults[0]);
+ if (access ((char *) configFile, R_OK) < 0)
+ {
+ reportError (form ("[HTDIG] Unable to find configuration file '%s'",
+ configFile.get ()));
+ return(HTDIG_ERROR_CONFIG_READ);
+ }
+ config->Read (configFile);
+
+ //---------- Now override config settings -----------------
+
+ //------- override database path ------------
+ if(strlen(htdig_parms->DBpath) > 0)
+ {
+ config->Add("database_dir", htdig_parms->DBpath);
+ }
+
+ //------- custom filters from htdig_parms ----------
+
+ if(strlen(htdig_parms->locale) > 0)
+ {
+ config->Add("locale", htdig_parms->locale);
+ }
+
+ if (config->Find ("locale").empty () && debug > 0)
+ logEntry("Warning: unknown locale!\n");
+
+ if (strlen(htdig_parms->max_hops) > 0)
+ {
+ config->Add ("max_hop_count", htdig_parms->max_hops);
+ }
+
+ if(strlen(htdig_parms->limit_urls_to) > 0)
+ {
+ config->Add("limit_urls_to", htdig_parms->limit_urls_to);
+ }
+
+ if(strlen(htdig_parms->limit_normalized) > 0)
+ {
+ config->Add("limit_normalized", htdig_parms->limit_normalized);
+ }
+
+ if(strlen(htdig_parms->exclude_urls) > 0)
+ {
+ config->Add("exclude_urls", htdig_parms->exclude_urls);
+ }
+
+ if(strlen(htdig_parms->url_rewrite_rules) > 0)
+ {
+ config->Add("url_rewrite_rules", htdig_parms->url_rewrite_rules);
+ }
+
+ if(strlen(htdig_parms->bad_querystr) > 0)
+ {
+ config->Add("bad_querystr", htdig_parms->bad_querystr);
+ }
+
+ if(strlen(htdig_parms->locale) > 0)
+ {
+ config->Add("locale", htdig_parms->locale);
+ }
+
+ if(strlen(htdig_parms->meta_description_factor) > 0)
+ {
+ config->Add("meta_description_factor", htdig_parms->meta_description_factor);
+ }
+
+ if(strlen(htdig_parms->title_factor) > 0)
+ {
+ config->Add("title_factor", htdig_parms->title_factor);
+ }
+
+ if(strlen(htdig_parms->text_factor) > 0)
+ {
+ config->Add("text_factor", htdig_parms->text_factor);
+ }
+
+ //-------------------------------------------------------------------
+
+#ifdef DEBUG
+ //output relevant config variables
+ cout << " bad_extensions = " << config->Find("bad_extensions") << endl;
+ cout << " valid_extensions = " << config->Find("valid_extensions") << endl;
+ cout << " exclude_urls = " << config->Find("exclude_urls") << endl;
+ cout << " bad_querystr = " << config->Find("bad_querystr") << endl;
+ cout << " limit_urls_to = " << config->Find("limit_urls_to") << endl;
+ cout << " limit_normalized = " << config->Find("limit_normalized") << endl;
+ cout << " restrict = " << config->Find("restrict") << endl;
+ cout << " exclude = " << config->Find("exclude") << endl;
+#endif
+
+
+ //------ bad_extensions -----------------------------------------------
+ //A list of bad extensions, separated by spaces or tabs
+
+ String t = config->Find("bad_extensions");
+ String lowerp;
+ char *p = strtok(t, " \t");
+ while (p)
+ {
+ // Extensions are case insensitive
+ lowerp = p;
+ lowerp.lowercase();
+ invalids.Add(lowerp, 0);
+ p = strtok(0, " \t");
+ }
+
+
+ //------ valid_extensions ------------------------------------------------
+ // Valid extensions are performed similarly
+ // A list of valid extensions, separated by spaces or tabs
+
+ t = config->Find("valid_extensions");
+ p = strtok(t, " \t");
+ while (p)
+ {
+ // Extensions are case insensitive
+ lowerp = p;
+ lowerp.lowercase();
+ valids.Add(lowerp, 0);
+ p = strtok(0, " \t");
+ }
+
+ //----- rewrite the URL------------------------------------------
+ aUrl.rewrite();
+ rewritten_url = aUrl.get();
+
+ if(rewritten_url.length() <= 0)
+ {
+ //Rejected: empty rewritten URL
+ String temp = config->Find("url_rewrite_rules");
+ strcpy(htdig_parms->rewritten_URL, temp.get());
+ system(form("echo \"%s\" > /tmp/neal", temp.get()));
+
+ return(HTDIG_ERROR_TESTURL_REWRITE_EMPTY);
+ }
+
+ //cout << form("TestURL: org=[%s]\n", the_URL.get());
+ //cout << form(" rewritten[%s]\n", rewritten_url.get());
+
+ //copy the rewritten URL for outgoing parm pass
+ strcpy(htdig_parms->rewritten_URL, rewritten_url.get());
+
+ //---- exclude_urls ---------------------------------------------
+ // If the URL contains any of the patterns in the exclude list,
+ // mark it as invalid
+
+ /*if(strlen(htdig_parms->exclude_urls) > 0)
+ tmpList.Create(htdig_parms->exclude_urls," \t");
+ else*/
+ tmpList.Create(config->Find("exclude_urls")," \t");
+
+ HtRegexList excludes;
+ excludes.setEscaped(tmpList, config->Boolean("case_sensitive"));
+ if (excludes.match(rewritten_url, 0, 0) != 0)
+ {
+ //Rejected: item in exclude list
+ return(HTDIG_ERROR_TESTURL_EXCLUDE);
+ }
+
+ //---- bad_querystr -------------------------------------------
+ // If the URL has a query string and it is in the bad query list
+ // mark it as invalid
+
+ tmpList.Destroy();
+
+ /*if(strlen(htdig_parms->bad_querystr) > 0)
+ tmpList.Create(htdig_parms->bad_querystr, " \t");
+ else*/
+ tmpList.Create(config->Find("bad_querystr"), " \t");
+
+ HtRegexList badquerystr;
+ badquerystr.setEscaped(tmpList, config->Boolean("case_sensitive"));
+ char *ext = strrchr((char*)rewritten_url, '?');
+ if (ext && badquerystr.match(ext, 0, 0) != 0)
+ {
+ //if (debug > 2)
+ // cout << endl << " Rejected: item in bad query list ";
+ return(HTDIG_ERROR_TESTURL_BADQUERY);
+ }
+
+ //------ invalid_extensions #2 ------
+ // See if the file extension is in the list of invalid ones
+
+ ext = strrchr((char*)rewritten_url, '.');
+ String lowerext;
+ if (ext && strchr(ext,'/')) // Ignore a dot if it's not in the
+ ext = NULL; // final component of the path.
+ if(ext)
+ {
+ lowerext.set(ext);
+ int parm = lowerext.indexOf('?'); // chop off URL parameter
+ if (parm >= 0)
+ lowerext.chop(lowerext.length() - parm);
+ lowerext.lowercase();
+ if (invalids.Exists(lowerext))
+ {
+ //Rejected: Extension is invalid!
+ return(HTDIG_ERROR_TESTURL_EXTENSION);
+ }
+ }
+
+ //------ valid_extensions #2 ------
+ // Or NOT in the list of valid ones
+
+ if (ext && valids.Count() > 0 && !valids.Exists(lowerext))
+ {
+ //Rejected: Extension is not valid!
+ return(HTDIG_ERROR_TESTURL_EXTENSION2);
+ }
+
+ //----- limit_urls_to & limit_normalized ------------------------------
+ // Set up the limits list
+
+ StringList l;
+ /*if(strlen(htdig_parms->limit_urls_to) > 0)
+ l.Create(htdig_parms->limit_urls_to, " \t");
+ else*/
+ l.Create(config->Find ("limit_urls_to"), " \t");
+
+ limits.setEscaped (l, config->Boolean ("case_sensitive"));
+
+ l.Destroy ();
+
+ /*if(strlen(htdig_parms->limit_normalized) > 0)
+ l.Create (htdig_parms->limit_normalized, " \t");
+ else*/
+ l.Create (config->Find ("limit_normalized"), " \t");
+
+ limitsn.setEscaped (l, config->Boolean ("case_sensitive"));
+ l.Destroy ();
+
+ // If any of the limits are met, we allow the URL
+ if (limits.match(rewritten_url, 1, 0) == 0)
+ {
+ //Rejected: URL not in the limits!;
+ return(HTDIG_ERROR_TESTURL_LIMITS);
+ }
+
+
+ // or not in list of normalized urls
+ // Warning! should be last in checks because of aUrl normalization
+ aUrl.normalize();
+ if (limitsn.match(rewritten_url.get(), 1, 0) == 0)
+ {
+ //Rejected: not in "limit_normalized" list!
+ return(HTDIG_ERROR_TESTURL_LIMITSNORM);
+ }
+
+ //----- restrict & exclude ----------------------------------
+ //Search-Time Filters
+
+ String temp;
+
+ /*if(strlen(htdig_parms->search_restrict) > 0)
+ temp = htdig_parms->search_restrict;
+ else*/
+ temp = config->Find("restrict");
+
+ if (temp.length())
+ {
+ // Create a temporary list from either the configuration
+ // file or the input parameter
+ StringList l(temp, " \t\r\n\001|");
+ limitTo.setEscaped(l);
+ }
+
+ /*if(strlen(htdig_parms->search_exclude) > 0)
+ temp = htdig_parms->search_exclude;
+ else*/
+ temp = config->Find("exclude");
+
+ if (temp.length())
+ {
+ // Create a temporary list from either the configuration
+ // file or the input parameter
+ StringList l(temp, " \t\r\n\001|");
+ excludeFrom.setEscaped(l);
+ }
+
+ //Restrict Test
+ if (limitTo.match(rewritten_url, 1, 0) == 0)
+ {
+ //Rejected URL Not in SearchTime Restrict List
+ return(HTDIG_ERROR_TESTURL_SRCH_RESTRICT);
+ }
+ //Exclude Test
+ if (excludeFrom.match(rewritten_url, 0, 0) != 0)
+ {
+ //Rejected URL in SearchTime Exclude List
+ return(HTDIG_ERROR_TESTURL_SRCH_EXCLUDE);
+ }
+
+
+ //Success!
+ return TRUE;
+}