//----------------------------------------------------------------
//
// libhtdig_api.h
//
// Header file for the htdig shared library API
//
// 1/25/2002 created
//
// Neal Richter nealr@rightnow.com
//
// Part of the ht://Dig package
// Copyright (c) 1995-2004 The ht://Dig Group
// For copyright details, see the file COPYING in your distribution
// or the GNU Library General Public License (LGPL) version 2 or later
//
//
// $Id: libhtdig_api.h,v 1.4 2004/05/28 13:15:29 lha Exp $
//
//----------------------------------------------------------------
#ifndef LIBHTDIG_API_H
#define LIBHTDIG_API_H
#include <time.h>
// Boolean fallbacks: TRUE/FALSE are supplied only when nothing included
// earlier (or passed on the compiler command line) has defined them.
#if !defined(TRUE)
#  define TRUE 1
#endif
#if !defined(FALSE)
#  define FALSE 0
#endif
// ---- Array sizes (in chars) of the fixed string buffers in the API structs ----
#define HTDIG_MAX_FILENAME_PATH_L       1024    // file paths, URLs and URL filter strings
#define HTDIG_DOCUMENT_ID_L             32      // htdig_simple_doc_struct.documentid
#define HTDIG_DOCUMENT_TITLE_L          256     // document titles
#define HTDIG_DOCUMENT_META_L           4096    // htdig_simple_doc_struct.meta
#define HTDIG_DOCUMENT_CONTENT_TYPE_L   32      // htdig_simple_doc_struct.content_type

// Excerpt buffer of htsearch_query_match_struct.
// Keep this larger than the config attribute 'excerpt_length'.
#define HTDIG_DOCUMENT_EXCERPT_L        1024

// Default failsafe size of an 'excerpt' document.
// Keep this larger than the config attribute 'max_head_length'.
#define HTDIG_DEFAULT_EXCERPT_SIZE      524288

// "Should be the same as the default value in HTDIG."
// NOTE(review): this remark sits between HTDIG_DEFAULT_EXCERPT_SIZE and
// HTDIG_MAX_QUERY_L in the original; confirm which constant it describes.
#define HTDIG_MAX_QUERY_L               256     // htsearch_query_struct.raw_query

// Custom MIME-like content type string for htdig_simple_doc_struct.content_type.
#define HTDIG_CUSTOM_TEXT_MIME_TYPE     "text/vnd.customdocument"
// htfuzzy algorithm selectors.
// The numeric values are the bitwise flags documented for
// htfuzzy_parameters_struct.algorithms_flag; every flag is paired with the
// algorithm's name as a string (*_STR). Listed in ascending flag value.
#define HTDIG_ALG_SOUNDEX           0x00000001
#define HTDIG_ALG_SOUNDEX_STR       "soundex"

#define HTDIG_ALG_METAPHONE         0x00000010
#define HTDIG_ALG_METAPHONE_STR     "metaphone"

#define HTDIG_ALG_ACCENTS           0x00000100
#define HTDIG_ALG_ACCENTS_STR       "accents"

#define HTDIG_ALG_ENDINGS           0x00001000
#define HTDIG_ALG_ENDINGS_STR       "endings"

#define HTDIG_ALG_SYNONYMS          0x00010000
#define HTDIG_ALG_SYNONYMS_STR      "synonyms"
// Searching: match methods for htsearch_query_struct.algorithms_flag
// (called 'method' in htdig). Each flag is paired with its name as a
// string (*_STR). Listed in ascending flag value.
#define HTSEARCH_ALG_BOOLEAN        0x00000001
#define HTSEARCH_ALG_BOOLEAN_STR    "boolean"

#define HTSEARCH_ALG_OR             0x00000010
#define HTSEARCH_ALG_OR_STR         "or"

#define HTSEARCH_ALG_AND            0x00000100
#define HTSEARCH_ALG_AND_STR        "and"
// Result formats for htsearch_query_struct.format:
// long (with excerpt) or short.
#define HTSEARCH_FORMAT_LONG        0x00000001
#define HTSEARCH_FORMAT_LONG_STR    "long"

#define HTSEARCH_FORMAT_SHORT       0x00000010
#define HTSEARCH_FORMAT_SHORT_STR   "short"
// Sort orders for htsearch_query_struct.sortby_flag: score, time and
// title, each with a reversed variant. Every flag is paired with its name
// as a string (*_STR).

// by score
#define HTSEARCH_SORT_SCORE             0x00000001
#define HTSEARCH_SORT_SCORE_STR         "score"
#define HTSEARCH_SORT_REV_SCORE         0x00000010
#define HTSEARCH_SORT_REV_SCORE_STR     "reverse score"

// by document time
#define HTSEARCH_SORT_TIME              0x00000100
#define HTSEARCH_SORT_TIME_STR          "time"
#define HTSEARCH_SORT_REV_TIME          0x00001000
#define HTSEARCH_SORT_REV_TIME_STR      "reverse time"

// by title
#define HTSEARCH_SORT_TITLE             0x00010000
#define HTSEARCH_SORT_TITLE_STR         "title"
#define HTSEARCH_SORT_REV_TITLE         0x00100000
#define HTSEARCH_SORT_REV_TITLE_STR     "reverse title"
// Error codes of the indexing (htdig) part of the API, range -101..-118.
// NOTE(review): presumably returned by the htdig_index_* functions, with
// the TESTURL_* codes coming from htdig_index_test_url() - confirm against
// the implementation.
// Each negative value is parenthesized so the macro always expands to a
// single primary expression, whatever expression it is used in.
#define HTDIG_ERROR_CONFIG_READ             (-101)
#define HTDIG_ERROR_URL_PART                (-102)
#define HTDIG_ERROR_URL_REWRITE             (-103)
#define HTDIG_ERROR_URL_CREATE_FILE         (-104)
#define HTDIG_ERROR_IMAGE_CREATE_FILE       (-105)
#define HTDIG_ERROR_OPEN_CREATE_DOCDB       (-106)
#define HTDIG_ERROR_LOGFILE_OPEN            (-107)
#define HTDIG_ERROR_LOGFILE_CLOSE           (-108)
#define HTDIG_ERROR_TESTURL_EXCLUDE         (-109)
#define HTDIG_ERROR_TESTURL_BADQUERY        (-110)
#define HTDIG_ERROR_TESTURL_EXTENSION       (-111)
#define HTDIG_ERROR_TESTURL_EXTENSION2      (-112)
#define HTDIG_ERROR_TESTURL_LIMITS          (-113)
#define HTDIG_ERROR_TESTURL_LIMITSNORM      (-114)
#define HTDIG_ERROR_TESTURL_SRCH_RESTRICT   (-115)
#define HTDIG_ERROR_TESTURL_SRCH_EXCLUDE    (-116)
#define HTDIG_ERROR_TESTURL_REWRITE_EMPTY   (-117)
#define HTDIG_ERROR_TESTURL_ROBOT_FORBID    (-118)
// Error codes of the searching (htsearch) part of the API, range -201..-212.
// NOTE(review): presumably returned by the htsearch_* functions - confirm
// against the implementation.
// Each negative value is parenthesized so the macro always expands to a
// single primary expression, whatever expression it is used in.
#define HTSEARCH_ERROR_NO_MATCH             (-201)
#define HTSEARCH_ERROR_BAD_MATCH_INDEX      (-202)
#define HTSEARCH_ERROR_BAD_DOCUMENT         (-203)
#define HTSEARCH_ERROR_TEMPLATE_ERROR       (-204)
#define HTSEARCH_ERROR_LOGFILE_OPEN         (-205)
#define HTSEARCH_ERROR_LOGFILE_CLOSE        (-206)
#define HTSEARCH_ERROR_CONFIG_READ          (-207)
#define HTSEARCH_ERROR_URL_PART             (-208)
#define HTSEARCH_ERROR_WORDDB_READ          (-209)
#define HTSEARCH_ERROR_DOCINDEX_READ        (-210)
#define HTSEARCH_ERROR_DOCDB_READ           (-211)
#define HTSEARCH_ERROR_EXCERPTDB_READ       (-212)
// Error codes of the merging (htmerge) part of the API, range -301..-308.
// NOTE(review): presumably returned by htmerge_index_merge() - confirm
// against the implementation.
// Each negative value is parenthesized so the macro always expands to a
// single primary expression, whatever expression it is used in.
#define HTMERGE_ERROR_LOGFILE_OPEN          (-301)
#define HTMERGE_ERROR_LOGFILE_CLOSE         (-302)
#define HTMERGE_ERROR_CONFIG_READ           (-303)
#define HTMERGE_ERROR_URL_PART              (-304)
#define HTMERGE_ERROR_WORDDB_READ           (-305)
#define HTMERGE_ERROR_DOCINDEX_READ         (-306)
#define HTMERGE_ERROR_DOCDB_READ            (-307)
#define HTMERGE_ERROR_EXCERPTDB_READ        (-308)
// Parameter-name strings.
// NOTE(review): judging by the PHP_ prefix these are keys used by a PHP
// binding; most match field names of htdig_parameters_struct - confirm
// against the caller.
#define PHP_HTDIG_CONFIGFILE_PARM       "configFile"
#define PHP_HTDIG_URL_PARM              "URL"
#define PHP_HTDIG_LIMITTO_PARM          "limit_urls_to"
#define PHP_HTDIG_LIMITN_PARM           "limit_normalized"
#define PHP_HTDIG_EXCLUDEURLS_PARM      "exclude_urls"
#define PHP_HTDIG_SEARCHRESTRICT_PARM   "search_restrict"
#define PHP_HTDIG_SEARCHEXCLUDE_PARM    "search_exclude"
// NOTE(review): "max_hop_cont" differs from the struct field name
// max_hop_count. The literal is left untouched because existing callers
// may pass exactly this key - confirm before renaming.
#define PHP_HTDIG_MAXHOPCOUNT_PARM      "max_hop_cont"
#define PHP_HTDIG_URLREWRITE_PARM       "url_rewrite_rules"
#define PHP_HTDIG_BAD_QUERYSTR_PARM     "bad_querystr"
//=============================================================================
//===== HTDIG INDEXING API ====================================================
/***************************************************
* HTDIG_DOCUMENTATION for htdig_parameters_struct
*
* DEBUGGING PARAMETERS
*
* int debug
* Verbose mode. This increases the verbosity of the
* program. Using more than 2 is probably only useful
* for debugging purposes. The default verbose mode
* gives a nice progress report while digging.
*
* char logFile
* File to stream debugging & error messages to!
*
* BOOLEAN PARAMETERS
*
* int initial
* Initial. Do not use any old databases. This is
* accomplished by first erasing the databases
*
* int create_text_database
* Create an ASCII version of the document database.
* This database is easy to parse with other programs so
* that information can be extracted from it.
*
* int report_statistics
* Report statistics after completion.
*
* int alt_work_area
* Use alternate work files.
* Tells htdig to append .work to database files, causing
* a second copy of the database to be built. This allows
* the original files to be used by htsearch during the
* indexing run.
*
*
* STRING PARAMETERS
*
* char configFile
* configfile
* Use the specified configuration file instead of the
* default.
*
* char credentials
* username:password
* Tells htdig to send the supplied username and
* password with each HTTP request. The credentials
* will be encoded using the 'Basic' authentication scheme.
* There *HAS* to be a colon (:) between the username
* and password.
*
*
* char max_hops //9 digit limit
* hopcount
* Limit the stored documents to those which are at
* most hopcount links away from the start URL.
*
* char minimalFile
*
* char URL
* 'command-line' URLs from stdin
* fetches & indexes these URLs
*
******************************************************************/
// Parameter block handed to htdig_index_open() and htdig_index_test_url().
// Member order defines the struct layout shared with existing callers, so
// members must not be rearranged.
typedef struct htdig_parameters_struct {
    // --- files, credentials, crawl depth ---
    char configFile[HTDIG_MAX_FILENAME_PATH_L];         // config file used instead of the default
    char DBpath[HTDIG_MAX_FILENAME_PATH_L];             // NOTE(review): not documented in this header; presumably the database location - confirm
    char credentials[HTDIG_MAX_FILENAME_PATH_L];        // "username:password" sent with each HTTP request
    char max_hops[10];                                  // hop-count limit as text; 9 digit limit
    char minimalFile[HTDIG_MAX_FILENAME_PATH_L];        // NOTE(review): purpose not documented in this header

    // --- debugging & logfile ---
    char logFile[HTDIG_MAX_FILENAME_PATH_L];            // location of log file
    int debug;                                          // verbosity: 0, 1, 2, 3, 4, 5

    // --- boolean values ---
    int initial;                                        // do not use any old databases (they are erased first)
    int create_text_database;                           // also create an ASCII version of the document database
    int report_statistics;                              // report statistics after completion
    int alt_work_area;                                  // append .work to database files (alternate work files)
    int use_cookies;                                    // NOTE(review): not documented in this header

    // --- spidering filters ---
    // NOTE(review): apart from URL, the names below look like htdig config
    // attribute names; their exact semantics are not shown here - confirm.
    char URL[HTDIG_MAX_FILENAME_PATH_L];                // URL(s) to fetch & index
    char limit_urls_to[HTDIG_MAX_FILENAME_PATH_L];
    char limit_normalized[HTDIG_MAX_FILENAME_PATH_L];
    char exclude_urls[HTDIG_MAX_FILENAME_PATH_L];
    char search_restrict[HTDIG_MAX_FILENAME_PATH_L];
    char search_exclude[HTDIG_MAX_FILENAME_PATH_L];
    char url_rewrite_rules[HTDIG_MAX_FILENAME_PATH_L];
    char bad_querystr[HTDIG_MAX_FILENAME_PATH_L];

    // --- locale and weighting factors, passed as short strings ---
    char locale[16];
    char title_factor[16];
    char text_factor[16];
    char meta_description_factor[16];

    // NOTE(review): numeric hop limit alongside the textual max_hops above;
    // which of the two is honoured is not visible from this header.
    int max_hop_count;

    // OUTGOING: the rewritten URL, filled in after htdig_index_test_url()
    char rewritten_URL[HTDIG_MAX_FILENAME_PATH_L];
} htdig_parameters_struct;
/*****************************************************************
* HTDIG_DOCUMENTATION for htdig_simple_doc_struct
*
* STRING PARAMETERS
*
* char location
* the 'URL' of the document. Can be any useful string.
*
* char documentid
* document id of document [NOT CURRENTLY USED - IGNORED]
*
* char title
* document title
*
* char meta
* content that is indexed but won't appear in any search excerpts
*
* char * contents
* pointer to a NUL-terminated string of information to be
* indexed.
*
* char content_type
* a MIME-like string
* custom MIME-type defined above, others are supported by
* htdig as well.
*
*
*****************************************************************/
// One caller-supplied document, passed to htdig_index_simple_doc().
// Member order defines the struct layout and must not be rearranged.
typedef struct htdig_simple_doc_struct {
    char location[HTDIG_MAX_FILENAME_PATH_L];           // the 'URL' of the document; any useful string
    char documentid[HTDIG_DOCUMENT_ID_L];               // document id [NOT CURRENTLY USED - IGNORED]
    char title[HTDIG_DOCUMENT_TITLE_L];                 // document title
    char meta[HTDIG_DOCUMENT_META_L];                   // indexed text that is kept out of search excerpts

    // Text to be indexed, as a NUL-terminated string.
    // The caller MUST ALLOCATE & FREE this buffer.
    char *contents;

    char content_type[HTDIG_DOCUMENT_CONTENT_TYPE_L];   // MIME-ish string, e.g. HTDIG_CUSTOM_TEXT_MIME_TYPE

    // A 'struct tm time_tm' member (to override the index time) is disabled.
    // NOTE(review): doc_time is presumably the document's timestamp - confirm.
    time_t doc_time;
} htdig_simple_doc_struct;
// Indexing entry points.
// Every parameter is named so these prototypes match the style already
// used by htdig_index_test_url().
// NOTE(review): the int results are presumably status codes, with the
// negative HTDIG_ERROR_* values signalling failure - confirm against the
// implementation.
int htdig_index_open(htdig_parameters_struct *htparms);
int htdig_index_simple_doc(htdig_simple_doc_struct *doc);
int htdig_index_urls(void);
int htdig_index_reset(void);
int htdig_index_close(void);
// On return htparms->rewritten_URL carries the rewritten URL (see the
// comment on that field).
int htdig_index_test_url(htdig_parameters_struct *htparms);
int htdig_get_max_head_length(void);
//=============================================================================
//===== HTDIG MERGING API =====================================================
/**************************************************
* HTDIG_DOCUMENTATION for htmerge_parameters_struct
*
* DEBUGGING PARAMETERS
*
* int debug
* Verbose mode. This increases the verbosity of the
* program. Using more than 2 is probably only useful
* for debugging purposes. The default verbose mode
* gives a progress on what it is doing and where it is.
*
* char logFile
* File to stream debugging & error messages to!
*
*
* BOOLEAN PARAMETERS
*
* int alt_work_area
* Use alternate work files.
* Tells htmerge to append .work to database files causing
* a second copy of the database to be built. This allows
* original files to be used by htsearch during the indexing run.
*
*
* STRING PARAMETERS
*
* char configFile
* configfile
* Use the specified configuration file instead of the default.
*
* char merge_configFile
* merge_configfile
* Merge the databases specified into the databases specified
* by -c or the default.
*
*
*************************************************/
// Parameter block for htmerge_index_merge().
// Member order defines the struct layout and must not be rearranged.
typedef struct htmerge_parameters_struct {
    char configFile[HTDIG_MAX_FILENAME_PATH_L];         // config file used instead of the default
    char merge_configFile[HTDIG_MAX_FILENAME_PATH_L];   // config naming the databases to merge in

    // --- debugging & logfile ---
    char logFile[HTDIG_MAX_FILENAME_PATH_L];            // location of log file
    int debug;                                          // verbosity: 0, 1, 2, 3, 4, 5

    // --- boolean values ---
    int alt_work_area;                                  // append .work to database files (alternate work files)
} htmerge_parameters_struct;
// Merge entry point; takes the parameter block described above.
// NOTE(review): the int result is presumably a status code, with the
// negative HTMERGE_ERROR_* values signalling failure - confirm against the
// implementation.
int htmerge_index_merge(htmerge_parameters_struct *htparms);
//=============================================================================
//===== HTDIG HTFUZZY API =====================================================
/**************************************************
* HTDIG_DOCUMENTATION for htfuzzy_parameters_struct
*
* DEBUGGING PARAMETERS
*
* int debug
* Verbose mode. This increases the verbosity of the
* program. Using more than 2 is probably only useful
* for debugging purposes.
*
* char logFile
* File to stream debugging & error messages to!
*
*
* PARAMETERS
*
* char configFile
* configfile
* Use the specified configuration file instead of the default.
*
* int algorithms_flag
* Bitwise Flags to signal algorithms to be used
*
* soundex == HTDIG_ALG_SOUNDEX
* metaphone == HTDIG_ALG_METAPHONE
* accents == HTDIG_ALG_ACCENTS
* endings == HTDIG_ALG_ENDINGS
* synonyms == HTDIG_ALG_SYNONYMS
*
***************************************************/
// Parameter block for htfuzzy_index().
// Member order defines the struct layout and must not be rearranged.
typedef struct htfuzzy_parameters_struct {
    char configFile[HTDIG_MAX_FILENAME_PATH_L];         // config file used instead of the default
    int algorithms_flag;                                // bitwise HTDIG_ALG_* flags selecting the algorithms

    // --- debugging & logfile ---
    char logFile[HTDIG_MAX_FILENAME_PATH_L];            // location of log file
    int debug;                                          // verbosity: 0, 1, 2, 3, 4, 5

    // (no boolean values are defined for htfuzzy)
} htfuzzy_parameters_struct;
// htfuzzy functions
// Takes the parameter block described above; htparms->algorithms_flag
// carries the HTDIG_ALG_* selection.
// NOTE(review): meaning of the int result is not shown in this header -
// confirm against the implementation.
int htfuzzy_index(htfuzzy_parameters_struct *htparms);
//==============================================================================
//===== HTDIG SEARCHING API ====================================================
/************************************************
* HTDIG_DOCUMENTATION for htsearch_parameters_struct
*
* DEBUGGING PARAMETERS
*
* int debug
* Verbose mode. This increases the verbosity of the
* program. Using more than 2 is probably only useful
* for debugging purposes. The default verbose mode
* gives a progress report on what it is doing and where it is.
*
* char logFile
* File to stream debugging & error messages to!
*
* STRING PARAMETERS
*
* char configFile
* configfile
* Use the specified configuration file instead of the default.
*
*
**************************************************/
// Parameter block for htsearch_open().
// Member order defines the struct layout and must not be rearranged.
typedef struct htsearch_parameters_struct {
    char configFile[HTDIG_MAX_FILENAME_PATH_L];         // config file used instead of the default
    char DBpath[HTDIG_MAX_FILENAME_PATH_L];             // NOTE(review): not documented in this header; presumably the database location - confirm
    char locale[16];

    // --- debugging & logfile ---
    char logFile[HTDIG_MAX_FILENAME_PATH_L];            // location of log file
    int debug;                                          // verbosity: 0, 1, 2, 3, 4, 5

    // --- filters ---
    // NOTE(review): presumably URL patterns that results must match
    // (restrict) or must not match (exclude) - confirm.
    char search_restrict[HTDIG_MAX_FILENAME_PATH_L];
    char search_exclude[HTDIG_MAX_FILENAME_PATH_L];

    // --- weighting factors, passed as short strings ---
    char title_factor[16];
    char text_factor[16];
    char meta_description_factor[16];
} htsearch_parameters_struct;
/*****************************************************************
* HTDIG_DOCUMENTATION for htsearch_query_struct
*
* STRING PARAMETERS
*
* char raw_query
* STRING of text that is the search query -- syntax is important
*
* INTEGER PARAMETERS
*
* int algorithms_flag [ALSO CALLED 'method' IN HTDIG]
* HTSEARCH_ALG_BOOLEAN
* HTSEARCH_ALG_OR
* HTSEARCH_ALG_AND
*
* int sortby_flag
* score, date, title & reversed
* HTSEARCH_SORT_SCORE
* HTSEARCH_SORT_REV_SCORE
* HTSEARCH_SORT_TIME
* HTSEARCH_SORT_REV_TIME
* HTSEARCH_SORT_TITLE
* HTSEARCH_SORT_REV_TITLE
*
* int format
* short, long (with excerpt)
* HTSEARCH_FORMAT_LONG
* HTSEARCH_FORMAT_SHORT
*
*
*
* TODO: 'Connect' these htsearch features to this API
*
* config
* Specifies the name of the configuration file.
*
* exclude
* This value is a pattern that specifies which URLs are to be excluded from
* the search results.
*
* keywords
* Used to specify a list of required words that have to be in the documents.
*
* restrict
* This value is a pattern that all URLs of the search results will have to
* match.
*
* startyear, startmonth, startday, endyear, endmonth, endday
* These values specify the allowed range of document modification dates
* allowed in the search results.
*
*
*
*****************************************************************/
// A single search request, passed to htsearch_query().
// Member order defines the struct layout and must not be rearranged.
typedef struct htsearch_query_struct {
    char raw_query[HTDIG_MAX_QUERY_L];  // query text -- syntax is important
    int algorithms_flag;                // HTSEARCH_ALG_* ('method' in htdig)
    int sortby_flag;                    // HTSEARCH_SORT_*
    int format;                         // HTSEARCH_FORMAT_LONG or HTSEARCH_FORMAT_SHORT
} htsearch_query_struct;
/*****************************************************************
* HTDIG_DOCUMENTATION for htsearch_query_match_struct
*
* STRING PARAMETERS
*
* char title
* Title of document returned
*
* char URL
* URL/location-string of document returned
*
* char excerpt
* Excerpt with search words highlighted with
* <strong>searchword</strong>
*
* INTEGER PARAMETERS
*
* int score
* score in 'number of stars'
* [MAX NUMBER OF STARS DECLARED IN CONFIG FILE]
*
* int score_percent //top result is 100%
*
* time_t time [DOCUMENT TIME]
* struct tm time_tm [DOCUMENT TIME]
* int size [TOTAL DOCUMENT SIZE]
*
*
*****************************************************************/
// One search hit, filled in through htsearch_get_nth_match().
// Member order defines the struct layout and must not be rearranged.
typedef struct htsearch_query_match_struct {
    char title[HTDIG_DOCUMENT_TITLE_L];         // title of the returned document
    char URL[HTDIG_MAX_FILENAME_PATH_L];        // URL/location string of the returned document
    char excerpt[HTDIG_DOCUMENT_EXCERPT_L];     // excerpt with the search words highlighted
    int score;                                  // score in 'number of stars' (max set in the config file)
    int score_percent;                          // top result is 100%
    struct tm time_tm;                          // document time
    int size;                                   // total document size
} htsearch_query_match_struct;
// htsearch functions
// Parameters are named, and the parameterless functions are declared with
// (void): in C an empty list "()" leaves the arguments unspecified instead
// of declaring a prototype, unlike the (void) used by the indexing API.
// NOTE(review): the int results are presumably status codes, with the
// negative HTSEARCH_ERROR_* values signalling failure - confirm against the
// implementation.
int htsearch_open(htsearch_parameters_struct *htparms);
int htsearch_query(htsearch_query_struct *query);
// n selects which match to copy into *match.
// NOTE(review): whether n is 0- or 1-based is not visible here - confirm.
int htsearch_get_nth_match(int n, htsearch_query_match_struct *match);
int htsearch_close(void);
//htsearch_free(indicator)
// NOTE(review): ownership of the returned string is not documented here;
// confirm before freeing or modifying it.
char *htsearch_get_error(void);
#endif /* LIBHTDIG_API_H */