diff options
author | Slávek Banko <slavek.banko@axis.cz> | 2021-11-05 13:28:23 +0100 |
---|---|---|
committer | Slávek Banko <slavek.banko@axis.cz> | 2021-11-05 13:28:23 +0100 |
commit | 8c787c3591c1c885b91a54128835b400858c5cca (patch) | |
tree | eca1b776912a305c4d45b3964038278a2fae1ead /debian/htdig/htdig-3.2.0b6/contrib/doc2html/doc2html.pl | |
parent | fe188b907cdf30dfdfe0eba9412e7f8749fec158 (diff) | |
download | extra-dependencies-8c787c3591c1c885b91a54128835b400858c5cca.tar.gz extra-dependencies-8c787c3591c1c885b91a54128835b400858c5cca.zip |
DEB htdig: Added to repository.
Signed-off-by: Slávek Banko <slavek.banko@axis.cz>
Diffstat (limited to 'debian/htdig/htdig-3.2.0b6/contrib/doc2html/doc2html.pl')
-rwxr-xr-x | debian/htdig/htdig-3.2.0b6/contrib/doc2html/doc2html.pl | 676 |
1 files changed, 676 insertions, 0 deletions
#!/usr/bin/perl
use strict;
use warnings;
#
# doc2html.pl
# (debian/htdig/htdig-3.2.0b6/contrib/doc2html/doc2html.pl)
#
# Version 3.0.1 19-September-2002
#
# External converter for htdig 3.1.4 or later (Perl5 or later)
# Usage: (in htdig.conf)
#
#external_parsers: application/rtf->text/html /opt/local/htdig-3.1.6/scripts/doc2html.pl \
#    text/rtf->text/html /opt/local/htdig-3.1.6/scripts/doc2html.pl \
#    application/pdf->text/html /opt/local/htdig-3.1.6/scripts/doc2html.pl \
#    application/postscript->text/html /opt/local/htdig-3.1.6/scripts/doc2html.pl \
#    application/msword->text/html /opt/local/htdig-3.1.6/scripts/doc2html.pl \
#    application/wordperfect5.1->text/html /opt/local/htdig-3.1.6/scripts/doc2html.pl \
#    application/wordperfect6.0->text/html /opt/local/htdig-3.1.6/scripts/doc2html.pl \
#    application/msexcel->text/html /opt/local/htdig-3.1.6/scripts/doc2html.pl \
#    application/vnd.ms-excel->text/html /opt/local/htdig-3.1.6/scripts/doc2html.pl \
#    application/vnd.ms-powerpoint->text/html /opt/local/htdig-3.1.6/scripts/doc2html.pl \
#    application/x-shockwave-flash->text/html /opt/local/htdig-3.1.6/scripts/doc2html.pl \
#    application/x-shockwave-flash2-preview->text/html /opt/local/htdig-3.1.6/scripts/doc2html.pl
#
# Command line: doc2html.pl <input file> <MIME type> [<URL>]
#
# Environment:
#   DOC2HTML_LOG       if set, messages are appended to this file
#   DOC2HTML_IP_LIMIT  largest input file processed (default 20000000 bytes)
#   DOC2HTML_OP_LIMIT  output limit returned to htdig (default 10000000 bytes)
#
# Uses wp2html to convert Word and WordPerfect documents into HTML, and
# falls back to using Catdoc for Word and Catwpd for WordPerfect if
# Wp2html is unavailable or unable to convert.
#
# Uses range of other converters as available.
#
# If all else fails, attempts to read file without conversion.
#
########################################################################################
# Written by David Adams <d.j.adams@soton.ac.uk>.
# Based on conv_doc.pl written by Gilles Detillieux <grdetil@scrc.umanitoba.ca>,
# which in turn was based on the parse_word_doc.pl script, written by
# Jesse op den Brouw <MSQL_User@st.hhs.nl>.
########################################################################################

# Install Sys::AlarmCall if you can.
# (Optional: string eval so that a missing module is not a compile error;
# run() checks at run time whether alarm_call() became available.)
eval "use Sys::AlarmCall";

######## Full paths of conversion utilities ##########
######## YOU MUST SET THESE                 ##########
######## (leave null those you don't have)  ##########

# Wp2html converts Word & Wordperfect to HTML
# (get it from: http://www.res.bbsrc.ac.uk/wp2html/):
my $WP2HTML = '';

# Catwpd for WordPerfect to text conversion
# (you don't need this if you have wp2html)
# (get it from htdig site)
my $CATWPD = '';

# rtf2html converts Rich Text Format documents to HTML
# (get it from http://www.ice.ru/~vitus/catdoc/):
my $RTF2HTML = '';

# Catdoc converts Word (MicroSoft) to plain text
# (get it from: http://www.ice.ru/~vitus/catdoc/):

# version of catdoc for Word6, Word7 & Word97 files:
my $CATDOC = '';

# version of catdoc for Word2 files:
my $CATDOC2 = $CATDOC;

# version of catdoc for Word 5.1 for MAC:
my $CATDOCM = $CATDOC;

# PostScript to text converter
# (get it from the ghostscript 3.33 (or later) package):
my $CATPS = '';

# add to search path the directory which contains gs:
#$ENV{PATH} .= ":/usr/freeware/bin";

# PDF to HTML conversion script:
my $PDF2HTML = '';    # full pathname of pdf2html.pl script

# Excel (MicroSoft) to HTML converter
# (get it from www.xlhtml.org)
my $XLS2HTML = '';

# Excel (MicroSoft) to .CSV converter
# (you don't need this if you have xlhtml)
# (if you do want it, you can get it with catdoc)
my $CATXLS = '';

# Powerpoint (MicroSoft) to HTML converter
# (get it from www.xlhtml.org)
my $PPT2HTML = '';

# Shockwave Flash
# (extracts links from file)
my $SWF2HTML = '';    # full pathname of swf2html.pl script

# OpenOffice.org files
#my $OpenOffice2XML = '/usr/bin/unzip';
my $OpenOffice2XML = '';
# (remove multi-byte unicode from XML in OOo documents)
#my $strip_unicode = '| /usr/bin/iconv -c -s -f UTF-8 -t ISO-8859-1';
my $strip_unicode = '';


########################################################################

# Other Global Variables
my ($Success, $LOG, $Verbose, $CORE_MESS, $TMP, $RM, $ED, $Magic, $Time,
    $Count, $Prog, $Input, $MIME_type, $URL, $Name, $Efile, $Maxerr,
    $Redir, $Emark, $EEmark, $Method, $OP_Limit, $IP_Limit);
my (%HTML_Method, %TEXT_Method, %BAD_type);
my $Saverr;    # duplicate of the original STDERR while it is redirected


init();             # initialise
my $size = -s $Input;
# -s yields undef for an unreadable/missing file; read_magic() reports that case
quit("Input file size of $size at or above $IP_Limit limit")
    if defined $size and $size >= $IP_Limit;
store_methods();    # register the converters that are configured
read_magic();       # Magic reveals type
error_setup();      # re-route standard error o/p from utilities

# see if a document -> HTML converter will work:
run('&try_html');
quit(0) if $Success;

# try a document -> text converter:
run('&try_text');
quit(0) if $Success;

# see if a known problem
my $fail = cannot_do();
quit($fail) if $fail;

# last-ditch attempt, try copying document
try_plain();
quit(0) if $Success;

quit("UNABLE to convert");

#------------------------------------------------------------------------------
# init: set every global from the environment and the command line.
# Dies if the temporary directory cannot be created or if the input file
# name or MIME type argument is missing.

sub init {

    # Doc2html log file
    $LOG = $ENV{'DOC2HTML_LOG'} || '';

    if ($LOG) {
        open(STDERR, '>>', $LOG);    # ignore possible failure to open
    }                                # else O/P really does go to STDERR

    # Set to 1 for O/P to STDERR or Log file
    $Verbose = exists($ENV{'DOC2HTML_LOG'}) ? 1 : 0;

    # Limiting size of file doc2html.pl will try to process (default 20Mbyte)
    $IP_Limit = $ENV{'DOC2HTML_IP_LIMIT'} || 20000000;

    # Limit for O/P returned to htdig (default 10Mbyte)
    $OP_Limit = $ENV{'DOC2HTML_OP_LIMIT'} || 10000000;

    # Mark error message produced within doc2html script
    $Emark = "!\t";
    # Mark error message produced by conversion utility
    $EEmark = "!!\t";

    # Message to STDERR if core dump detected
    $CORE_MESS = "CORE DUMPED";

    # Directory for temporary files
    $TMP = "/tmp/htdig";
    if (! -d $TMP) {
        mkdir($TMP, 0700) or die "Unable to create directory \"$TMP\": $!";
    }
    # Current directory during run of script:
    chdir $TMP or warn "Cannot change directory to $TMP\n";

    # File for error output from utility
    $Efile = 'doc_err.' . $$;

    # Max. number of lines of error output from utility copied
    $Maxerr = 10;

    # System command to delete a file
    $RM = "/bin/rm -f";

    # Line editor to do substitution
    $ED = "/bin/sed -e";
    if ($^O eq "MSWin32") { $ED = "$^X -pe" }

    $Time = 60;    # allow 60 seconds for external utility to complete

    $Success = 0;
    $Count   = 0;
    $Method  = '';
    $Prog    = $0;
    $Prog =~ s#^.*/##;       # strip directory part
    $Prog =~ s/\..*?$//;     # strip extension(s)

    $Input     = $ARGV[0] or die "No filename given\n";
    $MIME_type = $ARGV[1] or die "No MIME-type given";
    $URL       = $ARGV[2] || '?';
    # Document name used for titles: last path component of the URL,
    # with %XX escapes decoded
    $Name = $URL;
    $Name =~ s#^.*/##;
    $Name =~ s/%([A-F0-9][A-F0-9])/pack("C", hex($1))/gie;

    if ($Verbose and not $LOG) { print STDERR "\n$Prog: [$MIME_type] " }
    if ($LOG) { print STDERR "$URL [$MIME_type] " }

    return;
}

#------------------------------------------------------------------------------
# store_methods: register a conversion method for every utility whose path
# has been configured above, plus the file types known to be unconvertible.
# Each method records a MIME-type pattern, a "magic" pattern matched against
# the first bytes of the file, the utility path and the full shell command.
#
# The method of dealing with each file type is set up here.
# Edit as necessary

sub store_methods {

    my ($mime_type, $magic, $cmd, $cmdl, $type, $description);

    my $name = quotemeta($Name);

    #### Document -> HTML converters ####

    # WordPerfect documents
    if ($WP2HTML) {
        $mime_type = "application/wordperfect|application/msword";
        $cmd  = $WP2HTML;
        $cmdl = "($cmd -q -DTitle=\"[$name]\" -c doc2html.cfg -s doc2html.sty -i $Input -O; $RM CmdLine.ovr)";
        $magic = '\377WPC';
        store_html_method('WordPerfect (wp2html)', $cmd, $cmdl, $mime_type, $magic);
    }

    # Word documents
    if ($WP2HTML) {
        $mime_type = "application/msword";
        $cmd  = $WP2HTML;
        $cmdl = "($cmd -q -DTitle=\"[$name]\" -c doc2html.cfg -s doc2html.sty -i $Input -O; $RM CmdLine.ovr)";
        $magic = '^\320\317\021\340';
        store_html_method('Word (wp2html)', $cmd, $cmdl, $mime_type, $magic);
    }

    # RTF documents
    if ($RTF2HTML) {
        $mime_type = "application/msword|application/rtf|text/rtf";
        $cmd = $RTF2HTML;
        # Rtf2html uses filename as title, change this:
        $cmdl = "$cmd $Input | $ED \"s#^<TITLE>$Input</TITLE>#<TITLE>[$name]</TITLE>#\"";
        # The literal "{" is escaped: an unescaped left brace in a pattern
        # is deprecated or rejected by newer perls.
        $magic = '^\{\134rtf';
        store_html_method('RTF (rtf2html)', $cmd, $cmdl, $mime_type, $magic);
    }

    # Microsoft Excel spreadsheet
    if ($XLS2HTML) {
        $mime_type = "application/msexcel|application/vnd.ms-excel";
        $cmd = $XLS2HTML;
        # xlHtml uses filename as title, change this:
        $cmdl = "$cmd -fw $Input | $ED \"s#<TITLE>$Input</TITLE>#<TITLE>[$name]</TITLE>#\"";
        $magic = '^\320\317\021\340';
        store_html_method('Excel (xlHtml)', $cmd, $cmdl, $mime_type, $magic);
    }

    # Microsoft Powerpoint Presentation
    if ($PPT2HTML) {
        $mime_type = "application/vnd.ms-powerpoint";
        $cmd = $PPT2HTML;
        # xlHtml uses filename as title, change this:
        $cmdl = "$cmd $Input | $ED \"s#<TITLE>$Input</TITLE>#<TITLE>[$name]</TITLE>#\"";
        $magic = '^\320\317\021\340';
        store_html_method('Powerpoint (pptHtml)', $cmd, $cmdl, $mime_type, $magic);
    }

    # Adobe PDF file using Perl script
    if ($PDF2HTML) {
        $mime_type = "application/pdf";
        $cmd = $PDF2HTML;
        # Replace default title (if used) with filename:
        $cmdl  = "$cmd $Input $mime_type $name";
        $magic = '%PDF-|\0PDF CARO\001\000\377';
        store_html_method('PDF (pdf2html)', $cmd, $cmdl, $mime_type, $magic);
    }

    # Shockwave Flash file using Perl script
    if ($SWF2HTML) {
        $mime_type = "application/x-shockwave-flash";
        $cmd   = $SWF2HTML;
        $cmdl  = "$cmd $Input";
        $magic = '^FWS[\001-\010]';    # versions 1 to 5, perhaps some later versions
        store_html_method('Shockwave-Flash (swf2html)', $cmd, $cmdl, $mime_type, $magic);
    }

    # OpenOffice Documents
    if ($OpenOffice2XML) {
        $mime_type = "application/vnd.sun.xml.writer|application/vnd.sun.xml.impress|application/vnd.sun.xml.calc|application/vnd.sun.xml.draw|application/vnd.sun.xml.math";
        $cmd   = $OpenOffice2XML;
        $cmdl  = "$cmd -p -qq $Input content.xml | /bin/sed -r 's/<[^>]*>/ /gi' $strip_unicode";
        $magic = 'PK';
        store_html_method('OpenOffice XML (oo2xml)', $cmd, $cmdl, $mime_type, $magic);
    }

    #### Document -> Text converters ####

    # Word6, Word7 & Word97 documents
    if ($CATDOC) {
        $mime_type = "application/msword";
        $cmd = $CATDOC;
        # -b option increases chance of success:
        $cmdl  = "$cmd -a -b -w $Input";
        $magic = '^\320\317\021\340';
        store_text_method('Word (catdoc)', $cmd, $cmdl, $mime_type, $magic);
    }

    # Word2 documents
    if ($CATDOC2) {
        $mime_type = "application/msword";
        $cmd   = $CATDOC2;
        $cmdl  = "$cmd -a -b -w $Input";
        $magic = '^\333\245-\000';
        store_text_method('Word2 (catdoc)', $cmd, $cmdl, $mime_type, $magic);
    }

    # Word 5.1 for MAC documents
    if ($CATDOCM) {
        $mime_type = "application/msword";
        $cmd   = $CATDOCM;
        $cmdl  = "$cmd -a -b -w $Input";
        $magic = '^\3767\000#\000\000\000\000';
        store_text_method('MACWord (catdoc)', $cmd, $cmdl, $mime_type, $magic);
    }

    # PostScript files
    if ($CATPS) {
        $mime_type = "application/postscript";
        $cmd = $CATPS;
        # allow PS interpreter to give error messages
        $cmdl  = "($cmd; $RM _temp_.???) < $Input";
        $magic = '^.{0,20}?%!|^\033%-12345.*\n%!';
        store_text_method('PostScript (ps2ascii)', $cmd, $cmdl, $mime_type, $magic);
    }

    # Microsoft Excel file
    if ($CATXLS) {
        $mime_type = "application/vnd.ms-excel";
        $cmd   = $CATXLS;
        $cmdl  = "$cmd $Input";
        $magic = '^\320\317\021\340';
        store_text_method('MS Excel (xls2csv)', $cmd, $cmdl, $mime_type, $magic);
    }

    # WordPerfect document
    if ($CATWPD) {
        $mime_type = "application/wordperfect|application/msword";
        $cmd   = $CATWPD;
        $cmdl  = "$cmd $Input";
        $magic = '\377WPC';
        store_text_method('WordPerfect (catwpd)', $cmd, $cmdl, $mime_type, $magic);
    }


    #### Documents that cannot be converted ####

    # wrapped encapsulated Postscript
    $type        = "EPS";
    $magic       = '^\305\320\323\306 \0';
    $description = 'wrapped Encapsulated Postscript';
    store_cannot_do($type, $magic, $description);

    # Shockwave Flash version 6
    $type        = "SWF6";
    $description = 'Shockwave-Flash Version 6';
    $magic       = '^CWS\006';
    store_cannot_do($type, $magic, $description);

#### Binary (data or whatever)
###$type = "BIN";
###$magic = '[\000-\007\016-\037\177]'; # rather crude test!
###$description = 'apparently binary';
###&store_cannot_do($type,$magic,$description);

    return;
}

#------------------------------------------------------------------------------
# read_magic: read the first 256 bytes of the input file into $Magic so the
# file type can be recognised.  Dies if the file cannot be opened.

sub read_magic {

    # Read first bytes of file to check for file type
    open(my $fh, '<', $Input) or die "Can't open file $Input\n";
    binmode $fh;    # the magic patterns are byte patterns
    read $fh, $Magic, 256;
    close $fh;
    $Magic = '' unless defined $Magic;    # failed read: nothing will match

    return;
}

#------------------------------------------------------------------------------
# error_setup: redirect STDERR to the temporary file $Efile so that error
# output from the conversion utilities can be collected and reported by
# quit().  Sets $Redir when the redirection is in force.

sub error_setup {

    return unless $Efile;

    # Must be tested before the file is opened for writing (which truncates it)
    my $leftover = -s $Efile;

    # Keep a duplicate of the real STDERR; without it do not redirect at all
    open($Saverr, '>&', \*STDERR) or return;

    if (open(STDERR, '>', $Efile)) {
        print {$Saverr} " Overwriting $Efile\n" if $leftover;
        $Redir = 1;
    }
    else {
        # a failed open leaves STDERR closed: put the original back
        open(STDERR, '>&', $Saverr);
        close $Saverr;
    }

    return;
}

#------------------------------------------------------------------------------
# run: execute the routine named by the string argument (e.g. '&try_html'),
# under a $Time second limit when Sys::AlarmCall is installed.  Any message
# returned (time-out or die) ends the script through quit().

sub run {

    my $routine = shift;
    my $return;

    if (defined &alarm_call) {
        $return = alarm_call($Time, $routine);
    }
    else {
        # deliberate string eval: $routine is one of our own routine names
        eval $routine;
        $return = $@ if $@;
    }

    if ($return) { quit($return) }

    return;
}

#------------------------------------------------------------------------------
# try_html: find a document -> HTML method matching the MIME type and the
# magic bytes, run it and copy its output to STDOUT.  $Success is set once
# the converter has produced some output.

sub try_html {

    $Success = 0;
    foreach my $type (sort keys %HTML_Method) {
        my $set = $HTML_Method{$type};
        next unless $MIME_type =~ m/$set->{'mime'}/i
            and $Magic =~ m/$set->{'magic'}/s;

        # found the method to use
        $Method = $type;
        my $cmnd = $set->{'cmnd'};
        if (! -x $cmnd) {
            warn "Unable to execute $cmnd for $type document\n";
            return;
        }
        # the command is a shell command line (pipes, redirections)
        my $cat;
        if (not open($cat, '-|', $set->{'command'})) {
            warn "$cmnd doesn't want to be opened using pipe\n";
            return;
        }
        while (<$cat>) {
            # getting something, so it is working
            $Success = 1;
            next if m/^<!--/;    # skip comment lines inserted by converter
            print;
            $Count += length($_);
            last if $Count > $OP_Limit;
        }
        close $cat;
        last;
    }

    return;
}

#------------------------------------------------------------------------------
# try_text: find a document -> text method matching the MIME type and the
# magic bytes, run it and wrap its output in a minimal HTML document.
# Dies if the utility is not executable or the pipe cannot be opened.

sub try_text {

    $Success = 0;
    foreach my $type (sort keys %TEXT_Method) {
        my $set = $TEXT_Method{$type};
        next unless $MIME_type =~ m/$set->{'mime'}/i
            and $Magic =~ m/$set->{'magic'}/s;

        # found the method to use
        $Method = $type;
        my $cmnd = $set->{'cmnd'};
        if (! -x $cmnd) { die "Unable to execute $cmnd for $type document\n" }

        # Open file via selected converter, output head, then its text:
        open(my $cat, '-|', $set->{'command'})
            or die "$cmnd doesn't want to be opened using pipe\n";
        head();
        print "<BODY>\n<PRE>\n";
        $Success = 1;
        while (<$cat>) {
            s/\255/-/g;    # replace dashes with hyphens
            # replace bell, backspace, tab. etc. with single space:
            s/[\000-\040]+/ /g;
            if (length($_) > 1) {    # if not just a single character, eg space
                print HTML($_), "\n";
                $Count += length($_);
                last if $Count > $OP_Limit;
            }
        }
        close $cat;

        print "</PRE>\n</BODY>\n</HTML>\n";
        last;
    }

    return;
}

#------------------------------------------------------------------------------
# cannot_do: check the magic bytes against the known unconvertible types.
# Returns a "CANNOT DO ..." message for a known problem type, else 0.

sub cannot_do {

    # see if known, unconvertable type
    $Method = '';
    foreach my $type (sort keys %BAD_type) {
        my $set = $BAD_type{$type};
        if ($Magic =~ m/$set->{'magic'}/s) {    # known problem
            return "CANNOT DO $set->{'desc'} ";
        }
    }

    return 0;
}

#------------------------------------------------------------------------------
# try_plain: last resort - if the input looks like text (-T), copy it to
# STDOUT wrapped in a minimal HTML document.  Dies if it cannot be opened.

sub try_plain {

    $Success = 0;
    ####### if ($Magic !~ m/^[\000-\007\016-\037\177]) {
    if (-T $Input) {    # Looks like text, so go for it:
        $Method = 'Plain Text';
        open(my $fh, '<', $Input) or die "Error reading $Input\n";
        $Success = 1;
        head();
        print "<BODY>\n<PRE>\n";

        while (<$fh>) {
            # replace bell, backspace, tab. etc. with single space:
            s/[\000-\040\177]+/ /g;
            if (length($_) > 1) {
                print HTML($_), "\n";
                $Count += length($_);
                last if $Count > $OP_Limit;
            }
        }
        close $fh;
        print "</PRE>\n</BODY>\n</HTML>\n";
    }
    else {
        $Method = '';
    }

    return;
}

#------------------------------------------------------------------------------
# HTML: return a copy of the given text made safe for inclusion in HTML:
# white space is normalised and the characters & < > become entities.

sub HTML {

    my $text = shift;

    $text =~ s/\f/\n/gs;     # replace form feed
    $text =~ s/\s+/ /g;      # replace multiple spaces, etc. with a single space
    $text =~ s/\s+$//gm;     # remove trailing spaces
    $text =~ s/&/&amp;/g;    # must be first, or it would re-escape the others
    $text =~ s/</&lt;/g;
    $text =~ s/>/&gt;/g;

    return $text;
}

#------------------------------------------------------------------------------
# store_html_method: register a document -> HTML method.
# Arguments: description (hash key), utility path, full shell command,
# MIME-type pattern, magic pattern.

sub store_html_method {

    my ($type, $cmnd, $cline, $mime, $magic) = @_;

    $HTML_Method{$type} = {
        'mime'    => $mime,
        'magic'   => $magic,
        'cmnd'    => $cmnd,
        'command' => $cline,
    };

    return;
}

#------------------------------------------------------------------------------
# store_text_method: register a document -> text method.
# Arguments: description (hash key), utility path, full shell command,
# MIME-type pattern, magic pattern.

sub store_text_method {

    my ($type, $cmnd, $cline, $mime, $magic) = @_;

    $TEXT_Method{$type} = {
        'mime'    => $mime,
        'magic'   => $magic,
        'cmnd'    => $cmnd,
        'command' => $cline,
    };

    return;
}

#------------------------------------------------------------------------------
# store_cannot_do: register a file type that cannot be converted.
# Arguments: type (hash key), magic pattern, description for the message.

sub store_cannot_do {

    my ($type, $magic, $desc) = @_;

    $BAD_type{$type} = {
        'magic' => $magic,
        'desc'  => $desc,
    };

    return;
}

#------------------------------------------------------------------------------
# head: print the HTML head, using the document name as the title.

sub head {

    print "<HTML>\n<HEAD>\n";
    # the name comes from the URL, so escape it like any other text
    print "<TITLE>[" . HTML($Name) . "]</TITLE>\n";
    print "</HEAD>\n";

    return;
}

#------------------------------------------------------------------------------
# quit: finish the run and exit.  Restores STDERR, reports the method used
# and any error output collected from the utilities, removes the temporary
# error file and notes a fresh core dump.
# Argument: 0 for success, or an error message (exit status is then 1).

sub quit {

    my $return = shift;

    if ($Redir) {    # end redirection of STDERR to temporary file
        close STDERR;
        open(STDERR, '>&', $Saverr);
        $Redir = 0;
    }

    if ($Verbose) {
        print STDERR "$Method $Count" if ($Success);
        print STDERR "\n";
    }

    if ($Count > $OP_Limit) {
        print STDERR $Emark, "Output truncated after limit $OP_Limit reached\n";
    }

    if ($return) {
        print STDERR $Emark, $return, "\n";
    }

    chdir $TMP;
    if ($Efile && -s $Efile) {
        # copy at most $Maxerr lines of the utilities' error output
        if (open(my $efh, '<', $Efile)) {
            my $c = 0;
            while (<$efh>) {
                $c++;
                if ($c <= $Maxerr) {
                    print STDERR $EEmark, $_;
                }
            }
            close $efh;
            print STDERR $Emark, " ... (total of $c lines of error messages)\n"
                if ($c > $Maxerr);
        }
    }
    unlink $Efile if ($Efile && -e $Efile);

    # a core file newer than the start of this script (-M negative)
    if (-e "core" && (-M "core" < 0)) {
        print STDERR $Emark, "$CORE_MESS\n";
    }

    exit($return ? 1 : 0);
}