debian/htdig/htdig-3.2.0b6/contrib/conv_doc.pl


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214

#!/usr/local/bin/perl

#
# Sample external converter for htdig 3.1.4 or later.
# Usage: (in htdig.conf)
#
# external_parsers: application/msword->text/html /usr/local/bin/conv_doc.pl \
#               application/postscript->text/html /usr/local/bin/conv_doc.pl \
#               application/pdf->text/html /usr/local/bin/conv_doc.pl
#
# Written by Gilles Detillieux <grdetil@scrc.umanitoba.ca>.
# Based in part on the parse_word_doc.pl script, written by
# Jesse op den Brouw <MSQL_User@st.hhs.nl> but heavily revised.
#
# 1998/12/11
# Added:        catdoc test (is catdoc runnable?)    <carl@dpiwe.tas.gov.au>
# 1999/02/09
# Added:        uses ps2ascii to handle PS files     <grdetil@scrc.umanitoba.ca>
# 1999/02/15
# Added:        check for some file formats          <Frank.Richter@hrz.tu-chemnitz.de>
# 1999/02/25
# Added:        uses pdftotext to handle PDF files   <grdetil@scrc.umanitoba.ca>
# 1999/03/01
# Added:        extra checks for file "wrappers"     <grdetil@scrc.umanitoba.ca>
#               & check for MS Word signature (no longer defaults to catdoc)
# 1999/03/05
# Changed:      rejoin hyphenated words across lines <grdetil@scrc.umanitoba.ca>
#               (in PDFs)
# 1999/08/12
# Changed:      adapted for xpdf 0.90 release        <grdetil@scrc.umanitoba.ca>
# Added:        uses pdfinfo to handle PDF titles    <grdetil@scrc.umanitoba.ca>
# Changed:      change dashes to hyphens             <grdetil@scrc.umanitoba.ca>
# 1999/09/09
# Changed:      fix to handle empty PDF title right  <grdetil@scrc.umanitoba.ca>
# 1999/12/01
# Changed:      rewritten as external converter      <grdetil@scrc.umanitoba.ca>
#               stripped out all parser-related code
# Added:        test to silently ignore wrapped EPS files    < " >
# Added:        test for null device on Win32 env.   <PBISSET@emergency.qld.gov.au>
# 2000/01/12
# Changed:      "break" to "last" (no break in Perl) <wjones@tc.fluke.com>
# 2001/07/12
# Changed:      fix "last" handling in dehyphenation <grdetil@scrc.umanitoba.ca>
# Added:        handle %xx codes in title from URL   <grdetil@scrc.umanitoba.ca>
#########################################
#
# Path to your MS Word to text converter (catdoc).
# Get it from: http://www.fe.msk.ru/~vitus/catdoc/
#
$CATDOC = '/usr/local/bin/catdoc';
#
# Path to your WordPerfect to text converter, or /bin/true if you have none.
# WordPerfect documents carrying a .doc suffix are routed here, so that
# catdoc never sees them.
#
$CATWP = '/bin/true';
#
# Path to your RTF to text converter, or /bin/true if you have none.
# RTF documents carrying a .doc suffix are routed here, so that catdoc
# never sees them.
#
$CATRTF = '/bin/true';
#
# Path to your PostScript to text converter (ps2ascii).
# Get it from the ghostscript 3.33 (or later) package.
#
$CATPS = '/usr/bin/ps2ascii';
#
# Paths to your PDF to text converter and to the pdfinfo tool.
# Get them from the xpdf 0.90 package at http://www.foolabs.com/xpdf/
#
$CATPDF  = '/usr/bin/pdftotext';
$PDFINFO = '/usr/bin/pdfinfo';
#$CATPDF = "/usr/local/bin/pdftotext";
#$PDFINFO = "/usr/local/bin/pdfinfo";

#########################################
#
# Working variables used by the rest of the script.
#   $dehyphenate - set if hyphenated line breaks in the text must be rejoined
#   $ishtml      - set if the converter produces HTML by itself
#   $magic       - leading bytes of the input file, used to detect its type
#   $type        - document type name used when making up a title
#   $cvtr        - path of the selected converter program
#   $cvtcmd      - complete shell command that runs the converter
#   $title       - document title for the generated HTML
#   @parts       - URL split on "/" when a title must be made up
$dehyphenate = 0;
$ishtml      = 0;
($magic, $type, $cvtr, $cvtcmd, $title) = ("") x 5;
@parts = ();

# Null device, used to discard converter error output; this keeps the
# script portable between the Win32 platform and Unix.
$null = ($^O eq "MSWin32") ? "nul" : "/dev/null";


#########################################
#
# Read first bytes of file to check for file type (like file(1) does).
#
# The file name is the first argument; the content-type and URL follow it.
# Refuse to go on without a file name: three-argument open() must never be
# handed an undefined name.
die "Usage: $0 file [content-type [URL]]\n"
    unless defined($ARGV[0]) && $ARGV[0] ne "";

# Three-argument open takes the name literally, so leading/trailing blanks
# or mode characters in it cannot change what gets opened.  binmode keeps
# the signature bytes intact on platforms that translate line endings.
open(FILE, "<", $ARGV[0]) || die "Can't open file $ARGV[0]: $!\n";
binmode(FILE);
defined(read(FILE, $magic, 8)) || die "Can't read file $ARGV[0]: $!\n";

if ($magic =~ /^\0\n/) {                # possible MacBinary header
    # Extend $magic to the first 136 bytes of the file by appending the next
    # 128 bytes after the 8 already read, so the signature tests can look
    # past the header.  Let's hope converters can handle them!
    defined(read(FILE, $magic, 128, 8)) || die "Can't read file $ARGV[0]: $!\n";
}
close FILE;

# Pick the converter and build its shell command from the file signature
# held in $magic.  Sets $cvtr, $cvtcmd, $type and $dehyphenate, plus $title
# for PDFs that carry one.  Exits silently for files that are deliberately
# ignored, and dies if the type is unknown or the converter is not executable.

# Shell-quote the file name once, so that names containing spaces or shell
# metacharacters reach the converters intact.  The unquoted $ARGV[0] is still
# used for Perl's own open() and in messages.
my $qfile = $ARGV[0];
if ($^O eq "MSWin32") {
    $qfile = "\"$qfile\"";
} else {
    $qfile =~ s/'/'\\''/g;              # close quote, escaped quote, reopen
    $qfile = "'$qfile'";
}

if ($magic =~ /%!|^\033%-12345/) {      # it's PostScript (or HP print job)
    $cvtr = $CATPS;                     # gs 3.33 leaves _temp_.??? files in .
# keep quiet even if PS gives errors...
    $cvtcmd = "(cd /tmp; $cvtr; rm -f _temp_.???) < $qfile 2>$null";
# allow PS interpreter to give error messages...
#   $cvtcmd = "(cd /tmp; $cvtr; rm -f _temp_.???) < $qfile";
    $type = "PostScript";
    $dehyphenate = 0;                   # ps2ascii already does this
    if ($magic =~ /^\033%-12345/) {     # HP print job
        # Look further into the file: only go on if the print job switches
        # the printer language to PostScript, otherwise ignore it silently.
        open(FILE, "<", $ARGV[0]) || die "Can't open file $ARGV[0]: $!\n";
        binmode(FILE);
        read FILE,$magic,256;
        close FILE;
        exit unless $magic =~ /^\033%-12345X\@PJL.*\n*.*\n*.*ENTER\s*LANGUAGE\s*=\s*POSTSCRIPT.*\n*.*\n*.*\n%!/;
    }
} elsif ($magic =~ /\305\320\323\306\036/) {    # it's a wrapped EPS - ignore
    exit;
} elsif ($magic =~ /%PDF-/) {           # it's PDF (Acrobat)
    $cvtr = $CATPDF;
    $cvtcmd = "$cvtr -raw $qfile -";
# to handle single-column, strangely laid out PDFs, use coalescing feature...
#   $cvtcmd = "$cvtr $qfile -";
    $type = "PDF";
    $dehyphenate = 1;                   # PDFs often have hyphenated lines
    # Ask pdfinfo for the document title; failing to run it just leaves
    # $title empty.
    if (open(INFO, "$PDFINFO $qfile 2>$null |")) {
        while (<INFO>) {
            if (/^Title:/) {
                s/^Title:\s+//;         # drop the field label
                s/\s+$//;               # trim trailing white space
                s/\s+/ /g;              # collapse runs of white space
                s/&/\&amp\;/g;          # HTMLify title
                s/</\&lt\;/g;
                s/>/\&gt\;/g;
                $title = $_;
                last;
            }
        }
        close INFO;
    }
# to use coalescing feature conditionally...
#   if ($title =~ /...Title of Corel DRAW output.../) {
#       $cvtcmd = "$cvtr $qfile -";
#   }
} elsif ($magic =~ /WPC/) {             # it's WordPerfect
    $cvtr = $CATWP;
    $cvtcmd = "$cvtr $qfile";
    $type = "WordPerfect";
    $dehyphenate = 0;                   # WP documents not likely hyphenated
} elsif ($magic =~ /^\{\\rtf/) {        # it's Richtext
    $cvtr = $CATRTF;
    $cvtcmd = "$cvtr $qfile";
    $type = "RTF";
    $dehyphenate = 0;                   # RTF documents not likely hyphenated
} elsif ($magic =~ /\320\317\021\340/) {    # it's MS Word
    $cvtr = $CATDOC;
    $cvtcmd = "$cvtr -a -w $qfile";
    $type = "Word";
    $dehyphenate = 0;                   # Word documents not likely hyphenated
} else {
    die "Can't determine type of file $ARGV[0]; content-type: $ARGV[1]; URL: $ARGV[2]\n";
}

die "$cvtr is absent or unwilling to execute.\n" unless -x $cvtr;

#############################################
#
# Start output.

# if running as a converter for "user-defined" output type...
#print "Content-Type: text/html\n\n";

if ($ishtml) {
    # The converter will give its own HTML output, so just run it and quit.
    # system() returns the command's wait status, which is 0 on success, so
    # the failure test has to be "not equal to 0" rather than "false".
    system($cvtcmd) == 0 || die "$cvtr doesn't want to be run from shell.\n";
    exit;
}

# Produce HTML output from converter's text output, so we can add title.
print "<HTML>\n<head>\n";

# Print out the title, if it's set, and not just a file name, or make one up
# from the document type and the last component of the URL (third argument).
if ($title eq "" || $title =~ /^[A-G]:[^\s]+\.[Pp][Dd][Ff]$/) {
    # get the file basename; tolerate a missing or empty URL argument
    @parts = split(/\//, defined($ARGV[2]) ? $ARGV[2] : "");
    # work on a copy: changing $parts[-1] in place is fatal when @parts is empty
    my $basename = @parts ? $parts[-1] : "";
    $basename =~ s/%([A-F0-9][A-F0-9])/pack("C", hex($1))/gie;  # decode %xx
    # the decoded name can hold markup characters, so HTMLify it
    $basename =~ s/&/\&amp\;/g;
    $basename =~ s/</\&lt\;/g;
    $basename =~ s/>/\&gt\;/g;
    $title = "$type Document $basename";    # use it in title
}
print "<title>$title</title>\n";

print "</head>\n<body>\n";

# Open file via selected converter, output its text as the HTML body.
open(CAT, "$cvtcmd |") || die "$cvtr doesn't want to be opened using pipe.\n";
while (<CAT>) {
    # Rejoin words hyphenated across lines: while the text collected so far
    # ends in a letter followed by a hyphen, append the next line and close
    # the gap.
    while ($dehyphenate && /[A-Za-z\300-\377]-\s*$/) {
        my $next = <CAT>;
        last unless defined $next;      # no more input to join with
        $_ .= $next;
        # Substitute before looking for end of input, so that a word
        # continued on the very last line of output is rejoined too.
        s/([A-Za-z\300-\377])-\s*\n\s*([A-Za-z\300-\377])/$1$2/s;
    }
    s/[\255]/-/g;                       # replace dashes (octal 255) with hyphens
    s/\f/\n/g;                          # replace form feed
    s/&/\&amp\;/g;                      # HTMLify text
    s/</\&lt\;/g;
    s/>/\&gt\;/g;
    print;
}

print "</body>\n</HTML>\n";

close CAT;