line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Biblio::Document::Parser::Utils; |
2
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
###################################################################### |
4
|
|
|
|
|
|
|
# |
5
|
|
|
|
|
|
|
# ParaTools::Document::Parser::Utils; |
6
|
|
|
|
|
|
|
# |
7
|
|
|
|
|
|
|
###################################################################### |
8
|
|
|
|
|
|
|
# |
9
|
|
|
|
|
|
|
# This file is part of ParaCite Tools (http://paracite.eprints.org/developers/) |
10
|
|
|
|
|
|
|
# |
11
|
|
|
|
|
|
|
# Copyright (c) 2002 University of Southampton, UK. SO17 1BJ. |
12
|
|
|
|
|
|
|
# |
13
|
|
|
|
|
|
|
# ParaTools is free software; you can redistribute it and/or modify |
14
|
|
|
|
|
|
|
# it under the terms of the GNU General Public License as published by |
15
|
|
|
|
|
|
|
# the Free Software Foundation; either version 2 of the License, or |
16
|
|
|
|
|
|
|
# (at your option) any later version. |
17
|
|
|
|
|
|
|
# |
18
|
|
|
|
|
|
|
# ParaTools is distributed in the hope that it will be useful, |
19
|
|
|
|
|
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of |
20
|
|
|
|
|
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
21
|
|
|
|
|
|
|
# GNU General Public License for more details. |
22
|
|
|
|
|
|
|
# |
23
|
|
|
|
|
|
|
# You should have received a copy of the GNU General Public License |
24
|
|
|
|
|
|
|
# along with ParaTools; if not, write to the Free Software |
25
|
|
|
|
|
|
|
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA |
26
|
|
|
|
|
|
|
# |
27
|
|
|
|
|
|
|
###################################################################### |
28
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
|
30
|
1
|
|
|
1
|
|
921
|
use utf8; |
|
1
|
|
|
|
|
11
|
|
|
1
|
|
|
|
|
5
|
|
31
|
1
|
|
|
1
|
|
31
|
use strict; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
38
|
|
32
|
|
|
|
|
|
|
require Exporter; |
33
|
1
|
|
|
1
|
|
1002
|
use LWP::UserAgent; |
|
1
|
|
|
|
|
64510
|
|
|
1
|
|
|
|
|
45
|
|
34
|
1
|
|
|
1
|
|
3638
|
use File::Temp qw/ tempfile tempdir /; |
|
1
|
|
|
|
|
32219
|
|
|
1
|
|
|
|
|
168
|
|
35
|
1
|
|
|
1
|
|
8
|
use URI; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
37
|
|
36
|
1
|
|
|
1
|
|
5
|
use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAG $CHAR_MATCHES %CHAR_TRANSFORMS %CONVERTERS $DEBUG); |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
1323
|
|
37
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
@ISA = qw( Exporter ); |
39
|
|
|
|
|
|
|
@EXPORT_OK = qw( &normalise_multichars ); |
40
|
|
|
|
|
|
|
@EXPORT = qw( &get_content ); |
41
|
|
|
|
|
|
|
$DEBUG = 0; |
42
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
=pod |
44
|
|
|
|
|
|
|
|
45
|
|
|
|
|
|
|
=head1 NAME |
46
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
Biblio::Document::Parser::Utils - utility module for handling international characters and document conversion |
48
|
|
|
|
|
|
|
|
49
|
|
|
|
|
|
|
=head1 DESCRIPTION |
50
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
Biblio::Document::Parser::Utils provides some utility functions for handling international |
52
|
|
|
|
|
|
|
characters and for conversion of documents to plaintext. |
53
|
|
|
|
|
|
|
|
54
|
|
|
|
|
|
|
=head1 SYNOPSIS |
55
|
|
|
|
|
|
|
|
56
|
|
|
|
|
|
|
use Biblio::Document::Parser::Utils qw( normalise_multichars ); |
57
|
|
|
|
|
|
|
|
58
|
|
|
|
|
|
|
print normalise_multichars( $str ); |
59
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
=head1 METHODS |
61
|
|
|
|
|
|
|
|
62
|
|
|
|
|
|
|
=over 4 |
63
|
|
|
|
|
|
|
|
64
|
|
|
|
|
|
|
=item $str = normalise_multichars( $str ) |
65
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
Convert multi-char international characters into single UTF-8 chars, e.g.: |
67
|
|
|
|
|
|
|
¨o => ö |
68
|
|
|
|
|
|
|
These appear in pdftotext output from PDFs generated by pdflatex. |
69
|
|
|
|
|
|
|
|
70
|
|
|
|
|
|
|
=cut |
71
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
# Pattern for a detached accent mark (circumflex, grave, diaeresis, acute or
# tilde) followed by one candidate base letter.  It over-matches on purpose:
# only the pairs listed in %CHAR_TRANSFORMS are ever rewritten.
$CHAR_MATCHES = '[\x{5e}\x{60}\x{a8}\x{b4}\x{7e}][aeounzn]';

# Accent-mark/letter pair => single precomposed character.
%CHAR_TRANSFORMS = (
    # circumflex
    "\x{5e}a" => "\x{e2}",
    "\x{5e}e" => "\x{ea}",
    "\x{5e}o" => "\x{f4}",
    "\x{5e}u" => "\x{fb}",
    # grave
    "\x{60}a" => "\x{e0}",
    "\x{60}e" => "\x{e8}",
    "\x{60}o" => "\x{f2}",
    "\x{60}u" => "\x{f9}",
    # diaeresis
    "\x{a8}a" => "\x{e4}",
    "\x{a8}e" => "\x{eb}",
    "\x{a8}o" => "\x{f6}",
    "\x{a8}u" => "\x{fc}",
    # acute
    "\x{b4}a" => "\x{e1}",
    "\x{b4}e" => "\x{e9}",
    "\x{b4}o" => "\x{f3}",
    "\x{b4}u" => "\x{fa}",
    "\x{b4}n" => "\x{144}",
    "\x{b4}z" => "\x{17a}",
    # tilde
    "\x{7e}n" => "\x{f1}",
);

# External commands used by get_content() to turn a document of the given
# type (keyed by file extension) into plain text.  _IN_ and _OUT_ are replaced
# with the input and output file names before the command is run.
%CONVERTERS =
(
    doc  => "wvText _IN_ _OUT_",
    pdf  => "pdftotext -raw _IN_ _OUT_",
    ps   => "pstotext -output _OUT_ _IN_",
    htm  => "links --dump _IN_ > _OUT_",
    html => "links --dump _IN_ > _OUT_",
);

# Debugging aid: dump the transformation table when the module is loaded.
if ($DEBUG)
{
    binmode(STDOUT, ":utf8");
    foreach my $pair (sort keys %CHAR_TRANSFORMS)
    {
        print "$pair => $CHAR_TRANSFORMS{$pair}\n";
    }
}

# normalise_multichars($str)
#
# Return a copy of $str in which every accent-mark/letter pair listed in
# %CHAR_TRANSFORMS has been replaced by its single precomposed character.
#
# $CHAR_MATCHES also matches pairs that have no table entry (for example
# "^n" or "~a").  Those are left exactly as they were; previously they were
# replaced by an undefined value and so silently deleted from the text.
sub normalise_multichars
{
    my $str = shift;

    # No /o flag: the pattern always reflects the current $CHAR_MATCHES.
    $str =~ s/($CHAR_MATCHES)/exists $CHAR_TRANSFORMS{$1} ? $CHAR_TRANSFORMS{$1} : $1/sge;

    return $str;
}
117
|
|
|
|
|
|
|
|
118
|
|
|
|
|
|
|
|
119
|
|
|
|
|
|
|
=pod |
120
|
|
|
|
|
|
|
|
121
|
|
|
|
|
|
|
=item $content = Biblio::Document::Parser::Utils::get_content($location) |
122
|
|
|
|
|
|
|
|
123
|
|
|
|
|
|
|
This function takes either a filename or a URL as a parameter, and |
124
|
|
|
|
|
|
|
aims to return a string containing the lines in the file. A hash of |
125
|
|
|
|
|
|
|
converters is provided in this module (%CONVERTERS), which should be customised |
126
|
|
|
|
|
|
|
for your system. |
127
|
|
|
|
|
|
|
|
128
|
|
|
|
|
|
|
For URLs, the file is first downloaded to a temporary directory, then |
129
|
|
|
|
|
|
|
converted, whereas local files are copied straight into the temporary |
130
|
|
|
|
|
|
|
directory. For this reason, some care should be taken when handling very |
131
|
|
|
|
|
|
|
large files. |
132
|
|
|
|
|
|
|
|
133
|
|
|
|
|
|
|
=cut |
134
|
|
|
|
|
|
|
|
135
|
|
|
|
|
|
|
# get_content($location)
#
# Return the plain-text content of a document as a single string.
#
# $location is either a local filename or an http:// or https:// URL.  The
# document is copied (or downloaded) into a temporary directory, its type is
# taken from the filename extension, and anything other than "txt" is run
# through the matching command in %CONVERTERS.
#
# Returns the text on success.  Returns nothing (undef in scalar context)
# after printing a message on STDERR when the download fails or when no
# converter is configured for the type, and returns nothing silently when the
# converted output cannot be opened.  Dies when no location is given or when a
# local file cannot be copied.
sub get_content
{
    my($location) = @_;

    die "get_content: no location given\n" unless defined $location;

    # Two-argument open() used to ignore whitespace around the filename; trim
    # it explicitly so callers passing an un-chomped line keep working.
    $location =~ s/^\s+//;
    $location =~ s/\s+$//;

    my $is_url = ($location =~ m{^https?://}i) ? 1 : 0;

    # Work out the type from the extension.  For URLs only the path part is
    # inspected, so the host name is never taken for an extension and a query
    # string cannot hide one.
    my $name = $is_url ? URI->new($location)->path : $location;
    my $type = "";
    if ($name =~ /\.(\w+)\z/)
    {
        $type = lc $1;
    }

    if ($type eq "")
    {
        if ($is_url)
        {
            print STDERR "Unknown type - assuming HTML\n";
            $type = "html";
        }
        else
        {
            print STDERR "Unknown type - assuming plaintext\n";
            $type = "txt";
        }
    }

    # Get a temporary working copy ready.
    my $dir = tempdir( CLEANUP => 1 );
    my (undef, $fromfile) = tempfile(UNLINK => 1, DIR => $dir, SUFFIX => ".$type");

    # Now we know the type, grab the file.
    if ($is_url)
    {
        # mirror() is not used here: the temporary file already exists with a
        # fresh timestamp, so mirror() would send If-Modified-Since and the
        # server could answer "304 Not Modified" without any content.
        my $ua = LWP::UserAgent->new();
        my $response = $ua->get($location, ':content_file' => $fromfile);
        if (!$response->is_success)
        {
            print STDERR "Could not fetch $location: " . $response->status_line . "\n";
            return;
        }
    }
    else
    {
        # Copy the local file byte for byte; three-argument open() keeps
        # special characters in the name from being read as a mode or a pipe.
        open(my $fin, '<', $location) or die "Cannot open $location: $!";
        open(my $fout, '>', $fromfile) or die "Cannot open $fromfile: $!";
        binmode($fin);
        binmode($fout);

        my $buffer;
        my $got;
        while ($got = read($fin, $buffer, 65536))
        {
            print {$fout} $buffer or die "Cannot write $fromfile: $!";
        }
        die "Cannot read $location: $!" unless defined $got;

        close $fout or die "Cannot close $fromfile: $!";
        close $fin or die "Cannot close $location: $!";
    }

    # If we have text, just use the working copy as it is.
    my $tofile = $fromfile;
    if ($type ne "txt")
    {
        if (!$CONVERTERS{$type})
        {
            print STDERR "Sorry, no converters available for type $type\n";
            return;
        }

        # Convert from the $fromfile to the $tofile.
        (undef, $tofile) = tempfile(UNLINK => 1, DIR => $dir, SUFFIX => ".txt");
        my $converter = $CONVERTERS{$type};
        $converter =~ s/_IN_/$fromfile/g;
        $converter =~ s/_OUT_/$tofile/g;

        # The command goes through the shell because some converters rely on
        # output redirection.  A failure is reported, but whatever output was
        # produced is still returned.
        if (system($converter) != 0)
        {
            print STDERR "Converter for type $type failed: $converter\n";
        }
    }

    open(my $input, '<', $tofile) or return;
    my $content = do { local $/; <$input> };
    $content = "" unless defined $content;
    close $input or die "Cannot close $tofile: $!";

    return $content;
}
217
|
|
|
|
|
|
|
|
218
|
|
|
|
|
|
|
=pod |
219
|
|
|
|
|
|
|
|
220
|
|
|
|
|
|
|
=item $escaped_url = Biblio::Document::Parser::Utils::url_escape($string) |
221
|
|
|
|
|
|
|
|
222
|
|
|
|
|
|
|
Simple function to convert a string into an encoded |
223
|
|
|
|
|
|
|
URL (i.e. spaces to %20, etc). Takes the unencoded |
224
|
|
|
|
|
|
|
URL as a parameter, and returns the encoded version. |
225
|
|
|
|
|
|
|
|
226
|
|
|
|
|
|
|
=cut |
227
|
|
|
|
|
|
|
|
228
|
|
|
|
|
|
|
# url_escape($string)
#
# Return $string as an encoded URL.  The characters '<', '>', '#', ';' and
# '&' are percent-encoded here explicitly (%3C, %3E, %23, %3B, %26); the
# string is then passed through URI, whose string form is returned.
sub url_escape
{
    my( $url ) = @_;

    # One pass over the five characters.  None of the replacements contains
    # a character from the set, so this equals five separate substitutions.
    $url =~ s/([<>#;&])/sprintf("%%%02X", ord($1))/ge;

    return URI->new( $url )->as_string;
}
240
|
|
|
|
|
|
|
|
241
|
|
|
|
|
|
|
1; |
242
|
|
|
|
|
|
|
|
243
|
|
|
|
|
|
|
__END__ |