File Coverage

blib/lib/arXiv/FileGuess.pm
Criterion Covered Total %
statement 150 173 86.7
branch 65 120 54.1
condition 60 135 44.4
subroutine 8 8 100.0
pod 3 3 100.0
total 286 439 65.1


line stmt bran cond sub pod time code
1             package arXiv::FileGuess;
2              
3             =head1 NAME
4              
5             arXiv::FileGuess - Central file type determination for arXiv
6              
7             =head1 SYNOPSIS
8              
9             Central file type identification for arXiv.org.
10              
11             Much of this should probably be replaced with the file program with externally
12             maintained magic information. However, we need here to support a number of cases
13             special to arXiv and to make some rather fine-grained determinations of TeX
14             related file formats, along with determination of the order in which they
15             should be processed.
16              
17             =cut
18              
19 1     1   19175 use strict;
  1         2  
  1         35  
20              
21 1     1   5 use base qw(Exporter);
  1         2  
  1         118  
22             our @EXPORT_OK = qw( guess_file_type is_tex_type type_name );
23              
24 1     1   4 use vars qw(%NAME %TEX_types);
  1         1  
  1         370  
25              
26             =head2 INTERNAL VARIABLES
27              
28             =head3 %TEX_types
29              
30             A hash of type names which represent TeX types (i.e. should be processed with
31             AutoTeX). Accessed via function is_tex_type().
32              
33             =cut
34              
35             =head3 %NAME
36              
37             A display name indexed by type. Accessed via function type_name().
38              
39             =cut
40              
41             BEGIN {
42             # Make sure %TEX_types and %NAME full before use
43              
44 1     1   5 @TEX_types{qw(
45             TYPE_LATEX
46             TYPE_TEX
47             TYPE_TEX_priority
48             TYPE_TEX_AMS
49             TYPE_TEX_MAC
50             TYPE_LATEX2e
51             TYPE_TEX_priority2
52             TYPE_TEXINFO
53             TYPE_PDFLATEX
54             TYPE_PDFTEX
55             )} = ();
56              
57 1         2 $NAME{'TYPE_ABORT'} = 'Immediate stop';
58 1         2 $NAME{'TYPE_FAILED'} = 'unknown';
59 1         2 $NAME{'TYPE_ALWAYS_IGNORE'} = 'Always ignore';
60 1         1 $NAME{'TYPE_INPUT'} = 'Input for (La)TeX';
61 1         1 $NAME{'TYPE_BIBTEX'} = 'BiBTeX';
62 1         2 $NAME{'TYPE_POSTSCRIPT'} = 'Postscript';
63 1         1 $NAME{'TYPE_DOS_EPS'} = 'DOS EPS Binary File';
64 1         2 $NAME{'TYPE_PS_FONT'} = 'Postscript Type 1 Font';
65 1         1 $NAME{'TYPE_PS_PC'} = '^D%! Postscript';
66 1         1 $NAME{'TYPE_IMAGE'} = 'Image (gif/jpg etc)';
67 1         2 $NAME{'TYPE_ANIM'} = 'Animation (mpeg etc)';
68 1         1 $NAME{'TYPE_HTML'} = 'HTML';
69 1         2 $NAME{'TYPE_PDF'} = 'PDF';
70 1         2 $NAME{'TYPE_DVI'} = 'DVI';
71 1         1 $NAME{'TYPE_NOTEBOOK'} = 'Mathematica Notebook';
72 1         2 $NAME{'TYPE_ODF'} = 'OpenDocument Format';
73 1         1 $NAME{'TYPE_DOCX'} = 'Microsoft DOCX';
74 1         16 $NAME{'TYPE_TEX'} = 'TEX';
75 1         1 $NAME{'TYPE_PDFTEX'} = 'PDFTEX';
76 1         2 $NAME{'TYPE_TEX_priority2'} = 'TeX (with \\end or \\bye - not starting a line)';
77 1         2 $NAME{'TYPE_TEX_AMS'} = 'AMSTeX';
78 1         1 $NAME{'TYPE_TEX_priority'} = 'TeX (with \\end or \\bye)';
79 1         1 $NAME{'TYPE_TEX_MAC'} = 'TeX +macros (harv,lanl..)';
80 1         2 $NAME{'TYPE_LATEX'} = 'LaTeX';
81 1         1 $NAME{'TYPE_LATEX2e'} = 'LATEX2e';
82 1         1 $NAME{'TYPE_PDFLATEX'} = 'PDFLATEX';
83 1         1 $NAME{'TYPE_TEXINFO'} = 'Texinfo';
84 1         2 $NAME{'TYPE_MF'} = 'Metafont';
85 1         2 $NAME{'TYPE_UUENCODED'} = 'UUencoded';
86 1         1 $NAME{'TYPE_ENCRYPTED'} = 'Encrypted';
87 1         2 $NAME{'TYPE_PC'} = 'PC-ctrl-Ms';
88 1         2 $NAME{'TYPE_MAC'} = 'MAC-ctrl-Ms';
89 1         2 $NAME{'TYPE_CSH'} = 'CSH';
90 1         1 $NAME{'TYPE_SH'} = 'SH';
91 1         2 $NAME{'TYPE_JAR'} = 'JAR archive';
92 1         2 $NAME{'TYPE_RAR'} = 'RAR archive';
93 1         1 $NAME{'TYPE_COMPRESSED'} = 'UNIX-compressed';
94 1         2 $NAME{'TYPE_ZIP'} = 'ZIP-compressed';
95 1         1 $NAME{'TYPE_GZIPPED'} = 'GZIP-compressed';
96 1         7 $NAME{'TYPE_BZIP2'} = 'BZIP2-compressed';
97 1         1 $NAME{'TYPE_MULTI_PART_MIME'} = 'MULTI_PART_MIME';
98 1         2 $NAME{'TYPE_TAR'} = 'TAR archive';
99 1         2 $NAME{'TYPE_IGNORE'} = ' user defined IGNORE';
100 1         1 $NAME{'TYPE_README'} = 'override';
101 1         2 $NAME{'TYPE_TEXAUX'} = 'TeX auxiliary';
102 1         1 $NAME{'TYPE_ABS'} = 'abstract';
103 1         1949 $NAME{'TYPE_INCLUDE'} = ' keep';
104             }
105              
106              
107             =head2 SUBROUTINES
108              
109             =head3 guess_file_type($filename)
110              
111             Guess the file type $filename. Returns ($type, $tex_format,
112             $error), all of which are strings. The $type may be supplied to
113             is_tex_type($type) or type_name($type) for additional information
114             related to the file type.
115              
116             For most files $tex_format and $error will be undefined. $tex_format
117             will be defined for some TeX file formats where there is additional
118             information about the type of TeX.
119              
120             =cut
121              
122             sub guess_file_type {
123 31     31 1 672 my ($filename) = @_;
124 31         43 local $_ = $filename;
125              
126 31 100       76 return 'TYPE_README' if /(^|\/)00README\.XXX$/;
127              
128             # Ignore tmp files created by (unpatched) dvihps, in top dir
129 30 50       52 return 'TYPE_ALWAYS_IGNORE' if /(^|\/)(head|body)\.tmp$/;
130              
131             # missfont.log files created in top dir should abort processing
132             # (missfont.log files in subdirs can be ignored)
133 30 50       57 return 'TYPE_ABORT' if /(^|\/)missfont.log$/;
134              
135 30 50       128 return 'TYPE_TEXAUX' if /\.(sty|cls|mf|\d*pk|bbl|bst|tfm|ax|def|log|hrfldf|cfg|clo|inx|end|fgx|tbx|rtx|rty|toc)$/i;
136 30 50       55 return 'TYPE_ABS' if /\.abs$/;
137 30 50       80 return 'TYPE_IGNORE' if /\.fig$/; # ignore xfig files
138 30 50       57 return 'TYPE_NOTEBOOK' if /\.nb$/i;
139 30 50       56 return 'TYPE_INPUT' if /\.inp$/i;
140              
141 30 50       57 return 'TYPE_HTML' if /\.html?$/i;
142 30 50       699 return 'TYPE_ENCRYPTED' if /\.cry$/;
143              
144             # Ignore zero size files
145 30 50       420 return 'TYPE_IGNORE' if (-z $filename);
146              
147             # Open file and read first few bytes to do magic sequence identification
148             # note that file will be auto-closed when $FILE_TO_GUESS goes out of scope
149 30 100       862 open(my $FILE_TO_GUESS, '<', $_) ||
150             return ('TYPE_FAILED', undef, "failed to open '$filename' to guess its format: $!. Continuing.\n");
151              
152 28   50     344 my $b1 = ord(getc($FILE_TO_GUESS) || 0);
153 28   50     53 my $b2 = ord(getc($FILE_TO_GUESS) || 0);
154 28   50     50 my $b3 = ord(getc($FILE_TO_GUESS) || 0);
155 28   50     42 my $b4 = ord(getc($FILE_TO_GUESS) || 0);
156 28   50     50 my $b5 = ord(getc($FILE_TO_GUESS) || 0);
157 28   50     47 my $b6 = ord(getc($FILE_TO_GUESS) || 0);
158 28   50     49 my $b7 = ord(getc($FILE_TO_GUESS) || 0);
159 28   50     45 my $b8 = ord(getc($FILE_TO_GUESS) || 0);
160              
161 28 50 33     67 return 'TYPE_COMPRESSED' if $b1 == 037 && $b2 == 0235;
162 28 50 33     52 return 'TYPE_GZIPPED' if $b1 == 037 && $b2 == 0213;
163 28 50 66     123 return 'TYPE_BZIP2' if $b1 == 0x42 && $b2 == 0x5A && $b3 == 0x68 && $b4 > 0x2F;
      66        
      33        
164              
165             # POSIX tarfiles: look for the string 'ustar' at posn 257
166             # (There used to be additional code to detect non-POSIX tar files
167             # which is not detected with above, no longer necessary)
168 25         20 my $tar_test;
169 25         161 seek($FILE_TO_GUESS, 257, 0);
170 25 50 66     220 if ( read($FILE_TO_GUESS, $tar_test, 5) && $tar_test eq 'ustar' ) {
171 0         0 return 'TYPE_TAR';
172             }
173              
174             # DVI
175 25 100 66     76 return 'TYPE_DVI' if $b1 == oct(367) && $b2 == oct(2);
176             # GIF
177 24 50       79 return 'TYPE_IMAGE' if (sprintf('%c%c%c%c',$b1,$b2,$b3,$b4) eq 'GIF8');
178             # PNG
179 24 50       88 return 'TYPE_IMAGE' if sprintf('%c%c%c%c%c%c%c%c',$b1,$b2,$b3,$b4,$b5,$b6,$b7,$b8) eq "\211PNG\r\n\032\n";
180              
181             # TIF
182 24 0 33     47 return 'TYPE_IMAGE' if $b1 == 0115 && $b2 == 0115 && $filename =~ /\.tif/i;
      33        
183             # JPEG
184 24 0 33     42 return 'TYPE_IMAGE' if ($b1==0377 && $b2==0330 && $b3==0377 && ($b4==0340 || $b4==0356));
      33        
      0        
      0        
185              
186             # MPEG
187 24 50 66     49 return 'TYPE_ANIM' if $b1 == 0 && $b2 == 0 && $b3 == 01 && $b4 == 0263;
      66        
      33        
188              
189 24 100 66     110 if (("PK\003\004" eq sprintf('%c%c%c%c', $b1,$b2,$b3,$b4)) or
190             ("PK00PK\003\004" eq sprintf('%c%c%c%c%c%c%c%c', $b1,$b2,$b3,$b4,$b5,$b6,$b7,$b8))) {
191 7 100       40 return 'TYPE_JAR' if ($filename =~ /\.jar$/i);
192 6 100       27 return 'TYPE_ODF' if ($filename =~ /\.odt$/i);
193 5 100       44 return 'TYPE_DOCX' if ($filename =~ /\.docx$/i);
194 3         55 return 'TYPE_ZIP'
195             }
196 17 100       63 return 'TYPE_RAR' if ('Rar!' eq sprintf('%c%c%c%c', $b1,$b2,$b3,$b4));
197              
198             #:0 belong 0xC5D0D3C6 DOS EPS Binary File
199             #->4 long >0 Postscript starts at byte %d
200 16 50 66     80 return 'TYPE_DOS_EPS' if $b1 == oct(305) && $b2 == oct(320) && $b3 == oct(323) && $b4 == oct(306);
      66        
      33        
201              
202 14         14 my $one_kb;
203 14         78 seek($FILE_TO_GUESS, 0, 0);
204 14         85 read($FILE_TO_GUESS, $one_kb, 1024);
205 14 50       42 return 'TYPE_PDF' if index($one_kb, '%PDF-') >= 0;
206 14 50       1667 return 'TYPE_MAC' if $one_kb =~ /#!\/bin\/csh -f\r#|(\r|^)begin \d{1,4}\s+\S.*\r[^\n]/;
207              
208 14         13 my ($maybe_tex, $maybe_tex_priority, $maybe_tex_priority2);
209 14         72 seek($FILE_TO_GUESS, 0, 0);
210              
211 14         53 local $/ = "\n";
212 14         16 my $accum='';
213 14         170 while (<$FILE_TO_GUESS>) {
214 40 50 33     92 if (/\%auto-ignore/ && $. <= 10) {
215 0         0 return 'TYPE_IGNORE';
216             }
217 40 50 33     157 if ($. <= 10 && /\\input texinfo/) {
218 0         0 return 'TYPE_TEXINFO';
219             }
220 40 50 33     1398 if ($. <= 40 && /(^|\r)Content-type: /i ) {
221 0         0 return 'TYPE_MULTI_PART_MIME';
222             }
223              
224             # Match strings starting at either 1st or 7th byte. Use $accum
225             # to build string of file to this point as the preceding 6 chars
226             # may include \n
227 40         70 $accum.=$_;
228 40 100 100     150 if ($. <= 7 && $accum=~/^(......)?%\!(PS-AdobeFont-1\.|FontType1|PS-Adobe-3\.0\ Resource-Font)/s) {
229 4         76 return 'TYPE_PS_FONT';
230             }
231              
232             # This must come after the test for TYPE_PS_FONT
233 36 50 66     91 if ($. == 1 && /^%\!/) {
234 0         0 return 'TYPE_POSTSCRIPT';
235             }
236              
237 36 50 33     258 if (($. == 1 && (/^\%*\004%\!/ || /.*%\!PS-Adobe/))
      66        
      33        
      33        
      33        
238             || ($. <= 10 && /^%\!PS/ && !$maybe_tex)) {
239 0         0 return 'TYPE_PS_PC';
240             }
241              
242 36 50 33     139 if ($. <= 12 && /^\r?%\&([^\s\n]+)/) {
243 0 0 0     0 if ($1 eq 'latex209' || $1 eq 'biglatex' ||
      0        
      0        
244             $1 eq 'latex' || $1 eq 'LaTeX') {
245 0         0 return ('TYPE_LATEX', $1);
246             } else {
247 0         0 return ('TYPE_TEX_MAC', $1);
248             }
249             }
250 36 50 33     171 if ($. <= 10 && /\s]/i) {
251 0         0 return 'TYPE_HTML';
252             }
253 36 50 33     133 if ($. <= 10 && /\%auto-include/) {
254 0         0 return 'TYPE_INCLUDE';
255             }
256             # All subsequent checks have lines with '%' in them chopped.
257             # if we need to look for a % then do it earlier!
258 36         104 s/\%[^\r]*//;
259 36 100       73 if (/(^|\r)\s*\\documentstyle/) {
260 1         19 return 'TYPE_LATEX';
261             }
262 35 100       72 if (/(^|\r)\s*\\documentclass/) {
263 8         10 return _type_of_latex2e(\*{$FILE_TO_GUESS});
  8         20  
264             }
265 27 50       57 if (/(^|\r)\s*(\\font|\\magnification|\\input|\\def|\\special|\\baselineskip|\\begin)/) {
266 0         0 $maybe_tex = 1;
267 0 0       0 return 'TYPE_TEX_priority' if /\\input\s+amstex/;
268             }
269 27 50       54 if (/(^|\r)\s*\\(end|bye)(\s|$)/) {
270 0         0 $maybe_tex_priority = 1;
271             }
272 27 50       64 if (/\\(end|bye)(\s|$)/) {
273 0         0 $maybe_tex_priority2 = 1;
274             }
275 27 50 33     102 if (/\\input *(harv|lanl)mac/ || /\\input\s+phyzzx/) {
276 0         0 return 'TYPE_TEX_MAC';
277             }
278 27 50       52 if (/beginchar\(/) {
279 0         0 return 'TYPE_MF';
280             }
281 27 50       52 if (/(^|\r)\@(book|article|inbook|unpublished)\{/i) {
282 0         0 return 'TYPE_BIBTEX';
283             }
284 27 50       37 if (/^begin \d{1,4}\s+[^\s]+\r?$/) {
285 0 0       0 return 'TYPE_TEX_priority' if $maybe_tex_priority;
286 0 0       0 return 'TYPE_TEX' if $maybe_tex;
287 0 0       0 return 'TYPE_PC' if /\r$/;
288 0         0 return 'TYPE_UUENCODED';
289             }
290 27 50       98 if (m/paper deliberately replaced by what little/) {
291 0         0 return 'TYPE_ALWAYS_IGNORE'; # Was 'TYPE_FAILED'
292             }
293             }
294 1 50       12 close $FILE_TO_GUESS || warn "couldn't close file: $!";
295              
296 1 50       4 return 'TYPE_TEX_priority' if $maybe_tex_priority;
297 1 50       3 return 'TYPE_TEX_priority2' if $maybe_tex_priority2;
298 1 50       4 return 'TYPE_TEX' if $maybe_tex;
299              
300 1         9 return 'TYPE_FAILED';
301             }
302              
303              
304             # _type_of_latex2e($filehandle)
305             #
306             # Takes an open file handle and searches for some heuristic regexps to
307             # determine whether the contents is of type pdflatex or regular latex2e.
308             #
309             sub _type_of_latex2e {
310 8     8   9 my ($filehandle) = @_;
311 8         11 my $lines_to_search=$.+5; #will search for pdfoutput from beginning to 5 lines beyond
312 8 50       60 if (seek($filehandle,0,0)) {
313 8         26 local $. = 0; #reset line counter if rewind worked
314             }
315              
316 8         56 while (<$filehandle>) {
317 295 100 100     1392 if (/^[^%]*\\includegraphics[^%]*\.(?:pdf|png|gif|jpg)\s?\}/i ||
      66        
318             ($. < $lines_to_search && /^[^%]*\\pdfoutput(?:\s+)?=(?:\s+)?1/)) {
319 5         39 return 'TYPE_PDFLATEX';
320             }
321             }
322 3         25 return 'TYPE_LATEX2e';
323             }
324              
325              
326             =head3 is_tex_type($type)
327              
328             Returns true (1) if the type name supplied is a TeX type, false ('')
329             otherwise.
330              
331             =cut
332              
333             sub is_tex_type {
334 4     4 1 7 my ($type) = @_;
335 4         18 return exists $TEX_types{$type};
336             }
337              
338              
339             =head3 type_name($type)
340              
341             Returns display string for the type $type. Or 'unknown' if
342             $type is not recognized.
343              
344             =cut
345              
346             sub type_name {
347 4     4 1 1819 my ($type)=@_;
348 4   100     40 return( $NAME{$type} || 'unknown' );
349             }
350              
351             1;
352              
353             __END__