File Coverage

blib/lib/Image/ExifTool/Text.pm
Criterion Covered Total %
statement 72 100 72.0
branch 39 80 48.7
condition 14 36 38.8
subroutine 4 4 100.0
pod 0 1 0.0
total 129 221 58.3


line stmt bran cond sub pod time code
1             #------------------------------------------------------------------------------
2             # File: Text.pm
3             #
4             # Description: Deduce characteristics of TXT and CSV files
5             #
6             # Revisions: 2019-11-01 - P. Harvey Created
7             # 2020-02-13 - PH Added CSV file support
8             #
9             # References: 1) https://github.com/file/file
10             #------------------------------------------------------------------------------
11              
12             package Image::ExifTool::Text;
13              
14 5     5   6830 use strict;
  5         12  
  5         235  
15 5     5   33 use vars qw($VERSION);
  5         11  
  5         307  
16 5     5   51 use Image::ExifTool qw(:DataAccess :Utils);
  5         12  
  5         10603  
17              
18             $VERSION = '1.05';
19              
20             # Text tags
21             %Image::ExifTool::Text::Main = (
22             VARS => { ID_FMT => 'none' },
23             GROUPS => { 0 => 'File', 1 => 'File', 2 => 'Document' },
24             NOTES => q{
25             Although basic text files contain no metadata, the following tags are
26             determined from a simple analysis of the data in TXT and CSV files.
27             Statistics are generated only for 8-bit encodings, but the L<FastScan|../ExifTool.html#FastScan> (-fast)
28             option may be used to limit processing to the first 64 KiB in which case
29             some tags are not produced. To avoid long processing delays, ExifTool will
30             issue a minor warning and process only the first 64 KiB of any file larger
31             than 20 MiB unless the L<IgnoreMinorErrors|../ExifTool.html#IgnoreMinorErrors> (-m) option is used.
32             },
33             MIMEEncoding => { Groups => { 2 => 'Other' } },
34             Newlines => {
35             PrintConv => {
36             "\r\n" => 'Windows CRLF',
37             "\r" => 'Macintosh CR',
38             "\n" => 'Unix LF',
39             '' => '(none)',
40             },
41             },
42             ByteOrderMark => { PrintConv => { 0 => 'No', 1 => 'Yes' } },
43             LineCount => { },
44             WordCount => { },
45             Delimiter => { PrintConv => { '' => '(none)', ',' => 'Comma', ';' => 'Semicolon', "\t" => 'Tab' }},
46             Quoting => { PrintConv => { '' => '(none)', '"' => 'Double quotes', "'" => 'Single quotes' }},
47             RowCount => { },
48             ColumnCount => { },
49             );
50              
51             #------------------------------------------------------------------------------
52             # Extract some stats from a text file
53             # Inputs: 0) ExifTool ref, 1) dirInfo ref
54             # Returns: 1 on success, 0 if this wasn't a text file
               # Notes: - uses $$dirInfo{TestBuff} (scalar ref to the data being tested) and
               #          $$dirInfo{RAF} (object used here for Read/Seek/ReadLine/Tell)
               #        - always stores MIMEEncoding; ByteOrderMark (when known) and Newlines
               #          follow unless FastScan is 3 or the seek to the start of file fails
               #        - statistics (CSV: Delimiter/Quoting/ColumnCount/RowCount, otherwise
               #          LineCount/WordCount) are generated only when FastScan is 0 and the
               #          data was analyzed as an 8-bit encoding
               #        - returns 0 for an empty test buffer, a failed read, or data containing
               #          unusual control characters without a UTF-16/UTF-32 byte order mark
               # NOTE(review): $/ is assigned directly and restored by hand near the end of
               #          the routine; if anything in the counting loop dies, $/ stays modified
               #          for the caller.  "local $/" would restore it automatically -- this
               #          should be fixed in Text.pm itself and the report regenerated
               # NOTE(review): per the counters in this report, the read-more path, the UTF-32
               #          and UTF-16LE branches, the 1000-row limit and the encoding re-check
               #          past 64 kB were never executed in this run
55             sub ProcessTXT($$)
56             {
57 17     17 0 54 my ($et, $dirInfo) = @_;
58 17         47 my $dataPt = $$dirInfo{TestBuff};
59 17         50 my $raf = $$dirInfo{RAF};
               # FastScan option level (a false/undefined setting is treated as 0)
60 17   50     79 my $fast = $et->Options('FastScan') || 0;
               # $isUTF8 is set only on the 8-bit path below; it stays undef for UTF-16/UTF-32
               # and that is what later suppresses the statistics for those encodings
61 17         41 my ($buff, $enc, $isBOM, $isUTF8);
62 17         36 my $nl = '';
63              
64 17 100       78 return 0 unless length $$dataPt; # can't call it a text file if it has no text
65              
66             # read more from the file if necessary
               # (only when the test buffer is exactly $Image::ExifTool::testLen bytes long and
               # FastScan is below 3; the analysis then works on up to 65536 bytes held in $buff)
67 14 50 33     86 if ($fast < 3 and length($$dataPt) == $Image::ExifTool::testLen) {
68 0 0       0 $raf->Read($buff, 65536) or return 0;
69 0         0 $dataPt = \$buff;
70             }
71             #
72             # make our best guess at the character encoding (EBCDIC is not supported)
73             #
               # the class below matches NUL, DEL and every C0 control character except
               # BEL, BS, TAB, LF, VT, FF, CR and ESC
74 14 100       75 if ($$dataPt =~ /([\0-\x06\x0e-\x1a\x1c-\x1f\x7f])/) {
75             # file contains weird control characters, could be multi-byte Unicode
               # (the UTF-32 BOMs are tested first because the UTF-32LE BOM begins with the
               # same two bytes as the UTF-16LE BOM)
76 1 50       12 if ($$dataPt =~ /^(\xff\xfe\0\0|\0\0\xfe\xff)/) {
    50          
77 0 0       0 if ($1 eq "\xff\xfe\0\0") {
78 0         0 $enc = 'utf-32le';
               # newline = CR+LF, CR or LF code unit; the nulls captured inside a CR+LF
               # pair are stripped after this if/elsif chain
79 0 0       0 $nl = $1 if $$dataPt =~ /(\r\0\0\0\n|\r|\n)\0\0\0/;
80             } else {
81 0         0 $enc = 'utf-32be';
82 0 0       0 $nl = $1 if $$dataPt =~ /\0\0\0(\r\0\0\0\n|\r|\n)/;
83             }
84             } elsif ($$dataPt =~ /^(\xff\xfe|\xfe\xff)/) {
85 1 50       7 if ($1 eq "\xff\xfe") {
86 0         0 $enc = 'utf-16le';
87 0 0       0 $nl = $1 if $$dataPt =~ /(\r\0\n|\r|\n)\0/;
88             } else {
89 1         3 $enc = 'utf-16be';
90 1 50       11 $nl = $1 if $$dataPt =~ /\0(\r\0\n|\r|\n)/;
91             }
92             } else {
93 0         0 return 0; # probably not a text file
94             }
95 1         4 $nl =~ tr/\0//d; # remove nulls from newline sequence
96 1         3 $isBOM = 1; # (we don't recognize UTF-16/UTF-32 without one)
97             } else {
               # 8-bit data: the branches below treat an IsUTF8 result of 0 as plain ASCII,
               # a positive result as UTF-8 and a negative result as some other 8-bit
               # encoding (the meaning of the second argument is not visible here)
98 13         66 $isUTF8 = Image::ExifTool::IsUTF8($dataPt, 1);
99 13 100       86 if ($isUTF8 == 0) {
    100          
    100          
100 10         27 $enc = 'us-ascii';
101             } elsif ($isUTF8 > 0) {
102 1         2 $enc = 'utf-8';
               # ByteOrderMark is reported for UTF-8 only (Yes/No for a leading EF BB BF)
103 1 50       6 $isBOM = ($$dataPt =~ /^\xef\xbb\xbf/ ? 1 : 0);
               # no bytes in the 0x80-0x9f range: call it ISO 8859-1, otherwise give up
104             } elsif ($$dataPt !~ /[\x80-\x9f]/) {
105 1         2 $enc = 'iso-8859-1';
106             } else {
107 1         4 $enc = 'unknown-8bit';
108             }
               # the first line ending found decides the Newlines value ('' if none found)
109 13 100       109 $nl = $1 if $$dataPt =~ /(\r\n|\r|\n)/;
110             }
111              
112 14         59 my $tagTablePtr = GetTagTable('Image::ExifTool::Text::Main');
113              
114 14         96 $et->SetFileType();
115 14         96 $et->HandleTag($tagTablePtr, MIMEEncoding => $enc);
116              
               # FastScan 3 stops after MIMEEncoding; also stop (still reporting success) if
               # the file can't be rewound for the line-by-line pass below
117 14 50 33     129 return 1 if $fast == 3 or not $raf->Seek(0,0);
118              
119 14 100       76 $et->HandleTag($tagTablePtr, ByteOrderMark => $isBOM) if defined $isBOM;
120 14         62 $et->HandleTag($tagTablePtr, Newlines => $nl);
121              
               # no statistics for any non-zero FastScan level or for UTF-16/UTF-32 data
122 14 100 66     89 return 1 if $fast or not defined $isUTF8;
123             #
124             # generate stats for CSV files
125             #
126 13 100       67 if ($$et{FileType} eq 'CSV') {
127 1         3 my ($delim, $quot, $ncols);
128 1         4 my $nrows = 0;
129 1         7 while ($raf->ReadLine($buff)) {
               # the first line alone decides the delimiter and the column count
130 3 100       12 if (not defined $delim) {
    50          
131 1         6 my %count = ( ',' => 0, ';' => 0, "\t" => 0 );
132 1         14 ++$count{$_} foreach $buff =~ /[,;\t]/g;
               # comma wins only if strictly more frequent than both others, then semicolon
               # if strictly more frequent than tab, then tab if present at all; with no
               # candidate the line is a single column with an empty delimiter
133 1 50 33     11 if ($count{','} > $count{';'} and $count{','} > $count{"\t"}) {
    0          
    0          
134 1         3 $delim = ',';
135             } elsif ($count{';'} > $count{"\t"}) {
136 0         0 $delim = ';';
137             } elsif ($count{"\t"}) {
138 0         0 $delim = "\t";
139             } else {
140 0         0 $delim = '';
141 0         0 $ncols = 1;
142             }
143 1 50       4 unless ($ncols) {
144             # account for delimiters in quotes (simplistically)
               # (a quoted field must start at the line start or after a delimiter and be
               # followed by a delimiter or the end; the last quote character seen is kept)
145 1         103 while ($buff =~ /(^|$delim)(["'])(.*?)\2(?=$delim|$)/sg) {
146 0         0 $quot = $2;
147 0         0 my $field = $3;
               # "() =" forces list context so the number of matches is what gets subtracted
148 0         0 $count{$delim} -= () = $field =~ /$delim/g;
149             }
150 1         6 $ncols = $count{$delim} + 1;
151             }
               # later lines are searched only for a quote character, until one is found
               # NOTE(review): with an empty $delim the pattern's alternations can match the
               # empty string, so a quoted run anywhere in the line appears to qualify -- confirm
               # this is intended for delimiter-less files
152             } elsif (not $quot) {
153 2 50       94 $quot = $2 if $buff =~ /(^|$delim)(["'])(.*?)\2(?=$delim|$)/sg;
154             }
               # every line read counts as a row (including the first); on reaching row 1000
               # counting stops if Warn() returns true, and RowCount is then not reported
               # NOTE(review): presumably the level-2 Warn returns false when minor errors are
               # ignored, letting the count continue -- verify against ExifTool's Warn()
155 3 50 33     19 if (++$nrows == 1000 and $et->Warn('Not counting rows past 1000', 2)) {
156 0         0 undef $nrows;
157 0         0 last;
158             }
159             }
               # undefined values are reported as '' (shown as "(none)" by the tag table)
160 1   50     9 $et->HandleTag($tagTablePtr, Delimiter => ($delim || ''));
161 1   50     10 $et->HandleTag($tagTablePtr, Quoting => ($quot || ''));
162 1         4 $et->HandleTag($tagTablePtr, ColumnCount => $ncols);
163 1 50       7 $et->HandleTag($tagTablePtr, RowCount => $nrows) if $nrows;
164 1         5 return 1;
165             }
               # skip the line/word count for files over 20000000 bytes when Warn() returns true
               # NOTE(review): the limit and message are decimal ("20 MB") while the tag-table
               # NOTES say 20 MiB -- decide which is intended and make them agree in Text.pm
166 12 50 33     97 return 1 if $$et{VALUE}{FileSize} and $$et{VALUE}{FileSize} > 20000000 and
      33        
167             $et->Warn('Not counting lines/words in text file larger than 20 MB', 2);
168             #
169             # count lines/words and check encoding of the rest of the file
170             #
171 12         31 my ($lines, $words) = (0, 0);
               # use the detected newline as the input record separator while counting and
               # put the previous value back afterwards (presumably $raf->ReadLine() honours
               # $/ -- verify in the RAF implementation)
172 12         43 my $oldNL = $/;
173 12 100       83 $/ = $nl if $nl;
174 12         69 while ($raf->ReadLine($buff)) {
175 12         43 ++$lines;
               # a word is any run of non-whitespace characters
176 12         135 ++$words while $buff =~ /\S+/g;
177 12 50 66     79 if (not $nl and $buff =~ /(\r\n|\r|\n)$/) {
178             # (the first line must have been longer than 64 kB)
               # overwrite the Newlines value stored earlier, directly in $$et{VALUE}
179 0         0 $$et{VALUE}{Newlines} = $nl = $1;
180             }
               # data within the first 65536 bytes was already covered by the initial analysis
181 12 50       51 next if $raf->Tell() < 65536;
182             # continue to check encoding after the first 64 kB
183 0 0 0     0 if ($isUTF8 >= 0) { # (if ascii or utf8)
    0          
               # re-test this line only; a result of 0 (plain ASCII) leaves $enc unchanged
184 0         0 $isUTF8 = Image::ExifTool::IsUTF8(\$buff);
185 0 0       0 if ($isUTF8 > 0) {
    0          
186 0         0 $enc = 'utf-8';
187             } elsif ($isUTF8 < 0) {
188 0 0       0 $enc = $buff =~ /[\x80-\x9f]/ ? 'unknown-8bit' : 'iso-8859-1';
189             }
               # already non-UTF-8: the only remaining change is iso-8859-1 -> unknown-8bit
190             } elsif ($enc eq 'iso-8859-1' and $buff =~ /[\x80-\x9f]/) {
191 0         0 $enc = 'unknown-8bit';
192             }
193             }
194             # ($$et{VALUE}{MIMEEncoding} may be undef if it was ignored)
               # replace the encoding stored earlier if the rest of the file changed the verdict
195 12 50 33     106 if (defined $$et{VALUE}{MIMEEncoding} and $$et{VALUE}{MIMEEncoding} ne $enc) {
196 0         0 $$et{VALUE}{MIMEEncoding} = $enc;
197 0         0 $et->VPrint(0," MIMEEncoding [override] = $enc\n");
198             }
               # restore the caller's input record separator
199 12         64 $/ = $oldNL;
200 12         62 $et->HandleTag($tagTablePtr, LineCount => $lines);
201 12         54 $et->HandleTag($tagTablePtr, WordCount => $words);
202 12         54 return 1;
203             }
204              
205              
206             1; # end
207              
208             __END__