File Coverage

blib/lib/App/SpamcupNG/HTMLParse.pm
Criterion Covered Total %
statement 198 203 97.5
branch 71 90 78.8
condition 4 8 50.0
subroutine 15 15 100.0
pod 8 8 100.0
total 296 324 91.3


line stmt bran cond sub pod time code
1             use strict;
2 12     12   540371 use warnings;
  12         81  
  12         269  
3 12     12   50 use HTML::TreeBuilder::XPath 0.14;
  12         18  
  12         288  
4 12     12   4559 use Exporter 'import';
  12         589829  
  12         103  
5 12     12   412 use Carp 'croak';
  12         21  
  12         347  
6 12     12   53  
  12         22  
  12         572  
7             use App::SpamcupNG::Error::Factory qw(create_error);
8 12     12   4701 use App::SpamcupNG::Warning::Factory qw(create_warning);
  12         22  
  12         630  
9 12     12   3637  
  12         26  
  12         20989  
10             our @EXPORT_OK = (
11             'find_next_id', 'find_errors',
12             'find_warnings', 'find_spam_header',
13             'find_best_contacts', 'find_receivers',
14             'find_message_age', 'find_header_info'
15             );
16              
17             my %regexes = (
18             next_id => qr#^/sc\?id=(\w+)#,
19             message_age => qr/^Message\sis\s(\d+)\s(\w+)\sold/
20             );
21              
22             our $VERSION = '0.015'; # VERSION
23              
24             =head1 NAME
25              
26             App::SpamcupNG::HTMLParse - functions to extract information from Spamcop.net
27             web pages
28              
29             =head1 SYNOPSIS
30              
31             use App::SpamcupNG::HTMLParse qw(find_next_id find_errors find_warnings find_spam_header find_message_age find_header_info);
32              
33             =head1 DESCRIPTION
34              
35             This package export functions that uses XPath to extract specific information
36             from the spamcop.net HTML pages.
37              
38             =head1 EXPORTS
39              
40             Following are all exported functions by this package.
41              
42             =head2 find_header_info
43              
44             Finds information from the e-mail header of the received SPAM and returns it.
45              
46             Returns a hash reference with the following keys:
47              
48             =over
49              
50             =item mailer: the X-Mailer header, if available
51              
52             =item content_type: the Content-Type, if available
53              
54             =back
55              
56             There is an attempt to normalize the C<Content-Type> header, by removing extra
57             spaces and using just the first two entries, also making everything as lower
58             case.
59              
60             =cut
61              
62             my $content_ref = shift;
63             croak "Must receive an scalar reference as parameter"
64 4     4 1 5665 unless ( ref($content_ref) eq 'SCALAR' );
65 4 50       19 my $tree = HTML::TreeBuilder::XPath->new;
66             $tree->parse_content($$content_ref);
67 4         35 my @nodes = $tree->findnodes('/html/body/div[@id="content"]/pre');
68 4         947 my %info = (
69 4         100903 mailer => undef,
70 4         18045 content_type => undef
71             );
72             my $mailer_regex = qr/^X-Mailer:/;
73             my $content_type_regex = qr/^Content-Type:/;
74 4         23  
75 4         12 foreach my $node (@nodes) {
76              
77 4         9 foreach my $content ( split( "\n", $node->as_text() ) ) {
78             $content =~ s/^\s+//;
79 4         23 $content =~ s/\s+$//;
80 251         636 next if ( $content eq '' );
81 251         391  
82 251 50       316 if ( $content =~ $mailer_regex ) {
83             my $wanted = ( split( ':', $content ) )[1];
84 251 100       456 $wanted =~ s/^\s+//;
85 1         4 $info{mailer} = $wanted;
86 1         2 next;
87 1         2 }
88 1         2  
89             if ( $content =~ $content_type_regex ) {
90             my $wanted = ( split( ':', $content ) )[1];
91 250 100       412 $wanted =~ s/^\s+//;
92 4         14 my @wanted = split( ';', $wanted );
93 4         11  
94 4         12 if ( scalar(@wanted) > 1 ) {
95             my $encoding = lc( $wanted[0] );
96 4 100       11 my $charset = lc( $wanted[1] );
97 3         10 $charset =~ s/^\s+//;
98 3         5 $charset =~ tr/"//d;
99 3         9  
100 3         15 my $not_useful = 'boundary';
101              
102 3         31 if (
103             substr( $charset, 0, length($not_useful) ) eq
104 3 100       27 $not_useful )
105             {
106             $info{content_type} = $encoding;
107             $info{charset} = undef;
108 1         3 }
109 1         3 else {
110             $info{content_type} = $encoding;
111             $info{charset} = ( split( '=', $charset ) )[1];
112 2         7 }
113 2         9 }
114             else {
115             chop $wanted if ( substr( $wanted, -1 ) eq ';' );
116             $info{content_type} = $wanted;
117 1 50       4 }
118 1         3  
119             next;
120             }
121 4         11  
122             last if ( $info{mailer} and $info{content_type} );
123             }
124 246 50 66     355 }
125              
126             return \%info;
127              
128 4         409 }
129              
130             =head2 find_message_age
131              
132             Find and return the SPAM message age information.
133              
134             Returns an array reference, with the zero index as an integer with the age, and
135             the index 1 as the age unit (possibly "hour");
136              
137             If nothing is found, returns C<undef>;
138              
139             =cut
140              
141             my $content_ref = shift;
142             croak "Must receive an scalar reference as parameter"
143             unless ( ref($content_ref) eq 'SCALAR' );
144 2     2 1 211 my $tree = HTML::TreeBuilder::XPath->new;
145 2 50       11 $tree->parse_content($$content_ref);
146             my @nodes = $tree->findnodes('/html/body/child::div[@id="content"]');
147 2         31  
148 2         578 foreach my $node (@nodes) {
149 2         52165 foreach my $content ( $node->content_refs_list ) {
150             next unless ( ref($content) eq 'SCALAR' );
151 2         8634 $$content =~ s/^\s+//;
152 2         14 $$content =~ s/\s+$//;
153 70 100       111 next if ( $$content eq '' );
154 21         46  
155 21         46 if ( $$content =~ $regexes{message_age} ) {
156 21 100       35 my ( $age, $unit ) = ( $1, $2 );
157             chop $unit if ( substr( $unit, -1 ) eq 's' );
158 19 100       53 return [ $age, $unit ];
159 2         8 }
160 2 50       11 }
161 2         168 }
162              
163             return undef;
164             }
165              
166 0         0 =head2 find_next_id
167              
168             Expects as parameter a scalar reference of the HTML page.
169              
170             Tries to find the SPAM ID used to identify SPAM reports on spamcop.net webpage.
171              
172             Returns the ID if found, otherwise C<undef>.
173              
174             =cut
175              
176             my $content_ref = shift;
177             croak "Must receive an scalar reference as parameter"
178             unless ( ref($content_ref) eq 'SCALAR' );
179             my $tree = HTML::TreeBuilder::XPath->new;
180 3     3 1 215 $tree->parse_content($$content_ref);
181 3 100       24 my @nodes = $tree->findnodes('//strong/a');
182             my $next_id;
183 2         20  
184 2         498 foreach my $element (@nodes) {
185 2         30847 if ( $element->as_trimmed_text eq 'Report Now' ) {
186 2         11267  
187             if ( $element->attr('href') =~ $regexes{next_id} ) {
188 2         5 $next_id = $1;
189 4 100       152 my $length = length($next_id);
190             my $expected = 45;
191 2 50       60 warn
192 2         37 "Unexpected length for SPAM ID: got $length, expected $expected"
193 2         4 unless ( $length == $expected );
194 2         3 last;
195 2 50       7 }
196             }
197             }
198 2         4  
199             return $next_id;
200             }
201              
202             =head2 find_warnings
203 2         132  
204             Expects as parameter a scalar reference of the HTML page.
205              
206             Tries to find all warnings on the HTML, based on CSS classes.
207              
208             Returns an array reference with all warnings found.
209              
210             =cut
211              
212             # TODO: create a single tree instance and check for everything at once
213             my $content_ref = shift;
214             croak "Must receive an scalar reference as parameter"
215             unless ( ref($content_ref) eq 'SCALAR' );
216             my $tree = HTML::TreeBuilder::XPath->new;
217             $tree->parse_content($$content_ref);
218 3     3 1 224 my @nodes
219 3 100       28 = $tree->findnodes('//div[@id="content"]/div[@class="warning"]');
220             my @warnings;
221 2         21  
222 2         509 foreach my $node (@nodes) {
223             my @all_text;
224 2         53324 push( @all_text, $node->as_trimmed_text );
225 2         58366  
226             # Spamcop page might add other text lines after the div, until the next div.
227 2         6 foreach my $next ( $node->right() ) {
228 2         3 if ( ref $next ) {
229 2         13 next if ( $next->tag eq 'br' );
230             last if ( $next->tag eq 'div' );
231             }
232 2         109 else {
233 16 100       300 push( @all_text, $next );
234 9 100       19 }
235 2 50       12 }
236              
237             push( @warnings, create_warning( \@all_text ) );
238 7         8 }
239              
240             return \@warnings;
241             }
242 2         16  
243             =head2 find_errors
244              
245 2         213 Expects as parameter a scalar reference of the HTML page.
246              
247             Tries to find all errors on the HTML, based on CSS classes.
248              
249             Returns an array reference with all errors found.
250              
251             =cut
252              
253             my $content_ref = shift;
254             croak "Must receive an scalar reference as parameter"
255             unless ( ref($content_ref) eq 'SCALAR' );
256             my $tree = HTML::TreeBuilder::XPath->new;
257             $tree->parse_content($$content_ref);
258             my @nodes = $tree->findnodes('//div[@id="content"]/div[@class="error"]');
259 7     7 1 1189 my @errors;
260 7 100       52  
261             foreach my $node (@nodes) {
262 6         62 my @all_text;
263 6         1440 push( @all_text, $node->as_trimmed_text );
264 6         93858  
265 6         97135 foreach my $next ( $node->right() ) {
266             if ( ref $next ) {
267 6         16 next if ( $next->tag eq 'br' );
268 5         9 last if ( $next->tag eq 'div' );
269 5         20 }
270             else {
271 5         685 push( @all_text, $next );
272 25 100       301 }
273 23 100       34 }
274 16 100       74  
275             push( @errors, create_error( \@all_text ) );
276             }
277 2         3  
278             # bounce errors are inside a form
279             my $base_xpath = '//form[@action="/mcgi"]';
280             @nodes = $tree->findnodes( $base_xpath . '//strong' );
281 5         137  
282             if (@nodes) {
283             if ( $nodes[0]->as_trimmed_text() eq 'Bounce error' ) {
284             my @nodes = $tree->findnodes($base_xpath);
285 6         48 $nodes[0]->parent(undef);
286 6         29 my @messages;
287              
288 6 100       79935 foreach my $node ( $nodes[0]->content_list() ) {
289 1 50       4 next unless ( ref($node) eq '' );
290 1         48 push( @messages, $node );
291 1         10808 }
292 1         9  
293             push( @errors, create_error( \@messages ) );
294 1         3 }
295 8 100       17 }
296 3         5  
297             return \@errors;
298             }
299 1         5  
300             =head2 find_best_contacts
301              
302             Expects as parameter a scalar reference of the HTML page.
303 6         491  
304             Tries to find all best contacts on the HTML, based on CSS classes.
305              
306             The best contacts are the e-mail address that Spamcop considers appropriate to
307             use for SPAM reporting.
308              
309             Returns an array reference with all best contacts found.
310              
311             =cut
312              
313             my $content_ref = shift;
314             croak "Must receive an scalar reference as parameter"
315             unless ( ref($content_ref) eq 'SCALAR' );
316             my $tree = HTML::TreeBuilder::XPath->new;
317             $tree->parse_content($content_ref);
318             my @nodes = $tree->findnodes('//div[@id="content"]');
319              
320 4     4 1 3628 foreach my $node (@nodes) {
321 4 100       39 for my $html_element ( $node->content_list ) {
322              
323 3         31 # only text
324 3         734 next if ref($html_element);
325 3         76663 $html_element =~ s/^\s+//;
326             if ( index( $html_element, 'Using best contacts' ) == 0 ) {
327 3         75141 my @tokens = split( /\s/, $html_element );
328 3         11 splice( @tokens, 0, 3 );
329             return \@tokens;
330             }
331 130 100       203 }
332 42         78  
333 42 100       76 }
334 2         16  
335 2         8 return [];
336 2         211 }
337              
338             =head2 find_spam_header
339              
340             Expects as parameter a scalar reference of the HTML page.
341              
342 1         119 You can optionally pass a second parameter that defines if each line should be
343             prefixed with a tab character. The default value is false.
344              
345             Tries to find the e-mail header of the SPAM reported.
346              
347             Returns an array reference with all the lines of the e-mail header found.
348              
349             =cut
350              
351             my $content_ref = shift;
352             croak "Must receive an scalar reference as parameter"
353             unless ( ref($content_ref) eq 'SCALAR' );
354             my $formatted //= 0;
355             my $tree = HTML::TreeBuilder::XPath->new;
356             $tree->parse_content($content_ref);
357              
358             my @nodes = $tree->findnodes('/html/body/div[5]/p[1]/strong');
359 2     2 1 206 my $expected = 'Please make sure this email IS spam:';
360 2 50       9 my $parent = undef;
361              
362 2   50     11 foreach my $node (@nodes) {
363 2         19 if ( $node->as_trimmed_text eq $expected ) {
364 2         503 $parent = $node->parent;
365             last;
366 2         50698 }
367 2         8915 }
368 2         4  
369             if ($parent) {
370 2         5 $parent->parent(undef);
371 2 50       12 @nodes = $parent->findnodes('//font');
372 2         123  
373 2         14 if ( ( scalar(@nodes) != 1 )
374             or ( ref( $nodes[0] ) ne 'HTML::Element' ) )
375             {
376             croak 'Unexpected content of SPAM header: ' . Dumper(@nodes);
377 2 50       9 }
378 2         6  
379 2         33 my @lines;
380             my $header = $nodes[0]->content;
381 2 50 33     1969  
382             for ( my $i = 0 ; $i <= scalar( @{$header} ) ; $i++ ) {
383             if ( ref( $header->[$i] ) eq 'HTML::Element' ) {
384 0         0 $header->[$i]->parent(undef);
385              
386             # just want text here
387 2         4 next unless $header->[$i]->content;
388 2         11 my $content = ( $header->[$i]->content )->[0];
389             next unless $content;
390 2         8 next if ( ref($content) );
  20         46  
391 18 100       32 $header->[$i] = $content;
392 8         19 }
393             next unless $header->[$i];
394              
395 8 100       68 # removing Unicode spaces in place
396 2         11 $header->[$i] =~ s/^\s++//u;
397 2 50       10  
398 2 50       6 if ($formatted) {
399 2         5 push( @lines, "\t$header->[$i]" );
400              
401 12 100       21 }
402             else {
403             push( @lines, $header->[$i] );
404 10         27  
405             }
406 10 50       17 }
407 0         0 return \@lines;
408             }
409              
410             return [];
411 10         14 }
412              
413             =head2 find_receivers
414              
415 2         163 Expects as parameter a scalar reference of the HTML page.
416              
417             Tries to find all the receivers of the SPAM report, even if those were not real
418 0         0 e-mail address, only internal identifiers for Spamcop to store statistics.
419              
420             Returns an array reference, where each item is a string.
421              
422             =cut
423              
424             my $content_ref = shift;
425             croak "Must receive an scalar reference as parameter"
426             unless ( ref($content_ref) eq 'SCALAR' );
427             my $tree = HTML::TreeBuilder::XPath->new;
428             $tree->parse_content($content_ref);
429             my @nodes = $tree->findnodes('//*[@id="content"]');
430             my @receivers;
431             my $devnull = q{/dev/null'ing};
432             my $report_sent = 'Spam report id';
433 2     2 1 218  
434 2 50       9 foreach my $node (@nodes) {
435             foreach my $inner ( $node->content_list() ) {
436 2         23  
437 2         506 # we just want text nodes, everything else is discarded
438 2         18974 next if ( ref($inner) );
439 2         35241 $inner =~ s/^\s+//;
440 2         6 $inner =~ s/\s+$//;
441 2         5  
442             my $result_ref;
443 2         6 my @parts = split( /\s/, $inner );
444 2         11  
445             # /dev/null\'ing report for google-abuse-bounces-reports@devnull.spamcop.net
446             if ( substr( $inner, 0, length($devnull) ) eq $devnull ) {
447 10 100       28 $result_ref = [ ( split( '@', $parts[-1] ) )[0], undef ];
448 4         13 }
449 4         12  
450             # Spam report id 7151980235 sent to: dl_security_whois@navercorp.com
451 4         4 elsif (
452 4         34 substr( $inner, 0, length($report_sent) ) eq $report_sent )
453             {
454             $result_ref = [ $parts[6], $parts[3] ];
455 4 100       21 }
    50          
456 3         11 else {
457             warn "Unexpected receiver format: $inner";
458             }
459              
460             push( @receivers, $result_ref );
461             }
462             }
463 1         3  
464             return \@receivers;
465             }
466 0         0  
467             =head1 SEE ALSO
468              
469 4         9 =over
470              
471             =item *
472              
473 2         104 L<HTML::TreeBuilder::XPath>
474              
475             =back
476              
477             =head1 AUTHOR
478              
479             Alceu Rodrigues de Freitas Junior, E<lt>arfreitas@cpan.orgE<gt>
480              
481             =head1 COPYRIGHT AND LICENSE
482              
483             This software is copyright (c) 2018 of Alceu Rodrigues de Freitas Junior,
484             E<lt>arfreitas@cpan.orgE<gt>
485              
486             This file is part of App-SpamcupNG distribution.
487              
488             App-SpamcupNG is free software: you can redistribute it and/or modify it under
489             the terms of the GNU General Public License as published by the Free Software
490             Foundation, either version 3 of the License, or (at your option) any later
491             version.
492              
493             App-SpamcupNG is distributed in the hope that it will be useful, but WITHOUT
494             ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
495             FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
496              
497             You should have received a copy of the GNU General Public License along with
498             App-SpamcupNG. If not, see <http://www.gnu.org/licenses/>.
499              
500             =cut
501              
502             1;