File Coverage

blib/lib/App/SpamcupNG/HTMLParse.pm
Criterion Covered Total %
statement 198 203 97.5
branch 71 90 78.8
condition 4 8 50.0
subroutine 15 15 100.0
pod 8 8 100.0
total 296 324 91.3


line stmt bran cond sub pod time code
1             package App::SpamcupNG::HTMLParse;
2 12     12   745280 use strict;
  12         94  
  12         368  
3 12     12   69 use warnings;
  12         32  
  12         384  
4 12     12   7664 use HTML::TreeBuilder::XPath 0.14;
  12         810088  
  12         124  
5 12     12   545 use Exporter 'import';
  12         32  
  12         411  
6 12     12   76 use Carp 'croak';
  12         26  
  12         667  
7              
8 12     12   6037 use App::SpamcupNG::Error::Factory qw(create_error);
  12         29  
  12         768  
9 12     12   5036 use App::SpamcupNG::Warning::Factory qw(create_warning);
  12         32  
  12         29442  
10              
11             our @EXPORT_OK = (
12             'find_next_id', 'find_errors',
13             'find_warnings', 'find_spam_header',
14             'find_best_contacts', 'find_receivers',
15             'find_message_age', 'find_header_info'
16             );
17              
18             my %regexes = (
19             next_id => qr#^/sc\?id=(\w+)#,
20             message_age => qr/^Message\sis\s(\d+)\s(\w+)\sold/
21             );
22              
23             our $VERSION = '0.016'; # VERSION
24              
25             =head1 NAME
26              
27             App::SpamcupNG::HTMLParse - functions to extract information from Spamcop.net
28             web pages
29              
30             =head1 SYNOPSIS
31              
32             use App::SpamcupNG::HTMLParse qw(find_next_id find_errors find_warnings find_spam_header find_message_age find_header_info);
33              
34             =head1 DESCRIPTION
35              
36             This package export functions that uses XPath to extract specific information
37             from the spamcop.net HTML pages.
38              
39             =head1 EXPORTS
40              
41             Following are all exported functions by this package.
42              
43             =head2 find_header_info
44              
45             Finds information from the e-mail header of the received SPAM and returns it.
46              
47             Returns a hash reference with the following keys:
48              
49             =over
50              
51             =item mailer: the X-Mailer header, if available
52              
53             =item content_type: the Content-Type, if available
54              
55             =back
56              
57             There is an attempt to normalize the C<Content-Type> header, by removing extra
58             spaces and using just the first two entries, also making everything as lower
59             case.
60              
61             =cut
62              
63             sub find_header_info {
64 4     4 1 8924 my $content_ref = shift;
65 4 50       23 croak "Must receive an scalar reference as parameter"
66             unless ( ref($content_ref) eq 'SCALAR' );
67 4         37 my $tree = HTML::TreeBuilder::XPath->new;
68 4         1096 $tree->parse_content($$content_ref);
69 4         138872 my @nodes = $tree->findnodes('/html/body/div[@id="content"]/pre');
70 4         24139 my %info = (
71             mailer => undef,
72             content_type => undef
73             );
74 4         26 my $mailer_regex = qr/^X-Mailer:/;
75 4         19 my $content_type_regex = qr/^Content-Type:/;
76              
77 4         16 foreach my $node (@nodes) {
78              
79 4         25 foreach my $content ( split( "\n", $node->as_text() ) ) {
80 251         839 $content =~ s/^\s+//;
81 251         527 $content =~ s/\s+$//;
82 251 50       434 next if ( $content eq '' );
83              
84 251 100       666 if ( $content =~ $mailer_regex ) {
85 1         12 my $wanted = ( split( ':', $content ) )[1];
86 1         4 $wanted =~ s/^\s+//;
87 1         12 $info{mailer} = $wanted;
88 1         2 next;
89             }
90              
91 250 100       573 if ( $content =~ $content_type_regex ) {
92 4         18 my $wanted = ( split( ':', $content ) )[1];
93 4         15 $wanted =~ s/^\s+//;
94 4         14 my @wanted = split( ';', $wanted );
95              
96 4 100       16 if ( scalar(@wanted) > 1 ) {
97 3         13 my $encoding = lc( $wanted[0] );
98 3         8 my $charset = lc( $wanted[1] );
99 3         11 $charset =~ s/^\s+//;
100 3         13 $charset =~ tr/"//d;
101              
102 3         38 my $not_useful = 'boundary';
103              
104 3 100       39 if (
105             substr( $charset, 0, length($not_useful) ) eq
106             $not_useful )
107             {
108 1         3 $info{content_type} = $encoding;
109 1         5 $info{charset} = undef;
110             }
111             else {
112 2         8 $info{content_type} = $encoding;
113 2         18 $info{charset} = ( split( '=', $charset ) )[1];
114             }
115             }
116             else {
117 1 50       5 chop $wanted if ( substr( $wanted, -1 ) eq ';' );
118 1         3 $info{content_type} = $wanted;
119             }
120              
121 4         18 next;
122             }
123              
124 246 50 66     487 last if ( $info{mailer} and $info{content_type} );
125             }
126             }
127              
128 4         350 return \%info;
129              
130             }
131              
132             =head2 find_message_age
133              
134             Find and return the SPAM message age information.
135              
136             Returns an array reference, with the zero index as an integer with the age, and
137             the index 1 as the age unit (possibly "hour");
138              
139             If nothing is found, returns C<undef>;
140              
141             =cut
142              
143             sub find_message_age {
144 2     2 1 246 my $content_ref = shift;
145 2 50       29 croak "Must receive an scalar reference as parameter"
146             unless ( ref($content_ref) eq 'SCALAR' );
147 2         46 my $tree = HTML::TreeBuilder::XPath->new;
148 2         1029 $tree->parse_content($$content_ref);
149 2         73614 my @nodes = $tree->findnodes('/html/body/child::div[@id="content"]');
150              
151 2         11277 foreach my $node (@nodes) {
152 2         15 foreach my $content ( $node->content_refs_list ) {
153 70 100       151 next unless ( ref($content) eq 'SCALAR' );
154 21         62 $$content =~ s/^\s+//;
155 21         62 $$content =~ s/\s+$//;
156 21 100       45 next if ( $$content eq '' );
157              
158 19 100       74 if ( $$content =~ $regexes{message_age} ) {
159 2         12 my ( $age, $unit ) = ( $1, $2 );
160 2 50       12 chop $unit if ( substr( $unit, -1 ) eq 's' );
161 2         150 return [ $age, $unit ];
162             }
163             }
164             }
165              
166 0         0 return undef;
167             }
168              
169             =head2 find_next_id
170              
171             Expects as parameter a scalar reference of the HTML page.
172              
173             Tries to find the SPAM ID used to identify SPAM reports on spamcop.net webpage.
174              
175             Returns the ID if found, otherwise C<undef>.
176              
177             =cut
178              
179             sub find_next_id {
180 3     3 1 318 my $content_ref = shift;
181 3 100       27 croak "Must receive an scalar reference as parameter"
182             unless ( ref($content_ref) eq 'SCALAR' );
183 2         23 my $tree = HTML::TreeBuilder::XPath->new;
184 2         642 $tree->parse_content($$content_ref);
185 2         42677 my @nodes = $tree->findnodes('//strong/a');
186 2         15450 my $next_id;
187              
188 2         9 foreach my $element (@nodes) {
189 4 100       238 if ( $element->as_trimmed_text eq 'Report Now' ) {
190              
191 2 50       97 if ( $element->attr('href') =~ $regexes{next_id} ) {
192 2         46 $next_id = $1;
193 2         8 my $length = length($next_id);
194 2         4 my $expected = 45;
195 2 50       14 warn
196             "Unexpected length for SPAM ID: got $length, expected $expected"
197             unless ( $length == $expected );
198 2         5 last;
199             }
200             }
201             }
202              
203 2         147 return $next_id;
204             }
205              
206             =head2 find_warnings
207              
208             Expects as parameter a scalar reference of the HTML page.
209              
210             Tries to find all warnings on the HTML, based on CSS classes.
211              
212             Returns an array reference with all warnings found.
213              
214             =cut
215              
216             # TODO: create a single tree instance and check for everything at once
217             sub find_warnings {
218 3     3 1 282 my $content_ref = shift;
219 3 100       30 croak "Must receive an scalar reference as parameter"
220             unless ( ref($content_ref) eq 'SCALAR' );
221 2         19 my $tree = HTML::TreeBuilder::XPath->new;
222 2         575 $tree->parse_content($$content_ref);
223             my @nodes
224 2         69456 = $tree->findnodes('//div[@id="content"]/div[@class="warning"]');
225 2         77989 my @warnings;
226              
227 2         7 foreach my $node (@nodes) {
228 2         4 my @all_text;
229 2         20 push( @all_text, $node->as_trimmed_text );
230              
231             # Spamcop page might add other text lines after the div, until the next div.
232 2         157 foreach my $next ( $node->right() ) {
233 16 100       502 if ( ref $next ) {
234 9 100       21 next if ( $next->tag eq 'br' );
235 2 50       19 last if ( $next->tag eq 'div' );
236             }
237             else {
238 7         15 push( @all_text, $next );
239             }
240             }
241              
242 2         24 push( @warnings, create_warning( \@all_text ) );
243             }
244              
245 2         221 return \@warnings;
246             }
247              
248             =head2 find_errors
249              
250             Expects as parameter a scalar reference of the HTML page.
251              
252             Tries to find all errors on the HTML, based on CSS classes.
253              
254             Returns an array reference with all errors found.
255              
256             =cut
257              
258             sub find_errors {
259 7     7 1 1550 my $content_ref = shift;
260 7 100       57 croak "Must receive an scalar reference as parameter"
261             unless ( ref($content_ref) eq 'SCALAR' );
262 6         65 my $tree = HTML::TreeBuilder::XPath->new;
263 6         1861 $tree->parse_content($$content_ref);
264 6         127974 my @nodes = $tree->findnodes('//div[@id="content"]/div[@class="error"]');
265 6         132232 my @errors;
266              
267 6         29 foreach my $node (@nodes) {
268 5         7 my @all_text;
269 5         27 push( @all_text, $node->as_trimmed_text );
270              
271 5         303 foreach my $next ( $node->right() ) {
272 25 100       412 if ( ref $next ) {
273 23 100       39 next if ( $next->tag eq 'br' );
274 16 100       111 last if ( $next->tag eq 'div' );
275             }
276             else {
277 2         4 push( @all_text, $next );
278             }
279             }
280              
281 5         177 push( @errors, create_error( \@all_text ) );
282             }
283              
284             # bounce errors are inside a form
285 6         72 my $base_xpath = '//form[@action="/mcgi"]';
286 6         33 @nodes = $tree->findnodes( $base_xpath . '//strong' );
287              
288 6 100       108340 if (@nodes) {
289 1 50       5 if ( $nodes[0]->as_trimmed_text() eq 'Bounce error' ) {
290 1         51 my @nodes = $tree->findnodes($base_xpath);
291 1         14898 $nodes[0]->parent(undef);
292 1         11 my @messages;
293              
294 1         4 foreach my $node ( $nodes[0]->content_list() ) {
295 8 100       23 next unless ( ref($node) eq '' );
296 3         7 push( @messages, $node );
297             }
298              
299 1         6 push( @errors, create_error( \@messages ) );
300             }
301             }
302              
303 6         480 return \@errors;
304             }
305              
306             =head2 find_best_contacts
307              
308             Expects as parameter a scalar reference of the HTML page.
309              
310             Tries to find all best contacts on the HTML, based on CSS classes.
311              
312             The best contacts are the e-mail address that Spamcop considers appropriate to
313             use for SPAM reporting.
314              
315             Returns an array reference with all best contacts found.
316              
317             =cut
318              
319             sub find_best_contacts {
320 4     4 1 3496 my $content_ref = shift;
321 4 100       44 croak "Must receive an scalar reference as parameter"
322             unless ( ref($content_ref) eq 'SCALAR' );
323 3         35 my $tree = HTML::TreeBuilder::XPath->new;
324 3         1054 $tree->parse_content($content_ref);
325 3         107069 my @nodes = $tree->findnodes('//div[@id="content"]');
326              
327 3         104226 foreach my $node (@nodes) {
328 3         20 for my $html_element ( $node->content_list ) {
329              
330             # only text
331 130 100       269 next if ref($html_element);
332 42         126 $html_element =~ s/^\s+//;
333 42 100       112 if ( index( $html_element, 'Using best contacts' ) == 0 ) {
334 2         20 my @tokens = split( /\s/, $html_element );
335 2         21 splice( @tokens, 0, 3 );
336 2         213 return \@tokens;
337             }
338             }
339              
340             }
341              
342 1         117 return [];
343             }
344              
345             =head2 find_spam_header
346              
347             Expects as parameter a scalar reference of the HTML page.
348              
349             You can optionally pass a second parameter that defines if each line should be
350             prefixed with a tab character. The default value is false.
351              
352             Tries to find the e-mail header of the SPAM reported.
353              
354             Returns an array reference with all the lines of the e-mail header found.
355              
356             =cut
357              
358             sub find_spam_header {
359 2     2 1 233 my $content_ref = shift;
360 2 50       14 croak "Must receive an scalar reference as parameter"
361             unless ( ref($content_ref) eq 'SCALAR' );
362 2   50     20 my $formatted //= 0;
363 2         19 my $tree = HTML::TreeBuilder::XPath->new;
364 2         631 $tree->parse_content($content_ref);
365              
366 2         69444 my @nodes = $tree->findnodes('/html/body/div[5]/p[1]/strong');
367 2         11990 my $expected = 'Please make sure this email IS spam:';
368 2         6 my $parent = undef;
369              
370 2         6 foreach my $node (@nodes) {
371 2 50       16 if ( $node->as_trimmed_text eq $expected ) {
372 2         160 $parent = $node->parent;
373 2         18 last;
374             }
375             }
376              
377 2 50       8 if ($parent) {
378 2         9 $parent->parent(undef);
379 2         25 @nodes = $parent->findnodes('//font');
380              
381 2 50 33     2806 if ( ( scalar(@nodes) != 1 )
382             or ( ref( $nodes[0] ) ne 'HTML::Element' ) )
383             {
384 0         0 croak 'Unexpected content of SPAM header: ' . Dumper(@nodes);
385             }
386              
387 2         6 my @lines;
388 2         10 my $header = $nodes[0]->content;
389              
390 2         13 for ( my $i = 0 ; $i <= scalar( @{$header} ) ; $i++ ) {
  20         57  
391 18 100       39 if ( ref( $header->[$i] ) eq 'HTML::Element' ) {
392 8         55 $header->[$i]->parent(undef);
393              
394             # just want text here
395 8 100       68 next unless $header->[$i]->content;
396 2         14 my $content = ( $header->[$i]->content )->[0];
397 2 50       14 next unless $content;
398 2 50       9 next if ( ref($content) );
399 2         5 $header->[$i] = $content;
400             }
401 12 100       33 next unless $header->[$i];
402              
403             # removing Unicode spaces in place
404 10         34 $header->[$i] =~ s/^\s++//u;
405              
406 10 50       25 if ($formatted) {
407 0         0 push( @lines, "\t$header->[$i]" );
408              
409             }
410             else {
411 10         24 push( @lines, $header->[$i] );
412              
413             }
414             }
415 2         166 return \@lines;
416             }
417              
418 0         0 return [];
419             }
420              
421             =head2 find_receivers
422              
423             Expects as parameter a scalar reference of the HTML page.
424              
425             Tries to find all the receivers of the SPAM report, even if those were not real
426             e-mail address, only internal identifiers for Spamcop to store statistics.
427              
428             Returns an array reference, where each item is a string.
429              
430             =cut
431              
432             sub find_receivers {
433 2     2 1 262 my $content_ref = shift;
434 2 50       13 croak "Must receive an scalar reference as parameter"
435             unless ( ref($content_ref) eq 'SCALAR' );
436 2         29 my $tree = HTML::TreeBuilder::XPath->new;
437 2         764 $tree->parse_content($content_ref);
438 2         26885 my @nodes = $tree->findnodes('//*[@id="content"]');
439 2         47656 my @receivers;
440 2         17 my $devnull = q{/dev/null'ing};
441 2         7 my $report_sent = 'Spam report id';
442              
443 2         8 foreach my $node (@nodes) {
444 2         14 foreach my $inner ( $node->content_list() ) {
445              
446             # we just want text nodes, everything else is discarded
447 10 100       41 next if ( ref($inner) );
448 4         20 $inner =~ s/^\s+//;
449 4         17 $inner =~ s/\s+$//;
450              
451 4         8 my $result_ref;
452 4         24 my @parts = split( /\s/, $inner );
453              
454             # /dev/null\'ing report for google-abuse-bounces-reports@devnull.spamcop.net
455 4 100       49 if ( substr( $inner, 0, length($devnull) ) eq $devnull ) {
    50          
456 3         15 $result_ref = [ ( split( '@', $parts[-1] ) )[0], undef ];
457             }
458              
459             # Spam report id 7151980235 sent to: dl_security_whois@navercorp.com
460             elsif (
461             substr( $inner, 0, length($report_sent) ) eq $report_sent )
462             {
463 1         3 $result_ref = [ $parts[6], $parts[3] ];
464             }
465             else {
466 0         0 warn "Unexpected receiver format: $inner";
467             }
468              
469 4         14 push( @receivers, $result_ref );
470             }
471             }
472              
473 2         131 return \@receivers;
474             }
475              
476             =head1 SEE ALSO
477              
478             =over
479              
480             =item *
481              
482             L<HTML::TreeBuilder::XPath>
483              
484             =back
485              
486             =head1 AUTHOR
487              
488             Alceu Rodrigues de Freitas Junior, E<lt>arfreitas@cpan.orgE<gt>
489              
490             =head1 COPYRIGHT AND LICENSE
491              
492             This software is copyright (c) 2018 of Alceu Rodrigues de Freitas Junior,
493             E<lt>arfreitas@cpan.orgE<gt>
494              
495             This file is part of App-SpamcupNG distribution.
496              
497             App-SpamcupNG is free software: you can redistribute it and/or modify it under
498             the terms of the GNU General Public License as published by the Free Software
499             Foundation, either version 3 of the License, or (at your option) any later
500             version.
501              
502             App-SpamcupNG is distributed in the hope that it will be useful, but WITHOUT
503             ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
504             FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
505              
506             You should have received a copy of the GNU General Public License along with
507             App-SpamcupNG. If not, see <http://www.gnu.org/licenses/>.
508              
509             =cut
510              
511             1;