File Coverage

blib/lib/App/SpamcupNG/HTMLParse.pm
Criterion Covered Total %
statement 198 203 97.5
branch 71 90 78.8
condition 4 8 50.0
subroutine 15 15 100.0
pod 8 8 100.0
total 296 324 91.3


line stmt bran cond sub pod time code
1             package App::SpamcupNG::HTMLParse;
2 12     12   765821 use strict;
  12         117  
  12         366  
3 12     12   72 use warnings;
  12         32  
  12         373  
4 12     12   6925 use HTML::TreeBuilder::XPath 0.14;
  12         826092  
  12         124  
5 12     12   546 use Exporter 'import';
  12         29  
  12         432  
6 12     12   71 use Carp 'croak';
  12         53  
  12         620  
7              
8 12     12   7223 use App::SpamcupNG::Error::Factory qw(create_error);
  12         31  
  12         826  
9 12     12   5210 use App::SpamcupNG::Warning::Factory qw(create_warning);
  12         53  
  12         30271  
10              
11             our @EXPORT_OK = (
12             'find_next_id', 'find_errors',
13             'find_warnings', 'find_spam_header',
14             'find_best_contacts', 'find_receivers',
15             'find_message_age', 'find_header_info'
16             );
17              
18             my %regexes = (
19             next_id => qr#^/sc\?id=(\w+)#,
20             message_age => qr/^Message\sis\s(\d+)\s(\w+)\sold/
21             );
22              
23             our $VERSION = '0.017'; # VERSION
24              
25             =head1 NAME
26              
27             App::SpamcupNG::HTMLParse - functions to extract information from Spamcop.net
28             web pages
29              
30             =head1 SYNOPSIS
31              
32             use App::SpamcupNG::HTMLParse qw(find_next_id find_errors find_warnings find_spam_header find_message_age find_header_info);
33              
34             =head1 DESCRIPTION
35              
36             This package export functions that uses XPath to extract specific information
37             from the spamcop.net HTML pages.
38              
39             =head1 EXPORTS
40              
41             Following are all exported functions by this package.
42              
43             =head2 find_header_info
44              
45             Finds information from the e-mail header of the received SPAM and returns it.
46              
47             Returns a hash reference with the following keys:
48              
49             =over
50              
51             =item mailer: the X-Mailer header, if available
52              
53             =item content_type: the Content-Type, if available
54              
55             =back
56              
57             There is an attempt to normalize the C<Content-Type> header, by removing extra
58             spaces and using just the first two entries, also making everything as lower
59             case.
60              
61             =cut
62              
63             sub find_header_info {
64 4     4 1 8894 my $content_ref = shift;
65 4 50       24 croak "Must receive an scalar reference as parameter"
66             unless ( ref($content_ref) eq 'SCALAR' );
67 4         42 my $tree = HTML::TreeBuilder::XPath->new;
68 4         1288 $tree->parse_content($$content_ref);
69 4         140099 my @nodes = $tree->findnodes('/html/body/div[@id="content"]/pre');
70 4         25149 my %info = (
71             mailer => undef,
72             content_type => undef
73             );
74 4         24 my $mailer_regex = qr/^X-Mailer:/;
75 4         15 my $content_type_regex = qr/^Content-Type:/;
76              
77 4         13 foreach my $node (@nodes) {
78              
79 4         26 foreach my $content ( split( "\n", $node->as_text() ) ) {
80 251         850 $content =~ s/^\s+//;
81 251         793 $content =~ s/\s+$//;
82 251 50       488 next if ( $content eq '' );
83              
84 251 100       628 if ( $content =~ $mailer_regex ) {
85 1         13 my $wanted = ( split( ':', $content ) )[1];
86 1         4 $wanted =~ s/^\s+//;
87 1         4 $info{mailer} = $wanted;
88 1         2 next;
89             }
90              
91 250 100       582 if ( $content =~ $content_type_regex ) {
92 4         22 my $wanted = ( split( ':', $content ) )[1];
93 4         17 $wanted =~ s/^\s+//;
94 4         15 my @wanted = split( ';', $wanted );
95              
96 4 100       19 if ( scalar(@wanted) > 1 ) {
97 3         10 my $encoding = lc( $wanted[0] );
98 3         8 my $charset = lc( $wanted[1] );
99 3         13 $charset =~ s/^\s+//;
100 3         10 $charset =~ tr/"//d;
101              
102 3         39 my $not_useful = 'boundary';
103              
104 3 100       33 if (
105             substr( $charset, 0, length($not_useful) ) eq
106             $not_useful )
107             {
108 1         5 $info{content_type} = $encoding;
109 1         4 $info{charset} = undef;
110             }
111             else {
112 2         10 $info{content_type} = $encoding;
113 2         11 $info{charset} = ( split( '=', $charset ) )[1];
114             }
115             }
116             else {
117 1 50       8 chop $wanted if ( substr( $wanted, -1 ) eq ';' );
118 1         5 $info{content_type} = $wanted;
119             }
120              
121 4         17 next;
122             }
123              
124 246 50 66     491 last if ( $info{mailer} and $info{content_type} );
125             }
126             }
127              
128 4         356 return \%info;
129              
130             }
131              
132             =head2 find_message_age
133              
134             Find and return the SPAM message age information.
135              
136             Returns an array reference, with the zero index as an integer with the age, and
137             the index 1 as the age unit (possibly "hour");
138              
139             If nothing is found, returns C<undef>;
140              
141             =cut
142              
143             sub find_message_age {
144 2     2 1 262 my $content_ref = shift;
145 2 50       19 croak "Must receive an scalar reference as parameter"
146             unless ( ref($content_ref) eq 'SCALAR' );
147 2         45 my $tree = HTML::TreeBuilder::XPath->new;
148 2         1032 $tree->parse_content($$content_ref);
149 2         77230 my @nodes = $tree->findnodes('/html/body/child::div[@id="content"]');
150              
151 2         11735 foreach my $node (@nodes) {
152 2         16 foreach my $content ( $node->content_refs_list ) {
153 70 100       149 next unless ( ref($content) eq 'SCALAR' );
154 21         67 $$content =~ s/^\s+//;
155 21         61 $$content =~ s/\s+$//;
156 21 100       53 next if ( $$content eq '' );
157              
158 19 100       77 if ( $$content =~ $regexes{message_age} ) {
159 2         14 my ( $age, $unit ) = ( $1, $2 );
160 2 50       17 chop $unit if ( substr( $unit, -1 ) eq 's' );
161 2         205 return [ $age, $unit ];
162             }
163             }
164             }
165              
166 0         0 return undef;
167             }
168              
169             =head2 find_next_id
170              
171             Expects as parameter a scalar reference of the HTML page.
172              
173             Tries to find the SPAM ID used to identify SPAM reports on spamcop.net webpage.
174              
175             Returns the ID if found, otherwise C<undef>.
176              
177             =cut
178              
179             sub find_next_id {
180 3     3 1 288 my $content_ref = shift;
181 3 100       29 croak "Must receive an scalar reference as parameter"
182             unless ( ref($content_ref) eq 'SCALAR' );
183 2         18 my $tree = HTML::TreeBuilder::XPath->new;
184 2         606 $tree->parse_content($$content_ref);
185 2         42672 my @nodes = $tree->findnodes('//strong/a');
186 2         15300 my $next_id;
187              
188 2         7 foreach my $element (@nodes) {
189 4 100       225 if ( $element->as_trimmed_text eq 'Report Now' ) {
190              
191 2 50       89 if ( $element->attr('href') =~ $regexes{next_id} ) {
192 2         51 $next_id = $1;
193 2         7 my $length = length($next_id);
194 2         4 my $expected = 45;
195 2 50       19 warn
196             "Unexpected length for SPAM ID: got $length, expected $expected"
197             unless ( $length == $expected );
198 2         7 last;
199             }
200             }
201             }
202              
203 2         137 return $next_id;
204             }
205              
206             =head2 find_warnings
207              
208             Expects as parameter a scalar reference of the HTML page.
209              
210             Tries to find all warnings on the HTML, based on CSS classes.
211              
212             Returns an array reference with all warnings found.
213              
214             =cut
215              
216             # TODO: create a single tree instance and check for everything at once
217             sub find_warnings {
218 3     3 1 318 my $content_ref = shift;
219 3 100       33 croak "Must receive an scalar reference as parameter"
220             unless ( ref($content_ref) eq 'SCALAR' );
221 2         25 my $tree = HTML::TreeBuilder::XPath->new;
222 2         611 $tree->parse_content($$content_ref);
223             my @nodes
224 2         71231 = $tree->findnodes('//div[@id="content"]/div[@class="warning"]');
225 2         80236 my @warnings;
226              
227 2         9 foreach my $node (@nodes) {
228 2         4 my @all_text;
229 2         13 push( @all_text, $node->as_trimmed_text );
230              
231             # Spamcop page might add other text lines after the div, until the next div.
232 2         164 foreach my $next ( $node->right() ) {
233 16 100       451 if ( ref $next ) {
234 9 100       20 next if ( $next->tag eq 'br' );
235 2 50       15 last if ( $next->tag eq 'div' );
236             }
237             else {
238 7         14 push( @all_text, $next );
239             }
240             }
241              
242 2         24 push( @warnings, create_warning( \@all_text ) );
243             }
244              
245 2         236 return \@warnings;
246             }
247              
248             =head2 find_errors
249              
250             Expects as parameter a scalar reference of the HTML page.
251              
252             Tries to find all errors on the HTML, based on CSS classes.
253              
254             Returns an array reference with all errors found.
255              
256             =cut
257              
258             sub find_errors {
259 7     7 1 1500 my $content_ref = shift;
260 7 100       58 croak "Must receive an scalar reference as parameter"
261             unless ( ref($content_ref) eq 'SCALAR' );
262 6         60 my $tree = HTML::TreeBuilder::XPath->new;
263 6         1876 $tree->parse_content($$content_ref);
264 6         131359 my @nodes = $tree->findnodes('//div[@id="content"]/div[@class="error"]');
265 6         133290 my @errors;
266              
267 6         22 foreach my $node (@nodes) {
268 5         9 my @all_text;
269 5         23 push( @all_text, $node->as_trimmed_text );
270              
271 5         287 foreach my $next ( $node->right() ) {
272 25 100       447 if ( ref $next ) {
273 23 100       41 next if ( $next->tag eq 'br' );
274 16 100       105 last if ( $next->tag eq 'div' );
275             }
276             else {
277 2         5 push( @all_text, $next );
278             }
279             }
280              
281 5         178 push( @errors, create_error( \@all_text ) );
282             }
283              
284             # bounce errors are inside a form
285 6         59 my $base_xpath = '//form[@action="/mcgi"]';
286 6         33 @nodes = $tree->findnodes( $base_xpath . '//strong' );
287              
288 6 100       109338 if (@nodes) {
289 1 50       9 if ( $nodes[0]->as_trimmed_text() eq 'Bounce error' ) {
290 1         55 my @nodes = $tree->findnodes($base_xpath);
291 1         15221 $nodes[0]->parent(undef);
292 1         13 my @messages;
293              
294 1         5 foreach my $node ( $nodes[0]->content_list() ) {
295 8 100       25 next unless ( ref($node) eq '' );
296 3         33 push( @messages, $node );
297             }
298              
299 1         10 push( @errors, create_error( \@messages ) );
300             }
301             }
302              
303 6         554 return \@errors;
304             }
305              
306             =head2 find_best_contacts
307              
308             Expects as parameter a scalar reference of the HTML page.
309              
310             Tries to find all best contacts on the HTML, based on CSS classes.
311              
312             The best contacts are the e-mail address that Spamcop considers appropriate to
313             use for SPAM reporting.
314              
315             Returns an array reference with all best contacts found.
316              
317             =cut
318              
319             sub find_best_contacts {
320 4     4 1 3290 my $content_ref = shift;
321 4 100       48 croak "Must receive an scalar reference as parameter"
322             unless ( ref($content_ref) eq 'SCALAR' );
323 3         33 my $tree = HTML::TreeBuilder::XPath->new;
324 3         927 $tree->parse_content($content_ref);
325 3         108606 my @nodes = $tree->findnodes('//div[@id="content"]');
326              
327 3         106511 foreach my $node (@nodes) {
328 3         14 for my $html_element ( $node->content_list ) {
329              
330             # only text
331 130 100       294 next if ref($html_element);
332 42         108 $html_element =~ s/^\s+//;
333 42 100       109 if ( index( $html_element, 'Using best contacts' ) == 0 ) {
334 2         37 my @tokens = split( /\s/, $html_element );
335 2         11 splice( @tokens, 0, 3 );
336 2         231 return \@tokens;
337             }
338             }
339              
340             }
341              
342 1         111 return [];
343             }
344              
345             =head2 find_spam_header
346              
347             Expects as parameter a scalar reference of the HTML page.
348              
349             You can optionally pass a second parameter that defines if each line should be
350             prefixed with a tab character. The default value is false.
351              
352             Tries to find the e-mail header of the SPAM reported.
353              
354             Returns an array reference with all the lines of the e-mail header found.
355              
356             =cut
357              
358             sub find_spam_header {
359 2     2 1 320 my $content_ref = shift;
360 2 50       14 croak "Must receive an scalar reference as parameter"
361             unless ( ref($content_ref) eq 'SCALAR' );
362 2   50     20 my $formatted //= 0;
363 2         21 my $tree = HTML::TreeBuilder::XPath->new;
364 2         677 $tree->parse_content($content_ref);
365              
366 2         71228 my @nodes = $tree->findnodes('/html/body/div[5]/p[1]/strong');
367 2         12872 my $expected = 'Please make sure this email IS spam:';
368 2         6 my $parent = undef;
369              
370 2         8 foreach my $node (@nodes) {
371 2 50       17 if ( $node->as_trimmed_text eq $expected ) {
372 2         183 $parent = $node->parent;
373 2         20 last;
374             }
375             }
376              
377 2 50       12 if ($parent) {
378 2         10 $parent->parent(undef);
379 2         27 @nodes = $parent->findnodes('//font');
380              
381 2 50 33     2999 if ( ( scalar(@nodes) != 1 )
382             or ( ref( $nodes[0] ) ne 'HTML::Element' ) )
383             {
384 0         0 croak 'Unexpected content of SPAM header: ' . Dumper(@nodes);
385             }
386              
387 2         6 my @lines;
388 2         18 my $header = $nodes[0]->content;
389              
390 2         17 for ( my $i = 0 ; $i <= scalar( @{$header} ) ; $i++ ) {
  20         61  
391 18 100       46 if ( ref( $header->[$i] ) eq 'HTML::Element' ) {
392 8         27 $header->[$i]->parent(undef);
393              
394             # just want text here
395 8 100       85 next unless $header->[$i]->content;
396 2         17 my $content = ( $header->[$i]->content )->[0];
397 2 50       11 next unless $content;
398 2 50       12 next if ( ref($content) );
399 2         6 $header->[$i] = $content;
400             }
401 12 100       45 next unless $header->[$i];
402              
403             # removing Unicode spaces in place
404 10         39 $header->[$i] =~ s/^\s++//u;
405              
406 10 50       50 if ($formatted) {
407 0         0 push( @lines, "\t$header->[$i]" );
408              
409             }
410             else {
411 10         61 push( @lines, $header->[$i] );
412              
413             }
414             }
415 2         223 return \@lines;
416             }
417              
418 0         0 return [];
419             }
420              
421             =head2 find_receivers
422              
423             Expects as parameter a scalar reference of the HTML page.
424              
425             Tries to find all the receivers of the SPAM report, even if those were not real
426             e-mail address, only internal identifiers for Spamcop to store statistics.
427              
428             Returns an array reference, where each item is a string.
429              
430             =cut
431              
432             sub find_receivers {
433 2     2 1 298 my $content_ref = shift;
434 2 50       19 croak "Must receive an scalar reference as parameter"
435             unless ( ref($content_ref) eq 'SCALAR' );
436 2         27 my $tree = HTML::TreeBuilder::XPath->new;
437 2         725 $tree->parse_content($content_ref);
438 2         27976 my @nodes = $tree->findnodes('//*[@id="content"]');
439 2         48636 my @receivers;
440 2         7 my $devnull = q{/dev/null'ing};
441 2         7 my $report_sent = 'Spam report id';
442              
443 2         9 foreach my $node (@nodes) {
444 2         15 foreach my $inner ( $node->content_list() ) {
445              
446             # we just want text nodes, everything else is discarded
447 10 100       44 next if ( ref($inner) );
448 4         21 $inner =~ s/^\s+//;
449 4         19 $inner =~ s/\s+$//;
450              
451 4         7 my $result_ref;
452 4         24 my @parts = split( /\s/, $inner );
453              
454             # /dev/null\'ing report for google-abuse-bounces-reports@devnull.spamcop.net
455 4 100       25 if ( substr( $inner, 0, length($devnull) ) eq $devnull ) {
    50          
456 3         20 $result_ref = [ ( split( '@', $parts[-1] ) )[0], undef ];
457             }
458              
459             # Spam report id 7151980235 sent to: dl_security_whois@navercorp.com
460             elsif (
461             substr( $inner, 0, length($report_sent) ) eq $report_sent )
462             {
463 1         5 $result_ref = [ $parts[6], $parts[3] ];
464             }
465             else {
466 0         0 warn "Unexpected receiver format: $inner";
467             }
468              
469 4         12 push( @receivers, $result_ref );
470             }
471             }
472              
473 2         158 return \@receivers;
474             }
475              
476             =head1 SEE ALSO
477              
478             =over
479              
480             =item *
481              
482             L<HTML::TreeBuilder::XPath>
483              
484             =back
485              
486             =head1 AUTHOR
487              
488             Alceu Rodrigues de Freitas Junior, E<lt>arfreitas@cpan.orgE<gt>
489              
490             =head1 COPYRIGHT AND LICENSE
491              
492             This software is copyright (c) 2018 of Alceu Rodrigues de Freitas Junior,
493             E<lt>arfreitas@cpan.orgE<gt>
494              
495             This file is part of App-SpamcupNG distribution.
496              
497             App-SpamcupNG is free software: you can redistribute it and/or modify it under
498             the terms of the GNU General Public License as published by the Free Software
499             Foundation, either version 3 of the License, or (at your option) any later
500             version.
501              
502             App-SpamcupNG is distributed in the hope that it will be useful, but WITHOUT
503             ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
504             FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
505              
506             You should have received a copy of the GNU General Public License along with
507             App-SpamcupNG. If not, see <http://www.gnu.org/licenses/>.
508              
509             =cut
510              
511             1;