| line | stmt | bran | cond | sub | pod | time | code | 
| 1 |  |  |  |  |  |  | package App::SpamcupNG::HTMLParse; | 
| 2 | 12 |  |  | 12 |  | 765821 | use strict; | 
|  | 12 |  |  |  |  | 117 |  | 
|  | 12 |  |  |  |  | 366 |  | 
| 3 | 12 |  |  | 12 |  | 72 | use warnings; | 
|  | 12 |  |  |  |  | 32 |  | 
|  | 12 |  |  |  |  | 373 |  | 
| 4 | 12 |  |  | 12 |  | 6925 | use HTML::TreeBuilder::XPath 0.14; | 
|  | 12 |  |  |  |  | 826092 |  | 
|  | 12 |  |  |  |  | 124 |  | 
| 5 | 12 |  |  | 12 |  | 546 | use Exporter 'import'; | 
|  | 12 |  |  |  |  | 29 |  | 
|  | 12 |  |  |  |  | 432 |  | 
| 6 | 12 |  |  | 12 |  | 71 | use Carp 'croak'; | 
|  | 12 |  |  |  |  | 53 |  | 
|  | 12 |  |  |  |  | 620 |  | 
| 7 |  |  |  |  |  |  |  | 
| 8 | 12 |  |  | 12 |  | 7223 | use App::SpamcupNG::Error::Factory qw(create_error); | 
|  | 12 |  |  |  |  | 31 |  | 
|  | 12 |  |  |  |  | 826 |  | 
| 9 | 12 |  |  | 12 |  | 5210 | use App::SpamcupNG::Warning::Factory qw(create_warning); | 
|  | 12 |  |  |  |  | 53 |  | 
|  | 12 |  |  |  |  | 30271 |  | 
| 10 |  |  |  |  |  |  |  | 
| 11 |  |  |  |  |  |  | our @EXPORT_OK = ( | 
| 12 |  |  |  |  |  |  | 'find_next_id',       'find_errors', | 
| 13 |  |  |  |  |  |  | 'find_warnings',      'find_spam_header', | 
| 14 |  |  |  |  |  |  | 'find_best_contacts', 'find_receivers', | 
| 15 |  |  |  |  |  |  | 'find_message_age',   'find_header_info' | 
| 16 |  |  |  |  |  |  | ); | 
| 17 |  |  |  |  |  |  |  | 
| 18 |  |  |  |  |  |  | my %regexes = ( | 
| 19 |  |  |  |  |  |  | next_id     => qr#^/sc\?id=(\w+)#, | 
| 20 |  |  |  |  |  |  | message_age => qr/^Message\sis\s(\d+)\s(\w+)\sold/ | 
| 21 |  |  |  |  |  |  | ); | 
| 22 |  |  |  |  |  |  |  | 
| 23 |  |  |  |  |  |  | our $VERSION = '0.017'; # VERSION | 
| 24 |  |  |  |  |  |  |  | 
| 25 |  |  |  |  |  |  | =head1 NAME | 
| 26 |  |  |  |  |  |  |  | 
| 27 |  |  |  |  |  |  | App::SpamcupNG::HTMLParse - functions to extract information from Spamcop.net | 
| 28 |  |  |  |  |  |  | web pages | 
| 29 |  |  |  |  |  |  |  | 
| 30 |  |  |  |  |  |  | =head1 SYNOPSIS | 
| 31 |  |  |  |  |  |  |  | 
| 32 |  |  |  |  |  |  | use App::SpamcupNG::HTMLParse qw(find_next_id find_errors find_warnings find_spam_header find_message_age find_header_info); | 
| 33 |  |  |  |  |  |  |  | 
| 34 |  |  |  |  |  |  | =head1 DESCRIPTION | 
| 35 |  |  |  |  |  |  |  | 
| 36 |  |  |  |  |  |  | This package export functions that uses XPath to extract specific information | 
| 37 |  |  |  |  |  |  | from the spamcop.net HTML pages. | 
| 38 |  |  |  |  |  |  |  | 
| 39 |  |  |  |  |  |  | =head1 EXPORTS | 
| 40 |  |  |  |  |  |  |  | 
| 41 |  |  |  |  |  |  | Following are all exported functions by this package. | 
| 42 |  |  |  |  |  |  |  | 
| 43 |  |  |  |  |  |  | =head2 find_header_info | 
| 44 |  |  |  |  |  |  |  | 
| 45 |  |  |  |  |  |  | Finds information from the e-mail header of the received SPAM and returns it. | 
| 46 |  |  |  |  |  |  |  | 
| 47 |  |  |  |  |  |  | Returns a hash reference with the following keys: | 
| 48 |  |  |  |  |  |  |  | 
| 49 |  |  |  |  |  |  | =over | 
| 50 |  |  |  |  |  |  |  | 
| 51 |  |  |  |  |  |  | =item mailer: the X-Mailer header, if available | 
| 52 |  |  |  |  |  |  |  | 
| 53 |  |  |  |  |  |  | =item content_type: the Content-Type, if available | 
| 54 |  |  |  |  |  |  |  | 
| 55 |  |  |  |  |  |  | =back | 
| 56 |  |  |  |  |  |  |  | 
| 57 |  |  |  |  |  |  | There is an attempt to normalize the C<Content-Type> header, by removing extra | 
| 58 |  |  |  |  |  |  | spaces and using just the first two entries, also making everything as lower | 
| 59 |  |  |  |  |  |  | case. | 
| 60 |  |  |  |  |  |  |  | 
| 61 |  |  |  |  |  |  | =cut | 
| 62 |  |  |  |  |  |  |  | 
| 63 |  |  |  |  |  |  | sub find_header_info { | 
| 64 | 4 |  |  | 4 | 1 | 8894 | my $content_ref = shift; | 
| 65 | 4 | 50 |  |  |  | 24 | croak "Must receive an scalar reference as parameter" | 
| 66 |  |  |  |  |  |  | unless ( ref($content_ref) eq 'SCALAR' ); | 
| 67 | 4 |  |  |  |  | 42 | my $tree = HTML::TreeBuilder::XPath->new; | 
| 68 | 4 |  |  |  |  | 1288 | $tree->parse_content($$content_ref); | 
| 69 | 4 |  |  |  |  | 140099 | my @nodes = $tree->findnodes('/html/body/div[@id="content"]/pre'); | 
| 70 | 4 |  |  |  |  | 25149 | my %info  = ( | 
| 71 |  |  |  |  |  |  | mailer       => undef, | 
| 72 |  |  |  |  |  |  | content_type => undef | 
| 73 |  |  |  |  |  |  | ); | 
| 74 | 4 |  |  |  |  | 24 | my $mailer_regex       = qr/^X-Mailer:/; | 
| 75 | 4 |  |  |  |  | 15 | my $content_type_regex = qr/^Content-Type:/; | 
| 76 |  |  |  |  |  |  |  | 
| 77 | 4 |  |  |  |  | 13 | foreach my $node (@nodes) { | 
| 78 |  |  |  |  |  |  |  | 
| 79 | 4 |  |  |  |  | 26 | foreach my $content ( split( "\n", $node->as_text() ) ) { | 
| 80 | 251 |  |  |  |  | 850 | $content =~ s/^\s+//; | 
| 81 | 251 |  |  |  |  | 793 | $content =~ s/\s+$//; | 
| 82 | 251 | 50 |  |  |  | 488 | next if ( $content eq '' ); | 
| 83 |  |  |  |  |  |  |  | 
| 84 | 251 | 100 |  |  |  | 628 | if ( $content =~ $mailer_regex ) { | 
| 85 | 1 |  |  |  |  | 13 | my $wanted = ( split( ':', $content ) )[1]; | 
| 86 | 1 |  |  |  |  | 4 | $wanted =~ s/^\s+//; | 
| 87 | 1 |  |  |  |  | 4 | $info{mailer} = $wanted; | 
| 88 | 1 |  |  |  |  | 2 | next; | 
| 89 |  |  |  |  |  |  | } | 
| 90 |  |  |  |  |  |  |  | 
| 91 | 250 | 100 |  |  |  | 582 | if ( $content =~ $content_type_regex ) { | 
| 92 | 4 |  |  |  |  | 22 | my $wanted = ( split( ':', $content ) )[1]; | 
| 93 | 4 |  |  |  |  | 17 | $wanted =~ s/^\s+//; | 
| 94 | 4 |  |  |  |  | 15 | my @wanted = split( ';', $wanted ); | 
| 95 |  |  |  |  |  |  |  | 
| 96 | 4 | 100 |  |  |  | 19 | if ( scalar(@wanted) > 1 ) { | 
| 97 | 3 |  |  |  |  | 10 | my $encoding = lc( $wanted[0] ); | 
| 98 | 3 |  |  |  |  | 8 | my $charset  = lc( $wanted[1] ); | 
| 99 | 3 |  |  |  |  | 13 | $charset =~ s/^\s+//; | 
| 100 | 3 |  |  |  |  | 10 | $charset =~ tr/"//d; | 
| 101 |  |  |  |  |  |  |  | 
| 102 | 3 |  |  |  |  | 39 | my $not_useful = 'boundary'; | 
| 103 |  |  |  |  |  |  |  | 
| 104 | 3 | 100 |  |  |  | 33 | if ( | 
| 105 |  |  |  |  |  |  | substr( $charset, 0, length($not_useful) ) eq | 
| 106 |  |  |  |  |  |  | $not_useful ) | 
| 107 |  |  |  |  |  |  | { | 
| 108 | 1 |  |  |  |  | 5 | $info{content_type} = $encoding; | 
| 109 | 1 |  |  |  |  | 4 | $info{charset}      = undef; | 
| 110 |  |  |  |  |  |  | } | 
| 111 |  |  |  |  |  |  | else { | 
| 112 | 2 |  |  |  |  | 10 | $info{content_type} = $encoding; | 
| 113 | 2 |  |  |  |  | 11 | $info{charset}      = ( split( '=', $charset ) )[1]; | 
| 114 |  |  |  |  |  |  | } | 
| 115 |  |  |  |  |  |  | } | 
| 116 |  |  |  |  |  |  | else { | 
| 117 | 1 | 50 |  |  |  | 8 | chop $wanted if ( substr( $wanted, -1 ) eq ';' ); | 
| 118 | 1 |  |  |  |  | 5 | $info{content_type} = $wanted; | 
| 119 |  |  |  |  |  |  | } | 
| 120 |  |  |  |  |  |  |  | 
| 121 | 4 |  |  |  |  | 17 | next; | 
| 122 |  |  |  |  |  |  | } | 
| 123 |  |  |  |  |  |  |  | 
| 124 | 246 | 50 | 66 |  |  | 491 | last if ( $info{mailer} and $info{content_type} ); | 
| 125 |  |  |  |  |  |  | } | 
| 126 |  |  |  |  |  |  | } | 
| 127 |  |  |  |  |  |  |  | 
| 128 | 4 |  |  |  |  | 356 | return \%info; | 
| 129 |  |  |  |  |  |  |  | 
| 130 |  |  |  |  |  |  | } | 
| 131 |  |  |  |  |  |  |  | 
| 132 |  |  |  |  |  |  | =head2 find_message_age | 
| 133 |  |  |  |  |  |  |  | 
| 134 |  |  |  |  |  |  | Find and return the SPAM message age information. | 
| 135 |  |  |  |  |  |  |  | 
| 136 |  |  |  |  |  |  | Returns an array reference, with the zero index as an integer with the age, and | 
| 137 |  |  |  |  |  |  | the index 1 as the age unit (possibly "hour"); | 
| 138 |  |  |  |  |  |  |  | 
| 139 |  |  |  |  |  |  | If nothing is found, returns C<undef>; | 
| 140 |  |  |  |  |  |  |  | 
| 141 |  |  |  |  |  |  | =cut | 
| 142 |  |  |  |  |  |  |  | 
| 143 |  |  |  |  |  |  | sub find_message_age { | 
| 144 | 2 |  |  | 2 | 1 | 262 | my $content_ref = shift; | 
| 145 | 2 | 50 |  |  |  | 19 | croak "Must receive an scalar reference as parameter" | 
| 146 |  |  |  |  |  |  | unless ( ref($content_ref) eq 'SCALAR' ); | 
| 147 | 2 |  |  |  |  | 45 | my $tree = HTML::TreeBuilder::XPath->new; | 
| 148 | 2 |  |  |  |  | 1032 | $tree->parse_content($$content_ref); | 
| 149 | 2 |  |  |  |  | 77230 | my @nodes = $tree->findnodes('/html/body/child::div[@id="content"]'); | 
| 150 |  |  |  |  |  |  |  | 
| 151 | 2 |  |  |  |  | 11735 | foreach my $node (@nodes) { | 
| 152 | 2 |  |  |  |  | 16 | foreach my $content ( $node->content_refs_list ) { | 
| 153 | 70 | 100 |  |  |  | 149 | next unless ( ref($content) eq 'SCALAR' ); | 
| 154 | 21 |  |  |  |  | 67 | $$content =~ s/^\s+//; | 
| 155 | 21 |  |  |  |  | 61 | $$content =~ s/\s+$//; | 
| 156 | 21 | 100 |  |  |  | 53 | next if ( $$content eq '' ); | 
| 157 |  |  |  |  |  |  |  | 
| 158 | 19 | 100 |  |  |  | 77 | if ( $$content =~ $regexes{message_age} ) { | 
| 159 | 2 |  |  |  |  | 14 | my ( $age, $unit ) = ( $1, $2 ); | 
| 160 | 2 | 50 |  |  |  | 17 | chop $unit if ( substr( $unit, -1 ) eq 's' ); | 
| 161 | 2 |  |  |  |  | 205 | return [ $age, $unit ]; | 
| 162 |  |  |  |  |  |  | } | 
| 163 |  |  |  |  |  |  | } | 
| 164 |  |  |  |  |  |  | } | 
| 165 |  |  |  |  |  |  |  | 
| 166 | 0 |  |  |  |  | 0 | return undef; | 
| 167 |  |  |  |  |  |  | } | 
| 168 |  |  |  |  |  |  |  | 
| 169 |  |  |  |  |  |  | =head2 find_next_id | 
| 170 |  |  |  |  |  |  |  | 
| 171 |  |  |  |  |  |  | Expects as parameter a scalar reference of the HTML page. | 
| 172 |  |  |  |  |  |  |  | 
| 173 |  |  |  |  |  |  | Tries to find the SPAM ID used to identify SPAM reports on spamcop.net webpage. | 
| 174 |  |  |  |  |  |  |  | 
| 175 |  |  |  |  |  |  | Returns the ID if found, otherwise C<undef>. | 
| 176 |  |  |  |  |  |  |  | 
| 177 |  |  |  |  |  |  | =cut | 
| 178 |  |  |  |  |  |  |  | 
| 179 |  |  |  |  |  |  | sub find_next_id { | 
| 180 | 3 |  |  | 3 | 1 | 288 | my $content_ref = shift; | 
| 181 | 3 | 100 |  |  |  | 29 | croak "Must receive an scalar reference as parameter" | 
| 182 |  |  |  |  |  |  | unless ( ref($content_ref) eq 'SCALAR' ); | 
| 183 | 2 |  |  |  |  | 18 | my $tree = HTML::TreeBuilder::XPath->new; | 
| 184 | 2 |  |  |  |  | 606 | $tree->parse_content($$content_ref); | 
| 185 | 2 |  |  |  |  | 42672 | my @nodes = $tree->findnodes('//strong/a'); | 
| 186 | 2 |  |  |  |  | 15300 | my $next_id; | 
| 187 |  |  |  |  |  |  |  | 
| 188 | 2 |  |  |  |  | 7 | foreach my $element (@nodes) { | 
| 189 | 4 | 100 |  |  |  | 225 | if ( $element->as_trimmed_text eq 'Report Now' ) { | 
| 190 |  |  |  |  |  |  |  | 
| 191 | 2 | 50 |  |  |  | 89 | if ( $element->attr('href') =~ $regexes{next_id} ) { | 
| 192 | 2 |  |  |  |  | 51 | $next_id = $1; | 
| 193 | 2 |  |  |  |  | 7 | my $length   = length($next_id); | 
| 194 | 2 |  |  |  |  | 4 | my $expected = 45; | 
| 195 | 2 | 50 |  |  |  | 19 | warn | 
| 196 |  |  |  |  |  |  | "Unexpected length for SPAM ID: got $length, expected $expected" | 
| 197 |  |  |  |  |  |  | unless ( $length == $expected ); | 
| 198 | 2 |  |  |  |  | 7 | last; | 
| 199 |  |  |  |  |  |  | } | 
| 200 |  |  |  |  |  |  | } | 
| 201 |  |  |  |  |  |  | } | 
| 202 |  |  |  |  |  |  |  | 
| 203 | 2 |  |  |  |  | 137 | return $next_id; | 
| 204 |  |  |  |  |  |  | } | 
| 205 |  |  |  |  |  |  |  | 
| 206 |  |  |  |  |  |  | =head2 find_warnings | 
| 207 |  |  |  |  |  |  |  | 
| 208 |  |  |  |  |  |  | Expects as parameter a scalar reference of the HTML page. | 
| 209 |  |  |  |  |  |  |  | 
| 210 |  |  |  |  |  |  | Tries to find all warnings on the HTML, based on CSS classes. | 
| 211 |  |  |  |  |  |  |  | 
| 212 |  |  |  |  |  |  | Returns an array reference with all warnings found. | 
| 213 |  |  |  |  |  |  |  | 
| 214 |  |  |  |  |  |  | =cut | 
| 215 |  |  |  |  |  |  |  | 
| 216 |  |  |  |  |  |  | # TODO: create a single tree instance and check for everything at once | 
| 217 |  |  |  |  |  |  | sub find_warnings { | 
| 218 | 3 |  |  | 3 | 1 | 318 | my $content_ref = shift; | 
| 219 | 3 | 100 |  |  |  | 33 | croak "Must receive an scalar reference as parameter" | 
| 220 |  |  |  |  |  |  | unless ( ref($content_ref) eq 'SCALAR' ); | 
| 221 | 2 |  |  |  |  | 25 | my $tree = HTML::TreeBuilder::XPath->new; | 
| 222 | 2 |  |  |  |  | 611 | $tree->parse_content($$content_ref); | 
| 223 |  |  |  |  |  |  | my @nodes | 
| 224 | 2 |  |  |  |  | 71231 | = $tree->findnodes('//div[@id="content"]/div[@class="warning"]'); | 
| 225 | 2 |  |  |  |  | 80236 | my @warnings; | 
| 226 |  |  |  |  |  |  |  | 
| 227 | 2 |  |  |  |  | 9 | foreach my $node (@nodes) { | 
| 228 | 2 |  |  |  |  | 4 | my @all_text; | 
| 229 | 2 |  |  |  |  | 13 | push( @all_text, $node->as_trimmed_text ); | 
| 230 |  |  |  |  |  |  |  | 
| 231 |  |  |  |  |  |  | # Spamcop page might add other text lines after the div, until the next div. | 
| 232 | 2 |  |  |  |  | 164 | foreach my $next ( $node->right() ) { | 
| 233 | 16 | 100 |  |  |  | 451 | if ( ref $next ) { | 
| 234 | 9 | 100 |  |  |  | 20 | next if ( $next->tag eq 'br' ); | 
| 235 | 2 | 50 |  |  |  | 15 | last if ( $next->tag eq 'div' ); | 
| 236 |  |  |  |  |  |  | } | 
| 237 |  |  |  |  |  |  | else { | 
| 238 | 7 |  |  |  |  | 14 | push( @all_text, $next ); | 
| 239 |  |  |  |  |  |  | } | 
| 240 |  |  |  |  |  |  | } | 
| 241 |  |  |  |  |  |  |  | 
| 242 | 2 |  |  |  |  | 24 | push( @warnings, create_warning( \@all_text ) ); | 
| 243 |  |  |  |  |  |  | } | 
| 244 |  |  |  |  |  |  |  | 
| 245 | 2 |  |  |  |  | 236 | return \@warnings; | 
| 246 |  |  |  |  |  |  | } | 
| 247 |  |  |  |  |  |  |  | 
| 248 |  |  |  |  |  |  | =head2 find_errors | 
| 249 |  |  |  |  |  |  |  | 
| 250 |  |  |  |  |  |  | Expects as parameter a scalar reference of the HTML page. | 
| 251 |  |  |  |  |  |  |  | 
| 252 |  |  |  |  |  |  | Tries to find all errors on the HTML, based on CSS classes. | 
| 253 |  |  |  |  |  |  |  | 
| 254 |  |  |  |  |  |  | Returns an array reference with all errors found. | 
| 255 |  |  |  |  |  |  |  | 
| 256 |  |  |  |  |  |  | =cut | 
| 257 |  |  |  |  |  |  |  | 
| 258 |  |  |  |  |  |  | sub find_errors { | 
| 259 | 7 |  |  | 7 | 1 | 1500 | my $content_ref = shift; | 
| 260 | 7 | 100 |  |  |  | 58 | croak "Must receive an scalar reference as parameter" | 
| 261 |  |  |  |  |  |  | unless ( ref($content_ref) eq 'SCALAR' ); | 
| 262 | 6 |  |  |  |  | 60 | my $tree = HTML::TreeBuilder::XPath->new; | 
| 263 | 6 |  |  |  |  | 1876 | $tree->parse_content($$content_ref); | 
| 264 | 6 |  |  |  |  | 131359 | my @nodes = $tree->findnodes('//div[@id="content"]/div[@class="error"]'); | 
| 265 | 6 |  |  |  |  | 133290 | my @errors; | 
| 266 |  |  |  |  |  |  |  | 
| 267 | 6 |  |  |  |  | 22 | foreach my $node (@nodes) { | 
| 268 | 5 |  |  |  |  | 9 | my @all_text; | 
| 269 | 5 |  |  |  |  | 23 | push( @all_text, $node->as_trimmed_text ); | 
| 270 |  |  |  |  |  |  |  | 
| 271 | 5 |  |  |  |  | 287 | foreach my $next ( $node->right() ) { | 
| 272 | 25 | 100 |  |  |  | 447 | if ( ref $next ) { | 
| 273 | 23 | 100 |  |  |  | 41 | next if ( $next->tag eq 'br' ); | 
| 274 | 16 | 100 |  |  |  | 105 | last if ( $next->tag eq 'div' ); | 
| 275 |  |  |  |  |  |  | } | 
| 276 |  |  |  |  |  |  | else { | 
| 277 | 2 |  |  |  |  | 5 | push( @all_text, $next ); | 
| 278 |  |  |  |  |  |  | } | 
| 279 |  |  |  |  |  |  | } | 
| 280 |  |  |  |  |  |  |  | 
| 281 | 5 |  |  |  |  | 178 | push( @errors, create_error( \@all_text ) ); | 
| 282 |  |  |  |  |  |  | } | 
| 283 |  |  |  |  |  |  |  | 
| 284 |  |  |  |  |  |  | # bounce errors are inside a form | 
| 285 | 6 |  |  |  |  | 59 | my $base_xpath = '//form[@action="/mcgi"]'; | 
| 286 | 6 |  |  |  |  | 33 | @nodes = $tree->findnodes( $base_xpath . '//strong' ); | 
| 287 |  |  |  |  |  |  |  | 
| 288 | 6 | 100 |  |  |  | 109338 | if (@nodes) { | 
| 289 | 1 | 50 |  |  |  | 9 | if ( $nodes[0]->as_trimmed_text() eq 'Bounce error' ) { | 
| 290 | 1 |  |  |  |  | 55 | my @nodes = $tree->findnodes($base_xpath); | 
| 291 | 1 |  |  |  |  | 15221 | $nodes[0]->parent(undef); | 
| 292 | 1 |  |  |  |  | 13 | my @messages; | 
| 293 |  |  |  |  |  |  |  | 
| 294 | 1 |  |  |  |  | 5 | foreach my $node ( $nodes[0]->content_list() ) { | 
| 295 | 8 | 100 |  |  |  | 25 | next unless ( ref($node) eq '' ); | 
| 296 | 3 |  |  |  |  | 33 | push( @messages, $node ); | 
| 297 |  |  |  |  |  |  | } | 
| 298 |  |  |  |  |  |  |  | 
| 299 | 1 |  |  |  |  | 10 | push( @errors, create_error( \@messages ) ); | 
| 300 |  |  |  |  |  |  | } | 
| 301 |  |  |  |  |  |  | } | 
| 302 |  |  |  |  |  |  |  | 
| 303 | 6 |  |  |  |  | 554 | return \@errors; | 
| 304 |  |  |  |  |  |  | } | 
| 305 |  |  |  |  |  |  |  | 
| 306 |  |  |  |  |  |  | =head2 find_best_contacts | 
| 307 |  |  |  |  |  |  |  | 
| 308 |  |  |  |  |  |  | Expects as parameter a scalar reference of the HTML page. | 
| 309 |  |  |  |  |  |  |  | 
| 310 |  |  |  |  |  |  | Tries to find all best contacts on the HTML, based on CSS classes. | 
| 311 |  |  |  |  |  |  |  | 
| 312 |  |  |  |  |  |  | The best contacts are the e-mail address that Spamcop considers appropriate to | 
| 313 |  |  |  |  |  |  | use for SPAM reporting. | 
| 314 |  |  |  |  |  |  |  | 
| 315 |  |  |  |  |  |  | Returns an array reference with all best contacts found. | 
| 316 |  |  |  |  |  |  |  | 
| 317 |  |  |  |  |  |  | =cut | 
| 318 |  |  |  |  |  |  |  | 
| 319 |  |  |  |  |  |  | sub find_best_contacts { | 
| 320 | 4 |  |  | 4 | 1 | 3290 | my $content_ref = shift; | 
| 321 | 4 | 100 |  |  |  | 48 | croak "Must receive an scalar reference as parameter" | 
| 322 |  |  |  |  |  |  | unless ( ref($content_ref) eq 'SCALAR' ); | 
| 323 | 3 |  |  |  |  | 33 | my $tree = HTML::TreeBuilder::XPath->new; | 
| 324 | 3 |  |  |  |  | 927 | $tree->parse_content($content_ref); | 
| 325 | 3 |  |  |  |  | 108606 | my @nodes = $tree->findnodes('//div[@id="content"]'); | 
| 326 |  |  |  |  |  |  |  | 
| 327 | 3 |  |  |  |  | 106511 | foreach my $node (@nodes) { | 
| 328 | 3 |  |  |  |  | 14 | for my $html_element ( $node->content_list ) { | 
| 329 |  |  |  |  |  |  |  | 
| 330 |  |  |  |  |  |  | # only text | 
| 331 | 130 | 100 |  |  |  | 294 | next if ref($html_element); | 
| 332 | 42 |  |  |  |  | 108 | $html_element =~ s/^\s+//; | 
| 333 | 42 | 100 |  |  |  | 109 | if ( index( $html_element, 'Using best contacts' ) == 0 ) { | 
| 334 | 2 |  |  |  |  | 37 | my @tokens = split( /\s/, $html_element ); | 
| 335 | 2 |  |  |  |  | 11 | splice( @tokens, 0, 3 ); | 
| 336 | 2 |  |  |  |  | 231 | return \@tokens; | 
| 337 |  |  |  |  |  |  | } | 
| 338 |  |  |  |  |  |  | } | 
| 339 |  |  |  |  |  |  |  | 
| 340 |  |  |  |  |  |  | } | 
| 341 |  |  |  |  |  |  |  | 
| 342 | 1 |  |  |  |  | 111 | return []; | 
| 343 |  |  |  |  |  |  | } | 
| 344 |  |  |  |  |  |  |  | 
| 345 |  |  |  |  |  |  | =head2 find_spam_header | 
| 346 |  |  |  |  |  |  |  | 
| 347 |  |  |  |  |  |  | Expects as parameter a scalar reference of the HTML page. | 
| 348 |  |  |  |  |  |  |  | 
| 349 |  |  |  |  |  |  | You can optionally pass a second parameter that defines if each line should be | 
| 350 |  |  |  |  |  |  | prefixed with a tab character. The default value is false. | 
| 351 |  |  |  |  |  |  |  | 
| 352 |  |  |  |  |  |  | Tries to find the e-mail header of the SPAM reported. | 
| 353 |  |  |  |  |  |  |  | 
| 354 |  |  |  |  |  |  | Returns an array reference with all the lines of the e-mail header found. | 
| 355 |  |  |  |  |  |  |  | 
| 356 |  |  |  |  |  |  | =cut | 
| 357 |  |  |  |  |  |  |  | 
| 358 |  |  |  |  |  |  | sub find_spam_header { | 
| 359 | 2 |  |  | 2 | 1 | 320 | my $content_ref = shift; | 
| 360 | 2 | 50 |  |  |  | 14 | croak "Must receive an scalar reference as parameter" | 
| 361 |  |  |  |  |  |  | unless ( ref($content_ref) eq 'SCALAR' ); | 
| 362 | 2 |  | 50 |  |  | 20 | my $formatted //= 0; | 
| 363 | 2 |  |  |  |  | 21 | my $tree = HTML::TreeBuilder::XPath->new; | 
| 364 | 2 |  |  |  |  | 677 | $tree->parse_content($content_ref); | 
| 365 |  |  |  |  |  |  |  | 
| 366 | 2 |  |  |  |  | 71228 | my @nodes    = $tree->findnodes('/html/body/div[5]/p[1]/strong'); | 
| 367 | 2 |  |  |  |  | 12872 | my $expected = 'Please make sure this email IS spam:'; | 
| 368 | 2 |  |  |  |  | 6 | my $parent   = undef; | 
| 369 |  |  |  |  |  |  |  | 
| 370 | 2 |  |  |  |  | 8 | foreach my $node (@nodes) { | 
| 371 | 2 | 50 |  |  |  | 17 | if ( $node->as_trimmed_text eq $expected ) { | 
| 372 | 2 |  |  |  |  | 183 | $parent = $node->parent; | 
| 373 | 2 |  |  |  |  | 20 | last; | 
| 374 |  |  |  |  |  |  | } | 
| 375 |  |  |  |  |  |  | } | 
| 376 |  |  |  |  |  |  |  | 
| 377 | 2 | 50 |  |  |  | 12 | if ($parent) { | 
| 378 | 2 |  |  |  |  | 10 | $parent->parent(undef); | 
| 379 | 2 |  |  |  |  | 27 | @nodes = $parent->findnodes('//font'); | 
| 380 |  |  |  |  |  |  |  | 
| 381 | 2 | 50 | 33 |  |  | 2999 | if (   ( scalar(@nodes) != 1 ) | 
| 382 |  |  |  |  |  |  | or ( ref( $nodes[0] ) ne 'HTML::Element' ) ) | 
| 383 |  |  |  |  |  |  | { | 
| 384 | 0 |  |  |  |  | 0 | croak 'Unexpected content of SPAM header: ' . Dumper(@nodes); | 
| 385 |  |  |  |  |  |  | } | 
| 386 |  |  |  |  |  |  |  | 
| 387 | 2 |  |  |  |  | 6 | my @lines; | 
| 388 | 2 |  |  |  |  | 18 | my $header = $nodes[0]->content; | 
| 389 |  |  |  |  |  |  |  | 
| 390 | 2 |  |  |  |  | 17 | for ( my $i = 0 ; $i <= scalar( @{$header} ) ; $i++ ) { | 
|  | 20 |  |  |  |  | 61 |  | 
| 391 | 18 | 100 |  |  |  | 46 | if ( ref( $header->[$i] ) eq 'HTML::Element' ) { | 
| 392 | 8 |  |  |  |  | 27 | $header->[$i]->parent(undef); | 
| 393 |  |  |  |  |  |  |  | 
| 394 |  |  |  |  |  |  | # just want text here | 
| 395 | 8 | 100 |  |  |  | 85 | next unless $header->[$i]->content; | 
| 396 | 2 |  |  |  |  | 17 | my $content = ( $header->[$i]->content )->[0]; | 
| 397 | 2 | 50 |  |  |  | 11 | next unless $content; | 
| 398 | 2 | 50 |  |  |  | 12 | next if ( ref($content) ); | 
| 399 | 2 |  |  |  |  | 6 | $header->[$i] = $content; | 
| 400 |  |  |  |  |  |  | } | 
| 401 | 12 | 100 |  |  |  | 45 | next unless $header->[$i]; | 
| 402 |  |  |  |  |  |  |  | 
| 403 |  |  |  |  |  |  | # removing Unicode spaces in place | 
| 404 | 10 |  |  |  |  | 39 | $header->[$i] =~ s/^\s++//u; | 
| 405 |  |  |  |  |  |  |  | 
| 406 | 10 | 50 |  |  |  | 50 | if ($formatted) { | 
| 407 | 0 |  |  |  |  | 0 | push( @lines, "\t$header->[$i]" ); | 
| 408 |  |  |  |  |  |  |  | 
| 409 |  |  |  |  |  |  | } | 
| 410 |  |  |  |  |  |  | else { | 
| 411 | 10 |  |  |  |  | 61 | push( @lines, $header->[$i] ); | 
| 412 |  |  |  |  |  |  |  | 
| 413 |  |  |  |  |  |  | } | 
| 414 |  |  |  |  |  |  | } | 
| 415 | 2 |  |  |  |  | 223 | return \@lines; | 
| 416 |  |  |  |  |  |  | } | 
| 417 |  |  |  |  |  |  |  | 
| 418 | 0 |  |  |  |  | 0 | return []; | 
| 419 |  |  |  |  |  |  | } | 
| 420 |  |  |  |  |  |  |  | 
| 421 |  |  |  |  |  |  | =head2 find_receivers | 
| 422 |  |  |  |  |  |  |  | 
| 423 |  |  |  |  |  |  | Expects as parameter a scalar reference of the HTML page. | 
| 424 |  |  |  |  |  |  |  | 
| 425 |  |  |  |  |  |  | Tries to find all the receivers of the SPAM report, even if those were not real | 
| 426 |  |  |  |  |  |  | e-mail address, only internal identifiers for Spamcop to store statistics. | 
| 427 |  |  |  |  |  |  |  | 
| 428 |  |  |  |  |  |  | Returns an array reference, where each item is a string. | 
| 429 |  |  |  |  |  |  |  | 
| 430 |  |  |  |  |  |  | =cut | 
| 431 |  |  |  |  |  |  |  | 
| 432 |  |  |  |  |  |  | sub find_receivers { | 
| 433 | 2 |  |  | 2 | 1 | 298 | my $content_ref = shift; | 
| 434 | 2 | 50 |  |  |  | 19 | croak "Must receive an scalar reference as parameter" | 
| 435 |  |  |  |  |  |  | unless ( ref($content_ref) eq 'SCALAR' ); | 
| 436 | 2 |  |  |  |  | 27 | my $tree = HTML::TreeBuilder::XPath->new; | 
| 437 | 2 |  |  |  |  | 725 | $tree->parse_content($content_ref); | 
| 438 | 2 |  |  |  |  | 27976 | my @nodes = $tree->findnodes('//*[@id="content"]'); | 
| 439 | 2 |  |  |  |  | 48636 | my @receivers; | 
| 440 | 2 |  |  |  |  | 7 | my $devnull     = q{/dev/null'ing}; | 
| 441 | 2 |  |  |  |  | 7 | my $report_sent = 'Spam report id'; | 
| 442 |  |  |  |  |  |  |  | 
| 443 | 2 |  |  |  |  | 9 | foreach my $node (@nodes) { | 
| 444 | 2 |  |  |  |  | 15 | foreach my $inner ( $node->content_list() ) { | 
| 445 |  |  |  |  |  |  |  | 
| 446 |  |  |  |  |  |  | # we just want text nodes, everything else is discarded | 
| 447 | 10 | 100 |  |  |  | 44 | next if ( ref($inner) ); | 
| 448 | 4 |  |  |  |  | 21 | $inner =~ s/^\s+//; | 
| 449 | 4 |  |  |  |  | 19 | $inner =~ s/\s+$//; | 
| 450 |  |  |  |  |  |  |  | 
| 451 | 4 |  |  |  |  | 7 | my $result_ref; | 
| 452 | 4 |  |  |  |  | 24 | my @parts = split( /\s/, $inner ); | 
| 453 |  |  |  |  |  |  |  | 
| 454 |  |  |  |  |  |  | # /dev/null\'ing report for google-abuse-bounces-reports@devnull.spamcop.net | 
| 455 | 4 | 100 |  |  |  | 25 | if ( substr( $inner, 0, length($devnull) ) eq $devnull ) { | 
|  |  | 50 |  |  |  |  |  | 
| 456 | 3 |  |  |  |  | 20 | $result_ref = [ ( split( '@', $parts[-1] ) )[0], undef ]; | 
| 457 |  |  |  |  |  |  | } | 
| 458 |  |  |  |  |  |  |  | 
| 459 |  |  |  |  |  |  | # Spam report id 7151980235 sent to: dl_security_whois@navercorp.com | 
| 460 |  |  |  |  |  |  | elsif ( | 
| 461 |  |  |  |  |  |  | substr( $inner, 0, length($report_sent) ) eq $report_sent ) | 
| 462 |  |  |  |  |  |  | { | 
| 463 | 1 |  |  |  |  | 5 | $result_ref = [ $parts[6], $parts[3] ]; | 
| 464 |  |  |  |  |  |  | } | 
| 465 |  |  |  |  |  |  | else { | 
| 466 | 0 |  |  |  |  | 0 | warn "Unexpected receiver format: $inner"; | 
| 467 |  |  |  |  |  |  | } | 
| 468 |  |  |  |  |  |  |  | 
| 469 | 4 |  |  |  |  | 12 | push( @receivers, $result_ref ); | 
| 470 |  |  |  |  |  |  | } | 
| 471 |  |  |  |  |  |  | } | 
| 472 |  |  |  |  |  |  |  | 
| 473 | 2 |  |  |  |  | 158 | return \@receivers; | 
| 474 |  |  |  |  |  |  | } | 
| 475 |  |  |  |  |  |  |  | 
| 476 |  |  |  |  |  |  | =head1 SEE ALSO | 
| 477 |  |  |  |  |  |  |  | 
| 478 |  |  |  |  |  |  | =over | 
| 479 |  |  |  |  |  |  |  | 
| 480 |  |  |  |  |  |  | =item * | 
| 481 |  |  |  |  |  |  |  | 
| 482 |  |  |  |  |  |  | L<HTML::TreeBuilder::XPath> | 
| 483 |  |  |  |  |  |  |  | 
| 484 |  |  |  |  |  |  | =back | 
| 485 |  |  |  |  |  |  |  | 
| 486 |  |  |  |  |  |  | =head1 AUTHOR | 
| 487 |  |  |  |  |  |  |  | 
| 488 |  |  |  |  |  |  | Alceu Rodrigues de Freitas Junior, E<lt>arfreitas@cpan.orgE<gt> | 
| 489 |  |  |  |  |  |  |  | 
| 490 |  |  |  |  |  |  | =head1 COPYRIGHT AND LICENSE | 
| 491 |  |  |  |  |  |  |  | 
| 492 |  |  |  |  |  |  | This software is copyright (c) 2018 of Alceu Rodrigues de Freitas Junior, | 
| 493 |  |  |  |  |  |  | E<lt>arfreitas@cpan.orgE<gt> | 
| 494 |  |  |  |  |  |  |  | 
| 495 |  |  |  |  |  |  | This file is part of App-SpamcupNG distribution. | 
| 496 |  |  |  |  |  |  |  | 
| 497 |  |  |  |  |  |  | App-SpamcupNG is free software: you can redistribute it and/or modify it under | 
| 498 |  |  |  |  |  |  | the terms of the GNU General Public License as published by the Free Software | 
| 499 |  |  |  |  |  |  | Foundation, either version 3 of the License, or (at your option) any later | 
| 500 |  |  |  |  |  |  | version. | 
| 501 |  |  |  |  |  |  |  | 
| 502 |  |  |  |  |  |  | App-SpamcupNG is distributed in the hope that it will be useful, but WITHOUT | 
| 503 |  |  |  |  |  |  | ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS | 
| 504 |  |  |  |  |  |  | FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. | 
| 505 |  |  |  |  |  |  |  | 
| 506 |  |  |  |  |  |  | You should have received a copy of the GNU General Public License along with | 
| 507 |  |  |  |  |  |  | App-SpamcupNG. If not, see <http://www.gnu.org/licenses/>. | 
| 508 |  |  |  |  |  |  |  | 
| 509 |  |  |  |  |  |  | =cut | 
| 510 |  |  |  |  |  |  |  | 
| 511 |  |  |  |  |  |  | 1; |