| line | stmt | bran | cond | sub | pod | time | code | 
| 1 |  |  |  |  |  |  | package HTML::ExtractContent::Util; | 
| 2 | 3 |  |  | 3 |  | 4484 | use strict; | 
|  | 3 |  |  |  |  | 8 |  | 
|  | 3 |  |  |  |  | 116 |  | 
| 3 | 3 |  |  | 3 |  | 14 | use warnings; | 
|  | 3 |  |  |  |  | 8 |  | 
|  | 3 |  |  |  |  | 79 |  | 
| 4 | 3 |  |  | 3 |  | 2816 | use Exporter::Lite; | 
|  | 3 |  |  |  |  | 2407 |  | 
|  | 3 |  |  |  |  | 20 |  | 
| 5 | 3 |  |  | 3 |  | 3256 | use utf8; | 
|  | 3 |  |  |  |  | 35 |  | 
|  | 3 |  |  |  |  | 20 |  | 
| 6 |  |  |  |  |  |  |  | 
| 7 | 3 |  |  | 3 |  | 4154 | use HTML::Entities; | 
|  | 3 |  |  |  |  | 546121 |  | 
|  | 3 |  |  |  |  | 2538 |  | 
| 8 |  |  |  |  |  |  |  | 
| 9 |  |  |  |  |  |  | sub strip { | 
| 10 | 113 |  |  | 113 | 0 | 161 | my $str = shift; | 
| 11 | 113 |  |  |  |  | 952 | $str =~ s/(^\s+|\s+$)//gs; | 
| 12 | 113 |  |  |  |  | 472 | return $str; | 
| 13 |  |  |  |  |  |  | } | 
| 14 |  |  |  |  |  |  |  | 
| 15 |  |  |  |  |  |  | sub strip_tags { | 
| 16 | 67 |  |  | 67 | 0 | 97 | my $page = shift; | 
| 17 | 67 |  |  |  |  | 391 | $page =~ s!<[^>\s]+(?:\s+[^>"]+(?:=(?:"[^"]*"|'[^']*'|\S+))?)*/?>!!gs; | 
| 18 | 67 |  |  |  |  | 181 | return $page; | 
| 19 |  |  |  |  |  |  | } | 
| 20 |  |  |  |  |  |  |  | 
| 21 |  |  |  |  |  |  | sub eliminate_tags { | 
| 22 | 30 |  |  | 30 | 0 | 44 | my ($page, $tag) = @_; | 
| 23 | 30 |  |  |  |  | 767 | $page =~ s/<$tag[\s>].*?<\/$tag\s*>//igs; | 
| 24 | 30 |  |  |  |  | 109 | return $page; | 
| 25 |  |  |  |  |  |  | } | 
| 26 |  |  |  |  |  |  |  | 
| 27 |  |  |  |  |  |  | sub eliminate_links { | 
| 28 | 15 |  |  | 15 | 0 | 37 | return eliminate_tags shift, 'a'; | 
| 29 |  |  |  |  |  |  | } | 
| 30 |  |  |  |  |  |  |  | 
| 31 |  |  |  |  |  |  | sub eliminate_forms { | 
| 32 | 13 |  |  | 13 | 0 | 25 | return eliminate_tags shift, 'form'; | 
| 33 |  |  |  |  |  |  | } | 
| 34 |  |  |  |  |  |  |  | 
| 35 |  |  |  |  |  |  | sub eliminate_br { | 
| 36 | 59 |  |  | 59 | 0 | 165 | my $page = shift; | 
| 37 | 59 |  |  |  |  | 119 | $page =~ s/ ]*>/ /igs;
 | 
| 38 | 59 |  |  |  |  | 127 | return $page; | 
| 39 |  |  |  |  |  |  | } | 
| 40 |  |  |  |  |  |  |  | 
| 41 |  |  |  |  |  |  | sub extract_alt { | 
| 42 | 15 |  |  | 15 | 0 | 22 | my $page = shift; | 
| 43 | 15 |  |  |  |  | 27 | $page =~ s/ ![]() ]*alt\s*=\s*['"]?(.*?)["']?[^>]*>/$1/igs; | 
| 44 | 15 |  |  |  |  | 33 | return $page; | 
| 45 |  |  |  |  |  |  | } | 
| 46 |  |  |  |  |  |  |  | 
| 47 |  |  |  |  |  |  | sub unescape { | 
| 48 | 59 |  |  | 59 | 0 | 76 | my $page = shift; | 
| 49 | 59 |  |  |  |  | 295 | decode_entities($page); | 
| 50 |  |  |  |  |  |  | } | 
| 51 |  |  |  |  |  |  |  | 
| 52 |  |  |  |  |  |  | sub reduce_ws { | 
| 53 | 59 |  |  | 59 | 0 | 86 | my $page = shift; | 
| 54 | 59 |  |  |  |  | 282 | $page =~ s/[ \t]+/ /g; | 
| 55 | 59 |  |  |  |  | 137 | $page =~ s/\n\s*/\n/gs; | 
| 56 | 59 |  |  |  |  | 137 | return $page; | 
| 57 |  |  |  |  |  |  | } | 
| 58 |  |  |  |  |  |  |  | 
| 59 |  |  |  |  |  |  | sub decode { | 
| 60 | 59 |  |  | 59 | 0 | 109 | return strip (reduce_ws (unescape (strip_tags (eliminate_br shift)))); | 
| 61 |  |  |  |  |  |  | } | 
| 62 |  |  |  |  |  |  |  | 
| 63 |  |  |  |  |  |  | sub to_text { | 
| 64 | 15 |  |  | 15 | 0 | 43 | return decode (extract_alt shift); | 
| 65 |  |  |  |  |  |  | } | 
| 66 |  |  |  |  |  |  |  | 
| 67 |  |  |  |  |  |  | sub match_count { | 
| 68 | 40 |  |  | 40 | 0 | 157 | my ($str, $exp) = @_; | 
| 69 | 40 |  |  |  |  | 329 | my @list = ($str =~ $exp); | 
| 70 | 40 |  |  |  |  | 147 | return $#list + 1; | 
| 71 |  |  |  |  |  |  | } | 
| 72 |  |  |  |  |  |  |  | 
| 73 |  |  |  |  |  |  | our @EXPORT = qw/strip strip_tags eliminate_tags eliminate_links eliminate_forms eliminate_br extract_alt unescape reduce_ws decode to_text match_count/; | 
| 74 |  |  |  |  |  |  |  | 
| 75 |  |  |  |  |  |  | 1; |