blib/lib/HTML/Quoted.pm | |||
---|---|---|---|
Criterion | Covered | Total | % |
statement | 98 | 133 | 73.6 |
branch | 30 | 50 | 60.0 |
condition | 10 | 21 | 47.6 |
subroutine | 12 | 15 | 80.0 |
pod | 2 | 2 | 100.0 |
total | 152 | 221 | 68.7 |
line | stmt | bran | cond | sub | pod | time | code |
---|---|---|---|---|---|---|---|
1 | 3 | 3 | 211656 | use 5.008; | |||
3 | 24 | ||||||
2 | 3 | 3 | 17 | use strict; | |||
3 | 4 | ||||||
3 | 68 | ||||||
3 | 3 | 3 | 13 | use warnings; | |||
3 | 5 | ||||||
3 | 814 | ||||||
4 | |||||||
5 | package HTML::Quoted; | ||||||
6 | |||||||
7 | our $VERSION = '0.05'; | ||||||
8 | |||||||
9 | =head1 NAME | ||||||
10 | |||||||
11 | HTML::Quoted - extract structure of quoted HTML mail message | ||||||
12 | |||||||
13 | =head1 SYNOPSIS | ||||||
14 | |||||||
15 | use HTML::Quoted; | ||||||
16 | my $html = '...'; | ||||||
17 | my $struct = HTML::Quoted->extract( $html ); | ||||||
18 | |||||||
19 | =head1 DESCRIPTION | ||||||
20 | |||||||
21 | Parses and extracts quotation structure out of a HTML message. | ||||||
22 | Purpose and returned structures are very similar to | ||||||
23 | L |
||||||
24 | |||||||
25 | =head1 SUPPORTED FORMATS | ||||||
26 | |||||||
27 | Variouse MUAs use quite different approaches for quoting in mails. | ||||||
28 | |||||||
29 | Some use Itag and it's quite easy to parse. |
||||||
30 | |||||||
31 | Some wrap text into I tags and add '>' in the beginning of the |
||||||
32 | paragraphs. | ||||||
33 | |||||||
34 | Things gettign messier when it's an HTML reply on plain text mail | ||||||
35 | thread. | ||||||
36 | |||||||
37 | If B |
||||||
38 | via rt.cpan.org with as short as possible example. B |
||||||
39 | is even better. Test file with patch is the best. Not obviouse patches | ||||||
40 | without tests suck. | ||||||
41 | |||||||
42 | =head1 METHODS | ||||||
43 | |||||||
44 | =head2 extract | ||||||
45 | |||||||
46 | my $struct = HTML::Quoted->extract( $html ); | ||||||
47 | |||||||
48 | Takes a string with HTML and returns array reference. Each element | ||||||
49 | in the array either array or hash. For example: | ||||||
50 | |||||||
51 | |||||||
52 | [ | ||||||
53 | { 'raw' => 'Hi,' }, | ||||||
54 | { 'raw' => ' On date X wrote: ' }, |
||||||
55 | [ | ||||||
56 | { 'raw' => '' }, |
||||||
57 | { 'raw' => 'Hello,' }, | ||||||
58 | { 'raw' => ' How are you? ' }, |
||||||
59 | { 'raw' => '' } | ||||||
60 | ], | ||||||
61 | ... | ||||||
62 | ] | ||||||
63 | |||||||
64 | Hashes represent a part of the html. The following keys are | ||||||
65 | meaningful at the moment: | ||||||
66 | |||||||
67 | =over 4 | ||||||
68 | |||||||
69 | =item * raw - raw HTML | ||||||
70 | |||||||
71 | =item * quoter_raw, quoter - raw and decoded (entities are converted) quoter if block is prefixed with quoting characters | ||||||
72 | |||||||
73 | =back | ||||||
74 | |||||||
75 | =cut | ||||||
76 | |||||||
77 | sub extract { | ||||||
78 | 24 | 24 | 1 | 8516 | my $self = shift; | ||
79 | 24 | 205 | my $parser = HTML::Quoted::Parser->new( | ||||
80 | api_version => 3, | ||||||
81 | handlers => { | ||||||
82 | start_document => [handle_doc_start => 'self'], | ||||||
83 | end_document => [handle_doc_end => 'self'], | ||||||
84 | start => [handle_start => 'self, tagname, attr, attrseq, text'], | ||||||
85 | end => [handle_end => 'self, tagname, text'], | ||||||
86 | text => [handle_text => 'self, text, is_cdata'], | ||||||
87 | default => [handle_default => 'self, event, text'], | ||||||
88 | }, | ||||||
89 | ); | ||||||
90 | 24 | 1439 | $parser->empty_element_tags(1); | ||||
91 | 24 | 147 | $parser->parse($_[0]); | ||||
92 | 24 | 141 | $parser->eof; | ||||
93 | |||||||
94 | 24 | 267 | return $parser->{'html_quoted_parser'}{'result'}; | ||||
95 | } | ||||||
96 | |||||||
97 | =head2 combine_hunks | ||||||
98 | |||||||
99 | my $html = HTML::Quoted->combine_hunks( $arrayref_of_hunks ); | ||||||
100 | |||||||
101 | Takes the output of C |
||||||
102 | |||||||
103 | =cut | ||||||
104 | |||||||
105 | sub combine_hunks { | ||||||
106 | 14 | 14 | 1 | 33 | my ($self, $hunks) = @_; | ||
107 | |||||||
108 | join "", | ||||||
109 | 14 | 100 | 28 | map {; ref $_ eq 'HASH' ? $_->{raw} : $self->combine_hunks($_) } @$hunks; | |||
29 | 120 | ||||||
110 | } | ||||||
111 | |||||||
112 | package HTML::Quoted::Parser; | ||||||
113 | 3 | 3 | 21 | use base "HTML::Parser"; | |||
3 | 6 | ||||||
3 | 1939 | ||||||
114 | |||||||
115 | sub handle_doc_start { | ||||||
116 | 24 | 24 | 45 | my ($self) = @_; | |||
117 | 24 | 62 | my $meta = $self->{'html_quoted_parser'} = {}; | ||||
118 | 24 | 67 | my $res = $meta->{'result'} = [{}]; | ||||
119 | 24 | 46 | $meta->{'current'} = $res->[0]; | ||||
120 | 24 | 49 | $meta->{'stack'} = [$res]; | ||||
121 | 24 | 158 | $meta->{'in'} = { quote => 0, block => [0] }; | ||||
122 | } | ||||||
123 | |||||||
124 | sub handle_doc_end { | ||||||
125 | 24 | 24 | 63 | my ($self) = @_; | |||
126 | |||||||
127 | 24 | 40 | my $meta = $self->{'html_quoted_parser'}; | ||||
128 | 24 | 100 | 66 | 79 | pop @{ $meta->{'result'} } if ref $meta->{'result'}[-1] eq 'HASH' && !keys %{ $meta->{'result'}[-1] }; | ||
12 | 23 | ||||||
24 | 100 | ||||||
129 | 24 | 75 | $self->organize( $meta->{'result'} ); | ||||
130 | } | ||||||
131 | |||||||
132 | sub organize { | ||||||
133 | 28 | 28 | 55 | my ($self, $list) = @_; | |||
134 | |||||||
135 | 28 | 37 | my $prev = undef; | ||||
136 | 28 | 62 | foreach my $e ( splice @$list ) { | ||||
137 | 58 | 100 | 140 | if ( ref $e eq 'ARRAY' ) { | |||
100 | |||||||
50 | |||||||
138 | 4 | 12 | push @$list, $self->organize($e); | ||||
139 | 4 | 7 | $prev = undef; | ||||
140 | } | ||||||
141 | elsif ( $e->{'block'} ) { | ||||||
142 | 22 | 32 | push @$list, $e; | ||||
143 | 22 | 33 | $prev = undef; | ||||
144 | } | ||||||
145 | elsif ( defined $e->{'quoter'} ) { | ||||||
146 | 0 | 0 | 0 | 0 | if ( !$prev || $self->combine( $prev, $e ) ) { | ||
147 | 0 | 0 | push @$list, $prev = [ $e ]; | ||||
148 | } | ||||||
149 | } else { | ||||||
150 | 32 | 53 | push @$list, $e; | ||||
151 | 32 | 56 | $prev = undef; | ||||
152 | } | ||||||
153 | } | ||||||
154 | 28 | 70 | return $list; | ||||
155 | } | ||||||
156 | |||||||
157 | sub combine { | ||||||
158 | 0 | 0 | 0 | my ($self, $list, $e) = @_; | |||
159 | 0 | 0 | my ($last) = grep ref $_ eq 'HASH', reverse @$list; | ||||
160 | 0 | 0 | 0 | if ( $last->{'quoter'} eq $e->{'quoter'} ) { | |||
0 | |||||||
0 | |||||||
161 | 0 | 0 | push @$list, $e; | ||||
162 | 0 | 0 | return (); | ||||
163 | } | ||||||
164 | elsif ( rindex( $last->{'quoter'}, $e->{'quoter'}, 0) == 0 ) { | ||||||
165 | 0 | 0 | @$list = ( [@$list], $e ); | ||||
166 | 0 | 0 | return (); | ||||
167 | } | ||||||
168 | elsif ( rindex( $e->{'quoter'}, $last->{'quoter'}, 0) == 0 ) { | ||||||
169 | 0 | 0 | 0 | 0 | if ( ref $list->[-1] eq 'ARRAY' && !$self->combine( $list->[-1], $e ) ) { | ||
170 | 0 | 0 | return (); | ||||
171 | } | ||||||
172 | 0 | 0 | push @$list, [ $e ]; | ||||
173 | 0 | 0 | return (); | ||||
174 | } | ||||||
175 | else { | ||||||
176 | 0 | 0 | return $e; | ||||
177 | } | ||||||
178 | } | ||||||
179 | |||||||
180 | # XXX: p is treated as inline tag as it's groupping tag that | ||||||
181 | # can not contain blocks inside, use span for groupping | ||||||
182 | # hr is treated as inline tag as it doesn't contain blocks inside | ||||||
183 | my %INLINE_TAG = map {$_ => 1 } qw( | ||||||
184 | a br span bdo map img | ||||||
185 | tt i b big small | ||||||
186 | em strong dfn code q | ||||||
187 | samp kbd var cite abbr acronym sub sup | ||||||
188 | p hr | ||||||
189 | ); | ||||||
190 | |||||||
191 | my %ENTITIES = ( | ||||||
192 | '>' => '>', | ||||||
193 | '>' => '>', | ||||||
194 | '>' => '>', | ||||||
195 | ); | ||||||
196 | |||||||
197 | my $re_amp = join '|', map "\Q$_\E", '>', grep $ENTITIES{$_} eq '>', keys %ENTITIES; | ||||||
198 | $re_amp = qr{$re_amp}; | ||||||
199 | my $re_quote_char = qr{[!#%=|:]}; | ||||||
200 | my $re_quote_chunk = qr{ $re_quote_char(?!\w) | \w*$re_amp+ }x; | ||||||
201 | my $re_quoter = qr{ $re_quote_chunk (?:[ \\t]* $re_quote_chunk)* }x; | ||||||
202 | |||||||
203 | sub handle_start { | ||||||
204 | 44 | 44 | 117 | my ($self, $tag, $attr, $attrseq, $text) = @_; | |||
205 | |||||||
206 | 44 | 65 | my $meta = $self->{'html_quoted_parser'}; | ||||
207 | 44 | 61 | my $stack = $meta->{'stack'}; | ||||
208 | |||||||
209 | 44 | 50 | 94 | if ( $meta->{'in'}{'br'} ) { | |||
210 | 0 | 0 | $meta->{'in'}{'br'} = 0; | ||||
211 | 0 | 0 | push @{ $stack->[-1] }, $meta->{'current'} = {}; | ||||
0 | 0 | ||||||
212 | } | ||||||
213 | |||||||
214 | 44 | 100 | 100 | 184 | if ( $tag eq 'blockquote' ) { | ||
100 | |||||||
100 | |||||||
215 | 4 | 11 | my $new = [{ quote => 1, block => 1 }]; | ||||
216 | 4 | 6 | push @{ $stack->[-1] }, $new; | ||||
4 | 17 | ||||||
217 | 4 | 11 | push @$stack, $new; # HACK: everything pushed into this | ||||
218 | 4 | 9 | $meta->{'current'} = $new->[0]; | ||||
219 | 4 | 7 | $meta->{'in'}{'quote'}++; | ||||
220 | 4 | 6 | push @{ $meta->{'in'}{'block'} }, 0; | ||||
4 | 9 | ||||||
221 | 4 | 8 | $meta->{'current'}{'raw'} .= $text; | ||||
222 | 4 | 7 | push @{ $stack->[-1] }, $meta->{'current'} = {}; | ||||
4 | 22 | ||||||
223 | } | ||||||
224 | elsif ( $tag eq 'br' && !$meta->{'in'}{'block'}[-1] ) { | ||||||
225 | 14 | 31 | $meta->{'current'}{'raw'} .= $text; | ||||
226 | 14 | 23 | my $line = $meta->{'current'}{'raw'}; | ||||
227 | 14 | 50 | 404 | if ( $line =~ /^\n*($re_quoter)/ ) { | |||
228 | 0 | 0 | $meta->{'current'}{'quoter_raw'} = $1; | ||||
229 | $meta->{'current'}{'quoter'} = $self->decode_entities( | ||||||
230 | 0 | 0 | $meta->{'current'}{'quoter_raw'} | ||||
231 | ); | ||||||
232 | } | ||||||
233 | 14 | 95 | $meta->{'in'}{'br'} = 1; | ||||
234 | } | ||||||
235 | elsif ( !$INLINE_TAG{ $tag } ) { | ||||||
236 | 20 | 100 | 100 | 63 | if ( !$meta->{'in'}{'block'}[-1] && keys %{ $meta->{'current'} } ) { | ||
14 | 94 | ||||||
237 | 4 | 6 | push @{ $stack->[-1] }, $meta->{'current'} = { raw => '' }; | ||||
4 | 18 | ||||||
238 | } | ||||||
239 | 20 | 56 | $meta->{'current'}{'block'} = 1; | ||||
240 | 20 | 119 | $meta->{'current'}{'raw'} .= $text; | ||||
241 | |||||||
242 | 20 | 124 | $meta->{'in'}{'block'}[-1]++; | ||||
243 | } | ||||||
244 | else { | ||||||
245 | 6 | 30 | $meta->{'current'}{'raw'} .= $text; | ||||
246 | } | ||||||
247 | } | ||||||
248 | |||||||
249 | sub handle_end { | ||||||
250 | 34 | 34 | 111 | my ($self, $tag, $text) = @_; | |||
251 | |||||||
252 | 34 | 53 | my $meta = $self->{'html_quoted_parser'}; | ||||
253 | 34 | 48 | my $stack = $meta->{'stack'}; | ||||
254 | |||||||
255 | 34 | 50 | 66 | 91 | if ( $meta->{'in'}{'br'} && $tag ne 'br' ) { | ||
256 | 0 | 0 | $meta->{'in'}{'br'} = 0; | ||||
257 | 0 | 0 | push @{ $stack->[-1] }, $meta->{'current'} = {} | ||||
0 | 0 | ||||||
258 | } | ||||||
259 | |||||||
260 | 34 | 84 | $meta->{'current'}{'raw'} .= $text; | ||||
261 | |||||||
262 | 34 | 100 | 128 | if ( $tag eq 'blockquote' ) { | |||
100 | |||||||
50 | |||||||
50 | |||||||
263 | 4 | 7 | pop @$stack; | ||||
264 | 4 | 7 | pop @{ $meta->{'in'}{'block'} }; | ||||
4 | 6 | ||||||
265 | 4 | 7 | push @{ $stack->[-1] }, $meta->{'current'} = {}; | ||||
4 | 10 | ||||||
266 | 4 | 20 | $meta->{'in'}{'quote'}--; | ||||
267 | } | ||||||
268 | elsif ( $tag eq 'br' ) { | ||||||
269 | 10 | 16 | $meta->{'in'}{'br'} = 0; | ||||
270 | 10 | 16 | push @{ $stack->[-1] }, $meta->{'current'} = {} | ||||
10 | 42 | ||||||
271 | } | ||||||
272 | elsif ( $tag eq 'p' ) { | ||||||
273 | 0 | 0 | push @{ $stack->[-1] }, $meta->{'current'} = {} | ||||
0 | 0 | ||||||
274 | } | ||||||
275 | elsif ( !$INLINE_TAG{ $tag } ) { | ||||||
276 | 20 | 32 | $meta->{'in'}{'block'}[-1]--; | ||||
277 | 20 | 100 | 36 | if ( $meta->{'in'}{'block'}[-1] ) { | |||
278 | 6 | 27 | $meta->{'current'}{'block'} = 1; | ||||
279 | } else { | ||||||
280 | 14 | 17 | push @{ $stack->[-1] }, $meta->{'current'} = {}; | ||||
14 | 76 | ||||||
281 | } | ||||||
282 | } | ||||||
283 | } | ||||||
284 | |||||||
285 | sub decode_entities { | ||||||
286 | 0 | 0 | 0 | my ($self, $string) = @_; | |||
287 | 0 | 0 | 0 | 0 | $string =~ s/(&(?:[a-z]+|#[0-9]|#x[0-9a-f]+);)/ $ENTITIES{$1} || $ENTITIES{lc $1} || $1 /ge; | ||
0 | 0 | ||||||
288 | 0 | 0 | return $string; | ||||
289 | } | ||||||
290 | |||||||
291 | sub handle_text { | ||||||
292 | 36 | 36 | 80 | my ($self, $text) = @_; | |||
293 | 36 | 54 | my $meta = $self->{'html_quoted_parser'}; | ||||
294 | 36 | 100 | 86 | if ( $meta->{'in'}{'br'} ) { | |||
295 | 2 | 5 | $meta->{'in'}{'br'} = 0; | ||||
296 | 2 | 4 | push @{ $meta->{'stack'}[-1] }, $meta->{'current'} = {}; | ||||
2 | 8 | ||||||
297 | } | ||||||
298 | 36 | 183 | $self->{'html_quoted_parser'}{'current'}{'raw'} .= $text; | ||||
299 | } | ||||||
300 | |||||||
301 | sub handle_default { | ||||||
302 | 0 | 0 | my ($self, $event, $text) = @_; | ||||
303 | 0 | my $meta = $self->{'html_quoted_parser'}; | |||||
304 | 0 | 0 | if ( $meta->{'in'}{'br'} ) { | ||||
305 | 0 | $meta->{'in'}{'br'} = 0; | |||||
306 | 0 | push @{ $meta->{'stack'}[-1] }, $meta->{'current'} = {}; | |||||
0 | |||||||
307 | } | ||||||
308 | 0 | $self->{'html_quoted_parser'}{'current'}{'raw'} .= $text; | |||||
309 | } | ||||||
310 | |||||||
311 | =head1 AUTHOR | ||||||
312 | |||||||
313 | Ruslan.Zakirov E |
||||||
314 | |||||||
315 | =head1 LICENSE | ||||||
316 | |||||||
317 | Under the same terms as perl itself. | ||||||
318 | |||||||
319 | =cut | ||||||
320 | |||||||
321 | 1; |