File Coverage

blib/lib/Text/Quoted.pm

Criterion	Covered	Total	%
statement	71	72	98.6
branch	24	26	92.3
condition	31	34	91.1
subroutine	11	11	100.0
pod	3	3	100.0
total	140	146	95.8

line	stmt	bran	cond	sub	pod	time	code
1							package Text::Quoted;
2							our $VERSION = "2.09";
3	7			7		125738	use 5.006;
	7					23
	7					285
4	7			7		39	use strict;
	7					11
	7					269
5	7			7		31	use warnings;
	7					13
	7					525
6
7							require Exporter;
8
9							our @ISA = qw(Exporter);
10							our @EXPORT = qw(extract);
11							our @EXPORT_OK = qw(set_quote_characters combine_hunks);
12
13							# Provides either the 'Hang' package -- or, on since version 1.69 of
14							# Text::Autoformat, the 'Text::Autoformat::Hang' package.
15	7			7		4940	use Text::Autoformat();
	7					332418
	7					489
16							my $hang_package = Hang->can('new') ? "Hang" : "Text::Autoformat::Hang";
17
18	7			7		73	use Text::Tabs();
	7					11
	7					6812
19
20							=head1 NAME
21
22							Text::Quoted - Extract the structure of a quoted mail message
23
24							=head1 SYNOPSIS
25
26							use Text::Quoted;
27							my $structure = extract($text);
28
29							# Optionally, customize recognized quote characters:
30							Text::Quoted::set_quote_characters( qr/[:]/ );
31
32							=head1 DESCRIPTION
33
34							C examines the structure of some text which may contain
35							multiple different levels of quoting, and turns the text into a nested
36							data structure.
37
38							The structure is an array reference containing hash references for each
39							paragraph belonging to the same author. Each level of quoting
40							recursively adds another list reference. So for instance, this:
41
42							> foo
43							> # Bar
44							> baz
45
46							quux
47
48							turns into:
49
50							[
51							[
52							{ text => 'foo', quoter => '>', raw => '> foo' },
53							[
54							{ text => 'Bar', quoter => '> #', raw => '> # Bar' }
55							],
56							{ text => 'baz', quoter => '>', raw => '> baz' }
57							],
58
59							{ empty => 1 },
60							{ text => 'quux', quoter => '', raw => 'quux' }
61							];
62
63							This also tells you about what's in the hash references: C is the
64							paragraph of text as it appeared in the original input; C is what
65							it looked like when we stripped off the quotation characters, and
66							C is the quotation string.
67
68							=head1 FUNCTIONS
69
70							=head2 extract
71
72							Takes a single string argument which is the text to extract quote
73							structure from. Returns a nested datastructure as described above.
74
75							Exported by default.
76
77							=cut
78
79							sub extract {
80	20			20	1	614	return _organize( "", _classify( @_ ) );
81							}
82
83							sub _organize {
84	60			60		70	my $top_level = shift;
85	60					91	my @todo = @_;
86	60	50				106	$top_level = '' unless defined $top_level;
87
88	60					64	my @ret;
89
90							# Recursively form a data structure which reflects the quoting
91							# structure of the list.
92	60					116	while (my $line = shift @todo) {
93	147	100				240	my $q = defined $line->{quoter}? $line->{quoter}: '';
94	147	100				477	if ( $q eq $top_level ) {
		100
95
96							# Just append lines at "my" level.
97	101	100	66			364	push @ret, $line
98							if exists $line->{quoter}
99							or exists $line->{empty};
100							}
101							elsif ( $q =~ /^\Q$top_level\E./ ) {
102
103							# Find all the lines at a quoting level "below" me.
104	40					77	my $newquoter = _find_below( $top_level, $line, @todo );
105	40					66	my @next = $line;
106	40		100			348	push @next, shift @todo while defined $todo[0]->{quoter}
107							and $todo[0]->{quoter} =~ /^\Q$newquoter/;
108
109							# And pass them on to _organize()!
110	40					117	push @ret, _organize( $newquoter, @next );
111							}
112							}
113	60					281	return \@ret;
114							}
115
116							# Given, say:
117							# X
118							# > > hello
119							# > foo bar
120							# Stuff
121							#
122							# After "X", we're moving to another level of quoting - but which one?
123							# Naively, you'd pick out the prefix of the next line, "> >", but this
124							# is incorrect - "> >" is actually a "sub-quote" of ">". This routine
125							# works out which is the next level below us.
126
127							sub _find_below {
128	40			40		74	my ( $top_level, @stuff ) = @_;
129
130							# Find the prefixes, shortest first; return the first one which is
131							# "below" where we are right now but is a proper subset of the next
132							# line.
133							return (
134	40		100			1180	sort { length $a <=> length $b }
	22					67
135							grep $_ && /^\Q$top_level\E./ && $stuff[0]->{quoter} =~ /^\Q$_\E/,
136							map $_->{quoter},
137							@stuff
138							)[0];
139							}
140
141							# BITS OF A TEXT LINE
142
143							=head2 set_quote_characters
144
145							Takes a regex (C) matching characters that should indicate a
146							quoted line. By default, a very liberal set is used:
147
148							set_quote_characters(qr/[!#%=\|:]/);
149
150							The character C<< E >> is always recognized as a quoting character.
151
152							If C is provided instead of a regex, only C<< E >> will
153							remain as a quote character.
154
155							Not exported by default, but exportable.
156
157							=cut
158
159							my $separator = qr/[-_]{2,} \| [=#*]{3,} \| [+~]{4,}/x;
160							my ($quotechar, $quotechunk, $quoter);
161
162							set_quote_characters(qr/[!#%=\|:]/);
163
164							sub set_quote_characters {
165	9			9	1	18	$quotechar = shift;
166	9	100				478	$quotechunk = $quotechar
167							? qr/(?!$separator \z)(?:$quotechar(?!\w)\|\w>+)/
168							: qr/(?!$separator \z)\w>+/;
169	9					503	$quoter = qr/$quotechunk(?:[ \t]$quotechunk)/;
170							}
171
172							=head2 combine_hunks
173
174							my $text = combine_hunks( $arrayref_of_hunks );
175
176							Takes the output of C and turns it back into text.
177
178							Not exported by default, but exportable.
179
180							=cut
181
182							sub combine_hunks {
183	13			13	1	12	my ($hunks) = @_;
184
185							join "",
186	13	100				19	map {; ref $_ eq 'HASH' ? "$_->{raw}\n" : combine_hunks($_) } @$hunks;
	35					114
187							}
188
189							sub _classify {
190	20			20		37	my $text = shift;
191	20	100	100			141	return { raw => undef, text => undef, quoter => undef }
192							unless defined $text && length $text;
193							# If the user passes in a null string, we really want to end up with _something_
194
195							# DETABIFY
196	18					121	my @lines = Text::Tabs::expand( split /\n/, $text );
197
198							# PARSE EACH LINE
199	18					1800	foreach (splice @lines) {
200	126					266	my %line = ( raw => $_ );
201	126					1965	@line{'quoter', 'text'} = (/\A ($quoter?) (.?)\s\Z/);
202	126					422	$line{hang} = $hang_package->new( $line{'text'} );
203	126	100	66			915	$line{empty} = 1 if $line{hang}->empty() && $line{'text'} !~ /\S/;
204	126	100				1086	$line{separator} = 1 if $line{text} =~ /\A $separator \Z/o;
205	126					253	push @lines, \%line;
206							}
207
208							# SUBDIVIDE DOCUMENT INTO COHERENT SUBSECTIONS
209
210	18					33	my @chunks;
211	18					41	push @chunks, [ shift @lines ];
212	18					30	foreach my $line (@lines) {
213	108	100	100			578	if ( $line->{separator}
			100
			100
214							\|\| $line->{quoter} ne $chunks[-1][-1]->{quoter}
215							\|\| $line->{empty}
216							\|\| $chunks[-1][-1]->{empty} )
217							{
218	69					101	push @chunks, [$line];
219							}
220							else {
221	39					31	push @{ $chunks[-1] }, $line;
	39					64
222							}
223							}
224
225							# REDIVIDE INTO PARAGRAPHS
226
227	18					21	my @paras;
228	18					29	foreach my $chunk (@chunks) {
229	87					76	my $first = 1;
230	87					56	my $firstfrom;
231	87					86	foreach my $line ( @{$chunk} ) {
	87					88
232	126	100	66			374	if ( $first
			100
233							\|\| $line->{quoter} ne $paras[-1]->{quoter}
234							\|\| $paras[-1]->{separator} )
235							{
236	92					77	push @paras, $line;
237	92					74	$first = 0;
238							# We get warnings from undefined raw and text values if we don't supply alternates
239	92		100			345	$firstfrom = length( $line->{raw} \|\|'' ) - length( $line->{text} \|\| '');
			100
240							}
241							else {
242	34					49	my $extraspace =
243							length( $line->{raw} ) - length( $line->{text} ) - $firstfrom;
244	34					77	$paras[-1]->{text} .= "\n" . q{ } x $extraspace . $line->{text};
245	34					128	$paras[-1]->{raw} .= "\n" . $line->{raw};
246							}
247							}
248							}
249
250							# Reapply hangs
251	18					106	for (grep $_->{'hang'}, @paras) {
252	92	50				419	next unless my $str = (delete $_->{hang})->stringify;
253	0					0	$_->{text} = $str . " " . $_->{text};
254							}
255	18					179	return @paras;
256							}
257
258							=head1 CREDITS
259
260							Most of the heavy lifting is done by a modified version of Damian
261							Conway's C.
262
263							=head1 AUTHOR
264
265							Best Practical Solutions, LLC Emodules@bestpractical.comE
266
267							=head1 LICENSE AND COPYRIGHT
268
269							This software is Copyright (c) 2004-2015 by Best Practical Solutions, LLC
270
271							This library is free software; you can redistribute it and/or modify it
272							under the same terms as Perl itself.
273
274							=cut
275
276							1;