File Coverage

blib/lib/Mail/Pyzor/Digest/Pieces.pm

Criterion	Covered	Total	%
statement	36	77	46.7
branch	4	14	28.5
condition	3	9	33.3
subroutine	10	14	71.4
pod	6	6	100.0
total	59	120	49.1

line	stmt	bran	cond	sub	pod	time	code
1							package Mail::Pyzor::Digest::Pieces;
2
3							# Copyright 2018 cPanel, LLC.
4							# All rights reserved.
5							# http://cpanel.net
6							#
7							# This is free software; you can redistribute it and/or modify it under the
8							# Apache 2.0 license.
9
10	2			2		262822	use strict;
	2					12
	2					64
11	2			2		10	use warnings;
	2					5
	2					72
12
13							=encoding utf-8
14
15							=head1 NAME
16
17							Mail::Pyzor::Digest::Pieces
18
19							=head1 DESCRIPTION
20
21							This module houses backend logic for L.
22
23							It reimplements logic found in pyzor’s F module
24							(L).
25
26							=cut
27
28							#----------------------------------------------------------------------
29
30	2			2		13	use Email::MIME::ContentType ();
	2					4
	2					39
31	2			2		11	use Encode ();
	2					6
	2					61
32
33							# each tuple is [ offset, length ]
34	2			2		10	use constant _HASH_SPEC => ( [ 20, 3 ], [ 60, 3 ] );
	2					6
	2					229
35
36							use constant {
37	2					1328	_MIN_LINE_LENGTH => 8,
38
39							_ATOMIC_NUM_LINES => 4,
40	2			2		13	};
	2					6
41
42							#----------------------------------------------------------------------
43
44							=head1 FUNCTIONS
45
46							=head2 $strings_ar = digest_payloads( $EMAIL_MIME )
47
48							This imitates the corresponding object method in F.
49							It returns a reference to an array of strings. Each string can be either
50							a byte string or a character string (e.g., UTF-8 decoded).
51
52							NB: RFC 2822 stipulates that message bodies should use CRLF
53							line breaks, not plain LF (nor plain CR). L
54							will thus convert any plain CRs in a quoted-printable message
55							body into CRLF. Python, though, doesn’t do this, so the output of
56							our implementation of C diverges from that of the Python
57							original. It doesn’t ultimately make a difference since the line-ending
58							whitespace gets trimmed regardless, but it’s necessary to factor in when
59							comparing the output of our implementation with the Python output.
60
61							=cut
62
63							sub digest_payloads {
64	0			0	1	0	my ($parsed) = @_;
65
66	0					0	my @subparts = $parsed->subparts();
67
68	0					0	my @payloads;
69
70	0	0				0	if (@subparts) {
71	0					0	@payloads = map { @{ digest_payloads($_) } } $parsed->subparts();
	0					0
	0					0
72							}
73							else {
74	0					0	my ( $main_type, $subtype, $encoding, $encode_check ) = parse_content_type( $parsed->content_type() );
75
76	0					0	my $payload;
77
78	0	0				0	if ( $main_type eq 'text' ) {
79
80							# Decode transfer encoding, but leave us as a byte string.
81							# Note that this is where Email::MIME converts plain LF to CRLF.
82	0					0	$payload = $parsed->body();
83
84							# This does the actual character decoding (i.e., “charset”).
85	0					0	$payload = Encode::decode( $encoding, $payload, $encode_check );
86
87	0	0				0	if ( $subtype eq 'html' ) {
88	0					0	require Mail::Pyzor::Digest::StripHtml;
89	0					0	$payload = Mail::Pyzor::Digest::StripHtml::strip($payload);
90							}
91							}
92							else {
93
94							# This does no decoding, even of, e.g., quoted-printable or base64.
95	0					0	$payload = $parsed->body_raw();
96							}
97
98	0					0	push @payloads, $payload;
99							}
100
101	0					0	return \@payloads;
102							}
103
104							#----------------------------------------------------------------------
105
106							=head2 normalize( $STRING )
107
108							This imitates the corresponding object method in F.
109							It modifies C<$STRING> in-place.
110
111							As with the original implementation, if C<$STRING> contains (decoded)
112							Unicode characters, those characters will be parsed accordingly. So:
113
114							$str = "123\xc2\xa0"; # [ c2 a0 ] == \u00a0, non-breaking space
115
116							normalize($str);
117
118							The above will leave C<$str> alone, but this:
119
120							utf8::decode($str);
121
122							normalize($str);
123
124							… will trim off the last two bytes from C<$str>.
125
126							=cut
127
128							sub normalize { ## no critic qw( Subroutines::RequireArgUnpacking )
129
130							# NULs are bad, mm-kay?
131	0			0	1	0	$_[0] =~ tr<\0><>d;
132
133							# NB: Python’s \s without re.UNICODE is the same as Perl’s \s
134							# with the /a modifier.
135							#
136							# https://docs.python.org/2/library/re.html
137							# https://perldoc.perl.org/perlrecharclass.html#Backslash-sequences
138
139							# Python: re.compile(r'\S{10,}')
140	0					0	$_[0] =~ s<\S{10,}><>ag;
141
142							# Python: re.compile(r'\S+@\S+')
143	0					0	$_[0] =~ s<\S+ @ \S+><>agx;
144
145							# Python: re.compile(r'[a-z]+:\S+', re.IGNORECASE)
146	0					0	$_[0] =~ s<[a-zA-Z]+ : \S+><>agx;
147
148							# (from digest.py …)
149							# Make sure we do the whitespace last because some of the previous
150							# patterns rely on whitespace.
151	0					0	$_[0] =~ tr< \x09-\x0d><>d;
152
153							# This is fun. digest.py’s normalize() does a non-UNICODE whitespace
154							# strip, then calls strip() on the string, which will strip Unicode
155							# whitespace from the ends.
156	0					0	$_[0] =~ s<\A\s+><>;
157	0					0	$_[0] =~ s<\s+\z><>;
158
159	0					0	return;
160							}
161
162							#----------------------------------------------------------------------
163
164							=head2 $yn = should_handle_line( $STRING )
165
166							This imitates the corresponding object method in F.
167							It returns a boolean.
168
169							=cut
170
171							sub should_handle_line {
172	6		100	6	1	24879	return $_[0] && length( $_[0] ) >= _MIN_LINE_LENGTH();
173							}
174
175							#----------------------------------------------------------------------
176
177							=head2 $sr = assemble_lines( \@LINES )
178
179							This assembles a string buffer out of @LINES. The string is the buffer
180							of octets that will be hashed to produce the message digest.
181
182							Each member of @LINES is expected to be an B, not a
183							character string.
184
185							=cut
186
187							sub assemble_lines {
188	4			4	1	7311	my ($lines_ar) = @_;
189
190	4	100				13	if ( @$lines_ar <= _ATOMIC_NUM_LINES() ) {
191
192							# cf. handle_atomic() in digest.py
193	1					8	return \join( q<>, @$lines_ar );
194							}
195
196							#----------------------------------------------------------------------
197							# cf. handle_atomic() in digest.py
198
199	3					6	my $str = q<>;
200
201	3					8	for my $ofs_len ( _HASH_SPEC() ) {
202	6					12	my ( $offset, $length ) = @$ofs_len;
203
204	6					14	for my $i ( 0 .. ( $length - 1 ) ) {
205	18					34	my $idx = int( $offset * @$lines_ar / 100 ) + $i;
206
207	18	100				44	next if !defined $lines_ar->[$idx];
208
209	17					32	$str .= $lines_ar->[$idx];
210							}
211							}
212
213	3					19	return \$str;
214							}
215
216							#----------------------------------------------------------------------
217
218							=head2 ($main, $sub, $encoding, $checkval) = parse_content_type( $CONTENT_TYPE )
219
220							=cut
221
222	2					145	use constant _QUOTED_PRINTABLE_NAMES => (
223							"quopri-codec",
224							"quopri",
225							"quoted-printable",
226							"quotedprintable",
227	2			2		17	);
	2					5
228
229							# Make Encode::decode() ignore anything that doesn’t fit the
230							# given encoding.
231	2			2		15	use constant _encode_check_ignore => q<>;
	2					3
	2					597
232
233							sub parse_content_type {
234	0			0	1		my ($content_type) = @_;
235
236	0						my $ct_parse = Email::MIME::ContentType::parse_content_type(
237							$content_type,
238							);
239
240	0		0				my $main = $ct_parse->{'type'} \|\| q<>;
241	0		0				my $sub = $ct_parse->{'subtype'} \|\| q<>;
242
243	0						my $encoding = $ct_parse->{'attributes'}{'charset'};
244
245	0						my $checkval;
246
247	0	0					if ($encoding) {
248
249							# Lower-case everything, convert underscore to dash, and remove NUL.
250	0						$encoding =~ trd;
251
252							# Apparently pyzor accommodates messages that put the transfer
253							# encoding in the Content-Type.
254	0	0					if ( grep { $_ eq $encoding } _QUOTED_PRINTABLE_NAMES() ) {
	0
255	0						$checkval = Encode::FB_CROAK();
256							}
257							}
258							else {
259	0						$encoding = 'ascii';
260							}
261
262							# Match Python .decode()’s 'ignore' behavior
263	0		0				$checkval \|\|= \&_encode_check_ignore;
264
265	0						return ( $main, $sub, $encoding, $checkval );
266							}
267
268							#----------------------------------------------------------------------
269
270							=head2 @lines = splitlines( $TEXT )
271
272							Imitates C. (cf. C)
273
274							Returns a plain list in list context. Returns the number of
275							items to be returned in scalar context.
276
277							=cut
278
279							sub splitlines {
280	0			0	1		return split m<\r\n?\|\n>, $_[0];
281							}
282
283							1;