File Coverage

blib/lib/Locale/Maketext/Utils/Phrase/Norm/WhiteSpace.pm

Criterion	Covered	Total	%
statement	35	35	100.0
branch	10	12	83.3
condition	2	3	66.6
subroutine	4	4	100.0
pod	0	1	0.0
total	51	55	92.7

line	stmt	bran	cond	sub	pod	time	code
1							package Locale::Maketext::Utils::Phrase::Norm::WhiteSpace;
2
3	4			4		2481	use strict;
	4					9
	4					144
4	4			4		17	use warnings;
	4					18
	4					170
5
6	4			4		18	use Encode ();
	4					7
	4					2767
7
8							my $space_and_no_break_space = qr/(?:\x20\|\xc2\xa0)/;
9
10							# regex is made from the Unicode code points from: `unichars '\p{WhiteSpace}'` (sans SPACE and NO-BREAK SPACE)
11							my $disallowed_whitespace = qr/(?:\x09\|\x0a\|\x0b\|\x0c\|\x0d\|\xc2\x85\|\xe1\x9a\x80\|\xe1\xa0\x8e\|\xe2\x80\x80\|\xe2\x80\x81\|\xe2\x80\x82\|\xe2\x80\x83\|\xe2\x80\x84\|\xe2\x80\x85\|\xe2\x80\x86\|\xe2\x80\x87\|\xe2\x80\x88\|\xe2\x80\x89\|\xe2\x80\x8a\|\xe2\x80\xa8\|\xe2\x80\xa9\|\xe2\x80\xaf\|\xe2\x81\x9f\|\xe3\x80\x80)/;
12
13							# regex is made from the Unicode code points from: `uninames invisible`
14							my $invisible = qr/(?:\xe2\x80\x8b\|\xe2\x81\xa2\|\xe2\x81\xa3\|\xe2\x81\xa4)/;
15
16							# regex is made from the Unicode code points from: `unichars '\p{Control}'`
17							my $control =
18							qr/(?:\x00\|\x01\|\x02\|\x03\|\x04\|\x05\|\x06\|\x07\|\x08\|\x09\|\x0a\|\x0b\|\x0c\|\x0d\|\x0e\|\x0f\|\x10\|\x11\|\x12\|\x13\|\x14\|\x15\|\x16\|\x17\|\x18\|\x19\|\x1a\|\x1b\|\x1c\|\x1d\|\x1e\|\x1f\|\x7f\|\xc2\x80\|\xc2\x81\|\xc2\x82\|\xc2\x83\|\xc2\x84\|\xc2\x85\|\xc2\x86\|\xc2\x87\|\xc2\x88\|\xc2\x89\|\xc2\x8a\|\xc2\x8b\|\xc2\x8c\|\xc2\x8d\|\xc2\x8e\|\xc2\x8f\|\xc2\x90\|\xc2\x91\|\xc2\x92\|\xc2\x93\|\xc2\x94\|\xc2\x95\|\xc2\x96\|\xc2\x97\|\xc2\x98\|\xc2\x99\|\xc2\x9a\|\xc2\x9b\|\xc2\x9c\|\xc2\x9d\|\xc2\x9e\|\xc2\x9f)/;
19
20							sub normalize_maketext_string {
21	73			73	0	180	my ($filter) = @_;
22
23	73					229	my $string_sr = $filter->get_string_sr();
24
25							# detect any whitespace-ish characters that are not ' ' or "\xC2\xA0" (non-break-space)
26	73	100				249	if ( ${$string_sr} =~ s/($disallowed_whitespace\|$invisible\|$control)/my $uh=sprintf('%04X', unpack('U',Encode::decode_utf8($1)));"[comment,invalid char Ux$uh]"/exmsg ) {
	73					2282
	24					150
	24					235
27	8					21	$filter->add_violation('Invalid whitespace, control, or invisible characters');
28							}
29
30							# The only WS possible after that is $space_and_no_break_space
31
32							# remove beginning and trailing white space
33	73	100	66			171	if ( ${$string_sr} !~ m/\A \xE2\x80\xA6/ms && ${$string_sr} =~ s/\A($space_and_no_break_space+)//xms ) {
	73					362
	73					765
34	8					19	my $startswith = $1;
35	8	50				31	if ( substr( ${$string_sr}, 0, 3 ) eq "\xE2\x80\xA6" ) {
	8					27
36	8	50				22	if ( $startswith =~ m/\xc2\xa0/ ) {
37	8					17	$filter->add_violation('Beginning ellipsis space should be a normal space');
38							}
39	8					12	${$string_sr} = " ${$string_sr}";
	8					16
	8					11
40							}
41
42	8					17	$filter->add_violation('Beginning white space');
43
44							}
45
46	73	100				147	if ( ${$string_sr} =~ s/(?:$space_and_no_break_space)+\z//xms ) {
	73					1200
47	8					22	$filter->add_violation('Trailing white space');
48							}
49
50							# collapse internal white space into a single space
51	73	100				166	if ( ${$string_sr} =~ s/$space_and_no_break_space{2,}/ /xms ) {
	73					962
52	8					20	$filter->add_violation('Multiple internal white space');
53							}
54
55	73					283	return $filter->return_value;
56							}
57
58							1;
59
60							__END__
61
62							=encoding utf-8
63
64							=head1 Normalization
65
66							The only single white space characters allowed are normal space and non-break-space.
67
68							=head2 Rationale
69
70							=over 4
71
72							=item * A tiny change in white-space[-ish] characters will make a phrase lookup fail erroneously.
73
74							=item * The only other purpose of allowing characters like this would be formatting which should not be part of a phrase.
75
76							=over 4
77
78							=item * Such formatting is not applicable to all contexts (e.g. HTML)
79
80							=item * Since it is not a translatable entity, translators are likely to miss it and break your format.
81
82							=item * Same text with different formatting becomes a new, redundant, phrase.
83
84							=back
85
86							Doing internal formatting via bracket notation’s output() methods addresses the first two completely and the third one most of the time (it can be “completely” if you give it a little thought first).
87
88							=item * It is easy for a developer to miss the subtle difference and get it wrong.
89
90							=item * Surrounding whitespace is likely a sign that partial phrases are in use.
91
92							=back
93
94							That being the case, we simplify consistently by using single space and non-break-space characters inside the string
95							(and at the beginning if it starts with an L<ellipsis|Locale::Maketext::Utils::Phrase::Norm::Ellipsis>).
96
97							=head2 possible violations
98
99							=over 4
100
101							=item Invalid whitespace, control, or invisible characters
102
103							The string contains white space characters besides space and non-break-space, invisible characters, or control characters.
104
105							These will be turned into “[comment,invalid char UxNNNN]” (where NNNN is the Unicode code point) so you can find them visually.
106
107							=item Beginning white space
108
109							These are removed.
110
111							This accounts for strings beginning with an ellipsis which should be preceded by one space.
112
113							=item Beginning ellipsis space should be a normal space
114
115							If a string starts with an ellipsis, the space preceding it should be a normal space. A non-break-space implies formatting or concatenation of 2 partial phrases, ick!
116
117							=item Trailing white space
118
119							These are removed.
120
121							=item Multiple internal white space
122
123							These are collapsed into a single space.
124
125							=back
126
127							=head2 possible warnings
128
129							None