| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package Locale::Maketext::Utils::Phrase::Norm::NonBytesStr; |
|
2
|
|
|
|
|
|
|
|
|
3
|
4
|
|
|
4
|
|
3279
|
use strict; |
|
|
4
|
|
|
|
|
12
|
|
|
|
4
|
|
|
|
|
157
|
|
|
4
|
4
|
|
|
4
|
|
32
|
use warnings; |
|
|
4
|
|
|
|
|
8
|
|
|
|
4
|
|
|
|
|
1985
|
|
|
5
|
|
|
|
|
|
|
|
|
6
|
|
|
|
|
|
|
# Find "wide"/non-bytes unicode code point notations in a phrase, wrap each one
# in a visible [comment,…] bracket notation, and record a violation per style.
#
# Argument:
#   $filter - the filter object for the phrase being normalized. This sub only
#             relies on its get_string_sr(), add_violation() and return_value()
#             methods.
#
# Side effects:
#   The string behind the reference returned by get_string_sr() is modified in
#   place. add_violation() is called at most once per notation style, no matter
#   how many occurrences of that style were replaced.
#
# Returns: whatever $filter->return_value returns.
#
# The substitutions are order dependent: two of the later patterns use a
# negative lookbehind on the exact text an earlier replacement produced, so
# that already-wrapped notation is not wrapped (and reported) a second time.
sub normalize_maketext_string {
    my ($filter) = @_;

    my $string_sr = $filter->get_string_sr();

    # \x{NNNN…}
    if ( ${$string_sr} =~ s/(\\x\{[0-9a-fA-F]+\})/[comment,non bytes unicode string “$1”]/g ) {
        $filter->add_violation('non-bytes string (perl)');
    }

    # \N{…} see `perldoc charnames`
    if ( ${$string_sr} =~ s/(\\N\{[^}]+\})/[comment,charnames.pm type string “$1”]/g ) {
        $filter->add_violation('charnames.pm string notation');
    }

    # u"\uNNNN…"
    # The surrounding ASCII quotes are dropped from the replacement, so the
    # alternate-style pattern further down cannot match inside this comment.
    if ( ${$string_sr} =~ s/([uU])(["'])(\\[uU][0-9a-fA-F]+)\2/[comment,unicode notation “$1“$3””]/g ) {
        $filter->add_violation('unicode code point notation (Python style)');
    }

    # \uNNNN…
    # The lookbehind skips the \uNNNN that the Python style step just wrapped.
    if ( ${$string_sr} =~ s/(?<!\[comment,unicode notation “[uU]“)(\\[uU][0-9a-fA-F]+)/[comment,unicode notation “$1”]/g ) {
        $filter->add_violation('unicode code point notation (C/C++/Java style)');
    }

    # X'NNNN…'
    # U'NNNN…'
    if ( ${$string_sr} =~ s/(?:([XxUn])(["'])([0-9a-fA-F]+)\2)/[comment,unicode notation “$1‘$3’”]/g ) {
        $filter->add_violation('unicode code point notation (alternate style)');
    }

    # U+NNNN…
    # The lookbehind skips a U+NNNN that sits directly inside an already
    # wrapped \N{U+NNNN} from the charnames step.
    if ( ${$string_sr} =~ s/(?<!\[comment,charnames\.pm type string “\\N\{)([Uu]\+[0-9a-fA-F]+)/[comment,unicode notation “$1”]/g ) {
        $filter->add_violation('unicode code point notation (visual notation style)');    # TODO: [output,codepoint,NNNN]
    }

    # UxNNNN…
    # Only a standalone token counts: without the two boundary assertions this
    # pattern also matched inside ordinary words ("Luxembourg", "deluxe",
    # "tuxedo", "Uxbridge"), corrupting the phrase and raising a bogus violation.
    if ( ${$string_sr} =~ s/(?<![0-9A-Za-z])([Uu]x[0-9a-fA-F]+)(?![0-9A-Za-z])/[comment,unicode notation “$1”]/g ) {
        $filter->add_violation('unicode code point notation (visual notation type 2 style)');    # TODO: [output,codepoint,NNNN]
    }

    return $filter->return_value;
}
|
49
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
1; |
|
51
|
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
__END__ |
|
53
|
|
|
|
|
|
|
|
|
54
|
|
|
|
|
|
|
=encoding utf-8 |
|
55
|
|
|
|
|
|
|
|
|
56
|
|
|
|
|
|
|
=head1 Normalization |
|
57
|
|
|
|
|
|
|
|
|
58
|
|
|
|
|
|
|
We only want bytes strings and not “wide” unicode code point notation. |
|
59
|
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
=head2 Rationale |
|
61
|
|
|
|
|
|
|
|
|
62
|
|
|
|
|
|
|
This helps give consistency, clarity, and simplicity. |
|
63
|
|
|
|
|
|
|
|
|
64
|
|
|
|
|
|
|
=over 4 |
|
65
|
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
=item * Having one standard means no one has to guess/lookup what it is they are looking at or how they are expected to do it. |
|
67
|
|
|
|
|
|
|
|
|
68
|
|
|
|
|
|
|
=item * When harvesting phrases we avoid having to deal with interpolating in order to get the correct key to look up. |
|
69
|
|
|
|
|
|
|
|
|
70
|
|
|
|
|
|
|
L<Text::Extract::MaketextCallPhrases> will handle it correctly for perl notation but what if you’re not parsing perl code? |
|
71
|
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
=item * At run time we avoid potential key to look up problems. |
|
73
|
|
|
|
|
|
|
|
|
74
|
|
|
|
|
|
|
=item * Avoids many encoding/decoding issue complexities. |
|
75
|
|
|
|
|
|
|
|
|
76
|
|
|
|
|
|
|
=item * Using unicode code point notation adds a layer of complexity that hinders translators and thus makes room for lower quality translations. |
|
77
|
|
|
|
|
|
|
|
|
78
|
|
|
|
|
|
|
=item * In perl, there's no really good way to combine the use of bytes strings and unicode strings without issues. If we use bytes strings everything just works. |
|
79
|
|
|
|
|
|
|
|
|
80
|
|
|
|
|
|
|
Of course, using unicode strings when you need to operate under character semantics is the appropriate thing to do and newer perls have really great tools for that. |
|
81
|
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
However, for localization we are essentially looking up and passing through without examination or collation modifications. So bytes is the way to go for phrases! |
|
83
|
|
|
|
|
|
|
|
|
84
|
|
|
|
|
|
|
=item * Many things you might want to do with a phrase require it be bytes. |
|
85
|
|
|
|
|
|
|
|
|
86
|
|
|
|
|
|
|
You get garbled data when output to browser, file, database, or terminal. |
|
87
|
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
Various hashing and encrypting operate on bytes (using a unicode string can be fatal or you silently get unexpected data). |
|
89
|
|
|
|
|
|
|
|
|
90
|
|
|
|
|
|
|
=back |
|
91
|
|
|
|
|
|
|
|
|
92
|
|
|
|
|
|
|
Solution: You can simply use the character itself or a bracket notation method for the handful of markup related or visually special characters. |
|
93
|
|
|
|
|
|
|
|
|
94
|
|
|
|
|
|
|
=head1 possible violations |
|
95
|
|
|
|
|
|
|
|
|
96
|
|
|
|
|
|
|
If you get false positives then that only goes to help highlight how ambiguity adds to the reason to avoid non-bytes strings! |
|
97
|
|
|
|
|
|
|
|
|
98
|
|
|
|
|
|
|
Note that HTML Entities are not addressed here since the unicode notation as well as other syntax is covered via L<Ampersand|Locale::Maketext::Utils::Phrase::Norm::Ampersand>. |
|
99
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
=over 4 |
|
101
|
|
|
|
|
|
|
|
|
102
|
|
|
|
|
|
|
=item non-bytes string (perl) |
|
103
|
|
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
This means you have something like \x{NNNN} and need to use the character itself instead. |
|
105
|
|
|
|
|
|
|
|
|
106
|
|
|
|
|
|
|
These will be turned into ‘[comment,non bytes unicode string “\x{NNNN}”]’ (where NNNN is the Unicode code point) so you can find them visually. |
|
107
|
|
|
|
|
|
|
|
|
108
|
|
|
|
|
|
|
=item charnames.pm string notation |
|
109
|
|
|
|
|
|
|
|
|
110
|
|
|
|
|
|
|
This means you have something like \N{…} and need to use the character itself instead. |
|
111
|
|
|
|
|
|
|
|
|
112
|
|
|
|
|
|
|
These will be turned into ‘[comment,charnames.pm type string “\N{…}”]’ so you can find them visually. |
|
113
|
|
|
|
|
|
|
|
|
114
|
|
|
|
|
|
|
=item unicode code point notation (C/C++/Java style) |
|
115
|
|
|
|
|
|
|
|
|
116
|
|
|
|
|
|
|
This means you have something like \uNNNN and need to use the character itself instead. |
|
117
|
|
|
|
|
|
|
|
|
118
|
|
|
|
|
|
|
These will be turned into ‘[comment,unicode notation “\uNNNN”]’ (where NNNN is the Unicode code point) so you can find them visually. |
|
119
|
|
|
|
|
|
|
|
|
120
|
|
|
|
|
|
|
=item unicode code point notation (alternate style) |
|
121
|
|
|
|
|
|
|
|
|
122
|
|
|
|
|
|
|
This means you have something like U'NNNN' and need to use the character itself instead. |
|
123
|
|
|
|
|
|
|
|
|
124
|
|
|
|
|
|
|
These will be turned into ‘[comment,unicode notation “U‘NNNN’”]’ (where NNNN is the Unicode code point) so you can find them visually. |
|
125
|
|
|
|
|
|
|
|
|
126
|
|
|
|
|
|
|
=item unicode code point notation (visual notation style) |
|
127
|
|
|
|
|
|
|
|
|
128
|
|
|
|
|
|
|
This means you have something like U+NNNN and need to use the character itself instead. |
|
129
|
|
|
|
|
|
|
|
|
130
|
|
|
|
|
|
|
These will be turned into ‘[comment,unicode notation “U+NNNN”]’ (where NNNN is the Unicode code point) so you can find them visually. |
|
131
|
|
|
|
|
|
|
|
|
132
|
|
|
|
|
|
|
=item unicode code point notation (visual notation type 2 style) |
|
133
|
|
|
|
|
|
|
|
|
134
|
|
|
|
|
|
|
This means you have something like UxNNNN and need to use the character itself instead. |
|
135
|
|
|
|
|
|
|
|
|
136
|
|
|
|
|
|
|
These will be turned into ‘[comment,unicode notation “UxNNNN”]’ (where NNNN is the Unicode code point) so you can find them visually. |
|
137
|
|
|
|
|
|
|
|
|
138
|
|
|
|
|
|
|
=item unicode code point notation (Python style) |
|
139
|
|
|
|
|
|
|
|
|
140
|
|
|
|
|
|
|
This means you have something like u"\uNNNN" and need to use the character itself instead. |
|
141
|
|
|
|
|
|
|
|
|
142
|
|
|
|
|
|
|
These will be turned into ‘[comment,unicode notation “u“\uNNNN””]’ (where NNNN is the Unicode code point) so you can find them visually. |
|
143
|
|
|
|
|
|
|
|
|
144
|
|
|
|
|
|
|
=back |
|
145
|
|
|
|
|
|
|
|
|
146
|
|
|
|
|
|
|
=head1 possible warnings |
|
147
|
|
|
|
|
|
|
|
|
148
|
|
|
|
|
|
|
None |