line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package String::ToIdentifier::EN; |
2
|
|
|
|
|
|
|
our $AUTHORITY = 'cpan:AVAR'; |
3
|
|
|
|
|
|
|
$String::ToIdentifier::EN::VERSION = '0.12'; |
4
|
3
|
|
|
3
|
|
205361
|
use 5.008001; |
|
3
|
|
|
|
|
18
|
|
5
|
3
|
|
|
3
|
|
12
|
use strict; |
|
3
|
|
|
|
|
4
|
|
|
3
|
|
|
|
|
55
|
|
6
|
3
|
|
|
3
|
|
12
|
use warnings; |
|
3
|
|
|
|
|
5
|
|
|
3
|
|
|
|
|
62
|
|
7
|
3
|
|
|
3
|
|
1193
|
use Text::Unidecode 'unidecode'; |
|
3
|
|
|
|
|
5938
|
|
|
3
|
|
|
|
|
140
|
|
8
|
3
|
|
|
3
|
|
1245
|
use Lingua::EN::Inflect::Phrase 'to_PL'; |
|
3
|
|
|
|
|
247026
|
|
|
3
|
|
|
|
|
169
|
|
9
|
3
|
|
|
3
|
|
2327
|
use Unicode::UCD 'charinfo'; |
|
3
|
|
|
|
|
69486
|
|
|
3
|
|
|
|
|
210
|
|
10
|
3
|
|
|
3
|
|
22
|
use namespace::clean; |
|
3
|
|
|
|
|
6
|
|
|
3
|
|
|
|
|
23
|
|
11
|
3
|
|
|
3
|
|
643
|
use Exporter 'import'; |
|
3
|
|
|
|
|
5
|
|
|
3
|
|
|
|
|
2197
|
|
12
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
=head1 NAME |
14
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
String::ToIdentifier::EN - Convert Strings to English Program Identifiers |
16
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
=head1 SYNOPSIS |
18
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
use utf8; |
20
|
|
|
|
|
|
|
use String::ToIdentifier::EN 'to_identifier'; |
21
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
to_identifier 'foo-bar'; # fooDashBar |
23
|
|
|
|
|
|
|
to_identifier 'foo-bar', '_'; # foo_dash_bar |
24
|
|
|
|
|
|
|
to_identifier 'foo.bar', '_'; # foo_dot_bar |
25
|
|
|
|
|
|
|
to_identifier "foo\x{4EB0}bar"; # fooJingBar |
26
|
|
|
|
|
|
|
to_identifier "foo\x00bar"; # fooNullCharBar |
27
|
|
|
|
|
|
|
to_identifier "foo\x00\x00bar"; # foo2NullCharsBar |
28
|
|
|
|
|
|
|
to_identifier "foo\x00\x00bar", '_'; # foo_2_null_chars_bar |
29
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
{ |
31
|
|
|
|
|
|
|
no utf8; |
32
|
|
|
|
|
|
|
to_identifier "foo\xFF\xFFbar.baz"; # foo_2_0xFF_BarDotBaz |
33
|
|
|
|
|
|
|
to_identifier "foo\xFF\xFFbar.baz", '_'; # foo_2_0xFF_bar_dot_baz |
34
|
|
|
|
|
|
|
} |
35
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
=head1 DESCRIPTION |
37
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
This module provides a utility method, L</to_identifier>, for converting an |
39
|
|
|
|
|
|
|
arbitrary string into a readable representation using the ASCII subset of C<\w> |
40
|
|
|
|
|
|
|
for use as an identifier in a computer program. The intent is to make unique |
41
|
|
|
|
|
|
|
identifier names from which the content of the original string can be easily |
42
|
|
|
|
|
|
|
inferred by a human just by reading the identifier. |
43
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
If you need the full set of C<\w> including Unicode, see |
45
|
|
|
|
|
|
|
the subclass L<String::ToIdentifier::EN::Unicode>. |
46
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
Currently, this process is one way only, and will likely remain this way. |
48
|
|
|
|
|
|
|
|
49
|
|
|
|
|
|
|
The default is to create camelCase identifiers, or you may pass in a separator |
50
|
|
|
|
|
|
|
char of your choice such as C<_>. |
51
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
Binary char groups will be separated by C<_> even in camelCase identifiers to |
53
|
|
|
|
|
|
|
make them easier to read, e.g.: C<foo_2_0xFF_Bar>. |
54
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
=head1 EXPORT |
56
|
|
|
|
|
|
|
|
57
|
|
|
|
|
|
|
Optionally exports the L</to_identifier> function. |
58
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
=cut |
60
|
|
|
|
|
|
|
|
61
|
|
|
|
|
|
|
our @EXPORT_OK = qw/to_identifier/; |
62
|
|
|
|
|
|
|
|
63
|
|
|
|
|
|
|
=head1 SUBROUTINES |
64
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
=cut |
66
|
|
|
|
|
|
|
|
67
|
|
|
|
|
|
|
# Maps the code point of every ASCII character that is not a letter, digit or
# underscore to the list of English words used to spell it out in an
# identifier.  string_to_identifier joins the words with the separator or
# camelCases them.
our %ASCII_MAP = (
    # C0 control characters
    0x00 => [qw(null)],
    0x01 => [qw(start of heading)],
    0x02 => [qw(start of text)],
    0x03 => [qw(end of text)],
    0x04 => [qw(end of transmission)],
    0x05 => [qw(enquiry char)],
    0x06 => [qw(ack)],
    0x07 => [qw(bell char)],
    0x08 => [qw(backspace)],
    0x09 => [qw(tab char)],
    0x0A => [qw(newline)],
    0x0B => [qw(vertical tab)],
    0x0C => [qw(form feed)],
    0x0D => [qw(carriage return)],
    0x0E => [qw(shift out)],
    0x0F => [qw(shift in)],
    0x10 => [qw(data link escape)],
    0x11 => [qw(device control1)],
    0x12 => [qw(device control2)],
    0x13 => [qw(device control3)],
    0x14 => [qw(device control4)],
    0x15 => [qw(negative ack)],
    0x16 => [qw(synchronous idle)],
    0x17 => [qw(end of transmission block)],
    0x18 => [qw(cancel char)],
    0x19 => [qw(end of medium)],
    0x1A => [qw(substitute char)],
    0x1B => [qw(escape char)],
    0x1C => [qw(file separator)],
    0x1D => [qw(group separator)],
    0x1E => [qw(record separator)],
    0x1F => [qw(unit separator)],

    # space and punctuation below the digits
    0x20 => [qw(space char)],
    0x21 => [qw(exclamation mark)],
    0x22 => [qw(double quote)],
    0x23 => [qw(hash mark)],
    0x24 => [qw(dollar sign)],
    0x25 => [qw(percent sign)],
    0x26 => [qw(ampersand)],
    0x27 => [qw(single quote)],
    0x28 => [qw(left paren)],
    0x29 => [qw(right paren)],
    0x2A => [qw(asterisk)],
    0x2B => [qw(plus sign)],
    0x2C => [qw(comma)],
    0x2D => [qw(dash)],
    0x2E => [qw(dot)],
    0x2F => [qw(slash)],

    # punctuation between the digits and the upper case letters
    0x3A => [qw(colon)],
    0x3B => [qw(semicolon)],
    0x3C => [qw(left angle bracket)],
    0x3D => [qw(equals sign)],
    0x3E => [qw(right angle bracket)],
    0x3F => [qw(question mark)],
    0x40 => [qw(at sign)],

    # punctuation between the upper and lower case letters; 0x5F (underscore)
    # is an identifier character and therefore has no entry
    0x5B => [qw(left bracket)],
    0x5C => [qw(backslash)],
    0x5D => [qw(right bracket)],
    0x5E => [qw(caret)],
    0x60 => [qw(backtick)],

    # punctuation above the lower case letters, and DEL
    0x7B => [qw(left brace)],
    0x7C => [qw(pipe char)],
    0x7D => [qw(right brace)],
    0x7E => [qw(tilde)],
    0x7F => [qw(delete char)],
);

# fixup for perl <= 5.8.3
# NOTE(review): presumably the 0x00 key above does not end up stored under
# "0" on those perls; the explicit assignment is harmless elsewhere.
$ASCII_MAP{0} = [qw(null)];
137
|
|
|
|
|
|
|
|
138
|
|
|
|
|
|
|
=head2 to_identifier |
139
|
|
|
|
|
|
|
|
140
|
|
|
|
|
|
|
Takes the string to be converted to an identifier, and optionally a separator |
141
|
|
|
|
|
|
|
char such as C<_>. If a separator char is not provided, a camelCase identifier |
142
|
|
|
|
|
|
|
will be returned. |
143
|
|
|
|
|
|
|
|
144
|
|
|
|
|
|
|
=cut |
145
|
|
|
|
|
|
|
|
146
|
|
|
|
|
|
|
# Exportable function interface.  Hands all of its arguments (the string and
# the optional separator) unchanged to the string_to_identifier class method,
# always invoked on this package, and returns whatever that method returns.
sub to_identifier {
    my $class = __PACKAGE__;

    return $class->string_to_identifier(@_);
}
149
|
|
|
|
|
|
|
|
150
|
|
|
|
|
|
|
# Pluralization hook.  Receives a phrase of the form "<count> <words>" and
# returns its plural form as produced by Lingua::EN::Inflect::Phrase::to_PL.
# Kept as a separate method so that pluralizations to_PL gets wrong can be
# overridden here, if needed.
sub _pluralize_phrase {
    my $self   = shift;
    my $phrase = shift;

    return to_PL($phrase);
}
157
|
|
|
|
|
|
|
|
158
|
|
|
|
|
|
|
# Returns a compiled regex matching exactly one character that may not appear
# in an identifier, i.e. anything other than an ASCII letter, an ASCII digit
# or the underscore.  Exists as a method for overriding in ::Unicode.
sub _non_identifier_char {
    my $non_identifier = qr/[^0-9a-zA-Z_]/;

    return $non_identifier;
}
162
|
|
|
|
|
|
|
|
163
|
|
|
|
|
|
|
=head1 METHODS |
164
|
|
|
|
|
|
|
|
165
|
|
|
|
|
|
|
=head2 string_to_identifier |
166
|
|
|
|
|
|
|
|
167
|
|
|
|
|
|
|
The class method version of L</to_identifier>, if you want to use the object |
168
|
|
|
|
|
|
|
oriented interface. |
169
|
|
|
|
|
|
|
|
170
|
|
|
|
|
|
|
=cut |
171
|
|
|
|
|
|
|
|
172
|
|
|
|
|
|
|
# Class method doing the actual conversion.
#
# Arguments:
#   $str      - the string to convert
#   $sep_char - optional separator placed between the words of a replacement;
#               any false value (undef, '', '0') selects camelCase output
#
# Every run of one repeated non-identifier character (as defined by
# $self->_non_identifier_char) is replaced by an English phrase:
#   * ASCII characters use the words from %ASCII_MAP;
#   * other characters of a string carrying the UTF-8 flag use the
#     Text::Unidecode transliteration, or the Unicode character name when that
#     contains fewer non-identifier characters;
#   * other characters of a string without the UTF-8 flag become a hex literal
#     such as 0xFF, always separated by '_'.
# Runs longer than one character are prefixed with their length and
# pluralized via _pluralize_phrase.
#
# Returns the converted string.  Croaks if the separator would re-introduce
# the character being replaced, because the conversion could never finish.
sub string_to_identifier {
    my ($self, $str, $sep_char) = @_;

    my $is_utf8 = utf8::is_utf8($str);

    my $char_to_match = $self->_non_identifier_char;

    my $phrase_at_start = 0;

    # $str is modified inside the loop, which resets the m//g search position:
    # every iteration therefore finds the FIRST remaining run, and $pos is an
    # offset into the current (already partly converted) string.
    while ($str =~ /((${char_to_match})\2*)/sg) {
        my $to_replace = $1;
        my $pos        = $-[1];

        my $count = length $to_replace;
        my $char  = substr $to_replace, 0, 1;

        my $replacement_phrase;
        my $use_underscore = 0;

        if (ord($char) < 128) {
            $replacement_phrase = join ' ', @{ $ASCII_MAP{ord $char} };
        }
        elsif ($is_utf8) {
            my $decoded = lcfirst unidecode($char);

            $decoded =~ s/^\s+//;
            $decoded =~ s/\s+\z//;

            (my $decoded_without_spaces = $decoded) =~ s/\s+//g;

            my $bad_chars =()= $decoded_without_spaces =~ /$char_to_match/sg;

            # If Text::Unidecode gives us non-identifier chars, we use
            # either it or the UCD charname, whichever has fewer
            # non-identifier chars, after recursively passing the strings
            # through ->string_to_identifier.
            if ($bad_chars) {
                # charinfo() returns undef for unassigned code points and
                # non-characters; dereferencing that directly would die, so
                # fall back to an empty name and keep the transliteration.
                my $info     = charinfo(ord $char);
                my $charname = lc( ($info && $info->{name}) || '' );

                $charname =~ s/^\s+//;
                $charname =~ s/\s+\z//;

                (my $charname_without_spaces = $charname) =~ s/\s+//g;

                my $charname_bad_chars =()=
                    $charname_without_spaces =~ /$char_to_match/sg;

                $decoded = $charname
                    if length($charname) && $charname_bad_chars < $bad_chars;

                $decoded =
                    join ' ',
                        map $self->string_to_identifier($_),
                            split /\s+/, $decoded;
            }

            $replacement_phrase = $decoded;
        }
        else { # binary
            $replacement_phrase = sprintf '0x%X', ord $char;
            $use_underscore = 1;
        }

        # For single char replacements, no separation or camelcasing is
        # necessary.
        if (length($replacement_phrase) > 1) {
            $phrase_at_start = 1 if $pos == 0;

            $replacement_phrase = $self->_pluralize_phrase("$count $replacement_phrase")
                if $count > 1;

            # Binary replacements are always separated by '_', even in
            # camelCase mode; everything else uses the caller's separator.
            my $effective_sep = $use_underscore ? '_' : $sep_char;

            if ($effective_sep) {
                $replacement_phrase =
                    join($effective_sep, split /\s+/, $replacement_phrase);

                $replacement_phrase = $effective_sep . $replacement_phrase
                    unless $pos == 0;

                # Insert sep_char at the end of replacement text unless
                # position is at the end of the string.
                $replacement_phrase .= $effective_sep
                    unless $pos + length($to_replace) == length($str);

                # The words themselves only contain identifier chars, so the
                # char being replaced can only come back via the separator.
                # Because matching restarts from the beginning of $str, such a
                # separator would be replaced again and again, growing the
                # string without bound; fail loudly instead.
                if (index($replacement_phrase, $char) >= 0) {
                    require Carp;
                    Carp::croak(sprintf
                        'Separator "%s" contains the non-identifier character 0x%X and cannot be used',
                        $effective_sep, ord $char);
                }
            }
            else {
                $replacement_phrase =
                    join '', map "\u$_", split /\s+/, $replacement_phrase;
            }

            # titlecase the following text for camelCase identifiers
            # (decided by the caller's separator, so text following a binary
            # group is still titlecased in camelCase mode)
            substr($str, $pos + length($to_replace), 1) =
                ucfirst substr($str, $pos + length($to_replace), 1)
                    if not $sep_char;
        }
        else {
            # For single char replacements we want to match the case.
            if (substr($str, $pos, 1) =~ /^\p{Lu}\z/) {
                $replacement_phrase = ucfirst $replacement_phrase;
            }
            else {
                $replacement_phrase = lcfirst $replacement_phrase;
            }
        }

        substr($str, $pos, length($to_replace)) = $replacement_phrase;
    }

    # An identifier that begins with a replacement phrase starts lower case.
    $str = lcfirst $str if $phrase_at_start;

    return $str;
}
285
|
|
|
|
|
|
|
|
286
|
|
|
|
|
|
|
=head1 SEE ALSO |
287
|
|
|
|
|
|
|
|
288
|
|
|
|
|
|
|
L<String::ToIdentifier::EN::Unicode>, |
289
|
|
|
|
|
|
|
L<Text::Unidecode>, |
290
|
|
|
|
|
|
|
L<Lingua::EN::Inflect::Phrase> |
291
|
|
|
|
|
|
|
|
292
|
|
|
|
|
|
|
=head1 AUTHOR |
293
|
|
|
|
|
|
|
|
294
|
|
|
|
|
|
|
Rafael Kitover |
295
|
|
|
|
|
|
|
|
296
|
|
|
|
|
|
|
=head1 REPOSITORY |
297
|
|
|
|
|
|
|
|
298
|
|
|
|
|
|
|
L<http://github.com/rkitover/string-toidentifier-en> |
299
|
|
|
|
|
|
|
|
300
|
|
|
|
|
|
|
=head1 LICENSE AND COPYRIGHT |
301
|
|
|
|
|
|
|
|
302
|
|
|
|
|
|
|
Copyright (c) 2018 Rafael Kitover. |
303
|
|
|
|
|
|
|
|
304
|
|
|
|
|
|
|
This program is free software; you can redistribute it and/or modify it |
305
|
|
|
|
|
|
|
under the terms of either: the GNU General Public License as published |
306
|
|
|
|
|
|
|
by the Free Software Foundation; or the Artistic License. |
307
|
|
|
|
|
|
|
|
308
|
|
|
|
|
|
|
See http://dev.perl.org/licenses/ for more information. |
309
|
|
|
|
|
|
|
|
310
|
|
|
|
|
|
|
=cut |
311
|
|
|
|
|
|
|
|
312
|
|
|
|
|
|
|
1; # End of String::ToIdentifier::EN |