line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Text::Fy::Utils; |
2
|
|
|
|
|
|
|
|
3
|
1
|
|
|
1
|
|
605
|
use 5.020; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
33
|
|
4
|
1
|
|
|
1
|
|
7
|
use warnings; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
43
|
|
5
|
|
|
|
|
|
|
|
6
|
1
|
|
|
1
|
|
16
|
use Carp;
|
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
60
|
|
7
|
1
|
|
|
1
|
|
793
|
use Unicode::Normalize;
|
|
1
|
|
|
|
|
1761
|
|
|
1
|
|
|
|
|
63
|
|
8
|
1
|
|
|
1
|
|
483
|
use Encode qw(encode decode);
|
|
1
|
|
|
|
|
8132
|
|
|
1
|
|
|
|
|
696
|
|
9
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
require Exporter; |
11
|
|
|
|
|
|
|
our @ISA = qw(Exporter); |
12
|
|
|
|
|
|
|
our @EXPORT = qw(); |
13
|
|
|
|
|
|
|
our @EXPORT_OK = qw(asciify isoify simplify commify cv_to_win cv_from_win); |
14
|
|
|
|
|
|
|
our $VERSION = '0.08'; |
15
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
my %cp1252_to_uni;
|
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
for (128..159) {
|
19
|
|
|
|
|
|
|
$cp1252_to_uni{chr($_)} = decode('cp1252', chr($_));
|
20
|
|
|
|
|
|
|
}
|
21
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
my %uni_to_ascii = (
|
23
|
|
|
|
|
|
|
"\x{20ac}" => q{E},
|
24
|
|
|
|
|
|
|
"\x{201a}" => q{,},
|
25
|
|
|
|
|
|
|
"\x{0192}" => q{f},
|
26
|
|
|
|
|
|
|
"\x{201e}" => q{"},
|
27
|
|
|
|
|
|
|
"\x{2026}" => q{_},
|
28
|
|
|
|
|
|
|
"\x{2020}" => q{+},
|
29
|
|
|
|
|
|
|
"\x{02c6}" => q{^},
|
30
|
|
|
|
|
|
|
"\x{2030}" => q{%},
|
31
|
|
|
|
|
|
|
# "\x{0160}" => q{S},
|
32
|
|
|
|
|
|
|
"\x{2039}" => q{<},
|
33
|
|
|
|
|
|
|
"\x{0152}" => q{O},
|
34
|
|
|
|
|
|
|
# "\x{017d}" => q{Z},
|
35
|
|
|
|
|
|
|
"\x{2018}" => q{'},
|
36
|
|
|
|
|
|
|
"\x{2019}" => q{'},
|
37
|
|
|
|
|
|
|
"\x{201c}" => q{"},
|
38
|
|
|
|
|
|
|
"\x{201d}" => q{"},
|
39
|
|
|
|
|
|
|
"\x{2022}" => q{.},
|
40
|
|
|
|
|
|
|
"\x{2013}" => q{-},
|
41
|
|
|
|
|
|
|
"\x{2014}" => q{-},
|
42
|
|
|
|
|
|
|
"\x{20dc}" => q{~},
|
43
|
|
|
|
|
|
|
# "\x{0161}" => q{s},
|
44
|
|
|
|
|
|
|
"\x{203a}" => q{>},
|
45
|
|
|
|
|
|
|
"\x{203a}" => q{>},
|
46
|
|
|
|
|
|
|
"\x{0153}" => q{o},
|
47
|
|
|
|
|
|
|
# "\x{017e}" => q{z},
|
48
|
|
|
|
|
|
|
# "\x{017e}" => q{Y},
|
49
|
|
|
|
|
|
|
"\x{00a1}" => q{!},
|
50
|
|
|
|
|
|
|
"\x{00a2}" => q{c},
|
51
|
|
|
|
|
|
|
"\x{00a3}" => q{L},
|
52
|
|
|
|
|
|
|
"\x{00a5}" => q{Y},
|
53
|
|
|
|
|
|
|
"\x{00a6}" => q{|},
|
54
|
|
|
|
|
|
|
"\x{00a9}" => q{C},
|
55
|
|
|
|
|
|
|
"\x{00aa}" => q{a},
|
56
|
|
|
|
|
|
|
"\x{00ab}" => q{"},
|
57
|
|
|
|
|
|
|
"\x{00ac}" => q{-},
|
58
|
|
|
|
|
|
|
"\x{00ad}" => q{-},
|
59
|
|
|
|
|
|
|
"\x{00ae}" => q{R},
|
60
|
|
|
|
|
|
|
"\x{00b2}" => q{2},
|
61
|
|
|
|
|
|
|
"\x{00b3}" => q{3},
|
62
|
|
|
|
|
|
|
"\x{00b4}" => q{'},
|
63
|
|
|
|
|
|
|
"\x{00b7}" => q{.},
|
64
|
|
|
|
|
|
|
"\x{00b9}" => q{1},
|
65
|
|
|
|
|
|
|
"\x{00ba}" => q{0},
|
66
|
|
|
|
|
|
|
"\x{00bb}" => q{"},
|
67
|
|
|
|
|
|
|
"\x{00bf}" => q{?},
|
68
|
|
|
|
|
|
|
"\x{00c6}" => q{A},
|
69
|
|
|
|
|
|
|
"\x{00d7}" => q{x},
|
70
|
|
|
|
|
|
|
"\x{00d8}" => q{O},
|
71
|
|
|
|
|
|
|
"\x{00df}" => q{s},
|
72
|
|
|
|
|
|
|
"\x{00e6}" => q{a},
|
73
|
|
|
|
|
|
|
"\x{00f0}" => q{d},
|
74
|
|
|
|
|
|
|
"\x{00f8}" => q{o},
|
75
|
|
|
|
|
|
|
);
|
76
|
|
|
|
|
|
|
|
77
|
|
|
|
|
|
|
my %uni_to_iso = (
|
78
|
|
|
|
|
|
|
"\x{2013}" => q{-},
|
79
|
|
|
|
|
|
|
"\x{2014}" => q{-},
|
80
|
|
|
|
|
|
|
"\x{2018}" => q{'},
|
81
|
|
|
|
|
|
|
"\x{2019}" => q{'},
|
82
|
|
|
|
|
|
|
"\x{201c}" => q{"},
|
83
|
|
|
|
|
|
|
"\x{201d}" => q{"},
|
84
|
|
|
|
|
|
|
"\x{2026}" => q{_},
|
85
|
|
|
|
|
|
|
);
|
86
|
|
|
|
|
|
|
|
87
|
|
|
|
|
|
|
my $convert_c2u = _make_tr(\%cp1252_to_uni);
|
88
|
|
|
|
|
|
|
my $convert_u2c = _make_tr(\%cp1252_to_uni, 'R');
|
89
|
|
|
|
|
|
|
my $convert_u2a = _make_tr(\%uni_to_ascii);
|
90
|
|
|
|
|
|
|
my $convert_u2i = _make_tr(\%uni_to_iso);
|
91
|
|
|
|
|
|
|
|
92
|
|
|
|
|
|
|
sub _make_tr {
|
93
|
4
|
|
|
4
|
|
4
|
my ($href, $rev) = @_;
|
94
|
|
|
|
|
|
|
|
95
|
4
|
|
|
|
|
44
|
my $from = join '', map { sprintf '\x{%04x}', ord($_) } sort keys %$href;
|
|
117
|
|
|
|
|
152
|
|
96
|
4
|
|
|
|
|
37
|
my $to = join '', map { sprintf '\x{%04x}', ord($href->{$_}) } sort keys %$href;
|
|
117
|
|
|
|
|
158
|
|
97
|
|
|
|
|
|
|
|
98
|
4
|
100
|
|
|
|
29
|
my $code = 'sub { $_[0] =~ '.($rev ? "tr/$to/$from/" : "tr/$from/$to/").'; }';
|
99
|
|
|
|
|
|
|
|
100
|
4
|
50
|
|
|
|
206
|
eval $code or die "Can't compile >$code< because $@";
|
101
|
|
|
|
|
|
|
}
|
102
|
|
|
|
|
|
|
|
103
|
|
|
|
|
|
|
sub asciify {
|
104
|
1
|
|
|
1
|
0
|
18
|
_aconvert($_[0], 0, 0);
|
105
|
|
|
|
|
|
|
}
|
106
|
|
|
|
|
|
|
|
107
|
|
|
|
|
|
|
sub isoify {
|
108
|
1
|
|
|
1
|
0
|
3
|
_aconvert($_[0], 1, 0);
|
109
|
|
|
|
|
|
|
}
|
110
|
|
|
|
|
|
|
|
111
|
|
|
|
|
|
|
sub simplify {
|
112
|
1
|
|
|
1
|
0
|
4
|
_aconvert($_[0], 2, 0);
|
113
|
|
|
|
|
|
|
}
|
114
|
|
|
|
|
|
|
|
115
|
|
|
|
|
|
|
sub _aconvert {
|
116
|
9
|
|
|
9
|
|
390
|
my ($text, $loc_m, $loc_w) = @_;
|
117
|
|
|
|
|
|
|
|
118
|
9
|
|
|
|
|
201
|
$convert_u2i->($text);
|
119
|
|
|
|
|
|
|
|
120
|
9
|
100
|
|
|
|
19
|
if ($loc_w) { # windows cp1252
|
121
|
3
|
|
|
|
|
47
|
$convert_c2u->($text);
|
122
|
|
|
|
|
|
|
}
|
123
|
|
|
|
|
|
|
|
124
|
9
|
100
|
|
|
|
13
|
if ($loc_m == 1) { # iso
|
125
|
1
|
|
|
1
|
|
741
|
$text = NFC($text) =~ s{\p{Diacriticals}}''xmsgr;
|
|
1
|
|
|
|
|
12
|
|
|
1
|
|
|
|
|
10
|
|
|
3
|
|
|
|
|
62
|
|
126
|
|
|
|
|
|
|
|
127
|
3
|
100
|
|
|
|
7
|
if ($loc_w) { # windows cp1252
|
128
|
1
|
|
|
|
|
30
|
$convert_u2c->($text);
|
129
|
|
|
|
|
|
|
}
|
130
|
|
|
|
|
|
|
|
131
|
3
|
|
|
|
|
19
|
$text =~ s{([^\x00-\xff])}{NFD($1)}xmsge;
|
|
29
|
|
|
|
|
74
|
|
132
|
|
|
|
|
|
|
|
133
|
3
|
|
|
|
|
32
|
$text =~ s{\p{Diacriticals}}''xmsg;
|
134
|
|
|
|
|
|
|
|
135
|
3
|
|
|
|
|
7
|
$text = encode('iso-8859-1', $text);
|
136
|
|
|
|
|
|
|
}
|
137
|
|
|
|
|
|
|
else { # pure or brutal
|
138
|
6
|
100
|
|
|
|
93
|
$convert_u2a->($text) if $loc_m == 2; # brutal
|
139
|
|
|
|
|
|
|
|
140
|
6
|
|
|
|
|
142
|
$text = encode('iso-8859-1', NFD($text) =~ s{\p{Diacriticals}}''xmsgr);
|
141
|
6
|
|
|
|
|
161
|
$text =~ s{\P{ASCII}}'?'xmsg;
|
142
|
|
|
|
|
|
|
}
|
143
|
|
|
|
|
|
|
|
144
|
9
|
|
|
|
|
143
|
return $text;
|
145
|
|
|
|
|
|
|
}
|
146
|
|
|
|
|
|
|
|
147
|
|
|
|
|
|
|
sub cv_from_win {
|
148
|
1
|
|
|
1
|
0
|
2
|
my ($buf) = @_;
|
149
|
|
|
|
|
|
|
|
150
|
1
|
|
|
|
|
24
|
$convert_c2u->($buf);
|
151
|
|
|
|
|
|
|
|
152
|
1
|
|
|
|
|
3
|
return $buf;
|
153
|
|
|
|
|
|
|
}
|
154
|
|
|
|
|
|
|
|
155
|
|
|
|
|
|
|
sub cv_to_win {
|
156
|
1
|
|
|
1
|
0
|
2
|
my ($buf) = @_;
|
157
|
|
|
|
|
|
|
|
158
|
1
|
|
|
|
|
33
|
$convert_u2c->($buf);
|
159
|
|
|
|
|
|
|
|
160
|
1
|
|
|
|
|
5
|
return $buf;
|
161
|
|
|
|
|
|
|
}
|
162
|
|
|
|
|
|
|
|
163
|
|
|
|
|
|
|
sub commify {
|
164
|
2
|
|
|
2
|
0
|
4
|
local $_ = shift;
|
165
|
2
|
|
|
|
|
2
|
my ($sep) = @_;
|
166
|
|
|
|
|
|
|
|
167
|
2
|
|
100
|
|
|
8
|
$sep //= '_';
|
168
|
|
|
|
|
|
|
|
169
|
2
|
|
|
|
|
3
|
my $len = length($_);
|
170
|
2
|
|
|
|
|
5
|
for my $i (1..$len) {
|
171
|
6
|
100
|
|
|
|
30
|
last unless s/^([-+]?\d+)(\d{3})/$1$sep$2/;
|
172
|
|
|
|
|
|
|
}
|
173
|
|
|
|
|
|
|
|
174
|
2
|
|
|
|
|
7
|
return $_;
|
175
|
|
|
|
|
|
|
}
|
176
|
|
|
|
|
|
|
|
177
|
|
|
|
|
|
|
1; |
178
|
|
|
|
|
|
|
|
179
|
|
|
|
|
|
|
__END__ |