line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Text::Transliterator::Unaccent; |
2
|
|
|
|
|
|
|
|
3
|
2
|
|
|
2
|
|
15011
|
use warnings; |
|
2
|
|
|
|
|
3
|
|
|
2
|
|
|
|
|
52
|
|
4
|
2
|
|
|
2
|
|
7
|
use strict; |
|
2
|
|
|
|
|
1
|
|
|
2
|
|
|
|
|
29
|
|
5
|
|
|
|
|
|
|
|
6
|
2
|
|
|
2
|
|
294
|
use Text::Transliterator; our $VERSION = $Text::Transliterator::VERSION; |
|
2
|
|
|
|
|
2
|
|
|
2
|
|
|
|
|
61
|
|
7
|
|
|
|
|
|
|
|
8
|
2
|
|
|
2
|
|
1730
|
use Unicode::UCD qw(charinfo charscript charblock); |
|
2
|
|
|
|
|
390063
|
|
|
2
|
|
|
|
|
163
|
|
9
|
2
|
|
|
2
|
|
16
|
use Unicode::Normalize qw(); |
|
2
|
|
|
|
|
2
|
|
|
2
|
|
|
|
|
408
|
|
10
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
sub char_map { |
12
|
1
|
|
|
1
|
1
|
1
|
my $class = shift; |
13
|
|
|
|
|
|
|
|
14
|
1
|
|
|
|
|
2
|
my @all_ranges; |
15
|
1
|
|
|
|
|
1
|
my $ignore_wide = 0; |
16
|
1
|
|
|
|
|
1
|
my $ignore_upper = 0; |
17
|
1
|
|
|
|
|
1
|
my $ignore_lower = 0; |
18
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
# decode arguments to get character ranges and boolean flags |
20
|
1
|
|
|
|
|
5
|
while (my ($kind, $arg) = splice(@_, 0, 2)) { |
21
|
0
|
|
|
|
|
0
|
my $ranges; |
22
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
my $todo = { |
24
|
0
|
0
|
|
0
|
|
0
|
script => sub { $ranges = charscript($arg) |
25
|
|
|
|
|
|
|
or die "$arg is not a valid Unicode script" }, |
26
|
0
|
0
|
|
0
|
|
0
|
block => sub { $ranges = charblock($arg) |
27
|
|
|
|
|
|
|
or die "$arg is not a valid Unicode block" }, |
28
|
0
|
|
|
0
|
|
0
|
ranges => sub { $ranges = $arg }, |
29
|
0
|
|
|
0
|
|
0
|
wide => sub { $ignore_wide = !$arg }, |
30
|
0
|
|
|
0
|
|
0
|
upper => sub { $ignore_upper = !$arg }, |
31
|
0
|
|
|
0
|
|
0
|
lower => sub { $ignore_lower = !$arg }, |
32
|
0
|
|
|
|
|
0
|
}; |
33
|
0
|
0
|
|
|
|
0
|
my $coderef = $todo->{$kind} |
34
|
|
|
|
|
|
|
or die "invalid argument: $kind"; |
35
|
0
|
|
|
|
|
0
|
$coderef->(); |
36
|
0
|
0
|
|
|
|
0
|
push @all_ranges, @$ranges if $ranges; |
37
|
|
|
|
|
|
|
} |
38
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
# default |
40
|
1
|
50
|
|
|
|
3
|
@all_ranges = @{charscript('Latin')} if !@all_ranges; |
|
1
|
|
|
|
|
3
|
|
41
|
|
|
|
|
|
|
|
42
|
|
|
|
|
|
|
# build the map |
43
|
1
|
|
|
|
|
8172
|
my %map; |
44
|
1
|
|
|
|
|
3
|
foreach my $range (@all_ranges) { |
45
|
33
|
|
|
|
|
56
|
my ($start, $end) = @$range; |
46
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
# iterate over characters in range |
48
|
|
|
|
|
|
|
CHAR: |
49
|
33
|
|
|
|
|
67
|
for my $c ($start .. $end) { |
50
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
# maybe drop that char under some conditions |
52
|
1338
|
50
|
33
|
|
|
2214
|
last CHAR if $ignore_wide and $c > 255; |
53
|
1338
|
50
|
33
|
|
|
2056
|
next CHAR if $ignore_upper and chr($c) =~ /\p{Uppercase_Letter}/; |
54
|
1338
|
50
|
33
|
|
|
2042
|
next CHAR if $ignore_lower and chr($c) =~ /\p{Lowercase_Letter}/; |
55
|
|
|
|
|
|
|
|
56
|
|
|
|
|
|
|
# get canonical decomposition (if any) |
57
|
1338
|
|
|
|
|
2022
|
my $canon = Unicode::Normalize::getCanon($c); |
58
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
# store into map |
60
|
1338
|
100
|
100
|
|
|
9168
|
if ($canon && length($canon) > 1) { |
61
|
|
|
|
|
|
|
# the unaccented char is the the base (first char) of the decomposition |
62
|
498
|
|
|
|
|
725
|
my $base = substr $canon, 0, 1; |
63
|
498
|
|
|
|
|
2977
|
$map{chr($c)} = $base, |
64
|
|
|
|
|
|
|
} |
65
|
|
|
|
|
|
|
} |
66
|
|
|
|
|
|
|
} |
67
|
|
|
|
|
|
|
|
68
|
1
|
|
|
|
|
12
|
return \%map; |
69
|
|
|
|
|
|
|
} |
70
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
sub char_map_descr { |
72
|
0
|
|
|
0
|
1
|
0
|
my $class = shift; |
73
|
|
|
|
|
|
|
|
74
|
0
|
|
|
|
|
0
|
my $map = $class->char_map(@_); |
75
|
|
|
|
|
|
|
|
76
|
0
|
|
|
|
|
0
|
my $txt = ""; |
77
|
0
|
|
|
|
|
0
|
while (my ($k, $v) = each %$map) { |
78
|
0
|
|
|
|
|
0
|
my $accented = ord($k); |
79
|
0
|
|
|
|
|
0
|
my $base = ord($v); |
80
|
|
|
|
|
|
|
$txt .= sprintf "U+%04x %-40s => U+%04x %s\n", |
81
|
|
|
|
|
|
|
$accented, |
82
|
|
|
|
|
|
|
charinfo($accented)->{name}, |
83
|
|
|
|
|
|
|
$base, |
84
|
0
|
|
|
|
|
0
|
charinfo($base)->{name}; |
85
|
|
|
|
|
|
|
} |
86
|
0
|
|
|
|
|
0
|
return $txt; |
87
|
|
|
|
|
|
|
} |
88
|
|
|
|
|
|
|
|
89
|
|
|
|
|
|
|
sub new { |
90
|
1
|
|
|
1
|
1
|
381
|
my $class = shift; |
91
|
1
|
|
|
|
|
4
|
my $map = $class->char_map(@_); |
92
|
1
|
|
|
|
|
10
|
return Text::Transliterator->new($map) |
93
|
|
|
|
|
|
|
} |
94
|
|
|
|
|
|
|
|
95
|
|
|
|
|
|
|
1; # End of Text::Transliterator::Unaccent |
96
|
|
|
|
|
|
|
|
97
|
|
|
|
|
|
|
|
98
|
|
|
|
|
|
|
__END__ |