line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Text::Transliterator::Unaccent; |
2
|
|
|
|
|
|
|
|
3
|
2
|
|
|
2
|
|
16315
|
use warnings; |
|
2
|
|
|
|
|
4
|
|
|
2
|
|
|
|
|
52
|
|
4
|
2
|
|
|
2
|
|
8
|
use strict; |
|
2
|
|
|
|
|
5
|
|
|
2
|
|
|
|
|
56
|
|
5
|
|
|
|
|
|
|
|
6
|
|
|
|
|
|
|
our $VERSION = "1.03"; |
7
|
|
|
|
|
|
|
|
8
|
2
|
|
|
2
|
|
284
|
use Text::Transliterator; |
|
2
|
|
|
|
|
7
|
|
|
2
|
|
|
|
|
48
|
|
9
|
2
|
|
|
2
|
|
1612
|
use Unicode::UCD qw(charinfo charscript charblock); |
|
2
|
|
|
|
|
84727
|
|
|
2
|
|
|
|
|
156
|
|
10
|
2
|
|
|
2
|
|
19
|
use Unicode::Normalize qw(); |
|
2
|
|
|
|
|
3
|
|
|
2
|
|
|
|
|
420
|
|
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
sub char_map { |
13
|
1
|
|
|
1
|
1
|
5
|
my $class = shift; |
14
|
|
|
|
|
|
|
|
15
|
1
|
|
|
|
|
3
|
my @all_ranges; |
16
|
1
|
|
|
|
|
3
|
my $ignore_wide = 0; |
17
|
1
|
|
|
|
|
2
|
my $ignore_upper = 0; |
18
|
1
|
|
|
|
|
3
|
my $ignore_lower = 0; |
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
# decode arguments to get character ranges and boolean flags |
21
|
1
|
|
|
|
|
9
|
while (my ($kind, $arg) = splice(@_, 0, 2)) { |
22
|
0
|
|
|
|
|
0
|
my $ranges; |
23
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
my $todo = { |
25
|
0
|
0
|
|
0
|
|
0
|
script => sub { $ranges = charscript($arg) |
26
|
|
|
|
|
|
|
or die "$arg is not a valid Unicode script" }, |
27
|
0
|
0
|
|
0
|
|
0
|
block => sub { $ranges = charblock($arg) |
28
|
|
|
|
|
|
|
or die "$arg is not a valid Unicode block" }, |
29
|
0
|
|
|
0
|
|
0
|
ranges => sub { $ranges = $arg }, |
30
|
0
|
|
|
0
|
|
0
|
wide => sub { $ignore_wide = !$arg }, |
31
|
0
|
|
|
0
|
|
0
|
upper => sub { $ignore_upper = !$arg }, |
32
|
0
|
|
|
0
|
|
0
|
lower => sub { $ignore_lower = !$arg }, |
33
|
0
|
|
|
|
|
0
|
}; |
34
|
0
|
0
|
|
|
|
0
|
my $coderef = $todo->{$kind} |
35
|
|
|
|
|
|
|
or die "invalid argument: $kind"; |
36
|
0
|
|
|
|
|
0
|
$coderef->(); |
37
|
0
|
0
|
|
|
|
0
|
push @all_ranges, @$ranges if $ranges; |
38
|
|
|
|
|
|
|
} |
39
|
|
|
|
|
|
|
|
40
|
|
|
|
|
|
|
# default |
41
|
1
|
50
|
|
|
|
5
|
@all_ranges = @{charscript('Latin')} if !@all_ranges; |
|
1
|
|
|
|
|
7
|
|
42
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
# build the map |
44
|
1
|
|
|
|
|
14249
|
my %map; |
45
|
1
|
|
|
|
|
5
|
foreach my $range (@all_ranges) { |
46
|
31
|
|
|
|
|
74
|
my ($start, $end) = @$range; |
47
|
|
|
|
|
|
|
|
48
|
|
|
|
|
|
|
# iterate over characters in range |
49
|
|
|
|
|
|
|
CHAR: |
50
|
31
|
|
|
|
|
91
|
for my $c ($start .. $end) { |
51
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
# maybe drop that char under some conditions |
53
|
1349
|
50
|
33
|
|
|
3084
|
last CHAR if $ignore_wide and $c > 255; |
54
|
1349
|
50
|
33
|
|
|
2989
|
next CHAR if $ignore_upper and chr($c) =~ /\p{Uppercase_Letter}/; |
55
|
1349
|
50
|
33
|
|
|
2917
|
next CHAR if $ignore_lower and chr($c) =~ /\p{Lowercase_Letter}/; |
56
|
|
|
|
|
|
|
|
57
|
|
|
|
|
|
|
# get canonical decomposition (if any) |
58
|
1349
|
|
|
|
|
2678
|
my $canon = Unicode::Normalize::getCanon($c); |
59
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
# store into map |
61
|
1349
|
100
|
100
|
|
|
3838
|
if ($canon && length($canon) > 1) { |
62
|
|
|
|
|
|
|
# the unaccented char is the the base (first char) of the decomposition |
63
|
498
|
|
|
|
|
950
|
my $base = substr $canon, 0, 1; |
64
|
498
|
|
|
|
|
1474
|
$map{chr($c)} = $base, |
65
|
|
|
|
|
|
|
} |
66
|
|
|
|
|
|
|
} |
67
|
|
|
|
|
|
|
} |
68
|
|
|
|
|
|
|
|
69
|
1
|
|
|
|
|
29
|
return \%map; |
70
|
|
|
|
|
|
|
} |
71
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
sub char_map_descr { |
73
|
0
|
|
|
0
|
1
|
0
|
my $class = shift; |
74
|
|
|
|
|
|
|
|
75
|
0
|
|
|
|
|
0
|
my $map = $class->char_map(@_); |
76
|
|
|
|
|
|
|
|
77
|
0
|
|
|
|
|
0
|
my $txt = ""; |
78
|
0
|
|
|
|
|
0
|
while (my ($k, $v) = each %$map) { |
79
|
0
|
|
|
|
|
0
|
my $accented = ord($k); |
80
|
0
|
|
|
|
|
0
|
my $base = ord($v); |
81
|
|
|
|
|
|
|
$txt .= sprintf "U+%04x %-40s => U+%04x %s\n", |
82
|
|
|
|
|
|
|
$accented, |
83
|
|
|
|
|
|
|
charinfo($accented)->{name}, |
84
|
|
|
|
|
|
|
$base, |
85
|
0
|
|
|
|
|
0
|
charinfo($base)->{name}; |
86
|
|
|
|
|
|
|
} |
87
|
0
|
|
|
|
|
0
|
return $txt; |
88
|
|
|
|
|
|
|
} |
89
|
|
|
|
|
|
|
|
90
|
|
|
|
|
|
|
sub new { |
91
|
1
|
|
|
1
|
1
|
215
|
my $class = shift; |
92
|
1
|
|
|
|
|
7
|
my $map = $class->char_map(@_); |
93
|
1
|
|
|
|
|
20
|
return Text::Transliterator->new($map) |
94
|
|
|
|
|
|
|
} |
95
|
|
|
|
|
|
|
|
96
|
|
|
|
|
|
|
1; # End of Text::Transliterator::Unaccent |
97
|
|
|
|
|
|
|
|
98
|
|
|
|
|
|
|
|
99
|
|
|
|
|
|
|
__END__ |