| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package Text::Transliterator::Unaccent; |
|
2
|
2
|
|
|
2
|
|
145652
|
use warnings; |
|
|
2
|
|
|
|
|
4
|
|
|
|
2
|
|
|
|
|
109
|
|
|
3
|
2
|
|
|
2
|
|
11
|
use strict; |
|
|
2
|
|
|
|
|
5
|
|
|
|
2
|
|
|
|
|
109
|
|
|
4
|
|
|
|
|
|
|
|
|
5
|
|
|
|
|
|
|
our $VERSION = "1.06"; |
|
6
|
|
|
|
|
|
|
|
|
7
|
2
|
|
|
2
|
|
567
|
use Text::Transliterator; |
|
|
2
|
|
|
|
|
4
|
|
|
|
2
|
|
|
|
|
64
|
|
|
8
|
2
|
|
|
2
|
|
2241
|
use Unicode::UCD qw(charinfo charscript charblock); |
|
|
2
|
|
|
|
|
96077
|
|
|
|
2
|
|
|
|
|
274
|
|
|
9
|
2
|
|
|
2
|
|
17
|
use Unicode::Normalize qw(); |
|
|
2
|
|
|
|
|
5
|
|
|
|
2
|
|
|
|
|
1485
|
|
|
10
|
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
sub char_map { |
|
12
|
2
|
|
|
2
|
1
|
4
|
my $class = shift; |
|
13
|
|
|
|
|
|
|
|
|
14
|
2
|
|
|
|
|
2
|
my @all_ranges; |
|
15
|
2
|
|
|
|
|
3
|
my $ignore_wide = 0; |
|
16
|
2
|
|
|
|
|
3
|
my $ignore_upper = 0; |
|
17
|
2
|
|
|
|
|
3
|
my $ignore_lower = 0; |
|
18
|
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
# decode arguments to get character ranges and boolean flags |
|
20
|
2
|
|
|
|
|
9
|
while (my ($arg_name, $arg_val) = splice(@_, 0, 2)) { |
|
21
|
0
|
|
|
|
|
0
|
my $ranges; |
|
22
|
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
my $handle_arg = { |
|
24
|
0
|
0
|
|
0
|
|
0
|
script => sub { $ranges = charscript($arg_val) |
|
25
|
|
|
|
|
|
|
or die "$arg_val is not a valid Unicode script" }, |
|
26
|
0
|
0
|
|
0
|
|
0
|
block => sub { $ranges = charblock($arg_val) |
|
27
|
|
|
|
|
|
|
or die "$arg_val is not a valid Unicode block" }, |
|
28
|
0
|
|
|
0
|
|
0
|
ranges => sub { $ranges = $arg_val }, |
|
29
|
0
|
|
|
0
|
|
0
|
wide => sub { $ignore_wide = !$arg_val }, |
|
30
|
0
|
|
|
0
|
|
0
|
upper => sub { $ignore_upper = !$arg_val }, |
|
31
|
0
|
|
|
0
|
|
0
|
lower => sub { $ignore_lower = !$arg_val }, |
|
32
|
0
|
|
|
|
|
0
|
}; |
|
33
|
0
|
0
|
|
|
|
0
|
my $coderef = $handle_arg->{$arg_name} |
|
34
|
|
|
|
|
|
|
or die "invalid argument: $arg_name"; |
|
35
|
0
|
|
|
|
|
0
|
$coderef->(); |
|
36
|
0
|
0
|
|
|
|
0
|
push @all_ranges, @$ranges if $ranges; |
|
37
|
|
|
|
|
|
|
} |
|
38
|
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
# default |
|
40
|
2
|
50
|
|
|
|
6
|
@all_ranges = @{charscript('Latin')} if !@all_ranges; |
|
|
2
|
|
|
|
|
10
|
|
|
41
|
|
|
|
|
|
|
|
|
42
|
|
|
|
|
|
|
# build the map |
|
43
|
2
|
|
|
|
|
44987
|
my %map; |
|
44
|
2
|
|
|
|
|
5
|
foreach my $range (@all_ranges) { |
|
45
|
78
|
|
|
|
|
103
|
my ($start, $end) = @$range; |
|
46
|
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
# iterate over characters in range |
|
48
|
|
|
|
|
|
|
CHAR: |
|
49
|
78
|
|
|
|
|
92
|
for my $c ($start .. $end) { |
|
50
|
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
# maybe drop that char under some conditions |
|
52
|
2974
|
50
|
33
|
|
|
3713
|
last CHAR if $ignore_wide and $c > 255; |
|
53
|
2974
|
50
|
33
|
|
|
3772
|
next CHAR if $ignore_upper and chr($c) =~ /\p{Uppercase_Letter}/; |
|
54
|
2974
|
50
|
33
|
|
|
3609
|
next CHAR if $ignore_lower and chr($c) =~ /\p{Lowercase_Letter}/; |
|
55
|
|
|
|
|
|
|
|
|
56
|
|
|
|
|
|
|
# get canonical decomposition (if any) |
|
57
|
2974
|
|
|
|
|
3363
|
my $canon = Unicode::Normalize::getCanon($c); |
|
58
|
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
# store into map |
|
60
|
2974
|
100
|
100
|
|
|
4744
|
if ($canon && length($canon) > 1) { |
|
61
|
|
|
|
|
|
|
# the unaccented char is the the base (first char) of the decomposition |
|
62
|
996
|
|
|
|
|
1119
|
my $base = substr $canon, 0, 1; |
|
63
|
996
|
|
|
|
|
1722
|
$map{chr($c)} = $base, |
|
64
|
|
|
|
|
|
|
} |
|
65
|
|
|
|
|
|
|
} |
|
66
|
|
|
|
|
|
|
} |
|
67
|
|
|
|
|
|
|
|
|
68
|
2
|
|
|
|
|
25
|
return \%map; |
|
69
|
|
|
|
|
|
|
} |
|
70
|
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
sub char_map_descr { |
|
72
|
0
|
|
|
0
|
1
|
0
|
my $class = shift; |
|
73
|
|
|
|
|
|
|
|
|
74
|
0
|
|
|
|
|
0
|
my $map = $class->char_map(@_); |
|
75
|
|
|
|
|
|
|
|
|
76
|
0
|
|
|
|
|
0
|
my $txt = ""; |
|
77
|
0
|
|
|
|
|
0
|
foreach my $k (sort {$a cmp $b} keys %$map) { |
|
|
0
|
|
|
|
|
0
|
|
|
78
|
0
|
|
|
|
|
0
|
my $v = $map->{$k}; |
|
79
|
0
|
|
|
|
|
0
|
my $accented = ord($k); |
|
80
|
0
|
|
|
|
|
0
|
my $base = ord($v); |
|
81
|
|
|
|
|
|
|
$txt .= sprintf "U+%04x %-55s => U+%04x %s\n", |
|
82
|
|
|
|
|
|
|
$accented, |
|
83
|
|
|
|
|
|
|
charinfo($accented)->{name}, |
|
84
|
|
|
|
|
|
|
$base, |
|
85
|
0
|
|
|
|
|
0
|
charinfo($base)->{name}; |
|
86
|
|
|
|
|
|
|
} |
|
87
|
0
|
|
|
|
|
0
|
return $txt; |
|
88
|
|
|
|
|
|
|
} |
|
89
|
|
|
|
|
|
|
|
|
90
|
|
|
|
|
|
|
sub new { |
|
91
|
2
|
|
|
2
|
1
|
130843
|
my ($class, %args) = @_; |
|
92
|
|
|
|
|
|
|
|
|
93
|
2
|
|
100
|
|
|
11
|
my $modifiers = delete $args{modifiers} || ""; |
|
94
|
2
|
|
|
|
|
9
|
my $map = $class->char_map(%args); |
|
95
|
2
|
|
|
|
|
14
|
return Text::Transliterator->new($map, $modifiers); |
|
96
|
|
|
|
|
|
|
} |
|
97
|
|
|
|
|
|
|
|
|
98
|
|
|
|
|
|
|
1; # End of Text::Transliterator::Unaccent |
|
99
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
101
|
|
|
|
|
|
|
__END__ |