File Coverage

blib/lib/Text/Transliterator/Unaccent.pm
Criterion Covered Total %
statement 39 60 65.0
branch 6 18 33.3
condition 8 14 57.1
subroutine 7 14 50.0
pod 3 3 100.0
total 63 109 57.8


line stmt bran cond sub pod time code
1             package Text::Transliterator::Unaccent;
2 2     2   145652 use warnings;
  2         4  
  2         109  
3 2     2   11 use strict;
  2         5  
  2         109  
4              
5             our $VERSION = "1.06";
6              
7 2     2   567 use Text::Transliterator;
  2         4  
  2         64  
8 2     2   2241 use Unicode::UCD qw(charinfo charscript charblock);
  2         96077  
  2         274  
9 2     2   17 use Unicode::Normalize qw();
  2         5  
  2         1485  
10              
11             sub char_map {
12 2     2 1 4 my $class = shift;
13              
14 2         2 my @all_ranges;
15 2         3 my $ignore_wide = 0;
16 2         3 my $ignore_upper = 0;
17 2         3 my $ignore_lower = 0;
18              
19             # decode arguments to get character ranges and boolean flags
20 2         9 while (my ($arg_name, $arg_val) = splice(@_, 0, 2)) {
21 0         0 my $ranges;
22              
23             my $handle_arg = {
24 0 0   0   0 script => sub { $ranges = charscript($arg_val)
25             or die "$arg_val is not a valid Unicode script" },
26 0 0   0   0 block => sub { $ranges = charblock($arg_val)
27             or die "$arg_val is not a valid Unicode block" },
28 0     0   0 ranges => sub { $ranges = $arg_val },
29 0     0   0 wide => sub { $ignore_wide = !$arg_val },
30 0     0   0 upper => sub { $ignore_upper = !$arg_val },
31 0     0   0 lower => sub { $ignore_lower = !$arg_val },
32 0         0 };
33 0 0       0 my $coderef = $handle_arg->{$arg_name}
34             or die "invalid argument: $arg_name";
35 0         0 $coderef->();
36 0 0       0 push @all_ranges, @$ranges if $ranges;
37             }
38              
39             # default
40 2 50       6 @all_ranges = @{charscript('Latin')} if !@all_ranges;
  2         10  
41              
42             # build the map
43 2         44987 my %map;
44 2         5 foreach my $range (@all_ranges) {
45 78         103 my ($start, $end) = @$range;
46              
47             # iterate over characters in range
48             CHAR:
49 78         92 for my $c ($start .. $end) {
50              
51             # maybe drop that char under some conditions
52 2974 50 33     3713 last CHAR if $ignore_wide and $c > 255;
53 2974 50 33     3772 next CHAR if $ignore_upper and chr($c) =~ /\p{Uppercase_Letter}/;
54 2974 50 33     3609 next CHAR if $ignore_lower and chr($c) =~ /\p{Lowercase_Letter}/;
55              
56             # get canonical decomposition (if any)
57 2974         3363 my $canon = Unicode::Normalize::getCanon($c);
58              
59             # store into map
60 2974 100 100     4744 if ($canon && length($canon) > 1) {
61             # the unaccented char is the the base (first char) of the decomposition
62 996         1119 my $base = substr $canon, 0, 1;
63 996         1722 $map{chr($c)} = $base,
64             }
65             }
66             }
67              
68 2         25 return \%map;
69             }
70              
71             sub char_map_descr {
72 0     0 1 0 my $class = shift;
73              
74 0         0 my $map = $class->char_map(@_);
75              
76 0         0 my $txt = "";
77 0         0 foreach my $k (sort {$a cmp $b} keys %$map) {
  0         0  
78 0         0 my $v = $map->{$k};
79 0         0 my $accented = ord($k);
80 0         0 my $base = ord($v);
81             $txt .= sprintf "U+%04x %-55s => U+%04x %s\n",
82             $accented,
83             charinfo($accented)->{name},
84             $base,
85 0         0 charinfo($base)->{name};
86             }
87 0         0 return $txt;
88             }
89              
90             sub new {
91 2     2 1 130843 my ($class, %args) = @_;
92              
93 2   100     11 my $modifiers = delete $args{modifiers} || "";
94 2         9 my $map = $class->char_map(%args);
95 2         14 return Text::Transliterator->new($map, $modifiers);
96             }
97              
98             1; # End of Text::Transliterator::Unaccent
99              
100              
101             __END__