File Coverage

blib/lib/Text/Transliterator/Unaccent.pm

Criterion	Covered	Total	%
statement	39	60	65.0
branch	6	18	33.3
condition	8	14	57.1
subroutine	7	14	50.0
pod	3	3	100.0
total	63	109	57.8

line	stmt	bran	cond	sub	pod	time	code
1							package Text::Transliterator::Unaccent;
2	2			2		145652	use warnings;
	2					4
	2					109
3	2			2		11	use strict;
	2					5
	2					109
4
5							our $VERSION = "1.06";
6
7	2			2		567	use Text::Transliterator;
	2					4
	2					64
8	2			2		2241	use Unicode::UCD qw(charinfo charscript charblock);
	2					96077
	2					274
9	2			2		17	use Unicode::Normalize qw();
	2					5
	2					1485
10
11							sub char_map {
12	2			2	1	4	my $class = shift;
13
14	2					2	my @all_ranges;
15	2					3	my $ignore_wide = 0;
16	2					3	my $ignore_upper = 0;
17	2					3	my $ignore_lower = 0;
18
19							# decode arguments to get character ranges and boolean flags
20	2					9	while (my ($arg_name, $arg_val) = splice(@_, 0, 2)) {
21	0					0	my $ranges;
22
23							my $handle_arg = {
24	0	0		0		0	script => sub { $ranges = charscript($arg_val)
25							or die "$arg_val is not a valid Unicode script" },
26	0	0		0		0	block => sub { $ranges = charblock($arg_val)
27							or die "$arg_val is not a valid Unicode block" },
28	0			0		0	ranges => sub { $ranges = $arg_val },
29	0			0		0	wide => sub { $ignore_wide = !$arg_val },
30	0			0		0	upper => sub { $ignore_upper = !$arg_val },
31	0			0		0	lower => sub { $ignore_lower = !$arg_val },
32	0					0	};
33	0	0				0	my $coderef = $handle_arg->{$arg_name}
34							or die "invalid argument: $arg_name";
35	0					0	$coderef->();
36	0	0				0	push @all_ranges, @$ranges if $ranges;
37							}
38
39							# default
40	2	50				6	@all_ranges = @{charscript('Latin')} if !@all_ranges;
	2					10
41
42							# build the map
43	2					44987	my %map;
44	2					5	foreach my $range (@all_ranges) {
45	78					103	my ($start, $end) = @$range;
46
47							# iterate over characters in range
48							CHAR:
49	78					92	for my $c ($start .. $end) {
50
51							# maybe drop that char under some conditions
52	2974	50	33			3713	last CHAR if $ignore_wide and $c > 255;
53	2974	50	33			3772	next CHAR if $ignore_upper and chr($c) =~ /\p{Uppercase_Letter}/;
54	2974	50	33			3609	next CHAR if $ignore_lower and chr($c) =~ /\p{Lowercase_Letter}/;
55
56							# get canonical decomposition (if any)
57	2974					3363	my $canon = Unicode::Normalize::getCanon($c);
58
59							# store into map
60	2974	100	100			4744	if ($canon && length($canon) > 1) {
61							# the unaccented char is the the base (first char) of the decomposition
62	996					1119	my $base = substr $canon, 0, 1;
63	996					1722	$map{chr($c)} = $base,
64							}
65							}
66							}
67
68	2					25	return \%map;
69							}
70
71							sub char_map_descr {
72	0			0	1	0	my $class = shift;
73
74	0					0	my $map = $class->char_map(@_);
75
76	0					0	my $txt = "";
77	0					0	foreach my $k (sort {$a cmp $b} keys %$map) {
	0					0
78	0					0	my $v = $map->{$k};
79	0					0	my $accented = ord($k);
80	0					0	my $base = ord($v);
81							$txt .= sprintf "U+%04x %-55s => U+%04x %s\n",
82							$accented,
83							charinfo($accented)->{name},
84							$base,
85	0					0	charinfo($base)->{name};
86							}
87	0					0	return $txt;
88							}
89
90							sub new {
91	2			2	1	130843	my ($class, %args) = @_;
92
93	2		100			11	my $modifiers = delete $args{modifiers} \|\| "";
94	2					9	my $map = $class->char_map(%args);
95	2					14	return Text::Transliterator->new($map, $modifiers);
96							}
97
98							1; # End of Text::Transliterator::Unaccent
99
100
101							__END__