File Coverage

blib/lib/Text/Transliterator/Unaccent.pm

Criterion	Covered	Total	%
statement	38	57	66.6
branch	6	18	33.3
condition	6	12	50.0
subroutine	7	14	50.0
pod	3	3	100.0
total	60	104	57.6

line	stmt	bran	cond	sub	pod	time	code
1							package Text::Transliterator::Unaccent;
2
3	2			2		15011	use warnings;
	2					3
	2					52
4	2			2		7	use strict;
	2					1
	2					29
5
6	2			2		294	use Text::Transliterator; our $VERSION = $Text::Transliterator::VERSION;
	2					2
	2					61
7
8	2			2		1730	use Unicode::UCD qw(charinfo charscript charblock);
	2					390063
	2					163
9	2			2		16	use Unicode::Normalize qw();
	2					2
	2					408
10
11							sub char_map {
12	1			1	1	1	my $class = shift;
13
14	1					2	my @all_ranges;
15	1					1	my $ignore_wide = 0;
16	1					1	my $ignore_upper = 0;
17	1					1	my $ignore_lower = 0;
18
19							# decode arguments to get character ranges and boolean flags
20	1					5	while (my ($kind, $arg) = splice(@_, 0, 2)) {
21	0					0	my $ranges;
22
23							my $todo = {
24	0	0		0		0	script => sub { $ranges = charscript($arg)
25							or die "$arg is not a valid Unicode script" },
26	0	0		0		0	block => sub { $ranges = charblock($arg)
27							or die "$arg is not a valid Unicode block" },
28	0			0		0	ranges => sub { $ranges = $arg },
29	0			0		0	wide => sub { $ignore_wide = !$arg },
30	0			0		0	upper => sub { $ignore_upper = !$arg },
31	0			0		0	lower => sub { $ignore_lower = !$arg },
32	0					0	};
33	0	0				0	my $coderef = $todo->{$kind}
34							or die "invalid argument: $kind";
35	0					0	$coderef->();
36	0	0				0	push @all_ranges, @$ranges if $ranges;
37							}
38
39							# default
40	1	50				3	@all_ranges = @{charscript('Latin')} if !@all_ranges;
	1					3
41
42							# build the map
43	1					8172	my %map;
44	1					3	foreach my $range (@all_ranges) {
45	33					56	my ($start, $end) = @$range;
46
47							# iterate over characters in range
48							CHAR:
49	33					67	for my $c ($start .. $end) {
50
51							# maybe drop that char under some conditions
52	1338	50	33			2214	last CHAR if $ignore_wide and $c > 255;
53	1338	50	33			2056	next CHAR if $ignore_upper and chr($c) =~ /\p{Uppercase_Letter}/;
54	1338	50	33			2042	next CHAR if $ignore_lower and chr($c) =~ /\p{Lowercase_Letter}/;
55
56							# get canonical decomposition (if any)
57	1338					2022	my $canon = Unicode::Normalize::getCanon($c);
58
59							# store into map
60	1338	100	100			9168	if ($canon && length($canon) > 1) {
61							# the unaccented char is the the base (first char) of the decomposition
62	498					725	my $base = substr $canon, 0, 1;
63	498					2977	$map{chr($c)} = $base,
64							}
65							}
66							}
67
68	1					12	return \%map;
69							}
70
71							sub char_map_descr {
72	0			0	1	0	my $class = shift;
73
74	0					0	my $map = $class->char_map(@_);
75
76	0					0	my $txt = "";
77	0					0	while (my ($k, $v) = each %$map) {
78	0					0	my $accented = ord($k);
79	0					0	my $base = ord($v);
80							$txt .= sprintf "U+%04x %-40s => U+%04x %s\n",
81							$accented,
82							charinfo($accented)->{name},
83							$base,
84	0					0	charinfo($base)->{name};
85							}
86	0					0	return $txt;
87							}
88
89							sub new {
90	1			1	1	381	my $class = shift;
91	1					4	my $map = $class->char_map(@_);
92	1					10	return Text::Transliterator->new($map)
93							}
94
95							1; # End of Text::Transliterator::Unaccent
96
97
98							__END__