File Coverage

blib/lib/Locale/Maketext/Fuzzy.pm

Criterion	Covered	Total	%
statement	63	63	100.0
branch	22	28	78.5
condition	3	5	60.0
subroutine	9	9	100.0
pod	2	2	100.0
total	99	107	92.5

line	stmt	bran	cond	sub	pod	time	code
1							package Locale::Maketext::Fuzzy;
2							$Locale::Maketext::Fuzzy::VERSION = '0.11';
3
4	1			1		30338	use 5.005;
	1					4
	1					54
5	1			1		6	use strict;
	1					2
	1					36
6	1			1		1153	use Locale::Maketext;
	1					13260
	1					38
7	1			1		12	use base 'Locale::Maketext';
	1					3
	1					162
8
9							sub override_maketext {
10	6			6	1	1074	my ( $class, $flag ) = @_;
11	6	50				17	$class = ref($class) if ref($class);
12
13	1			1		6	no strict 'refs';
	1					2
	1					994
14
15	6	100				20	if ($flag) {
		100
16	2					3	*{"$class\::maketext"} = \&maketext_fuzzy;
	2					12
17							}
18							elsif ( @_ >= 2 ) {
19	2					3	delete ${"$class\::"}{maketext};
	2					8
20							}
21
22	6	100				7	return ( defined &{"$class\::maketext"} ? 1 : 0 );
	6					40
23							}
24
25							# Global cache of entries and their regexified forms
26							my %regex_cache;
27
28							sub maketext_fuzzy {
29	7			7	1	4568	my ( $handle, $phrase ) = splice( @_, 0, 2 );
30
31							# An array of all lexicon hashrefs
32	7					7	my @lexicons = @{ $handle->_lex_refs };
	7					31
33
34							# Try exact match if possible at all.
35	7					118	foreach my $lex (@lexicons) {
36	7	100				33	return $handle->SUPER::maketext( $phrase, @_ )
37							if exists $lex->{$phrase};
38							}
39
40							# Keys are matched entries; values are arrayrefs of extracted params
41	5					8	my %candidate;
42
43							# Fuzzy match phase 1 -- extract all candidates
44	5					7	foreach my $lex (@lexicons) {
45
46							# We're not interested in non-bracketed entries, so ignore them
47	5					6	foreach my $entry ( grep /(?:(?
	5					31
48							# Skip entries which are _only_ brackets and whitespace.
49							# The most value they could add is rearrangement, and that
50							# is almost certainly incorrect.
51	10	50				49	next if $entry =~ /^\s(\[[^]]+\]\s)+$/;
52
53	10		100			28	my $re = ( $regex_cache{$entry} \|\|= [ _regexify($entry) ] );
54	10	100				76	my @vars = ( $phrase =~ $re->[0] ) or next;
55	7					15	$candidate{$entry} \|\|=
56	7	50	33			20	( @{ $re->[1] } ? [ @vars[ @{ $re->[1] } ] ] : \@vars );
	7					35
57							}
58							}
59
60							# Fail early if we cannot find anything that matches
61	5	100				17	return $phrase unless %candidate;
62
63							# Fuzzy match phase 2 -- select the best candidate
64	3	50				10	$phrase = (
65							sort {
66
67							# For now, we just use a very crude heuristic: "Longer is better"
68	4					12	length($b) <=> length($a)
69							or $b cmp $a
70							} keys %candidate
71							)[0];
72
73	4					5	return $handle->SUPER::maketext( $phrase, @{ $candidate{$phrase} }, @_ );
	4					17
74							}
75
76							sub _regexify {
77	2			2		6	my $text = quotemeta(shift);
78	2					2	my @ords;
79
80	2					27	$text =~ s{
81							( # capture into $1...
82							(?
83							) # (to be restored back)
84							\\\[ # opening bracket
85
86							( # capture into $2...
87							(?: # any numbers of
88							[^~\]] # ordinary non-] characters
89							\| # or
90							~\\?. # escaped characters
91							)*
92							)
93							\\\] # closing bracket
94							}{
95	3					8	$1._paramify($2, \@ords)
96							}egx;
97
98	2					5	$text =~ s/\Q.?\E$/./;
99	2					81	return qr/^$text$/, \@ords;
100							}
101
102							sub _paramify {
103	3			3		5	my ( $text, $ordref ) = @_;
104	3					5	my $out = '(.*?)';
105	3					7	my @choices = split( /\\,/, $text );
106
107	3	50				13	if ( $choices[0] =~ /^(?:\w+\|\\#\|\\\*)$/ ) {
108
109							# Do away with the function name
110	3	100				9	shift @choices unless $choices[0] =~ /^_(?:\d+\|\\\*)$/;
111
112							# Build an alternate regex to weed out vars
113	2					22	$out .= '(?:' . join(
114							'\|',
115							sort {
116							length($b) <=> length($a) # longest first
117							} map {
118	3					5	/^_(?:(\d+)\|\\\*)$/
119	5	100				17	? do {
120	3	50				8	push @{$ordref}, ( $1 - 1 ) if defined $1;
	3					7
121	3					7	'';
122							}
123							: $_ # turn _1, _2, _*... into ''
124							} @choices
125							) . ')';
126
127	3					7	$out =~ s/\Q(?:)\E$//;
128							}
129
130	3					15	return $out;
131							}
132
133							1;
134
135							=head1 NAME
136
137							Locale::Maketext::Fuzzy - Maketext from already interpolated strings
138
139							=head1 SYNOPSIS
140
141							package MyApp::L10N;
142							use base 'Locale::Maketext::Fuzzy'; # instead of Locale::Maketext
143
144							package MyApp::L10N::de;
145							use base 'MyApp::L10N';
146							our %Lexicon = (
147							# Exact match should always be preferred if possible
148							"0 camels were released."
149							=> "Exact match",
150
151							# Fuzzy match candidate
152							"[quant,_1,camel was,camels were] released."
153							=> "[quant,_1,Kamel wurde,Kamele wurden] freigegeben.",
154
155							# This could also match fuzzily, but is less preferred
156							"[_2] released[_1]"
157							=> "[_1][_2] ist frei[_1]",
158							);
159
160							package main;
161							my $lh = MyApp::L10N->get_handle('de');
162
163							# All ->maketext calls below will become ->maketext_fuzzy instead
164							$lh->override_maketext(1);
165
166							# This prints "Exact match"
167							print $lh->maketext('0 camels were released.');
168
169							# "1 Kamel wurde freigegeben." -- quant() gets 1
170							print $lh->maketext('1 camel was released.');
171
172							# "2 Kamele wurden freigegeben." -- quant() gets 2
173							print $lh->maketext('2 camels were released.');
174
175							# "3 Kamele wurden freigegeben." -- parameters are ignored
176							print $lh->maketext('3 released.');
177
178							# "4 Kamele wurden freigegeben." -- normal usage
179							print $lh->maketext('[*,_1,camel was,camels were] released.', 4);
180
181							# "!Perl ist frei!" -- matches the broader one
182							# Note that the sequence ([_2] before [_1]) is preserved
183							print $lh->maketext('Perl released!');
184
185							=head1 DESCRIPTION
186
187							This module is a subclass of C, with additional
188							support for localizing messages that already contains interpolated
189							variables.
190
191							This is most useful when the messages are returned by external sources
192							-- for example, to match C against
193							C<[_1]: command not found>.
194
195							Of course, this module is also useful if you're simply too lazy
196							to use the
197
198							$lh->maketext("[quant,_1,file,files] deleted.", $count);
199
200							syntax, but wish to write
201
202							$lh->maketext_fuzzy("$count files deleted");
203
204							instead, and have the correct plural form figured out automatically.
205
206							If C seems too long to type for you, this module
207							also provides a C method to turn I C
208							calls into C calls.
209
210							=head1 METHODS
211
212							=head2 $lh->maketext_fuzzy(I[, I]);
213
214							That method takes exactly the same arguments as the C method
215							of C.
216
217							If I is found in lexicons, it is applied in the same way as
218							C. Otherwise, it looks at all lexicon entries that could
219							possibly yield I, by turning C<[...]> sequences into C<(.*?)> and
220							match the resulting regular expression against I.
221
222							Once it finds all candidate entries, the longest one replaces the
223							I for the real C call. Variables matched by its bracket
224							sequences (C<$1>, C<$2>...) are placed before I; the order
225							of variables in the matched entry are correctly preserved.
226
227							For example, if the matched entry in C<%Lexicon> is C,
228							this call:
229
230							$fh->maketext_fuzzy("Test string", "param");
231
232							is equivalent to this:
233
234							$fh->maketext("Test [_1]", "string", "param");
235
236							However, most of the time you won't need to supply I to
237							a C call, since all parameters are already interpolated
238							into the string.
239
240							=head2 $lh->override_maketext([I]);
241
242							If I is true, this accessor method turns C<$lh-Emaketext>
243							into an alias for C<$lh-Emaketext_fuzzy>, so all consecutive
244							C calls in the C<$lh>'s packages are automatically fuzzy.
245							A false I restores the original behaviour. If the flag is not
246							specified, returns the current status of override; the default is
247							0 (no overriding).
248
249							Note that this call only modifies the symbol table of the I
250							class> that C<$lh> belongs to, so other languages are not affected.
251							If you want to override all language handles in a certain application,
252							try this:
253
254							MyApp::L10N->override_maketext(1);
255
256							=head1 CAVEATS
257
258							=over 4
259
260							=item *
261
262							The "longer is better" heuristic to determine the best match is
263							reasonably good, but could certainly be improved.
264
265							=item *
266
267							Currently, C<"[quant,_1,file] deleted"> won't match C<"3 files deleted">;
268							you'll have to write C<"[quant,_1,file,files] deleted"> instead, or
269							simply use C<"[_1] file deleted"> as the lexicon key and put the correct
270							plural form handling into the corresponding value.
271
272							=item *
273
274							When used in combination with C's C
275							backend, all keys would be iterated over each time a fuzzy match is
276							performed, and may cause serious speed penalty. Patches welcome.
277
278							=back
279
280							=head1 SEE ALSO
281
282							L, L
283
284							=head1 HISTORY
285
286							This particular module was written to facilitate an I
287							layer for Slashcode's I
288							C and C. It would work like this:
289
290							Input \| from the [% story.dept %] dept.
291							Output\| [%\|loc( story.dept )%]from the [_1] dept.[%END%]
292
293							Now, this layer suffers from the same linguistic problems as an
294							ordinary C or C framework does -- what if we want
295							to make ordinals from C<[% story.dept %]> (i.e. C),
296							or expand the C to C / C?
297
298							The same problem occurred in RT's web interface, where it had to
299							localize messages returned by external modules, which may already
300							contain interpolated variables, e.g. C<"Successfully deleted 7
301							ticket(s) in 'c:\temp'.">.
302
303							Since I didn't have the time to refactor C and C,
304							I devised a C method to pre-process their messages into one
305							of the I, then applied the matched string to C.
306
307							Afterwards, I realized that instead of preparing a set of candidate
308							strings, I could actually match against the original I
309							(i.e. PO files via C). This is how
310							C was born.
311
312							=head1 AUTHORS
313
314							Audrey Tang Ecpan@audreyt.orgE
315
316							=head1 CC0 1.0 Universal
317
318							To the extent possible under law, 唐鳳 has waived all copyright and related
319							or neighboring rights to Locale-Maketext-Fuzzy.
320
321							This work is published from Taiwan.
322
323							L
324
325							=cut