File Coverage

blib/lib/Lingua/Anagrams.pm

Criterion	Covered	Total	%
statement	339	341	99.4
branch	112	132	84.8
condition	36	48	75.0
subroutine	30	30	100.0
pod	5	5	100.0
total	522	556	93.8

line	stmt	bran	cond	sub	pod	time	code
1							package Lingua::Anagrams;
2							$Lingua::Anagrams::VERSION = '0.019';
3							# ABSTRACT: pure Perl anagram finder
4
5	1			1		415	use strict;
	1					1
	1					22
6	1			1		2	use warnings;
	1					1
	1					20
7
8	1			1		449	use List::MoreUtils qw(uniq);
	1					7475
	1					4
9
10
11							# don't cache anagrams for bigger character counts than this
12							our $LIMIT = 20;
13
14							# some global variables to be localized
15							# used to limit time spent copying values
16							our ( $limit, $known, $trie, %cache, $cleaner, @jumps, @indices );
17
18
19							sub new {
20	12			12	1	4525	my $class = shift;
21	12					12	my $wl = shift;
22	12	50				26	die 'first parameter expected to be an array reference'
23							unless ref $wl eq 'ARRAY';
24	12					20	my %params = _make_opts(@_);
25	12		33			32	$class = ref $class \|\| $class;
26	12		50			37	local $cleaner = $params{clean} // \&_clean;
27	12					11	my @word_lists;
28	12	100				18	if ( ref $wl->[0] eq 'ARRAY' ) {
29	5					7	@word_lists = @$wl;
30							}
31							else {
32	7					8	@word_lists = ($wl);
33							}
34	12					14	_validate_lists( \@word_lists );
35	7					14	my $translator = { '' => 0 };
36	7					3	$translator->{$_} = scalar keys %$translator for @{ $word_lists[-1] };
	7					32
37	7					6	my $offset; # used to reduce number of undef cells in tries and elsewhere
38	7					4	for my $w ( @{ $word_lists[-1] } ) {
	7					8
39	31					54	my @ords = map ord, split //, $w;
40	31					24	for my $o (@ords) {
41	44	100				43	if ( defined $offset ) {
42	37	100				53	$offset = $o if $o < $offset;
43							}
44							else {
45	7					9	$offset = $o;
46							}
47							}
48							}
49	7					5	--$offset;
50	7					5	my @tries;
51	7					7	for my $words (@word_lists) {
52	8					11	my ( $trie, $known ) = _trieify( $words, $translator, $offset );
53	8					20	push @tries, [ $trie, $known ];
54							}
55	7					7	$translator = [ '', @{ $word_lists[-1] } ];
	7					11
56							return bless {
57							limit => $params{limit} // $LIMIT,
58							sorted => $params{sorted} // 0,
59							min => $params{min},
60	7		66			71	clean => $cleaner,
			100
61							tries => \@tries,
62							translator => $translator,
63							offset => $offset,
64							},
65							$class;
66							}
67
68							# there should be no empty lists and each list should be subsumed
69							# by the next
70							sub _validate_lists {
71	12			12		11	my $lists = shift;
72	12					25	for my $i ( 0 .. $#$lists ) {
73							my @list = uniq grep length,
74	16		50			14	map { my $v = $_ // ''; $cleaner->($v); $v } @{ $lists->[$i] };
	40					56
	40					39
	40					89
	16					21
75	16	100				50	die 'empty list' unless @list;
76	14					20	$lists->[$i] = \@list;
77							}
78	10					15	for my $i ( 1 .. $#$lists ) {
79	4					6	my ( $prior, $list ) = @$lists[ $i - 1, $i ];
80	4	100				22	die 'lists misordered by length' if @$prior >= @$list;
81	2					2	my %set = map { $_ => 1 } @$list;
	5					8
82	2					2	for my $word (@$prior) {
83	2	100				13	die 'smaller lists must be subsumed by larger' unless $set{$word};
84							}
85							}
86							}
87
88							sub _trieify {
89	8			8		5	my ( $words, $translator, $offset ) = @_;
90	8					8	my $base = [];
91	8					6	my @known;
92	8					7	for my $word (@$words) {
93	32					33	my @chars = map { ord($_) - $offset } split //, $word;
	47					67
94	32					41	_learn( \@known, \@chars );
95	32					30	_add( $base, \@chars, $word, $translator );
96							}
97	8					9	return $base, \@known;
98							}
99
100							sub _learn {
101	32			32		21	my ( $known, $new ) = @_;
102	32					30	for my $i (@$new) {
103	47		100			92	$known->[$i] \|\|= 1;
104							}
105							}
106
107							sub _add {
108	79			79		68	my ( $base, $chars, $word, $translator ) = @_;
109	79					51	my $i = shift @$chars;
110	79	100				77	if ($i) {
111	47		100			92	my $next = $base->[$i] //= [];
112	47					52	_add( $next, $chars, $word, $translator );
113							}
114							else { # store values in trie at the zero index
115	32					55	$base->[0] = $translator->{$word};
116							}
117							}
118
119							# walk the trie looking for words you can make out of the current character count
120							sub _words_in {
121	71			71		53	my ( $counts, $total ) = @_;
122	71					43	my @words;
123	71					106	my @stack = ( [ 0, $trie ] );
124	71					41	while (1) {
125	698					364	my ( $c, $level ) = @{ $stack[-1] };
	698					537
126	698	100	100			1398	if ( $c == -1 \|\| $c >= @$level ) {
127	160	100				202	last if @stack == 1;
128	89					64	pop @stack;
129	89					54	++$total;
130	89					64	$c = \( $stack[-1][0] );
131	89					63	++$counts->[$$c];
132	89					76	$$c = $jumps[$$c];
133							}
134							else {
135	538					373	my $l = $level->[$c];
136	538	100				437	if ($l) { # trie holds corresponding node
137	388	100				330	if ($c) { # character
138	245	100				206	if ( $counts->[$c] ) {
139	149					149	push @stack, [ 0, $l ];
140	149					83	--$counts->[$c];
141	149					123	--$total;
142							}
143							else {
144	96					85	$stack[-1][0] = $jumps[$c];
145							}
146							}
147							else { # terminal
148	143					215	push @words, [ $l, [@$counts] ];
149	143	100				139	if ($total) {
150	83					72	$stack[-1][0] = $jumps[$c];
151							}
152							else {
153	60					38	pop @stack;
154	60					41	++$total;
155	60					52	$c = \( $stack[-1][0] );
156	60					44	++$counts->[$$c];
157	60					66	$$c = $jumps[$$c];
158							}
159							}
160							}
161							else { # try the next possible character
162	150					135	$stack[-1][0] = $jumps[$c];
163							}
164							}
165							}
166	71					87	\@words;
167							}
168
169
170							sub anagrams {
171	8			8	1	920	my $self = shift;
172	8					7	my $phrase = shift;
173	8					10	my %opts = _make_opts(@_);
174	8					16	local ( $limit, $cleaner ) = @$self{qw(limit clean)};
175	8					10	$cleaner->($phrase);
176	8	50				11	return () unless length $phrase;
177	8		66			16	my $sort = $opts{sorted} // $self->{sorted};
178	8		66			15	my $min = $opts{min} // $self->{min};
179	8		100			16	my $i = $opts{start_list} // 0;
180	8					7	my @pairs = @{ $self->{tries} };
	8					11
181	8	100				12	if ($i) {
182	1	50				3	die "impossible index for start list: $i" unless defined $pairs[$i];
183	1	50				3	$i = @pairs + $i if $i < 0;
184	1					2	@pairs = @pairs[ $i .. $#pairs ];
185							}
186	8					5	my $offset = $self->{offset};
187	8					16	my $counts = _counts( $phrase, $offset );
188	8					6	my @translator = @{ $self->{translator} };
	8					19
189	8					10	local @jumps = _jumps($counts);
190	8					10	local @indices = _indices($counts);
191	8					5	my @anagrams;
192	8					10	for my $pair (@pairs) {
193	8					8	local ( $trie, $known ) = @$pair;
194	8	50				10	next unless _all_known($counts);
195	8					10	local %cache = ();
196	8					10	@anagrams = _anagramize($counts);
197	8	50				10	next unless @anagrams;
198	8	50	66			21	next if $min and @anagrams < $min;
199	8					16	last;
200							}
201	8					7	@anagrams = map { [ @translator[@$_] ] } @anagrams;
	22					36
202	8	100				13	if ($sort) {
203							@anagrams = sort {
204	24	100				25	my $ordered = @$a <= @$b ? 1 : -1;
205	24	100				23	my ( $d, $e ) = $ordered == 1 ? ( $a, $b ) : ( $b, $a );
206	24					25	for ( 0 .. $#$d ) {
207	27					21	my $c = $d->[$_] cmp $e->[$_];
208	27	100				46	return $ordered * $c if $c;
209							}
210	0					0	-$ordered;
211	4					4	} map { [ sort @$_ ] } @anagrams;
	16					23
212							}
213	8					35	return @anagrams;
214							}
215
216							sub _make_opts {
217	27	100		27		38	if ( @_ == 1 ) {
218	1					1	my $r = shift;
219	1	50				9	die 'options expected to be key value pairs or a hash ref'
220							unless 'HASH' eq ref $r;
221	1					4	return %$r;
222							}
223							else {
224	26					50	return @_;
225							}
226							}
227
228
229							our $null = sub { };
230
231							sub iterator {
232	7			7	1	4291	my $self = shift;
233	7					9	my $phrase = shift;
234	7					9	my %opts = _make_opts(@_);
235	7		66			29	$opts{sorted} //= $self->{sorted};
236	7					9	$self->{clean}->($phrase);
237	7		100			17	my $i = $opts{start_list} // 0;
238	7					5	my @pairs = @{ $self->{tries} };
	7					12
239	7	100				10	if ($i) {
240	1	50				3	die "impossible index for start list: $i" unless defined $pairs[$i];
241	1	50				3	$i = @pairs + $i if $i < 0;
242	1					3	@pairs = @pairs[ $i .. $#pairs ];
243							}
244	7	50				17	return $null unless length $phrase;
245							return _super_iterator( \@pairs, $phrase, \%opts,
246	7					15	@$self{qw(translator offset)} );
247							}
248
249							# iterator that converts word indices back to words
250							sub _super_iterator {
251	7			7		8	my ( $tries, $phrase, $opts, $translator, $offset ) = @_;
252	7					8	my $counts = _counts( $phrase, $offset );
253	7					11	my @j = _jumps($counts);
254	7					9	my @ix = _indices($counts);
255	7					10	my $i = _iterator( $tries, $counts, $opts );
256	7					7	my %c;
257							return sub {
258	28			28		735	my $rv;
259	28					40	local @jumps = @j;
260	28					18	local @indices = @ix;
261							{
262	28					24	$rv = $i->();
	54					53
263	54	100				68	return unless $rv;
264	47					83	my $key = join ',', sort { $a <=> $b } @$rv;
	73					119
265	47	100				79	redo if $c{$key}++;
266							}
267	21					36	$rv = [ @$translator[@$rv] ];
268	21	100				33	$rv = [ sort @$rv ] if $opts->{sorted};
269	21					42	$rv;
270	7					26	};
271							}
272
273							# iterator that manages the trie list
274							sub _iterator {
275	49			49		39	my ( $tries, $counts, $opts ) = @_;
276	49					28	my $total = 0;
277	49					81	$total += $_ for @$counts[@indices];
278	49					51	my @t = @$tries;
279	49					28	my $i;
280							sub {
281	154			154		92	my $rv;
282							{
283	154	100				87	unless ($i) {
	199					231
284	96	100				100	if (@t) {
285	49					33	my $pair = shift @t;
286	49					52	local ( $trie, $known ) = @$pair;
287	49	50				44	redo unless _all_known($counts);
288	49					51	my $words = _words_in( $counts, $total );
289	49	50				52	redo unless _worth_pursuing( $counts, $words );
290	49					52	$i = _sub_iterator( $tries, $words, $opts );
291							}
292							else {
293	47					36	return $rv;
294							}
295							}
296	152					132	$rv = $i->();
297	152	100				179	unless ($rv) {
298	45					98	undef $i;
299	45					25	redo;
300							}
301							}
302	107					95	$rv;
303	49					121	};
304							}
305
306							# iterator that actually walks tries looking for anagrams
307							sub _sub_iterator {
308	49			49		29	my ( $tries, $words, $opts ) = @_;
309	49					54	my @pairs = @$words;
310							sub {
311							{
312	152	100		152		87	return unless @pairs;
	237					275
313	192	100				220	if ( $opts->{random} ) {
314	112					151	my $i = int rand scalar @pairs;
315	112	100				133	if ($i) {
316	44					27	my $p = $pairs[0];
317	44					34	$pairs[0] = $pairs[$i];
318	44					33	$pairs[$i] = $p;
319							}
320							}
321	192					106	my ( $w, $s ) = @{ $pairs[0] };
	192					160
322	192	100				260	unless ( ref $s eq 'CODE' ) {
323	89	100				80	if ( _any($s) ) {
324	42					46	$s = _iterator( $tries, $s, $opts );
325							}
326							else {
327	47					42	my $next = [];
328							$s = sub {
329	92					63	my $rv = $next;
330	92					53	undef $next;
331	92					67	$rv;
332	47					82	};
333							}
334	89					81	$pairs[0][1] = $s;
335							}
336	192					169	my $remainder = $s->();
337	192	100				228	unless ($remainder) {
338	85					57	shift @pairs;
339	85					153	redo;
340							}
341	107					127	[ $w, @$remainder ];
342							}
343	49					154	};
344							}
345
346							# all character counts decremented
347							sub _worth_pursuing {
348	49			49		32	my ( $counts, $words ) = @_;
349
350	49					28	my $c;
351
352							# if any letter count didn't change, there's no hope
353	49					41	OUTER: for my $i (@indices) {
354	137	100				166	next unless $c = $counts->[$i];
355	68					62	for (@$words) {
356	89	100				131	next OUTER if $_->[1][$i] < $c;
357							}
358	0					0	return;
359							}
360	49					70	return 1;
361							}
362
363							sub _indices {
364	15			15		11	my $counts = shift;
365	15					8	my @indices;
366	15					19	for my $i ( 0 .. $#$counts ) {
367	85	100				102	push @indices, $i if $counts->[$i];
368							}
369	15					20	return @indices;
370							}
371
372							sub _jumps {
373	15			15		9	my $counts = shift;
374	15					29	my @jumps = (0) x @$counts;
375	15					12	my $j = 0;
376	15					16	while ( my $n = _next_jump( $counts, $j ) ) {
377	36					25	$jumps[$j] = $n;
378	36					38	$j = $n;
379							}
380	15					10	$jumps[-1] = -1;
381	15					31	return @jumps;
382							}
383
384							sub _next_jump {
385	51			51		45	my ( $counts, $j ) = @_;
386	51					64	for my $i ( $j + 1 .. $#$counts ) {
387	70	100				115	return $i if $counts->[$i];
388							}
389	15					24	return;
390							}
391
392							sub _clean {
393	61			61		69	$_[0] =~ s/\W+//g;
394	61					56	$_[0] = lc $_[0];
395							}
396
397							sub _all_known {
398	57			57		44	my $counts = shift;
399	57	50				79	return if @$counts > @$known;
400	57					69	for my $i ( 0 .. $#$counts ) {
401	265	50	66			488	return if $counts->[$i] && !$known->[$i];
402							}
403	57					81	return 1;
404							}
405
406
407							sub key {
408	6			6	1	7073	my ( $self, $phrase ) = @_;
409	6					11	$self->{clean}->($phrase);
410	6					7	my $offset = $self->{offset};
411	6					3	my ( @counts, $lowest );
412	6					13	for my $c ( map { ord($_) - $offset } split //, $phrase ) {
	18					21
413	18	100				22	if ( defined $lowest ) {
414	12	100				16	$lowest = $c if $c < $lowest;
415							}
416							else {
417	6					5	$lowest = $c;
418							}
419	18					18	$counts[$c]++;
420							}
421	6					24	@counts = @counts[ $lowest .. $#counts ];
422	6		100			64	$_ //= '' for @counts;
423	6					19	my $suffix = join '.', @counts;
424	6					24	$suffix =~ s/\.(\.+)\./'('.length($1).')'/ge;
	7					16
425	6					26	return "$lowest:$suffix";
426							}
427
428
429							sub lists {
430	1			1	1	2	my $self = shift;
431	1					1	return scalar @{ $self->{tries} };
	1					3
432							}
433
434							sub _counts {
435	15			15		13	my ( $phrase, $offset ) = @_;
436	15					10	my @counts;
437	15					27	for my $c ( map { ord($_) - $offset } split //, $phrase ) {
	42					48
438	42					40	$counts[$c]++;
439							}
440	15		100			159	$_ //= 0 for @counts;
441	15					18	\@counts;
442							}
443
444							sub _any {
445	125			125		68	for ( @{ $_[0] } ) {
	125					159
446	503	100				583	return 1 if $_;
447							}
448	66					77	'';
449							}
450
451							sub _anagramize {
452	25			25		19	my $counts = shift;
453	25					16	my $total = 0;
454	25					44	$total += $_ for @$counts[@indices];
455	25					13	my $key;
456	25	50				31	if ( $total <= $limit ) {
457	25					43	$key = join ',', @$counts[@indices];
458	25					22	my $cached = $cache{$key};
459	25	100				36	return @$cached if $cached;
460							}
461	22					15	my @anagrams;
462	22					22	my $words = _words_in( $counts, $total );
463	22	50				24	if ( _all_touched( $counts, $words ) ) {
464	22					20	for (@$words) {
465	36					33	my ( $word, $c ) = @$_;
466	36	100				35	if ( _any($c) ) {
467	17					21	push @anagrams, [ $word, @$_ ] for _anagramize($c);
468							}
469							else {
470	19					27	push @anagrams, [$word];
471							}
472							}
473	22					18	my %seen;
474							@anagrams = grep {
475	22					14	!$seen{ join ' ', sort { $a <=> $b } @$_ }++
	39					90
	28					56
476							} @anagrams;
477							}
478	22	50				50	$cache{$key} = \@anagrams if $key;
479	22					63	@anagrams;
480							}
481
482							sub _all_touched {
483	22			22		20	my ( $counts, $words ) = @_;
484
485	22					10	my $c;
486
487	22					14	my ( @tallies, @good_indices );
488	22					25	for (@$words) {
489	52					27	my $wc = $_->[1];
490	52					44	for (@indices) {
491	140	100				155	next unless $c = $counts->[$_];
492	114		66			151	$good_indices[$_] //= $_;
493	114	100				161	$tallies[$_]++ if $wc->[$_] < $c;
494							}
495							}
496
497							# if any letter count didn't change, there's no hope
498	22	50				28	return unless @good_indices;
499	22					41	for (@good_indices) {
500	109	100				154	next unless $_;
501	36	50				43	return unless $tallies[$_];
502							}
503
504							# find the letter with the fewest possibilities
505	22					12	my ( $best, $min, $n );
506	22					22	for (@good_indices) {
507	109	100				139	next unless $_;
508	36					22	$n = $tallies[$_];
509	36	100	66			75	if ( !$best \|\| $n < $min ) {
510	22					13	$best = $_;
511	22					20	$min = $n;
512							}
513							}
514
515							# we only need consider all the branches which affected a
516							# particular letter; we will find all possibilities in their
517							# ramifications
518	22					13	$c = $counts->[$best];
519	22					21	@$words = grep { $_->[1][$best] < $c } @$words;
	52					72
520	22					37	return 1;
521							}
522
523							1;
524
525							__END__