File Coverage

blib/lib/WordLists/Sort/Typical.pm

Criterion	Covered	Total	%
statement	65	66	98.4
branch			n/a
condition			n/a
subroutine	30	31	96.7
pod	6	7	85.7
total	101	104	97.1

line	stmt	sub	pod	time	code
1					package WordLists::Sort::Typical;
2	2	2		27367	use utf8;
	2			4
	2			13
3	2	2		43	use strict;
	2			2
	2			62
4	2	2		10	use warnings;
	2			5
	2			52
5	2	2		884	use Unicode::Normalize; #provides NFD
	2			2409
	2			220
6	2	2		516	use WordLists::Sort qw( atomic_compare complex_compare);
	2			4
	2			126
7	2	2		11	use WordLists::Base;
	2			3
	2			576
8					our $VERSION = $WordLists::Base::VERSION;
9					our $AUTOLOAD;
10					require Exporter;
11					our @ISA = qw (Exporter);
12					our @EXPORT = ();
13					our @EXPORT_OK = qw(
14					cmp_alnum
15					cmp_alnum_only
16					cmp_accnum
17					cmp_accnum_only
18					cmp_ver
19					cmp_dict
20					);
21	37	37		79	sub _cmp { $_[0] cmp $_[1] };
22
23					sub cmp_dict
24					{
25	36	36	1	71	my $norm_remove_the = sub{$_[0] =~ s/^the\s+//;$_[0];};
	36	7		71
	7			28
26	12			23	complex_compare (
27					$_[0], $_[1],
28					{
29					functions =>
30					[
31					{
32					n => $norm_remove_the,
33					t=>[
34					{
35					re=>qr/.+/,
36					c => \&cmp_accnum_only,
37					},
38					],
39					},
40					{
41	12	12		15	n => sub{&{$norm_remove_the}($_[0]); $_[0] =~ s/[^\p{Script: Latin}0-9]/ /g; $_[0] =~ s/[a-z\p{Lowercase}]/ /g; $_[0];},
	12	1		28
	12			48
	12			34
	1			4
	1			1
	1			12
42	2			5	c => \&_cmp,
43					},
44					{
45					n => $norm_remove_the,
46					t=>[
47					{
48					re=>qr/.+/,
49					c => \&cmp_alnum_only,
50					},
51					],
52					},
53					{
54	2	2		4	n => sub{&{$norm_remove_the}($_[0]); $_[0] =~ s/[^\p{Script: Latin}0-9]/ /g; $_[0] =~ s/[a-zA-Z]/ /g; $_[0];},
	2			6
	2			12
	2			7
55	2			5	c => \&_cmp,
56					},
57					{
58	2	2		4	n => sub{&{$norm_remove_the}($_[0]); $_[0] =~ s/[\P{Uppercase}]/ /g; $_[0];},
	2			13
	2			6
59					c => \&_cmp,
60					},
61					{
62	2	2		4	n => sub { &{$norm_remove_the}($_[0]); $_[0]=~ s/[^\p{Script: Latin}0-9]//g; $_[0];},
	2			5
	2			4
	2			7
63					c => \&_cmp,
64					},
65					{
66					n => $norm_remove_the,
67					t=>[
68					{
69					re=>qr/.+/,
70					c => \&cmp_alpha,
71					},
72					],
73					},
74					{
75	8	8		22	c => sub { ($_[0] =~ s/^the\s+//) cmp ($_[1] =~ s/^the\s+//) },
76					},
77	7			201	]
78					}
79					);
80					}
81
82					sub cmp_alnum
83					{
84					atomic_compare (
85					$_[0], $_[1],
86					{
87	12	12		39	n => sub { lc $_[0];},
88					t =>
89					[
90					{
91					re => qr/[0-9]+/,
92	3	3		14	c => sub { $_[0] <=> $_[1]; }
93					},
94	6	6	1	1398	],
95					}
96					);
97					}
98					sub cmp_alpha
99					{
100					atomic_compare (
101					$_[0], $_[1],
102					{
103	0	0		0	n => sub { lc $_[0];},
104					}
105	1	1	0	8	);
106					}
107					sub cmp_alnum_only
108					{
109					atomic_compare (
110					$_[0], $_[1],
111					{
112					n => sub {
113	8	8		19	$_[0] =~ s/[^\p{Script: Latin}0-9]//g;
114	8			22	$_[0];
115					},
116	5	5	1	129	t=>[
117					{
118					re=>qr/.+/,
119					c => \&cmp_alnum,
120					}
121					],
122					}
123					);
124					}
125					sub cmp_accnum_only
126					{
127					atomic_compare (
128					$_[0], $_[1],
129					{
130					n => sub {
131	18	18		50	$_[0] =~ s/[^\p{Script: Latin}0-9]//g;
132	18			48	$_[0];
133					},
134	10	10	1	103	t=>[
135					{
136					re=>qr/.+/,
137					c => \&cmp_accnum,
138					}
139					],
140					}
141					);
142					}
143					sub cmp_accnum
144					{
145					atomic_compare (
146					$_[0], $_[1],
147					{
148					n => sub {
149	30	30		161	$_[0] = NFD ($_[0]);
150	30			117	$_[0] =~ s/\pM//g;
151	30			99	lc $_[0];
152					},
153					t =>
154					[
155					{
156					re => qr/[0-9]+/,
157	1	1		4	c => sub { $_[0] <=> $_[1]; }
158					},
159	15	15	1	166	],
160					}
161					);
162					}
163
164					sub cmp_ver # compares version strings: anything that is not alphanumeric is a separator and doesn't have any preference
165					# 1.1 = 1:1
166					# 1.1.a < 1.1a < 1.1.1
167					{
168	26	26		54	atomic_compare (
169					$_[0], $_[1],
170					{
171	26			66	n => sub {$_[0] =~ s/^v//i; $_[0];},
172					t =>
173					[
174					{
175					re => qr/[0-9]+/,
176	25	25		71	c => sub { $_[0] <=> $_[1]; }
177					},
178					{
179					re => qr/[a-zA-Z]+/,
180	4	4		13	c => sub { lc $_[0] cmp lc $_[1]; }
181					},
182					{
183	14	14	1	258	re => qr/[^a-zA-Z0-9]+/,
184					c => 0
185					},
186
187
188					],
189					}
190					);
191					}
192
193
194
195					1;
196
197
198					=pod
199
200					=head1 NAME
201
202					WordLists::Sort::Typical
203
204					=head1 SYNOPSIS
205
206					'A14' cmp 'A2'; # sadly returns -1, so instead do this:
207					use WordLists::Sort::Typical qw(cmp_alnum);
208					cmp_alnum('A14', 'A2'); # returns 1
209
210					=head1 DESCRIPTION
211
212					This provides functions for sorting text.
213
214					=head3 cmp_alnum
215
216					Compares alphanumeric values sensibly, e.g. "Unit 10" comes after "Unit 9", not before "Unit 2". Case-insensitive.
217
218					=head3 cmp_alnum_only
219
220					Compares alphanumeric values sensibly as C, but ignores all values except alphanumeric characters, so "re-factor" sorts with "refactor", not between "re" and "react". Case-insensitive.
221
222					=head3 cmp_accnum
223
224					Compares alphanumeric values sensibly as C, and considers accented characters to be equivalent to unaccented characters, so "café" sorts with "cafe", not after "caftan".
225
226					=head3 cmp_accnum_only
227
228					Compares alphanumeric values sensibly and accent-insensitively as C, and ignores non-alphanumeric content like C
229
230					=head3 cmp_ver
231
232					Compares version numbers sensibly, even if they are of the form "v1.0028_01a".
233
234					=head3 cmp_dict
235
236					This uses a C, the first stage being C. Strings which are still equal are progressively sorted with tie-breakers so that order is reliable. Strings beginning "the " are sorted identically, except at the end, when strings without "the " have preference.
237
238					=over
239
240					=item *
241
242					Case - uppercase comes after lowercase.
243
244					=item *
245
246					Accents - uppercase comes after lowercase.
247
248					=item *
249
250					Non-alphanumeric characters - these are sorted, ignoring other intervening characters.
251
252					=item *
253
254					Definite article - if the strings are otherwise identical, a string beginning "the " comes after a string not beginning "the "
255
256					=back
257
258					=head1 BUGS
259
260					Please use the Github issues tracker.
261
262					=head1 LICENSE
263
264					Copyright 2011-2012 © Cambridge University Press. This module is free software; you can redistribute it and/or modify it under the same terms as Perl itself.
265
266					=cut