File Coverage

blib/lib/Text/Affixes.pm

Criterion	Covered	Total	%
statement	44	44	100.0
branch	16	16	100.0
condition	2	2	100.0
subroutine	7	7	100.0
pod	2	2	100.0
total	71	71	100.0

line	stmt	bran	cond	sub	pod	time	code
1							package Text::Affixes;
2
3	3			3		101286	use 5.006;
	3					12
	3					117
4	3			3		15	use strict;
	3					6
	3					103
5	3			3		15	use warnings;
	3					9
	3					127
6
7							require Exporter;
8	3			3		3589	use AutoLoader qw(AUTOLOAD);
	3					7334
	3					18
9
10							our @ISA = qw(Exporter);
11
12							our %EXPORT_TAGS = ( 'all' => [ qw(
13
14							) ] );
15
16							our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } );
17
18							our @EXPORT = qw(
19							get_prefixes
20							get_suffixes
21							);
22
23							our $VERSION = '0.07';
24
25							=head1 NAME
26
27							Text::Affixes - Prefixes and suffixes analisys of text
28
29							=head1 SYNOPSIS
30
31							use Text::Affixes;
32							my $text = "Hello, world. Hello, big world.";
33							my $prefixes = get_prefixes($text);
34
35							# $prefixes now holds
36							# {
37							# 3 => {
38							# 'Hel' => 2,
39							# 'wor' => 2,
40							# }
41							# }
42
43							# or
44
45							$prefixes = get_prefixes({min => 1, max => 2},$text);
46
47							# $prefixes now holds
48							# {
49							# 1 => {
50							# 'H' => 2,
51							# 'w' => 2,
52							# 'b' => 1,
53							# },
54							# 2 => {
55							# 'He' => 2,
56							# 'wo' => 2,
57							# 'bi' => 1,
58							# }
59							# }
60
61							# the use for get_suffixes is similar
62
63							=head1 DESCRIPTION
64
65							Provides methods for prefixe and suffix analisys of text.
66
67							=head1 METHODS
68
69							=head2 get_prefixes
70
71							Extracts prefixes from text. You can specify the minimum and maximum
72							number of characters of prefixes you want.
73
74							Returns a reference to a hash, where the specified limits are mapped
75							in hashes; each of those hashes maps every prefix in the text into the
76							number of times it was found.
77
78							By default, both minimum and maximum limits are 3. If the minimum
79							limit is greater than the lower one, an empty hash is returned.
80
81							A prefix is considered to be a sequence of word characters (\w) in
82							the beginning of a word (that is, after a word boundary) that does not
83							reach the end of the word ("regular expressionly", a prefix is the $1
84							of /\b(\w+)\w/).
85
86							# extracting prefixes of size 3
87							$prefixes = get_prefixes( $text );
88
89							# extracting prefixes of sizes 2 and 3
90							$prefixes = get_prefixes( {min => 2}, $text );
91
92							# extracting prefixes of sizes 3 and 4
93							$prefixes = get_prefixes( {max => 4}, $text );
94
95							# extracting prefixes of sizes 2, 3 and 4
96							$prefixes = get_prefixes( {min => 2, max=> 4}, $text);
97
98							=cut
99
100							sub get_prefixes {
101	11			11	1	1721	return _get_elements(1,@_);
102							}
103
104							=head2 get_suffixes
105
106							The get_suffixes function is similar to the get_prefixes one. You
107							should read the documentation for that one and than come back to this
108							point.
109
110							A suffix is considered to be a sequence of word characters (\w) in
111							the end of a word (that is, before a word boundary) that does not start
112							at the beginning of the word ("regular expressionly" speaking, a
113							prefix is the $1 of /\w(\w+)\b/).
114
115							# extracting suffixes of size 3
116							$suffixes = get_suffixes( $text );
117
118							# extracting suffixes of sizes 2 and 3
119							$suffixes = get_suffixes( {min => 2}, $text );
120
121							# extracting suffixes of sizes 3 and 4
122							$suffixes = get_suffixes( {max => 4}, $text );
123
124							# extracting suffixes of sizes 2, 3 and 4
125							$suffixes = get_suffixes( {min => 2, max=> 4}, $text);
126
127							=cut
128
129							sub get_suffixes {
130	7			7	1	188	return _get_elements(0,@_);
131							}
132
133							sub _get_elements {
134	18			18		29	my $task = shift;
135
136							=head1 OPTIONS
137
138							Apart from deciding on a minimum and maximum size for prefixes or suffixes, you
139							can also decide on some configuration options.
140
141							=cut
142
143							# configuration
144	18					72	my %conf = ( min => 3,
145							max => 3,
146							exclude_numbers => 1,
147							lowercase => 0,
148							);
149	18	100				61	if (ref $_[0] eq 'HASH') {
150	15					35	%conf = (%conf, %{+shift});
	15					102
151							}
152	18	100				70	return {} if $conf{max} < $conf{min};
153
154							# get the elements
155	17					21	my %elements;
156	17		100			49	my $text = shift \|\| return undef;
157	16	100				40	$conf{min} = 1 if $conf{min} < 1;
158	16					55	for ($conf{min} .. $conf{max}) {
159
160	17	100				412	my $regex = $task ? qr/\b(\w{$_})\w/ : # prefixes
161							qr/\w(\w{$_})\b/ ; # suffixes
162
163	17					233	while ($text =~ /$regex/g) {
164	52					358	$elements{$_}{$1}++;
165							}
166
167							}
168
169							=head2 exclude_numbers
170
171							Set to 0 if you consider numbers as part of words. Default value is 1.
172
173							# this
174							get_suffixes( {min => 1, max => 1, exclude_numbers => 0}, "Hello, but w8" );
175
176							# returns this:
177							{
178							1 => {
179							'o' => 1,
180							't' => 1,
181							'8' => 1
182							}
183							}
184
185							=cut
186
187							# exclude elements containing numbers
188	16	100				43	if ($conf{exclude_numbers}) {
189	14					39	for my $s (keys %elements) {
190	15					20	for (keys %{$elements{$s}}) {
	15					44
191	34	100				183	delete ${$elements{$s}}{$_} if /\d/;
	4					19
192							}
193							}
194							}
195
196							=head2 lowercase
197
198							Set to 1 to extract all prefixes in lowercase mode. Default value is 0.
199
200							ATTENTION: This does not mean that prefixes with uppercased characters won't be
201							extracted. It means they will be extracted after being lowercased.
202
203							# this...
204							get_prefixes( {min => 2, max => 2, lowercase => 1}, "Hello, hello");
205
206							# returns this:
207							{
208							2 => {
209							'he' => 2
210							}
211							}
212
213							=cut
214
215							# elements containing uppercased characters become lowercased ones
216	16	100				45	if ($conf{lowercase}) {
217	4					12	for my $s (keys %elements) {
218	4					7	for (keys %{$elements{$s}}) {
	4					12
219	6	100				43	if (/[A-Z]/) {
220	4					14	${$elements{$s}}{lc $_} +=
	4					21
221	4					7	${$elements{$s}}{$_};
222	4					8	delete ${$elements{$s}}{$_};
	4					14
223							}
224							}
225							}
226							}
227
228	16					160	return \%elements;
229							}
230
231							1;
232							__END__