line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Text::Affixes; |
2
|
|
|
|
|
|
|
|
3
|
3
|
|
|
3
|
|
101286
|
use 5.006; |
|
3
|
|
|
|
|
12
|
|
|
3
|
|
|
|
|
117
|
|
4
|
3
|
|
|
3
|
|
15
|
use strict; |
|
3
|
|
|
|
|
6
|
|
|
3
|
|
|
|
|
103
|
|
5
|
3
|
|
|
3
|
|
15
|
use warnings; |
|
3
|
|
|
|
|
9
|
|
|
3
|
|
|
|
|
127
|
|
6
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
require Exporter; |
8
|
3
|
|
|
3
|
|
3589
|
use AutoLoader qw(AUTOLOAD); |
|
3
|
|
|
|
|
7334
|
|
|
3
|
|
|
|
|
18
|
|
9
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
our @ISA = qw(Exporter); |
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
our %EXPORT_TAGS = ( 'all' => [ qw( |
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
) ] ); |
15
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
our @EXPORT_OK = ( @{ $EXPORT_TAGS{'all'} } ); |
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
our @EXPORT = qw( |
19
|
|
|
|
|
|
|
get_prefixes |
20
|
|
|
|
|
|
|
get_suffixes |
21
|
|
|
|
|
|
|
); |
22
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
our $VERSION = '0.07'; |
24
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
=head1 NAME |
26
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
Text::Affixes - Prefixes and suffixes analisys of text |
28
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
=head1 SYNOPSIS |
30
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
use Text::Affixes; |
32
|
|
|
|
|
|
|
my $text = "Hello, world. Hello, big world."; |
33
|
|
|
|
|
|
|
my $prefixes = get_prefixes($text); |
34
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
# $prefixes now holds |
36
|
|
|
|
|
|
|
# { |
37
|
|
|
|
|
|
|
# 3 => { |
38
|
|
|
|
|
|
|
# 'Hel' => 2, |
39
|
|
|
|
|
|
|
# 'wor' => 2, |
40
|
|
|
|
|
|
|
# } |
41
|
|
|
|
|
|
|
# } |
42
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
# or |
44
|
|
|
|
|
|
|
|
45
|
|
|
|
|
|
|
$prefixes = get_prefixes({min => 1, max => 2},$text); |
46
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
# $prefixes now holds |
48
|
|
|
|
|
|
|
# { |
49
|
|
|
|
|
|
|
# 1 => { |
50
|
|
|
|
|
|
|
# 'H' => 2, |
51
|
|
|
|
|
|
|
# 'w' => 2, |
52
|
|
|
|
|
|
|
# 'b' => 1, |
53
|
|
|
|
|
|
|
# }, |
54
|
|
|
|
|
|
|
# 2 => { |
55
|
|
|
|
|
|
|
# 'He' => 2, |
56
|
|
|
|
|
|
|
# 'wo' => 2, |
57
|
|
|
|
|
|
|
# 'bi' => 1, |
58
|
|
|
|
|
|
|
# } |
59
|
|
|
|
|
|
|
# } |
60
|
|
|
|
|
|
|
|
61
|
|
|
|
|
|
|
# the use for get_suffixes is similar |
62
|
|
|
|
|
|
|
|
63
|
|
|
|
|
|
|
=head1 DESCRIPTION |
64
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
Provides methods for prefixe and suffix analisys of text. |
66
|
|
|
|
|
|
|
|
67
|
|
|
|
|
|
|
=head1 METHODS |
68
|
|
|
|
|
|
|
|
69
|
|
|
|
|
|
|
=head2 get_prefixes |
70
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
Extracts prefixes from text. You can specify the minimum and maximum |
72
|
|
|
|
|
|
|
number of characters of prefixes you want. |
73
|
|
|
|
|
|
|
|
74
|
|
|
|
|
|
|
Returns a reference to a hash, where the specified limits are mapped |
75
|
|
|
|
|
|
|
in hashes; each of those hashes maps every prefix in the text into the |
76
|
|
|
|
|
|
|
number of times it was found. |
77
|
|
|
|
|
|
|
|
78
|
|
|
|
|
|
|
By default, both minimum and maximum limits are 3. If the minimum |
79
|
|
|
|
|
|
|
limit is greater than the lower one, an empty hash is returned. |
80
|
|
|
|
|
|
|
|
81
|
|
|
|
|
|
|
A prefix is considered to be a sequence of word characters (\w) in |
82
|
|
|
|
|
|
|
the beginning of a word (that is, after a word boundary) that does not |
83
|
|
|
|
|
|
|
reach the end of the word ("regular expressionly", a prefix is the $1 |
84
|
|
|
|
|
|
|
of /\b(\w+)\w/). |
85
|
|
|
|
|
|
|
|
86
|
|
|
|
|
|
|
# extracting prefixes of size 3 |
87
|
|
|
|
|
|
|
$prefixes = get_prefixes( $text ); |
88
|
|
|
|
|
|
|
|
89
|
|
|
|
|
|
|
# extracting prefixes of sizes 2 and 3 |
90
|
|
|
|
|
|
|
$prefixes = get_prefixes( {min => 2}, $text ); |
91
|
|
|
|
|
|
|
|
92
|
|
|
|
|
|
|
# extracting prefixes of sizes 3 and 4 |
93
|
|
|
|
|
|
|
$prefixes = get_prefixes( {max => 4}, $text ); |
94
|
|
|
|
|
|
|
|
95
|
|
|
|
|
|
|
# extracting prefixes of sizes 2, 3 and 4 |
96
|
|
|
|
|
|
|
$prefixes = get_prefixes( {min => 2, max=> 4}, $text); |
97
|
|
|
|
|
|
|
|
98
|
|
|
|
|
|
|
=cut |
99
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
sub get_prefixes { |
101
|
11
|
|
|
11
|
1
|
1721
|
return _get_elements(1,@_); |
102
|
|
|
|
|
|
|
} |
103
|
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
=head2 get_suffixes |
105
|
|
|
|
|
|
|
|
106
|
|
|
|
|
|
|
The get_suffixes function is similar to the get_prefixes one. You |
107
|
|
|
|
|
|
|
should read the documentation for that one and than come back to this |
108
|
|
|
|
|
|
|
point. |
109
|
|
|
|
|
|
|
|
110
|
|
|
|
|
|
|
A suffix is considered to be a sequence of word characters (\w) in |
111
|
|
|
|
|
|
|
the end of a word (that is, before a word boundary) that does not start |
112
|
|
|
|
|
|
|
at the beginning of the word ("regular expressionly" speaking, a |
113
|
|
|
|
|
|
|
prefix is the $1 of /\w(\w+)\b/). |
114
|
|
|
|
|
|
|
|
115
|
|
|
|
|
|
|
# extracting suffixes of size 3 |
116
|
|
|
|
|
|
|
$suffixes = get_suffixes( $text ); |
117
|
|
|
|
|
|
|
|
118
|
|
|
|
|
|
|
# extracting suffixes of sizes 2 and 3 |
119
|
|
|
|
|
|
|
$suffixes = get_suffixes( {min => 2}, $text ); |
120
|
|
|
|
|
|
|
|
121
|
|
|
|
|
|
|
# extracting suffixes of sizes 3 and 4 |
122
|
|
|
|
|
|
|
$suffixes = get_suffixes( {max => 4}, $text ); |
123
|
|
|
|
|
|
|
|
124
|
|
|
|
|
|
|
# extracting suffixes of sizes 2, 3 and 4 |
125
|
|
|
|
|
|
|
$suffixes = get_suffixes( {min => 2, max=> 4}, $text); |
126
|
|
|
|
|
|
|
|
127
|
|
|
|
|
|
|
=cut |
128
|
|
|
|
|
|
|
|
129
|
|
|
|
|
|
|
sub get_suffixes { |
130
|
7
|
|
|
7
|
1
|
188
|
return _get_elements(0,@_); |
131
|
|
|
|
|
|
|
} |
132
|
|
|
|
|
|
|
|
133
|
|
|
|
|
|
|
sub _get_elements { |
134
|
18
|
|
|
18
|
|
29
|
my $task = shift; |
135
|
|
|
|
|
|
|
|
136
|
|
|
|
|
|
|
=head1 OPTIONS |
137
|
|
|
|
|
|
|
|
138
|
|
|
|
|
|
|
Apart from deciding on a minimum and maximum size for prefixes or suffixes, you |
139
|
|
|
|
|
|
|
can also decide on some configuration options. |
140
|
|
|
|
|
|
|
|
141
|
|
|
|
|
|
|
=cut |
142
|
|
|
|
|
|
|
|
143
|
|
|
|
|
|
|
# configuration |
144
|
18
|
|
|
|
|
72
|
my %conf = ( min => 3, |
145
|
|
|
|
|
|
|
max => 3, |
146
|
|
|
|
|
|
|
exclude_numbers => 1, |
147
|
|
|
|
|
|
|
lowercase => 0, |
148
|
|
|
|
|
|
|
); |
149
|
18
|
100
|
|
|
|
61
|
if (ref $_[0] eq 'HASH') { |
150
|
15
|
|
|
|
|
35
|
%conf = (%conf, %{+shift}); |
|
15
|
|
|
|
|
102
|
|
151
|
|
|
|
|
|
|
} |
152
|
18
|
100
|
|
|
|
70
|
return {} if $conf{max} < $conf{min}; |
153
|
|
|
|
|
|
|
|
154
|
|
|
|
|
|
|
# get the elements |
155
|
17
|
|
|
|
|
21
|
my %elements; |
156
|
17
|
|
100
|
|
|
49
|
my $text = shift || return undef; |
157
|
16
|
100
|
|
|
|
40
|
$conf{min} = 1 if $conf{min} < 1; |
158
|
16
|
|
|
|
|
55
|
for ($conf{min} .. $conf{max}) { |
159
|
|
|
|
|
|
|
|
160
|
17
|
100
|
|
|
|
412
|
my $regex = $task ? qr/\b(\w{$_})\w/ : # prefixes |
161
|
|
|
|
|
|
|
qr/\w(\w{$_})\b/ ; # suffixes |
162
|
|
|
|
|
|
|
|
163
|
17
|
|
|
|
|
233
|
while ($text =~ /$regex/g) { |
164
|
52
|
|
|
|
|
358
|
$elements{$_}{$1}++; |
165
|
|
|
|
|
|
|
} |
166
|
|
|
|
|
|
|
|
167
|
|
|
|
|
|
|
} |
168
|
|
|
|
|
|
|
|
169
|
|
|
|
|
|
|
=head2 exclude_numbers |
170
|
|
|
|
|
|
|
|
171
|
|
|
|
|
|
|
Set to 0 if you consider numbers as part of words. Default value is 1. |
172
|
|
|
|
|
|
|
|
173
|
|
|
|
|
|
|
# this |
174
|
|
|
|
|
|
|
get_suffixes( {min => 1, max => 1, exclude_numbers => 0}, "Hello, but w8" ); |
175
|
|
|
|
|
|
|
|
176
|
|
|
|
|
|
|
# returns this: |
177
|
|
|
|
|
|
|
{ |
178
|
|
|
|
|
|
|
1 => { |
179
|
|
|
|
|
|
|
'o' => 1, |
180
|
|
|
|
|
|
|
't' => 1, |
181
|
|
|
|
|
|
|
'8' => 1 |
182
|
|
|
|
|
|
|
} |
183
|
|
|
|
|
|
|
} |
184
|
|
|
|
|
|
|
|
185
|
|
|
|
|
|
|
=cut |
186
|
|
|
|
|
|
|
|
187
|
|
|
|
|
|
|
# exclude elements containing numbers |
188
|
16
|
100
|
|
|
|
43
|
if ($conf{exclude_numbers}) { |
189
|
14
|
|
|
|
|
39
|
for my $s (keys %elements) { |
190
|
15
|
|
|
|
|
20
|
for (keys %{$elements{$s}}) { |
|
15
|
|
|
|
|
44
|
|
191
|
34
|
100
|
|
|
|
183
|
delete ${$elements{$s}}{$_} if /\d/; |
|
4
|
|
|
|
|
19
|
|
192
|
|
|
|
|
|
|
} |
193
|
|
|
|
|
|
|
} |
194
|
|
|
|
|
|
|
} |
195
|
|
|
|
|
|
|
|
196
|
|
|
|
|
|
|
=head2 lowercase |
197
|
|
|
|
|
|
|
|
198
|
|
|
|
|
|
|
Set to 1 to extract all prefixes in lowercase mode. Default value is 0. |
199
|
|
|
|
|
|
|
|
200
|
|
|
|
|
|
|
ATTENTION: This does not mean that prefixes with uppercased characters won't be |
201
|
|
|
|
|
|
|
extracted. It means they will be extracted after being lowercased. |
202
|
|
|
|
|
|
|
|
203
|
|
|
|
|
|
|
# this... |
204
|
|
|
|
|
|
|
get_prefixes( {min => 2, max => 2, lowercase => 1}, "Hello, hello"); |
205
|
|
|
|
|
|
|
|
206
|
|
|
|
|
|
|
# returns this: |
207
|
|
|
|
|
|
|
{ |
208
|
|
|
|
|
|
|
2 => { |
209
|
|
|
|
|
|
|
'he' => 2 |
210
|
|
|
|
|
|
|
} |
211
|
|
|
|
|
|
|
} |
212
|
|
|
|
|
|
|
|
213
|
|
|
|
|
|
|
=cut |
214
|
|
|
|
|
|
|
|
215
|
|
|
|
|
|
|
# elements containing uppercased characters become lowercased ones |
216
|
16
|
100
|
|
|
|
45
|
if ($conf{lowercase}) { |
217
|
4
|
|
|
|
|
12
|
for my $s (keys %elements) { |
218
|
4
|
|
|
|
|
7
|
for (keys %{$elements{$s}}) { |
|
4
|
|
|
|
|
12
|
|
219
|
6
|
100
|
|
|
|
43
|
if (/[A-Z]/) { |
220
|
4
|
|
|
|
|
14
|
${$elements{$s}}{lc $_} += |
|
4
|
|
|
|
|
21
|
|
221
|
4
|
|
|
|
|
7
|
${$elements{$s}}{$_}; |
222
|
4
|
|
|
|
|
8
|
delete ${$elements{$s}}{$_}; |
|
4
|
|
|
|
|
14
|
|
223
|
|
|
|
|
|
|
} |
224
|
|
|
|
|
|
|
} |
225
|
|
|
|
|
|
|
} |
226
|
|
|
|
|
|
|
} |
227
|
|
|
|
|
|
|
|
228
|
16
|
|
|
|
|
160
|
return \%elements; |
229
|
|
|
|
|
|
|
} |
230
|
|
|
|
|
|
|
|
231
|
|
|
|
|
|
|
1; |
232
|
|
|
|
|
|
|
__END__ |