line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Text::Similarity; |
2
|
|
|
|
|
|
|
|
3
|
3
|
|
|
3
|
|
69995
|
use 5.006; |
|
3
|
|
|
|
|
10
|
|
|
3
|
|
|
|
|
110
|
|
4
|
3
|
|
|
3
|
|
18
|
use strict; |
|
3
|
|
|
|
|
5
|
|
|
3
|
|
|
|
|
96
|
|
5
|
3
|
|
|
3
|
|
15
|
use warnings; |
|
3
|
|
|
|
|
9
|
|
|
3
|
|
|
|
|
104
|
|
6
|
|
|
|
|
|
|
|
7
|
3
|
|
|
3
|
|
16
|
use constant COMPFILE => "compfile"; |
|
3
|
|
|
|
|
6
|
|
|
3
|
|
|
|
|
283
|
|
8
|
3
|
|
|
3
|
|
14
|
use constant STEM => "stem"; |
|
3
|
|
|
|
|
5
|
|
|
3
|
|
|
|
|
182
|
|
9
|
3
|
|
|
3
|
|
16
|
use constant VERBOSE => "verbose"; |
|
3
|
|
|
|
|
6
|
|
|
3
|
|
|
|
|
128
|
|
10
|
3
|
|
|
3
|
|
22
|
use constant STOPLIST => "stoplist"; |
|
3
|
|
|
|
|
11
|
|
|
3
|
|
|
|
|
122
|
|
11
|
3
|
|
|
3
|
|
16
|
use constant NORMALIZE => "normalize"; |
|
3
|
|
|
|
|
4
|
|
|
3
|
|
|
|
|
3501
|
|
12
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
require Exporter; |
14
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
our @ISA = qw(Exporter); |
16
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
our $VERSION = '0.10'; |
18
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
# Attributes -- these all have lvalue accessor methods, use those methods |
20
|
|
|
|
|
|
|
# instead of accessing directly. If you add another attribute, be sure |
21
|
|
|
|
|
|
|
# to take the appropriate action in the DESTROY method; otherwise, a memory |
22
|
|
|
|
|
|
|
# leak could occur. |
23
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
my %errorString; |
25
|
|
|
|
|
|
|
my %compounds; |
26
|
|
|
|
|
|
|
my %verbose; |
27
|
|
|
|
|
|
|
my %stem; |
28
|
|
|
|
|
|
|
my %normalize; |
29
|
|
|
|
|
|
|
my %stoplist; |
30
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
sub new |
32
|
|
|
|
|
|
|
{ |
33
|
4
|
|
|
4
|
0
|
10
|
my $class = shift; |
34
|
4
|
|
|
|
|
7
|
my $hash_ref = shift; |
35
|
4
|
|
33
|
|
|
29
|
$class = ref $class || $class; |
36
|
4
|
|
|
|
|
15
|
my $self = bless [], $class; |
37
|
|
|
|
|
|
|
|
38
|
4
|
50
|
|
|
|
16
|
if (defined $hash_ref) { |
39
|
4
|
|
|
|
|
23
|
while (my ($key, $val) = each %$hash_ref) { |
40
|
4
|
50
|
33
|
|
|
45
|
if (($key eq COMPFILE) and (defined $val)) { |
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
41
|
0
|
|
|
|
|
0
|
$self->_loadCompounds ($val); |
42
|
|
|
|
|
|
|
} |
43
|
|
|
|
|
|
|
elsif ($key eq STEM) { |
44
|
0
|
|
|
|
|
0
|
$self->stem = $val; |
45
|
|
|
|
|
|
|
} |
46
|
|
|
|
|
|
|
elsif ($key eq VERBOSE) { |
47
|
0
|
|
|
|
|
0
|
$self->verbose = $val; |
48
|
|
|
|
|
|
|
} |
49
|
|
|
|
|
|
|
elsif ($key eq NORMALIZE) { |
50
|
4
|
|
|
|
|
28
|
$self->normalize = $val; |
51
|
|
|
|
|
|
|
} |
52
|
|
|
|
|
|
|
elsif ($key eq STOPLIST) { |
53
|
0
|
|
|
|
|
0
|
$self->stoplist = $val; |
54
|
|
|
|
|
|
|
} |
55
|
|
|
|
|
|
|
else { |
56
|
0
|
|
|
|
|
0
|
$self->error ("Unknown option: $key"); |
57
|
|
|
|
|
|
|
} |
58
|
|
|
|
|
|
|
} |
59
|
|
|
|
|
|
|
} |
60
|
4
|
|
|
|
|
14
|
return $self; |
61
|
|
|
|
|
|
|
} |
62
|
|
|
|
|
|
|
|
63
|
|
|
|
|
|
|
sub DESTROY |
64
|
|
|
|
|
|
|
{ |
65
|
0
|
|
|
0
|
|
0
|
my $self = shift; |
66
|
0
|
|
|
|
|
0
|
delete $errorString{$self}; |
67
|
0
|
|
|
|
|
0
|
delete $compounds{$self}; |
68
|
0
|
|
|
|
|
0
|
delete $stem{$self}; |
69
|
0
|
|
|
|
|
0
|
delete $verbose{$self}; |
70
|
0
|
|
|
|
|
0
|
delete $normalize{$self}; |
71
|
0
|
|
|
|
|
0
|
delete $stoplist{$self}; |
72
|
|
|
|
|
|
|
} |
73
|
|
|
|
|
|
|
|
74
|
|
|
|
|
|
|
#sub _loadStoplist |
75
|
|
|
|
|
|
|
#{ |
76
|
|
|
|
|
|
|
# my $self = shift; |
77
|
|
|
|
|
|
|
# my $file = shift; |
78
|
|
|
|
|
|
|
# |
79
|
|
|
|
|
|
|
# unless (open FH, '<', $file) { |
80
|
|
|
|
|
|
|
# $self->error ("Cannot open '$file': $!"); |
81
|
|
|
|
|
|
|
# return undef; |
82
|
|
|
|
|
|
|
# } |
83
|
|
|
|
|
|
|
# |
84
|
|
|
|
|
|
|
# while () { |
85
|
|
|
|
|
|
|
# chomp; |
86
|
|
|
|
|
|
|
# my $word = lc; |
87
|
|
|
|
|
|
|
# $stoplist{$self}->{$word} = 1; |
88
|
|
|
|
|
|
|
# } |
89
|
|
|
|
|
|
|
# |
90
|
|
|
|
|
|
|
# close FH; |
91
|
|
|
|
|
|
|
#} |
92
|
|
|
|
|
|
|
|
93
|
|
|
|
|
|
|
sub error |
94
|
|
|
|
|
|
|
{ |
95
|
2
|
|
|
2
|
0
|
4
|
my $self = shift; |
96
|
2
|
|
|
|
|
3
|
my $msg = shift; |
97
|
2
|
50
|
|
|
|
9
|
if ($msg) { |
98
|
2
|
|
|
|
|
7
|
my ($package, $file, $line) = caller; |
99
|
2
|
50
|
|
|
|
8
|
$errorString{$self} .= "\n" if $errorString{$self}; |
100
|
2
|
|
|
|
|
13
|
$errorString{$self} .= "($file:$line) $msg"; |
101
|
|
|
|
|
|
|
} |
102
|
2
|
|
|
|
|
6
|
return $errorString{$self}; |
103
|
|
|
|
|
|
|
} |
104
|
|
|
|
|
|
|
|
105
|
|
|
|
|
|
|
sub verbose : lvalue |
106
|
|
|
|
|
|
|
{ |
107
|
114
|
|
|
114
|
0
|
156
|
my $self = shift; |
108
|
114
|
|
|
|
|
473
|
$verbose{$self} |
109
|
|
|
|
|
|
|
} |
110
|
|
|
|
|
|
|
|
111
|
|
|
|
|
|
|
sub stem : lvalue |
112
|
|
|
|
|
|
|
{ |
113
|
0
|
|
|
0
|
0
|
0
|
my $self = shift; |
114
|
0
|
|
|
|
|
0
|
$stem{$self} |
115
|
|
|
|
|
|
|
} |
116
|
|
|
|
|
|
|
|
117
|
|
|
|
|
|
|
sub normalize : lvalue |
118
|
|
|
|
|
|
|
{ |
119
|
36
|
|
|
36
|
0
|
82
|
my $self = shift; |
120
|
36
|
|
|
|
|
206
|
$normalize{$self} |
121
|
|
|
|
|
|
|
} |
122
|
|
|
|
|
|
|
|
123
|
|
|
|
|
|
|
sub sanitizeString |
124
|
|
|
|
|
|
|
{ |
125
|
250
|
|
|
250
|
0
|
284
|
my $self = shift; |
126
|
250
|
|
|
|
|
316
|
my $str = shift; |
127
|
|
|
|
|
|
|
|
128
|
|
|
|
|
|
|
# get rid of most punctuation |
129
|
250
|
|
|
|
|
406
|
$str =~ tr/.;:,?!(){}\x22\x60\x24\x25\x40<>/ /s; |
130
|
|
|
|
|
|
|
|
131
|
|
|
|
|
|
|
# convert to lower case |
132
|
250
|
|
|
|
|
351
|
$str =~ tr/A-Z_/a-z /; |
133
|
|
|
|
|
|
|
|
134
|
|
|
|
|
|
|
# convert ampersands into 'and' -- maybe not appropriate? |
135
|
|
|
|
|
|
|
# s/\&/ and /; |
136
|
|
|
|
|
|
|
|
137
|
|
|
|
|
|
|
# get rid of apostrophes not surrounded by word characters |
138
|
250
|
|
|
|
|
373
|
$str =~ s/(?
|
139
|
250
|
|
|
|
|
327
|
$str =~ s/\x27(?!\w)/ /g; |
140
|
|
|
|
|
|
|
|
141
|
|
|
|
|
|
|
# get rid of dashes, but not hyphens |
142
|
250
|
|
|
|
|
308
|
$str =~ s/--/ /g; |
143
|
|
|
|
|
|
|
|
144
|
|
|
|
|
|
|
# collapse consecutive whitespace chars into one space |
145
|
250
|
|
|
|
|
2764
|
$str =~ s/\s+/ /g; |
146
|
250
|
|
|
|
|
1286
|
return $str; |
147
|
|
|
|
|
|
|
} |
148
|
|
|
|
|
|
|
|
149
|
|
|
|
|
|
|
sub stoplist : lvalue |
150
|
|
|
|
|
|
|
{ |
151
|
4
|
|
|
4
|
0
|
8
|
my $self = shift; |
152
|
4
|
|
|
|
|
22
|
$stoplist{$self} |
153
|
|
|
|
|
|
|
} |
154
|
|
|
|
|
|
|
|
155
|
|
|
|
|
|
|
sub _loadCompounds |
156
|
|
|
|
|
|
|
{ |
157
|
0
|
|
|
0
|
|
0
|
my $self = shift; |
158
|
0
|
|
|
|
|
0
|
my $compfile = shift; |
159
|
|
|
|
|
|
|
|
160
|
0
|
0
|
|
|
|
0
|
unless (open FH, '<', $compfile) { |
161
|
0
|
|
|
|
|
0
|
$self->error ("Cannot open '$compfile': $!"); |
162
|
0
|
|
|
|
|
0
|
return undef; |
163
|
|
|
|
|
|
|
} |
164
|
|
|
|
|
|
|
|
165
|
0
|
|
|
|
|
0
|
while () { |
166
|
0
|
|
|
|
|
0
|
chomp; |
167
|
0
|
|
|
|
|
0
|
$compounds{$self}{$_} = 1; |
168
|
|
|
|
|
|
|
} |
169
|
|
|
|
|
|
|
|
170
|
0
|
|
|
|
|
0
|
close FH; |
171
|
|
|
|
|
|
|
} |
172
|
|
|
|
|
|
|
|
173
|
|
|
|
|
|
|
sub removeStopWords |
174
|
|
|
|
|
|
|
{ |
175
|
0
|
|
|
0
|
0
|
0
|
my $self = shift; |
176
|
0
|
|
|
|
|
0
|
my $str = shift; |
177
|
0
|
|
|
|
|
0
|
foreach my $stopword (keys %{$self->stoplist}) { |
|
0
|
|
|
|
|
0
|
|
178
|
0
|
|
|
|
|
0
|
$str =~ s/\Q $stopword \E/ /g; |
179
|
|
|
|
|
|
|
} |
180
|
0
|
|
|
|
|
0
|
return $str; |
181
|
|
|
|
|
|
|
} |
182
|
|
|
|
|
|
|
|
183
|
|
|
|
|
|
|
# compoundifies a block of text |
184
|
|
|
|
|
|
|
# e.g., if you give it "we have a new bird dog", you'll get back |
185
|
|
|
|
|
|
|
# "we have a new bird_dog". |
186
|
|
|
|
|
|
|
# (code borrowed from rawtextFreq.pl) |
187
|
|
|
|
|
|
|
|
188
|
|
|
|
|
|
|
sub compoundify |
189
|
|
|
|
|
|
|
{ |
190
|
140
|
|
|
140
|
0
|
169
|
my $self = shift; |
191
|
140
|
|
|
|
|
188
|
my $block = shift; # get the block of text |
192
|
140
|
|
|
|
|
148
|
my $done; |
193
|
|
|
|
|
|
|
my $temp; |
194
|
|
|
|
|
|
|
|
195
|
140
|
50
|
|
|
|
350
|
unless ($compounds{$self}) { |
196
|
140
|
|
|
|
|
432
|
return $block; |
197
|
|
|
|
|
|
|
} |
198
|
|
|
|
|
|
|
|
199
|
|
|
|
|
|
|
# get all the words into an array |
200
|
0
|
|
|
|
|
|
my @wordsArray = $block =~ /(\w+)/g; |
201
|
|
|
|
|
|
|
|
202
|
|
|
|
|
|
|
# now compoundify, GREEDILY!! |
203
|
0
|
|
|
|
|
|
my $firstPtr = 0; |
204
|
0
|
|
|
|
|
|
my $string = ""; |
205
|
|
|
|
|
|
|
|
206
|
0
|
|
|
|
|
|
while($firstPtr <= $#wordsArray) |
207
|
|
|
|
|
|
|
{ |
208
|
0
|
|
|
|
|
|
my $secondPtr = $#wordsArray; |
209
|
0
|
|
|
|
|
|
$done = 0; |
210
|
0
|
|
0
|
|
|
|
while($secondPtr > $firstPtr && !$done) |
211
|
|
|
|
|
|
|
{ |
212
|
0
|
|
|
|
|
|
$temp = join ("_", @wordsArray[$firstPtr..$secondPtr]); |
213
|
0
|
0
|
|
|
|
|
if(exists $compounds{$self}{$temp}) |
214
|
|
|
|
|
|
|
{ |
215
|
0
|
|
|
|
|
|
$string .= "$temp "; |
216
|
0
|
|
|
|
|
|
$done = 1; |
217
|
|
|
|
|
|
|
} |
218
|
|
|
|
|
|
|
else |
219
|
|
|
|
|
|
|
{ |
220
|
0
|
|
|
|
|
|
$secondPtr--; |
221
|
|
|
|
|
|
|
} |
222
|
|
|
|
|
|
|
} |
223
|
0
|
0
|
|
|
|
|
if(!$done) |
224
|
|
|
|
|
|
|
{ |
225
|
0
|
|
|
|
|
|
$string .= "$wordsArray[$firstPtr] "; |
226
|
|
|
|
|
|
|
} |
227
|
0
|
|
|
|
|
|
$firstPtr = $secondPtr + 1; |
228
|
|
|
|
|
|
|
} |
229
|
0
|
|
|
|
|
|
$string =~ s/ $//; |
230
|
|
|
|
|
|
|
|
231
|
0
|
|
|
|
|
|
return $string; |
232
|
|
|
|
|
|
|
} |
233
|
|
|
|
|
|
|
|
234
|
|
|
|
|
|
|
|
235
|
|
|
|
|
|
|
|
236
|
|
|
|
|
|
|
1; |
237
|
|
|
|
|
|
|
|
238
|
|
|
|
|
|
|
__END__ |