File Coverage

blib/lib/Text/OverlapFinder.pm

Criterion	Covered	Total	%
statement	110	153	71.9
branch	35	64	54.6
condition	9	15	60.0
subroutine	8	11	72.7
pod	0	5	0.0
total	162	248	65.3

line	stmt	bran	cond	sub	pod	time	code
1							package Text::OverlapFinder;
2
3	4			4		21586	use strict;
	4					8
	4					109
4	4			4		23	use warnings;
	4					6
	4					214
5
6							our @ISA = ();
7							our $VERSION = '0.05';
8
9	4			4		19	use constant MARKER => '###';
	4					8
	4					7091
10
11							# manually patched with the following :
12							# https://rt.cpan.org/Public/Ticket/Attachment/999948/520850
13
14							##sub contains(\@@);
15							##sub containsReplace(\@@);
16
17							sub contains(\@$@);
18							sub containsReplace(\@$@);
19
20							## stemmer support not available as yet
21
22							my $stopregex = "";
23							my %stemmer;
24
25							# new (stoplist => $stoplist, stemmer => 1)
26							sub new
27							{
28	5			5	0	20	my $class = shift;
29	5		33			26	$class = ref $class \|\| $class;
30	5					12	my $self = bless [], $class;
31
32	5					10	my $stoplist;
33							my $stemmer;
34	5					23	while (scalar @_) {
35	0					0	my $arg = shift;
36	0	0				0	if ($arg =~ /stoplist/i) {
		0
37	0					0	$stoplist = shift;
38	0	0				0	if (-z $stoplist) {
39	0					0	die "'$stoplist' is not a stoplist file";
40							}
41							}
42							elsif ($arg =~ /stemmer/i) {
43	0					0	$stemmer = shift;
44	0	0				0	unless (ref $stemmer) {
45	0					0	die "'$stemmer' is not a reference to a stemmer object";
46							}
47							}
48							else {
49	0					0	die "Unknown argument '$arg'";
50							}
51							}
52
53							# stemming
54							# stoplist
55	5	50				18	if (defined $stoplist) {
56	0					0	$self->_loadStoplist ($stoplist);
57							}
58
59	5	50				17	if (defined $stemmer) {
60	0					0	warn "Stemmer defined but ignored";
61							}
62
63	5					21	return $self;
64							}
65
66							sub DESTROY
67							{
68	4			4		9	my $self = shift;
69	4					146	delete $stemmer{$self};
70							}
71
72	0			0	0	0	sub doStop {0}
73
74							# originally adapted from a function in string_compare.pm
75							# (distributed with earlier versions of WordNet::Similarity)
76							# now WordNet::Similarity uses Text::Similarity and no longer
77							# includes string_compare.pm
78							sub getOverlaps
79							{
80	45			45	0	3325	my $self = shift;
81	45					69	my $string0 = shift;
82	45					72	my $string1 = shift;
83
84	45					86	my %overlapsHash = ();
85
86	45					103	$string0 =~ s/^\s+//;
87	45					300	$string0 =~ s/\s+$//;
88	45					88	$string1 =~ s/^\s+//;
89	45					284	$string1 =~ s/\s+$//;
90
91
92	45	50				128	if ($stopregex ne "")
93							{
94	0					0	$string0 = $self->_removeStopWords ($string0);
95	0					0	$string1 = $self->_removeStopWords ($string1);
96							}
97
98							# if stemming on, stem the two strings
99	45					65	my $stemmingReqd = 0;
100	45	50				96	if ($stemmingReqd)
101							{
102	0					0	my $stemmer = bless [];
103	0					0	$string0 = $stemmer->stemString($string0, 1); # 1 turns on caching
104	0					0	$string1 = $stemmer->stemString($string1, 1);
105							}
106
107	45					567	my @words0 = split /\s+/, $string0;
108	45					527	my @words1 = split /\s+/, $string1;
109
110	45					134	my %first;
111	45					132	foreach my $offset (0 .. $#words1) {
112	1104					1097	push @{$first{$words1[$offset]}}, $offset;
	1104					2628
113							}
114
115	45					75	my $wc0 = scalar @words0;
116	45					58	my $wc1 = scalar @words1;
117
118							# for each word in string0, find out how long an overlap can start from it.
119	45					77	my @overlapsLengths = ();
120	45					59	my $matchStartIndex = 0;
121	45					47	my $currIndex = -1;
122
123	45					109	while ($currIndex < $#words0)
124							{
125							# forward the current index to look at the next word
126	1102					1320	$currIndex++;
127
128							# if this works, carry on!
129							# if (contains (@words1, @words0[$matchStartIndex..$currIndex])) {
130	1102	100				4301	if (contains (@words1, $first{$words0[$matchStartIndex]},@words0[$matchStartIndex..$currIndex])) {
131							next
132	1047					3531	}
133							else {
134							# XXX shouldn't this be $currIndex - $matchStartIndex + 1 ?
135	55					90	$overlapsLengths[$matchStartIndex] = $currIndex - $matchStartIndex;
136	55	100				126	$currIndex-- if ($overlapsLengths[$matchStartIndex] > 0);
137	55					156	$matchStartIndex++;
138							}
139							}
140
141	45					127	for (my $i = $matchStartIndex; $i <= $currIndex; $i++)
142							{
143	1028					2286	$overlapsLengths[$i] = $currIndex - $i + 1;
144							}
145
146	45					148	my ($longestOverlap) = sort {$b <=> $a} @overlapsLengths;
	1134					1196
147
148	45		100			200	while (defined($longestOverlap) && ($longestOverlap > 0))
149							{
150	40					102	for (my $i = 0; $i <= $#overlapsLengths; $i++)
151							{
152	1100	100				3293	next if ($overlapsLengths[$i] < $longestOverlap);
153
154							# form the string
155	48					73	my $stringEnd = $i + $longestOverlap - 1;
156
157							# check if still there in $string1. replace in string1 with a mark
158
159	48	100	66			259	if (1 #!doStop($temp)
160							## && containsReplace (@words1, @words0[$i..$stringEnd]))
161							&& exists $first{$words0[$i]}
162							&& containsReplace (@words1, $first{$words0[$i]}, @words0[$i..$stringEnd]))
163							{
164							# so its still there. we have an overlap!
165	44					212	my $temp = join (" ", @words0[$i..$stringEnd]);
166	44					159	$overlapsHash{$temp}++;
167
168							# adjust overlap lengths forward
169	44					112	for (my $j = $i; $j < $i + $longestOverlap; $j++)
170							{
171	1043					2178	$overlapsLengths[$j] = 0;
172							}
173
174							# adjust overlap lengths backward
175	44					203	for (my $j = $i-1; $j >= 0; $j--)
176							{
177	12	50				57	last if ($overlapsLengths[$j] <= $i - $j);
178	0					0	$overlapsLengths[$j] = $i - $j;
179							}
180							}
181							else
182							{
183							# ah its not there any more in string1! see if
184							# anything smaller than the full string works
185	4					8	my $k = $longestOverlap - 1;
186	4					10	while ($k > 0)
187							{
188							# form the string
189	2					4	my $stringEnd = $i + $k - 1;
190							## last if contains (@words1, @words0[$i..$stringEnd]);
191	2	50				6	last if contains (@words1, $first{$words0[$i]}, @words0[$i..$stringEnd]);
192
193	2					5	$k--;
194							}
195
196	4					11	$overlapsLengths[$i] = $k;
197							}
198							}
199	40					111	($longestOverlap) = sort {$b <=> $a} @overlapsLengths;
	1148					1327
200							}
201
202	45					561	return (\%overlapsHash, $wc0, $wc1);
203							}
204
205							# returns true if the first array contains the list, otherwise returns false
206							# See also containsReplace()
207							# e.g., contains (@Array, LIST);
208							##sub contains (\@@)
209							sub contains (\@$@)
210							{
211	1104			1104	0	1372	my $array2_ref = shift;
212
213	1104					1359	my $positions = shift;
214	1104	100				2169	return 0 if (not defined $positions);
215
216	1068					7113	my @array1 = @_;
217
218	1068	100				1194	return 0 if $#{$array2_ref} < $#array1;
	1068					2583
219
220							## for my $j (0..($#{$array2_ref} - $#array1)) {
221	1066					1780	for my $j (@$positions) {
222	1068	100				1184	next if ($j > $#{$array2_ref} - $#array1);
	1068					2360
223
224	1063	100				2312	next if $array2_ref->[$j] eq MARKER;
225
226	1061	50				2325	if ($array1[0] eq $array2_ref->[$j]) {
227	1061					1251	my $match = 1;
228	1061					1880	for my $i (1..$#array1) {
229	37165	100	66			154197	if ($array2_ref->[$j + $i] eq MARKER
230							or $array1[$i] ne $array2_ref->[$j + $i]) {
231	14					20	$match = 0;
232	14					22	last;
233							}
234							}
235	1061	100				2137	if ($match) {
236	1047					5003	return 1;
237							}
238							}
239							}
240
241	19					55	return 0;
242							}
243
244							# same functionality as contains(), but replaces each word in the match
245							# with the constant MARKER
246							##sub containsReplace (\@@)
247							sub containsReplace (\@$@)
248
249							{
250	48			48	0	66	my $array2_ref = shift;
251
252	48					61	my $positions = shift;
253	48	50				136	return 0 if (not defined $positions);
254
255	48					274	my @array1 = @_;
256
257	48	50				56	return 0 if $#{$array2_ref} < $#array1;
	48					132
258
259							# for my $j (0..($#{$array2_ref} - $#array1)) {
260
261	48					85	for my $j (@$positions) {
262	48	50				58	next if ($j > $#{$array2_ref} - $#array1);
	48					117
263
264	48	100				123	next if $array2_ref->[$j] eq MARKER;
265
266	44	50				137	if ($array1[0] eq $array2_ref->[$j]) {
267	44					56	my $match = 1;
268	44					98	for my $i (1..$#array1) {
269	999	50	33			4155	if ($array2_ref->[$j + $i] eq MARKER
270							or $array1[$i] ne $array2_ref->[$j + $i]) {
271	0					0	$match = 0;
272	0					0	last;
273							}
274							}
275
276							# match found, remove match and return true
277	44	50				105	if ($match) {
278	44					89	for my $k ($j..($j+$#array1)) {
279	1043					1319	$array2_ref->[$k] = MARKER;
280							}
281	44					271	return 1;
282							}
283							}
284							}
285
286							# no match found
287	4					14	return 0;
288							}
289
290							sub _removeStopWords
291							{
292	0			0			my $self = shift;
293	0						my $str = shift;
294	0						my @words = split /\s+/, $str;
295	0						my @newwords;
296	0						foreach my $word (@words) {
297	0	0					if(!($word =~ /$stopregex/))
298							{
299	0						push (@newwords, $word);
300							}
301							}
302	0						return join (' ', @newwords);
303							}
304
305							sub _loadStoplist
306							{
307	0			0			my $self = shift;
308	0						my $list = shift;
309	0	0					open FH, '<', $list or die "Cannot open stoplist file '$list': $!";
310
311	0						$stopregex = "(";
312	0						while () {
313	0						chomp;
314	0	0					if ($_ ne "")
315							{
316	0						$_=~s/\///g;
317	0	0					if ($_=~m/\\b/)
318							{
319	0						$stopregex .= "$_\|";
320							}
321							else
322							{
323	0						my $word = "\\b"."$_"."\\b";
324	0						$stopregex .= "$word\|";
325							}
326							}
327							}
328	0						chop $stopregex; $stopregex .= ")";
	0
329	0						close FH;
330							}
331
332
333							1;
334
335							__END__