File Coverage

blib/lib/Text/OverlapFinder.pm

Criterion	Covered	Total	%
statement	100	143	69.9
branch	31	56	55.3
condition	7	12	58.3
subroutine	8	11	72.7
pod	0	5	0.0
total	146	227	64.3

line	stmt	bran	cond	sub	pod	time	code
1							package Text::OverlapFinder;
2
3	4			4		26074	use strict;
	4					7
	4					119
4	4			4		19	use warnings;
	4					7
	4					221
5
6							our @ISA = ();
7							our $VERSION = '0.03';
8
9	4			4		33	use constant MARKER => '###';
	4					7
	4					8328
10
11							sub contains(\@@);
12							sub containsReplace(\@@);
13
14							## stemmer support not available as yet
15
16							my $stopregex = "";
17							my %stemmer;
18
19							# new (stoplist => $stoplist, stemmer => 1)
20							sub new
21							{
22	5			5	0	22	my $class = shift;
23	5		33			28	$class = ref $class \|\| $class;
24	5					30	my $self = bless [], $class;
25
26	5					11	my $stoplist;
27							my $stemmer;
28	5					30	while (scalar @_) {
29	0					0	my $arg = shift;
30	0	0				0	if ($arg =~ /stoplist/i) {
		0
31	0					0	$stoplist = shift;
32	0	0				0	if (-z $stoplist) {
33	0					0	die "'$stoplist' is not a stoplist file";
34							}
35							}
36							elsif ($arg =~ /stemmer/i) {
37	0					0	$stemmer = shift;
38	0	0				0	unless (ref $stemmer) {
39	0					0	die "'$stemmer' is not a reference to a stemmer object";
40							}
41							}
42							else {
43	0					0	die "Unknown argument '$arg'";
44							}
45							}
46
47							# stemming
48							# stoplist
49	5	50				19	if (defined $stoplist) {
50	0					0	$self->_loadStoplist ($stoplist);
51							}
52
53	5	50				16	if (defined $stemmer) {
54	0					0	warn "Stemmer defined but ignored";
55							}
56
57	5					25	return $self;
58							}
59
60							sub DESTROY
61							{
62	4			4		9	my $self = shift;
63	4					441	delete $stemmer{$self};
64							}
65
66	0			0	0	0	sub doStop {0}
67
68							# adapted from a function in string_compare.pm (distributed with
69							# WordNet::Similarity)
70							sub getOverlaps
71							{
72	45			45	0	4294	my $self = shift;
73	45					57	my $string0 = shift;
74	45					59	my $string1 = shift;
75
76	45					88	my %overlapsHash = ();
77
78	45					110	$string0 =~ s/^\s+//;
79	45					342	$string0 =~ s/\s+$//;
80	45					94	$string1 =~ s/^\s+//;
81	45					334	$string1 =~ s/\s+$//;
82
83
84	45	50				122	if ($stopregex ne "")
85							{
86	0					0	$string0 = $self->_removeStopWords ($string0);
87	0					0	$string1 = $self->_removeStopWords ($string1);
88							}
89
90							# if stemming on, stem the two strings
91	45					59	my $stemmingReqd = 0;
92	45	50				102	if ($stemmingReqd)
93							{
94	0					0	my $stemmer = bless [];
95	0					0	$string0 = $stemmer->stemString($string0, 1); # 1 turns on caching
96	0					0	$string1 = $stemmer->stemString($string1, 1);
97							}
98
99	45					419	my @words0 = split /\s+/, $string0;
100	45					424	my @words1 = split /\s+/, $string1;
101
102	45					100	my $wc0 = scalar @words0;
103	45					61	my $wc1 = scalar @words1;
104
105							# for each word in string0, find out how long an overlap can start from it.
106	45					65	my @overlapsLengths = ();
107	45					53	my $matchStartIndex = 0;
108	45					51	my $currIndex = -1;
109
110	45					134	while ($currIndex < $#words0)
111							{
112							# forward the current index to look at the next word
113	1102					1194	$currIndex++;
114
115							# if this works, carry on!
116	1102	100				4509	if (contains (@words1, @words0[$matchStartIndex..$currIndex])) {
117							next
118	1047					3740	}
119							else {
120							# XXX shouldn't this be $currIndex - $matchStartIndex + 1 ?
121	55					86	$overlapsLengths[$matchStartIndex] = $currIndex - $matchStartIndex;
122	55	100				122	$currIndex-- if ($overlapsLengths[$matchStartIndex] > 0);
123	55					4874	$matchStartIndex++;
124							}
125							}
126
127	45					207	for (my $i = $matchStartIndex; $i <= $currIndex; $i++)
128							{
129	1028					2638	$overlapsLengths[$i] = $currIndex - $i + 1;
130							}
131
132	45					200	my ($longestOverlap) = sort {$b <=> $a} @overlapsLengths;
	1134					1487
133
134	45		100			266	while (defined($longestOverlap) && ($longestOverlap > 0))
135							{
136	40					127	for (my $i = 0; $i <= $#overlapsLengths; $i++)
137							{
138	1100	100				2920	next if ($overlapsLengths[$i] < $longestOverlap);
139
140							# form the string
141	48					76	my $stringEnd = $i + $longestOverlap - 1;
142
143							# check if still there in $string1. replace in string1 with a mark
144
145	48	100				219	if (1 #!doStop($temp)
146							&& containsReplace (@words1, @words0[$i..$stringEnd]))
147							{
148							# so its still there. we have an overlap!
149	44					260	my $temp = join (" ", @words0[$i..$stringEnd]);
150	44					182	$overlapsHash{$temp}++;
151
152							# adjust overlap lengths forward
153	44					618	for (my $j = $i; $j < $i + $longestOverlap; $j++)
154							{
155	1043					1806	$overlapsLengths[$j] = 0;
156							}
157
158							# adjust overlap lengths backward
159	44					218	for (my $j = $i-1; $j >= 0; $j--)
160							{
161	12	50				54	last if ($overlapsLengths[$j] <= $i - $j);
162	0					0	$overlapsLengths[$j] = $i - $j;
163							}
164							}
165							else
166							{
167							# ah its not there any more in string1! see if
168							# anything smaller than the full string works
169	4					10	my $k = $longestOverlap - 1;
170	4					11	while ($k > 0)
171							{
172							# form the string
173	2					5	my $stringEnd = $i + $k - 1;
174	2	50				6	last if contains (@words1, @words0[$i..$stringEnd]);
175	2					7	$k--;
176							}
177
178	4					15	$overlapsLengths[$i] = $k;
179							}
180							}
181	40					118	($longestOverlap) = sort {$b <=> $a} @overlapsLengths;
	1148					1302
182							}
183
184	45					414	return (\%overlapsHash, $wc0, $wc1);
185							}
186
187							# returns true of the first array contains the list, otherwise returns false
188							# See also containsReplace()
189							# e.g., contains (@Array, LIST);
190							sub contains (\@@)
191							{
192	1104			1104	0	1422	my $array2_ref = shift;
193	1104					8018	my @array1 = @_;
194
195	1104	100				1142	return 0 if $#{$array2_ref} < $#array1;
	1104					2565
196
197	1094					1344	for my $j (0..($#{$array2_ref} - $#array1)) {
	1094					2159
198	1411	100				2856	next if $array2_ref->[$j] eq MARKER;
199
200	1407	100				2998	if ($array1[0] eq $array2_ref->[$j]) {
201	1061					1134	my $match = 1;
202	1061					1817	for my $i (1..$#array1) {
203	37165	100	66			147975	if ($array2_ref->[$j + $i] eq MARKER
204							or $array1[$i] ne $array2_ref->[$j + $i]) {
205	14					16	$match = 0;
206	14					22	last;
207							}
208							}
209	1061	100				2344	if ($match) {
210	1047					5363	return 1;
211							}
212							}
213							}
214
215	47					139	return 0;
216							}
217
218							# same functionality as contains(), but replaces each word in the match
219							# with the constant MARKER
220							sub containsReplace (\@@)
221							{
222	48			48	0	66	my $array2_ref = shift;
223	48					299	my @array1 = @_;
224
225	48	50				50	return 0 if $#{$array2_ref} < $#array1;
	48					148
226
227	48					84	for my $j (0..($#{$array2_ref} - $#array1)) {
	48					115
228	97	100				333	next if $array2_ref->[$j] eq MARKER;
229
230	71	100				151	if ($array1[0] eq $array2_ref->[$j]) {
231	44					54	my $match = 1;
232	44					90	for my $i (1..$#array1) {
233	999	50	33			3824	if ($array2_ref->[$j + $i] eq MARKER
234							or $array1[$i] ne $array2_ref->[$j + $i]) {
235	0					0	$match = 0;
236	0					0	last;
237							}
238							}
239
240							# match found, remove match and return true
241	44	50				113	if ($match) {
242	44					98	for my $k ($j..($j+$#array1)) {
243	1043					1282	$array2_ref->[$k] = MARKER;
244							}
245	44					215	return 1;
246							}
247							}
248							}
249
250							# no match found
251	4					13	return 0;
252							}
253
254							sub _removeStopWords
255							{
256	0			0			my $self = shift;
257	0						my $str = shift;
258	0						my @words = split /\s+/, $str;
259	0						my @newwords;
260	0						foreach my $word (@words) {
261	0	0					if(!($word =~ /$stopregex/))
262							{
263	0						push (@newwords, $word);
264							}
265							}
266	0						return join (' ', @newwords);
267							}
268
269							sub _loadStoplist
270							{
271	0			0			my $self = shift;
272	0						my $list = shift;
273	0	0					open FH, '<', $list or die "Cannot open stoplist file '$list': $!";
274
275	0						$stopregex = "(";
276	0						while () {
277	0						chomp;
278	0	0					if ($_ ne "")
279							{
280	0						$_=~s/\///g;
281	0	0					if ($_=~m/\\b/)
282							{
283	0						$stopregex .= "$_\|";
284							}
285							else
286							{
287	0						my $word = "\\b"."$_"."\\b";
288	0						$stopregex .= "$word\|";
289							}
290							}
291							}
292	0						chop $stopregex; $stopregex .= ")";
	0
293	0						close FH;
294							}
295
296
297							1;
298
299							__END__