File Coverage

blib/lib/Lingua/AlSetLib.pm

Criterion	Covered	Total	%
statement	6	298	2.0
branch	0	100	0.0
condition	0	59	0.0
subroutine	2	26	7.6
pod	0	12	0.0
total	8	495	1.6

line	stmt	bran	cond	sub	pod	time	code
1							package Lingua::AlSetLib;
2							$VERSION=1.1;
3	1			1		6	use strict;
	1					2
	1					37
4	1			1		979	use Dumpvalue;
	1					5494
	1					3998
5
6							# input: array of integers
7							# output: array of contiguous integer arrays
8							sub getContiguousSequences {
9	0			0	0		my $array=shift;
10	0						my $dumper = new Dumpvalue;
11
12	0						my @sortedArray = sort {$a <=> $b} @$array;
	0
13							# print "sorted ARRay:\n";
14							# print $dumper->dumpValue(\@sortedArray);
15	0						my $sequences=[];
16	0						my $numSeq=0;
17
18	0	0					if (@$array > 0){
19	0						my $former = $sortedArray[0];
20	0						push @{$sequences->[$numSeq]},$former;
	0
21	0						for (my $i=1;$i<@sortedArray;$i++){
22	0	0					if ($sortedArray[$i]== ($former+1)){
23	0						push @{$sequences->[$numSeq]},$sortedArray[$i];
	0
24							}else{
25	0						$numSeq++;
26	0						push @{$sequences->[$numSeq]},$sortedArray[$i];
	0
27							}
28	0						$former=$sortedArray[$i];
29							}
30							}
31							# print "SEQUENCES (best $longuestSequence):\n";
32							# print $dumper->dumpValue($sequences);
33	0						return $sequences;
34							}
35
36							sub getLonguestContiguousSequence {
37	0			0	0		my $array=shift;
38	0						my $dumper = new Dumpvalue;
39
40	0						my @sortedArray = sort {$a <=> $b} @$array;
	0
41							# print "sorted ARRay:\n";
42							# print $dumper->dumpValue(\@sortedArray);
43	0						my $sequences=[];
44	0						my ($currentLength,$greatestLength,$longuestSequence,$numSeq)=(0,0,0,0);
45
46	0	0					if (@$array > 0){
47	0						my $former = $sortedArray[0];
48	0						push @{$sequences->[$numSeq]},$former;
	0
49	0						$currentLength++;
50	0						$greatestLength=$currentLength;
51	0						$longuestSequence = $numSeq;
52	0						for (my $i=1;$i<@sortedArray;$i++){
53	0	0					if ($sortedArray[$i]== ($former+1)){
54	0						push @{$sequences->[$numSeq]},$sortedArray[$i];
	0
55	0						$currentLength++;
56	0	0					if ($currentLength > $greatestLength){
57	0						$greatestLength=$currentLength;
58	0						$longuestSequence = $numSeq;
59							}
60							}else{
61	0						$numSeq++;
62	0						push @{$sequences->[$numSeq]},$sortedArray[$i];
	0
63	0						$currentLength=1;
64	0	0					if ($currentLength > $greatestLength){
65	0						$greatestLength=$currentLength;
66	0						$longuestSequence = $numSeq;
67							}
68							}
69	0						$former=$sortedArray[$i];
70							}
71							}
72							# print "SEQUENCES (best $longuestSequence):\n";
73							# print $dumper->dumpValue($sequences);
74	0						return $sequences->[$longuestSequence];
75							}
76
77							sub ibm1Prob {
78	0			0	0		my ($src,$trg,$pt_s)=@_;
79	0						my @wsrc=split(/ /, $src);
80	0						my @wtrg=split(/ /, $trg);
81	0						my $numWs = scalar(@wsrc);
82	0						my $numWt = scalar(@wtrg);
83
84							# IBM1 SRC->TRG
85	0	0					if (@wsrc>1){
86	0						unshift @wtrg,"NULL";
87	0						$numWt++;
88							}
89	0						my $prod=1;
90	0						for my $ws (@wsrc){
91	0						my $sum=0;
92	0						for my $wt (@wtrg){
93	0						my $srctrg="$ws \|\|\| $wt";
94	0						$sum+=$pt_s->{$srctrg};
95							#print "\t$srctrg => $pt_s->{$srctrg} ( $sum )\n";
96							}
97	0						$prod*=$sum;
98							}
99							#calcul de la probabilitat ibm1
100							# my $ibm1_st=$prod/($numWt**$numWs);
101	0						my $ibm1_st=$prod;
102	0	0					if ($ibm1_st==0){
103	0						$ibm1_st=1.0e-40;
104							}
105							# print "$src \| $trg ==> prod:$prod I:$numWt J:$numWs ibmst: $ibm1_st\n";
106	0						return $ibm1_st;
107							}
108
109							# INPUT: ref to array A, ref to array B
110							# OUTPUT: list of positions of array A where array B begins in A
111							# EX: A=(4,5,8,3,9,22,8,3) B=(8,3) output=(2,6)
112							sub findArrayInAnother {
113	0			0	0		my ($refToSubArray,$refToArray)=@_;
114	0						my @startPosi;
115	0						my $numSub = scalar(@$refToSubArray);
116	0						my $numArray=scalar(@$refToArray);
117	0	0					if ($numArray >= $numSub){
118	0						for (my $i=0;$i<$numArray;$i++){
119	0						my $failed=0;
120	0						my $subi=0;
121	0		0				while (!$failed && $subi<$numSub){
122	0	0					if ($refToSubArray->[$subi] ne $refToArray->[$i+$subi]){$failed=1;}
	0
123	0						$subi++;
124							}
125	0	0					if (!$failed){push @startPosi,$i;}
	0
126							}
127							}
128	0						return @startPosi;
129							}
130
131							#########################################################
132							### SUBROUTINES COPIED FROM ALGORITHM::DIFF CPAN MODULE #
133							#########################################################
134							sub _withPositionsOfInInterval
135							{
136	0			0			my $aCollection = shift; # array ref
137	0						my $start = shift;
138	0						my $end = shift;
139	0						my $keyGen = shift;
140	0						my %d;
141							my $index;
142	0						for ( $index = $start ; $index <= $end ; $index++ )
143							{
144	0						my $element = $aCollection->[$index];
145	0						my $key = &$keyGen( $element, @_ );
146	0	0					if ( exists( $d{$key} ) )
147							{
148	0						unshift ( @{ $d{$key} }, $index );
	0
149							}
150							else
151							{
152	0						$d{$key} = [$index];
153							}
154							}
155	0	0					return wantarray ? %d : \%d;
156							}
157
158							sub _replaceNextLargerWith
159							{
160	0			0			my ( $array, $aValue, $high ) = @_;
161	0		0				$high \|\|= $#$array;
162
163							# off the end?
164	0	0	0				if ( $high == -1 \|\| $aValue > $array->[-1] )
165							{
166	0						push ( @$array, $aValue );
167	0						return $high + 1;
168							}
169
170							# binary search for insertion point...
171	0						my $low = 0;
172	0						my $index;
173							my $found;
174	0						while ( $low <= $high )
175							{
176	0						$index = ( $high + $low ) / 2;
177
178							# $index = int(( $high + $low ) / 2); # without 'use integer'
179	0						$found = $array->[$index];
180
181	0	0					if ( $aValue == $found )
		0
182							{
183	0						return undef;
184							}
185							elsif ( $aValue > $found )
186							{
187	0						$low = $index + 1;
188							}
189							else
190							{
191	0						$high = $index - 1;
192							}
193							}
194
195							# now insertion point is in $low.
196	0						$array->[$low] = $aValue; # overwrite next larger
197	0						return $low;
198							}
199
200							sub _longestCommonSubsequence
201							{
202	0			0			my $a = shift; # array ref or hash ref
203	0						my $b = shift; # array ref or hash ref
204	0						my $counting = shift; # scalar
205	0						my $keyGen = shift; # code ref
206	0						my $compare; # code ref
207
208	0	0					if ( ref($a) eq 'HASH' )
209							{ # prepared hash must be in $b
210	0						my $tmp = $b;
211	0						$b = $a;
212	0						$a = $tmp;
213							}
214
215							# Check for bogus (non-ref) argument values
216	0	0	0				if ( !ref($a) \|\| !ref($b) )
217							{
218	0						my @callerInfo = caller(1);
219	0						die 'error: must pass array or hash references to ' . $callerInfo[3];
220							}
221
222							# set up code refs
223							# Note that these are optimized.
224	0	0					if ( !defined($keyGen) ) # optimize for strings
225							{
226	0			0			$keyGen = sub { $_[0] };
	0
227	0			0			$compare = sub { my ( $a, $b ) = @_; $a eq $b };
	0
	0
228							}
229							else
230							{
231							$compare = sub {
232	0			0			my $a = shift;
233	0						my $b = shift;
234	0						&$keyGen( $a, @_ ) eq &$keyGen( $b, @_ );
235	0						};
236							}
237
238	0						my ( $aStart, $aFinish, $matchVector ) = ( 0, $#$a, [] );
239	0						my ( $prunedCount, $bMatches ) = ( 0, {} );
240
241	0	0					if ( ref($b) eq 'HASH' ) # was $bMatches prepared for us?
242							{
243	0						$bMatches = $b;
244							}
245							else
246							{
247	0						my ( $bStart, $bFinish ) = ( 0, $#$b );
248
249							# First we prune off any common elements at the beginning
250	0		0				while ( $aStart <= $aFinish
			0
251							and $bStart <= $bFinish
252							and &$compare( $a->[$aStart], $b->[$bStart], @_ ) )
253							{
254	0						$matchVector->[ $aStart++ ] = $bStart++;
255	0						$prunedCount++;
256							}
257
258							# now the end
259	0		0				while ( $aStart <= $aFinish
			0
260							and $bStart <= $bFinish
261							and &$compare( $a->[$aFinish], $b->[$bFinish], @_ ) )
262							{
263	0						$matchVector->[ $aFinish-- ] = $bFinish--;
264	0						$prunedCount++;
265							}
266
267							# Now compute the equivalence classes of positions of elements
268							$bMatches =
269	0						_withPositionsOfInInterval( $b, $bStart, $bFinish, $keyGen, @_ );
270							}
271	0						my $thresh = [];
272	0						my $links = [];
273
274	0						my ( $i, $ai, $j, $k );
275	0						for ( $i = $aStart ; $i <= $aFinish ; $i++ )
276							{
277	0						$ai = &$keyGen( $a->[$i], @_ );
278	0	0					if ( exists( $bMatches->{$ai} ) )
279							{
280	0						$k = 0;
281	0						for $j ( @{ $bMatches->{$ai} } )
	0
282							{
283
284							# optimization: most of the time this will be true
285	0	0	0				if ( $k and $thresh->[$k] > $j and $thresh->[ $k - 1 ] < $j )
			0
286							{
287	0						$thresh->[$k] = $j;
288							}
289							else
290							{
291	0						$k = _replaceNextLargerWith( $thresh, $j, $k );
292							}
293
294							# oddly, it's faster to always test this (CPU cache?).
295	0	0					if ( defined($k) )
296							{
297	0	0					$links->[$k] =
298							[ ( $k ? $links->[ $k - 1 ] : undef ), $i, $j ];
299							}
300							}
301							}
302							}
303
304	0	0					if (@$thresh)
		0
305							{
306	0	0					return $prunedCount + @$thresh if $counting;
307	0						for ( my $link = $links->[$#$thresh] ; $link ; $link = $link->[0] )
308							{
309	0						$matchVector->[ $link->[1] ] = $link->[2];
310							}
311							}
312							elsif ($counting)
313							{
314	0						return $prunedCount;
315							}
316
317	0	0					return wantarray ? @$matchVector : $matchVector;
318							}
319
320							sub LCS
321							{
322	0			0	0		my $a = shift; # array ref
323	0						my $b = shift; # array ref or hash ref
324	0						my $matchVector = _longestCommonSubsequence( $a, $b, 0, @_ );
325	0						my @retval;
326							my $i;
327	0						for ( $i = 0 ; $i <= $#$matchVector ; $i++ )
328							{
329	0	0					if ( defined( $matchVector->[$i] ) )
330							{
331	0						push ( @retval, $a->[$i] );
332							}
333							}
334	0	0					return wantarray ? @retval : \@retval;
335							}
336
337							sub LCS_ratio
338							{
339	0			0	0		my $a = shift; # array ref
340	0						my $b = shift; # array ref or hash ref
341
342	0						my $lon=_longestCommonSubsequence( $a, $b, 1, @_ );
343							# if ($lon>0){print "LCS:$lon\n";}
344	0						my $ratio;
345							#Aqu� hago el ratio de la similitud cogiendo como valor total la longitud de la frase que sea m�s larga
346	0						my $m=scalar(@$a);
347	0						my $n=scalar(@$b);
348	0	0	0				if (($m >= $n) && ($m != 0) && ($n != 0)){$ratio=$lon/$m;}
	0	0	0
	0		0
			0
349	0						elsif(($m < $n) && ($m != 0) && ($n != 0)){$ratio=$lon/$n;}
350							else{$ratio=0;}
351
352	0						return ($ratio);
353							}
354
355							sub traverse_sequences
356							{
357	0			0	0		my $a = shift; # array ref
358	0						my $b = shift; # array ref
359	0		0				my $callbacks = shift \|\| {};
360	0						my $keyGen = shift;
361	0		0	0			my $matchCallback = $callbacks->{'MATCH'} \|\| sub { };
	0
362	0		0	0			my $discardACallback = $callbacks->{'DISCARD_A'} \|\| sub { };
	0
363	0						my $finishedACallback = $callbacks->{'A_FINISHED'};
364	0		0	0			my $discardBCallback = $callbacks->{'DISCARD_B'} \|\| sub { };
	0
365	0						my $finishedBCallback = $callbacks->{'B_FINISHED'};
366	0						my $matchVector = _longestCommonSubsequence( $a, $b, 0, $keyGen, @_ );
367
368							# Process all the lines in @$matchVector
369	0						my $lastA = $#$a;
370	0						my $lastB = $#$b;
371	0						my $bi = 0;
372	0						my $ai;
373
374	0						for ( $ai = 0 ; $ai <= $#$matchVector ; $ai++ )
375							{
376	0						my $bLine = $matchVector->[$ai];
377	0	0					if ( defined($bLine) ) # matched
378							{
379	0						&$discardBCallback( $ai, $bi++, @_ ) while $bi < $bLine;
380	0						&$matchCallback( $ai, $bi++, @_ );
381							}
382							else
383							{
384	0						&$discardACallback( $ai, $bi, @_ );
385							}
386							}
387
388							# The last entry (if any) processed was a match.
389							# $ai and $bi point just past the last matching lines in their sequences.
390
391	0		0				while ( $ai <= $lastA or $bi <= $lastB )
392							{
393
394							# last A?
395	0	0	0				if ( $ai == $lastA + 1 and $bi <= $lastB )
396							{
397	0	0					if ( defined($finishedACallback) )
398							{
399	0						&$finishedACallback( $lastA, @_ );
400	0						$finishedACallback = undef;
401							}
402							else
403							{
404	0						&$discardBCallback( $ai, $bi++, @_ ) while $bi <= $lastB;
405							}
406							}
407
408							# last B?
409	0	0	0				if ( $bi == $lastB + 1 and $ai <= $lastA )
410							{
411	0	0					if ( defined($finishedBCallback) )
412							{
413	0						&$finishedBCallback( $lastB, @_ );
414	0						$finishedBCallback = undef;
415							}
416							else
417							{
418	0						&$discardACallback( $ai++, $bi, @_ ) while $ai <= $lastA;
419							}
420							}
421
422	0	0					&$discardACallback( $ai++, $bi, @_ ) if $ai <= $lastA;
423	0	0					&$discardBCallback( $ai, $bi++, @_ ) if $bi <= $lastB;
424							}
425
426	0						return 1;
427							}
428
429							#diff computes the smallest set of additions and deletions necessary to turn the first sequence into the second, and returns a description of these changes. The description is a list of hunks; each hunk represents a contiguous section of items which should be added, deleted, or replaced. (Hunks containing unchanged items are not included.)
430							# EXAMPLES:
431							# @diffs = diff( \@seq1, \@seq2 );
432							# $diffs_ref = diff( \@seq1, \@seq2 );
433							sub diff
434							{
435	0			0	0		my $a = shift; # array ref
436	0						my $b = shift; # array ref
437	0						my $retval = [];
438	0						my $hunk = [];
439							my $discard = sub {
440	0			0			push @$hunk, [ '-', $_[0], $a->[ $_[0] ] ];
441	0						};
442							my $add = sub {
443	0			0			push @$hunk, [ '+', $_[1], $b->[ $_[1] ] ];
444	0						};
445							my $match = sub {
446	0	0		0			push @$retval, $hunk
447							if 0 < @$hunk;
448	0						$hunk = []
449	0						};
450	0						traverse_sequences( $a, $b,
451							{ MATCH => $match, DISCARD_A => $discard, DISCARD_B => $add }, @_ );
452	0						&$match();
453	0	0					return wantarray ? @$retval : $retval;
454							}
455
456							###### END ALGORITHM::DIFF FUNCTIONS
457
458
459							sub max {
460	0			0	0		my ($a,$b)=@_;
461	0	0					if ($a > $b) {return $a;}
	0
	0
462							else {return $b};
463							}
464
465							sub min {
466	0			0	0		my ($a,$b)=@_;
467	0	0					if ($a < $b) {return $a;}
	0
	0
468							else {return $b};
469							}
470
471							### copy-paste of Algorithm::MinMax module (to avoid having dependencies)
472							sub minmax {
473	0			0	0		my @array = @{ $_[ 1 ] };
	0
474	0						my @result;
475	0	0					if( scalar( @array ) == 0 ) {
476	0						return @result;
477							}
478	0	0					if( scalar( @array ) == 1 ) {
479	0						$result[ 0 ] = $array[ 0 ];
480	0						$result[ 1 ] = $array[ 0 ];
481	0						return @result;
482							}
483	0						my @min_cand;
484							my @max_cand;
485	0						my $r = scalar( @array ) - 2;
486	0						my $k = 0;
487	0						for( my $i = 0; $i <= $r ; $i = $i + 2 ) {
488	0	0					if( $array[ $i ] < $array[ $i + 1 ] ) {
489	0						$min_cand[ $k ] = $array[ $i ];
490	0						$max_cand[ $k ] = $array[ $i + 1 ];
491							} else {
492	0						$min_cand[ $k ] = $array[ $i + 1 ];
493	0						$max_cand[ $k ] = $array[ $i ];
494							}
495	0						++$k;
496							}
497	0	0					if( scalar( @array ) % 2 != 0 ) {
498	0	0					if( $min_cand[ 0 ] < $array[ $r + 1 ] ) {
499	0						$max_cand[ $k ] = $array[ $r + 1 ];
500							} else {
501	0						$min_cand[ $k ] = $array[ $r + 1 ];
502							}
503							}
504	0						my $m = $min_cand[ 0 ];
505	0						for( my $i = 1; $i < scalar( @min_cand ); ++$i ) {
506	0	0					if( $min_cand[ $i ] < $m ) {
507	0						$m = $min_cand[ $i ];
508							}
509							}
510	0						$result[ 0 ] = $m;
511	0						$m = $max_cand[ 0 ];
512	0						for( my $i = 1; $i < scalar( @max_cand ); ++$i ) {
513	0	0					if( $max_cand[ $i ] > $m ) {
514	0						$m = $max_cand[ $i ];
515							}
516							}
517	0						$result[ 1 ] = $m;
518	0						@result;
519							}
520
521							### END Algorithm::MinMax function
522
523
524
525							sub escapeRegExp {
526	0			0	0		my $line = shift;
527							# regExp characters to escape: \ \| ( ) [ { ^ $ * + ? .
528	0						$line =~ s/\\/\\\\/g;
529	0						$line =~ s/\\|/\\\\|/g;
530	0						$line =~ s/\(/\\\(/g;
531	0						$line =~ s/\)/\\\)/g;
532	0						$line =~ s/\[/\\\[/g;
533	0						$line =~ s/\{/\\\}/g;
534	0						$line =~ s/\^/\\\^/g;
535	0						$line =~ s/\$/\\\$/g;
536	0						$line =~ s/\/\\\/g;
537	0						$line =~ s/\+/\\\+/g;
538	0						$line =~ s/\?/\\\?/g;
539	0						$line =~ s/\./\\\./g;
540	0						return $line;
541							}
542
543							1;