|  line  | 
 stmt  | 
 bran  | 
 cond  | 
 sub  | 
 pod  | 
 time  | 
 code  | 
| 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 package String::Trigram;  | 
| 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
3
 | 
1
 | 
 
 | 
 
 | 
  
1
  
 | 
 
 | 
6814
 | 
 use Carp;  | 
| 
 
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
2
 | 
    | 
| 
 
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
58
 | 
    | 
| 
4
 | 
1
 | 
 
 | 
 
 | 
  
1
  
 | 
 
 | 
728
 | 
 use locale;  | 
| 
 
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
218
 | 
    | 
| 
 
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
5
 | 
    | 
| 
5
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
6
 | 
1
 | 
 
 | 
 
 | 
  
1
  
 | 
 
 | 
28
 | 
 use 5.6.0;  | 
| 
 
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
7
 | 
    | 
| 
 
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
40
 | 
    | 
| 
7
 | 
1
 | 
 
 | 
 
 | 
  
1
  
 | 
 
 | 
4
 | 
 use strict;  | 
| 
 
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
2
 | 
    | 
| 
 
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
29
 | 
    | 
| 
8
 | 
1
 | 
 
 | 
 
 | 
  
1
  
 | 
 
 | 
4
 | 
 use warnings;  | 
| 
 
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
2
 | 
    | 
| 
 
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
2500
 | 
    | 
| 
9
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
10
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 require Exporter;  | 
| 
11
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
12
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 our @ISA       = qw(Exporter);  | 
| 
13
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 our @EXPORT_OK = ('compare');  | 
| 
14
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 our $VERSION   = '0.12';  | 
| 
15
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
16
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 our $DEFAULT_MIN_SIM          = 0;  | 
| 
17
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 our $DEFAULT_WARP             = 1.0;  | 
| 
18
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 our $DEFAULT_IGNORE_CASE      = 1;  | 
| 
19
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 our $DEFAULT_KEEP_ONLY_ALNUMS = 0;  | 
| 
20
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 our $DEFAULT_DEBUG            = 0;  | 
| 
21
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 our $DEFAULT_NGRAM_LEN        = 3;  | 
| 
22
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 our $DEFAULT_PADDING          = $DEFAULT_NGRAM_LEN - 1;  | 
| 
23
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
24
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub new {  | 
| 
25
 | 
39
 | 
 
 | 
 
 | 
  
39
  
 | 
  
1
  
 | 
598
 | 
 	my ( $pkg, %params ) = @_;  | 
| 
26
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
27
 | 
39
 | 
 
 | 
 
 | 
 
 | 
 
 | 
57
 | 
 	my $seen = {};  | 
| 
28
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
29
 | 
39
 | 
 
 | 
 
 | 
 
 | 
 
 | 
79
 | 
 	_setParams( \%params );  | 
| 
30
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
31
 | 
39
 | 
 
 | 
 
 | 
 
 | 
 
 | 
105
 | 
 	foreach ( keys %params ) {  | 
| 
32
 | 
312
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
895
 | 
 		croak "Unknown parameter $_!"  | 
| 
33
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 		  if ( $_ !~  | 
| 
34
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 /^(cmpBase|minSim|warp|ignoreCase|keepOnlyAlNums|padding|debug|ngram)$/  | 
| 
35
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 		  );  | 
| 
36
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	}  | 
| 
37
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
38
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	# check for reasonable values  | 
| 
39
 | 
39
 | 
  
 50
  
 | 
  
 33
  
 | 
 
 | 
 
 | 
122
 | 
 	if ( ( !$params{cmpBase} ) or ( ref( @{ $params{cmpBase} } ne 'ARRAY' ) ) )  | 
| 
 
 | 
39
 | 
 
 | 
 
 | 
 
 | 
 
 | 
143
 | 
    | 
| 
40
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	{  | 
| 
41
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
 		croak  | 
| 
42
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 "We need a base for comparison, so you should specify the parameter cmpBase as a reference to an anonymous array of strings being the base of comparison. Don't bother specifying anything else, but this we do need.\n";  | 
| 
43
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	}  | 
| 
44
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
45
 | 
39
 | 
  
 50
  
 | 
  
 33
  
 | 
 
 | 
 
 | 
157
 | 
 	if ( ( $params{minSim} < 0 ) || ( $params{minSim} > 1.0 ) ) {  | 
| 
46
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
 		croak "Minimal similarity must be >= 0 and <= 1.0";  | 
| 
47
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	}  | 
| 
48
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
49
 | 
39
 | 
  
 50
  
 | 
  
 33
  
 | 
 
 | 
 
 | 
155
 | 
 	if ( defined $params{ngram} && ( $params{ngram} <= 0 ) ) {  | 
| 
50
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
 		croak "n in n-gram must be > 0";  | 
| 
51
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	}  | 
| 
52
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
53
 | 
39
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
83
 | 
 	if ( $params{warp} == 0 ) {  | 
| 
54
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
 		croak "Warp must be > 0";  | 
| 
55
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	}  | 
| 
56
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
57
 | 
39
 | 
  
 50
  
 | 
  
 33
  
 | 
 
 | 
 
 | 
150
 | 
 	if ( ( $params{padding} < 0 ) || ( $params{padding} > $params{ngram} - 1 ) )  | 
| 
58
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	{  | 
| 
59
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
 		croak "Padding must be between 0 and " . ( $params{ngram} - 1 ) . ".";  | 
| 
60
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	}  | 
| 
61
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
62
 | 
39
 | 
 
 | 
 
 | 
 
 | 
 
 | 
153
 | 
 	my $self = bless {  | 
| 
63
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 		ngram  => int( $params{ngram} ),  | 
| 
64
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 		minSim => $params{minSim},  | 
| 
65
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 		warp   => $params{warp},  | 
| 
66
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
67
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 		# index for trigrams  | 
| 
68
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 		trigIdx => _trigramify(  | 
| 
69
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 			$params{cmpBase},        $params{ignoreCase},  | 
| 
70
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 			$params{keepOnlyAlNums}, ' ' x $params{padding},  | 
| 
71
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 			undef,                   $params{ngram},  | 
| 
72
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 			$seen  | 
| 
73
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 		),  | 
| 
74
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
75
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 		# index of all strings fed to the object, so no string is  | 
| 
76
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 		# processed twice (this might lead to wrong results)  | 
| 
77
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 		seenStrings => $seen,  | 
| 
78
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
79
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 		ignoreCase     => $params{ignoreCase},  | 
| 
80
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 		keepOnlyAlNums => $params{keepOnlyAlNums},  | 
| 
81
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 		padding        => ' ' x $params{padding},  | 
| 
82
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 		debug          => $params{debug},  | 
| 
83
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
84
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	}, $pkg;  | 
| 
85
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
86
 | 
39
 | 
 
 | 
 
 | 
 
 | 
 
 | 
173
 | 
 	return $self;  | 
| 
87
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
88
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
89
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub compare {  | 
| 
90
 | 
31
 | 
 
 | 
 
 | 
  
31
  
 | 
  
1
  
 | 
1148
 | 
 	my ( $s1, $s2 ) = ( shift, shift );  | 
| 
91
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
92
 | 
31
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
64
 | 
 	croak "I need at least 2 strings to compare as parameters, died"  | 
| 
93
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	  unless defined $s2;  | 
| 
94
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
95
 | 
31
 | 
 
 | 
 
 | 
 
 | 
 
 | 
40
 | 
 	my $result = {};  | 
| 
96
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
97
 | 
31
 | 
 
 | 
 
 | 
 
 | 
 
 | 
111
 | 
 	new String::Trigram( cmpBase => [$s1], @_ )  | 
| 
98
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	  ->getSimilarStrings( $s2, $result );  | 
| 
99
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
100
 | 
31
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
368
 | 
 	$result->{$s1} || 0;  | 
| 
101
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
102
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
103
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub reInit {  | 
| 
104
 | 
3
 | 
 
 | 
 
 | 
  
3
  
 | 
  
1
  
 | 
19
 | 
 	my ( $self, $newCmpBase ) = @_;  | 
| 
105
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
106
 | 
3
 | 
  
 50
  
 | 
  
 33
  
 | 
 
 | 
 
 | 
20
 | 
 	if ( ( !$newCmpBase ) or ( ref( @$newCmpBase ne 'ARRAY' ) ) ) {  | 
| 
107
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
 		croak  | 
| 
108
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 "We need a base for comparison, so, as a parameter to this method, we do need a reference to an anonymous array of strings being the base of comparison.\n";  | 
| 
109
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	}  | 
| 
110
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
111
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
7
 | 
 	$self->{seenStrings} = {};  | 
| 
112
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
113
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
13
 | 
 	$self->_setTrigIdx(  | 
| 
114
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 		_trigramify(  | 
| 
115
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 			$newCmpBase,             $self->{ignoreCase},  | 
| 
116
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 			$self->{keepOnlyAlNums}, $self->{padding},  | 
| 
117
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 			undef,                   $self->{ngram},  | 
| 
118
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 			$self->{seenStrings}  | 
| 
119
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 		)  | 
| 
120
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	);  | 
| 
121
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
122
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
123
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub extendBase {  | 
| 
124
 | 
1
 | 
 
 | 
 
 | 
  
1
  
 | 
  
1
  
 | 
7
 | 
 	my ( $self, $newStrings ) = @_;  | 
| 
125
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
126
 | 
1
 | 
  
 50
  
 | 
  
 33
  
 | 
 
 | 
 
 | 
10
 | 
 	if ( ( !$newStrings ) or ( ref( @$newStrings ne 'ARRAY' ) ) ) {  | 
| 
127
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
 		croak  | 
| 
128
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 "We need to add to the base for comparison, so, as a parameter to this method, we do need a reference to an anonymous array of strings being added to the base of comparison.\n";  | 
| 
129
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	}  | 
| 
130
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
131
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	$self->_setTrigIdx(  | 
| 
132
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
6
 | 
 		_trigramify(  | 
| 
133
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 			$newStrings,             $self->{ignoreCase},  | 
| 
134
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 			$self->{keepOnlyAlNums}, $self->{padding},  | 
| 
135
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 			$self->{trigIdx},        $self->{ngram},  | 
| 
136
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 			$self->{seenStrings}  | 
| 
137
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 		)  | 
| 
138
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	);  | 
| 
139
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
140
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
141
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub minSim {  | 
| 
142
 | 
3
 | 
 
 | 
 
 | 
  
3
  
 | 
  
1
  
 | 
12
 | 
 	my ( $self, $newMinSim ) = @_;  | 
| 
143
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
144
 | 
3
 | 
  
 50
  
 | 
  
 33
  
 | 
 
 | 
 
 | 
14
 | 
 	if (   ( defined $newMinSim )  | 
| 
 
 | 
 
 | 
 
 | 
  
 66
  
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
145
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 		&& ( ( ( $newMinSim < 0 ) || ( $newMinSim > 1.0 ) ) ) )  | 
| 
146
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	{  | 
| 
147
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
 		croak "Minimal similarity must be >= 0 and <= 1.0";  | 
| 
148
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	}  | 
| 
149
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
150
 | 
3
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
7
 | 
 	$self->{minSim} = $newMinSim if ($newMinSim);  | 
| 
151
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
152
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
7
 | 
 	return $self->{minSim};  | 
| 
153
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
154
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
155
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub warp {  | 
| 
156
 | 
0
 | 
 
 | 
 
 | 
  
0
  
 | 
  
1
  
 | 
0
 | 
 	my ( $self, $newWarp ) = @_;  | 
| 
157
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
158
 | 
0
 | 
  
  0
  
 | 
  
  0
  
 | 
 
 | 
 
 | 
0
 | 
 	if ( ( defined $newWarp ) && ( $newWarp <= 0 ) ) {  | 
| 
159
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
 		croak "Warp must be > 0";  | 
| 
160
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	}  | 
| 
161
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
162
 | 
0
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
0
 | 
 	$self->{warp} = $newWarp if ($newWarp);  | 
| 
163
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
164
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
 	return $self->{warp};  | 
| 
165
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
166
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
167
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub ignoreCase {  | 
| 
168
 | 
0
 | 
 
 | 
 
 | 
  
0
  
 | 
  
1
  
 | 
0
 | 
 	my ( $self, $newIgnoreCase ) = @_;  | 
| 
169
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
170
 | 
0
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
0
 | 
 	$self->{ignoreCase} = $newIgnoreCase if ($newIgnoreCase);  | 
| 
171
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
172
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
 	return $self->{ignoreCase};  | 
| 
173
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
174
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
175
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub keepOnlyAlNums {  | 
| 
176
 | 
0
 | 
 
 | 
 
 | 
  
0
  
 | 
  
1
  
 | 
0
 | 
 	my ( $self, $newKeepOnlyAlNums ) = @_;  | 
| 
177
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
178
 | 
0
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
0
 | 
 	$self->{keepOnlyAlNums} = $newKeepOnlyAlNums if ($newKeepOnlyAlNums);  | 
| 
179
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
180
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
 	return $self->{keepOnlyAlNums};  | 
| 
181
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
182
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
183
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub padding {  | 
| 
184
 | 
0
 | 
 
 | 
 
 | 
  
0
  
 | 
  
1
  
 | 
0
 | 
 	my ( $self, $newPadding ) = @_;  | 
| 
185
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
186
 | 
0
 | 
  
  0
  
 | 
  
  0
  
 | 
 
 | 
 
 | 
0
 | 
 	if (   ( defined $newPadding )  | 
| 
 
 | 
 
 | 
 
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
187
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 		&& ( ( ( $newPadding < 0 ) || ( $newPadding > $self->{ngram} - 1 ) ) ) )  | 
| 
188
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	{  | 
| 
189
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
 		croak "Padding must be between 0 and " . $self->{ngram} - 1 . ".";  | 
| 
190
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	}  | 
| 
191
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
192
 | 
0
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
0
 | 
 	$self->{padding} = ' ' x $newPadding if ($newPadding);  | 
| 
193
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
194
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
 	return length $self->{padding};  | 
| 
195
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
196
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
197
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub debug {  | 
| 
198
 | 
0
 | 
 
 | 
 
 | 
  
0
  
 | 
  
1
  
 | 
0
 | 
 	my ( $self, $newDebug ) = @_;  | 
| 
199
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
200
 | 
0
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
0
 | 
 	$self->{debug} = $newDebug if ( defined($newDebug) );  | 
| 
201
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
202
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
 	return $self->{debug};  | 
| 
203
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
204
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
205
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # Splits str into trigrams and looks up every trigram in trigIdx. If  | 
| 
206
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # successfull, it finds a list of strings containing the trigram and  | 
| 
207
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # increases the value of the string containing the trigram by 1 in the  | 
| 
208
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # lexical $simInfo (ref. to hash). Uses _computeSimilarity() to compute  | 
| 
209
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # the similarity value.  | 
| 
210
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #  | 
| 
211
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # Parameters  | 
| 
212
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #  | 
| 
213
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # result KEY = matching string, VALUE = similarity value  | 
| 
214
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # str    string to be matched  | 
| 
215
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # data   further key-val-pairs for min sim and warp  | 
| 
216
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #  | 
| 
217
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # Returns  | 
| 
218
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #  | 
| 
219
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # 0-n number of similar strings  | 
| 
220
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # -1  no match found  | 
| 
221
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
222
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub getSimilarStrings {  | 
| 
223
 | 
53
 | 
 
 | 
 
 | 
  
53
  
 | 
  
1
  
 | 
98
 | 
 	my $self   = shift;  | 
| 
224
 | 
53
 | 
 
 | 
 
 | 
 
 | 
 
 | 
61
 | 
 	my $str    = shift;  | 
| 
225
 | 
53
 | 
 
 | 
 
 | 
 
 | 
 
 | 
52
 | 
 	my $result = shift;  | 
| 
226
 | 
53
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
103
 | 
 	my %data   = @_ if @_;  | 
| 
227
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
228
 | 
53
 | 
 
 | 
 
 | 
 
 | 
 
 | 
55
 | 
 	my $curMinSim = $data{minSim};  | 
| 
229
 | 
53
 | 
 
 | 
 
 | 
 
 | 
 
 | 
54
 | 
 	my $curWarp   = $data{warp};  | 
| 
230
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
231
 | 
53
 | 
 
 | 
  
100
  
 | 
 
 | 
 
 | 
187
 | 
 	$curMinSim ||= $self->{minSim};  | 
| 
232
 | 
53
 | 
 
 | 
  
 66
  
 | 
 
 | 
 
 | 
148
 | 
 	$curWarp   ||= $self->{warp};  | 
| 
233
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
234
 | 
53
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
129
 | 
 	croak  | 
| 
235
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 "I need a reference to a hash as second parameter for getSimilarStrings()!"  | 
| 
236
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	  if ( ref($result) ne 'HASH' );  | 
| 
237
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
238
 | 
53
 | 
 
 | 
 
 | 
 
 | 
 
 | 
53
 | 
 	my $trigram;     # contains current trigram  | 
| 
239
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	my $matches;     # is pointed to all strings containing current trigram  | 
| 
240
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
 	my $len;         # length of the string to compare  | 
| 
241
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
 	my $actNum;      # that's how many times current trigram is found in string  | 
| 
242
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
 	my $actName;     # this is a string containing current trigram  | 
| 
243
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
 	my $actMatch;    # current match  | 
| 
244
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
245
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	# KEY = trigram, SUBKEY = potentially similar string, VALUE = number  | 
| 
246
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	# of times, trigrams is found in string.  | 
| 
247
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	# Here the frequency of every trigram in every string is saved. The  | 
| 
248
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	# table is filled with the existing strings containing some trigram  | 
| 
249
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	# the frequencys of the trigram in the string are noted and decreased  | 
| 
250
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	# every time, a matching trigram is found until the value is 0. If  | 
| 
251
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	# the value is 0, a match cannot be counted anymore.  | 
| 
252
 | 
53
 | 
 
 | 
 
 | 
 
 | 
 
 | 
69
 | 
 	my %trigNumBuf = ();  | 
| 
253
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
254
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	# KEY = potentially similar string, VALUE = number of identical trigrams  | 
| 
255
 | 
53
 | 
 
 | 
 
 | 
 
 | 
 
 | 
69
 | 
 	my $simInfo = {};  | 
| 
256
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
257
 | 
53
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
126
 | 
 	$str =~ s/\W//g if $self->{keepOnlyAlNums};  | 
| 
258
 | 
53
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
115
 | 
 	$str = lc $str if $self->{ignoreCase};  | 
| 
259
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
260
 | 
53
 | 
 
 | 
 
 | 
 
 | 
 
 | 
83
 | 
 	$str = $self->{padding} . $str . $self->{padding};  | 
| 
261
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
262
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	# Number of n-grams is length of string minus n + 1  | 
| 
263
 | 
53
 | 
 
 | 
 
 | 
 
 | 
 
 | 
64
 | 
 	$len = length($str) - $self->{ngram} + 1;  | 
| 
264
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
265
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	# **********************************************************  | 
| 
266
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	# divide string to compare into trigrams and search trigrams  | 
| 
267
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	# **********************************************************  | 
| 
268
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
269
 | 
53
 | 
 
 | 
 
 | 
 
 | 
 
 | 
114
 | 
 	for ( my $i = 0 ; $i < $len ; $i++ ) {  | 
| 
270
 | 
339
 | 
 
 | 
 
 | 
 
 | 
 
 | 
486
 | 
 		$trigram = substr( $str, $i, $self->{ngram} );  | 
| 
271
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
272
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 		# look for every trigram in $self->{trigIdx}  | 
| 
273
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 		# contine unless found  | 
| 
274
 | 
339
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
820
 | 
 		next unless ( exists( $self->{trigIdx}->{$trigram} ) );  | 
| 
275
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
276
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 		# point matches to strings containing current trigram  | 
| 
277
 | 
180
 | 
 
 | 
 
 | 
 
 | 
 
 | 
233
 | 
 		$matches = $self->{trigIdx}->{$trigram};  | 
| 
278
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
279
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 		# check every string containing current trigram  | 
| 
280
 | 
180
 | 
 
 | 
 
 | 
 
 | 
 
 | 
490
 | 
 		while ( ( $actName, $actMatch ) = each %$matches ) {  | 
| 
281
 | 
211
 | 
 
 | 
 
 | 
 
 | 
 
 | 
296
 | 
 			$actNum = $actMatch  | 
| 
282
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 			  ->{trigs};    # so many times current trigram is found in string  | 
| 
283
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
284
 | 
211
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
607
 | 
 			$trigNumBuf{$trigram}->{$actName} = $actNum  | 
| 
285
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 			  unless ( exists( $trigNumBuf{$trigram}->{$actName} ) );  | 
| 
286
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
287
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 			# if there are instances of current trigram left for string  | 
| 
288
 | 
211
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
466
 | 
 			if ( $trigNumBuf{$trigram}->{$actName} > 0 ) {  | 
| 
289
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
290
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 				# mark that we used on instance of current trigram  | 
| 
291
 | 
205
 | 
 
 | 
 
 | 
 
 | 
 
 | 
267
 | 
 				$trigNumBuf{$trigram}->{$actName}--;  | 
| 
292
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
293
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 				# mark that we have found one more matching trigram for $actName  | 
| 
294
 | 
205
 | 
 
 | 
 
 | 
 
 | 
 
 | 
268
 | 
 				$simInfo->{$actName}->{name}++;  | 
| 
295
 | 
205
 | 
 
 | 
 
 | 
 
 | 
 
 | 
793
 | 
 				$simInfo->{$actName}->{len} = $actMatch->{len};  | 
| 
296
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 			}  | 
| 
297
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 		}  | 
| 
298
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	}  | 
| 
299
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
300
 | 
53
 | 
 
 | 
 
 | 
 
 | 
 
 | 
111
 | 
 	return $self->_computeSimilarity( $str, $simInfo, $result, $curMinSim,  | 
| 
301
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 		$curWarp );  | 
| 
302
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
303
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
304
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # Uses getSimilarStrings() to get matching strings and filters out the  | 
| 
305
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # best one(s).  | 
| 
306
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #  | 
| 
307
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # Parameters  | 
| 
308
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #  | 
| 
309
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # $inpStr     string to be matched  | 
| 
310
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # $outStrList list of best strings  | 
| 
311
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #  | 
| 
312
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # Returns  | 
| 
313
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #  | 
| 
314
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # similarity value of best match or -1, of no match  | 
| 
315
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
316
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub getBestMatch {  | 
| 
317
 | 
8
 | 
 
 | 
 
 | 
  
8
  
 | 
  
1
  
 | 
41
 | 
 	my $self       = shift;  | 
| 
318
 | 
8
 | 
 
 | 
 
 | 
 
 | 
 
 | 
12
 | 
 	my $inpStr     = shift;  | 
| 
319
 | 
8
 | 
 
 | 
 
 | 
 
 | 
 
 | 
11
 | 
 	my $outStrList = shift;  | 
| 
320
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
321
 | 
8
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
17
 | 
 	croak  | 
| 
322
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	  "I need a reference to an array as second parameter for getBestMatch()!"  | 
| 
323
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	  if ( ref($outStrList) ne 'ARRAY' );  | 
| 
324
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
325
 | 
8
 | 
 
 | 
 
 | 
 
 | 
 
 | 
11
 | 
 	my $maxVal   = -1;    # similarity of maximally similar string  | 
| 
326
 | 
8
 | 
 
 | 
 
 | 
 
 | 
 
 | 
10
 | 
 	my $name     = "";    # current potentially similar string  | 
| 
327
 | 
8
 | 
 
 | 
 
 | 
 
 | 
 
 | 
9
 | 
 	my $val      = -1;    # similarity of $name  | 
| 
328
 | 
8
 | 
 
 | 
 
 | 
 
 | 
 
 | 
10
 | 
 	my $rslt     = {};    # KEY = similar name, VALUE = degree of similarity  | 
| 
329
 | 
8
 | 
 
 | 
 
 | 
 
 | 
 
 | 
13
 | 
 	my @rsltKeys = ();    # contains keys(%rslt)  | 
| 
330
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
331
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	# clear list ultimatively containing similar strings  | 
| 
332
 | 
8
 | 
 
 | 
 
 | 
 
 | 
 
 | 
11
 | 
 	@$outStrList = ();  | 
| 
333
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
334
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	# there are no similar strings at all  | 
| 
335
 | 
8
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
17
 | 
 	if ( $self->getSimilarStrings( $inpStr, $rslt, @_ ) == 0 ) {  | 
| 
336
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
7
 | 
 		return 0;  | 
| 
337
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	}  | 
| 
338
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
339
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	# there is at least one similar string  | 
| 
340
 | 
7
 | 
 
 | 
 
 | 
 
 | 
 
 | 
21
 | 
 	@rsltKeys = keys(%$rslt);  | 
| 
341
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
342
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	# looking for the best matching string, make the first one the best ...  | 
| 
343
 | 
7
 | 
 
 | 
 
 | 
 
 | 
 
 | 
9
 | 
 	$name   = $rsltKeys[0];  | 
| 
344
 | 
7
 | 
 
 | 
 
 | 
 
 | 
 
 | 
10
 | 
 	$maxVal = $rslt->{$name};  | 
| 
345
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
346
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # since there might be several best strings (containing same degree of similarity)  | 
| 
347
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # we put them into an array and not into a scalar  | 
| 
348
 | 
7
 | 
 
 | 
 
 | 
 
 | 
 
 | 
12
 | 
 	push @$outStrList, $name;  | 
| 
349
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
350
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	# ... and check if there are better ones  | 
| 
351
 | 
7
 | 
 
 | 
 
 | 
 
 | 
 
 | 
14
 | 
 	for ( 1 .. @rsltKeys - 1 ) {  | 
| 
352
 | 
4
 | 
 
 | 
 
 | 
 
 | 
 
 | 
7
 | 
 		$name = $rsltKeys[$_];  | 
| 
353
 | 
4
 | 
 
 | 
 
 | 
 
 | 
 
 | 
5
 | 
 		$val  = $rslt->{ $rsltKeys[$_] };  | 
| 
354
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
355
 | 
4
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
19
 | 
 		if ( $val == $maxVal ) {  | 
| 
 
 | 
 
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
356
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
 			push @$outStrList, $name;  | 
| 
357
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 		}  | 
| 
358
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 		elsif ( $val > $maxVal ) {  | 
| 
359
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
360
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # if we found a still better string, clear the array and put the better string into it  | 
| 
361
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
3
 | 
 			@$outStrList = ();  | 
| 
362
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
4
 | 
 			$maxVal      = $val;  | 
| 
363
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
5
 | 
 			push @$outStrList, $name;  | 
| 
364
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 		}  | 
| 
365
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	}  | 
| 
366
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
367
 | 
7
 | 
 
 | 
 
 | 
 
 | 
 
 | 
53
 | 
 	return $maxVal;  | 
| 
368
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
369
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
370
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub _setTrigIdx {  | 
| 
371
 | 
4
 | 
 
 | 
 
 | 
  
4
  
 | 
 
 | 
8
 | 
 	my ( $self, $newTrigIdx ) = @_;  | 
| 
372
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
373
 | 
4
 | 
 
 | 
 
 | 
 
 | 
 
 | 
11
 | 
 	$self->{trigIdx} = $newTrigIdx;  | 
| 
374
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
375
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
376
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub _getTrigIdx {  | 
| 
377
 | 
0
 | 
 
 | 
 
 | 
  
0
  
 | 
 
 | 
0
 | 
 	my $self = shift;  | 
| 
378
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
 	return $self->{trigIdx};  | 
| 
379
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
380
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
381
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # Computes similarity of potientially matching strings in hash  | 
| 
382
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # newSimInfo. The result is saved in newResult. The computation of the  | 
| 
383
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # similarity works like this:  | 
| 
384
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #  | 
| 
385
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # (a = all trigrams, d = different trigrams, e = warp)  | 
| 
386
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #  | 
| 
387
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # (a**e - d**e)/a**e  | 
| 
388
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #  | 
| 
389
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # The default for e is 1. If e is > 1.0, short strings are getting away  | 
| 
390
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # better, if e is < 1.0 short strings are getting away worse.  | 
| 
391
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #  | 
| 
392
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # Parameters  | 
| 
393
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #  | 
| 
394
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # $newStr     string to be matched  | 
| 
395
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # $newSimInfo KEY = potentially matching string, VALUE = number of matching trigrams  | 
| 
396
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # $newResult  KEY = actually matching string, VALUE = similarity value  | 
| 
397
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # $curMinSim  current minimal similarity  | 
| 
398
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # $curWarp    current warp  | 
| 
399
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #  | 
| 
400
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # Returns  | 
| 
401
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #  | 
| 
402
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # number of matching strings  | 
| 
403
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
404
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub _computeSimilarity {  | 
| 
405
 | 
53
 | 
 
 | 
 
 | 
  
53
  
 | 
 
 | 
58
 | 
 	my $self       = shift;  | 
| 
406
 | 
53
 | 
 
 | 
 
 | 
 
 | 
 
 | 
59
 | 
 	my $newStr     = shift;  | 
| 
407
 | 
53
 | 
 
 | 
 
 | 
 
 | 
 
 | 
54
 | 
 	my $newSimInfo = shift;  | 
| 
408
 | 
53
 | 
 
 | 
 
 | 
 
 | 
 
 | 
49
 | 
 	my $newResult  = shift;  | 
| 
409
 | 
53
 | 
 
 | 
 
 | 
 
 | 
 
 | 
81
 | 
 	my $curMinSim  = shift;  | 
| 
410
 | 
53
 | 
 
 | 
 
 | 
 
 | 
 
 | 
55
 | 
 	my $curWarp    = shift;  | 
| 
411
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
412
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	# clear hash containing the results  | 
| 
413
 | 
53
 | 
 
 | 
 
 | 
 
 | 
 
 | 
75
 | 
 	%$newResult = ();  | 
| 
414
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
415
 | 
53
 | 
 
 | 
 
 | 
 
 | 
 
 | 
50
 | 
 	my $strCnt = 0;    # number of similar strings (return value)  | 
| 
416
 | 
53
 | 
 
 | 
 
 | 
 
 | 
 
 | 
51
 | 
 	my $allTrigs;      # number of all trigrams  | 
| 
417
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	my $sameTrigs;     # number of same trigrams  | 
| 
418
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
 	my $actSim;        # similarity (0 - 1)  | 
| 
419
 | 
53
 | 
 
 | 
 
 | 
 
 | 
 
 | 
56
 | 
 	my $len =  | 
| 
420
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	  length($newStr);    # length of string - $padNum for the padded blanks  | 
| 
421
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
422
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # check every potientially similar string (i.e. every string containing at least one  | 
| 
423
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # identical trigram)  | 
| 
424
 | 
53
 | 
 
 | 
 
 | 
 
 | 
 
 | 
115
 | 
 	foreach ( keys(%$newSimInfo) ) {  | 
| 
425
 | 
50
 | 
 
 | 
 
 | 
 
 | 
 
 | 
65
 | 
 		$sameTrigs = $newSimInfo->{$_}->{name};  | 
| 
426
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
427
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # the number of n-grams in a string result from subtracting n from the length of  | 
| 
428
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # the string and adding 1. If it is padded with blanks, there is one additional  | 
| 
429
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # n-gram for each blank. So to compute the number of n-grams of two strings we  | 
| 
430
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # subtract n twice, add 2 and add the number of padded blanks * 2. Since $newStr  | 
| 
431
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # is already padded and the length noted for $_ contains already the padding we  | 
| 
432
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # do not need to take the padding explicitly into account. Finally, to get  | 
| 
433
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # $allTrigs (types not tokens) we need to subtract the number of matching trigrams  | 
| 
434
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # - those occuring in both strings - once.  | 
| 
435
 | 
50
 | 
 
 | 
 
 | 
 
 | 
 
 | 
96
 | 
 		$allTrigs =  | 
| 
436
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 		  $len + $newSimInfo->{$_}->{len} - 2 * $self->{ngram} - $sameTrigs + 2;  | 
| 
437
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
438
 | 
50
 | 
 
 | 
 
 | 
 
 | 
 
 | 
73
 | 
 		$actSim = _computeActSim( $sameTrigs, $allTrigs, $curWarp );  | 
| 
439
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
440
 | 
50
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
99
 | 
 		if ( $self->{debug} ) {  | 
| 
441
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
 			my $tmpStr = $self->{padding} . $_ . $self->{padding};  | 
| 
442
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
 			print STDERR "\nCompare\n";  | 
| 
443
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
 			print STDERR "$newStr ->",  | 
| 
444
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 			  sort join( ":", $newStr =~ /(?=(...))/g ), "<-\n";  | 
| 
445
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
 			print STDERR "$tmpStr ->",  | 
| 
446
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 			  sort join( ":", $tmpStr =~ /(?=(...))/g ), "<-\n";  | 
| 
447
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
 			print STDERR "-" x  | 
| 
448
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 			  ( 22 + length($newStr) + 2 * length( $self->{padding} ) +  | 
| 
449
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 				  length($_) ), "\n";  | 
| 
450
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
 			print STDERR "N-GRAM-LEN: ", $self->{ngram}, "\n";  | 
| 
451
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
 			print STDERR "ALL :       ", $allTrigs,  "\n";  | 
| 
452
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
 			print STDERR "SAME:       ", $sameTrigs, "\n";  | 
| 
453
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
 			print STDERR "DIFF:       ", $allTrigs - $sameTrigs, "\n";  | 
| 
454
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
 			print STDERR "ACTSIM:     ", $actSim, "\n";  | 
| 
455
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
 			print STDERR "PADDING:    ", length $self->{padding}, "\n";  | 
| 
456
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
 			print STDERR "MINSIM:     ", $self->{minSim}, "\n";  | 
| 
457
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
 			print STDERR "WARP:       ", $curWarp, "\n\n";  | 
| 
458
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 		}  | 
| 
459
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
460
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 		# count string as similar only if similarity exceeds minimal similarity  | 
| 
461
 | 
50
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
112
 | 
 		if ( $actSim > $curMinSim ) {  | 
| 
462
 | 
47
 | 
 
 | 
 
 | 
 
 | 
 
 | 
69
 | 
 			$newResult->{$_} = $actSim;  | 
| 
463
 | 
47
 | 
 
 | 
 
 | 
 
 | 
 
 | 
88
 | 
 			$strCnt++;  | 
| 
464
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 		}  | 
| 
465
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	}  | 
| 
466
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
467
 | 
53
 | 
 
 | 
 
 | 
 
 | 
 
 | 
275
 | 
 	return $strCnt;  | 
| 
468
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
469
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
470
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # compute similarity  | 
| 
471
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub _computeActSim {  | 
| 
472
 | 
50
 | 
 
 | 
 
 | 
  
50
  
 | 
 
 | 
51
 | 
 	my $sameTrigs = shift;  | 
| 
473
 | 
50
 | 
 
 | 
 
 | 
 
 | 
 
 | 
45
 | 
 	my $allTrigs  = shift;  | 
| 
474
 | 
50
 | 
 
 | 
 
 | 
 
 | 
 
 | 
52
 | 
 	my $curWarp   = shift;  | 
| 
475
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
476
 | 
50
 | 
 
 | 
 
 | 
 
 | 
 
 | 
80
 | 
 	my $diffTrigs = -1;    # number of different trigrams  | 
| 
477
 | 
50
 | 
 
 | 
 
 | 
 
 | 
 
 | 
45
 | 
 	my $actSim    = -1;    # similarity (0 - 1)  | 
| 
478
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
479
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	# no warp here so skip the complicated stuff below  | 
| 
480
 | 
50
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
77
 | 
 	if ( $curWarp == 1 ) {  | 
| 
481
 | 
48
 | 
 
 | 
 
 | 
 
 | 
 
 | 
93
 | 
 		$actSim = $sameTrigs / $allTrigs;  | 
| 
482
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	}  | 
| 
483
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	else {  | 
| 
484
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 		# we've got to take warp into account  | 
| 
485
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
4
 | 
 		$diffTrigs = $allTrigs - $sameTrigs;  | 
| 
486
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
30
 | 
 		$actSim    =  | 
| 
487
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 		  ( ( $allTrigs**$curWarp ) - ( $diffTrigs**$curWarp ) ) /  | 
| 
488
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 		  ( $allTrigs**$curWarp );  | 
| 
489
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	}  | 
| 
490
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
491
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
492
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # Takes list of strings and puts them into an index of trigrams. KEY is a trigram,  | 
| 
493
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # VALUE a list of strings containing that trigram. VALUE has two KEYS:  | 
| 
494
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #  | 
| 
495
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # trigs: count of trigram occurring in string  | 
| 
496
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # len:   length of string (if $keepAlNums, after applying s/\W//g)  | 
| 
497
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #  | 
| 
498
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # Parameters  | 
| 
499
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #  | 
| 
500
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # $list       list of strings being the base of comparison  | 
| 
501
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # $ignoreCase ignore case if 1  | 
| 
502
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # $keepAlNums s/\W//g if 1  | 
| 
503
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # $pad        contains 0 - 2 blanks for padding  | 
| 
504
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #  | 
| 
505
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # Returns  | 
| 
506
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #  | 
| 
507
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # trigram index  | 
| 
508
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
509
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub _trigramify {  | 
| 
510
 | 
43
 | 
 
 | 
 
 | 
  
43
  
 | 
 
 | 
52
 | 
 	my $list       = shift;  | 
| 
511
 | 
43
 | 
 
 | 
 
 | 
 
 | 
 
 | 
48
 | 
 	my $ignoreCase = shift;  | 
| 
512
 | 
43
 | 
 
 | 
 
 | 
 
 | 
 
 | 
46
 | 
 	my $keepAlNums = shift;  | 
| 
513
 | 
43
 | 
 
 | 
 
 | 
 
 | 
 
 | 
45
 | 
 	my $pad        = shift;  | 
| 
514
 | 
43
 | 
 
 | 
 
 | 
 
 | 
 
 | 
46
 | 
 	my $trigs      = shift;  | 
| 
515
 | 
43
 | 
 
 | 
 
 | 
 
 | 
 
 | 
47
 | 
 	my $ngram      = shift;  | 
| 
516
 | 
43
 | 
 
 | 
 
 | 
 
 | 
 
 | 
46
 | 
 	my $seen       = shift;  | 
| 
517
 | 
43
 | 
 
 | 
 
 | 
 
 | 
 
 | 
41
 | 
 	my $tmpStr;  | 
| 
518
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	my $len;  | 
| 
519
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
520
 | 
43
 | 
 
 | 
 
 | 
 
 | 
 
 | 
68
 | 
 	foreach (@$list) {  | 
| 
521
 | 
58
 | 
 
 | 
 
 | 
 
 | 
 
 | 
68
 | 
 		$tmpStr = $_;  | 
| 
522
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
523
 | 
58
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
105
 | 
 		$tmpStr =~ s/\W//g if $keepAlNums;  | 
| 
524
 | 
58
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
152
 | 
 		$tmpStr = lc $tmpStr if $ignoreCase;  | 
| 
525
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
526
 | 
58
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
109
 | 
 		next if exists $seen->{$tmpStr};  | 
| 
527
 | 
56
 | 
 
 | 
 
 | 
 
 | 
 
 | 
101
 | 
 		$seen->{$tmpStr} = 1;  | 
| 
528
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
529
 | 
56
 | 
 
 | 
 
 | 
 
 | 
 
 | 
88
 | 
 		$tmpStr = $pad . $tmpStr . $pad;  | 
| 
530
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
531
 | 
56
 | 
 
 | 
 
 | 
 
 | 
 
 | 
59
 | 
 		$len = length($tmpStr);  | 
| 
532
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
533
 | 
56
 | 
 
 | 
 
 | 
 
 | 
 
 | 
140
 | 
 		for ( my $i = 0 ; $i < ( length($tmpStr) - $ngram + 1 ) ; $i++ ) {  | 
| 
534
 | 
350
 | 
 
 | 
 
 | 
 
 | 
 
 | 
936
 | 
 			$trigs->{ substr( $tmpStr, $i, $ngram ) }->{$_}->{trigs}++;  | 
| 
535
 | 
350
 | 
 
 | 
 
 | 
 
 | 
 
 | 
1036
 | 
 			$trigs->{ substr( $tmpStr, $i, $ngram ) }->{$_}->{len} = $len;  | 
| 
536
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 		}  | 
| 
537
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	}  | 
| 
538
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
539
 | 
43
 | 
 
 | 
 
 | 
 
 | 
 
 | 
318
 | 
 	return $trigs;  | 
| 
540
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
541
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
542
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub _setParams {  | 
| 
543
 | 
39
 | 
 
 | 
 
 | 
  
39
  
 | 
 
 | 
43
 | 
 	my $params = shift;  | 
| 
544
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
545
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	# set defaults, if not specified otherwise  | 
| 
546
 | 
39
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
105
 | 
 	$params->{ngram}  = $DEFAULT_NGRAM_LEN unless exists $params->{ngram};  | 
| 
547
 | 
39
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
97
 | 
 	$params->{minSim} = $DEFAULT_MIN_SIM   unless exists $params->{minSim};  | 
| 
548
 | 
39
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
95
 | 
 	$params->{warp}   = $DEFAULT_WARP      unless exists $params->{warp};  | 
| 
549
 | 
39
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
101
 | 
 	$params->{ignoreCase} = $DEFAULT_IGNORE_CASE  | 
| 
550
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	  unless exists $params->{ignoreCase};  | 
| 
551
 | 
39
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
87
 | 
 	$params->{keepOnlyAlNums} = $DEFAULT_KEEP_ONLY_ALNUMS  | 
| 
552
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 	  unless exists $params->{keepOnlyAlNums};  | 
| 
553
 | 
39
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
103
 | 
 	$params->{padding} = $params->{ngram} - 1 unless exists $params->{padding};  | 
| 
554
 | 
39
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
121
 | 
 	$params->{debug}   = $DEFAULT_DEBUG       unless exists $params->{debug};  | 
| 
555
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
556
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
557
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 1;  | 
| 
558
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 __END__  |