line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Text::Sentence::Alignment; |
2
|
|
|
|
|
|
|
|
3
|
4
|
|
|
4
|
|
263943
|
use warnings; |
|
4
|
|
|
|
|
10
|
|
|
4
|
|
|
|
|
145
|
|
4
|
4
|
|
|
4
|
|
24
|
use strict; |
|
4
|
|
|
|
|
9
|
|
|
4
|
|
|
|
|
163
|
|
5
|
4
|
|
|
4
|
|
68
|
use List::Util qw(max min); |
|
4
|
|
|
|
|
15
|
|
|
4
|
|
|
|
|
16797
|
|
6
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
=head1 NAME |
8
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
Text::Sentence::Alignment - Two Sentence Alignment |
10
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
=head1 VERSION |
12
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
Version 0.12 |
14
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
=cut |
16
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
our $VERSION = '0.12'; |
18
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
=head1 SYNOPSIS |
20
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
This Module process two sentences (i.e. terms separated by space) alignment. |
22
|
|
|
|
|
|
|
Now it provide two kind of alignment method, Global and Local Alignment. |
23
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
use Text::Sentence::Alignment; |
25
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
my $TSA = Text::Sentence::Alignment->new(); |
27
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
# local alignment |
29
|
|
|
|
|
|
|
$TSA->is_local(1); |
30
|
|
|
|
|
|
|
my ($result1,$result2) = $TSA->do_alignment($s1,$s2); |
31
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
# global alignment |
33
|
|
|
|
|
|
|
$TSA->is_local(0); |
34
|
|
|
|
|
|
|
my ($result1,$result2) = $TSA->do_alignment($s1,$s2); |
35
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
=head1 FUNCTIONS |
37
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
=cut |
39
|
|
|
|
|
|
|
|
40
|
|
|
|
|
|
|
=head2 new |
41
|
|
|
|
|
|
|
|
42
|
|
|
|
|
|
|
=cut |
43
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
sub new { |
45
|
3
|
|
|
3
|
1
|
46785
|
my $class = shift; |
46
|
3
|
|
|
|
|
10
|
my $self = {}; |
47
|
3
|
|
|
|
|
14
|
$self->{IS_LOCAL} = 0; # 0 for global alignment |
48
|
3
|
|
|
|
|
10
|
$self->{DELIMETER} = '/'; # / for split tags |
49
|
3
|
|
|
|
|
10
|
$self->{TABLE} = (); # Dynamic Programming Table |
50
|
3
|
|
|
|
|
17
|
$self->{BEST} = (); # Best path, for local |
51
|
3
|
|
|
|
|
49
|
$self->{max_len} = 0; # for global |
52
|
3
|
|
|
|
|
11
|
$self->{SENARR1} = []; |
53
|
3
|
|
|
|
|
9
|
$self->{SENARR2} = []; |
54
|
3
|
|
|
|
|
9
|
$self->{has_tag} = 0; # if we need to do hierarchy match, |
55
|
|
|
|
|
|
|
# e.g. POS, semantic tags |
56
|
3
|
|
|
|
|
10
|
bless($self, $class); |
57
|
3
|
|
|
|
|
11
|
return($self); |
58
|
|
|
|
|
|
|
} |
59
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
=head2 is_local |
61
|
|
|
|
|
|
|
|
62
|
|
|
|
|
|
|
Set/get if current algorithm is local alignment |
63
|
|
|
|
|
|
|
|
64
|
|
|
|
|
|
|
=cut |
65
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
sub is_local { |
67
|
2
|
|
|
2
|
1
|
11
|
my $self = shift; |
68
|
2
|
50
|
|
|
|
13
|
if (@_) { $self->{IS_LOCAL} = shift } |
|
2
|
|
|
|
|
17
|
|
69
|
2
|
|
|
|
|
7
|
return $self->{IS_LOCAL} |
70
|
|
|
|
|
|
|
} |
71
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
=head2 delimeter |
73
|
|
|
|
|
|
|
|
74
|
|
|
|
|
|
|
The delimeter() is used to set delimeter of word/tags. |
75
|
|
|
|
|
|
|
|
76
|
|
|
|
|
|
|
=cut |
77
|
|
|
|
|
|
|
|
78
|
|
|
|
|
|
|
sub delimeter { |
79
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
80
|
0
|
0
|
|
|
|
0
|
if (@_) { $self->{DELIMETER} = shift } |
|
0
|
|
|
|
|
0
|
|
81
|
0
|
|
|
|
|
0
|
return $self->{DELIMETER} |
82
|
|
|
|
|
|
|
} |
83
|
|
|
|
|
|
|
|
84
|
|
|
|
|
|
|
=head2 do_alignment |
85
|
|
|
|
|
|
|
|
86
|
|
|
|
|
|
|
=cut |
87
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
sub do_alignment { |
89
|
4
|
|
|
4
|
1
|
9042
|
my $self = shift; |
90
|
4
|
|
|
|
|
9
|
my $sen1 = shift; |
91
|
4
|
|
|
|
|
7
|
my $sen2 = shift; |
92
|
4
|
|
|
|
|
19
|
@{ $self->{SENARR1} } = split / /,$sen1; |
|
4
|
|
|
|
|
31
|
|
93
|
4
|
|
|
|
|
18
|
@{ $self->{SENARR2} } = split / /,$sen2; |
|
4
|
|
|
|
|
89
|
|
94
|
4
|
50
|
33
|
|
|
116
|
if ($sen1 =~ m/$self->{DELIMETER}/ or |
95
|
|
|
|
|
|
|
$sen2 =~ m/$self->{DELIMETER}/) { |
96
|
0
|
|
|
|
|
0
|
$self->{has_tag} = 1; |
97
|
|
|
|
|
|
|
} |
98
|
4
|
|
|
|
|
11
|
$self->{TABLE} = (); |
99
|
4
|
|
|
|
|
21
|
$self->{BEST} = (); |
100
|
4
|
|
|
|
|
13
|
$self->{BEST}{MAX} = 0; |
101
|
4
|
|
|
|
|
12
|
$self->{TABLE}{0}{0} = 0; |
102
|
4
|
|
|
|
|
13
|
calculate_matrix($self); |
103
|
|
|
|
|
|
|
# similarity_print(); |
104
|
4
|
|
|
|
|
17
|
return get_align_result($self); |
105
|
|
|
|
|
|
|
} |
106
|
|
|
|
|
|
|
|
107
|
|
|
|
|
|
|
=head2 calculate_matrix |
108
|
|
|
|
|
|
|
|
109
|
|
|
|
|
|
|
=cut |
110
|
|
|
|
|
|
|
|
111
|
|
|
|
|
|
|
sub calculate_matrix { |
112
|
4
|
|
|
4
|
1
|
8
|
my $self = shift; |
113
|
4
|
|
|
|
|
6
|
my @sa1 = @{ $self->{SENARR1}}; |
|
4
|
|
|
|
|
23
|
|
114
|
4
|
|
|
|
|
9
|
my @sa2 = @{ $self->{SENARR2}}; |
|
4
|
|
|
|
|
16
|
|
115
|
4
|
100
|
|
|
|
20
|
if ($self->{IS_LOCAL}) { |
116
|
2
|
|
|
|
|
4
|
$self->{max_len}= 0; |
117
|
|
|
|
|
|
|
} else { |
118
|
2
|
50
|
|
|
|
6
|
$self->{max_len} = scalar(@sa1) > scalar(@sa2) ? scalar(@sa1): scalar(@sa2); # for global |
119
|
|
|
|
|
|
|
} |
120
|
4
|
|
|
|
|
10
|
my ($len_s1, $len_s2) = (0,0); # length of s1/s2 |
121
|
|
|
|
|
|
|
|
122
|
|
|
|
|
|
|
# print STDERR "max_len is ".$max_len."\n"; |
123
|
4
|
|
|
|
|
16
|
while ($len_s1 <= (scalar @sa1)) { |
124
|
36
|
|
|
|
|
72
|
while ($len_s2 <= scalar @sa2) { |
125
|
408
|
|
|
|
|
752
|
my ($candidate1, $candidate2, $candidate3) = ($self->{max_len},$self->{max_len},$self->{max_len}); |
126
|
408
|
100
|
100
|
|
|
1458
|
if ($len_s1 > 0 and $len_s2 > 0) { |
127
|
|
|
|
|
|
|
# if match, we add 1 for local, 0 for global |
128
|
|
|
|
|
|
|
# else (not matched), we add -1 for local, 1 for global |
129
|
339
|
100
|
|
|
|
1689
|
$candidate1 = int($self->{TABLE}{$len_s1-1}{$len_s2-1}) + |
|
|
100
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
130
|
|
|
|
|
|
|
( $self->{IS_LOCAL} ? 1: -1) * |
131
|
|
|
|
|
|
|
( $self->{has_tag} ? |
132
|
|
|
|
|
|
|
how_similar($sa1[$len_s1-1], $sa2[$len_s2-1]) : |
133
|
|
|
|
|
|
|
( ( $sa1[$len_s1-1] eq $sa2[$len_s2-1] ) ? |
134
|
|
|
|
|
|
|
1+(-1+$self->{IS_LOCAL}) : -1 ) |
135
|
|
|
|
|
|
|
) |
136
|
|
|
|
|
|
|
; |
137
|
|
|
|
|
|
|
} |
138
|
408
|
100
|
|
|
|
6730
|
if ($len_s1 > 0) { |
139
|
371
|
100
|
|
|
|
1204
|
$candidate2 = int($self->{TABLE}{$len_s1-1}{$len_s2}) + |
140
|
|
|
|
|
|
|
( $self->{IS_LOCAL} ? (-1) : 1); |
141
|
|
|
|
|
|
|
} |
142
|
408
|
100
|
|
|
|
990
|
if ($len_s2 > 0) { |
143
|
372
|
100
|
|
|
|
1151
|
$candidate3 = int($self->{TABLE}{$len_s1}{$len_s2 - 1}) + |
144
|
|
|
|
|
|
|
( $self->{IS_LOCAL} ? (-1) : 1); |
145
|
|
|
|
|
|
|
} |
146
|
|
|
|
|
|
|
# print STDERR "setting ($len_s1,$len_s2)..."; |
147
|
|
|
|
|
|
|
# print STDERR "(".$candidate1."\t".$candidate2."\t".$candidate3.")\n"; |
148
|
408
|
100
|
|
|
|
707
|
if ($self->{IS_LOCAL}) { |
149
|
338
|
100
|
100
|
|
|
1458
|
$self->{TABLE}{$len_s1}{$len_s2} = max ( |
150
|
|
|
|
|
|
|
$candidate1, $candidate2, $candidate3, 0 |
151
|
|
|
|
|
|
|
) if ($len_s1 > 0 or $len_s2 > 0); |
152
|
338
|
100
|
|
|
|
1021
|
$self->{BEST}{X} = $len_s1 if $self->{BEST}{MAX} <= $self->{TABLE}{$len_s1}{$len_s2}; |
153
|
338
|
100
|
|
|
|
837
|
$self->{BEST}{Y} = $len_s2 if $self->{BEST}{MAX} <= $self->{TABLE}{$len_s1}{$len_s2}; |
154
|
338
|
100
|
|
|
|
945
|
$self->{BEST}{MAX} = $self->{TABLE}{$len_s1}{$len_s2} if $self->{BEST}{MAX} <= $self->{TABLE}{$len_s1}{$len_s2}; |
155
|
|
|
|
|
|
|
} else { # global |
156
|
70
|
100
|
100
|
|
|
924
|
$self->{TABLE}{$len_s1}{$len_s2} = min ( |
157
|
|
|
|
|
|
|
$candidate1, $candidate2, $candidate3 |
158
|
|
|
|
|
|
|
) if ($len_s1 > 0 or $len_s2 > 0); |
159
|
|
|
|
|
|
|
} |
160
|
408
|
|
|
|
|
996
|
$len_s2 +=1; |
161
|
|
|
|
|
|
|
} |
162
|
36
|
|
|
|
|
48
|
$len_s2 = 0; |
163
|
36
|
|
|
|
|
89
|
$len_s1 +=1; |
164
|
|
|
|
|
|
|
} |
165
|
|
|
|
|
|
|
} |
166
|
|
|
|
|
|
|
|
167
|
|
|
|
|
|
|
=head2 similarity_print |
168
|
|
|
|
|
|
|
|
169
|
|
|
|
|
|
|
=cut |
170
|
|
|
|
|
|
|
|
171
|
|
|
|
|
|
|
sub similarity_print { |
172
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
173
|
0
|
|
|
|
|
0
|
my @sa1 = @{ $self->{SENARR1}}; |
|
0
|
|
|
|
|
0
|
|
174
|
0
|
|
|
|
|
0
|
my @sa2 = @{ $self->{SENARR2}}; |
|
0
|
|
|
|
|
0
|
|
175
|
0
|
|
|
|
|
0
|
print STDERR "\n \t \t".join("\t",@sa2)."\n"; |
176
|
0
|
|
|
|
|
0
|
for my $key (sort {int($a) <=> int($b)}(keys %{$self->{TABLE}})) { |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
177
|
0
|
0
|
|
|
|
0
|
print STDERR $sa1[$key-1]."\t" if $key > 0; |
178
|
0
|
0
|
|
|
|
0
|
print STDERR " \t" unless $key > 0; |
179
|
0
|
|
|
|
|
0
|
for my $subkey (sort {int($a) <=> int($b)} (keys %{$self->{TABLE}{$key}})) { |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
180
|
0
|
|
|
|
|
0
|
print STDERR $self->{TABLE}{$key}{$subkey}."\t"; |
181
|
|
|
|
|
|
|
} |
182
|
0
|
|
|
|
|
0
|
print STDERR "\n"; |
183
|
|
|
|
|
|
|
} |
184
|
|
|
|
|
|
|
}; |
185
|
|
|
|
|
|
|
|
186
|
|
|
|
|
|
|
=head2 get_align_result |
187
|
|
|
|
|
|
|
|
188
|
|
|
|
|
|
|
=cut |
189
|
|
|
|
|
|
|
|
190
|
|
|
|
|
|
|
sub get_align_result { |
191
|
4
|
|
|
4
|
1
|
10
|
my $self = shift; |
192
|
4
|
|
|
|
|
15
|
my @sa1 = @{ $self->{SENARR1}}; |
|
4
|
|
|
|
|
23
|
|
193
|
4
|
|
|
|
|
8
|
my @sa2 = @{ $self->{SENARR2}}; |
|
4
|
|
|
|
|
22
|
|
194
|
4
|
|
|
|
|
11
|
my ($i, $j) = (0, 0); |
195
|
4
|
|
|
|
|
7
|
my (@as1, @as2); |
196
|
4
|
|
|
|
|
8
|
my $baseline = 0; |
197
|
4
|
|
|
|
|
6
|
my $first_round = 0; |
198
|
4
|
100
|
|
|
|
18
|
if ($self->{IS_LOCAL}) { |
199
|
2
|
|
|
|
|
6
|
$i = $self->{BEST}{X}; |
200
|
2
|
|
|
|
|
5
|
$j = $self->{BEST}{Y}; |
201
|
|
|
|
|
|
|
} else { |
202
|
2
|
|
|
|
|
3
|
$i = scalar @sa1; |
203
|
2
|
|
|
|
|
3
|
$j = scalar @sa2; |
204
|
|
|
|
|
|
|
} |
205
|
4
|
|
|
|
|
22
|
my $pi = $i + 2; |
206
|
4
|
|
|
|
|
10
|
my $pj = $j + 2; |
207
|
4
|
|
100
|
|
|
5
|
do { |
|
|
|
66
|
|
|
|
|
208
|
30
|
50
|
|
|
|
59
|
if ($first_round) { |
209
|
0
|
|
|
|
|
0
|
push @as1, $sa1[$i]; |
210
|
0
|
|
|
|
|
0
|
push @as2, $sa2[$j]; |
211
|
0
|
|
|
|
|
0
|
$first_round = 0; |
212
|
|
|
|
|
|
|
} else { |
213
|
30
|
100
|
|
|
|
69
|
if ($i == ($pi-1)) { |
|
|
100
|
|
|
|
|
|
214
|
23
|
|
|
|
|
94
|
push @as1, $sa1[$i]; |
215
|
|
|
|
|
|
|
} |
216
|
|
|
|
|
|
|
elsif ($i == $pi) { |
217
|
3
|
|
|
|
|
6
|
push @as1, "-"; |
218
|
|
|
|
|
|
|
} |
219
|
30
|
100
|
|
|
|
65
|
if ($j == ($pj-1)) { |
|
|
100
|
|
|
|
|
|
220
|
25
|
|
|
|
|
46
|
push @as2, $sa2[$j]; |
221
|
|
|
|
|
|
|
} |
222
|
|
|
|
|
|
|
elsif ($j == $pj) { |
223
|
1
|
|
|
|
|
2
|
push @as2, "-"; |
224
|
|
|
|
|
|
|
} |
225
|
|
|
|
|
|
|
} |
226
|
30
|
|
|
|
|
36
|
$pi = $i; |
227
|
30
|
|
|
|
|
31
|
$pj = $j; |
228
|
30
|
100
|
|
|
|
66
|
if ($self->{IS_LOCAL}) { |
229
|
20
|
|
|
|
|
92
|
$baseline = max($self->{TABLE}{$i-1}{$j-1},$self->{TABLE}{$i-1}{$j},$self->{TABLE}{$i}{$j-1}); |
230
|
|
|
|
|
|
|
} else { |
231
|
10
|
|
|
|
|
50
|
$baseline = min($self->{TABLE}{$i-1}{$j-1},$self->{TABLE}{$i-1}{$j},$self->{TABLE}{$i}{$j-1}); |
232
|
|
|
|
|
|
|
} |
233
|
30
|
100
|
|
|
|
92
|
if ($self->{TABLE}{$i-1}{$j-1} == $baseline) { |
|
|
100
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
234
|
25
|
|
|
|
|
29
|
$i--; |
235
|
25
|
|
|
|
|
263
|
$j--; |
236
|
|
|
|
|
|
|
} elsif ($self->{TABLE}{$i}{$j-1} == $baseline) { |
237
|
4
|
|
|
|
|
180
|
$j--; |
238
|
|
|
|
|
|
|
} elsif ($self->{TABLE}{$i-1}{$j} == $baseline) { |
239
|
1
|
|
|
|
|
12
|
$i--; |
240
|
|
|
|
|
|
|
} else { |
241
|
0
|
|
|
|
|
0
|
die $!; |
242
|
|
|
|
|
|
|
} |
243
|
|
|
|
|
|
|
} while ( $self->{TABLE}{$pi}{$pj} > 0 and $i >0 and $j > 0); |
244
|
4
|
100
|
|
|
|
30
|
if (!$self->{IS_LOCAL}) { |
245
|
2
|
|
|
|
|
18
|
push @as1, $sa1[$i]; |
246
|
2
|
|
|
|
|
63
|
push @as2, $sa2[$j]; |
247
|
|
|
|
|
|
|
} |
248
|
4
|
|
|
|
|
130
|
return ( join (" ",reverse @as1)."\t".join (" ",reverse @as2) ); |
249
|
|
|
|
|
|
|
} |
250
|
|
|
|
|
|
|
|
251
|
|
|
|
|
|
|
=head1 AUTHOR |
252
|
|
|
|
|
|
|
|
253
|
|
|
|
|
|
|
Cheng-Lung Sung, C<< >> |
254
|
|
|
|
|
|
|
|
255
|
|
|
|
|
|
|
=head1 BUGS |
256
|
|
|
|
|
|
|
|
257
|
|
|
|
|
|
|
Please report any bugs or feature requests to |
258
|
|
|
|
|
|
|
C, or through the web interface at |
259
|
|
|
|
|
|
|
L. |
260
|
|
|
|
|
|
|
I will be notified, and then you'll automatically be notified of progress on |
261
|
|
|
|
|
|
|
your bug as I make changes. |
262
|
|
|
|
|
|
|
|
263
|
|
|
|
|
|
|
=head1 ACKNOWLEDGEMENTS |
264
|
|
|
|
|
|
|
|
265
|
|
|
|
|
|
|
=head1 COPYRIGHT & LICENSE |
266
|
|
|
|
|
|
|
|
267
|
|
|
|
|
|
|
Copyright 2005 - 2007 Cheng-Lung Sung, All Rights Reserved. |
268
|
|
|
|
|
|
|
|
269
|
|
|
|
|
|
|
This program is free software; you can redistribute it and/or modify it |
270
|
|
|
|
|
|
|
under the same terms as Perl itself. |
271
|
|
|
|
|
|
|
|
272
|
|
|
|
|
|
|
=cut |
273
|
|
|
|
|
|
|
|
274
|
|
|
|
|
|
|
1; # End of Text::Sentence::Alignment |