line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package String::Alignment; |
2
|
|
|
|
|
|
|
|
3
|
4
|
|
|
4
|
|
138000
|
use warnings; |
|
4
|
|
|
|
|
13
|
|
|
4
|
|
|
|
|
4322
|
|
4
|
4
|
|
|
4
|
|
33
|
use strict; |
|
4
|
|
|
|
|
5
|
|
|
4
|
|
|
|
|
191
|
|
5
|
4
|
|
|
4
|
|
26
|
use List::Util qw(max min); |
|
4
|
|
|
|
|
13
|
|
|
4
|
|
|
|
|
7166
|
|
6
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
=head1 NAME |
8
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
String::Alignment - Pair Sentence Alignment |
10
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
=head1 VERSION |
12
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
Version 0.01 |
14
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
=cut |
16
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
our $VERSION = '0.01'; |
18
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
=head1 SYNOPSIS |
20
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
This module process string alignment. |
22
|
|
|
|
|
|
|
Now it provide two kind of alignment method, Global and Local Alignment. |
23
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
use String::Alignment; |
25
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
use String::Alignment qw(do_alignment); |
27
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
# local alignment |
29
|
|
|
|
|
|
|
my $result = do_alignment($s1,$s2,1); |
30
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
# global alignment |
32
|
|
|
|
|
|
|
my $result = do_alignment($s1,$s2); |
33
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
=head1 EXPORT |
35
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
=cut |
37
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
require Exporter; |
39
|
|
|
|
|
|
|
our @ISA = qw(Exporter); |
40
|
|
|
|
|
|
|
our @EXPORT_OK = qw(do_alignment); |
41
|
|
|
|
|
|
|
|
42
|
|
|
|
|
|
|
=head1 BUILD-IN VARIABLES |
43
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
=cut |
45
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
my ($s1,$s2); # string1, string2 |
47
|
|
|
|
|
|
|
my (@sa1, @sa2); # string array 1, string array 2 |
48
|
|
|
|
|
|
|
|
49
|
|
|
|
|
|
|
my ($len_s1, $len_s2) = (0,0); # length of s1/s2 |
50
|
|
|
|
|
|
|
my $is_local = 1; # 0 for global alignment |
51
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
my %table; # Dynamic Programming Table |
53
|
|
|
|
|
|
|
|
54
|
|
|
|
|
|
|
my $max_len; # for global |
55
|
|
|
|
|
|
|
my %best; # Best path, for local |
56
|
|
|
|
|
|
|
|
57
|
|
|
|
|
|
|
=head1 FUNCTIONS |
58
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
=cut |
60
|
|
|
|
|
|
|
|
61
|
0
|
|
|
0
|
0
|
0
|
sub new { |
62
|
|
|
|
|
|
|
# print STDERR "I'm loaded\n"; |
63
|
|
|
|
|
|
|
} |
64
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
sub do_alignment { |
66
|
3
|
|
|
3
|
0
|
449
|
$s1 = shift; |
67
|
3
|
|
|
|
|
7
|
$s2 = shift; |
68
|
3
|
|
|
|
|
6
|
$is_local = shift; |
69
|
3
|
100
|
|
|
|
16
|
$is_local = 0 unless defined($is_local); |
70
|
3
|
|
|
|
|
9
|
give_string_pair($s1,$s2); |
71
|
3
|
|
|
|
|
8
|
calculate_matrix(); |
72
|
|
|
|
|
|
|
# similarity_print(); |
73
|
3
|
|
|
|
|
48
|
return get_align_result(); |
74
|
|
|
|
|
|
|
} |
75
|
|
|
|
|
|
|
=head2 give_string_pair |
76
|
|
|
|
|
|
|
|
77
|
|
|
|
|
|
|
=cut |
78
|
|
|
|
|
|
|
|
79
|
|
|
|
|
|
|
sub give_string_pair { |
80
|
3
|
|
|
3
|
0
|
6
|
$s1 = shift; |
81
|
3
|
|
|
|
|
5
|
$s2 = shift; |
82
|
3
|
|
|
|
|
19
|
@sa1 = split //,$s1; |
83
|
3
|
|
|
|
|
14
|
@sa2 = split //,$s2; |
84
|
3
|
|
|
|
|
33
|
%table = (); |
85
|
3
|
|
|
|
|
6
|
%best = (); |
86
|
3
|
|
|
|
|
9
|
$best{MAX} = 0; |
87
|
3
|
|
|
|
|
7
|
$table{0}{0} = 0; |
88
|
3
|
|
|
|
|
7
|
($len_s1, $len_s2) = (0,0); |
89
|
|
|
|
|
|
|
} |
90
|
|
|
|
|
|
|
|
91
|
|
|
|
|
|
|
=head2 cululate_matrix |
92
|
|
|
|
|
|
|
|
93
|
|
|
|
|
|
|
=cut |
94
|
|
|
|
|
|
|
|
95
|
|
|
|
|
|
|
sub calculate_matrix { |
96
|
3
|
100
|
|
3
|
0
|
15
|
if ($is_local) { |
97
|
2
|
|
|
|
|
3
|
$max_len = 0; |
98
|
|
|
|
|
|
|
} else { |
99
|
1
|
50
|
|
|
|
4
|
$max_len = scalar(@sa1) > scalar(@sa2) ? scalar(@sa1): scalar(@sa2); # for global |
100
|
|
|
|
|
|
|
} |
101
|
|
|
|
|
|
|
# print STDERR "max_len is ".$max_len."\n"; |
102
|
3
|
|
|
|
|
13
|
while ($len_s1 <= (scalar @sa1)) { |
103
|
34
|
|
|
|
|
246
|
while ($len_s2 <= scalar @sa2) { |
104
|
402
|
|
|
|
|
695
|
my ($candidate1, $candidate2, $candidate3) = ($max_len,$max_len,$max_len); |
105
|
402
|
100
|
100
|
|
|
1630
|
if ($len_s1 > 0 and $len_s2 > 0) { |
106
|
|
|
|
|
|
|
# if match, we add 1 for local, 0 for global |
107
|
|
|
|
|
|
|
# else (not matched), we add -1 for local, 1 for global |
108
|
337
|
100
|
|
|
|
3191
|
$candidate1 = int($table{$len_s1-1}{$len_s2-1}) + |
|
|
100
|
|
|
|
|
|
109
|
|
|
|
|
|
|
( $is_local ? 1: -1) * |
110
|
|
|
|
|
|
|
( ( $sa1[$len_s1-1] eq $sa2[$len_s2-1] )? 1+(-1+$is_local) : -1 ) |
111
|
|
|
|
|
|
|
; |
112
|
|
|
|
|
|
|
} |
113
|
402
|
100
|
|
|
|
779
|
if ($len_s1 > 0) { |
114
|
368
|
100
|
|
|
|
998
|
$candidate2 = int($table{$len_s1-1}{$len_s2}) + |
115
|
|
|
|
|
|
|
( $is_local ? (-1) : 1); |
116
|
|
|
|
|
|
|
} |
117
|
402
|
100
|
|
|
|
1540
|
if ($len_s2 > 0) { |
118
|
368
|
100
|
|
|
|
771
|
$candidate3 = int($table{$len_s1}{$len_s2 - 1}) + |
119
|
|
|
|
|
|
|
( $is_local ? (-1) : 1); |
120
|
|
|
|
|
|
|
} |
121
|
|
|
|
|
|
|
# print STDERR "setting ($len_s1,$len_s2)..."; |
122
|
|
|
|
|
|
|
# print STDERR "(".$candidate1."\t".$candidate2."\t".$candidate3.")\n"; |
123
|
402
|
100
|
|
|
|
844
|
if ($is_local) { |
124
|
338
|
100
|
100
|
|
|
1393
|
$table{$len_s1}{$len_s2} = max ( |
125
|
|
|
|
|
|
|
$candidate1, $candidate2, $candidate3, 0 |
126
|
|
|
|
|
|
|
) if ($len_s1 > 0 or $len_s2 > 0); |
127
|
338
|
100
|
|
|
|
807
|
$best{X} = $len_s1 if $best{MAX} <= $table{$len_s1}{$len_s2}; |
128
|
338
|
100
|
|
|
|
794
|
$best{Y} = $len_s2 if $best{MAX} <= $table{$len_s1}{$len_s2}; |
129
|
338
|
100
|
|
|
|
758
|
$best{MAX} = $table{$len_s1}{$len_s2} if $best{MAX} <= $table{$len_s1}{$len_s2}; |
130
|
|
|
|
|
|
|
} else { # global |
131
|
64
|
100
|
100
|
|
|
350
|
$table{$len_s1}{$len_s2} = min ( |
132
|
|
|
|
|
|
|
$candidate1, $candidate2, $candidate3 |
133
|
|
|
|
|
|
|
) if ($len_s1 > 0 or $len_s2 > 0); |
134
|
|
|
|
|
|
|
} |
135
|
402
|
|
|
|
|
1486
|
$len_s2 +=1; |
136
|
|
|
|
|
|
|
} |
137
|
34
|
|
|
|
|
38
|
$len_s2 = 0; |
138
|
34
|
|
|
|
|
78
|
$len_s1 +=1; |
139
|
|
|
|
|
|
|
} |
140
|
|
|
|
|
|
|
} |
141
|
|
|
|
|
|
|
|
142
|
|
|
|
|
|
|
=head2 similarity_print |
143
|
|
|
|
|
|
|
|
144
|
|
|
|
|
|
|
=cut |
145
|
|
|
|
|
|
|
|
146
|
|
|
|
|
|
|
sub similarity_print { |
147
|
0
|
|
|
0
|
1
|
0
|
print STDERR "\n \t \t".join("\t",@sa2)."\n"; |
148
|
0
|
|
|
|
|
0
|
for my $key (sort {int($a) <=> int($b)}(keys %table)) { |
|
0
|
|
|
|
|
0
|
|
149
|
0
|
0
|
|
|
|
0
|
print STDERR $sa1[$key-1]."\t" if $key > 0; |
150
|
0
|
0
|
|
|
|
0
|
print STDERR " \t" unless $key > 0; |
151
|
0
|
|
|
|
|
0
|
for my $subkey (sort {int($a) <=> int($b)} (keys %{$table{$key}})) { |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
152
|
0
|
|
|
|
|
0
|
print STDERR $table{$key}{$subkey}."\t"; |
153
|
|
|
|
|
|
|
} |
154
|
0
|
|
|
|
|
0
|
print STDERR "\n"; |
155
|
|
|
|
|
|
|
} |
156
|
|
|
|
|
|
|
}; |
157
|
|
|
|
|
|
|
|
158
|
|
|
|
|
|
|
=head2 get_align_result |
159
|
|
|
|
|
|
|
|
160
|
|
|
|
|
|
|
=cut |
161
|
|
|
|
|
|
|
|
162
|
|
|
|
|
|
|
sub get_align_result { |
163
|
3
|
|
|
3
|
1
|
14
|
my ($i, $j) = (0, 0); |
164
|
3
|
|
|
|
|
6
|
my (@as1, @as2); |
165
|
3
|
|
|
|
|
5
|
my $baseline = 0; |
166
|
3
|
100
|
|
|
|
13
|
if ($is_local) { |
167
|
2
|
|
|
|
|
14
|
$i = $best{X}; |
168
|
2
|
|
|
|
|
4
|
$j = $best{Y}; |
169
|
|
|
|
|
|
|
} else { |
170
|
1
|
|
|
|
|
2
|
$i = scalar @sa1; |
171
|
1
|
|
|
|
|
1
|
$j = scalar @sa2; |
172
|
|
|
|
|
|
|
} |
173
|
3
|
|
|
|
|
23
|
while ( $table{$i}{$j} > 0) { |
174
|
26
|
100
|
|
|
|
50
|
if ($is_local) { |
175
|
18
|
|
|
|
|
87
|
$baseline = max($table{$i-1}{$j-1},$table{$i-1}{$j},$table{$i}{$j-1}); |
176
|
|
|
|
|
|
|
} else { |
177
|
8
|
|
|
|
|
30
|
$baseline = min($table{$i-1}{$j-1},$table{$i-1}{$j},$table{$i}{$j-1}); |
178
|
|
|
|
|
|
|
} |
179
|
26
|
100
|
|
|
|
73
|
if ($table{$i-1}{$j-1} == $baseline) { |
|
|
100
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
180
|
23
|
|
|
|
|
39
|
push @as1, $sa1[$i-1]; |
181
|
23
|
|
|
|
|
39
|
push @as2, $sa2[$j-1]; |
182
|
23
|
|
|
|
|
26
|
$i--; |
183
|
23
|
|
|
|
|
60
|
$j--; |
184
|
|
|
|
|
|
|
} elsif ($table{$i}{$j-1} == $baseline) { |
185
|
2
|
|
|
|
|
10
|
push @as1, "-"; # gap |
186
|
2
|
|
|
|
|
4
|
push @as2, $sa2[$j-1]; |
187
|
2
|
|
|
|
|
6
|
$j--; |
188
|
|
|
|
|
|
|
} elsif ($table{$i-1}{$j} == $baseline) { |
189
|
1
|
|
|
|
|
2
|
push @as1, $sa1[$i-1]; |
190
|
1
|
|
|
|
|
2
|
push @as2, "-"; # gap |
191
|
1
|
|
|
|
|
4
|
$i--; |
192
|
|
|
|
|
|
|
} else { |
193
|
0
|
|
|
|
|
0
|
die $!; |
194
|
|
|
|
|
|
|
} |
195
|
|
|
|
|
|
|
} |
196
|
3
|
|
|
|
|
47
|
return ( join ("",reverse @as2)."\t".join ("",reverse @as1) ); |
197
|
|
|
|
|
|
|
} |
198
|
|
|
|
|
|
|
|
199
|
|
|
|
|
|
|
=head1 AUTHOR |
200
|
|
|
|
|
|
|
|
201
|
|
|
|
|
|
|
Cheng-Lung Sung, C<< >> |
202
|
|
|
|
|
|
|
|
203
|
|
|
|
|
|
|
=head1 BUGS |
204
|
|
|
|
|
|
|
|
205
|
|
|
|
|
|
|
Please report any bugs or feature requests to |
206
|
|
|
|
|
|
|
C, or through the web interface at |
207
|
|
|
|
|
|
|
L. |
208
|
|
|
|
|
|
|
I will be notified, and then you'll automatically be notified of progress on |
209
|
|
|
|
|
|
|
your bug as I make changes. |
210
|
|
|
|
|
|
|
|
211
|
|
|
|
|
|
|
=head1 ACKNOWLEDGEMENTS |
212
|
|
|
|
|
|
|
|
213
|
|
|
|
|
|
|
=head1 COPYRIGHT & LICENSE |
214
|
|
|
|
|
|
|
|
215
|
|
|
|
|
|
|
Copyright 2006 Cheng-Lung Sung, All Rights Reserved. |
216
|
|
|
|
|
|
|
|
217
|
|
|
|
|
|
|
This program is free software; you can redistribute it and/or modify it |
218
|
|
|
|
|
|
|
under the same terms as Perl itself. |
219
|
|
|
|
|
|
|
|
220
|
|
|
|
|
|
|
=cut |
221
|
|
|
|
|
|
|
|
222
|
|
|
|
|
|
|
1; # End of String::Alignment |