line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Text::GaleChurch; |
2
|
|
|
|
|
|
|
|
3
|
1
|
|
|
1
|
|
25303
|
use 5.008008; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
38
|
|
4
|
1
|
|
|
1
|
|
6
|
use strict; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
30
|
|
5
|
1
|
|
|
1
|
|
5
|
use warnings; |
|
1
|
|
|
|
|
5
|
|
|
1
|
|
|
|
|
1111
|
|
6
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
require Exporter; |
8
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
our @ISA = qw(Exporter); |
10
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
# Items to export into callers namespace by default. Note: do not export |
12
|
|
|
|
|
|
|
# names by default without a very good reason. Use EXPORT_OK instead. |
13
|
|
|
|
|
|
|
# Do not simply export all your public functions/methods/constants. |
14
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
# This allows declaration use Text::GaleChurch ':all'; |
16
|
|
|
|
|
|
|
# If you do not need this, moving things directly into @EXPORT or @EXPORT_OK |
17
|
|
|
|
|
|
|
# will save memory. |
18
|
|
|
|
|
|
|
our %EXPORT_TAGS = ( 'all' => [ qw( |
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
) ] ); |
21
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
our @EXPORT_OK = qw(align); |
23
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
our @EXPORT = qw(align); |
25
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
our $VERSION = '1.00'; |
27
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
# Preloaded methods go here. |
29
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
sub align{ |
31
|
3
|
|
|
3
|
0
|
9414
|
my ($P1,$P2) = @_; |
32
|
3
|
100
|
66
|
|
|
129
|
if(!ref $P1 || !ref $P2) { |
33
|
1
|
|
|
|
|
2
|
return; |
34
|
|
|
|
|
|
|
} |
35
|
2
|
|
|
|
|
3
|
chomp(@{$P1}); |
|
2
|
|
|
|
|
8
|
|
36
|
2
|
|
|
|
|
3
|
chomp(@{$P2}); |
|
2
|
|
|
|
|
6
|
|
37
|
2
|
|
|
|
|
3
|
my(@A1,@A2); |
38
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
# parameters |
40
|
0
|
|
|
|
|
0
|
my %PRIOR; |
41
|
2
|
|
|
|
|
6
|
$PRIOR{1}{1} = 0.89; |
42
|
2
|
|
|
|
|
6
|
$PRIOR{1}{0} = 0.01/2; |
43
|
2
|
|
|
|
|
4
|
$PRIOR{0}{1} = 0.01/2; |
44
|
2
|
|
|
|
|
6
|
$PRIOR{2}{1} = 0.089/2; |
45
|
2
|
|
|
|
|
4
|
$PRIOR{1}{2} = 0.089/2; |
46
|
|
|
|
|
|
|
# $PRIOR{2}{2} = 0.011; |
47
|
|
|
|
|
|
|
|
48
|
|
|
|
|
|
|
# compute length (in characters) |
49
|
2
|
|
|
|
|
3
|
my (@LEN1,@LEN2); |
50
|
2
|
|
|
|
|
5
|
$LEN1[0] = 0; |
51
|
2
|
|
|
|
|
3
|
for(my $i=0;$i
|
|
8
|
|
|
|
|
22
|
|
52
|
6
|
|
|
|
|
11
|
my $line = $$P1[$i]; |
53
|
6
|
|
|
|
|
90
|
$line =~ s/[\s\r\n]+//g; |
54
|
|
|
|
|
|
|
# print "1: $line\n"; |
55
|
6
|
|
|
|
|
18
|
$LEN1[$i+1] = $LEN1[$i] + length($line); |
56
|
|
|
|
|
|
|
} |
57
|
2
|
|
|
|
|
3
|
$LEN2[0] = 0; |
58
|
2
|
|
|
|
|
4
|
for(my $i=0;$i
|
|
7
|
|
|
|
|
20
|
|
59
|
5
|
|
|
|
|
8
|
my $line = $$P2[$i]; |
60
|
5
|
|
|
|
|
95
|
$line =~ s/[\s\r\n]+//g; |
61
|
|
|
|
|
|
|
# print "2: $line\n"; |
62
|
5
|
|
|
|
|
13
|
$LEN2[$i+1] = $LEN2[$i] + length($line); |
63
|
|
|
|
|
|
|
} |
64
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
# dynamic programming |
66
|
2
|
|
|
|
|
4
|
my (@COST,@BACK); |
67
|
2
|
|
|
|
|
5
|
$COST[0][0] = 0; |
68
|
2
|
|
|
|
|
4
|
for(my $i1=0;$i1<=scalar(@{$P1});$i1++) { |
|
10
|
|
|
|
|
24
|
|
69
|
8
|
|
|
|
|
11
|
for(my $i2=0;$i2<=scalar(@{$P2});$i2++) { |
|
51
|
|
|
|
|
117
|
|
70
|
43
|
100
|
|
|
|
93
|
next if $i1 + $i2 == 0; |
71
|
41
|
|
|
|
|
64
|
$COST[$i1][$i2] = 1e10; |
72
|
41
|
|
|
|
|
86
|
foreach my $d1 (keys %PRIOR) { |
73
|
123
|
100
|
|
|
|
1660
|
next if $d1>$i1; |
74
|
107
|
|
|
|
|
101
|
foreach my $d2 (keys %{$PRIOR{$d1}}) { |
|
107
|
|
|
|
|
225
|
|
75
|
179
|
100
|
|
|
|
404
|
next if $d2>$i2; |
76
|
150
|
|
|
|
|
551
|
my $cost = $COST[$i1-$d1][$i2-$d2] - log($PRIOR{$d1}{$d2}) + |
77
|
|
|
|
|
|
|
&match($LEN1[$i1]-$LEN1[$i1-$d1], $LEN2[$i2]-$LEN2[$i2-$d2]); |
78
|
|
|
|
|
|
|
# print "($i1->".($i1-$d1).",$i2->".($i2-$d2).") [".($LEN1[$i1]-$LEN1[$i1-$d1]).",".($LEN2[$i2]-$LEN2[$i2-$d2])."] = $COST[$i1-$d1][$i2-$d2] - ".log($PRIOR{$d1}{$d2})." + ".&match($LEN1[$i1]-$LEN1[$i1-$d1], $LEN2[$i2]-$LEN2[$i2-$d2])." = $cost\n"; |
79
|
150
|
100
|
|
|
|
516
|
if ($cost < $COST[$i1][$i2]) { |
80
|
80
|
|
|
|
|
102
|
$COST[$i1][$i2] = $cost; |
81
|
80
|
|
|
|
|
105
|
@{$BACK[$i1][$i2]} = ($i1-$d1,$i2-$d2); |
|
80
|
|
|
|
|
318
|
|
82
|
|
|
|
|
|
|
} |
83
|
|
|
|
|
|
|
} |
84
|
|
|
|
|
|
|
} |
85
|
|
|
|
|
|
|
# print $COST[$i1][$i2]."($i1-$BACK[$i1][$i2][0],$i2-$BACK[$i1][$i2][1]) "; |
86
|
|
|
|
|
|
|
} |
87
|
|
|
|
|
|
|
# print "\n"; |
88
|
|
|
|
|
|
|
} |
89
|
|
|
|
|
|
|
|
90
|
|
|
|
|
|
|
# back tracking |
91
|
2
|
|
|
|
|
3
|
my (%NEXT); |
92
|
2
|
|
|
|
|
3
|
my $i1 = scalar(@{$P1}); |
|
2
|
|
|
|
|
4
|
|
93
|
2
|
|
|
|
|
3
|
my $i2 = scalar(@{$P2}); |
|
2
|
|
|
|
|
4
|
|
94
|
2
|
|
66
|
|
|
27
|
while($i1>0 || $i2>0) { |
95
|
|
|
|
|
|
|
# print "back $i1 $i2\n"; |
96
|
5
|
|
|
|
|
6
|
@{$NEXT{$BACK[$i1][$i2][0]}{$BACK[$i1][$i2][1]}} = ($i1,$i2); |
|
5
|
|
|
|
|
31
|
|
97
|
5
|
|
|
|
|
22
|
($i1,$i2) = ($BACK[$i1][$i2][0],$BACK[$i1][$i2][1]); |
98
|
|
|
|
|
|
|
} |
99
|
2
|
|
66
|
|
|
4
|
while($i1
|
|
7
|
|
|
|
|
21
|
|
|
2
|
|
|
|
|
8
|
|
100
|
|
|
|
|
|
|
# print "fwd $i1 $i2\n"; |
101
|
5
|
|
|
|
|
6
|
my $s1 = ""; |
102
|
5
|
|
|
|
|
11
|
my $s2 = ""; |
103
|
5
|
|
|
|
|
17
|
for(my $i=$i1;$i<$NEXT{$i1}{$i2}[0];$i++) { |
104
|
6
|
100
|
|
|
|
11
|
$s1 .= " " unless $i == $i1; |
105
|
6
|
|
|
|
|
22
|
$s1 .= $$P1[$i]; |
106
|
|
|
|
|
|
|
} |
107
|
5
|
|
|
|
|
8
|
push @A1,$s1; |
108
|
5
|
|
|
|
|
15
|
for(my $i=$i2;$i<$NEXT{$i1}{$i2}[1];$i++) { |
109
|
5
|
50
|
|
|
|
13
|
$s2 .= " " unless $i == $i2; |
110
|
5
|
|
|
|
|
20
|
$s2 .= $$P2[$i]; |
111
|
|
|
|
|
|
|
} |
112
|
5
|
|
|
|
|
7
|
push @A2,$s2; |
113
|
5
|
|
|
|
|
5
|
($i1,$i2) = @{$NEXT{$i1}{$i2}}; |
|
5
|
|
|
|
|
15
|
|
114
|
|
|
|
|
|
|
} |
115
|
2
|
|
|
|
|
28
|
return (\@A1,\@A2); |
116
|
|
|
|
|
|
|
} |
117
|
|
|
|
|
|
|
|
118
|
|
|
|
|
|
|
sub match { |
119
|
150
|
|
|
150
|
0
|
183
|
my ($len1,$len2) = @_; |
120
|
150
|
|
|
|
|
163
|
my $c = 1; |
121
|
150
|
|
|
|
|
153
|
my $s2 = 6.8; |
122
|
|
|
|
|
|
|
|
123
|
150
|
50
|
66
|
|
|
383
|
if ($len1==0 && $len2==0) { return 0; } |
|
0
|
|
|
|
|
0
|
|
124
|
150
|
|
|
|
|
250
|
my $mean = ($len1 + $len2/$c) / 2; |
125
|
150
|
|
|
|
|
240
|
my $z = ($c * $len1 - $len2)/sqrt($s2 * $mean); |
126
|
150
|
100
|
|
|
|
311
|
if ($z < 0) { $z = -$z; } |
|
83
|
|
|
|
|
100
|
|
127
|
150
|
|
|
|
|
214
|
my $pd = 2 * (1 - &pnorm($z)); |
128
|
150
|
100
|
|
|
|
327
|
if ($pd>0) { return -log($pd); } |
|
143
|
|
|
|
|
345
|
|
129
|
7
|
|
|
|
|
15
|
return 25; |
130
|
|
|
|
|
|
|
} |
131
|
|
|
|
|
|
|
|
132
|
|
|
|
|
|
|
sub pnorm { |
133
|
150
|
|
|
150
|
0
|
161
|
my ($z) = @_; |
134
|
150
|
|
|
|
|
199
|
my $t = 1/(1 + 0.2316419 * $z); |
135
|
150
|
|
|
|
|
513
|
return 1 - 0.3989423 * exp(-$z * $z / 2) * |
136
|
|
|
|
|
|
|
((((1.330274429 * $t |
137
|
|
|
|
|
|
|
- 1.821255978) * $t |
138
|
|
|
|
|
|
|
+ 1.781477937) * $t |
139
|
|
|
|
|
|
|
- 0.356563782) * $t |
140
|
|
|
|
|
|
|
+ 0.319381530) * $t; |
141
|
|
|
|
|
|
|
} |
142
|
|
|
|
|
|
|
|
143
|
|
|
|
|
|
|
1; |
144
|
|
|
|
|
|
|
__END__ |