package Text::Perfide::PartialAlign;

use 5.006;
use strict;
use warnings;

use Data::Dumper;

=head1 NAME

Text::Perfide::PartialAlign - Split large bitexts into smaller files.

=head1 VERSION

Version 0.01_03

=cut

our $VERSION = '0.01_03';

=head1 SYNOPSIS

A minimal sketch of the intended pipeline (file and language names are illustrative):

    use Text::Perfide::PartialAlign qw/
        get_corpus calc_common_tokens calc_pairs build_chain subcorpora2files
    /;

    my ($huCorpus,$huOffsets,$huTxt) = get_corpus('large.hu');
    my ($enCorpus,$enOffsets,$enTxt) = get_corpus('large.en');
    my ($common,$huPos,$enPos) = calc_common_tokens($huCorpus,$enCorpus,{});
    my $pairs = calc_pairs($common,$huPos,$enPos,$huCorpus,$enCorpus,{});
    my $chain = build_chain($pairs,5000,{});
    subcorpora2files($chain,$huTxt,$enTxt,$huOffsets,$enOffsets,'out','hu','en');

=head1 EXPORT

The following functions can be exported on request: C<get_corpus>, C<usage>,
C<subcorpora2files>, C<calc_pairs>, C<build_chain>, C<calc_common_tokens>,
C<findCommonHap>, C<seg_split> and C<_log>.

=head1 SUBROUTINES/METHODS

=cut

use base 'Exporter';

our @EXPORT_OK = qw/
    get_corpus
    usage
    subcorpora2files
    calc_pairs
    build_chain
    calc_common_tokens
    findCommonHap
    seg_split
    _log
/;

sub _log {
    print STDERR "$_[0]\n";
}

sub _print_verbose {
    my ($file,$data) = @_;
    open my $fh, '>', $file or die "Cannot open $file: $!";
    print $fh Dumper($data);
    close $fh;
}

=head2 build_chain

Receives the sorted list of anchor pairs, computes a maximal chain through
them, optionally reduces it so that consecutive anchors are at most
C<$maximalChunkSize> segments apart (when possible), and merges consecutive
anchors that share a coordinate.
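
A minimal call sketch (the pair list below is made up; real pairs come from
C<calc_pairs>, as C<[pos1, pos2, count]> triples):

    my $pairs = [ [0,0,1], [10,12,1], [40,38,2], [100,103,1] ];
    my $chain = build_chain($pairs, 5000, {});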

=cut

sub build_chain {
    my ($pairs,$maximalChunkSize,$options) = @_;
    _log("Computing maximal chain in poset...");
    _print_verbose("$$.pairs",$pairs) if $options->{v};
    my $chain = maximalChain($pairs);
    _log("Done.");
    _log((scalar @$chain)." long chain found in ".(scalar @$pairs)." sized poset...");

    if($maximalChunkSize > 0) {
        _log("Selecting at most $maximalChunkSize sized chunks...");
        ($chain,my $forced) = selectFromChain($chain,$maximalChunkSize);
        _log(scalar(@$chain)." chunks selected.");
        _log("Done.");
        _log("WARNING: maximalChunkSize could not be obeyed.") if $forced;
    }

    _print_verbose("$$.chain",$chain) if $options->{v};

    # Merge consecutive anchors sharing a coordinate, accumulating their counts
    my @newchain = ([-1,-1,0]);
    for my $i (@$chain){
        if($i->[0] != $newchain[-1][0] and $i->[1] != $newchain[-1][1]){
            push @newchain,$i;
        }
        else {
            $newchain[-1][2] += $i->[2];
        }
    }
    shift @newchain;
    _print_verbose("$$.newchain",\@newchain) if $options->{v};
    $chain = \@newchain;
    return $chain;
}

=head2 calc_common_tokens

Computes the hapaxes (words occurring exactly once) of each corpus, finds
those common to both, and returns them together with the position of each
hapax in each corpus.

=cut

sub calc_common_tokens {
    my ($huCorpus,$enCorpus,$options) = @_;
    my $huFreq = tokenFreq($huCorpus); # Map word => frequency (number of times word appears in corpus)
    my $enFreq = tokenFreq($enCorpus);
    my $huHap = hapaxes($huFreq); # Words which have frequency = 1
    my $enHap = hapaxes($enFreq);

    my $commonHap = findCommonHap($huHap,$enHap,$options->{cf});
    my $huPositions = hapaxPositions($huHap, $huCorpus); # Map word => id_sentence
    my $enPositions = hapaxPositions($enHap, $enCorpus);
    _print_verbose("$$.huPositions",$huPositions) if $options->{v};
    _print_verbose("$$.enPositions",$enPositions) if $options->{v};
    return ($commonHap,$huPositions,$enPositions);
}

=head2 calc_pairs

Builds the list of anchor pairs: for each common hapax, pairs the position
of its occurrence in the first corpus with its position in the second, adds
the corners C<[0,0]> and C<[size1,size2]>, and returns the list sorted by
C<bagSort>.

=cut

sub calc_pairs {
    my ($commonHap,$huPositions,$enPositions,$huCorpus,$enCorpus,$options) = @_;
    my $pairs = []; # (id_sentence_file1, id_sentence_file2)
    _print_verbose("$$.commonHap",$commonHap) if $options->{v};
    for my $t (keys %$commonHap) {
        my $hup = $huPositions->{$t};
        my $enp = $enPositions->{$commonHap->{$t}};
        push @$pairs, [$hup, $enp];
    }
    push @$pairs, [0,0];

    my $corpusSizes = [ scalar @$huCorpus, scalar @$enCorpus ];
    push @$pairs, $corpusSizes;

    $pairs = bagSort($pairs);
    return $pairs;
}

=head2 subcorpora2files

Writes subcorpora to files. For each chunk in the chain, writes one file per
language and prints a tab-separated line (the two file names plus the name
of the alignment file) to standard output, suitable for a hunalign batch file.

=cut

sub subcorpora2files {
    my ($chain,$huTextRef,$enTextRef,$huOffsets,$enOffsets,$outputFilename,$huLangName,$enLangName) = @_;
    _log("Writing subcorpora to files...");
    my $lastPos = [0,0];
    my $ind = 1;
    for my $pos (@$chain) {
        next if $pos->[0] == $lastPos->[0] and $pos->[1] == $lastPos->[1];
        my $baseFilename = "${outputFilename}_$ind";
        my $huSubCorpus = strInterval($huTextRef, $lastPos->[0], $pos->[0],$huOffsets);
        my $enSubCorpus = strInterval($enTextRef, $lastPos->[1], $pos->[1],$enOffsets);
        my $huFilename = "$baseFilename.$huLangName";
        open my $huFile, '>', $huFilename or die "Cannot open $huFilename: $!";
        print $huFile $huSubCorpus;
        close $huFile;

        my $enFilename = "$baseFilename.$enLangName";
        open my $enFile, '>', $enFilename or die "Cannot open $enFilename: $!";
        print $enFile $enSubCorpus;
        close $enFile;

        print "$huFilename\t$enFilename\t$baseFilename.align\n";

        $lastPos = $pos;
        $ind++;
    }
    _log("Done.");
}

=head2 usage

Prints a short description and usage details.

=cut

sub usage {
    _log("Perl port of partialAlign.py, 'a preprocessor for hunalign', with some tweaks.");
    _log("Cuts a very large sentence-segmented unaligned bicorpus into smaller parts.");
    _log("");
    _log("Usage: $0 huge_text_in_one_language huge_text_in_other_language output_filename name_of_first_lang name_of_second_lang [ maximal_size_of_chunks=5000 ] > hunalign_batch");
    _log("");
    _log("The two input files must have one line per sentence. Whitespace-delimited tokenization is preferred.");
    _log("The output is a set of files named output_filename_[123..].name_of_lang");
    exit -1;
}

=head2 tokenFreq

Receives an array of the lines of a text (each line is an array of words).
Calculates the frequency of each word.
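
For instance (an illustrative two-line corpus):

    my $freq = tokenFreq([ [qw/a rose is/], [qw/a rose/] ]);
    # $freq = { a => 2, rose => 2, is => 1 }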

=cut

sub tokenFreq {
    my $corpus = shift;
    my $freq = {};
    for my $l (@$corpus) {
        for my $t (@$l) {
            $freq->{$t}++;
        }
    }
    return $freq;
}

=head2 hapaxes

Receives a hash of token => frequency. Returns a hash with the elements
whose frequency is 1.
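
Continuing the C<tokenFreq> example above:

    my $hap = hapaxes({ a => 2, rose => 2, is => 1 });
    # $hap = { is => 1 }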

=cut

sub hapaxes {
    my $freq = shift;
    my $hapaxes = {};
    while(my ($token, $count) = each(%$freq)) {
        $hapaxes->{$token} = 1 if $count == 1;
    }
    return $hapaxes;
}

=head2 hapaxPositions

Builds a hash of term => position, where position is the index of the
sentence in which the term occurs.
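
A quick illustration (positions are 0-based sentence indices):

    my $pos = hapaxPositions({ is => 1 }, [ [qw/a rose is/], [qw/a rose/] ]);
    # $pos = { is => 0 }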

=cut

sub hapaxPositions {
    my ($hapaxes, $corpus) = @_;
    my $hapaxPos = {};
    my $corpus_size = @$corpus;
    for(my $ind = 0; $ind < $corpus_size; $ind++){
        my $l = $corpus->[$ind];
        for my $t (@$l) {
            $hapaxPos->{$t} = $ind if (defined($hapaxes->{$t}));
        }
    }
    return $hapaxPos;
}

=head2 bagSort

Sorts an array of pairs numerically by both coordinates and collapses
duplicates, returning triples C<[x, y, count]> where count is the number of
times the pair occurred.

=cut

sub bagSort {
    my $l = shift;
    my @sorted;
    my %aux;
    for my $coords (@$l) {
        my ($x,$y) = (@$coords);
        $aux{$x}{$y}++;
    }
    for my $x (sort { $a <=> $b } keys %aux){
        for my $y (sort { $a <=> $b } keys %{$aux{$x}}){
            push @sorted, [$x,$y, $aux{$x}{$y}];
        }
    }
    return \@sorted;
}

=head2 uniqSort

Sorts an array of pairs and removes duplicated pairs.

=cut

sub uniqSort {
    my $l = shift;
    my $hash = {};
    my $uniqSorted = [];
    map { $hash->{$_->[0]}{$_->[1]} = 1 } @$l;
    for my $x (sort { $a <=> $b } keys %$hash){
        for my $y (sort { $a <=> $b } keys %{$hash->{$x}}){
            push @$uniqSorted, [$x,$y];
        }
    }
    return $uniqSorted;
}

=head2 less

Receives two pairs. Checks whether both coordinates of the first pair are
lower than those of the second pair.

=cut

sub less {
    my ($a,$b) = @_;
    if ($a->[0] < $b->[0] and $a->[1] < $b->[1]) { return 1; }
    else { return 0; }
}

=head2 less_relaxed

Receives two pairs. Returns false if they are equal; otherwise checks
whether both coordinates of the first pair are lower than or equal to those
of the second.

=cut

sub less_relaxed {
    my ($a,$b) = @_;
    if ($a->[0] == $b->[0] and $a->[1] == $b->[1]){ return 0; }
    return ($a->[0] <= $b->[0] and $a->[1] <= $b->[1]);
}

=head2 less_or_equal

Receives two pairs. Checks whether both coordinates of the first pair are
lower than or equal to those of the second pair.

=cut

sub less_or_equal {
    my ($a,$b) = @_;
    if ($a->[0] <= $b->[0] and $a->[1] <= $b->[1]) { return 1; }
    else { return 0; }
}

=head2 maximalChain

Receives an array of pairs. Using dynamic programming, selects the maximal
chain, i.e. the heaviest sequence of pairs that is increasing in both
coordinates.
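
A small illustration (input must already be sorted and duplicate-free, e.g.
via C<bagSort>; each element is a C<[x, y, count]> triple):

    my $chain = maximalChain([ [0,0,1], [2,5,2], [3,1,1], [4,6,1] ]);
    # [3,1,1] cannot follow [2,5,2] (not larger in both coordinates),
    # so $chain = [ [0,0,1], [2,5,2], [4,6,1] ]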

=cut

# Assumes that uniqSort was called on the input! (translated from the original Hungarian)
sub maximalChain {
    my $pairs = shift;
    my $lattice = {};
    for my $p (@$pairs) {
        my $bestLength = 0;
        my $bestPredessor = undef;
        for my $q (@$pairs) {
            if(less_relaxed($q,$p) and defined($lattice->{$q->[0]}{$q->[1]})){
                (my $length,undef) = @{$lattice->{$q->[0]}{$q->[1]}};
                if($bestLength < $length+$q->[2]){
                    $bestLength = $length+$q->[2];
                    $bestPredessor = $q;
                }
            }
        }
        $lattice->{$p->[0]}{$p->[1]} = [$bestLength,$bestPredessor];
    }

    # Compute the pair with the maximal bestLength
    my $x = [ map { [$lattice->{$_->[0]}{$_->[1]}[0],$_] } @$pairs ];
    my $y = (sort { $b->[0] <=> $a->[0] } @$x)[0];
    my ($bestLength,$p) = @$y;

    # Walk the predecessor links back to the start
    my $chain = [];
    while($p){
        push @$chain, $p;
        (my $length, $p) = @{$lattice->{$p->[0]}{$p->[1]}};
    }
    return [reverse @$chain];
}

=head2 findCommonHap

Finds unique terms common to both corpora. The notion of equality can be
extended with a file of correspondences between terms.

=head3 findCommonHap($l1Hap,$l2Hap)

Returns a reference to a hash containing the elements common to the hashes
pointed to by the references $l1Hap and $l2Hap.

=head3 findCommonHap($l1Hap,$l2Hap,$corresp_file)

$corresp_file is the path of a correspondence file (see C<parseCorrespFile>);
a correspondence whose terms match exactly one hapax on each side adds that
hapax pair to the result.
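
For example (hypothetical hapax sets and correspondence file):

    # with no correspondence file, only identical tokens match
    my $common = findCommonHap({ Lisboa => 1 }, { Lisbon => 1 });
    # $common = {}

    # with a correspondence file whose body contains "Lisboa = Lisbon"
    $common = findCommonHap({ Lisboa => 1 }, { Lisbon => 1 }, 'pt-en.corresp');
    # $common = { Lisboa => 'Lisbon' }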

=cut

sub findCommonHap {
    my ($l1Hap, $l2Hap, $corresp_file) = @_;

    # Original algorithm (find occurrences of: unique term_l1 = unique term_l2)
    my %hash;
    @hash{keys %$l1Hap} = keys %$l1Hap;
    my $commonHap = {};
    map { $commonHap->{$_} = $_ } grep { $hash{$_} } keys %$l2Hap;

    # Lists of correspondences
    if (defined($corresp_file)) {
        my $corresp_list = parseCorrespFile($corresp_file);

        foreach my $corresp (@$corresp_list) {
            my ($l1_terms,$l2_terms) = @$corresp;
            my $l1_sum = 0;
            my $l1_term;
            for (@$l1_terms,@$l2_terms){
                if(defined($l1Hap->{$_})){
                    $l1_term = $_;
                    $l1_sum++;
                }
            }
            next unless $l1_sum == 1;

            my $l2_sum = 0;
            my $l2_term;
            for (@$l2_terms,@$l1_terms){
                if(defined($l2Hap->{$_})){
                    $l2_term = $_;
                    $l2_sum++;
                }
            }
            next unless $l2_sum == 1;
            $commonHap->{$l1_term} = $l2_term;
        }
    }
    return $commonHap;
}

=head2 selectFromChain

Selects a chain trying to obey the maximalChunkSize constraint.

=cut

sub selectFromChain {
    my ($chain,$maximalChunkSize) = @_;
    my $forced = 0;
    my $cursor;
    my $filteredChain = [];

    my $chain_size = @$chain;
    for (my $ind = 0; $ind < $chain_size; $ind++) {
        my $p = $chain->[$ind];
        if($ind == 0) {
            push @$filteredChain, $p;
            $cursor = $p;
            next;
        }
        if( $p->[0] - $cursor->[0] > $maximalChunkSize or
            $p->[1] - $cursor->[1] > $maximalChunkSize) {
            my $lastPos;
            $lastPos = ($ind!=0 ? $chain->[$ind-1] : [0,0]);
            if ($lastPos != $cursor) { push @$filteredChain, $lastPos }
            else {
                # even the nearest anchor is too far: accept an oversized chunk
                push @$filteredChain,$p;
                $forced = 1;
            }
            $cursor = $filteredChain->[-1];
        }
    }

    push @$filteredChain, $chain->[-1] unless(defined($filteredChain->[-1]) and
                                              $filteredChain->[-1]==$chain->[-1]);

    return ($filteredChain,$forced);
}

=head2 get_corpus

Given a file name, splits the segments and words into an array of arrays.

Returns a reference to the array of arrays, a reference to an array of
pairs with the start and end offsets of each segment, and a reference to
the full text.
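
For example (file name illustrative):

    my ($corpus,$offsets,$txtref) = get_corpus('large.en');
    # $corpus->[0]  : array of the words of the first line
    # $offsets->[0] : [start_byte, end_byte] of the first line
    # $$txtref      : the whole file as a single string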

=cut

sub get_corpus {
    my ($filename) = @_;
    open my $fh, '<', $filename or die "Cannot open $filename: $!";
    my ($start,$end);
    $start = 0;
    my $offsets = [];
    my $corpus = [];
    while(<$fh>){
        $end = tell($fh)-1;
        push @$offsets, [$start,$end];
        $start = $end+1;
        push @$corpus, [ split ];
    }
    close $fh;

    open $fh, '<', $filename or die "Cannot open $filename: $!";
    my $txt = join '',<$fh>;
    close $fh;

    return ($corpus, $offsets, \$txt);
}

=head2 strInterval

Given a corpus and start and end positions, returns a string with the
contents within the given range.

=head3 strInterval($corpus,$first,$last)

Concatenates all the words in the lines comprised in the $first..$last-1
range of the corpus.

=head3 strInterval($corpus,$first,$last,$offsets)

Retrieves from the original text (here $corpus is a reference to the full
text) the substring from the beginning of segment $first to the end of
segment $last-1.
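
For instance, to extract the first two segments of a corpus loaded with
C<get_corpus>:

    my ($corpus,$offsets,$txtref) = get_corpus('large.en');
    my $head = strInterval($txtref, 0, 2, $offsets);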

=cut

sub strInterval {
    my ($corpus,$first,$last,$offsets) = @_;
    unless (defined($offsets)){
        my $s;
        for my $line (@$corpus[$first..$last-1]){
            $s .= (join ' ', @$line) . "\n";
        }
        return $s;
    }
    else {
        my $start = $offsets->[$first][0];
        my $end = $offsets->[$last-1][1];

        my $txt = $$corpus;
        my $s = substr $txt, $start, ($end-$start+1);
        return $s;
    }
}

=head2 parseCorrespFile

Parses a given file with correspondences between two given languages. The
file must follow this DSL:

    file : header correspondence*
    header: 'langs:' L1, L2
    correspondence : term (',' term)* '=' term (',' term)*
    term : word (\s word)*

Does not yet support multi-word terms nor multi-term correspondences!
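
A hypothetical C<pt-en.corresp> file within the supported subset:

    langs: pt, en
    Lisboa = Lisbon   # one single-word term on each side
    Porto = Oporto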

=cut

sub parseCorrespFile {
    my ($filepath) = @_;
    open my $fh, '<', $filepath or die "Cannot open $filepath: $!";
    my $corresp_list = [];

    my $header = <$fh>;
    $header =~ /^langs:\s*(\w+)\s*,\s*(\w+)/i;
    my ($l1,$l2) = ($1,$2);

    while (<$fh>){
        s/#.*$//;           # strip comments
        next if /^\s*$/;    # skip blank lines
        chomp;
        my ($str_l1, $str_l2) = split /\s*=\s*/,$_;
        my $terms_l1 = [ split /\s*,\s*/,$str_l1 ];
        my $terms_l2 = [ split /\s*,\s*/,$str_l2 ];
        push @$corresp_list, [$terms_l1,$terms_l2];
    }
    close $fh;

    return $corresp_list;
}

=head2 seg_split

Splits the text referenced by $txtref into segments, according to $options:
C<-pml> splits on paragraph markup, C<-newline> on newlines. Returns the
corpus (an array of arrays of tokens), the segment offsets, and the text
reference itself.
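
A brief sketch (segmentation and tokenization options combined):

    my $txt = "one sentence\nanother one\n";
    my ($corpus,$offsets) = seg_split(\$txt, { '-newline' => 1, '-ws' => 1 });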

=cut

sub seg_split {
    my ($txtref, $options) = @_;
    my ($corpus,$offsets);
    ($corpus,$offsets) = _seg_split_pml($txtref,$options) if $options->{'-pml'};
    ($corpus,$offsets) = _seg_split_newline($txtref,$options) if $options->{'-newline'};
    return ($corpus, $offsets, $txtref);
}

sub _seg_split_pml {
    my ($txtref,$options) = @_;
    my $corpus = [];
    my $offsets = [];

    while($$txtref =~ /<p>(.*?)<\/p>/g){
        my ($start,$end) = ($-[0],$+[0]);
        push @$offsets, [$start,$end];
        push @$corpus, token_split($1,$options);
    }
    return ($corpus, $offsets);
}

sub _seg_split_newline {
    my ($txtref,$options) = @_;
    my $corpus = [];
    my $offsets = [];

    while($$txtref =~ /(.*)\n/g){
        my ($start,$end) = ($-[1],$+[1]);
        push @$offsets, [$start,$end];
        push @$corpus, token_split($1,$options);
    }
    return ($corpus, $offsets);
}

=head2 token_split

Splits a segment into tokens, according to $options: C<-ws> splits on
whitespace, C<-punct> also splits on punctuation.
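
For example:

    my $tokens = token_split('Hello, world!', { '-punct' => 1 });
    # $tokens = [ 'Hello', 'world' ]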

=cut

sub token_split {
    my ($seg,$options) = @_;
    my $tokens;
    $tokens = _token_split_ws($seg) if $options->{'-ws'};
    $tokens = _token_split_punct($seg) if $options->{'-punct'};
    return $tokens;
}

sub _token_split_ws {
    my $seg = shift;
    return [ split ' ',$seg ];
}

sub _token_split_punct {
    my $seg = shift;
    # note: inside a character class \b matches backspace, not a word boundary
    return [ split /[\b\s?!\.,]+/, $seg ];
}

=head1 AUTHOR

Andre Santos, C<< <andrefs at cpan.org> >>

=head1 BUGS

Please report any bugs or feature requests to
C<bug-text-perfide-partialalign at rt.cpan.org>, or through the web
interface at
L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=Text-Perfide-PartialAlign>.
I will be notified, and then you'll automatically be notified of progress
on your bug as I make changes.

=head1 SUPPORT

You can find documentation for this module with the perldoc command.

    perldoc Text::Perfide::PartialAlign

You can also look for information at:

=over 4

=item * RT: CPAN's request tracker (report bugs here)

L<http://rt.cpan.org/NoAuth/Bugs.html?Dist=Text-Perfide-PartialAlign>

=item * AnnoCPAN: Annotated CPAN documentation

L<http://annocpan.org/dist/Text-Perfide-PartialAlign>

=item * CPAN Ratings

L<http://cpanratings.perl.org/d/Text-Perfide-PartialAlign>

=item * Search CPAN

L<http://search.cpan.org/dist/Text-Perfide-PartialAlign/>

=back

=head1 ACKNOWLEDGEMENTS

Based on the original script partialAlign.py bundled with
hunalign -- http://mokk.bme.hu/resources/hunalign/ .

Thanks to Daniel Varga for helping us to understand how partialAlign.py works.

=head1 LICENSE AND COPYRIGHT

Copyright 2012 Andre Santos.

This program is free software; you can redistribute it and/or modify it
under the terms of either: the GNU General Public License as published
by the Free Software Foundation; or the Artistic License.

See http://dev.perl.org/licenses/ for more information.

=cut

1; # End of Text::Perfide::PartialAlign