line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
|
2
|
|
|
|
|
|
|
######################################################################## |
3
|
|
|
|
|
|
|
# Author: Patrik Lambert (lambert@talp.ucp.es) |
4
|
|
|
|
|
|
|
# Contributions from Adria de Gispert (agispert@gps.tsc.upc.es) |
5
|
|
|
|
|
|
|
# and Josep Maria Crego (jmcrego@gps.tsc.upc.es) |
6
|
|
|
|
|
|
|
# Description: Library of tools to process a set of links between the |
7
|
|
|
|
|
|
|
# words of two sentences. |
8
|
|
|
|
|
|
|
# |
9
|
|
|
|
|
|
|
#----------------------------------------------------------------------- |
10
|
|
|
|
|
|
|
# |
11
|
|
|
|
|
|
|
# Copyright 2004 by Patrik Lambert |
12
|
|
|
|
|
|
|
# |
13
|
|
|
|
|
|
|
# This program is free software; you can redistribute it and/or modify |
14
|
|
|
|
|
|
|
# it under the terms of the GNU General Public License as published by |
15
|
|
|
|
|
|
|
# the Free Software Foundation; either version 2 of the License, or |
16
|
|
|
|
|
|
|
# (at your option) any later version. |
17
|
|
|
|
|
|
|
# |
18
|
|
|
|
|
|
|
# This program is distributed in the hope that it will be useful, |
19
|
|
|
|
|
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of |
20
|
|
|
|
|
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
21
|
|
|
|
|
|
|
# GNU General Public License for more details. |
22
|
|
|
|
|
|
|
# |
23
|
|
|
|
|
|
|
# You should have received a copy of the GNU General Public License |
24
|
|
|
|
|
|
|
# along with this program; if not, write to the Free Software |
25
|
|
|
|
|
|
|
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. |
26
|
|
|
|
|
|
|
######################################################################## |
27
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
package Lingua::Alignment; |
29
|
|
|
|
|
|
|
$VERSION=1.1; |
30
|
1
|
|
|
1
|
|
6
|
use strict; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
44
|
|
31
|
1
|
|
|
1
|
|
706
|
use Lingua::AlignmentSlice; |
|
1
|
|
|
|
|
5
|
|
|
1
|
|
|
|
|
50
|
|
32
|
1
|
|
|
1
|
|
14
|
use Lingua::AlSetLib 1.1; |
|
1
|
|
|
|
|
45
|
|
|
1
|
|
|
|
|
23
|
|
33
|
1
|
|
|
1
|
|
4
|
use Dumpvalue; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
12999
|
|
34
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
#an alignment is a hash with 4 components: |
36
|
|
|
|
|
|
|
# {sourceAl} ref to source position array, each position containing the array of aligned target positions. |
37
|
|
|
|
|
|
|
# Each linked target token is indicated with the array: (position,S(sure)/P(possible),confidence score) |
38
|
|
|
|
|
|
|
# {targetAl} same as sourceAl but reversed |
39
|
|
|
|
|
|
|
# {sourceWords} and {targetWords}: array of corresponding words |
40
|
|
|
|
|
|
|
# {sourceLinks}: hash (indexed by the source token position $j and target $i in the link: {$j $i} of arrays giving |
41
|
|
|
|
|
|
|
# {targetLinks}: same as sourceLinks, for target alignment |
42
|
|
|
|
|
|
|
# more information about the link: ( S(sure) or P(possible) , confidence ) |
43
|
|
|
|
|
|
|
sub new { |
44
|
20
|
|
|
20
|
0
|
23
|
my $pkg = shift; |
45
|
20
|
|
|
|
|
26
|
my $al = {}; |
46
|
|
|
|
|
|
|
|
47
|
20
|
|
|
|
|
50
|
$al->{sourceAl}=[]; |
48
|
20
|
|
|
|
|
52
|
$al->{targetAl}=[]; |
49
|
20
|
|
|
|
|
34
|
$al->{sourceWords} = []; |
50
|
20
|
|
|
|
|
33
|
$al->{targetWords} = []; |
51
|
20
|
|
|
|
|
36
|
$al->{sourceLinks} = {}; |
52
|
20
|
|
|
|
|
31
|
$al->{targetLinks} = {}; |
53
|
20
|
|
|
|
|
74
|
return bless $al,$pkg; |
54
|
|
|
|
|
|
|
} |
55
|
|
|
|
|
|
|
|
56
|
|
|
|
|
|
|
sub loadFromGiza { |
57
|
0
|
|
|
0
|
0
|
0
|
my ($al,$alignmentString,$targetString,$reverseAlignmentString) = @_; |
58
|
0
|
|
|
|
|
0
|
my ($i,$elem,$positionsString); |
59
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
#TARGET |
61
|
0
|
|
|
|
|
0
|
$targetString =~ s/^\s+//; #trim |
62
|
0
|
|
|
|
|
0
|
$targetString =~ s/\s+$//; #trim |
63
|
0
|
|
|
|
|
0
|
$targetString =~ s/\s{2,}/ /g; #remove multiple spaces |
64
|
0
|
0
|
|
|
|
0
|
if ($targetString !~ /^NULL /){ |
65
|
0
|
|
|
|
|
0
|
$al->{targetWords}=["NULL"]; #we keep a place for the NULL word of the other direction |
66
|
|
|
|
|
|
|
} |
67
|
0
|
|
|
|
|
0
|
push @{$al->{targetWords}},split(/ /,$targetString); |
|
0
|
|
|
|
|
0
|
|
68
|
|
|
|
|
|
|
|
69
|
0
|
|
|
|
|
0
|
$alignmentString =~ s/\s{2,}/ /g; #remove multiple spaces |
70
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
#SOURCE |
72
|
0
|
|
|
|
|
0
|
my $srcString = $alignmentString; |
73
|
0
|
|
|
|
|
0
|
$srcString =~ s/ \(\{[^\}]+\}\)//g; |
74
|
0
|
|
|
|
|
0
|
$srcString =~ s/^\s+//; $srcString =~ s/\s+$//; |
|
0
|
|
|
|
|
0
|
|
75
|
0
|
|
|
|
|
0
|
@{$al->{sourceWords}}=split / /,$srcString; |
|
0
|
|
|
|
|
0
|
|
76
|
|
|
|
|
|
|
|
77
|
|
|
|
|
|
|
#S2T LINKS |
78
|
|
|
|
|
|
|
# here you can't use a hash because you would loose the order |
79
|
0
|
|
|
|
|
0
|
$_ = $alignmentString; |
80
|
0
|
|
|
|
|
0
|
my @correspondances = /\(\{(.+?)\}\)/g; #take what is between parentesis ie links |
81
|
0
|
|
|
|
|
0
|
foreach my $positionsString (@correspondances){ |
82
|
0
|
|
|
|
|
0
|
$positionsString =~ s/^\s+//; #trim |
83
|
0
|
|
|
|
|
0
|
$positionsString =~ s/\s+$//; #trim |
84
|
0
|
|
|
|
|
0
|
push @{$al->{sourceAl}}, [split / /,$positionsString]; |
|
0
|
|
|
|
|
0
|
|
85
|
|
|
|
|
|
|
} |
86
|
|
|
|
|
|
|
|
87
|
|
|
|
|
|
|
#REVERSE ALIGNMENT |
88
|
0
|
0
|
|
|
|
0
|
if (length($reverseAlignmentString)>0){ |
89
|
0
|
|
|
|
|
0
|
$reverseAlignmentString =~ s/\(\{ \}\)/\(\{ \}\)/g; #insert blanks in unlinked words |
90
|
0
|
|
|
|
|
0
|
$reverseAlignmentString =~ s/\}\)\s*$//g; #rtrim |
91
|
|
|
|
|
|
|
|
92
|
0
|
|
|
|
|
0
|
@correspondances = split /\(\{\s|\}\)\s/, $reverseAlignmentString; |
93
|
0
|
|
|
|
|
0
|
for ($i=0;$i<@correspondances;$i+=2) { |
94
|
0
|
|
|
|
|
0
|
$positionsString = $correspondances[$i+1]; |
95
|
0
|
|
|
|
|
0
|
$positionsString =~ s/^\s+|\s+$//g; #trim |
96
|
0
|
|
|
|
|
0
|
$positionsString =~ s/\s{2,}/ /g; #remove multiple spaces |
97
|
0
|
|
|
|
|
0
|
push @{$al->{targetAl}}, [split / /,$positionsString]; |
|
0
|
|
|
|
|
0
|
|
98
|
|
|
|
|
|
|
} |
99
|
|
|
|
|
|
|
} |
100
|
|
|
|
|
|
|
} |
101
|
|
|
|
|
|
|
|
102
|
|
|
|
|
|
|
#input: $refToAlignedPairs_ts (target to source),$sourceSentence and $targetSentence are optional |
103
|
|
|
|
|
|
|
sub loadFromBlinker{ |
104
|
20
|
|
|
20
|
0
|
33
|
my ($al,$refToAlignedPairs_st,$refToAlignedPairs_ts,$sourceSentence,$targetSentence)=@_; |
105
|
20
|
|
|
|
|
22
|
my $i; |
106
|
|
|
|
|
|
|
my $pairStr; |
107
|
0
|
|
|
|
|
0
|
my @pair; |
108
|
0
|
|
|
|
|
0
|
my @pairs; |
109
|
|
|
|
|
|
|
|
110
|
|
|
|
|
|
|
#LOAD SENTENCES (if applicable) |
111
|
20
|
100
|
|
|
|
43
|
if (defined($sourceSentence)){ |
112
|
10
|
|
|
|
|
103
|
$sourceSentence =~ s/^\s+|\s+$//g; #trim |
113
|
10
|
|
|
|
|
28
|
$sourceSentence =~ s/\s{2,}/ /g; #remove multiple space |
114
|
|
|
|
|
|
|
|
115
|
10
|
50
|
|
|
|
34
|
if ($sourceSentence !~ /^NULL /){ |
116
|
0
|
|
|
|
|
0
|
$al->{sourceWords}=["NULL"]; |
117
|
|
|
|
|
|
|
} |
118
|
10
|
|
|
|
|
11
|
push @{$al->{sourceWords}},split(/ /,$sourceSentence); |
|
10
|
|
|
|
|
62
|
|
119
|
|
|
|
|
|
|
} |
120
|
20
|
100
|
|
|
|
38
|
if (defined($targetSentence)){ |
121
|
10
|
|
|
|
|
95
|
$targetSentence =~ s/^\s+|\s+$//g; |
122
|
10
|
|
|
|
|
28
|
$targetSentence =~ s/\s{2,}/ /g; |
123
|
|
|
|
|
|
|
|
124
|
10
|
50
|
|
|
|
30
|
if ($targetSentence !~ /^NULL /){ |
125
|
0
|
|
|
|
|
0
|
$al->{targetWords}=["NULL"]; |
126
|
|
|
|
|
|
|
} |
127
|
10
|
|
|
|
|
12
|
push @{$al->{targetWords}},split(/ /,$targetSentence); |
|
10
|
|
|
|
|
49
|
|
128
|
|
|
|
|
|
|
} |
129
|
|
|
|
|
|
|
|
130
|
|
|
|
|
|
|
#LOAD SOURCE TO TARGET ALIGNMENT: |
131
|
|
|
|
|
|
|
#read alignment data |
132
|
20
|
|
|
|
|
35
|
foreach $pairStr (@$refToAlignedPairs_st){ |
133
|
165
|
|
|
|
|
518
|
$pairStr =~ s/^\s+|\s+$//g; #trim |
134
|
165
|
|
|
|
|
287
|
$pairStr =~ s/\s{2,}/ /g; #remove multiple space |
135
|
165
|
|
|
|
|
413
|
@pair = split / /,$pairStr; |
136
|
165
|
|
|
|
|
173
|
push @{$pairs[$pair[0]]},$pair[1]; |
|
165
|
|
|
|
|
380
|
|
137
|
|
|
|
|
|
|
#load extra information (like S/P, confidence) |
138
|
165
|
50
|
|
|
|
402
|
if (@pair > 2){ |
139
|
0
|
|
|
|
|
0
|
$al->{sourceLinks}->{$pair[0]." ".$pair[1]}=[splice(@pair,2)] ; |
140
|
|
|
|
|
|
|
} |
141
|
|
|
|
|
|
|
} |
142
|
|
|
|
|
|
|
# take into account unaligned words to have no undef entry in array: |
143
|
|
|
|
|
|
|
# Since we really want to think in terms of alignment and not words, we don't base ourself on the number of words |
144
|
20
|
|
|
|
|
52
|
for ($i=0;$i<@pairs;$i++){ |
145
|
152
|
100
|
|
|
|
241
|
if (defined($pairs[$i])){ |
146
|
124
|
|
|
|
|
99
|
push @{$al->{sourceAl}},$pairs[$i]; |
|
124
|
|
|
|
|
351
|
|
147
|
|
|
|
|
|
|
}else{ |
148
|
28
|
|
|
|
|
29
|
push @{$al->{sourceAl}},[]; |
|
28
|
|
|
|
|
96
|
|
149
|
|
|
|
|
|
|
} |
150
|
|
|
|
|
|
|
} |
151
|
|
|
|
|
|
|
# print main::Dumper($refToAlignedPairs_st,$al->{sourceAl}); |
152
|
|
|
|
|
|
|
|
153
|
|
|
|
|
|
|
#LOAD TARGET TO SOURCE ALIGNMENT: |
154
|
20
|
50
|
|
|
|
39
|
if (defined($refToAlignedPairs_ts)){ |
155
|
20
|
50
|
|
|
|
135
|
if (@$refToAlignedPairs_ts>0){ |
156
|
0
|
|
|
|
|
0
|
@pairs=(); |
157
|
|
|
|
|
|
|
#read alignment data |
158
|
0
|
|
|
|
|
0
|
foreach $pairStr (@$refToAlignedPairs_ts){ |
159
|
0
|
|
|
|
|
0
|
$pairStr =~ s/^\s+|\s+$//g; #trim |
160
|
0
|
|
|
|
|
0
|
$pairStr =~ s/\s{2,}/ /g; #remove multiple space |
161
|
0
|
|
|
|
|
0
|
@pair = split / /,$pairStr; |
162
|
0
|
|
|
|
|
0
|
push @{$pairs[$pair[0]]},$pair[1]; |
|
0
|
|
|
|
|
0
|
|
163
|
|
|
|
|
|
|
#load extra information (like S/P, confidence) |
164
|
0
|
0
|
|
|
|
0
|
if (@pair > 2){ |
165
|
0
|
|
|
|
|
0
|
$al->{targetLinks}->{$pair[0]." ".$pair[1]}=[splice(@pair,2)] ; |
166
|
|
|
|
|
|
|
} |
167
|
|
|
|
|
|
|
} |
168
|
|
|
|
|
|
|
# take into account unaligned words to have no undef entry in array: |
169
|
0
|
|
|
|
|
0
|
for ($i=0;$i<@pairs;$i++){ |
170
|
0
|
0
|
|
|
|
0
|
if (defined($pairs[$i])){ |
171
|
0
|
|
|
|
|
0
|
push @{$al->{targetAl}},$pairs[$i]; |
|
0
|
|
|
|
|
0
|
|
172
|
|
|
|
|
|
|
}else{ |
173
|
0
|
|
|
|
|
0
|
push @{$al->{targetAl}},[]; |
|
0
|
|
|
|
|
0
|
|
174
|
|
|
|
|
|
|
} |
175
|
|
|
|
|
|
|
} |
176
|
|
|
|
|
|
|
} |
177
|
|
|
|
|
|
|
} |
178
|
|
|
|
|
|
|
# print main::Dumper($refToAlignedPairs_ts,$al->{targetAl}); |
179
|
|
|
|
|
|
|
} |
180
|
|
|
|
|
|
|
|
181
|
|
|
|
|
|
|
sub loadFromTalp{ |
182
|
0
|
|
|
0
|
0
|
0
|
my ($al,$st_string,$ts_string,$sourceSentence,$targetSentence)=@_; |
183
|
|
|
|
|
|
|
|
184
|
|
|
|
|
|
|
#LOAD SENTENCES (if applicable) |
185
|
0
|
0
|
|
|
|
0
|
if (defined($sourceSentence)){ |
186
|
0
|
|
|
|
|
0
|
$sourceSentence =~ s/^\s+//g; #trim |
187
|
0
|
|
|
|
|
0
|
$sourceSentence =~ s/\s+$//g; #trim |
188
|
0
|
|
|
|
|
0
|
$sourceSentence =~ s/\s{2,}/ /g; #remove multiple space |
189
|
0
|
0
|
|
|
|
0
|
if ($sourceSentence !~ /^NULL /){ |
190
|
0
|
|
|
|
|
0
|
$al->{sourceWords}=["NULL"]; |
191
|
|
|
|
|
|
|
} |
192
|
0
|
|
|
|
|
0
|
push @{$al->{sourceWords}},split(/ /,$sourceSentence); |
|
0
|
|
|
|
|
0
|
|
193
|
|
|
|
|
|
|
} |
194
|
0
|
0
|
|
|
|
0
|
if (defined($targetSentence)){ |
195
|
0
|
|
|
|
|
0
|
$targetSentence =~ s/^\s+//g; |
196
|
0
|
|
|
|
|
0
|
$targetSentence =~ s/\s+$//g; |
197
|
0
|
|
|
|
|
0
|
$targetSentence =~ s/\s{2,}/ /g; |
198
|
0
|
0
|
|
|
|
0
|
if ($targetSentence !~ /^NULL /){ |
199
|
0
|
|
|
|
|
0
|
$al->{targetWords}=["NULL"]; |
200
|
|
|
|
|
|
|
} |
201
|
0
|
|
|
|
|
0
|
push @{$al->{targetWords}},split(/ /,$targetSentence); |
|
0
|
|
|
|
|
0
|
|
202
|
|
|
|
|
|
|
} |
203
|
|
|
|
|
|
|
|
204
|
|
|
|
|
|
|
#LOAD SOURCE TO TARGET ALIGNMENT: |
205
|
0
|
0
|
|
|
|
0
|
if ($st_string ne ""){ |
206
|
0
|
|
|
|
|
0
|
my @pairs; |
207
|
0
|
|
|
|
|
0
|
$st_string =~ s/\s{2,}/ /g; #remove multiple space |
208
|
0
|
|
|
|
|
0
|
$st_string =~ s/^\s+//g; #trim |
209
|
0
|
|
|
|
|
0
|
$st_string =~ s/\s+$//g; #trim |
210
|
|
|
|
|
|
|
#read alignment data |
211
|
0
|
|
|
|
|
0
|
my @lnks=split (/ /,$st_string); |
212
|
0
|
|
|
|
|
0
|
foreach my $pairStr (@lnks){ |
213
|
0
|
|
|
|
|
0
|
my @info = split /:/,$pairStr; |
214
|
0
|
|
|
|
|
0
|
my ($src,$sep,$trg) = split /([^\d])/,$info[0]; |
215
|
0
|
|
|
|
|
0
|
push @{$pairs[$src]},$trg; |
|
0
|
|
|
|
|
0
|
|
216
|
|
|
|
|
|
|
#load extra information (like S/P, confidence) |
217
|
0
|
0
|
|
|
|
0
|
if ($sep eq "s"){ |
|
|
0
|
|
|
|
|
|
218
|
0
|
|
|
|
|
0
|
$al->{sourceLinks}->{$src." ".$trg}=["S"]; |
219
|
|
|
|
|
|
|
}elsif ($sep eq "p" ){ |
220
|
0
|
|
|
|
|
0
|
$al->{sourceLinks}->{$src." ".$trg}=["P"]; |
221
|
|
|
|
|
|
|
} |
222
|
0
|
|
|
|
|
0
|
for (my $i=1;$i<@info;$i++){ |
223
|
0
|
|
|
|
|
0
|
push @{$al->{sourceLinks}->{$src." ".$trg}},$info[$i]; |
|
0
|
|
|
|
|
0
|
|
224
|
|
|
|
|
|
|
} |
225
|
|
|
|
|
|
|
} |
226
|
|
|
|
|
|
|
# take into account unaligned words to have no undef entry in array: |
227
|
|
|
|
|
|
|
# Since we really want to think in terms of alignment and not words, we don't base ourself on the number of words |
228
|
0
|
|
|
|
|
0
|
for (my $i=0;$i<@pairs;$i++){ |
229
|
0
|
0
|
|
|
|
0
|
if (defined($pairs[$i])){ |
230
|
0
|
|
|
|
|
0
|
push @{$al->{sourceAl}},$pairs[$i]; |
|
0
|
|
|
|
|
0
|
|
231
|
|
|
|
|
|
|
}else{ |
232
|
0
|
|
|
|
|
0
|
push @{$al->{sourceAl}},[]; |
|
0
|
|
|
|
|
0
|
|
233
|
|
|
|
|
|
|
} |
234
|
|
|
|
|
|
|
} |
235
|
|
|
|
|
|
|
} |
236
|
|
|
|
|
|
|
# print main::Dumper($refToAlignedPairs_st,$al->{sourceAl}); |
237
|
0
|
|
|
|
|
0
|
my $refToAlignedPairs_ts; |
238
|
|
|
|
|
|
|
my $pairStr; |
239
|
0
|
|
|
|
|
0
|
my @pair; |
240
|
|
|
|
|
|
|
#LOAD TARGET TO SOURCE ALIGNMENT: |
241
|
0
|
0
|
|
|
|
0
|
if ($ts_string ne ""){ |
242
|
0
|
|
|
|
|
0
|
$ts_string =~ s/^\s+|\s+$//g; #trim |
243
|
0
|
|
|
|
|
0
|
$ts_string =~ s/\s{2,}/ /g; #remove multiple space |
244
|
0
|
|
|
|
|
0
|
my @pairs=(); |
245
|
|
|
|
|
|
|
#read alignment data |
246
|
0
|
|
|
|
|
0
|
my @lnks=split (/ /,$ts_string); |
247
|
0
|
|
|
|
|
0
|
foreach my $pairStr (@lnks){ |
248
|
0
|
|
|
|
|
0
|
my @info = split /:/,$pairStr; |
249
|
0
|
|
|
|
|
0
|
my ($src,$sep,$trg) = split /([^\d])/,$info[0]; |
250
|
0
|
|
|
|
|
0
|
push @{$pairs[$src]},$trg; |
|
0
|
|
|
|
|
0
|
|
251
|
|
|
|
|
|
|
#load extra information (like S/P, confidence) |
252
|
0
|
0
|
|
|
|
0
|
if ($sep eq "s"){ |
|
|
0
|
|
|
|
|
|
253
|
0
|
|
|
|
|
0
|
$al->{targetLinks}->{$src." ".$trg}=["S"]; |
254
|
|
|
|
|
|
|
}elsif ($sep eq "p" ){ |
255
|
0
|
|
|
|
|
0
|
$al->{targetLinks}->{$src." ".$trg}=["P"]; |
256
|
|
|
|
|
|
|
} |
257
|
0
|
|
|
|
|
0
|
for (my $i=1;$i<@info;$i++){ |
258
|
0
|
|
|
|
|
0
|
push @{$al->{targetLinks}->{$src." ".$trg}},$info[$i]; |
|
0
|
|
|
|
|
0
|
|
259
|
|
|
|
|
|
|
} |
260
|
|
|
|
|
|
|
} |
261
|
|
|
|
|
|
|
# take into account unaligned words to have no undef entry in array: |
262
|
0
|
|
|
|
|
0
|
for (my $i=0;$i<@pairs;$i++){ |
263
|
0
|
0
|
|
|
|
0
|
if (defined($pairs[$i])){ |
264
|
0
|
|
|
|
|
0
|
push @{$al->{targetAl}},$pairs[$i]; |
|
0
|
|
|
|
|
0
|
|
265
|
|
|
|
|
|
|
}else{ |
266
|
0
|
|
|
|
|
0
|
push @{$al->{targetAl}},[]; |
|
0
|
|
|
|
|
0
|
|
267
|
|
|
|
|
|
|
} |
268
|
|
|
|
|
|
|
} |
269
|
|
|
|
|
|
|
} |
270
|
|
|
|
|
|
|
# print main::Dumper($refToAlignedPairs_ts,$al->{targetAl}); |
271
|
|
|
|
|
|
|
} |
272
|
|
|
|
|
|
|
|
273
|
|
|
|
|
|
|
# sourceSentence: returns the target sentence tokens without NULL word (separated by " "), by parsing the alignment object |
274
|
|
|
|
|
|
|
sub sourceSentence { |
275
|
0
|
|
|
0
|
0
|
0
|
my $al = shift; |
276
|
0
|
|
|
|
|
0
|
my @sentence=@{$al->{sourceWords}}; |
|
0
|
|
|
|
|
0
|
|
277
|
0
|
|
|
|
|
0
|
shift @sentence; |
278
|
0
|
|
|
|
|
0
|
return join " ",@sentence; |
279
|
|
|
|
|
|
|
} |
280
|
|
|
|
|
|
|
|
281
|
|
|
|
|
|
|
# TargetSentence: returns the target sentence tokens without NULL word (separated by " "), by parsing the alignment object |
282
|
|
|
|
|
|
|
sub targetSentence { |
283
|
0
|
|
|
0
|
0
|
0
|
my $al = shift; |
284
|
0
|
|
|
|
|
0
|
my @sentence=@{$al->{targetWords}}; |
|
0
|
|
|
|
|
0
|
|
285
|
0
|
|
|
|
|
0
|
shift @sentence; |
286
|
0
|
|
|
|
|
0
|
return join " ",@sentence; |
287
|
|
|
|
|
|
|
} |
288
|
|
|
|
|
|
|
|
289
|
|
|
|
|
|
|
# Remove links to NULL. |
290
|
|
|
|
|
|
|
# Note: to do this we need the alignment to be loaded so we do it in a separate function |
291
|
|
|
|
|
|
|
sub forceNoNullAlign { |
292
|
20
|
|
|
20
|
0
|
22
|
my $al = shift; |
293
|
20
|
|
|
|
|
20
|
my ($j,$i); |
294
|
0
|
|
|
|
|
0
|
my $continue; |
295
|
0
|
|
|
|
|
0
|
my $source; |
296
|
20
|
|
|
|
|
41
|
my @sides=("source","target"); |
297
|
|
|
|
|
|
|
|
298
|
20
|
|
|
|
|
25
|
foreach $source (@sides){ |
299
|
40
|
|
|
|
|
99
|
$al->{$source."Al"}[0]=[]; |
300
|
40
|
|
|
|
|
55
|
for ($j=1;$j<@{$al->{$source."Al"}};$j++){ |
|
172
|
|
|
|
|
504
|
|
301
|
132
|
50
|
|
|
|
335
|
if ($al->isIn($source."Al",$j,0)){ |
302
|
0
|
|
|
|
|
0
|
$continue=1; |
303
|
0
|
|
0
|
|
|
0
|
for ($i=0;$i<@{$al->{$source."Al"}[$j]} && $continue;$i++){ |
|
0
|
|
|
|
|
0
|
|
304
|
0
|
0
|
|
|
|
0
|
if ($al->{$source."Al"}[$j][$i]==0){ |
305
|
0
|
|
|
|
|
0
|
splice(@{$al->{$source."Al"}[$j]}, $i, 1); |
|
0
|
|
|
|
|
0
|
|
306
|
0
|
|
|
|
|
0
|
$continue=0; |
307
|
|
|
|
|
|
|
} |
308
|
|
|
|
|
|
|
} |
309
|
|
|
|
|
|
|
} |
310
|
|
|
|
|
|
|
} |
311
|
|
|
|
|
|
|
} #foreach |
312
|
|
|
|
|
|
|
} |
313
|
|
|
|
|
|
|
|
314
|
|
|
|
|
|
|
# Link to NULL with a P (Possible) alignment all words that are not linked to anything |
315
|
|
|
|
|
|
|
sub forceNullAlign { |
316
|
0
|
|
|
0
|
0
|
0
|
my $al = shift; |
317
|
0
|
|
|
|
|
0
|
my ($j,$i); |
318
|
0
|
|
|
|
|
0
|
my @reverseAl; |
319
|
0
|
|
|
|
|
0
|
my $source; |
320
|
0
|
|
|
|
|
0
|
my @sides=("source","target"); |
321
|
|
|
|
|
|
|
|
322
|
0
|
|
|
|
|
0
|
foreach $source (@sides){ |
323
|
0
|
|
|
|
|
0
|
@reverseAl = (); |
324
|
0
|
|
|
|
|
0
|
for ($j=1;$j<@{$al->{$source."Al"}};$j++){ |
|
0
|
|
|
|
|
0
|
|
325
|
0
|
0
|
|
|
|
0
|
if (@{$al->{$source."Al"}[$j]}==0){ |
|
0
|
|
|
|
|
0
|
|
326
|
0
|
|
|
|
|
0
|
push @{$al->{$source."Al"}[$j]},0; |
|
0
|
|
|
|
|
0
|
|
327
|
0
|
|
|
|
|
0
|
$al->{$source."Links"}->{"$j 0"}= ["P"]; |
328
|
|
|
|
|
|
|
}else{ |
329
|
0
|
|
|
|
|
0
|
foreach $i (@{$al->{$source."Al"}[$j]}){ |
|
0
|
|
|
|
|
0
|
|
330
|
0
|
|
|
|
|
0
|
push @{$reverseAl[$i]},$j; |
|
0
|
|
|
|
|
0
|
|
331
|
|
|
|
|
|
|
} |
332
|
|
|
|
|
|
|
} |
333
|
|
|
|
|
|
|
} |
334
|
0
|
|
|
|
|
0
|
for ($i=1;$i<@reverseAl;$i++){ |
335
|
0
|
0
|
0
|
|
|
0
|
if (!defined($reverseAl[$i]) || @{$reverseAl[$i]}==0){ |
|
0
|
|
|
|
|
0
|
|
336
|
0
|
0
|
|
|
|
0
|
if (!$al->isIn($source."Al",0,$i)){ |
337
|
0
|
|
|
|
|
0
|
push @{$al->{$source."Al"}[0]},$i; |
|
0
|
|
|
|
|
0
|
|
338
|
0
|
|
|
|
|
0
|
$al->{$source."Links"}->{"0 $i"}= ["P"]; |
339
|
|
|
|
|
|
|
} |
340
|
|
|
|
|
|
|
} |
341
|
|
|
|
|
|
|
} |
342
|
|
|
|
|
|
|
} #foreach |
343
|
|
|
|
|
|
|
} |
344
|
|
|
|
|
|
|
|
345
|
|
|
|
|
|
|
sub writeToBlinker{ |
346
|
20
|
|
|
20
|
0
|
25
|
my $al = shift; |
347
|
20
|
|
|
|
|
25
|
my $side = shift; #optional; default:"source"; |
348
|
20
|
50
|
|
|
|
49
|
if (!defined($side)){$side="source"} |
|
20
|
|
|
|
|
26
|
|
349
|
20
|
|
|
|
|
27
|
my @lines = (); |
350
|
20
|
|
|
|
|
18
|
my ($i,$j); |
351
|
|
|
|
|
|
|
|
352
|
20
|
|
|
|
|
26
|
for ($j=0;$j<@{$al->{$side."Al"}};$j++){ |
|
172
|
|
|
|
|
400
|
|
353
|
152
|
|
|
|
|
142
|
foreach $i (@{$al->{$side."Al"}[$j]}){ |
|
152
|
|
|
|
|
349
|
|
354
|
149
|
50
|
|
|
|
134
|
if (${$al->{$side."Links"}}{"$j $i"}){ |
|
149
|
|
|
|
|
351
|
|
355
|
0
|
|
|
|
|
0
|
push @lines,"$j $i ".join(" ",@{$al->{$side."Links"}{"$j $i"}}); |
|
0
|
|
|
|
|
0
|
|
356
|
|
|
|
|
|
|
}else{ |
357
|
149
|
|
|
|
|
438
|
push @lines,"$j $i"; |
358
|
|
|
|
|
|
|
} |
359
|
|
|
|
|
|
|
} |
360
|
|
|
|
|
|
|
} |
361
|
20
|
|
|
|
|
75
|
return \@lines; |
362
|
|
|
|
|
|
|
} |
363
|
|
|
|
|
|
|
|
364
|
|
|
|
|
|
|
sub writeToGiza{ |
365
|
0
|
|
|
0
|
0
|
0
|
my $al = shift; |
366
|
0
|
|
|
|
|
0
|
my $side = shift; #optional; default:"source"; |
367
|
|
|
|
|
|
|
|
368
|
|
|
|
|
|
|
# first line |
369
|
0
|
|
|
|
|
0
|
my @lines = (); |
370
|
0
|
|
|
|
|
0
|
push @lines,"#\n"; |
371
|
|
|
|
|
|
|
|
372
|
|
|
|
|
|
|
# second line |
373
|
0
|
|
|
|
|
0
|
my $invSide; |
374
|
0
|
0
|
|
|
|
0
|
if (!defined($side)){$side="source"} |
|
0
|
|
|
|
|
0
|
|
375
|
0
|
0
|
|
|
|
0
|
if ($side eq "source"){ |
376
|
0
|
|
|
|
|
0
|
$invSide="target"; |
377
|
0
|
|
|
|
|
0
|
push @lines,$al->targetSentence."\n"; |
378
|
|
|
|
|
|
|
}else{ |
379
|
0
|
|
|
|
|
0
|
$invSide="source"; |
380
|
0
|
|
|
|
|
0
|
push @lines,$al->sourceSentence."\n"; |
381
|
|
|
|
|
|
|
} |
382
|
|
|
|
|
|
|
|
383
|
|
|
|
|
|
|
# third line |
384
|
0
|
|
|
|
|
0
|
my $linksStr=""; |
385
|
0
|
|
|
|
|
0
|
for (my $j=0;$j<@{$al->{$side."Words"}};$j++){ |
|
0
|
|
|
|
|
0
|
|
386
|
0
|
|
|
|
|
0
|
$linksStr.=$al->{$side."Words"}->[$j].' ({ '; |
387
|
0
|
|
|
|
|
0
|
foreach my $i (@{$al->{$side."Al"}[$j]}){ |
|
0
|
|
|
|
|
0
|
|
388
|
0
|
|
|
|
|
0
|
$linksStr.="$i "; |
389
|
|
|
|
|
|
|
} |
390
|
0
|
|
|
|
|
0
|
$linksStr.='}) '; |
391
|
|
|
|
|
|
|
} |
392
|
0
|
|
|
|
|
0
|
$linksStr =~ s/\s+$//; |
393
|
0
|
|
|
|
|
0
|
$linksStr.="\n"; |
394
|
0
|
|
|
|
|
0
|
push @lines,$linksStr; |
395
|
|
|
|
|
|
|
# print "GIZA OUTPUT:\n",join("\n",@lines); |
396
|
0
|
|
|
|
|
0
|
return join("",@lines); |
397
|
|
|
|
|
|
|
} |
398
|
|
|
|
|
|
|
|
399
|
|
|
|
|
|
|
sub writeToTalp{ |
400
|
0
|
|
|
0
|
0
|
0
|
my $al = shift; |
401
|
0
|
|
|
|
|
0
|
my $side = shift; #optional; default:"source"; |
402
|
0
|
0
|
|
|
|
0
|
if (!defined($side)){$side="source"} |
|
0
|
|
|
|
|
0
|
|
403
|
0
|
|
|
|
|
0
|
my @lines = (); |
404
|
0
|
|
|
|
|
0
|
my ($i,$j); |
405
|
|
|
|
|
|
|
|
406
|
0
|
|
|
|
|
0
|
for ($j=0;$j<@{$al->{$side."Al"}};$j++){ |
|
0
|
|
|
|
|
0
|
|
407
|
0
|
|
|
|
|
0
|
foreach $i (@{$al->{$side."Al"}[$j]}){ |
|
0
|
|
|
|
|
0
|
|
408
|
0
|
0
|
|
|
|
0
|
if (${$al->{$side."Links"}}{"$j $i"}){ |
|
0
|
|
|
|
|
0
|
|
409
|
0
|
|
|
|
|
0
|
my $lk="$j".lc(${$al->{$side."Links"}{"$j $i"}}[0])."$i"; |
|
0
|
|
|
|
|
0
|
|
410
|
0
|
|
|
|
|
0
|
for (my $k=1;$k<@{$al->{$side."Links"}{"$j $i"}};$k++){ |
|
0
|
|
|
|
|
0
|
|
411
|
0
|
|
|
|
|
0
|
$lk.=":".${$al->{$side."Links"}{"$j $i"}}[$k]; |
|
0
|
|
|
|
|
0
|
|
412
|
|
|
|
|
|
|
} |
413
|
0
|
|
|
|
|
0
|
push @lines,$lk; |
414
|
|
|
|
|
|
|
}else{ |
415
|
0
|
|
|
|
|
0
|
push @lines,$j."-".$i; |
416
|
|
|
|
|
|
|
} |
417
|
|
|
|
|
|
|
} |
418
|
|
|
|
|
|
|
} |
419
|
0
|
|
|
|
|
0
|
return join(" ",@lines); |
420
|
|
|
|
|
|
|
} |
421
|
|
|
|
|
|
|
|
422
|
|
|
|
|
|
|
sub output { |
423
|
0
|
|
|
0
|
0
|
0
|
my ($al,$FH,$newFormat,$newFH,$newLocation,$internalSentPairNum)=@_; |
424
|
0
|
|
|
|
|
0
|
my $dumper = new Dumpvalue; |
425
|
0
|
0
|
|
|
|
0
|
if ($newFormat eq "TALP"){ |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
426
|
0
|
0
|
|
|
|
0
|
if ($newFH->{source}){ |
427
|
0
|
|
|
|
|
0
|
$newFH->{source}->print($al->sourceSentence."\n"); |
428
|
|
|
|
|
|
|
} |
429
|
0
|
0
|
|
|
|
0
|
if ($newFH->{target}){ |
430
|
0
|
|
|
|
|
0
|
$newFH->{target}->print($al->targetSentence."\n"); |
431
|
|
|
|
|
|
|
} |
432
|
0
|
0
|
|
|
|
0
|
if ($newFH->{sourceToTarget}){ |
433
|
0
|
|
|
|
|
0
|
$newFH->{sourceToTarget}->print($al->writeToTalp("source")."\n"); |
434
|
|
|
|
|
|
|
} |
435
|
0
|
0
|
|
|
|
0
|
if ($newFH->{targetToSource}){ |
436
|
0
|
|
|
|
|
0
|
$newFH->{targetToSource}->print($al->writeToTalp("target")."\n"); |
437
|
|
|
|
|
|
|
} |
438
|
|
|
|
|
|
|
}elsif ($newFormat eq "NAACL"){ |
439
|
0
|
0
|
|
|
|
0
|
if ($newFH->{source}){ |
440
|
0
|
|
|
|
|
0
|
$newFH->{source}->print(" ".$al->sourceSentence." \n"); |
441
|
|
|
|
|
|
|
} |
442
|
0
|
0
|
|
|
|
0
|
if ($newFH->{target}){ |
443
|
0
|
|
|
|
|
0
|
$newFH->{target}->print(" ".$al->targetSentence." \n"); |
444
|
|
|
|
|
|
|
} |
445
|
0
|
|
|
|
|
0
|
my $lines = $al->writeToBlinker("source"); |
446
|
0
|
|
|
|
|
0
|
foreach my $line (@$lines){ |
447
|
0
|
|
|
|
|
0
|
$newFH->{sourceToTarget}->print("$internalSentPairNum $line\n"); |
448
|
|
|
|
|
|
|
} |
449
|
0
|
0
|
|
|
|
0
|
if ($newFH->{targetToSource}){ |
450
|
0
|
|
|
|
|
0
|
$lines = $al->writeToBlinker("target"); |
451
|
0
|
|
|
|
|
0
|
foreach my $line (@$lines){ |
452
|
0
|
|
|
|
|
0
|
$newFH->{targetToSource}->print("$internalSentPairNum $line\n"); |
453
|
|
|
|
|
|
|
} |
454
|
|
|
|
|
|
|
} |
455
|
|
|
|
|
|
|
}elsif ($newFormat eq "GIZA"){ |
456
|
0
|
0
|
|
|
|
0
|
if (exists($newFH->{sourceToTarget})){ |
457
|
0
|
|
|
|
|
0
|
$newFH->{sourceToTarget}->print("".$al->writeToGiza("source")); |
458
|
|
|
|
|
|
|
} |
459
|
0
|
0
|
|
|
|
0
|
if (exists($newFH->{targetToSource})){ |
460
|
0
|
|
|
|
|
0
|
$newFH->{targetToSource}->print("".$al->writeToGiza("target")); |
461
|
|
|
|
|
|
|
} |
462
|
|
|
|
|
|
|
}elsif ($newFormat eq "BLINKER"){ |
463
|
0
|
0
|
|
|
|
0
|
if ($newFH->{source}){ |
464
|
0
|
|
|
|
|
0
|
$newFH->{source}->print($al->sourceSentence."\n"); |
465
|
|
|
|
|
|
|
} |
466
|
0
|
0
|
|
|
|
0
|
if ($newFH->{target}){ |
467
|
0
|
|
|
|
|
0
|
$newFH->{target}->print($al->targetSentence."\n"); |
468
|
|
|
|
|
|
|
} |
469
|
0
|
|
|
|
|
0
|
my $blinkerFile = $newLocation->{sourceToTarget}."/samp".$newLocation->{sampleNum}.".SentPair".($internalSentPairNum-1); |
470
|
0
|
|
0
|
|
|
0
|
open BLINKER, ">$blinkerFile" || die "Blinker file $blinkerFile opening problem:$!"; |
471
|
0
|
|
|
|
|
0
|
my $lines = $al->writeToBlinker("source"); |
472
|
0
|
|
|
|
|
0
|
foreach my $line (@$lines){ |
473
|
0
|
|
|
|
|
0
|
print BLINKER "$line\n"; |
474
|
|
|
|
|
|
|
} |
475
|
0
|
|
|
|
|
0
|
close BLINKER; |
476
|
0
|
0
|
|
|
|
0
|
if ($newLocation->{targetToSource}){ |
477
|
0
|
|
|
|
|
0
|
$blinkerFile = $newLocation->{targetToSource}."/samp".$newLocation->{sampleNum}.".SentPair".($internalSentPairNum-1); |
478
|
0
|
|
0
|
|
|
0
|
open BLINKER, ">$blinkerFile" || die "Blinker file $blinkerFile opening problem:$!"; |
479
|
0
|
|
|
|
|
0
|
my $lines = $al->writeToBlinker("target"); |
480
|
0
|
|
|
|
|
0
|
foreach my $line (@$lines){ |
481
|
0
|
|
|
|
|
0
|
print BLINKER "$line\n"; |
482
|
|
|
|
|
|
|
} |
483
|
0
|
|
|
|
|
0
|
close BLINKER; |
484
|
|
|
|
|
|
|
} |
485
|
|
|
|
|
|
|
}else { |
486
|
0
|
|
|
|
|
0
|
die "Output to format $newFormat is not implemented yet."; |
487
|
|
|
|
|
|
|
} |
488
|
|
|
|
|
|
|
} |
489
|
|
|
|
|
|
|
|
490
|
|
|
|
|
|
|
sub displayAsLinkEnumeration { |
491
|
0
|
|
|
0
|
0
|
0
|
my ($al,$format,$latex) = @_; |
492
|
0
|
|
|
|
|
0
|
my $lines=""; |
493
|
|
|
|
|
|
|
|
494
|
|
|
|
|
|
|
|
495
|
0
|
0
|
|
|
|
0
|
if ($format eq "text"){ |
|
|
0
|
|
|
|
|
|
496
|
0
|
|
|
|
|
0
|
my ($correspPosition,$wordPosition); |
497
|
|
|
|
|
|
|
|
498
|
0
|
|
|
|
|
0
|
$lines.= join(" ",@{$al->{sourceWords}})."\n"; |
|
0
|
|
|
|
|
0
|
|
499
|
0
|
|
|
|
|
0
|
$lines.= join(" ",@{$al->{targetWords}})."\n\n"; |
|
0
|
|
|
|
|
0
|
|
500
|
|
|
|
|
|
|
|
501
|
0
|
|
|
|
|
0
|
for ($wordPosition=0;$wordPosition<@{$al->{sourceWords}};$wordPosition++){ |
|
0
|
|
|
|
|
0
|
|
502
|
0
|
|
|
|
|
0
|
$lines.= @{$al->{sourceWords}}[$wordPosition]." <- "; |
|
0
|
|
|
|
|
0
|
|
503
|
0
|
|
|
|
|
0
|
foreach $correspPosition (@{$al->{sourceAl}[$wordPosition]}){ |
|
0
|
|
|
|
|
0
|
|
504
|
0
|
|
|
|
|
0
|
$lines.= $al->{targetWords}[$correspPosition]." "; |
505
|
|
|
|
|
|
|
} |
506
|
0
|
|
|
|
|
0
|
$lines.= "\n"; |
507
|
|
|
|
|
|
|
} |
508
|
0
|
|
|
|
|
0
|
$lines.="\n\n"; |
509
|
|
|
|
|
|
|
}elsif ($format eq "latex"){ |
510
|
0
|
|
|
|
|
0
|
my $numRowTokens = @{$al->{sourceWords}}; |
|
0
|
|
|
|
|
0
|
|
511
|
0
|
|
|
|
|
0
|
my $numColTokens = @{$al->{targetWords}}; |
|
0
|
|
|
|
|
0
|
|
512
|
0
|
|
|
|
|
0
|
my ($i,$j,$elt); |
513
|
0
|
|
|
|
|
0
|
my ($j_partOf_Bi,$i_partOf_Bj); |
514
|
0
|
|
|
|
|
0
|
my ($targetWord,$sourceWord); |
515
|
|
|
|
|
|
|
|
516
|
0
|
|
|
|
|
0
|
$lines.= $latex->fromText("\n".join(" ",@{$al->{sourceWords}})."\n"); |
|
0
|
|
|
|
|
0
|
|
517
|
0
|
|
|
|
|
0
|
$lines.= $latex->fromText(join(" ",@{$al->{targetWords}})."\n\n").'\vspace{5mm}'."\n"; |
|
0
|
|
|
|
|
0
|
|
518
|
|
|
|
|
|
|
|
519
|
0
|
|
|
|
|
0
|
for ($j=0; $j<$numRowTokens;$j++){ |
520
|
0
|
|
|
|
|
0
|
for ($i=0;$i<$numColTokens;$i++){ |
521
|
0
|
|
|
|
|
0
|
$targetWord = $latex->fromText($al->{targetWords}[$i]); |
522
|
0
|
|
|
|
|
0
|
$sourceWord = $latex->fromText($al->{sourceWords}[$j]); |
523
|
0
|
|
|
|
|
0
|
$i_partOf_Bj = $al->isIn("sourceAl",$j,$i); |
524
|
0
|
|
|
|
|
0
|
$j_partOf_Bi = $al->isIn("targetAl",$i,$j); |
525
|
0
|
0
|
|
|
|
0
|
if ($i_partOf_Bj > 0) { #ie i=aj |
526
|
0
|
0
|
|
|
|
0
|
if ($j_partOf_Bi > 0){ |
527
|
0
|
|
|
|
|
0
|
$lines.= $sourceWord.' \boldmath $\leftrightarrow$ '.$targetWord." \n\n"; |
528
|
|
|
|
|
|
|
}else{ |
529
|
0
|
|
|
|
|
0
|
$lines.= $sourceWord.' \boldmath $\leftarrow$ '.$targetWord." \n\n"; |
530
|
|
|
|
|
|
|
} |
531
|
|
|
|
|
|
|
}else{ |
532
|
0
|
0
|
|
|
|
0
|
if ($j_partOf_Bi > 0){ |
533
|
0
|
|
|
|
|
0
|
$lines.= $sourceWord.' \boldmath $\rightarrow$ '.$targetWord." \n\n"; |
534
|
|
|
|
|
|
|
}else{ |
535
|
|
|
|
|
|
|
} |
536
|
|
|
|
|
|
|
} |
537
|
|
|
|
|
|
|
} |
538
|
|
|
|
|
|
|
} |
539
|
0
|
|
|
|
|
0
|
$lines.= "\n\n".'\vspace{7mm}'; |
540
|
|
|
|
|
|
|
} #elsif $format eq latex |
541
|
0
|
|
|
|
|
0
|
return $lines; |
542
|
|
|
|
|
|
|
} |
543
|
|
|
|
|
|
|
|
544
|
|
|
|
|
|
|
sub displayAsMatrix { |
545
|
0
|
|
|
0
|
0
|
0
|
my ($al,$latex,$mark,$maxRows,$maxCols)= @_; |
546
|
0
|
|
|
|
|
0
|
my $matrix = ""; |
547
|
0
|
|
|
|
|
0
|
my ($mark_ji,$mark_ij); |
548
|
0
|
|
|
|
|
0
|
my $mark_ji_cross='\boldmath $-$'; |
549
|
0
|
|
|
|
|
0
|
my $numRowTokens = @{$al->{sourceWords}}; |
|
0
|
|
|
|
|
0
|
|
550
|
0
|
|
|
|
|
0
|
my $numColTokens = @{$al->{targetWords}}; |
|
0
|
|
|
|
|
0
|
|
551
|
0
|
|
|
|
|
0
|
my ($i,$j,$elt); |
552
|
0
|
|
|
|
|
0
|
my ($j_partOf_Bi,$i_partOf_Bj); |
553
|
0
|
|
|
|
|
0
|
my $offset; |
554
|
|
|
|
|
|
|
|
555
|
0
|
0
|
|
|
|
0
|
if ($numRowTokens>$maxRows){return $al->displayAsLinkEnumeration("latex",$latex)} |
|
0
|
|
|
|
|
0
|
|
556
|
|
|
|
|
|
|
|
557
|
0
|
|
|
|
|
0
|
$matrix.= $latex->fromText("\n".join(" ",@{$al->{sourceWords}})."\n"); |
|
0
|
|
|
|
|
0
|
|
558
|
0
|
|
|
|
|
0
|
$matrix.= $latex->fromText(join(" ",@{$al->{targetWords}})."\n\n").'\vspace{5mm}'; |
|
0
|
|
|
|
|
0
|
|
559
|
|
|
|
|
|
|
|
560
|
0
|
|
|
|
|
0
|
for ($offset=0;$offset<$numColTokens;$offset+=$maxCols){ |
561
|
0
|
|
|
|
|
0
|
$matrix.= "\n".'\begin{tabular}{l'."c" x $numColTokens.'}'; |
562
|
0
|
|
|
|
|
0
|
for ($j=$numRowTokens-1;$j>=0;$j--){ |
563
|
0
|
|
|
|
|
0
|
$matrix.= "\n".$latex->fromText($al->{sourceWords}[$j]); |
564
|
0
|
|
0
|
|
|
0
|
for ($i=$offset;$i<$numColTokens && $i<($offset+$maxCols);$i++){ |
565
|
0
|
|
|
|
|
0
|
$i_partOf_Bj = $al->isIn("sourceAl",$j,$i); |
566
|
0
|
|
|
|
|
0
|
$j_partOf_Bi = $al->isIn("targetAl",$i,$j); |
567
|
0
|
0
|
|
|
|
0
|
if ($mark eq "cross"){$mark_ji=$mark_ji_cross} |
|
0
|
0
|
|
|
|
0
|
|
|
|
0
|
|
|
|
|
|
568
|
|
|
|
|
|
|
elsif ($mark eq "ambiguity"){ |
569
|
0
|
0
|
|
|
|
0
|
if (length($al->{sourceLinks}->{"$j $i"}[0])>0){$mark_ji=$al->{sourceLinks}->{"$j $i"}[0]} |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
570
|
|
|
|
|
|
|
else {$mark_ji = $mark_ji_cross} |
571
|
|
|
|
|
|
|
} |
572
|
0
|
|
|
|
|
0
|
elsif ($mark eq "confidence"){ |
573
|
0
|
0
|
|
|
|
0
|
if (length($al->{sourceLinks}->{"$j $i"}[1])>0){$mark_ji=$al->{sourceLinks}->{"$j $i"}[1]} |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
574
|
|
|
|
|
|
|
else {$mark_ji = $mark_ji_cross} |
575
|
|
|
|
|
|
|
} |
576
|
|
|
|
|
|
|
else {$mark_ji = $mark} |
577
|
0
|
0
|
|
|
|
0
|
if ($mark eq "ambiguity"){ |
|
|
0
|
|
|
|
|
|
578
|
0
|
0
|
|
|
|
0
|
if (length($al->{targetLinks}->{"$i $j"}[0])>0){$mark_ij='\ver{'.$al->{targetLinks}->{"$i $j"}[0].'}'} |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
579
|
|
|
|
|
|
|
else {$mark_ij = '\ver{'.$mark_ji_cross.'}'} |
580
|
0
|
|
|
|
|
0
|
}elsif ($mark eq "confidence"){ |
581
|
0
|
0
|
|
|
|
0
|
if (length($al->{targetLinks}->{"$i $j"}[1])>0){$mark_ij='\ver{'.$al->{targetLinks}->{"$i $j"}[1].'}'} |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
582
|
|
|
|
|
|
|
else {$mark_ij = '\ver{'.$mark_ji_cross.'}'} |
583
|
|
|
|
|
|
|
}else{$mark_ij = '\ver{'.$mark_ji.'}'} |
584
|
|
|
|
|
|
|
|
585
|
0
|
|
|
|
|
0
|
$matrix.= "&"; |
586
|
0
|
0
|
|
|
|
0
|
if ($i_partOf_Bj > 0) { #ie i=aj |
587
|
0
|
0
|
|
|
|
0
|
if ($j_partOf_Bi > 0){ |
588
|
0
|
0
|
0
|
|
|
0
|
if ($mark_ji eq '\boldmath $-$' && $mark_ij eq '\ver{\boldmath $-$}'){ |
589
|
0
|
|
|
|
|
0
|
$matrix.= ' \boldmath ${+}$ '; |
590
|
|
|
|
|
|
|
}else{ |
591
|
0
|
|
|
|
|
0
|
$matrix.= " $mark_ji$mark_ij "; |
592
|
|
|
|
|
|
|
} |
593
|
|
|
|
|
|
|
}else{ |
594
|
0
|
|
|
|
|
0
|
$matrix.= " $mark_ji "; |
595
|
|
|
|
|
|
|
} |
596
|
|
|
|
|
|
|
}else{ |
597
|
0
|
0
|
|
|
|
0
|
if ($j_partOf_Bi > 0){ |
598
|
0
|
|
|
|
|
0
|
$matrix.= " $mark_ij "; |
599
|
|
|
|
|
|
|
}else{ |
600
|
0
|
|
|
|
|
0
|
$matrix.= ' . '; |
601
|
|
|
|
|
|
|
} |
602
|
|
|
|
|
|
|
} |
603
|
|
|
|
|
|
|
} #for j=... |
604
|
0
|
|
|
|
|
0
|
$matrix.= ' \\\\'; |
605
|
|
|
|
|
|
|
} #for i=... |
606
|
|
|
|
|
|
|
# last line |
607
|
0
|
|
|
|
|
0
|
$matrix.= "\n "; |
608
|
0
|
|
0
|
|
|
0
|
for ($i=$offset;$i<$numColTokens && $i<($offset+$maxCols);$i++){ |
609
|
0
|
|
|
|
|
0
|
$matrix.= ' & '.'\ver{'.$latex->fromText($al->{targetWords}[$i]).'}'; |
610
|
|
|
|
|
|
|
} |
611
|
0
|
|
|
|
|
0
|
$matrix.= ' \\\\'; |
612
|
0
|
|
|
|
|
0
|
$matrix.= "\n".'\end{tabular}'."\n\n".'\vspace{7mm}'; |
613
|
|
|
|
|
|
|
} # loop on number of matrices |
614
|
|
|
|
|
|
|
|
615
|
0
|
|
|
|
|
0
|
return $matrix; |
616
|
|
|
|
|
|
|
} |
617
|
|
|
|
|
|
|
|
618
|
|
|
|
|
|
|
|
619
|
|
|
|
|
|
|
# prohibits situations of the type: if linked(e,f) and linked(e',f) and linked(e',f') but not linked(e,f') |
620
|
|
|
|
|
|
|
# in this case the function links e and f'. |
621
|
|
|
|
|
|
|
sub forceGroupConsistency { |
622
|
0
|
|
|
0
|
0
|
0
|
my ($al,$mode,$lex1,$lex2) = @_; |
623
|
|
|
|
|
|
|
#defaults: |
624
|
0
|
0
|
|
|
|
0
|
if (!defined($mode)){$mode=""} |
|
0
|
|
|
|
|
0
|
|
625
|
0
|
|
|
|
|
0
|
my $dumper = new Dumpvalue; |
626
|
0
|
|
|
|
|
0
|
my $cloneAl = {}; |
627
|
0
|
|
|
|
|
0
|
foreach my $source (("source","target")){ |
628
|
|
|
|
|
|
|
# SELECT ONLY S LINKS |
629
|
0
|
|
|
|
|
0
|
my $sal = $al->SLinks(); |
630
|
|
|
|
|
|
|
#first we divide the alignment in clusters of positions linked between each other |
631
|
0
|
|
|
|
|
0
|
my $groups=$sal->getAlClusters($source); |
632
|
|
|
|
|
|
|
|
633
|
|
|
|
|
|
|
#delete alignment |
634
|
0
|
0
|
0
|
|
|
0
|
if (defined($sal->{$source."Al"}) && @{$sal->{$source."Al"}}>0){ |
|
0
|
|
|
|
|
0
|
|
635
|
0
|
|
|
|
|
0
|
for (my $j=0;$j<@{$sal->{$source."Al"}};$j++){ |
|
0
|
|
|
|
|
0
|
|
636
|
0
|
|
|
|
|
0
|
$sal->{$source."Al"}[$j]=[]; |
637
|
|
|
|
|
|
|
} |
638
|
|
|
|
|
|
|
} |
639
|
|
|
|
|
|
|
|
640
|
|
|
|
|
|
|
# print "BEFORE alignment:\n"; |
641
|
|
|
|
|
|
|
# print $dumper->dumpValue($al->{$source."Al"}); |
642
|
|
|
|
|
|
|
# print "CLUSTERS:\n"; |
643
|
|
|
|
|
|
|
# print $dumper->dumpValue($groups); |
644
|
|
|
|
|
|
|
|
645
|
|
|
|
|
|
|
#then we check that all the links within each cluster exist, and create them if they don't |
646
|
0
|
|
|
|
|
0
|
my $g; |
647
|
0
|
|
|
|
|
0
|
for ($g=0;$g<@$groups;$g++){ |
648
|
0
|
0
|
|
|
|
0
|
if ($mode eq "contiguous"){ |
649
|
0
|
|
|
|
|
0
|
my $sContiguousSeqs=Lingua::AlSetLib::getContiguousSequences ($groups->[$g]{source}); |
650
|
0
|
|
|
|
|
0
|
my $tContiguousSeqs=Lingua::AlSetLib::getContiguousSequences ($groups->[$g]{target}); |
651
|
0
|
0
|
0
|
|
|
0
|
if (@$sContiguousSeqs > 1 || @$tContiguousSeqs > 1){ |
652
|
|
|
|
|
|
|
# print "CLUSTER:\n"; |
653
|
|
|
|
|
|
|
# print $dumper->dumpValue($groups->[$g]); |
654
|
0
|
|
|
|
|
0
|
my ($bestIbm1Prob,$bestSourceSeq,$bestTargetSeq)=(0,0,0); |
655
|
0
|
|
|
|
|
0
|
for (my $sc=0;$sc<@$sContiguousSeqs;$sc++){ |
656
|
0
|
|
|
|
|
0
|
my $sPhrase = $sal->printPhrase("source",$sContiguousSeqs->[$sc]); |
657
|
0
|
|
|
|
|
0
|
for (my $tc=0;$tc<@$tContiguousSeqs;$tc++){ |
658
|
0
|
|
|
|
|
0
|
my $tPhrase = $sal->printPhrase("target",$tContiguousSeqs->[$tc]); |
659
|
0
|
|
|
|
|
0
|
my $ibm1t_s = Lingua::AlSetLib::ibm1Prob ($sPhrase,$tPhrase,$lex1); |
660
|
0
|
|
|
|
|
0
|
my $ibm1s_t; |
661
|
0
|
0
|
|
|
|
0
|
if (defined($lex2)){ |
662
|
0
|
|
|
|
|
0
|
$ibm1s_t = Lingua::AlSetLib::ibm1Prob ($tPhrase,$sPhrase,$lex2); |
663
|
|
|
|
|
|
|
}else{ |
664
|
0
|
|
|
|
|
0
|
$ibm1s_t = $ibm1t_s; |
665
|
|
|
|
|
|
|
} |
666
|
0
|
|
|
|
|
0
|
my $ibm1 = 0.5*($ibm1t_s+$ibm1s_t); |
667
|
|
|
|
|
|
|
# print "$sPhrase ||| $tPhrase ||| $ibm1t_s -- $ibm1s_t ==> $ibm1\n"; |
668
|
0
|
0
|
|
|
|
0
|
if ($ibm1 > $bestIbm1Prob){ |
669
|
0
|
|
|
|
|
0
|
$bestIbm1Prob=$ibm1; |
670
|
0
|
|
|
|
|
0
|
$bestSourceSeq=$sc; |
671
|
0
|
|
|
|
|
0
|
$bestTargetSeq=$tc; |
672
|
|
|
|
|
|
|
} |
673
|
|
|
|
|
|
|
} |
674
|
|
|
|
|
|
|
} |
675
|
|
|
|
|
|
|
|
676
|
0
|
|
|
|
|
0
|
@{$groups->[$g]{source}}=@{$sContiguousSeqs->[$bestSourceSeq]}; |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
677
|
0
|
|
|
|
|
0
|
@{$groups->[$g]{target}}=@{$tContiguousSeqs->[$bestTargetSeq]}; |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
678
|
|
|
|
|
|
|
|
679
|
|
|
|
|
|
|
# print " contiguous CLUSTER:\n"; |
680
|
|
|
|
|
|
|
# print $dumper->dumpValue($groups->[$g]); |
681
|
|
|
|
|
|
|
# print "best: ".$al->printPhrase('source',$groups->[$g]{source})." | ".$al->printPhrase('target',$groups->[$g]{target})."\n"; |
682
|
|
|
|
|
|
|
} |
683
|
|
|
|
|
|
|
} |
684
|
0
|
|
|
|
|
0
|
foreach my $j (@{$groups->[$g]{source}}){ |
|
0
|
|
|
|
|
0
|
|
685
|
0
|
|
|
|
|
0
|
foreach my $i (@{$groups->[$g]{target}}){ |
|
0
|
|
|
|
|
0
|
|
686
|
0
|
0
|
|
|
|
0
|
if (!$al->isIn($source."Al",$j,$i)){ |
687
|
0
|
|
|
|
|
0
|
push @{$al->{$source."Al"}[$j]},$i; |
|
0
|
|
|
|
|
0
|
|
688
|
|
|
|
|
|
|
}else{ # move from P to S links |
689
|
0
|
|
|
|
|
0
|
@{$al->{$source."Links"}->{"$j $i"}}[0]=""; |
|
0
|
|
|
|
|
0
|
|
690
|
|
|
|
|
|
|
} |
691
|
|
|
|
|
|
|
} |
692
|
|
|
|
|
|
|
} |
693
|
|
|
|
|
|
|
} |
694
|
|
|
|
|
|
|
# print "CLUSTERS after:\n"; |
695
|
|
|
|
|
|
|
# print $dumper->dumpValue($groups); |
696
|
|
|
|
|
|
|
# print "alignment AFTER:\n"; |
697
|
|
|
|
|
|
|
# print $dumper->dumpValue($al->{$source."Al"}); |
698
|
|
|
|
|
|
|
} #foreach $side |
699
|
|
|
|
|
|
|
} |
700
|
|
|
|
|
|
|
|
701
|
|
|
|
|
|
|
##################################################### |
702
|
|
|
|
|
|
|
### SYMMETRIZATION SUBS ### |
703
|
|
|
|
|
|
|
##################################################### |
704
|
|
|
|
|
|
|
# input: alignment object |
705
|
|
|
|
|
|
|
# output: intersection of source and target alignments of this object |
706
|
|
|
|
|
|
|
sub intersect { |
707
|
0
|
|
|
0
|
0
|
0
|
my $al = shift; |
708
|
0
|
|
|
|
|
0
|
my $intersectSourceAl=[]; |
709
|
0
|
|
|
|
|
0
|
my $intersectTargetAl=[]; |
710
|
0
|
|
|
|
|
0
|
my ($i,$j,$ind); |
711
|
|
|
|
|
|
|
|
712
|
0
|
0
|
0
|
|
|
0
|
if (@{$al->{targetAl}}>0 && @{$al->{sourceAl}}>0){ |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
713
|
|
|
|
|
|
|
#for each link in sourceAl, look if it's present in targetAl |
714
|
0
|
|
|
|
|
0
|
for ($j=0;$j<@{$al->{sourceAl}};$j++){ |
|
0
|
|
|
|
|
0
|
|
715
|
0
|
0
|
|
|
|
0
|
if (defined($al->{sourceAl}[$j])){ |
716
|
0
|
|
|
|
|
0
|
foreach $i (@{$al->{sourceAl}[$j]}){ |
|
0
|
|
|
|
|
0
|
|
717
|
0
|
0
|
|
|
|
0
|
if ($al->isIn("targetAl",$i,$j)){ |
718
|
0
|
|
|
|
|
0
|
push @{$intersectSourceAl->[$j]},$i; |
|
0
|
|
|
|
|
0
|
|
719
|
0
|
|
|
|
|
0
|
push @{$intersectTargetAl->[$i]},$j; |
|
0
|
|
|
|
|
0
|
|
720
|
|
|
|
|
|
|
} |
721
|
|
|
|
|
|
|
} |
722
|
|
|
|
|
|
|
} #if defined |
723
|
|
|
|
|
|
|
} |
724
|
|
|
|
|
|
|
} #if targetAl is an empty array, then from the intersection sourceAl remains empty |
725
|
0
|
|
|
|
|
0
|
@{$al->{sourceAl}}=@{$intersectSourceAl}; |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
726
|
0
|
|
|
|
|
0
|
@{$al->{targetAl}}=@{$intersectTargetAl}; |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
727
|
|
|
|
|
|
|
} |
728
|
|
|
|
|
|
|
|
729
|
|
|
|
|
|
|
# input: alignment object |
730
|
|
|
|
|
|
|
# output: union of source and target alignments of this object |
731
|
|
|
|
|
|
|
sub getUnion { |
732
|
0
|
|
|
0
|
0
|
0
|
my $al=shift; |
733
|
0
|
|
|
|
|
0
|
my %union; |
734
|
0
|
|
|
|
|
0
|
$union{sourceAl}=[]; |
735
|
0
|
|
|
|
|
0
|
$union{targetAl}=[]; |
736
|
0
|
|
|
|
|
0
|
my ($j,$i,$ind); |
737
|
0
|
|
|
|
|
0
|
my %side=("source"=>"target","target"=>"source"); |
738
|
0
|
|
|
|
|
0
|
my ($source,$target); |
739
|
|
|
|
|
|
|
|
740
|
0
|
0
|
0
|
|
|
0
|
if (@{$al->{targetAl}}>0 && @{$al->{sourceAl}}>0){ |
|
0
|
0
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
741
|
0
|
|
|
|
|
0
|
while (($source,$target)= each(%side)){ |
742
|
0
|
|
|
|
|
0
|
for ($j=0;$j<@{$al->{$source."Al"}};$j++){ |
|
0
|
|
|
|
|
0
|
|
743
|
0
|
0
|
|
|
|
0
|
if (defined($al->{$source."Al"}[$j])){ |
744
|
0
|
|
|
|
|
0
|
foreach $i (@{$al->{$source."Al"}[$j]}){ |
|
0
|
|
|
|
|
0
|
|
745
|
0
|
|
|
|
|
0
|
push @{$union{$source."Al"}->[$j]},$i; |
|
0
|
|
|
|
|
0
|
|
746
|
0
|
0
|
|
|
|
0
|
if (!$al->isIn($target."Al",$i,$j)){ |
747
|
0
|
|
|
|
|
0
|
push @{$union{$target."Al"}->[$i]},$j; |
|
0
|
|
|
|
|
0
|
|
748
|
|
|
|
|
|
|
} |
749
|
|
|
|
|
|
|
} #foreach |
750
|
|
|
|
|
|
|
} |
751
|
|
|
|
|
|
|
} #for |
752
|
|
|
|
|
|
|
} |
753
|
|
|
|
|
|
|
}elsif (@{$al->{sourceAl}}>0){ |
754
|
0
|
|
|
|
|
0
|
@{$union{sourceAl}}=@{$al->{sourceAl}}; |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
755
|
|
|
|
|
|
|
}else{ |
756
|
0
|
|
|
|
|
0
|
@{$union{targetAl}}=@{$al->{targetAl}}; |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
757
|
|
|
|
|
|
|
} |
758
|
0
|
|
|
|
|
0
|
@{$al->{sourceAl}}=@{$union{sourceAl}}; |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
759
|
0
|
|
|
|
|
0
|
@{$al->{targetAl}}=@{$union{targetAl}}; |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
760
|
|
|
|
|
|
|
} |
761
|
|
|
|
|
|
|
|
762
|
|
|
|
|
|
|
# input: alignment object |
763
|
|
|
|
|
|
|
# output: this object where only the links of the side (source or target) with most links are selected |
764
|
|
|
|
|
|
|
sub selectSideWithLinks{ |
765
|
0
|
|
|
0
|
0
|
0
|
my ($al,$criterion,$dontCountNull)=@_; |
766
|
|
|
|
|
|
|
#defaults |
767
|
0
|
0
|
|
|
|
0
|
if (!defined($criterion)){$criterion="most"} |
|
0
|
|
|
|
|
0
|
|
768
|
0
|
0
|
|
|
|
0
|
if (!defined($dontCountNull)){$dontCountNull=1} |
|
0
|
|
|
|
|
0
|
|
769
|
0
|
|
|
|
|
0
|
my ($j,$i,$firstInd); |
770
|
0
|
|
|
|
|
0
|
my ($numSource,$numTarget)=(0,0); |
771
|
0
|
|
|
|
|
0
|
my $sourceAl=[]; |
772
|
0
|
|
|
|
|
0
|
my $targetAl=[]; |
773
|
|
|
|
|
|
|
|
774
|
0
|
0
|
|
|
|
0
|
if ($dontCountNull){$firstInd=1} |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
775
|
|
|
|
|
|
|
else {$firstInd=0} |
776
|
|
|
|
|
|
|
#count links |
777
|
0
|
|
|
|
|
0
|
for ($j=$firstInd;$j<@{$al->{sourceAl}};$j++){ |
|
0
|
|
|
|
|
0
|
|
778
|
0
|
0
|
|
|
|
0
|
if (defined($al->{sourceAl}[$j])){ |
779
|
0
|
0
|
|
|
|
0
|
if (!$dontCountNull){ |
780
|
0
|
|
|
|
|
0
|
$numSource+=@{$al->{sourceAl}[$j]}; |
|
0
|
|
|
|
|
0
|
|
781
|
|
|
|
|
|
|
}else{ |
782
|
0
|
|
|
|
|
0
|
foreach $i (@{$al->{sourceAl}[$j]}){ |
|
0
|
|
|
|
|
0
|
|
783
|
0
|
0
|
|
|
|
0
|
if ($i!=0){$numSource++} |
|
0
|
|
|
|
|
0
|
|
784
|
|
|
|
|
|
|
} |
785
|
|
|
|
|
|
|
} |
786
|
|
|
|
|
|
|
} |
787
|
|
|
|
|
|
|
} |
788
|
0
|
|
|
|
|
0
|
for ($i=$firstInd;$i<@{$al->{targetAl}};$i++){ |
|
0
|
|
|
|
|
0
|
|
789
|
0
|
0
|
|
|
|
0
|
if (defined($al->{targetAl}[$i])){ |
790
|
0
|
0
|
|
|
|
0
|
if (!$dontCountNull){ |
791
|
0
|
|
|
|
|
0
|
$numTarget+=@{$al->{targetAl}[$i]}; |
|
0
|
|
|
|
|
0
|
|
792
|
|
|
|
|
|
|
}else{ |
793
|
0
|
|
|
|
|
0
|
foreach $j (@{$al->{targetAl}[$i]}){ |
|
0
|
|
|
|
|
0
|
|
794
|
0
|
0
|
|
|
|
0
|
if ($j!=0){$numTarget++} |
|
0
|
|
|
|
|
0
|
|
795
|
|
|
|
|
|
|
} |
796
|
|
|
|
|
|
|
} |
797
|
|
|
|
|
|
|
} |
798
|
|
|
|
|
|
|
} |
799
|
|
|
|
|
|
|
#select side with (most,least) links |
800
|
0
|
0
|
0
|
|
|
0
|
if ( ($numSource>=$numTarget && $criterion eq "most") || ($numSource<$numTarget && $criterion ne "most")){ #select sourceAl |
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
801
|
0
|
|
|
|
|
0
|
for ($j=0;$j<@{$al->{sourceAl}};$j++){ |
|
0
|
|
|
|
|
0
|
|
802
|
0
|
0
|
|
|
|
0
|
if (defined($al->{sourceAl}[$j])){ |
803
|
0
|
|
|
|
|
0
|
foreach $i (@{$al->{sourceAl}[$j]}){ |
|
0
|
|
|
|
|
0
|
|
804
|
0
|
|
|
|
|
0
|
push @{$sourceAl->[$j]},$i; |
|
0
|
|
|
|
|
0
|
|
805
|
0
|
|
|
|
|
0
|
push @{$targetAl->[$i]},$j; |
|
0
|
|
|
|
|
0
|
|
806
|
|
|
|
|
|
|
} |
807
|
|
|
|
|
|
|
} |
808
|
|
|
|
|
|
|
} |
809
|
|
|
|
|
|
|
}else{ #select targetAl |
810
|
0
|
|
|
|
|
0
|
for ($i=0;$i<@{$al->{targetAl}};$i++){ |
|
0
|
|
|
|
|
0
|
|
811
|
0
|
0
|
|
|
|
0
|
if (defined($al->{targetAl}[$i])){ |
812
|
0
|
|
|
|
|
0
|
foreach $j (@{$al->{targetAl}[$i]}){ |
|
0
|
|
|
|
|
0
|
|
813
|
0
|
|
|
|
|
0
|
push @{$sourceAl->[$j]},$i; |
|
0
|
|
|
|
|
0
|
|
814
|
0
|
|
|
|
|
0
|
push @{$targetAl->[$i]},$j; |
|
0
|
|
|
|
|
0
|
|
815
|
|
|
|
|
|
|
} |
816
|
|
|
|
|
|
|
} |
817
|
|
|
|
|
|
|
} |
818
|
|
|
|
|
|
|
} |
819
|
0
|
|
|
|
|
0
|
@{$al->{sourceAl}}=@$sourceAl; |
|
0
|
|
|
|
|
0
|
|
820
|
0
|
|
|
|
|
0
|
@{$al->{targetAl}}=@$targetAl; |
|
0
|
|
|
|
|
0
|
|
821
|
|
|
|
|
|
|
} |
822
|
|
|
|
|
|
|
|
823
|
|
|
|
|
|
|
sub selectSideWithMostLinks{ |
824
|
0
|
|
|
0
|
0
|
0
|
my $al=shift; |
825
|
0
|
|
|
|
|
0
|
return $al->selectSideWithLinks("most"); |
826
|
|
|
|
|
|
|
} |
827
|
|
|
|
|
|
|
sub selectSideWithLeastLinks{ |
828
|
0
|
|
|
0
|
0
|
0
|
my $al=shift; |
829
|
0
|
|
|
|
|
0
|
return $al->selectSideWithLinks("least"); |
830
|
|
|
|
|
|
|
} |
831
|
|
|
|
|
|
|
|
832
|
|
|
|
|
|
|
# input: alignment object |
833
|
|
|
|
|
|
|
# output: alignment object where source and target have been swapped |
834
|
|
|
|
|
|
|
sub swapSourceTarget{ |
835
|
0
|
|
|
0
|
0
|
0
|
my $al=shift; |
836
|
0
|
|
|
|
|
0
|
my ($link,$ref,$j,$i,$source); |
837
|
0
|
|
|
|
|
0
|
my @st; |
838
|
0
|
|
|
|
|
0
|
my @sides=("source","target"); |
839
|
0
|
|
|
|
|
0
|
my $swappedAl={ "sourceAl"=>[], |
840
|
|
|
|
|
|
|
"targetAl"=>[], |
841
|
|
|
|
|
|
|
"sourceWords"=>$al->{targetWords}, |
842
|
|
|
|
|
|
|
"targetWords"=>$al->{sourceWords}, |
843
|
|
|
|
|
|
|
"sourceLinks"=>{}, |
844
|
|
|
|
|
|
|
"targetLinks"=>{}}; |
845
|
|
|
|
|
|
|
|
846
|
0
|
|
|
|
|
0
|
foreach $source (@sides){ |
847
|
0
|
|
|
|
|
0
|
for ($j=0;$j<@{$al->{$source."Al"}};$j++){ |
|
0
|
|
|
|
|
0
|
|
848
|
0
|
|
|
|
|
0
|
foreach $i (@{$al->{$source."Al"}[$j]}){ |
|
0
|
|
|
|
|
0
|
|
849
|
0
|
|
|
|
|
0
|
push @{$swappedAl->{$source."Al"}[$i]},$j; |
|
0
|
|
|
|
|
0
|
|
850
|
|
|
|
|
|
|
} |
851
|
|
|
|
|
|
|
} |
852
|
|
|
|
|
|
|
#insert ref to empty array instead of undef entries |
853
|
0
|
|
|
|
|
0
|
for ($j=0;$j<@{$swappedAl->{$source."Al"}};$j++){ |
|
0
|
|
|
|
|
0
|
|
854
|
0
|
0
|
|
|
|
0
|
if (!defined($swappedAl->{$source."Al"}[$j])){ |
855
|
0
|
|
|
|
|
0
|
$swappedAl->{$source."Al"}[$j]=[]; |
856
|
|
|
|
|
|
|
} |
857
|
|
|
|
|
|
|
} |
858
|
|
|
|
|
|
|
# and now the sourceLinks |
859
|
0
|
|
|
|
|
0
|
while (($link,$ref)=each(%{$al->{$source."Links"}})){ |
|
0
|
|
|
|
|
0
|
|
860
|
0
|
|
|
|
|
0
|
@st=split(" ",$link); |
861
|
0
|
|
|
|
|
0
|
$swappedAl->{$source."Links"}{"$st[1] $st[0]"}=$ref; |
862
|
|
|
|
|
|
|
} |
863
|
|
|
|
|
|
|
} |
864
|
0
|
|
|
|
|
0
|
%$al=%$swappedAl; |
865
|
|
|
|
|
|
|
} |
866
|
|
|
|
|
|
|
|
867
|
|
|
|
|
|
|
|
868
|
|
|
|
|
|
|
|
869
|
|
|
|
|
|
|
# input: al object, offset, length, side (src or trg), ref to word list to be added, ref to a list of positions of the other side (to which all added words will be linked). |
870
|
|
|
|
|
|
|
# output: Alignment object where given positions are sustituted by the words |
871
|
|
|
|
|
|
|
# |
872
|
|
|
|
|
|
|
# notes: 1) in case of deleting various words: |
873
|
|
|
|
|
|
|
# - all added words are linked to all positions to which deleted words were linked (except if you provided a list of positions of the other side, in which case all added words are linked to those positions). |
874
|
|
|
|
|
|
|
# - $al->{sourceLinks} information can be lost for these words. |
875
|
|
|
|
|
|
|
# 2) Does not work for targetAl alignment |
876
|
|
|
|
|
|
|
# 3) more efficient in "source" side than in "target" |
877
|
|
|
|
|
|
|
|
878
|
|
|
|
|
|
|
sub splice { |
879
|
0
|
|
|
0
|
0
|
0
|
my ($al,$side,$offset,$length,$refToWordsToAdd,$refToOtherSidePosi)=@_; |
880
|
0
|
|
|
|
|
0
|
my $dumper = new Dumpvalue; |
881
|
|
|
|
|
|
|
|
882
|
0
|
0
|
|
|
|
0
|
if (!defined($refToWordsToAdd)){$refToWordsToAdd=[];} |
|
0
|
|
|
|
|
0
|
|
883
|
0
|
0
|
|
|
|
0
|
if (!defined($refToOtherSidePosi)){$refToOtherSidePosi=[];} |
|
0
|
|
|
|
|
0
|
|
884
|
0
|
|
|
|
|
0
|
my $numToDelete=$length; |
885
|
0
|
|
|
|
|
0
|
my $firstPos=$offset; |
886
|
0
|
|
|
|
|
0
|
my $lastPos=$offset+$length-1; |
887
|
0
|
|
|
|
|
0
|
my $numList = scalar(@$refToOtherSidePosi); |
888
|
|
|
|
|
|
|
# print $al->displayAsLinkEnumeration("text"); |
889
|
|
|
|
|
|
|
# print "splice $side off:$offset len:$length add:",join(" ",@$refToWordsToAdd),"\n"; |
890
|
|
|
|
|
|
|
# print "before:",join(" ",@{$al->{$side."Words"}}),"\n"; |
891
|
|
|
|
|
|
|
|
892
|
|
|
|
|
|
|
# MODIFY WORDS ARRAY |
893
|
0
|
|
|
|
|
0
|
splice(@{$al->{$side."Words"}},$offset,$length,@$refToWordsToAdd); |
|
0
|
|
|
|
|
0
|
|
894
|
|
|
|
|
|
|
# print "after:",join(" ",@{$al->{$side."Words"}}),"\n"; |
895
|
|
|
|
|
|
|
|
896
|
|
|
|
|
|
|
# MODIFY LINKS |
897
|
0
|
|
|
|
|
0
|
my $numToAdd=scalar(@$refToWordsToAdd); |
898
|
0
|
|
|
|
|
0
|
my $diff=$numToAdd-$numToDelete; |
899
|
0
|
|
|
|
|
0
|
my @modified; |
900
|
|
|
|
|
|
|
my %modifs; |
901
|
0
|
|
|
|
|
0
|
my %links; |
902
|
|
|
|
|
|
|
|
903
|
0
|
0
|
|
|
|
0
|
if ($side eq "target"){ |
904
|
0
|
|
|
|
|
0
|
$al->swapSourceTarget; |
905
|
|
|
|
|
|
|
} |
906
|
|
|
|
|
|
|
#initialize modified array |
907
|
0
|
|
|
|
|
0
|
for (my $j=0;$j<@{$al->{sourceAl}}+$diff;$j++){ |
|
0
|
|
|
|
|
0
|
|
908
|
0
|
|
|
|
|
0
|
push @modified,[]; |
909
|
|
|
|
|
|
|
} |
910
|
|
|
|
|
|
|
#print "ANTES:\n"; |
911
|
|
|
|
|
|
|
#print $al->displayAsLinkEnumeration("text"); |
912
|
|
|
|
|
|
|
|
913
|
|
|
|
|
|
|
#fill modified array with existing links |
914
|
0
|
|
|
|
|
0
|
for (my $j=0;$j<@{$al->{sourceAl}};$j++){ |
|
0
|
|
|
|
|
0
|
|
915
|
0
|
0
|
|
|
|
0
|
if (defined($al->{sourceAl}[$j])){ |
916
|
0
|
|
|
|
|
0
|
foreach my $i (@{$al->{sourceAl}[$j]}){ |
|
0
|
|
|
|
|
0
|
|
917
|
|
|
|
|
|
|
#print "i $i j $j firstPos $firstPos\n"; |
918
|
0
|
0
|
0
|
|
|
0
|
if ($j<$firstPos){ |
|
|
0
|
|
|
|
|
|
919
|
0
|
|
|
|
|
0
|
push @{$modified[$j]},$i; |
|
0
|
|
|
|
|
0
|
|
920
|
0
|
|
|
|
|
0
|
$links{"$j $i"}=$al->{sourceLinks}{"$j $i"}; |
921
|
|
|
|
|
|
|
}elsif ($j>=$firstPos && $j<=$lastPos){ |
922
|
0
|
0
|
|
|
|
0
|
if ($numList==0){ |
923
|
|
|
|
|
|
|
#link added words to positions to which were linked the deleted words |
924
|
0
|
|
|
|
|
0
|
for (my $p=$firstPos;$p<$firstPos+$numToAdd;$p++){ |
925
|
0
|
0
|
|
|
|
0
|
if (!exists($modifs{$p}{$i})){ |
926
|
0
|
|
|
|
|
0
|
push @{$modified[$p]},$i; |
|
0
|
|
|
|
|
0
|
|
927
|
0
|
|
|
|
|
0
|
$links{"$p $i"}=$al->{sourceLinks}{"$j $i"}; |
928
|
0
|
|
|
|
|
0
|
$modifs{$p}{$i}=1; |
929
|
|
|
|
|
|
|
} |
930
|
|
|
|
|
|
|
} |
931
|
|
|
|
|
|
|
} |
932
|
|
|
|
|
|
|
}else{ |
933
|
0
|
|
|
|
|
0
|
push @{$modified[$j+$diff]},$i; |
|
0
|
|
|
|
|
0
|
|
934
|
0
|
|
|
|
|
0
|
$links{($j+$diff)." $i"}=$al->{sourceLinks}{"$j $i"}; |
935
|
|
|
|
|
|
|
} |
936
|
|
|
|
|
|
|
} |
937
|
|
|
|
|
|
|
} #if defined |
938
|
|
|
|
|
|
|
} |
939
|
|
|
|
|
|
|
# insert provided links |
940
|
0
|
|
|
|
|
0
|
for (my $p=$firstPos;$p<$firstPos+$numToAdd;$p++){ |
941
|
0
|
|
|
|
|
0
|
foreach my $i (@$refToOtherSidePosi){ |
942
|
0
|
|
|
|
|
0
|
push @{$modified[$p]},$i; |
|
0
|
|
|
|
|
0
|
|
943
|
|
|
|
|
|
|
} |
944
|
|
|
|
|
|
|
} |
945
|
|
|
|
|
|
|
|
946
|
0
|
|
|
|
|
0
|
@{$al->{sourceAl}}=@modified; |
|
0
|
|
|
|
|
0
|
|
947
|
0
|
0
|
|
|
|
0
|
if ($side eq "target"){ |
948
|
0
|
|
|
|
|
0
|
$al->swapSourceTarget; |
949
|
|
|
|
|
|
|
} |
950
|
|
|
|
|
|
|
#print "DESPUES:\n"; |
951
|
|
|
|
|
|
|
#print $al->displayAsLinkEnumeration("text"); |
952
|
|
|
|
|
|
|
} |
953
|
|
|
|
|
|
|
|
954
|
|
|
|
|
|
|
|
955
|
|
|
|
|
|
|
# INPUT: string (regexp) to be replaced, string (regexp) to replace it, side ("source" or "target") |
956
|
|
|
|
|
|
|
# NOTES: 1) in case of deleting various words, all added words are linked to all positions to which deleted words were linked. $al->{sourceLinks} information can be lost for replaced words. |
957
|
|
|
|
|
|
|
# 2) Does not work for targetAl alignment |
958
|
|
|
|
|
|
|
# 3) more efficient in "source" side than in "target" |
959
|
|
|
|
|
|
|
sub regexpReplace { |
960
|
0
|
|
|
0
|
0
|
0
|
my ($al,$regToDelete,$regToReplace,$side)=@_; |
961
|
0
|
|
|
|
|
0
|
my $dumper=new Dumpvalue; |
962
|
|
|
|
|
|
|
#print STDERR "s/$regToDelete/$regToReplace/\n"; |
963
|
0
|
|
|
|
|
0
|
my $sentence; |
964
|
0
|
0
|
|
|
|
0
|
if ($side eq "source"){$sentence=$al->sourceSentence;} |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
965
|
|
|
|
|
|
|
else {$sentence=$al->targetSentence;} |
966
|
0
|
|
|
|
|
0
|
my $newSentence=$sentence; |
967
|
0
|
|
|
|
|
0
|
$newSentence =~ s/$regToDelete/$regToReplace/og; |
968
|
|
|
|
|
|
|
#print $al->sourceSentence."\n"; |
969
|
|
|
|
|
|
|
#print $al->targetSentence."\n"; |
970
|
|
|
|
|
|
|
#print $newSentence."\n"; |
971
|
0
|
|
|
|
|
0
|
my @words = split / /,$sentence; |
972
|
0
|
|
|
|
|
0
|
my $nums = scalar(@words); |
973
|
0
|
|
|
|
|
0
|
my @newWords = split / /,$newSentence; |
974
|
0
|
|
|
|
|
0
|
my @diffs = Lingua::AlSetLib::diff( \@words, \@newWords ); |
975
|
|
|
|
|
|
|
|
976
|
|
|
|
|
|
|
# parse output of diff function |
977
|
0
|
|
|
|
|
0
|
my @updatedPosi; #array: orig posis -> updated posis |
978
|
|
|
|
|
|
|
my %reversePosi; #hash: updated posis -> orig posis |
979
|
0
|
|
|
|
|
0
|
for (my $i=0;$i<=$nums;$i++){ |
980
|
0
|
|
|
|
|
0
|
$updatedPosi[$i]=$i; |
981
|
0
|
|
|
|
|
0
|
$reversePosi{$i}=$i; |
982
|
|
|
|
|
|
|
} |
983
|
|
|
|
|
|
|
|
984
|
|
|
|
|
|
|
#$dumper->dumpValue(\@diffs); |
985
|
0
|
|
|
|
|
0
|
foreach my $hunk (@diffs){ |
986
|
0
|
|
|
|
|
0
|
my @delPosi; |
987
|
|
|
|
|
|
|
my @del; |
988
|
0
|
|
|
|
|
0
|
my @addPosi; |
989
|
0
|
|
|
|
|
0
|
my @add; |
990
|
0
|
|
|
|
|
0
|
foreach my $change (@$hunk) { |
991
|
0
|
0
|
|
|
|
0
|
if ($change->[0] eq '-'){ |
992
|
0
|
|
|
|
|
0
|
push @delPosi,$change->[1]+1; |
993
|
0
|
|
|
|
|
0
|
push @del,$change->[2]; |
994
|
|
|
|
|
|
|
}else{ |
995
|
0
|
|
|
|
|
0
|
push @addPosi,$change->[1]+1; |
996
|
0
|
|
|
|
|
0
|
push @add,$change->[2]; |
997
|
|
|
|
|
|
|
} |
998
|
|
|
|
|
|
|
} |
999
|
0
|
|
|
|
|
0
|
my $numDel=scalar(@delPosi); |
1000
|
0
|
|
|
|
|
0
|
my $numAdd=scalar(@addPosi); |
1001
|
|
|
|
|
|
|
|
1002
|
|
|
|
|
|
|
# del posis are relative to first array (@words) => update posis |
1003
|
|
|
|
|
|
|
# add posis are relative to second array (@newWords) => don't update posis |
1004
|
0
|
0
|
|
|
|
0
|
if ($numDel==0){ #insertion |
1005
|
0
|
|
|
|
|
0
|
$al->splice("$side",$addPosi[0],0,\@add); |
1006
|
|
|
|
|
|
|
#print "insert '",join(" ",@add),"' at position { ",$addPosi[0]," }\n"; |
1007
|
|
|
|
|
|
|
#update updatedPosi array |
1008
|
0
|
|
|
|
|
0
|
for (my $i=$reversePosi{"$addPosi[0]"};$i<=$nums;$i++){ |
1009
|
0
|
|
|
|
|
0
|
$updatedPosi[$i]+=$numAdd; |
1010
|
0
|
|
|
|
|
0
|
$reversePosi{"$updatedPosi[$i]"}=$i; |
1011
|
|
|
|
|
|
|
} |
1012
|
|
|
|
|
|
|
}else{ # substitution or deletion |
1013
|
0
|
|
|
|
|
0
|
$al->splice("$side",$updatedPosi[$delPosi[0]],$numDel,\@add); |
1014
|
|
|
|
|
|
|
#print "substitute '",join(" ",@del),"' at positions { ",join(" ",@delPosi)," } by '",join(" ",@add),"'\n"; |
1015
|
|
|
|
|
|
|
#update updatedPosi array |
1016
|
0
|
|
|
|
|
0
|
for (my $i=$delPosi[0]+$numDel;$i<=$nums;$i++){ |
1017
|
0
|
|
|
|
|
0
|
$updatedPosi[$i]+=$numAdd-$numDel; |
1018
|
0
|
|
|
|
|
0
|
$reversePosi{"$updatedPosi[$i]"}=$i; |
1019
|
|
|
|
|
|
|
} |
1020
|
|
|
|
|
|
|
} |
1021
|
|
|
|
|
|
|
} |
1022
|
|
|
|
|
|
|
} |
1023
|
|
|
|
|
|
|
|
1024
|
|
|
|
|
|
|
# eliminates any given WORD from the source or target file corpus and updates the alignment |
1025
|
|
|
|
|
|
|
# input: $al (current Alignment object),$word (word RegExp to eliminate), $wordSide (from which side: source or target) |
1026
|
|
|
|
|
|
|
# kept for compatibility with previous versions (regexpReplace or replaceWords should be used instead) |
1027
|
|
|
|
|
|
|
sub eliminateWord { |
1028
|
0
|
|
|
0
|
0
|
0
|
my ($al,$word,$wordSide)= @_; |
1029
|
0
|
|
|
|
|
0
|
return $al->replaceWords($word,'',$wordSide); |
1030
|
|
|
|
|
|
|
} |
1031
|
|
|
|
|
|
|
|
1032
|
|
|
|
|
|
|
# INPUT: string to be replaced, string to replace it, side ("source" or "target") |
1033
|
|
|
|
|
|
|
# NOTES: 1) in case of deleting various words, all added words are linked to all positions to which deleted words were linked. $al->{sourceLinks} information can be lost for replaced words. |
1034
|
|
|
|
|
|
|
# 2) Does not work for targetAl alignment |
1035
|
|
|
|
|
|
|
# 3) more efficient in "source" side than in "target" |
1036
|
|
|
|
|
|
|
sub replaceWords { |
1037
|
0
|
|
|
0
|
0
|
0
|
my ($al,$stToDelete,$stToReplace,$side)=@_; |
1038
|
0
|
|
|
|
|
0
|
my $dumper=new Dumpvalue; |
1039
|
0
|
|
|
|
|
0
|
$stToDelete =~ s/(^\s|\s$)//g; |
1040
|
0
|
|
|
|
|
0
|
$stToDelete =~ s/\s+/ /g; |
1041
|
0
|
|
|
|
|
0
|
$stToReplace =~ s/(^\s|\s$)//g; |
1042
|
0
|
|
|
|
|
0
|
$stToReplace =~ s/\s+/ /g; |
1043
|
|
|
|
|
|
|
|
1044
|
0
|
|
|
|
|
0
|
my @wToDel=split / /,$stToDelete; |
1045
|
0
|
|
|
|
|
0
|
my $numToDel = scalar(@wToDel); |
1046
|
0
|
|
|
|
|
0
|
my @toAdd=split(/ /,$stToReplace); |
1047
|
0
|
|
|
|
|
0
|
my $numToAdd=scalar(@toAdd); |
1048
|
0
|
|
|
|
|
0
|
my $diff=$numToAdd-$numToDel; |
1049
|
|
|
|
|
|
|
# print $al->displayAsLinkEnumeration("text"),"\n"; |
1050
|
|
|
|
|
|
|
#list of positions where string to be deleted starts in @sourceWords (or target) array |
1051
|
0
|
|
|
|
|
0
|
my @startToDelInAl=Lingua::AlSetLib::findArrayInAnother(\@wToDel,$al->{$side."Words"}); |
1052
|
0
|
|
|
|
|
0
|
my $offset=0; |
1053
|
0
|
|
|
|
|
0
|
foreach my $startPosi (@startToDelInAl){ |
1054
|
0
|
|
|
|
|
0
|
my @posis; |
1055
|
0
|
|
|
|
|
0
|
for (my $i=0;$i<$numToDel;$i++){ |
1056
|
0
|
|
|
|
|
0
|
push @posis,$startPosi+$i+$offset; |
1057
|
|
|
|
|
|
|
} |
1058
|
|
|
|
|
|
|
#print "positions:",join(" ",@posis),"\n"; |
1059
|
|
|
|
|
|
|
#$al->substitutePositions(\@posis,$side,$stToReplace); |
1060
|
0
|
|
|
|
|
0
|
$al->splice($side,$posis[0],scalar(@posis),\@toAdd); |
1061
|
0
|
|
|
|
|
0
|
$offset+=$diff; |
1062
|
|
|
|
|
|
|
} |
1063
|
|
|
|
|
|
|
# print $al->displayAsLinkEnumeration("text"),"\n"; |
1064
|
|
|
|
|
|
|
} |
1065
|
|
|
|
|
|
|
|
1066
|
|
|
|
|
|
|
|
1067
|
|
|
|
|
|
|
# introduces underscore between links of many-to-many groups in source to target alignment |
1068
|
|
|
|
|
|
|
# WARNING: THIS SUB FOR NOW ONLY CHANGES WORDS FILES, NOT THE LINKS FILE |
1069
|
|
|
|
|
|
|
sub manyToMany2joined { |
1070
|
0
|
|
|
0
|
0
|
0
|
my $al=shift; |
1071
|
0
|
|
|
|
|
0
|
my $new; |
1072
|
0
|
|
|
|
|
0
|
@{$new->{source}} = @{$al->{sourceWords}}; |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
1073
|
0
|
|
|
|
|
0
|
@{$new->{target}} = @{$al->{targetWords}}; |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
1074
|
|
|
|
|
|
|
|
1075
|
0
|
|
|
|
|
0
|
my @sides=("source","target"); |
1076
|
|
|
|
|
|
|
|
1077
|
|
|
|
|
|
|
|
1078
|
|
|
|
|
|
|
# group many-to-many linked phrases in clusters |
1079
|
0
|
|
|
|
|
0
|
my $clusters=$al->getAlClusters; |
1080
|
0
|
|
|
|
|
0
|
my $dumper = new Dumpvalue; |
1081
|
|
|
|
|
|
|
# print "\n"; |
1082
|
|
|
|
|
|
|
# print $al->sourceSentence."\n"; |
1083
|
|
|
|
|
|
|
# print $al->targetSentence."\n"; |
1084
|
|
|
|
|
|
|
|
1085
|
|
|
|
|
|
|
# print "CLUSTERS:\n"; |
1086
|
|
|
|
|
|
|
# print $dumper->dumpValue($clusters); |
1087
|
0
|
|
|
|
|
0
|
foreach my $source (@sides){ |
1088
|
|
|
|
|
|
|
#sort clusters |
1089
|
0
|
|
|
|
|
0
|
my %firstClustPos; |
1090
|
0
|
|
|
|
|
0
|
for (my $c=0;$c<@$clusters;$c++){ |
1091
|
0
|
|
|
|
|
0
|
@{$clusters->[$c]{$source}} = sort { $a <=> $b; } @{$clusters->[$c]{$source}}; |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
1092
|
|
|
|
|
|
|
} |
1093
|
0
|
|
|
|
|
0
|
@$clusters = sort {$a->{$source}[0] <=> $b->{$source}[0]} @$clusters; |
|
0
|
|
|
|
|
0
|
|
1094
|
|
|
|
|
|
|
|
1095
|
0
|
|
|
|
|
0
|
my $offset=0; |
1096
|
0
|
|
|
|
|
0
|
foreach my $clust (@$clusters){ |
1097
|
0
|
0
|
|
|
|
0
|
if ( @{$clust->{$source}} >1 ){ |
|
0
|
|
|
|
|
0
|
|
1098
|
|
|
|
|
|
|
#check that cluster is contiguous |
1099
|
0
|
|
|
|
|
0
|
my $contiguous=1; |
1100
|
0
|
|
|
|
|
0
|
for (my $c=1;$c<@{$clust->{$source}};$c++){ |
|
0
|
|
|
|
|
0
|
|
1101
|
0
|
0
|
|
|
|
0
|
if ($clust->{$source}[$c] != ($clust->{$source}[$c-1]+1) ){ |
1102
|
0
|
|
|
|
|
0
|
$contiguous=0; |
1103
|
0
|
|
|
|
|
0
|
last; |
1104
|
|
|
|
|
|
|
} |
1105
|
|
|
|
|
|
|
} |
1106
|
0
|
0
|
|
|
|
0
|
if ($contiguous){ |
1107
|
|
|
|
|
|
|
# introduce underscore |
1108
|
0
|
|
|
|
|
0
|
my $numWords = @{$clust->{$source}}; |
|
0
|
|
|
|
|
0
|
|
1109
|
0
|
|
|
|
|
0
|
my $newWord=$al->{$source."Words"}[$clust->{$source}[0]]; |
1110
|
0
|
|
|
|
|
0
|
for (my $c=1;$c<$numWords;$c++){ |
1111
|
0
|
|
|
|
|
0
|
$newWord=$newWord."_".$al->{$source."Words"}[$clust->{$source}[$c]]; |
1112
|
|
|
|
|
|
|
} |
1113
|
|
|
|
|
|
|
# print "new word: $newWord\n"; |
1114
|
0
|
|
|
|
|
0
|
splice(@{$new->{$source}},$clust->{$source}[0]-$offset,$numWords,$newWord); |
|
0
|
|
|
|
|
0
|
|
1115
|
|
|
|
|
|
|
|
1116
|
0
|
|
|
|
|
0
|
$offset+=$numWords-1; |
1117
|
|
|
|
|
|
|
}else{ |
1118
|
0
|
|
|
|
|
0
|
print STDERR "not contiguous\n"; |
1119
|
|
|
|
|
|
|
} |
1120
|
|
|
|
|
|
|
} #if |
1121
|
|
|
|
|
|
|
} |
1122
|
|
|
|
|
|
|
# print "\n"; |
1123
|
|
|
|
|
|
|
} #foreach $source |
1124
|
|
|
|
|
|
|
# print "new source:",join(" ",@{$new->{source}}),"\n"; |
1125
|
|
|
|
|
|
|
# print "new target:",join(" ",@{$new->{target}}),"\n"; |
1126
|
0
|
|
|
|
|
0
|
@{$al->{sourceWords}}=@{$new->{source}}; |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
1127
|
0
|
|
|
|
|
0
|
@{$al->{targetWords}}=@{$new->{target}}; |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
1128
|
|
|
|
|
|
|
} |
1129
|
|
|
|
|
|
|
|
1130
|
|
|
|
|
|
|
|
1131
|
|
|
|
|
|
|
# recreates links of words linked by underscore and removes underscores |
1132
|
|
|
|
|
|
|
# ONLY WORKS WITH SOURCE2TARGET AlIGNMENT |
1133
|
|
|
|
|
|
|
sub joined2ManyToMany { |
1134
|
0
|
|
|
0
|
0
|
0
|
my $al=shift; |
1135
|
|
|
|
|
|
|
# print $al->sourceSentence."\n"; |
1136
|
|
|
|
|
|
|
# print $al->targetSentence."\n"; |
1137
|
|
|
|
|
|
|
|
1138
|
0
|
|
|
|
|
0
|
my @sides=("source","target"); |
1139
|
0
|
|
|
|
|
0
|
foreach my $source (@sides){ |
1140
|
0
|
|
|
|
|
0
|
my %joined; |
1141
|
0
|
|
|
|
|
0
|
for (my $j=1;$j<@{$al->{$source."Words"}};$j++){ |
|
0
|
|
|
|
|
0
|
|
1142
|
0
|
0
|
|
|
|
0
|
if ($al->{$source."Words"}[$j] =~ /@@@/){ |
1143
|
0
|
|
|
|
|
0
|
$joined{$j}=1; |
1144
|
|
|
|
|
|
|
} |
1145
|
|
|
|
|
|
|
} |
1146
|
0
|
|
|
|
|
0
|
my @sortedJoined = sort { $a <=> $b } keys(%joined); |
|
0
|
|
|
|
|
0
|
|
1147
|
0
|
|
|
|
|
0
|
my $offset=0; |
1148
|
0
|
|
|
|
|
0
|
foreach my $pos (@sortedJoined){ |
1149
|
|
|
|
|
|
|
# insert new words |
1150
|
0
|
|
|
|
|
0
|
my $joinedWords = $al->{$source."Words"}[$pos+$offset]; |
1151
|
0
|
|
|
|
|
0
|
my @newWords = split(/@@@/,$joinedWords); |
1152
|
0
|
|
|
|
|
0
|
my $firstWord = shift @newWords; |
1153
|
0
|
|
|
|
|
0
|
$al->{$source."Words"}[$pos+$offset]=$firstWord; |
1154
|
|
|
|
|
|
|
|
1155
|
|
|
|
|
|
|
# insert new words to alignment, all linked to the same target words as the old (joined) token |
1156
|
0
|
0
|
|
|
|
0
|
if ($source eq "source"){ |
1157
|
0
|
|
|
|
|
0
|
$al->splice($source,$pos+1+$offset,0,\@newWords,$al->{sourceAl}[$pos+$offset]); |
1158
|
|
|
|
|
|
|
}else{ |
1159
|
|
|
|
|
|
|
# look for links aligned to $pos+$offset |
1160
|
0
|
|
|
|
|
0
|
my @alignedPos; |
1161
|
|
|
|
|
|
|
# print "pos offset:".($pos+$offset)."\n"; |
1162
|
0
|
|
|
|
|
0
|
for (my $j=0;$j<@{$al->{sourceAl}};$j++){ |
|
0
|
|
|
|
|
0
|
|
1163
|
0
|
|
|
|
|
0
|
foreach my $i (@{$al->{sourceAl}[$j]}){ |
|
0
|
|
|
|
|
0
|
|
1164
|
0
|
0
|
|
|
|
0
|
if ($i == ($pos+$offset)){ |
1165
|
0
|
|
|
|
|
0
|
push @alignedPos,$j; |
1166
|
|
|
|
|
|
|
} |
1167
|
|
|
|
|
|
|
} |
1168
|
|
|
|
|
|
|
} |
1169
|
0
|
|
|
|
|
0
|
$al->splice($source,$pos+1+$offset,0,\@newWords,\@alignedPos); |
1170
|
|
|
|
|
|
|
} |
1171
|
0
|
|
|
|
|
0
|
$offset += @newWords; |
1172
|
|
|
|
|
|
|
} |
1173
|
|
|
|
|
|
|
} |
1174
|
|
|
|
|
|
|
} |
1175
|
|
|
|
|
|
|
|
1176
|
|
|
|
|
|
|
#input: (source,target) link |
1177
|
|
|
|
|
|
|
#output: true if the link is reciprocal (or "cross link"), false otherwise |
1178
|
|
|
|
|
|
|
sub isCrossLink { |
1179
|
0
|
|
|
0
|
0
|
0
|
my ($al,$j,$i)=@_; |
1180
|
|
|
|
|
|
|
# print "s $j $i:",$al->isIn("sourceAl",$j,$i)," t $i $j:",$al->isIn("targetAl",$i,$j),"\n"; |
1181
|
0
|
|
0
|
|
|
0
|
return ( $al->isIn("sourceAl",$j,$i) && $al->isIn("targetAl",$i,$j) ); |
1182
|
|
|
|
|
|
|
} |
1183
|
|
|
|
|
|
|
|
1184
|
|
|
|
|
|
|
sub isAnchor{ |
1185
|
0
|
|
|
0
|
0
|
0
|
my ($al,$j,$side)=@_; |
1186
|
0
|
|
|
|
|
0
|
my ($reverseSide,$i); |
1187
|
|
|
|
|
|
|
|
1188
|
0
|
0
|
|
|
|
0
|
if ($side eq "source"){$reverseSide="target"} |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
1189
|
|
|
|
|
|
|
else {$reverseSide = "source"} |
1190
|
0
|
0
|
|
|
|
0
|
if (defined($al->{$side."Al"}[$j])){ |
1191
|
0
|
0
|
|
|
|
0
|
if (@{$al->{$side."Al"}[$j]}==1){ |
|
0
|
|
|
|
|
0
|
|
1192
|
0
|
|
|
|
|
0
|
$i = $al->{$side."Al"}[$j][0]; |
1193
|
0
|
0
|
|
|
|
0
|
if (defined($al->{$reverseSide."Al"}[$i])){ |
1194
|
0
|
0
|
0
|
|
|
0
|
if (@{$al->{$reverseSide."Al"}[$i]}==1 && $al->{$reverseSide."Al"}[$i][0]==$j){ |
|
0
|
|
|
|
|
0
|
|
1195
|
0
|
|
|
|
|
0
|
return 1; |
1196
|
|
|
|
|
|
|
} |
1197
|
|
|
|
|
|
|
} |
1198
|
|
|
|
|
|
|
} |
1199
|
|
|
|
|
|
|
} |
1200
|
0
|
|
|
|
|
0
|
return 0; |
1201
|
|
|
|
|
|
|
} |
1202
|
|
|
|
|
|
|
|
1203
|
|
|
|
|
|
|
#mode: "noAnchors" cuts zones between 2 anchors and cannot include an anchor point |
1204
|
|
|
|
|
|
|
# "anchors" cuts zone established by coordinates and doesn't look more |
1205
|
|
|
|
|
|
|
sub cut { |
1206
|
0
|
|
|
0
|
0
|
0
|
my ($al,$startPointSource,$startPointTarget,$endPointSource,$endPointTarget,$mode)=@_; |
1207
|
0
|
0
|
|
|
|
0
|
if (!defined($mode)){$mode="noAnchors"} |
|
0
|
|
|
|
|
0
|
|
1208
|
0
|
|
|
|
|
0
|
my ($j,$i,$ind); |
1209
|
0
|
|
|
|
|
0
|
my %sourceInGap=(); |
1210
|
0
|
|
|
|
|
0
|
my %targetInGap=(); |
1211
|
0
|
|
|
|
|
0
|
my @sortedSourceInGap=(); |
1212
|
0
|
|
|
|
|
0
|
my @sortedTargetInGap=(); |
1213
|
0
|
|
|
|
|
0
|
my %sourceToNull=(); |
1214
|
0
|
|
|
|
|
0
|
my %targetToNull=(); |
1215
|
0
|
|
|
|
|
0
|
my $gap = Lingua::AlignmentSlice->new($al); |
1216
|
0
|
|
|
|
|
0
|
my @linked=(); |
1217
|
0
|
|
|
|
|
0
|
my ($zeroSource,$zeroTarget,$numSource,$numTarget); |
1218
|
0
|
|
|
|
|
0
|
my ($oldNumInGap,$newNumInGap); |
1219
|
0
|
|
|
|
|
0
|
for ($j=$startPointSource+1;$j<$endPointSource;$j++){ |
1220
|
0
|
|
|
|
|
0
|
$sourceInGap{$j}=1; |
1221
|
|
|
|
|
|
|
} |
1222
|
0
|
|
|
|
|
0
|
for ($i=$startPointTarget+1;$i<$endPointTarget;$i++){ |
1223
|
0
|
0
|
|
|
|
0
|
if ($mode eq "noAnchors"){ |
1224
|
0
|
0
|
|
|
|
0
|
if (!$al->isAnchor($i,"target")){ |
1225
|
0
|
|
|
|
|
0
|
$targetInGap{$i}=1; |
1226
|
|
|
|
|
|
|
} |
1227
|
|
|
|
|
|
|
}else{ |
1228
|
0
|
|
|
|
|
0
|
$targetInGap{$i}=1; |
1229
|
|
|
|
|
|
|
} |
1230
|
|
|
|
|
|
|
} |
1231
|
|
|
|
|
|
|
# print "\n($startPointSource,$startPointTarget,$endPointSource,$endPointTarget)\n"; |
1232
|
|
|
|
|
|
|
# print "source in gap 1:".join(" ",keys %sourceInGap)."\n"; |
1233
|
|
|
|
|
|
|
# print "target in gap 1:".join(" ",keys %targetInGap)."\n"; |
1234
|
|
|
|
|
|
|
|
1235
|
|
|
|
|
|
|
#look at linked words situated outside the gap square: |
1236
|
0
|
|
|
|
|
0
|
$oldNumInGap=0; |
1237
|
0
|
|
|
|
|
0
|
$newNumInGap=scalar(keys %sourceInGap)+scalar(keys %targetInGap); |
1238
|
0
|
|
|
|
|
0
|
while ($oldNumInGap != $newNumInGap){ |
1239
|
0
|
|
|
|
|
0
|
foreach $i (keys %targetInGap){ |
1240
|
0
|
|
|
|
|
0
|
foreach $j (@{$al->{targetAl}[$i]}){ |
|
0
|
|
|
|
|
0
|
|
1241
|
0
|
0
|
|
|
|
0
|
if ($j!=0){ |
|
0
|
|
|
|
|
0
|
|
1242
|
0
|
|
|
|
|
0
|
$sourceInGap{$j}=1; |
1243
|
|
|
|
|
|
|
} |
1244
|
|
|
|
|
|
|
else {$targetToNull{$i}=1}; |
1245
|
|
|
|
|
|
|
} |
1246
|
0
|
|
|
|
|
0
|
for ($j=1;$j<@{$al->{sourceAl}};$j++){ |
|
0
|
|
|
|
|
0
|
|
1247
|
0
|
0
|
|
|
|
0
|
if ($al->isIn("sourceAl",$j,$i)){ |
1248
|
0
|
|
|
|
|
0
|
$sourceInGap{$j}=1; |
1249
|
|
|
|
|
|
|
} |
1250
|
|
|
|
|
|
|
} |
1251
|
|
|
|
|
|
|
} |
1252
|
0
|
|
|
|
|
0
|
foreach $j (keys %sourceInGap){ |
1253
|
0
|
|
|
|
|
0
|
foreach $i (@{$al->{sourceAl}[$j]}){ |
|
0
|
|
|
|
|
0
|
|
1254
|
0
|
0
|
|
|
|
0
|
if ($i!=0){ |
|
0
|
|
|
|
|
0
|
|
1255
|
0
|
|
|
|
|
0
|
$targetInGap{$i}=1; |
1256
|
|
|
|
|
|
|
} |
1257
|
|
|
|
|
|
|
else {$sourceToNull{$j}=1}; |
1258
|
|
|
|
|
|
|
} |
1259
|
0
|
|
|
|
|
0
|
for ($i=1;$i<@{$al->{targetAl}};$i++){ |
|
0
|
|
|
|
|
0
|
|
1260
|
0
|
0
|
|
|
|
0
|
if ($al->isIn("targetAl",$i,$j)){ |
1261
|
0
|
|
|
|
|
0
|
$targetInGap{$i}=1; |
1262
|
|
|
|
|
|
|
} |
1263
|
|
|
|
|
|
|
} |
1264
|
|
|
|
|
|
|
} |
1265
|
0
|
|
|
|
|
0
|
$oldNumInGap=$newNumInGap; |
1266
|
0
|
|
|
|
|
0
|
$newNumInGap=scalar(keys %sourceInGap)+scalar(keys %targetInGap); |
1267
|
|
|
|
|
|
|
} |
1268
|
0
|
|
|
|
|
0
|
foreach $i (@{$al->{sourceAl}[0]}){ |
|
0
|
|
|
|
|
0
|
|
1269
|
0
|
0
|
|
|
|
0
|
if ($targetInGap{$i}){$targetToNull{$i}=1} |
|
0
|
|
|
|
|
0
|
|
1270
|
|
|
|
|
|
|
} |
1271
|
0
|
|
|
|
|
0
|
foreach $j (@{$al->{targetAl}[0]}){ |
|
0
|
|
|
|
|
0
|
|
1272
|
0
|
0
|
|
|
|
0
|
if ($sourceInGap{$j}){$sourceToNull{$j}=1} |
|
0
|
|
|
|
|
0
|
|
1273
|
|
|
|
|
|
|
} |
1274
|
|
|
|
|
|
|
|
1275
|
0
|
|
|
|
|
0
|
@sortedSourceInGap = sort { $a <=> $b; } keys %sourceInGap; |
|
0
|
|
|
|
|
0
|
|
1276
|
0
|
|
|
|
|
0
|
@sortedTargetInGap = sort { $a <=> $b; } keys %targetInGap; |
|
0
|
|
|
|
|
0
|
|
1277
|
|
|
|
|
|
|
|
1278
|
|
|
|
|
|
|
# print "source in gap 2:",join(" ",keys %sourceInGap)."\n"; |
1279
|
|
|
|
|
|
|
# print "target in gap 2:",join(" ",keys %targetInGap)."\n"; |
1280
|
|
|
|
|
|
|
# print "source sorted:",join(" ",@sortedSourceInGap)."\n"; |
1281
|
|
|
|
|
|
|
# print "target sorted:",join(" ",@sortedTargetInGap)."\n"; |
1282
|
|
|
|
|
|
|
# print "target to null:",join(" ",keys %targetToNull)."\n"; |
1283
|
|
|
|
|
|
|
# print "source to null:",join(" ",keys %sourceToNull)."\n"; |
1284
|
|
|
|
|
|
|
|
1285
|
0
|
0
|
|
|
|
0
|
if (@sortedSourceInGap==0){ |
1286
|
0
|
|
|
|
|
0
|
$zeroSource=0; |
1287
|
0
|
|
|
|
|
0
|
$numSource=0; |
1288
|
|
|
|
|
|
|
}else{ |
1289
|
0
|
|
|
|
|
0
|
$zeroSource=$sortedSourceInGap[0]-1; |
1290
|
0
|
|
|
|
|
0
|
$numSource=$sortedSourceInGap[@sortedSourceInGap-1]-$sortedSourceInGap[0]+1; |
1291
|
|
|
|
|
|
|
} |
1292
|
0
|
0
|
|
|
|
0
|
if (@sortedTargetInGap==0){ |
1293
|
0
|
|
|
|
|
0
|
$zeroTarget=0; |
1294
|
0
|
|
|
|
|
0
|
$numTarget=0; |
1295
|
|
|
|
|
|
|
}else{ |
1296
|
0
|
|
|
|
|
0
|
$zeroTarget=$sortedTargetInGap[0]-1; |
1297
|
0
|
|
|
|
|
0
|
$numTarget=$sortedTargetInGap[@sortedTargetInGap-1]-$sortedTargetInGap[0]+1; |
1298
|
|
|
|
|
|
|
} |
1299
|
|
|
|
|
|
|
|
1300
|
|
|
|
|
|
|
#Actualize AlignmentSlice attributes |
1301
|
0
|
|
|
|
|
0
|
$gap->setZero($zeroSource,$zeroTarget); |
1302
|
0
|
|
|
|
|
0
|
foreach $j (keys %sourceInGap){ |
1303
|
0
|
|
|
|
|
0
|
$gap->{sourceIndices}{$j-$zeroSource}=1; |
1304
|
|
|
|
|
|
|
} |
1305
|
0
|
0
|
|
|
|
0
|
if (scalar (keys %targetToNull)>0){$gap->{sourceIndices}{0}=1}; |
|
0
|
|
|
|
|
0
|
|
1306
|
0
|
|
|
|
|
0
|
foreach $i (keys %targetInGap){ |
1307
|
0
|
|
|
|
|
0
|
$gap->{targetIndices}{$i-$zeroTarget}=1; |
1308
|
|
|
|
|
|
|
} |
1309
|
0
|
0
|
|
|
|
0
|
if (scalar (keys %sourceToNull)>0){$gap->{targetIndices}{0}=1}; |
|
0
|
|
|
|
|
0
|
|
1310
|
|
|
|
|
|
|
|
1311
|
|
|
|
|
|
|
# print "zero s t:",$zeroSource," ",$zeroTarget,"\n"; |
1312
|
|
|
|
|
|
|
# print "num s t:",$numSource," ",$numTarget,"\n"; |
1313
|
|
|
|
|
|
|
|
1314
|
|
|
|
|
|
|
## LOAD GAP |
1315
|
|
|
|
|
|
|
# 1. insert NULL word and select only words linked to NULL that belong to the gap |
1316
|
0
|
|
|
|
|
0
|
push @{$gap->{sourceWords}},'NULL'; |
|
0
|
|
|
|
|
0
|
|
1317
|
0
|
|
|
|
|
0
|
foreach $i (keys %targetToNull){push @linked,$i-$gap->{zeroTarget}} |
|
0
|
|
|
|
|
0
|
|
1318
|
0
|
|
|
|
|
0
|
push @{$gap->{sourceAl}},[@linked]; |
|
0
|
|
|
|
|
0
|
|
1319
|
0
|
|
|
|
|
0
|
push @{$gap->{targetWords}},'NULL'; |
|
0
|
|
|
|
|
0
|
|
1320
|
0
|
|
|
|
|
0
|
@linked=(); |
1321
|
0
|
|
|
|
|
0
|
foreach $j (keys %sourceToNull){push @linked,$j-$gap->{zeroSource}} |
|
0
|
|
|
|
|
0
|
|
1322
|
0
|
|
|
|
|
0
|
push @{$gap->{targetAl}},[@linked]; |
|
0
|
|
|
|
|
0
|
|
1323
|
|
|
|
|
|
|
# 2. Add non-NULL words and alignments |
1324
|
0
|
|
|
|
|
0
|
for ($ind=1;$ind<=$numSource;$ind++){ |
1325
|
0
|
|
|
|
|
0
|
$j=$ind+$gap->{zeroSource}; |
1326
|
0
|
|
|
|
|
0
|
$gap->{sourceWords}[$ind]=$al->{sourceWords}[$j]; |
1327
|
0
|
0
|
|
|
|
0
|
if ($sourceInGap{$j}){ |
1328
|
0
|
|
|
|
|
0
|
@linked=(); |
1329
|
0
|
|
|
|
|
0
|
foreach $i (@{$al->{sourceAl}[$j]}){ |
|
0
|
|
|
|
|
0
|
|
1330
|
|
|
|
|
|
|
#if ($targetInGap{$i}){ #useless:de facto included in the zone |
1331
|
0
|
|
|
|
|
0
|
push @linked,$i-$gap->{zeroTarget} |
1332
|
|
|
|
|
|
|
#} |
1333
|
|
|
|
|
|
|
} |
1334
|
0
|
|
|
|
|
0
|
$gap->{sourceAl}[$ind]=[@linked]; |
1335
|
|
|
|
|
|
|
} |
1336
|
|
|
|
|
|
|
} |
1337
|
0
|
|
|
|
|
0
|
for ($ind=1;$ind<=$numTarget;$ind++){ |
1338
|
0
|
|
|
|
|
0
|
$i = $ind+$gap->{zeroTarget}; |
1339
|
0
|
|
|
|
|
0
|
$gap->{targetWords}[$ind]=$al->{targetWords}[$i]; |
1340
|
0
|
0
|
|
|
|
0
|
if ($targetInGap{$i}){ |
1341
|
0
|
|
|
|
|
0
|
@linked=(); |
1342
|
0
|
|
|
|
|
0
|
foreach $j (@{$al->{targetAl}[$i]}){ |
|
0
|
|
|
|
|
0
|
|
1343
|
|
|
|
|
|
|
#if ($sourceInGap{$j}) { #useless:de facto included in the zone |
1344
|
0
|
|
|
|
|
0
|
push @linked,$j-$gap->{zeroSource} |
1345
|
|
|
|
|
|
|
#} |
1346
|
|
|
|
|
|
|
} |
1347
|
0
|
|
|
|
|
0
|
$gap->{targetAl}[$ind]=[@linked]; |
1348
|
|
|
|
|
|
|
} |
1349
|
|
|
|
|
|
|
} |
1350
|
0
|
|
|
|
|
0
|
return $gap; |
1351
|
|
|
|
|
|
|
} |
1352
|
|
|
|
|
|
|
|
1353
|
|
|
|
|
|
|
##################################################### |
1354
|
|
|
|
|
|
|
### PRIVATE SUBS ### |
1355
|
|
|
|
|
|
|
##################################################### |
1356
|
|
|
|
|
|
|
|
1357
|
|
|
|
|
|
|
# Returns the number of times the link ($ind1,$ind2) is present in the $side alignment |
1358
|
|
|
|
|
|
|
sub isIn { |
1359
|
132
|
|
|
132
|
0
|
181
|
my ($al,$side,$ind1,$ind2) = @_; |
1360
|
132
|
50
|
|
|
|
242
|
if ($side eq "sourceAl"){ |
1361
|
|
|
|
|
|
|
# returns >0 if the link (j,i) is present in sourceAl (ie if i_partOf_Bj), 0 otherwise |
1362
|
132
|
|
|
|
|
141
|
my ($j,$i) = ($ind1,$ind2); |
1363
|
132
|
|
|
|
|
116
|
my $i_partOf_Bj=grep /^$i$/, @{$al->{sourceAl}[$j]}; |
|
132
|
|
|
|
|
524
|
|
1364
|
132
|
|
|
|
|
417
|
return $i_partOf_Bj; |
1365
|
|
|
|
|
|
|
}else{ |
1366
|
|
|
|
|
|
|
# returns >0 if the link (i,j) is present in targetAl (ie if j_partOf_Bi), 0 otherwise |
1367
|
0
|
|
|
|
|
|
my ($i,$j)=($ind1,$ind2); |
1368
|
0
|
|
|
|
|
|
my $j_partOf_Bi = grep /^$j$/, @{$al->{targetAl}[$i]}; |
|
0
|
|
|
|
|
|
|
1369
|
0
|
|
|
|
|
|
return $j_partOf_Bi; |
1370
|
|
|
|
|
|
|
} |
1371
|
|
|
|
|
|
|
} |
1372
|
|
|
|
|
|
|
|
1373
|
|
|
|
|
|
|
# returns an object with same content as the input object |
1374
|
|
|
|
|
|
|
sub clone { |
1375
|
0
|
|
|
0
|
0
|
|
my $al = shift; |
1376
|
0
|
|
|
|
|
|
my $clone = Lingua::Alignment->new; |
1377
|
0
|
|
|
|
|
|
my ($i,$j); |
1378
|
0
|
|
|
|
|
|
@{$clone->{sourceWords}}=@{$al->{sourceWords}}; |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
1379
|
0
|
|
|
|
|
|
@{$clone->{targetWords}}=@{$al->{targetWords}}; |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
1380
|
0
|
|
|
|
|
|
for ($j=0;$j<@{$al->{sourceAl}};$j++){ |
|
0
|
|
|
|
|
|
|
1381
|
0
|
0
|
|
|
|
|
if (defined($al->{sourceAl}[$j])){ |
1382
|
0
|
|
|
|
|
|
push @{$clone->{sourceAl}},[@{$al->{sourceAl}[$j]}]; |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
1383
|
|
|
|
|
|
|
} |
1384
|
|
|
|
|
|
|
} |
1385
|
0
|
|
|
|
|
|
for ($i=0;$i<@{$al->{targetAl}};$i++){ |
|
0
|
|
|
|
|
|
|
1386
|
0
|
0
|
|
|
|
|
if (defined($al->{targetAl}[$i])){ |
1387
|
0
|
|
|
|
|
|
push @{$clone->{targetAl}},[@{$al->{targetAl}[$i]}]; |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
1388
|
|
|
|
|
|
|
} |
1389
|
|
|
|
|
|
|
} |
1390
|
0
|
|
|
|
|
|
%{$clone->{sourceLinks}}=%{$al->{sourceLinks}}; |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
1391
|
0
|
|
|
|
|
|
%{$clone->{targetLinks}}=%{$al->{targetLinks}}; |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
1392
|
|
|
|
|
|
|
|
1393
|
0
|
|
|
|
|
|
return $clone; |
1394
|
|
|
|
|
|
|
} |
1395
|
|
|
|
|
|
|
sub clear { |
1396
|
0
|
|
|
0
|
0
|
|
my $al = shift; |
1397
|
0
|
|
|
|
|
|
my ($i,$j); |
1398
|
0
|
|
|
|
|
|
for ($j=0;$j<@{$al->{sourceAl}};$j++){ |
|
0
|
|
|
|
|
|
|
1399
|
0
|
0
|
|
|
|
|
if (defined($al->{sourceAl}[$j])){ |
1400
|
0
|
|
|
|
|
|
@{$al->{sourceAl}[$j]} = (); |
|
0
|
|
|
|
|
|
|
1401
|
|
|
|
|
|
|
} |
1402
|
|
|
|
|
|
|
} |
1403
|
0
|
|
|
|
|
|
for ($i=0;$i<@{$al->{targetAl}};$i++){ |
|
0
|
|
|
|
|
|
|
1404
|
0
|
0
|
|
|
|
|
if (defined($al->{targetAl}[$i])){ |
1405
|
0
|
|
|
|
|
|
@{$al->{targetAl}[$i]} = (); |
|
0
|
|
|
|
|
|
|
1406
|
|
|
|
|
|
|
} |
1407
|
|
|
|
|
|
|
} |
1408
|
0
|
|
|
|
|
|
%{$al->{sourceLinks}} = (); |
|
0
|
|
|
|
|
|
|
1409
|
0
|
|
|
|
|
|
%{$al->{targetLinks}} = (); |
|
0
|
|
|
|
|
|
|
1410
|
|
|
|
|
|
|
} |
1411
|
|
|
|
|
|
|
|
1412
|
|
|
|
|
|
|
|
1413
|
|
|
|
|
|
|
# gets the alignment as clusters of positions aligned together |
1414
|
|
|
|
|
|
|
# input: $al, $direction ("source" for "sourceAl" or "target" for "targetAl") |
1415
|
|
|
|
|
|
|
sub getAlClusters { |
1416
|
0
|
|
|
0
|
0
|
|
my ($al,$direction)=@_; |
1417
|
|
|
|
|
|
|
#default: |
1418
|
0
|
0
|
|
|
|
|
if (!defined($direction)){$direction="source"} |
|
0
|
|
|
|
|
|
|
1419
|
|
|
|
|
|
|
|
1420
|
0
|
|
|
|
|
|
my $dumper = new Dumpvalue; |
1421
|
|
|
|
|
|
|
# group many-to-many linked phrased in clusters |
1422
|
0
|
|
|
|
|
|
my %scomp; #stores in which cluster is each source word position |
1423
|
|
|
|
|
|
|
my %tcomp; |
1424
|
0
|
|
|
|
|
|
my @clusters; |
1425
|
0
|
|
|
|
|
|
my $alClusters={}; |
1426
|
0
|
|
|
|
|
|
my $numClusters=0; |
1427
|
|
|
|
|
|
|
|
1428
|
0
|
|
|
|
|
|
for (my $j=1;$j<@{$al->{$direction."Al"}};$j++){ |
|
0
|
|
|
|
|
|
|
1429
|
0
|
0
|
|
|
|
|
if (defined($al->{$direction."Al"}[$j])){ |
1430
|
0
|
|
|
|
|
|
foreach my $i (@{$al->{$direction."Al"}[$j]}){ |
|
0
|
|
|
|
|
|
|
1431
|
0
|
0
|
|
|
|
|
if ($i>0){ |
1432
|
|
|
|
|
|
|
# print "j: $j i: $i\n"; |
1433
|
0
|
0
|
0
|
|
|
|
if (exists($scomp{$j}) || exists($tcomp{$i})){ |
1434
|
0
|
|
|
|
|
|
my ($clustIndex1,$clustIndex2); |
1435
|
0
|
0
|
0
|
|
|
|
if (exists($scomp{$j}) && exists($tcomp{$i})){ |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
1436
|
0
|
0
|
|
|
|
|
if ($tcomp{$i} != $scomp{$j}){ |
1437
|
|
|
|
|
|
|
# merge clusters: |
1438
|
0
|
0
|
|
|
|
|
if ($scomp{$j}<$tcomp{$i}){ |
1439
|
0
|
|
|
|
|
|
$clustIndex1=$scomp{$j}; |
1440
|
0
|
|
|
|
|
|
$clustIndex2=$tcomp{$i}; |
1441
|
|
|
|
|
|
|
}else{ |
1442
|
0
|
|
|
|
|
|
$clustIndex1=$tcomp{$i}; |
1443
|
0
|
|
|
|
|
|
$clustIndex2=$scomp{$j}; |
1444
|
|
|
|
|
|
|
} |
1445
|
|
|
|
|
|
|
# print "clusters: $clustIndex1 $clustIndex2 :\n"; |
1446
|
0
|
|
|
|
|
|
push @{$clusters[$clustIndex1]->{source}},@{$clusters[$clustIndex2]->{source}}; |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
1447
|
0
|
|
|
|
|
|
push @{$clusters[$clustIndex1]->{target}},@{$clusters[$clustIndex2]->{target}}; |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
1448
|
|
|
|
|
|
|
|
1449
|
0
|
|
|
|
|
|
while ( my ($key,$val)=each (%scomp) ){ |
1450
|
0
|
0
|
|
|
|
|
if ($val == $clustIndex2){$scomp{$key}=$clustIndex1;} |
|
0
|
|
|
|
|
|
|
1451
|
0
|
0
|
|
|
|
|
if ($val > $clustIndex2) {$scomp{$key}=$scomp{$key}-1;} |
|
0
|
|
|
|
|
|
|
1452
|
|
|
|
|
|
|
} |
1453
|
0
|
|
|
|
|
|
while ( my ($key,$val)=each (%tcomp) ){ |
1454
|
0
|
0
|
|
|
|
|
if ($val == $clustIndex2){$tcomp{$key}=$clustIndex1;} |
|
0
|
|
|
|
|
|
|
1455
|
0
|
0
|
|
|
|
|
if ($val > $clustIndex2) {$tcomp{$key}=$tcomp{$key}-1;} |
|
0
|
|
|
|
|
|
|
1456
|
|
|
|
|
|
|
} |
1457
|
0
|
|
|
|
|
|
splice @clusters,$clustIndex2,1; |
1458
|
0
|
|
|
|
|
|
$numClusters--; |
1459
|
|
|
|
|
|
|
} |
1460
|
|
|
|
|
|
|
}elsif (exists($scomp{$j})){ |
1461
|
0
|
|
|
|
|
|
$clustIndex1=$scomp{$j}; |
1462
|
0
|
|
|
|
|
|
$tcomp{$i}=$clustIndex1; |
1463
|
0
|
|
|
|
|
|
push @{$clusters[$clustIndex1]->{target}},$i; |
|
0
|
|
|
|
|
|
|
1464
|
|
|
|
|
|
|
}elsif (exists($tcomp{$i})){ |
1465
|
0
|
|
|
|
|
|
$clustIndex1=$tcomp{$i}; |
1466
|
0
|
|
|
|
|
|
$scomp{$j}=$clustIndex1; |
1467
|
0
|
|
|
|
|
|
push @{$clusters[$clustIndex1]->{source}},$j; |
|
0
|
|
|
|
|
|
|
1468
|
|
|
|
|
|
|
} |
1469
|
|
|
|
|
|
|
}else{ |
1470
|
0
|
|
|
|
|
|
push @clusters,{source=>[$j],target=>[$i]}; |
1471
|
0
|
|
|
|
|
|
$scomp{$j}=$numClusters; |
1472
|
0
|
|
|
|
|
|
$tcomp{$i}=$numClusters; |
1473
|
0
|
|
|
|
|
|
$numClusters++; |
1474
|
|
|
|
|
|
|
} |
1475
|
|
|
|
|
|
|
} #if $i>0 |
1476
|
|
|
|
|
|
|
# print "scomp:\n"; |
1477
|
|
|
|
|
|
|
# print $dumper->dumpValue(\%scomp); |
1478
|
|
|
|
|
|
|
# print "tcomp:\n"; |
1479
|
|
|
|
|
|
|
# print $dumper->dumpValue(\%tcomp); |
1480
|
|
|
|
|
|
|
|
1481
|
|
|
|
|
|
|
# print $dumper->dumpValue(\@clusters); |
1482
|
|
|
|
|
|
|
} |
1483
|
|
|
|
|
|
|
} |
1484
|
|
|
|
|
|
|
} |
1485
|
0
|
|
|
|
|
|
return \@clusters; |
1486
|
|
|
|
|
|
|
} |
1487
|
|
|
|
|
|
|
|
1488
|
|
|
|
|
|
|
# prints a phrase given the side of alignment (source or target) and an array of positions of the phrase words |
1489
|
|
|
|
|
|
|
sub printPhrase { |
1490
|
0
|
|
|
0
|
0
|
|
my ($al,$source,$posArray)=@_; |
1491
|
0
|
|
|
|
|
|
my @words; |
1492
|
0
|
|
|
|
|
|
foreach my $pos (@$posArray){ |
1493
|
0
|
|
|
|
|
|
push @words,$al->{$source."Words"}[$pos]; |
1494
|
|
|
|
|
|
|
} |
1495
|
0
|
|
|
|
|
|
return join(" ",@words); |
1496
|
|
|
|
|
|
|
} |
1497
|
|
|
|
|
|
|
|
1498
|
|
|
|
|
|
|
# SELECT ONLY S LINKS |
1499
|
|
|
|
|
|
|
sub SLinks { |
1500
|
0
|
|
|
0
|
0
|
|
my $al=shift; |
1501
|
0
|
|
|
|
|
|
my $sal = Lingua::Alignment->new; |
1502
|
0
|
|
|
|
|
|
@{$sal->{sourceWords}}=@{$al->{sourceWords}}; |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
1503
|
0
|
|
|
|
|
|
@{$sal->{targetWords}}=@{$al->{targetWords}}; |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
1504
|
|
|
|
|
|
|
|
1505
|
0
|
|
|
|
|
|
my %side=("source"=>"target","target"=>"source"); |
1506
|
0
|
|
|
|
|
|
while (my ($source,$target)= each(%side)){ |
1507
|
0
|
|
|
|
|
|
for (my $j=0;$j<@{$al->{$source."Al"}};$j++){ |
|
0
|
|
|
|
|
|
|
1508
|
0
|
|
|
|
|
|
push @{$sal->{$source."Al"}},[]; |
|
0
|
|
|
|
|
|
|
1509
|
0
|
0
|
|
|
|
|
if (defined($al->{$source."Al"}[$j])){ |
1510
|
0
|
|
|
|
|
|
foreach my $i (@{$al->{$source."Al"}[$j]}){ |
|
0
|
|
|
|
|
|
|
1511
|
0
|
0
|
0
|
|
|
|
if ($al->{$source."Links"}->{$j." ".$i}[0] ne "p" && $al->{$source."Links"}->{$j." ".$i}[0] ne "P"){ |
1512
|
0
|
|
|
|
|
|
push @{$sal->{$source."Al"}[$j]},$i; |
|
0
|
|
|
|
|
|
|
1513
|
|
|
|
|
|
|
} |
1514
|
|
|
|
|
|
|
} |
1515
|
|
|
|
|
|
|
} |
1516
|
|
|
|
|
|
|
} |
1517
|
|
|
|
|
|
|
} |
1518
|
0
|
|
|
|
|
|
return $sal; |
1519
|
|
|
|
|
|
|
} |
1520
|
|
|
|
|
|
|
|
1521
|
|
|
|
|
|
|
|
1522
|
|
|
|
|
|
|
1; |