| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
|
|
2
|
|
|
|
|
|
|
######################################################################## |
|
3
|
|
|
|
|
|
|
# Author: Patrik Lambert (lambert@talp.ucp.es) |
|
4
|
|
|
|
|
|
|
# Contributions from Adria de Gispert (agispert@gps.tsc.upc.es) |
|
5
|
|
|
|
|
|
|
# and Josep Maria Crego (jmcrego@gps.tsc.upc.es) |
|
6
|
|
|
|
|
|
|
# Description: Library of tools to process a set of links between the |
|
7
|
|
|
|
|
|
|
# words of two sentences. |
|
8
|
|
|
|
|
|
|
# |
|
9
|
|
|
|
|
|
|
#----------------------------------------------------------------------- |
|
10
|
|
|
|
|
|
|
# |
|
11
|
|
|
|
|
|
|
# Copyright 2004 by Patrik Lambert |
|
12
|
|
|
|
|
|
|
# |
|
13
|
|
|
|
|
|
|
# This program is free software; you can redistribute it and/or modify |
|
14
|
|
|
|
|
|
|
# it under the terms of the GNU General Public License as published by |
|
15
|
|
|
|
|
|
|
# the Free Software Foundation; either version 2 of the License, or |
|
16
|
|
|
|
|
|
|
# (at your option) any later version. |
|
17
|
|
|
|
|
|
|
# |
|
18
|
|
|
|
|
|
|
# This program is distributed in the hope that it will be useful, |
|
19
|
|
|
|
|
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
20
|
|
|
|
|
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|
21
|
|
|
|
|
|
|
# GNU General Public License for more details. |
|
22
|
|
|
|
|
|
|
# |
|
23
|
|
|
|
|
|
|
# You should have received a copy of the GNU General Public License |
|
24
|
|
|
|
|
|
|
# along with this program; if not, write to the Free Software |
|
25
|
|
|
|
|
|
|
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. |
|
26
|
|
|
|
|
|
|
######################################################################## |
|
27
|
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
package Lingua::Alignment; |
|
29
|
|
|
|
|
|
|
$VERSION=1.1; |
|
30
|
1
|
|
|
1
|
|
6
|
use strict; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
44
|
|
|
31
|
1
|
|
|
1
|
|
706
|
use Lingua::AlignmentSlice; |
|
|
1
|
|
|
|
|
5
|
|
|
|
1
|
|
|
|
|
50
|
|
|
32
|
1
|
|
|
1
|
|
14
|
use Lingua::AlSetLib 1.1; |
|
|
1
|
|
|
|
|
45
|
|
|
|
1
|
|
|
|
|
23
|
|
|
33
|
1
|
|
|
1
|
|
4
|
use Dumpvalue; |
|
|
1
|
|
|
|
|
1
|
|
|
|
1
|
|
|
|
|
12999
|
|
|
34
|
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
#an alignment is a hash with 4 components: |
|
36
|
|
|
|
|
|
|
# {sourceAl} ref to source position array, each position containing the array of aligned target positions. |
|
37
|
|
|
|
|
|
|
# Each linked target token is indicated with the array: (position,S(sure)/P(possible),confidence score) |
|
38
|
|
|
|
|
|
|
# {targetAl} same as sourceAl but reversed |
|
39
|
|
|
|
|
|
|
# {sourceWords} and {targetWords}: array of corresponding words |
|
40
|
|
|
|
|
|
|
# {sourceLinks}: hash (indexed by the source token position $j and target $i in the link: {$j $i} of arrays giving |
|
41
|
|
|
|
|
|
|
# {targetLinks}: same as sourceLinks, for target alignment |
|
42
|
|
|
|
|
|
|
# more information about the link: ( S(sure) or P(possible) , confidence ) |
|
43
|
|
|
|
|
|
|
sub new { |
|
44
|
20
|
|
|
20
|
0
|
23
|
my $pkg = shift; |
|
45
|
20
|
|
|
|
|
26
|
my $al = {}; |
|
46
|
|
|
|
|
|
|
|
|
47
|
20
|
|
|
|
|
50
|
$al->{sourceAl}=[]; |
|
48
|
20
|
|
|
|
|
52
|
$al->{targetAl}=[]; |
|
49
|
20
|
|
|
|
|
34
|
$al->{sourceWords} = []; |
|
50
|
20
|
|
|
|
|
33
|
$al->{targetWords} = []; |
|
51
|
20
|
|
|
|
|
36
|
$al->{sourceLinks} = {}; |
|
52
|
20
|
|
|
|
|
31
|
$al->{targetLinks} = {}; |
|
53
|
20
|
|
|
|
|
74
|
return bless $al,$pkg; |
|
54
|
|
|
|
|
|
|
} |
|
55
|
|
|
|
|
|
|
|
|
56
|
|
|
|
|
|
|
sub loadFromGiza { |
|
57
|
0
|
|
|
0
|
0
|
0
|
my ($al,$alignmentString,$targetString,$reverseAlignmentString) = @_; |
|
58
|
0
|
|
|
|
|
0
|
my ($i,$elem,$positionsString); |
|
59
|
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
#TARGET |
|
61
|
0
|
|
|
|
|
0
|
$targetString =~ s/^\s+//; #trim |
|
62
|
0
|
|
|
|
|
0
|
$targetString =~ s/\s+$//; #trim |
|
63
|
0
|
|
|
|
|
0
|
$targetString =~ s/\s{2,}/ /g; #remove multiple spaces |
|
64
|
0
|
0
|
|
|
|
0
|
if ($targetString !~ /^NULL /){ |
|
65
|
0
|
|
|
|
|
0
|
$al->{targetWords}=["NULL"]; #we keep a place for the NULL word of the other direction |
|
66
|
|
|
|
|
|
|
} |
|
67
|
0
|
|
|
|
|
0
|
push @{$al->{targetWords}},split(/ /,$targetString); |
|
|
0
|
|
|
|
|
0
|
|
|
68
|
|
|
|
|
|
|
|
|
69
|
0
|
|
|
|
|
0
|
$alignmentString =~ s/\s{2,}/ /g; #remove multiple spaces |
|
70
|
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
#SOURCE |
|
72
|
0
|
|
|
|
|
0
|
my $srcString = $alignmentString; |
|
73
|
0
|
|
|
|
|
0
|
$srcString =~ s/ \(\{[^\}]+\}\)//g; |
|
74
|
0
|
|
|
|
|
0
|
$srcString =~ s/^\s+//; $srcString =~ s/\s+$//; |
|
|
0
|
|
|
|
|
0
|
|
|
75
|
0
|
|
|
|
|
0
|
@{$al->{sourceWords}}=split / /,$srcString; |
|
|
0
|
|
|
|
|
0
|
|
|
76
|
|
|
|
|
|
|
|
|
77
|
|
|
|
|
|
|
#S2T LINKS |
|
78
|
|
|
|
|
|
|
# here you can't use a hash because you would loose the order |
|
79
|
0
|
|
|
|
|
0
|
$_ = $alignmentString; |
|
80
|
0
|
|
|
|
|
0
|
my @correspondances = /\(\{(.+?)\}\)/g; #take what is between parentesis ie links |
|
81
|
0
|
|
|
|
|
0
|
foreach my $positionsString (@correspondances){ |
|
82
|
0
|
|
|
|
|
0
|
$positionsString =~ s/^\s+//; #trim |
|
83
|
0
|
|
|
|
|
0
|
$positionsString =~ s/\s+$//; #trim |
|
84
|
0
|
|
|
|
|
0
|
push @{$al->{sourceAl}}, [split / /,$positionsString]; |
|
|
0
|
|
|
|
|
0
|
|
|
85
|
|
|
|
|
|
|
} |
|
86
|
|
|
|
|
|
|
|
|
87
|
|
|
|
|
|
|
#REVERSE ALIGNMENT |
|
88
|
0
|
0
|
|
|
|
0
|
if (length($reverseAlignmentString)>0){ |
|
89
|
0
|
|
|
|
|
0
|
$reverseAlignmentString =~ s/\(\{ \}\)/\(\{ \}\)/g; #insert blanks in unlinked words |
|
90
|
0
|
|
|
|
|
0
|
$reverseAlignmentString =~ s/\}\)\s*$//g; #rtrim |
|
91
|
|
|
|
|
|
|
|
|
92
|
0
|
|
|
|
|
0
|
@correspondances = split /\(\{\s|\}\)\s/, $reverseAlignmentString; |
|
93
|
0
|
|
|
|
|
0
|
for ($i=0;$i<@correspondances;$i+=2) { |
|
94
|
0
|
|
|
|
|
0
|
$positionsString = $correspondances[$i+1]; |
|
95
|
0
|
|
|
|
|
0
|
$positionsString =~ s/^\s+|\s+$//g; #trim |
|
96
|
0
|
|
|
|
|
0
|
$positionsString =~ s/\s{2,}/ /g; #remove multiple spaces |
|
97
|
0
|
|
|
|
|
0
|
push @{$al->{targetAl}}, [split / /,$positionsString]; |
|
|
0
|
|
|
|
|
0
|
|
|
98
|
|
|
|
|
|
|
} |
|
99
|
|
|
|
|
|
|
} |
|
100
|
|
|
|
|
|
|
} |
|
101
|
|
|
|
|
|
|
|
|
102
|
|
|
|
|
|
|
#input: $refToAlignedPairs_ts (target to source),$sourceSentence and $targetSentence are optional |
|
103
|
|
|
|
|
|
|
sub loadFromBlinker{ |
|
104
|
20
|
|
|
20
|
0
|
33
|
my ($al,$refToAlignedPairs_st,$refToAlignedPairs_ts,$sourceSentence,$targetSentence)=@_; |
|
105
|
20
|
|
|
|
|
22
|
my $i; |
|
106
|
|
|
|
|
|
|
my $pairStr; |
|
107
|
0
|
|
|
|
|
0
|
my @pair; |
|
108
|
0
|
|
|
|
|
0
|
my @pairs; |
|
109
|
|
|
|
|
|
|
|
|
110
|
|
|
|
|
|
|
#LOAD SENTENCES (if applicable) |
|
111
|
20
|
100
|
|
|
|
43
|
if (defined($sourceSentence)){ |
|
112
|
10
|
|
|
|
|
103
|
$sourceSentence =~ s/^\s+|\s+$//g; #trim |
|
113
|
10
|
|
|
|
|
28
|
$sourceSentence =~ s/\s{2,}/ /g; #remove multiple space |
|
114
|
|
|
|
|
|
|
|
|
115
|
10
|
50
|
|
|
|
34
|
if ($sourceSentence !~ /^NULL /){ |
|
116
|
0
|
|
|
|
|
0
|
$al->{sourceWords}=["NULL"]; |
|
117
|
|
|
|
|
|
|
} |
|
118
|
10
|
|
|
|
|
11
|
push @{$al->{sourceWords}},split(/ /,$sourceSentence); |
|
|
10
|
|
|
|
|
62
|
|
|
119
|
|
|
|
|
|
|
} |
|
120
|
20
|
100
|
|
|
|
38
|
if (defined($targetSentence)){ |
|
121
|
10
|
|
|
|
|
95
|
$targetSentence =~ s/^\s+|\s+$//g; |
|
122
|
10
|
|
|
|
|
28
|
$targetSentence =~ s/\s{2,}/ /g; |
|
123
|
|
|
|
|
|
|
|
|
124
|
10
|
50
|
|
|
|
30
|
if ($targetSentence !~ /^NULL /){ |
|
125
|
0
|
|
|
|
|
0
|
$al->{targetWords}=["NULL"]; |
|
126
|
|
|
|
|
|
|
} |
|
127
|
10
|
|
|
|
|
12
|
push @{$al->{targetWords}},split(/ /,$targetSentence); |
|
|
10
|
|
|
|
|
49
|
|
|
128
|
|
|
|
|
|
|
} |
|
129
|
|
|
|
|
|
|
|
|
130
|
|
|
|
|
|
|
#LOAD SOURCE TO TARGET ALIGNMENT: |
|
131
|
|
|
|
|
|
|
#read alignment data |
|
132
|
20
|
|
|
|
|
35
|
foreach $pairStr (@$refToAlignedPairs_st){ |
|
133
|
165
|
|
|
|
|
518
|
$pairStr =~ s/^\s+|\s+$//g; #trim |
|
134
|
165
|
|
|
|
|
287
|
$pairStr =~ s/\s{2,}/ /g; #remove multiple space |
|
135
|
165
|
|
|
|
|
413
|
@pair = split / /,$pairStr; |
|
136
|
165
|
|
|
|
|
173
|
push @{$pairs[$pair[0]]},$pair[1]; |
|
|
165
|
|
|
|
|
380
|
|
|
137
|
|
|
|
|
|
|
#load extra information (like S/P, confidence) |
|
138
|
165
|
50
|
|
|
|
402
|
if (@pair > 2){ |
|
139
|
0
|
|
|
|
|
0
|
$al->{sourceLinks}->{$pair[0]." ".$pair[1]}=[splice(@pair,2)] ; |
|
140
|
|
|
|
|
|
|
} |
|
141
|
|
|
|
|
|
|
} |
|
142
|
|
|
|
|
|
|
# take into account unaligned words to have no undef entry in array: |
|
143
|
|
|
|
|
|
|
# Since we really want to think in terms of alignment and not words, we don't base ourself on the number of words |
|
144
|
20
|
|
|
|
|
52
|
for ($i=0;$i<@pairs;$i++){ |
|
145
|
152
|
100
|
|
|
|
241
|
if (defined($pairs[$i])){ |
|
146
|
124
|
|
|
|
|
99
|
push @{$al->{sourceAl}},$pairs[$i]; |
|
|
124
|
|
|
|
|
351
|
|
|
147
|
|
|
|
|
|
|
}else{ |
|
148
|
28
|
|
|
|
|
29
|
push @{$al->{sourceAl}},[]; |
|
|
28
|
|
|
|
|
96
|
|
|
149
|
|
|
|
|
|
|
} |
|
150
|
|
|
|
|
|
|
} |
|
151
|
|
|
|
|
|
|
# print main::Dumper($refToAlignedPairs_st,$al->{sourceAl}); |
|
152
|
|
|
|
|
|
|
|
|
153
|
|
|
|
|
|
|
#LOAD TARGET TO SOURCE ALIGNMENT: |
|
154
|
20
|
50
|
|
|
|
39
|
if (defined($refToAlignedPairs_ts)){ |
|
155
|
20
|
50
|
|
|
|
135
|
if (@$refToAlignedPairs_ts>0){ |
|
156
|
0
|
|
|
|
|
0
|
@pairs=(); |
|
157
|
|
|
|
|
|
|
#read alignment data |
|
158
|
0
|
|
|
|
|
0
|
foreach $pairStr (@$refToAlignedPairs_ts){ |
|
159
|
0
|
|
|
|
|
0
|
$pairStr =~ s/^\s+|\s+$//g; #trim |
|
160
|
0
|
|
|
|
|
0
|
$pairStr =~ s/\s{2,}/ /g; #remove multiple space |
|
161
|
0
|
|
|
|
|
0
|
@pair = split / /,$pairStr; |
|
162
|
0
|
|
|
|
|
0
|
push @{$pairs[$pair[0]]},$pair[1]; |
|
|
0
|
|
|
|
|
0
|
|
|
163
|
|
|
|
|
|
|
#load extra information (like S/P, confidence) |
|
164
|
0
|
0
|
|
|
|
0
|
if (@pair > 2){ |
|
165
|
0
|
|
|
|
|
0
|
$al->{targetLinks}->{$pair[0]." ".$pair[1]}=[splice(@pair,2)] ; |
|
166
|
|
|
|
|
|
|
} |
|
167
|
|
|
|
|
|
|
} |
|
168
|
|
|
|
|
|
|
# take into account unaligned words to have no undef entry in array: |
|
169
|
0
|
|
|
|
|
0
|
for ($i=0;$i<@pairs;$i++){ |
|
170
|
0
|
0
|
|
|
|
0
|
if (defined($pairs[$i])){ |
|
171
|
0
|
|
|
|
|
0
|
push @{$al->{targetAl}},$pairs[$i]; |
|
|
0
|
|
|
|
|
0
|
|
|
172
|
|
|
|
|
|
|
}else{ |
|
173
|
0
|
|
|
|
|
0
|
push @{$al->{targetAl}},[]; |
|
|
0
|
|
|
|
|
0
|
|
|
174
|
|
|
|
|
|
|
} |
|
175
|
|
|
|
|
|
|
} |
|
176
|
|
|
|
|
|
|
} |
|
177
|
|
|
|
|
|
|
} |
|
178
|
|
|
|
|
|
|
# print main::Dumper($refToAlignedPairs_ts,$al->{targetAl}); |
|
179
|
|
|
|
|
|
|
} |
|
180
|
|
|
|
|
|
|
|
|
181
|
|
|
|
|
|
|
sub loadFromTalp{ |
|
182
|
0
|
|
|
0
|
0
|
0
|
my ($al,$st_string,$ts_string,$sourceSentence,$targetSentence)=@_; |
|
183
|
|
|
|
|
|
|
|
|
184
|
|
|
|
|
|
|
#LOAD SENTENCES (if applicable) |
|
185
|
0
|
0
|
|
|
|
0
|
if (defined($sourceSentence)){ |
|
186
|
0
|
|
|
|
|
0
|
$sourceSentence =~ s/^\s+//g; #trim |
|
187
|
0
|
|
|
|
|
0
|
$sourceSentence =~ s/\s+$//g; #trim |
|
188
|
0
|
|
|
|
|
0
|
$sourceSentence =~ s/\s{2,}/ /g; #remove multiple space |
|
189
|
0
|
0
|
|
|
|
0
|
if ($sourceSentence !~ /^NULL /){ |
|
190
|
0
|
|
|
|
|
0
|
$al->{sourceWords}=["NULL"]; |
|
191
|
|
|
|
|
|
|
} |
|
192
|
0
|
|
|
|
|
0
|
push @{$al->{sourceWords}},split(/ /,$sourceSentence); |
|
|
0
|
|
|
|
|
0
|
|
|
193
|
|
|
|
|
|
|
} |
|
194
|
0
|
0
|
|
|
|
0
|
if (defined($targetSentence)){ |
|
195
|
0
|
|
|
|
|
0
|
$targetSentence =~ s/^\s+//g; |
|
196
|
0
|
|
|
|
|
0
|
$targetSentence =~ s/\s+$//g; |
|
197
|
0
|
|
|
|
|
0
|
$targetSentence =~ s/\s{2,}/ /g; |
|
198
|
0
|
0
|
|
|
|
0
|
if ($targetSentence !~ /^NULL /){ |
|
199
|
0
|
|
|
|
|
0
|
$al->{targetWords}=["NULL"]; |
|
200
|
|
|
|
|
|
|
} |
|
201
|
0
|
|
|
|
|
0
|
push @{$al->{targetWords}},split(/ /,$targetSentence); |
|
|
0
|
|
|
|
|
0
|
|
|
202
|
|
|
|
|
|
|
} |
|
203
|
|
|
|
|
|
|
|
|
204
|
|
|
|
|
|
|
#LOAD SOURCE TO TARGET ALIGNMENT: |
|
205
|
0
|
0
|
|
|
|
0
|
if ($st_string ne ""){ |
|
206
|
0
|
|
|
|
|
0
|
my @pairs; |
|
207
|
0
|
|
|
|
|
0
|
$st_string =~ s/\s{2,}/ /g; #remove multiple space |
|
208
|
0
|
|
|
|
|
0
|
$st_string =~ s/^\s+//g; #trim |
|
209
|
0
|
|
|
|
|
0
|
$st_string =~ s/\s+$//g; #trim |
|
210
|
|
|
|
|
|
|
#read alignment data |
|
211
|
0
|
|
|
|
|
0
|
my @lnks=split (/ /,$st_string); |
|
212
|
0
|
|
|
|
|
0
|
foreach my $pairStr (@lnks){ |
|
213
|
0
|
|
|
|
|
0
|
my @info = split /:/,$pairStr; |
|
214
|
0
|
|
|
|
|
0
|
my ($src,$sep,$trg) = split /([^\d])/,$info[0]; |
|
215
|
0
|
|
|
|
|
0
|
push @{$pairs[$src]},$trg; |
|
|
0
|
|
|
|
|
0
|
|
|
216
|
|
|
|
|
|
|
#load extra information (like S/P, confidence) |
|
217
|
0
|
0
|
|
|
|
0
|
if ($sep eq "s"){ |
|
|
|
0
|
|
|
|
|
|
|
218
|
0
|
|
|
|
|
0
|
$al->{sourceLinks}->{$src." ".$trg}=["S"]; |
|
219
|
|
|
|
|
|
|
}elsif ($sep eq "p" ){ |
|
220
|
0
|
|
|
|
|
0
|
$al->{sourceLinks}->{$src." ".$trg}=["P"]; |
|
221
|
|
|
|
|
|
|
} |
|
222
|
0
|
|
|
|
|
0
|
for (my $i=1;$i<@info;$i++){ |
|
223
|
0
|
|
|
|
|
0
|
push @{$al->{sourceLinks}->{$src." ".$trg}},$info[$i]; |
|
|
0
|
|
|
|
|
0
|
|
|
224
|
|
|
|
|
|
|
} |
|
225
|
|
|
|
|
|
|
} |
|
226
|
|
|
|
|
|
|
# take into account unaligned words to have no undef entry in array: |
|
227
|
|
|
|
|
|
|
# Since we really want to think in terms of alignment and not words, we don't base ourself on the number of words |
|
228
|
0
|
|
|
|
|
0
|
for (my $i=0;$i<@pairs;$i++){ |
|
229
|
0
|
0
|
|
|
|
0
|
if (defined($pairs[$i])){ |
|
230
|
0
|
|
|
|
|
0
|
push @{$al->{sourceAl}},$pairs[$i]; |
|
|
0
|
|
|
|
|
0
|
|
|
231
|
|
|
|
|
|
|
}else{ |
|
232
|
0
|
|
|
|
|
0
|
push @{$al->{sourceAl}},[]; |
|
|
0
|
|
|
|
|
0
|
|
|
233
|
|
|
|
|
|
|
} |
|
234
|
|
|
|
|
|
|
} |
|
235
|
|
|
|
|
|
|
} |
|
236
|
|
|
|
|
|
|
# print main::Dumper($refToAlignedPairs_st,$al->{sourceAl}); |
|
237
|
0
|
|
|
|
|
0
|
my $refToAlignedPairs_ts; |
|
238
|
|
|
|
|
|
|
my $pairStr; |
|
239
|
0
|
|
|
|
|
0
|
my @pair; |
|
240
|
|
|
|
|
|
|
#LOAD TARGET TO SOURCE ALIGNMENT: |
|
241
|
0
|
0
|
|
|
|
0
|
if ($ts_string ne ""){ |
|
242
|
0
|
|
|
|
|
0
|
$ts_string =~ s/^\s+|\s+$//g; #trim |
|
243
|
0
|
|
|
|
|
0
|
$ts_string =~ s/\s{2,}/ /g; #remove multiple space |
|
244
|
0
|
|
|
|
|
0
|
my @pairs=(); |
|
245
|
|
|
|
|
|
|
#read alignment data |
|
246
|
0
|
|
|
|
|
0
|
my @lnks=split (/ /,$ts_string); |
|
247
|
0
|
|
|
|
|
0
|
foreach my $pairStr (@lnks){ |
|
248
|
0
|
|
|
|
|
0
|
my @info = split /:/,$pairStr; |
|
249
|
0
|
|
|
|
|
0
|
my ($src,$sep,$trg) = split /([^\d])/,$info[0]; |
|
250
|
0
|
|
|
|
|
0
|
push @{$pairs[$src]},$trg; |
|
|
0
|
|
|
|
|
0
|
|
|
251
|
|
|
|
|
|
|
#load extra information (like S/P, confidence) |
|
252
|
0
|
0
|
|
|
|
0
|
if ($sep eq "s"){ |
|
|
|
0
|
|
|
|
|
|
|
253
|
0
|
|
|
|
|
0
|
$al->{targetLinks}->{$src." ".$trg}=["S"]; |
|
254
|
|
|
|
|
|
|
}elsif ($sep eq "p" ){ |
|
255
|
0
|
|
|
|
|
0
|
$al->{targetLinks}->{$src." ".$trg}=["P"]; |
|
256
|
|
|
|
|
|
|
} |
|
257
|
0
|
|
|
|
|
0
|
for (my $i=1;$i<@info;$i++){ |
|
258
|
0
|
|
|
|
|
0
|
push @{$al->{targetLinks}->{$src." ".$trg}},$info[$i]; |
|
|
0
|
|
|
|
|
0
|
|
|
259
|
|
|
|
|
|
|
} |
|
260
|
|
|
|
|
|
|
} |
|
261
|
|
|
|
|
|
|
# take into account unaligned words to have no undef entry in array: |
|
262
|
0
|
|
|
|
|
0
|
for (my $i=0;$i<@pairs;$i++){ |
|
263
|
0
|
0
|
|
|
|
0
|
if (defined($pairs[$i])){ |
|
264
|
0
|
|
|
|
|
0
|
push @{$al->{targetAl}},$pairs[$i]; |
|
|
0
|
|
|
|
|
0
|
|
|
265
|
|
|
|
|
|
|
}else{ |
|
266
|
0
|
|
|
|
|
0
|
push @{$al->{targetAl}},[]; |
|
|
0
|
|
|
|
|
0
|
|
|
267
|
|
|
|
|
|
|
} |
|
268
|
|
|
|
|
|
|
} |
|
269
|
|
|
|
|
|
|
} |
|
270
|
|
|
|
|
|
|
# print main::Dumper($refToAlignedPairs_ts,$al->{targetAl}); |
|
271
|
|
|
|
|
|
|
} |
|
272
|
|
|
|
|
|
|
|
|
273
|
|
|
|
|
|
|
# sourceSentence: returns the target sentence tokens without NULL word (separated by " "), by parsing the alignment object |
|
274
|
|
|
|
|
|
|
sub sourceSentence { |
|
275
|
0
|
|
|
0
|
0
|
0
|
my $al = shift; |
|
276
|
0
|
|
|
|
|
0
|
my @sentence=@{$al->{sourceWords}}; |
|
|
0
|
|
|
|
|
0
|
|
|
277
|
0
|
|
|
|
|
0
|
shift @sentence; |
|
278
|
0
|
|
|
|
|
0
|
return join " ",@sentence; |
|
279
|
|
|
|
|
|
|
} |
|
280
|
|
|
|
|
|
|
|
|
281
|
|
|
|
|
|
|
# TargetSentence: returns the target sentence tokens without NULL word (separated by " "), by parsing the alignment object |
|
282
|
|
|
|
|
|
|
sub targetSentence { |
|
283
|
0
|
|
|
0
|
0
|
0
|
my $al = shift; |
|
284
|
0
|
|
|
|
|
0
|
my @sentence=@{$al->{targetWords}}; |
|
|
0
|
|
|
|
|
0
|
|
|
285
|
0
|
|
|
|
|
0
|
shift @sentence; |
|
286
|
0
|
|
|
|
|
0
|
return join " ",@sentence; |
|
287
|
|
|
|
|
|
|
} |
|
288
|
|
|
|
|
|
|
|
|
289
|
|
|
|
|
|
|
# Remove links to NULL. |
|
290
|
|
|
|
|
|
|
# Note: to do this we need the alignment to be loaded so we do it in a separate function |
|
291
|
|
|
|
|
|
|
sub forceNoNullAlign { |
|
292
|
20
|
|
|
20
|
0
|
22
|
my $al = shift; |
|
293
|
20
|
|
|
|
|
20
|
my ($j,$i); |
|
294
|
0
|
|
|
|
|
0
|
my $continue; |
|
295
|
0
|
|
|
|
|
0
|
my $source; |
|
296
|
20
|
|
|
|
|
41
|
my @sides=("source","target"); |
|
297
|
|
|
|
|
|
|
|
|
298
|
20
|
|
|
|
|
25
|
foreach $source (@sides){ |
|
299
|
40
|
|
|
|
|
99
|
$al->{$source."Al"}[0]=[]; |
|
300
|
40
|
|
|
|
|
55
|
for ($j=1;$j<@{$al->{$source."Al"}};$j++){ |
|
|
172
|
|
|
|
|
504
|
|
|
301
|
132
|
50
|
|
|
|
335
|
if ($al->isIn($source."Al",$j,0)){ |
|
302
|
0
|
|
|
|
|
0
|
$continue=1; |
|
303
|
0
|
|
0
|
|
|
0
|
for ($i=0;$i<@{$al->{$source."Al"}[$j]} && $continue;$i++){ |
|
|
0
|
|
|
|
|
0
|
|
|
304
|
0
|
0
|
|
|
|
0
|
if ($al->{$source."Al"}[$j][$i]==0){ |
|
305
|
0
|
|
|
|
|
0
|
splice(@{$al->{$source."Al"}[$j]}, $i, 1); |
|
|
0
|
|
|
|
|
0
|
|
|
306
|
0
|
|
|
|
|
0
|
$continue=0; |
|
307
|
|
|
|
|
|
|
} |
|
308
|
|
|
|
|
|
|
} |
|
309
|
|
|
|
|
|
|
} |
|
310
|
|
|
|
|
|
|
} |
|
311
|
|
|
|
|
|
|
} #foreach |
|
312
|
|
|
|
|
|
|
} |
|
313
|
|
|
|
|
|
|
|
|
314
|
|
|
|
|
|
|
# Link to NULL with a P (Possible) alignment all words that are not linked to anything |
|
315
|
|
|
|
|
|
|
sub forceNullAlign { |
|
316
|
0
|
|
|
0
|
0
|
0
|
my $al = shift; |
|
317
|
0
|
|
|
|
|
0
|
my ($j,$i); |
|
318
|
0
|
|
|
|
|
0
|
my @reverseAl; |
|
319
|
0
|
|
|
|
|
0
|
my $source; |
|
320
|
0
|
|
|
|
|
0
|
my @sides=("source","target"); |
|
321
|
|
|
|
|
|
|
|
|
322
|
0
|
|
|
|
|
0
|
foreach $source (@sides){ |
|
323
|
0
|
|
|
|
|
0
|
@reverseAl = (); |
|
324
|
0
|
|
|
|
|
0
|
for ($j=1;$j<@{$al->{$source."Al"}};$j++){ |
|
|
0
|
|
|
|
|
0
|
|
|
325
|
0
|
0
|
|
|
|
0
|
if (@{$al->{$source."Al"}[$j]}==0){ |
|
|
0
|
|
|
|
|
0
|
|
|
326
|
0
|
|
|
|
|
0
|
push @{$al->{$source."Al"}[$j]},0; |
|
|
0
|
|
|
|
|
0
|
|
|
327
|
0
|
|
|
|
|
0
|
$al->{$source."Links"}->{"$j 0"}= ["P"]; |
|
328
|
|
|
|
|
|
|
}else{ |
|
329
|
0
|
|
|
|
|
0
|
foreach $i (@{$al->{$source."Al"}[$j]}){ |
|
|
0
|
|
|
|
|
0
|
|
|
330
|
0
|
|
|
|
|
0
|
push @{$reverseAl[$i]},$j; |
|
|
0
|
|
|
|
|
0
|
|
|
331
|
|
|
|
|
|
|
} |
|
332
|
|
|
|
|
|
|
} |
|
333
|
|
|
|
|
|
|
} |
|
334
|
0
|
|
|
|
|
0
|
for ($i=1;$i<@reverseAl;$i++){ |
|
335
|
0
|
0
|
0
|
|
|
0
|
if (!defined($reverseAl[$i]) || @{$reverseAl[$i]}==0){ |
|
|
0
|
|
|
|
|
0
|
|
|
336
|
0
|
0
|
|
|
|
0
|
if (!$al->isIn($source."Al",0,$i)){ |
|
337
|
0
|
|
|
|
|
0
|
push @{$al->{$source."Al"}[0]},$i; |
|
|
0
|
|
|
|
|
0
|
|
|
338
|
0
|
|
|
|
|
0
|
$al->{$source."Links"}->{"0 $i"}= ["P"]; |
|
339
|
|
|
|
|
|
|
} |
|
340
|
|
|
|
|
|
|
} |
|
341
|
|
|
|
|
|
|
} |
|
342
|
|
|
|
|
|
|
} #foreach |
|
343
|
|
|
|
|
|
|
} |
|
344
|
|
|
|
|
|
|
|
|
345
|
|
|
|
|
|
|
sub writeToBlinker{ |
|
346
|
20
|
|
|
20
|
0
|
25
|
my $al = shift; |
|
347
|
20
|
|
|
|
|
25
|
my $side = shift; #optional; default:"source"; |
|
348
|
20
|
50
|
|
|
|
49
|
if (!defined($side)){$side="source"} |
|
|
20
|
|
|
|
|
26
|
|
|
349
|
20
|
|
|
|
|
27
|
my @lines = (); |
|
350
|
20
|
|
|
|
|
18
|
my ($i,$j); |
|
351
|
|
|
|
|
|
|
|
|
352
|
20
|
|
|
|
|
26
|
for ($j=0;$j<@{$al->{$side."Al"}};$j++){ |
|
|
172
|
|
|
|
|
400
|
|
|
353
|
152
|
|
|
|
|
142
|
foreach $i (@{$al->{$side."Al"}[$j]}){ |
|
|
152
|
|
|
|
|
349
|
|
|
354
|
149
|
50
|
|
|
|
134
|
if (${$al->{$side."Links"}}{"$j $i"}){ |
|
|
149
|
|
|
|
|
351
|
|
|
355
|
0
|
|
|
|
|
0
|
push @lines,"$j $i ".join(" ",@{$al->{$side."Links"}{"$j $i"}}); |
|
|
0
|
|
|
|
|
0
|
|
|
356
|
|
|
|
|
|
|
}else{ |
|
357
|
149
|
|
|
|
|
438
|
push @lines,"$j $i"; |
|
358
|
|
|
|
|
|
|
} |
|
359
|
|
|
|
|
|
|
} |
|
360
|
|
|
|
|
|
|
} |
|
361
|
20
|
|
|
|
|
75
|
return \@lines; |
|
362
|
|
|
|
|
|
|
} |
|
363
|
|
|
|
|
|
|
|
|
364
|
|
|
|
|
|
|
sub writeToGiza{ |
|
365
|
0
|
|
|
0
|
0
|
0
|
my $al = shift; |
|
366
|
0
|
|
|
|
|
0
|
my $side = shift; #optional; default:"source"; |
|
367
|
|
|
|
|
|
|
|
|
368
|
|
|
|
|
|
|
# first line |
|
369
|
0
|
|
|
|
|
0
|
my @lines = (); |
|
370
|
0
|
|
|
|
|
0
|
push @lines,"#\n"; |
|
371
|
|
|
|
|
|
|
|
|
372
|
|
|
|
|
|
|
# second line |
|
373
|
0
|
|
|
|
|
0
|
my $invSide; |
|
374
|
0
|
0
|
|
|
|
0
|
if (!defined($side)){$side="source"} |
|
|
0
|
|
|
|
|
0
|
|
|
375
|
0
|
0
|
|
|
|
0
|
if ($side eq "source"){ |
|
376
|
0
|
|
|
|
|
0
|
$invSide="target"; |
|
377
|
0
|
|
|
|
|
0
|
push @lines,$al->targetSentence."\n"; |
|
378
|
|
|
|
|
|
|
}else{ |
|
379
|
0
|
|
|
|
|
0
|
$invSide="source"; |
|
380
|
0
|
|
|
|
|
0
|
push @lines,$al->sourceSentence."\n"; |
|
381
|
|
|
|
|
|
|
} |
|
382
|
|
|
|
|
|
|
|
|
383
|
|
|
|
|
|
|
# third line |
|
384
|
0
|
|
|
|
|
0
|
my $linksStr=""; |
|
385
|
0
|
|
|
|
|
0
|
for (my $j=0;$j<@{$al->{$side."Words"}};$j++){ |
|
|
0
|
|
|
|
|
0
|
|
|
386
|
0
|
|
|
|
|
0
|
$linksStr.=$al->{$side."Words"}->[$j].' ({ '; |
|
387
|
0
|
|
|
|
|
0
|
foreach my $i (@{$al->{$side."Al"}[$j]}){ |
|
|
0
|
|
|
|
|
0
|
|
|
388
|
0
|
|
|
|
|
0
|
$linksStr.="$i "; |
|
389
|
|
|
|
|
|
|
} |
|
390
|
0
|
|
|
|
|
0
|
$linksStr.='}) '; |
|
391
|
|
|
|
|
|
|
} |
|
392
|
0
|
|
|
|
|
0
|
$linksStr =~ s/\s+$//; |
|
393
|
0
|
|
|
|
|
0
|
$linksStr.="\n"; |
|
394
|
0
|
|
|
|
|
0
|
push @lines,$linksStr; |
|
395
|
|
|
|
|
|
|
# print "GIZA OUTPUT:\n",join("\n",@lines); |
|
396
|
0
|
|
|
|
|
0
|
return join("",@lines); |
|
397
|
|
|
|
|
|
|
} |
|
398
|
|
|
|
|
|
|
|
|
399
|
|
|
|
|
|
|
sub writeToTalp{ |
|
400
|
0
|
|
|
0
|
0
|
0
|
my $al = shift; |
|
401
|
0
|
|
|
|
|
0
|
my $side = shift; #optional; default:"source"; |
|
402
|
0
|
0
|
|
|
|
0
|
if (!defined($side)){$side="source"} |
|
|
0
|
|
|
|
|
0
|
|
|
403
|
0
|
|
|
|
|
0
|
my @lines = (); |
|
404
|
0
|
|
|
|
|
0
|
my ($i,$j); |
|
405
|
|
|
|
|
|
|
|
|
406
|
0
|
|
|
|
|
0
|
for ($j=0;$j<@{$al->{$side."Al"}};$j++){ |
|
|
0
|
|
|
|
|
0
|
|
|
407
|
0
|
|
|
|
|
0
|
foreach $i (@{$al->{$side."Al"}[$j]}){ |
|
|
0
|
|
|
|
|
0
|
|
|
408
|
0
|
0
|
|
|
|
0
|
if (${$al->{$side."Links"}}{"$j $i"}){ |
|
|
0
|
|
|
|
|
0
|
|
|
409
|
0
|
|
|
|
|
0
|
my $lk="$j".lc(${$al->{$side."Links"}{"$j $i"}}[0])."$i"; |
|
|
0
|
|
|
|
|
0
|
|
|
410
|
0
|
|
|
|
|
0
|
for (my $k=1;$k<@{$al->{$side."Links"}{"$j $i"}};$k++){ |
|
|
0
|
|
|
|
|
0
|
|
|
411
|
0
|
|
|
|
|
0
|
$lk.=":".${$al->{$side."Links"}{"$j $i"}}[$k]; |
|
|
0
|
|
|
|
|
0
|
|
|
412
|
|
|
|
|
|
|
} |
|
413
|
0
|
|
|
|
|
0
|
push @lines,$lk; |
|
414
|
|
|
|
|
|
|
}else{ |
|
415
|
0
|
|
|
|
|
0
|
push @lines,$j."-".$i; |
|
416
|
|
|
|
|
|
|
} |
|
417
|
|
|
|
|
|
|
} |
|
418
|
|
|
|
|
|
|
} |
|
419
|
0
|
|
|
|
|
0
|
return join(" ",@lines); |
|
420
|
|
|
|
|
|
|
} |
|
421
|
|
|
|
|
|
|
|
|
422
|
|
|
|
|
|
|
sub output { |
|
423
|
0
|
|
|
0
|
0
|
0
|
my ($al,$FH,$newFormat,$newFH,$newLocation,$internalSentPairNum)=@_; |
|
424
|
0
|
|
|
|
|
0
|
my $dumper = new Dumpvalue; |
|
425
|
0
|
0
|
|
|
|
0
|
if ($newFormat eq "TALP"){ |
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
426
|
0
|
0
|
|
|
|
0
|
if ($newFH->{source}){ |
|
427
|
0
|
|
|
|
|
0
|
$newFH->{source}->print($al->sourceSentence."\n"); |
|
428
|
|
|
|
|
|
|
} |
|
429
|
0
|
0
|
|
|
|
0
|
if ($newFH->{target}){ |
|
430
|
0
|
|
|
|
|
0
|
$newFH->{target}->print($al->targetSentence."\n"); |
|
431
|
|
|
|
|
|
|
} |
|
432
|
0
|
0
|
|
|
|
0
|
if ($newFH->{sourceToTarget}){ |
|
433
|
0
|
|
|
|
|
0
|
$newFH->{sourceToTarget}->print($al->writeToTalp("source")."\n"); |
|
434
|
|
|
|
|
|
|
} |
|
435
|
0
|
0
|
|
|
|
0
|
if ($newFH->{targetToSource}){ |
|
436
|
0
|
|
|
|
|
0
|
$newFH->{targetToSource}->print($al->writeToTalp("target")."\n"); |
|
437
|
|
|
|
|
|
|
} |
|
438
|
|
|
|
|
|
|
}elsif ($newFormat eq "NAACL"){ |
|
439
|
0
|
0
|
|
|
|
0
|
if ($newFH->{source}){ |
|
440
|
0
|
|
|
|
|
0
|
$newFH->{source}->print(" ".$al->sourceSentence." \n"); |
|
441
|
|
|
|
|
|
|
} |
|
442
|
0
|
0
|
|
|
|
0
|
if ($newFH->{target}){ |
|
443
|
0
|
|
|
|
|
0
|
$newFH->{target}->print(" ".$al->targetSentence." \n"); |
|
444
|
|
|
|
|
|
|
} |
|
445
|
0
|
|
|
|
|
0
|
my $lines = $al->writeToBlinker("source"); |
|
446
|
0
|
|
|
|
|
0
|
foreach my $line (@$lines){ |
|
447
|
0
|
|
|
|
|
0
|
$newFH->{sourceToTarget}->print("$internalSentPairNum $line\n"); |
|
448
|
|
|
|
|
|
|
} |
|
449
|
0
|
0
|
|
|
|
0
|
if ($newFH->{targetToSource}){ |
|
450
|
0
|
|
|
|
|
0
|
$lines = $al->writeToBlinker("target"); |
|
451
|
0
|
|
|
|
|
0
|
foreach my $line (@$lines){ |
|
452
|
0
|
|
|
|
|
0
|
$newFH->{targetToSource}->print("$internalSentPairNum $line\n"); |
|
453
|
|
|
|
|
|
|
} |
|
454
|
|
|
|
|
|
|
} |
|
455
|
|
|
|
|
|
|
}elsif ($newFormat eq "GIZA"){ |
|
456
|
0
|
0
|
|
|
|
0
|
if (exists($newFH->{sourceToTarget})){ |
|
457
|
0
|
|
|
|
|
0
|
$newFH->{sourceToTarget}->print("".$al->writeToGiza("source")); |
|
458
|
|
|
|
|
|
|
} |
|
459
|
0
|
0
|
|
|
|
0
|
if (exists($newFH->{targetToSource})){ |
|
460
|
0
|
|
|
|
|
0
|
$newFH->{targetToSource}->print("".$al->writeToGiza("target")); |
|
461
|
|
|
|
|
|
|
} |
|
462
|
|
|
|
|
|
|
}elsif ($newFormat eq "BLINKER"){ |
|
463
|
0
|
0
|
|
|
|
0
|
if ($newFH->{source}){ |
|
464
|
0
|
|
|
|
|
0
|
$newFH->{source}->print($al->sourceSentence."\n"); |
|
465
|
|
|
|
|
|
|
} |
|
466
|
0
|
0
|
|
|
|
0
|
if ($newFH->{target}){ |
|
467
|
0
|
|
|
|
|
0
|
$newFH->{target}->print($al->targetSentence."\n"); |
|
468
|
|
|
|
|
|
|
} |
|
469
|
0
|
|
|
|
|
0
|
my $blinkerFile = $newLocation->{sourceToTarget}."/samp".$newLocation->{sampleNum}.".SentPair".($internalSentPairNum-1); |
|
470
|
0
|
|
0
|
|
|
0
|
open BLINKER, ">$blinkerFile" || die "Blinker file $blinkerFile opening problem:$!"; |
|
471
|
0
|
|
|
|
|
0
|
my $lines = $al->writeToBlinker("source"); |
|
472
|
0
|
|
|
|
|
0
|
foreach my $line (@$lines){ |
|
473
|
0
|
|
|
|
|
0
|
print BLINKER "$line\n"; |
|
474
|
|
|
|
|
|
|
} |
|
475
|
0
|
|
|
|
|
0
|
close BLINKER; |
|
476
|
0
|
0
|
|
|
|
0
|
if ($newLocation->{targetToSource}){ |
|
477
|
0
|
|
|
|
|
0
|
$blinkerFile = $newLocation->{targetToSource}."/samp".$newLocation->{sampleNum}.".SentPair".($internalSentPairNum-1); |
|
478
|
0
|
|
0
|
|
|
0
|
open BLINKER, ">$blinkerFile" || die "Blinker file $blinkerFile opening problem:$!"; |
|
479
|
0
|
|
|
|
|
0
|
my $lines = $al->writeToBlinker("target"); |
|
480
|
0
|
|
|
|
|
0
|
foreach my $line (@$lines){ |
|
481
|
0
|
|
|
|
|
0
|
print BLINKER "$line\n"; |
|
482
|
|
|
|
|
|
|
} |
|
483
|
0
|
|
|
|
|
0
|
close BLINKER; |
|
484
|
|
|
|
|
|
|
} |
|
485
|
|
|
|
|
|
|
}else { |
|
486
|
0
|
|
|
|
|
0
|
die "Output to format $newFormat is not implemented yet."; |
|
487
|
|
|
|
|
|
|
} |
|
488
|
|
|
|
|
|
|
} |
|
489
|
|
|
|
|
|
|
|
|
490
|
|
|
|
|
|
|
sub displayAsLinkEnumeration { |
|
491
|
0
|
|
|
0
|
0
|
0
|
my ($al,$format,$latex) = @_; |
|
492
|
0
|
|
|
|
|
0
|
my $lines=""; |
|
493
|
|
|
|
|
|
|
|
|
494
|
|
|
|
|
|
|
|
|
495
|
0
|
0
|
|
|
|
0
|
if ($format eq "text"){ |
|
|
|
0
|
|
|
|
|
|
|
496
|
0
|
|
|
|
|
0
|
my ($correspPosition,$wordPosition); |
|
497
|
|
|
|
|
|
|
|
|
498
|
0
|
|
|
|
|
0
|
$lines.= join(" ",@{$al->{sourceWords}})."\n"; |
|
|
0
|
|
|
|
|
0
|
|
|
499
|
0
|
|
|
|
|
0
|
$lines.= join(" ",@{$al->{targetWords}})."\n\n"; |
|
|
0
|
|
|
|
|
0
|
|
|
500
|
|
|
|
|
|
|
|
|
501
|
0
|
|
|
|
|
0
|
for ($wordPosition=0;$wordPosition<@{$al->{sourceWords}};$wordPosition++){ |
|
|
0
|
|
|
|
|
0
|
|
|
502
|
0
|
|
|
|
|
0
|
$lines.= @{$al->{sourceWords}}[$wordPosition]." <- "; |
|
|
0
|
|
|
|
|
0
|
|
|
503
|
0
|
|
|
|
|
0
|
foreach $correspPosition (@{$al->{sourceAl}[$wordPosition]}){ |
|
|
0
|
|
|
|
|
0
|
|
|
504
|
0
|
|
|
|
|
0
|
$lines.= $al->{targetWords}[$correspPosition]." "; |
|
505
|
|
|
|
|
|
|
} |
|
506
|
0
|
|
|
|
|
0
|
$lines.= "\n"; |
|
507
|
|
|
|
|
|
|
} |
|
508
|
0
|
|
|
|
|
0
|
$lines.="\n\n"; |
|
509
|
|
|
|
|
|
|
}elsif ($format eq "latex"){ |
|
510
|
0
|
|
|
|
|
0
|
my $numRowTokens = @{$al->{sourceWords}}; |
|
|
0
|
|
|
|
|
0
|
|
|
511
|
0
|
|
|
|
|
0
|
my $numColTokens = @{$al->{targetWords}}; |
|
|
0
|
|
|
|
|
0
|
|
|
512
|
0
|
|
|
|
|
0
|
my ($i,$j,$elt); |
|
513
|
0
|
|
|
|
|
0
|
my ($j_partOf_Bi,$i_partOf_Bj); |
|
514
|
0
|
|
|
|
|
0
|
my ($targetWord,$sourceWord); |
|
515
|
|
|
|
|
|
|
|
|
516
|
0
|
|
|
|
|
0
|
$lines.= $latex->fromText("\n".join(" ",@{$al->{sourceWords}})."\n"); |
|
|
0
|
|
|
|
|
0
|
|
|
517
|
0
|
|
|
|
|
0
|
$lines.= $latex->fromText(join(" ",@{$al->{targetWords}})."\n\n").'\vspace{5mm}'."\n"; |
|
|
0
|
|
|
|
|
0
|
|
|
518
|
|
|
|
|
|
|
|
|
519
|
0
|
|
|
|
|
0
|
for ($j=0; $j<$numRowTokens;$j++){ |
|
520
|
0
|
|
|
|
|
0
|
for ($i=0;$i<$numColTokens;$i++){ |
|
521
|
0
|
|
|
|
|
0
|
$targetWord = $latex->fromText($al->{targetWords}[$i]); |
|
522
|
0
|
|
|
|
|
0
|
$sourceWord = $latex->fromText($al->{sourceWords}[$j]); |
|
523
|
0
|
|
|
|
|
0
|
$i_partOf_Bj = $al->isIn("sourceAl",$j,$i); |
|
524
|
0
|
|
|
|
|
0
|
$j_partOf_Bi = $al->isIn("targetAl",$i,$j); |
|
525
|
0
|
0
|
|
|
|
0
|
if ($i_partOf_Bj > 0) { #ie i=aj |
|
526
|
0
|
0
|
|
|
|
0
|
if ($j_partOf_Bi > 0){ |
|
527
|
0
|
|
|
|
|
0
|
$lines.= $sourceWord.' \boldmath $\leftrightarrow$ '.$targetWord." \n\n"; |
|
528
|
|
|
|
|
|
|
}else{ |
|
529
|
0
|
|
|
|
|
0
|
$lines.= $sourceWord.' \boldmath $\leftarrow$ '.$targetWord." \n\n"; |
|
530
|
|
|
|
|
|
|
} |
|
531
|
|
|
|
|
|
|
}else{ |
|
532
|
0
|
0
|
|
|
|
0
|
if ($j_partOf_Bi > 0){ |
|
533
|
0
|
|
|
|
|
0
|
$lines.= $sourceWord.' \boldmath $\rightarrow$ '.$targetWord." \n\n"; |
|
534
|
|
|
|
|
|
|
}else{ |
|
535
|
|
|
|
|
|
|
} |
|
536
|
|
|
|
|
|
|
} |
|
537
|
|
|
|
|
|
|
} |
|
538
|
|
|
|
|
|
|
} |
|
539
|
0
|
|
|
|
|
0
|
$lines.= "\n\n".'\vspace{7mm}'; |
|
540
|
|
|
|
|
|
|
} #elsif $format eq latex |
|
541
|
0
|
|
|
|
|
0
|
return $lines; |
|
542
|
|
|
|
|
|
|
} |
|
543
|
|
|
|
|
|
|
|
|
544
|
|
|
|
|
|
|
sub displayAsMatrix { |
|
545
|
0
|
|
|
0
|
0
|
0
|
my ($al,$latex,$mark,$maxRows,$maxCols)= @_; |
|
546
|
0
|
|
|
|
|
0
|
my $matrix = ""; |
|
547
|
0
|
|
|
|
|
0
|
my ($mark_ji,$mark_ij); |
|
548
|
0
|
|
|
|
|
0
|
my $mark_ji_cross='\boldmath $-$'; |
|
549
|
0
|
|
|
|
|
0
|
my $numRowTokens = @{$al->{sourceWords}}; |
|
|
0
|
|
|
|
|
0
|
|
|
550
|
0
|
|
|
|
|
0
|
my $numColTokens = @{$al->{targetWords}}; |
|
|
0
|
|
|
|
|
0
|
|
|
551
|
0
|
|
|
|
|
0
|
my ($i,$j,$elt); |
|
552
|
0
|
|
|
|
|
0
|
my ($j_partOf_Bi,$i_partOf_Bj); |
|
553
|
0
|
|
|
|
|
0
|
my $offset; |
|
554
|
|
|
|
|
|
|
|
|
555
|
0
|
0
|
|
|
|
0
|
if ($numRowTokens>$maxRows){return $al->displayAsLinkEnumeration("latex",$latex)} |
|
|
0
|
|
|
|
|
0
|
|
|
556
|
|
|
|
|
|
|
|
|
557
|
0
|
|
|
|
|
0
|
$matrix.= $latex->fromText("\n".join(" ",@{$al->{sourceWords}})."\n"); |
|
|
0
|
|
|
|
|
0
|
|
|
558
|
0
|
|
|
|
|
0
|
$matrix.= $latex->fromText(join(" ",@{$al->{targetWords}})."\n\n").'\vspace{5mm}'; |
|
|
0
|
|
|
|
|
0
|
|
|
559
|
|
|
|
|
|
|
|
|
560
|
0
|
|
|
|
|
0
|
for ($offset=0;$offset<$numColTokens;$offset+=$maxCols){ |
|
561
|
0
|
|
|
|
|
0
|
$matrix.= "\n".'\begin{tabular}{l'."c" x $numColTokens.'}'; |
|
562
|
0
|
|
|
|
|
0
|
for ($j=$numRowTokens-1;$j>=0;$j--){ |
|
563
|
0
|
|
|
|
|
0
|
$matrix.= "\n".$latex->fromText($al->{sourceWords}[$j]); |
|
564
|
0
|
|
0
|
|
|
0
|
for ($i=$offset;$i<$numColTokens && $i<($offset+$maxCols);$i++){ |
|
565
|
0
|
|
|
|
|
0
|
$i_partOf_Bj = $al->isIn("sourceAl",$j,$i); |
|
566
|
0
|
|
|
|
|
0
|
$j_partOf_Bi = $al->isIn("targetAl",$i,$j); |
|
567
|
0
|
0
|
|
|
|
0
|
if ($mark eq "cross"){$mark_ji=$mark_ji_cross} |
|
|
0
|
0
|
|
|
|
0
|
|
|
|
|
0
|
|
|
|
|
|
|
568
|
|
|
|
|
|
|
elsif ($mark eq "ambiguity"){ |
|
569
|
0
|
0
|
|
|
|
0
|
if (length($al->{sourceLinks}->{"$j $i"}[0])>0){$mark_ji=$al->{sourceLinks}->{"$j $i"}[0]} |
|
|
0
|
|
|
|
|
0
|
|
|
|
0
|
|
|
|
|
0
|
|
|
570
|
|
|
|
|
|
|
else {$mark_ji = $mark_ji_cross} |
|
571
|
|
|
|
|
|
|
} |
|
572
|
0
|
|
|
|
|
0
|
elsif ($mark eq "confidence"){ |
|
573
|
0
|
0
|
|
|
|
0
|
if (length($al->{sourceLinks}->{"$j $i"}[1])>0){$mark_ji=$al->{sourceLinks}->{"$j $i"}[1]} |
|
|
0
|
|
|
|
|
0
|
|
|
|
0
|
|
|
|
|
0
|
|
|
574
|
|
|
|
|
|
|
else {$mark_ji = $mark_ji_cross} |
|
575
|
|
|
|
|
|
|
} |
|
576
|
|
|
|
|
|
|
else {$mark_ji = $mark} |
|
577
|
0
|
0
|
|
|
|
0
|
if ($mark eq "ambiguity"){ |
|
|
|
0
|
|
|
|
|
|
|
578
|
0
|
0
|
|
|
|
0
|
if (length($al->{targetLinks}->{"$i $j"}[0])>0){$mark_ij='\ver{'.$al->{targetLinks}->{"$i $j"}[0].'}'} |
|
|
0
|
|
|
|
|
0
|
|
|
|
0
|
|
|
|
|
0
|
|
|
579
|
|
|
|
|
|
|
else {$mark_ij = '\ver{'.$mark_ji_cross.'}'} |
|
580
|
0
|
|
|
|
|
0
|
}elsif ($mark eq "confidence"){ |
|
581
|
0
|
0
|
|
|
|
0
|
if (length($al->{targetLinks}->{"$i $j"}[1])>0){$mark_ij='\ver{'.$al->{targetLinks}->{"$i $j"}[1].'}'} |
|
|
0
|
|
|
|
|
0
|
|
|
|
0
|
|
|
|
|
0
|
|
|
582
|
|
|
|
|
|
|
else {$mark_ij = '\ver{'.$mark_ji_cross.'}'} |
|
583
|
|
|
|
|
|
|
}else{$mark_ij = '\ver{'.$mark_ji.'}'} |
|
584
|
|
|
|
|
|
|
|
|
585
|
0
|
|
|
|
|
0
|
$matrix.= "&"; |
|
586
|
0
|
0
|
|
|
|
0
|
if ($i_partOf_Bj > 0) { #ie i=aj |
|
587
|
0
|
0
|
|
|
|
0
|
if ($j_partOf_Bi > 0){ |
|
588
|
0
|
0
|
0
|
|
|
0
|
if ($mark_ji eq '\boldmath $-$' && $mark_ij eq '\ver{\boldmath $-$}'){ |
|
589
|
0
|
|
|
|
|
0
|
$matrix.= ' \boldmath ${+}$ '; |
|
590
|
|
|
|
|
|
|
}else{ |
|
591
|
0
|
|
|
|
|
0
|
$matrix.= " $mark_ji$mark_ij "; |
|
592
|
|
|
|
|
|
|
} |
|
593
|
|
|
|
|
|
|
}else{ |
|
594
|
0
|
|
|
|
|
0
|
$matrix.= " $mark_ji "; |
|
595
|
|
|
|
|
|
|
} |
|
596
|
|
|
|
|
|
|
}else{ |
|
597
|
0
|
0
|
|
|
|
0
|
if ($j_partOf_Bi > 0){ |
|
598
|
0
|
|
|
|
|
0
|
$matrix.= " $mark_ij "; |
|
599
|
|
|
|
|
|
|
}else{ |
|
600
|
0
|
|
|
|
|
0
|
$matrix.= ' . '; |
|
601
|
|
|
|
|
|
|
} |
|
602
|
|
|
|
|
|
|
} |
|
603
|
|
|
|
|
|
|
} #for j=... |
|
604
|
0
|
|
|
|
|
0
|
$matrix.= ' \\\\'; |
|
605
|
|
|
|
|
|
|
} #for i=... |
|
606
|
|
|
|
|
|
|
# last line |
|
607
|
0
|
|
|
|
|
0
|
$matrix.= "\n "; |
|
608
|
0
|
|
0
|
|
|
0
|
for ($i=$offset;$i<$numColTokens && $i<($offset+$maxCols);$i++){ |
|
609
|
0
|
|
|
|
|
0
|
$matrix.= ' & '.'\ver{'.$latex->fromText($al->{targetWords}[$i]).'}'; |
|
610
|
|
|
|
|
|
|
} |
|
611
|
0
|
|
|
|
|
0
|
$matrix.= ' \\\\'; |
|
612
|
0
|
|
|
|
|
0
|
$matrix.= "\n".'\end{tabular}'."\n\n".'\vspace{7mm}'; |
|
613
|
|
|
|
|
|
|
} # loop on number of matrices |
|
614
|
|
|
|
|
|
|
|
|
615
|
0
|
|
|
|
|
0
|
return $matrix; |
|
616
|
|
|
|
|
|
|
} |
|
617
|
|
|
|
|
|
|
|
|
618
|
|
|
|
|
|
|
|
|
619
|
|
|
|
|
|
|
# prohibits situations of the type: if linked(e,f) and linked(e',f) and linked(e',f') but not linked(e,f') |
|
620
|
|
|
|
|
|
|
# in this case the function links e and f'. |
|
621
|
|
|
|
|
|
|
sub forceGroupConsistency {
    # Completes every cluster of mutually linked positions so that each
    # source position of the cluster is linked to each of its target positions.
    #
    # Arguments:
    #   $al    - alignment object (modified in place)
    #   $mode  - optional; when "contiguous", a cluster that splits into several
    #            contiguous runs is first reduced to the source run / target run
    #            pair with the highest averaged ibm1Prob score
    #   $lex1  - lexicon handed to Lingua::AlSetLib::ibm1Prob (source, target)
    #   $lex2  - optional lexicon for the reverse direction; when undefined the
    #            forward score is used for both directions
    #
    # The clusters are computed on the object returned by $al->SLinks(), once
    # for "source" and once for "target".
    my ($al, $mode, $lex1, $lex2) = @_;
    $mode = "" unless defined $mode;

    my $dumper  = Dumpvalue->new;    # only referenced by commented-out debug dumps
    my $cloneAl = {};

    foreach my $source ("source", "target") {
        # work on the S links only
        my $sal = $al->SLinks();

        # clusters of positions linked between each other
        my $groups = $sal->getAlClusters($source);

        # empty the alignment of the S-links copy
        my $salRows = $sal->{$source."Al"};
        if (defined($salRows) && @$salRows > 0) {
            foreach my $row (@$salRows) {
                $row = [];
            }
        }

        foreach my $group (@$groups) {
            if ($mode eq "contiguous") {
                my $srcSeqs = Lingua::AlSetLib::getContiguousSequences($group->{source});
                my $trgSeqs = Lingua::AlSetLib::getContiguousSequences($group->{target});
                if (@$srcSeqs > 1 || @$trgSeqs > 1) {
                    # keep the pair of contiguous runs with the best score
                    my ($topProb, $topSrc, $topTrg) = (0, 0, 0);
                    foreach my $sIdx (0 .. $#$srcSeqs) {
                        my $sPhrase = $sal->printPhrase("source", $srcSeqs->[$sIdx]);
                        foreach my $tIdx (0 .. $#$trgSeqs) {
                            my $tPhrase = $sal->printPhrase("target", $trgSeqs->[$tIdx]);
                            my $probTS  = Lingua::AlSetLib::ibm1Prob($sPhrase, $tPhrase, $lex1);
                            my $probST  = defined($lex2)
                                ? Lingua::AlSetLib::ibm1Prob($tPhrase, $sPhrase, $lex2)
                                : $probTS;
                            my $score = 0.5 * ($probTS + $probST);
                            if ($score > $topProb) {
                                ($topProb, $topSrc, $topTrg) = ($score, $sIdx, $tIdx);
                            }
                        }
                    }
                    @{ $group->{source} } = @{ $srcSeqs->[$topSrc] };
                    @{ $group->{target} } = @{ $trgSeqs->[$topTrg] };
                }
            }

            # make sure every link inside the cluster exists in $al
            foreach my $j (@{ $group->{source} }) {
                foreach my $i (@{ $group->{target} }) {
                    if ($al->isIn($source."Al", $j, $i)) {
                        # link already present: move from P to S links
                        $al->{$source."Links"}->{"$j $i"}[0] = "";
                    }
                    else {
                        push @{ $al->{$source."Al"}[$j] }, $i;
                    }
                }
            }
        }
    }    # foreach side
}
|
700
|
|
|
|
|
|
|
|
|
701
|
|
|
|
|
|
|
##################################################### |
|
702
|
|
|
|
|
|
|
### SYMMETRIZATION SUBS ### |
|
703
|
|
|
|
|
|
|
##################################################### |
|
704
|
|
|
|
|
|
|
# input: alignment object |
|
705
|
|
|
|
|
|
|
# output: intersection of source and target alignments of this object |
|
706
|
|
|
|
|
|
|
sub intersect {
    # Replaces sourceAl and targetAl of the alignment object by their
    # intersection: a link (j,i) of sourceAl is kept only when
    # $al->isIn("targetAl",$i,$j) is true. If either side is empty, both
    # sides end up empty.
    my $al = shift;
    my (@keptSource, @keptTarget);

    if (@{$al->{targetAl}} > 0 && @{$al->{sourceAl}} > 0) {
        foreach my $j (0 .. $#{ $al->{sourceAl} }) {
            my $linked = $al->{sourceAl}[$j];
            next unless defined($linked);
            foreach my $i (@$linked) {
                next unless $al->isIn("targetAl", $i, $j);
                push @{ $keptSource[$j] }, $i;
                push @{ $keptTarget[$i] }, $j;
            }
        }
    }

    @{$al->{sourceAl}} = @keptSource;
    @{$al->{targetAl}} = @keptTarget;
}
|
728
|
|
|
|
|
|
|
|
|
729
|
|
|
|
|
|
|
# input: alignment object |
|
730
|
|
|
|
|
|
|
# output: union of source and target alignments of this object |
|
731
|
|
|
|
|
|
|
sub getUnion {
    # Replaces sourceAl and targetAl of the alignment object by their union.
    # When both sides hold links, every link of one side is also recorded
    # (reversed) on the other side unless $al->isIn reports it there already.
    # When only one side holds links, that side is kept and the other one is
    # left empty.
    my $al = shift;
    my %union    = (sourceAl => [], targetAl => []);
    my %otherOf  = ("source" => "target", "target" => "source");

    if (@{$al->{targetAl}} > 0 && @{$al->{sourceAl}} > 0) {
        foreach my $source (keys %otherOf) {
            my $target = $otherOf{$source};
            foreach my $j (0 .. $#{ $al->{$source."Al"} }) {
                my $linked = $al->{$source."Al"}[$j];
                next unless defined($linked);
                foreach my $i (@$linked) {
                    push @{ $union{$source."Al"}->[$j] }, $i;
                    push @{ $union{$target."Al"}->[$i] }, $j
                        unless $al->isIn($target."Al", $i, $j);
                }
            }
        }
    }
    elsif (@{$al->{sourceAl}} > 0) {
        @{ $union{sourceAl} } = @{ $al->{sourceAl} };
    }
    else {
        @{ $union{targetAl} } = @{ $al->{targetAl} };
    }

    @{$al->{sourceAl}} = @{ $union{sourceAl} };
    @{$al->{targetAl}} = @{ $union{targetAl} };
}
|
761
|
|
|
|
|
|
|
|
|
762
|
|
|
|
|
|
|
# input: alignment object |
|
763
|
|
|
|
|
|
|
# output: this object where only the links of the side (source or target) with most links are selected |
|
764
|
|
|
|
|
|
|
sub selectSideWithLinks {
    # Keeps only the links of one side (sourceAl or targetAl) and rebuilds
    # both arrays from that side.
    #
    # Arguments:
    #   $al            - alignment object (modified in place)
    #   $criterion     - "most" (default) keeps the side with at least as many
    #                    links as the other; any other value keeps the side
    #                    with strictly fewer links, the target side on a tie
    #   $dontCountNull - default 1: position 0 and links to position 0 are
    #                    left out of the count
    my ($al, $criterion, $dontCountNull) = @_;
    $criterion     = "most" unless defined($criterion);
    $dontCountNull = 1      unless defined($dontCountNull);

    my $firstInd = $dontCountNull ? 1 : 0;

    # count links on each side
    my ($numSource, $numTarget) = (0, 0);
    my $sourceRows = @{ $al->{sourceAl} };
    foreach my $j ($firstInd .. $sourceRows - 1) {
        my $row = $al->{sourceAl}[$j];
        next unless defined($row);
        if ($dontCountNull) {
            $numSource += grep { $_ != 0 } @$row;
        }
        else {
            $numSource += @$row;
        }
    }
    my $targetRows = @{ $al->{targetAl} };
    foreach my $i ($firstInd .. $targetRows - 1) {
        my $row = $al->{targetAl}[$i];
        next unless defined($row);
        if ($dontCountNull) {
            $numTarget += grep { $_ != 0 } @$row;
        }
        else {
            $numTarget += @$row;
        }
    }

    # select side with (most,least) links
    my $keepSource = ($criterion eq "most")
        ? ($numSource >= $numTarget)
        : ($numSource <  $numTarget);

    my (@newSource, @newTarget);
    if ($keepSource) {
        my $rows = @{ $al->{sourceAl} };
        foreach my $j (0 .. $rows - 1) {
            my $row = $al->{sourceAl}[$j];
            next unless defined($row);
            foreach my $i (@$row) {
                push @{ $newSource[$j] }, $i;
                push @{ $newTarget[$i] }, $j;
            }
        }
    }
    else {
        my $rows = @{ $al->{targetAl} };
        foreach my $i (0 .. $rows - 1) {
            my $row = $al->{targetAl}[$i];
            next unless defined($row);
            foreach my $j (@$row) {
                push @{ $newSource[$j] }, $i;
                push @{ $newTarget[$i] }, $j;
            }
        }
    }

    @{$al->{sourceAl}} = @newSource;
    @{$al->{targetAl}} = @newTarget;
}
|
822
|
|
|
|
|
|
|
|
|
823
|
|
|
|
|
|
|
sub selectSideWithMostLinks {
    # Shortcut for selectSideWithLinks with the "most" criterion.
    my ($self) = @_;
    return $self->selectSideWithLinks("most");
}
|
827
|
|
|
|
|
|
|
sub selectSideWithLeastLinks {
    # Shortcut for selectSideWithLinks with the "least" criterion.
    my ($self) = @_;
    return $self->selectSideWithLinks("least");
}
|
831
|
|
|
|
|
|
|
|
|
832
|
|
|
|
|
|
|
# input: alignment object |
|
833
|
|
|
|
|
|
|
# output: alignment object where source and target have been swapped |
|
834
|
|
|
|
|
|
|
sub swapSourceTarget {
    # Exchanges the roles of source and target in the alignment object:
    # the word lists are swapped, sourceAl and targetAl are each transposed
    # (a link j->i becomes i->j) and the "j i" keys of sourceLinks and
    # targetLinks are rewritten as "i j". The content of the object is then
    # replaced by these six entries.
    my $al = shift;
    my %swapped = (
        "sourceAl"    => [],
        "targetAl"    => [],
        "sourceWords" => $al->{targetWords},
        "targetWords" => $al->{sourceWords},
        "sourceLinks" => {},
        "targetLinks" => {},
    );

    foreach my $side ("source", "target") {
        my $alKey   = $side."Al";
        my $linkKey = $side."Links";

        # transpose the links
        my $rows = @{ $al->{$alKey} };
        foreach my $from (0 .. $rows - 1) {
            foreach my $to (@{ $al->{$alKey}[$from] }) {
                push @{ $swapped{$alKey}[$to] }, $from;
            }
        }

        # insert ref to empty array instead of undef entries
        foreach my $slot (@{ $swapped{$alKey} }) {
            $slot = [] unless defined($slot);
        }

        # and now the link labels
        while (my ($link, $ref) = each(%{ $al->{$linkKey} })) {
            my ($first, $second) = split(" ", $link);
            $swapped{$linkKey}{"$second $first"} = $ref;
        }
    }

    %$al = %swapped;
}
|
866
|
|
|
|
|
|
|
|
|
867
|
|
|
|
|
|
|
|
|
868
|
|
|
|
|
|
|
|
|
869
|
|
|
|
|
|
|
# input: al object, offset, length, side (src or trg), ref to word list to be added, ref to a list of positions of the other side (to which all added words will be linked). |
|
870
|
|
|
|
|
|
|
# output: Alignment object where given positions are substituted by the words |
|
871
|
|
|
|
|
|
|
# |
|
872
|
|
|
|
|
|
|
# notes: 1) in case of deleting various words: |
|
873
|
|
|
|
|
|
|
# - all added words are linked to all positions to which deleted words were linked (except if you provided a list of positions of the other side, in which case all added words are linked to those positions). |
|
874
|
|
|
|
|
|
|
# - $al->{sourceLinks} information can be lost for these words. |
|
875
|
|
|
|
|
|
|
# 2) Does not work for targetAl alignment |
|
876
|
|
|
|
|
|
|
# 3) more efficient in "source" side than in "target" |
|
877
|
|
|
|
|
|
|
|
|
878
|
|
|
|
|
|
|
sub splice {
    # Replaces $length words starting at $offset on one side of the alignment
    # by the given words and re-indexes the links accordingly.
    #
    # Arguments:
    #   $al                 - alignment object (modified in place)
    #   $side               - "source" or "target"
    #   $offset, $length    - first position and number of words to remove
    #   $refToWordsToAdd    - ref to the list of words to insert (default: none)
    #   $refToOtherSidePosi - ref to a list of positions of the other side; when
    #                         given, every added word is linked to exactly these
    #                         positions. Otherwise every added word inherits the
    #                         links of all removed words.
    #
    # Only sourceAl and sourceLinks are rebuilt; targetAl is not adjusted.
    # For the "target" side the object is swapped, processed as a source side
    # and swapped back.
    my ($al,$side,$offset,$length,$refToWordsToAdd,$refToOtherSidePosi)=@_;
    $refToWordsToAdd    = [] unless defined($refToWordsToAdd);
    $refToOtherSidePosi = [] unless defined($refToOtherSidePosi);

    my $numToDelete = $length;
    my $numToAdd    = scalar(@$refToWordsToAdd);
    my $diff        = $numToAdd - $numToDelete;
    my $firstPos    = $offset;
    my $lastPos     = $offset + $length - 1;
    my $numList     = scalar(@$refToOtherSidePosi);

    # MODIFY WORDS ARRAY
    # (CORE:: makes explicit that the builtin is meant, not this method)
    CORE::splice(@{$al->{$side."Words"}}, $offset, $length, @$refToWordsToAdd);

    # MODIFY LINKS
    if ($side eq "target"){
        $al->swapSourceTarget;
    }

    my $oldLinks = $al->{sourceLinks} || {};
    my %links;     # re-indexed copy of the link labels
    my %modifs;    # (new position, other-side position) pairs already linked

    # one (initially empty) link list per position of the new sentence
    my @modified = map { [] } 1 .. (scalar(@{$al->{sourceAl}}) + $diff);

    # fill modified array with existing links
    for (my $j=0; $j<@{$al->{sourceAl}}; $j++){
        next unless defined($al->{sourceAl}[$j]);
        foreach my $i (@{$al->{sourceAl}[$j]}){
            my $oldKey = "$j $i";
            if ($j < $firstPos){
                # before the spliced zone: position unchanged
                push @{$modified[$j]}, $i;
                $links{$oldKey} = $oldLinks->{$oldKey} if exists $oldLinks->{$oldKey};
            }
            elsif ($j <= $lastPos){
                # removed word: hand its links over to the added words,
                # unless an explicit list of positions was provided
                next if $numList != 0;
                for (my $p=$firstPos; $p<$firstPos+$numToAdd; $p++){
                    next if exists($modifs{$p}{$i});
                    push @{$modified[$p]}, $i;
                    # the label of the first removed word linked to $i wins;
                    # the same label reference is shared by all added words
                    $links{"$p $i"} = $oldLinks->{$oldKey} if exists $oldLinks->{$oldKey};
                    $modifs{$p}{$i} = 1;
                }
            }
            else {
                # after the spliced zone: shifted by the length difference
                my $shifted = $j + $diff;
                push @{$modified[$shifted]}, $i;
                $links{"$shifted $i"} = $oldLinks->{$oldKey} if exists $oldLinks->{$oldKey};
            }
        }
    }

    # insert provided links
    for (my $p=$firstPos; $p<$firstPos+$numToAdd; $p++){
        foreach my $i (@$refToOtherSidePosi){
            push @{$modified[$p]}, $i;
        }
    }

    @{$al->{sourceAl}} = @modified;
    # store the re-indexed labels so that the "j i" keys of sourceLinks keep
    # matching the positions used in sourceAl
    %{$al->{sourceLinks}} = %links;

    if ($side eq "target"){
        $al->swapSourceTarget;
    }
}
|
953
|
|
|
|
|
|
|
|
|
954
|
|
|
|
|
|
|
|
|
955
|
|
|
|
|
|
|
# INPUT: string (regexp) to be replaced, string (regexp) to replace it, side ("source" or "target") |
|
956
|
|
|
|
|
|
|
# NOTES: 1) in case of deleting various words, all added words are linked to all positions to which deleted words were linked. $al->{sourceLinks} information can be lost for replaced words. |
|
957
|
|
|
|
|
|
|
# 2) Does not work for targetAl alignment |
|
958
|
|
|
|
|
|
|
# 3) more efficient in "source" side than in "target" |
|
959
|
|
|
|
|
|
|
sub regexpReplace {
    # Applies s/$regToDelete/$regToReplace/g to the sentence of the given side
    # and propagates the resulting word insertions, deletions and
    # substitutions to the alignment through $al->splice.
    #
    # Arguments:
    #   $al           - alignment object (modified in place)
    #   $regToDelete  - regular expression to search for
    #   $regToReplace - replacement string
    #   $side         - "source"; any other value selects the target sentence
    #
    # The word-level changes are obtained from Lingua::AlSetLib::diff applied
    # to the sentence before and after the substitution (both split on single
    # spaces).
    my ($al,$regToDelete,$regToReplace,$side)=@_;

    my $sentence = ($side eq "source") ? $al->sourceSentence : $al->targetSentence;
    my $newSentence = $sentence;
    # No /o modifier: with /o the first pattern ever seen would be compiled
    # once and reused by every later call, whatever $regToDelete contains.
    $newSentence =~ s/$regToDelete/$regToReplace/g;

    my @words    = split / /,$sentence;
    my $nums     = scalar(@words);
    my @newWords = split / /,$newSentence;
    my @diffs    = Lingua::AlSetLib::diff( \@words, \@newWords );

    # parse output of diff function
    my @updatedPosi;    #array: orig posis -> updated posis
    my %reversePosi;    #hash: updated posis -> orig posis
    for (my $i=0;$i<=$nums;$i++){
        $updatedPosi[$i]=$i;
        $reversePosi{$i}=$i;
    }

    foreach my $hunk (@diffs){
        my @delPosi;
        my @addPosi;
        my @add;
        foreach my $change (@$hunk) {
            # positions are shifted by one with respect to the diff output
            if ($change->[0] eq '-'){
                push @delPosi,$change->[1]+1;
            }else{
                push @addPosi,$change->[1]+1;
                push @add,$change->[2];
            }
        }
        my $numDel=scalar(@delPosi);
        my $numAdd=scalar(@addPosi);

        # del posis are relative to first array (@words) => update posis
        # add posis are relative to second array (@newWords) => don't update posis
        if ($numDel==0){ #insertion
            $al->splice($side,$addPosi[0],0,\@add);
            #update updatedPosi array
            for (my $i=$reversePosi{"$addPosi[0]"};$i<=$nums;$i++){
                $updatedPosi[$i]+=$numAdd;
                $reversePosi{"$updatedPosi[$i]"}=$i;
            }
        }else{ # substitution or deletion
            $al->splice($side,$updatedPosi[$delPosi[0]],$numDel,\@add);
            #update updatedPosi array
            for (my $i=$delPosi[0]+$numDel;$i<=$nums;$i++){
                $updatedPosi[$i]+=$numAdd-$numDel;
                $reversePosi{"$updatedPosi[$i]"}=$i;
            }
        }
    }
}
|
1023
|
|
|
|
|
|
|
|
|
1024
|
|
|
|
|
|
|
# eliminates any given WORD from the source or target file corpus and updates the alignment |
|
1025
|
|
|
|
|
|
|
# input: $al (current Alignment object),$word (word RegExp to eliminate), $wordSide (from which side: source or target) |
|
1026
|
|
|
|
|
|
|
# kept for compatibility with previous versions (regexpReplace or replaceWords should be used instead) |
|
1027
|
|
|
|
|
|
|
sub eliminateWord {
    # Removes every occurrence of $word on the given side by delegating to
    # replaceWords with an empty replacement string.
    my ($self, $word, $wordSide) = @_;
    my $replacement = '';
    return $self->replaceWords($word, $replacement, $wordSide);
}
|
1031
|
|
|
|
|
|
|
|
|
1032
|
|
|
|
|
|
|
# INPUT: string to be replaced, string to replace it, side ("source" or "target") |
|
1033
|
|
|
|
|
|
|
# NOTES: 1) in case of deleting various words, all added words are linked to all positions to which deleted words were linked. $al->{sourceLinks} information can be lost for replaced words. |
|
1034
|
|
|
|
|
|
|
# 2) Does not work for targetAl alignment |
|
1035
|
|
|
|
|
|
|
# 3) more efficient in "source" side than in "target" |
|
1036
|
|
|
|
|
|
|
sub replaceWords {
    # Replaces every occurrence of a word sequence on one side of the
    # alignment by another word sequence, updating the links through
    # $al->splice.
    #
    # Arguments:
    #   $al          - alignment object (modified in place)
    #   $stToDelete  - string (space-separated words) to be replaced
    #   $stToReplace - string to replace it with (may be empty)
    #   $side        - "source" or "target"
    #
    # Occurrences are located with Lingua::AlSetLib::findArrayInAnother in
    # the word list of the given side.
    my ($al,$stToDelete,$stToReplace,$side)=@_;

    # Normalise both strings: strip ALL leading and trailing whitespace and
    # collapse inner runs to one space, so that splitting on a single space
    # never yields an empty word.
    foreach my $st ($stToDelete,$stToReplace){
        $st =~ s/^\s+|\s+$//g;
        $st =~ s/\s+/ /g;
    }

    my @wToDel   = split / /,$stToDelete;
    my $numToDel = scalar(@wToDel);
    my @toAdd    = split(/ /,$stToReplace);
    my $numToAdd = scalar(@toAdd);
    my $diff     = $numToAdd-$numToDel;

    #list of positions where string to be deleted starts in @sourceWords (or target) array
    my @startToDelInAl=Lingua::AlSetLib::findArrayInAnother(\@wToDel,$al->{$side."Words"});

    # $offset accumulates the length change caused by the replacements
    # already done, since the start positions refer to the original list
    my $offset=0;
    foreach my $startPosi (@startToDelInAl){
        my @posis;
        for (my $i=0;$i<$numToDel;$i++){
            push @posis,$startPosi+$i+$offset;
        }
        $al->splice($side,$posis[0],scalar(@posis),\@toAdd);
        $offset+=$diff;
    }
}
|
1065
|
|
|
|
|
|
|
|
|
1066
|
|
|
|
|
|
|
|
|
1067
|
|
|
|
|
|
|
# introduces underscore between links of many-to-many groups in source to target alignment |
|
1068
|
|
|
|
|
|
|
# WARNING: THIS SUB FOR NOW ONLY CHANGES WORDS FILES, NOT THE LINKS FILE |
|
1069
|
|
|
|
|
|
|
sub manyToMany2joined {
    # For every cluster returned by $al->getAlClusters that spans more than
    # one position on a side, and whose positions on that side are
    # consecutive, the words at those positions are merged into one token
    # joined by "_". Clusters with gaps are reported on STDERR and left
    # untouched. Only sourceWords and targetWords are rewritten; the links
    # are not changed.
    my $al = shift;
    my %new = (
        source => [ @{ $al->{sourceWords} } ],
        target => [ @{ $al->{targetWords} } ],
    );

    # group many-to-many linked phrases in clusters
    my $clusters = $al->getAlClusters;
    my $dumper   = Dumpvalue->new;    # only referenced by commented-out debug dumps

    foreach my $source ("source", "target") {
        # sort positions inside each cluster, then clusters by first position
        foreach my $clust (@$clusters) {
            @{ $clust->{$source} } = sort { $a <=> $b } @{ $clust->{$source} };
        }
        @$clusters = sort { $a->{$source}[0] <=> $b->{$source}[0] } @$clusters;

        # number of words already removed from the new word list by merges
        my $offset = 0;
        foreach my $clust (@$clusters) {
            my @posi = @{ $clust->{$source} };
            next unless @posi > 1;

            # check that cluster is contiguous
            my @gaps = grep { $posi[$_] != ($posi[$_ - 1] + 1) } 1 .. $#posi;
            if (@gaps) {
                print STDERR "not contiguous\n";
                next;
            }

            # introduce underscore
            my $newWord = join("_", map { $al->{$source."Words"}[$_] } @posi);
            CORE::splice(@{ $new{$source} }, $posi[0] - $offset, scalar(@posi), $newWord);
            $offset += @posi - 1;
        }
    }

    @{$al->{sourceWords}} = @{ $new{source} };
    @{$al->{targetWords}} = @{ $new{target} };
}
|
1129
|
|
|
|
|
|
|
|
|
1130
|
|
|
|
|
|
|
|
|
1131
|
|
|
|
|
|
|
# recreates links of words joined by the "@@@" separator and removes the separator |
|
1132
|
|
|
|
|
|
|
# ONLY WORKS WITH SOURCE2TARGET ALIGNMENT |
|
1133
|
|
|
|
|
|
|
sub joined2ManyToMany {
    # Splits every token containing "@@@" (positions 1 and above) back into
    # its component words. The first component stays at the token's position;
    # the remaining ones are inserted right after it through $al->splice and
    # are linked to the same positions of the other side as the joined token,
    # as recorded in sourceAl.
    my $al = shift;

    foreach my $source ("source", "target") {
        # positions of joined tokens, in increasing order
        my $wordCount  = @{ $al->{$source."Words"} };
        my @joinedPosi = grep { $al->{$source."Words"}[$_] =~ /@@@/ } 1 .. $wordCount - 1;

        # number of words inserted so far on this side
        my $offset = 0;
        foreach my $pos (@joinedPosi) {
            my $here = $pos + $offset;

            # insert new words
            my ($firstWord, @newWords) = split(/@@@/, $al->{$source."Words"}[$here]);
            $al->{$source."Words"}[$here] = $firstWord;

            # positions of the other side the new words must be linked to
            my $otherSidePosi;
            if ($source eq "source") {
                $otherSidePosi = $al->{sourceAl}[$here];
            }
            else {
                # look for links aligned to $pos+$offset
                my @alignedPos;
                foreach my $j (0 .. @{ $al->{sourceAl} } - 1) {
                    foreach my $i (@{ $al->{sourceAl}[$j] }) {
                        push @alignedPos, $j if $i == $here;
                    }
                }
                $otherSidePosi = \@alignedPos;
            }

            $al->splice($source, $here + 1, 0, \@newWords, $otherSidePosi);
            $offset += @newWords;
        }
    }
}
|
1175
|
|
|
|
|
|
|
|
|
1176
|
|
|
|
|
|
|
# Tells whether a link is recorded in both directions of the alignment.
# input:  $j (source position) and $i (target position) of the link
# output: true if the link is reciprocal (or "cross link"), i.e. (j,i) is in
#         sourceAl and (i,j) is in targetAl; false otherwise
sub isCrossLink {
    my ($al, $j, $i) = @_;

    # The targetAl lookup is only performed when the sourceAl lookup succeeded.
    my $inSourceAl = $al->isIn("sourceAl", $j, $i);
    return $inSourceAl unless $inSourceAl;
    return $al->isIn("targetAl", $i, $j);
}
|
1183
|
|
|
|
|
|
|
|
|
1184
|
|
|
|
|
|
|
# Tells whether position $j of side $side is an anchor point: it has exactly
# one link, and the position it is linked to has exactly one link, which
# points back to $j.
# input:  $j (position), $side ("source"; any other value is handled as "target")
# output: 1 if $j is an anchor, 0 otherwise
sub isAnchor {
    my ($al, $j, $side) = @_;
    my $reverseSide = ($side eq "source") ? "target" : "source";

    # $j must carry exactly one link...
    my $links = $al->{$side."Al"}[$j];
    return 0 unless defined($links);
    return 0 unless @{$links} == 1;

    # ...and the linked position must carry exactly one link, back to $j.
    my $i = $links->[0];
    my $backLinks = $al->{$reverseSide."Al"}[$i];
    return 0 unless defined($backLinks);
    return 1 if (@{$backLinks} == 1 && $backLinks->[0] == $j);
    return 0;
}
|
1202
|
|
|
|
|
|
|
|
|
1203
|
|
|
|
|
|
|
# Cuts out the zone of the alignment lying strictly between a start point and
# an end point, and returns it as a Lingua::AlignmentSlice.
# input:  ($startPointSource,$startPointTarget) and ($endPointSource,$endPointTarget):
#         corner positions; the positions strictly between them seed the zone
#         $mode: "noAnchors" (default) cuts zones between 2 anchors and cannot
#                include an anchor point: target positions for which isAnchor()
#                is true are not used as seeds;
#                "anchors" (any other value) cuts the zone established by the
#                coordinates and doesn't look more
# output: the slice; its word and link positions are expressed relative to the
#         offsets passed to setZero(), position 0 being the NULL word
# The zone is then grown with every position linked (in sourceAl or targetAl)
# to a position already in it, until a complete pass adds nothing.
sub cut {
    my ($al,$startPointSource,$startPointTarget,$endPointSource,$endPointTarget,$mode)=@_;
    $mode = "noAnchors" unless defined($mode);

    my (%sourceInGap, %targetInGap);      # positions that belong to the zone
    my (%sourceToNull, %targetToNull);    # zone positions that are linked to NULL
    my $gap = Lingua::AlignmentSlice->new($al);

    # Seed the zone with the positions strictly inside the coordinates.
    for (my $pos = $startPointSource+1; $pos < $endPointSource; $pos++) {
        $sourceInGap{$pos} = 1;
    }
    for (my $pos = $startPointTarget+1; $pos < $endPointTarget; $pos++) {
        next if ($mode eq "noAnchors" && $al->isAnchor($pos,"target"));
        $targetInGap{$pos} = 1;
    }

    # Look at linked words situated outside the gap square: keep adding them
    # until the number of positions in the zone stops changing.
    my $oldNumInGap = 0;
    my $newNumInGap = scalar(keys %sourceInGap) + scalar(keys %targetInGap);
    while ($oldNumInGap != $newNumInGap) {
        foreach my $i (keys %targetInGap) {
            # links listed on the target side
            foreach my $j (@{$al->{targetAl}[$i]}) {
                if ($j != 0) { $sourceInGap{$j} = 1 }
                else         { $targetToNull{$i} = 1 }
            }
            # links to $i that are only listed on the source side
            for (my $j = 1; $j < @{$al->{sourceAl}}; $j++) {
                $sourceInGap{$j} = 1 if $al->isIn("sourceAl",$j,$i);
            }
        }
        foreach my $j (keys %sourceInGap) {
            # links listed on the source side
            foreach my $i (@{$al->{sourceAl}[$j]}) {
                if ($i != 0) { $targetInGap{$i} = 1 }
                else         { $sourceToNull{$j} = 1 }
            }
            # links to $j that are only listed on the target side
            for (my $i = 1; $i < @{$al->{targetAl}}; $i++) {
                $targetInGap{$i} = 1 if $al->isIn("targetAl",$i,$j);
            }
        }
        $oldNumInGap = $newNumInGap;
        $newNumInGap = scalar(keys %sourceInGap) + scalar(keys %targetInGap);
    }

    # NULL links recorded on the NULL word itself (position 0 of the other side).
    foreach my $i (@{$al->{sourceAl}[0]}) {
        $targetToNull{$i} = 1 if $targetInGap{$i};
    }
    foreach my $j (@{$al->{targetAl}[0]}) {
        $sourceToNull{$j} = 1 if $sourceInGap{$j};
    }

    my @sortedSourceInGap = sort { $a <=> $b } keys %sourceInGap;
    my @sortedTargetInGap = sort { $a <=> $b } keys %targetInGap;

    # Offset (position just before the first zone position) and extent (first
    # to last zone position, inclusive) on each side; both 0 for an empty side.
    my ($zeroSource, $numSource) = (0, 0);
    if (@sortedSourceInGap) {
        $zeroSource = $sortedSourceInGap[0] - 1;
        $numSource  = $sortedSourceInGap[-1] - $sortedSourceInGap[0] + 1;
    }
    my ($zeroTarget, $numTarget) = (0, 0);
    if (@sortedTargetInGap) {
        $zeroTarget = $sortedTargetInGap[0] - 1;
        $numTarget  = $sortedTargetInGap[-1] - $sortedTargetInGap[0] + 1;
    }

    # Update the AlignmentSlice attributes.
    # NOTE(review): setZero() presumably stores the offsets that are read back
    # below as $gap->{zeroSource} / $gap->{zeroTarget} -- confirm in
    # Lingua::AlignmentSlice.
    $gap->setZero($zeroSource,$zeroTarget);
    foreach my $j (keys %sourceInGap) {
        $gap->{sourceIndices}{$j-$zeroSource} = 1;
    }
    $gap->{sourceIndices}{0} = 1 if %targetToNull;
    foreach my $i (keys %targetInGap) {
        $gap->{targetIndices}{$i-$zeroTarget} = 1;
    }
    $gap->{targetIndices}{0} = 1 if %sourceToNull;

    ## LOAD GAP
    # 1. insert NULL word and select only words linked to NULL that belong to the gap
    push @{$gap->{sourceWords}}, 'NULL';
    push @{$gap->{sourceAl}}, [ map { $_ - $gap->{zeroTarget} } keys %targetToNull ];
    push @{$gap->{targetWords}}, 'NULL';
    push @{$gap->{targetAl}}, [ map { $_ - $gap->{zeroSource} } keys %sourceToNull ];

    # 2. Add non-NULL words and alignments. Every word between the first and
    #    the last zone position is copied, but links are only copied for the
    #    positions that belong to the zone. The linked positions need no
    #    membership test: they are de facto included in the zone.
    for (my $ind = 1; $ind <= $numSource; $ind++) {
        my $j = $ind + $gap->{zeroSource};
        $gap->{sourceWords}[$ind] = $al->{sourceWords}[$j];
        next unless $sourceInGap{$j};
        $gap->{sourceAl}[$ind] = [ map { $_ - $gap->{zeroTarget} } @{$al->{sourceAl}[$j]} ];
    }
    for (my $ind = 1; $ind <= $numTarget; $ind++) {
        my $i = $ind + $gap->{zeroTarget};
        $gap->{targetWords}[$ind] = $al->{targetWords}[$i];
        next unless $targetInGap{$i};
        $gap->{targetAl}[$ind] = [ map { $_ - $gap->{zeroSource} } @{$al->{targetAl}[$i]} ];
    }
    return $gap;
}
|
1352
|
|
|
|
|
|
|
|
|
1353
|
|
|
|
|
|
|
##################################################### |
|
1354
|
|
|
|
|
|
|
### PRIVATE SUBS ### |
|
1355
|
|
|
|
|
|
|
##################################################### |
|
1356
|
|
|
|
|
|
|
|
|
1357
|
|
|
|
|
|
|
# Returns the number of times the link ($ind1,$ind2) is present in the $side alignment
# input:  $side: "sourceAl" looks for the link (j,i)=($ind1,$ind2) in sourceAl;
#                any other value looks for the link (i,j)=($ind1,$ind2) in targetAl
#         $ind1: position whose link list is searched
#         $ind2: linked position looked for
# output: >0 (number of occurrences) if the link is present, 0 otherwise
sub isIn {
    my ($al, $side, $ind1, $ind2) = @_;
    my $alKey = ($side eq "sourceAl") ? "sourceAl" : "targetAl";
    return 0 unless defined($ind2);

    # Positions are compared by exact string equality. Interpolating $ind2
    # into an anchored pattern, as was done before, treated any regex
    # metacharacter in it as a pattern and tolerated a trailing newline.
    my $numFound = grep { defined($_) && $_ eq $ind2 } @{ $al->{$alKey}[$ind1] };
    return $numFound;
}
|
1372
|
|
|
|
|
|
|
|
|
1373
|
|
|
|
|
|
|
# Returns a new Lingua::Alignment object with the same content as the input
# object: words, link lists and link hashes of both sides.
# The link lists are copied position by position, so a position without link
# list (undefined entry) stays undefined in the copy and the following
# positions keep their index.
# NOTE(review): sourceLinks/targetLinks are copied one level deep only; if
# their values are references they are shared with the original -- confirm
# whether callers modify them.
sub clone {
    my $al = shift;
    my $clone = Lingua::Alignment->new;

    @{$clone->{sourceWords}} = @{$al->{sourceWords}};
    @{$clone->{targetWords}} = @{$al->{targetWords}};

    foreach my $alKey ("sourceAl", "targetAl") {
        # One entry per original entry: appending only the defined ones would
        # shift every link list located after an undefined entry.
        @{$clone->{$alKey}} =
            map { defined($_) ? [ @{$_} ] : undef } @{$al->{$alKey}};
    }

    %{$clone->{sourceLinks}} = %{$al->{sourceLinks}};
    %{$clone->{targetLinks}} = %{$al->{targetLinks}};

    return $clone;
}
|
1395
|
|
|
|
|
|
|
# Removes all links from the alignment: every defined link list of sourceAl
# and targetAl is emptied (the positions themselves are kept), and the
# sourceLinks and targetLinks hashes are emptied. The words are left untouched.
sub clear {
    my $al = shift;

    foreach my $alKey ("sourceAl", "targetAl") {
        my $numPositions = @{ $al->{$alKey} };
        foreach my $pos (0 .. $numPositions - 1) {
            # positions without link list stay undefined
            next unless defined($al->{$alKey}[$pos]);
            @{ $al->{$alKey}[$pos] } = ();
        }
    }

    %{ $al->{sourceLinks} } = ();
    %{ $al->{targetLinks} } = ();
}
|
1411
|
|
|
|
|
|
|
|
|
1412
|
|
|
|
|
|
|
|
|
1413
|
|
|
|
|
|
|
# Gets the alignment as clusters of positions aligned together.
# input:  $al, $direction ("source" for "sourceAl" or "target" for "targetAl";
#         default "source")
# output: reference to an array of clusters; each cluster is a hash
#         {source=>[positions], target=>[positions]} where "source" holds the
#         positions indexing the $direction alignment and "target" the
#         positions they are linked to. Two links sharing a position end up in
#         the same cluster. Links to position 0 (NULL) are ignored, and
#         position 0 of the $direction alignment is not visited.
sub getAlClusters {
    my ($al, $direction) = @_;
    $direction = "source" unless defined($direction);
    my $alKey = $direction."Al";

    # group many-to-many linked phrases in clusters
    my %scomp;      # stores in which cluster is each $direction-side position
    my %tcomp;      # same for the positions they are linked to
    my @clusters;

    for (my $j = 1; $j < @{$al->{$alKey}}; $j++) {
        my $links = $al->{$alKey}[$j];
        next unless defined($links);

        foreach my $i (@{$links}) {
            next unless $i > 0;

            my $knownJ = exists($scomp{$j});
            my $knownI = exists($tcomp{$i});

            if ($knownJ && $knownI) {
                next if $tcomp{$i} == $scomp{$j};

                # merge clusters: the one with the higher index is poured into
                # the one with the lower index, then removed
                my ($keep, $drop) = ($scomp{$j}, $tcomp{$i});
                ($keep, $drop) = ($drop, $keep) if $keep > $drop;
                push @{$clusters[$keep]{source}}, @{$clusters[$drop]{source}};
                push @{$clusters[$keep]{target}}, @{$clusters[$drop]{target}};

                # removing $drop shifts down the index of every later cluster
                foreach my $comp (\%scomp, \%tcomp) {
                    foreach my $pos (keys %{$comp}) {
                        my $index = $comp->{$pos};
                        if ($index == $drop) {
                            $comp->{$pos} = $keep;
                        }
                        elsif ($index > $drop) {
                            $comp->{$pos} = $index - 1;
                        }
                    }
                }
                splice @clusters, $drop, 1;
            }
            elsif ($knownJ) {
                my $index = $scomp{$j};
                $tcomp{$i} = $index;
                push @{$clusters[$index]{target}}, $i;
            }
            elsif ($knownI) {
                my $index = $tcomp{$i};
                $scomp{$j} = $index;
                push @{$clusters[$index]{source}}, $j;
            }
            else {
                # neither position seen before: start a new cluster
                push @clusters, {source=>[$j], target=>[$i]};
                $scomp{$j} = $#clusters;
                $tcomp{$i} = $#clusters;
            }
        }
    }
    return \@clusters;
}
|
1487
|
|
|
|
|
|
|
|
|
1488
|
|
|
|
|
|
|
# Builds a phrase from word positions. Despite its name, nothing is printed:
# the phrase is returned.
# input:  $source: side of the alignment ("source" or "target"), used to pick
#                  the sourceWords or targetWords array
#         $posArray: reference to an array of positions of the phrase words
# output: the words found at these positions, in the given order, joined by
#         single spaces
sub printPhrase {
    my ($al, $source, $posArray) = @_;
    return join(" ", map { $al->{$source."Words"}[$_] } @{$posArray});
}
|
1497
|
|
|
|
|
|
|
|
|
1498
|
|
|
|
|
|
|
# SELECT ONLY S LINKS
# Returns a new Lingua::Alignment with the same words as $al and, on each
# side, only the links whose mark is neither "p" nor "P" (presumably the
# "possible" links -- confirm against the code filling sourceLinks/targetLinks).
# The mark of link (j,i) is the first element of the entry stored under the
# key "j i" in sourceLinks (resp. targetLinks); a link without entry or
# without mark is kept.
# Every position of $al gets a link list in the result, possibly empty.
# The input alignment is not modified: marks are looked up without creating
# entries for the links that have none.
sub SLinks {
    my $al = shift;
    my $sal = Lingua::Alignment->new;
    @{$sal->{sourceWords}} = @{$al->{sourceWords}};
    @{$sal->{targetWords}} = @{$al->{targetWords}};

    foreach my $side ("source", "target") {
        my $alKey = $side."Al";
        my $marks = $al->{$side."Links"};

        for (my $j = 0; $j < @{$al->{$alKey}}; $j++) {
            my @kept;
            if (defined($al->{$alKey}[$j])) {
                foreach my $i (@{$al->{$alKey}[$j]}) {
                    # plain hash fetch: does not autovivify a missing entry
                    my $info = defined($marks) ? $marks->{$j." ".$i} : undef;
                    my $mark = defined($info) ? $info->[0] : undef;
                    next if (defined($mark) && ($mark eq "p" || $mark eq "P"));
                    push @kept, $i;
                }
            }
            $sal->{$alKey}[$j] = \@kept;
        }
    }
    return $sal;
}
|
1520
|
|
|
|
|
|
|
|
|
1521
|
|
|
|
|
|
|
|
|
1522
|
|
|
|
|
|
|
1; |