File Coverage

blib/lib/Lingua/AlignmentSet.pm

Criterion	Covered	Total	%
statement	230	1038	22.1
branch	94	476	19.7
condition	23	159	14.4
subroutine	20	36	55.5
pod	0	27	0.0
total	367	1736	21.1

line	stmt	bran	cond	sub	pod	time	code
1							########################################################################
2							# Author: Patrik Lambert (lambert@talp.ucp.es)
3							# Description: Tools library to manage an Alignment Sets, i.e. a set of
4							# sentences aligned at the word (or phrase) level.
5							#-----------------------------------------------------------------------
6							# Copyright 2004 by Patrik Lambert
7							#
8							# This program is free software; you can redistribute it and/or modify
9							# it under the terms of the GNU General Public License as published by
10							# the Free Software Foundation; either version 2 of the License, or
11							# (at your option) any later version.
12							#
13							# This program is distributed in the hope that it will be useful,
14							# but WITHOUT ANY WARRANTY; without even the implied warranty of
15							# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16							# GNU General Public License for more details.
17							#
18							# You should have received a copy of the GNU General Public License
19							# along with this program; if not, write to the Free Software
20							# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21							########################################################################
22
23							package Lingua::AlignmentSet;
24	1			1		25122	use 5.005;
	1					4
	1					68
25	1			1		7	use vars qw($VERSION);
	1					2
	1					58
26	1			1		5	use strict;
	1					7
	1					62
27							$VERSION = 1.1;
28	1			1		661	use Lingua::AlSetLib 1.1;
	1					37
	1					40
29	1			1		927	use Lingua::Alignment 1.1;
	1					40
	1					48
30	1			1		706	use Lingua::WriteLatexFile;
	1					3
	1					30
31	1			1		552	use Lingua::AlignmentEval;
	1					3
	1					34
32
33	1			1		7	use Dumpvalue;
	1					3
	1					23
34	1			1		1128	use IO::File;
	1					12986
	1					14449
35
36							my $dumper=new Dumpvalue;
37							my $true = 1;
38							my $false = 0;
39
40							sub new {
41	2			2	0	33	my ($pkg,$refToFileSets) = @_;
42	2					9	my $refToLocation = readLocation($refToFileSets->[0][0]);
43	2					5	my $format = $refToFileSets->[0][1];
44	2					3	my $range = $refToFileSets->[0][2];
45	2					4	my $alSet = {};
46							#default values:
47	2	50				5	if (!defined($format)){$format="TALP"}
	0					0
	2					4
48							else {$format = uc $format};
49	2	100				7	if (!defined($range)){$range="1-"};
	1					2
50	2	100				6	if ($format eq "BLINKER"){
51							#for future ease we save detailed infos contained in the source sample path
52	1					4	completeBlinkerLocation($refToLocation);
53							}
54
55	2					5	$alSet->{location}=$refToLocation;
56	2					5	$alSet->{format}=$format;
57
58	2					6	setRange($alSet,$range);
59
60							#checking the data:
61	2	50				16	if ($format eq "GIZA"){
		100
		50
		0
62							# if ($ambiguity \|\| $confidence){die "GIZA format not compatible with ambiguity or confidence features"}
63							} elsif ($format eq "BLINKER"){
64							} elsif ($format eq "NAACL"){
65							} elsif ($format eq "TALP"){
66							} else {
67	0					0	die "Unknown format $format. Can't create alignment set object";
68							}
69	2					9	return bless $alSet,$pkg;
70							}
71
72							# create a new AlignmentSet that contains the same data of an already existing alignment set (without copying the addresses)
73							sub copy {
74	0			0	0	0	my $alSet = shift;
75
76	0					0	my $cloneLocation={};
77	0					0	my ($field,$value);
78	0					0	while (($field,$value)=each (%{$alSet->{location}})){
	0					0
79	0					0	$cloneLocation->{$field}=$value;
80							}
81	0					0	return Lingua::AlignmentSet->new([[$cloneLocation,$alSet->{format},$alSet->{firstSentPair}."-".$alSet->{lastSentPair}]]);
82							}
83
84							sub setWordFiles{
85	1			1	0	10	my ($alSet,$sourcePath,$targetPath) = @_;
86
87	1					7	$alSet->{location}->{source}=$sourcePath;
88	1					4	$alSet->{location}->{target}=$targetPath;
89							}
90							sub setSourceFile{
91	0			0	0	0	my ($alSet,$sourcePath) = @_;
92
93	0					0	$alSet->{location}->{source}=$sourcePath;
94							}
95							sub setTargetFile{
96	0			0	0	0	my ($alSet,$targetPath) = @_;
97
98	0					0	$alSet->{location}->{target}=$targetPath;
99							}
100
101							sub setTargetToSourceFile{
102	0			0	0	0	my ($alSet,$targetToSourcePath) = @_;
103
104	0					0	$alSet->{location}->{targetToSource}=$targetToSourcePath;
105							}
106
107							sub chFormat {
108	0			0	0	0	my ($alSet,$newLocation,$newFormat,$alignMode)=@_;
109
110	0					0	$alSet->convert($newLocation,$newFormat,$alignMode);
111							}
112
113
114							# Won't work if the sentence files are not specified
115							sub visualise {
116	0			0	0	0	my ($alSet,$representation,$format,$outputFH,$mark,$alignMode,$maxRows,$maxCols)=@_;
117	0					0	$representation = lc $representation;
118	0					0	$format = lc $format;
119	0	0				0	if (!defined($outputFH)){$outputFH=*STDOUT}
	0					0
120	0	0				0	if ($representation eq "matrix"){
121	0	0				0	if (!defined($mark)){$mark = "cross"}
	0					0
122	0	0				0	if (!defined($maxRows)){$maxRows = 53} #default maxRows value
	0					0
123	0	0				0	if (!defined($maxCols)){$maxCols = 35} #default maxRows value
	0					0
124	0					0	$format="latex";
125							}
126	0					0	my $latex = Lingua::Latex->new;
127	0	0				0	if ($format eq "latex"){
128	0					0	print $outputFH $latex->startFile;
129	0					0	print $outputFH $latex->setTabcolsep("0.5mm");
130							}
131	0					0	my $output = "";
132	0					0	my $inputSentPairNum = $alSet->{firstSentPair};
133	0					0	my $i;
134	0					0	my ($al,$alSetChunk);
135	0					0	my $FH = $alSet->openFiles();
136	0	0	0			0	if (($alSet->{format} ne "GIZA") && (!$FH->{source} \|\| !$FH->{target})){
			0
137	0					0	die "To use the 'visualise' function, you must specify the sentence (words) files.\n";
138							}
139	0					0	while ($alSetChunk = $alSet->loadChunk($FH,$inputSentPairNum,$alignMode)){ # returns 0 if eof or last sentence pair
140	0					0	$output = "";
141	0					0	for ($i=0;$i<@$alSetChunk;$i++){
142	0					0	$al = $$alSetChunk[$i];
143							# print main::Dumper($al);
144	0	0				0	if ($representation eq "matrix"){
		0
145	0					0	$output.= "\n$inputSentPairNum\n".$al->displayAsMatrix($latex,$mark,$maxRows,$maxCols);
146							}elsif ($representation eq "enumlinks"){
147	0					0	$output.= "\n$inputSentPairNum\n".$al->displayAsLinkEnumeration($format,$latex);
148							} #elsif
149							}#for
150	0					0	print $outputFH $output;
151	0					0	$inputSentPairNum++;
152							}
153	0	0				0	if ($format eq "latex"){print $outputFH $latex->endFile};
	0					0
154							}
155
156							#only work if the text files are given (not only the alignment files).
157							sub getSize {
158	0			0	0	0	my $alSet = shift;
159	0					0	my ($file,$factor);
160	0					0	my $size;
161
162	0	0	0			0	if ($alSet->{format} eq "GIZA"){
		0	0
163	0					0	$file = $alSet->{location}->{sourceToTarget};
164	0					0	$factor = 3;
165							}elsif ($alSet->{format} eq "NAACL" \|\| $alSet->{format} eq "BLINKER" \|\| $alSet->{format} eq "TALP"){
166	0	0				0	if (!$alSet->{location}->{source}){
167	0					0	die "One of the functions your are using requires you specify the sentence files (source and target)\n";
168							}
169	0					0	$file = $alSet->{location}->{source};
170	0					0	$factor = 1;
171							}
172	0					0	open (FILE,"<$file");
173	0					0	$size += tr/\n/\n/ while sysread(FILE, $_, 2 ** 16);
174	0					0	close(FILE);
175	0					0	$size = $size / $factor;
176	0					0	return $size;
177							}
178
179							# returns a list (in random order) of lineNumbers
180							# to sort this list, do: my @sortedSelection = sort { $a <=> $b; } @selection;
181
182							sub chooseSubsets {
183							#TO DO: possibility of percentage input for the size
184	0			0	0	0	my ($alSet,$seed,$size) = @_;
185	0					0	my $alSetSize = $alSet->getSize();
186	0					0	my $count;
187	0					0	my @selected=();
188	0					0	my @notSelected = ();
189	0					0	my ($ind,$elt);
190
191	0					0	for ($count=1;$count<$alSetSize;$count++){
192	0					0	push @notSelected,$count;
193							}
194	0					0	srand $seed;
195	0					0	for ($count=0;$count<$size;$count++){
196	0					0	$ind = rand @notSelected;
197	0					0	$elt = $notSelected[$ind];
198	0					0	splice @notSelected, $ind, 1;
199	0					0	push @selected,$elt;
200							}
201	0					0	return \@selected;
202							}
203							###################################################################
204							### EVALUATION ###
205							###################################################################
206
207							#code adapted from Rada Mihalcea's wa_eval_align.pl, rada@cs.unt.edu
208							# Evaluation is performed using:
209							# - Standard Precision, Recall, F-measure, separate for S (Sure) and P (Possible) cases
210							# - AER measure, defined as
211							# AER = 1 - ( \|A & S\| + \|A & P\| ) / ( \|A\| + \|S\| )
212							# [where A represents the alignment, S and P represent the S (Sure) and P (Possible) gold standard alignments]
213
214							sub evaluate {
215	1			1	0	8	my ($submissionAlSet,$answerAlSet,$alignMode,$weighted)=@_;
216	1	50				4	if (!defined($weighted)){$weighted=0}
	1					1
217	1					2	my ($line,$alignment);
218	0					0	my ($FH,$alSetChunk,$i,$al,$fhPos);
219	0					0	my ($inputSentPairNum,$internalSentPairNum,$sentPairNum);
220	0					0	my ($sureMatch,$possibleMatch,$possibleMatchSure);
221	0					0	my ($surePrecision,$sureRecall,$possiblePrecision,$possibleRecall,$sureFMeasure,$possibleFMeasure,$AER);
222							# 1 READ ANSWER AND SUBMISSION FILES
223							# (in the case of NAACL format file it's more efficient to treat it directly, otherwise load to internal structure)
224							# answer file
225	0					0	my %sureAnswer;
226	0					0	my %possibleAnswer;
227	1					2	my $INFINITY = 9999999999;
228	1					2	$inputSentPairNum = $answerAlSet->{firstSentPair};
229	1					2	$internalSentPairNum = 1;
230	1	50	33			7	if ( $answerAlSet->{format} eq "NAACL" && $alignMode eq "as-is" && $answerAlSet->{firstSentPair} == 1){
			33
231	0	0				0	my $answerFH = IO::File->new("<".$answerAlSet->{location}{sourceToTarget}) or die "Answer alignment file opening error:$!";
232							#go to first sentence pair:
233	0					0	$fhPos = $answerFH->getpos;
234	0		0			0	while ($answerFH->getline() !~ m/^0$inputSentPairNum ./o && !$answerFH->eof()) {
235	0					0	$fhPos = $answerFH->getpos;
236							}
237	0	0				0	if ($answerFH->eof()){
238	0					0	die "First sentence pair of range not found in ".$answerAlSet->{location}{sourceToTarget};
239							}
240	0					0	$answerFH->setpos($fhPos); #if we changed sentences, we read the first line of next sentence=>go back one line
241
242							#read file:
243	0	0				0	if ($answerAlSet->{lastSentPair} eq "eof"){
244	0					0	$inputSentPairNum = $INFINITY;
245							}else{
246	0					0	$inputSentPairNum = $answerAlSet->{lastSentPair}+1;
247							}
248	0		0			0	while(!$answerFH->eof() && ( ($line=$answerFH->getline()) !~ m/^0$inputSentPairNum ./o )) {
249	0					0	chomp $line;
250	0					0	$line =~ s/^\s+\|\s+$//g;
251	0					0	identifySurePossible($line,\%sureAnswer,\%possibleAnswer);
252							}
253	0					0	$answerFH->close();
254							}else{
255	1					7	$FH = $answerAlSet->openFiles();
256	1					12	while ($alSetChunk = $answerAlSet->loadChunk($FH,$inputSentPairNum,$alignMode)){ # returns 0 if eof or last sentence pair
257	10					52	for ($i=0;$i<@$alSetChunk;$i++){
258	10					15	$al = $$alSetChunk[$i];
259							# print "EVALUATE: answer al:\n";
260							# print main::Dumper($al);
261	10					52	foreach $line (@{$al->writeToBlinker()}){
	10					32
262	90					142	$line = "$internalSentPairNum ".$line;
263	90					179	identifySurePossible($line,\%sureAnswer,\%possibleAnswer);
264							}
265							}
266	10					12	$inputSentPairNum++;
267	10					27	$internalSentPairNum++;
268							}
269	1					4	closeFiles($FH,$answerAlSet->{format});
270							}# if format - else
271
272							# submission file
273	1					2	my %sureSubmission;
274							my %possibleSubmission;
275	1					2	$inputSentPairNum = $submissionAlSet->{firstSentPair};
276	1					2	$internalSentPairNum = 1;
277
278	1	50	33			8	if ($submissionAlSet->{format} eq "NAACL" && $alignMode eq "as-is" && $submissionAlSet->{firstSentPair}==1){
			33
279	0	0				0	my $submissionFH = IO::File->new("<".$submissionAlSet->{location}{sourceToTarget}) or die "Submission alignment file opening error:$!";
280
281							#go to first sentence pair:
282	0					0	$fhPos = $submissionFH->getpos;
283	0		0			0	while ($submissionFH->getline() !~ m/^0$inputSentPairNum ./o && !$submissionFH->eof()) {
284	0					0	$fhPos = $submissionFH->getpos;
285							}
286	0	0				0	if ($submissionFH->eof()){
287	0					0	die "First sentence pair of range not found in ".$submissionAlSet->{location}{sourceToTarget};
288							}
289	0					0	$submissionFH->setpos($fhPos); #if we changed sentences, we read the first line of next sentence=>go back one line
290
291							#read file:
292	0	0				0	if ($submissionAlSet->{lastSentPair} eq "eof"){
293	0					0	$inputSentPairNum = $INFINITY;
294							}else{
295	0					0	$inputSentPairNum = $submissionAlSet->{lastSentPair}+1;
296							}
297	0		0			0	while(!$submissionFH->eof() && (($line = $submissionFH->getline()) !~ m/^0$inputSentPairNum ./o )) {
298	0					0	chomp $line;
299	0					0	$line =~ s/^\s+\|\s+$//g;
300	0					0	identifySurePossible($line,\%sureSubmission,\%possibleSubmission);
301							}
302	0					0	$submissionFH->close();
303							}else{
304	1					4	$FH = $submissionAlSet->openFiles();
305	1					4	while ($alSetChunk = $submissionAlSet->loadChunk($FH,$inputSentPairNum,$alignMode)){ # returns 0 if eof or last sentence pair
306	10					27	for ($i=0;$i<@$alSetChunk;$i++){
307	10					12	$al = $$alSetChunk[$i];
308							# print "submission al:\n";
309							# $dumper->dumpValue($al);
310	10					56	foreach $line (@{$al->writeToBlinker()}){
	10					38
311	59					100	$line = "$internalSentPairNum ".$line;
312	59					120	identifySurePossible($line,\%sureSubmission,\%possibleSubmission);
313							}
314							}
315	10					11	$inputSentPairNum++;
316	10					29	$internalSentPairNum++;
317							}
318	1					6	closeFiles($FH,$submissionAlSet->{format});
319							}# if format=NAACL else
320
321
322							# print "weighted:$weighted\n";
323							# print "SA:".join("-",keys %sureAnswer),"\nSS:".join(" - ",keys %sureSubmission),"\nPA:".join(" - ",keys %possibleAnswer),"\nPS:".join(" - ",keys %possibleSubmission)."\n";
324
325							# 2 WEIGHT LINKS
326							# It is a kind of "normalization" of multiple links: each link (j i) is weighted according to
327							# the number of links in which j and i are involved: weight(j,i)=0.5*(1/numLinks(j)+1/numLinks(i)).
328
329	1					3	my ($link,$j,$hash,$value);
330	0					0	my %weightsSure;
331	0					0	my %linksSure;
332	0					0	my @linksSureInSentence;
333	0					0	my %linksPossible;
334	0					0	my @linksPossibleInSentence;
335	0					0	my %weightsPossible;
336
337	1	50				6	if ($weighted){
338							# When only sure links are considered (calculation of Ps and Rs), they are weighted with respect to the union of both sure sets
339							# take union
340	0					0	foreach $hash ( \%sureSubmission, \%sureAnswer ) {
341	0					0	while (($link, $value) = each %$hash) {
342	0					0	($sentPairNum,$j,$i)=split(" ",$link);
343	0					0	$linksSure{$sentPairNum}{"$j $i"} = $value;
344							}
345							}
346							# calculate weight of each link
347	0					0	foreach $sentPairNum (keys %linksSure){
348	0					0	@linksSureInSentence =keys %{$linksSure{$sentPairNum}};
	0					0
349	0					0	foreach $link (@linksSureInSentence){
350	0					0	($j,$i)=split(" ",$link);
351	0					0	$weightsSure{"$sentPairNum $link"}=0.5*( 1/grep(/^$j /,@linksSureInSentence)+1/grep(/ $i$/,@linksSureInSentence) );
352							}
353							}
354
355							# When all links are considered (calculation of Pp and Rp, AER), possible AND sure links are weighted with respect to the union of all sets.
356	0					0	%linksPossible=%linksSure;
357							# add union of possible links
358	0					0	foreach $hash (\%possibleSubmission, \%possibleAnswer ) {
359	0					0	while (($link, $value) = each %$hash) {
360	0					0	($sentPairNum,$j,$i)=split(" ",$link);
361	0					0	$linksPossible{$sentPairNum}{"$j $i"} = $value;
362							}
363							}
364							# calculate weight of each link
365	0					0	foreach $sentPairNum (keys %linksPossible){
366	0					0	@linksPossibleInSentence =keys %{$linksPossible{$sentPairNum}};
	0					0
367	0					0	foreach $link (@linksPossibleInSentence){
368	0					0	($j,$i)=split(" ",$link);
369	0					0	$weightsPossible{"$sentPairNum $link"}=0.5*( 1/grep(/^$j /,@linksPossibleInSentence)+1/grep(/ $i$/,@linksPossibleInSentence) );
370							}
371							}
372							}
373
374							# 3 SUM UP LINKS
375							# in case of weights distinct from 1: sum of %possibleAnswer and %possibleSubmission is always with %weightsPossible.
376							# however the sum of %sureAnswer and %sureSubmission is with %weightsSure to calculate Ps and Rs, %weightsPossible for Pp, Rp and AER.
377	1					4	my ($totalPossibleAnswer,$totalPossibleSubmission)=(0,0);
378	1					2	my ($totalSureAnswer_weightsSure,$totalSureSubmission_weightsSure,$totalSureAnswer_weightsPossible,$totalSureSubmission_weightsPossible)=(0,0,0,0);
379	1	50				9	if ($weighted){
380	0					0	foreach $link (keys %sureAnswer){
381	0					0	$totalSureAnswer_weightsSure+=$weightsSure{$link};
382	0					0	$totalSureAnswer_weightsPossible+=$weightsPossible{$link};
383							}
384	0					0	foreach $link (keys %sureSubmission){
385	0					0	$totalSureSubmission_weightsSure+=$weightsSure{$link};
386	0					0	$totalSureSubmission_weightsPossible+=$weightsPossible{$link};
387							}
388	0					0	foreach $link (keys %possibleAnswer){
389	0					0	$totalPossibleAnswer+=$weightsPossible{$link};
390							}
391	0					0	foreach $link (keys %possibleSubmission){
392	0					0	$totalPossibleSubmission+=$weightsPossible{$link};
393							}
394							}else{ #every link has a weight 1
395	1					8	$totalSureAnswer_weightsSure=scalar(keys %sureAnswer);
396	1					2	$totalSureAnswer_weightsPossible= $totalSureAnswer_weightsSure;
397	1					3	$totalSureSubmission_weightsSure=scalar(keys %sureSubmission);
398	1					3	$totalSureSubmission_weightsPossible=$totalSureSubmission_weightsSure;
399	1					2	$totalPossibleAnswer=scalar(keys %possibleAnswer);
400	1					3	$totalPossibleSubmission=scalar(keys %possibleSubmission);
401							}
402
403							# 4 COUNT MATCHES
404
405							# print "sureSubmission:",join("\|",keys %sureSubmission),"\n";
406							# print "possibleSubmission:",join("\|",keys %possibleSubmission),"\n";
407							# print "sureAnswer:",join("\|",keys %sureAnswer),"\n";
408							# print "possibleAnswer:",join("\|",keys %possibleAnswer),"\n";
409							# print "\n";
410							# now determine the S[ure] matches
411	1					2	$sureMatch = 0;
412	1					13	foreach $alignment (keys %sureSubmission) {
413	59	100				122	if(defined($sureAnswer{$alignment})) {
414	54	50				80	if (!$weighted){$sureMatch++}
	54					68
	0					0
415							else {$sureMatch += $weightsSure{$alignment}}
416							}
417							}
418							# and the [P]robable matches
419							# these are checked against both S[ure] and P[robable] correct alignments
420	1					6	$possibleMatch = 0;
421	1					10	foreach $alignment (keys %possibleSubmission, keys %sureSubmission) {
422	59	100	66			153	if(defined($sureAnswer{$alignment}) \|\| defined($possibleAnswer{$alignment})) {
423	54	50				79	if (!$weighted){$possibleMatch++}
	54					66
	0					0
424							else{$possibleMatch += $weightsPossible{$alignment}}
425							}
426							}
427							# and also the intersection between all submitted alignments
428							# and the S [Sure] correct alignments -- as needed by AER
429	1					8	$possibleMatchSure = 0;
430	1					9	foreach $alignment (keys %possibleSubmission, keys %sureSubmission) {
431	59	100				120	if(defined($sureAnswer{$alignment})) {
432	54	50				77	if (!$weighted){$possibleMatchSure++}
	54					67
	0					0
433							else{$possibleMatchSure+= $weightsPossible{$alignment}}
434							}
435							}
436							# print "sureMatch:$sureMatch possibleMatch:$possibleMatch possibleMatchSure:$possibleMatchSure\n";
437
438							# 5 COMPUTE EVALUATION MEASURES
439							# now determine the precision, recall, and F-measure for [S]ure alignments
440	1	50				8	if(scalar(keys %sureSubmission) != 0) {
441	1					3	$surePrecision = $sureMatch / $totalSureSubmission_weightsSure;
442							}else {
443	0					0	$surePrecision = 0;
444							}
445	1	50				5	if(scalar(keys %sureAnswer) != 0) {
446	1					4	$sureRecall = $sureMatch / $totalSureAnswer_weightsSure;
447							}else {
448	0					0	$sureRecall = 0;
449							}
450	1	50	33			9	if($sureRecall != 0 && $surePrecision != 0) {
451	1					4	$sureFMeasure = 2 * $sureRecall * $surePrecision / ($sureRecall + $surePrecision);
452							}else {
453	0					0	$sureFMeasure = 0;
454							}
455
456
457							# and now determine the precision, recall, and F-measure for [P]robable alignments
458	1	50				6	if(scalar(keys %sureSubmission) + scalar(keys %possibleSubmission) != 0) {
459	1					2	$possiblePrecision = $possibleMatch / ($totalSureSubmission_weightsPossible+$totalPossibleSubmission);
460							}else {
461	0					0	$possiblePrecision = 0;
462							}
463	1	50				7	if(scalar(keys %sureAnswer) + scalar(keys %possibleAnswer)!= 0) {
464	1					3	$possibleRecall = $possibleMatch / ($totalSureAnswer_weightsPossible+$totalPossibleAnswer);
465							}else {
466	0					0	$possibleRecall = 0;
467							}
468	1	50	33			8	if($possibleRecall != 0 && $possiblePrecision != 0) {
469	1					2	$possibleFMeasure = 2 * $possibleRecall * $possiblePrecision / ($possibleRecall + $possiblePrecision);
470							}else {
471	0					0	$possibleFMeasure = 0;
472							}
473
474							# and determine the AER
475	1	50				3	if(scalar(keys %sureSubmission) + scalar(keys %possibleSubmission) != 0) {
476	1					3	$AER = 1 - ($possibleMatchSure + $possibleMatch) / ($totalSureSubmission_weightsPossible+$totalPossibleSubmission+$totalSureAnswer_weightsPossible);
477							}else {
478	0					0	$AER = 0;
479							}
480	1					11	return Lingua::AlignmentEval->new($surePrecision,$sureRecall,$sureFMeasure,$possiblePrecision,$possibleRecall,$possibleFMeasure,$AER);
481							}
482							###################################################################
483							### PROCESSING ###
484							###################################################################
485
486							sub processAlignment{
487	0			0	0	0	my ($alSet,$AlignmentSub,$newLocation,$newFormat,$alignMode)=@_;
488	0					0	my $newAlSet = $alSet->copy;
489	0	0				0	if (ref($AlignmentSub) eq 'ARRAY'){
490	0	0				0	if ($AlignmentSub->[0] eq "Lingua::Alignment::eliminateWord"){
491	0	0				0	if (@$AlignmentSub<3){die "Missing parameters for Lingua::Alignment::eliminateWord\n"}
	0					0
492							else{
493	0					0	my $side = lc $AlignmentSub->[2];
494	0	0	0			0	if (!$alSet->{location}{$side} \|\| !$newLocation->{$side}){die "Missing $side file for Lingua::Alignment::eliminateWord\n"}
	0					0
495							}
496							}
497							}
498	0					0	$newAlSet->convert($newLocation,$newFormat,$alignMode,$AlignmentSub);
499	0					0	return $newAlSet;
500							}
501
502							sub symmetrize {
503	0			0	0	0	my ($alSet,$newLocation,$newFormat,$ENV,$selectSubgroups,$alignMode,$globals)=@_;
504							#defaults
505	0	0				0	if (!defined($selectSubgroups)){$selectSubgroups=0}
	0					0
506	0	0				0	if (!defined($alignMode)){$alignMode="no-null-align"}
	0					0
507	0	0				0	if (!defined($globals->{"minPhraseFrequency"})){$globals->{"minPhraseFrequency"}=2};
	0					0
508	0	0				0	if (!defined($globals->{"extendGroups"})){$globals->{"extendGroups"}=0};
	0					0
509	0	0				0	if (!defined($globals->{"onlyGroups"})){$globals->{"onlyGroups"}=1};
	0					0
510	0	0				0	if (!defined($globals->{"defaultActionGrouping"})){$globals->{"defaultActionGrouping"}="Lingua::Alignment::getUnion"};
	0					0
511	0	0				0	if (!defined($globals->{"defaultActionGeneral"})){$globals->{"defaultActionGeneral"}="Lingua::Alignment::intersect"};
	0					0
512	0	0				0	if (!defined($globals->{"verbose"})){$globals->{"verbose"}="0"};
	0					0
513	0					0	my $verbose = $globals->{"verbose"};
514	0					0	my $al; # reference alignment- remains unchanged
515							my $modAl; #reference alignment modified with the successive aplication of symRules
516							#load in memory a chunk of the alignment set as a list
517							#of references to (internal representation) alignment objects:
518	0					0	my ($k,$alSetChunk);
519	0					0	my $FH = $alSet->openFiles();
520	0					0	my $newFH;
521	0	0				0	if ($selectSubgroups==0){
522	0					0	$newFH = openLocation($newLocation,$newFormat,">",$alSet->{location});
523							}
524	0					0	my $internalSentPairNum = 1;
525	0					0	my ($sentenceNum,$ruleApplied) = ($alSet->{firstSentPair},0);
526
527	0					0	my $grSentPairNum=1;
528
529	0					0	my ($j,$i);
530	0					0	my ($lines,$line);
531	0					0	my $groups = {};
532	0					0	my $groupsCurrentSentence = {};
533	0					0	my $groupKeys = [];
534	0					0	my $subGroups={};
535	0					0	my $subGroupsCurrentSentence = {};
536	0					0	my $subGroupKeys=[];
537	0					0	my ($candidate,$count);
538
539	0	0				0	if (!$selectSubgroups){ #load subgroup hash and array:
540	0					0	open(GROUPS,"<$ENV/groups");
541	0					0	while (){
542	0					0	push @$groupKeys,$_;
543	0					0	@$line = split " \| ",$_,2;
544	0					0	$groups->{$line->[1]}=$line->[0];
545							}
546	0	0				0	if ($globals->{onlyGroups}==0){
547	0					0	open(SUBGROUPS,"<$ENV/subGroups");
548	0					0	while (){
549	0					0	push @$subGroupKeys,$_;
550	0					0	@$line = split " \| ",$_,2;
551	0					0	$subGroups->{$line->[1]}=$line->[0];
552							}
553							}
554							}
555
556	0					0	my %anchors;
557	0					0	my %sourcePerturbed={};
558	0					0	my %targetPerturbed={}; # Perturbations must be distinct so we keep track of the already detected "Perturbed" $j's
559	0					0	my ($perturbation,$perturbationNoMod);
560	0					0	my ($lastAnchorSource,$lastAnchorTarget,$newAnchorSource,$newAnchorTarget);
561	0					0	my ($ind,$newPerturbationDetected,$anchorsInTarget);
562	0					0	my ($countPertubs,$countGrouping,$countOneToMany,$countElse,$countNoGroup)=(0,0,0,0,0);
563	0					0	while ($alSetChunk = $alSet->loadChunk($FH,$sentenceNum,$alignMode)){ # returns 0 if eof or last sentence pair
564	0					0	for ($k=0;$k<@$alSetChunk;$k++){
565							# print "\nsentence pair $sentenceNum\n";
566	0					0	$ruleApplied=0;
567	0					0	$al = $$alSetChunk[$k];
568	0	0				0	if ($verbose >0){
569	0					0	print $sentenceNum."\n";
570	0					0	print $al->sourceSentence."\n";
571	0					0	print $al->targetSentence."\n";
572							}
573	0					0	$modAl = $al->clone();
574	0					0	($lastAnchorSource,$lastAnchorTarget)=(0,0);
575	0					0	%sourcePerturbed=();
576	0					0	%targetPerturbed=();
577	0					0	$j = 1;
578							#detect "perturbations" in the anchor diagonal looping only over $j (to have less repeated zones). We can only miss those where $i is aligned only to NULL
579	0					0	while ($j<@{$al->{sourceAl}}){
	0					0
580	0		0			0	while ( !$al->isAnchor($j,"source") && $j<(@{$al->{sourceAl}})){
	0					0
581	0					0	$j++;
582							}
583	0	0				0	if ($j<=@{$al->{sourceAl}}){
	0					0
584	0	0				0	if ($j==@{$al->{sourceAl}}){
	0					0
585	0					0	($newAnchorSource,$newAnchorTarget) = ($j,scalar(@{$al->{targetAl}}));
	0					0
586							}else{
587	0					0	($newAnchorSource,$newAnchorTarget) = ($j,$al->{sourceAl}[$j][0]);
588							}
589	0					0	$newPerturbationDetected=0;
590	0	0	0			0	if (($newAnchorSource-$lastAnchorSource)!=1 && !$sourcePerturbed{$lastAnchorSource+1}){
		0	0
591	0					0	$newPerturbationDetected = 1;
592							} elsif (($newAnchorTarget-$lastAnchorTarget)!=1 && !$targetPerturbed{$lastAnchorTarget+1}){
593	0					0	$anchorsInTarget=1;
594	0					0	for ($i=$lastAnchorTarget+1;$i<$newAnchorTarget;$i++){
595	0	0				0	if (!$al->isAnchor($i,"target")){$anchorsInTarget=0}
	0					0
596							}
597	0	0				0	if (!$anchorsInTarget){$newPerturbationDetected=1};
	0					0
598							}
599	0	0				0	if ( $newPerturbationDetected ){
600	0					0	$countPertubs++;
601							# print "\n($lastAnchorSource,$lastAnchorTarget,$newAnchorSource,$newAnchorTarget)\n";
602
603	0					0	$perturbation = $al->cut($lastAnchorSource,$lastAnchorTarget,$newAnchorSource,$newAnchorTarget);
604	0					0	$perturbationNoMod = $al->cut($lastAnchorSource,$lastAnchorTarget,$newAnchorSource,$newAnchorTarget);
605
606							# PRINT PERTURBATION TO FILE
607							#if (exists($newFH->{source})){
608							# $newFH->{source}->print(" ~~".join(" ",@{$perturbation->{sourceWords}})."~~ \n");
609							#}
610							#if (exists($newFH->{target})){
611							# $newFH->{target}->print(" ~~".join(" ",@{$perturbation->{targetWords}})."~~ \n");
612							#}
613							#$perturbation->getUnion;
614							#$lines = $perturbation->writeToBlinker;
615							#foreach $line (@$lines){
616							# $newFH->{sourceToTarget}->print("$grSentPairNum $line\n");
617							#}
618							#$grSentPairNum++;
619
620							#END PRINT PERTURBATION TO FILE
621	0	0				0	if ($selectSubgroups){
622	0					0	$perturbation->selectSubgroups($groupsCurrentSentence,$subGroupsCurrentSentence,$globals);
623							}else{
624	0	0				0	if (1==0){
625							# if ($ruleApplied=$perturbation->applyOneToMany_2()){
626							$countOneToMany++;
627	0					0	}elsif (($ruleApplied=$perturbation->applyGrouping($groupKeys,$subGroupKeys,$globals))>0){
628	0					0	$countGrouping++;
629							}else{
630	0					0	my $defaultActionGen = $globals->{defaultActionGeneral};
631	0					0	$perturbation->$defaultActionGen();
632	0	0				0	if ($ruleApplied==-1){
633	0					0	$countNoGroup++;
634							}else{
635	0					0	$perturbation->processNull();
636	0					0	$countElse++;
637							}
638							}
639	0					0	$perturbation->paste($modAl);
640							}
641							# print "\ns indices:",join (" ",keys %{$perturbation->{sourceIndices}}),"\n";
642							# print "t indices:",join (" ",keys %{$perturbation->{targetIndices}}),"\n";
643	0					0	foreach $ind (keys %{$perturbation->{sourceIndices}}){
	0					0
644	0	0				0	if ($ind>0){
645	0					0	$sourcePerturbed{$ind+$perturbation->{zeroSource}}=1;
646							}
647							}
648	0					0	foreach $ind (keys %{$perturbation->{targetIndices}}){
	0					0
649	0	0				0	if ($ind>0){
650	0					0	$targetPerturbed{$ind+$perturbation->{zeroTarget}}=1;
651							}
652							}
653							# print "s perturbed:",join (" ",keys %sourcePerturbed),"\n";
654							# print "t perturbed:",join (" ",keys %targetPerturbed),"\n";
655							} #if perturbation
656
657	0					0	$anchors{"$newAnchorSource $newAnchorTarget"}=1;
658	0					0	($lastAnchorSource,$lastAnchorTarget) = ($newAnchorSource,$newAnchorTarget);
659	0					0	$j++;
660							}
661							} #while j...
662	0	0	0			0	if ($newFormat eq "NAACL" && !$selectSubgroups){
663	0	0				0	if (exists($newFH->{source})){
664	0					0	$newFH->{source}->print(" ~~".join(" ",@{$modAl->{sourceWords}})."~~ \n");
	0					0
665							}
666	0	0				0	if (exists($newFH->{target})){
667	0					0	$newFH->{target}->print(" ~~".join(" ",@{$modAl->{targetWords}})."~~ \n");
	0					0
668							}
669	0					0	$al->intersect();
670	0					0	$lines = $modAl->writeToBlinker;
671	0					0	foreach $line (@$lines){
672	0					0	$newFH->{sourceToTarget}->print("$internalSentPairNum $line\n");
673							}
674							}
675	0	0				0	if (($internalSentPairNum % 1000)==0){print STDERR $internalSentPairNum}
	0	0				0
	0					0
676							elsif (($internalSentPairNum % 100)==0){print STDERR "."}
677	0					0	$sentenceNum++;
678	0					0	$internalSentPairNum++;
679	0	0				0	if ($verbose > 0){print "Candidates:\n";}
	0					0
680	0	0				0	if ($selectSubgroups){
681	0					0	foreach $candidate (keys %$groupsCurrentSentence){
682	0	0				0	if ($verbose > 0){print "$candidate\n";}
	0					0
683	0					0	$groups->{$candidate}=$groups->{$candidate}+1;
684							}
685	0					0	%$groupsCurrentSentence=();
686	0	0				0	if ($globals->{onlyGroups}==0){
687	0					0	foreach $candidate (keys %$subGroupsCurrentSentence){
688	0					0	$subGroups->{$candidate}=$subGroups->{$candidate}+1;
689							}
690	0					0	%$subGroupsCurrentSentence=();
691							}
692							}
693							}#for k<@alSetChunk
694							} #while alsetchunk
695	0					0	print STDERR "\n";
696	0	0				0	if ($selectSubgroups==0){
697	0					0	closeFiles($newFH,$newFormat);
698							}
699	0					0	closeFiles($FH,$alSet->{format});
700	0	0				0	if ($selectSubgroups){
701	0	0				0	if ($verbose>0){print "\ngroups:",scalar(keys(%$groups))," - subgroups:",scalar(keys(%$subGroups)),"\n";}
	0					0
702	0	0				0	open(GROUPS, ">$ENV/groups") or die "File opening error:$!";;
703	0					0	while (($candidate,$count)=each(%$groups)){
704							# print "groups $count \| $candidate\n";
705	0	0				0	if ($count >= $globals->{minPhraseFrequency}){
706	0					0	print GROUPS "$count \| $candidate\n";
707							}
708							}
709	0	0				0	if ($globals->{onlyGroups}==0){
710	0	0				0	open(SUBGROUPS, ">$ENV/subGroups") or die "File opening error:$!";;
711	0					0	while (($candidate,$count)=each(%$subGroups)){
712							# print "SUBGROUPS $count \| $candidate\n";
713	0	0				0	if ($count >= $globals->{minPhraseFrequency}){
714	0					0	print SUBGROUPS "$count \| $candidate\n";
715							}
716							}
717							}
718							}else{
719	0					0	print STDERR "perturbations:$countPertubs (oneToMany:$countOneToMany grouped:$countGrouping not grouped:$countNoGroup others:$countElse)\n";
720	0					0	$alSet->{location}=$newLocation;
721	0					0	$alSet->{format}=$newFormat;
722	0					0	$alSet->{firstSentPair}=1;
723	0					0	$alSet->{lastSentPair}="eof";
724							}
725							}
726
727							sub orderAsBilCorpus {
728	0			0	0	0	my ($alSet,$newLocation,$newFormat,$alignMode,$corpSrc,$corpTrg,$verbose)=@_;
729	0	0				0	if (!defined($newFormat)){$newFormat="TALP"}
	0					0
	0					0
730							else {$newFormat = uc $newFormat}
731	0					0	$newLocation = readLocation($newLocation);
732	0					0	my $FH = $alSet->openFiles();
733	0					0	my $newFH = openLocation($newLocation,$newFormat,">",$alSet->{location});
734	0					0	my $inputSentPairNum=$alSet->{firstSentPair};
735	0					0	my $internalSentPairNum = 1;
736	0	0				0	if ($verbose >0){
737	0					0	select STDOUT; $\| = 1; # enable autoflush (desactivate buffering)
	0					0
738							}
739	0	0				0	open(CS,"<$corpSrc") \|\| die "$corpSrc file opening error !";
740	0	0				0	open(CT,"<$corpTrg") \|\| die "$corpTrg file opening error !";
741
742							# LOAD BILINGUAL CORPUS IN HASH
743	0					0	my %newcorp;
744	0					0	my $cntCorp=0;
745	0					0	while (my $s=) {
746	0					0	chomp $s;
747	0					0	my $t=;
748	0					0	chomp $t;
749	0					0	$newcorp{"$s \|\|\| $t"}=1;
750	0					0	$cntCorp++;
751							}
752	0	0				0	if ($verbose > 0){
753	0					0	print "Number of different sentence pairs in new corpus:".scalar(keys %newcorp)."\n";
754							}
755
756							# PARSE AL SET
757	0					0	my $cntToFind=$cntCorp;
758	0					0	my %found;
759	0					0	while (my $alSetChunk = $alSet->loadChunk($FH,$inputSentPairNum,"as-is")){ # returns 0 if eof or last sentence pair
760	0					0	for (my $i=0;$i<@$alSetChunk;$i++){
761	0					0	my $al = $$alSetChunk[$i];
762	0					0	my $pair=$al->sourceSentence." \|\|\| ".$al->targetSentence;
763	0	0				0	if ($newcorp{$pair}){
764	0					0	$found{$pair}=$al;
765	0					0	$cntToFind--;
766							}
767							} #for
768	0	0				0	if ($cntToFind==0){
769	0	0				0	if ($verbose>0){print STDERR "Leaving loop of giza file at line:$inputSentPairNum\n";}
	0					0
770	0					0	last;
771							}
772	0					0	$inputSentPairNum++;
773	0	0				0	if ($verbose>0){
774	0	0				0	if ($inputSentPairNum % 100000 ==0){print $inputSentPairNum;}
	0					0
775	0	0				0	if ($inputSentPairNum % 10000 ==0){print ".";}
	0					0
776							}
777							} #while
778	0	0				0	if ($verbose>0){print "\n";}
	0					0
779							# REORDER AL SET
780	0					0	print "reordering Alignment set...\n";
781	0					0	seek CS,0,0; #go back to beginning of file
782	0					0	seek CT,0,0;
783	0					0	while (my $s=) {
784	0					0	chomp $s;
785	0					0	my $t=;
786	0					0	chomp $t;
787	0	0				0	if (exists($found{"$s \|\|\| $t"})){
788	0					0	my $al= $found{"$s \|\|\| $t"};
789	0					0	$al->output($FH,$newFormat,$newFH,$newLocation,$internalSentPairNum);
790							}else{
791	0					0	die "ERROR: not found sentence pair $internalSentPairNum in Alignment Set\n";
792							}
793	0					0	$internalSentPairNum++;
794							}
795	0					0	closeFiles($newFH,$newFormat);
796	0					0	closeFiles($FH,$alSet->{format});
797	0					0	$alSet->updateObject($newFormat,$newLocation,$internalSentPairNum-1);
798							}
799
800							sub adaptToBilCorpus {
801	0			0	0	0	my ($alSet,$newLocation,$newFormat,$alignMode,$corpSrc,$corpTrg,$restrictions,$verbose)=@_;
802	0					0	my $pdiff=$restrictions->{allowedPercentWordDiff};
803	0					0	my $mindiff=$restrictions->{minAllowedNumWordDiff};
804	0					0	my $maxdiff=$restrictions->{maxAllowedNumWordDiff};
805	0					0	my $nfirst=$restrictions->{numWordsConsideredFirst};
806	0					0	my $dumper = new Dumpvalue;
807
808	0	0				0	if (!defined($newFormat)){$newFormat="TALP"}
	0					0
	0					0
809							else {$newFormat = uc $newFormat}
810	0					0	$newLocation = readLocation($newLocation);
811	0					0	my $FH = $alSet->openFiles();
812	0					0	my $newFH = openLocation($newLocation,$newFormat,">",$alSet->{location});
813	0					0	my $inputSentPairNum=$alSet->{firstSentPair};
814	0					0	my $internalSentPairNum = 1;
815	0					0	select STDOUT; $\| = 1; # enable autoflush (desactivate buffering)
	0					0
816	0	0				0	open(CS,"<$corpSrc") \|\| die "$corpSrc file opening error !";
817	0	0				0	open(CT,"<$corpTrg") \|\| die "$corpTrg file opening error !";
818
819							# DETECT ALSET SENTENCES THAT ARE IN THE CORPUS
820	0					0	my %newcorp;
821	0					0	while (my $s=) {
822	0					0	chomp $s;
823	0					0	my $t=;
824	0					0	chomp $t;
825	0					0	$newcorp{"$s \|\|\| $t"}=1;
826							}
827	0	0				0	if ($verbose > 0){
828	0					0	print "Number of different sentence pairs in new corpus:".scalar(keys %newcorp)."\n";
829							}
830	0					0	my $count=0;
831	0					0	while (my $alSetChunk = $alSet->loadChunk($FH,$inputSentPairNum,$alignMode)){
832							# print $inputSentPairNum."\n";
833	0					0	for (my $i=0;$i<@$alSetChunk;$i++){
834	0					0	my $al = $$alSetChunk[$i];
835	0					0	my $s = $al->sourceSentence;
836	0					0	my $t = $al->targetSentence;
837	0					0	my @ws=split / /,$s;
838	0					0	my @wt=split / /,$t;
839	0					0	my ($nums,$numt)=(scalar(@ws),scalar(@wt));
840	0	0				0	if (!$newcorp{"$s \|\|\| $t"}){
841							# DETECT CLOSEST SENTENCES IN NEW CORPUS AND MODIFY ALIGNMENT SET
842							# calculate values for length test (to, later, skip lcs calculation)
843	0					0	my $sAllowedDiff = Lingua::AlSetLib::max(Lingua::AlSetLib::min($nums*$pdiff/100,$maxdiff),$mindiff);
844	0					0	my $tAllowedDiff = Lingua::AlSetLib::max(Lingua::AlSetLib::min($numt*$pdiff/100,$maxdiff),$mindiff);
845	0					0	my $sMin=$nums-$sAllowedDiff;
846	0					0	my $sMax=$nums+$sAllowedDiff;
847	0					0	my $tMin=$numt-$tAllowedDiff;
848	0					0	my $tMax=$numt+$tAllowedDiff;
849	0	0				0	if ($verbose > 0){
850	0					0	print $s."\n";
851	0					0	print $t."\n";
852	0					0	print "ns:$nums nt:$numt allowed length diff: s:$sAllowedDiff t:$tAllowedDiff;\n\n";
853							}
854	0					0	my $bestLcs = 0;
855	0					0	my @bestSrc;
856							my @bestTrg;
857	0					0	my ($cntGoodLength,$cntPassedFirstLCS,$cnt2)=(0,0,0);
858							# parse new corpus
859	0					0	foreach my $pair (keys %newcorp){
860	0					0	my ($cs,$ct)=split / \\|\\|\\| /,$pair;
861	0					0	my @wcs = split / /,$cs;
862	0					0	my @wct = split / /,$ct;
863	0					0	my ($numcs,$numct)=(scalar(@wcs),scalar(@wct));
864
865	0	0				0	if ($verbose >2){print "ncs:$numcs nct:$numct\n";}
	0					0
866	0	0				0	if ($verbose >2){print "CORPUS:\n$cs\n$ct\n";}
	0					0
867							#length test:
868	0	0	0			0	if ($numcs<$sMin \|\| $numcs>$sMax \|\| $numct<$tMin \|\| $numct>$tMax){
			0
			0
869							# skip lcs ratio calculation
870							}else{
871	0					0	$cntGoodLength++;
872							#calculate LCS, but first looking at the nfirst first words
873	0					0	my @fws;
874							my @fwt;
875	0					0	my @fwcs;
876	0					0	my @fwct;
877	0					0	for (my $i=0;$i<$nfirst;$i++){
878	0					0	push @fws,$ws[$i];
879	0					0	push @fwt,$wt[$i];
880	0					0	push @fwcs,$wcs[$i];
881	0					0	push @fwct,$wct[$i];
882							}
883	0					0	my $sLcs = Lingua::AlSetLib::LCS_ratio(\@fws,\@fwcs);
884	0					0	my $tLcs = Lingua::AlSetLib::LCS_ratio(\@fwt,\@fwct);
885							# first words LCS test:
886	0	0	0			0	if ($sLcs == 0 \|\| $tLcs == 0){
887							# skip lcs ratio calculation
888							}else{
889	0					0	$cntPassedFirstLCS++;
890	0					0	my $sLcs = Lingua::AlSetLib::LCS_ratio(\@ws,\@wcs);
891	0					0	my $tLcs = Lingua::AlSetLib::LCS_ratio(\@wt,\@wct);
892	0	0				0	if ($verbose >1){
893	0					0	print "ncs:$numcs nct:$numct\n";
894	0					0	print "CORPUS:\n$cs\n$ct\n";
895	0					0	print "chars src lcsr: $sLcs\t trg lcsr:$tLcs\n";
896							}
897	0					0	my $lcs = $sLcs+$tLcs;
898	0	0				0	if ($lcs > $bestLcs){
		0
899	0					0	$bestLcs = $lcs;
900	0					0	@bestSrc = ($cs);
901	0					0	@bestTrg = ($ct);
902							}elsif($lcs == $bestLcs){
903	0					0	push @bestSrc,$cs;
904	0					0	push @bestTrg,$ct;
905							}
906							}
907							} # if length test
908	0	0				0	if ($verbose >1){
909	0	0	0			0	if ($cnt2>0 && ($cnt2 % 100000)==0){print "$cnt2";}
	0	0	0			0
	0					0
910							elsif ($cnt2>0 && ($cnt2 % 1000)==0){print ".";}
911							}
912	0					0	$cnt2++;
913							} # for each sent pair in corpus
914	0					0	my $uniqBestSrc;
915							my $uniqBestTrg;
916	0					0	my $cntCharLevel=scalar(@bestSrc);
917	0	0				0	if ($cntCharLevel>1){
918	0					0	my $bestLcs = 0;
919							# calculate LCS at character level
920	0					0	for (my $i=0;$i<$cntCharLevel;$i++){
921	0					0	my $cs=$bestSrc[$i];
922	0					0	my $ct=$bestTrg[$i];
923	0					0	$cntCharLevel++;
924	0					0	my @chars = split //,$s;
925	0					0	my @charcs = split //,$cs;
926	0					0	my $sLcs = Lingua::AlSetLib::LCS_ratio(\@chars,\@charcs);
927	0					0	my @chart = split //,$t;
928	0					0	my @charct = split //,$ct;
929	0					0	my $tLcs = Lingua::AlSetLib::LCS_ratio(\@chart,\@charct);
930	0					0	my $lcs = $sLcs+$tLcs;
931	0	0				0	if ($lcs > $bestLcs){
932	0					0	$bestLcs = $lcs;
933	0					0	$uniqBestSrc = $cs;
934	0					0	$uniqBestTrg = $ct;
935							}
936							}
937							}else{
938	0					0	$uniqBestSrc = $bestSrc[0];
939	0					0	$uniqBestTrg = $bestTrg[0];
940							}
941	0	0				0	if ($cntGoodLength == 0){
942	0					0	print "WARNING: sentence pair $inputSentPairNum not found in corpus\n";
943							}else{
944	0	0				0	if ($verbose >0){
945	0					0	my $numCorp=scalar(keys %newcorp);
946	0					0	print "Passed length test: $cntGoodLength / $numCorp\n";
947	0					0	print "Passed first words LCS test: $cntPassedFirstLCS\n";
948	0	0				0	print "LCS calculated at character level: "; if ($cntCharLevel==1){print "0"}else{print $cntCharLevel}; print "\n";
	0					0
	0					0
	0					0
	0					0
949	0					0	print "\nbest lcsr: $bestLcs, best pair:\n$uniqBestSrc\n$uniqBestTrg\n";
950							}
951							# detect edits to pass from alset sent pair to corpus sent pair
952	0					0	my @bs=split / /,$uniqBestSrc;
953	0					0	my @diffs = Lingua::AlSetLib::diff( \@ws, \@bs );
954	0	0				0	if ($verbose>2){print $dumper->dumpValue(\@diffs);}
	0					0
955
956							# parse output of diff function
957	0					0	my @updatedPosi; #array: orig posis -> updated posis
958							my %reversePosi; #hash: updated posis -> orig posis
959	0					0	for (my $i=0;$i<=$nums;$i++){
960	0					0	$updatedPosi[$i]=$i;
961	0					0	$reversePosi{$i}=$i;
962							}
963
964	0					0	foreach my $hunk (@diffs){
965	0					0	my @delPosi;
966							my @del;
967	0					0	my @addPosi;
968	0					0	my @add;
969	0					0	foreach my $change (@$hunk) {
970	0	0				0	if ($change->[0] eq '-'){
971	0					0	push @delPosi,$change->[1]+1;
972	0					0	push @del,$change->[2];
973							}else{
974	0					0	push @addPosi,$change->[1]+1;
975	0					0	push @add,$change->[2];
976							}
977							}
978							# del posis are relative to first array (@ws) => update posis
979							# add posis are relative to second array (@bs) => don't update posis
980	0					0	my $numDel=scalar(@delPosi);
981	0					0	my $numAdd=scalar(@addPosi);
982	0	0				0	if ($numDel==0){ #insertion
983	0					0	$al->splice("source",$addPosi[0],0,\@add);
984	0					0	print "insert '",join(" ",@add),"' at position { ",$addPosi[0]," }\n";
985							#update updatedPosi array
986	0					0	for (my $i=$reversePosi{"$addPosi[0]"};$i<=$nums;$i++){
987	0					0	$updatedPosi[$i]+=$numAdd;
988	0					0	$reversePosi{"$updatedPosi[$i]"}=$i;
989							}
990							}else{ # substitution or deletion
991	0					0	$al->splice("source",$updatedPosi[$delPosi[0]],$numDel,\@add);
992	0					0	print "substitute '",join(" ",@del),"' at positions { ",join(" ",@delPosi)," } by '",join(" ",@add),"'\n";
993							#update updatedPosi array
994	0					0	for (my $i=$delPosi[0]+$numDel;$i<=$nums;$i++){
995	0					0	$updatedPosi[$i]+=$numAdd-$numDel;
996	0					0	$reversePosi{"$updatedPosi[$i]"}=$i;
997							}
998							}
999							}
1000							#target
1001	0					0	my @bt=split / /,$uniqBestTrg;
1002	0					0	my @diffs = Lingua::AlSetLib::diff( \@wt, \@bt );
1003	0	0				0	if ($verbose>2){print $dumper->dumpValue(\@diffs);}
	0					0
1004							# parse output of diff function
1005	0					0	@updatedPosi=();
1006	0					0	%reversePosi=();
1007	0					0	for (my $i=0;$i<=$numt;$i++){
1008	0					0	$updatedPosi[$i]=$i;
1009	0					0	$reversePosi{$i}=$i;
1010							}
1011	0					0	foreach my $hunk (@diffs){
1012	0					0	my @delPosi;
1013							my @del;
1014	0					0	my @add;
1015	0					0	my @addPosi;
1016	0					0	foreach my $change (@$hunk) {
1017	0	0				0	if ($change->[0] eq '-'){
1018	0					0	push @delPosi,$change->[1]+1;
1019	0					0	push @del,$change->[2];
1020							}else{
1021	0					0	push @addPosi,$change->[1]+1;
1022	0					0	push @add,$change->[2];
1023							}
1024							}
1025							#update updatedPosi array
1026	0					0	my $numDel=scalar(@delPosi);
1027	0					0	my $numAdd=scalar(@addPosi);
1028	0	0				0	if ($numDel==0){ #insertion
1029	0					0	$al->splice("target",$addPosi[0],0,\@add);
1030	0					0	print "insert '",join(" ",@add),"' at position { ",$addPosi[0]," }\n";
1031							#update updatedPosi array
1032	0					0	for (my $i=$reversePosi{"$addPosi[0]"};$i<=$numt;$i++){
1033	0					0	$updatedPosi[$i]+=$numAdd;
1034	0					0	$reversePosi{"$updatedPosi[$i]"}=$i;
1035							}
1036							}else{ # substitution or deletion
1037	0					0	$al->splice("target",$updatedPosi[$delPosi[0]],$numDel,\@add);
1038	0					0	print "substitute '",join(" ",@del),"' at positions { ",join(" ",@delPosi)," } by '",join(" ",@add),"'\n";
1039							#update updatedPosi array
1040	0					0	for (my $i=$delPosi[0]+$numDel;$i<=$numt;$i++){
1041	0					0	$updatedPosi[$i]+=$numAdd-$numDel;
1042	0					0	$reversePosi{"$updatedPosi[$i]"}=$i;
1043							}
1044							}
1045							}
1046	0	0				0	if ($verbose>0){print "--------------------------------------------------------------\n";}
	0					0
1047							}
1048							}else{
1049							# sentence pair is in corpus: don't modify anything
1050							}
1051	0					0	$al->output($FH,$newFormat,$newFH,$newLocation,$internalSentPairNum);
1052	0					0	$count++;
1053							}
1054	0					0	$inputSentPairNum++;
1055	0					0	$internalSentPairNum++;
1056							} #while
1057	0					0	print "$count sentence pairs parsed in alignment set\n";
1058	0					0	closeFiles($newFH,$newFormat);
1059	0					0	closeFiles($FH,$alSet->{format});
1060	0					0	$alSet->updateObject($newFormat,$newLocation,$internalSentPairNum-1);
1061							}
1062
1063							######################################################################
1064							### PRIVATE SUBS
1065							######################################################################
1066
1067							sub readLocation{
1068	2			2	0	4	my $location = shift;
1069
1070	2	100				6	if (!ref($location)){ #if it is a path, put it in a location hash
1071	1					4	$location = {"sourceToTarget"=>$location}
1072							}
1073	2					5	return $location;
1074							}
1075
1076							sub setRange {
1077	2			2	0	4	my ($alSet,$range) = @_;
1078
1079	2					7	my @limits = split /-/, $range;
1080	2					3	my $numLimits = scalar(@limits);
1081	2	50	33			16	if ($numLimits == 0 \|\| $numLimits >2){
		100
1082	0					0	die "Invalid Range:$range\n";
1083							}elsif ($numLimits == 1){
1084	1					3	$limits[1]="";
1085							}
1086	2					8	$limits[0] =~ s/^\s+\|\s+$//g;
1087	2					6	$limits[1] =~ s/^\s+\|\s+$//g;
1088	2	50	33			20	if ($limits[0] !~ /\d+/ \|\| $limits[0] == 0){
1089	0					0	$alSet->{firstSentPair}="1";
1090							}else{
1091	2					6	$alSet->{firstSentPair}=$limits[0];
1092							}
1093	2	100	66			11	if ($limits[1] !~ /\d+/ \|\| $limits[1] == 0){
1094	1					3	$alSet->{lastSentPair}="eof";
1095							}else{
1096	1					5	$alSet->{lastSentPair}=$limits[1];
1097							}
1098							}
1099
1100							#for future ease we save detailed infos contained in the source sample path
1101							#input: sourceToTarget dir (not optional), targetToSource dir (if exists), source path (optional) and target path (if necessary)
1102							#output: target (if not specified in input), sampleNum (sample number)
1103							sub completeBlinkerLocation{
1104	1			1	0	2	my $refToLocation = shift;
1105	1					4	my ($sourceLang,$targetLang);
1106
1107	1	50				5	if ($refToLocation->{source}){
1108	0					0	my ($sourceDir,$sourceFileName)=split /\/([^\/]+)$/,$refToLocation->{source};
1109	0	0				0	if ($sourceFileName =~ /^(EN\|FR)\.sample.\d+$/){
1110							#extract the sample number and target file:
1111	0					0	my ($sourceLang,$nothing,$sampleNum) = split /\./,$sourceFileName;
1112	0					0	$refToLocation->{sampleNum} = $sampleNum;
1113							}
1114							}
1115	1	50				4	if (!$refToLocation->{sampleNum}){
1116	1					4	$refToLocation->{sampleNum} = 1;
1117							}
1118							}
1119
1120							# open (for read or write) the files contained in a "location" hash (ex. at the {location} key of the alignment set hash)
1121							# if opens for write needs old location hash to check you won't delete the old format files
1122							# returns a ref to a hash containing the filehandle variables (hash with same keys as "location" except for Blinker format)
1123							sub openLocation {
1124	2			2	0	4	my ($location,$format,$openMode,$oldLocation) = @_; #oldLocation: optional parameter
1125	2					4	my %FH;
1126
1127	2	50				6	if ($openMode eq ">"){
1128	0	0				0	if ($format eq "BLINKER"){
1129	0					0	completeBlinkerLocation($location);
1130							}
1131							# check that your new files are different to prevent from deleting the old ones
1132	0					0	my %oldFiles = reverse %$oldLocation;
1133	0					0	my ($key,$newFile);
1134	0					0	while (($key, $newFile)=each %$location){
1135	0	0	0			0	if ($oldFiles{$newFile} && $key ne "sampleNum"){
1136	0					0	die "Convert function: you are opening for write one of the old format file: $newFile\n";
1137							}
1138							}
1139							#end of check
1140
1141							# create directory structure where to create the file/directory if it doesn't exist, create it
1142	0					0	my $type;
1143	0	0				0	if ($format eq "BLINKER"){$type = "dir"}
	0					0
	0					0
1144							else {$type = "file"}
1145	0					0	createDirStructure($location->{sourceToTarget},$type);
1146	0	0				0	if ($location->{targetToSource}){
1147	0					0	createDirStructure($location->{targetToSource},$type);
1148							}
1149	0	0				0	if ($location->{source}){
1150	0					0	createDirStructure($location->{source},"file");
1151							}
1152	0	0				0	if ($location->{target}){
1153	0					0	createDirStructure($location->{target},"file");
1154							}
1155							#end create directory structure
1156							}
1157
1158	2	50	66			24	if ($format eq "GIZA"){
		100
		50
1159	0	0				0	$FH{sourceToTarget} = IO::File->new($openMode.$location->{sourceToTarget}) or die "GIZA file (".$location->{sourceToTarget}.") opening error:$!";
1160	0	0				0	if ($location->{targetToSource}){
1161	0	0				0	$FH{targetToSource} = IO::File->new($openMode.$location->{targetToSource}) or die "GIZA file (".$location->{targetToSource}.") opening error:$!";
1162							}
1163							} elsif ($format eq "NAACL" \|\| $format eq "TALP"){
1164	1	50				4	if ($location->{source}){
1165	1	50				13	$FH{source} = IO::File->new($openMode.$location->{source}) or die "Source file (".$location->{source}.") opening error:$!";
1166							}
1167	1	50				109	if ($location->{target}){
1168	1	50				6	$FH{target} = IO::File->new($openMode.$location->{target}) or die "Target file (".$location->{target}.") opening error:$!";
1169							}
1170	1	50				52	$FH{sourceToTarget} = IO::File->new($openMode.$location->{sourceToTarget}) or die "Alignment file (".$location->{sourceToTarget}.") opening error:$!";
1171	1	50				56	if ($location->{targetToSource}){
1172	0	0				0	$FH{targetToSource} = IO::File->new($openMode.$location->{targetToSource}) or die "Alignment file (".$location->{targetToSource}.") opening error:$!";
1173							}
1174							} elsif ($format eq "BLINKER"){
1175	1	50				3	if ($location->{source}){
1176	0	0				0	$FH{source} = IO::File->new($openMode.$location->{source}) or die "BLINKER source file (".$location->{source}.") opening error:$!";
1177							}
1178	1	50				6	if ($location->{target}){
1179	0	0				0	$FH{target} = IO::File->new($openMode.$location->{target}) or die "BLINKER source file (".$location->{target}.") opening error:$!";
1180							}
1181							}
1182	2					10	return (\%FH);
1183							}
1184
1185
1186							# if you want to create a file of path "directory_structure/file", makes "directory_structure" if necessary.
1187							# if you want to create a directory of path "directory_structure", makes it if it doesn't exist
1188							# type is "dir" (if you want to create a directory) or "file" (a file)
1189							sub createDirStructure {
1190	0			0	0	0	my ($path,$type)=@_;
1191
1192	0	0				0	if ($type eq "dir"){
		0
1193	0	0	0			0	unless(-e $path && -d _){
1194	0					0	system('mkdir -p '.$path);
1195							}
1196							}elsif ($type eq "file"){
1197	0					0	$path =~ s/\/$//;
1198	0	0				0	if ($path =~ /\//){
1199	0					0	my ($dir,$file)=split /\/[^\/]+$/,$path;
1200	0	0				0	unless (-e $dir){
1201	0					0	system('mkdir -p '.$dir);
1202							}
1203							}
1204							}
1205							}
1206
1207							# open files of an alignment set for read and go to first sentence pair
1208							sub openFiles {
1209	2			2	0	3	my $alSet = shift;
1210	2					4	my %FH = %{openLocation($alSet->{location},$alSet->{format},"<")};
	2					9
1211	2					5	my $fhPos;
1212							my $lineNb;
1213
1214							# go to first Sentence pair:
1215	2	50				18	if ($alSet->{format} eq "TALP"){
		50
		100
		50
1216	0					0	for ($lineNb=$alSet->{firstSentPair}-1;$lineNb>0;$lineNb--){ #go to first Sentence pair
1217	0	0				0	if ($FH{source}){
1218	0					0	$FH{source}->getline();
1219							}
1220	0	0				0	if ($FH{target}){
1221	0					0	$FH{target}->getline();
1222							}
1223	0	0				0	if ($FH{sourceToTarget}){
1224	0					0	$FH{sourceToTarget}->getline();
1225							}
1226	0	0				0	if ($FH{targetToSource}){
1227	0					0	$FH{targetToSource}->getline();
1228							}
1229							}
1230							}elsif ($alSet->{format} eq "GIZA"){
1231	0					0	for ($lineNb=$alSet->{firstSentPair}-1;$lineNb>0;$lineNb--){ #go to first Sentence pair
1232	0					0	$FH{sourceToTarget}->getline();
1233	0					0	$FH{sourceToTarget}->getline();
1234	0					0	$FH{sourceToTarget}->getline();
1235	0	0				0	if ($FH{targetToSource}){
1236	0					0	$FH{targetToSource}->getline();
1237	0					0	$FH{targetToSource}->getline();
1238	0					0	$FH{targetToSource}->getline();
1239							}
1240							}
1241							} elsif ($alSet->{format} eq "NAACL"){
1242	1					9	for ($lineNb=$alSet->{firstSentPair}-1;$lineNb>0;$lineNb--){
1243	0	0				0	if ($FH{source}){
1244	0					0	$FH{source}->getline();
1245							}
1246	0	0				0	if ($FH{target}){
1247	0					0	$FH{target}->getline();
1248							}
1249	0					0	$fhPos = $FH{sourceToTarget}->getpos;
1250	0		0			0	while ($FH{sourceToTarget}->getline() !~ m/^0$alSet->{firstSentPair} ./ && !$FH{sourceToTarget}->eof()) {
1251	0					0	$fhPos = $FH{sourceToTarget}->getpos;
1252							}
1253	0	0				0	if ($FH{sourceToTarget}->eof()){
1254	0					0	die "First sentence pair of range (number ".$alSet->{firstSentPair}.") not found in ".$alSet->{location}{sourceToTarget};
1255							}
1256	0					0	$FH{sourceToTarget}->setpos($fhPos); #if we changed sentences, we read the first line of next sentence=>go back one line
1257	0	0				0	if ($FH{targetToSource}){
1258	0					0	$fhPos = $FH{targetToSource}->getpos;
1259	0		0			0	while ($FH{targetToSource}->getline() !~ m/^0$alSet->{firstSentPair} ./ && !$FH{targetToSource}->eof()) {
1260	0					0	$fhPos = $FH{targetToSource}->getpos;
1261							}
1262	0	0				0	if ($FH{targetToSource}->eof()){
1263	0					0	die "First sentence pair of range (number ".$alSet->{firstSentPair}.") not found in ".$alSet->{location}{targetToSource};
1264							}
1265	0					0	$FH{targetToSource}->setpos($fhPos);
1266							}
1267							}
1268							} elsif ($alSet->{format} eq "BLINKER"){
1269	1	50				4	if ($FH{source}){
1270	0					0	for ($lineNb=$alSet->{firstSentPair}-1;$lineNb>0;$lineNb--){
1271	0					0	$FH{source}->getline();
1272							}
1273							}
1274	1	50				3	if ($FH{target}){
1275	0					0	for ($lineNb=$alSet->{firstSentPair}-1;$lineNb>0;$lineNb--){
1276	0					0	$FH{target}->getline();
1277							}
1278							}
1279							}
1280	2					5	return (\%FH);
1281							}
1282
1283							# close the files contained in the hash at the {location} key of the alignment set hash
1284							sub closeFiles {
1285	2			2	0	5	my ($FH,$format) = @_;
1286
1287	2	50	66			20	if ($format eq "GIZA"){
		100
		50
1288	0					0	$FH->{sourceToTarget}->close();
1289	0	0				0	if ($$FH{targetToSource}){
1290	0					0	$FH->{targetToSource}->close();
1291							}
1292							} elsif ($format eq "NAACL" \|\| $format eq "TALP"){
1293	1	50				6	if ($FH->{source}){
1294	1					13	$FH->{source}->close();
1295							}
1296	1	50				24	if ($FH->{target}){
1297	1					5	$FH->{target}->close();
1298							}
1299	1	50				20	if ($FH->{sourceToTarget}){
1300	1					4	$FH->{sourceToTarget}->close();
1301							}
1302	1	50				17	if ($FH->{targetToSource}){
1303	0					0	$FH->{targetToSource}->close();
1304							}
1305							} elsif ($format eq "BLINKER"){
1306	1	50				3	if ($FH->{source}){
1307	0					0	$FH->{source}->close();
1308							}
1309	1	50				3	if ($FH->{target}){
1310	0					0	$FH->{target}->close();
1311							}
1312							}
1313							}
1314
1315							# convert a chunk of alignment set file to an array of references to simple (1 sentence) alignment objects
1316							# returns 0 if the file is at eof
1317							sub loadChunk {
1318	22			22	0	38	my ($alSet,$alFH,$sentPairNum,$alignMode) = @_;
1319	22					25	my ($sourceString,$targetString,$alString,$reverseAlString);
1320	22					27	my $st_alignments=[];
1321	22					32	my $ts_alignments=[];
1322	22					28	my $al;
1323							my $theEnd;
1324
1325	22	50	33			207	if (!defined($alignMode) \|\| $alignMode =~ /^as.?is$/i){
		50
		50
1326	0					0	$alignMode = "as-is";
1327							}elsif ($alignMode =~ /^null.?align$/i){
1328	0					0	$alignMode = "null-align";
1329							}elsif ($alignMode =~ /^no.?null.?align$/i){
1330	22					35	$alignMode = "no-null-align";
1331							}else{
1332	0					0	die 'Incorrect alignment mode. Correct modes are "as-is","null-align" or "no-null-align".'."\n";
1333							}
1334	22	50				97	if ($alSet->{format} eq "TALP"){
		50
		100
		50
1335	0	0				0	if ($alSet->{lastSentPair} eq "eof"){
1336	0					0	$theEnd = $$alFH{sourceToTarget}->eof();
1337							}else{
1338	0		0			0	$theEnd = ($$alFH{sourceToTarget}->eof() \|\| $sentPairNum > $alSet->{lastSentPair});
1339							}
1340	0	0				0	if ($theEnd){
1341	0					0	return 0;
1342							}else{
1343	0	0				0	if ($alFH->{source}){
1344	0					0	$sourceString = $alFH->{source}->getline();
1345							}
1346	0	0				0	if ($alFH->{target}){
1347	0					0	$targetString = $alFH->{target}->getline();
1348							}
1349	0	0				0	if ($alFH->{sourceToTarget}){
1350	0					0	$alString = $alFH->{sourceToTarget}->getline();
1351							}
1352	0	0				0	if ($alFH->{targetToSource}){
1353	0					0	$reverseAlString = $alFH->{targetToSource}->getline();
1354							}
1355	0					0	$al = Lingua::Alignment->new;
1356	0					0	$al->loadFromTalp($alString,$reverseAlString,$sourceString,$targetString);
1357							}
1358							}elsif ($alSet->{format} eq "GIZA"){
1359	0	0				0	if ($alSet->{lastSentPair} eq "eof"){
1360	0					0	$theEnd = $$alFH{sourceToTarget}->eof();
1361							}else{
1362	0		0			0	$theEnd = ($$alFH{sourceToTarget}->eof() \|\| $sentPairNum > $alSet->{lastSentPair});
1363							}
1364	0	0				0	if ($theEnd){
1365	0					0	return 0;
1366							}else{
1367	0					0	$$alFH{sourceToTarget}->getline();
1368	0					0	$targetString = $$alFH{sourceToTarget}->getline();
1369	0					0	$alString = $$alFH{sourceToTarget}->getline();
1370	0	0				0	if ($$alFH{targetToSource}){
1371	0					0	$$alFH{targetToSource}->getline();
1372	0					0	$$alFH{targetToSource}->getline();
1373	0					0	$reverseAlString = $$alFH{targetToSource}->getline();
1374							}
1375	0					0	$al = Lingua::Alignment->new;
1376	0					0	$al->loadFromGiza($alString,$targetString,$reverseAlString);
1377							}
1378							} elsif ($alSet->{format} eq "NAACL"){
1379	11					11	my $fhPos;
1380	11	50				21	if ($alSet->{lastSentPair} eq "eof"){
1381	0					0	$theEnd = $$alFH{sourceToTarget}->eof();
1382							}else{
1383	11		66			39	$theEnd = ($$alFH{sourceToTarget}->eof() \|\| $sentPairNum > $alSet->{lastSentPair});
1384							}
1385	11	100				174	if ($theEnd){
1386	1					7	return 0;
1387							}else{
1388	10	50				25	if ($$alFH{source}){
1389	10					281	$sourceString = $$alFH{source}->getline();
1390							#strip tags and memorize snum:
1391	10					332	$sourceString =~ s/~~(.*)<\/s>/$1/;~~
1392							}
1393	10	50				29	if ($$alFH{target}){
1394	10					218	$targetString = $$alFH{target}->getline();
1395							#strip tags and memorize snum:
1396	10					308	$targetString =~ s/~~(.*)<\/s>/$2/;~~
1397							}
1398	10					43	$fhPos = $$alFH{sourceToTarget}->getpos;
1399	10					211	$alString = $$alFH{sourceToTarget}->getline();
1400	10					264	my ($num,$theRest)=split " ",$alString,2;
1401	10	50				27	if ($num==$sentPairNum){ #skip if there is no link for this sentence pair
1402	10					30	$fhPos = $$alFH{sourceToTarget}->getpos;
1403	10					20	push @$st_alignments,$theRest;
1404	10					211	while ($$alFH{sourceToTarget}->getline() =~ m/^$sentPairNum (.*)$/) {
1405	58					1919	push @$st_alignments,$1;
1406	58					1408	$fhPos = $$alFH{sourceToTarget}->getpos;
1407							}
1408							}
1409	10					384	$$alFH{sourceToTarget}->setpos($fhPos); #if we changed sentences, we read the first line of next sentence=>go back one line
1410	10	50				32	if ($$alFH{targetToSource}){
1411	0					0	$fhPos = $$alFH{targetToSource}->getpos;
1412	0					0	$alString = $$alFH{targetToSource}->getline();
1413	0					0	my ($num,$theRest)=split " ",$alString,2;
1414	0	0				0	if ($num==$sentPairNum){ #skip if there is no link for this sentence pair
1415	0					0	$fhPos = $$alFH{targetToSource}->getpos;
1416	0					0	push @$ts_alignments,$theRest;
1417	0					0	while ($$alFH{targetToSource}->getline() =~ m/^$sentPairNum (.*)$/) {
1418	0					0	push @$ts_alignments,$1;
1419	0					0	$fhPos = $$alFH{targetToSource}->getpos;
1420							}
1421							}
1422	0					0	$$alFH{targetToSource}->setpos($fhPos); #if we changed sentences, we read the first line of next sentence=>go back one line
1423							}
1424	10					36	$al = Lingua::Alignment->new;
1425	10					30	$al->loadFromBlinker($st_alignments,$ts_alignments,$sourceString,$targetString);
1426							}
1427							} elsif ($alSet->{format} eq "BLINKER"){
1428	11	50				21	if ($alSet->{lastSentPair} eq "eof"){
1429	11					485	$theEnd = !(-e $alSet->{location}->{sourceToTarget}."/samp".$alSet->{location}->{sampleNum}.".SentPair".($sentPairNum-1));
1430							}else{
1431	0		0			0	$theEnd = !(-e $alSet->{location}->{sourceToTarget}."/samp".$alSet->{location}->{sampleNum}.".SentPair".($sentPairNum-1)) \|\| $sentPairNum > $alSet->{lastSentPair};
1432							}
1433	11	100				22	if ($theEnd){
1434	1					4	return 0;
1435							}else{
1436	10	50				24	if ($alFH->{source}){
1437	0					0	$sourceString = $alFH->{source}->getline();
1438							}
1439	10	50				21	if ($alFH->{target}){
1440	0					0	$targetString = $alFH->{target}->getline();
1441							}
1442	10					346	open(AL,"< ".$alSet->{location}->{sourceToTarget}."/samp".$alSet->{location}->{sampleNum}.".SentPair".($sentPairNum-1));
1443	10					241	@$st_alignments = ;
1444	10					103	close(AL);
1445	10	50				31	if ($alSet->{location}->{targetToSource}){
1446	0					0	open(AL,"< ".$alSet->{location}->{targetToSource}."/samp".$alSet->{location}->{sampleNum}.".SentPair".($sentPairNum-1));
1447	0					0	@$ts_alignments = ;
1448	0					0	close(AL);
1449							}
1450	10					44	$al = Lingua::Alignment->new;
1451	10					35	$al->loadFromBlinker($st_alignments,$ts_alignments,$sourceString,$targetString);
1452							}
1453							}
1454	20	50				68	if ($alignMode eq "null-align"){
		50
1455	0					0	$al->forceNullAlign();
1456							}elsif ($alignMode eq "no-null-align"){
1457	20					55	$al->forceNoNullAlign();
1458							}
1459	20					110	return [$al];
1460							}
1461
1462
1463							sub updateObject {
1464	0			0	0	0	my ($alSet,$newFormat,$newLocation,$lastSentPairNum)=@_;
1465	0					0	$alSet->{location}->{sourceToTarget}=$newLocation->{sourceToTarget};
1466	0					0	$alSet->{location}->{targetToSource}=$newLocation->{targetToSource};
1467	0	0				0	if ($newLocation->{source}){
1468	0					0	$alSet->{location}->{source}=$newLocation->{source};
1469							}else{
1470	0	0	0			0	if ($alSet->{firstSentPair} != 1 \|\| $alSet->{format} ne $newFormat){
1471							# in this case the numeration of the converted alignment file and that of the (not converted) source file will not correspond
1472	0					0	delete($alSet->{location}->{source});
1473							# warn "After converting into ",$newLocation->{sourceToTarget},", the numeration of the source words file",
1474							# " didn't correspond any more to that of the alignment file. So the 'source' entry has been removed from the location hash.";
1475							}
1476							}
1477	0	0				0	if ($newLocation->{target}){
1478	0					0	$alSet->{location}->{target}=$newLocation->{target};
1479							}else{
1480	0	0	0			0	if ($alSet->{firstSentPair} != 1 \|\| $alSet->{format} ne $newFormat){
1481							# in this case the numeration of the converted alignment file and that of the (not converted) source file will not correspond
1482	0					0	delete($alSet->{location}->{target});
1483							# warn "After converting into ",$newLocation->{sourceToTarget},", the numeration of the target words file ",
1484							# "didn't correspond any more to that of the alignment file. So the 'target' entry has been removed from the location hash.";
1485							}
1486							}
1487	0					0	$alSet->{format}=$newFormat;
1488	0	0				0	if ($newFormat eq "BLINKER"){
		0
1489	0					0	$alSet->{location}->{sampleNum}=$newLocation->{sampleNum};
1490							}elsif(exists($alSet->{location}->{sampleNum})){
1491	0					0	delete($alSet->{location}->{sampleNum});
1492							}
1493	0					0	$alSet->{firstSentPair}=1;
1494	0					0	$alSet->{lastSentPair}=$lastSentPairNum;
1495							}
1496
1497							# returns the alignment set, with a unique new file set that has the required location,format and range values.
1498							# TO DO: conversion to Giza++ format
1499							sub convert {
1500	0			0	0	0	my ($alSet,$newLocation,$newFormat,$alignMode,$AlignmentSub)=@_;
1501	0	0				0	if (!defined($newFormat)){$newFormat="TALP"}
	0					0
	0					0
1502							else {$newFormat = uc $newFormat}
1503	0					0	$newLocation = readLocation($newLocation);
1504	0					0	my $FH = $alSet->openFiles();
1505	0					0	my $newFH = openLocation($newLocation,$newFormat,">",$alSet->{location});
1506	0					0	my ($i,$al,$alSetChunk,$line,$lines);
1507	0					0	my $inputSentPairNum=$alSet->{firstSentPair};
1508	0					0	my $internalSentPairNum = 1;
1509	0					0	while ($alSetChunk = $alSet->loadChunk($FH,$inputSentPairNum,$alignMode)){ # returns 0 if eof or last sentence pair
1510							# print $inputSentPairNum."\n";
1511	0					0	for ($i=0;$i<@$alSetChunk;$i++){
1512	0					0	$al = $$alSetChunk[$i];
1513	0	0				0	if (defined($AlignmentSub)){
1514							#look if $AlignmentSub is a ref to an Array or a subroutine
1515	0	0				0	if (ref($AlignmentSub) eq "ARRAY"){
1516	0					0	my ($sub,@params) = @$AlignmentSub;
1517	0					0	$al->$sub(@params);
1518							}else{
1519	0					0	$al->$AlignmentSub();
1520							}
1521							}
1522	0					0	$al->output($FH,$newFormat,$newFH,$newLocation,$internalSentPairNum);
1523							} #for
1524	0					0	$inputSentPairNum++;
1525	0					0	$internalSentPairNum++;
1526							} #while
1527	0					0	closeFiles($newFH,$newFormat);
1528	0					0	closeFiles($FH,$alSet->{format});
1529	0					0	$alSet->updateObject($newFormat,$newLocation,$internalSentPairNum-1);
1530							}
1531
1532
1533							# returns the alignment set, with only manyToMany links (source and target words file don't change)
1534							# at the moment, only works for sourceToTarget alignment (sourceAl)
1535							sub printManyToMany {
1536	0			0	0	0	my ($alSet,$newLocation,$newFormat)=@_;
1537	0	0				0	if (!defined($newFormat)){$newFormat="TALP"}
	0					0
	0					0
1538							else {$newFormat = uc $newFormat}
1539	0					0	$newLocation = readLocation($newLocation);
1540	0					0	my $dumper=new Dumpvalue;
1541	0					0	my $FH = $alSet->openFiles();
1542	0					0	my $newFH = openLocation($newLocation,$newFormat,">",$alSet->{location});
1543	0					0	my ($i,$al,$alSetChunk,$line,$lines);
1544	0					0	my $inputSentPairNum=$alSet->{firstSentPair};
1545	0					0	my $internalSentPairNum = 1;
1546
1547	0					0	while ($alSetChunk = $alSet->loadChunk($FH,$inputSentPairNum,"no-null-align")){ # returns 0 if eof or last sentence pair
1548	0					0	for ($i=0;$i<@$alSetChunk;$i++){
1549	0					0	$al = $$alSetChunk[$i];
1550	0					0	my $clone = $al->clone;
1551							# look for manyToMany links
1552	0					0	my $clusters = $al->getAlClusters;
1553	0					0	$al->{sourceAl}=[];
1554							# print $dumper->dumpValue($clusters);
1555	0					0	for (my $c=0;$c<@$clusters;$c++){
1556	0	0	0			0	if ( @{$clusters->[$c]{source}}>1 \|\| @{$clusters->[$c]{target}}>1 ){
	0					0
	0					0
1557							# this is a many to many alignment
1558	0					0	foreach my $j (@{$clusters->[$c]{source}}){
	0					0
1559	0					0	foreach my $k (@{$clusters->[$c]{target}}){
	0					0
1560	0	0				0	if ($clone->isIn("sourceAl",$j,$k)){
1561	0					0	push @{$al->{sourceAl}[$j]},$k;
	0					0
1562							}
1563							}
1564							}
1565							}
1566							}
1567	0					0	$al->output($FH,$newFormat,$newFH,$newLocation,$internalSentPairNum);
1568							} #for
1569	0					0	$inputSentPairNum++;
1570	0					0	$internalSentPairNum++;
1571							} #while
1572	0					0	closeFiles($newFH,$newFormat);
1573	0					0	closeFiles($FH,$alSet->{format});
1574	0					0	$alSet->updateObject($newFormat,$newLocation,$internalSentPairNum-1);
1575							}
1576
1577
1578							# identifies a link as sure or possible
1579							# input: a Naacl-file line containing the link and refs to sure and possible hashes
1580							# action: add to the relevant hash a key corresponding to this link
1581							sub identifySurePossible{
1582	149			149	0	241	my ($line,$sure,$possible)=@_;
1583	149					126	my @components;
1584							my $alignment;
1585
1586							# print "line:$line\n";
1587
1588							#code adapted from Rada Mihalcea's wa_eval_align.pl, rada@cs.unt.edu
1589							# get all line components: format should be
1590							# sentence_no position_L1 position_L2 [S\|P] [confidence]
1591	149					376	@components = split /\s+/, $line;
1592
1593	149	50				411	if(scalar(@components) < 3) {
1594	0					0	print STDERR "Incorrect format in answer file\n";
1595	0					0	exit;
1596							}
1597
1598	149					262	$alignment = $components[0]." ".$components[1]." ".$components[2];
1599
1600							# identify the S[ure] alignments
1601	149	0	0			279	if( scalar (@components) == 3 \|\| (scalar (@components) == 4 && ($components[3] =~ /^[\d\.]+$/ \|\| $components[3] eq 'S')) \|\|
			0
			33
			0
			0
			0
1602							(scalar (@components) == 5 && ($components[3] eq 'S' \|\| $components[4] eq 'S'))) {
1603	149					312	$sure->{$alignment} = 1;
1604							}
1605
1606							# identify the P[robable] alignments
1607	149	0	33			812	if( (scalar (@components) == 4 && $components[3] eq 'P') \|\| (scalar (@components) == 5 &&
			0
			33
			33
1608							($components[3] eq 'P' \|\| $components[4] eq 'P'))) {
1609	0						$possible->{$alignment} = 1;
1610							}
1611							}
1612
1613							1;
1614							__END__