package Text::Perfide::PartialAlign;

use 5.006;
use strict;
use warnings;

use Data::Dumper;

=head1 NAME

Text::Perfide::PartialAlign - Split large bitexts into smaller files.

=head1 VERSION

Version 0.01_03

=cut

our $VERSION = '0.01_03';

=head1 SYNOPSIS

A minimal sketch of the intended pipeline (file and language names are illustrative):

    use Text::Perfide::PartialAlign qw/
        get_corpus calc_common_tokens calc_pairs build_chain subcorpora2files
    /;

    my ($huCorpus,$huOffsets,$huTxt) = get_corpus('large.hu');
    my ($enCorpus,$enOffsets,$enTxt) = get_corpus('large.en');
    my ($common,$huPos,$enPos) = calc_common_tokens($huCorpus,$enCorpus,{});
    my $pairs = calc_pairs($common,$huPos,$enPos,$huCorpus,$enCorpus,{});
    my $chain = build_chain($pairs,5000,{});
    subcorpora2files($chain,$huTxt,$enTxt,$huOffsets,$enOffsets,'out','hu','en');

=head1 EXPORT

The following functions can be exported on request: C<get_corpus>, C<usage>,
C<subcorpora2files>, C<calc_pairs>, C<build_chain>, C<calc_common_tokens>,
C<findCommonHap>, C<seg_split> and C<_log>.

=head1 SUBROUTINES/METHODS

=cut

use base 'Exporter';

our @EXPORT_OK = qw/
    get_corpus
    usage
    subcorpora2files
    calc_pairs
    build_chain
    calc_common_tokens
    findCommonHap
    seg_split
    _log
/;

sub _log {
    print STDERR "$_[0]\n";
}

sub _print_verbose {
    my ($file,$data) = @_;
    open my $fh, '>', $file or die "Cannot open $file: $!";
    print $fh Dumper($data);
    close $fh;
}

=head2 build_chain

Receives the sorted list of anchor pairs, computes a maximal chain through
them, optionally reduces it so that consecutive anchors are at most
C<$maximalChunkSize> segments apart (when possible), and merges consecutive
anchors that share a coordinate.
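
A minimal call sketch (the pair list below is made up; real pairs come from
C<calc_pairs>, as C<[pos1, pos2, count]> triples):

    my $pairs = [ [0,0,1], [10,12,1], [40,38,2], [100,103,1] ];
    my $chain = build_chain($pairs, 5000, {});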

=cut

sub build_chain {
    my ($pairs,$maximalChunkSize,$options) = @_;
    _log("Computing maximal chain in poset...");
    _print_verbose("$$.pairs",$pairs) if $options->{v};
    my $chain = maximalChain($pairs);
    _log("Done.");
    _log((scalar @$chain)." long chain found in ".(scalar @$pairs)." sized poset...");

    if($maximalChunkSize > 0) {
        _log("Selecting at most $maximalChunkSize sized chunks...");
        ($chain,my $forced) = selectFromChain($chain,$maximalChunkSize);
        _log(scalar(@$chain)." chunks selected.");
        _log("Done.");
        _log("WARNING: maximalChunkSize could not be obeyed.") if $forced;
    }

    _print_verbose("$$.chain",$chain) if $options->{v};

    # Merge consecutive anchors sharing a coordinate, accumulating their counts
    my @newchain = ([-1,-1,0]);
    for my $i (@$chain){
        if($i->[0] != $newchain[-1][0] and $i->[1] != $newchain[-1][1]){
            push @newchain,$i;
        }
        else {
            $newchain[-1][2] += $i->[2];
        }
    }
    shift @newchain;
    _print_verbose("$$.newchain",\@newchain) if $options->{v};
    $chain = \@newchain;
    return $chain;
}

=head2 calc_common_tokens

Computes the hapaxes (words occurring exactly once) of each corpus, finds
those common to both, and returns them together with the position of each
hapax in each corpus.

=cut

sub calc_common_tokens {
    my ($huCorpus,$enCorpus,$options) = @_;
    my $huFreq = tokenFreq($huCorpus); # Map word => frequency (number of times word appears in corpus)
    my $enFreq = tokenFreq($enCorpus);
    my $huHap = hapaxes($huFreq); # Words which have frequency = 1
    my $enHap = hapaxes($enFreq);

    my $commonHap = findCommonHap($huHap,$enHap,$options->{cf});
    my $huPositions = hapaxPositions($huHap, $huCorpus); # Map word => id_sentence
    my $enPositions = hapaxPositions($enHap, $enCorpus);
    _print_verbose("$$.huPositions",$huPositions) if $options->{v};
    _print_verbose("$$.enPositions",$enPositions) if $options->{v};
    return ($commonHap,$huPositions,$enPositions);
}

=head2 calc_pairs

Builds the list of anchor pairs: for each common hapax, pairs the position
of its occurrence in the first corpus with its position in the second, adds
the corners C<[0,0]> and C<[size1,size2]>, and returns the list sorted by
C<bagSort>.

=cut

sub calc_pairs {
    my ($commonHap,$huPositions,$enPositions,$huCorpus,$enCorpus,$options) = @_;
    my $pairs = []; # (id_sentence_file1, id_sentence_file2)
    _print_verbose("$$.commonHap",$commonHap) if $options->{v};
    for my $t (keys %$commonHap) {
        my $hup = $huPositions->{$t};
        my $enp = $enPositions->{$commonHap->{$t}};
        push @$pairs, [$hup, $enp];
    }
    push @$pairs, [0,0];

    my $corpusSizes = [ scalar @$huCorpus, scalar @$enCorpus ];
    push @$pairs, $corpusSizes;

    $pairs = bagSort($pairs);
    return $pairs;
}

=head2 subcorpora2files

Writes subcorpora to files. For each chunk in the chain, writes one file per
language and prints a tab-separated line (the two file names plus the name
of the alignment file) to standard output, suitable for a hunalign batch file.

=cut

sub subcorpora2files {
    my ($chain,$huTextRef,$enTextRef,$huOffsets,$enOffsets,$outputFilename,$huLangName,$enLangName) = @_;
    _log("Writing subcorpora to files...");
    my $lastPos = [0,0];
    my $ind = 1;
    for my $pos (@$chain) {
        next if $pos->[0] == $lastPos->[0] and $pos->[1] == $lastPos->[1];
        my $baseFilename = "${outputFilename}_$ind";
        my $huSubCorpus = strInterval($huTextRef, $lastPos->[0], $pos->[0],$huOffsets);
        my $enSubCorpus = strInterval($enTextRef, $lastPos->[1], $pos->[1],$enOffsets);
        my $huFilename = "$baseFilename.$huLangName";
        open my $huFile, '>', $huFilename or die "Cannot open $huFilename: $!";
        print $huFile $huSubCorpus;
        close $huFile;

        my $enFilename = "$baseFilename.$enLangName";
        open my $enFile, '>', $enFilename or die "Cannot open $enFilename: $!";
        print $enFile $enSubCorpus;
        close $enFile;

        print "$huFilename\t$enFilename\t$baseFilename.align\n";

        $lastPos = $pos;
        $ind++;
    }
    _log("Done.");
}

=head2 usage

Prints a short description and usage details.

=cut

sub usage {
    _log("Perl port of partialAlign.py, 'a preprocessor for hunalign', with some tweaks.");
    _log("Cuts a very large sentence-segmented unaligned bicorpus into smaller parts.");
    _log("");
    _log("Usage: $0 huge_text_in_one_language huge_text_in_other_language output_filename name_of_first_lang name_of_second_lang [ maximal_size_of_chunks=5000 ] > hunalign_batch");
    _log("");
    _log("The two input files must have one line per sentence. Whitespace-delimited tokenization is preferred.");
    _log("The output is a set of files named output_filename_[123..].name_of_lang");
    exit -1;
}

=head2 tokenFreq

Receives an array of the lines of a text (each line is an array of words).
Calculates the frequency of each word.
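
For instance (an illustrative two-line corpus):

    my $freq = tokenFreq([ [qw/a rose is/], [qw/a rose/] ]);
    # $freq = { a => 2, rose => 2, is => 1 }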

=cut

sub tokenFreq {
    my $corpus = shift;
    my $freq = {};
    for my $l (@$corpus) {
        for my $t (@$l) {
            $freq->{$t}++;
        }
    }
    return $freq;
}

=head2 hapaxes

Receives a hash of token => frequency. Returns a hash with the elements
whose frequency is 1.
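
Continuing the C<tokenFreq> example above:

    my $hap = hapaxes({ a => 2, rose => 2, is => 1 });
    # $hap = { is => 1 }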

=cut

sub hapaxes {
    my $freq = shift;
    my $hapaxes = {};
    while(my ($token, $count) = each(%$freq)) {
        $hapaxes->{$token} = 1 if $count == 1;
    }
    return $hapaxes;
}

=head2 hapaxPositions

Builds a hash of term => position, where position is the index of the
sentence in which the term occurs.
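
A quick illustration (positions are 0-based sentence indices):

    my $pos = hapaxPositions({ is => 1 }, [ [qw/a rose is/], [qw/a rose/] ]);
    # $pos = { is => 0 }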

=cut

sub hapaxPositions {
    my ($hapaxes, $corpus) = @_;
    my $hapaxPos = {};
    my $corpus_size = @$corpus;
    for(my $ind = 0; $ind < $corpus_size; $ind++){
        my $l = $corpus->[$ind];
        for my $t (@$l) {
            $hapaxPos->{$t} = $ind if (defined($hapaxes->{$t}));
        }
    }
    return $hapaxPos;
}

=head2 bagSort

Sorts an array of pairs numerically by both coordinates and collapses
duplicates, returning triples C<[x, y, count]> where count is the number of
times the pair occurred.

=cut

sub bagSort {
    my $l = shift;
    my @sorted;
    my %aux;
    for my $coords (@$l) {
        my ($x,$y) = (@$coords);
        $aux{$x}{$y}++;
    }
    for my $x (sort { $a <=> $b } keys %aux){
        for my $y (sort { $a <=> $b } keys %{$aux{$x}}){
            push @sorted, [$x,$y, $aux{$x}{$y}];
        }
    }
    return \@sorted;
}

=head2 uniqSort

Sorts an array of pairs and removes duplicated pairs.

=cut

sub uniqSort {
    my $l = shift;
    my $hash = {};
    my $uniqSorted = [];
    map { $hash->{$_->[0]}{$_->[1]} = 1 } @$l;
    for my $x (sort { $a <=> $b } keys %$hash){
        for my $y (sort { $a <=> $b } keys %{$hash->{$x}}){
            push @$uniqSorted, [$x,$y];
        }
    }
    return $uniqSorted;
}

=head2 less

Receives two pairs. Checks whether both coordinates of the first pair are
lower than those of the second pair.

=cut

sub less {
    my ($a,$b) = @_;
    if ($a->[0] < $b->[0] and $a->[1] < $b->[1]) { return 1; }
    else { return 0; }
}

=head2 less_relaxed

Receives two pairs. Returns false if they are equal; otherwise checks
whether both coordinates of the first pair are lower than or equal to those
of the second.

=cut

sub less_relaxed {
    my ($a,$b) = @_;
    if ($a->[0] == $b->[0] and $a->[1] == $b->[1]){ return 0; }
    return ($a->[0] <= $b->[0] and $a->[1] <= $b->[1]);
}

=head2 less_or_equal

Receives two pairs. Checks whether both coordinates of the first pair are
lower than or equal to those of the second pair.

=cut

sub less_or_equal {
    my ($a,$b) = @_;
    if ($a->[0] <= $b->[0] and $a->[1] <= $b->[1]) { return 1; }
    else { return 0; }
}

=head2 maximalChain

Receives an array of pairs. Using dynamic programming, selects the maximal
chain, i.e. the heaviest sequence of pairs that is increasing in both
coordinates.
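
A small illustration (input must already be sorted and duplicate-free, e.g.
via C<bagSort>; each element is a C<[x, y, count]> triple):

    my $chain = maximalChain([ [0,0,1], [2,5,2], [3,1,1], [4,6,1] ]);
    # [3,1,1] cannot follow [2,5,2] (not larger in both coordinates),
    # so $chain = [ [0,0,1], [2,5,2], [4,6,1] ]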

=cut

# Assumes that uniqSort was called on the input! (translated from the original Hungarian)
sub maximalChain {
    my $pairs = shift;
    my $lattice = {};
    for my $p (@$pairs) {
        my $bestLength = 0;
        my $bestPredessor = undef;
        for my $q (@$pairs) {
            if(less_relaxed($q,$p) and defined($lattice->{$q->[0]}{$q->[1]})){
                (my $length,undef) = @{$lattice->{$q->[0]}{$q->[1]}};
                if($bestLength < $length+$q->[2]){
                    $bestLength = $length+$q->[2];
                    $bestPredessor = $q;
                }
            }
        }
        $lattice->{$p->[0]}{$p->[1]} = [$bestLength,$bestPredessor];
    }

    # Compute the pair with the maximal bestLength
    my $x = [ map { [$lattice->{$_->[0]}{$_->[1]}[0],$_] } @$pairs ];
    my $y = (sort { $b->[0] <=> $a->[0] } @$x)[0];
    my ($bestLength,$p) = @$y;

    # Walk the predecessor links back to the start
    my $chain = [];
    while($p){
        push @$chain, $p;
        (my $length, $p) = @{$lattice->{$p->[0]}{$p->[1]}};
    }
    return [reverse @$chain];
}

=head2 findCommonHap

Finds unique terms common to both corpora. The notion of equality can be
extended with a file of correspondences between terms.

=head3 findCommonHap($l1Hap,$l2Hap)

Returns a reference to a hash containing the elements common to the hashes
pointed to by the references $l1Hap and $l2Hap.

=head3 findCommonHap($l1Hap,$l2Hap,$corresp_file)

$corresp_file is the path of a correspondence file (see C<parseCorrespFile>);
a correspondence whose terms match exactly one hapax on each side adds that
hapax pair to the result.
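
For example (hypothetical hapax sets and correspondence file):

    # with no correspondence file, only identical tokens match
    my $common = findCommonHap({ Lisboa => 1 }, { Lisbon => 1 });
    # $common = {}

    # with a correspondence file whose body contains "Lisboa = Lisbon"
    $common = findCommonHap({ Lisboa => 1 }, { Lisbon => 1 }, 'pt-en.corresp');
    # $common = { Lisboa => 'Lisbon' }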

=cut

sub findCommonHap {
    my ($l1Hap, $l2Hap, $corresp_file) = @_;

    # Original algorithm (find occurrences of: unique term_l1 = unique term_l2)
    my %hash;
    @hash{keys %$l1Hap} = keys %$l1Hap;
    my $commonHap = {};
    map { $commonHap->{$_} = $_ } grep { $hash{$_} } keys %$l2Hap;

    # Lists of correspondences
    if (defined($corresp_file)) {
        my $corresp_list = parseCorrespFile($corresp_file);

        foreach my $corresp (@$corresp_list) {
            my ($l1_terms,$l2_terms) = @$corresp;
            my $l1_sum = 0;
            my $l1_term;
            for (@$l1_terms,@$l2_terms){
                if(defined($l1Hap->{$_})){
                    $l1_term = $_;
                    $l1_sum++;
                }
            }
            next unless $l1_sum == 1;

            my $l2_sum = 0;
            my $l2_term;
            for (@$l2_terms,@$l1_terms){
                if(defined($l2Hap->{$_})){
                    $l2_term = $_;
                    $l2_sum++;
                }
            }
            next unless $l2_sum == 1;
            $commonHap->{$l1_term} = $l2_term;
        }
    }
    return $commonHap;
}

=head2 selectFromChain

Selects a chain trying to obey the maximalChunkSize constraint.

=cut

sub selectFromChain {
    my ($chain,$maximalChunkSize) = @_;
    my $forced = 0;
    my $cursor;
    my $filteredChain = [];

    my $chain_size = @$chain;
    for (my $ind = 0; $ind < $chain_size; $ind++) {
        my $p = $chain->[$ind];
        if($ind == 0) {
            push @$filteredChain, $p;
            $cursor = $p;
            next;
        }
        if( $p->[0] - $cursor->[0] > $maximalChunkSize or
            $p->[1] - $cursor->[1] > $maximalChunkSize) {
            my $lastPos;
            $lastPos = ($ind!=0 ? $chain->[$ind-1] : [0,0]);
            if ($lastPos != $cursor) { push @$filteredChain, $lastPos }
            else {
                # even the nearest anchor is too far: accept an oversized chunk
                push @$filteredChain,$p;
                $forced = 1;
            }
            $cursor = $filteredChain->[-1];
        }
    }

    push @$filteredChain, $chain->[-1] unless(defined($filteredChain->[-1]) and
                                              $filteredChain->[-1]==$chain->[-1]);

    return ($filteredChain,$forced);
}

=head2 get_corpus

Given a file name, splits the segments and words into an array of arrays.

Returns a reference to the array of arrays, a reference to an array of
pairs with the start and end offsets of each segment, and a reference to
the full text.
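
For example (file name illustrative):

    my ($corpus,$offsets,$txtref) = get_corpus('large.en');
    # $corpus->[0]  : array of the words of the first line
    # $offsets->[0] : [start_byte, end_byte] of the first line
    # $$txtref      : the whole file as a single string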

=cut

sub get_corpus {
    my ($filename) = @_;
    open my $fh, '<', $filename or die "Cannot open $filename: $!";
    my ($start,$end);
    $start = 0;
    my $offsets = [];
    my $corpus = [];
    while(<$fh>){
        $end = tell($fh)-1;
        push @$offsets, [$start,$end];
        $start = $end+1;
        push @$corpus, [ split ];
    }
    close $fh;

    open $fh, '<', $filename or die "Cannot open $filename: $!";
    my $txt = join '',<$fh>;
    close $fh;

    return ($corpus, $offsets, \$txt);
}

=head2 strInterval

Given a corpus and start and end positions, returns a string with the
contents within the given range.

=head3 strInterval($corpus,$first,$last)

Concatenates all the words in the lines comprised in the $first..$last-1
range of the corpus.

=head3 strInterval($corpus,$first,$last,$offsets)

Retrieves from the original text (here $corpus is a reference to the full
text) the substring from the beginning of segment $first to the end of
segment $last-1.
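
For instance, to extract the first two segments of a corpus loaded with
C<get_corpus>:

    my ($corpus,$offsets,$txtref) = get_corpus('large.en');
    my $head = strInterval($txtref, 0, 2, $offsets);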

=cut

sub strInterval {
    my ($corpus,$first,$last,$offsets) = @_;
    unless (defined($offsets)){
        my $s;
        for my $line (@$corpus[$first..$last-1]){
            $s .= (join ' ', @$line) . "\n";
        }
        return $s;
    }
    else {
        my $start = $offsets->[$first][0];
        my $end = $offsets->[$last-1][1];

        my $txt = $$corpus;
        my $s = substr $txt, $start, ($end-$start+1);
        return $s;
    }
}

=head2 parseCorrespFile

Parses a given file with correspondences between two given languages. The
file must follow this DSL:

    file : header correspondence*
    header: 'langs:' L1, L2
    correspondence : term (',' term)* '=' term (',' term)*
    term : word (\s word)*

Does not yet support multi-word terms nor multi-term correspondences!
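
A hypothetical C<pt-en.corresp> file within the supported subset:

    langs: pt, en
    Lisboa = Lisbon   # one single-word term on each side
    Porto = Oporto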

=cut

sub parseCorrespFile {
    my ($filepath) = @_;
    open my $fh, '<', $filepath or die "Cannot open $filepath: $!";
    my $corresp_list = [];

    my $header = <$fh>;
    $header =~ /^langs:\s*(\w+)\s*,\s*(\w+)/i;
    my ($l1,$l2) = ($1,$2);

    while (<$fh>){
        s/#.*$//;           # strip comments
        next if /^\s*$/;    # skip blank lines
        chomp;
        my ($str_l1, $str_l2) = split /\s*=\s*/,$_;
        my $terms_l1 = [ split /\s*,\s*/,$str_l1 ];
        my $terms_l2 = [ split /\s*,\s*/,$str_l2 ];
        push @$corresp_list, [$terms_l1,$terms_l2];
    }
    close $fh;

    return $corresp_list;
}

=head2 seg_split

Splits the text referenced by $txtref into segments, according to $options:
C<-pml> splits on paragraph markup, C<-newline> on newlines. Returns the
corpus (an array of arrays of tokens), the segment offsets, and the text
reference itself.
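
A brief sketch (segmentation and tokenization options combined):

    my $txt = "one sentence\nanother one\n";
    my ($corpus,$offsets) = seg_split(\$txt, { '-newline' => 1, '-ws' => 1 });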

=cut

sub seg_split {
    my ($txtref, $options) = @_;
    my ($corpus,$offsets);
    ($corpus,$offsets) = _seg_split_pml($txtref,$options) if $options->{'-pml'};
    ($corpus,$offsets) = _seg_split_newline($txtref,$options) if $options->{'-newline'};
    return ($corpus, $offsets, $txtref);
}

sub _seg_split_pml {
    my ($txtref,$options) = @_;
    my $corpus = [];
    my $offsets = [];

    while($$txtref =~ /<p>(.*?)<\/p>/g){
        my ($start,$end) = ($-[0],$+[0]);
        push @$offsets, [$start,$end];
        push @$corpus, token_split($1,$options);
    }
    return ($corpus, $offsets);
}

sub _seg_split_newline {
    my ($txtref,$options) = @_;
    my $corpus = [];
    my $offsets = [];

    while($$txtref =~ /(.*)\n/g){
        my ($start,$end) = ($-[1],$+[1]);
        push @$offsets, [$start,$end];
        push @$corpus, token_split($1,$options);
    }
    return ($corpus, $offsets);
}

=head2 token_split

Splits a segment into tokens, according to $options: C<-ws> splits on
whitespace, C<-punct> also splits on punctuation.
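
For example:

    my $tokens = token_split('Hello, world!', { '-punct' => 1 });
    # $tokens = [ 'Hello', 'world' ]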

=cut

sub token_split {
    my ($seg,$options) = @_;
    my $tokens;
    $tokens = _token_split_ws($seg) if $options->{'-ws'};
    $tokens = _token_split_punct($seg) if $options->{'-punct'};
    return $tokens;
}

sub _token_split_ws {
    my $seg = shift;
    return [ split ' ',$seg ];
}

sub _token_split_punct {
    my $seg = shift;
    # note: inside a character class \b matches backspace, not a word boundary
    return [ split /[\b\s?!\.,]+/, $seg ];
}

=head1 AUTHOR

Andre Santos, C<< <andrefs at cpan.org> >>

=head1 BUGS

Please report any bugs or feature requests to
C<bug-text-perfide-partialalign at rt.cpan.org>, or through the web
interface at
L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=Text-Perfide-PartialAlign>.
I will be notified, and then you'll automatically be notified of progress
on your bug as I make changes.

=head1 SUPPORT

You can find documentation for this module with the perldoc command.

    perldoc Text::Perfide::PartialAlign

You can also look for information at:

=over 4

=item * RT: CPAN's request tracker (report bugs here)

L<http://rt.cpan.org/NoAuth/Bugs.html?Dist=Text-Perfide-PartialAlign>

=item * AnnoCPAN: Annotated CPAN documentation

L<http://annocpan.org/dist/Text-Perfide-PartialAlign>

=item * CPAN Ratings

L<http://cpanratings.perl.org/d/Text-Perfide-PartialAlign>

=item * Search CPAN

L<http://search.cpan.org/dist/Text-Perfide-PartialAlign/>

=back

=head1 ACKNOWLEDGEMENTS

Based on the original script partialAlign.py bundled with
hunalign -- http://mokk.bme.hu/resources/hunalign/ .

Thanks to Daniel Varga for helping us to understand how partialAlign.py works.

=head1 LICENSE AND COPYRIGHT

Copyright 2012 Andre Santos.

This program is free software; you can redistribute it and/or modify it
under the terms of either: the GNU General Public License as published
by the Free Software Foundation; or the Artistic License.

See http://dev.perl.org/licenses/ for more information.

=cut

1; # End of Text::Perfide::PartialAlign