File Coverage

blib/lib/XML/TMX/CWB.pm

Criterion	Covered	Total	%
statement	10	12	83.3
branch			n/a
condition			n/a
subroutine	4	4	100.0
pod			n/a
total	14	16	87.5

line	stmt	sub	time	code
1				package XML::TMX::CWB;
2				$XML::TMX::CWB::VERSION = '0.10';
3	1	1	13660	use warnings;
	1		3
	1		28
4	1	1	4	use strict;
	1		1
	1		40
5	1	1	497	use Lingua::PT::PLNbase;
	1		28086
	1		9
6	1	1	2289	use XML::TMX::Reader;
	0
	0
7				use XML::TMX::Writer;
8				use CWB::CL::Strict;
9				use File::Spec::Functions;
10				use Encode;
11
12				use POSIX qw(locale_h);
13				setlocale(&POSIX::LC_ALL, "pt_PT");
14				use locale;
15
16				=head1 NAME
17
18				XML::TMX::CWB - TMX interface with Open Corpus Workbench
19
20				=head1 SYNOPSIS
21
22				XML::TMX::CWB->toCWB( tmx => $tmxfile,
23				from => 'PT', to => 'EN',
24				corpora => "/corpora",
25				corpus_name => "foo",
26				tokenize_source => 1,
27				tokenize_target => 1,
28				verbose => 1,
29				registry => '/path/to/cwb/registry' );
30
31				XML::TMX::CWB->toTMX( source => 'sourcecorpus',
32				target => 'targetcorpus',
33				source_lang => 'PT',
34				target_lang => 'ES',
35				verbose => 1,
36				output => "foobar.tmx");
37
38
39				=head1 METHODS
40
41				=head2 toTMX
42
43				Fetches an aligned pair of corpora on CWB and exports it as a TMX
44				file.
45
46				=cut
47
48				sub toTMX {
49				shift if $_[0] eq 'XML::TMX::CWB';
50				my %ops = @_;
51
52				die "Source and target corpora names are required.\n" unless $ops{source} and $ops{target};
53
54				my $Cs = CWB::CL::Corpus->new(uc $ops{source});
55				die "Can't find corpus [$ops{source}]\n" unless $Cs;
56				my $Ct = CWB::CL::Corpus->new(uc $ops{target});
57				die "Can't find corpus [$ops{target}]\n" unless $Ct;
58
59				my $align = $Cs->attribute(lc($ops{target}), "a");
60				my $count = $align->max_alg;
61
62				my $Ws = $Cs->attribute("word", "p");
63				my $Wt = $Ct->attribute("word", "p");
64
65				my $tmx = new XML::TMX::Writer();
66				$tmx->start_tmx( $ops{output} ? (OUTPUT => $ops{output}) : (),
67				TOOL => 'XML::TMX::CWB',
68				TOOLVERSION => $XML::TMX::CWB::VERSION);
69				for my $i (0 .. $count-1) {
70				my ($s1, $s2, $t1, $t2) = $align->alg2cpos($i);
71				my $source = join(" ",$Ws->cpos2str($s1 .. $s2));
72				my $target = join(" ",$Wt->cpos2str($t1 .. $t2));
73				Encode::_utf8_on($source);
74				Encode::_utf8_on($target);
75				$tmx->add_tu($ops{source_lang} => $source,
76				$ops{target_lang} => $target);
77				}
78				$tmx->end_tmx();
79				}
80
81				=head2 toCWB
82
83				Imports a TMX file (just two languages) to a parallel corpus on CWB.
84
85				=cut
86
87				sub _get_header_prop_list {
88				my ($reader, $prop, $default) = @_;
89
90				$default \|\|= [];
91
92				return $default unless exists $reader->{header}{-prop}{$prop};
93
94				my $value = (ref($reader->{header}{-prop}{$prop}) eq "ARRAY")
95				? join(",",@{$reader->{header}{-prop}{$prop}})
96				: $reader->{header}{-prop}{$prop};
97
98				return [ split /\s,\s/ => $value ];
99				}
100
101
102				sub _RUN {
103				my $command = shift;
104				print STDERR "Running [$command]\n";
105				`$command`;
106				}
107
108				sub toCWB {
109				shift if $_[0] eq 'XML::TMX::CWB';
110				my %ops = @_;
111
112				my $tmx = $ops{tmx} or die "tmx file not specified.\n";
113
114				my $corpora = $ops{corpora} \|\| "/corpora";
115				die "Need a corpora folder" unless -d $corpora;
116
117				die "Can't open [$tmx] file for reading\n" unless -f $tmx;
118
119				# Create reader object
120				my $reader = XML::TMX::Reader->new($tmx);
121
122				my %tagged_languages = ();
123				for my $language (@{ _get_header_prop_list($reader, "pos-tagged") }) {
124				$tagged_languages{$language}++;
125				}
126
127				my $has_tagged_languages = scalar(keys %tagged_languages);
128
129				my $tag_data = undef;
130				my $s_attributes = ['s'];
131				my $pos_fields = [qw'word lemma pos'];
132
133				if ($has_tagged_languages) {
134
135				if ($ops{tok} \|\| $ops{tokenize_source} \|\| $ops{tokenize_target}) {
136				warn "Can't tokenize tagged languages. Ignoring tagging request for ",
137				join(", ", keys %tagged_languages);
138				}
139
140				$s_attributes = _get_header_prop_list($reader, 'pos-s-attributes', $s_attributes);
141				$pos_fields = _get_header_prop_list($reader, 'pos-fields', $pos_fields);
142
143				$tag_data = {
144				languages => \%tagged_languages,
145				'pos-s-attributes' => $s_attributes,
146				'pos-fields' => $pos_fields
147				};
148				}
149
150				my ($source, $target);
151
152				if ($ops{mono}) {
153				# Detect what languages to use
154				($source) = _detect_language($reader,
155				($ops{mono} \|\| undef));
156				$ops{verbose} && print STDERR "Using language [$source]\n";
157				} else {
158				# Detect what languages to use
159				($source, $target) = _detect_languages($reader,
160				($ops{from} \|\| undef),
161				($ops{to} \|\| undef));
162				$ops{verbose} && print STDERR "Using languages [$source, $target]\n";
163				}
164
165				# Detect corpus registry
166				my $registry = $ops{registry} \|\| $ENV{CORPUS_REGISTRY};
167				chomp( $registry = `cwb-config -r` ) unless $registry;
168				die "Could not detect a suitable CWB registry folder.\n" unless $registry && -d $registry;
169
170				# detect corpus name
171				my $cname = $ops{corpus_name} \|\| $tmx;
172				$cname =~ s/[.-]/_/g;
173
174				if ($ops{mono}) {
175				_mtmx2cqpfiles($reader, $cname, $source,
176				$ops{tok} \|\| 0,
177				$ops{verbose} \|\| 0,
178				$tag_data
179				);
180				_mencode($cname, $corpora, $registry, $source, $tag_data);
181				} else {
182				_tmx2cqpfiles($reader, $cname, $source, $target,
183				$ops{tokenize_source} \|\| 0,
184				$ops{tokenize_target} \|\| 0,
185				$ops{verbose} \|\| 0,
186				$tag_data
187				);
188				_encode($cname, $corpora, $registry, $source, $target, $tag_data);
189				unlink "target.cqp";
190				unlink "align.txt";
191				}
192
193				unlink "source.cqp";
194				}
195
196				sub _encode {
197				my ($cname, $corpora, $registry, $l1, $l2, $tagged) = @_;
198
199				my @languages = ($l1, $l2);
200				my @files = (qw'source target');
201
202				for my $i (0, 1) {
203				my ($name, $folder, $reg);
204				my ($posatt, $sattr) = ("", "");
205
206				my $l = $languages[$i];
207				my $f = $files[$i];
208
209				if ($tagged && exists($tagged->{languages}{$l})) {
210				# shift @{$tagged->{'pos-fields'}};
211				my (undef, @tags) = @{$tagged->{'pos-fields'}};
212				$posatt = join(" ", map { "-P $_" } @tags);
213				$sattr = join(" ", map { "-S $_" } @{$tagged->{'pos-s-attributes'}});
214				}
215
216				$name = lc("${cname}_$l");
217				$folder = catfile($corpora, $name);
218				$reg = catfile($registry, $name);
219
220				mkdir $folder;
221
222				_RUN("cwb-encode -c utf8 -d $folder -f $f.cqp -R $reg -S tu+id $sattr $posatt");
223				_RUN("cwb-make -r $registry -v " . uc($name));
224				}
225
226				_RUN("cwb-align-import -r $registry -v align.txt");
227				_RUN("cwb-align-import -r $registry -v -inverse align.txt");
228				}
229
230				sub _mencode {
231				my ($cname, $corpora, $registry, $l1, $tagged) = @_;
232
233				my $name = lc("${cname}_$l1");
234				my $folder = catfile($corpora, $name);
235				my $reg = catfile($registry, $name);
236
237				my ($posatt, $sattr) = ("", "");
238				if ($tagged && $tagged->{languages}{$l1}) {
239				shift @{$tagged->{'pos-fields'}};
240				$posatt = join(" ", map { "-P $_" } @{$tagged->{'pos-fields'}});
241				$sattr = join(" ", map { "-S $_" } @{$tagged->{'pos-s-attributes'}});
242				}
243
244				mkdir $folder;
245				_RUN("cwb-encode -c utf8 -d $folder -f source.cqp -R $reg -S tu+id $sattr $posatt");
246				_RUN("cwb-make -r $registry -v " . uc($name));
247				}
248
249				sub _tmx2cqpfiles {
250				my ($reader, $cname, $l1, $l2, $t1, $t2, $v, $tagged) = @_;
251				open F1, ">:utf8", "source.cqp" or die "Can't create cqp outfile\n";
252				open F2, ">:utf8", "target.cqp" or die "Can't create cqp outfile\n";
253				open AL, ">:utf8", "align.txt" or die "Can't create alignment file\n";
254				my $i = 1;
255
256				printf AL "%s\t%s\ttu\tid_{id}\n", uc("${cname}_$l1"), uc("${cname}_$l2");
257
258				print STDERR "Processing..." if $v;
259
260				my $proc = sub {
261				my ($langs) = @_;
262				return unless exists $langs->{$l1} && exists $langs->{$l2};
263
264				my (@S, @T);
265
266				# Language 1
267				if ($tagged && exists($tagged->{languages}{$l1})) {
268				@S = split /\n/, $langs->{$l1}{-seg};
269				}
270				else {
271				for ($langs->{$l1}{-seg}) {
272				s/&/&/g;
273				s/
274				s/
275				}
276				@S = $t1 ? tokenize($langs->{$l1}{-seg}) : split /\s+/, $langs->{$l1}{-seg};
277				}
278
279				# Language 2
280				if ($tagged && exists($tagged->{languages}{$l2})) {
281				@T = split /\n/, $langs->{$l2}{-seg};
282				}
283				else {
284				for ($langs->{$l2}{-seg}) {
285				s/&/&/g;
286				s/
287				s/
288				}
289
290				@T = $t2 ? tokenize($langs->{$l2}{-seg}) : split /\s+/, $langs->{$l2}{-seg};
291				}
292
293				print STDERR "\rProcessing... $i translation units" if $v && $i%1000==0;
294
295				print AL "id_$i\tid_$i\n";
296				print F1 "\n", join("\n", @S), "\n\n";
297				print F2 "\n", join("\n", @T), "\n\n";
298				++$i;
299				};
300
301				$reader->for_tu( $proc );
302				print STDERR "\rProcessing... $i translation units\n" if $v;
303				close AL;
304				close F1;
305				close F2;
306				}
307
308				sub _mtmx2cqpfiles {
309				my ($reader, $cname, $l1, $t1, $v, $tagged) = @_;
310				open F1, ">:utf8", "source.cqp" or die "Can't create cqp outfile\n";
311				my $i = 1;
312
313				print STDERR "Processing..." if $v;
314
315				my $proc = sub {
316				my ($langs) = @_;
317				return unless exists $langs->{$l1};
318
319				my (@S);
320				if ($tagged && exists($tagged->{languages}{$l1})) {
321				@S = split /\n/, $langs->{$l1}{-seg};
322				} else {
323				for ($langs->{$l1}{-seg}) {
324				s/&/&/g;
325				s/
326				s/>/>/g;
327				}
328
329				@S = $t1 ? tokenize($langs->{$l1}{-seg}) : split /\s+/, $langs->{$l1}{-seg};
330				}
331				print STDERR "\rProcessing... $i translation units" if $v && $i%1000==0;
332
333				print F1 "\n", join("\n", @S), "\n\n";
334				++$i;
335				};
336
337				$reader->for_tu( $proc );
338				print STDERR "\rProcessing... $i translation units\n" if $v;
339				close F1;
340				}
341
342				sub _detect_languages {
343				my ($reader, $from, $to) = @_;
344				my @languages = $reader->languages();
345
346				die "Language $from not available\n" if $from and !grep{_ieq($_, $from)}@languages;
347				die "Language $to not available\n" if $to and !grep{_ieq($_, $to)} @languages;
348
349				($from) = grep { _ieq($_, $from) } @languages if $from;
350				($to) = grep { _ieq($_, $to ) } @languages if $to;
351
352				return ($from, $to) if $from and $to;
353
354				if (scalar(@languages) == 2) {
355				$to = grep { $_ ne $from } @languages if $from and not $to;
356				$from = grep { $_ ne $to } @languages if $to and not $from;
357				($from, $to) = @languages if not ($to or $from);
358				return ($from, $to) if $from and $to;
359				}
360				die "Can't guess what languages to use!\n";
361				}
362
363				sub _detect_language {
364				my ($reader, $mono) = @_;
365				my @languages = $reader->languages();
366
367				die "Language $mono not available\n" if $mono and !grep {_ieq($_, $mono)} @languages;
368
369				($mono) = grep { _ieq($_, $mono) } @languages if $mono;
370
371				return ($mono) if $mono;
372
373				if (scalar(@languages) == 1) {
374				($mono) = @languages;
375				return ($mono);
376				}
377				die "Can't guess what languages to use!\n";
378				}
379
380				sub _ieq {
381				uc($_[0]) eq uc($_[1])
382				}
383
384
385
386				=head1 AUTHOR
387
388				Alberto Simoes, C<< >>
389
390				=head1 BUGS
391
392				Please report any bugs or feature requests to C, or through
393				the web interface at L. I will be notified, and then you'll
394				automatically be notified of progress on your bug as I make changes.
395
396
397
398
399				=head1 SUPPORT
400
401				You can find documentation for this module with the perldoc command.
402
403				perldoc XML::TMX::CWB
404
405
406				You can also look for information at:
407
408				=over 4
409
410				=item * RT: CPAN's request tracker
411
412				L
413
414				=item * AnnoCPAN: Annotated CPAN documentation
415
416				L
417
418				=item * CPAN Ratings
419
420				L
421
422				=item * Search CPAN
423
424				L
425
426				=back
427
428
429				=head1 ACKNOWLEDGEMENTS
430
431
432				=head1 LICENSE AND COPYRIGHT
433
434				Copyright 2010-2014 Alberto Simoes.
435
436				This program is free software; you can redistribute it and/or modify it
437				under the terms of either: the GNU General Public License as published
438				by the Free Software Foundation; or the Artistic License.
439
440				See http://dev.perl.org/licenses/ for more information.
441
442
443				=cut
444
445				1; # End of XML::TMX::CWB