File Coverage

blib/lib/WWW/Babelfish.pm

Criterion	Covered	Total	%
statement	21	152	13.8
branch	0	48	0.0
condition	0	23	0.0
subroutine	7	15	46.6
pod	6	7	85.7
total	34	245	13.8

line	stmt	bran	cond	sub	pod	time	code
1							package WWW::Babelfish;
2
3							require 5.008;
4
5	1			1		582	use strict;
	1					1
	1					32
6	1			1		3	use vars qw($VERSION @ISA @EXPORT @EXPORT_OK);
	1					2
	1					100
7
8							require Exporter;
9							require AutoLoader;
10
11							@ISA = qw(Exporter AutoLoader);
12							# Items to export into callers namespace by default. Note: do not export
13							# names by default without a very good reason. Use EXPORT_OK instead.
14							# Do not simply export all your public functions/methods/constants.
15							@EXPORT = qw();
16
17							$VERSION = '0.16';
18
19							# Preloaded methods go here.
20
21	1			1		719	use HTTP::Request::Common qw(POST);
	1					55179
	1					98
22	1			1		1705	use LWP::UserAgent;
	1					37803
	1					33
23	1			1		742	use HTML::TokeParser;
	1					14152
	1					32
24	1			1		889	use IO::String;
	1					4650
	1					29
25	1			1		3056	use Encode;
	1					12827
	1					2463
26
27							my $MAXCHUNK = 1000; # Maximum number of characters
28							# Bablefish will translate at one time
29
30							my $MAXRETRIES = 50; # Maximum number of retries for a chunk of text
31							$\| = 1;
32
33							my $Services = {
34							Babelfish => {
35							agent => $0 . ":" . __PACKAGE__ . "/" . $VERSION,
36
37							languagesrequest => sub {
38							my $req = new HTTP::Request(GET => 'http://babelfish.altavista.com/babelfish/tr?il=en');
39							return $req;
40							},
41
42							translaterequest => sub {
43							my($langpair, $text) = @_;
44							my $req = POST ( 'http://babelfish.altavista.com/babelfish/tr?il=en',
45							[ 'doit' => 'done', 'urltext' => encode("utf8",$text), 'lp' => $langpair, 'Submit' => 'Translate', 'enc' => 'utf8' ], qw(Accept-Charset utf-8) );
46							return $req;
47							},
48
49							# Extract the text from the html we get back from babelfish and return
50							# it (keying on the fact that it's the first thing after a tag,
51							# possibly removing a textarea tag after it).
52
53							# extract_text => sub {
54							# my($html) = @_;
55							# my $p = HTML::TokeParser->new(\$html);
56							# my $tag;
57							# while ($tag = $p->get_tag('input')) {
58							# $_ = @{$tag}[1]->{value} if @{$tag}[1]->{name} eq 'q';
59							# return decode("utf8",$_);
60							# }
61
62							extract_text => sub {
63							my($html) = @_;
64							my $p = HTML::TokeParser->new(\$html);
65							while ( my $_tag = $p->get_tag('div') ) {
66							my($tag,$attr,$attrseq) = @$_tag;
67							next unless @$attrseq == 1
68							&& $attrseq->[-1] eq 'style'
69							&& $attr->{style} eq 'padding:10px;';
70							my($token) = $p->get_token or return;
71							my ( $type, $text, $is_data ) = @$token;
72							next if $type ne 'T';
73							return decode( utf8 => $text );
74							}
75
76
77							}
78							},
79
80							Google => {
81							agent => 'Mozilla/5.0', # Google is finicky
82
83							languagesrequest => sub {
84							my $req = new HTTP::Request(GET => 'http://www.google.com/language_tools?hl=en');
85							return $req;
86							},
87
88							translaterequest => sub {
89							my($langpair, $text) = @_;
90							my $req = POST ( 'http://translate.google.com/translate_t',
91							[ 'text' => encode("utf8",$text), 'langpair' => $langpair, hl => 'en', ie => "UTF8", oe => "UTF8",]);
92							return $req;
93							},
94
95							extract_text => sub {
96							my($html) = @_;
97							my $p = HTML::TokeParser->new(\$html);
98							my $tag;
99							while ($tag = $p->get_tag('div')) {
100							if (@{$tag}[1]->{id} eq 'result_box') {
101							$_ = $p->get_text;
102							return decode("utf8",$_);
103							}
104							}
105							}
106							},
107
108							Yahoo => {
109							agent => $0 . ":" . __PACKAGE__ . "/" . $VERSION,
110
111							languagesrequest => sub {
112							my $req = new HTTP::Request(GET => 'http://babelfish.yahoo.com/translate_txt');
113							return $req;
114							},
115
116							translaterequest => sub {
117							my($langpair, $text) = @_;
118							my $req = POST ( 'http://babelfish.yahoo.com/translate_txt',
119							[ 'ei' => 'UTF-8', 'doit' => 'done', 'tt' => 'urltext', 'trtext' => encode("utf8",$text), 'lp' => $langpair, 'btnTrTxt' => 'Translate', 'intl' => '1' ]);
120							return $req;
121							},
122
123							# Extract the text from the html we get back from Yahoo
124							extract_text => sub {
125							my($html) = @_;
126							my $p = HTML::TokeParser->new(\$html);
127							my $tag;
128							while ($tag = $p->get_tag('div')) {
129							next if (@{$tag}[1]->{id} ne 'result');
130							$_ = $p->get_text('/div');
131							return decode("utf8",$_);
132							}
133							}
134							},
135
136							};
137
138
139							sub new {
140	0			0	1		my ($this, @args) = @_;
141	0		0				my $class = ref($this) \|\| $this;
142	0						my $self = {};
143	0						bless $self, $class;
144	0	0					return undef unless( $self->initialize(@args) );
145	0						return $self;
146							}
147
148							sub initialize {
149	0			0	0		my($self, %params) = @_;
150
151	0		0				$self->{service} = $params{service} \|\| 'Babelfish';
152	0	0					die "No such service: " . $self->{service} unless defined $Services->{ $self->{service} };
153
154							# Caller can set user agent; we default to "script:WWW::Babelfish/0.01"
155	0		0				$self->{agent} = $params{agent} \|\| $Services->{agent};
156
157	0	0					$self->{proxy} = $params{proxy} if defined $params{proxy};
158
159							# Get the page
160	0						my $ua = new LWP::UserAgent;
161	0	0					$ua->proxy('http','http://' . $self->{proxy}) if defined $self->{proxy};
162	0						$ua->agent($self->{agent});
163	0						$self->{ua} = $ua;
164
165	0						my $req = &{ $Services->{ $self->{service} }->{languagesrequest} };
	0
166	0						my $res = $ua->request($req);
167	0	0					unless($res->is_success){
168	0						warn(__PACKAGE__ . ":" . $res->status_line);
169	0						return 0;
170							}
171	0						my $page = $res->content;
172
173							# Extract the language names and the mapping of languages to options to
174							# be passed back, and store them on our object in "Langs" hash of hashes
175							# Incredibly, this works for both Babelfish and Google; it should really
176							# be a method in $Services
177	0						my $p = HTML::TokeParser->new(\$page);
178	0						my $a2b;
179	0	0					if ( $p->get_tag("select") ) {
180	0						while ( $_ = $p->get_tag("option") ) {
181	0						$a2b = $p->get_trimmed_text;
182	0	0					next if $a2b =~ /Select from and to languages/; # This for babelfish
183	0	0					$a2b =~ /(\S+)\sto\s(\S+)/ or next;
184	0						$self->{Langs}{$1}{$2} = $_->[1]{value};
185	0		0				$self->{Langs}{$2} \|\|= {};
186							}
187							}
188
189	0						return 1;
190							}
191
192							sub services {
193	0			0	1		my $self = shift;
194	0	0					if($self){
195	0						return keys %{$self->Services};
	0
196							}
197							else{
198	0						return keys %{$Services};
	0
199							}
200							}
201
202							sub languages {
203	0			0	1		my $self = shift;
204	0						return sort keys %{$self->{Langs}};
	0
205							}
206
207							sub languagepairs {
208	0			0	1		my $self = shift;
209	0						return $self->{Langs};
210							}
211
212							sub translate {
213	0			0	1		my ($self, %params) = @_;
214
215							# Paragraph separator is "\n\n" by default
216	0		0				local $/ = $params{delimiter} \|\| "\n\n";
217	0						local $_;
218
219	0	0					$params{delimiter} = "\n\n" if ( ! defined( $params{delimiter} ) );
220
221	0						undef $self->{error};
222	0	0					unless ( exists($self->{Langs}->{$params{source}}) ) {
223	0						$self->{error} = qq(Language "$params{source}" is not available);
224	0						warn(__PACKAGE__ . ": " . $self->{error} . "\n");
225	0						return undef;
226							}
227
228							# This "feature" is actually useful as a pass-thru filter.
229							# Babelfish doesn't do same-to-same anyway (though it would be
230							# pretty interesting if it did)
231	0	0					return $params{text} if $params{source} eq $params{destination};
232
233	0	0					unless ( exists($self->{Langs}->{$params{source}}{$params{destination}}) ) {
234	0						$self->{error} =
235							qq(Cannot translate from "$params{source}" to "$params{destination}");
236	0						warn(__PACKAGE__ . ": " . $self->{error} . "\n");
237	0						return undef;
238							}
239
240	0						my $langopt = $self->{Langs}{$params{source}}{$params{destination}};
241
242	0						my $th; # "Text Handle"
243	0	0					if ( ref $params{text} ) { # We've been passed a filehandle
244	0						$th = $params{text};
245							} else { # We've been passed a string
246	0						$th = new IO::String($params{text});
247							}
248
249	0						my $Text = "";
250	0						my $WANT_STRING_RETURNED = 0;
251	0	0					unless ( defined $params{ofh} ) {
252	0						$params{ofh} = new IO::String($Text);
253	0						$WANT_STRING_RETURNED = 1;
254							}
255
256							# Variables we use in the next mega-block
257	0						my $para; # paragraph
258	0						my $num_paras = 0; # number of paragraphs
259	0						my $transpara; # translated paragraph
260	0						my $para_start_ws = ""; # initial whitespace in paragraph
261	0						my $chunk; # paragraph piece to feed to babelfish
262							my $req; # LWP request object
263	0						my $ua; # LWP user agent
264	0						my $res; # LWP result
265	0						my $text; # translated chunk
266	0						my $i; # a counter
267	0						while ($para = <$th>) {
268	0						$num_paras++;
269	0						$transpara = "";
270
271							# Extract any leading whitespace from the start of the paragraph
272							# Babelfish will eat it anyway.
273	0	0					if ($para =~ s/(^\s+)(\S)/$2/) {
274	0		0				$para_start_ws = $1 \|\| "";
275							}
276	0						$para =~ s/$params{delimiter}//; # Remove the para delimiter
277
278							CHUNK:
279	0						foreach $chunk ( $self->_chunk_text($MAXCHUNK, $para) ) {
280	0						$req = &{ $Services->{ $self->{service} }->{translaterequest} }($langopt, $chunk);
	0
281	0						$ua = $self->{ua};
282
283							RETRY:
284	0						for ($i = 0; $i <= $MAXRETRIES; $i++) {
285	0						$res = $ua->request($req);
286
287	0	0					if ( $res->is_success ) {
288
289							#$text = $self->_extract_text($res->as_string); #REMOVE
290	0						$text = &{ $Services->{ $self->{service} }->{extract_text} }($res->as_string);
	0
291	0	0	0				if ( ( ! defined( $text ) ) \|\|
292							( $text =~ /^\\time-out\\/ )
293							) # in-band signalling; yuck
294							{
295	0						next RETRY;
296
297							} ## end if
298
299	0						$text =~ s/\n$//; # Babelfish likes to append newlines
300	0						$transpara .= $text;
301
302	0						next CHUNK;
303							}
304							}
305	0						$self->{error} = "Request timed out more than $MAXRETRIES times";
306	0						return undef;
307							}
308	0	0					print { $params{ofh} } $/ if $num_paras > 1;
	0
309	0						print { $params{ofh} } $para_start_ws . $transpara;
	0
310							}
311
312	0	0					if ( $WANT_STRING_RETURNED ) {
313	0						return $Text;
314							} else {
315	0						return 1;
316							}
317							}
318
319							sub error {
320	0			0	1		my $self = shift;
321	0						return $self->{error};
322							}
323
324							# Given a maximum chunk size and some text, return
325							# an array of pieces of the text chopped up in a
326							# logical way and less than or equal to the chunk size
327							sub _chunk_text {
328	0			0			my($self, $max, $text) = @_;
329
330	0						my @result;
331
332							# The trivial case
333	0	0					return($text) if length($text) <= $max;
334
335							# Hmmm. There are a couple of ways we could do this.
336							# I'm guessing that Babelfish doesn't look at any structure larger than
337							# a sentence; in fact I'm often tempted to guess that it doesn't look
338							# at anything larger than a word, but we'll give it the benefit of the doubt.
339							#
340
341							# FIXME there are no built-in regexps for matching sentence
342							# breaks; I'm not sure if terminal punctuation will work for all
343							# languages...
344
345	0						my $total = length($text);
346	0						my $offset = 0;
347	0						my $lastoffset = 0;
348	0						my $test;
349							my $chunk;
350
351	0						while ( ($total - $lastoffset) > $max) {
352	0						$test = $lastoffset + $max;
353
354							# Split by terminal punctuation...
355	0						@_ = sort {$b <=> $a} ( rindex($text, '.', $test),
	0
356							rindex($text, '!', $test),
357							rindex($text, '?', $test),
358							);
359	0						$offset = shift(@_) + 1;
360
361							# or by clause...
362	0	0	0				if ( $offset == -1 or $offset <= $lastoffset ) {
363	0						@_ = sort {$b <=> $a} ( rindex($text, ',', $test),
	0
364							rindex($text, ';', $test),
365							rindex($text, ':', $test),
366							);
367	0						$offset = shift(@_) + 1;
368
369
370							# or by word
371	0	0	0				if ( $offset == -1 or $offset <= $lastoffset) {
372	0						$offset = rindex($text, " ", $test);
373							}
374
375							# or give up
376	0	0					return undef if $offset == -1;
377							}
378
379	0						$chunk = substr($text, $lastoffset, $offset - $lastoffset);
380
381	0						push( @result, $chunk);
382	0						$lastoffset = $offset;
383							}
384
385	0						push( @result, substr($text, $lastoffset) );
386	0						return @result;
387							}
388
389							# This code is now obsoleted by the new result page format, but I'm
390							# leaving it here commented out in case we end up needing the
391							# whitespace hack again.
392							#
393							# my ($tag,$token);
394							# my $text="";
395							# if ($tag = $p->get_tag('br')) {
396							# while ($token = $p->get_token) {
397							# next if shift(@{$token}) ne "T";
398							# $text = shift(@{$token});
399
400							# #$text =~ s/[\r\n]//g;
401							# # This patch for whitespace handling from Olivier Scherler
402							# $text =~ s/[\r\n]/ /g;
403							# $text =~ s/^\s*//;
404							# $text =~ s/\s+/ /g;
405							# $text =~ s/\s+$//;
406
407							# last if defined($text) and $text ne "";
408							# }
409							# }
410							# return $text;
411
412
413							#}
414
415
416							# Autoload methods go after =cut, and are processed by the autosplit program.
417
418							1;
419							__END__