File Coverage

blib/lib/I22r/Translate/Microsoft.pm
Criterion Covered Total %
statement 37 39 94.8
branch n/a
condition n/a
subroutine 13 13 100.0
pod n/a
total 50 52 96.1


line stmt bran cond sub pod time code
1             package I22r::Translate::Microsoft;
2 1     1   13104 use Moose;
  1         299588  
  1         4  
3             with 'I22r::Translate::Backend';
4 1     1   4889 use I22r::Translate::Result;
  1         37638  
  1         34  
5 1     1   603 use Time::HiRes;
  1         1066  
  1         3  
6 1     1   87 use Carp;
  1         1  
  1         57  
7 1     1   807 use Data::Dumper;
  1         4633  
  1         48  
8 1     1   486 use Encode;
  1         6393  
  1         57  
9 1     1   430 use HTML::Entities;
  1         3635  
  1         61  
10 1     1   402 use HTTP::Request;
  1         13779  
  1         23  
11 1     1   5 use HTTP::Headers;
  1         1  
  1         16  
12 1     1   588 use JSON;
  1         8007  
  1         5  
13 1     1   642 use LWP::UserAgent;
  1         14959  
  1         24  
14 1     1   380 use URL::Encode 'url_encode';
  1         2678  
  1         41  
15 1     1   454 use XML::XPath;
  0            
  0            
16              
17             use constant TOKENAUTH_URL =>
18             'https://datamarket.accesscontrol.windows.net/v2/OAuth2-13';
19             use constant SERVICE_URL =>
20             'http://api.microsofttranslator.com/V2/Http.svc';
21             use constant TEXT_TAG =>
22             "<string xmlns=\"http://schemas.microsoft.com/2003/10/Serialization/Arrays\">";
23              
24             sub encode_content {
25             # encode a string to be translated. Non-ASCII text must be UTF-8 encoded
26             # and the characters < & > must be encoded as their XML/HTML entities.
27             my $text = shift;
28             $text = HTML::Entities::encode_entities( $text, q[<>&] );
29             $text = Encode::encode("utf-8", $text);
30             return $text;
31             }
32              
33             sub decode_ents {
34             my $text = shift;
35             $text = HTML::Entities::decode_entities( $text, q[<>&] );
36             return $text;
37             }
38              
39              
40              
41             our $VERSION = '0.95';
42              
43             # TODO: apply MooseX::ClassAttribute to these variables
44             our %remap = ( hm => 'nww', zh => 'zh-CHS' );
45             our %unremap = ( mww => 'hm', "zh-CHS" => "zh" );
46             our @bing_languages = qw(ar bg ca zh-CHS zh-CHT cs da nl en et fi fr
47             de el ht he hi mww hu id it ja ko lv lt no fa pl pt ro ru sk sl
48             es sv th tr uk vi );
49              
50             my $token = { expired => 0 };
51             sub _token { $token }
52              
53             sub BUILD {
54             my $self = shift;
55             $self->name('Bing') unless $self->name;
56             }
57              
58             sub can_translate {
59             my ($self, $lang1, $lang2) = @_;
60             if ($lang1 eq $lang2) {
61             return 1;
62             }
63             my $langs = join(" ", keys %remap, @bing_languages);
64             return -1 unless " $langs " =~ / \Q$lang1 / && " $langs " =~ / \Q$lang2 /;
65             if ($lang1 =~ /zh/ && $lang2 =~ /zh/) {
66             return 0.9;
67             }
68             return 0.39 + 0.02 * rand;
69             }
70              
71             sub network_available { 1 }
72              
73             sub get_translations {
74             my ($self, $req) = @_;
75              
76             return unless $req->config('ENABLED');
77             return unless $self->network_available();
78             return unless $self->_check_token($req);
79              
80             # XXX - handle source encoding
81              
82             my (%result, %untext);
83             my %text = %{ $req->text };
84             while (my ($id,$text) = each %text) {
85             push @{ $untext{$text} }, $id;
86             }
87            
88             my $src = $remap{ $req->src } // $req->src;
89             my $dest = $remap{ $req->dest } // $req->dest;
90             my $content0 = _template($src, $dest);
91             my @text = keys %untext;
92             my @translated;
93              
94             while (@text) {
95             last if $req->timed_out;
96             last if !$self->_check_token($req);
97              
98             my $uri = URI->new( SERVICE_URL . "/GetTranslationsArray");
99             my $headers = HTTP::Headers->new;
100             $headers->header( Authorization => $token->{authorization} );
101             $headers->header( "Content-Type" => "text/xml" );
102             my $content = $content0;
103              
104             my @itext;
105             my $otext = shift @text;
106             my $xtext = TEXT_TAG . encode_content($otext) . "</string>\n";
107             while (length($content) + length($xtext) < 10000) {
108             $content =~ s{<TXT/>\n*}{$xtext . "<TXT/>"}e;
109             push @itext, $otext;
110             last unless defined( $otext = shift @text );
111             $xtext =
112             "<string xmlns=\"http://schemas.microsoft.com/2003/10/Serialization/Arrays\">"
113             . encode_content($otext) . "</string>\n";
114             }
115             if (defined $otext) {
116             unshift @text, $otext;
117             }
118             $content =~ s{<TXT/>}{};
119             $content =~ s{<COUNT/>}{scalar @itext}e;
120              
121             # $content = Encode::encode("utf-8", $content);
122              
123             my $remaining;
124             my $translation_start = Time::HiRes::time();
125              
126             eval {
127             $SIG{ALRM} = sub { die "bing translator timeout\n" };
128             alarm( $req->config("timeout") // 15 );
129              
130             my $request = HTTP::Request->new(
131             POST => $uri, $headers, $content );
132             my $response = $token->{ua}->request($request);
133              
134             $remaining = alarm(0);
135              
136             if ($response->code == 414) {
137             die "Request was too long (content length was "
138             . length($content) . ")";
139             }
140             if ($response->code != 200) {
141             my $rq = delete $response->{_request};
142             print STDERR "$_ => " . Dumper($rq->{$_}) . "\n" for keys %$rq;
143             die "Error response from the Bing translator: ", Dumper($response);
144             }
145             my $dc = $response->decoded_content;
146              
147             my $xp = XML::XPath->new( xml => $dc );
148             my @resultnodes = $xp->findnodes("//TranslatedText");
149             for my $i (0 .. $#resultnodes) {
150             my $input = $itext[$i];
151              
152             my $output = $xp->getNodeText($resultnodes[$i]);
153             $output = decode_ents($output);
154              
155             # XXX - handle destination encoding
156              
157             my $ids = $untext{$input};
158             foreach my $id (@$ids) {
159             $req->results->{$id} = I22r::Translate::Result->new(
160             id => $id,
161             otext => $input,
162             olang => $unremap{$req->src} // $req->src,
163             lang => $unremap{$req->dest} // $req->dest,
164             text => $output,
165             source => $self->name,
166             length => length($output),
167             time => time
168             );
169             push @translated, $id;
170             }
171             $self->config->{_NETWORK_ERR} = 0;
172             }
173             };
174             if ($@) {
175             if ($@ =~ /bing translator timeout/) {
176             carp __PACKAGE__, ": translation timed out";
177             }
178             # XXX - what does network error look like?
179             elsif ($@ =~ /asdfasdfasdf/) {
180             carp __PACKAGE__, ": network error - $@";
181             $self->config->{_NETWORK_ERR}++;
182             } else {
183             carp __PACKAGE__, ": error in translation: $@";
184             }
185             }
186             my $translation_elapsed = Time::HiRes::time() - $translation_start;
187             }
188             return @translated;
189             }
190              
191             sub _template {
192             my ($src, $dest) = @_;
193             my $content0 = qq[
194             <GetTranslationsArrayRequest>
195             <AppId />
196             <From>$src</From>
197             <Options>
198             <Category xmlns="http://schemas.datacontract.org/2004/07/Microsoft.MT.Web.Service.V2">general</Category>
199             <ContentType xmlns="http://schemas.datacontract.org/2004/07/Microsoft.MT.Web.Service.V2">text/plain</ContentType>
200             <ReservedFlags xmlns="http://schemas.datacontract.org/2004/07/Microsoft.MT.Web.Service.V2"/>
201             <State xmlns="http://schemas.datacontract.org/2004/07/Microsoft.MT.Web.Service.V2" />
202             <Uri xmlns="http://schemas.datacontract.org/2004/07/Microsoft.MT.Web.Service.V2"></Uri>
203             <User xmlns="http://schemas.datacontract.org/2004/07/Microsoft.MT.Web.Service.V2">TestUserId</User>
204             </Options>
205             <Texts>
206             <TXT/>
207             </Texts>
208             <To>$dest</To>
209             <MaxTranslations><COUNT/></MaxTranslations>
210             </GetTranslationsArrayRequest>];
211              
212             return $content0;
213             }
214              
215              
216              
217              
218              
219              
220              
221             sub _check_token {
222             my ($pkg, $req) = @_;
223              
224             if (time < ($token->{expires} // 0)) {
225             I22r::Translate->log( $req->{logger}, "Microsoft backend: ",
226             "auth token still valid" );
227             return 1;
228             }
229              
230             my ($client_id, $secret);
231             if ($req) {
232             $client_id = $req->config('CLIENT_ID');
233             $secret = $req->config('SECRET');
234             } else {
235             $client_id = $pkg->config('CLIENT_ID');
236             $secret = $pkg->config('SECRET');
237             }
238             unless ($client_id && $secret) {
239             I22r::Translate->log( $req->{logger},
240             "Microsoft backend: ",
241             "client_id or secret missing, ",
242             "cannot obtain auth token!" );
243             return;
244             }
245              
246             if (!$token->{secret} || $token->{secret} ne $secret) {
247             $token->{secret} = $secret;
248             $token->{secretx} = url_encode( $secret );
249             }
250             $token->{client_idx} = url_encode( $client_id );
251              
252             my $content = join( '&',
253             'grant_type=client_credentials',
254             'client_id=' . $token->{client_idx},
255             'client_secret=' . $token->{secretx},
256             'scope=http://api.microsofttranslator.com/' );
257              
258             my $headers = HTTP::Headers->new(
259             'Content-Type', 'application/x-www-form-urlencoded');
260             $token->{ua} //= LWP::UserAgent->new;
261             my $request = HTTP::Request->new(
262             POST => TOKENAUTH_URL,
263             $headers, $content );
264             my $req_start = time;
265             my $res = $token->{ua}->request( $request );
266             if (!$res->is_success) {
267              
268             eval {
269             my $js = decode_json( $res->decoded_content );
270             if ($js->{error} && $js->{error_description}) {
271             carp __PACKAGE__,
272             ": Failed to refresh authorization token ",
273             "for client [$client_id\n$secret]:\n",
274             "Error: ", $js->{error}, "\n",
275             "Error description: ", $js->{error_description}, "\n",
276             "Result code: ", $res->{_rc}, "\n",
277             "FULL ERROR: ", join(";",%$js), "\n",
278             "REQUEST: ", join(" ",%$request), "\n",
279             "REQ_HEADERS: ", join(" ", %{$request->{_headers}}), "\n";
280             I22r::Translate->log( $req->{logger},
281             "request for auth token failed: ",
282             $js->{error}, "/",
283             $js->{error_description} );
284             1;
285             }
286             } or carp __PACKAGE__, ": Failed to refresh authorization token for",
287             " client [$client_id]. $!\n";
288             return;
289             }
290             my $dc = $res->decoded_content;
291             my $js = eval { decode_json( $dc ) };
292             if ($@) {
293             carp "decode_json error. input was $dc";
294             return;
295             }
296              
297             my $expires = $js->{expires_in};
298             if ($expires > 120) {
299             $expires -= 60;
300             } else {
301             $expires /= 2;
302             }
303             $token->{expires} = $req_start + $expires;
304             $token->{token} = $js->{access_token};
305             $token->{authorization} = "Bearer " . $token->{token};
306             I22r::Translate->log($req->{logger}, "Microsoft backend: ",
307             "obtained auth token");
308             return 1;
309             }
310              
311             1;
312             # End of I22r::Translate::Microsoft
313              
314             =head1 NAME
315              
316             I22r::Translate::Microsoft - Microsoft Translator backend for I22r::Translate
317             framework
318              
319             =head1 VERSION
320              
321             Version 0.95
322              
323             =head1 SYNOPSIS
324              
325             I22r::Translate->config(
326             'I22r::Translate::Microsoft' => {
327             ENABLED => 1,
328             CLIENT_ID => 'your_Microsoft/Azure_client_id',
329             SECRET => 'your_Microsoft/Azure_secret'
330             }
331             );
332              
333             $translation = I22r::Translate->translate_string(
334             src => 'en', dest => 'es', text => 'hello world',
335             quality => { 'I22r::Translate::Microsoft' => 2.0 } );
336              
337             =head1 DESCRIPTION
338              
339             Invokes Microsoft's translation webservice to translate content
340             from one language to another.
341              
342             =head1 CONFIG
343              
344             You instruct the L<I22r::Translate> package to use the
345             Microsoft backend by passing a key-value pair to the
346             L<I22r::Translate::config|I22r::Translate/"config"> method
347             where the key is the string "C<I22r::Translate::Microsoft>"
348             and the value is a hash reference with at least the following
349             key-value pairs:
350              
351             =over 4
352              
353             =item ENABLED => 0 | 1
354              
355             Must be set to a true value for the Microsoft backend to function.
356              
357             =item CLIENT_ID => userid
358              
359             Required Windows Azure Marketplace client ID for accessing the
360             Microsoft Translator API. See L<"CREDENTIALS">, below.
361              
362             =item SECRET => 44-character string
363              
364             Required Windows Azure Marketplace "client secret" for accessing the
365             Microsoft Translator API. See L<"CREDENTIALS">, below.
366              
367             =item timeout => integer
368              
369             Stops a translation job after a certain number of seconds have
370             passed. Optional. Any translations that were completed before
371             the timeout will still be returned.
372              
373             =item callback => code reference or function name
374              
375             A function to be invoked when the Microsoft backend obtains
376             a translation result.
377             The function will be called with two arguments: the
378             L<request|I22r::Translate::Request> object that is handling the
379             translation, and a hash reference containing the fields and values
380             for the new translation result.
381              
382             You can have separate callbacks in the global configuration, for each
383             backend, and for the current request.
384              
385             =item filter => array reference
386              
387             List of filters to use (see L<I22r::Translate::Filter>) when
388             sending text to the Microsoft Translate webservice.
389              
390             =back
391              
392             =head1 CREDENTIALS
393              
394             This package interacts with the Microsoft Translator API,
395             which requires some you/us to provide a "client id" and
396             "client secret" to access Microsoft's data services.
397             As of October 2012, here are the steps you need to take
398             to get those credentials. (If these steps don't work anymore,
399             and you do figure out what steps you need to do, L<let me
400             know|mailto:mob@cpan.org> or L<file a bug report|"SUPPORT">
401             and I'll update this document.
402              
403             =over 4
404              
405             =item 1.
406              
407             If you don't have a Windows Live ID , sign up
408             for one at L<https://signup.live.com/signup.aspx?lic=1>
409              
410             =item 2.
411              
412             Visit L<https://datamarket.azure.com/dataset/bing/microsofttranslator>.
413             Register for a "Windows Azure Marketplace" account.
414              
415             =item 3.
416              
417             Choose a Microsoft Translator data plan. One of the
418             available plans is a free option for 2,000,000 characters/month.
419              
420             =item 4.
421              
422             Now you have to "register an application". Visit
423             L<https://datamarket.azure.com/developer/applications> and hit the
424             big green B<REGISTER> button.
425              
426             =item 5.
427              
428             Choose any "Client ID" and "Name" for your application. The "URI"
429             is also a required field, but the translator API doesn't use it, so you
430             can put whatever you like in that field, too.
431              
432             Make a note of the "Client ID" value that you entered and the
433             "Client secret" value that Microsoft provided. You will have to provide
434             these values to the C<I22r::Translate::Microsoft> backend configuration
435             with the C<CLIENT_ID> and C<SECRET> keys.
436              
437             Example: If your application registration screen looks like:
438              
439             * Client ID angus
440             * Name The Beefinator
441             * Client secret ykiDjfQ9lztW/oFUC4t2ciPWH2nJS88FqXcQbs/Z9Y=7
442             * Redirect URI https://ilikebeef.com/
443             Description The multilingual Beefinator site
444              
445             Then you would configure the Microsoft backend with
446              
447             I22r::Translate->config(
448             'I22r::Translate::Microsoft' => {
449             ENABLED => 1,
450             CLIENT_ID => "angus",
451             SECRET => "ykiDjfQ9lztW/oFUC4t2ciPWH2nJS88FqXcQbs/Z9Y=7"
452             } );
453              
454             (these are not real credentials).
455              
456             =back
457              
458             =head1 AUTHOR
459              
460             Marty O'Brien, C<< <mob at cpan.org> >>
461              
462             =head1 BUGS
463              
464             Please report any bugs or feature requests to
465             C<bug-i22r-translate-microsoft at rt.cpan.org>, or through
466             the web interface at
467             L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=I22r-Translate-Microsoft>.
468             I will be notified, and then you'll automatically be notified of progress
469             on your bug as I make changes.
470              
471             =head1 SUPPORT
472              
473             You can find documentation for this module with the perldoc command.
474              
475             perldoc I22r::Translate::Microsoft
476              
477             You can also look for information at:
478              
479             =over 4
480              
481             =item * RT: CPAN's request tracker (report bugs here)
482              
483             L<http://rt.cpan.org/NoAuth/Bugs.html?Dist=I22r-Translate-Microsoft>
484              
485             =item * AnnoCPAN: Annotated CPAN documentation
486              
487             L<http://annocpan.org/dist/I22r-Translate-Microsoft>
488              
489             =item * CPAN Ratings
490              
491             L<http://cpanratings.perl.org/d/I22r-Translate-Microsoft>
492              
493             =item * Search CPAN
494              
495             L<http://search.cpan.org/dist/I22r-Translate-Microsoft/>
496              
497             =back
498              
499             =head1 SUBROUTINES/METHODS
500              
501             There should be no need to use the methods of this package directly.
502             See L<I22r::Translate::Backend> and L<I22r::Translate>.
503              
504             =head1 SEE ALSO
505              
506             L<I22r::Translate>
507              
508             =head1 LICENSE AND COPYRIGHT
509              
510             Copyright 2012-2013 Marty O'Brien.
511              
512             This program is free software; you can redistribute it and/or modify it
513             under the terms of either: the GNU General Public License as published
514             by the Free Software Foundation; or the Artistic License.
515              
516             See http://dev.perl.org/licenses/ for more information.
517              
518              
519             =cut
520