File Coverage

blib/lib/Locale/Maketext/Utils/Phrase.pm
Criterion Covered Total %
statement 81 83 97.5
branch 46 52 88.4
condition 62 71 87.3
subroutine 16 16 100.0
pod 9 9 100.0
total 214 231 92.6


line stmt bran cond sub pod time code
1             package Locale::Maketext::Utils::Phrase;
2              
3 7     7   399010 use strict;
  7         13  
  7         313  
4 7     7   37 use warnings;
  7         12  
  7         1600  
5 7     7   46 use Carp ();
  7         47  
  7         129  
6              
7 7     7   1527 use Module::Want ();
  7         5107  
  7         18500  
8              
9             $Locale::Maketext::Utils::Phrase::VERSION = '0.1';
10              
11             our $closing_bn = qr/(?<!\~)\]/;
12             our $opening_bn = qr/(?<!\~)\[/;
13             our $bn_delimit = qr/(?<!\~)\,/;
14             our $bn_var_arg = qr/(?<!\~)\_(?:0|\-?[1-9]+[0-9]*|\*)/;
15              
16             sub get_bn_var_regexp {
17 193     193 1 4560 return qr/(?<!\~)\_(?:0|\-?[1-9]+[0-9]*|\*)/;
18             }
19              
20             sub get_non_translatable_type_regexp {
21 1     1 1 233318 return qr/(?:var|meth|basic_var)/;
22             }
23              
24             sub string_has_opening_or_closing_bracket {
25 5   100 5 1 21970 return $_[0] =~ m/$opening_bn/ || $_[0] =~ m/$closing_bn/;
26             }
27              
28             sub phrase2struct {
29 124     124 1 329628 my ($phrase) = @_;
30              
31             # Makes parsing (via code or mentally) unnecessarily difficult.
32             # ? TODO ? s/~~/_TILDES_/g (yes w/ an S so _TILDE_ still works) then restore them inside the while loop and don’t croak() here (maybe carp()) ?
33 124 50       564 Carp::croak("Consecutive tildes are ambiguous (use the special placeholder _TILDE_ instead): “$phrase”") if $phrase =~ m/~~/;
34              
35 124 100       3399 return [$phrase] unless $phrase =~ m/(?:$opening_bn|$closing_bn)/;
36              
37 104         285 my @struct;
38 104         1915 while (
39             $phrase =~ m{
40             ( # Capture chunk of …
41             # bracket notation …
42             (?:
43             $opening_bn
44             ( # Capture bracket pair contents
45             (?:
46             \~\]
47             |
48             [^\]]
49             )*
50             )
51             $closing_bn
52             )
53             |
54             # … or non-bracket notation
55             (?:
56             \~[\[\]]
57             |
58             [^\[\]]
59             )+
60             ) # /Capture chunk of …
61             }gx
62             ) {
63 443         3036 my ( $match, $bn_inside ) = ( $1, $2 );
64              
65 443 100       5223 if ( defined $bn_inside ) {
66 188 100       1823 if ( $bn_inside =~ m/(?:$closing_bn|$opening_bn)/ ) {
67 6         3445 Carp::croak("Unbalanced bracket: “[$bn_inside]”");
68             }
69              
70 182         521 my $list = [ _split_bn_cont($bn_inside) ];
71 182         562 my $type = _get_bn_type_from_list($list);
72              
73 182         3915 push @struct,
74             {
75             'orig' => $match,
76             'cont' => $bn_inside,
77             'list' => $list,
78             'type' => $type,
79             };
80             }
81             else {
82              
83             # probably won't trip but for good measure
84 255 50       1790 if ( $match =~ m/(?:$opening_bn|$closing_bn)/ ) {
85 0         0 Carp::croak("Unbalanced bracket: “$match”");
86             }
87              
88 255         6175 push @struct, $match;
89             }
90             }
91              
92 98 50       283 return if !@struct;
93              
94             # if the structure rebuilds differently it means unbalanced [ or ] existed in $phrase that were masked out in @struct
95 98 100       364 if ( struct2phrase( \@struct ) ne $phrase ) {
96 10         2755 Carp::croak("Unbalanced bracket: “$phrase”");
97             }
98              
99 88         418 return \@struct;
100             }
101              
102             sub struct2phrase {
103 102     102 1 1675 my ($struct) = @_;
104              
105             return join(
106             '',
107 102 100       870 map { ref($_) ? $_->{'orig'} : $_ } @{$struct}
  435         1853  
  102         283  
108             );
109             }
110              
111             sub phrase_has_bracket_notation {
112 8 100   8 1 4825 return 1 if $_[0] =~ m/$opening_bn/;
113 2         15 return;
114             }
115              
116             sub struct_has_bracket_notation {
117 4     4 1 9 my $len = @{ $_[0] };
  4         12  
118 4 100 100     48 return 1 if ( $len == 1 && ref( $_[0]->[0] ) ) || $len > 1;
      100        
119 1         7 return;
120             }
121              
122             sub phrase_is_entirely_bracket_notation {
123 8 100   8 1 2138 return 1 if $_[0] =~ m{\A$opening_bn(?:\~[\[\]]|[^\[\]])+$closing_bn\z}x;
124 6         32 return;
125             }
126              
127             sub struct_is_entirely_bracket_notation {
128 106 100 100 106 1 183 return 1 if @{ $_[0] } == 1 && ref( $_[0]->[0] );
  106         786  
129 97         368 return;
130             }
131              
132             sub _split_bn_cont {
133 404     404   457642 my ( $cont, $limit ) = @_;
134 404   100     2275 $limit = abs( int( $limit || 0 ) );
135 404 100       4790 return $limit ? split( $bn_delimit, $cont, $limit ) : split( $bn_delimit, $cont, -1 );
136             }
137              
138             my %meth = (
139             'numf' => 'Should be passing in an unformatted number.',
140             '#' => 'Should be passing in an unformatted number (numf alias).',
141             'format_bytes' => 'Should be passing in the unformatted number of bytes.',
142             'output' => sub {
143             return 'Should be passing in character identifier.' if $_[0]->[1] eq 'chr';
144             return 'Displayed without modification.' if $_[0]->[1] eq 'asis' || $_[0]->[1] eq 'asis_for_tests';
145             return 'No args, character.' if $_[0]->[1] =~ m/^(?:nbsp|amp|quot|apos|shy|lt|gt)/;
146             return 'Domain should be passed in. Hardcoded domain that needs translated should just be a string.' if $_[0]->[1] eq 'encode_puny' || $_[0]->[1] eq 'decode_puny';
147             return;
148             },
149             'datetime' => sub {
150             return 'format has no translatable components' if !$_[0]->[2] # there is no format (FWIW, 0 is not a valid format)
151             || $_[0]->[2] =~ m/\A(?:date|time|datetime)_format_(:full|long|medium|short|default)\z/ # it is a format method name
152             || $_[0]->[2] =~ m/\A[GgyYQqMmwWdDEeaAhHKkSszZvVuLFcj]+(?:{[0-9],?([0-9])?})?\z/; # is only CLDR Pattern codes …
153              
154             # … i.e. which includes values for format_for() AKA $loc->available_formats(),
155             # http://search.cpan.org/perldoc?DateTime#CLDR_Patterns says:
156             # If you want to include any lower or upper case ASCII characters as-is, you can surround them with single quotes (').
157             # If you want to include a single quote, you must escape it as two single quotes ('').
158             # Spaces and any non-letter text will always be passed through as-is.
159              
160             return;
161             },
162             'current_year' => 'Takes no args.',
163             'asis' => 'Displayed without modification.',
164             'comment' => 'Not displayed.',
165             'join' => 'Arbitrary args.',
166             'sprintf' => 'Arbitrary args.',
167             'convert' => 'Converts arbitrary units and identifiers.', # ? technically USD -> GBP, not critical ATM ?
168             'list_and' => 'Arbitrary args.',
169             'list_or' => 'Arbitrary args.',
170             'list_and_quoted' => 'Arbitrary args.',
171             'list_or_quoted' => 'Arbitrary args.',
172             'list' => 'Deprecated. Arbitrary args.',
173             );
174              
175             my %basic = (
176             'output' => 'has possible translatable parts',
177             'datetime' => 'has possible translatable components in format',
178             );
179              
180             my %complex = (
181             'boolean' => 'should have translatable parts',
182             'is_defined' => 'should have translatable parts',
183             'is_future' => 'should have translatable parts',
184             'quant' => 'should have translatable parts',
185             '*' => 'should have translatable parts (quant alias)',
186             'numerate' => 'should have translatable parts',
187             );
188              
189             my $ns_regexp = Module::Want::get_ns_regexp();
190              
191             sub _get_attr_hash_from_list {
192 179     179   381 my ( $list, $start_idx ) = @_;
193              
194 179         318 my $last_list_idx = @{$list} - 1;
  179         340  
195              
196 179         303 my %attr;
197 179         264 my $skip_to = 0;
198 179         533 for my $i ( $start_idx .. $last_list_idx ) {
199 373 100       812 next if $i < $skip_to;
200 193 100       984 next if $list->[$i] =~ m/\A$bn_var_arg\z/;
201              
202 180         588 $attr{ $list->[$i] } = $list->[ $i + 1 ];
203 180         283 $skip_to = $i + 2;
204             }
205              
206 179         768 return %attr;
207             }
208              
209             sub _get_bn_type_from_list {
210 397     397   166713 my ($list) = @_;
211 397         767 my $len = @{$list};
  397         868  
212              
213 397 100 100     2419 return 'var' if $len == 1 && $list->[0] =~ m/\A$bn_var_arg\z/;
214              
215             # recommend to carp/croak
216 290 100 100     4464 return '_invalid' if !defined $list->[0] || $list->[0] !~ m/\A(?:$ns_regexp|\*|\#)\z/;
217 276 100 66     2845 return '_invalid' if $list->[0] eq 'output' && ( !defined $list->[1] || $list->[1] !~ m/\A$ns_regexp\z/ );
      100        
218              
219             # should not be anything translatable
220 275 100 100     1813 return 'meth' if exists $meth{ $list->[0] } && ( ref( $meth{ $list->[0] } ) ne 'CODE' || $meth{ $list->[0] }->($list) );
      100        
221              
222 228 50 33     971 if ( exists $basic{ $list->[0] } && ( ref( $basic{ $list->[0] } ) ne 'CODE' || $basic{ $list->[0] }->($list) ) ) {
      66        
223              
224             # check for 'basic_var' (might be basic except there are not any translatable parts)
225              
226 206 50       3510 if ( $list->[0] eq 'output' ) {
227 206 100       767 if ( $list->[1] =~ m/\A(?:underline|strong|em|class|attr|inline|block|sup|sub)\z/ ) {
228 131         295 my %attr = _get_attr_hash_from_list( $list, 3 );
229              
230 131 100 100     1835 if ( $list->[2] =~ m/\A$bn_var_arg\z/
      100        
      100        
      100        
231             && ( !exists $attr{'title'} || $attr{'title'} =~ m/\A$bn_var_arg\z/ )
232             && ( !exists $attr{'alt'} || $attr{'alt'} =~ m/\A$bn_var_arg\z/ ) ) {
233 36         228 return 'basic_var';
234             }
235             }
236              
237             # TODO: do url && factor in html/plain attr && add to t/13.phrase_object_precursor_functions.t
238 170 100       646 if ( $list->[1] =~ m/\A(?:img|abbr|acronym)\z/ ) {
239 42         95 my %attr = _get_attr_hash_from_list( $list, 4 );
240              
241             # if any of these are true (except maybe $list->[2]) w/ these functions
242             # then the caller is probably doing something wrong, the class/methods
243             # will help find those sort of things better.
244 42 100 66     720 if ( $list->[2] =~ m/\A$bn_var_arg\z/
      100        
      100        
      100        
      100        
245             && $list->[3] =~ m/\A$bn_var_arg\z/
246             && ( !exists $attr{'title'} || $attr{'title'} =~ m/\A$bn_var_arg\z/ )
247             && ( !exists $attr{'alt'} || $attr{'alt'} =~ m/\A$bn_var_arg\z/ ) ) {
248 12         92 return 'basic_var';
249             }
250             }
251             }
252              
253 158         849 return 'basic';
254             }
255              
256 22 50 33     177 return 'complex' if exists $complex{ $list->[0] } && ( ref( $complex{ $list->[0] } ) ne 'CODE' || $complex{ $list->[0] }->($list) );
      33        
257 0           return '_unknown'; # recommend to treat like 'basic' unless its one you know about that your class defines or if it's a show stopper
258             }
259              
260             1;
261              
262             __END__
263              
264             =encoding utf-8
265              
266             =head1 NAME
267              
268             Locale::Maketext::Utils::Phrase - Consolidated Phrase Introspection
269              
270             =head1 VERSION
271              
272             This document describes Locale::Maketext::Utils::Phrase version 0.1
273              
274             =head1 SYNOPSIS
275              
276             use Locale::Maketext::Utils::Phrase ();
277              
278             my $struct = Locale::Maketext::Utils::Phrase::phrase2struct(
279             "So long, and thanks for [output,strong,all] the fish."
280             );
281              
282             for my $piece (@{$struct}) {
283             if (!ref($piece)) {
284             # this $piece is a non-bracket notation chunk
285             }
286             else {
287             # this $piece is a hashref describing the bracket notation chunk
288             }
289             }
290              
291             =head1 DESCRIPTION
292              
293             This module is meant to allow you to simplify an already complex task by doing all of the parsing and basic categorization of bracket notation (or lack of BN) for you.
294              
295             That way you do not have to worry about parsing or matching the syntax/escaping/delimiters/etc correctly and then maintaining it in each place it is used.
296              
297             =head1 INTERFACE
298              
299             =head2 Object
300              
301             Eventually the base functions below will be used in an object that can be used for even more complete and fine tuned introspection.
302              
303             For now these functions allow us to do most of what we need with little trouble.
304              
305             =head2 Functions
306              
307             Terms:
308              
309             =over 4
310              
311             =item Phrase
312              
313             A string intended to be passed to maketext() that may or may not contain bracket notation.
314              
315             =item Struct
316              
317             An array ref that represents a parsed phrase.
318              
319             Each item in that array is either a string (a chunk that is not bracket notation) or a hashref (a chunk that is bracket notation).
320              
321             The hashref has the following keys:
322              
323             =over 4
324              
325             =item orig
326              
327             The value is the original bracket notation string in its entirety. e.g. '[output,strong,NOT]'
328              
329             =item cont
330              
331             The value is the content of the inside of original bracket notation string. e.g. 'output,strong,NOT'
332              
333             =item list
334              
335             The value is the original bracket notation in list form. e.g. an array reference containing 'output', 'strong', 'NOT'.
336              
337             =item type
338              
339             This is a string defining what general type of bracket notation we’re dealing with:
340              
341             =over 4
342              
343             =item 'var'
344              
345             The content is a variable reference (i.e. not translatable).
346              
347             e.g. [_1]
348              
349             =item 'meth'
350              
351             The content is a method that shouldn’t have any translatable part.
352              
353             e.g. [numf,_1]
354              
355             =item 'basic'
356              
357             The content is a method that can have translatable parts and follows a basic pattern like the first part or two after the method can be a string and the rest can be an arbitrary name/value attribute list.
358              
359             e.g. [output,strong,foo]
360              
361             =item 'basic_var'
362              
363             The content is 'basic' except every possible translatable part is a variable reference (i.e. not translatable).
364              
365             e.g. [output,strong,_1]
366              
367             =item 'complex'
368              
369             The content is more complicated than 'basic'.
370              
371             =item '_unknown'
372              
373             The content type could not be determined. This is not necessarily an error. It could be a method specific to your object, it could be something this module misses (rt please!).
374              
375             =item '_invalid'
376              
377             The content type is invalid.
378              
379             This could be something L<Locale::Maketext> would see as a syntax error (e.g. [" ,foo"]) or something it might allow through (on purpose or by happenstance (e.g. [])) but is ambiguous for no gain.
380              
381             =back
382              
383             =back
384              
385             =back
386              
387             =head3 Phrase related
388              
389             These all take a phrase as their only argument.
390              
391             =head4 phrase2struct()
392              
393             Returns the struct for the given phrase.
394              
395             If there is a problem it will croak either "Unbalanced bracket: “…”" or "L</"Consecutive tildes are ambiguous">: “…”".
396              
397             =head4 phrase_has_bracket_notation()
398              
399             Returns a boolean.
400              
401             True: the given phrase has bracket notation.
402              
403             False: the given phrase does not have any bracket notation.
404              
405             =head4 phrase_is_entirely_bracket_notation()
406              
407             Returns a boolean.
408              
409             True: the given phrase is entirely bracket notation.
410              
411             False: the given phrase is not entirely bracket notation.
412              
413             =head4 Consecutive tildes are ambiguous
414              
415             In order to keep the parsing as simple/fast as possible we avoid trying to properly interpret multiple consecutive tildes.
416              
417             In the rare case you really need a literal ~ to precede a comma, ~, [, or ] (really, anywhere in the string) just use the explicit placeholder string “_TILDE_”.
418              
419             $lh->maketext('A tilde is this: _TILDE_, you like?');
420              
421             $lh->maketext('A tilde [output,strong,is this: _TILDE_, you like]?');
422              
423             =head3 Structure Related
424              
425             These all take a struct as their only argument.
426              
427             =head4 struct2phrase()
428              
429             Returns the given struct as a stringified phrase.
430              
431             =head4 struct_has_bracket_notation()
432              
433             Returns a boolean.
434              
435             True: the given struct has bracket notation.
436              
437             False: the given struct does not have any bracket notation.
438              
439             =head4 struct_is_entirely_bracket_notation()
440              
441             Returns a boolean.
442              
443             True: the given struct is entirely bracket notation.
444              
445             False: the given struct is not entirely bracket notation.
446              
447             =head3 Misc
448              
449             =head4 get_bn_var_regexp()
450              
451             Takes no arguments, returns a regular expression that matches bracket notation variable syntax.
452              
453             my $bn_var_regexp = Locale::Maketext::Utils::Phrase::get_bn_var_regexp();
454             if ($string =~ m/\A$bn_var_regexp\z/) {
455             # string is a BN variable
456             }
457             elsif ($string =~ m/$bn_var_regexp/) {
458             # string contains a BN variable
459             }
460              
461             my @bn_variables = $string =~ m/($bn_var_regexp)/g;
462              
463             =head4 get_non_translatable_type_regexp()
464              
465             Takes no arguments, returns a regular expression that matches types that should not have any translatable parts.
466              
467             my $non_translatable_type_regexp = Locale::Maketext::Utils::Phrase::get_non_translatable_type_regexp();
468             if ($piece->{'type'} =~ m/\A$non_translatable_type_regexp\z/) {
469             # nothing to translate here, move along, move along
470             }
471              
472             if ($xliff->{'ctype'} =~ m/\Ax-bn-$non_translatable_type_regexp\z/) {
473             # handle the XLIFF syntax for non-translatable <ph> tags back into bracket notation
474             }
475              
476             =head4 string_has_opening_or_closing_bracket()
477              
478             Takes one argument, a string. Returns true if it contains an opening or closing bracket.
479              
480             if ( !Locale::Maketext::Utils::Phrase::string_has_opening_or_closing_bracket($string) ){
481             # $string does not have any bracket notation.
482             }
483              
484             =head3 Private functions
485              
486             These are essentially meant to be used internally but if you find a use for them be sure to verify the values you pass to them or you will get odd results.
487              
488             =over 4
489              
490             =item _split_bn_cont()
491              
492             Takes the 'cont' of the bracket notation piece hashref and optionally the max number of item to split it into and returns the resulting array.
493              
494             Used internally to build the hash’s 'list' value.
495              
496             =item _get_attr_hash_from_list()
497              
498             Takes the 'list' of the bracket notation piece hashref and the index of where the arbitrary attributes begin and returns a hash. Accounts for non-key/value variable array refs.
499              
500             =item _get_bn_type_from_list()
501              
502             Takes the 'list' of the bracket notation piece hashref and returns the type.
503              
504             Used internally to build the hash’s 'type' value.
505              
506             =back
507              
508             =head1 DIAGNOSTICS
509              
510             Nothing besides what is documented in phrase2struct().
511              
512             =head1 CONFIGURATION AND ENVIRONMENT
513              
514             Locale::Maketext::Utils::Phrase requires no configuration files or environment variables.
515              
516             =head1 DEPENDENCIES
517              
518             L<Locale::Maketext::Utils>
519              
520             =head1 INCOMPATIBILITIES
521              
522             None reported.
523              
524             =head1 BUGS AND LIMITATIONS
525              
526             No bugs have been reported.
527              
528             Please report any bugs or feature requests to
529             C<bug-locale-maketext-utils-mock@rt.cpan.org>, or through the web interface at
530             L<http://rt.cpan.org>.
531              
532             =head1 TODO
533              
534             Add in the object layer to really make the introspection complete.
535              
536             =head1 AUTHOR
537              
538             Daniel Muey C<< <http://drmuey.com/cpan_contact.pl> >>
539              
540             =head1 LICENCE AND COPYRIGHT
541              
542             Copyright (c) 2012, Daniel Muey C<< <http://drmuey.com/cpan_contact.pl> >>. All rights reserved.
543              
544             This module is free software; you can redistribute it and/or
545             modify it under the same terms as Perl itself. See L<perlartistic>.
546              
547             =head1 DISCLAIMER OF WARRANTY
548              
549             BECAUSE THIS SOFTWARE IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
550             FOR THE SOFTWARE, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
551             OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
552             PROVIDE THE SOFTWARE "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER
553             EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
554             WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE
555             ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE SOFTWARE IS WITH
556             YOU. SHOULD THE SOFTWARE PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL
557             NECESSARY SERVICING, REPAIR, OR CORRECTION.
558              
559             IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
560             WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
561             REDISTRIBUTE THE SOFTWARE AS PERMITTED BY THE ABOVE LICENCE, BE
562             LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL,
563             OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE
564             THE SOFTWARE (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING
565             RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A
566             FAILURE OF THE SOFTWARE TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF
567             SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
568             SUCH DAMAGES.