File Coverage

blib/lib/Lingua/Translit.pm
Criterion Covered Total %
statement 79 81 97.5
branch 36 46 78.2
condition 12 18 66.6
subroutine 13 13 100.0
pod 6 6 100.0
total 146 164 89.0


line stmt bran cond sub pod time code
1             package Lingua::Translit;
2              
3             #
4             # Copyright (C) 2007-2008 ...
5             # Alex Linke
6             # Rona Linke
7             # Copyright (C) 2009-2016 Lingua-Systems Software GmbH
8             # Copyright (C) 2016-2017 Netzum Sorglos, Lingua-Systems Software GmbH
9             # Copyright (C) 2017 Netzum Sorglos Software GmbH
10             #
11              
12 25     25   1190518 use strict;
  25         253  
  25         753  
13 25     25   130 use warnings;
  25         44  
  25         901  
14              
15             require 5.008;
16              
17 25     25   131 use Carp qw/croak/;
  25         40  
  25         1232  
18 25     25   6649 use Encode qw/encode decode/;
  25         145936  
  25         1565  
19              
20 25     25   15800 use Lingua::Translit::Tables;
  25         861  
  25         21253  
21              
22             our $VERSION = '0.28';
23              
24             =pod
25              
26             =encoding utf8
27              
28             =head1 NAME
29              
30             Lingua::Translit - transliterates text between writing systems
31              
32             =head1 SYNOPSIS
33              
34             use Lingua::Translit;
35              
36             my $tr = new Lingua::Translit("ISO 843");
37              
38             my $text_tr = $tr->translit("character oriented string");
39              
40             if ($tr->can_reverse()) {
41             $text_tr = $tr->translit_reverse("character oriented string");
42             }
43              
44             =head1 DESCRIPTION
45              
46             Lingua::Translit can be used to convert text from one writing system to
47             another, based on national or international transliteration tables.
48             Where possible a reverse transliteration is supported.
49              
50             The term C describes the conversion of text from one
51             writing system or alphabet to another one.
52             The conversion is ideally unique, mapping one character to exactly one
53             character, so the original spelling can be reconstructed.
54             Practically this is not always the case and one single letter of the
55             original alphabet can be transcribed as two, three or even more letters.
56              
57             Furthermore there is more than one transliteration scheme for one writing
58             system.
59             Therefore it is an important and necessary information, which scheme will be
60             or has been used to transliterate a text, to work integrative and be able to
61             reconstruct the original data.
62              
63             Reconstruction is a problem though for non-unique transliterations, if no
64             language specific knowledge is available as the resulting clusters of
65             letters may be ambiguous.
66             For example, the Greek character "PSI" maps to "ps", but "ps" could also
67             result from the sequence "PI", "SIGMA" since "PI" maps to "p" and "SIGMA"
68             maps to s.
69             If a transliteration table leads to ambiguous conversions, the provided
70             table cannot be used reverse.
71              
72             Otherwise the table can be used in both directions, if appreciated.
73             So if ISO 9 is originally created to convert Cyrillic letters to
74             the Latin alphabet, the reverse transliteration will transform Latin
75             letters to Cyrillic.
76              
77             =head1 METHODS
78              
79             =head2 new(I<"name of table">)
80              
81             Initializes an object with the specific transliteration table, e.g. "ISO 9".
82              
83             =cut
84              
85             sub new {
86 40     40 1 10457 my $class = shift();
87 40         79 my $name = shift();
88              
89 40         59 my $self;
90              
91             # Assure that a table name was set
92 40 100       260 croak("No transliteration name given.") unless $name;
93              
94             # Stay compatible with programs that use Lingua::Translit < 0.05
95 39 100       158 if ( $name =~ /^DIN 5008$/i ) {
96 1         2 $name = "Common DEU";
97             }
98              
99 39         140 my $table = Lingua::Translit::Tables::_get_table_reference($name);
100              
101             # Check that a table reference was assigned to the object
102 39 50       132 croak("No table found for $name.") unless $table;
103              
104             # Assure the table's data is complete
105 39 50       119 croak("$name table: missing 'name'") unless defined $table->{name};
106 39 50       107 croak("$name table: missing 'desc'") unless defined $table->{desc};
107 39 50       105 croak("$name table: missing 'reverse'") unless defined $table->{reverse};
108 39 50       100 croak("$name table: missing 'rules'") unless defined $table->{rules};
109              
110             # Copy over the table's data
111 39         104 $self->{name} = $table->{name};
112 39         87 $self->{desc} = $table->{desc};
113 39         75 $self->{rules} = $table->{rules};
114              
115             # Set a truth value of the transliteration's reversibility according to
116             # the natural language string in the original transliteration table
117 39 100       180 $self->{reverse} = ( $table->{reverse} =~ /^true$/i ) ? 1 : 0;
118              
119 39         75 undef($table);
120              
121 39         163 return bless $self, $class;
122             }
123              
124             =head2 translit(I<"character oriented string">)
125              
126             Transliterates the given text according to the object's transliteration
127             table.
128             Returns the transliterated text.
129              
130             =cut
131              
132             sub translit {
133 153     153 1 24396 my $self = shift();
134 153         219 my $text = shift();
135              
136             # Return if no input was given
137 153 50       318 return unless $text;
138              
139 153         357 my $utf8_flag_on = Encode::is_utf8($text);
140              
141 153 100       267 unless ($utf8_flag_on) {
142 59         169 $text = decode( "UTF-8", $text );
143             }
144              
145 153         4799 foreach my $rule ( @{ $self->{rules} } ) {
  153         339  
146 14308 100       244052 if ( defined $rule->{context} ) {
147 493         612 my $c = $rule->{context};
148              
149             # single context rules
150 493 100 100     2149 if ( defined $c->{before} && !defined $c->{after} ) {
    100 66        
    50 33        
151 107         1539 $text =~ s/$rule->{from}(?=$c->{before})/$rule->{to}/g;
152             }
153             elsif ( defined $c->{after} && !defined $c->{before} ) {
154 4     4   22 $text =~ s/(?<=$c->{after})$rule->{from}/$rule->{to}/g;
  4         8  
  4         49  
  223         2968  
155             }
156              
157             # double context rules: logical "inbetween"
158             elsif ( defined $c->{before} && defined $c->{after} ) {
159 163     7   2231 $text =~ s/
  7         38  
  7         13  
  7         67  
160             (?<=$c->{after})$rule->{from}(?=$c->{before})
161             /$rule->{to}/gx;
162             }
163              
164             else {
165 0         0 croak("incomplete rule context");
166             }
167             }
168             else {
169 13815         190671 $text =~ s/$rule->{from}/$rule->{to}/g;
170             }
171             }
172              
173 153 100       333 unless ($utf8_flag_on) {
174 59         197 return encode( "UTF-8", $text );
175             }
176             else {
177 94         308 return $text;
178             }
179             }
180              
181             =head2 translit_reverse(I<"character oriented string">)
182              
183             Transliterates the given text according to the object's transliteration
184             table, but uses it the other way round. For example table ISO 9 is a
185             transliteration scheme for the conversion of Cyrillic letters to the Latin
186             alphabet. So if used reverse, Latin letters will be mapped to Cyrillic ones.
187              
188             Returns the transliterated text.
189              
190             =cut
191              
192             sub translit_reverse {
193 18     18 1 9244 my $self = shift();
194 18         36 my $text = shift();
195              
196             # Return if no input was given
197 18 50       51 return unless $text;
198              
199             # Is this transliteration reversible?
200 18 50       63 croak("$self->{name} cannot be reversed") unless $self->{reverse};
201              
202 18         53 my $utf8_flag_on = Encode::is_utf8($text);
203              
204 18 100       43 unless ($utf8_flag_on) {
205 15         40 $text = decode( "UTF-8", $text );
206             }
207              
208 18         698 foreach my $rule ( @{ $self->{rules} } ) {
  18         45  
209 1503 100       32780 if ( defined $rule->{context} ) {
210 178         221 my $c = $rule->{context};
211              
212             # single context rules
213 178 100 100     895 if ( defined $c->{before} && !defined $c->{after} ) {
    100 66        
    50 33        
214 54         753 $text =~ s/$rule->{to}(?=$c->{before})/$rule->{from}/g;
215             }
216             elsif ( defined $c->{after} && !defined $c->{before} ) {
217 19         164 $text =~ s/(?<=$c->{after})$rule->{to}/$rule->{from}/g;
218             }
219              
220             # double context rules: logical "inbetween"
221             elsif ( defined $c->{before} && defined $c->{after} ) {
222 105         1268 $text =~ s/
223             (?<=$c->{after})$rule->{to}(?=$c->{before})
224             /$rule->{from}/gx;
225             }
226              
227             else {
228 0         0 croak("incomplete rule context");
229             }
230             }
231             else {
232 1325         8827 $text =~ s/$rule->{to}/$rule->{from}/g;
233             }
234             }
235              
236 18 100       49 unless ($utf8_flag_on) {
237 15         48 return encode( "UTF-8", $text );
238             }
239             else {
240 3         10 return $text;
241             }
242             }
243              
244             =head2 can_reverse()
245              
246             Returns true (1), iff reverse transliteration is possible.
247             False (0) otherwise.
248              
249             =cut
250              
251             sub can_reverse {
252 22     22 1 740 return $_[0]->{reverse};
253             }
254              
255             =head2 name()
256              
257             Returns the name of the chosen transliteration table, e.g. "ISO 9".
258              
259             =cut
260              
261             sub name {
262 12     12 1 5326 return $_[0]->{name};
263             }
264              
265             =head2 desc()
266              
267             Returns a description for the transliteration,
268             e.g. "ISO 9:1995, Cyrillic to Latin".
269              
270             =cut
271              
272             sub desc {
273 12     12 1 46 return $_[0]->{desc};
274             }
275              
276             =head1 SUPPORTED TRANSLITERATIONS
277              
278             =over 4
279              
280             =item Cyrillic
281              
282             I, not reversible, ALA-LC:1997, Cyrillic to Latin, Russian
283              
284             I, reversible, ISO 9:1995, Cyrillic to Latin
285              
286             I, reversible, ISO 9:1954, Cyrillic to Latin
287              
288             I, reversible, DIN 1460:1982, Cyrillic to Latin, Russian
289              
290             I, reversible, DIN 1460:1982, Cyrillic to Latin, Ukrainian
291              
292             I, reversible, DIN 1460:1982, Cyrillic to Latin, Bulgarian
293              
294             I, not reversible, The Streamlined System: 2006,
295             Cyrillic to Latin, Bulgarian
296              
297             I, reversible, GOST 7.79:2000 (table B), Cyrillic to Latin,
298             Russian
299              
300             I, not reversible, GOST 7.79:2000 (table B), Cyrillic to
301             Latin with support for Old Russian (pre 1918), Russian
302              
303             I, reversible, GOST 7.79:2000 (table B), Cyrillic to Latin,
304             Ukrainian
305              
306             I, not reversible, BGN/PCGN:1947 (Standard Variant),
307             Cyrillic to Latin, Russian
308              
309             I, not reversible, BGN/PCGN:1947 (Strict Variant),
310             Cyrillic to Latin, Russian
311              
312             =item Greek
313              
314             I, not reversible, ISO 843:1997, Greek to Latin
315              
316             I, not reversible, DIN 31634:1982, Greek to Latin
317              
318             I, not reversible, Greeklish (Phonetic), Greek to Latin
319              
320             =item Latin
321              
322             I, not reversible, Czech without diacritics
323              
324             I, not reversible, German without umlauts
325              
326             I, not reversible, Unaccented Polish
327              
328             I, not reversible, Romanian without diacritics as commonly used
329              
330             I, not reversible, Slovak without diacritics
331              
332             I, not reversible, Slovenian without diacritics
333              
334             I, reversible, Romanian with appropriate diacritics
335              
336             =item Arabic
337              
338             I, not reversible, Common Romanization of Arabic
339              
340             =item Sanskrit
341              
342             I, not reversible, IAST Romanization to Devanāgarī
343              
344             I, not reversible, Devanāgarī to IAST Romanization
345              
346             =back
347              
348             =head1 ADDING NEW TRANSLITERATIONS
349              
350             In case you want to add your own transliteration tables to
351             L, have a look at the developer documentation at
352             L.
353              
354             A template of a transliteration table is provided as well
355             (F) so you can easily start developing.
356              
357              
358             =head1 RESTRICTIONS
359              
360             L is suited to handle B and utilizes comparisons
361             and regular expressions that rely on B.
362             Therefore, any input is supposed to be B
363             (C, ...) instead of byte oriented.
364              
365             However, if your data is byte oriented, be sure to pass it
366             B to translit() and/or translit_reverse() - it will be
367             converted internally.
368              
369             =head1 BUGS
370              
371             None known.
372              
373             Please report bugs using CPAN's request tracker at
374             L.
375              
376             =head1 SEE ALSO
377              
378             L, L, L
379              
380             C's manpage
381              
382             L
383              
384             =head1 CREDITS
385              
386             Thanks to Dr. Daniel Eiwen, Romanisches Seminar, Universitaet Koeln for his
387             help on Romanian transliteration.
388              
389             Thanks to Dmitry Smal and Rusar Publishing for contributing the "ALA-LC RUS"
390             transliteration table.
391              
392             Thanks to Ahmed Elsheshtawy for his help implementing the "Common ARA" Arabic
393             transliteration.
394              
395             Thanks to Dusan Vuckovic for contributing the "ISO/R 9" transliteration table.
396              
397             Thanks to Ștefan Suciu for contributing the "ISO 8859-16 RON" transliteration
398             table.
399              
400             Thanks to Philip Kime for contributing the "IAST Devanagari" and "Devanagari
401             IAST" transliteration tables.
402              
403             Thanks to Nikola Lečić for contributing the "BGN/PCGN RUS Standard" and
404             "BGN/PCGN RUS Strict" transliteration tables.
405              
406             =head1 AUTHORS
407              
408             Alex Linke
409              
410             Rona Linke
411              
412             =head1 LICENSE AND COPYRIGHT
413              
414             Copyright (C) 2007-2008 Alex Linke and Rona Linke
415              
416             Copyright (C) 2009-2016 Lingua-Systems Software GmbH
417              
418             Copyright (C) 2016-2017 Netzum Sorglos, Lingua-Systems Software GmbH
419              
420             Copyright (C) 2017 Netzum Sorglos Software GmbH
421              
422             This module is free software; you can redistribute it and/or modify it under
423             the same terms as Perl itself.
424              
425             =cut
426              
427             1;
428              
429             # vim: set ft=perl sts=4 sw=4 ts=4 ai et: