File Coverage

blib/lib/Text/Amuse/Preprocessor/TypographyFilters.pm
Criterion Covered Total %
statement 93 94 98.9
branch 22 36 61.1
condition 6 6 100.0
subroutine 15 16 93.7
pod 6 6 100.0
total 142 158 89.8


line stmt bran cond sub pod time code
1             package Text::Amuse::Preprocessor::TypographyFilters;
2              
3 15     15   111935 use strict;
  15         41  
  15         368  
4 15     15   62 use warnings;
  15         24  
  15         294  
5 15     15   544 use utf8;
  15         40  
  15         138  
6             # use Encode;
7              
8             =encoding utf8
9              
10             =head1 NAME
11              
12             Text::Amuse::Preprocessor::TypographyFilters - Text::Amuse::Preprocessor's filters
13              
14             =head1 DESCRIPTION
15              
16             Used internally by L.
17              
18             =head1 FUNCTIONS
19              
20             =head2 linkify($string);
21              
22             Activate links in $string and returns it.
23              
24             =cut
25              
26             sub linkify {
27 2072     2072 1 14676 my $l = shift;
28 2072 50       3374 return unless defined $l;
29 2072         3726 $l =~ s{(.?) # be sure not to redo the same thing, looking behind #1
30             ((?:https?:\/\/) # protocol
31             (\w[\w\-\.]+\.\w+) # domain
32             (?:\:\d+)? # the port
33             (?:
34             / # a slash
35             [^\[\]<>\s]* # everything that is not a space or < > [ ]
36             [\w/] # but end with a letter or a slash
37             |
38             / # or a slash alone
39             |
40             \?[^\[\]<>\s]*[\w/] # or the query alone
41             |
42             )?
43             )
44             (.?)
45 177         370 }{_linkify_link($1, $2, $3, $4)}gmxe;
46 2072         3894 return $l;
47             }
48              
49             sub _linkify_link {
50 177     177   609 my ($prefix, $url, $domain, $suffix) = @_;
51             # print "=$prefix, $url, $domain, $suffix=\n";
52 177 100 100     662 if ($prefix and $prefix eq '[') {
    100 100        
53             # already processed
54 40         189 return $prefix . $url . $suffix
55             }
56             elsif ($suffix and $suffix eq ']') {
57 2         7 return $prefix . $url . $suffix
58             }
59             else {
60 135         708 return $prefix . "[[$url][$domain]]" . $suffix;
61             }
62             }
63              
64             =head2 characters
65              
66             Return an hashref where keys are the language codes, and the values an
67             hashref with the definition of punctuation characters. Each of them
68             has the following keys: C, C, C,
69             C, C, C, C.
70              
71             C refers to a dash between digits.
72              
73             =cut
74              
75             # EM-DASH: 2014
76             # EN-DASH: 2013
77              
78             sub characters {
79             return {
80 57     57 1 12686 en => {
81             ldouble => "\x{201c}",
82             rdouble => "\x{201d}",
83             lsingle => "\x{2018}",
84             rsingle => "\x{2019}",
85             apos => "\x{2019}",
86             emdash => "\x{2014}",
87             endash => "\x{2013}",
88             dash => "\x{2014}",
89             },
90             # esperanto same as english, for the moment
91             eo => {
92             ldouble => "\x{201c}",
93             rdouble => "\x{201d}",
94             lsingle => "\x{2018}",
95             rsingle => "\x{2019}",
96             apos => "\x{2019}",
97             emdash => "\x{2014}",
98             endash => "\x{2013}",
99             dash => "\x{2014}",
100             },
101             # „…”
102             pl => {
103             ldouble => "\x{201e}",
104             rdouble => "\x{201d}",
105             lsingle => "\x{2018}",
106             rsingle => "\x{2019}",
107             apos => "\x{2019}",
108             emdash => "\x{2014}",
109             endash => "\x{2013}",
110             dash => "\x{2014}",
111             },
112             sq => {
113             ldouble => "\x{201c}",
114             rdouble => "\x{201d}",
115             lsingle => "\x{2018}",
116             rsingle => "\x{2019}",
117             apos => "\x{2019}",
118             emdash => "\x{2014}",
119             endash => "\x{2013}",
120             dash => "\x{2014}",
121             },
122             pt => {
123             ldouble => "\x{201c}",
124             rdouble => "\x{201d}",
125             lsingle => "\x{2018}",
126             rsingle => "\x{2019}",
127             apos => "\x{2019}",
128             emdash => "\x{2013}",
129             endash => "\x{2013}",
130             dash => "\x{2014}",
131             },
132             es => {
133             ldouble => "\x{ab}",
134             rdouble => "\x{bb}",
135             lsingle => "\x{2018}",
136             rsingle => "\x{2019}",
137             apos => "\x{2019}",
138             emdash => "\x{2014}",
139             endash => "-",
140             dash => "\x{2014}",
141             },
142             fr => {
143             ldouble => "\x{ab} ",
144             rdouble => " \x{bb}",
145             lsingle => "\x{2018}",
146             rsingle => "\x{2019}",
147             apos => "\x{2019}",
148             emdash => "\x{2014}",
149             endash => "\x{2013}",
150             dash => "\x{2014}",
151             },
152              
153             # according to http://en.wikipedia.org/wiki/International_variation_in_quotation_marks#Finnish_and_Swedish
154             sv => {
155             ldouble => "\x{201d}",
156             rdouble => "\x{201d}",
157             lsingle => "\x{2019}",
158             rsingle => "\x{2019}",
159             apos => "\x{2019}",
160             # finnish uses short dash
161             emdash => "\x{2013}",
162             endash => "-",
163             dash => "\x{2013}",
164             },
165             fi => {
166             ldouble => "\x{201d}",
167             rdouble => "\x{201d}",
168             lsingle => "\x{2019}",
169             rsingle => "\x{2019}",
170             apos => "\x{2019}",
171             # finnish uses short dash
172             emdash => "\x{2013}",
173             endash => "-",
174             dash => "\x{2013}",
175             },
176             id => {
177             ldouble => "\x{201c}",
178             rdouble => "\x{201d}",
179             lsingle => "\x{2018}",
180             rsingle => "\x{2019}",
181             apos => "\x{2019}",
182             # https://id.wikipedia.org/wiki/Tanda_pisah
183             emdash => "\x{2013}", # en dash –
184             endash => "\x{2013}", # en dash –
185             dash => "\x{2013}",
186             },
187             nl => {
188             ldouble => "\x{201c}",
189             rdouble => "\x{201d}",
190             lsingle => "\x{2018}",
191             rsingle => "\x{2019}",
192             apos => "\x{2019}",
193             emdash => "\x{2013}", # en dash –
194             endash => "-", # between numbers, use the hyphen
195             dash => "\x{2013}",
196             },
197             # like serbian
198             bg => {
199             # „članak o ’svicima’“
200             ldouble => "\x{201e}",
201             rdouble => "\x{201c}",
202             lsingle => "\x{2019}",
203             rsingle => "\x{2019}",
204             apos => "\x{2019}",
205             # serbian uses short dash.
206             emdash => "\x{2013}",
207             endash => "\x{2013}",
208             dash => "\x{2014}",
209             },
210             sr => {
211             # „članak o ’svicima’“
212             ldouble => "\x{201e}",
213             rdouble => "\x{201c}",
214             lsingle => "\x{2019}",
215             rsingle => "\x{2019}",
216             apos => "\x{2019}",
217             # serbian uses short dash.
218             emdash => "\x{2013}",
219             endash => "\x{2013}",
220             dash => "\x{2014}",
221             },
222             hr => {
223             # http://pravopis.hr/pravilo/navodnici/71/ „...” i »...«.
224             ldouble => "\x{201e}",
225             rdouble => "\x{201d}",
226             # http://pravopis.hr/pravilo/polunavodnici/73/ ‘...’
227             lsingle => "\x{2018}",
228             rsingle => "\x{2019}",
229             apos => "\x{2019}",
230             # croatian uses short dash:
231             # http://pravopis.hr/pravilo/crtica/69/
232             emdash => "\x{2013}",
233             endash => "\x{2013}",
234             dash => "\x{2014}",
235             },
236             ru => {
237             ldouble => "\x{ab}",
238             rdouble => "\x{bb}",
239             lsingle => "\x{201e}",
240             rsingle => "\x{201c}",
241             apos => "\x{2019}",
242             emdash => "\x{2014}",
243             endash => "-",
244             dash => "\x{2014}",
245             },
246             it => {
247             ldouble => "\x{201c}",
248             rdouble => "\x{201d}",
249             lsingle => "\x{2018}",
250             rsingle => "\x{2019}",
251             apos => "\x{2019}",
252             emdash => "\x{2013}",
253             endash => "-",
254             dash => "\x{2014}",
255             },
256             # Macedonian „…“ ’…‘
257             mk => {
258             ldouble => "\x{201e}",
259             rdouble => "\x{201c}",
260             lsingle => "\x{2019}",
261             rsingle => "\x{2018}",
262             apos => "\x{2019}",
263             emdash => "\x{2013}",
264             endash => "\x{2013}",
265             dash => "\x{2014}",
266             },
267             # http://de.wikipedia.org/wiki/Halbgeviertstrich
268             # http://en.wikipedia.org/wiki/International_variation_in_quotation_marks#German_.28Germany_and_Austria.29
269             de => {
270             ldouble => "\x{201e}",
271             rdouble => "\x{201c}",
272             lsingle => "\x{201a}",
273             rsingle => "\x{2018}",
274             apos => "\x{2019}",
275             emdash => "\x{2013}",
276             endash => "\x{2013}",
277             dash => "\x{2013}",
278             },
279             # »Outer quotation ’inner’ hyphen-for-words – and a dash«
280             # (en-dash between spaces is correct)
281             da => {
282             ldouble => "\x{bb}",
283             rdouble => "\x{ab}",
284             lsingle => "\x{2019}",
285             rsingle => "\x{2019}",
286             apos => "\x{2019}",
287             emdash => "\x{2013}",
288             endash => "\x{2013}",
289             dash => "\x{2013}",
290             },
291             };
292             }
293              
294              
295             =head2 specific_filters
296              
297             Return an hashref where the key is the language codes and the value a
298             subroutine to filter the line.
299              
300             Here we put the routines which can't be abstracted away in a
301             language-indipendent fashion.
302              
303             =cut
304              
305             sub _english_specific {
306 226     226   303 my $l = shift;
307 226         734 $l =~ s!\b(\d+)(th|rd|st|nd)\b!$1$2!g;
308 226         408 return $l;
309             }
310              
311             sub _chinese_specific {
312 2     2   837 my $l = shift;
313             # regexps goes here
314             # $l =~ s/in/out/g;
315              
316             # and return the string
317 2         7 return $l;
318             }
319              
320             sub specific_filters {
321             return {
322 56     56 1 244 en => \&_english_specific,
323             zh => \&_chinese_specific,
324             };
325             }
326              
327             =head2 specific_filter($lang)
328              
329             Return the specific filter for lang, if present.
330              
331             =cut
332              
333             sub specific_filter {
334 56     56 1 177 my ($lang) = @_;
335 56 50       109 return unless $lang;
336 56         98 return specific_filters()->{$lang};
337             }
338              
339             =head2 filter($lang)
340              
341             Return a sub for the typographical fixes for the language $lang.
342              
343             =cut
344              
345              
346             sub filter {
347 56     56 1 1077 my ($lang) = @_;
348 56 50       133 return unless $lang;
349 56         117 my $all = characters();
350 56         133 my $chars = $all->{$lang};
351 56 50       119 return unless $chars;
352              
353             # copy to avoid typos
354 56 50       127 my $ldouble = $chars->{ldouble} or die;
355 56 50       138 my $rdouble = $chars->{rdouble} or die;
356 56 50       134 my $lsingle = $chars->{lsingle} or die;
357 56 50       116 my $rsingle = $chars->{rsingle} or die;
358 56 50       105 my $apos = $chars->{apos} or die;
359 56 50       106 my $emdash = $chars->{emdash} or die;
360 56 50       133 my $endash = $chars->{endash} or die;
361 56 50       103 my $dash = $chars->{dash} or die;
362             my $filter = sub {
363 2149     2149   2899 my $l = shift;
364              
365             # if there is nothing to do, speed up.
366 2149 100       5478 return $l unless $l =~ /['"`-]/;
367              
368             # first, consider `` and '' opening and closing doubles
369 1266         2158 $l =~ s/``/$ldouble/g;
370              
371 1266         1892 $l =~ s/`/$lsingle/g;
372              
373             # but set it as ", we'll replace that later
374 1266         1924 $l =~ s/''/"/g;
375              
376             # beginning of the line, long dash
377 1266         1942 $l =~ s/^-(?=\s)/$dash/gm;
378              
379             # between spaces, just replace
380 1266         2623 $l =~ s/(?<=\S)(\x{20}+)-{1,3}(\x{20}+)(?=\S)/$1$emdash$2/g;
381              
382             # end of line with
383 1266         2524 $l =~ s/(?<=\S) +-{1,3}$/ $emdash/gm;
384              
385             # -word and word-, in the middle of a line
386 1266         2226 $l =~ s/(?<=\S)(\x{20}+)-(\w.+?\w)-(?=\x{20})/$1$emdash $2 $emdash/g;
387              
388             # an opening before two digits *probably* is an apostrophe.
389             # Very common case.
390 1266         2412 $l =~ s/'(?=\d\d\b)/$apos/g;
391              
392             # if it touches a word on the right, and on the left there is not a
393             # word, it's an opening quote
394 1266         3032 $l =~ s/(?<=\W)"(?=\w)/$ldouble/g;
395 1266         2936 $l =~ s/(?<=\W)'(?=\w)/$lsingle/g;
396              
397             # beginning of line, opening
398 1266         2242 $l =~ s/^"/$ldouble/gm;
399 1266         1950 $l =~ s/^'/$lsingle/gm;
400              
401             # end of line/chunk, closing
402 1266         3507 $l =~ s/"( *)$/$rdouble$1/gm;
403 1266         2657 $l =~ s/'( *)$/$rsingle$1/gm;
404              
405             # if there is a space at the left, it's opening
406 1266         2270 $l =~ s/(?<=\s)"/$ldouble/g;
407 1266         2203 $l =~ s/(?<=\s)'/$lsingle/g;
408              
409             # print encode('UTF-8', "**** $l");
410              
411             # apostrophes, between non-white material, probably
412 1266         2339 $l =~ s/(?<=\w)'(?=\w)/$apos/g;
413              
414             # print encode('UTF-8', "**** $l");
415              
416             # or before a left quote
417 1266         3839 $l =~ s/(?<=\w)'(\Q$lsingle\E)/$apos$1/g;
418 1266         3113 $l =~ s/(?<=\w)'(\Q$ldouble\E)/$apos$1/g;
419              
420             # print encode('UTF-8', "**** $l");
421              
422             # word at the left, closing
423 1266         2862 $l =~ s/(?<=\w)"(?=\W)/$rdouble/g;
424 1266         2550 $l =~ s/(?<=\w)'(?=\W)/$rsingle/g;
425              
426              
427             # the others are right quotes, hopefully
428 1266         2177 $l =~ s/"/$rdouble/gs;
429 1266         1882 $l =~ s/'/$rsingle/g;
430              
431             # replace with an endash, but only if between digits and not
432             # in the middle of something
433 1266         2685 $l =~ s/(?
434              
435 1266         2956 return $l;
436 56         345 };
437 56         640 return $filter;
438             }
439              
440             sub _nbsp_filters {
441 0     0   0 return { ru => \&_ru_nbsp_filter };
442             }
443              
444             sub _nbsp_specs {
445             return {
446             # to read: add a space ...
447 15     15   232 ru => {
448             before_words => [
449             "\x{2013}", "\x{2014}", "\x{2212}",
450             "б", "ж", "ли", "же", "ль", "бы", "бы,", "же",
451             ],
452             after_digit_before_words => [
453             "января", "февраля",
454             "марта", "апреля",
455             "мая", "июня", "июля",
456             "августа", "сентября",
457             "октября", "ноября",
458             "декабря", "г", "кг",
459             "мм", "дм", "см", "м",
460             "км", "л", "В", "А",
461             "ВТ", "W", "°C",
462             ],
463             after_words => [
464             "в", "к", "о", "с", "у",
465             "В", "К", "О", "С", "У",
466             "на", "от", "об", "из", "за", "по", "до", "во",
467             "та", "ту", "то", "те", "ко", "со",
468             "На", "От", "Об", "Из", "За", "По", "До", "Во",
469             "Ко", "Та", "Ту", "То", "Те", "Со",
470             "А", "А,", "а", "а,",
471             "И", "И,", "и", "и,",
472             "но", "но,", "Но", "Но,",
473             "да", "да,", "Да", "Да,",
474             "не", "ни", "Не", "Ни",
475             "ну", "ну,", "Ну", "Ну,",
476             "с.", "ч.", "см.", "См.",
477             "им.", "Им.","т.", "п."
478             ]
479             },
480             };
481             }
482              
483             =head2 nbsp_filter($lang)
484              
485             Return a sub (if the filter exists) to place non-breaking spaces in
486             language-specific places.
487              
488             =cut
489              
490             sub nbsp_filter {
491 15     15 1 36 my ($lang) = @_;
492 15 50       31 return unless $lang;
493 15         33 my $specs = _nbsp_specs()->{$lang};
494 15 100       73 return unless $specs;
495 5         8 my @patterns;
496 5         8 foreach my $token (@{ $specs->{before_words} }) {
  5         15  
497 55         948 push @patterns, [
498             qr/(?<=\S)
499             \s+
500             \Q$token\E
501             (?=\W|$)/xm,
502             "\x{a0}$token"
503             ];
504             }
505 5         13 foreach my $token (@{ $specs->{after_digit_before_words} }) {
  5         14  
506 125         1915 push @patterns, [
507             qr/(?<=\d)
508             \s+
509             \Q$token\E
510             (?=\W|$)
511             /xm,
512             "\x{a0}$token"
513             ];
514             }
515 5         14 foreach my $token (@{ $specs->{after_words} }) {
  5         15  
516 350         3810 push @patterns, [
517             qr/\b
518             \Q$token\E
519             \s+
520             (?=\S|$)
521             /xm,
522             "$token\x{a0}"
523             ];
524             }
525 5 50       18 return unless (@patterns);
526             return sub {
527 173     173   242 my $l = shift;
528 173         266 foreach my $pattern (@patterns) {
529 18338         26732 my ($from, $to) = @$pattern;
530 18338         38345 $l =~ s/$from/$to/g;
531             }
532 173         443 return $l;
533 5         49 };
534             }
535              
536              
537              
538             1;