line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
# -*- cperl; cperl-indent-level: 4 -*- |
2
|
|
|
|
|
|
|
# Copyright (C) 2009-2021, Roland van Ipenburg |
3
|
|
|
|
|
|
|
package HTML::Hyphenate v1.1.10; |
4
|
12
|
|
|
12
|
|
1033273
|
use Moose; |
|
12
|
|
|
|
|
4991722
|
|
|
12
|
|
|
|
|
77
|
|
5
|
12
|
|
|
12
|
|
78753
|
use utf8; |
|
12
|
|
|
|
|
68
|
|
|
12
|
|
|
|
|
85
|
|
6
|
12
|
|
|
12
|
|
586
|
use 5.016000; |
|
12
|
|
|
|
|
47
|
|
7
|
|
|
|
|
|
|
|
8
|
12
|
|
|
12
|
|
9295
|
use charnames (); |
|
12
|
|
|
|
|
335244
|
|
|
12
|
|
|
|
|
326
|
|
9
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
#use Log::Log4perl qw(:resurrect :easy get_logger); |
11
|
12
|
|
|
12
|
|
6059
|
use I18N::LangTags; |
|
12
|
|
|
|
|
25927
|
|
|
12
|
|
|
|
|
721
|
|
12
|
12
|
|
|
12
|
|
5499
|
use Set::Scalar; |
|
12
|
|
|
|
|
114606
|
|
|
12
|
|
|
|
|
529
|
|
13
|
12
|
|
|
12
|
|
7455
|
use TeX::Hyphen; |
|
12
|
|
|
|
|
17485
|
|
|
12
|
|
|
|
|
419
|
|
14
|
12
|
|
|
12
|
|
5738
|
use TeX::Hyphen::Pattern v1.1.8; |
|
12
|
|
|
|
|
1342326
|
|
|
12
|
|
|
|
|
512
|
|
15
|
12
|
|
|
12
|
|
6204
|
use HTML::Hyphenate::DOM; |
|
12
|
|
|
|
|
68
|
|
|
12
|
|
|
|
|
382
|
|
16
|
|
|
|
|
|
|
|
17
|
12
|
|
|
12
|
|
129
|
use Readonly; |
|
12
|
|
|
|
|
21
|
|
|
12
|
|
|
|
|
25918
|
|
18
|
|
|
|
|
|
|
## no critic qw(ProhibitCallsToUnexportedSubs) |
19
|
|
|
|
|
|
|
Readonly::Scalar my $EMPTY => q{}; |
20
|
|
|
|
|
|
|
Readonly::Scalar my $DOT => q{.}; |
21
|
|
|
|
|
|
|
Readonly::Scalar my $SOFT_HYPHEN => charnames::string_vianame(q{SOFT HYPHEN}); |
22
|
|
|
|
|
|
|
Readonly::Scalar my $CLASS_JOINER => q{, .}; # for CSS classnames |
23
|
|
|
|
|
|
|
Readonly::Scalar my $ONE_LEVEL_UP => -1; |
24
|
|
|
|
|
|
|
Readonly::Scalar my $DOCTYPE => q{<!DOCTYPE html>}; |
25
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
Readonly::Hash my %DEFAULT => ( |
27
|
|
|
|
|
|
|
'MIN_LENGTH' => 10, |
28
|
|
|
|
|
|
|
'MIN_PRE' => 2, |
29
|
|
|
|
|
|
|
'MIN_POST' => 2, |
30
|
|
|
|
|
|
|
'LANG' => q{en_us}, |
31
|
|
|
|
|
|
|
'INCLUDED' => 1, |
32
|
|
|
|
|
|
|
); |
33
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
# HTML %Text attributes <http://www.w3.org/TR/REC-html40/index/attributes.html> |
35
|
|
|
|
|
|
|
# HTML5 text attributes <https://developer.mozilla.org/en-US/docs/Web/HTML/Attributes> |
36
|
|
|
|
|
|
|
my $text_attr = |
37
|
|
|
|
|
|
|
Set::Scalar->new(qw/abbr alt label list placeholder standby summary title/); |
38
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
## no critic qw(ProhibitCallsToUnexportedSubs) |
40
|
|
|
|
|
|
|
Readonly::Hash my %LOG => ( |
41
|
|
|
|
|
|
|
'TRAVERSE' => q{Traversing HTML element '%s'}, |
42
|
|
|
|
|
|
|
'LANGUAGE_SET' => q{Language changed to '%s'}, |
43
|
|
|
|
|
|
|
'PATTERN_FILE' => q{Using pattern file '%s'}, |
44
|
|
|
|
|
|
|
'TEXT_NODE' => q{Text node value '%s'}, |
45
|
|
|
|
|
|
|
'HYPHEN_TEXT' => q{Hyphenating text '%s'}, |
46
|
|
|
|
|
|
|
'HYPHEN_WORD' => q{Hyphenating word '%s' to '%s'}, |
47
|
|
|
|
|
|
|
'LOOKING_UP' => q{Looking up for %d class(es)}, |
48
|
|
|
|
|
|
|
'HTML_METHOD' => q{Using HTML passed to method '%s'}, |
49
|
|
|
|
|
|
|
'HTML_PROPERTY' => q{Using HTML property '%s'}, |
50
|
|
|
|
|
|
|
'NOT_HYPHEN' => q{No pattern found for '%s'}, |
51
|
|
|
|
|
|
|
'TRY_SIMILAR' => q{Searching a pattern similar to the unmatched '%s'}, |
52
|
|
|
|
|
|
|
'SIMILARITY' => q{Similarity of candidate '%s' is %i}, |
53
|
|
|
|
|
|
|
'REGISTER' => q{Registering TeX::Hyphen object for label '%s'}, |
54
|
|
|
|
|
|
|
'NO_CLASSES' => q{No classes defined, so not check for them}, |
55
|
|
|
|
|
|
|
); |
56
|
|
|
|
|
|
|
## use critic |
57
|
|
|
|
|
|
|
|
58
|
|
|
|
|
|
|
## no critic qw(ProhibitCommentedOutCode) |
59
|
|
|
|
|
|
|
###l4p Log::Log4perl->easy_init( { 'level' => $DEBUG, 'utf8' => 1 } ); |
60
|
|
|
|
|
|
|
###l4p my $log = get_logger(); |
61
|
|
|
|
|
|
|
## use critic |
62
|
|
|
|
|
|
|
|
63
|
|
|
|
|
|
|
## no critic qw(ProhibitHashBarewords ProhibitCallsToUnexportedSubs ProhibitCallsToUndeclaredSubs) |
64
|
|
|
|
|
|
|
has html => ( is => 'rw', isa => 'Str' ); |
65
|
|
|
|
|
|
|
after 'html' => sub { |
66
|
|
|
|
|
|
|
my ( $self, $html ) = @_; |
67
|
|
|
|
|
|
|
if ( defined $html ) { |
68
|
|
|
|
|
|
|
## no critic qw(ProhibitUnusedCapture) |
69
|
|
|
|
|
|
|
if ( $self->html =~ m{^(?<doctype>\s*\Q$DOCTYPE\E)(?<html>.*)}gismx ) { |
70
|
|
|
|
|
|
|
## use critic |
71
|
|
|
|
|
|
|
$self->html( ${+}{html} ); |
72
|
|
|
|
|
|
|
$self->_doctype( ${+}{doctype} ); |
73
|
|
|
|
|
|
|
} |
74
|
|
|
|
|
|
|
else { |
75
|
|
|
|
|
|
|
$self->_doctype($EMPTY); |
76
|
|
|
|
|
|
|
} |
77
|
|
|
|
|
|
|
} |
78
|
|
|
|
|
|
|
}; |
79
|
|
|
|
|
|
|
has style => ( is => 'rw', isa => 'Str' ); |
80
|
|
|
|
|
|
|
has min_length => |
81
|
|
|
|
|
|
|
( is => 'rw', isa => 'Int', default => $DEFAULT{'MIN_LENGTH'} ); |
82
|
|
|
|
|
|
|
has min_pre => ( is => 'rw', isa => 'Int', default => $DEFAULT{'MIN_PRE'} ); |
83
|
|
|
|
|
|
|
has min_post => ( is => 'rw', isa => 'Int', default => $DEFAULT{'MIN_POST'} ); |
84
|
|
|
|
|
|
|
has default_lang => ( is => 'rw', isa => 'Str', default => $DEFAULT{'LANG'} ); |
85
|
|
|
|
|
|
|
has default_included => |
86
|
|
|
|
|
|
|
( is => 'rw', isa => 'Int', default => $DEFAULT{'INCLUDED'} ); |
87
|
|
|
|
|
|
|
has classes_included => |
88
|
|
|
|
|
|
|
( is => 'rw', isa => 'ArrayRef', default => sub { [] } ); |
89
|
|
|
|
|
|
|
has classes_excluded => |
90
|
|
|
|
|
|
|
( is => 'rw', isa => 'ArrayRef', default => sub { [] } ); |
91
|
|
|
|
|
|
|
after 'classes_included' => sub { |
92
|
|
|
|
|
|
|
my ( $self, $ar ) = @_; |
93
|
|
|
|
|
|
|
if ( defined $ar ) { |
94
|
|
|
|
|
|
|
$self->_classes( |
95
|
|
|
|
|
|
|
( scalar $self->classes_excluded + scalar $self->classes_included ) |
96
|
|
|
|
|
|
|
> 0 ); |
97
|
|
|
|
|
|
|
} |
98
|
|
|
|
|
|
|
|
99
|
|
|
|
|
|
|
}; |
100
|
|
|
|
|
|
|
after 'classes_excluded' => sub { |
101
|
|
|
|
|
|
|
my ( $self, $ar ) = @_; |
102
|
|
|
|
|
|
|
if ( defined $ar ) { |
103
|
|
|
|
|
|
|
$self->_classes( |
104
|
|
|
|
|
|
|
( scalar $self->classes_excluded + scalar $self->classes_included ) |
105
|
|
|
|
|
|
|
> 0 ); |
106
|
|
|
|
|
|
|
} |
107
|
|
|
|
|
|
|
}; |
108
|
|
|
|
|
|
|
|
109
|
|
|
|
|
|
|
has _hyphenators => ( is => 'rw', isa => 'HashRef', default => sub { {} } ); |
110
|
|
|
|
|
|
|
has _lang => ( is => 'rw', isa => 'Str' ); |
111
|
|
|
|
|
|
|
has _doctype => ( is => 'rw', isa => 'Str' ); |
112
|
|
|
|
|
|
|
has _dom => ( is => 'rw', isa => 'HTML::Hyphenate::DOM' ); |
113
|
|
|
|
|
|
|
has _scope_is_root => ( is => 'rw', isa => 'Bool', default => sub { 0 } ); |
114
|
|
|
|
|
|
|
has _classes => ( is => 'rw', isa => 'Bool', default => sub { 0 } ); |
115
|
|
|
|
|
|
|
## use critic |
116
|
|
|
|
|
|
|
|
117
|
|
|
|
|
|
|
## no critic qw(ProhibitCallsToUnexportedSubs) |
118
|
|
|
|
|
|
|
Readonly::Scalar my $LANG => q{lang}; |
119
|
|
|
|
|
|
|
Readonly::Scalar my $HTML => q{html}; |
120
|
|
|
|
|
|
|
Readonly::Scalar my $TEXT => q{text}; |
121
|
|
|
|
|
|
|
Readonly::Scalar my $TAG => q{tag}; |
122
|
|
|
|
|
|
|
Readonly::Scalar my $RAW => q{raw}; |
123
|
|
|
|
|
|
|
Readonly::Scalar my $PRE => q{pre}; |
124
|
|
|
|
|
|
|
Readonly::Scalar my $CLASS => q{class}; |
125
|
|
|
|
|
|
|
## no critic qw(RequireDotMatchAnything RequireExtendedFormatting RequireLineBoundaryMatching) |
126
|
|
|
|
|
|
|
Readonly::Scalar my $NONSPACE => qr{\S+}; |
127
|
|
|
|
|
|
|
## use critic |
128
|
|
|
|
|
|
|
|
129
|
|
|
|
|
|
|
sub hyphenated { |
130
|
135
|
|
|
135
|
1
|
101312
|
my ( $self, $html ) = @_; |
131
|
135
|
100
|
|
|
|
438
|
if ( defined $html ) { |
132
|
|
|
|
|
|
|
|
133
|
|
|
|
|
|
|
###l4p $log->debug( sprintf $LOG{'HTML_METHOD'}, $html ); |
134
|
134
|
|
|
|
|
659
|
$self->html($html); |
135
|
|
|
|
|
|
|
} |
136
|
|
|
|
|
|
|
else { |
137
|
|
|
|
|
|
|
###l4p $log->debug( sprintf $LOG{'HTML_PROPERTY'}, $self->html ); |
138
|
|
|
|
|
|
|
} |
139
|
135
|
|
|
|
|
843
|
$self->_reset_dom; |
140
|
135
|
|
|
|
|
3324
|
$self->_dom->parse( $self->html ); |
141
|
135
|
|
|
|
|
38147
|
$self->_traverse_dom( $self->_dom->root ); |
142
|
135
|
|
|
|
|
545
|
return $self->_clean_html(); |
143
|
|
|
|
|
|
|
} |
144
|
|
|
|
|
|
|
|
145
|
|
|
|
|
|
|
sub register_tex_hyphen { |
146
|
5
|
|
|
5
|
1
|
323625
|
my ( $self, $label, $tex ) = @_; |
147
|
5
|
100
|
100
|
|
|
60
|
if ( |
|
|
|
100
|
|
|
|
|
148
|
|
|
|
|
|
|
defined $label |
149
|
|
|
|
|
|
|
## no critic qw(ProhibitCallsToUndeclaredSubs) |
150
|
|
|
|
|
|
|
&& blessed $tex |
151
|
|
|
|
|
|
|
## use critic |
152
|
|
|
|
|
|
|
&& $tex->isa('TeX::Hyphen') |
153
|
|
|
|
|
|
|
) |
154
|
|
|
|
|
|
|
{ |
155
|
2
|
|
|
|
|
92
|
my $cache = $self->_hyphenators; |
156
|
|
|
|
|
|
|
|
157
|
|
|
|
|
|
|
###l4p $log->debug( sprintf $LOG{'REGISTER'}, $label ); |
158
|
2
|
|
|
|
|
5
|
${$cache}{$label} = $tex; |
|
2
|
|
|
|
|
6
|
|
159
|
2
|
|
|
|
|
48
|
$self->_hyphenators($cache); |
160
|
|
|
|
|
|
|
} |
161
|
5
|
|
|
|
|
12
|
return; |
162
|
|
|
|
|
|
|
} |
163
|
|
|
|
|
|
|
|
164
|
|
|
|
|
|
|
sub _traverse_dom { |
165
|
606
|
|
|
606
|
|
4540
|
my ( $self, $node ) = @_; |
166
|
606
|
100
|
|
|
|
1587
|
if ( $self->_hyphenable($node) ) { |
167
|
549
|
|
|
|
|
1458
|
my $type = $node->type; |
168
|
549
|
100
|
100
|
|
|
6345
|
if ( $TAG eq $type ) { |
|
|
100
|
|
|
|
|
|
169
|
|
|
|
|
|
|
|
170
|
|
|
|
|
|
|
###l4p $log->debug( sprintf $LOG{'TRAVERSE'}, $node->tag ); |
171
|
191
|
|
|
|
|
683
|
$self->_configure_lang($node); |
172
|
191
|
|
|
|
|
378
|
while ( my ( $k, $v ) = each %{ $node->attr } ) { |
|
333
|
|
|
|
|
3832
|
|
173
|
142
|
100
|
100
|
|
|
4643
|
if ( $text_attr->has($k) |
174
|
|
|
|
|
|
|
&& length $v >= $self->min_length ) |
175
|
|
|
|
|
|
|
{ |
176
|
10
|
|
|
|
|
26
|
$node->attr( $k, $self->_hyphen($v) ); |
177
|
|
|
|
|
|
|
} |
178
|
|
|
|
|
|
|
} |
179
|
|
|
|
|
|
|
} |
180
|
|
|
|
|
|
|
elsif ( $TEXT eq $type || $RAW eq $type ) { |
181
|
231
|
|
|
|
|
760
|
my $string = $node->to_string; |
182
|
|
|
|
|
|
|
###l4p $log->trace( sprintf $LOG{'TEXT_NODE'}, $string ); |
183
|
231
|
100
|
100
|
|
|
12660
|
if ( |
184
|
|
|
|
|
|
|
length $string >= $self->min_length |
185
|
|
|
|
|
|
|
## no critic qw(RequireDotMatchAnything RequireLineBoundaryMatching) |
186
|
|
|
|
|
|
|
&& $string =~ m{$NONSPACE}x |
187
|
|
|
|
|
|
|
) |
188
|
|
|
|
|
|
|
## use critic |
189
|
|
|
|
|
|
|
{ |
190
|
150
|
|
|
|
|
680
|
$self->_configure_lang($node); |
191
|
150
|
|
|
|
|
466
|
my $hyphened = $self->_hyphen($string); |
192
|
150
|
|
|
|
|
921
|
$node->replace($hyphened); |
193
|
|
|
|
|
|
|
} |
194
|
231
|
|
|
|
|
28841
|
return; |
195
|
|
|
|
|
|
|
} |
196
|
|
|
|
|
|
|
} |
197
|
375
|
|
|
|
|
3866
|
for my $child ( $node->child_nodes->each ) { |
198
|
471
|
|
|
|
|
30080
|
$self->_traverse_dom($child); |
199
|
|
|
|
|
|
|
} |
200
|
375
|
|
|
|
|
2234
|
return; |
201
|
|
|
|
|
|
|
} |
202
|
|
|
|
|
|
|
|
203
|
|
|
|
|
|
|
sub _clean_html { |
204
|
135
|
|
|
135
|
|
309
|
my ($self) = @_; |
205
|
135
|
|
|
|
|
4409
|
my $html = $self->_dom->to_string(); |
206
|
135
|
|
|
|
|
13622
|
$self->_reset_dom; |
207
|
135
|
100
|
|
|
|
3644
|
if ( $EMPTY ne $self->_doctype ) { |
208
|
3
|
|
|
|
|
75
|
$html = $self->_doctype . $html; |
209
|
|
|
|
|
|
|
} |
210
|
135
|
|
|
|
|
537
|
return $html; |
211
|
|
|
|
|
|
|
} |
212
|
|
|
|
|
|
|
|
213
|
|
|
|
|
|
|
sub _hyphen { |
214
|
160
|
|
|
160
|
|
375
|
my ( $self, $text ) = @_; |
215
|
|
|
|
|
|
|
|
216
|
|
|
|
|
|
|
###l4p $log->debug( sprintf $LOG{'HYPHEN_TEXT'}, $text ); |
217
|
160
|
|
|
|
|
294
|
$text =~ s/(\w{@{[$self->min_length]},})/$self->_hyphen_word($1)/xsmeg; |
|
182
|
|
|
|
|
657
|
|
|
160
|
|
|
|
|
4141
|
|
218
|
160
|
|
|
|
|
596
|
return $text; |
219
|
|
|
|
|
|
|
} |
220
|
|
|
|
|
|
|
|
221
|
|
|
|
|
|
|
sub _hyphen_word { |
222
|
182
|
|
|
182
|
|
623
|
my ( $self, $word ) = @_; |
223
|
182
|
100
|
|
|
|
4829
|
if ( defined $self->_hyphenators->{ $self->_lang } ) { |
224
|
|
|
|
|
|
|
|
225
|
|
|
|
|
|
|
###l4p $log->debug( sprintf $LOG{'HYPHEN_WORD'}, |
226
|
|
|
|
|
|
|
###l4p $word, $self->_hyphenators->{ $self->_lang }->visualize($word) ); |
227
|
181
|
|
|
|
|
293
|
my $number = 0; |
228
|
181
|
|
|
|
|
4196
|
foreach |
229
|
|
|
|
|
|
|
my $pos ( $self->_hyphenators->{ $self->_lang }->hyphenate($word) ) |
230
|
|
|
|
|
|
|
{ |
231
|
1199
|
|
|
|
|
389652
|
substr $word, $pos + $number, 0, $SOFT_HYPHEN; |
232
|
1199
|
|
|
|
|
1731
|
$number += length $SOFT_HYPHEN; |
233
|
|
|
|
|
|
|
} |
234
|
|
|
|
|
|
|
} |
235
|
|
|
|
|
|
|
else { |
236
|
|
|
|
|
|
|
###l4p $log->warn( sprintf $LOG{'NOT_HYPHEN'}, $self->_lang ); |
237
|
|
|
|
|
|
|
} |
238
|
182
|
|
|
|
|
11848
|
return $word; |
239
|
|
|
|
|
|
|
} |
240
|
|
|
|
|
|
|
|
241
|
|
|
|
|
|
|
## no critic qw(RequireArgUnpacking) |
242
|
|
|
|
|
|
|
sub __lang_attr { |
243
|
680
|
100
|
|
680
|
|
11255
|
if ( $_[0] ) { |
244
|
579
|
|
100
|
|
|
3214
|
return $_[0]->attr($LANG) || $_[0]->attr(qq{xml:$LANG}); |
245
|
|
|
|
|
|
|
## use critic |
246
|
|
|
|
|
|
|
} |
247
|
|
|
|
|
|
|
else { |
248
|
101
|
|
|
|
|
175
|
return; |
249
|
|
|
|
|
|
|
} |
250
|
|
|
|
|
|
|
} |
251
|
|
|
|
|
|
|
|
252
|
|
|
|
|
|
|
sub _configure_lang { |
253
|
341
|
|
|
341
|
|
798
|
my ( $self, $element ) = @_; |
254
|
341
|
|
|
|
|
736
|
my $lang = __lang_attr($element); |
255
|
341
|
100
|
|
|
|
9860
|
if ( defined $lang ) { |
256
|
106
|
|
|
|
|
340
|
$self->_scope_is_root( $HTML eq $element->tag ); |
257
|
|
|
|
|
|
|
} |
258
|
341
|
100
|
|
|
|
881
|
if ( !defined $lang ) { |
259
|
235
|
|
|
|
|
555
|
$lang = __lang_attr( $element->parent ); |
260
|
235
|
100
|
|
|
|
5448
|
if ( defined $lang ) { |
261
|
115
|
|
|
|
|
380
|
$self->_scope_is_root( $HTML eq $element->parent->tag ); |
262
|
|
|
|
|
|
|
} |
263
|
|
|
|
|
|
|
} |
264
|
341
|
100
|
|
|
|
908
|
if ( !defined $lang ) { |
265
|
|
|
|
|
|
|
|
266
|
|
|
|
|
|
|
# If the scope was already set by the root element we don't have to |
267
|
|
|
|
|
|
|
# check if it has gone out of scope because we never leave the root |
268
|
|
|
|
|
|
|
# scope: |
269
|
120
|
100
|
|
|
|
3215
|
if ( !$self->_scope_is_root ) { |
270
|
104
|
|
|
|
|
370
|
my $recent = $element->ancestors(qq{[$LANG]})->first(); |
271
|
104
|
|
100
|
|
|
23311
|
$self->_scope_is_root( $recent && $HTML eq $recent->tag ); |
272
|
104
|
|
|
|
|
215
|
$lang = __lang_attr($recent); |
273
|
|
|
|
|
|
|
} |
274
|
|
|
|
|
|
|
else { |
275
|
16
|
|
|
|
|
384
|
$lang = $self->_lang; |
276
|
|
|
|
|
|
|
} |
277
|
|
|
|
|
|
|
} |
278
|
341
|
100
|
|
|
|
857
|
if ( !defined $lang ) { |
279
|
101
|
|
|
|
|
2184
|
$lang = $self->default_lang; |
280
|
|
|
|
|
|
|
} |
281
|
341
|
100
|
100
|
|
|
8026
|
if ( !defined $self->_lang || $lang ne $self->_lang ) { |
282
|
105
|
|
|
|
|
2504
|
$self->_lang($lang); |
283
|
|
|
|
|
|
|
|
284
|
|
|
|
|
|
|
###l4p $log->debug( sprintf $LOG{'LANGUAGE_SET'}, $lang ); |
285
|
105
|
100
|
|
|
|
2693
|
if ( !exists $self->_hyphenators->{$lang} ) { |
286
|
78
|
|
|
|
|
314
|
$self->_add_tex_hyphen_to_cache(); |
287
|
|
|
|
|
|
|
} |
288
|
|
|
|
|
|
|
} |
289
|
341
|
|
|
|
|
39914
|
return; |
290
|
|
|
|
|
|
|
} |
291
|
|
|
|
|
|
|
|
292
|
|
|
|
|
|
|
sub _add_tex_hyphen_to_cache { |
293
|
78
|
|
|
78
|
|
185
|
my ($self) = @_; |
294
|
78
|
|
|
|
|
730
|
my $thp = TeX::Hyphen::Pattern->new(); |
295
|
78
|
|
|
|
|
127073
|
my $pat = $self->_lang; |
296
|
78
|
|
|
|
|
2023
|
$thp->label($pat); |
297
|
78
|
|
|
|
|
1912
|
my $file = $thp->filename; |
298
|
78
|
100
|
|
|
|
27105872
|
if ( !defined $file ) { |
299
|
|
|
|
|
|
|
###l4p $log->debug( sprintf $LOG{'TRY_SIMILAR'}, $pat ); |
300
|
4
|
|
|
|
|
12
|
my $max_sim = 0; |
301
|
4
|
|
|
|
|
20
|
for my $tag ( $thp->available() ) { |
302
|
584
|
|
|
|
|
316995
|
$tag =~ s{.*::}{}msx; |
303
|
584
|
|
|
|
|
1081
|
$tag =~ s{[_]}{-}msx; |
304
|
584
|
|
|
|
|
13735
|
my $sim = |
305
|
|
|
|
|
|
|
I18N::LangTags::similarity_language_tag( $tag, $self->_lang ); |
306
|
|
|
|
|
|
|
###l4p $log->debug( sprintf $LOG{'SIMILARITY'}, $tag, $sim ); |
307
|
584
|
100
|
|
|
|
29949
|
if ( $sim > $max_sim ) { |
308
|
4
|
|
|
|
|
8
|
$pat = $tag; |
309
|
4
|
|
|
|
|
10
|
$max_sim = $sim; |
310
|
|
|
|
|
|
|
} |
311
|
|
|
|
|
|
|
} |
312
|
4
|
100
|
|
|
|
71
|
if ( $max_sim > 0 ) { |
313
|
3
|
|
|
|
|
45234
|
$thp->label($pat); |
314
|
3
|
|
|
|
|
81
|
$file = $thp->filename; |
315
|
|
|
|
|
|
|
} |
316
|
|
|
|
|
|
|
} |
317
|
78
|
100
|
|
|
|
321571
|
if ( defined $file ) { |
318
|
|
|
|
|
|
|
|
319
|
|
|
|
|
|
|
###l4p $log->debug( sprintf $LOG{'PATTERN_FILE'}, $file ); |
320
|
77
|
|
|
|
|
2950
|
my $cache = $self->_hyphenators; |
321
|
77
|
|
|
|
|
2253
|
${$cache}{ $self->_lang } = TeX::Hyphen->new( |
|
77
|
|
|
|
|
17306046
|
|
322
|
|
|
|
|
|
|
q{file} => $file, |
323
|
|
|
|
|
|
|
q{leftmin} => $self->min_pre, |
324
|
|
|
|
|
|
|
q{rightmin} => $self->min_post, |
325
|
|
|
|
|
|
|
); |
326
|
77
|
|
|
|
|
2333
|
$self->_hyphenators($cache); |
327
|
|
|
|
|
|
|
} |
328
|
78
|
|
|
|
|
728
|
return; |
329
|
|
|
|
|
|
|
} |
330
|
|
|
|
|
|
|
|
331
|
|
|
|
|
|
|
sub _hyphenable_by_class { |
332
|
118
|
|
|
118
|
|
178
|
my ( $self, $node ) = @_; |
333
|
118
|
|
|
|
|
165
|
my $included_level = $ONE_LEVEL_UP; |
334
|
118
|
|
|
|
|
130
|
my $excluded_level = $ONE_LEVEL_UP; |
335
|
118
|
100
|
|
|
|
2768
|
$self->default_included && $excluded_level--; |
336
|
118
|
100
|
|
|
|
2349
|
$self->default_included || $included_level--; |
337
|
|
|
|
|
|
|
|
338
|
118
|
|
|
|
|
369
|
$included_level = |
339
|
|
|
|
|
|
|
$self->_get_nearest_ancestor_level_by_classname( $node, |
340
|
|
|
|
|
|
|
$self->classes_included, $included_level ); |
341
|
118
|
|
|
|
|
6000
|
$excluded_level = |
342
|
|
|
|
|
|
|
$self->_get_nearest_ancestor_level_by_classname( $node, |
343
|
|
|
|
|
|
|
$self->classes_excluded, $excluded_level ); |
344
|
118
|
100
|
|
|
|
5729
|
if ( $included_level == $excluded_level ) { |
345
|
4
|
|
|
|
|
112
|
return $self->default_included; |
346
|
|
|
|
|
|
|
} |
347
|
114
|
|
|
|
|
501
|
return !( $excluded_level > $included_level ); |
348
|
|
|
|
|
|
|
} |
349
|
|
|
|
|
|
|
|
350
|
|
|
|
|
|
|
sub __parent_is_pre { |
351
|
606
|
|
|
606
|
|
999
|
my ($node) = @_; |
352
|
606
|
|
|
|
|
1528
|
my $parent = $node->parent; |
353
|
606
|
|
100
|
|
|
26374
|
return defined $parent |
354
|
|
|
|
|
|
|
&& ( ( $parent->tag || $EMPTY ) eq $PRE ); |
355
|
|
|
|
|
|
|
} |
356
|
|
|
|
|
|
|
|
357
|
|
|
|
|
|
|
sub _hyphenable { |
358
|
606
|
|
|
606
|
|
1038
|
my ( $self, $node ) = @_; |
359
|
|
|
|
|
|
|
|
360
|
|
|
|
|
|
|
###l4p $self->_classes || $log->debug( $LOG{'NO_CLASSES'} ); |
361
|
606
|
|
100
|
|
|
1122
|
return !( __parent_is_pre($node) |
362
|
|
|
|
|
|
|
|| ( $self->_classes && !$self->_hyphenable_by_class($node) ) ); |
363
|
|
|
|
|
|
|
} |
364
|
|
|
|
|
|
|
|
365
|
|
|
|
|
|
|
sub _get_nearest_ancestor_level_by_classname { |
366
|
236
|
|
|
236
|
|
1332
|
my ( $self, $node, $ar_classnames, $level ) = @_; |
367
|
236
|
|
|
|
|
262
|
my $classnames = Set::Scalar->new( @{$ar_classnames} ); |
|
236
|
|
|
|
|
654
|
|
368
|
|
|
|
|
|
|
|
369
|
|
|
|
|
|
|
###l4p $log->debug( sprintf $LOG{'LOOKING_UP'}, $classnames->size ); |
370
|
236
|
100
|
100
|
|
|
13747
|
if ( !$classnames->is_empty |
371
|
|
|
|
|
|
|
&& ( $node->ancestors->size ) ) |
372
|
|
|
|
|
|
|
{ |
373
|
144
|
|
|
|
|
11288
|
my $selector = $DOT . join $CLASS_JOINER, $classnames->members; |
374
|
144
|
|
|
|
|
1018
|
my $nearest = $node->ancestors($selector)->first; |
375
|
144
|
100
|
|
|
|
41373
|
if ($nearest) { |
376
|
64
|
|
|
|
|
327
|
return $nearest->ancestors->size; |
377
|
|
|
|
|
|
|
} |
378
|
|
|
|
|
|
|
} |
379
|
172
|
|
|
|
|
3559
|
return $level; |
380
|
|
|
|
|
|
|
} |
381
|
|
|
|
|
|
|
|
382
|
|
|
|
|
|
|
sub _reset_dom { |
383
|
270
|
|
|
270
|
|
594
|
my ($self) = @_; |
384
|
270
|
|
|
|
|
974
|
my $dom = HTML::Hyphenate::DOM->new(); |
385
|
270
|
|
|
|
|
10001
|
$self->_dom($dom); |
386
|
270
|
|
|
|
|
489
|
return; |
387
|
|
|
|
|
|
|
} |
388
|
|
|
|
|
|
|
|
389
|
|
|
|
|
|
|
1; |
390
|
|
|
|
|
|
|
|
391
|
|
|
|
|
|
|
__END__ |
392
|
|
|
|
|
|
|
|
393
|
|
|
|
|
|
|
=encoding utf8 |
394
|
|
|
|
|
|
|
|
395
|
|
|
|
|
|
|
=for stopwords Ipenburg Readonly merchantability Mojolicious Bitbucket |
396
|
|
|
|
|
|
|
|
397
|
|
|
|
|
|
|
=head1 NAME |
398
|
|
|
|
|
|
|
|
399
|
|
|
|
|
|
|
HTML::Hyphenate - insert soft hyphens into HTML |
400
|
|
|
|
|
|
|
|
401
|
|
|
|
|
|
|
=head1 VERSION |
402
|
|
|
|
|
|
|
|
403
|
|
|
|
|
|
|
This document describes HTML::Hyphenate version C<v1.1.10>. |
404
|
|
|
|
|
|
|
|
405
|
|
|
|
|
|
|
=head1 SYNOPSIS |
406
|
|
|
|
|
|
|
|
407
|
|
|
|
|
|
|
use HTML::Hyphenate; |
408
|
|
|
|
|
|
|
|
409
|
|
|
|
|
|
|
$hyphenator = new HTML::Hyphenate(); |
410
|
|
|
|
|
|
|
$html_with_soft_hyphens = $hyphenator->hyphenated($html); |
411
|
|
|
|
|
|
|
|
412
|
|
|
|
|
|
|
$hyphenator->html($html); |
413
|
|
|
|
|
|
|
$hyphenator->style($style); # czech or german |
414
|
|
|
|
|
|
|
|
415
|
|
|
|
|
|
|
$hyphenator->min_length(10); |
416
|
|
|
|
|
|
|
$hyphenator->min_pre(2); |
417
|
|
|
|
|
|
|
$hyphenator->min_post(2); |
418
|
|
|
|
|
|
|
$hyphenator->default_lang('en-us'); |
419
|
|
|
|
|
|
|
$hyphenator->default_included(1); |
420
|
|
|
|
|
|
|
$hyphenator->classes_included(['shy']); |
421
|
|
|
|
|
|
|
$hyphenator->classes_excluded(['noshy']); |
422
|
|
|
|
|
|
|
|
423
|
|
|
|
|
|
|
=head1 DESCRIPTION |
424
|
|
|
|
|
|
|
|
425
|
|
|
|
|
|
|
Most HTML rendering engines used in web browsers don't figure out by |
426
|
|
|
|
|
|
|
themselves how to hyphenate words when needed, but we can tell them how they |
427
|
|
|
|
|
|
|
might do it by inserting soft hyphens into the words. |
428
|
|
|
|
|
|
|
|
429
|
|
|
|
|
|
|
=head1 SUBROUTINES/METHODS |
430
|
|
|
|
|
|
|
|
431
|
|
|
|
|
|
|
=over 4 |
432
|
|
|
|
|
|
|
|
433
|
|
|
|
|
|
|
=item HTML::Hyphenate-E<gt>new() |
434
|
|
|
|
|
|
|
|
435
|
|
|
|
|
|
|
Constructs a new HTML::Hyphenate object. |
436
|
|
|
|
|
|
|
|
437
|
|
|
|
|
|
|
=item $hyphenator-E<gt>hyphenated() |
438
|
|
|
|
|
|
|
|
439
|
|
|
|
|
|
|
Returns the HTML including the soft hyphens. |
440
|
|
|
|
|
|
|
|
441
|
|
|
|
|
|
|
=item $hyphenator->html(); |
442
|
|
|
|
|
|
|
|
443
|
|
|
|
|
|
|
Gets or sets the HTML to hyphenate. |
444
|
|
|
|
|
|
|
|
445
|
|
|
|
|
|
|
=item $hyphenator->style(); |
446
|
|
|
|
|
|
|
|
447
|
|
|
|
|
|
|
Gets or sets the style to use for pattern usages in |
448
|
|
|
|
|
|
|
L<TeX::Hyphen|TeX::Hyphen>. Can be C<czech> or C<german>. |
449
|
|
|
|
|
|
|
|
450
|
|
|
|
|
|
|
=item $hyphenator->min_length(); |
451
|
|
|
|
|
|
|
|
452
|
|
|
|
|
|
|
Gets or sets the minimum word length required for having soft hyphens |
453
|
|
|
|
|
|
|
inserted. Defaults to 10 characters. |
454
|
|
|
|
|
|
|
|
455
|
|
|
|
|
|
|
=item $hyphenator->min_pre(2); |
456
|
|
|
|
|
|
|
|
457
|
|
|
|
|
|
|
Gets or sets the minimum amount of characters in a word preserved before the |
458
|
|
|
|
|
|
|
first soft hyphen. Defaults to 2 characters. |
459
|
|
|
|
|
|
|
|
460
|
|
|
|
|
|
|
=item $hyphenator->min_post(2); |
461
|
|
|
|
|
|
|
|
462
|
|
|
|
|
|
|
Gets or sets the minimum amount of characters in a word preserved after the |
463
|
|
|
|
|
|
|
last soft hyphen. Defaults to 2 characters. |
464
|
|
|
|
|
|
|
|
465
|
|
|
|
|
|
|
=item $hyphenator->default_lang('en-us'); |
466
|
|
|
|
|
|
|
|
467
|
|
|
|
|
|
|
Gets or sets the default pattern to use when no language can be derived from |
468
|
|
|
|
|
|
|
the HTML. |
469
|
|
|
|
|
|
|
|
470
|
|
|
|
|
|
|
=item $hyphenator->default_included(); |
471
|
|
|
|
|
|
|
|
472
|
|
|
|
|
|
|
Gets or sets if soft hyphens should be included in the whole tree by default. |
473
|
|
|
|
|
|
|
This can be used to insert soft hyphens only in parts of the HTML having |
474
|
|
|
|
|
|
|
specific class names. |
475
|
|
|
|
|
|
|
|
476
|
|
|
|
|
|
|
=item $hyphenator->classes_included(); |
477
|
|
|
|
|
|
|
|
478
|
|
|
|
|
|
|
Gets or sets a reference to an array of class names that will have soft |
479
|
|
|
|
|
|
|
hyphens inserted. |
480
|
|
|
|
|
|
|
|
481
|
|
|
|
|
|
|
=item $hyphenator->classes_excluded(); |
482
|
|
|
|
|
|
|
|
483
|
|
|
|
|
|
|
Gets or sets a reference to an array of class names that will not have soft |
484
|
|
|
|
|
|
|
hyphens inserted. |
485
|
|
|
|
|
|
|
|
486
|
|
|
|
|
|
|
=item $hyphenator->register_tex_hyphen(C<lang>, C<TeX::Hyphen>) |
487
|
|
|
|
|
|
|
|
488
|
|
|
|
|
|
|
Registers a TeX::Hyphen object to handle the language defined by C<lang>. |
489
|
|
|
|
|
|
|
|
490
|
|
|
|
|
|
|
=back |
491
|
|
|
|
|
|
|
|
492
|
|
|
|
|
|
|
=head1 CONFIGURATION AND ENVIRONMENT |
493
|
|
|
|
|
|
|
|
494
|
|
|
|
|
|
|
The output is generated by L<Mojo::DOM|Mojo::DOM> so the environment variable |
495
|
|
|
|
|
|
|
C<MOJO_DOM_CSS_DEBUG> can be set to debug it's CSS selection process. |
496
|
|
|
|
|
|
|
|
497
|
|
|
|
|
|
|
=head1 DEPENDENCIES |
498
|
|
|
|
|
|
|
|
499
|
|
|
|
|
|
|
=over 4 |
500
|
|
|
|
|
|
|
|
501
|
|
|
|
|
|
|
=item * Perl 5.16 |
502
|
|
|
|
|
|
|
|
503
|
|
|
|
|
|
|
=item * L<Moose|Moose> |
504
|
|
|
|
|
|
|
|
505
|
|
|
|
|
|
|
=item * L<Mojolicious|Mojolicious> for L<Mojo::Dom|Mojo::Dom> |
506
|
|
|
|
|
|
|
|
507
|
|
|
|
|
|
|
=item * L<Readonly|Readonly> |
508
|
|
|
|
|
|
|
|
509
|
|
|
|
|
|
|
=item * L<Set::Scalar|Set::Scalar> |
510
|
|
|
|
|
|
|
|
511
|
|
|
|
|
|
|
=item * L<TeX::Hyphen|TeX::Hyphen> |
512
|
|
|
|
|
|
|
|
513
|
|
|
|
|
|
|
=item * L<TeX::Hyphen::Pattern|TeX::Hyphen::Pattern> |
514
|
|
|
|
|
|
|
|
515
|
|
|
|
|
|
|
=back |
516
|
|
|
|
|
|
|
|
517
|
|
|
|
|
|
|
=head1 INCOMPATIBILITIES |
518
|
|
|
|
|
|
|
|
519
|
|
|
|
|
|
|
This module has the same limits as TeX::Hyphen, TeX::Hyphen::Pattern and |
520
|
|
|
|
|
|
|
Mojo::DOM. Tests might fail if the patterns used for them are updated and |
521
|
|
|
|
|
|
|
change the test result. |
522
|
|
|
|
|
|
|
|
523
|
|
|
|
|
|
|
=head1 DIAGNOSTICS |
524
|
|
|
|
|
|
|
|
525
|
|
|
|
|
|
|
This module uses Log::Log4perl for logging when it's resurrected. |
526
|
|
|
|
|
|
|
|
527
|
|
|
|
|
|
|
=over 4 |
528
|
|
|
|
|
|
|
|
529
|
|
|
|
|
|
|
=item * It warns when a language encountered in the HTML is not supported by |
530
|
|
|
|
|
|
|
TeX::Hyphen::Pattern |
531
|
|
|
|
|
|
|
|
532
|
|
|
|
|
|
|
=back |
533
|
|
|
|
|
|
|
|
534
|
|
|
|
|
|
|
=head1 BUGS AND LIMITATIONS |
535
|
|
|
|
|
|
|
|
536
|
|
|
|
|
|
|
=over 4 |
537
|
|
|
|
|
|
|
|
538
|
|
|
|
|
|
|
=item * Perfect hyphenation can be more complicated than just inserting a |
539
|
|
|
|
|
|
|
hyphen somewhere in a word, and sometimes requires semantics to get it right. |
540
|
|
|
|
|
|
|
For example C<cafeetje> should be hyphenated as C<cafe-tje> and not |
541
|
|
|
|
|
|
|
C<cafee-tje> and C<buurtje> can be hyphenated as C<buur-tje> or C<buurt-je>, |
542
|
|
|
|
|
|
|
depending on it's meaning. While HTML could provide a bit more context - |
543
|
|
|
|
|
|
|
mainly the language being used - than plain text to handle these issues, the |
544
|
|
|
|
|
|
|
initial purpose of this module is to make it possible for HTML rendering |
545
|
|
|
|
|
|
|
engines that support soft hyphens to be able to break long words over multiple |
546
|
|
|
|
|
|
|
lines to avoid unwanted overflow. |
547
|
|
|
|
|
|
|
|
548
|
|
|
|
|
|
|
=item * The hyphenation doesn't get better than TeX::Hyphenate and it's |
549
|
|
|
|
|
|
|
hyphenation patterns provide. |
550
|
|
|
|
|
|
|
|
551
|
|
|
|
|
|
|
=item * The round trip from HTML source via Mojo::DOM to HTML source might |
552
|
|
|
|
|
|
|
introduce changes to the source, for example accented characters might be |
553
|
|
|
|
|
|
|
transformed to HTML encoded entity equivalents or Boolean attributes are |
554
|
|
|
|
|
|
|
converted to a different notation. |
555
|
|
|
|
|
|
|
|
556
|
|
|
|
|
|
|
=back |
557
|
|
|
|
|
|
|
|
558
|
|
|
|
|
|
|
Please report any bugs or feature requests at |
559
|
|
|
|
|
|
|
L<Bitbucket|https://bitbucket.org/rolandvanipenburg/html-hyphenate/issues>. |
560
|
|
|
|
|
|
|
|
561
|
|
|
|
|
|
|
=head1 AUTHOR |
562
|
|
|
|
|
|
|
|
563
|
|
|
|
|
|
|
Roland van Ipenburg, E<lt>roland@rolandvanipenburg.comE<gt> |
564
|
|
|
|
|
|
|
|
565
|
|
|
|
|
|
|
=head1 LICENSE AND COPYRIGHT |
566
|
|
|
|
|
|
|
|
567
|
|
|
|
|
|
|
Copyright (C) 2009-2021, Roland van Ipenburg |
568
|
|
|
|
|
|
|
|
569
|
|
|
|
|
|
|
This library is free software; you can redistribute it and/or modify |
570
|
|
|
|
|
|
|
it under the same terms as Perl itself, either Perl version 5.14.0 or, |
571
|
|
|
|
|
|
|
at your option, any later version of Perl 5 you may have available. |
572
|
|
|
|
|
|
|
|
573
|
|
|
|
|
|
|
=head1 DISCLAIMER OF WARRANTY |
574
|
|
|
|
|
|
|
|
575
|
|
|
|
|
|
|
BECAUSE THIS SOFTWARE IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY |
576
|
|
|
|
|
|
|
FOR THE SOFTWARE, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN |
577
|
|
|
|
|
|
|
OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES |
578
|
|
|
|
|
|
|
PROVIDE THE SOFTWARE "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER |
579
|
|
|
|
|
|
|
EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
580
|
|
|
|
|
|
|
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE |
581
|
|
|
|
|
|
|
ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE SOFTWARE IS WITH |
582
|
|
|
|
|
|
|
YOU. SHOULD THE SOFTWARE PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL |
583
|
|
|
|
|
|
|
NECESSARY SERVICING, REPAIR, OR CORRECTION. |
584
|
|
|
|
|
|
|
|
585
|
|
|
|
|
|
|
IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING |
586
|
|
|
|
|
|
|
WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR |
587
|
|
|
|
|
|
|
REDISTRIBUTE THE SOFTWARE AS PERMITTED BY THE ABOVE LICENSE, BE |
588
|
|
|
|
|
|
|
LIABLE TO YOU FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL, |
589
|
|
|
|
|
|
|
OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE |
590
|
|
|
|
|
|
|
THE SOFTWARE (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING |
591
|
|
|
|
|
|
|
RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A |
592
|
|
|
|
|
|
|
FAILURE OF THE SOFTWARE TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF |
593
|
|
|
|
|
|
|
SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF |
594
|
|
|
|
|
|
|
SUCH DAMAGES. |
595
|
|
|
|
|
|
|
|
596
|
|
|
|
|
|
|
=cut |