line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package PDF::Builder::Content::Hyphenate_basic; |
2
|
|
|
|
|
|
|
|
3
|
1
|
|
|
1
|
|
11458
|
use base 'PDF::Builder::Content::Text'; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
131
|
|
4
|
|
|
|
|
|
|
|
5
|
1
|
|
|
1
|
|
8
|
use strict; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
22
|
|
6
|
1
|
|
|
1
|
|
6
|
use warnings; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
1043
|
|
7
|
|
|
|
|
|
|
|
8
|
|
|
|
|
|
|
our $VERSION = '3.023'; # VERSION |
9
|
|
|
|
|
|
|
our $LAST_UPDATE = '3.021'; # manually update whenever code is changed |
10
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
=head1 NAME |
12
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
PDF::Builder::Content::Hyphenate_basic - Simple hyphenation capability |
14
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
=head1 SYNOPSIS |
16
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
These are internal routines that are somewhat experimental, and may (or may |
18
|
|
|
|
|
|
|
not) be extended in the future. They are called from various Content routines |
19
|
|
|
|
|
|
|
that take long strings of text and split them into fixed-length lines. |
20
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
Words are split to fill the line most completely, without regard to widows and |
22
|
|
|
|
|
|
|
orphans, long runs of hyphens at the right edge, "rivers" of space flowing |
23
|
|
|
|
|
|
|
through a paragraph, and other problems. Also, only simple splitting is done |
24
|
|
|
|
|
|
|
(not actually I<words>), on a simple, language-independent basis. No dictionary |
25
|
|
|
|
|
|
|
or rules-based splitting is currently done. |
26
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
This functionality may well be replaced by "hooks" to call language-specific |
28
|
|
|
|
|
|
|
word-splitting rules, as well as worrying about the appearance of the results |
29
|
|
|
|
|
|
|
(such as Knuth-Plass). |
30
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
=cut |
32
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
# Main entry point (called as a method).
#
#   ($left, $right) = $self->splitWord($word, $width, %opts)
#
# $word   the word to be split; may contain soft hyphens (SHY, #173)
# $width  space available for the left fragment; it should already account
#         for the trailing space of the existing line (or be the full
#         width if the line is empty)
# %opts   passed through to advancewidth(); also recognized here (each
#         defaults to 1 = allowed when not defined):
#           -spHH  split after a hard (explicit) hyphen U+002D
#           -spOP  split after most punctuation
#           -spDR  split after a run of digit(s)
#           -spLR  split after a run of ASCII letter(s)
#           -spCC  split camelCase on an ASCII lc-to-UC transition
#
# Returns a two element list: the left portion of the word (plus a soft
# hyphen, unless it already ends in a dash or slash) to put at the end of
# the current line, and the right portion to go on the next line. If no
# acceptable split is found, the left portion is '' and the right portion
# is the word exactly as it was passed in.
sub splitWord {
    my ($self, $word, $width, %opts) = @_;

    # various settings, some of which may be language-specific
    my $min_left  = 2;       # minimum 2 characters before split
    my $min_right = 2;       # minimum 2 characters to next line

    # NOTE: PDF-1.7 14.8.2.2.3 suggests using a soft hyphen (\xAD) when
    #  splitting a word at the end of the line, so that when text is
    #  extracted for a screen reader, etc., the closed-up word can have the
    #  "visible" hyphen removed. PDF readers should render it as -.
    my $soft_hyphen = "\xAD";

    # ordinals after which NO hyphen is added:  -  en-dash  em-dash  /
    my %no_hyphen_after = map { $_ => 1 } (45, 8211, 8212, 47);

    # which kinds of split are permitted (everything is, by default)
    my %allow;
    foreach my $flag ('-spHH', '-spOP', '-spDR', '-spLR', '-spCC') {
        $allow{$flag} = defined($opts{$flag}) ? $opts{$flag} : 1;
    }

    # note that we are ignoring U+2010 "hyphen" and U+2011 "non-splitting
    # hyphen". The first is probably rare enough to not be worth the bother,
    # and the second won't be split at anyway.

    # Highest priority: soft hyphens. They are stripped out of the working
    # copy of the word, and the index of the character before each one is
    # the start of the list of candidate split points (split AFTER index).
    my ($bare, @cuts) = _removeSHY($word);
    my @ch   = split //, $bare;
    my $last = $#ch;

    # hard coded hyphens: unlike SHY, the - stays in the word
    if ($allow{'-spHH'}) {
        push @cuts, grep { $ch[$_] eq '-' } 0 .. $last;
    }

    # All remaining kinds of split are currently given equal priority and
    # simply add to the candidate list. None of them splits right before
    # a hard hyphen (the split is deferred to after the hyphen, if that is
    # allowed), and all of them look at the following character, so the
    # last character of the word is never examined as a candidate.

    if ($allow{'-spOP'}) {
        # split after punctuation, but not after ' or " or other quotes.
        # en-dash and em-dash are split after, whether free floating or
        # embedded between words.
        my %is_punct = map { $_ => 1 }
                       ( '!', '.', '?', ',', '%', '&', ':', ';',
                         '<', '>', ')', ']', chr(125), '_', '~',
                         '^', '+', '*', '/', );
        #                           en-dash    em-dash
        my %is_dash = map { $_ => 1 } ( 8211, 8212, );

        foreach my $p (0 .. $last - 1) {
            next if $ch[$p + 1] eq '-';
            if ($is_punct{ $ch[$p] } || $is_dash{ ord($ch[$p]) }) {
                push @cuts, $p;
            }
        }
    }

    if ($allow{'-spDR'}) {
        # end of a run of digits: a digit NOT followed by another digit
        foreach my $p (0 .. $last - 1) {
            if ($ch[$p] =~ /[0-9]/ && $ch[$p + 1] !~ /[0-9-]/) {
                push @cuts, $p;
            }
        }
    }

    if ($allow{'-spLR'}) {
        # end of a run of ASCII letters: a letter NOT followed by a letter
        foreach my $p (0 .. $last - 1) {
            if ($ch[$p] =~ /[A-Za-z]/ && $ch[$p + 1] !~ /[A-Za-z-]/) {
                push @cuts, $p;
            }
        }
    }

    if ($allow{'-spCC'}) {
        # camelCase: lowercase to uppercase transition, ASCII letters only.
        # Note that this will split names like McIlroy -> Mc-Ilroy and
        # MacDonald -> Mac-Donald.
        foreach my $p (0 .. $last - 1) {
            if ($ch[$p] =~ /[a-z]/ && $ch[$p + 1] =~ /[A-Z]/) {
                push @cuts, $p;
            }
        }
    }

    # TBD look for real English word split locations

    # Discard candidates that would leave too few characters on either side
    # of the split, then put the survivors in ascending order. Duplicate
    # entries are harmless and are not removed.
    my $nchars = length($bare);
    @cuts = sort { $a <=> $b }
            grep { $_ >= $min_left - 1 && $_ < $nchars - $min_right }
            @cuts;

    # Try candidates from rightmost to leftmost, taking the first (longest)
    # left fragment that fits in $width.
    # TBD estimate starting position by dividing $width by 1em to get an
    #  approximate split location, and search outwards from there.
    foreach my $cut (reverse @cuts) {
        my $left = substr($bare, 0, $cut + 1);

        # does the left fragment already end in -, etc.? if it does,
        # don't add a hyphen.
        my $tail = $no_hyphen_after{ ord(substr($left, -1, 1)) }
                   ? ''
                   : $soft_hyphen;

        next if $self->advancewidth("$left$tail", %opts) > $width;

        # TBD any letter doubling needed (e.g., in German)?
        return ($left . $tail, substr($bare, $cut + 1));
    }

    # nothing usable was found (no candidates, or no fragment short enough):
    # leave the entire, untouched word for the next line.
    return ('', $word);
}
224
|
|
|
|
|
|
|
|
225
|
|
|
|
|
|
|
# Strip soft hyphens (SHYs) out of a word. A SHY is assumed to always be
# ordinal 173 (good for Latin-1, CP-1252, UTF-8; might not work for some
# encodings). TBD might want to pass in the current encoding, or what the
# SHY value is.
#
# Returns a list: the word with all SHYs removed, followed by one break
# point per SHY found. A break point is the index (within the returned
# word) of the character just before the removed SHY, so a SHY at the very
# start of the word is reported as -1.
sub _removeSHY {
    my ($word) = @_;

    my @breaks = ();    # break points found so far
    my $kept   = 0;     # count of non-SHY characters copied so far
    my $clean  = '';    # the word, minus its SHYs

    foreach my $ch (split //, $word) {
        if (ord($ch) != 173) {
            # ordinary character: keep it
            $clean .= $ch;
            $kept++;
        }
        else {
            # it's a SHY: drop it, remember where it was
            push @breaks, $kept - 1;
        }
    }

    return ($clean, @breaks);
}
248
|
|
|
|
|
|
|
|
249
|
|
|
|
|
|
|
1; |