line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package PDF::Builder::Content::Hyphenate_basic; |
2
|
|
|
|
|
|
|
|
3
|
1
|
|
|
1
|
|
9282
|
use base 'PDF::Builder::Content::Text'; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
90
|
|
4
|
|
|
|
|
|
|
|
5
|
1
|
|
|
1
|
|
6
|
use strict; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
19
|
|
6
|
1
|
|
|
1
|
|
4
|
use warnings; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
942
|
|
7
|
|
|
|
|
|
|
|
8
|
|
|
|
|
|
|
our $VERSION = '3.024'; # VERSION |
9
|
|
|
|
|
|
|
our $LAST_UPDATE = '3.024'; # manually update whenever code is changed |
10
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
=head1 NAME |
12
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
PDF::Builder::Content::Hyphenate_basic - Simple hyphenation capability |
14
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
=head1 SYNOPSIS |
16
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
These are internal routines that are somewhat experimental, and may (or may |
18
|
|
|
|
|
|
|
not) be extended in the future. They are called from various Content routines |
19
|
|
|
|
|
|
|
that take long strings of text and split them into fixed-length lines. |
20
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
Words are split to fill the line most completely, without regard to widows and |
22
|
|
|
|
|
|
|
orphans, long runs of hyphens at the right edge, "rivers" of space flowing |
23
|
|
|
|
|
|
|
through a paragraph, and other problems. Also, only simple splitting is done |
24
|
|
|
|
|
|
|
(not actually I<words>), on a simple, language-independent basis. No dictionary |
25
|
|
|
|
|
|
|
or rules-based splitting is currently done. |
26
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
This functionality may well be replaced by "hooks" to call language-specific |
28
|
|
|
|
|
|
|
word-splitting rules, as well as worrying about the appearance of the results |
29
|
|
|
|
|
|
|
(such as Knuth-Plass). |
30
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
=cut |
32
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
# Main entry. Returns array of left portion of word (and -) to stick on end of |
34
|
|
|
|
|
|
|
# sentence (may be empty) and remaining (right) portion of word to go on next |
35
|
|
|
|
|
|
|
# line (usually not empty). |
36
|
|
|
|
|
|
|
sub splitWord {
    # Find a place to split $word so that its left part fits in $width.
    #
    # Parameters:
    #   $self   object providing advancewidth(), used to measure a fragment
    #   $word   the word to split; may contain soft hyphens (SHY, #173)
    #   $width  space available for the left fragment, in the same units
    #           that advancewidth() returns
    #   %opts   options, also handed on to advancewidth(). Each of these
    #           defaults to 1 (allowed); a dashed name (e.g., -spHH) is
    #           accepted as an alias if the undashed name is not given:
    #             spHH  split after a hard (explicit) hyphen U+002D
    #             spOP  split after most punctuation
    #             spDR  split after a run of digit(s)
    #             spLR  split after a run of ASCII letter(s)
    #             spCC  split camelCase on an ASCII lc-to-UC transition
    #
    # Returns the list ($leftWord, $rightWord). $leftWord is the fragment
    # (plus a hyphen, if one is added) to put at the end of the current
    # line; $rightWord goes on the next line. If no usable split is found,
    # $leftWord is '' and $rightWord is the word as it was passed in.
    my ($self, $word, $width, %opts) = @_;

    # copy dashed option names to preferred undashed names
    foreach my $name (qw(spHH spOP spDR spLR spCC)) {
        if (defined $opts{"-$name"} && !defined $opts{$name}) {
            $opts{$name} = delete($opts{"-$name"});
        }
    }

    # various settings, some of which may be language-specific
    my $minBegin = 2;  # minimum 2 characters before split
    my $minEnd   = 2;  # minimum 2 characters to next line
    #my $hyphen = '-';
    my $hyphen = "\xAD";  # add a hyphen at split, unless splitting at -
                          # or other dash character
    # NOTE: PDF-1.7 14.8.2.2.3 suggests using a soft hyphen (\AD) when splitting
    #       a word at the end of the line, so that when text is extracted for
    #       a screen reader, etc., the closed-up word can have the "visible"
    #       hyphen removed. PDF readers should render as -.
    my @suppressHyphen = (  # ASCII/Latin-1/UTF-8 ordinals to NOT add - after
       #  -   en-dash em-dash  /
          45,   8211,   8212,  47,
                         );
    my $splitHardH = defined($opts{'spHH'})? $opts{'spHH'}: 1;  # 1=OK to split on hard (explicit) hyphen U+002D
    my $otherPunc  = defined($opts{'spOP'})? $opts{'spOP'}: 1;  # 1=OK to split after most punctuation
    my $digitRun   = defined($opts{'spDR'})? $opts{'spDR'}: 1;  # 1=OK to split after run of digit(s)
    my $letterRun  = defined($opts{'spLR'})? $opts{'spLR'}: 1;  # 1=OK to split after run of ASCII letter(s)
    my $camelCase  = defined($opts{'spCC'})? $opts{'spCC'}: 1;  # 1=OK to split camelCase on ASCII lc-to-UC transition

    # note that we are ignoring U+2010 "hyphen" and U+2011 "non-splitting
    # hyphen". The first is probably rare enough to not be worth the bother,
    # and the second won't be split at anyway.

    my $leftWord  = '';     # default return values
    my $rightWord = $word;  # (the word exactly as passed in)

    # Each entry in @splitLoc is the index of the LAST character of a
    # possible left fragment (i.e., split AFTER that character).

    # highest priority for splits: hard and soft hyphens
    # remove SHYs, remember any break points
    my @splitLoc;
    ($word, @splitLoc) = _removeSHY($word);

    my @chars = split //, $word;
    my $last  = $#chars;    # index of final character (-1 if word is empty)

    # remember any break points due to hard coded hyphens
    # note that unlike SHY, - is not removed
    if ($splitHardH) {
        foreach my $i (0 .. $last) {
            if ($chars[$i] eq '-') { push @splitLoc, $i; }
        }
    }

    # If nothing in @splitLoc, proceed to find other splits. If @splitLoc
    # has at least one entry, could make it the top priority and split there,
    # and not look at other possible splits. Or, keep adding to @splitLoc
    # (equal priority for all possible splits). Mix and match is OK
    # (grouping criteria, as hard and soft hyphens were done together).

    # The scans below all look at the NEXT character too, so they stop one
    # short of the end of the word. Nothing is lost: a split after the final
    # character could never satisfy $minEnd.

    if ($otherPunc) {
        # look for other punctuation to split after.
        # don't split on ' or " or other quotes (<, <<, etc.)
        # !%&)]*+/,.:;<>?^_~ and curly right brace ASCII OK for now
        # en-dash, em-dash should ideally be split after, whether they are
        # free floating or embedded between words.
        my %ASCII_punct = map { $_ => 1 }
                          ( '!', '.', '?', ',', '%', '&', ':', ';',
                            '<', '>', ')', ']', chr(125), '_', '~',
                            '^', '+', '*', '/',   );
        #                                en-dash  em-dash
        my %UTF8_punct  = map { $_ => 1 } ( 8211,   8212, );
        # remember not to split if next char is -
        # (defer split to after hard hyphen - [if allowed]).
        foreach my $i (0 .. $last-1) {
            next if $chars[$i+1] eq '-';
            if ($ASCII_punct{$chars[$i]} || $UTF8_punct{ord($chars[$i])}) {
                push @splitLoc, $i;
            }
        }
    }

    # group digit runs and camelCase together at same priority
    if ($digitRun) {
        # look for a run of digits to split after.
        # that is, any digit NOT followed by another digit.
        # remember not to split if next char is -
        # (defer split to after hard hyphen - [if allowed]).
        foreach my $i (0 .. $last-1) {
            if ($chars[$i]   =~ /\A[0-9]\z/ &&
                $chars[$i+1] !~ /\A[0-9-]\z/) {
                push @splitLoc, $i;
            }
        }
    }

    if ($letterRun) {
        # look for a run of letters (ASCII) to split after.
        # that is, any letter NOT followed by another letter.
        # remember not to split if next char is -
        # (defer split to after hard hyphen - [if allowed]).
        foreach my $i (0 .. $last-1) {
            if ($chars[$i]   =~ /\A[A-Za-z]\z/ &&
                $chars[$i+1] !~ /\A[A-Za-z-]\z/) {
                push @splitLoc, $i;
            }
        }
    }

    if ($camelCase) {
        # look for camelCase to split on lowercase to
        # uppercase transitions. just ASCII letters for now.
        # Note that this will split names like McIlroy -> Mc-Ilroy
        # and MacDonald -> Mac-Donald.
        foreach my $i (0 .. $last-1) {
            if ($chars[$i]   =~ /\A[a-z]\z/ &&
                $chars[$i+1] =~ /\A[A-Z]\z/) {
                push @splitLoc, $i;
            }
        }
    }

    # look for real English word split locations
    # TBD

    # sort final @splitLoc, remove any split points violating "min" settings
    # (at least $minBegin characters must stay on this line, and at least
    # $minEnd characters must go to the next). unnecessary to have unique
    # values.
    my $len = length($word);
    @splitLoc = sort { $a <=> $b }
                grep { $_ >= $minBegin-1 && $_ < $len-$minEnd } @splitLoc;

    # scan R to L through @splitLoc to try splitting there
    # TBD estimate starting position in @splitLoc by dividing $width by
    # 1em to get approximate split location; pick highest @splitLoc
    # element that does not exceed it, and move right (probably) or left
    # to get proper split point.
    while (@splitLoc) {
        my $pos   = pop @splitLoc;  # proposed split rightmost on list
        my $trial = substr($word, 0, $pos+1);
        # this is the left fragment at the end of the line. make sure
        # there is room for it and the hyphen (if added)

        # does the left fragment already end in -, etc.?
        # if it does, don't add a $hyphen.
        my $lastOrd = ord(substr($trial, -1, 1));
        my $h = (grep { $lastOrd == $_ } @suppressHyphen)? '': $hyphen;

        # $width should already count the trailing space in the existing
        # line, or full width if empty
        next if $self->advancewidth("$trial$h", %opts) > $width;

        # any letter doubling needed? (TBD; none is done at present)
        $leftWord  = $trial.$h;
        $rightWord = substr($word, $pos+1);
        last;
    }
    # if fell through because no fragment was short enough (or there were
    # no candidate splits at all), $leftWord and $rightWord were never
    # reassigned, and effect is to leave the entire word for the next line.

    return ($leftWord, $rightWord);
}
230
|
|
|
|
|
|
|
|
231
|
|
|
|
|
|
|
# remove soft hyphens (SHYs) from a word. assume is always #173 (good for |
232
|
|
|
|
|
|
|
# Latin-1, CP-1252, UTF-8; might not work for some encodings) TBD might want |
233
|
|
|
|
|
|
|
# to pass in current encoding, or what SHY value is. |
234
|
|
|
|
|
|
|
# return list of break points where SHYs were removed |
235
|
|
|
|
|
|
|
sub _removeSHY {
    # Strip soft hyphens (character #173) out of a word.
    #
    # Parameter:
    #   $word   the word to clean up
    #
    # Returns a list: the word with all SHYs removed, followed by zero or
    # more break points. Each break point is the index, within the
    # returned (cleaned) word, of the character that immediately preceded
    # a removed SHY -- that is, a split is allowed AFTER that character.
    # Consecutive SHYs give the same break point more than once.
    my ($word) = @_;

    my @SHYs = ();
    my $out  = '';
    foreach my $char (split //, $word) {
        if (ord($char) == 173) {
            # it's a SHY, so remove from word, add to list.
            # length($out) is how many characters have been kept so far,
            # so the last kept character is at index length($out)-1.
            # A SHY at the very start of the word has no character before
            # it to split after, so it is dropped without a break point
            # (otherwise a meaningless -1 would be reported).
            if (length($out) > 0) { push @SHYs, length($out) - 1; }
            next;
        }
        $out .= $char;
    }
    return ($out, @SHYs);
}
254
|
|
|
|
|
|
|
|
255
|
|
|
|
|
|
|
1; |