File Coverage

blib/lib/PDF/Builder/Content/Hyphenate_basic.pm

Criterion	Covered	Total	%
statement	9	135	6.6
branch	0	78	0.0
condition	0	57	0.0
subroutine	3	5	60.0
pod	0	1	0.0
total	12	276	4.3

line	stmt	bran	cond	sub	pod	time	code
1							package PDF::Builder::Content::Hyphenate_basic;
2
3	1			1		2663	use base 'PDF::Builder::Content::Text';
	1					1
	1					131
4
5	1			1		5	use strict;
	1					1
	1					18
6	1			1		3	use warnings;
	1					2
	1					1313
7
8							our $VERSION = '3.028'; # VERSION
9							our $LAST_UPDATE = '3.027'; # manually update whenever code is changed
10
11							=head1 NAME
12
13							PDF::Builder::Content::Hyphenate_basic - Simple hyphenation capability
14
15							Inherits from L<PDF::Builder::Content::Text>
16
17							=head1 SYNOPSIS
18
19							These are internal routines that are somewhat experimental, and may (or may
20							not) be extended in the future. They are called from various Content routines
21							that take long strings of text and split them into fixed-length lines.
22
23							Words are split to fill the line most completely, without regard to widows and
24							orphans, long runs of hyphens at the right edge, "rivers" of space flowing
25							through a paragraph, and other problems. Also, only simple splitting is done
26							(not actually I<words>), on a simple, language-independent basis. No dictionary
27							or rules-based splitting is currently done.
28
29							This functionality may well be replaced by "hooks" to call language-specific
30							word-splitting rules, as well as worrying about the appearance of the results
31							(such as Knuth-Plass).
32
33							=cut
34
35							# Main entry. Returns array of left portion of word (and -) to stick on end of
36							# sentence (may be empty) and remaining (right) portion of word to go on next
37							# line (usually not empty).
38							sub splitWord {
39	0			0	0		my ($self, $word, $width, %opts) = @_;
40							# copy dashed option names to preferred undashed names
41	0	0	0				if (defined $opts{'-spHH'} && !defined $opts{'spHH'}) { $opts{'spHH'} = delete($opts{'-spHH'}); }
	0
42	0	0	0				if (defined $opts{'-spOP'} && !defined $opts{'spOP'}) { $opts{'spOP'} = delete($opts{'-spOP'}); }
	0
43	0	0	0				if (defined $opts{'-spDR'} && !defined $opts{'spDR'}) { $opts{'spDR'} = delete($opts{'-spDR'}); }
	0
44	0	0	0				if (defined $opts{'-spLR'} && !defined $opts{'spLR'}) { $opts{'spLR'} = delete($opts{'-spLR'}); }
	0
45	0	0	0				if (defined $opts{'-spCC'} && !defined $opts{'spCC'}) { $opts{'spCC'} = delete($opts{'-spCC'}); }
	0
46
47	0						my ($leftWord, $rightWord, @splitLoc, @chars, $i, $j, $len);
48
49							# various settings, some of which may be language-specific
50	0						my $minBegin = 2; # minimum 2 characters before split (English rules)
51	0	0					if (defined $opts{'min_prefix'}) { $minBegin = $opts{'min_prefix'}; }
	0
52	0						my $minEnd = 3; # minimum 3 characters to next line (English rules)
53	0	0					if (defined $opts{'min_suffix'}) { $minEnd = $opts{'min_suffix'}; }
	0
54	0						my $hyphen = '-';
55							#my $hyphen = "\xAD"; # add a hyphen at split, unless splitting at -
56							# or other dash character
57							# NOTE: PDF-1.7 14.8.2.2.3 suggests using a soft hyphen (\AD) when splitting
58							# a word at the end of the line, so that when text is extracted for
59							# a screen reader, etc., the closed-up word can have the "visible"
60							# hyphen removed. PDF readers should render as -.
61	0						my @suppressHyphen = ( # ASCII/Latin-1/UTF-8 ordinals to NOT add - after
62							# - en-dash em-dash /
63							45, 8211, 8212, 47,
64							);
65	0	0					my $splitHardH = defined($opts{'spHH'})? $opts{'spHH'}: 1; # 1=OK to split on hard (explicit) hyphen U+002D
66	0	0					my $otherPunc = defined($opts{'spOP'})? $opts{'spOP'}: 1; # 1=OK to split after most punctuation
67	0	0					my $digitRun = defined($opts{'spDR'})? $opts{'spDR'}: 1; # 1=OK to split after run of digit(s)
68	0	0					my $letterRun = defined($opts{'spLR'})? $opts{'spLR'}: 1; # 1=OK to split after run of ASCII letter(s)
69	0	0					my $camelCase = defined($opts{'spCC'})? $opts{'spCC'}: 1; # 1=OK to split camelCase on ASCII lc-to-UC transition
70	0	0					my $splitReqBlnk = defined($opts{'spRB'})? $opts{'spRB'}: 0; # 1=OK to split on required blank (NBSP) -- desperation move
71	0	0					my $splitAnywhere = defined($opts{'spFS'})? $opts{'spFS'}: 0; # 1=OK to split to fit available space -- super desperation move
72	0	0					if ($splitAnywhere) {
73							# if requesting to split within a certain length, suppress all other flags
74	0						$splitHardH = $otherPunc = $digitRun = $letterRun = $camelCase =
75							$splitReqBlnk = 0;
76							}
77
78							# note that we are ignoring U+2010 "hyphen" and U+2011 "non-splitting
79							# hyphen". The first is probably rare enough to not be worth the bother,
80							# and the second won't be split at anyway.
81
82	0						$leftWord = ''; # default return values
83	0						$rightWord = $word;
84
85	0						@splitLoc = (); # no known OK splits yet
86
87							# highest priority for splits: hard and soft hyphens
88							# remove SHYs, remember any break points
89	0						($word, @splitLoc) = _removeSHY($word);
90							# remember any break points due to hard coded hyphens
91	0						@chars = split //, $word;
92	0						for ($i=0; $i<scalar(@chars); $i++) {
93	0	0	0				if ($chars[$i] eq '-' && $splitHardH) { push @splitLoc, $i; }
	0
94							# note that unlike SHY, - is not removed
95							}
96
97							# If nothing in @splitLoc, proceed to find other splits. If @splitLoc
98							# has at least one entry, could make it the top priority and split there,
99							# and not look at other possible splits. Or, keep adding to @splitLoc
100							# (equal priority for all possible splits). Mix and match is OK
101							# (grouping criteria, as hard and soft hyphens were done together).
102
103							#if (!@splitLoc) {
104	0	0					if ($otherPunc) {
105							# look for other punctuation to split after.
106							# don't split on ' or " or other quotes (<, <<, etc.)
107							# !%&)]*+/,.:;<>?^_~ and curly right brace ASCII OK for now
108							# en-dash, em-dash should ideally be split after, whether they are
109							# free floating or embedded between words.
110	0						my @ASCII_punct = ( '!', '.', '?', ',', '%', '&', ':', ';',
111							'<', '>', ')', ']', chr(125), '_', '~',
112							'^', '+', '*', '/', );
113							# en-dash em-dash
114	0						my @UTF8_punct = ( 8211, 8212, );
115							# remember not to split if next char is -
116							# (defer split to after hard hyphen - [if allowed]).
117	0						for ($i=0; $i<scalar(@chars)-1; $i++) {
118	0						foreach (@ASCII_punct) {
119	0	0	0				if ($chars[$i] eq $_ && $chars[$i+1] ne '-') {
120	0						push @splitLoc, $i;
121	0						last;
122							}
123							}
124	0						foreach (@UTF8_punct) {
125	0	0	0				if (ord($chars[$i]) == $_ && $chars[$i+1] ne '-') {
126	0						push @splitLoc, $i;
127	0						last;
128							}
129							}
130							}
131							}
132							#}
133
134							# group digit runs and camelCase together at same priority
135							#if (!@splitLoc) {
136	0	0					if ($digitRun) {
137							# look for a run of digits to split after.
138							# that is, any digit NOT followed by another digit.
139							# remember not to split if next char is -
140							# (defer split to after hard hyphen - [if allowed]).
141	0						for ($i=0; $i<scalar(@chars)-1; $i++) {
142	0	0	0				if ($chars[$i] ge '0' && $chars[$i] le '9' &&
			0
			0
143							!($chars[$i+1] ge '0' && $chars[$i+1] le '9' \|\|
144							$chars[$i+1] eq '-')) {
145	0						push @splitLoc, $i;
146							}
147							}
148							}
149
150	0	0					if ($letterRun) {
151							# look for a run of letters (ASCII) to split after.
152							# that is, any letter NOT followed by another letter.
153							# remember not to split if next char is -
154							# (defer split to after hard hyphen - [if allowed]).
155	0						for ($i=0; $i<scalar(@chars)-1; $i++) {
156	0	0	0				if (($chars[$i] ge 'a' && $chars[$i] le 'z' \|\|
			0
			0
157							$chars[$i] ge 'A' && $chars[$i] le 'Z' ) &&
158							!($chars[$i+1] ge 'a' && $chars[$i+1] le 'z' \|\|
159							$chars[$i+1] ge 'A' && $chars[$i+1] le 'Z' \|\|
160							$chars[$i+1] eq '-') ) {
161	0						push @splitLoc, $i;
162							}
163							}
164							}
165
166	0	0					if ($camelCase) {
167							# look for camelCase to split on lowercase to
168							# uppercase transitions. just ASCII letters for now.
169							# Note that this will split names like McIlroy -> Mc-Ilroy
170							# and MacDonald -> Mac-Donald.
171	0						for ($i=0; $i<scalar(@chars)-1; $i++) {
172	0	0	0				if ($chars[$i] ge 'a' && $chars[$i] le 'z' &&
			0
			0
173							$chars[$i+1] ge 'A' && $chars[$i+1] le 'Z') {
174	0						push @splitLoc, $i;
175							}
176							}
177							}
178							#}
179
180							#if (!@splitLoc) {
181							# look for real English word split locations
182							# TBD
183							#}
184
185	0	0	0				if (!@splitLoc && $splitReqBlnk) {
186							# remember any break points due to desperation split at NBSP
187	0						@chars = split //, $word;
188	0						for ($i=0; $i<scalar(@chars); $i++) {
189	0	0					if ($chars[$i] eq "\xA0") { push @splitLoc, $i; }
	0
190							# note that NBSP converted to regular space (x20). we will need
191							# to overwrite the split one with the hyphen
192							}
193							}
194
195	0	0	0				if (!@splitLoc && $splitAnywhere) {
196							# remember any break point due to desperation split at available length
197	0						@chars = split //, $word;
198	0						my $trial = '';
199	0						for ($i=0; $i<scalar(@chars); $i++) {
200	0						$trial .= $chars[$i];
201	0	0					if ($self->advancewidth("$trial$hyphen") > $width) { last; }
	0
202							}
203							# nothing fit? force one letter, even though it overflows
204	0	0					if ($i == 0) { $i = 1; }
	0
205	0						push @splitLoc, $i-1;
206							# disable minimum prefix and suffix for this
207	0						$minBegin = $minEnd = 1;
208							}
209
210							# sort final @splitLoc, remove any split points violating "min" settings
211							# set $leftWord and $rightWord if find successful split
212							# TBD consider hierarchy of desirable splits, rather than equal weight
213	0	0					if (@splitLoc) {
214	0						@splitLoc = sort { $a <=> $b } @splitLoc;
	0
215							# unnecessary to have unique values
216	0						$len = length($word);
217	0						$j = -1;
218	0						for ($i=0; $i<scalar(@splitLoc); $i++) {
219	0	0					if ($splitLoc[$i] >= $minBegin-1) { last; }
	0
220	0						$j = $i;
221							}
222	0	0					if ($j >= 0) { splice(@splitLoc, 0, $j+1); } # remove j+1 els
	0
223	0						$j = -1;
224	0						for ($i=$#splitLoc; $i>=0; $i--) {
225	0	0					if ($splitLoc[$i] < $len-$minEnd) { last; }
	0
226	0						$j = $i;
227							}
228	0	0					if ($j >= 0) { splice(@splitLoc, $j); } # remove els >= j-th
	0
229
230							# scan R to L through @splitLoc to try splitting there
231							# TBD estimate starting position in @splitLoc by dividing $width by
232							# 1em to get approximate split location; pick highest @splitLoc
233							# element that does not exceed it, and move right (probably) or left
234							# to get proper split point.
235	0						while (@splitLoc) {
236	0						$j = pop @splitLoc; # proposed split rightmost on list
237	0						my $trial = substr($word, 0, $j+1);
238							# this is the left fragment at the end of the line. make sure
239							# there is room for the space before it, the hyphen (if added),
240							# and any letter doubling (e.g., in German or Dutch)
241
242							# does the left fragment already end in -, etc.?
243							# if it does, don't add a $hyphen.
244	0						my $h = $hyphen;
245	0						$i = ord(substr($trial, -1, 1)); # last character in left fragment
246	0						foreach (@suppressHyphen) {
247	0	0					if ($i == $_) { $h = ''; last; }
	0
	0
248							}
249							# left fragment ends in a space (used to be an NBSP)?
250							# remove space, and no hyphen
251	0	0					if ($i eq ' ') {
252	0						chop($trial);
253	0						$h = '';
254							}
255
256							# $width should already count the trailing space in the existing
257							# line, or full width if empty
258	0						$len = $self->advancewidth("$trial$h", %opts);
259	0	0					if ($len > $width) { next; }
	0
260
261							# TBD any letter doubling needed?
262	0						$leftWord = $trial.$h;
263	0						$rightWord = substr($word, $j+1);
264	0						last;
265							} # while splitLoc has content
266							# if fell through because no fragment was short enough, $leftWord and
267							# $rightWord were never reassigned, and effect is to leave the entire
268							# word for the next line.
269							}
270							# if 0 elements in @splitLoc, $leftWord and $rightWord already defaulted
271
272	0						return ($leftWord, $rightWord);
273							}
274
275							# remove soft hyphens (SHYs) from a word. assume is always #173 (good for
276							# Latin-1, CP-1252, UTF-8; might not work for some encodings) TBD might want
277							# to pass in current encoding, or what SHY value is.
278							# return list of break points where SHYs were removed
279							sub _removeSHY {
280	0			0			my ($word) = @_;
281
282	0						my @SHYs = ();
283	0						my $i = 0;
284
285	0						my @chars = split //, $word;
286	0						my $out = '';
287	0						foreach (@chars) {
288	0	0					if (ord($_) == 173) {
289							# it's a SHY, so remove from word, add to list
290	0						push @SHYs, ($i - 1);
291	0						next;
292							}
293	0						$out .= $_;
294	0						$i++;
295							}
296	0						return ($out, @SHYs);
297							}
298
299							1;