| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package Text::Hyphenate; |
|
2
|
|
|
|
|
|
|
|
|
3
|
1
|
|
|
1
|
|
2298
|
use strict; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
46
|
|
|
4
|
1
|
|
|
1
|
|
5
|
use vars qw($VERSION @ISA @EXPORT @EXPORT_OK $DEBUG); |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
122
|
|
|
5
|
1
|
|
|
1
|
|
488
|
BEGIN { $DEBUG = 0 } |
|
6
|
|
|
|
|
|
|
require Exporter; |
|
7
|
|
|
|
|
|
|
|
|
8
|
|
|
|
|
|
|
@ISA = qw(Exporter); |
|
9
|
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
my $h; |
|
12
|
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
while () { |
|
14
|
|
|
|
|
|
|
last if /-{32}/; |
|
15
|
|
|
|
|
|
|
s/\r?\n$//; |
|
16
|
|
|
|
|
|
|
# print "--- $_\n"; |
|
17
|
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
my ($tag, $value, $begin, $end); |
|
19
|
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
$begin = 1 if s/^\.//; |
|
21
|
|
|
|
|
|
|
$end = 1 if s/\.$//; |
|
22
|
|
|
|
|
|
|
s/(\D)(?!\d)/${1}0/g; |
|
23
|
|
|
|
|
|
|
s/^(?!\d)/0/; |
|
24
|
|
|
|
|
|
|
($tag = lc $_) =~ tr/0-9//d; |
|
25
|
|
|
|
|
|
|
($value = $_) =~ tr/0-9//cd; |
|
26
|
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
# print "$_: TAG $tag VALUE $value\n"; |
|
28
|
|
|
|
|
|
|
if ($begin and $end) { |
|
29
|
|
|
|
|
|
|
$h->{both}{$tag} = $value; |
|
30
|
|
|
|
|
|
|
} elsif ($begin) { |
|
31
|
|
|
|
|
|
|
$h->{begin}{$tag} = $value; |
|
32
|
|
|
|
|
|
|
} elsif ($end) { |
|
33
|
|
|
|
|
|
|
$h->{end}{$tag} = $value; |
|
34
|
|
|
|
|
|
|
} else { |
|
35
|
|
|
|
|
|
|
$h->{hyphen}{$tag} = $value; |
|
36
|
|
|
|
|
|
|
} |
|
37
|
|
|
|
|
|
|
} |
|
38
|
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
while () { |
|
40
|
|
|
|
|
|
|
last if /-{32}/; |
|
41
|
|
|
|
|
|
|
chomp; |
|
42
|
|
|
|
|
|
|
my ($tag, $value); |
|
43
|
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
($tag = lc $_) =~ tr/-//d; |
|
45
|
|
|
|
|
|
|
($value = '0' . $_) =~ s/[^-](?!-)/0/g; |
|
46
|
|
|
|
|
|
|
$value =~ s/[^-]-/1/g; |
|
47
|
|
|
|
|
|
|
$value =~ tr/01/0/c; |
|
48
|
|
|
|
|
|
|
$h->{exact}{$tag} = $value; |
|
49
|
|
|
|
|
|
|
} |
|
50
|
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
@EXPORT_OK = qw(hyphenate fill_par); |
|
52
|
|
|
|
|
|
|
$VERSION = '0.02'; |
|
53
|
|
|
|
|
|
|
|
|
54
|
1
|
|
|
1
|
|
15
|
use vars qw($RAGGED_RIGHT $RAGGED_LEFT $JUSTIFY $CENTER); |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
3011
|
|
|
55
|
|
|
|
|
|
|
$RAGGED_RIGHT=0; $RAGGED_LEFT=1; $JUSTIFY=2; $CENTER=3; |
|
56
|
|
|
|
|
|
|
|
|
57
|
|
|
|
|
|
|
# hyphenate($long_string, $width, [$fill_mode]); |
|
58
|
|
|
|
|
|
|
sub hyphenate { |
|
59
|
16
|
|
|
16
|
0
|
2523
|
my (@chunks) = split(/(\n[ \t]*\n)/, $_[0]); |
|
60
|
16
|
|
|
|
|
24
|
my $result; |
|
61
|
16
|
|
|
|
|
116
|
while (@chunks) { |
|
62
|
48
|
|
|
|
|
125
|
local $^W = 0; |
|
63
|
48
|
|
|
|
|
108
|
my $par = hyphenate_par(shift @chunks, $_[1], $_[2]); |
|
64
|
48
|
|
|
|
|
306
|
$result .= $par . (shift @chunks); |
|
65
|
|
|
|
|
|
|
} |
|
66
|
16
|
|
|
|
|
208
|
$result; |
|
67
|
|
|
|
|
|
|
} |
|
68
|
|
|
|
|
|
|
|
|
69
|
|
|
|
|
|
|
my $PENALTY = 500; |
|
70
|
|
|
|
|
|
|
{ my %cache; |
|
71
|
|
|
|
|
|
|
my $a = 50000/9; |
|
72
|
|
|
|
|
|
|
my $b = 40000/9; |
|
73
|
|
|
|
|
|
|
sub badness { |
|
74
|
1380
|
|
|
1380
|
0
|
1496
|
my ($length, $target) = @_; |
|
75
|
1380
|
|
|
|
|
2248
|
my $bd = $cache{$length,$target}; |
|
76
|
1380
|
100
|
|
|
|
2972
|
return $bd if defined $bd; |
|
77
|
33
|
100
|
|
|
|
60
|
return $cache{$length,$target} = 1000 if $length > $target; |
|
78
|
29
|
|
|
|
|
41
|
my $shortfall = ($target - $length)/$target; |
|
79
|
29
|
|
|
|
|
85
|
$bd = $cache{$length,$target} = ($shortfall * $a + $b) * $shortfall; |
|
80
|
29
|
|
|
|
|
43
|
$bd; |
|
81
|
|
|
|
|
|
|
} |
|
82
|
|
|
|
|
|
|
} |
|
83
|
|
|
|
|
|
|
|
|
84
|
|
|
|
|
|
|
sub hyphenate_par { |
|
85
|
48
|
|
|
48
|
0
|
70
|
my ($par, $target, $mode) = @_; |
|
86
|
48
|
|
|
|
|
39
|
my $result; |
|
87
|
48
|
|
|
|
|
82
|
my @wordsets = breakup_par($par, $target); |
|
88
|
48
|
|
|
|
|
267
|
my $i; |
|
89
|
48
|
|
|
|
|
115
|
for ($i=0; $i < @wordsets; $i++) { |
|
90
|
1428
|
|
|
|
|
1519
|
my $wordset = $wordsets[$i]; |
|
91
|
1428
|
100
|
100
|
|
|
2854
|
$mode = $RAGGED_RIGHT if $i == $#wordsets && $mode == $JUSTIFY; |
|
92
|
1428
|
|
|
|
|
2304
|
$result .= align_words($wordset, $target, $mode); |
|
93
|
1428
|
|
|
|
|
3510
|
$result .= "\n"; |
|
94
|
|
|
|
|
|
|
} |
|
95
|
48
|
|
|
|
|
620
|
$result; |
|
96
|
|
|
|
|
|
|
} |
|
97
|
|
|
|
|
|
|
|
|
98
|
|
|
|
|
|
|
sub align_words { |
|
99
|
1428
|
|
|
1428
|
0
|
1515
|
my ($wds, $t, $mode) = @_; |
|
100
|
1428
|
|
|
|
|
1281
|
my $l; |
|
101
|
|
|
|
|
|
|
my $w; |
|
102
|
1428
|
100
|
|
|
|
1871
|
if ($mode != $JUSTIFY) { |
|
103
|
1083
|
|
|
|
|
1288
|
for $w (@$wds) { |
|
104
|
3797
|
|
|
|
|
3432
|
$l .= $w; |
|
105
|
3797
|
100
|
|
|
|
7343
|
$l .= ' ' unless $w =~ /-$/; |
|
106
|
|
|
|
|
|
|
} |
|
107
|
1083
|
|
|
|
|
3172
|
$l =~ s/\s+$//; |
|
108
|
1083
|
100
|
|
|
|
2086
|
my $mul = $mode == $RAGGED_LEFT ? 1 |
|
|
|
100
|
|
|
|
|
|
|
109
|
|
|
|
|
|
|
: $mode == $CENTER ? 0.5 : 0 |
|
110
|
|
|
|
|
|
|
; |
|
111
|
1083
|
|
|
|
|
2149
|
$l = (' ' x ($mul * ($t - length $l))) . $l; |
|
112
|
|
|
|
|
|
|
} else { |
|
113
|
345
|
|
|
|
|
297
|
my $i; |
|
114
|
|
|
|
|
|
|
my @wds; |
|
115
|
0
|
|
|
|
|
0
|
my $tl; |
|
116
|
345
|
|
|
|
|
666
|
for ($i = 0; $i < @$wds; $i++) { |
|
117
|
1219
|
|
|
|
|
2629
|
local $^W = 0; |
|
118
|
1219
|
|
|
|
|
1477
|
$w = $wds->[$i]; |
|
119
|
1219
|
|
|
|
|
1212
|
$tl += length $w; |
|
120
|
1219
|
100
|
|
|
|
1962
|
if ($w =~ /-$/) { |
|
121
|
38
|
|
|
|
|
75
|
push @wds, $w . $wds->[$i+1] ; |
|
122
|
38
|
|
|
|
|
51
|
$tl += length($wds->[$i+1]); |
|
123
|
38
|
|
|
|
|
96
|
++$i; |
|
124
|
|
|
|
|
|
|
} else { |
|
125
|
1181
|
|
|
|
|
3214
|
push @wds, $w; |
|
126
|
|
|
|
|
|
|
} |
|
127
|
|
|
|
|
|
|
} |
|
128
|
345
|
|
|
|
|
376
|
my $padding = $t - $tl; |
|
129
|
345
|
|
|
|
|
658
|
for ($i = 0; $i < @wds; $i++) { |
|
130
|
1219
|
|
|
|
|
1289
|
$l .= $wds[$i]; |
|
131
|
1219
|
100
|
|
|
|
2235
|
last if $i == $#wds; |
|
132
|
874
|
|
|
|
|
1195
|
my $spl = int($padding / ($#wds - $i) + .5); |
|
133
|
874
|
|
|
|
|
854
|
$padding -= $spl; |
|
134
|
874
|
|
|
|
|
1868
|
$l .= (' ' x $spl); |
|
135
|
|
|
|
|
|
|
} |
|
136
|
|
|
|
|
|
|
} |
|
137
|
1428
|
|
|
|
|
2380
|
$l; |
|
138
|
|
|
|
|
|
|
} |
|
139
|
|
|
|
|
|
|
|
|
140
|
|
|
|
|
|
|
sub breakup_par { |
|
141
|
48
|
|
|
48
|
0
|
56
|
my ($par, $target) = @_; |
|
142
|
48
|
|
|
|
|
58
|
my ($r, $l) = ('', 0); |
|
143
|
48
|
|
|
|
|
42
|
my ($w, @ln); |
|
144
|
48
|
|
|
|
|
7382
|
my @words = grep $_ ne '', split(/\s+|([\w\']+[^\w\s]*)/, $par); |
|
145
|
|
|
|
|
|
|
# print "Split into: (@words)\n"; |
|
146
|
48
|
|
|
|
|
854
|
my @lines; |
|
147
|
|
|
|
|
|
|
|
|
148
|
48
|
|
|
|
|
112
|
while (defined($w = shift @words)) { |
|
149
|
6244
|
|
|
|
|
6739
|
my $wl = length $w; |
|
150
|
|
|
|
|
|
|
|
|
151
|
6244
|
100
|
|
|
|
8533
|
if ($wl + $l <= $target) { |
|
152
|
4864
|
|
|
|
|
5226
|
push @ln, $w; |
|
153
|
4864
|
|
|
|
|
4102
|
$l += length($w); |
|
154
|
4864
|
50
|
|
|
|
15056
|
$l += 1 unless $w =~ /-$/; |
|
155
|
|
|
|
|
|
|
} else { |
|
156
|
1380
|
|
|
|
|
2038
|
my $b = badness($l, $target); |
|
157
|
1380
|
50
|
|
|
|
2336
|
print "Badness of this line ($l/$target) is $b ($PENALTY)\n" if $DEBUG; |
|
158
|
1380
|
100
|
|
|
|
2362
|
if ($b >= $PENALTY) { |
|
159
|
840
|
|
|
|
|
1143
|
my @splitpos = hyphen_pos($w); |
|
160
|
840
|
50
|
|
|
|
1487
|
print "`$w' hyphenates at positions: (@splitpos).\n" if $DEBUG; |
|
161
|
840
|
|
|
|
|
876
|
my ($longest, $good_hyphen) = (0,0); |
|
162
|
840
|
|
|
|
|
1095
|
for $longest (@splitpos) { |
|
163
|
356
|
100
|
|
|
|
780
|
$good_hyphen=$longest, last if $longest + $l + 1 <= $target; |
|
164
|
|
|
|
|
|
|
} |
|
165
|
840
|
100
|
|
|
|
1196
|
if ($good_hyphen) { |
|
166
|
152
|
50
|
|
|
|
251
|
print "$good_hyphen is the best place to hyphenate `$w'.\n" |
|
167
|
|
|
|
|
|
|
if $DEBUG; |
|
168
|
152
|
|
|
|
|
270
|
push @ln, (substr($w, 0, $good_hyphen) . '-'); |
|
169
|
152
|
|
|
|
|
255
|
substr($w, 0, $good_hyphen) = ''; |
|
170
|
|
|
|
|
|
|
} else { |
|
171
|
688
|
50
|
|
|
|
1289
|
print "No improvement from hyphenating `$w'.\n" |
|
172
|
|
|
|
|
|
|
if $DEBUG; |
|
173
|
|
|
|
|
|
|
} |
|
174
|
|
|
|
|
|
|
} |
|
175
|
1380
|
|
|
|
|
6462
|
push @lines, [@ln]; |
|
176
|
1380
|
|
|
|
|
1992
|
@ln = (); |
|
177
|
1380
|
|
|
|
|
1237
|
$l = 0; |
|
178
|
1380
|
|
|
|
|
1533
|
redo; |
|
179
|
|
|
|
|
|
|
} |
|
180
|
|
|
|
|
|
|
} |
|
181
|
48
|
50
|
|
|
|
112
|
push @lines, \@ln if @ln; |
|
182
|
48
|
|
|
|
|
311
|
@lines; |
|
183
|
|
|
|
|
|
|
} |
|
184
|
|
|
|
|
|
|
|
|
185
|
|
|
|
|
|
|
my ($LEFTMIN, $RIGHTMIN) = (2, 2); |
|
186
|
|
|
|
|
|
|
sub hyphen_pos { |
|
187
|
840
|
|
|
840
|
0
|
1226
|
hyphen_pos_aux($h, @_); |
|
188
|
|
|
|
|
|
|
} |
|
189
|
|
|
|
|
|
|
|
|
190
|
|
|
|
|
|
|
{ my %cache ; |
|
191
|
|
|
|
|
|
|
|
|
192
|
|
|
|
|
|
|
sub hyphen_pos_aux { |
|
193
|
840
|
|
|
840
|
0
|
996
|
my ($h, $word) = @_; |
|
194
|
|
|
|
|
|
|
|
|
195
|
840
|
100
|
|
|
|
1596
|
return @{$cache{$word}} if exists $cache{$word}; |
|
|
762
|
|
|
|
|
1749
|
|
|
196
|
78
|
50
|
|
|
|
104
|
print STDERR "Hyphenate `$word'\n" if $DEBUG; |
|
197
|
|
|
|
|
|
|
|
|
198
|
78
|
|
|
|
|
100
|
my $exact = $h->{exact}; |
|
199
|
78
|
50
|
|
|
|
165
|
if (defined(my $res = $exact->{$word})) { |
|
200
|
0
|
0
|
|
|
|
0
|
print STDERR "Exact match $res\n" if $DEBUG; |
|
201
|
0
|
|
|
|
|
0
|
my @result = split //, $res; |
|
202
|
0
|
|
|
|
|
0
|
my @result_list = reverse grep $result[$_] % 2, (0 .. $#result); |
|
203
|
0
|
|
|
|
|
0
|
$cache{$word} = \@result_list; |
|
204
|
0
|
|
|
|
|
0
|
return @result_list; |
|
205
|
|
|
|
|
|
|
} |
|
206
|
|
|
|
|
|
|
|
|
207
|
78
|
|
|
|
|
87
|
my $hyphen = $h->{hyphen}; |
|
208
|
78
|
|
|
|
|
86
|
my $beginhyphen = $h->{begin}; |
|
209
|
78
|
|
|
|
|
70
|
my $endhyphen = $h->{end}; |
|
210
|
78
|
|
|
|
|
81
|
my $bothhyphen = $h->{both}; |
|
211
|
|
|
|
|
|
|
|
|
212
|
78
|
|
|
|
|
70
|
my $totallength = length $word; |
|
213
|
78
|
|
|
|
|
161
|
my @result = (0) x ($totallength + 1); |
|
214
|
|
|
|
|
|
|
|
|
215
|
|
|
|
|
|
|
# walk the word |
|
216
|
78
|
|
|
|
|
81
|
my $rightstop = $totallength - $RIGHTMIN; |
|
217
|
78
|
|
|
|
|
57
|
my $pos; |
|
218
|
78
|
|
|
|
|
568
|
for ($pos = 0; $pos <= $rightstop; $pos++) { |
|
219
|
|
|
|
|
|
|
# length of the rest of the word |
|
220
|
327
|
|
|
|
|
309
|
my $restlength = $totallength - $pos; |
|
221
|
|
|
|
|
|
|
# length of a substring |
|
222
|
327
|
|
|
|
|
249
|
my $length; |
|
223
|
327
|
|
|
|
|
512
|
for ($length = 1; $length <= $restlength; $length++) { |
|
224
|
1308
|
|
|
|
|
1457
|
my $substr = substr $word, $pos, $length; |
|
225
|
1308
|
|
|
|
|
1050
|
my $value; |
|
226
|
|
|
|
|
|
|
my $j; |
|
227
|
0
|
|
|
|
|
0
|
my $letter; |
|
228
|
1308
|
100
|
|
|
|
2738
|
if (defined($value = $hyphen->{$substr})) { |
|
229
|
166
|
|
|
|
|
141
|
$j = $pos; |
|
230
|
166
|
50
|
|
|
|
320
|
print STDERR "$j: $substr: $value\n" if $DEBUG > 2; |
|
231
|
166
|
|
|
|
|
458
|
while ($value =~ /(.)/gs) { |
|
232
|
618
|
100
|
|
|
|
1240
|
$result[$j] = $1 if ($1 > $result[$j]); |
|
233
|
618
|
|
|
|
|
1332
|
$j++; |
|
234
|
|
|
|
|
|
|
} |
|
235
|
|
|
|
|
|
|
} |
|
236
|
1308
|
100
|
100
|
|
|
3146
|
if (($pos == 0) and |
|
237
|
|
|
|
|
|
|
defined($value = $beginhyphen->{$substr})) { |
|
238
|
16
|
|
|
|
|
14
|
$j = 0; |
|
239
|
16
|
50
|
|
|
|
28
|
print STDERR "$j: $substr: $value\n" if $DEBUG > 2; |
|
240
|
16
|
|
|
|
|
63
|
while ($value =~ /(.)/gs) { |
|
241
|
57
|
100
|
|
|
|
123
|
$result[$j] = $1 if ($1 > $result[$j]); |
|
242
|
57
|
|
|
|
|
127
|
$j++; |
|
243
|
|
|
|
|
|
|
} |
|
244
|
|
|
|
|
|
|
} |
|
245
|
1308
|
100
|
100
|
|
|
4468
|
if (($restlength == $length) and |
|
246
|
|
|
|
|
|
|
defined($value = $endhyphen->{$substr})) { |
|
247
|
16
|
|
|
|
|
18
|
$j = $pos; |
|
248
|
16
|
50
|
|
|
|
29
|
print STDERR "$j: $substr: $value\n" if $DEBUG > 2; |
|
249
|
16
|
|
|
|
|
53
|
while ($value =~ /(.)/gs) { |
|
250
|
50
|
100
|
|
|
|
108
|
$result[$j] = $1 if ($1 > $result[$j]); |
|
251
|
50
|
|
|
|
|
147
|
$j++; |
|
252
|
|
|
|
|
|
|
} |
|
253
|
|
|
|
|
|
|
} |
|
254
|
|
|
|
|
|
|
} |
|
255
|
|
|
|
|
|
|
} |
|
256
|
78
|
|
|
|
|
65
|
my $value; |
|
257
|
|
|
|
|
|
|
my $letter; |
|
258
|
78
|
50
|
|
|
|
156
|
if (defined($value = $bothhyphen->{$word})) { |
|
259
|
0
|
|
|
|
|
0
|
my $j = 0; |
|
260
|
0
|
0
|
|
|
|
0
|
print STDERR "$j: $word: $value\n" if $DEBUG > 2; |
|
261
|
0
|
|
|
|
|
0
|
while ($value =~ /(.)/gs) { |
|
262
|
0
|
0
|
|
|
|
0
|
$result[$j] = $1 if ($1 > $result[$j]); |
|
263
|
0
|
|
|
|
|
0
|
$j++; |
|
264
|
|
|
|
|
|
|
} |
|
265
|
|
|
|
|
|
|
} |
|
266
|
|
|
|
|
|
|
|
|
267
|
|
|
|
|
|
|
### substr($result, 0, $LEFTMIN + 1) = '0' x ($LEFTMIN + 1); |
|
268
|
78
|
|
|
|
|
188
|
@result[0..$LEFTMIN-1] = ((0) x $LEFTMIN); |
|
269
|
78
|
|
|
|
|
136
|
@result[-$RIGHTMIN..-1] = ((0) x $RIGHTMIN); |
|
270
|
|
|
|
|
|
|
|
|
271
|
78
|
|
|
|
|
281
|
my @result_list = reverse grep $result[$_] % 2, (0 .. $#result); |
|
272
|
78
|
50
|
|
|
|
142
|
print STDERR "Result: @result_list\n" if $DEBUG; |
|
273
|
78
|
|
|
|
|
169
|
$cache{$word} = \@result_list; |
|
274
|
78
|
|
|
|
|
289
|
@result_list; |
|
275
|
|
|
|
|
|
|
} |
|
276
|
|
|
|
|
|
|
} |
|
277
|
|
|
|
|
|
|
|
|
278
|
|
|
|
|
|
|
|
|
279
|
|
|
|
|
|
|
# ######################################### |
|
280
|
|
|
|
|
|
|
# For a word show the result of hyphenation |
|
281
|
|
|
|
|
|
|
# |
|
282
|
|
|
|
|
|
|
sub hyphenate_word { |
|
283
|
0
|
|
|
0
|
0
|
|
my $pos; |
|
284
|
0
|
|
|
|
|
|
my ($word) = @_; |
|
285
|
0
|
|
|
|
|
|
for $pos (hyphen_pos_aux($h, $word)) { |
|
286
|
0
|
|
|
|
|
|
substr($word, $pos, 0) = "-"; |
|
287
|
|
|
|
|
|
|
} |
|
288
|
0
|
|
|
|
|
|
$word; |
|
289
|
|
|
|
|
|
|
} |
|
290
|
|
|
|
|
|
|
|
|
291
|
|
|
|
|
|
|
1; |
|
292
|
|
|
|
|
|
|
__DATA__ |