line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Text::Hyphenate; |
2
|
|
|
|
|
|
|
|
3
|
1
|
|
|
1
|
|
2298
|
use strict; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
46
|
|
4
|
1
|
|
|
1
|
|
5
|
use vars qw($VERSION @ISA @EXPORT @EXPORT_OK $DEBUG); |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
122
|
|
5
|
1
|
|
|
1
|
|
488
|
BEGIN { $DEBUG = 0 } |
6
|
|
|
|
|
|
|
require Exporter; |
7
|
|
|
|
|
|
|
|
8
|
|
|
|
|
|
|
@ISA = qw(Exporter); |
9
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
my $h; |
12
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
while () { |
14
|
|
|
|
|
|
|
last if /-{32}/; |
15
|
|
|
|
|
|
|
s/\r?\n$//; |
16
|
|
|
|
|
|
|
# print "--- $_\n"; |
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
my ($tag, $value, $begin, $end); |
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
$begin = 1 if s/^\.//; |
21
|
|
|
|
|
|
|
$end = 1 if s/\.$//; |
22
|
|
|
|
|
|
|
s/(\D)(?!\d)/${1}0/g; |
23
|
|
|
|
|
|
|
s/^(?!\d)/0/; |
24
|
|
|
|
|
|
|
($tag = lc $_) =~ tr/0-9//d; |
25
|
|
|
|
|
|
|
($value = $_) =~ tr/0-9//cd; |
26
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
# print "$_: TAG $tag VALUE $value\n"; |
28
|
|
|
|
|
|
|
if ($begin and $end) { |
29
|
|
|
|
|
|
|
$h->{both}{$tag} = $value; |
30
|
|
|
|
|
|
|
} elsif ($begin) { |
31
|
|
|
|
|
|
|
$h->{begin}{$tag} = $value; |
32
|
|
|
|
|
|
|
} elsif ($end) { |
33
|
|
|
|
|
|
|
$h->{end}{$tag} = $value; |
34
|
|
|
|
|
|
|
} else { |
35
|
|
|
|
|
|
|
$h->{hyphen}{$tag} = $value; |
36
|
|
|
|
|
|
|
} |
37
|
|
|
|
|
|
|
} |
38
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
while () { |
40
|
|
|
|
|
|
|
last if /-{32}/; |
41
|
|
|
|
|
|
|
chomp; |
42
|
|
|
|
|
|
|
my ($tag, $value); |
43
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
($tag = lc $_) =~ tr/-//d; |
45
|
|
|
|
|
|
|
($value = '0' . $_) =~ s/[^-](?!-)/0/g; |
46
|
|
|
|
|
|
|
$value =~ s/[^-]-/1/g; |
47
|
|
|
|
|
|
|
$value =~ tr/01/0/c; |
48
|
|
|
|
|
|
|
$h->{exact}{$tag} = $value; |
49
|
|
|
|
|
|
|
} |
50
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
@EXPORT_OK = qw(hyphenate fill_par); |
52
|
|
|
|
|
|
|
$VERSION = '0.02'; |
53
|
|
|
|
|
|
|
|
54
|
1
|
|
|
1
|
|
15
|
use vars qw($RAGGED_RIGHT $RAGGED_LEFT $JUSTIFY $CENTER); |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
3011
|
|
55
|
|
|
|
|
|
|
$RAGGED_RIGHT=0; $RAGGED_LEFT=1; $JUSTIFY=2; $CENTER=3; |
56
|
|
|
|
|
|
|
|
57
|
|
|
|
|
|
|
# hyphenate($long_string, $width, [$fill_mode]); |
58
|
|
|
|
|
|
|
sub hyphenate { |
59
|
16
|
|
|
16
|
0
|
2523
|
my (@chunks) = split(/(\n[ \t]*\n)/, $_[0]); |
60
|
16
|
|
|
|
|
24
|
my $result; |
61
|
16
|
|
|
|
|
116
|
while (@chunks) { |
62
|
48
|
|
|
|
|
125
|
local $^W = 0; |
63
|
48
|
|
|
|
|
108
|
my $par = hyphenate_par(shift @chunks, $_[1], $_[2]); |
64
|
48
|
|
|
|
|
306
|
$result .= $par . (shift @chunks); |
65
|
|
|
|
|
|
|
} |
66
|
16
|
|
|
|
|
208
|
$result; |
67
|
|
|
|
|
|
|
} |
68
|
|
|
|
|
|
|
|
69
|
|
|
|
|
|
|
my $PENALTY = 500; |
70
|
|
|
|
|
|
|
{ my %cache; |
71
|
|
|
|
|
|
|
my $a = 50000/9; |
72
|
|
|
|
|
|
|
my $b = 40000/9; |
73
|
|
|
|
|
|
|
sub badness { |
74
|
1380
|
|
|
1380
|
0
|
1496
|
my ($length, $target) = @_; |
75
|
1380
|
|
|
|
|
2248
|
my $bd = $cache{$length,$target}; |
76
|
1380
|
100
|
|
|
|
2972
|
return $bd if defined $bd; |
77
|
33
|
100
|
|
|
|
60
|
return $cache{$length,$target} = 1000 if $length > $target; |
78
|
29
|
|
|
|
|
41
|
my $shortfall = ($target - $length)/$target; |
79
|
29
|
|
|
|
|
85
|
$bd = $cache{$length,$target} = ($shortfall * $a + $b) * $shortfall; |
80
|
29
|
|
|
|
|
43
|
$bd; |
81
|
|
|
|
|
|
|
} |
82
|
|
|
|
|
|
|
} |
83
|
|
|
|
|
|
|
|
84
|
|
|
|
|
|
|
sub hyphenate_par { |
85
|
48
|
|
|
48
|
0
|
70
|
my ($par, $target, $mode) = @_; |
86
|
48
|
|
|
|
|
39
|
my $result; |
87
|
48
|
|
|
|
|
82
|
my @wordsets = breakup_par($par, $target); |
88
|
48
|
|
|
|
|
267
|
my $i; |
89
|
48
|
|
|
|
|
115
|
for ($i=0; $i < @wordsets; $i++) { |
90
|
1428
|
|
|
|
|
1519
|
my $wordset = $wordsets[$i]; |
91
|
1428
|
100
|
100
|
|
|
2854
|
$mode = $RAGGED_RIGHT if $i == $#wordsets && $mode == $JUSTIFY; |
92
|
1428
|
|
|
|
|
2304
|
$result .= align_words($wordset, $target, $mode); |
93
|
1428
|
|
|
|
|
3510
|
$result .= "\n"; |
94
|
|
|
|
|
|
|
} |
95
|
48
|
|
|
|
|
620
|
$result; |
96
|
|
|
|
|
|
|
} |
97
|
|
|
|
|
|
|
|
98
|
|
|
|
|
|
|
sub align_words { |
99
|
1428
|
|
|
1428
|
0
|
1515
|
my ($wds, $t, $mode) = @_; |
100
|
1428
|
|
|
|
|
1281
|
my $l; |
101
|
|
|
|
|
|
|
my $w; |
102
|
1428
|
100
|
|
|
|
1871
|
if ($mode != $JUSTIFY) { |
103
|
1083
|
|
|
|
|
1288
|
for $w (@$wds) { |
104
|
3797
|
|
|
|
|
3432
|
$l .= $w; |
105
|
3797
|
100
|
|
|
|
7343
|
$l .= ' ' unless $w =~ /-$/; |
106
|
|
|
|
|
|
|
} |
107
|
1083
|
|
|
|
|
3172
|
$l =~ s/\s+$//; |
108
|
1083
|
100
|
|
|
|
2086
|
my $mul = $mode == $RAGGED_LEFT ? 1 |
|
|
100
|
|
|
|
|
|
109
|
|
|
|
|
|
|
: $mode == $CENTER ? 0.5 : 0 |
110
|
|
|
|
|
|
|
; |
111
|
1083
|
|
|
|
|
2149
|
$l = (' ' x ($mul * ($t - length $l))) . $l; |
112
|
|
|
|
|
|
|
} else { |
113
|
345
|
|
|
|
|
297
|
my $i; |
114
|
|
|
|
|
|
|
my @wds; |
115
|
0
|
|
|
|
|
0
|
my $tl; |
116
|
345
|
|
|
|
|
666
|
for ($i = 0; $i < @$wds; $i++) { |
117
|
1219
|
|
|
|
|
2629
|
local $^W = 0; |
118
|
1219
|
|
|
|
|
1477
|
$w = $wds->[$i]; |
119
|
1219
|
|
|
|
|
1212
|
$tl += length $w; |
120
|
1219
|
100
|
|
|
|
1962
|
if ($w =~ /-$/) { |
121
|
38
|
|
|
|
|
75
|
push @wds, $w . $wds->[$i+1] ; |
122
|
38
|
|
|
|
|
51
|
$tl += length($wds->[$i+1]); |
123
|
38
|
|
|
|
|
96
|
++$i; |
124
|
|
|
|
|
|
|
} else { |
125
|
1181
|
|
|
|
|
3214
|
push @wds, $w; |
126
|
|
|
|
|
|
|
} |
127
|
|
|
|
|
|
|
} |
128
|
345
|
|
|
|
|
376
|
my $padding = $t - $tl; |
129
|
345
|
|
|
|
|
658
|
for ($i = 0; $i < @wds; $i++) { |
130
|
1219
|
|
|
|
|
1289
|
$l .= $wds[$i]; |
131
|
1219
|
100
|
|
|
|
2235
|
last if $i == $#wds; |
132
|
874
|
|
|
|
|
1195
|
my $spl = int($padding / ($#wds - $i) + .5); |
133
|
874
|
|
|
|
|
854
|
$padding -= $spl; |
134
|
874
|
|
|
|
|
1868
|
$l .= (' ' x $spl); |
135
|
|
|
|
|
|
|
} |
136
|
|
|
|
|
|
|
} |
137
|
1428
|
|
|
|
|
2380
|
$l; |
138
|
|
|
|
|
|
|
} |
139
|
|
|
|
|
|
|
|
140
|
|
|
|
|
|
|
sub breakup_par { |
141
|
48
|
|
|
48
|
0
|
56
|
my ($par, $target) = @_; |
142
|
48
|
|
|
|
|
58
|
my ($r, $l) = ('', 0); |
143
|
48
|
|
|
|
|
42
|
my ($w, @ln); |
144
|
48
|
|
|
|
|
7382
|
my @words = grep $_ ne '', split(/\s+|([\w\']+[^\w\s]*)/, $par); |
145
|
|
|
|
|
|
|
# print "Split into: (@words)\n"; |
146
|
48
|
|
|
|
|
854
|
my @lines; |
147
|
|
|
|
|
|
|
|
148
|
48
|
|
|
|
|
112
|
while (defined($w = shift @words)) { |
149
|
6244
|
|
|
|
|
6739
|
my $wl = length $w; |
150
|
|
|
|
|
|
|
|
151
|
6244
|
100
|
|
|
|
8533
|
if ($wl + $l <= $target) { |
152
|
4864
|
|
|
|
|
5226
|
push @ln, $w; |
153
|
4864
|
|
|
|
|
4102
|
$l += length($w); |
154
|
4864
|
50
|
|
|
|
15056
|
$l += 1 unless $w =~ /-$/; |
155
|
|
|
|
|
|
|
} else { |
156
|
1380
|
|
|
|
|
2038
|
my $b = badness($l, $target); |
157
|
1380
|
50
|
|
|
|
2336
|
print "Badness of this line ($l/$target) is $b ($PENALTY)\n" if $DEBUG; |
158
|
1380
|
100
|
|
|
|
2362
|
if ($b >= $PENALTY) { |
159
|
840
|
|
|
|
|
1143
|
my @splitpos = hyphen_pos($w); |
160
|
840
|
50
|
|
|
|
1487
|
print "`$w' hyphenates at positions: (@splitpos).\n" if $DEBUG; |
161
|
840
|
|
|
|
|
876
|
my ($longest, $good_hyphen) = (0,0); |
162
|
840
|
|
|
|
|
1095
|
for $longest (@splitpos) { |
163
|
356
|
100
|
|
|
|
780
|
$good_hyphen=$longest, last if $longest + $l + 1 <= $target; |
164
|
|
|
|
|
|
|
} |
165
|
840
|
100
|
|
|
|
1196
|
if ($good_hyphen) { |
166
|
152
|
50
|
|
|
|
251
|
print "$good_hyphen is the best place to hyphenate `$w'.\n" |
167
|
|
|
|
|
|
|
if $DEBUG; |
168
|
152
|
|
|
|
|
270
|
push @ln, (substr($w, 0, $good_hyphen) . '-'); |
169
|
152
|
|
|
|
|
255
|
substr($w, 0, $good_hyphen) = ''; |
170
|
|
|
|
|
|
|
} else { |
171
|
688
|
50
|
|
|
|
1289
|
print "No improvement from hyphenating `$w'.\n" |
172
|
|
|
|
|
|
|
if $DEBUG; |
173
|
|
|
|
|
|
|
} |
174
|
|
|
|
|
|
|
} |
175
|
1380
|
|
|
|
|
6462
|
push @lines, [@ln]; |
176
|
1380
|
|
|
|
|
1992
|
@ln = (); |
177
|
1380
|
|
|
|
|
1237
|
$l = 0; |
178
|
1380
|
|
|
|
|
1533
|
redo; |
179
|
|
|
|
|
|
|
} |
180
|
|
|
|
|
|
|
} |
181
|
48
|
50
|
|
|
|
112
|
push @lines, \@ln if @ln; |
182
|
48
|
|
|
|
|
311
|
@lines; |
183
|
|
|
|
|
|
|
} |
184
|
|
|
|
|
|
|
|
185
|
|
|
|
|
|
|
my ($LEFTMIN, $RIGHTMIN) = (2, 2); |
186
|
|
|
|
|
|
|
sub hyphen_pos { |
187
|
840
|
|
|
840
|
0
|
1226
|
hyphen_pos_aux($h, @_); |
188
|
|
|
|
|
|
|
} |
189
|
|
|
|
|
|
|
|
190
|
|
|
|
|
|
|
{ my %cache ; |
191
|
|
|
|
|
|
|
|
192
|
|
|
|
|
|
|
sub hyphen_pos_aux { |
193
|
840
|
|
|
840
|
0
|
996
|
my ($h, $word) = @_; |
194
|
|
|
|
|
|
|
|
195
|
840
|
100
|
|
|
|
1596
|
return @{$cache{$word}} if exists $cache{$word}; |
|
762
|
|
|
|
|
1749
|
|
196
|
78
|
50
|
|
|
|
104
|
print STDERR "Hyphenate `$word'\n" if $DEBUG; |
197
|
|
|
|
|
|
|
|
198
|
78
|
|
|
|
|
100
|
my $exact = $h->{exact}; |
199
|
78
|
50
|
|
|
|
165
|
if (defined(my $res = $exact->{$word})) { |
200
|
0
|
0
|
|
|
|
0
|
print STDERR "Exact match $res\n" if $DEBUG; |
201
|
0
|
|
|
|
|
0
|
my @result = split //, $res; |
202
|
0
|
|
|
|
|
0
|
my @result_list = reverse grep $result[$_] % 2, (0 .. $#result); |
203
|
0
|
|
|
|
|
0
|
$cache{$word} = \@result_list; |
204
|
0
|
|
|
|
|
0
|
return @result_list; |
205
|
|
|
|
|
|
|
} |
206
|
|
|
|
|
|
|
|
207
|
78
|
|
|
|
|
87
|
my $hyphen = $h->{hyphen}; |
208
|
78
|
|
|
|
|
86
|
my $beginhyphen = $h->{begin}; |
209
|
78
|
|
|
|
|
70
|
my $endhyphen = $h->{end}; |
210
|
78
|
|
|
|
|
81
|
my $bothhyphen = $h->{both}; |
211
|
|
|
|
|
|
|
|
212
|
78
|
|
|
|
|
70
|
my $totallength = length $word; |
213
|
78
|
|
|
|
|
161
|
my @result = (0) x ($totallength + 1); |
214
|
|
|
|
|
|
|
|
215
|
|
|
|
|
|
|
# walk the word |
216
|
78
|
|
|
|
|
81
|
my $rightstop = $totallength - $RIGHTMIN; |
217
|
78
|
|
|
|
|
57
|
my $pos; |
218
|
78
|
|
|
|
|
568
|
for ($pos = 0; $pos <= $rightstop; $pos++) { |
219
|
|
|
|
|
|
|
# length of the rest of the word |
220
|
327
|
|
|
|
|
309
|
my $restlength = $totallength - $pos; |
221
|
|
|
|
|
|
|
# length of a substring |
222
|
327
|
|
|
|
|
249
|
my $length; |
223
|
327
|
|
|
|
|
512
|
for ($length = 1; $length <= $restlength; $length++) { |
224
|
1308
|
|
|
|
|
1457
|
my $substr = substr $word, $pos, $length; |
225
|
1308
|
|
|
|
|
1050
|
my $value; |
226
|
|
|
|
|
|
|
my $j; |
227
|
0
|
|
|
|
|
0
|
my $letter; |
228
|
1308
|
100
|
|
|
|
2738
|
if (defined($value = $hyphen->{$substr})) { |
229
|
166
|
|
|
|
|
141
|
$j = $pos; |
230
|
166
|
50
|
|
|
|
320
|
print STDERR "$j: $substr: $value\n" if $DEBUG > 2; |
231
|
166
|
|
|
|
|
458
|
while ($value =~ /(.)/gs) { |
232
|
618
|
100
|
|
|
|
1240
|
$result[$j] = $1 if ($1 > $result[$j]); |
233
|
618
|
|
|
|
|
1332
|
$j++; |
234
|
|
|
|
|
|
|
} |
235
|
|
|
|
|
|
|
} |
236
|
1308
|
100
|
100
|
|
|
3146
|
if (($pos == 0) and |
237
|
|
|
|
|
|
|
defined($value = $beginhyphen->{$substr})) { |
238
|
16
|
|
|
|
|
14
|
$j = 0; |
239
|
16
|
50
|
|
|
|
28
|
print STDERR "$j: $substr: $value\n" if $DEBUG > 2; |
240
|
16
|
|
|
|
|
63
|
while ($value =~ /(.)/gs) { |
241
|
57
|
100
|
|
|
|
123
|
$result[$j] = $1 if ($1 > $result[$j]); |
242
|
57
|
|
|
|
|
127
|
$j++; |
243
|
|
|
|
|
|
|
} |
244
|
|
|
|
|
|
|
} |
245
|
1308
|
100
|
100
|
|
|
4468
|
if (($restlength == $length) and |
246
|
|
|
|
|
|
|
defined($value = $endhyphen->{$substr})) { |
247
|
16
|
|
|
|
|
18
|
$j = $pos; |
248
|
16
|
50
|
|
|
|
29
|
print STDERR "$j: $substr: $value\n" if $DEBUG > 2; |
249
|
16
|
|
|
|
|
53
|
while ($value =~ /(.)/gs) { |
250
|
50
|
100
|
|
|
|
108
|
$result[$j] = $1 if ($1 > $result[$j]); |
251
|
50
|
|
|
|
|
147
|
$j++; |
252
|
|
|
|
|
|
|
} |
253
|
|
|
|
|
|
|
} |
254
|
|
|
|
|
|
|
} |
255
|
|
|
|
|
|
|
} |
256
|
78
|
|
|
|
|
65
|
my $value; |
257
|
|
|
|
|
|
|
my $letter; |
258
|
78
|
50
|
|
|
|
156
|
if (defined($value = $bothhyphen->{$word})) { |
259
|
0
|
|
|
|
|
0
|
my $j = 0; |
260
|
0
|
0
|
|
|
|
0
|
print STDERR "$j: $word: $value\n" if $DEBUG > 2; |
261
|
0
|
|
|
|
|
0
|
while ($value =~ /(.)/gs) { |
262
|
0
|
0
|
|
|
|
0
|
$result[$j] = $1 if ($1 > $result[$j]); |
263
|
0
|
|
|
|
|
0
|
$j++; |
264
|
|
|
|
|
|
|
} |
265
|
|
|
|
|
|
|
} |
266
|
|
|
|
|
|
|
|
267
|
|
|
|
|
|
|
### substr($result, 0, $LEFTMIN + 1) = '0' x ($LEFTMIN + 1); |
268
|
78
|
|
|
|
|
188
|
@result[0..$LEFTMIN-1] = ((0) x $LEFTMIN); |
269
|
78
|
|
|
|
|
136
|
@result[-$RIGHTMIN..-1] = ((0) x $RIGHTMIN); |
270
|
|
|
|
|
|
|
|
271
|
78
|
|
|
|
|
281
|
my @result_list = reverse grep $result[$_] % 2, (0 .. $#result); |
272
|
78
|
50
|
|
|
|
142
|
print STDERR "Result: @result_list\n" if $DEBUG; |
273
|
78
|
|
|
|
|
169
|
$cache{$word} = \@result_list; |
274
|
78
|
|
|
|
|
289
|
@result_list; |
275
|
|
|
|
|
|
|
} |
276
|
|
|
|
|
|
|
} |
277
|
|
|
|
|
|
|
|
278
|
|
|
|
|
|
|
|
279
|
|
|
|
|
|
|
# ######################################### |
280
|
|
|
|
|
|
|
# For a word show the result of hyphenation |
281
|
|
|
|
|
|
|
# |
282
|
|
|
|
|
|
|
sub hyphenate_word { |
283
|
0
|
|
|
0
|
0
|
|
my $pos; |
284
|
0
|
|
|
|
|
|
my ($word) = @_; |
285
|
0
|
|
|
|
|
|
for $pos (hyphen_pos_aux($h, $word)) { |
286
|
0
|
|
|
|
|
|
substr($word, $pos, 0) = "-"; |
287
|
|
|
|
|
|
|
} |
288
|
0
|
|
|
|
|
|
$word; |
289
|
|
|
|
|
|
|
} |
290
|
|
|
|
|
|
|
|
291
|
|
|
|
|
|
|
1; |
292
|
|
|
|
|
|
|
__DATA__ |