File Coverage

blib/lib/Text/WrapI18N.pm

Criterion	Covered	Total	%
statement	63	109	57.8
branch	15	46	32.6
condition	3	45	6.6
subroutine	7	8	87.5
pod	0	1	0.0
total	88	209	42.1

line	stmt	bran	cond	sub	pod	time	code
1							package Text::WrapI18N;
2
3							require Exporter;
4	1			1		26501	use strict;
	1					2
	1					40
5	1			1		5	use warnings;
	1					3
	1					125
6
7							our @ISA = qw(Exporter);
8							our @EXPORT = qw(wrap);
9							our @EXPORT_OK = qw($columns $separator);
10							our %EXPORT_TAGS = ('all' => [ @EXPORT, @EXPORT_OK ]);
11
12							our $VERSION = '0.06';
13
14	1			1		6	use vars qw($columns $break $tabstop $separator $huge $unexpand $charmap);
	1					7
	1					233
15	1			1		1220	use Text::CharWidth qw(mbswidth mblen);
	1					4835
	1					181
16
17							BEGIN {
18	1			1		2	$columns = 76;
19							# $break, $separator, $huge, and $unexpand are not supported yet.
20	1					2	$break = '\s';
21	1					1	$tabstop = 8;
22	1					3	$separator = "\n";
23	1					2	$huge = 'wrap';
24	1					2	$unexpand = 1;
25	1					1452	undef $charmap;
26							}
27
28							sub wrap {
29	2			2	0	15	my $top1=shift;
30	2					9	my $top2=shift;
31	2					4	my $text=shift;
32
33	2					5	$text = $top1 . $text;
34
35							# $out already-formatted text for output including current line
36							# $len visible width of the current line without the current word
37							# $word the current word which might be sent to the next line
38							# $wlen visible width of the current word
39							# $c the current character
40							# $b whether to allow line-breaking after the current character
41							# $cont_lf true when LF (line feed) characters appear continuously
42							# $w visible width of the current character
43
44	2					4	my $out = '';
45	2					5	my $len = 0;
46	2					3	my $word = '';
47	2					2	my $wlen = 0;
48	2					4	my $cont_lf = 0;
49	2					3	my ($c, $w, $b);
50	2					5	$text =~ s/\n+$/\n/;
51	2					2	while(1) {
52	28	100				61	if (length($text) == 0) {
53	2					52	return $out . $word;
54							}
55	26					42	($c, $text, $w, $b) = _extract($text);
56	26	50				80	if ($c eq "\n") {
		50
57	0					0	$out .= $word . $separator;
58	0	0				0	if (length($text) == 0) {return $out;}
	0					0
59	0					0	$len = 0;
60	0					0	$text = $top2 . $text;
61	0					0	$word = '' ; $wlen = 0;
	0					0
62	0					0	next;
63							} elsif ($w == -1) {
64							# all control characters other than LF are ignored
65	0					0	next;
66							}
67
68							# when the current line have enough room
69							# for the curren character
70
71	26	100				67	if ($len + $wlen + $w <= $columns) {
72	25	100	66			109	if ($c eq ' ' \|\| $b) {
73	2					5	$out .= $word . $c;
74	2					14	$len += $wlen + $w;
75	2					5	$word = ''; $wlen = 0;
	2					4
76							} else {
77	23					34	$word .= $c; $wlen += $w;
	23					23
78							}
79	25					29	next;
80							}
81
82							# when the current line overflows with the
83							# current character
84
85	1	50				4	if ($c eq ' ') {
		0
86							# the line ends by space
87	1					3	$out .= $word . $separator;
88	1					2	$len = 0;
89	1					2	$text = $top2 . $text;
90	1					1	$word = ''; $wlen = 0;
	1					2
91							} elsif ($wlen + $w <= $columns) {
92							# the current word is sent to next line
93	0					0	$out .= $separator;
94	0					0	$len = 0;
95	0					0	$text = $top2 . $word . $c . $text;
96	0					0	$word = ''; $wlen = 0;
	0					0
97							} else {
98							# the current word is too long to fit a line
99	0					0	$out .= $word . $separator;
100	0					0	$len = 0;
101	0					0	$text = $top2 . $c . $text;
102	0					0	$word = ''; $wlen = 0;
	0					0
103							}
104							}
105							}
106
107
108							# Extract one character from the beginning from the given string.
109							# Supports multibyte encodings such as UTF-8, EUC-JP, EUC-KR,
110							# GB2312, and Big5.
111							#
112							# return value: (character, rest string, width, line breakable)
113							# character: a character. This may consist from multiple bytes.
114							# rest string: given string without the extracted character.
115							# width: number of columns which the character occupies on screen.
116							# line breakable: true if the character allows line break after it.
117
118							sub _extract {
119	26			26		31	my $string=shift;
120	26					28	my ($l, $c, $r, $w, $b, $u);
121
122	26	50				45	if (length($string) == 0) {
123	0					0	return ('', '', 0, 0);
124							}
125	26					64	$l = mblen($string);
126	26	50	33			100	if ($l == 0 \|\| $l == -1) {
127	0					0	return ('?', substr($string,1), 1, 0);
128							}
129	26					109	$c = substr($string, 0, $l);
130	26					39	$r = substr($string, $l);
131	26					61	$w = mbswidth($c);
132
133	26	100				46	if (!defined($charmap)) {
134	1					19919	$charmap = `/usr/bin/locale charmap`;
135							}
136
137	26	50				221	if ($charmap =~ /UTF.8/i) {
		50
138							# UTF-8
139	0	0				0	if ($l == 3) {
		0
140							# U+0800 - U+FFFF
141	0					0	$u = (ord(substr($c,0,1))&0x0f) * 0x1000
142							+ (ord(substr($c,1,1))&0x3f) * 0x40
143							+ (ord(substr($c,2,1))&0x3f);
144	0					0	$b = _isCJ($u);
145							} elsif ($l == 4) {
146							# U+10000 - U+10FFFF
147	0					0	$u = (ord(substr($c,0,1))&7) * 0x40000
148							+ (ord(substr($c,1,1))&0x3f) * 0x1000
149							+ (ord(substr($c,2,1))&0x3f) * 0x40
150							+ (ord(substr($c,3,1))&0x3f);
151	0					0	$b = _isCJ($u);
152							} else {
153	0					0	$b = 0;
154							}
155							} elsif ($charmap =~ /(^EUC)\|(^GB)\|(^BIG)/i) {
156							# East Asian legacy encodings
157							# (EUC-JP, EUC-KR, GB2312, Big5, Big5HKSCS, and so on)
158
159	0	0				0	if (ord(substr($c,0,1)) >= 0x80) {$b = 1;} else {$b = 0;}
	0					0
	0					0
160							} else {
161	26					39	$b = 0;
162							}
163	26					117	return ($c, $r, $w, $b);
164							}
165
166							# Returns 1 for Chinese and Japanese characters. This means that
167							# these characters allow line wrapping after this character even
168							# without whitespaces because these languages don't use whitespaces
169							# between words.
170							#
171							# Character must be given in UCS-4 codepoint value.
172
173							sub _isCJ {
174	0			0			my $u=shift;
175
176	0	0	0				if ($u >= 0x3000 && $u <= 0x312f) {
177	0	0	0				if ($u == 0x300a \|\| $u == 0x300c \|\| $u == 0x300e \|\|
	0		0
			0
			0
			0
			0
			0
178							$u == 0x3010 \|\| $u == 0x3014 \|\| $u == 0x3016 \|\|
179							$u == 0x3018 \|\| $u == 0x301a) {return 0;}
180	0						return 1;
181							} # CJK punctuations, Hiragana, Katakana, Bopomofo
182	0	0	0				if ($u >= 0x31a0 && $u <= 0x31bf) {return 1;} # Bopomofo
	0
183	0	0	0				if ($u >= 0x31f0 && $u <= 0x31ff) {return 1;} # Katakana extension
	0
184	0	0	0				if ($u >= 0x3400 && $u <= 0x9fff) {return 1;} # Han Ideogram
	0
185	0	0	0				if ($u >= 0xf900 && $u <= 0xfaff) {return 1;} # Han Ideogram
	0
186	0	0	0				if ($u >= 0x20000 && $u <= 0x2ffff) {return 1;} # Han Ideogram
	0
187
188	0						return 0;
189							}
190
191							1;
192							__END__