File Coverage

blib/lib/Text/WrapI18N.pm
Criterion Covered Total %
statement 63 109 57.8
branch 15 46 32.6
condition 3 45 6.6
subroutine 7 8 87.5
pod 0 1 0.0
total 88 209 42.1


line stmt bran cond sub pod time code
1             package Text::WrapI18N;
2              
3             require Exporter;
4 1     1   26501 use strict;
  1         2  
  1         40  
5 1     1   5 use warnings;
  1         3  
  1         125  
6              
7             our @ISA = qw(Exporter);
8             our @EXPORT = qw(wrap);
9             our @EXPORT_OK = qw($columns $separator);
10             our %EXPORT_TAGS = ('all' => [ @EXPORT, @EXPORT_OK ]);
11              
12             our $VERSION = '0.06';
13              
14 1     1   6 use vars qw($columns $break $tabstop $separator $huge $unexpand $charmap);
  1         7  
  1         233  
15 1     1   1220 use Text::CharWidth qw(mbswidth mblen);
  1         4835  
  1         181  
16              
17             BEGIN {
18 1     1   2 $columns = 76;
19             # $break, $separator, $huge, and $unexpand are not supported yet.
20 1         2 $break = '\s';
21 1         1 $tabstop = 8;
22 1         3 $separator = "\n";
23 1         2 $huge = 'wrap';
24 1         2 $unexpand = 1;
25 1         1452 undef $charmap;
26             }
27              
28             sub wrap {
29 2     2 0 15 my $top1=shift;
30 2         9 my $top2=shift;
31 2         4 my $text=shift;
32              
33 2         5 $text = $top1 . $text;
34              
35             # $out already-formatted text for output including current line
36             # $len visible width of the current line without the current word
37             # $word the current word which might be sent to the next line
38             # $wlen visible width of the current word
39             # $c the current character
40             # $b whether to allow line-breaking after the current character
41             # $cont_lf true when LF (line feed) characters appear continuously
42             # $w visible width of the current character
43              
44 2         4 my $out = '';
45 2         5 my $len = 0;
46 2         3 my $word = '';
47 2         2 my $wlen = 0;
48 2         4 my $cont_lf = 0;
49 2         3 my ($c, $w, $b);
50 2         5 $text =~ s/\n+$/\n/;
51 2         2 while(1) {
52 28 100       61 if (length($text) == 0) {
53 2         52 return $out . $word;
54             }
55 26         42 ($c, $text, $w, $b) = _extract($text);
56 26 50       80 if ($c eq "\n") {
    50          
57 0         0 $out .= $word . $separator;
58 0 0       0 if (length($text) == 0) {return $out;}
  0         0  
59 0         0 $len = 0;
60 0         0 $text = $top2 . $text;
61 0         0 $word = '' ; $wlen = 0;
  0         0  
62 0         0 next;
63             } elsif ($w == -1) {
64             # all control characters other than LF are ignored
65 0         0 next;
66             }
67              
68             # when the current line have enough room
69             # for the curren character
70              
71 26 100       67 if ($len + $wlen + $w <= $columns) {
72 25 100 66     109 if ($c eq ' ' || $b) {
73 2         5 $out .= $word . $c;
74 2         14 $len += $wlen + $w;
75 2         5 $word = ''; $wlen = 0;
  2         4  
76             } else {
77 23         34 $word .= $c; $wlen += $w;
  23         23  
78             }
79 25         29 next;
80             }
81              
82             # when the current line overflows with the
83             # current character
84              
85 1 50       4 if ($c eq ' ') {
    0          
86             # the line ends by space
87 1         3 $out .= $word . $separator;
88 1         2 $len = 0;
89 1         2 $text = $top2 . $text;
90 1         1 $word = ''; $wlen = 0;
  1         2  
91             } elsif ($wlen + $w <= $columns) {
92             # the current word is sent to next line
93 0         0 $out .= $separator;
94 0         0 $len = 0;
95 0         0 $text = $top2 . $word . $c . $text;
96 0         0 $word = ''; $wlen = 0;
  0         0  
97             } else {
98             # the current word is too long to fit a line
99 0         0 $out .= $word . $separator;
100 0         0 $len = 0;
101 0         0 $text = $top2 . $c . $text;
102 0         0 $word = ''; $wlen = 0;
  0         0  
103             }
104             }
105             }
106              
107              
108             # Extract one character from the beginning from the given string.
109             # Supports multibyte encodings such as UTF-8, EUC-JP, EUC-KR,
110             # GB2312, and Big5.
111             #
112             # return value: (character, rest string, width, line breakable)
113             # character: a character. This may consist from multiple bytes.
114             # rest string: given string without the extracted character.
115             # width: number of columns which the character occupies on screen.
116             # line breakable: true if the character allows line break after it.
117              
118             sub _extract {
119 26     26   31 my $string=shift;
120 26         28 my ($l, $c, $r, $w, $b, $u);
121              
122 26 50       45 if (length($string) == 0) {
123 0         0 return ('', '', 0, 0);
124             }
125 26         64 $l = mblen($string);
126 26 50 33     100 if ($l == 0 || $l == -1) {
127 0         0 return ('?', substr($string,1), 1, 0);
128             }
129 26         109 $c = substr($string, 0, $l);
130 26         39 $r = substr($string, $l);
131 26         61 $w = mbswidth($c);
132              
133 26 100       46 if (!defined($charmap)) {
134 1         19919 $charmap = `/usr/bin/locale charmap`;
135             }
136              
137 26 50       221 if ($charmap =~ /UTF.8/i) {
    50          
138             # UTF-8
139 0 0       0 if ($l == 3) {
    0          
140             # U+0800 - U+FFFF
141 0         0 $u = (ord(substr($c,0,1))&0x0f) * 0x1000
142             + (ord(substr($c,1,1))&0x3f) * 0x40
143             + (ord(substr($c,2,1))&0x3f);
144 0         0 $b = _isCJ($u);
145             } elsif ($l == 4) {
146             # U+10000 - U+10FFFF
147 0         0 $u = (ord(substr($c,0,1))&7) * 0x40000
148             + (ord(substr($c,1,1))&0x3f) * 0x1000
149             + (ord(substr($c,2,1))&0x3f) * 0x40
150             + (ord(substr($c,3,1))&0x3f);
151 0         0 $b = _isCJ($u);
152             } else {
153 0         0 $b = 0;
154             }
155             } elsif ($charmap =~ /(^EUC)|(^GB)|(^BIG)/i) {
156             # East Asian legacy encodings
157             # (EUC-JP, EUC-KR, GB2312, Big5, Big5HKSCS, and so on)
158              
159 0 0       0 if (ord(substr($c,0,1)) >= 0x80) {$b = 1;} else {$b = 0;}
  0         0  
  0         0  
160             } else {
161 26         39 $b = 0;
162             }
163 26         117 return ($c, $r, $w, $b);
164             }
165              
166             # Returns 1 for Chinese and Japanese characters. This means that
167             # these characters allow line wrapping after this character even
168             # without whitespaces because these languages don't use whitespaces
169             # between words.
170             #
171             # Character must be given in UCS-4 codepoint value.
172              
173             sub _isCJ {
174 0     0     my $u=shift;
175              
176 0 0 0       if ($u >= 0x3000 && $u <= 0x312f) {
177 0 0 0       if ($u == 0x300a || $u == 0x300c || $u == 0x300e ||
  0   0        
      0        
      0        
      0        
      0        
      0        
178             $u == 0x3010 || $u == 0x3014 || $u == 0x3016 ||
179             $u == 0x3018 || $u == 0x301a) {return 0;}
180 0           return 1;
181             } # CJK punctuations, Hiragana, Katakana, Bopomofo
182 0 0 0       if ($u >= 0x31a0 && $u <= 0x31bf) {return 1;} # Bopomofo
  0            
183 0 0 0       if ($u >= 0x31f0 && $u <= 0x31ff) {return 1;} # Katakana extension
  0            
184 0 0 0       if ($u >= 0x3400 && $u <= 0x9fff) {return 1;} # Han Ideogram
  0            
185 0 0 0       if ($u >= 0xf900 && $u <= 0xfaff) {return 1;} # Han Ideogram
  0            
186 0 0 0       if ($u >= 0x20000 && $u <= 0x2ffff) {return 1;} # Han Ideogram
  0            
187              
188 0           return 0;
189             }
190              
191             1;
192             __END__