| line | stmt | bran | cond | sub | pod | time | code | 
| 1 |  |  |  |  |  |  | package Text::WrapI18N; | 
| 2 |  |  |  |  |  |  |  | 
| 3 |  |  |  |  |  |  | require Exporter; | 
| 4 | 1 |  |  | 1 |  | 26501 | use strict; | 
|  | 1 |  |  |  |  | 2 |  | 
|  | 1 |  |  |  |  | 40 |  | 
| 5 | 1 |  |  | 1 |  | 5 | use warnings; | 
|  | 1 |  |  |  |  | 3 |  | 
|  | 1 |  |  |  |  | 125 |  | 
| 6 |  |  |  |  |  |  |  | 
| 7 |  |  |  |  |  |  | our @ISA = qw(Exporter); | 
| 8 |  |  |  |  |  |  | our @EXPORT = qw(wrap); | 
| 9 |  |  |  |  |  |  | our @EXPORT_OK = qw($columns $separator); | 
| 10 |  |  |  |  |  |  | our %EXPORT_TAGS = ('all' => [ @EXPORT, @EXPORT_OK ]); | 
| 11 |  |  |  |  |  |  |  | 
| 12 |  |  |  |  |  |  | our $VERSION = '0.06'; | 
| 13 |  |  |  |  |  |  |  | 
| 14 | 1 |  |  | 1 |  | 6 | use vars qw($columns $break $tabstop $separator $huge $unexpand $charmap); | 
|  | 1 |  |  |  |  | 7 |  | 
|  | 1 |  |  |  |  | 233 |  | 
| 15 | 1 |  |  | 1 |  | 1220 | use Text::CharWidth qw(mbswidth mblen); | 
|  | 1 |  |  |  |  | 4835 |  | 
|  | 1 |  |  |  |  | 181 |  | 
| 16 |  |  |  |  |  |  |  | 
| 17 |  |  |  |  |  |  | BEGIN { | 
| 18 | 1 |  |  | 1 |  | 2 | $columns = 76; | 
| 19 |  |  |  |  |  |  | # $break, $separator, $huge, and $unexpand are not supported yet. | 
| 20 | 1 |  |  |  |  | 2 | $break = '\s'; | 
| 21 | 1 |  |  |  |  | 1 | $tabstop = 8; | 
| 22 | 1 |  |  |  |  | 3 | $separator = "\n"; | 
| 23 | 1 |  |  |  |  | 2 | $huge = 'wrap'; | 
| 24 | 1 |  |  |  |  | 2 | $unexpand = 1; | 
| 25 | 1 |  |  |  |  | 1452 | undef $charmap; | 
| 26 |  |  |  |  |  |  | } | 
| 27 |  |  |  |  |  |  |  | 
| 28 |  |  |  |  |  |  | sub wrap { | 
| 29 | 2 |  |  | 2 | 0 | 15 | my $top1=shift; | 
| 30 | 2 |  |  |  |  | 9 | my $top2=shift; | 
| 31 | 2 |  |  |  |  | 4 | my $text=shift; | 
| 32 |  |  |  |  |  |  |  | 
| 33 | 2 |  |  |  |  | 5 | $text = $top1 . $text; | 
| 34 |  |  |  |  |  |  |  | 
| 35 |  |  |  |  |  |  | # $out     already-formatted text for output including current line | 
| 36 |  |  |  |  |  |  | # $len     visible width of the current line without the current word | 
| 37 |  |  |  |  |  |  | # $word    the current word which might be sent to the next line | 
| 38 |  |  |  |  |  |  | # $wlen    visible width of the current word | 
| 39 |  |  |  |  |  |  | # $c       the current character | 
| 40 |  |  |  |  |  |  | # $b       whether to allow line-breaking after the current character | 
| 41 |  |  |  |  |  |  | # $cont_lf true when LF (line feed) characters appear continuously | 
| 42 |  |  |  |  |  |  | # $w       visible width of the current character | 
| 43 |  |  |  |  |  |  |  | 
| 44 | 2 |  |  |  |  | 4 | my $out = ''; | 
| 45 | 2 |  |  |  |  | 5 | my $len = 0; | 
| 46 | 2 |  |  |  |  | 3 | my $word = ''; | 
| 47 | 2 |  |  |  |  | 2 | my $wlen = 0; | 
| 48 | 2 |  |  |  |  | 4 | my $cont_lf = 0; | 
| 49 | 2 |  |  |  |  | 3 | my ($c, $w, $b); | 
| 50 | 2 |  |  |  |  | 5 | $text =~ s/\n+$/\n/; | 
| 51 | 2 |  |  |  |  | 2 | while(1) { | 
| 52 | 28 | 100 |  |  |  | 61 | if (length($text) == 0) { | 
| 53 | 2 |  |  |  |  | 52 | return $out . $word; | 
| 54 |  |  |  |  |  |  | } | 
| 55 | 26 |  |  |  |  | 42 | ($c, $text, $w, $b) = _extract($text); | 
| 56 | 26 | 50 |  |  |  | 80 | if ($c eq "\n") { | 
|  |  | 50 |  |  |  |  |  | 
| 57 | 0 |  |  |  |  | 0 | $out .= $word . $separator; | 
| 58 | 0 | 0 |  |  |  | 0 | if (length($text) == 0) {return $out;} | 
|  | 0 |  |  |  |  | 0 |  | 
| 59 | 0 |  |  |  |  | 0 | $len = 0; | 
| 60 | 0 |  |  |  |  | 0 | $text = $top2 . $text; | 
| 61 | 0 |  |  |  |  | 0 | $word = '' ; $wlen = 0; | 
|  | 0 |  |  |  |  | 0 |  | 
| 62 | 0 |  |  |  |  | 0 | next; | 
| 63 |  |  |  |  |  |  | } elsif ($w == -1) { | 
| 64 |  |  |  |  |  |  | # all control characters other than LF are ignored | 
| 65 | 0 |  |  |  |  | 0 | next; | 
| 66 |  |  |  |  |  |  | } | 
| 67 |  |  |  |  |  |  |  | 
| 68 |  |  |  |  |  |  | # when the current line have enough room | 
| 69 |  |  |  |  |  |  | # for the curren character | 
| 70 |  |  |  |  |  |  |  | 
| 71 | 26 | 100 |  |  |  | 67 | if ($len + $wlen + $w <= $columns) { | 
| 72 | 25 | 100 | 66 |  |  | 109 | if ($c eq ' ' || $b) { | 
| 73 | 2 |  |  |  |  | 5 | $out .= $word . $c; | 
| 74 | 2 |  |  |  |  | 14 | $len += $wlen + $w; | 
| 75 | 2 |  |  |  |  | 5 | $word = ''; $wlen = 0; | 
|  | 2 |  |  |  |  | 4 |  | 
| 76 |  |  |  |  |  |  | } else { | 
| 77 | 23 |  |  |  |  | 34 | $word .= $c; $wlen += $w; | 
|  | 23 |  |  |  |  | 23 |  | 
| 78 |  |  |  |  |  |  | } | 
| 79 | 25 |  |  |  |  | 29 | next; | 
| 80 |  |  |  |  |  |  | } | 
| 81 |  |  |  |  |  |  |  | 
| 82 |  |  |  |  |  |  | # when the current line overflows with the | 
| 83 |  |  |  |  |  |  | # current character | 
| 84 |  |  |  |  |  |  |  | 
| 85 | 1 | 50 |  |  |  | 4 | if ($c eq ' ') { | 
|  |  | 0 |  |  |  |  |  | 
| 86 |  |  |  |  |  |  | # the line ends by space | 
| 87 | 1 |  |  |  |  | 3 | $out .= $word . $separator; | 
| 88 | 1 |  |  |  |  | 2 | $len = 0; | 
| 89 | 1 |  |  |  |  | 2 | $text = $top2 . $text; | 
| 90 | 1 |  |  |  |  | 1 | $word = ''; $wlen = 0; | 
|  | 1 |  |  |  |  | 2 |  | 
| 91 |  |  |  |  |  |  | } elsif ($wlen + $w <= $columns) { | 
| 92 |  |  |  |  |  |  | # the current word is sent to next line | 
| 93 | 0 |  |  |  |  | 0 | $out .= $separator; | 
| 94 | 0 |  |  |  |  | 0 | $len = 0; | 
| 95 | 0 |  |  |  |  | 0 | $text = $top2 . $word . $c . $text; | 
| 96 | 0 |  |  |  |  | 0 | $word = ''; $wlen = 0; | 
|  | 0 |  |  |  |  | 0 |  | 
| 97 |  |  |  |  |  |  | } else { | 
| 98 |  |  |  |  |  |  | # the current word is too long to fit a line | 
| 99 | 0 |  |  |  |  | 0 | $out .= $word . $separator; | 
| 100 | 0 |  |  |  |  | 0 | $len = 0; | 
| 101 | 0 |  |  |  |  | 0 | $text = $top2 . $c . $text; | 
| 102 | 0 |  |  |  |  | 0 | $word = ''; $wlen = 0; | 
|  | 0 |  |  |  |  | 0 |  | 
| 103 |  |  |  |  |  |  | } | 
| 104 |  |  |  |  |  |  | } | 
| 105 |  |  |  |  |  |  | } | 
| 106 |  |  |  |  |  |  |  | 
| 107 |  |  |  |  |  |  |  | 
| 108 |  |  |  |  |  |  | # Extract one character from the beginning from the given string. | 
| 109 |  |  |  |  |  |  | # Supports multibyte encodings such as UTF-8, EUC-JP, EUC-KR, | 
| 110 |  |  |  |  |  |  | # GB2312, and Big5. | 
| 111 |  |  |  |  |  |  | # | 
| 112 |  |  |  |  |  |  | # return value: (character, rest string, width, line breakable) | 
| 113 |  |  |  |  |  |  | #   character: a character.  This may consist from multiple bytes. | 
| 114 |  |  |  |  |  |  | #   rest string: given string without the extracted character. | 
| 115 |  |  |  |  |  |  | #   width: number of columns which the character occupies on screen. | 
| 116 |  |  |  |  |  |  | #   line breakable: true if the character allows line break after it. | 
| 117 |  |  |  |  |  |  |  | 
| 118 |  |  |  |  |  |  | sub _extract { | 
| 119 | 26 |  |  | 26 |  | 31 | my $string=shift; | 
| 120 | 26 |  |  |  |  | 28 | my ($l, $c, $r, $w, $b, $u); | 
| 121 |  |  |  |  |  |  |  | 
| 122 | 26 | 50 |  |  |  | 45 | if (length($string) == 0) { | 
| 123 | 0 |  |  |  |  | 0 | return ('', '', 0, 0); | 
| 124 |  |  |  |  |  |  | } | 
| 125 | 26 |  |  |  |  | 64 | $l = mblen($string); | 
| 126 | 26 | 50 | 33 |  |  | 100 | if ($l == 0 || $l == -1) { | 
| 127 | 0 |  |  |  |  | 0 | return ('?', substr($string,1), 1, 0); | 
| 128 |  |  |  |  |  |  | } | 
| 129 | 26 |  |  |  |  | 109 | $c = substr($string, 0, $l); | 
| 130 | 26 |  |  |  |  | 39 | $r = substr($string, $l); | 
| 131 | 26 |  |  |  |  | 61 | $w = mbswidth($c); | 
| 132 |  |  |  |  |  |  |  | 
| 133 | 26 | 100 |  |  |  | 46 | if (!defined($charmap)) { | 
| 134 | 1 |  |  |  |  | 19919 | $charmap = `/usr/bin/locale charmap`; | 
| 135 |  |  |  |  |  |  | } | 
| 136 |  |  |  |  |  |  |  | 
| 137 | 26 | 50 |  |  |  | 221 | if ($charmap =~ /UTF.8/i) { | 
|  |  | 50 |  |  |  |  |  | 
| 138 |  |  |  |  |  |  | # UTF-8 | 
| 139 | 0 | 0 |  |  |  | 0 | if ($l == 3) { | 
|  |  | 0 |  |  |  |  |  | 
| 140 |  |  |  |  |  |  | # U+0800 - U+FFFF | 
| 141 | 0 |  |  |  |  | 0 | $u = (ord(substr($c,0,1))&0x0f) * 0x1000 | 
| 142 |  |  |  |  |  |  | + (ord(substr($c,1,1))&0x3f) * 0x40 | 
| 143 |  |  |  |  |  |  | + (ord(substr($c,2,1))&0x3f); | 
| 144 | 0 |  |  |  |  | 0 | $b = _isCJ($u); | 
| 145 |  |  |  |  |  |  | } elsif ($l == 4) { | 
| 146 |  |  |  |  |  |  | # U+10000 - U+10FFFF | 
| 147 | 0 |  |  |  |  | 0 | $u = (ord(substr($c,0,1))&7) * 0x40000 | 
| 148 |  |  |  |  |  |  | + (ord(substr($c,1,1))&0x3f) * 0x1000 | 
| 149 |  |  |  |  |  |  | + (ord(substr($c,2,1))&0x3f) * 0x40 | 
| 150 |  |  |  |  |  |  | + (ord(substr($c,3,1))&0x3f); | 
| 151 | 0 |  |  |  |  | 0 | $b = _isCJ($u); | 
| 152 |  |  |  |  |  |  | } else { | 
| 153 | 0 |  |  |  |  | 0 | $b = 0; | 
| 154 |  |  |  |  |  |  | } | 
| 155 |  |  |  |  |  |  | } elsif ($charmap =~ /(^EUC)|(^GB)|(^BIG)/i) { | 
| 156 |  |  |  |  |  |  | # East Asian legacy encodings | 
| 157 |  |  |  |  |  |  | # (EUC-JP, EUC-KR, GB2312, Big5, Big5HKSCS, and so on) | 
| 158 |  |  |  |  |  |  |  | 
| 159 | 0 | 0 |  |  |  | 0 | if (ord(substr($c,0,1)) >= 0x80) {$b = 1;} else {$b = 0;} | 
|  | 0 |  |  |  |  | 0 |  | 
|  | 0 |  |  |  |  | 0 |  | 
| 160 |  |  |  |  |  |  | } else { | 
| 161 | 26 |  |  |  |  | 39 | $b = 0; | 
| 162 |  |  |  |  |  |  | } | 
| 163 | 26 |  |  |  |  | 117 | return ($c, $r, $w, $b); | 
| 164 |  |  |  |  |  |  | } | 
| 165 |  |  |  |  |  |  |  | 
| 166 |  |  |  |  |  |  | # Returns 1 for Chinese and Japanese characters.  This means that | 
| 167 |  |  |  |  |  |  | # these characters allow line wrapping after this character even | 
| 168 |  |  |  |  |  |  | # without whitespaces because these languages don't use whitespaces | 
| 169 |  |  |  |  |  |  | # between words. | 
| 170 |  |  |  |  |  |  | # | 
| 171 |  |  |  |  |  |  | # Character must be given in UCS-4 codepoint value. | 
| 172 |  |  |  |  |  |  |  | 
| 173 |  |  |  |  |  |  | sub _isCJ { | 
| 174 | 0 |  |  | 0 |  |  | my $u=shift; | 
| 175 |  |  |  |  |  |  |  | 
| 176 | 0 | 0 | 0 |  |  |  | if ($u >= 0x3000 && $u <= 0x312f) { | 
| 177 | 0 | 0 | 0 |  |  |  | if ($u == 0x300a || $u == 0x300c || $u == 0x300e || | 
|  | 0 |  | 0 |  |  |  |  | 
|  |  |  | 0 |  |  |  |  | 
|  |  |  | 0 |  |  |  |  | 
|  |  |  | 0 |  |  |  |  | 
|  |  |  | 0 |  |  |  |  | 
|  |  |  | 0 |  |  |  |  | 
| 178 |  |  |  |  |  |  | $u == 0x3010 || $u == 0x3014 || $u == 0x3016 || | 
| 179 |  |  |  |  |  |  | $u == 0x3018 || $u == 0x301a) {return 0;} | 
| 180 | 0 |  |  |  |  |  | return 1; | 
| 181 |  |  |  |  |  |  | }  # CJK punctuations, Hiragana, Katakana, Bopomofo | 
| 182 | 0 | 0 | 0 |  |  |  | if ($u >= 0x31a0 && $u <= 0x31bf) {return 1;}  # Bopomofo | 
|  | 0 |  |  |  |  |  |  | 
| 183 | 0 | 0 | 0 |  |  |  | if ($u >= 0x31f0 && $u <= 0x31ff) {return 1;}  # Katakana extension | 
|  | 0 |  |  |  |  |  |  | 
| 184 | 0 | 0 | 0 |  |  |  | if ($u >= 0x3400 && $u <= 0x9fff) {return 1;}  # Han Ideogram | 
|  | 0 |  |  |  |  |  |  | 
| 185 | 0 | 0 | 0 |  |  |  | if ($u >= 0xf900 && $u <= 0xfaff) {return 1;}  # Han Ideogram | 
|  | 0 |  |  |  |  |  |  | 
| 186 | 0 | 0 | 0 |  |  |  | if ($u >= 0x20000 && $u <= 0x2ffff) {return 1;}  # Han Ideogram | 
|  | 0 |  |  |  |  |  |  | 
| 187 |  |  |  |  |  |  |  | 
| 188 | 0 |  |  |  |  |  | return 0; | 
| 189 |  |  |  |  |  |  | } | 
| 190 |  |  |  |  |  |  |  | 
| 191 |  |  |  |  |  |  | 1; | 
| 192 |  |  |  |  |  |  | __END__ |