| line | stmt | bran | cond | sub | pod | time | code | 
| 1 |  |  |  |  |  |  | package Unicode::Lite; | 
| 2 |  |  |  |  |  |  |  | 
| 3 | 1 |  |  | 1 |  | 113497 | use 5.005_62; | 
|  | 1 |  |  |  |  | 4 |  | 
|  | 1 |  |  |  |  | 39 |  | 
| 4 | 1 |  |  | 1 |  | 6 | use strict; | 
|  | 1 |  |  |  |  | 2 |  | 
|  | 1 |  |  |  |  | 35 |  | 
| 5 | 1 |  |  | 1 |  | 5 | use warnings; | 
|  | 1 |  |  |  |  | 7 |  | 
|  | 1 |  |  |  |  | 37 |  | 
| 6 | 1 |  |  | 1 |  | 5 | use base qw/Exporter/; | 
|  | 1 |  |  |  |  | 2 |  | 
|  | 1 |  |  |  |  | 115 |  | 
| 7 | 1 |  |  | 1 |  | 6 | use Carp qw/croak carp/; | 
|  | 1 |  |  |  |  | 7 |  | 
|  | 1 |  |  |  |  | 227 |  | 
| 8 |  |  |  |  |  |  |  | 
| 9 |  |  |  |  |  |  |  | 
| 10 |  |  |  |  |  |  | our $VERSION = '0.12'; | 
| 11 |  |  |  |  |  |  | our @EXPORT  = qw/convert convertor addequal UL_CHR UL_ENT UL_EQV UL_SEQ UL_7BT UL_ALL/; | 
| 12 |  |  |  |  |  |  | our %EXPORT_TAGS = ( | 
| 13 |  |  |  |  |  |  | utils => [grep{!/^UL_/}@EXPORT], | 
| 14 |  |  |  |  |  |  | flags => [grep{ /^UL_/}@EXPORT] | 
| 15 |  |  |  |  |  |  | ); | 
| 16 |  |  |  |  |  |  |  | 
| 17 |  |  |  |  |  |  |  | 
| 18 | 1 |  |  | 1 |  | 878 | use enum qw/BITMASK: RP_CHR RP_ENT EQ_CHR EQ_SEQ EQ_7BT/; | 
|  | 1 |  |  |  |  | 1401 |  | 
|  | 1 |  |  |  |  | 6 |  | 
| 19 | 1 |  |  | 1 |  | 1745 | use enum qw/nil src dst all/; | 
|  | 1 |  |  |  |  | 2 |  | 
|  | 1 |  |  |  |  | 5 |  | 
| 20 | 1 |  |  | 1 |  | 384 | use constant uni => qr/^(?:utf16|utf8|utf7|ucs4|uchr|uhex|latin1)$/; | 
|  | 1 |  |  |  |  | 1 |  | 
|  | 1 |  |  |  |  | 81 |  | 
| 21 | 1 |  |  | 1 |  | 5 | use constant UL_CHR => RP_CHR;          # REPLACE TO CHAR   (default >) | 
|  | 1 |  |  |  |  | 3 |  | 
|  | 1 |  |  |  |  | 48 |  | 
| 22 | 1 |  |  | 1 |  | 5 | use constant UL_ENT => RP_CHR | RP_ENT; # REPLACE TO ENTITY (like ) | 
|  | 1 |  |  |  |  | 2 |  | 
|  | 1 |  |  |  |  | 42 |  | 
| 23 | 1 |  |  | 1 |  | 6 | use constant UL_EQV => EQ_CHR;          # EQUIVALENT char | 
|  | 1 |  |  |  |  | 1 |  | 
|  | 1 |  |  |  |  | 45 |  | 
| 24 | 1 |  |  | 1 |  | 5 | use constant UL_SEQ => EQ_CHR | EQ_SEQ; # EQUIVALENT sequence of chars | 
|  | 1 |  |  |  |  | 2 |  | 
|  | 1 |  |  |  |  | 51 |  | 
| 25 | 1 |  |  | 1 |  | 6 | use constant UL_7BT => EQ_7BT | UL_SEQ; # EQUIVALENT sequence of 7bit chars | 
|  | 1 |  |  |  |  | 2 |  | 
|  | 1 |  |  |  |  | 70 |  | 
| 26 | 1 |  |  | 1 |  | 14 | use constant UL_ALL => UL_CHR | UL_ENT | UL_EQV | UL_SEQ; | 
|  | 1 |  |  |  |  | 2 |  | 
|  | 1 |  |  |  |  | 2398 |  | 
| 27 |  |  |  |  |  |  | our (%MAPPING, %CONVERT, %EQUIVAL, $REGISTR, $TEST); | 
| 28 |  |  |  |  |  |  |  | 
| 29 |  |  |  |  |  |  | sub convertor($$;$$) | 
| 30 |  |  |  |  |  |  | { | 
| 31 | 1 |  | 50 | 1 | 1 | 18 | my ($src, $dst, $mod, $chr) = (lc shift, lc shift, shift||0, shift||''); | 
|  |  |  | 50 |  |  |  |  | 
| 32 |  |  |  |  |  |  |  | 
| 33 | 1 | 50 |  |  |  | 8 | return $CONVERT{$src}{$dst}{$mod}{$chr} if exists | 
| 34 |  |  |  |  |  |  | $CONVERT{$src}{$dst}{$mod}{$chr}; | 
| 35 | 1 | 50 |  |  |  | 5 | require Unicode::String unless defined %Unicode::String::; | 
| 36 |  |  |  |  |  |  |  | 
| 37 | 1 |  |  |  |  | 2 | my  ($SRC, $DST) = ($src, $dst); | 
| 38 | 1 |  |  |  |  | 3 | for ($SRC, $DST){ | 
| 39 | 2 | 0 | 33 |  |  | 14 | next if $_=~uni or s/^(?:ucs2|unicode)$/utf16/o or s/^iso-8859-1$/latin1/o; | 
|  |  |  | 33 |  |  |  |  | 
| 40 | 0 | 0 |  |  |  | 0 | next if exists $MAPPING{$_}; | 
| 41 | 0 | 0 |  |  |  | 0 | unless ($REGISTR){ require Unicode::Map; local $_; | 
|  | 0 |  |  |  |  | 0 |  | 
|  | 0 |  |  |  |  | 0 |  | 
| 42 | 0 |  |  |  |  | 0 | $REGISTR = new Unicode::Map() } | 
| 43 | 0 |  | 0 |  |  | 0 | $_ = lc $REGISTR->id(uc $_) || croak "Character Set '$_' not defined!"; | 
| 44 | 0 | 0 |  |  |  | 0 | $_ = 'latin1' if $_ eq 'iso-8859-1'; | 
| 45 |  |  |  |  |  |  | } | 
| 46 |  |  |  |  |  |  |  | 
| 47 | 1 | 50 |  |  |  | 5 | return $CONVERT{$src}{$dst}{$mod}{$chr} = | 
| 48 |  |  |  |  |  |  | $CONVERT{$SRC}{$DST}{$mod}{$chr} if exists | 
| 49 |  |  |  |  |  |  | $CONVERT{$SRC}{$DST}{$mod}{$chr}; | 
| 50 |  |  |  |  |  |  |  | 
| 51 | 1 |  |  |  |  | 5 | my $map = ($SRC !~ uni) | ($DST !~ uni) << 1; | 
| 52 |  |  |  |  |  |  |  | 
| 53 | 1 |  |  |  |  | 5 | for ([$src, $SRC, $map&src], [$dst, $DST, $map&dst]){ | 
| 54 | 2 | 50 | 33 |  |  | 8 | next unless $$_[2] and !$MAPPING{$$_[0]}; | 
| 55 | 0 |  | 0 |  |  | 0 | $MAPPING{$$_[0]} = $MAPPING{$$_[1]} || | 
| 56 |  |  |  |  |  |  | ($MAPPING{$$_[1]} = new Unicode::Map(uc $$_[1])) || | 
| 57 |  |  |  |  |  |  | croak "Can't create Unicode::Map object for '$$_[1]' charset!"; | 
| 58 |  |  |  |  |  |  | } | 
| 59 |  |  |  |  |  |  |  | 
| 60 | 1 | 50 | 33 |  |  | 16 | $map = all if | 
|  |  |  | 33 |  |  |  |  | 
|  |  |  | 33 |  |  |  |  | 
|  |  |  | 33 |  |  |  |  | 
|  |  |  | 33 |  |  |  |  | 
|  |  |  | 33 |  |  |  |  | 
| 61 |  |  |  |  |  |  | $map == src && $DST eq 'latin1' or | 
| 62 |  |  |  |  |  |  | $map == dst && $SRC eq 'latin1' or | 
| 63 |  |  |  |  |  |  | $map == nil && $SRC eq 'latin1' && $DST eq 'latin1'; | 
| 64 |  |  |  |  |  |  |  | 
| 65 |  |  |  |  |  |  | # Situation checking | 
| 66 | 1 | 50 | 33 |  |  | 5 | croak "FLAG param can be only for SBCS->SBCS!" if $map != all and $mod; | 
| 67 | 1 | 50 | 33 |  |  | 7 | croak "CHAR param can be only for SBCS->SBCS!" if $map != all and length $chr; | 
| 68 | 1 | 0 | 0 |  |  | 2 | croak "Can't convert to the same codepage!"    if $SRC eq $DST and | 
|  |  |  | 33 |  |  |  |  | 
| 69 |  |  |  |  |  |  | $map != all || not $mod & EQ_7BT; | 
| 70 | 1 |  |  |  |  | 1 | my ($mut); | 
| 71 | 1 | 50 |  |  |  | 5 | if ($map != all) | 
| 72 |  |  |  |  |  |  | { | 
| 73 | 1 |  |  |  |  | 1 | my ($uni, $utf) = ($map^all, 0); | 
| 74 | 1 | 50 | 33 |  |  | 6 | $utf |= src if $uni & src and $SRC ne 'utf16'; | 
| 75 | 1 | 50 | 33 |  |  | 7 | $utf |= dst if $uni & dst and $DST ne 'utf16'; | 
| 76 |  |  |  |  |  |  |  | 
| 77 | 1 |  |  |  |  | 2 | $mut = '$_'; | 
| 78 |  |  |  |  |  |  |  | 
| 79 | 1 | 50 |  |  |  | 17 | $mut = "\$MAPPING{'$SRC'}->to_unicode($mut)"   if $map & src; | 
| 80 | 1 | 50 | 33 |  |  | 9 | $mut = "Unicode::String::$SRC($mut)"           if $uni & src && not | 
|  |  |  | 33 |  |  |  |  | 
| 81 |  |  |  |  |  |  | $map & dst &&!($utf&src); | 
| 82 | 1 | 50 |  |  |  | 3 | $mut = "\$MAPPING{'$DST'}->from_unicode($mut)" if $map & dst; | 
| 83 | 1 | 50 | 33 |  |  | 5 | $mut = "Unicode::String::utf16($mut)"          if $utf & dst && $map & src; | 
| 84 | 1 | 50 | 33 |  |  | 7 | $mut = "$mut->$DST"                            if $uni & dst && $uni & src or | 
|  |  |  | 0 |  |  |  |  | 
|  |  |  | 33 |  |  |  |  | 
| 85 |  |  |  |  |  |  | $utf & dst && $map & src; | 
| 86 | 1 |  |  |  |  | 3 | $mut = '$_='.$mut; | 
| 87 |  |  |  |  |  |  | } | 
| 88 | 0 |  |  |  |  | 0 | else{ $mut = __sbcs_convertor($SRC, $DST, $mod, $chr) } | 
| 89 | 1 | 50 |  |  |  | 3 | warn "MUTATOR: $SRC -> $DST [$mod]\t$mut\n" if $TEST; | 
| 90 |  |  |  |  |  |  |  | 
| 91 |  |  |  |  |  |  | return | 
| 92 | 1 |  |  |  |  | 203 | $CONVERT{$src}{$dst}{$mod}{$chr} = | 
| 93 |  |  |  |  |  |  | $CONVERT{$SRC}{$DST}{$mod}{$chr} = eval 'sub(;$){ | 
| 94 |  |  |  |  |  |  | my $str = @_ ? $_[0] : defined wantarray ? $_ : \$_; | 
| 95 |  |  |  |  |  |  | for( ref$str?$$str:$str ){ if($_){'.$mut.'} | 
| 96 |  |  |  |  |  |  | return $_ if defined wantarray} | 
| 97 |  |  |  |  |  |  | $_ = $str if defined $_[0] and not ref $str }'; | 
| 98 |  |  |  |  |  |  | } | 
| 99 |  |  |  |  |  |  |  | 
| 100 |  |  |  |  |  |  | sub convert($$;$$$){ | 
| 101 | 1 |  |  | 1 | 1 | 43507 | my $fn = convertor( shift, shift, $_[1], $_[2] ); | 
| 102 | 1 |  |  |  |  | 21 | &$fn; | 
| 103 |  |  |  |  |  |  | } | 
| 104 |  |  |  |  |  |  |  | 
| 105 |  |  |  |  |  |  | sub addequal(@) | 
| 106 |  |  |  |  |  |  | { | 
| 107 |  |  |  |  |  |  | return unless | 
| 108 | 0 |  |  |  |  |  | my @chr = map{ | 
| 109 | 0 | 0 |  | 0 | 1 |  | my @a = map hex, split /\+/; | 
|  |  | 0 |  |  |  |  |  | 
| 110 | 0 | 0 |  |  |  |  | $#a ? \@a : $a[0]; | 
| 111 |  |  |  |  |  |  | }$#_ ? @_ : split /\s+/, shift; | 
| 112 |  |  |  |  |  |  |  | 
| 113 | 0 |  |  |  |  |  | $EQUIVAL{shift @chr} = \@chr; | 
| 114 |  |  |  |  |  |  |  | 
| 115 | 0 |  |  |  |  |  | @chr = map{ | 
| 116 | 0 |  |  |  |  |  | (ref || !exists $EQUIVAL{$_}) ? $_ : | 
| 117 | 0 | 0 | 0 |  |  |  | ($_, @{$EQUIVAL{$_}}) | 
| 118 |  |  |  |  |  |  | }@chr; | 
| 119 |  |  |  |  |  |  | } | 
| 120 |  |  |  |  |  |  |  | 
| 121 |  |  |  |  |  |  | sub __sbcs_convertor($$$$) | 
| 122 |  |  |  |  |  |  | { | 
| 123 | 0 |  |  | 0 |  |  | my ($src, $dst, $mod, $chr) = (shift, shift, shift, shift); | 
| 124 | 0 |  |  |  |  |  | my (@src, %src, @dst, %dst, @dif, %dif); | 
| 125 |  |  |  |  |  |  |  | 
| 126 | 0 | 0 |  |  |  |  | croak "Unknown flags: $mod!"      if $mod & ~(UL_ALL|UL_7BT); | 
| 127 | 0 | 0 | 0 |  |  |  | croak "CHAR and UL_ENT together!" if length $chr and $mod & RP_ENT; | 
| 128 |  |  |  |  |  |  |  | 
| 129 | 0 | 0 | 0 |  |  |  | $chr = length($chr) ? substr($chr,0,1) : '?' if | 
|  |  | 0 |  |  |  |  |  | 
| 130 |  |  |  |  |  |  | $mod & RP_CHR and not $mod & RP_ENT; | 
| 131 |  |  |  |  |  |  |  | 
| 132 |  |  |  |  |  |  | # fill charsets arrays with U+0000 | 
| 133 | 0 | 0 |  |  |  |  | @dst = (0) x 0x80 if $mod & EQ_7BT; | 
| 134 | 0 | 0 |  |  |  |  | for ([$src, \@src], ($mod & EQ_7BT)?():[$dst, \@dst]){ | 
| 135 | 0 |  |  |  |  |  | my $conv = convertor( $$_[0], 'utf16' ); | 
| 136 | 0 | 0 |  |  |  |  | @{$$_[1]} = map {&$conv(); $_ ? unpack 'n', $_ : 0} map chr, 0x80..0xff; | 
|  | 0 |  |  |  |  |  |  | 
|  | 0 |  |  |  |  |  |  | 
|  | 0 |  |  |  |  |  |  | 
| 137 |  |  |  |  |  |  | } | 
| 138 |  |  |  |  |  |  |  | 
| 139 | 0 | 0 |  |  |  |  | @src{@src} = 0x80..0xff  if $mod & ~RP_CHR; | 
| 140 | 0 |  |  |  |  |  | @dst{@dst} = 0x80..0xff; | 
| 141 |  |  |  |  |  |  |  | 
| 142 |  |  |  |  |  |  | # collect positions of unused chars | 
| 143 | 0 | 0 |  |  |  |  | if ($mod & ~RP_CHR){                # if need indirect replace | 
| 144 | 0 |  |  |  |  |  | for (0 .. $#dst){ | 
| 145 | 0 | 0 | 0 |  |  |  | push @dif, $_ + 0x80 if | 
| 146 |  |  |  |  |  |  | !$dst[$_] or                # char not used in dst codepage | 
| 147 |  |  |  |  |  |  | !exists $src{$dst[$_]}      # char not used in src codepage | 
| 148 |  |  |  |  |  |  | } | 
| 149 |  |  |  |  |  |  | } | 
| 150 |  |  |  |  |  |  |  | 
| 151 |  |  |  |  |  |  | # read equivalent rules | 
| 152 | 0 | 0 | 0 |  |  |  | if ($mod & UL_EQV and not %EQUIVAL){ | 
| 153 | 0 |  |  |  |  |  | local $_; | 
| 154 | 0 |  |  |  |  |  | while (){ s/\s*#.*//so; addequal($_); } | 
|  | 0 |  |  |  |  |  |  | 
|  | 0 |  |  |  |  |  |  | 
| 155 |  |  |  |  |  |  | } | 
| 156 |  |  |  |  |  |  |  | 
| 157 |  |  |  |  |  |  | my $find = sub(){ | 
| 158 | 0 |  |  | 0 |  |  | my $chr = $src[$_]; | 
| 159 | 0 | 0 |  |  |  |  | return undef unless exists $EQUIVAL{$chr}; | 
| 160 | 0 |  |  |  |  |  | LOOP: | 
| 161 | 0 |  |  |  |  |  | for (@{$EQUIVAL{$chr}}){ | 
| 162 | 0 | 0 | 0 |  |  |  | if (!ref){ next LOOP unless $_ < 0x80 or exists $dst{$_}; return $_ } | 
|  | 0 | 0 |  |  |  |  |  | 
|  | 0 |  |  |  |  |  |  | 
| 163 | 0 | 0 |  |  |  |  | next unless $mod & EQ_SEQ; | 
| 164 | 0 | 0 | 0 |  |  |  | for (@$_){ next LOOP unless $_ < 0x80 or exists $dst{$_}} return $_; | 
|  | 0 |  |  |  |  |  |  | 
|  | 0 |  |  |  |  |  |  | 
| 165 |  |  |  |  |  |  | } | 
| 166 | 0 |  |  |  |  |  | return undef; | 
| 167 | 0 |  |  |  |  |  | }; | 
| 168 |  |  |  |  |  |  |  | 
| 169 | 0 |  |  |  |  |  | my (@map, @eqv, @ent, @chr, @del); | 
| 170 |  |  |  |  |  |  |  | 
| 171 | 0 |  |  |  |  |  | for (0 .. $#src) | 
| 172 |  |  |  |  |  |  | { | 
| 173 | 0 | 0 | 0 |  |  |  | next if !$src[$_] or            # char not used in src codepage | 
| 174 |  |  |  |  |  |  | $src[$_] == $dst[$_];  # chars in src and dst maps are equal | 
| 175 |  |  |  |  |  |  |  | 
| 176 | 0 | 0 | 0 |  |  |  | if( exists $dst{$src[$_]} ){ | 
|  |  | 0 |  |  |  |  |  | 
|  |  | 0 |  |  |  |  |  | 
|  |  | 0 |  |  |  |  |  | 
| 177 | 0 |  |  |  |  |  | push @map, [$_, $src[$_]]; | 
| 178 |  |  |  |  |  |  |  | 
| 179 |  |  |  |  |  |  | }elsif( $mod & EQ_CHR and my $uni = &$find ){ | 
| 180 | 0 | 0 | 0 |  |  |  | next if     ref $uni and | 
| 181 |  |  |  |  |  |  | push @eqv, [$_, $uni]; | 
| 182 | 0 | 0 | 0 |  |  |  | next if not ($dst{$uni} and $_ == $dst{$uni} - 0x80) and | 
|  |  |  | 0 |  |  |  |  | 
| 183 |  |  |  |  |  |  | push @map, [$_, $uni]; | 
| 184 | 0 |  |  |  |  |  | @dif = grep{ $_ != $dst{$uni} }@dif; | 
|  | 0 |  |  |  |  |  |  | 
| 185 |  |  |  |  |  |  |  | 
| 186 |  |  |  |  |  |  | }elsif( $mod & RP_ENT ){ | 
| 187 | 0 |  |  |  |  |  | push @ent, [$_, $src[$_]]; | 
| 188 |  |  |  |  |  |  |  | 
| 189 |  |  |  |  |  |  | }elsif( $mod & RP_CHR ){ | 
| 190 | 0 |  |  |  |  |  | push @chr, $_; | 
| 191 |  |  |  |  |  |  |  | 
| 192 |  |  |  |  |  |  | }else{ | 
| 193 | 0 |  |  |  |  |  | push @del, $_; | 
| 194 |  |  |  |  |  |  |  | 
| 195 |  |  |  |  |  |  | } | 
| 196 |  |  |  |  |  |  | } | 
| 197 |  |  |  |  |  |  |  | 
| 198 | 0 | 0 |  |  |  |  | croak "Internal ERROR: not enough additional chars!\n" if @ent+@eqv > @dif; | 
| 199 |  |  |  |  |  |  |  | 
| 200 | 0 |  |  |  |  |  | ($src, $dst) = ('') x 2; | 
| 201 |  |  |  |  |  |  |  | 
| 202 |  |  |  |  |  |  | $src .= chr $$_[0] + 0x80, | 
| 203 |  |  |  |  |  |  | $dst .= chr($$_[1] < 0x80 ? $$_[1] : $dst{$$_[1]}) | 
| 204 | 0 | 0 |  |  |  |  | for @map; | 
| 205 | 0 |  |  |  |  |  | for (@ent){ | 
| 206 | 0 |  |  |  |  |  | $src .= chr $$_[0] + 0x80; | 
| 207 | 0 |  |  |  |  |  | $dst .= $$_[0] = chr shift @dif; | 
| 208 |  |  |  |  |  |  | } | 
| 209 |  |  |  |  |  |  |  | 
| 210 | 0 |  |  |  |  |  | for (@eqv){ | 
| 211 | 0 |  |  |  |  |  | $src .= chr $$_[0] + 0x80; | 
| 212 | 0 |  |  |  |  |  | $dst .= $$_[0] = chr shift @dif; | 
| 213 | 0 | 0 |  |  |  |  | $$_[1] = join '', map{ | 
| 214 | 0 |  |  |  |  |  | chr( $_ < 0x80 ? $_ : $dst{$_} ) | 
| 215 | 0 |  |  |  |  |  | }@{$$_[1]}; | 
| 216 | 0 |  |  |  |  |  | $$_[1] =~ s/([\-\\\/\$])/\\$1/gso; | 
| 217 |  |  |  |  |  |  | } | 
| 218 | 0 |  |  |  |  |  | $src .= chr $_ + 0x80       for @chr; | 
| 219 | 0 | 0 |  |  |  |  | $dst .= $chr x(@del?@chr:1) if  @chr; | 
|  |  | 0 |  |  |  |  |  | 
| 220 | 0 |  |  |  |  |  | $src .= chr $_ + 0x80       for @del; | 
| 221 |  |  |  |  |  |  |  | 
| 222 | 0 |  |  |  |  |  | s/(?=[-\\\[\]])/\\/gso      for $src, $dst; | 
| 223 |  |  |  |  |  |  |  | 
| 224 |  |  |  |  |  |  | my | 
| 225 | 0 | 0 |  |  |  |  | $res = "tr\n[$src]\n[$dst]" . (@del?'d':''); | 
| 226 | 0 |  |  |  |  |  | $res.= ";s/$$_[0]/$$_[1];/g" for @ent; | 
| 227 | 0 |  |  |  |  |  | $res.= ";s/$$_[0]/$$_[1]/g"    for @eqv; | 
| 228 |  |  |  |  |  |  |  | 
| 229 | 0 |  |  |  |  |  | return $res; | 
| 230 |  |  |  |  |  |  | } | 
| 231 |  |  |  |  |  |  |  | 
| 232 |  |  |  |  |  |  | 1; | 
| 233 |  |  |  |  |  |  |  | 
| 234 |  |  |  |  |  |  | =head1 NAME | 
| 235 |  |  |  |  |  |  |  | 
| 236 |  |  |  |  |  |  | Unicode::Lite - Easy conversion between encodings | 
| 237 |  |  |  |  |  |  |  | 
| 238 |  |  |  |  |  |  | =head1 SYNOPSIS | 
| 239 |  |  |  |  |  |  |  | 
| 240 |  |  |  |  |  |  | use Unicode::Lite; | 
| 241 |  |  |  |  |  |  |  | 
| 242 |  |  |  |  |  |  | print convert( 'latin1', 'unicode', "hello world!" ); | 
| 243 |  |  |  |  |  |  |  | 
| 244 |  |  |  |  |  |  | local *lat2uni = convertor( 'latin1', 'unicode' ); | 
| 245 |  |  |  |  |  |  | print lat2uni( "hello world!" ); | 
| 246 |  |  |  |  |  |  |  | 
| 247 |  |  |  |  |  |  | my $lat2uni = convertor( 'latin1', 'unicode' ); | 
| 248 |  |  |  |  |  |  | print &$lat2uni( "hello world!" ); | 
| 249 |  |  |  |  |  |  |  | 
| 250 |  |  |  |  |  |  | =head1 DESCRIPTION | 
| 251 |  |  |  |  |  |  |  | 
| 252 |  |  |  |  |  |  | This module includes string converting function from one and to another | 
| 253 |  |  |  |  |  |  | charset. Requires installed Unicode::String and Unicode::Map packages. | 
| 254 |  |  |  |  |  |  |  | 
| 255 |  |  |  |  |  |  | Supported unicode charsets: unicode, utf16, ucs2, utf8, utf7, ucs4, | 
| 256 |  |  |  |  |  |  | uchr, uhex. | 
| 257 |  |  |  |  |  |  |  | 
| 258 |  |  |  |  |  |  | Supported Single-Byte Charsets (SBCS): latin1 and all installed maps in | 
| 259 |  |  |  |  |  |  | Unicode::Map package. | 
| 260 |  |  |  |  |  |  |  | 
| 261 |  |  |  |  |  |  | =head1 FUNCTIONS | 
| 262 |  |  |  |  |  |  |  | 
| 263 |  |  |  |  |  |  | =over 4 | 
| 264 |  |  |  |  |  |  |  | 
| 265 |  |  |  |  |  |  | =item B SRC_CP DST_CP [FLGS] [CHAR] | 
| 266 |  |  |  |  |  |  |  | 
| 267 |  |  |  |  |  |  | Creates convertor function and returns reference to her, for further | 
| 268 |  |  |  |  |  |  | fast direct call. | 
| 269 |  |  |  |  |  |  |  | 
| 270 |  |  |  |  |  |  | The param FLGS operates replacing by SBCS->SBCS converting if any char | 
| 271 |  |  |  |  |  |  | from SRC_CP is absent at DST_CP. The order of search of substitution: | 
| 272 |  |  |  |  |  |  |  | 
| 273 |  |  |  |  |  |  | UL_7BT - to equivalent 7bit char or sequence of 7bit chars | 
| 274 |  |  |  |  |  |  | UL_SEQ - to equivalent char or sequence of chars | 
| 275 |  |  |  |  |  |  | UL_EQV - to equivalent char | 
| 276 |  |  |  |  |  |  |  | 
| 277 |  |  |  |  |  |  | UL_ENT - to entity - | 
| 278 |  |  |  |  |  |  | UL_CHR - to [CHAR]. | 
| 279 |  |  |  |  |  |  | UL_ALL - UL_SEQ or UL_EQV and UL_ENT or UL_CHR | 
| 280 |  |  |  |  |  |  |  | 
| 281 |  |  |  |  |  |  | If flag UL_CHR or UL_ENT is not specified, absent chars will be deleted. | 
| 282 |  |  |  |  |  |  | Param CHAR used for replacing of absent chars. If CHAR is not specified, | 
| 283 |  |  |  |  |  |  | will be used '?' char. | 
| 284 |  |  |  |  |  |  |  | 
| 285 |  |  |  |  |  |  | If you are getting message "Character Set '' not defined!", run the | 
| 286 |  |  |  |  |  |  | script test.pl from distribution. | 
| 287 |  |  |  |  |  |  |  | 
| 288 |  |  |  |  |  |  | =item B SRC_CP DST_CP [VAR] [FLGS] [CHAR] | 
| 289 |  |  |  |  |  |  |  | 
| 290 |  |  |  |  |  |  | Convert VAR from SRC_CP codepage to DST_CP codepage and returns | 
| 291 |  |  |  |  |  |  | converted string. | 
| 292 |  |  |  |  |  |  |  | 
| 293 |  |  |  |  |  |  | =item B UNICODES... | 
| 294 |  |  |  |  |  |  |  | 
| 295 |  |  |  |  |  |  | The function adds a rule for equivalent char finding. Params is a list of | 
| 296 |  |  |  |  |  |  | hex unicodes of chars. For substitution on a sequence of characters, | 
| 297 |  |  |  |  |  |  | the codes of characters need to be connected in character '+'. | 
| 298 |  |  |  |  |  |  |  | 
| 299 |  |  |  |  |  |  | addequal( qw/2026 2E+2E+2E 3A/ ); # ELLIPSIS ... : | 
| 300 |  |  |  |  |  |  |  | 
| 301 |  |  |  |  |  |  | Note! Work of rules for finding of equivalent char is cascade: | 
| 302 |  |  |  |  |  |  |  | 
| 303 |  |  |  |  |  |  | 2500 002D      # - - | 
| 304 |  |  |  |  |  |  | 2550 2500      # = - | 
| 305 |  |  |  |  |  |  |  | 
| 306 |  |  |  |  |  |  | 2550 2500 002D # = - - | 
| 307 |  |  |  |  |  |  |  | 
| 308 |  |  |  |  |  |  | =back | 
| 309 |  |  |  |  |  |  |  | 
| 310 |  |  |  |  |  |  | The following rules are correct for converting functions: | 
| 311 |  |  |  |  |  |  |  | 
| 312 |  |  |  |  |  |  | VAR may be SCALAR or REF to SCALAR. | 
| 313 |  |  |  |  |  |  | If VAR is REF to SCALAR then SCALAR will be converted. | 
| 314 |  |  |  |  |  |  | If VAR is omitted, uses $_. | 
| 315 |  |  |  |  |  |  | If function called to void context and VAR is not REF then result placed to $_. | 
| 316 |  |  |  |  |  |  |  | 
| 317 |  |  |  |  |  |  | =head1 EXAMPLES | 
| 318 |  |  |  |  |  |  |  | 
| 319 |  |  |  |  |  |  | $_ = "drüben, Straße"; | 
| 320 |  |  |  |  |  |  | convert 'latin1', 'latin1', $_, UL_7BT; | 
| 321 |  |  |  |  |  |  | convert 'latin1', 'latin2', $_, UL_SEQ|UL_CHR, '?'; | 
| 322 |  |  |  |  |  |  | convert 'latin1', 'latin2', $_, UL_SEQ|UL_ENT, '?'; | 
| 323 |  |  |  |  |  |  |  | 
| 324 |  |  |  |  |  |  | # EQVIVALENT CALLS: | 
| 325 |  |  |  |  |  |  |  | 
| 326 |  |  |  |  |  |  | local *lat2uni = convertor( 'latin1', 'unicode' ); | 
| 327 |  |  |  |  |  |  |  | 
| 328 |  |  |  |  |  |  | lat2uni( $str );        # called to void context -> result placed to $_ | 
| 329 |  |  |  |  |  |  | $_ = lat2uni( $str ); | 
| 330 |  |  |  |  |  |  |  | 
| 331 |  |  |  |  |  |  | lat2uni( \$str );       # called with REF to string -> direct converting | 
| 332 |  |  |  |  |  |  | $str = lat2uni( $str ); | 
| 333 |  |  |  |  |  |  |  | 
| 334 |  |  |  |  |  |  | lat2uni();              # with omitted param called -> $_ converted | 
| 335 |  |  |  |  |  |  | lat2uni( \$_ ); | 
| 336 |  |  |  |  |  |  | $_ = lat2uni( $_ ); | 
| 337 |  |  |  |  |  |  |  | 
| 338 |  |  |  |  |  |  | =head1 AUTHOR | 
| 339 |  |  |  |  |  |  |  | 
| 340 |  |  |  |  |  |  | Albert MICHEEV | 
| 341 |  |  |  |  |  |  |  | 
| 342 |  |  |  |  |  |  | =head1 COPYRIGHT | 
| 343 |  |  |  |  |  |  |  | 
| 344 |  |  |  |  |  |  | Copyright (C) 2000, Albert MICHEEV | 
| 345 |  |  |  |  |  |  |  | 
| 346 |  |  |  |  |  |  | This module is free software; you can redistribute it or modify it | 
| 347 |  |  |  |  |  |  | under the same terms as Perl itself. | 
| 348 |  |  |  |  |  |  |  | 
| 349 |  |  |  |  |  |  | =head1 AVAILABILITY | 
| 350 |  |  |  |  |  |  |  | 
| 351 |  |  |  |  |  |  | The latest version of this library is likely to be available from: | 
| 352 |  |  |  |  |  |  |  | 
| 353 |  |  |  |  |  |  | http://www.perl.com/CPAN | 
| 354 |  |  |  |  |  |  |  | 
| 355 |  |  |  |  |  |  | =head1 SEE ALSO | 
| 356 |  |  |  |  |  |  |  | 
| 357 |  |  |  |  |  |  | Unicode::String, Unicode::Map, map | 
| 358 |  |  |  |  |  |  |  | 
| 359 |  |  |  |  |  |  | =cut | 
| 360 |  |  |  |  |  |  |  | 
| 361 |  |  |  |  |  |  | __DATA__ |