File Coverage

blib/lib/EBook/Ishmael/CharDet/EUCJP.pm
Criterion Covered Total %
statement 53 73 72.6
branch 22 32 68.7
condition 6 15 40.0
subroutine 8 9 88.8
pod 0 5 0.0
total 89 134 66.4


line stmt bran cond sub pod time code
1             package EBook::Ishmael::CharDet::EUCJP;
2 18     18   361 use 5.016;
  18         79  
3             our $VERSION = '2.03';
4 18     18   136 use strict;
  18         40  
  18         501  
5 18     18   116 use warnings;
  18         36  
  18         1278  
6              
7 18     18   149 use EBook::Ishmael::CharDet::Constants qw(:CONSTANTS);
  18         34  
  18         23125  
8              
9             # Generated from contrib/cjk2encodings.pl
10             my %EUCJP_FREQS = map { $_ => 1 } (
11             41428,41429,42190,42175,42148,42167,41378,42187,42186,42191,41379,42182,42184,
12             42155,42226,42150,42156,42219,42217,42183,42227,42210,42180,42179,42220,42159,
13             42163,42206,42176,42218,42146,42169,42157,42212,42165,42173,41430,41431,42216,
14             42223,42161,41404,42154,42177,42483,51424,42171,49101,42185,42209,42152,42425,
15             42207,42200,42221,45292,42168,42192,42194,42197,47275,48631,48592,51884,42189,
16             42203,48087,41382,45759,42441,42404,48118,42475,42224,42473,48362,48299,42440,
17             47296,42474,42463,45502,49895,46294,42170,48126,41397,47572,42162,42402,42208,
18             46575,50940,48869,50150,48082,50931,42195,48100,42164,45756,42431,42415,47779,
19             49076,42198,48340,51965,50126,45768,53460,53187,42438,49336,52444,49584,42435,
20             46500,42214,52905,42201,42158,42416,42453,42403,42424,51897,42406,42448,42181,
21             50942,52188,42178,42211,56806,42454,42160,42188,42436,42166,45744,49846,50123,
22             42477,42423,47333,42476,51882,47023,42462,50649,48573,42408,48360,48625,51119,
23             42204,48125,45269,47777,42215,50380,48814,42443,47591,51668,47357,49096,48048,
24             46262,52428,42467,52405,49587,50127,51700,42433,52416,52712,42427,48618,47565,
25             48876,46000,42174,48815,53725,52459,42411,49370,48848,50865,49893,42172,49384,
26             42410,49837,50624,49063,50602,48359,50149,52173,45755,48069,52387,42465,49316,
27             52149,49333,45809,49079,51186,50860,47815,49642,51926,52733,51896,46818,48096,
28             50361,45306,51671,48102,49065,47525,42412,50864,50875,56555,61690,41398,42439,
29             51948,51683,49070,52414,47358,42471,48880,48289,47083,47334,48051,42225,47810,
30             52452,51703,49340,45222,48355,52943,49876,48077,51669,50095,42413,48342,47614,
31             49094,46817,47351,52718,42419,50914,42455,50366,46313,52685,59105,47086,46837,
32             47326,49088,46511,52462,52442,52689,47586,49588,50678,49633,45258,50628,42432,
33             50134,50350,45996,51454,51105,47286,50424,52647,42193,45995,45793,49078,51916,
34             52693,42449,48581,48585,46770,42407,49863,46323,54748,42464,48322,47302,49896,
35             45776,42442,53154,46307,51151,46773,50893,50356,47854,48078,42421,50617,48095,
36             49325,42426,50347,49627,49339,50880,41432,47778,41433,42446,45283,50341,58051,
37             46558,45250,46828,51921,52412,47818,49125,48617,51390,61939,52212,42457,51413,
38             52695,48316,48072,49870,42484,52415,48373,53490,50889,51903,49854,50615,42469,
39             49084,56773,47080,52653,49830,48038,49855,52923,47340,47035,46045,45782,46536,
40             50122,48554,45810,51682,47783,49643,50663,50629,50682,46532,42414,45991,45764,
41             49145,53175,48314,48079,47833,52158,56539,42447,46546,45253,52404,47601,45763,
42             51679,45555,49087,47532,55464,51938,42461,51628,46296,48088,46787,50681,48370,
43             48081,45229,52420,47301,45811,45503,52964,51163,49648,50913,48545,52449,51414,
44             42417,54434,49390,46056,54761,46321,48835,51645,49360,49833,46509,42429,52154,
45             45815,57796,46762,45798,49326,51443,49872,52728,48872,53179,50401,45303,47352,
46             50102,42479,46832,49655,47285,46498,50639,51171,42460,50866,49573,46497,56014,
47             59834,49085,51444,52910,60925,48568,46030,49374,51922,48122,45774,50410,48338,
48             50427,49098,51650,46779,48298,46781,49366,42466,47041,51710,49140,46753,46042,
49             50157,49878,55544,48879,47090,50650,57514,45272,51964,46767,46792,49898,48068,
50             50147,51372,
51             );
52              
53             # https://www-archive.mozilla.org/projects/intl/universalcharsetdetection
54             my $DIST_RATIO = 0.93;
55              
56             sub new {
57              
58 67     67 0 180 my ($class) = @_;
59              
60 67         597 my $self = {
61             Code => 0,
62             Left => 0,
63             Set => 0,
64             Freqs => 0,
65             MBs => 0,
66             Total => 0,
67             Bad => 0,
68             };
69              
70 67         752 return bless $self, $class;
71              
72             }
73              
74             sub take {
75              
76 5800     5800 0 12138 my ($self, $bytes) = @_;
77              
78 5800 50       14089 return TAKE_BAD if $self->{Bad};
79              
80 5800         13138 for my $i (0 .. length($bytes) - 1) {
81 92455         153342 my $b = ord(substr $bytes, $i, 1) & 0xff;
82 92455 100 66     356733 if ($self->{Set} == 3) {
    50          
    100          
    100          
    50          
    100          
    100          
83 232 50 33     731 if ($b < 0xa1 or $b > 0xfe) {
84 0         0 $self->{Bad} = 1;
85 0         0 return TAKE_BAD;
86             }
87 232         397 $self->{Code} = ($self->{Code} << 8) | $b;
88 232         321 $self->{Left}--;
89 232 100       463 if ($self->{Left} == 0) {
90 116 50       275 if (exists $EUCJP_FREQS{ $self->{Code} }) {
91 0         0 $self->{Freqs}++;
92             }
93 116         196 $self->{Total}++;
94 116         161 $self->{MBs}++;
95 116         187 $self->{Code} = 0;
96 116         226 $self->{Set} = 0;
97             }
98             } elsif ($self->{Set} == 2) {
99 0 0 0     0 if ($b < 0xa1 or $b > 0xdf) {
100 0         0 $self->{Bad} = 1;
101 0         0 return TAKE_BAD;
102             }
103 0         0 $self->{Code} = ($self->{Code} << 8) | $b;
104 0         0 $self->{Left}--;
105 0 0       0 if (exists $EUCJP_FREQS{ $self->{Code} }) {
106 0         0 $self->{Freqs}++;
107             }
108 0         0 $self->{Total}++;
109 0         0 $self->{MBs}++;
110 0         0 $self->{Code} = 0;
111 0         0 $self->{Set} = 0;
112             } elsif ($self->{Set} == 1) {
113             # The range is supposed to be 0xa1-0xfe, but HP-16 allows for
114             # lower trailing bytes.
115 23013 100 66     72326 if ($b < 0x21 or $b > 0xfe) {
116 16         31 $self->{Bad} = 1;
117 16         65 return TAKE_BAD;
118             }
119 22997         42246 $self->{Code} = ($self->{Code} << 8) | $b;
120 22997 100       53284 if (exists $EUCJP_FREQS{ $self->{Code} }) {
121 6427         9269 $self->{Freqs}++;
122             }
123 22997         33486 $self->{Left}--;
124 22997         31678 $self->{MBs}++;
125 22997         32564 $self->{Total}++;
126 22997         33131 $self->{Code} = 0;
127 22997         38269 $self->{Set} = 0;
128             } elsif ($b == 0x8f) {
129 116         179 $self->{Set} = 3;
130 116         229 $self->{Left} = 2;
131 116         185 $self->{Code} = $b;
132             } elsif ($b == 0x8e) {
133 0         0 $self->{Set} = 2;
134 0         0 $self->{Left} = 1;
135 0         0 $self->{Code} = $b;
136             } elsif ($b >= 0xa1 && $b <= 0xfe) {
137 23015         35319 $self->{Set} = 1;
138 23015         34483 $self->{Left} = 1;
139 23015         37873 $self->{Code} = $b;
140             } elsif ($b & 0x80) {
141 24         70 $self->{Bad} = 1;
142 24         97 return TAKE_BAD;
143             } else {
144 46055         77847 $self->{Total}++;
145             }
146             }
147              
148 5760         16124 return TAKE_OK;
149              
150             }
151              
152             sub confidence {
153              
154 7     7 0 15 my ($self) = @_;
155              
156 7 50 33     98 if ($self->{Bad} or $self->{MBs} == 0) {
157 0         0 return 0;
158             }
159              
160 7         33 return $self->{Freqs} / $self->{MBs};
161              
162             }
163              
164             sub bad {
165              
166 0     0 0 0 my ($self) = @_;
167              
168 0         0 return $self->{Bad};
169              
170             }
171              
172 7     7 0 23 sub encoding { 'EUC-JP' }
173              
174             1;
175