File Coverage

blib/lib/EBook/Ishmael/CharDet/ShiftJIS.pm
Criterion Covered Total %
statement 36 39 92.3
branch 10 12 83.3
condition 9 12 75.0
subroutine 8 9 88.8
pod 0 5 0.0
total 63 77 81.8


line stmt bran cond sub pod time code
1             package EBook::Ishmael::CharDet::ShiftJIS;
2 18     18   387 use 5.016;
  18         115  
3             our $VERSION = '2.03';
4 18     18   118 use strict;
  18         37  
  18         453  
5 18     18   135 use warnings;
  18         35  
  18         1023  
6              
7 18     18   103 use EBook::Ishmael::CharDet::Constants qw(:CONSTANTS);
  18         50  
  18         17413  
8              
9             # Generated from contrib/cjk2encodings.pl.
10             my %SHIFTJIS_FREQS = map { $_ => 1 } (
11             33139,33140,33484,33469,33442,33461,33089,33481,33480,33485,33090,33476,33478,
12             33449,33520,33444,33450,33513,33511,33477,33521,33504,33474,33473,33514,33453,
13             33457,33500,33470,33512,33440,33463,33451,33506,33459,33467,33141,33142,33510,
14             33517,33455,33115,33448,33471,33683,38110,33465,36972,33479,33503,33446,33624,
15             33501,33494,33515,35050,33462,33486,33488,33491,36009,36759,36719,38314,33483,
16             33497,36470,33093,35261,33640,33603,36502,33675,33518,33673,36584,36521,33639,
17             36030,33674,33662,35165,37349,35540,33464,36510,33108,36211,33456,33601,33502,
18             35727,37882,36835,37510,36465,37873,33489,36484,33458,35258,33630,33614,36257,
19             36947,33492,36562,38395,37485,35270,39122,39010,33637,37046,38618,37199,33634,
20             35651,33508,38823,33495,33452,33615,33652,33602,33623,38327,33605,33647,33475,
21             37884,38523,33472,33505,40838,33653,33454,33482,33635,33460,35246,37300,37482,
22             33677,33622,36067,33676,38312,35918,33661,37752,36700,33607,36582,36753,37966,
23             33498,36509,35027,36255,33509,37578,36780,33642,36231,38259,36091,36967,36431,
24             35508,38602,33667,38579,37202,37486,38292,33632,38590,38792,33626,36746,36204,
25             36842,35407,33468,36781,39292,38633,33610,37080,36814,37807,37347,33466,37094,
26             33609,37291,37727,36934,37705,36581,37509,38508,35257,36452,38561,33665,37026,
27             38484,37043,35311,36950,38034,37802,36293,37258,38356,38813,38326,35808,36480,
28             37559,35064,38262,36486,36936,36164,33611,37806,37817,40681,59640,33109,33638,
29             38378,38275,36941,38588,36092,33671,36846,36511,35979,36068,36434,33519,36288,
30             38626,38295,37050,34980,36577,38861,37330,36460,38260,37454,33612,36564,36254,
31             36965,35807,36085,38798,33618,37856,33654,37564,35559,38764,58335,35982,35827,
32             36060,36959,35662,38636,38616,38768,36226,37203,37782,37249,35016,37731,33631,
33             37493,37548,35403,38140,37952,36020,37622,38726,33487,35402,35295,36949,38346,
34             38772,33648,36708,36712,35760,33606,37317,35569,39803,33664,36544,36036,37350,
35             35278,33641,38977,35553,37998,35763,37835,37554,36332,36461,33620,37720,36478,
36             37035,33625,37545,37242,37049,37822,33143,36256,33144,33645,35041,37539,57793,
37             35709,35008,35818,38351,38586,36296,36997,36745,38076,59795,38548,33656,38099,
38             38774,36538,36455,37324,33684,38589,36595,39152,37831,38333,37308,37718,33669,
39             36955,40804,35976,38732,37284,36421,37309,38841,36074,35930,35452,35284,35687,
40             37481,36681,35312,38274,36261,37259,37767,37732,37786,35683,33613,35398,35266,
41             37017,38998,36536,36462,36311,38493,40665,33646,35697,35011,38578,36241,35265,
42             38270,35219,36958,36171,40102,38368,33660,38219,35542,36471,35777,37785,36592,
43             36464,34987,38594,36035,35313,35166,38882,38010,37264,37855,36672,38623,38100,
44             33616,39584,37100,35464,39817,35567,36801,38236,37070,37287,35660,33628,38489,
45             35317,57699,35752,35300,37036,38129,37326,38808,36838,39002,37599,35061,36086,
46             37461,33679,35822,37271,36019,35649,37742,38019,33659,37808,37188,35648,40396,
47             58713,36956,38130,38828,59293,36695,35437,37084,38352,36506,35276,37608,36560,
48             37625,36969,38241,35769,36520,35771,37076,33666,35936,38302,37012,35743,35449,
49             37517,37332,40182,36845,35986,37753,57512,35030,38394,35757,35782,37352,36451,
50             37507,38058,
51             );
52              
53             # https://www-archive.mozilla.org/projects/intl/universalcharsetdetection
54             my $DIST_RATIO = 0.93;
55              
56             sub new {
57              
58 67     67 0 190 my ($class) = @_;
59              
60 67         390 my $self = {
61             Code => 0,
62             Left => 0,
63             Freqs => 0,
64             MBs => 0,
65             Total => 0,
66             Bad => 0,
67             };
68              
69 67         870 return bless $self, $class;
70              
71             }
72              
73             sub take {
74              
75 4835     4835 0 9289 my ($self, $bytes) = @_;
76              
77 4835 50       10507 return TAKE_BAD if $self->{Bad};
78              
79 4835         10968 for my $i (0 .. length($bytes) - 1) {
80 76905         122561 my $b = ord(substr $bytes, $i, 1) & 0xff;
81 76905 100 100     274950 if ($self->{Left}) {
    100 100        
    100 66        
82             ## So if the leading byte is even, the trailing byte must be in
83             ## range 0x9f-0xfc. If the leading byte is odd, is must in range
84             ## 0x40-0x9e (but not 0x7f). However, the texts I tested this
85             ## on did not follow these rules, so I guess we ignore them :-/.
86             #if ($self->{Code} % 2 == 0) {
87             # if ($b >= 0x9f && $b <= 0xfc) {
88             # $self->{Code} = ($self->{Code} << 8) | $b;
89             # } else {
90             # $self->{Bad} = 1;
91             # return TAKE_BAD;
92             # }
93             #} else {
94             # if ($b >= 0x40 && $b <= 0x9e && $b != 0x7f) {
95             # $self->{Code} = ($self->{Code} << 8) | $b;
96             # } else {
97             # say $self->{Total};
98             # $self->{Bad} = 1;
99             # return TAKE_BAD;
100             # }
101             #}
102 6723         12114 $self->{Code} = ($self->{Code} << 8) | $b;
103 6723 100       18302 if (exists $SHIFTJIS_FREQS{ $self->{Code} }) {
104 4365         6983 $self->{Freqs}++;
105             }
106 6723         10015 $self->{MBs}++;
107 6723         10600 $self->{Total}++;
108 6723         9804 $self->{Left}--;
109 6723         11850 $self->{Code} = 0;
110             } elsif (($b >= 0x81 && $b <= 0x9f) or ($b >= 0xe0 && $b <= 0xef)) {
111 6730         10983 $self->{Code} = $b;
112 6730         11974 $self->{Left} = 1;
113             # TODO: Does this interfere with ShiftJIS extensions
114             } elsif ($b & 0x80) {
115 52         126 $self->{Bad} = 1;
116 52         195 return TAKE_BAD;
117             } else {
118 63400         103717 $self->{Total}++;
119             }
120             }
121              
122 4783         12128 return TAKE_OK;
123              
124             }
125              
126             sub confidence {
127              
128 4     4 0 11 my ($self) = @_;
129              
130 4 50 33     29 if ($self->{Bad} or $self->{MBs} == 0) {
131 0         0 return 0;
132             }
133              
134 4         39 return $self->{Freqs} / $self->{MBs};
135              
136             }
137              
138             sub bad {
139              
140 0     0 0 0 my ($self) = @_;
141              
142 0         0 return $self->{Bad};
143              
144             }
145              
146 4     4 0 33 sub encoding { 'Shift_JIS' }
147              
148             1;
149