| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package EBook::Ishmael::CharDet::EUCJP; |
|
2
|
18
|
|
|
18
|
|
361
|
use 5.016; |
|
|
18
|
|
|
|
|
79
|
|
|
3
|
|
|
|
|
|
|
our $VERSION = '2.03'; |
|
4
|
18
|
|
|
18
|
|
136
|
use strict; |
|
|
18
|
|
|
|
|
40
|
|
|
|
18
|
|
|
|
|
501
|
|
|
5
|
18
|
|
|
18
|
|
116
|
use warnings; |
|
|
18
|
|
|
|
|
36
|
|
|
|
18
|
|
|
|
|
1278
|
|
|
6
|
|
|
|
|
|
|
|
|
7
|
18
|
|
|
18
|
|
149
|
use EBook::Ishmael::CharDet::Constants qw(:CONSTANTS); |
|
|
18
|
|
|
|
|
34
|
|
|
|
18
|
|
|
|
|
23125
|
|
|
8
|
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
# Generated from contrib/cjk2encodings.pl |
|
10
|
|
|
|
|
|
|
my %EUCJP_FREQS = map { $_ => 1 } ( |
|
11
|
|
|
|
|
|
|
41428,41429,42190,42175,42148,42167,41378,42187,42186,42191,41379,42182,42184, |
|
12
|
|
|
|
|
|
|
42155,42226,42150,42156,42219,42217,42183,42227,42210,42180,42179,42220,42159, |
|
13
|
|
|
|
|
|
|
42163,42206,42176,42218,42146,42169,42157,42212,42165,42173,41430,41431,42216, |
|
14
|
|
|
|
|
|
|
42223,42161,41404,42154,42177,42483,51424,42171,49101,42185,42209,42152,42425, |
|
15
|
|
|
|
|
|
|
42207,42200,42221,45292,42168,42192,42194,42197,47275,48631,48592,51884,42189, |
|
16
|
|
|
|
|
|
|
42203,48087,41382,45759,42441,42404,48118,42475,42224,42473,48362,48299,42440, |
|
17
|
|
|
|
|
|
|
47296,42474,42463,45502,49895,46294,42170,48126,41397,47572,42162,42402,42208, |
|
18
|
|
|
|
|
|
|
46575,50940,48869,50150,48082,50931,42195,48100,42164,45756,42431,42415,47779, |
|
19
|
|
|
|
|
|
|
49076,42198,48340,51965,50126,45768,53460,53187,42438,49336,52444,49584,42435, |
|
20
|
|
|
|
|
|
|
46500,42214,52905,42201,42158,42416,42453,42403,42424,51897,42406,42448,42181, |
|
21
|
|
|
|
|
|
|
50942,52188,42178,42211,56806,42454,42160,42188,42436,42166,45744,49846,50123, |
|
22
|
|
|
|
|
|
|
42477,42423,47333,42476,51882,47023,42462,50649,48573,42408,48360,48625,51119, |
|
23
|
|
|
|
|
|
|
42204,48125,45269,47777,42215,50380,48814,42443,47591,51668,47357,49096,48048, |
|
24
|
|
|
|
|
|
|
46262,52428,42467,52405,49587,50127,51700,42433,52416,52712,42427,48618,47565, |
|
25
|
|
|
|
|
|
|
48876,46000,42174,48815,53725,52459,42411,49370,48848,50865,49893,42172,49384, |
|
26
|
|
|
|
|
|
|
42410,49837,50624,49063,50602,48359,50149,52173,45755,48069,52387,42465,49316, |
|
27
|
|
|
|
|
|
|
52149,49333,45809,49079,51186,50860,47815,49642,51926,52733,51896,46818,48096, |
|
28
|
|
|
|
|
|
|
50361,45306,51671,48102,49065,47525,42412,50864,50875,56555,61690,41398,42439, |
|
29
|
|
|
|
|
|
|
51948,51683,49070,52414,47358,42471,48880,48289,47083,47334,48051,42225,47810, |
|
30
|
|
|
|
|
|
|
52452,51703,49340,45222,48355,52943,49876,48077,51669,50095,42413,48342,47614, |
|
31
|
|
|
|
|
|
|
49094,46817,47351,52718,42419,50914,42455,50366,46313,52685,59105,47086,46837, |
|
32
|
|
|
|
|
|
|
47326,49088,46511,52462,52442,52689,47586,49588,50678,49633,45258,50628,42432, |
|
33
|
|
|
|
|
|
|
50134,50350,45996,51454,51105,47286,50424,52647,42193,45995,45793,49078,51916, |
|
34
|
|
|
|
|
|
|
52693,42449,48581,48585,46770,42407,49863,46323,54748,42464,48322,47302,49896, |
|
35
|
|
|
|
|
|
|
45776,42442,53154,46307,51151,46773,50893,50356,47854,48078,42421,50617,48095, |
|
36
|
|
|
|
|
|
|
49325,42426,50347,49627,49339,50880,41432,47778,41433,42446,45283,50341,58051, |
|
37
|
|
|
|
|
|
|
46558,45250,46828,51921,52412,47818,49125,48617,51390,61939,52212,42457,51413, |
|
38
|
|
|
|
|
|
|
52695,48316,48072,49870,42484,52415,48373,53490,50889,51903,49854,50615,42469, |
|
39
|
|
|
|
|
|
|
49084,56773,47080,52653,49830,48038,49855,52923,47340,47035,46045,45782,46536, |
|
40
|
|
|
|
|
|
|
50122,48554,45810,51682,47783,49643,50663,50629,50682,46532,42414,45991,45764, |
|
41
|
|
|
|
|
|
|
49145,53175,48314,48079,47833,52158,56539,42447,46546,45253,52404,47601,45763, |
|
42
|
|
|
|
|
|
|
51679,45555,49087,47532,55464,51938,42461,51628,46296,48088,46787,50681,48370, |
|
43
|
|
|
|
|
|
|
48081,45229,52420,47301,45811,45503,52964,51163,49648,50913,48545,52449,51414, |
|
44
|
|
|
|
|
|
|
42417,54434,49390,46056,54761,46321,48835,51645,49360,49833,46509,42429,52154, |
|
45
|
|
|
|
|
|
|
45815,57796,46762,45798,49326,51443,49872,52728,48872,53179,50401,45303,47352, |
|
46
|
|
|
|
|
|
|
50102,42479,46832,49655,47285,46498,50639,51171,42460,50866,49573,46497,56014, |
|
47
|
|
|
|
|
|
|
59834,49085,51444,52910,60925,48568,46030,49374,51922,48122,45774,50410,48338, |
|
48
|
|
|
|
|
|
|
50427,49098,51650,46779,48298,46781,49366,42466,47041,51710,49140,46753,46042, |
|
49
|
|
|
|
|
|
|
50157,49878,55544,48879,47090,50650,57514,45272,51964,46767,46792,49898,48068, |
|
50
|
|
|
|
|
|
|
50147,51372, |
|
51
|
|
|
|
|
|
|
); |
|
52
|
|
|
|
|
|
|
|
|
53
|
|
|
|
|
|
|
# https://www-archive.mozilla.org/projects/intl/universalcharsetdetection |
|
54
|
|
|
|
|
|
|
my $DIST_RATIO = 0.93; |
|
55
|
|
|
|
|
|
|
|
|
56
|
|
|
|
|
|
|
sub new { |
|
57
|
|
|
|
|
|
|
|
|
58
|
67
|
|
|
67
|
0
|
180
|
my ($class) = @_; |
|
59
|
|
|
|
|
|
|
|
|
60
|
67
|
|
|
|
|
597
|
my $self = { |
|
61
|
|
|
|
|
|
|
Code => 0, |
|
62
|
|
|
|
|
|
|
Left => 0, |
|
63
|
|
|
|
|
|
|
Set => 0, |
|
64
|
|
|
|
|
|
|
Freqs => 0, |
|
65
|
|
|
|
|
|
|
MBs => 0, |
|
66
|
|
|
|
|
|
|
Total => 0, |
|
67
|
|
|
|
|
|
|
Bad => 0, |
|
68
|
|
|
|
|
|
|
}; |
|
69
|
|
|
|
|
|
|
|
|
70
|
67
|
|
|
|
|
752
|
return bless $self, $class; |
|
71
|
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
} |
|
73
|
|
|
|
|
|
|
|
|
74
|
|
|
|
|
|
|
sub take { |
|
75
|
|
|
|
|
|
|
|
|
76
|
5800
|
|
|
5800
|
0
|
12138
|
my ($self, $bytes) = @_; |
|
77
|
|
|
|
|
|
|
|
|
78
|
5800
|
50
|
|
|
|
14089
|
return TAKE_BAD if $self->{Bad}; |
|
79
|
|
|
|
|
|
|
|
|
80
|
5800
|
|
|
|
|
13138
|
for my $i (0 .. length($bytes) - 1) { |
|
81
|
92455
|
|
|
|
|
153342
|
my $b = ord(substr $bytes, $i, 1) & 0xff; |
|
82
|
92455
|
100
|
66
|
|
|
356733
|
if ($self->{Set} == 3) { |
|
|
|
50
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
83
|
232
|
50
|
33
|
|
|
731
|
if ($b < 0xa1 or $b > 0xfe) { |
|
84
|
0
|
|
|
|
|
0
|
$self->{Bad} = 1; |
|
85
|
0
|
|
|
|
|
0
|
return TAKE_BAD; |
|
86
|
|
|
|
|
|
|
} |
|
87
|
232
|
|
|
|
|
397
|
$self->{Code} = ($self->{Code} << 8) | $b; |
|
88
|
232
|
|
|
|
|
321
|
$self->{Left}--; |
|
89
|
232
|
100
|
|
|
|
463
|
if ($self->{Left} == 0) { |
|
90
|
116
|
50
|
|
|
|
275
|
if (exists $EUCJP_FREQS{ $self->{Code} }) { |
|
91
|
0
|
|
|
|
|
0
|
$self->{Freqs}++; |
|
92
|
|
|
|
|
|
|
} |
|
93
|
116
|
|
|
|
|
196
|
$self->{Total}++; |
|
94
|
116
|
|
|
|
|
161
|
$self->{MBs}++; |
|
95
|
116
|
|
|
|
|
187
|
$self->{Code} = 0; |
|
96
|
116
|
|
|
|
|
226
|
$self->{Set} = 0; |
|
97
|
|
|
|
|
|
|
} |
|
98
|
|
|
|
|
|
|
} elsif ($self->{Set} == 2) { |
|
99
|
0
|
0
|
0
|
|
|
0
|
if ($b < 0xa1 or $b > 0xdf) { |
|
100
|
0
|
|
|
|
|
0
|
$self->{Bad} = 1; |
|
101
|
0
|
|
|
|
|
0
|
return TAKE_BAD; |
|
102
|
|
|
|
|
|
|
} |
|
103
|
0
|
|
|
|
|
0
|
$self->{Code} = ($self->{Code} << 8) | $b; |
|
104
|
0
|
|
|
|
|
0
|
$self->{Left}--; |
|
105
|
0
|
0
|
|
|
|
0
|
if (exists $EUCJP_FREQS{ $self->{Code} }) { |
|
106
|
0
|
|
|
|
|
0
|
$self->{Freqs}++; |
|
107
|
|
|
|
|
|
|
} |
|
108
|
0
|
|
|
|
|
0
|
$self->{Total}++; |
|
109
|
0
|
|
|
|
|
0
|
$self->{MBs}++; |
|
110
|
0
|
|
|
|
|
0
|
$self->{Code} = 0; |
|
111
|
0
|
|
|
|
|
0
|
$self->{Set} = 0; |
|
112
|
|
|
|
|
|
|
} elsif ($self->{Set} == 1) { |
|
113
|
|
|
|
|
|
|
# The range is supposed to be 0xa1-0xfe, but HP-16 allows for |
|
114
|
|
|
|
|
|
|
# lower trailing bytes. |
|
115
|
23013
|
100
|
66
|
|
|
72326
|
if ($b < 0x21 or $b > 0xfe) { |
|
116
|
16
|
|
|
|
|
31
|
$self->{Bad} = 1; |
|
117
|
16
|
|
|
|
|
65
|
return TAKE_BAD; |
|
118
|
|
|
|
|
|
|
} |
|
119
|
22997
|
|
|
|
|
42246
|
$self->{Code} = ($self->{Code} << 8) | $b; |
|
120
|
22997
|
100
|
|
|
|
53284
|
if (exists $EUCJP_FREQS{ $self->{Code} }) { |
|
121
|
6427
|
|
|
|
|
9269
|
$self->{Freqs}++; |
|
122
|
|
|
|
|
|
|
} |
|
123
|
22997
|
|
|
|
|
33486
|
$self->{Left}--; |
|
124
|
22997
|
|
|
|
|
31678
|
$self->{MBs}++; |
|
125
|
22997
|
|
|
|
|
32564
|
$self->{Total}++; |
|
126
|
22997
|
|
|
|
|
33131
|
$self->{Code} = 0; |
|
127
|
22997
|
|
|
|
|
38269
|
$self->{Set} = 0; |
|
128
|
|
|
|
|
|
|
} elsif ($b == 0x8f) { |
|
129
|
116
|
|
|
|
|
179
|
$self->{Set} = 3; |
|
130
|
116
|
|
|
|
|
229
|
$self->{Left} = 2; |
|
131
|
116
|
|
|
|
|
185
|
$self->{Code} = $b; |
|
132
|
|
|
|
|
|
|
} elsif ($b == 0x8e) { |
|
133
|
0
|
|
|
|
|
0
|
$self->{Set} = 2; |
|
134
|
0
|
|
|
|
|
0
|
$self->{Left} = 1; |
|
135
|
0
|
|
|
|
|
0
|
$self->{Code} = $b; |
|
136
|
|
|
|
|
|
|
} elsif ($b >= 0xa1 && $b <= 0xfe) { |
|
137
|
23015
|
|
|
|
|
35319
|
$self->{Set} = 1; |
|
138
|
23015
|
|
|
|
|
34483
|
$self->{Left} = 1; |
|
139
|
23015
|
|
|
|
|
37873
|
$self->{Code} = $b; |
|
140
|
|
|
|
|
|
|
} elsif ($b & 0x80) { |
|
141
|
24
|
|
|
|
|
70
|
$self->{Bad} = 1; |
|
142
|
24
|
|
|
|
|
97
|
return TAKE_BAD; |
|
143
|
|
|
|
|
|
|
} else { |
|
144
|
46055
|
|
|
|
|
77847
|
$self->{Total}++; |
|
145
|
|
|
|
|
|
|
} |
|
146
|
|
|
|
|
|
|
} |
|
147
|
|
|
|
|
|
|
|
|
148
|
5760
|
|
|
|
|
16124
|
return TAKE_OK; |
|
149
|
|
|
|
|
|
|
|
|
150
|
|
|
|
|
|
|
} |
|
151
|
|
|
|
|
|
|
|
|
152
|
|
|
|
|
|
|
sub confidence { |
|
153
|
|
|
|
|
|
|
|
|
154
|
7
|
|
|
7
|
0
|
15
|
my ($self) = @_; |
|
155
|
|
|
|
|
|
|
|
|
156
|
7
|
50
|
33
|
|
|
98
|
if ($self->{Bad} or $self->{MBs} == 0) { |
|
157
|
0
|
|
|
|
|
0
|
return 0; |
|
158
|
|
|
|
|
|
|
} |
|
159
|
|
|
|
|
|
|
|
|
160
|
7
|
|
|
|
|
33
|
return $self->{Freqs} / $self->{MBs}; |
|
161
|
|
|
|
|
|
|
|
|
162
|
|
|
|
|
|
|
} |
|
163
|
|
|
|
|
|
|
|
|
164
|
|
|
|
|
|
|
sub bad { |
|
165
|
|
|
|
|
|
|
|
|
166
|
0
|
|
|
0
|
0
|
0
|
my ($self) = @_; |
|
167
|
|
|
|
|
|
|
|
|
168
|
0
|
|
|
|
|
0
|
return $self->{Bad}; |
|
169
|
|
|
|
|
|
|
|
|
170
|
|
|
|
|
|
|
} |
|
171
|
|
|
|
|
|
|
|
|
172
|
7
|
|
|
7
|
0
|
23
|
sub encoding { 'EUC-JP' } |
|
173
|
|
|
|
|
|
|
|
|
174
|
|
|
|
|
|
|
1; |
|
175
|
|
|
|
|
|
|
|