| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package EBook::Ishmael::CharDet::ShiftJIS; |
|
2
|
18
|
|
|
18
|
|
387
|
use 5.016; |
|
|
18
|
|
|
|
|
115
|
|
|
3
|
|
|
|
|
|
|
our $VERSION = '2.03'; |
|
4
|
18
|
|
|
18
|
|
118
|
use strict; |
|
|
18
|
|
|
|
|
37
|
|
|
|
18
|
|
|
|
|
453
|
|
|
5
|
18
|
|
|
18
|
|
135
|
use warnings; |
|
|
18
|
|
|
|
|
35
|
|
|
|
18
|
|
|
|
|
1023
|
|
|
6
|
|
|
|
|
|
|
|
|
7
|
18
|
|
|
18
|
|
103
|
use EBook::Ishmael::CharDet::Constants qw(:CONSTANTS); |
|
|
18
|
|
|
|
|
50
|
|
|
|
18
|
|
|
|
|
17413
|
|
|
8
|
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
# Generated from contrib/cjk2encodings.pl. |
|
10
|
|
|
|
|
|
|
my %SHIFTJIS_FREQS = map { $_ => 1 } ( |
|
11
|
|
|
|
|
|
|
33139,33140,33484,33469,33442,33461,33089,33481,33480,33485,33090,33476,33478, |
|
12
|
|
|
|
|
|
|
33449,33520,33444,33450,33513,33511,33477,33521,33504,33474,33473,33514,33453, |
|
13
|
|
|
|
|
|
|
33457,33500,33470,33512,33440,33463,33451,33506,33459,33467,33141,33142,33510, |
|
14
|
|
|
|
|
|
|
33517,33455,33115,33448,33471,33683,38110,33465,36972,33479,33503,33446,33624, |
|
15
|
|
|
|
|
|
|
33501,33494,33515,35050,33462,33486,33488,33491,36009,36759,36719,38314,33483, |
|
16
|
|
|
|
|
|
|
33497,36470,33093,35261,33640,33603,36502,33675,33518,33673,36584,36521,33639, |
|
17
|
|
|
|
|
|
|
36030,33674,33662,35165,37349,35540,33464,36510,33108,36211,33456,33601,33502, |
|
18
|
|
|
|
|
|
|
35727,37882,36835,37510,36465,37873,33489,36484,33458,35258,33630,33614,36257, |
|
19
|
|
|
|
|
|
|
36947,33492,36562,38395,37485,35270,39122,39010,33637,37046,38618,37199,33634, |
|
20
|
|
|
|
|
|
|
35651,33508,38823,33495,33452,33615,33652,33602,33623,38327,33605,33647,33475, |
|
21
|
|
|
|
|
|
|
37884,38523,33472,33505,40838,33653,33454,33482,33635,33460,35246,37300,37482, |
|
22
|
|
|
|
|
|
|
33677,33622,36067,33676,38312,35918,33661,37752,36700,33607,36582,36753,37966, |
|
23
|
|
|
|
|
|
|
33498,36509,35027,36255,33509,37578,36780,33642,36231,38259,36091,36967,36431, |
|
24
|
|
|
|
|
|
|
35508,38602,33667,38579,37202,37486,38292,33632,38590,38792,33626,36746,36204, |
|
25
|
|
|
|
|
|
|
36842,35407,33468,36781,39292,38633,33610,37080,36814,37807,37347,33466,37094, |
|
26
|
|
|
|
|
|
|
33609,37291,37727,36934,37705,36581,37509,38508,35257,36452,38561,33665,37026, |
|
27
|
|
|
|
|
|
|
38484,37043,35311,36950,38034,37802,36293,37258,38356,38813,38326,35808,36480, |
|
28
|
|
|
|
|
|
|
37559,35064,38262,36486,36936,36164,33611,37806,37817,40681,59640,33109,33638, |
|
29
|
|
|
|
|
|
|
38378,38275,36941,38588,36092,33671,36846,36511,35979,36068,36434,33519,36288, |
|
30
|
|
|
|
|
|
|
38626,38295,37050,34980,36577,38861,37330,36460,38260,37454,33612,36564,36254, |
|
31
|
|
|
|
|
|
|
36965,35807,36085,38798,33618,37856,33654,37564,35559,38764,58335,35982,35827, |
|
32
|
|
|
|
|
|
|
36060,36959,35662,38636,38616,38768,36226,37203,37782,37249,35016,37731,33631, |
|
33
|
|
|
|
|
|
|
37493,37548,35403,38140,37952,36020,37622,38726,33487,35402,35295,36949,38346, |
|
34
|
|
|
|
|
|
|
38772,33648,36708,36712,35760,33606,37317,35569,39803,33664,36544,36036,37350, |
|
35
|
|
|
|
|
|
|
35278,33641,38977,35553,37998,35763,37835,37554,36332,36461,33620,37720,36478, |
|
36
|
|
|
|
|
|
|
37035,33625,37545,37242,37049,37822,33143,36256,33144,33645,35041,37539,57793, |
|
37
|
|
|
|
|
|
|
35709,35008,35818,38351,38586,36296,36997,36745,38076,59795,38548,33656,38099, |
|
38
|
|
|
|
|
|
|
38774,36538,36455,37324,33684,38589,36595,39152,37831,38333,37308,37718,33669, |
|
39
|
|
|
|
|
|
|
36955,40804,35976,38732,37284,36421,37309,38841,36074,35930,35452,35284,35687, |
|
40
|
|
|
|
|
|
|
37481,36681,35312,38274,36261,37259,37767,37732,37786,35683,33613,35398,35266, |
|
41
|
|
|
|
|
|
|
37017,38998,36536,36462,36311,38493,40665,33646,35697,35011,38578,36241,35265, |
|
42
|
|
|
|
|
|
|
38270,35219,36958,36171,40102,38368,33660,38219,35542,36471,35777,37785,36592, |
|
43
|
|
|
|
|
|
|
36464,34987,38594,36035,35313,35166,38882,38010,37264,37855,36672,38623,38100, |
|
44
|
|
|
|
|
|
|
33616,39584,37100,35464,39817,35567,36801,38236,37070,37287,35660,33628,38489, |
|
45
|
|
|
|
|
|
|
35317,57699,35752,35300,37036,38129,37326,38808,36838,39002,37599,35061,36086, |
|
46
|
|
|
|
|
|
|
37461,33679,35822,37271,36019,35649,37742,38019,33659,37808,37188,35648,40396, |
|
47
|
|
|
|
|
|
|
58713,36956,38130,38828,59293,36695,35437,37084,38352,36506,35276,37608,36560, |
|
48
|
|
|
|
|
|
|
37625,36969,38241,35769,36520,35771,37076,33666,35936,38302,37012,35743,35449, |
|
49
|
|
|
|
|
|
|
37517,37332,40182,36845,35986,37753,57512,35030,38394,35757,35782,37352,36451, |
|
50
|
|
|
|
|
|
|
37507,38058, |
|
51
|
|
|
|
|
|
|
); |
|
52
|
|
|
|
|
|
|
|
|
53
|
|
|
|
|
|
|
# https://www-archive.mozilla.org/projects/intl/universalcharsetdetection |
|
54
|
|
|
|
|
|
|
my $DIST_RATIO = 0.93; |
|
55
|
|
|
|
|
|
|
|
|
56
|
|
|
|
|
|
|
sub new { |
|
57
|
|
|
|
|
|
|
|
|
58
|
67
|
|
|
67
|
0
|
190
|
my ($class) = @_; |
|
59
|
|
|
|
|
|
|
|
|
60
|
67
|
|
|
|
|
390
|
my $self = { |
|
61
|
|
|
|
|
|
|
Code => 0, |
|
62
|
|
|
|
|
|
|
Left => 0, |
|
63
|
|
|
|
|
|
|
Freqs => 0, |
|
64
|
|
|
|
|
|
|
MBs => 0, |
|
65
|
|
|
|
|
|
|
Total => 0, |
|
66
|
|
|
|
|
|
|
Bad => 0, |
|
67
|
|
|
|
|
|
|
}; |
|
68
|
|
|
|
|
|
|
|
|
69
|
67
|
|
|
|
|
870
|
return bless $self, $class; |
|
70
|
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
} |
|
72
|
|
|
|
|
|
|
|
|
73
|
|
|
|
|
|
|
sub take { |
|
74
|
|
|
|
|
|
|
|
|
75
|
4835
|
|
|
4835
|
0
|
9289
|
my ($self, $bytes) = @_; |
|
76
|
|
|
|
|
|
|
|
|
77
|
4835
|
50
|
|
|
|
10507
|
return TAKE_BAD if $self->{Bad}; |
|
78
|
|
|
|
|
|
|
|
|
79
|
4835
|
|
|
|
|
10968
|
for my $i (0 .. length($bytes) - 1) { |
|
80
|
76905
|
|
|
|
|
122561
|
my $b = ord(substr $bytes, $i, 1) & 0xff; |
|
81
|
76905
|
100
|
100
|
|
|
274950
|
if ($self->{Left}) { |
|
|
|
100
|
100
|
|
|
|
|
|
|
|
100
|
66
|
|
|
|
|
|
82
|
|
|
|
|
|
|
## So if the leading byte is even, the trailing byte must be in |
|
83
|
|
|
|
|
|
|
## range 0x9f-0xfc. If the leading byte is odd, is must in range |
|
84
|
|
|
|
|
|
|
## 0x40-0x9e (but not 0x7f). However, the texts I tested this |
|
85
|
|
|
|
|
|
|
## on did not follow these rules, so I guess we ignore them :-/. |
|
86
|
|
|
|
|
|
|
#if ($self->{Code} % 2 == 0) { |
|
87
|
|
|
|
|
|
|
# if ($b >= 0x9f && $b <= 0xfc) { |
|
88
|
|
|
|
|
|
|
# $self->{Code} = ($self->{Code} << 8) | $b; |
|
89
|
|
|
|
|
|
|
# } else { |
|
90
|
|
|
|
|
|
|
# $self->{Bad} = 1; |
|
91
|
|
|
|
|
|
|
# return TAKE_BAD; |
|
92
|
|
|
|
|
|
|
# } |
|
93
|
|
|
|
|
|
|
#} else { |
|
94
|
|
|
|
|
|
|
# if ($b >= 0x40 && $b <= 0x9e && $b != 0x7f) { |
|
95
|
|
|
|
|
|
|
# $self->{Code} = ($self->{Code} << 8) | $b; |
|
96
|
|
|
|
|
|
|
# } else { |
|
97
|
|
|
|
|
|
|
# say $self->{Total}; |
|
98
|
|
|
|
|
|
|
# $self->{Bad} = 1; |
|
99
|
|
|
|
|
|
|
# return TAKE_BAD; |
|
100
|
|
|
|
|
|
|
# } |
|
101
|
|
|
|
|
|
|
#} |
|
102
|
6723
|
|
|
|
|
12114
|
$self->{Code} = ($self->{Code} << 8) | $b; |
|
103
|
6723
|
100
|
|
|
|
18302
|
if (exists $SHIFTJIS_FREQS{ $self->{Code} }) { |
|
104
|
4365
|
|
|
|
|
6983
|
$self->{Freqs}++; |
|
105
|
|
|
|
|
|
|
} |
|
106
|
6723
|
|
|
|
|
10015
|
$self->{MBs}++; |
|
107
|
6723
|
|
|
|
|
10600
|
$self->{Total}++; |
|
108
|
6723
|
|
|
|
|
9804
|
$self->{Left}--; |
|
109
|
6723
|
|
|
|
|
11850
|
$self->{Code} = 0; |
|
110
|
|
|
|
|
|
|
} elsif (($b >= 0x81 && $b <= 0x9f) or ($b >= 0xe0 && $b <= 0xef)) { |
|
111
|
6730
|
|
|
|
|
10983
|
$self->{Code} = $b; |
|
112
|
6730
|
|
|
|
|
11974
|
$self->{Left} = 1; |
|
113
|
|
|
|
|
|
|
# TODO: Does this interfere with ShiftJIS extensions |
|
114
|
|
|
|
|
|
|
} elsif ($b & 0x80) { |
|
115
|
52
|
|
|
|
|
126
|
$self->{Bad} = 1; |
|
116
|
52
|
|
|
|
|
195
|
return TAKE_BAD; |
|
117
|
|
|
|
|
|
|
} else { |
|
118
|
63400
|
|
|
|
|
103717
|
$self->{Total}++; |
|
119
|
|
|
|
|
|
|
} |
|
120
|
|
|
|
|
|
|
} |
|
121
|
|
|
|
|
|
|
|
|
122
|
4783
|
|
|
|
|
12128
|
return TAKE_OK; |
|
123
|
|
|
|
|
|
|
|
|
124
|
|
|
|
|
|
|
} |
|
125
|
|
|
|
|
|
|
|
|
126
|
|
|
|
|
|
|
sub confidence { |
|
127
|
|
|
|
|
|
|
|
|
128
|
4
|
|
|
4
|
0
|
11
|
my ($self) = @_; |
|
129
|
|
|
|
|
|
|
|
|
130
|
4
|
50
|
33
|
|
|
29
|
if ($self->{Bad} or $self->{MBs} == 0) { |
|
131
|
0
|
|
|
|
|
0
|
return 0; |
|
132
|
|
|
|
|
|
|
} |
|
133
|
|
|
|
|
|
|
|
|
134
|
4
|
|
|
|
|
39
|
return $self->{Freqs} / $self->{MBs}; |
|
135
|
|
|
|
|
|
|
|
|
136
|
|
|
|
|
|
|
} |
|
137
|
|
|
|
|
|
|
|
|
138
|
|
|
|
|
|
|
sub bad { |
|
139
|
|
|
|
|
|
|
|
|
140
|
0
|
|
|
0
|
0
|
0
|
my ($self) = @_; |
|
141
|
|
|
|
|
|
|
|
|
142
|
0
|
|
|
|
|
0
|
return $self->{Bad}; |
|
143
|
|
|
|
|
|
|
|
|
144
|
|
|
|
|
|
|
} |
|
145
|
|
|
|
|
|
|
|
|
146
|
4
|
|
|
4
|
0
|
33
|
sub encoding { 'Shift_JIS' } |
|
147
|
|
|
|
|
|
|
|
|
148
|
|
|
|
|
|
|
1; |
|
149
|
|
|
|
|
|
|
|