| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package EBook::Ishmael::CharDet::GB2312; |
|
2
|
18
|
|
|
18
|
|
326
|
use 5.016; |
|
|
18
|
|
|
|
|
69
|
|
|
3
|
|
|
|
|
|
|
our $VERSION = '2.03'; |
|
4
|
18
|
|
|
18
|
|
115
|
use strict; |
|
|
18
|
|
|
|
|
39
|
|
|
|
18
|
|
|
|
|
575
|
|
|
5
|
18
|
|
|
18
|
|
154
|
use warnings; |
|
|
18
|
|
|
|
|
37
|
|
|
|
18
|
|
|
|
|
1111
|
|
|
6
|
|
|
|
|
|
|
|
|
7
|
18
|
|
|
18
|
|
104
|
use EBook::Ishmael::CharDet::Constants qw(:CONSTANTS); |
|
|
18
|
|
|
|
|
40
|
|
|
|
18
|
|
|
|
|
16383
|
|
|
8
|
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
# Generated from contrib/cjk2encodings.pl |
|
10
|
|
|
|
|
|
|
my %GB2312_FREQS = map { $_ => 1 } ( |
|
11
|
|
|
|
|
|
|
41379,53947,45755,51403,49611,46532,54958,41400,41401,54224,51911,52946,46528, |
|
12
|
|
|
|
|
|
|
54490,55251,52219,46323,53938,51663,51365,50403,46531,54992,53186,48338,53972, |
|
13
|
|
|
|
|
|
|
46840,46525,46283,50916,51413,46846,51706,46070,51453,55032,41378,48845,51431, |
|
14
|
|
|
|
|
|
|
50375,52460,54230,51906,54954,53930,53409,55252,51886,49359,54751,54459,47531, |
|
15
|
|
|
|
|
|
|
49097,49076,52217,51387,47822,53444,45265,51120,48088,48625,47811,50410,54971, |
|
16
|
|
|
|
|
|
|
41398,41399,53456,51645,52725,57825,50150,53969,60598,50396,48624,50928,53216, |
|
17
|
|
|
|
|
|
|
54977,46552,53192,54781,52164,51693,46816,55031,50606,51926,50171,45539,55263, |
|
18
|
|
|
|
|
|
|
53716,50167,55287,47577,52932,48592,46780,52652,52706,52160,47090,53984,52965, |
|
19
|
|
|
|
|
|
|
52395,47037,49391,46536,52916,52729,46322,51697,54257,47562,45271,54002,54466, |
|
20
|
|
|
|
|
|
|
46025,48893,53986,48040,53227,45502,51701,52983,47821,50379,54211,47062,45528, |
|
21
|
|
|
|
|
|
|
52142,45273,47327,55201,51444,55286,52923,53971,45560,48308,45234,51903,46556, |
|
22
|
|
|
|
|
|
|
55288,49847,51897,51435,54758,51887,47288,49114,50172,51154,47047,51673,50371, |
|
23
|
|
|
|
|
|
|
53462,55254,47039,51173,45765,51664,45259,48838,49390,46036,53417,50383,53945, |
|
24
|
|
|
|
|
|
|
54489,48596,45499,47334,46760,53414,48588,51904,48314,51177,54470,53445,49657, |
|
25
|
|
|
|
|
|
|
50911,48837,47862,46554,52964,50415,54234,47016,47314,47351,46530,49570,46507, |
|
26
|
|
|
|
|
|
|
52204,53723,52173,50360,47597,51168,47859,51361,49389,52648,50877,54442,46286, |
|
27
|
|
|
|
|
|
|
47278,46023,53170,45291,54517,53954,53234,47045,48304,55005,52419,46521,53461, |
|
28
|
|
|
|
|
|
|
51919,48328,51111,52961,47586,48610,48557,55004,46839,51187,54445,54267,54520, |
|
29
|
|
|
|
|
|
|
54503,52975,47779,45987,55000,46266,54961,45489,46518,51371,53219,49646,47861, |
|
30
|
|
|
|
|
|
|
51627,46252,45277,52731,49135,47611,49574,48114,49101,52422,45806,53467,53193, |
|
31
|
|
|
|
|
|
|
52221,51932,47354,48611,48825,51927,53442,47557,54217,48034,45999,45475,55267, |
|
32
|
|
|
|
|
|
|
54178,54262,46842,48851,50367,48097,55279,41395,41394,50669,49622,47823,48571, |
|
33
|
|
|
|
|
|
|
48809,49652,50086,48112,52166,46333,47335,54225,49655,55479,52174,48877,48835, |
|
34
|
|
|
|
|
|
|
46012,45729,48305,49892,47356,49132,54251,47555,50355,51895,50910,46069,51890, |
|
35
|
|
|
|
|
|
|
48595,55258,48047,54974,47827,51928,46570,47524,50626,54458,52156,51879,52717, |
|
36
|
|
|
|
|
|
|
50161,47088,54741,50112,49109,50366,47580,51172,51654,48636,51925,47102,49332, |
|
37
|
|
|
|
|
|
|
47267,48840,53484,54968,50346,53460,51694,47858,47028,54240,50111,49353,52906, |
|
38
|
|
|
|
|
|
|
45237,47086,49120,53458,48868,46330,51891,52714,49103,55283,52158,51421,45756, |
|
39
|
|
|
|
|
|
|
47860,47574,41402,54703,45500,52195, |
|
40
|
|
|
|
|
|
|
); |
|
41
|
|
|
|
|
|
|
|
|
42
|
|
|
|
|
|
|
# https://www-archive.mozilla.org/projects/intl/universalcharsetdetection |
|
43
|
|
|
|
|
|
|
my $DIST_RATIO = 0.75; |
|
44
|
|
|
|
|
|
|
|
|
45
|
|
|
|
|
|
|
sub new { |
|
46
|
|
|
|
|
|
|
|
|
47
|
67
|
|
|
67
|
0
|
181
|
my ($class) = @_; |
|
48
|
|
|
|
|
|
|
|
|
49
|
67
|
|
|
|
|
408
|
my $self = { |
|
50
|
|
|
|
|
|
|
Code => 0, |
|
51
|
|
|
|
|
|
|
Left => 0, |
|
52
|
|
|
|
|
|
|
Freqs => 0, |
|
53
|
|
|
|
|
|
|
MBs => 0, |
|
54
|
|
|
|
|
|
|
Total => 0, |
|
55
|
|
|
|
|
|
|
Bad => 0, |
|
56
|
|
|
|
|
|
|
}; |
|
57
|
|
|
|
|
|
|
|
|
58
|
67
|
|
|
|
|
294
|
return bless $self, $class; |
|
59
|
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
} |
|
61
|
|
|
|
|
|
|
|
|
62
|
|
|
|
|
|
|
sub take { |
|
63
|
|
|
|
|
|
|
|
|
64
|
4434
|
|
|
4434
|
0
|
8341
|
my ($self, $bytes) = @_; |
|
65
|
|
|
|
|
|
|
|
|
66
|
4434
|
50
|
|
|
|
10115
|
return TAKE_BAD if $self->{Bad}; |
|
67
|
|
|
|
|
|
|
|
|
68
|
4434
|
|
|
|
|
9820
|
for my $i (0 .. length($bytes) - 1) { |
|
69
|
70492
|
|
|
|
|
107712
|
my $b = ord(substr $bytes, $i, 1) & 0xff; |
|
70
|
70492
|
100
|
66
|
|
|
175702
|
if ($self->{Left}) { |
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
71
|
15629
|
100
|
66
|
|
|
39242
|
if ($b >= 0xa1 && $b <= 0xfe) { |
|
72
|
15584
|
|
|
|
|
24985
|
$self->{Code} = ($self->{Code} << 8) | $b; |
|
73
|
15584
|
|
|
|
|
20046
|
$self->{Left}--; |
|
74
|
15584
|
100
|
|
|
|
32487
|
if (exists $GB2312_FREQS{ $self->{Code} }) { |
|
75
|
4265
|
|
|
|
|
6460
|
$self->{Freqs}++; |
|
76
|
|
|
|
|
|
|
} |
|
77
|
15584
|
|
|
|
|
20085
|
$self->{MBs}++; |
|
78
|
15584
|
|
|
|
|
19659
|
$self->{Total}++; |
|
79
|
15584
|
|
|
|
|
23852
|
$self->{Code} = 0; |
|
80
|
|
|
|
|
|
|
} else { |
|
81
|
45
|
|
|
|
|
92
|
$self->{Bad} = 1; |
|
82
|
45
|
|
|
|
|
140
|
return TAKE_BAD; |
|
83
|
|
|
|
|
|
|
} |
|
84
|
|
|
|
|
|
|
} elsif ($b >= 0xa1 && $b <= 0xfe) { |
|
85
|
15631
|
|
|
|
|
21629
|
$self->{Code} = $b; |
|
86
|
15631
|
|
|
|
|
22877
|
$self->{Left} = 1; |
|
87
|
|
|
|
|
|
|
} elsif ($b & 0x80) { |
|
88
|
9
|
|
|
|
|
23
|
$self->{Bad} = 1; |
|
89
|
9
|
|
|
|
|
29
|
return TAKE_BAD; |
|
90
|
|
|
|
|
|
|
} else { |
|
91
|
39223
|
|
|
|
|
64464
|
$self->{Total}++; |
|
92
|
|
|
|
|
|
|
} |
|
93
|
|
|
|
|
|
|
} |
|
94
|
|
|
|
|
|
|
|
|
95
|
4380
|
|
|
|
|
10680
|
return TAKE_OK; |
|
96
|
|
|
|
|
|
|
|
|
97
|
|
|
|
|
|
|
} |
|
98
|
|
|
|
|
|
|
|
|
99
|
|
|
|
|
|
|
sub confidence { |
|
100
|
|
|
|
|
|
|
|
|
101
|
5
|
|
|
5
|
0
|
11
|
my ($self) = @_; |
|
102
|
|
|
|
|
|
|
|
|
103
|
5
|
50
|
33
|
|
|
38
|
if ($self->{Bad} or $self->{MBs} == 0) { |
|
104
|
0
|
|
|
|
|
0
|
return 0; |
|
105
|
|
|
|
|
|
|
} |
|
106
|
|
|
|
|
|
|
|
|
107
|
5
|
|
|
|
|
83
|
return $self->{Freqs} / $self->{MBs}; |
|
108
|
|
|
|
|
|
|
|
|
109
|
|
|
|
|
|
|
} |
|
110
|
|
|
|
|
|
|
|
|
111
|
|
|
|
|
|
|
sub bad { |
|
112
|
|
|
|
|
|
|
|
|
113
|
0
|
|
|
0
|
0
|
0
|
my ($self) = @_; |
|
114
|
|
|
|
|
|
|
|
|
115
|
0
|
|
|
|
|
0
|
return $self->{Bad}; |
|
116
|
|
|
|
|
|
|
|
|
117
|
|
|
|
|
|
|
} |
|
118
|
|
|
|
|
|
|
|
|
119
|
5
|
|
|
5
|
0
|
21
|
sub encoding { 'GB2312' } |
|
120
|
|
|
|
|
|
|
|
|
121
|
|
|
|
|
|
|
1; |
|
122
|
|
|
|
|
|
|
|