File Coverage

blib/lib/EBook/Ishmael/CharDet/GB2312.pm
Criterion Covered Total %
statement 39 42 92.8
branch 12 14 85.7
condition 5 9 55.5
subroutine 8 9 88.8
pod 0 5 0.0
total 64 79 81.0


line stmt bran cond sub pod time code
1             package EBook::Ishmael::CharDet::GB2312;
2 18     18   326 use 5.016;
  18         69  
3             our $VERSION = '2.03';
4 18     18   115 use strict;
  18         39  
  18         575  
5 18     18   154 use warnings;
  18         37  
  18         1111  
6              
7 18     18   104 use EBook::Ishmael::CharDet::Constants qw(:CONSTANTS);
  18         40  
  18         16383  
8              
9             # Generated from contrib/cjk2encodings.pl
10             my %GB2312_FREQS = map { $_ => 1 } (
11             41379,53947,45755,51403,49611,46532,54958,41400,41401,54224,51911,52946,46528,
12             54490,55251,52219,46323,53938,51663,51365,50403,46531,54992,53186,48338,53972,
13             46840,46525,46283,50916,51413,46846,51706,46070,51453,55032,41378,48845,51431,
14             50375,52460,54230,51906,54954,53930,53409,55252,51886,49359,54751,54459,47531,
15             49097,49076,52217,51387,47822,53444,45265,51120,48088,48625,47811,50410,54971,
16             41398,41399,53456,51645,52725,57825,50150,53969,60598,50396,48624,50928,53216,
17             54977,46552,53192,54781,52164,51693,46816,55031,50606,51926,50171,45539,55263,
18             53716,50167,55287,47577,52932,48592,46780,52652,52706,52160,47090,53984,52965,
19             52395,47037,49391,46536,52916,52729,46322,51697,54257,47562,45271,54002,54466,
20             46025,48893,53986,48040,53227,45502,51701,52983,47821,50379,54211,47062,45528,
21             52142,45273,47327,55201,51444,55286,52923,53971,45560,48308,45234,51903,46556,
22             55288,49847,51897,51435,54758,51887,47288,49114,50172,51154,47047,51673,50371,
23             53462,55254,47039,51173,45765,51664,45259,48838,49390,46036,53417,50383,53945,
24             54489,48596,45499,47334,46760,53414,48588,51904,48314,51177,54470,53445,49657,
25             50911,48837,47862,46554,52964,50415,54234,47016,47314,47351,46530,49570,46507,
26             52204,53723,52173,50360,47597,51168,47859,51361,49389,52648,50877,54442,46286,
27             47278,46023,53170,45291,54517,53954,53234,47045,48304,55005,52419,46521,53461,
28             51919,48328,51111,52961,47586,48610,48557,55004,46839,51187,54445,54267,54520,
29             54503,52975,47779,45987,55000,46266,54961,45489,46518,51371,53219,49646,47861,
30             51627,46252,45277,52731,49135,47611,49574,48114,49101,52422,45806,53467,53193,
31             52221,51932,47354,48611,48825,51927,53442,47557,54217,48034,45999,45475,55267,
32             54178,54262,46842,48851,50367,48097,55279,41395,41394,50669,49622,47823,48571,
33             48809,49652,50086,48112,52166,46333,47335,54225,49655,55479,52174,48877,48835,
34             46012,45729,48305,49892,47356,49132,54251,47555,50355,51895,50910,46069,51890,
35             48595,55258,48047,54974,47827,51928,46570,47524,50626,54458,52156,51879,52717,
36             50161,47088,54741,50112,49109,50366,47580,51172,51654,48636,51925,47102,49332,
37             47267,48840,53484,54968,50346,53460,51694,47858,47028,54240,50111,49353,52906,
38             45237,47086,49120,53458,48868,46330,51891,52714,49103,55283,52158,51421,45756,
39             47860,47574,41402,54703,45500,52195,
40             );
41              
42             # https://www-archive.mozilla.org/projects/intl/universalcharsetdetection
43             my $DIST_RATIO = 0.75;
44              
45             sub new {
46              
47 67     67 0 181 my ($class) = @_;
48              
49 67         408 my $self = {
50             Code => 0,
51             Left => 0,
52             Freqs => 0,
53             MBs => 0,
54             Total => 0,
55             Bad => 0,
56             };
57              
58 67         294 return bless $self, $class;
59              
60             }
61              
62             sub take {
63              
64 4434     4434 0 8341 my ($self, $bytes) = @_;
65              
66 4434 50       10115 return TAKE_BAD if $self->{Bad};
67              
68 4434         9820 for my $i (0 .. length($bytes) - 1) {
69 70492         107712 my $b = ord(substr $bytes, $i, 1) & 0xff;
70 70492 100 66     175702 if ($self->{Left}) {
    100          
    100          
71 15629 100 66     39242 if ($b >= 0xa1 && $b <= 0xfe) {
72 15584         24985 $self->{Code} = ($self->{Code} << 8) | $b;
73 15584         20046 $self->{Left}--;
74 15584 100       32487 if (exists $GB2312_FREQS{ $self->{Code} }) {
75 4265         6460 $self->{Freqs}++;
76             }
77 15584         20085 $self->{MBs}++;
78 15584         19659 $self->{Total}++;
79 15584         23852 $self->{Code} = 0;
80             } else {
81 45         92 $self->{Bad} = 1;
82 45         140 return TAKE_BAD;
83             }
84             } elsif ($b >= 0xa1 && $b <= 0xfe) {
85 15631         21629 $self->{Code} = $b;
86 15631         22877 $self->{Left} = 1;
87             } elsif ($b & 0x80) {
88 9         23 $self->{Bad} = 1;
89 9         29 return TAKE_BAD;
90             } else {
91 39223         64464 $self->{Total}++;
92             }
93             }
94              
95 4380         10680 return TAKE_OK;
96              
97             }
98              
99             sub confidence {
100              
101 5     5 0 11 my ($self) = @_;
102              
103 5 50 33     38 if ($self->{Bad} or $self->{MBs} == 0) {
104 0         0 return 0;
105             }
106              
107 5         83 return $self->{Freqs} / $self->{MBs};
108              
109             }
110              
111             sub bad {
112              
113 0     0 0 0 my ($self) = @_;
114              
115 0         0 return $self->{Bad};
116              
117             }
118              
119 5     5 0 21 sub encoding { 'GB2312' }
120              
121             1;
122