| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package EBook::Ishmael::CharDet::CP1255; |
|
2
|
18
|
|
|
18
|
|
397
|
use 5.016; |
|
|
18
|
|
|
|
|
67
|
|
|
3
|
|
|
|
|
|
|
our $VERSION = '2.03'; |
|
4
|
18
|
|
|
18
|
|
115
|
use strict; |
|
|
18
|
|
|
|
|
34
|
|
|
|
18
|
|
|
|
|
516
|
|
|
5
|
18
|
|
|
18
|
|
118
|
use warnings; |
|
|
18
|
|
|
|
|
37
|
|
|
|
18
|
|
|
|
|
1066
|
|
|
6
|
|
|
|
|
|
|
|
|
7
|
18
|
|
|
18
|
|
103
|
use parent 'EBook::Ishmael::CharDet::CP'; |
|
|
18
|
|
|
|
|
31
|
|
|
|
18
|
|
|
|
|
115
|
|
|
8
|
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
# Generated from contrib/512-bigram.pl from various Hebrew texts from |
|
10
|
|
|
|
|
|
|
# Project Gutenberg. |
|
11
|
|
|
|
|
|
|
my %CP1255_FREQS = map { $_ => 1 } ( |
|
12
|
|
|
|
|
|
|
"\xe5\xfa","\xe9\xed","\xf0\xe9","\xec\xe9","\xe9\xe5","\xe4\xe9","\xe0\xe5", |
|
13
|
|
|
|
|
|
|
"\xe5\xec","\xfa\xe9","\xf8\xe9","\xec\xe4","\xe1\xe9","\xe0\xfa","\xe5\xe0", |
|
14
|
|
|
|
|
|
|
"\xe5\xf8","\xec\xe0","\xf8\xe5","\xf0\xe5","\xf2\xec","\xf9\xec","\xee\xe5", |
|
15
|
|
|
|
|
|
|
"\xeb\xec","\xec\xe5","\xe0\xe9","\xee\xe9","\xe4\xe5","\xe5\xe1","\xee\xe4", |
|
16
|
|
|
|
|
|
|
"\xe9\xfa","\xe9\xe4","\xf9\xe9","\xee\xf8","\xe1\xf8","\xe1\xe5","\xe7\xe5", |
|
17
|
|
|
|
|
|
|
"\xf2\xe5","\xe9\xe9","\xf9\xe5","\xe0\xf0","\xe0\xec","\xe5\xe3","\xe9\xf8", |
|
18
|
|
|
|
|
|
|
"\xe5\xee","\xe6\xe4","\xf9\xe4","\xe5\xe4","\xe9\xf0","\xe0\xe7","\xfa\xe5", |
|
19
|
|
|
|
|
|
|
"\xe3\xe9","\xe9\xf9","\xe0\xee","\xe4\xee","\xe9\xec","\xf8\xe0","\xe4\x2e", |
|
20
|
|
|
|
|
|
|
"\xe9\xef","\xe4\xe0","\xf0\xe4","\xe5\xf0","\xee\xf9","\xeb\xe5","\xe1\xe4", |
|
21
|
|
|
|
|
|
|
"\xe1\xec","\xf7\xe5","\xe5\xf9","\xe9\xe3","\xe3\xe5","\xf9\xe1","\xf2\xe9", |
|
22
|
|
|
|
|
|
|
"\xf8\xe4","\xeb\xe9","\xfa\xe4","\xee\xfa","\xe0\xf9","\xe7\xe9","\xe9\xe0", |
|
23
|
|
|
|
|
|
|
"\xf9\xf8","\xe4\xf9","\xe1\xf2","\xf9\xee","\xee\xf2","\xe5\xeb","\xe5\xe9", |
|
24
|
|
|
|
|
|
|
"\xf9\xe0","\xe4\x2c","\xf4\xe9","\xed\x2e","\xe4\xf8","\x74\x68","\xe5\xef", |
|
25
|
|
|
|
|
|
|
"\xf8\xe1","\xe7\xf8","\xe4\xfa","\xe5\xed","\xe4\xf2","\xf4\xe5","\xe4\xed", |
|
26
|
|
|
|
|
|
|
"\xe5\xf2","\xe0\xe1","\xf7\xf8","\xe1\xe0","\xf4\xf8","\xe5\xe7","\xf2\xf8", |
|
27
|
|
|
|
|
|
|
"\xe4\xe7","\x74\x65","\xe1\xfa","\xe9\xe1","\xed\x2c","\xe9\xf2","\xec\xee", |
|
28
|
|
|
|
|
|
|
"\xee\xe0","\x65\x72","\xfa\xf8","\xe7\xe3","\xe4\xe1","\xfa\x2e","\xf7\xe9", |
|
29
|
|
|
|
|
|
|
"\xe0\xf8","\xf8\xfa","\xec\xe1","\x6f\x72","\xe5\xf7","\xe9\xeb","\x68\x65", |
|
30
|
|
|
|
|
|
|
"\xe4\xf0","\xf6\xe9","\xf1\xe5","\xe0\xe4","\xe3\xf8","\xeb\xee","\xe1\xee", |
|
31
|
|
|
|
|
|
|
"\x6f\x6e","\xeb\xe1","\xe5\xf4","\xec\xfa","\xf9\xf0","\x69\x6e","\xf1\xe9", |
|
32
|
|
|
|
|
|
|
"\xe1\xf9","\xec\xeb","\xee\xe3","\xe9\xf4","\xf2\xfa","\xe4\xe6","\xe9\xee", |
|
33
|
|
|
|
|
|
|
"\xf9\xfa","\x22\xe0","\xe8\xe5","\xe7\xfa","\xee\xf0","\xf2\xe3","\xee\xec", |
|
34
|
|
|
|
|
|
|
"\x2e\x22","\xf6\xe5","\xfa\x2c","\xe4\xe2","\xe4\xeb","\xf4\xf0","\xe9\xe7", |
|
35
|
|
|
|
|
|
|
"\x65\x6e","\xe9\xf7","\xe3\xe1","\xe7\xec","\xee\xe7","\xeb\xe4","\xe5\x2e", |
|
36
|
|
|
|
|
|
|
"\xe4\xec","\xe9\x2e","\xfa\xe7","\xe1\xeb","\xeb\xf9","\xe2\xe5","\xe5\xf6", |
|
37
|
|
|
|
|
|
|
"\xe0\xe3","\xe2\xe9","\x72\x65","\x72\x6f","\xe4\xf7","\xe4\xf4","\xe9\x2c", |
|
38
|
|
|
|
|
|
|
"\xec\xf2","\xe8\xe9","\xe2\xe3","\x61\x74","\xf2\xf9","\xec\xf9","\xf4\xe4", |
|
39
|
|
|
|
|
|
|
"\xec\xe7","\xf2\xed","\xee\xf6","\xec\xea","\x22\x2c","\xee\xf7","\xf9\xf2", |
|
40
|
|
|
|
|
|
|
"\xf8\x2e","\x65\x63","\xf8\xf7","\xe6\xe5","\xe2\xec","\xeb\xfa","\xf2\xee", |
|
41
|
|
|
|
|
|
|
"\xe1\xe7","\xee\xe1","\xf8\xe2","\x61\x6e","\xf2\xf0","\xe1\xe3","\xe9\xea", |
|
42
|
|
|
|
|
|
|
"\xf4\xfa","\xe5\xf1","\xe0\xed","\xe1\xf0","\x69\x74","\xf9\xed","\xf6\xf8", |
|
43
|
|
|
|
|
|
|
"\x63\x74","\x74\x69","\xeb\xef","\xe4\xe3","\x75\x74","\xef\x2e","\xe4\xf6", |
|
44
|
|
|
|
|
|
|
"\xf2\xf6","\xe3\xe4","\xec\xf4","\xe3\xf2","\xe1\xf7","\xf0\xf9","\xf8\xe7", |
|
45
|
|
|
|
|
|
|
"\xee\xee","\xfa\xf0","\xee\xef","\xf2\xe4","\xe5\xe2","\xe5\x2c","\x6f\x75", |
|
46
|
|
|
|
|
|
|
"\xe4\xf1","\xe6\xf8","\xec\xed","\x65\x64","\xeb\xea","\xf2\xe1","\xec\xec", |
|
47
|
|
|
|
|
|
|
"\xe5\xe5","\xf6\xe4","\xf6\xe0","\xeb\xf8","\xe9\xe8","\xe9\xf6","\xf0\xe7", |
|
48
|
|
|
|
|
|
|
"\x69\x73","\xec\xe3","\xec\xf0","\xf9\xeb","\xee\xeb","\xf0\xf2","\x6e\x64", |
|
49
|
|
|
|
|
|
|
"\xe0\xe6","\xf8\xf2","\xef\x2c","\xf1\xf4","\x62\x65","\xf0\xfa","\xf4\xf2", |
|
50
|
|
|
|
|
|
|
"\x6f\x66","\xee\xf1","\xeb\xf0","\x69\x6f","\xf4\xf9","\xf4\xec","\xe7\xf9", |
|
51
|
|
|
|
|
|
|
"\x61\x72","\xec\xf7","\xe3\xee","\xfa\xf4","\xf2\xeb","\xf8\x2c","\xee\xf4", |
|
52
|
|
|
|
|
|
|
"\xfa\xed","\x21\x22","\xf9\xf4","\xe5\xe8","\xe7\xf0","\xe5\xea","\xeb\xe0", |
|
53
|
|
|
|
|
|
|
"\xe5\xe6","\xe0\xf4","\xf0\xf4","\xe2\xed","\xf6\xee","\x63\x6f","\xe6\xe9", |
|
54
|
|
|
|
|
|
|
"\x72\x67","\x65\x73","\xe9\xf1","\xf0\xe0","\xe7\xe4","\x22\xe4","\xf7\xe8", |
|
55
|
|
|
|
|
|
|
"\xe7\xe6","\xf7\xe4","\xf8\xf9","\xe1\xe8","\xec\xe2","\xe5\xf3","\xee\xe6", |
|
56
|
|
|
|
|
|
|
"\xe7\xee","\xfa\xee","\xe2\xe1","\xe3\xec","\xfa\xf7","\xf6\xec","\xe1\xe1", |
|
57
|
|
|
|
|
|
|
"\x72\x6b","\x72\x69","\xfa\xe1","\x6e\x62","\xf9\xe7","\x73\x65","\x6e\x67", |
|
58
|
|
|
|
|
|
|
"\xf8\xe3","\xf9\xf7","\x73\x74","\xe4\xef","\xf7\xf9","\xf8\xea","\x64\x69", |
|
59
|
|
|
|
|
|
|
"\x50\x72","\x6e\x74","\x69\x63","\xe6\xee","\xfa\xe0","\xec\x2e","\xe3\xed", |
|
60
|
|
|
|
|
|
|
"\xf0\xf1","\xf8\xf6","\xfa\xf2","\xe8\xf8","\x6a\x65","\xf4\xf1","\x61\x6c", |
|
61
|
|
|
|
|
|
|
"\xe2\xf8","\x47\x75","\xf0\xf7","\x6f\x6a","\x74\x72","\xe7\xe1","\x63\x65", |
|
62
|
|
|
|
|
|
|
"\x77\x6f","\xf8\xeb","\xf8\xf5","\xf6\xf2","\xee\xe8","\x6c\x65","\xf1\xfa", |
|
63
|
|
|
|
|
|
|
"\x72\x61","\x74\x6f","\xe0\xeb","\xf4\xe8","\xf0\xf8","\xe4\x22","\xf1\xe1", |
|
64
|
|
|
|
|
|
|
"\xe8\xe4","\x68\x69","\xf6\xe1","\x64\x65","\xf4\xe7","\xf7\xf0","\x77\x69", |
|
65
|
|
|
|
|
|
|
"\xec\xf8","\xe1\xe2","\xe1\xf4","\xf0\xee","\xe6\xf7","\xe6\xe0","\xfa\xec", |
|
66
|
|
|
|
|
|
|
"\xe9\xe6","\xe3\xfa","\x74\x61","\xe2\xf2","\x66\x6f","\x6d\x61","\xec\xe8", |
|
67
|
|
|
|
|
|
|
"\xe0\xf1","\xf7\xe3","\x79\x6f","\xe3\xf7","\xf7\xec","\xf2\xe8","\xe1\xf1", |
|
68
|
|
|
|
|
|
|
"\xec\x2c","\xea\x2c","\xe7\xeb","\xf7\xf6","\x75\x6e","\xea\x2e","\xe8\xe1", |
|
69
|
|
|
|
|
|
|
"\x6f\x74","\xf0\xf0","\xfa\xeb","\xf7\xe1","\xe3\x2e","\xe1\xe6","\xe9\xe2", |
|
70
|
|
|
|
|
|
|
"\xe1\xef","\x6e\x73","\xe1\x2e","\xf0\xe2","\xe3\xf9","\xe4\xe8","\xe4\xe4", |
|
71
|
|
|
|
|
|
|
"\x65\x65","\xeb\xeb","\xe7\xf7","\xf0\xe1","\xfa\xef","\x76\x65","\xe3\xe0", |
|
72
|
|
|
|
|
|
|
"\x6e\x69","\xf0\xeb","\x6d\x65","\x6c\x6c","\x72\x6d","\xec\xf6","\x63\x68", |
|
73
|
|
|
|
|
|
|
"\xe2\xf9","\xee\xe2","\xf4\xf7","\xf1\xf8","\xf0\xe3","\xe1\xf6","\x70\x72", |
|
74
|
|
|
|
|
|
|
"\xf6\xf4","\xe1\x2c","\xe0\xf6","\xfa\xea","\xf2\xf5","\xeb\xe7","\xeb\xe3", |
|
75
|
|
|
|
|
|
|
"\x6f\x70","\xf1\xec","\x67\x99","\x6c\x61","\x65\x6d","\xf7\xef","\xeb\xed", |
|
76
|
|
|
|
|
|
|
"\xe4\x21","\x61\x67","\x6c\x69","\xe2\xee","\xf9\xe8","\xe5\xf5","\x22\xee", |
|
77
|
|
|
|
|
|
|
"\xf9\xf9","\x22\xec","\xe8\xec","\xe8\xf0","\xf8\xf4","\xe6\xeb","\xec\xf1", |
|
78
|
|
|
|
|
|
|
"\x75\x73","\xed\x22","\x65\x6c","\x68\x61","\x6f\x6d","\xf6\xe3","\xfa\xf9", |
|
79
|
|
|
|
|
|
|
"\xf7\xf4","\xe3\x2c","\xf8\xf0","\x22\xe6","\x73\x73","\xf6\xe7","\xe0\x2c", |
|
80
|
|
|
|
|
|
|
"\x72\x74","\x6e\x6f","\xf1\xee","\xe7\xf1","\xe0\xf3","\xf8\xe8","\xe8\xef", |
|
81
|
|
|
|
|
|
|
"\xeb\xf4","\x6e\x79","\xf6\xfa","\xf2\xf7","\x65\x61","\xf9\x2e","\xf1\xe3", |
|
82
|
|
|
|
|
|
|
"\xf8\xee","\x6b\x73","\x6e\x65","\x69\x76","\xe3\xf0","\x64\x61","\x70\x61", |
|
83
|
|
|
|
|
|
|
"\x63\x61","\x31\x2e","\xf1\xf7","\x70\x65","\xe0\xe2","\xf0\xed","\x67\x72", |
|
84
|
|
|
|
|
|
|
"\x2e\x97","\x61\x63","\x73\x69","\xf7\xfa","\xe7\xf6","\xf2\x2e","\xe9\x22", |
|
85
|
|
|
|
|
|
|
"\xe7\xf4", |
|
86
|
|
|
|
|
|
|
); |
|
87
|
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
my $DIST_RATIO = '0.91'; |
|
89
|
|
|
|
|
|
|
|
|
90
|
|
|
|
|
|
|
my %CHARSET_SPACE_SET = map { $_ => 1 } ( |
|
91
|
|
|
|
|
|
|
"\x81", # undef |
|
92
|
|
|
|
|
|
|
"\x8a", # ^ |
|
93
|
|
|
|
|
|
|
"\x8c", # ^ |
|
94
|
|
|
|
|
|
|
"\x8d", # ^ |
|
95
|
|
|
|
|
|
|
"\x8e", # ^ |
|
96
|
|
|
|
|
|
|
"\x8f", # ^ |
|
97
|
|
|
|
|
|
|
"\x90", # ^ |
|
98
|
|
|
|
|
|
|
"\x9a", # ^ |
|
99
|
|
|
|
|
|
|
"\x9c", # ^ |
|
100
|
|
|
|
|
|
|
"\x9d", # ^ |
|
101
|
|
|
|
|
|
|
"\x9e", # ^ |
|
102
|
|
|
|
|
|
|
"\x9f", # ^ |
|
103
|
|
|
|
|
|
|
"\xaa", # ^ |
|
104
|
|
|
|
|
|
|
"\xa0", # NBSP |
|
105
|
|
|
|
|
|
|
"\xd9", # undef |
|
106
|
|
|
|
|
|
|
"\xda", # ^ |
|
107
|
|
|
|
|
|
|
"\xdb", # ^ |
|
108
|
|
|
|
|
|
|
"\xdc", # ^ |
|
109
|
|
|
|
|
|
|
"\xdd", # ^ |
|
110
|
|
|
|
|
|
|
"\xde", # ^ |
|
111
|
|
|
|
|
|
|
"\xfb", # ^ |
|
112
|
|
|
|
|
|
|
"\xfc", # ^ |
|
113
|
|
|
|
|
|
|
"\xff", # undef |
|
114
|
|
|
|
|
|
|
); |
|
115
|
|
|
|
|
|
|
|
|
116
|
|
|
|
|
|
|
my %IGNORE = ( |
|
117
|
|
|
|
|
|
|
%EBook::Ishmael::CharDet::Constants::ASCII_SPACE_SET, |
|
118
|
|
|
|
|
|
|
%CHARSET_SPACE_SET, |
|
119
|
|
|
|
|
|
|
); |
|
120
|
|
|
|
|
|
|
|
|
121
|
|
|
|
|
|
|
sub ignore { |
|
122
|
|
|
|
|
|
|
|
|
123
|
548864
|
|
|
548864
|
0
|
899440
|
my ($self, $byte) = @_; |
|
124
|
|
|
|
|
|
|
|
|
125
|
548864
|
|
|
|
|
1437975
|
return exists $IGNORE{ $byte }; |
|
126
|
|
|
|
|
|
|
|
|
127
|
|
|
|
|
|
|
} |
|
128
|
|
|
|
|
|
|
|
|
129
|
|
|
|
|
|
|
sub freq_bigram { |
|
130
|
|
|
|
|
|
|
|
|
131
|
372763
|
|
|
372763
|
0
|
604484
|
my ($self, $bigram) = @_; |
|
132
|
|
|
|
|
|
|
|
|
133
|
372763
|
|
|
|
|
999347
|
return exists $CP1255_FREQS{ $bigram }; |
|
134
|
|
|
|
|
|
|
|
|
135
|
|
|
|
|
|
|
} |
|
136
|
|
|
|
|
|
|
|
|
137
|
0
|
|
|
0
|
0
|
0
|
sub dist_ratio { $DIST_RATIO }; |
|
138
|
|
|
|
|
|
|
|
|
139
|
67
|
|
|
67
|
0
|
188
|
sub encoding { 'CP1255' } |
|
140
|
|
|
|
|
|
|
|
|
141
|
|
|
|
|
|
|
1; |