line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
# Package Lingua::DetectCharset |
2
|
|
|
|
|
|
|
# Version 1.02 |
3
|
|
|
|
|
|
|
# Part of "WWW Cyrillic Encoding Suite" |
4
|
|
|
|
|
|
|
# Get docs and newest version from |
5
|
|
|
|
|
|
|
# http://www.neystadt.org/cyrillic/ |
6
|
|
|
|
|
|
|
# |
7
|
|
|
|
|
|
|
# Copyright (c) 1997-98, John Neystadt |
8
|
|
|
|
|
|
|
# You may install this script on your web site for free |
9
|
|
|
|
|
|
|
# To obtain permision for redistribution or any other usage |
10
|
|
|
|
|
|
|
# contact john@neystadt.org. |
11
|
|
|
|
|
|
|
# |
12
|
|
|
|
|
|
|
# Portions copyright by |
13
|
|
|
|
|
|
|
# |
14
|
|
|
|
|
|
|
# Drop me a line if you deploy this script on tyour site. |
15
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
package Lingua::DetectCharset; |
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
$VERSION = "1.02"; |
19
|
|
|
|
|
|
|
|
20
|
1
|
|
|
1
|
|
1345
|
use Convert::Cyrillic; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
28
|
|
21
|
1
|
|
|
1
|
|
3590
|
use Lingua::DetectCharset::StatKoi; |
|
1
|
|
|
|
|
83
|
|
|
1
|
|
|
|
|
57
|
|
22
|
1
|
|
|
1
|
|
5933
|
use Lingua::DetectCharset::StatWin; |
|
1
|
|
|
|
|
5
|
|
|
1
|
|
|
|
|
69
|
|
23
|
1
|
|
|
1
|
|
4129
|
use Lingua::DetectCharset::StatUtf8; |
|
1
|
|
|
|
|
5
|
|
|
1
|
|
|
|
|
672
|
|
24
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
$PairSize = 2; |
26
|
|
|
|
|
|
|
$MinRatio = 1.5; # Mark must be in $MinRatio times larger of |
27
|
|
|
|
|
|
|
# one encoding than another to decide upon, or ENG. |
28
|
|
|
|
|
|
|
$DoubtRatio = 1; |
29
|
|
|
|
|
|
|
$DoubtLog = 'DetectCharsetDoubt.txt'; |
30
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
sub Detect { |
32
|
1
|
|
|
1
|
0
|
6
|
my (@Data) = @_; |
33
|
1
|
|
|
|
|
6
|
my ($KoiMark) = GetCodeScore ('Koi', @Data); |
34
|
1
|
|
|
|
|
6
|
my ($WinMark) = GetCodeScore ('Win', @Data); |
35
|
1
|
|
|
|
|
3
|
my ($Utf8Mark) = GetCodeScore ('Utf8', @Data); |
36
|
|
|
|
|
|
|
|
37
|
|
|
|
|
|
|
# print STDERR "GetEncoding: Koi8 - $KoiMark, Win - $WinMark, Utf8 - $Utf8Mark\n"; |
38
|
|
|
|
|
|
|
|
39
|
1
|
|
|
|
|
4
|
$KoiRatio = $KoiMark/($WinMark+$Utf8Mark+1); |
40
|
1
|
|
|
|
|
17
|
$WinRatio = $WinMark/($KoiMark+$Utf8Mark+1); |
41
|
1
|
|
|
|
|
3
|
$Utf8Ratio = $Utf8Mark/($KoiMark+$WinMark+1); |
42
|
|
|
|
|
|
|
|
43
|
1
|
50
|
|
|
|
4
|
if ($DoubtLog) { |
44
|
1
|
50
|
33
|
|
|
21
|
if (($KoiRatio < $MinRatio && $KoiRatio > $DoubtRatio) || |
|
|
|
33
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
|
33
|
|
|
|
|
45
|
|
|
|
|
|
|
($WinRatio < $MinRatio && $WinRatio > $DoubtRatio) || |
46
|
|
|
|
|
|
|
($Utf8Ratio < $MinRatio && $Utf8Ratio > $DoubtRatio)) { |
47
|
0
|
|
|
|
|
0
|
open Log, ">>$DoubtLog"; |
48
|
0
|
|
|
|
|
0
|
print Log " Koi8 - $KoiMark, Win - $WinMark, Utf8 - $Utf8Mark\n", |
49
|
|
|
|
|
|
|
join ("\n", @Data), "\n\n"; |
50
|
0
|
|
|
|
|
0
|
close Log; |
51
|
|
|
|
|
|
|
} |
52
|
|
|
|
|
|
|
} |
53
|
|
|
|
|
|
|
|
54
|
1
|
50
|
33
|
|
|
5
|
return 'KOI8' if $KoiRatio > $WinRatio && $KoiRatio > $Utf8Ratio; # $MinRatio; |
55
|
1
|
50
|
|
|
|
11
|
return 'WIN' if $WinRatio > $Utf8Ratio; |
56
|
|
|
|
|
|
|
|
57
|
|
|
|
|
|
|
# We do english, only if no single cyrillic character were detected |
58
|
0
|
0
|
|
|
|
0
|
return 'UTF8' if $WinRatio + $KoiRatio + $Utf8Ratio > 0; |
59
|
0
|
|
|
|
|
0
|
return 'ENG'; |
60
|
|
|
|
|
|
|
} |
61
|
|
|
|
|
|
|
|
62
|
|
|
|
|
|
|
sub GetCodeScore { |
63
|
3
|
|
|
3
|
0
|
7
|
my ($Code, @Data) = @_; |
64
|
3
|
|
|
|
|
4
|
my ($Table); |
65
|
|
|
|
|
|
|
|
66
|
3
|
100
|
|
|
|
13
|
if ($Code eq 'Koi') { |
|
|
100
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
67
|
1
|
|
|
|
|
3
|
$Table = \%Lingua::DetectCharset::StatKoi::StatsTableKoi; |
68
|
|
|
|
|
|
|
} elsif ($Code eq 'Win') { |
69
|
1
|
|
|
|
|
2
|
$Table = \%Lingua::DetectCharset::StatWin::StatsTableWin; |
70
|
|
|
|
|
|
|
} elsif ($Code eq 'Utf8') { |
71
|
1
|
|
|
|
|
2
|
$Table = \%Lingua::DetectCharset::StatUtf8::StatsTableUtf8Long; |
72
|
|
|
|
|
|
|
} else { |
73
|
0
|
|
|
|
|
0
|
die "Don't know $Code!\n"; |
74
|
|
|
|
|
|
|
} |
75
|
|
|
|
|
|
|
|
76
|
3
|
100
|
|
|
|
10
|
$PairSize = 4 if $Code eq 'Utf8'; |
77
|
3
|
100
|
|
|
|
7
|
$PairSize = 2 if $Code ne 'Utf8'; |
78
|
|
|
|
|
|
|
|
79
|
3
|
|
|
|
|
4
|
my ($Mark, $i); |
80
|
3
|
|
|
|
|
6
|
for (@Data) { |
81
|
3
|
|
|
|
|
8
|
s/[\n\r]//go; |
82
|
3
|
|
|
|
|
10
|
$_ = Convert::Cyrillic::toLower ($_, $Code); |
83
|
3
|
|
|
|
|
18
|
for (split (/[\.\,\-\s\:\;\?\!\'\"\(\)\d<>]+/o)) { |
84
|
12
|
|
|
|
|
24
|
for $i (0..length ()-$PairSize) { |
85
|
19
|
|
|
|
|
17
|
$Mark += ${$Table} {substr ($_, $i, $PairSize)}; |
|
19
|
|
|
|
|
54
|
|
86
|
|
|
|
|
|
|
} |
87
|
|
|
|
|
|
|
} |
88
|
|
|
|
|
|
|
} |
89
|
|
|
|
|
|
|
|
90
|
3
|
|
|
|
|
10
|
$Mark; |
91
|
|
|
|
|
|
|
} |
92
|
|
|
|
|
|
|
1; |
93
|
|
|
|
|
|
|
|
94
|
|
|
|
|
|
|
__END__ |