line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
# $File: //member/autrijus/Lingua-ZH-HanDetect/HanDetect.pm $ $Author: autrijus $ |
2
|
|
|
|
|
|
|
# $Revision: #4 $ $Change: 6772 $ $DateTime: 2003/06/27 04:42:27 $ |
3
|
|
|
|
|
|
|
|
4
|
|
|
|
|
|
|
package Lingua::ZH::HanDetect; |
5
|
|
|
|
|
|
|
$Lingua::ZH::HanDetect::VERSION = '0.04'; |
6
|
|
|
|
|
|
|
|
7
|
1
|
|
|
1
|
|
4044
|
use bytes; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
10
|
|
8
|
1
|
|
|
1
|
|
40
|
use strict; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
40
|
|
9
|
1
|
|
|
1
|
|
5
|
use vars qw($VERSION @ISA @EXPORT $columns $overflow); |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
84
|
|
10
|
|
|
|
|
|
|
|
11
|
1
|
|
|
1
|
|
5
|
use Exporter; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
13958
|
|
12
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
=head1 NAME |
14
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
Lingua::ZH::HanDetect - Guess Chinese text's variant and encoding |
16
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
=head1 VERSION |
18
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
This document describes version 0.04 of Lingua::ZH::HanDetect, released |
20
|
|
|
|
|
|
|
June 27, 2003. |
21
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
=head1 SYNOPSIS |
23
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
use Lingua::ZH::HanDetect; |
25
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
# $encoding is 'big5-hkscs', 'big5', 'gbk', 'euc-cn', 'utf8' or '' |
27
|
|
|
|
|
|
|
# $variant is 'traditional', 'simplified' or '' |
28
|
|
|
|
|
|
|
my ($encoding, $variant) = han_detect($some_chinese_text); |
29
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
=head1 DESCRIPTION |
31
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
B uses statistical measures to test a text |
33
|
|
|
|
|
|
|
string to see if it's in Traditional or Simplified Chinese, as well |
34
|
|
|
|
|
|
|
as which encoding it is in. |
35
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
If the string does not contain Chinese characters, both the encoding |
37
|
|
|
|
|
|
|
and variant values will be set to the empty string. |
38
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
This module is needed because the various encodings for Chinese text |
40
|
|
|
|
|
|
|
tend to occupy the similar byte ranges, rendering C |
41
|
|
|
|
|
|
|
ineffective. |
42
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
=cut |
44
|
|
|
|
|
|
|
|
45
|
|
|
|
|
|
|
@ISA = qw(Exporter); |
46
|
|
|
|
|
|
|
@EXPORT = qw(han_detect); |
47
|
|
|
|
|
|
|
my (%rev, %map); |
48
|
|
|
|
|
|
|
|
49
|
|
|
|
|
|
|
sub han_detect { |
50
|
5
|
|
|
5
|
0
|
202
|
my $text = shift; |
51
|
5
|
|
|
|
|
6
|
my %count; |
52
|
|
|
|
|
|
|
|
53
|
5
|
|
|
|
|
19
|
while (my ($k, $v) = each %rev) { |
54
|
1800
|
100
|
|
|
|
5377
|
next unless index($text, $k) > -1; |
55
|
22
|
|
|
|
|
119
|
$count{$_}++ for keys %$v; |
56
|
|
|
|
|
|
|
} |
57
|
|
|
|
|
|
|
|
58
|
5
|
|
100
|
|
|
19
|
my $trad = delete($count{trad}) || 0; |
59
|
5
|
|
100
|
|
|
16
|
my $simp = delete($count{simp}) || 0; |
60
|
5
|
|
100
|
|
|
18
|
my $encoding = (sort { $count{$b} <=> $count{$a} } keys %count)[0] || ''; |
61
|
|
|
|
|
|
|
|
62
|
5
|
100
|
|
|
|
18
|
return $encoding unless wantarray; |
63
|
3
|
100
|
|
|
|
23
|
return($encoding, ($encoding ? (($trad < $simp) ? 'simplified' : 'traditional') : '')); |
|
|
100
|
|
|
|
|
|
64
|
|
|
|
|
|
|
} |
65
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
1; |
67
|
|
|
|
|
|
|
|
68
|
|
|
|
|
|
|
# data section -- no user-servicable parts inside. {{{ |
69
|
|
|
|
|
|
|
%map = ( |
70
|
|
|
|
|
|
|
big5_trad => [qw( |
71
|
|
|
|
|
|
|
眖 厩 蔼 猭 常 戳 瓣 筿 秏 ず 摸 弧 狶 ゅ 琵 |
72
|
|
|
|
|
|
|
阿 丁 穨 魁 朝 莱 じ 隔 ノ 碞 ㄤ 硂 パ 单 ㄓ |
73
|
|
|
|
|
|
|
セ 璶 る ら 跋 叫 穦 盢 城 ぃ 腹 订 癸 τ 穝 |
74
|
|
|
|
|
|
|
┮ ㎝ 眤 材 玡 ┪ い Τ и 琌 呼 籔 の ぇ |
75
|
|
|
|
|
|
|
)], |
76
|
|
|
|
|
|
|
gbk_simp => [qw( |
77
|
|
|
|
|
|
|
版 从 学 高 科 法 表 都 期 多 国 电 乡 如 已 内 四 类 说 此 林 至 文 让 能 |
78
|
|
|
|
|
|
|
陕 间 业 录 主 陈 应 并 地 元 路 用 就 但 二 到 其 这 後 由 等 来 他 三 可 |
79
|
|
|
|
|
|
|
本 名 要 页 小 者 站 月 於 日 区 请 会 将 杰 不 时 也 号 隆 你 对 而 大 新 |
80
|
|
|
|
|
|
|
所 和 您 下 年 第 人 前 或 了 以 为 中 有 我 上 一 是 网 回 与 在 及 之 的 |
81
|
|
|
|
|
|
|
)], |
82
|
|
|
|
|
|
|
gbk_trad => [qw( |
83
|
|
|
|
|
|
|
版 從 學 高 科 法 表 都 期 多 國 電 鄉 如 已 內 四 類 說 此 林 至 文 讓 能 |
84
|
|
|
|
|
|
|
陝 間 業 錄 主 陳 應 並 地 元 路 用 就 但 二 到 其 這 後 由 等 來 他 三 可 |
85
|
|
|
|
|
|
|
本 名 要 頁 小 者 站 月 於 日 區 請 會 將 傑 不 時 也 號 隆 你 對 而 大 新 |
86
|
|
|
|
|
|
|
所 和 您 下 年 第 人 前 或 了 以 為 中 有 我 上 一 是 網 回 與 在 及 之 的 |
87
|
|
|
|
|
|
|
)], |
88
|
|
|
|
|
|
|
utf8_trad => [qw( |
89
|
|
|
|
|
|
|
鐗 寰 瀛 楂 绉 娉 琛 閮 鏈 澶 鍦 闆 閯 濡 宸 鍏 鍥 椤 瑾 姝 鏋 鑷 鏂 璁 鑳 |
90
|
|
|
|
|
|
|
闄 闁 妤 閷 涓 闄 鎳 涓 鍦 鍏 璺 鐢 灏 浣 浜 鍒 鍏 閫 寰 鐢 绛 渚 浠 涓 鍙 |
91
|
|
|
|
|
|
|
鏈 鍚 瑕 闋 灏 鑰 绔 鏈 鏂 鏃 鍗 璜 鏈 灏 鍌 涓 鏅 涔 铏 闅 浣 灏 鑰 澶 鏂 |
92
|
|
|
|
|
|
|
鎵 鍜 鎮 涓 骞 绗 浜 鍓 鎴 浜 浠 鐐 涓 鏈 鎴 涓 涓 鏄 缍 鍥 鑸 鍦 鍙 涔 鐨 |
93
|
|
|
|
|
|
|
)], |
94
|
|
|
|
|
|
|
utf8_simp => [qw( |
95
|
|
|
|
|
|
|
鐗 浠 瀛 楂 绉 娉 琛 閮 鏈 澶 鍥 鐢 涔 濡 宸 鍐 鍥 绫 璇 姝 鏋 鑷 鏂 璁 鑳 |
96
|
|
|
|
|
|
|
闄 闂 涓 褰 涓 闄 搴 骞 鍦 鍏 璺 鐢 灏 浣 浜 鍒 鍏 杩 寰 鐢 绛 鏉 浠 涓 鍙 |
97
|
|
|
|
|
|
|
鏈 鍚 瑕 椤 灏 鑰 绔 鏈 鏂 鏃 鍖 璇 浼 灏 鏉 涓 鏃 涔 鍙 闅 浣 瀵 鑰 澶 鏂 |
98
|
|
|
|
|
|
|
鎵 鍜 鎮 涓 骞 绗 浜 鍓 鎴 浜 浠 涓 涓 鏈 鎴 涓 涓 鏄 缃 鍥 涓 鍦 鍙 涔 鐨 |
99
|
|
|
|
|
|
|
)], |
100
|
|
|
|
|
|
|
|
101
|
|
|
|
|
|
|
); |
102
|
|
|
|
|
|
|
|
103
|
|
|
|
|
|
|
while (my ($k, $v) = each %map) { |
104
|
|
|
|
|
|
|
my @k = split(/_/, $k); |
105
|
|
|
|
|
|
|
foreach my $c (@{$v}) { |
106
|
|
|
|
|
|
|
$rev{$c}{$_} = 1 for @k; |
107
|
|
|
|
|
|
|
} |
108
|
|
|
|
|
|
|
} |
109
|
|
|
|
|
|
|
|
110
|
|
|
|
|
|
|
# }}} |
111
|
|
|
|
|
|
|
|
112
|
|
|
|
|
|
|
=head1 SEE ALSO |
113
|
|
|
|
|
|
|
|
114
|
|
|
|
|
|
|
L |
115
|
|
|
|
|
|
|
|
116
|
|
|
|
|
|
|
=head1 AUTHORS |
117
|
|
|
|
|
|
|
|
118
|
|
|
|
|
|
|
Autrijus Tang Eautrijus@autrijus.orgE |
119
|
|
|
|
|
|
|
|
120
|
|
|
|
|
|
|
=head1 COPYRIGHT |
121
|
|
|
|
|
|
|
|
122
|
|
|
|
|
|
|
Copyright 2003 by Autrijus Tang Eautrijus@autrijus.orgE. |
123
|
|
|
|
|
|
|
|
124
|
|
|
|
|
|
|
This program is free software; you can redistribute it and/or modify it |
125
|
|
|
|
|
|
|
under the same terms as Perl itself. |
126
|
|
|
|
|
|
|
|
127
|
|
|
|
|
|
|
See L |
128
|
|
|
|
|
|
|
|
129
|
|
|
|
|
|
|
=cut |
130
|
|
|
|
|
|
|
|
131
|
|
|
|
|
|
|
1; |