line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package MILA::Transliterate;

# Strictures were missing from the original preamble; enabling them catches
# typos in variable names and accidental use of undefined values.
use strict;
use warnings;

# The mapping tables in this module contain literal Hebrew letters, so the
# source itself must be read as UTF-8.
use utf8;

use Exporter;

our @ISA = qw(Exporter);

# Nothing is exported by default; callers request the functions they need.
our @EXPORT_OK = qw(hebrew2treebank treebank2hebrew hebrew2erel erel2hebrew hebrew2fsma fsma2hebrew);

our $VERSION = 0.01;
7
|
|
|
|
|
|
|
=head1 NAME |
8
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
MILA::Transliterate - A Perl Module for transliterating text from Hebrew to various transliterations used in the Knowledge Center for Processing Hebrew (MILA) and vice versa |
10
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
=head1 SYNOPSIS |
12
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
use MILA::Transliterate qw(hebrew2treebank hebrew2erel hebrew2fsma); |
14
|
|
|
|
|
|
|
my $erel_transliterated = hebrew2erel($utf8_encoded_hebrew_text); |
15
|
|
|
|
|
|
|
my $treebank_transliterated = hebrew2treebank($utf8_encoded_hebrew_text); |
16
|
|
|
|
|
|
|
my $fsma_transliterated = hebrew2fsma($utf8_encoded_hebrew_text); |
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
# note that the reverse transliteration does NOT maintain final Hebrew letters! |
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
=head1 DESCRIPTION |
21
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
Before Unicode was widely used, applications that manipulated Hebrew text usually used some transliteration into ASCII characters instead of Hebrew letters. This was particularly true for software developed in academia. MILA is a nickname for the Knowledge Center for Processing Hebrew (see: http://mila.cs.technion.ac.il/). This knowledge center develops software and standards that result from research in natural language processing for Hebrew. As a result, some legacy software also needs to be maintained, and such legacy software usually used transliteration. |
23
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
This module contains mapping from UTF-8 encoded Hebrew to the various transliteration schemes that MILA needs to support and also contains the reversed mapping. |
25
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
=head1 FUNCTIONS |
27
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
=item $treebank_transliterated = hebrew2treebank( $utf8_encoded_hebrew_text ) |
29
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
This function maps UTF-8 encoded Hebrew text into the treebank transliteration. Every character not in the mapping is being copied as is without any conversion. |
31
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
=item $erel_transliterated = hebrew2erel( $utf8_encoded_hebrew_text ) |
33
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
This function maps UTF-8 encoded Hebrew text into the erel transliteration. Every character not in the mapping is being copied as is without any conversion. |
35
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
=item $fsma_transliterated = hebrew2fsma( $utf8_encoded_hebrew_text ) |
37
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
This function maps UTF-8 encoded Hebrew text into the fsma transliteration. Every character not in the mapping is being copied as is without any conversion. |
39
|
|
|
|
|
|
|
|
40
|
|
|
|
|
|
|
=item $utf8_encoded_hebrew_text = treebank2hebrew( $treebank_transliterated ) |
41
|
|
|
|
|
|
|
|
42
|
|
|
|
|
|
|
This function provides the reverse transliteration that is provided by hebrew2treebank(). Note that final letters are not preserved and are lost. |
43
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
=item $utf8_encoded_hebrew_text = erel2hebrew( $erel_transliterated ) |
45
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
This function provides the reverse transliteration that is provided by hebrew2erel(). Note that final letters are not preserved and are lost. |
47
|
|
|
|
|
|
|
|
48
|
|
|
|
|
|
|
=item $utf8_encoded_hebrew_text = fsma2hebrew( $fsma_transliterated ) |
49
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
This function provides the reverse transliteration that is provided by hebrew2fsma(). Note that final letters are not preserved and are lost. |
51
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
=head1 AUTHOR |
53
|
|
|
|
|
|
|
|
54
|
|
|
|
|
|
|
Shlomo Yona yona@cs.technion.ac.il http://cs.haifa.ac.il/~shlomo/ |
55
|
|
|
|
|
|
|
|
56
|
|
|
|
|
|
|
=head1 COPYRIGHT |
57
|
|
|
|
|
|
|
|
58
|
|
|
|
|
|
|
Copyright (c) 2004 Shlomo Yona. All rights reserved. |
59
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
This library is free software. |
61
|
|
|
|
|
|
|
You can redistribute it and/or modify it under the same terms as Perl itself. |
62
|
|
|
|
|
|
|
|
63
|
|
|
|
|
|
|
=head1 CVS INFO |
64
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
$Revision: 1.1 $ |
66
|
|
|
|
|
|
|
$Date: 2004/12/17 09:17:37 $ |
67
|
|
|
|
|
|
|
|
68
|
|
|
|
|
|
|
=cut |
69
|
|
|
|
|
|
|
|
70
|
|
|
|
|
|
|
# Hebrew letters mapped to the Treebank alphabet.
#
# Final (sofit) forms map to the same Latin letter as their regular forms,
# which is why the reverse table cannot restore them.  The double quote and
# percent sign are also part of this scheme ('"' => 'U', '%' => 'O').
#
# NOTE(review): because the file is compiled under `use utf8`, these keys are
# character strings, so input presumably has to be decoded text rather than
# raw UTF-8 bytes -- confirm against callers.
my %h2t = (
    'א' => 'A',
    'ב' => 'B',
    'ג' => 'G',
    'ד' => 'D',
    'ה' => 'H',
    'ו' => 'W',
    'ז' => 'Z',
    'ח' => 'X',
    'ט' => 'J',
    'י' => 'I',
    'ך' => 'K',
    'כ' => 'K',
    'ל' => 'L',
    'ם' => 'M',
    'מ' => 'M',
    'ן' => 'N',
    'נ' => 'N',
    'ס' => 'S',
    'ע' => 'E',
    'ף' => 'P',
    'פ' => 'P',
    'ץ' => 'C',
    'צ' => 'C',
    'ק' => 'Q',
    'ר' => 'R',
    'ש' => 'F',
    'ת' => 'T',
    '"' => 'U',
    '%' => 'O',
);
102
|
|
|
|
|
|
|
|
103
|
|
|
|
|
|
|
# Treebank alphabet mapped to Hebrew letters.
#
# Each Latin letter maps to the regular (non-final) Hebrew form only.  The
# original table listed 'M' twice (final mem, then regular mem); the first
# entry was dead because the later one overwrote it, so it has been dropped.
my %t2h = (
    'A' => 'א',
    'B' => 'ב',
    'G' => 'ג',
    'D' => 'ד',
    'H' => 'ה',
    'W' => 'ו',
    'Z' => 'ז',
    'X' => 'ח',
    'J' => 'ט',
    'I' => 'י',
    'K' => 'כ',
    'L' => 'ל',
    'M' => 'מ',
    'N' => 'נ',
    'S' => 'ס',
    'E' => 'ע',
    'P' => 'פ',
    'C' => 'צ',
    'Q' => 'ק',
    'R' => 'ר',
    'F' => 'ש',
    'T' => 'ת',
    'U' => '"',
    'O' => '%',
);
131
|
|
|
|
|
|
|
|
132
|
|
|
|
|
|
|
# Hebrew letters mapped to Erel's alphabet.
#
# This scheme differs from the Treebank one in three letters: tet => '@',
# ayin => '&' and shin => '$'.  Final (sofit) forms share the Latin letter of
# their regular forms.
#
# NOTE(review): because the file is compiled under `use utf8`, these keys are
# character strings, so input presumably has to be decoded text rather than
# raw UTF-8 bytes -- confirm against callers.
my %h2e = (
    'א' => 'A',
    'ב' => 'B',
    'ג' => 'G',
    'ד' => 'D',
    'ה' => 'H',
    'ו' => 'W',
    'ז' => 'Z',
    'ח' => 'X',
    'ט' => '@',
    'י' => 'I',
    'ך' => 'K',
    'כ' => 'K',
    'ל' => 'L',
    'ם' => 'M',
    'מ' => 'M',
    'ן' => 'N',
    'נ' => 'N',
    'ס' => 'S',
    'ע' => '&',
    'ף' => 'P',
    'פ' => 'P',
    'ץ' => 'C',
    'צ' => 'C',
    'ק' => 'Q',
    'ר' => 'R',
    'ש' => '$',
    'ת' => 'T',
);
162
|
|
|
|
|
|
|
|
163
|
|
|
|
|
|
|
# Erel's alphabet mapped to Hebrew letters.
#
# Each symbol maps to the regular (non-final) Hebrew form only.  The original
# table listed 'M' twice (final mem, then regular mem); the first entry was
# dead because the later one overwrote it, so it has been dropped.
my %e2h = (
    'A' => 'א',
    'B' => 'ב',
    'G' => 'ג',
    'D' => 'ד',
    'H' => 'ה',
    'W' => 'ו',
    'Z' => 'ז',
    'X' => 'ח',
    '@' => 'ט',
    'I' => 'י',
    'K' => 'כ',
    'L' => 'ל',
    'M' => 'מ',
    'N' => 'נ',
    'S' => 'ס',
    '&' => 'ע',
    'P' => 'פ',
    'C' => 'צ',
    'Q' => 'ק',
    'R' => 'ר',
    '$' => 'ש',
    'T' => 'ת',
);
189
|
|
|
|
|
|
|
|
190
|
|
|
|
|
|
|
# Hebrew letters mapped to FSMA's alphabet.
#
# FSMA uses lowercase Latin letters; notable choices are tet => 'v',
# ayin => 'y' and shin => 'e'.  Final (sofit) forms share the Latin letter of
# their regular forms.
#
# NOTE(review): because the file is compiled under `use utf8`, these keys are
# character strings, so input presumably has to be decoded text rather than
# raw UTF-8 bytes -- confirm against callers.
my %h2l = (
    'א' => 'a',
    'ב' => 'b',
    'ג' => 'g',
    'ד' => 'd',
    'ה' => 'h',
    'ו' => 'w',
    'ז' => 'z',
    'ח' => 'x',
    'ט' => 'v',
    'י' => 'i',
    'ך' => 'k',
    'כ' => 'k',
    'ל' => 'l',
    'ם' => 'm',
    'מ' => 'm',
    'ן' => 'n',
    'נ' => 'n',
    'ס' => 's',
    'ע' => 'y',
    'ף' => 'p',
    'פ' => 'p',
    'ץ' => 'c',
    'צ' => 'c',
    'ק' => 'q',
    'ר' => 'r',
    'ש' => 'e',
    'ת' => 't',
);
220
|
|
|
|
|
|
|
|
221
|
|
|
|
|
|
|
# FSMA's alphabet mapped to Hebrew letters.
#
# Each lowercase letter maps to the regular (non-final) Hebrew form only.  The
# original table listed 'm' twice (final mem, then regular mem); the first
# entry was dead because the later one overwrote it, so it has been dropped.
my %l2h = (
    'a' => 'א',
    'b' => 'ב',
    'g' => 'ג',
    'd' => 'ד',
    'h' => 'ה',
    'w' => 'ו',
    'z' => 'ז',
    'x' => 'ח',
    'v' => 'ט',
    'i' => 'י',
    'k' => 'כ',
    'l' => 'ל',
    'm' => 'מ',
    'n' => 'נ',
    's' => 'ס',
    'y' => 'ע',
    'p' => 'פ',
    'c' => 'צ',
    'q' => 'ק',
    'r' => 'ר',
    'e' => 'ש',
    't' => 'ת',
);
247
|
|
|
|
|
|
|
|
248
|
|
|
|
|
|
|
# Translate a string one character at a time through a lookup table.
#
# Parameters:
#   $from_string  - the text to translate
#   $mapping_hash - hash reference whose keys are single characters and whose
#                   values are their replacements
#
# Returns a new string in which every character that is a key of the mapping
# is replaced by the mapped value, and every other character is copied
# unchanged.  An undefined or empty input yields the empty string.
sub generic_translation {
    my ($from_string, $mapping_hash) = @_;

    # Make the "no input" case explicit instead of relying on split()
    # quietly producing an empty list for undef.
    return '' unless defined $from_string;

    # exists() rather than truthiness: a character may legitimately map to
    # '' or '0', and such a mapping must still be applied.
    return join '',
        map { exists $mapping_hash->{$_} ? $mapping_hash->{$_} : $_ }
        split //, $from_string;
}
260
|
|
|
|
|
|
|
|
261
|
|
|
|
|
|
|
# Transliterate Hebrew text into the Treebank alphabet.
#
# Takes the Hebrew string and returns its transliteration using the %h2t
# table; characters that are not in the table are copied unchanged.
sub hebrew2treebank {
    my ($hebrew_string) = @_;
    return generic_translation($hebrew_string, \%h2t);
}
265
|
|
|
|
|
|
|
|
266
|
|
|
|
|
|
|
# Transliterate Treebank-alphabet text back into Hebrew letters.
#
# Takes the Treebank string and returns the Hebrew text using the %t2h
# table; characters that are not in the table are copied unchanged.  Final
# Hebrew letter forms are not restored, because the table only holds the
# regular forms.
sub treebank2hebrew {
    my ($treebank_string) = @_;
    return generic_translation($treebank_string, \%t2h);
}
270
|
|
|
|
|
|
|
|
271
|
|
|
|
|
|
|
# Transliterate Hebrew text into Erel's alphabet.
#
# Takes the Hebrew string and returns its transliteration using the %h2e
# table; characters that are not in the table are copied unchanged.
sub hebrew2erel {
    my ($hebrew_string) = @_;
    return generic_translation($hebrew_string, \%h2e);
}
275
|
|
|
|
|
|
|
|
276
|
|
|
|
|
|
|
# Transliterate Erel-alphabet text back into Hebrew letters.
#
# Takes the Erel string and returns the Hebrew text using the %e2h table;
# characters that are not in the table are copied unchanged.  Final Hebrew
# letter forms are not restored, because the table only holds the regular
# forms.
sub erel2hebrew {
    # The original named this $treebank_string, a copy-paste leftover.
    my ($erel_string) = @_;
    return generic_translation($erel_string, \%e2h);
}
280
|
|
|
|
|
|
|
|
281
|
|
|
|
|
|
|
# Transliterate Hebrew text into FSMA's alphabet.
#
# Takes the Hebrew string and returns its transliteration using the %h2l
# table; characters that are not in the table are copied unchanged.
sub hebrew2fsma {
    my ($hebrew_string) = @_;
    return generic_translation($hebrew_string, \%h2l);
}
285
|
|
|
|
|
|
|
|
286
|
|
|
|
|
|
|
# Transliterate FSMA-alphabet text back into Hebrew letters.
#
# Takes the FSMA string and returns the Hebrew text using the %l2h table;
# characters that are not in the table are copied unchanged.  Final Hebrew
# letter forms are not restored, because the table only holds the regular
# forms.
sub fsma2hebrew {
    # The original named this $treebank_string, a copy-paste leftover.
    my ($fsma_string) = @_;
    return generic_translation($fsma_string, \%l2h);
}
290
|
|
|
|
|
|
|
|
291
|
|
|
|
|
|
|
|
292
|
|
|
|
|
|
|
1; |