line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Lingua::EN::Alphabet::Shaw; |
2
|
|
|
|
|
|
|
|
3
|
7
|
|
|
7
|
|
258167
|
use 5.005; |
|
7
|
|
|
|
|
28
|
|
|
7
|
|
|
|
|
307
|
|
4
|
7
|
|
|
7
|
|
42
|
use strict; |
|
7
|
|
|
|
|
14
|
|
|
7
|
|
|
|
|
301
|
|
5
|
7
|
|
|
7
|
|
53
|
use warnings; |
|
7
|
|
|
|
|
32
|
|
|
7
|
|
|
|
|
221
|
|
6
|
7
|
|
|
7
|
|
393275
|
use DBI; |
|
7
|
|
|
|
|
267728
|
|
|
7
|
|
|
|
|
1983
|
|
7
|
7
|
|
|
7
|
|
9239
|
use Encode; |
|
7
|
|
|
|
|
167369
|
|
|
7
|
|
|
|
|
1315
|
|
8
|
7
|
|
|
7
|
|
21478
|
use File::ShareDir qw(dist_file); |
|
7
|
|
|
|
|
82554
|
|
|
7
|
|
|
|
|
669
|
|
9
|
7
|
|
|
7
|
|
12218
|
use HTML::Parser; |
|
7
|
|
|
|
|
104934
|
|
|
7
|
|
|
|
|
27687
|
|
10
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
our $VERSION = 0.64; |
12
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
sub new { |
14
|
7
|
|
|
7
|
1
|
4914
|
my ($class) = @_; |
15
|
|
|
|
|
|
|
my $self = { |
16
|
|
|
|
|
|
|
dbh => undef, |
17
|
|
|
|
|
|
|
sth => undef, |
18
|
|
|
|
|
|
|
map => undef, |
19
|
|
|
|
|
|
|
# default behaviour for "unknown" is to return its argument |
20
|
3
|
|
|
3
|
|
33
|
unknown => sub { $_[0]; }, |
21
|
7
|
|
|
|
|
84
|
}; |
22
|
7
|
|
|
|
|
42
|
return bless($self, $class); |
23
|
|
|
|
|
|
|
} |
24
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
sub unknown_handler { |
26
|
2
|
|
|
2
|
1
|
17
|
my ($self, $handler) = @_; |
27
|
|
|
|
|
|
|
|
28
|
2
|
100
|
|
|
|
13
|
$self->{unknown} = $handler if defined $handler; |
29
|
|
|
|
|
|
|
|
30
|
2
|
|
|
|
|
13
|
return $self->{unknown}; |
31
|
|
|
|
|
|
|
} |
32
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
my %_source_to_bank = ( |
34
|
|
|
|
|
|
|
0 => 'W', # Shavian wiki |
35
|
|
|
|
|
|
|
1 => 'C', # CMUDict |
36
|
|
|
|
|
|
|
2 => 'A', # Androcles and the Lion |
37
|
|
|
|
|
|
|
); |
38
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
sub transliterate_details { |
40
|
|
|
|
|
|
|
|
41
|
23
|
|
|
23
|
0
|
44
|
my @result; |
42
|
|
|
|
|
|
|
|
43
|
23
|
|
|
|
|
57
|
my ($self, @texts) = @_; |
44
|
|
|
|
|
|
|
|
45
|
23
|
100
|
|
|
|
137
|
unless (defined $self->{dbh}) { |
46
|
6
|
|
|
|
|
13
|
my $filename; |
47
|
|
|
|
|
|
|
|
48
|
|
|
|
|
|
|
# allow a local override |
49
|
6
|
|
|
|
|
12846
|
$filename = glob('~/.cache/shavian/shavian-set.sqlite'); |
50
|
6
|
50
|
|
|
|
140
|
$filename = dist_file('Lingua-EN-Alphabet-Shaw', 'shavian-set.sqlite') unless -e $filename; |
51
|
|
|
|
|
|
|
|
52
|
6
|
|
|
|
|
2115
|
$self->{dbh} = DBI->connect("dbi:SQLite:dbname=$filename","",""); |
53
|
6
|
|
|
|
|
129373
|
$self->{sth} = $self->{dbh}->prepare('select shaw, pos, dab, source from words where latn=?'); |
54
|
|
|
|
|
|
|
} |
55
|
|
|
|
|
|
|
|
56
|
23
|
|
|
|
|
2827
|
my $prevpos = 'n'; # sensible default |
57
|
|
|
|
|
|
|
|
58
|
|
|
|
|
|
|
my $lookup_word = sub { |
59
|
113
|
|
|
113
|
|
149
|
my ($word) = @_; |
60
|
|
|
|
|
|
|
|
61
|
113
|
|
|
|
|
14329
|
$self->{sth}->execute(lc $word); |
62
|
113
|
|
|
|
|
2064
|
my $homonyms = $self->{sth}->fetchall_arrayref(); |
63
|
|
|
|
|
|
|
return { |
64
|
113
|
100
|
|
|
|
319
|
bank => 'U', |
65
|
|
|
|
|
|
|
src => $word, |
66
|
|
|
|
|
|
|
text => $self->{'unknown'}->($word, $word), |
67
|
|
|
|
|
|
|
} unless @$homonyms; |
68
|
109
|
|
|
|
|
158
|
my $candidate = $homonyms->[0]; |
69
|
109
|
|
|
|
|
214
|
for (@$homonyms) { |
70
|
122
|
100
|
|
|
|
8024
|
$candidate = $_ if $_->[2] =~ $prevpos; |
71
|
122
|
100
|
100
|
|
|
383
|
$candidate = $_ if $_->[2] eq 'g' && $word =~ /^[A-Z]/; |
72
|
122
|
100
|
100
|
|
|
475
|
$candidate = $_ if $_->[2] eq 'h' && $word =~ /^[a-z]/; |
73
|
|
|
|
|
|
|
} |
74
|
|
|
|
|
|
|
|
75
|
109
|
|
|
|
|
173
|
$prevpos = $candidate->[1]; |
76
|
|
|
|
|
|
|
|
77
|
109
|
|
50
|
|
|
643
|
my $result = { |
78
|
|
|
|
|
|
|
bank => $_source_to_bank{$candidate->[3]} || '?', |
79
|
|
|
|
|
|
|
src => $word, |
80
|
|
|
|
|
|
|
text => decode_utf8($candidate->[0]), |
81
|
|
|
|
|
|
|
}; |
82
|
|
|
|
|
|
|
|
83
|
109
|
100
|
|
|
|
5995
|
$result->{'dab'}=1 if scalar(@$homonyms)>1; |
84
|
|
|
|
|
|
|
|
85
|
109
|
|
|
|
|
585
|
return $result; |
86
|
23
|
|
|
|
|
153
|
}; |
87
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
my $store_literal = sub { |
89
|
162
|
|
|
162
|
|
241
|
my ($literal) = @_; |
90
|
162
|
100
|
|
|
|
366
|
return if $literal eq ''; |
91
|
|
|
|
|
|
|
|
92
|
139
|
100
|
100
|
|
|
716
|
if (@result && $result[-1]->{'bank'} eq 'L') { |
93
|
24
|
|
|
|
|
89
|
$result[-1]->{'text'} .= $literal; |
94
|
|
|
|
|
|
|
} else { |
95
|
115
|
|
|
|
|
452
|
push @result, { bank=>'L', text=>$literal }; |
96
|
|
|
|
|
|
|
} |
97
|
23
|
|
|
|
|
91
|
}; |
98
|
|
|
|
|
|
|
|
99
|
23
|
|
|
|
|
87
|
while (@texts) { |
100
|
43
|
|
|
|
|
81
|
my $text = shift @texts; |
101
|
|
|
|
|
|
|
|
102
|
43
|
|
|
|
|
586
|
my @splittext = split(m/(?
|
103
|
|
|
|
|
|
|
|
104
|
43
|
|
|
|
|
121
|
while (@splittext) { |
105
|
141
|
|
|
|
|
331
|
$store_literal->(shift @splittext); |
106
|
|
|
|
|
|
|
|
107
|
141
|
100
|
|
|
|
2351
|
push @result, $lookup_word->(shift @splittext) if @splittext; |
108
|
|
|
|
|
|
|
} |
109
|
|
|
|
|
|
|
|
110
|
43
|
100
|
|
|
|
4819
|
$store_literal->(shift @texts) if @texts; |
111
|
|
|
|
|
|
|
} |
112
|
|
|
|
|
|
|
|
113
|
23
|
50
|
|
|
|
469
|
return @result if wantarray; |
114
|
0
|
|
|
|
|
0
|
return [@result]; |
115
|
|
|
|
|
|
|
} |
116
|
|
|
|
|
|
|
|
117
|
|
|
|
|
|
|
sub transliterate { |
118
|
22
|
|
|
22
|
1
|
2714
|
my ($self, @texts) = @_; |
119
|
|
|
|
|
|
|
|
120
|
22
|
|
|
|
|
85
|
return join('', map { $_->{'text'} } $self->transliterate_details(@texts) ); |
|
211
|
|
|
|
|
1368
|
|
121
|
|
|
|
|
|
|
} |
122
|
|
|
|
|
|
|
|
123
|
|
|
|
|
|
|
sub mapping { |
124
|
|
|
|
|
|
|
|
125
|
5
|
|
|
5
|
1
|
8
|
my ($self, $text) = @_; |
126
|
|
|
|
|
|
|
|
127
|
5
|
100
|
|
|
|
15
|
unless (defined $self->{map}) { |
128
|
1
|
|
|
|
|
21
|
$self->{map} = {}; |
129
|
1
|
|
|
|
|
3
|
my $codepoint = 66640; |
130
|
1
|
|
|
|
|
5
|
for (qw(p t k f T s S c j N b d g v H z |
131
|
|
|
|
|
|
|
Z J w h l m i e A a o U Q y r n |
132
|
|
|
|
|
|
|
I E F u O M q Y R P X x D C W V)) { |
133
|
48
|
|
|
|
|
90
|
$self->{map}->{chr($codepoint)} = $_; |
134
|
48
|
|
|
|
|
98
|
$self->{map}->{$_} = chr($codepoint); |
135
|
48
|
|
|
|
|
51
|
$codepoint++; |
136
|
|
|
|
|
|
|
} |
137
|
|
|
|
|
|
|
|
138
|
1
|
|
|
|
|
3
|
my $naming_dot = chr(0xB7); |
139
|
1
|
|
|
|
|
2
|
$self->{map}->{$naming_dot} = 'G'; |
140
|
1
|
|
|
|
|
2
|
$self->{map}->{'G'} = $naming_dot; |
141
|
1
|
|
|
|
|
3
|
$self->{map}->{'B'} = $naming_dot; |
142
|
|
|
|
|
|
|
# some standards also map it to the solidus |
143
|
|
|
|
|
|
|
# but that will stop this function being |
144
|
|
|
|
|
|
|
# its own inverse |
145
|
|
|
|
|
|
|
} |
146
|
|
|
|
|
|
|
|
147
|
|
|
|
|
|
|
my $remap = sub { |
148
|
82
|
|
|
82
|
|
100
|
my ($char) = @_; |
149
|
82
|
100
|
|
|
|
291
|
return $self->{map}->{$char} if defined $self->{map}->{$char}; |
150
|
24
|
|
|
|
|
53
|
return $char; |
151
|
5
|
|
|
|
|
21
|
}; |
152
|
|
|
|
|
|
|
|
153
|
5
|
|
|
|
|
25
|
$text =~ s/(.)/$remap->($1)/ge; |
|
82
|
|
|
|
|
113
|
|
154
|
5
|
|
|
|
|
40
|
return $text; |
155
|
|
|
|
|
|
|
} |
156
|
|
|
|
|
|
|
|
157
|
|
|
|
|
|
|
sub normalise { |
158
|
7
|
|
|
7
|
1
|
15992
|
my ($self, $shaw) = @_; |
159
|
|
|
|
|
|
|
|
160
|
7
|
|
|
|
|
76
|
my %mappings = ( |
161
|
|
|
|
|
|
|
chr(66664).chr(66670) => chr(66680), # ash + roar = are |
162
|
|
|
|
|
|
|
chr(66666).chr(66670) => chr(66681), # on + roar = or |
163
|
|
|
|
|
|
|
chr(66663).chr(66670) => chr(66682), # egg + roar = air |
164
|
|
|
|
|
|
|
chr(66675).chr(66670) => chr(66683), # up + roar = err |
165
|
|
|
|
|
|
|
chr(66665).chr(66670) => chr(66684), # ado + roar = array |
166
|
|
|
|
|
|
|
chr(66662).chr(66670) => chr(66685), # if + roar = ear |
167
|
|
|
|
|
|
|
chr(66662).chr(66665) => chr(66686), # if + ado = ian |
168
|
|
|
|
|
|
|
chr(66648).chr(66677) => chr(66687), # yea + ooze = yew |
169
|
|
|
|
|
|
|
); |
170
|
|
|
|
|
|
|
|
171
|
7
|
|
|
|
|
62
|
for (keys %mappings) { |
172
|
56
|
|
|
|
|
600
|
$shaw =~ s/$_/$mappings{$_}/g; |
173
|
|
|
|
|
|
|
} |
174
|
|
|
|
|
|
|
|
175
|
7
|
|
|
|
|
55
|
return $shaw; |
176
|
|
|
|
|
|
|
} |
177
|
|
|
|
|
|
|
|
178
|
|
|
|
|
|
|
sub transliterate_html { |
179
|
1
|
|
|
1
|
1
|
952
|
my ($self, $html, %flags) = @_; |
180
|
|
|
|
|
|
|
|
181
|
1
|
|
|
|
|
3
|
my @content; |
182
|
|
|
|
|
|
|
my $result; |
183
|
|
|
|
|
|
|
|
184
|
1
|
|
|
|
|
5
|
my %toplevel_tags = map {$_=>1} qw(p div h1 h2 h3 h4 h5 h6 ul ol li dt dd dl title); |
|
15
|
|
|
|
|
35
|
|
185
|
1
|
|
|
|
|
4
|
my %text_attrs = map {$_=>1} qw(alt title); |
|
2
|
|
|
|
|
16
|
|
186
|
|
|
|
|
|
|
|
187
|
1
|
|
|
|
|
4
|
my $generator_seen = 0; |
188
|
1
|
|
|
|
|
3
|
my $generator_name = ref($self); |
189
|
|
|
|
|
|
|
|
190
|
|
|
|
|
|
|
my $output = sub { |
191
|
27
|
|
|
27
|
|
44
|
my ($repr, $tag) = @_; |
192
|
|
|
|
|
|
|
|
193
|
27
|
100
|
100
|
|
|
328
|
if (!$tag || $toplevel_tags{$tag}) { |
194
|
9
|
|
|
|
|
16
|
my $want_tag = 0; |
195
|
9
|
|
|
|
|
26
|
my @ordered = (''); |
196
|
9
|
|
|
|
|
22
|
for (@content) { |
197
|
39
|
|
|
|
|
97
|
my $is_tag = /^; |
198
|
39
|
100
|
|
|
|
81
|
if ($want_tag != $is_tag) { |
199
|
31
|
|
|
|
|
48
|
push @ordered, ''; |
200
|
31
|
|
|
|
|
35
|
$want_tag = $is_tag; |
201
|
|
|
|
|
|
|
} |
202
|
39
|
|
|
|
|
85
|
$ordered[-1] .= $_; |
203
|
|
|
|
|
|
|
} |
204
|
9
|
50
|
|
|
|
21
|
if ($flags{'titles'}) { |
205
|
|
|
|
|
|
|
# FIXME we should also include class="dab" if they ask for it |
206
|
0
|
|
|
|
|
0
|
my $entity = 0; |
207
|
0
|
|
|
|
|
0
|
for my $detail ($self->transliterate_details(@ordered)) { |
208
|
0
|
0
|
0
|
|
|
0
|
if (defined $detail->{'src'} && !$entity) { |
209
|
0
|
|
|
|
|
0
|
$result .= '
210
|
|
|
|
|
|
|
$detail->{'src'} . |
211
|
|
|
|
|
|
|
'">' . |
212
|
|
|
|
|
|
|
$detail->{'text'} . |
213
|
|
|
|
|
|
|
''; |
214
|
|
|
|
|
|
|
} else { |
215
|
0
|
|
|
|
|
0
|
$result .= $detail->{'text'}; |
216
|
0
|
|
|
|
|
0
|
$entity = ($detail->{'text'} =~ /&$/); |
217
|
|
|
|
|
|
|
} |
218
|
|
|
|
|
|
|
} |
219
|
|
|
|
|
|
|
} else { |
220
|
9
|
|
|
|
|
42
|
$result .= $self->transliterate(@ordered); |
221
|
|
|
|
|
|
|
} |
222
|
9
|
|
|
|
|
80
|
@content = (); |
223
|
9
|
|
|
|
|
209
|
$result .= $repr; |
224
|
|
|
|
|
|
|
} else { |
225
|
18
|
|
|
|
|
112
|
push @content, $repr; |
226
|
|
|
|
|
|
|
} |
227
|
1
|
|
|
|
|
8
|
}; |
228
|
|
|
|
|
|
|
|
229
|
|
|
|
|
|
|
my $p = HTML::Parser->new( api_version => 3, |
230
|
|
|
|
|
|
|
handlers => { |
231
|
|
|
|
|
|
|
text => [sub { |
232
|
20
|
|
|
20
|
|
36
|
my ($text) = @_; |
233
|
20
|
|
|
|
|
117
|
push @content, $text; |
234
|
|
|
|
|
|
|
}, 'text'], |
235
|
|
|
|
|
|
|
start => [sub { |
236
|
13
|
|
|
13
|
|
29
|
my ($tag, $attrs) = @_; |
237
|
13
|
|
|
|
|
28
|
my $repr = "<$tag"; |
238
|
13
|
|
|
|
|
64
|
for my $attr (sort keys %$attrs) { |
239
|
5
|
100
|
|
|
|
14
|
next if $attr eq '/'; |
240
|
4
|
|
|
|
|
12
|
my $value = $attrs->{$attr}; |
241
|
4
|
100
|
|
|
|
19
|
$value = $self->transliterate($value) |
242
|
|
|
|
|
|
|
if $text_attrs{$attr}; |
243
|
4
|
|
|
|
|
36
|
$repr .= " $attr=\"$value\""; |
244
|
|
|
|
|
|
|
} |
245
|
13
|
100
|
|
|
|
43
|
$repr .= '/' if $attrs->{'/'}; |
246
|
13
|
|
|
|
|
22
|
$repr .= '>'; |
247
|
|
|
|
|
|
|
|
248
|
13
|
0
|
33
|
|
|
55
|
if ($tag eq 'meta' && |
|
|
|
33
|
|
|
|
|
249
|
|
|
|
|
|
|
lc($attrs->{'name'}) eq 'generator' && |
250
|
|
|
|
|
|
|
lc($attrs->{'content'}) eq lc($generator_name)) { |
251
|
|
|
|
|
|
|
|
252
|
0
|
|
|
|
|
0
|
$generator_seen = 1; |
253
|
|
|
|
|
|
|
} |
254
|
|
|
|
|
|
|
|
255
|
13
|
|
|
|
|
29
|
$output->($repr, $tag); |
256
|
|
|
|
|
|
|
}, 'tagname, attr'], |
257
|
|
|
|
|
|
|
end => [sub { |
258
|
12
|
|
|
12
|
|
25
|
my ($tag) = @_; |
259
|
12
|
|
|
|
|
34
|
my $repr .= "$tag>"; |
260
|
|
|
|
|
|
|
|
261
|
12
|
100
|
66
|
|
|
43
|
if ($tag eq 'head' && !$generator_seen) { |
262
|
1
|
|
|
|
|
5
|
$output->("", |
263
|
|
|
|
|
|
|
$tag); |
264
|
|
|
|
|
|
|
} |
265
|
|
|
|
|
|
|
|
266
|
12
|
|
|
|
|
28
|
$output->($repr, $tag); |
267
|
|
|
|
|
|
|
}, 'tagname'], |
268
|
|
|
|
|
|
|
comment => [sub { |
269
|
1
|
|
|
1
|
|
3
|
my ($text) = @_; |
270
|
1
|
|
|
|
|
5
|
push @content, $text; |
271
|
1
|
|
|
|
|
40
|
}, 'text'], |
272
|
|
|
|
|
|
|
}, |
273
|
|
|
|
|
|
|
marked_sections => 1, |
274
|
|
|
|
|
|
|
); |
275
|
|
|
|
|
|
|
|
276
|
1
|
|
|
|
|
112
|
$p->parse($html); |
277
|
1
|
|
|
|
|
12
|
$p->eof(); |
278
|
1
|
|
|
|
|
3
|
$output->(''); |
279
|
|
|
|
|
|
|
|
280
|
1
|
|
|
|
|
1216
|
return $result; |
281
|
|
|
|
|
|
|
} |
282
|
|
|
|
|
|
|
|
283
|
|
|
|
|
|
|
sub DESTROY { |
284
|
7
|
|
|
7
|
|
8738
|
my ($self) = @_; |
285
|
|
|
|
|
|
|
|
286
|
7
|
100
|
|
|
|
2061
|
$self->{sth}->finish() if defined $self->{sth}; |
287
|
|
|
|
|
|
|
} |
288
|
|
|
|
|
|
|
|
289
|
|
|
|
|
|
|
1; |
290
|
|
|
|
|
|
|
=head1 NAME |
291
|
|
|
|
|
|
|
|
292
|
|
|
|
|
|
|
Lingua::EN::Alphabet::Shaw - transliterate the Latin to Shavian alphabets |
293
|
|
|
|
|
|
|
|
294
|
|
|
|
|
|
|
=head1 AUTHOR |
295
|
|
|
|
|
|
|
|
296
|
|
|
|
|
|
|
Thomas Thurman |
297
|
|
|
|
|
|
|
|
298
|
|
|
|
|
|
|
=head1 SYNOPSIS |
299
|
|
|
|
|
|
|
|
300
|
|
|
|
|
|
|
use Lingua::EN::Alphabet::Shaw; |
301
|
|
|
|
|
|
|
|
302
|
|
|
|
|
|
|
my $shaw = Lingua::EN::Alphabet::Shaw->new(); |
303
|
|
|
|
|
|
|
print $shaw->transliterate('I live near a live wire.'); |
304
|
|
|
|
|
|
|
|
305
|
|
|
|
|
|
|
=head1 DESCRIPTION |
306
|
|
|
|
|
|
|
|
307
|
|
|
|
|
|
|
The Shaw or Shavian alphabet was commissioned by the will of the playwright |
308
|
|
|
|
|
|
|
George Bernard Shaw in the early 1960s as a replacement for the Latin |
309
|
|
|
|
|
|
|
alphabet for representing English. It is designed to have a one-to-one |
310
|
|
|
|
|
|
|
phonemic (not phonetic) mapping with the sounds of English. |
311
|
|
|
|
|
|
|
|
312
|
|
|
|
|
|
|
Its ISO 15924 code is "Shaw" 281. |
313
|
|
|
|
|
|
|
|
314
|
|
|
|
|
|
|
This module transliterates English text from the Latin alphabet into the |
315
|
|
|
|
|
|
|
Shavian alphabet. |
316
|
|
|
|
|
|
|
|
317
|
|
|
|
|
|
|
The API has changed since version 0.03 to be object-based. |
318
|
|
|
|
|
|
|
|
319
|
|
|
|
|
|
|
If you find an error in the translation database, you can change it |
320
|
|
|
|
|
|
|
yourself at http://shavian.org.uk/wiki/ . You may download a current |
321
|
|
|
|
|
|
|
copy of the dataset at http://shavian.org.uk/set/ . |
322
|
|
|
|
|
|
|
If you want to override the database shipped with this module, |
323
|
|
|
|
|
|
|
place the new copy at ~/.cache/shavian/shavian-set.sqlite and it will |
324
|
|
|
|
|
|
|
be used in preference. |
325
|
|
|
|
|
|
|
|
326
|
|
|
|
|
|
|
=head1 METHODS |
327
|
|
|
|
|
|
|
|
328
|
|
|
|
|
|
|
=head2 Lingua::EN::Alphabet::Shaw->new() |
329
|
|
|
|
|
|
|
|
330
|
|
|
|
|
|
|
Constructor. Currently takes no arguments. |
331
|
|
|
|
|
|
|
|
332
|
|
|
|
|
|
|
=head2 $shaw->transliterate($phrase) |
333
|
|
|
|
|
|
|
|
334
|
|
|
|
|
|
|
Returns the transliteration of the given phrase into the Shavian alphabet. |
335
|
|
|
|
|
|
|
Can handle multi-word phrases. Does a reasonable job resolving homonym |
336
|
|
|
|
|
|
|
ambiguity ("does he like does?"). |
337
|
|
|
|
|
|
|
|
338
|
|
|
|
|
|
|
If you pass multiple arguments, the results will be concatenated, and only the |
339
|
|
|
|
|
|
|
odd-numbered arguments will be transliterated. The state of homonym |
340
|
|
|
|
|
|
|
resolution is maintained. This allows you to embed chunks of text |
341
|
|
|
|
|
|
|
which should not be transliterated into the line, such as XML tags. |
342
|
|
|
|
|
|
|
|
343
|
|
|
|
|
|
|
=head2 $shaw->unknown_handler([$handler]) |
344
|
|
|
|
|
|
|
|
345
|
|
|
|
|
|
|
If a word is not found in the translation database, the transliteration |
346
|
|
|
|
|
|
|
routines will call a particular handler to find out what to do, with the |
347
|
|
|
|
|
|
|
unknown word as both its first and second arguments. (This is to allow |
348
|
|
|
|
|
|
|
later expansion; see BUGS AND ISSUES, below.) |
349
|
|
|
|
|
|
|
The result of the handler should be |
350
|
|
|
|
|
|
|
a string, which will be inserted into the result of the transliteration |
351
|
|
|
|
|
|
|
routine at the correct place. |
352
|
|
|
|
|
|
|
|
353
|
|
|
|
|
|
|
This method allows you to set a new handler by passing it as an argument. |
354
|
|
|
|
|
|
|
If you pass no argument, this method returns the current handler. |
355
|
|
|
|
|
|
|
|
356
|
|
|
|
|
|
|
The default handler only returns its argument. A replacement handler could, |
357
|
|
|
|
|
|
|
for example, make an attempt at guessing the transliteration; it could die, |
358
|
|
|
|
|
|
|
to abort the transliteration process; it could return its argument but |
359
|
|
|
|
|
|
|
also store the new value in a table so that a list of missing words could |
360
|
|
|
|
|
|
|
later be reported to the user. |
361
|
|
|
|
|
|
|
|
362
|
|
|
|
|
|
|
=head2 $shaw->mapping($phrase) |
363
|
|
|
|
|
|
|
|
364
|
|
|
|
|
|
|
There is a quasi-standard mapping of the conventional alphabet onto the Shavian |
365
|
|
|
|
|
|
|
alphabet. This method maps Shavian text into the conventional alphabet |
366
|
|
|
|
|
|
|
and vice versa. It does not transliterate. |
367
|
|
|
|
|
|
|
Think of this as a kind of ASCII-armouring. |
368
|
|
|
|
|
|
|
|
369
|
|
|
|
|
|
|
Various versions of the standard map the naming dot to "G", "B", and "/". |
370
|
|
|
|
|
|
|
This method does not support "/", but maps both "G" and "B" to the naming |
371
|
|
|
|
|
|
|
dot; in reverse, it maps the naming dot to "G". |
372
|
|
|
|
|
|
|
|
373
|
|
|
|
|
|
|
The letters "K" and "L" have no mapping to Shavian letters, and are |
374
|
|
|
|
|
|
|
left alone. |
375
|
|
|
|
|
|
|
|
376
|
|
|
|
|
|
|
=head2 $shaw->normalise($shavian_text) |
377
|
|
|
|
|
|
|
|
378
|
|
|
|
|
|
|
Certain letters in the Shavian alphabet are ligatures of pairs of |
379
|
|
|
|
|
|
|
other letters: because of this, these pairs should not exist separately. |
380
|
|
|
|
|
|
|
(For example, the letter YEW is a ligature of YEA and OOZE.) This method |
381
|
|
|
|
|
|
|
replaces these pairs with their ligature equivalents. |
382
|
|
|
|
|
|
|
|
383
|
|
|
|
|
|
|
=head2 $shaw->transliterate_html($html) |
384
|
|
|
|
|
|
|
|
385
|
|
|
|
|
|
|
Given a block of text in the conventional alphabet which is formatted |
386
|
|
|
|
|
|
|
as HTML, this will make a reasonable attempt at returning the same text |
387
|
|
|
|
|
|
|
transliterated into the Shavian alphabet. It is aware of which tags |
388
|
|
|
|
|
|
|
commonly break the flow of sentences, and handles homonym resolution |
389
|
|
|
|
|
|
|
accordingly. |
390
|
|
|
|
|
|
|
|
391
|
|
|
|
|
|
|
=head1 BUGS AND ISSUES |
392
|
|
|
|
|
|
|
|
393
|
|
|
|
|
|
|
There should be a version of the main transliteration method which |
394
|
|
|
|
|
|
|
returned a list of hashes, each of which gave the source and |
395
|
|
|
|
|
|
|
destination forms of a word, part of speech and disambiguation |
396
|
|
|
|
|
|
|
information, and a marking of the source (CMUDict or |
397
|
|
|
|
|
|
|
Shavian Wiki). |
398
|
|
|
|
|
|
|
|
399
|
|
|
|
|
|
|
It should probably be possible to transliterate in reverse, |
400
|
|
|
|
|
|
|
from Shavian to the conventional alphabet. |
401
|
|
|
|
|
|
|
|
402
|
|
|
|
|
|
|
It should be possible to handle other alternative scripts, such as |
403
|
|
|
|
|
|
|
Deseret and Tengwar. This shouldn't be very difficult. |
404
|
|
|
|
|
|
|
It would also allow representation in the IPA, which would mean |
405
|
|
|
|
|
|
|
this module could be used for simple text-to-speech processing. |
406
|
|
|
|
|
|
|
|
407
|
|
|
|
|
|
|
The portion of the database which is taken from CMUdict exhibits |
408
|
|
|
|
|
|
|
unhelpful mergers (notably father/bother). There isn't much that |
409
|
|
|
|
|
|
|
can be done about this except extending the Shavian wiki further. |
410
|
|
|
|
|
|
|
In addition, in some cases it does not use the letters ARRAY and |
411
|
|
|
|
|
|
|
ADO in unstressed syllables as they should be; this could and should be |
412
|
|
|
|
|
|
|
fixed. |
413
|
|
|
|
|
|
|
|
414
|
|
|
|
|
|
|
It would be useful on initialisation to read a text file |
415
|
|
|
|
|
|
|
in a standard location, which gave a local mapping overriding the |
416
|
|
|
|
|
|
|
database for given words. |
417
|
|
|
|
|
|
|
|
418
|
|
|
|
|
|
|
It would be helpful if there was a callback for any words found |
419
|
|
|
|
|
|
|
from the CMUDict data rather than from the Shavian Wiki data, so that |
420
|
|
|
|
|
|
|
the wiki could be updated. |
421
|
|
|
|
|
|
|
|
422
|
|
|
|
|
|
|
The HTML transliterator should mark its output as being |
423
|
|
|
|
|
|
|
encoded in UTF-8, whatever the source encoding. (Shavian cannot |
424
|
|
|
|
|
|
|
be represented in any other standard encoding.) |
425
|
|
|
|
|
|
|
|
426
|
|
|
|
|
|
|
The HTML transliterator should have an option which put a span |
427
|
|
|
|
|
|
|
around each word whose title was the word's spelling in the |
428
|
|
|
|
|
|
|
conventional alphabet, in the manner of translate.google.com. |
429
|
|
|
|
|
|
|
|
430
|
|
|
|
|
|
|
The HTML transliterator should have an option to rewrite the |
431
|
|
|
|
|
|
|
destinations of links, and to add a target to them, so that |
432
|
|
|
|
|
|
|
it can be used by a web script to link back to itself. |
433
|
|
|
|
|
|
|
|
434
|
|
|
|
|
|
|
The HTML transliterator should add a "generator" META tag |
435
|
|
|
|
|
|
|
referencing itself, if one is not already present. |
436
|
|
|
|
|
|
|
|
437
|
|
|
|
|
|
|
The HTML transliterator should ignore sections marked as |
438
|
|
|
|
|
|
|
being written in non-English languages. |
439
|
|
|
|
|
|
|
|
440
|
|
|
|
|
|
|
The HTML transliterator should have an option to |
441
|
|
|
|
|
|
|
allow loading documents in chunks, as C already does. |
442
|
|
|
|
|
|
|
|
443
|
|
|
|
|
|
|
The mapping() method should have an extra parameter to |
444
|
|
|
|
|
|
|
cause it to map in one direction only. |
445
|
|
|
|
|
|
|
|
446
|
|
|
|
|
|
|
Most of these will be implemented before this module reaches |
447
|
|
|
|
|
|
|
version 1.00. |
448
|
|
|
|
|
|
|
|
449
|
|
|
|
|
|
|
=head1 FONTS |
450
|
|
|
|
|
|
|
|
451
|
|
|
|
|
|
|
You will need a Shavian Unicode font to use this module. |
452
|
|
|
|
|
|
|
There are several such fonts at http://marnanel.org/shavian/fonts/ . |
453
|
|
|
|
|
|
|
Please be sure to get a Unicode font and not one with the "Latin mapping". |
454
|
|
|
|
|
|
|
|
455
|
|
|
|
|
|
|
However, the Mac can handle the Shavian alphabet out of the box. |
456
|
|
|
|
|
|
|
|
457
|
|
|
|
|
|
|
=head1 COPYRIGHT |
458
|
|
|
|
|
|
|
|
459
|
|
|
|
|
|
|
This Perl module is copyright (C) Thomas Thurman, 2009-2010. |
460
|
|
|
|
|
|
|
This is free software, and can be used/modified under the same terms as |
461
|
|
|
|
|
|
|
Perl itself. |
462
|
|
|
|
|
|
|
|
463
|
|
|
|
|
|
|
The transliteration data is available under various free licences, |
464
|
|
|
|
|
|
|
which are reproduced below. |
465
|
|
|
|
|
|
|
|
466
|
|
|
|
|
|
|
=head1 LICENCES |
467
|
|
|
|
|
|
|
|
468
|
|
|
|
|
|
|
=head2 Androcles and the Lion |
469
|
|
|
|
|
|
|
|
470
|
|
|
|
|
|
|
Part of the transliteration data was taken from the 1962 Shavian alphabet |
471
|
|
|
|
|
|
|
edition of "Androcles and the Lion"; this data is in the public domain. |
472
|
|
|
|
|
|
|
|
473
|
|
|
|
|
|
|
=head2 Shavian Wiki |
474
|
|
|
|
|
|
|
|
475
|
|
|
|
|
|
|
Part of the transliteration data was taken from the Shavian Wiki, and |
476
|
|
|
|
|
|
|
this is available under the Creative Commons cc-by-sa licence. |
477
|
|
|
|
|
|
|
|
478
|
|
|
|
|
|
|
=head2 CMUdict |
479
|
|
|
|
|
|
|
|
480
|
|
|
|
|
|
|
Another part of the transliteration data was taken from CMUdict. Its |
481
|
|
|
|
|
|
|
licence is reproduced below. |
482
|
|
|
|
|
|
|
|
483
|
|
|
|
|
|
|
Copyright (C) 1993-2008 Carnegie Mellon University. All rights reserved. |
484
|
|
|
|
|
|
|
|
485
|
|
|
|
|
|
|
Redistribution and use in source and binary forms, with or without |
486
|
|
|
|
|
|
|
modification, are permitted provided that the following conditions |
487
|
|
|
|
|
|
|
are met: |
488
|
|
|
|
|
|
|
|
489
|
|
|
|
|
|
|
1. Redistributions of source code must retain the above copyright |
490
|
|
|
|
|
|
|
notice, this list of conditions and the following disclaimer. |
491
|
|
|
|
|
|
|
The contents of this file are deemed to be source code. |
492
|
|
|
|
|
|
|
|
493
|
|
|
|
|
|
|
2. Redistributions in binary form must reproduce the above copyright |
494
|
|
|
|
|
|
|
notice, this list of conditions and the following disclaimer in |
495
|
|
|
|
|
|
|
the documentation and/or other materials provided with the |
496
|
|
|
|
|
|
|
distribution. |
497
|
|
|
|
|
|
|
|
498
|
|
|
|
|
|
|
This work was supported in part by funding from the Defense Advanced |
499
|
|
|
|
|
|
|
Research Projects Agency, the Office of Naval Research and the National |
500
|
|
|
|
|
|
|
Science Foundation of the United States of America, and by member |
501
|
|
|
|
|
|
|
companies of the Carnegie Mellon Sphinx Speech Consortium. We acknowledge |
502
|
|
|
|
|
|
|
the contributions of many volunteers to the expansion and improvement of |
503
|
|
|
|
|
|
|
this dictionary. |
504
|
|
|
|
|
|
|
|
505
|
|
|
|
|
|
|
THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND |
506
|
|
|
|
|
|
|
ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, |
507
|
|
|
|
|
|
|
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
508
|
|
|
|
|
|
|
PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY |
509
|
|
|
|
|
|
|
NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
510
|
|
|
|
|
|
|
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
511
|
|
|
|
|
|
|
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
512
|
|
|
|
|
|
|
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
513
|
|
|
|
|
|
|
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
514
|
|
|
|
|
|
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
515
|
|
|
|
|
|
|
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
516
|
|
|
|
|
|
|
|
517
|
|
|
|
|
|
|
=head2 Brown tagger |
518
|
|
|
|
|
|
|
|
519
|
|
|
|
|
|
|
The part-of-speech data was taken from the Brown tagger (although the |
520
|
|
|
|
|
|
|
tagger built into this model is not the Brown tagger, so its first |
521
|
|
|
|
|
|
|
sentence is inaccurate). Its licence is also reproduced below: |
522
|
|
|
|
|
|
|
|
523
|
|
|
|
|
|
|
This software was written by Eric Brill. |
524
|
|
|
|
|
|
|
|
525
|
|
|
|
|
|
|
This software is being provided to you, the LICENSEE, by the |
526
|
|
|
|
|
|
|
Massachusetts Institute of Technology (M.I.T.) under the following |
527
|
|
|
|
|
|
|
license. By obtaining, using and/or copying this software, you agree |
528
|
|
|
|
|
|
|
that you have read, understood, and will comply with these terms and |
529
|
|
|
|
|
|
|
conditions: |
530
|
|
|
|
|
|
|
|
531
|
|
|
|
|
|
|
Permission to [use, copy, modify and distribute, including the right to |
532
|
|
|
|
|
|
|
grant others rights to distribute at any tier, this software and its |
533
|
|
|
|
|
|
|
documentation for any purpose and without fee or royalty] is hereby |
534
|
|
|
|
|
|
|
granted, provided that you agree to comply with the following copyright |
535
|
|
|
|
|
|
|
notice and statements, including the disclaimer, and that the same |
536
|
|
|
|
|
|
|
appear on ALL copies of the software and documentation, including |
537
|
|
|
|
|
|
|
modifications that you make for internal use or for distribution: |
538
|
|
|
|
|
|
|
|
539
|
|
|
|
|
|
|
Copyright 1993 by the Massachusetts Institute of Technology and the |
540
|
|
|
|
|
|
|
University of Pennsylvania. All rights reserved. |
541
|
|
|
|
|
|
|
|
542
|
|
|
|
|
|
|
THIS SOFTWARE IS PROVIDED "AS IS", AND M.I.T. MAKES NO REPRESENTATIONS |
543
|
|
|
|
|
|
|
OR WARRANTIES, EXPRESS OR IMPLIED. By way of example, but not |
544
|
|
|
|
|
|
|
limitation, M.I.T. MAKES NO REPRESENTATIONS OR WARRANTIES OF |
545
|
|
|
|
|
|
|
MERCHANTABILITY OR FITNESS FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF |
546
|
|
|
|
|
|
|
THE LICENSED SOFTWARE OR DOCUMENTATION WILL NOT INFRINGE ANY THIRD PARTY |
547
|
|
|
|
|
|
|
PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS. |
548
|
|
|
|
|
|
|
|
549
|
|
|
|
|
|
|
The name of the Massachusetts Institute of Technology or M.I.T. may NOT |
550
|
|
|
|
|
|
|
be used in advertising or publicity pertaining to distribution of the |
551
|
|
|
|
|
|
|
software. Title to copyright in this software and any associated |
552
|
|
|
|
|
|
|
documentation shall at all times remain with M.I.T., and USER agrees to |
553
|
|
|
|
|
|
|
preserve same. |