| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package Lingua::EN::Alphabet::Shaw; |
|
2
|
|
|
|
|
|
|
|
|
3
|
7
|
|
|
7
|
|
258167
|
use 5.005; |
|
|
7
|
|
|
|
|
28
|
|
|
|
7
|
|
|
|
|
307
|
|
|
4
|
7
|
|
|
7
|
|
42
|
use strict; |
|
|
7
|
|
|
|
|
14
|
|
|
|
7
|
|
|
|
|
301
|
|
|
5
|
7
|
|
|
7
|
|
53
|
use warnings; |
|
|
7
|
|
|
|
|
32
|
|
|
|
7
|
|
|
|
|
221
|
|
|
6
|
7
|
|
|
7
|
|
393275
|
use DBI; |
|
|
7
|
|
|
|
|
267728
|
|
|
|
7
|
|
|
|
|
1983
|
|
|
7
|
7
|
|
|
7
|
|
9239
|
use Encode; |
|
|
7
|
|
|
|
|
167369
|
|
|
|
7
|
|
|
|
|
1315
|
|
|
8
|
7
|
|
|
7
|
|
21478
|
use File::ShareDir qw(dist_file); |
|
|
7
|
|
|
|
|
82554
|
|
|
|
7
|
|
|
|
|
669
|
|
|
9
|
7
|
|
|
7
|
|
12218
|
use HTML::Parser; |
|
|
7
|
|
|
|
|
104934
|
|
|
|
7
|
|
|
|
|
27687
|
|
|
10
|
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
our $VERSION = 0.64; |
|
12
|
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
# Constructor: build a new transliterator object.
#
# Takes only the class name. The database handle ('dbh'), the prepared
# statement ('sth') and the letter table ('map') all start out undefined.
#
# Returns the new object, blessed into $class.
sub new {
    my ($class) = @_;

    my %state = (
        dbh => undef,
        sth => undef,
        map => undef,

        # Default behaviour for "unknown" is to hand the word back
        # exactly as it was given.
        unknown => sub { return $_[0] },
    );

    return bless \%state, $class;
}
|
24
|
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
# Get or set the callback used for words that are not in the database.
#
# If $handler is defined it replaces the stored callback; an undefined or
# missing argument leaves the stored callback alone.
#
# Returns the callback that is in effect after the call.
sub unknown_handler {
    my ($self, $handler) = @_;

    if (defined $handler) {
        $self->{unknown} = $handler;
    }

    return $self->{unknown};
}
|
32
|
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
# Single-letter "bank" code reported for each value of the "source"
# column read from the words table.
my %_source_to_bank = (
    0 => 'W',    # Shavian wiki
    1 => 'C',    # CMUDict
    2 => 'A',    # Androcles and the Lion
);
|
38
|
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
sub transliterate_details { |
|
40
|
|
|
|
|
|
|
|
|
41
|
23
|
|
|
23
|
0
|
44
|
my @result; |
|
42
|
|
|
|
|
|
|
|
|
43
|
23
|
|
|
|
|
57
|
my ($self, @texts) = @_; |
|
44
|
|
|
|
|
|
|
|
|
45
|
23
|
100
|
|
|
|
137
|
unless (defined $self->{dbh}) { |
|
46
|
6
|
|
|
|
|
13
|
my $filename; |
|
47
|
|
|
|
|
|
|
|
|
48
|
|
|
|
|
|
|
# allow a local override |
|
49
|
6
|
|
|
|
|
12846
|
$filename = glob('~/.cache/shavian/shavian-set.sqlite'); |
|
50
|
6
|
50
|
|
|
|
140
|
$filename = dist_file('Lingua-EN-Alphabet-Shaw', 'shavian-set.sqlite') unless -e $filename; |
|
51
|
|
|
|
|
|
|
|
|
52
|
6
|
|
|
|
|
2115
|
$self->{dbh} = DBI->connect("dbi:SQLite:dbname=$filename","",""); |
|
53
|
6
|
|
|
|
|
129373
|
$self->{sth} = $self->{dbh}->prepare('select shaw, pos, dab, source from words where latn=?'); |
|
54
|
|
|
|
|
|
|
} |
|
55
|
|
|
|
|
|
|
|
|
56
|
23
|
|
|
|
|
2827
|
my $prevpos = 'n'; # sensible default |
|
57
|
|
|
|
|
|
|
|
|
58
|
|
|
|
|
|
|
my $lookup_word = sub { |
|
59
|
113
|
|
|
113
|
|
149
|
my ($word) = @_; |
|
60
|
|
|
|
|
|
|
|
|
61
|
113
|
|
|
|
|
14329
|
$self->{sth}->execute(lc $word); |
|
62
|
113
|
|
|
|
|
2064
|
my $homonyms = $self->{sth}->fetchall_arrayref(); |
|
63
|
|
|
|
|
|
|
return { |
|
64
|
113
|
100
|
|
|
|
319
|
bank => 'U', |
|
65
|
|
|
|
|
|
|
src => $word, |
|
66
|
|
|
|
|
|
|
text => $self->{'unknown'}->($word, $word), |
|
67
|
|
|
|
|
|
|
} unless @$homonyms; |
|
68
|
109
|
|
|
|
|
158
|
my $candidate = $homonyms->[0]; |
|
69
|
109
|
|
|
|
|
214
|
for (@$homonyms) { |
|
70
|
122
|
100
|
|
|
|
8024
|
$candidate = $_ if $_->[2] =~ $prevpos; |
|
71
|
122
|
100
|
100
|
|
|
383
|
$candidate = $_ if $_->[2] eq 'g' && $word =~ /^[A-Z]/; |
|
72
|
122
|
100
|
100
|
|
|
475
|
$candidate = $_ if $_->[2] eq 'h' && $word =~ /^[a-z]/; |
|
73
|
|
|
|
|
|
|
} |
|
74
|
|
|
|
|
|
|
|
|
75
|
109
|
|
|
|
|
173
|
$prevpos = $candidate->[1]; |
|
76
|
|
|
|
|
|
|
|
|
77
|
109
|
|
50
|
|
|
643
|
my $result = { |
|
78
|
|
|
|
|
|
|
bank => $_source_to_bank{$candidate->[3]} || '?', |
|
79
|
|
|
|
|
|
|
src => $word, |
|
80
|
|
|
|
|
|
|
text => decode_utf8($candidate->[0]), |
|
81
|
|
|
|
|
|
|
}; |
|
82
|
|
|
|
|
|
|
|
|
83
|
109
|
100
|
|
|
|
5995
|
$result->{'dab'}=1 if scalar(@$homonyms)>1; |
|
84
|
|
|
|
|
|
|
|
|
85
|
109
|
|
|
|
|
585
|
return $result; |
|
86
|
23
|
|
|
|
|
153
|
}; |
|
87
|
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
my $store_literal = sub { |
|
89
|
162
|
|
|
162
|
|
241
|
my ($literal) = @_; |
|
90
|
162
|
100
|
|
|
|
366
|
return if $literal eq ''; |
|
91
|
|
|
|
|
|
|
|
|
92
|
139
|
100
|
100
|
|
|
716
|
if (@result && $result[-1]->{'bank'} eq 'L') { |
|
93
|
24
|
|
|
|
|
89
|
$result[-1]->{'text'} .= $literal; |
|
94
|
|
|
|
|
|
|
} else { |
|
95
|
115
|
|
|
|
|
452
|
push @result, { bank=>'L', text=>$literal }; |
|
96
|
|
|
|
|
|
|
} |
|
97
|
23
|
|
|
|
|
91
|
}; |
|
98
|
|
|
|
|
|
|
|
|
99
|
23
|
|
|
|
|
87
|
while (@texts) { |
|
100
|
43
|
|
|
|
|
81
|
my $text = shift @texts; |
|
101
|
|
|
|
|
|
|
|
|
102
|
43
|
|
|
|
|
586
|
my @splittext = split(m/(?
|
|
103
|
|
|
|
|
|
|
|
|
104
|
43
|
|
|
|
|
121
|
while (@splittext) { |
|
105
|
141
|
|
|
|
|
331
|
$store_literal->(shift @splittext); |
|
106
|
|
|
|
|
|
|
|
|
107
|
141
|
100
|
|
|
|
2351
|
push @result, $lookup_word->(shift @splittext) if @splittext; |
|
108
|
|
|
|
|
|
|
} |
|
109
|
|
|
|
|
|
|
|
|
110
|
43
|
100
|
|
|
|
4819
|
$store_literal->(shift @texts) if @texts; |
|
111
|
|
|
|
|
|
|
} |
|
112
|
|
|
|
|
|
|
|
|
113
|
23
|
50
|
|
|
|
469
|
return @result if wantarray; |
|
114
|
0
|
|
|
|
|
0
|
return [@result]; |
|
115
|
|
|
|
|
|
|
} |
|
116
|
|
|
|
|
|
|
|
|
117
|
|
|
|
|
|
|
# Transliterate text and return the result as a single string.
#
# Every argument after the invocant is passed unchanged to
# transliterate_details(), called in list context; the 'text' field of
# each record it returns is appended, in order, to the result.
sub transliterate {
    my ($self, @texts) = @_;

    my $shavian = '';
    for my $piece ($self->transliterate_details(@texts)) {
        $shavian .= $piece->{'text'};
    }

    return $shavian;
}
|
122
|
|
|
|
|
|
|
|
|
123
|
|
|
|
|
|
|
# Convert between Shavian letters and their conventional ASCII stand-ins.
#
# Each character of $text that has an entry in the letter table is
# replaced by that entry; every other character is left as it is. The
# table holds both directions, so the same call maps ASCII to Shavian and
# Shavian to ASCII.
#
# The table is built on the first call and kept in $self->{map}.
#
# Returns the converted copy of $text.
sub mapping {
    my ($self, $text) = @_;

    if (!defined $self->{map}) {
        my %table;

        # The 48 ASCII stand-ins, listed in the order of the code points
        # they are paired with.
        my @latin = qw(
            p t k f T s S c j N b d g v H z
            Z J w h l m i e A a o U Q y r n
            I E F u O M q Y R P X x D C W V
        );

        # 66640 is U+10450, the first letter of the Unicode Shavian block.
        for my $offset (0 .. $#latin) {
            my $shavian = chr(66640 + $offset);
            $table{$shavian}          = $latin[$offset];
            $table{ $latin[$offset] } = $shavian;
        }

        # The naming dot is U+00B7. Both 'G' and 'B' map to it, but it
        # maps back to 'G' only.
        my $naming_dot = chr(0xB7);
        $table{$naming_dot} = 'G';
        $table{'G'}         = $naming_dot;
        $table{'B'}         = $naming_dot;
        # some standards also map it to the solidus
        # but that will stop this function being
        # its own inverse

        $self->{map} = \%table;
    }

    my $table = $self->{map};
    $text =~ s/(.)/defined $table->{$1} ? $table->{$1} : $1/ge;

    return $text;
}
|
156
|
|
|
|
|
|
|
|
|
157
|
|
|
|
|
|
|
# Replace letter pairs with the single ligature letter that stands for them.
#
# $shaw is a string of Shavian text. Each of the eight pairs below is
# replaced by its ligature wherever it occurs.
#
# The text is scanned once, left to right, so the result is the same on
# every run. This matters for the sequence if, ado, roar, where the pairs
# if+ado and ado+roar overlap: the leftmost pair wins, giving ian followed
# by roar. Applying the substitutions one pair at a time in hash order
# gave either that or if followed by array, depending on the order in
# which the hash happened to return its keys.
#
# Returns the normalised string. An undefined argument is returned as it
# is.
sub normalise {
    my ($self, $shaw) = @_;

    return $shaw unless defined $shaw;

    my %ligature_for = (
        chr(66664).chr(66670) => chr(66680), # ash + roar = are
        chr(66666).chr(66670) => chr(66681), # on + roar = or
        chr(66663).chr(66670) => chr(66682), # egg + roar = air
        chr(66675).chr(66670) => chr(66683), # up + roar = err
        chr(66665).chr(66670) => chr(66684), # ado + roar = array
        chr(66662).chr(66670) => chr(66685), # if + roar = ear
        chr(66662).chr(66665) => chr(66686), # if + ado = ian
        chr(66648).chr(66677) => chr(66687), # yea + ooze = yew
    );

    # No two pairs are identical, so at most one alternative can match at
    # any position; the sort only keeps the pattern text stable.
    my $pair = join '|', sort keys %ligature_for;
    $shaw =~ s/($pair)/$ligature_for{$1}/g;

    return $shaw;
}
|
177
|
|
|
|
|
|
|
|
|
178
|
|
|
|
|
|
|
sub transliterate_html { |
|
179
|
1
|
|
|
1
|
1
|
952
|
my ($self, $html, %flags) = @_; |
|
180
|
|
|
|
|
|
|
|
|
181
|
1
|
|
|
|
|
3
|
my @content; |
|
182
|
|
|
|
|
|
|
my $result; |
|
183
|
|
|
|
|
|
|
|
|
184
|
1
|
|
|
|
|
5
|
my %toplevel_tags = map {$_=>1} qw(p div h1 h2 h3 h4 h5 h6 ul ol li dt dd dl title); |
|
|
15
|
|
|
|
|
35
|
|
|
185
|
1
|
|
|
|
|
4
|
my %text_attrs = map {$_=>1} qw(alt title); |
|
|
2
|
|
|
|
|
16
|
|
|
186
|
|
|
|
|
|
|
|
|
187
|
1
|
|
|
|
|
4
|
my $generator_seen = 0; |
|
188
|
1
|
|
|
|
|
3
|
my $generator_name = ref($self); |
|
189
|
|
|
|
|
|
|
|
|
190
|
|
|
|
|
|
|
my $output = sub { |
|
191
|
27
|
|
|
27
|
|
44
|
my ($repr, $tag) = @_; |
|
192
|
|
|
|
|
|
|
|
|
193
|
27
|
100
|
100
|
|
|
328
|
if (!$tag || $toplevel_tags{$tag}) { |
|
194
|
9
|
|
|
|
|
16
|
my $want_tag = 0; |
|
195
|
9
|
|
|
|
|
26
|
my @ordered = (''); |
|
196
|
9
|
|
|
|
|
22
|
for (@content) { |
|
197
|
39
|
|
|
|
|
97
|
my $is_tag = /^; |
|
198
|
39
|
100
|
|
|
|
81
|
if ($want_tag != $is_tag) { |
|
199
|
31
|
|
|
|
|
48
|
push @ordered, ''; |
|
200
|
31
|
|
|
|
|
35
|
$want_tag = $is_tag; |
|
201
|
|
|
|
|
|
|
} |
|
202
|
39
|
|
|
|
|
85
|
$ordered[-1] .= $_; |
|
203
|
|
|
|
|
|
|
} |
|
204
|
9
|
50
|
|
|
|
21
|
if ($flags{'titles'}) { |
|
205
|
|
|
|
|
|
|
# FIXME we should also include class="dab" if they ask for it |
|
206
|
0
|
|
|
|
|
0
|
my $entity = 0; |
|
207
|
0
|
|
|
|
|
0
|
for my $detail ($self->transliterate_details(@ordered)) { |
|
208
|
0
|
0
|
0
|
|
|
0
|
if (defined $detail->{'src'} && !$entity) { |
|
209
|
0
|
|
|
|
|
0
|
$result .= '
210
|
|
|
|
|
|
|
$detail->{'src'} . |
|
211
|
|
|
|
|
|
|
'">' . |
|
212
|
|
|
|
|
|
|
$detail->{'text'} . |
|
213
|
|
|
|
|
|
|
''; |
|
214
|
|
|
|
|
|
|
} else { |
|
215
|
0
|
|
|
|
|
0
|
$result .= $detail->{'text'}; |
|
216
|
0
|
|
|
|
|
0
|
$entity = ($detail->{'text'} =~ /&$/); |
|
217
|
|
|
|
|
|
|
} |
|
218
|
|
|
|
|
|
|
} |
|
219
|
|
|
|
|
|
|
} else { |
|
220
|
9
|
|
|
|
|
42
|
$result .= $self->transliterate(@ordered); |
|
221
|
|
|
|
|
|
|
} |
|
222
|
9
|
|
|
|
|
80
|
@content = (); |
|
223
|
9
|
|
|
|
|
209
|
$result .= $repr; |
|
224
|
|
|
|
|
|
|
} else { |
|
225
|
18
|
|
|
|
|
112
|
push @content, $repr; |
|
226
|
|
|
|
|
|
|
} |
|
227
|
1
|
|
|
|
|
8
|
}; |
|
228
|
|
|
|
|
|
|
|
|
229
|
|
|
|
|
|
|
my $p = HTML::Parser->new( api_version => 3, |
|
230
|
|
|
|
|
|
|
handlers => { |
|
231
|
|
|
|
|
|
|
text => [sub { |
|
232
|
20
|
|
|
20
|
|
36
|
my ($text) = @_; |
|
233
|
20
|
|
|
|
|
117
|
push @content, $text; |
|
234
|
|
|
|
|
|
|
}, 'text'], |
|
235
|
|
|
|
|
|
|
start => [sub { |
|
236
|
13
|
|
|
13
|
|
29
|
my ($tag, $attrs) = @_; |
|
237
|
13
|
|
|
|
|
28
|
my $repr = "<$tag"; |
|
238
|
13
|
|
|
|
|
64
|
for my $attr (sort keys %$attrs) { |
|
239
|
5
|
100
|
|
|
|
14
|
next if $attr eq '/'; |
|
240
|
4
|
|
|
|
|
12
|
my $value = $attrs->{$attr}; |
|
241
|
4
|
100
|
|
|
|
19
|
$value = $self->transliterate($value) |
|
242
|
|
|
|
|
|
|
if $text_attrs{$attr}; |
|
243
|
4
|
|
|
|
|
36
|
$repr .= " $attr=\"$value\""; |
|
244
|
|
|
|
|
|
|
} |
|
245
|
13
|
100
|
|
|
|
43
|
$repr .= '/' if $attrs->{'/'}; |
|
246
|
13
|
|
|
|
|
22
|
$repr .= '>'; |
|
247
|
|
|
|
|
|
|
|
|
248
|
13
|
0
|
33
|
|
|
55
|
if ($tag eq 'meta' && |
|
|
|
|
33
|
|
|
|
|
|
249
|
|
|
|
|
|
|
lc($attrs->{'name'}) eq 'generator' && |
|
250
|
|
|
|
|
|
|
lc($attrs->{'content'}) eq lc($generator_name)) { |
|
251
|
|
|
|
|
|
|
|
|
252
|
0
|
|
|
|
|
0
|
$generator_seen = 1; |
|
253
|
|
|
|
|
|
|
} |
|
254
|
|
|
|
|
|
|
|
|
255
|
13
|
|
|
|
|
29
|
$output->($repr, $tag); |
|
256
|
|
|
|
|
|
|
}, 'tagname, attr'], |
|
257
|
|
|
|
|
|
|
end => [sub { |
|
258
|
12
|
|
|
12
|
|
25
|
my ($tag) = @_; |
|
259
|
12
|
|
|
|
|
34
|
my $repr .= "$tag>"; |
|
260
|
|
|
|
|
|
|
|
|
261
|
12
|
100
|
66
|
|
|
43
|
if ($tag eq 'head' && !$generator_seen) { |
|
262
|
1
|
|
|
|
|
5
|
$output->("", |
|
263
|
|
|
|
|
|
|
$tag); |
|
264
|
|
|
|
|
|
|
} |
|
265
|
|
|
|
|
|
|
|
|
266
|
12
|
|
|
|
|
28
|
$output->($repr, $tag); |
|
267
|
|
|
|
|
|
|
}, 'tagname'], |
|
268
|
|
|
|
|
|
|
comment => [sub { |
|
269
|
1
|
|
|
1
|
|
3
|
my ($text) = @_; |
|
270
|
1
|
|
|
|
|
5
|
push @content, $text; |
|
271
|
1
|
|
|
|
|
40
|
}, 'text'], |
|
272
|
|
|
|
|
|
|
}, |
|
273
|
|
|
|
|
|
|
marked_sections => 1, |
|
274
|
|
|
|
|
|
|
); |
|
275
|
|
|
|
|
|
|
|
|
276
|
1
|
|
|
|
|
112
|
$p->parse($html); |
|
277
|
1
|
|
|
|
|
12
|
$p->eof(); |
|
278
|
1
|
|
|
|
|
3
|
$output->(''); |
|
279
|
|
|
|
|
|
|
|
|
280
|
1
|
|
|
|
|
1216
|
return $result; |
|
281
|
|
|
|
|
|
|
} |
|
282
|
|
|
|
|
|
|
|
|
283
|
|
|
|
|
|
|
# Destructor: call finish() on the prepared statement, if one is stored.
sub DESTROY {
    my ($self) = @_;

    my $sth = $self->{sth};
    if (defined $sth) {
        $sth->finish();
    }
}
|
288
|
|
|
|
|
|
|
|
|
289
|
|
|
|
|
|
|
1; |
|
290
|
|
|
|
|
|
|
=head1 NAME |
|
291
|
|
|
|
|
|
|
|
|
292
|
|
|
|
|
|
|
Lingua::EN::Alphabet::Shaw - transliterate English text from the Latin alphabet to the Shavian alphabet |
|
293
|
|
|
|
|
|
|
|
|
294
|
|
|
|
|
|
|
=head1 AUTHOR |
|
295
|
|
|
|
|
|
|
|
|
296
|
|
|
|
|
|
|
Thomas Thurman |
|
297
|
|
|
|
|
|
|
|
|
298
|
|
|
|
|
|
|
=head1 SYNOPSIS |
|
299
|
|
|
|
|
|
|
|
|
300
|
|
|
|
|
|
|
use Lingua::EN::Alphabet::Shaw; |
|
301
|
|
|
|
|
|
|
|
|
302
|
|
|
|
|
|
|
my $shaw = Lingua::EN::Alphabet::Shaw->new(); |
|
303
|
|
|
|
|
|
|
print $shaw->transliterate('I live near a live wire.'); |
|
304
|
|
|
|
|
|
|
|
|
305
|
|
|
|
|
|
|
=head1 DESCRIPTION |
|
306
|
|
|
|
|
|
|
|
|
307
|
|
|
|
|
|
|
The Shaw or Shavian alphabet was commissioned by the will of the playwright |
|
308
|
|
|
|
|
|
|
George Bernard Shaw in the early 1960s as a replacement for the Latin |
|
309
|
|
|
|
|
|
|
alphabet for representing English. It is designed to have a one-to-one |
|
310
|
|
|
|
|
|
|
phonemic (not phonetic) mapping with the sounds of English. |
|
311
|
|
|
|
|
|
|
|
|
312
|
|
|
|
|
|
|
Its ISO 15924 code is "Shaw" 281. |
|
313
|
|
|
|
|
|
|
|
|
314
|
|
|
|
|
|
|
This module transliterates English text from the Latin alphabet into the |
|
315
|
|
|
|
|
|
|
Shavian alphabet. |
|
316
|
|
|
|
|
|
|
|
|
317
|
|
|
|
|
|
|
The API has changed since version 0.03 to be object-based. |
|
318
|
|
|
|
|
|
|
|
|
319
|
|
|
|
|
|
|
If you find an error in the translation database, you can change it |
|
320
|
|
|
|
|
|
|
yourself at http://shavian.org.uk/wiki/ . You may download a current |
|
321
|
|
|
|
|
|
|
copy of the dataset at http://shavian.org.uk/set/ . |
|
322
|
|
|
|
|
|
|
If you want to override the database shipped with this module, |
|
323
|
|
|
|
|
|
|
place the new copy at ~/.cache/shavian/shavian-set.sqlite and it will |
|
324
|
|
|
|
|
|
|
be used in preference. |
|
325
|
|
|
|
|
|
|
|
|
326
|
|
|
|
|
|
|
=head1 METHODS |
|
327
|
|
|
|
|
|
|
|
|
328
|
|
|
|
|
|
|
=head2 Lingua::EN::Alphabet::Shaw->new() |
|
329
|
|
|
|
|
|
|
|
|
330
|
|
|
|
|
|
|
Constructor. Currently takes no arguments. |
|
331
|
|
|
|
|
|
|
|
|
332
|
|
|
|
|
|
|
=head2 $shaw->transliterate($phrase) |
|
333
|
|
|
|
|
|
|
|
|
334
|
|
|
|
|
|
|
Returns the transliteration of the given phrase into the Shavian alphabet. |
|
335
|
|
|
|
|
|
|
Can handle multi-word phrases. Does a reasonable job resolving homonym |
|
336
|
|
|
|
|
|
|
ambiguity ("does he like does?"). |
|
337
|
|
|
|
|
|
|
|
|
338
|
|
|
|
|
|
|
If you pass multiple arguments, the results will be concatenated, and only the |
|
339
|
|
|
|
|
|
|
odd-numbered arguments will be transliterated. The state of homonym |
|
340
|
|
|
|
|
|
|
resolution is maintained. This allows you to embed chunks of text |
|
341
|
|
|
|
|
|
|
which should not be transliterated into the line, such as XML tags. |
|
342
|
|
|
|
|
|
|
|
|
343
|
|
|
|
|
|
|
=head2 $shaw->unknown_handler([$handler]) |
|
344
|
|
|
|
|
|
|
|
|
345
|
|
|
|
|
|
|
If a word is not found in the translation database, the transliteration |
|
346
|
|
|
|
|
|
|
routines will call a particular handler to find out what to do, with the |
|
347
|
|
|
|
|
|
|
unknown word as both its first and second arguments. (This is to allow |
|
348
|
|
|
|
|
|
|
later expansion; see BUGS AND ISSUES, below.) |
|
349
|
|
|
|
|
|
|
The result of the handler should be |
|
350
|
|
|
|
|
|
|
a string, which will be inserted into the result of the transliteration |
|
351
|
|
|
|
|
|
|
routine at the correct place. |
|
352
|
|
|
|
|
|
|
|
|
353
|
|
|
|
|
|
|
This method allows you to set a new handler by passing it as an argument. |
|
354
|
|
|
|
|
|
|
If you pass no argument, this method returns the current handler. |
|
355
|
|
|
|
|
|
|
|
|
356
|
|
|
|
|
|
|
The default handler only returns its argument. A replacement handler could, |
|
357
|
|
|
|
|
|
|
for example, make an attempt at guessing the transliteration; it could die, |
|
358
|
|
|
|
|
|
|
to abort the transliteration process; it could return its argument but |
|
359
|
|
|
|
|
|
|
also store the new value in a table so that a list of missing words could |
|
360
|
|
|
|
|
|
|
later be reported to the user. |
|
361
|
|
|
|
|
|
|
|
|
362
|
|
|
|
|
|
|
=head2 $shaw->mapping($phrase) |
|
363
|
|
|
|
|
|
|
|
|
364
|
|
|
|
|
|
|
There is a quasi-standard mapping of the conventional alphabet onto the Shavian |
|
365
|
|
|
|
|
|
|
alphabet. This method maps Shavian text into the conventional alphabet |
|
366
|
|
|
|
|
|
|
and vice versa. It does not transliterate. |
|
367
|
|
|
|
|
|
|
Think of this as a kind of ASCII-armouring. |
|
368
|
|
|
|
|
|
|
|
|
369
|
|
|
|
|
|
|
Various versions of the standard map the naming dot to "G", "B", and "/". |
|
370
|
|
|
|
|
|
|
This method does not support "/", but maps both "G" and "B" to the naming |
|
371
|
|
|
|
|
|
|
dot; in reverse, it maps the naming dot to "G". |
|
372
|
|
|
|
|
|
|
|
|
373
|
|
|
|
|
|
|
The letters "K" and "L" have no mapping to Shavian letters, and are |
|
374
|
|
|
|
|
|
|
left alone. |
|
375
|
|
|
|
|
|
|
|
|
376
|
|
|
|
|
|
|
=head2 $shaw->normalise($shavian_text) |
|
377
|
|
|
|
|
|
|
|
|
378
|
|
|
|
|
|
|
Certain letters in the Shavian alphabet are ligatures of pairs of |
|
379
|
|
|
|
|
|
|
other letters: because of this, these pairs should not exist separately. |
|
380
|
|
|
|
|
|
|
(For example, the letter YEW is a ligature of YEA and OOZE.) This method |
|
381
|
|
|
|
|
|
|
replaces these pairs with their ligature equivalents. |
|
382
|
|
|
|
|
|
|
|
|
383
|
|
|
|
|
|
|
=head2 $shaw->transliterate_html($html) |
|
384
|
|
|
|
|
|
|
|
|
385
|
|
|
|
|
|
|
Given a block of text in the conventional alphabet which is formatted |
|
386
|
|
|
|
|
|
|
as HTML, this will make a reasonable attempt at returning the same text |
|
387
|
|
|
|
|
|
|
transliterated into the Shavian alphabet. It is aware of which tags |
|
388
|
|
|
|
|
|
|
commonly break the flow of sentences, and handles homonym resolution |
|
389
|
|
|
|
|
|
|
accordingly. |
|
390
|
|
|
|
|
|
|
|
|
391
|
|
|
|
|
|
|
=head1 BUGS AND ISSUES |
|
392
|
|
|
|
|
|
|
|
|
393
|
|
|
|
|
|
|
There should be a version of the main transliteration method which |
|
394
|
|
|
|
|
|
|
returned a list of hashes, each of which gave the source and |
|
395
|
|
|
|
|
|
|
destination forms of a word, part of speech and disambiguation |
|
396
|
|
|
|
|
|
|
information, and a marking of the source (CMUDict or |
|
397
|
|
|
|
|
|
|
Shavian Wiki). |
|
398
|
|
|
|
|
|
|
|
|
399
|
|
|
|
|
|
|
It should probably be possible to transliterate in reverse, |
|
400
|
|
|
|
|
|
|
from Shavian to the conventional alphabet. |
|
401
|
|
|
|
|
|
|
|
|
402
|
|
|
|
|
|
|
It should be possible to handle other alternative scripts, such as |
|
403
|
|
|
|
|
|
|
Deseret and Tengwar. This shouldn't be very difficult. |
|
404
|
|
|
|
|
|
|
It would also allow representation in the IPA, which would mean |
|
405
|
|
|
|
|
|
|
this module could be used for simple text-to-speech processing. |
|
406
|
|
|
|
|
|
|
|
|
407
|
|
|
|
|
|
|
The portion of the database which is taken from CMUdict exhibits |
|
408
|
|
|
|
|
|
|
unhelpful mergers (notably father/bother). There isn't much that |
|
409
|
|
|
|
|
|
|
can be done about this except extending the Shavian wiki further. |
|
410
|
|
|
|
|
|
|
In addition, in some cases it does not use the letters ARRAY and |
|
411
|
|
|
|
|
|
|
ADO in unstressed syllables as they should be; this could and should be |
|
412
|
|
|
|
|
|
|
fixed. |
|
413
|
|
|
|
|
|
|
|
|
414
|
|
|
|
|
|
|
It would be useful on initialisation to read a text file |
|
415
|
|
|
|
|
|
|
in a standard location, which gave a local mapping overriding the |
|
416
|
|
|
|
|
|
|
database for given words. |
|
417
|
|
|
|
|
|
|
|
|
418
|
|
|
|
|
|
|
It would be helpful if there was a callback for any words found |
|
419
|
|
|
|
|
|
|
from the CMUDict data rather than from the Shavian Wiki data, so that |
|
420
|
|
|
|
|
|
|
the wiki could be updated. |
|
421
|
|
|
|
|
|
|
|
|
422
|
|
|
|
|
|
|
The HTML transliterator should mark its output as being |
|
423
|
|
|
|
|
|
|
encoded in UTF-8, whatever the source encoding. (Shavian cannot |
|
424
|
|
|
|
|
|
|
be represented in any other standard encoding.) |
|
425
|
|
|
|
|
|
|
|
|
426
|
|
|
|
|
|
|
The HTML transliterator should have an option which put a span |
|
427
|
|
|
|
|
|
|
around each word whose title was the word's spelling in the |
|
428
|
|
|
|
|
|
|
conventional alphabet, in the manner of translate.google.com. |
|
429
|
|
|
|
|
|
|
|
|
430
|
|
|
|
|
|
|
The HTML transliterator should have an option to rewrite the |
|
431
|
|
|
|
|
|
|
destinations of links, and to add a target to them, so that |
|
432
|
|
|
|
|
|
|
it can be used by a web script to link back to itself. |
|
433
|
|
|
|
|
|
|
|
|
434
|
|
|
|
|
|
|
The HTML transliterator should add a "generator" META tag |
|
435
|
|
|
|
|
|
|
referencing itself, if one is not already present. |
|
436
|
|
|
|
|
|
|
|
|
437
|
|
|
|
|
|
|
The HTML transliterator should ignore sections marked as |
|
438
|
|
|
|
|
|
|
being written in non-English languages. |
|
439
|
|
|
|
|
|
|
|
|
440
|
|
|
|
|
|
|
The HTML transliterator should have an option to |
|
441
|
|
|
|
|
|
|
allow loading documents in chunks, as C<HTML::Parser> already does. |
|
442
|
|
|
|
|
|
|
|
|
443
|
|
|
|
|
|
|
The mapping() method should have an extra parameter to |
|
444
|
|
|
|
|
|
|
cause it to map in one direction only. |
|
445
|
|
|
|
|
|
|
|
|
446
|
|
|
|
|
|
|
Most of these will be implemented before this module reaches |
|
447
|
|
|
|
|
|
|
version 1.00. |
|
448
|
|
|
|
|
|
|
|
|
449
|
|
|
|
|
|
|
=head1 FONTS |
|
450
|
|
|
|
|
|
|
|
|
451
|
|
|
|
|
|
|
You will need a Shavian Unicode font to use this module. |
|
452
|
|
|
|
|
|
|
There are several such fonts at http://marnanel.org/shavian/fonts/ . |
|
453
|
|
|
|
|
|
|
Please be sure to get a Unicode font and not one with the "Latin mapping". |
|
454
|
|
|
|
|
|
|
|
|
455
|
|
|
|
|
|
|
However, the Mac can handle the Shavian alphabet out of the box. |
|
456
|
|
|
|
|
|
|
|
|
457
|
|
|
|
|
|
|
=head1 COPYRIGHT |
|
458
|
|
|
|
|
|
|
|
|
459
|
|
|
|
|
|
|
This Perl module is copyright (C) Thomas Thurman, 2009-2010. |
|
460
|
|
|
|
|
|
|
This is free software, and can be used/modified under the same terms as |
|
461
|
|
|
|
|
|
|
Perl itself. |
|
462
|
|
|
|
|
|
|
|
|
463
|
|
|
|
|
|
|
The transliteration data is available under various free licences, |
|
464
|
|
|
|
|
|
|
which are reproduced below. |
|
465
|
|
|
|
|
|
|
|
|
466
|
|
|
|
|
|
|
=head1 LICENCES |
|
467
|
|
|
|
|
|
|
|
|
468
|
|
|
|
|
|
|
=head2 Androcles and the Lion |
|
469
|
|
|
|
|
|
|
|
|
470
|
|
|
|
|
|
|
Part of the transliteration data was taken from the 1962 Shavian alphabet |
|
471
|
|
|
|
|
|
|
edition of "Androcles and the Lion"; this data is in the public domain. |
|
472
|
|
|
|
|
|
|
|
|
473
|
|
|
|
|
|
|
=head2 Shavian Wiki |
|
474
|
|
|
|
|
|
|
|
|
475
|
|
|
|
|
|
|
Part of the transliteration data was taken from the Shavian Wiki, and |
|
476
|
|
|
|
|
|
|
this is available under the Creative Commons cc-by-sa licence. |
|
477
|
|
|
|
|
|
|
|
|
478
|
|
|
|
|
|
|
=head2 CMUdict |
|
479
|
|
|
|
|
|
|
|
|
480
|
|
|
|
|
|
|
Another part of the transliteration data was taken from CMUdict. Its |
|
481
|
|
|
|
|
|
|
licence is reproduced below. |
|
482
|
|
|
|
|
|
|
|
|
483
|
|
|
|
|
|
|
Copyright (C) 1993-2008 Carnegie Mellon University. All rights reserved. |
|
484
|
|
|
|
|
|
|
|
|
485
|
|
|
|
|
|
|
Redistribution and use in source and binary forms, with or without |
|
486
|
|
|
|
|
|
|
modification, are permitted provided that the following conditions |
|
487
|
|
|
|
|
|
|
are met: |
|
488
|
|
|
|
|
|
|
|
|
489
|
|
|
|
|
|
|
1. Redistributions of source code must retain the above copyright |
|
490
|
|
|
|
|
|
|
notice, this list of conditions and the following disclaimer. |
|
491
|
|
|
|
|
|
|
The contents of this file are deemed to be source code. |
|
492
|
|
|
|
|
|
|
|
|
493
|
|
|
|
|
|
|
2. Redistributions in binary form must reproduce the above copyright |
|
494
|
|
|
|
|
|
|
notice, this list of conditions and the following disclaimer in |
|
495
|
|
|
|
|
|
|
the documentation and/or other materials provided with the |
|
496
|
|
|
|
|
|
|
distribution. |
|
497
|
|
|
|
|
|
|
|
|
498
|
|
|
|
|
|
|
This work was supported in part by funding from the Defense Advanced |
|
499
|
|
|
|
|
|
|
Research Projects Agency, the Office of Naval Research and the National |
|
500
|
|
|
|
|
|
|
Science Foundation of the United States of America, and by member |
|
501
|
|
|
|
|
|
|
companies of the Carnegie Mellon Sphinx Speech Consortium. We acknowledge |
|
502
|
|
|
|
|
|
|
the contributions of many volunteers to the expansion and improvement of |
|
503
|
|
|
|
|
|
|
this dictionary. |
|
504
|
|
|
|
|
|
|
|
|
505
|
|
|
|
|
|
|
THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND |
|
506
|
|
|
|
|
|
|
ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, |
|
507
|
|
|
|
|
|
|
THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
|
508
|
|
|
|
|
|
|
PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY |
|
509
|
|
|
|
|
|
|
NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
|
510
|
|
|
|
|
|
|
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
|
511
|
|
|
|
|
|
|
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, |
|
512
|
|
|
|
|
|
|
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY |
|
513
|
|
|
|
|
|
|
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
|
514
|
|
|
|
|
|
|
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
|
515
|
|
|
|
|
|
|
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
|
516
|
|
|
|
|
|
|
|
|
517
|
|
|
|
|
|
|
=head2 Brown tagger |
|
518
|
|
|
|
|
|
|
|
|
519
|
|
|
|
|
|
|
The part-of-speech data was taken from the Brown tagger (although the |
|
520
|
|
|
|
|
|
|
tagger built into this module is not the Brown tagger, so its first |
|
521
|
|
|
|
|
|
|
sentence is inaccurate). Its licence is also reproduced below: |
|
522
|
|
|
|
|
|
|
|
|
523
|
|
|
|
|
|
|
This software was written by Eric Brill. |
|
524
|
|
|
|
|
|
|
|
|
525
|
|
|
|
|
|
|
This software is being provided to you, the LICENSEE, by the |
|
526
|
|
|
|
|
|
|
Massachusetts Institute of Technology (M.I.T.) under the following |
|
527
|
|
|
|
|
|
|
license. By obtaining, using and/or copying this software, you agree |
|
528
|
|
|
|
|
|
|
that you have read, understood, and will comply with these terms and |
|
529
|
|
|
|
|
|
|
conditions: |
|
530
|
|
|
|
|
|
|
|
|
531
|
|
|
|
|
|
|
Permission to [use, copy, modify and distribute, including the right to |
|
532
|
|
|
|
|
|
|
grant others rights to distribute at any tier, this software and its |
|
533
|
|
|
|
|
|
|
documentation for any purpose and without fee or royalty] is hereby |
|
534
|
|
|
|
|
|
|
granted, provided that you agree to comply with the following copyright |
|
535
|
|
|
|
|
|
|
notice and statements, including the disclaimer, and that the same |
|
536
|
|
|
|
|
|
|
appear on ALL copies of the software and documentation, including |
|
537
|
|
|
|
|
|
|
modifications that you make for internal use or for distribution: |
|
538
|
|
|
|
|
|
|
|
|
539
|
|
|
|
|
|
|
Copyright 1993 by the Massachusetts Institute of Technology and the |
|
540
|
|
|
|
|
|
|
University of Pennsylvania. All rights reserved. |
|
541
|
|
|
|
|
|
|
|
|
542
|
|
|
|
|
|
|
THIS SOFTWARE IS PROVIDED "AS IS", AND M.I.T. MAKES NO REPRESENTATIONS |
|
543
|
|
|
|
|
|
|
OR WARRANTIES, EXPRESS OR IMPLIED. By way of example, but not |
|
544
|
|
|
|
|
|
|
limitation, M.I.T. MAKES NO REPRESENTATIONS OR WARRANTIES OF |
|
545
|
|
|
|
|
|
|
MERCHANTABILITY OR FITNESS FOR ANY PARTICULAR PURPOSE OR THAT THE USE OF |
|
546
|
|
|
|
|
|
|
THE LICENSED SOFTWARE OR DOCUMENTATION WILL NOT INFRINGE ANY THIRD PARTY |
|
547
|
|
|
|
|
|
|
PATENTS, COPYRIGHTS, TRADEMARKS OR OTHER RIGHTS. |
|
548
|
|
|
|
|
|
|
|
|
549
|
|
|
|
|
|
|
The name of the Massachusetts Institute of Technology or M.I.T. may NOT |
|
550
|
|
|
|
|
|
|
be used in advertising or publicity pertaining to distribution of the |
|
551
|
|
|
|
|
|
|
software. Title to copyright in this software and any associated |
|
552
|
|
|
|
|
|
|
documentation shall at all times remain with M.I.T., and USER agrees to |
|
553
|
|
|
|
|
|
|
preserve same. |