| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package MARC::Detrans::Rules; |
|
2
|
|
|
|
|
|
|
|
|
3
|
9
|
|
|
9
|
|
1978
|
use strict; |
|
|
9
|
|
|
|
|
20
|
|
|
|
9
|
|
|
|
|
370
|
|
|
4
|
9
|
|
|
9
|
|
55
|
use warnings; |
|
|
9
|
|
|
|
|
20
|
|
|
|
9
|
|
|
|
|
9890
|
|
|
5
|
|
|
|
|
|
|
|
|
6
|
|
|
|
|
|
|
=head1 NAME |
|
7
|
|
|
|
|
|
|
|
|
8
|
|
|
|
|
|
|
MARC::Detrans::Rules - A set of detransliteration rules |
|
9
|
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
=head1 SYNOPSIS |
|
11
|
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
use MARC::Detrans::Rules; |
|
13
|
|
|
|
|
|
|
my $rules = MARC::Detrans::Rules->new(); |
|
14
|
|
|
|
|
|
|
$rules->addRule( MARC::Detrans::Rule->new( from=>'a', to='b' ) ); |
|
15
|
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
=head1 DESCRIPTION |
|
17
|
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
MARC::Detrans::Rules provides the core functionality for managing |
|
19
|
|
|
|
|
|
|
detransliteration rules and for converting transliterated text to |
|
20
|
|
|
|
|
|
|
MARC-8. A MARC::Detrans::Rules object is essentially a collection of |
|
21
|
|
|
|
|
|
|
MARC::Detrans::Rule objects which are consulted during a call to convert(). |
|
22
|
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
=head1 METHODS |
|
24
|
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
=cut |
|
26
|
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
=head2 new() |
|
28
|
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
Create an empty rules object to add individual rules to. |
|
30
|
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
=cut |
|
32
|
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
sub new { |
|
34
|
9
|
|
|
9
|
1
|
1240
|
my $class = shift; |
|
35
|
9
|
|
|
|
|
58
|
my $self = { rules => {}, error => undef }; |
|
36
|
9
|
|
33
|
|
|
102
|
return bless $self, ref( $class ) || $class; |
|
37
|
|
|
|
|
|
|
} |
|
38
|
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
=head2 addRule() |
|
40
|
|
|
|
|
|
|
|
|
41
|
|
|
|
|
|
|
Add a MARC::Detrans::Rule to the rules object. |
|
42
|
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
=cut |
|
44
|
|
|
|
|
|
|
|
|
45
|
|
|
|
|
|
|
sub addRule { |
|
46
|
465
|
|
|
465
|
1
|
715
|
my ( $self, $rule ) = @_; |
|
47
|
|
|
|
|
|
|
## get first character off the source for lookup |
|
48
|
|
|
|
|
|
|
## since we'll be processing a character at a time |
|
49
|
465
|
|
|
|
|
1465
|
my $key = substr( $rule->from(), 0, 1 ); |
|
50
|
|
|
|
|
|
|
## look for existing rules with this key |
|
51
|
465
|
100
|
|
|
|
2562
|
my $rules = exists($self->{rules}{$key}) ? $self->{rules}{$key} : []; |
|
52
|
|
|
|
|
|
|
## and the new rule and sort the rules so that the longest come first. |
|
53
|
|
|
|
|
|
|
## this will mean that when we go to use the rules in convert() |
|
54
|
|
|
|
|
|
|
## that the longest match will occur first. |
|
55
|
465
|
|
|
|
|
1035
|
push( @$rules, $rule ); |
|
56
|
465
|
|
|
|
|
2886
|
@$rules = sort byRule @$rules; |
|
57
|
|
|
|
|
|
|
## stash away the new rules |
|
58
|
465
|
|
|
|
|
2123
|
$self->{rules}{$key} = $rules; |
|
59
|
|
|
|
|
|
|
} |
|
60
|
|
|
|
|
|
|
|
|
61
|
|
|
|
|
|
|
sub byRule { |
|
62
|
|
|
|
|
|
|
return |
|
63
|
309
|
|
|
309
|
0
|
1000
|
length( $b->from() . $b->position() ) |
|
64
|
|
|
|
|
|
|
<=> |
|
65
|
|
|
|
|
|
|
length( $a->from() . $a->position() ) |
|
66
|
|
|
|
|
|
|
} |
|
67
|
|
|
|
|
|
|
|
|
68
|
|
|
|
|
|
|
=head2 convert() |
|
69
|
|
|
|
|
|
|
|
|
70
|
|
|
|
|
|
|
convert() applies the rules contained in the MARC::Detrans::Rules object |
|
71
|
|
|
|
|
|
|
to convert a string that is passed in. |
|
72
|
|
|
|
|
|
|
|
|
73
|
|
|
|
|
|
|
=cut |
|
74
|
|
|
|
|
|
|
|
|
75
|
|
|
|
|
|
|
sub convert { |
|
76
|
82
|
|
|
82
|
1
|
1463
|
my ( $self, $in ) = @_; |
|
77
|
|
|
|
|
|
|
## ok, this is probably the most complicated bit of the distro |
|
78
|
|
|
|
|
|
|
## and it's not really that bad. |
|
79
|
82
|
|
|
|
|
243
|
my $inLength = length( $in ); |
|
80
|
82
|
|
|
|
|
107
|
my $out = ''; |
|
81
|
82
|
|
|
|
|
104
|
my $pos = 0; |
|
82
|
82
|
|
|
|
|
112
|
my $currentEscape = ''; |
|
83
|
|
|
|
|
|
|
## we're going to step through the source string and build up $out |
|
84
|
|
|
|
|
|
|
## to contain the de-transliterated text |
|
85
|
82
|
|
|
|
|
834
|
while ( $pos < $inLength ) { |
|
86
|
|
|
|
|
|
|
## extract the character at the current position |
|
87
|
|
|
|
|
|
|
## and look to see if we have a rule for it |
|
88
|
1117
|
|
|
|
|
2740
|
my $key = substr( $in, $pos, 1 ); |
|
89
|
1117
|
100
|
|
|
|
6144
|
my $rules = exists $self->{rules}{$key} ? $self->{rules}{$key} : []; |
|
90
|
1117
|
|
|
|
|
3371
|
pos($in) = $pos; |
|
91
|
1117
|
|
|
|
|
1424
|
my $foundRule; |
|
92
|
|
|
|
|
|
|
## go through each of the rules and see if we've got a match |
|
93
|
1117
|
|
|
|
|
1851
|
foreach my $rule ( @$rules ) { |
|
94
|
1513
|
|
|
|
|
3952
|
my $from = $rule->from(); |
|
95
|
|
|
|
|
|
|
## if the rule matches remember it for later and jump out of |
|
96
|
|
|
|
|
|
|
## the loop since we've got what we needed |
|
97
|
|
|
|
|
|
|
## \G anchors the match at our current position |
|
98
|
|
|
|
|
|
|
## \Q...\E makes sure that metacharacters in our pattern are escaped |
|
99
|
1513
|
100
|
|
|
|
26696
|
if ( $in =~ m/\G\Q$from\E/ ) { |
|
100
|
1128
|
|
100
|
|
|
6786
|
my $position = $rule->position() || ''; |
|
101
|
1128
|
100
|
|
|
|
7513
|
if ( $position eq 'initial' ) { |
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
102
|
22
|
100
|
|
|
|
61
|
next unless isInitial( $in, $pos ); |
|
103
|
|
|
|
|
|
|
} |
|
104
|
|
|
|
|
|
|
elsif ( $position eq 'medial' ) { |
|
105
|
8
|
100
|
66
|
|
|
13
|
next if isInitial( $in, $pos ) or isFinal( $in, $pos ); |
|
106
|
|
|
|
|
|
|
} |
|
107
|
|
|
|
|
|
|
elsif ( $position eq 'final' ) { |
|
108
|
4
|
50
|
|
|
|
8
|
next unless isFinal( $in, $pos ); |
|
109
|
|
|
|
|
|
|
} |
|
110
|
1115
|
|
|
|
|
2296
|
$foundRule = $rule; |
|
111
|
1115
|
|
|
|
|
3051
|
last; |
|
112
|
|
|
|
|
|
|
} |
|
113
|
|
|
|
|
|
|
} |
|
114
|
|
|
|
|
|
|
## no matched rule, then we've got a character in the source |
|
115
|
|
|
|
|
|
|
## data which doesn't map. Store the error and return asap. |
|
116
|
1117
|
100
|
|
|
|
3280
|
if ( ! defined($foundRule) ) { |
|
117
|
2
|
|
|
|
|
14
|
$self->{error} = sprintf( |
|
118
|
|
|
|
|
|
|
qq(no matching rule found for "%s" [0x%x] at position %i), |
|
119
|
|
|
|
|
|
|
$key, ord($key), $pos+1 ); |
|
120
|
2
|
|
|
|
|
14
|
return; |
|
121
|
|
|
|
|
|
|
} |
|
122
|
|
|
|
|
|
|
## advance the position the amount of characters that we matched |
|
123
|
1115
|
|
|
|
|
3391
|
$pos += length( $foundRule->from() ); |
|
124
|
|
|
|
|
|
|
## if the rule has an associated MARC-8 escape character tag it |
|
125
|
|
|
|
|
|
|
## onto the output text |
|
126
|
1115
|
100
|
100
|
|
|
4326
|
if ($foundRule->escape() and $foundRule->escape() ne $currentEscape) { |
|
127
|
77
|
|
|
|
|
239
|
$out .= chr(0x1B).$foundRule->escape(); |
|
128
|
77
|
|
|
|
|
392
|
$currentEscape = $foundRule->escape(); |
|
129
|
|
|
|
|
|
|
} |
|
130
|
|
|
|
|
|
|
## append the new text |
|
131
|
1115
|
|
|
|
|
3464
|
$out .= $foundRule->to(); |
|
132
|
|
|
|
|
|
|
} |
|
133
|
|
|
|
|
|
|
## escape back to ASCII if approriate |
|
134
|
80
|
100
|
|
|
|
261
|
if ( $currentEscape ) { $out .= chr(0x1B).chr(0x28).chr(0x42); } |
|
|
73
|
|
|
|
|
703
|
|
|
135
|
|
|
|
|
|
|
## make sure error flag is undef since we're ok now |
|
136
|
80
|
|
|
|
|
153
|
$self->{error} = undef; |
|
137
|
|
|
|
|
|
|
## return the new text! |
|
138
|
80
|
|
|
|
|
534
|
return( $out ); |
|
139
|
|
|
|
|
|
|
} |
|
140
|
|
|
|
|
|
|
|
|
141
|
|
|
|
|
|
|
=head2 error() |
|
142
|
|
|
|
|
|
|
|
|
143
|
|
|
|
|
|
|
Will return the latest error encountered during a call to convert(). Can |
|
144
|
|
|
|
|
|
|
be useful for determining why a call to convert() failed. A side effect |
|
145
|
|
|
|
|
|
|
of calling error() is that the error slot is reset. |
|
146
|
|
|
|
|
|
|
|
|
147
|
|
|
|
|
|
|
=cut |
|
148
|
|
|
|
|
|
|
|
|
149
|
|
|
|
|
|
|
sub error { |
|
150
|
3
|
|
|
3
|
1
|
8
|
my $self = shift; |
|
151
|
3
|
|
|
|
|
6
|
my $error = $self->{error}; |
|
152
|
3
|
|
|
|
|
5
|
$self->{error} = undef; |
|
153
|
3
|
|
|
|
|
16
|
return( $error ); |
|
154
|
|
|
|
|
|
|
} |
|
155
|
|
|
|
|
|
|
|
|
156
|
|
|
|
|
|
|
=head1 AUTHORS |
|
157
|
|
|
|
|
|
|
|
|
158
|
|
|
|
|
|
|
=over 4 |
|
159
|
|
|
|
|
|
|
|
|
160
|
|
|
|
|
|
|
=item * Ed Summers |
|
161
|
|
|
|
|
|
|
|
|
162
|
|
|
|
|
|
|
=cut |
|
163
|
|
|
|
|
|
|
|
|
164
|
|
|
|
|
|
|
## helper functions to determine whether a specific positon in a string |
|
165
|
|
|
|
|
|
|
## is at the start or at the end of a word. |
|
166
|
|
|
|
|
|
|
|
|
167
|
|
|
|
|
|
|
sub isInitial { |
|
168
|
30
|
|
|
30
|
0
|
48
|
my ($string,$position) = @_; |
|
169
|
30
|
100
|
|
|
|
82
|
return 1 if $position == 0; |
|
170
|
24
|
100
|
|
|
|
93
|
return 1 if substr($string,$position-1,1) =~ /\W/; |
|
171
|
17
|
|
|
|
|
61
|
return 0; |
|
172
|
|
|
|
|
|
|
} |
|
173
|
|
|
|
|
|
|
|
|
174
|
|
|
|
|
|
|
sub isFinal { |
|
175
|
12
|
|
|
12
|
0
|
16
|
my ($string,$position) = @_; |
|
176
|
12
|
100
|
|
|
|
39
|
return 1 if $position == length($string)-1; |
|
177
|
6
|
100
|
|
|
|
39
|
return 1 if substr($string,$position+1,1) =~ /\W/; |
|
178
|
|
|
|
|
|
|
} |
|
179
|
|
|
|
|
|
|
|
|
180
|
|
|
|
|
|
|
1; |