line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package MARC::Detrans::Rules; |
2
|
|
|
|
|
|
|
|
3
|
9
|
|
|
9
|
|
1978
|
use strict; |
|
9
|
|
|
|
|
20
|
|
|
9
|
|
|
|
|
370
|
|
4
|
9
|
|
|
9
|
|
55
|
use warnings; |
|
9
|
|
|
|
|
20
|
|
|
9
|
|
|
|
|
9890
|
|
5
|
|
|
|
|
|
|
|
6
|
|
|
|
|
|
|
=head1 NAME |
7
|
|
|
|
|
|
|
|
8
|
|
|
|
|
|
|
MARC::Detrans::Rules - A set of detransliteration rules |
9
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
=head1 SYNOPSIS |
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
use MARC::Detrans::Rules; |
13
|
|
|
|
|
|
|
my $rules = MARC::Detrans::Rules->new(); |
14
|
|
|
|
|
|
|
$rules->addRule( MARC::Detrans::Rule->new( from=>'a', to='b' ) ); |
15
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
=head1 DESCRIPTION |
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
MARC::Detrans::Rules provides the core functionality for managing |
19
|
|
|
|
|
|
|
detransliteration rules and for converting transliterated text to |
20
|
|
|
|
|
|
|
MARC-8. A MARC::Detrans::Rules object is essentially a collection of |
21
|
|
|
|
|
|
|
MARC::Detrans::Rule objects which are consulted during a call to convert(). |
22
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
=head1 METHODS |
24
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
=cut |
26
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
=head2 new() |
28
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
Create an empty rules object to add individual rules to. |
30
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
=cut |
32
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
sub new { |
34
|
9
|
|
|
9
|
1
|
1240
|
my $class = shift; |
35
|
9
|
|
|
|
|
58
|
my $self = { rules => {}, error => undef }; |
36
|
9
|
|
33
|
|
|
102
|
return bless $self, ref( $class ) || $class; |
37
|
|
|
|
|
|
|
} |
38
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
=head2 addRule() |
40
|
|
|
|
|
|
|
|
41
|
|
|
|
|
|
|
Add a MARC::Detrans::Rule to the rules object. |
42
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
=cut |
44
|
|
|
|
|
|
|
|
45
|
|
|
|
|
|
|
sub addRule { |
46
|
465
|
|
|
465
|
1
|
715
|
my ( $self, $rule ) = @_; |
47
|
|
|
|
|
|
|
## get first character off the source for lookup |
48
|
|
|
|
|
|
|
## since we'll be processing a character at a time |
49
|
465
|
|
|
|
|
1465
|
my $key = substr( $rule->from(), 0, 1 ); |
50
|
|
|
|
|
|
|
## look for existing rules with this key |
51
|
465
|
100
|
|
|
|
2562
|
my $rules = exists($self->{rules}{$key}) ? $self->{rules}{$key} : []; |
52
|
|
|
|
|
|
|
## and the new rule and sort the rules so that the longest come first. |
53
|
|
|
|
|
|
|
## this will mean that when we go to use the rules in convert() |
54
|
|
|
|
|
|
|
## that the longest match will occur first. |
55
|
465
|
|
|
|
|
1035
|
push( @$rules, $rule ); |
56
|
465
|
|
|
|
|
2886
|
@$rules = sort byRule @$rules; |
57
|
|
|
|
|
|
|
## stash away the new rules |
58
|
465
|
|
|
|
|
2123
|
$self->{rules}{$key} = $rules; |
59
|
|
|
|
|
|
|
} |
60
|
|
|
|
|
|
|
|
61
|
|
|
|
|
|
|
sub byRule { |
62
|
|
|
|
|
|
|
return |
63
|
309
|
|
|
309
|
0
|
1000
|
length( $b->from() . $b->position() ) |
64
|
|
|
|
|
|
|
<=> |
65
|
|
|
|
|
|
|
length( $a->from() . $a->position() ) |
66
|
|
|
|
|
|
|
} |
67
|
|
|
|
|
|
|
|
68
|
|
|
|
|
|
|
=head2 convert() |
69
|
|
|
|
|
|
|
|
70
|
|
|
|
|
|
|
convert() applies the rules contained in the MARC::Detrans::Rules object |
71
|
|
|
|
|
|
|
to convert a string that is passed in. |
72
|
|
|
|
|
|
|
|
73
|
|
|
|
|
|
|
=cut |
74
|
|
|
|
|
|
|
|
75
|
|
|
|
|
|
|
sub convert { |
76
|
82
|
|
|
82
|
1
|
1463
|
my ( $self, $in ) = @_; |
77
|
|
|
|
|
|
|
## ok, this is probably the most complicated bit of the distro |
78
|
|
|
|
|
|
|
## and it's not really that bad. |
79
|
82
|
|
|
|
|
243
|
my $inLength = length( $in ); |
80
|
82
|
|
|
|
|
107
|
my $out = ''; |
81
|
82
|
|
|
|
|
104
|
my $pos = 0; |
82
|
82
|
|
|
|
|
112
|
my $currentEscape = ''; |
83
|
|
|
|
|
|
|
## we're going to step through the source string and build up $out |
84
|
|
|
|
|
|
|
## to contain the de-transliterated text |
85
|
82
|
|
|
|
|
834
|
while ( $pos < $inLength ) { |
86
|
|
|
|
|
|
|
## extract the character at the current position |
87
|
|
|
|
|
|
|
## and look to see if we have a rule for it |
88
|
1117
|
|
|
|
|
2740
|
my $key = substr( $in, $pos, 1 ); |
89
|
1117
|
100
|
|
|
|
6144
|
my $rules = exists $self->{rules}{$key} ? $self->{rules}{$key} : []; |
90
|
1117
|
|
|
|
|
3371
|
pos($in) = $pos; |
91
|
1117
|
|
|
|
|
1424
|
my $foundRule; |
92
|
|
|
|
|
|
|
## go through each of the rules and see if we've got a match |
93
|
1117
|
|
|
|
|
1851
|
foreach my $rule ( @$rules ) { |
94
|
1513
|
|
|
|
|
3952
|
my $from = $rule->from(); |
95
|
|
|
|
|
|
|
## if the rule matches remember it for later and jump out of |
96
|
|
|
|
|
|
|
## the loop since we've got what we needed |
97
|
|
|
|
|
|
|
## \G anchors the match at our current position |
98
|
|
|
|
|
|
|
## \Q...\E makes sure that metacharacters in our pattern are escaped |
99
|
1513
|
100
|
|
|
|
26696
|
if ( $in =~ m/\G\Q$from\E/ ) { |
100
|
1128
|
|
100
|
|
|
6786
|
my $position = $rule->position() || ''; |
101
|
1128
|
100
|
|
|
|
7513
|
if ( $position eq 'initial' ) { |
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
102
|
22
|
100
|
|
|
|
61
|
next unless isInitial( $in, $pos ); |
103
|
|
|
|
|
|
|
} |
104
|
|
|
|
|
|
|
elsif ( $position eq 'medial' ) { |
105
|
8
|
100
|
66
|
|
|
13
|
next if isInitial( $in, $pos ) or isFinal( $in, $pos ); |
106
|
|
|
|
|
|
|
} |
107
|
|
|
|
|
|
|
elsif ( $position eq 'final' ) { |
108
|
4
|
50
|
|
|
|
8
|
next unless isFinal( $in, $pos ); |
109
|
|
|
|
|
|
|
} |
110
|
1115
|
|
|
|
|
2296
|
$foundRule = $rule; |
111
|
1115
|
|
|
|
|
3051
|
last; |
112
|
|
|
|
|
|
|
} |
113
|
|
|
|
|
|
|
} |
114
|
|
|
|
|
|
|
## no matched rule, then we've got a character in the source |
115
|
|
|
|
|
|
|
## data which doesn't map. Store the error and return asap. |
116
|
1117
|
100
|
|
|
|
3280
|
if ( ! defined($foundRule) ) { |
117
|
2
|
|
|
|
|
14
|
$self->{error} = sprintf( |
118
|
|
|
|
|
|
|
qq(no matching rule found for "%s" [0x%x] at position %i), |
119
|
|
|
|
|
|
|
$key, ord($key), $pos+1 ); |
120
|
2
|
|
|
|
|
14
|
return; |
121
|
|
|
|
|
|
|
} |
122
|
|
|
|
|
|
|
## advance the position the amount of characters that we matched |
123
|
1115
|
|
|
|
|
3391
|
$pos += length( $foundRule->from() ); |
124
|
|
|
|
|
|
|
## if the rule has an associated MARC-8 escape character tag it |
125
|
|
|
|
|
|
|
## onto the output text |
126
|
1115
|
100
|
100
|
|
|
4326
|
if ($foundRule->escape() and $foundRule->escape() ne $currentEscape) { |
127
|
77
|
|
|
|
|
239
|
$out .= chr(0x1B).$foundRule->escape(); |
128
|
77
|
|
|
|
|
392
|
$currentEscape = $foundRule->escape(); |
129
|
|
|
|
|
|
|
} |
130
|
|
|
|
|
|
|
## append the new text |
131
|
1115
|
|
|
|
|
3464
|
$out .= $foundRule->to(); |
132
|
|
|
|
|
|
|
} |
133
|
|
|
|
|
|
|
## escape back to ASCII if approriate |
134
|
80
|
100
|
|
|
|
261
|
if ( $currentEscape ) { $out .= chr(0x1B).chr(0x28).chr(0x42); } |
|
73
|
|
|
|
|
703
|
|
135
|
|
|
|
|
|
|
## make sure error flag is undef since we're ok now |
136
|
80
|
|
|
|
|
153
|
$self->{error} = undef; |
137
|
|
|
|
|
|
|
## return the new text! |
138
|
80
|
|
|
|
|
534
|
return( $out ); |
139
|
|
|
|
|
|
|
} |
140
|
|
|
|
|
|
|
|
141
|
|
|
|
|
|
|
=head2 error() |
142
|
|
|
|
|
|
|
|
143
|
|
|
|
|
|
|
Will return the latest error encountered during a call to convert(). Can |
144
|
|
|
|
|
|
|
be useful for determining why a call to convert() failed. A side effect |
145
|
|
|
|
|
|
|
of calling error() is that the error slot is reset. |
146
|
|
|
|
|
|
|
|
147
|
|
|
|
|
|
|
=cut |
148
|
|
|
|
|
|
|
|
149
|
|
|
|
|
|
|
sub error { |
150
|
3
|
|
|
3
|
1
|
8
|
my $self = shift; |
151
|
3
|
|
|
|
|
6
|
my $error = $self->{error}; |
152
|
3
|
|
|
|
|
5
|
$self->{error} = undef; |
153
|
3
|
|
|
|
|
16
|
return( $error ); |
154
|
|
|
|
|
|
|
} |
155
|
|
|
|
|
|
|
|
156
|
|
|
|
|
|
|
=head1 AUTHORS |
157
|
|
|
|
|
|
|
|
158
|
|
|
|
|
|
|
=over 4 |
159
|
|
|
|
|
|
|
|
160
|
|
|
|
|
|
|
=item * Ed Summers |
161
|
|
|
|
|
|
|
|
162
|
|
|
|
|
|
|
=cut |
163
|
|
|
|
|
|
|
|
164
|
|
|
|
|
|
|
## helper functions to determine whether a specific positon in a string |
165
|
|
|
|
|
|
|
## is at the start or at the end of a word. |
166
|
|
|
|
|
|
|
|
167
|
|
|
|
|
|
|
sub isInitial { |
168
|
30
|
|
|
30
|
0
|
48
|
my ($string,$position) = @_; |
169
|
30
|
100
|
|
|
|
82
|
return 1 if $position == 0; |
170
|
24
|
100
|
|
|
|
93
|
return 1 if substr($string,$position-1,1) =~ /\W/; |
171
|
17
|
|
|
|
|
61
|
return 0; |
172
|
|
|
|
|
|
|
} |
173
|
|
|
|
|
|
|
|
174
|
|
|
|
|
|
|
sub isFinal { |
175
|
12
|
|
|
12
|
0
|
16
|
my ($string,$position) = @_; |
176
|
12
|
100
|
|
|
|
39
|
return 1 if $position == length($string)-1; |
177
|
6
|
100
|
|
|
|
39
|
return 1 if substr($string,$position+1,1) =~ /\W/; |
178
|
|
|
|
|
|
|
} |
179
|
|
|
|
|
|
|
|
180
|
|
|
|
|
|
|
1; |