File Coverage

blib/lib/MARC/Detrans/Rules.pm

Criterion	Covered	Total	%
statement	59	59	100.0
branch	31	32	96.8
condition	8	11	72.7
subroutine	9	9	100.0
pod	4	7	57.1
total	111	118	94.0

line	stmt	bran	cond	sub	pod	time	code
1							package MARC::Detrans::Rules;
2
3	9			9		1978	use strict;
	9					20
	9					370
4	9			9		55	use warnings;
	9					20
	9					9890
5
6							=head1 NAME
7
8							MARC::Detrans::Rules - A set of detransliteration rules
9
10							=head1 SYNOPSIS
11
12							use MARC::Detrans::Rules;
13							my $rules = MARC::Detrans::Rules->new();
14							$rules->addRule( MARC::Detrans::Rule->new( from=>'a', to='b' ) );
15
16							=head1 DESCRIPTION
17
18							MARC::Detrans::Rules provides the core functionality for managing
19							detransliteration rules and for converting transliterated text to
20							MARC-8. A MARC::Detrans::Rules object is essentially a collection of
21							MARC::Detrans::Rule objects which are consulted during a call to convert().
22
23							=head1 METHODS
24
25							=cut
26
27							=head2 new()
28
29							Create an empty rules object to add individual rules to.
30
31							=cut
32
33							sub new {
34	9			9	1	1240	my $class = shift;
35	9					58	my $self = { rules => {}, error => undef };
36	9		33			102	return bless $self, ref( $class ) \|\| $class;
37							}
38
39							=head2 addRule()
40
41							Add a MARC::Detrans::Rule to the rules object.
42
43							=cut
44
45							sub addRule {
46	465			465	1	715	my ( $self, $rule ) = @_;
47							## get first character off the source for lookup
48							## since we'll be processing a character at a time
49	465					1465	my $key = substr( $rule->from(), 0, 1 );
50							## look for existing rules with this key
51	465	100				2562	my $rules = exists($self->{rules}{$key}) ? $self->{rules}{$key} : [];
52							## and the new rule and sort the rules so that the longest come first.
53							## this will mean that when we go to use the rules in convert()
54							## that the longest match will occur first.
55	465					1035	push( @$rules, $rule );
56	465					2886	@$rules = sort byRule @$rules;
57							## stash away the new rules
58	465					2123	$self->{rules}{$key} = $rules;
59							}
60
61							sub byRule {
62							return
63	309			309	0	1000	length( $b->from() . $b->position() )
64							<=>
65							length( $a->from() . $a->position() )
66							}
67
68							=head2 convert()
69
70							convert() applies the rules contained in the MARC::Detrans::Rules object
71							to convert a string that is passed in.
72
73							=cut
74
75							sub convert {
76	82			82	1	1463	my ( $self, $in ) = @_;
77							## ok, this is probably the most complicated bit of the distro
78							## and it's not really that bad.
79	82					243	my $inLength = length( $in );
80	82					107	my $out = '';
81	82					104	my $pos = 0;
82	82					112	my $currentEscape = '';
83							## we're going to step through the source string and build up $out
84							## to contain the de-transliterated text
85	82					834	while ( $pos < $inLength ) {
86							## extract the character at the current position
87							## and look to see if we have a rule for it
88	1117					2740	my $key = substr( $in, $pos, 1 );
89	1117	100				6144	my $rules = exists $self->{rules}{$key} ? $self->{rules}{$key} : [];
90	1117					3371	pos($in) = $pos;
91	1117					1424	my $foundRule;
92							## go through each of the rules and see if we've got a match
93	1117					1851	foreach my $rule ( @$rules ) {
94	1513					3952	my $from = $rule->from();
95							## if the rule matches remember it for later and jump out of
96							## the loop since we've got what we needed
97							## \G anchors the match at our current position
98							## \Q...\E makes sure that metacharacters in our pattern are escaped
99	1513	100				26696	if ( $in =~ m/\G\Q$from\E/ ) {
100	1128		100			6786	my $position = $rule->position() \|\| '';
101	1128	100				7513	if ( $position eq 'initial' ) {
		100
		100
102	22	100				61	next unless isInitial( $in, $pos );
103							}
104							elsif ( $position eq 'medial' ) {
105	8	100	66			13	next if isInitial( $in, $pos ) or isFinal( $in, $pos );
106							}
107							elsif ( $position eq 'final' ) {
108	4	50				8	next unless isFinal( $in, $pos );
109							}
110	1115					2296	$foundRule = $rule;
111	1115					3051	last;
112							}
113							}
114							## no matched rule, then we've got a character in the source
115							## data which doesn't map. Store the error and return asap.
116	1117	100				3280	if ( ! defined($foundRule) ) {
117	2					14	$self->{error} = sprintf(
118							qq(no matching rule found for "%s" [0x%x] at position %i),
119							$key, ord($key), $pos+1 );
120	2					14	return;
121							}
122							## advance the position the amount of characters that we matched
123	1115					3391	$pos += length( $foundRule->from() );
124							## if the rule has an associated MARC-8 escape character tag it
125							## onto the output text
126	1115	100	100			4326	if ($foundRule->escape() and $foundRule->escape() ne $currentEscape) {
127	77					239	$out .= chr(0x1B).$foundRule->escape();
128	77					392	$currentEscape = $foundRule->escape();
129							}
130							## append the new text
131	1115					3464	$out .= $foundRule->to();
132							}
133							## escape back to ASCII if approriate
134	80	100				261	if ( $currentEscape ) { $out .= chr(0x1B).chr(0x28).chr(0x42); }
	73					703
135							## make sure error flag is undef since we're ok now
136	80					153	$self->{error} = undef;
137							## return the new text!
138	80					534	return( $out );
139							}
140
141							=head2 error()
142
143							Will return the latest error encountered during a call to convert(). Can
144							be useful for determining why a call to convert() failed. A side effect
145							of calling error() is that the error slot is reset.
146
147							=cut
148
149							sub error {
150	3			3	1	8	my $self = shift;
151	3					6	my $error = $self->{error};
152	3					5	$self->{error} = undef;
153	3					16	return( $error );
154							}
155
156							=head1 AUTHORS
157
158							=over 4
159
160							=item * Ed Summers
161
162							=cut
163
164							## helper functions to determine whether a specific positon in a string
165							## is at the start or at the end of a word.
166
167							sub isInitial {
168	30			30	0	48	my ($string,$position) = @_;
169	30	100				82	return 1 if $position == 0;
170	24	100				93	return 1 if substr($string,$position-1,1) =~ /\W/;
171	17					61	return 0;
172							}
173
174							sub isFinal {
175	12			12	0	16	my ($string,$position) = @_;
176	12	100				39	return 1 if $position == length($string)-1;
177	6	100				39	return 1 if substr($string,$position+1,1) =~ /\W/;
178							}
179
180							1;