File Coverage

blib/lib/MARC/Lint.pm

Criterion	Covered	Total	%
statement	284	305	93.1
branch	150	188	79.7
condition	43	64	67.1
subroutine	21	22	95.4
pod	9	9	100.0
total	507	588	86.2

line	stmt	bran	cond	sub	pod	time	code
1							package MARC::Lint;
2
3	5			5		425697	use strict;
	5					47
	5					148
4	5			5		26	use warnings;
	5					9
	5					130
5	5			5		23	use integer;
	5					12
	5					30
6	5			5		1692	use MARC::Record;
	5					8989
	5					216
7	5			5		30	use MARC::Field;
	5					10
	5					146
8
9	5			5		2527	use MARC::Lint::CodeData qw(%GeogAreaCodes %ObsoleteGeogAreaCodes %LanguageCodes %ObsoleteLanguageCodes);
	5					15
	5					4668
10
11							our $VERSION = 1.53;
12
13							=head1 NAME
14
15							MARC::Lint - Perl extension for checking validity of MARC records
16
17							=head1 SYNOPSIS
18
19							use MARC::File::USMARC;
20							use MARC::Lint;
21
22							my $lint = new MARC::Lint;
23							my $filename = shift;
24
25							my $file = MARC::File::USMARC->in( $filename );
26							while ( my $marc = $file->next() ) {
27							$lint->check_record( $marc );
28
29							# Print the title tag
30							print $marc->title, "\n";
31
32							# Print the errors that were found
33							print join( "\n", $lint->warnings ), "\n";
34							} # while
35
36							Given the following MARC record:
37
38							LDR 00000nam 22002538a 4500
39							040 _aMdSSJTT
40							_cMdSSJTT
41							040 _aMdSSJTT
42							_beng
43							_cMdSSJTT
44							100 14 _aWall, Larry.
45							110 1 _aO'Reilly & Associates.
46							245 90 _aProgramming Perl /
47							_aBig Book of Perl /
48							_cLarry Wall, Tom Christiansen & Jon Orwant.
49							250 _a3rd ed.
50							250 _a3rd ed.
51							260 _aCambridge, Mass. :
52							_bO'Reilly,
53							_r2000.
54							590 4 _aPersonally signed by Larry.
55							856 43 _uhttp://www.perl.com/
56
57							the following errors are generated:
58
59							1XX: Only one 1XX tag is allowed, but I found 2 of them.
60							100: Indicator 2 must be blank but it's "4"
61							245: Indicator 1 must be 0 or 1 but it's "9"
62							245: Subfield _a is not repeatable.
63							040: Field is not repeatable.
64							260: Subfield _r is not allowed.
65							856: Indicator 2 must be blank, 0, 1, 2 or 8 but it's "3"
66
67							=head1 DESCRIPTION
68
69							Module for checking validity of MARC records. 99% of the users will want to do
70							something like is shown in the synopsis. The other intrepid 1% will overload the
71							C module's methods and provide their own special field-level checking.
72
73							What this means is that if you have certain requirements, such as making sure that
74							all 952 tags have a certain call number in them, you can write a function that
75							checks for that, and still get all the benefits of the MARC::Lint framework.
76
77							=head1 EXPORT
78
79							None. Everything is done through objects.
80
81							=head1 METHODS
82
83							=head2 new()
84
85							No parms needed. The C object is little more than a list of warnings
86							and a bunch of rules.
87
88							=cut
89
90							sub new {
91	6			6	1	6580	my $class = shift;
92
93	6					24	my $self = {
94							_warnings => [],
95							};
96	6					16	bless $self, $class;
97
98	6					30	$self->_read_rules();
99
100	6					44	return $self;
101							}
102
103							=head2 warnings()
104
105							Returns a list of warnings found by C and its brethren.
106
107							=cut
108
109							sub warnings {
110	64			64	1	1114	my $self = shift;
111
112	64	50				116	return wantarray ? @{$self->{_warnings}} : scalar @{$self->{_warnings}};
	64					191
	0					0
113							}
114
115							=head2 clear_warnings()
116
117							Clear the list of warnings for this linter object. It's automatically called
118							when you call C.
119
120							=cut
121
122							sub clear_warnings {
123	64			64	1	180	my $self = shift;
124
125	64					214	$self->{_warnings} = [];
126							}
127
128							=head2 warn( $str [, $str...] )
129
130							Create a warning message, built from strings passed, like a C
131							statement.
132
133							Typically, you'll leave this to C, but industrious
134							programmers may want to do their own checking as well.
135
136							=cut
137
138							sub warn {
139	59			59	1	1681	my $self = shift;
140
141	59					76	push( @{$self->{_warnings}}, join( "", @_ ) );
	59					197
142
143	59					225	return;
144							}
145
146							=head2 check_record( $marc )
147
148							Does all sorts of lint-like checks on the MARC record I<$marc>,
149							both on the record as a whole, and on the individual fields &
150							subfields.
151
152							=cut
153
154							sub check_record {
155	13			13	1	36395	my $self = shift;
156	13					24	my $marc = shift;
157
158	13					33	$self->clear_warnings();
159
160	13	50	33			95	( (ref $marc) && $marc->isa('MARC::Record') )
161							or return $self->warn( "Must pass a MARC::Record object to check_record" );
162
163	13					42	my @_1xx = $marc->field( "1.." );
164	13					1384	my $n1xx = scalar @_1xx;
165	13	100				38	if ( $n1xx > 1 ) {
166	1					7	$self->warn( "1XX: Only one 1XX tag is allowed, but I found $n1xx of them." );
167							}
168
169	13	50				28	if ( not $marc->field( 245 ) ) {
170	0					0	$self->warn( "245: No 245 tag." );
171							}
172
173
174	13					972	my %field_seen;
175	13					33	my $rules = $self->{_rules};
176	13					30	for my $field ( $marc->fields ) {
177	208					467	my $tagno = $field->tag;
178
179	208					848	my $tagrules = '';
180							#if 880 field, inherit rules from tagno in subfield _6
181	208					252	my $is_880 = 0;
182	208	100				354	if ($tagno eq '880') {
183	3					6	$is_880 = 1;
184	3	100				7	if ($field->subfield('6')) {
185	2					48	my $sub6 = $field->subfield('6');
186	2					44	$tagno = substr($sub6, 0, 3);
187
188	2	50				7	$tagrules = $rules->{$tagno} or next;
189							#880 is repeatable, but its linked field may not be
190	2	50	66			15	if ( ($tagrules->{'repeatable'} && ( $tagrules->{'repeatable'} eq 'NR' )) && $field_seen{'880.'.$tagno} ) {
			66
191	0					0	$self->warn( "$tagno: Field is not repeatable." );
192							} #if repeatability
193							} #if subfield 6 present
194							else {
195	1					23	$self->warn( "880: No subfield 6." );
196	1					2	next;
197							} #else no subfield 6 in 880 field
198							} #if this is 880 field
199							else {
200	205	100				446	$tagrules = $rules->{$tagno} or next;
201
202	204	50	66			851	if ( ($tagrules->{'repeatable'} && ( $tagrules->{'repeatable'} eq 'NR' )) && $field_seen{$tagno} ) {
			66
203	0					0	$self->warn( "$tagno: Field is not repeatable." );
204							} #if repeatability
205							} #else not 880
206
207	206	100				440	if ( $tagno >= 10 ) {
		50
208	159					274	for my $ind ( 1..2 ) {
209	318					586	my $indvalue = $field->indicator($ind);
210	318	100				4519	if ( not ($indvalue =~ $tagrules->{"ind$ind" . "_regex"}) ) {
211							$self->warn(
212							"$tagno: Indicator $ind must be ",
213	5					43	$tagrules->{"ind$ind" . "_desc"},
214							" but it's \"$indvalue\""
215							);
216							}
217							} # for
218
219	159					222	my %sub_seen;
220	159					301	for my $subfield ( $field->subfields ) {
221	264					3006	my ($code,$data) = @$subfield;
222
223	264					393	my $rule = $tagrules->{$code};
224	264	100	66			734	if ( not defined $rule ) {
		100
225	3					31	$self->warn( "$tagno: Subfield _$code is not allowed." );
226							} elsif ( ($rule eq "NR") && $sub_seen{$code} ) {
227	1					6	$self->warn( "$tagno: Subfield _$code is not repeatable." );
228							}
229
230	264	50				556	if ( $data =~ /[\t\r\n]/ ) {
231	0					0	$self->warn( "$tagno: Subfield _$code has an invalid control character" );
232							}
233
234	264					547	++$sub_seen{$code};
235							} # for $subfields
236							} # if $tagno >= 10
237
238							elsif ($tagno < 10) {
239							#check for subfield characters
240	47	100				93	if ($field->data() =~ /\x1F/) {
241	1					36	$self->warn( "$tagno: Subfields are not allowed in fields lower than 010" );
242							} #if control field has subfield delimiter
243							} #elsif $tagno < 10
244
245							# Check to see if a check_xxx() function exists, and call it on the field if it does
246	206					812	my $checker = "check_$tagno";
247	206	100				708	if ( $self->can( $checker ) ) {
248	25					84	$self->$checker( $field );
249							}
250
251	206	100				3290	if ($is_880) {
252	2					7	++$field_seen{'880.'.$tagno};
253							} #if 880 field
254							else {
255	204					408	++$field_seen{$tagno};
256							}
257							} # for my $fields
258
259	13					54	return;
260							}
261
262							=head2 check_I( $field )
263
264							Various functions to check the different fields. If the function doesn't exist,
265							then it doesn't get checked.
266
267							=head2 check_020()
268
269							Looks at 020$a and reports errors if the check digit is wrong.
270							Looks at 020$z and validates number if hyphens are present.
271
272							Uses Business::ISBN to do validation. Thirteen digit checking is currently done
273							with the internal sub _isbn13_check_digit(), based on code from Business::ISBN.
274
275							TO DO (check_020):
276
277							Fix 13-digit ISBN checking.
278
279							=cut
280
281							sub check_020 {
282
283
284	5			5		2869	use Business::ISBN;
	5					257226
	5					18335
285
286	22			22	1	12920	my $self = shift;
287	22					45	my $field = shift;
288
289							###################################################
290
291							# break subfields into code-data array and validate data
292
293	22					50	my @subfields = $field->subfields();
294
295	22					394	while (my $subfield = pop(@subfields)) {
296	22					44	my ($code, $data) = @$subfield;
297	22					37	my $isbnno = $data;
298							#remove any hyphens
299	22					62	$isbnno =~ s/\-//g;
300							#remove nondigits
301	22					138	$isbnno =~ s/^\D(\d{9,12}[X\d])\b.$/$1/;
302
303							#report error if this is subfield 'a'
304							#and the first 10 or 13 characters are not a match for $isbnno
305	22	100				61	if ($code eq 'a') {
		50
306	21	100				58	if ((substr($data,0,length($isbnno)) ne $isbnno)) {
307	2					5	$self->warn( "020: Subfield a may have invalid characters.");
308							} #if first characters don't match
309
310							#report error if no space precedes a qualifier in subfield a
311	21	100				59	if ($data =~ /\(/) {
312	8	100				36	$self->warn( "020: Subfield a qualifier must be preceded by space, $data.") unless ($data =~ /[X0-9] \(/);
313							} #if data has parenthetical qualifier
314
315							#report error if unable to find 10-13 digit string of digits in subfield 'a'
316	21	100				68	if (($isbnno !~ /(?:^\d{10}$)\|(?:^\d{13}$)\|(?:^\d{9}X$)/)) {
317	3					12	$self->warn( "020: Subfield a has the wrong number of digits, $data.");
318							} # if subfield 'a' but not 10 or 13 digit isbn
319							#otherwise, check 10 and 13 digit checksums for validity
320							else {
321	18	100				47	if ((length ($isbnno) == 10)) {
		50
322
323	16	50	33			45	if (($Business::ISBN::VERSION gt '2.02_01') \|\| ($Business::ISBN::VERSION gt '2.009')) {
		0
324	16	100				56	$self->warn( "020: Subfield a has bad checksum, $data." ) if (Business::ISBN::valid_isbn_checksum($isbnno) != 1);
325							} #if Business::ISBN version higher than 2.02_01 or 2.009
326							elsif ($Business::ISBN::VERSION lt '2') {
327	0	0				0	$self->warn( "020: Subfield a has bad checksum, $data." ) if (Business::ISBN::is_valid_checksum($isbnno) != 1);
328							} #elsif Business::ISBN version lower than 2
329							else {
330	0					0	$self->warn( "Business::ISBN version must be below 2 or above 2.02_02 or 2.009." );
331							} #else Business::ISBN version between 2 and 2.02_02
332							} #if 10 digit ISBN has invalid check digit
333							# do validation check for 13 digit isbn
334							#########################################
335							### Not yet fully implemented ###########
336							#########################################
337							elsif (length($isbnno) == 13){
338							#change line below once Business::ISBN handles 13-digit ISBNs
339	2					4	my $is_valid_13 = _isbn13_check_digit($isbnno);
340	2	100				16	$self->warn( "020: Subfield a has bad checksum (13 digit), $data.") unless ($is_valid_13 == 1);
341							} #elsif 13 digit ISBN has invalid check digit
342							###################################################
343							} #else subfield 'a' has 10 or 13 digits
344							} #if subfield 'a'
345							#look for valid isbn in 020$z
346							elsif ($code eq 'z') {
347	1	50	33			13	if (($data =~ /^ISBN/) \|\| ($data =~ /^\d*\-\d+/)){
348							##################################################
349							## Turned on for now--Comment to unimplement ####
350							##################################################
351	0	0	0			0	$self->warn( "020: Subfield z is numerically valid.") if ((length ($isbnno) == 10) && (Business::ISBN::is_valid_checksum($isbnno) == 1));
352							} #if 10 digit ISBN has invalid check digit
353							} #elsif subfield 'z'
354
355							} # while @subfields
356
357							} #check_020
358
359							=head2 _isbn13_check_digit($ean)
360
361							Internal sub to determine if 13-digit ISBN has a valid checksum. The code is
362							taken from Business::ISBN::as_ean. It is expected to be temporary until
363							Business::ISBN is updated to check 13-digit ISBNs itself.
364
365							=cut
366
367							sub _isbn13_check_digit {
368
369	2			2		4	my $ean = shift;
370							#remove and store current check digit
371	2					4	my $check_digit = chop($ean);
372
373							#calculate valid checksum
374	2					4	my $sum = 0;
375	2					4	foreach my $index ( 0, 2, 4, 6, 8, 10 )
376							{
377	12					16	$sum += substr($ean, $index, 1);
378	12					24	$sum += 3 * substr($ean, $index + 1, 1);
379							}
380
381							#take the next higher multiple of 10 and subtract the sum.
382							#if $sum is 37, the next highest multiple of ten is 40. the
383							#check digit would be 40 - 37 => 3.
384	2					10	my $valid_check_digit = ( 10 * ( int( $sum / 10 ) + 1 ) - $sum ) % 10;
385
386	2	100				6	return $check_digit == $valid_check_digit ? 1 : 0;
387
388							} # _isbn13_check_digit
389
390							#########################################
391
392							=head2 check_041( $field )
393
394							Warns if subfields are not evenly divisible by 3 unless second indicator is 7
395							(future implementation would ensure that each subfield is exactly 3 characters
396							unless ind2 is 7--since subfields are now repeatable. This is not implemented
397							here due to the large number of records needing to be corrected.). Validates
398							against the MARC Code List for Languages (L) using the
399							MARC::Lint::CodeData data pack to MARC::Lint (%LanguageCodes,
400							%ObsoleteLanguageCodes).
401
402							=cut
403
404							sub check_041 {
405
406
407	3			3	1	3506	my $self = shift;
408	3					7	my $field = shift;
409
410							# break subfields into code-data array (so the entire field is in one array)
411
412	3					11	my @subfields = $field->subfields();
413	3					66	my @newsubfields = ();
414
415	3					11	while (my $subfield = pop(@subfields)) {
416	7					14	my ($code, $data) = @$subfield;
417	7					28	unshift (@newsubfields, $code, $data);
418							} # while
419
420							#warn if length of each subfield is not divisible by 3 unless ind2 is 7
421	3	50				14	unless ($field->indicator(2) eq '7') {
422	3					55	for (my $index = 0; $index <=$#newsubfields; $index+=2) {
423	7	100				23	if (length ($newsubfields[$index+1]) %3 != 0) {
424	3					18	$self->warn( "041: Subfield _$newsubfields[$index] must be evenly divisible by 3 or exactly three characters if ind2 is not 7, ($newsubfields[$index+1])." );
425							} #if field length not divisible evenly by 3
426							##############################################
427							# validation against code list data
428							## each subfield has a multiple of 3 chars
429							# need to look at each group of 3 characters
430							else {
431
432							#break each character of the subfield into an array position
433	4					19	my @codechars = split '', $newsubfields[$index+1];
434
435	4					8	my $pos = 0;
436							#store each 3 char code in a slot of @codes041
437	4					7	my @codes041 = ();
438	4					23	while ($pos <= $#codechars) {
439	6					22	push @codes041, (join '', @codechars[$pos..$pos+2]);
440	6					16	$pos += 3;
441							}
442
443
444	4					8	foreach my $code041 (@codes041) {
445							#see if language code matches valid code
446	6	50				28	my $validlang = $LanguageCodes{$code041} ? 1 : 0;
447							#look for invalid code match if valid code was not matched
448	6	100				15	my $obsoletelang = $ObsoleteLanguageCodes{$code041} ? 1 : 0;
449
450							# skip valid subfields
451	6	50				25	unless ($validlang) {
452							#report invalid matches as possible obsolete codes
453	6	100				13	if ($obsoletelang) {
454	1					5	$self->warn( "041: Subfield _$newsubfields[$index], $newsubfields[$index+1], may be obsolete.");
455							}
456							else {
457	5					49	$self->warn( "041: Subfield _$newsubfields[$index], $newsubfields[$index+1] ($code041), is not valid.");
458							} #else code not found
459							} # unless found valid code
460							} #foreach code in 041
461							} # else subfield has multiple of 3 chars
462							##############################################
463							} # foreach subfield
464							} #unless ind2 is 7
465							} #check_041
466
467							=head2 check_043( $field )
468
469							Warns if each subfield a is not exactly 7 characters. Validates each code
470							against the MARC code list for Geographic Areas (L)
471							using the MARC::Lint::CodeData data pack to MARC::Lint (%GeogAreaCodes,
472							%ObsoleteGeogAreaCodes).
473
474							=cut
475
476							sub check_043 {
477
478	2			2	1	2485	my $self = shift;
479	2					5	my $field = shift;
480
481							# break subfields into code-data array (so the entire field is in one array)
482
483	2					7	my @subfields = $field->subfields();
484	2					41	my @newsubfields = ();
485
486	2					8	while (my $subfield = pop(@subfields)) {
487	5					11	my ($code, $data) = @$subfield;
488	5					17	unshift (@newsubfields, $code, $data);
489							} # while
490
491							#warn if length of subfield a is not exactly 7
492	2					10	for (my $index = 0; $index <=$#newsubfields; $index+=2) {
493	5	100	66			36	if (($newsubfields[$index] eq 'a') && (length ($newsubfields[$index+1]) != 7)) {
		50
494	2					7	$self->warn( "043: Subfield _a must be exactly 7 characters, $newsubfields[$index+1]" );
495							} # if suba and length is not 7
496							#check against code list for geographic areas.
497							elsif ($newsubfields[$index] eq 'a') {
498
499							#see if geog area code matches valid code
500	3	50				15	my $validgac = $GeogAreaCodes{$newsubfields[$index+1]} ? 1 : 0;
501							#look for obsolete code match if valid code was not matched
502	3	100				13	my $obsoletegac = $ObsoleteGeogAreaCodes{$newsubfields[$index+1]} ? 1 : 0;
503
504							# skip valid subfields
505	3	50				31	unless ($validgac) {
506							#report invalid matches as possible obsolete codes
507	3	100				11	if ($obsoletegac) {
508	1					5	$self->warn( "043: Subfield _a, $newsubfields[$index+1], may be obsolete.");
509							}
510							else {
511	2					11	$self->warn( "043: Subfield _a, $newsubfields[$index+1], is not valid.");
512							} #else code not found
513							} # unless found valid code
514
515							} #elsif suba
516							} #foreach subfield
517							} #check_043
518
519							=head2 check_245( $field )
520
521							-Makes sure $a exists (and is first subfield).
522							-Warns if last character of field is not a period
523							--Follows LCRI 1.0C, Nov. 2003 rather than MARC21 rule
524							-Verifies that $c is preceded by / (space-/)
525							-Verifies that initials in $c are not spaced
526							-Verifies that $b is preceded by :;= (space-colon, space-semicolon, space-equals)
527							-Verifies that $h is not preceded by space unless it is dash-space
528							-Verifies that data of $h is enclosed in square brackets
529							-Verifies that $n is preceded by . (period)
530							--As part of that, looks for no-space period, or dash-space-period (for replaced elipses)
531							-Verifies that $p is preceded by , (no-space-comma) when following $n and . (period) when following other subfields.
532							-Performs rudimentary article check of 245 2nd indicator vs. 1st word of 245$a (for manual verification).
533
534							Article checking is done by internal _check_article method, which should work for 130, 240, 245, 440, 630, 730, and 830.
535
536							=cut
537
538							sub check_245 {
539
540	49			49	1	27701	my $self = shift;
541	49					71	my $field = shift;
542
543							#set tagno for reporting
544	49					76	my $tagno = '245';
545
546	49	100				146	if ( not $field->subfield( "a" ) ) {
547	1					25	$self->warn( "245: Must have a subfield _a." );
548							}
549
550							# break subfields into code-data array (so the entire field is in one array)
551
552	49					1192	my @subfields = $field->subfields();
553	49					896	my @newsubfields = ();
554	49					70	my $has_sub_6 = 0;
555
556	49					162	while (my $subfield = pop(@subfields)) {
557	90					154	my ($code, $data) = @$subfield;
558							#check for subfield 6 being present
559	90	100				161	$has_sub_6 = 1 if ($code eq '6');
560	90					303	unshift (@newsubfields, $code, $data);
561							} # while
562
563							# 245 must end in period (may want to make this less restrictive by allowing trailing spaces)
564							#do 2 checks--for final punctuation (MARC21 rule), and for period (LCRI 1.0C, Nov. 2003; LCPS 1.7.1)
565	49	100				301	if ($newsubfields[$#newsubfields] !~ /[.?!]$/) {
		100
566	1					5	$self->warn ( "245: Must end with . (period).");
567							}
568							elsif($newsubfields[$#newsubfields] =~ /[?!]$/) {
569	2					7	$self->warn ( "245: MARC21 allows ? or ! as final punctuation but LCRI 1.0C, Nov. 2003 (LCPS 1.7.1 for RDA records), requires period.");
570							}
571
572							##Check for first subfield
573							#subfield a should be first subfield (or 2nd if subfield '6' is present)
574	49	100				96	if ($has_sub_6) {
575							#make sure there are at least 2 subfields
576	2	50				5	if ($#newsubfields < 3) {
577	0					0	$self->warn ("$tagno: May have too few subfields.");
578							} #if fewer than 2 subfields
579							else {
580	2	50				5	if ($newsubfields[0] ne '6') {
581	0					0	$self->warn ( "$tagno: First subfield must be _6, but it is $newsubfields[0]");
582							} #if 1st subfield not '6'
583	2	50				4	if ($newsubfields[2] ne 'a') {
584	0					0	$self->warn ( "$tagno: First subfield after subfield _6 must be _a, but it is _$newsubfields[2]");
585							} #if 2nd subfield not 'a'
586							} #else at least 2 subfields
587							} #if has subfield 6
588							else {
589							#1st subfield must be 'a'
590	47	100				119	if ($newsubfields[0] ne 'a') {
591	1					5	$self->warn ( "$tagno: First subfield must be _a, but it is _$newsubfields[0]");
592							} #if 2nd subfield not 'a'
593							} #else no subfield _6
594							##End check for first subfield
595
596							#subfield c, if present, must be preceded by /
597							#also look for space between initials
598	49	100				134	if ($field->subfield("c")) {
599
600	14					367	for (my $index = 2; $index <=$#newsubfields; $index+=2) {
601							# 245 subfield c must be preceded by / (space-/)
602	17	100				46	if ($newsubfields[$index] eq 'c') {
603	14	100				65	$self->warn ( "245: Subfield _c must be preceded by /") if ($newsubfields[$index-1] !~ /\s\/$/);
604							# 245 subfield c initials should not have space
605	14	100	66			70	$self->warn ( "245: Subfield _c initials should not have a space.") if (($newsubfields[$index+1] =~ /\b\w\. \b\w\./) && ($newsubfields[$index+1] !~ /\[\bi\.e\. \b\w\..*\]/));
606	14					31	last;
607							} #if
608							} #for
609							} # subfield c exists
610
611							#each subfield b, if present, should be preceded by :;= (colon, semicolon, or equals sign)
612							### Are there others? ###
613	49	100				761	if ($field->subfield("b")) {
614
615							# 245 subfield b should be preceded by space-:;= (colon, semicolon, or equals sign)
616	13					307	for (my $index = 2; $index <=$#newsubfields; $index+=2) {
617							#report error if subfield 'b' is not preceded by space-:;= (colon, semicolon, or equals sign)
618	16	100	100			108	if (($newsubfields[$index] eq 'b') && ($newsubfields[$index-1] !~ / [:;=]$/)) {
619	4					10	$self->warn ( "245: Subfield _b should be preceded by space-colon, space-semicolon, or space-equals sign.");
620							} #if
621							} #for
622							} # subfield b exists
623
624
625							#each subfield h, if present, should be preceded by non-space
626	49	100				753	if ($field->subfield("h")) {
627
628							# 245 subfield h should not be preceded by space
629	4					94	for (my $index = 2; $index <=$#newsubfields; $index+=2) {
630							#report error if subfield 'h' is preceded by space (unless dash-space)
631	6	100	100			59	if (($newsubfields[$index] eq 'h') && ($newsubfields[$index-1] !~ /(\S$)\|(\-\- $)/)) {
632	1					6	$self->warn ( "245: Subfield _h should not be preceded by space.");
633							} #if h and not preceded by no-space (unless dash)
634							#report error if subfield 'h' does not start with open square bracket with a matching close bracket
635							##could have check against list of valid values here
636	6	100	100			40	if (($newsubfields[$index] eq 'h') && ($newsubfields[$index+1] !~ /^\[\w\s\w*\]/)) {
637	1					6	$self->warn ( "245: Subfield _h must have matching square brackets, $newsubfields[$index].");
638							}
639							} #for
640							} # subfield h exists
641
642							#each subfield n, if present, must be preceded by . (period)
643	49	100				935	if ($field->subfield("n")) {
644
645							# 245 subfield n must be preceded by . (period)
646	4					92	for (my $index = 2; $index <=$#newsubfields; $index+=2) {
647							#report error if subfield 'n' is not preceded by non-space-period or dash-space-period
648	6	100	100			47	if (($newsubfields[$index] eq 'n') && ($newsubfields[$index-1] !~ /(\S\.$)\|(\-\- \.$)/)) {
649	1					5	$self->warn ( "245: Subfield _n must be preceded by . (period).");
650							} #if
651							} #for
652							} # subfield n exists
653
654							#each subfield p, if present, must be preceded by a , (no-space-comma) if it follows subfield n, or by . (no-space-period or dash-space-period) following other subfields
655	49	100				926	if ($field->subfield("p")) {
656
657							# 245 subfield p must be preceded by . (period) or , (comma)
658	4					90	for (my $index = 2; $index <=$#newsubfields; $index+=2) {
659							#only looking for subfield p
660	6	100				16	if ($newsubfields[$index] eq 'p') {
661							# case for subfield 'n' being field before this one (allows dash-space-comma)
662	4	100	100			51	if (($newsubfields[$index-2] eq 'n') && ($newsubfields[$index-1] !~ /(\S,$)\|(\-\- ,$)/)) {
		100	100
663	1					3	$self->warn ( "245: Subfield _p must be preceded by , (comma) when it follows subfield _n.");
664							} #if subfield n precedes this one
665							# elsif case for subfield before this one is not n
666							elsif (($newsubfields[$index-2] ne 'n') && ($newsubfields[$index-1] !~ /(\S\.$)\|(\-\- \.$)/)) {
667	1					5	$self->warn ( "245: Subfield _p must be preceded by . (period) when it follows a subfield other than _n.");
668							} #elsif subfield p preceded by non-period when following a non-subfield 'n'
669							} #if index is looking at subfield p
670							} #for
671							} # subfield p exists
672
673							######################################
674							#check for invalid 2nd indicator
675	49					971	$self->_check_article($field);
676
677							} # check_245
678
679
680
681
682							############
683							# Internal #
684							############
685
686							=head2 _check_article
687
688							Check of articles is based on code from Ian Hamilton. This version is more
689							limited in that it focuses on English, Spanish, French, Italian and German
690							articles. Certain possible articles have been removed if they are valid English
691							non-articles. This version also disregards 008_language/041 codes and just uses
692							the list of articles to provide warnings/suggestions.
693
694							source for articles = L
695
696							Should work with fields 130, 240, 245, 440, 630, 730, and 830. Reports error if
697							another field is passed in.
698
699							=cut
700
701							sub _check_article {
702
703	49			49		76	my $self = shift;
704	49					81	my $field = shift;
705
706							#add articles here as needed
707							##Some omitted due to similarity with valid words (e.g. the German 'die').
708	49					744	my %article = (
709							'a' => 'eng glg hun por',
710							'an' => 'eng',
711							'das' => 'ger',
712							'dem' => 'ger',
713							'der' => 'ger',
714							'ein' => 'ger',
715							'eine' => 'ger',
716							'einem' => 'ger',
717							'einen' => 'ger',
718							'einer' => 'ger',
719							'eines' => 'ger',
720							'el' => 'spa',
721							'en' => 'cat dan nor swe',
722							'gl' => 'ita',
723							'gli' => 'ita',
724							'il' => 'ita mlt',
725							'l' => 'cat fre ita mlt',
726							'la' => 'cat fre ita spa',
727							'las' => 'spa',
728							'le' => 'fre ita',
729							'les' => 'cat fre',
730							'lo' => 'ita spa',
731							'los' => 'spa',
732							'os' => 'por',
733							'the' => 'eng',
734							'um' => 'por',
735							'uma' => 'por',
736							'un' => 'cat spa fre ita',
737							'una' => 'cat spa ita',
738							'une' => 'fre',
739							'uno' => 'ita',
740							);
741
742							#add exceptions here as needed
743							# may want to make keys lowercase
744	49					531	my %exceptions = (
745							'A & E' => 1,
746							'A & ' => 1,
747							'A-' => 1,
748							'A+' => 1,
749							'A is ' => 1,
750							'A isn\'t ' => 1,
751							'A l\'' => 1,
752							'A la ' => 1,
753							'A posteriori' => 1,
754							'A priori' => 1,
755							'A to ' => 1,
756							'El Nino' => 1,
757							'El Salvador' => 1,
758							'L is ' => 1,
759							'L-' => 1,
760							'La Salle' => 1,
761							'Las Vegas' => 1,
762							'Lo cual' => 1,
763							'Lo mein' => 1,
764							'Lo que' => 1,
765							'Los Alamos' => 1,
766							'Los Angeles' => 1,
767							);
768
769							#get tagno to determine which indicator to check and for reporting
770	49					126	my $tagno = $field->tag();
771							#retrieve tagno from subfield 6 if 880 field
772	49	100				282	if ($tagno eq '880') {
773	1	50				13	if ($field->subfield('6')) {
774	1					31	my $sub6 = $field->subfield('6');
775	1					23	$tagno = substr($sub6, 0, 3);
776							} #if subfield 6
777							} #if 880 field
778
779							#$ind holds nonfiling character indicator value
780	49					91	my $ind = '';
781							#$first_or_second holds which indicator is for nonfiling char value
782	49					71	my $first_or_second = '';
783	49	50				344	if ($tagno !~ /^(?:130\|240\|245\|440\|630\|730\|830)$/) {
		50
		50
784	0					0	print $tagno, " is not a valid field for article checking\n";
785	0					0	return;
786							} #if field is not one of those checked for articles
787							#130, 630, 730 => ind1
788							elsif ($tagno =~ /^(?:130\|630\|730)$/) {
789	0					0	$ind = $field->indicator(1);
790	0					0	$first_or_second = '1st';
791							} #if field is 130, 630, or 730
792							#240, 245, 440, 830 => ind2
793							elsif ($tagno =~ /^(?:240\|245\|440\|830)$/) {
794	49					107	$ind = $field->indicator(2);
795	49					553	$first_or_second = '2nd';
796							} #if field is 240, 245, 440, or 830
797
798
799							#report non-numeric non-filing indicators as invalid
800	49	50				121	$self->warn ( $tagno, ": Non-filing indicator is non-numeric" ) unless ($ind =~ /^[0-9]$/);
801							#get subfield 'a' of the title field
802	49		100			116	my $title = $field->subfield('a') \|\| '';
803
804
805	49					1026	my $char1_notalphanum = 0;
806							#check for apostrophe, quote, bracket, or parenthesis, before first word
807							#remove if found and add to non-word counter
808	49					155	while ($title =~ /^["'\[\(*]/){
809	4					11	$char1_notalphanum++;
810	4					16	$title =~ s/^["'\[\(*]//;
811							}
812							# split title into first word + rest on space, parens, bracket, apostrophe, quote, or hyphen
813	49					238	my ($firstword, $separator, $etc) = $title =~ /^([^ \[\]'"\-]+)([ \[\]'"\-])?(.*)/i;
814	49	100				122	$firstword = '' if ! defined( $firstword );
815	49	100				82	$separator = '' if ! defined( $separator );
816	49	100				90	$etc = '' if ! defined( $etc );
817
818							#get length of first word plus the number of chars removed above plus one for the separator
819	49					83	my $nonfilingchars = length($firstword) + $char1_notalphanum + 1;
820
821							#check to see if first word is an exception
822	49					59	my $isan_exception = 0;
823	49					229	$isan_exception = grep {$title =~ /^\Q$_\E/i} (keys %exceptions);
	1078					6258
824
825							#lowercase chars of $firstword for comparison with article list
826	49					146	$firstword = lc($firstword);
827
828	49					66	my $isan_article = 0;
829
830							#see if first word is in the list of articles and not an exception
831	49	100	100			166	$isan_article = 1 if (($article{$firstword}) && !($isan_exception));
832
833							#if article then $nonfilingchars should match $ind
834	49	100				101	if ($isan_article) {
835							#account for quotes, apostrophes, parens, or brackets before 2nd word
836							# if (($separator eq ' ') && ($etc =~ /^['"]/)) {
837	9	100	66			38	if (($separator) && ($etc =~ /^[ \[\]'"\-]+/)) {
838	4					11	while ($etc =~ /^[ "'\[\]*]/){
839	6					9	$nonfilingchars++;
840	6					23	$etc =~ s/^[ "'\[\]*]//;
841							} #while etc starts with nonfiling chars
842							} #if separator defined and etc starts with nonfiling chars
843							#special case for 'en' (unsure why)
844	9	50				72	if ($firstword eq 'en') {
		100
845	0	0	0			0	$self->warn ( $tagno, ": First word, , $firstword, may be an article, check $first_or_second indicator ($ind)." ) unless (($ind eq '3') \|\| ($ind eq '0'));
846							}
847							elsif ($nonfilingchars ne $ind) {
848	3					14	$self->warn ( $tagno, ": First word, $firstword, may be an article, check $first_or_second indicator ($ind)." );
849							} #unless ind is same as length of first word and nonfiling characters
850							} #if first word is in article list
851							#not an article so warn if $ind is not 0
852							else {
853	40	100				384	unless ($ind eq '0') {
854	1					7	$self->warn ( $tagno, ": First word, $firstword, does not appear to be an article, check $first_or_second indicator ($ind)." );
855							} #unless ind is 0
856							} #else not in article list
857
858							#######################################
859
860							} #_check_article
861
862
863							############
864
865							=head1 SEE ALSO
866
867							Check the docs for L. All software links are there.
868
869							=head1 TODO
870
871							=over 4
872
873							=item * Subfield 6
874
875							For subfield 6, it should always be the 1st subfield according to MARC 21 specifications. Perhaps a generic check should be added that warns if subfield 6 is not the 1st subfield.
876
877							=item * Subfield 8.
878
879							This subfield could be the 1st or 2nd subfield, so the code that checks for the 1st few subfields (check_245, check_250) should take that into account.
880
881							=item * Subfield 9
882
883							This subfield is not officially allowed in MARC, since it is locally defined. Some way needs to be made to allow messages/warnings about this subfield to be turned off (or otherwise deal with records using/allowing locally defined subfield 9).
884
885							=item * 008 length and presence check
886
887							Currently, 008 validation is not implemented in MARC::Lint, but is left to MARC::Errorchecks. It might be useful if MARC::Lint's basic validation checks included a verification that the 008 exists and is exactly 40 characters long. Additional 008-related checking and byte validation would remain in MARC::Errorchecks.
888
889							=item * ISBN and ISSN checking
890
891							020 and 022 fields are validated with the C and
892							C modules, respectively. Business::ISBN versions between 2 and
893							2.02_01 are incompatible with MARC::Lint.
894
895							=item * check_041 cleanup
896
897							Splitting subfield code strings every 3 chars could probably be written more efficiently.
898
899							=item * check_245 cleanup
900
901							The article checking in particular.
902
903							=item * Method for turning off checks
904
905							Provide a way for users to skip checks more easily when using check_record, or a
906							specific check_xxx method (e.g. skip article checking).
907
908							=back
909
910							=head1 LICENSE
911
912							This code may be distributed under the same terms as Perl itself.
913
914							Please note that these modules are not products of or supported by the
915							employers of the various contributors to the code.
916
917							=cut
918
919							# Used only to read the stuff from __DATA__
920							sub _read_rules {
921	6			6		14	my $self = shift;
922
923	6					24	my $tell = tell(DATA); # Stash the position so we can reset it for next time
924
925	6					31	local $/ = "";
926	6					129	while ( my $tagblock = ) {
927	1464					6942	my @lines = split( /\n/, $tagblock );
928	1464					17880	s/\s+$// for @lines;
929
930	1464	100				3270	next unless @lines >= 4; # Some of our entries are tag-only
931
932	1362					2255	my $tagline = shift @lines;
933	1362					4142	my @keyvals = split( /\s+/, $tagline, 3 );
934	1362					2265	my $tagno = shift @keyvals;
935	1362					1870	my $repeatable = shift @keyvals;
936
937	1362					2875	$self->_parse_tag_rules( $tagno, $repeatable, @lines );
938							} # while
939
940							# Set the pointer back to where it was, in case we do this again
941	6					97	seek( DATA, $tell, 0 );
942							}
943
944							sub _parse_tag_rules {
945	1362			1362		1834	my $self = shift;
946	1362					1739	my $tagno = shift;
947	1362					1635	my $repeatable = shift;
948	1362					4234	my @lines = @_;
949
950	1362		50			7992	my $rules = ($self->{_rules}->{$tagno} \|\|= {});
951	1362					2861	$rules->{'repeatable'} = $repeatable;
952
953	1362					2062	for my $line ( @lines ) {
954	17190					50461	my @keyvals = split( /\s+/, $line, 3 );
955	17190					26909	my $key = shift @keyvals;
956	17190					23211	my $val = shift @keyvals;
957
958							# Do magic for indicators
959	17190	100				28758	if ( $key =~ /^ind/ ) {
960	2724					5017	$rules->{$key} = $val;
961
962	2724					3535	my $desc;
963							my $regex;
964
965	2724	100				4256	if ( $val eq "blank" ) {
966	1674					2039	$desc = "blank";
967	1674					5028	$regex = qr/^ $/;
968							} else {
969	1050					1717	$desc = _nice_list($val);
970	1050					2419	$val =~ s/^b/ /;
971	1050					12108	$regex = qr/^[$val]$/;
972							}
973
974	2724					6720	$rules->{$key."_desc"} = $desc;
975	2724					6086	$rules->{$key."_regex"} = $regex;
976							} # if indicator
977							else {
978	14466	50				19686	if ( $key =~ /(.)-(.)/ ) {
979	0					0	my ($min,$max) = ($1,$2);
980	0					0	$rules->{$_} = $val for ($min..$max);
981							} else {
982	14466					39826	$rules->{$key} = $val;
983							}
984							} # not an indicator
985							} # for $line
986							}
987
988
989							sub _nice_list {
990	1050			1050		1355	my $str = shift;
991
992	1050	100				2471	if ( $str =~ s/(\d)-(\d)/$1 thru $2/ ) {
993	60					202	return $str;
994							}
995
996	990					2380	my @digits = split( //, $str );
997	990	100				1899	$digits[0] = "blank" if $digits[0] eq "b";
998	990					1413	my $last = pop @digits;
999	990					3149	return join( ", ", @digits ) . " or $last";
1000							}
1001
1002							sub _ind_regex {
1003	0			0			my $str = shift;
1004
1005	0	0					return qr/^ $/ if $str eq "blank";
1006
1007	0						return qr/^[$str]$/;
1008							}
1009
1010
1011							1;
1012
1013							__DATA__