File Coverage

blib/lib/MARC/Lint.pm

Criterion	Covered	Total	%
statement	284	304	93.4
branch	150	188	79.7
condition	41	64	64.0
subroutine	21	22	95.4
pod	9	9	100.0
total	505	587	86.0

line	stmt	bran	cond	sub	pod	time	code
1							package MARC::Lint;
2
3	5			5		2013194	use strict;
	5					9
	5					149
4	5			5		27	use warnings;
	5					8
	5					142
5	5			5		23	use integer;
	5					15
	5					29
6	5			5		2962	use MARC::Record;
	5					9440
	5					212
7	5			5		30	use MARC::Field;
	5					8
	5					142
8
9	5			5		183630	use MARC::Lint::CodeData qw(%GeogAreaCodes %ObsoleteGeogAreaCodes %LanguageCodes %ObsoleteLanguageCodes);
	5					13
	5					5566
10
11							our $VERSION = 1.50 ;
12
13							=head1 NAME
14
15							MARC::Lint - Perl extension for checking validity of MARC records
16
17							=head1 SYNOPSIS
18
19							use MARC::File::USMARC;
20							use MARC::Lint;
21
22							my $lint = new MARC::Lint;
23							my $filename = shift;
24
25							my $file = MARC::File::USMARC->in( $filename );
26							while ( my $marc = $file->next() ) {
27							$lint->check_record( $marc );
28
29							# Print the title tag
30							print $marc->title, "\n";
31
32							# Print the errors that were found
33							print join( "\n", $lint->warnings ), "\n";
34							} # while
35
36							Given the following MARC record:
37
38							LDR 00000nam 22002538a 4500
39							040 _aMdSSJTT
40							_cMdSSJTT
41							040 _aMdSSJTT
42							_beng
43							_cMdSSJTT
44							100 14 _aWall, Larry.
45							110 1 _aO'Reilly & Associates.
46							245 90 _aProgramming Perl /
47							_aBig Book of Perl /
48							_cLarry Wall, Tom Christiansen & Jon Orwant.
49							250 _a3rd ed.
50							250 _a3rd ed.
51							260 _aCambridge, Mass. :
52							_bO'Reilly,
53							_r2000.
54							590 4 _aPersonally signed by Larry.
55							856 43 _uhttp://www.perl.com/
56
57							the following errors are generated:
58
59							1XX: Only one 1XX tag is allowed, but I found 2 of them.
60							100: Indicator 2 must be blank but it's "4"
61							245: Indicator 1 must be 0 or 1 but it's "9"
62							245: Subfield _a is not repeatable.
63							040: Field is not repeatable.
64							260: Subfield _r is not allowed.
65							856: Indicator 2 must be blank, 0, 1, 2 or 8 but it's "3"
66
67							=head1 DESCRIPTION
68
69							Module for checking validity of MARC records. 99% of the users will want to do
70							something like is shown in the synopsis. The other intrepid 1% will overload the
71							C module's methods and provide their own special field-level checking.
72
73							What this means is that if you have certain requirements, such as making sure that
74							all 952 tags have a certain call number in them, you can write a function that
75							checks for that, and still get all the benefits of the MARC::Lint framework.
76
77							=head1 EXPORT
78
79							None. Everything is done through objects.
80
81							=head1 METHODS
82
83							=head2 new()
84
85							No parms needed. The C object is little more than a list of warnings
86							and a bunch of rules.
87
88							=cut
89
90							sub new {
91	6			6	1	5698	my $class = shift;
92
93	6					24	my $self = {
94							_warnings => [],
95							};
96	6					17	bless $self, $class;
97
98	6					29	$self->_read_rules();
99
100	6					34	return $self;
101							}
102
103							=head2 warnings()
104
105							Returns a list of warnings found by C and its brethren.
106
107							=cut
108
109							sub warnings {
110	64			64	1	949	my $self = shift;
111
112	64	50				141	return wantarray ? @{$self->{_warnings}} : scalar @{$self->{_warnings}};
	64					218
	0					0
113							}
114
115							=head2 clear_warnings()
116
117							Clear the list of warnings for this linter object. It's automatically called
118							when you call C.
119
120							=cut
121
122							sub clear_warnings {
123	64			64	1	198	my $self = shift;
124
125	64					215	$self->{_warnings} = [];
126							}
127
128							=head2 warn( $str [, $str...] )
129
130							Create a warning message, built from strings passed, like a C
131							statement.
132
133							Typically, you'll leave this to C, but industrious
134							programmers may want to do their own checking as well.
135
136							=cut
137
138							sub warn {
139	56			56	1	1262	my $self = shift;
140
141	56					70	push( @{$self->{_warnings}}, join( "", @_ ) );
	56					189
142
143	56					242	return;
144							}
145
146							=head2 check_record( $marc )
147
148							Does all sorts of lint-like checks on the MARC record I<$marc>,
149							both on the record as a whole, and on the individual fields &
150							subfields.
151
152							=cut
153
154							sub check_record {
155	13			13	1	807132	my $self = shift;
156	13					25	my $marc = shift;
157
158	13					44	$self->clear_warnings();
159
160	13	50	33			129	( (ref $marc) && $marc->isa('MARC::Record') )
161							or return $self->warn( "Must pass a MARC::Record object to check_record" );
162
163	13					47	my @_1xx = $marc->field( "1.." );
164	13					1455	my $n1xx = scalar @_1xx;
165	13	100				41	if ( $n1xx > 1 ) {
166	1					8	$self->warn( "1XX: Only one 1XX tag is allowed, but I found $n1xx of them." );
167							}
168
169	13	50				42	if ( not $marc->field( 245 ) ) {
170	0					0	$self->warn( "245: No 245 tag." );
171							}
172
173
174	13					988	my %field_seen;
175	13					28	my $rules = $self->{_rules};
176	13					50	for my $field ( $marc->fields ) {
177	206					606	my $tagno = $field->tag;
178
179	206					843	my $tagrules = '';
180							#if 880 field, inherit rules from tagno in subfield _6
181	206					292	my $is_880 = 0;
182	206	100				426	if ($tagno eq '880') {
183	1					2	$is_880 = 1;
184	1	50				3	if ($field->subfield('6')) {
185	1					21	my $sub6 = $field->subfield('6');
186	1					19	$tagno = substr($sub6, 0, 3);
187
188	1	50				3	$tagrules = $rules->{$tagno} or next;
189							#880 is repeatable, but its linked field may not be
190	1	50	33			12	if ( ($tagrules->{'repeatable'} && ( $tagrules->{'repeatable'} eq 'NR' )) && $field_seen{'880.'.$tagno} ) {
			33
191	0					0	$self->warn( "$tagno: Field is not repeatable." );
192							} #if repeatability
193							} #if subfield 6 present
194							else {
195	0					0	$self->warn( "880: No subfield 6." );
196							} #else no subfield 6 in 880 field
197							} #if this is 880 field
198							else {
199	205	100				587	$tagrules = $rules->{$tagno} or next;
200
201	204	50	66			1266	if ( ($tagrules->{'repeatable'} && ( $tagrules->{'repeatable'} eq 'NR' )) && $field_seen{$tagno} ) {
			66
202	0					0	$self->warn( "$tagno: Field is not repeatable." );
203							} #if repeatability
204							} #else not 880
205
206	205	100				527	if ( $tagno >= 10 ) {
		50
207	158					293	for my $ind ( 1..2 ) {
208	316					882	my $indvalue = $field->indicator($ind);
209	316	100				4426	if ( not ($indvalue =~ $tagrules->{"ind$ind" . "_regex"}) ) {
210							$self->warn(
211							"$tagno: Indicator $ind must be ",
212	4					21	$tagrules->{"ind$ind" . "_desc"},
213							" but it's \"$indvalue\""
214							);
215							}
216							} # for
217
218	158					246	my %sub_seen;
219	158					415	for my $subfield ( $field->subfields ) {
220	262					2915	my ($code,$data) = @$subfield;
221
222	262					500	my $rule = $tagrules->{$code};
223	262	100	66			1082	if ( not defined $rule ) {
		100
224	2					9	$self->warn( "$tagno: Subfield _$code is not allowed." );
225							} elsif ( ($rule eq "NR") && $sub_seen{$code} ) {
226	1					5	$self->warn( "$tagno: Subfield _$code is not repeatable." );
227							}
228
229	262	50				685	if ( $data =~ /[\t\r\n]/ ) {
230	0					0	$self->warn( "$tagno: Subfield _$code has an invalid control character" );
231							}
232
233	262					766	++$sub_seen{$code};
234							} # for $subfields
235							} # if $tagno >= 10
236
237							elsif ($tagno < 10) {
238							#check for subfield characters
239	47	100				173	if ($field->data() =~ /\x1F/) {
240	1					12	$self->warn( "$tagno: Subfields are not allowed in fields lower than 010" );
241							} #if control field has subfield delimiter
242							} #elsif $tagno < 10
243
244							# Check to see if a check_xxx() function exists, and call it on the field if it does
245	205					847	my $checker = "check_$tagno";
246	205	100				904	if ( $self->can( $checker ) ) {
247	25					75	$self->$checker( $field );
248							}
249
250	205	100				2959	if ($is_880) {
251	1					4	++$field_seen{'880.'.$tagno};
252							} #if 880 field
253							else {
254	204					545	++$field_seen{$tagno};
255							}
256							} # for my $fields
257
258	13					65	return;
259							}
260
261							=head2 check_I( $field )
262
263							Various functions to check the different fields. If the function doesn't exist,
264							then it doesn't get checked.
265
266							=head2 check_020()
267
268							Looks at 020$a and reports errors if the check digit is wrong.
269							Looks at 020$z and validates number if hyphens are present.
270
271							Uses Business::ISBN to do validation. Thirteen digit checking is currently done
272							with the internal sub _isbn13_check_digit(), based on code from Business::ISBN.
273
274							TO DO (check_020):
275
276							Fix 13-digit ISBN checking.
277
278							=cut
279
280							sub check_020 {
281
282
283	5			5		4479	use Business::ISBN;
	5					244205
	5					18226
284
285	22			22	1	9915	my $self = shift;
286	22					34	my $field = shift;
287
288							###################################################
289
290							# break subfields into code-data array and validate data
291
292	22					57	my @subfields = $field->subfields();
293
294	22					360	while (my $subfield = pop(@subfields)) {
295	22					42	my ($code, $data) = @$subfield;
296	22					29	my $isbnno = $data;
297							#remove any hyphens
298	22					56	$isbnno =~ s/\-//g;
299							#remove nondigits
300	22					120	$isbnno =~ s/^\D(\d{9,12}[X\d])\b.$/$1/;
301
302							#report error if this is subfield 'a'
303							#and the first 10 or 13 characters are not a match for $isbnno
304	22	100				62	if ($code eq 'a') {
		50
305	21	100				68	if ((substr($data,0,length($isbnno)) ne $isbnno)) {
306	2					8	$self->warn( "020: Subfield a may have invalid characters.");
307							} #if first characters don't match
308
309							#report error if no space precedes a qualifier in subfield a
310	21	100				58	if ($data =~ /\(/) {
311	8	100				41	$self->warn( "020: Subfield a qualifier must be preceded by space, $data.") unless ($data =~ /[X0-9] \(/);
312							} #if data has parenthetical qualifier
313
314							#report error if unable to find 10-13 digit string of digits in subfield 'a'
315	21	100				86	if (($isbnno !~ /(?:^\d{10}$)\|(?:^\d{13}$)\|(?:^\d{9}X$)/)) {
316	3					16	$self->warn( "020: Subfield a has the wrong number of digits, $data.");
317							} # if subfield 'a' but not 10 or 13 digit isbn
318							#otherwise, check 10 and 13 digit checksums for validity
319							else {
320	18	100				52	if ((length ($isbnno) == 10)) {
		50
321
322	16	50	33			101	if (($Business::ISBN::VERSION gt '2.02_01') \|\| ($Business::ISBN::VERSION gt '2.009')) {
		0
323	16	100				63	$self->warn( "020: Subfield a has bad checksum, $data." ) if (Business::ISBN::valid_isbn_checksum($isbnno) != 1);
324							} #if Business::ISBN version higher than 2.02_01 or 2.009
325							elsif ($Business::ISBN::VERSION lt '2') {
326	0	0				0	$self->warn( "020: Subfield a has bad checksum, $data." ) if (Business::ISBN::is_valid_checksum($isbnno) != 1);
327							} #elsif Business::ISBN version lower than 2
328							else {
329	0					0	$self->warn( "Business::ISBN version must be below 2 or above 2.02_02 or 2.009." );
330							} #else Business::ISBN version between 2 and 2.02_02
331							} #if 10 digit ISBN has invalid check digit
332							# do validation check for 13 digit isbn
333							#########################################
334							### Not yet fully implemented ###########
335							#########################################
336							elsif (length($isbnno) == 13){
337							#change line below once Business::ISBN handles 13-digit ISBNs
338	2					7	my $is_valid_13 = _isbn13_check_digit($isbnno);
339	2	100				14	$self->warn( "020: Subfield a has bad checksum (13 digit), $data.") unless ($is_valid_13 == 1);
340							} #elsif 13 digit ISBN has invalid check digit
341							###################################################
342							} #else subfield 'a' has 10 or 13 digits
343							} #if subfield 'a'
344							#look for valid isbn in 020$z
345							elsif ($code eq 'z') {
346	1	50	33			13	if (($data =~ /^ISBN/) \|\| ($data =~ /^\d*\-\d+/)){
347							##################################################
348							## Turned on for now--Comment to unimplement ####
349							##################################################
350	0	0	0			0	$self->warn( "020: Subfield z is numerically valid.") if ((length ($isbnno) == 10) && (Business::ISBN::is_valid_checksum($isbnno) == 1));
351							} #if 10 digit ISBN has invalid check digit
352							} #elsif subfield 'z'
353
354							} # while @subfields
355
356							} #check_020
357
358							=head2 _isbn13_check_digit($ean)
359
360							Internal sub to determine if 13-digit ISBN has a valid checksum. The code is
361							taken from Business::ISBN::as_ean. It is expected to be temporary until
362							Business::ISBN is updated to check 13-digit ISBNs itself.
363
364							=cut
365
366							sub _isbn13_check_digit {
367
368	2			2		3	my $ean = shift;
369							#remove and store current check digit
370	2					6	my $check_digit = chop($ean);
371
372							#calculate valid checksum
373	2					3	my $sum = 0;
374	2					5	foreach my $index ( 0, 2, 4, 6, 8, 10 )
375							{
376	12					19	$sum += substr($ean, $index, 1);
377	12					17	$sum += 3 * substr($ean, $index + 1, 1);
378							}
379
380							#take the next higher multiple of 10 and subtract the sum.
381							#if $sum is 37, the next highest multiple of ten is 40. the
382							#check digit would be 40 - 37 => 3.
383	2					6	my $valid_check_digit = ( 10 * ( int( $sum / 10 ) + 1 ) - $sum ) % 10;
384
385	2	100				7	return $check_digit == $valid_check_digit ? 1 : 0;
386
387							} # _isbn13_check_digit
388
389							#########################################
390
391							=head2 check_041( $field )
392
393							Warns if subfields are not evenly divisible by 3 unless second indicator is 7
394							(future implementation would ensure that each subfield is exactly 3 characters
395							unless ind2 is 7--since subfields are now repeatable. This is not implemented
396							here due to the large number of records needing to be corrected.). Validates
397							against the MARC Code List for Languages (L) using the
398							MARC::Lint::CodeData data pack to MARC::Lint (%LanguageCodes,
399							%ObsoleteLanguageCodes).
400
401							=cut
402
403							sub check_041 {
404
405
406	3			3	1	2383	my $self = shift;
407	3					5	my $field = shift;
408
409							# break subfields into code-data array (so the entire field is in one array)
410
411	3					11	my @subfields = $field->subfields();
412	3					60	my @newsubfields = ();
413
414	3					12	while (my $subfield = pop(@subfields)) {
415	7					15	my ($code, $data) = @$subfield;
416	7					29	unshift (@newsubfields, $code, $data);
417							} # while
418
419							#warn if length of each subfield is not divisible by 3 unless ind2 is 7
420	3	50				11	unless ($field->indicator(2) eq '7') {
421	3					48	for (my $index = 0; $index <=$#newsubfields; $index+=2) {
422	7	100				23	if (length ($newsubfields[$index+1]) %3 != 0) {
423	3					15	$self->warn( "041: Subfield _$newsubfields[$index] must be evenly divisible by 3 or exactly three characters if ind2 is not 7, ($newsubfields[$index+1])." );
424							} #if field length not divisible evenly by 3
425							##############################################
426							# validation against code list data
427							## each subfield has a multiple of 3 chars
428							# need to look at each group of 3 characters
429							else {
430
431							#break each character of the subfield into an array position
432	4					18	my @codechars = split '', $newsubfields[$index+1];
433
434	4					9	my $pos = 0;
435							#store each 3 char code in a slot of @codes041
436	4					6	my @codes041 = ();
437	4					13	while ($pos <= $#codechars) {
438	6					22	push @codes041, (join '', @codechars[$pos..$pos+2]);
439	6					15	$pos += 3;
440							}
441
442
443	4					9	foreach my $code041 (@codes041) {
444							#see if language code matches valid code
445	6	50				18	my $validlang = $LanguageCodes{$code041} ? 1 : 0;
446							#look for invalid code match if valid code was not matched
447	6	100				16	my $obsoletelang = $ObsoleteLanguageCodes{$code041} ? 1 : 0;
448
449							# skip valid subfields
450	6	50				15	unless ($validlang) {
451							#report invalid matches as possible obsolete codes
452	6	100				14	if ($obsoletelang) {
453	1					13	$self->warn( "041: Subfield _$newsubfields[$index], $newsubfields[$index+1], may be obsolete.");
454							}
455							else {
456	5					48	$self->warn( "041: Subfield _$newsubfields[$index], $newsubfields[$index+1] ($code041), is not valid.");
457							} #else code not found
458							} # unless found valid code
459							} #foreach code in 041
460							} # else subfield has multiple of 3 chars
461							##############################################
462							} # foreach subfield
463							} #unless ind2 is 7
464							} #check_041
465
466							=head2 check_043( $field )
467
468							Warns if each subfield a is not exactly 7 characters. Validates each code
469							against the MARC code list for Geographic Areas (L)
470							using the MARC::Lint::CodeData data pack to MARC::Lint (%GeogAreaCodes,
471							%ObsoleteGeogAreaCodes).
472
473							=cut
474
475							sub check_043 {
476
477	2			2	1	1725	my $self = shift;
478	2					5	my $field = shift;
479
480							# break subfields into code-data array (so the entire field is in one array)
481
482	2					8	my @subfields = $field->subfields();
483	2					36	my @newsubfields = ();
484
485	2					10	while (my $subfield = pop(@subfields)) {
486	5					10	my ($code, $data) = @$subfield;
487	5					21	unshift (@newsubfields, $code, $data);
488							} # while
489
490							#warn if length of subfield a is not exactly 7
491	2					10	for (my $index = 0; $index <=$#newsubfields; $index+=2) {
492	5	100	66			65	if (($newsubfields[$index] eq 'a') && (length ($newsubfields[$index+1]) != 7)) {
		50
493	2					7	$self->warn( "043: Subfield _a must be exactly 7 characters, $newsubfields[$index+1]" );
494							} # if suba and length is not 7
495							#check against code list for geographic areas.
496							elsif ($newsubfields[$index] eq 'a') {
497
498							#see if geog area code matches valid code
499	3	50				12	my $validgac = $GeogAreaCodes{$newsubfields[$index+1]} ? 1 : 0;
500							#look for obsolete code match if valid code was not matched
501	3	100				11	my $obsoletegac = $ObsoleteGeogAreaCodes{$newsubfields[$index+1]} ? 1 : 0;
502
503							# skip valid subfields
504	3	50				14	unless ($validgac) {
505							#report invalid matches as possible obsolete codes
506	3	100				8	if ($obsoletegac) {
507	1					5	$self->warn( "043: Subfield _a, $newsubfields[$index+1], may be obsolete.");
508							}
509							else {
510	2					9	$self->warn( "043: Subfield _a, $newsubfields[$index+1], is not valid.");
511							} #else code not found
512							} # unless found valid code
513
514							} #elsif suba
515							} #foreach subfield
516							} #check_043
517
518							=head2 check_245( $field )
519
520							-Makes sure $a exists (and is first subfield).
521							-Warns if last character of field is not a period
522							--Follows LCRI 1.0C, Nov. 2003 rather than MARC21 rule
523							-Verifies that $c is preceded by / (space-/)
524							-Verifies that initials in $c are not spaced
525							-Verifies that $b is preceded by :;= (space-colon, space-semicolon, space-equals)
526							-Verifies that $h is not preceded by space unless it is dash-space
527							-Verifies that data of $h is enclosed in square brackets
528							-Verifies that $n is preceded by . (period)
529							--As part of that, looks for no-space period, or dash-space-period (for replaced elipses)
530							-Verifies that $p is preceded by , (no-space-comma) when following $n and . (period) when following other subfields.
531							-Performs rudimentary article check of 245 2nd indicator vs. 1st word of 245$a (for manual verification).
532
533							Article checking is done by internal _check_article method, which should work for 130, 240, 245, 440, 630, 730, and 830.
534
535							=cut
536
537							sub check_245 {
538
539	49			49	1	24512	my $self = shift;
540	49					68	my $field = shift;
541
542							#set tagno for reporting
543	49					76	my $tagno = '245';
544
545	49	100				144	if ( not $field->subfield( "a" ) ) {
546	1					24	$self->warn( "245: Must have a subfield _a." );
547							}
548
549							# break subfields into code-data array (so the entire field is in one array)
550
551	49					1068	my @subfields = $field->subfields();
552	49					833	my @newsubfields = ();
553	49					73	my $has_sub_6 = 0;
554
555	49					147	while (my $subfield = pop(@subfields)) {
556	90					146	my ($code, $data) = @$subfield;
557							#check for subfield 6 being present
558	90	100				198	$has_sub_6 = 1 if ($code eq '6');
559	90					412	unshift (@newsubfields, $code, $data);
560							} # while
561
562							# 245 must end in period (may want to make this less restrictive by allowing trailing spaces)
563							#do 2 checks--for final punctuation (MARC21 rule), and for period (LCRI 1.0C, Nov. 2003; LCPS 1.7.1)
564	49	100				293	if ($newsubfields[$#newsubfields] !~ /[.?!]$/) {
		100
565	1					5	$self->warn ( "245: Must end with . (period).");
566							}
567							elsif($newsubfields[$#newsubfields] =~ /[?!]$/) {
568	2					7	$self->warn ( "245: MARC21 allows ? or ! as final punctuation but LCRI 1.0C, Nov. 2003 (LCPS 1.7.1 for RDA records), requires period.");
569							}
570
571							##Check for first subfield
572							#subfield a should be first subfield (or 2nd if subfield '6' is present)
573	49	100				111	if ($has_sub_6) {
574							#make sure there are at least 2 subfields
575	2	50				5	if ($#newsubfields < 3) {
576	0					0	$self->warn ("$tagno: May have too few subfields.");
577							} #if fewer than 2 subfields
578							else {
579	2	50				6	if ($newsubfields[0] ne '6') {
580	0					0	$self->warn ( "$tagno: First subfield must be _6, but it is $newsubfields[0]");
581							} #if 1st subfield not '6'
582	2	50				6	if ($newsubfields[2] ne 'a') {
583	0					0	$self->warn ( "$tagno: First subfield after subfield _6 must be _a, but it is _$newsubfields[2]");
584							} #if 2nd subfield not 'a'
585							} #else at least 2 subfields
586							} #if has subfield 6
587							else {
588							#1st subfield must be 'a'
589	47	100				122	if ($newsubfields[0] ne 'a') {
590	1					5	$self->warn ( "$tagno: First subfield must be _a, but it is _$newsubfields[0]");
591							} #if 2nd subfield not 'a'
592							} #else no subfield _6
593							##End check for first subfield
594
595							#subfield c, if present, must be preceded by /
596							#also look for space between initials
597	49	100				146	if ($field->subfield("c")) {
598
599	14					325	for (my $index = 2; $index <=$#newsubfields; $index+=2) {
600							# 245 subfield c must be preceded by / (space-/)
601	17	100				57	if ($newsubfields[$index] eq 'c') {
602	14	100				73	$self->warn ( "245: Subfield _c must be preceded by /") if ($newsubfields[$index-1] !~ /\s\/$/);
603							# 245 subfield c initials should not have space
604	14	100	66			73	$self->warn ( "245: Subfield _c initials should not have a space.") if (($newsubfields[$index+1] =~ /\b\w\. \b\w\./) && ($newsubfields[$index+1] !~ /\[\bi\.e\. \b\w\..*\]/));
605	14					30	last;
606							} #if
607							} #for
608							} # subfield c exists
609
610							#each subfield b, if present, should be preceded by :;= (colon, semicolon, or equals sign)
611							### Are there others? ###
612	49	100				730	if ($field->subfield("b")) {
613
614							# 245 subfield b should be preceded by space-:;= (colon, semicolon, or equals sign)
615	13					287	for (my $index = 2; $index <=$#newsubfields; $index+=2) {
616							#report error if subfield 'b' is not preceded by space-:;= (colon, semicolon, or equals sign)
617	16	100	100			137	if (($newsubfields[$index] eq 'b') && ($newsubfields[$index-1] !~ / [:;=]$/)) {
618	4					14	$self->warn ( "245: Subfield _b should be preceded by space-colon, space-semicolon, or space-equals sign.");
619							} #if
620							} #for
621							} # subfield b exists
622
623
624							#each subfield h, if present, should be preceded by non-space
625	49	100				709	if ($field->subfield("h")) {
626
627							# 245 subfield h should not be preceded by space
628	4					95	for (my $index = 2; $index <=$#newsubfields; $index+=2) {
629							#report error if subfield 'h' is preceded by space (unless dash-space)
630	6	100	100			65	if (($newsubfields[$index] eq 'h') && ($newsubfields[$index-1] !~ /(\S$)\|(\-\- $)/)) {
631	1					4	$self->warn ( "245: Subfield _h should not be preceded by space.");
632							} #if h and not preceded by no-space (unless dash)
633							#report error if subfield 'h' does not start with open square bracket with a matching close bracket
634							##could have check against list of valid values here
635	6	100	100			58	if (($newsubfields[$index] eq 'h') && ($newsubfields[$index+1] !~ /^\[\w\s\w*\]/)) {
636	1					8	$self->warn ( "245: Subfield _h must have matching square brackets, $newsubfields[$index].");
637							}
638							} #for
639							} # subfield h exists
640
641							#each subfield n, if present, must be preceded by . (period)
642	49	100				863	if ($field->subfield("n")) {
643
644							# 245 subfield n must be preceded by . (period)
645	4					82	for (my $index = 2; $index <=$#newsubfields; $index+=2) {
646							#report error if subfield 'n' is not preceded by non-space-period or dash-space-period
647	6	100	100			53	if (($newsubfields[$index] eq 'n') && ($newsubfields[$index-1] !~ /(\S\.$)\|(\-\- \.$)/)) {
648	1					4	$self->warn ( "245: Subfield _n must be preceded by . (period).");
649							} #if
650							} #for
651							} # subfield n exists
652
653							#each subfield p, if present, must be preceded by a , (no-space-comma) if it follows subfield n, or by . (no-space-period or dash-space-period) following other subfields
654	49	100				844	if ($field->subfield("p")) {
655
656							# 245 subfield p must be preceded by . (period) or , (comma)
657	4					82	for (my $index = 2; $index <=$#newsubfields; $index+=2) {
658							#only looking for subfield p
659	6	100				59	if ($newsubfields[$index] eq 'p') {
660							# case for subfield 'n' being field before this one (allows dash-space-comma)
661	4	100	100			49	if (($newsubfields[$index-2] eq 'n') && ($newsubfields[$index-1] !~ /(\S,$)\|(\-\- ,$)/)) {
		100	100
662	1					4	$self->warn ( "245: Subfield _p must be preceded by , (comma) when it follows subfield _n.");
663							} #if subfield n precedes this one
664							# elsif case for subfield before this one is not n
665							elsif (($newsubfields[$index-2] ne 'n') && ($newsubfields[$index-1] !~ /(\S\.$)\|(\-\- \.$)/)) {
666	1					3	$self->warn ( "245: Subfield _p must be preceded by . (period) when it follows a subfield other than _n.");
667							} #elsif subfield p preceded by non-period when following a non-subfield 'n'
668							} #if index is looking at subfield p
669							} #for
670							} # subfield p exists
671
672							######################################
673							#check for invalid 2nd indicator
674	49					823	$self->_check_article($field);
675
676							} # check_245
677
678
679
680
681							############
682							# Internal #
683							############
684
685							=head2 _check_article
686
687							Check of articles is based on code from Ian Hamilton. This version is more
688							limited in that it focuses on English, Spanish, French, Italian and German
689							articles. Certain possible articles have been removed if they are valid English
690							non-articles. This version also disregards 008_language/041 codes and just uses
691							the list of articles to provide warnings/suggestions.
692
693							source for articles = L
694
695							Should work with fields 130, 240, 245, 440, 630, 730, and 830. Reports error if
696							another field is passed in.
697
698							=cut
699
700							sub _check_article {
701
702	49			49		78	my $self = shift;
703	49					119	my $field = shift;
704
705							#add articles here as needed
706							##Some omitted due to similarity with valid words (e.g. the German 'die').
707	49					868	my %article = (
708							'a' => 'eng glg hun por',
709							'an' => 'eng',
710							'das' => 'ger',
711							'dem' => 'ger',
712							'der' => 'ger',
713							'ein' => 'ger',
714							'eine' => 'ger',
715							'einem' => 'ger',
716							'einen' => 'ger',
717							'einer' => 'ger',
718							'eines' => 'ger',
719							'el' => 'spa',
720							'en' => 'cat dan nor swe',
721							'gl' => 'ita',
722							'gli' => 'ita',
723							'il' => 'ita mlt',
724							'l' => 'cat fre ita mlt',
725							'la' => 'cat fre ita spa',
726							'las' => 'spa',
727							'le' => 'fre ita',
728							'les' => 'cat fre',
729							'lo' => 'ita spa',
730							'los' => 'spa',
731							'os' => 'por',
732							'the' => 'eng',
733							'um' => 'por',
734							'uma' => 'por',
735							'un' => 'cat spa fre ita',
736							'una' => 'cat spa ita',
737							'une' => 'fre',
738							'uno' => 'ita',
739							);
740
741							#add exceptions here as needed
742							# may want to make keys lowercase
743	49					565	my %exceptions = (
744							'A & E' => 1,
745							'A & ' => 1,
746							'A-' => 1,
747							'A+' => 1,
748							'A is ' => 1,
749							'A isn\'t ' => 1,
750							'A l\'' => 1,
751							'A la ' => 1,
752							'A posteriori' => 1,
753							'A priori' => 1,
754							'A to ' => 1,
755							'El Nino' => 1,
756							'El Salvador' => 1,
757							'L is ' => 1,
758							'L-' => 1,
759							'La Salle' => 1,
760							'Las Vegas' => 1,
761							'Lo cual' => 1,
762							'Lo mein' => 1,
763							'Lo que' => 1,
764							'Los Alamos' => 1,
765							'Los Angeles' => 1,
766							);
767
768							#get tagno to determine which indicator to check and for reporting
769	49					137	my $tagno = $field->tag();
770							#retrieve tagno from subfield 6 if 880 field
771	49	100				270	if ($tagno eq '880') {
772	1	50				3	if ($field->subfield('6')) {
773	1					19	my $sub6 = $field->subfield('6');
774	1					18	$tagno = substr($sub6, 0, 3);
775							} #if subfield 6
776							} #if 880 field
777
778							#$ind holds nonfiling character indicator value
779	49					79	my $ind = '';
780							#$first_or_second holds which indicator is for nonfiling char value
781	49					72	my $first_or_second = '';
782	49	50				395	if ($tagno !~ /^(?:130\|240\|245\|440\|630\|730\|830)$/) {
		50
		50
783	0					0	print $tagno, " is not a valid field for article checking\n";
784	0					0	return;
785							} #if field is not one of those checked for articles
786							#130, 630, 730 => ind1
787							elsif ($tagno =~ /^(?:130\|630\|730)$/) {
788	0					0	$ind = $field->indicator(1);
789	0					0	$first_or_second = '1st';
790							} #if field is 130, 630, or 730
791							#240, 245, 440, 830 => ind2
792							elsif ($tagno =~ /^(?:240\|245\|440\|830)$/) {
793	49					144	$ind = $field->indicator(2);
794	49					502	$first_or_second = '2nd';
795							} #if field is 240, 245, 440, or 830
796
797
798							#report non-numeric non-filing indicators as invalid
799	49	50				158	$self->warn ( $tagno, ": Non-filing indicator is non-numeric" ) unless ($ind =~ /^[0-9]$/);
800							#get subfield 'a' of the title field
801	49		100			130	my $title = $field->subfield('a') \|\| '';
802
803
804	49					928	my $char1_notalphanum = 0;
805							#check for apostrophe, quote, bracket, or parenthesis, before first word
806							#remove if found and add to non-word counter
807	49					172	while ($title =~ /^["'\[\(*]/){
808	4					5	$char1_notalphanum++;
809	4					18	$title =~ s/^["'\[\(*]//;
810							}
811							# split title into first word + rest on space, parens, bracket, apostrophe, quote, or hyphen
812	49					240	my ($firstword, $separator, $etc) = $title =~ /^([^ \[\]'"\-]+)([ \[\]'"\-])?(.*)/i;
813	49	100				124	$firstword = '' if ! defined( $firstword );
814	49	100				109	$separator = '' if ! defined( $separator );
815	49	100				107	$etc = '' if ! defined( $etc );
816
817							#get length of first word plus the number of chars removed above plus one for the separator
818	49					83	my $nonfilingchars = length($firstword) + $char1_notalphanum + 1;
819
820							#check to see if first word is an exception
821	49					101	my $isan_exception = 0;
822	49					243	$isan_exception = grep {$title =~ /^\Q$_\E/i} (keys %exceptions);
	1078					7953
823
824							#lowercase chars of $firstword for comparison with article list
825	49					153	$firstword = lc($firstword);
826
827	49					64	my $isan_article = 0;
828
829							#see if first word is in the list of articles and not an exception
830	49	100	100			206	$isan_article = 1 if (($article{$firstword}) && !($isan_exception));
831
832							#if article then $nonfilingchars should match $ind
833	49	100				102	if ($isan_article) {
834							#account for quotes, apostrophes, parens, or brackets before 2nd word
835							# if (($separator eq ' ') && ($etc =~ /^['"]/)) {
836	9	100	66			47	if (($separator) && ($etc =~ /^[ \[\]'"\-]+/)) {
837	4					18	while ($etc =~ /^[ "'\[\]*]/){
838	6					7	$nonfilingchars++;
839	6					28	$etc =~ s/^[ "'\[\]*]//;
840							} #while etc starts with nonfiling chars
841							} #if separator defined and etc starts with nonfiling chars
842							#special case for 'en' (unsure why)
843	9	50				88	if ($firstword eq 'en') {
		100
844	0	0	0			0	$self->warn ( $tagno, ": First word, , $firstword, may be an article, check $first_or_second indicator ($ind)." ) unless (($ind eq '3') \|\| ($ind eq '0'));
845							}
846							elsif ($nonfilingchars ne $ind) {
847	3					13	$self->warn ( $tagno, ": First word, $firstword, may be an article, check $first_or_second indicator ($ind)." );
848							} #unless ind is same as length of first word and nonfiling characters
849							} #if first word is in article list
850							#not an article so warn if $ind is not 0
851							else {
852	40	100				542	unless ($ind eq '0') {
853	1					6	$self->warn ( $tagno, ": First word, $firstword, does not appear to be an article, check $first_or_second indicator ($ind)." );
854							} #unless ind is 0
855							} #else not in article list
856
857							#######################################
858
859							} #_check_article
860
861
862							############
863
864							=head1 SEE ALSO
865
866							Check the docs for L. All software links are there.
867
868							=head1 TODO
869
870							=over 4
871
872							=item * Subfield 6
873
874							For subfield 6, it should always be the 1st subfield according to MARC 21 specifications. Perhaps a generic check should be added that warns if subfield 6 is not the 1st subfield.
875
876							=item * Subfield 8.
877
878							This subfield could be the 1st or 2nd subfield, so the code that checks for the 1st few subfields (check_245, check_250) should take that into account.
879
880							=item * Subfield 9
881
882							This subfield is not officially allowed in MARC, since it is locally defined. Some way needs to be made to allow messages/warnings about this subfield to be turned off (or otherwise deal with records using/allowing locally defined subfield 9).
883
884							=item * 008 length and presence check
885
886							Currently, 008 validation is not implemented in MARC::Lint, but is left to MARC::Errorchecks. It might be useful if MARC::Lint's basic validation checks included a verification that the 008 exists and is exactly 40 characters long. Additional 008-related checking and byte validation would remain in MARC::Errorchecks.
887
888							=item * ISBN and ISSN checking
889
890							020 and 022 fields are validated with the C and
891							C modules, respectively. Business::ISBN versions between 2 and
892							2.02_01 are incompatible with MARC::Lint.
893
894							=item * check_041 cleanup
895
896							Splitting subfield code strings every 3 chars could probably be written more efficiently.
897
898							=item * check_245 cleanup
899
900							The article checking in particular.
901
902							=item * Method for turning off checks
903
904							Provide a way for users to skip checks more easily when using check_record, or a
905							specific check_xxx method (e.g. skip article checking).
906
907							=back
908
909							=head1 LICENSE
910
911							This code may be distributed under the same terms as Perl itself.
912
913							Please note that these modules are not products of or supported by the
914							employers of the various contributors to the code.
915
916							=cut
917
918							# Used only to read the stuff from __DATA__
919							sub _read_rules {
920	6			6		14	my $self = shift;
921
922	6					38	my $tell = tell(DATA); # Stash the position so we can reset it for next time
923
924	6					26	local $/ = "";
925	6					87	while ( my $tagblock = ) {
926	1428					7121	my @lines = split( /\n/, $tagblock );
927	1428					36566	s/\s+$// for @lines;
928
929	1428	100				3391	next unless @lines >= 4; # Some of our entries are tag-only
930
931	1326					1972	my $tagline = shift @lines;
932	1326					4493	my @keyvals = split( /\s+/, $tagline, 3 );
933	1326					1874	my $tagno = shift @keyvals;
934	1326					1905	my $repeatable = shift @keyvals;
935
936	1326					3232	$self->_parse_tag_rules( $tagno, $repeatable, @lines );
937							} # while
938
939							# Set the pointer back to where it was, in case we do this again
940	6					54	seek( DATA, $tell, 0 );
941							}
942
943							sub _parse_tag_rules {
944	1326			1326		1811	my $self = shift;
945	1326					1760	my $tagno = shift;
946	1326					1743	my $repeatable = shift;
947	1326					4045	my @lines = @_;
948
949	1326		50			6795	my $rules = ($self->{_rules}->{$tagno} \|\|= {});
950	1326					2780	$rules->{'repeatable'} = $repeatable;
951
952	1326					2121	for my $line ( @lines ) {
953	15816					48805	my @keyvals = split( /\s+/, $line, 3 );
954	15816					22185	my $key = shift @keyvals;
955	15816					21935	my $val = shift @keyvals;
956
957							# Do magic for indicators
958	15816	100				30593	if ( $key =~ /^ind/ ) {
959	2652					5335	$rules->{$key} = $val;
960
961	2652					3404	my $desc;
962							my $regex;
963
964	2652	100				4859	if ( $val eq "blank" ) {
965	1620					1944	$desc = "blank";
966	1620					5282	$regex = qr/^ $/;
967							} else {
968	1032					1786	$desc = _nice_list($val);
969	1032					2123	$val =~ s/^b/ /;
970	1032					10205	$regex = qr/^[$val]$/;
971							}
972
973	2652					6092	$rules->{$key."_desc"} = $desc;
974	2652					7486	$rules->{$key."_regex"} = $regex;
975							} # if indicator
976							else {
977	13164	100				20903	if ( $key =~ /(.)-(.)/ ) {
978	18					48	my ($min,$max) = ($1,$2);
979	18					367	$rules->{$_} = $val for ($min..$max);
980							} else {
981	13146					43364	$rules->{$key} = $val;
982							}
983							} # not an indicator
984							} # for $line
985							}
986
987
988							sub _nice_list {
989	1032			1032		1511	my $str = shift;
990
991	1032	100				2504	if ( $str =~ s/(\d)-(\d)/$1 thru $2/ ) {
992	66					188	return $str;
993							}
994
995	966					2214	my @digits = split( //, $str );
996	966	100				2141	$digits[0] = "blank" if $digits[0] eq "b";
997	966					1329	my $last = pop @digits;
998	966					3304	return join( ", ", @digits ) . " or $last";
999							}
1000
1001							sub _ind_regex {
1002	0			0			my $str = shift;
1003
1004	0	0					return qr/^ $/ if $str eq "blank";
1005
1006	0						return qr/^[$str]$/;
1007							}
1008
1009
1010							1;
1011
1012							__DATA__