File Coverage

blib/lib/HTTP/OAIPMH/Validator.pm

Criterion	Covered	Total	%
statement	34	36	94.4
branch			n/a
condition			n/a
subroutine	12	12	100.0
pod			n/a
total	46	48	95.8

line	stmt	sub	time	code
1				package HTTP::OAIPMH::Validator;
2
3				=head1 NAME
4
5				HTTP::OAIPMH::Validator - OAI-PMH validator class
6
7				=head1 SYNOPSIS
8
9				Validation suite for OAI-PMH data providers that checks for responses
10				in accord with OAI-PMH v2
11				L<http://www.openarchives.org/OAI/2.0/openarchivesprotocol.htm>.
12
13				Typical use:
14
15				use HTTP::OAIPMH::Validator;
16				use Try::Tiny;
17				my $val = HTTP::OAIPMH::Validator->new( base_url=>'http://example.com/oai' );
18				try {
19				$val->run_complete_validation;
20				} catch {
21				warn "oops, validation didn't run to completion: $_\n";
22				};
23				print "Validation status of data provider ".$val->base_url." is ".$val->status."\n";
24
25				=cut
26
27	1	1	37934	use strict;
	1		1
	1		47
28
29				our $VERSION = '1.05';
30
31	1	1	3	use base qw(Class::Accessor::Fast);
	1		2
	1		476
32	1	1	2335	use Data::UUID;
	1		566
	1		49
33	1	1	361	use Date::Manip;
	1		100517
	1		119
34	1	1	397	use HTTP::Request; # for rendering http queries
	1		732
	1		20
35	1	1	5	use HTTP::Headers;
	1		1
	1		18
36	1	1	431	use HTTP::Request::Common; # makes POST easier
	1		1597
	1		48
37	1	1	5	use HTTP::Status; # for checking error codes
	1		1
	1		218
38	1	1	667	use LWP::UserAgent; # send http requests
	1		9853
	1		27
39	1	1	494	use LWP::Protocol::https; # explicit include so we fail without https support
	1		67356
	1		43
40	1	1	7	use URI::Escape; # excape special characters
	1		1
	1		53
41	1	1	764	use XML::DOM;
	0
	0
42				use HTTP::OAIPMH::Log;
43
44				=head2 METHODS
45
46				=head3 new(%args)
47
48				Create new HTTP::OAIPMH::Validator object and initialize counters.
49
50				The following instance variables may be set via %args and have read-write
51				accessors (via L<Class::Accessor::Fast>):
52
53				base_url - base URL of the data provider being validated
54				run_id - UUID identifying the run (will be generated if none supplied)
55				protocol_version - protocol version supported
56				admin_email - admin email extracted from Identify response
57				granularity - datestamp granularity (defaults to 'days', else 'seconds')
58				uses_https - set true if the validator sees an https URL at any stage
59
60				debug - set true to add extra debugging output
61				log - logging object (usually L<HTTP::OAIPMH::Log>)
62				parser - XML DOM parser instance
63
64				identify_response - string of identify response (used for registration record)
65				earliest_datestamp - value extracted from earliestDatestamp in Identify response
66				namespace_id - if the oai-identifier is used then this records the namespace identifier extracted
67				set_names - array of all the set names reported in listSets
68
69				example_record_id - example id used for tests that require a specific identifier
70				example_set_spec - example setSpec ("&set=name") used for tests that require a set
71				example_metadata_prefix - example metadataPrefix which defaults to 'oai_dc'
72
73				=cut
74
# Generate read-write accessors for every instance attribute. The names are
# grouped by role only for readability; the set of accessors is unchanged.
HTTP::OAIPMH::Validator->mk_accessors(
    # Properties of the repository under test
    qw( base_url protocol_version admin_email granularity uses_503 uses_https ),
    # Run control and HTTP/XML plumbing
    qw( debug parser run_id ua allow_https doc save_all_responses ),
    qw( response_number http_timeout max_retries max_size ),
    # Reference documents cited in log messages
    qw( protocol guidelines ),
    # Values extracted from responses
    qw( identify_response earliest_datestamp namespace_id set_names ),
    # Example values reused by later tests
    qw( example_record_id example_set_spec example_metadata_prefix ),
    # Reporting
    qw( log status ),
);
84
# Constructor. Builds a validator populated with the defaults below, lets any
# key/value pairs in the argument list override them, and then makes sure the
# object has a run id and a user agent.
#
# May be called on the class or on an existing instance; in the latter case
# only the instance's class is reused, none of its state is copied.
#
# Returns the new blessed object.
sub new {
    my $this  = shift;
    my $class = ref($this) || $this;
    my $self  = {
        'base_url'         => undef,
        'protocol_version' => undef,
        # Repository features extracted
        'granularity' => 'days',    # can also be "seconds"
        'uses_503'    => 0,         # set true if 503 responses ever used
        'uses_https'  => 0,         # set to true if https is ever used
        # Control
        'debug'              => 0,
        'parser'             => XML::DOM::Parser->new(),
        'run_id'             => undef,
        'ua'                 => undef,
        'allow_https'        => 0,          # allow https URIs
        'doc'                => undef,      # current parsed xml document
        'save_all_responses' => 0,          # set True to save all HTTP responses
        'response_number'    => 1,          # initial response number
        'http_timeout'       => 600,
        'max_retries'        => 5,          # number of 503's in a row that we will accept
        'max_size'           => 100000000,  # max response size in bytes (100MB)
        'protocol'   => 'http://www.openarchives.org/OAI/2.0/openarchivesprotocol.htm', #URL of protocol spec
        'guidelines' => 'http://www.openarchives.org/OAI/2.0/guidelines-repository.htm', #URL of repository guidelines doc
        # Results
        'namespace_id'            => undef,
        'set_names'               => [],
        'example_record_id'       => undef,
        'example_set_spec'        => undef,
        'example_metadata_prefix' => 'oai_dc',
        'log'                     => HTTP::OAIPMH::Log->new(),
        'status'                  => 'unknown',
        # Caller-supplied values come last so that they win over the defaults
        @_,
    };
    bless($self, $class);
    # Only generate what the caller did not supply
    $self->setup_run_id     if (not $self->run_id);
    $self->setup_user_agent if (not $self->ua);
    return($self);
}
123
124				=head3 setup_run_id()
125
126				Set a UUID for the run_id.
127
128				=cut
129
# Generate a fresh UUID, lower-case its string form and store it as run_id.
# Returns whatever the run_id setter returns.
sub setup_run_id {
    my ($self) = @_;
    my $generator = Data::UUID->new;
    my $uuid      = lc( $generator->to_string( $generator->create ) );
    return $self->run_id($uuid);
}
135
136				=head3 setup_user_agent()
137
138				Set up L<LWP::UserAgent> for the validator.
139
140				=cut
141
# Create the LWP::UserAgent used for all HTTP requests, configure it from
# this object's settings and store it in the ua attribute.
# Returns whatever the ua setter returns.
sub setup_user_agent {
    my ($self) = @_;
    my $agent = LWP::UserAgent->new();
    # Response timeout in seconds, taken from http_timeout
    $agent->timeout( $self->http_timeout );
    # Size limit ##seems to break http://eprints.soton.ac.uk/perl/oai2 [Simeon/2005-06-06]
    $agent->max_size( $self->max_size );
    # No automatic redirects: the validator follows them manually
    $agent->requests_redirectable( [] );
    $agent->agent('OAIPMH_Validator');
    # Default From: header points at the discussion group
    $agent->from('https://groups.google.com/d/forum/oai-pmh');
    return $self->ua($agent);
}
152
153
154				=head3 abort($msg)
155
156				Special purpose "die" routine because tests cannot continue. Logs
157				failure and then dies.
158
159				=cut
160
# Record a fatal problem and stop: logs "ABORT: $msg" as a failure, sets the
# status to FAILED and then dies with the same text plus a trailing newline.
# Never returns.
sub abort {
    my ($self, $msg) = @_;
    my $text = 'ABORT: '.$msg;
    $self->log->fail($text);
    $self->status('FAILED');
    die($text."\n");
}
168
169
170				=head3 run_complete_validation($skip_identify)
171
172				Run all tests for a complete validation and return true if the data provider passes,
173				false otherwise. All actions are logged and may be accessed to provide a report
174				(including warnings that do not indicate failure) after the run.
175
176				Arguments:
177				$skip_identify - set true to skip the test_identify() step
178
179				=cut
180
# Run the whole validation suite in order and record the overall status.
#
# Arguments:
#   $skip_identify - if true, the test_identify() step is not run
#
# Returns true if the log contains no failures at the end of the run, false
# otherwise. Also sets status to 'COMPLIANT' or 'FAILED' accordingly. Any of
# the individual tests may die (via abort) before the end is reached.
sub run_complete_validation {
    my $self=shift;
    my ($skip_identify)=@_;

    $self->response_number(1);
    $self->test_identify unless ($skip_identify);
    $self->test_list_sets;
    $self->test_list_identifiers;

    my ($formats, $gotDC) = $self->test_list_metadata_formats;

    # If the repository doesn't support oai_dc then this is a failure (because
    # the standard demands it) but see whether we can find another metadataPrefix
    # in order to continue the tests
    if ( $gotDC ) {
        $self->log->pass("Data provider supports oai_dc metadataPrefix");
    } else {
        if ($formats and $formats->getLength()>0) {
            $self->example_metadata_prefix( $formats->item(0)->getFirstChild->getData );
            $self->log->fail("Data provider does not support the simple Dublin Core metadata ".
                             "format with metadataPrefix oai_dc. Tests that require a ".
                             "metadataPrefix to be specified will use '".
                             $self->example_metadata_prefix."'");
        } else {
            $self->log->fail("There are no metadata formats available to use with the GetRecord ".
                             "request. The metadataPrefix ".
                             $self->example_metadata_prefix.
                             " will be used for later tests even though it seems unsupported.");
        }
    }

    my ($dateStamp)=$self->test_get_record($self->example_record_id,$self->example_metadata_prefix);
    $self->test_list_records($dateStamp,$self->example_metadata_prefix);

    # Check responses to erroneous queries
    $self->test_expected_errors($self->example_record_id);

    # protocol_version may still be undef here (e.g. when the Identify step
    # was skipped), so test definedness before comparing
    my $version = $self->protocol_version;
    if (defined($version) and $version eq '2.0') {
        $self->test_expected_v2_errors($self->earliest_datestamp,$self->example_metadata_prefix);
        # As of version 2.0, data providers must support HTTP POST requests
        $self->test_post_requests($self->example_metadata_prefix);
    }
    $self->test_resumption_tokens;

    # Getting here with no failures means that the data provider is compliant
    # (there may be warnings which are not counted in num_fail)
    $self->status( $self->log->num_fail==0 ? 'COMPLIANT' : 'FAILED' );
    return($self->log->num_fail==0);
}
231
232
233				=head3 failures()
234
235				Return Markdown summary of failure log entries, along with the appropriate
236				titles and request details. Will return empty string if there are no
237				failures in the log.
238
239				=cut
240
# Thin wrapper: hand back whatever the log object's failures() method
# produces for this run.
sub failures {
    my ($self) = @_;
    my $log = $self->log;
    return $log->failures();
}
245
246
247				=head3 summary()
248
249				Return summary statistics for the validation in Markdown (designed to agree
250				with conversion to HTML by L).
251
252				=cut
253
# Build a short Markdown summary of the run from the log counters and the
# features recorded on this object.
#
# The heading says 'failure' if the log has any failures, 'success'
# otherwise. A namespace line is included only when namespace_id is set; a
# namespace containing a '.' is reported as a v2.0 oai-identifier namespace,
# anything else as a v1.1 repositoryIdentifier.
#
# Returns the summary as a string.
sub summary {
    my $self=shift;

    my $sf=($self->log->num_fail>0?'failure':'success');

    my $str="\n## Summary - $sf\n\n";
    my $namespace_id = $self->namespace_id;
    if ($namespace_id) {
        if ($namespace_id=~/\./) { #v2.0
            $str.=" * Namespace declared for v2.0 oai-identifiers is $namespace_id\n";
        } else { #v1.1
            $str.=" * Namespace declared for v1.1 oai-identifiers (the repositoryIdentifier) is $namespace_id\n";
        }
    }
    $str.=" * Uses 503 for flow control\n" if ($self->uses_503);
    $str.=" * Uses https URIs (not specified in protocol)\n" if ($self->uses_https);
    $str.=" * Total tests passed: ".$self->log->num_pass."\n";
    $str.=" * Total warnings: ".$self->log->num_warn."\n";
    $str.=" * Total error count: ".$self->log->num_fail."\n";
    # Fall back to 'unknown' if no status has been recorded
    $str.=" * Validation status: ".($self->status || 'unknown')."\n";
    return($str);
}
276
277
278				=head2 METHODS TESTING SPECIFIC OAI-PMH VERBS
279
280				=head3 test_identify()
281
282				Check response to an Identify request. Returns false if tests cannot
283				continue, true otherwise.
284
285				Side effects based on values extracted:
286
287				- $self->admin_email set to email extracted from adminEmail element
288				- $self->granularity set to 'days' or 'seconds'
289
290				=cut
291
# Check the response to an Identify request.
#
# Steps, in order:
#   1. Request "<base_url>?verb=Identify"; abort if the HTTP request fails
#      (with an extra explanation for a 301 redirect) or cannot be parsed.
#   2. Require an OAI-PMH root element containing an Identify element.
#   3. Extract the admin email (stored in admin_email) and check the
#      protocol version.
#   4. Store the raw response in identify_response and compare the baseURL
#      element with the base_url being tested.
#   5. For protocol version 2.0, parse the granularity element.
#   6. Extract and check earliestDatestamp.
#   7. If an oai-identifier description is present, validate its namespace
#      declaration and repositoryIdentifier, storing the latter in
#      namespace_id when it is well formed.
#
# Returns true if no problem that prevents continuing was found, false
# otherwise (an empty return is used for a missing or empty
# repositoryIdentifier). Dies via abort() on fatal problems.
sub test_identify {
    my $self=shift;

    my $cantContinue=0;
    $self->log->start("Checking Identify response");

    # Send the verb request to the base URL
    my $burl=$self->base_url;
    my $req = $burl."?verb=Identify";

    my $response = $self->make_request($req); #don't use make_request_and_validate() just do simplest thing here
    unless ($response->is_success) {
        my $r="Server at base URL '$burl' failed to respond to Identify. The HTTP GET request with URL $req received response code '".$response->code()."'.";
        if ($response->code() == 301) {
            $self->log->fail("$r HTTP code 301 'Moved Permanently' is not widely supported by ".
                             "harvesters and is anyway inappropriate for registration of a ".
                             "service. If requests must be redirected then an HTTP response 302 ".
                             "may be used as outlined in the guidelines [".
                             $self->guidelines."#LoadBalancing].");
        } else {
            $self->log->fail($r);
        }
        $self->abort("Failed to get Identify response from server at base URL '$burl'.\n");
        return;
    }

    # Parse the XML response
    unless ($self->parse_response($req,$response)) {
        $self->log->fail("Failed to parse Identify response");
        $self->abort("Failed to parse Identify response from server at base URL '$burl'.\n");
    }

    # Check that this really is an Identify response
    my $oaipmhNode=$self->doc->getFirstChild();
    # Skip over comments and processing instructions such as XML stylesheets.
    # Stop if we run out of siblings so that we never call a method on undef.
    while (defined($oaipmhNode) and
           ($oaipmhNode->getNodeType==PROCESSING_INSTRUCTION_NODE() or
            $oaipmhNode->getNodeType==COMMENT_NODE())) {
        $oaipmhNode=$oaipmhNode->getNextSibling();
    }
    unless (defined $oaipmhNode and $oaipmhNode->getNodeName eq 'OAI-PMH') {
        my $found=(defined($oaipmhNode)
                   ? "Found node named '".$oaipmhNode->getNodeName."' instead"
                   : "Found no element node");
        $self->log->fail("Identify response does not have OAI-PMH as root element! ".$found);
        $self->abort("Identify response from server at base URL '$burl' does not have ".
                     "OAI-PMH as root element!\n");
        return;
    }
    my $identifyNode=$oaipmhNode->getElementsByTagName('Identify',0);
    unless ($identifyNode->getLength()>0) {
        my $errorNode=$oaipmhNode->getElementsByTagName('error',0);
        if ($errorNode->getLength()>0) {
            # give specific message if response is an error
            $self->log->fail("Error response to Identify request!\n");
            $self->abort("Error response to Identify request from server at base URL '$burl'.\n");
            return;
        } else {
            $self->log->fail("Identify response does not contain <Identify> block.\n");
            $self->abort("Identify response does not contain Identify block from server at base URL '$burl'.\n");
            return;
        }
    }

    # Extract admin email and protocol version numbers, check
    my ($admin_email,$email_error)=$self->get_admin_email;
    if (not $admin_email or $email_error) {
        $self->abort(($email_error || "Failed to extract adminEmail").", aborting.\n");
        return;
    }
    $self->admin_email($admin_email);
    $self->check_protocol_version; # bails if not Version 2.0

    # URL is valid, Identify response was provided, extract content as string
    $self->identify_response( $response->content );

    my $baseURLs = $self->doc->getElementsByTagName('baseURL');

    # BUG FOUND ON AUGUST 26, 2002: empty baseURL still returns length > 0
    # So it is necessary to explicitly check for an empty element.
    if ( $baseURLs->getLength() > 0 ) {
        my $baseURLText = $baseURLs->item(0)->getFirstChild;
        # An empty element has no child; treat it as an empty string so that
        # it is reported as a mismatch below
        my $baseURL = ($baseURLText ? $baseURLText->getData : '');

        # $burl is the one given on the form; $baseURL is the one in the XML doc.
        if ($burl eq $baseURL) {
            $self->log->pass("baseURL supplied matches the Identify response");
        } else {
            # report the error, but keep the form URL
            # (at least it answered Identify!)
            $self->log->fail("baseURL supplied '$burl' does not match the baseURL in the ".
                             "Identify response '$baseURL'. The baseURL you enter must EXACTLY ".
                             "match the baseURL returned in the Identify response. It must ".
                             "match in case (http://Wibble.org/ does not match http://wibble.org/) ".
                             "and include any trailing slashes etc.");
            $cantContinue++;
        }
    }

    # For Version 2.0, Check for seconds granularity
    if ($self->protocol_version eq '2.0') {
        my $gran_el = $self->doc->getElementsByTagName('granularity');
        if ($self->parse_granularity($gran_el)) {
            $self->log->pass("Datestamp granularity is '".$self->granularity."'");
        } else {
            $cantContinue++;
        }
    }

    # For an exception check new to Version 2.0, extract the earliest date
    # and also check that its granularity is right
    if (my $err=$self->get_earliest_datestamp) {
        $self->log->fail("Bad earliestDatestamp: $err");
        $cantContinue++;
    } else {
        $self->log->pass("Extracted earliestDatestamp ".$self->earliest_datestamp);
    }

    # Check for OAI-identifier description
    my $oaiIds = $self->doc->getElementsByTagName('oai-identifier');
    if ($oaiIds and $oaiIds->getLength()>0) {
        if ($oaiIds->getLength()>1) {
            $self->log->fail("Found more than one oai-identifier element. The intention ".
                             "is that this declaration only be used by repositories ".
                             "declaring the use of a single identifier namespace.");
            $cantContinue++;
        } else {
            $oaiIds=$oaiIds->item(0);

            # Now find out if this is v1.1 or v2.0 (2.0 is assumed unless
            # the namespace declaration says otherwise)
            my $oai_id_version='2.0';
            my $xmlns=$oaiIds->getAttribute('xmlns'); #FIXME this requires default namespace to be set to oai-id
            if (not $xmlns) {
                $self->log->fail("Can't find namespace declaration for the oai-identifier description. ".
                                 "This must be added as ".
                                 "(or 1.1), there will likely also be schema validation weeors. Will ".
                                 "assume that the oai-identifier is version $oai_id_version for ".
                                 "later tests");
            } elsif ($xmlns eq 'http://www.openarchives.org/OAI/2.0/oai-identifier') {
                $oai_id_version='2.0';
                $self->log->pass("oai-identifier description for version $oai_id_version is being used");
            } elsif ($xmlns eq 'http://www.openarchives.org/OAI/1.1/oai-identifier') {
                $oai_id_version='1.1';
                $self->log->pass("oai-identifier description for version $oai_id_version is being used");
            } else {
                $self->log->fail("Unrecognized namespace declaration '$xmlns' for ".
                                 "oai-identifier, expected ".
                                 "http://www.openarchives.org/OAI/2.0/oai-identifier ".
                                 "(for v2.0) or ".
                                 "http://www.openarchives.org/OAI/1.1/oai-identifier ".
                                 "(for v1.1). Assuming version $oai_id_version.");
            }
            my $repoIds = $oaiIds->getElementsByTagName('repositoryIdentifier');
            if ($repoIds) {
                my $temp = $repoIds->item(0);
                if (!defined($temp)) {
                    $self->log->fail("No namespace-identifier (repositoryIdentifier element) in ".
                                     "the oai-identifier block of the Identify description");
                    return;
                }
                my $nsel = $temp->getFirstChild;
                unless ( $nsel ) {
                    # Empty repositoryIdentifier element, squawk loudly
                    $self->log->fail("Empty namespace-identifier (repositoryIdentifier element) in ".
                                     "the oai-identifier block of the Identify description");
                    return;
                }
                my $namespace_id = $nsel->getData;
                # The allowed form of the namespace-identifier depends on the
                # oai-identifier version established above
                if ($oai_id_version eq '2.0') {
                    # v2.0: dot-separated, domain-name-like labels
                    unless ($namespace_id=~/^[a-z][a-z0-9\-]*(\.[a-z][a-z0-9\-]+)+$/i) {
                        $self->log->fail("Bad namespace-identifier (repositoryIdentifier element) ".
                                         "'$namespace_id' in oai-identifier declaration. See section ".
                                         "2.1 of the OAI Identifier specification for details ".
                                         "(http://www.openarchives.org/OAI/2.0/guidelines-oai-identifier.htm).");
                        $cantContinue++;
                    } else {
                        $self->log->pass("namespace-identifier (repositoryIdentifier element) in oai-identifier ".
                                         "declaration is $namespace_id");
                        $self->namespace_id( $namespace_id );
                    }
                } else {
                    # v1.1: a single alphanumeric token
                    unless ($namespace_id=~/^[a-z0-9]+$/i) {
                        $self->log->fail("Bad namespace-identifier (repositoryIdentifier element) ".
                                         "'$namespace_id' in oai-identifier declaration. See section ".
                                         "2.1 of the OAI Identifier specification for details ".
                                         "(http://www.openarchives.org/OAI/1.1/guidelines-oai-identifier.htm).");
                        $cantContinue++;
                    } else {
                        $self->log->pass("namespace-identifier (repositoryIdentifier element) in oai-identifier ".
                                         "declaration is $namespace_id");
                        $self->namespace_id( $namespace_id );
                    }
                }
            }
        }
    }
    return(not $cantContinue);
}
498
499
500				=head3 test_list_sets()
501
502				Check response to the ListSets verb.
503
504				Save the setSpecs for later use.
505
506				Note that any set might be empty. So if test_list_identifiers doesn't
507				get a match, we need to try the second set identifier, and so on.
508				So keep a list of the setSpec elements.
509
510				=cut
511
# Check the response to a ListSets request and remember the set names.
#
# On a usable response, set_names is reset to the list of setSpec values
# found and example_set_spec is set to "&set=<first name>" (or '' when there
# are none). When there are no setSpec elements the response is expected to
# be a noSetHierarchy exception; anything else is logged as a failure.
#
# Returns early (empty return) if there is no valid response; aborts if the
# response cannot be parsed.
sub test_list_sets {
    my $self=shift;

    $self->log->start("Checking ListSets response");
    my $req=$self->base_url."?verb=ListSets";
    my $response = $self->make_request_and_validate("ListSets", $req);
    unless ($response) {
        $self->log->fail("Can't check set names");
        return;
    }

    unless ($self->parse_response($req,$response)) {
        $self->log->fail("Can't parse response");
        $self->abort("failed to parse response to ListSets");
    }

    $self->set_names( [] );
    $self->example_set_spec( '' );
    my $set_elements=$self->doc->getElementsByTagName('setSpec');
    if (not defined($set_elements) or ($set_elements->getLength<1)) {
        # No setSpec elements, so there should be an error element
        my $details={};
        if ($self->is_error_response($details)) {
            if ($details->{'noSetHierarchy'}) {
                $self->log->pass("Repository does not support sets and the is correctly reported with a ".
                                 "noSetHierarchy exception in the ListSets response");
            } else {
                $self->log->fail("Failed to extract any setSpec elements from ListSets ".
                                 "but did not find a noSetHierarchy exception. Found instead a '".
                                 join(', ',keys %{$details})."' exception(s). See <".
                                 $self->protocol."#ListSets>.");
            }
        } else {
            $self->log->fail("Failed to extract any setSpec elements from ListSets but did not ".
                             "find an exception message. If sets are not supported by the ".
                             "repository then the ListSets response must be the noSetHierarchy ".
                             "error. See <".$self->protocol."#ListSets>.");
        }
    } else {
        # Have setSpec elements, record all set names and pick an example set spec
        for (my $j=0; $j<$set_elements->getLength; $j++) {
            # An empty <setSpec/> has no text child: skip it here and let the
            # count check below report the problem instead of dying
            my $text_node=$set_elements->item($j)->getFirstChild;
            next unless ($text_node);
            ##FIXME - should validate each set name
            push(@{$self->set_names},$text_node->getData);
        }
        # Sanity check, did we get the number we expected?
        my $num_sets=scalar(@{$self->set_names});
        if ($num_sets!=$set_elements->getLength) {
            $self->log->fail("Failed to extract the expected number of set names (got ".
                             "$num_sets, expected ".$set_elements->getLength.")");
        }
        if ($num_sets>0) {
            $self->example_set_spec( "&set=".$self->set_names->[0] );
        }
        # Show at most the first three names in the log message
        my $msg='';
        for (my $j=0; $j<$num_sets and $j<3; $j++) { $msg.=" ".$self->set_names->[$j]; }
        $msg.=" ..." if ($num_sets>3);
        $self->log->pass("Extracted $num_sets set names: {$msg }, will use setSpec ".
                         $self->example_set_spec." in tests");
    }
}
573
574
575				=head3 test_list_identifiers()
576
577				Check response to ListIdentifiers and record an example record id in
578				$self->example_record_id to be used in other tests.
579
580				If there are no identifiers, but the response is legal, stop the test with
581				errors=0, number of verbs checked is three.
582
583				As of version 2.0, a metadataPrefix argument is required. Unfortunately
584				we need to call test_list_identifiers first in order to get an id for
585				GetRecord, so we simply use oai_dc.
586
587				=cut
588
# Check the response to ListIdentifiers and pick an example record id.
#
# The request uses metadataPrefix=oai_dc plus the current example_set_spec.
# If that fails and a set was in use, each remaining set name is tried in
# turn, and finally the request is made without any set. The first
# identifier that is not flagged as deleted is stored in example_record_id;
# if a different set produced the response it is stored in example_set_spec.
#
# Aborts (dies) if no response can be obtained or parsed, or if the response
# contains no usable identifier. Returns the value of the example_record_id
# setter on success.
sub test_list_identifiers {
    my $self=shift;

    $self->log->start("Checking ListIdentifiers response");

    ### FIXME -- skip the set= restriction because this code doesn't
    ### FIXME work right for set hierarchies - 2002-10-17
    ### FIXME 2015-01-02 - put/left in, is it OK?
    my $set_spec = $self->example_set_spec;
    my $req = $self->base_url."?verb=ListIdentifiers&metadataPrefix=oai_dc".$set_spec;
    my $response = $self->make_request_and_validate("ListIdentifiers", $req);

    # Note: $response will come back null if an error code was returned
    # An error code of "noRecordsMatch" comes back if specified set is
    # empty. In that case we should drop the set and try again.
    if ( $set_spec and (! $response or $self->is_no_records_match ) ) {
        $self->log->note("Empty set made ListIdentifiers fail - trying other sets...");
        my $m = scalar(@{$self->set_names});
        # Walk through the remaining sets (index 0 was the one just tried)
        # until one of them gives a response. The index must advance on every
        # pass, otherwise a second failing set would be retried forever.
        for (my $i=1; $i<$m and not $response; $i++) {
            $set_spec = "&set=".$self->set_names->[$i];
            $req = $self->base_url."?verb=ListIdentifiers&metadataPrefix=oai_dc".$set_spec;
            $response = $self->make_request_and_validate("ListIdentifiers", $req);
            $self->log->note("Trying set ".$set_spec);
        }
        # If we were successful then set the example_set_spec for any future tests
        if ($response) {
            $self->example_set_spec( $set_spec );
        }
    }

    # None of the sets had any identifiers in them. Try the whole entire
    # list of identifiers.
    if ( $set_spec and !$response ) {
        $self->log->note("Last attempt is without any sets...");
        $req = $self->base_url."?verb=ListIdentifiers&metadataPrefix=oai_dc";
        $response = $self->make_request_and_validate("ListIdentifiers",$req);
    }

    # Now we are for real in trouble if $response is null
    unless ($response) {
        $self->log->fail("No ListIdentifiers response with content");
        $self->log->note("The base URL did not respond to the ListIdentifiers verb.".
                         "Without that, we cannot proceed with the validation test. Exiting.");
        $self->abort("The base URL did not respond to the ListIdentifiers verb. Without that, we cannot proceed with the validation test. Exiting.");
    }

    # Grab the first identifier for later use
    unless ($self->parse_response($req,$response)) {
        $self->log->fail("Can't parse ListIdentifiers response");
        $self->abort("unable to parse response");
    }
    #
    # Now look for the identifier of a non-deleted record
    # If there are no identifiers to be harvested, we cannot complete the
    # validation test.
    #
    # FIXME - this still doesn't solve the problem that there may be no
    # non-deleted items listed in the particular response or partial
    # response that we are looking at [Simeon/2005-07-20]
    #
    my $headers = $self->doc->getElementsByTagName('header');
    my $h;
    my $record_id;
    for ($h=0; $h<$headers->getLength(); $h++) {
        my $hdnode=$headers->item($h);
        my $idnode=$hdnode->getElementsByTagName('identifier',0);
        next unless ($idnode and $idnode->getLength()==1);
        # An empty <identifier/> has no text child and cannot be used
        my $id_text=$idnode->item(0)->getFirstChild;
        next unless ($id_text);
        $record_id=$id_text->getData;
        # Stop at the first record that is not flagged as deleted
        last unless ($hdnode->getAttribute('status') and $hdnode->getAttribute('status') eq 'deleted');
        $self->log->warn("Identifier ".($h+1).", '$record_id', is for a deleted record, skipping");
    }
    # Falling off the end of the loop means no usable identifier was found
    if ($h==$headers->getLength()) {
        # No identifiers were in the ListIdentifiers response. Further testing
        # is not possible.
        $self->log->fail("The response to the ListIdentifiers verb with metadataPrefix oai_dc ".
                         "contained no identifiers. Without at least one identifier, we cannot ".
                         "proceed with the validation tests.");
        $self->abort("No identifiers in response to ListIdentifiers. Without an identifier ".
                     "we cannot proceed with validation tests.");
    }
    $self->log->pass("Good ListIdentifiers response, extracted id '$record_id' for use in future tests.");
    $self->example_record_id( $record_id );
}
673
674
675				=head3 test_list_metadata_formats()
676
677				Vet the verb as usual, and then make sure that Dublin Core is included.
678				In particular, we will check the metadata formats available for "record_id",
679				obtained from checking the ListIdentifier verb.
680				Side effect: save available formats for later use (global "formats").
681				NOTE:if there are no formats, error will be picked up by getRecord
682
683				=cut
684
# Check the response to ListMetadataFormats for the example record and see
# whether oai_dc is among the metadataPrefix values listed.
#
# Returns the list ($formats, $gotDC) where $formats is the node list of
# metadataPrefix elements and $gotDC is 1 if oai_dc was found, 0 otherwise.
# Returns an empty list (after logging a failure) if there is no example
# record id, no valid response, the response cannot be parsed, or it lists
# no formats.
sub test_list_metadata_formats {
    my $self=shift;

    $self->log->start("Checking ListMetadataFormats response");

    # Do we have an example record id to check with?
    my $record_id = $self->example_record_id;
    unless ($record_id) {
        $self->log->fail("Cannot check ListMetadataFormats as we do not have an example id");
        return;
    }

    my $req = $self->base_url."?verb=ListMetadataFormats&identifier=".url_encode($record_id);
    my $response = $self->make_request_and_validate("ListMetadataFormats",$req);
    unless ($response) {
        $self->log->fail("Can't check metadataFormats available for item $record_id, no ".
                         "response to ListMetadataFormats request.");
        return;
    }

    # Check for Dublin Core
    unless ($self->parse_response($req,$response)) {
        $self->log->fail("Can't parse response to ListMetadataFormats request for item $record_id.");
        return;
    }

    my $formats = $self->doc->getElementsByTagName('metadataPrefix');
    unless ($formats->getLength() > 0) {
        $self->log->fail("No metadata formats are listed in the response to a ListMetadataFormats ".
                         "request for item $record_id.");
        return;
    }

    if ($self->debug) {
        $self->log->note("debug: ".$formats->getLength()." formats supported for identifier '$record_id'");
    }
    my $gotDC=0;
    for my $i (0..$formats->getLength()-1) {
        # The prefix is the text child of the element; an empty element has
        # no child and cannot be oai_dc
        my $prefix_text = $formats->item($i)->getFirstChild;
        next unless ($prefix_text);
        # Allow any amount of surrounding whitespace (including none)
        if ( $prefix_text->getData =~ /^\s*oai_dc\s*$/ ) {
            $gotDC = 1;
            last;
        }
    }
    if ($gotDC) {
        $self->log->pass("Good ListMetadataFormats response, includes oai_dc");
    } else {
        $self->log->pass("Good ListMetadataFormats response, BUT DID NOT FIND oai_dc");
    }
    return($formats, $gotDC);
}
737
738
739				=head3 test_get_record($record_id, $format)
740
741				Try to get record $record_id in $format.
742
743				If either $record_id or $format are undef then we have an error
744				right off the bat. Else make the request and return the
745				datestamp of the record.
746
747				=cut
748
# Request one record via GetRecord and check its datestamp and setSpec.
#
# Parameters:
#   $record_id - identifier of the record to request
#   $format    - metadataPrefix in which to request the record
#
# Returns the datestamp string taken from the response (undef if it could
# not be extracted), or nothing when either argument is undefined.
# Calls $self->abort if the request fails, the response cannot be parsed,
# the response is an OAI exception, or no datestamp element is present.
sub test_get_record {
    my $self = shift;
    my ($record_id, $format) = @_;

    $self->log->start("Checking GetRecord response");

    unless (defined $format) {
        $self->log->fail("Skipping GetRecord test as no metadata format is listed as being available.");
        return;
    }
    unless (defined $record_id) {
        $self->log->fail("Skipping GetRecord test as no items are listed as having metadata available.");
        return;
    }

    my $numerr = 0;    # count up non-fatal errors

    my $req = $self->base_url."?verb=GetRecord&identifier=".url_encode($record_id).
              "&metadataPrefix=".url_encode($format);
    my $response = $self->make_request_and_validate("GetRecord", $req);
    unless ($response) {
        $self->log->fail("Can't complete datestamp check for GetRecord");
        $self->abort("Can't complete datestamp check for GetRecord");
    }

    # Save the datestamp for later use by ListRecords.
    # As of version 2.0, Identify response can have a granularity and the
    # datestamp MUST be in the finest granularity supported by the repository
    unless ($self->parse_response($req, $response)) {
        $self->log->fail("Can't parse response");
        $self->abort("Unable to parse response from GetRecord");
    }

    if (my $msg = $self->is_error_response) {
        $self->log->fail("The response to the GetRecord verb was the OAI exception $msg. ".
                         "It is this not possible to extract a valid datestamp for remaining tests");
        $self->abort("Unexpected OAI exception response");
    }

    my $datestamps = $self->doc->getElementsByTagName('datestamp');
    # There should be a record, and hence a datestamp, in the response
    unless ( $datestamps->getLength() > 0 ) {
        $self->log->fail("The response to the GetRecord verb did not have a datestamp, which is ".
                         "needed to continue checking verbs.");
        $self->abort("No datestamp in the response for GetRecord");
    }

    # The eval guards against an empty datestamp element (no text child)
    my $datestamp = undef;
    eval {
        $datestamp = $datestamps->item(0)->getFirstChild->getData;
    };
    if (not defined($datestamp)) {
        $self->log->fail("Failed to extract datestamp from the GetRecord response. See <".
                         $self->protocol."#Dates>.");
        $numerr++;
    } elsif ( my $granularity = $self->get_datestamp_granularity($datestamp) ) {
        $self->log->pass("Datestamp in GetRecord response ($datestamp) has the correct form for ".
                         "$granularity granularity.");
        if ( $granularity eq $self->granularity ) {
            # The granularity in v2.0 must match the finest granularity supported (see sec3.3.2)
            $self->log->pass("Datestamp in GetRecord response ($datestamp) matched the ".
                             $self->granularity." granularity specified in the Identify response. ");
        } else {
            $self->log->fail("Datestamp in GetRecord response ($datestamp) is not consistent ".
                             "with the ".$self->granularity." granularity specified in the ".
                             "Identify response");
            $numerr++;
        }
    } else {
        $self->log->fail("Datestamp in GetRecord response ($datestamp) is not valid. See <".
                         $self->protocol."#Dates>.");
        $numerr++;
    }

    # As of OAI-PMH Version 2.0, GetRecord must return a set spec if the
    # repository supports sets and the item is in a set
    if (not $self->example_set_spec) {
        $self->log->pass("Valid GetRecord response") unless ($numerr > 0);
        return($datestamp);
    }

    my $set_list  = $self->doc->getElementsByTagName('setSpec');
    my $set_value = $self->example_set_spec;
    $set_value =~ s/&set=//;
    $self->log->note("Looking for set '".$set_value."' or a descendant set.") if $self->debug;

    my $found      = 0;
    my $subset_str = '';
    for my $i (0 .. $set_list->getLength - 1) {
        # An empty setSpec element has no text child and cannot match
        my $text = $set_list->item($i)->getFirstChild;
        next unless $text;
        my $s = $text->getData;
        if ($s eq $set_value) {
            $found = 1;
            last;
        }
        # setSpec values may contain regex metacharacters such as '.', '*',
        # '(' and ')', so the value must be quoted before use in a pattern
        if ($s =~ m/^\Q$set_value\E:/) {
            $subset_str = " (implied by a descendant setSpec)";
            $found = 1;
            last;
        }
    }
    if ($found) {
        $self->log->pass("Expected setSpec was returned in the response".$subset_str);
    } else {
        $self->log->fail("Expected setSpec was missing from the response. The GetRecord ".
                         "response for identifier $record_id did not contain a set ".
                         "specification for $set_value");
    }
    return($datestamp);
}
852
853
854				=head3 test_list_records($datestamp,$metadata_prefix)
855
856				Test the response for the ListRecords verb. In addition, if there is
857				no Dublin Core available for this repository, this is an error.
858				(And the error has already been counted in test_get_record)
859				We can still test the verb, however, with one of the available
860				formats found by testGetMetadataFormats. Since the output of
861				ListRecords is likely to be large, use the datestamp of the one
862				record we did retrieve to limit the output.
863
864				=cut
865
# Issue ListRecords (limited by from/until = $datestamp when available) and
# check that the example record is included, following resumptionTokens
# until the record is found or the list ends.
#
# Parameters:
#   $datestamp       - datestamp used for both from and until (may be false,
#                      in which case no date limits are sent)
#   $metadata_prefix - metadataPrefix to request
#
# Returns nothing; results are reported through $self->log.
sub test_list_records {
    my $self = shift;
    my ($datestamp, $metadata_prefix) = @_;

    $self->log->start("Checking ListRecords response");

    my $req = $self->base_url."?verb=ListRecords";
    if ($datestamp) {
        $req .= "&from=".$datestamp."&until=".$datestamp;
    } else {
        $self->log->warn("Omitting datestamp parameter as none was obtained from earlier test");
    }
    # Encode the prefix as the other request builders in this file do
    $req .= "&metadataPrefix=".url_encode($metadata_prefix);
    my $list_not_complete = 1;

    while ($list_not_complete) {
        $list_not_complete = 0;
        my $response = $self->make_request_and_validate("ListRecords", $req);
        unless ($response) {
            # Nothing else to say since we don't do other tests
            return;
        }

        unless ($self->parse_response($req, $response)) {
            $self->log->fail("The ListRecords response was not well formed XML");
            # Without a parsed document the checks below would inspect
            # whatever $self->doc held before this request, so stop here
            return;
        }
        $self->log->pass("Response is well formed");

        # Now check to make sure that we got back the record for the identifier
        # $self->example_record_id if there is one specified, else fail that
        # test.
        my $record_id = $self->example_record_id;
        unless ($record_id) {
            $self->log->fail("Cannot check for correct record inclusion without an example record id");
            return;
        }
        my $details = {};
        if ($self->is_error_response($details)) {
            if ($details->{'noRecordsMatch'}) {
                $self->log->fail("ListRecords response gave a noRecordsMatch error when it should ".
                                 "have included at least the record with identifier $record_id. ".
                                 "The from and until parameters of the request were set to the ".
                                 "datestamp of this record ($datestamp). The from and until parameters ".
                                 "are inclusive, see protocol spec section 2.7.1. The message ".
                                 "included in the error response was: '".
                                 $details->{'noRecordsMatch'}."'");
            } else {
                my @txt = ();
                foreach my $k (keys %$details) {
                    push(@txt, "$k (".$details->{$k}.")");
                }
                $self->log->fail("ListRecords gave an unexpected error response to a request using ".
                                 "from and until datestamps taken from the previous GetRecord response: ".
                                 join(', ', @txt));
            }
        } else {
            my $id_list = $self->doc->getElementsByTagName('identifier');
            my $i;
            my $badly_formed = 0;
            for ($i = 0; $i < $id_list->getLength; $i++) {
                if (my $child = $id_list->item($i)->getFirstChild()) {
                    last if ($child->getData eq $record_id);
                } else {
                    $badly_formed++;
                    last;
                }
            }
            if ($badly_formed) {
                $self->log->fail("ListRecords response badly formed, identifier element for record ".
                                 ($i+1)." is empty");
            } elsif ($i < $id_list->getLength) {
                $self->log->pass("ListRecords response correctly included record with identifier $record_id");
            } elsif (defined(my $token = $self->get_resumption_token)) {
                # Test definedness rather than truth so that a token whose
                # text is "0" is still followed.
                # More responses to come, may just not have got to the
                # record yet... roll around for more:
                $self->log->pass("ListRecords response includes resumptionToken. Haven't found ".
                                 "record with identifier $record_id yet, will continue with resumptionToken...");
                $list_not_complete = 1;
                $req = $self->base_url."?verb=ListRecords&resumptionToken=".url_encode($token);
            } else {
                $self->log->fail("ListRecords response did not include the identifier $record_id ".
                                 "which should have been included because both the from and until ".
                                 "parameters were set to the datestamp of this record ($datestamp). ".
                                 "The from and until parameters are inclusive, see protocol spec ".
                                 "section 2.7.1");
            }
        }
    }
}
956
957
958				=head3 test_resumption_tokens()
959
960				Request an unlimited ListRecords. If there is a resumption token, see
961				if it works. It is an error if resumption is there but doesn't work.
962				Empty resumption tokens are OK -- this ends the list.
963
964				The resumption token is URL-encoded (via url_encode) before it is used in the follow-up request.
965
966				=cut
967
# Request an unlimited ListRecords (metadataPrefix oai_dc) and, if the
# response carries a resumptionToken, check that a follow-up request using
# that token works.
#
# Takes no arguments beyond $self. Returns nothing; results are reported
# through $self->log.
sub test_resumption_tokens {
    my $self = shift;

    $self->log->start("Checking for correct use of resumptionToken (if used)");

    my $req = $self->base_url."?verb=ListRecords&metadataPrefix=oai_dc";
    my $response = $self->make_request($req);

    # was there a resumption token?
    unless ($self->parse_response($req, $response)) {
        $self->log->fail("Can't parse malformed XML in response to ListRecords request. ".
                         "Cannot complete test for correct use of resumptionToken (if used)");
        return;
    }

    my $tokenList = $self->doc->getElementsByTagName('resumptionToken');
    if ( !$tokenList or $tokenList->getLength() == 0 ) {
        $self->log->pass("resumptionToken not used");
        return;
    }
    if ( $tokenList->getLength() > 1 ) {
        $self->log->fail("More than one resumptionToken in response!");
        return;
    }

    # Dig out the resumption token from the document
    my $tokenElement = $tokenList->item(0);

    # Try getting the resumption token, $token will be undefined
    # unless the element has content
    my $token = $tokenElement->getFirstChild;
    my $tokenString;
    if ($token) {
        $tokenString = $token->getData;
    }
    # Test for defined and non-empty, not truth: the string "0" is a
    # legitimate token value
    unless (defined $tokenString and length $tokenString) {
        $self->log->fail("Empty resumption token in response to $req There should never ".
                         "be an empty resumptionToken in response to a request without a ".
                         "resumptionToken argument");
        return;
    }

    # If there is a 'cursor' value given then check that it is
    # correct. It must have the value 0 in the first response.
    # The attribute value is compared against the empty string rather than
    # tested for truth, because the correct value "0" is false in Perl.
    my $cursor = $tokenElement->getAttribute('cursor');
    if (defined $cursor and $cursor ne '') {
        # Require digits so that non-numeric junk does not compare equal to 0
        if ($cursor =~ /^\d+$/ and $cursor == 0) {
            $self->log->pass("A cursor value was supplied with the resumptionToken and it ".
                             "correctly had the value zero in the first response");
        } else {
            $self->log->fail("A cursor value was supplied with the resumptionToken but it ".
                             "did not have the correct value zero for the first response. ".
                             "The value was '$cursor'.");
        }
    }

    $self->log->note("Got resumptionToken ".$tokenString);

    # Try using the resumption token. Before including a resumptionToken in
    # the URL of a subsequent request, we must encode all special characters
    # getData in this version of XML::DOM expands entities
    $req = $self->base_url."?verb=ListRecords&resumptionToken=".url_encode($tokenString);
    $response = $self->make_request($req);
    unless ( $response ) {
        $self->log->fail("Site failed to respond to request using resumptionToken: $req");
        return;
    }
    unless ( $self->parse_response($req, $response) ) {
        $self->log->fail("Response to request is using resumptionToken not valid XML: $req");
        return;
    }

    my $errorList = $self->doc->getElementsByTagName('error');
    if ( $errorList and $errorList->getLength() > 0 ) {
        $self->log->fail("Response to request using resumptionToken was an error code: $req");
        return;
    }

    ###FIXME: put in test for cursor again, should be number of items returned in the
    ###FIXME: first response [Simeon/2005-10-11]

    $self->log->pass("Resumption tokens appear to work");
}
1052
1053
1054				=head2 METHODS CHECKING ERRORS AND EXCEPTIONS
1055
1056				=head3 test_expected_errors($record_id)
1057
1058				Each one of these requests should get a 400 response in OAI-PMH v1.1,
1059				or a 200 response in 2.0, along with a Reason_Phrase. Bump error_count
1060				if this does not hold. Return the number of erroneous responses.
1061
1062				$record_id is a valid record identifier to be used in tests that require
1063				one.
1064
1065				=cut
1066
# Send a series of deliberately invalid requests and check that each one
# produces an acceptable error (HTTP 400 for protocol version 1.1, an OAI
# error code for 2.0).
#
# Parameters:
#   $record_id - a record identifier used by the requests that need one
#
# Returns the number of requests that were not handled correctly. Calls
# $self->abort if $self->protocol_version is neither "1.1" nor "2.0".
sub test_expected_errors {
    my $self = shift;
    my ($record_id) = @_;

    $self->log->start("Checking exception handling (errors)");

    # Encode the identifier as the other request builders in this file do,
    # so that characters in it cannot change the shape of the query string
    my $id_arg = defined($record_id) ? url_encode($record_id) : '';

    my $exclusive_args_note =
        'Either the badArgument and/or badResumptionToken error codes may be reported in this case. If only one is reported then the badArgument error is to be preferred because the resumptionToken and until parameters are exclusive.';

    # Format of entries: [ request_string, [acceptable error codes],
    # note shown if the response is not well-formed XML, note shown if the
    # error code is wrong ]
    # NOTE(review): the XML-parsing note for the invalid"id request reads as
    # if some element markup was lost from it - confirm against upstream.
    my @request_list = (
        [ 'junk', [ 'badVerb' ], '', '' ],
        [ 'verb=junk', [ 'badVerb' ], '', '' ],
        [ 'verb=GetRecord&metadataPrefix=oai_dc', [ 'badArgument' ], '', '' ],
        [ 'verb=GetRecord&identifier='.$id_arg, [ 'badArgument' ], '', '' ],
        [ 'verb=GetRecord&identifier=invalid"id&metadataPrefix=oai_dc', [ 'badArgument','idDoesNotExist' ], 'An XML parsing error may be due to incorrectly including the invalid identifier in the element of your XML error response; only valid arguments should be included. A response that includes ..baseURL.. is not well-formed XML because of the quotation mark (") in the identifier.', 'Either the badArgument or idDoesNotExist error codes would be appropriate to report this case.' ],
        [ 'verb=ListIdentifiers&until=junk', [ 'badArgument' ], '', '' ],
        [ 'verb=ListIdentifiers&from=junk', [ 'badArgument' ], '', '' ],
        [ 'verb=ListIdentifiers&resumptionToken=junk&until=2000-02-05', [ 'badArgument','badResumptionToken' ], '', $exclusive_args_note ],
        [ 'verb=ListRecords&metadataPrefix=oai_dc&from=junk', [ 'badArgument' ], '', '' ],
        [ 'verb=ListRecords&resumptionToken=junk', [ 'badResumptionToken' ], '', '' ],
        [ 'verb=ListRecords&metadataPrefix=oai_dc&resumptionToken=junk&until=1990-01-10', [ 'badArgument','badResumptionToken' ], '', $exclusive_args_note ],
        [ 'verb=ListRecords&metadataPrefix=oai_dc&until=junk', [ 'badArgument' ], '', '' ],
        [ 'verb=ListRecords', [ 'badArgument' ], '', '' ]
    );

    my $n = 0;    # number of requests not handled correctly
    foreach my $rrr ( @request_list ) {
        my ($request_string, $error_codes, $xml_reason, $reason) = @$rrr;
        my $req = $self->base_url.'?'.$request_string;
        my $ok_errors = join(' or ', @$error_codes);

        my $response = $self->make_request($req);

        # TBD: $response->status_line should also be checked? see output from
        # physnet.uni-oldenburg.de/oai/oai.php
        if ($self->protocol_version eq "1.1") {
            if ($response->code ne "400") {
                $self->log->note("Invalid requests which failed to return 400:") if $n == 0;
                $n++;
                $self->log->fail("Expected 400 from: $request_string");
            }
        } elsif ($self->protocol_version eq "2.0") {
            # The document must contain the proper error code
            unless ($self->parse_response($req, $response, $xml_reason)) {
                $self->log->fail("Can't parse malformed response. ".html_escape($xml_reason));
                $n++;
                next;
            }
            # check that the error code is in the error_list
            my $error_elements = $self->doc->getElementsByTagName('error');
            if (my $matching_code = $self->error_elements_include($error_elements, $error_codes)) {
                $self->log->pass("Error response correctly includes error code '$matching_code'");
            } else {
                $self->log->fail("Exception/error response did not contain error code ".
                                 "'$ok_errors' ".html_escape($reason));
                $n++;
            }
        } else {
            $self->log->fail("Invalid protocol version returned");
            $self->abort("test_expected_errors - invalid protocol version");
        }
    }
    my $total = scalar @request_list;
    if ($n == 0) {
        $self->log->pass("All $total error requests properly handled");
    } else {
        $self->log->warn("Only ".($total-$n)." out of $total error requests properly handled");
    }
    return($n);
}
1135
1136
1137				=head3 test_expected_v2_errors($earliest_datestamp,$metadata_prefix)
1138
1139				There are some additional exception tests for OAI-PMH version 2.0.
1140
1141				=cut
1142
# Run the exception checks that only apply to OAI-PMH version 2.0: mixed
# from/until granularities must give badArgument, and a date range that
# ends a year before the earliest datestamp must give noRecordsMatch.
#
# Parameters:
#   $earliest_datestamp - earliest datestamp reported by the repository
#   $metadata_prefix    - metadataPrefix to use in the ListRecords requests
#
# Returns nothing; results are reported through $self->log.
sub test_expected_v2_errors {
    my ($self, $earliest_datestamp, $metadata_prefix) = @_;

    $self->log->start("Checking for version 2.0 specific exceptions");

    my $date_before_earliest = one_year_before($earliest_datestamp);
    my $list_records = "verb=ListRecords&metadataPrefix=".url_encode($metadata_prefix);

    # One entry per request: the query string, the acceptable error codes,
    # and the explanation appended to failure messages
    my @cases = (
        {
            query => $list_records."&from=2002-02-05&until=2002-02-06T05:35:00Z",
            codes => ['badArgument'],
            why   => 'The request has different granularities for the from and until parameters.',
        },
        {
            query => $list_records."&until=$date_before_earliest",
            codes => ['noRecordsMatch'],
            why   => 'The request specified a date one year before the earliestDatestamp given in the Identify response. '.
                     'There should therefore not be any records with datestamps on or before this date and a noRecordsMatch '.
                     'error code should be returned.',
        },
    );

    for my $case (@cases) {
        my ($query, $codes, $why) = @{$case}{qw(query codes why)};

        my $url = $self->base_url."?$query";
        my $reply = $self->make_request($url);

        # The response has to parse before it can be searched for error codes
        if (not $self->parse_response($url, $reply)) {
            $self->log->fail("Error in parsing XML response to exception request: $query");
            next;
        }

        my $errors = $self->doc->getElementsByTagName('error');
        if ( !$errors or $errors->getLength == 0 ) {
            $self->log->fail("Failed to extract error code from the response to request: ".
                             "$query $why");
            next;
        }

        # At least one of the acceptable codes must be present
        my $matched = $self->error_elements_include($errors, $codes);
        if ($matched) {
            $self->log->pass("Error response correctly includes error code '$matched'");
        } else {
            my $wanted = join(' or ', @$codes);
            $self->log->fail("Error code $wanted not found in response but should be given ".
                             "to the request: $query $why");
        }
    }
    return;
}
1186
1187
1188				=head2 METHODS TO TEST USE OF HTTP POST
1189
1190				=head3 test_post_requests()
1191
1192				Test responses to POST requests. Do both the simplest possible -- the Identify
1193				verb -- and a GetRecord request which uses two additional parameters.
1194
1195				=cut
1196
# Exercise HTTP POST handling: first an Identify request, then a GetRecord
# request for the example record when one is known.
#
# Parameters:
#   $metadata_prefix - metadataPrefix sent with the GetRecord request
sub test_post_requests {
    my ($self, $metadata_prefix) = @_;

    $self->log->start("Checking that HTTP POST requests are handled correctly");

    # Simplest possible request: a verb and nothing else
    my %identify = (verb => "Identify");
    $self->test_post_request(1, \%identify);

    my $example_id = $self->example_record_id;
    if (not $example_id) {
        $self->log->fail("Cannot test GetRecord via POST without and example record identifier");
    } else {
        my %get_record = (
            verb             => "GetRecord",
            'identifier'     => $example_id,
            'metadataPrefix' => $metadata_prefix,
        );
        $self->test_post_request(2, \%get_record);
    }
}
1212
1213
1214				# Called just by test_post_requests to actually run the test
1215				#
# Send one POST request to the base URL and log whether the reply is a
# proper response for the requested verb.
#
# Parameters:
#   $num       - test number, used only in log messages
#   $post_data - hash reference of form fields; must include 'verb'
sub test_post_request {
    my ($self, $num, $post_data) = @_;

    my $reply = $self->make_request($self->base_url, $post_data);

    # Anything other than an HTTP success needs no further inspection
    unless ($reply->is_success) {
        return $self->log->fail("POST test $num was unsuccessful. Server returned HTTP Status: '".
                                $reply->status_line."'");
    }

    my $verb = $post_data->{verb};
    if ( $self->is_verb_response($reply, $verb) ) {
        return $self->log->pass("POST test $num for $verb was successful");
    }
    if ( $self->check_error_response($reply) ) {
        return $self->log->fail("POST test $num for $verb was unsuccessful, an OAI error ".
                                "response was received");
    }
    return $self->log->fail("POST test $num for $verb was unsuccessful, got neither a ".
                            "valid response nor an error");
}
1236
1237
1238				=head2 METHODS CHECKING ELEMENTS WITHIN VERB AND ERROR RESPONSES
1239
1240				=head3 check_response_date($req, $doc)
1241
1242				Check responseDate for being in UTC format
1243				(should perhaps also check that it is at least the current day?)
1244
1245				=cut
1246
# Check that the first responseDate element of the current document
# ($self->doc) is a UTC datetime of the form YYYY-MM-DDThh:mm:ssZ.
#
# Parameters:
#   $req, $doc - accepted for interface compatibility; neither is read
#                here, the date is taken from $self->doc
#
# The outcome is reported through $self->log->pass or $self->log->fail.
sub check_response_date {
    my $self = shift;
    my ($req, $doc) = @_;

    my $elements = $self->doc->getElementsByTagName('responseDate');
    # assume rest of validity already checked, just take first
    my $item;
    my $child;
    if ($elements and $item = $elements->item(0) and $child = $item->getFirstChild()) {
        my $date = $child->getData();
        # Anchor the pattern so that extra characters before or after the
        # date are rejected; surrounding whitespace is tolerated
        if ($date =~ /^\s*\d\d\d\d-\d\d-\d\dT\d\d:\d\d:\d\dZ\s*$/) {
            $self->log->pass("responseDate has correct format: $date");
        } else {
            $self->log->fail("Bad responseDate of $date, this is not in UTC DateTime ".
                             "(YYYY-MM-DDThh:mm:ssZ) format");
        }
    } else {
        $self->log->fail("Failed to extract responseDate");
    }
}
1267
1268
1269				=head3 check_schema_name($req, $doc)
1270
1271				Given the response to one of the OAI verbs, make sure that it it
1272				going to be validated against the "official" OAI schema, and not
1273				one that the repository made up for itself. If the response can't
1274				be parsed, or if there is no OAI-PMH element, or if the schema is
1275				incorrect, print an error message and bump the error_count.
1276
1277				Return true if the schema name and date check out, else return undef
1278
1279				=cut
1280
# Check that the OAI-PMH element of the current document ($self->doc)
# pairs the OAI-PMH 2.0 namespace with the official schema location in its
# xsi:schemaLocation attribute.
#
# Parameters:
#   $req - request URL, used only in a failure message
#   $doc - accepted for interface compatibility; not read here
#
# Returns 1 if the pairing is correct, 0 otherwise (after logging a
# failure).
sub check_schema_name {
    my $self = shift;
    my ($req, $doc) = @_;

    my $namespace = 'http://www.openarchives.org/OAI/2.0/';
    my $location  = 'http://www.openarchives.org/OAI/2.0/OAI-PMH.xsd';

    my $elements = $self->doc->getElementsByTagName('OAI-PMH');    # NodeList
    unless ( $elements->getLength() > 0 ) {
        $self->log->fail("Response to $req did not contain a OAI-PMH element");
        return(0);
    }
    my $attributes = $elements->item(0)->getAttributes;               # Node->NamedNodeMap
    my $attr = $attributes->getNamedItem('xsi:schemaLocation');       # Node
    unless ( $attr ) {
        $self->log->fail("No xsi:schemaLocation attribute for the OAI-PMH element was ".
                         "found, expected xsi:schemaLocation=\"$namespace $location\"");
        return(0);
    }
    my $pair = $attr->getNodeValue();    # must pair OAI namespace with schema
    # The URLs are quoted so their dots match literally, at least one
    # whitespace character must separate namespace and location, and the
    # location must end at whitespace or the end of the value
    unless ( $pair =~ /^\s*\Q$namespace\E\s+\Q$location\E(?:\s|$)/ ) {
        $self->log->fail("Error in pairing OAI namespace with schema location, expected: ".
                         "xsi:schemaLocation=\"$namespace $location\" but got $pair");
        return(0);
    }
    return(1);
}
1309
1310
1311				=head3 check_protocol_version
1312
1313				Extract the protocol version being used from the Identify response, check that it is
1314				valid and then abort unless 2.0.
1315
1316				=cut
1317
# Read the protocolVersion element from the current document ($self->doc),
# record it in $self->protocol_version, and abort unless it is 2.0.
#
# Calls $self->abort when the element is missing or empty, when the value
# is not one of 1.0, 1.1 or 2.0, or when it is a valid version other than
# 2.0. Logs a pass for version 2.0.
sub check_protocol_version {
    my $self = shift;

    # getElementsByTagName hands back a list object even when nothing
    # matched, so the length must be checked, and an empty element has no
    # text child to read the value from
    my $x = $self->doc->getElementsByTagName('protocolVersion');
    my $element = ($x and $x->getLength > 0) ? $x->item(0) : undef;
    my $text    = $element ? $element->getFirstChild : undef;
    if (not $text) {
        $self->abort("Unknown protocol version, failed to extract protocolVersion element from Identify response");
        return;    # only reached if abort returns
    }
    my $protocol_version = $text->getData;
    if ($protocol_version ne '2.0' and
        $protocol_version ne '1.1' and
        $protocol_version ne '1.0') {
        $self->abort("Invalid protocol version ($protocol_version)");
    }
    $self->protocol_version( $protocol_version );
    if ($protocol_version ne '2.0') {
        $self->abort("Repository reports OAI-PMH protocol version $protocol_version and will not be validated. Guidelines for upgrading to 2.0 can be found at http://www.openarchives.org/OAI/2.0/migration.htm\n\n");
    }
    $self->log->pass("Correctly reports OAI-PMH protocol version 2.0");
}
1338
1339
1340				=head3 is_verb_response($response,$verb)
1341
1342				Return true if $response is a response for the specified $verb.
1343
1344				FIXME -- need better checks!
1345
1346				=cut
1347
# Decide whether an HTTP response looks like a response to $verb: its
# content must parse and contain exactly one element named after the verb.
#
# Parameters:
#   $response - response object providing ->content
#   $verb     - OAI-PMH verb, also the element name looked for
#
# Returns 1 on a match, nothing otherwise.
sub is_verb_response {
    my ($self, $response, $verb) = @_;

    # A parse failure simply means this is not a valid document
    my $parsed = eval { $self->parser->parse($response->content) };
    return unless $parsed;

    my $matches = $parsed->getElementsByTagName($verb);
    return unless ( $matches and $matches->getLength == 1 );
    return(1);
}
1358
1359
1360				=head3 error_elements_include($error_elements,$error_codes)
1361
1362				Determine whether the list of error elements ($error_elements) includes at least
1363				one of the desired codes. Return string with first matching error code, else
1364				return false/nothing.
1365
1366				Does a sanity check on $error_elements to check that it is set and has length>0
1367				before trying to match, so code calling it can simply do a
1368				getElementsByTagName or similar before calling.
1369
1370				=cut
1371
# Look through a list of error elements for one whose 'code' attribute is
# among the acceptable codes.
#
# Parameters:
#   $error_elements - node list of error elements (may be false or empty)
#   $error_codes    - array reference of acceptable error code strings
#
# Returns the first acceptable code found, or nothing if there is none.
sub error_elements_include {
    my ($self, $error_elements, $error_codes) = @_;

    # sanity check, so callers can pass the result of a lookup directly
    return unless $error_elements;
    my $count = $error_elements->getLength;
    return if $count == 0;

    for my $idx (0 .. $count - 1) {
        foreach my $wanted (@$error_codes) {
            # An element without a code attribute gets a placeholder value
            my $found = $error_elements->item($idx)->getAttribute('code');
            $found = 'no-code-attribute' unless $found;
            $self->log->note("$found =? $wanted") if ($self->debug);
            return($wanted) if ($found eq $wanted);
        }
    }
    return;
}
1386
1387
1388
1389				=head3 check_error_response($response)
1390
1391				Given the response to an HTTP request, check whether it is an
1392				OAI-PMH error message. The $response is a success. If it contains
1393				at least one error element, return 1; if the response cannot be parsed
1394				or contains no error element, return nothing.
1395
1396				FIXME -- need better checks!
1397
1398				FIXME -- need to merge this functionality in with is_error_response
1399
1400				=cut
1401
# Decide whether an HTTP response is an OAI-PMH error response: its
# content must parse and contain at least one error element.
#
# Parameters:
#   $response - response object providing ->content
#
# Returns 1 if an error element is present, nothing otherwise (including
# when the content cannot be parsed).
sub check_error_response {
    my ($self, $response) = @_;

    # Content that cannot be parsed cannot be a valid error response
    my $parsed = eval { $self->parser->parse($response->content) };
    return unless $parsed;

    my $errors = $parsed->getElementsByTagName('error');
    return unless ( $errors and $errors->getLength() > 0 );
    return(1);
}
1412
1413
1414				=head3 get_earliest_datestamp()
1415
1416				A new exception check for Version 2.0 raises noRecordsMatch errorcode
1417				if the set of records returned by ListRecords is empty. This requires
1418				that we know the earliest date in the repository. Also check that the
1419				earliest date matches the specified granularity.
1420
1421				Called only for version 2.0 or greater.
1422
1423				Since the Identify response has already been validated, we know
1424				there is exactly one earliestDatestamp element in the current document.
1425				Extract this value, check it, and if it looks good then set
1426				$self->earliest_datestamp and return false.
1427
1428				If there is an error then return string explaining that.
1429
1430				=cut
1431
# Extract the earliestDatestamp from the current document ($self->doc),
# check it against $self->granularity, and store it in
# $self->earliest_datestamp when it is acceptable.
#
# Returns nothing on success, or a string describing the problem.
sub get_earliest_datestamp {
    my $self = shift;

    my $earliest = $self->doc->getElementsByTagName('earliestDatestamp');
    my $el = $earliest->item(0);
    return("Can't get earliestDatestamp element from Identify response.") unless ($el);
    return("earliestDatestamp element is empty in Identify response.") unless ($el->getFirstChild);

    my $earliest_datestamp = $el->getFirstChild->getData;
    $self->log->note("Earliest datestamp in repository is $earliest_datestamp") if $self->debug;

    # Take the capture from the match result itself. Reading $1..$4 after an
    # unchecked match would pick up values left over from an earlier
    # successful match whenever this one fails.
    my ($time_part) = $earliest_datestamp =~ /^[0-9]{4}-[0-9][0-9]-[0-9][0-9](.*)$/;

    my $error = '';
    if (not defined $time_part) {
        $error = "is not in ISO8601 format";
    } elsif ( $time_part eq '' and $self->granularity eq 'seconds') {
        $error = "must have seconds granularity (format YYYY-MM-DDThh:mm:ssZ) to match ".
                 "the granularity for the repository. The granularity has been set to seconds ".
                 "by the granularity element of the Identify response.\n";
    } elsif ( $time_part ne '' and $self->granularity eq 'days') {
        $error = "must have days granularity (format YYYY-MM-DD) to match the granularity for ".
                 "the repository. The granularity has been set to days by the granularity ".
                 "element of the Identify response (or that element is bad/missing).\n";
    } elsif ( $self->granularity eq 'seconds' and $time_part !~ /^T\d\d:\d\d:\d\d(\.\d+)?Z$/ ) {
        $error = "does not have the correct format for the time part of the UTCdatetime. The ".
                 "overall format must be YYYY-MM-DDThh:mm:ssZ.\n";
    }
    if ($error) {
        # Sanitize for error message
        return("The earliestDatestamp in the identify response (".
               sanitize($earliest_datestamp).") $error");
    } else {
        $self->earliest_datestamp($earliest_datestamp);
        return;
    }
}
1468
1469
1470				=head3 parse_granularity($granularity_element)
1471
1472				Parse contents of the granularity element of the Identify response. Returns either
1473				'days', 'seconds' or nothing on failure. Sets $self->granularity if valid, otherwise
1474				does not change setting.
1475
1476				As of v2.0 the granularity element is mandatory, see:
1477				http://www.openarchives.org/OAI/openarchivesprotocol.html#Identify
1478
1479				=cut
1480
# Interpret the granularity element list from an Identify response.
#
# Parameters:
#   $gran - node list of granularity elements
#
# Sets $self->granularity and returns it ('days' or 'seconds') when there
# is exactly one element with a recognised value. Otherwise logs a failure
# and returns nothing, leaving $self->granularity unchanged.
sub parse_granularity {
    my ($self, $gran) = @_;

    my $count = $gran ? $gran->getLength : 0;
    if ($count == 0) {
        $self->log->fail("Missing granularity element");
        return;
    }
    if ($count > 1) {
        $self->log->fail("Multiple granularity elements");
        return;
    }

    # Map each permitted element value to the name used internally
    my %granularity_for = (
        'YYYY-MM-DD'           => 'days',
        'YYYY-MM-DDThh:mm:ssZ' => 'seconds',
    );

    # schema validation guarantees that there is a spec here
    my $el = $gran->item(0)->getFirstChild->getData;
    my $name = $granularity_for{$el};
    unless (defined $name) {
        $self->log->fail("Bad value for the granularity element '$el', must be either ".
                         "YYYY-MM-DD or YYYY-MM-DDThh:mm:ssZ");
        return;
    }
    $self->granularity($name);
    return($self->granularity);
}
1505
1506
1507				=head3 get_datestamp_granularity($datestamp)
1508
1509				Parse the datestamp supplied and return 'days' if it is valid with granularity
1510				of days, 'seconds' if it is valid for seconds granularity, and nothing if it is not
1511				valid.
1512
1513				# FIXME - should add more validation
1514
1515				=cut
1516
# Classify a datestamp by its granularity.
#
# Parameters:
#   $datestamp - string to examine
#
# Returns 'days' for a valid YYYY-MM-DD value, 'seconds' for a valid
# YYYY-MM-DDThh:mm:ss[.fraction]Z value, and nothing otherwise. Only field
# ranges are checked (month 1-12, day 1-31, hour < 24, minute < 60,
# second < 60); day-of-month is not checked against the month.
sub get_datestamp_granularity {
    my $self = shift;
    my ($datestamp) = @_;
    if ($datestamp =~ /^(\d\d\d\d)-(\d\d)-(\d\d)$/) {
        return 'days' if ($2 >= 1 and $2 <= 12 and $3 >= 1 and $3 <= 31);
    } elsif ($datestamp =~ /^(\d\d\d\d)-(\d\d)-(\d\d)T(\d\d):(\d\d):(\d\d)(\.\d+)?Z$/) {
        # The seconds field is range-checked along with hours and minutes
        return 'seconds' if ($2 >= 1 and $2 <= 12 and $3 >= 1 and $3 <= 31
                             and $4 < 24 and $5 < 60 and $6 < 60);
    }
    return;
}
1527
1528
1529				=head3 is_no_records_match
1530
1531				Returns true if the current document contains an error element with the code "noRecordsMatch"
1532
1533				### FIXME - should be merged into an extended is_error_response
1534
1535				=cut
1536
# Report whether the current document ($self->doc) carries an error element
# with code noRecordsMatch. The decision itself is delegated to
# $self->error_elements_include, whose result is returned as-is.
sub is_no_records_match {
    my ($self) = @_;
    my @wanted_codes = ('noRecordsMatch');
    my $errors = $self->doc->getElementsByTagName('error');
    return $self->error_elements_include($errors, \@wanted_codes);
}
1542
1543
1544				=head3 get_resumption_token()
1545
1546				See if there is a resumptionToken with this response, return
1547				value if present, empty if not or if there is some other error.
1548
1549				=cut
1550
# Fetch the text of the first resumptionToken element in $self->doc.
#
# Returns the token string, or nothing when there is no resumptionToken
# element or when the element has no content.
sub get_resumption_token {
    my ($self) = @_;

    my $candidates = $self->doc->getElementsByTagName('resumptionToken');
    # No resumptionToken element at all
    return unless ($candidates and $candidates->getLength() != 0);

    # The first child is only defined when the element has content
    my $content = $candidates->item(0)->getFirstChild();
    return unless ($content);

    return($content->getData());
}
1569
1570
1571				=head3 is_error_response($details)
1572
1573				Look at the parsed response in $self->doc to see if it is an error response,
1574				parse data and return true if it is.
1575
1576				Returns true (a printable string containing the error messages) if response was a valid
1577				OAI_PMH error response, codes in %$details if a hash reference is passed in.
1578
1579				=cut
1580
# Inspect $self->doc for error elements.
#
# Parameters:
#   $details - optional hash reference; filled with code => message pairs
#              (message is '[NO MESSAGE RETURNED]' for an empty element).
#              Anything that is not a plain hash reference is ignored.
#
# Returns a printable string of the form "[code: message] [code] ..." when
# at least one error element is present, otherwise nothing.
sub is_error_response {
    my ($self, $details) = @_;
    $details = {} if (ref($details) ne 'HASH'); #dummy hash unless one supplied

    my $errors = $self->doc->getElementsByTagName('error');
    return unless ($errors and $errors->getLength() >= 1);

    my $msg = '';
    foreach my $idx (0 .. $errors->getLength - 1) {
        my $error = $errors->item($idx);
        my $code = $error->getAttribute("code");
        my $child = $error->getFirstChild();
        if ($child) {
            $details->{$code} = $child->getData();
            $msg .= "[$code: $details->{$code}] ";
            next;
        }
        # No content: warn, except for the special case of noSetHierarchy
        # where the error code alone really is sufficient
        if ($code ne 'noSetHierarchy') {
            $self->log->warn("No human readable message included in error element for ".
                             "$code error, this is discouraged");
        }
        $details->{$code} = '[NO MESSAGE RETURNED]';
        $msg .= "[$code] ";
    }
    return($msg);
}
1611
1612
1613				=head3 get_admin_email()
1614
1615				Extract admin email from a parsed Identify response in $self->doc.
1616				Also note that the email target may have been set via form option
1617
1618				Returns the pair of ($email,$error) where $email is the combined
1619				set of email addresses (comma separated). $error will be undef
1620				or a string with error message to users.
1621
1622				=cut
1623
# Extract the administrator email address(es) from the adminEmail elements
# of the document in $self->doc.
#
# Every adminEmail element is examined: a "mailto:" prefix is stripped (with
# a warning) and the address is checked with $self->bad_admin_email.
#
# Returns:
#   ($email)      - comma separated addresses on success (a pass is logged)
#   (undef, $msg) - when bad_admin_email rejects an address
#   (undef)       - when there is no adminEmail element, or any adminEmail
#                   element is empty (a failure is logged)
sub get_admin_email {
    my $self = shift;

    my $adminEmailElements = $self->doc->getElementsByTagName('adminEmail');
    my $n = $adminEmailElements ? $adminEmailElements->getLength : 0;
    if ($n == 0) {
        $self->log->fail("No adminEmail element!");
        return(undef);
    }

    my @emails = ();
    for my $i (0 .. $n-1) {
        # Check each element for content, not just the first one, so that
        # an empty later element is reported rather than causing a method
        # call on undef.
        my $text_node = $adminEmailElements->item($i)->getFirstChild;
        unless ($text_node) {
            $self->log->fail("adminEmail element is empty!");
            return(undef);
        }
        my $e = $text_node->getData;
        if ($e =~ s/mailto://g) {
            $self->log->warn("Stripped mailto: prefix from adminEmail address, this ".
                             "should not be included.");
        }
        if (my $msg = $self->bad_admin_email($e)) {
            return(undef,$msg);
        }
        push(@emails,$e);
    }
    my $email = join(',',@emails);
    $self->log->pass("Administrator email address is '$email'");
    return($email);
}
1656
1657
1658				=head3 bad_admin_email($admin_email)
1659
1660				Check for some stupid email addresses to avoid so much bounced email.
1661				Returns a string (True) if bad, else nothing.
1662
1663				=cut
1664
# Sanity check an adminEmail address to avoid obviously undeliverable mail.
#
# Parameters:
#   $admin_email - address to check
#
# Returns a short description string (true) and logs a failure if the
# address is at localhost or does not look like local@domain.tld, otherwise
# returns nothing.
#
# The pattern is deliberately loose but must not reject valid addresses: a
# single-character local part, '+' in the local part and upper-case letters
# in the top-level domain are all accepted. \z is used rather than $ so a
# trailing newline is not silently accepted.
sub bad_admin_email {
    my $self = shift;
    my ($admin_email) = @_;
    $admin_email = '' unless (defined $admin_email);
    if ($admin_email =~ /\@localhost\z/i) {
        $self->log->fail("adminEmail '$admin_email' is local. This must be corrected to a ".
                         "valid globally resolvable email address before tests can continue");
        return("local adminEmail");
    } elsif ($admin_email !~ /\A\w[\w\-\.\+]*\@[a-zA-Z0-9\-\.]+\.[a-zA-Z]{2,}\z/) {
        $self->log->fail("adminEmail '$admin_email' looks bogus. This must be corrected to ".
                         "a valid email address before tests can continue");
        return("looks like bogus adminEmail");
    }
    return;
}
1679
1680
1681				=head2 METHODS FOR MAKING REQUESTS AND PARSING RESPONSES
1682
1683				=head3 make_request_and_validate($verb, $req)
1684
1685				Given the base URL that we are validating, the Verb that we are checking
1686				and the complete query to be sent to the OAI server, get the response to
1687				the verb. Validation has already been done, so we need only do some
1688				special checks here. Return the response to the OAI verb,
1689				or undef if the OAI server failed to respond to that verb.
1690
1691				Side effects: errors may be printed and error_count bumped.
1692				If the verb involved is "Identify" then set the version number and the
1693				email address, assuming that some response has been obtained.
1694
1695				Simple well-formedness is checked by this routine. An undef exit means
1696				that any calling code should fail the test but need not report 'no response'.
1697
1698				If the response is true then $self->doc contains a parsed XML
1699				document.
1700
1701				This is the usual way we make requests with integrated parsing and error
1702				checking. This method is built around calls to L</make_request($url,$post_data)> and
1703				L</parse_response($request_url,$response,$xml_reason)>.
1704
1705				=cut
1706
# Issue the request $req for OAI-PMH verb $verb, parse the response and run
# the standard checks on it.
#
# Parameters:
#   $verb - verb name, used only in the failure message
#   $req  - request URL passed to make_request and the check methods
#
# Returns the response object on success (with the parsed document then
# available from $self->doc), or nothing after logging a failure if the
# request was unsuccessful or the response could not be parsed.
sub make_request_and_validate {
    my ($self, $verb, $req) = @_;

    my $response = $self->make_request($req);

    if (not $response->is_success) {
        # Gather the HTTP level details to help diagnose the failure
        my $status = $response->status_line;
        my $age = $response->current_age;
        my $lifetime = $response->freshness_lifetime;
        my $is_fresh = $response->is_fresh;
        $self->log->fail("Server failed to respond to the $verb request (HTTP header ".
                         "values: status=$status, age=$age, lifetime=$lifetime, ".
                         "is fresh:=$is_fresh)");
        return;
    }

    if (not $self->parse_response($req, $response)) {
        $self->log->fail("Failed to parse response");
        return;
    }

    # responseDate check first, then the check that the response refers to
    # the "official" OAI schema
    $self->check_response_date($req, $self->doc);
    $self->check_schema_name($req, $self->doc);

    return($response);
}
1736
1737
1738				=head3 make_request($url,$post_data)
1739
1740				Routine to GET or POST a request, handle 503's, and return the response
1741
1742				Second parameter, $post_data, must be a hashref of POST data to indicate that
1743				the request should be an HTTP POST request instead of a GET.
1744
1745				=cut
1746
# GET or POST a request, following 503 Retry-After and 302 redirect
# responses, and return the final response object.
#
# Parameters:
#   $url       - URL to request
#   $post_data - optional hash reference of form data; if true the request
#                is made with POST instead of GET
#
# Side effects: sets $self->uses_https / $self->uses_503 when those are
# seen, logs the request, optionally saves every response body to
# /tmp/<run_id>.<response_number>, and sleeps as directed by Retry-After.
#
# Calls $self->abort on fatal conditions (disallowed https, too many
# retries, Retry-After over one hour, bad redirect, 501).
# NOTE(review): the code relies on abort not returning (presumably it
# dies) - confirm against its definition.
#
# If the response carries an X-Content-Range header (taken to mean the size
# limit was hit) a failure is logged and the content is emptied.
sub make_request {
    my $self = shift;
    my ($url,$post_data) = @_;

    # Is this https and do we allow that?
    if (is_https_uri($url)) {
        $self->uses_https(1);
        if (not $self->allow_https) {
            $self->abort("URI $url is https. Use of https URIs is not allowed ".
                         "by the OAI-PMH v2.0 specification");
        }
    }

    my $request;
    if ($post_data) {
        my $content_msg = ''; #nice string to report
        # Sort keys in alpha order for consistent behavior
        foreach my $k (sort keys(%$post_data)) {
            my $v = $post_data->{$k};
            $content_msg .= "$k:$v ";
        }
        $self->log->request($url,'POST',$content_msg);
        $request = POST($url,'Content'=>$post_data);
    } else {
        $self->log->request($url,'GET');
        $request = GET($url);
    }
    my $response;
    my $tries = 0;
    my $try_again = 1;
    while ( $try_again ) {
        $response = $self->ua->request($request);
        #
        # Write response if requested
        if ($self->save_all_responses) {
            my $response_file = "/tmp/".$self->run_id.".".$self->response_number;
            open(my $fh,'>',$response_file)
                or $self->abort("Can't write response $response_file: $!");
            print {$fh} $response->content();
            # Buffered write errors only surface at close, so check it
            close($fh)
                or $self->abort("Can't write response $response_file: $!");
            $self->log->note("Response saved as $response_file") if ($self->debug);
            $self->{response_number}++;
        }
        $tries++;
        my $code = $response->code;
        #
        # Only give up when the response received would require yet another
        # retry. A success (or any other final status) on the last permitted
        # attempt is a perfectly good answer and must not be discarded.
        if (($code eq '503' or $code eq '302') and $tries > $self->max_retries) {
            $self->abort("Too many 503 Retry-After or 302 Redirect responses received in a row");
        }
        #
        # Check response for 503 and 302
        if ($code eq '503') {
            # 503 (Retry-After), expect to get a time too
            $self->uses_503(1);
            my $retryAfter = $response->header("Retry-After");
            if (defined $retryAfter) {
                if ($retryAfter =~ /^\d+$/) {
                    if ($retryAfter <= 3600) {
                        ###FIXME: Should check the Retry-After value carefully and barf if bad
                        my $sleep_time = 1 + $retryAfter;
                        $self->log->note("Status: ".$response->code().
                                         " -- going to sleep for $sleep_time seconds.");
                        sleep $sleep_time;
                    } else {
                        $self->abort("503 response with Retry-After > 1hour (3600s), aborting");
                    }
                } else {
                    $self->log->fail("503 response with bad (non-numeric) Retry-After time, ".
                                     "will wait 10s");
                    sleep 10;
                }
            } else {
                $self->log->warn("503 response without Retry-After time, will wait 10s");
                sleep 10;
            }
        } elsif ($code eq '302') {
            # 302 (Found) redirect
            my $loc = $response->header('Location');
            if (!defined($loc) or $loc eq '') {
                # No Location header at all, nothing to follow
                $self->abort("Bad redirect URI specified in 302 response");
            } elsif ($loc !~ m%^http://([^\?&]+)%) {
                if (is_https_uri($loc)) {
                    $self->uses_https(1);
                    if (not $self->allow_https) {
                        $self->abort("Redirect URI specified in 302 response is https. Use of ".
                                     "https URIs is not allowed by the OAI-PMH v2.0 specification");
                    }
                } else {
                    $self->abort("Bad redirect URI specified in 302 response");
                }
            }
            # Make new request
            if ($post_data and $loc !~ /\?/) { #don't do POST if new Location includes ?
                $request = POST($loc,'Content'=>$post_data);
            } else {
                $request = GET($loc);
            }
        } elsif ($code eq '501') {
            $self->abort("Got 501 Not Implemented response which may either have come from ".
                         "the server or have been generated within the validator because the ".
                         "request type (perhaps https) is not supported.");
        } else {
            $try_again = 0;
        }
    }
    # Check for oversize limit (indicated by X-Content-Range header)
    if (defined $response->header('X-Content-Range')) {
        $self->log->fail("Response to <$url> exceeds maximum size limit (".$self->max_size." bytes), discarded. ".
                         "While this limit is set only in this validation program you should not use excessively ".
                         "large responses as service providers will likely have problems parsing the XML. You ".
                         "should split the responses using the resumptionToken mechanism. (X-Content-Range: '".
                         $response->header('X-Content-Range')."' Content-Length: '".$response->content_length."')\n");
        $response->content('');
    }
    return $response;
}
1859
1860
1861				=head3 parse_response($request_url,$response,$xml_reason)
1862
1863				Attempt to parse the HTTP response $response, examining both the response code
1864				and then attempting to parse the content as XML.
1865
1866				If $xml_reason is specified then this is added to the failure message, if
1867				nothing is specified then a standard message about UTF-8 issues is
1868				added.
1869
1870				Returns true on success and sets $self->doc with the parsed XML document.
1871				If unsuccessful, log an error message, bump the error count, and
1872				return false.
1873
1874				=cut
1875
# Try to parse the content of an HTTP response as XML.
#
# Parameters:
#   $request_url - URL that was requested; the malformed-response warning is
#                  only logged when this is true
#   $response    - response object (must support code and content)
#   $xml_reason  - optional explanation appended to the malformed-response
#                  warning; a standard note about UTF-8 problems is used
#                  when it is not given
#
# Returns 1 and stores the parsed document via $self->doc on success.
# Returns nothing if $response is not an object, the status code starts
# with 4 or 5, or the content cannot be parsed (a warning is logged in the
# first two cases, and in the third when $request_url is true).
sub parse_response {
    my ($self, $request_url, $response, $xml_reason) = @_;
    $xml_reason = '' if (not defined $xml_reason);

    # Must have been given a response object, not undef or a plain string
    unless (defined($response) and ref($response)) {
        $self->log->warn("Bad response from server");
        return;
    }

    # Unpack the bits we want from response object
    my $code = $response->code;
    my $content = $response->content;

    # Check return code (if given)
    if ($code and $code =~ /^[45]/) {
        $self->log->warn("Bad HTTP status code from server: $code");
        return;
    }

    # Check content by attempting to parse it
    my $doc;
    eval { $doc = $self->parser->parse($content); };
    if ($doc) {
        # Set parsed document
        $self->doc( $doc );
        return(1);
    }

    # Parse failed, tidy up the parser error for reporting
    my $err = $@;
    $err =~ s/^\s+//;
    $err =~ s%at\s+/usr/lib/perl.*%%i; #trim stuff about our perl installation
    if ($request_url) {
        if (not $xml_reason) {
            $xml_reason="The most common reason for malformed responses is illegal bytes in ".
                        "UTF-8 streams (e.g. the inclusion of Latin1 characters with codes>127 ".
                        "without creating proper UTF-8 mutli-byte sequences). You might find ".
                        "the utf8conditioner, found on the OAI tools page helpful for debugging.";
        }
        $self->log->warn("Malformed response: $err. $xml_reason");
    }
    return;
}
1918
1919
1920				=head2 UTILITY FUNCTIONS
1921
1922				=head3 html_escape($str)
1923
1924				Escapes characters which have special meanings in HTML
1925
1926				=cut
1927
# Escape the characters that have special meaning in HTML.
#
# Parameters:
#   $string - text to escape
#
# Returns a copy of the text with &, <, >, " and ' replaced by the
# corresponding entity or character reference. The ampersand must be
# handled first so that the entities introduced by the later substitutions
# are not themselves escaped.
sub html_escape {
    my $string = shift;
    $string =~ s/&/&amp;/g; #must be first!
    $string =~ s/</&lt;/g;
    $string =~ s/>/&gt;/g;
    $string =~ s/"/&quot;/g;
    $string =~ s/'/&#39;/g;
    return $string;
}
1937
1938				=head3 one_year_before($date)
1939
1940				Assumes properly formatted date, decrements year by one
1941				via string manipulation and returns date.
1942
1943				=cut
1944
# Return a copy of $date with the leading four-digit year decremented by
# one, using string manipulation only (the rest of the string is kept).
#
# Parameters:
#   $date - date string expected to start with YYYY
#
# Special cases:
#   - A date without a leading four-digit year (or undef) is returned
#     unchanged rather than being corrupted.
#   - Feb 29 becomes Feb 28, because the year before a leap year is never
#     itself a leap year and so has no Feb 29.
sub one_year_before {
    my ($date) = @_;
    return($date) unless (defined $date and $date =~ /^([0-9]{4})/);
    my $year = $1;
    my $year_minus_one = sprintf('%04d',($year - 1)); #make sure we get leading zeros
    $date =~ s/^$year/$year_minus_one/;
    # Avoid generating a non-existent leap day in the previous year
    $date =~ s/^(\Q$year_minus_one\E-02)-29/$1-28/;
    return($date);
}
1952
1953				=head3 url_encode($str)
1954
1955				Escape/encode any characters that aren't in the small safe set for URLs
1956
1957				=cut
1958
# Percent-encode every character of $str other than word characters,
# '/', ',', '-' and space, then turn each space into '+'.
#
# Returns the encoded copy of the string.
sub url_encode {
    (my $encoded = shift) =~ s{([^\w/,\- ])}{sprintf('%%%02X', ord($1))}ge;
    # Spaces were left alone above so that they can become '+' here
    $encoded =~ s/ /+/g;
    return($encoded);
}
1965
1966
1967				=head3 is_https_uri($uri)
1968
1969				Return true if the URI is an https URI, false otherwise.
1970
1971				=cut
1972
# Test whether $uri uses the https scheme.
#
# Parameters:
#   $uri - URI string (may be undef)
#
# Returns true if the URI starts with "https:", false/nothing otherwise.
# The comparison ignores case because URI scheme names are case-insensitive
# (RFC 3986 section 3.1), so "HTTPS://..." is recognised too. An undefined
# URI is simply not https.
sub is_https_uri {
    my $uri = shift;
    return unless (defined $uri);
    return($uri =~ m%^https:%i);
}
1977
1978
1979				=head3 sanitize($str)
1980
1981				Return a sanitized version of $str that doesn't contain odd
1982				characters and it not over 80 chars long. Will have the
1983				string '(sanitized)' appended if changed.
1984
1985				=cut
1986
# Produce a log-safe version of $str: every character outside the allowed
# set (word characters, space and -:;.!@#%^*) becomes '_' and the result is
# cut to at most 80 characters. The marker '(sanitized)' is appended
# whenever the result differs from the input.
sub sanitize {
    my ($str) = @_;
    (my $out = $str) =~ s/[^\w\-:;.!@#%^* ]/_/g;
    $out = substr($out,0,80);
    return($out eq $str ? $out : $out.'(sanitized)');
}
1997
1998
1999				=head1 SUPPORT
2000
2001				Please report any bugs or questions about validation via the
2002				OAI-PMH discussion list at L.
2003				Be sure to make it clear that you are talking about the
2004				HTTP::OAIPMH::Validator module.
2005
2006				=head1 AUTHORS
2007
2008				Simeon Warner, Donna Bergmark
2009
2010				=head1 HISTORY
2011
2012				This module is based on an OAI-PMH validator first written by Donna Bergmark
2013				(Cornell University) in 2001-01 for the OAI-PMH validation and registration
2014				service (L).
2015				Simeon Warner (Cornell University) took over the validator and operation of
2016				the registration service in 2004-01, and then did a significant tidy/rework
2017				of the code. That code ran the validation and registration service with
2018				few changes through 2015-01. Some of the early work on the OAI-PMH validation
2019				service was supported through NSF award number 0127308.
2020
2021				Code was abstracted into this module 2015-01 by Simeon Warner and is
2022				used for the OAI-PMH validation and registration service on
2023				L.
2024
2025				=head1 COPYRIGHT
2026
2027				Copyright 2001..2017 by Simeon Warner, Donna Bergmark.
2028
2029				This library is free software; you can redistribute it and/or modify it under
2030				the same terms as Perl itself.
2031
2032				=cut
2033
2034				1;