File Coverage

blib/lib/SeeAlso/Source/BeaconAggregator/Maintenance.pm
Criterion Covered Total %
statement 20 20 100.0
branch n/a
condition n/a
subroutine 6 6 100.0
pod n/a
total 26 26 100.0


line stmt bran cond sub pod time code
1             package SeeAlso::Source::BeaconAggregator::Maintenance;
2 10     10   223404 use strict;
  10         21  
  10         344  
3 10     10   41 use warnings;
  10         12  
  10         295  
4              
5             BEGIN {
6 10     10   37 use Exporter ();
  10         13  
  10         204  
7 10     10   37 use vars qw($VERSION @ISA @EXPORT @EXPORT_OK %EXPORT_TAGS);
  10         13  
  10         1100  
8 10     10   27 $VERSION = '0.2_88';
9 10         112 @ISA = qw(Exporter);
10             #Give a hoot don't pollute, do not export more than needed by default
11 10         22 @EXPORT = qw();
12 10         14 @EXPORT_OK = qw();
13 10         208 %EXPORT_TAGS = ();
14             }
15              
16 10     10   50 use base ("SeeAlso::Source::BeaconAggregator");
  10         33  
  10         4497  
17             use Carp;
18             use HTTP::Date; # not perfect, but the module is commonly installed...
19             use HTTP::Request;
20             use LWP::UserAgent;
21             use File::Temp;
22              
23             =head1 NAME
24              
25             sasbactrl.pl - command line interface to SeeAlso::Source::BeaconAggregator and
26             auxiliary classes
27              
28             =head1 SYNOPSIS
29              
30              
31             =head1 DESCRIPTION
32              
33             This Module allows a collection of BEACON files (cf. http://de.wikipedia.org/wiki/Wikipedia:BEACON)
34             to be used as SeeAlso::Source (probably in the context of an SeeAlso::Server application).
35             Therefore it implements the four methods documented in SeeAlso::Source
36              
37             The BEACON files (lists of non-local identifiers of a certain type documenting the coverage of a given
38             online database plus means for access) are imported by the methods provided by
39             SeeAlso::Source::BeaconAggregator::Maintenance.pm, usually by employing the script sasbactrl.pl
40             as command line client.
41              
42             Serving other formats than SeeAlso or providing a BEACON file with respect to this
43             SeeAlso service is achieved by using SeeAlso::Source::BeaconAggregator::Publisher.
44              
45              
46             =head1 USAGE
47              
48             Use the C method inherited from C to
49             access an existing database or create a new one.
50              
51              
52             =head2 Database Methods
53              
54             =head3 init( [ %options] )
55              
56             Sets up and initializes the database structure for the object.
57             This has to be done once after creating a new database and after
58             upgrading this module.
59              
60             Valid options include:
61              
62             =over 8
63              
64             =item verbose
65              
66             =item prepareRedirs
67              
68             =item identifierClass
69              
70             =back
71              
72              
73             The I table contains as columns all valid beacon fields plus
74             the following administrative fields which have to be prefixed with
75             "_" in the interface:
76              
77             =over 8
78              
79             =item seqno
80              
81             Sequence number: Is incremented on any successfull load
82              
83             =item alias
84              
85             Unique key: On update older seqences with the same alias are
86             automatically discarded. Most methods take an alias as
87             argument thus obliterating the need to determine the sequence
88             number.
89              
90             =item sort
91              
92             optional sort key
93              
94              
95             =item uri
96              
97             Overrides the #FEED header for updates
98              
99             =item ruri
100              
101             Real uri from which the last instance was loaded
102              
103              
104             =item ftime
105              
106             Fetch time: Timestamp as to when this instance was loaded
107              
108             Clear this or mtime to force automatic reload.
109              
110             =item fstat
111              
112             Short statistics line of last successful reload on update.
113              
114              
115             =item mtime
116              
117             Modification time: Timestamp of the file / HTTP object from which this instance was loaded.
118             Identical to ftime if no timestamp is provided
119              
120             Clear this or ftime to force automatic reload on update.
121              
122              
123             =item utime
124              
125             Timestamp of last update attempt
126              
127             =item ustat
128              
129             Short status line of last update attempt.
130              
131              
132             =item counti
133              
134             Identifier count
135              
136             =item countu
137              
138             Unique identifier count
139              
140              
141             =item admin
142              
143             Just to store some remarks.
144              
145             =back
146              
147             The I table stores the individual beacon entries from the input files.
148             Its columns are:
149              
150             =over 8
151              
152             =item hash
153              
154             Identifier. If a (subclass of) C instance is provided,
155             this will be transformed by the C method.
156              
157             =item seqno
158              
159             Sequence number of the beacon file in the database
160              
161             =item altid
162              
163             optional identifier from an alternative identifier system for use
164             with ALTTARGET templates.
165              
166             =item hits
167              
168             optional number of hits for this identifier in the given resource
169              
170             =item info
171              
172             optional information text
173              
174             =item link
175              
176             optional explicit URL
177              
178             =back
179              
180              
181             The I table contains C, C pairs for various metadata
182             concerning the collection as such, notably the values needed for
183             the Open Search Description and the Header fields needed in case
184             of publishing a beacon file for this collection.
185              
186             The I table stores (unique) C, C pairs for
187             general persistent data. Currently the following keys are defined:
188              
189             =over 8
190              
191             =item DATA_VERSION
192              
193             Integer version number to migrate database layout.
194              
195             =item IDENTIFIER_CLASS
196              
197             Name of the Identifier class to be used.
198              
199             =item REDIRECTION_INDEX
200              
201             Control creation of an additional index for the I column
202             (facialiates reverse lookups as needed for clustering).
203              
204             =back
205              
206              
207             =cut
208              
209             sub init {
210             my ($self, %options) = @_;
211             $options{'verbose'} = $self->{'verbose'} unless exists $options{'verbose'};
212              
213             my @fieldlist = SeeAlso::Source::BeaconAggregator->beaconfields();
214             my @bf = map{ join(" ", @{[SeeAlso::Source::BeaconAggregator->beaconfields($_)]}[0..1]) } @fieldlist;
215             my $hdl = $self->{dbh} or croak("no database handle?");
216              
217             local($") = ",\n";
218             $hdl->do(<<"XxX"
219             CREATE TABLE IF NOT EXISTS repos (
220             seqno INTEGER PRIMARY KEY AUTOINCREMENT,
221             alias TEXT,
222             sort TEXT,
223             uri VARCHAR(512),
224             ruri VARCHAR(512),
225             mtime INTEGER,
226             utime INTEGER,
227             ftime INTEGER,
228             counti INTEGER DEFAULT 0,
229             countu INTEGER DEFAULT 0,
230             fstat TEXT,
231             ustat TEXT,
232             admin VARCHAR(512),
233             @bf
234             );
235             XxX
236             ) or croak("Setup error: ".$hdl->errstr);
237              
238             $hdl->do("CREATE UNIQUE INDEX IF NOT EXISTS seqnos ON repos(seqno);") or croak("Setup error: ".$hdl->errstr);
239             $hdl->do("CREATE INDEX IF NOT EXISTS aliases ON repos(alias);") or croak("Setup error: ".$hdl->errstr);
240              
241             $hdl->do(<<"XxX"
242             CREATE TABLE IF NOT EXISTS beacons (
243             hash CHARACTER(64) NOT NULL,
244             seqno INTEGER REFERENCES repos(seqno) ON DELETE CASCADE,
245             altid TEXT,
246             hits INTEGER,
247             info VARCHAR(255),
248             link VARCHAR(1024)
249             );
250             XxX
251             ) or croak("Setup error: ".$hdl->errstr);
252              
253              
254             # Faciliate lookups
255             $hdl->do("CREATE INDEX IF NOT EXISTS lookup ON beacons(hash);") or croak("Setup error: ".$hdl->errstr);
256             # maintenance and enforce constraints
257             # (Problem: Dupes w/o altid but differing in link *and* info fields should be legitimate, too)
258             $hdl->do("CREATE UNIQUE INDEX IF NOT EXISTS mntnce ON beacons(seqno, hash, altid);") or croak("Setup error: ".$hdl->errstr);
259              
260             # foreign key on cascade does not work?
261              
262             $hdl->do(<<"XxX"
263             CREATE TRIGGER IF NOT EXISTS on_delete_seqno BEFORE DELETE ON repos FOR EACH ROW
264             BEGIN
265             DELETE FROM beacons WHERE seqno=OLD.seqno;
266             END;
267             XxX
268             ) or croak("Setup error: ".$hdl->errstr);
269              
270             # OpenSearchDescription
271             $hdl->do(<<"XxX"
272             CREATE TABLE IF NOT EXISTS osd (
273             key CHAR(20) NOT NULL,
274             val VARCHAR(1024)
275             );
276             XxX
277             ) or croak("Setup error: ".$hdl->errstr);
278             $hdl->do("CREATE INDEX IF NOT EXISTS OSDKeys ON osd(key);") or croak("Setup error: ".$hdl->errstr);
279              
280             # Admin Stuff
281             $hdl->do(<<"XxX"
282             CREATE TABLE IF NOT EXISTS admin (
283             key CHAR(20) PRIMARY KEY NOT NULL,
284             val VARCHAR(1024)
285             );
286             XxX
287             ) or croak("Setup error: ".$hdl->errstr);
288              
289             $hdl->do("CREATE UNIQUE INDEX IF NOT EXISTS ADMKeys ON admin(key);") or croak("Setup error: ".$hdl->errstr);
290              
291             my $admref = $self->admhash();
292              
293             my $verkey = "DATA_VERSION";
294             my $goalver = $SeeAlso::Source::BeaconAggregator::DATA_VERSION;
295             my $dbver = $admref->{$verkey} || 0;
296             if ( $dbver != $goalver ) {
297             print "NOTICE: Database version $dbver: Upgrading to $goalver\n";
298             # alter tables here
299             if ( $dbver < 2 ) {
300             # my ($at, $type) = SeeAlso::Source::BeaconAggregator->beaconfields("COUNT");
301             # $hdl->do("ALTER TABLE repos ADD COLUMN $at $type;");
302             # ($at, $type) = SeeAlso::Source::BeaconAggregator->beaconfields("REMARK");
303             # $hdl->do("ALTER TABLE repos ADD COLUMN $at $type;");
304             };
305             }
306             elsif ( $options{'verbose'} ) {
307             print "INFO: Database version $dbver is current\n"};
308              
309             unless ( $dbver == $goalver) {
310             my $verh = $self->stmtHdl("INSERT OR REPLACE INTO admin VALUES (?, ?);", "update version statement");
311             $verh->execute($verkey, $goalver)
312             or croak("Could not execute update version statement: ".$verh->errstr);
313             };
314              
315             unless ( exists $options{'identifierClass'} ) {
316             $options{'identifierClass'} = $self->{'identifierClass'} if exists $self->{'identifierClass'};
317             };
318              
319             my $ickey = "IDENTIFIER_CLASS";
320             if ( (exists $options{identifierClass}) and (my $wanttype = ref($options{'identifierClass'})) ) {
321             if ( (exists $self->{identifierClass}) && (ref($self->{identifierClass}) ne $wanttype) ) {
322             croak("Cannot override identifierClass set on new()")};
323             if ( my $oldtype = $admref->{$ickey} ) {
324             croak ("Identifier mismatch: Cannot set to $wanttype since database already branded to $oldtype")
325             unless($oldtype eq $wanttype);
326             }
327             else {
328             print "fixing identifierClass as $wanttype\n" if $options{'verbose'};
329             my $ichdl = $self->stmtHdl("INSERT INTO admin VALUES (?, ?);", "fix identifier class statement");
330             $ichdl->execute($ickey, $wanttype)
331             or croak("Could not execute fix identifier class statement: ".$ichdl->errstr);
332             $self->{identifierClass} = $options{identifierClass};
333             };
334             }
335             elsif ( (exists $options{identifierClass}) and (not $options{identifierClass}) ) {
336             print "removing fixed identifierClass from admin table\n" if $options{'verbose'};
337             my $ichdl = $self->stmtHdl("DELETE FROM admin WHERE key=?;", "identifier class statement");
338             $ichdl->execute($ickey)
339             or croak("Could not execute remove identifier class statement: ".$ichdl->errstr);
340             delete $self->{identifierClass};
341             };
342              
343             my $rikey = "REDIRECTION_INDEX";
344             if ( exists $options{prepareRedirs} or exists $admref->{$rikey} ) {
345             my $rihdl = $self->stmtHdl("INSERT OR REPLACE INTO admin VALUES (?, ?);", "fix redirection index statement");
346             if ( $options{prepareRedirs} or ( $admref->{$rikey} and not exists $options{prepareRedirs} ) ) {
347             print "creating redirection index\n" if $options{prepareRedirs} and $options{'verbose'};
348             $hdl->do("CREATE INDEX IF NOT EXISTS redir ON beacons(altid,seqno);") or croak("Setup error: ".$hdl->errstr);
349             $rihdl->execute($rikey, 1)
350             or croak("Could not execute fix redirection index: ".$rihdl->errstr);
351             }
352             elsif ( not( $admref->{$rikey} and ($options{prepareRedirs} or (not exists $options{prepareRedirs})) ) ) {
353             print "dropping redirection index\n" if $options{'verbose'};
354             $hdl->do("DROP INDEX IF EXISTS redir;") or croak("Setup error: ".$hdl->errstr);
355             $rihdl->execute($rikey, 0)
356             or croak("Could not execute fix redirection index: ".$rihdl->errstr);
357             };
358             # $admref = $self->admhash();
359             }
360              
361             print "[ANALYZE ..." if $options{'verbose'};
362             $hdl->do("ANALYZE;");
363             print "]\n" if $options{'verbose'};
364             return 1; # o.k.
365             };
366              
367              
368             =head3 deflate()
369              
370             Maintenance action: performs VACCUUM, REINDEX and ANALYZE on the database
371              
372             =cut
373              
374             sub deflate {
375             my ($self) = @_;
376             my $hdl = $self->{dbh} or croak("no handle?");
377             print "VACUUM\n";
378             $hdl->do("VACUUM") or croak("could not VACUUM: Abort");
379             print "REINDEX\n";
380             $hdl->do("REINDEX") or croak("could not REINDEX: Abort");
381             print "ANALYZE\n";
382             $hdl->do("ANALYZE;") or croak("could not ANALYZE: Abort");
383             return 1;
384             }
385              
386              
387             =head2 Handling of beacon files
388              
389             =head3 loadFile ( $file, $fields, %options )
390              
391             Reads a physical beacon file and stores it with a new Sequence number in the
392             database.
393              
394             Returns a triple:
395              
396             my ($seqno, $rec_ok, $message) = loadFile ( $file, $fields, %options )
397              
398             $seqno is undef on error
399              
400             $seqno and $rec_ok are zero with $message containing an explanation in case
401             of no action taken.
402              
403             $seqno is an positive integer if something was loaded: The L
404             (internal unique identifier) for the representation of the beacon file in
405             the database.
406              
407             =over 8
408              
409             =item $file
410              
411             File to read: Must be a beacon file
412              
413             =item $fields
414              
415             Hashref with additional meta and admin fields to store
416              
417             =item Supported options:
418              
419             verbose => (0|1)
420             force => (0|1) process unconditionally without timestamp comparison
421             nostat => (0|1) don't refresh global identifier counters
422              
423             =back
424              
425             If the file does not contain a minimal correct header (eg. is an empty file
426             or an HTML error page accidentaly caught) no action is performed.
427              
428             Otherwise, a fresh SeqNo (sequence number) is generated and meta and
429             BEACON-Lines are stored in the appropriate tables in the database.
430              
431             If the _alias field is provided, existing database entries for this
432             Alias are updated, identifiers not accounted for any more are
433             eventually discarded.
434              
435             =cut
436              
437             sub loadFile {
438             my ($self, $file, $fields, %options) = @_;
439             $options{'verbose'} = $self->{'verbose'} unless exists $options{'verbose'};
440             $options{'verbose'} ||= 0;
441              
442             if ( ! $file ) {
443             croak("Missing file argument")}
444             elsif ( ! -e $file ) {
445             print "ERROR: no such file $file\n" && return undef}
446             elsif ( ! -r _ ) {
447             print "ERROR: no read permissions for $file\n" && return undef}
448             elsif ( -z _ ) {
449             print "WARNING: empty file $file\n";
450             return (0,0, "empty file: Will not process");
451             }
452             my $mtime = (stat(_))[9];
453             open(BKN, "<:utf8", $file) or (print "ERROR: cannot read $file\n", return undef);
454             local($.) = 0;
455              
456             unless ( defined $self->{identifierClass} ) {
457             my $package = $self->autoIdentifier();
458             $options{'verbose'} && ref($package) && print "Assuming identifiers of type ".ref($package)."\n";
459             };
460              
461             $fields = {} unless $fields;
462             $fields->{'_ftime'} ||= time();
463             $fields->{'_mtime'} ||= $mtime;
464             delete $fields->{_uri} unless $fields->{_uri};
465             delete $fields->{_alias} unless $fields->{_alias};
466             my $autopurge = $fields->{_alias} || "";
467             my $showme = $fields->{_alias} || $fields->{_uri} || $file;
468              
469             if ( $options{'verbose'} ) {
470             printf("* Loading %s from URI %s\n", $fields->{_alias} || "", $fields->{_uri} || "");
471             printf("* local input %s (%s)\n", $file, SeeAlso::Source::BeaconAggregator::tToISO($mtime));
472             };
473              
474             my ($collno, $inserthandle, $replacehandle, $err, $format);
475             my ($linecount, $headerseen, $oseq) = (0, 0, 0);
476             my ($reccount, $recill, $recign, $recnil, $recupd, $recnew, $recdupl, $recdel) = (0, 0, 0, 0, 0, 0, 0, 0);
477             local($_);
478             lines:
479             while ( ) {
480             s/[ \x0d\x0a]+$//;
481             unless ( $linecount++ ) {
482             if ( s/^\x{FEFF}// ) { # BOM-Character
483             }
484             elsif ( s/^\xef\xbb\xbf// ) { # BOM-Bytes
485             print "ERROR: cannot cope with doubly UTF-8 encoded $file\n";
486             return (undef, undef, "encoding trouble")};
487             if ( /^\s*$/ ) {
488             print "WARNING: Discarding blank line before beacon header [$showme l.$.]\n";
489             next;
490             };
491             };
492             if ( not defined $collno ) { # $collno used as flag: "still in header"
493             if ( /^#\s*([A-Z][\w-]*):\s*(.*)$/ ) {
494             $headerseen++;
495             my ($field, $data) = ($1, $2);
496             $field =~ s/^DATE$/TIMESTAMP/ && print "WARNING: corrected DATE to TIMESTAMP in Beacon-Header [$showme l.$.]\n";
497             $data =~ s/\s+$//;
498             next if $data =~ /^\s*$/;
499             if ( SeeAlso::Source::BeaconAggregator->beaconfields($field) ) {
500             if ( $fields->{$field} ) {
501             print "WARNING: Skipping already set $field [$showme l.$.]\n"}
502             else {
503             $fields->{$field} = $data}
504             }
505             else {
506             print "WARNING: Ignoring unknown $field [$data] [$showme l.$.]\n";
507             };
508             }
509             elsif ( /^(#[^:\s]+)/ ) {
510             print "WARNING: Discarding unparseable line >$1...< in beacon header context [$showme l.$.]\n"}
511             elsif ( /^\s*$/ ) {
512             print "NOTICE: Discarding blank line in beacon header context [$showme l.$.]\n" if $options{'verbose'}}
513             elsif ( ! $headerseen ) {
514             print "ERROR: no header fields [$showme l.$.]\n";
515             return (0, 0, "no header fields: Will not proceed");
516             }
517             else {
518             ($collno, $err, $format, $inserthandle, $replacehandle, $oseq) = $self->processbeaconheader($fields, %options);
519             unless ( $collno ) {
520             print "ERROR: metadata error [$showme l.$.]\n";
521             return (0, 0, "metadata error: $err");
522             };
523             $self->{dbh}->{AutoCommit} = 0;
524             $linecount --;
525             redo lines;
526             }
527             }
528             else {
529             s/^\s+//; s/\s+$//;
530             my ($id, $altid, @rest);
531             ($id, @rest) = split(/\s*\|\s*/, $_, 4);
532             ($id, $altid) = split(/\s*=\s*/, $id, 2) if $id;
533             $id || ($recnil++, next);
534              
535             if ( $options{'filter'} ) {
536             ($id, $altid) = &{$options{'filter'}}($id, $altid, @rest);
537             unless ( $id ) {
538             $recign ++;
539             unless ( ++$reccount % 10000 ) {
540             $self->{dbh}->{AutoCommit} = 1;
541             print "$reccount\n" if $options{'verbose'};
542             $self->{dbh}->{AutoCommit} = 0;
543             };
544             next lines;
545             };
546             };
547             $altid ||= "";
548              
549             my($hits, $info, $link);
550             if ( @rest && ($rest[$#rest] =~ m!^\S+://\S+$!) ) {
551             $link = pop @rest}
552             elsif ( defined $rest[2] ) {
553             print "WARNING: unparseable link content >$rest[2]< [$showme l.$.]"};
554              
555             if ( @rest && ($rest[0] =~ /^\d*$/) ) {
556             $hits = shift @rest;
557             # really disregard hits with explicit 0?
558             if ( (!$altid) and (!$link) and ($format =~ /\baltTARGET\b/) ) {
559             $altid = shift @rest || "";
560             }
561             else {
562             $info = shift @rest || "";
563             };
564             }
565             elsif ( defined $rest[1] ) {
566             $hits = "";
567             if ( (!$altid) and (!$link) and ($format =~ /\baltTARGET\b/) ) {
568             $info = shift @rest;
569             $altid = shift @rest;
570             }
571             else {
572             shift @rest;
573             $info = shift @rest;
574             };
575             }
576             elsif ( defined $rest[0] ) {
577             $hits = "";
578             $info = shift @rest;
579             };
580             if ( @rest ) {
581             print "WARNING: unparseable content >$_< [$showme l.$.]"};
582              
583             unless ( $link ) {
584             if ( ($format =~ /\bhasTARGET\b/) ) { # ok
585             }
586             elsif ( $altid && ($format =~ /\baltTARGET\b/) ) { # also ok
587             }
588             elsif ( $format =~ /\bnoTARGET\b/ ) {
589             print "NOTICE: discarding >$id<".(defined $hits ? " ($hits)" : "")." without link [$showme l.$.]\n" if $options{'verbose'} > 1;
590             $recill++;
591             next lines;
592             }
593             else {
594             print "WARNING: discarding >$id<".(defined $hits ? " ($hits)" : "")." without link [$showme l.$.] (assertion failed)\n";
595             $recill++;
596             next lines;
597             }
598             };
599              
600             if ( $format !~ /\baltTARGET\b/ ) { # Allow certain duplicates (force disambiguisation)
601             $altid ||= $info || $link}
602              
603             $hits = "" unless defined $hits;
604             ($hits =~ /^0+/) && ($recnil++, next); # Explizit "0" => raus
605             $hits = 0 if $hits eq "";
606             $altid ||= "";
607             my $hash;
608             if ( defined $self->{identifierClass} ) {
609             $self->{identifierClass}->value($id);
610             unless ( $self->{identifierClass}->valid ) {
611             print "NOTICE: invalid identifier >$id< ($hits) [$showme l.$.]\n" if $options{'verbose'};
612             $recill++;
613             next lines;
614             };
615             $hash = $self->{identifierClass}->hash();
616             }
617             else {
618             $hash = $id};
619             my $did;
620             if ( $replacehandle && ($did = $replacehandle->execute($hits, $info, $link, $hash, $altid)) ) { # UPDATE OR FAIL old record
621             if ( $replacehandle->err ) {
622             carp("update in trouble: $replacehandle->errstring [$showme l.$.]");
623             $recdupl++;
624             }
625             elsif ( $did eq "0E0" ) { # not found, try insert
626             $did = $inserthandle->execute($hash, $altid, $hits, $info, $link);
627             if ( $did eq "0E0" ) {
628             $recdupl++;
629             if ( $altid ) {
630             print "INFO: did not insert duplicate Id >$id< = >$altid< ($hits) [$showme l.$.]\n" if $options{'verbose'}}
631             else {
632             print "INFO: did not insert duplicate Id >$id< ($hits) [$showme l.$.]\n" if $options{'verbose'} > 1};
633             }
634             else {
635             $recnew++};
636             }
637             else {
638             $recupd++};
639             }
640             elsif ( $did = $inserthandle->execute($hash, $altid, $hits, $info, $link) ) { # INSERT OR IGNORE new record
641             if ( $did eq "0E0" ) {
642             $recdupl++;
643             print "INFO: did not insert duplicate Id $id ($hits) [$showme l.$.]\n" if $options{'verbose'} > 1;
644             }
645             else {
646             $recnew++};
647             }
648             elsif ( $inserthandle->errstr =~ /constraint/ ) {
649             $recdupl++;
650             print "INFO: duplicate Id $id ($hits): not inserting [$showme l.$.]\n" if $options{'verbose'} > 1;
651             }
652             else {
653             croak("Could not insert: ($id, $hits, $info, $link): ".$inserthandle->errstr)};
654              
655             unless ( ++$reccount % 10000 ) {
656             $self->{dbh}->{AutoCommit} = 1;
657             print "$reccount\n" if $options{'verbose'};
658             $self->{dbh}->{AutoCommit} = 0;
659             };
660             }
661             };
662             if ( not defined $collno ) {
663             if ( $headerseen ) {
664             ($collno, $err, $format, $inserthandle, $replacehandle, $oseq) = $self->processbeaconheader($fields, %options);
665             if ( $collno ) {
666             print "WARNING: no idn content in file [$showme l.$.]\n"}
667             else {
668             print "ERROR: metadata error [$showme l.$.]\n";
669             return (0,0, "metadata error: $err");
670             };
671             }
672             elsif ( $. ) {
673             print "ERROR: no header fields [$showme l.$.]\n";
674             return (0, 0, "no header fields: Will not proceed");
675             }
676             else {
677             print "WARNING: empty file [$showme]\n";
678             return (0,0, "empty file");
679             };
680             }
681             $self->{dbh}->{AutoCommit} = 1;
682              
683             if ( $autopurge ) {
684             $self->{dbh}->{AutoCommit} = 0;
685             if ( $oseq ) {
686             my ($bcdelh, $bcdelexpl) = $self->stmtHdl("DELETE FROM beacons WHERE seqno==?");
687             $self->stmtExplain($bcdelexpl, $oseq) if $ENV{'DBI_PROFILE'};
688             my $rows = $bcdelh->execute($oseq) or croak("Could not execute >".$bcdelh->{Statement}."<: ".$bcdelh->errstr);
689             $self->{dbh}->{AutoCommit} = 1;
690             printf("INFO: Purged %s surplus identifiers from old sequence %u\n", $rows, $oseq) if $options{'verbose'};
691             $rows = "0" if $rows eq "0E0";
692             $recdel += $rows;
693             };
694              
695             $self->{dbh}->{AutoCommit} = 0;
696             my ($rpdelh, $rpdelexpl) = $self->stmtHdl("DELETE FROM repos WHERE (alias=?) AND (seqno
697             $self->stmtExplain($rpdelexpl, $autopurge, $collno) if $ENV{'DBI_PROFILE'};
698             my $rows = $rpdelh->execute($autopurge, $collno) or croak("Could not execute >".$rpdelh->{Statement}."<: ".$rpdelh->errstr);
699             $self->{dbh}->{AutoCommit} = 1;
700             $rows = "0" if $rows eq "0E0";
701             printf("INFO: %u old sequences discarded\n", $rows) if $options{'verbose'};
702             }
703              
704             printf "NOTICE: New sequence %u for %s: processed %u Records from %u lines\n",
705             $collno, $autopurge || "???", $reccount, $linecount;
706             my $statline = sprintf "%u replaced, %u new, %u deleted, %u duplicate, %u nil, %u invalid, %u ignored",
707             $recupd, $recnew, $recdel, $recdupl, $recnil, $recill, $recign;
708             print " ($statline)\n";
709              
710             my $recok = $recupd + $recnew;
711             my $numchg = ($recnew or $recdel) ? 1 : 0;
712              
713             # my $ct1hdl = $self->stmtHdl("SELECT COUNT(*) FROM beacons WHERE seqno==? LIMIT 1;");
714             # $ct1hdl->execute($collno) or croak("could not execute live count: ".$ct1hdl->errstr);
715             # my $ct1ref = $ct1hdl->fetchrow_arrayref();
716             # my $counti = $ct1ref->[0] || 0;
717              
718             # my $ct2hdl = $self->stmtHdl("SELECT COUNT(DISTINCT hash) FROM beacons WHERE seqno==?");
719             # using subquery to trick SQLite into using indices
720             # my $ct2hdl = $self->stmtHdl("SELECT COUNT(*) FROM (SELECT DISTINCT hash FROM beacons WHERE seqno==?) LIMIT 1;");
721             # $ct2hdl->execute($collno) or croak("could not execute live count: ".$ct2hdl->errstr);
722             # my $ct2ref = $ct2hdl->fetchrow_arrayref();
723             # my $countu = $ct2ref->[0] || 0;
724              
725             # combined query turned out as not as efficient
726             # my $ct0hdl = $self->stmtHdl("SELECT COUNT(*), COUNT(DISTINCT hash) FROM beacons WHERE seqno==? LIMIT 1;");
727             # $ct0hdl->execute($collno) or croak("could not execute live count: ".$ct0hdl->errstr);
728             # my $ct0ref = $ct0hdl->fetchrow_arrayref();
729             # my ($counti, $countu) = ($ct0ref->[0] || 0, $ct0ref->[1] || 0);
730              
731             my ($updh, $updexpl) = $self->stmtHdl(<<"XxX");
732             UPDATE OR FAIL repos SET counti=?,countu=?,fstat=?,utime=?,ustat=?,sort=? WHERE seqno==?;
733             XxX
734              
735             my $counti = $self->idStat($collno, 'distinct' => 0) || 0;
736             printf("WARNING: expected %u valid records, counted %u\n", $recok, $counti) if $recok != $counti;
737             unless ( $numchg ) {
738             $fields->{'_counti'} ||= 0;
739             printf("WARNING: expected unchanged number %u valid records, counted %u\n", $fields->{'_counti'}, $counti) if $fields->{'_counti'} != $counti;
740             };
741              
742             my $sort = $fields->{'_sort'} || "";
743             my $countu = $numchg ? ( $self->idStat($collno, 'distinct' => 1) || 0 )
744             : ( $fields->{'_countu'} || $self->idStat($collno, 'distinct' => 1) || 0 );
745             $self->stmtExplain($updexpl, $counti, $countu, $statline, time(), "successfully loaded", $sort, $collno) if $ENV{'DBI_PROFILE'};
746             $updh->execute($counti, $countu, $statline, time(), "successfully loaded", $sort, $collno)
747             or croak("Could not execute >".$updh->{Statement}."<: ".$updh->errstr);
748             close(BKN);
749              
750             if ( $numchg or $options{'force'} ) {
751             # if ( $options{'force'} ) {
752             # print "[ANALYZE ..." if $options{'verbose'};
753             # $self->{dbh}->do("ANALYZE;");
754             # print "]\n" if $options{'verbose'};
755             # };
756              
757             if ( $options{'nostat'} ) { # invalidate since they might have changed
758             $self->admin('gcounti', undef);
759             $self->admin('gcountu', undef);
760             }
761             else {
762             $self->admin('gcounti', $self->idStat(undef, 'distinct' => 0) || 0);
763             $self->admin('gcountu', $self->idStat(undef, 'distinct' => 1) || 0);
764             }
765             };
766              
767             return ($collno, $recok, undef);
768             }
769              
770              
771             =head4 processbeaconheader($self, $fieldref, [ %options] )
772              
773             Internal subroutine used by C.
774              
775             =over 8
776              
777             =item $fieldref
778              
779             Hash with raw fields.
780              
781             =item Supported options:
782              
783             verbose => (0|1)
784              
785             Show seqnos of old instances which are met by the alias
786              
787             =back
788              
789              
790             =cut
791              
792             sub processbeaconheader {
793             my ($self, $fieldref, %options) = @_;
794             my $osq = 0;
795             my @carp;
796              
797             if ( my $alias = $fieldref->{_alias} ) {
798             my $stampfield = SeeAlso::Source::BeaconAggregator->beaconfields("TIMESTAMP");
799             my ($listh, $listexpl) = $self->stmtHdl("SELECT seqno, $stampfield, mtime, counti, countu FROM repos WHERE alias=?;");
800             $self->stmtExplain($listexpl, $alias) if $ENV{'DBI_PROFILE'};
801             $listh->execute($alias) or croak("Could not execute >".$listh->{Statement}."<: ".$listh->errstr);
802             my ($rowcnt, $ocounti, $ocountu);
803             while ( my($row) = $listh->fetchrow_arrayref ) {
804             last unless defined $row;
805             $rowcnt ++;
806             ($ocounti, $ocountu) = ($row->[3], $row->[4]);
807             if ( $options{'verbose'} ) {
808             print "* Old Instances for $alias:\n" unless $osq;
809             $osq = $row->[0];
810             print "+\t#$osq ", SeeAlso::Source::BeaconAggregator::tToISO($row->[1] || $row->[2]), " (", $row->[3] || "???", ")\n";
811             }
812             else {
813             $osq = $row->[0]};
814             }
815             if ( $rowcnt && ($rowcnt == 1) ) {
816             $fieldref->{_counti} ||= $ocounti if $ocounti;
817             $fieldref->{_countu} ||= $ocountu if $ocountu;
818             }
819             };
820              
821             my $format = "";
822             if ( $fieldref->{'FORMAT'} && $self->{accept}->{'FORMAT'} ) {
823             if ( $fieldref->{'FORMAT'} =~ $self->{accept}->{'FORMAT'} ) {
824             $format = $fieldref->{'FORMAT'}}
825             else {
826             push(@carp, "ERROR: only FORMAT '".$self->{accept}->{'FORMAT'}."' are supported, this is ".$fieldref->{'FORMAT'})}
827             }
828             elsif ( $fieldref->{'FORMAT'} ) {
829             $format = $fieldref->{'FORMAT'}}
830             elsif ( $fieldref->{'VERSION'} or $fieldref->{'TARGET'} or $fieldref->{'PREFIX'} or $fieldref->{'MESSAGE'} ) {
831             push(@carp, "WARNING: header line #FORMAT: BEACON should be supplied")}
832             elsif ( $self->{accept}->{'FORMAT'} ) {
833             push(@carp, "ERROR: header line #FORMAT is missing")}
834             else {
835             push(@carp, "WARNING: header line #FORMAT: BEACON should be supplied")};
836              
837             if ( $fieldref->{'FORMAT'} && ($fieldref->{'FORMAT'} =~ /v(?:ersion)?\s*(\d+(?:\.\d*)?)/i) ) {
838             $fieldref->{'VERSION'} ||= $1};
839             unless ( $fieldref->{'VERSION'} ) {
840             $fieldref->{'VERSION'} = $fieldref->{'FORMAT'} ? "0.1" : "1.0";
841             push(@carp, "NOTICE: added header field #VERSION as '".$fieldref->{'VERSION'}."'");
842             };
843             if ( $self->{accept}->{'VERSION'} ) {
844             ($fieldref->{'VERSION'} =~ $self->{accept}->{'VERSION'})
845             || push(@carp, "ERROR: only VERSION '".$self->{accept}->{'VERSION'}."' is supported, this is ".$fieldref->{'VERSION'});
846             };
847              
848             if ( $fieldref->{'ALTTARGET'} ) {
849             $fieldref->{'ALTTARGET'} = "" unless defined $fieldref->{'ALTTARGET'};
850             my $parsed = hDecode($fieldref, 'ALTTARGET');
851             if ( $parsed && ($parsed =~ /(^|[^%])(%.)*%\d\$s/) ) {
852             $fieldref->{'ALTTARGET'} = $parsed;
853             $format =~ s/\s*-altTARGET//;
854             $format .= " -altTARGET";
855             ($parsed =~ /(^|[^%])(%.)*%2\$s/) or
856             push(@carp, "WARNING: header field #ALTTARGET should contain placeholder {ALTID}");
857             }
858             elsif ( $parsed ) {
859             push(@carp, "ERROR: header field #ALTTARGET must contain placeholder {ALTID} (or {ID})");
860             delete $fieldref->{'ALTTARGET'};
861             }
862             else {
863             push(@carp, "ERROR: could not parse header field #ALTTARGET: '".$fieldref->{'ALTTARGET'}."'");
864             delete $fieldref->{'ALTTARGET'};
865             }
866             };
867              
868             if ( $fieldref->{'IMGTARGET'} ) {
869             $fieldref->{'IMGTARGET'} = "" unless defined $fieldref->{'IMGTARGET'};
870             my $parsed = hDecode($fieldref, 'IMGTARGET');
871             if ( $parsed && ($parsed =~ /(^|[^%])(%.)*%\d\$s/) ) {
872             $fieldref->{'IMGTARGET'} = $parsed;
873             $format =~ s/\s*-imgTARGET//;
874             $format .= " -imgTARGET";
875             }
876             elsif ( $parsed ) {
877             push(@carp, "WARNING: header field #IMGTARGET should contain placeholders {ID} or {ALTID}")}
878             else {
879             push(@carp, "ERROR: could not parse header field #IMGTARGET: '".$fieldref->{'IMGTARGET'}."'");
880             delete $fieldref->{'IMGTARGET'};
881             }
882             };
883              
884             if ( exists $fieldref->{'TARGET'} ) {
885             $fieldref->{'TARGET'} = "" unless defined $fieldref->{'TARGET'};
886             my $parsed = hDecode($fieldref, 'TARGET');
887             if ( $parsed && ($parsed =~ /(^|[^%])(%.)*%1\$s/) && ($parsed !~ /(^|[^%])(%.)*%[2-9]\$s/) ) {
888             $fieldref->{'TARGET'} = $parsed;
889             $format .= " -hasTARGET";
890             }
891             elsif ( $parsed ) {
892             if ( exists $fieldref->{'ALTTARGET'} ) {
893             push(@carp, "ERROR: header field #TARGET must contain placeholder {ID} only");
894             }
895             else {
896             push(@carp, "WARNING: Adding implicit {ID} to #TARGET as #ALTTARGET");
897             $fieldref->{'ALTTARGET'} = $parsed."%2\$s";
898             $format .= " -altTARGET";
899             }
900             delete $fieldref->{'TARGET'};
901             }
902             else {
903             push(@carp, "ERROR: could not parse header field #TARGET: '".$fieldref->{'TARGET'}."'");
904             delete $fieldref->{'TARGET'};
905             }
906             }
907             elsif ( $format =~ /^BEACON/ ) {
908             push(@carp, "WARNING: header field #TARGET not set: ALL beacon lines will have to provide their link by other means!");
909             $format =~ s/\s*-noTARGET//;
910             $format .= " -noTARGET";
911             }
912             else {
913             push(@carp, "ERROR: header field #TARGET is mandatory")};
914              
915              
916             $fieldref->{'MESSAGE'} = hDecode($fieldref, 'MESSAGE') if $fieldref->{'MESSAGE'};
917              
918             if ( $fieldref->{'TIMESTAMP'} ) {
919             if ( my $parsed = hDecode($fieldref, 'TIMESTAMP') ) {
920             printf("* %-30s %s\n", "Beacon Timestamp:", hEncode($parsed, 'TIMESTAMP')) if $options{'verbose'};
921             $fieldref->{'TIMESTAMP'} = $parsed;
922             }
923             else { # unparseable => use current
924             push(@carp, "WARNING: cannot parse TIMESTAMP '".$fieldref->{'TIMESTAMP'}."', using current time");
925             $fieldref->{'TIMESTAMP'} = $^T;
926             };
927             }
928             else {
929             # $fieldref->{'TIMESTAMP'} = $fieldref->{'_mtime'} || $^T;
930             push(@carp, "NOTICE: no header field #TIMESTAMP detected");
931             };
932              
933             if ( $fieldref->{'REVISIT'} ) {
934             if ( my $parsed = hDecode($fieldref, 'REVISIT') ) {
935             if ( $parsed < $^T ) {
936             printf("* %-30s %s [%s]\n", "STALE Revisit hint parsed as", hEncode($parsed, 'REVISIT'), $fieldref->{'REVISIT'})} # if $options{'verbose'}
937             else {
938             printf("* %-30s %s\n", "Revisit hint parsed as", hEncode($parsed, 'REVISIT')) if $options{'verbose'}};
939             $fieldref->{'REVISIT'} = $parsed;
940             }
941             else { # unparseable => discard
942             push(@carp, "WARNING: cannot parse #REVISIT '".$fieldref->{'REVISIT'}."', discarding");
943             delete $fieldref->{'REVISIT'};
944             };
945             }
946             else {
947             push(@carp, "INFO: no header field #REVISIT detected");
948             };
949              
950             my $cancontinue = 1;
951             my $err = "";
952             foreach ( @carp ) {
953             print "$_\n";
954             if ( s/^ERROR: // ) {
955             $cancontinue = 0;
956             $err .= " | " if $err;
957             $err .= $_;
958             };
959             }
960             unless ( $cancontinue or $options{'ignore-header-errors'} ) {
961             print "CRITICAL: Aborting because of Header Errors\n";
962             return (undef, $err, $format);
963             };
964              
965             $fieldref->{'_uri'} ||= $fieldref->{'FEED'};
966             delete $fieldref->{'_uri'} unless $fieldref->{'_uri'};
967              
968             $fieldref->{'_alias'} ||= $fieldref->{'FEED'} || $fieldref->{'TARGET'};
969              
970             my (@fn, @fd);
971             while ( my ($key, $val) = each %$fieldref ) {
972             next unless defined $val;
973             my $dbkey = "";
974             if ( $dbkey = SeeAlso::Source::BeaconAggregator->beaconfields($key) ) {
975             push(@fn, $dbkey)}
976             elsif ( $key =~ /_(\w+)$/ ) {
977             push(@fn, $1)}
978             else {
979             next};
980             my $myval = $val;
981             unless ( $myval =~ /^\d+$/ ) {
982             $myval =~ s/'/''/g;
983             $myval = "'".$myval."'";
984             };
985             push(@fd, $myval);
986             };
987             local($") = ",\n";
988             my ($sth, $sthexpl) = $self->stmtHdl(<<"XxX");
989             INSERT INTO repos ( seqno, @fn ) VALUES ( NULL, @fd );
990             XxX
991             $self->stmtExplain($sthexpl) if $ENV{'DBI_PROFILE'};
992             $sth->execute() or croak("Could not execute >".$sth->{Statement}."<:".$sth->errstr);
993             my $collno = $self->{dbh}->last_insert_id("", "", "", "");
994              
995             my $rhandle;
996             if ( $osq ) {
997             $rhandle = $self->stmtHdl(<<"XxX");
998             UPDATE OR FAIL beacons SET seqno=$collno, hits=?, info=?, link=? WHERE hash=? AND seqno==$osq AND altid=?;
999             XxX
1000             };
1001             my $ihandle = $self->stmtHdl(<<"XxX");
1002             INSERT OR IGNORE INTO beacons ( hash, seqno, altid, hits, info, link ) VALUES (?, $collno, ?, ?, ?, ?);
1003             XxX
1004             return ($collno, "", $format, $ihandle, $rhandle, $osq);
1005             }
1006              
1007              
1008              
1009              
1010             my ($lwpcarp817, $lwpcarp827);
1011              
1012             =head3 update ($sq_or_alias, $params, %options)
1013              
1014             Loads a beacon file into the database, possibly replacing a previous instance.
1015              
1016             Some magic is employed to autoconvert ISO-8859-1 or doubly UTF-8 encoded files
1017             back to UTF-8.
1018              
1019             Returns undef, if something goes wrong, or the file was not modified since,
1020             otherwise returns a pair (new seqence number, number of lines imported).
1021              
1022              
1023             =over 8
1024              
1025              
1026             =item $sq_or_alias
1027              
1028             Sequence number or alias: Used to determine an existing instance.
1029              
1030              
1031             =item $params
1032              
1033             Hashref, containing
1034              
1035             agent => LWP::UserAgent to use
1036             _uri => Feed URL to load from
1037              
1038             =item %options
1039              
1040             Hash, propagated to C
1041              
1042             verbose => (0|1)
1043             force => (0|1) process unconditionally without timestamp comparison
1044             nostat => (0|1) don't refresh global identifier counters
1045              
1046             =back
1047              
1048             Incorporates a new beacon source from a URI in the database or updates an existing one.
1049             For HTTP URIs care is taken not to reload an unmodified BEACON feed (unless the 'force'
1050             option is provided).
1051              
1052             If the feed appears to be newer than the previously loaded version it is fetched,
1053             some UTF-8 adjustments are performed if necessary, then it is stored to a temporary file
1054             and from there finally processed by the C method above.
1055              
1056             The URI to load is determined by the following order of precedence:
1057              
1058             =over 8
1059              
1060             =item 1
1061              
1062             _uri Option
1063              
1064             =item 2
1065              
1066             admin field uri stored in the database
1067              
1068             =item 3
1069              
1070             meta field #FEED taken from the database
1071              
1072             =back
1073              
1074             Typical use is with an alias, not with a sequence number:
1075              
1076             $db->update('whatever');
1077              
1078             Can be used to initially load beacon files from URIs:
1079              
1080             $db->update("new_alias", {_uri => $file_uri} );
1081              
1082             =cut
1083              
1084             sub update {
1085             my ($self, $sq_or_alias, $params, %options) = @_;
1086             $params = {} unless $params;
1087             $options{'verbose'} = $self->{'verbose'} unless exists $options{'verbose'};
1088              
1089             my $ua = $params->{'agent'};
1090             unless ( $ua ) {
1091             require LWP::UserAgent;
1092             $ua = LWP::UserAgent->new(agent => "SA-S-BeaconAggregator ", # end with space to get default agent appended
1093             env_proxy => 1,
1094             timeout => 300,
1095             ) or croak("cannot create UserAgent");
1096             };
1097              
1098             my ($cond, @cval) = SeeAlso::Source::BeaconAggregator::mkConstraint($sq_or_alias);
1099             my $alias = ($sq_or_alias =~ /^\d+$/) ? "" : $sq_or_alias;
1100             my $feedname = SeeAlso::Source::BeaconAggregator->beaconfields("FEED");
1101             my ($ssth, $ssthexpl) = $self->stmtHdl(<<"XxX");
1102             SELECT seqno, uri, alias, $feedname, ftime, mtime, sort FROM repos $cond;
1103             XxX
1104             $self->stmtExplain($ssthexpl, @cval) if $ENV{'DBI_PROFILE'};
1105             $ssth->execute(@cval) or croak("Could not execute >".$ssth->{Statement}."<: ".$ssth->errstr);
1106             croak("Select old instance error: ".$ssth->errstr) if $ssth->err;
1107             my $aryref = $ssth->fetchrow_arrayref;
1108             my ($osq, $ouri, $oalias, $feed, $fetchtime, $modtime, $osort) = $aryref ? @$aryref : ();
1109              
1110             my $uri = $params->{'_uri'} || $ouri || $feed;
1111             croak("Cannot update $sq_or_alias: URI not given nor determinable from previous content") unless $uri;
1112             $uri =~ s/\s$//;
1113             $alias ||= $oalias || "";
1114              
1115             print "Requesting $uri\n" if $options{'verbose'};
1116             my $rq = HTTP::Request->new('GET', $uri, ['Accept' => 'text/*']) or croak("could not construct request from $uri");
1117             if ( $fetchtime && $modtime && !$options{'force'} ) { # allow force-reload by deleting _ftime or _mtime
1118             printf(" %-30s %s\n", "Old instance stamped", scalar localtime($modtime)) if $options{'verbose'};
1119             $rq->header('If-Modified-Since', HTTP::Date::time2str($modtime));
1120             };
1121             if ( $rq->can("accept_decodable") ) { # LWP 5.817 and newer
1122             $rq->accept_decodable}
1123             else {
1124             carp("please upgrade to LWP >= 5.817 for compression negotiation") if $options{'verbose'} && (!$lwpcarp817++)};
1125              
1126             my $response = $ua->request($rq); # Well, we hoggishly slurp everything into memory,
1127             # however explicit decompression of an already dumped result would be PITA
1128             my $nuri = ($response->request)->uri;
1129             print "NOTICE: Differing result URI: $nuri\n" if $uri ne $nuri;
1130             if ( $response->is_success ) {
1131             print $osq ? "INFO: refreshing $alias sq $osq from $uri\n"
1132             : "INFO: importing previously unseen $alias from $uri\n";
1133             my $charset;
1134             if ( $response->can("content_charset") ) { # LWP 5.827 and above
1135             $charset = $response->content_charset;
1136             print "DEBUG: Content charset is $charset\n" if $charset && $options{'verbose'};
1137             }
1138             else {
1139             carp("please upgrade to LWP >= 5.827 for better detection of content_charset") if $options{'verbose'} && (!$lwpcarp827++)};
1140             $charset ||= "UTF-8";
1141              
1142             my $lm = $response->last_modified;
1143             printf(" %-30s %s\n", "Last modified", scalar localtime($lm)) if $lm && $options{'verbose'};
1144             $lm ||= $^T;
1145              
1146             my $vt = $response->fresh_until(h_min => 1800, h_max => 30 * 86400);
1147             printf(" %-30s %s\n", "Should be valid until", scalar localtime($vt)) if $vt && $options{'verbose'};
1148             $vt ||= 0;
1149              
1150             # temporary file for dumped contents
1151             my ($tmpfh, $tmpfile) = File::Temp::tempfile("BeaconAggregator-XXXXXXXX", SUFFIX => ".txt", TMPDIR => 1) or croak("Could not acquire temporary file for storage");
1152             my $contref; # reference to content buffer
1153             if ( ! $response->content_is_text ) {
1154             my $ct = $response->content_type;
1155             print "WARNING: Response content is $ct, not text/*\n";
1156             if ( my $ce = $response->content_encoding ) {
1157             print "NOTICE: Response is also Content-encoded: $ce\n"}
1158             my $ctt = join("|", $response->decodable());
1159             if ( $ct =~ s!^(.+\/)?($ctt)$!$2! ) {
1160             # yes: decode anyway since it could be a gzip-encoded .txt.gz file!
1161             my $cr = $response->decoded_content( raise_error => 1, ref => 1); # method exists since LWP 5.802 (2004-11-30)
1162             $response->remove_content_headers;
1163             my $newresp = HTTP::Response->new($response->code, $response->message, $response->headers);
1164             $newresp->content_type("text/plain; charset: $charset");
1165             $newresp->content_encoding($ct);
1166             $newresp->content_ref($cr);
1167             $response = $newresp;
1168             }
1169             };
1170             $contref = $response->decoded_content( raise_error => 1, ref => 1); # method exists since LWP 5.802 (2004-11-30)
1171              
1172             if ( $$contref =~ /^\x{FFEF}/ ) { # properly encoded BOM => put Characters to file
1173             binmode($tmpfh, ":utf8");
1174             print "INFO: properly encoded BOM detected: Groked UTF8\n"; # if $options{'verbose'};
1175             }
1176             elsif ( $$contref =~ s/^\xef\xbb\xbf// ) { # BOM Bytes => put Bytes to file, re-read as UTF-8
1177             print "INFO: Byte coded BOM detected: trying to restitute character semantics\n"; # if $options{'verbose'};
1178             print "INFO: Length is ", length($$contref), " ", (utf8::is_utf8($$contref) ? "characters" : "bytes"), "\n";
1179             binmode($tmpfh, ":bytes");
1180             utf_deduplicate($contref) && binmode($tmpfh, ":utf8");
1181             }
1182             elsif ( utf8::is_utf8($$contref) ) { # already Upgraded strings should be written as utf-8
1183             print "INFO: UTF8-ness already established\n" if $options{'verbose'};
1184             binmode($tmpfh, ":utf8");
1185             utf_deduplicate($contref); # but don't trust it (older LWP with file URLs, ...)
1186             }
1187             elsif ( utf8::decode($$contref) ) { # everything in character semantics now
1188             print "INFO: Could decode bytes to UTF8-characters\n" if $options{'verbose'};
1189             binmode($tmpfh, ":utf8");
1190             }
1191             else { # leave it alone
1192             print "WARNING: No clue about character encoding: Assume ISO 8859-1\n"; # if $options{'verbose'};
1193             binmode($tmpfh, ":utf8");
1194             };
1195             print $tmpfh $$contref;
1196             close($tmpfh);
1197             # early cleanup since everything might be huge....
1198             $contref = $response = undef;
1199              
1200             my ($collno, $count, $statref) = $self->loadFile($tmpfile, {_alias => $alias, _uri => $uri, _ruri => $nuri, _mtime => $lm, _sort => $osort}, %options);
1201             if ( ! $collno && $osq ) {
1202             my ($usth, $usthexpl) = $self->stmtHdl(<<"XxX");
1203             UPDATE OR FAIL repos SET utime=?,ustat=? WHERE seqno==?;
1204             XxX
1205             $self->stmtExplain($usthexpl, time(), ($statref ? "load error: $statref" : "internal error"), $osq) if $ENV{'DBI_PROFILE'};
1206             $usth->execute(time(), ($statref ? "load error: $statref" : "internal error"), $osq)
1207             or croak("Could not execute >".$usth->{Statement}."<: ".$usth->errstr);
1208             };
1209              
1210             unlink($tmpfile) if -f $tmpfile;
1211             return $collno ? ($collno, $count) : undef;
1212             }
1213             elsif ( $response->code == 304 ) {
1214             print "INFO: $alias not modified since ".HTTP::Date::time2str($modtime)."\n";
1215             my $vt = $response->fresh_until(h_min => 1800, h_max => 6 * 86400);
1216             printf(" %-30s %s\n", "Will not try again before", scalar localtime($vt)) if $options{'verbose'};
1217             my ($usth, $usthexpl) = $self->stmtHdl(<<"XxX");
1218             UPDATE OR FAIL repos SET utime=?,ustat=?,ruri=? WHERE seqno==?;
1219             XxX
1220             $self->stmtExplain($usthexpl, time(), $response->status_line, $nuri, $osq) if $ENV{'DBI_PROFILE'};
1221             $usth->execute(time(), $response->status_line, $nuri, $osq)
1222             or croak("Could not execute >".$usth->{Statement}."<: ".$usth->errstr);
1223             return undef;
1224             }
1225             else {
1226             print "WARNING: No access to $uri for $alias [".$response->status_line."]\n";
1227             print $response->headers_as_string, "\n";
1228             return undef unless $osq;
1229             my ($usth, $usthexpl) = $self->stmtHdl(<<"XxX");
1230             UPDATE OR FAIL repos SET utime=?,ustat=?,ruri=? WHERE seqno==?;
1231             XxX
1232             $self->stmtExplain($usthexpl, time(), $response->status_line, $nuri, $osq) if $ENV{'DBI_PROFILE'};
1233             $usth->execute(time(), $response->status_line, $nuri, $osq)
1234             or croak("Could not execute >".$usth->{Statement}."<: ".$usth->errstr);
1235             return undef;
1236             };
1237             }
1238              
1239              
1240              
1241             sub utf_deduplicate {
1242             my ($success, $stringref) = (0, @_);
1243             if ( utf8::downgrade($$stringref, 1) ) { # 1 = FAIL_OK
1244             my $prevlength = length($$stringref);
1245             print "INFO: Downgrade was possible, length now $prevlength ", (utf8::is_utf8($$stringref) ? "characters" : "bytes"), "\n";
1246             while ( utf8::decode($$stringref) ) {
1247             $success ++;
1248             my $newlength = length($$stringref);
1249             print "DEBUG: Reassembling as UTF-8 succeeded, length now $newlength ", (utf8::is_utf8($$stringref) ? "characters" : "bytes"), "\n";
1250             last if $newlength == $prevlength;
1251             $prevlength = $newlength;
1252             # last unless utf8::downgrade($$stringref, 1);
1253             }
1254             }
1255             else {
1256             print "WARNING: no downgrade possible, proceed with byte semantics";
1257             };
1258             return $success;
1259             }
1260              
1261             =head3 unload ( [ $seqno_or_alias, %options ] )
1262              
1263             Deletes the sequence(s).
1264              
1265             =over 8
1266              
1267             =item $seqno_or_alias
1268              
1269             numeric sequence number, Alias or SQL pattern.
1270              
1271             =item Supported options:
1272              
1273             force => (0|1)
1274              
1275             Needed to purge the complete database ($seqno_or_alias empty) or to purge
1276             more than one sequence ($seqno_or_alias yields more than one seqno).
1277              
1278             =back
1279              
1280              
1281             =cut
1282              
1283             sub unload {
1284             my ($self, $seqno_or_alias, %options) = @_;
1285             $options{'verbose'} = $self->{'verbose'} unless exists $options{'verbose'};
1286              
1287             my @seqnos = ();
1288             if ( $seqno_or_alias && ($seqno_or_alias =~ /^\d+$/) ) {
1289             @seqnos = ($seqno_or_alias)}
1290             elsif ( $seqno_or_alias || $options{'force'} ) {
1291             @seqnos = $self->Seqnos('_alias', $seqno_or_alias ? ($seqno_or_alias) : ());
1292             unless ( @seqnos ) {
1293             carp("no Seqnos selected by $seqno_or_alias");
1294             return 0;
1295             };
1296             unless ( $options{'force'} or (@seqnos == 1) ) {
1297             carp("Use --force to purge more than one sequence (@seqnos)");
1298             return 0;
1299             };
1300             }
1301             else {
1302             carp("Use --force to purge the complete database");
1303             return 0;
1304             };
1305              
1306             if ( $options{'force'} ) {
1307             my ($sth, $sthexpl) = $self->stmtHdl(<<"XxX");
1308             DELETE FROM beacons WHERE seqno==?;
1309             XxX
1310             foreach my $seqno ( @seqnos ) {
1311             $self->stmtExplain($sthexpl, $seqno_or_alias) if $ENV{'DBI_PROFILE'};
1312             my $rows = $sth->execute($seqno_or_alias) or croak("Could not execute >".$sth->{Statement}."<: ".$sth->errstr);
1313             print "INFO: $rows forced for $seqno\n" if $options{'verbose'};
1314             };
1315             };
1316              
1317             my ($cond, @cval) = SeeAlso::Source::BeaconAggregator::mkConstraint($seqno_or_alias);
1318             my ($sth, $sthexpl) = $self->stmtHdl(<<"XxX");
1319             DELETE FROM repos $cond;
1320             XxX
1321             $self->stmtExplain($sthexpl, @cval) if $ENV{'DBI_PROFILE'};
1322             my $rows = $sth->execute(@cval) or croak("Could not execute >".$sth->{Statement}."<: ".$sth->errstr);
1323             $rows = 0 if $rows eq "0E0";
1324              
1325             if ( $rows or $options{'force'} ) {
1326             # if ( $options{'force'} ) {
1327             # print "[ANALYZE ..." if $options{'verbose'};
1328             # $self->{dbh}->do("ANALYZE;");
1329             # print "]\n" if $options{'verbose'};
1330             # };
1331              
1332             if ( $options{'nostat'} ) { # invalidate since they might have changed
1333             $self->admin('gcounti', undef);
1334             $self->admin('gcountu', undef);
1335             }
1336             else {
1337             $self->admin('gcounti', $self->idStat(undef, 'distinct' => 0) || 0);
1338             $self->admin('gcountu', $self->idStat(undef, 'distinct' => 1) || 0);
1339             }
1340             };
1341              
1342             return $rows;
1343             }
1344              
1345              
1346             =head3 purge ( $seqno_or_alias[, %options ] )
1347              
1348             Deletes all identifiers from the database to the given pattern,
1349             but leaves the stored header information intact, such that it
1350             can be updated automatically.
1351              
1352             =over 8
1353              
1354             =item $seqno_or_alias
1355              
1356             Pattern
1357              
1358             =item Supported options:
1359              
1360             force => (0|1)
1361              
1362             Allow purging of more than one sequence.
1363              
1364             =back
1365              
1366              
1367             =cut
1368              
1369             sub purge {
1370             my ($self, $seqno_or_alias, %options) = @_;
1371             $options{'verbose'} = $self->{'verbose'} unless exists $options{'verbose'};
1372             my @seqnos;
1373             if ( $seqno_or_alias && ($seqno_or_alias =~ /^\d+$/) ) {
1374             @seqnos = ($seqno_or_alias)}
1375             elsif ( $seqno_or_alias || $options{'force'} ) {
1376             @seqnos = $self->Seqnos('_alias', $seqno_or_alias ? ($seqno_or_alias) : ());
1377             unless ( @seqnos ) {
1378             carp("no Seqnos selected by $seqno_or_alias");
1379             return 0;
1380             };
1381             unless ( $options{'force'} or (@seqnos == 1) ) {
1382             carp("Use --force to purge more than one sequence (@seqnos)");
1383             return 0;
1384             };
1385             }
1386             else {
1387             carp("Use --force to purge the complete database");
1388             return 0;
1389             };
1390             my ($sth, $sthexpl) = $self->stmtHdl(<<"XxX");
1391             DELETE FROM beacons WHERE seqno==?;
1392             XxX
1393             my ($usth, $usthexpl) = $self->stmtHdl(<<"XxX");
1394             UPDATE OR FAIL repos SET counti=?,countu=?,utime=?,ustat=? WHERE seqno==?;
1395             XxX
1396             my $trows = 0;
1397             foreach my $seqno ( @seqnos ) {
1398             $self->stmtExplain($sthexpl, $seqno) if $ENV{'DBI_PROFILE'};
1399             my $rows = $sth->execute($seqno) or croak("Could not execute >".$sth->{Statement}."<: ".$sth->errstr);
1400             $rows = "0" if $rows eq "0E0";
1401             print "INFO: $rows purged for $seqno\n" if $options{'verbose'};
1402             $trows += $rows;
1403             $self->stmtExplain($usthexpl, 0, 0, time, "purged", $seqno) if $ENV{'DBI_PROFILE'};
1404             $usth->execute(0, 0, time, "purged", $seqno)
1405             or croak("Could not execute >".$usth->{Statement}."<: ".$usth->errstr);
1406             };
1407              
1408             if ( $trows or $options{'force'} ) {
1409             # if ( $options{'force'} ) {
1410             # print "[ANALYZE ..." if $options{'verbose'};
1411             # $self->{dbh}->do("ANALYZE;");
1412             # print "]\n" if $options{'verbose'};
1413             # };
1414              
1415             if ( $options{'nostat'} ) { # invalidate since they might have changed
1416             $self->admin('gcounti', undef);
1417             $self->admin('gcountu', undef);
1418             }
1419             else {
1420             $self->admin('gcounti', $self->idStat(undef, 'distinct' => 0) || 0);
1421             $self->admin('gcountu', $self->idStat(undef, 'distinct' => 1) || 0);
1422             }
1423             };
1424              
1425             return $trows;
1426             }
1427              
1428              
1429             =head2 Methods for headers
1430              
1431             =head3 ($rows, @oldvalues) = headerfield ( $sq_or_alias, $key [, $value] )
1432              
1433             Gets or sets an meta or admin Entry for the constituent file indicated by $sq_or_alias
1434              
1435             =cut
1436              
1437             sub headerfield {
1438             my ($self, $sq_or_alias, $key, $value) = @_;
1439              
1440             my $dbkey = "";
1441             if ( $dbkey = SeeAlso::Source::BeaconAggregator->beaconfields($key) ) {
1442             }
1443             elsif ( $key =~ /_(\w+)$/ ) {
1444             $dbkey = $1}
1445             else {
1446             carp "Field $key not known";
1447             return undef;
1448             };
1449              
1450             my ($cond, @cval) = SeeAlso::Source::BeaconAggregator::mkConstraint($sq_or_alias);
1451              
1452             my ($osth, $osthexpl) = $self->stmtHdl(<<"XxX");
1453             SELECT $dbkey FROM repos $cond;
1454             XxX
1455             $self->stmtExplain($osthexpl, @cval) if $ENV{'DBI_PROFILE'};
1456             $osth->execute(@cval) or croak("Could not execute >".$osth->{Statement}."<:".$osth->errstr);
1457             my $tmpval = $osth->fetchall_arrayref();
1458             my @oval = map { hEncode($_, $key) } map { (defined $_->[0]) ? ($_->[0]) : () } @$tmpval;
1459             my $rows = scalar @oval;
1460              
1461             if ( (defined $value) and ($value ne "") ) { # set
1462             my ($usth, $usthexpl) = $self->stmtHdl(<<"XxX");
1463             UPDATE OR FAIL repos SET $dbkey=? $cond;
1464             XxX
1465             $value = hDecode($value, $key) || "";
1466             $self->stmtExplain($usthexpl, $value, @cval) if $ENV{'DBI_PROFILE'};
1467             $rows = $usth->execute($value, @cval) or croak("Could not execute >".$usth->{Statement}."<:".$usth->errstr);
1468             }
1469             elsif ( defined $value ) { # clear
1470             my ($dsth, $dsthexpl) = $self->stmtHdl(<<"XxX");
1471             UPDATE OR FAIL repos SET $dbkey=? $cond;
1472             XxX
1473             $self->stmtExplain($dsthexpl, undef, @cval) if $ENV{'DBI_PROFILE'};
1474             $rows = $dsth->execute(undef, @cval) or croak("Could not execute >".$dsth->{Statement}."<:".$dsth->errstr);
1475             }
1476             else { # read
1477             }
1478              
1479             return ($rows, @oval);
1480             }
1481              
1482             =head3 ($resultref, $metaref) = headers ( [ $seqno_or_alias ] )
1483              
1484             Iterates over all
1485              
1486             For each iteration returns two hash references:
1487              
1488             =over 8
1489              
1490             =item 1
1491             all official beacon fields
1492              
1493             =item 2
1494             all administrative fields (_alias, ...)
1495              
1496             =back
1497              
1498             =cut
1499              
1500             sub headers {
1501             my ($self, $seqno_or_alias) = @_;
1502              
1503             unless ( $self->{_iterator_info} ) {
1504             my ($constraint, @cval) = SeeAlso::Source::BeaconAggregator::mkConstraint($seqno_or_alias);
1505             my ($sth, $sthexpl) = $self->stmtHdl(<<"XxX");
1506             SELECT * FROM repos $constraint;
1507             XxX
1508             $self->stmtExplain($sthexpl, @cval) if $ENV{'DBI_PROFILE'};
1509             $sth->execute(@cval) or croak("Could not execute >".$sth->{Statement}."<: ".$sth->errstr);
1510             $self->{_iterator_info} = $sth;
1511             };
1512              
1513             my $info = $self->{_iterator_info}->fetchrow_hashref;
1514             unless ( defined $info ) {
1515             croak("Error listing Collections: $self->{_iterator_info}->errstr") if $self->{_iterator_info}->err;
1516             delete $self->{_iterator_info};
1517             return undef;
1518             }
1519              
1520             my $collno = $info->{seqno} || $seqno_or_alias;
1521             my %meta = (_seqno => $collno);
1522             my %result = ();
1523             while ( my($key, $val) = each %$info ) {
1524             next unless defined $val;
1525             my $pval = hEncode($val, $key);
1526              
1527             if ( $key =~ /^bc(\w+)$/ ) {
1528             $result{$1} = $pval}
1529             else {
1530             $meta{"_$key"} = $pval};
1531             }
1532             return \%result, \%meta;
1533             }
1534              
1535             =head3 listCollections ( [ $seqno_or_alias ] )
1536              
1537             Iterates over all Sequences and returns on each call an array of
1538              
1539             Seqno, Alias, Uri, Modification time, Identifier Count and Unique identifier count
1540              
1541             Returns undef if done.
1542              
1543             =cut
1544              
1545             sub listCollections {
1546             my ($self, $seqno_or_alias) = @_;
1547              
1548             unless ( $self->{_iterator_listCollections} ) {
1549             my ($constraint, @cval) = SeeAlso::Source::BeaconAggregator::mkConstraint($seqno_or_alias);
1550             my ($sth, $sthexpl) = $self->stmtHdl(<<"XxX");
1551             SELECT seqno, alias, uri, mtime, counti, countu FROM repos $constraint;
1552             XxX
1553             $self->stmtExplain($sthexpl, @cval) if $ENV{'DBI_PROFILE'};
1554             $sth->execute(@cval) or croak("Could not execute >".$sth->{Statement}."<: ".$sth->errstr);
1555             $self->{_iterator_listCollections} = $sth;
1556             };
1557             my $onerow = $self->{_iterator_listCollections}->fetchrow_arrayref;
1558             unless ( $onerow ) {
1559             croak("Error listing Collections: $self->{_iterator_listCollections}->errstr") if $self->{_iterator_listCollections}->err;
1560             delete $self->{_iterator_listCollections};
1561             return ();
1562             };
1563             return @$onerow;
1564             }
1565              
1566             =head2 Statistics
1567              
1568             =head3 idStat ( [ $seqno_or_alias, %options ] )
1569              
1570             Count identifiers for the given pattern.
1571              
1572             =over 8
1573              
1574             =item Supported options:
1575              
1576             distinct => (0|1)
1577              
1578             Count multiple occurences only once
1579              
1580             verbose => (0|1)
1581              
1582             =back
1583              
1584              
1585             =cut
1586              
1587             sub idStat {
1588             my ($self, $seqno_or_alias, %options) = @_;
1589             $options{'verbose'} = $self->{'verbose'} unless exists $options{'verbose'};
1590             my $cond = "";
1591             if ( $seqno_or_alias && ($seqno_or_alias =~ /^\d+$/) ) {
1592             $cond = "WHERE seqno==$seqno_or_alias"}
1593             elsif ( $seqno_or_alias ) {
1594             my @seqnos = $self->Seqnos('_alias', $seqno_or_alias);
1595             if ( @seqnos ) {
1596             $cond = "WHERE seqno IN (".join(",", @seqnos).")"}
1597             else {
1598             carp("no Seqnos selected by $seqno_or_alias");
1599             return 0;
1600             };
1601             };
1602             # my $count_what = $options{'distinct'} ? "DISTINCT hash" : "*";
1603             # will not be optimized by SQLite or mySQL: SELECT COUNT($count_what) FROM beacons $cond;
1604             # my $sth= $self->stmtHdl("SELECT COUNT($count_what) FROM beacons $cond LIMIT 1;");
1605             my $from = $options{'distinct'} ? "(SELECT DISTINCT hash FROM beacons $cond)"
1606             : "beacons $cond";
1607             my ($sth, $sthexpl) = $self->stmtHdl(<<"XxX");
1608             SELECT COUNT(*) FROM $from LIMIT 1;
1609             XxX
1610             $self->stmtExplain($sthexpl) if $ENV{'DBI_PROFILE'};
1611             $sth->execute() or croak("Could not execute >".$sth->{Statement}."<: ".$sth->errstr);
1612             my $hits = $sth->fetchrow_arrayref;
1613              
1614             return $hits->[0] || 0;
1615             };
1616              
1617              
1618             =head3 idCounts ( [ $pattern, %options ] )
1619              
1620             Iterates through the entries according to the optional id filter expression.
1621              
1622             For each iteration the call returns a triple consisting of (identifier,
1623             number of rows, and sum of all individual counts).
1624              
1625             =over 8
1626              
1627             =item Supported options:
1628              
1629             distinct => (0|1)
1630              
1631             Count multiple occurences in one beacon file only once.
1632              
1633             =back
1634              
1635             =cut
1636              
1637             sub idCounts {
1638             my ($self, $pattern, %options) = @_;
1639             my $cond = $pattern ? qq!WHERE hash LIKE "$pattern"! : "";
1640             my $count_what = $options{'distinct'} ? "DISTINCT seqno" : "seqno";
1641             unless ( $self->{_iterator_idCounts} ) {
1642             my ($sth, $sthexpl) = $self->stmtHdl(<<"XxX");
1643             SELECT hash, COUNT($count_what), SUM(hits) FROM beacons $cond GROUP BY hash ORDER BY hash;
1644             XxX
1645             $self->stmtExplain($sthexpl) if $ENV{'DBI_PROFILE'};
1646             $sth->execute() or croak("Could not execute >".$sth->{Statement}."<: ".$sth->errstr);
1647             $self->{_iterator_idCounts} = $sth;
1648             unless ( defined $self->{identifierClass} ) {
1649             my $package = $self->autoIdentifier();
1650             $options{'verbose'} && ref($package) && carp "Assuming identifiers of type ".ref($package)."\n";
1651             }
1652             };
1653             my $onerow = $self->{_iterator_idCounts}->fetchrow_arrayref;
1654             unless ( $onerow ) {
1655             croak("Error listing Collections: $self->{_iterator_idCounts}->errstr") if $self->{_iterator_idCounts}->err;
1656             delete $self->{_iterator_idCounts};
1657             return ();
1658             };
1659             if ( defined $self->{identifierClass} ) {
1660             my $c = $self->{identifierClass};
1661             # compat: hash might not take an argument, must resort to value, has to be cleared before...
1662             $c->value("");
1663             my $did = $c->hash($onerow->[0]) || $c->value($onerow->[0]);
1664             $onerow->[0] = $c->can("pretty") ? $c->pretty() : $c->value();
1665             };
1666             return @$onerow;
1667             };
1668              
1669              
1670             =head3 idList ( [ $pattern ] )
1671              
1672             Iterates through the entries according to the optional selection.
1673              
1674             For each iteration the call returns a tuple consisting of identifier and an
1675             list of array references (Seqno, Hits, Info, explicit Link, AltId) or the emtpy list
1676             if finished.
1677              
1678             Hits, Info, Link and AltId are normalized to the empty string if undefined (or < 2 for hits).
1679              
1680             It is important to finish all iterations before calling this method for "new" arguments:
1681              
1682             1 while $db->idList(); # flush pending results
1683              
1684             =cut
1685              
1686             sub idList {
1687             my ($self, $pattern) = @_;
1688             my $cond = $pattern ? ($pattern =~ /%/ ? "WHERE hash LIKE ?" : qq"WHERE hash=?")
1689             : "";
1690             unless ( $self->{_iterator_idList_handle} ) {
1691             my ($sth, $sthexpl) = $self->stmtHdl(<<"XxX");
1692             SELECT hash, seqno, hits, info, link, altid FROM beacons $cond ORDER BY hash, seqno, altid;
1693             XxX
1694             $self->stmtExplain($sthexpl, ($pattern ? ($pattern) : () )) if $ENV{'DBI_PROFILE'};
1695             $sth->execute(($pattern ? ($pattern) : () )) or croak("Could not execute >".$sth->{Statement}."<: ".$sth->errstr);
1696             $self->{_iterator_idList_handle} = $sth;
1697             $self->{_iterator_idList_crosscheck} = $self->RepoCols("ALTTARGET");
1698             $self->{_iterator_idList_prefetch} = undef;
1699             $self->autoIdentifier() unless defined $self->{identifierClass};
1700             };
1701             unless ( exists $self->{_iterator_idList_prefetch} ) { # deferred exit
1702             delete $self->{_iterator_idList_handle};
1703             delete $self->{_iterator_idList_crosscheck};
1704             return ();
1705             };
1706             my $pf = $self->{_iterator_idList_prefetch};
1707             while ( my $onerow = $self->{_iterator_idList_handle}->fetchrow_arrayref ) {
1708             # $onerow->[2] = "" unless $self->{_iterator_idList_crosscheck}->{$onerow->[1]}; # kill artefacts
1709             $onerow->[2] = "" unless $onerow->[2]; # kill artefacts
1710             $onerow->[3] = "" unless defined $onerow->[3]; # kill artefacts
1711             $onerow->[4] = "" unless defined $onerow->[4]; # kill artefacts
1712             $onerow->[5] = "" unless defined $onerow->[5]; # kill artefacts
1713             if ( defined $self->{identifierClass} ) {
1714             my $c = $self->{identifierClass};
1715             # compat: hash might not take an argument, must resort to value, has to be cleared before...
1716             $c->value("");
1717             my $did = $c->hash($onerow->[0]) || $c->value($onerow->[0]);
1718             $onerow->[0] = $c->can("pretty") ? $c->pretty() : $c->value();
1719             };
1720             if ( $pf ) {
1721             if ( $pf->[0] eq $onerow->[0] ) {
1722             push(@$pf, [@$onerow[1..@$onerow-1]]);
1723             next;
1724             }
1725             else {
1726             $self->{_iterator_idList_prefetch} = [$onerow->[0], [@$onerow[1..@$onerow-1]]];
1727             return @$pf;
1728             }
1729             }
1730             else {
1731             $pf = [$onerow->[0], [@$onerow[1..@$onerow-1]]]};
1732             };
1733            
1734             if ( $self->{_iterator_idList_handle}->err ) {
1735             croak("Error listing Collections: $self->{_iterator_idList_handle}->errstr");
1736             };
1737             delete $self->{_iterator_idList_prefetch};
1738             return $pf ? @$pf : ();
1739             };
1740              
1741              
1742             =head2 Manipulation of global metadata: Open Search Description
1743              
1744             =head3 setOSD ( $field, @values }
1745              
1746             Sets the field $field of the OpenSearchDescription to @value(s).
1747              
1748             =cut
1749              
1750             sub setOSD {
1751             my ($self) = shift;
1752             $self->clearOSD($_[0]) or return undef;
1753             return (defined $_[1]) ? $self->addOSD(@_) : 0; # value(s) to set
1754             };
1755              
1756             =head3 clearOSD ( $field }
1757              
1758             Clears the field $field of the OpenSearchDescription.
1759              
1760             =cut
1761              
1762             sub clearOSD {
1763             my ($self, $field) = @_;
1764             $field || (carp("no OSD field name provided"), return undef);
1765             defined $self->osdKeys($field) || (carp("no valid OSD field '$field'"), return undef);
1766             my ($sth, $sthexpl) = $self->stmtHdl(<<"XxX");
1767             DELETE FROM osd WHERE key=?;
1768             XxX
1769             $self->stmtExplain($sthexpl, $field) if $ENV{'DBI_PROFILE'};
1770             $sth->execute($field) or croak("Could not execute >".$sth->{Statement}."<: ".$sth->errstr);
1771             return 1;
1772             }
1773              
1774             =head3 addOSD ( $field, @values }
1775              
1776             Adds more @value(s) as (repeatable) field $field of the OpenSearchDescription.
1777              
1778             =cut
1779              
1780             sub addOSD {
1781             my ($self, $field, @values) = @_;
1782             $field || (carp("no OSD field name provided"), return undef);
1783             return 0 unless @values;
1784             defined $self->osdKeys($field) || (carp("no valid OSD field '$field'"), return undef);
1785             my ($sth, $sthexpl) = $self->stmtHdl(<<"XxX");
1786             INSERT INTO osd ( key, val ) VALUES ( ?, ? );
1787             XxX
1788             $self->stmtExplain($sthexpl, $field, $values[0]) if $ENV{'DBI_PROFILE'};
1789             my $tstatus = [];
1790             my $tuples = $sth->execute_array({ArrayTupleStatus => $tstatus}, $field, \@values) or croak("Could not execute >".$sth->{Statement}."<: ".$sth->errstr);
1791             return $tuples;
1792             }
1793              
1794             =head2 Manipulation of global metadata: Beacon Metadata
1795              
1796             These headers are used when you will be publishing a beacon file for the collection.
1797              
1798             =head3 setBeaconMeta ( $field, $value )
1799              
1800             Sets the field $field of the Beacon meta table (used to generate a BEACON file for this
1801             service) to $value.
1802              
1803             =cut
1804              
1805             sub setBeaconMeta {
1806             my ($self) = shift;
1807             $self->clearBeaconMeta(@_) or return undef;
1808             return (defined $_[1]) ? $self->addBeaconMeta(@_) : 0; # value to set
1809             };
1810              
1811             =head3 clearBeaconMeta ( $field }
1812              
1813             Deletes the field $field of the Beacon meta table.
1814              
1815             =cut
1816              
1817             sub clearBeaconMeta {
1818             my ($self, $rfield) = @_;
1819             $rfield || (carp("no Beacon field name provided"), return undef);
1820             my $field = $self->beaconfields($rfield) or (carp("no valid Beacon field '$rfield'"), return undef);
1821             my ($sth, $sthexpl) = $self->stmtHdl(<<"XxX");
1822             DELETE FROM osd WHERE key=?;
1823             XxX
1824             $self->stmtExplain($sthexpl, $field) if $ENV{'DBI_PROFILE'};
1825             $sth->execute($field) or croak("Could not execute >".$sth->{Statement}."<: ".$sth->errstr);
1826             return 1;
1827             }
1828              
1829             =head3 addBeaconMeta ( $field, $value )
1830              
1831             Appends $value to the field $field of the BEACON meta table
1832              
1833             =cut
1834             sub addBeaconMeta {
1835             my ($self, $rfield, $value) = @_;
1836             $rfield || (carp("no Beacon field name provided"), return undef);
1837             my $field = $self->beaconfields($rfield) or (carp("no valid Beacon field '$rfield'"), return undef);
1838             my ($sth, $sthexpl) = $self->stmtHdl(<<"XxX");
1839             INSERT INTO osd ( key, val ) VALUES ( ?, ? );
1840             XxX
1841             $self->stmtExplain($sthexpl, $field, $value) if $ENV{'DBI_PROFILE'};
1842             $sth->execute($field, $value) or croak("Could not execute >".$sth->{Statement}."<: ".$sth->errstr);
1843             return 1;
1844             }
1845              
1846             =head3 admin ( [$field, [$value]] )
1847              
1848             Manipulates the admin table.
1849              
1850             Yields a hashref to the admin table if called without arguments.
1851              
1852             If called with $field, returns the current value, and sets the
1853             table entry to $value if defined.
1854              
1855              
1856             =cut
1857              
1858             sub admin {
1859             my ($self, $field, $value) = @_;
1860             my $admref = $self->admhash();
1861             return $admref unless $field;
1862             my $retval = $admref->{$field};
1863             return $retval unless defined $value;
1864              
1865             my ($admh, $admexpl) = $self->stmtHdl("INSERT OR REPLACE INTO admin VALUES (?, ?);");
1866             $self->stmtExplain($admexpl, $field, $value) if $ENV{'DBI_PROFILE'};
1867             $admh->execute($field, $value)
1868             or croak("Could not execute update to admin table: ".$admh->errstr);
1869             return defined($retval) ? $retval : "";
1870             }
1871              
1872              
1873             # on-the-fly conversions
1874              
1875             sub hDecode { # external time to numeric timestamp, printf placeholders
1876             my ($val, $fnam) = @_;
1877             return $val unless $fnam;
1878             local($_) = (ref $val) ? $val->{$fnam} : $val;
1879             return undef unless defined $_;
1880              
1881             if ( $fnam =~ /target$/i ) { s/%/%%/g; s/(\{id\}|\$PND)/%1\$s/gi; s/(\{altid\}|\$PND)/%2\$s/gi; }
1882             elsif ( $fnam =~ /message$/i ) { s/%/%%/g; s/\{hits?\}/%s/gi; }
1883             elsif ( $fnam =~ /time|revisit/i ) {
1884             if ( /^\d+$/ ) { # legacy UNIX timestamp
1885             }
1886             elsif ( my $p = HTTP::Date::str2time($_, "GMT") ) { # all unqualified times are GMT
1887             $_ = $p}
1888             else {
1889             carp("could not parse value '$_' as time in field $fnam");
1890             return undef;
1891             };
1892             }
1893             return $_;
1894             }
1895              
1896             sub hEncode { # timestamp to beacon format
1897             my ($val, $fnam) = @_;
1898             local($_) = (ref $val) ? $val->{$fnam} : $val;
1899             return undef unless defined $_;
1900             if ( $fnam =~ /time|revisit/i ) { $_ = SeeAlso::Source::BeaconAggregator::tToISO($_) }
1901             elsif ( $fnam =~ /message/i ) { s/%s/{hits}/; s/%%/%/g; }
1902             elsif ( $fnam =~ /target/i ) { s/%s/{ID}/; s/%1\$s/{ID}/; s/%2\$s/{ALTID}/; s/%%/%/g; };
1903             return $_;
1904             }
1905              
1906             =head1 AUTHOR
1907              
1908             Thomas Berger
1909             CPAN ID: THB
1910             gymel.com
1911             THB@cpan.org
1912              
1913             =head1 COPYRIGHT
1914              
1915             This program is free software; you can redistribute
1916             it and/or modify it under the same terms as Perl itself.
1917              
1918             The full text of the license can be found in the
1919             LICENSE file included with this module.
1920              
1921             =cut
1922              
1923             1;
1924