File Coverage

blib/lib/CPAN/Testers/Data/Generator.pm
Criterion Covered Total %
statement 49 51 96.0
branch n/a
condition n/a
subroutine 17 17 100.0
pod n/a
total 66 68 97.0


line stmt bran cond sub pod time code
1             package CPAN::Testers::Data::Generator;
2              
3 9     9   130765 use warnings;
  9         15  
  9         276  
4 9     9   33 use strict;
  9         9  
  9         279  
5              
6 9     9   36 use vars qw($VERSION);
  9         10  
  9         452  
7             $VERSION = '1.20';
8              
9             #----------------------------------------------------------------------------
10             # Library Modules
11              
12 9     9   5136 use Config::IniFiles;
  9         269381  
  9         240  
13 9     9   4588 use CPAN::Testers::Common::Article;
  9         118864  
  9         59  
14 9     9   3937 use CPAN::Testers::Common::DBUtils;
  9         125516  
  9         60  
15             #use Data::Dumper;
16 9     9   4833 use Data::FlexSerializer;
  9         10828704  
  9         72  
17 9     9   13924 use DateTime;
  9         771338  
  9         317  
18 9     9   73 use DateTime::Duration;
  9         12  
  9         147  
19 9     9   92 use File::Basename;
  9         10  
  9         665  
20 9     9   37 use File::Path;
  9         8  
  9         342  
21 9     9   5579 use File::Slurp;
  9         29220  
  9         534  
22 9     9   4240 use HTML::Entities;
  9         35917  
  9         611  
23 9     9   65 use IO::File;
  9         10  
  9         940  
24 9     9   4526 use JSON;
  9         30028  
  9         36  
25 9     9   1077 use Time::Local;
  9         15  
  9         445  
26              
27 9     9   10443 use Metabase 0.004;
  0            
  0            
28             use Metabase::Fact;
29             use Metabase::Resource;
30             use CPAN::Testers::Fact::LegacyReport;
31             use CPAN::Testers::Fact::TestSummary;
32             use CPAN::Testers::Metabase::AWS;
33             use CPAN::Testers::Report;
34              
35             #----------------------------------------------------------------------------
36             # Variables
37              
38             my $DIFF = 30; # max difference allowed in seconds
39             my $MINS = 15; # split time in minutes
40              
41             my %testers;
42              
43             my $FROM = 'CPAN Tester Report Server <do_not_reply@cpantesters.org>';
44             my $HOW = '/usr/sbin/sendmail -bm';
45             my $HEAD = 'To: EMAIL
46             From: FROM
47             Date: DATE
48             Subject: CPAN Testers Generator Error Report
49              
50             ';
51              
52             my $BODY = '
53             The following reports failed to parse into the cpanstats database:
54              
55             INVALID
56              
57             Thanks,
58             CPAN Testers Server.
59             ';
60              
61             my @admins = (
62             'barbie@missbarbell.co.uk',
63             #'david@dagolden.com'
64             );
65              
66             my ($OSNAMES,%MAPPINGS);
67              
68             #----------------------------------------------------------------------------
69             # The Application Programming Interface
70              
71             sub new {
72             my $class = shift;
73             my %hash = @_;
74              
75             my $self = {
76             meta_count => 0,
77             stat_count => 0,
78             last => '',
79             };
80             bless $self, $class;
81              
82             # load configuration
83             my $cfg = Config::IniFiles->new( -file => $hash{config} );
84              
85             # configure databases
86             for my $db (qw(CPANSTATS METABASE)) {
87             die "No configuration for $db database\n" unless($cfg->SectionExists($db));
88             my %opts = map {$_ => ($cfg->val($db,$_)||undef);} qw(driver database dbfile dbhost dbport dbuser dbpass);
89             $opts{AutoCommit} = 0;
90             $self->{$db} = CPAN::Testers::Common::DBUtils->new(%opts);
91             die "Cannot configure $db database\n" unless($self->{$db});
92             $self->{$db}->{'mysql_enable_utf8'} = 1 if($opts{driver} =~ /mysql/i);
93             $self->{$db}->{'mysql_auto_reconnect'} = 1 if($opts{driver} =~ /mysql/i);
94             }
95              
96             if($cfg->SectionExists('ADMINISTRATION')) {
97             my @admins = $cfg->val('ADMINISTRATION','admins');
98             $self->{admins} = \@admins;
99             }
100              
101             # command line swtiches override configuration settings
102             for my $key (qw(logfile poll_limit stopfile offset aws_bucket aws_namespace)) {
103             $self->{$key} = $hash{$key} || $cfg->val('MAIN',$key);
104             }
105              
106             $self->{offset} ||= 1;
107             $self->{poll_limit} ||= 1000;
108              
109             my @rows = $self->{METABASE}->get_query('hash','SELECT * FROM testers_email');
110             for my $row (@rows) {
111             $testers{$row->{resource}} = $row->{email};
112             }
113              
114             # build OS names map
115             @rows = $self->{CPANSTATS}->get_query('array','SELECT osname,ostitle FROM osname');
116             for my $row (@rows) {
117             $self->{OSNAMES}{lc $row->[0]} ||= $row->[1];
118             }
119             $OSNAMES = join('|',keys %{$self->{OSNAMES}}) if(keys %{$self->{OSNAMES}});
120              
121             $self->load_uploads();
122             $self->load_authors();
123             $self->load_perl_versions();
124              
125             if($cfg->SectionExists('DISABLE')) {
126             my @values = $cfg->val('DISABLE','LIST');
127             $self->{DISABLE}{$_} = 1 for(@values);
128             }
129              
130             if($cfg->SectionExists('OSNAMES')) {
131             for my $param ($cfg->Parameters('OSNAMES')) {
132             $self->{OSNAMES}{lc $param} ||= lc $cfg->val('OSNAMES',$param);
133             }
134             }
135              
136             if($cfg->SectionExists('MAPPINGS')) {
137             for my $param ($cfg->Parameters('MAPPINGS')) {
138             $MAPPINGS{$param} = [ split(',', $cfg->val('MAPPINGS',$param), 2) ];
139             }
140             }
141              
142             eval {
143             $self->{metabase} = CPAN::Testers::Metabase::AWS->new(
144             bucket => $self->{aws_bucket},
145             namespace => $self->{aws_namespace},
146             );
147             $self->{librarian} = $self->{metabase}->public_librarian;
148             };
149              
150             # if we require remote access, we need the librarian
151             unless($hash{localonly}) {
152             return unless($self->{metabase} && $self->{librarian});
153             }
154              
155             # reports are now stored in a compressed format
156             $self->{serializer} = Data::FlexSerializer->new(
157             detect_compression => 1,
158             detect_json => 1,
159             output_format => 'json'
160             );
161             $self->{serializer2} = Data::FlexSerializer->new(
162             detect_compression => 1,
163             detect_sereal => 1,
164             output_format => 'sereal'
165             );
166              
167             return $self;
168             }
169              
170             sub DESTROY {
171             my $self = shift;
172              
173             $self->save_perl_versions();
174             }
175              
176             #----------------------------------------------------------------------------
177             # Public Methods
178              
179             sub generate {
180             my $self = shift;
181             my $nonstop = shift || 0;
182             my $maxdate = shift;
183             my ($to,@reports);
184              
185             $self->{reparse} = 0;
186              
187             $self->_log("START GENERATE nonstop=$nonstop\n");
188              
189             do {
190             my $start = localtime(time);
191             ($self->{processed},$self->{stored},$self->{cached}) = (0,0,0);
192              
193             if($maxdate) {
194             $to = $maxdate;
195             } else {
196             $to = sprintf "%sT%sZ", DateTime->now->ymd, DateTime->now->hms;
197             }
198              
199             $self->_log("DATES maxdate=$maxdate, to=$to \n");
200              
201             my $data = $self->get_next_dates($to);
202            
203             $self->_consume_reports( $to, $data );
204              
205             $nonstop = 0 if($self->{processed} == 0);
206             $nonstop = 0 if($self->{stopfile} && -f $self->{stopfile});
207             $nonstop = 0 if($maxdate && $maxdate le $to);
208              
209             $self->load_uploads() if($nonstop);
210             $self->load_authors() if($nonstop);
211              
212             $self->_log("CHECK nonstop=$nonstop\n");
213             } while($nonstop);
214             $self->_log("STOP GENERATE nonstop=$nonstop\n");
215             }
216              
217             sub regenerate {
218             my ($self,$hash) = @_;
219              
220             $self->{reparse} = 0;
221              
222             my $maxdate = sprintf "%sT%sZ", DateTime->now->ymd, DateTime->now->hms;
223              
224             $self->_log("START REGENERATE\n");
225              
226             my @data;
227             if($hash->{file}) {
228             my $fh = IO::File->new($hash->{file},'r') or die "Cannot open file [$hash->{file}]: $!\n";
229             while(<$fh>) {
230             s/\s+$//;
231             my ($fval,$tval) = split(/,/,$_,2);
232             my %data;
233             $data{gstart} = $fval if($fval =~ /^\w+-\w+-\w+-\w+-\w+$/);
234             $data{dstart} = $fval if($fval =~ /^\d+-\d+-\d+T\d+:\d+:\d+Z$/);
235             $data{gend} = $tval if($tval =~ /^\w+-\w+-\w+-\w+-\w+$/);
236             $data{dend} = $tval if($tval =~ /^\d+-\d+-\d+T\d+:\d+:\d+Z$/);
237             push @data, \%data;
238             }
239             $fh->close;
240             } else {
241             push @data, { gstart => $hash->{gstart}, gend => $hash->{gend},
242             dstart => $hash->{dstart}, dend => $hash->{dend} };
243             }
244              
245             $self->_consume_reports( $maxdate, \@data );
246              
247             $self->_log("STOP REGENERATE\n");
248             }
249              
250             sub rebuild {
251             my ($self,$hash) = @_;
252             $self->_log("START REBUILD\n");
253              
254             my $start = localtime(time);
255             ($self->{processed},$self->{stored},$self->{cached}) = (0,0,0);
256              
257             $self->{reparse} = 1;
258             $self->{localonly} = $hash->{localonly} ? 1 : 0;
259             $self->{check} = $hash->{check} ? 1 : 0;
260              
261              
262             # selection choices:
263             # 1) from guid [to guid]
264             # 2) from date [to date]
265              
266             $hash->{dstart} = $self->_get_createdate( $hash->{gstart}, $hash->{dstart} );
267             $hash->{dend} = $self->_get_createdate( $hash->{gend}, $hash->{dend} );
268              
269             my @where;
270             push @where, "updated >= '$hash->{dstart}'" if($hash->{dstart});
271             push @where, "updated <= '$hash->{dend}'" if($hash->{dend});
272            
273             my $sql = 'SELECT * FROM metabase' .
274             (@where ? ' WHERE ' . join(' AND ',@where) : '') .
275             ' ORDER BY updated ASC';
276              
277             $self->_log("START sql=[$sql]\n");
278              
279             # $self->{CPANSTATS}->do_query("DELETE FROM cpanstats WHERE id >= $start AND id <= $end");
280              
281             my $iterator = $self->{METABASE}->iterator('hash',$sql);
282             while(my $row = $iterator->()) {
283             $self->_log("GUID [$row->{guid}]");
284             $self->{processed}++;
285              
286             my $report = $self->load_fact(undef,0,$row);
287              
288             unless($report) {
289             $self->_log(" ... no report\n");
290             warn "No report returned [$row->{id},$row->{guid}]\n";
291             next;
292             }
293              
294             $self->{report}{id} = $row->{id};
295             $self->{report}{guid} = $row->{guid};
296             $self->{report}{metabase} = $self->{facts};
297              
298             # corrupt cached report?
299             if($self->reparse_report()) { # true if invalid report
300             $self->_log(".. cannot parse metabase cache report\n");
301             warn "Cannot parse cached report [$row->{id},$row->{guid}]\n";
302             next;
303             }
304              
305             if($self->store_report()) { $self->_log(".. cpanstats stored\n") }
306             else { $self->_log(".. cpanstats not stored\n") }
307             if($self->cache_update()) { $self->_log(".. metabase stored\n") }
308             else { $self->_log(".. bad metabase cache data\n") }
309              
310             $self->{stored}++;
311             $self->{cached}++;
312             }
313              
314             my $invalid = $self->{invalid} ? scalar(@{$self->{invalid}}) : 0;
315             my $stop = localtime(time);
316             $self->_log("MARKER: processed=$self->{processed}, stored=$self->{stored}, cached=$self->{cached}, invalid=$invalid, start=$start, stop=$stop\n");
317              
318             $self->commit();
319             $self->_log("STOP REBUILD\n");
320             }
321              
322             sub parse {
323             my ($self,$hash) = @_;
324             $self->_log("START PARSE\n");
325              
326             my @guids = $self->_get_guid_list($hash->{guid},$hash->{file});
327             return unless(@guids);
328              
329             $self->{force} ||= 0;
330              
331             for my $guid (@guids) {
332             $self->_log("GUID [$guid]");
333              
334             my ($report,$stored);
335             unless($hash->{force}) {
336             $report = $self->load_fact($guid,1);
337             $stored = $self->retrieve_report($guid);
338             }
339              
340             if($report && $stored) {
341             $self->_log(".. report already stored and cached\n");
342             next;
343             }
344              
345             $report = $self->get_fact($guid);
346              
347             unless($report) {
348             $self->_log(".. report not found [$guid]\n");
349             next;
350             }
351            
352             $self->{report}{guid} = $guid;
353             $hash->{report} = $report;
354             if($self->parse_report(%$hash)) { # true if invalid report
355             $self->_log(".. cannot parse report [$guid]\n");
356             next;
357             }
358              
359             if($self->store_report()) { $self->_log(".. stored"); }
360             else { $self->_log(".. already stored"); }
361              
362             if($self->cache_report()) { $self->_log(".. cached\n"); }
363             else { $self->_log(".. FAIL: bad cache data\n"); }
364             }
365              
366             $self->commit();
367             $self->_log("STOP PARSE\n");
368             return 1;
369             }
370              
371             sub reparse {
372             my ($self,$hash) = @_;
373             $self->_log("START REPARSE\n");
374              
375             my @guids = $self->_get_guid_list($hash->{guid},$hash->{file});
376             return unless(@guids);
377              
378             $self->{reparse} = $self->{force} ? 0 : 1;
379             $self->{localonly} = $hash->{localonly} ? 1 : 0;
380             $self->{check} = $hash->{check} ? 1 : 0;
381              
382             for my $guid (@guids) {
383             $self->_log("GUID [$guid]");
384              
385             my $report;
386             $report = $self->load_fact($guid) unless($hash->{force});
387              
388             if($report) {
389             $self->{report}{metabase} = $report;
390             $self->{report}{guid} = $guid;
391             $hash->{report} = $report;
392             if($self->reparse_report(%$hash)) { # true if invalid report
393             $self->_log(".. cannot parse report [$guid]\n");
394             return 0;
395             }
396             } else {
397             $report = $self->get_fact($guid) unless($report || $hash->{localonly});
398              
399             unless($report) {
400             if($self->{localonly}) {
401             $self->_log(".. report not available locally [$guid]\n");
402             return 0;
403             }
404             $self->_log(".. report not found [$guid]\n");
405             return 0;
406             }
407            
408             $self->{report}{guid} = $guid;
409             $hash->{report} = $report;
410             if($self->parse_report(%$hash)) { # true if invalid report
411             $self->_log(".. cannot parse report [$guid]\n");
412             return 0;
413             }
414             }
415              
416             if($self->store_report()) { $self->_log(".. stored"); }
417             else {
418             if($self->{time} gt $self->{report}{updated}) {
419             $self->_log(".. FAIL: older than requested [$self->{time}]\n");
420             return 0;
421             }
422            
423             $self->_log(".. already stored");
424             }
425             if($self->cache_report()) { $self->_log(".. cached\n"); }
426             else { $self->_log(".. FAIL: bad cache data\n"); }
427             }
428              
429             $self->commit();
430             $self->_log("STOP REPARSE\n");
431             return 1;
432             }
433              
434             sub tail {
435             my ($self,$hash) = @_;
436             return unless($hash->{file});
437              
438             $self->_log("START TAIL\n");
439              
440             my $guids = $self->get_tail_guids();
441             my $fh = IO::File->new($hash->{file},'a+') or die "Cannot read file [$hash->{file}]: $!";
442             print $fh "$_\n" for(@$guids);
443             $fh->close;
444              
445             $self->_log("STOP TAIL\n");
446             }
447              
448             #----------------------------------------------------------------------------
449             # Internal Methods
450              
451             sub commit {
452             my $self = shift;
453             for(qw(CPANSTATS)) {
454             next unless($self->{$_});
455             $self->{$_}->do_commit;
456             }
457             }
458              
459             sub get_tail_guids {
460             my $self = shift;
461             my $guids;
462              
463             eval {
464             # $guids = $self->{librarian}->search(
465             # 'core.type' => 'CPAN-Testers-Report',
466             # 'core.update_time' => { ">", 0 },
467             # '-desc' => 'core.update_time',
468             # '-limit' => $self->{poll_limit},
469             # );
470             $guids = $self->{librarian}->search(
471             '-where' => [
472             '-and' =>
473             [ '-eq' => 'core.type' => 'CPAN-Testers-Report' ],
474             [ '-ge' => 'core.update_time' => 0 ]
475             ],
476             '-order' => [ '-desc' => 'core.update_time' ],
477             '-limit' => $self->{poll_limit},
478             );
479             };
480              
481             $self->_log(" ... Metabase Tail Failed [$@]\n") if($@);
482             $self->_log("Retrieved ".($guids ? scalar(@$guids) : 0)." guids\n");
483              
484             return $guids;
485             }
486              
487             sub get_next_dates {
488             my ($self,$to) = @_;
489             my (@data,$from);
490              
491             my $time = sprintf "%sT%sZ", DateTime->now->ymd, DateTime->now->hms;
492              
493             $self->_log("DATES to=$to, time=$time\n");
494              
495             # note that because Amazon's SimpleDB can return odd entries out of sync, we have to look at previous entries
496             # to ensure we are starting from the right point. Also ignore date/times in the future.
497             my @rows = $self->{METABASE}->get_query('array','SELECT updated FROM metabase WHERE updated <= ? ORDER BY updated DESC LIMIT 10',$time);
498             for my $row (@rows) {
499             if($from) {
500             my $diff = abs( _date_diff($from,$row->[0]) ); # just interested in the difference
501             $self->_log("get_next_dates from=[$from], updated=[$row->[0]], diff=$diff, DIFF=$DIFF\n");
502             next if($diff < $DIFF);
503             }
504              
505             $from = $row->[0];
506             }
507              
508             $from ||= '1999-01-01T00:00:00Z';
509             if($from gt $to) {
510             my $xx = $from;
511             $from = $to;
512             $to = $xx;
513             }
514              
515             $self->_log("NEXT from=[$from], to=[$to]\n");
516              
517             while($from lt $to) {
518             my @from = $from =~ /(\d+)\-(\d+)\-(\d+)T(\d+):(\d+):(\d+)/;
519             my $dt = DateTime->new(
520             year => $from[0], month => $from[1], day => $from[2],
521             hour => $from[3], minute => $from[4], second => $from[5],
522             );
523             $dt->add( DateTime::Duration->new( minutes => $MINS ) );
524             my $split = sprintf "%sT%sZ", $dt->ymd, $dt->hms;
525             if($split lt $to) {
526             push @data, { dstart => $from, dend => $split };
527             } else {
528             push @data, { dstart => $from, dend => $to };
529             }
530              
531             $from = $split;
532             }
533              
534             return \@data;
535             }
536              
537             sub get_next_guids {
538             my ($self,$start,$end) = @_;
539             my ($guids);
540              
541             $self->{time} ||= 0;
542             $self->{last} ||= 0;
543             $start ||= 0;
544              
545             $self->_log("PRE time=[$self->{time}], last=[$self->{last}], start=[".($start||'')."], end=[".($end||'')."]\n");
546              
547             if($start) {
548             $self->{time} = $start;
549             $self->{time_to} = $end || '';
550             } else {
551             my $time = sprintf "%sT%sZ", DateTime->now->ymd, DateTime->now->hms;
552              
553             # note that because Amazon's SimpleDB can return odd entries out of sync, we have to look at previous entries
554             # to ensure we are starting from the right point. Also ignore date/times in the future.
555             my @rows = $self->{METABASE}->get_query('array','SELECT updated FROM metabase WHERE updated <= ? ORDER BY updated DESC LIMIT 10',$time);
556             for my $row (@rows) {
557             if($self->{time}) {
558             my $diff = abs( _date_diff($self->{time},$row->[0]) ); # just interested in the difference
559             next if($diff < $DIFF);
560             }
561              
562             $self->{time} = $row->[0];
563             }
564              
565             $self->{time} ||= '1999-01-01T00:00:00Z';
566             if($self->{last} ge $self->{time}) {
567             my @ts = $self->{last} =~ /(\d+)-(\d+)-(\d+)T(\d+):(\d+):(\d+)Z/;
568             $ts[1]--;
569             my $ts = timelocal(reverse @ts);
570             @ts = localtime($ts + $self->{offset}); # increment the offset for next time
571             $self->{time} = sprintf "%04d-%02d-%02dT%02d:%02d:%02dZ", $ts[5]+1900,$ts[4]+1,$ts[3], $ts[2],$ts[1],$ts[0];
572             }
573             }
574              
575             $self->_log("START time=[$self->{time}], last=[$self->{last}]\n");
576             $self->{last} = $self->{time};
577              
578             eval {
579             # if($self->{time_to}) {
580             # $guids = $self->{librarian}->search(
581             # 'core.type' => 'CPAN-Testers-Report',
582             # 'core.update_time' => { -and => { ">=" => $self->{time}, "<=" => $self->{time_to} } },
583             # '-asc' => 'core.update_time',
584             # '-limit' => $self->{poll_limit},
585             # );
586             # } else {
587             $guids = $self->{librarian}->search(
588             '-where' => [
589             '-and' =>
590             [ '-eq' => 'core.type' => 'CPAN-Testers-Report' ],
591             [ '-ge' => 'core.update_time' => $self->{time} ]
592             ],
593             '-order' => [ '-asc' => 'core.update_time' ],
594             '-limit' => $self->{poll_limit},
595             );
596             # }
597             };
598              
599             $self->_log(" ... Metabase Search Failed [$@]\n") if($@);
600             $self->_log("Retrieved ".($guids ? scalar(@$guids) : 0)." guids\n");
601             return $guids;
602             }
603              
604             sub retrieve_reports {
605             my ($self,$guids,$start) = @_;
606              
607             if($guids) {
608             for my $guid (@$guids) {
609             $self->_log("GUID [$guid]");
610             $self->{processed}++;
611             $self->{msg} = '';
612              
613             if(my $report = $self->get_fact($guid)) {
614             $self->{report}{guid} = $guid;
615             next if($self->parse_report(report => $report)); # true if invalid report
616              
617             if($self->store_report()) {
618             $self->{msg} .= ".. stored";
619             $self->{stored}++;
620              
621             } else {
622             if($self->{time} gt $self->{report}{updated}) {
623             $self->_log(".. FAIL: older than requested [$self->{time}]\n");
624             next;
625             }
626             $self->{msg} .= ".. already stored";
627             }
628             if($self->cache_report()) { $self->_log(".. cached\n"); $self->{cached}++; }
629             else { $self->_log(".. bad cache data\n"); }
630             } else {
631             $self->_log(".. FAIL\n");
632             }
633             }
634             }
635              
636             $self->commit();
637             my $invalid = $self->{invalid} ? scalar(@{$self->{invalid}}) : 0;
638             my $stop = localtime(time);
639             $self->_log("MARKER: processed=$self->{processed}, stored=$self->{stored}, cached=$self->{cached}, invalid=$invalid, start=$start, stop=$stop\n");
640              
641             # only email invalid reports during the generate process
642             $self->_send_email() if($self->{invalid});
643             }
644              
645             sub already_saved {
646             my ($self,$guid) = @_;
647             my @rows = $self->{METABASE}->get_query('array','SELECT updated FROM metabase WHERE guid=?',$guid);
648             return $rows[0]->[0] if(@rows);
649             return 0;
650             }
651              
652             sub load_fact {
653             my ($self,$guid,$check,$row) = @_;
654              
655             if(!$row && $guid) {
656             my @rows = $self->{METABASE}->get_query('hash','SELECT report,fact FROM metabase WHERE guid=?',$guid);
657             $row = $rows[0] if(@rows);
658             }
659              
660             if($row) {
661             if($row->{fact}) {
662             $self->{fact} = $self->{serializer2}->deserialize($row->{fact});
663             $self->{facts} = $self->dereference_report($self->{fact});
664             return $self->{facts};
665             }
666            
667             if($row->{report}) {
668             $self->{facts} = $self->{serializer}->deserialize($row->{report});
669             return $self->{facts};
670             }
671             }
672              
673             $self->_log(" ... no report [guid=$guid]\n") unless($check);
674             return;
675             }
676              
677             sub get_fact {
678             my ($self,$guid) = @_;
679             my $fact;
680             #print STDERR "guid=$guid\n";
681             eval { $fact = $self->{librarian}->extract( $guid ) };
682              
683             if($fact) {
684             $self->{fact} = $fact;
685             return $fact;
686             }
687              
688             $self->_log(" ... no report [guid=$guid] [$@]\n");
689             return;
690             }
691              
692             sub dereference_report {
693             my ($self,$report) = @_;
694             my %facts;
695              
696             my @facts = $report->facts();
697             for my $fact (@facts) {
698             my $name = ref $fact;
699             $facts{$name} = $fact->as_struct;
700             $facts{$name}{content} = decode_json($facts{$name}{content});
701             }
702              
703             return \%facts;
704             }
705              
706             sub parse_report {
707             my ($self,%hash) = @_;
708             my $options = $hash{options};
709             my $report = $hash{report};
710             my $guid = $self->{report}{guid};
711             my $invalid;
712              
713             $self->{report}{created} = $report->{metadata}{core}{creation_time};
714             $self->{report}{updated} = $report->{metadata}{core}{update_time};
715              
716             unless(ref($report) eq 'CPAN::Testers::Report') {
717             $self->{msg} .= ".. ref [" . ref($report) . "]";
718             return 1;
719             }
720              
721             my @facts = $report->facts();
722             for my $fact (@facts) {
723             if(ref $fact eq 'CPAN::Testers::Fact::TestSummary') {
724             $self->{report}{metabase}{'CPAN::Testers::Fact::TestSummary'} = $fact->as_struct;
725             $self->{report}{metabase}{'CPAN::Testers::Fact::TestSummary'}{content} = decode_json($self->{report}{metabase}{'CPAN::Testers::Fact::TestSummary'}{content});
726              
727             $self->{report}{state} = lc $fact->{content}{grade};
728             $self->{report}{platform} = $fact->{content}{archname};
729             $self->{report}{osname} = $self->_osname($fact->{content}{osname});
730             $self->{report}{osvers} = $fact->{content}{osversion};
731             $self->{report}{perl} = $fact->{content}{perl_version};
732             #$self->{report}{created} = $fact->{metadata}{core}{creation_time};
733             #$self->{report}{updated} = $fact->{metadata}{core}{update_time};
734              
735             my $dist = Metabase::Resource->new( $fact->resource );
736             $self->{report}{dist} = $dist->metadata->{dist_name};
737             $self->{report}{version} = $dist->metadata->{dist_version};
738             $self->{report}{resource} = $dist->metadata->{resource};
739              
740             # some distros are a pain!
741             if($self->{report}{version} eq '' && $MAPPINGS{$self->{report}{dist}}) {
742             $self->{report}{version} = $MAPPINGS{$self->{report}{dist}}->[1];
743             $self->{report}{dist} = $MAPPINGS{$self->{report}{dist}}->[0];
744             } elsif($self->{report}{version} eq '') {
745             $self->{report}{version} = 0;
746             }
747              
748             $self->{report}{from} = $self->_get_tester( $fact->creator->resource );
749              
750             # alternative API
751             #my $profile = $fact->creator->user;
752             #$self->{report}{from} = $profile->{email};
753             #$self->{report}{from} =~ s/'/''/g; #'
754             #$self->{report}{dist} = $fact->resource->dist_name;
755             #$self->{report}{version} = $fact->resource->dist_version;
756              
757             } elsif(ref $fact eq 'CPAN::Testers::Fact::LegacyReport') {
758             $self->{report}{metabase}{'CPAN::Testers::Fact::LegacyReport'} = $fact->as_struct;
759             $self->{report}{metabase}{'CPAN::Testers::Fact::LegacyReport'}{content} = decode_json($self->{report}{metabase}{'CPAN::Testers::Fact::LegacyReport'}{content});
760             $invalid = 'missing textreport' if(length $fact->{content}{textreport} < 10); # what is the smallest report?
761              
762             $self->{report}{perl} = $fact->{content}{perl_version};
763             }
764             }
765              
766             if($invalid) {
767             push @{$self->{invalid}}, {msg => $invalid, guid => $guid};
768             return 1;
769             }
770              
771             # fixes from metabase formatting
772             $self->{report}{perl} =~ s/^v//; # no leading 'v'
773             $self->_check_arch_os();
774              
775             if($self->{report}{created}) {
776             my @created = $self->{report}{created} =~ /(\d+)-(\d+)-(\d+)T(\d+):(\d+):(\d+)Z/; # 2010-02-23T20:33:52Z
777             $self->{report}{postdate} = sprintf "%04d%02d", $created[0], $created[1];
778             $self->{report}{fulldate} = sprintf "%04d%02d%02d%02d%02d", $created[0], $created[1], $created[2], $created[3], $created[4];
779             } else {
780             my @created = localtime(time);
781             $self->{report}{postdate} = sprintf "%04d%02d", $created[5]+1900, $created[4]+1;
782             $self->{report}{fulldate} = sprintf "%04d%02d%02d%02d%02d", $created[5]+1900, $created[4]+1, $created[3], $created[2], $created[1];
783             }
784              
785             $self->{msg} .= ".. time [$self->{report}{created}][$self->{report}{updated}]";
786              
787             $self->{report}{type} = 2;
788             if($self->{DISABLE} && $self->{DISABLE}{$self->{report}{from}}) {
789             $self->{report}{state} .= ':invalid';
790             $self->{report}{type} = 3;
791             } elsif($self->{report}{response} && $self->{report}{response} =~ m!/perl6/!) {
792             # $self->{report}{type} = 6;
793             return 1;
794             }
795              
796             #print STDERR "\n====\nreport=".Dumper($self->{report});
797              
798             return 1 unless($self->_valid_field($guid, 'dist' => $self->{report}{dist}) || ($options && $options->{exclude}{dist}));
799             return 1 unless($self->_valid_field($guid, 'version' => $self->{report}{version}) || ($options && $options->{exclude}{version}));
800             return 1 unless($self->_valid_field($guid, 'from' => $self->{report}{from}) || ($options && $options->{exclude}{from}));
801             return 1 unless($self->_valid_field($guid, 'perl' => $self->{report}{perl}) || ($options && $options->{exclude}{perl}));
802             return 1 unless($self->_valid_field($guid, 'platform' => $self->{report}{platform}) || ($options && $options->{exclude}{platform}));
803             return 1 unless($self->_valid_field($guid, 'osname' => $self->{report}{osname}) || ($options && $options->{exclude}{osname}));
804             return 1 unless($self->_valid_field($guid, 'osvers' => $self->{report}{osvers}) || ($options && $options->{exclude}{osname}));
805              
806             return 0
807             }
808              
809             sub reparse_report {
810             my ($self,%hash) = @_;
811             my $fact = 'CPAN::Testers::Fact::TestSummary';
812             my $options = $hash{options};
813              
814             $self->{report}{metabase}{$fact}{content} = encode_json($self->{report}{metabase}{$fact}{content});
815             my $report = CPAN::Testers::Fact::TestSummary->from_struct( $self->{report}{metabase}{$fact} );
816             my $guid = $self->{report}{guid};
817              
818             $self->{report}{state} = lc $report->{content}{grade};
819             $self->{report}{platform} = $report->{content}{archname};
820             $self->{report}{osname} = $self->_osname($report->{content}{osname});
821             $self->{report}{osvers} = $report->{content}{osversion};
822             $self->{report}{perl} = $report->{content}{perl_version};
823             $self->{report}{created} = $report->{metadata}{core}{creation_time};
824              
825             my $dist = Metabase::Resource->new( $report->{metadata}{core}{resource} );
826             $self->{report}{dist} = $dist->metadata->{dist_name};
827             $self->{report}{version} = $dist->metadata->{dist_version};
828             $self->{report}{resource} = $dist->metadata->{resource};
829              
830             $self->{report}{from} = $self->_get_tester( $report->{metadata}{core}{creator}{resource} );
831              
832             if($self->{report}{created}) {
833             my @created = $self->{report}{created} =~ /(\d+)-(\d+)-(\d+)T(\d+):(\d+):(\d+)Z/; # 2010-02-23T20:33:52Z
834             $self->{report}{postdate} = sprintf "%04d%02d", $created[0], $created[1];
835             $self->{report}{fulldate} = sprintf "%04d%02d%02d%02d%02d", $created[0], $created[1], $created[2], $created[3], $created[4];
836             } else {
837             my @created = localtime(time);
838             $self->{report}{postdate} = sprintf "%04d%02d", $created[5]+1900, $created[4]+1;
839             $self->{report}{fulldate} = sprintf "%04d%02d%02d%02d%02d", $created[5]+1900, $created[4]+1, $created[3], $created[2], $created[1];
840             }
841              
842             $self->{report}{type} = 2;
843             if($self->{DISABLE} && $self->{DISABLE}{$self->{report}{from}}) {
844             $self->{report}{state} .= ':invalid';
845             $self->{report}{type} = 3;
846             } elsif($self->{report}{response} && $self->{report}{response} =~ m!/perl6/!) {
847             # $self->{report}{type} = 6;
848             return 1;
849             }
850              
851             return 1 unless($self->_valid_field($guid, 'dist' => $self->{report}{dist}) || ($options && $options->{exclude}{dist}));
852             return 1 unless($self->_valid_field($guid, 'version' => $self->{report}{version}) || ($options && $options->{exclude}{version}));
853             return 1 unless($self->_valid_field($guid, 'from' => $self->{report}{from}) || ($options && $options->{exclude}{from}));
854             return 1 unless($self->_valid_field($guid, 'perl' => $self->{report}{perl}) || ($options && $options->{exclude}{perl}));
855             return 1 unless($self->_valid_field($guid, 'platform' => $self->{report}{platform}) || ($options && $options->{exclude}{platform}));
856             return 1 unless($self->_valid_field($guid, 'osname' => $self->{report}{osname}) || ($options && $options->{exclude}{osname}));
857             return 1 unless($self->_valid_field($guid, 'osvers' => $self->{report}{osvers}) || ($options && $options->{exclude}{osname}));
858              
859             return 0;
860             }
861              
862             sub retrieve_report {
863             my $self = shift;
864             my $guid = shift or return;
865              
866             my @rows = $self->{CPANSTATS}->get_query('hash','SELECT * FROM cpanstats WHERE guid=?',$guid);
867             return $rows[0] if(@rows);
868             return;
869             }
870              
871             sub store_report {
872             my $self = shift;
873             my @fields = qw(guid state postdate from dist version platform perl osname osvers fulldate type);
874              
875             my %fields = map {$_ => $self->{report}{$_}} @fields;
876             $fields{$_} ||= 0 for(qw(type));
877             $fields{$_} ||= '0' for(qw(perl));
878             $fields{$_} ||= '' for(@fields);
879              
880             my @values = map {$fields{$_}} @fields;
881              
882             my %SQL = (
883             'SELECT' => {
884             CPANSTATS => 'SELECT id FROM cpanstats WHERE guid=?',
885             RELEASE => 'SELECT id FROM release_data WHERE guid=?',
886             },
887             'INSERT' => {
888             CPANSTATS => 'INSERT INTO cpanstats (guid,state,postdate,tester,dist,version,platform,perl,osname,osvers,fulldate,type) VALUES (?,?,?,?,?,?,?,?,?,?,?,?)',
889             RELEASE => 'INSERT INTO release_data (id,guid,dist,version,oncpan,distmat,perlmat,patched,pass,fail,na,unknown) VALUES (?,?,?,?,?,?,?,?,?,?,?,?)',
890             PASSES => 'INSERT IGNORE passreports SET platform=?, osname=?, perl=?, dist=?, postdate=?',
891             },
892             'UPDATE' => {
893             CPANSTATS => 'UPDATE cpanstats SET state=?,postdate=?,tester=?,dist=?,version=?,platform=?,perl=?,osname=?,osvers=?,fulldate=?,type=? WHERE guid=?',
894             RELEASE => 'UPDATE release_data SET id=?,dist=?,version=?,oncpan=?,distmat=?,perlmat=?,patched=?,pass=?,fail=?,na=?,unknown=? WHERE guid=?',
895             },
896             );
897              
898             # update the mysql database
899             my @rows = $self->{CPANSTATS}->get_query('array',$SQL{SELECT}{CPANSTATS},$values[0]);
900             if(@rows) {
901             if($self->{reparse}) {
902             my ($guid,@update) = @values;
903             if($self->{check}) {
904             $self->_log( "CHECK: $SQL{UPDATE}{CPANSTATS},[" . join(',',@update,$guid) . "]\n" );
905             } else {
906             $self->{CPANSTATS}->do_query($SQL{UPDATE}{CPANSTATS},@update,$guid);
907             }
908             } else {
909             $self->{report}{id} = $rows[0]->[0];
910             return 0;
911             }
912             } else {
913             if($self->{check}) {
914             $self->_log( "CHECK: $SQL{INSERT}{CPANSTATS},[" . join(',',@values) . "]\n" );
915             } else {
916             $self->{report}{id} = $self->{CPANSTATS}->id_query($SQL{INSERT}{CPANSTATS},@values);
917             }
918             }
919              
920             # in check mode, assume the rest happens
921             return 1 if($self->{check});
922              
923             # perl version components
924             my ($perl,$patch,$devel) = $self->_get_perl_version($fields{perl});
925              
926             # only valid perl5 reports
927             if($self->{report}{type} == 2) {
928             $fields{id} = $self->{report}{id};
929              
930             # push page requests
931             # - note we only update the author if this is the *latest* version of the distribution
932             my $author = $self->{report}{pauseid} || $self->_get_author($fields{dist},$fields{version});
933             $self->{CPANSTATS}->do_query("INSERT INTO page_requests (type,name,weight,id) VALUES ('author',?,1,?)",$author,$fields{id}) if($author);
934             $self->{CPANSTATS}->do_query("INSERT INTO page_requests (type,name,weight,id) VALUES ('distro',?,1,?)",$fields{dist},$fields{id});
935              
936             my @rows = $self->{CPANSTATS}->get_query('array',$SQL{SELECT}{RELEASE},$fields{guid});
937             #print STDERR "# select release $SQL{SELECT}{RELEASE},$fields{guid}\n";
938             if(@rows) {
939             if($self->{reparse}) {
940             $self->{CPANSTATS}->do_query($SQL{UPDATE}{RELEASE},
941             $fields{id}, # id,
942             $fields{dist},$fields{version}, # dist, version
943              
944             $self->_oncpan($fields{dist},$fields{version}) ? 1 : 2,
945              
946             $fields{version} =~ /_/ ? 2 : 1,
947             $devel ? 2 : 1,
948             $patch ? 2 : 1,
949              
950             $fields{state} eq 'pass' ? 1 : 0,
951             $fields{state} eq 'fail' ? 1 : 0,
952             $fields{state} eq 'na' ? 1 : 0,
953             $fields{state} eq 'unknown' ? 1 : 0,
954              
955             $fields{guid}); # guid
956             }
957             } else {
958             #print STDERR "# insert release $SQL{INSERT}{RELEASE},$fields[0],$fields[1]\n";
959             $self->{CPANSTATS}->do_query($SQL{INSERT}{RELEASE},
960             $fields{id},$fields{guid}, # id, guid
961             $fields{dist},$fields{version}, # dist, version
962              
963             $self->_oncpan($fields{dist},$fields{version}) ? 1 : 2,
964              
965             $fields{version} =~ /_/ ? 2 : 1,
966             $devel ? 2 : 1,
967             $patch ? 2 : 1,
968              
969             $fields{state} eq 'pass' ? 1 : 0,
970             $fields{state} eq 'fail' ? 1 : 0,
971             $fields{state} eq 'na' ? 1 : 0,
972             $fields{state} eq 'unknown' ? 1 : 0);
973             }
974             }
975              
976             if($fields{state} eq 'pass') {
977             $fields{perl} =~ s/\s.*//; # only need to know the main release
978             $self->{CPANSTATS}->do_query($SQL{INSERT}{PASSES},
979             $fields{platform},
980             $fields{osname},
981             $fields{perl},
982             $fields{dist},
983             $fields{postdate});
984             }
985              
986             if((++$self->{stat_count} % 500) == 0) {
987             $self->commit;
988             }
989              
990             return 1;
991             }
992              
993             sub cache_report {
994             my $self = shift;
995             return 0 unless($self->{report}{guid} && $self->{report}{metabase});
996              
997             # in check mode, assume the rest happens
998             return 1 if($self->{check});
999             return 1 if($self->{localonly});
1000              
1001             my ($json,$data,$fact);
1002              
1003             eval { $json = encode_json($self->{report}{metabase}); };
1004             eval { $data = $self->{serializer}->serialize("$json"); };
1005             eval { $data = $self->{serializer}->serialize( $self->{report}{metabase} ); } if($@);
1006             eval { $fact = $self->{serializer2}->serialize($self->{fact}); };
1007              
1008             $data ||= '';
1009             $fact ||= '';
1010              
1011             $self->{METABASE}->do_query('INSERT IGNORE INTO metabase (guid,id,updated,report,fact) VALUES (?,?,?,?,?)',
1012             $self->{report}{guid},$self->{report}{id},$self->{report}{updated},$data,$fact);
1013              
1014             if((++$self->{meta_count} % 500) == 0) {
1015             $self->{METABASE}->do_commit;
1016             }
1017              
1018             return 1;
1019             }
1020              
1021             sub cache_update {
1022             my $self = shift;
1023             return 0 unless($self->{report}{guid} && $self->{report}{id});
1024              
1025             # in check mode, assume the rest happens
1026             return 1 if($self->{check});
1027             return 1 if($self->{localonly});
1028              
1029             $self->{METABASE}->do_query('UPDATE metabase SET id=? WHERE guid=?',$self->{report}{id},$self->{report}{guid});
1030              
1031             if((++$self->{meta_count} % 500) == 0) {
1032             $self->{METABASE}->do_commit;
1033             }
1034              
1035             return 1;
1036             }
1037              
1038             #----------------------------------------------------------------------------
1039             # Internal Cache Methods
1040              
1041             sub load_uploads {
1042             my $self = shift;
1043              
1044             my @rows = $self->{CPANSTATS}->get_query('hash','SELECT dist,version,type FROM uploads');
1045             for my $row (@rows) {
1046             $self->{oncpan}{$row->{dist}}{$row->{version}} = $row->{type};
1047             }
1048             }
1049              
1050             sub load_authors {
1051             my $self = shift;
1052              
1053             my @rows = $self->{CPANSTATS}->get_query('hash','SELECT author,dist,version FROM ixlatest');
1054             for my $row (@rows) {
1055             $self->{author}{$row->{dist}}{$row->{version}} = $row->{author};
1056             }
1057             }
1058              
1059             sub load_perl_versions {
1060             my $self = shift;
1061              
1062             my @rows = $self->{CPANSTATS}->get_query('hash','SELECT * FROM perl_version');
1063             for my $row (@rows) {
1064             $self->{perls}{$row->{version}} = {
1065             perl => $row->{perl},
1066             patch => $row->{patch},
1067             devel => $row->{devel},
1068             saved => 1
1069             };
1070             }
1071             }
1072              
1073             sub save_perl_versions {
1074             my $self = shift;
1075              
1076             for my $vers (keys %{ $self->{perls} }) {
1077             next if($self->{perls}{$vers}{saved});
1078             $self->{CPANSTATS}->do_query("INSERT INTO perl_version (version,perl,patch,devel) VALUES (?,?,?,?)",
1079             $vers, $self->{perls}{$vers}{perl}, $self->{perls}{$vers}{patch}, $self->{perls}{$vers}{devel});
1080             }
1081             }
1082              
1083             #----------------------------------------------------------------------------
1084             # Private Methods
1085              
1086             sub _consume_reports {
1087             my ($self,$maxdate,$dataset) = @_;
1088              
1089             for my $data (@$dataset) {
1090             my $start = $self->_get_createdate( $data->{gstart}, $data->{dstart} );
1091             my $end = $self->_get_createdate( $data->{gend}, $data->{dend} );
1092              
1093             unless($start && $end) {
1094             $start ||= '';
1095             $end ||= '';
1096             $self->_log("BAD DATES: start=$start, end=$end [missing dates]\n");
1097             next;
1098             }
1099             if($start ge $end) {
1100             $self->_log("BAD DATES: start=$start, end=$end [end before start]\n");
1101             next;
1102             }
1103             # if($end gt $maxdate) {
1104             # $self->_log("BAD DATES: start=$start, end=$end [exceeds $maxdate]\n");
1105             # next;
1106             # }
1107              
1108             $self->_log("LOOP: start=$start, end=$end\n");
1109              
1110             ($self->{processed},$self->{stored},$self->{cached}) = (0,0,0);
1111              
1112             # what guids do we already have?
1113             my $sql = 'SELECT guid FROM metabase WHERE updated >= ? AND updated <= ? ORDER BY updated asc';
1114             my @guids = $self->{METABASE}->get_query('hash',$sql,$data->{dstart},$data->{dend});
1115             my %guids = map {$_->{guid} => 1} @guids;
1116              
1117             # note that because Amazon's SimpleDB can return odd entries out of
1118             # sync, we have to look at previous entries to ensure we are starting
1119             # from the right point
1120             my ($update,$prev,$last) = ($start,$start,$start);
1121             my @times = ();
1122              
1123             my $prior = [ 0, 0 ];
1124             my $saved = 0;
1125             while($update lt $end) {
1126             $self->_log("UPDATE: update=$update, end=$end, saved=$saved, guids=".(scalar(@guids))."\n");
1127              
1128             # get list of guids from last update date
1129             my $guids = $self->get_next_guids($update,$end);
1130             last unless($guids);
1131              
1132             @guids = grep { !$guids{$_} } @$guids;
1133             last unless(@guids);
1134             last if($prior->[0] eq $guids[0] && $prior->[1] eq $guids[-1]); # prevent an endless loop
1135             $prior = [ $guids[0], $guids[-1] ];
1136              
1137             $self->_log("UPDATE: todo guids=".(scalar(@guids))."\n");
1138              
1139             my $current = $update;
1140             for my $guid (@guids) {
1141             # don't process too far
1142             shift @times if(@times > 9); # one off
1143             push @times, [ $current, (_date_diff($end,$current) <= 0 ? 0 : 1) ]; # one on ... max 10
1144              
1145             my $times = 0;
1146             $times += $_->[1] for(@times);
1147             last if($times == 10); # stop if all greater than end
1148              
1149             # okay process
1150             $self->_log("GUID [$guid]");
1151              
1152             $self->{processed}++;
1153              
1154             if(my $time = $self->already_saved($guid)) {
1155             $self->_log(".. already saved [$time]\n");
1156             $current = $time;
1157             $saved++;
1158             next;
1159             }
1160              
1161             if(my $report = $self->get_fact($guid)) {
1162             $current = $report->{metadata}{core}{update_time};
1163             $self->{report}{guid} = $guid;
1164             next if($self->parse_report(report => $report)); # true if invalid report
1165              
1166             if($self->store_report()) { $self->_log(".. stored"); $self->{stored}++; }
1167             else { $self->_log(".. already stored"); }
1168             if($self->cache_report()) { $self->_log(".. cached\n"); $self->{cached}++; }
1169             else { $self->_log(".. bad cache data\n"); }
1170             } else {
1171             $self->_log(".. FAIL\n");
1172             }
1173             }
1174              
1175             $update = $times[0]->[0];
1176              
1177             $self->commit();
1178             }
1179              
1180             $self->commit();
1181             my $invalid = $self->{invalid} ? scalar(@{$self->{invalid}}) : 0;
1182             my $stop = localtime(time);
1183             $self->_log("MARKER: processed=$self->{processed}, stored=$self->{stored}, cached=$self->{cached}, invalid=$invalid, start=$start, stop=$stop\n");
1184             }
1185              
1186             # only email invalid reports during the generate process
1187             $self->_send_email() if($self->{invalid});
1188             }
1189              
1190             sub _get_perl_version {
1191             my $self = shift;
1192             my $vers = shift;
1193              
1194             unless($self->{perls}{$vers}) {
1195             my $patch = $vers =~ /^5.(7|9|[1-9][13579])/ ? 1 : 0, # odd numbers now mark development releases
1196             my $devel = $vers =~ /(RC\d+|patch)/ ? 1 : 0,
1197             my ($perl) = $vers =~ /(5\.\d+(?:\.\d+)?)/;
1198              
1199             $self->{perls}{$vers} = {
1200             perl => $perl,
1201             patch => $patch,
1202             devel => $devel,
1203             saved => 0
1204             };
1205             }
1206              
1207             return $self->{perls}{$vers}{perl}, $self->{perls}{$vers}{patch}, $self->{perls}{$vers}{devel};
1208             }
1209              
1210             sub _get_guid_list {
1211             my ($self,$guid,$file) = @_;
1212             my (@ids,@guids);
1213              
1214             # we're only parsing one id
1215             if($guid) {
1216             if($guid =~ /^\d+$/) { push @ids, $guid }
1217             else { push @guids, $guid }
1218             } elsif($file) {
1219             my $fh = IO::File->new($file,'r') or die "Cannot read file [$file]: $!";
1220             while(<$fh>) {
1221             chomp;
1222             my ($num) = (m/^([\da-z-]+)/i);
1223             if($num =~ /^\d+$/) { push @ids, $num }
1224             else { push @guids, $num }
1225             }
1226             $fh->close;
1227             } else {
1228             return;
1229             }
1230              
1231             # turn ids into guids
1232             if(@ids) {
1233             my @rows = $self->{CPANSTATS}->get_query('array','SELECT guid FROM cpanstats WHERE id IN ('.join(',',@ids).')');
1234             push @guids, $_->[0] for(@rows);
1235             }
1236              
1237             my %guids = map {$_ => 1} @guids;
1238             my @list = keys %guids;
1239             return @list;
1240             }
1241              
1242             sub _get_createdate {
1243             my ($self,$guid,$date) = @_;
1244              
1245             return unless($guid || $date);
1246             if($guid) {
1247             my @rows = $self->{METABASE}->get_query('hash','SELECT updated FROM metabase WHERE guid=?',$guid);
1248             $date = $rows[0]->{updated} if(@rows);
1249             }
1250              
1251             return unless($date && $date =~ /^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}Z$/);
1252             return $date;
1253             }
1254              
1255             sub _get_tester {
1256             my ($self,$creator) = @_;
1257             return $testers{$creator} if($testers{$creator});
1258              
1259             my $profile = Metabase::Resource->new( $creator );
1260             return $creator unless($profile);
1261              
1262             my $user;
1263             eval { $user = $self->{librarian}->extract( $profile->guid ) };
1264             return $creator unless($user);
1265              
1266             my ($name,@emails);
1267             for my $fact ($user->facts()) {
1268             if(ref $fact eq 'Metabase::User::EmailAddress') {
1269             push @emails, $fact->{content};
1270             } elsif(ref $fact eq 'Metabase::User::FullName') {
1271             $name = encode_entities($fact->{content});
1272             }
1273             }
1274              
1275             $name ||= 'NONAME'; # shouldn't happen, but allows for checks later
1276              
1277             for my $em (@emails) {
1278             $self->{METABASE}->do_query('INSERT INTO testers_email (resource,fullname,email) VALUES (?,?,?)',$creator,$name,$em);
1279             }
1280              
1281             $testers{$creator} = @emails ? $emails[0] : $creator;
1282             $testers{$creator} =~ s/\'/''/g if($testers{$creator});
1283             return $testers{$creator};
1284             }
1285              
1286             sub _get_author {
1287             my ($self,$dist,$vers) = @_;
1288             my $author = $self->{author}{$dist}{$vers} || '';
1289             return $author;
1290             }
1291              
1292             sub _valid_field {
1293             my ($self,$id,$name,$value) = @_;
1294             return 1 if(defined $value);
1295             $self->_log(" . [$id] ... missing field: $name\n");
1296             return 0;
1297             }
1298              
1299             sub _get_lastid {
1300             my $self = shift;
1301              
1302             my @rows = $self->{METABASE}->get_query('array',"SELECT MAX(id) FROM metabase");
1303             return 0 unless(@rows);
1304             return $rows[0]->[0] || 0;
1305             }
1306              
1307             sub _oncpan {
1308             my ($self,$dist,$vers) = @_;
1309              
1310             my $type = $self->{oncpan}{$dist}{$vers};
1311              
1312             return 1 unless($type); # assume it's a new release
1313             return 0 if($type eq 'backpan'); # on backpan only
1314             return 1; # on cpan or new upload
1315             }
1316              
1317             sub _osname {
1318             my $self = shift;
1319             my $name = shift || return '';
1320              
1321             my $lname = lc $name;
1322             my $uname = uc $name;
1323             $self->{OSNAMES}{$lname} ||= do {
1324             $self->{CPANSTATS}->do_query(qq{INSERT INTO osname (osname,ostitle) VALUES ('$name','$uname')});
1325             $uname;
1326             };
1327              
1328             return $self->{OSNAMES}{$lname};
1329             }
1330              
1331             sub _check_arch_os {
1332             my $self = shift;
1333              
1334             my $text = $self->_platform_to_osname($self->{report}{platform});
1335             #print STDERR "_check: text=$text\n";
1336             #print STDERR "_check: platform=$self->{report}{platform}\n";
1337             #print STDERR "_check: osname=$self->{report}{osname}\n";
1338             return if($text && $self->{report}{osname} && lc $text eq lc $self->{report}{osname});
1339              
1340             #print STDERR "_check: metabase=".Dumper($self->{report}{metabase})."\n";
1341             my $textreport = $self->{report}{metabase}{'CPAN::Testers::Fact::LegacyReport'}{content}{textreport};
1342             $textreport =~ s/\\n/\n/g; # newlines may be escaped
1343              
1344             # create a fake mail, as CTC::Article parses a mail like text block
1345             my $mail = <<EMAIL;
1346             From: fake\@example.com
1347             To: fake\@example.com
1348             Subject: PASS Fake-0.01
1349             Date: 01-01-2010 01:01:01 Z
1350              
1351             $textreport
1352             EMAIL
1353             my $object = CPAN::Testers::Common::Article->new( $mail ) or return;
1354             $object->parse_report();
1355              
1356             $self->{report}{osname} = $object->osname;
1357             $self->{report}{platform} = $object->archname;
1358             }
1359              
1360             sub _platform_to_osname {
1361             my $self = shift;
1362             my $arch = shift || return '';
1363              
1364             $OSNAMES = join('|',keys %{$self->{OSNAMES}}) if(keys %{$self->{OSNAMES}});
1365              
1366             return $1 if($arch =~ /($OSNAMES)/i);
1367              
1368             for my $rx (keys %{ $self->{OSNAMES} }) {
1369             return $self->{OSNAMES}{$rx} if($arch =~ /$rx/i);
1370             }
1371              
1372             return '';
1373             }
1374              
1375             sub _send_email {
1376             my $self = shift;
1377             my $t = localtime;
1378             my $DATE = $t->strftime("%a, %d %b %Y %H:%M:%S +0000");
1379             $DATE =~ s/\s+$//;
1380             my $INVALID = join("\n",@{$self->{invalid}});
1381             $self->_log("INVALID:\n$INVALID\n");
1382              
1383             for my $admin (@{$self->{admins}}) {
1384             my $cmd = qq!| $HOW $admin!;
1385              
1386             my $body = $HEAD . $BODY;
1387             $body =~ s/FROM/$FROM/g;
1388             $body =~ s/EMAIL/$admin/g;
1389             $body =~ s/DATE/$DATE/g;
1390             $body =~ s/INVALID/$INVALID/g;
1391              
1392             if(my $fh = IO::File->new($cmd)) {
1393             print $fh $body;
1394             $fh->close;
1395             $self->_log(".. MAIL SEND - SUCCESS - $admin\n");
1396             } else {
1397             $self->_log(".. MAIL SEND - FAILED - $admin\n");
1398             }
1399             }
1400             }
1401              
1402             sub _date_diff {
1403             my ($date1,$date2) = @_;
1404              
1405             my (@dt1) = $date1 =~ /(\d+)-(\d+)-(\d+)T(\d+):(\d+):(\d+)Z/;
1406             my (@dt2) = $date2 =~ /(\d+)-(\d+)-(\d+)T(\d+):(\d+):(\d+)Z/;
1407              
1408             return -1 unless(@dt1 && @dt2);
1409              
1410             my $dt1 = DateTime->new( year => $dt1[0], month => $dt1[1], day => $dt1[2], hour => $dt1[3], minute => $dt1[4], second => $dt1[5], time_zone => 'UTC' )->epoch;
1411             my $dt2 = DateTime->new( year => $dt2[0], month => $dt2[1], day => $dt2[2], hour => $dt2[3], minute => $dt2[4], second => $dt2[5], time_zone => 'UTC' )->epoch;
1412              
1413             return $dt2 - $dt1;
1414             }
1415              
1416             sub _log {
1417             my $self = shift;
1418             my $log = $self->{logfile} or return;
1419             mkpath(dirname($log)) unless(-f $log);
1420             my $fh = IO::File->new($log,'a+') or die "Cannot append to log file [$log]: $!\n";
1421             print $fh $self->{msg} if($self->{msg});
1422             print $fh @_;
1423             $fh->close;
1424             $self->{msg} = '';
1425             }
1426              
1427             1;
1428              
1429             __END__
1430              
1431             =head1 NAME
1432              
1433             CPAN::Testers::Data::Generator - Download and summarize CPAN Testers data
1434              
1435             =head1 SYNOPSIS
1436              
1437             % cpanstats
1438             # ... wait patiently, very patiently
1439             # ... then use the cpanstats MySQL database
1440              
1441             =head1 DESCRIPTION
1442              
1443             This distribution was originally written by Leon Brocard to download and
1444             summarize CPAN Testers data. However, all of the original code has been
1445             rewritten to use the CPAN Testers Statistics database generation code. This
1446             now means that all the CPAN Testers sites including the Reports site, the
1447             Statistics site and the CPAN Dependencies site, can use the same database.
1448              
1449             This module retrieves and parses reports from the Metabase, generating or
1450             updating entries in the cpanstats database, which extracts specific metadata
1451             from the reports. The information in the cpanstats database is then presented
1452             via CPAN::Testers::WWW::Reports on the CPAN Testers Reports website.
1453              
1454             A good example query from the cpanstats database for Acme-Colour would be:
1455              
1456             SELECT version, status, count(*) FROM cpanstats WHERE
1457             dist = "Acme-Colour" group by version, state;
1458              
1459             To create a database from scratch can take several days, as there are now over
1460             24 million submitted reports. As such updating from a known copy of the
1461             database is much more advisable. If you don't want to generate the database
1462             yourself, you can obtain a feed using CPAN::Testers::WWW::Report::Query::Reports.
1463              
1464             With over 24 million reports in the database, if you do plan to run this
1465             software to generate the databases it is recommended you utilise a high-end
1466             processor machine. Even with a reasonable processor it can take over a week!
1467              
1468             =head1 DATABASE SCHEMA
1469              
1470             The cpanstats database schema is very straightforward, one main table with
1471             several index tables to speed up searches. The main table is as below:
1472              
1473             CREATE TABLE `cpanstats` (
1474              
1475             `id` int(10) unsigned NOT NULL AUTO_INCREMENT,
1476             `guid` char(36) NOT NULL DEFAULT '',
1477             `state` varchar(32) DEFAULT NULL,
1478             `postdate` varchar(8) DEFAULT NULL,
1479             `tester` varchar(255) DEFAULT NULL,
1480             `dist` varchar(255) DEFAULT NULL,
1481             `version` varchar(255) DEFAULT NULL,
1482             `platform` varchar(255) DEFAULT NULL,
1483             `perl` varchar(255) DEFAULT NULL,
1484             `osname` varchar(255) DEFAULT NULL,
1485             `osvers` varchar(255) DEFAULT NULL,
1486             `fulldate` varchar(32) DEFAULT NULL,
1487             `type` int(2) DEFAULT '0',
1488            
1489             PRIMARY KEY (`id`),
1490             KEY `guid` (`guid`),
1491             KEY `distvers` (`dist`,`version`),
1492             KEY `tester` (`tester`),
1493             KEY `state` (`state`),
1494             KEY `postdate` (`postdate`)
1495            
1496             )
1497              
1498             It should be noted that 'postdate' refers to the YYYYMM formatted date, whereas
1499             the 'fulldate' field refers to the YYYYMMDDhhmm formatted date and time.
1500              
1501             The metabase database schema is again very straightforward, and consists of one
1502             main table, as below:
1503              
1504             CREATE TABLE `metabase` (
1505            
1506             `guid` char(36) NOT NULL,
1507             `id` int(10) unsigned NOT NULL,
1508             `updated` varchar(32) DEFAULT NULL,
1509             `report` longblob NOT NULL,
1510             `fact` longblob NOT NULL,
1511            
1512             PRIMARY KEY (`guid`),
1513             KEY `id` (`id`),
1514             KEY `updated` (`updated`)
1515            
1516             )
1517              
1518             The id field is a reference to the cpanstats.id field.
1519              
1520             The report field is JSON encoded, and is a cached version of the facts of a
1521             report, while the fact field is the full report fact, and associated child
1522             facts, Sereal encoded. Both are extracted from the returned fact from
1523             Metabase::Librarian.
1524              
1525             See F<examples/cpanstats-createdb> for the full list of tables used.
1526              
1527             =head1 SIGNIFICANT CHANGES
1528              
1529             =head2 v0.31 CHANGES
1530              
1531             With the release of v0.31, a number of changes to the codebase were made as
1532             a further move towards CPAN Testers 2.0. The first change is the name for this
1533             distribution. Now titled 'CPAN-Testers-Data-Generator', this now fits more
1534             appropriately within the CPAN-Testers namespace on CPAN.
1535              
1536             The second significant change is to now reference a MySQL cpanstats database.
1537             The SQLite version is still updated as before, as a number of other websites
1538             and toolsets still rely on that database file format. However, in order to make
1539             the CPAN Testers Reports website more dynamic, an SQLite database is not really
1540             appropriate for a high demand website.
1541              
1542             The database creation code is now available as a standalone program, in the
1543             examples directory, and all the database communication is now handled by the
1544             new distribution CPAN-Testers-Common-DBUtils.
1545              
1546             =head2 v0.41 CHANGES
1547              
1548             In the next stage of development of CPAN Testers 2.0, the id field used within
1549             the database schema above for the cpanstats table no longer matches the NNTP
1550             ID value, although the id in the articles does still reference the NNTP ID, at
1551             least for the reports submitted prior to the switch to the Metabase in 2010.
1552              
1553             In order to correctly reference the id in the articles table, you will need to
1554             use the function guid_to_nntp() with CPAN::Testers::Common::Utils, using the
1555             new guid field in the cpanstats table.
1556              
1557             As of this release the cpanstats id field is a unique auto incrementing field.
1558              
1559             The next release of this distribution will be focused on generation of stats
1560             using the Metabase storage API.
1561              
1562             =head2 v1.00 CHANGES
1563              
1564             Moved to Metabase API. The change to a definite major version number hopefully
1565             indicates that this is a major interface change. All previous NNTP access has
1566             been dropped and is no longer relavent. All report updates are now fed from
1567             the Metabase API.
1568              
1569             =head1 INTERFACE
1570              
1571             =head2 The Constructor
1572              
1573             =over
1574              
1575             =item * new
1576              
1577             Instatiates the object CPAN::Testers::Data::Generator. Accepts a hash containing
1578             values to prepare the object. These are described as:
1579              
1580             my $obj = CPAN::Testers::Data::Generator->new(
1581             logfile => './here/logfile',
1582             config => './here/config.ini'
1583             );
1584              
1585             Where 'logfile' is the location to write log messages. Log messages are only
1586             written if a logfile entry is specified, and will always append to any existing
1587             file. The 'config' should contain the path to the configuration file, used
1588             to define the database access and general operation settings.
1589              
1590             =back
1591              
1592             =head2 Public Methods
1593              
1594             =over
1595              
1596             =item * generate
1597              
1598             Starting from the last cached report, retrieves all the more recent reports
1599             from the Metabase Report Submission server, parsing each and recording each
1600             report in both the cpanstats database and the metabase cache database.
1601              
1602             =item * regenerate
1603              
1604             For a given date range, retrieves all the reports from the Metabase Report
1605             Submission server, parsing each and recording each report in both the cpanstats
1606             database and the metabase cache database.
1607              
1608             Note that as only 2500 can be returned at any one time due to Amazon SimpleDB
1609             restrictions, this method will only process the guids returned from a given
1610             start data, up to a maxiumu of 2500 guids.
1611              
1612             This method will return the guid of the last report processed.
1613              
1614             =item * rebuild
1615              
1616             In the event that the cpanstats database needs regenerating, either in part or
1617             for the whole database, this method allow you to do so. You may supply
1618             parameters as to the 'start' and 'end' values (inclusive), where all records
1619             are assumed by default. Records are rebuilt using the local metabase cache
1620             database.
1621              
1622             =item * reparse
1623              
1624             Rather than a complete rebuild the option to selective reparse selected entries
1625             is useful if there are reports which were previously unable to correctly supply
1626             a particular field, which now has supporting parsing code within the codebase.
1627              
1628             In addition there is the option to exclude fields from parsing checks, where
1629             they may be corrupted, and can be later amended using the 'cpanstats-update'
1630             tool.
1631              
1632             =item * parse
1633              
1634             Unlike reparse, parse is used to parse just missing reports. As such if a
1635             report has already been stored and cached, it won't be processed again, unless
1636             the 'force' option is used.
1637              
1638             In addition, as per reparse, there is the option to exclude fields from parsing
1639             checks, where they may be corrupted, and can be later amended using the
1640             'cpanstats-update' tool.
1641              
1642             =item * tail
1643              
1644             Write to a file, the list of GUIDs returned from a tail request.
1645              
1646             =back
1647              
1648             =head2 Private Methods
1649              
1650             =over
1651              
1652             =item * commit
1653              
1654             To speed up the transaction process, a commit is performed every 500 inserts.
1655             This method is used as part of the clean up process to ensure all transactions
1656             are completed.
1657              
1658             =item * get_tail_guids
1659              
1660             Get the list of GUIDs as would be seen for a tail log.
1661              
1662             =item * get_next_dates
1663              
1664             Get the list of dates to use in the next cycle of report retrieval.
1665              
1666             =item * get_next_guids
1667              
1668             Get the list of GUIDs for the reports that have been submitted since the last
1669             cached report.
1670              
1671             =item * retrieve_reports
1672              
1673             Abstracted loop of requesting GUIDs, then parsing, storing and caching each
1674             report as appropriate.
1675              
1676             =item * already_saved
1677              
1678             Given a guid, determines whether it has already been saved in the local
1679             metabase cache.
1680              
1681             =item * load_fact
1682              
1683             Get a specific report fact for a given GUID, from the local database.
1684              
1685             =item * get_fact
1686              
1687             Get a specific report fact for a given GUID, from the Metabase.
1688              
1689             =item * dereference_report
1690              
1691             When you retrieve the parent report fact from the database, you'll need to
1692             dereference it to ensure the child elements contain the child facts in the
1693             correct format for processing.
1694              
1695             =item * parse_report
1696              
1697             Parses a report extracting the metadata required for the cpanstats database.
1698              
1699             =item * reparse_report
1700              
1701             Parses a report (from a local metabase cache) extracting the metadata required
1702             for the stats database.
1703              
1704             =item * retrieve_report
1705              
1706             Given a guid will attempt to return the report metadata from the cpanstats
1707             database.
1708              
1709             =item * store_report
1710              
1711             Inserts the components of a parsed report into the cpanstats database.
1712              
1713             =item * cache_report
1714              
1715             Inserts a serialised report into a local metabase cache database.
1716              
1717             =item * cache_update
1718              
1719             For the current report will update the local metabase cache with the id used
1720             within the cpanstats database.
1721              
1722             =back
1723              
1724             =head2 Very Private methods
1725              
1726             The following modules load information enmasse to avoid DB connection hogging
1727             and IO blocking. Thus improving performance.
1728              
1729             =over 4
1730              
1731             =item * load_uploads
1732              
1733             Loads the upload information.
1734              
1735             =item * load_authors
1736              
1737             Loads information regarding each author's distribution.
1738              
1739             =item * load_perl_versions
1740              
1741             Loads all the known Perl versions.
1742              
1743             =item * save_perl_versions
1744              
1745             Saves any new Perl versions
1746              
1747             =back
1748              
1749             =head1 HISTORY
1750              
1751             The CPAN Testers was conceived back in May 1998 by Graham Barr and Chris
1752             Nandor as a way to provide multi-platform testing for modules. Today there
1753             are over 40 million tester reports and more than 100 testers each month
1754             giving valuable feedback for users and authors alike.
1755              
1756             =head1 BECOME A TESTER
1757              
1758             Whether you have a common platform or a very unusual one, you can help by
1759             testing modules you install and submitting reports. There are plenty of
1760             module authors who could use test reports and helpful feedback on their
1761             modules and distributions.
1762              
1763             If you'd like to get involved, please take a look at the CPAN Testers Wiki,
1764             where you can learn how to install and configure one of the recommended
1765             smoke tools.
1766              
1767             For further help and advice, please subscribe to the the CPAN Testers
1768             discussion mailing list.
1769              
1770             CPAN Testers Wiki
1771             - http://wiki.cpantesters.org
1772             CPAN Testers Discuss mailing list
1773             - http://lists.cpan.org/showlist.cgi?name=cpan-testers-discuss
1774              
1775             =head1 BUCKETS
1776              
1777             beta6 - 2014-01-21
1778             beta7 - 2014-11-12
1779              
1780             =head1 BUGS, PATCHES & FIXES
1781              
1782             There are no known bugs at the time of this release. However, if you spot a
1783             bug or are experiencing difficulties, that is not explained within the POD
1784             documentation, please send bug reports and patches to the RT Queue (see below).
1785              
1786             Fixes are dependent upon their severity and my availability. Should a fix not
1787             be forthcoming, please feel free to (politely) remind me.
1788              
1789             RT Queue -
1790             http://rt.cpan.org/Public/Dist/Display.html?Name=CPAN-Testers-Data-Generator
1791              
1792             =head1 SEE ALSO
1793              
1794             L<CPAN::Testers::Report>,
1795             L<Metabase>,
1796             L<Metabase::Fact>,
1797             L<CPAN::Testers::Fact::LegacyReport>,
1798             L<CPAN::Testers::Fact::TestSummary>,
1799             L<CPAN::Testers::Metabase::AWS>
1800              
1801             L<CPAN::Testers::WWW::Statistics>
1802              
1803             F<http://www.cpantesters.org/>,
1804             F<http://stats.cpantesters.org/>,
1805             F<http://wiki.cpantesters.org/>
1806              
1807             =head1 AUTHOR
1808              
1809             It should be noted that the original code for this distribution began life
1810             under another name. The original distribution generated data for the original
1811             CPAN Testers website. However, in 2008 the code was reworked to generate data
1812             in the format for the statistics data analysis, which in turn was reworked to
1813             drive the redesign of the all the CPAN Testers websites. To reflect the code
1814             changes, a new name was given to the distribution.
1815              
1816             =head2 CPAN-WWW-Testers-Generator
1817              
1818             Original author: Leon Brocard <acme@astray.com> (C) 2002-2008
1819             Current maintainer: Barbie <barbie@cpan.org> (C) 2008-2010
1820              
1821             =head2 CPAN-Testers-Data-Generator
1822              
1823             Original author: Barbie <barbie@cpan.org> (C) 2008-2015
1824              
1825             =head1 LICENSE
1826              
1827             This module is free software; you can redistribute it and/or
1828             modify it under the Artistic License 2.0.