| line | stmt | bran | cond | sub | pod | time | code | 
| 1 |  |  |  |  |  |  | ## -*- Mode: CPerl -*- | 
| 2 |  |  |  |  |  |  | ## File: DiaColloDB::Compat::v0_09::Relation.pm | 
| 3 |  |  |  |  |  |  | ## Author: Bryan Jurish <moocow@cpan.org> | 
| 4 |  |  |  |  |  |  | ## Description: collocation db, relation API (abstract & utilities) | 
| 5 |  |  |  |  |  |  |  | 
| 6 |  |  |  |  |  |  | package DiaColloDB::Compat::v0_09::Relation; | 
| 7 | 1 |  |  | 1 |  | 6 | use DiaColloDB::Compat; | 
|  | 1 |  |  |  |  | 2 |  | 
|  | 1 |  |  |  |  | 24 |  | 
| 8 | 1 |  |  | 1 |  | 4 | use DiaColloDB::Relation; | 
|  | 1 |  |  |  |  | 2 |  | 
|  | 1 |  |  |  |  | 35 |  | 
| 9 | 1 |  |  | 1 |  | 6 | use DiaColloDB::Utils qw(:si); | 
|  | 1 |  |  |  |  | 1 |  | 
|  | 1 |  |  |  |  | 31 |  | 
| 10 | 1 |  |  | 1 |  | 173 | use Algorithm::BinarySearch::Vec qw(:api); | 
|  | 1 |  |  |  |  | 2 |  | 
|  | 1 |  |  |  |  | 29 |  | 
| 11 | 1 |  |  | 1 |  | 230 | use strict; | 
|  | 1 |  |  |  |  | 2 |  | 
|  | 1 |  |  |  |  | 100 |  | 
| 12 |  |  |  |  |  |  |  | 
| 13 |  |  |  |  |  |  | ##============================================================================== | 
| 14 |  |  |  |  |  |  | ## Globals & Constants | 
| 15 |  |  |  |  |  |  |  | 
| 16 |  |  |  |  |  |  | our @ISA = qw(DiaColloDB::Relation DiaColloDB::Compat); | 
| 17 |  |  |  |  |  |  |  | 
| 18 |  |  |  |  |  |  | ##============================================================================== | 
| 19 |  |  |  |  |  |  | ## Constructors etc. | 
| 20 |  |  |  |  |  |  |  | 
| 21 |  |  |  |  |  |  | ## $rel = CLASS_OR_OBJECT->new(%args) | 
| 22 |  |  |  |  |  |  | ## + %args, object structure: see subclases | 
| 23 |  |  |  |  |  |  | sub new { | 
| 24 | 0 |  |  | 0 | 1 |  | my ($that,%args) = @_; | 
| 25 | 0 |  | 0 |  |  |  | return bless({ %args }, ref($that)||$that); | 
| 26 |  |  |  |  |  |  | } | 
| 27 |  |  |  |  |  |  |  | 
| 28 |  |  |  |  |  |  | ##============================================================================== | 
| 29 |  |  |  |  |  |  | ## Relation API: create | 
| 30 |  |  |  |  |  |  |  | 
| 31 |  |  |  |  |  |  | ## $rel = $CLASS_OR_OBJECT->create($coldb, $tokdat_file, %opts) | 
| 32 |  |  |  |  |  |  | ##  + populates current database from $tokdat_file, | 
| 33 |  |  |  |  |  |  | ##    a tt-style text file containing 1 token-id perl line with optional blank lines | 
| 34 |  |  |  |  |  |  | ##  + %opts: clobber %$rel | 
| 35 |  |  |  |  |  |  | ##  + DISABLED | 
| 36 |  |  |  |  |  |  | BEGIN { | 
| 37 | 1 |  |  | 1 |  | 8 | *create = DiaColloDB::Compat->nocompat('create'); | 
| 38 |  |  |  |  |  |  | } | 
| 39 |  |  |  |  |  |  |  | 
| 40 |  |  |  |  |  |  | ##============================================================================== | 
| 41 |  |  |  |  |  |  | ## Relation API: union | 
| 42 |  |  |  |  |  |  |  | 
| 43 |  |  |  |  |  |  | ## $rel = $CLASS_OR_OBJECT->union($coldb, \@pairs, %opts) | 
| 44 |  |  |  |  |  |  | ##  + merge multiple co-frequency indices into new object | 
| 45 |  |  |  |  |  |  | ##  + @pairs : array of pairs ([$argrel,\@xi2u],...) | 
| 46 |  |  |  |  |  |  | ##    of relation-objects $argrel and tuple-id maps \@xi2u for $rel | 
| 47 |  |  |  |  |  |  | ##  + %opts: clobber %$rel | 
| 48 |  |  |  |  |  |  | ##  + implicitly flushes the new index | 
| 49 |  |  |  |  |  |  | ##  + DISABLED | 
| 50 |  |  |  |  |  |  | BEGIN { | 
| 51 | 1 |  |  | 1 |  | 6 | *union = DiaColloDB::Compat->nocompat('union'); | 
| 52 |  |  |  |  |  |  | } | 
| 53 |  |  |  |  |  |  |  | 
| 54 |  |  |  |  |  |  | ##============================================================================== | 
| 55 |  |  |  |  |  |  | ## Relation API: info | 
| 56 |  |  |  |  |  |  |  | 
| 57 |  |  |  |  |  |  | ## \%info = $rel->dbinfo($coldb) | 
| 58 |  |  |  |  |  |  | ##  + embedded info-hash for $coldb->dbinfo() | 
| 59 |  |  |  |  |  |  | sub dbinfo { | 
| 60 | 0 |  |  | 0 | 0 |  | my $rel = shift; | 
| 61 | 0 |  |  |  |  |  | my $info = { class=>ref($rel) }; | 
| 62 | 0 | 0 |  |  |  |  | if ($rel->can('du')) { | 
| 63 | 0 |  |  |  |  |  | $info->{du_b} = $rel->du(); | 
| 64 | 0 |  |  |  |  |  | $info->{du_h} = si_str($info->{du_b}); | 
| 65 |  |  |  |  |  |  | } | 
| 66 | 0 |  |  |  |  |  | return $info; | 
| 67 |  |  |  |  |  |  | } | 
| 68 |  |  |  |  |  |  |  | 
| 69 |  |  |  |  |  |  |  | 
| 70 |  |  |  |  |  |  | ##============================================================================== | 
| 71 |  |  |  |  |  |  | ## Relation API: profiling & comparison: top-level | 
| 72 |  |  |  |  |  |  |  | 
| 73 |  |  |  |  |  |  | ##-------------------------------------------------------------- | 
| 74 |  |  |  |  |  |  | ## Relation API: profile | 
| 75 |  |  |  |  |  |  |  | 
| 76 |  |  |  |  |  |  | ## $mprf = $rel->profile($coldb, %opts) | 
| 77 |  |  |  |  |  |  | ##  + get a relation profile for selected items as a DiaColloDB::Profile::Multi object | 
| 78 |  |  |  |  |  |  | ##  + %opts: | 
| 79 |  |  |  |  |  |  | ##    ( | 
| 80 |  |  |  |  |  |  | ##     ##-- selection parameters | 
| 81 |  |  |  |  |  |  | ##     query => $query,           ##-- target request ATTR:REQ... | 
| 82 |  |  |  |  |  |  | ##     date  => $date1,           ##-- string or array or range "MIN-MAX" (inclusive) : default=all | 
| 83 |  |  |  |  |  |  | ##     ## | 
| 84 |  |  |  |  |  |  | ##     ##-- aggregation parameters | 
| 85 |  |  |  |  |  |  | ##     slice   => $slice,         ##-- date slice (default=1, 0 for global profile) | 
| 86 |  |  |  |  |  |  | ##     groupby => $groupby,       ##-- string or array "ATTR1[:HAVING1] ...": default=$coldb->attrs; see groupby() method | 
| 87 |  |  |  |  |  |  | ##     ## | 
| 88 |  |  |  |  |  |  | ##     ##-- scoring and trimming parameters | 
| 89 |  |  |  |  |  |  | ##     eps     => $eps,           ##-- smoothing constant (default=0) | 
| 90 |  |  |  |  |  |  | ##     score   => $func,          ##-- scoring function ("f"|"lf"|"fm"|"lfm"|"mi"|"ld"|"ll") : default="f" | 
| 91 |  |  |  |  |  |  | ##     kbest   => $k,             ##-- return only $k best collocates per date (slice) : default=-1:all | 
| 92 |  |  |  |  |  |  | ##     cutoff  => $cutoff,        ##-- minimum score | 
| 93 |  |  |  |  |  |  | ##     global  => $bool,          ##-- trim profiles globally (vs. locally for each date-slice?) (default=0) | 
| 94 |  |  |  |  |  |  | ##     ## | 
| 95 |  |  |  |  |  |  | ##     ##-- profiling and debugging parameters | 
| 96 |  |  |  |  |  |  | ##     strings => $bool,          ##-- do/don't stringify item keys (default=do) | 
| 97 |  |  |  |  |  |  | ##     packed  => $bool,          ##-- leave item keys packed (default=don't) | 
| 98 |  |  |  |  |  |  | ##     fill    => $bool,          ##-- if true, returned multi-profile will have null profiles inserted for missing slices | 
| 99 |  |  |  |  |  |  | ##     onepass => $bool,          ##-- if true, use fast but incorrect 1-pass method (Cofreqs subclass only) | 
| 100 |  |  |  |  |  |  | ##    ) | 
| 101 |  |  |  |  |  |  | ##  + default implementation | 
| 102 |  |  |  |  |  |  | ##    - calls $rel->subprofile1() for every requested date-slice, then | 
| 103 |  |  |  |  |  |  | ##    - calls $rel->subprofile2() to compute item2 frequencies, and finally | 
| 104 |  |  |  |  |  |  | ##    - collects the result in a DiaColloDB::Profile::Multi object | 
| 105 |  |  |  |  |  |  | ##  + default values for %opts should be set by higher-level call, e.g. DiaColloDB::profile() | 
| 106 |  |  |  |  |  |  | sub profile { | 
| 107 | 0 |  |  | 0 | 1 |  | my ($reldb,$coldb,%opts) = @_; | 
| 108 |  |  |  |  |  |  |  | 
| 109 |  |  |  |  |  |  | ##-- sanity check(s) | 
| 110 | 0 | 0 | 0 |  |  |  | $reldb->logconfess("profile(): incompatible DB class ".ref($coldb).", v".($coldb->{version}//'???')." for directory $coldb->{dbdir}") | 
| 111 |  |  |  |  |  |  | if (!$coldb->isa('DiaColloDB::Compat::v0_09::DiaColloDB')); | 
| 112 |  |  |  |  |  |  |  | 
| 113 |  |  |  |  |  |  | ##-- common variables | 
| 114 | 0 |  |  |  |  |  | my $logProfile = $coldb->{logProfile}; | 
| 115 |  |  |  |  |  |  |  | 
| 116 |  |  |  |  |  |  | ##-- variables: by attribute | 
| 117 | 0 |  |  |  |  |  | my $groupby= $coldb->groupby($opts{groupby}); | 
| 118 | 0 |  |  |  |  |  | my $attrs  = $coldb->attrs(); | 
| 119 | 0 |  |  |  |  |  | my $adata  = $coldb->attrData($attrs); | 
| 120 | 0 |  |  |  |  |  | my $a2data = {map {($_->{a}=>$_)} @$adata}; | 
|  | 0 |  |  |  |  |  |  | 
| 121 | 0 |  |  |  |  |  | my $areqs  = $coldb->parseRequest($opts{query}, logas=>'query', default=>$attrs->[0]); | 
| 122 | 0 |  |  |  |  |  | foreach (@$areqs) { | 
| 123 | 0 |  |  |  |  |  | $a2data->{$_->[0]}{req} = $_->[1]; | 
| 124 |  |  |  |  |  |  | } | 
| 125 |  |  |  |  |  |  |  | 
| 126 |  |  |  |  |  |  | ##-- sanity check(s) | 
| 127 | 0 | 0 |  |  |  |  | if (!@$areqs) { | 
| 128 | 0 |  |  |  |  |  | $reldb->logwarn($coldb->{error}="profile(): no target attributes specified (supported attributes: ".join(' ',@{$coldb->attrs}).")"); | 
|  | 0 |  |  |  |  |  |  | 
| 129 | 0 |  |  |  |  |  | return undef; | 
| 130 |  |  |  |  |  |  | } | 
| 131 | 0 | 0 |  |  |  |  | if (!@{$groupby->{attrs}}) { | 
|  | 0 |  |  |  |  |  |  | 
| 132 | 0 |  |  |  |  |  | $reldb->logconfess($coldb->{error}="profile(): cannot profile with empty groupby clause"); | 
| 133 | 0 |  |  |  |  |  | return undef; | 
| 134 |  |  |  |  |  |  | } | 
| 135 |  |  |  |  |  |  |  | 
| 136 |  |  |  |  |  |  | ##-- prepare: get target IDs (by attribute) | 
| 137 | 0 |  |  |  |  |  | my ($ac); | 
| 138 | 0 |  | 0 |  |  |  | foreach $ac (grep {($_->{req}//'') ne ''} @$adata) { | 
|  | 0 |  |  |  |  |  |  | 
| 139 | 0 |  |  |  |  |  | $ac->{reqids} = $coldb->enumIds($ac->{enum},$ac->{req},logLevel=>$logProfile,logPrefix=>"profile(): get target $ac->{a}-values"); | 
| 140 | 0 | 0 |  |  |  |  | if (!@{$ac->{reqids}}) { | 
|  | 0 |  |  |  |  |  |  | 
| 141 | 0 |  |  |  |  |  | $reldb->logwarn($coldb->{error}="profile(): no $ac->{a}-attribute values match user query '$ac->{req}'"); | 
| 142 | 0 |  |  |  |  |  | return undef; | 
| 143 |  |  |  |  |  |  | } | 
| 144 |  |  |  |  |  |  | } | 
| 145 |  |  |  |  |  |  |  | 
| 146 |  |  |  |  |  |  | ##-- prepare: get tuple-ids (by attribute) | 
| 147 | 0 |  |  |  |  |  | $reldb->vlog($logProfile, "profile(): get target tuple IDs"); | 
| 148 | 0 |  |  |  |  |  | my $xivec = undef; | 
| 149 | 0 |  |  |  |  |  | my $nbits = undef; | 
| 150 | 0 |  |  |  |  |  | my $pack_xv = undef; | 
| 151 | 0 |  |  |  |  |  | my $test_xv = undef;    ##-- test value via vec() | 
| 152 | 0 |  |  |  |  |  | foreach $ac (grep {$_->{reqids}} @$adata) { | 
|  | 0 |  |  |  |  |  |  | 
| 153 |  |  |  |  |  |  | ##-- sanity checks | 
| 154 | 0 |  | 0 |  |  |  | $nbits   //= $ac->{a2x}{len_i}*8; | 
| 155 | 0 |  | 0 |  |  |  | $pack_xv //= "$ac->{a2x}{pack_i}*"; | 
| 156 | 0 | 0 |  |  |  |  | vec($test_xv='',0,$nbits) = 0x12345678 if (!defined($test_xv)); | 
| 157 |  |  |  |  |  |  | $reldb->logconfess("profile(): multimap pack-size mismatch: nbits($ac->{a2x}{base}.*) != $nbits") | 
| 158 | 0 | 0 |  |  |  |  | if ($ac->{a2x}{len_i} != $nbits/8); | 
| 159 |  |  |  |  |  |  | $reldb->logconfess("profile(): multimap pack-template '$ac->{a2x}{pack_i}' for $ac->{a2x}{base}.* is not big-endian") | 
| 160 | 0 | 0 |  |  |  |  | if (pack($ac->{a2x}{pack_i},0x12345678) ne $test_xv); | 
| 161 |  |  |  |  |  |  |  | 
| 162 |  |  |  |  |  |  | ##-- target set construction | 
| 163 | 0 |  |  |  |  |  | my $axiset = ''; | 
| 164 | 0 |  |  |  |  |  | $axiset = vunion($axiset, $ac->{a2x}->fetchraw($_), $nbits) foreach (@{$ac->{reqids}}); | 
|  | 0 |  |  |  |  |  |  | 
| 165 | 0 | 0 |  |  |  |  | $xivec  = defined($xivec) ? vintersect($xivec, $axiset, $nbits) : $axiset; | 
| 166 |  |  |  |  |  |  | } | 
| 167 |  |  |  |  |  |  |  | 
| 168 |  |  |  |  |  |  | ##-- check maxExpand | 
| 169 | 0 |  | 0 |  |  |  | $nbits //= packsize($coldb->{pack_id}); | 
| 170 | 0 | 0 |  |  |  |  | my $nxis = $xivec ? length($xivec)/($nbits/8) : 0; | 
| 171 | 0 | 0 | 0 |  |  |  | if ($coldb->{maxExpand}>0 && $nxis > $coldb->{maxExpand}) { | 
| 172 | 0 |  |  |  |  |  | $reldb->logwarn("profile(): Warning: target set exceeds max expansion size ($nxis > $coldb->{maxExpand}): truncating"); | 
| 173 | 0 |  |  |  |  |  | substr($xivec, -($nxis - $coldb->{maxExpand})*($nbits/8)) = ''; | 
| 174 |  |  |  |  |  |  | } | 
| 175 | 0 | 0 |  |  |  |  | my $xis = [$xivec ? unpack($pack_xv, $xivec) : qw()]; | 
| 176 |  |  |  |  |  |  |  | 
| 177 |  |  |  |  |  |  | ##-- prepare: parse and filter tuples | 
| 178 | 0 |  |  |  |  |  | $reldb->vlog($logProfile, "profile(): parse and filter target tuples (date=$opts{date}, slice=$opts{slice}, fill=$opts{fill})"); | 
| 179 | 0 |  |  |  |  |  | my $d2xis = $coldb->xidsByDate($xis, @opts{qw(date slice fill)}); | 
| 180 |  |  |  |  |  |  |  | 
| 181 |  |  |  |  |  |  | ##-- profile: get relation profiles (by date-slice, pass 1: f12) | 
| 182 | 0 |  | 0 |  |  |  | my $onepass = $opts{onepass} || ($reldb->can('subprofile2') eq \&subprofile2); | 
| 183 | 0 | 0 |  |  |  |  | $reldb->vlog($logProfile, "profile(): get frequency profile(s): ".($onepass ? 'single-pass' : 'pass-1')); | 
| 184 | 0 |  |  |  |  |  | my %d2prf  = qw(); | 
| 185 | 0 |  |  |  |  |  | my @slices = sort {$a<=>$b} keys %$d2xis; | 
|  | 0 |  |  |  |  |  |  | 
| 186 | 0 |  |  |  |  |  | my ($d,$prf); | 
| 187 | 0 |  |  |  |  |  | foreach $d (@slices) { | 
| 188 | 0 |  |  |  |  |  | $prf = $reldb->subprofile1($d2xis->{$d}, groupby=>$groupby->{xi2g}, coldb=>$coldb, onepass=>$onepass, opts=>\%opts); | 
| 189 | 0 |  |  |  |  |  | $prf->{label}  = $d; | 
| 190 | 0 |  |  |  |  |  | $prf->{titles} = $groupby->{titles}; | 
| 191 | 0 |  |  |  |  |  | $d2prf{$d} = $prf; | 
| 192 |  |  |  |  |  |  | } | 
| 193 |  |  |  |  |  |  |  | 
| 194 |  |  |  |  |  |  | ##-- profile: complete slice-wise profiles (pass 2: f2) | 
| 195 | 0 | 0 |  |  |  |  | if (!$onepass) { | 
| 196 | 0 |  |  |  |  |  | $reldb->vlog($logProfile, "profile(): get frequency profile(s): pass-2"); | 
| 197 | 0 |  |  |  |  |  | $reldb->subprofile2(\%d2prf, %opts, coldb=>$coldb, groupby=>$groupby, a2data=>$a2data, opts=>\%opts); | 
| 198 |  |  |  |  |  |  | } | 
| 199 |  |  |  |  |  |  |  | 
| 200 |  |  |  |  |  |  | ##-- compile & collect: multi-profile | 
| 201 | 0 |  |  |  |  |  | foreach $prf (values %d2prf) { | 
| 202 | 0 |  |  |  |  |  | $prf->compile($opts{score}, eps=>$opts{eps}); | 
| 203 |  |  |  |  |  |  | } | 
| 204 |  |  |  |  |  |  | my $mp = DiaColloDB::Profile::Multi->new(profiles=>[@d2prf{@slices}], | 
| 205 |  |  |  |  |  |  | titles=>$groupby->{titles}, | 
| 206 | 0 |  |  |  |  |  | qinfo =>$reldb->qinfo($coldb, %opts, qreqs=>$areqs, gbreq=>$groupby), | 
| 207 |  |  |  |  |  |  | ); | 
| 208 |  |  |  |  |  |  |  | 
| 209 |  |  |  |  |  |  | ##-- trim and stringify | 
| 210 | 0 |  |  |  |  |  | $reldb->vlog($logProfile, "profile(): trim and stringify"); | 
| 211 | 0 |  |  |  |  |  | $mp->trim(%opts, empty=>!$opts{fill}); | 
| 212 | 0 | 0 |  |  |  |  | if (!$opts{packed}) { | 
| 213 | 0 | 0 | 0 |  |  |  | if ($opts{strings}//1) { | 
| 214 | 0 |  |  |  |  |  | $mp->stringify($groupby->{g2s}); | 
| 215 |  |  |  |  |  |  | } else { | 
| 216 | 0 |  |  |  |  |  | $mp->stringify($groupby->{g2txt}); | 
| 217 |  |  |  |  |  |  | } | 
| 218 |  |  |  |  |  |  | } | 
| 219 |  |  |  |  |  |  |  | 
| 220 |  |  |  |  |  |  | ##-- return | 
| 221 | 0 |  |  |  |  |  | return $mp; | 
| 222 |  |  |  |  |  |  | } | 
| 223 |  |  |  |  |  |  |  | 
| 224 |  |  |  |  |  |  |  | 
| 225 |  |  |  |  |  |  | ##-------------------------------------------------------------- | 
| 226 |  |  |  |  |  |  | ## Relation API: comparison (diff) | 
| 227 |  |  |  |  |  |  |  | 
| 228 |  |  |  |  |  |  | ## $mpdiff = $rel->compare($coldb, %opts) | 
| 229 |  |  |  |  |  |  | ##  + get a relation comparison profile for selected items as a DiaColloDB::Profile::MultiDiff object | 
| 230 |  |  |  |  |  |  | ##  + %opts: | 
| 231 |  |  |  |  |  |  | ##    ( | 
| 232 |  |  |  |  |  |  | ##     ##-- selection parameters | 
| 233 |  |  |  |  |  |  | ##     (a|b)?query => $query,       ##-- target query as for parseRequest() | 
| 234 |  |  |  |  |  |  | ##     (a|b)?date  => $date1,       ##-- string or array or range "MIN-MAX" (inclusive) : default=all | 
| 235 |  |  |  |  |  |  | ##     ## | 
| 236 |  |  |  |  |  |  | ##     ##-- aggregation parameters | 
| 237 |  |  |  |  |  |  | ##     groupby      => $groupby,    ##-- string or array "ATTR1[:HAVING1] ...": default=$coldb->attrs; see groupby() method | 
| 238 |  |  |  |  |  |  | ##     (a|b)?slice  => $slice,      ##-- date slice (default=1, 0 for global profile) | 
| 239 |  |  |  |  |  |  | ##     ## | 
| 240 |  |  |  |  |  |  | ##     ##-- scoring and trimming parameters | 
| 241 |  |  |  |  |  |  | ##     eps     => $eps,           ##-- smoothing constant (default=0) | 
| 242 |  |  |  |  |  |  | ##     score   => $func,          ##-- scoring function ("f"|"lf"|"fm"|"lfm"|"mi"|"ld"|"ll") : default="f" | 
| 243 |  |  |  |  |  |  | ##     kbest   => $k,             ##-- return only $k best collocates per date (slice) : default=-1:all | 
| 244 |  |  |  |  |  |  | ##     cutoff  => $cutoff,        ##-- minimum score | 
| 245 |  |  |  |  |  |  | ##     global  => $bool,          ##-- trim profiles globally (vs. locally for each date-slice?) (default=0) | 
| 246 |  |  |  |  |  |  | ##     diff    => $diff,          ##-- low-level score-diff operation (adiff|diff|sum|min|max|avg|havg); default='adiff' | 
| 247 |  |  |  |  |  |  | ##     ## | 
| 248 |  |  |  |  |  |  | ##     ##-- profiling and debugging parameters | 
| 249 |  |  |  |  |  |  | ##     strings => $bool,          ##-- do/don't stringify item keys (default=do) | 
| 250 |  |  |  |  |  |  | ##     packed  => $bool,          ##-- leave item keys packed (override stringification; default=don't) | 
| 251 |  |  |  |  |  |  | ##     ## | 
| 252 |  |  |  |  |  |  | ##     ##-- sublcass abstraction parameters | 
| 253 |  |  |  |  |  |  | ##     _gbparse => $bool,         ##-- if true (default), 'groupby' clause will be parsed only once, using $coldb->groupby() method | 
| 254 |  |  |  |  |  |  | ##     _abkeys  => \@abkeys,      ##-- additional key-suffixes KEY s.t. (KEY=>VAL) gets passed to profile() calls if e.g. (aKEY=>VAL) is in %opts | 
| 255 |  |  |  |  |  |  | ##    ) | 
| 256 |  |  |  |  |  |  | ##  + default implementation wraps profile() method | 
| 257 |  |  |  |  |  |  | ##  + default values for %opts should be set by higher-level call, e.g. DiaColloDB::compare() | 
| 258 |  |  |  |  |  |  | sub compare { | 
| 259 | 0 |  |  | 0 | 1 |  | my ($reldb,$coldb,%opts) = @_; | 
| 260 |  |  |  |  |  |  |  | 
| 261 |  |  |  |  |  |  | ##-- common variables | 
| 262 | 0 |  |  |  |  |  | my $logProfile = $coldb->{logProfile}; | 
| 263 | 0 |  | 0 |  |  |  | my $groupby    = $opts{groupby} || [@{$coldb->attrs}]; | 
| 264 | 0 | 0 | 0 |  |  |  | $groupby       = $coldb->groupby($groupby) if ($opts{_gbparse}//1); | 
| 265 | 0 | 0 | 0 |  |  |  | my %aopts      = map {exists($opts{"a$_"}) ? ($_=>$opts{"a$_"}) : qw()} (qw(query date slice), @{$opts{_abkeys}//[]}); | 
|  | 0 |  |  |  |  |  |  | 
|  | 0 |  |  |  |  |  |  | 
| 266 | 0 | 0 | 0 |  |  |  | my %bopts      = map {exists($opts{"b$_"}) ? ($_=>$opts{"b$_"}) : qw()} (qw(query date slice), @{$opts{_abkeys}//[]}); | 
|  | 0 |  |  |  |  |  |  | 
|  | 0 |  |  |  |  |  |  | 
| 267 | 0 |  |  |  |  |  | my %popts      = (kbest=>-1,cutoff=>'',global=>0,strings=>0,packed=>1,fill=>1, groupby=>$groupby); | 
| 268 |  |  |  |  |  |  |  | 
| 269 |  |  |  |  |  |  | ##-- get profiles to compare | 
| 270 | 0 | 0 |  |  |  |  | my $mpa = $reldb->profile($coldb,%opts, %aopts,%popts) or return undef; | 
| 271 | 0 | 0 |  |  |  |  | my $mpb = $reldb->profile($coldb,%opts, %bopts,%popts) or return undef; | 
| 272 |  |  |  |  |  |  |  | 
| 273 |  |  |  |  |  |  | ##-- alignment and trimming | 
| 274 | 0 | 0 |  |  |  |  | $reldb->vlog($logProfile, "compare(): align and trim (".($opts{global} ? 'global' : 'local').")"); | 
| 275 | 0 |  |  |  |  |  | my $ppairs = DiaColloDB::Profile::MultiDiff->align($mpa,$mpb); | 
| 276 | 0 |  |  |  |  |  | DiaColloDB::Profile::MultiDiff->trimPairs($ppairs, %opts); | 
| 277 | 0 |  |  |  |  |  | my $diff = DiaColloDB::Profile::MultiDiff->new($mpa,$mpb, titles=>$mpa->{titles}, diff=>$opts{diff}); | 
| 278 | 0 | 0 |  |  |  |  | $diff->trim( DiaColloDB::Profile::Diff->diffkbest($opts{diff})=>$opts{kbest} ) if (!$opts{global}); | 
| 279 |  |  |  |  |  |  |  | 
| 280 |  |  |  |  |  |  | ##-- finalize: stringify | 
| 281 | 0 | 0 |  |  |  |  | if (!$opts{packed}) { | 
| 282 | 0 | 0 | 0 |  |  |  | if ($opts{strings}//1) { | 
| 283 | 0 |  |  |  |  |  | $diff->stringify($groupby->{g2s}); | 
| 284 |  |  |  |  |  |  | } else { | 
| 285 | 0 |  |  |  |  |  | $diff->stringify($groupby->{g2txt}); | 
| 286 |  |  |  |  |  |  | } | 
| 287 |  |  |  |  |  |  | } | 
| 288 |  |  |  |  |  |  |  | 
| 289 | 0 |  |  |  |  |  | return $diff; | 
| 290 |  |  |  |  |  |  | } | 
| 291 |  |  |  |  |  |  |  | 
| 292 |  |  |  |  |  |  | ## $mpdiff = $rel->diff($coldb, %opts) | 
| 293 |  |  |  |  |  |  | ##  + alias for compare() | 
| 294 |  |  |  |  |  |  | sub diff { | 
| 295 | 0 |  |  | 0 | 1 |  | my $rel = shift; | 
| 296 | 0 |  |  |  |  |  | return $rel->compare(@_); | 
| 297 |  |  |  |  |  |  | } | 
| 298 |  |  |  |  |  |  |  | 
| 299 |  |  |  |  |  |  |  | 
| 300 |  |  |  |  |  |  | ##============================================================================== | 
| 301 |  |  |  |  |  |  | ## Relation API: default: subprofile1() | 
| 302 |  |  |  |  |  |  |  | 
| 303 |  |  |  |  |  |  | ## $prf = $rel->subprofile1(\@xids, %opts) | 
| 304 |  |  |  |  |  |  | ##  + get joint frequency profile for @xids (db must be opened) | 
| 305 |  |  |  |  |  |  | ##  + %opts: | 
| 306 |  |  |  |  |  |  | ##     groupby => \&gbsub,  ##-- key-extractor $key2_or_undef = $gbsub->($i2) | 
| 307 |  |  |  |  |  |  | ##     coldb   => $coldb,   ##-- parent DiaColloDB object (for shared data, debugging) | 
| 308 |  |  |  |  |  |  | ##     opts    => \%opts,   ##-- pass-through for options to top-level profile() method | 
| 309 |  |  |  |  |  |  | sub subprofile1 { | 
| 310 | 0 |  |  | 0 | 1 |  | my ($rel,$ids,%opts) = @_; | 
| 311 | 0 |  |  |  |  |  | $rel->logconfess("subprofile(): abstract method called"); | 
| 312 |  |  |  |  |  |  | } | 
| 313 |  |  |  |  |  |  |  | 
| 314 |  |  |  |  |  |  | ## \%slice2prf = $rel->subprofile2(\%slice2prf, %opts) | 
| 315 |  |  |  |  |  |  | ##  + populate f2 frequencies for profiles in \%slice2prf | 
| 316 |  |  |  |  |  |  | ##  + %opts: | 
| 317 |  |  |  |  |  |  | ##     groupby => \%gbreq,  ##-- parsed groupby object | 
| 318 |  |  |  |  |  |  | ##     a2data  => \%a2data, ##-- maps indexed attributes to associated datastructures | 
| 319 |  |  |  |  |  |  | ##     coldb   => $coldb,   ##-- parent DiaColloDB object (for shared data, debugging) | 
| 320 |  |  |  |  |  |  | ##     opts    => \%opts,   ##-- pass-through for options to top-level profile() method | 
| 321 |  |  |  |  |  |  | ##  + default implementation just returns \%slice2prf | 
| 322 |  |  |  |  |  |  | sub subprofile2 { | 
| 323 |  |  |  |  |  |  | #my ($rel,$slice2prf,%opts) = @_; | 
| 324 | 0 |  |  | 0 | 1 |  | return $_[1]; | 
| 325 |  |  |  |  |  |  | } | 
| 326 |  |  |  |  |  |  |  | 
| 327 |  |  |  |  |  |  | ##============================================================================== | 
| 328 |  |  |  |  |  |  | ## Relation API: default: qinfo() | 
| 329 |  |  |  |  |  |  |  | 
| 330 |  |  |  |  |  |  | ## \%qinfo = $rel->qinfo($coldb, %opts) | 
| 331 |  |  |  |  |  |  | ##  + get query-info hash for profile administrivia (ddc hit links) | 
| 332 |  |  |  |  |  |  | ##  + %opts: as for profile(), additionally: | 
| 333 |  |  |  |  |  |  | ##    ( | 
| 334 |  |  |  |  |  |  | ##     qreqs => \@areqs,      ##-- as returned by $coldb->parseRequest($opts{query}) | 
| 335 |  |  |  |  |  |  | ##     gbreq => \%groupby,    ##-- as returned by $coldb->groupby($opts{groupby}) | 
| 336 |  |  |  |  |  |  | ##    ) | 
| 337 |  |  |  |  |  |  | ##  + returned hash \%qinfo should have keys: | 
| 338 |  |  |  |  |  |  | ##    ( | 
| 339 |  |  |  |  |  |  | ##     fcoef => $fcoef,         ##-- frequency coefficient (2*$coldb->{dmax} for CoFreqs) | 
| 340 |  |  |  |  |  |  | ##     qtemplate => $qtemplate, ##-- query template with __W1.I1__ rsp __W2.I2__ replacing groupby fields | 
| 341 |  |  |  |  |  |  | ##    ) | 
| 342 |  |  |  |  |  |  | sub qinfo { | 
| 343 | 0 |  |  | 0 | 1 |  | my ($rel,$coldb,%opts) = @_; | 
| 344 | 0 |  |  |  |  |  | $rel->logconfess("qinfo(): abstract method called"); | 
| 345 |  |  |  |  |  |  | } | 
| 346 |  |  |  |  |  |  |  | 
| 347 |  |  |  |  |  |  | ## (\@q1strs,\@q2strs,\@qxstrs,\@fstrs) = $rel->qinfoData($coldb,%opts) | 
| 348 |  |  |  |  |  |  | ##  + parses @opts{qw(qreqs gbreq)} into conditions on w1, w2 and metadata filters (for ddc linkup) | 
| 349 |  |  |  |  |  |  | ##  + call this from subclass qinfo() methods | 
| 350 |  |  |  |  |  |  | sub qinfoData { | 
| 351 | 0 |  |  | 0 | 1 |  | my ($rel,$coldb,%opts) = @_; | 
| 352 | 0 |  |  |  |  |  | my (@q1strs,@q2strs,@qxstrs,@fstrs,$q,$q2); | 
| 353 |  |  |  |  |  |  |  | 
| 354 |  |  |  |  |  |  | ##-- query clause | 
| 355 | 0 |  |  |  |  |  | foreach (@{$opts{qreqs}}) { | 
|  | 0 |  |  |  |  |  |  | 
| 356 | 0 |  |  |  |  |  | $q = $coldb->attrQuery(@$_); | 
| 357 | 0 | 0 | 0 |  |  |  | if (UNIVERSAL::isa($q,'DDC::Any::CQFilter')) { | 
|  |  | 0 |  |  |  |  |  | 
| 358 | 0 |  |  |  |  |  | push(@fstrs, $q->toString); | 
| 359 |  |  |  |  |  |  | } | 
| 360 |  |  |  |  |  |  | elsif (defined($q) && !UNIVERSAL::isa($q,'DDC::Any::CQTokAny')) { | 
| 361 | 0 |  |  |  |  |  | push(@q1strs, $q->toString); | 
| 362 |  |  |  |  |  |  | } | 
| 363 |  |  |  |  |  |  | } | 
| 364 |  |  |  |  |  |  |  | 
| 365 |  |  |  |  |  |  | ##-- groupby clause | 
| 366 | 0 |  |  |  |  |  | my $xi=1; | 
| 367 | 0 |  |  |  |  |  | foreach (@{$opts{gbreq}{areqs}}) { | 
|  | 0 |  |  |  |  |  |  | 
| 368 | 0 | 0 |  |  |  |  | if ($_->[0] =~ /^doc\.(.*)/) { | 
| 369 | 0 |  |  |  |  |  | push(@fstrs, DDC::Any::CQFHasField->new($1,"__W2.${xi}__")->toString); | 
| 370 |  |  |  |  |  |  | } | 
| 371 |  |  |  |  |  |  | else { | 
| 372 | 0 |  |  |  |  |  | push(@q2strs, DDC::Any::CQTokExact->new($_->[0],"__W2.${xi}__")->toString); | 
| 373 |  |  |  |  |  |  | } | 
| 374 | 0 |  |  |  |  |  | ++$xi; | 
| 375 |  |  |  |  |  |  | } | 
| 376 |  |  |  |  |  |  |  | 
| 377 |  |  |  |  |  |  | ##-- common restrictions (trunk/2015-10-28: these are too expensive for large corpora (timeouts): ignore 'em | 
| 378 |  |  |  |  |  |  | #push(@qxstrs, qq(\$p=/$coldb->{pgood}/)) if ($coldb->{pgood}); | 
| 379 |  |  |  |  |  |  | #push(@qxstrs, qq(\$=!/$coldb->{pbad}/))  if ($coldb->{pbad}); | 
| 380 |  |  |  |  |  |  |  | 
| 381 |  |  |  |  |  |  | ##-- utf8 | 
| 382 | 0 |  |  |  |  |  | foreach (@q1strs,@q2strs,@qxstrs,@fstrs) { | 
| 383 | 0 | 0 |  |  |  |  | utf8::decode($_) if (!utf8::is_utf8($_)); | 
| 384 |  |  |  |  |  |  | } | 
| 385 |  |  |  |  |  |  |  | 
| 386 | 0 |  |  |  |  |  | return (\@q1strs,\@q2strs,\@qxstrs,\@fstrs); | 
| 387 |  |  |  |  |  |  | } | 
| 388 |  |  |  |  |  |  |  | 
| 389 |  |  |  |  |  |  |  | 
| 390 |  |  |  |  |  |  | ##============================================================================== | 
| 391 |  |  |  |  |  |  | ## Footer | 
| 392 |  |  |  |  |  |  | 1; | 
| 393 |  |  |  |  |  |  |  | 
| 394 |  |  |  |  |  |  | __END__ |