| line | stmt | bran | cond | sub | pod | time | code | 
| 1 |  |  |  |  |  |  | ## -*- Mode: CPerl -*- | 
| 2 |  |  |  |  |  |  | ## File: DiaColloDB::Relation::DDC.pm | 
| 3 |  |  |  |  |  |  | ## Author: Bryan Jurish <moocow@cpan.org> | 
| 4 |  |  |  |  |  |  | ## Description: collocation db, profiling relation: ddc client (using DDC::Client::Distributed) | 
| 5 |  |  |  |  |  |  |  | 
| 6 |  |  |  |  |  |  | package DiaColloDB::Relation::DDC; | 
| 7 | 1 |  |  | 1 |  | 9 | use DiaColloDB::Relation; | 
|  | 1 |  |  |  |  | 2 |  | 
|  | 1 |  |  |  |  | 39 |  | 
| 8 | 1 |  |  | 1 |  | 7 | use DiaColloDB::Utils qw(:math :list); | 
|  | 1 |  |  |  |  | 3 |  | 
|  | 1 |  |  |  |  | 92 |  | 
| 9 | 1 |  |  | 1 |  | 1754 | use DDC::Client::Distributed; | 
|  | 1 |  |  |  |  | 29318 |  | 
|  | 1 |  |  |  |  | 43 |  | 
| 10 | 1 |  |  | 1 |  | 13 | use Fcntl qw(:seek); | 
|  | 1 |  |  |  |  | 2 |  | 
|  | 1 |  |  |  |  | 98 |  | 
| 11 | 1 |  |  | 1 |  | 324 | use strict; | 
|  | 1 |  |  |  |  | 4 |  | 
|  | 1 |  |  |  |  | 6240 |  | 
| 12 |  |  |  |  |  |  |  | 
| 13 |  |  |  |  |  |  | ##============================================================================== | 
| 14 |  |  |  |  |  |  | ## Globals & Constants | 
| 15 |  |  |  |  |  |  |  | 
| 16 |  |  |  |  |  |  | our @ISA = qw(DiaColloDB::Relation); | 
| 17 |  |  |  |  |  |  |  | 
| 18 |  |  |  |  |  |  | ##============================================================================== | 
| 19 |  |  |  |  |  |  | ## Constructors etc. | 
| 20 |  |  |  |  |  |  |  | 
| 21 |  |  |  |  |  |  | ## $ddc = CLASS_OR_OBJECT->new(%args) | 
| 22 |  |  |  |  |  |  | ## + %args, object structure: | 
| 23 |  |  |  |  |  |  | ##   ( | 
| 24 |  |  |  |  |  |  | ##    ##-- persistent options | 
| 25 |  |  |  |  |  |  | ##    base => $basename,               ##-- configuration header basename (default=undef) | 
| 26 |  |  |  |  |  |  | ##    ## | 
| 27 |  |  |  |  |  |  | ##    ##-- ddc client options | 
| 28 |  |  |  |  |  |  | ##    ddcServer => "$server:$port",    ##-- ddc server (required; default=$coldb->{ddcServer} via fromDB() method) | 
| 29 |  |  |  |  |  |  | ##    ddcTimeout => $timeout,          ##-- ddc timeout; default=300 | 
| 30 |  |  |  |  |  |  | ##    ddcLimit   => $limit,            ##-- default limit for ddc queries (default=-1) | 
| 31 |  |  |  |  |  |  | ##    ddcSample  => $sample,           ##-- default sample size for ddc queries (default=-1:all) | 
| 32 |  |  |  |  |  |  | ##    dmax    => $maxDistance,         ##-- default distance for near() queries (default=5; 1=immediate adjacency; ~ ddc CQNear.Dist+1) | 
| 33 |  |  |  |  |  |  | ##    cfmin   => $minFreq,             ##-- default minimum frequency for count() queries (default=2) | 
| 34 |  |  |  |  |  |  | ##    ## | 
| 35 |  |  |  |  |  |  | ##    ##-- logging options | 
| 36 |  |  |  |  |  |  | ##    #logProfile => $level,           ##-- log-level for verbose profiling (from $coldb) | 
| 37 |  |  |  |  |  |  | ##    logTrunc => $nchars,             ##-- max length of query string to log (default=256) | 
| 38 |  |  |  |  |  |  | ##    ## | 
| 39 |  |  |  |  |  |  | ##    ##-- low-level data | 
| 40 |  |  |  |  |  |  | ##    dclient   => $ddcClient,         ##-- a DDC::Client::Distributed object | 
| 41 |  |  |  |  |  |  | ##   ) | 
| 42 |  |  |  |  |  |  | sub new { | 
| 43 | 0 |  |  | 0 | 1 |  | my $that = shift; | 
| 44 | 0 |  |  |  |  |  | my $rel  = $that->SUPER::new( | 
| 45 |  |  |  |  |  |  | #base       => undef, | 
| 46 |  |  |  |  |  |  | ddcServer  => undef, | 
| 47 |  |  |  |  |  |  | ddcTimeout => 300, | 
| 48 |  |  |  |  |  |  | ddcLimit   => -1, | 
| 49 |  |  |  |  |  |  | ddcSample  => -1, | 
| 50 |  |  |  |  |  |  | dmax       => 5, | 
| 51 |  |  |  |  |  |  | cfmin      => 2, | 
| 52 |  |  |  |  |  |  | #logProfile => 'trace', | 
| 53 |  |  |  |  |  |  | logTrunc   => 256, | 
| 54 |  |  |  |  |  |  | @_ | 
| 55 |  |  |  |  |  |  | ); | 
| 56 | 0 |  |  |  |  |  | $rel->{class} = ref($rel); | 
| 57 | 0 | 0 |  |  |  |  | return $rel->open() if (defined($rel->{base})); | 
| 58 | 0 |  |  |  |  |  | return $rel; | 
| 59 |  |  |  |  |  |  | } | 
| 60 |  |  |  |  |  |  |  | 
| 61 |  |  |  |  |  |  | ## $rel_or_undef = $CLASS_OR_OBJECT->fromDB($coldb,%opts) | 
| 62 |  |  |  |  |  |  | ##  + default implementation clobbers $rel->headerKeys() from %$coldb, %opts | 
| 63 |  |  |  |  |  |  | sub fromDB { | 
| 64 | 0 |  |  | 0 | 1 |  | my ($that,$coldb,%opts) = @_; | 
| 65 | 0 | 0 |  |  |  |  | my $rel = ref($that) ? $that : $that->new(); | 
| 66 | 0 |  |  |  |  |  | $rel->{$_} = $coldb->{$_} foreach (grep {exists $coldb->{$_}} $rel->headerKeys); | 
|  | 0 |  |  |  |  |  |  | 
| 67 | 0 |  |  |  |  |  | @$rel{keys %opts} = values %opts; | 
| 68 | 0 | 0 |  |  |  |  | return undef if (!$rel->{ddcServer}); | 
| 69 | 0 |  |  |  |  |  | return $rel; | 
| 70 |  |  |  |  |  |  | } | 
| 71 |  |  |  |  |  |  |  | 
| 72 |  |  |  |  |  |  |  | 
| 73 |  |  |  |  |  |  | ##============================================================================== | 
| 74 |  |  |  |  |  |  | ## Relation API: create | 
| 75 |  |  |  |  |  |  |  | 
| 76 |  |  |  |  |  |  | ## $rel = $CLASS_OR_OBJECT->create($coldb,$tokdat_file,%opts) | 
| 77 |  |  |  |  |  |  | ##  + default just calls fromDB() and saveHeaderFile() | 
| 78 |  |  |  |  |  |  | sub create { | 
| 79 | 0 |  |  | 0 | 1 |  | my ($rel,$coldb,$datfile,%opts) = @_; | 
| 80 | 0 | 0 |  |  |  |  | $rel = $rel->fromDB($coldb,%opts) or return undef; | 
| 81 | 0 | 0 |  |  |  |  | $rel->saveHeader() if ($rel->{base}); | 
| 82 | 0 |  |  |  |  |  | return $rel; | 
| 83 |  |  |  |  |  |  | } | 
| 84 |  |  |  |  |  |  |  | 
| 85 |  |  |  |  |  |  | ##============================================================================== | 
| 86 |  |  |  |  |  |  | ## Relation API: union (SKETCHY) | 
| 87 |  |  |  |  |  |  |  | 
| 88 |  |  |  |  |  |  | ## $rel = $CLASS_OR_OBJECT->union($coldb, \@pairs, %opts) | 
| 89 |  |  |  |  |  |  | ##  + merge multiple co-frequency indices into new object | 
| 90 |  |  |  |  |  |  | ##  + @pairs : array of pairs ([$ug,\@xi2u],...) | 
| 91 |  |  |  |  |  |  | ##    of unigram-objects $ug and tuple-id maps \@xi2u for $ug | 
| 92 |  |  |  |  |  |  | ##  + %opts: clobber %$rel | 
| 93 |  |  |  |  |  |  | ##  + default just calls create(), but should probably create a list of ddc servers to query | 
| 94 |  |  |  |  |  |  | sub union { | 
| 95 | 0 |  |  | 0 | 1 |  | my ($rel,$coldb,$pairs,%opts) = @_; | 
| 96 |  |  |  |  |  |  | $rel->logwarn("union() of ddc may not work as expected without global 'ddcServer' option") | 
| 97 | 0 | 0 | 0 |  |  |  | if (!$coldb->{ddcServer} && !$opts{ddcServer}); | 
| 98 | 0 |  |  |  |  |  | return $rel->create($coldb,undef,%opts); | 
| 99 |  |  |  |  |  |  | } | 
| 100 |  |  |  |  |  |  |  | 
| 101 |  |  |  |  |  |  | ##============================================================================== | 
| 102 |  |  |  |  |  |  | ## Relation API: dbinfo | 
| 103 |  |  |  |  |  |  |  | 
| 104 |  |  |  |  |  |  | ## \%info = $rel->dbinfo($coldb) | 
| 105 |  |  |  |  |  |  | ##  + embedded info-hash for $coldb->dbinfo() | 
| 106 |  |  |  |  |  |  | sub dbinfo { | 
| 107 | 0 |  |  | 0 | 0 |  | my ($rel,$coldb) = @_; | 
| 108 | 0 |  |  |  |  |  | $rel = $rel->fromDB($coldb); | 
| 109 | 0 |  |  |  |  |  | my $info = $rel->SUPER::dbinfo(); | 
| 110 | 0 |  |  |  |  |  | my @keys = qw(ddcServer ddcTimeout ddcLimit ddcSample dmax cfmin); | 
| 111 | 0 |  |  |  |  |  | @$info{@keys} = @$rel{@keys}; | 
| 112 | 0 |  |  |  |  |  | return $info; | 
| 113 |  |  |  |  |  |  | } | 
| 114 |  |  |  |  |  |  |  | 
| 115 |  |  |  |  |  |  |  | 
| 116 |  |  |  |  |  |  | ##============================================================================== | 
| 117 |  |  |  |  |  |  | ## Relation API: profiling & comparison: top-level | 
| 118 |  |  |  |  |  |  |  | 
| 119 |  |  |  |  |  |  | ##-------------------------------------------------------------- | 
| 120 |  |  |  |  |  |  | ## Relation API: profile | 
| 121 |  |  |  |  |  |  |  | 
| 122 |  |  |  |  |  |  | ## $mprf = $rel->profile($coldb, %opts) | 
| 123 |  |  |  |  |  |  | ## + get a relation profile for selected items as a DiaColloDB::Profile::Multi object | 
| 124 |  |  |  |  |  |  | ## + %opts: as for DiaColloDB::Relation::profile(), also: | 
| 125 |  |  |  |  |  |  | ##   ( | 
| 126 |  |  |  |  |  |  | ##    ##-- sampling options | 
| 127 |  |  |  |  |  |  | ##    limit => $limit,       ##-- maximum number of items to return from ddc; sets $qconfig{limit} (default: query "#limit[N]" or $rel->{ddcLimit}) | 
| 128 |  |  |  |  |  |  | ##    sample => $sample,     ##-- ddc sample size; sets $qconfig{qcount} Sample property (default: query "#sample[N]" or $rel->{ddcSample}) | 
| 129 |  |  |  |  |  |  | ##    cfmin => $cfmin,       ##-- minimum subcorpus frequency for returned items (default: query "#fmin[N]" or $rel->{cfmin}) | 
| 130 |  |  |  |  |  |  | ##    dmax  => $dmax,        ##-- maxmimum distance for implicit near() queries (default: query "#dmax[N]" or $rel->{dmax}) | 
| 131 |  |  |  |  |  |  | ##   ) | 
| 132 |  |  |  |  |  |  | sub profile { | 
| 133 | 0 |  |  | 0 | 1 |  | my ($that,$coldb,%opts) = @_; | 
| 134 |  |  |  |  |  |  | my $rel = $that->fromDB($coldb,%opts) | 
| 135 | 0 | 0 |  |  |  |  | or $that->logconfess($coldb->{error}="profile(): initialization failed (did you forget to set the 'ddcServer' option?)"); | 
| 136 | 0 |  |  |  |  |  | $opts{coldb} = $coldb; | 
| 137 |  |  |  |  |  |  |  | 
| 138 |  |  |  |  |  |  | ##-- get count-query, count-by expressions, titles | 
| 139 | 0 |  |  |  |  |  | my $qcount  = $rel->countQuery($coldb,\%opts); | 
| 140 | 0 |  |  |  |  |  | my $gbtitles = $opts{gbtitles}; | 
| 141 |  |  |  |  |  |  |  | 
| 142 |  |  |  |  |  |  | ##-- get raw f12 results and parse into slice-wise profiles | 
| 143 | 0 |  |  |  |  |  | my $result12 = $rel->ddcQuery($coldb, $qcount, limit=>$opts{limit}, logas=>'f12'); | 
| 144 | 0 |  |  |  |  |  | my (%y2prf,$y,$prf,$key); | 
| 145 | 0 |  |  |  |  |  | foreach (@{$result12->{counts_}}) { | 
|  | 0 |  |  |  |  |  |  | 
| 146 | 0 |  | 0 |  |  |  | $y   = $_->[1]//'0'; | 
| 147 | 0 |  |  |  |  |  | $key = join("\t", @$_[2..$#$_]); | 
| 148 | 0 |  | 0 |  |  |  | $prf = ($y2prf{$y} //= DiaColloDB::Profile->new(label=>$y)); | 
| 149 | 0 |  |  |  |  |  | $prf->{f12}{$key}   += $_->[0]; | 
| 150 |  |  |  |  |  |  | } | 
| 151 | 0 |  |  |  |  |  | undef $result12; ##-- save some memory | 
| 152 |  |  |  |  |  |  |  | 
| 153 |  |  |  |  |  |  | ##-- get raw f2 counts & propagate to profiles in %$y2prf | 
| 154 |  |  |  |  |  |  | ##  + iterate over all queries in @qcounts2 (for multi-pass mode) | 
| 155 | 0 |  |  |  |  |  | my $qcounts2 = $rel->collocateCountQueries($qcount,\%y2prf, \%opts); | 
| 156 | 0 |  |  |  |  |  | my $fcoef    = $opts{fcoef}; | 
| 157 | 0 |  |  |  |  |  | foreach my $qc2i (0..$#$qcounts2) { | 
| 158 | 0 |  |  |  |  |  | my $qcount2 = $qcounts2->[$qc2i]; | 
| 159 | 0 |  |  |  |  |  | my $result2 = $rel->ddcQuery($coldb, $qcount2, limit=>-1, logTrunc=>128, logas=>("f2[".($qc2i+1).'/'.scalar(@$qcounts2)."]")); | 
| 160 | 0 |  |  |  |  |  | foreach (@{$result2->{counts_}}) { | 
|  | 0 |  |  |  |  |  |  | 
| 161 | 0 | 0 |  |  |  |  | next if (!defined($prf=$y2prf{$y=$_->[1]})); | 
| 162 | 0 |  |  |  |  |  | $key = join("\t", @$_[2..$#$_]); | 
| 163 | 0 | 0 |  |  |  |  | $prf->{f2}{$key} += $_->[0]*$fcoef if (exists $prf->{f12}{$key}); | 
| 164 |  |  |  |  |  |  | } | 
| 165 |  |  |  |  |  |  | } | 
| 166 |  |  |  |  |  |  |  | 
| 167 |  |  |  |  |  |  | ##-- query independent f1 and update slice-wise profiles | 
| 168 | 0 | 0 | 0 |  |  |  | if ($opts{qcount1} && !$opts{onepass}) { | 
| 169 |  |  |  |  |  |  | ##-- f1 : via direct pre-generated $opts{qcount1} (DiaColloDB >= v0.12.017) | 
| 170 | 0 |  |  |  |  |  | my $qcount1 = $opts{qcount1}; | 
| 171 | 0 |  |  |  |  |  | my $result1 = $rel->ddcQuery($coldb, $qcount1, limit=>-1, logas=>'f1'); | 
| 172 | 0 |  |  |  |  |  | foreach (@{$result1->{counts_}}) { | 
|  | 0 |  |  |  |  |  |  | 
| 173 | 0 | 0 |  |  |  |  | next if (!defined($prf=$y2prf{$y=$_->[1]})); | 
| 174 | 0 |  | 0 |  |  |  | $prf->{f1} += $_->[0]*($opts{fcoef1}//1); | 
| 175 |  |  |  |  |  |  | } | 
| 176 |  |  |  |  |  |  | } | 
| 177 |  |  |  |  |  |  | else { #if ($opts{onepass} | 
| 178 |  |  |  |  |  |  | ##-- f1 : 1-pass (~ DiaColloDB <= v0.12.016) | 
| 179 | 0 | 0 |  |  |  |  | if ($opts{needCountsByToken}) { | 
|  |  | 0 |  |  |  |  |  | 
| 180 |  |  |  |  |  |  | ##-- not sure why we're using count(keys(...)) #by[$l=1] here | 
| 181 | 0 |  |  |  |  |  | my $qcount1 = $qcounts2->[0]->clone(); | 
| 182 | 0 | 0 |  |  |  |  | $_->setMatchId(1) foreach (grep {UNIVERSAL::isa($_,'DDC::Any::CQCountKeyExprToken') && $_->getMatchId==2} | 
|  | 0 |  |  |  |  |  |  | 
| 183 | 0 |  |  |  |  |  | @{$qcount1->getDtr->getQCount->getKeys->getExprs}, | 
| 184 | 0 |  |  |  |  |  | @{$qcount1->getKeys->getExprs}, | 
| 185 |  |  |  |  |  |  | ); | 
| 186 | 0 |  |  |  |  |  | $qcount1->getDtr->setMatchId(1); | 
| 187 | 0 |  |  |  |  |  | my $result1 = $rel->ddcQuery($coldb, $qcount1, limit=>-1, logas=>'f1'); | 
| 188 | 0 |  |  |  |  |  | foreach (@{$result1->{counts_}}) { | 
|  | 0 |  |  |  |  |  |  | 
| 189 | 0 | 0 |  |  |  |  | next if (!defined($prf=$y2prf{$y=$_->[1]})); | 
| 190 | 0 |  |  |  |  |  | $prf->{f1} += $_->[0]*$fcoef; | 
| 191 |  |  |  |  |  |  | } | 
| 192 | 0 |  |  |  |  |  | undef $result1; ##-- save some memory (but not much) | 
| 193 |  |  |  |  |  |  | } | 
| 194 | 0 | 0 | 0 |  |  |  | elsif (grep {UNIVERSAL::isa($_,'DDC::Any::CQToken') && $_->getMatchId==2 && !UNIVERSAL::isa($_,'DDC::Any::CQTokAny')} @{$qcount->Descendants}) { | 
|  | 0 |  |  |  |  |  |  | 
| 195 |  |  |  |  |  |  | ##-- no item2 keys in groupby clause, but real item2 restriction: count f1 anyways | 
| 196 | 0 |  |  |  |  |  | my $qcount1 = $qcount->clone(); | 
| 197 | 0 |  |  |  |  |  | my $qdtr1    = $qcount1->getDtr; | 
| 198 | 0 |  |  |  |  |  | my ($nod,$newnod); | 
| 199 |  |  |  |  |  |  | $qdtr1->mapTraverse(sub { | 
| 200 | 0 |  |  | 0 |  |  | $nod = shift; | 
| 201 | 0 | 0 | 0 |  |  |  | if (UNIVERSAL::isa($nod,'DDC::Any::CQToken') && $nod->getMatchId==2 && !UNIVERSAL::isa($nod,'DDC::Any::CQTokAny')) { | 
|  |  |  | 0 |  |  |  |  | 
| 202 | 0 |  |  |  |  |  | $newnod = DDC::Any::CQTokAny->new(); | 
| 203 | 0 |  |  |  |  |  | $newnod->setMatchId('2'); | 
| 204 | 0 |  |  |  |  |  | $newnod->setOptions($nod->getOptions); | 
| 205 | 0 |  |  |  |  |  | $nod->setOptions(undef); | 
| 206 | 0 |  |  |  |  |  | return $newnod; | 
| 207 |  |  |  |  |  |  | } | 
| 208 | 0 |  |  |  |  |  | return $nod; | 
| 209 | 0 |  |  |  |  |  | }); | 
| 210 |  |  |  |  |  |  | $qcount1->getKeys->setExprs([grep | 
| 211 | 0 |  | 0 |  |  |  | {!(UNIVERSAL::isa($_,'DDC::Any::CQCountKeyExprToken') && $_->getMatchId==2)} | 
| 212 | 0 |  |  |  |  |  | @{$qcount1->getKeys->getExprs}]); | 
|  | 0 |  |  |  |  |  |  | 
| 213 |  |  |  |  |  |  |  | 
| 214 | 0 |  |  |  |  |  | my $result1 = $rel->ddcQuery($coldb, $qcount1, limit=>-1, logas=>'f1'); | 
| 215 | 0 |  |  |  |  |  | foreach (@{$result1->{counts_}}) { | 
|  | 0 |  |  |  |  |  |  | 
| 216 | 0 | 0 |  |  |  |  | next if (!defined($prf=$y2prf{$y=$_->[1]})); | 
| 217 | 0 |  |  |  |  |  | $prf->{f1} += $_->[0]; | 
| 218 |  |  |  |  |  |  | } | 
| 219 | 0 |  |  |  |  |  | undef $result1; ##-- save some memory (but not much) | 
| 220 |  |  |  |  |  |  | } | 
| 221 |  |  |  |  |  |  | else { | 
| 222 |  |  |  |  |  |  | ##-- no f1 query required (item2 is universal wildcard) | 
| 223 | 0 |  |  |  |  |  | foreach $prf (values %y2prf) { | 
| 224 | 0 |  |  |  |  |  | my $f1=0; | 
| 225 | 0 |  |  |  |  |  | $f1 += $_ foreach (values %{$prf->{f12}}); | 
|  | 0 |  |  |  |  |  |  | 
| 226 | 0 |  |  |  |  |  | $prf->{f1} = $f1*$fcoef; | 
| 227 |  |  |  |  |  |  | } | 
| 228 |  |  |  |  |  |  | } | 
| 229 |  |  |  |  |  |  | } | 
| 230 |  |  |  |  |  |  |  | 
| 231 |  |  |  |  |  |  | ##-- query independent N (by epoch) | 
| 232 | 0 | 0 |  |  |  |  | my $qstrN   = "COUNT(* #SEP) #BY[".($opts{slice} ? "date/$opts{slice}" : q{@'0'})."]"; | 
| 233 | 0 |  |  |  |  |  | my $resultN = $rel->ddcQuery($coldb, $qstrN, limit=>-1, logas=>'fN'); | 
| 234 | 0 |  |  |  |  |  | my %fN      = qw(); | 
| 235 | 0 |  |  |  |  |  | my $N       = 0; ##-- total N | 
| 236 | 0 |  |  |  |  |  | foreach (@{$resultN->{counts_}}) { | 
|  | 0 |  |  |  |  |  |  | 
| 237 | 0 |  |  |  |  |  | $fN{$_->[1]} = $_->[0] * $fcoef; | 
| 238 | 0 |  |  |  |  |  | $N          += $_->[1] * $fcoef; | 
| 239 |  |  |  |  |  |  | } | 
| 240 |  |  |  |  |  |  |  | 
| 241 |  |  |  |  |  |  | ##-- finalize sub-profiles: label, titles, N, compile | 
| 242 | 0 |  |  |  |  |  | my ($f1); | 
| 243 | 0 |  |  |  |  |  | foreach $prf (values %y2prf) { | 
| 244 | 0 |  |  |  |  |  | $prf->{titles} = $gbtitles; | 
| 245 |  |  |  |  |  |  |  | 
| 246 | 0 | 0 |  |  |  |  | if (!($f1=$prf->{f1})) { | 
| 247 | 0 |  |  |  |  |  | $f1  = 0; | 
| 248 | 0 |  |  |  |  |  | $f1 += $_ foreach (values %{$prf->{f12}}); | 
|  | 0 |  |  |  |  |  |  | 
| 249 | 0 |  |  |  |  |  | $prf->{f1} = $f1; | 
| 250 |  |  |  |  |  |  | } | 
| 251 |  |  |  |  |  |  |  | 
| 252 | 0 |  | 0 |  |  |  | $prf->{N} = $fN{$prf->{label}} || $N; | 
| 253 | 0 | 0 |  |  |  |  | $prf->{N} = $f1 if ($f1 > $prf->{N}); | 
| 254 | 0 |  |  |  |  |  | $prf->compile($opts{score}, eps=>$opts{eps}); | 
| 255 |  |  |  |  |  |  | } | 
| 256 |  |  |  |  |  |  |  | 
| 257 |  |  |  |  |  |  | ##-- honor "fill" option | 
| 258 | 0 | 0 |  |  |  |  | if ($opts{fill}) { | 
| 259 | 0 |  | 0 |  |  |  | for ($y=$opts{dslo}; $y <= $opts{dshi}; $y += ($opts{slice}||1)) { | 
| 260 | 0 | 0 |  |  |  |  | next if (exists($y2prf{$y})); | 
| 261 | 0 |  | 0 |  |  |  | $prf = $y2prf{$y} = DiaColloDB::Profile->new(N=>($fN{$y}||$N||0),f1=>0,label=>$y,titles=>$gbtitles)->compile($opts{score},eps=>$opts{eps}); | 
| 262 |  |  |  |  |  |  | } | 
| 263 |  |  |  |  |  |  | } | 
| 264 |  |  |  |  |  |  |  | 
| 265 |  |  |  |  |  |  | ##-- setup meta-profile query info | 
| 266 | 0 |  |  |  |  |  | my $qtemplate = $qcount->getDtr->clone(); | 
| 267 |  |  |  |  |  |  | my $qinfo = { | 
| 268 |  |  |  |  |  |  | qcanon=>$qcount->getDtr->toStringFull, | 
| 269 |  |  |  |  |  |  | q12=>$qcount->toStringFull, | 
| 270 |  |  |  |  |  |  | #q1=>$qcount1->toStringFull, | 
| 271 |  |  |  |  |  |  | #q2=>$qcount2->toStringFull, | 
| 272 |  |  |  |  |  |  | #qN=>$qstrN, | 
| 273 |  |  |  |  |  |  | ## | 
| 274 |  |  |  |  |  |  | fcoef=>$fcoef, | 
| 275 |  |  |  |  |  |  | qtemplate=>$opts{qtemplate}, | 
| 276 | 0 |  |  |  |  |  | }; | 
| 277 | 0 | 0 |  |  |  |  | foreach (values %$qinfo) { utf8::decode($_) if (!utf8::is_utf8($_)); } | 
|  | 0 |  |  |  |  |  |  | 
| 278 |  |  |  |  |  |  |  | 
| 279 |  |  |  |  |  |  | ##-- finalize: collect multi-profile & trim | 
| 280 | 0 |  |  |  |  |  | $rel->vlog($coldb->{logProfile}, "profile(): collect and trim"); | 
| 281 |  |  |  |  |  |  | my $mp = DiaColloDB::Profile::Multi->new( | 
| 282 | 0 |  |  |  |  |  | profiles => [@y2prf{sort {$a<=>$b} keys %y2prf}], | 
|  | 0 |  |  |  |  |  |  | 
| 283 |  |  |  |  |  |  | titles   => $gbtitles, | 
| 284 |  |  |  |  |  |  | qinfo    => $qinfo, | 
| 285 |  |  |  |  |  |  | ); | 
| 286 | 0 |  |  |  |  |  | $mp->trim(%opts, empty=>!$opts{fill}); | 
| 287 |  |  |  |  |  |  |  | 
| 288 |  |  |  |  |  |  | ##-- return | 
| 289 | 0 |  |  |  |  |  | return $mp; | 
| 290 |  |  |  |  |  |  | } | 
| 291 |  |  |  |  |  |  |  | 
| 292 |  |  |  |  |  |  | ##-------------------------------------------------------------- | 
| 293 |  |  |  |  |  |  | ## Relation API: extend | 
| 294 |  |  |  |  |  |  |  | 
| 295 |  |  |  |  |  |  | ## $mprf = $rel->extend($coldb, %opts) | 
| 296 |  |  |  |  |  |  | ## + get f2 frequencies (and ONLY f2 frequencies) for selected items as a DiaColloDB::Profile::Multi object | 
| 297 |  |  |  |  |  |  | ## + requires 'query' option for correct estimation of 'fcoef' | 
| 298 |  |  |  |  |  |  | ## + %opts: as for DiaColloDB::Relation::extend() | 
| 299 |  |  |  |  |  |  | sub extend { | 
| 300 | 0 |  |  | 0 | 1 |  | my ($that,$coldb,%opts) = @_; | 
| 301 |  |  |  |  |  |  | my $rel = $that->fromDB($coldb,%opts) | 
| 302 | 0 | 0 |  |  |  |  | or $that->logconfess($coldb->{error}="extend(): initialization failed (did you forget to set the 'ddcServer' option?)"); | 
| 303 |  |  |  |  |  |  |  | 
| 304 |  |  |  |  |  |  | ##-- common variables | 
| 305 | 0 |  |  |  |  |  | $opts{coldb}   = $coldb; | 
| 306 | 0 |  |  |  |  |  | my $opts       = \%opts; | 
| 307 | 0 |  |  |  |  |  | my $logProfile = $coldb->{logProfile}; | 
| 308 |  |  |  |  |  |  |  | 
| 309 |  |  |  |  |  |  | ##-- sanity check(s) | 
| 310 | 0 |  |  |  |  |  | my ($slice2keys); | 
| 311 | 0 | 0 |  |  |  |  | if (!($slice2keys=$opts{slice2keys})) { | 
|  |  | 0 |  |  |  |  |  | 
| 312 | 0 |  |  |  |  |  | $rel->warn($coldb->{error}="extend(): no 'slice2keys' parameter specified!"); | 
| 313 | 0 |  |  |  |  |  | return undef; | 
| 314 |  |  |  |  |  |  | } | 
| 315 |  |  |  |  |  |  | elsif (!UNIVERSAL::isa($slice2keys,'HASH')) { | 
| 316 | 0 |  |  |  |  |  | $rel->warn($coldb->{error}="extend(): failed to parse 'slice2keys' parameter"); | 
| 317 | 0 |  |  |  |  |  | return undef; | 
| 318 |  |  |  |  |  |  | } | 
| 319 |  |  |  |  |  |  |  | 
| 320 |  |  |  |  |  |  | ##-- get "real" count-query, count-by expressions, titles, fcoef, ... | 
| 321 | 0 |  |  |  |  |  | my $qcount = $rel->countQuery($coldb,\%opts); | 
| 322 |  |  |  |  |  |  |  | 
| 323 |  |  |  |  |  |  | ##-- create %y2pf | 
| 324 | 0 |  |  |  |  |  | my %y2prf; | 
| 325 | 0 |  |  |  |  |  | my ($y,$ykeys, $prf); | 
| 326 | 0 |  |  |  |  |  | while (($y,$ykeys) = each %$slice2keys) { | 
| 327 | 0 |  | 0 |  |  |  | $prf = ($y2prf{$y} //= DiaColloDB::Profile->new(label=>$y, titles=>$opts{gbtitles})); | 
| 328 | 0 | 0 |  |  |  |  | $prf->{f12}{$_} = 0 foreach (UNIVERSAL::isa($ykeys,'HASH') ? keys(%$ykeys) : @$ykeys); | 
| 329 |  |  |  |  |  |  | } | 
| 330 |  |  |  |  |  |  |  | 
| 331 |  |  |  |  |  |  | ##-- get raw f2 counts & propagate to profiles in %$y2prf (copy+pasted from profile()) | 
| 332 |  |  |  |  |  |  | ##  + iterate over all queries in @qcounts2 (for multi-pass mode) | 
| 333 | 0 |  |  |  |  |  | my $qcounts2 = $rel->collocateCountQueries($qcount,\%y2prf, \%opts); | 
| 334 | 0 |  |  |  |  |  | my $fcoef    = $opts{fcoef}; | 
| 335 | 0 |  |  |  |  |  | my ($key); | 
| 336 | 0 |  |  |  |  |  | foreach my $qc2i (0..$#$qcounts2) { | 
| 337 | 0 |  |  |  |  |  | my $qcount2 = $qcounts2->[$qc2i]; | 
| 338 | 0 |  |  |  |  |  | my $result2 = $rel->ddcQuery($coldb, $qcount2, limit=>-1, logTrunc=>128, logas=>("f2[".($qc2i+1).'/'.scalar(@$qcounts2)."]")); | 
| 339 | 0 |  |  |  |  |  | foreach (@{$result2->{counts_}}) { | 
|  | 0 |  |  |  |  |  |  | 
| 340 | 0 | 0 |  |  |  |  | next if (!defined($prf=$y2prf{$y=$_->[1]})); | 
| 341 | 0 |  |  |  |  |  | $key = join("\t", @$_[2..$#$_]); | 
| 342 | 0 | 0 |  |  |  |  | $prf->{f2}{$key} += $_->[0]*$fcoef if (exists $prf->{f12}{$key}); | 
| 343 |  |  |  |  |  |  | } | 
| 344 |  |  |  |  |  |  | } | 
| 345 |  |  |  |  |  |  |  | 
| 346 |  |  |  |  |  |  | ##-- finalize: collect multi-profile (don't trim) | 
| 347 | 0 |  |  |  |  |  | my $mp = DiaColloDB::Profile::Multi->new(profiles => [@y2prf{sort {$a<=>$b} keys %y2prf}], N=>0,f1=>0); | 
|  | 0 |  |  |  |  |  |  | 
| 348 | 0 |  |  |  |  |  | $mp->trim(empty=>0, extend=>$slice2keys); | 
| 349 | 0 |  |  |  |  |  | return $mp; | 
| 350 |  |  |  |  |  |  | } | 
| 351 |  |  |  |  |  |  |  | 
| 352 |  |  |  |  |  |  |  | 
| 353 |  |  |  |  |  |  | ##-------------------------------------------------------------- | 
| 354 |  |  |  |  |  |  | ## Relation API: compare | 
| 355 |  |  |  |  |  |  |  | 
| 356 |  |  |  |  |  |  | ## $mprf = $rel->compare($coldb, %opts) | 
| 357 |  |  |  |  |  |  | ## + get a comparison profile for selected items as a DiaColloDB::Profile::MultiDiff object | 
| 358 |  |  |  |  |  |  | ## + %opts: as for DiaColloDB::Relation::compare(), also: | 
| 359 |  |  |  |  |  |  | ##   ( | 
| 360 |  |  |  |  |  |  | ##    ##-- sampling options | 
| 361 |  |  |  |  |  |  | ##    (a|b)?limit => $limit,       ##-- maximum number of items to return from ddc; sets $qconfig{limit} (default: query "#limit[N]" or $rel->{ddcLimit}) | 
| 362 |  |  |  |  |  |  | ##    (a|b)?sample => $sample,     ##-- ddc sample size; sets $qconfig{qcount} Sample property (default: query "#sample[N]" or $rel->{ddcSample}) | 
| 363 |  |  |  |  |  |  | ##    (a|b)?cfmin => $cfmin,       ##-- minimum subcorpus frequency for returned items (default: query "#fmin[N]" or $rel->{cfmin}) | 
| 364 |  |  |  |  |  |  | ##    (a|b)?dmax  => $dmax,        ##-- maxmimum distance for implicit near() queries (default: query "#dmax[N]" or $rel->{dmax}) | 
| 365 |  |  |  |  |  |  | ##   ) | 
| 366 |  |  |  |  |  |  | sub compare { | 
| 367 | 0 |  |  | 0 | 1 |  | my $rel = shift; | 
| 368 | 0 |  |  |  |  |  | return $rel->SUPER::compare(@_, _gbparse=>0, _abkeys=>[qw(limit sample cfmin dmax)], strings=>0); | 
| 369 |  |  |  |  |  |  | } | 
| 370 |  |  |  |  |  |  |  | 
| 371 |  |  |  |  |  |  | ##============================================================================== | 
| 372 |  |  |  |  |  |  | ## Utils: profiling | 
| 373 |  |  |  |  |  |  |  | 
| 374 |  |  |  |  |  |  | ##-------------------------------------------------------------- | 
| 375 |  |  |  |  |  |  | ## $dclient = $rel->ddcClient(%opts) | 
| 376 |  |  |  |  |  |  | ##  + returns cached $rel->{dclient} if defined, otherwise creates and caches a new client | 
| 377 |  |  |  |  |  |  | ##  + chokes if ddcServer is not defined | 
| 378 |  |  |  |  |  |  | ##  + %opts clobber %{$rel->{dclient}} | 
| 379 |  |  |  |  |  |  | sub ddcClient { | 
| 380 | 0 |  |  | 0 | 1 |  | my ($rel,%opts) = @_; | 
| 381 | 0 | 0 |  |  |  |  | if (defined($rel->{dclient})) { | 
| 382 |  |  |  |  |  |  | ##-- cached client: just set options | 
| 383 | 0 |  |  |  |  |  | @{$rel->{dclient}}{keys %opts} = values %opts; | 
|  | 0 |  |  |  |  |  |  | 
| 384 | 0 |  |  |  |  |  | return $rel->{dclient}; | 
| 385 |  |  |  |  |  |  | } | 
| 386 |  |  |  |  |  |  |  | 
| 387 |  |  |  |  |  |  | ##-- sanity check(s) | 
| 388 | 0 | 0 |  |  |  |  | $rel->logconfess($opts{coldb}{error}="ddcClient(): no 'ddcServer' key defined") if (!$rel->{ddcServer}); | 
| 389 |  |  |  |  |  |  |  | 
| 390 |  |  |  |  |  |  | ##-- get server | 
| 391 | 0 |  |  |  |  |  | my ($server,$port) = @$rel{qw(ddcServer ddcPort)}; | 
| 392 | 0 |  | 0 |  |  |  | $server ||= 'localhost'; | 
| 393 | 0 | 0 |  |  |  |  | $port     = $1 if ($server =~ s/\:([0-9]+)$//); | 
| 394 | 0 |  | 0 |  |  |  | $port   ||= 50000; | 
| 395 |  |  |  |  |  |  |  | 
| 396 |  |  |  |  |  |  | return $rel->{dclient} = DDC::Client::Distributed->new( | 
| 397 | 0 |  | 0 |  |  |  | timeout=>($rel->{ddcTimeout}//300), | 
| 398 |  |  |  |  |  |  | connect=>{PeerAddr=>$server,PeerPort=>$port}, | 
| 399 |  |  |  |  |  |  | mode=>'json', | 
| 400 |  |  |  |  |  |  | %opts | 
| 401 |  |  |  |  |  |  | ); | 
| 402 |  |  |  |  |  |  | } | 
| 403 |  |  |  |  |  |  |  | 
| 404 |  |  |  |  |  |  | ##-------------------------------------------------------------- | 
| 405 |  |  |  |  |  |  | ## $results = $rel->ddcQuery($coldb, $query_or_str, %opts) | 
| 406 |  |  |  |  |  |  | ##  + %opts: | 
| 407 |  |  |  |  |  |  | ##     logas => $prefix,   ##-- log prefix (default: 'ddcQuery()') | 
| 408 |  |  |  |  |  |  | ##     loglevel => $level, ##-- log level (default=$coldb->{logProfile}) | 
| 409 |  |  |  |  |  |  | ##     limit => $limit,    ##-- set result client limit (default: current client limit, or -1 for limit=>undef) | 
| 410 |  |  |  |  |  |  | ##     logTrunc => $len,   ##-- truncate long query strings to max $len characters (default=-1: full string) | 
| 411 |  |  |  |  |  |  | sub ddcQuery { | 
| 412 | 0 |  |  | 0 | 1 |  | my ($rel,$coldb,$query,%opts) = @_; | 
| 413 | 0 |  | 0 |  |  |  | my $logas = $opts{logas} // 'ddcQuery'; | 
| 414 | 0 | 0 |  |  |  |  | my $level = exists($opts{loglevel}) ? $opts{loglevel} : $coldb->{logProfile}; | 
| 415 | 0 | 0 | 0 |  |  |  | my $trunc = $opts{logTrunc} // (ref($rel) ? $rel->{logTrunc} : undef) // -1; | 
|  |  |  | 0 |  |  |  |  | 
| 416 |  |  |  |  |  |  |  | 
| 417 | 0 | 0 |  |  |  |  | my $qstr = ref($query) ? $query->toStringFull : $query; | 
| 418 | 0 |  |  |  |  |  | my $cli  = $rel->ddcClient(); | 
| 419 | 0 | 0 | 0 |  |  |  | $cli->{limit} = $opts{limit}//-1 if (exists($opts{limit})); | 
| 420 |  |  |  |  |  |  |  | 
| 421 | 0 | 0 | 0 |  |  |  | $rel->vlog($level, "$logas: query[server=$rel->{ddcServer},limit=$cli->{limit},timeout=$cli->{timeout}]: ", | 
| 422 |  |  |  |  |  |  | ($trunc < 0 || length($qstr) <= $trunc ? $qstr : (substr($qstr,0,$trunc)."..."))); | 
| 423 |  |  |  |  |  |  | $cli->open() | 
| 424 |  |  |  |  |  |  | or $rel->logconfess($coldb->{error}="$logas: failed to connect to DDC server on $rel->{ddcServer}: $!") | 
| 425 | 0 | 0 | 0 |  |  |  | if (!defined($cli->{sock})); | 
| 426 | 0 |  |  |  |  |  | my $result = $cli->queryJson($qstr); | 
| 427 |  |  |  |  |  |  |  | 
| 428 |  |  |  |  |  |  | $rel->logconfess($coldb->{error}="$logas ERROR: DDC query failed: ".($result->{error_}//'(undefined error)')) | 
| 429 | 0 | 0 | 0 |  |  |  | if ($result->{error_} || $result->{istatus_} || $result->{nstatus_} || !$result->{counts_}); | 
|  |  |  | 0 |  |  |  |  | 
|  |  |  | 0 |  |  |  |  | 
|  |  |  | 0 |  |  |  |  | 
| 430 | 0 | 0 | 0 |  |  |  | my $approx = ($result->{end_}//0) < ($result->{nhits_}//2**32) ? "~" : ''; | 
|  |  |  | 0 |  |  |  |  | 
| 431 | 0 |  | 0 |  |  |  | $rel->vlog($level, "$logas: fetched ", ($result->{end_}//'?'), " of $approx", ($result->{nhits_}//'?'), " result row(s)"); | 
|  |  |  | 0 |  |  |  |  | 
| 432 |  |  |  |  |  |  |  | 
| 433 | 0 |  |  |  |  |  | return $result; | 
| 434 |  |  |  |  |  |  | } | 
| 435 |  |  |  |  |  |  |  | 
| 436 |  |  |  |  |  |  | ##-------------------------------------------------------------- | 
| 437 |  |  |  |  |  |  | ## $fcoef = $rel->fcoef($cquery) | 
| 438 |  |  |  |  |  |  | sub fcoef { | 
| 439 | 0 |  |  | 0 | 1 |  | my ($rel,$qnod) = @_; | 
| 440 | 0 | 0 | 0 |  |  |  | if (UNIVERSAL::isa($qnod,'DDC::Any::CQNear')) { | 
|  |  | 0 |  |  |  |  |  | 
|  |  | 0 |  |  |  |  |  | 
| 441 | 0 |  |  |  |  |  | return 2 * ($qnod->getDist + 1) * $rel->fcoef($qnod->getDtr1) * $rel->fcoef($qnod->getDtr2); | 
| 442 |  |  |  |  |  |  | } | 
| 443 |  |  |  |  |  |  | elsif (UNIVERSAL::isa($qnod,'DDC::Any::CQAnd') || UNIVERSAL::isa($qnod,'DDC::Any::CQOr')) { | 
| 444 | 0 |  |  |  |  |  | return max2($rel->fcoef($qnod->getDtr1),$rel->fcoef($qnod->getDtr2)); | 
| 445 |  |  |  |  |  |  | } | 
| 446 |  |  |  |  |  |  | elsif (UNIVERSAL::isa($qnod,'DDC::Any::CQSeq')) { | 
| 447 | 0 |  |  |  |  |  | my $fcoef = 1; | 
| 448 | 0 |  |  |  |  |  | my $items = $qnod->getItems; | 
| 449 | 0 |  |  |  |  |  | my $dists = $qnod->getDists; | 
| 450 | 0 |  |  |  |  |  | my $dops  = $qnod->getDistOps; | 
| 451 | 0 |  |  |  |  |  | foreach (0..$#$dists) { | 
| 452 | 0 |  |  |  |  |  | $fcoef *= $rel->fcoef($items->[$_]); | 
| 453 | 0 |  | 0 |  |  |  | $dops->[$_] //= '<'; | 
| 454 | 0 | 0 |  |  |  |  | if ($dops->[$_] eq '<') { | 
|  |  | 0 |  |  |  |  |  | 
| 455 | 0 |  |  |  |  |  | $fcoef *= ($dists->[$_]+1); | 
| 456 |  |  |  |  |  |  | } | 
| 457 |  |  |  |  |  |  | elsif ($dops->[$_] eq '>') { | 
| 458 | 0 |  |  |  |  |  | $fcoef *= 32-($dists->[$_]+1); ##-- ddc global MaxDistanceForNear=32 | 
| 459 |  |  |  |  |  |  | } | 
| 460 |  |  |  |  |  |  | } | 
| 461 | 0 |  |  |  |  |  | return $fcoef * $rel->fcoef($items->[$#$items]); | 
| 462 |  |  |  |  |  |  | } | 
| 463 | 0 |  |  |  |  |  | return 1; | 
| 464 |  |  |  |  |  |  | } | 
| 465 |  |  |  |  |  |  |  | 
| 466 |  |  |  |  |  |  | ##-------------------------------------------------------------- | 
| 467 |  |  |  |  |  |  | ## $qcount = $rel->countQuery($coldb,\%opts) | 
| 468 |  |  |  |  |  |  | ## + creates a DDC::Any::CQCount object for profile() options %opts | 
| 469 |  |  |  |  |  |  | ## + %opts: as for DiaColloDB::Relation::DDC::profile() | 
| 470 |  |  |  |  |  |  | ## + sets following keys in %opts: | 
| 471 |  |  |  |  |  |  | ##   ( | 
| 472 |  |  |  |  |  |  | ##    gbexprs => $gbexprs,      ##-- groupby expressions (DDC::Any::CQCountKeyExprList) | 
| 473 |  |  |  |  |  |  | ##    gbrestr => $gbrestr,      ##-- groupby item2 restrictions (DDC::Any::CQWith conjunction of token expressions) | 
| 474 |  |  |  |  |  |  | ##    gbfilters => \@gbfilters, ##-- groupby filter restrictions (ARRAY-ref of DDC::Any::CQFilter) | 
| 475 |  |  |  |  |  |  | ##    gbtitles => \@titles,     ##-- groupby column titles (ARRAY-ref of strings) | 
| 476 |  |  |  |  |  |  | ##    limit  => $limit,		##-- hit return limit for ddc query | 
| 477 |  |  |  |  |  |  | ##    dslo   => $dslo,          ##-- minimum date-slice, from @opts{qw(date slice fill)} | 
| 478 |  |  |  |  |  |  | ##    dshi   => $dshi,          ##-- maximum date-slice, from @opts{qw(date slice fill)} | 
| 479 |  |  |  |  |  |  | ##    dlo    => $dlo,           ##-- minimum date request (ddc) | 
| 480 |  |  |  |  |  |  | ##    dhi    => $dhi,           ##-- maximum date request (ddc) | 
| 481 |  |  |  |  |  |  | ##    fcoef  => $fcoef,		##-- frequency coefficient, parsed from "#coef[N]", auto-generated for near() queries | 
| 482 |  |  |  |  |  |  | ##    qtemplate => $qtemplate,  ##-- query template for ddc hit link-up | 
| 483 |  |  |  |  |  |  | ##    qcount1 => $qcount1,      ##-- count-query for f1 acquisition | 
| 484 |  |  |  |  |  |  | ##    fcoef1 => $fcoef1,        ##-- f1 coefficient for qcount1 | 
| 485 |  |  |  |  |  |  | ##   ) | 
| 486 |  |  |  |  |  |  | sub countQuery { | 
| 487 | 0 |  |  | 0 | 1 |  | my ($rel,$coldb,$opts) = @_; | 
| 488 |  |  |  |  |  |  |  | 
| 489 |  |  |  |  |  |  | ##-- groupby clause: user request | 
| 490 | 0 |  |  |  |  |  | my ($gbexprs,$gbrestr,$gbfilters) = @$opts{qw(gbexprs gbrestr gbfilters)} = $coldb->parseGroupBy($opts->{groupby}, %$opts); | 
| 491 |  |  |  |  |  |  |  | 
| 492 |  |  |  |  |  |  | ##-- query hacks: override options | 
| 493 | 0 |  | 0 |  |  |  | my $limit  = ($opts->{query} =~ s/\s*\#limit\s*[\s\[]\s*([\+\-]?\d+)\s*\]?//i ? $1 : ($opts->{limit}//$rel->{ddcLimit})) || -1; | 
| 494 | 0 |  | 0 |  |  |  | my $sample = ($opts->{query} =~ s/\s*\#samp(?:le)?\s*[\s\[]\s*([\+\-]?\d+)\s*\]?//i ? $1 : ($opts->{sample}//$rel->{ddcSample})) || -1; | 
| 495 | 0 |  | 0 |  |  |  | my $dmax   = ($opts->{query} =~ s/\s*\#d(?:ist)?max\s*[\s\[]\s*([\+\-]?\d+)\s*\]?//i ? $1 : ($opts->{dmax}//$rel->{dmax})) || 1; | 
| 496 | 0 | 0 | 0 |  |  |  | my $cfmin  = ($opts->{query} =~ s/\s*\#c?fmin\s*[\s\[]\s*([\+\-]?\d+)\s*\]?//i ? $1 : ($opts->{cfmin}//$rel->{cfmin})) // ''; | 
|  |  |  | 0 |  |  |  |  | 
| 497 | 0 | 0 |  |  |  |  | my $fcoef  = ($opts->{query} =~ s/\s*\#f?coef\s*[\s\[]s*([\+\-]?\d*\.?\d+(?:[eE][\+-]?\d+)?)\s*\]?//i ? $1 : $opts->{fcoef}); | 
| 498 |  |  |  |  |  |  |  | 
| 499 |  |  |  |  |  |  | ##-- parse daughter query & setup match-ids | 
| 500 | 0 |  |  |  |  |  | my $qdtr  = $coldb->parseQuery($opts->{query}, logas=>'query', default=>'', ddcmode=>-1); | 
| 501 | 0 |  | 0 |  |  |  | my $qopts = $qdtr->getOptions || DDC::Any::CQueryOptions->new; | 
| 502 | 0 |  |  |  |  |  | $qdtr->setOptions(undef); | 
| 503 |  |  |  |  |  |  |  | 
| 504 |  |  |  |  |  |  | ##-- get target query nodes (item1 ~ matchid =1, item2 ~ matchid =2) | 
| 505 |  |  |  |  |  |  | ##  + propagate explicit match-id from parent CQWith into CQToken nodes | 
| 506 | 0 |  |  |  |  |  | my (@qnodes1,@qnodes2); | 
| 507 | 0 |  |  |  |  |  | $rel->propagateMatchIds($qdtr); | 
| 508 | 0 |  |  |  |  |  | foreach (grep {UNIVERSAL::isa($_,'DDC::Any::CQToken')} @{$qdtr->Descendants}) { | 
|  | 0 |  |  |  |  |  |  | 
|  | 0 |  |  |  |  |  |  | 
| 509 | 0 | 0 |  |  |  |  | push(@qnodes1,$_) if ($_->getMatchId<=1); | 
| 510 | 0 | 0 |  |  |  |  | push(@qnodes2,$_) if ($_->getMatchId==2); | 
| 511 |  |  |  |  |  |  | } | 
| 512 | 0 | 0 | 0 |  |  |  | $rel->logconfess($coldb->{error}="no primary target-nodes found in daughter query '", $qdtr->toString, "': use match-id =1 to specify primary target(s)") | 
| 513 |  |  |  |  |  |  | if (!@qnodes1 && !@qnodes2); | 
| 514 | 0 |  |  |  |  |  | $_->setMatchId(1) foreach (@qnodes1); | 
| 515 |  |  |  |  |  |  |  | 
| 516 |  |  |  |  |  |  | ##-- check for target match-id =2 and maybe implicitly construct near() query | 
| 517 | 0 | 0 |  |  |  |  | if (!@qnodes2) { | 
|  |  | 0 |  |  |  |  |  | 
| 518 | 0 | 0 | 0 |  |  |  | $gbrestr //= DDC::Any::CQTokRegex->new('p',$coldb->{pgood},0) if ($coldb->{pgood}); ##-- simulate coldb 'pgood' restriction (content words only) | 
| 519 | 0 |  | 0 |  |  |  | my $qany = $gbrestr // DDC::Any::CQTokAny->new(); | 
| 520 | 0 |  |  |  |  |  | $qany->setMatchId(2); | 
| 521 | 0 | 0 |  |  |  |  | $dmax = 1 if ($dmax < 1); | 
| 522 | 0 |  |  |  |  |  | my $qnear = DDC::Any::CQNear->new($dmax-1,$qdtr,$qany); | 
| 523 |  |  |  |  |  |  | #@qnodes2  = ($qany); ##-- unused | 
| 524 | 0 |  |  |  |  |  | $qdtr     = $qnear; | 
| 525 |  |  |  |  |  |  | } | 
| 526 |  |  |  |  |  |  | elsif ($gbrestr) { | 
| 527 |  |  |  |  |  |  | ##-- append groupby restriction to all targets (=2) | 
| 528 | 0 |  |  |  |  |  | my ($nod,$newnod); | 
| 529 |  |  |  |  |  |  | $qdtr = $qdtr->mapTraverse(sub { | 
| 530 | 0 |  |  | 0 |  |  | $nod = shift; | 
| 531 | 0 | 0 | 0 |  |  |  | if (UNIVERSAL::isa($nod, 'DDC::Any::CQToken') && $nod->getMatchId == 2) { | 
| 532 | 0 |  |  |  |  |  | $newnod = DDC::Any::CQWith->new($nod,$gbrestr); | 
| 533 | 0 |  |  |  |  |  | $newnod->setOptions($nod->getOptions); | 
| 534 | 0 |  |  |  |  |  | $nod->setMatchId(0); | 
| 535 | 0 |  |  |  |  |  | $newnod->setMatchId(2); | 
| 536 | 0 |  |  |  |  |  | $nod->setOptions(undef); | 
| 537 | 0 |  |  |  |  |  | return $newnod; | 
| 538 |  |  |  |  |  |  | } | 
| 539 | 0 |  |  |  |  |  | return $nod; | 
| 540 | 0 |  |  |  |  |  | }); | 
| 541 |  |  |  |  |  |  | } | 
| 542 |  |  |  |  |  |  |  | 
| 543 |  |  |  |  |  |  | ##-- qdtr options | 
| 544 | 0 | 0 |  |  |  |  | $qopts->setSeparateHits(1) if ($opts->{query} !~ /\#(?:sep(?:arate)?|nojoin)(?:_hits)?\b/i); | 
| 545 | 0 |  |  |  |  |  | $qdtr->setOptions($qopts); | 
| 546 |  |  |  |  |  |  |  | 
| 547 |  |  |  |  |  |  | ##-- maybe guess fcoef | 
| 548 | 0 |  |  |  |  |  | my $fcoef_user = $fcoef; | 
| 549 | 0 |  | 0 |  |  |  | $fcoef //= $rel->fcoef($qdtr); | 
| 550 | 0 |  |  |  |  |  | $rel->vlog($coldb->{logProfile}, "guessed fcoef=$fcoef"); | 
| 551 |  |  |  |  |  |  |  | 
| 552 |  |  |  |  |  |  | ##-- date clause | 
| 553 | 0 |  |  |  |  |  | my ($dfilter,$dslo,$dshi,$dlo,$dhi) = $coldb->parseDateRequest(@$opts{qw(date slice fill)},1); | 
| 554 | 0 |  |  |  |  |  | my $filters = [@$gbfilters, @{$qopts->getFilters}]; | 
|  | 0 |  |  |  |  |  |  | 
| 555 | 0 | 0 | 0 |  |  |  | if ($dfilter && !grep {UNIVERSAL::isa($_,'DDC::Any::CQFDateSort')} @$filters) { | 
|  | 0 |  |  |  |  |  |  | 
| 556 | 0 | 0 |  |  |  |  | unshift(@$filters, DDC::Any::CQFDateSort->new(DDC::Any::LessByDate(), | 
|  |  | 0 |  |  |  |  |  | 
| 557 |  |  |  |  |  |  | ($dlo ? "${dlo}-00-00" : ''), | 
| 558 |  |  |  |  |  |  | ($dhi ? "${dhi}-12-31" : '') | 
| 559 |  |  |  |  |  |  | )); | 
| 560 |  |  |  |  |  |  | } | 
| 561 | 0 |  |  |  |  |  | $qopts->setFilters($filters); | 
| 562 |  |  |  |  |  |  |  | 
| 563 |  |  |  |  |  |  | ##-- set random seed if we're using a limited sample | 
| 564 | 0 | 0 |  |  |  |  | if ($sample > 0) { | 
| 565 | 0 |  |  |  |  |  | my $gotseed = 0; | 
| 566 | 0 |  |  |  |  |  | foreach (@$filters) { | 
| 567 | 0 | 0 |  |  |  |  | if (UNIVERSAL::isa($_,'DDC::Any::CQFRandomSort')) { | 
| 568 | 0 | 0 |  |  |  |  | $_->setArg1(int(rand(100))) if (!$_->getArg1); ##-- use 100 random seeds | 
| 569 | 0 |  |  |  |  |  | $gotseed = 1; | 
| 570 | 0 |  |  |  |  |  | last; | 
| 571 |  |  |  |  |  |  | } | 
| 572 |  |  |  |  |  |  | } | 
| 573 | 0 | 0 |  |  |  |  | if (!$gotseed) { | 
| 574 | 0 |  |  |  |  |  | push(@$filters, DDC::Any::CQFRandomSort->new(int(rand(100)))); | 
| 575 | 0 |  |  |  |  |  | $qopts->setFilters($filters); | 
| 576 |  |  |  |  |  |  | } | 
| 577 |  |  |  |  |  |  | } | 
| 578 |  |  |  |  |  |  |  | 
| 579 |  |  |  |  |  |  | ##-- setup query template | 
| 580 | 0 |  |  |  |  |  | my ($qtconds,$qtcond,@qtfilters); | 
| 581 | 0 |  |  |  |  |  | my $xi=0; | 
| 582 | 0 |  |  |  |  |  | foreach (@{$gbexprs->getExprs}) { | 
|  | 0 |  |  |  |  |  |  | 
| 583 | 0 | 0 | 0 |  |  |  | if (!defined($_) || UNIVERSAL::isa($_,'DDC::Any::CQCountKeyExprConstant') || UNIVERSAL::isa($_,'DDC::Any::CQCountKeyExprDate')) { | 
|  |  | 0 | 0 |  |  |  |  | 
|  |  | 0 | 0 |  |  |  |  | 
|  |  | 0 |  |  |  |  |  | 
| 584 |  |  |  |  |  |  | ; ##-- skip these | 
| 585 |  |  |  |  |  |  | } elsif (UNIVERSAL::isa($_,'DDC::Any::CQCountKeyExprToken')) { | 
| 586 |  |  |  |  |  |  | ##-- token expression | 
| 587 | 0 |  |  |  |  |  | $qtcond  = DDC::Any::CQTokExact->new($_->getIndexName, "__W2.${xi}__"); | 
| 588 | 0 | 0 |  |  |  |  | $qtconds = defined($qtconds) ? DDC::Any::CQWith->new($qtconds,$qtcond) : $qtcond; | 
| 589 | 0 | 0 |  |  |  |  | $_->setMatchId(2) if ($_->getMatchId==0); | 
| 590 |  |  |  |  |  |  | } elsif (UNIVERSAL::isa($_, 'DDC::Any::CQCountKeyExprBibl')) { | 
| 591 |  |  |  |  |  |  | ##-- bibl expression | 
| 592 | 0 |  |  |  |  |  | my $label = $_->getLabel; | 
| 593 |  |  |  |  |  |  | push(@qtfilters, DDC::Any::CQFHasField->new($label, "__W2.${xi}__")) | 
| 594 | 0 | 0 | 0 |  |  |  | if (!grep {((ref($_)//'') eq 'DDC::Any::CQFHasField') && $_->getArg0 eq $label} @$filters); | 
|  | 0 | 0 |  |  |  |  |  | 
| 595 |  |  |  |  |  |  | }  elsif (UNIVERSAL::isa($_, 'DDC::Any::CQCountKeyExprRegex') && UNIVERSAL::isa($_->getSrc, 'DDC::Any::CQCountKeyExprBibl')) { | 
| 596 |  |  |  |  |  |  | ##-- regex transformation: hack | 
| 597 | 0 | 0 | 0 |  |  |  | if ($_->getReplacement eq '' && $_->getPattern =~ /^(.)\.\*\$/) { | 
|  |  | 0 |  |  |  |  |  | 
| 598 |  |  |  |  |  |  | ##-- hack for simple prefix regex transformations (textclass) | 
| 599 | 0 |  |  |  |  |  | push(@qtfilters, DDC::Any::CQFHasFieldPrefix->new($_->getSrc->getLabel, "__W2.${xi}__")); | 
| 600 |  |  |  |  |  |  | } elsif ($_->getReplacement eq '') { | 
| 601 |  |  |  |  |  |  | ##-- hack for simple regex deletions (any substring; not always correct) | 
| 602 | 0 |  |  |  |  |  | push(@qtfilters, DDC::Any::CQFHasFieldInfix->new($_->getSrc->getLabel, "__W2.${xi}__")); | 
| 603 |  |  |  |  |  |  | } else { | 
| 604 |  |  |  |  |  |  | ##-- non-trivial regex transformation | 
| 605 | 0 |  |  |  |  |  | $coldb->warn("can't generate template for non-trivial regex transformation \`", $_->toString, "'"); | 
| 606 |  |  |  |  |  |  | } | 
| 607 |  |  |  |  |  |  | } else { | 
| 608 | 0 |  | 0 |  |  |  | $coldb->warn("can't generate template for groupby expression of type ", ref($_)//'(undefined)', " \`", $_->toString, "'"); | 
| 609 |  |  |  |  |  |  | } | 
| 610 | 0 |  |  |  |  |  | ++$xi; | 
| 611 |  |  |  |  |  |  | } | 
| 612 |  |  |  |  |  |  | #$qtconds //= DDC::Any::CQTokAny->new(); | 
| 613 | 0 | 0 |  |  |  |  | $qtconds->setMatchId(2) if ($qtconds); | 
| 614 |  |  |  |  |  |  | my $qtemplate = $qdtr->clone->mapTraverse(sub { | 
| 615 | 0 |  |  | 0 |  |  | my $nod = shift; | 
| 616 | 0 | 0 | 0 |  |  |  | if (UNIVERSAL::can($nod,'getMatchId') && $nod->getMatchId==2) { | 
| 617 | 0 |  | 0 |  |  |  | return $qtconds // $nod; | 
| 618 |  |  |  |  |  |  | } | 
| 619 | 0 |  |  |  |  |  | return $nod; | 
| 620 | 0 |  |  |  |  |  | }); | 
| 621 | 0 | 0 |  |  |  |  | $qtemplate->setOptions($qdtr->getOptions->clone) if (!$qtemplate->getOptions); ##-- traversal bug | 
| 622 | 0 |  |  |  |  |  | $qtemplate->getOptions->setFilters([@$filters,@qtfilters]); | 
| 623 |  |  |  |  |  |  |  | 
| 624 |  |  |  |  |  |  | ##-- cleanup coldb parser (so we're using "real" refcounts) | 
| 625 | 0 |  |  |  |  |  | $coldb->qcompiler->CleanParser(); | 
| 626 |  |  |  |  |  |  |  | 
| 627 |  |  |  |  |  |  | ##-- construct count query | 
| 628 | 0 | 0 | 0 |  |  |  | $cfmin = '' if (($cfmin//1) <= 1); | 
| 629 | 0 |  |  |  |  |  | my $qcount = DDC::Any::CQCount->new($qdtr, $gbexprs, $sample, DDC::Any::GreaterByCountValue(), $cfmin); | 
| 630 |  |  |  |  |  |  |  | 
| 631 |  |  |  |  |  |  | ##-- contruct f1 count-query | 
| 632 | 0 |  |  |  |  |  | my $qcount1 = $opts->{qcount1} = $rel->collocantCountQuery($qcount, 1); | 
| 633 | 0 |  | 0 |  |  |  | $opts->{fcoef1} = $fcoef_user // ($fcoef / $rel->fcoef($qcount1->getDtr)); | 
| 634 |  |  |  |  |  |  |  | 
| 635 |  |  |  |  |  |  | ##-- set count-by expressions, titles | 
| 636 | 0 |  |  |  |  |  | my $gbexprs_array = $qcount->getKeys->getExprs; | 
| 637 |  |  |  |  |  |  | my $gbtitles = $opts->{gbtitles} = [map { | 
| 638 | 0 |  |  |  |  |  | $coldb->attrTitle($_->can('getIndexName') | 
| 639 |  |  |  |  |  |  | ? $_->getIndexName | 
| 640 | 0 | 0 |  |  |  |  | : do { (my $label=$_->toString) =~ s{\'((?:\\.|[^\'])*)\'}{$1}; $label =~ s{ ~ s/}{~s/}g; $label }) | 
|  | 0 |  |  |  |  |  |  | 
|  | 0 |  |  |  |  |  |  | 
|  | 0 |  |  |  |  |  |  | 
| 641 |  |  |  |  |  |  | } @$gbexprs_array[1..$#$gbexprs_array]]; | 
| 642 |  |  |  |  |  |  |  | 
| 643 |  |  |  |  |  |  | ##-- set options | 
| 644 | 0 |  |  |  |  |  | @$opts{qw(limit sample dslo dshi dlo dhi fcoef cfmin qtemplate)} = ($limit,$sample,$dslo,$dshi,$dlo,$dhi,$fcoef,$cfmin,$qtemplate->toStringFull); | 
| 645 | 0 |  |  |  |  |  | return $qcount; | 
| 646 |  |  |  |  |  |  | } | 
| 647 |  |  |  |  |  |  |  | 
| 648 |  |  |  |  |  |  | ##-------------------------------------------------------------- | 
| 649 |  |  |  |  |  |  | ## $nod = $rel->propagateMatchIds($nod,$parentMatchId=0) | 
| 650 |  |  |  |  |  |  | sub propagateMatchIds { | 
| 651 | 0 | 0 |  | 0 | 0 |  | my $that = UNIVERSAL::isa($_[0],__PACKAGE__) ? shift : __PACKAGE__; | 
| 652 | 0 |  |  |  |  |  | my ($nod,$matchid) = @_; | 
| 653 | 0 | 0 |  |  |  |  | return if (!UNIVERSAL::isa($nod,'DDC::Any::CQuery')); | 
| 654 | 0 |  | 0 |  |  |  | $matchid //= 0; | 
| 655 |  |  |  |  |  |  |  | 
| 656 | 0 | 0 | 0 |  |  |  | if ($nod->isa('DDC::Any::CQWith') || $nod->isa('DDC::Any::CQToken')) { | 
| 657 | 0 | 0 |  |  |  |  | $matchid = $nod->getMatchId | 
| 658 |  |  |  |  |  |  | if ($nod->toString =~ /=[0-9]+$/); ##-- test for local match-id (hack) | 
| 659 |  |  |  |  |  |  | } | 
| 660 | 0 | 0 | 0 |  |  |  | $nod->setMatchId($matchid) | 
| 661 |  |  |  |  |  |  | if ($nod->isa('DDC::Any::CQToken') && !$nod->HasMatchId); | 
| 662 |  |  |  |  |  |  |  | 
| 663 |  |  |  |  |  |  | $that->propagateMatchIds($_,$matchid) | 
| 664 | 0 |  |  |  |  |  | foreach (@{$nod->Children}); | 
|  | 0 |  |  |  |  |  |  | 
| 665 |  |  |  |  |  |  |  | 
| 666 | 0 |  |  |  |  |  | return $nod; | 
| 667 |  |  |  |  |  |  | } | 
| 668 |  |  |  |  |  |  |  | 
| 669 |  |  |  |  |  |  | ##-------------------------------------------------------------- | 
| 670 |  |  |  |  |  |  | ## $qcount1 = $rel->collocantCountQuery($qcount,$matchId) | 
| 671 |  |  |  |  |  |  | ##  + maps count-queries returned by countQuery() to ${matchId}-item queries (default $matchid=1) | 
| 672 |  |  |  |  |  |  | sub collocantCountQuery { | 
| 673 | 0 | 0 |  | 0 | 1 |  | my $that = UNIVERSAL::isa($_[0],__PACKAGE__) ? shift : __PACKAGE__; | 
| 674 | 0 |  |  |  |  |  | my ($qcount,$matchid) = @_; | 
| 675 | 0 |  | 0 |  |  |  | $matchid //= 1; | 
| 676 |  |  |  |  |  |  |  | 
| 677 |  |  |  |  |  |  | ##-- map item-conditions | 
| 678 | 0 |  |  |  |  |  | my $qdtr = $qcount->getDtr; | 
| 679 | 0 |  |  |  |  |  | my $idtr = $that->itemCountNode($qdtr, $matchid); | 
| 680 | 0 | 0 |  |  |  |  | if (!$idtr) { | 
| 681 | 0 |  |  |  |  |  | $idtr = DDC::Any::CQTokAny->new(); | 
| 682 | 0 | 0 |  |  |  |  | $idtr->setMatchId($matchid) if ($matchid != 1); | 
| 683 |  |  |  |  |  |  | } | 
| 684 |  |  |  |  |  |  |  | 
| 685 |  |  |  |  |  |  | ##-- inherit options | 
| 686 | 0 |  |  |  |  |  | $idtr->setOptions($qdtr->getOptions->clone); | 
| 687 |  |  |  |  |  |  |  | 
| 688 |  |  |  |  |  |  | ##-- get count-query | 
| 689 | 0 |  |  |  |  |  | my $icount = $qcount->clone; | 
| 690 | 0 | 0 |  |  |  |  | my $iexprs = [ grep {!UNIVERSAL::can($_,'getMatchId') || $_->getMatchId==$matchid} @{$icount->getKeys->getExprs()} ]; | 
|  | 0 |  |  |  |  |  |  | 
|  | 0 |  |  |  |  |  |  | 
| 691 | 0 |  |  |  |  |  | $icount->setDtr($idtr); | 
| 692 | 0 |  |  |  |  |  | $icount->getKeys->setExprs($iexprs); | 
| 693 |  |  |  |  |  |  |  | 
| 694 | 0 |  |  |  |  |  | return $icount; | 
| 695 |  |  |  |  |  |  | } | 
| 696 |  |  |  |  |  |  |  | 
| 697 |  |  |  |  |  |  | ##-------------------------------------------------------------- | 
| 698 |  |  |  |  |  |  | ## $nod2_or_undef = $rel->itemCountNode($nod,$matchId) | 
| 699 |  |  |  |  |  |  | ##  + maps countQuery() nodes to ${matchId}-query nodes only | 
| 700 |  |  |  |  |  |  | ##  + simplifies by removing extraneous CQBinOp, CQNear, and CQSeq nodes | 
| 701 |  |  |  |  |  |  | sub itemCountNode { | 
| 702 | 0 | 0 |  | 0 | 1 |  | my $that = UNIVERSAL::isa($_[0],__PACKAGE__) ? shift : __PACKAGE__; | 
| 703 | 0 |  |  |  |  |  | my ($nod,$matchid) = @_; | 
| 704 | 0 |  | 0 |  |  |  | $matchid //= 1; | 
| 705 | 0 | 0 |  |  |  |  | return undef if (!defined($nod)); | 
| 706 |  |  |  |  |  |  |  | 
| 707 | 0 | 0 | 0 |  |  |  | if (UNIVERSAL::can($nod,'getMatchId') && ($nod->getMatchId//0) != $matchid) { | 
|  |  | 0 | 0 |  |  |  |  | 
|  |  | 0 |  |  |  |  |  | 
|  |  | 0 |  |  |  |  |  | 
| 708 | 0 |  |  |  |  |  | return undef; | 
| 709 |  |  |  |  |  |  | } | 
| 710 |  |  |  |  |  |  | elsif (UNIVERSAL::isa($nod,'DDC::Any::CQBinOp')) { | 
| 711 | 0 |  |  |  |  |  | my $dtr1 = $that->itemCountNode($nod->getDtr1,$matchid); | 
| 712 | 0 |  |  |  |  |  | my $dtr2 = $that->itemCountNode($nod->getDtr2,$matchid); | 
| 713 | 0 | 0 | 0 |  |  |  | if ($dtr1 && $dtr2) { | 
|  |  | 0 | 0 |  |  |  |  | 
| 714 | 0 |  |  |  |  |  | my $nod2 = $nod->clone; | 
| 715 | 0 |  |  |  |  |  | $nod2->setDtr1($dtr1); | 
| 716 | 0 |  |  |  |  |  | $nod2->setDtr2($dtr2); | 
| 717 | 0 |  |  |  |  |  | return $nod2; | 
| 718 |  |  |  |  |  |  | } | 
| 719 |  |  |  |  |  |  | elsif ($dtr1 || $dtr2) { | 
| 720 | 0 | 0 |  |  |  |  | my $nod2    = $dtr1 ? $dtr1 : $dtr2; | 
| 721 | 0 |  |  |  |  |  | my $negated = $nod->Negated; | 
| 722 | 0 | 0 | 0 |  |  |  | $negated = !$negated if (UNIVERSAL::isa($nod,'DDC::Any::CQWithout') && $nod2 eq $dtr2); | 
| 723 | 0 | 0 |  |  |  |  | $nod2->Negate() if ($negated); | 
| 724 | 0 |  |  |  |  |  | return $nod2; | 
| 725 |  |  |  |  |  |  | } | 
| 726 | 0 |  |  |  |  |  | return undef; | 
| 727 |  |  |  |  |  |  | } | 
| 728 |  |  |  |  |  |  | elsif (UNIVERSAL::isa($nod,'DDC::Any::CQSeq')) { | 
| 729 | 0 |  |  |  |  |  | my @items = map {$that->itemCountNode($_,$matchid)} @{$nod->getItems}; | 
|  | 0 |  |  |  |  |  |  | 
|  | 0 |  |  |  |  |  |  | 
| 730 | 0 |  |  |  |  |  | my $nitems = scalar(grep {defined($_)} @items); | 
|  | 0 |  |  |  |  |  |  | 
| 731 | 0 | 0 |  |  |  |  | if ($nitems == 0) { | 
|  |  | 0 |  |  |  |  |  | 
| 732 |  |  |  |  |  |  | ##-- no items in phrase: skip it | 
| 733 | 0 |  |  |  |  |  | return undef; | 
| 734 |  |  |  |  |  |  | } | 
| 735 |  |  |  |  |  |  | elsif ($nitems == 1) { | 
| 736 |  |  |  |  |  |  | ##-- singleton phrase: simplify | 
| 737 | 0 |  |  |  |  |  | my $nod2 = (grep {defined($_)} @items)[0]->clone; | 
|  | 0 |  |  |  |  |  |  | 
| 738 | 0 | 0 |  |  |  |  | $nod2->Negate() if ($nod->Negated); | 
| 739 | 0 |  |  |  |  |  | return $nod2; | 
| 740 |  |  |  |  |  |  | } | 
| 741 |  |  |  |  |  |  | else { | 
| 742 |  |  |  |  |  |  | ##-- multiple items in phrase: keep it a CQSeq, but insert wildcards | 
| 743 | 0 |  |  |  |  |  | my $nod2 = $nod->clone; | 
| 744 | 0 | 0 |  |  |  |  | @items = map {defined($_) ? $_ : DDC::Any::CQTokAny->new} @items; | 
|  | 0 |  |  |  |  |  |  | 
| 745 | 0 |  |  |  |  |  | $nod2->setItems(\@items); | 
| 746 | 0 |  |  |  |  |  | return $nod2; | 
| 747 |  |  |  |  |  |  | } | 
| 748 |  |  |  |  |  |  | } | 
| 749 |  |  |  |  |  |  | elsif (UNIVERSAL::isa($nod,'DDC::Any::CQNear')) { | 
| 750 | 0 |  |  |  |  |  | my @dtrs = grep {defined($_)} map {$that->itemCountNode($_,$matchid)} ($nod->getDtr1, $nod->getDtr2, $nod->getDtr3); | 
|  | 0 |  |  |  |  |  |  | 
|  | 0 |  |  |  |  |  |  | 
| 751 | 0 | 0 |  |  |  |  | if (!@dtrs) { | 
|  |  | 0 |  |  |  |  |  | 
| 752 | 0 |  |  |  |  |  | return undef; | 
| 753 |  |  |  |  |  |  | } | 
| 754 |  |  |  |  |  |  | elsif (@dtrs == 1) { | 
| 755 | 0 |  |  |  |  |  | my $nod2 = $dtrs[0]; | 
| 756 | 0 | 0 |  |  |  |  | $nod2->Negate() if ($nod->Negated); | 
| 757 | 0 |  |  |  |  |  | return $nod2; | 
| 758 |  |  |  |  |  |  | } | 
| 759 |  |  |  |  |  |  | else { | 
| 760 | 0 |  |  |  |  |  | my $nod2 = $nod->clone; | 
| 761 | 0 |  |  |  |  |  | $nod2->setDtr1($dtrs[0]); | 
| 762 | 0 |  |  |  |  |  | $nod2->setDtr2($dtrs[1]); | 
| 763 | 0 |  |  |  |  |  | $nod2->setDtr3($dtrs[2]); | 
| 764 | 0 |  |  |  |  |  | return $nod2; | 
| 765 |  |  |  |  |  |  | } | 
| 766 |  |  |  |  |  |  | } | 
| 767 | 0 |  |  |  |  |  | return $nod->clone; ##-- default | 
| 768 |  |  |  |  |  |  | } | 
| 769 |  |  |  |  |  |  |  | 
| 770 |  |  |  |  |  |  | ##-------------------------------------------------------------- | 
| 771 |  |  |  |  |  |  | ## %ATTR_SPECIFICITY = ($tokenAttributeName => $specificity, ...) | 
| 772 |  |  |  |  |  |  | ##  + hack for estimating most-specific attrigbute for collocateCountQueries() | 
| 773 |  |  |  |  |  |  | ##  + "proper" way to do this would be to query DDC info and compute domain sizes | 
| 774 |  |  |  |  |  |  | our (%ATTR_SPECIFICITY); | 
| 775 |  |  |  |  |  |  | BEGIN { | 
| 776 |  |  |  |  |  |  | %ATTR_SPECIFICITY = | 
| 777 |  |  |  |  |  |  | ( | 
| 778 | 2 |  |  |  |  | 7 | (map {($_=>1000)} qw(Utf8 u)), | 
| 779 | 4 |  |  |  |  | 11 | (map {($_=> 900)} qw(Token w CanonicalToken v)), | 
| 780 | 2 |  |  |  |  | 4 | (map {($_=> 500)} qw(Lemma l)), | 
| 781 | 1 |  |  | 1 |  | 13 | (map {($_=>   5)} qw(Pos p)), | 
|  | 2 |  |  |  |  | 742 |  | 
| 782 |  |  |  |  |  |  | 'DEFAULT' => 0, | 
| 783 |  |  |  |  |  |  | ); | 
| 784 |  |  |  |  |  |  | } | 
| 785 |  |  |  |  |  |  |  | 
| 786 |  |  |  |  |  |  | ##-------------------------------------------------------------- | 
| 787 |  |  |  |  |  |  | ## \@qcounts2 = $rel->collocateCountQueries($qcount,\%slice2prf,\%opts) | 
| 788 |  |  |  |  |  |  | ## + gets a list of DDC::Any::CQCount object(s) for f2-acquisition given profile() options %opts | 
| 789 |  |  |  |  |  |  | ## + %opts: as for countQuery(), DiaColloDB::Relation::DDC::profile(), etc. | 
| 790 |  |  |  |  |  |  | ## + sets following keys in %opts: | 
| 791 |  |  |  |  |  |  | ##   ( | 
| 792 |  |  |  |  |  |  | ##    needCountsByToken => $bool, ##-- see needCountsByToken() | 
| 793 |  |  |  |  |  |  | ##   ) | 
| 794 |  |  |  |  |  |  | sub collocateCountQueries { | 
| 795 | 0 |  |  | 0 | 1 |  | my ($rel,$qcount,$y2prf,$opts) = @_; | 
| 796 |  |  |  |  |  |  |  | 
| 797 |  |  |  |  |  |  | ##-- check whether we're grouping by any token attributes for match-id =2 | 
| 798 | 0 |  | 0 |  |  |  | $opts->{needCountsByToken} //= $rel->needCountsByToken($qcount); | 
| 799 |  |  |  |  |  |  |  | 
| 800 |  |  |  |  |  |  | ##-- construct f2-queries | 
| 801 | 0 |  |  |  |  |  | my (@qcounts2); | 
| 802 | 0 | 0 | 0 |  |  |  | if ($opts->{needCountsByToken} && $opts->{onepass}) { | 
|  |  | 0 |  |  |  |  |  | 
| 803 |  |  |  |  |  |  | ##-- count-by token attributes, 1-pass mode (~ DiaColloDB <= v0.12.016) | 
| 804 | 0 |  |  |  |  |  | my $qkeys2 = DDC::Any::CQKeys->new($qcount); | 
| 805 | 0 |  |  |  |  |  | $qkeys2->setOptions($qcount->getDtr->getOptions); | 
| 806 | 0 |  |  |  |  |  | $qkeys2->SetMatchId(2); | 
| 807 | 0 |  |  |  |  |  | @qcounts2 = (DDC::Any::CQCount->new($qkeys2, $qcount->getKeys, $qcount->getSample, $qcount->getSort, $qcount->getLo, $qcount->getHi)); | 
| 808 |  |  |  |  |  |  | } | 
| 809 |  |  |  |  |  |  | elsif ($opts->{needCountsByToken}) { | 
| 810 |  |  |  |  |  |  | ##-- count-by token attributes, multi-pass mode (DiaColloDB >= v0.12.017) | 
| 811 |  |  |  |  |  |  | #my $template = $rel->collocateQueryTemplate($qcount); | 
| 812 | 0 | 0 |  |  |  |  | my @gbattrs  = map { UNIVERSAL::can($_,'getIndexName') ? $_->getIndexName : 'DEFAULT' } @{$opts->{gbexprs}->getExprs}; | 
|  | 0 |  |  |  |  |  |  | 
|  | 0 |  |  |  |  |  |  | 
| 813 | 0 |  |  |  |  |  | shift(@gbattrs); ##-- 1st projected attribute is ALWAYS date-slice | 
| 814 |  |  |  |  |  |  |  | 
| 815 |  |  |  |  |  |  | ##-- get "most-specific projected attribute" (MSPA): that projected attribute with most diverse value domain | 
| 816 |  |  |  |  |  |  | ## + via hack using %ATTR_SPECIFICITY | 
| 817 |  |  |  |  |  |  | ## + see also Cofreqs::subprofile2() | 
| 818 | 0 |  | 0 |  |  |  | my $mspai = (sort {$b->[1]<=>$a->[1]} map {[$_,$ATTR_SPECIFICITY{$gbattrs[$_]}//0]} (0..$#gbattrs))[0][0]; | 
|  | 0 |  |  |  |  |  |  | 
|  | 0 |  |  |  |  |  |  | 
| 819 | 0 |  |  |  |  |  | my $mspa  = $gbattrs[$mspai]; | 
| 820 | 0 | 0 |  |  |  |  | $rel->logconfess("collocateCountQueries(): can't determine most specific projected attribute for f2 acquisition") | 
| 821 |  |  |  |  |  |  | if ($mspa eq 'DEFAULT'); | 
| 822 |  |  |  |  |  |  |  | 
| 823 |  |  |  |  |  |  | ##-- extract all MSPA-values | 
| 824 | 0 |  |  |  |  |  | my %mspvals = qw(); | 
| 825 | 0 |  |  |  |  |  | foreach my $prf (values %$y2prf) { | 
| 826 |  |  |  |  |  |  | $mspvals{(split(/\t/,$_,scalar(@gbattrs)))[$mspai]} = undef | 
| 827 | 0 |  |  |  |  |  | foreach (keys %{$prf->{f12}}); | 
|  | 0 |  |  |  |  |  |  | 
| 828 |  |  |  |  |  |  | } | 
| 829 | 0 |  |  |  |  |  | my @mspvals = sort keys %mspvals; | 
| 830 |  |  |  |  |  |  |  | 
| 831 |  |  |  |  |  |  | ##-- construct daughter queries | 
| 832 | 0 |  |  |  |  |  | my $max_qlen = 2048; ##-- ddc max = 4096 | 
| 833 | 0 |  |  |  |  |  | my ($val2); | 
| 834 | 0 |  |  |  |  |  | my @qvals2 = qw(); | 
| 835 | 0 |  |  |  |  |  | my $qlen2  = 0; | 
| 836 | 0 |  |  |  |  |  | while (defined($val2=shift(@mspvals))) { | 
| 837 | 0 |  |  |  |  |  | push(@qvals2, $val2); | 
| 838 | 0 |  |  |  |  |  | $qlen2 += (length($val2)+3); ##-- ",'${val}'" | 
| 839 | 0 | 0 | 0 |  |  |  | if (!@mspvals || $qlen2 >= $max_qlen) { | 
| 840 | 0 |  |  |  |  |  | my $qdtr2   = DDC::Any::CQTokSet->new($mspa,'',\@qvals2); | 
| 841 | 0 |  |  |  |  |  | $qdtr2->setMatchId(2); | 
| 842 | 0 |  |  |  |  |  | $qdtr2->setOptions($qcount->getDtr->getOptions); | 
| 843 | 0 |  |  |  |  |  | push(@qcounts2,$qcount->clone); | 
| 844 | 0 |  |  |  |  |  | $qcounts2[$#qcounts2]->setDtr($qdtr2); | 
| 845 | 0 |  |  |  |  |  | @qvals2 = qw(); | 
| 846 | 0 |  |  |  |  |  | $qlen2 = 0; | 
| 847 |  |  |  |  |  |  | } | 
| 848 |  |  |  |  |  |  | } | 
| 849 |  |  |  |  |  |  | } | 
| 850 |  |  |  |  |  |  | else { | 
| 851 |  |  |  |  |  |  | ##-- count-by file attributes only | 
| 852 | 0 |  |  |  |  |  | my $qdtr2 = DDC::Any::CQTokAny->new; | 
| 853 |  |  |  |  |  |  | #$qdtr2->SetMatchId(2); ##-- not needed here, file-attributes only | 
| 854 | 0 |  |  |  |  |  | $qdtr2->setOptions($qcount->getDtr->getOptions); | 
| 855 | 0 |  |  |  |  |  | my $qcount2 = $qcount->clone(); | 
| 856 | 0 |  |  |  |  |  | $qcount2->setDtr($qdtr2); | 
| 857 | 0 |  |  |  |  |  | @qcounts2 = ($qcount->clone); | 
| 858 |  |  |  |  |  |  | } | 
| 859 |  |  |  |  |  |  |  | 
| 860 | 0 |  |  |  |  |  | return \@qcounts2; | 
| 861 |  |  |  |  |  |  | } | 
| 862 |  |  |  |  |  |  |  | 
| 863 |  |  |  |  |  |  |  | 
| 864 |  |  |  |  |  |  | ##-------------------------------------------------------------- | 
| 865 |  |  |  |  |  |  | ## $bool = $CLASS_OR_OBJECT->needCountsByToken($qcount) | 
| 866 |  |  |  |  |  |  | ##  + returns true iff $qcount groups by any token attributes for match-id =2 | 
| 867 |  |  |  |  |  |  | sub needCountsByToken { | 
| 868 | 0 | 0 |  | 0 | 1 |  | my $that   = UNIVERSAL::isa($_[0],__PACKAGE__) ? shift : __PACKAGE__; | 
| 869 | 0 |  |  |  |  |  | my $qcount = shift; | 
| 870 |  |  |  |  |  |  |  | 
| 871 |  |  |  |  |  |  | ##-- check whether count-query uses any token-attributes | 
| 872 | 0 | 0 |  |  |  |  | return grep {UNIVERSAL::can($_,'getMatchId') && $_->getMatchId==2} @{$qcount->getKeys->getExprs}; | 
|  | 0 |  |  |  |  |  |  | 
|  | 0 |  |  |  |  |  |  | 
| 873 |  |  |  |  |  |  | } | 
| 874 |  |  |  |  |  |  |  | 
| 875 |  |  |  |  |  |  |  | 
| 876 |  |  |  |  |  |  | ##============================================================================== | 
| 877 |  |  |  |  |  |  | ## Pacakge Alias(es) | 
| 878 |  |  |  |  |  |  | package DiaColloDB::DDC; | 
| 879 | 1 |  |  | 1 |  | 26 | use strict; | 
|  | 1 |  |  |  |  | 3 |  | 
|  | 1 |  |  |  |  | 87 |  | 
| 880 |  |  |  |  |  |  | our @ISA = qw(DiaColloDB::Relation::DDC); | 
| 881 |  |  |  |  |  |  |  | 
| 882 |  |  |  |  |  |  |  | 
| 883 |  |  |  |  |  |  | ##============================================================================== | 
| 884 |  |  |  |  |  |  | ## Footer | 
| 885 |  |  |  |  |  |  | 1; | 
| 886 |  |  |  |  |  |  |  | 
| 887 |  |  |  |  |  |  | __END__ |