File Coverage

blib/lib/Combine/MySQLhdb.pm
Criterion Covered Total %
statement 22 210 10.4
branch 1 56 1.7
condition 0 3 0.0
subroutine 8 13 61.5
pod 0 6 0.0
total 31 288 10.7


line stmt bran cond sub pod time code
1             # MySql replacement for hdb
2             # AA0 2002-09-30
3             # Modified Open to return DBI connection and HDB table name
4              
5             package Combine::MySQLhdb;
6              
7 2     2   722 use strict;
  2         5  
  2         64  
8 2     2   10 use Combine::XWI;
  2         4  
  2         35  
9 2     2   1804 use HTTP::Date;
  2         8929  
  2         120  
10 2     2   2116 use Encode;
  2         28962  
  2         272  
11              
12             my $sv; # holds the mysql connection
13             my $table = ''; # holds the hdb table name
14             my $savehtml;
15             my $doOAI;
16              
17             sub Open { #needed??
18 2     2   587 use Combine::Config;
  2         7  
  2         1943  
19 1     1 0 10 $sv = Combine::Config::Get('MySQLhandle');
20 0         0 $savehtml = Combine::Config::Get('saveHTML');
21 0         0 $doOAI = Combine::Config::Get('doOAI');
22 0         0 my $hdbd = 'hdb';
23 0         0 return ($sv,$hdbd);
24             }
25              
26             sub Close {
27             # print "MySQLhdb::Close\n";
28 0     0 0 0 $sv->disconnect ;
29             }
30              
31             sub DESTROY {
32 0     0   0 print STDERR "MySQLhdb::DESTROY\n";
33 0         0 $sv->disconnect ;
34             }
35              
36             sub Write {
37 0     0 0 0 my ($xwi) = @_;
38 0 0       0 return undef unless $xwi;
39 0 0       0 if (!defined($sv)) { Open(); } #Init $sv CHANGE?
  0         0  
40 0         0 my $md5 = $xwi->md5;
41 0         0 my $recordid = $xwi->recordid; #Set by DataBase.pm
42             #OAI
43 0 0       0 if ($doOAI) {
44 0         0 $sv->prepare("REPLACE INTO oai SET status='created', recordid=?, md5=?")->execute($recordid, $md5);
45             }
46             #OAI
47             # $xwi->url_rewind; MORE THAN one URL??
48             # my $url = $xwi->url_get;
49 0         0 my $urlid = $xwi->urlid;
50 0         0 my $my_netlocid = $xwi->netlocid;
51 0         0 my $type = $xwi->type;
52 0         0 my $title = $xwi->title;
53             #checkedDate is inserted/updated in DataBase.pm and harvpars.pl
54 0         0 my $modifiedDate = $xwi->modifiedDate;
55 0 0       0 if ( ! $modifiedDate) { $modifiedDate = $xwi->checkedDate; }
  0         0  
56 0         0 my $expiryDate = $xwi->expiryDate;
57             # if ($expiryDate) { $expiryDate = str2time($expiryDate) ; }
58             # else { $expiryDate = 'NULL'; }
59 0         0 my $length = $xwi->length;
60 0         0 my $server = $xwi->server;
61 0         0 my $etag = $xwi->etag;
62 0         0 my $nheadings = $xwi->heading_count;
63 0         0 my $headings='';
64             # headings
65 0         0 $xwi->heading_rewind;
66 0         0 while (1) {
67 0 0       0 my $this = $xwi->heading_get or last;
68 0         0 $headings .= $this . '; ';
69             }
70 0         0 my $nlinks = $xwi->link_count;
71 0         0 my $this = $xwi->text;
72 0         0 my $ip;
73 0 0       0 if ($this) {
74 0         0 $this = $$this;
75 0 0       0 if ($xwi->truncated()) {
76              
77             # IMPORTANT! This document was truncated. Therefore:
78             #
79             # 1) Discard it if no space characters in it, because then it
80             # could be binary.
81             #
82             # 2) If a space is found, then truncate after the last space,
83             # so as to avoid erroneous indexing (since the truncation
84             # most likely cut a word).
85              
86 0         0 my $last_blank = rindex($this,' ');
87 0 0       0 if ($last_blank > 0) {
88 0         0 $ip = substr($this, 0, $last_blank) ;
89             }
90             }
91             else {
92 0         0 $ip = $this ;
93             }
94 0         0 } else { my $t=''; $xwi->text(\$t); } #make sure xwi->text is defined
  0         0  
95             #?? if (length($ip)>250000) {$ip = substr($ip, 0, 250000);}
96              
97 0         0 $sv->prepare("REPLACE INTO hdb VALUES (?, ?, ?, FROM_UNIXTIME( ? ), FROM_UNIXTIME( ? ), ?, ?, ?, ?, ?, ?, COMPRESS(?))")->execute(
98             $recordid, $type, Encode::encode('utf8',$title), $modifiedDate, $expiryDate, $length, $server, $etag, $nheadings, $nlinks, Encode::encode('utf8',$headings), Encode::encode('utf8',$ip));
99              
100 0 0       0 if ( $savehtml == 1 ) {
101 0         0 my $html = $xwi->content;
102 0         0 $sv->prepare("REPLACE INTO html SET html=COMPRESS(?), recordid=?")->execute(Encode::encode('utf8',$$html),$recordid);
103             }
104              
105 0         0 my $res;
106              
107             #save links
108 0         0 my ( $urlstr, $anchor, $ltype);
109 0         0 $xwi->link_rewind;
110 0         0 my $link_count = 1;
111 0         0 my $netlocid;
112 0         0 $res = $sv->do(qq{DELETE FROM links WHERE recordid='$recordid';}); #needed?
113 0         0 while(1) { #links
114 0         0 ($urlstr, $netlocid, $urlid, $anchor, $ltype) = $xwi->link_get;
115 0 0       0 if (defined($urlstr)) {
116             #Convert urlstr to urlid,netlocid if needed
117 0 0 0     0 if ( ($netlocid <= 0) || ($urlid <= 0) ) {
118 0 0       0 if ( $urlstr eq '') { print STDERR "ERR MySQLhdb, save links, no info\n"; } ## sanity check -> log error
  0         0  
119 2     2   1566 use Combine::selurl;
  2         5  
  2         3968  
120 0         0 my $u;
121 0 0       0 if ( $u = new Combine::selurl($urlstr) ) {
122 0         0 $urlstr = $u->normalise();
123 0         0 my $netlocstr = $u->authority;
124 0         0 my $path_query = $u->path_query;
125 0         0 my $lsth = $sv->prepare(qq{SELECT netlocid,urlid FROM urls WHERE urlstr=?;});
126 0         0 $lsth->execute($urlstr);
127 0         0 ($netlocid,$urlid) = $lsth->fetchrow_array;
128 0 0       0 if ( !defined($urlid) ) {
129 0         0 $sv->prepare(qq{INSERT IGNORE INTO netlocs SET netlocstr=?;})->execute($netlocstr);
130             # ($netlocid) = $sv->selectrow_array(qq{SELECT netlocid FROM netlocs WHERE netlocstr='$netlocstr';});
131 0         0 my $nlsth = $sv->prepare(qq{SELECT netlocid FROM netlocs WHERE netlocstr=?;});
132 0         0 $nlsth->execute($netlocstr);
133 0         0 ($netlocid) = $nlsth->fetchrow_array();
134 0         0 $sv->prepare(qq{INSERT IGNORE INTO urls SET urlstr=?, netlocid=?, path=?;})->execute($urlstr,$netlocid,$path_query);
135 0         0 $lsth->execute($urlstr);
136 0         0 ($netlocid,$urlid) = $lsth->fetchrow_array;
137             }
138 0         0 $sv->prepare("INSERT INTO links (recordid,mynetlocid,urlid,netlocid,anchor,linktype) VALUES (?, ?, ?, ?, ?, ?)")->execute($recordid,$my_netlocid,$urlid,$netlocid,Encode::encode('utf8',$anchor),$ltype);
139             }
140             } else {
141 0         0 $sv->prepare("INSERT INTO links (recordid,mynetlocid,urlid,netlocid,anchor,linktype) VALUES (?, ?, ?, ?, ?, ?)")->execute($recordid,$my_netlocid,$urlid,$netlocid,Encode::encode('utf8',$anchor),$ltype);
142             }
143 0         0 } else { last; }
144 0 0       0 last if ($link_count++ >= 500); # limit on number of links
145             }
146              
147             #save metadata
148 0         0 $xwi->meta_rewind;
149 0         0 $res = $sv->do(qq{DELETE FROM meta WHERE recordid='$recordid';}); #needed?
150 0         0 my ($name,$content);
151 0         0 while (1) {
152 0         0 ($name,$content) = $xwi->meta_get;
153 0 0       0 last unless $name;
154 0         0 $sv->prepare("INSERT INTO meta VALUES (?, ?, ?)")->execute($recordid, Encode::encode('utf8',$name), Encode::encode('utf8',$content));
155             }
156              
157             #OLD
158             #save URLs
159             # $xwi->url_rewind;
160             # $res = $sv->do(qq{DELETE FROM urls WHERE recordid='$recordid';});
161             # while (1) {
162             # $this = $xwi->url_get or last;
163             ## $res = $sv->do(qq{INSERT INTO urls VALUES ('$recordid','$this');});
164             # my $machine = $this;
165             # $machine =~ s|http://([^:/]+)[:/]?.*|$1|;
166             # $sv->prepare("INSERT INTO urls VALUES (?, ?, ?)")->execute($recordid, $this, $machine);
167             # }
168              
169             #save robot data in analys table (uses that URL is stored)
170 0         0 $xwi->robot_rewind;
171 0         0 $res = $sv->do(qq{DELETE FROM analys WHERE recordid='$recordid';}); #needed?
172 0         0 while (1) {
173 0         0 ($name,$content) = $xwi->robot_get;
174 0 0       0 last unless $name;
175 0         0 $sv->prepare("INSERT INTO analys VALUES (?, ?, ?)")->execute($recordid, $name, Encode::encode('utf8',$content));
176             }
177             ## my $alinks = calclinks($recordid,$machine); #?
178             #What if link-stats are inserted double after a Get and following write?
179 0         0 my $sth = $sv->prepare(qq{SELECT COUNT(DISTINCT(links.recordid)), COUNT(DISTINCT(mynetlocid)) FROM links,recordurl WHERE recordurl.recordid= ? AND
180             links.urlid = recordurl.urlid AND mynetlocid<>links.netlocid;});
181 0         0 $sth->execute($recordid);
182 0         0 my ($inlinks,$hostinlinks)=$sth->fetchrow_array;
183 0         0 $sv->prepare("INSERT INTO analys VALUES (?, ?, ?)")->execute($recordid, 'inlinks', $inlinks);
184 0         0 $sv->prepare("INSERT INTO analys VALUES (?, ?, ?)")->execute($recordid, 'hostinlinks', $hostinlinks);
185 0         0 $sth = $sv->prepare(qq{SELECT count(distinct(netlocid)) FROM links WHERE recordid=?;});
186 0         0 $sth->execute($recordid);
187 0         0 my ($outlinks)=$sth->fetchrow_array;
188 0         0 $sv->prepare("INSERT INTO analys VALUES (?, ?, ?)")->execute($recordid, 'outlinks', $outlinks);
189              
190             #save topic, ie result of autoclassification
191 0         0 $xwi->topic_rewind;
192 0         0 $res = $sv->do(qq{DELETE FROM topic WHERE recordid='$recordid';}); #needed?
193 0         0 my ($cls,$absscore, $relscore, $terms, $alg);
194 0         0 while (1) {
195 0         0 ($cls,$absscore, $relscore,$terms, $alg) = $xwi->topic_get;
196 0 0       0 last unless $cls;
197 0         0 $sv->prepare("INSERT INTO topic VALUES (?, ?, ?, ?, ?, ?)")->execute($recordid, Encode::encode('utf8',$cls), $absscore, $relscore, Encode::encode('utf8',$terms), $alg);
198             }
199 0 0       0 if (my $zh = Combine::Config::Get('ZebraHost')) {
200 0         0 require Combine::Zebra;
201 0         0 Combine::Zebra::update($zh,$xwi);
202             }
203 0 0       0 if (Combine::Config::Get('MySQLfulltext')) {
204 0         0 $sv->prepare("REPLACE INTO search VALUES (?, ?)")->execute($recordid, Encode::encode('utf8',$title .' '. $ip));
205             }
206 0 0       0 if (my $sh = Combine::Config::Get('SolrHost')) {
207 0         0 require Combine::Solr;
208 0         0 Combine::Solr::update($sh,$xwi);
209             }
210             }
211              
212             sub Delete { #Used??
213 0     0 0 0 my ($xwi) = @_;
214 0 0       0 return undef unless $xwi;
215              
216 0         0 my $recordid = $xwi->recordid;
217             #print "MySQLhdb::DeleteMD5 $recordid\n";
218 0         0 DeleteKey($recordid, $xwi->md5);
219             }
220              
221             sub DeleteKey {
222 1     1 0 18 my ($key, $md5) = @_;
223 1 50       19 if (!defined($sv)) { Open(); } #Init $sv CHANGE?
  1         15  
224             #OAI
225 0 0         if ($doOAI) {
226             # $sv->prepare("REPLACE INTO oai SET status='deleted', recordid=?, md5=?")->execute($key,$md5);
227             ##FEL recurdurl updaterad i Database.pm FIX!
228 0           $sv->prepare("REPLACE INTO oai SELECT recordid,md5,NOW(),'deleted' FROM recordurl WHERE recordid=?")->execute($key);
229             }
230             #OAI
231              
232             #Zebra
233 0 0         if (my $zh = Combine::Config::Get('ZebraHost')) {
234 0           require Combine::Zebra;
235             #Not needed: if ($md5 eq '') { ($md5)=$sv->selectrow_array('SELECT md5 FROM recordurl WHERE recordid=$key'); }
236 0           Combine::Zebra::delete($zh, $md5, $key);
237             }
238 0 0         if (my $sh = Combine::Config::Get('SolrHost')) {
239 0           require Combine::Solr;
240 0           Combine::Solr::delete($sh, $md5, $key);
241             }
242              
243             #print "MySQLhdb::DeleteKey $key\n";
244 0           my $res = $sv->do(qq{DELETE FROM hdb WHERE recordid=$key;});
245 0           $res = $sv->do(qq{DELETE FROM html WHERE recordid=$key;});
246 0           $res = $sv->do(qq{DELETE FROM search WHERE recordid=$key;});
247 0           $res = $sv->do(qq{DELETE FROM meta WHERE recordid=$key;});
248 0           $res = $sv->do(qq{DELETE FROM analys WHERE recordid=$key});
249 0           $res = $sv->do(qq{DELETE FROM links WHERE recordid=$key;});
250 0           $res = $sv->do(qq{DELETE FROM topic WHERE recordid=$key;});
251 0           $res = $sv->do(qq{DELETE FROM recordurl WHERE recordid=$key;});
252             }
253              
254             sub Get {
255 0     0 0   my ($key) = @_;
256             #should return an initalized xwi-object
257 0 0         if (!defined($sv)) { Open(); } #Init $sv CHANGE?
  0            
258              
259 0           my ($type, $title, $modifiedDate, $expiryDate, $length, $server, $etag, $nheadings, $nlinks, $headings, $ip) =
260             $sv->selectrow_array(qq{SELECT type,title,
261             UNIX_TIMESTAMP(mdate),IF(expiredate,UNIX_TIMESTAMP(expiredate),0),
262             length,server,etag,nheadings,nlinks,headings,UNCOMPRESS(ip)
263             FROM hdb WHERE recordid='$key';});
264              
265 0           my $xwi = new Combine::XWI ;
266 0           $xwi->recordid($key);
267             #url Relies on that all urls are in table urls
268 0           $xwi->type($type);
269 0           $xwi->title(Encode::decode('utf8',$title));
270 0           $xwi->modifiedDate($modifiedDate);
271 0 0         if ($expiryDate>0) {$xwi->expiryDate($expiryDate)};
  0            
272 0           $xwi->length($length);
273 0           $xwi->server($server);
274 0           $xwi->etag($etag);
275 0           $xwi->nheadings($nheadings);
276 0           $xwi->nlinks($nlinks);
277 0           $headings =~ s/; $//;
278 0           $xwi->heading_add(Encode::decode('utf8',$headings)) ;
279 0           my $ip1=Encode::decode('utf8',$ip);
280 0           $xwi->text(\$ip1);
281 0           my ($html1) = $sv->selectrow_array(qq{SELECT UNCOMPRESS(html) FROM html WHERE recordid='$key';});
282 0           my $html = Encode::decode('utf8',$html1);
283 0           $xwi->content(\$html);
284              
285 0           my ($urlpath) = $sv->selectrow_array(qq{SELECT path FROM urls,recordurl WHERE recordid='$key' AND recordurl.urlid=urls.urlid;});
286 0           $xwi->urlpath($urlpath);
287              
288 0           my ($url,$anchor,$lty,$name,$value,$heading);
289             #links
290 0           my $sth = $sv->prepare(qq{SELECT urlid,netlocid,anchor,linktype from links WHERE recordid='$key';});
291 0           $sth->execute;
292 0           my ($urlid,$netlocid,$checkedDate,$md5,$fingerprint,$cls,$absscore,$relscore,$terms,$alg);
293 0           while (($urlid,$netlocid,$anchor,$lty)=$sth->fetchrow_array) {
294 0           $xwi->link_add('', $netlocid, $urlid, Encode::decode('utf8',$anchor), $lty) ; #no URLstr add?
295             }
296              
297             #meta
298 0           $sth = $sv->prepare(qq{SELECT name,value from meta WHERE recordid='$key';});
299 0           $sth->execute;
300 0           while (($name,$value)=$sth->fetchrow_array) {
301 0           $xwi->meta_add(Encode::decode('utf8',$name),Encode::decode('utf8',$value)) ;
302             }
303              
304             # analys -> robot
305 0           $sth = $sv->prepare(qq{SELECT name,value FROM analys WHERE recordid='$key';});
306 0           $sth->execute;
307 0           while (($name,$value)=$sth->fetchrow_array) {
308 0           $xwi->robot_add($name,Encode::decode('utf8',$value)) ;
309             }
310              
311             # topic
312 0           $sth = $sv->prepare(qq{SELECT notation,abscore,relscore,terms,algorithm FROM topic WHERE recordid='$key';});
313 0           $sth->execute;
314 0           while (($cls,$absscore,$relscore,$terms,$alg)=$sth->fetchrow_array) {
315 0           $xwi->topic_add(Encode::decode('utf8',$cls),$absscore,$relscore,Encode::decode('utf8',$terms),$alg) ;
316             }
317              
318             #recordurl
319 0           $sth = $sv->prepare(qq{SELECT urlid,UNIX_TIMESTAMP(lastchecked),md5,fingerprint FROM recordurl WHERE recordid='$key';});
320 0           $sth->execute;
321 0           while (($urlid,$checkedDate,$md5,$fingerprint)=$sth->fetchrow_array) {
322 0           $xwi->urlid($urlid);
323 0           $xwi->checkedDate($checkedDate);
324 0           $xwi->md5($md5);
325 0           $xwi->fingerprint($fingerprint);
326             }
327              
328 0           return $xwi;
329             }
330              
331             1;