|  line  | 
 stmt  | 
 bran  | 
 cond  | 
 sub  | 
 pod  | 
 time  | 
 code  | 
| 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # $Id: Indexer.pm,v 1.2 2003/02/02 21:20:46 matt Exp $  | 
| 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 package AxKit::XSP::Wiki::Indexer;  | 
| 
4
 | 
1
 | 
 
 | 
 
 | 
  
1
  
 | 
 
 | 
5
 | 
 use strict;  | 
| 
 
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
2
 | 
    | 
| 
 
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
28
 | 
    | 
| 
5
 | 
1
 | 
 
 | 
 
 | 
  
1
  
 | 
 
 | 
2400
 | 
 use XML::SAX::Base;  | 
| 
 
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
28813
 | 
    | 
| 
 
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
36
 | 
    | 
| 
6
 | 
1
 | 
 
 | 
 
 | 
  
1
  
 | 
 
 | 
13
 | 
 use vars qw($VERSION @ISA);  | 
| 
 
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
2
 | 
    | 
| 
 
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
697
 | 
    | 
| 
7
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 $VERSION = '1.00';  | 
| 
8
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 @ISA = qw(XML::SAX::Base);  | 
| 
9
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
10
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub new {  | 
| 
11
 | 
0
 | 
 
 | 
 
 | 
  
0
  
 | 
  
0
  
 | 
 
 | 
     my $class = shift;  | 
| 
12
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     my (%opts) = @_;  | 
| 
13
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       | 
| 
14
 | 
0
 | 
 
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
     my $db = $opts{DB} || die "DB argument required";  | 
| 
15
 | 
0
 | 
 
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
     my $page_id = $opts{PageId} || die "PageId argument required";  | 
| 
16
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       | 
| 
17
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     my $self = bless { DB => $db, PageId => $page_id }, $class;  | 
| 
18
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       | 
| 
19
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     $self->{InsertCTI} = $db->prepare("INSERT INTO ContentIndex (page_id, word_id, value) VALUES (?, ?, ?)");  | 
| 
20
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     $self->{InsertWord} = $db->prepare("INSERT INTO Word (word) VALUES (?)");  | 
| 
21
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     $self->{InsertWord}->{PrintError} = 0;  | 
| 
22
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     $self->{FindWord} = $db->prepare("SELECT id FROM Word WHERE word = ?");  | 
| 
23
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     $self->{DeleteCTI} = $db->prepare("DELETE FROM ContentIndex WHERE page_id = ?");  | 
| 
24
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       | 
| 
25
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     $self->{Words} = {};  | 
| 
26
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     $self->{DocSize} = 0;  | 
| 
27
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       | 
| 
28
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     return $self;  | 
| 
29
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
30
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
31
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub end_document {  | 
| 
32
 | 
0
 | 
 
 | 
 
 | 
  
0
  
 | 
  
1
  
 | 
 
 | 
     my ($self) = @_;  | 
| 
33
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       | 
| 
34
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # Delete current index for this page  | 
| 
35
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     $self->{DeleteCTI}->execute($self->{PageId});  | 
| 
36
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       | 
| 
37
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     for my $word (keys %{$self->{Words}}) {  | 
| 
 
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
38
 | 
0
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         next unless $word;  | 
| 
39
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         my $word_id = $self->insert_word($word);  | 
| 
40
 | 
0
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         next unless $word_id;  | 
| 
41
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 warn("Indexing: $self->{PageId}, $word_id, $word\n");  | 
| 
42
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         $self->{InsertCTI}->execute(  | 
| 
43
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             $self->{PageId},  | 
| 
44
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             $word_id,  | 
| 
45
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             $self->{Words}{$word},  | 
| 
46
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         );  | 
| 
47
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
48
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     $self->{DB}->commit;  | 
| 
49
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
50
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
51
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub insert_word {  | 
| 
52
 | 
0
 | 
 
 | 
 
 | 
  
0
  
 | 
  
0
  
 | 
 
 | 
     my ($self, $word) = @_;  | 
| 
53
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       | 
| 
54
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     my $word_id;  | 
| 
55
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     eval {  | 
| 
56
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         $self->{InsertWord}->execute($word);  | 
| 
57
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         $word_id = $self->{DB}->func('last_insert_rowid');  | 
| 
58
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     };  | 
| 
59
 | 
0
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     if ($@) {  | 
| 
60
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         $self->{FindWord}->execute($word);  | 
| 
61
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         my $row = $self->{FindWord}->fetch;  | 
| 
62
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         $word_id = $row->[0];  | 
| 
63
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
64
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       | 
| 
65
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     return $word_id;  | 
| 
66
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
67
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
68
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # NB: This implementation assumes SAX parsers that don't break mid-word.  | 
| 
69
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # (Could use filter if this is a problem)  | 
| 
70
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub characters {  | 
| 
71
 | 
0
 | 
 
 | 
 
 | 
  
0
  
 | 
  
1
  
 | 
 
 | 
     my ($self, $node) = @_;  | 
| 
72
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
       | 
| 
73
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     while ($node->{Data} =~ /\G(\S*)\s*/gc) {  | 
| 
74
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         my $word = $1;  | 
| 
75
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         $word =~ s/\W*$//; # strip trailing non-word chars  | 
| 
76
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         $word =~ s/^\W*//; # strip leading non-word chars  | 
| 
77
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         $self->{Words}{lc($word)}++;  | 
| 
78
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
79
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
80
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
81
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 1;  |