File Coverage

lib/Metadata/ByInode/Indexer.pm
Criterion Covered Total %
statement 105 127 82.6
branch 29 80 36.2
condition 10 25 40.0
subroutine 16 17 94.1
pod 2 5 40.0
total 162 254 63.7


line stmt bran cond sub pod time code
1             package Metadata::ByInode::Indexer;
2 2     2   11 use warnings;
  2         4  
  2         51  
3 2     2   10 use strict;
  2         3  
  2         52  
4 2     2   9 use Carp;
  2         3  
  2         300  
5 2     2   11 use Cwd;
  2         11  
  2         120  
6 2     2   16941 use File::Find::Rule;
  2         30104  
  2         20  
7             our $VERSION = sprintf "%d.%02d", q$Revision: 1.8 $ =~ /(\d+)/g;
8             #use Smart::Comments '###';
9              
10             my $DEBUG =1;
11 67     67 0 3991 sub DEBUG : lvalue { $DEBUG }
12              
13             my $TEST =0;
14 0     0 0 0 sub TEST : lvalue { $TEST }
15              
16              
17             =pod
18              
19             =head1 NAME
20              
21             Metadata::ByInode::Indexer - customizable file and directory indexer
22              
23             =head1 DESCRIPTION
24              
25             part of Metadata::ByInode
26             not meant to be used alone!
27              
28             =head1 index()
29              
30             First argument is an absolute file path.
31              
32             If this is a dir, will recurse - NON inclusive
33             that means the dir *itself* will NOT be indexed
34              
35             if it is a file, will do just that one.
36              
37             returns indexed files count
38              
39             by default the indexer does not index hidden files
40             to index hidden files,
41              
42             $m = new Metadata::ByInode::Indexer({
43             abs_dbfile => '/tmp/mbi_test.db',
44             index_hidden_files => 1
45             });
46            
47             $m->index('/path/to/what'); # dir or file
48            
49             =cut
50              
51              
52             sub _teststop {
53 31     31   45 my $self = shift;
54 31         37 my $arg = shift;
55 31 50 33     84 if (defined $arg and $arg=~/^\d+$/){
56 0         0 $self->{_teststop} = $arg;
57 0 0       0 print STDERR " teststop changed to $arg\n" if DEBUG;
58             }
59 31   100     86 $self->{_teststop}||= 1000;
60 31         311 return $self->{_teststop};
61             }
62              
63              
64              
65             sub index {
66 3     3 0 9 my $self = shift;
67 3 50       8 my $arg = shift; $arg or croak('missing argument to index');
  3         18  
68 3         284 my $abs_path = Cwd::abs_path($arg);
69              
70             # index hidden? follow symlinks?
71 3         6 my $files_indexed = 0;
72             # make sure if this is a dir, we use mindepth so we do NOT index itself
73 3         178 my $ondisk = time;
74              
75              
76 3         40 $self->_delete_treeslice($abs_path);
77              
78              
79 3 50       90 unless ($self->{index_hidden_files}){
80 3 50       10 print STDERR " setting rule for no hidden files.. " if DEBUG;
81 3         37 $self->finder->not_name( qr/^\./ ); # no hidden files , but will this index a reg file ina hidden dir?
82 3 50       355 print STDERR "done.\n" if DEBUG;
83             }
84            
85 3         15 my @files = $self->finder->in($abs_path);
86              
87 3 50       3793 if (DEBUG){
88 3         281 printf STDERR " we count %s files\n", scalar @files;
89 3         28 printf STDERR " we will stop at %s (DEBUG is on)\n", $self->_teststop;
90             }
91              
92              
93              
94            
95 3         10 my $runonce_=0;
96 3         9 for ( @files ){ #### Working===[%] done
97            
98             #take out first if it's self and a dir, we do not index ourselves in this case!
99 31 100       65 unless($runonce_){
100 3         5 $runonce_=1;
101 3 50 33     93 if ($abs_path eq $_ and -d $_){
102 3 50       8 print STDERR " index() took out self.. $_\n" if DEBUG;
103 3         8 next;
104             }
105             }
106            
107 28 50 33     54 if ( (DEBUG or TEST) and $self->_teststop == $files_indexed ){
      33        
108 0         0 printf STDERR " reached teststop of %s files\n", $self->_teststop;
109 0         0 last;
110             }
111              
112             # make sure we do not index the original argument
113            
114 28         118 my $abs_path = $_;
115 28 50       172 $abs_path=~/^(.+)\/([^\/]+$)/ or die(__PACKAGE__.'115');
116 28         91 my ($abs_loc,$filename)=($1,$2);
117              
118             # unless( $self->{index_hidden_files} ){
119             # if ($abs_loc=~/\/\./ or $filename=~/^\./){ next; } # /. anywhere
120             # }
121            
122              
123 28         710 my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size,
124             $atime,$mtime,$ctime,$blksize,$blocks)
125             = lstat($abs_path);
126            
127 28 50 33     206 if ( -l _ or -p _ or -b _){
      33        
128 0         0 next;
129             }
130            
131 28         109 $self->_reset;
132            
133 28         66 $self->_set('abs_loc',$abs_loc);
134 28         59 $self->_set('filename',$filename);
135 28         65 $self->_set('ondisk',$ondisk);
136              
137 28 50       57 if ($self->_save_stat_data){
138            
139 0 0       0 $self->_set('size',$size) if $size;
140 0 0       0 $self->_set('ctime',$ctime) if $ctime;
141 0 0       0 $self->_set('mtime',$mtime) if $mtime;
142              
143 0 0       0 if ( -f _ ){
    0          
144 0         0 $self->_set( is_file => 1);
145             }
146             elsif( -d _ ){
147 0         0 $self->_set( is_dir => 1 );
148             }
149              
150 0 0       0 if ( -T _ ){
    0          
151 0         0 $self->_set( is_text => 1 );
152             }
153             elsif( -B _ ){
154 0         0 $self->_set( is_binary => 1 );
155             }
156             }
157            
158 28         69 $self->index_extra;
159              
160 28         57 $self->set($ino,$self->_record); # set first arg can be inode or abs path, this should quicken with passing it inode
161 28         71 $files_indexed++;
162              
163             }
164            
165 3         13 my $seconds_elapsed = int(time - $ondisk);
166             ### $seconds_elapsed
167             ### $files_indexed
168              
169 3         13 $self->dbh->commit;
170            
171 3         75 return $files_indexed;
172             }
173              
174             sub _save_stat_data {
175 28     28   34 my $self = shift;
176 28   50     114 $self->{save_stat_data} ||= 0;
177 28         70 return $self->{save_stat_data};
178             }
179              
180             =for old
181             # through system find
182             # causes problems on some systems!!!!
183             sub find_abs_paths_systemfind {
184             my $abs_path= shift;
185             $abs_path or die();
186             my $mindepth = (-d $abs_path) ? '-mindepth 1' : '';
187             my @abs_paths = split(/\n/,`find "$abs_path" $mindepth`);
188             return \@abs_paths;
189             }
190             =cut
191              
192             sub finder {
193 6     6 1 12 my $self = shift;
194 6 50       23 unless( defined $self->{file_file_rule} ){
195 6         62 $self->{file_find_rule} = new File::Find::Rule();
196 6 50       150 defined $self->{file_find_rule} or die("cant get File::Find::Rule object");
197             }
198 6         84 return $self->{file_find_rule};
199             }
200              
201              
202              
203              
204              
205             sub _reset {
206 28     28   38 my $self = shift;
207 28         47 $self->{_current_record} = undef;
208 28         86 return 1;
209             }
210              
211             sub _set {
212 84     84   112 my $self = shift;
213 2     2   2397 no warnings;
  2         5  
  2         1935  
214 84 50 33     133 my ($key,$val)=(shift,shift); (defined $key and defined $val)
  84         327  
215             or confess("_set() missing [key:$key] or [val:$val]");
216 84         211 $self->{_current_record}->{$key} = $val;
217 84         127 return 1;
218             }
219              
220             sub _record {
221 28     28   30 my $self = shift;
222 28 50       64 defined $self->{_current_record} or die($!);
223 28         98 return $self->{_current_record};
224             }
225              
226              
227              
228             sub index_extra {
229 28     28 1 33 my $self = shift;
230 28         33 return 1;
231             }
232              
233             =pod
234              
235              
236              
237             =head1 USING THE INDEXER
238              
239             by deafault we just record abs_loc, filename, ontime(timestamp we recorded it on)
240             you can use the method rule() which returns a L object, to do neat things..
241              
242             my $i = new Metadata::ByInode({ abs_dbfile => '/tmp/dbfile.db' });
243              
244             $i->finder->name( qr/\.mp3$|\.avi$/ );
245              
246             $i->index('/home/myself');
247              
248             This would only index mp3 and avi files in your home dir.
249              
250             =head2 finder()
251              
252             returns File::Find::Rule object,
253             you can feed it rules before calling index()
254              
255              
256             =head1 CREATING YOUR OWN INDEXER
257              
258             =head2 index_extra()
259              
260             If you want to invent your own indexer, then this is the method to override.
261             For every file found, this method is run, it just inserts data into the record
262             for that file.
263             By default, all files will have 'filename', 'abs_loc', and 'ondisk', which is a
264             timestamp of when the file was seen (now).
265              
266             for example, if you want the indexer to record mime types, you should override
267             the index_extra method as..
268              
269             package Indexer::WithMime;
270             use File::MMagic;
271             use base 'Metadata::ByInode::Indexer';
272              
273            
274             sub index_extra {
275            
276             my $self = shift;
277            
278             # get hash with current record data
279             my $record = $self->_record;
280              
281             # by default, record holds 'abs_loc', 'filename', and 'ondisk'
282            
283             # ext will be the distiction between dirs here
284             if ($record->{filename}=~/\.\w{1,4}$/ ){
285            
286             my $m = new File::MMagic;
287             my $mime = $m->checktype_filename(
288             $record->{abs_loc} .'/'. $record->{filename}
289             );
290            
291             if ($mime){
292             # and now we append to the record another key and value pair
293             $self->_set('mime_type',$mime);
294             }
295             }
296            
297             return 1;
298             }
299              
300             Then in your script
301              
302             use Indexer::WithMime;
303              
304             my $i = new Indexer::WithMime({ abs_dbfile => '/home/myself/dbfistartedle.db' });
305              
306             $i->index('/home/myself');
307              
308             # now you can search files by mime type residing somewhere in that dir
309              
310             $i->search({ mime_type => 'mp3' });
311              
312             #or
313             $i->search({
314             mime_type => 'mp3',
315             filename => 'u2',
316             });
317              
318             =head2 _teststop()
319              
320             returns how many files to index before stop
321             only happens if DEBUG is on.
322             default val is 1000, to change it, provide new argument before indexing.
323              
324             $self->_teststop(10000); # now set to 10k
325              
326             You may also pass this ammount to the constructor
327              
328             my $i = new Metadata::ByInode( { _teststop => 500, abs_dbfile => '/tmp/index.db' });
329              
330             =head2 _find_abs_paths()
331              
332             argument is abs path to what base dir to scan to index, returns abs paths to all within
333             no hidden files are returned
334              
335             Returns array ref with abs paths:
336              
337             $self->_find_abs_paths('/var/wwww');
338              
339             =head2 _save_stat_data()
340              
341             By default we do not save stat data, if you want to, then pass as argument to constructor:
342              
343             my $i = new Metadata::ByInode({ save_stat_data => 1 });
344              
345             This will create for each entry indexed;
346              
347             ctime mtime is_dir is_file is_text is_binary size
348              
349             If you are indexing 1k files, this makes little difference. But if you are indexing 1million,
350             It makes a lot of difference in time.
351              
352             =head1 CHANGES
353              
354             The previous version used the system find to get a list of what to index, now
355             we use File::Find::Rule
356              
357             =head1 SEE ALSO
358              
359             L and L
360              
361             =cut
362              
363              
364              
365              
366              
367              
368             # delete a slice of the indexed tree
369             sub _delete_treeslice {
370 3     3   9 my $self = shift;
371 3 50       6 my $arg = shift; $arg or croak('missing abs path arg to _delete_treeslice');
  3         13  
372 3         4 my $ondisk = shift; #optional
373              
374 3 50       19 print STDERR "_delete_treeslice started\n" if DEBUG;
375            
376 3         230 my $abs_path = Cwd::abs_path($arg);
377             ## recursive delete
378             ## $abs_path
379             ## $ondisk
380              
381             #delete by location AND by time
382 3 50       15 if ($ondisk) { # if this was a dir
383 0 0       0 print STDERR " ondisk $ondisk, " if DEBUG;
384             # YEAH! IT WORKS !! :)
385             ### was dir, will get rid of sub not updt
386 0 0       0 unless (defined $self->{_open_handle}->{recursive_delete_o}){
387 0 0       0 $self->{_open_handle}->{recursive_delete_o} = $self->dbh->prepare(
388             q{DELETE FROM metadata WHERE inode IN }
389             .q{(SELECT inode FROM metadata WHERE mkey='abs_loc' AND mvalue LIKE ? AND inode IN }
390             .q{(SELECT inode FROM metadata WHERE mkey='ondisk' AND mvalue < ?));"})
391             or croak( "_delete_treeslice() ".$self->dbh->errstr );
392             }
393            
394 0         0 $self->{_open_handle}->{recursive_delete_o}->execute("$abs_path%",$ondisk);
395 0         0 my $rows_deleted_o = $self->{_open_handle}->{recursive_delete_o}->rows;
396             ### $rows_deleted_o
397 0         0 $self->dbh->commit;
398 0 0       0 print STDERR "done\n" if DEBUG;
399            
400            
401             }
402              
403             # delete not by time
404             else {
405 3 50       13 print STDERR " regular, " if DEBUG;
406            
407            
408             =for did not work with mysql, only with sqlite
409             unless (defined $self->{_open_handle}->{recursive_delete}){
410             $self->{_open_handle}->{recursive_delete} = $self->dbh->prepare(
411             q{DELETE FROM metadata WHERE inode IN ( SELECT inode FROM (select * from metadata) as x WHERE mkey='abs_loc' AND mvalue LIKE ? )}
412             ) or croak( "_delete_treeslice() ". $self->dbh->errstr );# normal sub select bug in mysql, made up for here by selct * from .... as x
413             }
414            
415             $self->{_open_handle}->{recursive_delete}->execute("$abs_path%");
416             my $rows_deleted = $self->{_open_handle}->{recursive_delete}->rows;
417             ### $rows_deleted
418             $self->dbh->commit;
419            
420             =cut
421              
422              
423 3 50       10 print STDERR " preparing.. " if DEBUG;
424              
425             # my which??
426 3 50       13 print STDERR " preparing select 1.. " if DEBUG;
427 3         26 my $inodes = $self->dbh->selectcol_arrayref("SELECT inode FROM metadata WHERE mkey='abs_loc' and mvalue LIKE '$abs_path%'");
428 3 50       1138 print STDERR "done.\n" if DEBUG;
429            
430             #print STDERR "executing select 1.. " if DEBUG;
431             #$inodes->execute("$abs_path\%");
432             #print STDERR "done.\n" if DEBUG;
433            
434 3         18 my $del = $self->dbh->prepare('DELETE FROM metadata WHERE inode=?');
435            
436 3 50       223 print STDERR "executing.. " if DEBUG;
437 3         13 for (@$inodes){
438 19         3182 $del->execute($_);
439             }
440 3 50       10957 print STDERR "done.\n" if DEBUG;
441              
442 3         24 $self->dbh->commit;
443            
444            
445              
446            
447             =for newway
448              
449             DOING A SUBSELECT LIKE THIS TAKES FOREEEEEVVVVVEEERRRRRRRRRRRR
450            
451             my $delete = $self->dbh->prepare(
452             q{DELETE FROM metadata WHERE inode IN(
453             SELECT inode FROM (select * from metadata) as temptable WHERE temptable.mkey='abs_loc' AND temptable.mvalue LIKE ?)}
454             ) or croak( "_delete_treeslice() ". $self->dbh->errstr );
455            
456             print STDERR "done.\n" if DEBUG;
457            
458             print STDERR "executing.. " if DEBUG;
459             $delete->execute("$abs_path\%");
460             print STDERR "done.\n" if DEBUG;
461            
462             my $rows_deleted = $delete->rows;
463             ## $rows_deleted
464             $self->dbh->commit;
465             =cut
466            
467              
468              
469 3 50       27 print STDERR "_delete_treeslice regular done\n" if DEBUG;
470            
471             }
472              
473              
474 3 50       16 print STDERR "_delete_treeslice done\n" if DEBUG;
475              
476 3         15 return 1;
477             }
478              
479             =pod
480              
481             =head1 AUTHOR
482              
483             Leo Charre leocharre at cpan dot org
484              
485             =cut
486              
487             1;