File Coverage

Bio/DB/Flat/BinarySearch.pm

Criterion	Covered	Total	%
statement	244	555	43.9
branch	71	230	30.8
condition	17	38	44.7
subroutine	34	57	59.6
pod	34	42	80.9
total	400	922	43.3

line	stmt	bran	cond	sub	pod	time	code
1							#
2							# BioPerl module for Bio::DB::Flat::BinarySearch
3							#
4							# Please direct questions and support issues to
5							#
6							# Cared for by Michele Clamp >
7							#
8							# You may distribute this module under the same terms as perl itself
9
10							# POD documentation - main docs before the code
11
12							=head1 NAME
13
14							Bio::DB::Flat::BinarySearch - BinarySearch search indexing system for sequence files
15
16							=head1 SYNOPSIS
17
18							TODO: SYNOPSIS NEEDED!
19
20							=head1 DESCRIPTION
21
22							This module can be used both to index sequence files and also to
23							retrieve sequences from existing sequence files.
24
25							This object allows indexing of sequence files both by a primary key
26							(say accession) and multiple secondary keys (say ids). This is
27							different from the Bio::Index::Abstract (see L)
28							which uses DBM files as storage. This module uses a binary search to
29							retrieve sequences which is more efficient for large datasets.
30
31							=head2 Index creation
32
33							my $sequencefile; # Some fasta sequence file
34
35							Patterns have to be entered to define where the keys are to be indexed
36							and also where the start of each record. E.g. for fasta
37
38							my $start_pattern = '^>';
39							my $primary_pattern = '^>(\S+)';
40
41							So the start of a record is a line starting with a E and the
42							primary key is all characters up to the first space after the E
43
44							A string also has to be entered to defined what the primary key
45							(primary_namespace) is called.
46
47							The index can now be created using
48
49							my $index = Bio::DB::Flat::BinarySearch->new(
50							-directory => "/home/max/",
51							-dbname => "mydb",
52							-start_pattern => $start_pattern,
53							-primary_pattern => $primary_pattern,
54							-primary_namespace => "ID",
55							-format => "fasta" );
56
57							my @files = ("file1","file2","file3");
58
59							$index->build_index(@files);
60
61							The index is now ready to use. For large sequence files the perl way
62							of indexing takes a long time and a huge amount of memory. For
63							indexing things like dbEST I recommend using the DB_File indexer, BDB.
64
65							The formats currently supported by this module are fasta, Swissprot,
66							and EMBL.
67
68							=head2 Creating indices with secondary keys
69
70							Sometimes just indexing files with one id per entry is not enough. For
71							instance you may want to retrieve sequences from swissprot using
72							their accessions as well as their ids.
73
74							To be able to do this when creating your index you need to pass in
75							a hash of secondary_patterns which have their namespaces as the keys
76							to the hash.
77
78							e.g. For Indexing something like
79
80							ID 1433_CAEEL STANDARD; PRT; 248 AA.
81							AC P41932;
82							DT 01-NOV-1995 (Rel. 32, Created)
83							DT 01-NOV-1995 (Rel. 32, Last sequence update)
84							DT 15-DEC-1998 (Rel. 37, Last annotation update)
85							DE 14-3-3-LIKE PROTEIN 1.
86							GN FTT-1 OR M117.2.
87							OS Caenorhabditis elegans.
88							OC Eukaryota; Metazoa; Nematoda; Chromadorea; Rhabditida; Rhabditoidea;
89							OC Rhabditidae; Peloderinae; Caenorhabditis.
90							OX NCBI_TaxID=6239;
91							RN [1]
92
93							where we want to index the accession (P41932) as the primary key and the
94							id (1433_CAEEL) as the secondary id. The index is created as follows
95
96							my %secondary_patterns;
97
98							my $start_pattern = '^ID (\S+)';
99							my $primary_pattern = '^AC (\S+)\;';
100
101							$secondary_patterns{"ID"} = '^ID (\S+)';
102
103							my $index = Bio::DB::Flat::BinarySearch->new(
104							-directory => $index_directory,
105							-dbname => "ppp",
106							-write_flag => 1,
107							-verbose => 1,
108							-start_pattern => $start_pattern,
109							-primary_pattern => $primary_pattern,
110							-primary_namespace => 'AC',
111							-secondary_patterns => \%secondary_patterns);
112
113							$index->build_index($seqfile);
114
115							Of course having secondary indices makes indexing slower and use more
116							memory.
117
118							=head2 Index reading
119
120							To fetch sequences using an existing index first of all create your sequence
121							object
122
123							my $index = Bio::DB::Flat::BinarySearch->new(
124							-directory => $index_directory);
125
126							Now you can happily fetch sequences either by the primary key or
127							by the secondary keys.
128
129							my $entry = $index->get_entry_by_id('HBA_HUMAN');
130
131							This returns just a string containing the whole entry. This is
132							useful is you just want to print the sequence to screen or write it to a file.
133
134							Other ways of getting sequences are
135
136							my $fh = $index->get_stream_by_id('HBA_HUMAN');
137
138							This can then be passed to a seqio object for output or converting
139							into objects.
140
141							my $seq = Bio::SeqIO->new(-fh => $fh,
142							-format => 'fasta');
143
144							The last way is to retrieve a sequence directly. This is the
145							slowest way of extracting as the sequence objects need to be made.
146
147							my $seq = $index->get_Seq_by_id('HBA_HUMAN');
148
149							To access the secondary indices the secondary namespace needs to be known
150
151							$index->secondary_namespaces("ID");
152
153							Then the following call can be used
154
155							my $seq = $index->get_Seq_by_secondary('ID','1433_CAEEL');
156
157							These calls are not yet implemented
158
159							my $fh = $index->get_stream_by_secondary('ID','1433_CAEEL');
160							my $entry = $index->get_entry_by_secondary('ID','1433_CAEEL');
161
162							=head1 FEEDBACK
163
164							=head2 Mailing Lists
165
166							User feedback is an integral part of the evolution of this and other
167							Bioperl modules. Send your comments and suggestions preferably to one
168							of the Bioperl mailing lists. Your participation is much appreciated.
169
170							bioperl-l@bioperl.org - General discussion
171							http://bioperl.org/wiki/Mailing_lists - About the mailing lists
172
173							=head2 Support
174
175							Please direct usage questions or support issues to the mailing list:
176
177							I
178
179							rather than to the module maintainer directly. Many experienced and
180							reponsive experts will be able look at the problem and quickly
181							address it. Please include a thorough description of the problem
182							with code and data examples if at all possible.
183
184							=head2 Reporting Bugs
185
186							Report bugs to the Bioperl bug tracking system to help us keep track
187							the bugs and their resolution. Bug reports can be submitted via the
188							web:
189
190							https://github.com/bioperl/bioperl-live/issues
191
192							=head1 AUTHOR - Michele Clamp
193
194							Email - michele@sanger.ac.uk
195
196							=head1 CONTRIBUTORS
197
198							Jason Stajich, jason@bioperl.org
199
200							=head1 APPENDIX
201
202							The rest of the documentation details each of the object methods. Internal
203							methods are usually preceded with an "_" (underscore).
204
205							=cut
206
207							package Bio::DB::Flat::BinarySearch;
208
209	1			1		3	use strict;
	1					1
	1					26
210
211	1			1		3	use Fcntl qw(SEEK_END SEEK_CUR);
	1					1
	1					75
212
213							# rather than using tell which might be buffered
214	0			0	0	0	sub systell { sysseek( $_[0], 0, SEEK_CUR ) }
215	0			0	0	0	sub syseof { sysseek( $_[0], 0, SEEK_END ) }
216
217	1			1		5	use File::Spec;
	1					2
	1					31
218	1			1		5	use Bio::Root::RootI;
	1					2
	1					21
219	1			1		323	use Bio::SeqIO;
	1					1
	1					26
220	1			1		426	use Bio::Seq;
	1					2
	1					26
221
222	1			1		6	use base qw(Bio::DB::RandomAccessI);
	1					1
	1					62
223
224	1			1		4	use constant CONFIG_FILE_NAME => 'config.dat';
	1					1
	1					46
225	1			1		3	use constant HEADER_SIZE => 4;
	1					1
	1					40
226	1			1		4	use constant DEFAULT_FORMAT => 'fasta';
	1					1
	1					4318
227
228							my @formats = [ 'FASTA', 'SWISSPROT', 'EMBL' ];
229
230							=head2 new
231
232							Title : new
233							Usage : For reading
234							my $index = Bio::DB::Flat::BinarySearch->new(
235							-directory => '/Users/michele/indices/dbest',
236							-dbname => 'mydb',
237							-format => 'fasta');
238
239							For writing
240
241							my %secondary_patterns = {"ACC" => "^>\\S+ +(\\S+)"}
242							my $index = Bio::DB::Flat::BinarySearch->new(
243							-directory => '/Users/michele/indices',
244							-dbname => 'mydb',
245							-primary_pattern => "^>(\\S+)",
246							-secondary_patterns => \%secondary_patterns,
247							-primary_namespace => "ID");
248
249							my @files = ('file1','file2','file3');
250
251							$index->build_index(@files);
252
253
254							Function: create a new Bio::DB::Flat::BinarySearch object
255							Returns : new Bio::DB::Flat::BinarySearch
256							Args : -directory Root directory for index files
257							-dbname Name of subdirectory containing indices
258							for named database
259							-write_flag Allow building index
260							-primary_pattern Regexp defining the primary id
261							-secondary_patterns A hash ref containing the secondary
262							patterns with the namespaces as keys
263							-primary_namespace A string defining what the primary key
264							is
265
266							Status : Public
267
268							=cut
269
270							sub new {
271	1			1	1	5	my ( $class, @args ) = @_;
272
273	1					7	my $self = $class->SUPER::new(@args);
274
275	1					1	bless $self, $class;
276
277	1					8	my ( $index_dir, $dbname, $format, $write_flag, $primary_pattern,
278							$primary_namespace, $start_pattern, $secondary_patterns )
279							= $self->_rearrange(
280							[
281							qw(DIRECTORY
282							DBNAME
283							FORMAT
284							WRITE_FLAG
285							PRIMARY_PATTERN
286							PRIMARY_NAMESPACE
287							START_PATTERN
288							SECONDARY_PATTERNS)
289							],
290							@args
291							);
292
293	1					4	$self->index_directory($index_dir);
294	1					2	$self->dbname($dbname);
295
296	1	50	33			1	if ( $self->index_directory && $self->read_config_file ) {
297
298	0					0	my $fh = $self->primary_index_filehandle;
299	0					0	my $record_width = $self->read_header($fh);
300	0					0	$self->record_size($record_width);
301							}
302	1		50			2	$format \|\|= DEFAULT_FORMAT;
303	1					2	$self->format($format);
304	1					2	$self->write_flag($write_flag);
305
306	1	50	33			1	if ( $self->write_flag && !$primary_namespace ) {
307							(
308	1					1	$primary_namespace, $primary_pattern,
309							$start_pattern, $secondary_patterns
310							) = $self->_guess_patterns( $self->format );
311							}
312
313	1					2	$self->primary_pattern($primary_pattern);
314	1					2	$self->primary_namespace($primary_namespace);
315	1					2	$self->start_pattern($start_pattern);
316	1					1	$self->secondary_patterns($secondary_patterns);
317
318	1					8	return $self;
319							}
320
321							sub new_from_registry {
322	0			0	0	0	my ( $self, %config ) = @_;
323
324	0					0	my $dbname = $config{'dbname'};
325	0					0	my $location = $config{'location'};
326
327	0					0	my $index = Bio::DB::Flat::BinarySearch->new(
328							-dbname => $dbname,
329							-index_dir => $location,
330							);
331							}
332
333							=head2 get_Seq_by_id
334
335							Title : get_Seq_by_id
336							Usage : $obj->get_Seq_by_id($newval)
337							Function:
338							Example :
339							Returns : value of get_Seq_by_id
340							Args : newvalue (optional)
341
342							=cut
343
344							sub get_Seq_by_id {
345	0			0	1	0	my ( $self, $id ) = @_;
346
347							# too many uninit variables...
348	0					0	local $^W = 0;
349
350	0					0	my ( $fh, $length ) = $self->get_stream_by_id($id);
351
352	0	0				0	unless ( defined( $self->format ) ) {
353	0					0	$self->throw("Can't create sequence - format is not defined");
354							}
355
356	0	0				0	return unless $fh;
357
358	0	0				0	unless ( defined( $self->{_seqio} ) ) {
359
360	0					0	$self->{_seqio} = Bio::SeqIO->new(
361							-fh => $fh,
362							-format => $self->format
363							);
364							}
365							else {
366	0					0	$self->{_seqio}->fh($fh);
367							}
368
369	0					0	return $self->{_seqio}->next_seq;
370							}
371
372							=head2 get_entry_by_id
373
374							Title : get_entry_by_id
375							Usage : $obj->get_entry_by_id($newval)
376							Function: Get a Bio::SeqI object for a unique ID
377							Returns : Bio::SeqI
378							Args : string
379
380
381							=cut
382
383							sub get_entry_by_id {
384	0			0	1	0	my ( $self, $id ) = @_;
385
386	0					0	my ( $fh, $length ) = $self->get_stream_by_id($id);
387
388	0					0	my $entry;
389
390	0					0	sysread( $fh, $entry, $length );
391
392	0					0	return $entry;
393							}
394
395							=head2 get_stream_by_id
396
397							Title : get_stream_by_id
398							Usage : $obj->get_stream_by_id($id)
399							Function: Gets a Sequence stream for an id
400							Returns : Bio::SeqIO stream
401							Args : Id to lookup by
402
403
404							=cut
405
406							sub get_stream_by_id {
407	0			0	1	0	my ( $self, $id ) = @_;
408
409	0	0				0	unless ( $self->record_size ) {
410	0	0	0			0	if ( $self->index_directory && $self->read_config_file ) {
411
412	0					0	my $fh = $self->primary_index_filehandle;
413	0					0	my $record_width = $self->read_header($fh);
414	0					0	$self->record_size($record_width);
415							}
416							}
417	0					0	my $indexfh = $self->primary_index_filehandle;
418	0					0	syseof($indexfh);
419
420	0					0	my $filesize = systell($indexfh);
421
422	0	0				0	$self->throw("file was not parsed properly, record size is empty")
423							unless $self->record_size;
424
425	0					0	my $end = ( $filesize - $self->{'_start_pos'} ) / $self->record_size;
426	0					0	my ( $newid, $rest, $fhpos ) =
427							$self->find_entry( $indexfh, 0, $end, $id, $self->record_size );
428
429	0					0	my ( $fileid, $pos, $length ) = split( /\t/, $rest );
430
431							#print STDERR "BinarySearch Found id entry $newid $fileid $pos $length:$rest\n";
432
433	0	0				0	if ( !$newid ) {
434	0					0	return;
435							}
436
437	0					0	my $file = $self->{_file}{$fileid};
438
439	0	0				0	open my $IN, '<', $file or $self->throw("Could not read file '$file': $!");
440
441	0					0	my $entry;
442
443	0					0	sysseek( $IN, $pos, 0 );
444
445	0					0	return ( $IN, $length );
446							}
447
448							=head2 get_Seq_by_acc
449
450							Title : get_Seq_by_acc
451							Usage : $obj->get_Seq_by_acc($acc)
452							Function: Gets a Bio::SeqI object by accession number
453							Returns : Bio::SeqI object
454							Args : string representing accession number
455
456
457							=cut
458
459							sub get_Seq_by_acc {
460	0			0	1	0	my ( $self, $acc ) = @_;
461
462							# too many uninit variables...
463	0					0	local $^W = 0;
464
465	0	0				0	if ( $self->primary_namespace eq "ACC" ) {
466	0					0	return $self->get_Seq_by_id($acc);
467							}
468							else {
469	0					0	return $self->get_Seq_by_secondary( "ACC", $acc );
470							}
471							}
472
473							=head2 get_Seq_by_version
474
475							Title : get_Seq_by_version
476							Usage : $obj->get_Seq_by_version($version)
477							Function: Gets a Bio::SeqI object by accession.version number
478							Returns : Bio::SeqI object
479							Args : string representing accession.version number
480
481
482							=cut
483
484							sub get_Seq_by_version {
485	0			0	1	0	my ( $self, $acc ) = @_;
486
487							# too many uninit variables...
488	0					0	local $^W = 0;
489
490	0	0				0	if ( $self->primary_namespace eq "VERSION" ) {
491	0					0	return $self->get_Seq_by_id($acc);
492							}
493							else {
494	0					0	return $self->get_Seq_by_secondary( "VERSION", $acc );
495							}
496							}
497
498							=head2 get_Seq_by_secondary
499
500							Title : get_Seq_by_secondary
501							Usage : $obj->get_Seq_by_secondary($namespace,$acc)
502							Function: Gets a Bio::SeqI object looking up secondary accessions
503							Returns : Bio::SeqI object
504							Args : namespace name to check secondary namespace and an id
505
506
507							=cut
508
509							sub get_Seq_by_secondary {
510	0			0	1	0	my ( $self, $name, $id ) = @_;
511
512	0					0	my @names = $self->secondary_namespaces;
513
514	0					0	my $found = 0;
515	0					0	foreach my $tmpname (@names) {
516	0	0				0	if ( $name eq $tmpname ) {
517	0					0	$found = 1;
518							}
519							}
520
521	0	0				0	if ( $found == 0 ) {
522	0					0	$self->throw("Secondary index for $name doesn't exist\n");
523							}
524
525	0					0	my $fh = $self->open_secondary_index($name);
526
527	0					0	syseof($fh);
528
529	0					0	my $filesize = systell($fh);
530
531	0					0	my $recsize = $self->{'_secondary_record_size'}{$name};
532
533							# print "Name " . $recsize . "\n";
534
535	0					0	my $end = ( $filesize - $self->{'_start_pos'} ) / $recsize;
536
537							# print "End $end $filesize\n";
538	0					0	my ( $newid, $primary_id, $pos ) =
539							$self->find_entry( $fh, 0, $end, $id, $recsize );
540
541	0					0	sysseek( $fh, $pos, 0 );
542
543							# print "Found new id $newid $primary_id\n";
544							# We now need to shuffle up the index file to find the top secondary entry
545
546	0					0	my $record = $newid;
547
548	0		0			0	while ( $record =~ /^$newid/ && $pos >= 0 ) {
549
550	0					0	$record = $self->read_record( $fh, $pos, $recsize );
551	0					0	$pos = $pos - $recsize;
552
553							# print "Up record = $record:$newid\n";
554							}
555
556	0					0	$pos += $recsize;
557
558							# print "Top position is $pos\n";
559
560							# Now we have to shuffle back down again to read all the secondary entries
561
562	0					0	my $current_id = $newid;
563	0					0	my %primary_id;
564
565	0					0	$primary_id{$primary_id} = 1;
566
567	0					0	while ( $current_id eq $newid ) {
568	0					0	$record = $self->read_record( $fh, $pos, $recsize );
569
570							# print "Record is :$record:\n";
571	0					0	my ( $secid, $primary_id ) = split( /\t/, $record, 2 );
572	0					0	$current_id = $secid;
573
574	0	0				0	if ( $current_id eq $newid ) {
575	0					0	$primary_id =~ s/ //g;
576
577							# print "Primary $primary_id\n";
578	0					0	$primary_id{$primary_id} = 1;
579
580	0					0	$pos = $pos + $recsize;
581
582							# print "Down record = $record\n";
583							}
584							}
585
586	0	0				0	if ( !defined($newid) ) {
587	0					0	return;
588							}
589
590	0					0	my @entry;
591
592	0					0	foreach my $id ( keys %primary_id ) {
593	0					0	push @entry, $self->get_Seq_by_id($id);
594							}
595	0	0				0	return wantarray ? @entry : $entry[0];
596
597							}
598
599							=head2 read_header
600
601							Title : read_header
602							Usage : $obj->read_header($fhl)
603							Function: Reads the header from the db file
604							Returns : width of a record
605							Args : filehandle
606
607
608							=cut
609
610							sub read_header {
611	0			0	1	0	my ( $self, $fh ) = @_;
612
613	0					0	my $record_width;
614
615	0					0	sysread( $fh, $record_width, HEADER_SIZE );
616
617	0					0	$self->{'_start_pos'} = HEADER_SIZE;
618	0					0	$record_width =~ s/ //g;
619	0					0	$record_width = $record_width * 1;
620
621	0					0	return $record_width;
622							}
623
624							=head2 read_record
625
626							Title : read_record
627							Usage : $obj->read_record($fh,$pos,$len)
628							Function: Reads a record from a filehandle
629							Returns : String
630							Args : filehandle, offset, and length
631
632
633							=cut
634
635							sub read_record {
636	0			0	1	0	my ( $self, $fh, $pos, $len ) = @_;
637
638	0					0	sysseek( $fh, $pos, 0 );
639
640	0					0	my $record;
641
642	0					0	sysread( $fh, $record, $len );
643
644	0					0	return $record;
645
646							}
647
648							=head2 get_all_primary_ids
649
650							Title : get_all_primary_ids
651							Usage : @ids = $seqdb->get_all_primary_ids()
652							Function: gives an array of all the primary_ids of the
653							sequence objects in the database.
654							Returns : an array of strings
655							Args : none
656
657							=cut
658
659							sub get_all_primary_ids {
660	0			0	1	0	my $self = shift;
661
662	0					0	my $fh = $self->primary_index_filehandle;
663	0					0	syseof($fh);
664	0					0	my $filesize = systell($fh);
665	0					0	my $recsize = $self->record_size;
666	0					0	my $end = $filesize;
667
668	0					0	my @ids;
669	0					0	for ( my $pos = $self->{'_start_pos'} ; $pos < $end ; $pos += $recsize ) {
670	0					0	my $record = $self->read_record( $fh, $pos, $recsize );
671	0					0	my ($entryid) = split( /\t/, $record );
672	0					0	push @ids, $entryid;
673							}
674	0					0	@ids;
675							}
676
677							=head2 find_entry
678
679							Title : find_entry
680							Usage : $obj->find_entry($fh,$start,$end,$id,$recsize)
681							Function: Extract an entry based on the start,end,id and record size
682							Returns : string
683							Args : filehandle, start, end, id, recordsize
684
685
686							=cut
687
688							sub find_entry {
689	0			0	1	0	my ( $self, $fh, $start, $end, $id, $recsize ) = @_;
690
691	0					0	my $mid = int( ( $end + 1 + $start ) / 2 );
692	0					0	my $pos = ( $mid - 1 ) * $recsize + $self->{'_start_pos'};
693
694	0					0	my ($record) = $self->read_record( $fh, $pos, $recsize );
695	0					0	my ( $entryid, $rest ) = split( /\t/, $record, 2 );
696	0					0	$rest =~ s/\s+$//;
697
698							# print "Mid $recsize $mid $pos:$entryid:$rest:$record\n";
699							# print "Entry :$id:$entryid:$rest\n";
700
701	0	0				0	my ( $first, $second ) =
702							$id le $entryid ? ( $id, $entryid ) : ( $entryid, $id );
703
704	0	0				0	if ( $id eq $entryid ) {
		0
		0
705
706	0					0	return ( $id, $rest, $pos - $recsize );
707
708							}
709							elsif ( $first eq $id ) {
710
711	0	0				0	if ( $end - $start <= 1 ) {
712	0					0	return;
713							}
714	0					0	my $end = $mid;
715
716							# print "Moving up $entryid $id\n";
717	0					0	$self->find_entry( $fh, $start, $end, $id, $recsize );
718
719							}
720							elsif ( $second eq $id ) {
721
722							# print "Moving down $entryid $id\n";
723	0	0				0	if ( $end - $start <= 1 ) {
724	0					0	return;
725							}
726
727	0					0	$start = $mid;
728
729	0					0	$self->find_entry( $fh, $start, $end, $id, $recsize );
730							}
731
732							}
733
734							=head2 build_index
735
736							Title : build_index
737							Usage : $obj->build_index(@files)
738							Function: Build the index based on a set of files
739							Returns : count of the number of entries
740							Args : List of filenames
741
742
743							=cut
744
745							sub build_index {
746	1			1	1	2	my ( $self, @files ) = @_;
747	1	50				2	$self->write_flag
748							or $self->throw('Cannot build index unless -write_flag is true');
749
750	1					2	my $rootdir = $self->index_directory;
751
752	1	50				6	if ( !defined($rootdir) ) {
753	0					0	$self->throw("No index directory set - can't build indices");
754							}
755
756	1	50				11	if ( !-d $rootdir ) {
757	0					0	$self->throw(
758							"Index directory [$rootdir] is not a directory. Cant' build indices"
759							);
760							}
761
762	1					2	my $dbpath = File::Spec->catfile( $rootdir, $self->dbname );
763	1	50				8	if ( !-d $dbpath ) {
764	0					0	warn "Creating directory $dbpath\n";
765	0	0				0	mkdir $dbpath, 0777 or $self->throw("Couldn't create $dbpath: $!");
766							}
767
768	1	50				2	unless (@files) {
769	0					0	$self->throw("Must enter an array of filenames to index");
770							}
771
772	1					2	foreach my $file (@files) {
773	1	50				35	$file = File::Spec->rel2abs($file)
774							unless File::Spec->file_name_is_absolute($file);
775	1	50				17	unless ( -e $file ) {
776	0					0	$self->throw("Can't index file [$file] as it doesn't exist");
777							}
778							}
779
780	1	50				4	if ( my $filehash = $self->{_dbfile} ) {
781	0					0	push @files, keys %$filehash;
782							}
783
784	1					1	my %seen;
785	1					1	@files = grep { !$seen{$_}++ } @files;
	1					5
786
787							# Lets index
788	1					2	$self->make_config_file( \@files );
789	1					1	my $entries = 0;
790	1					2	foreach my $file (@files) {
791	1					3	$entries += $self->_index_file($file);
792							}
793
794							# update alphabet if necessary
795	1					3	$self->make_config_file( \@files );
796
797							# And finally write out the indices
798	1					3	$self->write_primary_index;
799	1					3	$self->write_secondary_indices;
800
801	1					3	$entries;
802							}
803
804							=head2 _index_file
805
806							Title : _index_file
807							Usage : $obj->_index_file($newval)
808							Function:
809							Example :
810							Returns : value of _index_file
811							Args : newvalue (optional)
812
813							=cut
814
815							sub _index_file {
816	1			1		1	my ( $self, $file ) = @_;
817	1					4	my $v = $self->verbose;
818	1	50				27	open my $FILE, '<', $file or $self->throw("Could not read file '$file': $!");
819
820	1					2	my $recstart = 0;
821	1					3	my $fileid = $self->get_fileid_by_filename($file);
822	1					1	my $found = 0;
823	1					1	my $id;
824	1					1	my $count = 0;
825
826	1					64	my $primary = $self->primary_pattern;
827	1					143	my $start_pattern = $self->start_pattern;
828
829	1					2	my $pos = 0;
830
831	1					1	my $new_primary_entry;
832
833							my $length;
834
835	1					1	my $fh = $FILE;
836
837	1					0	my $done = -1;
838
839	1					2	my @secondary_names = $self->secondary_namespaces;
840	1					1	my %secondary_id;
841							my $last_one;
842
843							# In Windows, text files have '\r\n' as line separator, but when reading in
844							# text mode Perl will only show the '\n'. This means that for a line "ABC\r\n",
845							# "length $_" will report 4 although the line is 5 bytes in length.
846							# We assume that all lines have the same line separator and only read current line.
847	1					6	my $init_pos = tell($fh);
848	1					9	my $curr_line = <$fh>;
849	1					2	my $pos_diff = tell($fh) - $init_pos;
850	1					1	my $correction = $pos_diff - length $curr_line;
851	1					2	seek $fh, $init_pos, 0; # Rewind position to proceed to read the file
852
853	1					5	while (<$fh>) {
854	58					35	$last_one = $_;
855	58		33			102	$self->{alphabet} \|\|= $self->guess_alphabet($_);
856	58	100				99	if ( $_ =~ /$start_pattern/ ) {
857	7	100				11	if ( $done == 0 ) {
858	6					2	$id = $new_primary_entry;
859	6		33			19	$self->{alphabet} \|\|= $self->guess_alphabet($_);
860
861	6					7	my $tmplen = ( tell $fh ) - length($_) - $correction;
862
863	6					4	$length = $tmplen - $pos;
864
865	6	50				8	unless ( defined($id) ) {
866	0					0	$self->throw("No id defined for sequence");
867							}
868	6	50				8	unless ( defined($fileid) ) {
869	0					0	$self->throw("No fileid defined for file $file");
870							}
871	6	50				7	unless ( defined($pos) ) {
872	0					0	$self->throw( "No position defined for " . $id . "\n" );
873							}
874	6	50				7	unless ( defined($length) ) {
875	0					0	$self->throw( "No length defined for " . $id . "\n" );
876							}
877	6					7	$self->_add_id_position( $id, $pos, $fileid, $length,
878							\%secondary_id );
879
880	6					6	$pos = $tmplen;
881
882	6	50	66			14	if ( $count > 0 && $count % 1000 == 0 ) {
883	0	0				0	$self->debug("Indexed $count ids\n") if $v > 0;
884							}
885
886	6					6	$count++;
887							}
888							else {
889	1					1	$done = 0;
890							}
891							}
892
893	58	100				94	if ( $_ =~ /$primary/ ) {
894	7					11	$new_primary_entry = $1;
895							}
896
897	58					55	my $secondary_patterns = $self->secondary_patterns;
898
899	58					120	foreach my $sec (@secondary_names) {
900	0					0	my $pattern = $secondary_patterns->{$sec};
901
902	0	0				0	if ( $_ =~ /$pattern/ ) {
903	0					0	$secondary_id{$sec} = $1;
904							}
905							}
906
907							}
908
909							# Remember to add in the last one
910
911	1					1	$id = $new_primary_entry;
912
913							# my $tmplen = (tell $fh) - length($last_one);
914	1					2	my $tmplen = ( tell $fh );
915
916	1					1	$length = $tmplen - $pos;
917
918	1	50				2	if ( !defined($id) ) {
919	0					0	$self->throw("No id defined for sequence");
920							}
921	1	50				2	if ( !defined($fileid) ) {
922	0					0	$self->throw("No fileid defined for file $file");
923							}
924	1	50				1	if ( !defined($pos) ) {
925	0					0	$self->throw( "No position defined for " . $id . "\n" );
926							}
927	1	50				2	if ( !defined($length) ) {
928	0					0	$self->throw( "No length defined for " . $id . "\n" );
929							}
930
931	1					2	$self->_add_id_position( $id, $pos, $fileid, $length, \%secondary_id );
932	1					1	$count++;
933
934	1					6	close $FILE;
935	1					5	$count;
936							}
937
938							=head2 write_primary_index
939
940							Title : write_primary_index
941							Usage : $obj->write_primary_index($newval)
942							Function:
943							Example :
944							Returns : value of write_primary_index
945							Args : newvalue (optional)
946
947
948							=cut
949
950							sub write_primary_index {
951	1			1	1	1	my ($self) = @_;
952
953	1					1	my @ids = keys %{ $self->{_id} };
	1					4
954
955	1					3	@ids = sort { $a cmp $b } @ids;
	11					9
956
957	1	50				4	open my $INDEX, '>', $self->primary_index_file
958							or $self->throw(
959							"Could not write primary index file '" . $self->primary_index_file . "': $!" );
960
961							my $recordlength =
962							$self->{_maxidlength} +
963							$self->{_maxfileidlength} +
964							$self->{_maxposlength} +
965	1					6	$self->{_maxlengthlength} + 3;
966
967	1					9	print $INDEX sprintf( "%04d", $recordlength );
968
969	1					3	foreach my $id (@ids) {
970
971	7	50				10	if ( !defined( $self->{_id}{$id}{_fileid} ) ) {
972	0					0	$self->throw("No fileid for $id\n");
973							}
974	7	50				11	if ( !defined( $self->{_id}{$id}{_pos} ) ) {
975	0					0	$self->throw("No position for $id\n");
976							}
977	7	50				9	if ( !defined( $self->{_id}{$id}{_length} ) ) {
978	0					0	$self->throw("No length for $id");
979							}
980
981							my $record =
982							$id . "\t"
983							. $self->{_id}{$id}{_fileid} . "\t"
984							. $self->{_id}{$id}{_pos} . "\t"
985	7					9	. $self->{_id}{$id}{_length};
986
987	7					35	print $INDEX sprintf( "%-${recordlength}s", $record );
988
989							}
990							}
991
992							=head2 write_secondary_indices
993
994							Title : write_secondary_indices
995							Usage : $obj->write_secondary_indices($newval)
996							Function:
997							Example :
998							Returns : value of write_secondary_indices
999							Args : newvalue (optional)
1000
1001
1002							=cut
1003
1004							sub write_secondary_indices {
1005	1			1	1	2	my ($self) = @_;
1006
1007							# These are the different
1008	1					1	my @names = keys( %{ $self->{_secondary_id} } );
	1					3
1009
1010	1					2	foreach my $name (@names) {
1011
1012	0					0	my @seconds = keys %{ $self->{_secondary_id}{$name} };
	0					0
1013
1014							# First we need to loop over to get the longest record.
1015	0					0	my $length = 0;
1016
1017	0					0	foreach my $second (@seconds) {
1018	0					0	my $tmplen = length($second) + 1;
1019	0					0	my @prims = keys %{ $self->{_secondary_id}{$name}{$second} };
	0					0
1020
1021	0					0	foreach my $prim (@prims) {
1022	0					0	my $recordlen = $tmplen + length($prim);
1023
1024	0	0				0	if ( $recordlen > $length ) {
1025	0					0	$length = $recordlen;
1026							}
1027							}
1028							}
1029
1030							# Now we can print the index
1031
1032	0					0	my $fh = $self->new_secondary_filehandle($name);
1033
1034	0					0	print $fh sprintf( "%04d", $length );
1035	0					0	@seconds = sort @seconds;
1036
1037	0					0	foreach my $second (@seconds) {
1038
1039	0					0	my @prims = keys %{ $self->{_secondary_id}{$name}{$second} };
	0					0
1040	0					0	my $tmp = $second;
1041
1042	0					0	foreach my $prim (@prims) {
1043	0					0	my $record = $tmp . "\t" . $prim;
1044	0	0				0	if ( length($record) > $length ) {
1045	0					0	$self->throw(
1046							"Something has gone horribly wrong - length of record is more than we thought [$length]\n"
1047							);
1048							}
1049							else {
1050	0					0	print $fh sprintf( "%-${length}s", $record );
1051							}
1052							}
1053							}
1054
1055	0					0	close($fh);
1056							}
1057							}
1058
1059							=head2 new_secondary_filehandle
1060
1061							Title : new_secondary_filehandle
1062							Usage : $obj->new_secondary_filehandle($newval)
1063							Function:
1064							Example :
1065							Returns : value of new_secondary_filehandle
1066							Args : newvalue (optional)
1067
1068
1069							=cut
1070
1071							sub new_secondary_filehandle {
1072	0			0	1	0	my ( $self, $name ) = @_;
1073
1074	0					0	my $indexdir = $self->_config_path;
1075
1076	0					0	my $secindex = File::Spec->catfile( $indexdir, "id_$name.index" );
1077
1078	0	0				0	open my $fh, '>', $secindex or $self->throw("Could not write file '$secindex': $!");
1079	0					0	return $fh;
1080							}
1081
1082							=head2 open_secondary_index
1083
1084							Title : open_secondary_index
1085							Usage : $obj->open_secondary_index($newval)
1086							Function:
1087							Example :
1088							Returns : value of open_secondary_index
1089							Args : newvalue (optional)
1090
1091
1092							=cut
1093
1094							sub open_secondary_index {
1095	0			0	1	0	my ( $self, $name ) = @_;
1096
1097	0	0				0	if ( !defined( $self->{_secondary_filehandle}{$name} ) ) {
1098
1099	0					0	my $indexdir = $self->_config_path;
1100	0					0	my $secindex = $indexdir . "/id_$name.index";
1101
1102	0	0				0	if ( !-e $secindex ) {
1103	0					0	$self->throw("Index is not present for namespace [$name]\n");
1104							}
1105
1106	0	0				0	open my $newfh, '<', $secindex or $self->throw("Could not read file '$secindex': $!");
1107	0					0	my $reclen = $self->read_header($newfh);
1108
1109	0					0	$self->{_secondary_filehandle}{$name} = $newfh;
1110	0					0	$self->{_secondary_record_size}{$name} = $reclen;
1111							}
1112
1113	0					0	return $self->{_secondary_filehandle}{$name};
1114
1115							}
1116
1117							=head2 _add_id_position
1118
1119							Title : _add_id_position
1120							Usage : $obj->_add_id_position($newval)
1121							Function:
1122							Example :
1123							Returns : value of _add_id_position
1124							Args : newvalue (optional)
1125
1126
1127							=cut
1128
1129							sub _add_id_position {
1130	7			7		8	my ( $self, $id, $pos, $fileid, $length, $secondary_id ) = @_;
1131
1132	7	50				7	if ( !defined($id) ) {
1133	0					0	$self->throw("No id defined. Can't add id position");
1134							}
1135	7	50				9	if ( !defined($pos) ) {
1136	0					0	$self->throw("No position defined. Can't add id position");
1137							}
1138	7	50				7	if ( !defined($fileid) ) {
1139	0					0	$self->throw("No fileid defined. Can't add id position");
1140							}
1141	7	50	33			23	if ( !defined($length) \|\| $length <= 0 ) {
1142	0					0	$self->throw(
1143							"No length defined or <= 0 [$length]. Can't add id position");
1144							}
1145
1146	7					14	$self->{_id}{$id}{_pos} = $pos;
1147	7					7	$self->{_id}{$id}{_length} = $length;
1148	7					8	$self->{_id}{$id}{_fileid} = $fileid;
1149
1150							# Now the secondary ids
1151
1152	7					11	foreach my $sec ( keys(%$secondary_id) ) {
1153	0					0	my $value = $secondary_id->{$sec};
1154	0					0	$self->{_secondary_id}{$sec}{$value}{$id} = 1;
1155							}
1156
1157							$self->{_maxidlength} = length($id)
1158							if !exists $self->{_maxidlength}
1159	7	100	100			25	or length($id) >= $self->{_maxidlength};
1160
1161							$self->{_maxfileidlength} = length($fileid)
1162							if !exists $self->{_maxfileidlength}
1163	7	50	66			18	or length($fileid) >= $self->{_maxfileidlength};
1164
1165							$self->{_maxposlength} = length($pos)
1166							if !exists $self->{_maxposlength}
1167	7	50	66			20	or length($pos) >= $self->{_maxposlength};
1168
1169							$self->{_maxlengthlength} = length($length)
1170							if !exists $self->{_maxlengthlength}
1171	7	50	66			26	or length($length) >= $self->{_maxlengthlength};
1172							}
1173
1174							=head2 make_config_file
1175
1176							Title : make_config_file
1177							Usage : $obj->make_config_file($newval)
1178							Function:
1179							Example :
1180							Returns : value of make_config_file
1181							Args : newvalue (optional)
1182
1183							=cut
1184
1185							sub make_config_file {
1186	2			2	1	2	my ( $self, $files ) = @_;
1187
1188	2					3	my @files = @$files;
1189
1190	2					4	my $configfile = $self->_config_file;
1191
1192	2	50				158	open my $CON, '>', $configfile
1193							or $self->throw("Could not write config file '$configfile': $!");
1194
1195							# First line must be the type of index - in this case flat
1196	2					15	print $CON "index\tflat/1\n";
1197
1198							# Now the fileids
1199
1200	2					3	my $count = 0;
1201
1202	2					3	foreach my $file (@files) {
1203
1204	2					24	my $size = -s $file;
1205
1206	2					5	print $CON "fileid_$count\t$file\t$size\n";
1207
1208	2					5	$self->{_file}{$count} = $file;
1209	2					4	$self->{_dbfile}{$file} = $count;
1210	2					2	$self->{_size}{$count} = $size;
1211	2					4	$count++;
1212							}
1213
1214							# Now the namespaces
1215
1216	2					4	print $CON "primary_namespace\t" . $self->primary_namespace . "\n";
1217
1218							# Needs fixing for the secondary stuff
1219
1220	2					3	my $second_patterns = $self->secondary_patterns;
1221
1222	2					10	my @second = keys %$second_patterns;
1223
1224	2	50				5	if ( (@second) ) {
1225	0					0	print $CON "secondary_namespaces";
1226
1227	0					0	foreach my $second (@second) {
1228	0					0	print $CON "\t$second";
1229							}
1230	0					0	print $CON "\n";
1231							}
1232
1233							# Now the config format
1234
1235	2	50				3	unless ( defined( $self->format ) ) {
1236	0					0	$self->throw(
1237							"Format does not exist in module - can't write config file");
1238							}
1239							else {
1240	2					3	my $format = $self->format;
1241	2					3	my $alphabet = $self->alphabet;
1242	2	50				3	my $alpha = $alphabet ? "/$alphabet" : '';
1243	2					5	print $CON "format\t" . "$format\n";
1244							}
1245	2					72	close $CON;
1246							}
1247
1248							=head2 read_config_file
1249
1250							Title : read_config_file
1251							Usage : $obj->read_config_file($newval)
1252							Function:
1253							Example :
1254							Returns : value of read_config_file
1255							Args : newvalue (optional)
1256
1257							=cut
1258
1259							sub read_config_file {
1260	1			1	1	1	my ($self) = @_;
1261	1					2	my $configfile = $self->_config_file;
1262	1	50				17	return unless -e $configfile;
1263
1264	0	0				0	open my $CON, '<', $configfile
1265							or $self->throw("Could not read config file '$configfile': $!");
1266
1267							# First line must be type
1268	0					0	my $line = <$CON>;
1269	0					0	chomp($line);
1270	0					0	my $version;
1271
1272							# This is hard coded as we only index flatfiles here
1273	0	0				0	if ( $line =~ m{index\tflat/(\d+)} ) {
1274	0					0	$version = $1;
1275							}
1276							else {
1277	0					0	$self->throw(
1278							"First line not compatible with flat file index. Should be something like\n\nindex\tflat/1"
1279							);
1280							}
1281
1282	0					0	$self->index_type("flat");
1283	0					0	$self->index_version($version);
1284
1285	0					0	while (<$CON>) {
1286	0					0	chomp;
1287
1288							# Look for fileid lines
1289	0	0				0	if ( $_ =~ /^fileid_(\d+)\t(.+)\t(\d+)/ ) {
1290	0					0	my $fileid = $1;
1291	0					0	my $filename = $2;
1292	0					0	my $filesize = $3;
1293
1294	0	0				0	if ( !-e $filename ) {
1295	0					0	$self->throw("File [$filename] does not exist!");
1296							}
1297	0	0				0	if ( -s $filename != $filesize ) {
1298	0					0	$self->throw(
1299							"Flatfile size for $filename differs from what the index thinks it is. Real size ["
1300							. ( -s $filename )
1301							. "] Index thinks it is ["
1302							. $filesize
1303							. "]" );
1304							}
1305
1306	0					0	$self->{_file}{$fileid} = $filename;
1307	0					0	$self->{_dbfile}{$filename} = $fileid;
1308	0					0	$self->{_size}{$fileid} = $filesize;
1309							}
1310
1311							# Look for namespace lines
1312	0	0				0	if (/(.*)_namespaces?\t(.+)/) {
1313	0	0				0	if ( $1 eq "primary" ) {
		0
1314	0					0	$self->primary_namespace($2);
1315							}
1316							elsif ( $1 eq "secondary" ) {
1317	0					0	$self->secondary_namespaces( split "\t", $2 );
1318							}
1319							else {
1320	0					0	$self->throw("Unknown namespace name in config file [$1");
1321							}
1322							}
1323
1324							# Look for format lines
1325	0	0				0	if ( $_ =~ /format\t(\S+)/ ) {
1326
1327							# Check the format here?
1328	0					0	my $format = $1;
1329
1330							# handle LSID format
1331	0	0				0	if ( $format =~ /^URN:LSID:open-bio\.org:(\w+)(?:\/(\w+))?/ ) {
1332	0					0	$self->format($1);
1333	0					0	$self->alphabet($2);
1334							}
1335							else { # compatibility with older versions
1336	0					0	$self->format($1);
1337							}
1338							}
1339							}
1340
1341	0					0	close($CON);
1342
1343							# Now check we have all that we need
1344
1345	0					0	my @fileid_keys = keys( %{ $self->{_file} } );
	0					0
1346
1347	0	0				0	if ( !(@fileid_keys) ) {
1348	0					0	$self->throw(
1349							"No flatfile fileid files in config - check the index has been made correctly"
1350							);
1351							}
1352
1353	0	0				0	if ( !defined( $self->primary_namespace ) ) {
1354	0					0	$self->throw("No primary namespace exists");
1355							}
1356
1357	0	0				0	if ( !-e $self->primary_index_file ) {
1358	0					0	$self->throw( "Primary index file ["
1359							. $self->primary_index_file
1360							. "] doesn't exist" );
1361							}
1362
1363	0					0	1;
1364							}
1365
1366							=head2 get_fileid_by_filename
1367
1368							Title : get_fileid_by_filename
1369							Usage : $obj->get_fileid_by_filename($newval)
1370							Function:
1371							Example :
1372							Returns : value of get_fileid_by_filename
1373							Args : newvalue (optional)
1374
1375							=cut
1376
1377							sub get_fileid_by_filename {
1378	1			1	1	2	my ( $self, $file ) = @_;
1379
1380	1	50				2	if ( !defined( $self->{_dbfile} ) ) {
1381	0					0	$self->throw(
1382							"No file to fileid mapping present. Has the fileid file been read?"
1383							);
1384							}
1385
1386	1					3	return $self->{_dbfile}{$file};
1387							}
1388
1389							=head2 get_filehandle_by_fileid
1390
1391							Title : get_filehandle_by_fileid
1392							Usage : $obj->get_filehandle_by_fileid($newval)
1393							Function:
1394							Example :
1395							Returns : value of get_filehandle_by_fileid
1396							Args : newvalue (optional)
1397
1398							=cut
1399
1400							sub get_filehandle_by_fileid {
1401	0			0	1	0	my ( $self, $fileid ) = @_;
1402
1403	0	0				0	if ( !defined( $self->{_file}{$fileid} ) ) {
1404	0					0	$self->throw("ERROR: undefined fileid in index [$fileid]");
1405							}
1406
1407	0	0				0	open my $fh, '<', $self->{_file}{$fileid} or $self->throw("Could not read file '$self->{_file}{$fileid}': $!");
1408	0					0	return $fh;
1409							}
1410
1411							=head2 primary_index_file
1412
1413							Title : primary_index_file
1414							Usage : $obj->primary_index_file($newval)
1415							Function:
1416							Example :
1417							Returns : value of primary_index_file
1418							Args : newvalue (optional)
1419
1420
1421							=cut
1422
1423							sub primary_index_file {
1424	1			1	1	1	my ($self) = @_;
1425
1426	1					2	return File::Spec->catfile( $self->_config_path,
1427							"key_" . $self->primary_namespace . ".key" );
1428							}
1429
1430							=head2 primary_index_filehandle
1431
1432							Title : primary_index_filehandle
1433							Usage : $obj->primary_index_filehandle($newval)
1434							Function:
1435							Example :
1436							Returns : value of primary_index_filehandle
1437							Args : newvalue (optional)
1438
1439
1440							=cut
1441
1442							sub primary_index_filehandle {
1443	0			0	1	0	my ($self) = @_;
1444
1445	0	0				0	unless ( defined( $self->{'_primary_index_handle'} ) ) {
1446	0					0	my $primary_file = $self->primary_index_file;
1447	0	0				0	open $self->{'_primary_index_handle'}, '<', $primary_file
1448							or self->throw("Could not read file '$primary_file': $!\n");
1449							}
1450	0					0	return $self->{'_primary_index_handle'};
1451							}
1452
1453							=head2 format
1454
1455							Title : format
1456							Usage : $obj->format($newval)
1457							Function:
1458							Example :
1459							Returns : value of format
1460							Args : newvalue (optional)
1461
1462
1463							=cut
1464
1465							sub format {
1466	70			70	1	44	my ( $obj, $value ) = @_;
1467	70	100				81	if ( defined $value ) {
1468	1					1	$obj->{'format'} = $value;
1469							}
1470	70					69	return $obj->{'format'};
1471
1472							}
1473
1474							sub alphabet {
1475	2			2	0	2	my ( $obj, $value ) = @_;
1476	2	50				3	if ( defined $value ) {
1477	0					0	$obj->{alphabet} = $value;
1478							}
1479	2					2	return $obj->{alphabet};
1480							}
1481
1482							=head2 write_flag
1483
1484							Title : write_flag
1485							Usage : $obj->write_flag($newval)
1486							Function:
1487							Example :
1488							Returns : value of write_flag
1489							Args : newvalue (optional)
1490
1491
1492							=cut
1493
1494							sub write_flag {
1495	3			3	1	2	my ( $obj, $value ) = @_;
1496	3	100				5	if ( defined $value ) {
1497	1					1	$obj->{'write_flag'} = $value;
1498							}
1499	3					9	return $obj->{'write_flag'};
1500
1501							}
1502
1503							=head2 dbname
1504
1505							Title : dbname
1506							Usage : $obj->dbname($newval)
1507							Function: get/set database name
1508							Example :
1509							Returns : value of dbname
1510							Args : newvalue (optional)
1511
1512							=cut
1513
1514							sub dbname {
1515	6			6	1	7	my $self = shift;
1516	6					4	my $d = $self->{flat_dbname};
1517	6	100				10	$self->{flat_dbname} = shift if @_;
1518	6					10	$d;
1519							}
1520
1521							=head2 index_directory
1522
1523							Title : index_directory
1524							Usage : $obj->index_directory($newval)
1525							Function:
1526							Example :
1527							Returns : value of index_directory
1528							Args : newvalue (optional)
1529
1530
1531							=cut
1532
1533							sub index_directory {
1534	7			7	1	8	my ( $self, $arg ) = @_;
1535
1536	7	100				9	if ( defined($arg) ) {
1537	1	50				3	if ( $arg !~ m{/$} ) {
1538	1					1	$arg .= "/";
1539							}
1540	1					2	$self->{_index_directory} = $arg;
1541							}
1542	7					12	return $self->{_index_directory};
1543
1544							}
1545
1546							sub _config_path {
1547	4			4		3	my $self = shift;
1548	4					5	my $root = $self->index_directory;
1549	4					7	my $dbname = $self->dbname;
1550	4					44	File::Spec->catfile( $root, $dbname );
1551							}
1552
1553							sub _config_file {
1554	3			3		3	my $self = shift;
1555	3					4	my $path = $self->_config_path;
1556	3					14	File::Spec->catfile( $path, CONFIG_FILE_NAME );
1557							}
1558
1559							=head2 record_size
1560
1561							Title : record_size
1562							Usage : $obj->record_size($newval)
1563							Function:
1564							Example :
1565							Returns : value of record_size
1566							Args : newvalue (optional)
1567
1568
1569							=cut
1570
1571							sub record_size {
1572	0			0	1	0	my $self = shift;
1573	0	0				0	$self->{_record_size} = shift if @_;
1574	0					0	return $self->{_record_size};
1575							}
1576
1577							=head2 primary_namespace
1578
1579							Title : primary_namespace
1580							Usage : $obj->primary_namespace($newval)
1581							Function:
1582							Example :
1583							Returns : value of primary_namespace
1584							Args : newvalue (optional)
1585
1586							=cut
1587
1588							sub primary_namespace {
1589	4			4	1	3	my $self = shift;
1590	4	100				8	$self->{_primary_namespace} = shift if @_;
1591	4					81	return $self->{_primary_namespace};
1592							}
1593
1594							=head2 index_type
1595
1596							Title : index_type
1597							Usage : $obj->index_type($newval)
1598							Function:
1599							Example :
1600							Returns : value of index_type
1601							Args : newvalue (optional)
1602
1603
1604							=cut
1605
1606							sub index_type {
1607	0			0	1	0	my $self = shift;
1608	0	0				0	$self->{_index_type} = shift if @_;
1609	0					0	return $self->{_index_type};
1610							}
1611
1612							=head2 index_version
1613
1614							Title : index_version
1615							Usage : $obj->index_version($newval)
1616							Function:
1617							Example :
1618							Returns : value of index_version
1619							Args : newvalue (optional)
1620
1621
1622							=cut
1623
1624							sub index_version {
1625	0			0	1	0	my $self = shift;
1626	0	0				0	$self->{_index_version} = shift if @_;
1627	0					0	return $self->{_index_version};
1628							}
1629
1630							=head2 primary_pattern
1631
1632							Title : primary_pattern
1633							Usage : $obj->primary_pattern($newval)
1634							Function:
1635							Example :
1636							Returns : value of primary_pattern
1637							Args : newvalue (optional)
1638
1639
1640							=cut
1641
1642							sub primary_pattern {
1643	2			2	1	2	my $obj = shift;
1644	2	100				5	$obj->{'primary_pattern'} = shift if @_;
1645	2					4	return $obj->{'primary_pattern'};
1646							}
1647
1648							=head2 start_pattern
1649
1650							Title : start_pattern
1651							Usage : $obj->start_pattern($newval)
1652							Function:
1653							Example :
1654							Returns : value of start_pattern
1655							Args : newvalue (optional)
1656
1657
1658							=cut
1659
1660							sub start_pattern {
1661	2			2	1	3	my $obj = shift;
1662	2	100				5	$obj->{'start_pattern'} = shift if @_;
1663	2					4	return $obj->{'start_pattern'};
1664							}
1665
1666							=head2 secondary_patterns
1667
1668							Title : secondary_patterns
1669							Usage : $obj->secondary_patterns($newval)
1670							Function:
1671							Example :
1672							Returns : value of secondary_patterns
1673							Args : newvalue (optional)
1674
1675
1676							=cut
1677
1678							sub secondary_patterns {
1679	61			61	1	42	my ( $obj, $value ) = @_;
1680	61	50				65	if ( defined $value ) {
1681	0					0	$obj->{'secondary_patterns'} = $value;
1682
1683	0					0	my @names = keys %$value;
1684
1685	0					0	foreach my $name (@names) {
1686	0					0	$obj->secondary_namespaces($name);
1687							}
1688							}
1689	61					54	return $obj->{'secondary_patterns'};
1690
1691							}
1692
1693							=head2 secondary_namespaces
1694
1695							Title : secondary_namespaces
1696							Usage : $obj->secondary_namespaces($newval)
1697							Function:
1698							Example :
1699							Returns : value of secondary_namespaces
1700							Args : newvalue (optional)
1701
1702
1703							=cut
1704
1705							sub secondary_namespaces {
1706	1			1	1	2	my ( $obj, @values ) = @_;
1707
1708	1	50				2	if (@values) {
1709	0					0	push( @{ $obj->{'secondary_namespaces'} }, @values );
	0					0
1710							}
1711	1	50				1	return @{ $obj->{'secondary_namespaces'} \|\| [] };
	1					5
1712							}
1713
1714							## These are indexing routines to index commonly used format - fasta
1715							## swissprot and embl
1716
1717							sub new_SWISSPROT_index {
1718	0			0	0	0	my ( $self, $index_dir, @files ) = @_;
1719
1720	0					0	my %secondary_patterns;
1721
1722	0					0	my $start_pattern = "^ID (\\S+)";
1723	0					0	my $primary_pattern = "^AC (\\S+)\\;";
1724
1725	0					0	$secondary_patterns{"ID"} = $start_pattern;
1726
1727	0					0	my $index = Bio::DB::Flat::BinarySearch->new(
1728							-index_dir => $index_dir,
1729							-format => 'swissprot',
1730							-primary_pattern => $primary_pattern,
1731							-primary_namespace => "ACC",
1732							-start_pattern => $start_pattern,
1733							-secondary_patterns => \%secondary_patterns
1734							);
1735
1736	0					0	$index->build_index(@files);
1737							}
1738
1739							sub new_EMBL_index {
1740	0			0	0	0	my ( $self, $index_dir, @files ) = @_;
1741
1742	0					0	my %secondary_patterns;
1743
1744	0					0	my $start_pattern = "^ID (\\S+)";
1745	0					0	my $primary_pattern = "^AC (\\S+)\\;";
1746	0					0	my $primary_namespace = "ACC";
1747
1748	0					0	$secondary_patterns{"ID"} = $start_pattern;
1749
1750	0					0	my $index = Bio::DB::Flat::BinarySearch->new(
1751							-index_dir => $index_dir,
1752							-format => 'embl',
1753							-primary_pattern => $primary_pattern,
1754							-primary_namespace => "ACC",
1755							-start_pattern => $start_pattern,
1756							-secondary_patterns => \%secondary_patterns
1757							);
1758
1759	0					0	$index->build_index(@files);
1760
1761	0					0	return $index;
1762							}
1763
1764							sub new_FASTA_index {
1765	0			0	0	0	my ( $self, $index_dir, @files ) = @_;
1766
1767	0					0	my %secondary_patterns;
1768
1769	0					0	my $start_pattern = "^>";
1770	0					0	my $primary_pattern = "^>(\\S+)";
1771	0					0	my $primary_namespace = "ACC";
1772
1773	0					0	$secondary_patterns{"ID"} = "^>\\S+ +(\\S+)";
1774
1775	0					0	my $index = Bio::DB::Flat::BinarySearch->new(
1776							-index_dir => $index_dir,
1777							-format => 'fasta',
1778							-primary_pattern => $primary_pattern,
1779							-primary_namespace => "ACC",
1780							-start_pattern => $start_pattern,
1781							-secondary_patterns => \%secondary_patterns
1782							);
1783
1784	0					0	$index->build_index(@files);
1785
1786	0					0	return $index;
1787							}
1788
1789							# EVERYTHING THAT FOLLOWS THIS
1790							# is an awful hack - in reality Michele's code needs to be rewritten
1791							# to use Bio::SeqIO, but I have too little time to do this -- LS
1792							sub guess_alphabet {
1793	64			64	0	34	my $self = shift;
1794	64					39	my $line = shift;
1795
1796	64					65	my $format = $self->format;
1797	64	50				84	return 'protein' if $format eq 'swissprot';
1798
1799	64	50				66	if ( $format eq 'genbank' ) {
1800	0	0				0	return unless $line =~ /^LOCUS/;
1801	0	0				0	return 'dna' if $line =~ /\s+\d+\s+bp/i;
1802	0					0	return 'protein';
1803							}
1804
1805	64	50				62	if ( $format eq 'embl' ) {
1806	0	0				0	return unless $line =~ /^ID/;
1807	0	0				0	return 'dna' if $line =~ / DNA;/i;
1808	0	0				0	return 'rna' if $line =~ / RNA;/i;
1809	0					0	return 'protein';
1810							}
1811
1812	64					100	return;
1813							}
1814
1815							# return (namespace,primary_pattern,start_pattern,secondary_pattern)
1816							sub _guess_patterns {
1817	1			1		1	my $self = shift;
1818	1					1	my $format = shift;
1819	1	50				4	if ( $format =~ /swiss(prot)?/i ) {
1820	0					0	return ( 'ID', "^ID (\\S+)", "^ID (\\S+)",
1821							{ ACC => "^AC (\\S+);" } );
1822							}
1823
1824	1	50				2	if ($format =~ /embl/i) {
1825	0					0	return ('ID',
1826							"^ID (\\S+[^; ])",
1827							"^ID (\\S+[^; ])",
1828							{
1829							ACC => q/^AC (\S+);/,
1830							VERSION => q/^SV\s+(\S+)/
1831							});
1832							}
1833
1834	1	50				3	if ( $format =~ /genbank/i ) {
1835							return (
1836	0					0	'ID',
1837							q/^LOCUS\s+(\S+)/,
1838							q/^LOCUS/,
1839							{
1840							ACC => q/^ACCESSION\s+(\S+)/,
1841							VERSION => q/^VERSION\s+(\S+)/
1842							}
1843							);
1844							}
1845
1846	1	50				3	if ( $format =~ /fasta/i ) {
1847	1					2	return ( 'ACC', '^>(\S+)', '^>(\S+)', );
1848							}
1849
1850	0						$self->throw("I can't handle format $format");
1851
1852							}
1853
1854							1;