line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
# |
2
|
|
|
|
|
|
|
# |
3
|
|
|
|
|
|
|
# BioPerl module for Bio::DB::Flat |
4
|
|
|
|
|
|
|
# |
5
|
|
|
|
|
|
|
# Please direct questions and support issues to |
6
|
|
|
|
|
|
|
# |
7
|
|
|
|
|
|
|
# Cared for by Lincoln Stein |
8
|
|
|
|
|
|
|
# |
9
|
|
|
|
|
|
|
# You may distribute this module under the same terms as perl itself |
10
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
# POD documentation - main docs before the code |
12
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
=head1 NAME |
14
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
Bio::DB::Flat - Interface for indexed flat files |
16
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
=head1 SYNOPSIS |
18
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
$db = Bio::DB::Flat->new(-directory => '/usr/share/embl', |
20
|
|
|
|
|
|
|
-dbname => 'mydb', |
21
|
|
|
|
|
|
|
-format => 'embl', |
22
|
|
|
|
|
|
|
-index => 'bdb', |
23
|
|
|
|
|
|
|
-write_flag => 1); |
24
|
|
|
|
|
|
|
$db->build_index('/usr/share/embl/primate.embl', |
25
|
|
|
|
|
|
|
'/usr/share/embl/protists.embl'); |
26
|
|
|
|
|
|
|
$seq = $db->get_Seq_by_id('HSFOS'); |
27
|
|
|
|
|
|
|
@sequences = $db->get_Seq_by_acc('DIV' => 'primate'); |
28
|
|
|
|
|
|
|
$raw = $db->fetch_raw('HSFOS'); |
29
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
=head1 DESCRIPTION |
31
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
This object provides the basic mechanism to associate positions in |
33
|
|
|
|
|
|
|
files with primary and secondary name spaces. Unlike |
34
|
|
|
|
|
|
|
Bio::Index::Abstract (see L), this is specialized |
35
|
|
|
|
|
|
|
to work with the "flat index" and BerkeleyDB indexed flat file formats |
36
|
|
|
|
|
|
|
worked out at the 2002 BioHackathon. |
37
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
This object is a general front end to the underlying databases. |
39
|
|
|
|
|
|
|
|
40
|
|
|
|
|
|
|
=head1 FEEDBACK |
41
|
|
|
|
|
|
|
|
42
|
|
|
|
|
|
|
=head2 Mailing Lists |
43
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
User feedback is an integral part of the evolution of this and other |
45
|
|
|
|
|
|
|
Bioperl modules. Send your comments and suggestions preferably to one |
46
|
|
|
|
|
|
|
of the Bioperl mailing lists. Your participation is much appreciated. |
47
|
|
|
|
|
|
|
|
48
|
|
|
|
|
|
|
bioperl-l@bioperl.org - General discussion |
49
|
|
|
|
|
|
|
http://bioperl.org/wiki/Mailing_lists - About the mailing lists |
50
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
=head2 Support |
52
|
|
|
|
|
|
|
|
53
|
|
|
|
|
|
|
Please direct usage questions or support issues to the mailing list: |
54
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
I |
56
|
|
|
|
|
|
|
|
57
|
|
|
|
|
|
|
rather than to the module maintainer directly. Many experienced and |
58
|
|
|
|
|
|
|
reponsive experts will be able look at the problem and quickly |
59
|
|
|
|
|
|
|
address it. Please include a thorough description of the problem |
60
|
|
|
|
|
|
|
with code and data examples if at all possible. |
61
|
|
|
|
|
|
|
|
62
|
|
|
|
|
|
|
=head2 Reporting Bugs |
63
|
|
|
|
|
|
|
|
64
|
|
|
|
|
|
|
Report bugs to the Bioperl bug tracking system to help us keep track |
65
|
|
|
|
|
|
|
the bugs and their resolution. Bug reports can be submitted via the |
66
|
|
|
|
|
|
|
web: |
67
|
|
|
|
|
|
|
|
68
|
|
|
|
|
|
|
https://github.com/bioperl/bioperl-live/issues |
69
|
|
|
|
|
|
|
|
70
|
|
|
|
|
|
|
=head1 AUTHOR - Lincoln Stein |
71
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
Email - lstein@cshl.org |
73
|
|
|
|
|
|
|
|
74
|
|
|
|
|
|
|
=head1 APPENDIX |
75
|
|
|
|
|
|
|
|
76
|
|
|
|
|
|
|
The rest of the documentation details each of the object methods. Internal |
77
|
|
|
|
|
|
|
methods are usually preceded with an "_" (underscore). |
78
|
|
|
|
|
|
|
|
79
|
|
|
|
|
|
|
=cut |
80
|
|
|
|
|
|
|
|
81
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
# Let the code begin... |
83
|
|
|
|
|
|
|
package Bio::DB::Flat; |
84
|
|
|
|
|
|
|
|
85
|
1
|
|
|
1
|
|
1155
|
use File::Spec; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
29
|
|
86
|
|
|
|
|
|
|
|
87
|
1
|
|
|
1
|
|
3
|
use base qw(Bio::Root::Root Bio::DB::RandomAccessI); |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
78
|
|
88
|
|
|
|
|
|
|
|
89
|
1
|
|
|
1
|
|
4
|
use constant CONFIG_FILE_NAME => 'config.dat'; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
2092
|
|
90
|
|
|
|
|
|
|
|
91
|
|
|
|
|
|
|
=head2 new |
92
|
|
|
|
|
|
|
|
93
|
|
|
|
|
|
|
Title : new |
94
|
|
|
|
|
|
|
Usage : my $db = Bio::DB::Flat->new( |
95
|
|
|
|
|
|
|
-directory => $root_directory, |
96
|
|
|
|
|
|
|
-dbname => 'mydb', |
97
|
|
|
|
|
|
|
-write_flag => 1, |
98
|
|
|
|
|
|
|
-index => 'bdb', |
99
|
|
|
|
|
|
|
-verbose => 0, |
100
|
|
|
|
|
|
|
-out => 'outputfile', |
101
|
|
|
|
|
|
|
-format => 'genbank'); |
102
|
|
|
|
|
|
|
Function: create a new Bio::DB::Flat object |
103
|
|
|
|
|
|
|
Returns : new Bio::DB::Flat object |
104
|
|
|
|
|
|
|
Args : -directory Root directory containing "config.dat" |
105
|
|
|
|
|
|
|
-write_flag If true, allows creation/updating. |
106
|
|
|
|
|
|
|
-verbose Verbose messages |
107
|
|
|
|
|
|
|
-out File to write to when write_seq invoked |
108
|
|
|
|
|
|
|
-index 'bdb' or 'binarysearch' |
109
|
|
|
|
|
|
|
Status : Public |
110
|
|
|
|
|
|
|
|
111
|
|
|
|
|
|
|
The required -directory argument indicates where the flat file indexes |
112
|
|
|
|
|
|
|
will be stored. The build_index() and write_seq() methods will |
113
|
|
|
|
|
|
|
automatically create subdirectories of this root directory. Each |
114
|
|
|
|
|
|
|
subdirectory will contain a human-readable configuration file named |
115
|
|
|
|
|
|
|
"config.dat" that specifies where the individual indexes are stored. |
116
|
|
|
|
|
|
|
|
117
|
|
|
|
|
|
|
The required -dbname argument gives a name to the database index. The |
118
|
|
|
|
|
|
|
index files will actually be stored in a like-named subdirectory |
119
|
|
|
|
|
|
|
underneath the root directory. |
120
|
|
|
|
|
|
|
|
121
|
|
|
|
|
|
|
The -write_flag enables writing new entries into the database as well |
122
|
|
|
|
|
|
|
as the creation of the indexes. By default the indexes will be opened |
123
|
|
|
|
|
|
|
read only. |
124
|
|
|
|
|
|
|
|
125
|
|
|
|
|
|
|
-index is one of "bdb" or "binarysearch" and indicates the type of |
126
|
|
|
|
|
|
|
index to generate. "bdb" corresponds to Berkeley DB. You *must* be |
127
|
|
|
|
|
|
|
using BerkeleyDB version 2 or higher, and have the Perl BerkeleyDB |
128
|
|
|
|
|
|
|
extension installed (DB_File will *not* work). "binarysearch" |
129
|
|
|
|
|
|
|
corresponds to the OBDA "flat" indexed file. |
130
|
|
|
|
|
|
|
|
131
|
|
|
|
|
|
|
The -out argument specifies the output file for writing objects created |
132
|
|
|
|
|
|
|
with write_seq(). |
133
|
|
|
|
|
|
|
|
134
|
|
|
|
|
|
|
The -format argument specifies the format of the input file or files. If |
135
|
|
|
|
|
|
|
the file suffix is one that Bioperl can already associate with a format |
136
|
|
|
|
|
|
|
then this is optional. |
137
|
|
|
|
|
|
|
|
138
|
|
|
|
|
|
|
=cut |
139
|
|
|
|
|
|
|
|
140
|
|
|
|
|
|
|
sub new { |
141
|
1
|
|
|
1
|
1
|
269
|
my $class = shift; |
142
|
1
|
50
|
|
|
|
5
|
$class = ref($class) if ref($class); |
143
|
1
|
|
|
|
|
14
|
my $self = $class->SUPER::new(@_); |
144
|
|
|
|
|
|
|
|
145
|
|
|
|
|
|
|
# first we initialize ourselves |
146
|
1
|
|
|
|
|
8
|
my ($flat_directory,$dbname,$format) = |
147
|
|
|
|
|
|
|
$self->_rearrange([qw(DIRECTORY DBNAME FORMAT)],@_); |
148
|
|
|
|
|
|
|
|
149
|
1
|
50
|
|
|
|
3
|
defined $flat_directory |
150
|
|
|
|
|
|
|
or $self->throw('Please supply a -directory argument'); |
151
|
1
|
50
|
|
|
|
3
|
defined $dbname |
152
|
|
|
|
|
|
|
or $self->throw('Please supply a -dbname argument'); |
153
|
|
|
|
|
|
|
|
154
|
|
|
|
|
|
|
# set values from configuration file |
155
|
1
|
|
|
|
|
3
|
$self->directory($flat_directory); |
156
|
1
|
|
|
|
|
3
|
$self->dbname($dbname); |
157
|
|
|
|
|
|
|
|
158
|
1
|
50
|
|
|
|
11
|
$self->throw("Base directory $flat_directory doesn't exist") |
159
|
|
|
|
|
|
|
unless -e $flat_directory; |
160
|
1
|
50
|
|
|
|
3
|
$self->throw("$flat_directory isn't a directory") |
161
|
|
|
|
|
|
|
unless -d _; |
162
|
1
|
|
|
|
|
8
|
my $dbpath = File::Spec->catfile($flat_directory,$dbname); |
163
|
1
|
50
|
|
|
|
15
|
unless (-d $dbpath) { |
164
|
1
|
|
|
|
|
7
|
$self->debug("creating db directory $dbpath\n"); |
165
|
1
|
50
|
|
|
|
45
|
mkdir $dbpath,0777 or $self->throw("Can't create $dbpath: $!"); |
166
|
|
|
|
|
|
|
} |
167
|
1
|
|
|
|
|
3
|
$self->_read_config(); |
168
|
|
|
|
|
|
|
|
169
|
|
|
|
|
|
|
# but override with initialization values |
170
|
1
|
|
|
|
|
4
|
$self->_initialize(@_); |
171
|
|
|
|
|
|
|
|
172
|
1
|
50
|
|
|
|
2
|
$self->throw('you must specify an indexing scheme') |
173
|
|
|
|
|
|
|
unless $self->indexing_scheme; |
174
|
|
|
|
|
|
|
|
175
|
|
|
|
|
|
|
# now we figure out what subclass to instantiate |
176
|
1
|
50
|
|
|
|
3
|
my $index_type = $self->indexing_scheme eq 'BerkeleyDB/1' ? 'BDB' |
|
|
50
|
|
|
|
|
|
177
|
|
|
|
|
|
|
:$self->indexing_scheme eq 'flat/1' ? 'Binary' |
178
|
|
|
|
|
|
|
:$self->throw("unknown indexing scheme: " . |
179
|
|
|
|
|
|
|
$self->indexing_scheme); |
180
|
1
|
|
|
|
|
2
|
$format = $self->file_format; |
181
|
|
|
|
|
|
|
|
182
|
|
|
|
|
|
|
# because Michele and Lincoln did it differently |
183
|
|
|
|
|
|
|
# Michele's way is via a standalone concrete class |
184
|
1
|
50
|
|
|
|
3
|
if ($index_type eq 'Binary') { |
185
|
1
|
|
|
|
|
1
|
my $child_class = 'Bio::DB::Flat::BinarySearch'; |
186
|
1
|
|
|
1
|
|
437
|
eval "use $child_class"; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
18
|
|
|
1
|
|
|
|
|
44
|
|
187
|
1
|
50
|
|
|
|
4
|
$self->throw($@) if $@; |
188
|
1
|
|
|
|
|
4
|
push @_, ('-format', $format); |
189
|
1
|
|
|
|
|
4
|
return $child_class->new(@_); |
190
|
|
|
|
|
|
|
} |
191
|
|
|
|
|
|
|
|
192
|
|
|
|
|
|
|
# Lincoln uses Bio::SeqIO style delegation. |
193
|
0
|
|
|
|
|
0
|
my $child_class= "Bio\:\:DB\:\:Flat\:\:$index_type\:\:\L$format"; |
194
|
0
|
|
|
|
|
0
|
eval "use $child_class"; |
195
|
0
|
0
|
|
|
|
0
|
$self->throw($@) if $@; |
196
|
|
|
|
|
|
|
|
197
|
|
|
|
|
|
|
# rebless & reinitialize with the new class |
198
|
|
|
|
|
|
|
# (this prevents subclasses from forgetting to call our own initialization) |
199
|
0
|
|
|
|
|
0
|
bless $self,$child_class; |
200
|
0
|
|
|
|
|
0
|
$self->_initialize(@_); |
201
|
0
|
|
|
|
|
0
|
$self->_set_namespaces(@_); |
202
|
|
|
|
|
|
|
|
203
|
0
|
|
|
|
|
0
|
$self; |
204
|
|
|
|
|
|
|
} |
205
|
|
|
|
|
|
|
|
206
|
|
|
|
|
|
|
sub _initialize { |
207
|
1
|
|
|
1
|
|
1
|
my $self = shift; |
208
|
|
|
|
|
|
|
|
209
|
1
|
|
|
|
|
5
|
my ($flat_write_flag,$dbname,$flat_indexing,$flat_verbose,$flat_outfile,$flat_format) |
210
|
|
|
|
|
|
|
= $self->_rearrange([qw(WRITE_FLAG DBNAME INDEX VERBOSE OUT FORMAT)],@_); |
211
|
|
|
|
|
|
|
|
212
|
1
|
50
|
|
|
|
5
|
$self->write_flag($flat_write_flag) if defined $flat_write_flag; |
213
|
|
|
|
|
|
|
|
214
|
1
|
50
|
|
|
|
3
|
if (defined $flat_indexing) { |
215
|
|
|
|
|
|
|
# very permissive |
216
|
1
|
50
|
|
|
|
3
|
$flat_indexing = 'BerkeleyDB/1' if $flat_indexing =~ /bdb/; |
217
|
1
|
50
|
|
|
|
5
|
$flat_indexing = 'flat/1' if $flat_indexing =~ /^(flat|binary)/; |
218
|
1
|
|
|
|
|
3
|
$self->indexing_scheme($flat_indexing); |
219
|
|
|
|
|
|
|
} |
220
|
|
|
|
|
|
|
|
221
|
1
|
50
|
|
|
|
2
|
$self->verbose($flat_verbose) if defined $flat_verbose; |
222
|
1
|
50
|
|
|
|
3
|
$self->dbname($dbname) if defined $dbname; |
223
|
1
|
50
|
|
|
|
2
|
$self->out_file($flat_outfile) if defined $flat_outfile; |
224
|
1
|
50
|
|
|
|
3
|
$self->file_format($flat_format) if defined $flat_format; |
225
|
|
|
|
|
|
|
} |
226
|
|
|
|
|
|
|
|
227
|
|
|
|
|
|
|
sub _set_namespaces { |
228
|
0
|
|
|
0
|
|
0
|
my $self = shift; |
229
|
|
|
|
|
|
|
|
230
|
|
|
|
|
|
|
$self->primary_namespace($self->default_primary_namespace) |
231
|
0
|
0
|
|
|
|
0
|
unless defined $self->{flat_primary_namespace}; |
232
|
|
|
|
|
|
|
|
233
|
|
|
|
|
|
|
$self->secondary_namespaces($self->default_secondary_namespaces) |
234
|
0
|
0
|
|
|
|
0
|
unless defined $self->{flat_secondary_namespaces}; |
235
|
|
|
|
|
|
|
|
236
|
|
|
|
|
|
|
$self->file_format($self->default_file_format) |
237
|
0
|
0
|
|
|
|
0
|
unless defined $self->{flat_format}; |
238
|
|
|
|
|
|
|
} |
239
|
|
|
|
|
|
|
|
240
|
|
|
|
|
|
|
=head2 new_from_registry |
241
|
|
|
|
|
|
|
|
242
|
|
|
|
|
|
|
Title : new_from_registry |
243
|
|
|
|
|
|
|
Usage : $db = Bio::DB::Flat->new_from_registry(%config) |
244
|
|
|
|
|
|
|
Function: creates a new Bio::DB::Flat object in a Bio::DB::Registry- |
245
|
|
|
|
|
|
|
compatible fashion |
246
|
|
|
|
|
|
|
Returns : new Bio::DB::Flat |
247
|
|
|
|
|
|
|
Args : provided by the registry, see below |
248
|
|
|
|
|
|
|
Status : Public |
249
|
|
|
|
|
|
|
|
250
|
|
|
|
|
|
|
The following registry-configuration tags are recognized: |
251
|
|
|
|
|
|
|
|
252
|
|
|
|
|
|
|
location Root of the indexed flat file; corresponds to the new() method's |
253
|
|
|
|
|
|
|
-directory argument. |
254
|
|
|
|
|
|
|
|
255
|
|
|
|
|
|
|
=cut |
256
|
|
|
|
|
|
|
|
257
|
|
|
|
|
|
|
sub new_from_registry { |
258
|
0
|
|
|
0
|
1
|
0
|
my ($self,%config) = @_; |
259
|
0
|
0
|
|
|
|
0
|
my $location = $config{'location'} or |
260
|
|
|
|
|
|
|
$self->throw('location tag must be specified.'); |
261
|
0
|
0
|
|
|
|
0
|
my $dbname = $config{'dbname'} or |
262
|
|
|
|
|
|
|
$self->throw('dbname tag must be specified.'); |
263
|
|
|
|
|
|
|
|
264
|
0
|
|
|
|
|
0
|
my $db = $self->new(-directory => $location, |
265
|
|
|
|
|
|
|
-dbname => $dbname, |
266
|
|
|
|
|
|
|
); |
267
|
0
|
|
|
|
|
0
|
$db; |
268
|
|
|
|
|
|
|
} |
269
|
|
|
|
|
|
|
|
270
|
|
|
|
|
|
|
# accessors |
271
|
|
|
|
|
|
|
sub directory { |
272
|
2
|
|
|
2
|
0
|
3
|
my $self = shift; |
273
|
2
|
|
|
|
|
2
|
my $d = $self->{flat_directory}; |
274
|
2
|
100
|
|
|
|
5
|
$self->{flat_directory} = shift if @_; |
275
|
2
|
|
|
|
|
3
|
$d; |
276
|
|
|
|
|
|
|
} |
277
|
|
|
|
|
|
|
sub write_flag { |
278
|
1
|
|
|
1
|
0
|
2
|
my $self = shift; |
279
|
1
|
|
|
|
|
1
|
my $d = $self->{flat_write_flag}; |
280
|
1
|
50
|
|
|
|
3
|
$self->{flat_write_flag} = shift if @_; |
281
|
1
|
|
|
|
|
1
|
$d; |
282
|
|
|
|
|
|
|
} |
283
|
|
|
|
|
|
|
sub verbose { |
284
|
2
|
|
|
2
|
1
|
3
|
my $self = shift; |
285
|
2
|
|
|
|
|
13
|
my $d = $self->{flat_verbose}; |
286
|
2
|
100
|
|
|
|
5
|
$self->{flat_verbose} = shift if @_; |
287
|
2
|
|
|
|
|
8
|
$d; |
288
|
|
|
|
|
|
|
} |
289
|
|
|
|
|
|
|
sub out_file { |
290
|
0
|
|
|
0
|
0
|
0
|
my $self = shift; |
291
|
0
|
|
|
|
|
0
|
my $d = $self->{flat_outfile}; |
292
|
0
|
0
|
|
|
|
0
|
$self->{flat_outfile} = shift if @_; |
293
|
0
|
|
|
|
|
0
|
$d; |
294
|
|
|
|
|
|
|
} |
295
|
|
|
|
|
|
|
sub dbname { |
296
|
3
|
|
|
3
|
0
|
3
|
my $self = shift; |
297
|
3
|
|
|
|
|
3
|
my $d = $self->{flat_dbname}; |
298
|
3
|
100
|
|
|
|
5
|
$self->{flat_dbname} = shift if @_; |
299
|
3
|
|
|
|
|
9
|
$d; |
300
|
|
|
|
|
|
|
} |
301
|
|
|
|
|
|
|
sub primary_namespace { |
302
|
0
|
|
|
0
|
0
|
0
|
my $self = shift; |
303
|
0
|
|
|
|
|
0
|
my $d = $self->{flat_primary_namespace}; |
304
|
0
|
0
|
|
|
|
0
|
$self->{flat_primary_namespace} = shift if @_; |
305
|
0
|
|
|
|
|
0
|
$d; |
306
|
|
|
|
|
|
|
} |
307
|
|
|
|
|
|
|
|
308
|
|
|
|
|
|
|
# get/set secondary namespace(s) |
309
|
|
|
|
|
|
|
# pass an array ref. |
310
|
|
|
|
|
|
|
# get an array ref in scalar context, list in list context. |
311
|
|
|
|
|
|
|
sub secondary_namespaces { |
312
|
0
|
|
|
0
|
0
|
0
|
my $self = shift; |
313
|
0
|
|
|
|
|
0
|
my $d = $self->{flat_secondary_namespaces}; |
314
|
0
|
0
|
|
|
|
0
|
$self->{flat_secondary_namespaces} = (ref($_[0]) eq 'ARRAY' ? shift : [@_]) if @_; |
|
|
0
|
|
|
|
|
|
315
|
0
|
0
|
|
|
|
0
|
return unless $d; |
316
|
0
|
0
|
0
|
|
|
0
|
$d = [$d] if $d && ref($d) ne 'ARRAY'; # just paranoia |
317
|
0
|
0
|
|
|
|
0
|
return wantarray ? @$d : $d; |
318
|
|
|
|
|
|
|
} |
319
|
|
|
|
|
|
|
|
320
|
|
|
|
|
|
|
# return the file format |
321
|
|
|
|
|
|
|
sub file_format { |
322
|
2
|
|
|
2
|
0
|
3
|
my $self = shift; |
323
|
2
|
|
|
|
|
1
|
my $d = $self->{flat_format}; |
324
|
2
|
100
|
|
|
|
4
|
$self->{flat_format} = shift if @_; |
325
|
2
|
|
|
|
|
2
|
$d; |
326
|
|
|
|
|
|
|
} |
327
|
|
|
|
|
|
|
|
328
|
|
|
|
|
|
|
# return the alphabet |
329
|
|
|
|
|
|
|
sub alphabet { |
330
|
0
|
|
|
0
|
0
|
0
|
my $self = shift; |
331
|
0
|
|
|
|
|
0
|
my $d = $self->{flat_alphabet}; |
332
|
0
|
0
|
|
|
|
0
|
$self->{flat_alphabet} = shift if @_; |
333
|
0
|
|
|
|
|
0
|
$d; |
334
|
|
|
|
|
|
|
} |
335
|
|
|
|
|
|
|
|
336
|
|
|
|
|
|
|
sub parse_one_record { |
337
|
0
|
|
|
0
|
0
|
0
|
my $self = shift; |
338
|
0
|
|
|
|
|
0
|
my $fh = shift; |
339
|
|
|
|
|
|
|
my $parser = |
340
|
0
|
|
0
|
|
|
0
|
$self->{cached_parsers}{fileno($fh)} |
341
|
|
|
|
|
|
|
||= Bio::SeqIO->new(-fh=>$fh,-format=>$self->default_file_format); |
342
|
0
|
0
|
|
|
|
0
|
my $seq = $parser->next_seq or return; |
343
|
0
|
|
0
|
|
|
0
|
$self->{flat_alphabet} ||= $seq->alphabet; |
344
|
0
|
|
|
|
|
0
|
my $ids = $self->seq_to_ids($seq); |
345
|
0
|
|
|
|
|
0
|
return $ids; |
346
|
|
|
|
|
|
|
} |
347
|
|
|
|
|
|
|
|
348
|
|
|
|
|
|
|
|
349
|
|
|
|
|
|
|
# return the indexing scheme |
350
|
|
|
|
|
|
|
sub indexing_scheme { |
351
|
4
|
|
|
4
|
0
|
4
|
my $self = shift; |
352
|
4
|
|
|
|
|
3
|
my $d = $self->{flat_indexing}; |
353
|
4
|
100
|
|
|
|
8
|
$self->{flat_indexing} = shift if @_; |
354
|
4
|
|
|
|
|
8
|
$d; |
355
|
|
|
|
|
|
|
} |
356
|
|
|
|
|
|
|
|
357
|
|
|
|
|
|
|
sub add_flat_file { |
358
|
0
|
|
|
0
|
0
|
0
|
my $self = shift; |
359
|
0
|
|
|
|
|
0
|
my ($file_path,$file_length,$nf) = @_; |
360
|
|
|
|
|
|
|
|
361
|
|
|
|
|
|
|
# check that file_path is absolute |
362
|
0
|
0
|
|
|
|
0
|
unless (File::Spec->file_name_is_absolute($file_path)) { |
363
|
0
|
|
|
|
|
0
|
$file_path = File::Spec->rel2abs($file_path); |
364
|
|
|
|
|
|
|
} |
365
|
|
|
|
|
|
|
|
366
|
0
|
0
|
|
|
|
0
|
-r $file_path or $self->throw("flat file $file_path cannot be read: $!"); |
367
|
|
|
|
|
|
|
|
368
|
0
|
|
|
|
|
0
|
my $current_size = -s _; |
369
|
0
|
0
|
|
|
|
0
|
if (defined $file_length) { |
370
|
0
|
0
|
|
|
|
0
|
$current_size == $file_length |
371
|
|
|
|
|
|
|
or $self->throw("flat file $file_path has changed size. Was $file_length bytes; now $current_size"); |
372
|
|
|
|
|
|
|
} else { |
373
|
0
|
|
|
|
|
0
|
$file_length = $current_size; |
374
|
|
|
|
|
|
|
} |
375
|
|
|
|
|
|
|
|
376
|
0
|
0
|
|
|
|
0
|
unless (defined $nf) { |
377
|
0
|
0
|
|
|
|
0
|
$self->{flat_file_index} = 0 unless exists $self->{flat_file_index}; |
378
|
0
|
|
|
|
|
0
|
$nf = $self->{flat_file_index}++; |
379
|
|
|
|
|
|
|
} |
380
|
0
|
|
|
|
|
0
|
$self->{flat_flat_file_path}{$nf} = $file_path; |
381
|
0
|
|
|
|
|
0
|
$self->{flat_flat_file_no}{$file_path} = $nf; |
382
|
0
|
|
|
|
|
0
|
$nf; |
383
|
|
|
|
|
|
|
} |
384
|
|
|
|
|
|
|
|
385
|
|
|
|
|
|
|
sub write_config { |
386
|
0
|
|
|
0
|
0
|
0
|
my $self = shift; |
387
|
0
|
0
|
|
|
|
0
|
$self->write_flag or $self->throw("cannot write configuration file because write_flag is not set"); |
388
|
0
|
|
|
|
|
0
|
my $path = $self->_config_path; |
389
|
|
|
|
|
|
|
|
390
|
0
|
0
|
|
|
|
0
|
open my $F, '>', $path or $self->throw("Could not write file '$path': $!"); |
391
|
|
|
|
|
|
|
|
392
|
0
|
|
|
|
|
0
|
my $index_type = $self->indexing_scheme; |
393
|
0
|
|
|
|
|
0
|
print $F "index\t$index_type\n"; |
394
|
|
|
|
|
|
|
|
395
|
0
|
|
|
|
|
0
|
my $format = $self->file_format; |
396
|
0
|
|
|
|
|
0
|
my $alphabet = $self->alphabet; |
397
|
0
|
0
|
|
|
|
0
|
my $alpha = $alphabet ? "/$alphabet" : ''; |
398
|
0
|
|
|
|
|
0
|
print $F "format\tURN:LSID:open-bio.org:${format}${alpha}\n"; |
399
|
|
|
|
|
|
|
|
400
|
0
|
0
|
|
|
|
0
|
my @filenos = $self->_filenos or $self->throw("cannot write config file because no flat files defined"); |
401
|
0
|
|
|
|
|
0
|
for my $nf (@filenos) { |
402
|
0
|
|
|
|
|
0
|
my $path = $self->{flat_flat_file_path}{$nf}; |
403
|
0
|
|
|
|
|
0
|
my $size = -s $path; |
404
|
0
|
|
|
|
|
0
|
print $F join("\t","fileid_$nf",$path,$size),"\n"; |
405
|
|
|
|
|
|
|
} |
406
|
|
|
|
|
|
|
|
407
|
|
|
|
|
|
|
# write primary namespace |
408
|
0
|
0
|
|
|
|
0
|
my $primary_ns = $self->primary_namespace |
409
|
|
|
|
|
|
|
or $self->throw('cannot write config file because no primary namespace defined'); |
410
|
|
|
|
|
|
|
|
411
|
0
|
|
|
|
|
0
|
print $F join("\t",'primary_namespace',$primary_ns),"\n"; |
412
|
|
|
|
|
|
|
|
413
|
|
|
|
|
|
|
# write secondary namespaces |
414
|
0
|
|
|
|
|
0
|
my @secondary = $self->secondary_namespaces; |
415
|
0
|
|
|
|
|
0
|
print $F join("\t",'secondary_namespaces',@secondary),"\n"; |
416
|
|
|
|
|
|
|
|
417
|
0
|
0
|
|
|
|
0
|
close $F or $self->throw("close error on $path: $!"); |
418
|
|
|
|
|
|
|
} |
419
|
|
|
|
|
|
|
|
420
|
|
|
|
|
|
|
sub files { |
421
|
0
|
|
|
0
|
0
|
0
|
my $self = shift; |
422
|
0
|
0
|
|
|
|
0
|
return unless $self->{flat_flat_file_no}; |
423
|
0
|
|
|
|
|
0
|
return keys %{$self->{flat_flat_file_no}}; |
|
0
|
|
|
|
|
0
|
|
424
|
|
|
|
|
|
|
} |
425
|
|
|
|
|
|
|
|
426
|
|
|
|
|
|
|
sub write_seq { |
427
|
0
|
|
|
0
|
0
|
0
|
my $self = shift; |
428
|
0
|
|
|
|
|
0
|
my $seq = shift; |
429
|
|
|
|
|
|
|
|
430
|
0
|
0
|
|
|
|
0
|
$self->write_flag or $self->throw("cannot write sequences because write_flag is not set"); |
431
|
|
|
|
|
|
|
|
432
|
0
|
0
|
|
|
|
0
|
my $file = $self->out_file or $self->throw('no outfile defined; use the -out argument to new()'); |
433
|
0
|
0
|
0
|
|
|
0
|
my $seqio = $self->{flat_cached_parsers}{$file} |
434
|
|
|
|
|
|
|
||= Bio::SeqIO->new(-Format => $self->file_format, |
435
|
|
|
|
|
|
|
-file => ">$file") |
436
|
|
|
|
|
|
|
or $self->throw("couldn't create Bio::SeqIO object"); |
437
|
|
|
|
|
|
|
|
438
|
0
|
0
|
|
|
|
0
|
my $fh = $seqio->_fh or $self->throw("couldn't get filehandle from Bio::SeqIO object"); |
439
|
0
|
|
|
|
|
0
|
my $offset = tell($fh); |
440
|
0
|
|
|
|
|
0
|
$seqio->write_seq($seq); |
441
|
0
|
|
|
|
|
0
|
my $length = tell($fh)-$offset; |
442
|
0
|
|
|
|
|
0
|
my $ids = $self->seq_to_ids($seq); |
443
|
0
|
|
|
|
|
0
|
$self->_store_index($ids,$file,$offset,$length); |
444
|
|
|
|
|
|
|
|
445
|
0
|
|
|
|
|
0
|
$self->{flat_outfile_dirty}++; |
446
|
|
|
|
|
|
|
} |
447
|
|
|
|
|
|
|
|
448
|
|
|
|
|
|
|
sub close { |
449
|
1
|
|
|
1
|
0
|
2
|
my $self = shift; |
450
|
1
|
50
|
|
|
|
13
|
return unless $self->{flat_outfile_dirty}; |
451
|
0
|
|
|
|
|
0
|
$self->write_config; |
452
|
0
|
|
|
|
|
0
|
delete $self->{flat_outfile_dirty}; |
453
|
0
|
|
|
|
|
0
|
delete $self->{flat_cached_parsers}{$self->out_file}; |
454
|
|
|
|
|
|
|
} |
455
|
|
|
|
|
|
|
|
456
|
|
|
|
|
|
|
|
457
|
|
|
|
|
|
|
sub _filenos { |
458
|
0
|
|
|
0
|
|
0
|
my $self = shift; |
459
|
0
|
0
|
|
|
|
0
|
return unless $self->{flat_flat_file_path}; |
460
|
0
|
|
|
|
|
0
|
return keys %{$self->{flat_flat_file_path}}; |
|
0
|
|
|
|
|
0
|
|
461
|
|
|
|
|
|
|
} |
462
|
|
|
|
|
|
|
|
463
|
|
|
|
|
|
|
# read the configuration file |
464
|
|
|
|
|
|
|
sub _read_config { |
465
|
1
|
|
|
1
|
|
2
|
my $self = shift; |
466
|
1
|
|
|
|
|
3
|
my $path = $self->_config_path; |
467
|
1
|
50
|
|
|
|
14
|
return unless -e $path; |
468
|
|
|
|
|
|
|
|
469
|
0
|
0
|
|
|
|
0
|
open my $F, '<', $path or $self->throw("Could not read file '$path': $!"); |
470
|
0
|
|
|
|
|
0
|
my %config; |
471
|
0
|
|
|
|
|
0
|
while (<$F>) { |
472
|
0
|
|
|
|
|
0
|
chomp; |
473
|
0
|
|
|
|
|
0
|
my ($tag,@values) = split "\t"; |
474
|
0
|
|
|
|
|
0
|
$config{$tag} = \@values; |
475
|
|
|
|
|
|
|
} |
476
|
0
|
0
|
|
|
|
0
|
CORE::close $F or $self->throw("close error on $path: $!"); |
477
|
|
|
|
|
|
|
|
478
|
0
|
0
|
|
|
|
0
|
$config{index}[0] =~ m~(flat/1|BerkeleyDB/1)~ |
479
|
|
|
|
|
|
|
or $self->throw("invalid configuration file $path: no index line"); |
480
|
|
|
|
|
|
|
|
481
|
0
|
|
|
|
|
0
|
$self->indexing_scheme($1); |
482
|
|
|
|
|
|
|
|
483
|
0
|
0
|
|
|
|
0
|
if ($config{format}) { |
484
|
|
|
|
|
|
|
# handle LSID format |
485
|
0
|
0
|
|
|
|
0
|
if ($config{format}[0] =~ /^URN:LSID:open-bio\.org:(\w+)(?:\/(\w+))/) { |
486
|
0
|
|
|
|
|
0
|
$self->file_format($1); |
487
|
0
|
|
|
|
|
0
|
$self->alphabet($2); |
488
|
|
|
|
|
|
|
} else { # compatibility with older versions |
489
|
0
|
|
|
|
|
0
|
$self->file_format($config{format}[0]); |
490
|
|
|
|
|
|
|
} |
491
|
|
|
|
|
|
|
} |
492
|
|
|
|
|
|
|
|
493
|
|
|
|
|
|
|
# set up primary namespace |
494
|
0
|
0
|
|
|
|
0
|
my $primary_namespace = $config{primary_namespace}[0] |
495
|
|
|
|
|
|
|
or $self->throw("invalid configuration file $path: no primary namespace defined"); |
496
|
0
|
|
|
|
|
0
|
$self->primary_namespace($primary_namespace); |
497
|
|
|
|
|
|
|
|
498
|
|
|
|
|
|
|
# set up secondary namespaces (may be empty) |
499
|
0
|
|
|
|
|
0
|
$self->secondary_namespaces($config{secondary_namespaces}); |
500
|
|
|
|
|
|
|
|
501
|
|
|
|
|
|
|
# get file paths and their normalization information |
502
|
0
|
0
|
|
|
|
0
|
my @normalized_files = grep {$_ ne ''} map {/^fileid_(\S+)/ && $1} keys %config; |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
503
|
0
|
|
|
|
|
0
|
for my $nf (@normalized_files) { |
504
|
0
|
|
|
|
|
0
|
my ($file_path,$file_length) = @{$config{"fileid_${nf}"}}; |
|
0
|
|
|
|
|
0
|
|
505
|
0
|
|
|
|
|
0
|
$self->add_flat_file($file_path,$file_length,$nf); |
506
|
|
|
|
|
|
|
} |
507
|
0
|
|
|
|
|
0
|
1; |
508
|
|
|
|
|
|
|
} |
509
|
|
|
|
|
|
|
|
510
|
|
|
|
|
|
|
|
511
|
|
|
|
|
|
|
sub _config_path { |
512
|
1
|
|
|
1
|
|
1
|
my $self = shift; |
513
|
1
|
|
|
|
|
3
|
$self->_catfile($self->_config_name); |
514
|
|
|
|
|
|
|
} |
515
|
|
|
|
|
|
|
|
516
|
|
|
|
|
|
|
sub _catfile { |
517
|
1
|
|
|
1
|
|
1
|
my $self = shift; |
518
|
1
|
|
|
|
|
2
|
my $component = shift; |
519
|
1
|
|
|
|
|
2
|
File::Spec->catfile($self->directory,$self->dbname,$component); |
520
|
|
|
|
|
|
|
} |
521
|
|
|
|
|
|
|
|
522
|
1
|
|
|
1
|
|
3
|
sub _config_name { CONFIG_FILE_NAME } |
523
|
|
|
|
|
|
|
|
524
|
|
|
|
|
|
|
sub _path2fileno { |
525
|
0
|
|
|
0
|
|
0
|
my $self = shift; |
526
|
0
|
|
|
|
|
0
|
my $path = shift; |
527
|
|
|
|
|
|
|
return $self->add_flat_file($path) |
528
|
0
|
0
|
|
|
|
0
|
unless exists $self->{flat_flat_file_no}{$path}; |
529
|
0
|
|
|
|
|
0
|
$self->{flat_flat_file_no}{$path}; |
530
|
|
|
|
|
|
|
} |
531
|
|
|
|
|
|
|
|
532
|
|
|
|
|
|
|
sub _fileno2path { |
533
|
0
|
|
|
0
|
|
0
|
my $self = shift; |
534
|
0
|
|
|
|
|
0
|
my $fileno = shift; |
535
|
0
|
|
|
|
|
0
|
$self->{flat_flat_file_path}{$fileno}; |
536
|
|
|
|
|
|
|
} |
537
|
|
|
|
|
|
|
|
538
|
|
|
|
|
|
|
sub _files { |
539
|
0
|
|
|
0
|
|
0
|
my $self = shift; |
540
|
0
|
|
|
|
|
0
|
my $paths = $self->{flat_flat_file_no}; |
541
|
0
|
|
|
|
|
0
|
return keys %$paths; |
542
|
|
|
|
|
|
|
} |
543
|
|
|
|
|
|
|
|
544
|
|
|
|
|
|
|
=head2 fetch |
545
|
|
|
|
|
|
|
|
546
|
|
|
|
|
|
|
Title : fetch |
547
|
|
|
|
|
|
|
Usage : $index->fetch( $id ) |
548
|
|
|
|
|
|
|
Function: Returns a Bio::Seq object from the index |
549
|
|
|
|
|
|
|
Example : $seq = $index->fetch( 'dJ67B12' ) |
550
|
|
|
|
|
|
|
Returns : Bio::Seq object |
551
|
|
|
|
|
|
|
Args : ID |
552
|
|
|
|
|
|
|
|
553
|
|
|
|
|
|
|
Deprecated. Use get_Seq_by_id instead. |
554
|
|
|
|
|
|
|
|
555
|
|
|
|
|
|
|
=cut |
556
|
|
|
|
|
|
|
|
557
|
0
|
|
|
0
|
1
|
0
|
sub fetch { shift->get_Seq_by_id(@_) } |
558
|
|
|
|
|
|
|
|
559
|
|
|
|
|
|
|
|
560
|
|
|
|
|
|
|
=head2 To Be Implemented in Subclasses |
561
|
|
|
|
|
|
|
|
562
|
|
|
|
|
|
|
The following methods MUST be implemented by subclasses. |
563
|
|
|
|
|
|
|
|
564
|
|
|
|
|
|
|
=cut |
565
|
|
|
|
|
|
|
|
566
|
|
|
|
|
|
|
# create real live Bio::Seq object |
567
|
|
|
|
|
|
|
sub get_Seq_by_id { |
568
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
569
|
0
|
|
|
|
|
0
|
my $id = shift; |
570
|
0
|
|
|
|
|
0
|
$self->throw_not_implemented; |
571
|
|
|
|
|
|
|
} |
572
|
|
|
|
|
|
|
|
573
|
|
|
|
|
|
|
|
574
|
|
|
|
|
|
|
# fetch array of Bio::Seq objects |
575
|
|
|
|
|
|
|
sub get_Seq_by_acc { |
576
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
577
|
0
|
0
|
|
|
|
0
|
return $self->get_Seq_by_id(shift) if @_ == 1; |
578
|
0
|
|
|
|
|
0
|
my ($ns,$key) = @_; |
579
|
|
|
|
|
|
|
|
580
|
0
|
|
|
|
|
0
|
$self->throw_not_implemented; |
581
|
|
|
|
|
|
|
} |
582
|
|
|
|
|
|
|
|
583
|
|
|
|
|
|
|
sub fetch_raw { |
584
|
0
|
|
|
0
|
0
|
0
|
my ($self,$id,$namespace) = @_; |
585
|
0
|
|
|
|
|
0
|
$self->throw_not_implemented; |
586
|
|
|
|
|
|
|
} |
587
|
|
|
|
|
|
|
|
588
|
|
|
|
|
|
|
sub default_file_format { |
589
|
0
|
|
|
0
|
0
|
0
|
my $self = shift; |
590
|
0
|
|
|
|
|
0
|
$self->throw_not_implemented; |
591
|
|
|
|
|
|
|
} |
592
|
|
|
|
|
|
|
|
593
|
|
|
|
|
|
|
sub _store_index { |
594
|
0
|
|
|
0
|
|
0
|
my $self = shift; |
595
|
0
|
|
|
|
|
0
|
my ($ids,$file,$offset,$length) = @_; |
596
|
0
|
|
|
|
|
0
|
$self->throw_not_implemented; |
597
|
|
|
|
|
|
|
} |
598
|
|
|
|
|
|
|
|
599
|
|
|
|
|
|
|
=head2 May Be Overridden in Subclasses |
600
|
|
|
|
|
|
|
|
601
|
|
|
|
|
|
|
The following methods MAY be overridden by subclasses. |
602
|
|
|
|
|
|
|
|
603
|
|
|
|
|
|
|
=cut |
604
|
|
|
|
|
|
|
|
605
|
|
|
|
|
|
|
sub default_primary_namespace { |
606
|
0
|
|
|
0
|
0
|
0
|
return "ACC"; |
607
|
|
|
|
|
|
|
} |
608
|
|
|
|
|
|
|
|
609
|
|
|
|
|
|
|
sub default_secondary_namespaces { |
610
|
0
|
|
|
0
|
0
|
0
|
return; |
611
|
|
|
|
|
|
|
} |
612
|
|
|
|
|
|
|
|
613
|
|
|
|
|
|
|
sub seq_to_ids { |
614
|
0
|
|
|
0
|
0
|
0
|
my $self = shift; |
615
|
0
|
|
|
|
|
0
|
my $seq = shift; |
616
|
0
|
|
|
|
|
0
|
my %ids; |
617
|
0
|
|
|
|
|
0
|
$ids{$self->primary_namespace} = $seq->accession_number; |
618
|
0
|
|
|
|
|
0
|
\%ids; |
619
|
|
|
|
|
|
|
} |
620
|
|
|
|
|
|
|
|
621
|
|
|
|
|
|
|
sub DESTROY { |
622
|
1
|
|
|
1
|
|
1
|
my $self = shift; |
623
|
1
|
|
|
|
|
3
|
$self->close; |
624
|
|
|
|
|
|
|
} |
625
|
|
|
|
|
|
|
|
626
|
|
|
|
|
|
|
|
627
|
|
|
|
|
|
|
1; |