line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Bio::Phylo::Forest::DBTree; |
2
|
2
|
|
|
2
|
|
6515
|
use strict; |
|
2
|
|
|
|
|
4
|
|
|
2
|
|
|
|
|
50
|
|
3
|
2
|
|
|
2
|
|
8
|
use warnings; |
|
2
|
|
|
|
|
2
|
|
|
2
|
|
|
|
|
44
|
|
4
|
2
|
|
|
2
|
|
2022
|
use DBI; |
|
2
|
|
|
|
|
24791
|
|
|
2
|
|
|
|
|
112
|
|
5
|
2
|
|
|
2
|
|
713
|
use Bio::Phylo::Factory; |
|
2
|
|
|
|
|
15289
|
|
|
2
|
|
|
|
|
10
|
|
6
|
2
|
|
|
2
|
|
63
|
use Bio::Phylo::Util::Exceptions 'throw'; |
|
2
|
|
|
|
|
3
|
|
|
2
|
|
|
|
|
76
|
|
7
|
2
|
|
|
2
|
|
9
|
use base 'DBIx::Class::Schema'; |
|
2
|
|
|
|
|
4
|
|
|
2
|
|
|
|
|
801
|
|
8
|
2
|
|
|
2
|
|
88229
|
use base 'Bio::Phylo::Forest::Tree'; |
|
2
|
|
|
|
|
7
|
|
|
2
|
|
|
|
|
900
|
|
9
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
__PACKAGE__->load_namespaces; |
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
my $SINGLETON; |
13
|
|
|
|
|
|
|
my $DBH; |
14
|
|
|
|
|
|
|
my $fac = Bio::Phylo::Factory->new; |
15
|
2
|
|
|
2
|
|
133500
|
use version 0.77; our $VERSION = qv("v0.1.2"); |
|
2
|
|
|
|
|
2721
|
|
|
2
|
|
|
|
|
17
|
|
16
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
=head1 NAME |
18
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
Bio::Phylo::Forest::DBTree - Phylogenetic database as a tree object |
20
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
=head1 SYNOPSIS |
22
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
use Bio::Phylo::Forest::DBTree; |
24
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
# connect to the Green Genes tree |
26
|
|
|
|
|
|
|
my $file = 'gg_13_5_otus_99_annotated.db'; |
27
|
|
|
|
|
|
|
my $dbtree = Bio::Phylo::Forest::DBTree->connect($file); |
28
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
# $dbtree can be used as a Bio::Phylo::Forest::Tree object, |
30
|
|
|
|
|
|
|
# and the node objects that are returned can be used as |
31
|
|
|
|
|
|
|
# Bio::Phylo::Forest::Node objects |
32
|
|
|
|
|
|
|
my $root = $dbtree->get_root; |
33
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
=head1 DESCRIPTION |
35
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
This package provides the functionality to handle very large phylogenies (examples: the |
37
|
|
|
|
|
|
|
NCBI taxonomy, the Green Genes tree) as if they are L tree objects, with all |
38
|
|
|
|
|
|
|
the possibilities for traversal, computation, serialization, and visualization, but stored |
39
|
|
|
|
|
|
|
in a SQLite database. These databases are single files, so that they can be easily shared. |
40
|
|
|
|
|
|
|
Some useful database files are available here: |
41
|
|
|
|
|
|
|
https://figshare.com/account/home#/projects/18808 |
42
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
To make new tree databases, a number of scripts are provided with the distribution of this |
44
|
|
|
|
|
|
|
package: |
45
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
=over |
47
|
|
|
|
|
|
|
|
48
|
|
|
|
|
|
|
=item * C Loads a very large Newick tree into a database. |
49
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
=item * C Loads the NCBI taxonomy dump into a database. |
51
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
=item * C Loads a tree in the format of L |
53
|
|
|
|
|
|
|
into a database. |
54
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
=back |
56
|
|
|
|
|
|
|
|
57
|
|
|
|
|
|
|
As an example of interacting with a database tree, the script C can be |
58
|
|
|
|
|
|
|
used to extract subtrees from a database. |
59
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
=head1 DATABASE METHODS |
61
|
|
|
|
|
|
|
|
62
|
|
|
|
|
|
|
The following methods deal with the database as a whole: creating a new database, |
63
|
|
|
|
|
|
|
connecting to an existing one, persisting a tree in a database and extracting one as a |
64
|
|
|
|
|
|
|
mutable, in-memory object. |
65
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
=head2 create() |
67
|
|
|
|
|
|
|
|
68
|
|
|
|
|
|
|
Creates a SQLite database file in the provided location. Usage: |
69
|
|
|
|
|
|
|
|
70
|
|
|
|
|
|
|
use Bio::Phylo::Forest::DBTree; |
71
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
# second argument is optional |
73
|
|
|
|
|
|
|
Bio::Phylo::Forest::DBTree->create( $file, '/opt/local/bin/sqlite3' ); |
74
|
|
|
|
|
|
|
|
75
|
|
|
|
|
|
|
The first argument is the location where the database file is going to be created. The |
76
|
|
|
|
|
|
|
second argument is optional, and provides the location of the C executable that |
77
|
|
|
|
|
|
|
is used to create the database. By default, the C is simply found on the |
78
|
|
|
|
|
|
|
C<$PATH>, but if it is installed in a non-standard location that location can be provided |
79
|
|
|
|
|
|
|
here. The database schema that is created corresponds to the following SQL statements: |
80
|
|
|
|
|
|
|
|
81
|
|
|
|
|
|
|
create table node( |
82
|
|
|
|
|
|
|
id int not null, |
83
|
|
|
|
|
|
|
parent int, |
84
|
|
|
|
|
|
|
left int, |
85
|
|
|
|
|
|
|
right int, |
86
|
|
|
|
|
|
|
name varchar(20), |
87
|
|
|
|
|
|
|
length float, |
88
|
|
|
|
|
|
|
height float, |
89
|
|
|
|
|
|
|
primary key(id) |
90
|
|
|
|
|
|
|
); |
91
|
|
|
|
|
|
|
create index parent_idx on node(parent); |
92
|
|
|
|
|
|
|
create index left_idx on node(left); |
93
|
|
|
|
|
|
|
create index right_idx on node(right); |
94
|
|
|
|
|
|
|
create index name_idx on node(name); |
95
|
|
|
|
|
|
|
|
96
|
|
|
|
|
|
|
=cut |
97
|
|
|
|
|
|
|
|
98
|
|
|
|
|
|
|
sub create { |
99
|
0
|
|
|
0
|
1
|
0
|
my $class = shift; |
100
|
0
|
|
|
|
|
0
|
my $file = shift; |
101
|
0
|
|
0
|
|
|
0
|
my $sqlite3 = shift || 'sqlite3'; |
102
|
0
|
|
|
|
|
0
|
my $command = do { local $/; }; |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
103
|
0
|
0
|
|
|
|
0
|
system("echo '$command' | sqlite3 '$file'") == 0 or die 'Create failed!'; |
104
|
|
|
|
|
|
|
} |
105
|
|
|
|
|
|
|
|
106
|
|
|
|
|
|
|
=head2 connect() |
107
|
|
|
|
|
|
|
|
108
|
|
|
|
|
|
|
Connects to a SQLite database file, returns the connection as a |
109
|
|
|
|
|
|
|
C object. Usage: |
110
|
|
|
|
|
|
|
|
111
|
|
|
|
|
|
|
use Bio::Phylo::Forest::DBTree; |
112
|
|
|
|
|
|
|
my $dbtree = Bio::Phylo::Forest::DBTree->connect($file); |
113
|
|
|
|
|
|
|
|
114
|
|
|
|
|
|
|
The argument is a file name. If the file exists, a L database handle to that |
115
|
|
|
|
|
|
|
file is returned. If the file does not exist, a new database is created in that location, |
116
|
|
|
|
|
|
|
and subsequently the handle to that newly created database is returned. The creation of |
117
|
|
|
|
|
|
|
the database is handled by the C method (see below). |
118
|
|
|
|
|
|
|
|
119
|
|
|
|
|
|
|
=cut |
120
|
|
|
|
|
|
|
|
121
|
|
|
|
|
|
|
sub connect { |
122
|
2
|
|
|
2
|
1
|
534
|
my $class = shift; |
123
|
2
|
|
|
|
|
5
|
my $file = shift; |
124
|
2
|
100
|
|
|
|
10
|
if ( not $SINGLETON ) { |
125
|
|
|
|
|
|
|
|
126
|
|
|
|
|
|
|
# create if not exist |
127
|
1
|
50
|
|
|
|
37
|
if ( not -e $file ) { |
128
|
0
|
|
|
|
|
0
|
$class->create($file); |
129
|
|
|
|
|
|
|
} |
130
|
|
|
|
|
|
|
|
131
|
|
|
|
|
|
|
# fuck it, let's just hardcode it here - Yeehaw! |
132
|
1
|
|
|
|
|
3
|
my $dsn = "dbi:SQLite:dbname=$file"; |
133
|
1
|
|
|
|
|
8
|
$DBH = DBI->connect($dsn,'',''); |
134
|
1
|
|
|
|
|
8473
|
$DBH->{'RaiseError'} = 1; |
135
|
1
|
|
|
1
|
|
17
|
$SINGLETON = $class->SUPER::connect( sub { $DBH } ); |
|
1
|
|
|
|
|
140071
|
|
136
|
|
|
|
|
|
|
} |
137
|
2
|
|
|
|
|
55779
|
return $SINGLETON; |
138
|
|
|
|
|
|
|
} |
139
|
|
|
|
|
|
|
|
140
|
|
|
|
|
|
|
=head2 persist() |
141
|
|
|
|
|
|
|
|
142
|
|
|
|
|
|
|
Persist a phylogenetic tree object (a subclass of L) into a |
143
|
|
|
|
|
|
|
newly created database file. Usage: |
144
|
|
|
|
|
|
|
|
145
|
|
|
|
|
|
|
use Bio::Phylo::Forest::DBTree; |
146
|
|
|
|
|
|
|
my $dbtree = Bio::Phylo::Forest::DBTree->persist( |
147
|
|
|
|
|
|
|
-file => $file, |
148
|
|
|
|
|
|
|
-tree => $tree, |
149
|
|
|
|
|
|
|
); |
150
|
|
|
|
|
|
|
|
151
|
|
|
|
|
|
|
This method first create a database at the location specified by C<$file> by making a call |
152
|
|
|
|
|
|
|
to the C method. Subsequently, the C<$tree> object is traversed from root to |
153
|
|
|
|
|
|
|
tips and inserted in the newly created database. Finally, the handle to this database is |
154
|
|
|
|
|
|
|
returned, i.e. a C object. |
155
|
|
|
|
|
|
|
|
156
|
|
|
|
|
|
|
=cut |
157
|
|
|
|
|
|
|
|
158
|
|
|
|
|
|
|
sub persist { |
159
|
0
|
|
|
0
|
1
|
0
|
my ( $class, %args ) = @_; |
160
|
|
|
|
|
|
|
|
161
|
|
|
|
|
|
|
# need a file argument to write to |
162
|
0
|
0
|
|
|
|
0
|
if ( not $args{'-file'} ) { |
163
|
0
|
|
|
|
|
0
|
throw 'BadArgs' => "Need -file argument!"; |
164
|
|
|
|
|
|
|
} |
165
|
|
|
|
|
|
|
|
166
|
|
|
|
|
|
|
# need a tree argument to persis |
167
|
0
|
0
|
|
|
|
0
|
if ( not $args{'-tree'} ) { |
168
|
0
|
|
|
|
|
0
|
throw 'BadArgs' => "Need -tree argument!"; |
169
|
|
|
|
|
|
|
} |
170
|
|
|
|
|
|
|
|
171
|
|
|
|
|
|
|
# create a new database, prepare statement handler |
172
|
0
|
|
|
|
|
0
|
$class->create( $args{'-file'} ); |
173
|
0
|
|
|
|
|
0
|
my $dsn = 'dbi:SQLite:dbname=' . $args{'-file'}; |
174
|
0
|
|
|
|
|
0
|
my $dbh = DBI->connect($dsn,'',''); |
175
|
0
|
|
|
|
|
0
|
$dbh->{'RaiseError'} = 1; |
176
|
0
|
|
|
0
|
|
0
|
my $db = $class->SUPER::connect( sub { $dbh } ); |
|
0
|
|
|
|
|
0
|
|
177
|
0
|
|
|
|
|
0
|
my $sth = $dbh->prepare("insert into node values(?,?,?,?)"); |
178
|
|
|
|
|
|
|
|
179
|
|
|
|
|
|
|
# start traversing |
180
|
0
|
|
|
|
|
0
|
my $counter = 2; |
181
|
0
|
|
|
|
|
0
|
my %idmap; |
182
|
|
|
|
|
|
|
$args{'-tree'}->visit_depth_first( |
183
|
|
|
|
|
|
|
'-pre' => sub { |
184
|
0
|
|
|
0
|
|
0
|
my $node = shift; |
185
|
0
|
|
|
|
|
0
|
my $id = $node->get_id; |
186
|
0
|
|
|
|
|
0
|
$idmap{$id} = $counter++; |
187
|
|
|
|
|
|
|
|
188
|
|
|
|
|
|
|
# get the parent id, or "1" if root |
189
|
0
|
|
|
|
|
0
|
my $parent_id; |
190
|
0
|
0
|
|
|
|
0
|
if ( my $parent = $node->get_parent ) { |
191
|
0
|
|
|
|
|
0
|
my $pid = $parent->get_id; |
192
|
0
|
|
|
|
|
0
|
$parent_id = $idmap{$pid}; |
193
|
|
|
|
|
|
|
} |
194
|
|
|
|
|
|
|
else { |
195
|
0
|
|
|
|
|
0
|
$parent_id = 1; |
196
|
|
|
|
|
|
|
} |
197
|
|
|
|
|
|
|
|
198
|
|
|
|
|
|
|
# do the insertion |
199
|
|
|
|
|
|
|
$sth->execute( |
200
|
0
|
|
|
|
|
0
|
$idmap{$id}, # primary key |
201
|
|
|
|
|
|
|
$parent_id, # self-joining foreign key |
202
|
|
|
|
|
|
|
undef, # not indexed yet |
203
|
|
|
|
|
|
|
undef, # not indexed yet |
204
|
|
|
|
|
|
|
$node->get_internal_name, # node label or taxon name |
205
|
|
|
|
|
|
|
$node->get_branch_length, # branch length |
206
|
|
|
|
|
|
|
undef # not computed yet |
207
|
|
|
|
|
|
|
); |
208
|
|
|
|
|
|
|
} |
209
|
0
|
|
|
|
|
0
|
); |
210
|
0
|
|
|
|
|
0
|
my $i = 0; |
211
|
0
|
|
|
|
|
0
|
$db->get_root->_index(\$i,0); |
212
|
0
|
|
|
|
|
0
|
return $db; |
213
|
|
|
|
|
|
|
} |
214
|
|
|
|
|
|
|
|
215
|
|
|
|
|
|
|
=head2 extract() |
216
|
|
|
|
|
|
|
|
217
|
|
|
|
|
|
|
Extracts a tree from a database. The returned tree is an in-memory object. Hence, this is |
218
|
|
|
|
|
|
|
an expensive operation that is best avoided as much as possible. Usage: |
219
|
|
|
|
|
|
|
|
220
|
|
|
|
|
|
|
my $tree = $dbtree->extract; |
221
|
|
|
|
|
|
|
|
222
|
|
|
|
|
|
|
=cut |
223
|
|
|
|
|
|
|
|
224
|
|
|
|
|
|
|
sub extract { |
225
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
226
|
0
|
|
|
|
|
0
|
my $tree = $fac->create_tree; |
227
|
0
|
|
|
|
|
0
|
my $root = $self->get_root; |
228
|
0
|
|
|
|
|
0
|
_clone_mutable( |
229
|
|
|
|
|
|
|
$fac->create_node( |
230
|
|
|
|
|
|
|
'-name' => $root->get_name, |
231
|
|
|
|
|
|
|
'-branch_length' => $root->get_branch_length, |
232
|
|
|
|
|
|
|
), |
233
|
|
|
|
|
|
|
$root, |
234
|
|
|
|
|
|
|
$tree |
235
|
|
|
|
|
|
|
); |
236
|
0
|
|
|
|
|
0
|
return $tree; |
237
|
|
|
|
|
|
|
} |
238
|
|
|
|
|
|
|
|
239
|
|
|
|
|
|
|
{ |
240
|
2
|
|
|
2
|
|
1046
|
no warnings 'recursion'; |
|
2
|
|
|
|
|
5
|
|
|
2
|
|
|
|
|
501
|
|
241
|
|
|
|
|
|
|
sub _clone_mutable { |
242
|
0
|
|
|
0
|
|
0
|
my ( $parent, $template, $tree ) = @_; |
243
|
0
|
|
|
|
|
0
|
$tree->insert($parent); |
244
|
0
|
|
|
|
|
0
|
for my $child ( @{ $template->get_children } ) { |
|
0
|
|
|
|
|
0
|
|
245
|
0
|
|
|
|
|
0
|
_clone_mutable( |
246
|
|
|
|
|
|
|
$fac->create_node( |
247
|
|
|
|
|
|
|
'-name' => $child->get_name, |
248
|
|
|
|
|
|
|
'-branch_length' => $child->get_branch_length, |
249
|
|
|
|
|
|
|
'-parent' => $parent, |
250
|
|
|
|
|
|
|
), |
251
|
|
|
|
|
|
|
$child, |
252
|
|
|
|
|
|
|
$tree |
253
|
|
|
|
|
|
|
); |
254
|
|
|
|
|
|
|
} |
255
|
|
|
|
|
|
|
} |
256
|
|
|
|
|
|
|
} |
257
|
|
|
|
|
|
|
|
258
|
|
|
|
|
|
|
=head2 dbh() |
259
|
|
|
|
|
|
|
|
260
|
|
|
|
|
|
|
Returns the underlying handle through which SQL statements can be executed directly on the |
261
|
|
|
|
|
|
|
database. This is a L object. Usage: |
262
|
|
|
|
|
|
|
|
263
|
|
|
|
|
|
|
my $dbh = $dbtree->dbh; |
264
|
|
|
|
|
|
|
|
265
|
|
|
|
|
|
|
=cut |
266
|
|
|
|
|
|
|
|
267
|
0
|
|
|
0
|
1
|
0
|
sub dbh { $DBH } |
268
|
|
|
|
|
|
|
|
269
|
|
|
|
|
|
|
=head1 TREE METHODS |
270
|
|
|
|
|
|
|
|
271
|
|
|
|
|
|
|
The following methods are implemented here to override methods of the same name in the |
272
|
|
|
|
|
|
|
L hierarchy so that the tree database is accessed more efficiently than |
273
|
|
|
|
|
|
|
otherwise would be the case. |
274
|
|
|
|
|
|
|
|
275
|
|
|
|
|
|
|
=head2 get_root() |
276
|
|
|
|
|
|
|
|
277
|
|
|
|
|
|
|
Returns the root of the tree, i.e. a L object, |
278
|
|
|
|
|
|
|
which is a subclass of L. Usage: |
279
|
|
|
|
|
|
|
|
280
|
|
|
|
|
|
|
my $root = $dbtree->get_root; |
281
|
|
|
|
|
|
|
|
282
|
|
|
|
|
|
|
=cut |
283
|
|
|
|
|
|
|
|
284
|
|
|
|
|
|
|
sub get_root { |
285
|
|
|
|
|
|
|
shift->_rs->search( |
286
|
1
|
|
|
1
|
1
|
533
|
{ 'parent' => 1 }, |
287
|
|
|
|
|
|
|
{ |
288
|
|
|
|
|
|
|
'order_by' => 'id', |
289
|
|
|
|
|
|
|
'rows' => 1, |
290
|
|
|
|
|
|
|
} |
291
|
|
|
|
|
|
|
)->single |
292
|
|
|
|
|
|
|
} |
293
|
|
|
|
|
|
|
|
294
|
|
|
|
|
|
|
=head2 get_id() |
295
|
|
|
|
|
|
|
|
296
|
|
|
|
|
|
|
Returns a dummy ID, an integer. Usage: |
297
|
|
|
|
|
|
|
|
298
|
|
|
|
|
|
|
my $id = $dbtree->get_id; |
299
|
|
|
|
|
|
|
|
300
|
|
|
|
|
|
|
=cut |
301
|
|
|
|
|
|
|
|
302
|
0
|
|
|
0
|
1
|
0
|
sub get_id { 0 } |
303
|
|
|
|
|
|
|
|
304
|
|
|
|
|
|
|
=head2 get_by_name() |
305
|
|
|
|
|
|
|
|
306
|
|
|
|
|
|
|
Returns the first node object that has the provided name. Usage: |
307
|
|
|
|
|
|
|
|
308
|
|
|
|
|
|
|
my $node = $dbtree->get_by_name( 'Homo sapiens' ); |
309
|
|
|
|
|
|
|
|
310
|
|
|
|
|
|
|
=cut |
311
|
|
|
|
|
|
|
|
312
|
|
|
|
|
|
|
sub get_by_name { |
313
|
6
|
|
|
6
|
1
|
48553
|
my ( $self, $name ) = @_; |
314
|
6
|
|
|
|
|
20
|
return $self->_rs->search({ 'name' => $name })->single; |
315
|
|
|
|
|
|
|
} |
316
|
|
|
|
|
|
|
|
317
|
|
|
|
|
|
|
=head2 visit() |
318
|
|
|
|
|
|
|
|
319
|
|
|
|
|
|
|
Given a code reference, visits all the nodes in the tree and executes the code on the |
320
|
|
|
|
|
|
|
focal node. Usage: |
321
|
|
|
|
|
|
|
|
322
|
|
|
|
|
|
|
$dbtree->visit(sub{ |
323
|
|
|
|
|
|
|
my $node = shift; |
324
|
|
|
|
|
|
|
print $node->name, "\n"; |
325
|
|
|
|
|
|
|
}); |
326
|
|
|
|
|
|
|
|
327
|
|
|
|
|
|
|
=cut |
328
|
|
|
|
|
|
|
|
329
|
|
|
|
|
|
|
sub visit { |
330
|
0
|
|
|
0
|
1
|
0
|
my ( $self, $code ) = @_; |
331
|
0
|
|
|
|
|
0
|
my $rs = $self->_rs; |
332
|
0
|
|
|
|
|
0
|
while( my $node = $rs->next ) { |
333
|
0
|
|
|
|
|
0
|
$code->($node); |
334
|
|
|
|
|
|
|
} |
335
|
0
|
|
|
|
|
0
|
return $self; |
336
|
|
|
|
|
|
|
} |
337
|
|
|
|
|
|
|
|
338
|
7
|
|
|
7
|
|
34
|
sub _rs { shift->resultset('Node') } |
339
|
|
|
|
|
|
|
|
340
|
|
|
|
|
|
|
1; |
341
|
|
|
|
|
|
|
|
342
|
|
|
|
|
|
|
__DATA__ |