line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Ace::Sequence; |
2
|
1
|
|
|
1
|
|
3411
|
use strict; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
33
|
|
3
|
|
|
|
|
|
|
|
4
|
1
|
|
|
1
|
|
5
|
use Carp; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
96
|
|
5
|
1
|
|
|
1
|
|
6
|
use strict; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
34
|
|
6
|
1
|
|
|
1
|
|
857
|
use Ace 1.50 qw(:DEFAULT rearrange); |
|
1
|
|
|
|
|
37
|
|
|
1
|
|
|
|
|
195
|
|
7
|
1
|
|
|
1
|
|
836
|
use Ace::Sequence::FeatureList; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
29
|
|
8
|
1
|
|
|
1
|
|
653
|
use Ace::Sequence::Feature; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
35
|
|
9
|
1
|
|
|
1
|
|
7
|
use AutoLoader 'AUTOLOAD'; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
6
|
|
10
|
1
|
|
|
1
|
|
28
|
use vars '$VERSION'; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
45
|
|
11
|
|
|
|
|
|
|
my %CACHE; |
12
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
$VERSION = '1.51'; |
14
|
|
|
|
|
|
|
|
15
|
1
|
|
|
1
|
|
5
|
use constant CACHE => 1; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
72
|
|
16
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
use overload |
18
|
1
|
|
|
|
|
6
|
'""' => 'asString', |
19
|
|
|
|
|
|
|
cmp => 'cmp', |
20
|
1
|
|
|
1
|
|
6
|
; |
|
1
|
|
|
|
|
2
|
|
21
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
# synonym: stop = end |
23
|
|
|
|
|
|
|
*stop = \&end; |
24
|
|
|
|
|
|
|
*abs = \&absolute; |
25
|
|
|
|
|
|
|
*source_seq = \&source; |
26
|
|
|
|
|
|
|
*source_tag = \&subtype; |
27
|
|
|
|
|
|
|
*primary_tag = \&type; |
28
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
my %plusminus = ( '+' => '-', |
30
|
|
|
|
|
|
|
'-' => '+', |
31
|
|
|
|
|
|
|
'.' => '.'); |
32
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
# internal keys |
34
|
|
|
|
|
|
|
# parent => reference Sequence in "+" strand |
35
|
|
|
|
|
|
|
# p_offset => our start in the parent |
36
|
|
|
|
|
|
|
# length => our length |
37
|
|
|
|
|
|
|
# strand => our strand (+ or -) |
38
|
|
|
|
|
|
|
# refseq => reference Sequence for coordinate system |
39
|
|
|
|
|
|
|
|
40
|
|
|
|
|
|
|
# object constructor |
41
|
|
|
|
|
|
|
# usually called like this: |
42
|
|
|
|
|
|
|
# $seq = Ace::Sequence->new($object); |
43
|
|
|
|
|
|
|
# but can be called like this: |
44
|
|
|
|
|
|
|
# $seq = Ace::Sequence->new(-db=>$db,-name=>$name); |
45
|
|
|
|
|
|
|
# or |
46
|
|
|
|
|
|
|
# $seq = Ace::Sequence->new(-seq => $object, |
47
|
|
|
|
|
|
|
# -offset => $offset, |
48
|
|
|
|
|
|
|
# -length => $length, |
49
|
|
|
|
|
|
|
# -ref => $refseq |
50
|
|
|
|
|
|
|
# ); |
51
|
|
|
|
|
|
|
# $refseq, if provided, will be used to establish the coordinate |
52
|
|
|
|
|
|
|
# system. Otherwise the first base pair will be set to 1. |
53
|
|
|
|
|
|
|
sub new { |
54
|
0
|
|
|
0
|
0
|
|
my $pack = shift; |
55
|
0
|
|
|
|
|
|
my ($seq,$start,$end,$offset,$length,$refseq,$db) = |
56
|
|
|
|
|
|
|
rearrange([ |
57
|
|
|
|
|
|
|
['SEQ','SEQUENCE','SOURCE'], |
58
|
|
|
|
|
|
|
'START', |
59
|
|
|
|
|
|
|
['END','STOP'], |
60
|
|
|
|
|
|
|
['OFFSET','OFF'], |
61
|
|
|
|
|
|
|
['LENGTH','LEN'], |
62
|
|
|
|
|
|
|
'REFSEQ', |
63
|
|
|
|
|
|
|
['DATABASE','DB'], |
64
|
|
|
|
|
|
|
],@_); |
65
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
# Object must have a parent sequence and/or a reference |
67
|
|
|
|
|
|
|
# sequence. In some cases, the parent sequence will be the |
68
|
|
|
|
|
|
|
# object itself. The reference sequence is used to set up |
69
|
|
|
|
|
|
|
# the frame of reference for the coordinate system. |
70
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
# fetch the sequence object if we don't have it already |
72
|
0
|
0
|
0
|
|
|
|
croak "Please provide either a Sequence object or a database and name" |
|
|
|
0
|
|
|
|
|
73
|
|
|
|
|
|
|
unless ref($seq) || ($seq && $db); |
74
|
|
|
|
|
|
|
|
75
|
|
|
|
|
|
|
# convert start into offset |
76
|
0
|
0
|
0
|
|
|
|
$offset = $start - 1 if defined($start) and !defined($offset); |
77
|
|
|
|
|
|
|
|
78
|
|
|
|
|
|
|
# convert stop/end into length |
79
|
0
|
0
|
0
|
|
|
|
$length = ($end > $start) ? $end - $offset : $end - $offset - 2 |
|
|
0
|
|
|
|
|
|
80
|
|
|
|
|
|
|
if defined($end) && !defined($length); |
81
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
# if just a string is passed, try to fetch a Sequence object |
83
|
0
|
0
|
|
|
|
|
my $obj = ref($seq) ? $seq : $db->fetch('Sequence'=>$seq); |
84
|
0
|
0
|
|
|
|
|
unless ($obj) { |
85
|
0
|
|
|
|
|
|
Ace->error("No Sequence named $obj found in database"); |
86
|
0
|
|
|
|
|
|
return; |
87
|
|
|
|
|
|
|
} |
88
|
|
|
|
|
|
|
|
89
|
|
|
|
|
|
|
# get parent coordinates and length of this sequence |
90
|
|
|
|
|
|
|
# the parent is an Ace Sequence object in the "+" strand |
91
|
0
|
|
|
|
|
|
my ($parent,$p_offset,$p_length,$strand) = find_parent($obj); |
92
|
0
|
0
|
|
|
|
|
return unless $parent; |
93
|
|
|
|
|
|
|
|
94
|
|
|
|
|
|
|
# handle negative strands |
95
|
0
|
|
|
|
|
|
my $r_strand = $strand; |
96
|
0
|
|
|
|
|
|
my $r_offset = $p_offset; |
97
|
0
|
|
0
|
|
|
|
$offset ||= 0; |
98
|
0
|
0
|
|
|
|
|
$offset *= -1 if $strand < 0; |
99
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
# handle feature objects |
101
|
0
|
0
|
|
|
|
|
$offset += $obj->offset if $obj->can('smapped'); |
102
|
|
|
|
|
|
|
|
103
|
|
|
|
|
|
|
# get source |
104
|
0
|
0
|
|
|
|
|
my $source = $obj->can('smapped') ? $obj->source : $obj; |
105
|
|
|
|
|
|
|
|
106
|
|
|
|
|
|
|
# store the object into our instance variables |
107
|
0
|
|
0
|
|
|
|
my $self = bless { |
108
|
|
|
|
|
|
|
obj => $source, |
109
|
|
|
|
|
|
|
offset => $offset, |
110
|
|
|
|
|
|
|
length => $length || $p_length, |
111
|
|
|
|
|
|
|
parent => $parent, |
112
|
|
|
|
|
|
|
p_offset => $p_offset, |
113
|
|
|
|
|
|
|
refseq => [$source,$r_offset,$r_strand], |
114
|
|
|
|
|
|
|
strand => $strand, |
115
|
|
|
|
|
|
|
absolute => 0, |
116
|
|
|
|
|
|
|
automerge => 1, |
117
|
|
|
|
|
|
|
},$pack; |
118
|
|
|
|
|
|
|
|
119
|
|
|
|
|
|
|
# set the reference sequence |
120
|
0
|
0
|
0
|
|
|
|
eval { $self->refseq($refseq) } or return if defined $refseq; |
|
0
|
|
|
|
|
|
|
121
|
|
|
|
|
|
|
|
122
|
|
|
|
|
|
|
# wheww! |
123
|
0
|
|
|
|
|
|
return $self; |
124
|
|
|
|
|
|
|
} |
125
|
|
|
|
|
|
|
|
126
|
|
|
|
|
|
|
# return the "source" object that the user offset from |
127
|
|
|
|
|
|
|
sub source { |
128
|
0
|
|
|
0
|
0
|
|
$_[0]->{obj}; |
129
|
|
|
|
|
|
|
} |
130
|
|
|
|
|
|
|
|
131
|
|
|
|
|
|
|
# return the parent object |
132
|
0
|
|
|
0
|
0
|
|
sub parent { $_[0]->{parent} } |
133
|
|
|
|
|
|
|
|
134
|
|
|
|
|
|
|
# return the length |
135
|
|
|
|
|
|
|
#sub length { $_[0]->{length} } |
136
|
|
|
|
|
|
|
sub length { |
137
|
0
|
|
|
0
|
1
|
|
my $self = shift; |
138
|
0
|
|
|
|
|
|
my ($start,$end) = ($self->start,$self->end); |
139
|
0
|
0
|
|
|
|
|
return $end - $start + ($end > $start ? 1 : -1); # for stupid 1-based adjustments |
140
|
|
|
|
|
|
|
} |
141
|
|
|
|
|
|
|
|
142
|
0
|
|
|
0
|
1
|
|
sub reversed { return shift->strand < 0; } |
143
|
|
|
|
|
|
|
|
144
|
|
|
|
|
|
|
sub automerge { |
145
|
0
|
|
|
0
|
1
|
|
my $self = shift; |
146
|
0
|
|
|
|
|
|
my $d = $self->{automerge}; |
147
|
0
|
0
|
|
|
|
|
$self->{automerge} = shift if @_; |
148
|
0
|
|
|
|
|
|
$d; |
149
|
|
|
|
|
|
|
} |
150
|
|
|
|
|
|
|
|
151
|
|
|
|
|
|
|
# return reference sequence |
152
|
|
|
|
|
|
|
sub refseq { |
153
|
0
|
|
|
0
|
1
|
|
my $self = shift; |
154
|
0
|
|
|
|
|
|
my $prev = $self->{refseq}; |
155
|
0
|
0
|
|
|
|
|
if (@_) { |
156
|
0
|
|
|
|
|
|
my $refseq = shift; |
157
|
0
|
|
|
|
|
|
my $arrayref; |
158
|
|
|
|
|
|
|
|
159
|
0
|
0
|
|
|
|
|
BLOCK: { |
160
|
0
|
|
|
|
|
|
last BLOCK unless defined ($refseq); |
161
|
|
|
|
|
|
|
|
162
|
0
|
0
|
0
|
|
|
|
if (ref($refseq) && ref($refseq) eq 'ARRAY') { |
163
|
0
|
|
|
|
|
|
$arrayref = $refseq; |
164
|
0
|
|
|
|
|
|
last BLOCK; |
165
|
|
|
|
|
|
|
} |
166
|
|
|
|
|
|
|
|
167
|
0
|
0
|
0
|
|
|
|
if (ref($refseq) && ($refseq->can('smapped'))) { |
168
|
0
|
0
|
|
|
|
|
croak "Reference sequence has no common ancestor with sequence" |
169
|
|
|
|
|
|
|
unless $self->parent eq $refseq->parent; |
170
|
0
|
|
|
|
|
|
my ($a,$b,$c) = @{$refseq->{refseq}}; |
|
0
|
|
|
|
|
|
|
171
|
|
|
|
|
|
|
# $b += $refseq->offset; |
172
|
0
|
|
|
|
|
|
$b += $refseq->offset; |
173
|
0
|
|
|
|
|
|
$arrayref = [$refseq,$b,$refseq->strand]; |
174
|
0
|
|
|
|
|
|
last BLOCK; |
175
|
|
|
|
|
|
|
} |
176
|
|
|
|
|
|
|
|
177
|
|
|
|
|
|
|
|
178
|
|
|
|
|
|
|
# look up reference sequence in database if we aren't given |
179
|
|
|
|
|
|
|
# database object already |
180
|
0
|
0
|
|
|
|
|
$refseq = $self->db->fetch('Sequence' => $refseq) |
181
|
|
|
|
|
|
|
unless $refseq->isa('Ace::Object'); |
182
|
0
|
0
|
|
|
|
|
croak "Invalid reference sequence" unless $refseq; |
183
|
|
|
|
|
|
|
|
184
|
|
|
|
|
|
|
# find position of ref sequence in parent strand |
185
|
0
|
|
|
|
|
|
my ($r_parent,$r_offset,$r_length,$r_strand) = find_parent($refseq); |
186
|
0
|
0
|
|
|
|
|
croak "Reference sequence has no common ancestor with sequence" |
187
|
|
|
|
|
|
|
unless $r_parent eq $self->{parent}; |
188
|
|
|
|
|
|
|
|
189
|
|
|
|
|
|
|
# set to array reference containing this information |
190
|
0
|
|
|
|
|
|
$arrayref = [$refseq,$r_offset,$r_strand]; |
191
|
|
|
|
|
|
|
} |
192
|
0
|
|
|
|
|
|
$self->{refseq} = $arrayref; |
193
|
|
|
|
|
|
|
} |
194
|
0
|
0
|
|
|
|
|
return unless $prev; |
195
|
0
|
0
|
|
|
|
|
return $self->parent if $self->absolute; |
196
|
0
|
0
|
|
|
|
|
return wantarray ? @{$prev} : $prev->[0]; |
|
0
|
|
|
|
|
|
|
197
|
|
|
|
|
|
|
} |
198
|
|
|
|
|
|
|
|
199
|
|
|
|
|
|
|
# return strand |
200
|
0
|
|
|
0
|
1
|
|
sub strand { return $_[0]->{strand} } |
201
|
|
|
|
|
|
|
|
202
|
|
|
|
|
|
|
# return reference strand |
203
|
|
|
|
|
|
|
sub r_strand { |
204
|
0
|
|
|
0
|
0
|
|
my $self = shift; |
205
|
0
|
0
|
|
|
|
|
return "+1" if $self->absolute; |
206
|
0
|
0
|
|
|
|
|
if (my ($ref,$r_offset,$r_strand) = $self->refseq) { |
207
|
0
|
|
|
|
|
|
return $r_strand; |
208
|
|
|
|
|
|
|
} else { |
209
|
0
|
|
|
|
|
|
return $self->{strand} |
210
|
|
|
|
|
|
|
} |
211
|
|
|
|
|
|
|
} |
212
|
|
|
|
|
|
|
|
213
|
0
|
|
|
0
|
1
|
|
sub offset { $_[0]->{offset} } |
214
|
0
|
|
|
0
|
0
|
|
sub p_offset { $_[0]->{p_offset} } |
215
|
|
|
|
|
|
|
|
216
|
0
|
|
|
0
|
0
|
|
sub smapped { 1; } |
217
|
0
|
|
|
0
|
0
|
|
sub type { 'Sequence' } |
218
|
0
|
|
|
0
|
0
|
|
sub subtype { } |
219
|
|
|
|
|
|
|
|
220
|
|
|
|
|
|
|
sub debug { |
221
|
0
|
|
|
0
|
0
|
|
my $self = shift; |
222
|
0
|
|
|
|
|
|
my $d = $self->{_debug}; |
223
|
0
|
0
|
|
|
|
|
$self->{_debug} = shift if @_; |
224
|
0
|
|
|
|
|
|
$d; |
225
|
|
|
|
|
|
|
} |
226
|
|
|
|
|
|
|
|
227
|
|
|
|
|
|
|
# return the database this sequence is associated with |
228
|
|
|
|
|
|
|
sub db { |
229
|
0
|
|
0
|
0
|
1
|
|
return Ace->name2db($_[0]->{db} ||= $_[0]->source->db); |
230
|
|
|
|
|
|
|
} |
231
|
|
|
|
|
|
|
|
232
|
|
|
|
|
|
|
sub start { |
233
|
0
|
|
|
0
|
1
|
|
my ($self,$abs) = @_; |
234
|
0
|
0
|
|
|
|
|
$abs = $self->absolute unless defined $abs; |
235
|
0
|
0
|
|
|
|
|
return $self->{p_offset} + $self->{offset} + 1 if $abs; |
236
|
|
|
|
|
|
|
|
237
|
0
|
0
|
|
|
|
|
if ($self->refseq) { |
238
|
0
|
|
|
|
|
|
my ($ref,$r_offset,$r_strand) = $self->refseq; |
239
|
0
|
0
|
|
|
|
|
return $r_strand < 0 ? 1 + $r_offset - ($self->{p_offset} + $self->{offset}) |
240
|
|
|
|
|
|
|
: 1 + $self->{p_offset} + $self->{offset} - $r_offset; |
241
|
|
|
|
|
|
|
} |
242
|
|
|
|
|
|
|
|
243
|
|
|
|
|
|
|
else { |
244
|
0
|
|
|
|
|
|
return $self->{offset} +1; |
245
|
|
|
|
|
|
|
} |
246
|
|
|
|
|
|
|
|
247
|
|
|
|
|
|
|
} |
248
|
|
|
|
|
|
|
|
249
|
|
|
|
|
|
|
sub end { |
250
|
0
|
|
|
0
|
1
|
|
my ($self,$abs) = @_; |
251
|
0
|
|
|
|
|
|
my $start = $self->start($abs); |
252
|
0
|
0
|
|
|
|
|
my $f = $self->{length} > 0 ? 1 : -1; # for stupid 1-based adjustments |
253
|
0
|
0
|
0
|
|
|
|
if ($abs && $self->refseq ne $self->parent) { |
254
|
0
|
|
|
|
|
|
my $r_strand = $self->r_strand; |
255
|
0
|
0
|
0
|
|
|
|
return $start - $self->{length} + $f |
|
|
|
0
|
|
|
|
|
256
|
|
|
|
|
|
|
if $r_strand < 0 or $self->{strand} < 0 or $self->{length} < 0; |
257
|
0
|
|
|
|
|
|
return $start + $self->{length} - $f |
258
|
|
|
|
|
|
|
} |
259
|
0
|
0
|
|
|
|
|
return $start + $self->{length} - $f if $self->r_strand eq $self->{strand}; |
260
|
0
|
|
|
|
|
|
return $start - $self->{length} + $f; |
261
|
|
|
|
|
|
|
} |
262
|
|
|
|
|
|
|
|
263
|
|
|
|
|
|
|
# turn on absolute coordinates (relative to reference sequence) |
264
|
|
|
|
|
|
|
sub absolute { |
265
|
0
|
|
|
0
|
1
|
|
my $self = shift; |
266
|
0
|
|
|
|
|
|
my $prev = $self->{absolute}; |
267
|
0
|
0
|
|
|
|
|
$self->{absolute} = $_[0] if defined $_[0]; |
268
|
0
|
|
|
|
|
|
return $prev; |
269
|
|
|
|
|
|
|
} |
270
|
|
|
|
|
|
|
|
271
|
|
|
|
|
|
|
# human readable string (for debugging) |
272
|
|
|
|
|
|
|
sub asString { |
273
|
0
|
|
|
0
|
1
|
|
my $self = shift; |
274
|
0
|
0
|
|
|
|
|
if ($self->absolute) { |
|
|
0
|
|
|
|
|
|
275
|
0
|
|
|
|
|
|
return join '',$self->parent,'/',$self->start,',',$self->end; |
276
|
|
|
|
|
|
|
} elsif (my $ref = $self->refseq){ |
277
|
0
|
0
|
|
|
|
|
my $label = $ref->isa('Ace::Sequence::Feature') ? $ref->info : "$ref"; |
278
|
0
|
|
|
|
|
|
return join '',$label,'/',$self->start,',',$self->end; |
279
|
|
|
|
|
|
|
|
280
|
|
|
|
|
|
|
} else { |
281
|
0
|
|
|
|
|
|
join '',$self->source,'/',$self->start,',',$self->end; |
282
|
|
|
|
|
|
|
} |
283
|
|
|
|
|
|
|
} |
284
|
|
|
|
|
|
|
|
285
|
|
|
|
|
|
|
sub cmp { |
286
|
0
|
|
|
0
|
0
|
|
my ($self,$arg,$reversed) = @_; |
287
|
0
|
0
|
0
|
|
|
|
if (ref($arg) and $arg->isa('Ace::Sequence')) { |
288
|
0
|
|
0
|
|
|
|
my $cmp = $self->parent cmp $arg->parent |
289
|
|
|
|
|
|
|
|| $self->start <=> $arg->start; |
290
|
0
|
0
|
|
|
|
|
return $reversed ? -$cmp : $cmp; |
291
|
|
|
|
|
|
|
} |
292
|
0
|
|
|
|
|
|
my $name = $self->asString; |
293
|
0
|
0
|
|
|
|
|
return $reversed ? $arg cmp $name : $name cmp $arg; |
294
|
|
|
|
|
|
|
} |
295
|
|
|
|
|
|
|
|
296
|
|
|
|
|
|
|
# Return the DNA |
297
|
|
|
|
|
|
|
sub dna { |
298
|
0
|
|
|
0
|
1
|
|
my $self = shift; |
299
|
0
|
0
|
|
|
|
|
return $self->{dna} if $self->{dna}; |
300
|
0
|
|
|
|
|
|
my $raw = $self->_query('seqdna'); |
301
|
0
|
|
|
|
|
|
$raw=~s/^>.*\n//m; |
302
|
0
|
|
|
|
|
|
$raw=~s/^\/\/.*//mg; |
303
|
0
|
|
|
|
|
|
$raw=~s/\n//g; |
304
|
0
|
|
|
|
|
|
$raw =~ s/\0+\Z//; # blasted nulls! |
305
|
0
|
0
|
|
|
|
|
my $effective_strand = $self->end >= $self->start ? '+1' : '-1'; |
306
|
0
|
0
|
|
|
|
|
_complement(\$raw) if $self->r_strand ne $effective_strand; |
307
|
0
|
|
|
|
|
|
return $self->{dna} = $raw; |
308
|
|
|
|
|
|
|
} |
309
|
|
|
|
|
|
|
|
310
|
|
|
|
|
|
|
# return a gff file |
311
|
|
|
|
|
|
|
sub gff { |
312
|
0
|
|
|
0
|
1
|
|
my $self = shift; |
313
|
0
|
|
|
|
|
|
my ($abs,$features) = rearrange([['ABS','ABSOLUTE'],'FEATURES'],@_); |
314
|
0
|
0
|
|
|
|
|
$abs = $self->absolute unless defined $abs; |
315
|
|
|
|
|
|
|
|
316
|
|
|
|
|
|
|
# can provide list of feature names, such as 'similarity', or 'all' to get 'em all |
317
|
|
|
|
|
|
|
# !THIS IS BROKEN; IT SHOULD LOOK LIKE FEATURE()! |
318
|
0
|
|
|
|
|
|
my $opt = $self->_feature_filter($features); |
319
|
|
|
|
|
|
|
|
320
|
0
|
|
|
|
|
|
my $gff = $self->_gff($opt); |
321
|
0
|
0
|
|
|
|
|
warn $gff if $self->debug; |
322
|
|
|
|
|
|
|
|
323
|
0
|
0
|
|
|
|
|
$self->transformGFF(\$gff) unless $abs; |
324
|
0
|
|
|
|
|
|
return $gff; |
325
|
|
|
|
|
|
|
} |
326
|
|
|
|
|
|
|
|
327
|
|
|
|
|
|
|
# return a GFF object using the optional GFF.pm module |
328
|
|
|
|
|
|
|
sub GFF { |
329
|
0
|
|
|
0
|
1
|
|
my $self = shift; |
330
|
0
|
|
|
|
|
|
my ($filter,$converter) = @_; # anonymous subs |
331
|
0
|
0
|
|
|
|
|
croak "GFF module not installed" unless require GFF; |
332
|
0
|
|
|
|
|
|
require GFF::Filehandle; |
333
|
|
|
|
|
|
|
|
334
|
0
|
|
|
|
|
|
my @lines = grep !/^\/\//,split "\n",$self->gff(@_); |
335
|
0
|
|
|
|
|
|
local *IN; |
336
|
0
|
|
|
|
|
|
local ($^W) = 0; # prevent complaint by GFF module |
337
|
0
|
|
|
|
|
|
tie *IN,'GFF::Filehandle',\@lines; |
338
|
0
|
|
|
|
|
|
my $gff = GFF::GeneFeatureSet->new; |
339
|
0
|
0
|
|
|
|
|
$gff->read(\*IN,$filter,$converter) if $gff; |
340
|
0
|
|
|
|
|
|
return $gff; |
341
|
|
|
|
|
|
|
} |
342
|
|
|
|
|
|
|
|
343
|
|
|
|
|
|
|
# Get the features table. Can filter by type/subtype this way: |
344
|
|
|
|
|
|
|
# features('similarity:EST','annotation:assembly_tag') |
345
|
|
|
|
|
|
|
sub features { |
346
|
0
|
|
|
0
|
1
|
|
my $self = shift; |
347
|
0
|
|
|
|
|
|
my ($filter,$opt) = $self->_make_filter(@_); |
348
|
|
|
|
|
|
|
|
349
|
|
|
|
|
|
|
# get raw gff file |
350
|
0
|
|
|
|
|
|
my $gff = $self->gff(-features=>$opt); |
351
|
|
|
|
|
|
|
|
352
|
|
|
|
|
|
|
# turn it into a list of features |
353
|
0
|
|
|
|
|
|
my @features = $self->_make_features($gff,$filter); |
354
|
|
|
|
|
|
|
|
355
|
0
|
0
|
|
|
|
|
if ($self->automerge) { # automatic merging |
356
|
|
|
|
|
|
|
# fetch out constructed transcripts and clones |
357
|
0
|
|
|
|
|
|
my %types = map {lc($_)=>1} (@$opt,@_); |
|
0
|
|
|
|
|
|
|
358
|
0
|
0
|
|
|
|
|
if ($types{'transcript'}) { |
359
|
0
|
|
|
|
|
|
push @features,$self->_make_transcripts(\@features); |
360
|
0
|
|
|
|
|
|
@features = grep {$_->type !~ /^(intron|exon)$/ } @features; |
|
0
|
|
|
|
|
|
|
361
|
|
|
|
|
|
|
} |
362
|
0
|
0
|
|
|
|
|
push @features,$self->_make_clones(\@features) if $types{'clone'}; |
363
|
0
|
0
|
|
|
|
|
if ($types{'similarity'}) { |
364
|
0
|
|
|
|
|
|
my @f = $self->_make_alignments(\@features); |
365
|
0
|
|
|
|
|
|
@features = grep {$_->type ne 'similarity'} @features; |
|
0
|
|
|
|
|
|
|
366
|
0
|
|
|
|
|
|
push @features,@f; |
367
|
|
|
|
|
|
|
} |
368
|
|
|
|
|
|
|
} |
369
|
|
|
|
|
|
|
|
370
|
0
|
0
|
|
|
|
|
return wantarray ? @features : \@features; |
371
|
|
|
|
|
|
|
} |
372
|
|
|
|
|
|
|
|
373
|
|
|
|
|
|
|
# A little bit more complex - assemble a list of "transcripts" |
374
|
|
|
|
|
|
|
# consisting of Ace::Sequence::Transcript objects. These objects |
375
|
|
|
|
|
|
|
# contain a list of exons and introns. |
376
|
|
|
|
|
|
|
sub transcripts { |
377
|
0
|
|
|
0
|
1
|
|
my $self = shift; |
378
|
0
|
|
|
|
|
|
my $curated = shift; |
379
|
0
|
0
|
|
|
|
|
my $ef = $curated ? "exon:curated" : "exon"; |
380
|
0
|
0
|
|
|
|
|
my $if = $curated ? "intron:curated" : "intron"; |
381
|
0
|
0
|
|
|
|
|
my $sf = $curated ? "Sequence:curated" : "Sequence"; |
382
|
0
|
|
|
|
|
|
my @features = $self->features($ef,$if,$sf); |
383
|
0
|
0
|
|
|
|
|
return unless @features; |
384
|
0
|
|
|
|
|
|
return $self->_make_transcripts(\@features); |
385
|
|
|
|
|
|
|
} |
386
|
|
|
|
|
|
|
|
387
|
|
|
|
|
|
|
sub _make_transcripts { |
388
|
0
|
|
|
0
|
|
|
my $self = shift; |
389
|
0
|
|
|
|
|
|
my $features = shift; |
390
|
|
|
|
|
|
|
|
391
|
0
|
|
|
|
|
|
require Ace::Sequence::Transcript; |
392
|
0
|
|
|
|
|
|
my %transcripts; |
393
|
|
|
|
|
|
|
|
394
|
0
|
|
|
|
|
|
for my $feature (@$features) { |
395
|
0
|
|
|
|
|
|
my $transcript = $feature->info; |
396
|
0
|
0
|
|
|
|
|
next unless $transcript; |
397
|
0
|
0
|
|
|
|
|
if ($feature->type =~ /^(exon|intron|cds)$/) { |
|
|
0
|
|
|
|
|
|
398
|
0
|
|
|
|
|
|
my $type = $1; |
399
|
0
|
|
|
|
|
|
push @{$transcripts{$transcript}{$type}},$feature; |
|
0
|
|
|
|
|
|
|
400
|
|
|
|
|
|
|
} elsif ($feature->type eq 'Sequence') { |
401
|
0
|
|
0
|
|
|
|
$transcripts{$transcript}{base} ||= $feature; |
402
|
|
|
|
|
|
|
} |
403
|
|
|
|
|
|
|
} |
404
|
|
|
|
|
|
|
|
405
|
|
|
|
|
|
|
# get rid of transcripts without exons |
406
|
0
|
|
|
|
|
|
foreach (keys %transcripts) { |
407
|
0
|
0
|
|
|
|
|
delete $transcripts{$_} unless exists $transcripts{$_}{exon} |
408
|
|
|
|
|
|
|
} |
409
|
|
|
|
|
|
|
|
410
|
|
|
|
|
|
|
# map the rest onto Ace::Sequence::Transcript objects |
411
|
0
|
|
|
|
|
|
return map {Ace::Sequence::Transcript->new($transcripts{$_})} keys %transcripts; |
|
0
|
|
|
|
|
|
|
412
|
|
|
|
|
|
|
} |
413
|
|
|
|
|
|
|
|
414
|
|
|
|
|
|
|
# Reassemble clones from clone left and right ends |
415
|
|
|
|
|
|
|
sub clones { |
416
|
0
|
|
|
0
|
1
|
|
my $self = shift; |
417
|
0
|
|
|
|
|
|
my @clones = $self->features('Clone_left_end','Clone_right_end','Sequence'); |
418
|
0
|
|
|
|
|
|
my %clones; |
419
|
0
|
0
|
|
|
|
|
return unless @clones; |
420
|
0
|
|
|
|
|
|
return $self->_make_clones(\@clones); |
421
|
|
|
|
|
|
|
} |
422
|
|
|
|
|
|
|
|
423
|
|
|
|
|
|
|
sub _make_clones { |
424
|
0
|
|
|
0
|
|
|
my $self = shift; |
425
|
0
|
|
|
|
|
|
my $features = shift; |
426
|
|
|
|
|
|
|
|
427
|
0
|
|
|
|
|
|
my (%clones,@canonical_clones); |
428
|
0
|
0
|
|
|
|
|
my $start_label = $self->strand < 0 ? 'end' : 'start'; |
429
|
0
|
0
|
|
|
|
|
my $end_label = $self->strand < 0 ? 'start' : 'end'; |
430
|
0
|
|
|
|
|
|
for my $feature (@$features) { |
431
|
0
|
0
|
|
|
|
|
$clones{$feature->info}{$start_label} = $feature->start if $feature->type eq 'Clone_left_end'; |
432
|
0
|
0
|
|
|
|
|
$clones{$feature->info}{$end_label} = $feature->start if $feature->type eq 'Clone_right_end'; |
433
|
|
|
|
|
|
|
|
434
|
0
|
0
|
|
|
|
|
if ($feature->type eq 'Sequence') { |
435
|
0
|
|
|
|
|
|
my $info = $feature->info; |
436
|
0
|
0
|
|
|
|
|
next if $info =~ /LINK|CHROMOSOME|\.\w+$/; |
437
|
0
|
0
|
|
|
|
|
if ($info->Genomic_canonical(0)) { |
438
|
0
|
0
|
|
|
|
|
push (@canonical_clones,$info->Clone) if $info->Clone; |
439
|
|
|
|
|
|
|
} |
440
|
|
|
|
|
|
|
} |
441
|
|
|
|
|
|
|
} |
442
|
|
|
|
|
|
|
|
443
|
0
|
|
|
|
|
|
foreach (@canonical_clones) { |
444
|
0
|
|
0
|
|
|
|
$clones{$_} ||= {}; |
445
|
|
|
|
|
|
|
} |
446
|
|
|
|
|
|
|
|
447
|
0
|
|
|
|
|
|
my @features; |
448
|
0
|
|
|
|
|
|
my ($r,$r_offset,$r_strand) = $self->refseq; |
449
|
0
|
|
|
|
|
|
my $parent = $self->parent; |
450
|
0
|
|
|
|
|
|
my $abs = $self->absolute; |
451
|
0
|
0
|
|
|
|
|
if ($abs) { |
452
|
0
|
|
|
|
|
|
$r_offset = 0; |
453
|
0
|
|
|
|
|
|
$r = $parent; |
454
|
0
|
|
|
|
|
|
$r_strand = '+1'; |
455
|
|
|
|
|
|
|
} |
456
|
|
|
|
|
|
|
|
457
|
|
|
|
|
|
|
# BAD HACK ALERT. WE DON'T KNOW WHERE THE LEFT END OF THE CLONE IS SO WE USE |
458
|
|
|
|
|
|
|
# THE MAGIC NUMBER -99_999_999 to mean "off left end" and |
459
|
|
|
|
|
|
|
# +99_999_999 to mean "off right end" |
460
|
0
|
|
|
|
|
|
for my $clone (keys %clones) { |
461
|
0
|
|
0
|
|
|
|
my $start = $clones{$clone}{start} || -99_999_999; |
462
|
0
|
|
0
|
|
|
|
my $end = $clones{$clone}{end} || +99_999_999; |
463
|
0
|
|
|
|
|
|
my $phony_gff = join "\t",($parent,'Clone','structural',$start,$end,'.','.','.',qq(Clone "$clone")); |
464
|
0
|
|
|
|
|
|
push @features,Ace::Sequence::Feature->new($parent,$r,$r_offset,$r_strand,$abs,$phony_gff); |
465
|
|
|
|
|
|
|
} |
466
|
0
|
|
|
|
|
|
return @features; |
467
|
|
|
|
|
|
|
} |
468
|
|
|
|
|
|
|
|
469
|
|
|
|
|
|
|
# Assemble a list of "GappedAlignment" objects. These objects |
470
|
|
|
|
|
|
|
# contain a list of aligned segments. |
471
|
|
|
|
|
|
|
sub alignments { |
472
|
0
|
|
|
0
|
0
|
|
my $self = shift; |
473
|
0
|
|
|
|
|
|
my @subtypes = @_; |
474
|
0
|
|
|
|
|
|
my @types = map { "similarity:\^$_\$" } @subtypes; |
|
0
|
|
|
|
|
|
|
475
|
0
|
0
|
|
|
|
|
push @types,'similarity' unless @types; |
476
|
0
|
|
|
|
|
|
return $self->features(@types); |
477
|
|
|
|
|
|
|
} |
478
|
|
|
|
|
|
|
|
479
|
|
|
|
|
|
|
sub segments { |
480
|
0
|
|
|
0
|
0
|
|
my $self = shift; |
481
|
0
|
|
|
|
|
|
return; |
482
|
|
|
|
|
|
|
} |
483
|
|
|
|
|
|
|
|
484
|
|
|
|
|
|
|
sub _make_alignments { |
485
|
0
|
|
|
0
|
|
|
my $self = shift; |
486
|
0
|
|
|
|
|
|
my $features = shift; |
487
|
0
|
|
|
|
|
|
require Ace::Sequence::GappedAlignment; |
488
|
|
|
|
|
|
|
|
489
|
0
|
|
|
|
|
|
my %homol; |
490
|
|
|
|
|
|
|
|
491
|
0
|
|
|
|
|
|
for my $feature (@$features) { |
492
|
0
|
0
|
|
|
|
|
next unless $feature->type eq 'similarity'; |
493
|
0
|
|
|
|
|
|
my $target = $feature->info; |
494
|
0
|
|
|
|
|
|
my $subtype = $feature->subtype; |
495
|
0
|
|
|
|
|
|
push @{$homol{$target,$subtype}},$feature; |
|
0
|
|
|
|
|
|
|
496
|
|
|
|
|
|
|
} |
497
|
|
|
|
|
|
|
|
498
|
|
|
|
|
|
|
# map onto Ace::Sequence::GappedAlignment objects |
499
|
0
|
|
|
|
|
|
return map {Ace::Sequence::GappedAlignment->new($homol{$_})} keys %homol; |
|
0
|
|
|
|
|
|
|
500
|
|
|
|
|
|
|
} |
501
|
|
|
|
|
|
|
|
502
|
|
|
|
|
|
|
# return list of features quickly |
503
|
|
|
|
|
|
|
sub feature_list { |
504
|
0
|
|
|
0
|
1
|
|
my $self = shift; |
505
|
0
|
0
|
|
|
|
|
return $self->{'feature_list'} if $self->{'feature_list'}; |
506
|
0
|
0
|
|
|
|
|
return unless my $raw = $self->_query('seqfeatures -version 2 -list'); |
507
|
0
|
|
|
|
|
|
return $self->{'feature_list'} = Ace::Sequence::FeatureList->new($raw); |
508
|
|
|
|
|
|
|
} |
509
|
|
|
|
|
|
|
|
510
|
|
|
|
|
|
|
# transform a GFF file into the coordinate system of the sequence |
511
|
|
|
|
|
|
|
sub transformGFF { |
512
|
0
|
|
|
0
|
0
|
|
my $self = shift; |
513
|
0
|
|
|
|
|
|
my $gff = shift; |
514
|
0
|
|
|
|
|
|
my $parent = $self->parent; |
515
|
0
|
|
|
|
|
|
my $strand = $self->{strand}; |
516
|
0
|
|
|
|
|
|
my $source = $self->source; |
517
|
0
|
|
|
|
|
|
my ($ref_source,$ref_offset,$ref_strand) = $self->refseq; |
518
|
0
|
|
0
|
|
|
|
$ref_source ||= $source; |
519
|
0
|
|
0
|
|
|
|
$ref_strand ||= $strand; |
520
|
|
|
|
|
|
|
|
521
|
0
|
0
|
|
|
|
|
if ($ref_strand > 0) { |
522
|
0
|
0
|
|
|
|
|
my $o = defined($ref_offset) ? $ref_offset : ($self->p_offset + $self->offset); |
523
|
|
|
|
|
|
|
# find anything that looks like a numeric field and subtract offset from it |
524
|
0
|
|
|
|
|
|
$$gff =~ s/(?
|
|
0
|
|
|
|
|
|
|
525
|
0
|
|
|
|
|
|
$$gff =~ s/^$parent/$source/mg; |
526
|
0
|
|
|
|
|
|
$$gff =~ s/\#\#sequence-region\s+\S+/##sequence-region $ref_source/m; |
527
|
0
|
|
|
|
|
|
$$gff =~ s/FMAP_FEATURES\s+"\S+"/FMAP_FEATURES "$ref_source"/m; |
528
|
0
|
|
|
|
|
|
return; |
529
|
|
|
|
|
|
|
} else { # strand eq '-' |
530
|
0
|
0
|
|
|
|
|
my $o = defined($ref_offset) ? (2 + $ref_offset) : (2 + $self->p_offset - $self->offset); |
531
|
0
|
|
|
|
|
|
$$gff =~ s/(?
|
|
0
|
|
|
|
|
|
|
532
|
0
|
|
|
|
|
|
$$gff =~ s/(Target \"[^\"]+\" )(-?\d+) (-?\d+)/$1 $3 $2/g; |
533
|
0
|
|
|
|
|
|
$$gff =~ s/^$parent/$source/mg; |
534
|
0
|
|
|
|
|
|
$$gff =~ s/\#\#sequence-region\s+\S+\s+(-?\d+)\s+(-?\d+)/"##sequence-region $ref_source " . ($o - $2) . ' ' . ($o - $1) . ' (reversed)'/em; |
|
0
|
|
|
|
|
|
|
535
|
0
|
|
|
|
|
|
$$gff =~ s/FMAP_FEATURES\s+"\S+"\s+(-?\d+)\s+(-?\d+)/"FMAP_FEATURES \"$ref_source\" " . ($o - $2) . ' ' . ($o - $1) . ' (reversed)'/em; |
|
0
|
|
|
|
|
|
|
536
|
|
|
|
|
|
|
} |
537
|
|
|
|
|
|
|
|
538
|
|
|
|
|
|
|
} |
539
|
|
|
|
|
|
|
|
540
|
|
|
|
|
|
|
# return a name for the object |
541
|
|
|
|
|
|
|
sub name { |
542
|
0
|
|
|
0
|
1
|
|
return shift->source_seq->name; |
543
|
|
|
|
|
|
|
} |
544
|
|
|
|
|
|
|
|
545
|
|
|
|
|
|
|
# for compatibility with Ace::Sequence::Feature |
546
|
|
|
|
|
|
|
sub info { |
547
|
0
|
|
|
0
|
0
|
|
return shift->source_seq; |
548
|
|
|
|
|
|
|
} |
549
|
|
|
|
|
|
|
|
550
|
|
|
|
|
|
|
###################### internal functions ################# |
551
|
|
|
|
|
|
|
# not necessarily object-oriented!! |
552
|
|
|
|
|
|
|
|
553
|
|
|
|
|
|
|
# return parent, parent offset and strand |
554
|
|
|
|
|
|
|
sub find_parent { |
555
|
0
|
|
|
0
|
0
|
|
my $obj = shift; |
556
|
|
|
|
|
|
|
|
557
|
|
|
|
|
|
|
# first, if we are passed an Ace::Sequence, then we can inherit |
558
|
|
|
|
|
|
|
# these settings directly |
559
|
0
|
0
|
|
|
|
|
return (@{$obj}{qw(parent p_offset length)},$obj->r_strand) |
|
0
|
|
|
|
|
|
|
560
|
|
|
|
|
|
|
if $obj->isa('Ace::Sequence'); |
561
|
|
|
|
|
|
|
|
562
|
|
|
|
|
|
|
# otherwise, if we are passed an Ace::Object, then we must |
563
|
|
|
|
|
|
|
# traverse upwards until we find a suitable parent |
564
|
0
|
0
|
|
|
|
|
return _traverse($obj) if $obj->isa('Ace::Object'); |
565
|
|
|
|
|
|
|
|
566
|
|
|
|
|
|
|
# otherwise, we don't know what to do... |
567
|
0
|
|
|
|
|
|
croak "Source sequence not an Ace::Object or an Ace::Sequence"; |
568
|
|
|
|
|
|
|
} |
569
|
|
|
|
|
|
|
|
570
|
|
|
|
|
|
|
sub _get_parent { |
571
|
0
|
|
|
0
|
|
|
my $obj = shift; |
572
|
|
|
|
|
|
|
# ** DANGER DANGER WILL ROBINSON! ** |
573
|
|
|
|
|
|
|
# This is an experiment in caching parents to speed lookups. Probably eats memory voraciously. |
574
|
0
|
0
|
|
|
|
|
return $CACHE{$obj} if CACHE && exists $CACHE{$obj}; |
575
|
0
|
|
0
|
|
|
|
my $p = $obj->get(S_Parent=>2)|| $obj->get(Source=>1); |
576
|
0
|
0
|
|
|
|
|
return unless $p; |
577
|
0
|
|
|
|
|
|
return CACHE ? $CACHE{$obj} = $p->fetch |
578
|
|
|
|
|
|
|
: $p->fetch; |
579
|
|
|
|
|
|
|
} |
580
|
|
|
|
|
|
|
|
581
|
|
|
|
|
|
|
sub _get_children { |
582
|
0
|
|
|
0
|
|
|
my $obj = shift; |
583
|
0
|
|
|
|
|
|
my @pieces = $obj->get(S_Child=>2); |
584
|
0
|
0
|
|
|
|
|
return @pieces if @pieces; |
585
|
0
|
|
|
|
|
|
return @pieces = $obj->get('Subsequence'); |
586
|
|
|
|
|
|
|
} |
587
|
|
|
|
|
|
|
|
588
|
|
|
|
|
|
|
# get sequence, offset and strand of topmost container |
589
|
|
|
|
|
|
|
sub _traverse { |
590
|
0
|
|
|
0
|
|
|
my $obj = shift; |
591
|
0
|
|
|
|
|
|
my ($offset,$length); |
592
|
|
|
|
|
|
|
|
593
|
|
|
|
|
|
|
# invoke seqget to find the top-level container for this sequence |
594
|
0
|
|
|
|
|
|
my ($tl,$tl_start,$tl_end) = _get_toplevel($obj); |
595
|
0
|
|
0
|
|
|
|
$tl_start ||= 0; |
596
|
0
|
|
0
|
|
|
|
$tl_end ||= 0; |
597
|
|
|
|
|
|
|
|
598
|
|
|
|
|
|
|
# make it an object |
599
|
0
|
|
|
|
|
|
$tl = ref($obj)->new(-name=>$tl,-class=>'Sequence',-db=>$obj->db); |
600
|
|
|
|
|
|
|
|
601
|
0
|
|
|
|
|
|
$offset += $tl_start - 1; # offset to beginning of toplevel |
602
|
0
|
|
0
|
|
|
|
$length ||= abs($tl_end - $tl_start) + 1; |
603
|
0
|
0
|
|
|
|
|
my $strand = $tl_start < $tl_end ? +1 : -1; |
604
|
|
|
|
|
|
|
|
605
|
0
|
0
|
|
|
|
|
return ($tl,$offset,$strand < 0 ? ($length,'-1') : ($length,'+1') ) if $length; |
|
|
0
|
|
|
|
|
|
606
|
|
|
|
|
|
|
} |
607
|
|
|
|
|
|
|
|
608
|
|
|
|
|
|
|
sub _get_toplevel { |
609
|
0
|
|
|
0
|
|
|
my $obj = shift; |
610
|
0
|
|
|
|
|
|
my $class = $obj->class; |
611
|
0
|
|
|
|
|
|
my $name = $obj->name; |
612
|
|
|
|
|
|
|
|
613
|
0
|
|
|
|
|
|
my $smap = $obj->db->raw_query("gif smap -from $class:$name"); |
614
|
0
|
|
|
|
|
|
my ($parent,$pstart,$pstop,$tstart,$tstop,$map_type) = |
615
|
|
|
|
|
|
|
$smap =~ /^SMAP\s+(\S+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(.+)/; |
616
|
|
|
|
|
|
|
|
617
|
0
|
|
0
|
|
|
|
$parent ||= ''; |
618
|
0
|
|
|
|
|
|
$parent =~ s/^Sequence://; # remove this in next version of Acedb |
619
|
0
|
|
|
|
|
|
return ($parent,$pstart,$pstop); |
620
|
|
|
|
|
|
|
} |
621
|
|
|
|
|
|
|
|
622
|
|
|
|
|
|
|
# create subroutine that filters GFF files for certain feature types |
623
|
|
|
|
|
|
|
sub _make_filter { |
624
|
0
|
|
|
0
|
|
|
my $self = shift; |
625
|
0
|
|
|
|
|
|
my $automerge = $self->automerge; |
626
|
|
|
|
|
|
|
|
627
|
|
|
|
|
|
|
# parse out the filter |
628
|
0
|
|
|
|
|
|
my %filter; |
629
|
0
|
|
|
|
|
|
foreach (@_) { |
630
|
0
|
|
|
|
|
|
my ($type,$filter) = split(':',$_,2); |
631
|
0
|
0
|
0
|
|
|
|
if ($automerge && lc($type) eq 'transcript') { |
|
|
0
|
0
|
|
|
|
|
632
|
0
|
|
|
|
|
|
@filter{'exon','intron','Sequence','cds'} = ([undef],[undef],[undef],[undef]); |
633
|
|
|
|
|
|
|
} elsif ($automerge && lc($type) eq 'clone') { |
634
|
0
|
|
|
|
|
|
@filter{'Clone_left_end','Clone_right_end','Sequence'} = ([undef],[undef],[undef]); |
635
|
|
|
|
|
|
|
} else { |
636
|
0
|
|
|
|
|
|
push @{$filter{$type}},$filter; |
|
0
|
|
|
|
|
|
|
637
|
|
|
|
|
|
|
} |
638
|
|
|
|
|
|
|
} |
639
|
|
|
|
|
|
|
|
640
|
|
|
|
|
|
|
# create pattern-match sub |
641
|
0
|
|
|
|
|
|
my $sub; |
642
|
|
|
|
|
|
|
my $promiscuous; # indicates that there is a subtype without a type |
643
|
|
|
|
|
|
|
|
644
|
0
|
0
|
|
|
|
|
if (%filter) { |
645
|
0
|
|
|
|
|
|
my $s = "sub { my \@d = split(\"\\t\",\$_[0]);\n"; |
646
|
0
|
|
|
|
|
|
for my $type (keys %filter) { |
647
|
0
|
|
|
|
|
|
my $expr; |
648
|
0
|
|
|
|
|
|
my $subtypes = $filter{$type}; |
649
|
0
|
0
|
|
|
|
|
if ($type ne '') { |
650
|
0
|
|
|
|
|
|
for my $st (@$subtypes) { |
651
|
0
|
0
|
|
|
|
|
$expr .= defined $st ? "return 1 if \$d[2]=~/$type/i && \$d[1]=~/$st/i;\n" |
652
|
|
|
|
|
|
|
: "return 1 if \$d[2]=~/$type/i;\n" |
653
|
|
|
|
|
|
|
} |
654
|
|
|
|
|
|
|
} else { # no type, only subtypes |
655
|
0
|
|
|
|
|
|
$promiscuous++; |
656
|
0
|
|
|
|
|
|
for my $st (@$subtypes) { |
657
|
0
|
0
|
|
|
|
|
next unless defined $st; |
658
|
0
|
|
|
|
|
|
$expr .= "return 1 if \$d[1]=~/$st/i;\n"; |
659
|
|
|
|
|
|
|
} |
660
|
|
|
|
|
|
|
} |
661
|
0
|
|
|
|
|
|
$s .= $expr; |
662
|
|
|
|
|
|
|
} |
663
|
0
|
|
|
|
|
|
$s .= "return;\n }"; |
664
|
|
|
|
|
|
|
|
665
|
0
|
|
|
|
|
|
$sub = eval $s; |
666
|
0
|
0
|
|
|
|
|
croak $@ if $@; |
667
|
|
|
|
|
|
|
} else { |
668
|
0
|
|
|
0
|
|
|
$sub = sub { 1; } |
669
|
0
|
|
|
|
|
|
} |
670
|
0
|
0
|
|
|
|
|
return ($sub,$promiscuous ? [] : [keys %filter]); |
671
|
|
|
|
|
|
|
} |
672
|
|
|
|
|
|
|
|
673
|
|
|
|
|
|
|
# turn a GFF file and a filter into a list of Ace::Sequence::Feature objects |
674
|
|
|
|
|
|
|
sub _make_features { |
675
|
0
|
|
|
0
|
|
|
my $self = shift; |
676
|
0
|
|
|
|
|
|
my ($gff,$filter) = @_; |
677
|
|
|
|
|
|
|
|
678
|
0
|
|
|
|
|
|
my ($r,$r_offset,$r_strand) = $self->refseq; |
679
|
0
|
|
|
|
|
|
my $parent = $self->parent; |
680
|
0
|
|
|
|
|
|
my $abs = $self->absolute; |
681
|
0
|
0
|
|
|
|
|
if ($abs) { |
682
|
0
|
|
|
|
|
|
$r_offset = 0; |
683
|
0
|
|
|
|
|
|
$r = $parent; |
684
|
0
|
|
|
|
|
|
$r_strand = '+1'; |
685
|
|
|
|
|
|
|
} |
686
|
0
|
|
0
|
|
|
|
my @features = map {Ace::Sequence::Feature->new($parent,$r,$r_offset,$r_strand,$abs,$_)} |
|
0
|
|
|
|
|
|
|
687
|
|
|
|
|
|
|
grep !m@^(?:\#|//)@ && $filter->($_),split("\n",$gff); |
688
|
|
|
|
|
|
|
} |
689
|
|
|
|
|
|
|
|
690
|
|
|
|
|
|
|
|
691
|
|
|
|
|
|
|
# low level GFF call, no changing absolute to relative coordinates |
692
|
|
|
|
|
|
|
sub _gff { |
693
|
0
|
|
|
0
|
|
|
my $self = shift; |
694
|
0
|
|
|
|
|
|
my ($opt,$db) = @_; |
695
|
0
|
|
|
|
|
|
my $data = $self->_query("seqfeatures -version 2 $opt",$db); |
696
|
0
|
|
|
|
|
|
$data =~ s/\0+\Z//; |
697
|
0
|
|
|
|
|
|
return $data; #blasted nulls! |
698
|
|
|
|
|
|
|
} |
699
|
|
|
|
|
|
|
|
700
|
|
|
|
|
|
|
# shortcut for running a gif query |
701
|
|
|
|
|
|
|
sub _query { |
702
|
0
|
|
|
0
|
|
|
my $self = shift; |
703
|
0
|
|
|
|
|
|
my $command = shift; |
704
|
0
|
|
0
|
|
|
|
my $db = shift || $self->db; |
705
|
|
|
|
|
|
|
|
706
|
0
|
|
|
|
|
|
my $parent = $self->parent; |
707
|
0
|
|
|
|
|
|
my $start = $self->start(1); |
708
|
0
|
|
|
|
|
|
my $end = $self->end(1); |
709
|
0
|
0
|
|
|
|
|
($start,$end) = ($end,$start) if $start > $end; #flippity floppity |
710
|
|
|
|
|
|
|
|
711
|
0
|
|
|
|
|
|
my $coord = "-coords $start $end"; |
712
|
|
|
|
|
|
|
|
713
|
|
|
|
|
|
|
# BAD BAD HACK ALERT - CHECKS THE QUERY THAT IS PASSED DOWN |
714
|
|
|
|
|
|
|
# ALSO MAKES THINGS INCOMPATIBLE WITH PRIOR 4.9 servers. |
715
|
|
|
|
|
|
|
# my $opt = $command =~ /seqfeatures/ ? '-nodna' : ''; |
716
|
0
|
|
|
|
|
|
my $opt = '-noclip'; |
717
|
|
|
|
|
|
|
|
718
|
0
|
|
|
|
|
|
my $query = "gif seqget $parent $opt $coord ; $command"; |
719
|
0
|
0
|
|
|
|
|
warn $query if $self->debug; |
720
|
|
|
|
|
|
|
|
721
|
0
|
|
|
|
|
|
return $db->raw_query("gif seqget $parent $opt $coord ; $command"); |
722
|
|
|
|
|
|
|
} |
723
|
|
|
|
|
|
|
|
724
|
|
|
|
|
|
|
# utility function -- reverse complement |
725
|
|
|
|
|
|
|
sub _complement { |
726
|
0
|
|
|
0
|
|
|
my $dna = shift; |
727
|
0
|
|
|
|
|
|
$$dna =~ tr/GATCgatc/CTAGctag/; |
728
|
0
|
|
|
|
|
|
$$dna = scalar reverse $$dna; |
729
|
|
|
|
|
|
|
} |
730
|
|
|
|
|
|
|
|
731
|
|
|
|
|
|
|
sub _feature_filter { |
732
|
0
|
|
|
0
|
|
|
my $self = shift; |
733
|
0
|
|
|
|
|
|
my $features = shift; |
734
|
0
|
0
|
|
|
|
|
return '' unless $features; |
735
|
0
|
|
|
|
|
|
my $opt = ''; |
736
|
0
|
0
|
0
|
|
|
|
$opt = '+feature ' . join('|',@$features) if ref($features) eq 'ARRAY' && @$features; |
737
|
0
|
0
|
|
|
|
|
$opt = "+feature $features" unless ref $features; |
738
|
0
|
|
|
|
|
|
$opt; |
739
|
|
|
|
|
|
|
} |
740
|
|
|
|
|
|
|
|
741
|
|
|
|
|
|
|
1; |
742
|
|
|
|
|
|
|
|
743
|
|
|
|
|
|
|
=head1 NAME |
744
|
|
|
|
|
|
|
|
745
|
|
|
|
|
|
|
Ace::Sequence - Examine ACeDB Sequence Objects |
746
|
|
|
|
|
|
|
|
747
|
|
|
|
|
|
|
=head1 SYNOPSIS |
748
|
|
|
|
|
|
|
|
749
|
|
|
|
|
|
|
# open database connection and get an Ace::Object sequence |
750
|
|
|
|
|
|
|
use Ace::Sequence; |
751
|
|
|
|
|
|
|
|
752
|
|
|
|
|
|
|
$db = Ace->connect(-host => 'stein.cshl.org',-port => 200005); |
753
|
|
|
|
|
|
|
$obj = $db->fetch(Predicted_gene => 'ZK154.3'); |
754
|
|
|
|
|
|
|
|
755
|
|
|
|
|
|
|
# Wrap it in an Ace::Sequence object |
756
|
|
|
|
|
|
|
$seq = Ace::Sequence->new($obj); |
757
|
|
|
|
|
|
|
|
758
|
|
|
|
|
|
|
# Find all the exons |
759
|
|
|
|
|
|
|
@exons = $seq->features('exon'); |
760
|
|
|
|
|
|
|
|
761
|
|
|
|
|
|
|
# Find all the exons predicted by various versions of "genefinder" |
762
|
|
|
|
|
|
|
@exons = $seq->features('exon:genefinder.*'); |
763
|
|
|
|
|
|
|
|
764
|
|
|
|
|
|
|
# Iterate through the exons, printing their start, end and DNA |
765
|
|
|
|
|
|
|
for my $exon (@exons) { |
766
|
|
|
|
|
|
|
print join "\t",$exon->start,$exon->end,$exon->dna,"\n"; |
767
|
|
|
|
|
|
|
} |
768
|
|
|
|
|
|
|
|
769
|
|
|
|
|
|
|
# Find the region 1000 kb upstream of the first exon |
770
|
|
|
|
|
|
|
$sub = Ace::Sequence->new(-seq=>$exons[0], |
771
|
|
|
|
|
|
|
-offset=>-1000,-length=>1000); |
772
|
|
|
|
|
|
|
|
773
|
|
|
|
|
|
|
# Find all features in that area |
774
|
|
|
|
|
|
|
@features = $sub->features; |
775
|
|
|
|
|
|
|
|
776
|
|
|
|
|
|
|
# Print its DNA |
777
|
|
|
|
|
|
|
print $sub->dna; |
778
|
|
|
|
|
|
|
|
779
|
|
|
|
|
|
|
# Create a new Sequence object from the first 500 kb of chromosome 1 |
780
|
|
|
|
|
|
|
$seq = Ace::Sequence->new(-name=>'CHROMOSOME_I',-db=>$db, |
781
|
|
|
|
|
|
|
-offset=>0,-length=>500_000); |
782
|
|
|
|
|
|
|
|
783
|
|
|
|
|
|
|
# Get the GFF dump as a text string |
784
|
|
|
|
|
|
|
$gff = $seq->gff; |
785
|
|
|
|
|
|
|
|
786
|
|
|
|
|
|
|
# Limit dump to Predicted_genes |
787
|
|
|
|
|
|
|
$gff_genes = $seq->gff(-features=>'Predicted_gene'); |
788
|
|
|
|
|
|
|
|
789
|
|
|
|
|
|
|
# Return a GFF object (using optional GFF.pm module from Sanger) |
790
|
|
|
|
|
|
|
$gff_obj = $seq->GFF; |
791
|
|
|
|
|
|
|
|
792
|
|
|
|
|
|
|
=head1 DESCRIPTION |
793
|
|
|
|
|
|
|
|
794
|
|
|
|
|
|
|
I, and its allied classes L and |
795
|
|
|
|
|
|
|
L, provide a convenient interface to the |
796
|
|
|
|
|
|
|
ACeDB Sequence classes and the GFF sequence feature file format. |
797
|
|
|
|
|
|
|
|
798
|
|
|
|
|
|
|
Using this class, you can define a region of the genome by using a |
799
|
|
|
|
|
|
|
landmark (sequenced clone, link, superlink, predicted gene), an offset |
800
|
|
|
|
|
|
|
from that landmark, and a distance. Offsets and distances can be |
801
|
|
|
|
|
|
|
positive or negative. This will return an I object. |
802
|
|
|
|
|
|
|
Once a region is defined, you may retrieve its DNA sequence, or query |
803
|
|
|
|
|
|
|
the database for any features that may be contained within this |
804
|
|
|
|
|
|
|
region. Features can be returned as objects (using the |
805
|
|
|
|
|
|
|
I class), as GFF text-only dumps, or in the |
806
|
|
|
|
|
|
|
form of the GFF class defined by the Sanger Centre's GFF.pm module. |
807
|
|
|
|
|
|
|
|
808
|
|
|
|
|
|
|
This class builds on top of L and L. Please see |
809
|
|
|
|
|
|
|
their manual pages before consulting this one. |
810
|
|
|
|
|
|
|
|
811
|
|
|
|
|
|
|
=head1 Creating New Ace::Sequence Objects, the new() Method |
812
|
|
|
|
|
|
|
|
813
|
|
|
|
|
|
|
$seq = Ace::Sequence->new($object); |
814
|
|
|
|
|
|
|
|
815
|
|
|
|
|
|
|
$seq = Ace::Sequence->new(-source => $object, |
816
|
|
|
|
|
|
|
-offset => $offset, |
817
|
|
|
|
|
|
|
-length => $length, |
818
|
|
|
|
|
|
|
-refseq => $reference_sequence); |
819
|
|
|
|
|
|
|
|
820
|
|
|
|
|
|
|
$seq = Ace::Sequence->new(-name => $name, |
821
|
|
|
|
|
|
|
-db => $db, |
822
|
|
|
|
|
|
|
-offset => $offset, |
823
|
|
|
|
|
|
|
-length => $length, |
824
|
|
|
|
|
|
|
-refseq => $reference_sequence); |
825
|
|
|
|
|
|
|
|
826
|
|
|
|
|
|
|
In order to create an I you will need an active I |
827
|
|
|
|
|
|
|
database accessor. Sequence regions are defined using a "source" |
828
|
|
|
|
|
|
|
sequence, an offset, and a length. Optionally, you may also provide a |
829
|
|
|
|
|
|
|
"reference sequence" to establish the coordinate system for all |
830
|
|
|
|
|
|
|
inquiries. Sequences may be generated from existing I |
831
|
|
|
|
|
|
|
sequence objects, from other I and |
832
|
|
|
|
|
|
|
I objects, or from a sequence name and a |
833
|
|
|
|
|
|
|
database handle. |
834
|
|
|
|
|
|
|
|
835
|
|
|
|
|
|
|
The class method named new() is the interface to these facilities. In |
836
|
|
|
|
|
|
|
its simplest, one-argument form, you provide new() with a |
837
|
|
|
|
|
|
|
previously-created I that points to Sequence or |
838
|
|
|
|
|
|
|
sequence-like object (the meaning of "sequence-like" is explained in |
839
|
|
|
|
|
|
|
more detail below.) The new() method will return an I |
840
|
|
|
|
|
|
|
object extending from the beginning of the object through to its |
841
|
|
|
|
|
|
|
natural end. |
842
|
|
|
|
|
|
|
|
843
|
|
|
|
|
|
|
In the named-parameter form of new(), the following arguments are |
844
|
|
|
|
|
|
|
recognized: |
845
|
|
|
|
|
|
|
|
846
|
|
|
|
|
|
|
=over 4 |
847
|
|
|
|
|
|
|
|
848
|
|
|
|
|
|
|
=item -source |
849
|
|
|
|
|
|
|
|
850
|
|
|
|
|
|
|
The sequence source. This must be an I of the "Sequence" |
851
|
|
|
|
|
|
|
class, or be a sequence-like object containing the SMap tag (see |
852
|
|
|
|
|
|
|
below). |
853
|
|
|
|
|
|
|
|
854
|
|
|
|
|
|
|
=item -offset |
855
|
|
|
|
|
|
|
|
856
|
|
|
|
|
|
|
An offset from the beginning of the source sequence. The retrieved |
857
|
|
|
|
|
|
|
I will begin at this position. The offset can be any |
858
|
|
|
|
|
|
|
positive or negative integer. Offets are B<0-based>. |
859
|
|
|
|
|
|
|
|
860
|
|
|
|
|
|
|
=item -length |
861
|
|
|
|
|
|
|
|
862
|
|
|
|
|
|
|
The length of the sequence to return. Either a positive or negative |
863
|
|
|
|
|
|
|
integer can be specified. If a negative length is given, the returned |
864
|
|
|
|
|
|
|
sequence will be complemented relative to the source sequence. |
865
|
|
|
|
|
|
|
|
866
|
|
|
|
|
|
|
=item -refseq |
867
|
|
|
|
|
|
|
|
868
|
|
|
|
|
|
|
The sequence to use to establish the coordinate system for the |
869
|
|
|
|
|
|
|
returned sequence. Normally the source sequence is used to establish |
870
|
|
|
|
|
|
|
the coordinate system, but this can be used to override that choice. |
871
|
|
|
|
|
|
|
You can provide either an I or just a sequence name for |
872
|
|
|
|
|
|
|
this argument. The source and reference sequences must share a common |
873
|
|
|
|
|
|
|
ancestor, but do not have to be directly related. An attempt to use a |
874
|
|
|
|
|
|
|
disjunct reference sequence, such as one on a different chromosome, |
875
|
|
|
|
|
|
|
will fail. |
876
|
|
|
|
|
|
|
|
877
|
|
|
|
|
|
|
=item -name |
878
|
|
|
|
|
|
|
|
879
|
|
|
|
|
|
|
As an alternative to using an I with the B<-source> |
880
|
|
|
|
|
|
|
argument, you may specify a source sequence using B<-name> and B<-db>. |
881
|
|
|
|
|
|
|
The I module will use the provided database accessor to |
882
|
|
|
|
|
|
|
fetch a Sequence object with the specified name. new() will return |
883
|
|
|
|
|
|
|
undef is no Sequence by this name is known. |
884
|
|
|
|
|
|
|
|
885
|
|
|
|
|
|
|
=item -db |
886
|
|
|
|
|
|
|
|
887
|
|
|
|
|
|
|
This argument is required if the source sequence is specified by name |
888
|
|
|
|
|
|
|
rather than by object reference. |
889
|
|
|
|
|
|
|
|
890
|
|
|
|
|
|
|
=back |
891
|
|
|
|
|
|
|
|
892
|
|
|
|
|
|
|
If new() is successful, it will create an I object and |
893
|
|
|
|
|
|
|
return it. Otherwise it will return undef and return a descriptive |
894
|
|
|
|
|
|
|
message in Ace->error(). Certain programming errors, such as a |
895
|
|
|
|
|
|
|
failure to provide required arguments, cause a fatal error. |
896
|
|
|
|
|
|
|
|
897
|
|
|
|
|
|
|
=head2 Reference Sequences and the Coordinate System |
898
|
|
|
|
|
|
|
|
899
|
|
|
|
|
|
|
When retrieving information from an I, the coordinate |
900
|
|
|
|
|
|
|
system is based on the sequence segment selected at object creation |
901
|
|
|
|
|
|
|
time. That is, the "+1" strand is the natural direction of the |
902
|
|
|
|
|
|
|
I object, and base pair 1 is its first base pair. This |
903
|
|
|
|
|
|
|
behavior can be overridden by providing a reference sequence to the |
904
|
|
|
|
|
|
|
new() method, in which case the orientation and position of the |
905
|
|
|
|
|
|
|
reference sequence establishes the coordinate system for the object. |
906
|
|
|
|
|
|
|
|
907
|
|
|
|
|
|
|
In addition to the reference sequence, there are two other sequences |
908
|
|
|
|
|
|
|
used by I for internal bookeeping. The "source" |
909
|
|
|
|
|
|
|
sequence corresponds to the smallest ACeDB sequence object that |
910
|
|
|
|
|
|
|
completely encloses the selected sequence segment. The "parent" |
911
|
|
|
|
|
|
|
sequence is the smallest ACeDB sequence object that contains the |
912
|
|
|
|
|
|
|
"source". The parent is used to derive the length and orientation of |
913
|
|
|
|
|
|
|
source sequences that are not directly associated with DNA objects. |
914
|
|
|
|
|
|
|
|
915
|
|
|
|
|
|
|
In many cases, the source sequence will be identical to the sequence |
916
|
|
|
|
|
|
|
initially passed to the new() method. However, there are exceptions |
917
|
|
|
|
|
|
|
to this rule. One common exception occurs when the offset and/or |
918
|
|
|
|
|
|
|
length cross the boundaries of the passed-in sequence. In this case, |
919
|
|
|
|
|
|
|
the ACeDB database is searched for the smallest sequence that contains |
920
|
|
|
|
|
|
|
both endpoints of the I object. |
921
|
|
|
|
|
|
|
|
922
|
|
|
|
|
|
|
The other common exception occurs in Ace 4.8, where there is support |
923
|
|
|
|
|
|
|
for "sequence-like" objects that contain the C ("Sequence Map") |
924
|
|
|
|
|
|
|
tag. The C tag provides genomic location information for |
925
|
|
|
|
|
|
|
arbitrary object -- not just those descended from the Sequence class. |
926
|
|
|
|
|
|
|
This allows ACeDB to perform genome map operations on objects that are |
927
|
|
|
|
|
|
|
not directly related to sequences, such as genetic loci that have been |
928
|
|
|
|
|
|
|
interpolated onto the physical map. When an C-containing object |
929
|
|
|
|
|
|
|
is passed to the I new() method, the module will again |
930
|
|
|
|
|
|
|
choose the smallest ACeDB Sequence object that contains both |
931
|
|
|
|
|
|
|
end-points of the desired region. |
932
|
|
|
|
|
|
|
|
933
|
|
|
|
|
|
|
If an I object is used to create a new I |
934
|
|
|
|
|
|
|
object, then the original object's source is inherited. |
935
|
|
|
|
|
|
|
|
936
|
|
|
|
|
|
|
=head1 Object Methods |
937
|
|
|
|
|
|
|
|
938
|
|
|
|
|
|
|
Once an I object is created, you can query it using the |
939
|
|
|
|
|
|
|
following methods: |
940
|
|
|
|
|
|
|
|
941
|
|
|
|
|
|
|
=head2 asString() |
942
|
|
|
|
|
|
|
|
943
|
|
|
|
|
|
|
$name = $seq->asString; |
944
|
|
|
|
|
|
|
|
945
|
|
|
|
|
|
|
Returns a human-readable identifier for the sequence in the form |
946
|
|
|
|
|
|
|
I, where "Source" is the name of the source |
947
|
|
|
|
|
|
|
sequence, and "start" and "end" are the endpoints of the sequence |
948
|
|
|
|
|
|
|
relative to the source (using 1-based indexing). This method is |
949
|
|
|
|
|
|
|
called automatically when the I is used in a string |
950
|
|
|
|
|
|
|
context. |
951
|
|
|
|
|
|
|
|
952
|
|
|
|
|
|
|
=head2 source_seq() |
953
|
|
|
|
|
|
|
|
954
|
|
|
|
|
|
|
$source = $seq->source_seq; |
955
|
|
|
|
|
|
|
|
956
|
|
|
|
|
|
|
Return the source of the I. |
957
|
|
|
|
|
|
|
|
958
|
|
|
|
|
|
|
=head2 parent_seq() |
959
|
|
|
|
|
|
|
|
960
|
|
|
|
|
|
|
$parent = $seq->parent_seq; |
961
|
|
|
|
|
|
|
|
962
|
|
|
|
|
|
|
Return the immediate ancestor of the sequence. The parent of the |
963
|
|
|
|
|
|
|
top-most sequence (such as the CHROMOSOME link) is itself. This |
964
|
|
|
|
|
|
|
method is used internally to ascertain the length of source sequences |
965
|
|
|
|
|
|
|
which are not associated with a DNA object. |
966
|
|
|
|
|
|
|
|
967
|
|
|
|
|
|
|
NOTE: this procedure is a trifle funky and cannot reliably be used to |
968
|
|
|
|
|
|
|
traverse upwards to the top-most sequence. The reason for this is |
969
|
|
|
|
|
|
|
that it will return an I in some cases, and an |
970
|
|
|
|
|
|
|
I in others. Use get_parent() to traverse upwards |
971
|
|
|
|
|
|
|
through a uniform series of I objects upwards. |
972
|
|
|
|
|
|
|
|
973
|
|
|
|
|
|
|
=head2 refseq([$seq]) |
974
|
|
|
|
|
|
|
|
975
|
|
|
|
|
|
|
$refseq = $seq->refseq; |
976
|
|
|
|
|
|
|
|
977
|
|
|
|
|
|
|
Returns the reference sequence, if one is defined. |
978
|
|
|
|
|
|
|
|
979
|
|
|
|
|
|
|
$seq->refseq($new_ref); |
980
|
|
|
|
|
|
|
|
981
|
|
|
|
|
|
|
Set the reference sequence. The reference sequence must share the same |
982
|
|
|
|
|
|
|
ancestor with $seq. |
983
|
|
|
|
|
|
|
|
984
|
|
|
|
|
|
|
=head2 start() |
985
|
|
|
|
|
|
|
|
986
|
|
|
|
|
|
|
$start = $seq->start; |
987
|
|
|
|
|
|
|
|
988
|
|
|
|
|
|
|
Start of this sequence, relative to the source sequence, using 1-based |
989
|
|
|
|
|
|
|
indexing. |
990
|
|
|
|
|
|
|
|
991
|
|
|
|
|
|
|
=head2 end() |
992
|
|
|
|
|
|
|
|
993
|
|
|
|
|
|
|
$end = $seq->end; |
994
|
|
|
|
|
|
|
|
995
|
|
|
|
|
|
|
End of this sequence, relative to the source sequence, using 1-based |
996
|
|
|
|
|
|
|
indexing. |
997
|
|
|
|
|
|
|
|
998
|
|
|
|
|
|
|
=head2 offset() |
999
|
|
|
|
|
|
|
|
1000
|
|
|
|
|
|
|
$offset = $seq->offset; |
1001
|
|
|
|
|
|
|
|
1002
|
|
|
|
|
|
|
Offset of the beginning of this sequence relative to the source |
1003
|
|
|
|
|
|
|
sequence, using 0-based indexing. The offset may be negative if the |
1004
|
|
|
|
|
|
|
beginning of the sequence is to the left of the beginning of the |
1005
|
|
|
|
|
|
|
source sequence. |
1006
|
|
|
|
|
|
|
|
1007
|
|
|
|
|
|
|
=head2 length() |
1008
|
|
|
|
|
|
|
|
1009
|
|
|
|
|
|
|
$length = $seq->length; |
1010
|
|
|
|
|
|
|
|
1011
|
|
|
|
|
|
|
The length of this sequence, in base pairs. The length may be |
1012
|
|
|
|
|
|
|
negative if the sequence's orientation is reversed relative to the |
1013
|
|
|
|
|
|
|
source sequence. Use abslength() to obtain the absolute value of |
1014
|
|
|
|
|
|
|
the sequence length. |
1015
|
|
|
|
|
|
|
|
1016
|
|
|
|
|
|
|
=head2 abslength() |
1017
|
|
|
|
|
|
|
|
1018
|
|
|
|
|
|
|
$length = $seq->abslength; |
1019
|
|
|
|
|
|
|
|
1020
|
|
|
|
|
|
|
Return the absolute value of the length of the sequence. |
1021
|
|
|
|
|
|
|
|
1022
|
|
|
|
|
|
|
=head2 strand() |
1023
|
|
|
|
|
|
|
|
1024
|
|
|
|
|
|
|
$strand = $seq->strand; |
1025
|
|
|
|
|
|
|
|
1026
|
|
|
|
|
|
|
Returns +1 for a sequence oriented in the natural direction of the |
1027
|
|
|
|
|
|
|
genomic reference sequence, or -1 otherwise. |
1028
|
|
|
|
|
|
|
|
1029
|
|
|
|
|
|
|
=head2 reversed() |
1030
|
|
|
|
|
|
|
|
1031
|
|
|
|
|
|
|
Returns true if the segment is reversed relative to the canonical |
1032
|
|
|
|
|
|
|
genomic direction. This is the same as $seq->strand < 0. |
1033
|
|
|
|
|
|
|
|
1034
|
|
|
|
|
|
|
=head2 dna() |
1035
|
|
|
|
|
|
|
|
1036
|
|
|
|
|
|
|
$dna = $seq->dna; |
1037
|
|
|
|
|
|
|
|
1038
|
|
|
|
|
|
|
Return the DNA corresponding to this sequence. If the sequence length |
1039
|
|
|
|
|
|
|
is negative, the reverse complement of the appropriate segment will be |
1040
|
|
|
|
|
|
|
returned. |
1041
|
|
|
|
|
|
|
|
1042
|
|
|
|
|
|
|
ACeDB allows Sequences to exist without an associated DNA object |
1043
|
|
|
|
|
|
|
(which typically happens during intermediate stages of a sequencing |
1044
|
|
|
|
|
|
|
project. In such a case, the returned sequence will contain the |
1045
|
|
|
|
|
|
|
correct number of "-" characters. |
1046
|
|
|
|
|
|
|
|
1047
|
|
|
|
|
|
|
=head2 name() |
1048
|
|
|
|
|
|
|
|
1049
|
|
|
|
|
|
|
$name = $seq->name; |
1050
|
|
|
|
|
|
|
|
1051
|
|
|
|
|
|
|
Return the name of the source sequence as a string. |
1052
|
|
|
|
|
|
|
|
1053
|
|
|
|
|
|
|
=head2 get_parent() |
1054
|
|
|
|
|
|
|
|
1055
|
|
|
|
|
|
|
$parent = $seq->parent; |
1056
|
|
|
|
|
|
|
|
1057
|
|
|
|
|
|
|
Return the immediate ancestor of this I (i.e., the |
1058
|
|
|
|
|
|
|
sequence that contains this one). The return value is a new |
1059
|
|
|
|
|
|
|
I or undef, if no parent sequence exists. |
1060
|
|
|
|
|
|
|
|
1061
|
|
|
|
|
|
|
=head2 get_children() |
1062
|
|
|
|
|
|
|
|
1063
|
|
|
|
|
|
|
@children = $seq->get_children(); |
1064
|
|
|
|
|
|
|
|
1065
|
|
|
|
|
|
|
Returns all subsequences that exist as independent objects in the |
1066
|
|
|
|
|
|
|
ACeDB database. What exactly is returned is dependent on the data |
1067
|
|
|
|
|
|
|
model. In older ACeDB databases, the only subsequences are those |
1068
|
|
|
|
|
|
|
under the catchall Subsequence tag. In newer ACeDB databases, the |
1069
|
|
|
|
|
|
|
objects returned correspond to objects to the right of the S_Child |
1070
|
|
|
|
|
|
|
subtag using a tag[2] syntax, and may include Predicted_genes, |
1071
|
|
|
|
|
|
|
Sequences, Links, or other objects. The return value is a list of |
1072
|
|
|
|
|
|
|
I objects. |
1073
|
|
|
|
|
|
|
|
1074
|
|
|
|
|
|
|
=head2 features() |
1075
|
|
|
|
|
|
|
|
1076
|
|
|
|
|
|
|
@features = $seq->features; |
1077
|
|
|
|
|
|
|
@features = $seq->features('exon','intron','Predicted_gene'); |
1078
|
|
|
|
|
|
|
@features = $seq->features('exon:GeneFinder','Predicted_gene:hand.*'); |
1079
|
|
|
|
|
|
|
|
1080
|
|
|
|
|
|
|
features() returns an array of I objects. If |
1081
|
|
|
|
|
|
|
called without arguments, features() returns all features that cross |
1082
|
|
|
|
|
|
|
the sequence region. You may also provide a filter list to select a |
1083
|
|
|
|
|
|
|
set of features by type and subtype. The format of the filter list |
1084
|
|
|
|
|
|
|
is: |
1085
|
|
|
|
|
|
|
|
1086
|
|
|
|
|
|
|
type:subtype |
1087
|
|
|
|
|
|
|
|
1088
|
|
|
|
|
|
|
Where I is the class of the feature (the "feature" field of the |
1089
|
|
|
|
|
|
|
GFF format), and I is a description of how the feature was |
1090
|
|
|
|
|
|
|
derived (the "source" field of the GFF format). Either of these |
1091
|
|
|
|
|
|
|
fields can be absent, and either can be a regular expression. More |
1092
|
|
|
|
|
|
|
advanced filtering is not supported, but is provided by the Sanger |
1093
|
|
|
|
|
|
|
Centre's GFF module. |
1094
|
|
|
|
|
|
|
|
1095
|
|
|
|
|
|
|
The order of the features in the returned list is not specified. To |
1096
|
|
|
|
|
|
|
obtain features sorted by position, use this idiom: |
1097
|
|
|
|
|
|
|
|
1098
|
|
|
|
|
|
|
@features = sort { $a->start <=> $b->start } $seq->features; |
1099
|
|
|
|
|
|
|
|
1100
|
|
|
|
|
|
|
=head2 feature_list() |
1101
|
|
|
|
|
|
|
|
1102
|
|
|
|
|
|
|
my $list = $seq->feature_list(); |
1103
|
|
|
|
|
|
|
|
1104
|
|
|
|
|
|
|
This method returns a summary list of the features that cross the |
1105
|
|
|
|
|
|
|
sequence in the form of a L object. From the |
1106
|
|
|
|
|
|
|
L object you can obtain the list of feature names |
1107
|
|
|
|
|
|
|
and the number of each type. The feature list is obtained from the |
1108
|
|
|
|
|
|
|
ACeDB server with a single short transaction, and therefore has much |
1109
|
|
|
|
|
|
|
less overhead than features(). |
1110
|
|
|
|
|
|
|
|
1111
|
|
|
|
|
|
|
See L for more details. |
1112
|
|
|
|
|
|
|
|
1113
|
|
|
|
|
|
|
=head2 transcripts() |
1114
|
|
|
|
|
|
|
|
1115
|
|
|
|
|
|
|
This returns a list of Ace::Sequence::Transcript objects, which are |
1116
|
|
|
|
|
|
|
specializations of Ace::Sequence::Feature. See L |
1117
|
|
|
|
|
|
|
for details. |
1118
|
|
|
|
|
|
|
|
1119
|
|
|
|
|
|
|
=head2 clones() |
1120
|
|
|
|
|
|
|
|
1121
|
|
|
|
|
|
|
This returns a list of Ace::Sequence::Feature objects containing |
1122
|
|
|
|
|
|
|
reconstructed clones. This is a nasty hack, because ACEDB currently |
1123
|
|
|
|
|
|
|
records clone ends, but not the clones themselves, meaning that we |
1124
|
|
|
|
|
|
|
will not always know both ends of the clone. In this case the missing |
1125
|
|
|
|
|
|
|
end has a synthetic position of -99,999,999 or +99,999,999. Sorry. |
1126
|
|
|
|
|
|
|
|
1127
|
|
|
|
|
|
|
=head2 gff() |
1128
|
|
|
|
|
|
|
|
1129
|
|
|
|
|
|
|
$gff = $seq->gff(); |
1130
|
|
|
|
|
|
|
$gff = $seq->gff(-abs => 1, |
1131
|
|
|
|
|
|
|
-features => ['exon','intron:GeneFinder']); |
1132
|
|
|
|
|
|
|
|
1133
|
|
|
|
|
|
|
This method returns a GFF file as a scalar. The following arguments |
1134
|
|
|
|
|
|
|
are optional: |
1135
|
|
|
|
|
|
|
|
1136
|
|
|
|
|
|
|
=over 4 |
1137
|
|
|
|
|
|
|
|
1138
|
|
|
|
|
|
|
=item -abs |
1139
|
|
|
|
|
|
|
|
1140
|
|
|
|
|
|
|
Ordinarily the feature entries in the GFF file will be returned in |
1141
|
|
|
|
|
|
|
coordinates relative to the start of the I object. |
1142
|
|
|
|
|
|
|
Position 1 will be the start of the sequence object, and the "+" |
1143
|
|
|
|
|
|
|
strand will be the sequence object's natural orientation. However if |
1144
|
|
|
|
|
|
|
a true value is provided to B<-abs>, the coordinate system used will |
1145
|
|
|
|
|
|
|
be relative to the start of the source sequence, i.e. the native ACeDB |
1146
|
|
|
|
|
|
|
Sequence object (usually a cosmid sequence or a link). |
1147
|
|
|
|
|
|
|
|
1148
|
|
|
|
|
|
|
If a reference sequence was provided when the I was |
1149
|
|
|
|
|
|
|
created, it will be used by default to set the coordinate system. |
1150
|
|
|
|
|
|
|
Relative coordinates can be reenabled by providing a false value to |
1151
|
|
|
|
|
|
|
B<-abs>. |
1152
|
|
|
|
|
|
|
|
1153
|
|
|
|
|
|
|
Ordinarily the coordinate system manipulations automatically "do what |
1154
|
|
|
|
|
|
|
you want" and you will not need to adjust them. See also the abs() |
1155
|
|
|
|
|
|
|
method described below. |
1156
|
|
|
|
|
|
|
|
1157
|
|
|
|
|
|
|
=item -features |
1158
|
|
|
|
|
|
|
|
1159
|
|
|
|
|
|
|
The B<-features> argument filters the features according to a list of |
1160
|
|
|
|
|
|
|
types and subtypes. The format is identical to the one described for |
1161
|
|
|
|
|
|
|
the features() method. A single filter may be provided as a scalar |
1162
|
|
|
|
|
|
|
string. Multiple filters may be passed as an array reference. |
1163
|
|
|
|
|
|
|
|
1164
|
|
|
|
|
|
|
=back |
1165
|
|
|
|
|
|
|
|
1166
|
|
|
|
|
|
|
See also the GFF() method described next. |
1167
|
|
|
|
|
|
|
|
1168
|
|
|
|
|
|
|
=head2 GFF() |
1169
|
|
|
|
|
|
|
|
1170
|
|
|
|
|
|
|
$gff_object = $seq->gff; |
1171
|
|
|
|
|
|
|
$gff_object = $seq->gff(-abs => 1, |
1172
|
|
|
|
|
|
|
-features => ['exon','intron:GeneFinder']); |
1173
|
|
|
|
|
|
|
|
1174
|
|
|
|
|
|
|
The GFF() method takes the same arguments as gff() described above, |
1175
|
|
|
|
|
|
|
but it returns a I object from the GFF.pm |
1176
|
|
|
|
|
|
|
module. If the GFF module is not installed, this method will generate |
1177
|
|
|
|
|
|
|
a fatal error. |
1178
|
|
|
|
|
|
|
|
1179
|
|
|
|
|
|
|
=head2 absolute() |
1180
|
|
|
|
|
|
|
|
1181
|
|
|
|
|
|
|
$abs = $seq->absolute; |
1182
|
|
|
|
|
|
|
$abs = $seq->absolute(1); |
1183
|
|
|
|
|
|
|
|
1184
|
|
|
|
|
|
|
This method controls whether the coordinates of features are returned |
1185
|
|
|
|
|
|
|
in absolute or relative coordinates. "Absolute" coordinates are |
1186
|
|
|
|
|
|
|
relative to the underlying source or reference sequence. "Relative" |
1187
|
|
|
|
|
|
|
coordinates are relative to the I object. By default, |
1188
|
|
|
|
|
|
|
coordinates are relative unless new() was provided with a reference |
1189
|
|
|
|
|
|
|
sequence. This default can be examined and changed using absolute(). |
1190
|
|
|
|
|
|
|
|
1191
|
|
|
|
|
|
|
=head2 automerge() |
1192
|
|
|
|
|
|
|
|
1193
|
|
|
|
|
|
|
$merge = $seq->automerge; |
1194
|
|
|
|
|
|
|
$seq->automerge(0); |
1195
|
|
|
|
|
|
|
|
1196
|
|
|
|
|
|
|
This method controls whether groups of features will automatically be |
1197
|
|
|
|
|
|
|
merged together by the features() call. If true (the default), then |
1198
|
|
|
|
|
|
|
the left and right end of clones will be merged into "clone" features, |
1199
|
|
|
|
|
|
|
introns, exons and CDS entries will be merged into |
1200
|
|
|
|
|
|
|
Ace::Sequence::Transcript objects, and similarity entries will be |
1201
|
|
|
|
|
|
|
merged into Ace::Sequence::GappedAlignment objects. |
1202
|
|
|
|
|
|
|
|
1203
|
|
|
|
|
|
|
=head2 db() |
1204
|
|
|
|
|
|
|
|
1205
|
|
|
|
|
|
|
$db = $seq->db; |
1206
|
|
|
|
|
|
|
|
1207
|
|
|
|
|
|
|
Returns the L database accessor associated with this sequence. |
1208
|
|
|
|
|
|
|
|
1209
|
|
|
|
|
|
|
=head1 SEE ALSO |
1210
|
|
|
|
|
|
|
|
1211
|
|
|
|
|
|
|
L, L, L, |
1212
|
|
|
|
|
|
|
L, L |
1213
|
|
|
|
|
|
|
|
1214
|
|
|
|
|
|
|
=head1 AUTHOR |
1215
|
|
|
|
|
|
|
|
1216
|
|
|
|
|
|
|
Lincoln Stein with extensive help from Jean |
1217
|
|
|
|
|
|
|
Thierry-Mieg |
1218
|
|
|
|
|
|
|
|
1219
|
|
|
|
|
|
|
Many thanks to David Block for finding and |
1220
|
|
|
|
|
|
|
fixing the nasty off-by-one errors. |
1221
|
|
|
|
|
|
|
|
1222
|
|
|
|
|
|
|
Copyright (c) 1999, Lincoln D. Stein |
1223
|
|
|
|
|
|
|
|
1224
|
|
|
|
|
|
|
This library is free software; you can redistribute it and/or modify |
1225
|
|
|
|
|
|
|
it under the same terms as Perl itself. See DISCLAIMER.txt for |
1226
|
|
|
|
|
|
|
disclaimers of warranty. |
1227
|
|
|
|
|
|
|
|
1228
|
|
|
|
|
|
|
=cut |
1229
|
|
|
|
|
|
|
|
1230
|
|
|
|
|
|
|
__END__ |