File Coverage

blib/lib/Ace/Sequence.pm
Criterion Covered Total %
statement 30 392 7.6
branch 0 204 0.0
condition 0 85 0.0
subroutine 10 58 17.2
pod 19 34 55.8
total 59 773 7.6


line stmt bran cond sub pod time code
1             package Ace::Sequence;
2 1     1   3411 use strict;
  1         3  
  1         33  
3              
4 1     1   5 use Carp;
  1         1  
  1         96  
5 1     1   6 use strict;
  1         2  
  1         34  
6 1     1   857 use Ace 1.50 qw(:DEFAULT rearrange);
  1         37  
  1         195  
7 1     1   836 use Ace::Sequence::FeatureList;
  1         3  
  1         29  
8 1     1   653 use Ace::Sequence::Feature;
  1         3  
  1         35  
9 1     1   7 use AutoLoader 'AUTOLOAD';
  1         1  
  1         6  
10 1     1   28 use vars '$VERSION';
  1         2  
  1         45  
11             my %CACHE;
12              
13             $VERSION = '1.51';
14              
15 1     1   5 use constant CACHE => 1;
  1         1  
  1         72  
16              
17             use overload
18 1         6 '""' => 'asString',
19             cmp => 'cmp',
20 1     1   6 ;
  1         2  
21              
22             # synonym: stop = end
23             *stop = \&end;
24             *abs = \&absolute;
25             *source_seq = \&source;
26             *source_tag = \&subtype;
27             *primary_tag = \&type;
28              
29             my %plusminus = ( '+' => '-',
30             '-' => '+',
31             '.' => '.');
32              
33             # internal keys
34             # parent => reference Sequence in "+" strand
35             # p_offset => our start in the parent
36             # length => our length
37             # strand => our strand (+ or -)
38             # refseq => reference Sequence for coordinate system
39              
40             # object constructor
41             # usually called like this:
42             # $seq = Ace::Sequence->new($object);
43             # but can be called like this:
44             # $seq = Ace::Sequence->new(-db=>$db,-name=>$name);
45             # or
46             # $seq = Ace::Sequence->new(-seq => $object,
47             # -offset => $offset,
48             # -length => $length,
49             # -ref => $refseq
50             # );
51             # $refseq, if provided, will be used to establish the coordinate
52             # system. Otherwise the first base pair will be set to 1.
53             sub new {
54 0     0 0   my $pack = shift;
55 0           my ($seq,$start,$end,$offset,$length,$refseq,$db) =
56             rearrange([
57             ['SEQ','SEQUENCE','SOURCE'],
58             'START',
59             ['END','STOP'],
60             ['OFFSET','OFF'],
61             ['LENGTH','LEN'],
62             'REFSEQ',
63             ['DATABASE','DB'],
64             ],@_);
65              
66             # Object must have a parent sequence and/or a reference
67             # sequence. In some cases, the parent sequence will be the
68             # object itself. The reference sequence is used to set up
69             # the frame of reference for the coordinate system.
70              
71             # fetch the sequence object if we don't have it already
72 0 0 0       croak "Please provide either a Sequence object or a database and name"
      0        
73             unless ref($seq) || ($seq && $db);
74              
75             # convert start into offset
76 0 0 0       $offset = $start - 1 if defined($start) and !defined($offset);
77              
78             # convert stop/end into length
79 0 0 0       $length = ($end > $start) ? $end - $offset : $end - $offset - 2
    0          
80             if defined($end) && !defined($length);
81              
82             # if just a string is passed, try to fetch a Sequence object
83 0 0         my $obj = ref($seq) ? $seq : $db->fetch('Sequence'=>$seq);
84 0 0         unless ($obj) {
85 0           Ace->error("No Sequence named $obj found in database");
86 0           return;
87             }
88              
89             # get parent coordinates and length of this sequence
90             # the parent is an Ace Sequence object in the "+" strand
91 0           my ($parent,$p_offset,$p_length,$strand) = find_parent($obj);
92 0 0         return unless $parent;
93              
94             # handle negative strands
95 0           my $r_strand = $strand;
96 0           my $r_offset = $p_offset;
97 0   0       $offset ||= 0;
98 0 0         $offset *= -1 if $strand < 0;
99              
100             # handle feature objects
101 0 0         $offset += $obj->offset if $obj->can('smapped');
102              
103             # get source
104 0 0         my $source = $obj->can('smapped') ? $obj->source : $obj;
105              
106             # store the object into our instance variables
107 0   0       my $self = bless {
108             obj => $source,
109             offset => $offset,
110             length => $length || $p_length,
111             parent => $parent,
112             p_offset => $p_offset,
113             refseq => [$source,$r_offset,$r_strand],
114             strand => $strand,
115             absolute => 0,
116             automerge => 1,
117             },$pack;
118              
119             # set the reference sequence
120 0 0 0       eval { $self->refseq($refseq) } or return if defined $refseq;
  0            
121              
122             # wheww!
123 0           return $self;
124             }
125              
126             # return the "source" object that the user offset from
127             sub source {
128 0     0 0   $_[0]->{obj};
129             }
130              
131             # return the parent object
132 0     0 0   sub parent { $_[0]->{parent} }
133              
134             # return the length
135             #sub length { $_[0]->{length} }
136             sub length {
137 0     0 1   my $self = shift;
138 0           my ($start,$end) = ($self->start,$self->end);
139 0 0         return $end - $start + ($end > $start ? 1 : -1); # for stupid 1-based adjustments
140             }
141              
142 0     0 1   sub reversed { return shift->strand < 0; }
143              
144             sub automerge {
145 0     0 1   my $self = shift;
146 0           my $d = $self->{automerge};
147 0 0         $self->{automerge} = shift if @_;
148 0           $d;
149             }
150              
151             # return reference sequence
152             sub refseq {
153 0     0 1   my $self = shift;
154 0           my $prev = $self->{refseq};
155 0 0         if (@_) {
156 0           my $refseq = shift;
157 0           my $arrayref;
158              
159 0 0         BLOCK: {
160 0           last BLOCK unless defined ($refseq);
161              
162 0 0 0       if (ref($refseq) && ref($refseq) eq 'ARRAY') {
163 0           $arrayref = $refseq;
164 0           last BLOCK;
165             }
166              
167 0 0 0       if (ref($refseq) && ($refseq->can('smapped'))) {
168 0 0         croak "Reference sequence has no common ancestor with sequence"
169             unless $self->parent eq $refseq->parent;
170 0           my ($a,$b,$c) = @{$refseq->{refseq}};
  0            
171             # $b += $refseq->offset;
172 0           $b += $refseq->offset;
173 0           $arrayref = [$refseq,$b,$refseq->strand];
174 0           last BLOCK;
175             }
176              
177              
178             # look up reference sequence in database if we aren't given
179             # database object already
180 0 0         $refseq = $self->db->fetch('Sequence' => $refseq)
181             unless $refseq->isa('Ace::Object');
182 0 0         croak "Invalid reference sequence" unless $refseq;
183              
184             # find position of ref sequence in parent strand
185 0           my ($r_parent,$r_offset,$r_length,$r_strand) = find_parent($refseq);
186 0 0         croak "Reference sequence has no common ancestor with sequence"
187             unless $r_parent eq $self->{parent};
188              
189             # set to array reference containing this information
190 0           $arrayref = [$refseq,$r_offset,$r_strand];
191             }
192 0           $self->{refseq} = $arrayref;
193             }
194 0 0         return unless $prev;
195 0 0         return $self->parent if $self->absolute;
196 0 0         return wantarray ? @{$prev} : $prev->[0];
  0            
197             }
198              
199             # return strand
200 0     0 1   sub strand { return $_[0]->{strand} }
201              
202             # return reference strand
203             sub r_strand {
204 0     0 0   my $self = shift;
205 0 0         return "+1" if $self->absolute;
206 0 0         if (my ($ref,$r_offset,$r_strand) = $self->refseq) {
207 0           return $r_strand;
208             } else {
209 0           return $self->{strand}
210             }
211             }
212              
213 0     0 1   sub offset { $_[0]->{offset} }
214 0     0 0   sub p_offset { $_[0]->{p_offset} }
215              
216 0     0 0   sub smapped { 1; }
217 0     0 0   sub type { 'Sequence' }
218 0     0 0   sub subtype { }
219              
220             sub debug {
221 0     0 0   my $self = shift;
222 0           my $d = $self->{_debug};
223 0 0         $self->{_debug} = shift if @_;
224 0           $d;
225             }
226              
227             # return the database this sequence is associated with
228             sub db {
229 0   0 0 1   return Ace->name2db($_[0]->{db} ||= $_[0]->source->db);
230             }
231              
232             sub start {
233 0     0 1   my ($self,$abs) = @_;
234 0 0         $abs = $self->absolute unless defined $abs;
235 0 0         return $self->{p_offset} + $self->{offset} + 1 if $abs;
236              
237 0 0         if ($self->refseq) {
238 0           my ($ref,$r_offset,$r_strand) = $self->refseq;
239 0 0         return $r_strand < 0 ? 1 + $r_offset - ($self->{p_offset} + $self->{offset})
240             : 1 + $self->{p_offset} + $self->{offset} - $r_offset;
241             }
242              
243             else {
244 0           return $self->{offset} +1;
245             }
246              
247             }
248              
249             sub end {
250 0     0 1   my ($self,$abs) = @_;
251 0           my $start = $self->start($abs);
252 0 0         my $f = $self->{length} > 0 ? 1 : -1; # for stupid 1-based adjustments
253 0 0 0       if ($abs && $self->refseq ne $self->parent) {
254 0           my $r_strand = $self->r_strand;
255 0 0 0       return $start - $self->{length} + $f
      0        
256             if $r_strand < 0 or $self->{strand} < 0 or $self->{length} < 0;
257 0           return $start + $self->{length} - $f
258             }
259 0 0         return $start + $self->{length} - $f if $self->r_strand eq $self->{strand};
260 0           return $start - $self->{length} + $f;
261             }
262              
263             # turn on absolute coordinates (relative to reference sequence)
264             sub absolute {
265 0     0 1   my $self = shift;
266 0           my $prev = $self->{absolute};
267 0 0         $self->{absolute} = $_[0] if defined $_[0];
268 0           return $prev;
269             }
270              
271             # human readable string (for debugging)
272             sub asString {
273 0     0 1   my $self = shift;
274 0 0         if ($self->absolute) {
    0          
275 0           return join '',$self->parent,'/',$self->start,',',$self->end;
276             } elsif (my $ref = $self->refseq){
277 0 0         my $label = $ref->isa('Ace::Sequence::Feature') ? $ref->info : "$ref";
278 0           return join '',$label,'/',$self->start,',',$self->end;
279              
280             } else {
281 0           join '',$self->source,'/',$self->start,',',$self->end;
282             }
283             }
284              
285             sub cmp {
286 0     0 0   my ($self,$arg,$reversed) = @_;
287 0 0 0       if (ref($arg) and $arg->isa('Ace::Sequence')) {
288 0   0       my $cmp = $self->parent cmp $arg->parent
289             || $self->start <=> $arg->start;
290 0 0         return $reversed ? -$cmp : $cmp;
291             }
292 0           my $name = $self->asString;
293 0 0         return $reversed ? $arg cmp $name : $name cmp $arg;
294             }
295              
296             # Return the DNA
297             sub dna {
298 0     0 1   my $self = shift;
299 0 0         return $self->{dna} if $self->{dna};
300 0           my $raw = $self->_query('seqdna');
301 0           $raw=~s/^>.*\n//m;
302 0           $raw=~s/^\/\/.*//mg;
303 0           $raw=~s/\n//g;
304 0           $raw =~ s/\0+\Z//; # blasted nulls!
305 0 0         my $effective_strand = $self->end >= $self->start ? '+1' : '-1';
306 0 0         _complement(\$raw) if $self->r_strand ne $effective_strand;
307 0           return $self->{dna} = $raw;
308             }
309              
310             # return a gff file
311             sub gff {
312 0     0 1   my $self = shift;
313 0           my ($abs,$features) = rearrange([['ABS','ABSOLUTE'],'FEATURES'],@_);
314 0 0         $abs = $self->absolute unless defined $abs;
315              
316             # can provide list of feature names, such as 'similarity', or 'all' to get 'em all
317             # !THIS IS BROKEN; IT SHOULD LOOK LIKE FEATURE()!
318 0           my $opt = $self->_feature_filter($features);
319              
320 0           my $gff = $self->_gff($opt);
321 0 0         warn $gff if $self->debug;
322              
323 0 0         $self->transformGFF(\$gff) unless $abs;
324 0           return $gff;
325             }
326              
327             # return a GFF object using the optional GFF.pm module
328             sub GFF {
329 0     0 1   my $self = shift;
330 0           my ($filter,$converter) = @_; # anonymous subs
331 0 0         croak "GFF module not installed" unless require GFF;
332 0           require GFF::Filehandle;
333              
334 0           my @lines = grep !/^\/\//,split "\n",$self->gff(@_);
335 0           local *IN;
336 0           local ($^W) = 0; # prevent complaint by GFF module
337 0           tie *IN,'GFF::Filehandle',\@lines;
338 0           my $gff = GFF::GeneFeatureSet->new;
339 0 0         $gff->read(\*IN,$filter,$converter) if $gff;
340 0           return $gff;
341             }
342              
343             # Get the features table. Can filter by type/subtype this way:
344             # features('similarity:EST','annotation:assembly_tag')
345             sub features {
346 0     0 1   my $self = shift;
347 0           my ($filter,$opt) = $self->_make_filter(@_);
348              
349             # get raw gff file
350 0           my $gff = $self->gff(-features=>$opt);
351              
352             # turn it into a list of features
353 0           my @features = $self->_make_features($gff,$filter);
354              
355 0 0         if ($self->automerge) { # automatic merging
356             # fetch out constructed transcripts and clones
357 0           my %types = map {lc($_)=>1} (@$opt,@_);
  0            
358 0 0         if ($types{'transcript'}) {
359 0           push @features,$self->_make_transcripts(\@features);
360 0           @features = grep {$_->type !~ /^(intron|exon)$/ } @features;
  0            
361             }
362 0 0         push @features,$self->_make_clones(\@features) if $types{'clone'};
363 0 0         if ($types{'similarity'}) {
364 0           my @f = $self->_make_alignments(\@features);
365 0           @features = grep {$_->type ne 'similarity'} @features;
  0            
366 0           push @features,@f;
367             }
368             }
369              
370 0 0         return wantarray ? @features : \@features;
371             }
372              
373             # A little bit more complex - assemble a list of "transcripts"
374             # consisting of Ace::Sequence::Transcript objects. These objects
375             # contain a list of exons and introns.
376             sub transcripts {
377 0     0 1   my $self = shift;
378 0           my $curated = shift;
379 0 0         my $ef = $curated ? "exon:curated" : "exon";
380 0 0         my $if = $curated ? "intron:curated" : "intron";
381 0 0         my $sf = $curated ? "Sequence:curated" : "Sequence";
382 0           my @features = $self->features($ef,$if,$sf);
383 0 0         return unless @features;
384 0           return $self->_make_transcripts(\@features);
385             }
386              
387             sub _make_transcripts {
388 0     0     my $self = shift;
389 0           my $features = shift;
390              
391 0           require Ace::Sequence::Transcript;
392 0           my %transcripts;
393              
394 0           for my $feature (@$features) {
395 0           my $transcript = $feature->info;
396 0 0         next unless $transcript;
397 0 0         if ($feature->type =~ /^(exon|intron|cds)$/) {
    0          
398 0           my $type = $1;
399 0           push @{$transcripts{$transcript}{$type}},$feature;
  0            
400             } elsif ($feature->type eq 'Sequence') {
401 0   0       $transcripts{$transcript}{base} ||= $feature;
402             }
403             }
404              
405             # get rid of transcripts without exons
406 0           foreach (keys %transcripts) {
407 0 0         delete $transcripts{$_} unless exists $transcripts{$_}{exon}
408             }
409              
410             # map the rest onto Ace::Sequence::Transcript objects
411 0           return map {Ace::Sequence::Transcript->new($transcripts{$_})} keys %transcripts;
  0            
412             }
413              
414             # Reassemble clones from clone left and right ends
415             sub clones {
416 0     0 1   my $self = shift;
417 0           my @clones = $self->features('Clone_left_end','Clone_right_end','Sequence');
418 0           my %clones;
419 0 0         return unless @clones;
420 0           return $self->_make_clones(\@clones);
421             }
422              
423             sub _make_clones {
424 0     0     my $self = shift;
425 0           my $features = shift;
426              
427 0           my (%clones,@canonical_clones);
428 0 0         my $start_label = $self->strand < 0 ? 'end' : 'start';
429 0 0         my $end_label = $self->strand < 0 ? 'start' : 'end';
430 0           for my $feature (@$features) {
431 0 0         $clones{$feature->info}{$start_label} = $feature->start if $feature->type eq 'Clone_left_end';
432 0 0         $clones{$feature->info}{$end_label} = $feature->start if $feature->type eq 'Clone_right_end';
433              
434 0 0         if ($feature->type eq 'Sequence') {
435 0           my $info = $feature->info;
436 0 0         next if $info =~ /LINK|CHROMOSOME|\.\w+$/;
437 0 0         if ($info->Genomic_canonical(0)) {
438 0 0         push (@canonical_clones,$info->Clone) if $info->Clone;
439             }
440             }
441             }
442              
443 0           foreach (@canonical_clones) {
444 0   0       $clones{$_} ||= {};
445             }
446              
447 0           my @features;
448 0           my ($r,$r_offset,$r_strand) = $self->refseq;
449 0           my $parent = $self->parent;
450 0           my $abs = $self->absolute;
451 0 0         if ($abs) {
452 0           $r_offset = 0;
453 0           $r = $parent;
454 0           $r_strand = '+1';
455             }
456              
457             # BAD HACK ALERT. WE DON'T KNOW WHERE THE LEFT END OF THE CLONE IS SO WE USE
458             # THE MAGIC NUMBER -99_999_999 to mean "off left end" and
459             # +99_999_999 to mean "off right end"
460 0           for my $clone (keys %clones) {
461 0   0       my $start = $clones{$clone}{start} || -99_999_999;
462 0   0       my $end = $clones{$clone}{end} || +99_999_999;
463 0           my $phony_gff = join "\t",($parent,'Clone','structural',$start,$end,'.','.','.',qq(Clone "$clone"));
464 0           push @features,Ace::Sequence::Feature->new($parent,$r,$r_offset,$r_strand,$abs,$phony_gff);
465             }
466 0           return @features;
467             }
468              
469             # Assemble a list of "GappedAlignment" objects. These objects
470             # contain a list of aligned segments.
471             sub alignments {
472 0     0 0   my $self = shift;
473 0           my @subtypes = @_;
474 0           my @types = map { "similarity:\^$_\$" } @subtypes;
  0            
475 0 0         push @types,'similarity' unless @types;
476 0           return $self->features(@types);
477             }
478              
479             sub segments {
480 0     0 0   my $self = shift;
481 0           return;
482             }
483              
484             sub _make_alignments {
485 0     0     my $self = shift;
486 0           my $features = shift;
487 0           require Ace::Sequence::GappedAlignment;
488              
489 0           my %homol;
490              
491 0           for my $feature (@$features) {
492 0 0         next unless $feature->type eq 'similarity';
493 0           my $target = $feature->info;
494 0           my $subtype = $feature->subtype;
495 0           push @{$homol{$target,$subtype}},$feature;
  0            
496             }
497              
498             # map onto Ace::Sequence::GappedAlignment objects
499 0           return map {Ace::Sequence::GappedAlignment->new($homol{$_})} keys %homol;
  0            
500             }
501              
502             # return list of features quickly
503             sub feature_list {
504 0     0 1   my $self = shift;
505 0 0         return $self->{'feature_list'} if $self->{'feature_list'};
506 0 0         return unless my $raw = $self->_query('seqfeatures -version 2 -list');
507 0           return $self->{'feature_list'} = Ace::Sequence::FeatureList->new($raw);
508             }
509              
510             # transform a GFF file into the coordinate system of the sequence
511             sub transformGFF {
512 0     0 0   my $self = shift;
513 0           my $gff = shift;
514 0           my $parent = $self->parent;
515 0           my $strand = $self->{strand};
516 0           my $source = $self->source;
517 0           my ($ref_source,$ref_offset,$ref_strand) = $self->refseq;
518 0   0       $ref_source ||= $source;
519 0   0       $ref_strand ||= $strand;
520              
521 0 0         if ($ref_strand > 0) {
522 0 0         my $o = defined($ref_offset) ? $ref_offset : ($self->p_offset + $self->offset);
523             # find anything that looks like a numeric field and subtract offset from it
524 0           $$gff =~ s/(?
  0            
525 0           $$gff =~ s/^$parent/$source/mg;
526 0           $$gff =~ s/\#\#sequence-region\s+\S+/##sequence-region $ref_source/m;
527 0           $$gff =~ s/FMAP_FEATURES\s+"\S+"/FMAP_FEATURES "$ref_source"/m;
528 0           return;
529             } else { # strand eq '-'
530 0 0         my $o = defined($ref_offset) ? (2 + $ref_offset) : (2 + $self->p_offset - $self->offset);
531 0           $$gff =~ s/(?
  0            
532 0           $$gff =~ s/(Target \"[^\"]+\" )(-?\d+) (-?\d+)/$1 $3 $2/g;
533 0           $$gff =~ s/^$parent/$source/mg;
534 0           $$gff =~ s/\#\#sequence-region\s+\S+\s+(-?\d+)\s+(-?\d+)/"##sequence-region $ref_source " . ($o - $2) . ' ' . ($o - $1) . ' (reversed)'/em;
  0            
535 0           $$gff =~ s/FMAP_FEATURES\s+"\S+"\s+(-?\d+)\s+(-?\d+)/"FMAP_FEATURES \"$ref_source\" " . ($o - $2) . ' ' . ($o - $1) . ' (reversed)'/em;
  0            
536             }
537              
538             }
539              
540             # return a name for the object
541             sub name {
542 0     0 1   return shift->source_seq->name;
543             }
544              
545             # for compatibility with Ace::Sequence::Feature
546             sub info {
547 0     0 0   return shift->source_seq;
548             }
549              
550             ###################### internal functions #################
551             # not necessarily object-oriented!!
552              
553             # return parent, parent offset and strand
554             sub find_parent {
555 0     0 0   my $obj = shift;
556              
557             # first, if we are passed an Ace::Sequence, then we can inherit
558             # these settings directly
559 0 0         return (@{$obj}{qw(parent p_offset length)},$obj->r_strand)
  0            
560             if $obj->isa('Ace::Sequence');
561              
562             # otherwise, if we are passed an Ace::Object, then we must
563             # traverse upwards until we find a suitable parent
564 0 0         return _traverse($obj) if $obj->isa('Ace::Object');
565              
566             # otherwise, we don't know what to do...
567 0           croak "Source sequence not an Ace::Object or an Ace::Sequence";
568             }
569              
570             sub _get_parent {
571 0     0     my $obj = shift;
572             # ** DANGER DANGER WILL ROBINSON! **
573             # This is an experiment in caching parents to speed lookups. Probably eats memory voraciously.
574 0 0         return $CACHE{$obj} if CACHE && exists $CACHE{$obj};
575 0   0       my $p = $obj->get(S_Parent=>2)|| $obj->get(Source=>1);
576 0 0         return unless $p;
577 0           return CACHE ? $CACHE{$obj} = $p->fetch
578             : $p->fetch;
579             }
580              
581             sub _get_children {
582 0     0     my $obj = shift;
583 0           my @pieces = $obj->get(S_Child=>2);
584 0 0         return @pieces if @pieces;
585 0           return @pieces = $obj->get('Subsequence');
586             }
587              
588             # get sequence, offset and strand of topmost container
589             sub _traverse {
590 0     0     my $obj = shift;
591 0           my ($offset,$length);
592              
593             # invoke seqget to find the top-level container for this sequence
594 0           my ($tl,$tl_start,$tl_end) = _get_toplevel($obj);
595 0   0       $tl_start ||= 0;
596 0   0       $tl_end ||= 0;
597              
598             # make it an object
599 0           $tl = ref($obj)->new(-name=>$tl,-class=>'Sequence',-db=>$obj->db);
600              
601 0           $offset += $tl_start - 1; # offset to beginning of toplevel
602 0   0       $length ||= abs($tl_end - $tl_start) + 1;
603 0 0         my $strand = $tl_start < $tl_end ? +1 : -1;
604              
605 0 0         return ($tl,$offset,$strand < 0 ? ($length,'-1') : ($length,'+1') ) if $length;
    0          
606             }
607              
608             sub _get_toplevel {
609 0     0     my $obj = shift;
610 0           my $class = $obj->class;
611 0           my $name = $obj->name;
612              
613 0           my $smap = $obj->db->raw_query("gif smap -from $class:$name");
614 0           my ($parent,$pstart,$pstop,$tstart,$tstop,$map_type) =
615             $smap =~ /^SMAP\s+(\S+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(.+)/;
616              
617 0   0       $parent ||= '';
618 0           $parent =~ s/^Sequence://; # remove this in next version of Acedb
619 0           return ($parent,$pstart,$pstop);
620             }
621              
622             # create subroutine that filters GFF files for certain feature types
623             sub _make_filter {
624 0     0     my $self = shift;
625 0           my $automerge = $self->automerge;
626              
627             # parse out the filter
628 0           my %filter;
629 0           foreach (@_) {
630 0           my ($type,$filter) = split(':',$_,2);
631 0 0 0       if ($automerge && lc($type) eq 'transcript') {
    0 0        
632 0           @filter{'exon','intron','Sequence','cds'} = ([undef],[undef],[undef],[undef]);
633             } elsif ($automerge && lc($type) eq 'clone') {
634 0           @filter{'Clone_left_end','Clone_right_end','Sequence'} = ([undef],[undef],[undef]);
635             } else {
636 0           push @{$filter{$type}},$filter;
  0            
637             }
638             }
639              
640             # create pattern-match sub
641 0           my $sub;
642             my $promiscuous; # indicates that there is a subtype without a type
643              
644 0 0         if (%filter) {
645 0           my $s = "sub { my \@d = split(\"\\t\",\$_[0]);\n";
646 0           for my $type (keys %filter) {
647 0           my $expr;
648 0           my $subtypes = $filter{$type};
649 0 0         if ($type ne '') {
650 0           for my $st (@$subtypes) {
651 0 0         $expr .= defined $st ? "return 1 if \$d[2]=~/$type/i && \$d[1]=~/$st/i;\n"
652             : "return 1 if \$d[2]=~/$type/i;\n"
653             }
654             } else { # no type, only subtypes
655 0           $promiscuous++;
656 0           for my $st (@$subtypes) {
657 0 0         next unless defined $st;
658 0           $expr .= "return 1 if \$d[1]=~/$st/i;\n";
659             }
660             }
661 0           $s .= $expr;
662             }
663 0           $s .= "return;\n }";
664              
665 0           $sub = eval $s;
666 0 0         croak $@ if $@;
667             } else {
668 0     0     $sub = sub { 1; }
669 0           }
670 0 0         return ($sub,$promiscuous ? [] : [keys %filter]);
671             }
672              
673             # turn a GFF file and a filter into a list of Ace::Sequence::Feature objects
674             sub _make_features {
675 0     0     my $self = shift;
676 0           my ($gff,$filter) = @_;
677              
678 0           my ($r,$r_offset,$r_strand) = $self->refseq;
679 0           my $parent = $self->parent;
680 0           my $abs = $self->absolute;
681 0 0         if ($abs) {
682 0           $r_offset = 0;
683 0           $r = $parent;
684 0           $r_strand = '+1';
685             }
686 0   0       my @features = map {Ace::Sequence::Feature->new($parent,$r,$r_offset,$r_strand,$abs,$_)}
  0            
687             grep !m@^(?:\#|//)@ && $filter->($_),split("\n",$gff);
688             }
689              
690              
691             # low level GFF call, no changing absolute to relative coordinates
692             sub _gff {
693 0     0     my $self = shift;
694 0           my ($opt,$db) = @_;
695 0           my $data = $self->_query("seqfeatures -version 2 $opt",$db);
696 0           $data =~ s/\0+\Z//;
697 0           return $data; #blasted nulls!
698             }
699              
700             # shortcut for running a gif query
701             sub _query {
702 0     0     my $self = shift;
703 0           my $command = shift;
704 0   0       my $db = shift || $self->db;
705              
706 0           my $parent = $self->parent;
707 0           my $start = $self->start(1);
708 0           my $end = $self->end(1);
709 0 0         ($start,$end) = ($end,$start) if $start > $end; #flippity floppity
710              
711 0           my $coord = "-coords $start $end";
712              
713             # BAD BAD HACK ALERT - CHECKS THE QUERY THAT IS PASSED DOWN
714             # ALSO MAKES THINGS INCOMPATIBLE WITH PRIOR 4.9 servers.
715             # my $opt = $command =~ /seqfeatures/ ? '-nodna' : '';
716 0           my $opt = '-noclip';
717              
718 0           my $query = "gif seqget $parent $opt $coord ; $command";
719 0 0         warn $query if $self->debug;
720              
721 0           return $db->raw_query("gif seqget $parent $opt $coord ; $command");
722             }
723              
724             # utility function -- reverse complement
725             sub _complement {
726 0     0     my $dna = shift;
727 0           $$dna =~ tr/GATCgatc/CTAGctag/;
728 0           $$dna = scalar reverse $$dna;
729             }
730              
731             sub _feature_filter {
732 0     0     my $self = shift;
733 0           my $features = shift;
734 0 0         return '' unless $features;
735 0           my $opt = '';
736 0 0 0       $opt = '+feature ' . join('|',@$features) if ref($features) eq 'ARRAY' && @$features;
737 0 0         $opt = "+feature $features" unless ref $features;
738 0           $opt;
739             }
740              
741             1;
742              
743             =head1 NAME
744              
745             Ace::Sequence - Examine ACeDB Sequence Objects
746              
747             =head1 SYNOPSIS
748              
749             # open database connection and get an Ace::Object sequence
750             use Ace::Sequence;
751              
752             $db = Ace->connect(-host => 'stein.cshl.org',-port => 200005);
753             $obj = $db->fetch(Predicted_gene => 'ZK154.3');
754              
755             # Wrap it in an Ace::Sequence object
756             $seq = Ace::Sequence->new($obj);
757              
758             # Find all the exons
759             @exons = $seq->features('exon');
760              
761             # Find all the exons predicted by various versions of "genefinder"
762             @exons = $seq->features('exon:genefinder.*');
763              
764             # Iterate through the exons, printing their start, end and DNA
765             for my $exon (@exons) {
766             print join "\t",$exon->start,$exon->end,$exon->dna,"\n";
767             }
768              
769             # Find the region 1000 kb upstream of the first exon
770             $sub = Ace::Sequence->new(-seq=>$exons[0],
771             -offset=>-1000,-length=>1000);
772              
773             # Find all features in that area
774             @features = $sub->features;
775              
776             # Print its DNA
777             print $sub->dna;
778              
779             # Create a new Sequence object from the first 500 kb of chromosome 1
780             $seq = Ace::Sequence->new(-name=>'CHROMOSOME_I',-db=>$db,
781             -offset=>0,-length=>500_000);
782              
783             # Get the GFF dump as a text string
784             $gff = $seq->gff;
785              
786             # Limit dump to Predicted_genes
787             $gff_genes = $seq->gff(-features=>'Predicted_gene');
788              
789             # Return a GFF object (using optional GFF.pm module from Sanger)
790             $gff_obj = $seq->GFF;
791              
792             =head1 DESCRIPTION
793              
794             I, and its allied classes L and
795             L, provide a convenient interface to the
796             ACeDB Sequence classes and the GFF sequence feature file format.
797              
798             Using this class, you can define a region of the genome by using a
799             landmark (sequenced clone, link, superlink, predicted gene), an offset
800             from that landmark, and a distance. Offsets and distances can be
801             positive or negative. This will return an I object.
802             Once a region is defined, you may retrieve its DNA sequence, or query
803             the database for any features that may be contained within this
804             region. Features can be returned as objects (using the
805             I class), as GFF text-only dumps, or in the
806             form of the GFF class defined by the Sanger Centre's GFF.pm module.
807              
808             This class builds on top of L and L. Please see
809             their manual pages before consulting this one.
810              
811             =head1 Creating New Ace::Sequence Objects, the new() Method
812              
813             $seq = Ace::Sequence->new($object);
814              
815             $seq = Ace::Sequence->new(-source => $object,
816             -offset => $offset,
817             -length => $length,
818             -refseq => $reference_sequence);
819              
820             $seq = Ace::Sequence->new(-name => $name,
821             -db => $db,
822             -offset => $offset,
823             -length => $length,
824             -refseq => $reference_sequence);
825              
826             In order to create an I you will need an active I
827             database accessor. Sequence regions are defined using a "source"
828             sequence, an offset, and a length. Optionally, you may also provide a
829             "reference sequence" to establish the coordinate system for all
830             inquiries. Sequences may be generated from existing I
831             sequence objects, from other I and
832             I objects, or from a sequence name and a
833             database handle.
834              
835             The class method named new() is the interface to these facilities. In
836             its simplest, one-argument form, you provide new() with a
837             previously-created I that points to Sequence or
838             sequence-like object (the meaning of "sequence-like" is explained in
839             more detail below.) The new() method will return an I
840             object extending from the beginning of the object through to its
841             natural end.
842              
843             In the named-parameter form of new(), the following arguments are
844             recognized:
845              
846             =over 4
847              
848             =item -source
849              
850             The sequence source. This must be an I of the "Sequence"
851             class, or be a sequence-like object containing the SMap tag (see
852             below).
853              
854             =item -offset
855              
856             An offset from the beginning of the source sequence. The retrieved
857             I will begin at this position. The offset can be any
858             positive or negative integer. Offets are B<0-based>.
859              
860             =item -length
861              
862             The length of the sequence to return. Either a positive or negative
863             integer can be specified. If a negative length is given, the returned
864             sequence will be complemented relative to the source sequence.
865              
866             =item -refseq
867              
868             The sequence to use to establish the coordinate system for the
869             returned sequence. Normally the source sequence is used to establish
870             the coordinate system, but this can be used to override that choice.
871             You can provide either an I or just a sequence name for
872             this argument. The source and reference sequences must share a common
873             ancestor, but do not have to be directly related. An attempt to use a
874             disjunct reference sequence, such as one on a different chromosome,
875             will fail.
876              
877             =item -name
878              
879             As an alternative to using an I with the B<-source>
880             argument, you may specify a source sequence using B<-name> and B<-db>.
881             The I module will use the provided database accessor to
882             fetch a Sequence object with the specified name. new() will return
883             undef is no Sequence by this name is known.
884              
885             =item -db
886              
887             This argument is required if the source sequence is specified by name
888             rather than by object reference.
889              
890             =back
891              
892             If new() is successful, it will create an I object and
893             return it. Otherwise it will return undef and return a descriptive
894             message in Ace->error(). Certain programming errors, such as a
895             failure to provide required arguments, cause a fatal error.
896              
897             =head2 Reference Sequences and the Coordinate System
898              
899             When retrieving information from an I, the coordinate
900             system is based on the sequence segment selected at object creation
901             time. That is, the "+1" strand is the natural direction of the
902             I object, and base pair 1 is its first base pair. This
903             behavior can be overridden by providing a reference sequence to the
904             new() method, in which case the orientation and position of the
905             reference sequence establishes the coordinate system for the object.
906              
907             In addition to the reference sequence, there are two other sequences
908             used by I for internal bookeeping. The "source"
909             sequence corresponds to the smallest ACeDB sequence object that
910             completely encloses the selected sequence segment. The "parent"
911             sequence is the smallest ACeDB sequence object that contains the
912             "source". The parent is used to derive the length and orientation of
913             source sequences that are not directly associated with DNA objects.
914              
915             In many cases, the source sequence will be identical to the sequence
916             initially passed to the new() method. However, there are exceptions
917             to this rule. One common exception occurs when the offset and/or
918             length cross the boundaries of the passed-in sequence. In this case,
919             the ACeDB database is searched for the smallest sequence that contains
920             both endpoints of the I object.
921              
922             The other common exception occurs in Ace 4.8, where there is support
923             for "sequence-like" objects that contain the C ("Sequence Map")
924             tag. The C tag provides genomic location information for
925             arbitrary object -- not just those descended from the Sequence class.
926             This allows ACeDB to perform genome map operations on objects that are
927             not directly related to sequences, such as genetic loci that have been
928             interpolated onto the physical map. When an C-containing object
929             is passed to the I new() method, the module will again
930             choose the smallest ACeDB Sequence object that contains both
931             end-points of the desired region.
932              
933             If an I object is used to create a new I
934             object, then the original object's source is inherited.
935              
936             =head1 Object Methods
937              
938             Once an I object is created, you can query it using the
939             following methods:
940              
941             =head2 asString()
942              
943             $name = $seq->asString;
944              
945             Returns a human-readable identifier for the sequence in the form
946             I, where "Source" is the name of the source
947             sequence, and "start" and "end" are the endpoints of the sequence
948             relative to the source (using 1-based indexing). This method is
949             called automatically when the I is used in a string
950             context.
951              
952             =head2 source_seq()
953              
954             $source = $seq->source_seq;
955              
956             Return the source of the I.
957              
958             =head2 parent_seq()
959              
960             $parent = $seq->parent_seq;
961              
962             Return the immediate ancestor of the sequence. The parent of the
963             top-most sequence (such as the CHROMOSOME link) is itself. This
964             method is used internally to ascertain the length of source sequences
965             which are not associated with a DNA object.
966              
967             NOTE: this procedure is a trifle funky and cannot reliably be used to
968             traverse upwards to the top-most sequence. The reason for this is
969             that it will return an I in some cases, and an
970             I in others. Use get_parent() to traverse upwards
971             through a uniform series of I objects upwards.
972              
973             =head2 refseq([$seq])
974              
975             $refseq = $seq->refseq;
976              
977             Returns the reference sequence, if one is defined.
978              
979             $seq->refseq($new_ref);
980              
981             Set the reference sequence. The reference sequence must share the same
982             ancestor with $seq.
983              
984             =head2 start()
985              
986             $start = $seq->start;
987              
988             Start of this sequence, relative to the source sequence, using 1-based
989             indexing.
990              
991             =head2 end()
992              
993             $end = $seq->end;
994              
995             End of this sequence, relative to the source sequence, using 1-based
996             indexing.
997              
998             =head2 offset()
999              
1000             $offset = $seq->offset;
1001              
1002             Offset of the beginning of this sequence relative to the source
1003             sequence, using 0-based indexing. The offset may be negative if the
1004             beginning of the sequence is to the left of the beginning of the
1005             source sequence.
1006              
1007             =head2 length()
1008              
1009             $length = $seq->length;
1010              
1011             The length of this sequence, in base pairs. The length may be
1012             negative if the sequence's orientation is reversed relative to the
1013             source sequence. Use abslength() to obtain the absolute value of
1014             the sequence length.
1015              
1016             =head2 abslength()
1017              
1018             $length = $seq->abslength;
1019              
1020             Return the absolute value of the length of the sequence.
1021              
1022             =head2 strand()
1023              
1024             $strand = $seq->strand;
1025              
1026             Returns +1 for a sequence oriented in the natural direction of the
1027             genomic reference sequence, or -1 otherwise.
1028              
1029             =head2 reversed()
1030              
1031             Returns true if the segment is reversed relative to the canonical
1032             genomic direction. This is the same as $seq->strand < 0.
1033              
1034             =head2 dna()
1035              
1036             $dna = $seq->dna;
1037              
1038             Return the DNA corresponding to this sequence. If the sequence length
1039             is negative, the reverse complement of the appropriate segment will be
1040             returned.
1041              
1042             ACeDB allows Sequences to exist without an associated DNA object
1043             (which typically happens during intermediate stages of a sequencing
1044             project. In such a case, the returned sequence will contain the
1045             correct number of "-" characters.
1046              
1047             =head2 name()
1048              
1049             $name = $seq->name;
1050              
1051             Return the name of the source sequence as a string.
1052              
1053             =head2 get_parent()
1054              
1055             $parent = $seq->parent;
1056              
1057             Return the immediate ancestor of this I (i.e., the
1058             sequence that contains this one). The return value is a new
1059             I or undef, if no parent sequence exists.
1060              
1061             =head2 get_children()
1062              
1063             @children = $seq->get_children();
1064              
1065             Returns all subsequences that exist as independent objects in the
1066             ACeDB database. What exactly is returned is dependent on the data
1067             model. In older ACeDB databases, the only subsequences are those
1068             under the catchall Subsequence tag. In newer ACeDB databases, the
1069             objects returned correspond to objects to the right of the S_Child
1070             subtag using a tag[2] syntax, and may include Predicted_genes,
1071             Sequences, Links, or other objects. The return value is a list of
1072             I objects.
1073              
1074             =head2 features()
1075              
1076             @features = $seq->features;
1077             @features = $seq->features('exon','intron','Predicted_gene');
1078             @features = $seq->features('exon:GeneFinder','Predicted_gene:hand.*');
1079              
1080             features() returns an array of I objects. If
1081             called without arguments, features() returns all features that cross
1082             the sequence region. You may also provide a filter list to select a
1083             set of features by type and subtype. The format of the filter list
1084             is:
1085              
1086             type:subtype
1087              
1088             Where I is the class of the feature (the "feature" field of the
1089             GFF format), and I is a description of how the feature was
1090             derived (the "source" field of the GFF format). Either of these
1091             fields can be absent, and either can be a regular expression. More
1092             advanced filtering is not supported, but is provided by the Sanger
1093             Centre's GFF module.
1094              
1095             The order of the features in the returned list is not specified. To
1096             obtain features sorted by position, use this idiom:
1097              
1098             @features = sort { $a->start <=> $b->start } $seq->features;
1099              
1100             =head2 feature_list()
1101              
1102             my $list = $seq->feature_list();
1103              
1104             This method returns a summary list of the features that cross the
1105             sequence in the form of a L object. From the
1106             L object you can obtain the list of feature names
1107             and the number of each type. The feature list is obtained from the
1108             ACeDB server with a single short transaction, and therefore has much
1109             less overhead than features().
1110              
1111             See L for more details.
1112              
1113             =head2 transcripts()
1114              
1115             This returns a list of Ace::Sequence::Transcript objects, which are
1116             specializations of Ace::Sequence::Feature. See L
1117             for details.
1118              
1119             =head2 clones()
1120              
1121             This returns a list of Ace::Sequence::Feature objects containing
1122             reconstructed clones. This is a nasty hack, because ACEDB currently
1123             records clone ends, but not the clones themselves, meaning that we
1124             will not always know both ends of the clone. In this case the missing
1125             end has a synthetic position of -99,999,999 or +99,999,999. Sorry.
1126              
1127             =head2 gff()
1128              
1129             $gff = $seq->gff();
1130             $gff = $seq->gff(-abs => 1,
1131             -features => ['exon','intron:GeneFinder']);
1132              
1133             This method returns a GFF file as a scalar. The following arguments
1134             are optional:
1135              
1136             =over 4
1137              
1138             =item -abs
1139              
1140             Ordinarily the feature entries in the GFF file will be returned in
1141             coordinates relative to the start of the I object.
1142             Position 1 will be the start of the sequence object, and the "+"
1143             strand will be the sequence object's natural orientation. However if
1144             a true value is provided to B<-abs>, the coordinate system used will
1145             be relative to the start of the source sequence, i.e. the native ACeDB
1146             Sequence object (usually a cosmid sequence or a link).
1147              
1148             If a reference sequence was provided when the I was
1149             created, it will be used by default to set the coordinate system.
1150             Relative coordinates can be reenabled by providing a false value to
1151             B<-abs>.
1152              
1153             Ordinarily the coordinate system manipulations automatically "do what
1154             you want" and you will not need to adjust them. See also the abs()
1155             method described below.
1156              
1157             =item -features
1158              
1159             The B<-features> argument filters the features according to a list of
1160             types and subtypes. The format is identical to the one described for
1161             the features() method. A single filter may be provided as a scalar
1162             string. Multiple filters may be passed as an array reference.
1163              
1164             =back
1165              
1166             See also the GFF() method described next.
1167              
1168             =head2 GFF()
1169              
1170             $gff_object = $seq->gff;
1171             $gff_object = $seq->gff(-abs => 1,
1172             -features => ['exon','intron:GeneFinder']);
1173              
1174             The GFF() method takes the same arguments as gff() described above,
1175             but it returns a I object from the GFF.pm
1176             module. If the GFF module is not installed, this method will generate
1177             a fatal error.
1178              
1179             =head2 absolute()
1180              
1181             $abs = $seq->absolute;
1182             $abs = $seq->absolute(1);
1183              
1184             This method controls whether the coordinates of features are returned
1185             in absolute or relative coordinates. "Absolute" coordinates are
1186             relative to the underlying source or reference sequence. "Relative"
1187             coordinates are relative to the I object. By default,
1188             coordinates are relative unless new() was provided with a reference
1189             sequence. This default can be examined and changed using absolute().
1190              
1191             =head2 automerge()
1192              
1193             $merge = $seq->automerge;
1194             $seq->automerge(0);
1195              
1196             This method controls whether groups of features will automatically be
1197             merged together by the features() call. If true (the default), then
1198             the left and right end of clones will be merged into "clone" features,
1199             introns, exons and CDS entries will be merged into
1200             Ace::Sequence::Transcript objects, and similarity entries will be
1201             merged into Ace::Sequence::GappedAlignment objects.
1202              
1203             =head2 db()
1204              
1205             $db = $seq->db;
1206              
1207             Returns the L database accessor associated with this sequence.
1208              
1209             =head1 SEE ALSO
1210              
1211             L, L, L,
1212             L, L
1213              
1214             =head1 AUTHOR
1215              
1216             Lincoln Stein with extensive help from Jean
1217             Thierry-Mieg
1218              
1219             Many thanks to David Block for finding and
1220             fixing the nasty off-by-one errors.
1221              
1222             Copyright (c) 1999, Lincoln D. Stein
1223              
1224             This library is free software; you can redistribute it and/or modify
1225             it under the same terms as Perl itself. See DISCLAIMER.txt for
1226             disclaimers of warranty.
1227              
1228             =cut
1229              
1230             __END__