File Coverage

blib/lib/Ace/Sequence.pm

Criterion	Covered	Total	%
statement	30	392	7.6
branch	0	204	0.0
condition	0	85	0.0
subroutine	10	58	17.2
pod	19	34	55.8
total	59	773	7.6

line	stmt	bran	cond	sub	pod	time	code
1							package Ace::Sequence;
2	1			1		3411	use strict;
	1					3
	1					33
3
4	1			1		5	use Carp;
	1					1
	1					96
5	1			1		6	use strict;
	1					2
	1					34
6	1			1		857	use Ace 1.50 qw(:DEFAULT rearrange);
	1					37
	1					195
7	1			1		836	use Ace::Sequence::FeatureList;
	1					3
	1					29
8	1			1		653	use Ace::Sequence::Feature;
	1					3
	1					35
9	1			1		7	use AutoLoader 'AUTOLOAD';
	1					1
	1					6
10	1			1		28	use vars '$VERSION';
	1					2
	1					45
11							my %CACHE;
12
13							$VERSION = '1.51';
14
15	1			1		5	use constant CACHE => 1;
	1					1
	1					72
16
17							use overload
18	1					6	'""' => 'asString',
19							cmp => 'cmp',
20	1			1		6	;
	1					2
21
22							# synonym: stop = end
23							*stop = \&end;
24							*abs = \&absolute;
25							*source_seq = \&source;
26							*source_tag = \&subtype;
27							*primary_tag = \&type;
28
29							my %plusminus = ( '+' => '-',
30							'-' => '+',
31							'.' => '.');
32
33							# internal keys
34							# parent => reference Sequence in "+" strand
35							# p_offset => our start in the parent
36							# length => our length
37							# strand => our strand (+ or -)
38							# refseq => reference Sequence for coordinate system
39
40							# object constructor
41							# usually called like this:
42							# $seq = Ace::Sequence->new($object);
43							# but can be called like this:
44							# $seq = Ace::Sequence->new(-db=>$db,-name=>$name);
45							# or
46							# $seq = Ace::Sequence->new(-seq => $object,
47							# -offset => $offset,
48							# -length => $length,
49							# -ref => $refseq
50							# );
51							# $refseq, if provided, will be used to establish the coordinate
52							# system. Otherwise the first base pair will be set to 1.
53							sub new {
54	0			0	0		my $pack = shift;
55	0						my ($seq,$start,$end,$offset,$length,$refseq,$db) =
56							rearrange([
57							['SEQ','SEQUENCE','SOURCE'],
58							'START',
59							['END','STOP'],
60							['OFFSET','OFF'],
61							['LENGTH','LEN'],
62							'REFSEQ',
63							['DATABASE','DB'],
64							],@_);
65
66							# Object must have a parent sequence and/or a reference
67							# sequence. In some cases, the parent sequence will be the
68							# object itself. The reference sequence is used to set up
69							# the frame of reference for the coordinate system.
70
71							# fetch the sequence object if we don't have it already
72	0	0	0				croak "Please provide either a Sequence object or a database and name"
			0
73							unless ref($seq) \|\| ($seq && $db);
74
75							# convert start into offset
76	0	0	0				$offset = $start - 1 if defined($start) and !defined($offset);
77
78							# convert stop/end into length
79	0	0	0				$length = ($end > $start) ? $end - $offset : $end - $offset - 2
		0
80							if defined($end) && !defined($length);
81
82							# if just a string is passed, try to fetch a Sequence object
83	0	0					my $obj = ref($seq) ? $seq : $db->fetch('Sequence'=>$seq);
84	0	0					unless ($obj) {
85	0						Ace->error("No Sequence named $obj found in database");
86	0						return;
87							}
88
89							# get parent coordinates and length of this sequence
90							# the parent is an Ace Sequence object in the "+" strand
91	0						my ($parent,$p_offset,$p_length,$strand) = find_parent($obj);
92	0	0					return unless $parent;
93
94							# handle negative strands
95	0						my $r_strand = $strand;
96	0						my $r_offset = $p_offset;
97	0		0				$offset \|\|= 0;
98	0	0					$offset *= -1 if $strand < 0;
99
100							# handle feature objects
101	0	0					$offset += $obj->offset if $obj->can('smapped');
102
103							# get source
104	0	0					my $source = $obj->can('smapped') ? $obj->source : $obj;
105
106							# store the object into our instance variables
107	0		0				my $self = bless {
108							obj => $source,
109							offset => $offset,
110							length => $length \|\| $p_length,
111							parent => $parent,
112							p_offset => $p_offset,
113							refseq => [$source,$r_offset,$r_strand],
114							strand => $strand,
115							absolute => 0,
116							automerge => 1,
117							},$pack;
118
119							# set the reference sequence
120	0	0	0				eval { $self->refseq($refseq) } or return if defined $refseq;
	0
121
122							# wheww!
123	0						return $self;
124							}
125
126							# return the "source" object that the user offset from
127							sub source {
128	0			0	0		$_[0]->{obj};
129							}
130
131							# return the parent object
132	0			0	0		sub parent { $_[0]->{parent} }
133
134							# return the length
135							#sub length { $_[0]->{length} }
136							sub length {
137	0			0	1		my $self = shift;
138	0						my ($start,$end) = ($self->start,$self->end);
139	0	0					return $end - $start + ($end > $start ? 1 : -1); # for stupid 1-based adjustments
140							}
141
142	0			0	1		sub reversed { return shift->strand < 0; }
143
144							sub automerge {
145	0			0	1		my $self = shift;
146	0						my $d = $self->{automerge};
147	0	0					$self->{automerge} = shift if @_;
148	0						$d;
149							}
150
151							# return reference sequence
152							sub refseq {
153	0			0	1		my $self = shift;
154	0						my $prev = $self->{refseq};
155	0	0					if (@_) {
156	0						my $refseq = shift;
157	0						my $arrayref;
158
159	0	0					BLOCK: {
160	0						last BLOCK unless defined ($refseq);
161
162	0	0	0				if (ref($refseq) && ref($refseq) eq 'ARRAY') {
163	0						$arrayref = $refseq;
164	0						last BLOCK;
165							}
166
167	0	0	0				if (ref($refseq) && ($refseq->can('smapped'))) {
168	0	0					croak "Reference sequence has no common ancestor with sequence"
169							unless $self->parent eq $refseq->parent;
170	0						my ($a,$b,$c) = @{$refseq->{refseq}};
	0
171							# $b += $refseq->offset;
172	0						$b += $refseq->offset;
173	0						$arrayref = [$refseq,$b,$refseq->strand];
174	0						last BLOCK;
175							}
176
177
178							# look up reference sequence in database if we aren't given
179							# database object already
180	0	0					$refseq = $self->db->fetch('Sequence' => $refseq)
181							unless $refseq->isa('Ace::Object');
182	0	0					croak "Invalid reference sequence" unless $refseq;
183
184							# find position of ref sequence in parent strand
185	0						my ($r_parent,$r_offset,$r_length,$r_strand) = find_parent($refseq);
186	0	0					croak "Reference sequence has no common ancestor with sequence"
187							unless $r_parent eq $self->{parent};
188
189							# set to array reference containing this information
190	0						$arrayref = [$refseq,$r_offset,$r_strand];
191							}
192	0						$self->{refseq} = $arrayref;
193							}
194	0	0					return unless $prev;
195	0	0					return $self->parent if $self->absolute;
196	0	0					return wantarray ? @{$prev} : $prev->[0];
	0
197							}
198
199							# return strand
200	0			0	1		sub strand { return $_[0]->{strand} }
201
202							# return reference strand
203							sub r_strand {
204	0			0	0		my $self = shift;
205	0	0					return "+1" if $self->absolute;
206	0	0					if (my ($ref,$r_offset,$r_strand) = $self->refseq) {
207	0						return $r_strand;
208							} else {
209	0						return $self->{strand}
210							}
211							}
212
213	0			0	1		sub offset { $_[0]->{offset} }
214	0			0	0		sub p_offset { $_[0]->{p_offset} }
215
216	0			0	0		sub smapped { 1; }
217	0			0	0		sub type { 'Sequence' }
218	0			0	0		sub subtype { }
219
220							sub debug {
221	0			0	0		my $self = shift;
222	0						my $d = $self->{_debug};
223	0	0					$self->{_debug} = shift if @_;
224	0						$d;
225							}
226
227							# return the database this sequence is associated with
228							sub db {
229	0		0	0	1		return Ace->name2db($_[0]->{db} \|\|= $_[0]->source->db);
230							}
231
232							sub start {
233	0			0	1		my ($self,$abs) = @_;
234	0	0					$abs = $self->absolute unless defined $abs;
235	0	0					return $self->{p_offset} + $self->{offset} + 1 if $abs;
236
237	0	0					if ($self->refseq) {
238	0						my ($ref,$r_offset,$r_strand) = $self->refseq;
239	0	0					return $r_strand < 0 ? 1 + $r_offset - ($self->{p_offset} + $self->{offset})
240							: 1 + $self->{p_offset} + $self->{offset} - $r_offset;
241							}
242
243							else {
244	0						return $self->{offset} +1;
245							}
246
247							}
248
249							sub end {
250	0			0	1		my ($self,$abs) = @_;
251	0						my $start = $self->start($abs);
252	0	0					my $f = $self->{length} > 0 ? 1 : -1; # for stupid 1-based adjustments
253	0	0	0				if ($abs && $self->refseq ne $self->parent) {
254	0						my $r_strand = $self->r_strand;
255	0	0	0				return $start - $self->{length} + $f
			0
256							if $r_strand < 0 or $self->{strand} < 0 or $self->{length} < 0;
257	0						return $start + $self->{length} - $f
258							}
259	0	0					return $start + $self->{length} - $f if $self->r_strand eq $self->{strand};
260	0						return $start - $self->{length} + $f;
261							}
262
263							# turn on absolute coordinates (relative to reference sequence)
264							sub absolute {
265	0			0	1		my $self = shift;
266	0						my $prev = $self->{absolute};
267	0	0					$self->{absolute} = $_[0] if defined $_[0];
268	0						return $prev;
269							}
270
271							# human readable string (for debugging)
272							sub asString {
273	0			0	1		my $self = shift;
274	0	0					if ($self->absolute) {
		0
275	0						return join '',$self->parent,'/',$self->start,',',$self->end;
276							} elsif (my $ref = $self->refseq){
277	0	0					my $label = $ref->isa('Ace::Sequence::Feature') ? $ref->info : "$ref";
278	0						return join '',$label,'/',$self->start,',',$self->end;
279
280							} else {
281	0						join '',$self->source,'/',$self->start,',',$self->end;
282							}
283							}
284
285							sub cmp {
286	0			0	0		my ($self,$arg,$reversed) = @_;
287	0	0	0				if (ref($arg) and $arg->isa('Ace::Sequence')) {
288	0		0				my $cmp = $self->parent cmp $arg->parent
289							\|\| $self->start <=> $arg->start;
290	0	0					return $reversed ? -$cmp : $cmp;
291							}
292	0						my $name = $self->asString;
293	0	0					return $reversed ? $arg cmp $name : $name cmp $arg;
294							}
295
296							# Return the DNA
297							sub dna {
298	0			0	1		my $self = shift;
299	0	0					return $self->{dna} if $self->{dna};
300	0						my $raw = $self->_query('seqdna');
301	0						$raw=~s/^>.*\n//m;
302	0						$raw=~s/^\/\/.*//mg;
303	0						$raw=~s/\n//g;
304	0						$raw =~ s/\0+\Z//; # blasted nulls!
305	0	0					my $effective_strand = $self->end >= $self->start ? '+1' : '-1';
306	0	0					_complement(\$raw) if $self->r_strand ne $effective_strand;
307	0						return $self->{dna} = $raw;
308							}
309
310							# return a gff file
311							sub gff {
312	0			0	1		my $self = shift;
313	0						my ($abs,$features) = rearrange([['ABS','ABSOLUTE'],'FEATURES'],@_);
314	0	0					$abs = $self->absolute unless defined $abs;
315
316							# can provide list of feature names, such as 'similarity', or 'all' to get 'em all
317							# !THIS IS BROKEN; IT SHOULD LOOK LIKE FEATURE()!
318	0						my $opt = $self->_feature_filter($features);
319
320	0						my $gff = $self->_gff($opt);
321	0	0					warn $gff if $self->debug;
322
323	0	0					$self->transformGFF(\$gff) unless $abs;
324	0						return $gff;
325							}
326
327							# return a GFF object using the optional GFF.pm module
328							sub GFF {
329	0			0	1		my $self = shift;
330	0						my ($filter,$converter) = @_; # anonymous subs
331	0	0					croak "GFF module not installed" unless require GFF;
332	0						require GFF::Filehandle;
333
334	0						my @lines = grep !/^\/\//,split "\n",$self->gff(@_);
335	0						local *IN;
336	0						local ($^W) = 0; # prevent complaint by GFF module
337	0						tie *IN,'GFF::Filehandle',\@lines;
338	0						my $gff = GFF::GeneFeatureSet->new;
339	0	0					$gff->read(\*IN,$filter,$converter) if $gff;
340	0						return $gff;
341							}
342
343							# Get the features table. Can filter by type/subtype this way:
344							# features('similarity:EST','annotation:assembly_tag')
345							sub features {
346	0			0	1		my $self = shift;
347	0						my ($filter,$opt) = $self->_make_filter(@_);
348
349							# get raw gff file
350	0						my $gff = $self->gff(-features=>$opt);
351
352							# turn it into a list of features
353	0						my @features = $self->_make_features($gff,$filter);
354
355	0	0					if ($self->automerge) { # automatic merging
356							# fetch out constructed transcripts and clones
357	0						my %types = map {lc($_)=>1} (@$opt,@_);
	0
358	0	0					if ($types{'transcript'}) {
359	0						push @features,$self->_make_transcripts(\@features);
360	0						@features = grep {$_->type !~ /^(intron\|exon)$/ } @features;
	0
361							}
362	0	0					push @features,$self->_make_clones(\@features) if $types{'clone'};
363	0	0					if ($types{'similarity'}) {
364	0						my @f = $self->_make_alignments(\@features);
365	0						@features = grep {$_->type ne 'similarity'} @features;
	0
366	0						push @features,@f;
367							}
368							}
369
370	0	0					return wantarray ? @features : \@features;
371							}
372
373							# A little bit more complex - assemble a list of "transcripts"
374							# consisting of Ace::Sequence::Transcript objects. These objects
375							# contain a list of exons and introns.
376							sub transcripts {
377	0			0	1		my $self = shift;
378	0						my $curated = shift;
379	0	0					my $ef = $curated ? "exon:curated" : "exon";
380	0	0					my $if = $curated ? "intron:curated" : "intron";
381	0	0					my $sf = $curated ? "Sequence:curated" : "Sequence";
382	0						my @features = $self->features($ef,$if,$sf);
383	0	0					return unless @features;
384	0						return $self->_make_transcripts(\@features);
385							}
386
387							sub _make_transcripts {
388	0			0			my $self = shift;
389	0						my $features = shift;
390
391	0						require Ace::Sequence::Transcript;
392	0						my %transcripts;
393
394	0						for my $feature (@$features) {
395	0						my $transcript = $feature->info;
396	0	0					next unless $transcript;
397	0	0					if ($feature->type =~ /^(exon\|intron\|cds)$/) {
		0
398	0						my $type = $1;
399	0						push @{$transcripts{$transcript}{$type}},$feature;
	0
400							} elsif ($feature->type eq 'Sequence') {
401	0		0				$transcripts{$transcript}{base} \|\|= $feature;
402							}
403							}
404
405							# get rid of transcripts without exons
406	0						foreach (keys %transcripts) {
407	0	0					delete $transcripts{$_} unless exists $transcripts{$_}{exon}
408							}
409
410							# map the rest onto Ace::Sequence::Transcript objects
411	0						return map {Ace::Sequence::Transcript->new($transcripts{$_})} keys %transcripts;
	0
412							}
413
414							# Reassemble clones from clone left and right ends
415							sub clones {
416	0			0	1		my $self = shift;
417	0						my @clones = $self->features('Clone_left_end','Clone_right_end','Sequence');
418	0						my %clones;
419	0	0					return unless @clones;
420	0						return $self->_make_clones(\@clones);
421							}
422
423							sub _make_clones {
424	0			0			my $self = shift;
425	0						my $features = shift;
426
427	0						my (%clones,@canonical_clones);
428	0	0					my $start_label = $self->strand < 0 ? 'end' : 'start';
429	0	0					my $end_label = $self->strand < 0 ? 'start' : 'end';
430	0						for my $feature (@$features) {
431	0	0					$clones{$feature->info}{$start_label} = $feature->start if $feature->type eq 'Clone_left_end';
432	0	0					$clones{$feature->info}{$end_label} = $feature->start if $feature->type eq 'Clone_right_end';
433
434	0	0					if ($feature->type eq 'Sequence') {
435	0						my $info = $feature->info;
436	0	0					next if $info =~ /LINK\|CHROMOSOME\|\.\w+$/;
437	0	0					if ($info->Genomic_canonical(0)) {
438	0	0					push (@canonical_clones,$info->Clone) if $info->Clone;
439							}
440							}
441							}
442
443	0						foreach (@canonical_clones) {
444	0		0				$clones{$_} \|\|= {};
445							}
446
447	0						my @features;
448	0						my ($r,$r_offset,$r_strand) = $self->refseq;
449	0						my $parent = $self->parent;
450	0						my $abs = $self->absolute;
451	0	0					if ($abs) {
452	0						$r_offset = 0;
453	0						$r = $parent;
454	0						$r_strand = '+1';
455							}
456
457							# BAD HACK ALERT. WE DON'T KNOW WHERE THE LEFT END OF THE CLONE IS SO WE USE
458							# THE MAGIC NUMBER -99_999_999 to mean "off left end" and
459							# +99_999_999 to mean "off right end"
460	0						for my $clone (keys %clones) {
461	0		0				my $start = $clones{$clone}{start} \|\| -99_999_999;
462	0		0				my $end = $clones{$clone}{end} \|\| +99_999_999;
463	0						my $phony_gff = join "\t",($parent,'Clone','structural',$start,$end,'.','.','.',qq(Clone "$clone"));
464	0						push @features,Ace::Sequence::Feature->new($parent,$r,$r_offset,$r_strand,$abs,$phony_gff);
465							}
466	0						return @features;
467							}
468
469							# Assemble a list of "GappedAlignment" objects. These objects
470							# contain a list of aligned segments.
471							sub alignments {
472	0			0	0		my $self = shift;
473	0						my @subtypes = @_;
474	0						my @types = map { "similarity:\^$_\$" } @subtypes;
	0
475	0	0					push @types,'similarity' unless @types;
476	0						return $self->features(@types);
477							}
478
479							sub segments {
480	0			0	0		my $self = shift;
481	0						return;
482							}
483
484							sub _make_alignments {
485	0			0			my $self = shift;
486	0						my $features = shift;
487	0						require Ace::Sequence::GappedAlignment;
488
489	0						my %homol;
490
491	0						for my $feature (@$features) {
492	0	0					next unless $feature->type eq 'similarity';
493	0						my $target = $feature->info;
494	0						my $subtype = $feature->subtype;
495	0						push @{$homol{$target,$subtype}},$feature;
	0
496							}
497
498							# map onto Ace::Sequence::GappedAlignment objects
499	0						return map {Ace::Sequence::GappedAlignment->new($homol{$_})} keys %homol;
	0
500							}
501
502							# return list of features quickly
503							sub feature_list {
504	0			0	1		my $self = shift;
505	0	0					return $self->{'feature_list'} if $self->{'feature_list'};
506	0	0					return unless my $raw = $self->_query('seqfeatures -version 2 -list');
507	0						return $self->{'feature_list'} = Ace::Sequence::FeatureList->new($raw);
508							}
509
510							# transform a GFF file into the coordinate system of the sequence
511							sub transformGFF {
512	0			0	0		my $self = shift;
513	0						my $gff = shift;
514	0						my $parent = $self->parent;
515	0						my $strand = $self->{strand};
516	0						my $source = $self->source;
517	0						my ($ref_source,$ref_offset,$ref_strand) = $self->refseq;
518	0		0				$ref_source \|\|= $source;
519	0		0				$ref_strand \|\|= $strand;
520
521	0	0					if ($ref_strand > 0) {
522	0	0					my $o = defined($ref_offset) ? $ref_offset : ($self->p_offset + $self->offset);
523							# find anything that looks like a numeric field and subtract offset from it
524	0						$$gff =~ s/(?
	0
525	0						$$gff =~ s/^$parent/$source/mg;
526	0						$$gff =~ s/\#\#sequence-region\s+\S+/##sequence-region $ref_source/m;
527	0						$$gff =~ s/FMAP_FEATURES\s+"\S+"/FMAP_FEATURES "$ref_source"/m;
528	0						return;
529							} else { # strand eq '-'
530	0	0					my $o = defined($ref_offset) ? (2 + $ref_offset) : (2 + $self->p_offset - $self->offset);
531	0						$$gff =~ s/(?
	0
532	0						$$gff =~ s/(Target \"[^\"]+\" )(-?\d+) (-?\d+)/$1 $3 $2/g;
533	0						$$gff =~ s/^$parent/$source/mg;
534	0						$$gff =~ s/\#\#sequence-region\s+\S+\s+(-?\d+)\s+(-?\d+)/"##sequence-region $ref_source " . ($o - $2) . ' ' . ($o - $1) . ' (reversed)'/em;
	0
535	0						$$gff =~ s/FMAP_FEATURES\s+"\S+"\s+(-?\d+)\s+(-?\d+)/"FMAP_FEATURES \"$ref_source\" " . ($o - $2) . ' ' . ($o - $1) . ' (reversed)'/em;
	0
536							}
537
538							}
539
540							# return a name for the object
541							sub name {
542	0			0	1		return shift->source_seq->name;
543							}
544
545							# for compatibility with Ace::Sequence::Feature
546							sub info {
547	0			0	0		return shift->source_seq;
548							}
549
550							###################### internal functions #################
551							# not necessarily object-oriented!!
552
553							# return parent, parent offset and strand
554							sub find_parent {
555	0			0	0		my $obj = shift;
556
557							# first, if we are passed an Ace::Sequence, then we can inherit
558							# these settings directly
559	0	0					return (@{$obj}{qw(parent p_offset length)},$obj->r_strand)
	0
560							if $obj->isa('Ace::Sequence');
561
562							# otherwise, if we are passed an Ace::Object, then we must
563							# traverse upwards until we find a suitable parent
564	0	0					return _traverse($obj) if $obj->isa('Ace::Object');
565
566							# otherwise, we don't know what to do...
567	0						croak "Source sequence not an Ace::Object or an Ace::Sequence";
568							}
569
570							sub _get_parent {
571	0			0			my $obj = shift;
572							# DANGER DANGER WILL ROBINSON!
573							# This is an experiment in caching parents to speed lookups. Probably eats memory voraciously.
574	0	0					return $CACHE{$obj} if CACHE && exists $CACHE{$obj};
575	0		0				my $p = $obj->get(S_Parent=>2)\|\| $obj->get(Source=>1);
576	0	0					return unless $p;
577	0						return CACHE ? $CACHE{$obj} = $p->fetch
578							: $p->fetch;
579							}
580
581							sub _get_children {
582	0			0			my $obj = shift;
583	0						my @pieces = $obj->get(S_Child=>2);
584	0	0					return @pieces if @pieces;
585	0						return @pieces = $obj->get('Subsequence');
586							}
587
588							# get sequence, offset and strand of topmost container
589							sub _traverse {
590	0			0			my $obj = shift;
591	0						my ($offset,$length);
592
593							# invoke seqget to find the top-level container for this sequence
594	0						my ($tl,$tl_start,$tl_end) = _get_toplevel($obj);
595	0		0				$tl_start \|\|= 0;
596	0		0				$tl_end \|\|= 0;
597
598							# make it an object
599	0						$tl = ref($obj)->new(-name=>$tl,-class=>'Sequence',-db=>$obj->db);
600
601	0						$offset += $tl_start - 1; # offset to beginning of toplevel
602	0		0				$length \|\|= abs($tl_end - $tl_start) + 1;
603	0	0					my $strand = $tl_start < $tl_end ? +1 : -1;
604
605	0	0					return ($tl,$offset,$strand < 0 ? ($length,'-1') : ($length,'+1') ) if $length;
		0
606							}
607
608							sub _get_toplevel {
609	0			0			my $obj = shift;
610	0						my $class = $obj->class;
611	0						my $name = $obj->name;
612
613	0						my $smap = $obj->db->raw_query("gif smap -from $class:$name");
614	0						my ($parent,$pstart,$pstop,$tstart,$tstop,$map_type) =
615							$smap =~ /^SMAP\s+(\S+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(.+)/;
616
617	0		0				$parent \|\|= '';
618	0						$parent =~ s/^Sequence://; # remove this in next version of Acedb
619	0						return ($parent,$pstart,$pstop);
620							}
621
622							# create subroutine that filters GFF files for certain feature types
623							sub _make_filter {
624	0			0			my $self = shift;
625	0						my $automerge = $self->automerge;
626
627							# parse out the filter
628	0						my %filter;
629	0						foreach (@_) {
630	0						my ($type,$filter) = split(':',$_,2);
631	0	0	0				if ($automerge && lc($type) eq 'transcript') {
		0	0
632	0						@filter{'exon','intron','Sequence','cds'} = ([undef],[undef],[undef],[undef]);
633							} elsif ($automerge && lc($type) eq 'clone') {
634	0						@filter{'Clone_left_end','Clone_right_end','Sequence'} = ([undef],[undef],[undef]);
635							} else {
636	0						push @{$filter{$type}},$filter;
	0
637							}
638							}
639
640							# create pattern-match sub
641	0						my $sub;
642							my $promiscuous; # indicates that there is a subtype without a type
643
644	0	0					if (%filter) {
645	0						my $s = "sub { my \@d = split(\"\\t\",\$_[0]);\n";
646	0						for my $type (keys %filter) {
647	0						my $expr;
648	0						my $subtypes = $filter{$type};
649	0	0					if ($type ne '') {
650	0						for my $st (@$subtypes) {
651	0	0					$expr .= defined $st ? "return 1 if \$d[2]=~/$type/i && \$d[1]=~/$st/i;\n"
652							: "return 1 if \$d[2]=~/$type/i;\n"
653							}
654							} else { # no type, only subtypes
655	0						$promiscuous++;
656	0						for my $st (@$subtypes) {
657	0	0					next unless defined $st;
658	0						$expr .= "return 1 if \$d[1]=~/$st/i;\n";
659							}
660							}
661	0						$s .= $expr;
662							}
663	0						$s .= "return;\n }";
664
665	0						$sub = eval $s;
666	0	0					croak $@ if $@;
667							} else {
668	0			0			$sub = sub { 1; }
669	0						}
670	0	0					return ($sub,$promiscuous ? [] : [keys %filter]);
671							}
672
673							# turn a GFF file and a filter into a list of Ace::Sequence::Feature objects
674							sub _make_features {
675	0			0			my $self = shift;
676	0						my ($gff,$filter) = @_;
677
678	0						my ($r,$r_offset,$r_strand) = $self->refseq;
679	0						my $parent = $self->parent;
680	0						my $abs = $self->absolute;
681	0	0					if ($abs) {
682	0						$r_offset = 0;
683	0						$r = $parent;
684	0						$r_strand = '+1';
685							}
686	0		0				my @features = map {Ace::Sequence::Feature->new($parent,$r,$r_offset,$r_strand,$abs,$_)}
	0
687							grep !m@^(?:\#\|//)@ && $filter->($_),split("\n",$gff);
688							}
689
690
691							# low level GFF call, no changing absolute to relative coordinates
692							sub _gff {
693	0			0			my $self = shift;
694	0						my ($opt,$db) = @_;
695	0						my $data = $self->_query("seqfeatures -version 2 $opt",$db);
696	0						$data =~ s/\0+\Z//;
697	0						return $data; #blasted nulls!
698							}
699
700							# shortcut for running a gif query
701							sub _query {
702	0			0			my $self = shift;
703	0						my $command = shift;
704	0		0				my $db = shift \|\| $self->db;
705
706	0						my $parent = $self->parent;
707	0						my $start = $self->start(1);
708	0						my $end = $self->end(1);
709	0	0					($start,$end) = ($end,$start) if $start > $end; #flippity floppity
710
711	0						my $coord = "-coords $start $end";
712
713							# BAD BAD HACK ALERT - CHECKS THE QUERY THAT IS PASSED DOWN
714							# ALSO MAKES THINGS INCOMPATIBLE WITH PRIOR 4.9 servers.
715							# my $opt = $command =~ /seqfeatures/ ? '-nodna' : '';
716	0						my $opt = '-noclip';
717
718	0						my $query = "gif seqget $parent $opt $coord ; $command";
719	0	0					warn $query if $self->debug;
720
721	0						return $db->raw_query("gif seqget $parent $opt $coord ; $command");
722							}
723
724							# utility function -- reverse complement
725							sub _complement {
726	0			0			my $dna = shift;
727	0						$$dna =~ tr/GATCgatc/CTAGctag/;
728	0						$$dna = scalar reverse $$dna;
729							}
730
731							sub _feature_filter {
732	0			0			my $self = shift;
733	0						my $features = shift;
734	0	0					return '' unless $features;
735	0						my $opt = '';
736	0	0	0				$opt = '+feature ' . join('\|',@$features) if ref($features) eq 'ARRAY' && @$features;
737	0	0					$opt = "+feature $features" unless ref $features;
738	0						$opt;
739							}
740
741							1;
742
743							=head1 NAME
744
745							Ace::Sequence - Examine ACeDB Sequence Objects
746
747							=head1 SYNOPSIS
748
749							# open database connection and get an Ace::Object sequence
750							use Ace::Sequence;
751
752							$db = Ace->connect(-host => 'stein.cshl.org',-port => 200005);
753							$obj = $db->fetch(Predicted_gene => 'ZK154.3');
754
755							# Wrap it in an Ace::Sequence object
756							$seq = Ace::Sequence->new($obj);
757
758							# Find all the exons
759							@exons = $seq->features('exon');
760
761							# Find all the exons predicted by various versions of "genefinder"
762							@exons = $seq->features('exon:genefinder.*');
763
764							# Iterate through the exons, printing their start, end and DNA
765							for my $exon (@exons) {
766							print join "\t",$exon->start,$exon->end,$exon->dna,"\n";
767							}
768
769							# Find the region 1000 kb upstream of the first exon
770							$sub = Ace::Sequence->new(-seq=>$exons[0],
771							-offset=>-1000,-length=>1000);
772
773							# Find all features in that area
774							@features = $sub->features;
775
776							# Print its DNA
777							print $sub->dna;
778
779							# Create a new Sequence object from the first 500 kb of chromosome 1
780							$seq = Ace::Sequence->new(-name=>'CHROMOSOME_I',-db=>$db,
781							-offset=>0,-length=>500_000);
782
783							# Get the GFF dump as a text string
784							$gff = $seq->gff;
785
786							# Limit dump to Predicted_genes
787							$gff_genes = $seq->gff(-features=>'Predicted_gene');
788
789							# Return a GFF object (using optional GFF.pm module from Sanger)
790							$gff_obj = $seq->GFF;
791
792							=head1 DESCRIPTION
793
794							I, and its allied classes L and
795							L, provide a convenient interface to the
796							ACeDB Sequence classes and the GFF sequence feature file format.
797
798							Using this class, you can define a region of the genome by using a
799							landmark (sequenced clone, link, superlink, predicted gene), an offset
800							from that landmark, and a distance. Offsets and distances can be
801							positive or negative. This will return an I object.
802							Once a region is defined, you may retrieve its DNA sequence, or query
803							the database for any features that may be contained within this
804							region. Features can be returned as objects (using the
805							I class), as GFF text-only dumps, or in the
806							form of the GFF class defined by the Sanger Centre's GFF.pm module.
807
808							This class builds on top of L and L. Please see
809							their manual pages before consulting this one.
810
811							=head1 Creating New Ace::Sequence Objects, the new() Method
812
813							$seq = Ace::Sequence->new($object);
814
815							$seq = Ace::Sequence->new(-source => $object,
816							-offset => $offset,
817							-length => $length,
818							-refseq => $reference_sequence);
819
820							$seq = Ace::Sequence->new(-name => $name,
821							-db => $db,
822							-offset => $offset,
823							-length => $length,
824							-refseq => $reference_sequence);
825
826							In order to create an I you will need an active I
827							database accessor. Sequence regions are defined using a "source"
828							sequence, an offset, and a length. Optionally, you may also provide a
829							"reference sequence" to establish the coordinate system for all
830							inquiries. Sequences may be generated from existing I
831							sequence objects, from other I and
832							I objects, or from a sequence name and a
833							database handle.
834
835							The class method named new() is the interface to these facilities. In
836							its simplest, one-argument form, you provide new() with a
837							previously-created I that points to Sequence or
838							sequence-like object (the meaning of "sequence-like" is explained in
839							more detail below.) The new() method will return an I
840							object extending from the beginning of the object through to its
841							natural end.
842
843							In the named-parameter form of new(), the following arguments are
844							recognized:
845
846							=over 4
847
848							=item -source
849
850							The sequence source. This must be an I of the "Sequence"
851							class, or be a sequence-like object containing the SMap tag (see
852							below).
853
854							=item -offset
855
856							An offset from the beginning of the source sequence. The retrieved
857							I will begin at this position. The offset can be any
858							positive or negative integer. Offets are B<0-based>.
859
860							=item -length
861
862							The length of the sequence to return. Either a positive or negative
863							integer can be specified. If a negative length is given, the returned
864							sequence will be complemented relative to the source sequence.
865
866							=item -refseq
867
868							The sequence to use to establish the coordinate system for the
869							returned sequence. Normally the source sequence is used to establish
870							the coordinate system, but this can be used to override that choice.
871							You can provide either an I or just a sequence name for
872							this argument. The source and reference sequences must share a common
873							ancestor, but do not have to be directly related. An attempt to use a
874							disjunct reference sequence, such as one on a different chromosome,
875							will fail.
876
877							=item -name
878
879							As an alternative to using an I with the B<-source>
880							argument, you may specify a source sequence using B<-name> and B<-db>.
881							The I module will use the provided database accessor to
882							fetch a Sequence object with the specified name. new() will return
883							undef is no Sequence by this name is known.
884
885							=item -db
886
887							This argument is required if the source sequence is specified by name
888							rather than by object reference.
889
890							=back
891
892							If new() is successful, it will create an I object and
893							return it. Otherwise it will return undef and return a descriptive
894							message in Ace->error(). Certain programming errors, such as a
895							failure to provide required arguments, cause a fatal error.
896
897							=head2 Reference Sequences and the Coordinate System
898
899							When retrieving information from an I, the coordinate
900							system is based on the sequence segment selected at object creation
901							time. That is, the "+1" strand is the natural direction of the
902							I object, and base pair 1 is its first base pair. This
903							behavior can be overridden by providing a reference sequence to the
904							new() method, in which case the orientation and position of the
905							reference sequence establishes the coordinate system for the object.
906
907							In addition to the reference sequence, there are two other sequences
908							used by I for internal bookeeping. The "source"
909							sequence corresponds to the smallest ACeDB sequence object that
910							completely encloses the selected sequence segment. The "parent"
911							sequence is the smallest ACeDB sequence object that contains the
912							"source". The parent is used to derive the length and orientation of
913							source sequences that are not directly associated with DNA objects.
914
915							In many cases, the source sequence will be identical to the sequence
916							initially passed to the new() method. However, there are exceptions
917							to this rule. One common exception occurs when the offset and/or
918							length cross the boundaries of the passed-in sequence. In this case,
919							the ACeDB database is searched for the smallest sequence that contains
920							both endpoints of the I object.
921
922							The other common exception occurs in Ace 4.8, where there is support
923							for "sequence-like" objects that contain the C ("Sequence Map")
924							tag. The C tag provides genomic location information for
925							arbitrary object -- not just those descended from the Sequence class.
926							This allows ACeDB to perform genome map operations on objects that are
927							not directly related to sequences, such as genetic loci that have been
928							interpolated onto the physical map. When an C-containing object
929							is passed to the I new() method, the module will again
930							choose the smallest ACeDB Sequence object that contains both
931							end-points of the desired region.
932
933							If an I object is used to create a new I
934							object, then the original object's source is inherited.
935
936							=head1 Object Methods
937
938							Once an I object is created, you can query it using the
939							following methods:
940
941							=head2 asString()
942
943							$name = $seq->asString;
944
945							Returns a human-readable identifier for the sequence in the form
946							I, where "Source" is the name of the source
947							sequence, and "start" and "end" are the endpoints of the sequence
948							relative to the source (using 1-based indexing). This method is
949							called automatically when the I is used in a string
950							context.
951
952							=head2 source_seq()
953
954							$source = $seq->source_seq;
955
956							Return the source of the I.
957
958							=head2 parent_seq()
959
960							$parent = $seq->parent_seq;
961
962							Return the immediate ancestor of the sequence. The parent of the
963							top-most sequence (such as the CHROMOSOME link) is itself. This
964							method is used internally to ascertain the length of source sequences
965							which are not associated with a DNA object.
966
967							NOTE: this procedure is a trifle funky and cannot reliably be used to
968							traverse upwards to the top-most sequence. The reason for this is
969							that it will return an I in some cases, and an
970							I in others. Use get_parent() to traverse upwards
971							through a uniform series of I objects upwards.
972
973							=head2 refseq([$seq])
974
975							$refseq = $seq->refseq;
976
977							Returns the reference sequence, if one is defined.
978
979							$seq->refseq($new_ref);
980
981							Set the reference sequence. The reference sequence must share the same
982							ancestor with $seq.
983
984							=head2 start()
985
986							$start = $seq->start;
987
988							Start of this sequence, relative to the source sequence, using 1-based
989							indexing.
990
991							=head2 end()
992
993							$end = $seq->end;
994
995							End of this sequence, relative to the source sequence, using 1-based
996							indexing.
997
998							=head2 offset()
999
1000							$offset = $seq->offset;
1001
1002							Offset of the beginning of this sequence relative to the source
1003							sequence, using 0-based indexing. The offset may be negative if the
1004							beginning of the sequence is to the left of the beginning of the
1005							source sequence.
1006
1007							=head2 length()
1008
1009							$length = $seq->length;
1010
1011							The length of this sequence, in base pairs. The length may be
1012							negative if the sequence's orientation is reversed relative to the
1013							source sequence. Use abslength() to obtain the absolute value of
1014							the sequence length.
1015
1016							=head2 abslength()
1017
1018							$length = $seq->abslength;
1019
1020							Return the absolute value of the length of the sequence.
1021
1022							=head2 strand()
1023
1024							$strand = $seq->strand;
1025
1026							Returns +1 for a sequence oriented in the natural direction of the
1027							genomic reference sequence, or -1 otherwise.
1028
1029							=head2 reversed()
1030
1031							Returns true if the segment is reversed relative to the canonical
1032							genomic direction. This is the same as $seq->strand < 0.
1033
1034							=head2 dna()
1035
1036							$dna = $seq->dna;
1037
1038							Return the DNA corresponding to this sequence. If the sequence length
1039							is negative, the reverse complement of the appropriate segment will be
1040							returned.
1041
1042							ACeDB allows Sequences to exist without an associated DNA object
1043							(which typically happens during intermediate stages of a sequencing
1044							project. In such a case, the returned sequence will contain the
1045							correct number of "-" characters.
1046
1047							=head2 name()
1048
1049							$name = $seq->name;
1050
1051							Return the name of the source sequence as a string.
1052
1053							=head2 get_parent()
1054
1055							$parent = $seq->parent;
1056
1057							Return the immediate ancestor of this I (i.e., the
1058							sequence that contains this one). The return value is a new
1059							I or undef, if no parent sequence exists.
1060
1061							=head2 get_children()
1062
1063							@children = $seq->get_children();
1064
1065							Returns all subsequences that exist as independent objects in the
1066							ACeDB database. What exactly is returned is dependent on the data
1067							model. In older ACeDB databases, the only subsequences are those
1068							under the catchall Subsequence tag. In newer ACeDB databases, the
1069							objects returned correspond to objects to the right of the S_Child
1070							subtag using a tag[2] syntax, and may include Predicted_genes,
1071							Sequences, Links, or other objects. The return value is a list of
1072							I objects.
1073
1074							=head2 features()
1075
1076							@features = $seq->features;
1077							@features = $seq->features('exon','intron','Predicted_gene');
1078							@features = $seq->features('exon:GeneFinder','Predicted_gene:hand.*');
1079
1080							features() returns an array of I objects. If
1081							called without arguments, features() returns all features that cross
1082							the sequence region. You may also provide a filter list to select a
1083							set of features by type and subtype. The format of the filter list
1084							is:
1085
1086							type:subtype
1087
1088							Where I is the class of the feature (the "feature" field of the
1089							GFF format), and I is a description of how the feature was
1090							derived (the "source" field of the GFF format). Either of these
1091							fields can be absent, and either can be a regular expression. More
1092							advanced filtering is not supported, but is provided by the Sanger
1093							Centre's GFF module.
1094
1095							The order of the features in the returned list is not specified. To
1096							obtain features sorted by position, use this idiom:
1097
1098							@features = sort { $a->start <=> $b->start } $seq->features;
1099
1100							=head2 feature_list()
1101
1102							my $list = $seq->feature_list();
1103
1104							This method returns a summary list of the features that cross the
1105							sequence in the form of a L object. From the
1106							L object you can obtain the list of feature names
1107							and the number of each type. The feature list is obtained from the
1108							ACeDB server with a single short transaction, and therefore has much
1109							less overhead than features().
1110
1111							See L for more details.
1112
1113							=head2 transcripts()
1114
1115							This returns a list of Ace::Sequence::Transcript objects, which are
1116							specializations of Ace::Sequence::Feature. See L
1117							for details.
1118
1119							=head2 clones()
1120
1121							This returns a list of Ace::Sequence::Feature objects containing
1122							reconstructed clones. This is a nasty hack, because ACEDB currently
1123							records clone ends, but not the clones themselves, meaning that we
1124							will not always know both ends of the clone. In this case the missing
1125							end has a synthetic position of -99,999,999 or +99,999,999. Sorry.
1126
1127							=head2 gff()
1128
1129							$gff = $seq->gff();
1130							$gff = $seq->gff(-abs => 1,
1131							-features => ['exon','intron:GeneFinder']);
1132
1133							This method returns a GFF file as a scalar. The following arguments
1134							are optional:
1135
1136							=over 4
1137
1138							=item -abs
1139
1140							Ordinarily the feature entries in the GFF file will be returned in
1141							coordinates relative to the start of the I object.
1142							Position 1 will be the start of the sequence object, and the "+"
1143							strand will be the sequence object's natural orientation. However if
1144							a true value is provided to B<-abs>, the coordinate system used will
1145							be relative to the start of the source sequence, i.e. the native ACeDB
1146							Sequence object (usually a cosmid sequence or a link).
1147
1148							If a reference sequence was provided when the I was
1149							created, it will be used by default to set the coordinate system.
1150							Relative coordinates can be reenabled by providing a false value to
1151							B<-abs>.
1152
1153							Ordinarily the coordinate system manipulations automatically "do what
1154							you want" and you will not need to adjust them. See also the abs()
1155							method described below.
1156
1157							=item -features
1158
1159							The B<-features> argument filters the features according to a list of
1160							types and subtypes. The format is identical to the one described for
1161							the features() method. A single filter may be provided as a scalar
1162							string. Multiple filters may be passed as an array reference.
1163
1164							=back
1165
1166							See also the GFF() method described next.
1167
1168							=head2 GFF()
1169
1170							$gff_object = $seq->gff;
1171							$gff_object = $seq->gff(-abs => 1,
1172							-features => ['exon','intron:GeneFinder']);
1173
1174							The GFF() method takes the same arguments as gff() described above,
1175							but it returns a I object from the GFF.pm
1176							module. If the GFF module is not installed, this method will generate
1177							a fatal error.
1178
1179							=head2 absolute()
1180
1181							$abs = $seq->absolute;
1182							$abs = $seq->absolute(1);
1183
1184							This method controls whether the coordinates of features are returned
1185							in absolute or relative coordinates. "Absolute" coordinates are
1186							relative to the underlying source or reference sequence. "Relative"
1187							coordinates are relative to the I object. By default,
1188							coordinates are relative unless new() was provided with a reference
1189							sequence. This default can be examined and changed using absolute().
1190
1191							=head2 automerge()
1192
1193							$merge = $seq->automerge;
1194							$seq->automerge(0);
1195
1196							This method controls whether groups of features will automatically be
1197							merged together by the features() call. If true (the default), then
1198							the left and right end of clones will be merged into "clone" features,
1199							introns, exons and CDS entries will be merged into
1200							Ace::Sequence::Transcript objects, and similarity entries will be
1201							merged into Ace::Sequence::GappedAlignment objects.
1202
1203							=head2 db()
1204
1205							$db = $seq->db;
1206
1207							Returns the L database accessor associated with this sequence.
1208
1209							=head1 SEE ALSO
1210
1211							L, L, L,
1212							L, L
1213
1214							=head1 AUTHOR
1215
1216							Lincoln Stein with extensive help from Jean
1217							Thierry-Mieg
1218
1219							Many thanks to David Block for finding and
1220							fixing the nasty off-by-one errors.
1221
1222							Copyright (c) 1999, Lincoln D. Stein
1223
1224							This library is free software; you can redistribute it and/or modify
1225							it under the same terms as Perl itself. See DISCLAIMER.txt for
1226							disclaimers of warranty.
1227
1228							=cut
1229
1230							__END__