File Coverage

Bio/SeqUtils.pm
Criterion Covered Total %
statement 421 451 93.3
branch 168 260 64.6
condition 58 114 50.8
subroutine 28 28 100.0
pod 13 13 100.0
total 688 866 79.4


line stmt bran cond sub pod time code
1             # BioPerl module for Bio::SeqUtils
2             #
3             # Please direct questions and support issues to
4             #
5             # Cared for by Heikki Lehvaslaiho
6             #
7             # Copyright Heikki Lehvaslaiho
8             #
9             # You may distribute this module under the same terms as perl itself
10              
11             # POD documentation - main docs before the code
12              
13             =head1 NAME
14              
15             Bio::SeqUtils - Additional methods for PrimarySeq objects
16              
17             =head1 SYNOPSIS
18              
19             use Bio::SeqUtils;
20             # get a Bio::PrimarySeqI compliant object, $seq, somehow
21             $util = Bio::SeqUtils->new();
22             $polypeptide_3char = $util->seq3($seq);
23             # or
24             $polypeptide_3char = Bio::SeqUtils->seq3($seq);
25              
26             # set the sequence string (stored in one char code in the object)
27             Bio::SeqUtils->seq3($seq, $polypeptide_3char);
28              
29             # translate a sequence in all six frames
30             @seqs = Bio::SeqUtils->translate_6frames($seq);
31              
32             # inplace editing of the sequence
33             Bio::SeqUtils->mutate($seq,
34             Bio::LiveSeq::Mutation->new(-seq => 'c',
35             -pos => 3
36             ));
37             # mutate a sequence to desired similarity%
38             $newseq = Bio::SeqUtils-> evolve
39             ($seq, $similarity, $transition_transversion_rate);
40              
41             # concatenate two or more sequences with annotations and features,
42             # the first sequence will be modified
43             Bio::SeqUtils->cat(@seqs);
44             my $catseq=$seqs[0];
45              
46             # truncate a sequence, retaining features and adjusting their
47             # coordinates if necessary
48             my $truncseq = Bio::SeqUtils->trunc_with_features($seq, 100, 200);
49              
50             # reverse complement a sequence and its features
51             my $revcomseq = Bio::SeqUtils->revcom_with_features($seq);
52              
53             # simulate cloning of a fragment into a vector. Cut the vector at
54             # positions 1000 and 1100 (deleting postions 1001 to 1099) and
55             # "ligate" a fragment into the sites. The fragment is
56             # reverse-complemented in this example (option "flip").
57             # All features of the vector and fragment are preserved and
58             # features that are affected by the deletion/insertion are
59             # modified accordingly.
60             # $vector and $fragment must be Bio::SeqI compliant objects
61             my $new_molecule = Bio::Sequtils->ligate(
62             -vector => $vector,
63             -fragment => $fragment,
64             -left => 1000,
65             -right => 1100,
66             -flip => 1
67             );
68              
69             # delete a segment of a sequence (from pos 1000 to 1100, inclusive),
70             # again preserving features and annotations
71             my $new_molecule = Bio::SeqUtils->cut( $seq, 1000, 1100 );
72              
73             # insert a fragment into a recipient between positions 1000 and
74             # 1001. $recipient is a Bio::SeqI compliant object
75             my $new_molecule = Bio::SeqUtils::PbrTools->insert(
76             $recipient_seq,
77             $fragment_seq,
78             1000
79             );
80              
81             =head1 DESCRIPTION
82              
83             This class is a holder of methods that work on Bio::PrimarySeqI-
84             compliant sequence objects, e.g. Bio::PrimarySeq and
85             Bio::Seq. These methods are not part of the Bio::PrimarySeqI
86             interface and should in general not be essential to the primary function
87             of sequence objects. If you are thinking of adding essential
88             functions, it might be better to create your own sequence class.
89             See L, L, and L for more.
90              
91             The methods take as their first argument a sequence object. It is
92             possible to use methods without first creating a SeqUtils object,
93             i.e. use it as an anonymous hash.
94              
95             The first two methods, seq3() and seq3in(), give out or read in protein
96             sequences coded in three letter IUPAC amino acid codes.
97              
98             The next two methods, translate_3frames() and translate_6frames(), wrap
99             around the standard translate method to give back an array of three
100             forward or all six frame translations.
101              
102             The mutate() method mutates the sequence string with a mutation
103             description object.
104              
105             The cat() method concatenates two or more sequences. The first sequence
106             is modified by addition of the remaining sequences. All annotations and
107             sequence features will be transferred.
108              
109             The revcom_with_features() and trunc_with_features() methods are similar
110             to the revcom() and trunc() methods from Bio::Seq, but also adjust any
111             features associated with the sequence as appropriate.
112              
113             There are also methods that simulate molecular cloning with rich
114             sequence objects.
115             The delete() method cuts a segment out of a sequence and re-joins the
116             left and right fragments (like splicing or digesting and re-ligating a
117             molecule). Positions (and types) of sequence features are adjusted
118             accordingly:
119             Features that span the deleted segment are converted to split featuress
120             to indicate the disruption. (Sub)Features that extend into the deleted
121             segment are truncated.
122             A new molecule is created and returned.
123              
124             The insert() method inserts a fragment (which can be a rich Bio::Seq
125             object) into another sequence object adding all annotations and
126             features to the final product.
127             Features that span the insertion site are converted to split features
128             to indicate the disruption.
129             A new feature is added to indicate the inserted fragment itself.
130             A new molecule is created and returned.
131              
132             The ligate() method simulates digesting a recipient (vector) and
133             ligating a fragment into it, which can also be flipped if needed. It
134             is simply a combination of a deletion and an insertion step and
135             returns a new molecule. The rules for modifying feature locations
136             outlined above are also used here, e.g. features that span the cut
137             sites are converted to split features with truncated sub-locations.
138              
139              
140             =head1 FEEDBACK
141              
142             =head2 Mailing Lists
143              
144             User feedback is an integral part of the evolution of this and other
145             Bioperl modules. Send your comments and suggestions preferably to one
146             of the Bioperl mailing lists. Your participation is much appreciated.
147              
148             bioperl-l@bioperl.org - General discussion
149             http://bioperl.org/wiki/Mailing_lists - About the mailing lists
150              
151             =head2 Support
152              
153             Please direct usage questions or support issues to the mailing list:
154              
155             I
156              
157             rather than to the module maintainer directly. Many experienced and
158             reponsive experts will be able look at the problem and quickly
159             address it. Please include a thorough description of the problem
160             with code and data examples if at all possible.
161              
162             =head2 Reporting Bugs
163              
164             Report bugs to the Bioperl bug tracking system to help us keep track
165             the bugs and their resolution. Bug reports can be submitted via the
166             web:
167              
168             https://github.com/bioperl/bioperl-live/issues
169              
170             =head1 AUTHOR - Heikki Lehvaslaiho
171              
172             Email: heikki-at-bioperl-dot-org
173              
174             =head1 CONTRIBUTORS
175              
176             Roy R. Chaudhuri - roy.chaudhuri at gmail.com
177             Frank Schwach - frank.schwach@sanger.ac.uk
178              
179             =head1 APPENDIX
180              
181             The rest of the documentation details each of the object
182             methods. Internal methods are usually preceded with a _
183              
184             =cut
185              
186             # Let the code begin...
187              
188             package Bio::SeqUtils;
189 204     204   735 use strict;
  204         220  
  204         5147  
190 204     204   667 use warnings;
  204         261  
  204         5680  
191 204     204   964 use Scalar::Util qw(blessed);
  204         214  
  204         9377  
192 204     204   712 use parent qw(Bio::Root::Root);
  204         222  
  204         1296  
193              
194             # new inherited from RootI
195              
196             our %ONECODE = (
197             'Ala' => 'A',
198             'Asx' => 'B',
199             'Cys' => 'C',
200             'Asp' => 'D',
201             'Glu' => 'E',
202             'Phe' => 'F',
203             'Gly' => 'G',
204             'His' => 'H',
205             'Ile' => 'I',
206             'Lys' => 'K',
207             'Leu' => 'L',
208             'Met' => 'M',
209             'Asn' => 'N',
210             'Pro' => 'P',
211             'Gln' => 'Q',
212             'Arg' => 'R',
213             'Ser' => 'S',
214             'Thr' => 'T',
215             'Val' => 'V',
216             'Trp' => 'W',
217             'Xaa' => 'X',
218             'Tyr' => 'Y',
219             'Glx' => 'Z',
220             'Ter' => '*',
221             'Sec' => 'U',
222             'Pyl' => 'O',
223             'Xle' => 'J'
224             );
225              
226             our %THREECODE = (
227             'A' => 'Ala',
228             'B' => 'Asx',
229             'C' => 'Cys',
230             'D' => 'Asp',
231             'E' => 'Glu',
232             'F' => 'Phe',
233             'G' => 'Gly',
234             'H' => 'His',
235             'I' => 'Ile',
236             'K' => 'Lys',
237             'L' => 'Leu',
238             'M' => 'Met',
239             'N' => 'Asn',
240             'P' => 'Pro',
241             'Q' => 'Gln',
242             'R' => 'Arg',
243             'S' => 'Ser',
244             'T' => 'Thr',
245             'V' => 'Val',
246             'W' => 'Trp',
247             'Y' => 'Tyr',
248             'Z' => 'Glx',
249             'X' => 'Xaa',
250             '*' => 'Ter',
251             'U' => 'Sec',
252             'O' => 'Pyl',
253             'J' => 'Xle'
254             );
255              
256             =head2 seq3
257              
258             Title : seq3
259             Usage : $string = Bio::SeqUtils->seq3($seq)
260             Function: Read only method that returns the amino acid sequence as a
261             string of three letter codes. alphabet has to be
262             'protein'. Output follows the IUPAC standard plus 'Ter' for
263             terminator. Any unknown character, including the default
264             unknown character 'X', is changed into 'Xaa'. A noncoded
265             aminoacid selenocystein is recognized (Sec, U).
266              
267             Returns : A scalar
268             Args : character used for stop in the protein sequence optional,
269             defaults to '*' string used to separate the output amino
270             acid codes, optional, defaults to ''
271              
272             =cut
273              
274             sub seq3 {
275 4     4 1 7 my ( $self, $seq, $stop, $sep ) = @_;
276              
277 4 50       18 $seq->isa('Bio::PrimarySeqI')
278             || $self->throw('Not a Bio::PrimarySeqI object but [$self]');
279 4 50       6 $seq->alphabet eq 'protein'
280             || $self->throw('Not a protein sequence');
281              
282 4 100       8 if ( defined $stop ) {
283 1 50       3 length $stop != 1
284             and $self->throw('One character stop needed, not [$stop]');
285 1         2 $THREECODE{$stop} = "Ter";
286             }
287 4   100     11 $sep ||= '';
288              
289 4         3 my $aa3s;
290 4         6 foreach my $aa ( split //, uc $seq->seq ) {
291 87 50       133 $THREECODE{$aa} and $aa3s .= $THREECODE{$aa} . $sep, next;
292 0         0 $aa3s .= 'Xaa' . $sep;
293             }
294 4 100       14 $sep and substr( $aa3s, -( length $sep ), length $sep ) = '';
295 4         13 return $aa3s;
296             }
297              
298             =head2 seq3in
299              
300             Title : seq3in
301             Usage : $seq = Bio::SeqUtils->seq3in($seq, 'MetGlyTer')
302             Function: Method for changing of the sequence of a
303             Bio::PrimarySeqI sequence object. The three letter amino
304             acid input string is converted into one letter code. Any
305             unknown character triplet, including the default 'Xaa', is
306             converted into 'X'.
307              
308             Returns : Bio::PrimarySeq object
309             Args : sequence string
310             optional character to be used for stop in the protein sequence,
311             defaults to '*'
312             optional character to be used for unknown in the protein sequence,
313             defaults to 'X'
314              
315             =cut
316              
317             sub seq3in {
318 3     3 1 5 my ( $self, $seq, $string, $stop, $unknown ) = @_;
319              
320 3 50       13 $seq->isa('Bio::PrimarySeqI')
321             || $self->throw("Not a Bio::PrimarySeqI object but [$self]");
322 3 50       7 $seq->alphabet eq 'protein'
323             || $self->throw('Not a protein sequence');
324              
325 3 50       7 if ( defined $stop ) {
326 0 0       0 length $stop != 1
327             and $self->throw("One character stop needed, not [$stop]");
328 0         0 $ONECODE{'Ter'} = $stop;
329             }
330 3 50       8 if ( defined $unknown ) {
331 0 0       0 length $unknown != 1
332             and $self->throw("One character stop needed, not [$unknown]");
333 0         0 $ONECODE{'Xaa'} = $unknown;
334             }
335              
336 3         2 my ( $aas, $aa3 );
337 3         7 my $length = ( length $string ) - 2;
338 3         8 for ( my $i = 0 ; $i < $length ; $i += 3 ) {
339 89         52 $aa3 = substr( $string, $i, 3 );
340 89         57 $aa3 = ucfirst( lc($aa3) );
341 89 100       185 $ONECODE{$aa3} and $aas .= $ONECODE{$aa3}, next;
342 1         3 $aas .= $ONECODE{'Xaa'};
343             }
344 3         7 $seq->seq($aas);
345 3         7 return $seq;
346             }
347              
348             =head2 translate_3frames
349              
350             Title : translate_3frames
351             Usage : @prots = Bio::SeqUtils->translate_3frames($seq)
352             Function: Translate a nucleotide sequence in three forward frames.
353             The IDs of the sequences are appended with '-0F', '-1F', '-2F'.
354             Returns : An array of seq objects
355             Args : sequence object
356             same arguments as to Bio::PrimarySeqI::translate
357              
358             =cut
359              
360             sub translate_3frames {
361 3     3 1 4 my ( $self, $seq, @args ) = @_;
362              
363 3 50       15 $self->throw( 'Object [$seq] '
364             . 'of class ['
365             . ref($seq)
366             . '] can not be translated.' )
367             unless $seq->can('translate');
368              
369 3         4 my ( $stop, $unknown, $frame, $tableid, $fullCDS, $throw ) = @args;
370 3         3 my @seqs;
371 3         2 my $f = 0;
372 3         6 while ( $f != 3 ) {
373 9         15 my $translation =
374             $seq->translate( $stop, $unknown, $f, $tableid, $fullCDS, $throw );
375 9         15 $translation->id( $seq->id . "-" . $f . "F" );
376 9         9 push @seqs, $translation;
377 9         17 $f++;
378             }
379              
380 3         8 return @seqs;
381             }
382              
383             =head2 translate_6frames
384              
385             Title : translate_6frames
386             Usage : @prots = Bio::SeqUtils->translate_6frames($seq)
387             Function: translate a nucleotide sequence in all six frames
388             The IDs of the sequences are appended with '-0F', '-1F', '-2F',
389             '-0R', '-1R', '-2R'.
390             Returns : An array of seq objects
391             Args : sequence object
392             same arguments as to Bio::PrimarySeqI::translate
393              
394             =cut
395              
396             sub translate_6frames {
397 1     1 1 336 my ( $self, $seq, @args ) = @_;
398              
399 1         3 my @seqs = $self->translate_3frames( $seq, @args );
400 1         7 my @seqs2 = $self->translate_3frames( $seq->revcom, @args );
401 1         2 foreach my $seq2 (@seqs2) {
402 3         5 my ($tmp) = $seq2->id;
403 3         8 $tmp =~ s/F$/R/g;
404 3         4 $seq2->id($tmp);
405             }
406 1         3 return @seqs, @seqs2;
407             }
408              
409             =head2 valid_aa
410              
411             Title : valid_aa
412             Usage : my @aa = $table->valid_aa
413             Function: Retrieves a list of the valid amino acid codes.
414             The list is ordered so that first 21 codes are for unique
415             amino acids. The rest are ['B', 'Z', 'X', '*'].
416             Returns : array of all the valid amino acid codes
417             Args : [optional] $code => [0 -> return list of 1 letter aa codes,
418             1 -> return list of 3 letter aa codes,
419             2 -> return associative array of both ]
420              
421             =cut
422              
423             sub valid_aa {
424 410     410 1 2268 my ( $self, $code ) = @_;
425              
426 410 100       1573 if ( !$code ) {
    100          
    50          
427 204         251 my @codes;
428 204         1923 foreach my $c ( sort values %ONECODE ) {
429 5508 100       9101 push @codes, $c unless ( $c =~ /[BZX\*]/ );
430             }
431 204         401 push @codes, qw(B Z X *); # so they are in correct order ?
432 204         1308 return @codes;
433             }
434             elsif ( $code == 1 ) {
435 1         2 my @codes;
436 1         10 foreach my $c ( sort keys %ONECODE ) {
437 27 100       74 push @codes, $c unless ( $c =~ /(Asx|Glx|Xaa|Ter)/ );
438             }
439 1         3 push @codes, ( 'Asx', 'Glx', 'Xaa', 'Ter' );
440 1         12 return @codes;
441             }
442             elsif ( $code == 2 ) {
443 205         2117 my %codes = %ONECODE;
444 205         814 foreach my $c ( keys %ONECODE ) {
445 5535         3492 my $aa = $ONECODE{$c};
446 5535         5200 $codes{$aa} = $c;
447             }
448 205         4762 return %codes;
449             }
450             else {
451 0         0 $self->warn(
452             "unrecognized code in " . ref($self) . " method valid_aa()" );
453 0         0 return ();
454             }
455             }
456              
457             =head2 mutate
458              
459             Title : mutate
460             Usage : Bio::SeqUtils->mutate($seq,$mutation1, $mutation2);
461             Function: Inplace editing of the sequence.
462              
463             The second argument can be a Bio::LiveSeq::Mutation object
464             or an array of them. The mutations are applied sequentially
465             checking only that their position is within the current
466             sequence. Insertions are inserted before the given
467             position.
468              
469             Returns : boolean
470             Args : sequence object
471             mutation, a Bio::LiveSeq::Mutation object, or an array of them
472              
473             See L.
474              
475             =cut
476              
477             sub mutate {
478 3     3 1 7 my ( $self, $seq, @mutations ) = @_;
479              
480 3 50       9 $self->throw( 'Object [$seq] '
481             . 'of class ['
482             . ref($seq)
483             . '] should be a Bio::PrimarySeqI ' )
484             unless $seq->isa('Bio::PrimarySeqI');
485 3 50       10 $self->throw( 'Object [$mutations[0]] '
486             . 'of class ['
487             . ref( $mutations[0] )
488             . '] should be a Bio::LiveSeq::Mutation' )
489             unless $mutations[0]->isa('Bio::LiveSeq::Mutation');
490              
491 3         20 foreach my $mutation (@mutations) {
492 4 50       6 $self->throw('Attempting to mutate sequence beyond its length')
493             unless $mutation->pos - 1 <= $seq->length;
494              
495 4         7 my $string = $seq->seq;
496 4         7 substr $string, $mutation->pos - 1, $mutation->len, $mutation->seq;
497 4         8 $seq->seq($string);
498             }
499 3         5 1;
500             }
501              
502             =head2 cat
503              
504             Title : cat
505             Usage : Bio::SeqUtils->cat(@seqs);
506             my $catseq=$seqs[0];
507             Function: Concatenates a list of Bio::Seq objects, adding them all on to the
508             end of the first sequence. Annotations and sequence features are
509             copied over from any additional objects, and the coordinates of any
510             copied features are adjusted appropriately.
511             Returns : a boolean
512             Args : array of sequence objects
513              
514             Note that annotations have no sequence locations. If you concatenate
515             sequences with the same annotations they will all be added.
516              
517             =cut
518              
519             sub cat {
520 4     4 1 40 my ( $self, $seq, @seqs ) = @_;
521 4 50       15 $self->throw( 'Object [$seq] '
522             . 'of class ['
523             . ref($seq)
524             . '] should be a Bio::PrimarySeqI ' )
525             unless $seq->isa('Bio::PrimarySeqI');
526              
527 4         7 for my $catseq (@seqs) {
528 5 50       12 $self->throw( 'Object [$catseq] '
529             . 'of class ['
530             . ref($catseq)
531             . '] should be a Bio::PrimarySeqI ' )
532             unless $catseq->isa('Bio::PrimarySeqI');
533              
534 5 100       12 $self->throw(
535             'Trying to concatenate sequences with different alphabets: '
536             . $seq->display_id . '('
537             . $seq->alphabet
538             . ') and '
539             . $catseq->display_id . '('
540             . $catseq->alphabet
541             . ')' )
542             unless $catseq->alphabet eq $seq->alphabet;
543              
544 4         9 my $length = $seq->length;
545 4         7 $seq->seq( $seq->seq . $catseq->seq );
546              
547             # move annotations
548 4 100 66     29 if ( $seq->isa("Bio::AnnotatableI")
549             and $catseq->isa("Bio::AnnotatableI") )
550             {
551 3         6 foreach my $key ( $catseq->annotation->get_all_annotation_keys() ) {
552              
553 3         5 foreach my $value ( $catseq->annotation->get_Annotations($key) )
554             {
555 5         8 $seq->annotation->add_Annotation( $key, $value );
556             }
557             }
558             }
559              
560             # move SeqFeatures
561 4 100 66     21 if ( $seq->isa('Bio::SeqI') and $catseq->isa('Bio::SeqI') ) {
562 3         6 for my $feat ( $catseq->get_SeqFeatures ) {
563 2         5 $seq->add_SeqFeature( $self->_coord_adjust( $feat, $length ) );
564             }
565             }
566              
567             }
568 3         9 1;
569             }
570              
571             =head2 trunc_with_features
572              
573             Title : trunc_with_features
574             Usage : $trunc=Bio::SeqUtils->trunc_with_features($seq, $start, $end);
575             Function: Like Bio::Seq::trunc, but keeps features (adjusting coordinates
576             where necessary. Features that partially overlap the region have
577             their location changed to a Bio::Location::Fuzzy.
578             Returns : A new sequence object
579             Args : A sequence object, start coordinate, end coordinate (inclusive)
580              
581              
582             =cut
583              
584             sub trunc_with_features {
585 204     204   282907 use Bio::Range;
  204         297  
  204         658764  
586 1     1 1 2 my ( $self, $seq, $start, $end ) = @_;
587 1 50       4 $self->throw( 'Object [$seq] '
588             . 'of class ['
589             . ref($seq)
590             . '] should be a Bio::SeqI ' )
591             unless $seq->isa('Bio::SeqI');
592 1         8 my $trunc = $seq->trunc( $start, $end );
593 1         9 my $truncrange =
594             Bio::Range->new( -start => $start, -end => $end, -strand => 0 );
595              
596             # make sure that there is no annotation or features in $trunc
597             # (->trunc() now clone objects except for Bio::Seq::LargePrimarySeq)
598 1         3 $trunc->annotation->remove_Annotations;
599 1         3 $trunc->remove_SeqFeatures;
600              
601             # move annotations
602 1         3 foreach my $key ( $seq->annotation->get_all_annotation_keys() ) {
603 1         3 foreach my $value ( $seq->annotation->get_Annotations($key) ) {
604 1         3 $trunc->annotation->add_Annotation( $key, $value );
605             }
606             }
607              
608             # move features
609 1         3 foreach (
610             grep {
611 3 50       11 $_ = $self->_coord_adjust( $_, 1 - $start, $end + 1 - $start )
612             if $_->overlaps($truncrange)
613             } $seq->get_SeqFeatures
614             )
615             {
616 3         5 $trunc->add_SeqFeature($_);
617             }
618 1         11 return $trunc;
619             }
620              
621             =head2 delete
622            
623             Title : delete
624             Function: cuts a segment out of a sequence and re-joins the left and right fragments
625             (like splicing or digesting and re-ligating a molecule).
626             Positions (and types) of sequence features are adjusted accordingly:
627             Features that span the cut site are converted to split featuress to
628             indicate the disruption.
629             Features that extend into the cut-out fragment are truncated.
630             A new molecule is created and returned.
631             Usage : my $cutseq = Bio::SeqUtils::PbrTools->cut( $seq, 1000, 1100 );
632             Args : a Bio::PrimarySeqI compliant object to cut,
633             first nt of the segment to be deleted
634             last nt of the segment to be deleted
635             optional:
636             hash-ref of options:
637             clone_obj: if true, clone the input sequence object rather
638             than calling "new" on the object's class
639              
640             Returns : a new Bio::Seq object
641              
642             =cut
643              
644             sub delete {
645 6     6 1 252 my $self = shift;
646 6         11 my ( $seq, $left, $right, $opts_ref ) = @_;
647 6 50 66     24 $self->throw( 'was expecting 3-4 paramters but got ' . @_ )
648             unless @_ == 3 || @_ == 4;
649              
650 6 50 33     42 $self->throw(
651             'Object of class [' . ref($seq) . '] should be a Bio::PrimarySeqI ' )
652             unless blessed($seq) && $seq->isa('Bio::PrimarySeqI');
653              
654 6 50       11 $self->throw("Left coordinate ($left) must be >= 1") if $left < 1;
655 6 50       14 if ( $right > $seq->length ) {
656 0         0 $self->throw( "Right coordinate ($right) must be less than "
657             . 'sequence length ('
658             . $seq->length
659             . ')' );
660             }
661              
662             # piece together the sequence string of the remaining fragments
663 6         18 my $left_seq = $seq->subseq( 1, $left - 1 );
664 6         14 my $right_seq = $seq->subseq( $right + 1, $seq->length );
665 6 50 33     18 if ( !$left_seq || !$right_seq ) {
666 0         0 $self->throw(
667             'could not assemble sequences. At least one of the fragments is empty'
668             );
669             }
670 6         10 my $seq_str = $left_seq . $right_seq;
671              
672             # create the new seq object with the same class as the recipient
673             # or (if requested), make a clone of the existing object. In the
674             # latter case we need to remove sequence features from the cloned
675             # object instead of copying them
676 6         5 my $product;
677 6 100       9 if ( $opts_ref->{clone_obj} ) {
678 2         5 $product = $self->_new_seq_via_clone( $seq, $seq_str );
679             }
680             else {
681 4         13 $product = $self->_new_seq_from_old( $seq, { seq => $seq_str } );
682             }
683              
684             # move sequence features
685 4 50 33     23 if ( $product->isa('Bio::SeqI') && $seq->isa('Bio::SeqI') ) {
686 4         9 for my $feat ( $seq->get_SeqFeatures ) {
687 26         42 my $adjfeat = $self->_coord_adjust_deletion( $feat, $left, $right );
688 26 100       83 $product->add_SeqFeature($adjfeat) if $adjfeat;
689             }
690             }
691              
692             # add a feature to annotatde the deletion
693 4         33 my $deletion_feature = Bio::SeqFeature::Generic->new(
694             -primary_tag => 'misc_feature',
695             -tag => { note => 'deletion of ' . ( $right - $left + 1 ) . 'bp' },
696             -location => Bio::Location::Simple->new(
697             -start => $left - 1,
698             -end => $left,
699             -location_type => 'IN-BETWEEN'
700             )
701             );
702 4         13 $product->add_SeqFeature($deletion_feature);
703 4         17 return $product;
704             }
705              
706             =head2 insert
707            
708             Title : insert
709             Function: inserts a fragment (a Bio::Seq object) into a nother sequence object
710             adding all annotations and features to the final product.
711             Features that span the insertion site are converted to split
712             features to indicate the disruption.
713             A new feature is added to indicate the inserted fragment itself.
714             A new molecule is created and returned.
715             Usage : # insert a fragment after pos 1000
716             my $insert_seq = Bio::SeqUtils::PbrTools->insert(
717             $recipient_seq,
718             $fragment_seq,
719             1000
720             );
721             Args : recipient sequence (a Bio::PrimarySeqI compliant object),
722             a fragmetn to insert (Bio::PrimarySeqI compliant object),
723             insertion position (fragment is inserted to the right of this pos)
724             pos=0 will prepend the fragment to the recipient
725             optional:
726             hash-ref of options:
727             clone_obj: if true, clone the input sequence object rather
728             than calling "new" on the object's class
729             Returns : a new Bio::Seq object
730              
731             =cut
732              
733             sub insert {
734 3     3 1 22 my $self = shift;
735 3         6 my ( $recipient, $fragment, $insert_pos, $opts_ref ) = @_;
736 3 50 66     22 $self->throw( 'was expecting 3-4 paramters but got ' . @_ )
737             unless @_ == 3 || @_ == 4;
738              
739 3 50 33     24 $self->throw( 'Recipient object of class ['
740             . ref($recipient)
741             . '] should be a Bio::PrimarySeqI ' )
742             unless blessed($recipient) && $recipient->isa('Bio::PrimarySeqI');
743              
744 3 50 33     18 $self->throw( 'Fragment object of class ['
745             . ref($fragment)
746             . '] should be a Bio::PrimarySeqI ' )
747             unless blessed($fragment) && $fragment->isa('Bio::PrimarySeqI');
748              
749 3 50       12 $self->throw( 'Can\'t concatenate sequences with different alphabets: '
750             . 'recipient is '
751             . $recipient->alphabet
752             . ' and fragment is '
753             . $fragment->alphabet )
754             unless $recipient->alphabet eq $fragment->alphabet;
755              
756 3 50 33     14 if ( $insert_pos < 0 or $insert_pos > $recipient->length ) {
757 0         0 $self->throw( "insertion position ($insert_pos) must be between 0 and "
758             . 'recipient sequence length ('
759             . $recipient->length
760             . ')' );
761             }
762              
763 3 50 33     21 if ( $fragment->can('is_circular') && $fragment->is_circular ) {
764 0         0 $self->throw('Can\'t insert circular fragments');
765             }
766              
767 3 50       7 if ( !$recipient->seq ) {
768 0         0 $self->throw(
769             'Recipient has no sequence, can not insert into this object');
770             }
771              
772             # construct raw sequence of the new molecule
773 3 50       10 my $left_seq =
774             $insert_pos > 0
775             ? $recipient->subseq( 1, $insert_pos )
776             : '';
777 3         7 my $mid_seq = $fragment->seq;
778 3 50       8 my $right_seq =
779             $insert_pos < $recipient->length
780             ? $recipient->subseq( $insert_pos + 1, $recipient->length )
781             : '';
782 3         9 my $seq_str = $left_seq . $mid_seq . $right_seq;
783              
784             # create the new seq object with the same class as the recipient
785             # or (if requested), make a clone of the existing object. In the
786             # latter case we need to remove sequence features from the cloned
787             # object instead of copying them
788 3         3 my $product;
789 3 100       9 if ( $opts_ref->{clone_obj} ) {
790 1         2 $product = $self->_new_seq_via_clone( $recipient, $seq_str );
791             }
792             else {
793 2         2 my @desc;
794 2 50       4 push @desc, 'Inserted fragment: ' . $fragment->desc
795             if defined $fragment->desc;
796 2 50       4 push @desc, 'Recipient: ' . $recipient->desc
797             if defined $recipient->desc;
798 2   50     6 $product = $self->_new_seq_from_old(
      33        
      50        
799             $recipient,
800             {
801             seq => $seq_str,
802             display_id => $recipient->display_id,
803             accession_number => $recipient->accession_number || '',
804             alphabet => $recipient->alphabet,
805             desc => join( '; ', @desc ),
806             verbose => $recipient->verbose || $fragment->verbose,
807             is_circular => $recipient->is_circular || 0,
808             }
809             );
810              
811             } # if clone_obj
812              
813             # move annotations from fragment to product
814 3 50 33     22 if ( $product->isa("Bio::AnnotatableI")
815             && $fragment->isa("Bio::AnnotatableI") )
816             {
817 3         7 foreach my $key ( $fragment->annotation->get_all_annotation_keys ) {
818 3         5 foreach my $value ( $fragment->annotation->get_Annotations($key) ) {
819 3         5 $product->annotation->add_Annotation( $key, $value );
820             }
821             }
822             }
823              
824             # move sequence features to product with adjusted coordinates
825 3 50       9 if ( $product->isa('Bio::SeqI') ) {
826              
827             # for the fragment, just shift the features to new position
828 3 50       8 if ( $fragment->isa('Bio::SeqI') ) {
829 3         5 for my $feat ( $fragment->get_SeqFeatures ) {
830 3         19 my $adjfeat = $self->_coord_adjust( $feat, $insert_pos );
831 3 50       14 $product->add_SeqFeature($adjfeat) if $adjfeat;
832             }
833             }
834              
835             # for recipient, shift and modify features according to insertion.
836 3 50       10 if ( $recipient->isa('Bio::SeqI') ) {
837 3         4 for my $feat ( $recipient->get_SeqFeatures ) {
838 17         30 my $adjfeat =
839             $self->_coord_adjust_insertion( $feat, $insert_pos,
840             $fragment->length );
841 17 100       56 $product->add_SeqFeature($adjfeat) if $adjfeat;
842             }
843             }
844             }
845              
846             # add a feature to annotate the insertion
847 3         19 my $insertion_feature = Bio::SeqFeature::Generic->new(
848             -start => $insert_pos + 1,
849             -end => $insert_pos + $fragment->length,
850             -primary_tag => 'misc_feature',
851             -tag => { note => 'inserted fragment' },
852             );
853 3         11 $product->add_SeqFeature($insertion_feature);
854              
855 3         13 return $product;
856             }
857              
858             =head2 ligate
859            
860             title : ligate
861             function: pastes a fragment (which can also have features) into a recipient
862             sequence between two "cut" sites, preserving features and adjusting
863             their locations.
864             This is a shortcut for deleting a segment from a sequence object followed
865             by an insertion of a fragmnet and is supposed to be used to simulate
866             in-vitro cloning where a recipient (a vector) is digested and a fragment
867             is then ligated into the recipient molecule. The fragment can be flipped
868             (reverse-complemented with all its features).
869             A new sequence object is returned to represent the product of the reaction.
870             Features and annotations are transferred from the insert to the product
871             and features on the recipient are adjusted according to the methods
872             L amd L:
873             Features spanning the insertion site will be split up into two sub-locations.
874             (Sub-)features in the deleted region are themselves deleted.
875             (Sub-)features that extend into the deleted region are truncated.
876             The class of the product object depends on the class of the recipient (vector)
877             sequence object. if it is not possible to instantiate a new
878             object of that class, a Bio::Primaryseq object is created instead.
879             usage : # insert the flipped fragment between positions 1000 and 1100 of the
880             # vector, i.e. everything between these two positions is deleted and
881             # replaced by the fragment
882             my $new_molecule = Bio::Sequtils::Pbrtools->ligate(
883             -recipient => $vector,
884             -fragment => $fragment,
885             -left => 1000,
886             -right => 1100,
887             -flip => 1,
888             -clone_obj => 1
889             );
890             args : recipient: the recipient/vector molecule
891             fragment: molecule that is to be ligated into the vector
892             left: left cut site (fragment will be inserted to the right of
893             this position)
894             optional:
895             right: right cut site (fragment will be inseterted to the
896             left of this position). defaults to left+1
897             flip: boolean, if true, the fragment is reverse-complemented
898             (including features) before inserting
899             clone_obj: if true, clone the recipient object to create the product
900             instead of calling "new" on its class
901             returns : a new Bio::Seq object of the ligated fragments
902              
903             =cut
904              
905             sub ligate {
906 3     3 1 73 my $self = shift;
907 3         13 my ( $recipient, $fragment, $left, $right, $flip, $clone_obj ) =
908             $self->_rearrange( [qw(RECIPIENT FRAGMENT LEFT RIGHT FLIP CLONE_OBJ )],
909             @_ );
910 3 50       11 $self->throw("missing required parameter 'recipient'") unless $recipient;
911 3 50       6 $self->throw("missing required parameter 'fragment'") unless $fragment;
912 3 50       6 $self->throw("missing required parameter 'left'") unless defined $left;
913              
914 3   33     6 $right ||= $left + 1;
915              
916 3 50 33     20 $self->throw(
917             "Fragment must be a Bio::PrimarySeqI compliant object but it is a "
918             . ref($fragment) )
919             unless blessed($fragment) && $fragment->isa('Bio::PrimarySeqI');
920              
921 3 50       11 $fragment = $self->revcom_with_features($fragment) if $flip;
922              
923 3         4 my $opts_ref = {};
924 3 100       8 $opts_ref->{clone_obj} = 1 if $clone_obj;
925              
926             # clone in two steps: first delete between the insertion sites,
927             # then insert the fragment. Step 1 is skipped if insert positions
928             # are adjacent (no deletion)
929 3         3 my ( $product1, $product2 );
930 3         5 eval {
931 3 50       5 if ( $right == $left + 1 ) {
932 0         0 $product1 = $recipient;
933             }
934             else {
935 3         9 $product1 =
936             $self->delete( $recipient, $left + 1, $right - 1, $opts_ref );
937             }
938             };
939 3 100       10 $self->throw( "Failed in step 1 (cut recipient): " . $@ ) if $@;
940 2         3 eval { $product2 = $self->insert( $product1, $fragment, $left, $opts_ref ) };
  2         6  
941 2 50       5 $self->throw( "Failed in step 2 (insert fragment): " . $@ ) if $@;
942              
943 2         10 return $product2;
944              
945             }
946              
947             =head2 _coord_adjust_deletion
948            
949             title : _coord_adjust_deletion
950             function: recursively adjusts coordinates of seqfeatures on a molecule
951             where a segment has been deleted.
952             (sub)features that span the deletion site become split features.
953             (sub)features that extend into the deletion site are truncated.
954             A note is added to the feature to inform about the size and
955             position of the deletion.
956             usage : my $adjusted_feature = Bio::Sequtils::_coord_adjust_deletion(
957             $feature,
958             $start,
959             $end
960             );
961             args : a Bio::SeqFeatureI compliant object,
962             start (inclusive) position of the deletion site,
963             end (inclusive) position of the deletion site
964             returns : a Bio::SeqFeatureI compliant object
965              
966             =cut
967              
968             sub _coord_adjust_deletion {
969 38     38   44 my ( $self, $feat, $left, $right ) = @_;
970              
971 38 50       84 $self->throw( 'object [$feat] '
972             . 'of class ['
973             . ref($feat)
974             . '] should be a Bio::SeqFeatureI ' )
975             unless $feat->isa('Bio::SeqFeatureI');
976 38 50 33     101 $self->throw('missing coordinates: need a left and a right position')
977             unless defined $left && defined $right;
978              
979 38 50       53 if ( $left > $right ) {
980 0 0 0     0 if ( $feat->can('is_circular') && $feat->is_circular ) {
981              
982             # todo handle circular molecules
983 0         0 $self->throw(
984             'can not yet handle deletions in circular molecules if deletion spans origin'
985             );
986             }
987             else {
988 0         0 $self->throw(
989             "left coordinate ($left) must be less than right ($right)"
990             . " but it was greater" );
991             }
992             }
993 38         120 my $deletion = Bio::Location::Simple->new(
994             -start => $left,
995             -end => $right,
996             );
997 38         43 my $del_length = $right - $left + 1;
998              
999 38         28 my @adjsubfeat;
1000 38         69 for my $subfeat ( $feat->get_SeqFeatures ) {
1001 12         36 my $adjsubfeat =
1002             $self->_coord_adjust_deletion( $subfeat, $left, $right );
1003 12 100       24 push @adjsubfeat, $adjsubfeat if $adjsubfeat;
1004             }
1005              
1006 38         34 my @loc;
1007             my $note;
1008 38         53 for ( $feat->location->each_Location ) {
1009 38 100       69 next if $deletion->contains($_); # this location will be deleted;
1010 25         50 my $strand = $_->strand;
1011 25         36 my $type = $_->location_type;
1012 25         32 my $start = $_->start;
1013 25 50       87 my $start_type = $_->can('start_pos_type') ? $_->start_pos_type : undef;
1014 25         37 my $end = $_->end;
1015 25 50       79 my $end_type = $_->can('end_pos_type') ? $_->end_pos_type : undef;
1016 25         31 my @newcoords = ();
1017 25 100 100     36 if ( $start < $deletion->start && $end > $deletion->end )
    100 100        
    100 100        
    100          
1018             { # split the feature
1019 4         9 @newcoords = (
1020             [ $start, ( $deletion->start - 1 ), $start_type, $end_type ],
1021             [
1022             ( $deletion->start ), $end - $del_length,
1023             $start_type, $end_type
1024             ]
1025             );
1026 4         13 $note =
1027             $del_length
1028             . 'bp internal deletion between pos '
1029             . ( $deletion->start - 1 ) . ' and '
1030             . $deletion->start;
1031             }
1032             elsif ( $_->start < $deletion->start && $_->end >= $deletion->start )
1033             { # truncate feature end
1034 8         14 @newcoords =
1035             ( [ $start, ( $deletion->start - 1 ), $start_type, $end_type ] );
1036 8         15 $note =
1037             ( $end - $deletion->start + 1 ) . 'bp deleted from feature ';
1038 8 50       14 if ( $feat->strand ) {
1039 0 0       0 $note .= $feat->strand == 1 ? "3' " : "5' ";
1040             }
1041 8         13 $note .= 'end';
1042             }
1043             elsif ( $_->start <= $deletion->end && $_->end > $deletion->end )
1044             { # truncate feature start and shift end
1045 5         8 @newcoords = (
1046             [
1047             ( $deletion->start ), $end - $del_length,
1048             $start_type, $end_type
1049             ]
1050             );
1051 5         7 $note =
1052             ( $deletion->end - $start + 1 ) . 'bp deleted from feature ';
1053 5 100       9 if ( $feat->strand ) {
1054 2 50       4 $note .= $feat->strand == 1 ? "5' end" : "3' end";
1055             }
1056             else {
1057 3         4 $note .= 'start';
1058             }
1059             }
1060             elsif ( $start >= $deletion->end ) { # just shift entire location
1061 4         11 @newcoords = (
1062             [
1063             $start - $del_length, $end - $del_length,
1064             $start_type, $end_type
1065             ]
1066             );
1067             }
1068             else { # not affected by deletion
1069 4         11 @newcoords = ( [ $start, $end, $start_type, $end_type ] );
1070             }
1071              
1072             # if we have no coordinates, we return nothing
1073             # the feature is deleted
1074 25 50       46 return unless @newcoords;
1075              
1076 25         43 my @subloc =
1077             $self->_location_objects_from_coordinate_list( \@newcoords, $strand,
1078             $type );
1079 25         50 push @loc, $self->_single_loc_object_from_collection(@subloc);
1080             } # each location
1081              
1082             # create new feature based on original one and move annotation across
1083 38         92 my $newfeat =
1084             Bio::SeqFeature::Generic->new( -primary => $feat->primary_tag );
1085 38         65 foreach my $key ( $feat->annotation->get_all_annotation_keys() ) {
1086 4         8 foreach my $value ( $feat->annotation->get_Annotations($key) ) {
1087 4         9 $newfeat->annotation->add_Annotation( $key, $value );
1088             }
1089             }
1090 38         63 foreach my $key ( $feat->get_all_tags() ) {
1091 0         0 $newfeat->add_tag_value( $key, $feat->get_tag_values($key) );
1092             }
1093              
1094             # If we have a note about the deleted bases, add it
1095 38 100       53 if ($note) {
1096 17         21 $newfeat->add_tag_value( 'note', $note );
1097             }
1098              
1099             # set modified location(s) for the new feature and
1100             # add its subfeatures if any
1101 38         67 my $loc = $self->_single_loc_object_from_collection(@loc);
1102 38 100       91 $loc ? $newfeat->location($loc) : return;
1103 25         42 $newfeat->add_SeqFeature($_) for @adjsubfeat;
1104              
1105 25         70 return $newfeat;
1106              
1107             }
1108              
1109             =head2 _coord_adjust_insertion
1110            
1111             title : _coord_adjust_insertion
1112             function: recursively adjusts coordinates of seqfeatures on a molecule
1113             where another sequence has been inserted.
1114             (sub)features that span the insertion site become split features
1115             and a note is added about the size and positin of the insertion.
1116             Features with an IN-BETWEEN location at the insertion site
1117             are lost (such features can only exist between adjacent bases)
1118             usage : my $adjusted_feature = Bio::Sequtils::_coord_adjust_insertion(
1119             $feature,
1120             $insert_pos,
1121             $insert_length
1122             );
1123             args : a Bio::SeqFeatureI compliant object,
1124             insertion position (insert to the right of this position)
1125             length of inserted fragment
1126             returns : a Bio::SeqFeatureI compliant object
1127              
1128             =cut
1129              
1130             sub _coord_adjust_insertion {
1131 22     22   22 my ( $self, $feat, $insert_pos, $insert_len ) = @_;
1132              
1133 22 50       46 $self->throw( 'object [$feat] '
1134             . 'of class ['
1135             . ref($feat)
1136             . '] should be a Bio::SeqFeatureI ' )
1137             unless $feat->isa('Bio::SeqFeatureI');
1138 22 50       26 $self->throw('missing insert position') unless defined $insert_pos;
1139 22 50       28 $self->throw('missing insert length') unless defined $insert_len;
1140              
1141 22         17 my @adjsubfeat;
1142 22         29 for my $subfeat ( $feat->get_SeqFeatures ) {
1143 5         18 push @adjsubfeat,
1144             $self->_coord_adjust_insertion( $subfeat, $insert_pos, $insert_len );
1145             }
1146              
1147 22         20 my @loc;
1148             my $note;
1149 22         30 for ( $feat->location->each_Location ) {
1150              
1151             # loose IN-BETWEEN features at the insertion site
1152             next
1153 22 100 66     254 if ( $_->location_type eq 'IN-BETWEEN' && $_->start == $insert_pos );
1154 20         37 my $strand = $_->strand;
1155 20         27 my $type = $_->location_type;
1156 20         27 my $start = $_->start;
1157 20 50       65 my $start_type = $_->can('start_pos_type') ? $_->start_pos_type : undef;
1158 20         30 my $end = $_->end;
1159 20 50       56 my $end_type = $_->can('end_pos_type') ? $_->end_pos_type : undef;
1160 20         22 my @newcoords = ();
1161 20 100 100     63 if ( $start <= $insert_pos && $end > $insert_pos ) { # split the feature
    100          
1162 3         7 @newcoords = (
1163             [ $start, $insert_pos, $start_type, $end_type ],
1164             [
1165             ( $insert_pos + 1 + $insert_len ), $end + $insert_len,
1166             $start_type, $end_type
1167             ]
1168             );
1169 3         11 $note =
1170             $insert_len
1171             . 'bp internal insertion between pos '
1172             . $insert_pos . ' and '
1173             . ( $insert_pos + $insert_len + 1 );
1174              
1175             }
1176             elsif ( $start > $insert_pos ) { # just shift entire location
1177 8         15 @newcoords = (
1178             [
1179             $start + $insert_len, $end + $insert_len,
1180             $start_type, $end_type
1181             ]
1182             );
1183             }
1184             else { # not affected
1185 9         18 @newcoords = ( [ $start, $end, $start_type, $end_type ] );
1186             }
1187              
1188             # if we have deleted all coordinates, return nothing
1189             # (possible if all locations are IN-BETWEEN)
1190 20 50       31 return unless @newcoords;
1191              
1192 20         33 my @subloc =
1193             $self->_location_objects_from_coordinate_list( \@newcoords, $strand,
1194             $type );
1195              
1196             # put together final location which could be a split now
1197 20         36 push @loc, $self->_single_loc_object_from_collection(@subloc);
1198             } # each location
1199              
1200             # create new feature based on original one and move annotation across
1201 22         57 my $newfeat =
1202             Bio::SeqFeature::Generic->new( -primary => $feat->primary_tag );
1203 22         38 foreach my $key ( $feat->annotation->get_all_annotation_keys() ) {
1204 3         5 foreach my $value ( $feat->annotation->get_Annotations($key) ) {
1205 3         5 $newfeat->annotation->add_Annotation( $key, $value );
1206             }
1207             }
1208 22         37 foreach my $key ( $feat->get_all_tags() ) {
1209 10         15 $newfeat->add_tag_value( $key, $feat->get_tag_values($key) );
1210             }
1211              
1212             # If we have a note about the inserted bases, add it
1213 22 100       40 if ($note) {
1214 3         4 $newfeat->add_tag_value( 'note', $note );
1215             }
1216              
1217             # set modified location(s) for the new feature and
1218             # add its subfeatures if any
1219 22         34 my $loc = $self->_single_loc_object_from_collection(@loc);
1220 22 100       52 $loc ? $newfeat->location($loc) : return;
1221 20         32 $newfeat->add_SeqFeature($_) for @adjsubfeat;
1222              
1223 20         31 return $newfeat;
1224              
1225             }
1226              
1227             =head2 _single_loc_object_from_collection
1228            
1229             Title : _single_loc_object_from_collection
1230             Function: takes an array of location objects. Returns either a split
1231             location object if there are more than one locations in the
1232             array or returns the single location if there is only one
1233             Usage : my $loc = _single_loc_object_from_collection( @sublocs );
1234             Args : array of Bio::Location objects
1235             Returns : a single Bio:;Location object containing all locations
1236              
1237             =cut
1238              
1239             sub _single_loc_object_from_collection {
1240 119     119   139 my ( $self, @locs ) = @_;
1241 119         76 my $loc;
1242 119 100       269 if ( @locs > 1 ) {
    100          
1243 7         24 $loc = Bio::Location::Split->new;
1244 7         13 $loc->add_sub_Location(@locs);
1245             }
1246             elsif ( @locs == 1 ) {
1247 97         90 $loc = shift @locs;
1248             }
1249 119         196 return $loc;
1250             } # _single_loc_object_from_collection
1251              
1252             =head2 _location_objects_from_coordinate_list
1253            
1254             Title : _location_objects_from_coordinate_list
1255             Function: takes an array-ref of start/end coordinates, a strand and a
1256             type and returns a list of Bio::Location objects (Fuzzy by
1257             default, Simple in case of in-between coordinates).
1258             If location type is not "IN-BETWEEN", individual types may be
1259             passed in for start and end location as per Bio::Location::Fuzzy
1260             documentation.
1261             Usage : my @loc_objs = $self->_location_objects_from_coordinate_list(
1262             \@coords,
1263             $strand,
1264             $type
1265             );
1266             Args : array-ref of array-refs each containing:
1267             start, end [, start-type, end-type]
1268             where types are optional. If given, must be
1269             a one of ('BEFORE', 'AFTER', 'EXACT','WITHIN', 'BETWEEN')
1270             strand (all locations must be on same strand)
1271             location-type (EXACT, IN-BETWEEN etc)
1272             Returns : list of Bio::Location objects
1273              
1274             =cut
1275              
1276             sub _location_objects_from_coordinate_list {
1277 59     59   47 my $self = shift;
1278 59         68 my ( $coords_ref, $strand, $type ) = @_;
1279 59 50       79 $self->throw( 'expected 3 parameters but got ' . @_ ) unless @_ == 3;
1280 59 50       102 $self->throw('first argument must be an ARRAY reference#')
1281             unless ref($coords_ref) eq 'ARRAY';
1282              
1283 59         36 my @loc;
1284 59         67 foreach my $coords_set (@$coords_ref) {
1285 66         95 my ( $start, $end, $start_type, $end_type ) = @$coords_set;
1286              
1287             # taken from Bio::SeqUtils::_coord_adjust
1288 66 50       100 if ( $type ne 'IN-BETWEEN' ) {
1289 66         201 my $loc = Bio::Location::Fuzzy->new(
1290             -start => $start,
1291             -end => $end,
1292             -strand => $strand,
1293             -location_type => $type
1294             );
1295 66 100       147 $loc->start_pos_type($start_type) if $start_type;
1296 66 100       123 $loc->end_pos_type($end_type) if $end_type;
1297 66         112 push @loc, $loc;
1298             }
1299             else {
1300 0         0 push @loc,
1301             Bio::Location::Simple->new(
1302             -start => $start,
1303             -end => $end,
1304             -strand => $strand,
1305             -location_type => $type
1306             );
1307             }
1308             } # each coords_set
1309 59         121 return @loc;
1310             } # _location_objects_from_coordinate_list
1311              
1312             =head2 _new_seq_via_clone
1313            
1314             Title : _new_seq_via_clone
1315             Function: clone a sequence object using Bio::Root::Root::clone and set the new sequence string
1316             sequence features are removed.
1317             Usage : my $new_seq = $self->_new_seq_via_clone( $seq_obj, $seq_str );
1318             Args : original seq object [, new sequence string]
1319             Returns : a clone of the original sequence object, optionally with new sequence string
1320              
1321             =cut
1322              
1323             sub _new_seq_via_clone {
1324 3     3   4 my ( $self, $in_seq_obj, $seq_str ) = @_;
1325 3         11 my $out_seq_obj = $in_seq_obj->clone;
1326 3 50       23 $out_seq_obj->remove_SeqFeatures if $out_seq_obj->can('remove_SeqFeatures');
1327 3 50 33     10 if ( blessed $out_seq_obj->seq
1328             && $out_seq_obj->seq->isa('Bio::PrimarySeq') )
1329             {
1330 0         0 $out_seq_obj->seq->seq($seq_str);
1331             }
1332             else {
1333 3         6 $out_seq_obj->seq($seq_str);
1334             }
1335 3         5 return $out_seq_obj;
1336              
1337             } # _new_seq_via_clone
1338              
1339             =head2 _new_seq_from_old
1340            
1341             Title : _new_seq_from_old
1342             Function: creates a new sequence obejct, if possible of the same class as the old and adds
1343             attributes to it. Also copies annotation across to the new object.
1344             Usage : my $new_seq = $self->_new_seq_from_old( $seq_obj, { seq => $seq_str, display_id => 'some_ID'});
1345             Args : old sequence object
1346             hashref of attributes for the new sequence (sequence string etc.)
1347             Returns : a new Bio::Seq object
1348              
1349             =cut
1350              
1351             sub _new_seq_from_old {
1352 6     6   8 my ( $self, $in_seq_obj, $attr ) = @_;
1353 6 50 33     24 $self->throw('attributes must be a hashref')
1354             if $attr && ref($attr) ne 'HASH';
1355              
1356 6         5 my $seqclass;
1357 6 100       13 if ( $in_seq_obj->can_call_new ) {
1358 4         5 $seqclass = ref($in_seq_obj);
1359             }
1360             else {
1361 2         6 $seqclass = 'Bio::Primaryseq';
1362 2         26 $self->_attempt_to_load_seq;
1363             }
1364              
1365             my $out_seq_obj = $seqclass->new(
1366             -seq => $attr->{seq} || $in_seq_obj->seq,
1367             -display_id => $attr->{display_id} || $in_seq_obj->display_id,
1368             -accession_number => $attr->{accession_number}
1369             || $in_seq_obj->accession_number
1370             || '',
1371             -alphabet => $in_seq_obj->alphabet,
1372             -desc => $attr->{desc} || $in_seq_obj->desc,
1373             -verbose => $attr->{verbose} || $in_seq_obj->verbose,
1374 4   33     24 -is_circular => $attr->{is_circular} || $in_seq_obj->is_circular || 0,
      66        
      50        
      66        
      33        
      50        
1375             );
1376              
1377             # move the annotation across to the product
1378 4 50 33     24 if ( $out_seq_obj->isa("Bio::AnnotatableI")
1379             && $in_seq_obj->isa("Bio::AnnotatableI") )
1380             {
1381 4         7 foreach my $key ( $in_seq_obj->annotation->get_all_annotation_keys ) {
1382 4         7 foreach my $value ( $in_seq_obj->annotation->get_Annotations($key) )
1383             {
1384 4         6 $out_seq_obj->annotation->add_Annotation( $key, $value );
1385             }
1386             }
1387             }
1388 4         9 return $out_seq_obj;
1389             } # _new_seq_from_old
1390              
1391             =head2 _coord_adjust
1392              
1393             Title : _coord_adjust
1394             Usage : my $newfeat=Bio::SeqUtils->_coord_adjust($feature, 100, $seq->length);
1395             Function: Recursive subroutine to adjust the coordinates of a feature
1396             and all its subfeatures. If a sequence length is specified, then
1397             any adjusted features that have locations beyond the boundaries
1398             of the sequence are converted to Bio::Location::Fuzzy objects.
1399              
1400             Returns : A Bio::SeqFeatureI compliant object.
1401             Args : A Bio::SeqFeatureI compliant object,
1402             the number of bases to add to the coordinates
1403             (optional) the length of the parent sequence
1404              
1405              
1406             =cut
1407              
1408             sub _coord_adjust {
1409 8     8   11 my ( $self, $feat, $add, $length ) = @_;
1410 8 50       20 $self->throw( 'Object [$feat] '
1411             . 'of class ['
1412             . ref($feat)
1413             . '] should be a Bio::SeqFeatureI ' )
1414             unless $feat->isa('Bio::SeqFeatureI');
1415 8         9 my @adjsubfeat;
1416 8         13 for my $subfeat ( $feat->get_SeqFeatures ) {
1417 0         0 push @adjsubfeat, $self->_coord_adjust( $subfeat, $add, $length );
1418             }
1419 8         9 my @loc;
1420 8         12 for ( $feat->location->each_Location ) {
1421 8         17 my @coords = ( $_->start, $_->end );
1422 8         14 my $strand = $_->strand;
1423 8         29 my $type = $_->location_type;
1424 8         14 foreach (@coords) {
1425 16 50       26 $self->throw("can not handle negative feature positions (got: $_)")
1426             if $_ < 0;
1427 16 100 100     40 if ( $add + $_ < 1 ) {
    100          
1428 2         3 $_ = '<1';
1429             }
1430             elsif ( defined $length and $add + $_ > $length ) {
1431 1         3 $_ = ">$length";
1432             }
1433             else {
1434 13         16 $_ = $add + $_;
1435             }
1436             }
1437 8         23 push @loc,
1438             $self->_location_objects_from_coordinate_list( [ \@coords ],
1439             $strand, $type );
1440             }
1441 8         25 my $newfeat =
1442             Bio::SeqFeature::Generic->new( -primary => $feat->primary_tag );
1443 8         21 foreach my $key ( $feat->annotation->get_all_annotation_keys() ) {
1444 0         0 foreach my $value ( $feat->annotation->get_Annotations($key) ) {
1445 0         0 $newfeat->annotation->add_Annotation( $key, $value );
1446             }
1447             }
1448 8         17 foreach my $key ( $feat->get_all_tags() ) {
1449 6         12 $newfeat->add_tag_value( $key, $feat->get_tag_values($key) );
1450             }
1451 8         15 my $loc = $self->_single_loc_object_from_collection(@loc);
1452 8 50       20 $loc ? $newfeat->location($loc) : return;
1453 8         15 $newfeat->add_SeqFeature($_) for @adjsubfeat;
1454 8         23 return $newfeat;
1455             }
1456              
1457             =head2 revcom_with_features
1458              
1459             Title : revcom_with_features
1460             Usage : $revcom=Bio::SeqUtils->revcom_with_features($seq);
1461             Function: Like Bio::Seq::revcom, but keeps features (adjusting coordinates
1462             as appropriate.
1463             Returns : A new sequence object
1464             Args : A sequence object
1465              
1466              
1467             =cut
1468              
1469             sub revcom_with_features {
1470 5     5 1 9 my ( $self, $seq ) = @_;
1471 5 50       17 $self->throw( 'Object [$seq] '
1472             . 'of class ['
1473             . ref($seq)
1474             . '] should be a Bio::SeqI ' )
1475             unless $seq->isa('Bio::SeqI');
1476 5         19 my $revcom = $seq->revcom;
1477              
1478             # make sure that there is no annotation or features in $trunc
1479             # (->revcom() now clone objects except for Bio::Seq::LargePrimarySeq)
1480 5         11 $revcom->annotation->remove_Annotations;
1481 5         11 $revcom->remove_SeqFeatures;
1482              
1483             #move annotations
1484 5         10 foreach my $key ( $seq->annotation->get_all_annotation_keys() ) {
1485 4         7 foreach my $value ( $seq->annotation->get_Annotations($key) ) {
1486 4         8 $revcom->annotation->add_Annotation( $key, $value );
1487             }
1488             }
1489              
1490             #move features
1491 5         10 for ( map { $self->_feature_revcom( $_, $seq->length ) }
  6         12  
1492             reverse $seq->get_SeqFeatures )
1493             {
1494 6         13 $revcom->add_SeqFeature($_);
1495             }
1496 5         8 return $revcom;
1497             }
1498              
1499             =head2 _feature_revcom
1500              
1501             Title : _feature_revcom
1502             Usage : my $newfeat=Bio::SeqUtils->_feature_revcom($feature, $seq->length);
1503             Function: Recursive subroutine to reverse complement a feature and
1504             all its subfeatures. The length of the parent sequence must be
1505             specified.
1506              
1507             Returns : A Bio::SeqFeatureI compliant object.
1508             Args : A Bio::SeqFeatureI compliant object,
1509             the length of the parent sequence
1510              
1511              
1512             =cut
1513              
1514             sub _feature_revcom {
1515 6     6   8 my ( $self, $feat, $length ) = @_;
1516 6 50       17 $self->throw( 'Object [$feat] '
1517             . 'of class ['
1518             . ref($feat)
1519             . '] should be a Bio::SeqFeatureI ' )
1520             unless $feat->isa('Bio::SeqFeatureI');
1521 6         5 my @adjsubfeat;
1522 6         12 for my $subfeat ( $feat->get_SeqFeatures ) {
1523 0         0 push @adjsubfeat, $self->_feature_revcom( $subfeat, $length );
1524             }
1525 6         8 my @loc;
1526 6         9 for ( $feat->location->each_Location ) {
1527 6         12 my $type = $_->location_type;
1528 6         6 my $strand;
1529 6 100       11 if ( $_->strand == -1 ) { $strand = 1 }
  5 50       4  
1530 1         1 elsif ( $_->strand == 1 ) { $strand = -1 }
1531 0         0 else { $strand = $_->strand }
1532 6         16 my $newend =
1533             $self->_coord_revcom( $_->start, $_->start_pos_type, $length );
1534 6         14 my $newstart =
1535             $self->_coord_revcom( $_->end, $_->end_pos_type, $length );
1536 6         13 my $newstart_type = $_->end_pos_type;
1537 6 50       9 $newstart_type = 'BEFORE' if $_->end_pos_type eq 'AFTER';
1538 6 50       10 $newstart_type = 'AFTER' if $_->end_pos_type eq 'BEFORE';
1539 6         9 my $newend_type = $_->start_pos_type;
1540 6 50       9 $newend_type = 'BEFORE' if $_->start_pos_type eq 'AFTER';
1541 6 100       13 $newend_type = 'AFTER' if $_->start_pos_type eq 'BEFORE';
1542 6         17 push @loc,
1543             $self->_location_objects_from_coordinate_list(
1544             [ [ $newstart, $newend, $newstart_type, $newend_type ] ],
1545             $strand, $type );
1546             }
1547 6         17 my $newfeat =
1548             Bio::SeqFeature::Generic->new( -primary => $feat->primary_tag );
1549 6         16 foreach my $key ( $feat->annotation->get_all_annotation_keys() ) {
1550 0         0 foreach my $value ( $feat->annotation->get_Annotations($key) ) {
1551 0         0 $newfeat->annotation->add_Annotation( $key, $value );
1552             }
1553             }
1554 6         14 foreach my $key ( $feat->get_all_tags() ) {
1555 3         6 $newfeat->add_tag_value( $key, $feat->get_tag_values($key) );
1556             }
1557              
1558 6         13 my $loc = $self->_single_loc_object_from_collection(@loc);
1559 6 50       17 $loc ? $newfeat->location($loc) : return;
1560              
1561 6         9 $newfeat->add_SeqFeature($_) for @adjsubfeat;
1562 6         18 return $newfeat;
1563             }
1564              
1565             sub _coord_revcom {
1566 12     12   11 my ( $self, $coord, $type, $length ) = @_;
1567 12 50 33     41 if ( $type eq 'BETWEEN' or $type eq 'WITHIN' ) {
1568 0         0 $coord =~ s/(\d+)(\D*)(\d+)/$length+1-$3.$2.$length+1-$1/ge;
  0         0  
1569             }
1570             else {
1571 12         79 $coord =~ s/(\d+)/$length+1-$1/ge;
  12         33  
1572 12         15 $coord =~ tr/<>/>
1573 12 100 66     29 $coord = '>' . $coord
1574             if $type eq 'BEFORE' and substr( $coord, 0, 1 ) ne '>';
1575 12 50 33     25 $coord = '<' . $coord
1576             if $type eq 'AFTER' and substr( $coord, 0, 1 ) ne '<';
1577             }
1578 12         37 return $coord;
1579             }
1580              
1581             =head2 evolve
1582              
1583             Title : evolve
1584             Usage : my $newseq = Bio::SeqUtils->
1585             evolve($seq, $similarity, $transition_transversion_rate);
1586             Function: Mutates the sequence by point mutations until the similarity of
1587             the new sequence has decreased to the required level.
1588             Transition/transversion rate is adjustable.
1589             Returns : A new Bio::PrimarySeq object
1590             Args : sequence object
1591             percentage similarity (e.g. 80)
1592             tr/tv rate, optional, defaults to 1 (= 1:1)
1593              
1594             Set the verbosity of the Bio::SeqUtils object to positive integer to
1595             see the mutations as they happen.
1596              
1597             This method works only on nucleotide sequences. It prints a warning if
1598             you set the target similarity to be less than 25%.
1599              
1600             Transition/transversion ratio is an observed attribute of an sequence
1601             comparison. We are dealing here with the transition/transversion rate
1602             that we set for our model of sequence evolution.
1603              
1604             =cut
1605              
1606             sub evolve {
1607 1     1 1 1 my ( $self, $seq, $sim, $rate ) = @_;
1608 1   50     3 $rate ||= 1;
1609              
1610 1 50       4 $self->throw( 'Object [$seq] '
1611             . 'of class ['
1612             . ref($seq)
1613             . '] should be a Bio::PrimarySeqI ' )
1614             unless $seq->isa('Bio::PrimarySeqI');
1615              
1616 1 50 33     9 $self->throw(
1617             "[$sim] " . ' should be a positive integer or float under 100' )
1618             unless $sim =~ /^[+\d.]+$/ and $sim <= 100;
1619              
1620 1 50       3 $self->warn(
1621             "Nucleotide sequences are 25% similar by chance.
1622             Do you really want to set similarity to [$sim]%?\n"
1623             ) unless $sim > 25;
1624              
1625 1 50       3 $self->throw('Only nucleotide sequences are supported')
1626             if $seq->alphabet eq 'protein';
1627              
1628             # arrays of possible changes have transitions as first items
1629 1         1 my %changes;
1630 1         3 $changes{'a'} = [ 't', 'c', 'g' ];
1631 1         3 $changes{'t'} = [ 'a', 'c', 'g' ];
1632 1         2 $changes{'c'} = [ 'g', 'a', 't' ];
1633 1         2 $changes{'g'} = [ 'c', 'a', 't' ];
1634              
1635             # given the desired rate, find out where cut off points need to be
1636             # when random numbers are generated from 0 to 100
1637             # we are ignoring identical mutations (e.g. A->A) to speed things up
1638 1         2 my $bin_size = 100 / ( $rate + 2 );
1639 1         2 my $transition = 100 - ( 2 * $bin_size );
1640 1         2 my $first_transversion = $transition + $bin_size;
1641              
1642             # unify the look of sequence strings
1643 1         1 my $string = lc $seq->seq; # lower case
1644 1         2 $string =~
1645             s/u/t/; # simplyfy our life; modules should deal with the change anyway
1646             # store the original sequence string
1647 1         1 my $oristring = $string;
1648 1         2 my $length = $seq->length;
1649              
1650             # stop evolving if the limit has been reached
1651 1         3 until ( $self->_get_similarity( $oristring, $string ) <= $sim ) {
1652              
1653             # find the location in the string to change
1654 4         80 my $loc = int( rand $length ) + 1;
1655              
1656             # nucleotide to change
1657 4         8 my $oldnuc = substr $string, $loc - 1, 1;
1658 4         2 my $newnuc;
1659              
1660             # nucleotide it is changed to
1661 4         3 my $choose = rand(100);
1662 4 100       7 if ( $choose < $transition ) {
    50          
1663 3         6 $newnuc = $changes{$oldnuc}[0];
1664             }
1665             elsif ( $choose < $first_transversion ) {
1666 1         3 $newnuc = $changes{$oldnuc}[1];
1667             }
1668             else {
1669 0         0 $newnuc = $changes{$oldnuc}[2];
1670             }
1671              
1672             # do the change
1673 4         5 substr $string, $loc - 1, 1, $newnuc;
1674              
1675 4         15 $self->debug("$loc$oldnuc>$newnuc\n");
1676             }
1677              
1678 1         3 return new Bio::PrimarySeq(
1679             -id => $seq->id . "-$sim",
1680             -description => $seq->description,
1681             -seq => $string
1682             );
1683             }
1684              
1685             sub _get_similarity {
1686 5     5   6 my ( $self, $oriseq, $seq ) = @_;
1687              
1688 5         3 my $len = length($oriseq);
1689 5         5 my $c;
1690              
1691 5         8 for ( my $i = 0 ; $i < $len ; $i++ ) {
1692 50 100       88 $c++ if substr( $oriseq, $i, 1 ) eq substr( $seq, $i, 1 );
1693             }
1694 5         26 return 100 * $c / $len;
1695             }
1696              
1697             1;