|  line  | 
 stmt  | 
 bran  | 
 cond  | 
 sub  | 
 pod  | 
 time  | 
 code  | 
| 
1
 | 
  
 
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #  | 
| 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # BioPerl module for Bio::Seq::EncodedSeq  | 
| 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #  | 
| 
4
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # Please direct questions and support issues to    | 
| 
5
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #  | 
| 
6
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # Cared for by Aaron Mackey  | 
| 
7
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #  | 
| 
8
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # Copyright Aaron Mackey  | 
| 
9
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 #  | 
| 
10
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # You may distribute this module under the same terms as perl itself  | 
| 
11
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
12
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # POD documentation - main docs before the code  | 
| 
13
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
14
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =head1 NAME  | 
| 
15
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
16
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Bio::Seq::EncodedSeq - subtype of L to store DNA that encodes a protein  | 
| 
17
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
18
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =head1 SYNOPSIS  | 
| 
19
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
20
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   $obj = Bio::Seq::EncodedSeq->new( -seq      => $dna,  | 
| 
21
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                                     -encoding => "CCCCCCCIIIIICCCCC",  | 
| 
22
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                                     -start    => 1,  | 
| 
23
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                                     -strand   => 1,  | 
| 
24
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                                     -length   => 17 );  | 
| 
25
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
26
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # splice out (and possibly revcomp) the coding sequence  | 
| 
27
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   $cds = obj->cds;  | 
| 
28
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
29
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # obtain the protein translation of the sequence  | 
| 
30
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   $prot = $obj->translate;  | 
| 
31
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
32
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # other access/inspection routines as with Bio::LocatableSeq and  | 
| 
33
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # Bio::SeqI; note that coordinates are relative only to the DNA  | 
| 
34
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # sequence, not it's implicit encoded protein sequence.  | 
| 
35
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
36
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =head1 DESCRIPTION  | 
| 
37
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
38
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Bio::Seq::EncodedSeq is a L  | 
| 
39
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 object that holds a DNA sequence as well as information about the  | 
| 
40
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 coding potential of that DNA sequence.  It is meant to be useful in an  | 
| 
41
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 alignment context, where the DNA may contain frameshifts, gaps and/or  | 
| 
42
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 introns, or in describing the transcript of a gene.  An EncodedSeq  | 
| 
43
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 provides the ability to access the "spliced" coding sequence, meaning  | 
| 
44
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 that all introns and gaps are removed, and any frameshifts are  | 
| 
45
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 adjusted to provide a "clean" CDS.  | 
| 
46
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
47
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 In order to make simultaneous use of either the DNA or the implicit  | 
| 
48
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 encoded protein sequence coordinates, please see  | 
| 
49
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 L.  | 
| 
50
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
51
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =head1 ENCODING  | 
| 
52
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
53
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 We use the term "encoding" here to refer to the series of symbols that  | 
| 
54
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 we use to identify which residues of a DNA sequence are protein-coding  | 
| 
55
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 (i.e. part of a codon), intronic, part of a 5' or 3', frameshift  | 
| 
56
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 "mutations", etc.  From this information, a Bio::Seq::EncodedSeq is  | 
| 
57
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 able to "figure out" its translational CDS.  There are two sets of  | 
| 
58
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 coding characters, one termed "implicit" and one termed "explicit".  | 
| 
59
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
60
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 The "implicit" encoding is a bit simpler than the "explicit" encoding:  | 
| 
61
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 'C' is used for any nucleotide that's part of a codon, 'U' for any  | 
| 
62
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 UTR, etc.  The full list is shown below:  | 
| 
63
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
64
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
  Code  Meaning  | 
| 
65
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
  ----  -------  | 
| 
66
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   C    coding  | 
| 
67
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   I    intronic  | 
| 
68
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   U    untranslated  | 
| 
69
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   G    gapped (for use in alignments)  | 
| 
70
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   F    a "forward", +1 frameshift  | 
| 
71
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   B    a "backward", -1 frameshift  | 
| 
72
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
73
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 The "explicit" encoding is just an expansion of the "implicit"  | 
| 
74
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 encoding, to denote phase:  | 
| 
75
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
76
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
  Code  Meaning  | 
| 
77
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
  ----  -------  | 
| 
78
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   C    coding, 1st codon position  | 
| 
79
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   D    coding, 2nd codon position  | 
| 
80
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   E    coding, 3rd codon position  | 
| 
81
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
82
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   I    intronic, phase 0 (relative to intron begin)  | 
| 
83
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   J    intronic, phase 1  | 
| 
84
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   K    intronic, phase 2  | 
| 
85
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
86
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   U    untranslated 3'UTR  | 
| 
87
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   V    untranslated 5'UTR  | 
| 
88
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
89
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   G    gapped (for use in alignments)  | 
| 
90
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   F    a "forward", +1 frameshift  | 
| 
91
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   B    a "backward", -1 frameshift  | 
| 
92
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
93
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Note that the explicit coding is meant to provide easy access to  | 
| 
94
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 position/phase specific nucleotides:  | 
| 
95
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
96
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   $obj = Bio::Seq::EncodedSeq->new(-seq => "ACAATCAGACTACG...",  | 
| 
97
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                                    -encoding => "CCCCCCIII..."  | 
| 
98
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                                   );  | 
| 
99
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
100
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # fetch arrays of nucleotides at each codon position:  | 
| 
101
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   my @pos1 = $obj->dnaseq(encoding => 'C', explicit => 1);  | 
| 
102
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   my @pos2 = $obj->dnaseq(encoding => 'D');  | 
| 
103
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   my @pos3 = $obj->dnaseq(encoding => 'E');  | 
| 
104
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
105
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # fetch arrays of "3-1" codon dinucleotides, useful for genomic  | 
| 
106
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   # signature analyses without compounding influences of codon bias:  | 
| 
107
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   my @pairs = $obj->dnaseq(encoding => 'EC');  | 
| 
108
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
109
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =head1 FEEDBACK  | 
| 
110
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
111
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =head2 Mailing Lists  | 
| 
112
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
113
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 User feedback is an integral part of the evolution of this and other  | 
| 
114
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Bioperl modules. Send your comments and suggestions preferably to one  | 
| 
115
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 of the Bioperl mailing lists.  Your participation is much appreciated.  | 
| 
116
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
117
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   bioperl-l@bioperl.org                  - General discussion  | 
| 
118
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   http://bioperl.org/wiki/Mailing_lists  - About the mailing lists  | 
| 
119
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
120
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =head2 Support   | 
| 
121
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
122
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Please direct usage questions or support issues to the mailing list:  | 
| 
123
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
124
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 I  | 
| 
125
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
126
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 rather than to the module maintainer directly. Many experienced and   | 
| 
127
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 reponsive experts will be able look at the problem and quickly   | 
| 
128
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 address it. Please include a thorough description of the problem   | 
| 
129
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 with code and data examples if at all possible.  | 
| 
130
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
131
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =head2 Reporting Bugs  | 
| 
132
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
133
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Report bugs to the Bioperl bug tracking system to help us keep track  | 
| 
134
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 the bugs and their resolution.  Bug reports can be submitted via the  | 
| 
135
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 web:  | 
| 
136
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
137
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
   https://github.com/bioperl/bioperl-live/issues  | 
| 
138
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
139
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =head1 AUTHOR - Aaron Mackey  | 
| 
140
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
141
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 Email amackey@virginia.edu  | 
| 
142
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
143
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =head1 APPENDIX  | 
| 
144
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
145
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 The rest of the documentation details each of the object  | 
| 
146
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 methods. Internal methods are usually preceded with a _  | 
| 
147
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
148
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =cut  | 
| 
149
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
150
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
151
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
152
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 package Bio::Seq::EncodedSeq;  | 
| 
153
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
154
 | 
1
 | 
 
 | 
 
 | 
  
1
  
 | 
 
 | 
477
 | 
 use strict;  | 
| 
 
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
1
 | 
    | 
| 
 
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
24
 | 
    | 
| 
155
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
156
 | 
1
 | 
 
 | 
 
 | 
  
1
  
 | 
 
 | 
3
 | 
 use base qw(Bio::LocatableSeq);  | 
| 
 
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
1
 | 
    | 
| 
 
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
284
 | 
    | 
| 
157
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
158
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
159
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =head2 new  | 
| 
160
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
161
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
  Title   : new  | 
| 
162
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
  Usage   : $obj = Bio::Seq::EncodedSeq->new(-seq      => "AGTACGTGTCATG",  | 
| 
163
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                                             -encoding => "CCCCCCFCCCCCC",  | 
| 
164
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                                             -id       => "myseq",  | 
| 
165
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                                             -start    => 1,  | 
| 
166
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                                             -end      => 13,  | 
| 
167
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                                             -strand   => 1  | 
| 
168
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                                       );  | 
| 
169
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
  Function: creates a new Bio::Seq::EncodedSeq object from a supplied DNA  | 
| 
170
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
            sequence  | 
| 
171
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
  Returns : a new Bio::Seq::EncodedSeq object  | 
| 
172
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
173
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
  Args    : seq      - primary nucleotide sequence used to encode the  | 
| 
174
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                       protein; note that any positions involved in a  | 
| 
175
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                       gap ('G') or backward frameshift ('B') should  | 
| 
176
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                       have one or more gap characters; if the encoding  | 
| 
177
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                       specifies G or B, but no (or not enough) gap  | 
| 
178
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                       characters exist, *they'll be added*; similary,  | 
| 
179
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                       if there are gap characters without a  | 
| 
180
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                       corresponding G or B encoding, G's will be  | 
| 
181
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                       inserted into the encoding.  This allows some  | 
| 
182
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                       flexibility in specifying your sequence and  | 
| 
183
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                       coding without having to calculate a lot of the  | 
| 
184
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                       encoding for yourself.  | 
| 
185
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
186
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
            encoding - a string of characters (see Encoding Table)  | 
| 
187
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                       describing backwards frameshifts implied by the  | 
| 
188
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                       encoding but not present in the sequence will be  | 
| 
189
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                       added (as '-'s) to the sequence.  If not  | 
| 
190
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                       supplied, it will be assumed that all positions  | 
| 
191
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                       are coding (C).  Encoding may include either  | 
| 
192
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                       implicit phase encoding characters (i.e. "CCC")  | 
| 
193
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                       and/or explicit encoding characters (i.e. "CDE").  | 
| 
194
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                       Additionally, prefixed numbers may be used to  | 
| 
195
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                       denote repetition (i.e. "27C3I28C").  | 
| 
196
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
197
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                       Alternatively, encoding may be a hashref  | 
| 
198
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                       datastructure, with encoding characters as keys  | 
| 
199
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                       and Bio::LocationI objects (or arrayrefs of  | 
| 
200
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                       Bio::LocationI objects) as values, e.g.:  | 
| 
201
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
202
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                       { C => [ Bio::Location::Simple->new(1,9),  | 
| 
203
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                                Bio::Location::Simple->new(11,13) ],  | 
| 
204
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                         F => Bio::Location::Simple->new(10,10),  | 
| 
205
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                       } # same as "CCCCCCCCCFCCC"  | 
| 
206
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
207
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                       Note that if the location ranges overlap, the  | 
| 
208
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                       behavior of the encoding will be undefined  | 
| 
209
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                       (well, it will be defined, but only according to  | 
| 
210
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                       the order in which the hash keys are read, which  | 
| 
211
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                       is basically undefined ... just don't do that).  | 
| 
212
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
213
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
            id, start, end, strand - as with Bio::LocatableSeq; note  | 
| 
214
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                       that the coordinates are relative to the  | 
| 
215
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                       encoding DNA sequence, not the implicit protein  | 
| 
216
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                       sequence.  If strand is reversed, then the  | 
| 
217
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                       encoding is assumed to be relative to the  | 
| 
218
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                       reverse strand as well.  | 
| 
219
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
220
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =cut  | 
| 
221
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
222
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub new {  | 
| 
223
 | 
4
 | 
 
 | 
 
 | 
  
4
  
 | 
  
1
  
 | 
135
 | 
     my ($self, @args) = @_;  | 
| 
224
 | 
4
 | 
 
 | 
 
 | 
 
 | 
 
 | 
16
 | 
     $self = $self->SUPER::new(@args, -alphabet => 'dna');  | 
| 
225
 | 
4
 | 
 
 | 
 
 | 
 
 | 
 
 | 
9
 | 
     my ($enc) = $self->_rearrange([qw(ENCODING)], @args);  | 
| 
226
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # set the real encoding:  | 
| 
227
 | 
4
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
8
 | 
     if ($enc) {  | 
| 
228
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
         $self->encoding($enc);  | 
| 
229
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     } else {  | 
| 
230
 | 
4
 | 
 
 | 
 
 | 
 
 | 
 
 | 
7
 | 
         $self->_recheck_encoding;  | 
| 
231
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
232
 | 
4
 | 
 
 | 
 
 | 
 
 | 
 
 | 
22
 | 
     return $self;  | 
| 
233
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
234
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
235
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
236
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =head2 encoding  | 
| 
237
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
238
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
  Title   : encoding  | 
| 
239
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
  Usage   : $obj->encoding("CCCCCC");  | 
| 
240
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
            $obj->encoding( -encoding => { I => $location } );  | 
| 
241
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
            $enc = $obj->encoding(-explicit => 1);  | 
| 
242
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
            $enc = $obj->encoding("CCCCCC", -explicit => 1);  | 
| 
243
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
            $enc = $obj->encoding(-location => $location,  | 
| 
244
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                                  -explicit => 1,  | 
| 
245
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                                  -absolute => 1 );  | 
| 
246
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
  Function: get/set the objects encoding, either globally or by location(s).  | 
| 
247
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
  Returns : the (possibly new) encoding string.  | 
| 
248
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
  Args    : encoding - see the encoding argument to the new() function.  | 
| 
249
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
250
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
            explicit - whether or not to return explicit phase  | 
| 
251
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                       information in the coding (i.e. "CCC" becomes  | 
| 
252
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                       "CDE", "III" becomes "IJK", etc); defaults to 0.  | 
| 
253
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
254
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
            location - optional; location to get/set the encoding.  | 
| 
255
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                       Defaults to the entire sequence.  | 
| 
256
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
257
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
            absolute - whether or not the locational elements (either  | 
| 
258
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                       in the encoding hashref or the location  | 
| 
259
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                       argument) are relative to the absolute start/end  | 
| 
260
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                       of the Bio::LocatableSeq, or to the internal,  | 
| 
261
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                       relative coordinate system (beginning at 1);  | 
| 
262
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                       defaults to 0 (i.e. relative)  | 
| 
263
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
264
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =cut  | 
| 
265
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
266
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub encoding {  | 
| 
267
 | 
8
 | 
 
 | 
 
 | 
  
8
  
 | 
  
1
  
 | 
13
 | 
     my ($self, @args) = @_;  | 
| 
268
 | 
8
 | 
 
 | 
 
 | 
 
 | 
 
 | 
24
 | 
     my ($enc, $loc, $exp, $abs) = $self->_rearrange([qw(ENCODING LOCATION EXPLICIT ABSOLUTE)], @args);  | 
| 
269
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
270
 | 
8
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
23
 | 
     if (!$enc) {  | 
| 
 
 | 
 
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
 
 | 
 
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
271
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         # do nothing; _recheck_encoding will fix for us, if necessary  | 
| 
272
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     } elsif (ref $enc eq 'HASH') {  | 
| 
273
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
         $self->throw( -class => 'Bio::Root::NotImplemented',  | 
| 
274
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                       -text  => "Hashref functionality not yet implemented;\nplease email me if you really need this.");  | 
| 
275
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         #TODO: finish all this  | 
| 
276
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
         while (my ($char, $locs) = each %$enc) {  | 
| 
277
 | 
  
0
  
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
0
 | 
             if (ref $locs eq 'ARRAY') {  | 
| 
 
 | 
 
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
278
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             } elsif (UNIVERSAL::isa($locs, "Bio::LocationI")) {  | 
| 
279
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             } else {  | 
| 
280
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
                 $self->throw("Only a scalar or a ref to a hash; not a ref to a @{{lc ref $enc}}");  | 
| 
 
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
    | 
| 
281
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             }  | 
| 
282
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         }  | 
| 
283
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     } elsif (! ref $enc) {  | 
| 
284
 | 
4
 | 
 
 | 
 
 | 
 
 | 
 
 | 
7
 | 
         $enc = uc $enc;  | 
| 
285
 | 
4
 | 
  
100
  
 | 
  
 66
  
 | 
 
 | 
 
 | 
19
 | 
         $exp = 1 if (!defined $exp && $enc =~ m/[DEJKV]/o);  | 
| 
286
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
287
 | 
4
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
7
 | 
         if ($enc =~ m/\d/o) { # numerically "enhanced" encoding  | 
| 
288
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
2
 | 
             my $numenc = $enc;  | 
| 
289
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
2
 | 
             $enc = '';  | 
| 
290
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
5
 | 
             while ($numenc =~ m/\G(\d*)([CDEIJKUVGFB])/g) {  | 
| 
291
 | 
4
 | 
 
 | 
 
 | 
 
 | 
 
 | 
7
 | 
                 my ($num, $char) = ($1, $2);  | 
| 
292
 | 
4
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
7
 | 
                 $num = 1 unless $num;  | 
| 
293
 | 
4
 | 
 
 | 
 
 | 
 
 | 
 
 | 
10
 | 
                 $enc .= $char x $num;  | 
| 
294
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             }  | 
| 
295
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         }  | 
| 
296
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
297
 | 
4
 | 
  
 50
  
 | 
  
 66
  
 | 
 
 | 
 
 | 
21
 | 
         if (defined $exp && $exp == 0 && $enc =~ m/([^CIUGFB])/) {  | 
| 
 
 | 
 
 | 
  
 50
  
 | 
  
 33
  
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
298
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
             $self->throw("Unrecognized character '$1' in implicit encoding");  | 
| 
299
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         } elsif ($enc =~ m/[^CDEIJKUVGFB]/) {  | 
| 
300
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
             $self->throw("Unrecognized character '$1' in explicit encoding");  | 
| 
301
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         }  | 
| 
302
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
303
 | 
4
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
5
 | 
         if ($loc) { # a global location, over which to apply the specified encoding.  | 
| 
304
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
305
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             # balk if too many non-gap characters present in encoding:  | 
| 
306
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
3
 | 
             my ($ct) = $enc =~ tr/GB/GB/;  | 
| 
307
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
2
 | 
             $ct = length($enc) - $ct;  | 
| 
308
 | 
1
 | 
  
 50
  
 | 
  
 33
  
 | 
 
 | 
 
 | 
3
 | 
             $self->throw("Location length doesn't match number of coding chars in encoding!")  | 
| 
309
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                 if ($loc->location_type eq 'EXACT' &&  $loc->length != $ct);  | 
| 
310
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
311
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
2
 | 
             my $start = $loc->start;  | 
| 
312
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
13
 | 
             my $end = $loc->end;  | 
| 
313
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
314
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             # strip any encoding that hangs off the ends of the sequence:  | 
| 
315
 | 
1
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
3
 | 
             if ($start < $self->start) {  | 
| 
316
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
                 my $diff = $self->start - $start;  | 
| 
317
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
                 $start = $self->start;  | 
| 
318
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
                 $enc = substr($enc, $diff);  | 
| 
319
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             }  | 
| 
320
 | 
1
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
3
 | 
             if ($end > $self->end) {  | 
| 
321
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
                 my $diff = $end - $self->end;  | 
| 
322
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
                 $end = $self->end;  | 
| 
323
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
                 $enc = substr($enc, -$diff);  | 
| 
324
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             }  | 
| 
325
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
326
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
1
 | 
             my $currenc = $self->{_encoding};  | 
| 
327
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
3
 | 
             my $currseq = $self->seq;  | 
| 
328
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
329
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
2
 | 
             my ($spanstart, $spanend) = ($self->column_from_residue_number($start),  | 
| 
330
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                                          $self->column_from_residue_number($end) );  | 
| 
331
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
332
 | 
1
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
4
 | 
             if ($currseq) {  | 
| 
333
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                 # strip any gaps in sequence spanned by this location:  | 
| 
334
 | 
1
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
2
 | 
                 ($spanstart, $spanend) = ($spanend, $spanstart) if $self->strand < 0;  | 
| 
335
 | 
1
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
2
 | 
                 my ($before, $in, $after) = $currseq =~ m/(.{@{[ $spanstart - ($loc->location_type eq 'IN-BETWEEN' ? 0 : 1) ]}})  | 
| 
 
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
2
 | 
    | 
| 
336
 | 
1
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
3
 | 
                                                           (.{@{[ $spanend - $spanstart + ($loc->location_type eq 'IN-BETWEEN' ? -1 : 1) ]}})  | 
| 
337
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                                                           (.*)  | 
| 
338
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                                                          /x;  | 
| 
339
 | 
1
 | 
 
 | 
  
 50
  
 | 
 
 | 
 
 | 
5
 | 
                 $in ||= '';  | 
| 
340
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
1
 | 
                 $in =~ s/[\.\-]+//g;  | 
| 
341
 | 
1
 | 
 
 | 
  
 50
  
 | 
 
 | 
 
 | 
4
 | 
                 $currseq = ($before||'') . $in. ($after||'');  | 
| 
 
 | 
 
 | 
 
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
342
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                 # change seq without changing the alphabet  | 
| 
343
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
3
 | 
                 $self->seq($currseq,$self->alphabet());  | 
| 
344
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             }  | 
| 
345
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
346
 | 
1
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
3
 | 
             $currenc = reverse $currenc if $self->strand < 0;  | 
| 
347
 | 
1
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
3
 | 
             substr($currenc, $spanstart, $spanend - $spanstart + ($loc->location_type eq 'IN-BETWEEN' ? -1 : 1),  | 
| 
 
 | 
 
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
348
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                    $self->strand >= 0 ? $enc : reverse $enc);  | 
| 
349
 | 
1
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
2
 | 
             $currenc = reverse $currenc if $self->strand < 0;  | 
| 
350
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
351
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
2
 | 
             $self->{_encoding} = $currenc;  | 
| 
352
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
2
 | 
             $self->_recheck_encoding;  | 
| 
353
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
354
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
2
 | 
             $currenc = $self->{_encoding};  | 
| 
355
 | 
1
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
2
 | 
             $currenc = reverse $currenc if $self->strand < 0;  | 
| 
356
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
2
 | 
             $enc = substr($currenc, $spanstart, length $enc);  | 
| 
357
 | 
1
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
2
 | 
             $enc = reverse $enc if $self->strand < 0;  | 
| 
358
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
359
 | 
1
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
3
 | 
             return $exp ? $enc: $self->_convert2implicit($enc);  | 
| 
360
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
361
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         } else {  | 
| 
362
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             # presume a global redefinition; strip any current gap  | 
| 
363
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             # characters in the sequence so they don't corrupt the  | 
| 
364
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             # encoding  | 
| 
365
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
7
 | 
             my $dna = $self->seq;  | 
| 
366
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
8
 | 
             $dna =~ s/[\.\-]//g;  | 
| 
367
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
5
 | 
             $self->seq($dna, 'dna');  | 
| 
368
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
2
 | 
             $self->{_encoding} = $enc;  | 
| 
369
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         }  | 
| 
370
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     } else {  | 
| 
371
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
         $self->throw("Only a scalar or a ref to a hash; not a ref to a @{{lc ref $enc}}");  | 
| 
 
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
    | 
| 
372
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
373
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
374
 | 
7
 | 
 
 | 
 
 | 
 
 | 
 
 | 
12
 | 
     $self->_recheck_encoding();  | 
| 
375
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
376
 | 
7
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
15
 | 
     return $exp ? $self->{_encoding} : $self->_convert2implicit($self->{_encoding});  | 
| 
377
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
378
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
379
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
380
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub _convert2implicit {  | 
| 
381
 | 
8
 | 
 
 | 
 
 | 
  
8
  
 | 
 
 | 
9
 | 
     my ($self, $enc) = @_;  | 
| 
382
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
383
 | 
8
 | 
 
 | 
 
 | 
 
 | 
 
 | 
23
 | 
     $enc =~ s/[DE]/C/g;  | 
| 
384
 | 
8
 | 
 
 | 
 
 | 
 
 | 
 
 | 
9
 | 
     $enc =~ s/[JK]/I/g;  | 
| 
385
 | 
8
 | 
 
 | 
 
 | 
 
 | 
 
 | 
9
 | 
     $enc =~ s/V/U/g;  | 
| 
386
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
387
 | 
8
 | 
 
 | 
 
 | 
 
 | 
 
 | 
25
 | 
     return $enc;  | 
| 
388
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
389
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
390
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
391
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub _recheck_encoding {  | 
| 
392
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
393
 | 
12
 | 
 
 | 
 
 | 
  
12
  
 | 
 
 | 
13
 | 
     my $self = shift;  | 
| 
394
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
395
 | 
12
 | 
 
 | 
  
100
  
 | 
 
 | 
 
 | 
40
 | 
     my @enc = split //, ($self->{_encoding} || '');  | 
| 
396
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
397
 | 
12
 | 
 
 | 
 
 | 
 
 | 
 
 | 
24
 | 
     my @nt = split(//, $self->SUPER::seq);  | 
| 
398
 | 
12
 | 
  
100
  
 | 
  
 66
  
 | 
 
 | 
 
 | 
22
 | 
     @nt = reverse @nt if $self->strand && $self->strand < 0;  | 
| 
399
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
400
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # make sure an encoding exists!  | 
| 
401
 | 
12
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
23
 | 
     @enc = ('C') x scalar grep { !/[\.\-]/o } @nt  | 
| 
 
 | 
31
 | 
 
 | 
 
 | 
 
 | 
 
 | 
37
 | 
    | 
| 
402
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         unless @enc;  | 
| 
403
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
404
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # check for gaps to be truly present in the sequence  | 
| 
405
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # and vice versa  | 
| 
406
 | 
12
 | 
 
 | 
 
 | 
 
 | 
 
 | 
10
 | 
     my $i;  | 
| 
407
 | 
12
 | 
 
 | 
  
100
  
 | 
 
 | 
 
 | 
39
 | 
     for ($i = 0 ; $i < @nt && $i < @enc ; $i++) {  | 
| 
408
 | 
101
 | 
  
100
  
 | 
  
100
  
 | 
 
 | 
 
 | 
541
 | 
         if ($nt[$i] =~ /[\.\-]/o && $enc[$i] !~ m/[GB]/o) {  | 
| 
 
 | 
 
 | 
  
100
  
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
409
 | 
5
 | 
 
 | 
 
 | 
 
 | 
 
 | 
18
 | 
             splice(@enc, $i, 0, 'G');  | 
| 
410
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         } elsif ($nt[$i] !~ /[\.\-]/o && $enc[$i] =~ m/[GB]/o) {  | 
| 
411
 | 
10
 | 
 
 | 
 
 | 
 
 | 
 
 | 
32
 | 
             splice(@nt, $i, 0, '-');  | 
| 
412
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         }  | 
| 
413
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
414
 | 
12
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
26
 | 
     if ($i < @enc) {  | 
| 
 
 | 
 
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
415
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         # extra encoding; presumably all gaps?  | 
| 
416
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
         for (  ; $i < @enc ; $i++) {  | 
| 
417
 | 
  
0
  
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
0
 | 
             if ($enc[$i] =~ m/[GB]/o) {  | 
| 
418
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
                 push @nt, '-';  | 
| 
419
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             } else {  | 
| 
420
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
                 $self->throw("Extraneous encoding info: " . join('', @enc[$i..$#enc]));  | 
| 
421
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             }  | 
| 
422
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         }  | 
| 
423
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     } elsif ($i < @nt) {  | 
| 
424
 | 
3
 | 
 
 | 
 
 | 
 
 | 
 
 | 
5
 | 
         for (  ; $i < @nt ; $i++) {  | 
| 
425
 | 
8
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
11
 | 
             if ($nt[$i] =~ m/[\.\-]/o) {  | 
| 
426
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
6
 | 
                 push @enc, 'G';  | 
| 
427
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             } else {  | 
| 
428
 | 
6
 | 
 
 | 
 
 | 
 
 | 
 
 | 
10
 | 
                 push @enc, 'C';  | 
| 
429
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             }  | 
| 
430
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         }  | 
| 
431
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
432
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
433
 | 
12
 | 
 
 | 
 
 | 
 
 | 
 
 | 
16
 | 
     my @cde_array = qw(C D E);  | 
| 
434
 | 
12
 | 
 
 | 
 
 | 
 
 | 
 
 | 
13
 | 
     my @ijk_array = qw(I J K);  | 
| 
435
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     # convert any leftover implicit coding into explicit coding  | 
| 
436
 | 
12
 | 
 
 | 
 
 | 
 
 | 
 
 | 
13
 | 
     my ($Cct, $Ict, $Uct, $Vct, $Vwarned) = (0, 0, 0, 0);  | 
| 
437
 | 
12
 | 
 
 | 
 
 | 
 
 | 
 
 | 
20
 | 
     for ($i = 0 ; $i < @enc ; $i++) {  | 
| 
438
 | 
109
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
225
 | 
         if ($enc[$i] =~ m/[CDE]/o) {  | 
| 
 
 | 
 
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
 
 | 
 
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
 
 | 
 
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
 
 | 
 
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
439
 | 
68
 | 
 
 | 
 
 | 
 
 | 
 
 | 
53
 | 
             my  $temp_index = $Cct %3;  | 
| 
440
 | 
68
 | 
 
 | 
 
 | 
 
 | 
 
 | 
34
 | 
             $enc[$i] = $cde_array[$temp_index];  | 
| 
441
 | 
68
 | 
 
 | 
 
 | 
 
 | 
 
 | 
47
 | 
             $Cct++; $Ict = 0; $Uct = 1;  | 
| 
 
 | 
68
 | 
 
 | 
 
 | 
 
 | 
 
 | 
33
 | 
    | 
| 
 
 | 
68
 | 
 
 | 
 
 | 
 
 | 
 
 | 
44
 | 
    | 
| 
442
 | 
68
 | 
  
 50
  
 | 
  
 33
  
 | 
 
 | 
 
 | 
149
 | 
             $self->warn("3' untranslated encoding (V) seen prior to other coding symbols")  | 
| 
443
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                 if ($Vct && !$Vwarned++);  | 
| 
444
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         } elsif ($enc[$i] =~ m/[IJK]/o) {  | 
| 
445
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
             $enc[$i] = $ijk_array[$Ict % 3];  | 
| 
446
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
             $Ict++; $Uct = 1;  | 
| 
 
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
    | 
| 
447
 | 
  
0
  
 | 
  
  0
  
 | 
  
  0
  
 | 
 
 | 
 
 | 
0
 | 
             $self->warn("3' untranslated encoding (V) seen before other coding symbols")  | 
| 
448
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                 if ($Vct && !$Vwarned++);  | 
| 
449
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         } elsif ($enc[$i] =~ m/[UV]/o) {  | 
| 
450
 | 
  
0
  
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
0
 | 
             if ($Uct == 1) {  | 
| 
451
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
                 $enc[$i] = 'V';  | 
| 
452
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
                 $Vct = 1;  | 
| 
453
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             }  | 
| 
454
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         } elsif ($enc[$i] eq 'B') {  | 
| 
455
 | 
4
 | 
 
 | 
 
 | 
 
 | 
 
 | 
3
 | 
             $Cct++; $Ict++  | 
| 
 
 | 
4
 | 
 
 | 
 
 | 
 
 | 
 
 | 
6
 | 
    | 
| 
456
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         } elsif ($enc[$i] eq 'G') {  | 
| 
457
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             # gap; leave alone  | 
| 
458
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         }  | 
| 
459
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
460
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
461
 | 
12
 | 
  
100
  
 | 
  
 66
  
 | 
 
 | 
 
 | 
17
 | 
     @nt = reverse @nt if $self->strand && $self->strand < 0;  | 
| 
462
 | 
12
 | 
 
 | 
 
 | 
 
 | 
 
 | 
31
 | 
     $self->seq(join('', @nt), 'dna');  | 
| 
463
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
464
 | 
12
 | 
 
 | 
 
 | 
 
 | 
 
 | 
40
 | 
     $self->{_encoding} = join '', @enc;  | 
| 
465
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
466
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
467
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
468
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =head2 cds  | 
| 
469
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
470
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
  Title   : cds  | 
| 
471
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
  Usage   : $cds = $obj->cds(-nogaps => 1);  | 
| 
472
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
  Function: obtain the "spliced" DNA sequence, by removing any  | 
| 
473
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
            nucleotides that participate in an UTR, forward frameshift  | 
| 
474
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
            or intron, and replacing any unknown nucleotide implied by  | 
| 
475
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
            a backward frameshift or gap with N's.  | 
| 
476
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
  Returns : a Bio::Seq::EncodedSeq object, with an encoding consisting only  | 
| 
477
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
            of "CCCC..".  | 
| 
478
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
  Args    : nogaps - strip any gap characters (resulting from 'G' or 'B'  | 
| 
479
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
            encodings), rather than replacing them with N's.  | 
| 
480
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
481
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =cut  | 
| 
482
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
483
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub cds {  | 
| 
484
 | 
2
 | 
 
 | 
 
 | 
  
2
  
 | 
  
1
  
 | 
4
 | 
     my ($self, @args) = @_;  | 
| 
485
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
486
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
7
 | 
     my ($nogaps, $loc) = $self->_rearrange([qw(NOGAPS LOCATION)], @args);  | 
| 
487
 | 
2
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
6
 | 
     $nogaps = 0 unless defined $nogaps;  | 
| 
488
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
489
 | 
2
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
5
 | 
     my @nt  = split //, $self->strand < 0 ? $self->revcom->seq : $self->seq;  | 
| 
490
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
4
 | 
     my @enc = split //, $self->_convert2implicit($self->{_encoding});  | 
| 
491
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
492
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
2
 | 
     my ($start, $end) = (0, scalar @nt);  | 
| 
493
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
494
 | 
2
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
5
 | 
     if ($loc) {  | 
| 
495
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
         $start = $loc->start;  | 
| 
496
 | 
  
0
  
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
0
 | 
         $start++ if $loc->location_type eq 'IN-BETWEEN';  | 
| 
497
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
         $start = $self->column_from_residue_number($start);  | 
| 
498
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
         $start--;  | 
| 
499
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
500
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
         $end = $loc->end;  | 
| 
501
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
         $end = $self->column_from_residue_number($end);  | 
| 
502
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
503
 | 
  
0
  
 | 
  
  0
  
 | 
 
 | 
 
 | 
 
 | 
0
 | 
         ($start, $end) = ($end, $start) if $self->strand < 0;  | 
| 
504
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
         $start--;  | 
| 
505
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
506
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
507
 | 
2
 | 
 
 | 
 
 | 
 
 | 
 
 | 
5
 | 
     for (my $i = $start ; $i < $end ; $i++) {  | 
| 
508
 | 
20
 | 
  
 50
  
 | 
  
 33
  
 | 
 
 | 
 
 | 
112
 | 
         if ($enc[$i] eq 'I' || $enc[$i] eq 'U' || $enc[$i] eq 'F') {  | 
| 
 
 | 
 
 | 
  
100
  
 | 
  
 33
  
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
 
 | 
 
 | 
 
 | 
  
100
  
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
509
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             # remove introns, untranslated and forward frameshift nucleotides  | 
| 
510
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
             $nt[$i] = undef;  | 
| 
511
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         } elsif ($enc[$i] eq 'G' || $enc[$i] eq 'B') {  | 
| 
512
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
             # replace gaps and backward frameshifts with N's, unless asked not to.  | 
| 
513
 | 
8
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
15
 | 
             $nt[$i] = $nogaps ? undef : 'N';  | 
| 
514
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         }  | 
| 
515
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     }  | 
| 
516
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
517
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
     return ($self->can_call_new ? ref($self) : __PACKAGE__)->new(  | 
| 
518
 | 
2
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
7
 | 
         -seq      => join('', grep { defined } @nt[$start..--$end]),  | 
| 
 
 | 
20
 | 
 
 | 
 
 | 
 
 | 
 
 | 
22
 | 
    | 
| 
519
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         -start    => $self->start,  | 
| 
520
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         -end      => $self->end,  | 
| 
521
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         -strand   => 1,  | 
| 
522
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
         -alphabet => 'dna' );  | 
| 
523
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
524
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
525
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
526
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =head2 translate  | 
| 
527
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
528
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
  Title   : translate  | 
| 
529
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
  Usage   : $prot = $obj->translate(@args);  | 
| 
530
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
  Function: obtain the protein sequence encoded by the underlying DNA  | 
| 
531
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
            sequence; same as $obj->cds()->translate(@args).  | 
| 
532
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
  Returns : a Bio::PrimarySeq object.  | 
| 
533
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
  Args    : same as the translate() function of Bio::PrimarySeqI  | 
| 
534
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
535
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =cut  | 
| 
536
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
537
 | 
1
 | 
 
 | 
 
 | 
  
1
  
 | 
  
1
  
 | 
4
 | 
 sub translate { shift->cds(-nogaps => 1, @_)->SUPER::translate(@_) };  | 
| 
538
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
539
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
540
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =head2 protseq  | 
| 
541
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
542
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
  Title   : seq  | 
| 
543
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
  Usage   : $protseq = $obj->protseq();  | 
| 
544
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
  Function: obtain the raw protein sequence encoded by the underlying  | 
| 
545
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
            DNA sequence; This is the same as calling  | 
| 
546
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
            $obj->translate()->seq();  | 
| 
547
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
  Returns : a string of single-letter amino acid codes  | 
| 
548
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
  Args :    same as the seq() function of Bio::PrimarySeq; note that this  | 
| 
549
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
            function may not be used to set the protein sequence; see  | 
| 
550
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
            the dnaseq() function for that.  | 
| 
551
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
552
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =cut  | 
| 
553
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
554
 | 
  
0
  
 | 
 
 | 
 
 | 
  
0
  
 | 
  
1
  
 | 
0
 | 
 sub protseq { shift->cds(-nogaps => 1, @_)->SUPER::translate(@_)->seq };  | 
| 
555
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
556
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
557
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =head2 dnaseq  | 
| 
558
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
559
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
  Title   : dnaseq  | 
| 
560
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
  Usage   : $dnaseq = $obj->dnaseq();  | 
| 
561
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
            $obj->dnaseq("ACGTGTCGT", "CCCCCCCCC");  | 
| 
562
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
            $obj->dnaseq(-seq      => "ATG",  | 
| 
563
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                         -encoding => "CCC",  | 
| 
564
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                         -location => $loc );  | 
| 
565
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
            @introns = $obj->$dnaseq(-encoding => 'I')  | 
| 
566
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
  Function: get/set the underlying DNA sequence; will overwrite any  | 
| 
567
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
            current DNA and/or encoding information present.  | 
| 
568
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
  Returns : a string of single-letter nucleotide codes, including any  | 
| 
569
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
            gaps implied by the encoding.  | 
| 
570
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
  Args    : seq      - the DNA sequence to be used as a replacement  | 
| 
571
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
            encoding - the encoding of the DNA sequence (see the new()  | 
| 
572
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                       constructor); defaults to all 'C' if setting a  | 
| 
573
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                       new DNA sequence.  If no new DNA sequence is  | 
| 
574
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                       being provided, then the encoding is used as a  | 
| 
575
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                       "filter" for which to return fragments of  | 
| 
576
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                       non-overlapping DNA that match the encoding.  | 
| 
577
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
            location - optional, the location of the DNA sequence to  | 
| 
578
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
                       get/set; defaults to the entire sequence.  | 
| 
579
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
580
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 =cut  | 
| 
581
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
582
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub dnaseq {  | 
| 
583
 | 
  
0
  
 | 
 
 | 
 
 | 
  
0
  
 | 
  
1
  
 | 
0
 | 
     my ($self, @args) = @_;  | 
| 
584
 | 
  
0
  
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
     my ($seq, $enc, $loc) = $self->_rearrange([qw(DNASEQ ENCODING LOCATION)], @args);  | 
| 
585
 | 
0
 | 
 
 | 
 
 | 
 
 | 
 
 | 
0
 | 
     return $self;  | 
| 
586
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
587
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
588
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
589
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 # need to overload this so that we truncate both the seq and the encoding!  | 
| 
590
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 sub trunc {  | 
| 
591
 | 
1
 | 
 
 | 
 
 | 
  
1
  
 | 
  
1
  
 | 
1
 | 
     my ($self, $start, $end) = @_;  | 
| 
592
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
6
 | 
     my $new = $self->SUPER::trunc($start, $end);  | 
| 
593
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
2
 | 
     $start--;  | 
| 
594
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
1
 | 
     my $enc = $self->{_encoding};  | 
| 
595
 | 
1
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
2
 | 
     $enc = reverse $enc if $self->strand < 0;  | 
| 
596
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
2
 | 
     $enc = substr($enc, $start, $end - $start);  | 
| 
597
 | 
1
 | 
  
 50
  
 | 
 
 | 
 
 | 
 
 | 
2
 | 
     $enc = reverse $enc if $self->strand < 0;  | 
| 
598
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
2
 | 
     $new->encoding($enc);  | 
| 
599
 | 
1
 | 
 
 | 
 
 | 
 
 | 
 
 | 
4
 | 
     return $new;  | 
| 
600
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 }  | 
| 
601
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
    | 
| 
602
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 
 | 
 1;  |