line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Bio::Gonzales::Seq; |
2
|
|
|
|
|
|
|
|
3
|
9
|
|
|
9
|
|
118266
|
use v5.10; |
|
9
|
|
|
|
|
58
|
|
4
|
|
|
|
|
|
|
|
5
|
9
|
|
|
9
|
|
615
|
use Mouse; |
|
9
|
|
|
|
|
31017
|
|
|
9
|
|
|
|
|
52
|
|
6
|
|
|
|
|
|
|
|
7
|
9
|
|
|
9
|
|
11528
|
use overload '""' => 'all'; |
|
9
|
|
|
|
|
6267
|
|
|
9
|
|
|
|
|
64
|
|
8
|
9
|
|
|
9
|
|
704
|
use Carp; |
|
9
|
|
|
|
|
21
|
|
|
9
|
|
|
|
|
587
|
|
9
|
9
|
|
|
9
|
|
716
|
use Data::Dumper; |
|
9
|
|
|
|
|
7239
|
|
|
9
|
|
|
|
|
551
|
|
10
|
9
|
|
|
9
|
|
4727
|
use Digest::SHA1 qw/sha1_hex/; |
|
9
|
|
|
|
|
7147
|
|
|
9
|
|
|
|
|
560
|
|
11
|
9
|
|
|
9
|
|
81
|
use Digest::MD5 qw/md5_hex/; |
|
9
|
|
|
|
|
33
|
|
|
9
|
|
|
|
|
460
|
|
12
|
9
|
|
|
9
|
|
4732
|
use Digest::SHA2; |
|
9
|
|
|
|
|
7121
|
|
|
9
|
|
|
|
|
23544
|
|
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
our $WIDTH = 80; |
15
|
|
|
|
|
|
|
our $VERSION = '0.083'; # VERSION |
16
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
has id => ( is => 'rw', required => 1 ); |
18
|
|
|
|
|
|
|
has desc => ( is => 'rw', default => '' ); |
19
|
|
|
|
|
|
|
has seq => ( is => 'rw', required => 1 ); |
20
|
|
|
|
|
|
|
has delim => ( is => 'rw', default => " " ); |
21
|
|
|
|
|
|
|
has info => ( is => 'rw', default => sub { {} } ); |
22
|
|
|
|
|
|
|
has gaps => ( is => 'rw', lazy_build => 1 ); |
23
|
|
|
|
|
|
|
has length => ( is => 'rw', lazy_build => 1 ); |
24
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
sub _build_gaps { |
26
|
0
|
|
|
0
|
|
0
|
my $gaps = ( shift->seq() =~ tr/-.// ); |
27
|
0
|
|
|
|
|
0
|
return $gaps; |
28
|
|
|
|
|
|
|
} |
29
|
|
|
|
|
|
|
|
30
|
1
|
|
|
1
|
0
|
9
|
sub ungapped_seq { return shift->gapless_seq(@_); } |
31
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
sub _build_length { |
33
|
5
|
|
|
5
|
|
20
|
return CORE::length( shift->seq() ); |
34
|
|
|
|
|
|
|
} |
35
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
sub BUILDARGS { |
37
|
61
|
|
|
61
|
1
|
735
|
my $class = shift; |
38
|
|
|
|
|
|
|
|
39
|
61
|
|
|
|
|
94
|
my %a; |
40
|
61
|
50
|
|
|
|
129
|
if ( scalar @_ == 1 ) { |
41
|
0
|
0
|
|
|
|
0
|
( ref( $_[0] ) eq 'HASH' ) |
42
|
|
|
|
|
|
|
|| $class->meta->throw_error("Single parameters to new() must be a HASH ref"); |
43
|
0
|
|
|
|
|
0
|
%a = %{ $_[0] }; |
|
0
|
|
|
|
|
0
|
|
44
|
|
|
|
|
|
|
} else { |
45
|
61
|
|
|
|
|
219
|
%a = @_; |
46
|
|
|
|
|
|
|
} |
47
|
|
|
|
|
|
|
|
48
|
|
|
|
|
|
|
delete $a{delim} |
49
|
61
|
100
|
100
|
|
|
248
|
if ( exists( $a{delim} ) && !defined( $a{delim} ) ); |
50
|
|
|
|
|
|
|
delete $a{desc} |
51
|
61
|
100
|
100
|
|
|
190
|
if ( exists( $a{desc} ) && !defined( $a{desc} ) ); |
52
|
|
|
|
|
|
|
|
53
|
61
|
|
|
|
|
122
|
$a{seq} = _filter_seq( $a{seq} ); |
54
|
|
|
|
|
|
|
|
55
|
61
|
|
|
|
|
577
|
return \%a; |
56
|
|
|
|
|
|
|
} |
57
|
|
|
|
|
|
|
|
58
|
|
|
|
|
|
|
sub as_hash { |
59
|
0
|
|
|
0
|
0
|
0
|
my $self = shift; |
60
|
0
|
|
|
|
|
0
|
return { id => $self->id, desc => $self->desc, seq => $self->seq, info => $self->info }; |
61
|
|
|
|
|
|
|
} |
62
|
|
|
|
|
|
|
|
63
|
|
|
|
|
|
|
sub Format_seq_string { |
64
|
1
|
|
|
1
|
0
|
3
|
my ( $str, $width ) = @_; |
65
|
1
|
|
33
|
|
|
5
|
$width //= $WIDTH; |
66
|
|
|
|
|
|
|
|
67
|
1
|
50
|
33
|
|
|
8
|
if ( defined $str && length($str) > 0 ) { |
68
|
1
|
|
|
|
|
15
|
$str =~ tr/ \t\n\r//d; # Remove whitespace and numbers |
69
|
1
|
|
|
|
|
6
|
$str =~ s/\d+//g; |
70
|
1
|
|
|
|
|
35
|
$str =~ s/(.{1,$width})/$1\n/g; |
71
|
1
|
|
|
|
|
29
|
return $str; |
72
|
|
|
|
|
|
|
} |
73
|
|
|
|
|
|
|
} |
74
|
|
|
|
|
|
|
|
75
|
|
|
|
|
|
|
sub def { |
76
|
0
|
|
|
0
|
1
|
0
|
my ($self) = @_; |
77
|
|
|
|
|
|
|
|
78
|
0
|
|
|
|
|
0
|
return join( $self->delim, $self->id, $self->desc ); |
79
|
|
|
|
|
|
|
} |
80
|
|
|
|
|
|
|
|
81
|
|
|
|
|
|
|
before 'desc' => sub { |
82
|
|
|
|
|
|
|
my $self = shift; |
83
|
|
|
|
|
|
|
|
84
|
|
|
|
|
|
|
if ( @_ == 1 && $_[0] eq '' ) { |
85
|
|
|
|
|
|
|
$self->delim(''); |
86
|
|
|
|
|
|
|
} |
87
|
|
|
|
|
|
|
}; |
88
|
|
|
|
|
|
|
|
89
|
|
|
|
|
|
|
sub _filter_seq { |
90
|
63
|
|
|
63
|
|
112
|
my ($seq) = @_; |
91
|
|
|
|
|
|
|
|
92
|
63
|
100
|
|
|
|
235
|
$seq = join( "", @$seq ) |
93
|
|
|
|
|
|
|
if ( ref($seq) eq 'ARRAY' ); |
94
|
63
|
|
|
|
|
305
|
$seq =~ y/ \t\n\r\f//d; |
95
|
|
|
|
|
|
|
|
96
|
63
|
|
|
|
|
146
|
return $seq; |
97
|
|
|
|
|
|
|
} |
98
|
|
|
|
|
|
|
|
99
|
|
|
|
|
|
|
around 'seq' => sub { |
100
|
|
|
|
|
|
|
my $orig = shift; |
101
|
|
|
|
|
|
|
my $self = shift; |
102
|
|
|
|
|
|
|
|
103
|
|
|
|
|
|
|
return $self->$orig() |
104
|
|
|
|
|
|
|
unless @_; |
105
|
|
|
|
|
|
|
|
106
|
|
|
|
|
|
|
return $self->$orig( _filter_seq(@_) ); |
107
|
|
|
|
|
|
|
}; |
108
|
|
|
|
|
|
|
|
109
|
|
|
|
|
|
|
sub gapless_seq { |
110
|
1
|
|
|
1
|
1
|
3
|
( my $seq = shift->seq ) =~ tr/-.//d; |
111
|
1
|
|
|
|
|
5
|
return $seq; |
112
|
|
|
|
|
|
|
} |
113
|
|
|
|
|
|
|
|
114
|
|
|
|
|
|
|
sub rm_gaps { |
115
|
0
|
|
|
0
|
0
|
0
|
my ($self) = @_; |
116
|
|
|
|
|
|
|
|
117
|
0
|
|
|
|
|
0
|
$self->seq( $self->gapless_seq ); |
118
|
0
|
|
|
|
|
0
|
return $self; |
119
|
|
|
|
|
|
|
} |
120
|
|
|
|
|
|
|
|
121
|
|
|
|
|
|
|
sub clone { |
122
|
0
|
|
|
0
|
1
|
0
|
my ($self) = @_; |
123
|
|
|
|
|
|
|
|
124
|
0
|
|
|
|
|
0
|
return __PACKAGE__->new( |
125
|
|
|
|
|
|
|
id => $self->id, |
126
|
|
|
|
|
|
|
desc => $self->desc, |
127
|
|
|
|
|
|
|
seq => $self->seq, |
128
|
|
|
|
|
|
|
delim => $self->delim, |
129
|
|
|
|
|
|
|
info => $self->info |
130
|
|
|
|
|
|
|
); |
131
|
|
|
|
|
|
|
#shift->clone_object(@_) |
132
|
|
|
|
|
|
|
} |
133
|
|
|
|
|
|
|
|
134
|
|
|
|
|
|
|
sub clone_empty { |
135
|
0
|
|
|
0
|
1
|
0
|
my ($self) = @_; |
136
|
|
|
|
|
|
|
|
137
|
0
|
|
|
|
|
0
|
return __PACKAGE__->new( |
138
|
|
|
|
|
|
|
id => $self->id, |
139
|
|
|
|
|
|
|
desc => $self->desc, |
140
|
|
|
|
|
|
|
seq => '', |
141
|
|
|
|
|
|
|
delim => $self->delim, |
142
|
|
|
|
|
|
|
info => $self->info |
143
|
|
|
|
|
|
|
); |
144
|
|
|
|
|
|
|
} |
145
|
|
|
|
|
|
|
|
146
|
1
|
|
|
1
|
1
|
7
|
sub display_id { shift->id(@_) } |
147
|
|
|
|
|
|
|
|
148
|
|
|
|
|
|
|
sub ungapped_length { |
149
|
0
|
|
|
0
|
1
|
0
|
my ($self) = @_; |
150
|
|
|
|
|
|
|
|
151
|
0
|
|
|
|
|
0
|
return $self->length - $self->gaps; |
152
|
|
|
|
|
|
|
} |
153
|
|
|
|
|
|
|
|
154
|
0
|
|
|
0
|
0
|
0
|
sub sequence { shift->seq(@_) } |
155
|
|
|
|
|
|
|
|
156
|
|
|
|
|
|
|
sub stringify { |
157
|
220
|
|
|
220
|
0
|
327
|
my ($self) = @_; |
158
|
|
|
|
|
|
|
|
159
|
220
|
100
|
|
|
|
676
|
return ">" . $self->id . ( $self->desc ? $self->delim . $self->desc : "" ) . "\n" . $self->seq . "\n"; |
160
|
|
|
|
|
|
|
} |
161
|
|
|
|
|
|
|
|
162
|
220
|
|
|
220
|
1
|
459
|
sub all { shift->stringify(@_) } |
163
|
|
|
|
|
|
|
|
164
|
0
|
|
|
0
|
1
|
0
|
sub all_formatted { shift->stringify_pretty(@_) } |
165
|
|
|
|
|
|
|
|
166
|
|
|
|
|
|
|
sub stringify_pretty { |
167
|
1
|
|
|
1
|
0
|
4
|
my ( $self, $width ) = @_; |
168
|
1
|
|
33
|
|
|
6
|
$width //= $WIDTH; |
169
|
|
|
|
|
|
|
|
170
|
|
|
|
|
|
|
return |
171
|
1
|
50
|
|
|
|
11
|
">" |
172
|
|
|
|
|
|
|
. $self->id |
173
|
|
|
|
|
|
|
. ( $self->desc ? $self->delim . $self->desc : "" ) . "\n" |
174
|
|
|
|
|
|
|
. Format_seq_string( $self->seq ); |
175
|
|
|
|
|
|
|
} |
176
|
|
|
|
|
|
|
|
177
|
1
|
|
|
1
|
1
|
4
|
sub all_pretty { shift->stringify_pretty(@_) } |
178
|
|
|
|
|
|
|
|
179
|
0
|
|
|
0
|
0
|
0
|
sub pretty { shift->stringify_pretty(@_) } |
180
|
|
|
|
|
|
|
|
181
|
|
|
|
|
|
|
sub as_primaryseq { |
182
|
0
|
|
|
0
|
1
|
0
|
my ($self) = @_; |
183
|
|
|
|
|
|
|
|
184
|
0
|
|
|
|
|
0
|
return Bio::PrimarySeq->new( |
185
|
|
|
|
|
|
|
-seq => $self->seq, |
186
|
|
|
|
|
|
|
-id => $self->id, |
187
|
|
|
|
|
|
|
-desc => $self->desc, |
188
|
|
|
|
|
|
|
-alphabet => $self->guess_alphabet, |
189
|
|
|
|
|
|
|
-direct => 1, |
190
|
|
|
|
|
|
|
); |
191
|
|
|
|
|
|
|
} |
192
|
|
|
|
|
|
|
|
193
|
|
|
|
|
|
|
sub guess_alphabet { |
194
|
1
|
|
|
1
|
0
|
3
|
my ($self) = @_; |
195
|
|
|
|
|
|
|
|
196
|
1
|
|
|
|
|
3
|
my $str = $self->seq(); |
197
|
1
|
|
|
|
|
6
|
$str =~ s/[-.?*]//gi; |
198
|
|
|
|
|
|
|
|
199
|
1
|
|
|
|
|
2
|
my $alphabet; |
200
|
|
|
|
|
|
|
|
201
|
|
|
|
|
|
|
# Check for sequences without valid letters |
202
|
1
|
|
|
|
|
2
|
my $total = CORE::length($str); |
203
|
|
|
|
|
|
|
|
204
|
1
|
50
|
|
|
|
5
|
if ( $str =~ m/[EFIJLOPQXZ]/i ) { |
205
|
|
|
|
|
|
|
# Start with a safe method to find proteins. |
206
|
|
|
|
|
|
|
# Unambiguous IUPAC letters for proteins are: E,F,I,J,L,O,P,Q,X,Z |
207
|
0
|
|
|
|
|
0
|
$alphabet = 'protein'; |
208
|
|
|
|
|
|
|
} else { |
209
|
|
|
|
|
|
|
# Alphabet is unsure, could still be DNA, RNA or protein. |
210
|
|
|
|
|
|
|
# DNA and RNA contain mostly A, T, U, G, C and N, but the other letters |
211
|
|
|
|
|
|
|
# they use are also among the 15 valid letters that a protein sequence |
212
|
|
|
|
|
|
|
# can contain at this stage. Make our best guess based on sequence |
213
|
|
|
|
|
|
|
# composition. If it contains over 70% of ACGTUN, it is likely nucleic. |
214
|
1
|
50
|
|
|
|
8
|
if ( ( $str =~ tr/ATUGCNatugcn// ) / $total > 0.7 ) { |
215
|
1
|
50
|
|
|
|
4
|
if ( $str =~ m/U/i ) { |
216
|
0
|
|
|
|
|
0
|
$alphabet = 'rna'; |
217
|
|
|
|
|
|
|
} else { |
218
|
1
|
|
|
|
|
3
|
$alphabet = 'dna'; |
219
|
|
|
|
|
|
|
} |
220
|
|
|
|
|
|
|
} else { |
221
|
0
|
|
|
|
|
0
|
$alphabet = 'protein'; |
222
|
|
|
|
|
|
|
} |
223
|
|
|
|
|
|
|
} |
224
|
1
|
|
|
|
|
4
|
return $alphabet; |
225
|
|
|
|
|
|
|
} |
226
|
|
|
|
|
|
|
|
227
|
|
|
|
|
|
|
sub revcom { |
228
|
1
|
|
|
1
|
1
|
8
|
my ($self) = @_; |
229
|
|
|
|
|
|
|
|
230
|
1
|
|
|
|
|
4
|
$self->seq( _revcom_from_string( $self->seq, $self->guess_alphabet ) ); |
231
|
|
|
|
|
|
|
|
232
|
1
|
|
|
|
|
3
|
return $self; |
233
|
|
|
|
|
|
|
} |
234
|
|
|
|
|
|
|
|
235
|
|
|
|
|
|
|
sub revcom_seq { |
236
|
0
|
|
|
0
|
0
|
0
|
my $self = shift; |
237
|
0
|
|
|
|
|
0
|
return _revcom_from_string( $self->seq, $self->guess_alphabet ); |
238
|
|
|
|
|
|
|
} |
239
|
|
|
|
|
|
|
|
240
|
|
|
|
|
|
|
sub subseq { |
241
|
0
|
|
|
0
|
1
|
0
|
my ( $self, $range, $c ) = @_; |
242
|
|
|
|
|
|
|
|
243
|
0
|
0
|
0
|
|
|
0
|
$range = [ $range->start, $range->end, $range->strand ] |
244
|
|
|
|
|
|
|
if ( blessed($range) && $range->isa('Bio::Gonzales::Feat') ); |
245
|
0
|
|
|
|
|
0
|
my ( $seq, $corrected_range ) = $self->subseq_as_string( $range, $c ); |
246
|
0
|
|
|
|
|
0
|
my ( $b, $e, $strand, @rest ) = @$corrected_range; |
247
|
|
|
|
|
|
|
|
248
|
0
|
|
|
|
|
0
|
my $keep_original_id = $c->{keep_id}; |
249
|
|
|
|
|
|
|
|
250
|
0
|
|
|
|
|
0
|
my $new_seq = $self->clone_empty; |
251
|
0
|
|
|
|
|
0
|
$new_seq->seq($seq); |
252
|
|
|
|
|
|
|
|
253
|
0
|
0
|
|
|
|
0
|
if ( $c->{attach_details} ) { |
254
|
0
|
|
|
|
|
0
|
my $info = $new_seq->info; |
255
|
0
|
|
|
|
|
0
|
$info->{subseq} = { from => $b + 1, to => $e }; |
256
|
0
|
0
|
|
|
|
0
|
$info->{subseq}{strand} = ( $strand < 0 ? '-' : ( $strand > 0 ? '+' : '.' ) ); |
|
|
0
|
|
|
|
|
|
257
|
0
|
0
|
|
|
|
0
|
$info->{subseq}{rest} = \@rest if ( @rest > 0 ); |
258
|
|
|
|
|
|
|
} |
259
|
|
|
|
|
|
|
|
260
|
0
|
0
|
|
|
|
0
|
unless ($keep_original_id) { |
261
|
0
|
|
|
|
|
0
|
$new_seq->id( $new_seq->id . "|" . ( $b + 1 ) . "..$e" ); |
262
|
0
|
0
|
|
|
|
0
|
$new_seq->id( $new_seq->id . "|" . ( $strand < 0 ? '-' : ( $strand > 0 ? '+' : '.' ) ) ) |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
263
|
|
|
|
|
|
|
if ( defined($strand) ); #print also nothing if strand == 0 |
264
|
0
|
0
|
|
|
|
0
|
$new_seq->id( $new_seq->id . "|" . join( "|", @rest ) ) if ( @rest > 0 ); |
265
|
|
|
|
|
|
|
} |
266
|
0
|
|
|
|
|
0
|
return $new_seq; |
267
|
|
|
|
|
|
|
} |
268
|
|
|
|
|
|
|
|
269
|
|
|
|
|
|
|
sub sha1_checksum { |
270
|
0
|
|
|
0
|
0
|
0
|
return sha1_hex( shift->seq ); |
271
|
|
|
|
|
|
|
} |
272
|
|
|
|
|
|
|
|
273
|
|
|
|
|
|
|
sub md5_checksum { |
274
|
0
|
|
|
0
|
0
|
0
|
return md5_hex( shift->seq ); |
275
|
|
|
|
|
|
|
} |
276
|
|
|
|
|
|
|
|
277
|
|
|
|
|
|
|
sub sha512_checksum { |
278
|
0
|
|
|
0
|
0
|
0
|
my $self = shift; |
279
|
0
|
|
|
|
|
0
|
my $sha2 = Digest::SHA2->new(512); |
280
|
0
|
|
|
|
|
0
|
$sha2->add( $self->seq ); |
281
|
0
|
|
|
|
|
0
|
return $sha2->hexdigest; |
282
|
|
|
|
|
|
|
} |
283
|
|
|
|
|
|
|
|
284
|
|
|
|
|
|
|
sub subseq_as_string { |
285
|
0
|
|
|
0
|
0
|
0
|
my ( $self, $range, $c ) = @_; |
286
|
|
|
|
|
|
|
|
287
|
0
|
0
|
0
|
|
|
0
|
confess "you use the deprecated version of subseq" if ( defined($c) && ref $c ne 'HASH' ); |
288
|
|
|
|
|
|
|
|
289
|
0
|
|
|
|
|
0
|
my ( $b, $e, $strand, @rest ) = @$range; |
290
|
0
|
0
|
|
|
|
0
|
if ( $c->{relaxed_range} ) { |
291
|
|
|
|
|
|
|
#if b or e are not defined, just take the beginning and end as given |
292
|
|
|
|
|
|
|
#warn "requested invalid subseq range ($b,$e;$strand) from " . $self->id . ", using relaxed boundaries." |
293
|
|
|
|
|
|
|
#unless ( $b && $e ); |
294
|
0
|
|
0
|
|
|
0
|
$b ||= '^'; |
295
|
0
|
|
0
|
|
|
0
|
$e ||= '$'; |
296
|
|
|
|
|
|
|
} |
297
|
|
|
|
|
|
|
|
298
|
0
|
0
|
0
|
|
|
0
|
confess "requested invalied subseq range ($b,$e;$strand) from " |
299
|
|
|
|
|
|
|
. $self->id . "\n" |
300
|
|
|
|
|
|
|
. Dumper($range) |
301
|
|
|
|
|
|
|
. Dumper( $self->clone_empty ) |
302
|
|
|
|
|
|
|
unless ( $b && $e ); |
303
|
|
|
|
|
|
|
|
304
|
0
|
|
|
|
|
0
|
my $seq_len = $self->length; |
305
|
|
|
|
|
|
|
|
306
|
0
|
0
|
|
|
|
0
|
$b = 1 if ( $b eq '^' ); |
307
|
0
|
0
|
|
|
|
0
|
$b = $seq_len if ( $b eq '$' ); |
308
|
|
|
|
|
|
|
|
309
|
0
|
0
|
|
|
|
0
|
$e = 1 if ( $e eq '^' ); |
310
|
0
|
0
|
|
|
|
0
|
$e = $seq_len if ( $e eq '$' ); |
311
|
|
|
|
|
|
|
|
312
|
0
|
0
|
0
|
|
|
0
|
croak "subseq range error: $b > $e" if ( $b > $e && $b > 0 && $e > 0 ); |
|
|
|
0
|
|
|
|
|
313
|
|
|
|
|
|
|
|
314
|
|
|
|
|
|
|
#count from the end, |
315
|
0
|
0
|
|
|
|
0
|
if ( $b < 0 ) { |
316
|
0
|
0
|
|
|
|
0
|
$b = $c->{wrap} ? $seq_len + $b + 1 : 1; |
317
|
|
|
|
|
|
|
} |
318
|
0
|
0
|
|
|
|
0
|
if ( $e < 0 ) { |
319
|
0
|
0
|
|
|
|
0
|
$e = $c->{wrap} ? $seq_len + $e + 1 : $seq_len; |
320
|
|
|
|
|
|
|
} |
321
|
|
|
|
|
|
|
|
322
|
|
|
|
|
|
|
#get the index right for substr. |
323
|
0
|
|
|
|
|
0
|
$b--; |
324
|
|
|
|
|
|
|
|
325
|
0
|
|
|
|
|
0
|
my $seq = substr( $self->{seq}, $b, $e - $b ); |
326
|
|
|
|
|
|
|
|
327
|
0
|
0
|
0
|
|
|
0
|
if ( $strand && $strand < 0 ) { |
328
|
0
|
0
|
|
|
|
0
|
if ( $c->{relaxed_revcom} ) { |
329
|
0
|
|
|
|
|
0
|
$seq =~ y/AGCTNagctn/N/c; |
330
|
|
|
|
|
|
|
} else { |
331
|
0
|
0
|
|
|
|
0
|
confess "cannot create reverse complement, sequence contains non-AGCTN characters" |
332
|
|
|
|
|
|
|
if ( $seq =~ /[^AGCTN]/i ); |
333
|
|
|
|
|
|
|
} |
334
|
|
|
|
|
|
|
|
335
|
0
|
|
|
|
|
0
|
$seq = _revcom_from_string( $seq, $self->guess_alphabet ); |
336
|
|
|
|
|
|
|
} |
337
|
|
|
|
|
|
|
|
338
|
0
|
0
|
|
|
|
0
|
return wantarray ? ( $seq, [ $b, $e, $strand, @rest ] ) : $seq; |
339
|
|
|
|
|
|
|
} |
340
|
|
|
|
|
|
|
|
341
|
|
|
|
|
|
|
sub _revcom_from_string { |
342
|
1
|
|
|
1
|
|
3
|
my ( $string, $alphabet ) = @_; |
343
|
|
|
|
|
|
|
|
344
|
|
|
|
|
|
|
# Check that reverse-complementing makes sense |
345
|
1
|
50
|
|
|
|
6
|
if ( $alphabet eq 'protein' ) { |
346
|
0
|
|
|
|
|
0
|
confess("Sequence is a protein. Cannot revcom."); |
347
|
|
|
|
|
|
|
} |
348
|
1
|
50
|
33
|
|
|
6
|
if ( $alphabet ne 'dna' && $alphabet ne 'rna' ) { |
349
|
0
|
|
|
|
|
0
|
carp "Sequence is not dna or rna, but [$alphabet]. Attempting to revcom, " |
350
|
|
|
|
|
|
|
. "but unsure if this is right."; |
351
|
|
|
|
|
|
|
} |
352
|
|
|
|
|
|
|
|
353
|
|
|
|
|
|
|
# If sequence is RNA, map to DNA (then map back later) |
354
|
1
|
50
|
|
|
|
4
|
if ( $alphabet eq 'rna' ) { |
355
|
0
|
|
|
|
|
0
|
$string =~ tr/uU/tT/; |
356
|
|
|
|
|
|
|
} |
357
|
|
|
|
|
|
|
|
358
|
|
|
|
|
|
|
# Reverse-complement now |
359
|
1
|
|
|
|
|
2
|
$string =~ tr/acgtrymkswhbvdnxACGTRYMKSWHBVDNX/tgcayrkmswdvbhnxTGCAYRKMSWDVBHNX/; |
360
|
1
|
|
|
|
|
2
|
$string = CORE::reverse $string; |
361
|
|
|
|
|
|
|
|
362
|
|
|
|
|
|
|
# Map back RNA to DNA |
363
|
1
|
50
|
|
|
|
3
|
if ( $alphabet eq 'rna' ) { |
364
|
0
|
|
|
|
|
0
|
$string =~ tr/tT/uU/; |
365
|
|
|
|
|
|
|
} |
366
|
|
|
|
|
|
|
|
367
|
1
|
|
|
|
|
4
|
return $string; |
368
|
|
|
|
|
|
|
} |
369
|
|
|
|
|
|
|
|
370
|
|
|
|
|
|
|
1; |
371
|
|
|
|
|
|
|
|
372
|
|
|
|
|
|
|
__END__ |
373
|
|
|
|
|
|
|
|
374
|
|
|
|
|
|
|
=head1 NAME |
375
|
|
|
|
|
|
|
|
376
|
|
|
|
|
|
|
Bio::Gonzales::Seq - Gonzales Sequence Object |
377
|
|
|
|
|
|
|
|
378
|
|
|
|
|
|
|
=head1 SYNOPSIS |
379
|
|
|
|
|
|
|
|
380
|
|
|
|
|
|
|
my $seq = Bio::Gonzales::Seq->new(id => $id, seq => $seq, desc? => '', delim? => ' '); |
381
|
|
|
|
|
|
|
|
382
|
|
|
|
|
|
|
print $seq->def; |
383
|
|
|
|
|
|
|
print $seq->desc; |
384
|
|
|
|
|
|
|
|
385
|
|
|
|
|
|
|
=head1 DESCRIPTION |
386
|
|
|
|
|
|
|
|
387
|
|
|
|
|
|
|
=head1 METHODS |
388
|
|
|
|
|
|
|
|
389
|
|
|
|
|
|
|
=over 4 |
390
|
|
|
|
|
|
|
|
391
|
|
|
|
|
|
|
=item B<< $seq->id >> |
392
|
|
|
|
|
|
|
|
393
|
|
|
|
|
|
|
=item B<< $seq->desc >> |
394
|
|
|
|
|
|
|
|
395
|
|
|
|
|
|
|
The description of a sequence object. In case of FASTA-files, this corresponds |
396
|
|
|
|
|
|
|
to the text after the first space. |
397
|
|
|
|
|
|
|
|
398
|
|
|
|
|
|
|
=item B<< $seq->seq >> |
399
|
|
|
|
|
|
|
|
400
|
|
|
|
|
|
|
=item B<< $seq->delim >> |
401
|
|
|
|
|
|
|
|
402
|
|
|
|
|
|
|
=item B<< $seq->info >> |
403
|
|
|
|
|
|
|
|
404
|
|
|
|
|
|
|
An hash of additional stuff you can store about the sequence |
405
|
|
|
|
|
|
|
|
406
|
|
|
|
|
|
|
=item B<< $seq->gaps >> |
407
|
|
|
|
|
|
|
|
408
|
|
|
|
|
|
|
=item B<< $seq->length >> |
409
|
|
|
|
|
|
|
|
410
|
|
|
|
|
|
|
=item B<< $seq->def >> |
411
|
|
|
|
|
|
|
|
412
|
|
|
|
|
|
|
The definition also known as the FASTA header line w/o ">" |
413
|
|
|
|
|
|
|
|
414
|
|
|
|
|
|
|
=item B<< $seq->clone >> |
415
|
|
|
|
|
|
|
|
416
|
|
|
|
|
|
|
Clone the sequence |
417
|
|
|
|
|
|
|
|
418
|
|
|
|
|
|
|
=item B<< $seq->clone_empty >> |
419
|
|
|
|
|
|
|
|
420
|
|
|
|
|
|
|
Clone the sequence properties, do not clone the sequence string. |
421
|
|
|
|
|
|
|
|
422
|
|
|
|
|
|
|
=item B<< $seq->display_id >> |
423
|
|
|
|
|
|
|
|
424
|
|
|
|
|
|
|
Same as C<$seq->id> |
425
|
|
|
|
|
|
|
|
426
|
|
|
|
|
|
|
=item B<< $seq->ungapped_length >> |
427
|
|
|
|
|
|
|
|
428
|
|
|
|
|
|
|
=item B<< $seq->all >> |
429
|
|
|
|
|
|
|
|
430
|
|
|
|
|
|
|
=item B<< "$seq" >> |
431
|
|
|
|
|
|
|
|
432
|
|
|
|
|
|
|
The complete sequence in fasta format, ready to be written. |
433
|
|
|
|
|
|
|
|
434
|
|
|
|
|
|
|
=item B<< $seq->all_formatted >> |
435
|
|
|
|
|
|
|
|
436
|
|
|
|
|
|
|
=item B<< $seq->all_pretty >> |
437
|
|
|
|
|
|
|
|
438
|
|
|
|
|
|
|
The complete sequence in I<pretty> fasta format, ready to be written. |
439
|
|
|
|
|
|
|
|
440
|
|
|
|
|
|
|
=item B<< $seq->as_primaryseq >> |
441
|
|
|
|
|
|
|
|
442
|
|
|
|
|
|
|
Return a Bio::PrimarySeqI compatible object, so you can use it in BioPerl. |
443
|
|
|
|
|
|
|
|
444
|
|
|
|
|
|
|
=item B<< $seq_string = $seq->gapless_seq >> |
445
|
|
|
|
|
|
|
|
446
|
|
|
|
|
|
|
=item B<< $seq->rm_gaps! >> |
447
|
|
|
|
|
|
|
|
448
|
|
|
|
|
|
|
=item B<< $seq->revcom >> |
449
|
|
|
|
|
|
|
|
450
|
|
|
|
|
|
|
Create the reverse complement of the sequence. B<THIS FUNCTION ALTERS THE SEQUENCE OBJECT>. |
451
|
|
|
|
|
|
|
|
452
|
|
|
|
|
|
|
=item B<< $seq->subseq( [ $begin, $end, $strand , @rest ], \%c ) >> |
453
|
|
|
|
|
|
|
|
454
|
|
|
|
|
|
|
Gets a subseq from C<$seq>. Config options can be: |
455
|
|
|
|
|
|
|
|
456
|
|
|
|
|
|
|
%c = ( |
457
|
|
|
|
|
|
|
keep_id => 1, # keeps the original id of the sequence |
458
|
|
|
|
|
|
|
attach_details => 1, # keeps the original range and strand in $seq->info->{subseq} |
459
|
|
|
|
|
|
|
wrap => 1, # see further down |
460
|
|
|
|
|
|
|
relaxed_range => 1, # substitute 0 or undef for $begin with '^' and for $end with '$' |
461
|
|
|
|
|
|
|
relaxed_revcom => 1, # substitute N for all characters that are non-AGCTN before doing a reverse complement |
462
|
|
|
|
|
|
|
); |
463
|
|
|
|
|
|
|
|
464
|
|
|
|
|
|
|
There are several possibilities for C<$begin> and C<$end>: |
465
|
|
|
|
|
|
|
|
466
|
|
|
|
|
|
|
GGCAAAGGA ATGATGGTGT GCAGGCTTGG CATGGGAGAC |
467
|
|
|
|
|
|
|
^..........^ (1,11) OR ('^', 11) |
468
|
|
|
|
|
|
|
^.....................................^ (4,'$') |
469
|
|
|
|
|
|
|
^..............^ (21,35) { with wrap on: OR (-19,35) OR (-19, -5) } |
470
|
|
|
|
|
|
|
^..................^ (21,35) { with wrap on: OR (-19,'$') } |
471
|
|
|
|
|
|
|
|
472
|
|
|
|
|
|
|
=over 4 |
473
|
|
|
|
|
|
|
|
474
|
|
|
|
|
|
|
=item C<wrap> |
475
|
|
|
|
|
|
|
|
476
|
|
|
|
|
|
|
The default is to limit all negative |
477
|
|
|
|
|
|
|
values to the sequence boundaries, so a negative begin would be equal to 1 or |
478
|
|
|
|
|
|
|
'^' and a negative end would be equal to '$'. |
479
|
|
|
|
|
|
|
|
480
|
|
|
|
|
|
|
=back |
481
|
|
|
|
|
|
|
|
482
|
|
|
|
|
|
|
See also L<Bio::Gonzales::Seq::IO/fasubseq>. |
483
|
|
|
|
|
|
|
|
484
|
|
|
|
|
|
|
=back |
485
|
|
|
|
|
|
|
|
486
|
|
|
|
|
|
|
=over 4 |
487
|
|
|
|
|
|
|
|
488
|
|
|
|
|
|
|
=item B<< my $reverse_complement_string = Bio::Gonzales::Seq::_revcom_from_string($seq_string, $alphabet) >> |
489
|
|
|
|
|
|
|
|
490
|
|
|
|
|
|
|
Stolen from L<Bio::Perl>. Alphabet can be 'rna' or 'dna'; |
491
|
|
|
|
|
|
|
|
492
|
|
|
|
|
|
|
=back |
493
|
|
|
|
|
|
|
|
494
|
|
|
|
|
|
|
=head1 AUTHOR |
495
|
|
|
|
|
|
|
|
496
|
|
|
|
|
|
|
jw bargsten, C<< <joachim.bargsten at wur.nl> >> |
497
|
|
|
|
|
|
|
|
498
|
|
|
|
|
|
|
=cut |