line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
# |
2
|
|
|
|
|
|
|
# bioperl module for Bio::LiveSeq::IO::Loader |
3
|
|
|
|
|
|
|
# |
4
|
|
|
|
|
|
|
# Please direct questions and support issues to |
5
|
|
|
|
|
|
|
# |
6
|
|
|
|
|
|
|
# Cared for by Joseph Insana |
7
|
|
|
|
|
|
|
# |
8
|
|
|
|
|
|
|
# Copyright Joseph Insana |
9
|
|
|
|
|
|
|
# |
10
|
|
|
|
|
|
|
# You may distribute this module under the same terms as perl itself |
11
|
|
|
|
|
|
|
# |
12
|
|
|
|
|
|
|
# POD documentation - main docs before the code |
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
=head1 NAME |
15
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
Bio::LiveSeq::IO::Loader - Parent Loader for LiveSeq |
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
=head1 SYNOPSIS |
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
#documentation needed |
21
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
=head1 DESCRIPTION |
23
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
This package holds common methods used by BioPerl and file loaders. |
25
|
|
|
|
|
|
|
It contains methods to create LiveSeq objects out of entire entries or from a |
26
|
|
|
|
|
|
|
localized sequence region surrounding a particular gene. |
27
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
=head1 AUTHOR - Joseph A.L. Insana |
29
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
Email: Insana@ebi.ac.uk, jinsana@gmx.net |
31
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
=head1 APPENDIX |
33
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
The rest of the documentation details each of the object |
35
|
|
|
|
|
|
|
methods. Internal methods are usually preceded with a _ |
36
|
|
|
|
|
|
|
|
37
|
|
|
|
|
|
|
=cut |
38
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
# Let the code begin... |
40
|
|
|
|
|
|
|
|
41
|
|
|
|
|
|
|
package Bio::LiveSeq::IO::Loader; |
42
|
|
|
|
|
|
|
|
43
|
2
|
|
|
2
|
|
9
|
use strict; |
|
2
|
|
|
|
|
2
|
|
|
2
|
|
|
|
|
55
|
|
44
|
2
|
|
|
2
|
|
6
|
use Carp qw(cluck croak carp); |
|
2
|
|
|
|
|
2
|
|
|
2
|
|
|
|
|
103
|
|
45
|
2
|
|
|
2
|
|
499
|
use Bio::LiveSeq::DNA; |
|
2
|
|
|
|
|
3
|
|
|
2
|
|
|
|
|
45
|
|
46
|
2
|
|
|
2
|
|
502
|
use Bio::LiveSeq::Exon; |
|
2
|
|
|
|
|
3
|
|
|
2
|
|
|
|
|
47
|
|
47
|
2
|
|
|
2
|
|
626
|
use Bio::LiveSeq::Transcript ; |
|
2
|
|
|
|
|
3
|
|
|
2
|
|
|
|
|
47
|
|
48
|
2
|
|
|
2
|
|
528
|
use Bio::LiveSeq::Translation; |
|
2
|
|
|
|
|
4
|
|
|
2
|
|
|
|
|
42
|
|
49
|
2
|
|
|
2
|
|
534
|
use Bio::LiveSeq::Gene; |
|
2
|
|
|
|
|
3
|
|
|
2
|
|
|
|
|
45
|
|
50
|
2
|
|
|
2
|
|
484
|
use Bio::LiveSeq::Intron; |
|
2
|
|
|
|
|
2
|
|
|
2
|
|
|
|
|
46
|
|
51
|
2
|
|
|
2
|
|
8
|
use Bio::LiveSeq::Prim_Transcript; |
|
2
|
|
|
|
|
2
|
|
|
2
|
|
|
|
|
26
|
|
52
|
2
|
|
|
2
|
|
450
|
use Bio::LiveSeq::Repeat_Region; |
|
2
|
|
|
|
|
5
|
|
|
2
|
|
|
|
|
45
|
|
53
|
2
|
|
|
2
|
|
470
|
use Bio::LiveSeq::Repeat_Unit; |
|
2
|
|
|
|
|
3
|
|
|
2
|
|
|
|
|
45
|
|
54
|
2
|
|
|
2
|
|
594
|
use Bio::LiveSeq::AARange; |
|
2
|
|
|
|
|
4
|
|
|
2
|
|
|
|
|
45
|
|
55
|
2
|
|
|
2
|
|
8
|
use Bio::Tools::CodonTable; |
|
2
|
|
|
|
|
2
|
|
|
2
|
|
|
|
|
6520
|
|
56
|
|
|
|
|
|
|
|
57
|
|
|
|
|
|
|
=head2 entry2liveseq |
58
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
Title : entry2liveseq |
60
|
|
|
|
|
|
|
Usage : @translationobjects=$loader->entry2liveseq(); |
61
|
|
|
|
|
|
|
: @translationobjects=$loader->entry2liveseq(-getswissprotinfo => 0); |
62
|
|
|
|
|
|
|
Function: creates LiveSeq objects from an entry previously loaded |
63
|
|
|
|
|
|
|
Returns : array of references to objects of class Translation |
64
|
|
|
|
|
|
|
Errorcode 0 |
65
|
|
|
|
|
|
|
Args : optional boolean flag to avoid the retrieval of SwissProt |
66
|
|
|
|
|
|
|
information for all Transcripts containing SwissProt x-reference |
67
|
|
|
|
|
|
|
default is 1 (to retrieve those information and create AARange |
68
|
|
|
|
|
|
|
LiveSeq objects) |
69
|
|
|
|
|
|
|
Note : this method can get really slow for big entries. The lightweight |
70
|
|
|
|
|
|
|
gene2liveseq method is recommended |
71
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
=cut |
73
|
|
|
|
|
|
|
|
74
|
|
|
|
|
|
|
sub entry2liveseq { |
75
|
0
|
|
|
0
|
1
|
0
|
my ($self, %args) = @_; |
76
|
0
|
|
|
|
|
0
|
my ($getswissprotinfo)=($args{-getswissprotinfo}); |
77
|
0
|
0
|
|
|
|
0
|
if (defined($getswissprotinfo)) { |
78
|
0
|
0
|
0
|
|
|
0
|
if (($getswissprotinfo ne 0)&&($getswissprotinfo ne 1)) { |
79
|
0
|
|
|
|
|
0
|
carp "-getswissprotinfo argument can take only boolean (1 or 0) values. Setting it to 0, i.e. not trying to retrieve SwissProt information...."; |
80
|
0
|
|
|
|
|
0
|
$getswissprotinfo=0; |
81
|
|
|
|
|
|
|
} |
82
|
|
|
|
|
|
|
} else { |
83
|
0
|
|
|
|
|
0
|
$getswissprotinfo=1; |
84
|
|
|
|
|
|
|
} |
85
|
0
|
|
|
|
|
0
|
my $hashref=$self->{'hash'}; |
86
|
0
|
0
|
|
|
|
0
|
unless ($hashref) { return (0); } |
|
0
|
|
|
|
|
0
|
|
87
|
0
|
|
|
|
|
0
|
my @translationobjects=$self->hash2liveseq($hashref,$getswissprotinfo); |
88
|
0
|
|
|
|
|
0
|
my $test_transl=0; |
89
|
0
|
0
|
|
|
|
0
|
if ($test_transl) { $self->test_transl($hashref,\@translationobjects);} |
|
0
|
|
|
|
|
0
|
|
90
|
0
|
|
|
|
|
0
|
return @translationobjects; |
91
|
|
|
|
|
|
|
} |
92
|
|
|
|
|
|
|
|
93
|
|
|
|
|
|
|
=head2 novelaasequence2gene |
94
|
|
|
|
|
|
|
|
95
|
|
|
|
|
|
|
Title : novelaasequence2gene |
96
|
|
|
|
|
|
|
Usage : $gene=$loader->novelaasequence2gene(-aasequence => "MGLAAPTRS*"); |
97
|
|
|
|
|
|
|
: $gene=$loader->novelaasequence2gene(-aasequence => "MGLAAPTRS*"); |
98
|
|
|
|
|
|
|
-taxon => 9606, |
99
|
|
|
|
|
|
|
-gene_name => "tyr-kinase"); |
100
|
|
|
|
|
|
|
|
101
|
|
|
|
|
|
|
Function: creates LiveSeq objects from a novel amino acid sequence, |
102
|
|
|
|
|
|
|
using codon usage database to choose codons according to |
103
|
|
|
|
|
|
|
relative frequencies. |
104
|
|
|
|
|
|
|
If a taxon ID is not specified, the default is to use the human |
105
|
|
|
|
|
|
|
one (taxonomy ID 9606). |
106
|
|
|
|
|
|
|
Returns : reference to a Gene object containing references to LiveSeq objects |
107
|
|
|
|
|
|
|
Errorcode 0 |
108
|
|
|
|
|
|
|
Args : string containing an amino acid sequence |
109
|
|
|
|
|
|
|
integer (optional) with a taxonomy ID |
110
|
|
|
|
|
|
|
string specifying a gene name |
111
|
|
|
|
|
|
|
|
112
|
|
|
|
|
|
|
=cut |
113
|
|
|
|
|
|
|
|
114
|
|
|
|
|
|
|
=head2 gene2liveseq |
115
|
|
|
|
|
|
|
|
116
|
|
|
|
|
|
|
Title : gene2liveseq |
117
|
|
|
|
|
|
|
Usage : $gene=$loader->gene2liveseq(-gene_name => "gene name"); |
118
|
|
|
|
|
|
|
: $gene=$loader->gene2liveseq(-gene_name => "gene name", |
119
|
|
|
|
|
|
|
-flanking => 64); |
120
|
|
|
|
|
|
|
: $gene=$loader->gene2liveseq(-gene_name => "gene name", |
121
|
|
|
|
|
|
|
-getswissprotinfo => 0); |
122
|
|
|
|
|
|
|
: $gene=$loader->gene2liveseq(-position => 4); |
123
|
|
|
|
|
|
|
|
124
|
|
|
|
|
|
|
Function: creates LiveSeq objects from an entry previously loaded |
125
|
|
|
|
|
|
|
It is a "light weight" creation because it creates |
126
|
|
|
|
|
|
|
a LiveSequence just for the interesting region in an entry |
127
|
|
|
|
|
|
|
(instead than for the total entry, like in entry2liveseq) and for |
128
|
|
|
|
|
|
|
the flanking regions up to 500 nucleotides (default) or up to |
129
|
|
|
|
|
|
|
the specified amount of nucleotides (given as argument) around the |
130
|
|
|
|
|
|
|
Gene. |
131
|
|
|
|
|
|
|
Returns : reference to a Gene object containing possibly alternative |
132
|
|
|
|
|
|
|
Transcripts. |
133
|
|
|
|
|
|
|
Errorcode 0 |
134
|
|
|
|
|
|
|
Args : string containing the gene name as in the EMBL feature qualifier |
135
|
|
|
|
|
|
|
integer (optional) "flanking": amount of flanking bases to be kept |
136
|
|
|
|
|
|
|
boolean (optional) "getswissprotinfo": if set to "0" it will avoid |
137
|
|
|
|
|
|
|
trying to fetch information from a crossreference to a SwissProt |
138
|
|
|
|
|
|
|
entry, avoding the process of creation of AARange objects |
139
|
|
|
|
|
|
|
It is "1" (on) by default |
140
|
|
|
|
|
|
|
|
141
|
|
|
|
|
|
|
Alternative to a gene_name, a position can be given: an |
142
|
|
|
|
|
|
|
integer (1-) containing the position of the desired CDS in the |
143
|
|
|
|
|
|
|
loaded entry |
144
|
|
|
|
|
|
|
|
145
|
|
|
|
|
|
|
=cut |
146
|
|
|
|
|
|
|
|
147
|
|
|
|
|
|
|
sub gene2liveseq { |
148
|
6
|
|
|
6
|
1
|
722
|
my ($self, %args) = @_; |
149
|
6
|
|
|
|
|
32
|
my ($gene_name,$flanking,$getswissprotinfo,$cds_position)=($args{-gene_name},$args{-flanking},$args{-getswissprotinfo},$args{-position}); |
150
|
6
|
|
|
|
|
8
|
my $input; |
151
|
6
|
50
|
33
|
|
|
23
|
unless (($gene_name)||($cds_position)) { |
152
|
0
|
|
|
|
|
0
|
carp "Gene_Name or Position not specified for gene2liveseq loading function"; |
153
|
0
|
|
|
|
|
0
|
return (0); |
154
|
|
|
|
|
|
|
} |
155
|
6
|
50
|
33
|
|
|
39
|
if (($gene_name)&&($cds_position)) { |
|
|
50
|
|
|
|
|
|
156
|
0
|
|
|
|
|
0
|
carp "Gene_Name and Position cannot be given together"; |
157
|
0
|
|
|
|
|
0
|
return (0); |
158
|
|
|
|
|
|
|
} elsif ($gene_name) { |
159
|
6
|
|
|
|
|
10
|
$input=$gene_name; |
160
|
|
|
|
|
|
|
} else { |
161
|
0
|
|
|
|
|
0
|
$input="cds-position:".$cds_position; |
162
|
|
|
|
|
|
|
} |
163
|
|
|
|
|
|
|
|
164
|
6
|
50
|
|
|
|
41
|
if (defined($getswissprotinfo)) { |
165
|
0
|
0
|
0
|
|
|
0
|
if (($getswissprotinfo ne 0)&&($getswissprotinfo ne 1)) { |
166
|
0
|
|
|
|
|
0
|
carp "-getswissprotinfo argument can take only boolean (1 or 0) values. Setting it to 0, i.e. not trying to retrieve SwissProt information...."; |
167
|
0
|
|
|
|
|
0
|
$getswissprotinfo=0; |
168
|
|
|
|
|
|
|
} |
169
|
|
|
|
|
|
|
} else { |
170
|
6
|
|
|
|
|
9
|
$getswissprotinfo=1; |
171
|
|
|
|
|
|
|
} |
172
|
|
|
|
|
|
|
|
173
|
6
|
50
|
|
|
|
13
|
if (defined($flanking)) { |
174
|
0
|
0
|
|
|
|
0
|
unless ($flanking >= 0) { |
175
|
0
|
|
|
|
|
0
|
carp "No sense in specifying a number < 0 for flanking regions to be created for gene2liveseq loading function"; |
176
|
0
|
|
|
|
|
0
|
return (0); |
177
|
|
|
|
|
|
|
} |
178
|
|
|
|
|
|
|
} else { |
179
|
6
|
|
|
|
|
8
|
$flanking=500; # the default flanking length |
180
|
|
|
|
|
|
|
} |
181
|
6
|
|
|
|
|
16
|
my $hashref=$self->{'hash'}; |
182
|
6
|
50
|
|
|
|
10
|
unless ($hashref) { return (0); } |
|
0
|
|
|
|
|
0
|
|
183
|
6
|
|
|
|
|
28
|
my $gene=$self->hash2gene($hashref,$input,$flanking,$getswissprotinfo); |
184
|
6
|
50
|
|
|
|
47
|
unless ($gene) { # if $gene == 0 it means problems in hash2gene |
185
|
0
|
|
|
|
|
0
|
carp "gene2liveseq produced error"; |
186
|
0
|
|
|
|
|
0
|
return (0); |
187
|
|
|
|
|
|
|
} |
188
|
6
|
|
|
|
|
36
|
return $gene; |
189
|
|
|
|
|
|
|
} |
190
|
|
|
|
|
|
|
|
191
|
|
|
|
|
|
|
# TODO: update so that it will work even if CDS is not only accepted FEATURE!! |
192
|
|
|
|
|
|
|
# this method is for now deprecated and not supported |
193
|
|
|
|
|
|
|
sub test_transl { |
194
|
0
|
|
|
0
|
0
|
0
|
my ($self,$entry)=@_; |
195
|
0
|
|
|
|
|
0
|
my @features=@{$entry->{'Features'}}; |
|
0
|
|
|
|
|
0
|
|
196
|
0
|
|
|
|
|
0
|
my @translationobjects=@{$_[1]}; |
|
0
|
|
|
|
|
0
|
|
197
|
0
|
|
|
|
|
0
|
my ($i,$translation); |
198
|
0
|
|
|
|
|
0
|
my ($obj_transl,$hash_transl); |
199
|
0
|
|
|
|
|
0
|
my @cds=@{$entry->{'CDS'}}; |
|
0
|
|
|
|
|
0
|
|
200
|
0
|
|
|
|
|
0
|
foreach $translation (@translationobjects) { |
201
|
0
|
|
|
|
|
0
|
$obj_transl=$translation->seq; |
202
|
0
|
|
|
|
|
0
|
$hash_transl=$cds[$i]->{'qualifiers'}->{'translation'}; |
203
|
|
|
|
|
|
|
#before seq was changed in Translation 1.4# chop $obj_transl; # to remove trailing "*" |
204
|
0
|
0
|
|
|
|
0
|
unless ($obj_transl eq $hash_transl) { |
205
|
0
|
|
|
|
|
0
|
cluck "Serious error: Translation from the Entry does not match Translation from object's seq for CDS at position $i"; |
206
|
0
|
|
|
|
|
0
|
carp "\nEntry's transl: ",$hash_transl,"\n"; |
207
|
0
|
|
|
|
|
0
|
carp "\nObject's transl: ",$obj_transl,"\n"; |
208
|
0
|
|
|
|
|
0
|
exit; |
209
|
|
|
|
|
|
|
} |
210
|
0
|
|
|
|
|
0
|
$i++; |
211
|
|
|
|
|
|
|
} |
212
|
|
|
|
|
|
|
} |
213
|
|
|
|
|
|
|
|
214
|
|
|
|
|
|
|
# argument: hashref containing the EMBL entry datas, |
215
|
|
|
|
|
|
|
# getswissprotinfo boolean flag |
216
|
|
|
|
|
|
|
# creates the liveseq objects |
217
|
|
|
|
|
|
|
# returns: an array of Translation object references |
218
|
|
|
|
|
|
|
sub hash2liveseq { |
219
|
0
|
|
|
0
|
0
|
0
|
my ($self,$entry,$getswissprotinfo)=@_; |
220
|
0
|
|
|
|
|
0
|
my $i; |
221
|
|
|
|
|
|
|
my @transcripts; |
222
|
0
|
|
|
|
|
0
|
my $dna=Bio::LiveSeq::DNA->new(-seq => $entry->{'Sequence'} ); |
223
|
0
|
|
|
|
|
0
|
$dna->alphabet(lc($entry->{'Molecule'})); |
224
|
0
|
|
|
|
|
0
|
$dna->display_id($entry->{'ID'}); |
225
|
0
|
|
|
|
|
0
|
$dna->accession_number($entry->{'AccNumber'}); |
226
|
0
|
|
|
|
|
0
|
$dna->desc($entry->{'Description'}); |
227
|
0
|
|
|
|
|
0
|
my @cds=@{$entry->{'CDS'}}; |
|
0
|
|
|
|
|
0
|
|
228
|
0
|
|
|
|
|
0
|
my ($swissacc,$swisshash); my @swisshashes; |
|
0
|
|
|
|
|
0
|
|
229
|
0
|
|
|
|
|
0
|
for $i (0..$#cds) { |
230
|
|
|
|
|
|
|
#my @transcript=@{$cds[$i]->{'range'}}; |
231
|
|
|
|
|
|
|
#$transcript=\@transcript; |
232
|
|
|
|
|
|
|
#push (@transcripts,$transcript); |
233
|
0
|
|
|
|
|
0
|
push (@transcripts,$cds[$i]->{'range'}); |
234
|
0
|
0
|
|
|
|
0
|
if ($getswissprotinfo) { |
235
|
0
|
|
|
|
|
0
|
$swissacc=$cds[$i]->{'qualifiers'}->{'db_xref'}; |
236
|
0
|
|
|
|
|
0
|
$swisshash=$self->get_swisshash($swissacc); |
237
|
|
|
|
|
|
|
#$self->printswissprot($swisshash); # DEBUG |
238
|
0
|
|
|
|
|
0
|
push (@swisshashes,$swisshash); |
239
|
|
|
|
|
|
|
} |
240
|
|
|
|
|
|
|
} |
241
|
0
|
|
|
|
|
0
|
my @translations=($self->transexonscreation($dna,\@transcripts)); |
242
|
0
|
|
|
|
|
0
|
my $translation; my $j=0; |
|
0
|
|
|
|
|
0
|
|
243
|
0
|
|
|
|
|
0
|
foreach $translation (@translations) { |
244
|
0
|
0
|
|
|
|
0
|
if ($swisshashes[$j]) { # if not 0 |
245
|
0
|
|
|
|
|
0
|
$self->swisshash2liveseq($swisshashes[$j],$translation); |
246
|
|
|
|
|
|
|
} |
247
|
0
|
|
|
|
|
0
|
$j++; |
248
|
|
|
|
|
|
|
} |
249
|
0
|
|
|
|
|
0
|
return (@translations); |
250
|
|
|
|
|
|
|
} |
251
|
|
|
|
|
|
|
|
252
|
|
|
|
|
|
|
# only features pertaining to a specified gene are created |
253
|
|
|
|
|
|
|
# only the sequence of the gene and appropriate context flanking regions |
254
|
|
|
|
|
|
|
# are created as chain |
255
|
|
|
|
|
|
|
# arguments: hashref, gene_name (OR: cds_position), length_of_flanking_sequences, getswissprotinfo boolean flag |
256
|
|
|
|
|
|
|
# returns: reference to Gene object |
257
|
|
|
|
|
|
|
# |
258
|
|
|
|
|
|
|
# Note: if entry contains just one CDS, all the features get added |
259
|
|
|
|
|
|
|
# this is useful because often the features in these entries do not |
260
|
|
|
|
|
|
|
# carry the /gene qualifier |
261
|
|
|
|
|
|
|
# |
262
|
|
|
|
|
|
|
# errorcode: 0 |
263
|
|
|
|
|
|
|
sub hash2gene { |
264
|
6
|
|
|
6
|
0
|
16
|
my ($self,$entry,$input,$flanking,$getswissprotinfo)=@_; |
265
|
6
|
|
|
|
|
7
|
my $entryfeature; |
266
|
|
|
|
|
|
|
my $genefeatureshash; |
267
|
|
|
|
|
|
|
|
268
|
6
|
|
|
|
|
10
|
my @cds=@{$entry->{'CDS'}}; |
|
6
|
|
|
|
|
18
|
|
269
|
|
|
|
|
|
|
|
270
|
|
|
|
|
|
|
# checking if a position has been given instead than a gene_name |
271
|
6
|
50
|
|
|
|
23
|
if (index($input,"cds-position:") == 0 ) { |
272
|
0
|
|
|
|
|
0
|
my $cds_position=substr($input,13); # extracting the cds position |
273
|
0
|
0
|
0
|
|
|
0
|
if (($cds_position >= 1)&&($cds_position <= scalar(@cds))) { |
274
|
0
|
|
|
|
|
0
|
$genefeatureshash=$self->_findgenefeatures($entry,undef,$cds_position,$getswissprotinfo); |
275
|
|
|
|
|
|
|
} |
276
|
|
|
|
|
|
|
} else { |
277
|
6
|
|
|
|
|
27
|
$genefeatureshash=$self->_findgenefeatures($entry,$input,undef,$getswissprotinfo); |
278
|
|
|
|
|
|
|
} |
279
|
|
|
|
|
|
|
|
280
|
6
|
50
|
50
|
|
|
22
|
unless (($genefeatureshash)&&(scalar(@{$genefeatureshash->{'genefeatures'}}))) { # array empty, no gene features found |
|
6
|
|
|
|
|
18
|
|
281
|
0
|
|
|
|
|
0
|
my @genes=$self->genes($entry); |
282
|
0
|
|
|
|
|
0
|
my $cds_number=scalar(@cds); |
283
|
0
|
|
|
|
|
0
|
warn "Warning! Not even one genefeature found for /$input/.... |
284
|
|
|
|
|
|
|
The genes present in this entry are:\n\t@genes\n |
285
|
|
|
|
|
|
|
The number of CDS in this entry is:\n\t$cds_number\n"; |
286
|
0
|
|
|
|
|
0
|
return(0); |
287
|
|
|
|
|
|
|
} |
288
|
|
|
|
|
|
|
|
289
|
|
|
|
|
|
|
# get max and min, check flankings |
290
|
6
|
|
|
|
|
11
|
my ($min,$max)=$self->rangeofarray(@{$genefeatureshash->{'labels'}}); # gene "boundaries" |
|
6
|
|
|
|
|
26
|
|
291
|
6
|
|
|
|
|
13
|
my $seqlength=$entry->{'SeqLength'}; |
292
|
6
|
|
|
|
|
6
|
my ($mindna,$maxdna); # some flanking region next to the gene "boundaries" |
293
|
6
|
50
|
|
|
|
18
|
if ($min-$flanking < 1) { |
294
|
6
|
|
|
|
|
11
|
$mindna=1; |
295
|
|
|
|
|
|
|
} else { |
296
|
0
|
|
|
|
|
0
|
$mindna=$min-$flanking; |
297
|
|
|
|
|
|
|
} |
298
|
6
|
50
|
|
|
|
17
|
if ($max+$flanking > $seqlength) { |
299
|
6
|
|
|
|
|
8
|
$maxdna=$seqlength; |
300
|
|
|
|
|
|
|
} else { |
301
|
0
|
|
|
|
|
0
|
$maxdna=$max+$flanking; |
302
|
|
|
|
|
|
|
} |
303
|
6
|
|
|
|
|
61
|
my $subseq=substr($entry->{'Sequence'},$mindna-1,$maxdna-$mindna+1); |
304
|
|
|
|
|
|
|
|
305
|
|
|
|
|
|
|
# create LiveSeq objects |
306
|
|
|
|
|
|
|
|
307
|
|
|
|
|
|
|
# create DNA |
308
|
6
|
|
|
|
|
54
|
my $dna=Bio::LiveSeq::DNA->new(-seq => $subseq, -offset => $mindna); |
309
|
6
|
|
|
|
|
47
|
$dna->alphabet(lc($entry->{'Molecule'})); |
310
|
6
|
|
|
|
|
26
|
$dna->source($entry->{'Organism'}); |
311
|
6
|
|
|
|
|
25
|
$dna->display_id($entry->{'ID'}); |
312
|
6
|
|
|
|
|
54
|
$dna->accession_number($entry->{'AccNumber'}); |
313
|
6
|
|
|
|
|
24
|
$dna->desc($entry->{'Description'}); |
314
|
|
|
|
|
|
|
|
315
|
6
|
|
|
|
|
7
|
my @transcripts=@{$genefeatureshash->{'transcripts'}}; |
|
6
|
|
|
|
|
19
|
|
316
|
|
|
|
|
|
|
# create Translations, Transcripts, Exons out of the CDS |
317
|
6
|
50
|
|
|
|
15
|
unless (@transcripts) { |
318
|
0
|
|
|
|
|
0
|
cluck "no CDS feature found for /$input/...."; |
319
|
0
|
|
|
|
|
0
|
return(0); |
320
|
|
|
|
|
|
|
} |
321
|
6
|
|
|
|
|
25
|
my @translationobjs=$self->transexonscreation($dna,\@transcripts); |
322
|
6
|
|
|
|
|
11
|
my @transcriptobjs; |
323
|
|
|
|
|
|
|
|
324
|
|
|
|
|
|
|
# get the Transcript obj_refs |
325
|
|
|
|
|
|
|
my $translation; |
326
|
6
|
|
|
|
|
6
|
my $j=0; |
327
|
6
|
|
|
|
|
6
|
my @ttables=@{$genefeatureshash->{'ttables'}}; |
|
6
|
|
|
|
|
19
|
|
328
|
6
|
|
|
|
|
9
|
my @swisshashes=@{$genefeatureshash->{'swisshashes'}}; |
|
6
|
|
|
|
|
14
|
|
329
|
6
|
|
|
|
|
11
|
foreach $translation (@translationobjs) { |
330
|
6
|
|
|
|
|
19
|
push(@transcriptobjs,$translation->get_Transcript); |
331
|
6
|
50
|
|
|
|
21
|
if ($ttables[$j]) { # if not undef |
332
|
0
|
|
|
|
|
0
|
$translation->get_Transcript->translation_table($ttables[$j]); |
333
|
|
|
|
|
|
|
#} else { # DEBUG |
334
|
|
|
|
|
|
|
# print "\n\t\tno translation table information....\n"; |
335
|
|
|
|
|
|
|
} |
336
|
6
|
50
|
|
|
|
17
|
if ($swisshashes[$j]) { # if not 0 |
337
|
0
|
|
|
|
|
0
|
$self->swisshash2liveseq($swisshashes[$j],$translation); |
338
|
|
|
|
|
|
|
} |
339
|
6
|
|
|
|
|
12
|
$j++; |
340
|
|
|
|
|
|
|
} |
341
|
|
|
|
|
|
|
|
342
|
6
|
|
|
|
|
6
|
my %gene; # this is the hash to store created object references |
343
|
6
|
|
|
|
|
14
|
$gene{DNA}=$dna; |
344
|
6
|
|
|
|
|
14
|
$gene{Transcripts}=\@transcriptobjs; |
345
|
6
|
|
|
|
|
14
|
$gene{Translations}=\@translationobjs; |
346
|
|
|
|
|
|
|
|
347
|
6
|
|
|
|
|
8
|
my @exonobjs; my @intronobjs; |
348
|
0
|
|
|
|
|
0
|
my @repeatunitobjs; my @repeatregionobjs; |
|
0
|
|
|
|
|
0
|
|
349
|
0
|
|
|
|
|
0
|
my @primtranscriptobjs; |
350
|
|
|
|
|
|
|
|
351
|
0
|
|
|
|
|
0
|
my ($object,$range,$start,$end,$strand); |
352
|
|
|
|
|
|
|
|
353
|
6
|
|
|
|
|
9
|
my @exons=@{$genefeatureshash->{'exons'}}; |
|
6
|
|
|
|
|
16
|
|
354
|
6
|
|
|
|
|
7
|
my @exondescs=@{$genefeatureshash->{'exondescs'}}; |
|
6
|
|
|
|
|
16
|
|
355
|
6
|
100
|
|
|
|
13
|
if (@exons) { |
356
|
1
|
|
|
|
|
2
|
my $exoncount = 0; |
357
|
1
|
|
|
|
|
2
|
foreach $range (@exons) { |
358
|
9
|
|
|
|
|
8
|
($start,$end,$strand)=@{$range}; |
|
9
|
|
|
|
|
24
|
|
359
|
9
|
|
|
|
|
27
|
$object = Bio::LiveSeq::Exon->new(-seq=>$dna,-start=>$start,-end=>$end,-strand=>$strand); |
360
|
9
|
50
|
|
|
|
15
|
if ($object != -1) { |
361
|
9
|
50
|
|
|
|
32
|
$object->desc($exondescs[$exoncount]) if defined $exondescs[$exoncount]; |
362
|
9
|
|
|
|
|
5
|
$exoncount++; |
363
|
9
|
|
|
|
|
15
|
push (@exonobjs,$object); |
364
|
|
|
|
|
|
|
} else { |
365
|
0
|
|
|
|
|
0
|
$exoncount++; |
366
|
|
|
|
|
|
|
} |
367
|
|
|
|
|
|
|
} |
368
|
1
|
|
|
|
|
3
|
$gene{Exons}=\@exonobjs; |
369
|
|
|
|
|
|
|
} |
370
|
6
|
|
|
|
|
7
|
my @introns=@{$genefeatureshash->{'introns'}}; |
|
6
|
|
|
|
|
14
|
|
371
|
6
|
|
|
|
|
7
|
my @introndescs=@{$genefeatureshash->{'introndescs'}}; |
|
6
|
|
|
|
|
13
|
|
372
|
6
|
100
|
|
|
|
17
|
if (@introns) { |
373
|
1
|
|
|
|
|
3
|
my $introncount = 0; |
374
|
1
|
|
|
|
|
4
|
foreach $range (@introns) { |
375
|
8
|
|
|
|
|
10
|
($start,$end,$strand)=@{$range}; |
|
8
|
|
|
|
|
20
|
|
376
|
8
|
|
|
|
|
37
|
$object=Bio::LiveSeq::Intron->new(-seq=>$dna,-start=>$start,-end=>$end,-strand=>$strand); |
377
|
8
|
50
|
|
|
|
22
|
if ($object != -1) { |
378
|
8
|
|
|
|
|
26
|
$object->desc($introndescs[$introncount]); |
379
|
8
|
|
|
|
|
3
|
$introncount++; |
380
|
8
|
|
|
|
|
14
|
push (@intronobjs,$object); |
381
|
|
|
|
|
|
|
} else { |
382
|
0
|
|
|
|
|
0
|
$introncount++; |
383
|
|
|
|
|
|
|
} |
384
|
|
|
|
|
|
|
} |
385
|
1
|
|
|
|
|
4
|
$gene{Introns}=\@intronobjs; |
386
|
|
|
|
|
|
|
} |
387
|
6
|
|
|
|
|
9
|
my @prim_transcripts=@{$genefeatureshash->{'prim_transcripts'}}; |
|
6
|
|
|
|
|
17
|
|
388
|
6
|
100
|
|
|
|
15
|
if (@prim_transcripts) { |
389
|
5
|
|
|
|
|
16
|
foreach $range (@prim_transcripts) { |
390
|
7
|
|
|
|
|
10
|
($start,$end,$strand)=@{$range}; |
|
7
|
|
|
|
|
44
|
|
391
|
7
|
|
|
|
|
68
|
$object=Bio::LiveSeq::Prim_Transcript->new(-seq=>$dna,-start=>$start,-end=>$end,-strand=>$strand); |
392
|
7
|
50
|
|
|
|
37
|
if ($object != -1) { push (@primtranscriptobjs,$object); } |
|
7
|
|
|
|
|
28
|
|
393
|
|
|
|
|
|
|
} |
394
|
5
|
|
|
|
|
18
|
$gene{Prim_Transcripts}=\@primtranscriptobjs; |
395
|
|
|
|
|
|
|
} |
396
|
6
|
|
|
|
|
11
|
my @repeat_regions=@{$genefeatureshash->{'repeat_regions'}}; |
|
6
|
|
|
|
|
21
|
|
397
|
6
|
|
|
|
|
12
|
my @repeat_regions_family=@{$genefeatureshash->{'repeat_regions_family'}}; |
|
6
|
|
|
|
|
19
|
|
398
|
6
|
50
|
|
|
|
18
|
if (@repeat_regions) { |
399
|
0
|
|
|
|
|
0
|
my $k=0; |
400
|
0
|
|
|
|
|
0
|
foreach $range (@repeat_regions) { |
401
|
0
|
|
|
|
|
0
|
($start,$end,$strand)=@{$range}; |
|
0
|
|
|
|
|
0
|
|
402
|
0
|
|
|
|
|
0
|
$object=Bio::LiveSeq::Repeat_Region->new(-seq=>$dna,-start=>$start,-end=>$end,-strand=>$strand); |
403
|
0
|
0
|
|
|
|
0
|
if ($object != -1) { |
404
|
0
|
|
|
|
|
0
|
$object->desc($repeat_regions_family[$k]); |
405
|
0
|
|
|
|
|
0
|
$k++; |
406
|
0
|
|
|
|
|
0
|
push (@repeatregionobjs,$object); |
407
|
|
|
|
|
|
|
} else { |
408
|
0
|
|
|
|
|
0
|
$k++; |
409
|
|
|
|
|
|
|
} |
410
|
|
|
|
|
|
|
} |
411
|
0
|
|
|
|
|
0
|
$gene{Repeat_Regions}=\@repeatregionobjs; |
412
|
|
|
|
|
|
|
} |
413
|
6
|
|
|
|
|
11
|
my @repeat_units=@{$genefeatureshash->{'repeat_units'}}; |
|
6
|
|
|
|
|
18
|
|
414
|
6
|
|
|
|
|
9
|
my @repeat_units_family=@{$genefeatureshash->{'repeat_units_family'}}; |
|
6
|
|
|
|
|
14
|
|
415
|
6
|
50
|
|
|
|
15
|
if (@repeat_units) { |
416
|
0
|
|
|
|
|
0
|
my $k=0; |
417
|
0
|
|
|
|
|
0
|
foreach $range (@repeat_units) { |
418
|
0
|
|
|
|
|
0
|
($start,$end,$strand)=@{$range}; |
|
0
|
|
|
|
|
0
|
|
419
|
0
|
|
|
|
|
0
|
$object=Bio::LiveSeq::Repeat_Unit->new(-seq=>$dna,-start=>$start,-end=>$end,-strand=>$strand); |
420
|
0
|
0
|
|
|
|
0
|
if ($object != -1) { |
421
|
0
|
|
|
|
|
0
|
$object->desc($repeat_units_family[$k]); |
422
|
0
|
|
|
|
|
0
|
$k++; |
423
|
0
|
|
|
|
|
0
|
push (@repeatunitobjs,$object); |
424
|
|
|
|
|
|
|
} else { |
425
|
0
|
|
|
|
|
0
|
$k++; |
426
|
|
|
|
|
|
|
} |
427
|
|
|
|
|
|
|
} |
428
|
0
|
|
|
|
|
0
|
$gene{Repeat_Units}=\@repeatunitobjs; |
429
|
|
|
|
|
|
|
} |
430
|
|
|
|
|
|
|
|
431
|
|
|
|
|
|
|
# create the Gene |
432
|
6
|
|
|
|
|
12
|
my $gene_name=$genefeatureshash->{'gene_name'}; # either a name or a cdspos |
433
|
6
|
|
|
|
|
77
|
return (Bio::LiveSeq::Gene->new(-name=>$gene_name,-features=>\%gene, |
434
|
|
|
|
|
|
|
-upbound=>$min,-downbound=>$max)); |
435
|
|
|
|
|
|
|
} |
436
|
|
|
|
|
|
|
|
437
|
|
|
|
|
|
|
# maybe this function will be moved to general utility package |
438
|
|
|
|
|
|
|
# argument: array of numbers |
439
|
|
|
|
|
|
|
# returns: (min,max) numbers in the array |
440
|
|
|
|
|
|
|
sub rangeofarray { |
441
|
6
|
|
|
6
|
0
|
10
|
my $self=shift; |
442
|
6
|
|
|
|
|
29
|
my @array=@_; |
443
|
|
|
|
|
|
|
#print "\n-=-=-=-=-=-=-=-=-=-=array: @array\n"; |
444
|
6
|
|
|
|
|
7
|
my ($max,$min,$element); |
445
|
6
|
|
|
|
|
13
|
$min=$max=shift(@array); |
446
|
6
|
|
|
|
|
14
|
foreach $element (@array) { |
447
|
110
|
50
|
|
|
|
118
|
$element = 0 unless defined $element; |
448
|
110
|
50
|
|
|
|
135
|
if ($element < $min) { |
449
|
0
|
|
|
|
|
0
|
$min=$element; |
450
|
|
|
|
|
|
|
} |
451
|
110
|
100
|
|
|
|
130
|
if ($element > $max) { |
452
|
6
|
|
|
|
|
11
|
$max=$element; |
453
|
|
|
|
|
|
|
} |
454
|
|
|
|
|
|
|
} |
455
|
|
|
|
|
|
|
#print "\n-=-=-=-=-=-=-=-=-=-=min: $min\tmax: $max\n"; |
456
|
6
|
|
|
|
|
29
|
return ($min,$max); |
457
|
|
|
|
|
|
|
} |
458
|
|
|
|
|
|
|
|
459
|
|
|
|
|
|
|
|
460
|
|
|
|
|
|
|
# argument: reference to DNA object, reference to array of transcripts |
461
|
|
|
|
|
|
|
# returns: an array of Translation object references |
462
|
|
|
|
|
|
|
sub transexonscreation { |
463
|
6
|
|
|
6
|
0
|
8
|
my $self=shift; |
464
|
6
|
|
|
|
|
12
|
my $dna=$_[0]; |
465
|
6
|
|
|
|
|
7
|
my @transcripts=@{$_[1]}; |
|
6
|
|
|
|
|
11
|
|
466
|
|
|
|
|
|
|
|
467
|
6
|
|
|
|
|
9
|
my (@transexons,$start,$end,$strand,$exonref,$exonobj,$transcript,$transcriptobj); |
468
|
0
|
|
|
|
|
0
|
my $translationobj; |
469
|
0
|
|
|
|
|
0
|
my @translationobjects; |
470
|
6
|
|
|
|
|
11
|
foreach $transcript (@transcripts) { |
471
|
6
|
|
|
|
|
9
|
foreach $exonref (@{$transcript}) { |
|
6
|
|
|
|
|
12
|
|
472
|
34
|
|
|
|
|
24
|
($start,$end,$strand)=@{$exonref}; |
|
34
|
|
|
|
|
70
|
|
473
|
|
|
|
|
|
|
#print "Creating Exon: start $start end $end strand $strand\n"; |
474
|
34
|
|
|
|
|
162
|
$exonobj=Bio::LiveSeq::Exon->new(-seq=>$dna,-start=>$start,-end=>$end,-strand=>$strand); |
475
|
|
|
|
|
|
|
#push (@exonobjects,$exonobj); |
476
|
34
|
|
|
|
|
72
|
push (@transexons,$exonobj); |
477
|
|
|
|
|
|
|
} |
478
|
6
|
|
|
|
|
50
|
$transcriptobj=Bio::LiveSeq::Transcript->new(-exons => \@transexons ); |
479
|
6
|
50
|
|
|
|
21
|
if ($transcriptobj != -1) { |
480
|
6
|
|
|
|
|
57
|
$translationobj=Bio::LiveSeq::Translation->new(-transcript=>$transcriptobj); |
481
|
6
|
|
|
|
|
15
|
@transexons=(); # cleans it |
482
|
|
|
|
|
|
|
#push (@transcriptobjects,$transcriptobj); |
483
|
6
|
|
|
|
|
15
|
push (@translationobjects,$translationobj); |
484
|
|
|
|
|
|
|
} |
485
|
|
|
|
|
|
|
} |
486
|
6
|
|
|
|
|
19
|
return (@translationobjects); |
487
|
|
|
|
|
|
|
} |
488
|
|
|
|
|
|
|
|
489
|
|
|
|
|
|
|
#sub printgene { |
490
|
|
|
|
|
|
|
# deleted. Some functionality placed in Gene->printfeaturesnum |
491
|
|
|
|
|
|
|
|
492
|
|
|
|
|
|
|
=head2 printswissprot |
493
|
|
|
|
|
|
|
|
494
|
|
|
|
|
|
|
Title : printswissprot |
495
|
|
|
|
|
|
|
Usage : $loader->printswissprot($hashref); |
496
|
|
|
|
|
|
|
Function: prints out all information loaded from a database entry into the |
497
|
|
|
|
|
|
|
loader. Mainly used for testing purposes. |
498
|
|
|
|
|
|
|
Args : a hashref containing the SWISSPROT entry datas |
499
|
|
|
|
|
|
|
Note : the hashref can be obtained with a call to the method |
500
|
|
|
|
|
|
|
$loader->get_swisshash() (BioPerl via Bio::DB::EMBL.pm) |
501
|
|
|
|
|
|
|
that takes as argument a string like "SWISS-PROT:P10275" |
502
|
|
|
|
|
|
|
|
503
|
|
|
|
|
|
|
=cut |
504
|
|
|
|
|
|
|
|
505
|
|
|
|
|
|
|
# argument: hashref containing the SWISSPROT entry datas |
506
|
|
|
|
|
|
|
# prints out that hash, showing the information loaded |
507
|
|
|
|
|
|
|
sub printswissprot { |
508
|
0
|
|
|
0
|
1
|
0
|
my ($self,$entry)=@_; |
509
|
0
|
0
|
|
|
|
0
|
unless ($entry) { |
510
|
0
|
|
|
|
|
0
|
return; |
511
|
|
|
|
|
|
|
} |
512
|
|
|
|
|
|
|
printf "ID: %s\n", |
513
|
0
|
|
|
|
|
0
|
$entry->{'ID'}; |
514
|
|
|
|
|
|
|
printf "ACC: %s\n", |
515
|
0
|
|
|
|
|
0
|
$entry->{'AccNumber'}; |
516
|
|
|
|
|
|
|
printf "GENE: %s\n", |
517
|
0
|
|
|
|
|
0
|
$entry->{'Gene'}; |
518
|
|
|
|
|
|
|
printf "DES: %s\n", |
519
|
0
|
|
|
|
|
0
|
$entry->{'Description'}; |
520
|
|
|
|
|
|
|
printf "ORG: %s\n", |
521
|
0
|
|
|
|
|
0
|
$entry->{'Organism'}; |
522
|
|
|
|
|
|
|
printf "SEQLN: %s\n", |
523
|
0
|
|
|
|
|
0
|
$entry->{'SeqLength'}; |
524
|
|
|
|
|
|
|
printf "SEQ: %s\n", |
525
|
0
|
|
|
|
|
0
|
substr($entry->{'Sequence'},0,64); |
526
|
0
|
0
|
|
|
|
0
|
if ($entry->{'Features'}) { |
527
|
0
|
|
|
|
|
0
|
my @features=@{$entry->{'Features'}}; |
|
0
|
|
|
|
|
0
|
|
528
|
0
|
|
|
|
|
0
|
my $i; |
529
|
0
|
|
|
|
|
0
|
for $i (0..$#features) { |
530
|
0
|
|
|
|
|
0
|
print "|",$features[$i]->{'name'},"|"; |
531
|
0
|
|
|
|
|
0
|
print " at ",$features[$i]->{'location'},": "; |
532
|
0
|
|
|
|
|
0
|
print "",$features[$i]->{'desc'},"\n"; |
533
|
|
|
|
|
|
|
} |
534
|
|
|
|
|
|
|
} |
535
|
|
|
|
|
|
|
} |
536
|
|
|
|
|
|
|
|
537
|
|
|
|
|
|
|
=head2 printembl |
538
|
|
|
|
|
|
|
|
539
|
|
|
|
|
|
|
Title : printembl |
540
|
|
|
|
|
|
|
Usage : $loader->printembl(); |
541
|
|
|
|
|
|
|
Function: prints out all information loaded from a database entry into the |
542
|
|
|
|
|
|
|
loader. Mainly used for testing purposes. |
543
|
|
|
|
|
|
|
Args : none |
544
|
|
|
|
|
|
|
|
545
|
|
|
|
|
|
|
=cut |
546
|
|
|
|
|
|
|
|
547
|
|
|
|
|
|
|
# argument: hashref containing the EMBL entry datas |
548
|
|
|
|
|
|
|
# prints out that hash, showing the information loaded |
549
|
|
|
|
|
|
|
sub printembl { |
550
|
0
|
|
|
0
|
1
|
0
|
my ($self,$entry)=@_; |
551
|
0
|
0
|
|
|
|
0
|
unless ($entry) { |
552
|
0
|
|
|
|
|
0
|
$entry=$self->{'hash'}; |
553
|
|
|
|
|
|
|
} |
554
|
0
|
|
|
|
|
0
|
my ($i,$featurename); |
555
|
|
|
|
|
|
|
printf "ID: %s\n", |
556
|
0
|
|
|
|
|
0
|
$entry->{'ID'}; |
557
|
|
|
|
|
|
|
printf "ACC: %s\n", |
558
|
0
|
|
|
|
|
0
|
$entry->{'AccNumber'}; |
559
|
|
|
|
|
|
|
printf "MOL: %s\n", |
560
|
0
|
|
|
|
|
0
|
$entry->{'Molecule'}; |
561
|
|
|
|
|
|
|
printf "DIV: %s\n", |
562
|
0
|
|
|
|
|
0
|
$entry->{'Division'}; |
563
|
|
|
|
|
|
|
printf "DES: %s\n", |
564
|
0
|
|
|
|
|
0
|
$entry->{'Description'}; |
565
|
|
|
|
|
|
|
printf "ORG: %s\n", |
566
|
0
|
|
|
|
|
0
|
$entry->{'Organism'}; |
567
|
|
|
|
|
|
|
printf "SEQLN: %s\n", |
568
|
0
|
|
|
|
|
0
|
$entry->{'SeqLength'}; |
569
|
|
|
|
|
|
|
printf "SEQ: %s\n", |
570
|
0
|
|
|
|
|
0
|
substr($entry->{'Sequence'},0,64); |
571
|
0
|
|
|
|
|
0
|
my @features=@{$entry->{'Features'}}; |
|
0
|
|
|
|
|
0
|
|
572
|
0
|
|
|
|
|
0
|
my @cds=@{$entry->{'CDS'}}; |
|
0
|
|
|
|
|
0
|
|
573
|
0
|
|
|
|
|
0
|
print "\nFEATURES\nNumber of CDS: ",scalar(@cds)," (out of ",$entry->{'FeaturesNumber'}, " total features)\n"; |
574
|
0
|
|
|
|
|
0
|
my ($exonref,$transcript); |
575
|
0
|
|
|
|
|
0
|
my ($qualifiernumber,$qualifiers,$key); |
576
|
0
|
|
|
|
|
0
|
my ($start,$end,$strand); |
577
|
0
|
|
|
|
|
0
|
my $j=0; |
578
|
0
|
|
|
|
|
0
|
for $i (0..$#features) { |
579
|
0
|
|
|
|
|
0
|
$featurename=$features[$i]->{'name'}; |
580
|
0
|
0
|
|
|
|
0
|
if ($featurename eq "CDS") { |
581
|
0
|
|
|
|
|
0
|
print "|CDS| number $j at feature position: $i\n"; |
582
|
|
|
|
|
|
|
#print $features[$i]->{'location'},"\n"; |
583
|
0
|
|
|
|
|
0
|
$transcript=$features[$i]->{'range'}; |
584
|
0
|
|
|
|
|
0
|
foreach $exonref (@{$transcript}) { |
|
0
|
|
|
|
|
0
|
|
585
|
0
|
|
|
|
|
0
|
($start,$end,$strand)=@{$exonref}; |
|
0
|
|
|
|
|
0
|
|
586
|
0
|
|
|
|
|
0
|
print "\tExon: start $start end $end strand $strand\n"; |
587
|
|
|
|
|
|
|
} |
588
|
0
|
|
|
|
|
0
|
$j++; |
589
|
|
|
|
|
|
|
} else { |
590
|
0
|
|
|
|
|
0
|
print "|$featurename| at feature position: $i\n"; |
591
|
0
|
|
|
|
|
0
|
print "\trange: "; |
592
|
0
|
|
|
|
|
0
|
print join("\t",@{$features[$i]->{'range'}}),"\n"; |
|
0
|
|
|
|
|
0
|
|
593
|
|
|
|
|
|
|
#print $features[$i]->{'location'},"\n"; |
594
|
|
|
|
|
|
|
} |
595
|
0
|
|
|
|
|
0
|
$qualifiernumber=$features[$i]->{'qual_number'}; |
596
|
0
|
|
|
|
|
0
|
$qualifiers=$features[$i]->{'qualifiers'}; # hash |
597
|
0
|
|
|
|
|
0
|
foreach $key (keys (%{$qualifiers})) { |
|
0
|
|
|
|
|
0
|
|
598
|
0
|
|
|
|
|
0
|
print "\t\t",$key,": "; |
599
|
0
|
|
|
|
|
0
|
print $qualifiers->{$key},"\n"; |
600
|
|
|
|
|
|
|
} |
601
|
|
|
|
|
|
|
} |
602
|
|
|
|
|
|
|
} |
603
|
|
|
|
|
|
|
|
604
|
|
|
|
|
|
|
=head2 genes |
605
|
|
|
|
|
|
|
|
606
|
|
|
|
|
|
|
Title : genes |
607
|
|
|
|
|
|
|
Usage : $loader->genes(); |
608
|
|
|
|
|
|
|
Function: Returns an array of gene_names (strings) contained in the loaded |
609
|
|
|
|
|
|
|
entry. |
610
|
|
|
|
|
|
|
Args : none |
611
|
|
|
|
|
|
|
|
612
|
|
|
|
|
|
|
=cut |
613
|
|
|
|
|
|
|
|
614
|
|
|
|
|
|
|
# argument: entryhashref |
615
|
|
|
|
|
|
|
# returns: array of genenames found in the entry |
616
|
|
|
|
|
|
|
sub genes { |
617
|
0
|
|
|
0
|
1
|
0
|
my ($self,$entry)=@_; |
618
|
0
|
0
|
|
|
|
0
|
unless ($entry) { |
619
|
0
|
|
|
|
|
0
|
$entry=$self->{'hash'}; |
620
|
|
|
|
|
|
|
} |
621
|
0
|
|
|
|
|
0
|
my @entryfeatures=@{$entry->{'Features'}}; |
|
0
|
|
|
|
|
0
|
|
622
|
0
|
|
|
|
|
0
|
my ($genename,$genenames,$entryfeature); |
623
|
0
|
|
|
|
|
0
|
for $entryfeature (@entryfeatures) { |
624
|
0
|
|
|
|
|
0
|
$genename=$entryfeature->{'qualifiers'}->{'gene'}; |
625
|
0
|
0
|
|
|
|
0
|
if ($genename) { |
626
|
0
|
0
|
|
|
|
0
|
if (index($genenames,$genename) == -1) { # if name is new |
627
|
0
|
|
|
|
|
0
|
$genenames .= $genename . " "; # add the name |
628
|
|
|
|
|
|
|
} |
629
|
|
|
|
|
|
|
} |
630
|
|
|
|
|
|
|
} |
631
|
0
|
|
|
|
|
0
|
return (split(/ /,$genenames)); # assumes no space inbetween each genename |
632
|
|
|
|
|
|
|
} |
633
|
|
|
|
|
|
|
|
634
|
|
|
|
|
|
|
# arguments: swisshash, translation objref |
635
|
|
|
|
|
|
|
# adds information to the Translation, creates AARange objects, sets the |
636
|
|
|
|
|
|
|
# aa_range attribute on the Translation, pointing to those objects |
637
|
|
|
|
|
|
|
sub swisshash2liveseq { |
638
|
0
|
|
|
0
|
0
|
0
|
my ($self,$entry,$translation)=@_; |
639
|
0
|
|
|
|
|
0
|
my $translength=$translation->length; |
640
|
0
|
|
|
|
|
0
|
$translation->desc($translation->desc . $entry->{'Description'}); |
641
|
0
|
|
|
|
|
0
|
$translation->display_id("SWISSPROT:" . $entry->{'ID'}); |
642
|
0
|
|
|
|
|
0
|
$translation->accession_number("SWISSPROT:" . $entry->{'AccNumber'}); |
643
|
0
|
|
|
|
|
0
|
$translation->name($entry->{'Gene'}); |
644
|
0
|
|
|
|
|
0
|
$translation->source($entry->{'Organism'}); |
645
|
0
|
|
|
|
|
0
|
my @aarangeobjects; |
646
|
0
|
|
|
|
|
0
|
my ($start,$end,$aarangeobj,$feature); |
647
|
0
|
|
|
|
|
0
|
my @features; my @newfeatures; |
|
0
|
|
|
|
|
0
|
|
648
|
0
|
0
|
|
|
|
0
|
if ($entry->{'Features'}) { |
649
|
0
|
|
|
|
|
0
|
@features=@{$entry->{'Features'}}; |
|
0
|
|
|
|
|
0
|
|
650
|
|
|
|
|
|
|
} |
651
|
0
|
|
|
|
|
0
|
my $cleavedmet=0; |
652
|
|
|
|
|
|
|
# check for cleaved Met |
653
|
0
|
|
|
|
|
0
|
foreach $feature (@features) { |
654
|
0
|
0
|
0
|
|
|
0
|
if (($feature->{'name'} eq "INIT_MET")&&($feature->{'location'} eq "0 0")) { |
655
|
0
|
|
|
|
|
0
|
$cleavedmet=1; |
656
|
0
|
|
|
|
|
0
|
$translation->{'offset'}="1"; # from swissprot to liveseq protein sequence |
657
|
|
|
|
|
|
|
} else { |
658
|
0
|
|
|
|
|
0
|
push(@newfeatures,$feature); |
659
|
|
|
|
|
|
|
} |
660
|
|
|
|
|
|
|
} |
661
|
|
|
|
|
|
|
|
662
|
0
|
|
|
|
|
0
|
my $swissseq=$entry->{'Sequence'}; |
663
|
0
|
|
|
|
|
0
|
my $liveseqtransl=$translation->seq; |
664
|
0
|
|
|
|
|
0
|
chop $liveseqtransl; # to take away the trailing STOP "*" |
665
|
0
|
|
|
|
|
0
|
my $translated=substr($liveseqtransl,$cleavedmet); |
666
|
|
|
|
|
|
|
|
667
|
0
|
|
|
|
|
0
|
my ($liveseq_aa,$swiss_aa,$codes_aa)=$self->_get_alignment($translated,$swissseq); # alignment after cleavage of possible initial met |
668
|
|
|
|
|
|
|
|
669
|
0
|
0
|
0
|
|
|
0
|
if ((index($liveseq_aa,"-") != -1)||(index($swiss_aa,"-") != -1)) { # there are gaps, how to proceed? |
670
|
0
|
|
|
|
|
0
|
print "LIVE-SEQ=\'$liveseq_aa\'\nIDENTITY=\'$codes_aa\'\nSWS-PROT=\'$swiss_aa\'\n"; |
671
|
0
|
|
|
|
|
0
|
carp "Nucleotides translation and SwissProt translation are different in size, cannot attach the SwissSequence to the EMBL one, cannot add any AminoAcidRange object/Domain information!"; |
672
|
0
|
|
|
|
|
0
|
return; |
673
|
|
|
|
|
|
|
} |
674
|
|
|
|
|
|
|
|
675
|
|
|
|
|
|
|
#my $i=0; # debug |
676
|
0
|
|
|
|
|
0
|
@features=@newfeatures; |
677
|
0
|
|
|
|
|
0
|
foreach $feature (@features) { |
678
|
|
|
|
|
|
|
#print "Processing SwissProtFeature: $i\n"; # debug |
679
|
0
|
|
|
|
|
0
|
($start,$end)=split(/ /,$feature->{'location'}); |
680
|
|
|
|
|
|
|
# Note: cleavedmet is taken in account for updating numbering |
681
|
0
|
|
|
|
|
0
|
$aarangeobj=Bio::LiveSeq::AARange->new(-start => $start+$cleavedmet, -end => $end+$cleavedmet, -name => $feature->{'name'}, -description => $feature->{'description'}, -translation => $translation, -translength => $translength); |
682
|
0
|
0
|
|
|
|
0
|
if ($aarangeobj != -1) { |
683
|
0
|
|
|
|
|
0
|
push(@aarangeobjects,$aarangeobj); |
684
|
|
|
|
|
|
|
} |
685
|
|
|
|
|
|
|
# $i++; # debug |
686
|
|
|
|
|
|
|
} |
687
|
0
|
|
|
|
|
0
|
$translation->{'aa_ranges'}=\@aarangeobjects; |
688
|
|
|
|
|
|
|
} |
689
|
|
|
|
|
|
|
|
690
|
|
|
|
|
|
|
# if there is no SRS support, the default will be to return 0 |
691
|
|
|
|
|
|
|
# i.e. this function is overridden in SRS package |
692
|
|
|
|
|
|
|
sub get_swisshash { |
693
|
6
|
|
|
6
|
0
|
9
|
return (0); |
694
|
|
|
|
|
|
|
} |
695
|
|
|
|
|
|
|
|
696
|
|
|
|
|
|
|
# Args: $entry hashref, gene_name OR cds_position (undef is used to |
697
|
|
|
|
|
|
|
# choose between the two), getswissprotinfo boolean flag |
698
|
|
|
|
|
|
|
# Returns: an hash holding various arrayref used in the hash2gene method |
699
|
|
|
|
|
|
|
# Function: examines the nucleotide entry, identifying features belonging |
700
|
|
|
|
|
|
|
# to the gene (defined either by its name or by the position of its CDS in |
701
|
|
|
|
|
|
|
# the entry) |
702
|
|
|
|
|
|
|
|
703
|
|
|
|
|
|
|
sub _findgenefeatures { |
704
|
6
|
|
|
6
|
|
10
|
my ($self,$entry,$gene_name,$cds_position,$getswissprotinfo)=@_; |
705
|
|
|
|
|
|
|
|
706
|
6
|
|
|
|
|
9
|
my @entryfeatures=@{$entry->{'Features'}}; |
|
6
|
|
|
|
|
21
|
|
707
|
6
|
|
|
|
|
10
|
my @exons; my @introns; my @prim_transcripts; my @transcripts; |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
708
|
0
|
|
|
|
|
0
|
my @repeat_units; my @repeat_regions; |
|
0
|
|
|
|
|
0
|
|
709
|
0
|
|
|
|
|
0
|
my @repeat_units_family; my @repeat_regions_family; my $rpt_family; |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
710
|
0
|
|
|
|
|
0
|
my $entryfeature; my @genefeatures; |
|
0
|
|
|
|
|
0
|
|
711
|
0
|
|
|
|
|
0
|
my $desc; my @exondescs; my @introndescs; |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
712
|
|
|
|
|
|
|
|
713
|
|
|
|
|
|
|
# for swissprot xreference |
714
|
0
|
|
|
|
|
0
|
my ($swissacc,$swisshash); my @swisshashes; |
|
0
|
|
|
|
|
0
|
|
715
|
|
|
|
|
|
|
|
716
|
|
|
|
|
|
|
# for translation_tables |
717
|
0
|
|
|
|
|
0
|
my @ttables; |
718
|
|
|
|
|
|
|
|
719
|
|
|
|
|
|
|
# to create labels |
720
|
0
|
|
|
|
|
0
|
my ($name,$exon); |
721
|
0
|
|
|
|
|
0
|
my @range; my @cdsexons; my @labels; |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
722
|
|
|
|
|
|
|
|
723
|
|
|
|
|
|
|
# maybe here also could be added special case when there is no CDS feature |
724
|
|
|
|
|
|
|
# in the entry (e.g. tRNA entry -> TOCHECK). |
725
|
|
|
|
|
|
|
# let's deal with the special case in which there is just one gene per entry |
726
|
|
|
|
|
|
|
# usually without /gene qualifier |
727
|
6
|
|
|
|
|
10
|
my @cds=@{$entry->{'CDS'}}; |
|
6
|
|
|
|
|
12
|
|
728
|
|
|
|
|
|
|
|
729
|
6
|
|
|
|
|
13
|
my $skipgenematch=0; |
730
|
6
|
50
|
|
|
|
18
|
if (scalar(@cds) == 1) { |
731
|
|
|
|
|
|
|
#carp "Note: only one CDS in this entry. Treating all features found in entry as Gene features."; |
732
|
6
|
|
|
|
|
6
|
$skipgenematch=1; |
733
|
|
|
|
|
|
|
} |
734
|
|
|
|
|
|
|
|
735
|
6
|
|
|
|
|
10
|
my ($cds_begin,$cds_end,$proximity); |
736
|
6
|
50
|
|
|
|
16
|
if ($cds_position) { # if a position has been requested |
737
|
0
|
|
|
|
|
0
|
my @cds_exons=@{$cds[$cds_position-1]->{'range'}}; |
|
0
|
|
|
|
|
0
|
|
738
|
0
|
|
|
|
|
0
|
($cds_begin,$cds_end)=($cds_exons[0]->[0],$cds_exons[-1]->[1]); # begin and end of CDS |
739
|
0
|
|
|
|
|
0
|
$gene_name=$cds[$cds_position-1]->{'qualifiers'}->{'gene'}; |
740
|
|
|
|
|
|
|
# DEBUG |
741
|
0
|
0
|
|
|
|
0
|
unless ($skipgenematch) { |
742
|
0
|
|
|
|
|
0
|
carp "--DEBUG-- cdsbegin $cds_begin cdsend $cds_end--------"; |
743
|
|
|
|
|
|
|
} |
744
|
0
|
|
|
|
|
0
|
$proximity=100; # proximity CONSTANT to decide whether a feature "belongs" to the CDS |
745
|
|
|
|
|
|
|
} |
746
|
|
|
|
|
|
|
|
747
|
6
|
|
|
|
|
8
|
for $entryfeature (@entryfeatures) { # get only features for the desired gene |
748
|
30
|
0
|
0
|
|
|
42
|
if (($skipgenematch)||(($cds_position)&&($self->_checkfeatureproximity($entryfeature->{'range'},$cds_begin,$cds_end,$proximity)))||(!($cds_position)&&($entryfeature->{'qualifiers'}->{'gene'} eq "$gene_name"))) { |
|
|
|
33
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
749
|
30
|
|
|
|
|
31
|
push(@genefeatures,$entryfeature); |
750
|
|
|
|
|
|
|
|
751
|
30
|
|
|
|
|
20
|
my @range=@{$entryfeature->{'range'}}; |
|
30
|
|
|
|
|
59
|
|
752
|
30
|
|
|
|
|
34
|
$name=$entryfeature->{'name'}; |
753
|
30
|
|
|
|
|
21
|
my %qualifierhash=%{$entryfeature->{'qualifiers'}}; |
|
30
|
|
|
|
|
70
|
|
754
|
30
|
100
|
|
|
|
44
|
if ($name eq "CDS") { # that has range containing array of exons |
755
|
|
|
|
|
|
|
|
756
|
|
|
|
|
|
|
# swissprot crossindexing (if without SRS support it will fill array |
757
|
|
|
|
|
|
|
# with zeros and do nothing |
758
|
6
|
50
|
|
|
|
14
|
if ($getswissprotinfo) { |
759
|
6
|
|
|
|
|
13
|
$swissacc=$entryfeature->{'qualifiers'}->{'db_xref'}; |
760
|
6
|
|
|
|
|
26
|
$swisshash=$self->get_swisshash($swissacc); |
761
|
|
|
|
|
|
|
#$self->printswissprot($swisshash); # DEBUG |
762
|
6
|
|
|
|
|
11
|
push (@swisshashes,$swisshash); |
763
|
|
|
|
|
|
|
} |
764
|
|
|
|
|
|
|
|
765
|
6
|
|
|
|
|
15
|
push (@ttables,$entryfeature->{'qualifiers'}->{'transl_table'}); # undef if not specified |
766
|
|
|
|
|
|
|
|
767
|
|
|
|
|
|
|
# create labels array |
768
|
6
|
|
|
|
|
11
|
for $exon (@range) { |
769
|
34
|
|
|
|
|
58
|
push(@labels,$exon->[0],$exon->[1]); # start and end of every exon of the CDS |
770
|
|
|
|
|
|
|
} |
771
|
6
|
|
|
|
|
20
|
push (@transcripts,$entryfeature->{'range'}); |
772
|
|
|
|
|
|
|
} else { |
773
|
|
|
|
|
|
|
# "simplifying" the joinedlocation features. I.e. changing them from |
774
|
|
|
|
|
|
|
# multijoined ones to simple plain start-end features, taking only |
775
|
|
|
|
|
|
|
# the start of the first "exon" and the end of the last "exon" as |
776
|
|
|
|
|
|
|
# start and end of the entire feature |
777
|
24
|
50
|
33
|
|
|
48
|
if ($entryfeature->{'locationtype'} && $entryfeature->{'locationtype'} eq "joined") { # joined location |
778
|
0
|
|
|
|
|
0
|
@range=($range[0]->[0],$range[-1]->[1]); |
779
|
|
|
|
|
|
|
} |
780
|
24
|
|
|
|
|
31
|
push(@labels,$range[0],$range[1]); # start and end of every feature |
781
|
24
|
100
|
|
|
|
39
|
if ($name eq "exon") { |
782
|
9
|
|
|
|
|
11
|
$desc=$entryfeature->{'qualifiers'}->{'number'}; |
783
|
9
|
100
|
|
|
|
12
|
if ($entryfeature->{'qualifiers'}->{'note'}) { |
784
|
3
|
50
|
|
|
|
4
|
if ($desc) { |
785
|
0
|
|
|
|
|
0
|
$desc .= "|" . $entryfeature->{'qualifiers'}->{'note'}; |
786
|
|
|
|
|
|
|
} else { |
787
|
3
|
|
|
|
|
3
|
$desc = $entryfeature->{'qualifiers'}->{'note'}; |
788
|
|
|
|
|
|
|
} |
789
|
|
|
|
|
|
|
} |
790
|
9
|
|
50
|
|
|
14
|
push (@exondescs,$desc || "unknown"); |
791
|
9
|
|
|
|
|
8
|
push(@exons,\@range); |
792
|
|
|
|
|
|
|
} |
793
|
24
|
100
|
|
|
|
37
|
if ($name eq "intron") { |
794
|
8
|
|
|
|
|
7
|
$desc=$entryfeature->{'qualifiers'}->{'number'}; |
795
|
8
|
50
|
|
|
|
7
|
if ($desc) { |
796
|
0
|
|
|
|
|
0
|
$desc .= "|" . $entryfeature->{'qualifiers'}->{'note'}; |
797
|
|
|
|
|
|
|
} else { |
798
|
8
|
|
|
|
|
9
|
$desc = $entryfeature->{'qualifiers'}->{'note'}; |
799
|
|
|
|
|
|
|
} |
800
|
8
|
|
50
|
|
|
10
|
push (@introndescs,$desc || "unknown"); |
801
|
8
|
|
|
|
|
15
|
push(@introns,\@range); |
802
|
|
|
|
|
|
|
} |
803
|
24
|
100
|
100
|
|
|
78
|
if (($name eq "prim_transcript")||($name eq "mRNA")) { push(@prim_transcripts,\@range); } |
|
7
|
|
|
|
|
14
|
|
804
|
24
|
50
|
|
|
|
36
|
if ($name eq "repeat_unit") { push(@repeat_units,\@range); |
|
0
|
|
|
|
|
0
|
|
805
|
0
|
|
|
|
|
0
|
$rpt_family=$entryfeature->{'qualifiers'}->{'rpt_family'}; |
806
|
0
|
|
0
|
|
|
0
|
push (@repeat_units_family,$rpt_family || "unknown"); |
807
|
|
|
|
|
|
|
} |
808
|
24
|
50
|
|
|
|
47
|
if ($name eq "repeat_region") { push(@repeat_regions,\@range); |
|
0
|
|
|
|
|
0
|
|
809
|
0
|
|
|
|
|
0
|
$rpt_family=$entryfeature->{'qualifiers'}->{'rpt_family'}; |
810
|
0
|
|
0
|
|
|
0
|
push (@repeat_regions_family,$rpt_family || "unknown"); |
811
|
|
|
|
|
|
|
} |
812
|
|
|
|
|
|
|
} |
813
|
|
|
|
|
|
|
} |
814
|
|
|
|
|
|
|
} |
815
|
6
|
50
|
|
|
|
18
|
unless ($gene_name) { $gene_name="cds-position:".$cds_position; } |
|
0
|
|
|
|
|
0
|
|
816
|
6
|
|
|
|
|
8
|
my %genefeatureshash; |
817
|
6
|
|
|
|
|
13
|
$genefeatureshash{gene_name}=$gene_name; |
818
|
6
|
|
|
|
|
13
|
$genefeatureshash{genefeatures}=\@genefeatures; |
819
|
6
|
|
|
|
|
11
|
$genefeatureshash{labels}=\@labels; |
820
|
6
|
|
|
|
|
13
|
$genefeatureshash{ttables}=\@ttables; |
821
|
6
|
|
|
|
|
13
|
$genefeatureshash{swisshashes}=\@swisshashes; |
822
|
6
|
|
|
|
|
11
|
$genefeatureshash{transcripts}=\@transcripts; |
823
|
6
|
|
|
|
|
11
|
$genefeatureshash{exons}=\@exons; |
824
|
6
|
|
|
|
|
16
|
$genefeatureshash{exondescs}=\@exondescs; |
825
|
6
|
|
|
|
|
11
|
$genefeatureshash{introns}=\@introns; |
826
|
6
|
|
|
|
|
8
|
$genefeatureshash{introndescs}=\@introndescs; |
827
|
6
|
|
|
|
|
10
|
$genefeatureshash{prim_transcripts}=\@prim_transcripts; |
828
|
6
|
|
|
|
|
12
|
$genefeatureshash{repeat_units}=\@repeat_units; |
829
|
6
|
|
|
|
|
9
|
$genefeatureshash{repeat_regions}=\@repeat_regions; |
830
|
6
|
|
|
|
|
12
|
$genefeatureshash{repeat_units_family}=\@repeat_units_family; |
831
|
6
|
|
|
|
|
16
|
$genefeatureshash{repeat_regions_family}=\@repeat_regions_family; |
832
|
6
|
|
|
|
|
24
|
return (\%genefeatureshash); |
833
|
|
|
|
|
|
|
} |
834
|
|
|
|
|
|
|
|
835
|
|
|
|
|
|
|
|
836
|
|
|
|
|
|
|
# used by _findgenefeatures, when a CDS at a certain position is requested, |
837
|
|
|
|
|
|
|
# to retrieve only features quite close to the wanted CDS. |
838
|
|
|
|
|
|
|
# Args: range hashref, begin and end positions of the CDS, $proximity |
839
|
|
|
|
|
|
|
# $proximity holds the maximum distance between the extremes of the CDS |
840
|
|
|
|
|
|
|
# and of the feature under exam. |
841
|
|
|
|
|
|
|
# Returns: boolean |
842
|
|
|
|
|
|
|
sub _checkfeatureproximity { |
843
|
0
|
|
|
0
|
|
|
my ($self,$range,$cds_begin,$cds_end,$proximity)=@_; |
844
|
0
|
|
|
|
|
|
my @range=@{$range}; |
|
0
|
|
|
|
|
|
|
845
|
0
|
|
|
|
|
|
my ($begin,$end,$strand); |
846
|
0
|
0
|
|
|
|
|
if (ref($range[0]) eq "ARRAY") { # like in CDS, whose range equivals to exons |
847
|
0
|
|
|
|
|
|
($begin,$end,$strand)=($range[0]->[0],$range[-1]->[1],$range[0]->[2]); |
848
|
|
|
|
|
|
|
} else { |
849
|
0
|
|
|
|
|
|
($begin,$end,$strand)=@range; |
850
|
|
|
|
|
|
|
} |
851
|
0
|
0
|
|
|
|
|
if ($cds_begin > $cds_end) { # i.e. reverse strand CDS |
852
|
0
|
|
|
|
|
|
($cds_begin,$cds_end)=($cds_end,$cds_begin); # swap boundaries |
853
|
|
|
|
|
|
|
} |
854
|
0
|
0
|
|
|
|
|
if ($strand == -1) { # reverse strand |
855
|
0
|
|
|
|
|
|
($begin,$end)=($end,$begin); # swap boundaries |
856
|
|
|
|
|
|
|
} |
857
|
0
|
0
|
|
|
|
|
if (($cds_begin-$end)>$proximity) { |
858
|
0
|
|
|
|
|
|
carp "--DEBUG-- feature rejected: begin $begin end $end -------"; |
859
|
0
|
|
|
|
|
|
return (0); |
860
|
|
|
|
|
|
|
} |
861
|
0
|
0
|
|
|
|
|
if (($begin-$cds_end)>$proximity) { |
862
|
0
|
|
|
|
|
|
carp "--DEBUG-- feature rejected: begin $begin end $end -------"; |
863
|
0
|
|
|
|
|
|
return (0); |
864
|
|
|
|
|
|
|
} |
865
|
0
|
|
|
|
|
|
carp "--DEBUG-- feature accepted: begin $begin end $end -------"; |
866
|
0
|
|
|
|
|
|
return (1); # otherwise ok, feature considered next to CDS |
867
|
|
|
|
|
|
|
} |
868
|
|
|
|
|
|
|
|
869
|
|
|
|
|
|
|
|
870
|
|
|
|
|
|
|
# function that calls the external program "align" (on the fasta2 package) |
871
|
|
|
|
|
|
|
# to create an alignment between two sequences, returning the aligned |
872
|
|
|
|
|
|
|
# strings and the codes for the identity (:: ::::) |
873
|
|
|
|
|
|
|
|
874
|
|
|
|
|
|
|
sub _get_alignment { |
875
|
0
|
|
|
0
|
|
|
my ($self,$seq1,$seq2)=@_; |
876
|
|
|
|
|
|
|
|
877
|
0
|
0
|
|
|
|
|
my $null = ($^O =~ m/mswin/i) ? 'NUL' : '/dev/null'; |
878
|
0
|
|
|
|
|
|
my $fastafile1="/tmp/tmpfastafile1"; |
879
|
0
|
|
|
|
|
|
my $fastafile2="/tmp/tmpfastafile2"; |
880
|
0
|
|
|
|
|
|
my $grepcut='egrep -v "[[:digit:]]|^ *$|sequences" | cut -c8-'; # grep/cut |
881
|
0
|
|
|
|
|
|
my $alignprogram="/usr/local/etc/bioinfo/fasta2/align -s /usr/local/etc/bioinfo/fasta2/idnaa.mat $fastafile1 $fastafile2 2>$null | $grepcut"; # ALIGN |
882
|
0
|
0
|
|
|
|
|
open my $TMPFASTAFILE1, '>', $fastafile1 or croak "Could not write file '$fastafile1' for aa alignment: $!"; |
883
|
0
|
0
|
|
|
|
|
open my $TMPFASTAFILE2, '>', $fastafile2 or croak "Could not write file '$fastafile2' for aa alignment: $!"; |
884
|
0
|
|
|
|
|
|
print $TMPFASTAFILE1 ">firstseq\n$seq1\n"; |
885
|
0
|
|
|
|
|
|
print $TMPFASTAFILE2 ">secondseq\n$seq2\n"; |
886
|
0
|
|
|
|
|
|
close $TMPFASTAFILE1; |
887
|
0
|
|
|
|
|
|
close $TMPFASTAFILE2; |
888
|
0
|
|
|
|
|
|
my $alignment=`$alignprogram`; |
889
|
0
|
|
|
|
|
|
my @alignlines=split(/\n/,$alignment); |
890
|
0
|
|
|
|
|
|
my ($linecount,$seq1_aligned,$seq2_aligned,$codes); |
891
|
0
|
|
|
|
|
|
for ($linecount=0; $linecount < @alignlines; $linecount+=3) { |
892
|
0
|
|
|
|
|
|
$seq1_aligned .= $alignlines[$linecount]; |
893
|
0
|
|
|
|
|
|
$codes .= $alignlines[$linecount+1]; |
894
|
0
|
|
|
|
|
|
$seq2_aligned .= $alignlines[$linecount+2]; |
895
|
|
|
|
|
|
|
} |
896
|
0
|
|
|
|
|
|
return ($seq1_aligned,$seq2_aligned,$codes); |
897
|
|
|
|
|
|
|
} |
898
|
|
|
|
|
|
|
|
899
|
|
|
|
|
|
|
# common part of the function to create a novel liveseq gene structure |
900
|
|
|
|
|
|
|
# from an amino acid sequence, using codon usage frequencies |
901
|
|
|
|
|
|
|
# args: codon_usage_array transltableid aasequence gene_name |
902
|
|
|
|
|
|
|
sub _common_novelaasequence2gene { |
903
|
0
|
|
|
0
|
|
|
my ($species_codon_usage,$ttabid,$aasequence,$gene_name)=@_; |
904
|
0
|
|
|
|
|
|
my @species_codon_usage=@{$species_codon_usage}; |
|
0
|
|
|
|
|
|
|
905
|
0
|
|
|
|
|
|
my @codon_usage_label= |
906
|
|
|
|
|
|
|
qw (cga cgc cgg cgt aga agg cta ctc ctg ctt tta ttg tca tcc tcg |
907
|
|
|
|
|
|
|
tct agc agt aca acc acg act cca ccc ccg cct gca gcc gcg gct gga |
908
|
|
|
|
|
|
|
ggc ggg ggt gta gtc gtg gtt aaa aag aac aat caa cag cac cat gaa |
909
|
|
|
|
|
|
|
gag gac gat tac tat tgc tgt ttc ttt ata atc att atg tgg taa tag |
910
|
|
|
|
|
|
|
tga); |
911
|
0
|
|
|
|
|
|
my ($i,$j); |
912
|
0
|
|
|
|
|
|
my %codon_usage_value; |
913
|
0
|
|
|
|
|
|
my $aa_codon_total; |
914
|
0
|
|
|
|
|
|
for ($i=0;$i<64;$i++) { |
915
|
0
|
|
|
|
|
|
$codon_usage_value{$codon_usage_label[$i]}=$species_codon_usage[$i]; |
916
|
|
|
|
|
|
|
} |
917
|
|
|
|
|
|
|
|
918
|
0
|
|
|
|
|
|
my $CodonTable = Bio::Tools::CodonTable->new ( -id => $ttabid ); |
919
|
0
|
|
|
|
|
|
my @aminoacids = split(//,uc($aasequence)); |
920
|
0
|
|
|
|
|
|
my @alt_codons; my ($relativeusage,$dnasequence,$chosen_codon,$dice,$partial,$thiscodon); |
|
0
|
|
|
|
|
|
|
921
|
0
|
|
|
|
|
|
for $i (@aminoacids) { |
922
|
0
|
|
|
|
|
|
@alt_codons = $CodonTable->revtranslate($i); |
923
|
0
|
0
|
|
|
|
|
unless (@alt_codons) { |
924
|
0
|
|
|
|
|
|
carp "No reverse translation possible for aminoacid \'$i\'"; |
925
|
0
|
|
|
|
|
|
$dnasequence .= "???"; |
926
|
|
|
|
|
|
|
} else { |
927
|
0
|
|
|
|
|
|
$aa_codon_total=0; |
928
|
0
|
|
|
|
|
|
for $j (@alt_codons) { |
929
|
0
|
|
|
|
|
|
$aa_codon_total+=$codon_usage_value{$j}; |
930
|
|
|
|
|
|
|
} |
931
|
|
|
|
|
|
|
# print "aminoacid $i, codonchoice: "; # verbose |
932
|
|
|
|
|
|
|
#$partial=0; |
933
|
|
|
|
|
|
|
#for $j (@alt_codons) { |
934
|
|
|
|
|
|
|
#printf "%s %.2f ",$j,$partial+$codon_usage_value{$j}/$aa_codon_total; |
935
|
|
|
|
|
|
|
#$partial+=($codon_usage_value{$j}/$aa_codon_total); |
936
|
|
|
|
|
|
|
#} |
937
|
|
|
|
|
|
|
#print "\n"; |
938
|
0
|
|
|
|
|
|
$dice=rand(1); |
939
|
|
|
|
|
|
|
#print "roulette: $dice\n"; # verbose |
940
|
0
|
|
|
|
|
|
$partial=0; |
941
|
0
|
|
|
|
|
|
$chosen_codon=""; |
942
|
|
|
|
|
|
|
CODONCHOICE: |
943
|
0
|
|
|
|
|
|
for $j (0..@alt_codons) { # last one not accounted |
944
|
0
|
|
|
|
|
|
$thiscodon=$alt_codons[$j]; |
945
|
0
|
|
|
|
|
|
$relativeusage=($codon_usage_value{$thiscodon}/$aa_codon_total); |
946
|
0
|
0
|
|
|
|
|
if ($dice < $relativeusage+$partial) { |
947
|
0
|
|
|
|
|
|
$chosen_codon=$thiscodon; |
948
|
0
|
|
|
|
|
|
last CODONCHOICE; |
949
|
|
|
|
|
|
|
} else { |
950
|
0
|
|
|
|
|
|
$partial += $relativeusage; |
951
|
|
|
|
|
|
|
} |
952
|
|
|
|
|
|
|
} |
953
|
0
|
0
|
|
|
|
|
unless ($chosen_codon) { |
954
|
0
|
|
|
|
|
|
$chosen_codon = $alt_codons[-1]; # the last one |
955
|
|
|
|
|
|
|
} |
956
|
|
|
|
|
|
|
# print ".....adding $chosen_codon\n"; # verbose |
957
|
0
|
|
|
|
|
|
$dnasequence .= $chosen_codon; |
958
|
|
|
|
|
|
|
} |
959
|
|
|
|
|
|
|
} |
960
|
|
|
|
|
|
|
|
961
|
0
|
|
|
|
|
|
my $dna = Bio::LiveSeq::DNA->new(-seq => $dnasequence); |
962
|
0
|
|
|
|
|
|
my $min=1; |
963
|
0
|
|
|
|
|
|
my $max=length($dnasequence); |
964
|
0
|
|
|
|
|
|
my $exon = Bio::LiveSeq::Exon->new(-seq => $dna, -start => $min, -end => $max, -strand => 1); |
965
|
0
|
|
|
|
|
|
my @exons=($exon); |
966
|
0
|
|
|
|
|
|
my $transcript = Bio::LiveSeq::Transcript->new(-exons => \@exons); |
967
|
0
|
|
|
|
|
|
$transcript->translation_table($ttabid); |
968
|
0
|
|
|
|
|
|
my @transcripts=($transcript); |
969
|
0
|
|
|
|
|
|
my $translation = Bio::LiveSeq::Translation->new(-transcript => $transcript); |
970
|
0
|
|
|
|
|
|
my @translations=($translation); |
971
|
0
|
|
|
|
|
|
my %features=(DNA => $dna, Transcripts => \@transcripts, Translations => \@translations); |
972
|
0
|
|
|
|
|
|
my $gene = Bio::LiveSeq::Gene->new(-name => $gene_name, -features => \%features, -upbound => $min, -downbound => $max); |
973
|
|
|
|
|
|
|
|
974
|
|
|
|
|
|
|
# creation of gene |
975
|
0
|
0
|
|
|
|
|
unless ($gene) { # if $gene == 0 it means problems in hash2gene |
976
|
0
|
|
|
|
|
|
carp "Error in Gene creation phase"; |
977
|
0
|
|
|
|
|
|
return (0); |
978
|
|
|
|
|
|
|
} |
979
|
0
|
|
|
|
|
|
return $gene; |
980
|
|
|
|
|
|
|
} |
981
|
|
|
|
|
|
|
|
982
|
|
|
|
|
|
|
1; |