| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
# |
|
2
|
|
|
|
|
|
|
# bioperl module for Bio::LiveSeq::IO::Loader |
|
3
|
|
|
|
|
|
|
# |
|
4
|
|
|
|
|
|
|
# Please direct questions and support issues to |
|
5
|
|
|
|
|
|
|
# |
|
6
|
|
|
|
|
|
|
# Cared for by Joseph Insana |
|
7
|
|
|
|
|
|
|
# |
|
8
|
|
|
|
|
|
|
# Copyright Joseph Insana |
|
9
|
|
|
|
|
|
|
# |
|
10
|
|
|
|
|
|
|
# You may distribute this module under the same terms as perl itself |
|
11
|
|
|
|
|
|
|
# |
|
12
|
|
|
|
|
|
|
# POD documentation - main docs before the code |
|
13
|
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
=head1 NAME |
|
15
|
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
Bio::LiveSeq::IO::Loader - Parent Loader for LiveSeq |
|
17
|
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
=head1 SYNOPSIS |
|
19
|
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
#documentation needed |
|
21
|
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
=head1 DESCRIPTION |
|
23
|
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
This package holds common methods used by BioPerl and file loaders. |
|
25
|
|
|
|
|
|
|
It contains methods to create LiveSeq objects out of entire entries or from a |
|
26
|
|
|
|
|
|
|
localized sequence region surrounding a particular gene. |
|
27
|
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
=head1 AUTHOR - Joseph A.L. Insana |
|
29
|
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
Email: Insana@ebi.ac.uk, jinsana@gmx.net |
|
31
|
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
=head1 APPENDIX |
|
33
|
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
The rest of the documentation details each of the object |
|
35
|
|
|
|
|
|
|
methods. Internal methods are usually preceded with a _ |
|
36
|
|
|
|
|
|
|
|
|
37
|
|
|
|
|
|
|
=cut |
|
38
|
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
# Let the code begin... |
|
40
|
|
|
|
|
|
|
|
|
41
|
|
|
|
|
|
|
package Bio::LiveSeq::IO::Loader; |
|
42
|
|
|
|
|
|
|
|
|
43
|
2
|
|
|
2
|
|
10
|
use strict; |
|
|
2
|
|
|
|
|
3
|
|
|
|
2
|
|
|
|
|
58
|
|
|
44
|
2
|
|
|
2
|
|
7
|
use Carp qw(cluck croak carp); |
|
|
2
|
|
|
|
|
2
|
|
|
|
2
|
|
|
|
|
93
|
|
|
45
|
2
|
|
|
2
|
|
512
|
use Bio::LiveSeq::DNA; |
|
|
2
|
|
|
|
|
4
|
|
|
|
2
|
|
|
|
|
47
|
|
|
46
|
2
|
|
|
2
|
|
473
|
use Bio::LiveSeq::Exon; |
|
|
2
|
|
|
|
|
4
|
|
|
|
2
|
|
|
|
|
47
|
|
|
47
|
2
|
|
|
2
|
|
612
|
use Bio::LiveSeq::Transcript ; |
|
|
2
|
|
|
|
|
4
|
|
|
|
2
|
|
|
|
|
60
|
|
|
48
|
2
|
|
|
2
|
|
525
|
use Bio::LiveSeq::Translation; |
|
|
2
|
|
|
|
|
4
|
|
|
|
2
|
|
|
|
|
43
|
|
|
49
|
2
|
|
|
2
|
|
622
|
use Bio::LiveSeq::Gene; |
|
|
2
|
|
|
|
|
3
|
|
|
|
2
|
|
|
|
|
46
|
|
|
50
|
2
|
|
|
2
|
|
555
|
use Bio::LiveSeq::Intron; |
|
|
2
|
|
|
|
|
2
|
|
|
|
2
|
|
|
|
|
47
|
|
|
51
|
2
|
|
|
2
|
|
7
|
use Bio::LiveSeq::Prim_Transcript; |
|
|
2
|
|
|
|
|
1
|
|
|
|
2
|
|
|
|
|
28
|
|
|
52
|
2
|
|
|
2
|
|
478
|
use Bio::LiveSeq::Repeat_Region; |
|
|
2
|
|
|
|
|
3
|
|
|
|
2
|
|
|
|
|
45
|
|
|
53
|
2
|
|
|
2
|
|
443
|
use Bio::LiveSeq::Repeat_Unit; |
|
|
2
|
|
|
|
|
2
|
|
|
|
2
|
|
|
|
|
46
|
|
|
54
|
2
|
|
|
2
|
|
560
|
use Bio::LiveSeq::AARange; |
|
|
2
|
|
|
|
|
3
|
|
|
|
2
|
|
|
|
|
46
|
|
|
55
|
2
|
|
|
2
|
|
9
|
use Bio::Tools::CodonTable; |
|
|
2
|
|
|
|
|
2
|
|
|
|
2
|
|
|
|
|
6804
|
|
|
56
|
|
|
|
|
|
|
|
|
57
|
|
|
|
|
|
|
=head2 entry2liveseq |
|
58
|
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
Title : entry2liveseq |
|
60
|
|
|
|
|
|
|
Usage : @translationobjects=$loader->entry2liveseq(); |
|
61
|
|
|
|
|
|
|
: @translationobjects=$loader->entry2liveseq(-getswissprotinfo => 0); |
|
62
|
|
|
|
|
|
|
Function: creates LiveSeq objects from an entry previously loaded |
|
63
|
|
|
|
|
|
|
Returns : array of references to objects of class Translation |
|
64
|
|
|
|
|
|
|
Errorcode 0 |
|
65
|
|
|
|
|
|
|
Args : optional boolean flag to avoid the retrieval of SwissProt |
|
66
|
|
|
|
|
|
|
information for all Transcripts containing SwissProt x-reference |
|
67
|
|
|
|
|
|
|
default is 1 (to retrieve those information and create AARange |
|
68
|
|
|
|
|
|
|
LiveSeq objects) |
|
69
|
|
|
|
|
|
|
Note : this method can get really slow for big entries. The lightweight |
|
70
|
|
|
|
|
|
|
gene2liveseq method is recommended |
|
71
|
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
=cut |
|
73
|
|
|
|
|
|
|
|
|
74
|
|
|
|
|
|
|
sub entry2liveseq { |
|
75
|
0
|
|
|
0
|
1
|
0
|
my ($self, %args) = @_; |
|
76
|
0
|
|
|
|
|
0
|
my ($getswissprotinfo)=($args{-getswissprotinfo}); |
|
77
|
0
|
0
|
|
|
|
0
|
if (defined($getswissprotinfo)) { |
|
78
|
0
|
0
|
0
|
|
|
0
|
if (($getswissprotinfo ne 0)&&($getswissprotinfo ne 1)) { |
|
79
|
0
|
|
|
|
|
0
|
carp "-getswissprotinfo argument can take only boolean (1 or 0) values. Setting it to 0, i.e. not trying to retrieve SwissProt information...."; |
|
80
|
0
|
|
|
|
|
0
|
$getswissprotinfo=0; |
|
81
|
|
|
|
|
|
|
} |
|
82
|
|
|
|
|
|
|
} else { |
|
83
|
0
|
|
|
|
|
0
|
$getswissprotinfo=1; |
|
84
|
|
|
|
|
|
|
} |
|
85
|
0
|
|
|
|
|
0
|
my $hashref=$self->{'hash'}; |
|
86
|
0
|
0
|
|
|
|
0
|
unless ($hashref) { return (0); } |
|
|
0
|
|
|
|
|
0
|
|
|
87
|
0
|
|
|
|
|
0
|
my @translationobjects=$self->hash2liveseq($hashref,$getswissprotinfo); |
|
88
|
0
|
|
|
|
|
0
|
my $test_transl=0; |
|
89
|
0
|
0
|
|
|
|
0
|
if ($test_transl) { $self->test_transl($hashref,\@translationobjects);} |
|
|
0
|
|
|
|
|
0
|
|
|
90
|
0
|
|
|
|
|
0
|
return @translationobjects; |
|
91
|
|
|
|
|
|
|
} |
|
92
|
|
|
|
|
|
|
|
|
93
|
|
|
|
|
|
|
=head2 novelaasequence2gene |
|
94
|
|
|
|
|
|
|
|
|
95
|
|
|
|
|
|
|
Title : novelaasequence2gene |
|
96
|
|
|
|
|
|
|
Usage : $gene=$loader->novelaasequence2gene(-aasequence => "MGLAAPTRS*"); |
|
97
|
|
|
|
|
|
|
: $gene=$loader->novelaasequence2gene(-aasequence => "MGLAAPTRS*"); |
|
98
|
|
|
|
|
|
|
-taxon => 9606, |
|
99
|
|
|
|
|
|
|
-gene_name => "tyr-kinase"); |
|
100
|
|
|
|
|
|
|
|
|
101
|
|
|
|
|
|
|
Function: creates LiveSeq objects from a novel amino acid sequence, |
|
102
|
|
|
|
|
|
|
using codon usage database to choose codons according to |
|
103
|
|
|
|
|
|
|
relative frequencies. |
|
104
|
|
|
|
|
|
|
If a taxon ID is not specified, the default is to use the human |
|
105
|
|
|
|
|
|
|
one (taxonomy ID 9606). |
|
106
|
|
|
|
|
|
|
Returns : reference to a Gene object containing references to LiveSeq objects |
|
107
|
|
|
|
|
|
|
Errorcode 0 |
|
108
|
|
|
|
|
|
|
Args : string containing an amino acid sequence |
|
109
|
|
|
|
|
|
|
integer (optional) with a taxonomy ID |
|
110
|
|
|
|
|
|
|
string specifying a gene name |
|
111
|
|
|
|
|
|
|
|
|
112
|
|
|
|
|
|
|
=cut |
|
113
|
|
|
|
|
|
|
|
|
114
|
|
|
|
|
|
|
=head2 gene2liveseq |
|
115
|
|
|
|
|
|
|
|
|
116
|
|
|
|
|
|
|
Title : gene2liveseq |
|
117
|
|
|
|
|
|
|
Usage : $gene=$loader->gene2liveseq(-gene_name => "gene name"); |
|
118
|
|
|
|
|
|
|
: $gene=$loader->gene2liveseq(-gene_name => "gene name", |
|
119
|
|
|
|
|
|
|
-flanking => 64); |
|
120
|
|
|
|
|
|
|
: $gene=$loader->gene2liveseq(-gene_name => "gene name", |
|
121
|
|
|
|
|
|
|
-getswissprotinfo => 0); |
|
122
|
|
|
|
|
|
|
: $gene=$loader->gene2liveseq(-position => 4); |
|
123
|
|
|
|
|
|
|
|
|
124
|
|
|
|
|
|
|
Function: creates LiveSeq objects from an entry previously loaded |
|
125
|
|
|
|
|
|
|
It is a "light weight" creation because it creates |
|
126
|
|
|
|
|
|
|
a LiveSequence just for the interesting region in an entry |
|
127
|
|
|
|
|
|
|
(instead than for the total entry, like in entry2liveseq) and for |
|
128
|
|
|
|
|
|
|
the flanking regions up to 500 nucleotides (default) or up to |
|
129
|
|
|
|
|
|
|
the specified amount of nucleotides (given as argument) around the |
|
130
|
|
|
|
|
|
|
Gene. |
|
131
|
|
|
|
|
|
|
Returns : reference to a Gene object containing possibly alternative |
|
132
|
|
|
|
|
|
|
Transcripts. |
|
133
|
|
|
|
|
|
|
Errorcode 0 |
|
134
|
|
|
|
|
|
|
Args : string containing the gene name as in the EMBL feature qualifier |
|
135
|
|
|
|
|
|
|
integer (optional) "flanking": amount of flanking bases to be kept |
|
136
|
|
|
|
|
|
|
boolean (optional) "getswissprotinfo": if set to "0" it will avoid |
|
137
|
|
|
|
|
|
|
trying to fetch information from a crossreference to a SwissProt |
|
138
|
|
|
|
|
|
|
entry, avoding the process of creation of AARange objects |
|
139
|
|
|
|
|
|
|
It is "1" (on) by default |
|
140
|
|
|
|
|
|
|
|
|
141
|
|
|
|
|
|
|
Alternative to a gene_name, a position can be given: an |
|
142
|
|
|
|
|
|
|
integer (1-) containing the position of the desired CDS in the |
|
143
|
|
|
|
|
|
|
loaded entry |
|
144
|
|
|
|
|
|
|
|
|
145
|
|
|
|
|
|
|
=cut |
|
146
|
|
|
|
|
|
|
|
|
147
|
|
|
|
|
|
|
sub gene2liveseq { |
|
148
|
6
|
|
|
6
|
1
|
653
|
my ($self, %args) = @_; |
|
149
|
6
|
|
|
|
|
31
|
my ($gene_name,$flanking,$getswissprotinfo,$cds_position)=($args{-gene_name},$args{-flanking},$args{-getswissprotinfo},$args{-position}); |
|
150
|
6
|
|
|
|
|
10
|
my $input; |
|
151
|
6
|
50
|
33
|
|
|
21
|
unless (($gene_name)||($cds_position)) { |
|
152
|
0
|
|
|
|
|
0
|
carp "Gene_Name or Position not specified for gene2liveseq loading function"; |
|
153
|
0
|
|
|
|
|
0
|
return (0); |
|
154
|
|
|
|
|
|
|
} |
|
155
|
6
|
50
|
33
|
|
|
38
|
if (($gene_name)&&($cds_position)) { |
|
|
|
50
|
|
|
|
|
|
|
156
|
0
|
|
|
|
|
0
|
carp "Gene_Name and Position cannot be given together"; |
|
157
|
0
|
|
|
|
|
0
|
return (0); |
|
158
|
|
|
|
|
|
|
} elsif ($gene_name) { |
|
159
|
6
|
|
|
|
|
11
|
$input=$gene_name; |
|
160
|
|
|
|
|
|
|
} else { |
|
161
|
0
|
|
|
|
|
0
|
$input="cds-position:".$cds_position; |
|
162
|
|
|
|
|
|
|
} |
|
163
|
|
|
|
|
|
|
|
|
164
|
6
|
50
|
|
|
|
12
|
if (defined($getswissprotinfo)) { |
|
165
|
0
|
0
|
0
|
|
|
0
|
if (($getswissprotinfo ne 0)&&($getswissprotinfo ne 1)) { |
|
166
|
0
|
|
|
|
|
0
|
carp "-getswissprotinfo argument can take only boolean (1 or 0) values. Setting it to 0, i.e. not trying to retrieve SwissProt information...."; |
|
167
|
0
|
|
|
|
|
0
|
$getswissprotinfo=0; |
|
168
|
|
|
|
|
|
|
} |
|
169
|
|
|
|
|
|
|
} else { |
|
170
|
6
|
|
|
|
|
9
|
$getswissprotinfo=1; |
|
171
|
|
|
|
|
|
|
} |
|
172
|
|
|
|
|
|
|
|
|
173
|
6
|
50
|
|
|
|
12
|
if (defined($flanking)) { |
|
174
|
0
|
0
|
|
|
|
0
|
unless ($flanking >= 0) { |
|
175
|
0
|
|
|
|
|
0
|
carp "No sense in specifying a number < 0 for flanking regions to be created for gene2liveseq loading function"; |
|
176
|
0
|
|
|
|
|
0
|
return (0); |
|
177
|
|
|
|
|
|
|
} |
|
178
|
|
|
|
|
|
|
} else { |
|
179
|
6
|
|
|
|
|
12
|
$flanking=500; # the default flanking length |
|
180
|
|
|
|
|
|
|
} |
|
181
|
6
|
|
|
|
|
13
|
my $hashref=$self->{'hash'}; |
|
182
|
6
|
50
|
|
|
|
15
|
unless ($hashref) { return (0); } |
|
|
0
|
|
|
|
|
0
|
|
|
183
|
6
|
|
|
|
|
28
|
my $gene=$self->hash2gene($hashref,$input,$flanking,$getswissprotinfo); |
|
184
|
6
|
50
|
|
|
|
39
|
unless ($gene) { # if $gene == 0 it means problems in hash2gene |
|
185
|
0
|
|
|
|
|
0
|
carp "gene2liveseq produced error"; |
|
186
|
0
|
|
|
|
|
0
|
return (0); |
|
187
|
|
|
|
|
|
|
} |
|
188
|
6
|
|
|
|
|
30
|
return $gene; |
|
189
|
|
|
|
|
|
|
} |
|
190
|
|
|
|
|
|
|
|
|
191
|
|
|
|
|
|
|
# TODO: update so that it will work even if CDS is not only accepted FEATURE!! |
|
192
|
|
|
|
|
|
|
# this method is for now deprecated and not supported |
|
193
|
|
|
|
|
|
|
sub test_transl { |
|
194
|
0
|
|
|
0
|
0
|
0
|
my ($self,$entry)=@_; |
|
195
|
0
|
|
|
|
|
0
|
my @features=@{$entry->{'Features'}}; |
|
|
0
|
|
|
|
|
0
|
|
|
196
|
0
|
|
|
|
|
0
|
my @translationobjects=@{$_[1]}; |
|
|
0
|
|
|
|
|
0
|
|
|
197
|
0
|
|
|
|
|
0
|
my ($i,$translation); |
|
198
|
0
|
|
|
|
|
0
|
my ($obj_transl,$hash_transl); |
|
199
|
0
|
|
|
|
|
0
|
my @cds=@{$entry->{'CDS'}}; |
|
|
0
|
|
|
|
|
0
|
|
|
200
|
0
|
|
|
|
|
0
|
foreach $translation (@translationobjects) { |
|
201
|
0
|
|
|
|
|
0
|
$obj_transl=$translation->seq; |
|
202
|
0
|
|
|
|
|
0
|
$hash_transl=$cds[$i]->{'qualifiers'}->{'translation'}; |
|
203
|
|
|
|
|
|
|
#before seq was changed in Translation 1.4# chop $obj_transl; # to remove trailing "*" |
|
204
|
0
|
0
|
|
|
|
0
|
unless ($obj_transl eq $hash_transl) { |
|
205
|
0
|
|
|
|
|
0
|
cluck "Serious error: Translation from the Entry does not match Translation from object's seq for CDS at position $i"; |
|
206
|
0
|
|
|
|
|
0
|
carp "\nEntry's transl: ",$hash_transl,"\n"; |
|
207
|
0
|
|
|
|
|
0
|
carp "\nObject's transl: ",$obj_transl,"\n"; |
|
208
|
0
|
|
|
|
|
0
|
exit; |
|
209
|
|
|
|
|
|
|
} |
|
210
|
0
|
|
|
|
|
0
|
$i++; |
|
211
|
|
|
|
|
|
|
} |
|
212
|
|
|
|
|
|
|
} |
|
213
|
|
|
|
|
|
|
|
|
214
|
|
|
|
|
|
|
# argument: hashref containing the EMBL entry datas, |
|
215
|
|
|
|
|
|
|
# getswissprotinfo boolean flag |
|
216
|
|
|
|
|
|
|
# creates the liveseq objects |
|
217
|
|
|
|
|
|
|
# returns: an array of Translation object references |
|
218
|
|
|
|
|
|
|
sub hash2liveseq { |
|
219
|
0
|
|
|
0
|
0
|
0
|
my ($self,$entry,$getswissprotinfo)=@_; |
|
220
|
0
|
|
|
|
|
0
|
my $i; |
|
221
|
|
|
|
|
|
|
my @transcripts; |
|
222
|
0
|
|
|
|
|
0
|
my $dna=Bio::LiveSeq::DNA->new(-seq => $entry->{'Sequence'} ); |
|
223
|
0
|
|
|
|
|
0
|
$dna->alphabet(lc($entry->{'Molecule'})); |
|
224
|
0
|
|
|
|
|
0
|
$dna->display_id($entry->{'ID'}); |
|
225
|
0
|
|
|
|
|
0
|
$dna->accession_number($entry->{'AccNumber'}); |
|
226
|
0
|
|
|
|
|
0
|
$dna->desc($entry->{'Description'}); |
|
227
|
0
|
|
|
|
|
0
|
my @cds=@{$entry->{'CDS'}}; |
|
|
0
|
|
|
|
|
0
|
|
|
228
|
0
|
|
|
|
|
0
|
my ($swissacc,$swisshash); my @swisshashes; |
|
|
0
|
|
|
|
|
0
|
|
|
229
|
0
|
|
|
|
|
0
|
for $i (0..$#cds) { |
|
230
|
|
|
|
|
|
|
#my @transcript=@{$cds[$i]->{'range'}}; |
|
231
|
|
|
|
|
|
|
#$transcript=\@transcript; |
|
232
|
|
|
|
|
|
|
#push (@transcripts,$transcript); |
|
233
|
0
|
|
|
|
|
0
|
push (@transcripts,$cds[$i]->{'range'}); |
|
234
|
0
|
0
|
|
|
|
0
|
if ($getswissprotinfo) { |
|
235
|
0
|
|
|
|
|
0
|
$swissacc=$cds[$i]->{'qualifiers'}->{'db_xref'}; |
|
236
|
0
|
|
|
|
|
0
|
$swisshash=$self->get_swisshash($swissacc); |
|
237
|
|
|
|
|
|
|
#$self->printswissprot($swisshash); # DEBUG |
|
238
|
0
|
|
|
|
|
0
|
push (@swisshashes,$swisshash); |
|
239
|
|
|
|
|
|
|
} |
|
240
|
|
|
|
|
|
|
} |
|
241
|
0
|
|
|
|
|
0
|
my @translations=($self->transexonscreation($dna,\@transcripts)); |
|
242
|
0
|
|
|
|
|
0
|
my $translation; my $j=0; |
|
|
0
|
|
|
|
|
0
|
|
|
243
|
0
|
|
|
|
|
0
|
foreach $translation (@translations) { |
|
244
|
0
|
0
|
|
|
|
0
|
if ($swisshashes[$j]) { # if not 0 |
|
245
|
0
|
|
|
|
|
0
|
$self->swisshash2liveseq($swisshashes[$j],$translation); |
|
246
|
|
|
|
|
|
|
} |
|
247
|
0
|
|
|
|
|
0
|
$j++; |
|
248
|
|
|
|
|
|
|
} |
|
249
|
0
|
|
|
|
|
0
|
return (@translations); |
|
250
|
|
|
|
|
|
|
} |
|
251
|
|
|
|
|
|
|
|
|
252
|
|
|
|
|
|
|
# only features pertaining to a specified gene are created |
|
253
|
|
|
|
|
|
|
# only the sequence of the gene and appropriate context flanking regions |
|
254
|
|
|
|
|
|
|
# are created as chain |
|
255
|
|
|
|
|
|
|
# arguments: hashref, gene_name (OR: cds_position), length_of_flanking_sequences, getswissprotinfo boolean flag |
|
256
|
|
|
|
|
|
|
# returns: reference to Gene object |
|
257
|
|
|
|
|
|
|
# |
|
258
|
|
|
|
|
|
|
# Note: if entry contains just one CDS, all the features get added |
|
259
|
|
|
|
|
|
|
# this is useful because often the features in these entries do not |
|
260
|
|
|
|
|
|
|
# carry the /gene qualifier |
|
261
|
|
|
|
|
|
|
# |
|
262
|
|
|
|
|
|
|
# errorcode: 0 |
|
263
|
|
|
|
|
|
|
sub hash2gene { |
|
264
|
6
|
|
|
6
|
0
|
14
|
my ($self,$entry,$input,$flanking,$getswissprotinfo)=@_; |
|
265
|
6
|
|
|
|
|
8
|
my $entryfeature; |
|
266
|
|
|
|
|
|
|
my $genefeatureshash; |
|
267
|
|
|
|
|
|
|
|
|
268
|
6
|
|
|
|
|
7
|
my @cds=@{$entry->{'CDS'}}; |
|
|
6
|
|
|
|
|
17
|
|
|
269
|
|
|
|
|
|
|
|
|
270
|
|
|
|
|
|
|
# checking if a position has been given instead than a gene_name |
|
271
|
6
|
50
|
|
|
|
19
|
if (index($input,"cds-position:") == 0 ) { |
|
272
|
0
|
|
|
|
|
0
|
my $cds_position=substr($input,13); # extracting the cds position |
|
273
|
0
|
0
|
0
|
|
|
0
|
if (($cds_position >= 1)&&($cds_position <= scalar(@cds))) { |
|
274
|
0
|
|
|
|
|
0
|
$genefeatureshash=$self->_findgenefeatures($entry,undef,$cds_position,$getswissprotinfo); |
|
275
|
|
|
|
|
|
|
} |
|
276
|
|
|
|
|
|
|
} else { |
|
277
|
6
|
|
|
|
|
26
|
$genefeatureshash=$self->_findgenefeatures($entry,$input,undef,$getswissprotinfo); |
|
278
|
|
|
|
|
|
|
} |
|
279
|
|
|
|
|
|
|
|
|
280
|
6
|
50
|
50
|
|
|
19
|
unless (($genefeatureshash)&&(scalar(@{$genefeatureshash->{'genefeatures'}}))) { # array empty, no gene features found |
|
|
6
|
|
|
|
|
20
|
|
|
281
|
0
|
|
|
|
|
0
|
my @genes=$self->genes($entry); |
|
282
|
0
|
|
|
|
|
0
|
my $cds_number=scalar(@cds); |
|
283
|
0
|
|
|
|
|
0
|
warn "Warning! Not even one genefeature found for /$input/.... |
|
284
|
|
|
|
|
|
|
The genes present in this entry are:\n\t@genes\n |
|
285
|
|
|
|
|
|
|
The number of CDS in this entry is:\n\t$cds_number\n"; |
|
286
|
0
|
|
|
|
|
0
|
return(0); |
|
287
|
|
|
|
|
|
|
} |
|
288
|
|
|
|
|
|
|
|
|
289
|
|
|
|
|
|
|
# get max and min, check flankings |
|
290
|
6
|
|
|
|
|
9
|
my ($min,$max)=$self->rangeofarray(@{$genefeatureshash->{'labels'}}); # gene "boundaries" |
|
|
6
|
|
|
|
|
27
|
|
|
291
|
6
|
|
|
|
|
11
|
my $seqlength=$entry->{'SeqLength'}; |
|
292
|
6
|
|
|
|
|
7
|
my ($mindna,$maxdna); # some flanking region next to the gene "boundaries" |
|
293
|
6
|
50
|
|
|
|
16
|
if ($min-$flanking < 1) { |
|
294
|
6
|
|
|
|
|
8
|
$mindna=1; |
|
295
|
|
|
|
|
|
|
} else { |
|
296
|
0
|
|
|
|
|
0
|
$mindna=$min-$flanking; |
|
297
|
|
|
|
|
|
|
} |
|
298
|
6
|
50
|
|
|
|
14
|
if ($max+$flanking > $seqlength) { |
|
299
|
6
|
|
|
|
|
10
|
$maxdna=$seqlength; |
|
300
|
|
|
|
|
|
|
} else { |
|
301
|
0
|
|
|
|
|
0
|
$maxdna=$max+$flanking; |
|
302
|
|
|
|
|
|
|
} |
|
303
|
6
|
|
|
|
|
65
|
my $subseq=substr($entry->{'Sequence'},$mindna-1,$maxdna-$mindna+1); |
|
304
|
|
|
|
|
|
|
|
|
305
|
|
|
|
|
|
|
# create LiveSeq objects |
|
306
|
|
|
|
|
|
|
|
|
307
|
|
|
|
|
|
|
# create DNA |
|
308
|
6
|
|
|
|
|
44
|
my $dna=Bio::LiveSeq::DNA->new(-seq => $subseq, -offset => $mindna); |
|
309
|
6
|
|
|
|
|
48
|
$dna->alphabet(lc($entry->{'Molecule'})); |
|
310
|
6
|
|
|
|
|
29
|
$dna->source($entry->{'Organism'}); |
|
311
|
6
|
|
|
|
|
26
|
$dna->display_id($entry->{'ID'}); |
|
312
|
6
|
|
|
|
|
19
|
$dna->accession_number($entry->{'AccNumber'}); |
|
313
|
6
|
|
|
|
|
21
|
$dna->desc($entry->{'Description'}); |
|
314
|
|
|
|
|
|
|
|
|
315
|
6
|
|
|
|
|
37
|
my @transcripts=@{$genefeatureshash->{'transcripts'}}; |
|
|
6
|
|
|
|
|
21
|
|
|
316
|
|
|
|
|
|
|
# create Translations, Transcripts, Exons out of the CDS |
|
317
|
6
|
50
|
|
|
|
16
|
unless (@transcripts) { |
|
318
|
0
|
|
|
|
|
0
|
cluck "no CDS feature found for /$input/...."; |
|
319
|
0
|
|
|
|
|
0
|
return(0); |
|
320
|
|
|
|
|
|
|
} |
|
321
|
6
|
|
|
|
|
26
|
my @translationobjs=$self->transexonscreation($dna,\@transcripts); |
|
322
|
6
|
|
|
|
|
7
|
my @transcriptobjs; |
|
323
|
|
|
|
|
|
|
|
|
324
|
|
|
|
|
|
|
# get the Transcript obj_refs |
|
325
|
|
|
|
|
|
|
my $translation; |
|
326
|
6
|
|
|
|
|
8
|
my $j=0; |
|
327
|
6
|
|
|
|
|
9
|
my @ttables=@{$genefeatureshash->{'ttables'}}; |
|
|
6
|
|
|
|
|
20
|
|
|
328
|
6
|
|
|
|
|
8
|
my @swisshashes=@{$genefeatureshash->{'swisshashes'}}; |
|
|
6
|
|
|
|
|
14
|
|
|
329
|
6
|
|
|
|
|
14
|
foreach $translation (@translationobjs) { |
|
330
|
6
|
|
|
|
|
14
|
push(@transcriptobjs,$translation->get_Transcript); |
|
331
|
6
|
50
|
|
|
|
19
|
if ($ttables[$j]) { # if not undef |
|
332
|
0
|
|
|
|
|
0
|
$translation->get_Transcript->translation_table($ttables[$j]); |
|
333
|
|
|
|
|
|
|
#} else { # DEBUG |
|
334
|
|
|
|
|
|
|
# print "\n\t\tno translation table information....\n"; |
|
335
|
|
|
|
|
|
|
} |
|
336
|
6
|
50
|
|
|
|
15
|
if ($swisshashes[$j]) { # if not 0 |
|
337
|
0
|
|
|
|
|
0
|
$self->swisshash2liveseq($swisshashes[$j],$translation); |
|
338
|
|
|
|
|
|
|
} |
|
339
|
6
|
|
|
|
|
10
|
$j++; |
|
340
|
|
|
|
|
|
|
} |
|
341
|
|
|
|
|
|
|
|
|
342
|
6
|
|
|
|
|
8
|
my %gene; # this is the hash to store created object references |
|
343
|
6
|
|
|
|
|
13
|
$gene{DNA}=$dna; |
|
344
|
6
|
|
|
|
|
13
|
$gene{Transcripts}=\@transcriptobjs; |
|
345
|
6
|
|
|
|
|
12
|
$gene{Translations}=\@translationobjs; |
|
346
|
|
|
|
|
|
|
|
|
347
|
6
|
|
|
|
|
12
|
my @exonobjs; my @intronobjs; |
|
348
|
0
|
|
|
|
|
0
|
my @repeatunitobjs; my @repeatregionobjs; |
|
|
0
|
|
|
|
|
0
|
|
|
349
|
0
|
|
|
|
|
0
|
my @primtranscriptobjs; |
|
350
|
|
|
|
|
|
|
|
|
351
|
0
|
|
|
|
|
0
|
my ($object,$range,$start,$end,$strand); |
|
352
|
|
|
|
|
|
|
|
|
353
|
6
|
|
|
|
|
9
|
my @exons=@{$genefeatureshash->{'exons'}}; |
|
|
6
|
|
|
|
|
14
|
|
|
354
|
6
|
|
|
|
|
10
|
my @exondescs=@{$genefeatureshash->{'exondescs'}}; |
|
|
6
|
|
|
|
|
16
|
|
|
355
|
6
|
100
|
|
|
|
18
|
if (@exons) { |
|
356
|
1
|
|
|
|
|
2
|
my $exoncount = 0; |
|
357
|
1
|
|
|
|
|
2
|
foreach $range (@exons) { |
|
358
|
9
|
|
|
|
|
8
|
($start,$end,$strand)=@{$range}; |
|
|
9
|
|
|
|
|
21
|
|
|
359
|
9
|
|
|
|
|
28
|
$object = Bio::LiveSeq::Exon->new(-seq=>$dna,-start=>$start,-end=>$end,-strand=>$strand); |
|
360
|
9
|
50
|
|
|
|
18
|
if ($object != -1) { |
|
361
|
9
|
50
|
|
|
|
29
|
$object->desc($exondescs[$exoncount]) if defined $exondescs[$exoncount]; |
|
362
|
9
|
|
|
|
|
7
|
$exoncount++; |
|
363
|
9
|
|
|
|
|
14
|
push (@exonobjs,$object); |
|
364
|
|
|
|
|
|
|
} else { |
|
365
|
0
|
|
|
|
|
0
|
$exoncount++; |
|
366
|
|
|
|
|
|
|
} |
|
367
|
|
|
|
|
|
|
} |
|
368
|
1
|
|
|
|
|
2
|
$gene{Exons}=\@exonobjs; |
|
369
|
|
|
|
|
|
|
} |
|
370
|
6
|
|
|
|
|
9
|
my @introns=@{$genefeatureshash->{'introns'}}; |
|
|
6
|
|
|
|
|
14
|
|
|
371
|
6
|
|
|
|
|
9
|
my @introndescs=@{$genefeatureshash->{'introndescs'}}; |
|
|
6
|
|
|
|
|
18
|
|
|
372
|
6
|
100
|
|
|
|
14
|
if (@introns) { |
|
373
|
1
|
|
|
|
|
2
|
my $introncount = 0; |
|
374
|
1
|
|
|
|
|
2
|
foreach $range (@introns) { |
|
375
|
8
|
|
|
|
|
7
|
($start,$end,$strand)=@{$range}; |
|
|
8
|
|
|
|
|
21
|
|
|
376
|
8
|
|
|
|
|
34
|
$object=Bio::LiveSeq::Intron->new(-seq=>$dna,-start=>$start,-end=>$end,-strand=>$strand); |
|
377
|
8
|
50
|
|
|
|
19
|
if ($object != -1) { |
|
378
|
8
|
|
|
|
|
23
|
$object->desc($introndescs[$introncount]); |
|
379
|
8
|
|
|
|
|
8
|
$introncount++; |
|
380
|
8
|
|
|
|
|
16
|
push (@intronobjs,$object); |
|
381
|
|
|
|
|
|
|
} else { |
|
382
|
0
|
|
|
|
|
0
|
$introncount++; |
|
383
|
|
|
|
|
|
|
} |
|
384
|
|
|
|
|
|
|
} |
|
385
|
1
|
|
|
|
|
4
|
$gene{Introns}=\@intronobjs; |
|
386
|
|
|
|
|
|
|
} |
|
387
|
6
|
|
|
|
|
8
|
my @prim_transcripts=@{$genefeatureshash->{'prim_transcripts'}}; |
|
|
6
|
|
|
|
|
16
|
|
|
388
|
6
|
100
|
|
|
|
16
|
if (@prim_transcripts) { |
|
389
|
5
|
|
|
|
|
11
|
foreach $range (@prim_transcripts) { |
|
390
|
7
|
|
|
|
|
11
|
($start,$end,$strand)=@{$range}; |
|
|
7
|
|
|
|
|
26
|
|
|
391
|
7
|
|
|
|
|
67
|
$object=Bio::LiveSeq::Prim_Transcript->new(-seq=>$dna,-start=>$start,-end=>$end,-strand=>$strand); |
|
392
|
7
|
50
|
|
|
|
42
|
if ($object != -1) { push (@primtranscriptobjs,$object); } |
|
|
7
|
|
|
|
|
39
|
|
|
393
|
|
|
|
|
|
|
} |
|
394
|
5
|
|
|
|
|
20
|
$gene{Prim_Transcripts}=\@primtranscriptobjs; |
|
395
|
|
|
|
|
|
|
} |
|
396
|
6
|
|
|
|
|
13
|
my @repeat_regions=@{$genefeatureshash->{'repeat_regions'}}; |
|
|
6
|
|
|
|
|
17
|
|
|
397
|
6
|
|
|
|
|
8
|
my @repeat_regions_family=@{$genefeatureshash->{'repeat_regions_family'}}; |
|
|
6
|
|
|
|
|
11
|
|
|
398
|
6
|
50
|
|
|
|
19
|
if (@repeat_regions) { |
|
399
|
0
|
|
|
|
|
0
|
my $k=0; |
|
400
|
0
|
|
|
|
|
0
|
foreach $range (@repeat_regions) { |
|
401
|
0
|
|
|
|
|
0
|
($start,$end,$strand)=@{$range}; |
|
|
0
|
|
|
|
|
0
|
|
|
402
|
0
|
|
|
|
|
0
|
$object=Bio::LiveSeq::Repeat_Region->new(-seq=>$dna,-start=>$start,-end=>$end,-strand=>$strand); |
|
403
|
0
|
0
|
|
|
|
0
|
if ($object != -1) { |
|
404
|
0
|
|
|
|
|
0
|
$object->desc($repeat_regions_family[$k]); |
|
405
|
0
|
|
|
|
|
0
|
$k++; |
|
406
|
0
|
|
|
|
|
0
|
push (@repeatregionobjs,$object); |
|
407
|
|
|
|
|
|
|
} else { |
|
408
|
0
|
|
|
|
|
0
|
$k++; |
|
409
|
|
|
|
|
|
|
} |
|
410
|
|
|
|
|
|
|
} |
|
411
|
0
|
|
|
|
|
0
|
$gene{Repeat_Regions}=\@repeatregionobjs; |
|
412
|
|
|
|
|
|
|
} |
|
413
|
6
|
|
|
|
|
12
|
my @repeat_units=@{$genefeatureshash->{'repeat_units'}}; |
|
|
6
|
|
|
|
|
13
|
|
|
414
|
6
|
|
|
|
|
6
|
my @repeat_units_family=@{$genefeatureshash->{'repeat_units_family'}}; |
|
|
6
|
|
|
|
|
13
|
|
|
415
|
6
|
50
|
|
|
|
15
|
if (@repeat_units) { |
|
416
|
0
|
|
|
|
|
0
|
my $k=0; |
|
417
|
0
|
|
|
|
|
0
|
foreach $range (@repeat_units) { |
|
418
|
0
|
|
|
|
|
0
|
($start,$end,$strand)=@{$range}; |
|
|
0
|
|
|
|
|
0
|
|
|
419
|
0
|
|
|
|
|
0
|
$object=Bio::LiveSeq::Repeat_Unit->new(-seq=>$dna,-start=>$start,-end=>$end,-strand=>$strand); |
|
420
|
0
|
0
|
|
|
|
0
|
if ($object != -1) { |
|
421
|
0
|
|
|
|
|
0
|
$object->desc($repeat_units_family[$k]); |
|
422
|
0
|
|
|
|
|
0
|
$k++; |
|
423
|
0
|
|
|
|
|
0
|
push (@repeatunitobjs,$object); |
|
424
|
|
|
|
|
|
|
} else { |
|
425
|
0
|
|
|
|
|
0
|
$k++; |
|
426
|
|
|
|
|
|
|
} |
|
427
|
|
|
|
|
|
|
} |
|
428
|
0
|
|
|
|
|
0
|
$gene{Repeat_Units}=\@repeatunitobjs; |
|
429
|
|
|
|
|
|
|
} |
|
430
|
|
|
|
|
|
|
|
|
431
|
|
|
|
|
|
|
# create the Gene |
|
432
|
6
|
|
|
|
|
15
|
my $gene_name=$genefeatureshash->{'gene_name'}; # either a name or a cdspos |
|
433
|
6
|
|
|
|
|
79
|
return (Bio::LiveSeq::Gene->new(-name=>$gene_name,-features=>\%gene, |
|
434
|
|
|
|
|
|
|
-upbound=>$min,-downbound=>$max)); |
|
435
|
|
|
|
|
|
|
} |
|
436
|
|
|
|
|
|
|
|
|
437
|
|
|
|
|
|
|
# maybe this function will be moved to general utility package |
|
438
|
|
|
|
|
|
|
# argument: array of numbers |
|
439
|
|
|
|
|
|
|
# returns: (min,max) numbers in the array |
|
440
|
|
|
|
|
|
|
sub rangeofarray { |
|
441
|
6
|
|
|
6
|
0
|
6
|
my $self=shift; |
|
442
|
6
|
|
|
|
|
30
|
my @array=@_; |
|
443
|
|
|
|
|
|
|
#print "\n-=-=-=-=-=-=-=-=-=-=array: @array\n"; |
|
444
|
6
|
|
|
|
|
7
|
my ($max,$min,$element); |
|
445
|
6
|
|
|
|
|
10
|
$min=$max=shift(@array); |
|
446
|
6
|
|
|
|
|
13
|
foreach $element (@array) { |
|
447
|
110
|
50
|
|
|
|
109
|
$element = 0 unless defined $element; |
|
448
|
110
|
50
|
|
|
|
132
|
if ($element < $min) { |
|
449
|
0
|
|
|
|
|
0
|
$min=$element; |
|
450
|
|
|
|
|
|
|
} |
|
451
|
110
|
100
|
|
|
|
133
|
if ($element > $max) { |
|
452
|
6
|
|
|
|
|
11
|
$max=$element; |
|
453
|
|
|
|
|
|
|
} |
|
454
|
|
|
|
|
|
|
} |
|
455
|
|
|
|
|
|
|
#print "\n-=-=-=-=-=-=-=-=-=-=min: $min\tmax: $max\n"; |
|
456
|
6
|
|
|
|
|
20
|
return ($min,$max); |
|
457
|
|
|
|
|
|
|
} |
|
458
|
|
|
|
|
|
|
|
|
459
|
|
|
|
|
|
|
|
|
460
|
|
|
|
|
|
|
# argument: reference to DNA object, reference to array of transcripts |
|
461
|
|
|
|
|
|
|
# returns: an array of Translation object references |
|
462
|
|
|
|
|
|
|
sub transexonscreation { |
|
463
|
6
|
|
|
6
|
0
|
8
|
my $self=shift; |
|
464
|
6
|
|
|
|
|
8
|
my $dna=$_[0]; |
|
465
|
6
|
|
|
|
|
9
|
my @transcripts=@{$_[1]}; |
|
|
6
|
|
|
|
|
9
|
|
|
466
|
|
|
|
|
|
|
|
|
467
|
6
|
|
|
|
|
7
|
my (@transexons,$start,$end,$strand,$exonref,$exonobj,$transcript,$transcriptobj); |
|
468
|
0
|
|
|
|
|
0
|
my $translationobj; |
|
469
|
0
|
|
|
|
|
0
|
my @translationobjects; |
|
470
|
6
|
|
|
|
|
12
|
foreach $transcript (@transcripts) { |
|
471
|
6
|
|
|
|
|
9
|
foreach $exonref (@{$transcript}) { |
|
|
6
|
|
|
|
|
11
|
|
|
472
|
34
|
|
|
|
|
30
|
($start,$end,$strand)=@{$exonref}; |
|
|
34
|
|
|
|
|
67
|
|
|
473
|
|
|
|
|
|
|
#print "Creating Exon: start $start end $end strand $strand\n"; |
|
474
|
34
|
|
|
|
|
155
|
$exonobj=Bio::LiveSeq::Exon->new(-seq=>$dna,-start=>$start,-end=>$end,-strand=>$strand); |
|
475
|
|
|
|
|
|
|
#push (@exonobjects,$exonobj); |
|
476
|
34
|
|
|
|
|
61
|
push (@transexons,$exonobj); |
|
477
|
|
|
|
|
|
|
} |
|
478
|
6
|
|
|
|
|
51
|
$transcriptobj=Bio::LiveSeq::Transcript->new(-exons => \@transexons ); |
|
479
|
6
|
50
|
|
|
|
26
|
if ($transcriptobj != -1) { |
|
480
|
6
|
|
|
|
|
50
|
$translationobj=Bio::LiveSeq::Translation->new(-transcript=>$transcriptobj); |
|
481
|
6
|
|
|
|
|
17
|
@transexons=(); # cleans it |
|
482
|
|
|
|
|
|
|
#push (@transcriptobjects,$transcriptobj); |
|
483
|
6
|
|
|
|
|
16
|
push (@translationobjects,$translationobj); |
|
484
|
|
|
|
|
|
|
} |
|
485
|
|
|
|
|
|
|
} |
|
486
|
6
|
|
|
|
|
20
|
return (@translationobjects); |
|
487
|
|
|
|
|
|
|
} |
|
488
|
|
|
|
|
|
|
|
|
489
|
|
|
|
|
|
|
#sub printgene { |
|
490
|
|
|
|
|
|
|
# deleted. Some functionality placed in Gene->printfeaturesnum |
|
491
|
|
|
|
|
|
|
|
|
492
|
|
|
|
|
|
|
=head2 printswissprot |
|
493
|
|
|
|
|
|
|
|
|
494
|
|
|
|
|
|
|
Title : printswissprot |
|
495
|
|
|
|
|
|
|
Usage : $loader->printswissprot($hashref); |
|
496
|
|
|
|
|
|
|
Function: prints out all information loaded from a database entry into the |
|
497
|
|
|
|
|
|
|
loader. Mainly used for testing purposes. |
|
498
|
|
|
|
|
|
|
Args : a hashref containing the SWISSPROT entry datas |
|
499
|
|
|
|
|
|
|
Note : the hashref can be obtained with a call to the method |
|
500
|
|
|
|
|
|
|
$loader->get_swisshash() (BioPerl via Bio::DB::EMBL.pm) |
|
501
|
|
|
|
|
|
|
that takes as argument a string like "SWISS-PROT:P10275" |
|
502
|
|
|
|
|
|
|
|
|
503
|
|
|
|
|
|
|
=cut |
|
504
|
|
|
|
|
|
|
|
|
505
|
|
|
|
|
|
|
# argument: hashref containing the SWISSPROT entry datas |
|
506
|
|
|
|
|
|
|
# prints out that hash, showing the information loaded |
|
507
|
|
|
|
|
|
|
sub printswissprot { |
|
508
|
0
|
|
|
0
|
1
|
0
|
my ($self,$entry)=@_; |
|
509
|
0
|
0
|
|
|
|
0
|
unless ($entry) { |
|
510
|
0
|
|
|
|
|
0
|
return; |
|
511
|
|
|
|
|
|
|
} |
|
512
|
|
|
|
|
|
|
printf "ID: %s\n", |
|
513
|
0
|
|
|
|
|
0
|
$entry->{'ID'}; |
|
514
|
|
|
|
|
|
|
printf "ACC: %s\n", |
|
515
|
0
|
|
|
|
|
0
|
$entry->{'AccNumber'}; |
|
516
|
|
|
|
|
|
|
printf "GENE: %s\n", |
|
517
|
0
|
|
|
|
|
0
|
$entry->{'Gene'}; |
|
518
|
|
|
|
|
|
|
printf "DES: %s\n", |
|
519
|
0
|
|
|
|
|
0
|
$entry->{'Description'}; |
|
520
|
|
|
|
|
|
|
printf "ORG: %s\n", |
|
521
|
0
|
|
|
|
|
0
|
$entry->{'Organism'}; |
|
522
|
|
|
|
|
|
|
printf "SEQLN: %s\n", |
|
523
|
0
|
|
|
|
|
0
|
$entry->{'SeqLength'}; |
|
524
|
|
|
|
|
|
|
printf "SEQ: %s\n", |
|
525
|
0
|
|
|
|
|
0
|
substr($entry->{'Sequence'},0,64); |
|
526
|
0
|
0
|
|
|
|
0
|
if ($entry->{'Features'}) { |
|
527
|
0
|
|
|
|
|
0
|
my @features=@{$entry->{'Features'}}; |
|
|
0
|
|
|
|
|
0
|
|
|
528
|
0
|
|
|
|
|
0
|
my $i; |
|
529
|
0
|
|
|
|
|
0
|
for $i (0..$#features) { |
|
530
|
0
|
|
|
|
|
0
|
print "|",$features[$i]->{'name'},"|"; |
|
531
|
0
|
|
|
|
|
0
|
print " at ",$features[$i]->{'location'},": "; |
|
532
|
0
|
|
|
|
|
0
|
print "",$features[$i]->{'desc'},"\n"; |
|
533
|
|
|
|
|
|
|
} |
|
534
|
|
|
|
|
|
|
} |
|
535
|
|
|
|
|
|
|
} |
|
536
|
|
|
|
|
|
|
|
|
537
|
|
|
|
|
|
|
=head2 printembl |
|
538
|
|
|
|
|
|
|
|
|
539
|
|
|
|
|
|
|
Title : printembl |
|
540
|
|
|
|
|
|
|
Usage : $loader->printembl(); |
|
541
|
|
|
|
|
|
|
Function: prints out all information loaded from a database entry into the |
|
542
|
|
|
|
|
|
|
loader. Mainly used for testing purposes. |
|
543
|
|
|
|
|
|
|
Args : none |
|
544
|
|
|
|
|
|
|
|
|
545
|
|
|
|
|
|
|
=cut |
|
546
|
|
|
|
|
|
|
|
|
547
|
|
|
|
|
|
|
# argument: hashref containing the EMBL entry datas |
|
548
|
|
|
|
|
|
|
# prints out that hash, showing the information loaded |
|
549
|
|
|
|
|
|
|
sub printembl { |
|
550
|
0
|
|
|
0
|
1
|
0
|
my ($self,$entry)=@_; |
|
551
|
0
|
0
|
|
|
|
0
|
unless ($entry) { |
|
552
|
0
|
|
|
|
|
0
|
$entry=$self->{'hash'}; |
|
553
|
|
|
|
|
|
|
} |
|
554
|
0
|
|
|
|
|
0
|
my ($i,$featurename); |
|
555
|
|
|
|
|
|
|
printf "ID: %s\n", |
|
556
|
0
|
|
|
|
|
0
|
$entry->{'ID'}; |
|
557
|
|
|
|
|
|
|
printf "ACC: %s\n", |
|
558
|
0
|
|
|
|
|
0
|
$entry->{'AccNumber'}; |
|
559
|
|
|
|
|
|
|
printf "MOL: %s\n", |
|
560
|
0
|
|
|
|
|
0
|
$entry->{'Molecule'}; |
|
561
|
|
|
|
|
|
|
printf "DIV: %s\n", |
|
562
|
0
|
|
|
|
|
0
|
$entry->{'Division'}; |
|
563
|
|
|
|
|
|
|
printf "DES: %s\n", |
|
564
|
0
|
|
|
|
|
0
|
$entry->{'Description'}; |
|
565
|
|
|
|
|
|
|
printf "ORG: %s\n", |
|
566
|
0
|
|
|
|
|
0
|
$entry->{'Organism'}; |
|
567
|
|
|
|
|
|
|
printf "SEQLN: %s\n", |
|
568
|
0
|
|
|
|
|
0
|
$entry->{'SeqLength'}; |
|
569
|
|
|
|
|
|
|
printf "SEQ: %s\n", |
|
570
|
0
|
|
|
|
|
0
|
substr($entry->{'Sequence'},0,64); |
|
571
|
0
|
|
|
|
|
0
|
my @features=@{$entry->{'Features'}}; |
|
|
0
|
|
|
|
|
0
|
|
|
572
|
0
|
|
|
|
|
0
|
my @cds=@{$entry->{'CDS'}}; |
|
|
0
|
|
|
|
|
0
|
|
|
573
|
0
|
|
|
|
|
0
|
print "\nFEATURES\nNumber of CDS: ",scalar(@cds)," (out of ",$entry->{'FeaturesNumber'}, " total features)\n"; |
|
574
|
0
|
|
|
|
|
0
|
my ($exonref,$transcript); |
|
575
|
0
|
|
|
|
|
0
|
my ($qualifiernumber,$qualifiers,$key); |
|
576
|
0
|
|
|
|
|
0
|
my ($start,$end,$strand); |
|
577
|
0
|
|
|
|
|
0
|
my $j=0; |
|
578
|
0
|
|
|
|
|
0
|
for $i (0..$#features) { |
|
579
|
0
|
|
|
|
|
0
|
$featurename=$features[$i]->{'name'}; |
|
580
|
0
|
0
|
|
|
|
0
|
if ($featurename eq "CDS") { |
|
581
|
0
|
|
|
|
|
0
|
print "|CDS| number $j at feature position: $i\n"; |
|
582
|
|
|
|
|
|
|
#print $features[$i]->{'location'},"\n"; |
|
583
|
0
|
|
|
|
|
0
|
$transcript=$features[$i]->{'range'}; |
|
584
|
0
|
|
|
|
|
0
|
foreach $exonref (@{$transcript}) { |
|
|
0
|
|
|
|
|
0
|
|
|
585
|
0
|
|
|
|
|
0
|
($start,$end,$strand)=@{$exonref}; |
|
|
0
|
|
|
|
|
0
|
|
|
586
|
0
|
|
|
|
|
0
|
print "\tExon: start $start end $end strand $strand\n"; |
|
587
|
|
|
|
|
|
|
} |
|
588
|
0
|
|
|
|
|
0
|
$j++; |
|
589
|
|
|
|
|
|
|
} else { |
|
590
|
0
|
|
|
|
|
0
|
print "|$featurename| at feature position: $i\n"; |
|
591
|
0
|
|
|
|
|
0
|
print "\trange: "; |
|
592
|
0
|
|
|
|
|
0
|
print join("\t",@{$features[$i]->{'range'}}),"\n"; |
|
|
0
|
|
|
|
|
0
|
|
|
593
|
|
|
|
|
|
|
#print $features[$i]->{'location'},"\n"; |
|
594
|
|
|
|
|
|
|
} |
|
595
|
0
|
|
|
|
|
0
|
$qualifiernumber=$features[$i]->{'qual_number'}; |
|
596
|
0
|
|
|
|
|
0
|
$qualifiers=$features[$i]->{'qualifiers'}; # hash |
|
597
|
0
|
|
|
|
|
0
|
foreach $key (keys (%{$qualifiers})) { |
|
|
0
|
|
|
|
|
0
|
|
|
598
|
0
|
|
|
|
|
0
|
print "\t\t",$key,": "; |
|
599
|
0
|
|
|
|
|
0
|
print $qualifiers->{$key},"\n"; |
|
600
|
|
|
|
|
|
|
} |
|
601
|
|
|
|
|
|
|
} |
|
602
|
|
|
|
|
|
|
} |
|
603
|
|
|
|
|
|
|
|
|
604
|
|
|
|
|
|
|
=head2 genes |
|
605
|
|
|
|
|
|
|
|
|
606
|
|
|
|
|
|
|
Title : genes |
|
607
|
|
|
|
|
|
|
Usage : $loader->genes(); |
|
608
|
|
|
|
|
|
|
Function: Returns an array of gene_names (strings) contained in the loaded |
|
609
|
|
|
|
|
|
|
entry. |
|
610
|
|
|
|
|
|
|
Args : none |
|
611
|
|
|
|
|
|
|
|
|
612
|
|
|
|
|
|
|
=cut |
|
613
|
|
|
|
|
|
|
|
|
614
|
|
|
|
|
|
|
# argument: entryhashref |
|
615
|
|
|
|
|
|
|
# returns: array of genenames found in the entry |
|
616
|
|
|
|
|
|
|
sub genes { |
|
617
|
0
|
|
|
0
|
1
|
0
|
my ($self,$entry)=@_; |
|
618
|
0
|
0
|
|
|
|
0
|
unless ($entry) { |
|
619
|
0
|
|
|
|
|
0
|
$entry=$self->{'hash'}; |
|
620
|
|
|
|
|
|
|
} |
|
621
|
0
|
|
|
|
|
0
|
my @entryfeatures=@{$entry->{'Features'}}; |
|
|
0
|
|
|
|
|
0
|
|
|
622
|
0
|
|
|
|
|
0
|
my ($genename,$genenames,$entryfeature); |
|
623
|
0
|
|
|
|
|
0
|
for $entryfeature (@entryfeatures) { |
|
624
|
0
|
|
|
|
|
0
|
$genename=$entryfeature->{'qualifiers'}->{'gene'}; |
|
625
|
0
|
0
|
|
|
|
0
|
if ($genename) { |
|
626
|
0
|
0
|
|
|
|
0
|
if (index($genenames,$genename) == -1) { # if name is new |
|
627
|
0
|
|
|
|
|
0
|
$genenames .= $genename . " "; # add the name |
|
628
|
|
|
|
|
|
|
} |
|
629
|
|
|
|
|
|
|
} |
|
630
|
|
|
|
|
|
|
} |
|
631
|
0
|
|
|
|
|
0
|
return (split(/ /,$genenames)); # assumes no space inbetween each genename |
|
632
|
|
|
|
|
|
|
} |
|
633
|
|
|
|
|
|
|
|
|
634
|
|
|
|
|
|
|
# arguments: swisshash, translation objref |
|
635
|
|
|
|
|
|
|
# adds information to the Translation, creates AARange objects, sets the |
|
636
|
|
|
|
|
|
|
# aa_range attribute on the Translation, pointing to those objects |
|
637
|
|
|
|
|
|
|
sub swisshash2liveseq { |
|
638
|
0
|
|
|
0
|
0
|
0
|
my ($self,$entry,$translation)=@_; |
|
639
|
0
|
|
|
|
|
0
|
my $translength=$translation->length; |
|
640
|
0
|
|
|
|
|
0
|
$translation->desc($translation->desc . $entry->{'Description'}); |
|
641
|
0
|
|
|
|
|
0
|
$translation->display_id("SWISSPROT:" . $entry->{'ID'}); |
|
642
|
0
|
|
|
|
|
0
|
$translation->accession_number("SWISSPROT:" . $entry->{'AccNumber'}); |
|
643
|
0
|
|
|
|
|
0
|
$translation->name($entry->{'Gene'}); |
|
644
|
0
|
|
|
|
|
0
|
$translation->source($entry->{'Organism'}); |
|
645
|
0
|
|
|
|
|
0
|
my @aarangeobjects; |
|
646
|
0
|
|
|
|
|
0
|
my ($start,$end,$aarangeobj,$feature); |
|
647
|
0
|
|
|
|
|
0
|
my @features; my @newfeatures; |
|
|
0
|
|
|
|
|
0
|
|
|
648
|
0
|
0
|
|
|
|
0
|
if ($entry->{'Features'}) { |
|
649
|
0
|
|
|
|
|
0
|
@features=@{$entry->{'Features'}}; |
|
|
0
|
|
|
|
|
0
|
|
|
650
|
|
|
|
|
|
|
} |
|
651
|
0
|
|
|
|
|
0
|
my $cleavedmet=0; |
|
652
|
|
|
|
|
|
|
# check for cleaved Met |
|
653
|
0
|
|
|
|
|
0
|
foreach $feature (@features) { |
|
654
|
0
|
0
|
0
|
|
|
0
|
if (($feature->{'name'} eq "INIT_MET")&&($feature->{'location'} eq "0 0")) { |
|
655
|
0
|
|
|
|
|
0
|
$cleavedmet=1; |
|
656
|
0
|
|
|
|
|
0
|
$translation->{'offset'}="1"; # from swissprot to liveseq protein sequence |
|
657
|
|
|
|
|
|
|
} else { |
|
658
|
0
|
|
|
|
|
0
|
push(@newfeatures,$feature); |
|
659
|
|
|
|
|
|
|
} |
|
660
|
|
|
|
|
|
|
} |
|
661
|
|
|
|
|
|
|
|
|
662
|
0
|
|
|
|
|
0
|
my $swissseq=$entry->{'Sequence'}; |
|
663
|
0
|
|
|
|
|
0
|
my $liveseqtransl=$translation->seq; |
|
664
|
0
|
|
|
|
|
0
|
chop $liveseqtransl; # to take away the trailing STOP "*" |
|
665
|
0
|
|
|
|
|
0
|
my $translated=substr($liveseqtransl,$cleavedmet); |
|
666
|
|
|
|
|
|
|
|
|
667
|
0
|
|
|
|
|
0
|
my ($liveseq_aa,$swiss_aa,$codes_aa)=$self->_get_alignment($translated,$swissseq); # alignment after cleavage of possible initial met |
|
668
|
|
|
|
|
|
|
|
|
669
|
0
|
0
|
0
|
|
|
0
|
if ((index($liveseq_aa,"-") != -1)||(index($swiss_aa,"-") != -1)) { # there are gaps, how to proceed? |
|
670
|
0
|
|
|
|
|
0
|
print "LIVE-SEQ=\'$liveseq_aa\'\nIDENTITY=\'$codes_aa\'\nSWS-PROT=\'$swiss_aa\'\n"; |
|
671
|
0
|
|
|
|
|
0
|
carp "Nucleotides translation and SwissProt translation are different in size, cannot attach the SwissSequence to the EMBL one, cannot add any AminoAcidRange object/Domain information!"; |
|
672
|
0
|
|
|
|
|
0
|
return; |
|
673
|
|
|
|
|
|
|
} |
|
674
|
|
|
|
|
|
|
|
|
675
|
|
|
|
|
|
|
#my $i=0; # debug |
|
676
|
0
|
|
|
|
|
0
|
@features=@newfeatures; |
|
677
|
0
|
|
|
|
|
0
|
foreach $feature (@features) { |
|
678
|
|
|
|
|
|
|
#print "Processing SwissProtFeature: $i\n"; # debug |
|
679
|
0
|
|
|
|
|
0
|
($start,$end)=split(/ /,$feature->{'location'}); |
|
680
|
|
|
|
|
|
|
# Note: cleavedmet is taken in account for updating numbering |
|
681
|
0
|
|
|
|
|
0
|
$aarangeobj=Bio::LiveSeq::AARange->new(-start => $start+$cleavedmet, -end => $end+$cleavedmet, -name => $feature->{'name'}, -description => $feature->{'description'}, -translation => $translation, -translength => $translength); |
|
682
|
0
|
0
|
|
|
|
0
|
if ($aarangeobj != -1) { |
|
683
|
0
|
|
|
|
|
0
|
push(@aarangeobjects,$aarangeobj); |
|
684
|
|
|
|
|
|
|
} |
|
685
|
|
|
|
|
|
|
# $i++; # debug |
|
686
|
|
|
|
|
|
|
} |
|
687
|
0
|
|
|
|
|
0
|
$translation->{'aa_ranges'}=\@aarangeobjects; |
|
688
|
|
|
|
|
|
|
} |
|
689
|
|
|
|
|
|
|
|
|
690
|
|
|
|
|
|
|
# if there is no SRS support, the default will be to return 0 |
|
691
|
|
|
|
|
|
|
# i.e. this function is overridden in SRS package |
|
692
|
|
|
|
|
|
|
sub get_swisshash { |
|
693
|
6
|
|
|
6
|
0
|
7
|
return (0); |
|
694
|
|
|
|
|
|
|
} |
|
695
|
|
|
|
|
|
|
|
|
696
|
|
|
|
|
|
|
# Args: $entry hashref, gene_name OR cds_position (undef is used to |
|
697
|
|
|
|
|
|
|
# choose between the two), getswissprotinfo boolean flag |
|
698
|
|
|
|
|
|
|
# Returns: an hash holding various arrayref used in the hash2gene method |
|
699
|
|
|
|
|
|
|
# Function: examines the nucleotide entry, identifying features belonging |
|
700
|
|
|
|
|
|
|
# to the gene (defined either by its name or by the position of its CDS in |
|
701
|
|
|
|
|
|
|
# the entry) |
|
702
|
|
|
|
|
|
|
|
|
703
|
|
|
|
|
|
|
sub _findgenefeatures { |
|
704
|
6
|
|
|
6
|
|
10
|
my ($self,$entry,$gene_name,$cds_position,$getswissprotinfo)=@_; |
|
705
|
|
|
|
|
|
|
|
|
706
|
6
|
|
|
|
|
9
|
my @entryfeatures=@{$entry->{'Features'}}; |
|
|
6
|
|
|
|
|
18
|
|
|
707
|
6
|
|
|
|
|
10
|
my @exons; my @introns; my @prim_transcripts; my @transcripts; |
|
|
0
|
|
|
|
|
0
|
|
|
|
0
|
|
|
|
|
0
|
|
|
708
|
0
|
|
|
|
|
0
|
my @repeat_units; my @repeat_regions; |
|
|
0
|
|
|
|
|
0
|
|
|
709
|
0
|
|
|
|
|
0
|
my @repeat_units_family; my @repeat_regions_family; my $rpt_family; |
|
|
0
|
|
|
|
|
0
|
|
|
|
0
|
|
|
|
|
0
|
|
|
710
|
0
|
|
|
|
|
0
|
my $entryfeature; my @genefeatures; |
|
|
0
|
|
|
|
|
0
|
|
|
711
|
0
|
|
|
|
|
0
|
my $desc; my @exondescs; my @introndescs; |
|
|
0
|
|
|
|
|
0
|
|
|
|
0
|
|
|
|
|
0
|
|
|
712
|
|
|
|
|
|
|
|
|
713
|
|
|
|
|
|
|
# for swissprot xreference |
|
714
|
0
|
|
|
|
|
0
|
my ($swissacc,$swisshash); my @swisshashes; |
|
|
0
|
|
|
|
|
0
|
|
|
715
|
|
|
|
|
|
|
|
|
716
|
|
|
|
|
|
|
# for translation_tables |
|
717
|
0
|
|
|
|
|
0
|
my @ttables; |
|
718
|
|
|
|
|
|
|
|
|
719
|
|
|
|
|
|
|
# to create labels |
|
720
|
0
|
|
|
|
|
0
|
my ($name,$exon); |
|
721
|
0
|
|
|
|
|
0
|
my @range; my @cdsexons; my @labels; |
|
|
0
|
|
|
|
|
0
|
|
|
|
0
|
|
|
|
|
0
|
|
|
722
|
|
|
|
|
|
|
|
|
723
|
|
|
|
|
|
|
# maybe here also could be added special case when there is no CDS feature |
|
724
|
|
|
|
|
|
|
# in the entry (e.g. tRNA entry -> TOCHECK). |
|
725
|
|
|
|
|
|
|
# let's deal with the special case in which there is just one gene per entry |
|
726
|
|
|
|
|
|
|
# usually without /gene qualifier |
|
727
|
6
|
|
|
|
|
6
|
my @cds=@{$entry->{'CDS'}}; |
|
|
6
|
|
|
|
|
14
|
|
|
728
|
|
|
|
|
|
|
|
|
729
|
6
|
|
|
|
|
10
|
my $skipgenematch=0; |
|
730
|
6
|
50
|
|
|
|
14
|
if (scalar(@cds) == 1) { |
|
731
|
|
|
|
|
|
|
#carp "Note: only one CDS in this entry. Treating all features found in entry as Gene features."; |
|
732
|
6
|
|
|
|
|
8
|
$skipgenematch=1; |
|
733
|
|
|
|
|
|
|
} |
|
734
|
|
|
|
|
|
|
|
|
735
|
6
|
|
|
|
|
7
|
my ($cds_begin,$cds_end,$proximity); |
|
736
|
6
|
50
|
|
|
|
16
|
if ($cds_position) { # if a position has been requested |
|
737
|
0
|
|
|
|
|
0
|
my @cds_exons=@{$cds[$cds_position-1]->{'range'}}; |
|
|
0
|
|
|
|
|
0
|
|
|
738
|
0
|
|
|
|
|
0
|
($cds_begin,$cds_end)=($cds_exons[0]->[0],$cds_exons[-1]->[1]); # begin and end of CDS |
|
739
|
0
|
|
|
|
|
0
|
$gene_name=$cds[$cds_position-1]->{'qualifiers'}->{'gene'}; |
|
740
|
|
|
|
|
|
|
# DEBUG |
|
741
|
0
|
0
|
|
|
|
0
|
unless ($skipgenematch) { |
|
742
|
0
|
|
|
|
|
0
|
carp "--DEBUG-- cdsbegin $cds_begin cdsend $cds_end--------"; |
|
743
|
|
|
|
|
|
|
} |
|
744
|
0
|
|
|
|
|
0
|
$proximity=100; # proximity CONSTANT to decide whether a feature "belongs" to the CDS |
|
745
|
|
|
|
|
|
|
} |
|
746
|
|
|
|
|
|
|
|
|
747
|
6
|
|
|
|
|
11
|
for $entryfeature (@entryfeatures) { # get only features for the desired gene |
|
748
|
30
|
0
|
0
|
|
|
45
|
if (($skipgenematch)||(($cds_position)&&($self->_checkfeatureproximity($entryfeature->{'range'},$cds_begin,$cds_end,$proximity)))||(!($cds_position)&&($entryfeature->{'qualifiers'}->{'gene'} eq "$gene_name"))) { |
|
|
|
|
33
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
749
|
30
|
|
|
|
|
26
|
push(@genefeatures,$entryfeature); |
|
750
|
|
|
|
|
|
|
|
|
751
|
30
|
|
|
|
|
21
|
my @range=@{$entryfeature->{'range'}}; |
|
|
30
|
|
|
|
|
58
|
|
|
752
|
30
|
|
|
|
|
27
|
$name=$entryfeature->{'name'}; |
|
753
|
30
|
|
|
|
|
24
|
my %qualifierhash=%{$entryfeature->{'qualifiers'}}; |
|
|
30
|
|
|
|
|
70
|
|
|
754
|
30
|
100
|
|
|
|
39
|
if ($name eq "CDS") { # that has range containing array of exons |
|
755
|
|
|
|
|
|
|
|
|
756
|
|
|
|
|
|
|
# swissprot crossindexing (if without SRS support it will fill array |
|
757
|
|
|
|
|
|
|
# with zeros and do nothing |
|
758
|
6
|
50
|
|
|
|
12
|
if ($getswissprotinfo) { |
|
759
|
6
|
|
|
|
|
12
|
$swissacc=$entryfeature->{'qualifiers'}->{'db_xref'}; |
|
760
|
6
|
|
|
|
|
23
|
$swisshash=$self->get_swisshash($swissacc); |
|
761
|
|
|
|
|
|
|
#$self->printswissprot($swisshash); # DEBUG |
|
762
|
6
|
|
|
|
|
8
|
push (@swisshashes,$swisshash); |
|
763
|
|
|
|
|
|
|
} |
|
764
|
|
|
|
|
|
|
|
|
765
|
6
|
|
|
|
|
13
|
push (@ttables,$entryfeature->{'qualifiers'}->{'transl_table'}); # undef if not specified |
|
766
|
|
|
|
|
|
|
|
|
767
|
|
|
|
|
|
|
# create labels array |
|
768
|
6
|
|
|
|
|
8
|
for $exon (@range) { |
|
769
|
34
|
|
|
|
|
54
|
push(@labels,$exon->[0],$exon->[1]); # start and end of every exon of the CDS |
|
770
|
|
|
|
|
|
|
} |
|
771
|
6
|
|
|
|
|
15
|
push (@transcripts,$entryfeature->{'range'}); |
|
772
|
|
|
|
|
|
|
} else { |
|
773
|
|
|
|
|
|
|
# "simplifying" the joinedlocation features. I.e. changing them from |
|
774
|
|
|
|
|
|
|
# multijoined ones to simple plain start-end features, taking only |
|
775
|
|
|
|
|
|
|
# the start of the first "exon" and the end of the last "exon" as |
|
776
|
|
|
|
|
|
|
# start and end of the entire feature |
|
777
|
24
|
50
|
33
|
|
|
42
|
if ($entryfeature->{'locationtype'} && $entryfeature->{'locationtype'} eq "joined") { # joined location |
|
778
|
0
|
|
|
|
|
0
|
@range=($range[0]->[0],$range[-1]->[1]); |
|
779
|
|
|
|
|
|
|
} |
|
780
|
24
|
|
|
|
|
30
|
push(@labels,$range[0],$range[1]); # start and end of every feature |
|
781
|
24
|
100
|
|
|
|
37
|
if ($name eq "exon") { |
|
782
|
9
|
|
|
|
|
9
|
$desc=$entryfeature->{'qualifiers'}->{'number'}; |
|
783
|
9
|
100
|
|
|
|
10
|
if ($entryfeature->{'qualifiers'}->{'note'}) { |
|
784
|
3
|
50
|
|
|
|
5
|
if ($desc) { |
|
785
|
0
|
|
|
|
|
0
|
$desc .= "|" . $entryfeature->{'qualifiers'}->{'note'}; |
|
786
|
|
|
|
|
|
|
} else { |
|
787
|
3
|
|
|
|
|
4
|
$desc = $entryfeature->{'qualifiers'}->{'note'}; |
|
788
|
|
|
|
|
|
|
} |
|
789
|
|
|
|
|
|
|
} |
|
790
|
9
|
|
50
|
|
|
15
|
push (@exondescs,$desc || "unknown"); |
|
791
|
9
|
|
|
|
|
7
|
push(@exons,\@range); |
|
792
|
|
|
|
|
|
|
} |
|
793
|
24
|
100
|
|
|
|
37
|
if ($name eq "intron") { |
|
794
|
8
|
|
|
|
|
7
|
$desc=$entryfeature->{'qualifiers'}->{'number'}; |
|
795
|
8
|
50
|
|
|
|
8
|
if ($desc) { |
|
796
|
0
|
|
|
|
|
0
|
$desc .= "|" . $entryfeature->{'qualifiers'}->{'note'}; |
|
797
|
|
|
|
|
|
|
} else { |
|
798
|
8
|
|
|
|
|
8
|
$desc = $entryfeature->{'qualifiers'}->{'note'}; |
|
799
|
|
|
|
|
|
|
} |
|
800
|
8
|
|
50
|
|
|
10
|
push (@introndescs,$desc || "unknown"); |
|
801
|
8
|
|
|
|
|
9
|
push(@introns,\@range); |
|
802
|
|
|
|
|
|
|
} |
|
803
|
24
|
100
|
100
|
|
|
69
|
if (($name eq "prim_transcript")||($name eq "mRNA")) { push(@prim_transcripts,\@range); } |
|
|
7
|
|
|
|
|
15
|
|
|
804
|
24
|
50
|
|
|
|
34
|
if ($name eq "repeat_unit") { push(@repeat_units,\@range); |
|
|
0
|
|
|
|
|
0
|
|
|
805
|
0
|
|
|
|
|
0
|
$rpt_family=$entryfeature->{'qualifiers'}->{'rpt_family'}; |
|
806
|
0
|
|
0
|
|
|
0
|
push (@repeat_units_family,$rpt_family || "unknown"); |
|
807
|
|
|
|
|
|
|
} |
|
808
|
24
|
50
|
|
|
|
50
|
if ($name eq "repeat_region") { push(@repeat_regions,\@range); |
|
|
0
|
|
|
|
|
0
|
|
|
809
|
0
|
|
|
|
|
0
|
$rpt_family=$entryfeature->{'qualifiers'}->{'rpt_family'}; |
|
810
|
0
|
|
0
|
|
|
0
|
push (@repeat_regions_family,$rpt_family || "unknown"); |
|
811
|
|
|
|
|
|
|
} |
|
812
|
|
|
|
|
|
|
} |
|
813
|
|
|
|
|
|
|
} |
|
814
|
|
|
|
|
|
|
} |
|
815
|
6
|
50
|
|
|
|
16
|
unless ($gene_name) { $gene_name="cds-position:".$cds_position; } |
|
|
0
|
|
|
|
|
0
|
|
|
816
|
6
|
|
|
|
|
12
|
my %genefeatureshash; |
|
817
|
6
|
|
|
|
|
18
|
$genefeatureshash{gene_name}=$gene_name; |
|
818
|
6
|
|
|
|
|
10
|
$genefeatureshash{genefeatures}=\@genefeatures; |
|
819
|
6
|
|
|
|
|
13
|
$genefeatureshash{labels}=\@labels; |
|
820
|
6
|
|
|
|
|
9
|
$genefeatureshash{ttables}=\@ttables; |
|
821
|
6
|
|
|
|
|
11
|
$genefeatureshash{swisshashes}=\@swisshashes; |
|
822
|
6
|
|
|
|
|
12
|
$genefeatureshash{transcripts}=\@transcripts; |
|
823
|
6
|
|
|
|
|
11
|
$genefeatureshash{exons}=\@exons; |
|
824
|
6
|
|
|
|
|
13
|
$genefeatureshash{exondescs}=\@exondescs; |
|
825
|
6
|
|
|
|
|
10
|
$genefeatureshash{introns}=\@introns; |
|
826
|
6
|
|
|
|
|
12
|
$genefeatureshash{introndescs}=\@introndescs; |
|
827
|
6
|
|
|
|
|
9
|
$genefeatureshash{prim_transcripts}=\@prim_transcripts; |
|
828
|
6
|
|
|
|
|
10
|
$genefeatureshash{repeat_units}=\@repeat_units; |
|
829
|
6
|
|
|
|
|
11
|
$genefeatureshash{repeat_regions}=\@repeat_regions; |
|
830
|
6
|
|
|
|
|
6
|
$genefeatureshash{repeat_units_family}=\@repeat_units_family; |
|
831
|
6
|
|
|
|
|
11
|
$genefeatureshash{repeat_regions_family}=\@repeat_regions_family; |
|
832
|
6
|
|
|
|
|
22
|
return (\%genefeatureshash); |
|
833
|
|
|
|
|
|
|
} |
|
834
|
|
|
|
|
|
|
|
|
835
|
|
|
|
|
|
|
|
|
836
|
|
|
|
|
|
|
# used by _findgenefeatures, when a CDS at a certain position is requested, |
|
837
|
|
|
|
|
|
|
# to retrieve only features quite close to the wanted CDS. |
|
838
|
|
|
|
|
|
|
# Args: range hashref, begin and end positions of the CDS, $proximity |
|
839
|
|
|
|
|
|
|
# $proximity holds the maximum distance between the extremes of the CDS |
|
840
|
|
|
|
|
|
|
# and of the feature under exam. |
|
841
|
|
|
|
|
|
|
# Returns: boolean |
|
842
|
|
|
|
|
|
|
sub _checkfeatureproximity { |
|
843
|
0
|
|
|
0
|
|
|
my ($self,$range,$cds_begin,$cds_end,$proximity)=@_; |
|
844
|
0
|
|
|
|
|
|
my @range=@{$range}; |
|
|
0
|
|
|
|
|
|
|
|
845
|
0
|
|
|
|
|
|
my ($begin,$end,$strand); |
|
846
|
0
|
0
|
|
|
|
|
if (ref($range[0]) eq "ARRAY") { # like in CDS, whose range equivals to exons |
|
847
|
0
|
|
|
|
|
|
($begin,$end,$strand)=($range[0]->[0],$range[-1]->[1],$range[0]->[2]); |
|
848
|
|
|
|
|
|
|
} else { |
|
849
|
0
|
|
|
|
|
|
($begin,$end,$strand)=@range; |
|
850
|
|
|
|
|
|
|
} |
|
851
|
0
|
0
|
|
|
|
|
if ($cds_begin > $cds_end) { # i.e. reverse strand CDS |
|
852
|
0
|
|
|
|
|
|
($cds_begin,$cds_end)=($cds_end,$cds_begin); # swap boundaries |
|
853
|
|
|
|
|
|
|
} |
|
854
|
0
|
0
|
|
|
|
|
if ($strand == -1) { # reverse strand |
|
855
|
0
|
|
|
|
|
|
($begin,$end)=($end,$begin); # swap boundaries |
|
856
|
|
|
|
|
|
|
} |
|
857
|
0
|
0
|
|
|
|
|
if (($cds_begin-$end)>$proximity) { |
|
858
|
0
|
|
|
|
|
|
carp "--DEBUG-- feature rejected: begin $begin end $end -------"; |
|
859
|
0
|
|
|
|
|
|
return (0); |
|
860
|
|
|
|
|
|
|
} |
|
861
|
0
|
0
|
|
|
|
|
if (($begin-$cds_end)>$proximity) { |
|
862
|
0
|
|
|
|
|
|
carp "--DEBUG-- feature rejected: begin $begin end $end -------"; |
|
863
|
0
|
|
|
|
|
|
return (0); |
|
864
|
|
|
|
|
|
|
} |
|
865
|
0
|
|
|
|
|
|
carp "--DEBUG-- feature accepted: begin $begin end $end -------"; |
|
866
|
0
|
|
|
|
|
|
return (1); # otherwise ok, feature considered next to CDS |
|
867
|
|
|
|
|
|
|
} |
|
868
|
|
|
|
|
|
|
|
|
869
|
|
|
|
|
|
|
|
|
870
|
|
|
|
|
|
|
# function that calls the external program "align" (on the fasta2 package) |
|
871
|
|
|
|
|
|
|
# to create an alignment between two sequences, returning the aligned |
|
872
|
|
|
|
|
|
|
# strings and the codes for the identity (:: ::::) |
|
873
|
|
|
|
|
|
|
|
|
874
|
|
|
|
|
|
|
sub _get_alignment { |
|
875
|
0
|
|
|
0
|
|
|
my ($self,$seq1,$seq2)=@_; |
|
876
|
|
|
|
|
|
|
|
|
877
|
0
|
0
|
|
|
|
|
my $null = ($^O =~ m/mswin/i) ? 'NUL' : '/dev/null'; |
|
878
|
0
|
|
|
|
|
|
my $fastafile1="/tmp/tmpfastafile1"; |
|
879
|
0
|
|
|
|
|
|
my $fastafile2="/tmp/tmpfastafile2"; |
|
880
|
0
|
|
|
|
|
|
my $grepcut='egrep -v "[[:digit:]]|^ *$|sequences" | cut -c8-'; # grep/cut |
|
881
|
0
|
|
|
|
|
|
my $alignprogram="/usr/local/etc/bioinfo/fasta2/align -s /usr/local/etc/bioinfo/fasta2/idnaa.mat $fastafile1 $fastafile2 2>$null | $grepcut"; # ALIGN |
|
882
|
0
|
0
|
|
|
|
|
open my $TMPFASTAFILE1, '>', $fastafile1 or croak "Could not write file '$fastafile1' for aa alignment: $!"; |
|
883
|
0
|
0
|
|
|
|
|
open my $TMPFASTAFILE2, '>', $fastafile2 or croak "Could not write file '$fastafile2' for aa alignment: $!"; |
|
884
|
0
|
|
|
|
|
|
print $TMPFASTAFILE1 ">firstseq\n$seq1\n"; |
|
885
|
0
|
|
|
|
|
|
print $TMPFASTAFILE2 ">secondseq\n$seq2\n"; |
|
886
|
0
|
|
|
|
|
|
close $TMPFASTAFILE1; |
|
887
|
0
|
|
|
|
|
|
close $TMPFASTAFILE2; |
|
888
|
0
|
|
|
|
|
|
my $alignment=`$alignprogram`; |
|
889
|
0
|
|
|
|
|
|
my @alignlines=split(/\n/,$alignment); |
|
890
|
0
|
|
|
|
|
|
my ($linecount,$seq1_aligned,$seq2_aligned,$codes); |
|
891
|
0
|
|
|
|
|
|
for ($linecount=0; $linecount < @alignlines; $linecount+=3) { |
|
892
|
0
|
|
|
|
|
|
$seq1_aligned .= $alignlines[$linecount]; |
|
893
|
0
|
|
|
|
|
|
$codes .= $alignlines[$linecount+1]; |
|
894
|
0
|
|
|
|
|
|
$seq2_aligned .= $alignlines[$linecount+2]; |
|
895
|
|
|
|
|
|
|
} |
|
896
|
0
|
|
|
|
|
|
return ($seq1_aligned,$seq2_aligned,$codes); |
|
897
|
|
|
|
|
|
|
} |
|
898
|
|
|
|
|
|
|
|
|
899
|
|
|
|
|
|
|
# common part of the function to create a novel liveseq gene structure |
|
900
|
|
|
|
|
|
|
# from an amino acid sequence, using codon usage frequencies |
|
901
|
|
|
|
|
|
|
# args: codon_usage_array transltableid aasequence gene_name |
|
902
|
|
|
|
|
|
|
sub _common_novelaasequence2gene { |
|
903
|
0
|
|
|
0
|
|
|
my ($species_codon_usage,$ttabid,$aasequence,$gene_name)=@_; |
|
904
|
0
|
|
|
|
|
|
my @species_codon_usage=@{$species_codon_usage}; |
|
|
0
|
|
|
|
|
|
|
|
905
|
0
|
|
|
|
|
|
my @codon_usage_label= |
|
906
|
|
|
|
|
|
|
qw (cga cgc cgg cgt aga agg cta ctc ctg ctt tta ttg tca tcc tcg |
|
907
|
|
|
|
|
|
|
tct agc agt aca acc acg act cca ccc ccg cct gca gcc gcg gct gga |
|
908
|
|
|
|
|
|
|
ggc ggg ggt gta gtc gtg gtt aaa aag aac aat caa cag cac cat gaa |
|
909
|
|
|
|
|
|
|
gag gac gat tac tat tgc tgt ttc ttt ata atc att atg tgg taa tag |
|
910
|
|
|
|
|
|
|
tga); |
|
911
|
0
|
|
|
|
|
|
my ($i,$j); |
|
912
|
0
|
|
|
|
|
|
my %codon_usage_value; |
|
913
|
0
|
|
|
|
|
|
my $aa_codon_total; |
|
914
|
0
|
|
|
|
|
|
for ($i=0;$i<64;$i++) { |
|
915
|
0
|
|
|
|
|
|
$codon_usage_value{$codon_usage_label[$i]}=$species_codon_usage[$i]; |
|
916
|
|
|
|
|
|
|
} |
|
917
|
|
|
|
|
|
|
|
|
918
|
0
|
|
|
|
|
|
my $CodonTable = Bio::Tools::CodonTable->new ( -id => $ttabid ); |
|
919
|
0
|
|
|
|
|
|
my @aminoacids = split(//,uc($aasequence)); |
|
920
|
0
|
|
|
|
|
|
my @alt_codons; my ($relativeusage,$dnasequence,$chosen_codon,$dice,$partial,$thiscodon); |
|
|
0
|
|
|
|
|
|
|
|
921
|
0
|
|
|
|
|
|
for $i (@aminoacids) { |
|
922
|
0
|
|
|
|
|
|
@alt_codons = $CodonTable->revtranslate($i); |
|
923
|
0
|
0
|
|
|
|
|
unless (@alt_codons) { |
|
924
|
0
|
|
|
|
|
|
carp "No reverse translation possible for aminoacid \'$i\'"; |
|
925
|
0
|
|
|
|
|
|
$dnasequence .= "???"; |
|
926
|
|
|
|
|
|
|
} else { |
|
927
|
0
|
|
|
|
|
|
$aa_codon_total=0; |
|
928
|
0
|
|
|
|
|
|
for $j (@alt_codons) { |
|
929
|
0
|
|
|
|
|
|
$aa_codon_total+=$codon_usage_value{$j}; |
|
930
|
|
|
|
|
|
|
} |
|
931
|
|
|
|
|
|
|
# print "aminoacid $i, codonchoice: "; # verbose |
|
932
|
|
|
|
|
|
|
#$partial=0; |
|
933
|
|
|
|
|
|
|
#for $j (@alt_codons) { |
|
934
|
|
|
|
|
|
|
#printf "%s %.2f ",$j,$partial+$codon_usage_value{$j}/$aa_codon_total; |
|
935
|
|
|
|
|
|
|
#$partial+=($codon_usage_value{$j}/$aa_codon_total); |
|
936
|
|
|
|
|
|
|
#} |
|
937
|
|
|
|
|
|
|
#print "\n"; |
|
938
|
0
|
|
|
|
|
|
$dice=rand(1); |
|
939
|
|
|
|
|
|
|
#print "roulette: $dice\n"; # verbose |
|
940
|
0
|
|
|
|
|
|
$partial=0; |
|
941
|
0
|
|
|
|
|
|
$chosen_codon=""; |
|
942
|
|
|
|
|
|
|
CODONCHOICE: |
|
943
|
0
|
|
|
|
|
|
for $j (0..@alt_codons) { # last one not accounted |
|
944
|
0
|
|
|
|
|
|
$thiscodon=$alt_codons[$j]; |
|
945
|
0
|
|
|
|
|
|
$relativeusage=($codon_usage_value{$thiscodon}/$aa_codon_total); |
|
946
|
0
|
0
|
|
|
|
|
if ($dice < $relativeusage+$partial) { |
|
947
|
0
|
|
|
|
|
|
$chosen_codon=$thiscodon; |
|
948
|
0
|
|
|
|
|
|
last CODONCHOICE; |
|
949
|
|
|
|
|
|
|
} else { |
|
950
|
0
|
|
|
|
|
|
$partial += $relativeusage; |
|
951
|
|
|
|
|
|
|
} |
|
952
|
|
|
|
|
|
|
} |
|
953
|
0
|
0
|
|
|
|
|
unless ($chosen_codon) { |
|
954
|
0
|
|
|
|
|
|
$chosen_codon = $alt_codons[-1]; # the last one |
|
955
|
|
|
|
|
|
|
} |
|
956
|
|
|
|
|
|
|
# print ".....adding $chosen_codon\n"; # verbose |
|
957
|
0
|
|
|
|
|
|
$dnasequence .= $chosen_codon; |
|
958
|
|
|
|
|
|
|
} |
|
959
|
|
|
|
|
|
|
} |
|
960
|
|
|
|
|
|
|
|
|
961
|
0
|
|
|
|
|
|
my $dna = Bio::LiveSeq::DNA->new(-seq => $dnasequence); |
|
962
|
0
|
|
|
|
|
|
my $min=1; |
|
963
|
0
|
|
|
|
|
|
my $max=length($dnasequence); |
|
964
|
0
|
|
|
|
|
|
my $exon = Bio::LiveSeq::Exon->new(-seq => $dna, -start => $min, -end => $max, -strand => 1); |
|
965
|
0
|
|
|
|
|
|
my @exons=($exon); |
|
966
|
0
|
|
|
|
|
|
my $transcript = Bio::LiveSeq::Transcript->new(-exons => \@exons); |
|
967
|
0
|
|
|
|
|
|
$transcript->translation_table($ttabid); |
|
968
|
0
|
|
|
|
|
|
my @transcripts=($transcript); |
|
969
|
0
|
|
|
|
|
|
my $translation = Bio::LiveSeq::Translation->new(-transcript => $transcript); |
|
970
|
0
|
|
|
|
|
|
my @translations=($translation); |
|
971
|
0
|
|
|
|
|
|
my %features=(DNA => $dna, Transcripts => \@transcripts, Translations => \@translations); |
|
972
|
0
|
|
|
|
|
|
my $gene = Bio::LiveSeq::Gene->new(-name => $gene_name, -features => \%features, -upbound => $min, -downbound => $max); |
|
973
|
|
|
|
|
|
|
|
|
974
|
|
|
|
|
|
|
# creation of gene |
|
975
|
0
|
0
|
|
|
|
|
unless ($gene) { # if $gene == 0 it means problems in hash2gene |
|
976
|
0
|
|
|
|
|
|
carp "Error in Gene creation phase"; |
|
977
|
0
|
|
|
|
|
|
return (0); |
|
978
|
|
|
|
|
|
|
} |
|
979
|
0
|
|
|
|
|
|
return $gene; |
|
980
|
|
|
|
|
|
|
} |
|
981
|
|
|
|
|
|
|
|
|
982
|
|
|
|
|
|
|
1; |