File Coverage

Bio/LiveSeq/IO/Loader.pm
Criterion Covered Total %
statement 268 592 45.2
branch 50 156 32.0
condition 10 49 20.4
subroutine 19 29 65.5
pod 5 12 41.6
total 352 838 42.0


line stmt bran cond sub pod time code
1             #
2             # bioperl module for Bio::LiveSeq::IO::Loader
3             #
4             # Please direct questions and support issues to
5             #
6             # Cared for by Joseph Insana
7             #
8             # Copyright Joseph Insana
9             #
10             # You may distribute this module under the same terms as perl itself
11             #
12             # POD documentation - main docs before the code
13              
14             =head1 NAME
15              
16             Bio::LiveSeq::IO::Loader - Parent Loader for LiveSeq
17              
18             =head1 SYNOPSIS
19              
20             #documentation needed
21              
22             =head1 DESCRIPTION
23              
24             This package holds common methods used by BioPerl and file loaders.
25             It contains methods to create LiveSeq objects out of entire entries or from a
26             localized sequence region surrounding a particular gene.
27              
28             =head1 AUTHOR - Joseph A.L. Insana
29              
30             Email: Insana@ebi.ac.uk, jinsana@gmx.net
31              
32             =head1 APPENDIX
33              
34             The rest of the documentation details each of the object
35             methods. Internal methods are usually preceded with a _
36              
37             =cut
38              
39             # Let the code begin...
40              
41             package Bio::LiveSeq::IO::Loader;
42              
43 2     2   13 use strict;
  2         2  
  2         54  
44 2     2   9 use Carp qw(cluck croak carp);
  2         4  
  2         86  
45 2     2   405 use Bio::LiveSeq::DNA;
  2         5  
  2         55  
46 2     2   503 use Bio::LiveSeq::Exon;
  2         4  
  2         53  
47 2     2   526 use Bio::LiveSeq::Transcript ;
  2         4  
  2         57  
48 2     2   474 use Bio::LiveSeq::Translation;
  2         4  
  2         47  
49 2     2   483 use Bio::LiveSeq::Gene;
  2         4  
  2         51  
50 2     2   747 use Bio::LiveSeq::Intron;
  2         3  
  2         52  
51 2     2   10 use Bio::LiveSeq::Prim_Transcript;
  2         4  
  2         31  
52 2     2   402 use Bio::LiveSeq::Repeat_Region;
  2         4  
  2         50  
53 2     2   389 use Bio::LiveSeq::Repeat_Unit;
  2         5  
  2         50  
54 2     2   500 use Bio::LiveSeq::AARange;
  2         3  
  2         70  
55 2     2   12 use Bio::Tools::CodonTable;
  2         3  
  2         7659  
56              
57             =head2 entry2liveseq
58              
59             Title : entry2liveseq
60             Usage : @translationobjects=$loader->entry2liveseq();
61             : @translationobjects=$loader->entry2liveseq(-getswissprotinfo => 0);
62             Function: creates LiveSeq objects from an entry previously loaded
63             Returns : array of references to objects of class Translation
64             Errorcode 0
65             Args : optional boolean flag to avoid the retrieval of SwissProt
66             information for all Transcripts containing SwissProt x-reference
67             default is 1 (to retrieve those information and create AARange
68             LiveSeq objects)
69             Note : this method can get really slow for big entries. The lightweight
70             gene2liveseq method is recommended
71              
72             =cut
73              
74             sub entry2liveseq {
75 0     0 1 0 my ($self, %args) = @_;
76 0         0 my ($getswissprotinfo)=($args{-getswissprotinfo});
77 0 0       0 if (defined($getswissprotinfo)) {
78 0 0 0     0 if (($getswissprotinfo ne 0)&&($getswissprotinfo ne 1)) {
79 0         0 carp "-getswissprotinfo argument can take only boolean (1 or 0) values. Setting it to 0, i.e. not trying to retrieve SwissProt information....";
80 0         0 $getswissprotinfo=0;
81             }
82             } else {
83 0         0 $getswissprotinfo=1;
84             }
85 0         0 my $hashref=$self->{'hash'};
86 0 0       0 unless ($hashref) { return (0); }
  0         0  
87 0         0 my @translationobjects=$self->hash2liveseq($hashref,$getswissprotinfo);
88 0         0 my $test_transl=0;
89 0 0       0 if ($test_transl) { $self->test_transl($hashref,\@translationobjects);}
  0         0  
90 0         0 return @translationobjects;
91             }
92              
93             =head2 novelaasequence2gene
94              
95             Title : novelaasequence2gene
96             Usage : $gene=$loader->novelaasequence2gene(-aasequence => "MGLAAPTRS*");
97             : $gene=$loader->novelaasequence2gene(-aasequence => "MGLAAPTRS*");
98             -taxon => 9606,
99             -gene_name => "tyr-kinase");
100              
101             Function: creates LiveSeq objects from a novel amino acid sequence,
102             using codon usage database to choose codons according to
103             relative frequencies.
104             If a taxon ID is not specified, the default is to use the human
105             one (taxonomy ID 9606).
106             Returns : reference to a Gene object containing references to LiveSeq objects
107             Errorcode 0
108             Args : string containing an amino acid sequence
109             integer (optional) with a taxonomy ID
110             string specifying a gene name
111              
112             =cut
113              
114             =head2 gene2liveseq
115              
116             Title : gene2liveseq
117             Usage : $gene=$loader->gene2liveseq(-gene_name => "gene name");
118             : $gene=$loader->gene2liveseq(-gene_name => "gene name",
119             -flanking => 64);
120             : $gene=$loader->gene2liveseq(-gene_name => "gene name",
121             -getswissprotinfo => 0);
122             : $gene=$loader->gene2liveseq(-position => 4);
123              
124             Function: creates LiveSeq objects from an entry previously loaded
125             It is a "light weight" creation because it creates
126             a LiveSequence just for the interesting region in an entry
127             (instead than for the total entry, like in entry2liveseq) and for
128             the flanking regions up to 500 nucleotides (default) or up to
129             the specified amount of nucleotides (given as argument) around the
130             Gene.
131             Returns : reference to a Gene object containing possibly alternative
132             Transcripts.
133             Errorcode 0
134             Args : string containing the gene name as in the EMBL feature qualifier
135             integer (optional) "flanking": amount of flanking bases to be kept
136             boolean (optional) "getswissprotinfo": if set to "0" it will avoid
137             trying to fetch information from a crossreference to a SwissProt
138             entry, avoiding the process of creation of AARange objects
139             It is "1" (on) by default
140              
141             Alternative to a gene_name, a position can be given: an
142             integer (1-) containing the position of the desired CDS in the
143             loaded entry
144              
145             =cut
146              
147             sub gene2liveseq {
148 6     6 1 482 my ($self, %args) = @_;
149 6         23 my ($gene_name,$flanking,$getswissprotinfo,$cds_position)=($args{-gene_name},$args{-flanking},$args{-getswissprotinfo},$args{-position});
150 6         7 my $input;
151 6 50 33     20 unless (($gene_name)||($cds_position)) {
152 0         0 carp "Gene_Name or Position not specified for gene2liveseq loading function";
153 0         0 return (0);
154             }
155 6 50 33     30 if (($gene_name)&&($cds_position)) {
    50          
156 0         0 carp "Gene_Name and Position cannot be given together";
157 0         0 return (0);
158             } elsif ($gene_name) {
159 6         10 $input=$gene_name;
160             } else {
161 0         0 $input="cds-position:".$cds_position;
162             }
163              
164 6 50       15 if (defined($getswissprotinfo)) {
165 0 0 0     0 if (($getswissprotinfo ne 0)&&($getswissprotinfo ne 1)) {
166 0         0 carp "-getswissprotinfo argument can take only boolean (1 or 0) values. Setting it to 0, i.e. not trying to retrieve SwissProt information....";
167 0         0 $getswissprotinfo=0;
168             }
169             } else {
170 6         11 $getswissprotinfo=1;
171             }
172              
173 6 50       17 if (defined($flanking)) {
174 0 0       0 unless ($flanking >= 0) {
175 0         0 carp "No sense in specifying a number < 0 for flanking regions to be created for gene2liveseq loading function";
176 0         0 return (0);
177             }
178             } else {
179 6         9 $flanking=500; # the default flanking length
180             }
181 6         14 my $hashref=$self->{'hash'};
182 6 50       15 unless ($hashref) { return (0); }
  0         0  
183 6         23 my $gene=$self->hash2gene($hashref,$input,$flanking,$getswissprotinfo);
184 6 50       50 unless ($gene) { # if $gene == 0 it means problems in hash2gene
185 0         0 carp "gene2liveseq produced error";
186 0         0 return (0);
187             }
188 6         41 return $gene;
189             }
190              
191             # TODO: update so that it will work even if CDS is not only accepted FEATURE!!
192             # this method is for now deprecated and not supported
193             sub test_transl {
194 0     0 0 0 my ($self,$entry)=@_;
195 0         0 my @features=@{$entry->{'Features'}};
  0         0  
196 0         0 my @translationobjects=@{$_[1]};
  0         0  
197 0         0 my ($i,$translation);
198 0         0 my ($obj_transl,$hash_transl);
199 0         0 my @cds=@{$entry->{'CDS'}};
  0         0  
200 0         0 foreach $translation (@translationobjects) {
201 0         0 $obj_transl=$translation->seq;
202 0         0 $hash_transl=$cds[$i]->{'qualifiers'}->{'translation'};
203             #before seq was changed in Translation 1.4# chop $obj_transl; # to remove trailing "*"
204 0 0       0 unless ($obj_transl eq $hash_transl) {
205 0         0 cluck "Serious error: Translation from the Entry does not match Translation from object's seq for CDS at position $i";
206 0         0 carp "\nEntry's transl: ",$hash_transl,"\n";
207 0         0 carp "\nObject's transl: ",$obj_transl,"\n";
208 0         0 exit;
209             }
210 0         0 $i++;
211             }
212             }
213              
214             # argument: hashref containing the EMBL entry datas,
215             # getswissprotinfo boolean flag
216             # creates the liveseq objects
217             # returns: an array of Translation object references
218             sub hash2liveseq {
219 0     0 0 0 my ($self,$entry,$getswissprotinfo)=@_;
220 0         0 my $i;
221             my @transcripts;
222 0         0 my $dna=Bio::LiveSeq::DNA->new(-seq => $entry->{'Sequence'} );
223 0         0 $dna->alphabet(lc($entry->{'Molecule'}));
224 0         0 $dna->display_id($entry->{'ID'});
225 0         0 $dna->accession_number($entry->{'AccNumber'});
226 0         0 $dna->desc($entry->{'Description'});
227 0         0 my @cds=@{$entry->{'CDS'}};
  0         0  
228 0         0 my ($swissacc,$swisshash); my @swisshashes;
  0         0  
229 0         0 for $i (0..$#cds) {
230             #my @transcript=@{$cds[$i]->{'range'}};
231             #$transcript=\@transcript;
232             #push (@transcripts,$transcript);
233 0         0 push (@transcripts,$cds[$i]->{'range'});
234 0 0       0 if ($getswissprotinfo) {
235 0         0 $swissacc=$cds[$i]->{'qualifiers'}->{'db_xref'};
236 0         0 $swisshash=$self->get_swisshash($swissacc);
237             #$self->printswissprot($swisshash); # DEBUG
238 0         0 push (@swisshashes,$swisshash);
239             }
240             }
241 0         0 my @translations=($self->transexonscreation($dna,\@transcripts));
242 0         0 my $translation; my $j=0;
  0         0  
243 0         0 foreach $translation (@translations) {
244 0 0       0 if ($swisshashes[$j]) { # if not 0
245 0         0 $self->swisshash2liveseq($swisshashes[$j],$translation);
246             }
247 0         0 $j++;
248             }
249 0         0 return (@translations);
250             }
251              
252             # only features pertaining to a specified gene are created
253             # only the sequence of the gene and appropriate context flanking regions
254             # are created as chain
255             # arguments: hashref, gene_name (OR: cds_position), length_of_flanking_sequences, getswissprotinfo boolean flag
256             # returns: reference to Gene object
257             #
258             # Note: if entry contains just one CDS, all the features get added
259             # this is useful because often the features in these entries do not
260             # carry the /gene qualifier
261             #
262             # errorcode: 0
263             sub hash2gene {
264 6     6 0 16 my ($self,$entry,$input,$flanking,$getswissprotinfo)=@_;
265 6         10 my $entryfeature;
266             my $genefeatureshash;
267              
268 6         9 my @cds=@{$entry->{'CDS'}};
  6         17  
269              
270             # checking if a position has been given instead than a gene_name
271 6 50       22 if (index($input,"cds-position:") == 0 ) {
272 0         0 my $cds_position=substr($input,13); # extracting the cds position
273 0 0 0     0 if (($cds_position >= 1)&&($cds_position <= scalar(@cds))) {
274 0         0 $genefeatureshash=$self->_findgenefeatures($entry,undef,$cds_position,$getswissprotinfo);
275             }
276             } else {
277 6         28 $genefeatureshash=$self->_findgenefeatures($entry,$input,undef,$getswissprotinfo);
278             }
279              
280 6 50 50     23 unless (($genefeatureshash)&&(scalar(@{$genefeatureshash->{'genefeatures'}}))) { # array empty, no gene features found
  6         21  
281 0         0 my @genes=$self->genes($entry);
282 0         0 my $cds_number=scalar(@cds);
283 0         0 warn "Warning! Not even one genefeature found for /$input/....
284             The genes present in this entry are:\n\t@genes\n
285             The number of CDS in this entry is:\n\t$cds_number\n";
286 0         0 return(0);
287             }
288              
289             # get max and min, check flankings
290 6         11 my ($min,$max)=$self->rangeofarray(@{$genefeatureshash->{'labels'}}); # gene "boundaries"
  6         26  
291 6         12 my $seqlength=$entry->{'SeqLength'};
292 6         10 my ($mindna,$maxdna); # some flanking region next to the gene "boundaries"
293 6 50       17 if ($min-$flanking < 1) {
294 6         9 $mindna=1;
295             } else {
296 0         0 $mindna=$min-$flanking;
297             }
298 6 50       15 if ($max+$flanking > $seqlength) {
299 6         8 $maxdna=$seqlength;
300             } else {
301 0         0 $maxdna=$max+$flanking;
302             }
303 6         89 my $subseq=substr($entry->{'Sequence'},$mindna-1,$maxdna-$mindna+1);
304              
305             # create LiveSeq objects
306              
307             # create DNA
308 6         50 my $dna=Bio::LiveSeq::DNA->new(-seq => $subseq, -offset => $mindna);
309 6         72 $dna->alphabet(lc($entry->{'Molecule'}));
310 6         28 $dna->source($entry->{'Organism'});
311 6         34 $dna->display_id($entry->{'ID'});
312 6         28 $dna->accession_number($entry->{'AccNumber'});
313 6         28 $dna->desc($entry->{'Description'});
314              
315 6         12 my @transcripts=@{$genefeatureshash->{'transcripts'}};
  6         20  
316             # create Translations, Transcripts, Exons out of the CDS
317 6 50       14 unless (@transcripts) {
318 0         0 cluck "no CDS feature found for /$input/....";
319 0         0 return(0);
320             }
321 6         70 my @translationobjs=$self->transexonscreation($dna,\@transcripts);
322 6         13 my @transcriptobjs;
323              
324             # get the Transcript obj_refs
325             my $translation;
326 6         11 my $j=0;
327 6         7 my @ttables=@{$genefeatureshash->{'ttables'}};
  6         20  
328 6         7 my @swisshashes=@{$genefeatureshash->{'swisshashes'}};
  6         19  
329 6         13 foreach $translation (@translationobjs) {
330 6         24 push(@transcriptobjs,$translation->get_Transcript);
331 6 50       19 if ($ttables[$j]) { # if not undef
332 0         0 $translation->get_Transcript->translation_table($ttables[$j]);
333             #} else { # DEBUG
334             # print "\n\t\tno translation table information....\n";
335             }
336 6 50       16 if ($swisshashes[$j]) { # if not 0
337 0         0 $self->swisshash2liveseq($swisshashes[$j],$translation);
338             }
339 6         9 $j++;
340             }
341              
342 6         7 my %gene; # this is the hash to store created object references
343 6         18 $gene{DNA}=$dna;
344 6         14 $gene{Transcripts}=\@transcriptobjs;
345 6         13 $gene{Translations}=\@translationobjs;
346              
347 6         37 my @exonobjs; my @intronobjs;
348 6         0 my @repeatunitobjs; my @repeatregionobjs;
  6         0  
349 6         0 my @primtranscriptobjs;
350              
351 6         0 my ($object,$range,$start,$end,$strand);
352              
353 6         9 my @exons=@{$genefeatureshash->{'exons'}};
  6         17  
354 6         9 my @exondescs=@{$genefeatureshash->{'exondescs'}};
  6         21  
355 6 100       15 if (@exons) {
356 1         2 my $exoncount = 0;
357 1         2 foreach $range (@exons) {
358 9         9 ($start,$end,$strand)=@{$range};
  9         20  
359 9         29 $object = Bio::LiveSeq::Exon->new(-seq=>$dna,-start=>$start,-end=>$end,-strand=>$strand);
360 9 50       21 if ($object != -1) {
361 9 50       31 $object->desc($exondescs[$exoncount]) if defined $exondescs[$exoncount];
362 9         9 $exoncount++;
363 9         16 push (@exonobjs,$object);
364             } else {
365 0         0 $exoncount++;
366             }
367             }
368 1         3 $gene{Exons}=\@exonobjs;
369             }
370 6         7 my @introns=@{$genefeatureshash->{'introns'}};
  6         15  
371 6         10 my @introndescs=@{$genefeatureshash->{'introndescs'}};
  6         16  
372 6 100       18 if (@introns) {
373 1         3 my $introncount = 0;
374 1         3 foreach $range (@introns) {
375 8         12 ($start,$end,$strand)=@{$range};
  8         18  
376 8         36 $object=Bio::LiveSeq::Intron->new(-seq=>$dna,-start=>$start,-end=>$end,-strand=>$strand);
377 8 50       24 if ($object != -1) {
378 8         28 $object->desc($introndescs[$introncount]);
379 8         9 $introncount++;
380 8         17 push (@intronobjs,$object);
381             } else {
382 0         0 $introncount++;
383             }
384             }
385 1         3 $gene{Introns}=\@intronobjs;
386             }
387 6         9 my @prim_transcripts=@{$genefeatureshash->{'prim_transcripts'}};
  6         16  
388 6 100       14 if (@prim_transcripts) {
389 5         10 foreach $range (@prim_transcripts) {
390 7         13 ($start,$end,$strand)=@{$range};
  7         23  
391 7         57 $object=Bio::LiveSeq::Prim_Transcript->new(-seq=>$dna,-start=>$start,-end=>$end,-strand=>$strand);
392 7 50       43 if ($object != -1) { push (@primtranscriptobjs,$object); }
  7         34  
393             }
394 5         23 $gene{Prim_Transcripts}=\@primtranscriptobjs;
395             }
396 6         12 my @repeat_regions=@{$genefeatureshash->{'repeat_regions'}};
  6         20  
397 6         12 my @repeat_regions_family=@{$genefeatureshash->{'repeat_regions_family'}};
  6         15  
398 6 50       21 if (@repeat_regions) {
399 0         0 my $k=0;
400 0         0 foreach $range (@repeat_regions) {
401 0         0 ($start,$end,$strand)=@{$range};
  0         0  
402 0         0 $object=Bio::LiveSeq::Repeat_Region->new(-seq=>$dna,-start=>$start,-end=>$end,-strand=>$strand);
403 0 0       0 if ($object != -1) {
404 0         0 $object->desc($repeat_regions_family[$k]);
405 0         0 $k++;
406 0         0 push (@repeatregionobjs,$object);
407             } else {
408 0         0 $k++;
409             }
410             }
411 0         0 $gene{Repeat_Regions}=\@repeatregionobjs;
412             }
413 6         9 my @repeat_units=@{$genefeatureshash->{'repeat_units'}};
  6         15  
414 6         9 my @repeat_units_family=@{$genefeatureshash->{'repeat_units_family'}};
  6         16  
415 6 50       18 if (@repeat_units) {
416 0         0 my $k=0;
417 0         0 foreach $range (@repeat_units) {
418 0         0 ($start,$end,$strand)=@{$range};
  0         0  
419 0         0 $object=Bio::LiveSeq::Repeat_Unit->new(-seq=>$dna,-start=>$start,-end=>$end,-strand=>$strand);
420 0 0       0 if ($object != -1) {
421 0         0 $object->desc($repeat_units_family[$k]);
422 0         0 $k++;
423 0         0 push (@repeatunitobjs,$object);
424             } else {
425 0         0 $k++;
426             }
427             }
428 0         0 $gene{Repeat_Units}=\@repeatunitobjs;
429             }
430              
431             # create the Gene
432 6         15 my $gene_name=$genefeatureshash->{'gene_name'}; # either a name or a cdspos
433 6         78 return (Bio::LiveSeq::Gene->new(-name=>$gene_name,-features=>\%gene,
434             -upbound=>$min,-downbound=>$max));
435             }
436              
437             # maybe this function will be moved to general utility package
438             # argument: array of numbers
439             # returns: (min,max) numbers in the array
440             sub rangeofarray {
441 6     6 0 9 my $self=shift;
442 6         33 my @array=@_;
443             #print "\n-=-=-=-=-=-=-=-=-=-=array: @array\n";
444 6         10 my ($max,$min,$element);
445 6         15 $min=$max=shift(@array);
446 6         14 foreach $element (@array) {
447 110 50       127 $element = 0 unless defined $element;
448 110 50       141 if ($element < $min) {
449 0         0 $min=$element;
450             }
451 110 100       158 if ($element > $max) {
452 6         11 $max=$element;
453             }
454             }
455             #print "\n-=-=-=-=-=-=-=-=-=-=min: $min\tmax: $max\n";
456 6         23 return ($min,$max);
457             }
458              
459              
460             # argument: reference to DNA object, reference to array of transcripts
461             # returns: an array of Translation object references
462             sub transexonscreation {
463 6     6 0 14 my $self=shift;
464 6         11 my $dna=$_[0];
465 6         8 my @transcripts=@{$_[1]};
  6         14  
466              
467 6         25 my (@transexons,$start,$end,$strand,$exonref,$exonobj,$transcript,$transcriptobj);
468 6         0 my $translationobj;
469 6         0 my @translationobjects;
470 6         14 foreach $transcript (@transcripts) {
471 6         9 foreach $exonref (@{$transcript}) {
  6         12  
472 34         40 ($start,$end,$strand)=@{$exonref};
  34         79  
473             #print "Creating Exon: start $start end $end strand $strand\n";
474 34         146 $exonobj=Bio::LiveSeq::Exon->new(-seq=>$dna,-start=>$start,-end=>$end,-strand=>$strand);
475             #push (@exonobjects,$exonobj);
476 34         78 push (@transexons,$exonobj);
477             }
478 6         63 $transcriptobj=Bio::LiveSeq::Transcript->new(-exons => \@transexons );
479 6 50       27 if ($transcriptobj != -1) {
480 6         53 $translationobj=Bio::LiveSeq::Translation->new(-transcript=>$transcriptobj);
481 6         14 @transexons=(); # cleans it
482             #push (@transcriptobjects,$transcriptobj);
483 6         17 push (@translationobjects,$translationobj);
484             }
485             }
486 6         25 return (@translationobjects);
487             }
488              
489             #sub printgene {
490             # deleted. Some functionality placed in Gene->printfeaturesnum
491              
492             =head2 printswissprot
493              
494             Title : printswissprot
495             Usage : $loader->printswissprot($hashref);
496             Function: prints out all information loaded from a database entry into the
497             loader. Mainly used for testing purposes.
498             Args : a hashref containing the SWISSPROT entry datas
499             Note : the hashref can be obtained with a call to the method
500             $loader->get_swisshash() (BioPerl via Bio::DB::EMBL.pm)
501             that takes as argument a string like "SWISS-PROT:P10275"
502              
503             =cut
504              
505             # argument: hashref containing the SWISSPROT entry datas
506             # prints out that hash, showing the information loaded
507             sub printswissprot {
508 0     0 1 0 my ($self,$entry)=@_;
509 0 0       0 unless ($entry) {
510 0         0 return;
511             }
512             printf "ID: %s\n",
513 0         0 $entry->{'ID'};
514             printf "ACC: %s\n",
515 0         0 $entry->{'AccNumber'};
516             printf "GENE: %s\n",
517 0         0 $entry->{'Gene'};
518             printf "DES: %s\n",
519 0         0 $entry->{'Description'};
520             printf "ORG: %s\n",
521 0         0 $entry->{'Organism'};
522             printf "SEQLN: %s\n",
523 0         0 $entry->{'SeqLength'};
524             printf "SEQ: %s\n",
525 0         0 substr($entry->{'Sequence'},0,64);
526 0 0       0 if ($entry->{'Features'}) {
527 0         0 my @features=@{$entry->{'Features'}};
  0         0  
528 0         0 my $i;
529 0         0 for $i (0..$#features) {
530 0         0 print "|",$features[$i]->{'name'},"|";
531 0         0 print " at ",$features[$i]->{'location'},": ";
532 0         0 print "",$features[$i]->{'desc'},"\n";
533             }
534             }
535             }
536              
537             =head2 printembl
538              
539             Title : printembl
540             Usage : $loader->printembl();
541             Function: prints out all information loaded from a database entry into the
542             loader. Mainly used for testing purposes.
543             Args : none
544              
545             =cut
546              
547             # argument: hashref containing the EMBL entry datas
548             # prints out that hash, showing the information loaded
549             sub printembl {
550 0     0 1 0 my ($self,$entry)=@_;
551 0 0       0 unless ($entry) {
552 0         0 $entry=$self->{'hash'};
553             }
554 0         0 my ($i,$featurename);
555             printf "ID: %s\n",
556 0         0 $entry->{'ID'};
557             printf "ACC: %s\n",
558 0         0 $entry->{'AccNumber'};
559             printf "MOL: %s\n",
560 0         0 $entry->{'Molecule'};
561             printf "DIV: %s\n",
562 0         0 $entry->{'Division'};
563             printf "DES: %s\n",
564 0         0 $entry->{'Description'};
565             printf "ORG: %s\n",
566 0         0 $entry->{'Organism'};
567             printf "SEQLN: %s\n",
568 0         0 $entry->{'SeqLength'};
569             printf "SEQ: %s\n",
570 0         0 substr($entry->{'Sequence'},0,64);
571 0         0 my @features=@{$entry->{'Features'}};
  0         0  
572 0         0 my @cds=@{$entry->{'CDS'}};
  0         0  
573 0         0 print "\nFEATURES\nNumber of CDS: ",scalar(@cds)," (out of ",$entry->{'FeaturesNumber'}, " total features)\n";
574 0         0 my ($exonref,$transcript);
575 0         0 my ($qualifiernumber,$qualifiers,$key);
576 0         0 my ($start,$end,$strand);
577 0         0 my $j=0;
578 0         0 for $i (0..$#features) {
579 0         0 $featurename=$features[$i]->{'name'};
580 0 0       0 if ($featurename eq "CDS") {
581 0         0 print "|CDS| number $j at feature position: $i\n";
582             #print $features[$i]->{'location'},"\n";
583 0         0 $transcript=$features[$i]->{'range'};
584 0         0 foreach $exonref (@{$transcript}) {
  0         0  
585 0         0 ($start,$end,$strand)=@{$exonref};
  0         0  
586 0         0 print "\tExon: start $start end $end strand $strand\n";
587             }
588 0         0 $j++;
589             } else {
590 0         0 print "|$featurename| at feature position: $i\n";
591 0         0 print "\trange: ";
592 0         0 print join("\t",@{$features[$i]->{'range'}}),"\n";
  0         0  
593             #print $features[$i]->{'location'},"\n";
594             }
595 0         0 $qualifiernumber=$features[$i]->{'qual_number'};
596 0         0 $qualifiers=$features[$i]->{'qualifiers'}; # hash
597 0         0 foreach $key (keys (%{$qualifiers})) {
  0         0  
598 0         0 print "\t\t",$key,": ";
599 0         0 print $qualifiers->{$key},"\n";
600             }
601             }
602             }
603              
604             =head2 genes
605              
606             Title : genes
607             Usage : $loader->genes();
608             Function: Returns an array of gene_names (strings) contained in the loaded
609             entry.
610             Args : none
611              
612             =cut
613              
614             # argument: entryhashref
615             # returns: array of genenames found in the entry
616             sub genes {
617 0     0 1 0 my ($self,$entry)=@_;
618 0 0       0 unless ($entry) {
619 0         0 $entry=$self->{'hash'};
620             }
621 0         0 my @entryfeatures=@{$entry->{'Features'}};
  0         0  
622 0         0 my ($genename,$genenames,$entryfeature);
623 0         0 for $entryfeature (@entryfeatures) {
624 0         0 $genename=$entryfeature->{'qualifiers'}->{'gene'};
625 0 0       0 if ($genename) {
626 0 0       0 if (index($genenames,$genename) == -1) { # if name is new
627 0         0 $genenames .= $genename . " "; # add the name
628             }
629             }
630             }
631 0         0 return (split(/ /,$genenames)); # assumes no space inbetween each genename
632             }
633              
634             # arguments: swisshash, translation objref
635             # adds information to the Translation, creates AARange objects, sets the
636             # aa_range attribute on the Translation, pointing to those objects
637             sub swisshash2liveseq {
638 0     0 0 0 my ($self,$entry,$translation)=@_;
639 0         0 my $translength=$translation->length;
640 0         0 $translation->desc($translation->desc . $entry->{'Description'});
641 0         0 $translation->display_id("SWISSPROT:" . $entry->{'ID'});
642 0         0 $translation->accession_number("SWISSPROT:" . $entry->{'AccNumber'});
643 0         0 $translation->name($entry->{'Gene'});
644 0         0 $translation->source($entry->{'Organism'});
645 0         0 my @aarangeobjects;
646 0         0 my ($start,$end,$aarangeobj,$feature);
647 0         0 my @features; my @newfeatures;
  0         0  
648 0 0       0 if ($entry->{'Features'}) {
649 0         0 @features=@{$entry->{'Features'}};
  0         0  
650             }
651 0         0 my $cleavedmet=0;
652             # check for cleaved Met
653 0         0 foreach $feature (@features) {
654 0 0 0     0 if (($feature->{'name'} eq "INIT_MET")&&($feature->{'location'} eq "0 0")) {
655 0         0 $cleavedmet=1;
656 0         0 $translation->{'offset'}="1"; # from swissprot to liveseq protein sequence
657             } else {
658 0         0 push(@newfeatures,$feature);
659             }
660             }
661              
662 0         0 my $swissseq=$entry->{'Sequence'};
663 0         0 my $liveseqtransl=$translation->seq;
664 0         0 chop $liveseqtransl; # to take away the trailing STOP "*"
665 0         0 my $translated=substr($liveseqtransl,$cleavedmet);
666              
667 0         0 my ($liveseq_aa,$swiss_aa,$codes_aa)=$self->_get_alignment($translated,$swissseq); # alignment after cleavage of possible initial met
668              
669 0 0 0     0 if ((index($liveseq_aa,"-") != -1)||(index($swiss_aa,"-") != -1)) { # there are gaps, how to proceed?
670 0         0 print "LIVE-SEQ=\'$liveseq_aa\'\nIDENTITY=\'$codes_aa\'\nSWS-PROT=\'$swiss_aa\'\n";
671 0         0 carp "Nucleotides translation and SwissProt translation are different in size, cannot attach the SwissSequence to the EMBL one, cannot add any AminoAcidRange object/Domain information!";
672 0         0 return;
673             }
674              
675             #my $i=0; # debug
676 0         0 @features=@newfeatures;
677 0         0 foreach $feature (@features) {
678             #print "Processing SwissProtFeature: $i\n"; # debug
679 0         0 ($start,$end)=split(/ /,$feature->{'location'});
680             # Note: cleavedmet is taken in account for updating numbering
681 0         0 $aarangeobj=Bio::LiveSeq::AARange->new(-start => $start+$cleavedmet, -end => $end+$cleavedmet, -name => $feature->{'name'}, -description => $feature->{'description'}, -translation => $translation, -translength => $translength);
682 0 0       0 if ($aarangeobj != -1) {
683 0         0 push(@aarangeobjects,$aarangeobj);
684             }
685             # $i++; # debug
686             }
687 0         0 $translation->{'aa_ranges'}=\@aarangeobjects;
688             }
689              
690             # if there is no SRS support, the default will be to return 0
691             # i.e. this function is overridden in SRS package
692             sub get_swisshash {
693 6     6 0 13 return (0);
694             }
695              
696             # Args: $entry hashref, gene_name OR cds_position (undef is used to
697             # choose between the two), getswissprotinfo boolean flag
698             # Returns: an hash holding various arrayref used in the hash2gene method
699             # Function: examines the nucleotide entry, identifying features belonging
700             # to the gene (defined either by its name or by the position of its CDS in
701             # the entry)
702              
703             sub _findgenefeatures {
704 6     6   15 my ($self,$entry,$gene_name,$cds_position,$getswissprotinfo)=@_;
705              
706 6         7 my @entryfeatures=@{$entry->{'Features'}};
  6         23  
707 6         98 my @exons; my @introns; my @prim_transcripts; my @transcripts;
  6         0  
  6         0  
708 6         0 my @repeat_units; my @repeat_regions;
  6         0  
709 6         0 my @repeat_units_family; my @repeat_regions_family; my $rpt_family;
  6         0  
  6         0  
710 6         0 my $entryfeature; my @genefeatures;
  6         0  
711 6         0 my $desc; my @exondescs; my @introndescs;
  6         0  
  6         0  
712              
713             # for swissprot xreference
714 6         0 my ($swissacc,$swisshash); my @swisshashes;
  6         0  
715              
716             # for translation_tables
717 6         0 my @ttables;
718              
719             # to create labels
720 6         0 my ($name,$exon);
721 6         0 my @range; my @cdsexons; my @labels;
  6         0  
  6         0  
722              
723             # maybe here also could be added special case when there is no CDS feature
724             # in the entry (e.g. tRNA entry -> TOCHECK).
725             # let's deal with the special case in which there is just one gene per entry
726             # usually without /gene qualifier
727 6         10 my @cds=@{$entry->{'CDS'}};
  6         13  
728              
729 6         10 my $skipgenematch=0;
730 6 50       21 if (scalar(@cds) == 1) {
731             #carp "Note: only one CDS in this entry. Treating all features found in entry as Gene features.";
732 6         7 $skipgenematch=1;
733             }
734              
735 6         12 my ($cds_begin,$cds_end,$proximity);
736 6 50       21 if ($cds_position) { # if a position has been requested
737 0         0 my @cds_exons=@{$cds[$cds_position-1]->{'range'}};
  0         0  
738 0         0 ($cds_begin,$cds_end)=($cds_exons[0]->[0],$cds_exons[-1]->[1]); # begin and end of CDS
739 0         0 $gene_name=$cds[$cds_position-1]->{'qualifiers'}->{'gene'};
740             # DEBUG
741 0 0       0 unless ($skipgenematch) {
742 0         0 carp "--DEBUG-- cdsbegin $cds_begin cdsend $cds_end--------";
743             }
744 0         0 $proximity=100; # proximity CONSTANT to decide whether a feature "belongs" to the CDS
745             }
746              
747 6         13 for $entryfeature (@entryfeatures) { # get only features for the desired gene
748 30 0 0     45 if (($skipgenematch)||(($cds_position)&&($self->_checkfeatureproximity($entryfeature->{'range'},$cds_begin,$cds_end,$proximity)))||(!($cds_position)&&($entryfeature->{'qualifiers'}->{'gene'} eq "$gene_name"))) {
      33        
      0        
      0        
749 30         40 push(@genefeatures,$entryfeature);
750              
751 30         36 my @range=@{$entryfeature->{'range'}};
  30         63  
752 30         49 $name=$entryfeature->{'name'};
753 30         29 my %qualifierhash=%{$entryfeature->{'qualifiers'}};
  30         82  
754 30 100       60 if ($name eq "CDS") { # that has range containing array of exons
755              
756             # swissprot crossindexing (if without SRS support it will fill array
757             # with zeros and do nothing
758 6 50       19 if ($getswissprotinfo) {
759 6         13 $swissacc=$entryfeature->{'qualifiers'}->{'db_xref'};
760 6         26 $swisshash=$self->get_swisshash($swissacc);
761             #$self->printswissprot($swisshash); # DEBUG
762 6         17 push (@swisshashes,$swisshash);
763             }
764              
765 6         15 push (@ttables,$entryfeature->{'qualifiers'}->{'transl_table'}); # undef if not specified
766            
767             # create labels array
768 6         12 for $exon (@range) {
769 34         77 push(@labels,$exon->[0],$exon->[1]); # start and end of every exon of the CDS
770             }
771 6         18 push (@transcripts,$entryfeature->{'range'});
772             } else {
773             # "simplifying" the joinedlocation features. I.e. changing them from
774             # multijoined ones to simple plain start-end features, taking only
775             # the start of the first "exon" and the end of the last "exon" as
776             # start and end of the entire feature
777 24 50 33     43 if ($entryfeature->{'locationtype'} && $entryfeature->{'locationtype'} eq "joined") { # joined location
778 0         0 @range=($range[0]->[0],$range[-1]->[1]);
779             }
780 24         37 push(@labels,$range[0],$range[1]); # start and end of every feature
781 24 100       51 if ($name eq "exon") {
782 9         10 $desc=$entryfeature->{'qualifiers'}->{'number'};
783 9 100       15 if ($entryfeature->{'qualifiers'}->{'note'}) {
784 3 50       4 if ($desc) {
785 0         0 $desc .= "|" . $entryfeature->{'qualifiers'}->{'note'};
786             } else {
787 3         5 $desc = $entryfeature->{'qualifiers'}->{'note'};
788             }
789             }
790 9   50     12 push (@exondescs,$desc || "unknown");
791 9         10 push(@exons,\@range);
792             }
793 24 100       45 if ($name eq "intron") {
794 8         9 $desc=$entryfeature->{'qualifiers'}->{'number'};
795 8 50       9 if ($desc) {
796 0         0 $desc .= "|" . $entryfeature->{'qualifiers'}->{'note'};
797             } else {
798 8         8 $desc = $entryfeature->{'qualifiers'}->{'note'};
799             }
800 8   50     12 push (@introndescs,$desc || "unknown");
801 8         9 push(@introns,\@range);
802             }
803 24 100 100     66 if (($name eq "prim_transcript")||($name eq "mRNA")) { push(@prim_transcripts,\@range); }
  7         18  
804 24 50       36 if ($name eq "repeat_unit") { push(@repeat_units,\@range);
  0         0  
805 0         0 $rpt_family=$entryfeature->{'qualifiers'}->{'rpt_family'};
806 0   0     0 push (@repeat_units_family,$rpt_family || "unknown");
807             }
808 24 50       55 if ($name eq "repeat_region") { push(@repeat_regions,\@range);
  0         0  
809 0         0 $rpt_family=$entryfeature->{'qualifiers'}->{'rpt_family'};
810 0   0     0 push (@repeat_regions_family,$rpt_family || "unknown");
811             }
812             }
813             }
814             }
815 6 50       19 unless ($gene_name) { $gene_name="cds-position:".$cds_position; }
  0         0  
816 6         8 my %genefeatureshash;
817 6         16 $genefeatureshash{gene_name}=$gene_name;
818 6         14 $genefeatureshash{genefeatures}=\@genefeatures;
819 6         14 $genefeatureshash{labels}=\@labels;
820 6         14 $genefeatureshash{ttables}=\@ttables;
821 6         14 $genefeatureshash{swisshashes}=\@swisshashes;
822 6         16 $genefeatureshash{transcripts}=\@transcripts;
823 6         15 $genefeatureshash{exons}=\@exons;
824 6         18 $genefeatureshash{exondescs}=\@exondescs;
825 6         18 $genefeatureshash{introns}=\@introns;
826 6         12 $genefeatureshash{introndescs}=\@introndescs;
827 6         14 $genefeatureshash{prim_transcripts}=\@prim_transcripts;
828 6         12 $genefeatureshash{repeat_units}=\@repeat_units;
829 6         13 $genefeatureshash{repeat_regions}=\@repeat_regions;
830 6         11 $genefeatureshash{repeat_units_family}=\@repeat_units_family;
831 6         17 $genefeatureshash{repeat_regions_family}=\@repeat_regions_family;
832 6         26 return (\%genefeatureshash);
833             }
834              
835              
836             # used by _findgenefeatures, when a CDS at a certain position is requested,
837             # to retrieve only features quite close to the wanted CDS.
838             # Args: range hashref, begin and end positions of the CDS, $proximity
839             # $proximity holds the maximum distance between the extremes of the CDS
840             # and of the feature under exam.
841             # Returns: boolean
842             sub _checkfeatureproximity {
843 0     0     my ($self,$range,$cds_begin,$cds_end,$proximity)=@_;
844 0           my @range=@{$range};
  0            
845 0           my ($begin,$end,$strand);
846 0 0         if (ref($range[0]) eq "ARRAY") { # like in CDS, whose range equivals to exons
847 0           ($begin,$end,$strand)=($range[0]->[0],$range[-1]->[1],$range[0]->[2]);
848             } else {
849 0           ($begin,$end,$strand)=@range;
850             }
851 0 0         if ($cds_begin > $cds_end) { # i.e. reverse strand CDS
852 0           ($cds_begin,$cds_end)=($cds_end,$cds_begin); # swap boundaries
853             }
854 0 0         if ($strand == -1) { # reverse strand
855 0           ($begin,$end)=($end,$begin); # swap boundaries
856             }
857 0 0         if (($cds_begin-$end)>$proximity) {
858 0           carp "--DEBUG-- feature rejected: begin $begin end $end -------";
859 0           return (0);
860             }
861 0 0         if (($begin-$cds_end)>$proximity) {
862 0           carp "--DEBUG-- feature rejected: begin $begin end $end -------";
863 0           return (0);
864             }
865 0           carp "--DEBUG-- feature accepted: begin $begin end $end -------";
866 0           return (1); # otherwise ok, feature considered next to CDS
867             }
868              
869              
870             # function that calls the external program "align" (on the fasta2 package)
871             # to create an alignment between two sequences, returning the aligned
872             # strings and the codes for the identity (:: ::::)
873              
874             sub _get_alignment {
875 0     0     my ($self,$seq1,$seq2)=@_;
876              
877 0 0         my $null = ($^O =~ m/mswin/i) ? 'NUL' : '/dev/null';
878 0           my $fastafile1="/tmp/tmpfastafile1";
879 0           my $fastafile2="/tmp/tmpfastafile2";
880 0           my $grepcut='egrep -v "[[:digit:]]|^ *$|sequences" | cut -c8-'; # grep/cut
881 0           my $alignprogram="/usr/local/etc/bioinfo/fasta2/align -s /usr/local/etc/bioinfo/fasta2/idnaa.mat $fastafile1 $fastafile2 2>$null | $grepcut"; # ALIGN
882 0 0         open my $TMPFASTAFILE1, '>', $fastafile1 or croak "Could not write file '$fastafile1' for aa alignment: $!";
883 0 0         open my $TMPFASTAFILE2, '>', $fastafile2 or croak "Could not write file '$fastafile2' for aa alignment: $!";
884 0           print $TMPFASTAFILE1 ">firstseq\n$seq1\n";
885 0           print $TMPFASTAFILE2 ">secondseq\n$seq2\n";
886 0           close $TMPFASTAFILE1;
887 0           close $TMPFASTAFILE2;
888 0           my $alignment=`$alignprogram`;
889 0           my @alignlines=split(/\n/,$alignment);
890 0           my ($linecount,$seq1_aligned,$seq2_aligned,$codes);
891 0           for ($linecount=0; $linecount < @alignlines; $linecount+=3) {
892 0           $seq1_aligned .= $alignlines[$linecount];
893 0           $codes .= $alignlines[$linecount+1];
894 0           $seq2_aligned .= $alignlines[$linecount+2];
895             }
896 0           return ($seq1_aligned,$seq2_aligned,$codes);
897             }
898              
899             # common part of the function to create a novel liveseq gene structure
900             # from an amino acid sequence, using codon usage frequencies
901             # args: codon_usage_array transltableid aasequence gene_name
902             sub _common_novelaasequence2gene {
903 0     0     my ($species_codon_usage,$ttabid,$aasequence,$gene_name)=@_;
904 0           my @species_codon_usage=@{$species_codon_usage};
  0            
905 0           my @codon_usage_label=
906             qw (cga cgc cgg cgt aga agg cta ctc ctg ctt tta ttg tca tcc tcg
907             tct agc agt aca acc acg act cca ccc ccg cct gca gcc gcg gct gga
908             ggc ggg ggt gta gtc gtg gtt aaa aag aac aat caa cag cac cat gaa
909             gag gac gat tac tat tgc tgt ttc ttt ata atc att atg tgg taa tag
910             tga);
911 0           my ($i,$j);
912 0           my %codon_usage_value;
913 0           my $aa_codon_total;
914 0           for ($i=0;$i<64;$i++) {
915 0           $codon_usage_value{$codon_usage_label[$i]}=$species_codon_usage[$i];
916             }
917              
918 0           my $CodonTable = Bio::Tools::CodonTable->new ( -id => $ttabid );
919 0           my @aminoacids = split(//,uc($aasequence));
920 0           my @alt_codons; my ($relativeusage,$dnasequence,$chosen_codon,$dice,$partial,$thiscodon);
  0            
921 0           for $i (@aminoacids) {
922 0           @alt_codons = $CodonTable->revtranslate($i);
923 0 0         unless (@alt_codons) {
924 0           carp "No reverse translation possible for aminoacid \'$i\'";
925 0           $dnasequence .= "???";
926             } else {
927 0           $aa_codon_total=0;
928 0           for $j (@alt_codons) {
929 0           $aa_codon_total+=$codon_usage_value{$j};
930             }
931             # print "aminoacid $i, codonchoice: "; # verbose
932             #$partial=0;
933             #for $j (@alt_codons) {
934             #printf "%s %.2f ",$j,$partial+$codon_usage_value{$j}/$aa_codon_total;
935             #$partial+=($codon_usage_value{$j}/$aa_codon_total);
936             #}
937             #print "\n";
938 0           $dice=rand(1);
939             #print "roulette: $dice\n"; # verbose
940 0           $partial=0;
941 0           $chosen_codon="";
942             CODONCHOICE:
943 0           for $j (0..@alt_codons) { # last one not accounted
944 0           $thiscodon=$alt_codons[$j];
945 0           $relativeusage=($codon_usage_value{$thiscodon}/$aa_codon_total);
946 0 0         if ($dice < $relativeusage+$partial) {
947 0           $chosen_codon=$thiscodon;
948 0           last CODONCHOICE;
949             } else {
950 0           $partial += $relativeusage;
951             }
952             }
953 0 0         unless ($chosen_codon) {
954 0           $chosen_codon = $alt_codons[-1]; # the last one
955             }
956             # print ".....adding $chosen_codon\n"; # verbose
957 0           $dnasequence .= $chosen_codon;
958             }
959             }
960              
961 0           my $dna = Bio::LiveSeq::DNA->new(-seq => $dnasequence);
962 0           my $min=1;
963 0           my $max=length($dnasequence);
964 0           my $exon = Bio::LiveSeq::Exon->new(-seq => $dna, -start => $min, -end => $max, -strand => 1);
965 0           my @exons=($exon);
966 0           my $transcript = Bio::LiveSeq::Transcript->new(-exons => \@exons);
967 0           $transcript->translation_table($ttabid);
968 0           my @transcripts=($transcript);
969 0           my $translation = Bio::LiveSeq::Translation->new(-transcript => $transcript);
970 0           my @translations=($translation);
971 0           my %features=(DNA => $dna, Transcripts => \@transcripts, Translations => \@translations);
972 0           my $gene = Bio::LiveSeq::Gene->new(-name => $gene_name, -features => \%features, -upbound => $min, -downbound => $max);
973              
974             # creation of gene
975 0 0         unless ($gene) { # if $gene == 0 it means problems in hash2gene
976 0           carp "Error in Gene creation phase";
977 0           return (0);
978             }
979 0           return $gene;
980             }
981              
982             1;