File Coverage

Bio/LiveSeq/IO/Loader.pm

Criterion	Covered	Total	%
statement	268	592	45.2
branch	50	156	32.0
condition	10	49	20.4
subroutine	19	29	65.5
pod	5	12	41.6
total	352	838	42.0

line	stmt	bran	cond	sub	pod	time	code
1							#
2							# bioperl module for Bio::LiveSeq::IO::Loader
3							#
4							# Please direct questions and support issues to
5							#
6							# Cared for by Joseph Insana
7							#
8							# Copyright Joseph Insana
9							#
10							# You may distribute this module under the same terms as perl itself
11							#
12							# POD documentation - main docs before the code
13
14							=head1 NAME
15
16							Bio::LiveSeq::IO::Loader - Parent Loader for LiveSeq
17
18							=head1 SYNOPSIS
19
20							#documentation needed
21
22							=head1 DESCRIPTION
23
24							This package holds common methods used by BioPerl and file loaders.
25							It contains methods to create LiveSeq objects out of entire entries or from a
26							localized sequence region surrounding a particular gene.
27
28							=head1 AUTHOR - Joseph A.L. Insana
29
30							Email: Insana@ebi.ac.uk, jinsana@gmx.net
31
32							=head1 APPENDIX
33
34							The rest of the documentation details each of the object
35							methods. Internal methods are usually preceded with a _
36
37							=cut
38
39							# Let the code begin...
40
41							package Bio::LiveSeq::IO::Loader;
42
43	2			2		13	use strict;
	2					2
	2					54
44	2			2		9	use Carp qw(cluck croak carp);
	2					4
	2					86
45	2			2		405	use Bio::LiveSeq::DNA;
	2					5
	2					55
46	2			2		503	use Bio::LiveSeq::Exon;
	2					4
	2					53
47	2			2		526	use Bio::LiveSeq::Transcript ;
	2					4
	2					57
48	2			2		474	use Bio::LiveSeq::Translation;
	2					4
	2					47
49	2			2		483	use Bio::LiveSeq::Gene;
	2					4
	2					51
50	2			2		747	use Bio::LiveSeq::Intron;
	2					3
	2					52
51	2			2		10	use Bio::LiveSeq::Prim_Transcript;
	2					4
	2					31
52	2			2		402	use Bio::LiveSeq::Repeat_Region;
	2					4
	2					50
53	2			2		389	use Bio::LiveSeq::Repeat_Unit;
	2					5
	2					50
54	2			2		500	use Bio::LiveSeq::AARange;
	2					3
	2					70
55	2			2		12	use Bio::Tools::CodonTable;
	2					3
	2					7659
56
57							=head2 entry2liveseq
58
59							Title : entry2liveseq
60							Usage : @translationobjects=$loader->entry2liveseq();
61							: @translationobjects=$loader->entry2liveseq(-getswissprotinfo => 0);
62							Function: creates LiveSeq objects from an entry previously loaded
63							Returns : array of references to objects of class Translation
64							Errorcode 0
65							Args : optional boolean flag to avoid the retrieval of SwissProt
66							information for all Transcripts containing SwissProt x-reference
67							default is 1 (to retrieve those information and create AARange
68							LiveSeq objects)
69							Note : this method can get really slow for big entries. The lightweight
70							gene2liveseq method is recommended
71
72							=cut
73
74							sub entry2liveseq {
75	0			0	1	0	my ($self, %args) = @_;
76	0					0	my ($getswissprotinfo)=($args{-getswissprotinfo});
77	0	0				0	if (defined($getswissprotinfo)) {
78	0	0	0			0	if (($getswissprotinfo ne 0)&&($getswissprotinfo ne 1)) {
79	0					0	carp "-getswissprotinfo argument can take only boolean (1 or 0) values. Setting it to 0, i.e. not trying to retrieve SwissProt information....";
80	0					0	$getswissprotinfo=0;
81							}
82							} else {
83	0					0	$getswissprotinfo=1;
84							}
85	0					0	my $hashref=$self->{'hash'};
86	0	0				0	unless ($hashref) { return (0); }
	0					0
87	0					0	my @translationobjects=$self->hash2liveseq($hashref,$getswissprotinfo);
88	0					0	my $test_transl=0;
89	0	0				0	if ($test_transl) { $self->test_transl($hashref,\@translationobjects);}
	0					0
90	0					0	return @translationobjects;
91							}
92
93							=head2 novelaasequence2gene
94
95							Title : novelaasequence2gene
96							Usage : $gene=$loader->novelaasequence2gene(-aasequence => "MGLAAPTRS*");
97							: $gene=$loader->novelaasequence2gene(-aasequence => "MGLAAPTRS*");
98							-taxon => 9606,
99							-gene_name => "tyr-kinase");
100
101							Function: creates LiveSeq objects from a novel amino acid sequence,
102							using codon usage database to choose codons according to
103							relative frequencies.
104							If a taxon ID is not specified, the default is to use the human
105							one (taxonomy ID 9606).
106							Returns : reference to a Gene object containing references to LiveSeq objects
107							Errorcode 0
108							Args : string containing an amino acid sequence
109							integer (optional) with a taxonomy ID
110							string specifying a gene name
111
112							=cut
113
114							=head2 gene2liveseq
115
116							Title : gene2liveseq
117							Usage : $gene=$loader->gene2liveseq(-gene_name => "gene name");
118							: $gene=$loader->gene2liveseq(-gene_name => "gene name",
119							-flanking => 64);
120							: $gene=$loader->gene2liveseq(-gene_name => "gene name",
121							-getswissprotinfo => 0);
122							: $gene=$loader->gene2liveseq(-position => 4);
123
124							Function: creates LiveSeq objects from an entry previously loaded
125							It is a "light weight" creation because it creates
126							a LiveSequence just for the interesting region in an entry
127							(instead than for the total entry, like in entry2liveseq) and for
128							the flanking regions up to 500 nucleotides (default) or up to
129							the specified amount of nucleotides (given as argument) around the
130							Gene.
131							Returns : reference to a Gene object containing possibly alternative
132							Transcripts.
133							Errorcode 0
134							Args : string containing the gene name as in the EMBL feature qualifier
135							integer (optional) "flanking": amount of flanking bases to be kept
136							boolean (optional) "getswissprotinfo": if set to "0" it will avoid
137							trying to fetch information from a crossreference to a SwissProt
138							entry, avoiding the process of creation of AARange objects
139							It is "1" (on) by default
140
141							Alternative to a gene_name, a position can be given: an
142							integer (1-) containing the position of the desired CDS in the
143							loaded entry
144
145							=cut
146
147							sub gene2liveseq {
148	6			6	1	482	my ($self, %args) = @_;
149	6					23	my ($gene_name,$flanking,$getswissprotinfo,$cds_position)=($args{-gene_name},$args{-flanking},$args{-getswissprotinfo},$args{-position});
150	6					7	my $input;
151	6	50	33			20	unless (($gene_name)\|\|($cds_position)) {
152	0					0	carp "Gene_Name or Position not specified for gene2liveseq loading function";
153	0					0	return (0);
154							}
155	6	50	33			30	if (($gene_name)&&($cds_position)) {
		50
156	0					0	carp "Gene_Name and Position cannot be given together";
157	0					0	return (0);
158							} elsif ($gene_name) {
159	6					10	$input=$gene_name;
160							} else {
161	0					0	$input="cds-position:".$cds_position;
162							}
163
164	6	50				15	if (defined($getswissprotinfo)) {
165	0	0	0			0	if (($getswissprotinfo ne 0)&&($getswissprotinfo ne 1)) {
166	0					0	carp "-getswissprotinfo argument can take only boolean (1 or 0) values. Setting it to 0, i.e. not trying to retrieve SwissProt information....";
167	0					0	$getswissprotinfo=0;
168							}
169							} else {
170	6					11	$getswissprotinfo=1;
171							}
172
173	6	50				17	if (defined($flanking)) {
174	0	0				0	unless ($flanking >= 0) {
175	0					0	carp "No sense in specifying a number < 0 for flanking regions to be created for gene2liveseq loading function";
176	0					0	return (0);
177							}
178							} else {
179	6					9	$flanking=500; # the default flanking length
180							}
181	6					14	my $hashref=$self->{'hash'};
182	6	50				15	unless ($hashref) { return (0); }
	0					0
183	6					23	my $gene=$self->hash2gene($hashref,$input,$flanking,$getswissprotinfo);
184	6	50				50	unless ($gene) { # if $gene == 0 it means problems in hash2gene
185	0					0	carp "gene2liveseq produced error";
186	0					0	return (0);
187							}
188	6					41	return $gene;
189							}
190
191							# TODO: update so that it will work even if CDS is not only accepted FEATURE!!
192							# this method is for now deprecated and not supported
193							sub test_transl {
194	0			0	0	0	my ($self,$entry)=@_;
195	0					0	my @features=@{$entry->{'Features'}};
	0					0
196	0					0	my @translationobjects=@{$_[1]};
	0					0
197	0					0	my ($i,$translation);
198	0					0	my ($obj_transl,$hash_transl);
199	0					0	my @cds=@{$entry->{'CDS'}};
	0					0
200	0					0	foreach $translation (@translationobjects) {
201	0					0	$obj_transl=$translation->seq;
202	0					0	$hash_transl=$cds[$i]->{'qualifiers'}->{'translation'};
203							#before seq was changed in Translation 1.4# chop $obj_transl; # to remove trailing "*"
204	0	0				0	unless ($obj_transl eq $hash_transl) {
205	0					0	cluck "Serious error: Translation from the Entry does not match Translation from object's seq for CDS at position $i";
206	0					0	carp "\nEntry's transl: ",$hash_transl,"\n";
207	0					0	carp "\nObject's transl: ",$obj_transl,"\n";
208	0					0	exit;
209							}
210	0					0	$i++;
211							}
212							}
213
214							# argument: hashref containing the EMBL entry datas,
215							# getswissprotinfo boolean flag
216							# creates the liveseq objects
217							# returns: an array of Translation object references
218							sub hash2liveseq {
219	0			0	0	0	my ($self,$entry,$getswissprotinfo)=@_;
220	0					0	my $i;
221							my @transcripts;
222	0					0	my $dna=Bio::LiveSeq::DNA->new(-seq => $entry->{'Sequence'} );
223	0					0	$dna->alphabet(lc($entry->{'Molecule'}));
224	0					0	$dna->display_id($entry->{'ID'});
225	0					0	$dna->accession_number($entry->{'AccNumber'});
226	0					0	$dna->desc($entry->{'Description'});
227	0					0	my @cds=@{$entry->{'CDS'}};
	0					0
228	0					0	my ($swissacc,$swisshash); my @swisshashes;
	0					0
229	0					0	for $i (0..$#cds) {
230							#my @transcript=@{$cds[$i]->{'range'}};
231							#$transcript=\@transcript;
232							#push (@transcripts,$transcript);
233	0					0	push (@transcripts,$cds[$i]->{'range'});
234	0	0				0	if ($getswissprotinfo) {
235	0					0	$swissacc=$cds[$i]->{'qualifiers'}->{'db_xref'};
236	0					0	$swisshash=$self->get_swisshash($swissacc);
237							#$self->printswissprot($swisshash); # DEBUG
238	0					0	push (@swisshashes,$swisshash);
239							}
240							}
241	0					0	my @translations=($self->transexonscreation($dna,\@transcripts));
242	0					0	my $translation; my $j=0;
	0					0
243	0					0	foreach $translation (@translations) {
244	0	0				0	if ($swisshashes[$j]) { # if not 0
245	0					0	$self->swisshash2liveseq($swisshashes[$j],$translation);
246							}
247	0					0	$j++;
248							}
249	0					0	return (@translations);
250							}
251
252							# only features pertaining to a specified gene are created
253							# only the sequence of the gene and appropriate context flanking regions
254							# are created as chain
255							# arguments: hashref, gene_name (OR: cds_position), length_of_flanking_sequences, getswissprotinfo boolean flag
256							# returns: reference to Gene object
257							#
258							# Note: if entry contains just one CDS, all the features get added
259							# this is useful because often the features in these entries do not
260							# carry the /gene qualifier
261							#
262							# errorcode: 0
263							sub hash2gene {
264	6			6	0	16	my ($self,$entry,$input,$flanking,$getswissprotinfo)=@_;
265	6					10	my $entryfeature;
266							my $genefeatureshash;
267
268	6					9	my @cds=@{$entry->{'CDS'}};
	6					17
269
270							# checking if a position has been given instead than a gene_name
271	6	50				22	if (index($input,"cds-position:") == 0 ) {
272	0					0	my $cds_position=substr($input,13); # extracting the cds position
273	0	0	0			0	if (($cds_position >= 1)&&($cds_position <= scalar(@cds))) {
274	0					0	$genefeatureshash=$self->_findgenefeatures($entry,undef,$cds_position,$getswissprotinfo);
275							}
276							} else {
277	6					28	$genefeatureshash=$self->_findgenefeatures($entry,$input,undef,$getswissprotinfo);
278							}
279
280	6	50	50			23	unless (($genefeatureshash)&&(scalar(@{$genefeatureshash->{'genefeatures'}}))) { # array empty, no gene features found
	6					21
281	0					0	my @genes=$self->genes($entry);
282	0					0	my $cds_number=scalar(@cds);
283	0					0	warn "Warning! Not even one genefeature found for /$input/....
284							The genes present in this entry are:\n\t@genes\n
285							The number of CDS in this entry is:\n\t$cds_number\n";
286	0					0	return(0);
287							}
288
289							# get max and min, check flankings
290	6					11	my ($min,$max)=$self->rangeofarray(@{$genefeatureshash->{'labels'}}); # gene "boundaries"
	6					26
291	6					12	my $seqlength=$entry->{'SeqLength'};
292	6					10	my ($mindna,$maxdna); # some flanking region next to the gene "boundaries"
293	6	50				17	if ($min-$flanking < 1) {
294	6					9	$mindna=1;
295							} else {
296	0					0	$mindna=$min-$flanking;
297							}
298	6	50				15	if ($max+$flanking > $seqlength) {
299	6					8	$maxdna=$seqlength;
300							} else {
301	0					0	$maxdna=$max+$flanking;
302							}
303	6					89	my $subseq=substr($entry->{'Sequence'},$mindna-1,$maxdna-$mindna+1);
304
305							# create LiveSeq objects
306
307							# create DNA
308	6					50	my $dna=Bio::LiveSeq::DNA->new(-seq => $subseq, -offset => $mindna);
309	6					72	$dna->alphabet(lc($entry->{'Molecule'}));
310	6					28	$dna->source($entry->{'Organism'});
311	6					34	$dna->display_id($entry->{'ID'});
312	6					28	$dna->accession_number($entry->{'AccNumber'});
313	6					28	$dna->desc($entry->{'Description'});
314
315	6					12	my @transcripts=@{$genefeatureshash->{'transcripts'}};
	6					20
316							# create Translations, Transcripts, Exons out of the CDS
317	6	50				14	unless (@transcripts) {
318	0					0	cluck "no CDS feature found for /$input/....";
319	0					0	return(0);
320							}
321	6					70	my @translationobjs=$self->transexonscreation($dna,\@transcripts);
322	6					13	my @transcriptobjs;
323
324							# get the Transcript obj_refs
325							my $translation;
326	6					11	my $j=0;
327	6					7	my @ttables=@{$genefeatureshash->{'ttables'}};
	6					20
328	6					7	my @swisshashes=@{$genefeatureshash->{'swisshashes'}};
	6					19
329	6					13	foreach $translation (@translationobjs) {
330	6					24	push(@transcriptobjs,$translation->get_Transcript);
331	6	50				19	if ($ttables[$j]) { # if not undef
332	0					0	$translation->get_Transcript->translation_table($ttables[$j]);
333							#} else { # DEBUG
334							# print "\n\t\tno translation table information....\n";
335							}
336	6	50				16	if ($swisshashes[$j]) { # if not 0
337	0					0	$self->swisshash2liveseq($swisshashes[$j],$translation);
338							}
339	6					9	$j++;
340							}
341
342	6					7	my %gene; # this is the hash to store created object references
343	6					18	$gene{DNA}=$dna;
344	6					14	$gene{Transcripts}=\@transcriptobjs;
345	6					13	$gene{Translations}=\@translationobjs;
346
347	6					37	my @exonobjs; my @intronobjs;
348	6					0	my @repeatunitobjs; my @repeatregionobjs;
	6					0
349	6					0	my @primtranscriptobjs;
350
351	6					0	my ($object,$range,$start,$end,$strand);
352
353	6					9	my @exons=@{$genefeatureshash->{'exons'}};
	6					17
354	6					9	my @exondescs=@{$genefeatureshash->{'exondescs'}};
	6					21
355	6	100				15	if (@exons) {
356	1					2	my $exoncount = 0;
357	1					2	foreach $range (@exons) {
358	9					9	($start,$end,$strand)=@{$range};
	9					20
359	9					29	$object = Bio::LiveSeq::Exon->new(-seq=>$dna,-start=>$start,-end=>$end,-strand=>$strand);
360	9	50				21	if ($object != -1) {
361	9	50				31	$object->desc($exondescs[$exoncount]) if defined $exondescs[$exoncount];
362	9					9	$exoncount++;
363	9					16	push (@exonobjs,$object);
364							} else {
365	0					0	$exoncount++;
366							}
367							}
368	1					3	$gene{Exons}=\@exonobjs;
369							}
370	6					7	my @introns=@{$genefeatureshash->{'introns'}};
	6					15
371	6					10	my @introndescs=@{$genefeatureshash->{'introndescs'}};
	6					16
372	6	100				18	if (@introns) {
373	1					3	my $introncount = 0;
374	1					3	foreach $range (@introns) {
375	8					12	($start,$end,$strand)=@{$range};
	8					18
376	8					36	$object=Bio::LiveSeq::Intron->new(-seq=>$dna,-start=>$start,-end=>$end,-strand=>$strand);
377	8	50				24	if ($object != -1) {
378	8					28	$object->desc($introndescs[$introncount]);
379	8					9	$introncount++;
380	8					17	push (@intronobjs,$object);
381							} else {
382	0					0	$introncount++;
383							}
384							}
385	1					3	$gene{Introns}=\@intronobjs;
386							}
387	6					9	my @prim_transcripts=@{$genefeatureshash->{'prim_transcripts'}};
	6					16
388	6	100				14	if (@prim_transcripts) {
389	5					10	foreach $range (@prim_transcripts) {
390	7					13	($start,$end,$strand)=@{$range};
	7					23
391	7					57	$object=Bio::LiveSeq::Prim_Transcript->new(-seq=>$dna,-start=>$start,-end=>$end,-strand=>$strand);
392	7	50				43	if ($object != -1) { push (@primtranscriptobjs,$object); }
	7					34
393							}
394	5					23	$gene{Prim_Transcripts}=\@primtranscriptobjs;
395							}
396	6					12	my @repeat_regions=@{$genefeatureshash->{'repeat_regions'}};
	6					20
397	6					12	my @repeat_regions_family=@{$genefeatureshash->{'repeat_regions_family'}};
	6					15
398	6	50				21	if (@repeat_regions) {
399	0					0	my $k=0;
400	0					0	foreach $range (@repeat_regions) {
401	0					0	($start,$end,$strand)=@{$range};
	0					0
402	0					0	$object=Bio::LiveSeq::Repeat_Region->new(-seq=>$dna,-start=>$start,-end=>$end,-strand=>$strand);
403	0	0				0	if ($object != -1) {
404	0					0	$object->desc($repeat_regions_family[$k]);
405	0					0	$k++;
406	0					0	push (@repeatregionobjs,$object);
407							} else {
408	0					0	$k++;
409							}
410							}
411	0					0	$gene{Repeat_Regions}=\@repeatregionobjs;
412							}
413	6					9	my @repeat_units=@{$genefeatureshash->{'repeat_units'}};
	6					15
414	6					9	my @repeat_units_family=@{$genefeatureshash->{'repeat_units_family'}};
	6					16
415	6	50				18	if (@repeat_units) {
416	0					0	my $k=0;
417	0					0	foreach $range (@repeat_units) {
418	0					0	($start,$end,$strand)=@{$range};
	0					0
419	0					0	$object=Bio::LiveSeq::Repeat_Unit->new(-seq=>$dna,-start=>$start,-end=>$end,-strand=>$strand);
420	0	0				0	if ($object != -1) {
421	0					0	$object->desc($repeat_units_family[$k]);
422	0					0	$k++;
423	0					0	push (@repeatunitobjs,$object);
424							} else {
425	0					0	$k++;
426							}
427							}
428	0					0	$gene{Repeat_Units}=\@repeatunitobjs;
429							}
430
431							# create the Gene
432	6					15	my $gene_name=$genefeatureshash->{'gene_name'}; # either a name or a cdspos
433	6					78	return (Bio::LiveSeq::Gene->new(-name=>$gene_name,-features=>\%gene,
434							-upbound=>$min,-downbound=>$max));
435							}
436
437							# maybe this function will be moved to general utility package
438							# argument: array of numbers
439							# returns: (min,max) numbers in the array
440							sub rangeofarray {
441	6			6	0	9	my $self=shift;
442	6					33	my @array=@_;
443							#print "\n-=-=-=-=-=-=-=-=-=-=array: @array\n";
444	6					10	my ($max,$min,$element);
445	6					15	$min=$max=shift(@array);
446	6					14	foreach $element (@array) {
447	110	50				127	$element = 0 unless defined $element;
448	110	50				141	if ($element < $min) {
449	0					0	$min=$element;
450							}
451	110	100				158	if ($element > $max) {
452	6					11	$max=$element;
453							}
454							}
455							#print "\n-=-=-=-=-=-=-=-=-=-=min: $min\tmax: $max\n";
456	6					23	return ($min,$max);
457							}
458
459
460							# argument: reference to DNA object, reference to array of transcripts
461							# returns: an array of Translation object references
462							sub transexonscreation {
463	6			6	0	14	my $self=shift;
464	6					11	my $dna=$_[0];
465	6					8	my @transcripts=@{$_[1]};
	6					14
466
467	6					25	my (@transexons,$start,$end,$strand,$exonref,$exonobj,$transcript,$transcriptobj);
468	6					0	my $translationobj;
469	6					0	my @translationobjects;
470	6					14	foreach $transcript (@transcripts) {
471	6					9	foreach $exonref (@{$transcript}) {
	6					12
472	34					40	($start,$end,$strand)=@{$exonref};
	34					79
473							#print "Creating Exon: start $start end $end strand $strand\n";
474	34					146	$exonobj=Bio::LiveSeq::Exon->new(-seq=>$dna,-start=>$start,-end=>$end,-strand=>$strand);
475							#push (@exonobjects,$exonobj);
476	34					78	push (@transexons,$exonobj);
477							}
478	6					63	$transcriptobj=Bio::LiveSeq::Transcript->new(-exons => \@transexons );
479	6	50				27	if ($transcriptobj != -1) {
480	6					53	$translationobj=Bio::LiveSeq::Translation->new(-transcript=>$transcriptobj);
481	6					14	@transexons=(); # cleans it
482							#push (@transcriptobjects,$transcriptobj);
483	6					17	push (@translationobjects,$translationobj);
484							}
485							}
486	6					25	return (@translationobjects);
487							}
488
489							#sub printgene {
490							# deleted. Some functionality placed in Gene->printfeaturesnum
491
492							=head2 printswissprot
493
494							Title : printswissprot
495							Usage : $loader->printswissprot($hashref);
496							Function: prints out all information loaded from a database entry into the
497							loader. Mainly used for testing purposes.
498							Args : a hashref containing the SWISSPROT entry datas
499							Note : the hashref can be obtained with a call to the method
500							$loader->get_swisshash() (BioPerl via Bio::DB::EMBL.pm)
501							that takes as argument a string like "SWISS-PROT:P10275"
502
503							=cut
504
505							# argument: hashref containing the SWISSPROT entry datas
506							# prints out that hash, showing the information loaded
507							sub printswissprot {
508	0			0	1	0	my ($self,$entry)=@_;
509	0	0				0	unless ($entry) {
510	0					0	return;
511							}
512							printf "ID: %s\n",
513	0					0	$entry->{'ID'};
514							printf "ACC: %s\n",
515	0					0	$entry->{'AccNumber'};
516							printf "GENE: %s\n",
517	0					0	$entry->{'Gene'};
518							printf "DES: %s\n",
519	0					0	$entry->{'Description'};
520							printf "ORG: %s\n",
521	0					0	$entry->{'Organism'};
522							printf "SEQLN: %s\n",
523	0					0	$entry->{'SeqLength'};
524							printf "SEQ: %s\n",
525	0					0	substr($entry->{'Sequence'},0,64);
526	0	0				0	if ($entry->{'Features'}) {
527	0					0	my @features=@{$entry->{'Features'}};
	0					0
528	0					0	my $i;
529	0					0	for $i (0..$#features) {
530	0					0	print "\|",$features[$i]->{'name'},"\|";
531	0					0	print " at ",$features[$i]->{'location'},": ";
532	0					0	print "",$features[$i]->{'desc'},"\n";
533							}
534							}
535							}
536
537							=head2 printembl
538
539							Title : printembl
540							Usage : $loader->printembl();
541							Function: prints out all information loaded from a database entry into the
542							loader. Mainly used for testing purposes.
543							Args : none
544
545							=cut
546
547							# argument: hashref containing the EMBL entry datas
548							# prints out that hash, showing the information loaded
549							sub printembl {
550	0			0	1	0	my ($self,$entry)=@_;
551	0	0				0	unless ($entry) {
552	0					0	$entry=$self->{'hash'};
553							}
554	0					0	my ($i,$featurename);
555							printf "ID: %s\n",
556	0					0	$entry->{'ID'};
557							printf "ACC: %s\n",
558	0					0	$entry->{'AccNumber'};
559							printf "MOL: %s\n",
560	0					0	$entry->{'Molecule'};
561							printf "DIV: %s\n",
562	0					0	$entry->{'Division'};
563							printf "DES: %s\n",
564	0					0	$entry->{'Description'};
565							printf "ORG: %s\n",
566	0					0	$entry->{'Organism'};
567							printf "SEQLN: %s\n",
568	0					0	$entry->{'SeqLength'};
569							printf "SEQ: %s\n",
570	0					0	substr($entry->{'Sequence'},0,64);
571	0					0	my @features=@{$entry->{'Features'}};
	0					0
572	0					0	my @cds=@{$entry->{'CDS'}};
	0					0
573	0					0	print "\nFEATURES\nNumber of CDS: ",scalar(@cds)," (out of ",$entry->{'FeaturesNumber'}, " total features)\n";
574	0					0	my ($exonref,$transcript);
575	0					0	my ($qualifiernumber,$qualifiers,$key);
576	0					0	my ($start,$end,$strand);
577	0					0	my $j=0;
578	0					0	for $i (0..$#features) {
579	0					0	$featurename=$features[$i]->{'name'};
580	0	0				0	if ($featurename eq "CDS") {
581	0					0	print "\|CDS\| number $j at feature position: $i\n";
582							#print $features[$i]->{'location'},"\n";
583	0					0	$transcript=$features[$i]->{'range'};
584	0					0	foreach $exonref (@{$transcript}) {
	0					0
585	0					0	($start,$end,$strand)=@{$exonref};
	0					0
586	0					0	print "\tExon: start $start end $end strand $strand\n";
587							}
588	0					0	$j++;
589							} else {
590	0					0	print "\|$featurename\| at feature position: $i\n";
591	0					0	print "\trange: ";
592	0					0	print join("\t",@{$features[$i]->{'range'}}),"\n";
	0					0
593							#print $features[$i]->{'location'},"\n";
594							}
595	0					0	$qualifiernumber=$features[$i]->{'qual_number'};
596	0					0	$qualifiers=$features[$i]->{'qualifiers'}; # hash
597	0					0	foreach $key (keys (%{$qualifiers})) {
	0					0
598	0					0	print "\t\t",$key,": ";
599	0					0	print $qualifiers->{$key},"\n";
600							}
601							}
602							}
603
604							=head2 genes
605
606							Title : genes
607							Usage : $loader->genes();
608							Function: Returns an array of gene_names (strings) contained in the loaded
609							entry.
610							Args : none
611
612							=cut
613
614							# argument: entryhashref
615							# returns: array of genenames found in the entry
616							sub genes {
617	0			0	1	0	my ($self,$entry)=@_;
618	0	0				0	unless ($entry) {
619	0					0	$entry=$self->{'hash'};
620							}
621	0					0	my @entryfeatures=@{$entry->{'Features'}};
	0					0
622	0					0	my ($genename,$genenames,$entryfeature);
623	0					0	for $entryfeature (@entryfeatures) {
624	0					0	$genename=$entryfeature->{'qualifiers'}->{'gene'};
625	0	0				0	if ($genename) {
626	0	0				0	if (index($genenames,$genename) == -1) { # if name is new
627	0					0	$genenames .= $genename . " "; # add the name
628							}
629							}
630							}
631	0					0	return (split(/ /,$genenames)); # assumes no space inbetween each genename
632							}
633
634							# arguments: swisshash, translation objref
635							# adds information to the Translation, creates AARange objects, sets the
636							# aa_range attribute on the Translation, pointing to those objects
637							sub swisshash2liveseq {
638	0			0	0	0	my ($self,$entry,$translation)=@_;
639	0					0	my $translength=$translation->length;
640	0					0	$translation->desc($translation->desc . $entry->{'Description'});
641	0					0	$translation->display_id("SWISSPROT:" . $entry->{'ID'});
642	0					0	$translation->accession_number("SWISSPROT:" . $entry->{'AccNumber'});
643	0					0	$translation->name($entry->{'Gene'});
644	0					0	$translation->source($entry->{'Organism'});
645	0					0	my @aarangeobjects;
646	0					0	my ($start,$end,$aarangeobj,$feature);
647	0					0	my @features; my @newfeatures;
	0					0
648	0	0				0	if ($entry->{'Features'}) {
649	0					0	@features=@{$entry->{'Features'}};
	0					0
650							}
651	0					0	my $cleavedmet=0;
652							# check for cleaved Met
653	0					0	foreach $feature (@features) {
654	0	0	0			0	if (($feature->{'name'} eq "INIT_MET")&&($feature->{'location'} eq "0 0")) {
655	0					0	$cleavedmet=1;
656	0					0	$translation->{'offset'}="1"; # from swissprot to liveseq protein sequence
657							} else {
658	0					0	push(@newfeatures,$feature);
659							}
660							}
661
662	0					0	my $swissseq=$entry->{'Sequence'};
663	0					0	my $liveseqtransl=$translation->seq;
664	0					0	chop $liveseqtransl; # to take away the trailing STOP "*"
665	0					0	my $translated=substr($liveseqtransl,$cleavedmet);
666
667	0					0	my ($liveseq_aa,$swiss_aa,$codes_aa)=$self->_get_alignment($translated,$swissseq); # alignment after cleavage of possible initial met
668
669	0	0	0			0	if ((index($liveseq_aa,"-") != -1)\|\|(index($swiss_aa,"-") != -1)) { # there are gaps, how to proceed?
670	0					0	print "LIVE-SEQ=\'$liveseq_aa\'\nIDENTITY=\'$codes_aa\'\nSWS-PROT=\'$swiss_aa\'\n";
671	0					0	carp "Nucleotides translation and SwissProt translation are different in size, cannot attach the SwissSequence to the EMBL one, cannot add any AminoAcidRange object/Domain information!";
672	0					0	return;
673							}
674
675							#my $i=0; # debug
676	0					0	@features=@newfeatures;
677	0					0	foreach $feature (@features) {
678							#print "Processing SwissProtFeature: $i\n"; # debug
679	0					0	($start,$end)=split(/ /,$feature->{'location'});
680							# Note: cleavedmet is taken in account for updating numbering
681	0					0	$aarangeobj=Bio::LiveSeq::AARange->new(-start => $start+$cleavedmet, -end => $end+$cleavedmet, -name => $feature->{'name'}, -description => $feature->{'description'}, -translation => $translation, -translength => $translength);
682	0	0				0	if ($aarangeobj != -1) {
683	0					0	push(@aarangeobjects,$aarangeobj);
684							}
685							# $i++; # debug
686							}
687	0					0	$translation->{'aa_ranges'}=\@aarangeobjects;
688							}
689
690							# if there is no SRS support, the default will be to return 0
691							# i.e. this function is overridden in SRS package
692							sub get_swisshash {
693	6			6	0	13	return (0);
694							}
695
696							# Args: $entry hashref, gene_name OR cds_position (undef is used to
697							# choose between the two), getswissprotinfo boolean flag
698							# Returns: an hash holding various arrayref used in the hash2gene method
699							# Function: examines the nucleotide entry, identifying features belonging
700							# to the gene (defined either by its name or by the position of its CDS in
701							# the entry)
702
703							sub _findgenefeatures {
704	6			6		15	my ($self,$entry,$gene_name,$cds_position,$getswissprotinfo)=@_;
705
706	6					7	my @entryfeatures=@{$entry->{'Features'}};
	6					23
707	6					98	my @exons; my @introns; my @prim_transcripts; my @transcripts;
	6					0
	6					0
708	6					0	my @repeat_units; my @repeat_regions;
	6					0
709	6					0	my @repeat_units_family; my @repeat_regions_family; my $rpt_family;
	6					0
	6					0
710	6					0	my $entryfeature; my @genefeatures;
	6					0
711	6					0	my $desc; my @exondescs; my @introndescs;
	6					0
	6					0
712
713							# for swissprot xreference
714	6					0	my ($swissacc,$swisshash); my @swisshashes;
	6					0
715
716							# for translation_tables
717	6					0	my @ttables;
718
719							# to create labels
720	6					0	my ($name,$exon);
721	6					0	my @range; my @cdsexons; my @labels;
	6					0
	6					0
722
723							# maybe here also could be added special case when there is no CDS feature
724							# in the entry (e.g. tRNA entry -> TOCHECK).
725							# let's deal with the special case in which there is just one gene per entry
726							# usually without /gene qualifier
727	6					10	my @cds=@{$entry->{'CDS'}};
	6					13
728
729	6					10	my $skipgenematch=0;
730	6	50				21	if (scalar(@cds) == 1) {
731							#carp "Note: only one CDS in this entry. Treating all features found in entry as Gene features.";
732	6					7	$skipgenematch=1;
733							}
734
735	6					12	my ($cds_begin,$cds_end,$proximity);
736	6	50				21	if ($cds_position) { # if a position has been requested
737	0					0	my @cds_exons=@{$cds[$cds_position-1]->{'range'}};
	0					0
738	0					0	($cds_begin,$cds_end)=($cds_exons[0]->[0],$cds_exons[-1]->[1]); # begin and end of CDS
739	0					0	$gene_name=$cds[$cds_position-1]->{'qualifiers'}->{'gene'};
740							# DEBUG
741	0	0				0	unless ($skipgenematch) {
742	0					0	carp "--DEBUG-- cdsbegin $cds_begin cdsend $cds_end--------";
743							}
744	0					0	$proximity=100; # proximity CONSTANT to decide whether a feature "belongs" to the CDS
745							}
746
747	6					13	for $entryfeature (@entryfeatures) { # get only features for the desired gene
748	30	0	0			45	if (($skipgenematch)\|\|(($cds_position)&&($self->_checkfeatureproximity($entryfeature->{'range'},$cds_begin,$cds_end,$proximity)))\|\|(!($cds_position)&&($entryfeature->{'qualifiers'}->{'gene'} eq "$gene_name"))) {
			33
			0
			0
749	30					40	push(@genefeatures,$entryfeature);
750
751	30					36	my @range=@{$entryfeature->{'range'}};
	30					63
752	30					49	$name=$entryfeature->{'name'};
753	30					29	my %qualifierhash=%{$entryfeature->{'qualifiers'}};
	30					82
754	30	100				60	if ($name eq "CDS") { # that has range containing array of exons
755
756							# swissprot crossindexing (if without SRS support it will fill array
757							# with zeros and do nothing
758	6	50				19	if ($getswissprotinfo) {
759	6					13	$swissacc=$entryfeature->{'qualifiers'}->{'db_xref'};
760	6					26	$swisshash=$self->get_swisshash($swissacc);
761							#$self->printswissprot($swisshash); # DEBUG
762	6					17	push (@swisshashes,$swisshash);
763							}
764
765	6					15	push (@ttables,$entryfeature->{'qualifiers'}->{'transl_table'}); # undef if not specified
766
767							# create labels array
768	6					12	for $exon (@range) {
769	34					77	push(@labels,$exon->[0],$exon->[1]); # start and end of every exon of the CDS
770							}
771	6					18	push (@transcripts,$entryfeature->{'range'});
772							} else {
773							# "simplifying" the joinedlocation features. I.e. changing them from
774							# multijoined ones to simple plain start-end features, taking only
775							# the start of the first "exon" and the end of the last "exon" as
776							# start and end of the entire feature
777	24	50	33			43	if ($entryfeature->{'locationtype'} && $entryfeature->{'locationtype'} eq "joined") { # joined location
778	0					0	@range=($range[0]->[0],$range[-1]->[1]);
779							}
780	24					37	push(@labels,$range[0],$range[1]); # start and end of every feature
781	24	100				51	if ($name eq "exon") {
782	9					10	$desc=$entryfeature->{'qualifiers'}->{'number'};
783	9	100				15	if ($entryfeature->{'qualifiers'}->{'note'}) {
784	3	50				4	if ($desc) {
785	0					0	$desc .= "\|" . $entryfeature->{'qualifiers'}->{'note'};
786							} else {
787	3					5	$desc = $entryfeature->{'qualifiers'}->{'note'};
788							}
789							}
790	9		50			12	push (@exondescs,$desc \|\| "unknown");
791	9					10	push(@exons,\@range);
792							}
793	24	100				45	if ($name eq "intron") {
794	8					9	$desc=$entryfeature->{'qualifiers'}->{'number'};
795	8	50				9	if ($desc) {
796	0					0	$desc .= "\|" . $entryfeature->{'qualifiers'}->{'note'};
797							} else {
798	8					8	$desc = $entryfeature->{'qualifiers'}->{'note'};
799							}
800	8		50			12	push (@introndescs,$desc \|\| "unknown");
801	8					9	push(@introns,\@range);
802							}
803	24	100	100			66	if (($name eq "prim_transcript")\|\|($name eq "mRNA")) { push(@prim_transcripts,\@range); }
	7					18
804	24	50				36	if ($name eq "repeat_unit") { push(@repeat_units,\@range);
	0					0
805	0					0	$rpt_family=$entryfeature->{'qualifiers'}->{'rpt_family'};
806	0		0			0	push (@repeat_units_family,$rpt_family \|\| "unknown");
807							}
808	24	50				55	if ($name eq "repeat_region") { push(@repeat_regions,\@range);
	0					0
809	0					0	$rpt_family=$entryfeature->{'qualifiers'}->{'rpt_family'};
810	0		0			0	push (@repeat_regions_family,$rpt_family \|\| "unknown");
811							}
812							}
813							}
814							}
815	6	50				19	unless ($gene_name) { $gene_name="cds-position:".$cds_position; }
	0					0
816	6					8	my %genefeatureshash;
817	6					16	$genefeatureshash{gene_name}=$gene_name;
818	6					14	$genefeatureshash{genefeatures}=\@genefeatures;
819	6					14	$genefeatureshash{labels}=\@labels;
820	6					14	$genefeatureshash{ttables}=\@ttables;
821	6					14	$genefeatureshash{swisshashes}=\@swisshashes;
822	6					16	$genefeatureshash{transcripts}=\@transcripts;
823	6					15	$genefeatureshash{exons}=\@exons;
824	6					18	$genefeatureshash{exondescs}=\@exondescs;
825	6					18	$genefeatureshash{introns}=\@introns;
826	6					12	$genefeatureshash{introndescs}=\@introndescs;
827	6					14	$genefeatureshash{prim_transcripts}=\@prim_transcripts;
828	6					12	$genefeatureshash{repeat_units}=\@repeat_units;
829	6					13	$genefeatureshash{repeat_regions}=\@repeat_regions;
830	6					11	$genefeatureshash{repeat_units_family}=\@repeat_units_family;
831	6					17	$genefeatureshash{repeat_regions_family}=\@repeat_regions_family;
832	6					26	return (\%genefeatureshash);
833							}
834
835
836							# used by _findgenefeatures, when a CDS at a certain position is requested,
837							# to retrieve only features quite close to the wanted CDS.
838							# Args: range hashref, begin and end positions of the CDS, $proximity
839							# $proximity holds the maximum distance between the extremes of the CDS
840							# and of the feature under exam.
841							# Returns: boolean
842							sub _checkfeatureproximity {
843	0			0			my ($self,$range,$cds_begin,$cds_end,$proximity)=@_;
844	0						my @range=@{$range};
	0
845	0						my ($begin,$end,$strand);
846	0	0					if (ref($range[0]) eq "ARRAY") { # like in CDS, whose range equivals to exons
847	0						($begin,$end,$strand)=($range[0]->[0],$range[-1]->[1],$range[0]->[2]);
848							} else {
849	0						($begin,$end,$strand)=@range;
850							}
851	0	0					if ($cds_begin > $cds_end) { # i.e. reverse strand CDS
852	0						($cds_begin,$cds_end)=($cds_end,$cds_begin); # swap boundaries
853							}
854	0	0					if ($strand == -1) { # reverse strand
855	0						($begin,$end)=($end,$begin); # swap boundaries
856							}
857	0	0					if (($cds_begin-$end)>$proximity) {
858	0						carp "--DEBUG-- feature rejected: begin $begin end $end -------";
859	0						return (0);
860							}
861	0	0					if (($begin-$cds_end)>$proximity) {
862	0						carp "--DEBUG-- feature rejected: begin $begin end $end -------";
863	0						return (0);
864							}
865	0						carp "--DEBUG-- feature accepted: begin $begin end $end -------";
866	0						return (1); # otherwise ok, feature considered next to CDS
867							}
868
869
870							# function that calls the external program "align" (on the fasta2 package)
871							# to create an alignment between two sequences, returning the aligned
872							# strings and the codes for the identity (:: ::::)
873
874							sub _get_alignment {
875	0			0			my ($self,$seq1,$seq2)=@_;
876
877	0	0					my $null = ($^O =~ m/mswin/i) ? 'NUL' : '/dev/null';
878	0						my $fastafile1="/tmp/tmpfastafile1";
879	0						my $fastafile2="/tmp/tmpfastafile2";
880	0						my $grepcut='egrep -v "[[:digit:]]\|^ *$\|sequences" \| cut -c8-'; # grep/cut
881	0						my $alignprogram="/usr/local/etc/bioinfo/fasta2/align -s /usr/local/etc/bioinfo/fasta2/idnaa.mat $fastafile1 $fastafile2 2>$null \| $grepcut"; # ALIGN
882	0	0					open my $TMPFASTAFILE1, '>', $fastafile1 or croak "Could not write file '$fastafile1' for aa alignment: $!";
883	0	0					open my $TMPFASTAFILE2, '>', $fastafile2 or croak "Could not write file '$fastafile2' for aa alignment: $!";
884	0						print $TMPFASTAFILE1 ">firstseq\n$seq1\n";
885	0						print $TMPFASTAFILE2 ">secondseq\n$seq2\n";
886	0						close $TMPFASTAFILE1;
887	0						close $TMPFASTAFILE2;
888	0						my $alignment=`$alignprogram`;
889	0						my @alignlines=split(/\n/,$alignment);
890	0						my ($linecount,$seq1_aligned,$seq2_aligned,$codes);
891	0						for ($linecount=0; $linecount < @alignlines; $linecount+=3) {
892	0						$seq1_aligned .= $alignlines[$linecount];
893	0						$codes .= $alignlines[$linecount+1];
894	0						$seq2_aligned .= $alignlines[$linecount+2];
895							}
896	0						return ($seq1_aligned,$seq2_aligned,$codes);
897							}
898
899							# common part of the function to create a novel liveseq gene structure
900							# from an amino acid sequence, using codon usage frequencies
901							# args: codon_usage_array transltableid aasequence gene_name
902							sub _common_novelaasequence2gene {
903	0			0			my ($species_codon_usage,$ttabid,$aasequence,$gene_name)=@_;
904	0						my @species_codon_usage=@{$species_codon_usage};
	0
905	0						my @codon_usage_label=
906							qw (cga cgc cgg cgt aga agg cta ctc ctg ctt tta ttg tca tcc tcg
907							tct agc agt aca acc acg act cca ccc ccg cct gca gcc gcg gct gga
908							ggc ggg ggt gta gtc gtg gtt aaa aag aac aat caa cag cac cat gaa
909							gag gac gat tac tat tgc tgt ttc ttt ata atc att atg tgg taa tag
910							tga);
911	0						my ($i,$j);
912	0						my %codon_usage_value;
913	0						my $aa_codon_total;
914	0						for ($i=0;$i<64;$i++) {
915	0						$codon_usage_value{$codon_usage_label[$i]}=$species_codon_usage[$i];
916							}
917
918	0						my $CodonTable = Bio::Tools::CodonTable->new ( -id => $ttabid );
919	0						my @aminoacids = split(//,uc($aasequence));
920	0						my @alt_codons; my ($relativeusage,$dnasequence,$chosen_codon,$dice,$partial,$thiscodon);
	0
921	0						for $i (@aminoacids) {
922	0						@alt_codons = $CodonTable->revtranslate($i);
923	0	0					unless (@alt_codons) {
924	0						carp "No reverse translation possible for aminoacid \'$i\'";
925	0						$dnasequence .= "???";
926							} else {
927	0						$aa_codon_total=0;
928	0						for $j (@alt_codons) {
929	0						$aa_codon_total+=$codon_usage_value{$j};
930							}
931							# print "aminoacid $i, codonchoice: "; # verbose
932							#$partial=0;
933							#for $j (@alt_codons) {
934							#printf "%s %.2f ",$j,$partial+$codon_usage_value{$j}/$aa_codon_total;
935							#$partial+=($codon_usage_value{$j}/$aa_codon_total);
936							#}
937							#print "\n";
938	0						$dice=rand(1);
939							#print "roulette: $dice\n"; # verbose
940	0						$partial=0;
941	0						$chosen_codon="";
942							CODONCHOICE:
943	0						for $j (0..@alt_codons) { # last one not accounted
944	0						$thiscodon=$alt_codons[$j];
945	0						$relativeusage=($codon_usage_value{$thiscodon}/$aa_codon_total);
946	0	0					if ($dice < $relativeusage+$partial) {
947	0						$chosen_codon=$thiscodon;
948	0						last CODONCHOICE;
949							} else {
950	0						$partial += $relativeusage;
951							}
952							}
953	0	0					unless ($chosen_codon) {
954	0						$chosen_codon = $alt_codons[-1]; # the last one
955							}
956							# print ".....adding $chosen_codon\n"; # verbose
957	0						$dnasequence .= $chosen_codon;
958							}
959							}
960
961	0						my $dna = Bio::LiveSeq::DNA->new(-seq => $dnasequence);
962	0						my $min=1;
963	0						my $max=length($dnasequence);
964	0						my $exon = Bio::LiveSeq::Exon->new(-seq => $dna, -start => $min, -end => $max, -strand => 1);
965	0						my @exons=($exon);
966	0						my $transcript = Bio::LiveSeq::Transcript->new(-exons => \@exons);
967	0						$transcript->translation_table($ttabid);
968	0						my @transcripts=($transcript);
969	0						my $translation = Bio::LiveSeq::Translation->new(-transcript => $transcript);
970	0						my @translations=($translation);
971	0						my %features=(DNA => $dna, Transcripts => \@transcripts, Translations => \@translations);
972	0						my $gene = Bio::LiveSeq::Gene->new(-name => $gene_name, -features => \%features, -upbound => $min, -downbound => $max);
973
974							# creation of gene
975	0	0					unless ($gene) { # if $gene == 0 it means problems in hash2gene
976	0						carp "Error in Gene creation phase";
977	0						return (0);
978							}
979	0						return $gene;
980							}
981
982							1;