File Coverage

Bio/SeqFeature/Tools/Unflattener.pm

Criterion	Covered	Total	%
statement	546	725	75.3
branch	222	344	64.5
condition	39	60	65.0
subroutine	33	42	78.5
pod	16	22	72.7
total	856	1193	71.7

line	stmt	bran	cond	sub	pod	time	code
1							#
2							# bioperl module for Bio::SeqFeature::Tools::Unflattener
3							#
4							# Please direct questions and support issues to
5							#
6							# Cared for by Chris Mungall
7							#
8							# Copyright Chris Mungall
9							#
10							# You may distribute this module under the same terms as perl itself
11
12							# POD documentation - main docs before the code
13
14							=head1 NAME
15
16							Bio::SeqFeature::Tools::Unflattener - turns flat list of genbank-sourced features into a nested SeqFeatureI hierarchy
17
18							=head1 SYNOPSIS
19
20							# standard / generic use - unflatten a genbank record
21							use Bio::SeqIO;
22							use Bio::SeqFeature::Tools::Unflattener;
23
24							# generate an Unflattener object
25							$unflattener = Bio::SeqFeature::Tools::Unflattener->new;
26
27							# first fetch a genbank SeqI object
28							$seqio =
29							Bio::SeqIO->new(-file=>'AE003644.gbk',
30							-format=>'GenBank');
31							my $out =
32							Bio::SeqIO->new(-format=>'asciitree');
33							while ($seq = $seqio->next_seq()) {
34
35							# get top level unflattended SeqFeatureI objects
36							$unflattener->unflatten_seq(-seq=>$seq,
37							-use_magic=>1);
38							$out->write_seq($seq);
39
40							@top_sfs = $seq->get_SeqFeatures;
41							foreach my $sf (@top_sfs) {
42							# do something with top-level features (eg genes)
43							}
44							}
45
46
47							=head1 DESCRIPTION
48
49							Most GenBank entries for annotated genomic DNA contain a B list
50							of features. These features can be parsed into an equivalent flat list
51							of L objects using the standard L
52							classes. However, it is often desirable to B this list into
53							something resembling actual B, in which genes, mRNAs and CDSs
54							are B according to the nature of the gene model.
55
56							The BioPerl object model allows us to store these kind of associations
57							between SeqFeatures in B -- any SeqFeatureI
58							object can contain nested SeqFeatureI objects. The
59							Bio::SeqFeature::Tools::Unflattener object facilitates construction of
60							these hierarchies from the underlying GenBank flat-feature-list
61							representation.
62
63							For example, if you were to look at a typical GenBank DNA entry, say,
64							B, you would see a flat list of features:
65
66							source
67
68							gene CG4491
69							mRNA CG4491-RA
70							CDS CG4491-PA
71
72							gene tRNA-Pro
73							tRNA tRNA-Pro
74
75							gene CG32954
76							mRNA CG32954-RA
77							mRNA CG32954-RC
78							mRNA CG32954-RB
79							CDS CG32954-PA
80							CDS CG32954-PB
81							CDS CG32954-PC
82
83							These features have sequence locations, but it is not immediately
84							clear how to write code such that each mRNA is linked to the
85							appropriate CDS (other than relying on IDs which is very bad)
86
87							We would like to convert the above list into the B
88							hierarchy>, shown below:
89
90							source
91							gene
92							mRNA CG4491-RA
93							CDS CG4491-PA
94							exon
95							exon
96							gene
97							tRNA tRNA-Pro
98							exon
99							gene
100							mRNA CG32954-RA
101							CDS CG32954-PA
102							exon
103							exon
104							mRNA CG32954-RC
105							CDS CG32954-PC
106							exon
107							exon
108							mRNA CG32954-RB
109							CDS CG32954-PB
110							exon
111							exon
112
113							Where each feature is nested underneath its container. Note that exons
114							have been automatically inferred (even for tRNA genes).
115
116							We do this using a call on a L
117							object
118
119							@sfs = $unflattener->unflatten_seq(-seq=>$seq);
120
121							This would return a list of the B (i.e. container)
122							SeqFeatureI objects - in this case, genes. Other top level features
123							are possible; for instance, the B feature which is always
124							present, and other features such as B or B
125							types.
126
127							The containment hierarchy can be accessed using the get_SeqFeature()
128							call on any feature object - see L.
129							The following code will traverse the containment hierarchy for a
130							feature:
131
132							sub traverse {
133							$sf = shift; # $sf isa Bio::SeqfeatureI
134
135							# ...do something with $sf!
136
137							# depth first traversal of containment tree
138							@contained_sfs = $sf->get_SeqFeatures;
139							traverse($_) foreach @contained_sfs;
140							}
141
142							Once you have built the hierarchy, you can do neat stuff like turn the
143							features into 'rich' feature objects (eg
144							L) or convert to a suitable
145							format such as GFF3 or chadoxml (after mapping to the Sequence
146							Ontology); this step is not described here.
147
148							=head1 USING MAGIC
149
150							Due to the quixotic nature of how features are stored in
151							GenBank/EMBL/DDBJ, there is no guarantee that the default behaviour of
152							this module will produce perfect results. Sometimes it is hard or
153							impossible to build a correct containment hierarchy if the information
154							provided is simply too lossy, as is often the case. If you care deeply
155							about your data, you should always manually inspect the resulting
156							containment hierarchy; you may have to customise the algorithm for
157							building the hierarchy, or even manually tweak the resulting
158							hierarchy. This is explained in more detail further on in the document.
159
160							However, if you are satisfied with the default behaviour, then you do
161							not need to read any further. Just make sure you set the parameter
162							B - this will invoke incantations which will magically
163							produce good results no matter what the idiosyncracies of the
164							particular GenBank record in question.
165
166							For example
167
168							$unflattener->unflatten_seq(-seq=>$seq,
169							-use_magic=>1);
170
171							The success of this depends on the phase of the moon at the time the
172							entry was submitted to GenBank. Note that the magical recipe is being
173							constantly improved, so the results of invoking magic may vary
174							depending on the bioperl release.
175
176							If you are skeptical of magic, or you wish to exact fine grained
177							control over how the entry is unflattened, or you simply wish to
178							understand more about how this crazy stuff works, then read on!
179
180							=head1 PROBLEMATIC DATA AND INCONSISTENCIES
181
182							Occasionally the Unflattener will have problems with certain
183							records. For example, the record may contain inconsistent data - maybe
184							there is an B entry that has no corresponding B location.
185
186							The default behaviour is to throw an exception reporting the problem,
187							if the problem is relatively serious - for example, inconsistent data.
188
189							You can exert more fine grained control over this - perhaps you want
190							the Unflattener to do the best it can, and report any problems. This
191							can be done - refer to the methods.
192
193							error_threshold()
194
195							get_problems()
196
197							report_problems()
198
199							ignore_problems()
200
201							=head1 ALGORITHM
202
203							This is the default algorithm; you should be able to override any part
204							of it to customise.
205
206							The core of the algorithm is in two parts
207
208							=over
209
210							=item Partitioning the flat feature list into groups
211
212							=item Resolving the feature containment hierarchy for each group
213
214							=back
215
216							There are other optional steps after the completion of these two
217							steps, such as B; we now describe in more detail what
218							is going on.
219
220							=head2 Partitioning into groups
221
222							First of all the flat feature list is partitioned into Bs.
223
224							The default way of doing this is to use the B attribute; if we
225							look at two features from GenBank accession AE003644.3:
226
227							gene 20111..23268
228							/gene="noc"
229							/locus_tag="CG4491"
230							/note="last curated on Thu Dec 13 16:51:32 PST 2001"
231							/map="35B2-35B2"
232							/db_xref="FLYBASE:FBgn0005771"
233							mRNA join(20111..20584,20887..23268)
234							/gene="noc"
235							/locus_tag="CG4491"
236							/product="CG4491-RA"
237							/db_xref="FLYBASE:FBgn0005771"
238
239							Both these features share the same /gene tag which is "noc", so they
240							correspond to the same gene model (the CDS feature is not shown, but
241							this also has a tag-value /gene="noc").
242
243							Not all groups need to correspond to gene models, but this is the most
244							common use case; later on we shall describe how to customise the
245							grouping.
246
247							Sometimes other tags have to be used; for instance, if you look at the
248							entire record for AE003644.3 you will see you actually need the use the
249							/locus_tag attribute. This attribute is actually B in
250							most records!
251
252							You can override this:
253
254							$collection->unflatten_seq(-seq=>$seq, -group_tag=>'locus_tag');
255
256							Alternatively, if you B<-use_magic>, the object will try and make a
257							guess as to what the correct group_tag should be.
258
259							At the end of this step, we should have a list of groups - there is no
260							structure within a group; the group just serves to partition the flat
261							features. For the example data above, we would have the following groups.
262
263							[ source ]
264							[ gene mRNA CDS ]
265							[ gene mRNA CDS ]
266							[ gene mRNA CDS ]
267							[ gene mRNA mRNA mRNA CDS CDS CDS ]
268
269							=head3 Multicopy Genes
270
271							Multicopy genes are usually rRNAs or tRNAs that are duplicated across
272							the genome. Because they are functionally equivalent, and usually have
273							the same sequence, they usually have the same group_tag (ie gene
274							symbol); they often have a /note tag giving copy number. This means
275							they will end up in the same group. This is undesirable, because they
276							are spatially disconnected.
277
278							There is another step, which involves splitting spatially disconnected
279							groups into distinct groups
280
281							this would turn this
282
283							[gene-rrn3 rRNA-rrn3 gene-rrn3 rRNA-rrn3]
284
285							into this
286
287							[gene-rrn3 rRNA-rrn3] [gene-rrn3 rRNA-rrn3]
288
289							based on the coordinates
290
291							=head3 What next?
292
293							The next step is to add some structure to each group, by making
294							B, trees that represent how the features
295							interrelate
296
297							=head2 Resolving the containment hierarchy
298
299							After the grouping is done, we end up with a list of groups which
300							probably contain features of type 'gene', 'mRNA', 'CDS' and so on.
301
302							Singleton groups (eg the 'source' feature) are ignored at this stage.
303
304							Each group is itself flat; we need to add an extra level of
305							organisation. Usually this is because different spliceforms
306							(represented by the 'mRNA' feature) can give rise to different
307							protein products (indicated by the 'CDS' feature). We want to correctly
308							associate mRNAs to CDSs.
309
310							We want to go from a group like this:
311
312							[ gene mRNA mRNA mRNA CDS CDS CDS ]
313
314							to a containment hierarchy like this:
315
316							gene
317							mRNA
318							CDS
319							mRNA
320							CDS
321							mRNA
322							CDS
323
324							In which each CDS is nested underneath the correct corresponding mRNA.
325
326							For entries that contain no alternate splicing, this is simple; we
327							know that the group
328
329							[ gene mRNA CDS ]
330
331							Must resolve to the tree
332
333							gene
334							mRNA
335							CDS
336
337							How can we do this in entries with alternate splicing? The bad
338							news is that there is no guaranteed way of doing this correctly for
339							any GenBank entry. Occasionally the submission will have been done in
340							such a way as to reconstruct the containment hierarchy. However, this
341							is not consistent across databank entries, so no generic solution can
342							be provided by this object. This module does provide the framework
343							within which you can customise a solution for the particular dataset
344							you are interested in - see later.
345
346							The good news is that there is an inference we can do that should
347							produce pretty good results the vast majority of the time. It uses
348							splice coordinate data - this is the default behaviour of this module,
349							and is described in detail below.
350
351							=head2 Using splice site coordinates to infer containment
352
353							If an mRNA is to be the container for a CDS, then the splice site
354							coordinates (or intron coordinates, depending on how you look at it)
355							of the CDS must fit inside the splice site coordinates of the mRNA.
356
357							Ambiguities can still arise, but the results produced should still be
358							reasonable and consistent at the sequence level. Look at this fake
359							example:
360
361							mRNA XXX---XX--XXXXXX--XXXX join(1..3,7..8,11..16,19..23)
362							mRNA XXX-------XXXXXX--XXXX join(1..3,11..16,19..23)
363							CDS XXXX--XX join(13..16,19..20)
364							CDS XXXX--XX join(13..16,19..20)
365
366							[obviously the positions have been scaled down]
367
368							We cannot unambiguously match mRNA with CDS based on splice sites,
369							since both CDS share the splice site locations 16^17 and
370							18^19. However, the consequences of making a wrong match are probably
371							not very severe. Any annotation data attached to the first CDS is
372							probably identical to the seconds CDS, other than identifiers.
373
374							The default behaviour of this module is to make an arbitrary call
375							where it is ambiguous (the mapping will always be bijective; i.e. one
376							mRNA -E one CDS).
377
378							[TODO: NOTE: not tested on EMBL data, which may not be bijective; ie two
379							mRNAs can share the same CDS??]
380
381							This completes the building of the containment hierarchy; other
382							optional step follow
383
384							=head1 POST-GROUPING STEPS
385
386							=head2 Inferring exons from mRNAs
387
388							This step always occurs if B<-use_magic> is invoked.
389
390							In a typical GenBank entry, the exons are B. That is they
391							can be inferred from the mRNA location.
392
393							For example:
394
395							mRNA join(20111..20584,20887..23268)
396
397							This tells us that this particular transcript has two exons. In
398							bioperl, the mRNA feature will have a 'split location'.
399
400							If we call
401
402							$unflattener->feature_from_splitloc(-seq=>$seq);
403
404							This will generate the necessary exon features, and nest them under
405							the appropriate mRNAs. Note that the mRNAs will no longer have split
406							locations - they will have simple locations spanning the extent of the
407							exons. This is intentional, to avoid redundancy.
408
409							Occasionally a GenBank entry will have both implicit exons (from the
410							mRNA location) B explicit exon features.
411
412							In this case, exons will still be transferred. Tag-value data from the
413							explicit exon will be transfered to the implicit exon. If exons are
414							shared between mRNAs these will be represented by different
415							objects. Any inconsistencies between implicit and explicit will be
416							reported.
417
418							=head3 tRNAs and other noncoding RNAs
419
420							exons will also be generated from these features
421
422							=head2 Inferring mRNAs from CDS
423
424							Some GenBank entries represent gene models using features of type
425							gene, mRNA and CDS; some entries just use gene and CDS.
426
427							If we only have gene and CDS, then the containment hierarchies will
428							look like this:
429
430							gene
431							CDS
432
433							If we want the containment hierarchies to be uniform, like this
434
435							gene
436							mRNA
437							CDS
438
439							Then we must create an mRNA feature. This will have identical
440							coordinates to the CDS. The assumption is that there is either no
441							untranslated region, or it is unknown.
442
443							To do this, we can call
444
445							$unflattener->infer_mRNA_from_CDS(-seq=>$seq);
446
447							This is taken care of automatically, if B<-use_magic> is invoked.
448
449							=head1 ADVANCED
450
451							=head2 Customising the grouping of features
452
453							The default behaviour is suited mostly to building models of protein
454							coding genes and noncoding genes from genbank genomic DNA submissions.
455
456							You can change the tag used to partition the feature by passing in a
457							different group_tag argument - see the unflatten_seq() method
458
459							Other behaviour may be desirable. For example, even though SNPs
460							(features of type 'variation' in GenBank) are not actually part of the
461							gene model, it may be desirable to group SNPs that overlap or are
462							nearby gene models.
463
464							It should certainly be possible to extend this module to do
465							this. However, I have yet to code this part!!! If anyone would find
466							this useful let me know.
467
468							In the meantime, you could write your own grouping subroutine, and
469							feed the results into unflatten_groups() [see the method documentation
470							below]
471
472							=head2 Customising the resolution of the containment hierarchy
473
474							Once the flat list of features has been partitioned into groups, the
475							method unflatten_group() is called on each group to build a tree.
476
477							The algorithm for doing this is described above; ambiguities are
478							resolved by using splice coordinates. As discussed, this can be
479							ambiguous.
480
481							Some submissions may contain information in tags/attributes that hint
482							as to the mapping that needs to be made between the features.
483
484							For example, with the Drosophila Melanogaster release 3 submission, we
485							see that CDS features in alternately spliced mRNAs have a form like
486							this:
487
488							CDS join(145588..145686,145752..146156,146227..146493)
489							/locus_tag="CG32954"
490							/note="CG32954 gene product from transcript CG32954-RA"
491							^^^^^^^^^^^^^^^^^^^^^^^^^^^
492							/codon_start=1
493							/product="CG32954-PA"
494							/protein_id="AAF53403.1"
495							/db_xref="GI:7298167"
496							/db_xref="FLYBASE:FBgn0052954"
497							/translation="MSFTLTNKNVIFVAGLGGIGLDTSKELLKRDLKNLVILDRIENP..."
498
499							Here the /note tag provides the clue we need to link CDS to mRNA
500							(highlighted with ^^^^). We just need to find the mRNA with the tag
501
502							/product="CG32954-RA"
503
504							I have no idea how consistent this practice is across submissions; it
505							is consistent for the fruitfly genome submission.
506
507							We can customise the behaviour of unflatten_group() by providing our
508							own resolver method. This obviously requires a bit of extra
509							programming, but there is no way to get around this.
510
511							Here is an example of how to pass in your own resolver; this example
512							basically checks the parent (container) /product tag to see if it
513							matches the required string in the child (contained) /note tag.
514
515							$unflattener->unflatten_seq(-seq=>$seq,
516							-group_tag=>'locus_tag',
517							-resolver_method=>sub {
518							my $self = shift;
519							my ($sf, @candidate_container_sfs) = @_;
520							if ($sf->has_tag('note')) {
521							my @notes = $sf->get_tag_values('note');
522							my @trnames = map {/from transcript\s+(.*)/;
523							$1} @notes;
524							@trnames = grep {$_} @trnames;
525							my $trname;
526							if (@trnames == 0) {
527							$self->throw("UNRESOLVABLE");
528							}
529							elsif (@trnames == 1) {
530							$trname = $trnames[0];
531							}
532							else {
533							$self->throw("AMBIGUOUS: @trnames");
534							}
535							my @container_sfs =
536							grep {
537							my ($product) =
538							$_->has_tag('product') ?
539							$_->get_tag_values('product') :
540							('');
541							$product eq $trname;
542							} @candidate_container_sfs;
543							if (@container_sfs == 0) {
544							$self->throw("UNRESOLVABLE");
545							}
546							elsif (@container_sfs == 1) {
547							# we got it!
548							return $container_sfs[0];
549							}
550							else {
551							$self->throw("AMBIGUOUS");
552							}
553							}
554							});
555
556							the resolver method is only called when there is more than one spliceform.
557
558							=head2 Parsing mRNA records
559
560							Some of the entries in sequence databanks are for mRNA sequences as
561							well as genomic DNA. We may want to build models from these too.
562
563							NOT YET DONE - IN PROGRESS!!!
564
565							Open question - what would these look like?
566
567							Ideally we would like a way of combining a mRNA record with the
568							corresponding SeFeature entry from the appropriate genomic DNA
569							record. This could be problemmatic in some cases - for example, the
570							mRNA sequences may not match 100% (due to differences in strain,
571							assembly problems, sequencing problems, etc). What then...?
572
573							=head1 SEE ALSO
574
575							Feature table description
576
577							http://www.ebi.ac.uk/embl/Documentation/FT_definitions/feature_table.html
578
579							=head1 FEEDBACK
580
581							=head2 Mailing Lists
582
583							User feedback is an integral part of the evolution of this and other
584							Bioperl modules. Send your comments and suggestions preferably to the
585							Bioperl mailing lists Your participation is much appreciated.
586
587							bioperl-l@bioperl.org - General discussion
588							http://bioperl.org/wiki/Mailing_lists - About the mailing lists
589
590							=head2 Support
591
592							Please direct usage questions or support issues to the mailing list:
593
594							I
595
596							rather than to the module maintainer directly. Many experienced and
597							reponsive experts will be able look at the problem and quickly
598							address it. Please include a thorough description of the problem
599							with code and data examples if at all possible.
600
601							=head2 Reporting Bugs
602
603							report bugs to the Bioperl bug tracking system to help us keep track
604							the bugs and their resolution. Bug reports can be submitted via the
605							web:
606
607							https://github.com/bioperl/bioperl-live/issues
608
609							=head1 AUTHOR - Chris Mungall
610
611							Email: cjm@fruitfly.org
612
613							=head1 APPENDIX
614
615							The rest of the documentation details each of the object
616							methods. Internal methods are usually preceded with a _
617
618							=cut
619
620
621							# Let the code begin...
622
623							package Bio::SeqFeature::Tools::Unflattener;
624	3			3		1512	use strict;
	3					4
	3					70
625
626							# Object preamble - inherits from Bio::Root::Root
627	3			3		9	use Bio::Location::Simple;
	3					3
	3					44
628	3			3		345	use Bio::SeqFeature::Generic;
	3					3
	3					48
629	3			3		9	use Bio::Range;
	3					2
	3					46
630
631
632	3			3		9	use base qw(Bio::Root::Root);
	3					2
	3					13295
633
634							=head2 new
635
636							Title : new
637							Usage : $unflattener = Bio::SeqFeature::Tools::Unflattener->new();
638							$unflattener->unflatten_seq(-seq=>$seq);
639							Function: constructor
640							Example :
641							Returns : a new Bio::SeqFeature::Tools::Unflattener
642							Args : see below
643
644							Arguments
645
646							-seq : A L object (optional)
647							the sequence to unflatten; this can also be passed in
648							when we call unflatten_seq()
649
650							-group_tag : a string representing the /tag used to partition flat features
651							(see discussion above)
652
653							=cut
654
655
656							sub new {
657	1			1	1	12	my($class,@args) = @_;
658	1					11	my $self = $class->SUPER::new(@args);
659
660	1					11	my($seq, $group_tag, $trust_grouptag) =
661							$self->_rearrange([qw(SEQ
662							GROUP_TAG
663							TRUST_GROUPTAG
664							)],
665							@args);
666
667	1	50				4	$seq && $self->seq($seq);
668	1	50				2	$group_tag && $self->group_tag($group_tag);
669							# $self->{'trust_grouptag'}= $trust_grouptag if($trust_grouptag); #dgg suggestion
670	1					2	return $self; # success - we hope!
671							}
672
673							sub DESTROY {
674	1			1		2	my $self = shift;
675	1	50				4	return if $self->{_reported_problems};
676	1	50				3	return if $self->{_ignore_problems};
677	1					4	my @probs = $self->get_problems;
678	1	50	50			6	if (!$self->{_problems_reported} &&
679							scalar(@probs)) {
680	0					0	$self->warn(
681							"WARNING: There are UNREPORTED PROBLEMS.\n".
682							"You may wish to use the method report_problems(), \n",
683							"or ignore_problems() on the Unflattener object\n");
684							}
685	1					8	return;
686							}
687
688							=head2 seq
689
690							Title : seq
691							Usage : $unflattener->seq($newval)
692							Function:
693							Example :
694							Returns : value of seq (a Bio::SeqI)
695							Args : on set, new value (a Bio::SeqI, optional)
696
697							The Bio::SeqI object should hold a flat list of Bio::SeqFeatureI
698							objects; this is the list that will be unflattened.
699
700							The sequence object can also be set when we call unflatten_seq()
701
702							=cut
703
704							sub seq{
705	12			12	1	17	my $self = shift;
706
707	12	100				36	return $self->{'seq'} = shift if @_;
708	11					34	return $self->{'seq'};
709							}
710
711							=head2 group_tag
712
713							Title : group_tag
714							Usage : $unflattener->group_tag($newval)
715							Function:
716							Example :
717							Returns : value of group_tag (a scalar)
718							Args : on set, new value (a scalar or undef, optional)
719
720							This is the tag that will be used to collect elements from the flat
721							feature list into groups; for instance, if we look at two typical
722							GenBank features:
723
724							gene 20111..23268
725							/gene="noc"
726							/locus_tag="CG4491"
727							/note="last curated on Thu Dec 13 16:51:32 PST 2001"
728							/map="35B2-35B2"
729							/db_xref="FLYBASE:FBgn0005771"
730							mRNA join(20111..20584,20887..23268)
731							/gene="noc"
732							/locus_tag="CG4491"
733							/product="CG4491-RA"
734							/db_xref="FLYBASE:FBgn0005771"
735
736							We can see that these comprise the same gene model because they share
737							the same /gene attribute; we want to collect these together in groups.
738
739							Setting group_tag is optional. The default is to use 'gene'. In the
740							example above, we could also use /locus_tag
741
742							=cut
743
744							sub group_tag{
745	9			9	1	15	my $self = shift;
746
747	9	50				22	return $self->{'group_tag'} = shift if @_;
748	9					35	return $self->{'group_tag'};
749							}
750
751							=head2 partonomy
752
753							Title : partonomy
754							Usage : $unflattener->partonomy({mRNA=>'gene', CDS=>'mRNA')
755							Function:
756							Example :
757							Returns : value of partonomy (a scalar)
758							Args : on set, new value (a scalar or undef, optional)
759
760							A hash representing the containment structure that the seq_feature
761							nesting should conform to; each key represents the contained (child)
762							type; each value represents the container (parent) type.
763
764							=cut
765
766							sub partonomy{
767	28863			28863	1	18821	my $self = shift;
768
769	28863	100				34429	return $self->{'partonomy'} = shift if @_;
770	28849	100				34847	if (!$self->{'partonomy'}) {
771	1					3	$self->{'partonomy'} = $self->_default_partonomy;
772							}
773	28849					23525	return $self->{'partonomy'};
774							}
775
776							sub _default_partonomy{
777							return {
778	1			1		12	mRNA => 'gene',
779							tRNA => 'gene',
780							rRNA => 'gene',
781							scRNA => 'gene',
782							snRNA => 'gene',
783							snoRNA => 'gene',
784							misc_RNA => 'gene',
785							CDS => 'mRNA',
786							exon => 'mRNA',
787							intron => 'mRNA',
788
789							pseudoexon => 'pseudogene',
790							pseudointron => 'pseudogene',
791							pseudotranscript => 'pseudogene',
792							};
793							}
794
795							=head2 structure_type
796
797							Title : structure_type
798							Usage : $unflattener->structure_type($newval)
799							Function:
800							Example :
801							Returns : value of structure_type (a scalar)
802							Args : on set, new value (an int or undef, optional)
803
804							GenBank entries conform to different flavours, or B
805							types>. Some have mRNAs, some do not.
806
807							Right now there are only two base structure types defined. If you set
808							the structure type, then appropriate unflattening action will be
809							taken. The presence or absence of explicit exons does not affect the
810							structure type.
811
812							If you invoke B<-use_magic> then this will be set automatically, based
813							on the content of the record.
814
815							=over
816
817							=item Type 0 (DEFAULT)
818
819							typically contains
820
821							source
822							gene
823							mRNA
824							CDS
825
826							with this structure type, we want the seq_features to be nested like this
827
828							gene
829							mRNA
830							CDS
831							exon
832
833							exons and introns are implicit from the mRNA 'join' location
834
835							to get exons from the mRNAs, you will need this call (see below)
836
837							$unflattener->feature_from_splitloc(-seq=>$seq);
838
839							=item Type 1
840
841							typically contains
842
843							source
844							gene
845							CDS
846							exon [optional]
847							intron [optional]
848
849							there are no mRNA features
850
851							with this structure type, we want the seq_features to be nested like this
852
853							gene
854							CDS
855							exon
856							intron
857
858							exon and intron may or may not be present; they may be implicit from
859							the CDS 'join' location
860
861							=back
862
863							=cut
864
865							sub structure_type{
866	22			22	1	30	my $self = shift;
867
868	22	100				66	return $self->{'structure_type'} = shift if @_;
869	11					22	return $self->{'structure_type'};
870							}
871
872							=head2 get_problems
873
874							Title : get_problems
875							Usage : @probs = get_problems()
876							Function: Get the list of problem(s) for this object.
877							Example :
878							Returns : An array of [severity, description] pairs
879							Args :
880
881							In the course of unflattening a record, problems may occur. Some of
882							these problems are non-fatal, and can be ignored.
883
884							Problems are represented as arrayrefs containing a pair [severity,
885							description]
886
887							severity is a number, the higher, the more severe the problem
888
889							the description is a text string
890
891							=cut
892
893							sub get_problems{
894	2			2	1	341	my $self = shift;
895
896	2	50				6	return @{$self->{'_problems'}} if exists($self->{'_problems'});
	2					6
897	0					0	return ();
898							}
899
900							=head2 clear_problems
901
902							Title : clear_problems
903							Usage :
904							Function: resets the problem list to empty
905							Example :
906							Returns :
907							Args :
908
909
910							=cut
911
912							sub clear_problems{
913	1			1	1	5	my ($self,@args) = @_;
914	1					1	$self->{'_problems'} = [];
915	1					3	return;
916							}
917
918
919							# PRIVATE
920							# see get_problems
921							sub add_problem{
922	6			6	0	7	my $self = shift;
923
924	6	100				13	$self->{'_problems'} = [] unless exists($self->{'_problems'});
925	6	50				13	if ($self->verbose > 0) {
926	0					0	warn( "PROBLEM: $_\n") foreach @_;
927							}
928	6					7	push(@{$self->{'_problems'}}, @_);
	6					13
929							}
930
931							# PRIVATE
932							# see get_problems
933							sub problem {
934	6			6	0	6	my $self = shift;
935	6					9	my ($severity, $desc, @sfs) = @_;
936	6	50				12	if (@sfs) {
937	6					8	foreach my $sf (@sfs) {
938							$desc .=
939							sprintf("\nSF [$sf]: ". $sf->location->to_FTstring . "; %s\n",
940							join('; ',
941							$sf->primary_tag,
942							map {
943	6	100				16	$sf->has_tag($_) ?
	24					36
944							$sf->get_tag_values($_) : ()
945							} qw(locus_tag gene product label)));
946							}
947							}
948	6					11	my $thresh = $self->error_threshold;
949	6	50				9	if ($severity > $thresh) {
950	0					0	$self->{_problems_reported} = 1;
951	0					0	$self->throw("PROBLEM, SEVERITY==$severity\n$desc");
952							}
953	6					12	$self->add_problem([$severity, $desc]);
954	6					7	return;
955							}
956
957							=head2 report_problems
958
959							Title : report_problems
960							Usage : $unflattener->report_problems(\*STDERR);
961							Function:
962							Example :
963							Returns :
964							Args : FileHandle (defaults to STDERR)
965
966
967							=cut
968
969							sub report_problems{
970	0			0	1	0	my ($self, $fh) = @_;
971
972	0	0				0	if (!$fh) {
973	0					0	$fh = \*STDERR;
974							}
975	0					0	foreach my $problem ($self->get_problems) {
976	0					0	my ($sev, $desc) = @$problem;
977	0					0	printf $fh "PROBLEM, SEVERITY==$sev\n$desc\n";
978							}
979	0					0	$self->{_problems_reported} = 1;
980	0					0	return;
981							}
982
983							=head2 ignore_problems
984
985							Title : ignore_problems
986							Usage : $obj->ignore_problems();
987							Function:
988							Example :
989							Returns :
990							Args :
991
992							Unflattener is very particular about problems it finds along the
993							way. If you have set the error_threshold such that less severe
994							problems do not cause exceptions, Unflattener still expects you to
995							report_problems() at the end, so that the user of the module is aware
996							of any inconsistencies or problems with the data. In fact, a warning
997							will be produced if there are unreported problems. To silence, this
998							warning, call the ignore_problems() method before the Unflattener
999							object is destroyed.
1000
1001							=cut
1002
1003							sub ignore_problems{
1004	0			0	1	0	my ($self) = @_;
1005	0					0	$self->{_ignore_problems} = 1;
1006	0					0	return;
1007							}
1008
1009
1010							=head2 error_threshold
1011
1012							Title : error_threshold
1013							Usage : $obj->error_threshold($severity)
1014							Function:
1015							Example :
1016							Returns : value of error_threshold (a scalar)
1017							Args : on set, new value (an integer)
1018
1019							Sets the threshold above which errors cause this module to throw an
1020							exception. The default is 0; all problems with a severity E 0 will
1021							cause an exception.
1022
1023							If you raise the threshold to 1, then the unflattening process will be
1024							more lax; problems of severity==1 are generally non-fatal, but may
1025							indicate that the results should be inspected, for example, to make
1026							sure there is no data loss.
1027
1028							=cut
1029
1030							sub error_threshold{
1031	7			7	1	14	my $self = shift;
1032
1033	7	100				15	return $self->{'error_threshold'} = shift if @_;
1034	6		50			14	return $self->{'error_threshold'} \|\| 0;
1035							}
1036
1037
1038
1039							# PRIVATE
1040							#
1041							# given a type (eg mRNA), will return the container type (eg gene)
1042							sub get_container_type{
1043	15023			15023	0	11894	my ($self,$type) = @_;
1044	15023					15620	my @roots = $self->_get_partonomy_roots;
1045	15023	100				14999	if (grep {$_ eq $type} @roots) {
	221597					177665
1046							# it is a root - no parents/containers
1047	2725					4374	return;
1048							}
1049	12298					12793	my $ch = $self->partonomy;
1050	12298					11204	my $ctype = $ch->{$type};
1051	12298	100				14685	if (!$ctype) {
1052							# asterix acts as a wild card
1053	192					166	$ctype = $ch->{'*'};
1054							}
1055	12298					17167	return $ctype;
1056							}
1057
1058							# get root node of partonomy hierarchy (usually gene)
1059							sub _get_partonomy_roots {
1060	15023			15023		11059	my $self = shift;
1061	15023					15468	my $ch = $self->partonomy;
1062	15023					40798	my @parents = values %$ch;
1063							# find parents that do not have parents themselves
1064	15023					12142	return grep {!$ch->{$_}} @parents;
	309029					279736
1065							}
1066
1067
1068
1069							=head2 unflatten_seq
1070
1071							Title : unflatten_seq
1072							Usage : @sfs = $unflattener->unflatten_seq($seq);
1073							Function: turns a flat list of features into a list of holder features
1074							Example :
1075							Returns : list of Bio::SeqFeatureI objects
1076							Args : see below
1077
1078							partitions a list of features then arranges them in a nested tree; see
1079							above for full explanation.
1080
1081							note - the Bio::SeqI object passed in will be modified
1082
1083							Arguments
1084
1085							-seq : a Bio::SeqI object; must contain Bio::SeqFeatureI objects
1086							(this is optional if seq has already been set)
1087
1088							-use_magic: if TRUE (ie non-zero) then magic will be invoked;
1089							see discussion above.
1090
1091							-resolver_method: a CODE reference
1092							see the documentation above for an example of
1093							a subroutine that can be used to resolve hierarchies
1094							within groups.
1095
1096							this is optional - if nothing is supplied, a default
1097							subroutine will be used (see below)
1098
1099							-group_tag: a string
1100							[ see the group_tag() method ]
1101							this overrides the default group_tag which is 'gene'
1102
1103
1104
1105							=cut
1106
1107							sub unflatten_seq{
1108	11			11	1	334	my ($self,@args) = @_;
1109
1110	11					64	my($seq, $resolver_method, $group_tag, $partonomy,
1111							$structure_type, $resolver_tag, $use_magic, $noinfer) =
1112							$self->_rearrange([qw(SEQ
1113							RESOLVER_METHOD
1114							GROUP_TAG
1115							PARTONOMY
1116							STRUCTURE_TYPE
1117							RESOLVER_TAG
1118							USE_MAGIC
1119							NOINFER
1120							)],
1121							@args);
1122
1123							# seq we want to unflatten
1124	11		33			44	$seq = $seq \|\| $self->seq;
1125	11	100				38	if (!$self->seq) {
1126	1					4	$self->seq($seq);
1127							}
1128
1129
1130							# prevent bad argument combinations
1131	11	50	66			54	if ($partonomy &&
1132							defined($structure_type)) {
1133	0					0	$self->throw("You cannot set both -partonomy and -structure_type\n".
1134							"(the former is implied by the latter)");
1135							}
1136
1137							# remember the current value of partonomy, to reset later
1138	11					38	my $old_partonomy = $self->partonomy;
1139	11	100				35	$self->partonomy($partonomy) if defined $partonomy;
1140
1141							# remember old structure_type
1142	11					36	my $old_structure_type = $self->structure_type;
1143	11	50				28	$self->structure_type($structure_type) if defined $structure_type;
1144
1145							# if we are sourcing our data from genbank, all the
1146							# features should be flat (eq no sub_SeqFeatures)
1147	11					34	my @flat_seq_features = $seq->get_SeqFeatures;
1148	11					351	my @all_seq_features = $seq->get_all_SeqFeatures;
1149
1150							# sanity checks
1151	11	50				289	if (@all_seq_features > @flat_seq_features) {
1152	0					0	$self->throw("It looks as if this sequence has already been unflattened");
1153							}
1154	11	50				33	if (@all_seq_features < @flat_seq_features) {
1155	0					0	$self->throw("ASSERTION ERROR: something is seriously wrong with your features");
1156							}
1157
1158							# tag for ungrouping; usually /gene or /locus_tag
1159							# for example: /gene="foo"
1160	11		66			55	$group_tag = $group_tag \|\| $self->group_tag;
1161	11	100				29	if ($use_magic) {
1162							# use magic to guess the group tag
1163							my @sfs_with_locus_tag =
1164	7					30	grep {$_->has_tag("locus_tag")} @flat_seq_features;
	6020					6469
1165							my @sfs_with_gene_tag =
1166	7					25	grep {$_->has_tag("gene")} @flat_seq_features;
	6020					6250
1167							my @sfs_with_product_tag =
1168	7					22	grep {$_->has_tag("product")} @flat_seq_features;
	6020					6367
1169
1170							# if ($group_tag && $self->{'trust_grouptag'}) { # dgg suggestion
1171							#
1172							# }
1173							# elsif
1174	7	100				29	if (@sfs_with_locus_tag) {
1175							# dgg note: would like to -use_magic with -group_tag = 'gene' for ensembl genomes
1176							# where ensembl gene FT have both /locus_tag and /gene, but mRNA, CDS have /gene only
1177	1	50	33			5	if ($group_tag && $group_tag ne 'locus_tag') {
1178	0					0	$self->throw("You have explicitly set group_tag to be '$group_tag'\n".
1179							"However, I detect that some features use /locus_tag\n".
1180							"I believe that this is the correct group_tag to use\n".
1181							"You can resolve this by either NOT setting -group_tag\n".
1182							"OR you can unset -use_magic to regain control");
1183							}
1184
1185							# use /locus_tag instead of /gene tag for grouping
1186							# see GenBank entry AE003677 (version 3) for an example
1187	1					2	$group_tag = 'locus_tag';
1188	1	50				5	if ($self->verbose > 0) {
1189	0					0	warn "Set group tag to: $group_tag\n";
1190							}
1191							}
1192
1193							# on rare occasions, records will have no /gene or /locus_tag
1194							# but it WILL have /product tags. These serve the same purpose
1195							# for grouping. For an example, see AY763288 (also in t/data)
1196	7	50	100			391	if (@sfs_with_locus_tag==0 &&
			66
			66
1197							@sfs_with_gene_tag==0 &&
1198							@sfs_with_product_tag>0 &&
1199							!$group_tag) {
1200	1					3	$group_tag = 'product';
1201	1	50				3	if ($self->verbose > 0) {
1202	0					0	warn "Set group tag to: $group_tag\n";
1203							}
1204
1205							}
1206							}
1207	11	100				32	if (!$group_tag) {
1208	7					19	$group_tag = 'gene';
1209							}
1210
1211							# ------------------------------
1212							# GROUP FEATURES using $group_tag
1213							# collect features into unstructured groups
1214							# ------------------------------
1215
1216							# -------------
1217							# we want to generate a list of groups;
1218							# each group is a list of SeqFeatures; this
1219							# group probably (but not necessarily)
1220							# corresponds to a gene model.
1221							#
1222							# this array will look something like this:
1223							# ([$f1], [$f2, $f3, $f4], ...., [$f97, $f98, $f99])
1224							#
1225							# there are also 'singleton' groups, with one member.
1226							# for instance, the 'source' feature is in a singleton group;
1227							# the same with others such as 'misc_feature'
1228	11					20	my @groups = ();
1229							# -------------
1230
1231							# --------------------
1232							# we hope that the genbank record allows us to group by some grouping
1233							# tag.
1234							# for instance, most of the time a gene model can be grouped using
1235							# the gene tag - that is where you see
1236							# /gene="foo"
1237							# in a genbank record
1238							# --------------------
1239
1240							# keep an index of groups by their
1241							# grouping tag
1242	11					32	my %group_by_tag = ();
1243
1244
1245							# iterate through all features, putting them into groups
1246	11					21	foreach my $sf (@flat_seq_features) {
1247	6333	100				7481	if (!$sf->has_tag($group_tag)) {
1248							# SINGLETON
1249							# this is an ungroupable feature;
1250							# add it to a group of its own
1251	64					115	push(@groups, [$sf]);
1252							}
1253							else {
1254							# NON-SINGLETON
1255	6269					6526	my @group_tagvals = $sf->get_tag_values($group_tag);
1256	6269	50				7584	if (@group_tagvals > 1) {
1257							# sanity check:
1258							# currently something can only belong to one group
1259	0					0	$self->problem(2,
1260							">1 value for /$group_tag: @group_tagvals\n".
1261							"At this time this module is not equipped to handle this adequately", $sf);
1262							}
1263							# get value of group tag
1264	6269					4443	my $gtv = shift @group_tagvals;
1265	6269	50				6586	$gtv \|\| $self->throw("Empty /$group_tag vals not allowed!");
1266
1267							# is this a new group?
1268	6269					4531	my $group = $group_by_tag{$gtv};
1269	6269	100				5694	if ($group) {
1270							# this group has been encountered before - add current
1271							# sf to the end of the group
1272	4253					5190	push(@$group, $sf);
1273							}
1274							else {
1275							# new group; add to index and create new group
1276	2016					1859	$group = [$sf]; # currently one member; probably more to come
1277	2016					2391	$group_by_tag{$gtv} = $group;
1278	2016					2410	push(@groups, $group);
1279							}
1280							}
1281							}
1282
1283							# as well as having the same group_tag, a group should be spatially
1284							# connected. if not, then the group should be split into subgroups.
1285							# this turns out to be necessary in the case of multicopy genes.
1286							# the standard way to represent these is as spatially disconnected
1287							# gene models (usually a 'gene' feature and some kind of RNA feature)
1288							# with the same group tag; the code below will split these into
1289							# seperate groups, one per copy.
1290	11					27	@groups = map { $self->_split_group_if_disconnected($_) } @groups;
	2080					3759
1291
1292							# remove any duplicates; most of the time the method below has
1293							# no effect. there are some unusual genbank records for which
1294							# duplicate removal is necessary. see the comments in the
1295							# _remove_duplicates_from_group() method if you want to know
1296							# the ugly details
1297	11					188	foreach my $group (@groups) {
1298	2083					1950	$self->_remove_duplicates_from_group($group);
1299							}
1300
1301							# -
1302
1303							# PSEUDOGENES, PSEUDOEXONS AND PSEUDOINTRONS
1304							# these are indicated with the /pseudo tag
1305							# these are mapped to a different type; they should NOT
1306							# be treated as normal genes
1307	11					51	foreach my $sf (@all_seq_features) {
1308	6333	100				6490	if ($sf->has_tag('pseudo')) {
1309	899					950	my $type = $sf->primary_tag;
1310							# SO type is typically the same as the normal
1311							# type but preceeded by "pseudo"
1312	899	100	66			1982	if ($type eq 'misc_RNA' \|\| $type eq 'mRNA') {
1313							# dgg: see TypeMapper; both pseudo mRNA,misc_RNA should be pseudogenic_transcript
1314	101					110	$sf->primary_tag("pseudotranscript");
1315							}
1316							else {
1317	798					1034	$sf->primary_tag("pseudo$type");
1318							}
1319							}
1320							}
1321							# now some of the post-processing that follows which applies to
1322							# genes will NOT be applied to pseudogenes; this is deliberate
1323							# for example, gene models are normalised to be gene-transcript-exon
1324							# for pseudogenes we leave them as pseudogene-pseudoexon
1325
1326							# --- MAGIC ---
1327	11					22	my $need_to_infer_exons = 0;
1328	11					15	my $need_to_infer_mRNAs = 0;
1329	11					20	my @removed_exons = ();
1330	11	100				35	if ($use_magic) {
1331	7	50				22	if (defined($structure_type)) {
1332	0					0	$self->throw("Can't combine use_magic AND setting structure_type");
1333							}
1334							my $n_introns =
1335	7					26	scalar(grep {$_->primary_tag eq 'exon'} @flat_seq_features);
	6020					6296
1336							my $n_exons =
1337	7					21	scalar(grep {$_->primary_tag eq 'exon'} @flat_seq_features);
	6020					6100
1338							my $n_mrnas =
1339	7					24	scalar(grep {$_->primary_tag eq 'mRNA'} @flat_seq_features);
	6020					6106
1340							my $n_mrnas_attached_to_gene =
1341	7	100				23	scalar(grep {$_->primary_tag eq 'mRNA' &&
	6020					6672
1342							$_->has_tag($group_tag)} @flat_seq_features);
1343							my $n_cdss =
1344	7					25	scalar(grep {$_->primary_tag eq 'CDS'} @flat_seq_features);
	6020					6335
1345							my $n_rnas =
1346	7					22	scalar(grep {$_->primary_tag =~ /RNA/} @flat_seq_features);
	6020					6647
1347							# Are there any CDS features in the record?
1348	7	100				18	if ($n_cdss > 0) {
1349							# YES
1350
1351							# - a pc gene model should contain at the least a CDS
1352
1353							# Are there any mRNA features in the record?
1354	6	100				25	if ($n_mrnas == 0) {
		50
1355							# NO mRNAs:
1356							# looks like structure_type == 1
1357	1					3	$structure_type = 1;
1358	1					3	$need_to_infer_mRNAs = 1;
1359							}
1360							elsif ($n_mrnas_attached_to_gene == 0) {
1361							# $n_mrnas > 0
1362							# $n_mrnas_attached_to_gene = 0
1363							#
1364							# The entries _do_ contain mRNA features,
1365							# but none of them are part of a group/gene, i.e. they
1366							# are 'floating'
1367
1368							# this is an annoying weird file that has some floating
1369							# mRNA features;
1370							# eg ftp.ncbi.nih.gov/genomes/Schizosaccharomyces_pombe/
1371
1372	0	0				0	if ($self->verbose) {
1373							my @floating_mrnas =
1374	0	0				0	grep {$_->primary_tag eq 'mRNA' &&
	0					0
1375							!$_->has_tag($group_tag)} @flat_seq_features;
1376	0					0	printf STDERR "Unattached mRNAs:\n";
1377	0					0	foreach my $mrna (@floating_mrnas) {
1378	0					0	$self->_write_sf_detail($mrna);
1379							}
1380	0					0	printf STDERR "Don't know how to deal with these; filter at source?\n";
1381							}
1382
1383	0					0	foreach (@flat_seq_features) {
1384	0	0				0	if ($_->primary_tag eq 'mRNA') {
1385							# what should we do??
1386
1387							# I think for pombe we just have to filter
1388							# out bogus mRNAs prior to starting
1389							}
1390							}
1391
1392							# looks like structure_type == 2
1393	0					0	$structure_type = 2;
1394	0					0	$need_to_infer_mRNAs = 1;
1395							}
1396							else {
1397							}
1398
1399							# we always infer exons in magic mode
1400	6					13	$need_to_infer_exons = 1;
1401							}
1402							else {
1403							# this doesn't seem to be any kind of protein coding gene model
1404	1	50				4	if ( $n_rnas > 0 ) {
1405	1					2	$need_to_infer_exons = 1;
1406							}
1407							}
1408
1409	7	50				21	$need_to_infer_exons = 0 if $noinfer; #NML
1410
1411	7	50				21	if ($need_to_infer_exons) {
1412							# remove exons and introns from group -
1413							# we will infer exons later, and we
1414							# can always infer introns from exons
1415	7					20	foreach my $group (@groups) {
1416							@$group =
1417							grep {
1418	1936					1723	my $type = $_->primary_tag();
	6009					6390
1419	6009	100				7122	if ($type eq 'exon') {
1420							# keep track of all removed exons,
1421							# so we can do a sanity check later
1422	130					108	push(@removed_exons, $_);
1423							}
1424	6009	100				14920	$type ne 'exon' && $type ne 'intron'
1425							} @$group;
1426							}
1427							# get rid of any groups that have zero members
1428	7					17	@groups = grep {scalar(@$_)} @groups;
	1936					1497
1429							}
1430							}
1431							# --- END OF MAGIC ---
1432
1433							# LOGICAL ASSERTION
1434	11	50				59	if (grep {!scalar(@$_)} @groups) {
	2078					1619
1435	0					0	$self->throw("ASSERTION ERROR: empty group");
1436							}
1437
1438							# LOGGING
1439	11	50				36	if ($self->verbose > 0) {
1440	0					0	printf STDERR "GROUPS:\n";
1441	0					0	foreach my $group (@groups) {
1442	0					0	$self->_write_group($group, $group_tag);
1443							}
1444							}
1445							# -
1446
1447							# --------- FINISHED GROUPING -------------
1448
1449
1450							# TYPE CONTAINMENT HIERARCHY (aka partonomy)
1451							# set the containment hierarchy if desired
1452							# see docs for structure_type() method
1453	11	100				36	if ($structure_type) {
1454	1	50				5	if ($structure_type == 1) {
1455	1					7	$self->partonomy(
1456							{CDS => 'gene',
1457							exon => 'CDS',
1458							intron => 'CDS',
1459							}
1460							);
1461							}
1462							else {
1463	0					0	$self->throw("structure_type $structure_type is currently unknown");
1464							}
1465							}
1466
1467							# see if we have an obvious resolver_tag
1468	11	100				21	if ($use_magic) {
1469	7					17	foreach my $sf (@all_seq_features) {
1470	6020	100				6139	if ($sf->has_tag('derived_from')) {
1471	2					4	$resolver_tag = 'derived_from';
1472							}
1473							}
1474							}
1475
1476	11	100				29	if ($use_magic) {
1477							# point all feature types without a container type to the root type.
1478							#
1479							# for example, if we have an unanticipated feature_type, say
1480							# 'aberration', this should by default point to the parent 'gene'
1481	7					17	foreach my $group (@groups) {
1482	1931					3207	my @sfs = @$group;
1483	1931	100				2469	if (@sfs > 1) {
1484	1384					1125	foreach my $sf (@sfs) {
1485	5227					6526	my $type = $sf->primary_tag;
1486	5227	100				6573	next if $type eq 'gene';
1487	3989					3689	my $container_type = $self->get_container_type($type);
1488	3989	100				5929	if (!$container_type) {
1489	9					14	$self->partonomy->{$type} = 'gene';
1490							}
1491							}
1492							}
1493							}
1494							}
1495
1496							# we have done the first part of the unflattening.
1497							# we now have a list of groups; each group is a list of seqfeatures.
1498							# the actual group itself is flat; we may want to unflatten this further;
1499							# for instance, a gene model can contain multiple mRNAs and CDSs. We may want
1500							# to link the correct mRNA to the correct CDS via the bioperl sub_SeqFeature tree.
1501							#
1502							# what we would end up with would be
1503							# gene1
1504							# mRNA-a
1505							# CDS-a
1506							# mRNA-b
1507							# CDS-b
1508	11					81	my @top_sfs = $self->unflatten_groups(-groups=>\@groups,
1509							-resolver_method=>$resolver_method,
1510							-resolver_tag=>$resolver_tag);
1511
1512							# restore settings
1513	11					161	$self->partonomy($old_partonomy);
1514
1515							# restore settings
1516	11					38	$self->structure_type($old_structure_type);
1517
1518							# modify the original Seq object - the top seqfeatures are now
1519							# the top features from each group
1520	11					71	$seq->remove_SeqFeatures;
1521	11					52	$seq->add_SeqFeature($_) foreach @top_sfs;
1522
1523							# --------- FINISHED UNFLATTENING -------------
1524
1525							# lets see if there are any post-unflattening tasks we need to do
1526
1527
1528
1529							# INFERRING mRNAs
1530	11	100				33	if ($need_to_infer_mRNAs) {
1531	1	50				5	if ($self->verbose > 0) {
1532	0					0	printf STDERR "** INFERRING mRNA from CDS\n";
1533							}
1534	1					7	$self->infer_mRNA_from_CDS(-seq=>$seq, -noinfer=>$noinfer);
1535							}
1536
1537							# INFERRING exons
1538	11	100				24	if ($need_to_infer_exons) {
1539
1540							# infer exons, one group/gene at a time
1541	7					14	foreach my $sf (@top_sfs) {
1542	2034					4679	my @sub_sfs = ($sf, $sf->get_all_SeqFeatures);
1543	2034					5502	$self->feature_from_splitloc(-features=>\@sub_sfs);
1544							}
1545
1546							# some exons are stated explicitly; ie there is an "exon" feature
1547							# most exons are inferred; ie there is a "mRNA" feature with
1548							# split locations
1549							#
1550							# if there were exons explicitly stated in the entry, we need to
1551							# do two things:
1552							#
1553							# make sure these exons are consistent with the inferred exons
1554							# (you never know)
1555							#
1556							# transfer annotation (tag-vals) from the explicit exon to the
1557							# new inferred exon
1558	7	100				26	if (@removed_exons) {
1559	2					10	my @allfeats = $seq->get_all_SeqFeatures;
1560
1561							# find all the inferred exons that are children of mRNA
1562	2					7	my @mrnas = grep {$_->primary_tag eq 'mRNA'} @allfeats;
	213					222
1563							my @exons =
1564	160					165	grep {$_->primary_tag eq 'exon'}
1565	2					4	map {$_->get_SeqFeatures} @mrnas;
	26					30
1566
1567	2					8	my %exon_h = (); # index of exons by location;
1568
1569							# there CAN be >1 exon at a location; we can represent these redundantly
1570							# (ie as a tree, not a graph)
1571	2					4	push(@{$exon_h{$self->_locstr($_)}}, $_) foreach @exons;
	134					151
1572	2					4	my @problems = (); # list of problems;
1573							# each problem is a
1574							# [$severity, $description] pair
1575	2					3	my $problem = '';
1576	2					8	my ($n_exons, $n_removed_exons) =
1577							(scalar(keys %exon_h), scalar(@removed_exons));
1578	2					6	foreach my $removed_exon (@removed_exons) {
1579	130					161	my $locstr = $self->_locstr($removed_exon);
1580	130					158	my $inferred_exons = $exon_h{$locstr};
1581	130					128	delete $exon_h{$locstr};
1582	130	50				152	if ($inferred_exons) {
1583	130					123	my %exons_done = ();
1584	130					124	foreach my $exon (@$inferred_exons) {
1585
1586							# make sure we don't move stuff twice
1587	134	100				243	next if $exons_done{$exon};
1588	130					146	$exons_done{$exon} = 1;
1589
1590							# we need to tranfer any tag-values from the explicit
1591							# exon to the implicit exon
1592	130					174	foreach my $tag ($removed_exon->get_all_tags) {
1593	284					324	my @vals = $removed_exon->get_tag_values($tag);
1594	284	50				583	if (!$exon->can("add_tag_value")) {
1595							# I'm puzzled as to what should be done here;
1596							# SeqFeatureIs are not necessarily mutable,
1597							# but we know that in practice the implementing
1598							# class is mutable
1599	0					0	$self->throw("The SeqFeature object does not ".
1600							"implement add_tag_value()");
1601							}
1602	284					367	$exon->add_tag_value($tag, @vals);
1603							}
1604							}
1605							}
1606							else {
1607							# no exons inferred at $locstr
1608	0					0	push(@problems,
1609							[1,
1610							"there is a conflict with exons; there was an explicitly ".
1611							"stated exon with location $locstr, yet I cannot generate ".
1612							"this exon from the supplied mRNA locations\n"]);
1613							}
1614							}
1615							# do we have any inferred exons left over, that were not
1616							# covered in the explicit exons?
1617	2	50				7	if (keys %exon_h) {
1618							# TODO - we ignore this problem for now
1619	0					0	push(@problems,
1620							[1,
1621							sprintf("There are some inferred exons that are not in the ".
1622							"explicit exon list; they are the exons at locations:\n".
1623							join("\n", keys %exon_h)."\n")]);
1624							}
1625
1626							# report any problems
1627	2	50				23	if (@problems) {
1628	0					0	my $thresh = $self->error_threshold;
1629	0					0	my @bad_problems = grep {$_->[0] > $thresh} @problems;
	0					0
1630	0	0				0	if (@bad_problems) {
1631	0					0	printf STDERR "PROBLEM:\n";
1632	0					0	$self->_write_hier(\@top_sfs);
1633							# TODO - allow more fine grained control over this
1634	0					0	$self->{_problems_reported} = 1;
1635							$self->throw(join("\n",
1636	0					0	map {"@$_"} @bad_problems));
	0					0
1637							}
1638	0					0	$self->problem(@$_) foreach @problems;
1639							}
1640							}
1641							}
1642							# --- end of inferring exons --
1643
1644							# return new top level features; this can also
1645							# be retrieved via
1646							# $seq->get_SeqFeatures();
1647							# return @top_sfs;
1648	11					50	return $seq->get_SeqFeatures;
1649							}
1650
1651							# _split_group_if_disconnected([@sfs])
1652							#
1653							# as well as having the same group_tag, a group should be spatially
1654							# connected. if not, then the group should be split into subgroups.
1655							# this turns out to be necessary in the case of multicopy genes.
1656							# the standard way to represent these is as spatially disconnected
1657							# gene models (usually a 'gene' feature and some kind of RNA feature)
1658							# with the same group tag; the code below will split these into
1659							# seperate groups, one per copy.
1660
1661							sub _split_group_if_disconnected {
1662	2080			2080		1947	my $self = shift;
1663	2080					1805	my $group = shift;
1664	2080					3976	my @sfs = @$group;
1665	2080					4568	my @ranges =
1666							Bio::Range->disconnected_ranges(@sfs);
1667	2080					1924	my @groups;
1668	2080	50				4087	if (@ranges == 0) {
		100
1669	0					0	$self->throw("ASSERTION ERROR");
1670							}
1671							elsif (@ranges == 1) {
1672							# no need to split the group
1673	2077					2372	@groups = ($group);
1674							}
1675							else {
1676							# @ranges > 1
1677							# split the group into disconnected ranges
1678	3	50				10	if ($self->verbose > 0) {
1679	0					0	printf STDERR "GROUP PRE-SPLIT:\n";
1680	0					0	$self->_write_group($group, $self->group_tag);
1681							}
1682							@groups =
1683							map {
1684	3					9	my $range = $_;
	6					7
1685							[grep {
1686	6					18	$_->intersection($range);
	136					231
1687							} @sfs]
1688							} @ranges;
1689	3	50				12	if ($self->verbose > 0) {
1690	0					0	printf STDERR "SPLIT GROUPS:\n";
1691	0					0	$self->_write_group($_, $self->group_tag) foreach @groups;
1692							}
1693							}
1694	2080					3390	return @groups;
1695							}
1696
1697							sub _remove_duplicates_from_group {
1698	2083			2083		1297	my $self = shift;
1699	2083					1372	my $group = shift;
1700
1701							# ::: WEIRD BOUNDARY CASE CODE :::
1702							# for some reason, there are some gb records with two gene
1703							# features for one gene; for example, see ATF14F8.gbk
1704							# in the t/data directory
1705							#
1706							# in this case, we get rid of one of the genes
1707
1708	2083					1796	my @genes = grep {$_->primary_tag eq 'gene'} @$group;
	6333					8269
1709	2083	100				2503	if (@genes > 1) {
1710							# OK, if we look at ATF14F8.gbk we see that some genes
1711							# just exist as a single location, some exist as a multisplit location;
1712							#
1713							# eg
1714
1715							# gene 16790..26395
1716							# /gene="F14F8_60"
1717							# ...
1718							# gene complement(join(16790..19855,20136..20912,21378..21497,
1719							# 21654..21876,22204..22400,22527..23158,23335..23448,
1720							# 23538..23938,24175..24536,24604..24715,24889..24984,
1721							# 25114..25171,25257..25329,25544..25589,25900..26018,
1722							# 26300..26395))
1723							# /gene="F14F8_60"
1724
1725							# the former is the 'standard' way of representing the gene in genbank;
1726							# the latter is redundant with the CDS entry. So we shall get rid of
1727							# the latter with the following filter
1728
1729	11	50				18	if ($self->verbose > 0) {
1730	0					0	printf STDERR "REMOVING DUPLICATES:\n";
1731							}
1732
1733							@genes =
1734							grep {
1735	11					9	my $loc = $_->location;
	22					32
1736	22	100				58	if ($loc->isa("Bio::Location::SplitLocationI")) {
1737	10					17	my @locs = $loc->each_Location;
1738	10	50				17	if (@locs > 1) {
1739	10					14	0;
1740							}
1741							else {
1742	0					0	1;
1743							}
1744							}
1745							else {
1746	12					16	1;
1747							}
1748							} @genes;
1749
1750	11	100				17	if (@genes > 1) {
1751							# OK, that didn't work. Our only resort is to just pick one at random
1752	1					3	@genes = ($genes[0]);
1753							}
1754	11	50				17	if (@genes) {
1755	11	50				15	@genes == 1 \|\| $self->throw("ASSERTION ERROR");
1756							@$group =
1757	11					14	($genes[0], grep {$_->primary_tag ne 'gene'} @$group);
	170					179
1758							}
1759							}
1760							# its a dirty job but someone's gotta do it
1761	2083					1977	return;
1762							}
1763
1764
1765							=head2 unflatten_groups
1766
1767							Title : unflatten_groups
1768							Usage :
1769							Function: iterates over groups, calling unflatten_group() [see below]
1770							Example :
1771							Returns : list of Bio::SeqFeatureI objects that are holders
1772							Args : see below
1773
1774							Arguments
1775
1776							-groups: list of list references; inner list is of Bio::SeqFeatureI objects
1777							e.g. ( [$sf1], [$sf2, $sf3, $sf4], [$sf5, ...], ...)
1778
1779							-resolver_method: a CODE reference
1780							see the documentation above for an example of
1781							a subroutine that can be used to resolve hierarchies
1782							within groups.
1783
1784							this is optional - a default subroutine will be used
1785
1786
1787							NOTE: You should not need to call this method, unless you want fine
1788							grained control over how the unflattening process.
1789
1790							=cut
1791
1792							sub unflatten_groups{
1793	11			11	1	24	my ($self,@args) = @_;
1794	11					52	my($groups, $resolver_method, $resolver_tag) =
1795							$self->_rearrange([qw(GROUPS
1796							RESOLVER_METHOD
1797							RESOLVER_TAG
1798							)],
1799							@args);
1800
1801							# this is just a simple wrapper for unflatten_group()
1802							return
1803							map {
1804	11					30	$self->unflatten_group(-group=>$_,
	2078					5670
1805							-resolver_method=>$resolver_method,
1806							-resolver_tag=>$resolver_tag)
1807							} @$groups;
1808							}
1809
1810							=head2 unflatten_group
1811
1812							Title : unflatten_group
1813							Usage :
1814							Function: nests a group of features into a feature containment hierarchy
1815							Example :
1816							Returns : Bio::SeqFeatureI objects that holds other features
1817							Args : see below
1818
1819							Arguments
1820
1821							-group: reference to list of Bio::SeqFeatureI objects
1822
1823							-resolver_method: a CODE reference
1824							see the documentation above for an example of
1825							a subroutine that can be used to resolve hierarchies
1826							within groups
1827
1828							this is optional - a default subroutine will be used
1829
1830
1831							NOTE: You should not need to call this method, unless you want fine
1832							grained control over how the unflattening process.
1833
1834							=cut
1835
1836							sub unflatten_group{
1837	2078			2078	1	3244	my ($self,@args) = @_;
1838
1839	2078					5643	my($group, $resolver_method, $resolver_tag) =
1840							$self->_rearrange([qw(GROUP
1841							RESOLVER_METHOD
1842							RESOLVER_TAG
1843							)],
1844							@args);
1845
1846	2078	50				5222	if ($self->verbose > 0) {
1847	0					0	printf STDERR "UNFLATTENING GROUP:\n";
1848	0					0	$self->_write_group($group, $self->group_tag);
1849							}
1850
1851	2078					4389	my @sfs = @$group;
1852
1853							# we can safely ignore singletons (e.g. [source])
1854	2078	100				4285	return $sfs[0] if @sfs == 1;
1855
1856	1508					2542	my $partonomy = $self->partonomy;
1857
1858							# $resolver_method is a reference to a SUB that will resolve
1859							# ambiguous parent/child containment; for example, determining
1860							# which mRNAs go with which CDSs
1861	1508		100			4167	$resolver_method = $resolver_method \|\| \&_resolve_container_for_sf;
1862
1863							# TAG BASED RESOLVING OF HIERARCHIES
1864							#
1865							# if the user specifies $resolver_tag, then we use this tag
1866							# to pair up ambiguous parents and children;
1867							#
1868							# for example, the CDS feature may have a resolver tag of /derives_from
1869							# which is a 'foreign key' into the /label tag of the mRNA feature
1870							#
1871							# this kind of tag-based resolution is possible for a certain subset
1872							# of genbank records
1873							#
1874							# if no resolver tag is specified, we revert to the normal
1875							# resolver_method
1876	1508	100				2181	if ($resolver_tag) {
1877	1					3	my $backup_resolver_method = $resolver_method;
1878							# closure: $resolver_tag is remembered by this sub
1879							my $sub =
1880							sub {
1881	2			2		4	my ($self, $sf, @possible_container_sfs) = @_;
1882	2					5	my @container_sfs = ();
1883	2	50				8	if ($sf->has_tag($resolver_tag)) {
1884	2					6	my ($resolver_tagval) = $sf->get_tag_values($resolver_tag);
1885							# if a feature has a resolver_tag (e.g. /derives_from)
1886							# this specifies the /product, /symbol or /label for the
1887							# parent feature
1888							@container_sfs =
1889							grep {
1890	2					3	my $match = 0;
	4					4
1891	4	50				14	$self->_write_sf($_) if $self->verbose > 0;
1892	4					6	foreach my $tag (qw(product symbol label)) {
1893	10	100				16	if ($_->has_tag($tag)) {
1894	6					193	my @vals =
1895							$_->get_tag_values($tag);
1896	6	100				7	if (grep {$_ eq $resolver_tagval} @vals) {
	6					18
1897	2					3	$match = 1;
1898	2					5	last;
1899							}
1900							}
1901							}
1902	4					7	$match;
1903							} @possible_container_sfs;
1904							}
1905							else {
1906	0					0	return $backup_resolver_method->($sf, @possible_container_sfs);
1907							}
1908	2					3	return map {$_=>0} @container_sfs;
	2					8
1909	1					8	};
1910	1					3	$resolver_method = $sub;
1911							}
1912							else {
1913							# CONDITION: $resolver_tag is NOT set
1914	1507	50				2520	$self->throw("assertion error") if $resolver_tag;
1915							}
1916							# we have now set $resolver_method to a subroutine for
1917							# disambiguatimng parent/child relationships. we will
1918							# now build the whole containment hierarchy for this group
1919
1920
1921							# FIND TOP/ROOT SEQFEATURES
1922							#
1923							# find all the features for which there is no
1924							# containing feature type (eg genes)
1925							my @top_sfs =
1926							grep {
1927	1508					1925	!$self->get_container_type($_->primary_tag);
	5517					8935
1928							} @sfs;
1929
1930							# CONDITION: there must be at most one root
1931	1508	50				2353	if (@top_sfs > 1) {
1932	0					0	$self->_write_group($group, $self->group_tag);
1933	0					0	printf STDERR "TOP SFS:\n";
1934	0					0	$self->_write_sf($_) foreach @top_sfs;
1935	0					0	$self->throw("multiple top-sfs in group");
1936							}
1937	1508					1344	my $top_sf = $top_sfs[0];
1938
1939							# CREATE INDEX OF SEQFEATURES BY TYPE
1940	1508					1866	my %sfs_by_type = ();
1941	1508					2160	foreach my $sf (@sfs) {
1942	5517					3800	push(@{$sfs_by_type{$sf->primary_tag}}, $sf);
	5517					6465
1943							}
1944
1945							# containment index; keyed by child; lookup parent
1946							# note: this index uses the stringified object reference of
1947							# the object as a surrogate lookup key
1948
1949	1508					1630	my %container = (); # child -> parent
1950
1951							# ALGORITHM: build containment graph
1952							#
1953							# find all possible containers for each SF;
1954							# for instance, for a CDS, the possible containers are all
1955							# the mRNAs in the same group. For a mRNA, the possible
1956							# containers are any SFs of type 'gene' (should only be 1).
1957							# (these container-type mappings can be overridden)
1958							#
1959							# contention is resolved by checking coordinates of splice sites
1960							# (this is the default, but can be overridden)
1961							#
1962							# most of the time, there is no problem identifying a unique
1963							# parent for every child; this can be ambiguous when constructing
1964							# CDS to mRNA relationships with lots of alternate splicing
1965							#
1966							# a hash of child->parent relationships is constructed (%container)
1967							# any mappings that need further resolution (eg CDS to mRNA) are
1968							# placed in %unresolved
1969
1970							# %unresolved index
1971							# (keyed by stringified object reference of child seqfeature)
1972	1508					1527	my %unresolved = (); # child -> [parent,score] to be resolved
1973
1974							# index of seqfeatures by their stringified object reference;
1975							# this is essentially a way of 'reviving' an object from its stringified
1976							# reference
1977							# (see NOTE ON USING OBJECTS AS KEYS IN HASHES, below)
1978	1508					1322	my %idxsf = map {$_=>$_} @sfs;
	5517					9576
1979
1980	1508					1739	foreach my $sf (@sfs) {
1981	5517					8040	my $type = $sf->primary_tag;
1982
1983							# container type (e.g. the container type for CDS is usually mRNA)
1984	5517					7006	my $container_type =
1985							$self->get_container_type($type);
1986	5517	100				7154	if ($container_type) {
1987
1988							my @possible_container_sfs =
1989	4155	100				2796	@{$sfs_by_type{$container_type} \|\| []};
	4155					8139
1990							# we now have a list of possible containers
1991							# (eg for a CDS in an alternately spliced gene, this
1992							# would be a list of all the mRNAs for this gene)
1993
1994	4155	100				5236	if (!@possible_container_sfs) {
1995							# root of hierarchy
1996							}
1997							else {
1998	3906	100				4718	if (@possible_container_sfs == 1) {
1999							# this is the easy situation, whereby the containment
2000							# hierarchy is unambiguous. this will probably be the
2001							# case if the genbank record has no alternate splicing
2002							# within it
2003
2004							# ONE OPTION ONLY - resolved!
2005	2934					6108	$container{$sf} = $possible_container_sfs[0];
2006
2007							}
2008							else {
2009							# MULTIPLE CONTAINER CHOICES
2010	972	50				1502	$self->throw("ASSERTION ERROR") unless @possible_container_sfs > 1;
2011
2012							# push this onto the %unresolved graph, and deal with it
2013							# later
2014
2015							# for now we hardcode things such that the only type
2016							# with ambiguous parents is a CDS; if this is violated,
2017							# it has a weak problem class of '1' so the API user
2018							# can easily set things to ignore these
2019	972	50				1546	if ($sf->primary_tag ne 'CDS') {
2020	0					0	$self->problem(1,
2021							"multiple container choice for non-CDS; ".
2022							"CDS to mRNA should be the only ".
2023							"relationships requiring resolving",
2024							$sf);
2025							}
2026
2027							# previously we set the SUB $resolver_method
2028	972	50				1853	$self->throw("ASSERTION ERROR")
2029							unless $resolver_method;
2030
2031							# $resolver_method will assign scores to
2032							# parent/child combinations; later on we
2033							# will use these scores to find the optimal
2034							# parent/child pairings
2035
2036							# the default $resolver_method uses splice sites to
2037							# score possible parent/child matches
2038
2039	972					1735	my %container_sfh =
2040							$resolver_method->($self, $sf, @possible_container_sfs);
2041	972	100				3715	if (!%container_sfh) {
2042	6					23	$self->problem(2,
2043							"no containers possible for SeqFeature of ".
2044							"type: $type; this SF is being placed at ".
2045							"root level",
2046							$sf);
2047							# RESOLVED! (sort of - placed at root/gene level)
2048	6					14	$container{$sf} = $top_sf;
2049
2050							# this sort of thing happens if the record is
2051							# badly messed up and there is absolutely no indication
2052							# of where to put the CDS. Perhaps we should just
2053							# place it with a random mRNA?
2054							}
2055	972					1946	foreach my $jsf (keys %container_sfh) {
2056
2057							# add [score, parent] pairs to the %unresolved
2058							# lookup table/graph
2059	1748					7614	push(@{$unresolved{$sf}},
2060	1748		100			1270	[$idxsf{$jsf}, $container_sfh{$jsf} \|\| 0]);
2061							}
2062							}
2063							}
2064							}
2065							else {
2066							# CONDITION:
2067							# not container type for $sf->primary_tag
2068
2069							# CONDITION:
2070							# $sf must be a root/top node (eg gene)
2071							}
2072							}
2073
2074	1508					1157	if (0) {
2075
2076							# CODE CURRENTLY DISABLED
2077
2078							# we require a 1:1 mapping between mRNAs and CDSs;
2079							# create artificial duplicates if we can't do this...
2080							if (%unresolved) {
2081							my %childh = map {$_=>1} keys %unresolved;
2082							my %parenth = map {$_->[0]=>1} map {@$_} values %unresolved;
2083							if ($self->verbose > 0) {
2084							printf STDERR "MATCHING %d CHILDREN TO %d PARENTS\n",
2085							scalar(keys %childh), scalar(keys %parenth);
2086							}
2087							# 99.99% of the time in genbank genomic record of structure type 0, we
2088							# see one CDS for every mRNA; one exception is the S Pombe
2089							# genome, which is all CDS, bar a few spurious mRNAs; we have to
2090							# filter out the spurious mRNAs in this case
2091							#
2092							# another strange case is in the mouse genome, NT_078847.1
2093							# for Pcdh13 you will notice there is 4 mRNAs and 5 CDSs.
2094							# most unusual!
2095							# I'm at a loss for a really clever thing to do here. I think the
2096							# best thing is to create duplicate features to preserve the 1:1 mapping
2097							# my $suffix_id = 1;
2098							# while (keys %childh > keys %parenth) {
2099							#
2100							# }
2101							}
2102							}
2103
2104							# DEBUGGING CODE
2105	1508	0	50			2672	if ($self->verbose > 0 && scalar(keys %unresolved)) {
2106	0					0	printf STDERR "UNRESOLVED PAIRS:\n";
2107	0					0	foreach my $childsf (keys %unresolved) {
2108	0					0	my @poss = @{$unresolved{$childsf}};
	0					0
2109	0					0	foreach my $p (@poss) {
2110	0					0	my $parentsf = $p->[0];
2111	0					0	$childsf = $idxsf{$childsf};
2112	0					0	my @clabels = ($childsf->get_tagset_values(qw(protein_id label product)), "?");
2113	0					0	my @plabels = ($parentsf->get_tagset_values(qw(transcript_id label product)), "?");
2114	0					0	printf STDERR
2115							(" PAIR: $clabels[0] => $plabels[0] (of %d)\n",
2116							scalar(@poss));
2117							}
2118							}
2119							} # -- end of verbose
2120
2121							# Now we have to fully resolve the containment hierarchy; remember,
2122							# the graph %container has the fully resolved child->parent links;
2123							#
2124							# the graph %unresolved is keyed by children missing parents; we
2125							# need to put all these orphans in the %container graph
2126							#
2127							# we do this using the scores in %unresolved, with the
2128							# find_best_matches() algorithm
2129	1508					1532	my $unresolved_problem_reported = 0;
2130	1508	100				2173	if (%unresolved) {
2131	322					1025	my $new_pairs =
2132							$self->find_best_matches(\%unresolved, []);
2133	322	50				750	if (!$new_pairs) {
2134	0		0			0	my ($g) = $sfs[0]->get_tagset_values($self->group_tag \|\| 'gene');
2135	0					0	$self->problem(2,
2136							"Could not resolve hierarchy for $g");
2137	0					0	$new_pairs = [];
2138	0					0	$unresolved_problem_reported = 1;
2139							}
2140	322					577	foreach my $pair (@$new_pairs) {
2141	966	50				1500	if ($self->verbose > 0) {
2142	0					0	printf STDERR " resolved pair @$pair\n";
2143							}
2144	966					1125	$container{$pair->[0]} = $pair->[1];
2145	966					1904	delete $unresolved{$pair->[0]};
2146							}
2147							}
2148
2149							# CONDITION: containment hierarchy resolved
2150	1508	50				2348	if (%unresolved) {
2151	0	0				0	$self->throw("UNRESOLVED: %unresolved")
2152							unless $unresolved_problem_reported;
2153							}
2154
2155							# make nested SeqFeature hierarchy from @containment_pairs
2156							# ie put child SeqFeatures into parent SeqFeatures
2157	1508					1519	my @top = ();
2158	1508					1733	foreach my $sf (@sfs) {
2159	5517					7503	my $container_sf = $container{$sf};
2160	5517	100				5993	if ($container_sf) {
2161							# make $sf nested inside $container_sf
2162
2163							# first check if the container spatially contains the containee
2164	3906	50				7139	if ($container_sf->contains($sf)) {
2165							# add containee
2166	3906					7352	$container_sf->add_SeqFeature($sf);
2167							}
2168							else {
2169							# weird case - the container does NOT spatially
2170							# contain the containee;
2171							# we expand and throw a warning
2172							#
2173							# for an example of this see ZFP91-CNTF dicistronic gene
2174							# in NCBI chrom 11 build 34.3
2175	0					0	$self->problem(1,
2176							"Container feature does not spatially contain ".
2177							"subfeature. Perhaps this is a dicistronic gene? ".
2178							"I am expanding the parent feature",
2179							$container_sf,
2180							$sf);
2181	0					0	$container_sf->add_SeqFeature($sf, 'EXPAND');
2182							}
2183							}
2184							else {
2185	1611					2023	push(@top, $sf);
2186							}
2187							}
2188	1508					9129	return @top;
2189							} # -- end of unflatten_group
2190
2191							# -------
2192							# A NOTE ON USING OBJECTS AS KEYS IN HASHES (stringified objects)
2193							#
2194							# Often we with to use seqfeatures as keys in a hashtable; because seqfeatures
2195							# in bioperl have no unique ID, we use a surrogate ID in the form of the
2196							# stringified object references - this is just what you get if you say
2197							#
2198							# print "$sf\n";
2199							#
2200							# this is guaranteed to be unique (within a particular perl execution)
2201							#
2202							# often we want to 'revive' the objects used as keys in a hash - once the
2203							# objects are used as keys, remember it is the strings used as keys and
2204							# not the object itself, so the object needs to be revived using another
2205							# hashtable that looks like this
2206							#
2207							# %sfidx = map { $_ => $_ } @sfs
2208							#
2209							# -------
2210
2211
2212							# recursively finds the best set of pairings from a matrix of possible pairings
2213							#
2214							# tries to make sure nothing is unpaired
2215							#
2216							# given a matrix of POSSIBLE matches
2217							# (matrix expressed as hash/lookup; keyed by child object; val = [parent, score]
2218							#
2219							#
2220							sub find_best_matches {
2221	1288			1288	0	1146	my $self = shift;
2222	1288					1041	my $matrix = shift;
2223	1288					986	my $pairs = shift; # [child,parent] pairs already selected
2224
2225	1288					1775	my $verbose = $self->verbose;
2226							#################################print "I";
2227	1288	50				1957	if ($verbose > 0) {
2228	0					0	printf STDERR "find_best_matches: (/%d)\n", scalar(@$pairs);
2229							}
2230
2231	1288					1555	my %selected_children = map {($_->[0]=>1)} @$pairs;
	2302					3289
2232	1288					1576	my %selected_parents = map {($_->[1]=>1)} @$pairs;
	2302					3753
2233
2234							# make a copy of the matrix with the portions still to be
2235							# resolved
2236	1288					1506	my %unresolved_parents = ();
2237							my %unresolved =
2238							map {
2239	1288	50				1783	if ($verbose > 0) {
	4604					5686
2240	0					0	printf STDERR " $_ : %s\n", join("; ", map {"[@$_]"} @{$matrix->{$_}});
	0					0
	0					0
2241							}
2242	4604	100				5010	if ($selected_children{$_}) {
2243	2302					2243	();
2244							}
2245							else {
2246							my @parents =
2247							grep {
2248	5133					8321	!$selected_parents{$_->[0]}
2249	2302					1529	} @{$matrix->{$_}};
	2302					2540
2250	2302					5409	$unresolved_parents{$_} = 1 foreach @parents;
2251							# new parents
2252	2302					4310	($_ => [@parents]);
2253							}
2254							} keys %$matrix;
2255
2256	1288					2304	my @I = keys %unresolved;
2257
2258	1288	100				2498	return $pairs if !scalar(keys %unresolved_parents);
2259							# NECESSARY CONDITION:
2260							# all possible parents have a child match
2261
2262	966	50				1316	return $pairs if !scalar(@I);
2263							# NECESSARY CONDITION:
2264							# all possible children have a parent match
2265
2266							# give those with fewest choices highest priority
2267							@I = sort {
2268							# n possible parents
2269	966					2279	scalar(@{$unresolved{$a}})
	2063					1617
2270							<=>
2271	2063					1291	scalar(@{$unresolved{$b}}) ;
	2063					2115
2272							} @I;
2273
2274	966					1283	my $csf = shift @I;
2275
2276	966					806	my @J = @{$unresolved{$csf}}; # array of [parent, score]
	966					1338
2277
2278							# sort by score, highest first
2279							@J =
2280							sort {
2281	966					1030	$b->[1] <=> $a->[1]
	372					524
2282							} @J;
2283
2284							# select pair(s) from remaining matrix of possible pairs
2285							# by iterating through possible parents
2286
2287	966					837	my $successful_pairs;
2288	966					1008	foreach my $j (@J) {
2289	966					1262	my ($psf, $score) = @$j;
2290							# would selecting $csf, $psf as a pair
2291							# remove all choices from another?
2292	966					1016	my $bad = 0;
2293	966					863	foreach my $sf (@I) {
2294	1336	50				878	if (!grep {$_->[0] ne $psf} @{$unresolved{$sf}}) {
	2854					5158
	1336					1477
2295							# $psf was the only parent choice for $sf
2296	0					0	$bad = 1;
2297	0					0	last;
2298							}
2299							}
2300	966	50				1414	if (!$bad) {
2301	966					1284	my $pair = [$csf, $psf];
2302	966					1213	my $new_pairs = [@$pairs, $pair];
2303	966					1815	my $set = $self->find_best_matches($matrix, $new_pairs);
2304	966	50				1269	if ($set) {
2305	966					658	$successful_pairs = $set;
2306	966					1352	last;
2307							}
2308							}
2309							}
2310							# success
2311	966	50				3398	return $successful_pairs if $successful_pairs;
2312							# fail
2313	0					0	return 0;
2314							}
2315
2316							# ----------------------------------------------
2317							# writes a group to stdout
2318							#
2319							# mostly for logging/debugging
2320							# ----------------------------------------------
2321							sub _write_group {
2322	0			0		0	my $self = shift;
2323	0					0	my $group = shift;
2324	0		0			0	my $group_tag = shift \|\| 'gene';
2325
2326	0					0	my $f = $group->[0];
2327	0					0	my $label = '?';
2328	0	0				0	if ($f->has_tag($group_tag)) {
2329	0					0	($label) = $f->get_tag_values($group_tag);
2330							}
2331	0	0				0	if( $self->verbose > 0 ) {
2332							printf STDERR (" GROUP [%s]:%s\n",
2333							$label,
2334							join(' ',
2335	0					0	map { $_->primary_tag } @$group));
	0					0
2336							}
2337
2338							}
2339
2340							sub _write_sf {
2341	0			0		0	my $self = shift;
2342	0					0	my $sf = shift;
2343	0					0	printf STDERR "TYPE:%s\n", $sf->primary_tag;
2344	0					0	return;
2345							}
2346
2347							sub _write_sf_detail {
2348	0			0		0	my $self = shift;
2349	0					0	my $sf = shift;
2350	0					0	printf STDERR "TYPE:%s\n", $sf->primary_tag;
2351	0					0	my @locs = $sf->location->each_Location;
2352	0					0	printf STDERR " %s,%s [%s]\n", $_->start, $_->end, $_->strand foreach @locs;
2353	0					0	return;
2354							}
2355
2356							sub _write_hier {
2357	0			0		0	my $self = shift;
2358	0	0				0	my @sfs = @{shift \|\| []};
	0					0
2359	0		0			0	my $indent = shift \|\| 0;
2360	0	0				0	if( $self->verbose > 0 ) {
2361	0					0	foreach my $sf (@sfs) {
2362	0					0	my $label = '?';
2363	0	0				0	if ($sf->has_tag('product')) {
2364	0					0	($label) = $sf->get_tag_values('product');
2365							}
2366	0					0	printf STDERR "%s%s $label\n", ' ' x $indent, $sf->primary_tag;
2367	0					0	my @sub_sfs = $sf->sub_SeqFeature;
2368	0					0	$self->_write_hier(\@sub_sfs, $indent+1);
2369							}
2370							}
2371							}
2372
2373							# -----------------------------------------------
2374							#
2375							# returns all possible containers for an SF based
2376							# on splice site coordinates; splice site coords
2377							# must be contained
2378							# -----------------------------------------------
2379							sub _resolve_container_for_sf{
2380	962			962		1271	my ($self, $sf, @possible_container_sfs) = @_;
2381
2382	962					1643	my @coords = $self->_get_splice_coords_for_sf($sf);
2383	962					2089	my $start = $sf->start;
2384	962					1572	my $end = $sf->end;
2385	962					2692	my $splice_uniq_str = "@coords";
2386
2387	962					1106	my @sf_score_pairs = ();
2388							# a CDS is contained by a mRNA if the locations of the splice
2389							# coordinates are identical
2390	962					1365	foreach (@possible_container_sfs) {
2391	3606					5035	my @container_coords = $self->_get_splice_coords_for_sf($_);
2392	3606		100			17918	my $inside =
2393							!$splice_uniq_str \|\|
2394							index("@container_coords", $splice_uniq_str) > -1;
2395	3606	100				5653	if ($inside) {
2396							# the container cannot be smaller than the thing contained
2397	1742	100	66			3125	if ($_->start > $start \|\| $_->end < $end) {
2398	20					34	$inside = 0;
2399							}
2400							}
2401
2402
2403							# SPECIAL CASE FOR /ribosomal_slippage
2404							# See: https://www.ncbi.nlm.nih.gov/collab/FT/
2405	3606	100	100			8385	if (!$inside && $sf->has_tag('ribosomal_slippage')) {
2406	22	50				39	if ($self->verbose > 0) {
2407	0					0	printf STDERR " Checking for ribosomal_slippage\n";
2408							}
2409
2410							# TODO: rewrite this to match introns;
2411							# each slippage will be a "fake" small CDS exon
2412	22					34	my @transcript_splice_sites = @container_coords;
2413	22					27	my @cds_splice_sites = @coords;
2414							##printf STDERR "xxTR SSs: @transcript_splice_sites :: %s\n", $_->get_tag_values('product');
2415							##printf STDERR "xxCD SSs: @cds_splice_sites :: %s\n\n", $sf->get_tag_values('product');
2416
2417							# find the the first splice site within the CDS
2418	22		100			78	while (scalar(@transcript_splice_sites) &&
2419							$transcript_splice_sites[0] < $cds_splice_sites[0]) {
2420	29					67	shift @transcript_splice_sites;
2421							}
2422
2423							##print STDERR "TR SSs: @transcript_splice_sites\n";
2424							##print STDERR "CD SSs: @cds_splice_sites\n\n";
2425
2426	22	100	100			62	if (!(scalar(@transcript_splice_sites)) \|\|
2427							$transcript_splice_sites[0] == $cds_splice_sites[0]) {
2428
2429							# we will now try and align all splice remaining sites in the transcript and CDS;
2430							# any splice site that can't be aligned is assumed to be a ribosomal slippage
2431
2432	16					17	my @slips = ();
2433	16					16	my $in_exon = 1;
2434	16					10	$inside = 1; # innocent until proven guilty..
2435	16					24	while (@cds_splice_sites) {
2436	36	100				51	if (!@transcript_splice_sites) {
		100
2437
2438							# ribosomal slippage is after the last transcript splice site
2439							# Example: (NC_00007, isoform 3 of PEG10)
2440							# mRNA join(85682..85903,92646..99007)
2441							# mRNA join(85682..85903,92646..99007)
2442							# CDS join(85899..85903,92646..93825,93825..94994)
2443
2444							# OR: None of the splice sites align;
2445							# may be a single CDS exon with one slippage inside it.
2446							# Example: (NC_00007, isoform 4 of PEG10)
2447							# mRNA join(85637..85892,92646..99007)
2448							# CDS join(92767..93825,93825..94994)
2449
2450							# Yes, this code is repeated below...
2451	14					11	my $p1 = shift @cds_splice_sites;
2452	14					12	my $p2 = shift @cds_splice_sites;
2453	14	50				19	if ($self->verbose > 0) {
2454	0					0	printf STDERR " Found the ribosomal_slippage: $p1..$p2\n";
2455							}
2456	14					38	push(@slips, ($p2-$p1)-1);
2457							}
2458							elsif ($cds_splice_sites[0] == $transcript_splice_sites[0]) {
2459							# splice sites align: this is not the slippage
2460	20					15	shift @cds_splice_sites;
2461	20					27	shift @transcript_splice_sites;
2462							##print STDERR "MATCH\n";
2463							}
2464							else {
2465							# mismatch
2466	2	50				4	if ($cds_splice_sites[0] < $transcript_splice_sites[0]) {
2467							# potential slippage
2468							# v
2469							# ---TTTTTTTTTT----
2470							# ---CCCC--CCCC----
2471							# ^
2472
2473	2					3	my $p1 = shift @cds_splice_sites;
2474	2					3	my $p2 = shift @cds_splice_sites;
2475	2	50				5	if ($self->verbose > 0) {
2476	0					0	printf STDERR " Found the ribosomal_slippage: $p1..$p2\n";
2477							}
2478	2					6	push(@slips, ($p2-$p1)-1);
2479							}
2480							else {
2481							# not a potential ribosomal slippage
2482	0					0	$inside = 0; # guilty!
2483							##print STDERR "FAIL\n";
2484	0					0	last;
2485							}
2486							}
2487							}
2488	16	50				28	if ($inside) {
2489							# TODO: this is currently completely arbitrary. How many ribosomal slippages do we allow?
2490							# perhaps we need some mini-statistical model here....?
2491	16	50				23	if (@slips > 1) {
2492	0					0	$inside = 0;
2493							}
2494							# TODO: this is currently completely arbitrary. What is the maximum size of a ribosomal slippage?
2495							# perhaps we need some mini-statistical model here....?
2496	16	50				19	if (grep {$_ > 2} @slips) {
	16					42
2497	0					0	$inside = 0;
2498							}
2499							}
2500							}
2501							else {
2502							# not a ribosomal_slippage, sorry
2503							}
2504							}
2505	3606	50				7320	if ($self->verbose > 0) {
2506	0					0	printf STDERR " Checking containment:[$inside] (@container_coords) IN ($splice_uniq_str)\n";
2507							}
2508	3606	100				6986	if ($inside) {
2509							# SCORE: matching (ss-scoords+2)/(n-container-ss-coords+2)
2510	1738					2595	my $score =
2511							(scalar(@coords)+2)/(scalar(@container_coords)+2);
2512	1738					4051	push(@sf_score_pairs,
2513							$_=>$score);
2514							}
2515							}
2516	962					5527	return @sf_score_pairs;
2517							}
2518
2519							sub _get_splice_coords_for_sf {
2520	4568			4568		4625	my $self = shift;
2521	4568					3330	my $sf = shift;
2522
2523	4568					7533	my @locs = $sf->location;
2524	4568	100				6039	if ($sf->location->isa("Bio::Location::SplitLocationI")) {
2525	4467					5822	@locs = $sf->location->each_Location;
2526							}
2527
2528							# get an ordered list of (start, end) positions
2529
2530							# my @coords =
2531							# map {
2532							# $_->strand > 0 ? ($_->start, $_->end) : ($_->end, $_->start)
2533							# } @locs;
2534
2535	4568					6283	my @coords = map {($_->start, $_->end)} @locs;
	50783					58315
2536
2537							# remove first and last leaving only splice sites
2538	4568					5859	pop @coords;
2539	4568					3630	shift @coords;
2540	4568					16622	return @coords;
2541							}
2542
2543							=head2 feature_from_splitloc
2544
2545							Title : feature_from_splitloc
2546							Usage : $unflattener->feature_from_splitloc(-features=>$sfs);
2547							Function:
2548							Example :
2549							Returns :
2550							Args : see below
2551
2552							At this time all this method does is generate exons for mRNA or other RNA features
2553
2554							Arguments:
2555
2556							-feature: a Bio::SeqFeatureI object (that conforms to Bio::FeatureHolderI)
2557							-seq: a Bio::SeqI object that contains Bio::SeqFeatureI objects
2558							-features: an arrayref of Bio::SeqFeatureI object
2559
2560
2561							=cut
2562
2563							sub feature_from_splitloc{
2564	2035			2035	1	2374	my ($self,@args) = @_;
2565
2566	2035					5284	my($sf, $seq, $sfs) =
2567							$self->_rearrange([qw(FEATURE
2568							SEQ
2569							FEATURES
2570							)],
2571							@args);
2572	2035	100				2887	my @sfs = (@{$sfs \|\| []});
	2035					4469
2573	2035	50				3002	push(@sfs, $sf) if $sf;
2574	2035	100				3055	if ($seq) {
2575	1	50				6	$seq->isa("Bio::SeqI") \|\| $self->throw("$seq NOT A SeqI");
2576	1					6	@sfs = $seq->get_all_SeqFeatures;
2577							}
2578	2035					2123	my @exons = grep {$_->primary_tag eq 'exon'} @sfs;
	5856					7377
2579	2035	50				3114	if (@exons) {
2580	0					0	$self->problem(2,
2581							"There are already exons, so I will not infer exons");
2582							}
2583
2584							# index of features by type+location
2585	2035					2175	my %loc_h = ();
2586
2587							# infer for every feature
2588	2035					2128	foreach my $sf (@sfs) {
2589
2590	5856	50				12559	$sf->isa("Bio::SeqFeatureI") \|\| $self->throw("$sf NOT A SeqFeatureI");
2591	5856	50				9900	$sf->isa("Bio::FeatureHolderI") \|\| $self->throw("$sf NOT A FeatureHolderI");
2592
2593	5856					7659	my $type = $sf->primary_tag;
2594	5856	100	100			20432	next unless $type eq 'mRNA' or $type =~ /RNA/;
2595
2596							# an mRNA from genbank will have a discontinuous location,
2597							# with each sub-location being equivalent to an exon
2598	1991					3068	my @locs = $sf->location;
2599
2600	1991	100				2681	if ($sf->location->isa("Bio::Location::SplitLocationI")) {
2601	1762					2336	@locs = $sf->location->each_Location;
2602							}
2603
2604	1991	50				4075	if (!@locs) {
2605	3			3		15	use Data::Dumper;
	3					3
	3					3107
2606	0					0	print Dumper $sf;
2607	0					0	$self->throw("ASSERTION ERROR: sf has no location objects");
2608							}
2609
2610							# make exons from locations
2611							my @subsfs =
2612							map {
2613	1991					2466	my $subsf = Bio::SeqFeature::Generic->new(-location=>$_,
	18260					49173
2614							-primary_tag=>'exon');
2615							## Provide seq_id to new feature:
2616	18260	50				29966	$subsf->seq_id($sf->seq_id) if $sf->seq_id;
2617	18260	50				26476	$subsf->source_tag($sf->source_tag) if $sf->source_tag;
2618							## Transfer /locus_tag and /gene tag values to inferred
2619							## features. TODO: Perhaps? this should not be done
2620							## indiscriminantly but rather by virtue of the setting
2621							## of group_tag.
2622	18260					32465	foreach my $tag (grep /gene\|locus_tag/, $sf->get_all_tags) {
2623	34683					47126	my @vals = $sf->get_tag_values($tag);
2624	34683					49312	$subsf->add_tag_value($tag, @vals);
2625							}
2626
2627	18260					38969	my $locstr = 'exon::'.$self->_locstr($subsf);
2628
2629							# re-use feature if type and location the same
2630	18260	100				32151	if ($loc_h{$locstr}) {
2631	7266					8637	$subsf = $loc_h{$locstr};
2632							}
2633							else {
2634	10994					17155	$loc_h{$locstr} = $subsf;
2635							}
2636	18260					29015	$subsf;
2637							} @locs;
2638
2639							# PARANOID CHECK
2640	1991					3831	$self->_check_order_is_consistent($sf->location->strand,@subsfs);
2641							#----
2642
2643	1991					4371	$sf->location(Bio::Location::Simple->new());
2644
2645							# we allow the exons to define the boundaries of the transcript
2646	1991					5701	$sf->add_SeqFeature($_, 'EXPAND') foreach @subsfs;
2647
2648
2649	1991	50				3029	if (!$sf->location->strand) {
2650							# correct weird bioperl bug in previous versions;
2651							# strand was not being set correctly
2652	0					0	$sf->location->strand($subsfs[0]->location->strand);
2653							}
2654
2655
2656							}
2657	2035					8862	return;
2658							}
2659
2660							#sub merge_features_with_same_loc {
2661							# my ($self,@args) = @_;
2662
2663							# my($sfs, $seq) =
2664							# $self->_rearrange([qw(FEATURES
2665							# SEQ
2666							# )],
2667							# @args);
2668							# my @sfs = (@$sfs);
2669							# if ($seq) {
2670							# $seq->isa("Bio::SeqI") \|\| $self->throw("$seq NOT A SeqI");
2671							# @sfs = $seq->get_all_SeqFeatures;
2672							# }
2673
2674
2675							# my %loc_h = ();
2676							# foreach my $sf (@sfs) {
2677							# my $type = $sf->primary_tag;
2678							# my $locstr = $self->_locstr($sf);
2679							## $loc_h{$type.$locstr}
2680							# push(@{$exon_h{$self->_locstr($_)}}, $_) foreach @exons;
2681							# }
2682							#}
2683
2684							=head2 infer_mRNA_from_CDS
2685
2686							Title : infer_mRNA_from_CDS
2687							Usage :
2688							Function:
2689							Example :
2690							Returns :
2691							Args :
2692
2693							given a "type 1" containment hierarchy
2694
2695							gene
2696							CDS
2697							exon
2698
2699							this will infer the uniform "type 0" containment hierarchy
2700
2701							gene
2702							mRNA
2703							CDS
2704							exon
2705
2706							all the children of the CDS will be moved to the mRNA
2707
2708							a "type 2" containment hierarchy is mixed type "0" and "1" (for
2709							example, see ftp.ncbi.nih.gov/genomes/Schizosaccharomyces_pombe/)
2710
2711							=cut
2712
2713							sub infer_mRNA_from_CDS{
2714	1			1	1	3	my ($self,@args) = @_;
2715
2716	1					6	my($sf, $seq, $noinfer) =
2717							$self->_rearrange([qw(FEATURE
2718							SEQ
2719							NOINFER
2720							)],
2721							@args);
2722	1					3	my @sfs = ($sf);
2723	1	50				4	if ($seq) {
2724	1	50				6	$seq->isa("Bio::SeqI") \|\| $self->throw("$seq NOT A SeqI");
2725	1					5	@sfs = $seq->get_all_SeqFeatures;
2726							}
2727
2728	1					4	foreach my $sf (@sfs) {
2729
2730	52	50				124	$sf->isa("Bio::SeqFeatureI") \|\| $self->throw("$sf NOT A SeqFeatureI");
2731	52	50				96	$sf->isa("Bio::FeatureHolderI") \|\| $self->throw("$sf NOT A FeatureHolderI");
2732	52	50				87	if ($self->verbose > 0) {
2733	0					0	printf STDERR " Checking $sf %s\n", $sf->primary_tag;
2734							}
2735
2736	52	50				78	if ($sf->primary_tag eq 'mRNA') {
2737	0					0	$self->problem(2,
2738							"Inferring mRNAs when there are already mRNAs present");
2739							}
2740
2741	52					80	my @cdsl = grep {$_->primary_tag eq 'CDS' } $sf->get_SeqFeatures;
	24					35
2742	52	100				91	if (@cdsl) {
2743	24					37	my @children = grep {$_->primary_tag ne 'CDS'} $sf->get_SeqFeatures;
	24					33
2744	24					27	my @mrnas = ();
2745
2746
2747	24					27	foreach my $cds (@cdsl) {
2748
2749	24	50				38	if ($self->verbose > 0) {
2750	0					0	print " Inferring mRNA from CDS $cds\n";
2751							}
2752	24					42	$self->_check_order_is_consistent($cds->location->strand,$cds->location->each_Location);
2753
2754	24					61	my $loc = Bio::Location::Split->new;
2755	24					50	foreach my $cdsexonloc ($cds->location->each_Location) {
2756	124					235	my $subloc =
2757							Bio::Location::Simple->new(-start=>$cdsexonloc->start,
2758							-end=>$cdsexonloc->end,
2759							-strand=>$cdsexonloc->strand);
2760	124					249	$loc->add_sub_Location($subloc);
2761							}
2762	24	50				40	if ($noinfer) {
2763	0					0	push(@mrnas, $cds);
2764							}
2765							else {
2766							# share the same location
2767	24					88	my $mrna =
2768							Bio::SeqFeature::Generic->new(-location=>$loc,
2769							-primary_tag=>'mRNA');
2770
2771							## Provide seq_id to new feature:
2772	24	50				52	$mrna->seq_id($cds->seq_id) if $cds->seq_id;
2773	24	50				39	$mrna->source_tag($cds->source_tag) if $cds->source_tag;
2774
2775	24					43	$self->_check_order_is_consistent($mrna->location->strand,$mrna->location->each_Location);
2776
2777							# make the mRNA hold the CDS; no EXPAND option,
2778							# the CDS cannot be wider than the mRNA
2779	24					58	$mrna->add_SeqFeature($cds);
2780
2781							# mRNA steals children of CDS
2782	24					46	foreach my $subsf ($cds->get_SeqFeatures) {
2783	0					0	$mrna->add_SeqFeature($subsf);
2784							}
2785	24					45	$cds->remove_SeqFeatures;
2786	24					42	push(@mrnas, $mrna);
2787							}
2788							}
2789							# change gene/CDS to gene/mRNA
2790	24					36	$sf->remove_SeqFeatures;
2791	24					49	$sf->add_SeqFeature($_) foreach (@mrnas, @children);
2792							}
2793							}
2794	1					8	return;
2795
2796
2797							}
2798
2799							=head2 remove_types
2800
2801							Title : remove_types
2802							Usage : $unf->remove_types(-seq=>$seq, -types=>["mRNA"]);
2803							Function:
2804							Example :
2805							Returns :
2806							Args :
2807
2808							removes features of a set type
2809
2810							useful for pre-filtering a genbank record; eg to get rid of STSs
2811
2812							also, there is no way to unflatten
2813							ftp.ncbi.nih.gov/genomes/Schizosaccharomyces_pombe/ UNLESS the bogus
2814							mRNAs in these records are removed (or changed to a different type) -
2815							they just confuse things too much
2816
2817							=cut
2818
2819							sub remove_types{
2820	0			0	1	0	my ($self,@args) = @_;
2821
2822	0					0	my($seq, $types) =
2823							$self->_rearrange([qw(
2824							SEQ
2825							TYPES
2826							)],
2827							@args);
2828	0	0				0	$seq->isa("Bio::SeqI") \|\| $self->throw("$seq NOT A SeqI");
2829	0					0	my @sfs = $seq->get_all_SeqFeatures;
2830	0					0	my %rh = map {$_=>1} @$types;
	0					0
2831	0					0	@sfs = grep {!$rh{$_->primary_tag}} @sfs;
	0					0
2832	0					0	$seq->remove_SeqFeatures;
2833	0					0	$seq->add_SeqFeature($_) foreach @sfs;
2834	0					0	return;
2835							}
2836
2837
2838							# _check_order_is_consistent($strand,$ranges) RETURNS BOOL
2839							#
2840							# note: the value of this test is moot - there are many valid,
2841							# if unusual cases where it would flag an anomaly. for example
2842							# transpliced genes such as mod(mdg4) in dmel on AE003744, and
2843							# the following spliced gene on NC_001284:
2844							#
2845							# mRNA complement(join(20571..20717,21692..22086,190740..190761,
2846							# 140724..141939,142769..142998))
2847							# /gene="nad5"
2848							# /note="trans-splicing, RNA editing"
2849							# /db_xref="GeneID:814567"
2850							#
2851							# note how the exons are not in order
2852							# this will flag a level-3 warning, the user of this module
2853							# can ignore this and deal appropriately with the resulting
2854							# unordered exons
2855							sub _check_order_is_consistent {
2856	2039			2039		1768	my $self = shift;
2857
2858	2039					1699	my $parent_strand = shift; # this does nothing..?
2859	2039					3012	my @ranges = @_;
2860	2039	50				3251	return unless @ranges;
2861							my $rangestr =
2862	2039					2518	join(" ",map{sprintf("[%s,%s]",$_->start,$_->end)} @ranges);
	18508					24773
2863	2039					4557	my $strand = $ranges[0]->strand;
2864	2039					4539	for (my $i=1; $i<@ranges;$i++) {
2865	16469	50				18927	if ($ranges[$i]->strand != $strand) {
2866	0					0	$self->problem(1,"inconsistent strands. Trans-spliced gene? Range: $rangestr");
2867	0					0	return 1;
2868							# mixed ranges - autopass
2869							# some mRNAs have exons on both strands; for
2870							# example, the dmel mod(mdg4) gene which is
2871							# trans-spliced (in actual fact two mRNAs)
2872							}
2873							}
2874	2039					1867	my $pass = 1;
2875	2039					2953	for (my $i=1; $i<@ranges;$i++) {
2876	16469					13885	my $rangeP = $ranges[$i-1];
2877	16469					12836	my $range = $ranges[$i];
2878	16469	50				20730	if ($rangeP->start > $range->end) {
2879	0	0				0	if ($self->seq->is_circular) {
2880							# see for example NC_006578.gbk
2881							# we make exceptions for circular genomes here.
2882							# see Re: [Gmod-ajax] flatfile-to-json.pl error with GFF
2883							# 2010-07-26
2884							}
2885							else {
2886							# failed - but still get one more chance..
2887	0					0	$pass = 0;
2888	0					0	$self->problem(2,"Ranges not in correct order. Strange ensembl genbank entry? Range: $rangestr");
2889	0					0	last;
2890							}
2891							}
2892							}
2893
2894	2039	50				3367	if (!$pass) {
2895							# sometimes (eg ensembl flavour genbank files)
2896							# exons on reverse strand listed in reverse order
2897							# eg join(complement(R1),...,complement(Rn))
2898							# where R1 > R2
2899	0					0	for (my $i=1; $i<@ranges;$i++) {
2900	0					0	my $rangeP = $ranges[$i-1];
2901	0					0	my $range = $ranges[$i];
2902	0	0				0	if ($rangeP->end < $range->start) {
2903	0					0	$self->problem(3,"inconsistent order. Range: $rangestr");
2904	0					0	return 0;
2905							}
2906							}
2907							}
2908	2039					2939	return 1; # pass
2909							}
2910
2911							# PRIVATE METHOD: _locstr($sf)
2912							#
2913							# returns a location string for a feature; just the outer boundaries
2914							sub _locstr {
2915	18524			18524		13441	my $self = shift;
2916	18524					11908	my $sf = shift;
2917							return
2918	18524					27981	sprintf("%d..%d", $sf->start, $sf->end);
2919							}
2920
2921							sub iterate_containment_tree {
2922	0			0	0		my $self = shift;
2923	0						my $feature_holder = shift;
2924	0						my $sub = shift;
2925	0						$sub->($feature_holder);
2926	0						my @sfs = $feature_holder->get_SeqFeatures;
2927	0						$self->iterate_containment_tree($_) foreach @sfs;
2928							}
2929
2930							sub find_best_pairs {
2931	0			0	0		my $matrix = shift;
2932	0						my $size = shift;
2933	0		0				my $i = shift \|\| 0;
2934
2935	0						for (my $j=0; $j < $size; $j++) {
2936	0						my $score = $matrix->[$i][$j];
2937	0	0					if (!defined($score)) {
2938	0						next;
2939							}
2940
2941							}
2942
2943							}
2944
2945							1;