File Coverage

Bio/SeqFeature/Tools/TypeMapper.pm
Criterion Covered Total %
statement 6 49 12.2
branch 0 30 0.0
condition 0 9 0.0
subroutine 2 8 25.0
pod 5 6 83.3
total 13 102 12.7


line stmt bran cond sub pod time code
1             #
2             # bioperl module for Bio::SeqFeature::Tools::TypeMapper
3             #
4             # Please direct questions and support issues to
5             #
6             # Cared for by Chris Mungall
7             #
8             # Copyright Chris Mungall
9             #
10             # You may distribute this module under the same terms as perl itself
11              
12             # POD documentation - main docs before the code
13              
14             =head1 NAME
15              
16             Bio::SeqFeature::Tools::TypeMapper - maps $seq_feature-E<gt>primary_tag
17              
18             =head1 SYNOPSIS
19              
20             use Bio::SeqIO;
21             use Bio::SeqFeature::Tools::TypeMapper;
22              
23             # first fetch a genbank SeqI object
24             $seqio =
25             Bio::SeqIO->new(-file=>'AE003644.gbk',
26             -format=>'GenBank');
27             $seq = $seqio->next_seq();
28              
29             $tm = Bio::SeqFeature::Tools::TypeMapper->new;
30              
31             # map all the types in the sequence
32             $tm->map_types(-seq=>$seq,
33             -type_map=>{CDS=>'ORF',
34             variation=>sub {
35             my $f = shift;
36             $f->length > 1 ?
37             'variation' : 'SNP'
38             },
39             });
40              
41             # alternatively, use the hardcoded SO mapping
42             $tm->map_types_to_SO(-seq=>$seq);
43              
44             =head1 DESCRIPTION
45              
46             This class implements an object for mapping between types; for
47             example, the types in a genbank feature table, and the types specified
48             in the Sequence Ontology.
49              
50             You can specify your own mapping, either as a simple hash index, or by
51             providing your own subroutines.
52              
53             =head1 FEEDBACK
54              
55             =head2 Mailing Lists
56              
57             User feedback is an integral part of the evolution of this and other
58             Bioperl modules. Send your comments and suggestions preferably to the
59             Bioperl mailing lists. Your participation is much appreciated.
60              
61             bioperl-l@bioperl.org - General discussion
62             http://bioperl.org/wiki/Mailing_lists - About the mailing lists
63              
64             =head2 Support
65              
66             Please direct usage questions or support issues to the mailing list:
67              
68             I<bioperl-l@bioperl.org>
69              
70             rather than to the module maintainer directly. Many experienced and
71             responsive experts will be able to look at the problem and quickly
72             address it. Please include a thorough description of the problem
73             with code and data examples if at all possible.
74              
75             =head2 Reporting Bugs
76              
77             Report bugs to the Bioperl bug tracking system to help us keep track
78             of the bugs and their resolution. Bug reports can be submitted via the
79             web:
80              
81             https://github.com/bioperl/bioperl-live/issues
82              
83             =head1 AUTHOR - Chris Mungall
84              
85             Email: cjm@fruitfly.org
86              
87             =head1 APPENDIX
88              
89             The rest of the documentation details each of the object
90             methods. Internal methods are usually preceded with a _
91              
92             =cut
93              
94              
95             # Let the code begin...
96              
97             package Bio::SeqFeature::Tools::TypeMapper;
98 2     2   12 use strict;
  2         4  
  2         57  
99              
100             # Object preamble - inherits from Bio::Root::Root
101              
102 2     2   11 use base qw(Bio::Root::Root);
  2         5  
  2         1411  
103              
104             =head2 new
105              
106             Title : new
107             Usage : $tm = Bio::SeqFeature::Tools::TypeMapper->new();
108             Function: constructor
109             Example :
110             Returns : a new Bio::SeqFeature::Tools::TypeMapper
111             Args : -typemap => hash ref of type mappings (optional)
112              
113              
114             =cut
115              
116             sub new {
                # Constructor. Builds the object through the parent class's new(),
                # then looks for an optional TYPEMAP named argument and, when it is
                # true, stores it through the typemap() accessor.
                # Presumably the argument is spelled -typemap by the caller, by
                # analogy with the -feature / -seq names quoted in map_types' error
                # message -- confirm against _rearrange.
                # Returns the new object.
117 0     0 1   my($class,@args) = @_;
118 0           my $self = $class->SUPER::new(@args);
119              
                # Extract the one named argument this class cares about.
120 0           my($typemap) =
121             $self->_rearrange([qw(TYPEMAP
122             )],
123             @args);
124              
                # Short-circuit: the setter is only called for a true value.
125 0 0         $typemap && $self->typemap($typemap);
126 0           return $self; # success - we hope!
127             }
128              
129             =head2 typemap
130              
131             Title : typemap
132             Usage : $obj->typemap($newval)
133             Function: get/set the type map that map_types falls back to when no -type_map is passed
134             Example :
135             Returns : value of typemap (a scalar)
136             Args : on set, new value (a scalar or undef, optional)
137              
138              
139             =cut
140              
141             sub typemap{
                # Combined getter/setter for the object's 'typemap' slot.
                # With an argument (even undef) the slot is overwritten and the new
                # value is returned; with no argument the stored value is returned.
142 0     0 1   my $self = shift;
143              
                # @_ is non-empty only when the caller passed a value to set.
144 0 0         return $self->{'typemap'} = shift if @_;
145 0           return $self->{'typemap'};
146             }
147              
148             =head2 map_types
149              
150             Title : map_types
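151             Usage : $tm->map_types(-seq=>$seq, -type_map=>{CDS=>'ORF'});
152             Function: replaces the primary_tag of each feature using a type map
153             Example :
154             Returns : nothing
155             Args : -feature or -seq (one is required), -type_map (optional), -undefined (optional)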
156              
157             dgg: added -undefined => "region" option to produce all valid SO mappings.
158              
159             =cut
160              
161             sub map_types{
                # Rewrites the primary_tag of one feature, or of every feature of a
                # sequence, according to a type map.
                #
                # Named arguments (read through _rearrange):
                #   FEATURE   - a single feature to remap
                #   SEQ       - a Bio::SeqI whose features are all remapped
                #   TYPE_MAP  - hash ref: primary_tag => new type string or CODE ref
                #   UNDEFINED - replacement used when the mapped string is 'undefined'
                #
                # Throws when neither FEATURE nor SEQ is given, when SEQ is not a
                # Bio::SeqI, when a feature is not both a Bio::SeqFeatureI and a
                # Bio::FeatureHolderI, or when a map value is a non-CODE reference.
                # Returns nothing (bare return).
162 0     0 1   my ($self,@args) = @_;
163              
164 0           my($sf, $seq, $type_map, $undefmap) =
165             $self->_rearrange([qw(FEATURE
166             SEQ
167             TYPE_MAP
168             UNDEFINED
169             )],
170             @args);
171 0 0 0       if (!$sf && !$seq) {
172 0           $self->throw("you need to pass in either -feature or -seq");
173             }
174              
                # Start with the single feature; a sequence, when given, replaces
                # that list with all of its features.
175 0           my @sfs = ($sf);
176 0 0         if ($seq) {
177 0 0         $seq->isa("Bio::SeqI") || $self->throw("$seq NOT A SeqI");
178 0           @sfs = $seq->get_all_SeqFeatures;
179             }
                # Fall back to the map stored on the object when none was passed.
180 0   0       $type_map = $type_map || $self->typemap; # dgg: was type_map;
                # The loop variable $sf shadows the outer $sf declared above.
181 0           foreach my $sf (@sfs) {
182              
183 0 0         $sf->isa("Bio::SeqFeatureI") || $self->throw("$sf NOT A SeqFeatureI");
184 0 0         $sf->isa("Bio::FeatureHolderI") || $self->throw("$sf NOT A FeatureHolderI");
185              
                # Exact (case-sensitive) hash lookup on the current primary_tag.
                # Tags with no true map value are left untouched.
186 0           my $type = $sf->primary_tag;
187 0           my $mtype = $type_map->{$type};
188 0 0         if ($mtype) {
189 0 0 0       if (ref($mtype)) {
    0          
                # A CODE ref is called with the feature; its result is the new type.
190 0 0         if (ref($mtype) eq 'CODE') {
191 0           $mtype = $mtype->($sf);
192             }
193             else {
                # NOTE(review): the table returned by FT_SO_map in this file uses
                # array refs as values, and those land here, so map_types_to_SO
                # throws for any feature whose tag is in that table -- confirm
                # whether array refs should be accepted.
194 0           $self->throw('type_map values must be scalar or CODE ref. You said: '.$mtype.' for type: '.$type);
195             }
196             }
                # Plain string 'undefined' is swapped for the -undefined value
                # when one was supplied.
197             elsif ($undefmap && $mtype eq 'undefined') { # dgg
198 0           $mtype= $undefmap;
199             }
200 0           $sf->primary_tag($mtype);
201             }
202             }
203 0           return;
204             }
205              
206             =head2 map_types_to_SO
207              
208             Title : map_types_to_SO
209             Usage :
210             Function:
211             Example :
212             Returns :
213             Args :
214              
215             hardcodes the genbank to SO mapping
216              
217             Based on revision 1.22 of SO
218              
219             Please see the actual code for the mappings
220              
221             Taken from
222              
223             L<http://song.sourceforge.net/FT_SO_map.txt>
224              
225             dgg: separated out FT_SO_map for caller changes. Update with:
226              
227             open(FTSO,"curl -s http://sequenceontology.org/resources/mapping/FT_SO.txt|");
228             while(<FTSO>){
229             chomp; ($ft,$so,$sid,$ftdef,$sodef)= split"\t";
230             print " '$ft' => '$so',\n" if($ft && $so && $ftdef);
231             }
232              
233             =cut
234              
235             sub FT_SO_map {
236             # $self= shift;
237             # note : some of the ft_so mappings are commented out and overridden...
                # Returns a newly built hash ref on every call; no argument is read.
                # Keys are feature-table type names. Most values are two-element
                # array refs of [ term name, id string ]; exceptions visible below:
                #   - "unsure" maps to the plain string "undefined"
                #   - "misc_feature" has undef as its id
                #   - several keys map to [ "undefined", "" ]
                # NOTE(review): map_types looks tags up with an exact hash-key match,
                # so lower-case keys such as "cds" or "3'utr" only match features
                # whose primary_tag is spelled the same way -- confirm.
                # NOTE(review): ids use both "so:" and "SO:" prefixes -- confirm
                # whether consumers treat them case-insensitively.
238             return {
239 0     0 0   "-" => ["located_sequence_feature", "so:0000110"],
240             "-10_signal" => ["minus_10_signal", "so:0000175"],
241             "-35_signal" => ["minus_35_signal", "so:0000176"],
242             "3'utr" => ["three_prime_utr", "so:0000205"],
243             "3'clip" => ["three_prime_clip", "so:0000557"],
244             "5'utr" => ["five_prime_utr", "so:0000204"],
245             "5'clip" => ["five_prime_clip", "so:0000555"],
246             "caat_signal" => ["caat_signal", "so:0000172"],
247             "cds" => ["cds", "so:0000316"],
248             "c_region" => ["undefined", ""],
249             "d-loop" => ["d_loop", "so:0000297"],
250             "d_segment" => ["d_gene", "so:0000458"],
251             "gc_signal" => ["gc_rich_region", "so:0000173"],
252             "j_segment" => ["undefined", ""],
253             "ltr" => ["long_terminal_repeat", "so:0000286"],
254             "n_region" => ["undefined", ""],
255             "rbs" => ["ribosome_entry_site", "so:0000139"],
256             "sts" => ["sts", "so:0000331"],
257             "s_region" => ["undefined", ""],
258             "tata_signal" => ["tata_box", "so:0000174"],
259             "v_region" => ["undefined", ""],
260             "v_segment" => ["undefined", ""],
261             "attenuator" => ["attenuator", "so:0000140"],
262             "conflict" => ["undefined", ""],
263             "enhancer" => ["enhancer", "so:0000165"],
264             "exon" => ["exon", "so:0000147"],
265             "gap" => ["gap", "so:0000730"],
266             "gene" => ["gene", "so:0000704"],
267             "idna" => ["idna", "so:0000723"],
268             "intron" => ["intron", "so:0000188"],
269             "mRNA" => ["mRNA", "so:0000234"],
270             "mat_peptide" => ["mature_protein_region", "so:0000419"],
271             "mature_peptide" => ["mature_protein_region", "so:0000419"],
272             #"misc_RNA" => ["transcript", "so:0000673"],
273             "misc_binding" => ["binding_site", "so:0000409"],
274             "misc_difference" => ["sequence_difference", "so:0000413"],
275             "misc_feature" => ["region", undef],
276             "misc_recomb" => ["recombination_feature", "so:0000298"],
277             "misc_signal" => ["regulatory_region", "so:0005836"],
278             "misc_structure" => ["sequence_secondary_structure", "so:0000002"],
279             "modified_base" => ["modified_base_site", "so:0000305"],
280             "old_sequence" => ["undefined", ""],
281             "operon" => ["operon", "so:0000178"],
282             "oriT" => ["origin_of_transfer", "so:0000724"],
283             "polya_signal" => ["polyA_signal_sequence", "so:0000551"],
284             "polya_site" => ["polyA_site", "so:0000553"],
285             "precursor_RNA" => ["primary_transcript", "so:0000185"],
286             "prim_transcript" => ["primary_transcript", "so:0000185"],
287             "primer_bind" => ["primer_binding_site", "so:0005850"],
288             "promoter" => ["promoter", "so:0000167"],
289             "protein_bind" => ["protein_binding_site", "so:0000410"],
290             "rRNA" => ["rRNA", "so:0000252"],
291             "repeat_region" => ["repeat_region", "so:0000657"],
292             "repeat_unit" => ["repeat_unit", "so:0000726"],
293             "satellite" => ["satellite_dna", "so:0000005"],
294             "scRNA" => ["scRNA", "so:0000013"],
295             "sig_peptide" => ["signal_peptide", "so:0000418"],
296             "snRNA" => ["snRNA", "so:0000274"],
297             "snoRNA" => ["snoRNA", "so:0000275"],
298             #"source" => ["databank_entry", "so:2000061"],
299             "stem_loop" => ["stem_loop", "so:0000313"],
300             "tRNA" => ["tRNA", "so:0000253"],
301             "terminator" => ["terminator", "so:0000141"],
302             "transit_peptide" => ["transit_peptide", "so:0000725"],
                # Unlike its neighbours, this value is a plain string, not an array ref.
303             "unsure" => "undefined",
304             "variation" => ["sequence_variant", "so:0000109"],
305              
306             # manually added
307             ## has parent = pseudogene ; dgg
308             "pseudomRNA" => ["pseudogenic_transcript", "so:0000516"],
309             ## from unflattener misc_rna ; dgg
310             "pseudotranscript" => ["pseudogenic_transcript", "so:0000516"],
311             "pseudoexon" => ["pseudogenic_exon", "so:0000507"],
312             "pseudoCDS" => ["pseudogenic_exon", "so:0000507"],
313             "pseudomisc_feature" => ["pseudogenic_region", "so:0000462"],
314             "pseudointron" => ["pseudogenic_region", "so:0000462"],
315              
316              
317             ## "undefined" => "region",
318              
                # The three entries below use bareword keys; misc_RNA and source
                # take the place of the commented-out quoted entries further up.
319             # this is the most generic form for rnas;
320             # we always represent the processed form of
321             # the transcript
322             misc_RNA => ['mature_transcript',"so:0000233"],
323              
324             # not sure about this one...
325             source=>['contig', "SO:0000149"],
326              
327             rep_origin=>['origin_of_replication',"SO:0000296"],
328              
329             Protein=>['polypeptide',"SO:0000104"],
330             };
                # Everything from here to the closing brace is a commented-out table
                # with plain string values instead of array refs; it is never run.
331             # return {
332             #"FT term" => "SO term",
333             #"-" => "located_sequence_feature",
334             #"-10_signal" => "minus_10_signal",
335             #"-35_signal" => "minus_35_signal",
336             #"3'UTR" => "three_prime_UTR",
337             #"3'clip" => "three_prime_clip",
338             #"5'UTR" => "five_prime_UTR",
339             #"5'clip" => "five_prime_clip",
340             #"CAAT_signal" => "CAAT_signal",
341             #"CDS" => "CDS",
342             #"C_region" => "undefined",
343             #"D-loop" => "D_loop",
344             #"D_segment" => "D_gene",
345             #"GC_signal" => "GC_rich_region",
346             #"J_segment" => "undefined",
347             #"LTR" => "long_terminal_repeat",
348             #"N_region" => "undefined",
349             #"RBS" => "ribosome_entry_site",
350             #"STS" => "STS",
351             #"S_region" => "undefined",
352             #"TATA_signal" => "TATA_box",
353             #"V_region" => "undefined",
354             #"V_segment" => "undefined",
355             #"attenuator" => "attenuator",
356             #"conflict" => "undefined",
357             #"enhancer" => "enhancer",
358             #"exon" => "exon",
359             #"gap" => "gap",
360             #"gene" => "gene",
361             #"iDNA" => "iDNA",
362             #"intron" => "intron",
363             #"mRNA" => "mRNA",
364             #"mat_peptide" => "mature_protein_region",
365             #"mature_peptide" => "mature_protein_region",
366             ## "misc_RNA" => "transcript",
367             #"misc_binding" => "binding_site",
368             #"misc_difference" => "sequence_difference",
369             #"misc_feature" => "region",
370             #"misc_recomb" => "recombination_feature",
371             #"misc_signal" => "regulatory_region",
372             #"misc_structure" => "sequence_secondary_structure",
373             #"modified_base" => "modified_base_site",
374             #"old_sequence" => "undefined",
375             #"operon" => "operon",
376             #"oriT" => "origin_of_transfer",
377             #"polyA_signal" => "polyA_signal_sequence",
378             #"polyA_site" => "polyA_site",
379             #"precursor_RNA" => "primary_transcript",
380             #"prim_transcript" => "primary_transcript",
381             #"primer_bind" => "primer_binding_site",
382             #"promoter" => "promoter",
383             #"protein_bind" => "protein_binding_site",
384             #"rRNA" => "rRNA",
385             #"repeat_region" => "repeat_region",
386             #"repeat_unit" => "repeat_unit",
387             #"satellite" => "satellite_DNA",
388             #"scRNA" => "scRNA",
389             #"sig_peptide" => "signal_peptide",
390             #"snRNA" => "snRNA",
391             #"snoRNA" => "snoRNA",
392             ## "source" => "databank_entry",
393             #"stem_loop" => "stem_loop",
394             #"tRNA" => "tRNA",
395             #"terminator" => "terminator",
396             #"transit_peptide" => "transit_peptide",
397             #"unsure" => "undefined",
398             #"variation" => "sequence_variant",
399              
400             #"pseudomRNA" => "pseudogenic_transcript", ## has parent = pseudogene ; dgg
401             #"pseudotranscript" => "pseudogenic_transcript", ## from Unflattener misc_RNA ; dgg
402             #"pseudoexon" => "pseudogenic_exon",
403             #"pseudoCDS" => "pseudogenic_exon",
404             #"pseudomisc_feature" => "pseudogenic_region",
405             #"pseudointron" => "pseudogenic_region",
406            
407             ### "undefined" => "region",
408              
409             ## this is the most generic form for RNAs;
410             ## we always represent the processed form of
411             ## the transcript
412             #misc_RNA=>'processed_transcript',
413            
414             ## not sure about this one...
415             #source=>'contig',
416            
417             #rep_origin=>'origin_of_replication',
418            
419             #Protein=>'protein',
420             #};
421             }
422              
423             sub map_types_to_SO{
                # Convenience wrapper: appends -type_map => FT_SO_map() to the
                # caller's arguments and hands everything to map_types, returning
                # whatever map_types returns.
                # NOTE(review): the FT_SO_map table holds array-ref values while
                # map_types only accepts strings and CODE refs -- confirm this
                # path works for tags present in the table.
424 0     0 1   my ($self,@args) = @_;
425              
                # The pair is pushed after any arguments the caller supplied.
426 0           push(@args, (-type_map=> $self->FT_SO_map() ) );
427 0           return $self->map_types(@args);
428             }
429              
430             =head2 get_relationship_type_by_parent_child
431              
432             Title : get_relationship_type_by_parent_child
433             Usage : $type = $tm->get_relationship_type_by_parent_child($parent_sf, $child_sf);
434             Usage : $type = $tm->get_relationship_type_by_parent_child('mRNA', 'protein');
435             Function: given two features where the parent contains the child,
436             will determine the type of the relationship between them
437             Example :
438             Returns : relationship type string ('part_of' or 'derives_from')
439             Args : parent SeqFeature, child SeqFeature OR
440             parent type string, child type string
441              
442             bioperl Seq::FeatureHolderI hierarchies are equivalent to unlabeled
443             graphs (where parent nodes are the containers, and child nodes are the
444             features being contained). For example, a feature of type mRNA can
445             contain features of type exon.
446              
447             Some external representations (eg chadoxml or chaosxml) require that
448             the edges in the feature relationship graph are labeled. For example,
449             the type between mRNA and exon would be B<part_of>. Although it
450             stretches the bioperl notion of containment, we could have a CDS
451             contained by an mRNA (for example, the
452             L<Bio::SeqFeature::Tools::Unflattener> module takes genbank records
453             and makes these kind of links). The relationship here would be
454             B<produced_by>
455              
456             In chado speak, the child is the B<subject> feature and the parent is
457             the B<object> feature
458              
459             =cut
460              
461             sub get_relationship_type_by_parent_child {
                # Returns the relationship label for a parent/child pair.
                # Each argument may be a reference (its primary_tag is used) or a
                # plain type string. The result is 'derives_from' when the child
                # type, compared case-insensitively, is 'protein' or 'polypeptide';
                # otherwise it is 'part_of'.
                # The parent type is resolved but does not affect the result.
462 0     0 1   my ($self,$parent,$child) = @_;
                # Reduce feature objects to their type strings.
463 0 0         $parent = ref($parent) ? $parent->primary_tag : $parent;
464 0 0         $child = ref($child) ? $child->primary_tag : $child;
465              
466 0           my $type = 'part_of'; # default
467              
468             # TODO - do this with metadata, or infer via SO itself
469              
470 0 0         if (lc($child) eq 'protein') {
471 0           $type = 'derives_from';
472             }
473 0 0         if (lc($child) eq 'polypeptide') {
474 0           $type = 'derives_from';
475             }
476 0           return $type;
477             }
478              
479              
480             1;