File Coverage

blib/lib/CracTools/Annotator.pm
Criterion Covered Total %
statement 171 221 77.3
branch 33 86 38.3
condition 3 30 10.0
subroutine 23 24 95.8
pod 12 12 100.0
total 242 373 64.8


line stmt bran cond sub pod time code
1             package CracTools::Annotator;
2             {
3             $CracTools::Annotator::DIST = 'CracTools';
4             }
5             # ABSTRACT: Generic annotation base on CracTools::GFF::Query::File
6             $CracTools::Annotator::VERSION = '1.25';
7 1     1   13163 use strict;
  1         1  
  1         25  
8 1     1   3 use warnings;
  1         1  
  1         21  
9              
10 1     1   3 use Carp;
  1         1  
  1         70  
11 1     1   4 use List::Util qw[min max];
  1         1  
  1         82  
12              
13 1     1   317 use CracTools::Const;
  1         1  
  1         35  
14 1     1   343 use CracTools::GFF::Annotation;
  1         2  
  1         27  
15 1     1   428 use CracTools::Interval::Query;
  1         2  
  1         45  
16 1     1   462 use CracTools::Interval::Query::File;
  1         2  
  1         1452  
17              
18              
19             sub new {
20 1     1 1 1463 my $class = shift;
21 1         2 my $gff_file = shift;
22 1         2 my $mode = shift;
23              
24 1 50       3 $mode = 'light' if !defined $mode;
25              
26 1 50       4 if(!defined $gff_file) {
27 0         0 croak "Missing GFF file argument in CracTools::Annotator constructor";
28             }
29              
30 1         3 my $self = bless {
31             gff_file => $gff_file,
32             mode => $mode,
33             }, $class;
34              
35 1         6 $self->_init();
36              
37 1         3 return $self;
38             }
39              
40              
41             sub mode {
42 48     48 1 33 my $self = shift;
43 48         71 return $self->{mode};
44             }
45              
46              
47             sub foundAnnotation {
48 1     1 1 488 my $self = shift;
49             #my ($chr,$pos_start,$pos_end,$strand) = @_;
50 1         3 my @candidates = @{ $self->getAnnotationCandidates(@_)};
  1         2  
51 1         6 return (scalar @candidates > 0);
52             }
53              
54              
55             sub foundGene {
56 1     1 1 2 my $self = shift;
57 1         2 my ($chr,$pos_start,$pos_end,$strand) = @_;
58 1         2 my @candidates = @{ $self->getAnnotationCandidates($chr,$pos_start,$pos_end,$strand)};
  1         2  
59 1         2 foreach my $candidate (@candidates) {
60 1 50       7 return 1 if defined $candidate->{gene};
61             }
62 0         0 return 0;
63             }
64              
65              
66             sub foundSameGene {
67 3     3 1 266 my $self = shift;
68 3         3 my ($chr,$pos_start1,$pos_end1,$pos_start2,$pos_end2,$strand) = @_;
69 3         4 my @candidates1 = @{ $self->getAnnotationCandidates($chr,$pos_start1,$pos_end1,$strand)};
  3         5  
70 3         3 my @candidates2 = @{ $self->getAnnotationCandidates($chr,$pos_start2,$pos_end2,$strand)};
  3         6  
71 3         4 my $found_same_gene = 0;
72 3         2 my @genes1;
73             my @genes2;
74 3         4 foreach my $candi1 (@candidates1) {
75 5 50       8 if(defined $candi1->{gene}) {
76 5         8 push @genes1,$candi1->{gene}->attribute('ID');
77             }
78             }
79 3         4 foreach my $candi2 (@candidates2) {
80 5 50       8 if(defined $candi2->{gene}) {
81 5         10 push @genes2,$candi2->{gene}->attribute('ID');
82             }
83             }
84 3         4 foreach my $gene_id (@genes1) {
85 3         4 foreach (@genes2) {
86 2 50       3 if($gene_id eq $_) {
87 2         3 $found_same_gene = 1;
88 2         2 last;
89             }
90             }
91 3 100       5 last if $found_same_gene == 1;
92             }
93 3         26 return $found_same_gene;
94             }
95              
96              
97             sub getBestAnnotationCandidate {
98 1     1 1 282 my $self = shift;
99 1         4 my ($best_candidates,$best_priority,$best_type) = $self->getBestAnnotationCandidates(@_);
100 1 50       2 if(@{$best_candidates}) {
  1         11  
101 1         4 return $best_candidates->[0],$best_priority,$best_type;
102             } else {
103 0         0 return undef,undef,undef;
104             }
105             }
106              
107              
108             sub getBestAnnotationCandidates {
109 1     1 1 2 my $self = shift;
110 1         2 my ($chr,$pos_start,$pos_end,$strand,$prioritySub,$compareSub) = @_;
111              
112 1 50 33     7 if(!defined $prioritySub && !defined $compareSub) {
113 1 50       3 $prioritySub = \&getCandidatePriorityDefault unless defined $prioritySub;
114 1 50       3 $compareSub = \&compareTwoCandidatesDefault unless defined $compareSub;
115             }
116              
117 1         1 my @candidates = @{ $self->getAnnotationCandidates($chr,$pos_start,$pos_end,$strand)};
  1         3  
118 1         1 my @best_candidates;
119 1         2 my ($best_priority,$best_type);
120 1         2 foreach my $candi (@candidates) {
121 1         1 my ($priority,$type);
122 1 50       5 ($priority,$type) = $prioritySub->($pos_start,$pos_end,$candi) if defined $prioritySub;
123 1 50 33     5 if(defined $priority && $priority != -1) {
124 1 50 0     3 if(!defined $best_priority) {
    0          
    0          
125 1         1 $best_priority = $priority;
126 1         1 push @best_candidates, $candi;
127 1         2 $best_type = $type;
128             } elsif($priority < $best_priority) {
129 0         0 @best_candidates = ($candi);
130 0         0 $best_priority = $priority;
131 0         0 $best_type = $type;
132             }
133             #we should compare two candidates with equal priority to always choose the one
134             elsif (!defined $priority || $priority == $best_priority){
135 0         0 my $candidate_chosen;
136 0         0 my $found_better_candidate = 0;
137 0         0 foreach my $best_candidate (@best_candidates) {
138 0 0       0 $candidate_chosen = $compareSub->($best_candidate,$candi,$pos_start,$pos_end) if defined $compareSub;
139             # They are both equal
140 0 0       0 if (!defined $candidate_chosen) {
    0          
141             # We cannnot say if this candidate is better
142 0         0 next;
143             } elsif ($candidate_chosen == $candi) {
144             # We have found a better candidate that previously register ones
145             # we save it and remove the others
146 0         0 @best_candidates = ($candi);
147 0         0 $found_better_candidate = 1;
148 0         0 last;
149             } else {
150             # The better candidate is not "candi", so this candidates
151             # does not belong the the best_candidate array.
152             # We can stop looping
153 0         0 $found_better_candidate = 1;
154 0         0 last;
155             }
156             }
157 0 0       0 push @best_candidates, $candi if !$found_better_candidate;
158             }
159             }
160             }
161             # TODO We should not return variable in that order,
162             # it is not easy to only retrieve the best candidatse...
163 1         3 return \@best_candidates,$best_priority,$best_type;
164             }
165              
166              
167             sub getAnnotationCandidates {
168 10     10 1 9 my $self = shift;
169 10         9 my ($chr,$pos_start,$pos_end,$strand) = @_;
170             # TODO if no strand is provided we should return annotations from both strands
171              
172             # get GFF annotations that overlap the region to annotate
173 10         23 my $annotations = $self->{gff_query}->fetchByRegion($chr,$pos_start,$pos_end,$strand);
174             # get a ref of an array of hash of candidates
175 10         25 my $candidatates = $self->_constructCandidatesFromAnnotation($annotations);
176 10         21 return $candidatates;
177             }
178              
179              
180             sub getAnnotationNearestDownCandidates {
181 1     1 1 276 my $self = shift;
182 1         2 my ($chr,$pos_start,$strand) = @_;
183              
184             # get GFF annotations that overlap the pos_start to annotate
185 1         4 my $annotations_overlap = $self->{gff_query}->fetchByLocation($chr,$pos_start,$strand);
186             # get GFF annotations of nearest down intervals that not overlaped [pos_start,pos_end] pos
187 1         1 my @annotations_down;
188              
189 1         2 push @annotations_down, @{$self->{gff_query}->fetchAllNearestDown($chr,$pos_start,$strand)};
  1         4  
190              
191             # get a ref of an array of hash of candidates
192 1         3 my @annotations = (@$annotations_overlap,@annotations_down);
193 1         3 my $candidatates = $self->_constructCandidatesFromAnnotation(\@annotations);
194 1         4 return $candidatates;
195             }
196              
197              
198             sub getAnnotationNearestUpCandidates {
199 1     1 1 252 my $self = shift;
200 1         2 my ($chr,$pos_end,$strand) = @_;
201              
202             # get GFF annotations that overlap the pos_end to annotate
203 1         4 my $annotations_overlap = $self->{gff_query}->fetchByLocation($chr,$pos_end,$strand);
204             # get GFF annotations of nearest up intervals that not overlaped [pos_start,pos_end] pos
205 1         2 my @annotations_up;
206              
207 1         2 push @annotations_up, @{$self->{gff_query}->fetchAllNearestUp($chr,$pos_end,$strand)};
  1         4  
208              
209             # get a ref of an array of hash of candidates
210 1         3 my @annotations = (@$annotations_overlap,@annotations_up);
211 1         3 my $candidatates = $self->_constructCandidatesFromAnnotation(\@annotations);
212 1         3 return $candidatates;
213             }
214              
215              
216             sub getCandidatePriorityDefault {
217 2     2 1 3 my ($pos_start,$pos_end,$candidate) = @_;
218 2         5 my ($priority,$type) = (-1,'');
219 2         3 my ($mRNA,$exon) = ($candidate->{mRNA},$candidate->{exon});
220 2 50       5 if(defined $mRNA) {
221 2 50 33     4 if(defined $mRNA->attribute('type') && $mRNA->attribute('type') =~ /protein_coding/i) {
222 2 50       4 if(defined $exon) {
223 0 0 0     0 if(($exon->start <= $pos_start) && ($exon->end >= $pos_end)) {
224 0         0 $priority = 1;
225 0 0       0 if(defined $candidate->{three}) {
    0          
226 0         0 $type = '3PRIM_UTR';
227             } elsif(defined $candidate->{five}) {
228 0         0 $type = '5PRIM_UTR';
229             # } elsif(defined $candidate->{cds}) {
230             # $type = 'CDS';
231             } else {
232 0         0 $type = 'EXON';
233             }
234             } else {
235 0         0 $priority = 2;
236 0         0 $type = 'INXON';
237             }
238             } else {
239 2         3 $priority = 4;
240 2         3 $type = 'INTRON';
241             }
242             } else {
243 0 0       0 if(defined $exon) {
244 0 0 0     0 if(($exon->start <= $pos_start) && ($exon->end >= $pos_end)) {
245 0         0 $priority = 3;
246 0         0 $type = 'NON_CODING';
247             }
248             }
249             }
250             }
251 2         4 return ($priority,$type);
252             }
253              
254             sub compareTwoCandidatesDefault{
255 0     0 1 0 my ($candidate1,$candidate2,$pos_start) = @_;
256             # If both candidates are exons we try to find wich one is closer to the pos_start of the region to annotate
257 0 0 0     0 if ($candidate1->{exon} && $candidate2->{exon}) {
258 0         0 my $dist1= min(abs($candidate1->{exon}->end - $pos_start),abs($candidate1->{exon}->start - $pos_start));
259 0         0 my $dist2= min(abs($candidate2->{exon}->end - $pos_start),abs($candidate2->{exon}->start - $pos_start));
260 0 0       0 if ($dist1 > $dist2) {
    0          
261 0         0 return $candidate2;
262             } elsif ($dist1 < $dist2) {
263 0         0 return $candidate1;
264             }
265             }
266             # If we have not found a better candidate, we use the lexicographic order of the mRNA ID
267 0         0 my ($mRNA1,$mRNA2) = ($candidate1->{mRNA},$candidate2->{mRNA});
268 0 0 0     0 if(defined $mRNA1 && defined $mRNA1->attribute('ID') && defined $mRNA2 && defined $mRNA2->attribute('ID')) {
      0        
      0        
269 0 0       0 if($mRNA1->attribute('ID') lt $mRNA2->attribute('ID')) {
270 0         0 return $candidate1;
271             } else {
272 0         0 return $candidate2;
273             }
274             }
275             # If nothing has worked we return "undef"
276 0         0 return undef;
277             }
278              
279              
280             sub _init {
281 1     1   1 my $self = shift;
282 1         1 my $gff_query;
283              
284             # Create a GFF file to query exons
285 1 50       4 if($self->mode eq "fast") {
286 1         8 $gff_query = CracTools::Interval::Query->new();
287             my $gff_it = CracTools::Utils::getFileIterator(file => $self->{gff_file},
288 31     31   55 parsing_method => sub { CracTools::GFF::Annotation->new(@_) },
289 1         10 header_regex => "^#",
290             );
291 1         4 while(my $gff_annot = $gff_it->()) {
292 31         49 $gff_query->addInterval($gff_annot->chr,
293             $gff_annot->start+1,
294             $gff_annot->end+1,
295             $gff_annot->strand,
296             $gff_annot,
297             );
298             }
299             } else {
300 0         0 $gff_query = CracTools::Interval::Query::File->new(file => $self->{gff_file}, type => 'gff');
301             }
302              
303 1         4 $self->{gff_query} = $gff_query;
304             }
305              
306              
307             sub _constructCandidates {
308 66     66   53 my ($annot_id,$candidate,$annot_hash) = @_;
309              
310             # We init the "leaf_feature" value if this is the first recursion step
311 66 100       109 $candidate->{leaf_feature} = $annot_hash->{$annot_id}->feature if !defined $candidate->{leaf_feature};
312              
313 66         33 my @candidates;
314 66 50       83 if (!defined $annot_hash->{$annot_id}){
315 0         0 carp("Missing feature for $annot_id in the gff file");
316             }
317 66         104 $candidate->{$annot_hash->{$annot_id}->feature} = $annot_hash->{$annot_id};
318 66         98 my $parents = $annot_hash->{$annot_id}->parents;
319 66 100       70 if(@$parents) {
320 42         25 foreach my $parent (@{$parents}) {
  42         41  
321            
322             #Test to avoid a deep recursion
323 47 50       113 if($parent eq $annot_id) {
    50          
    100          
324 0         0 carp("Parent could not be the candidat itself, please check your gff file for $annot_id");
325 0         0 next;
326             # If there is already a parent with this feature type we duplicated
327             # the candidate since we are branching in the annotation tree
328             }elsif(!defined $annot_hash->{$parent}) {
329 0         0 carp("Parent not found, please check your gff file for $annot_id (Parent: $parent)");
330            
331             }elsif(defined $candidate->{$annot_hash->{$parent}->feature}) {
332 10         5 my %copy_candidate = %{$candidate};
  10         38  
333 10         7 my %copy_parent_feature = %{$candidate->{parent_feature}};
  10         22  
334 10         10 $copy_candidate{parent_feature} = \%copy_parent_feature;
335             # We register in parent_feature links
336 10         16 $copy_candidate{parent_feature}->{$annot_hash->{$annot_id}->feature} = $annot_hash->{$parent}->feature;
337 10         12 my $copy_ref = \%copy_candidate;
338 10         11 push(@candidates,@{_constructCandidates($parent,$copy_ref,$annot_hash)});
  10         9  
339             # If not we only go up to the parent node in order to continue candidate
340             # construction
341             } else {
342             # We register in parent_feature links
343 37         47 $candidate->{parent_feature}->{$annot_hash->{$annot_id}->feature} = $annot_hash->{$parent}->feature;
344 37         35 push(@candidates,@{_constructCandidates($parent,$candidate,$annot_hash)});
  37         44  
345             }
346             }
347 42         75 return \@candidates;
348             } else {
349 24         49 return [$candidate];
350             }
351             }
352              
353              
354             sub _constructCandidatesFromAnnotation {
355 12     12   8 my $self = shift;
356 12         7 my $annotations = shift;
357 12         13 my %annot_hash = ();
358 12         13 my @candidates = ();
359              
360             # Construct annotation hash with annot ID as key
361 12         5 foreach my $annot_line (@{$annotations}) {
  12         14  
362 46 50       48 if($self->mode eq "fast") {
363 46         69 $annot_hash{$annot_line->attribute('ID')} = $annot_line;
364             } else {
365 0         0 my $annot = CracTools::GFF::Annotation->new($annot_line,'gff3');
366 0         0 $annot_hash{$annot->attribute('ID')} = $annot;
367             }
368             }
369              
370             # Find leaves in annotation tree
371 12         7 my %hash_leaves;
372 12         23 foreach my $annot_id (keys %annot_hash) {
373             #my @parents = $annot_hash{$annot_id}->parents;
374 46         18 foreach my $parent (@{$annot_hash{$annot_id}->parents}){
  46         60  
375 40 100       82 $hash_leaves{$parent} = 1 unless (defined $hash_leaves{$parent});
376             }
377             }
378 12         14 foreach my $annot_id (keys %annot_hash) {
379             # check if annot_id is a leaf
380 46 100       64 if (!defined $hash_leaves{$annot_id}){
381             # Get all possible path from this leaf to the root
382 19         10 push @candidates, @{_constructCandidates($annot_id,my $new_candidate,\%annot_hash)};
  19         22  
383             }
384             }
385              
386 12         25 return \@candidates;
387             }
388              
389             1;
390              
391             __END__