File Coverage

blib/lib/CracTools/Annotator.pm

Criterion	Covered	Total	%
statement	171	221	77.3
branch	33	86	38.3
condition	3	30	10.0
subroutine	23	24	95.8
pod	12	12	100.0
total	242	373	64.8

line	stmt	bran	cond	sub	pod	time	code
1							package CracTools::Annotator;
2
3							{
4							$CracTools::Annotator::DIST = 'CracTools';
5							}
6							# ABSTRACT: Generic annotation base on CracTools::GFF::Query::File
7							$CracTools::Annotator::VERSION = '1.251';
8	1			1		13272	use strict;
	1					3
	1					25
9	1			1		4	use warnings;
	1					1
	1					25
10
11	1			1		5	use Carp;
	1					2
	1					65
12	1			1		5	use List::Util qw[min max];
	1					2
	1					83
13
14	1			1		323	use CracTools::Const;
	1					2
	1					36
15	1			1		326	use CracTools::GFF::Annotation;
	1					3
	1					26
16	1			1		362	use CracTools::Interval::Query;
	1					3
	1					31
17	1			1		368	use CracTools::Interval::Query::File;
	1					3
	1					1459
18
19
20							sub new {
21	1			1	1	1759	my $class = shift;
22	1					3	my $gff_file = shift;
23	1					2	my $mode = shift;
24
25	1	50				5	$mode = 'light' if !defined $mode;
26
27	1	50				3	if(!defined $gff_file) {
28	0					0	croak "Missing GFF file argument in CracTools::Annotator constructor";
29							}
30
31	1					5	my $self = bless {
32							gff_file => $gff_file,
33							mode => $mode,
34							}, $class;
35
36	1					4	$self->_init();
37
38	1					4	return $self;
39							}
40
41
42							sub mode {
43	48			48	1	84	my $self = shift;
44	48					112	return $self->{mode};
45							}
46
47
48							sub foundAnnotation {
49	1			1	1	789	my $self = shift;
50							#my ($chr,$pos_start,$pos_end,$strand) = @_;
51	1					3	my @candidates = @{ $self->getAnnotationCandidates(@_)};
	1					3
52	1					9	return (scalar @candidates > 0);
53							}
54
55
56							sub foundGene {
57	1			1	1	2	my $self = shift;
58	1					4	my ($chr,$pos_start,$pos_end,$strand) = @_;
59	1					3	my @candidates = @{ $self->getAnnotationCandidates($chr,$pos_start,$pos_end,$strand)};
	1					4
60	1					3	foreach my $candidate (@candidates) {
61	1	50				9	return 1 if defined $candidate->{gene};
62							}
63	0					0	return 0;
64							}
65
66
67							sub foundSameGene {
68	3			3	1	416	my $self = shift;
69	3					9	my ($chr,$pos_start1,$pos_end1,$pos_start2,$pos_end2,$strand) = @_;
70	3					5	my @candidates1 = @{ $self->getAnnotationCandidates($chr,$pos_start1,$pos_end1,$strand)};
	3					8
71	3					4	my @candidates2 = @{ $self->getAnnotationCandidates($chr,$pos_start2,$pos_end2,$strand)};
	3					10
72	3					6	my $found_same_gene = 0;
73	3					5	my @genes1;
74							my @genes2;
75	3					6	foreach my $candi1 (@candidates1) {
76	5	50				13	if(defined $candi1->{gene}) {
77	5					15	push @genes1,$candi1->{gene}->attribute('ID');
78							}
79							}
80	3					4	foreach my $candi2 (@candidates2) {
81	5	50				13	if(defined $candi2->{gene}) {
82	5					14	push @genes2,$candi2->{gene}->attribute('ID');
83							}
84							}
85	3					6	foreach my $gene_id (@genes1) {
86	3					7	foreach (@genes2) {
87	2	50				6	if($gene_id eq $_) {
88	2					3	$found_same_gene = 1;
89	2					4	last;
90							}
91							}
92	3	100				7	last if $found_same_gene == 1;
93							}
94	3					29	return $found_same_gene;
95							}
96
97
98							sub getBestAnnotationCandidate {
99	1			1	1	426	my $self = shift;
100	1					4	my ($best_candidates,$best_priority,$best_type) = $self->getBestAnnotationCandidates(@_);
101	1	50				2	if(@{$best_candidates}) {
	1					6
102	1					5	return $best_candidates->[0],$best_priority,$best_type;
103							} else {
104	0					0	return undef,undef,undef;
105							}
106							}
107
108
109							sub getBestAnnotationCandidates {
110	1			1	1	2	my $self = shift;
111	1					3	my ($chr,$pos_start,$pos_end,$strand,$prioritySub,$compareSub) = @_;
112
113	1	50	33			8	if(!defined $prioritySub && !defined $compareSub) {
114	1	50				4	$prioritySub = \&getCandidatePriorityDefault unless defined $prioritySub;
115	1	50				4	$compareSub = \&compareTwoCandidatesDefault unless defined $compareSub;
116							}
117
118	1					2	my @candidates = @{ $self->getAnnotationCandidates($chr,$pos_start,$pos_end,$strand)};
	1					4
119	1					2	my @best_candidates;
120	1					3	my ($best_priority,$best_type);
121	1					2	foreach my $candi (@candidates) {
122	1					2	my ($priority,$type);
123	1	50				8	($priority,$type) = $prioritySub->($pos_start,$pos_end,$candi) if defined $prioritySub;
124	1	50	33			7	if(defined $priority && $priority != -1) {
125	1	50	0			8	if(!defined $best_priority) {
		0
		0
126	1					2	$best_priority = $priority;
127	1					2	push @best_candidates, $candi;
128	1					3	$best_type = $type;
129							} elsif($priority < $best_priority) {
130	0					0	@best_candidates = ($candi);
131	0					0	$best_priority = $priority;
132	0					0	$best_type = $type;
133							}
134							#we should compare two candidates with equal priority to always choose the one
135							elsif (!defined $priority \|\| $priority == $best_priority){
136	0					0	my $candidate_chosen;
137	0					0	my $found_better_candidate = 0;
138	0					0	foreach my $best_candidate (@best_candidates) {
139	0	0				0	$candidate_chosen = $compareSub->($best_candidate,$candi,$pos_start,$pos_end) if defined $compareSub;
140							# They are both equal
141	0	0				0	if (!defined $candidate_chosen) {
		0
142							# We cannnot say if this candidate is better
143	0					0	next;
144							} elsif ($candidate_chosen == $candi) {
145							# We have found a better candidate that previously register ones
146							# we save it and remove the others
147	0					0	@best_candidates = ($candi);
148	0					0	$found_better_candidate = 1;
149	0					0	last;
150							} else {
151							# The better candidate is not "candi", so this candidates
152							# does not belong the the best_candidate array.
153							# We can stop looping
154	0					0	$found_better_candidate = 1;
155	0					0	last;
156							}
157							}
158	0	0				0	push @best_candidates, $candi if !$found_better_candidate;
159							}
160							}
161							}
162							# TODO We should not return variable in that order,
163							# it is not easy to only retrieve the best candidatse...
164	1					3	return \@best_candidates,$best_priority,$best_type;
165							}
166
167
168							sub getAnnotationCandidates {
169	10			10	1	13	my $self = shift;
170	10					20	my ($chr,$pos_start,$pos_end,$strand) = @_;
171							# TODO if no strand is provided we should return annotations from both strands
172
173							# get GFF annotations that overlap the region to annotate
174	10					33	my $annotations = $self->{gff_query}->fetchByRegion($chr,$pos_start,$pos_end,$strand);
175							# get a ref of an array of hash of candidates
176	10					23	my $candidatates = $self->_constructCandidatesFromAnnotation($annotations);
177	10					29	return $candidatates;
178							}
179
180
181							sub getAnnotationNearestDownCandidates {
182	1			1	1	416	my $self = shift;
183	1					4	my ($chr,$pos_start,$strand) = @_;
184
185							# get GFF annotations that overlap the pos_start to annotate
186	1					5	my $annotations_overlap = $self->{gff_query}->fetchByLocation($chr,$pos_start,$strand);
187							# get GFF annotations of nearest down intervals that not overlaped [pos_start,pos_end] pos
188	1					2	my @annotations_down;
189
190	1					2	push @annotations_down, @{$self->{gff_query}->fetchAllNearestDown($chr,$pos_start,$strand)};
	1					6
191
192							# get a ref of an array of hash of candidates
193	1					4	my @annotations = (@$annotations_overlap,@annotations_down);
194	1					5	my $candidatates = $self->_constructCandidatesFromAnnotation(\@annotations);
195	1					4	return $candidatates;
196							}
197
198
199							sub getAnnotationNearestUpCandidates {
200	1			1	1	400	my $self = shift;
201	1					3	my ($chr,$pos_end,$strand) = @_;
202
203							# get GFF annotations that overlap the pos_end to annotate
204	1					4	my $annotations_overlap = $self->{gff_query}->fetchByLocation($chr,$pos_end,$strand);
205							# get GFF annotations of nearest up intervals that not overlaped [pos_start,pos_end] pos
206	1					3	my @annotations_up;
207
208	1					2	push @annotations_up, @{$self->{gff_query}->fetchAllNearestUp($chr,$pos_end,$strand)};
	1					5
209
210							# get a ref of an array of hash of candidates
211	1					3	my @annotations = (@$annotations_overlap,@annotations_up);
212	1					5	my $candidatates = $self->_constructCandidatesFromAnnotation(\@annotations);
213	1					4	return $candidatates;
214							}
215
216
217							sub getCandidatePriorityDefault {
218	2			2	1	5	my ($pos_start,$pos_end,$candidate) = @_;
219	2					5	my ($priority,$type) = (-1,'');
220	2					6	my ($mRNA,$exon) = ($candidate->{mRNA},$candidate->{exon});
221	2	50				6	if(defined $mRNA) {
222	2	50	33			6	if(defined $mRNA->attribute('type') && $mRNA->attribute('type') =~ /protein_coding/i) {
223	2	50				7	if(defined $exon) {
224	0	0	0			0	if(($exon->start <= $pos_start) && ($exon->end >= $pos_end)) {
225	0					0	$priority = 1;
226	0	0				0	if(defined $candidate->{three}) {
		0
227	0					0	$type = '3PRIM_UTR';
228							} elsif(defined $candidate->{five}) {
229	0					0	$type = '5PRIM_UTR';
230							# } elsif(defined $candidate->{cds}) {
231							# $type = 'CDS';
232							} else {
233	0					0	$type = 'EXON';
234							}
235							} else {
236	0					0	$priority = 2;
237	0					0	$type = 'INXON';
238							}
239							} else {
240	2					4	$priority = 4;
241	2					4	$type = 'INTRON';
242							}
243							} else {
244	0	0				0	if(defined $exon) {
245	0	0	0			0	if(($exon->start <= $pos_start) && ($exon->end >= $pos_end)) {
246	0					0	$priority = 3;
247	0					0	$type = 'NON_CODING';
248							}
249							}
250							}
251							}
252	2					6	return ($priority,$type);
253							}
254
255							sub compareTwoCandidatesDefault{
256	0			0	1	0	my ($candidate1,$candidate2,$pos_start) = @_;
257							# If both candidates are exons we try to find wich one is closer to the pos_start of the region to annotate
258	0	0	0			0	if ($candidate1->{exon} && $candidate2->{exon}) {
259	0					0	my $dist1= min(abs($candidate1->{exon}->end - $pos_start),abs($candidate1->{exon}->start - $pos_start));
260	0					0	my $dist2= min(abs($candidate2->{exon}->end - $pos_start),abs($candidate2->{exon}->start - $pos_start));
261	0	0				0	if ($dist1 > $dist2) {
		0
262	0					0	return $candidate2;
263							} elsif ($dist1 < $dist2) {
264	0					0	return $candidate1;
265							}
266							}
267							# If we have not found a better candidate, we use the lexicographic order of the mRNA ID
268	0					0	my ($mRNA1,$mRNA2) = ($candidate1->{mRNA},$candidate2->{mRNA});
269	0	0	0			0	if(defined $mRNA1 && defined $mRNA1->attribute('ID') && defined $mRNA2 && defined $mRNA2->attribute('ID')) {
			0
			0
270	0	0				0	if($mRNA1->attribute('ID') lt $mRNA2->attribute('ID')) {
271	0					0	return $candidate1;
272							} else {
273	0					0	return $candidate2;
274							}
275							}
276							# If nothing has worked we return "undef"
277	0					0	return undef;
278							}
279
280
281							sub _init {
282	1			1		3	my $self = shift;
283	1					2	my $gff_query;
284
285							# Create a GFF file to query exons
286	1	50				4	if($self->mode eq "fast") {
287	1					8	$gff_query = CracTools::Interval::Query->new();
288							my $gff_it = CracTools::Utils::getFileIterator(file => $self->{gff_file},
289	31			31		86	parsing_method => sub { CracTools::GFF::Annotation->new(@_) },
290	1					8	header_regex => "^#",
291							);
292	1					4	while(my $gff_annot = $gff_it->()) {
293	31					70	$gff_query->addInterval($gff_annot->chr,
294							$gff_annot->start+1,
295							$gff_annot->end+1,
296							$gff_annot->strand,
297							$gff_annot,
298							);
299							}
300							} else {
301	0					0	$gff_query = CracTools::Interval::Query::File->new(file => $self->{gff_file}, type => 'gff');
302							}
303
304	1					4	$self->{gff_query} = $gff_query;
305							}
306
307
308							sub _constructCandidates {
309	66			66		118	my ($annot_id,$candidate,$annot_hash) = @_;
310
311							# We init the "leaf_feature" value if this is the first recursion step
312	66	100				166	$candidate->{leaf_feature} = $annot_hash->{$annot_id}->feature if !defined $candidate->{leaf_feature};
313
314	66					100	my @candidates;
315	66	50				138	if (!defined $annot_hash->{$annot_id}){
316	0					0	carp("Missing feature for $annot_id in the gff file");
317							}
318	66					179	$candidate->{$annot_hash->{$annot_id}->feature} = $annot_hash->{$annot_id};
319	66					163	my $parents = $annot_hash->{$annot_id}->parents;
320	66	100				128	if(@$parents) {
321	42					60	foreach my $parent (@{$parents}) {
	42					75
322
323							#Test to avoid a deep recursion
324	47	50				183	if($parent eq $annot_id) {
		50
		100
325	0					0	carp("Parent could not be the candidat itself, please check your gff file for $annot_id");
326	0					0	next;
327							# If there is already a parent with this feature type we duplicated
328							# the candidate since we are branching in the annotation tree
329							}elsif(!defined $annot_hash->{$parent}) {
330	0					0	carp("Parent not found, please check your gff file for $annot_id (Parent: $parent)");
331
332							}elsif(defined $candidate->{$annot_hash->{$parent}->feature}) {
333	10					15	my %copy_candidate = %{$candidate};
	10					38
334	10					19	my %copy_parent_feature = %{$candidate->{parent_feature}};
	10					34
335	10					19	$copy_candidate{parent_feature} = \%copy_parent_feature;
336							# We register in parent_feature links
337	10					28	$copy_candidate{parent_feature}->{$annot_hash->{$annot_id}->feature} = $annot_hash->{$parent}->feature;
338	10					16	my $copy_ref = \%copy_candidate;
339	10					19	push(@candidates,@{_constructCandidates($parent,$copy_ref,$annot_hash)});
	10					18
340							# If not we only go up to the parent node in order to continue candidate
341							# construction
342							} else {
343							# We register in parent_feature links
344	37					91	$candidate->{parent_feature}->{$annot_hash->{$annot_id}->feature} = $annot_hash->{$parent}->feature;
345	37					63	push(@candidates,@{_constructCandidates($parent,$candidate,$annot_hash)});
	37					81
346							}
347							}
348	42					112	return \@candidates;
349							} else {
350	24					71	return [$candidate];
351							}
352							}
353
354
355							sub _constructCandidatesFromAnnotation {
356	12			12		20	my $self = shift;
357	12					18	my $annotations = shift;
358	12					21	my %annot_hash = ();
359	12					21	my @candidates = ();
360
361							# Construct annotation hash with annot ID as key
362	12					20	foreach my $annot_line (@{$annotations}) {
	12					23
363	46	50				89	if($self->mode eq "fast") {
364	46					117	$annot_hash{$annot_line->attribute('ID')} = $annot_line;
365							} else {
366	0					0	my $annot = CracTools::GFF::Annotation->new($annot_line,'gff3');
367	0					0	$annot_hash{$annot->attribute('ID')} = $annot;
368							}
369							}
370
371							# Find leaves in annotation tree
372	12					20	my %hash_leaves;
373	12					31	foreach my $annot_id (keys %annot_hash) {
374							#my @parents = $annot_hash{$annot_id}->parents;
375	46					64	foreach my $parent (@{$annot_hash{$annot_id}->parents}){
	46					108
376	40	100				114	$hash_leaves{$parent} = 1 unless (defined $hash_leaves{$parent});
377							}
378							}
379	12					27	foreach my $annot_id (keys %annot_hash) {
380							# check if annot_id is a leaf
381	46	100				104	if (!defined $hash_leaves{$annot_id}){
382							# Get all possible path from this leaf to the root
383	19					29	push @candidates, @{_constructCandidates($annot_id,my $new_candidate,\%annot_hash)};
	19					38
384							}
385							}
386
387	12					36	return \@candidates;
388							}
389
390							1;
391
392							__END__