File Coverage

blib/lib/CracTools/Annotator.pm

Criterion	Covered	Total	%
statement	172	222	77.4
branch	33	86	38.3
condition	2	27	7.4
subroutine	23	24	95.8
pod	12	12	100.0
total	242	371	65.2

line	stmt	bran	cond	sub	pod	time	code
1							package CracTools::Annotator;
2							{
3							$CracTools::Annotator::DIST = 'CracTools';
4							}
5							# ABSTRACT: Generic annotation base on CracTools::GFF::Query::File
6							$CracTools::Annotator::VERSION = '1.22';
7	2			2		41524	use strict;
	2					4
	2					52
8	2			2		9	use warnings;
	2					3
	2					52
9
10	2			2		9	use Carp;
	2					8
	2					167
11	2			2		10	use List::Util qw[min max];
	2					3
	2					229
12
13	2			2		958	use CracTools::Const;
	2					4
	2					79
14	2			2		1139	use CracTools::GFF::Annotation;
	2					7
	2					59
15	2			2		1200	use CracTools::Interval::Query;
	2					8
	2					89
16	2			2		1956	use CracTools::Interval::Query::File;
	2					7
	2					7379
17
18
19							sub new {
20	2			2	1	3051	my $class = shift;
21	2					6	my $gff_file = shift;
22	2					5	my $mode = shift;
23
24	2	50				9	$mode = 'light' if !defined $mode;
25
26	2	50				32	if(!defined $gff_file) {
27	0					0	croak "Missing GFF file argument in CracTools::Annotator constructor";
28							}
29
30	2					10	my $self = bless {
31							gff_file => $gff_file,
32							mode => $mode,
33							}, $class;
34
35	2					12	$self->_init();
36
37	2					6	return $self;
38							}
39
40
41							sub mode {
42	49			49	1	67	my $self = shift;
43	49					142	return $self->{mode};
44							}
45
46
47							sub foundAnnotation {
48	1			1	1	1511	my $self = shift;
49	1					2	my ($chr,$pos_start,$pos_end,$strand) = @_;
50	1					3	my @candidates = @{ $self->getAnnotationCandidates($chr,$pos_start,$pos_end,$strand)};
	1					3
51	1					8	return (scalar @candidates > 0);
52							}
53
54
55							sub foundGene {
56	1			1	1	3	my $self = shift;
57	1					2	my ($chr,$pos_start,$pos_end,$strand) = @_;
58	1					2	my @candidates = @{ $self->getAnnotationCandidates($chr,$pos_start,$pos_end,$strand)};
	1					3
59	1					4	foreach my $candidate (@candidates) {
60	1	50				8	return 1 if defined $candidate->{gene};
61							}
62	0					0	return 0;
63							}
64
65
66							sub foundSameGene {
67	3			3	1	529	my $self = shift;
68	3					6	my ($chr,$pos_start1,$pos_end1,$pos_start2,$pos_end2,$strand) = @_;
69	3					4	my @candidates1 = @{ $self->getAnnotationCandidates($chr,$pos_start1,$pos_end1,$strand)};
	3					7
70	3					6	my @candidates2 = @{ $self->getAnnotationCandidates($chr,$pos_start2,$pos_end2,$strand)};
	3					7
71	3					6	my $found_same_gene = 0;
72	3					4	my @genes1;
73							my @genes2;
74	3					4	foreach my $candi1 (@candidates1) {
75	5	50				14	if(defined $candi1->{gene}) {
76	5					15	push @genes1,$candi1->{gene}->attribute('ID');
77							}
78							}
79	3					5	foreach my $candi2 (@candidates2) {
80	5	50				12	if(defined $candi2->{gene}) {
81	5					15	push @genes2,$candi2->{gene}->attribute('ID');
82							}
83							}
84	3					6	foreach my $gene_id (@genes1) {
85	3					5	foreach (@genes2) {
86	2	50				6	if($gene_id eq $_) {
87	2					3	$found_same_gene = 1;
88	2					4	last;
89							}
90							}
91	3	100				7	last if $found_same_gene == 1;
92							}
93	3					26	return $found_same_gene;
94							}
95
96
97							sub getBestAnnotationCandidate {
98	1			1	1	399	my $self = shift;
99	1					5	my ($best_candidates,$best_priority,$best_type) = $self->getBestAnnotationCandidates(@_);
100	1	50				2	if(@{$best_candidates}) {
	1					4
101	1					4	return $best_candidates->[0],$best_priority,$best_type;
102							} else {
103	0					0	return undef,undef,undef;
104							}
105							}
106
107
108							sub getBestAnnotationCandidates {
109	1			1	1	2	my $self = shift;
110	1					2	my ($chr,$pos_start,$pos_end,$strand,$prioritySub,$compareSub) = @_;
111
112	1	50	33			7	if(!defined $prioritySub && !defined $compareSub) {
113	1	50				4	$prioritySub = \&getCandidatePriorityDefault unless defined $prioritySub;
114	1	50				4	$compareSub = \&compareTwoCandidatesDefault unless defined $compareSub;
115							}
116
117	1					2	my @candidates = @{ $self->getAnnotationCandidates($chr,$pos_start,$pos_end,$strand)};
	1					3
118	1					2	my @best_candidates;
119	1					3	my ($best_priority,$best_type);
120	1					2	foreach my $candi (@candidates) {
121	1					2	my ($priority,$type);
122	1	50				6	($priority,$type) = $prioritySub->($pos_start,$pos_end,$candi) if defined $prioritySub;
123	1	50	33			15	if(defined $priority && $priority != -1) {
124	1	50	0			4	if(!defined $best_priority) {
		0
		0
125	1					2	$best_priority = $priority;
126	1					2	push @best_candidates, $candi;
127	1					3	$best_type = $type;
128							} elsif($priority < $best_priority) {
129	0					0	@best_candidates = ($candi);
130	0					0	$best_priority = $priority;
131	0					0	$best_type = $type;
132							}
133							#we should compare two candidates with equal priority to always choose the one
134							elsif (!defined $priority \|\| $priority == $best_priority){
135	0					0	my $candidate_chosen;
136	0					0	my $found_better_candidate = 0;
137	0					0	foreach my $best_candidate (@best_candidates) {
138	0	0				0	$candidate_chosen = $compareSub->($best_candidate,$candi,$pos_start,$pos_end) if defined $compareSub;
139							# They are both equal
140	0	0				0	if (!defined $candidate_chosen) {
		0
141							# We cannnot say if this candidate is better
142	0					0	next;
143							} elsif ($candidate_chosen == $candi) {
144							# We have found a better candidate that previously register ones
145							# we save it and remove the others
146	0					0	@best_candidates = ($candi);
147	0					0	$found_better_candidate = 1;
148	0					0	last;
149							} else {
150							# The better candidate is not "candi", so this candidates
151							# does not belong the the best_candidate array.
152							# We can stop looping
153	0					0	$found_better_candidate = 1;
154	0					0	last;
155							}
156							}
157	0	0				0	push @best_candidates, $candi if !$found_better_candidate;
158							}
159							}
160							}
161							# TODO We should not return variable in that order,
162							# it is not easy to only retrieve the best candidatse...
163	1					3	return \@best_candidates,$best_priority,$best_type;
164							}
165
166
167							sub getAnnotationCandidates {
168	10			10	1	17	my $self = shift;
169	10					16	my ($chr,$pos_start,$pos_end,$strand) = @_;
170							# TODO if no strand is provided we should return annotations from both strands
171
172							# get GFF annotations that overlap the region to annotate
173	10					37	my $annotations = $self->{gff_query}->fetchByRegion($chr,$pos_start,$pos_end,$strand);
174							# get a ref of an array of hash of candidates
175	10					45	my $candidatates = $self->_constructCandidatesFromAnnotation($annotations);
176	10					31	return $candidatates;
177							}
178
179
180							sub getAnnotationNearestDownCandidates {
181	1			1	1	498	my $self = shift;
182	1					2	my ($chr,$pos_start,$strand) = @_;
183
184							# get GFF annotations that overlap the pos_start to annotate
185	1					6	my $annotations_overlap = $self->{gff_query}->fetchByLocation($chr,$pos_start,$strand);
186							# get GFF annotations of nearest down intervals that not overlaped [pos_start,pos_end] pos
187	1					2	my @annotations_down;
188
189	1					2	push @annotations_down, @{$self->{gff_query}->fetchAllNearestDown($chr,$pos_start,$strand)};
	1					5
190
191							# get a ref of an array of hash of candidates
192	1					5	my @annotations = (@$annotations_overlap,@annotations_down);
193	1					3	my $candidatates = $self->_constructCandidatesFromAnnotation(\@annotations);
194	1					5	return $candidatates;
195							}
196
197
198							sub getAnnotationNearestUpCandidates {
199	1			1	1	485	my $self = shift;
200	1					2	my ($chr,$pos_end,$strand) = @_;
201
202							# get GFF annotations that overlap the pos_end to annotate
203	1					5	my $annotations_overlap = $self->{gff_query}->fetchByLocation($chr,$pos_end,$strand);
204							# get GFF annotations of nearest up intervals that not overlaped [pos_start,pos_end] pos
205	1					2	my @annotations_up;
206
207	1					2	push @annotations_up, @{$self->{gff_query}->fetchAllNearestUp($chr,$pos_end,$strand)};
	1					5
208
209							# get a ref of an array of hash of candidates
210	1					4	my @annotations = (@$annotations_overlap,@annotations_up);
211	1					3	my $candidatates = $self->_constructCandidatesFromAnnotation(\@annotations);
212	1					5	return $candidatates;
213							}
214
215
216							sub getCandidatePriorityDefault {
217	2			2	1	4	my ($pos_start,$pos_end,$candidate) = @_;
218	2					4	my ($priority,$type) = (-1,'');
219	2					4	my ($mRNA,$exon) = ($candidate->{mRNA},$candidate->{exon});
220	2	50				6	if(defined $mRNA) {
221	2	50				5	if($mRNA->attribute('type') =~ /protein_coding/i) {
222	2	50				4	if(defined $exon) {
223	0	0	0			0	if(($exon->start <= $pos_start) && ($exon->end >= $pos_end)) {
224	0					0	$priority = 1;
225	0	0				0	if(defined $candidate->{three}) {
		0
226	0					0	$type = '3PRIM_UTR';
227							} elsif(defined $candidate->{five}) {
228	0					0	$type = '5PRIM_UTR';
229							# } elsif(defined $candidate->{cds}) {
230							# $type = 'CDS';
231							} else {
232	0					0	$type = 'EXON';
233							}
234							} else {
235	0					0	$priority = 2;
236	0					0	$type = 'INXON';
237							}
238							} else {
239	2					2	$priority = 4;
240	2					5	$type = 'INTRON';
241							}
242							} else {
243	0	0				0	if(defined $exon) {
244	0	0	0			0	if(($exon->start <= $pos_start) && ($exon->end >= $pos_end)) {
245	0					0	$priority = 3;
246	0					0	$type = 'NON_CODING';
247							}
248							}
249							}
250							}
251	2					6	return ($priority,$type);
252							}
253
254							sub compareTwoCandidatesDefault{
255	0			0	1	0	my ($candidate1,$candidate2,$pos_start) = @_;
256							# If both candidates are exons we try to find wich one is closer to the pos_start of the region to annotate
257	0	0	0			0	if ($candidate1->{exon} && $candidate2->{exon}) {
258	0					0	my $dist1= min(abs($candidate1->{exon}->end - $pos_start),abs($candidate1->{exon}->start - $pos_start));
259	0					0	my $dist2= min(abs($candidate2->{exon}->end - $pos_start),abs($candidate2->{exon}->start - $pos_start));
260	0	0				0	if ($dist1 > $dist2) {
		0
261	0					0	return $candidate2;
262							} elsif ($dist1 < $dist2) {
263	0					0	return $candidate1;
264							}
265							}
266							# If we have not found a better candidate, we use the lexicographic order of the mRNA ID
267	0					0	my ($mRNA1,$mRNA2) = ($candidate1->{mRNA},$candidate2->{mRNA});
268	0	0	0			0	if(defined $mRNA1 && defined $mRNA1->attribute('ID') && defined $mRNA2 && defined $mRNA2->attribute('ID')) {
			0
			0
269	0	0				0	if($mRNA1->attribute('ID') lt $mRNA2->attribute('ID')) {
270	0					0	return $candidate1;
271							} else {
272	0					0	return $candidate2;
273							}
274							}
275							# If nothing has worked we return "undef"
276	0					0	return undef;
277							}
278
279
280							sub _init {
281	2			2		6	my $self = shift;
282	2					4	my $gff_query;
283
284							# Create a GFF file to query exons
285	2	50				9	if($self->mode eq "fast") {
286	2					20	$gff_query = CracTools::Interval::Query->new();
287							my $gff_it = CracTools::Utils::getFileIterator(file => $self->{gff_file},
288	34			34		109	parsing_method => sub { CracTools::GFF::Annotation->new(@_) },
289	2					22	header_regex => "^#",
290							);
291	2					9	while(my $gff_annot = $gff_it->()) {
292	34					86	$gff_query->addInterval($gff_annot->chr,
293							$gff_annot->start+1,
294							$gff_annot->end+1,
295							$gff_annot->strand,
296							$gff_annot,
297							);
298							}
299							} else {
300	0					0	$gff_query = CracTools::Interval::Query::File->new(file => $self->{gff_file}, type => 'gff');
301							}
302
303	2					7	$self->{gff_query} = $gff_query;
304							}
305
306
307							sub _constructCandidates {
308	66			66		88	my ($annot_id,$candidate,$annot_hash) = @_;
309
310							# We init the "leaf_feature" value if this is the first recursion step
311	66	100				169	$candidate->{leaf_feature} = $annot_hash->{$annot_id}->feature if !defined $candidate->{leaf_feature};
312
313	66					69	my @candidates;
314	66	50				142	if (!defined $annot_hash->{$annot_id}){
315	0					0	carp("Missing feature for $annot_id in the gff file");
316							}
317	66					189	$candidate->{$annot_hash->{$annot_id}->feature} = $annot_hash->{$annot_id};
318	66					191	my $parents = $annot_hash->{$annot_id}->parents;
319	66	100				137	if(@$parents) {
320	42					46	foreach my $parent (@{$parents}) {
	42					73
321
322							#Test to avoid a deep recursion
323	47	50				212	if($parent eq $annot_id) {
		50
		100
324	0					0	carp("Parent could not be the candidat itself, please check your gff file for $annot_id");
325	0					0	next;
326							# If there is already a parent with this feature type we duplicated
327							# the candidate since we are branching in the annotation tree
328							}elsif(!defined $annot_hash->{$parent}) {
329	0					0	carp("Parent not found, please check your gff file for $annot_id (Parent: $parent)");
330
331							}elsif(defined $candidate->{$annot_hash->{$parent}->feature}) {
332	10					12	my %copy_candidate = %{$candidate};
	10					53
333	10					15	my %copy_parent_feature = %{$candidate->{parent_feature}};
	10					31
334	10					19	$copy_candidate{parent_feature} = \%copy_parent_feature;
335							# We register in parent_feature links
336	10					29	$copy_candidate{parent_feature}->{$annot_hash->{$annot_id}->feature} = $annot_hash->{$parent}->feature;
337	10					13	my $copy_ref = \%copy_candidate;
338	10					14	push(@candidates,@{_constructCandidates($parent,$copy_ref,$annot_hash)});
	10					18
339							# If not we only go up to the parent node in order to continue candidate
340							# construction
341							} else {
342							# We register in parent_feature links
343	37					94	$candidate->{parent_feature}->{$annot_hash->{$annot_id}->feature} = $annot_hash->{$parent}->feature;
344	37					51	push(@candidates,@{_constructCandidates($parent,$candidate,$annot_hash)});
	37					73
345							}
346							}
347	42					147	return \@candidates;
348							} else {
349	24					104	return [$candidate];
350							}
351							}
352
353
354							sub _constructCandidatesFromAnnotation {
355	12			12		12	my $self = shift;
356	12					13	my $annotations = shift;
357	12					22	my %annot_hash = ();
358	12					13	my @candidates = ();
359
360							# Construct annotation hash with annot ID as key
361	12					14	foreach my $annot_line (@{$annotations}) {
	12					24
362	46	50				87	if($self->mode eq "fast") {
363	46					121	$annot_hash{$annot_line->attribute('ID')} = $annot_line;
364							} else {
365	0					0	my $annot = CracTools::GFF::Annotation->new($annot_line,'gff3');
366	0					0	$annot_hash{$annot->attribute('ID')} = $annot;
367							}
368							}
369
370							# Find leaves in annotation tree
371	12					16	my %hash_leaves;
372	12					30	foreach my $annot_id (keys %annot_hash) {
373							#my @parents = $annot_hash{$annot_id}->parents;
374	46					46	foreach my $parent (@{$annot_hash{$annot_id}->parents}){
	46					113
375	40	100				138	$hash_leaves{$parent} = 1 unless (defined $hash_leaves{$parent});
376							}
377							}
378	12					28	foreach my $annot_id (keys %annot_hash) {
379							# check if annot_id is a leaf
380	46	100				105	if (!defined $hash_leaves{$annot_id}){
381							# Get all possible path from this leaf to the root
382	19					20	push @candidates, @{_constructCandidates($annot_id,my $new_candidate,\%annot_hash)};
	19					40
383							}
384							}
385
386	12					48	return \@candidates;
387							}
388
389							1;
390
391							__END__