File Coverage

lib/Bio/Roary/ContigsToGeneIDsFromGFF.pm

Criterion	Covered	Total	%
statement	43	66	65.1
branch	6	22	27.2
condition	5	9	55.5
subroutine	6	7	85.7
pod			n/a
total	60	104	57.6

line	stmt	bran	cond	sub	time	code
1						package Bio::Roary::ContigsToGeneIDsFromGFF;
2						$Bio::Roary::ContigsToGeneIDsFromGFF::VERSION = '3.11.0';
3						# ABSTRACT: Parse a GFF and efficiently extract ordered gene ids on each contig
4
5
6	4			4	89838	use Moose;
	4				413012
	4				27
7	4			4	26773	use Bio::Tools::GFF;
	4				168145
	4				2347
8						with 'Bio::Roary::ParseGFFAnnotationRole';
9
10						has 'contig_to_ids' => ( is => 'rw', isa => 'HashRef', lazy => 1, builder => '_build_contig_to_ids');
11
12						has 'overlapping_hypothetical_protein_ids' => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build_overlapping_hypothetical_protein_ids');
13						has '_genes_annotation' => ( is => 'rw', isa => 'ArrayRef', default => sub{[]});
14
15						has '_min_nucleotide_overlap_percentage' => ( is => 'ro', isa => 'Int', default => 10);
16
17						# Manually parse the GFF file because the BioPerl module is too slow
18						sub _build_contig_to_ids
19						{
20	110			110	283	my ($self) = @_;
21	110				238	my %contigs_to_ids;
22						my @genes_annotation;
23
24	110	50			1778	open( my $fh, '-|', $self->_gff_fh_input_string ) or die "Couldnt open GFF file";
25	110				279794	while(<$fh>)
26						{
27	1021				1888	chomp;
28	1021				1864	my $line = $_;
29	1021				1151	my $id_name;
30	1021	50			4201	if($line =~/ID=["']?([^;"']+)["']?;?/i)
31						{
32	1021				2297	$id_name= $1;
33						}
34						else
35						{
36	0				0	next;
37						}
38
39	1021				2845	my @annotation_elements = split(/\t/,$line);
40						# Map gene IDs to the contig
41	1021				1452	push(@{$contigs_to_ids{$annotation_elements[0]}}, $id_name);
	1021				3260
42
43	1021	100			4474	if($line =~/product=["']?([^;,"']+)[,"']?;?/i)
44						{
45	163				231	my %gene_data;
46	163				534	$gene_data{product} = $1;
47	163				414	$gene_data{id_name} = $id_name;
48	163	100	66		1090	if($line =~ /UniProtKB/ || $line =~ /RefSeq/ || $line =~ /protein motif/)
			100
49						{
50	103				264	$gene_data{database_annotation_exists} = 1;
51						}
52						else
53						{
54	60				112	$gene_data{database_annotation_exists} = 0;
55						}
56
57	163				280	$gene_data{contig} = $annotation_elements[0];
58	163				326	$gene_data{start} = $annotation_elements[1];
59	163				300	$gene_data{end} = $annotation_elements[2];
60	163				900	push(@genes_annotation,\%gene_data);
61						}
62
63						}
64	110				2922	close($fh);
65
66	110				10762	$self->_genes_annotation(\@genes_annotation);
67	110				6005	return \%contigs_to_ids;
68						}
69
70						sub _build_overlapping_hypothetical_protein_ids
71						{
72	45			45	120	my ($self) = @_;
73	45				1342	$self->contig_to_ids;
74
75	45				80	my %overlapping_protein_ids;
76
77						#Checking to see if the current feature is hypothetical and if the next one has annotation
78	45				104	for(my $i = 0; $i< (@{$self->_genes_annotation} -1) ; $i++ )
	45				1157
79						{
80	0				0	my $current_feature = $self->_genes_annotation->[$i];
81	0				0	my $next_feature = $self->_genes_annotation->[$i+1];
82
83	0	0			0	next if($current_feature->{database_annotation_exists} == 1);
84	0	0			0	next unless($current_feature->{product} =~ /hypothetical/i);
85	0	0			0	next unless($next_feature->{database_annotation_exists} == 1);
86
87	0				0	my $start_coord = $current_feature->{start} ;
88	0				0	my $end_coord = $current_feature->{end} ;
89	0				0	my $comparison_start_coord =$next_feature->{start} ;
90	0				0	my $comparison_end_coord =$next_feature->{end} ;
91	0	0	0		0	if($comparison_start_coord < $end_coord && $comparison_end_coord > $start_coord )
92						{
93	0				0	my $percent_overlap = $self->_percent_overlap($start_coord, $end_coord , $comparison_start_coord,$comparison_end_coord);
94	0	0			0	if($percent_overlap >= $self->_min_nucleotide_overlap_percentage)
95						{
96	0				0	$overlapping_protein_ids{$current_feature->{id_name}}++;
97						}
98						}
99						}
100
101	45				1451	return \%overlapping_protein_ids;
102						}
103
104						sub _percent_overlap
105						{
106	0			0	0	my ($self, $start_coord, $end_coord , $comparison_start_coord,$comparison_end_coord) = @_;
107	0				0	my $size_of_hypothetical_gene = $end_coord - $start_coord;
108
109	0				0	my $lower_bound = $start_coord;
110	0	0			0	if($comparison_start_coord > $start_coord)
111						{
112	0				0	$lower_bound = $comparison_start_coord;
113						}
114	0				0	my $upper_bound = $end_coord;
115	0	0			0	if($comparison_end_coord < $end_coord )
116						{
117	0				0	$upper_bound = $comparison_end_coord;
118						}
119	0				0	return (($upper_bound-$lower_bound)*100) / $size_of_hypothetical_gene;
120						}
121
122
123						sub _build__awk_filter {
124	110			110	304	my ($self) = @_;
125						return
126	110				3288	'awk \'BEGIN {FS="\t"};{ if ($3 ~/'
127						. $self->_tags_to_filter
128						. '/) print $1"\t"$4"\t"$5"\t"$9;}\' ';
129						}
130
131	4			4	37	no Moose;
	4				9
	4				34
132						__PACKAGE__->meta->make_immutable;
133
134						1;
135
136						__END__
137
138						=pod
139
140						=encoding UTF-8
141
142						=head1 NAME
143
144						Bio::Roary::ContigsToGeneIDsFromGFF - Parse a GFF and efficiently extract ordered gene ids on each contig
145
146						=head1 VERSION
147
148						version 3.11.0
149
150						=head1 SYNOPSIS
151
152						Parse a GFF and efficiently extract ordered gene ids on each contig
153						use Bio::Roary::ContigsToGeneIDsFromGFF;
154
155						my $obj = Bio::Roary::ContigsToGeneIDsFromGFF->new(
156						gff_file => 'abc.gff'
157						);
158						$obj->contig_to_ids;
159
160						=head1 AUTHOR
161
162						Andrew J. Page <ap13@sanger.ac.uk>
163
164						=head1 COPYRIGHT AND LICENSE
165
166						This software is Copyright (c) 2013 by Wellcome Trust Sanger Institute.
167
168						This is free software, licensed under:
169
170						The GNU General Public License, Version 3, June 2007
171
172						=cut