File Coverage

lib/Bio/Roary/ContigsToGeneIDsFromGFF.pm
Criterion Covered Total %
statement 43 66 65.1
branch 6 22 27.2
condition 5 9 55.5
subroutine 6 7 85.7
pod n/a
total 60 104 57.6


line stmt bran cond sub pod time code
1             package Bio::Roary::ContigsToGeneIDsFromGFF;
2             $Bio::Roary::ContigsToGeneIDsFromGFF::VERSION = '3.11.0';
3             # ABSTRACT: Parse a GFF and efficiently and extract ordered gene ids on each contig
4              
5              
6 4     4   89838 use Moose;
  4         413012  
  4         27  
7 4     4   26773 use Bio::Tools::GFF;
  4         168145  
  4         2347  
8             with 'Bio::Roary::ParseGFFAnnotationRole';
9              
10             has 'contig_to_ids' => ( is => 'rw', isa => 'HashRef', lazy => 1, builder => '_build_contig_to_ids');
11              
12             has 'overlapping_hypothetical_protein_ids' => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build_overlapping_hypothetical_protein_ids');
13             has '_genes_annotation' => ( is => 'rw', isa => 'ArrayRef', default => sub{[]});
14              
15             has '_min_nucleotide_overlap_percentage' => ( is => 'ro', isa => 'Int', default => 10);
16              
17             # Manually parse the GFF file because the BioPerl module is too slow
18             sub _build_contig_to_ids
19             {
20 110     110   283 my ($self) = @_;
21 110         238 my %contigs_to_ids;
22             my @genes_annotation;
23            
24 110 50       1778 open( my $fh, '-|', $self->_gff_fh_input_string ) or die "Couldnt open GFF file";
25 110         279794 while(<$fh>)
26             {
27 1021         1888 chomp;
28 1021         1864 my $line = $_;
29 1021         1151 my $id_name;
30 1021 50       4201 if($line =~/ID=["']?([^;"']+)["']?;?/i)
31             {
32 1021         2297 $id_name= $1;
33             }
34             else
35             {
36 0         0 next;
37             }
38            
39 1021         2845 my @annotation_elements = split(/\t/,$line);
40             # Map gene IDs to the contig
41 1021         1452 push(@{$contigs_to_ids{$annotation_elements[0]}}, $id_name);
  1021         3260  
42            
43 1021 100       4474 if($line =~/product=["']?([^;,"']+)[,"']?;?/i)
44             {
45 163         231 my %gene_data;
46 163         534 $gene_data{product} = $1;
47 163         414 $gene_data{id_name} = $id_name;
48 163 100 66     1090 if($line =~ /UniProtKB/ || $line =~ /RefSeq/ || $line =~ /protein motif/)
      100        
49             {
50 103         264 $gene_data{database_annotation_exists} = 1;
51             }
52             else
53             {
54 60         112 $gene_data{database_annotation_exists} = 0;
55             }
56            
57 163         280 $gene_data{contig} = $annotation_elements[0];
58 163         326 $gene_data{start} = $annotation_elements[1];
59 163         300 $gene_data{end} = $annotation_elements[2];
60 163         900 push(@genes_annotation,\%gene_data);
61             }
62              
63             }
64 110         2922 close($fh);
65            
66 110         10762 $self->_genes_annotation(\@genes_annotation);
67 110         6005 return \%contigs_to_ids;
68             }
69              
70             sub _build_overlapping_hypothetical_protein_ids
71             {
72 45     45   120 my ($self) = @_;
73 45         1342 $self->contig_to_ids;
74            
75 45         80 my %overlapping_protein_ids;
76            
77             #Checking to see if the current feature is hypotheitical and if the next one has annotation
78 45         104 for(my $i = 0; $i< (@{$self->_genes_annotation} -1) ; $i++ )
  45         1157  
79             {
80 0         0 my $current_feature = $self->_genes_annotation->[$i];
81 0         0 my $next_feature = $self->_genes_annotation->[$i+1];
82            
83 0 0       0 next if($current_feature->{database_annotation_exists} == 1);
84 0 0       0 next unless($current_feature->{product} =~ /hypothetical/i);
85 0 0       0 next unless($next_feature->{database_annotation_exists} == 1);
86            
87 0         0 my $start_coord = $current_feature->{start} ;
88 0         0 my $end_coord = $current_feature->{end} ;
89 0         0 my $comparison_start_coord =$next_feature->{start} ;
90 0         0 my $comparison_end_coord =$next_feature->{end} ;
91 0 0 0     0 if($comparison_start_coord < $end_coord && $comparison_end_coord > $start_coord )
92             {
93 0         0 my $percent_overlap = $self->_percent_overlap($start_coord, $end_coord , $comparison_start_coord,$comparison_end_coord);
94 0 0       0 if($percent_overlap >= $self->_min_nucleotide_overlap_percentage)
95             {
96 0         0 $overlapping_protein_ids{$current_feature->{id_name}}++;
97             }
98             }
99             }
100            
101 45         1451 return \%overlapping_protein_ids;
102             }
103              
104             sub _percent_overlap
105             {
106 0     0   0 my ($self, $start_coord, $end_coord , $comparison_start_coord,$comparison_end_coord) = @_;
107 0         0 my $size_of_hypothetical_gene = $end_coord - $start_coord;
108            
109 0         0 my $lower_bound = $start_coord;
110 0 0       0 if($comparison_start_coord > $start_coord)
111             {
112 0         0 $lower_bound = $comparison_start_coord;
113             }
114 0         0 my $upper_bound = $end_coord;
115 0 0       0 if($comparison_end_coord < $end_coord )
116             {
117 0         0 $upper_bound = $comparison_end_coord;
118             }
119 0         0 return (($upper_bound-$lower_bound)*100) / $size_of_hypothetical_gene;
120             }
121              
122              
123             sub _build__awk_filter {
124 110     110   304 my ($self) = @_;
125             return
126 110         3288 'awk \'BEGIN {FS="\t"};{ if ($3 ~/'
127             . $self->_tags_to_filter
128             . '/) print $1"\t"$4"\t"$5"\t"$9;}\' ';
129             }
130              
131 4     4   37 no Moose;
  4         9  
  4         34  
132             __PACKAGE__->meta->make_immutable;
133              
134             1;
135              
136             __END__
137              
138             =pod
139              
140             =encoding UTF-8
141              
142             =head1 NAME
143              
144             Bio::Roary::ContigsToGeneIDsFromGFF - Parse a GFF and efficiently and extract ordered gene ids on each contig
145              
146             =head1 VERSION
147              
148             version 3.11.0
149              
150             =head1 SYNOPSIS
151              
152             Parse a GFF and efficiently and extract ordered gene ids on each contig
153             use Bio::Roary::ContigsToGeneIDsFromGFF;
154              
155             my $obj = Bio::Roary::ContigsToGeneIDsFromGFF->new(
156             gff_file => 'abc.gff'
157             );
158             $obj->contig_to_ids;
159              
160             =head1 AUTHOR
161              
162             Andrew J. Page <ap13@sanger.ac.uk>
163              
164             =head1 COPYRIGHT AND LICENSE
165              
166             This software is Copyright (c) 2013 by Wellcome Trust Sanger Institute.
167              
168             This is free software, licensed under:
169              
170             The GNU General Public License, Version 3, June 2007
171              
172             =cut