line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Bio::Roary::ContigsToGeneIDsFromGFF; |
2
|
|
|
|
|
|
|
$Bio::Roary::ContigsToGeneIDsFromGFF::VERSION = '3.10.2'; |
3
|
|
|
|
|
|
|
# ABSTRACT: Parse a GFF and efficiently and extract ordered gene ids on each contig |
4
|
|
|
|
|
|
|
|
5
|
|
|
|
|
|
|
|
6
|
4
|
|
|
4
|
|
85167
|
use Moose; |
|
4
|
|
|
|
|
374264
|
|
|
4
|
|
|
|
|
28
|
|
7
|
4
|
|
|
4
|
|
24057
|
use Bio::Tools::GFF; |
|
4
|
|
|
|
|
158426
|
|
|
4
|
|
|
|
|
2537
|
|
8
|
|
|
|
|
|
|
with 'Bio::Roary::ParseGFFAnnotationRole'; |
9
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
has 'contig_to_ids' => ( is => 'rw', isa => 'HashRef', lazy => 1, builder => '_build_contig_to_ids'); |
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
has 'overlapping_hypothetical_protein_ids' => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build_overlapping_hypothetical_protein_ids'); |
13
|
|
|
|
|
|
|
has '_genes_annotation' => ( is => 'rw', isa => 'ArrayRef', default => sub{[]}); |
14
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
has '_min_nucleotide_overlap_percentage' => ( is => 'ro', isa => 'Int', default => 10); |
16
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
# Manually parse the GFF file because the BioPerl module is too slow |
18
|
|
|
|
|
|
|
sub _build_contig_to_ids |
19
|
|
|
|
|
|
|
{ |
20
|
110
|
|
|
110
|
|
305
|
my ($self) = @_; |
21
|
110
|
|
|
|
|
308
|
my %contigs_to_ids; |
22
|
|
|
|
|
|
|
my @genes_annotation; |
23
|
|
|
|
|
|
|
|
24
|
110
|
50
|
|
|
|
1850
|
open( my $fh, '-|', $self->_gff_fh_input_string ) or die "Couldnt open GFF file"; |
25
|
110
|
|
|
|
|
297010
|
while(<$fh>) |
26
|
|
|
|
|
|
|
{ |
27
|
1021
|
|
|
|
|
1863
|
chomp; |
28
|
1021
|
|
|
|
|
1705
|
my $line = $_; |
29
|
1021
|
|
|
|
|
1171
|
my $id_name; |
30
|
1021
|
50
|
|
|
|
3711
|
if($line =~/ID=["']?([^;"']+)["']?;?/i) |
31
|
|
|
|
|
|
|
{ |
32
|
1021
|
|
|
|
|
2119
|
$id_name= $1; |
33
|
|
|
|
|
|
|
} |
34
|
|
|
|
|
|
|
else |
35
|
|
|
|
|
|
|
{ |
36
|
0
|
|
|
|
|
0
|
next; |
37
|
|
|
|
|
|
|
} |
38
|
|
|
|
|
|
|
|
39
|
1021
|
|
|
|
|
2663
|
my @annotation_elements = split(/\t/,$line); |
40
|
|
|
|
|
|
|
# Map gene IDs to the contig |
41
|
1021
|
|
|
|
|
1242
|
push(@{$contigs_to_ids{$annotation_elements[0]}}, $id_name); |
|
1021
|
|
|
|
|
3028
|
|
42
|
|
|
|
|
|
|
|
43
|
1021
|
100
|
|
|
|
4315
|
if($line =~/product=["']?([^;,"']+)[,"']?;?/i) |
44
|
|
|
|
|
|
|
{ |
45
|
163
|
|
|
|
|
219
|
my %gene_data; |
46
|
163
|
|
|
|
|
464
|
$gene_data{product} = $1; |
47
|
163
|
|
|
|
|
327
|
$gene_data{id_name} = $id_name; |
48
|
163
|
100
|
66
|
|
|
811
|
if($line =~ /UniProtKB/ || $line =~ /RefSeq/ || $line =~ /protein motif/) |
|
|
|
100
|
|
|
|
|
49
|
|
|
|
|
|
|
{ |
50
|
103
|
|
|
|
|
196
|
$gene_data{database_annotation_exists} = 1; |
51
|
|
|
|
|
|
|
} |
52
|
|
|
|
|
|
|
else |
53
|
|
|
|
|
|
|
{ |
54
|
60
|
|
|
|
|
116
|
$gene_data{database_annotation_exists} = 0; |
55
|
|
|
|
|
|
|
} |
56
|
|
|
|
|
|
|
|
57
|
163
|
|
|
|
|
266
|
$gene_data{contig} = $annotation_elements[0]; |
58
|
163
|
|
|
|
|
306
|
$gene_data{start} = $annotation_elements[1]; |
59
|
163
|
|
|
|
|
305
|
$gene_data{end} = $annotation_elements[2]; |
60
|
163
|
|
|
|
|
818
|
push(@genes_annotation,\%gene_data); |
61
|
|
|
|
|
|
|
} |
62
|
|
|
|
|
|
|
|
63
|
|
|
|
|
|
|
} |
64
|
110
|
|
|
|
|
3037
|
close($fh); |
65
|
|
|
|
|
|
|
|
66
|
110
|
|
|
|
|
11633
|
$self->_genes_annotation(\@genes_annotation); |
67
|
110
|
|
|
|
|
5700
|
return \%contigs_to_ids; |
68
|
|
|
|
|
|
|
} |
69
|
|
|
|
|
|
|
|
70
|
|
|
|
|
|
|
sub _build_overlapping_hypothetical_protein_ids |
71
|
|
|
|
|
|
|
{ |
72
|
45
|
|
|
45
|
|
115
|
my ($self) = @_; |
73
|
45
|
|
|
|
|
1081
|
$self->contig_to_ids; |
74
|
|
|
|
|
|
|
|
75
|
45
|
|
|
|
|
66
|
my %overlapping_protein_ids; |
76
|
|
|
|
|
|
|
|
77
|
|
|
|
|
|
|
#Checking to see if the current feature is hypotheitical and if the next one has annotation |
78
|
45
|
|
|
|
|
85
|
for(my $i = 0; $i< (@{$self->_genes_annotation} -1) ; $i++ ) |
|
45
|
|
|
|
|
1090
|
|
79
|
|
|
|
|
|
|
{ |
80
|
0
|
|
|
|
|
0
|
my $current_feature = $self->_genes_annotation->[$i]; |
81
|
0
|
|
|
|
|
0
|
my $next_feature = $self->_genes_annotation->[$i+1]; |
82
|
|
|
|
|
|
|
|
83
|
0
|
0
|
|
|
|
0
|
next if($current_feature->{database_annotation_exists} == 1); |
84
|
0
|
0
|
|
|
|
0
|
next unless($current_feature->{product} =~ /hypothetical/i); |
85
|
0
|
0
|
|
|
|
0
|
next unless($next_feature->{database_annotation_exists} == 1); |
86
|
|
|
|
|
|
|
|
87
|
0
|
|
|
|
|
0
|
my $start_coord = $current_feature->{start} ; |
88
|
0
|
|
|
|
|
0
|
my $end_coord = $current_feature->{end} ; |
89
|
0
|
|
|
|
|
0
|
my $comparison_start_coord =$next_feature->{start} ; |
90
|
0
|
|
|
|
|
0
|
my $comparison_end_coord =$next_feature->{end} ; |
91
|
0
|
0
|
0
|
|
|
0
|
if($comparison_start_coord < $end_coord && $comparison_end_coord > $start_coord ) |
92
|
|
|
|
|
|
|
{ |
93
|
0
|
|
|
|
|
0
|
my $percent_overlap = $self->_percent_overlap($start_coord, $end_coord , $comparison_start_coord,$comparison_end_coord); |
94
|
0
|
0
|
|
|
|
0
|
if($percent_overlap >= $self->_min_nucleotide_overlap_percentage) |
95
|
|
|
|
|
|
|
{ |
96
|
0
|
|
|
|
|
0
|
$overlapping_protein_ids{$current_feature->{id_name}}++; |
97
|
|
|
|
|
|
|
} |
98
|
|
|
|
|
|
|
} |
99
|
|
|
|
|
|
|
} |
100
|
|
|
|
|
|
|
|
101
|
45
|
|
|
|
|
1253
|
return \%overlapping_protein_ids; |
102
|
|
|
|
|
|
|
} |
103
|
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
sub _percent_overlap |
105
|
|
|
|
|
|
|
{ |
106
|
0
|
|
|
0
|
|
0
|
my ($self, $start_coord, $end_coord , $comparison_start_coord,$comparison_end_coord) = @_; |
107
|
0
|
|
|
|
|
0
|
my $size_of_hypothetical_gene = $end_coord - $start_coord; |
108
|
|
|
|
|
|
|
|
109
|
0
|
|
|
|
|
0
|
my $lower_bound = $start_coord; |
110
|
0
|
0
|
|
|
|
0
|
if($comparison_start_coord > $start_coord) |
111
|
|
|
|
|
|
|
{ |
112
|
0
|
|
|
|
|
0
|
$lower_bound = $comparison_start_coord; |
113
|
|
|
|
|
|
|
} |
114
|
0
|
|
|
|
|
0
|
my $upper_bound = $end_coord; |
115
|
0
|
0
|
|
|
|
0
|
if($comparison_end_coord < $end_coord ) |
116
|
|
|
|
|
|
|
{ |
117
|
0
|
|
|
|
|
0
|
$upper_bound = $comparison_end_coord; |
118
|
|
|
|
|
|
|
} |
119
|
0
|
|
|
|
|
0
|
return (($upper_bound-$lower_bound)*100) / $size_of_hypothetical_gene; |
120
|
|
|
|
|
|
|
} |
121
|
|
|
|
|
|
|
|
122
|
|
|
|
|
|
|
|
123
|
|
|
|
|
|
|
sub _build__awk_filter { |
124
|
110
|
|
|
110
|
|
333
|
my ($self) = @_; |
125
|
|
|
|
|
|
|
return |
126
|
110
|
|
|
|
|
4025
|
'awk \'BEGIN {FS="\t"};{ if ($3 ~/' |
127
|
|
|
|
|
|
|
. $self->_tags_to_filter |
128
|
|
|
|
|
|
|
. '/) print $1"\t"$4"\t"$5"\t"$9;}\' '; |
129
|
|
|
|
|
|
|
} |
130
|
|
|
|
|
|
|
|
131
|
4
|
|
|
4
|
|
35
|
no Moose; |
|
4
|
|
|
|
|
7
|
|
|
4
|
|
|
|
|
34
|
|
132
|
|
|
|
|
|
|
__PACKAGE__->meta->make_immutable; |
133
|
|
|
|
|
|
|
|
134
|
|
|
|
|
|
|
1; |
135
|
|
|
|
|
|
|
|
136
|
|
|
|
|
|
|
__END__ |
137
|
|
|
|
|
|
|
|
138
|
|
|
|
|
|
|
=pod |
139
|
|
|
|
|
|
|
|
140
|
|
|
|
|
|
|
=encoding UTF-8 |
141
|
|
|
|
|
|
|
|
142
|
|
|
|
|
|
|
=head1 NAME |
143
|
|
|
|
|
|
|
|
144
|
|
|
|
|
|
|
Bio::Roary::ContigsToGeneIDsFromGFF - Parse a GFF and efficiently and extract ordered gene ids on each contig |
145
|
|
|
|
|
|
|
|
146
|
|
|
|
|
|
|
=head1 VERSION |
147
|
|
|
|
|
|
|
|
148
|
|
|
|
|
|
|
version 3.10.2 |
149
|
|
|
|
|
|
|
|
150
|
|
|
|
|
|
|
=head1 SYNOPSIS |
151
|
|
|
|
|
|
|
|
152
|
|
|
|
|
|
|
Parse a GFF and efficiently and extract ordered gene ids on each contig |
153
|
|
|
|
|
|
|
use Bio::Roary::ContigsToGeneIDsFromGFF; |
154
|
|
|
|
|
|
|
|
155
|
|
|
|
|
|
|
my $obj = Bio::Roary::ContigsToGeneIDsFromGFF->new( |
156
|
|
|
|
|
|
|
gff_file => 'abc.gff' |
157
|
|
|
|
|
|
|
); |
158
|
|
|
|
|
|
|
$obj->contig_to_ids; |
159
|
|
|
|
|
|
|
|
160
|
|
|
|
|
|
|
=head1 AUTHOR |
161
|
|
|
|
|
|
|
|
162
|
|
|
|
|
|
|
Andrew J. Page <ap13@sanger.ac.uk> |
163
|
|
|
|
|
|
|
|
164
|
|
|
|
|
|
|
=head1 COPYRIGHT AND LICENSE |
165
|
|
|
|
|
|
|
|
166
|
|
|
|
|
|
|
This software is Copyright (c) 2013 by Wellcome Trust Sanger Institute. |
167
|
|
|
|
|
|
|
|
168
|
|
|
|
|
|
|
This is free software, licensed under: |
169
|
|
|
|
|
|
|
|
170
|
|
|
|
|
|
|
The GNU General Public License, Version 3, June 2007 |
171
|
|
|
|
|
|
|
|
172
|
|
|
|
|
|
|
=cut |