File Coverage

lib/Bio/Roary/OrderGenes.pm
Criterion Covered Total %
statement 227 245 92.6
branch 28 36 77.7
condition 5 12 41.6
subroutine 18 19 94.7
pod n/a
total 278 312 89.1


line stmt bran cond sub pod time code
1             package Bio::Roary::OrderGenes;
2             $Bio::Roary::OrderGenes::VERSION = '3.10.2';
3             # ABSTRACT: Take in GFF files and create a matrix of what genes are beside what other genes
4              
5              
6 3     3   656 use Moose;
  3         6  
  3         26  
7 3     3   18549 use Bio::Roary::Exceptions;
  3         6  
  3         58  
8 3     3   38 use Bio::Roary::AnalyseGroups;
  3         6  
  3         65  
9 3     3   1133 use Bio::Roary::ContigsToGeneIDsFromGFF;
  3         9  
  3         119  
10 3     3   1904 use Graph;
  3         223831  
  3         140  
11 3     3   1051 use Graph::Writer::Dot;
  3         7531  
  3         99  
12 3     3   21 use File::Basename;
  3         7  
  3         5377  
13              
14             has 'gff_files' => ( is => 'ro', isa => 'ArrayRef', required => 1 );
15             has 'analyse_groups_obj' => ( is => 'ro', isa => 'Bio::Roary::AnalyseGroups', required => 1 );
16             has 'core_definition' => ( is => 'ro', isa => 'Num', default => 1.0 );
17             has 'pan_graph_filename' => ( is => 'ro', isa => 'Str', default => 'core_accessory_graph.dot' );
18             has 'accessory_graph_filename' => ( is => 'ro', isa => 'Str', default => 'accessory_graph.dot' );
19             has 'sample_weights' => ( is => 'ro', isa => 'Maybe[HashRef]' );
20             has 'samples_to_clusters' => ( is => 'ro', isa => 'Maybe[HashRef]' );
21             has 'group_order' => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build_group_order' );
22             has 'groups_to_sample_names' => ( is => 'rw', isa => 'HashRef', default => sub { {} } );
23             has 'group_graphs' => ( is => 'ro', isa => 'Graph', lazy => 1, builder => '_build_group_graphs' );
24             has 'groups_to_contigs' => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build_groups_to_contigs' );
25             has '_groups_to_file_contigs' => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build__groups_to_file_contigs' );
26             has '_groups' => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build_groups' );
27             has 'number_of_files' => ( is => 'ro', isa => 'Int', lazy => 1, builder => '_build_number_of_files' );
28             has '_groups_qc' => ( is => 'ro', isa => 'HashRef', default => sub { {} } );
29             has '_percentage_of_largest_weak_threshold' => ( is => 'ro', isa => 'Num', default => 0.9 );
30              
31             sub _build_number_of_files {
32 15     15   31 my ($self) = @_;
33 15         22 return @{ $self->gff_files };
  15         325  
34             }
35              
36             sub _build_groups {
37 0     0   0 my ($self) = @_;
38 0         0 my %groups;
39 0         0 for my $group_name ( @{ $self->analyse_groups_obj->_groups } ) {
  0         0  
40 0         0 $groups{$group_name}++;
41             }
42 0         0 return \%groups;
43             }
44              
45             sub _build__groups_to_file_contigs {
46 36     36   130 my ($self) = @_;
47              
48 36         76 my @overlapping_hypothetical_gene_ids;
49             my %samples_to_groups_contigs;
50              
51             # Open each GFF file
52 36         77 for my $filename ( @{ $self->gff_files } ) {
  36         1011  
53 108         230 my @groups_to_contigs;
54 108         4621 my $contigs_to_ids_obj = Bio::Roary::ContigsToGeneIDsFromGFF->new( gff_file => $filename );
55              
56 108         5394 my ( $sample_name, $directories, $suffix ) = fileparse($filename);
57 108         832 $sample_name =~ s/\.gff//gi;
58              
59             # Loop over each contig in the GFF file
60 108         377 for my $contig_name ( keys %{ $contigs_to_ids_obj->contig_to_ids } ) {
  108         4098  
61 189         276 my @groups_on_contig;
62              
63             # loop over each gene in each contig in the GFF file
64 189         211 for my $gene_id ( @{ $contigs_to_ids_obj->contig_to_ids->{$contig_name} } ) {
  189         4694  
65              
66             # convert to group name
67 999         19542 my $group_name = $self->analyse_groups_obj->_genes_to_groups->{$gene_id};
68 999 100       1901 next unless ( defined($group_name) );
69              
70 176 50       4768 if ( $contigs_to_ids_obj->overlapping_hypothetical_protein_ids->{$gene_id} ) {
71 0         0 $self->_groups_qc->{$group_name} =
72             'Hypothetical protein with no hits to refseq/uniprot/clusters/cdd/tigrfams/pfam overlapping another protein with hits';
73             }
74 176         391 push( @groups_on_contig, $group_name );
75             }
76 189         436 push( @groups_to_contigs, \@groups_on_contig );
77             }
78 108         5328 $samples_to_groups_contigs{$sample_name} = \@groups_to_contigs;
79             }
80              
81 36         1773 return \%samples_to_groups_contigs;
82              
83             }
84              
85             sub _build_group_order {
86 36     36   110 my ($self) = @_;
87 36         97 my %group_order;
88              
89             my %groups_to_sample_names;
90 36         55 for my $sample_name ( keys %{ $self->_groups_to_file_contigs } ) {
  36         1110  
91 108         3176 my $groups_to_file_contigs = $self->_groups_to_file_contigs->{$sample_name};
92 108         181 for my $groups_on_contig ( @{$groups_to_file_contigs} ) {
  108         267  
93 189         218 for ( my $i = 1 ; $i < @{$groups_on_contig} ; $i++ ) {
  320         534  
94 131         209 my $group_from = $groups_on_contig->[ $i - 1 ];
95 131         170 my $group_to = $groups_on_contig->[$i];
96              
97 131 100 66     2523 if ( defined( $self->sample_weights ) && $self->sample_weights->{$sample_name} ) {
98 11         187 $group_order{$group_from}{$group_to} += $self->sample_weights->{$sample_name};
99 11         14 push( @{ $groups_to_sample_names{$group_from} }, $sample_name );
  11         28  
100             }
101             else {
102 120         369 $group_order{$group_from}{$group_to}++;
103             }
104             }
105 189 100       194 if ( @{$groups_on_contig} == 1 ) {
  189         348  
106 6         13 my $group_from = $groups_on_contig->[0];
107 6         10 my $group_to = $groups_on_contig->[0];
108 6 50 33     111 if ( defined( $self->sample_weights ) && $self->sample_weights->{$sample_name} ) {
109 0         0 $group_order{$group_from}{$group_to} += $self->sample_weights->{$sample_name};
110 0         0 push( @{ $groups_to_sample_names{$group_from} }, $sample_name );
  0         0  
111             }
112             else {
113 6         19 $group_order{$group_from}{$group_to}++;
114             }
115             }
116             }
117             }
118              
119 36         1319 $self->groups_to_sample_names( \%groups_to_sample_names );
120 36         1023 return \%group_order;
121             }
122              
123             sub _build_group_graphs {
124 36     36   106 my ($self) = @_;
125 36         399 return Graph->new( undirected => 1 );
126             }
127              
128             sub _save_graph_to_file {
129 72     72   180 my ( $self, $graph, $output_filename ) = @_;
130 72         829 my $writer = Graph::Writer::Dot->new();
131 72         2357 $writer->write_graph( $graph, $output_filename );
132 72         82737 return 1;
133             }
134              
135             sub _add_groups_to_graph {
136 36     36   90 my ($self) = @_;
137              
138 36         63 for my $current_group ( keys %{ $self->group_order() } ) {
  36         1068  
139 65         19003 for my $group_to ( keys %{ $self->group_order->{$current_group} } ) {
  65         1722  
140 71         3871 my $weight = 1.0 / ( $self->group_order->{$current_group}->{$group_to} );
141 71         1656 $self->group_graphs->add_weighted_edge( $current_group, $group_to, $weight );
142             }
143             }
144              
145             }
146              
147             sub _reorder_connected_components {
148 72     72   198 my ( $self, $graph_groups ) = @_;
149 72         169 my @ordered_graph_groups;
150             my @paths_and_weights;
151              
152 72         120 for my $graph_group ( @{$graph_groups} ) {
  72         341  
153 29         49 my %groups;
154 29         43 $groups{$_}++ for ( @{$graph_group} );
  29         141  
155 29         47 my $edge_sum = 0;
156              
157 29         70 for my $current_group ( keys %groups ) {
158 107         122 for my $group_to ( keys %{ $self->group_order->{$current_group} } ) {
  107         2305  
159 97 100       188 next unless defined( $groups{$group_to} );
160 86         1482 $edge_sum += $self->group_order->{$current_group}->{$group_to};
161             }
162             }
163              
164 29         56 my %samples_in_graph;
165 29         67 for my $current_group ( keys %groups ) {
166 107         2075 my $sample_names = $self->groups_to_sample_names->{$current_group};
167 107 100       227 if ( defined($sample_names) ) {
168 11         12 for my $sample_name ( @{$sample_names} ) {
  11         17  
169 15         24 $samples_in_graph{$sample_name}++;
170             }
171             }
172             }
173 29         112 my @sample_names = sort keys %samples_in_graph;
174              
175 29 100       50 if ( @{$graph_group} == 1 ) {
  29         77  
176              
177 6         52 push(
178             @paths_and_weights,
179             {
180             path => $graph_group,
181             average_weight => $edge_sum,
182             sample_names => \@sample_names
183             }
184             );
185             }
186             else {
187 23         83 my $graph = Graph->new( undirected => 1 );
188 23         4884 for my $current_group ( keys %groups ) {
189 101         19402 for my $group_to ( keys %{ $self->group_order->{$current_group} } ) {
  101         2218  
190 91 100       1930 if ( $groups{$group_to} ) {
191 84         1521 my $weight = 1 / $self->group_order->{$current_group}->{$group_to};
192 84         207 $graph->add_weighted_edge( $current_group, $group_to, $weight );
193             }
194             }
195             }
196 23         3922 my $minimum_spanning_tree = $graph->minimum_spanning_tree;
197 23         59831 my $dfs_obj = Graph::Traversal::DFS->new($minimum_spanning_tree);
198 23         7587 my @reordered_dfs_groups = $dfs_obj->dfs;
199 23         38844 push(
200             @paths_and_weights,
201             {
202             path => \@reordered_dfs_groups,
203             average_weight => $edge_sum,
204             sample_names => \@sample_names
205             }
206             );
207             }
208              
209             }
210              
211 72         279 return $self->_order_by_samples_and_weights( \@paths_and_weights );
212             }
213              
214             sub _order_by_samples_and_weights {
215 73     73   201 my ( $self, $paths_and_weights ) = @_;
216              
217 73         135 my @ordered_graph_groups;
218 73 100       2793 if ( !defined( $self->samples_to_clusters ) ) {
219 72         133 my @ordered_paths_and_weights = sort { $a->{average_weight} <=> $b->{average_weight} } @{$paths_and_weights};
  5         29  
  72         254  
220 72         206 @ordered_graph_groups = map { $_->{path} } @ordered_paths_and_weights;
  29         77  
221 72         367 return \@ordered_graph_groups;
222             }
223              
224             # Find the largest cluster in each graph and regroup
225 1         2 my %largest_cluster_to_paths_and_weights;
226 1         1 for my $graph_details ( @{$paths_and_weights} ) {
  1         5  
227 3         4 my %cluster_count;
228 3         4 for my $sample_name ( @{ $graph_details->{sample_names} } ) {
  3         5  
229 6 50       100 if ( defined( $self->samples_to_clusters->{$sample_name} ) ) {
230 6         95 $cluster_count{ $self->samples_to_clusters->{$sample_name} }++;
231             }
232             }
233 3 0       9 my $largest_cluster = ( sort { $cluster_count{$b} <=> $cluster_count{$a} || $a cmp $b} keys %cluster_count )[0];
  0         0  
234 3 50       6 if ( !defined($largest_cluster) ) {
235 0         0 my @ordered_paths_and_weights = sort { $b->{average_weight} <=> $a->{average_weight} } @{$paths_and_weights};
  0         0  
  0         0  
236 0         0 @ordered_graph_groups = map { $_->{path} } @ordered_paths_and_weights;
  0         0  
237 0         0 return \@ordered_graph_groups;
238             }
239              
240 3         4 push( @{ $largest_cluster_to_paths_and_weights{$largest_cluster}{graph_details} }, $graph_details );
  3         7  
241 3         8 $largest_cluster_to_paths_and_weights{$largest_cluster}{largest_cluster_size} += $cluster_count{$largest_cluster};
242             }
243              
244             # go through each cluster group and order by weight
245 1         4 my @clustered_ordered_graph_groups;
246 1         6 for my $cluster_name (
247             sort {
248             $largest_cluster_to_paths_and_weights{$b}->{largest_cluster_size}
249             <=> $largest_cluster_to_paths_and_weights{$a}->{largest_cluster_size}
250 1         4 } keys %largest_cluster_to_paths_and_weights
251             )
252             {
253            
254             my @ordered_paths_and_weights =
255 2         6 sort { $b->{average_weight} <=> $a->{average_weight} } @{ $largest_cluster_to_paths_and_weights{$cluster_name}->{graph_details} };
  1         4  
  2         6  
256 2         3 @ordered_graph_groups = map { $_->{path} } @ordered_paths_and_weights;
  3         5  
257              
258 2         5 for my $graph_group (@ordered_graph_groups) {
259 3         5 push( @clustered_ordered_graph_groups, $graph_group );
260             }
261             }
262 1         7 return \@clustered_ordered_graph_groups;
263             }
264              
265             sub _build_groups_to_contigs {
266 36     36   104 my ($self) = @_;
267 36         227 $self->_add_groups_to_graph;
268              
269 36         4348 my %groups_to_contigs;
270 36         144 my $counter = 1;
271 36         71 my $overall_counter = 1;
272 36         72 my $counter_filtered = 1;
273              
274             # Accessory
275 36         313 my $accessory_graph = $self->_create_accessory_graph;
276 36         357 my @group_graphs = $accessory_graph->connected_components();
277 36         44928 my $reordered_graphs = $self->_reorder_connected_components( \@group_graphs );
278              
279 36         1247 $self->_save_graph_to_file( $accessory_graph, $self->accessory_graph_filename );
280              
281 36         88 for my $contig_groups ( @{$reordered_graphs} ) {
  36         198  
282 14         32 my $order_counter = 1;
283              
284 14         30 for my $group_name ( @{$contig_groups} ) {
  14         36  
285 29         99 $groups_to_contigs{$group_name}{accessory_label} = $counter;
286 29         47 $groups_to_contigs{$group_name}{accessory_order} = $order_counter;
287 29         56 $groups_to_contigs{$group_name}{'accessory_overall_order'} = $overall_counter;
288 29         42 $order_counter++;
289 29         50 $overall_counter++;
290             }
291 14         26 $counter++;
292             }
293              
294             # Core + accessory
295 36         1562 my @group_graphs_all = $self->group_graphs->connected_components();
296 36         52903 my $reordered_graphs_all = $self->_reorder_connected_components( \@group_graphs_all );
297 36         1010 $self->_save_graph_to_file( $self->group_graphs, $self->pan_graph_filename );
298              
299 36         90 $overall_counter = 1;
300 36         86 $counter = 1;
301 36         75 $counter_filtered = 1;
302 36         65 for my $contig_groups ( @{$reordered_graphs_all} ) {
  36         110  
303 15         18 my $order_counter = 1;
304              
305 15         21 for my $group_name ( @{$contig_groups} ) {
  15         29  
306 78         172 $groups_to_contigs{$group_name}{label} = $counter;
307 78         145 $groups_to_contigs{$group_name}{comment} = '';
308 78         98 $groups_to_contigs{$group_name}{order} = $order_counter;
309 78         121 $groups_to_contigs{$group_name}{'core_accessory_overall_order'} = $overall_counter;
310              
311 78 100       86 if ( @{$contig_groups} <= 2 ) {
  78 50       1605  
312 2         7 $groups_to_contigs{$group_name}{comment} = 'Investigate';
313             }
314             elsif ( $self->_groups_qc->{$group_name} ) {
315 0         0 $groups_to_contigs{$group_name}{comment} = $self->_groups_qc->{$group_name};
316             }
317             else {
318 76         128 $groups_to_contigs{$group_name}{'core_accessory_overall_order_filtered'} = $counter_filtered;
319 76         78 $counter_filtered++;
320             }
321 78         90 $order_counter++;
322 78         93 $overall_counter++;
323             }
324 15         23 $counter++;
325             }
326              
327 36         74 $counter_filtered = 1;
328 36         54 for my $contig_groups ( @{$reordered_graphs} ) {
  36         87  
329 14         23 for my $group_name ( @{$contig_groups} ) {
  14         30  
330 29 50 33     195 if ( ( !defined( $groups_to_contigs{$group_name}{comment} ) )
      33        
331             || ( defined( $groups_to_contigs{$group_name}{comment} ) && $groups_to_contigs{$group_name}{comment} eq '' ) )
332             {
333 29         45 $groups_to_contigs{$group_name}{'accessory_overall_order_filtered'} = $counter_filtered;
334 29         37 $counter_filtered++;
335             }
336             }
337             }
338              
339 36         1418 return \%groups_to_contigs;
340             }
341              
342             sub _create_accessory_graph {
343 36     36   116 my ($self) = @_;
344 36         532 my $graph = Graph->new( undirected => 1 );
345              
346 36         12607 my %core_groups;
347             my %group_freq;
348              
349 36         86 for my $sample_name ( keys %{ $self->_groups_to_file_contigs } ) {
  36         1162  
350 108         2661 my $groups_to_file_contigs = $self->_groups_to_file_contigs->{$sample_name};
351              
352 108         170 for my $groups_on_contig ( @{$groups_to_file_contigs} ) {
  108         296  
353 189         179 for my $current_group ( @{$groups_on_contig} ) {
  189         260  
354 176         262 $group_freq{$current_group}++;
355             }
356             }
357             }
358              
359 36         99 for my $current_group ( keys %{ $self->group_order() } ) {
  36         860  
360 65 100       6962 next if ( $group_freq{$current_group} >= ( $self->number_of_files * $self->core_definition ) );
361            
362 26         49 for my $group_to ( keys %{ $self->group_order->{$current_group} } ) {
  26         529  
363 26 100       556 if ( $group_freq{$group_to} >= ( $self->number_of_files * $self->core_definition ) ) {
364 11         41 $graph->add_vertex($current_group);
365             }
366             else {
367 15         366 my $weight = 1.0 / ( $self->group_order->{$current_group}->{$group_to} );
368 15         49 $graph->add_weighted_edge( $current_group, $group_to, $weight );
369             }
370             }
371             }
372              
373 36         1127 return $graph;
374             }
375              
376 3     3   34 no Moose;
  3         6  
  3         36  
377             __PACKAGE__->meta->make_immutable;
378              
379             1;
380              
381             __END__
382              
383             =pod
384              
385             =encoding UTF-8
386              
387             =head1 NAME
388              
389             Bio::Roary::OrderGenes - Take in GFF files and create a matrix of what genes are beside what other genes
390              
391             =head1 VERSION
392              
393             version 3.10.2
394              
395             =head1 SYNOPSIS
396              
397             Take in the analyse groups and create a matrix of what genes are beside what other genes
398             use Bio::Roary::OrderGenes;
399              
400             my $obj = Bio::Roary::OrderGenes->new(
401             analyse_groups_obj => $analyse_groups_obj,
402             gff_files => ['file1.gff','file2.gff']
403             );
404             $obj->groups_to_contigs;
405              
406             =head1 AUTHOR
407              
408             Andrew J. Page <ap13@sanger.ac.uk>
409              
410             =head1 COPYRIGHT AND LICENSE
411              
412             This software is Copyright (c) 2013 by Wellcome Trust Sanger Institute.
413              
414             This is free software, licensed under:
415              
416             The GNU General Public License, Version 3, June 2007
417              
418             =cut