File Coverage

lib/Bio/Roary/OrderGenes.pm

Criterion	Covered	Total	%
statement	227	245	92.6
branch	28	36	77.7
condition	5	12	41.6
subroutine	18	19	94.7
pod			n/a
total	278	312	89.1

line	stmt	bran	cond	sub	time	code
1						package Bio::Roary::OrderGenes;
2						$Bio::Roary::OrderGenes::VERSION = '3.10.1';
3						# ABSTRACT: Take in GFF files and create a matrix of what genes are beside what other genes
4
5
6	3			3	935	use Moose;
	3				6
	3				22
7	3			3	19381	use Bio::Roary::Exceptions;
	3				9
	3				90
8	3			3	19	use Bio::Roary::AnalyseGroups;
	3				7
	3				88
9	3			3	1176	use Bio::Roary::ContigsToGeneIDsFromGFF;
	3				13
	3				127
10	3			3	1935	use Graph;
	3				281517
	3				126
11	3			3	1142	use Graph::Writer::Dot;
	3				7634
	3				97
12	3			3	22	use File::Basename;
	3				5
	3				5550
13
14						has 'gff_files' => ( is => 'ro', isa => 'ArrayRef', required => 1 );
15						has 'analyse_groups_obj' => ( is => 'ro', isa => 'Bio::Roary::AnalyseGroups', required => 1 );
16						has 'core_definition' => ( is => 'ro', isa => 'Num', default => 1.0 );
17						has 'pan_graph_filename' => ( is => 'ro', isa => 'Str', default => 'core_accessory_graph.dot' );
18						has 'accessory_graph_filename' => ( is => 'ro', isa => 'Str', default => 'accessory_graph.dot' );
19						has 'sample_weights' => ( is => 'ro', isa => 'Maybe[HashRef]' );
20						has 'samples_to_clusters' => ( is => 'ro', isa => 'Maybe[HashRef]' );
21						has 'group_order' => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build_group_order' );
22						has 'groups_to_sample_names' => ( is => 'rw', isa => 'HashRef', default => sub { {} } );
23						has 'group_graphs' => ( is => 'ro', isa => 'Graph', lazy => 1, builder => '_build_group_graphs' );
24						has 'groups_to_contigs' => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build_groups_to_contigs' );
25						has '_groups_to_file_contigs' => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build__groups_to_file_contigs' );
26						has '_groups' => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build_groups' );
27						has 'number_of_files' => ( is => 'ro', isa => 'Int', lazy => 1, builder => '_build_number_of_files' );
28						has '_groups_qc' => ( is => 'ro', isa => 'HashRef', default => sub { {} } );
29						has '_percentage_of_largest_weak_threshold' => ( is => 'ro', isa => 'Num', default => 0.9 );
30
31						sub _build_number_of_files {
32	15			15	43	my ($self) = @_;
33	15				29	return @{ $self->gff_files };
	15				468
34						}
35
36						sub _build_groups {
37	0			0	0	my ($self) = @_;
38	0				0	my %groups;
39	0				0	for my $group_name ( @{ $self->analyse_groups_obj->_groups } ) {
	0				0
40	0				0	$groups{$group_name}++;
41						}
42	0				0	return \%groups;
43						}
44
45						sub _build__groups_to_file_contigs {
46	36			36	107	my ($self) = @_;
47
48	36				122	my @overlapping_hypothetical_gene_ids;
49						my %samples_to_groups_contigs;
50
51						# Open each GFF file
52	36				61	for my $filename ( @{ $self->gff_files } ) {
	36				1193
53	108				379	my @groups_to_contigs;
54	108				5643	my $contigs_to_ids_obj = Bio::Roary::ContigsToGeneIDsFromGFF->new( gff_file => $filename );
55
56	108				6015	my ( $sample_name, $directories, $suffix ) = fileparse($filename);
57	108				1092	$sample_name =~ s/\.gff//gi;
58
59						# Loop over each contig in the GFF file
60	108				261	for my $contig_name ( keys %{ $contigs_to_ids_obj->contig_to_ids } ) {
	108				4470
61	189				592	my @groups_on_contig;
62
63						#Â loop over each gene in each contig in the GFF file
64	189				311	for my $gene_id ( @{ $contigs_to_ids_obj->contig_to_ids->{$contig_name} } ) {
	189				6893
65
66						#Â convert to group name
67	999				26684	my $group_name = $self->analyse_groups_obj->_genes_to_groups->{$gene_id};
68	999	100			3171	next unless ( defined($group_name) );
69
70	176	50			7384	if ( $contigs_to_ids_obj->overlapping_hypothetical_protein_ids->{$gene_id} ) {
71	0				0	$self->_groups_qc->{$group_name} =
72						'Hypothetical protein with no hits to refseq/uniprot/clusters/cdd/tigrfams/pfam overlapping another protein with hits';
73						}
74	176				581	push( @groups_on_contig, $group_name );
75						}
76	189				616	push( @groups_to_contigs, \@groups_on_contig );
77						}
78	108				6040	$samples_to_groups_contigs{$sample_name} = \@groups_to_contigs;
79						}
80
81	36				1486	return \%samples_to_groups_contigs;
82
83						}
84
85						sub _build_group_order {
86	36			36	116	my ($self) = @_;
87	36				110	my %group_order;
88
89						my %groups_to_sample_names;
90	36				65	for my $sample_name ( keys %{ $self->_groups_to_file_contigs } ) {
	36				1274
91	108				3646	my $groups_to_file_contigs = $self->_groups_to_file_contigs->{$sample_name};
92	108				347	for my $groups_on_contig ( @{$groups_to_file_contigs} ) {
	108				324
93	189				302	for ( my $i = 1 ; $i < @{$groups_on_contig} ; $i++ ) {
	320				661
94	131				229	my $group_from = $groups_on_contig->[ $i - 1 ];
95	131				213	my $group_to = $groups_on_contig->[$i];
96
97	131	100	66		3260	if ( defined( $self->sample_weights ) && $self->sample_weights->{$sample_name} ) {
98	11				220	$group_order{$group_from}{$group_to} += $self->sample_weights->{$sample_name};
99	11				13	push( @{ $groups_to_sample_names{$group_from} }, $sample_name );
	11				24
100						}
101						else {
102	120				433	$group_order{$group_from}{$group_to}++;
103						}
104						}
105	189	100			255	if ( @{$groups_on_contig} == 1 ) {
	189				465
106	6				21	my $group_from = $groups_on_contig->[0];
107	6				19	my $group_to = $groups_on_contig->[0];
108	6	50	33		220	if ( defined( $self->sample_weights ) && $self->sample_weights->{$sample_name} ) {
109	0				0	$group_order{$group_from}{$group_to} += $self->sample_weights->{$sample_name};
110	0				0	push( @{ $groups_to_sample_names{$group_from} }, $sample_name );
	0				0
111						}
112						else {
113	6				35	$group_order{$group_from}{$group_to}++;
114						}
115						}
116						}
117						}
118
119	36				1592	$self->groups_to_sample_names( \%groups_to_sample_names );
120	36				1221	return \%group_order;
121						}
122
123						sub _build_group_graphs {
124	36			36	111	my ($self) = @_;
125	36				487	return Graph->new( undirected => 1 );
126						}
127
128						sub _save_graph_to_file {
129	72			72	228	my ( $self, $graph, $output_filename ) = @_;
130	72				945	my $writer = Graph::Writer::Dot->new();
131	72				2780	$writer->write_graph( $graph, $output_filename );
132	72				95720	return 1;
133						}
134
135						sub _add_groups_to_graph {
136	36			36	112	my ($self) = @_;
137
138	36				79	for my $current_group ( keys %{ $self->group_order() } ) {
	36				1304
139	65				22888	for my $group_to ( keys %{ $self->group_order->{$current_group} } ) {
	65				2023
140	71				3841	my $weight = 1.0 / ( $self->group_order->{$current_group}->{$group_to} );
141	71				1822	$self->group_graphs->add_weighted_edge( $current_group, $group_to, $weight );
142						}
143						}
144
145						}
146
147						sub _reorder_connected_components {
148	72			72	260	my ( $self, $graph_groups ) = @_;
149	72				187	my @ordered_graph_groups;
150						my @paths_and_weights;
151
152	72				138	for my $graph_group ( @{$graph_groups} ) {
	72				255
153	29				157	my %groups;
154	29				56	$groups{$_}++ for ( @{$graph_group} );
	29				145
155	29				75	my $edge_sum = 0;
156
157	29				146	for my $current_group ( keys %groups ) {
158	107				163	for my $group_to ( keys %{ $self->group_order->{$current_group} } ) {
	107				2823
159	97	100			327	next unless defined( $groups{$group_to} );
160	86				1965	$edge_sum += $self->group_order->{$current_group}->{$group_to};
161						}
162						}
163
164	29				67	my %samples_in_graph;
165	29				85	for my $current_group ( keys %groups ) {
166	107				2406	my $sample_names = $self->groups_to_sample_names->{$current_group};
167	107	100			266	if ( defined($sample_names) ) {
168	11				16	for my $sample_name ( @{$sample_names} ) {
	11				20
169	15				23	$samples_in_graph{$sample_name}++;
170						}
171						}
172						}
173	29				106	my @sample_names = sort keys %samples_in_graph;
174
175	29	100			49	if ( @{$graph_group} == 1 ) {
	29				99
176
177	6				81	push(
178						@paths_and_weights,
179						{
180						path => $graph_group,
181						average_weight => $edge_sum,
182						sample_names => \@sample_names
183						}
184						);
185						}
186						else {
187	23				105	my $graph = Graph->new( undirected => 1 );
188	23				5761	for my $current_group ( keys %groups ) {
189	101				23480	for my $group_to ( keys %{ $self->group_order->{$current_group} } ) {
	101				2889
190	91	100			2650	if ( $groups{$group_to} ) {
191	84				1826	my $weight = 1 / $self->group_order->{$current_group}->{$group_to};
192	84				288	$graph->add_weighted_edge( $current_group, $group_to, $weight );
193						}
194						}
195						}
196	23				5312	my $minimum_spanning_tree = $graph->minimum_spanning_tree;
197	23				70301	my $dfs_obj = Graph::Traversal::DFS->new($minimum_spanning_tree);
198	23				9543	my @reordered_dfs_groups = $dfs_obj->dfs;
199	23				48599	push(
200						@paths_and_weights,
201						{
202						path => \@reordered_dfs_groups,
203						average_weight => $edge_sum,
204						sample_names => \@sample_names
205						}
206						);
207						}
208
209						}
210
211	72				368	return $self->_order_by_samples_and_weights( \@paths_and_weights );
212						}
213
214						sub _order_by_samples_and_weights {
215	73			73	229	my ( $self, $paths_and_weights ) = @_;
216
217	73				150	my @ordered_graph_groups;
218	73	100			3428	if ( !defined( $self->samples_to_clusters ) ) {
219	72				166	my @ordered_paths_and_weights = sort { $a->{average_weight} <=> $b->{average_weight} } @{$paths_and_weights};
	5				46
	72				302
220	72				218	@ordered_graph_groups = map { $_->{path} } @ordered_paths_and_weights;
	29				115
221	72				406	return \@ordered_graph_groups;
222						}
223
224						# Find the largest cluster in each graph and regroup
225	1				4	my %largest_cluster_to_paths_and_weights;
226	1				2	for my $graph_details ( @{$paths_and_weights} ) {
	1				6
227	3				7	my %cluster_count;
228	3				3	for my $sample_name ( @{ $graph_details->{sample_names} } ) {
	3				7
229	6	50			100	if ( defined( $self->samples_to_clusters->{$sample_name} ) ) {
230	6				98	$cluster_count{ $self->samples_to_clusters->{$sample_name} }++;
231						}
232						}
233	3	0			11	my $largest_cluster = ( sort { $cluster_count{$b} <=> $cluster_count{$a} \|\| $a cmp $b} keys %cluster_count )[0];
	0				0
234	3	50			7	if ( !defined($largest_cluster) ) {
235	0				0	my @ordered_paths_and_weights = sort { $b->{average_weight} <=> $a->{average_weight} } @{$paths_and_weights};
	0				0
	0				0
236	0				0	@ordered_graph_groups = map { $_->{path} } @ordered_paths_and_weights;
	0				0
237	0				0	return \@ordered_graph_groups;
238						}
239
240	3				5	push( @{ $largest_cluster_to_paths_and_weights{$largest_cluster}{graph_details} }, $graph_details );
	3				7
241	3				7	$largest_cluster_to_paths_and_weights{$largest_cluster}{largest_cluster_size} += $cluster_count{$largest_cluster};
242						}
243
244						# go through each cluster group and order by weight
245	1				2	my @clustered_ordered_graph_groups;
246	1				34	for my $cluster_name (
247						sort {
248						$largest_cluster_to_paths_and_weights{$b}->{largest_cluster_size}
249						<=> $largest_cluster_to_paths_and_weights{$a}->{largest_cluster_size}
250	1				7	} keys %largest_cluster_to_paths_and_weights
251						)
252						{
253
254						my @ordered_paths_and_weights =
255	2				7	sort { $b->{average_weight} <=> $a->{average_weight} } @{ $largest_cluster_to_paths_and_weights{$cluster_name}->{graph_details} };
	1				5
	2				6
256	2				5	@ordered_graph_groups = map { $_->{path} } @ordered_paths_and_weights;
	3				8
257
258	2				4	for my $graph_group (@ordered_graph_groups) {
259	3				8	push( @clustered_ordered_graph_groups, $graph_group );
260						}
261						}
262	1				17	return \@clustered_ordered_graph_groups;
263						}
264
265						sub _build_groups_to_contigs {
266	36			36	118	my ($self) = @_;
267	36				273	$self->_add_groups_to_graph;
268
269	36				6056	my %groups_to_contigs;
270	36				162	my $counter = 1;
271	36				81	my $overall_counter = 1;
272	36				79	my $counter_filtered = 1;
273
274						# Accessory
275	36				284	my $accessory_graph = $self->_create_accessory_graph;
276	36				396	my @group_graphs = $accessory_graph->connected_components();
277	36				45134	my $reordered_graphs = $self->_reorder_connected_components( \@group_graphs );
278
279	36				1410	$self->_save_graph_to_file( $accessory_graph, $self->accessory_graph_filename );
280
281	36				91	for my $contig_groups ( @{$reordered_graphs} ) {
	36				215
282	14				38	my $order_counter = 1;
283
284	14				31	for my $group_name ( @{$contig_groups} ) {
	14				43
285	29				87	$groups_to_contigs{$group_name}{accessory_label} = $counter;
286	29				45	$groups_to_contigs{$group_name}{accessory_order} = $order_counter;
287	29				57	$groups_to_contigs{$group_name}{'accessory_overall_order'} = $overall_counter;
288	29				41	$order_counter++;
289	29				52	$overall_counter++;
290						}
291	14				32	$counter++;
292						}
293
294						# Core + accessory
295	36				1616	my @group_graphs_all = $self->group_graphs->connected_components();
296	36				63808	my $reordered_graphs_all = $self->_reorder_connected_components( \@group_graphs_all );
297	36				1767	$self->_save_graph_to_file( $self->group_graphs, $self->pan_graph_filename );
298
299	36				110	$overall_counter = 1;
300	36				78	$counter = 1;
301	36				105	$counter_filtered = 1;
302	36				74	for my $contig_groups ( @{$reordered_graphs_all} ) {
	36				165
303	15				38	my $order_counter = 1;
304
305	15				26	for my $group_name ( @{$contig_groups} ) {
	15				41
306	78				235	$groups_to_contigs{$group_name}{label} = $counter;
307	78				159	$groups_to_contigs{$group_name}{comment} = '';
308	78				119	$groups_to_contigs{$group_name}{order} = $order_counter;
309	78				124	$groups_to_contigs{$group_name}{'core_accessory_overall_order'} = $overall_counter;
310
311	78	100			113	if ( @{$contig_groups} <= 2 ) {
	78	50			1923
312	2				10	$groups_to_contigs{$group_name}{comment} = 'Investigate';
313						}
314						elsif ( $self->_groups_qc->{$group_name} ) {
315	0				0	$groups_to_contigs{$group_name}{comment} = $self->_groups_qc->{$group_name};
316						}
317						else {
318	76				159	$groups_to_contigs{$group_name}{'core_accessory_overall_order_filtered'} = $counter_filtered;
319	76				109	$counter_filtered++;
320						}
321	78				109	$order_counter++;
322	78				115	$overall_counter++;
323						}
324	15				38	$counter++;
325						}
326
327	36				76	$counter_filtered = 1;
328	36				85	for my $contig_groups ( @{$reordered_graphs} ) {
	36				100
329	14				30	for my $group_name ( @{$contig_groups} ) {
	14				31
330	29	50	33		219	if ( ( !defined( $groups_to_contigs{$group_name}{comment} ) )
			33
331						\|\| ( defined( $groups_to_contigs{$group_name}{comment} ) && $groups_to_contigs{$group_name}{comment} eq '' ) )
332						{
333	29				53	$groups_to_contigs{$group_name}{'accessory_overall_order_filtered'} = $counter_filtered;
334	29				45	$counter_filtered++;
335						}
336						}
337						}
338
339	36				1740	return \%groups_to_contigs;
340						}
341
342						sub _create_accessory_graph {
343	36			36	133	my ($self) = @_;
344	36				533	my $graph = Graph->new( undirected => 1 );
345
346	36				14224	my %core_groups;
347						my %group_freq;
348
349	36				90	for my $sample_name ( keys %{ $self->_groups_to_file_contigs } ) {
	36				1368
350	108				3284	my $groups_to_file_contigs = $self->_groups_to_file_contigs->{$sample_name};
351
352	108				214	for my $groups_on_contig ( @{$groups_to_file_contigs} ) {
	108				322
353	189				343	for my $current_group ( @{$groups_on_contig} ) {
	189				427
354	176				402	$group_freq{$current_group}++;
355						}
356						}
357						}
358
359	36				191	for my $current_group ( keys %{ $self->group_order() } ) {
	36				1220
360	65	100			8176	next if ( $group_freq{$current_group} >= ( $self->number_of_files * $self->core_definition ) );
361
362	26				55	for my $group_to ( keys %{ $self->group_order->{$current_group} } ) {
	26				576
363	26	100			571	if ( $group_freq{$group_to} >= ( $self->number_of_files * $self->core_definition ) ) {
364	11				51	$graph->add_vertex($current_group);
365						}
366						else {
367	15				328	my $weight = 1.0 / ( $self->group_order->{$current_group}->{$group_to} );
368	15				75	$graph->add_weighted_edge( $current_group, $group_to, $weight );
369						}
370						}
371						}
372
373	36				465	return $graph;
374						}
375
376	3			3	28	no Moose;
	3				5
	3				35
377						__PACKAGE__->meta->make_immutable;
378
379						1;
380
381						__END__
382
383						=pod
384
385						=encoding UTF-8
386
387						=head1 NAME
388
389						Bio::Roary::OrderGenes - Take in GFF files and create a matrix of what genes are beside what other genes
390
391						=head1 VERSION
392
393						version 3.10.1
394
395						=head1 SYNOPSIS
396
397						Take in the analyse groups and create a matrix of what genes are beside what other genes
398						use Bio::Roary::OrderGenes;
399
400						my $obj = Bio::Roary::OrderGenes->new(
401						analyse_groups_obj => $analyse_groups_obj,
402						gff_files => ['file1.gff','file2.gff']
403						);
404						$obj->groups_to_contigs;
405
406						=head1 AUTHOR
407
408						Andrew J. Page <ap13@sanger.ac.uk>
409
410						=head1 COPYRIGHT AND LICENSE
411
412						This software is Copyright (c) 2013 by Wellcome Trust Sanger Institute.
413
414						This is free software, licensed under:
415
416						The GNU General Public License, Version 3, June 2007
417
418						=cut