line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Bio::Roary::OrderGenes; |
2
|
|
|
|
|
|
|
$Bio::Roary::OrderGenes::VERSION = '3.11.0'; |
3
|
|
|
|
|
|
|
# ABSTRACT: Take in GFF files and create a matrix of what genes are beside what other genes |
4
|
|
|
|
|
|
|
|
5
|
|
|
|
|
|
|
|
6
|
3
|
|
|
3
|
|
676
|
use Moose; |
|
3
|
|
|
|
|
5
|
|
|
3
|
|
|
|
|
21
|
|
7
|
3
|
|
|
3
|
|
17719
|
use Bio::Roary::Exceptions; |
|
3
|
|
|
|
|
6
|
|
|
3
|
|
|
|
|
66
|
|
8
|
3
|
|
|
3
|
|
27
|
use Bio::Roary::AnalyseGroups; |
|
3
|
|
|
|
|
5
|
|
|
3
|
|
|
|
|
53
|
|
9
|
3
|
|
|
3
|
|
858
|
use Bio::Roary::ContigsToGeneIDsFromGFF; |
|
3
|
|
|
|
|
12
|
|
|
3
|
|
|
|
|
127
|
|
10
|
3
|
|
|
3
|
|
1873
|
use Graph; |
|
3
|
|
|
|
|
230754
|
|
|
3
|
|
|
|
|
141
|
|
11
|
3
|
|
|
3
|
|
1078
|
use Graph::Writer::Dot; |
|
3
|
|
|
|
|
8714
|
|
|
3
|
|
|
|
|
100
|
|
12
|
3
|
|
|
3
|
|
24
|
use File::Basename; |
|
3
|
|
|
|
|
8
|
|
|
3
|
|
|
|
|
6691
|
|
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
has 'gff_files' => ( is => 'ro', isa => 'ArrayRef', required => 1 ); |
15
|
|
|
|
|
|
|
has 'analyse_groups_obj' => ( is => 'ro', isa => 'Bio::Roary::AnalyseGroups', required => 1 ); |
16
|
|
|
|
|
|
|
has 'core_definition' => ( is => 'ro', isa => 'Num', default => 1.0 ); |
17
|
|
|
|
|
|
|
has 'pan_graph_filename' => ( is => 'ro', isa => 'Str', default => 'core_accessory_graph.dot' ); |
18
|
|
|
|
|
|
|
has 'accessory_graph_filename' => ( is => 'ro', isa => 'Str', default => 'accessory_graph.dot' ); |
19
|
|
|
|
|
|
|
has 'sample_weights' => ( is => 'ro', isa => 'Maybe[HashRef]' ); |
20
|
|
|
|
|
|
|
has 'samples_to_clusters' => ( is => 'ro', isa => 'Maybe[HashRef]' ); |
21
|
|
|
|
|
|
|
has 'group_order' => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build_group_order' ); |
22
|
|
|
|
|
|
|
has 'groups_to_sample_names' => ( is => 'rw', isa => 'HashRef', default => sub { {} } ); |
23
|
|
|
|
|
|
|
has 'group_graphs' => ( is => 'ro', isa => 'Graph', lazy => 1, builder => '_build_group_graphs' ); |
24
|
|
|
|
|
|
|
has 'groups_to_contigs' => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build_groups_to_contigs' ); |
25
|
|
|
|
|
|
|
has '_groups_to_file_contigs' => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build__groups_to_file_contigs' ); |
26
|
|
|
|
|
|
|
has '_groups' => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build_groups' ); |
27
|
|
|
|
|
|
|
has 'number_of_files' => ( is => 'ro', isa => 'Int', lazy => 1, builder => '_build_number_of_files' ); |
28
|
|
|
|
|
|
|
has '_groups_qc' => ( is => 'ro', isa => 'HashRef', default => sub { {} } ); |
29
|
|
|
|
|
|
|
has '_percentage_of_largest_weak_threshold' => ( is => 'ro', isa => 'Num', default => 0.9 ); |
30
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
sub _build_number_of_files { |
32
|
15
|
|
|
15
|
|
41
|
my ($self) = @_; |
33
|
15
|
|
|
|
|
21
|
return @{ $self->gff_files }; |
|
15
|
|
|
|
|
340
|
|
34
|
|
|
|
|
|
|
} |
35
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
sub _build_groups { |
37
|
0
|
|
|
0
|
|
0
|
my ($self) = @_; |
38
|
0
|
|
|
|
|
0
|
my %groups; |
39
|
0
|
|
|
|
|
0
|
for my $group_name ( @{ $self->analyse_groups_obj->_groups } ) { |
|
0
|
|
|
|
|
0
|
|
40
|
0
|
|
|
|
|
0
|
$groups{$group_name}++; |
41
|
|
|
|
|
|
|
} |
42
|
0
|
|
|
|
|
0
|
return \%groups; |
43
|
|
|
|
|
|
|
} |
44
|
|
|
|
|
|
|
|
45
|
|
|
|
|
|
|
sub _build__groups_to_file_contigs { |
46
|
36
|
|
|
36
|
|
83
|
my ($self) = @_; |
47
|
|
|
|
|
|
|
|
48
|
36
|
|
|
|
|
93
|
my @overlapping_hypothetical_gene_ids; |
49
|
|
|
|
|
|
|
my %samples_to_groups_contigs; |
50
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
# Open each GFF file |
52
|
36
|
|
|
|
|
60
|
for my $filename ( @{ $self->gff_files } ) { |
|
36
|
|
|
|
|
999
|
|
53
|
108
|
|
|
|
|
679
|
my @groups_to_contigs; |
54
|
108
|
|
|
|
|
4285
|
my $contigs_to_ids_obj = Bio::Roary::ContigsToGeneIDsFromGFF->new( gff_file => $filename ); |
55
|
|
|
|
|
|
|
|
56
|
108
|
|
|
|
|
4909
|
my ( $sample_name, $directories, $suffix ) = fileparse($filename); |
57
|
108
|
|
|
|
|
715
|
$sample_name =~ s/\.gff//gi; |
58
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
# Loop over each contig in the GFF file |
60
|
108
|
|
|
|
|
204
|
for my $contig_name ( keys %{ $contigs_to_ids_obj->contig_to_ids } ) { |
|
108
|
|
|
|
|
3607
|
|
61
|
189
|
|
|
|
|
272
|
my @groups_on_contig; |
62
|
|
|
|
|
|
|
|
63
|
|
|
|
|
|
|
#Â loop over each gene in each contig in the GFF file |
64
|
189
|
|
|
|
|
219
|
for my $gene_id ( @{ $contigs_to_ids_obj->contig_to_ids->{$contig_name} } ) { |
|
189
|
|
|
|
|
4947
|
|
65
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
#Â convert to group name |
67
|
999
|
|
|
|
|
20430
|
my $group_name = $self->analyse_groups_obj->_genes_to_groups->{$gene_id}; |
68
|
999
|
100
|
|
|
|
2102
|
next unless ( defined($group_name) ); |
69
|
|
|
|
|
|
|
|
70
|
176
|
50
|
|
|
|
5038
|
if ( $contigs_to_ids_obj->overlapping_hypothetical_protein_ids->{$gene_id} ) { |
71
|
0
|
|
|
|
|
0
|
$self->_groups_qc->{$group_name} = |
72
|
|
|
|
|
|
|
'Hypothetical protein with no hits to refseq/uniprot/clusters/cdd/tigrfams/pfam overlapping another protein with hits'; |
73
|
|
|
|
|
|
|
} |
74
|
176
|
|
|
|
|
441
|
push( @groups_on_contig, $group_name ); |
75
|
|
|
|
|
|
|
} |
76
|
189
|
|
|
|
|
565
|
push( @groups_to_contigs, \@groups_on_contig ); |
77
|
|
|
|
|
|
|
} |
78
|
108
|
|
|
|
|
4641
|
$samples_to_groups_contigs{$sample_name} = \@groups_to_contigs; |
79
|
|
|
|
|
|
|
} |
80
|
|
|
|
|
|
|
|
81
|
36
|
|
|
|
|
1199
|
return \%samples_to_groups_contigs; |
82
|
|
|
|
|
|
|
|
83
|
|
|
|
|
|
|
} |
84
|
|
|
|
|
|
|
|
85
|
|
|
|
|
|
|
sub _build_group_order { |
86
|
36
|
|
|
36
|
|
115
|
my ($self) = @_; |
87
|
36
|
|
|
|
|
100
|
my %group_order; |
88
|
|
|
|
|
|
|
|
89
|
|
|
|
|
|
|
my %groups_to_sample_names; |
90
|
36
|
|
|
|
|
69
|
for my $sample_name ( keys %{ $self->_groups_to_file_contigs } ) { |
|
36
|
|
|
|
|
1055
|
|
91
|
108
|
|
|
|
|
2780
|
my $groups_to_file_contigs = $self->_groups_to_file_contigs->{$sample_name}; |
92
|
108
|
|
|
|
|
229
|
for my $groups_on_contig ( @{$groups_to_file_contigs} ) { |
|
108
|
|
|
|
|
337
|
|
93
|
189
|
|
|
|
|
257
|
for ( my $i = 1 ; $i < @{$groups_on_contig} ; $i++ ) { |
|
320
|
|
|
|
|
520
|
|
94
|
131
|
|
|
|
|
205
|
my $group_from = $groups_on_contig->[ $i - 1 ]; |
95
|
131
|
|
|
|
|
174
|
my $group_to = $groups_on_contig->[$i]; |
96
|
|
|
|
|
|
|
|
97
|
131
|
100
|
66
|
|
|
2595
|
if ( defined( $self->sample_weights ) && $self->sample_weights->{$sample_name} ) { |
98
|
11
|
|
|
|
|
243
|
$group_order{$group_from}{$group_to} += $self->sample_weights->{$sample_name}; |
99
|
11
|
|
|
|
|
16
|
push( @{ $groups_to_sample_names{$group_from} }, $sample_name ); |
|
11
|
|
|
|
|
39
|
|
100
|
|
|
|
|
|
|
} |
101
|
|
|
|
|
|
|
else { |
102
|
120
|
|
|
|
|
314
|
$group_order{$group_from}{$group_to}++; |
103
|
|
|
|
|
|
|
} |
104
|
|
|
|
|
|
|
} |
105
|
189
|
100
|
|
|
|
192
|
if ( @{$groups_on_contig} == 1 ) { |
|
189
|
|
|
|
|
364
|
|
106
|
6
|
|
|
|
|
14
|
my $group_from = $groups_on_contig->[0]; |
107
|
6
|
|
|
|
|
10
|
my $group_to = $groups_on_contig->[0]; |
108
|
6
|
50
|
33
|
|
|
124
|
if ( defined( $self->sample_weights ) && $self->sample_weights->{$sample_name} ) { |
109
|
0
|
|
|
|
|
0
|
$group_order{$group_from}{$group_to} += $self->sample_weights->{$sample_name}; |
110
|
0
|
|
|
|
|
0
|
push( @{ $groups_to_sample_names{$group_from} }, $sample_name ); |
|
0
|
|
|
|
|
0
|
|
111
|
|
|
|
|
|
|
} |
112
|
|
|
|
|
|
|
else { |
113
|
6
|
|
|
|
|
16
|
$group_order{$group_from}{$group_to}++; |
114
|
|
|
|
|
|
|
} |
115
|
|
|
|
|
|
|
} |
116
|
|
|
|
|
|
|
} |
117
|
|
|
|
|
|
|
} |
118
|
|
|
|
|
|
|
|
119
|
36
|
|
|
|
|
1199
|
$self->groups_to_sample_names( \%groups_to_sample_names ); |
120
|
36
|
|
|
|
|
899
|
return \%group_order; |
121
|
|
|
|
|
|
|
} |
122
|
|
|
|
|
|
|
|
123
|
|
|
|
|
|
|
sub _build_group_graphs { |
124
|
36
|
|
|
36
|
|
107
|
my ($self) = @_; |
125
|
36
|
|
|
|
|
438
|
return Graph->new( undirected => 1 ); |
126
|
|
|
|
|
|
|
} |
127
|
|
|
|
|
|
|
|
128
|
|
|
|
|
|
|
sub _save_graph_to_file { |
129
|
72
|
|
|
72
|
|
182
|
my ( $self, $graph, $output_filename ) = @_; |
130
|
72
|
|
|
|
|
924
|
my $writer = Graph::Writer::Dot->new(); |
131
|
72
|
|
|
|
|
2335
|
$writer->write_graph( $graph, $output_filename ); |
132
|
72
|
|
|
|
|
77840
|
return 1; |
133
|
|
|
|
|
|
|
} |
134
|
|
|
|
|
|
|
|
135
|
|
|
|
|
|
|
sub _add_groups_to_graph { |
136
|
36
|
|
|
36
|
|
109
|
my ($self) = @_; |
137
|
|
|
|
|
|
|
|
138
|
36
|
|
|
|
|
59
|
for my $current_group ( keys %{ $self->group_order() } ) { |
|
36
|
|
|
|
|
1023
|
|
139
|
65
|
|
|
|
|
19614
|
for my $group_to ( keys %{ $self->group_order->{$current_group} } ) { |
|
65
|
|
|
|
|
1915
|
|
140
|
71
|
|
|
|
|
2864
|
my $weight = 1.0 / ( $self->group_order->{$current_group}->{$group_to} ); |
141
|
71
|
|
|
|
|
1755
|
$self->group_graphs->add_weighted_edge( $current_group, $group_to, $weight ); |
142
|
|
|
|
|
|
|
} |
143
|
|
|
|
|
|
|
} |
144
|
|
|
|
|
|
|
|
145
|
|
|
|
|
|
|
} |
146
|
|
|
|
|
|
|
|
147
|
|
|
|
|
|
|
sub _reorder_connected_components { |
148
|
72
|
|
|
72
|
|
177
|
my ( $self, $graph_groups ) = @_; |
149
|
72
|
|
|
|
|
162
|
my @ordered_graph_groups; |
150
|
|
|
|
|
|
|
my @paths_and_weights; |
151
|
|
|
|
|
|
|
|
152
|
72
|
|
|
|
|
108
|
for my $graph_group ( @{$graph_groups} ) { |
|
72
|
|
|
|
|
232
|
|
153
|
29
|
|
|
|
|
57
|
my %groups; |
154
|
29
|
|
|
|
|
41
|
$groups{$_}++ for ( @{$graph_group} ); |
|
29
|
|
|
|
|
120
|
|
155
|
29
|
|
|
|
|
48
|
my $edge_sum = 0; |
156
|
|
|
|
|
|
|
|
157
|
29
|
|
|
|
|
74
|
for my $current_group ( keys %groups ) { |
158
|
107
|
|
|
|
|
112
|
for my $group_to ( keys %{ $self->group_order->{$current_group} } ) { |
|
107
|
|
|
|
|
2475
|
|
159
|
97
|
100
|
|
|
|
212
|
next unless defined( $groups{$group_to} ); |
160
|
86
|
|
|
|
|
1506
|
$edge_sum += $self->group_order->{$current_group}->{$group_to}; |
161
|
|
|
|
|
|
|
} |
162
|
|
|
|
|
|
|
} |
163
|
|
|
|
|
|
|
|
164
|
29
|
|
|
|
|
69
|
my %samples_in_graph; |
165
|
29
|
|
|
|
|
76
|
for my $current_group ( keys %groups ) { |
166
|
107
|
|
|
|
|
2046
|
my $sample_names = $self->groups_to_sample_names->{$current_group}; |
167
|
107
|
100
|
|
|
|
210
|
if ( defined($sample_names) ) { |
168
|
11
|
|
|
|
|
15
|
for my $sample_name ( @{$sample_names} ) { |
|
11
|
|
|
|
|
16
|
|
169
|
15
|
|
|
|
|
25
|
$samples_in_graph{$sample_name}++; |
170
|
|
|
|
|
|
|
} |
171
|
|
|
|
|
|
|
} |
172
|
|
|
|
|
|
|
} |
173
|
29
|
|
|
|
|
160
|
my @sample_names = sort keys %samples_in_graph; |
174
|
|
|
|
|
|
|
|
175
|
29
|
100
|
|
|
|
49
|
if ( @{$graph_group} == 1 ) { |
|
29
|
|
|
|
|
83
|
|
176
|
|
|
|
|
|
|
|
177
|
6
|
|
|
|
|
52
|
push( |
178
|
|
|
|
|
|
|
@paths_and_weights, |
179
|
|
|
|
|
|
|
{ |
180
|
|
|
|
|
|
|
path => $graph_group, |
181
|
|
|
|
|
|
|
average_weight => $edge_sum, |
182
|
|
|
|
|
|
|
sample_names => \@sample_names |
183
|
|
|
|
|
|
|
} |
184
|
|
|
|
|
|
|
); |
185
|
|
|
|
|
|
|
} |
186
|
|
|
|
|
|
|
else { |
187
|
23
|
|
|
|
|
101
|
my $graph = Graph->new( undirected => 1 ); |
188
|
23
|
|
|
|
|
4908
|
for my $current_group ( keys %groups ) { |
189
|
101
|
|
|
|
|
21859
|
for my $group_to ( keys %{ $self->group_order->{$current_group} } ) { |
|
101
|
|
|
|
|
2896
|
|
190
|
91
|
100
|
|
|
|
1698
|
if ( $groups{$group_to} ) { |
191
|
84
|
|
|
|
|
1554
|
my $weight = 1 / $self->group_order->{$current_group}->{$group_to}; |
192
|
84
|
|
|
|
|
249
|
$graph->add_weighted_edge( $current_group, $group_to, $weight ); |
193
|
|
|
|
|
|
|
} |
194
|
|
|
|
|
|
|
} |
195
|
|
|
|
|
|
|
} |
196
|
23
|
|
|
|
|
3080
|
my $minimum_spanning_tree = $graph->minimum_spanning_tree; |
197
|
23
|
|
|
|
|
60280
|
my $dfs_obj = Graph::Traversal::DFS->new($minimum_spanning_tree); |
198
|
23
|
|
|
|
|
8006
|
my @reordered_dfs_groups = $dfs_obj->dfs; |
199
|
23
|
|
|
|
|
39241
|
push( |
200
|
|
|
|
|
|
|
@paths_and_weights, |
201
|
|
|
|
|
|
|
{ |
202
|
|
|
|
|
|
|
path => \@reordered_dfs_groups, |
203
|
|
|
|
|
|
|
average_weight => $edge_sum, |
204
|
|
|
|
|
|
|
sample_names => \@sample_names |
205
|
|
|
|
|
|
|
} |
206
|
|
|
|
|
|
|
); |
207
|
|
|
|
|
|
|
} |
208
|
|
|
|
|
|
|
|
209
|
|
|
|
|
|
|
} |
210
|
|
|
|
|
|
|
|
211
|
72
|
|
|
|
|
481
|
return $self->_order_by_samples_and_weights( \@paths_and_weights ); |
212
|
|
|
|
|
|
|
} |
213
|
|
|
|
|
|
|
|
214
|
|
|
|
|
|
|
sub _order_by_samples_and_weights { |
215
|
73
|
|
|
73
|
|
236
|
my ( $self, $paths_and_weights ) = @_; |
216
|
|
|
|
|
|
|
|
217
|
73
|
|
|
|
|
140
|
my @ordered_graph_groups; |
218
|
73
|
100
|
|
|
|
2513
|
if ( !defined( $self->samples_to_clusters ) ) { |
219
|
72
|
|
|
|
|
153
|
my @ordered_paths_and_weights = sort { $a->{average_weight} <=> $b->{average_weight} } @{$paths_and_weights}; |
|
5
|
|
|
|
|
31
|
|
|
72
|
|
|
|
|
271
|
|
220
|
72
|
|
|
|
|
152
|
@ordered_graph_groups = map { $_->{path} } @ordered_paths_and_weights; |
|
29
|
|
|
|
|
93
|
|
221
|
72
|
|
|
|
|
324
|
return \@ordered_graph_groups; |
222
|
|
|
|
|
|
|
} |
223
|
|
|
|
|
|
|
|
224
|
|
|
|
|
|
|
# Find the largest cluster in each graph and regroup |
225
|
1
|
|
|
|
|
4
|
my %largest_cluster_to_paths_and_weights; |
226
|
1
|
|
|
|
|
5
|
for my $graph_details ( @{$paths_and_weights} ) { |
|
1
|
|
|
|
|
11
|
|
227
|
3
|
|
|
|
|
6
|
my %cluster_count; |
228
|
3
|
|
|
|
|
3
|
for my $sample_name ( @{ $graph_details->{sample_names} } ) { |
|
3
|
|
|
|
|
6
|
|
229
|
6
|
50
|
|
|
|
112
|
if ( defined( $self->samples_to_clusters->{$sample_name} ) ) { |
230
|
6
|
|
|
|
|
102
|
$cluster_count{ $self->samples_to_clusters->{$sample_name} }++; |
231
|
|
|
|
|
|
|
} |
232
|
|
|
|
|
|
|
} |
233
|
3
|
0
|
|
|
|
12
|
my $largest_cluster = ( sort { $cluster_count{$b} <=> $cluster_count{$a} || $a cmp $b} keys %cluster_count )[0]; |
|
0
|
|
|
|
|
0
|
|
234
|
3
|
50
|
|
|
|
7
|
if ( !defined($largest_cluster) ) { |
235
|
0
|
|
|
|
|
0
|
my @ordered_paths_and_weights = sort { $b->{average_weight} <=> $a->{average_weight} } @{$paths_and_weights}; |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
236
|
0
|
|
|
|
|
0
|
@ordered_graph_groups = map { $_->{path} } @ordered_paths_and_weights; |
|
0
|
|
|
|
|
0
|
|
237
|
0
|
|
|
|
|
0
|
return \@ordered_graph_groups; |
238
|
|
|
|
|
|
|
} |
239
|
|
|
|
|
|
|
|
240
|
3
|
|
|
|
|
7
|
push( @{ $largest_cluster_to_paths_and_weights{$largest_cluster}{graph_details} }, $graph_details ); |
|
3
|
|
|
|
|
8
|
|
241
|
3
|
|
|
|
|
8
|
$largest_cluster_to_paths_and_weights{$largest_cluster}{largest_cluster_size} += $cluster_count{$largest_cluster}; |
242
|
|
|
|
|
|
|
} |
243
|
|
|
|
|
|
|
|
244
|
|
|
|
|
|
|
# go through each cluster group and order by weight |
245
|
1
|
|
|
|
|
1
|
my @clustered_ordered_graph_groups; |
246
|
1
|
|
|
|
|
8
|
for my $cluster_name ( |
247
|
|
|
|
|
|
|
sort { |
248
|
|
|
|
|
|
|
$largest_cluster_to_paths_and_weights{$b}->{largest_cluster_size} |
249
|
|
|
|
|
|
|
<=> $largest_cluster_to_paths_and_weights{$a}->{largest_cluster_size} |
250
|
1
|
|
|
|
|
6
|
} keys %largest_cluster_to_paths_and_weights |
251
|
|
|
|
|
|
|
) |
252
|
|
|
|
|
|
|
{ |
253
|
|
|
|
|
|
|
|
254
|
|
|
|
|
|
|
my @ordered_paths_and_weights = |
255
|
2
|
|
|
|
|
4
|
sort { $b->{average_weight} <=> $a->{average_weight} } @{ $largest_cluster_to_paths_and_weights{$cluster_name}->{graph_details} }; |
|
1
|
|
|
|
|
4
|
|
|
2
|
|
|
|
|
6
|
|
256
|
2
|
|
|
|
|
3
|
@ordered_graph_groups = map { $_->{path} } @ordered_paths_and_weights; |
|
3
|
|
|
|
|
6
|
|
257
|
|
|
|
|
|
|
|
258
|
2
|
|
|
|
|
4
|
for my $graph_group (@ordered_graph_groups) { |
259
|
3
|
|
|
|
|
6
|
push( @clustered_ordered_graph_groups, $graph_group ); |
260
|
|
|
|
|
|
|
} |
261
|
|
|
|
|
|
|
} |
262
|
1
|
|
|
|
|
9
|
return \@clustered_ordered_graph_groups; |
263
|
|
|
|
|
|
|
} |
264
|
|
|
|
|
|
|
|
265
|
|
|
|
|
|
|
sub _build_groups_to_contigs { |
266
|
36
|
|
|
36
|
|
108
|
my ($self) = @_; |
267
|
36
|
|
|
|
|
230
|
$self->_add_groups_to_graph; |
268
|
|
|
|
|
|
|
|
269
|
36
|
|
|
|
|
4497
|
my %groups_to_contigs; |
270
|
36
|
|
|
|
|
114
|
my $counter = 1; |
271
|
36
|
|
|
|
|
67
|
my $overall_counter = 1; |
272
|
36
|
|
|
|
|
58
|
my $counter_filtered = 1; |
273
|
|
|
|
|
|
|
|
274
|
|
|
|
|
|
|
# Accessory |
275
|
36
|
|
|
|
|
205
|
my $accessory_graph = $self->_create_accessory_graph; |
276
|
36
|
|
|
|
|
345
|
my @group_graphs = $accessory_graph->connected_components(); |
277
|
36
|
|
|
|
|
37905
|
my $reordered_graphs = $self->_reorder_connected_components( \@group_graphs ); |
278
|
|
|
|
|
|
|
|
279
|
36
|
|
|
|
|
1159
|
$self->_save_graph_to_file( $accessory_graph, $self->accessory_graph_filename ); |
280
|
|
|
|
|
|
|
|
281
|
36
|
|
|
|
|
113
|
for my $contig_groups ( @{$reordered_graphs} ) { |
|
36
|
|
|
|
|
180
|
|
282
|
14
|
|
|
|
|
27
|
my $order_counter = 1; |
283
|
|
|
|
|
|
|
|
284
|
14
|
|
|
|
|
32
|
for my $group_name ( @{$contig_groups} ) { |
|
14
|
|
|
|
|
29
|
|
285
|
29
|
|
|
|
|
78
|
$groups_to_contigs{$group_name}{accessory_label} = $counter; |
286
|
29
|
|
|
|
|
53
|
$groups_to_contigs{$group_name}{accessory_order} = $order_counter; |
287
|
29
|
|
|
|
|
53
|
$groups_to_contigs{$group_name}{'accessory_overall_order'} = $overall_counter; |
288
|
29
|
|
|
|
|
33
|
$order_counter++; |
289
|
29
|
|
|
|
|
38
|
$overall_counter++; |
290
|
|
|
|
|
|
|
} |
291
|
14
|
|
|
|
|
23
|
$counter++; |
292
|
|
|
|
|
|
|
} |
293
|
|
|
|
|
|
|
|
294
|
|
|
|
|
|
|
# Core + accessory |
295
|
36
|
|
|
|
|
1223
|
my @group_graphs_all = $self->group_graphs->connected_components(); |
296
|
36
|
|
|
|
|
52234
|
my $reordered_graphs_all = $self->_reorder_connected_components( \@group_graphs_all ); |
297
|
36
|
|
|
|
|
873
|
$self->_save_graph_to_file( $self->group_graphs, $self->pan_graph_filename ); |
298
|
|
|
|
|
|
|
|
299
|
36
|
|
|
|
|
96
|
$overall_counter = 1; |
300
|
36
|
|
|
|
|
98
|
$counter = 1; |
301
|
36
|
|
|
|
|
81
|
$counter_filtered = 1; |
302
|
36
|
|
|
|
|
77
|
for my $contig_groups ( @{$reordered_graphs_all} ) { |
|
36
|
|
|
|
|
119
|
|
303
|
15
|
|
|
|
|
27
|
my $order_counter = 1; |
304
|
|
|
|
|
|
|
|
305
|
15
|
|
|
|
|
25
|
for my $group_name ( @{$contig_groups} ) { |
|
15
|
|
|
|
|
41
|
|
306
|
78
|
|
|
|
|
196
|
$groups_to_contigs{$group_name}{label} = $counter; |
307
|
78
|
|
|
|
|
160
|
$groups_to_contigs{$group_name}{comment} = ''; |
308
|
78
|
|
|
|
|
105
|
$groups_to_contigs{$group_name}{order} = $order_counter; |
309
|
78
|
|
|
|
|
97
|
$groups_to_contigs{$group_name}{'core_accessory_overall_order'} = $overall_counter; |
310
|
|
|
|
|
|
|
|
311
|
78
|
100
|
|
|
|
83
|
if ( @{$contig_groups} <= 2 ) { |
|
78
|
50
|
|
|
|
1709
|
|
312
|
2
|
|
|
|
|
7
|
$groups_to_contigs{$group_name}{comment} = 'Investigate'; |
313
|
|
|
|
|
|
|
} |
314
|
|
|
|
|
|
|
elsif ( $self->_groups_qc->{$group_name} ) { |
315
|
0
|
|
|
|
|
0
|
$groups_to_contigs{$group_name}{comment} = $self->_groups_qc->{$group_name}; |
316
|
|
|
|
|
|
|
} |
317
|
|
|
|
|
|
|
else { |
318
|
76
|
|
|
|
|
134
|
$groups_to_contigs{$group_name}{'core_accessory_overall_order_filtered'} = $counter_filtered; |
319
|
76
|
|
|
|
|
87
|
$counter_filtered++; |
320
|
|
|
|
|
|
|
} |
321
|
78
|
|
|
|
|
90
|
$order_counter++; |
322
|
78
|
|
|
|
|
99
|
$overall_counter++; |
323
|
|
|
|
|
|
|
} |
324
|
15
|
|
|
|
|
28
|
$counter++; |
325
|
|
|
|
|
|
|
} |
326
|
|
|
|
|
|
|
|
327
|
36
|
|
|
|
|
82
|
$counter_filtered = 1; |
328
|
36
|
|
|
|
|
70
|
for my $contig_groups ( @{$reordered_graphs} ) { |
|
36
|
|
|
|
|
103
|
|
329
|
14
|
|
|
|
|
20
|
for my $group_name ( @{$contig_groups} ) { |
|
14
|
|
|
|
|
25
|
|
330
|
29
|
50
|
33
|
|
|
199
|
if ( ( !defined( $groups_to_contigs{$group_name}{comment} ) ) |
|
|
|
33
|
|
|
|
|
331
|
|
|
|
|
|
|
|| ( defined( $groups_to_contigs{$group_name}{comment} ) && $groups_to_contigs{$group_name}{comment} eq '' ) ) |
332
|
|
|
|
|
|
|
{ |
333
|
29
|
|
|
|
|
44
|
$groups_to_contigs{$group_name}{'accessory_overall_order_filtered'} = $counter_filtered; |
334
|
29
|
|
|
|
|
39
|
$counter_filtered++; |
335
|
|
|
|
|
|
|
} |
336
|
|
|
|
|
|
|
} |
337
|
|
|
|
|
|
|
} |
338
|
|
|
|
|
|
|
|
339
|
36
|
|
|
|
|
1421
|
return \%groups_to_contigs; |
340
|
|
|
|
|
|
|
} |
341
|
|
|
|
|
|
|
|
342
|
|
|
|
|
|
|
sub _create_accessory_graph { |
343
|
36
|
|
|
36
|
|
183
|
my ($self) = @_; |
344
|
36
|
|
|
|
|
626
|
my $graph = Graph->new( undirected => 1 ); |
345
|
|
|
|
|
|
|
|
346
|
36
|
|
|
|
|
11505
|
my %core_groups; |
347
|
|
|
|
|
|
|
my %group_freq; |
348
|
|
|
|
|
|
|
|
349
|
36
|
|
|
|
|
73
|
for my $sample_name ( keys %{ $self->_groups_to_file_contigs } ) { |
|
36
|
|
|
|
|
1140
|
|
350
|
108
|
|
|
|
|
2513
|
my $groups_to_file_contigs = $self->_groups_to_file_contigs->{$sample_name}; |
351
|
|
|
|
|
|
|
|
352
|
108
|
|
|
|
|
161
|
for my $groups_on_contig ( @{$groups_to_file_contigs} ) { |
|
108
|
|
|
|
|
221
|
|
353
|
189
|
|
|
|
|
170
|
for my $current_group ( @{$groups_on_contig} ) { |
|
189
|
|
|
|
|
250
|
|
354
|
176
|
|
|
|
|
254
|
$group_freq{$current_group}++; |
355
|
|
|
|
|
|
|
} |
356
|
|
|
|
|
|
|
} |
357
|
|
|
|
|
|
|
} |
358
|
|
|
|
|
|
|
|
359
|
36
|
|
|
|
|
86
|
for my $current_group ( keys %{ $self->group_order() } ) { |
|
36
|
|
|
|
|
727
|
|
360
|
65
|
100
|
|
|
|
7296
|
next if ( $group_freq{$current_group} >= ( $self->number_of_files * $self->core_definition ) ); |
361
|
|
|
|
|
|
|
|
362
|
26
|
|
|
|
|
59
|
for my $group_to ( keys %{ $self->group_order->{$current_group} } ) { |
|
26
|
|
|
|
|
481
|
|
363
|
26
|
100
|
|
|
|
509
|
if ( $group_freq{$group_to} >= ( $self->number_of_files * $self->core_definition ) ) { |
364
|
11
|
|
|
|
|
40
|
$graph->add_vertex($current_group); |
365
|
|
|
|
|
|
|
} |
366
|
|
|
|
|
|
|
else { |
367
|
15
|
|
|
|
|
303
|
my $weight = 1.0 / ( $self->group_order->{$current_group}->{$group_to} ); |
368
|
15
|
|
|
|
|
49
|
$graph->add_weighted_edge( $current_group, $group_to, $weight ); |
369
|
|
|
|
|
|
|
} |
370
|
|
|
|
|
|
|
} |
371
|
|
|
|
|
|
|
} |
372
|
|
|
|
|
|
|
|
373
|
36
|
|
|
|
|
1584
|
return $graph; |
374
|
|
|
|
|
|
|
} |
375
|
|
|
|
|
|
|
|
376
|
3
|
|
|
3
|
|
25
|
no Moose; |
|
3
|
|
|
|
|
8
|
|
|
3
|
|
|
|
|
30
|
|
377
|
|
|
|
|
|
|
__PACKAGE__->meta->make_immutable; |
378
|
|
|
|
|
|
|
|
379
|
|
|
|
|
|
|
1; |
380
|
|
|
|
|
|
|
|
381
|
|
|
|
|
|
|
__END__ |
382
|
|
|
|
|
|
|
|
383
|
|
|
|
|
|
|
=pod |
384
|
|
|
|
|
|
|
|
385
|
|
|
|
|
|
|
=encoding UTF-8 |
386
|
|
|
|
|
|
|
|
387
|
|
|
|
|
|
|
=head1 NAME |
388
|
|
|
|
|
|
|
|
389
|
|
|
|
|
|
|
Bio::Roary::OrderGenes - Take in GFF files and create a matrix of what genes are beside what other genes |
390
|
|
|
|
|
|
|
|
391
|
|
|
|
|
|
|
=head1 VERSION |
392
|
|
|
|
|
|
|
|
393
|
|
|
|
|
|
|
version 3.11.0 |
394
|
|
|
|
|
|
|
|
395
|
|
|
|
|
|
|
=head1 SYNOPSIS |
396
|
|
|
|
|
|
|
|
397
|
|
|
|
|
|
|
Take in the analyse groups and create a matrix of what genes are beside what other genes |
398
|
|
|
|
|
|
|
use Bio::Roary::OrderGenes; |
399
|
|
|
|
|
|
|
|
400
|
|
|
|
|
|
|
my $obj = Bio::Roary::OrderGenes->new( |
401
|
|
|
|
|
|
|
analyse_groups_obj => $analyse_groups_obj, |
402
|
|
|
|
|
|
|
gff_files => ['file1.gff','file2.gff'] |
403
|
|
|
|
|
|
|
); |
404
|
|
|
|
|
|
|
$obj->groups_to_contigs; |
405
|
|
|
|
|
|
|
|
406
|
|
|
|
|
|
|
=head1 AUTHOR |
407
|
|
|
|
|
|
|
|
408
|
|
|
|
|
|
|
Andrew J. Page <ap13@sanger.ac.uk> |
409
|
|
|
|
|
|
|
|
410
|
|
|
|
|
|
|
=head1 COPYRIGHT AND LICENSE |
411
|
|
|
|
|
|
|
|
412
|
|
|
|
|
|
|
This software is Copyright (c) 2013 by Wellcome Trust Sanger Institute. |
413
|
|
|
|
|
|
|
|
414
|
|
|
|
|
|
|
This is free software, licensed under: |
415
|
|
|
|
|
|
|
|
416
|
|
|
|
|
|
|
The GNU General Public License, Version 3, June 2007 |
417
|
|
|
|
|
|
|
|
418
|
|
|
|
|
|
|
=cut |