| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package Bio::Roary::AssemblyStatistics; |
|
2
|
|
|
|
|
|
|
$Bio::Roary::AssemblyStatistics::VERSION = '3.10.1'; |
|
3
|
|
|
|
|
|
|
# ABSTRACT: Given a spreadsheet of gene presence and absence calculate some statistics |
|
4
|
|
|
|
|
|
|
|
|
5
|
|
|
|
|
|
|
|
|
6
|
2
|
|
|
2
|
|
87807
|
use Moose; |
|
|
2
|
|
|
|
|
376083
|
|
|
|
2
|
|
|
|
|
18
|
|
|
7
|
2
|
|
|
2
|
|
19160
|
use Bio::Roary::ExtractCoreGenesFromSpreadsheet; |
|
|
2
|
|
|
|
|
6
|
|
|
|
2
|
|
|
|
|
94
|
|
|
8
|
2
|
|
|
2
|
|
631
|
use Log::Log4perl qw(:easy); |
|
|
2
|
|
|
|
|
33192
|
|
|
|
2
|
|
|
|
|
20
|
|
|
9
|
|
|
|
|
|
|
with 'Bio::Roary::SpreadsheetRole'; |
|
10
|
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
has 'output_filename' => ( is => 'ro', isa => 'Str', default => 'assembly_statistics.csv' ); |
|
12
|
|
|
|
|
|
|
has 'job_runner' => ( is => 'ro', isa => 'Str', default => 'Local' ); |
|
13
|
|
|
|
|
|
|
has 'cpus' => ( is => 'ro', isa => 'Int', default => 1 ); |
|
14
|
|
|
|
|
|
|
has 'core_definition' => ( is => 'rw', isa => 'Num', default => 0.99 ); |
|
15
|
|
|
|
|
|
|
has '_cloud_percentage' => ( is => 'rw', isa => 'Num', default => 0.15 ); |
|
16
|
|
|
|
|
|
|
has '_shell_percentage' => ( is => 'rw', isa => 'Num', default => 0.95 ); |
|
17
|
|
|
|
|
|
|
has '_soft_core_percentage' => ( is => 'rw', isa => 'Num', default => 0.99 ); |
|
18
|
|
|
|
|
|
|
has 'verbose' => ( is => 'ro', isa => 'Bool', default => 0 ); |
|
19
|
|
|
|
|
|
|
has 'contiguous_window' => ( is => 'ro', isa => 'Int', default => 10 ); |
|
20
|
|
|
|
|
|
|
has 'ordered_genes' => ( is => 'ro', isa => 'ArrayRef', lazy => 1, builder => '_build_ordered_genes' ); |
|
21
|
|
|
|
|
|
|
has '_genes_to_rows' => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build__genes_to_rows' ); |
|
22
|
|
|
|
|
|
|
has 'all_sample_statistics' => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build_all_sample_statistics' ); |
|
23
|
|
|
|
|
|
|
has 'sample_names_to_column_index' => ( is => 'rw', isa => 'Maybe[HashRef]' ); |
|
24
|
|
|
|
|
|
|
has 'summary_output_filename'=> ( is => 'ro', isa => 'Str', default => 'summary_statistics.txt' ); |
|
25
|
|
|
|
|
|
|
has 'logger' => ( is => 'ro', lazy => 1, builder => '_build_logger'); |
|
26
|
|
|
|
|
|
|
has 'gene_category_count' => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build_gene_category_count' ); |
|
27
|
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
sub BUILD { |
|
29
|
4
|
|
|
4
|
0
|
10
|
my ($self) = @_; |
|
30
|
4
|
|
|
|
|
102
|
$self->_genes_to_rows; |
|
31
|
4
|
|
|
|
|
94
|
$self->gene_category_count; |
|
32
|
|
|
|
|
|
|
} |
|
33
|
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
sub _build_logger |
|
35
|
|
|
|
|
|
|
{ |
|
36
|
1
|
|
|
1
|
|
3
|
my ($self) = @_; |
|
37
|
1
|
|
|
|
|
10
|
Log::Log4perl->easy_init( $ERROR ); |
|
38
|
1
|
|
|
|
|
2705
|
my $logger = get_logger(); |
|
39
|
1
|
|
|
|
|
72
|
return $logger; |
|
40
|
|
|
|
|
|
|
} |
|
41
|
|
|
|
|
|
|
|
|
42
|
|
|
|
|
|
|
sub create_summary_output |
|
43
|
|
|
|
|
|
|
{ |
|
44
|
1
|
|
|
1
|
0
|
3
|
my ($self) = @_; |
|
45
|
1
|
50
|
|
|
|
41
|
open(my $fh, '>', $self->summary_output_filename) or Bio::Roary::Exceptions::CouldntWriteToFile->throw(error => "Couldnt write to ".$self->summary_output_filename); |
|
46
|
|
|
|
|
|
|
|
|
47
|
1
|
|
|
|
|
29
|
my $core_percentage = $self->core_definition()*100; |
|
48
|
1
|
|
|
|
|
21
|
my $soft_core_percentage = $self->_soft_core_percentage*100; |
|
49
|
1
|
|
|
|
|
19
|
my $shell_percentage = $self->_shell_percentage()*100; |
|
50
|
1
|
|
|
|
|
19
|
my $cloud_percentage = $self->_cloud_percentage()*100; |
|
51
|
|
|
|
|
|
|
|
|
52
|
1
|
50
|
|
|
|
19
|
my $core_genes = ($self->gene_category_count->{core} ? $self->gene_category_count->{core} : 0); |
|
53
|
1
|
50
|
|
|
|
19
|
my $soft_core_genes = ($self->gene_category_count->{soft_core} ? $self->gene_category_count->{soft_core} : 0); |
|
54
|
1
|
50
|
|
|
|
18
|
my $shell_genes =($self->gene_category_count->{shell} ? $self->gene_category_count->{shell} : 0); |
|
55
|
1
|
50
|
|
|
|
19
|
my $cloud_genes = ($self->gene_category_count->{cloud} ? $self->gene_category_count->{cloud} : 0); |
|
56
|
1
|
|
|
|
|
3
|
my $total_genes = $core_genes + $soft_core_genes + $shell_genes + $cloud_genes ; |
|
57
|
|
|
|
|
|
|
|
|
58
|
1
|
50
|
|
|
|
23
|
$self->logger->warn("Very few core genes detected with the current settings. Try modifying the core definition ( -cd 90 ) and/or |
|
59
|
|
|
|
|
|
|
the blast identity (-i 70) parameters. Also try checking for contamination (-qc) and ensure you only have one species.") if($core_genes < 100); |
|
60
|
|
|
|
|
|
|
|
|
61
|
1
|
|
|
|
|
8
|
print {$fh} "Core genes\t($core_percentage".'% <= strains <= 100%)'."\t$core_genes\n"; |
|
|
1
|
|
|
|
|
14
|
|
|
62
|
1
|
|
|
|
|
2
|
print {$fh} "Soft core genes\t(".$shell_percentage."% <= strains < ".$soft_core_percentage."%)\t$soft_core_genes\n"; |
|
|
1
|
|
|
|
|
5
|
|
|
63
|
1
|
|
|
|
|
2
|
print {$fh} "Shell genes\t(".$cloud_percentage."% <= strains < ".$shell_percentage."%)\t$shell_genes\n"; |
|
|
1
|
|
|
|
|
5
|
|
|
64
|
1
|
|
|
|
|
1
|
print {$fh} "Cloud genes\t(0% <= strains < ".$cloud_percentage."%)\t$cloud_genes\n"; |
|
|
1
|
|
|
|
|
3
|
|
|
65
|
1
|
|
|
|
|
1
|
print {$fh} "Total genes\t(0% <= strains <= 100%)\t$total_genes\n"; |
|
|
1
|
|
|
|
|
3
|
|
|
66
|
|
|
|
|
|
|
|
|
67
|
1
|
|
|
|
|
46
|
close($fh); |
|
68
|
1
|
|
|
|
|
7
|
return 1; |
|
69
|
|
|
|
|
|
|
} |
|
70
|
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
sub _build_gene_category_count { |
|
72
|
4
|
|
|
4
|
|
8
|
my ($self) = @_; |
|
73
|
4
|
|
|
|
|
6
|
my %gene_category_count; |
|
74
|
4
|
|
|
|
|
83
|
$self->_soft_core_percentage($self->core_definition); |
|
75
|
|
|
|
|
|
|
|
|
76
|
4
|
50
|
|
|
|
103
|
if ( $self->_soft_core_percentage <= $self->_shell_percentage ) { |
|
77
|
0
|
|
|
|
|
0
|
$self->_shell_percentage( $self->_soft_core_percentage - 0.01 ); |
|
78
|
|
|
|
|
|
|
} |
|
79
|
|
|
|
|
|
|
|
|
80
|
4
|
|
|
|
|
5
|
my $number_of_samples = keys %{ $self->sample_names_to_column_index }; |
|
|
4
|
|
|
|
|
83
|
|
|
81
|
4
|
|
|
|
|
5
|
for my $gene_name ( keys %{ $self->_genes_to_rows } ) { |
|
|
4
|
|
|
|
|
73
|
|
|
82
|
140
|
|
|
|
|
172
|
my $isolates_with_gene = 0; |
|
83
|
|
|
|
|
|
|
|
|
84
|
140
|
|
|
|
|
2859
|
for ( my $i = $self->_num_fixed_headers ; $i < @{ $self->_genes_to_rows->{$gene_name} } ; $i++ ) { |
|
|
3140
|
|
|
|
|
61315
|
|
|
85
|
|
|
|
|
|
|
$isolates_with_gene++ |
|
86
|
3000
|
100
|
66
|
|
|
57701
|
if ( defined( $self->_genes_to_rows->{$gene_name}->[$i] ) && $self->_genes_to_rows->{$gene_name}->[$i] ne "" ); |
|
87
|
|
|
|
|
|
|
} |
|
88
|
|
|
|
|
|
|
|
|
89
|
140
|
100
|
|
|
|
2902
|
if ( $isolates_with_gene < $self->_cloud_percentage() * $number_of_samples ) { |
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
90
|
12
|
|
|
|
|
42
|
$gene_category_count{cloud}++; |
|
91
|
|
|
|
|
|
|
} |
|
92
|
|
|
|
|
|
|
elsif ( $isolates_with_gene < $self->_shell_percentage() * $number_of_samples ) { |
|
93
|
72
|
|
|
|
|
210
|
$gene_category_count{shell}++; |
|
94
|
|
|
|
|
|
|
} |
|
95
|
|
|
|
|
|
|
elsif ( $isolates_with_gene < $self->_soft_core_percentage() * $number_of_samples ) { |
|
96
|
2
|
|
|
|
|
134
|
$gene_category_count{soft_core}++; |
|
97
|
|
|
|
|
|
|
} |
|
98
|
|
|
|
|
|
|
else { |
|
99
|
54
|
|
|
|
|
103
|
$gene_category_count{core}++; |
|
100
|
|
|
|
|
|
|
} |
|
101
|
|
|
|
|
|
|
} |
|
102
|
4
|
|
|
|
|
119
|
return \%gene_category_count; |
|
103
|
|
|
|
|
|
|
} |
|
104
|
|
|
|
|
|
|
|
|
105
|
|
|
|
|
|
|
sub _build_ordered_genes { |
|
106
|
1
|
|
|
1
|
|
3
|
my ($self) = @_; |
|
107
|
1
|
|
|
|
|
21
|
return Bio::Roary::ExtractCoreGenesFromSpreadsheet->new( spreadsheet => $self->spreadsheet, core_definition => $self->core_definition ) |
|
108
|
|
|
|
|
|
|
->ordered_core_genes(); |
|
109
|
|
|
|
|
|
|
} |
|
110
|
|
|
|
|
|
|
|
|
111
|
|
|
|
|
|
|
sub _build__genes_to_rows { |
|
112
|
4
|
|
|
4
|
|
7
|
my ($self) = @_; |
|
113
|
|
|
|
|
|
|
|
|
114
|
4
|
|
|
|
|
8
|
my %genes_to_rows; |
|
115
|
4
|
|
|
|
|
91
|
seek( $self->_input_spreadsheet_fh, 0, 0 ); |
|
116
|
4
|
|
|
|
|
79
|
my $header_row = $self->_csv_parser->getline( $self->_input_spreadsheet_fh ); |
|
117
|
4
|
|
|
|
|
285
|
$self->_populate_sample_names_to_column_index($header_row); |
|
118
|
|
|
|
|
|
|
|
|
119
|
4
|
|
|
|
|
73
|
while ( my $row = $self->_csv_parser->getline( $self->_input_spreadsheet_fh ) ) { |
|
120
|
144
|
100
|
66
|
|
|
2004
|
next if ( !defined( $row->[0] ) || $row->[0] eq "" ); |
|
121
|
140
|
|
|
|
|
3195
|
$genes_to_rows{ $row->[0] } = $row; |
|
122
|
|
|
|
|
|
|
} |
|
123
|
|
|
|
|
|
|
|
|
124
|
4
|
|
|
|
|
227
|
return \%genes_to_rows; |
|
125
|
|
|
|
|
|
|
} |
|
126
|
|
|
|
|
|
|
|
|
127
|
|
|
|
|
|
|
sub _populate_sample_names_to_column_index { |
|
128
|
4
|
|
|
4
|
|
10
|
my ( $self, $row ) = @_; |
|
129
|
|
|
|
|
|
|
|
|
130
|
4
|
|
|
|
|
6
|
my %samples_to_index; |
|
131
|
4
|
|
|
|
|
122
|
for ( my $i = $self->_num_fixed_headers ; $i < @{$row} ; $i++ ) { |
|
|
100
|
|
|
|
|
139
|
|
|
132
|
96
|
50
|
33
|
|
|
195
|
next if ( ( !defined( $row->[$i] ) ) || $row->[$i] eq "" ); |
|
133
|
96
|
|
|
|
|
134
|
$samples_to_index{ $row->[$i] } = $i; |
|
134
|
|
|
|
|
|
|
} |
|
135
|
4
|
|
|
|
|
94
|
$self->sample_names_to_column_index( \%samples_to_index ); |
|
136
|
|
|
|
|
|
|
} |
|
137
|
|
|
|
|
|
|
|
|
138
|
|
|
|
|
|
|
sub _build_all_sample_statistics { |
|
139
|
0
|
|
|
0
|
|
0
|
my ($self) = @_; |
|
140
|
|
|
|
|
|
|
|
|
141
|
0
|
|
|
|
|
0
|
my %sample_stats; |
|
142
|
|
|
|
|
|
|
|
|
143
|
|
|
|
|
|
|
# For each sample - loop over genes in order - number of contiguous blocks - max size of contiguous block - n50 - incorrect joins |
|
144
|
0
|
|
|
|
|
0
|
for my $sample_name ( sort keys %{ $self->sample_names_to_column_index } ) { |
|
|
0
|
|
|
|
|
0
|
|
|
145
|
0
|
|
|
|
|
0
|
$sample_stats{$sample_name} = $self->_sample_statistics($sample_name); |
|
146
|
|
|
|
|
|
|
} |
|
147
|
0
|
|
|
|
|
0
|
return \%sample_stats; |
|
148
|
|
|
|
|
|
|
} |
|
149
|
|
|
|
|
|
|
|
|
150
|
|
|
|
|
|
|
sub _sample_statistics { |
|
151
|
6
|
|
|
6
|
|
16
|
my ( $self, $sample_name ) = @_; |
|
152
|
|
|
|
|
|
|
|
|
153
|
6
|
|
|
|
|
167
|
my $sample_column_index = $self->sample_names_to_column_index->{$sample_name}; |
|
154
|
6
|
|
|
|
|
11
|
my @gene_ids; |
|
155
|
6
|
|
|
|
|
7
|
for my $gene_name ( @{ $self->ordered_genes } ) { |
|
|
6
|
|
|
|
|
119
|
|
|
156
|
300
|
|
|
|
|
5371
|
my $sample_gene_id = $self->_genes_to_rows->{$gene_name}->[$sample_column_index]; |
|
157
|
300
|
50
|
|
|
|
402
|
next unless ( defined($sample_gene_id) ); |
|
158
|
|
|
|
|
|
|
|
|
159
|
300
|
50
|
|
|
|
703
|
if ( $sample_gene_id =~ /_([\d]+)$/ ) { |
|
160
|
300
|
|
|
|
|
433
|
my $gene_number = $1; |
|
161
|
300
|
|
|
|
|
501
|
push( @gene_ids, $gene_number ); |
|
162
|
|
|
|
|
|
|
} |
|
163
|
|
|
|
|
|
|
else { |
|
164
|
0
|
|
|
|
|
0
|
next; |
|
165
|
|
|
|
|
|
|
} |
|
166
|
|
|
|
|
|
|
} |
|
167
|
|
|
|
|
|
|
|
|
168
|
6
|
|
|
|
|
15
|
return $self->_number_of_contiguous_blocks( \@gene_ids ); |
|
169
|
|
|
|
|
|
|
} |
|
170
|
|
|
|
|
|
|
|
|
171
|
|
|
|
|
|
|
sub _number_of_contiguous_blocks { |
|
172
|
6
|
|
|
6
|
|
11
|
my ( $self, $gene_ids ) = @_; |
|
173
|
|
|
|
|
|
|
|
|
174
|
6
|
|
|
|
|
10
|
my $current_gene_id = $gene_ids->[0]; |
|
175
|
6
|
|
|
|
|
7
|
my $number_of_blocks = 1; |
|
176
|
6
|
|
|
|
|
7
|
my $largest_block_size = 0; |
|
177
|
6
|
|
|
|
|
7
|
my $block_size = 0; |
|
178
|
6
|
|
|
|
|
8
|
for my $gene_id ( @{$gene_ids} ) { |
|
|
6
|
|
|
|
|
9
|
|
|
179
|
300
|
100
|
66
|
|
|
5700
|
if ( !( ( $current_gene_id + $self->contiguous_window >= $gene_id ) && ( $current_gene_id - $self->contiguous_window <= $gene_id ) ) |
|
180
|
|
|
|
|
|
|
) |
|
181
|
|
|
|
|
|
|
{ |
|
182
|
53
|
50
|
|
|
|
106
|
if ( $block_size >= $largest_block_size ) { |
|
183
|
53
|
|
|
|
|
55
|
$largest_block_size = $block_size; |
|
184
|
53
|
|
|
|
|
48
|
$block_size = 0; |
|
185
|
|
|
|
|
|
|
} |
|
186
|
53
|
|
|
|
|
160
|
$number_of_blocks++; |
|
187
|
|
|
|
|
|
|
} |
|
188
|
300
|
|
|
|
|
384
|
$current_gene_id = $gene_id; |
|
189
|
300
|
|
|
|
|
334
|
$block_size++; |
|
190
|
|
|
|
|
|
|
} |
|
191
|
|
|
|
|
|
|
|
|
192
|
6
|
100
|
|
|
|
13
|
if ( $block_size > $largest_block_size ) { |
|
193
|
3
|
|
|
|
|
3
|
$largest_block_size = $block_size; |
|
194
|
|
|
|
|
|
|
} |
|
195
|
6
|
|
|
|
|
51
|
return { num_blocks => $number_of_blocks, largest_block_size => $largest_block_size }; |
|
196
|
|
|
|
|
|
|
} |
|
197
|
|
|
|
|
|
|
|
|
198
|
2
|
|
|
2
|
|
4706
|
no Moose; |
|
|
2
|
|
|
|
|
6
|
|
|
|
2
|
|
|
|
|
18
|
|
|
199
|
|
|
|
|
|
|
__PACKAGE__->meta->make_immutable; |
|
200
|
|
|
|
|
|
|
|
|
201
|
|
|
|
|
|
|
1; |
|
202
|
|
|
|
|
|
|
|
|
203
|
|
|
|
|
|
|
__END__ |
|
204
|
|
|
|
|
|
|
|
|
205
|
|
|
|
|
|
|
=pod |
|
206
|
|
|
|
|
|
|
|
|
207
|
|
|
|
|
|
|
=encoding UTF-8 |
|
208
|
|
|
|
|
|
|
|
|
209
|
|
|
|
|
|
|
=head1 NAME |
|
210
|
|
|
|
|
|
|
|
|
211
|
|
|
|
|
|
|
Bio::Roary::AssemblyStatistics - Given a spreadsheet of gene presence and absence calculate some statistics |
|
212
|
|
|
|
|
|
|
|
|
213
|
|
|
|
|
|
|
=head1 VERSION |
|
214
|
|
|
|
|
|
|
|
|
215
|
|
|
|
|
|
|
version 3.10.1 |
|
216
|
|
|
|
|
|
|
|
|
217
|
|
|
|
|
|
|
=head1 SYNOPSIS |
|
218
|
|
|
|
|
|
|
|
|
219
|
|
|
|
|
|
|
Given a spreadsheet of gene presence and absence calculate some statistics |
|
220
|
|
|
|
|
|
|
|
|
221
|
|
|
|
|
|
|
=head1 AUTHOR |
|
222
|
|
|
|
|
|
|
|
|
223
|
|
|
|
|
|
|
Andrew J. Page <ap13@sanger.ac.uk> |
|
224
|
|
|
|
|
|
|
|
|
225
|
|
|
|
|
|
|
=head1 COPYRIGHT AND LICENSE |
|
226
|
|
|
|
|
|
|
|
|
227
|
|
|
|
|
|
|
This software is Copyright (c) 2013 by Wellcome Trust Sanger Institute. |
|
228
|
|
|
|
|
|
|
|
|
229
|
|
|
|
|
|
|
This is free software, licensed under: |
|
230
|
|
|
|
|
|
|
|
|
231
|
|
|
|
|
|
|
The GNU General Public License, Version 3, June 2007 |
|
232
|
|
|
|
|
|
|
|
|
233
|
|
|
|
|
|
|
=cut |