File Coverage

lib/Bio/Roary/AssemblyStatistics.pm
Criterion Covered Total %
statement 107 115 93.0
branch 25 36 69.4
condition 7 12 58.3
subroutine 13 14 92.8
pod 0 2 0.0
total 152 179 84.9


line stmt bran cond sub pod time code
1             package Bio::Roary::AssemblyStatistics;
2             $Bio::Roary::AssemblyStatistics::VERSION = '3.11.0';
3             # ABSTRACT: Given a spreadsheet of gene presence and absence calculate some statistics
4              
5              
6 2     2   96571 use Moose;
  2         415336  
  2         14  
7 2     2   13460 use Bio::Roary::ExtractCoreGenesFromSpreadsheet;
  2         7  
  2         83  
8 2     2   651 use Log::Log4perl qw(:easy);
  2         30821  
  2         18  
9             with 'Bio::Roary::SpreadsheetRole';
10              
11             has 'output_filename' => ( is => 'ro', isa => 'Str', default => 'assembly_statistics.csv' );
12             has 'job_runner' => ( is => 'ro', isa => 'Str', default => 'Local' );
13             has 'cpus' => ( is => 'ro', isa => 'Int', default => 1 );
14             has 'core_definition' => ( is => 'rw', isa => 'Num', default => 0.99 );
15             has '_cloud_percentage' => ( is => 'rw', isa => 'Num', default => 0.15 );
16             has '_shell_percentage' => ( is => 'rw', isa => 'Num', default => 0.95 );
17             has '_soft_core_percentage' => ( is => 'rw', isa => 'Num', default => 0.99 );
18             has 'verbose' => ( is => 'ro', isa => 'Bool', default => 0 );
19             has 'contiguous_window' => ( is => 'ro', isa => 'Int', default => 10 );
20             has 'ordered_genes' => ( is => 'ro', isa => 'ArrayRef', lazy => 1, builder => '_build_ordered_genes' );
21             has '_genes_to_rows' => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build__genes_to_rows' );
22             has 'all_sample_statistics' => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build_all_sample_statistics' );
23             has 'sample_names_to_column_index' => ( is => 'rw', isa => 'Maybe[HashRef]' );
24             has 'summary_output_filename'=> ( is => 'ro', isa => 'Str', default => 'summary_statistics.txt' );
25             has 'logger' => ( is => 'ro', lazy => 1, builder => '_build_logger');
26             has 'gene_category_count' => ( is => 'ro', isa => 'HashRef', lazy => 1, builder => '_build_gene_category_count' );
27              
28             sub BUILD {
29 4     4 0 9 my ($self) = @_;
30 4         103 $self->_genes_to_rows;
31 4         86 $self->gene_category_count;
32             }
33              
34             sub _build_logger
35             {
36 1     1   3 my ($self) = @_;
37 1         10 Log::Log4perl->easy_init( $ERROR );
38 1         2833 my $logger = get_logger();
39 1         77 return $logger;
40             }
41              
42             sub create_summary_output
43             {
44 1     1 0 3 my ($self) = @_;
45 1 50       34 open(my $fh, '>', $self->summary_output_filename) or Bio::Roary::Exceptions::CouldntWriteToFile->throw(error => "Couldnt write to ".$self->summary_output_filename);
46              
47 1         28 my $core_percentage = $self->core_definition()*100;
48 1         23 my $soft_core_percentage = $self->_soft_core_percentage*100;
49 1         20 my $shell_percentage = $self->_shell_percentage()*100;
50 1         19 my $cloud_percentage = $self->_cloud_percentage()*100;
51            
52 1 50       19 my $core_genes = ($self->gene_category_count->{core} ? $self->gene_category_count->{core} : 0);
53 1 50       19 my $soft_core_genes = ($self->gene_category_count->{soft_core} ? $self->gene_category_count->{soft_core} : 0);
54 1 50       20 my $shell_genes =($self->gene_category_count->{shell} ? $self->gene_category_count->{shell} : 0);
55 1 50       21 my $cloud_genes = ($self->gene_category_count->{cloud} ? $self->gene_category_count->{cloud} : 0);
56 1         2 my $total_genes = $core_genes + $soft_core_genes + $shell_genes + $cloud_genes ;
57            
58 1 50       23 $self->logger->warn("Very few core genes detected with the current settings. Try modifying the core definition ( -cd 90 ) and/or
59             the blast identity (-i 70) parameters. Also try checking for contamination (-qc) and ensure you only have one species.") if($core_genes < 100);
60            
61 1         9 print {$fh} "Core genes\t($core_percentage".'% <= strains <= 100%)'."\t$core_genes\n";
  1         15  
62 1         2 print {$fh} "Soft core genes\t(".$shell_percentage."% <= strains < ".$soft_core_percentage."%)\t$soft_core_genes\n";
  1         7  
63 1         2 print {$fh} "Shell genes\t(".$cloud_percentage."% <= strains < ".$shell_percentage."%)\t$shell_genes\n";
  1         7  
64 1         3 print {$fh} "Cloud genes\t(0% <= strains < ".$cloud_percentage."%)\t$cloud_genes\n";
  1         4  
65 1         2 print {$fh} "Total genes\t(0% <= strains <= 100%)\t$total_genes\n";
  1         3  
66            
67 1         46 close($fh);
68 1         10 return 1;
69             }
70              
71             sub _build_gene_category_count {
72 4     4   11 my ($self) = @_;
73 4         5 my %gene_category_count;
74 4         84 $self->_soft_core_percentage($self->core_definition);
75            
76 4 50       85 if ( $self->_soft_core_percentage <= $self->_shell_percentage ) {
77 0         0 $self->_shell_percentage( $self->_soft_core_percentage - 0.01 );
78             }
79              
80 4         9 my $number_of_samples = keys %{ $self->sample_names_to_column_index };
  4         86  
81 4         7 for my $gene_name ( keys %{ $self->_genes_to_rows } ) {
  4         73  
82 140         154 my $isolates_with_gene = 0;
83              
84 140         2638 for ( my $i = $self->_num_fixed_headers ; $i < @{ $self->_genes_to_rows->{$gene_name} } ; $i++ ) {
  3140         55782  
85             $isolates_with_gene++
86 3000 100 66     52483 if ( defined( $self->_genes_to_rows->{$gene_name}->[$i] ) && $self->_genes_to_rows->{$gene_name}->[$i] ne "" );
87             }
88              
89 140 100       2676 if ( $isolates_with_gene < $self->_cloud_percentage() * $number_of_samples ) {
    100          
    100          
90 12         31 $gene_category_count{cloud}++;
91             }
92             elsif ( $isolates_with_gene < $self->_shell_percentage() * $number_of_samples ) {
93 72         149 $gene_category_count{shell}++;
94             }
95             elsif ( $isolates_with_gene < $self->_soft_core_percentage() * $number_of_samples ) {
96 2         7 $gene_category_count{soft_core}++;
97             }
98             else {
99 54         100 $gene_category_count{core}++;
100             }
101             }
102 4         111 return \%gene_category_count;
103             }
104              
105             sub _build_ordered_genes {
106 1     1   3 my ($self) = @_;
107 1         22 return Bio::Roary::ExtractCoreGenesFromSpreadsheet->new( spreadsheet => $self->spreadsheet, core_definition => $self->core_definition )
108             ->ordered_core_genes();
109             }
110              
111             sub _build__genes_to_rows {
112 4     4   7 my ($self) = @_;
113              
114 4         7 my %genes_to_rows;
115 4         90 seek( $self->_input_spreadsheet_fh, 0, 0 );
116 4         87 my $header_row = $self->_csv_parser->getline( $self->_input_spreadsheet_fh );
117 4         307 $self->_populate_sample_names_to_column_index($header_row);
118              
119 4         77 while ( my $row = $self->_csv_parser->getline( $self->_input_spreadsheet_fh ) ) {
120 144 100 66     1912 next if ( !defined( $row->[0] ) || $row->[0] eq "" );
121 140         2834 $genes_to_rows{ $row->[0] } = $row;
122             }
123              
124 4         185 return \%genes_to_rows;
125             }
126              
127             sub _populate_sample_names_to_column_index {
128 4     4   10 my ( $self, $row ) = @_;
129              
130 4         5 my %samples_to_index;
131 4         99 for ( my $i = $self->_num_fixed_headers ; $i < @{$row} ; $i++ ) {
  100         135  
132 96 50 33     200 next if ( ( !defined( $row->[$i] ) ) || $row->[$i] eq "" );
133 96         144 $samples_to_index{ $row->[$i] } = $i;
134             }
135 4         99 $self->sample_names_to_column_index( \%samples_to_index );
136             }
137              
138             sub _build_all_sample_statistics {
139 0     0   0 my ($self) = @_;
140              
141 0         0 my %sample_stats;
142              
143             # For each sample - loop over genes in order - number of contiguous blocks - max size of contiguous block - n50 - incorrect joins
144 0         0 for my $sample_name ( sort keys %{ $self->sample_names_to_column_index } ) {
  0         0  
145 0         0 $sample_stats{$sample_name} = $self->_sample_statistics($sample_name);
146             }
147 0         0 return \%sample_stats;
148             }
149              
150             sub _sample_statistics {
151 6     6   14 my ( $self, $sample_name ) = @_;
152              
153 6         166 my $sample_column_index = $self->sample_names_to_column_index->{$sample_name};
154 6         8 my @gene_ids;
155 6         10 for my $gene_name ( @{ $self->ordered_genes } ) {
  6         111  
156 300         5283 my $sample_gene_id = $self->_genes_to_rows->{$gene_name}->[$sample_column_index];
157 300 50       406 next unless ( defined($sample_gene_id) );
158              
159 300 50       813 if ( $sample_gene_id =~ /_([\d]+)$/ ) {
160 300         403 my $gene_number = $1;
161 300         468 push( @gene_ids, $gene_number );
162             }
163             else {
164 0         0 next;
165             }
166             }
167              
168 6         17 return $self->_number_of_contiguous_blocks( \@gene_ids );
169             }
170              
171             sub _number_of_contiguous_blocks {
172 6     6   10 my ( $self, $gene_ids ) = @_;
173              
174 6         11 my $current_gene_id = $gene_ids->[0];
175 6         7 my $number_of_blocks = 1;
176 6         67 my $largest_block_size = 0;
177 6         8 my $block_size = 0;
178 6         6 for my $gene_id ( @{$gene_ids} ) {
  6         10  
179 300 100 66     5249 if ( !( ( $current_gene_id + $self->contiguous_window >= $gene_id ) && ( $current_gene_id - $self->contiguous_window <= $gene_id ) )
180             )
181             {
182 53 50       72 if ( $block_size >= $largest_block_size ) {
183 53         49 $largest_block_size = $block_size;
184 53         49 $block_size = 0;
185             }
186 53         45 $number_of_blocks++;
187             }
188 300         327 $current_gene_id = $gene_id;
189 300         331 $block_size++;
190             }
191              
192 6 100       11 if ( $block_size > $largest_block_size ) {
193 3         5 $largest_block_size = $block_size;
194             }
195 6         42 return { num_blocks => $number_of_blocks, largest_block_size => $largest_block_size };
196             }
197              
198 2     2   3353 no Moose;
  2         5  
  2         15  
199             __PACKAGE__->meta->make_immutable;
200              
201             1;
202              
203             __END__
204              
205             =pod
206              
207             =encoding UTF-8
208              
209             =head1 NAME
210              
211             Bio::Roary::AssemblyStatistics - Given a spreadsheet of gene presence and absence calculate some statistics
212              
213             =head1 VERSION
214              
215             version 3.11.0
216              
217             =head1 SYNOPSIS
218              
219             Given a spreadsheet of gene presence and absence calculate some statistics
220              
221             =head1 AUTHOR
222              
223             Andrew J. Page <ap13@sanger.ac.uk>
224              
225             =head1 COPYRIGHT AND LICENSE
226              
227             This software is Copyright (c) 2013 by Wellcome Trust Sanger Institute.
228              
229             This is free software, licensed under:
230              
231             The GNU General Public License, Version 3, June 2007
232              
233             =cut