File Coverage

lib/Bio/Roary/ExtractCoreGenesFromSpreadsheet.pm
Criterion Covered Total %
statement 83 83 100.0
branch 18 20 90.0
condition 6 12 50.0
subroutine 14 14 100.0
pod n/a
total 121 129 93.8


line stmt bran cond sub pod time code
1             package Bio::Roary::ExtractCoreGenesFromSpreadsheet;
2             $Bio::Roary::ExtractCoreGenesFromSpreadsheet::VERSION = '3.11.0';
3             # ABSTRACT: Take in a spreadsheet produced by the pipeline and identify the core genes.
4              
5              
6 4     4   98024 use Moose;
  4         374019  
  4         23  
7 4     4   25271 use Text::CSV;
  4         44875  
  4         171  
8 4     4   937 use Bio::Roary::GroupStatistics;
  4         11  
  4         139  
9 4     4   26 use POSIX;
  4         6  
  4         30  
10              
11             has 'spreadsheet' => ( is => 'ro', isa => 'Str', required => 1 );
12             has '_csv_parser' => ( is => 'ro', isa => 'Text::CSV', lazy => 1, builder => '_build__csv_parser' );
13             has '_input_spreadsheet_fh' => ( is => 'ro', lazy => 1, builder => '_build__input_spreadsheet_fh' );
14             has 'ordered_core_genes' => ( is => 'ro', isa => 'ArrayRef', lazy => 1, builder => '_build_ordered_core_genes' );
15             has 'core_definition' => ( is => 'ro', isa => 'Num', default => 1 );
16             has 'sample_names' => ( is => 'rw', isa => 'ArrayRef', default => sub { [] } );
17             has 'sample_names_to_genes' => ( is => 'rw', isa => 'HashRef', default => sub { {} } );
18             has 'allow_paralogs' => ( is => 'rw', isa => 'Bool', default => 0 );
19              
20             has '_number_of_isolates' => ( is => 'rw', isa => 'Int' );
21             has '_gene_column' => ( is => 'rw', isa => 'Int' );
22             has '_num_isolates_column' => ( is => 'rw', isa => 'Int' );
23             has '_avg_sequences_per_isolate_column' => ( is => 'rw', isa => 'Int' );
24             has '_genome_fragement_column' => ( is => 'rw', isa => 'Int' );
25             has '_order_within_fragement_column' => ( is => 'rw', isa => 'Int' );
26             has '_min_no_isolates_for_core' => ( is => 'rw', isa => 'Num', lazy => 1, builder => '_build__min_no_isolates_for_core' );
27              
28             sub _build__min_no_isolates_for_core {
29 5     5   12 my ($self) = @_;
30 5         125 my $threshold = $self->_number_of_isolates * $self->core_definition;
31              
32 5         115 return $threshold;
33             }
34              
35             sub _build__csv_parser {
36 5     5   12 my ($self) = @_;
37 5         49 return Text::CSV->new( { binary => 1, always_quote => 1 } );
38             }
39              
40             sub _build__input_spreadsheet_fh {
41 5     5   12 my ($self) = @_;
42 5         166 open( my $fh, $self->spreadsheet );
43 5         160 return $fh;
44             }
45              
46             sub _update_number_of_isolates {
47 5     5   13 my ( $self, $header_row ) = @_;
48 5         6 my $number_of_isolates = @{$header_row} - @{ Bio::Roary::GroupStatistics->fixed_headers };
  5         8  
  5         71  
49 5         136 $self->_number_of_isolates($number_of_isolates);
50             }
51              
52             sub _setup_column_mappings {
53 5     5   12 my ( $self, $header_row ) = @_;
54              
55             # current ordering
56 5         35 my %columns_of_interest_mappings = (
57             'Gene' => 0,
58             'No. isolates' => 3,
59             'Avg sequences per isolate' => 5,
60             'Genome Fragment' => 6,
61             'Order within Fragment' => 7,
62             'QC' => 10,
63             );
64              
65             # Dynamically overwrite the default ordering
66 5         11 for ( my $i = 0 ; $i < @{$header_row} ; $i++ ) {
  90         133  
67 85         131 for my $col_name (%columns_of_interest_mappings) {
68 840 100       1064 if ( $header_row->[$i] eq $col_name ) {
69 30         33 $columns_of_interest_mappings{$col_name} = $i;
70 30         46 last;
71             }
72             }
73             }
74 5         145 $self->_gene_column( $columns_of_interest_mappings{'Gene'} );
75 5         130 $self->_num_isolates_column( $columns_of_interest_mappings{'No. isolates'} );
76 5         134 $self->_avg_sequences_per_isolate_column( $columns_of_interest_mappings{'Avg sequences per isolate'} );
77 5         126 $self->_genome_fragement_column( $columns_of_interest_mappings{'Genome Fragment'} );
78 5         139 $self->_order_within_fragement_column( $columns_of_interest_mappings{'Order within Fragment'} );
79 5         16 $self->_update_number_of_isolates($header_row);
80              
81             # Get the sample_names
82 5         8 my @sample_names;
83 5         16 for ( my $i = $self->_length_of_fixed_headers() ; $i < @{$header_row} ; $i++ ) {
  20         39  
84 15         29 push( @sample_names, $header_row->[$i] );
85             }
86 5         176 $self->sample_names( \@sample_names );
87             }
88              
89             sub _length_of_fixed_headers {
90 388     388   482 my ($self) = @_;
91 388         399 return @{ Bio::Roary::GroupStatistics->fixed_headers() };
  388         901  
92             }
93              
94             sub _populate_sample_to_gene_lookup_with_row {
95 61     61   96 my ( $self, $row ) = @_;
96              
97 61         93 for ( my $i = $self->_length_of_fixed_headers() ; $i < @{$row} ; $i++ ) {
  384         619  
98 323 100 66     826 if ( defined( $row->[$i] ) && $row->[$i] ne "" ) {
99 322         6920 my $sample_name = $self->sample_names->[ $i - $self->_length_of_fixed_headers() ];
100              
101 322         7246 $self->sample_names_to_genes->{$sample_name}->{ $row->[$i] } = 1;
102             }
103             }
104 61         1417 return 1;
105             }
106              
107             sub _ordered_core_genes {
108 5     5   10 my ($self) = @_;
109 5         8 my %ordered_genes;
110 5         117 while ( my $row = $self->_csv_parser->getline( $self->_input_spreadsheet_fh ) ) {
111 77 100       1998 next if ( @{$row} < 12 ); # no genes in group
  77         158  
112 76 50 33     1730 next if ( !defined( $row->[ $self->_gene_column ] ) || $row->[ $self->_gene_column ] eq '' ); # no gene name
113             next
114 76 50 33     1851 if ( !defined( $row->[ $self->_avg_sequences_per_isolate_column ] ) || $row->[ $self->_avg_sequences_per_isolate_column ] eq '' )
115             ; # no average
116             next
117 76 100 66     1743 if ( !defined( $row->[ $self->_genome_fragement_column ] ) || $row->[ $self->_genome_fragement_column ] eq '' )
118             ; # fragment not defined
119              
120             # next if($self->_number_of_isolates != $row->[$self->_num_isolates_column]); # if gene is not in all isolates
121 74 100       1648 next if ( $row->[ $self->_num_isolates_column ] < $self->_min_no_isolates_for_core );
122              
123 64 100       1406 if ( $self->allow_paralogs ) {
124             # should never happen
125 5 100       116 next if ( $row->[ $self->_avg_sequences_per_isolate_column ] < 1 );
126             }
127             else {
128 59 100       1384 next if ( $row->[ $self->_avg_sequences_per_isolate_column ] != 1 );
129             }
130              
131 61         1297 $ordered_genes{ $row->[ $self->_genome_fragement_column ] }{ $row->[ $self->_order_within_fragement_column ] } =
132             $row->[ $self->_gene_column ];
133 61         122 $self->_populate_sample_to_gene_lookup_with_row($row);
134             }
135              
136 5         134 my @ordered_core_genes;
137 5         27 for my $fragment_key ( sort { $a <=> $b } keys %ordered_genes ) {
  2         8  
138 7         12 for my $order_within_fragement ( sort { $a <=> $b } keys %{ $ordered_genes{$fragment_key} } ) {
  232         229  
  7         37  
139 61         97 push( @ordered_core_genes, $ordered_genes{$fragment_key}{$order_within_fragement} );
140             }
141             }
142 5         189 return \@ordered_core_genes;
143             }
144              
145             sub _build_ordered_core_genes {
146 5     5   13 my ($self) = @_;
147 5         124 my $header_row = $self->_csv_parser->getline( $self->_input_spreadsheet_fh );
148 5         253 $self->_setup_column_mappings($header_row);
149              
150 5         18 return $self->_ordered_core_genes();
151             }
152              
153 4     4   8710 no Moose;
  4         8  
  4         21  
154             __PACKAGE__->meta->make_immutable;
155              
156             1;
157              
158             __END__
159              
160             =pod
161              
162             =encoding UTF-8
163              
164             =head1 NAME
165              
166             Bio::Roary::ExtractCoreGenesFromSpreadsheet - Take in a spreadsheet produced by the pipeline and identify the core genes.
167              
168             =head1 VERSION
169              
170             version 3.11.0
171              
172             =head1 SYNOPSIS
173              
174             Take in a spreadsheet produced by the pipeline and identify the core genes.
175             use Bio::Roary::ExtractCoreGenesFromSpreadsheet;
176              
177             my $obj = Bio::Roary::ExtractCoreGenesFromSpreadsheet->new(
178             spreadsheet => 'group_statistics.csv',
179             );
180             $obj->ordered_core_genes();
181              
182             =head1 AUTHOR
183              
184             Andrew J. Page <ap13@sanger.ac.uk>
185              
186             =head1 COPYRIGHT AND LICENSE
187              
188             This software is Copyright (c) 2013 by Wellcome Trust Sanger Institute.
189              
190             This is free software, licensed under:
191              
192             The GNU General Public License, Version 3, June 2007
193              
194             =cut