| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package Bio::Roary::ExtractCoreGenesFromSpreadsheet; |
|
2
|
|
|
|
|
|
|
$Bio::Roary::ExtractCoreGenesFromSpreadsheet::VERSION = '3.11.0'; |
|
3
|
|
|
|
|
|
|
# ABSTRACT: Take in a spreadsheet produced by the pipeline and identify the core genes. |
|
4
|
|
|
|
|
|
|
|
|
5
|
|
|
|
|
|
|
|
|
6
|
4
|
|
|
4
|
|
98024
|
use Moose; |
|
|
4
|
|
|
|
|
374019
|
|
|
|
4
|
|
|
|
|
23
|
|
|
7
|
4
|
|
|
4
|
|
25271
|
use Text::CSV; |
|
|
4
|
|
|
|
|
44875
|
|
|
|
4
|
|
|
|
|
171
|
|
|
8
|
4
|
|
|
4
|
|
937
|
use Bio::Roary::GroupStatistics; |
|
|
4
|
|
|
|
|
11
|
|
|
|
4
|
|
|
|
|
139
|
|
|
9
|
4
|
|
|
4
|
|
26
|
use POSIX; |
|
|
4
|
|
|
|
|
6
|
|
|
|
4
|
|
|
|
|
30
|
|
|
10
|
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Public attributes
# ---------------------------------------------------------------------------

# Name of the spreadsheet file; it is opened by _build__input_spreadsheet_fh.
has spreadsheet => (
    is       => 'ro',
    isa      => 'Str',
    required => 1,
);

# CSV parser used to read the header row and every data row.
has _csv_parser => (
    is      => 'ro',
    isa     => 'Text::CSV',
    lazy    => 1,
    builder => '_build__csv_parser',
);

# File handle on the spreadsheet, opened on first use.
has _input_spreadsheet_fh => (
    is      => 'ro',
    lazy    => 1,
    builder => '_build__input_spreadsheet_fh',
);

# Gene names of the rows accepted as core, ordered by genome fragment and
# then by position within the fragment. Reading this attribute for the
# first time consumes the spreadsheet.
has ordered_core_genes => (
    is      => 'ro',
    isa     => 'ArrayRef',
    lazy    => 1,
    builder => '_build_ordered_core_genes',
);

# Multiplied by the number of isolates to give the minimum "No. isolates"
# value a row needs in order to be kept (default 1).
has core_definition => (
    is      => 'ro',
    isa     => 'Num',
    default => 1,
);

# Header cells that follow the fixed headers; filled in by
# _setup_column_mappings.
has sample_names => (
    is      => 'rw',
    isa     => 'ArrayRef',
    default => sub { [] },
);

# Lookup of the form { sample name => { cell value => 1 } }, filled in for
# every row that is kept.
has sample_names_to_genes => (
    is      => 'rw',
    isa     => 'HashRef',
    default => sub { {} },
);

# When false (the default) a row is kept only if its average sequences per
# isolate equals 1; when true, rows with an average of 1 or more are kept.
has allow_paralogs => (
    is      => 'rw',
    isa     => 'Bool',
    default => 0,
);

# ---------------------------------------------------------------------------
# Internal state derived from the header row
# ---------------------------------------------------------------------------

# Number of header columns minus the number of fixed headers.
has _number_of_isolates => ( is => 'rw', isa => 'Int' );

# Column indices of the fields of interest, set by _setup_column_mappings.
has _gene_column                      => ( is => 'rw', isa => 'Int' );
has _num_isolates_column              => ( is => 'rw', isa => 'Int' );
has _avg_sequences_per_isolate_column => ( is => 'rw', isa => 'Int' );
has _genome_fragement_column          => ( is => 'rw', isa => 'Int' );
has _order_within_fragement_column    => ( is => 'rw', isa => 'Int' );

# Threshold computed lazily from _number_of_isolates and core_definition.
has _min_no_isolates_for_core => (
    is      => 'rw',
    isa     => 'Num',
    lazy    => 1,
    builder => '_build__min_no_isolates_for_core',
);
|
27
|
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
# Builder for the _min_no_isolates_for_core attribute.
#
# Returns the number of isolates multiplied by core_definition. Rows whose
# "No. isolates" value is below this figure are skipped when the core genes
# are collected.
sub _build__min_no_isolates_for_core {
    my ($self) = @_;

    return $self->_number_of_isolates * $self->core_definition;
}
|
34
|
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
# Builder for the _csv_parser attribute.
#
# Returns a new Text::CSV object created with the binary and always_quote
# options switched on.
sub _build__csv_parser {
    my ($self) = @_;

    my %parser_options = (
        binary       => 1,
        always_quote => 1,
    );
    return Text::CSV->new( \%parser_options );
}
|
39
|
|
|
|
|
|
|
|
|
40
|
|
|
|
|
|
|
# Builder for the _input_spreadsheet_fh attribute.
#
# Opens the file named by the spreadsheet attribute for reading and returns
# the file handle.
#
# Dies with a message naming the file and the system error if the file
# cannot be opened, instead of handing an undefined handle to the CSV parser.
sub _build__input_spreadsheet_fh {
    my ($self) = @_;

    my $filename = $self->spreadsheet;

    # Three-argument open: the name is always treated as a plain path, so
    # characters such as '>' or '|' in it cannot change the open mode.
    open( my $fh, '<', $filename )
      or die "Could not open spreadsheet '$filename' for reading: $!";

    return $fh;
}
|
45
|
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
# Work out how many isolates the spreadsheet describes and store the result
# in _number_of_isolates.
#
# $header_row is an array reference holding the header cells. The count is
# the number of header cells minus the number of fixed headers reported by
# Bio::Roary::GroupStatistics.
sub _update_number_of_isolates {
    my ( $self, $header_row ) = @_;

    my $total_columns = scalar @{$header_row};
    my $fixed_columns = scalar @{ Bio::Roary::GroupStatistics->fixed_headers };

    $self->_number_of_isolates( $total_columns - $fixed_columns );
}
|
51
|
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
# Record where the columns of interest and the sample columns are, based on
# the header row of the spreadsheet.
#
# $header_row is an array reference holding the header cells.
#
# Side effects on $self:
#   * the _gene_column, _num_isolates_column,
#     _avg_sequences_per_isolate_column, _genome_fragement_column and
#     _order_within_fragement_column indices are set;
#   * _update_number_of_isolates is called with the header row;
#   * sample_names is set to the header cells that follow the fixed headers.
sub _setup_column_mappings {
    my ( $self, $header_row ) = @_;

    # Current ordering, used for any column whose name is not found in the
    # header row.
    my %columns_of_interest_mappings = (
        'Gene'                      => 0,
        'No. isolates'              => 3,
        'Avg sequences per isolate' => 5,
        'Genome Fragment'           => 6,
        'Order within Fragment'     => 7,
        'QC'                        => 10,
    );

    my $column_count = scalar @{$header_row};

    # Dynamically overwrite the default ordering. Only the column *names*
    # (the hash keys) are matched against the header cells; the default
    # index values are never treated as names.
    for my $i ( 0 .. $column_count - 1 ) {
        my $header = $header_row->[$i];
        next unless defined($header) && exists $columns_of_interest_mappings{$header};
        $columns_of_interest_mappings{$header} = $i;
    }

    $self->_gene_column( $columns_of_interest_mappings{'Gene'} );
    $self->_num_isolates_column( $columns_of_interest_mappings{'No. isolates'} );
    $self->_avg_sequences_per_isolate_column( $columns_of_interest_mappings{'Avg sequences per isolate'} );
    $self->_genome_fragement_column( $columns_of_interest_mappings{'Genome Fragment'} );
    $self->_order_within_fragement_column( $columns_of_interest_mappings{'Order within Fragment'} );
    $self->_update_number_of_isolates($header_row);

    # Get the sample_names: every header cell after the fixed headers.
    my $first_sample_column = $self->_length_of_fixed_headers();
    my @sample_names        = @{$header_row}[ $first_sample_column .. $column_count - 1 ];
    $self->sample_names( \@sample_names );
}
|
88
|
|
|
|
|
|
|
|
|
89
|
|
|
|
|
|
|
# Report the fixed headers defined by Bio::Roary::GroupStatistics.
#
# The array is returned as-is, so the result depends on the calling context:
# in scalar context (how the callers in this file use it) it is the number
# of fixed headers; in list context it is the headers themselves.
sub _length_of_fixed_headers {
    my ($self) = @_;

    my $fixed_headers = Bio::Roary::GroupStatistics->fixed_headers();
    return @{$fixed_headers};
}
|
93
|
|
|
|
|
|
|
|
|
94
|
|
|
|
|
|
|
# Add the sample cells of one spreadsheet row to sample_names_to_genes.
#
# $row is an array reference holding the cells of the row. Every cell after
# the fixed headers that is defined and not empty is recorded as
#   sample_names_to_genes->{ sample name }{ cell value } = 1
# where the sample name is the entry of sample_names at the same offset.
#
# Always returns 1.
sub _populate_sample_to_gene_lookup_with_row {
    my ( $self, $row ) = @_;

    my $first_sample_column = $self->_length_of_fixed_headers();
    my $last_column         = scalar( @{$row} ) - 1;

    for my $column ( $first_sample_column .. $last_column ) {
        my $cell = $row->[$column];

        # Samples without an entry in this row are left out.
        next if !defined($cell) || $cell eq "";

        my $sample_name = $self->sample_names->[ $column - $first_sample_column ];
        $self->sample_names_to_genes->{$sample_name}->{$cell} = 1;
    }

    return 1;
}
|
106
|
|
|
|
|
|
|
|
|
107
|
|
|
|
|
|
|
# Read the remaining rows of the spreadsheet and return the names of the
# genes accepted as core, as an array reference.
#
# A row is skipped when any of the following holds:
#   * it has fewer than 12 cells;
#   * its gene name, average sequences per isolate or genome fragment cell
#     is undefined or empty;
#   * its "No. isolates" value is below _min_no_isolates_for_core;
#   * its average sequences per isolate is below 1 (allow_paralogs set) or
#     is not exactly 1 (allow_paralogs not set).
#
# Every row that is kept is also passed to
# _populate_sample_to_gene_lookup_with_row.
#
# The returned names are ordered numerically by genome fragment and, within
# a fragment, numerically by the order-within-fragment value. Two kept rows
# with the same fragment and order overwrite each other; the later row wins.
sub _ordered_core_genes {
    my ($self) = @_;

    # genome fragment => { order within fragment => gene name }
    my %gene_for_position;

  ROW:
    while ( my $row = $self->_csv_parser->getline( $self->_input_spreadsheet_fh ) ) {

        # no genes in group
        next ROW if scalar( @{$row} ) < 12;

        # no gene name
        my $gene_name = $row->[ $self->_gene_column ];
        next ROW unless defined($gene_name) && $gene_name ne '';

        # no average
        my $avg_sequences = $row->[ $self->_avg_sequences_per_isolate_column ];
        next ROW unless defined($avg_sequences) && $avg_sequences ne '';

        # fragment not defined
        my $fragment = $row->[ $self->_genome_fragement_column ];
        next ROW unless defined($fragment) && $fragment ne '';

        # gene is in too few isolates
        next ROW if $row->[ $self->_num_isolates_column ] < $self->_min_no_isolates_for_core;

        # Copy-number filter: an average below 1 should never happen, so
        # with paralogs allowed this keeps everything that got this far;
        # otherwise only rows averaging exactly one sequence are kept.
        next ROW if $self->allow_paralogs ? $avg_sequences < 1 : $avg_sequences != 1;

        $gene_for_position{$fragment}{ $row->[ $self->_order_within_fragement_column ] } = $gene_name;
        $self->_populate_sample_to_gene_lookup_with_row($row);
    }

    my @ordered_core_genes;
    for my $fragment ( sort { $a <=> $b } keys %gene_for_position ) {
        my $genes_in_fragment = $gene_for_position{$fragment};
        push @ordered_core_genes,
          map { $genes_in_fragment->{$_} }
          sort { $a <=> $b } keys %{$genes_in_fragment};
    }

    return \@ordered_core_genes;
}
|
144
|
|
|
|
|
|
|
|
|
145
|
|
|
|
|
|
|
# Builder for the ordered_core_genes attribute.
#
# Reads the first line of the spreadsheet as the header row, uses it to set
# up the column mappings, then collects the core genes from the remaining
# rows. Returns the array reference produced by _ordered_core_genes.
sub _build_ordered_core_genes {
    my ($self) = @_;

    my $parser     = $self->_csv_parser;
    my $fh         = $self->_input_spreadsheet_fh;
    my $header_row = $parser->getline($fh);

    $self->_setup_column_mappings($header_row);

    return $self->_ordered_core_genes();
}
|
152
|
|
|
|
|
|
|
|
|
153
|
4
|
|
|
4
|
|
8710
|
no Moose; |
|
|
4
|
|
|
|
|
8
|
|
|
|
4
|
|
|
|
|
21
|
|
|
154
|
|
|
|
|
|
|
__PACKAGE__->meta->make_immutable; |
|
155
|
|
|
|
|
|
|
|
|
156
|
|
|
|
|
|
|
1; |
|
157
|
|
|
|
|
|
|
|
|
158
|
|
|
|
|
|
|
__END__ |
|
159
|
|
|
|
|
|
|
|
|
160
|
|
|
|
|
|
|
=pod |
|
161
|
|
|
|
|
|
|
|
|
162
|
|
|
|
|
|
|
=encoding UTF-8 |
|
163
|
|
|
|
|
|
|
|
|
164
|
|
|
|
|
|
|
=head1 NAME |
|
165
|
|
|
|
|
|
|
|
|
166
|
|
|
|
|
|
|
Bio::Roary::ExtractCoreGenesFromSpreadsheet - Take in a spreadsheet produced by the pipeline and identify the core genes. |
|
167
|
|
|
|
|
|
|
|
|
168
|
|
|
|
|
|
|
=head1 VERSION |
|
169
|
|
|
|
|
|
|
|
|
170
|
|
|
|
|
|
|
version 3.11.0 |
|
171
|
|
|
|
|
|
|
|
|
172
|
|
|
|
|
|
|
=head1 SYNOPSIS |
|
173
|
|
|
|
|
|
|
|
|
174
|
|
|
|
|
|
|
Take in a spreadsheet produced by the pipeline and identify the core genes. |
|
175
|
|
|
|
|
|
|
use Bio::Roary::ExtractCoreGenesFromSpreadsheet; |
|
176
|
|
|
|
|
|
|
|
|
177
|
|
|
|
|
|
|
my $obj = Bio::Roary::ExtractCoreGenesFromSpreadsheet->new( |
|
178
|
|
|
|
|
|
|
spreadsheet => 'group_statistics.csv', |
|
179
|
|
|
|
|
|
|
); |
|
180
|
|
|
|
|
|
|
$obj->ordered_core_genes(); |
|
181
|
|
|
|
|
|
|
|
|
182
|
|
|
|
|
|
|
=head1 AUTHOR |
|
183
|
|
|
|
|
|
|
|
|
184
|
|
|
|
|
|
|
Andrew J. Page <ap13@sanger.ac.uk> |
|
185
|
|
|
|
|
|
|
|
|
186
|
|
|
|
|
|
|
=head1 COPYRIGHT AND LICENSE |
|
187
|
|
|
|
|
|
|
|
|
188
|
|
|
|
|
|
|
This software is Copyright (c) 2013 by Wellcome Trust Sanger Institute. |
|
189
|
|
|
|
|
|
|
|
|
190
|
|
|
|
|
|
|
This is free software, licensed under: |
|
191
|
|
|
|
|
|
|
|
|
192
|
|
|
|
|
|
|
The GNU General Public License, Version 3, June 2007 |
|
193
|
|
|
|
|
|
|
|
|
194
|
|
|
|
|
|
|
=cut |