line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Bio::Roary::ExtractCoreGenesFromSpreadsheet; |
2
|
|
|
|
|
|
|
$Bio::Roary::ExtractCoreGenesFromSpreadsheet::VERSION = '3.11.0'; |
3
|
|
|
|
|
|
|
# ABSTRACT: Take in a spreadsheet produced by the pipeline and identify the core genes. |
4
|
|
|
|
|
|
|
|
5
|
|
|
|
|
|
|
|
6
|
4
|
|
|
4
|
|
98024
|
use Moose; |
|
4
|
|
|
|
|
374019
|
|
|
4
|
|
|
|
|
23
|
|
7
|
4
|
|
|
4
|
|
25271
|
use Text::CSV; |
|
4
|
|
|
|
|
44875
|
|
|
4
|
|
|
|
|
171
|
|
8
|
4
|
|
|
4
|
|
937
|
use Bio::Roary::GroupStatistics; |
|
4
|
|
|
|
|
11
|
|
|
4
|
|
|
|
|
139
|
|
9
|
4
|
|
|
4
|
|
26
|
use POSIX; |
|
4
|
|
|
|
|
6
|
|
|
4
|
|
|
|
|
30
|
|
10
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
has 'spreadsheet' => ( is => 'ro', isa => 'Str', required => 1 ); |
12
|
|
|
|
|
|
|
has '_csv_parser' => ( is => 'ro', isa => 'Text::CSV', lazy => 1, builder => '_build__csv_parser' ); |
13
|
|
|
|
|
|
|
has '_input_spreadsheet_fh' => ( is => 'ro', lazy => 1, builder => '_build__input_spreadsheet_fh' ); |
14
|
|
|
|
|
|
|
has 'ordered_core_genes' => ( is => 'ro', isa => 'ArrayRef', lazy => 1, builder => '_build_ordered_core_genes' ); |
15
|
|
|
|
|
|
|
has 'core_definition' => ( is => 'ro', isa => 'Num', default => 1 ); |
16
|
|
|
|
|
|
|
has 'sample_names' => ( is => 'rw', isa => 'ArrayRef', default => sub { [] } ); |
17
|
|
|
|
|
|
|
has 'sample_names_to_genes' => ( is => 'rw', isa => 'HashRef', default => sub { {} } ); |
18
|
|
|
|
|
|
|
has 'allow_paralogs' => ( is => 'rw', isa => 'Bool', default => 0 ); |
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
has '_number_of_isolates' => ( is => 'rw', isa => 'Int' ); |
21
|
|
|
|
|
|
|
has '_gene_column' => ( is => 'rw', isa => 'Int' ); |
22
|
|
|
|
|
|
|
has '_num_isolates_column' => ( is => 'rw', isa => 'Int' ); |
23
|
|
|
|
|
|
|
has '_avg_sequences_per_isolate_column' => ( is => 'rw', isa => 'Int' ); |
24
|
|
|
|
|
|
|
has '_genome_fragement_column' => ( is => 'rw', isa => 'Int' ); |
25
|
|
|
|
|
|
|
has '_order_within_fragement_column' => ( is => 'rw', isa => 'Int' ); |
26
|
|
|
|
|
|
|
has '_min_no_isolates_for_core' => ( is => 'rw', isa => 'Num', lazy => 1, builder => '_build__min_no_isolates_for_core' ); |
27
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
sub _build__min_no_isolates_for_core { |
29
|
5
|
|
|
5
|
|
12
|
my ($self) = @_; |
30
|
5
|
|
|
|
|
125
|
my $threshold = $self->_number_of_isolates * $self->core_definition; |
31
|
|
|
|
|
|
|
|
32
|
5
|
|
|
|
|
115
|
return $threshold; |
33
|
|
|
|
|
|
|
} |
34
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
sub _build__csv_parser { |
36
|
5
|
|
|
5
|
|
12
|
my ($self) = @_; |
37
|
5
|
|
|
|
|
49
|
return Text::CSV->new( { binary => 1, always_quote => 1 } ); |
38
|
|
|
|
|
|
|
} |
39
|
|
|
|
|
|
|
|
40
|
|
|
|
|
|
|
sub _build__input_spreadsheet_fh { |
41
|
5
|
|
|
5
|
|
12
|
my ($self) = @_; |
42
|
5
|
|
|
|
|
166
|
open( my $fh, $self->spreadsheet ); |
43
|
5
|
|
|
|
|
160
|
return $fh; |
44
|
|
|
|
|
|
|
} |
45
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
sub _update_number_of_isolates { |
47
|
5
|
|
|
5
|
|
13
|
my ( $self, $header_row ) = @_; |
48
|
5
|
|
|
|
|
6
|
my $number_of_isolates = @{$header_row} - @{ Bio::Roary::GroupStatistics->fixed_headers }; |
|
5
|
|
|
|
|
8
|
|
|
5
|
|
|
|
|
71
|
|
49
|
5
|
|
|
|
|
136
|
$self->_number_of_isolates($number_of_isolates); |
50
|
|
|
|
|
|
|
} |
51
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
sub _setup_column_mappings { |
53
|
5
|
|
|
5
|
|
12
|
my ( $self, $header_row ) = @_; |
54
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
#Â current ordering |
56
|
5
|
|
|
|
|
35
|
my %columns_of_interest_mappings = ( |
57
|
|
|
|
|
|
|
'Gene' => 0, |
58
|
|
|
|
|
|
|
'No. isolates' => 3, |
59
|
|
|
|
|
|
|
'Avg sequences per isolate' => 5, |
60
|
|
|
|
|
|
|
'Genome Fragment' => 6, |
61
|
|
|
|
|
|
|
'Order within Fragment' => 7, |
62
|
|
|
|
|
|
|
'QC' => 10, |
63
|
|
|
|
|
|
|
); |
64
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
# Dynamically overwrite the default ordering |
66
|
5
|
|
|
|
|
11
|
for ( my $i = 0 ; $i < @{$header_row} ; $i++ ) { |
|
90
|
|
|
|
|
133
|
|
67
|
85
|
|
|
|
|
131
|
for my $col_name (%columns_of_interest_mappings) { |
68
|
840
|
100
|
|
|
|
1064
|
if ( $header_row->[$i] eq $col_name ) { |
69
|
30
|
|
|
|
|
33
|
$columns_of_interest_mappings{$col_name} = $i; |
70
|
30
|
|
|
|
|
46
|
last; |
71
|
|
|
|
|
|
|
} |
72
|
|
|
|
|
|
|
} |
73
|
|
|
|
|
|
|
} |
74
|
5
|
|
|
|
|
145
|
$self->_gene_column( $columns_of_interest_mappings{'Gene'} ); |
75
|
5
|
|
|
|
|
130
|
$self->_num_isolates_column( $columns_of_interest_mappings{'No. isolates'} ); |
76
|
5
|
|
|
|
|
134
|
$self->_avg_sequences_per_isolate_column( $columns_of_interest_mappings{'Avg sequences per isolate'} ); |
77
|
5
|
|
|
|
|
126
|
$self->_genome_fragement_column( $columns_of_interest_mappings{'Genome Fragment'} ); |
78
|
5
|
|
|
|
|
139
|
$self->_order_within_fragement_column( $columns_of_interest_mappings{'Order within Fragment'} ); |
79
|
5
|
|
|
|
|
16
|
$self->_update_number_of_isolates($header_row); |
80
|
|
|
|
|
|
|
|
81
|
|
|
|
|
|
|
# Get the sample_names |
82
|
5
|
|
|
|
|
8
|
my @sample_names; |
83
|
5
|
|
|
|
|
16
|
for ( my $i = $self->_length_of_fixed_headers() ; $i < @{$header_row} ; $i++ ) { |
|
20
|
|
|
|
|
39
|
|
84
|
15
|
|
|
|
|
29
|
push( @sample_names, $header_row->[$i] ); |
85
|
|
|
|
|
|
|
} |
86
|
5
|
|
|
|
|
176
|
$self->sample_names( \@sample_names ); |
87
|
|
|
|
|
|
|
} |
88
|
|
|
|
|
|
|
|
89
|
|
|
|
|
|
|
sub _length_of_fixed_headers { |
90
|
388
|
|
|
388
|
|
482
|
my ($self) = @_; |
91
|
388
|
|
|
|
|
399
|
return @{ Bio::Roary::GroupStatistics->fixed_headers() }; |
|
388
|
|
|
|
|
901
|
|
92
|
|
|
|
|
|
|
} |
93
|
|
|
|
|
|
|
|
94
|
|
|
|
|
|
|
sub _populate_sample_to_gene_lookup_with_row { |
95
|
61
|
|
|
61
|
|
96
|
my ( $self, $row ) = @_; |
96
|
|
|
|
|
|
|
|
97
|
61
|
|
|
|
|
93
|
for ( my $i = $self->_length_of_fixed_headers() ; $i < @{$row} ; $i++ ) { |
|
384
|
|
|
|
|
619
|
|
98
|
323
|
100
|
66
|
|
|
826
|
if ( defined( $row->[$i] ) && $row->[$i] ne "" ) { |
99
|
322
|
|
|
|
|
6920
|
my $sample_name = $self->sample_names->[ $i - $self->_length_of_fixed_headers() ]; |
100
|
|
|
|
|
|
|
|
101
|
322
|
|
|
|
|
7246
|
$self->sample_names_to_genes->{$sample_name}->{ $row->[$i] } = 1; |
102
|
|
|
|
|
|
|
} |
103
|
|
|
|
|
|
|
} |
104
|
61
|
|
|
|
|
1417
|
return 1; |
105
|
|
|
|
|
|
|
} |
106
|
|
|
|
|
|
|
|
107
|
|
|
|
|
|
|
sub _ordered_core_genes { |
108
|
5
|
|
|
5
|
|
10
|
my ($self) = @_; |
109
|
5
|
|
|
|
|
8
|
my %ordered_genes; |
110
|
5
|
|
|
|
|
117
|
while ( my $row = $self->_csv_parser->getline( $self->_input_spreadsheet_fh ) ) { |
111
|
77
|
100
|
|
|
|
1998
|
next if ( @{$row} < 12 ); # no genes in group |
|
77
|
|
|
|
|
158
|
|
112
|
76
|
50
|
33
|
|
|
1730
|
next if ( !defined( $row->[ $self->_gene_column ] ) || $row->[ $self->_gene_column ] eq '' ); # no gene name |
113
|
|
|
|
|
|
|
next |
114
|
76
|
50
|
33
|
|
|
1851
|
if ( !defined( $row->[ $self->_avg_sequences_per_isolate_column ] ) || $row->[ $self->_avg_sequences_per_isolate_column ] eq '' ) |
115
|
|
|
|
|
|
|
; # no average |
116
|
|
|
|
|
|
|
next |
117
|
76
|
100
|
66
|
|
|
1743
|
if ( !defined( $row->[ $self->_genome_fragement_column ] ) || $row->[ $self->_genome_fragement_column ] eq '' ) |
118
|
|
|
|
|
|
|
; # fragment not defined |
119
|
|
|
|
|
|
|
|
120
|
|
|
|
|
|
|
# next if($self->_number_of_isolates != $row->[$self->_num_isolates_column]); # if gene is not in all isolates |
121
|
74
|
100
|
|
|
|
1648
|
next if ( $row->[ $self->_num_isolates_column ] < $self->_min_no_isolates_for_core ); |
122
|
|
|
|
|
|
|
|
123
|
64
|
100
|
|
|
|
1406
|
if ( $self->allow_paralogs ) { |
124
|
|
|
|
|
|
|
# should never happen |
125
|
5
|
100
|
|
|
|
116
|
next if ( $row->[ $self->_avg_sequences_per_isolate_column ] < 1 ); |
126
|
|
|
|
|
|
|
} |
127
|
|
|
|
|
|
|
else { |
128
|
59
|
100
|
|
|
|
1384
|
next if ( $row->[ $self->_avg_sequences_per_isolate_column ] != 1 ); |
129
|
|
|
|
|
|
|
} |
130
|
|
|
|
|
|
|
|
131
|
61
|
|
|
|
|
1297
|
$ordered_genes{ $row->[ $self->_genome_fragement_column ] }{ $row->[ $self->_order_within_fragement_column ] } = |
132
|
|
|
|
|
|
|
$row->[ $self->_gene_column ]; |
133
|
61
|
|
|
|
|
122
|
$self->_populate_sample_to_gene_lookup_with_row($row); |
134
|
|
|
|
|
|
|
} |
135
|
|
|
|
|
|
|
|
136
|
5
|
|
|
|
|
134
|
my @ordered_core_genes; |
137
|
5
|
|
|
|
|
27
|
for my $fragment_key ( sort { $a <=> $b } keys %ordered_genes ) { |
|
2
|
|
|
|
|
8
|
|
138
|
7
|
|
|
|
|
12
|
for my $order_within_fragement ( sort { $a <=> $b } keys %{ $ordered_genes{$fragment_key} } ) { |
|
232
|
|
|
|
|
229
|
|
|
7
|
|
|
|
|
37
|
|
139
|
61
|
|
|
|
|
97
|
push( @ordered_core_genes, $ordered_genes{$fragment_key}{$order_within_fragement} ); |
140
|
|
|
|
|
|
|
} |
141
|
|
|
|
|
|
|
} |
142
|
5
|
|
|
|
|
189
|
return \@ordered_core_genes; |
143
|
|
|
|
|
|
|
} |
144
|
|
|
|
|
|
|
|
145
|
|
|
|
|
|
|
sub _build_ordered_core_genes { |
146
|
5
|
|
|
5
|
|
13
|
my ($self) = @_; |
147
|
5
|
|
|
|
|
124
|
my $header_row = $self->_csv_parser->getline( $self->_input_spreadsheet_fh ); |
148
|
5
|
|
|
|
|
253
|
$self->_setup_column_mappings($header_row); |
149
|
|
|
|
|
|
|
|
150
|
5
|
|
|
|
|
18
|
return $self->_ordered_core_genes(); |
151
|
|
|
|
|
|
|
} |
152
|
|
|
|
|
|
|
|
153
|
4
|
|
|
4
|
|
8710
|
no Moose; |
|
4
|
|
|
|
|
8
|
|
|
4
|
|
|
|
|
21
|
|
154
|
|
|
|
|
|
|
__PACKAGE__->meta->make_immutable; |
155
|
|
|
|
|
|
|
|
156
|
|
|
|
|
|
|
1; |
157
|
|
|
|
|
|
|
|
158
|
|
|
|
|
|
|
__END__ |
159
|
|
|
|
|
|
|
|
160
|
|
|
|
|
|
|
=pod |
161
|
|
|
|
|
|
|
|
162
|
|
|
|
|
|
|
=encoding UTF-8 |
163
|
|
|
|
|
|
|
|
164
|
|
|
|
|
|
|
=head1 NAME |
165
|
|
|
|
|
|
|
|
166
|
|
|
|
|
|
|
Bio::Roary::ExtractCoreGenesFromSpreadsheet - Take in a spreadsheet produced by the pipeline and identify the core genes. |
167
|
|
|
|
|
|
|
|
168
|
|
|
|
|
|
|
=head1 VERSION |
169
|
|
|
|
|
|
|
|
170
|
|
|
|
|
|
|
version 3.11.0 |
171
|
|
|
|
|
|
|
|
172
|
|
|
|
|
|
|
=head1 SYNOPSIS |
173
|
|
|
|
|
|
|
|
174
|
|
|
|
|
|
|
Take in a spreadsheet produced by the pipeline and identify the core genes. |
175
|
|
|
|
|
|
|
use Bio::Roary::ExtractCoreGenesFromSpreadsheet; |
176
|
|
|
|
|
|
|
|
177
|
|
|
|
|
|
|
my $obj = Bio::Roary::ExtractCoreGenesFromSpreadsheet->new( |
178
|
|
|
|
|
|
|
spreadsheet => 'group_statistics.csv', |
179
|
|
|
|
|
|
|
); |
180
|
|
|
|
|
|
|
$obj->ordered_core_genes(); |
181
|
|
|
|
|
|
|
|
182
|
|
|
|
|
|
|
=head1 AUTHOR |
183
|
|
|
|
|
|
|
|
184
|
|
|
|
|
|
|
Andrew J. Page <ap13@sanger.ac.uk> |
185
|
|
|
|
|
|
|
|
186
|
|
|
|
|
|
|
=head1 COPYRIGHT AND LICENSE |
187
|
|
|
|
|
|
|
|
188
|
|
|
|
|
|
|
This software is Copyright (c) 2013 by Wellcome Trust Sanger Institute. |
189
|
|
|
|
|
|
|
|
190
|
|
|
|
|
|
|
This is free software, licensed under: |
191
|
|
|
|
|
|
|
|
192
|
|
|
|
|
|
|
The GNU General Public License, Version 3, June 2007 |
193
|
|
|
|
|
|
|
|
194
|
|
|
|
|
|
|
=cut |