line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
undef $VERSION; |
2
|
|
|
|
|
|
|
package Bio::Roary::CommandLine::RoaryPostAnalysis; |
3
|
|
|
|
|
|
|
$Bio::Roary::CommandLine::RoaryPostAnalysis::VERSION = '3.11.0'; |
4
|
|
|
|
|
|
|
# ABSTRACT: Perform the post analysis on the pan genome |
5
|
|
|
|
|
|
|
|
6
|
|
|
|
|
|
|
|
7
|
1
|
|
|
1
|
|
443002
|
use Moose; |
|
1
|
|
|
|
|
8
|
|
|
1
|
|
|
|
|
6
|
|
8
|
1
|
|
|
1
|
|
6144
|
use Getopt::Long qw(GetOptionsFromArray); |
|
1
|
|
|
|
|
7784
|
|
|
1
|
|
|
|
|
4
|
|
9
|
1
|
|
|
1
|
|
401
|
use Bio::Roary::PostAnalysis; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
43
|
|
10
|
1
|
|
|
1
|
|
556
|
use File::Find::Rule; |
|
1
|
|
|
|
|
6116
|
|
|
1
|
|
|
|
|
7
|
|
11
|
1
|
|
|
1
|
|
346
|
use Bio::Roary::External::GeneAlignmentFromNucleotides; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
44
|
|
12
|
1
|
|
|
1
|
|
8
|
use File::Path qw(remove_tree); |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
86
|
|
13
|
1
|
|
|
1
|
|
7
|
use Bio::Roary::External::Fasttree; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
804
|
|
14
|
|
|
|
|
|
|
extends 'Bio::Roary::CommandLine::Common';

# --- Construction-time inputs -------------------------------------------
# Raw command-line arguments (parsed in BUILD) and the calling script name.
has 'args'        => ( is => 'ro', isa => 'ArrayRef', required => 1 );
has 'script_name' => ( is => 'ro', isa => 'Str',      required => 1 );

# --- Help / error state -------------------------------------------------
# When `help` is true, or `_error_message` is defined, run() dies with the
# usage text instead of doing any work.
has 'help'           => ( is => 'rw', isa => 'Bool', default => 0 );
has '_error_message' => ( is => 'rw', isa => 'Str' );

# --- Input list files and output filenames ------------------------------
# `fasta_files` and `input_files` name files that are read line by line
# into lists of filenames by run().
has 'fasta_files' => ( is => 'rw', isa => 'Str', default => '_fasta_files' );
has 'input_files' => ( is => 'rw', isa => 'Str', default => '_gff_files' );
has 'output_filename' => (
    is      => 'rw',
    isa     => 'Str',
    default => 'clustered_proteins',
);
has 'output_pan_geneome_filename' => (
    is      => 'rw',
    isa     => 'Str',
    default => 'pan_genome.fa',
);
has 'output_statistics_filename' => (
    is      => 'rw',
    isa     => 'Str',
    default => 'gene_presence_absence.csv',
);

# When true, run() also performs the per-cluster alignment step.
has 'output_multifasta_files' => ( is => 'rw', isa => 'Bool', default => 0 );

has 'clusters_filename' => (
    is      => 'rw',
    isa     => 'Str',
    default => '_clustered.clstr',
);

# --- Execution settings -------------------------------------------------
has 'job_runner' => ( is => 'rw', isa => 'Str', default => 'Local' );
has 'cpus'       => ( is => 'rw', isa => 'Int', default => 1 );

# --- Behaviour switches (all off by default) ----------------------------
has [qw(dont_delete_files dont_create_rplots dont_split_groups verbose_stats)] => (
    is      => 'rw',
    isa     => 'Bool',
    default => 0,
);

# --- Numeric parameters -------------------------------------------------
has 'translation_table' => ( is => 'rw', isa => 'Int', default => 11 );
has 'group_limit'       => ( is => 'rw', isa => 'Num', default => 50000 );

# Stored as a fraction; BUILD divides the command-line value by 100.
has 'core_definition' => ( is => 'rw', isa => 'Num', default => 0.99 );

# --- Remaining switches (all off by default) ----------------------------
has [qw(verbose mafft allow_paralogs)] => (
    is      => 'rw',
    isa     => 'Bool',
    default => 0,
);
40
|
|
|
|
|
|
|
|
41
|
|
|
|
|
|
|
# Post-construction hook: parse the command-line switches held in `args` and
# copy every switch that was actually supplied onto the matching attribute.
# Switches that were not supplied leave the attribute at its declared default.
sub BUILD {
    my ($self) = @_;

    # One slot per attribute. GetOptionsFromArray only writes to a slot when
    # its switch is present, so untouched slots remain undef.
    my %cli;

    GetOptionsFromArray(
        $self->args,
        'o|output=s'                => \$cli{output_filename},
        'j|job_runner=s'            => \$cli{job_runner},
        'm|output_multifasta_files' => \$cli{output_multifasta_files},
        'p=s'                       => \$cli{output_pan_geneome_filename},
        's=s'                       => \$cli{output_statistics_filename},
        'c=s'                       => \$cli{clusters_filename},
        'f=s'                       => \$cli{fasta_files},
        'i=s'                       => \$cli{input_files},
        'a|dont_delete_files'       => \$cli{dont_delete_files},
        'b|dont_create_rplots'      => \$cli{dont_create_rplots},
        'd|dont_split_groups'       => \$cli{dont_split_groups},
        'e|verbose_stats'           => \$cli{verbose_stats},
        'z|processors=i'            => \$cli{cpus},
        't|translation_table=i'     => \$cli{translation_table},
        'g|group_limit=i'           => \$cli{group_limit},
        'cd|core_definition=f'      => \$cli{core_definition},
        'v|verbose'                 => \$cli{verbose},
        'n|mafft'                   => \$cli{mafft},
        'q|allow_paralogs'          => \$cli{allow_paralogs},
        'h|help'                    => \$cli{help},
    );

    # Attributes whose value is copied across unchanged, in assignment order.
    my @copied_first = qw(
      help
      job_runner
      fasta_files
      input_files
      output_filename
      output_pan_geneome_filename
      output_statistics_filename
      output_multifasta_files
      clusters_filename
      dont_delete_files
      dont_create_rplots
      dont_split_groups
      verbose_stats
      translation_table
      cpus
      group_limit
    );
    for my $attribute (@copied_first) {
        next unless defined $cli{$attribute};
        $self->$attribute( $cli{$attribute} );
    }

    # The command-line value is divided by 100 before it is stored.
    if ( defined $cli{core_definition} ) {
        $self->core_definition( $cli{core_definition} / 100 );
    }

    for my $attribute (qw(mafft allow_paralogs)) {
        next unless defined $cli{$attribute};
        $self->$attribute( $cli{$attribute} );
    }

    # Verbose mode additionally sets the logger level to 10000.
    if ( defined $cli{verbose} ) {
        $self->verbose( $cli{verbose} );
        $self->logger->level(10000);
    }
}
99
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
# Entry point: run the post analysis and, when multi-FASTA output was
# requested, align each cluster afterwards.
#
# Dies with the usage text when help was requested or an error message is set.
sub run {
    my ($self) = @_;

    die $self->usage_text if $self->help;

    my $error_message = $self->_error_message;
    if ( defined $error_message ) {
        print $error_message . "\n";
        die $self->usage_text;
    }

    # Both attributes name a file that lists one filename per line.
    my $gff_file_list     = $self->_read_file_into_array( $self->input_files );
    my $protein_file_list = $self->_read_file_into_array( $self->fasta_files );

    Bio::Roary::PostAnalysis->new(
        fasta_files                 => $protein_file_list,
        input_files                 => $gff_file_list,
        output_filename             => $self->output_filename,
        output_pan_geneome_filename => $self->output_pan_geneome_filename,
        output_statistics_filename  => $self->output_statistics_filename,
        output_multifasta_files     => $self->output_multifasta_files,
        clusters_filename           => $self->clusters_filename,
        dont_delete_files           => $self->dont_delete_files,
        dont_create_rplots          => $self->dont_create_rplots,
        dont_split_groups           => $self->dont_split_groups,
        verbose_stats               => $self->verbose_stats,
        group_limit                 => $self->group_limit,
        verbose                     => $self->verbose,
        cpus                        => $self->cpus,
        logger                      => $self->logger,
        core_definition             => $self->core_definition,
    )->run();

    # Remove intermediate artefacts unless the user asked to keep them.
    if ( $self->dont_delete_files == 0 ) {
        unlink('_inflated_unsplit_mcl_groups');
        remove_tree('split_groups');
    }

    if ( $self->output_multifasta_files == 1 ) {
        print "Aligning each cluster\n" if ( $self->verbose );

        # Only an explicitly requested and loadable LSF runner is honoured;
        # every other setting falls back to 'Parallel'.
        my $job_runner_to_use =
          ( $self->_is_lsf_job_runner_available && $self->job_runner eq "LSF" )
          ? $self->job_runner
          : 'Parallel';

        my $cluster_fasta_files = $self->_find_input_files;

        my $aligner = Bio::Roary::External::GeneAlignmentFromNucleotides->new(
            fasta_files       => $cluster_fasta_files,
            job_runner        => $job_runner_to_use,
            translation_table => $self->translation_table,
            core_definition   => $self->core_definition,
            cpus              => $self->cpus,
            verbose           => $self->verbose,
            mafft             => $self->mafft,
            allow_paralogs    => $self->allow_paralogs,
            dont_delete_files => $self->dont_delete_files,

            # NOTE(review): $#{...} is the last index (count - 1), not the
            # number of files - confirm this is what the consumer expects.
            num_input_files => $#{$gff_file_list},
        );
        $aligner->run();
    }
}
166
|
|
|
|
|
|
|
|
167
|
|
|
|
|
|
|
# Report whether the optional LSF job runner module can be loaded.
# Returns 1 when Bio::Roary::JobRunner::LSF loads successfully, 0 otherwise.
sub _is_lsf_job_runner_available
{
    my ($self) = @_;

    # A failed require makes the eval yield undef rather than propagating.
    my $loaded = eval { require Bio::Roary::JobRunner::LSF; 1 };

    return $loaded ? 1 : 0;
}
180
|
|
|
|
|
|
|
|
181
|
|
|
|
|
|
|
# Collect every regular file matching '*.fa' found under the
# 'pan_genome_sequences' directory.
# Returns an array reference of the matching paths.
sub _find_input_files
{
    my ($self) = @_;

    my $fasta_rule  = File::Find::Rule->file()->name('*.fa');
    my @fasta_paths = $fasta_rule->in('pan_genome_sequences');

    return \@fasta_paths;
}
189
|
|
|
|
|
|
|
|
190
|
|
|
|
|
|
|
# Read a text file and return its lines, with trailing newlines removed.
#
# Parameters:
#   $filename - path of the file to read
# Returns:
#   an array reference holding one element per line, in file order
# Dies:
#   when the file cannot be opened for reading
sub _read_file_into_array
{
    my ( $self, $filename ) = @_;

    # Three-argument open: the filename can never be interpreted as an open
    # mode, and a missing or unreadable file is reported rather than silently
    # producing an empty list.
    open( my $in_fh, '<', $filename )
      or die "Cannot open '$filename' for reading: $!\n";

    my @filenames;
    while ( my $line = <$in_fh> ) {
        chomp $line;
        push( @filenames, $line );
    }
    close($in_fh);

    return \@filenames;
}
203
|
|
|
|
|
|
|
|
204
|
|
|
|
|
|
|
# Return the command-line help text for this script.
#
# The -cd default is shown as a percentage (99) because the value supplied on
# the command line is divided by 100 before it is stored; the attribute
# default of 0.99 is the already-scaled fraction.
sub usage_text {
    my ($self) = @_;

    return <<USAGE;
Usage: pan_genome_post_analysis [options]
Perform the post analysis on the pan genome. This script is usually only called by another script.

Options: -a        dont delete intermediate files
         -b        dont create R plots
         -c STR    clusters filename [_clustered.clstr]
         -cd FLOAT percentage of isolates a gene must be in to be core [99]
         -d        dont split groups
         -e        add inference values to gene presence and absence spreadsheet
         -f STR    file of protein filenames [_fasta_files]
         -g INT    maximum number of clusters [50000]
         -i STR    file of GFF filenames [_gff_files]
         -j STR    job runner to use [Local]
         -m        core gene alignment with PRANK
         -n        fast core gene alignment with MAFFT instead of PRANK
         -o STR    clusters output filename [clustered_proteins]
         -p STR    output pan genome filename [pan_genome.fa]
         -q        allow paralogs in core alignment
         -s STR    output gene presence and absence filename [gene_presence_absence.csv]
         -t INT    translation table [11]
         -z INT    number of threads [1]
         -v        verbose output to STDOUT
         -h        this help message

For further info see: http://sanger-pathogens.github.io/Roary/
USAGE
}
234
|
|
|
|
|
|
|
|
235
|
|
|
|
|
|
|
__PACKAGE__->meta->make_immutable; |
236
|
1
|
|
|
1
|
|
7
|
no Moose; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
6
|
|
237
|
|
|
|
|
|
|
1; |
238
|
|
|
|
|
|
|
|
239
|
|
|
|
|
|
|
__END__ |
240
|
|
|
|
|
|
|
|
241
|
|
|
|
|
|
|
=pod |
242
|
|
|
|
|
|
|
|
243
|
|
|
|
|
|
|
=encoding UTF-8 |
244
|
|
|
|
|
|
|
|
245
|
|
|
|
|
|
|
=head1 NAME |
246
|
|
|
|
|
|
|
|
247
|
|
|
|
|
|
|
Bio::Roary::CommandLine::RoaryPostAnalysis - Perform the post analysis on the pan genome |
248
|
|
|
|
|
|
|
|
249
|
|
|
|
|
|
|
=head1 VERSION |
250
|
|
|
|
|
|
|
|
251
|
|
|
|
|
|
|
version 3.11.0 |
252
|
|
|
|
|
|
|
|
253
|
|
|
|
|
|
|
=head1 SYNOPSIS |
254
|
|
|
|
|
|
|
|
255
|
|
|
|
|
|
|
Perform the post analysis on the pan genome |
256
|
|
|
|
|
|
|
|
257
|
|
|
|
|
|
|
=head1 AUTHOR |
258
|
|
|
|
|
|
|
|
259
|
|
|
|
|
|
|
Andrew J. Page <ap13@sanger.ac.uk> |
260
|
|
|
|
|
|
|
|
261
|
|
|
|
|
|
|
=head1 COPYRIGHT AND LICENSE |
262
|
|
|
|
|
|
|
|
263
|
|
|
|
|
|
|
This software is Copyright (c) 2013 by Wellcome Trust Sanger Institute. |
264
|
|
|
|
|
|
|
|
265
|
|
|
|
|
|
|
This is free software, licensed under: |
266
|
|
|
|
|
|
|
|
267
|
|
|
|
|
|
|
The GNU General Public License, Version 3, June 2007 |
268
|
|
|
|
|
|
|
|
269
|
|
|
|
|
|
|
=cut |