line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Bio::Palantir::Parser; |
2
|
|
|
|
|
|
|
# ABSTRACT: front-end class for Bio::Palantir::Parser module, which handles the parsing of biosynML.xml and regions.js antiSMASH reports |
3
|
|
|
|
|
|
|
$Bio::Palantir::Parser::VERSION = '0.200700'; |
4
|
1
|
|
|
1
|
|
934
|
use Moose; |
|
1
|
|
|
|
|
420232
|
|
|
1
|
|
|
|
|
6
|
|
5
|
1
|
|
|
1
|
|
6652
|
use namespace::autoclean; |
|
1
|
|
|
|
|
6829
|
|
|
1
|
|
|
|
|
4
|
|
6
|
|
|
|
|
|
|
|
7
|
1
|
|
|
1
|
|
65
|
use autodie; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
7
|
|
8
|
|
|
|
|
|
|
|
9
|
1
|
|
|
1
|
|
4450
|
use Carp; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
64
|
|
10
|
1
|
|
|
1
|
|
7
|
use File::Basename 'fileparse'; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
65
|
|
11
|
1
|
|
|
1
|
|
6
|
use File::Temp; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
67
|
|
12
|
1
|
|
|
1
|
|
472
|
use JSON::Parse 'json_file_to_perl'; |
|
1
|
|
|
|
|
858
|
|
|
1
|
|
|
|
|
58
|
|
13
|
1
|
|
|
1
|
|
6
|
use POSIX 'ceil'; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
7
|
|
14
|
1
|
|
|
1
|
|
2369
|
use XML::Bare; |
|
1
|
|
|
|
|
7323
|
|
|
1
|
|
|
|
|
40
|
|
15
|
1
|
|
|
1
|
|
749
|
use XML::Hash::XS; |
|
1
|
|
|
|
|
1327
|
|
|
1
|
|
|
|
|
49
|
|
16
|
|
|
|
|
|
|
|
17
|
1
|
|
|
1
|
|
440
|
use aliased 'Bio::Palantir::Parser::Root'; |
|
1
|
|
|
|
|
635
|
|
|
1
|
|
|
|
|
6
|
|
18
|
|
|
|
|
|
|
extends 'Bio::FastParsers::Base'; |
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
# ATTRIBUTES |
22
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
# Parsed report tree. It cannot be supplied to the constructor
# (init_arg => undef); it is built by _build_root on first access.
has root => (
    is       => 'ro',
    isa      => 'Bio::Palantir::Parser::Root',
    lazy     => 1,
    builder  => '_build_root',
    init_arg => undef,
);

# Name of the module delineation method; _build_root hands it over
# unchanged to the Root object it creates.
has module_delineation => (
    is      => 'ro',
    isa     => 'Str',
    default => 'substrate-selection',
);
39
|
|
|
|
|
|
|
|
40
|
|
|
|
|
|
|
## no critic (ProhibitUnusedPrivateSubroutines) |
41
|
|
|
|
|
|
|
|
42
|
|
|
|
|
|
|
# Builder for the lazy 'root' attribute.
#
# Parses the report designated by $self->file with XML::Bare and wraps the
# content of its <root> element in a Bio::Palantir::Parser::Root object.
# A file whose name ends in '.js' is first converted to biosynML by
# _convert_js2biosynml and written to a temporary file, which is then parsed
# in place of the original report.
#
# Returns: the Root object, or nothing (after a warning) when the parsed
#          document has no 'root' entry.
# Dies:    when XML::Bare cannot be instantiated on the file; I/O failures
#          on the temporary file are expected to be raised by autodie.
sub _build_root {
    my $self = shift;

    my $report = $self->file;
    my $ext    = ( fileparse($report, qw(.xml .js)) )[2];

    # Declared at sub scope: the File::Temp object must stay alive until
    # the parse below is done, as the file is removed once it is destroyed.
    my $biosynml;
    my $file = $report;

    # Anything that is not a biosynML file is read from a temporary file
    # (which stays empty unless the report is a regions.js file).
    if ($ext ne '.xml') {
        $biosynml = File::Temp->new(SUFFIX => '.xml');

        if ($ext eq '.js') {
            my $xmlstr = $self->_convert_js2biosynml;

            open my $out, '>', $biosynml->filename;
            # print + explicit newline: works whether or not the 'say'
            # feature is enabled in this file
            print {$out} $xmlstr, "\n";
            # explicit close so that a failed flush is reported before the
            # file is read back
            close $out;
        }

        $file = $biosynml->filename;
    }

    my $xb = XML::Bare->new( file => $file )
        or croak "Can't open '$file' for reading: $!";

    my $root = $xb->parse->{'root'};
    unless ($root) {
        carp "Warning: '$file' unexpectedly empty; returning no root!";
        return;
    }

    return Root->new(
        _root              => $root,
        module_delineation => $self->module_delineation,
    );
}
70
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
## use critic |
72
|
|
|
|
|
|
|
|
73
|
|
|
|
|
|
|
# Convert a regions.js antiSMASH report ($self->file) into a biosynML-like
# XML string.
#
# The conversion runs in four steps:
#   1. rewrite the JavaScript file line by line into a temporary JSON file;
#   2. read clusters and genes from the 'all_regions' part;
#   3. read domains and (when present) modules from the 'details_data' part;
#   4. rearrange everything into the biosynML layout and serialise it with
#      XML::Hash::XS.
#
# Returns: the XML document as a string.
# Dies:    I/O failures are expected to be raised by autodie.
sub _convert_js2biosynml {
    my $self = shift;

    my $js   = $self->file;
    my $json = File::Temp->new(SUFFIX => '.json');

    open my $in, '<', $js;
    chomp( my @lines = <$in> );
    close $in;

    # --- 1. JavaScript -> JSON -------------------------------------------
    # The rewriting is purely positional/textual: the first line and the
    # last two lines are replaced, the two variable declarations become
    # JSON keys, and statement terminators become commas.
    # print + explicit newline is used instead of say so that the code works
    # whether or not the 'say' feature is enabled in this file.
    open my $out, '>', $json->filename;
    for my $i (0 .. $#lines) {

        if ($i == 0) {
            print {$out} '{' . "\n" . ' "recordData": [' . "\n";
        }

        elsif ($lines[$i] =~ m/all_regions/xms) {
            print {$out} ' "all_regions": {' . "\n";
        }

        elsif ($lines[$i] =~ m/details_data/xms) {
            print {$out} ' "details_data": {' . "\n";
        }

        elsif ($i == $#lines - 1) {
            print {$out} ' }' . "\n";
        }

        elsif ($i == $#lines) {
            print {$out} ' }' . "\n";
        }

        else {
            $lines[$i] =~ s/\]\;/\]\,/xms;
            $lines[$i] =~ s/\}\;/\}\,/xms;
            print {$out} ' ' . $lines[$i] . "\n";
        }
    }

    print {$out} '}';
    close $out;

    my $root = json_file_to_perl($json->filename);

    my %json_for;
    my ($cluster_id, $gene_id) = (1,1);

    # --- 2. clusters and genes (first part of the report) ----------------
    my $region_for = $root->{all_regions};
    for my $region (@{ $region_for->{order} }) {

        my %cluster_for = (
            id    => $cluster_id++,
            name  => $region_for->{$region}{anchor},
            rank  => $region_for->{$region}{idx},
            type  => $region_for->{$region}{type},
            start => $region_for->{$region}{start},    # DNA coordinates
            end   => $region_for->{$region}{end},
        );

        $json_for{ $cluster_for{name} }{$_} = $cluster_for{$_}
            for keys %cluster_for;

        my $orfs = $region_for->{$region}{orfs};

        for my $orf (@{ $orfs }) {

            # the protein sequence is recovered from the BLAST link
            # embedded in the ORF description
            my $def = $orf->{description};
            my ($sequence)
                = $def =~ m/PROGRAMS=blastp&QUERY=([A-Z]+)\&/xms;

            my %orf_for = (
                id       => $gene_id++,
                name     => $orf->{locus_tag},
                start    => $orf->{start},              # DNA coordinates
                end      => $orf->{end},
                type     => $orf->{type},
                sequence => $sequence,
                strand   => $orf->{strand},
            );

            $json_for{ $cluster_for{name} }{genes}{ $orf_for{name} }{$_}
                = $orf_for{$_} for keys %orf_for;
        }
    }

    # --- 3. domains and modules (second part of the report) --------------
    my $domain_id = 1;
    my $module_id = 1;
    $region_for = $root->{details_data};    # reassigning the region_for var
    for my $region (keys %{ $region_for }) {

        my $cluster_name = $region_for->{$region}{id};
        my $orfs         = $region_for->{$region}{orfs};

        my $prev_domain;
        for my $orf (@{ $orfs }) {

            my $gene_name = $orf->{id};
            for my $domain (@{ $orf->{domains} }) {

                # fix duplicate domains in v5 (2019.11.02)
                if ($domain_id > 1 && $prev_domain
                    && $domain->{start}    eq $prev_domain->{start}
                    && $domain->{sequence} eq $prev_domain->{sequence}
                ) {
                    next;
                }

                # NOTE(review): the hash key and the 'id' field both derive
                # from $domain_id within one statement; the key is only
                # iterated over below, the 'id' field is what ends up in
                # the XML.
                $json_for{$cluster_name}{genes}{$gene_name}{domains}{$domain_id}
                    = {
                        id           => $domain_id++,
                        gene_id      => $json_for{$cluster_name}{genes}{$gene_name}{id},
                        prot_start   => $domain->{start},
                        prot_end     => $domain->{end},
                        type         => $domain->{type},
                        sequence     => $domain->{sequence},
                        dna_start    => $domain->{start} == 1
                                      ? 1 : $domain->{start} * 3,
                        dna_end      => $domain->{end} * 3,
                        abbreviation => $domain->{abbreviation},    # = symbol
                    };

                $prev_domain = $domain;
            }

            if ($orf->{modules}) {    # appeared in antiSMASH version 5.1

                for my $module (@{ $orf->{modules} }) {

                    my $gene_dna_start
                        = $json_for{$cluster_name}{genes}{$gene_name}{start};
                    my $gene_prot_start = ceil(($gene_dna_start / 3));

                    # "- 1": if the module starting pos is 1, this
                    # shouldn't be position 2 on the gene coords
                    my $genomic_prot_start
                        = $gene_prot_start + $module->{start} - 1;
                    my $genomic_prot_end
                        = $gene_prot_start + $module->{end} - 1;

                    # if prot pos is 1, it should still be 1 in DNA
                    my $dna_start = $module->{start} == 1
                        ? 1
                        : $module->{start} * 3
                    ;

                    my $genomic_dna_start = $gene_dna_start
                        + $dna_start - 1;
                    my $genomic_dna_end   = $gene_dna_start
                        + ($module->{end} * 3) - 1;

                    $json_for{$cluster_name}{modules}{
                        'module' . $module_id} = {
                            id         => $module_id++,
                            gene_id    => $gene_name,
                            rel_start  => $module->{start},    # relative to gene coordinates
                            rel_end    => $module->{end},
                            prot_start => $genomic_prot_start,
                            prot_end   => $genomic_prot_end,
                            dna_start  => $genomic_dna_start,
                            dna_end    => $genomic_dna_end,
                            complete   => $module->{complete} == 1
                                        ? 'true' : 'false',
                            iterative  => $module->{iterative},
                            monomer    => $module->{monomer},
                            # domains  => join ',', @module_domains // 'NULL',
                        };
                }
            }
        }
    }

    # ### %json_for

    # --- 4. writing biosynML format --------------------------------------
    # Keys such as 'model id="1"' carry an XML attribute inside the element
    # name; the matching closing tags are cleaned up after serialisation.
    my %biosynml_for;
    for my $cluster (keys %json_for) {

        my ($c_id, $c_name, $c_begin, $c_end, $c_type)
            = map { $json_for{$cluster}{$_} } qw(id name start end type);

        my $model_id = 'model id="' . $c_id . '"';

        $biosynml_for{$model_id}{genecluster}{name} = $c_name;
        $biosynml_for{$model_id}{genecluster}{type} = $c_type;
        $biosynml_for{$model_id}{genecluster}{region}{begin} = $c_begin;    # DNA coordinates
        $biosynml_for{$model_id}{genecluster}{region}{end}   = $c_end;

        for my $gene (keys %{ $json_for{$cluster}{genes} }) {

            my ($g_id, $g_name, $g_begin, $g_end, $g_sequence)
                = map { $json_for{$cluster}{genes}{$gene}{$_} }
                    qw(id name start end sequence)
            ;

            my $attr_gene_id = 'gene id="' . $g_id . '"';
            $biosynml_for{genelist}{$attr_gene_id}{gene_name} = $g_name;
            $biosynml_for{genelist}{$attr_gene_id}{gene_location}{begin}
                = $g_begin;

            $biosynml_for{genelist}{$attr_gene_id}{gene_location}{end}
                = $g_end;

            $biosynml_for{genelist}{$attr_gene_id}{gene_qualifiers}{'qualifier'
                . ' name="translation" ori="auto-annotation" style="genbank"'}
                = $g_sequence
            ;

            for my $domain (keys %{ $json_for{$cluster}{genes}{$gene}{domains}
                }) {

                # 'dna_start' is the key under which the DNA start
                # coordinate was stored in step 3
                my ($d_id, $dgene_id, $d_pbegin, $d_pend, $d_dbegin,
                    $d_dend, $d_type, $d_sequence)
                    = map { $json_for{$cluster}{genes}{$gene}{domains}{
                        $domain}{$_} }
                        qw(id gene_id prot_start prot_end
                            dna_start dna_end type sequence)
                ;

                my $attr_domain_id = 'domain id="' . $d_id .'"';

                $biosynml_for{domainlist}{$attr_domain_id}
                    = {
                        nodeid   => $d_id,
                        function => $d_type,
                        location => {
                            gene => {
                                'geneid source ="genelist"' => $dgene_id,
                                position => { begin => $d_dbegin, end => $d_dend, },
                            },
                            protein => {
                                sequence => $d_sequence,
                                position => { begin => $d_pbegin, end => $d_pend, },
                            },
                        },
                    };
            }
        }

        if ($json_for{$cluster}{modules}) {

            my $module_for = $json_for{$cluster}{modules};

            for my $module (keys %{ $module_for }) {

                my $attr_module_id = 'module id="'
                    . $module_for->{$module}{id} .'"';

                $biosynml_for{modulelist}{$attr_module_id}{$_}
                    = $module_for->{$module}{$_}
                    for keys %{ $module_for->{$module} }
                ;
            }
        }
    }

    # ### %biosynml_for

    # write XML file
    my $conv = XML::Hash::XS->new(utf8 => 0, encoding => 'utf-8', indent => 4);
    my $xmlstr = $conv->hash2xml(\%biosynml_for, utf8 => 1);

    # correct artificial attributes: strip everything following the element
    # name in closing tags (e.g. </model id="1"> becomes </model>)
    $xmlstr =~ s/(<\/[a-z\_]+).*?>/$1>/xmsg;

    return($xmlstr);
}
342
|
|
|
|
|
|
|
|
343
|
|
|
|
|
|
|
# Validate user-supplied cluster types (the --types option, judging by the
# error message).
#
# Arguments: the invocant, followed by any number of type strings. Every
#            string is checked, not only the first one.
# A type is accepted when at least one allowed type name matches somewhere
# inside it, case-insensitively.
#
# Returns: 1 when all types are accepted (an empty list is trivially valid).
# Dies:    croaks on the first type that matches no allowed name.
sub is_cluster_type_ok {
    my ($self, @filter_types) = @_;

    my @allowed_types = qw(
        acyl_amino_acids amglyccycl arylpolyene bacteriocin butyrolactone
        cyanobactin ectoine hserlactone hglE-KS indole ladderane lantipeptide
        lassopeptide microviridin nrps nucleoside oligosaccharide otherks
        phenazine phosphonate PKS proteusin PUFA resorcinol siderophore t1pks
        t2pks t3pks terpene
    );

    for my $type (@filter_types) {

        unless (grep { $type =~ m/$_/xmsi } @allowed_types) {

            croak 'Error: value "' . $type . '" from --types option is '
                . 'incorrect. Please look allowed values with --help option';
        }
    }

    return(1);
}
368
|
|
|
|
|
|
|
|
369
|
|
|
|
|
|
|
|
370
|
|
|
|
|
|
|
__PACKAGE__->meta->make_immutable; |
371
|
|
|
|
|
|
|
1; |
372
|
|
|
|
|
|
|
|
373
|
|
|
|
|
|
|
__END__ |
374
|
|
|
|
|
|
|
|
375
|
|
|
|
|
|
|
=pod |
376
|
|
|
|
|
|
|
|
377
|
|
|
|
|
|
|
=head1 NAME |
378
|
|
|
|
|
|
|
|
379
|
|
|
|
|
|
|
Bio::Palantir::Parser - front-end class for Bio::Palantir::Parser module, which handles the parsing of biosynML.xml and regions.js antiSMASH reports |
380
|
|
|
|
|
|
|
|
381
|
|
|
|
|
|
|
=head1 VERSION |
382
|
|
|
|
|
|
|
|
383
|
|
|
|
|
|
|
version 0.200700 |
384
|
|
|
|
|
|
|
|
385
|
|
|
|
|
|
|
=head1 SYNOPSIS |
386
|
|
|
|
|
|
|
|
387
|
|
|
|
|
|
|
#TODO |
388
|
|
|
|
|
|
|
|
389
|
|
|
|
|
|
|
=head1 DESCRIPTION |
390
|
|
|
|
|
|
|
|
391
|
|
|
|
|
|
|
This module implements classes and their methods for B<parsing antiSMASH |
392
|
|
|
|
|
|
|
reports>. The supported report formats are the F<biosynML.xml> file generated in |
393
|
|
|
|
|
|
|
antiSMASH v3-4 (though the version 4 needs to be explicitly activated in the |
394
|
|
|
|
|
|
|
options) or the F<regions.js> in the version 5. |
395
|
|
|
|
|
|
|
|
396
|
|
|
|
|
|
|
The Biosynthetic Gene Cluster (BGC) information is hierarchically organized as |
397
|
|
|
|
|
|
|
follows: |
398
|
|
|
|
|
|
|
|
399
|
|
|
|
|
|
|
C<Root.pm>: contains the root of the BGC data structure |
400
|
|
|
|
|
|
|
|
401
|
|
|
|
|
|
|
C<Cluster.pm>: contains attributes and methods for the BGC B<Cluster> level, |
402
|
|
|
|
|
|
|
including an array of Gene objects |
403
|
|
|
|
|
|
|
|
404
|
|
|
|
|
|
|
C<Gene.pm>: contains attributes and methods for the BGC B<Gene> level, |
405
|
|
|
|
|
|
|
including an array of Domain objects (if NRPS/PKS BGCs) |
406
|
|
|
|
|
|
|
|
407
|
|
|
|
|
|
|
C<Module.pm>: contains attributes and methods for the BGC B<Module> level |
408
|
|
|
|
|
|
|
(generated by Palantir), including an array of Domain objects (this class is |
409
|
|
|
|
|
|
|
parallel to Genes, as a module can overlap 2 genes) |
410
|
|
|
|
|
|
|
|
411
|
|
|
|
|
|
|
C<Domain.pm>: contains attributes and methods for the BGC B<Domain> level, |
412
|
|
|
|
|
|
|
including an array of Motif objects |
413
|
|
|
|
|
|
|
|
414
|
|
|
|
|
|
|
C<Motif.pm>: contains attributes and methods for the BGC B<Motif> level |
415
|
|
|
|
|
|
|
|
416
|
|
|
|
|
|
|
=head1 ATTRIBUTES |
417
|
|
|
|
|
|
|
|
418
|
|
|
|
|
|
|
=head2 file |
419
|
|
|
|
|
|
|
|
420
|
|
|
|
|
|
|
Path to biosynML.xml or regions.js antiSMASH report file to be parsed. |
421
|
|
|
|
|
|
|
|
422
|
|
|
|
|
|
|
=head2 root |
423
|
|
|
|
|
|
|
|
424
|
|
|
|
|
|
|
C<Bio::Palantir::Parser::Root> composed object |
425
|
|
|
|
|
|
|
|
426
|
|
|
|
|
|
|
=head2 file |
427
|
|
|
|
|
|
|
|
428
|
|
|
|
|
|
|
Path to a biosynML.xml or regions.js file |
429
|
|
|
|
|
|
|
|
430
|
|
|
|
|
|
|
=head2 root |
431
|
|
|
|
|
|
|
|
432
|
|
|
|
|
|
|
L<Bio::Palantir::Parser::Root> composed object |
433
|
|
|
|
|
|
|
|
434
|
|
|
|
|
|
|
=head2 module_delineation |
435
|
|
|
|
|
|
|
|
436
|
|
|
|
|
|
|
Module delineation method: generates modules from condensation or selection domains. |
437
|
|
|
|
|
|
|
|
438
|
|
|
|
|
|
|
=head1 AUTHOR |
439
|
|
|
|
|
|
|
|
440
|
|
|
|
|
|
|
Loic MEUNIER <lmeunier@uliege.be> |
441
|
|
|
|
|
|
|
|
442
|
|
|
|
|
|
|
=head1 COPYRIGHT AND LICENSE |
443
|
|
|
|
|
|
|
|
444
|
|
|
|
|
|
|
This software is copyright (c) 2019 by University of Liege / Unit of Eukaryotic Phylogenomics / Loic MEUNIER and Denis BAURAIN. |
445
|
|
|
|
|
|
|
|
446
|
|
|
|
|
|
|
This is free software; you can redistribute it and/or modify it under |
447
|
|
|
|
|
|
|
the same terms as the Perl 5 programming language system itself. |
448
|
|
|
|
|
|
|
|
449
|
|
|
|
|
|
|
=cut |