line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Bio::Palantir::Parser; |
2
|
|
|
|
|
|
|
# ABSTRACT: front-end class for Bio::Palantir::Parser module, wich handles the parsing of biosynML.xml and regions.js antiSMASH reports |
3
|
|
|
|
|
|
|
$Bio::Palantir::Parser::VERSION = '0.211420'; |
4
|
1
|
|
|
1
|
|
649
|
use Moose; |
|
1
|
|
|
|
|
503791
|
|
|
1
|
|
|
|
|
5
|
|
5
|
1
|
|
|
1
|
|
8582
|
use namespace::autoclean; |
|
1
|
|
|
|
|
9059
|
|
|
1
|
|
|
|
|
5
|
|
6
|
|
|
|
|
|
|
|
7
|
1
|
|
|
1
|
|
483
|
use autodie; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
10
|
|
8
|
|
|
|
|
|
|
|
9
|
1
|
|
|
1
|
|
5737
|
use Carp; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
66
|
|
10
|
1
|
|
|
1
|
|
7
|
use File::Basename 'fileparse'; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
116
|
|
11
|
1
|
|
|
1
|
|
8
|
use File::Temp; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
97
|
|
12
|
1
|
|
|
1
|
|
583
|
use JSON::Parse 'json_file_to_perl'; |
|
1
|
|
|
|
|
1366
|
|
|
1
|
|
|
|
|
59
|
|
13
|
1
|
|
|
1
|
|
8
|
use POSIX 'ceil'; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
17
|
|
14
|
1
|
|
|
1
|
|
3684
|
use XML::Bare; |
|
1
|
|
|
|
|
10300
|
|
|
1
|
|
|
|
|
83
|
|
15
|
1
|
|
|
1
|
|
744
|
use XML::Hash::XS; |
|
1
|
|
|
|
|
1249
|
|
|
1
|
|
|
|
|
66
|
|
16
|
|
|
|
|
|
|
|
17
|
1
|
|
|
1
|
|
574
|
use aliased 'Bio::Palantir::Parser::Root'; |
|
1
|
|
|
|
|
921
|
|
|
1
|
|
|
|
|
7
|
|
18
|
|
|
|
|
|
|
extends 'Bio::FastParsers::Base'; |
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
# ATTRIBUTES |
22
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
has 'root' => ( |
26
|
|
|
|
|
|
|
is => 'ro', |
27
|
|
|
|
|
|
|
isa => 'Bio::Palantir::Parser::Root', |
28
|
|
|
|
|
|
|
init_arg => undef, |
29
|
|
|
|
|
|
|
lazy => 1, |
30
|
|
|
|
|
|
|
builder => '_build_root', |
31
|
|
|
|
|
|
|
); |
32
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
has 'module_delineation' => ( |
35
|
|
|
|
|
|
|
is => 'ro', |
36
|
|
|
|
|
|
|
isa => 'Str', |
37
|
|
|
|
|
|
|
default => 'substrate-selection', |
38
|
|
|
|
|
|
|
); |
39
|
|
|
|
|
|
|
|
40
|
|
|
|
|
|
|
## no critic (ProhibitUnusedPrivateSubroutines) |
41
|
|
|
|
|
|
|
|
42
|
|
|
|
|
|
|
sub _build_root { |
43
|
1
|
|
|
1
|
|
3
|
my $self = shift; |
44
|
|
|
|
|
|
|
|
45
|
1
|
|
|
|
|
4
|
my @exts = qw(.xml .js); |
46
|
1
|
|
|
|
|
35
|
my ($name, $dir, $ext) = fileparse($self->file, @exts); |
47
|
|
|
|
|
|
|
|
48
|
1
|
|
|
|
|
156
|
my $biosynml = File::Temp->new(suffix => '.xml'); |
49
|
|
|
|
|
|
|
|
50
|
1
|
50
|
|
|
|
791
|
if ($ext eq '.js') { |
51
|
0
|
|
|
|
|
0
|
my $xmlstr = $self->_convert_js2biosynml; |
52
|
0
|
|
|
|
|
0
|
open my $out, '>', $biosynml->filename; |
53
|
0
|
|
|
|
|
0
|
say {$out} $xmlstr; |
|
0
|
|
|
|
|
0
|
|
54
|
|
|
|
|
|
|
} |
55
|
|
|
|
|
|
|
|
56
|
1
|
50
|
|
|
|
40
|
my $file = $ext eq '.xml' ? $self->file : $biosynml->filename; |
57
|
|
|
|
|
|
|
|
58
|
1
|
50
|
|
|
|
15
|
my $xb = XML::Bare->new( file => $file ) |
59
|
|
|
|
|
|
|
or croak "Can't open '$file' for reading: $!"; |
60
|
|
|
|
|
|
|
|
61
|
1
|
|
|
|
|
3976
|
my $root = $xb->parse->{'root'}; |
62
|
1
|
50
|
|
|
|
4538
|
unless ($root) { |
63
|
0
|
|
|
|
|
0
|
carp "Warning: '$file' unexpectedly empty; returning no root!"; |
64
|
0
|
|
|
|
|
0
|
return; |
65
|
|
|
|
|
|
|
} |
66
|
|
|
|
|
|
|
|
67
|
1
|
|
|
|
|
50
|
return Root->new( _root => $root, |
68
|
|
|
|
|
|
|
module_delineation => $self->module_delineation ); |
69
|
|
|
|
|
|
|
} |
70
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
## use critic |
72
|
|
|
|
|
|
|
|
73
|
|
|
|
|
|
|
sub _convert_js2biosynml { |
74
|
0
|
|
|
0
|
|
|
my $self = shift; |
75
|
|
|
|
|
|
|
|
76
|
0
|
|
|
|
|
|
my $js = $self->file; |
77
|
0
|
|
|
|
|
|
my $json = File::Temp->new(suffix => '.json'); |
78
|
|
|
|
|
|
|
|
79
|
0
|
|
|
|
|
|
open my $in, '<', $js; |
80
|
0
|
|
|
|
|
|
chomp( my @lines = <$in> ); |
81
|
|
|
|
|
|
|
|
82
|
0
|
|
|
|
|
|
open my $out, '>', $json->filename; |
83
|
0
|
|
|
|
|
|
for my $i (0 .. @lines - 1) { |
84
|
|
|
|
|
|
|
|
85
|
0
|
0
|
|
|
|
|
if ($i == 0) { |
86
|
0
|
|
|
|
|
|
say {$out} '{'; |
|
0
|
|
|
|
|
|
|
87
|
|
|
|
|
|
|
} |
88
|
|
|
|
|
|
|
|
89
|
0
|
0
|
|
|
|
|
if (substr($lines[$i], 0, 3) eq 'var') { |
|
|
0
|
|
|
|
|
|
90
|
|
|
|
|
|
|
|
91
|
|
|
|
|
|
|
# extract the block name and its sigil type |
92
|
|
|
|
|
|
|
# type may contains two characters if the var is empty |
93
|
0
|
|
|
|
|
|
my ($key, $sigil) = $lines[$i] |
94
|
|
|
|
|
|
|
=~ m/var \s ([A-Za-z\_]+) \s=\s ([\[\]{}]+);?$/xms; |
95
|
|
|
|
|
|
|
|
96
|
0
|
|
|
|
|
|
say {$out} " \"$key\": $sigil"; |
|
0
|
|
|
|
|
|
|
97
|
|
|
|
|
|
|
} |
98
|
|
|
|
|
|
|
|
99
|
|
|
|
|
|
|
elsif ($i == @lines - 1) { |
100
|
|
|
|
|
|
|
# do not convert the ';' in a ',' |
101
|
0
|
|
|
|
|
|
say {$out} ' }'; |
|
0
|
|
|
|
|
|
|
102
|
|
|
|
|
|
|
} |
103
|
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
else { |
105
|
0
|
|
|
|
|
|
$lines[$i] =~ s/\]\;/\]\,/xms; |
106
|
0
|
|
|
|
|
|
$lines[$i] =~ s/\}\;/\}\,/xms; |
107
|
0
|
|
|
|
|
|
say {$out} ' ' . $lines[$i]; |
|
0
|
|
|
|
|
|
|
108
|
|
|
|
|
|
|
} |
109
|
|
|
|
|
|
|
} |
110
|
|
|
|
|
|
|
|
111
|
0
|
|
|
|
|
|
print {$out} '}'; |
|
0
|
|
|
|
|
|
|
112
|
0
|
|
|
|
|
|
close $out; |
113
|
|
|
|
|
|
|
|
114
|
0
|
|
|
|
|
|
open my $in2, '<', $json->filename; |
115
|
0
|
|
|
|
|
|
while (my $line = <$in>){ |
116
|
0
|
|
|
|
|
|
print $line; |
117
|
|
|
|
|
|
|
} |
118
|
|
|
|
|
|
|
|
119
|
0
|
|
|
|
|
|
my $root = json_file_to_perl($json->filename); |
120
|
|
|
|
|
|
|
|
121
|
0
|
|
|
|
|
|
my %json_for; |
122
|
0
|
|
|
|
|
|
my ($cluster_id, $gene_id) = (1,1); |
123
|
|
|
|
|
|
|
|
124
|
|
|
|
|
|
|
# parse the first part of the report |
125
|
0
|
|
|
|
|
|
my $region_for = $root->{all_regions}; |
126
|
0
|
|
|
|
|
|
for my $region (@{ $region_for->{order} }) { |
|
0
|
|
|
|
|
|
|
127
|
|
|
|
|
|
|
|
128
|
|
|
|
|
|
|
my %cluster_for = ( |
129
|
|
|
|
|
|
|
id => $cluster_id++, |
130
|
|
|
|
|
|
|
name => $region_for->{$region}{anchor}, |
131
|
|
|
|
|
|
|
rank => $region_for->{$region}{idx}, |
132
|
|
|
|
|
|
|
type => $region_for->{$region}{type}, |
133
|
|
|
|
|
|
|
start => $region_for->{$region}{start}, # DNA coordinates |
134
|
|
|
|
|
|
|
end => $region_for->{$region}{end}, |
135
|
0
|
|
|
|
|
|
); |
136
|
|
|
|
|
|
|
|
137
|
|
|
|
|
|
|
$json_for{ $cluster_for{name} }{$_} = $cluster_for{$_} |
138
|
0
|
|
|
|
|
|
for keys %cluster_for; |
139
|
|
|
|
|
|
|
|
140
|
0
|
|
|
|
|
|
my $orfs = $region_for->{$region}{orfs}; |
141
|
|
|
|
|
|
|
|
142
|
0
|
|
|
|
|
|
for my $orf (@{ $orfs }) { |
|
0
|
|
|
|
|
|
|
143
|
|
|
|
|
|
|
|
144
|
0
|
|
|
|
|
|
my $def = $orf->{description}; |
145
|
0
|
|
|
|
|
|
my ($sequence) |
146
|
|
|
|
|
|
|
= $def =~ m/PROGRAMS=blastp&QUERY=([A-Z]+)\&/xms; |
147
|
|
|
|
|
|
|
|
148
|
|
|
|
|
|
|
my %orf_for = ( |
149
|
|
|
|
|
|
|
id => $gene_id++, |
150
|
|
|
|
|
|
|
name => $orf->{locus_tag}, |
151
|
|
|
|
|
|
|
start => $orf->{start}, # DNA coordinates |
152
|
|
|
|
|
|
|
end => $orf->{end}, |
153
|
|
|
|
|
|
|
type => $orf->{type}, |
154
|
|
|
|
|
|
|
sequence => $sequence, |
155
|
|
|
|
|
|
|
strand => $orf->{strand}, |
156
|
0
|
|
|
|
|
|
); |
157
|
|
|
|
|
|
|
|
158
|
|
|
|
|
|
|
$json_for{ $cluster_for{name} }{genes}{ $orf_for{name} }{$_} |
159
|
0
|
|
|
|
|
|
= $orf_for{$_} for keys %orf_for; |
160
|
|
|
|
|
|
|
} |
161
|
|
|
|
|
|
|
} |
162
|
|
|
|
|
|
|
|
163
|
|
|
|
|
|
|
# parse the second part of the report |
164
|
0
|
|
|
|
|
|
my $domain_id = 1; |
165
|
0
|
|
|
|
|
|
my $module_id = 1; |
166
|
|
|
|
|
|
|
|
167
|
|
|
|
|
|
|
# use a fix for antiSMASH 5.1.1 |
168
|
|
|
|
|
|
|
$region_for = $root->{details_data}{nrpspks} |
169
|
|
|
|
|
|
|
? $root->{details_data}{nrpspks} |
170
|
|
|
|
|
|
|
: $root->{details_data} |
171
|
0
|
0
|
|
|
|
|
; # reassigning the region_for var |
172
|
|
|
|
|
|
|
|
173
|
0
|
|
|
|
|
|
for my $region (keys %{ $region_for }) { |
|
0
|
|
|
|
|
|
|
174
|
|
|
|
|
|
|
|
175
|
0
|
|
|
|
|
|
my $cluster_name = $region_for->{$region}{id}; |
176
|
0
|
|
|
|
|
|
my $orfs = $region_for->{$region}{orfs}; |
177
|
|
|
|
|
|
|
|
178
|
0
|
|
|
|
|
|
my $prev_domain; |
179
|
0
|
|
|
|
|
|
for my $orf (@{ $orfs }) { |
|
0
|
|
|
|
|
|
|
180
|
|
|
|
|
|
|
|
181
|
0
|
|
|
|
|
|
my $gene_name = $orf->{id}; |
182
|
0
|
|
|
|
|
|
for my $domain (@{ $orf->{domains} }) { |
|
0
|
|
|
|
|
|
|
183
|
|
|
|
|
|
|
|
184
|
|
|
|
|
|
|
# fix duplicate domains in v5 (2019.11.02) |
185
|
0
|
0
|
0
|
|
|
|
if ($domain_id > 1 && $prev_domain |
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
186
|
|
|
|
|
|
|
&& $domain->{start} eq $prev_domain->{start} |
187
|
|
|
|
|
|
|
&& $domain->{sequence} eq $prev_domain->{sequence} |
188
|
|
|
|
|
|
|
) { |
189
|
0
|
|
|
|
|
|
next; |
190
|
|
|
|
|
|
|
} |
191
|
|
|
|
|
|
|
|
192
|
|
|
|
|
|
|
$json_for{$cluster_name}{genes}{$gene_name}{domains}{$domain_id} |
193
|
|
|
|
|
|
|
= { |
194
|
|
|
|
|
|
|
id => $domain_id++, |
195
|
|
|
|
|
|
|
gene_id => $json_for{$cluster_name}{genes}{$gene_name}{id}, |
196
|
|
|
|
|
|
|
prot_start => $domain->{start}, |
197
|
|
|
|
|
|
|
prot_end => $domain->{end}, |
198
|
|
|
|
|
|
|
type => $domain->{type}, |
199
|
|
|
|
|
|
|
sequence => $domain->{sequence}, |
200
|
|
|
|
|
|
|
dna_start => $domain->{start} == 1 |
201
|
|
|
|
|
|
|
? 1 : $domain->{start} * 3, |
202
|
|
|
|
|
|
|
dna_end => $domain->{end} * 3, |
203
|
|
|
|
|
|
|
abbreviation => $domain->{abbreviation}, # = symbol |
204
|
0
|
0
|
|
|
|
|
}; |
205
|
|
|
|
|
|
|
|
206
|
0
|
|
|
|
|
|
$prev_domain = $domain; |
207
|
|
|
|
|
|
|
} |
208
|
|
|
|
|
|
|
|
209
|
0
|
0
|
|
|
|
|
if ($orf->{modules}) { # appeared in antiSMASH version 5.1 |
210
|
|
|
|
|
|
|
|
211
|
0
|
|
|
|
|
|
for my $module (@{ $orf->{modules} }) { |
|
0
|
|
|
|
|
|
|
212
|
|
|
|
|
|
|
|
213
|
|
|
|
|
|
|
my @gene_dna_coordinates = ( |
214
|
|
|
|
|
|
|
$json_for{$cluster_name}{genes}{$gene_name}{start}, |
215
|
|
|
|
|
|
|
$json_for{$cluster_name}{genes}{$gene_name}{end}, |
216
|
0
|
|
|
|
|
|
); |
217
|
|
|
|
|
|
|
|
218
|
|
|
|
|
|
|
my $genomic_prot_start = ceil(($gene_dna_coordinates[0] / 3)) |
219
|
0
|
|
|
|
|
|
+ $module->{start} - 1; # if module starting pos is 1, this souldn't be position 2 on the gene coords |
220
|
|
|
|
|
|
|
my $genomic_prot_end = ceil(($gene_dna_coordinates[0] / 3)) |
221
|
0
|
|
|
|
|
|
+ $module->{end} - 1; |
222
|
|
|
|
|
|
|
|
223
|
|
|
|
|
|
|
my $dna_start = $module->{start} == 1 # if prot pos is 1, it should still be 1 in DNA |
224
|
|
|
|
|
|
|
? 1 |
225
|
0
|
0
|
|
|
|
|
: $module->{start} * 3 |
226
|
|
|
|
|
|
|
; |
227
|
|
|
|
|
|
|
|
228
|
0
|
|
|
|
|
|
my $genomic_dna_start = $gene_dna_coordinates[0] |
229
|
|
|
|
|
|
|
+ $dna_start - 1; |
230
|
|
|
|
|
|
|
my $genomic_dna_end = $gene_dna_coordinates[0] |
231
|
0
|
|
|
|
|
|
+ ($module->{end} * 3) - 1; |
232
|
|
|
|
|
|
|
|
233
|
|
|
|
|
|
|
$json_for{$cluster_name}{modules}{ |
234
|
|
|
|
|
|
|
'module' . $module_id} = { |
235
|
|
|
|
|
|
|
id => $module_id++, |
236
|
|
|
|
|
|
|
gene_id => $gene_name, |
237
|
|
|
|
|
|
|
rel_start => $module->{start}, # relative to gene coordinates |
238
|
|
|
|
|
|
|
rel_end => $module->{end}, |
239
|
|
|
|
|
|
|
prot_start => $genomic_prot_start, |
240
|
|
|
|
|
|
|
prot_end => $genomic_prot_end, |
241
|
|
|
|
|
|
|
dna_start => $genomic_dna_start, |
242
|
|
|
|
|
|
|
dna_end => $genomic_dna_end, |
243
|
|
|
|
|
|
|
complete => $module->{complete} == 1 |
244
|
|
|
|
|
|
|
? 'true' : 'false', |
245
|
|
|
|
|
|
|
iterative => $module->{iterative}, |
246
|
|
|
|
|
|
|
monomer => $module->{monomer}, |
247
|
|
|
|
|
|
|
# domains => join ',', @module_domains // 'NULL', |
248
|
0
|
0
|
|
|
|
|
}; |
249
|
|
|
|
|
|
|
} |
250
|
|
|
|
|
|
|
} |
251
|
|
|
|
|
|
|
} |
252
|
|
|
|
|
|
|
} |
253
|
|
|
|
|
|
|
|
254
|
|
|
|
|
|
|
# ### %json_for |
255
|
|
|
|
|
|
|
|
256
|
|
|
|
|
|
|
# writing biosynML format |
257
|
0
|
|
|
|
|
|
my %biosynml_for; |
258
|
0
|
|
|
|
|
|
for my $cluster (keys %json_for) { |
259
|
|
|
|
|
|
|
|
260
|
|
|
|
|
|
|
my ($c_id, $c_name, $c_begin, $c_end, $c_type) |
261
|
0
|
|
|
|
|
|
= map { $json_for{$cluster}{$_} } qw(id name start end type); |
|
0
|
|
|
|
|
|
|
262
|
|
|
|
|
|
|
|
263
|
0
|
|
|
|
|
|
my $model_id = 'model id="' . $c_id . '"'; |
264
|
|
|
|
|
|
|
|
265
|
0
|
|
|
|
|
|
$biosynml_for{$model_id}{genecluster}{name} = $c_name; |
266
|
0
|
|
|
|
|
|
$biosynml_for{$model_id}{genecluster}{type} = $c_type; |
267
|
0
|
|
|
|
|
|
$biosynml_for{$model_id}{genecluster}{region}{begin} = $c_begin; # DNA coordinates |
268
|
0
|
|
|
|
|
|
$biosynml_for{$model_id}{genecluster}{region}{end} = $c_end; |
269
|
|
|
|
|
|
|
|
270
|
|
|
|
|
|
|
GENE: |
271
|
0
|
|
|
|
|
|
for my $gene (keys %{ $json_for{$cluster}{genes} }) { |
|
0
|
|
|
|
|
|
|
272
|
|
|
|
|
|
|
|
273
|
|
|
|
|
|
|
my ($g_id, $g_name, $g_begin, $g_end, $g_sequence) |
274
|
0
|
|
|
|
|
|
= map { $json_for{$cluster}{genes}{$gene}{$_} } |
|
0
|
|
|
|
|
|
|
275
|
|
|
|
|
|
|
qw(id name start end sequence) |
276
|
|
|
|
|
|
|
; |
277
|
|
|
|
|
|
|
|
278
|
0
|
|
|
|
|
|
my $attr_gene_id = 'gene id="' . $g_id . '"'; |
279
|
0
|
|
|
|
|
|
$biosynml_for{genelist}{$attr_gene_id}{gene_name} = $g_name; |
280
|
|
|
|
|
|
|
$biosynml_for{genelist}{$attr_gene_id}{gene_location}{begin} |
281
|
0
|
|
|
|
|
|
= $g_begin; |
282
|
|
|
|
|
|
|
|
283
|
|
|
|
|
|
|
$biosynml_for{genelist}{$attr_gene_id}{gene_location}{end} |
284
|
0
|
|
|
|
|
|
= $g_end; |
285
|
|
|
|
|
|
|
|
286
|
0
|
|
|
|
|
|
$biosynml_for{genelist}{$attr_gene_id}{gene_qualifiers}{'qualifier' |
287
|
|
|
|
|
|
|
. ' name="translation" ori="auto-annotation" style="genbank"'} |
288
|
|
|
|
|
|
|
= $g_sequence |
289
|
|
|
|
|
|
|
; |
290
|
|
|
|
|
|
|
|
291
|
0
|
|
|
|
|
|
for my $domain (keys %{ $json_for{$cluster}{genes}{$gene}{domains} |
292
|
0
|
|
|
|
|
|
}) { |
293
|
|
|
|
|
|
|
|
294
|
|
|
|
|
|
|
my ($d_id, $dgene_id, $d_pbegin, $d_pend, $d_dbegin, |
295
|
|
|
|
|
|
|
$d_dend,$d_type, $d_sequence) |
296
|
0
|
|
|
|
|
|
= map { $json_for{$cluster}{genes}{$gene}{domains}{ |
297
|
0
|
|
|
|
|
|
$domain}{$_} } |
298
|
|
|
|
|
|
|
qw(id gene_id prot_start prot_end |
299
|
|
|
|
|
|
|
dna_begin dna_end type sequence) |
300
|
|
|
|
|
|
|
; |
301
|
|
|
|
|
|
|
|
302
|
0
|
|
|
|
|
|
my $attr_domain_id = 'domain id="' . $d_id .'"'; |
303
|
|
|
|
|
|
|
|
304
|
0
|
|
|
|
|
|
$biosynml_for{domainlist}{$attr_domain_id} |
305
|
|
|
|
|
|
|
= { |
306
|
|
|
|
|
|
|
nodeid => $d_id, |
307
|
|
|
|
|
|
|
function => $d_type, |
308
|
|
|
|
|
|
|
location => { |
309
|
|
|
|
|
|
|
gene => { |
310
|
|
|
|
|
|
|
'geneid source ="genelist"' => $dgene_id, |
311
|
|
|
|
|
|
|
position => { begin => $d_dbegin, end => $d_dend, }, |
312
|
|
|
|
|
|
|
}, |
313
|
|
|
|
|
|
|
protein => { |
314
|
|
|
|
|
|
|
sequence => $d_sequence, |
315
|
|
|
|
|
|
|
position => { begin => $d_pbegin, end => $d_pend, }, |
316
|
|
|
|
|
|
|
}, |
317
|
|
|
|
|
|
|
}, |
318
|
|
|
|
|
|
|
}; |
319
|
|
|
|
|
|
|
} |
320
|
|
|
|
|
|
|
} |
321
|
|
|
|
|
|
|
|
322
|
0
|
0
|
|
|
|
|
if ($json_for{$cluster}{modules}) { |
323
|
|
|
|
|
|
|
|
324
|
0
|
|
|
|
|
|
my $module_for = $json_for{$cluster}{modules}; |
325
|
|
|
|
|
|
|
|
326
|
|
|
|
|
|
|
MODULE: |
327
|
0
|
|
|
|
|
|
for my $module (keys %{ $module_for }) { |
|
0
|
|
|
|
|
|
|
328
|
|
|
|
|
|
|
|
329
|
|
|
|
|
|
|
my $attr_module_id = 'module id="' |
330
|
0
|
|
|
|
|
|
. $module_for->{$module}{id} .'"'; |
331
|
|
|
|
|
|
|
|
332
|
|
|
|
|
|
|
$biosynml_for{modulelist}{$attr_module_id}{$_} |
333
|
|
|
|
|
|
|
= $module_for->{$module}{$_} |
334
|
0
|
|
|
|
|
|
for keys %{ $module_for->{$module} } |
|
0
|
|
|
|
|
|
|
335
|
|
|
|
|
|
|
; |
336
|
|
|
|
|
|
|
} |
337
|
|
|
|
|
|
|
} |
338
|
|
|
|
|
|
|
} |
339
|
|
|
|
|
|
|
|
340
|
|
|
|
|
|
|
# ### %biosynml_for |
341
|
|
|
|
|
|
|
|
342
|
|
|
|
|
|
|
# write XML file |
343
|
0
|
|
|
|
|
|
my $conv = XML::Hash::XS->new(utf8 => 0, encoding => 'utf-8', indent => 4); |
344
|
0
|
|
|
|
|
|
my $xmlstr = $conv->hash2xml(\%biosynml_for, utf8 => 1); |
345
|
|
|
|
|
|
|
|
346
|
|
|
|
|
|
|
# correct artificial attributes |
347
|
0
|
|
|
|
|
|
$xmlstr =~ s/(<\/[a-z\_]+).*?>/$1>/xmsg; |
348
|
|
|
|
|
|
|
|
349
|
0
|
|
|
|
|
|
return($xmlstr); |
350
|
|
|
|
|
|
|
} |
351
|
|
|
|
|
|
|
|
352
|
|
|
|
|
|
|
sub is_cluster_type_ok { |
353
|
|
|
|
|
|
|
|
354
|
0
|
|
|
0
|
0
|
|
my $self = shift; |
355
|
|
|
|
|
|
|
|
356
|
0
|
|
|
|
|
|
my @filter_types = shift; |
357
|
|
|
|
|
|
|
|
358
|
0
|
|
|
|
|
|
my @allowed_types = qw( |
359
|
|
|
|
|
|
|
acyl_amino_acids amglyccycl arylpolyene bacteriocin butyrolactone |
360
|
|
|
|
|
|
|
cyanobactin ectoine hserlactone hglE-KS indole ladderane lantipeptide |
361
|
|
|
|
|
|
|
lassopeptide microviridin nrps nucleoside oligosaccharide otherks |
362
|
|
|
|
|
|
|
phenazine phosphonate PKS proteusin PUFA resorcinol siderophore t1pks |
363
|
|
|
|
|
|
|
t2pks t3pks terpene |
364
|
|
|
|
|
|
|
); |
365
|
|
|
|
|
|
|
|
366
|
0
|
|
|
|
|
|
for my $type (@filter_types) { |
367
|
|
|
|
|
|
|
|
368
|
0
|
0
|
|
|
|
|
unless (grep { $type =~ m/$_/xmsi } @allowed_types) { |
|
0
|
|
|
|
|
|
|
369
|
|
|
|
|
|
|
|
370
|
0
|
|
|
|
|
|
croak 'Error: value "' . $type . '" from --types option is ' |
371
|
|
|
|
|
|
|
. 'incorrect. Please look allowed values with --help option'; |
372
|
|
|
|
|
|
|
} |
373
|
|
|
|
|
|
|
} |
374
|
|
|
|
|
|
|
|
375
|
0
|
|
|
|
|
|
return(1); |
376
|
|
|
|
|
|
|
} |
377
|
|
|
|
|
|
|
|
378
|
|
|
|
|
|
|
|
379
|
|
|
|
|
|
|
__PACKAGE__->meta->make_immutable; |
380
|
|
|
|
|
|
|
1; |
381
|
|
|
|
|
|
|
|
382
|
|
|
|
|
|
|
__END__ |
383
|
|
|
|
|
|
|
|
384
|
|
|
|
|
|
|
=pod |
385
|
|
|
|
|
|
|
|
386
|
|
|
|
|
|
|
=head1 NAME |
387
|
|
|
|
|
|
|
|
388
|
|
|
|
|
|
|
Bio::Palantir::Parser - front-end class for Bio::Palantir::Parser module, wich handles the parsing of biosynML.xml and regions.js antiSMASH reports |
389
|
|
|
|
|
|
|
|
390
|
|
|
|
|
|
|
=head1 VERSION |
391
|
|
|
|
|
|
|
|
392
|
|
|
|
|
|
|
version 0.211420 |
393
|
|
|
|
|
|
|
|
394
|
|
|
|
|
|
|
=head1 SYNOPSIS |
395
|
|
|
|
|
|
|
|
396
|
|
|
|
|
|
|
#TODO |
397
|
|
|
|
|
|
|
|
398
|
|
|
|
|
|
|
=head1 DESCRIPTION |
399
|
|
|
|
|
|
|
|
400
|
|
|
|
|
|
|
This module implements classes and their methods for B<parsing antisMASH |
401
|
|
|
|
|
|
|
reports>. The supported report formats are the F<biosynML.xml> file generated in |
402
|
|
|
|
|
|
|
antiSMASH v3-4 (though the version 4 needs to be explicitely activated in the |
403
|
|
|
|
|
|
|
options) or the F<regions.js> in the version 5. |
404
|
|
|
|
|
|
|
|
405
|
|
|
|
|
|
|
The Biosynthetic Gene Cluster (BGC) information is hierarchically organized as |
406
|
|
|
|
|
|
|
follows: |
407
|
|
|
|
|
|
|
|
408
|
|
|
|
|
|
|
C<Root.pm>: contains the root of the BGC data structure |
409
|
|
|
|
|
|
|
|
410
|
|
|
|
|
|
|
C<Cluster.pm>: contains attributes and methods for the BGC B<Cluster> level, |
411
|
|
|
|
|
|
|
including an array of Gene objects |
412
|
|
|
|
|
|
|
|
413
|
|
|
|
|
|
|
C<Gene.pm>: contains attributes and methods for the BGC B<Gene> level, |
414
|
|
|
|
|
|
|
including an array of Domain objects (if NRPS/PKS BGCs) |
415
|
|
|
|
|
|
|
|
416
|
|
|
|
|
|
|
C<Module.pm>: contains attributes and methods for the BGC B<Module> level |
417
|
|
|
|
|
|
|
(generated by Palantir), including an array of Domain objects (this class is |
418
|
|
|
|
|
|
|
parallel to Genes, as module can be overlapping 2 genes) |
419
|
|
|
|
|
|
|
|
420
|
|
|
|
|
|
|
C<domain.pm>: contains attributes and methods for the BGC B<Domain> level, |
421
|
|
|
|
|
|
|
including an array of Motif objects |
422
|
|
|
|
|
|
|
|
423
|
|
|
|
|
|
|
C<Motif.pm>: contains attributes and methods for the BGC B<Motif> level |
424
|
|
|
|
|
|
|
|
425
|
|
|
|
|
|
|
=head1 ATTRIBUTES |
426
|
|
|
|
|
|
|
|
427
|
|
|
|
|
|
|
=head2 file |
428
|
|
|
|
|
|
|
|
429
|
|
|
|
|
|
|
Path to biosynML.xml or regions.js antiSMASH report file to be parsed. |
430
|
|
|
|
|
|
|
|
431
|
|
|
|
|
|
|
=head2 root |
432
|
|
|
|
|
|
|
|
433
|
|
|
|
|
|
|
C<Bio::Palantir::Parser::Root> composed object |
434
|
|
|
|
|
|
|
|
435
|
|
|
|
|
|
|
=head2 file |
436
|
|
|
|
|
|
|
|
437
|
|
|
|
|
|
|
Path to a biosynML.xml or regions.js file |
438
|
|
|
|
|
|
|
|
439
|
|
|
|
|
|
|
=head2 root |
440
|
|
|
|
|
|
|
|
441
|
|
|
|
|
|
|
L<Bio::Palantir::Parser::Root> composed object |
442
|
|
|
|
|
|
|
|
443
|
|
|
|
|
|
|
=head2 module_delineation |
444
|
|
|
|
|
|
|
|
445
|
|
|
|
|
|
|
Module delineation method: generates modules from condensation or selection domains. |
446
|
|
|
|
|
|
|
|
447
|
|
|
|
|
|
|
=head1 AUTHOR |
448
|
|
|
|
|
|
|
|
449
|
|
|
|
|
|
|
Loic MEUNIER <lmeunier@uliege.be> |
450
|
|
|
|
|
|
|
|
451
|
|
|
|
|
|
|
=head1 COPYRIGHT AND LICENSE |
452
|
|
|
|
|
|
|
|
453
|
|
|
|
|
|
|
This software is copyright (c) 2019 by University of Liege / Unit of Eukaryotic Phylogenomics / Loic MEUNIER and Denis BAURAIN. |
454
|
|
|
|
|
|
|
|
455
|
|
|
|
|
|
|
This is free software; you can redistribute it and/or modify it under |
456
|
|
|
|
|
|
|
the same terms as the Perl 5 programming language system itself. |
457
|
|
|
|
|
|
|
|
458
|
|
|
|
|
|
|
=cut |