File Coverage

blib/lib/Bio/Palantir/Parser.pm

Criterion	Covered	Total	%
statement	43	160	26.8
branch	4	30	13.3
condition	0	9	0.0
subroutine	12	14	85.7
pod	0	1	0.0
total	59	214	27.5

line	stmt	bran	cond	sub	pod	time	code
1							package Bio::Palantir::Parser;
2							# ABSTRACT: front-end class for Bio::Palantir::Parser module, wich handles the parsing of biosynML.xml and regions.js antiSMASH reports
3							$Bio::Palantir::Parser::VERSION = '0.211420';
4	1			1		649	use Moose;
	1					503791
	1					5
5	1			1		8582	use namespace::autoclean;
	1					9059
	1					5
6
7	1			1		483	use autodie;
	1					2
	1					10
8
9	1			1		5737	use Carp;
	1					2
	1					66
10	1			1		7	use File::Basename 'fileparse';
	1					3
	1					116
11	1			1		8	use File::Temp;
	1					3
	1					97
12	1			1		583	use JSON::Parse 'json_file_to_perl';
	1					1366
	1					59
13	1			1		8	use POSIX 'ceil';
	1					2
	1					17
14	1			1		3684	use XML::Bare;
	1					10300
	1					83
15	1			1		744	use XML::Hash::XS;
	1					1249
	1					66
16
17	1			1		574	use aliased 'Bio::Palantir::Parser::Root';
	1					921
	1					7
18							extends 'Bio::FastParsers::Base';
19
20
21							# ATTRIBUTES
22
23
24
25							has 'root' => (
26							is => 'ro',
27							isa => 'Bio::Palantir::Parser::Root',
28							init_arg => undef,
29							lazy => 1,
30							builder => '_build_root',
31							);
32
33
34							has 'module_delineation' => (
35							is => 'ro',
36							isa => 'Str',
37							default => 'substrate-selection',
38							);
39
40							## no critic (ProhibitUnusedPrivateSubroutines)
41
42							sub _build_root {
43	1			1		3	my $self = shift;
44
45	1					4	my @exts = qw(.xml .js);
46	1					35	my ($name, $dir, $ext) = fileparse($self->file, @exts);
47
48	1					156	my $biosynml = File::Temp->new(suffix => '.xml');
49
50	1	50				791	if ($ext eq '.js') {
51	0					0	my $xmlstr = $self->_convert_js2biosynml;
52	0					0	open my $out, '>', $biosynml->filename;
53	0					0	say {$out} $xmlstr;
	0					0
54							}
55
56	1	50				40	my $file = $ext eq '.xml' ? $self->file : $biosynml->filename;
57
58	1	50				15	my $xb = XML::Bare->new( file => $file )
59							or croak "Can't open '$file' for reading: $!";
60
61	1					3976	my $root = $xb->parse->{'root'};
62	1	50				4538	unless ($root) {
63	0					0	carp "Warning: '$file' unexpectedly empty; returning no root!";
64	0					0	return;
65							}
66
67	1					50	return Root->new( _root => $root,
68							module_delineation => $self->module_delineation );
69							}
70
71							## use critic
72
73							sub _convert_js2biosynml {
74	0			0			my $self = shift;
75
76	0						my $js = $self->file;
77	0						my $json = File::Temp->new(suffix => '.json');
78
79	0						open my $in, '<', $js;
80	0						chomp( my @lines = <$in> );
81
82	0						open my $out, '>', $json->filename;
83	0						for my $i (0 .. @lines - 1) {
84
85	0	0					if ($i == 0) {
86	0						say {$out} '{';
	0
87							}
88
89	0	0					if (substr($lines[$i], 0, 3) eq 'var') {
		0
90
91							# extract the block name and its sigil type
92							# type may contains two characters if the var is empty
93	0						my ($key, $sigil) = $lines[$i]
94							=~ m/var \s ([A-Za-z\_]+) \s=\s ([\[\]{}]+);?$/xms;
95
96	0						say {$out} " \"$key\": $sigil";
	0
97							}
98
99							elsif ($i == @lines - 1) {
100							# do not convert the ';' in a ','
101	0						say {$out} ' }';
	0
102							}
103
104							else {
105	0						$lines[$i] =~ s/\]\;/\]\,/xms;
106	0						$lines[$i] =~ s/\}\;/\}\,/xms;
107	0						say {$out} ' ' . $lines[$i];
	0
108							}
109							}
110
111	0						print {$out} '}';
	0
112	0						close $out;
113
114	0						open my $in2, '<', $json->filename;
115	0						while (my $line = <$in>){
116	0						print $line;
117							}
118
119	0						my $root = json_file_to_perl($json->filename);
120
121	0						my %json_for;
122	0						my ($cluster_id, $gene_id) = (1,1);
123
124							# parse the first part of the report
125	0						my $region_for = $root->{all_regions};
126	0						for my $region (@{ $region_for->{order} }) {
	0
127
128							my %cluster_for = (
129							id => $cluster_id++,
130							name => $region_for->{$region}{anchor},
131							rank => $region_for->{$region}{idx},
132							type => $region_for->{$region}{type},
133							start => $region_for->{$region}{start}, # DNA coordinates
134							end => $region_for->{$region}{end},
135	0						);
136
137							$json_for{ $cluster_for{name} }{$_} = $cluster_for{$_}
138	0						for keys %cluster_for;
139
140	0						my $orfs = $region_for->{$region}{orfs};
141
142	0						for my $orf (@{ $orfs }) {
	0
143
144	0						my $def = $orf->{description};
145	0						my ($sequence)
146							= $def =~ m/PROGRAMS=blastp&QUERY=([A-Z]+)\&amp/xms;
147
148							my %orf_for = (
149							id => $gene_id++,
150							name => $orf->{locus_tag},
151							start => $orf->{start}, # DNA coordinates
152							end => $orf->{end},
153							type => $orf->{type},
154							sequence => $sequence,
155							strand => $orf->{strand},
156	0						);
157
158							$json_for{ $cluster_for{name} }{genes}{ $orf_for{name} }{$_}
159	0						= $orf_for{$_} for keys %orf_for;
160							}
161							}
162
163							# parse the second part of the report
164	0						my $domain_id = 1;
165	0						my $module_id = 1;
166
167							# use a fix for antiSMASH 5.1.1
168							$region_for = $root->{details_data}{nrpspks}
169							? $root->{details_data}{nrpspks}
170							: $root->{details_data}
171	0	0					; # reassigning the region_for var
172
173	0						for my $region (keys %{ $region_for }) {
	0
174
175	0						my $cluster_name = $region_for->{$region}{id};
176	0						my $orfs = $region_for->{$region}{orfs};
177
178	0						my $prev_domain;
179	0						for my $orf (@{ $orfs }) {
	0
180
181	0						my $gene_name = $orf->{id};
182	0						for my $domain (@{ $orf->{domains} }) {
	0
183
184							# fix duplicate domains in v5 (2019.11.02)
185	0	0	0				if ($domain_id > 1 && $prev_domain
			0
			0
186							&& $domain->{start} eq $prev_domain->{start}
187							&& $domain->{sequence} eq $prev_domain->{sequence}
188							) {
189	0						next;
190							}
191
192							$json_for{$cluster_name}{genes}{$gene_name}{domains}{$domain_id}
193							= {
194							id => $domain_id++,
195							gene_id => $json_for{$cluster_name}{genes}{$gene_name}{id},
196							prot_start => $domain->{start},
197							prot_end => $domain->{end},
198							type => $domain->{type},
199							sequence => $domain->{sequence},
200							dna_start => $domain->{start} == 1
201							? 1 : $domain->{start} * 3,
202							dna_end => $domain->{end} * 3,
203							abbreviation => $domain->{abbreviation}, # = symbol
204	0	0					};
205
206	0						$prev_domain = $domain;
207							}
208
209	0	0					if ($orf->{modules}) { # appeared in antiSMASH version 5.1
210
211	0						for my $module (@{ $orf->{modules} }) {
	0
212
213							my @gene_dna_coordinates = (
214							$json_for{$cluster_name}{genes}{$gene_name}{start},
215							$json_for{$cluster_name}{genes}{$gene_name}{end},
216	0						);
217
218							my $genomic_prot_start = ceil(($gene_dna_coordinates[0] / 3))
219	0						+ $module->{start} - 1; # if module starting pos is 1, this souldn't be position 2 on the gene coords
220							my $genomic_prot_end = ceil(($gene_dna_coordinates[0] / 3))
221	0						+ $module->{end} - 1;
222
223							my $dna_start = $module->{start} == 1 # if prot pos is 1, it should still be 1 in DNA
224							? 1
225	0	0					: $module->{start} * 3
226							;
227
228	0						my $genomic_dna_start = $gene_dna_coordinates[0]
229							+ $dna_start - 1;
230							my $genomic_dna_end = $gene_dna_coordinates[0]
231	0						+ ($module->{end} * 3) - 1;
232
233							$json_for{$cluster_name}{modules}{
234							'module' . $module_id} = {
235							id => $module_id++,
236							gene_id => $gene_name,
237							rel_start => $module->{start}, # relative to gene coordinates
238							rel_end => $module->{end},
239							prot_start => $genomic_prot_start,
240							prot_end => $genomic_prot_end,
241							dna_start => $genomic_dna_start,
242							dna_end => $genomic_dna_end,
243							complete => $module->{complete} == 1
244							? 'true' : 'false',
245							iterative => $module->{iterative},
246							monomer => $module->{monomer},
247							# domains => join ',', @module_domains // 'NULL',
248	0	0					};
249							}
250							}
251							}
252							}
253
254							# ### %json_for
255
256							# writing biosynML format
257	0						my %biosynml_for;
258	0						for my $cluster (keys %json_for) {
259
260							my ($c_id, $c_name, $c_begin, $c_end, $c_type)
261	0						= map { $json_for{$cluster}{$_} } qw(id name start end type);
	0
262
263	0						my $model_id = 'model id="' . $c_id . '"';
264
265	0						$biosynml_for{$model_id}{genecluster}{name} = $c_name;
266	0						$biosynml_for{$model_id}{genecluster}{type} = $c_type;
267	0						$biosynml_for{$model_id}{genecluster}{region}{begin} = $c_begin; # DNA coordinates
268	0						$biosynml_for{$model_id}{genecluster}{region}{end} = $c_end;
269
270							GENE:
271	0						for my $gene (keys %{ $json_for{$cluster}{genes} }) {
	0
272
273							my ($g_id, $g_name, $g_begin, $g_end, $g_sequence)
274	0						= map { $json_for{$cluster}{genes}{$gene}{$_} }
	0
275							qw(id name start end sequence)
276							;
277
278	0						my $attr_gene_id = 'gene id="' . $g_id . '"';
279	0						$biosynml_for{genelist}{$attr_gene_id}{gene_name} = $g_name;
280							$biosynml_for{genelist}{$attr_gene_id}{gene_location}{begin}
281	0						= $g_begin;
282
283							$biosynml_for{genelist}{$attr_gene_id}{gene_location}{end}
284	0						= $g_end;
285
286	0						$biosynml_for{genelist}{$attr_gene_id}{gene_qualifiers}{'qualifier'
287							. ' name="translation" ori="auto-annotation" style="genbank"'}
288							= $g_sequence
289							;
290
291	0						for my $domain (keys %{ $json_for{$cluster}{genes}{$gene}{domains}
292	0						}) {
293
294							my ($d_id, $dgene_id, $d_pbegin, $d_pend, $d_dbegin,
295							$d_dend,$d_type, $d_sequence)
296	0						= map { $json_for{$cluster}{genes}{$gene}{domains}{
297	0						$domain}{$_} }
298							qw(id gene_id prot_start prot_end
299							dna_begin dna_end type sequence)
300							;
301
302	0						my $attr_domain_id = 'domain id="' . $d_id .'"';
303
304	0						$biosynml_for{domainlist}{$attr_domain_id}
305							= {
306							nodeid => $d_id,
307							function => $d_type,
308							location => {
309							gene => {
310							'geneid source ="genelist"' => $dgene_id,
311							position => { begin => $d_dbegin, end => $d_dend, },
312							},
313							protein => {
314							sequence => $d_sequence,
315							position => { begin => $d_pbegin, end => $d_pend, },
316							},
317							},
318							};
319							}
320							}
321
322	0	0					if ($json_for{$cluster}{modules}) {
323
324	0						my $module_for = $json_for{$cluster}{modules};
325
326							MODULE:
327	0						for my $module (keys %{ $module_for }) {
	0
328
329							my $attr_module_id = 'module id="'
330	0						. $module_for->{$module}{id} .'"';
331
332							$biosynml_for{modulelist}{$attr_module_id}{$_}
333							= $module_for->{$module}{$_}
334	0						for keys %{ $module_for->{$module} }
	0
335							;
336							}
337							}
338							}
339
340							# ### %biosynml_for
341
342							# write XML file
343	0						my $conv = XML::Hash::XS->new(utf8 => 0, encoding => 'utf-8', indent => 4);
344	0						my $xmlstr = $conv->hash2xml(\%biosynml_for, utf8 => 1);
345
346							# correct artificial attributes
347	0						$xmlstr =~ s/(<\/[a-z\_]+).*?>/$1>/xmsg;
348
349	0						return($xmlstr);
350							}
351
352							sub is_cluster_type_ok {
353
354	0			0	0		my $self = shift;
355
356	0						my @filter_types = shift;
357
358	0						my @allowed_types = qw(
359							acyl_amino_acids amglyccycl arylpolyene bacteriocin butyrolactone
360							cyanobactin ectoine hserlactone hglE-KS indole ladderane lantipeptide
361							lassopeptide microviridin nrps nucleoside oligosaccharide otherks
362							phenazine phosphonate PKS proteusin PUFA resorcinol siderophore t1pks
363							t2pks t3pks terpene
364							);
365
366	0						for my $type (@filter_types) {
367
368	0	0					unless (grep { $type =~ m/$_/xmsi } @allowed_types) {
	0
369
370	0						croak 'Error: value "' . $type . '" from --types option is '
371							. 'incorrect. Please look allowed values with --help option';
372							}
373							}
374
375	0						return(1);
376							}
377
378
379							__PACKAGE__->meta->make_immutable;
380							1;
381
382							__END__
383
384							=pod
385
386							=head1 NAME
387
388							Bio::Palantir::Parser - front-end class for Bio::Palantir::Parser module, wich handles the parsing of biosynML.xml and regions.js antiSMASH reports
389
390							=head1 VERSION
391
392							version 0.211420
393
394							=head1 SYNOPSIS
395
396							#TODO
397
398							=head1 DESCRIPTION
399
400							This module implements classes and their methods for B<parsing antisMASH
401							reports>. The supported report formats are the F<biosynML.xml> file generated in
402							antiSMASH v3-4 (though the version 4 needs to be explicitely activated in the
403							options) or the F<regions.js> in the version 5.
404
405							The Biosynthetic Gene Cluster (BGC) information is hierarchically organized as
406							follows:
407
408							C<Root.pm>: contains the root of the BGC data structure
409
410							C<Cluster.pm>: contains attributes and methods for the BGC B<Cluster> level,
411							including an array of Gene objects
412
413							C<Gene.pm>: contains attributes and methods for the BGC B<Gene> level,
414							including an array of Domain objects (if NRPS/PKS BGCs)
415
416							C<Module.pm>: contains attributes and methods for the BGC B<Module> level
417							(generated by Palantir), including an array of Domain objects (this class is
418							parallel to Genes, as module can be overlapping 2 genes)
419
420							C<domain.pm>: contains attributes and methods for the BGC B<Domain> level,
421							including an array of Motif objects
422
423							C<Motif.pm>: contains attributes and methods for the BGC B<Motif> level
424
425							=head1 ATTRIBUTES
426
427							=head2 file
428
429							Path to biosynML.xml or regions.js antiSMASH report file to be parsed.
430
431							=head2 root
432
433							C<Bio::Palantir::Parser::Root> composed object
434
435							=head2 file
436
437							Path to a biosynML.xml or regions.js file
438
439							=head2 root
440
441							L<Bio::Palantir::Parser::Root> composed object
442
443							=head2 module_delineation
444
445							Module delineation method: generates modules from condensation or selection domains.
446
447							=head1 AUTHOR
448
449							Loic MEUNIER <lmeunier@uliege.be>
450
451							=head1 COPYRIGHT AND LICENSE
452
453							This software is copyright (c) 2019 by University of Liege / Unit of Eukaryotic Phylogenomics / Loic MEUNIER and Denis BAURAIN.
454
455							This is free software; you can redistribute it and/or modify it under
456							the same terms as the Perl 5 programming language system itself.
457
458							=cut