File Coverage

blib/lib/Bio/Palantir/Parser.pm

Criterion	Covered	Total	%
statement	43	159	27.0
branch	4	32	12.5
condition	0	9	0.0
subroutine	12	14	85.7
pod	0	1	0.0
total	59	215	27.4

line	stmt	bran	cond	sub	pod	time	code
1							package Bio::Palantir::Parser;
2							# ABSTRACT: front-end class for Bio::Palantir::Parser module, wich handles the parsing of biosynML.xml and regions.js antiSMASH reports
3							$Bio::Palantir::Parser::VERSION = '0.200700';
4	1			1		934	use Moose;
	1					420232
	1					6
5	1			1		6652	use namespace::autoclean;
	1					6829
	1					4
6
7	1			1		65	use autodie;
	1					2
	1					7
8
9	1			1		4450	use Carp;
	1					3
	1					64
10	1			1		7	use File::Basename 'fileparse';
	1					2
	1					65
11	1			1		6	use File::Temp;
	1					2
	1					67
12	1			1		472	use JSON::Parse 'json_file_to_perl';
	1					858
	1					58
13	1			1		6	use POSIX 'ceil';
	1					1
	1					7
14	1			1		2369	use XML::Bare;
	1					7323
	1					40
15	1			1		749	use XML::Hash::XS;
	1					1327
	1					49
16
17	1			1		440	use aliased 'Bio::Palantir::Parser::Root';
	1					635
	1					6
18							extends 'Bio::FastParsers::Base';
19
20
21							# ATTRIBUTES
22
23
24
25							has 'root' => (
26							is => 'ro',
27							isa => 'Bio::Palantir::Parser::Root',
28							init_arg => undef,
29							lazy => 1,
30							builder => '_build_root',
31							);
32
33
34							has 'module_delineation' => (
35							is => 'ro',
36							isa => 'Str',
37							default => 'substrate-selection',
38							);
39
40							## no critic (ProhibitUnusedPrivateSubroutines)
41
42							sub _build_root {
43	1			1		3	my $self = shift;
44
45	1					4	my @exts = qw(.xml .js);
46	1					29	my ($name, $dir, $ext) = fileparse($self->file, @exts);
47
48	1					140	my $biosynml = File::Temp->new(suffix => '.xml');
49
50	1	50				642	if ($ext eq '.js') {
51	0					0	my $xmlstr = $self->_convert_js2biosynml;
52	0					0	open my $out, '>', $biosynml->filename;
53	0					0	say {$out} $xmlstr;
	0					0
54							}
55
56	1	50				33	my $file = $ext eq '.xml' ? $self->file : $biosynml->filename;
57
58	1	50				13	my $xb = XML::Bare->new( file => $file )
59							or croak "Can't open '$file' for reading: $!";
60
61	1					2346	my $root = $xb->parse->{'root'};
62	1	50				3618	unless ($root) {
63	0					0	carp "Warning: '$file' unexpectedly empty; returning no root!";
64	0					0	return;
65							}
66
67	1					58	return Root->new( _root => $root,
68							module_delineation => $self->module_delineation );
69							}
70
71							## use critic
72
73							sub _convert_js2biosynml {
74	0			0			my $self = shift;
75
76	0						my $js = $self->file;
77	0						my $json = File::Temp->new(suffix => '.json');
78
79	0						open my $in, '<', $js;
80
81	0						chomp( my @lines = <$in> );
82
83	0						open my $out, '>', $json->filename;
84	0						for my $i (0 .. @lines - 1) {
85
86	0	0					if ($i == 0) {
		0
		0
		0
		0
87	0						say {$out} '{' . "\n" . ' "recordData": [';
	0
88							}
89
90							elsif ($lines[$i] =~ m/all_regions/xms) {
91	0						say {$out} ' "all_regions": {';
	0
92							}
93
94							elsif ($lines[$i] =~ m/details_data/xms) {
95	0						say {$out} ' "details_data": {';
	0
96							}
97
98							elsif ($i == @lines - 2) {
99	0						say {$out} ' }';
	0
100							}
101
102							elsif ($i == @lines - 1) {
103	0						say {$out} ' }';
	0
104							}
105
106							else {
107	0						$lines[$i] =~ s/\]\;/\]\,/xms;
108	0						$lines[$i] =~ s/\}\;/\}\,/xms;
109	0						say {$out} ' ' . $lines[$i];
	0
110							}
111							}
112
113	0						print {$out} '}';
	0
114	0						close $out;
115
116	0						my $root = json_file_to_perl($json->filename);
117
118	0						my %json_for;
119	0						my ($cluster_id, $gene_id) = (1,1);
120
121							# parse the first part of the report
122	0						my $region_for = $root->{all_regions};
123	0						for my $region (@{ $region_for->{order} }) {
	0
124
125							my %cluster_for = (
126							id => $cluster_id++,
127							name => $region_for->{$region}{anchor},
128							rank => $region_for->{$region}{idx},
129							type => $region_for->{$region}{type},
130							start => $region_for->{$region}{start}, # DNA coordinates
131							end => $region_for->{$region}{end},
132	0						);
133
134							$json_for{ $cluster_for{name} }{$_} = $cluster_for{$_}
135	0						for keys %cluster_for;
136
137	0						my $orfs = $region_for->{$region}{orfs};
138
139	0						for my $orf (@{ $orfs }) {
	0
140
141	0						my $def = $orf->{description};
142	0						my ($sequence)
143							= $def =~ m/PROGRAMS=blastp&QUERY=([A-Z]+)\&amp/xms;
144
145							my %orf_for = (
146							id => $gene_id++,
147							name => $orf->{locus_tag},
148							start => $orf->{start}, # DNA coordinates
149							end => $orf->{end},
150							type => $orf->{type},
151							sequence => $sequence,
152							strand => $orf->{strand},
153	0						);
154
155							$json_for{ $cluster_for{name} }{genes}{ $orf_for{name} }{$_}
156	0						= $orf_for{$_} for keys %orf_for;
157							}
158							}
159
160							# parse the second part of the report
161	0						my $domain_id = 1;
162	0						my $module_id = 1;
163	0						$region_for = $root->{details_data}; # reassigning the region_for var
164	0						for my $region (keys %{ $region_for }) {
	0
165
166	0						my $cluster_name = $region_for->{$region}{id};
167	0						my $orfs = $region_for->{$region}{orfs};
168
169	0						my $prev_domain;
170	0						for my $orf (@{ $orfs }) {
	0
171
172	0						my $gene_name = $orf->{id};
173	0						for my $domain (@{ $orf->{domains} }) {
	0
174
175							# fix duplicate domains in v5 (2019.11.02)
176	0	0	0				if ($domain_id > 1 && $prev_domain
			0
			0
177							&& $domain->{start} eq $prev_domain->{start}
178							&& $domain->{sequence} eq $prev_domain->{sequence}
179							) {
180	0						next;
181							}
182
183							$json_for{$cluster_name}{genes}{$gene_name}{domains}{$domain_id}
184							= {
185							id => $domain_id++,
186							gene_id => $json_for{$cluster_name}{genes}{$gene_name}{id},
187							prot_start => $domain->{start},
188							prot_end => $domain->{end},
189							type => $domain->{type},
190							sequence => $domain->{sequence},
191							dna_start => $domain->{start} == 1
192							? 1 : $domain->{start} * 3,
193							dna_end => $domain->{end} * 3,
194							abbreviation => $domain->{abbreviation}, # = symbol
195	0	0					};
196
197	0						$prev_domain = $domain;
198							}
199
200	0	0					if ($orf->{modules}) { # appeared in antiSMASH version 5.1
201
202	0						for my $module (@{ $orf->{modules} }) {
	0
203
204							my @gene_dna_coordinates = (
205							$json_for{$cluster_name}{genes}{$gene_name}{start},
206							$json_for{$cluster_name}{genes}{$gene_name}{end},
207	0						);
208
209							my $genomic_prot_start = ceil(($gene_dna_coordinates[0] / 3))
210	0						+ $module->{start} - 1; # if module starting pos is 1, this souldn't be position 2 on the gene coords
211							my $genomic_prot_end = ceil(($gene_dna_coordinates[0] / 3))
212	0						+ $module->{end} - 1;
213
214							my $dna_start = $module->{start} == 1 # if prot pos is 1, it should still be 1 in DNA
215							? 1
216	0	0					: $module->{start} * 3
217							;
218
219	0						my $genomic_dna_start = $gene_dna_coordinates[0]
220							+ $dna_start - 1;
221							my $genomic_dna_end = $gene_dna_coordinates[0]
222	0						+ ($module->{end} * 3) - 1;
223
224							$json_for{$cluster_name}{modules}{
225							'module' . $module_id} = {
226							id => $module_id++,
227							gene_id => $gene_name,
228							rel_start => $module->{start}, # relative to gene coordinates
229							rel_end => $module->{end},
230							prot_start => $genomic_prot_start,
231							prot_end => $genomic_prot_end,
232							dna_start => $genomic_dna_start,
233							dna_end => $genomic_dna_end,
234							complete => $module->{complete} == 1
235							? 'true' : 'false',
236							iterative => $module->{iterative},
237							monomer => $module->{monomer},
238							# domains => join ',', @module_domains // 'NULL',
239	0	0					};
240							}
241							}
242							}
243							}
244
245							# ### %json_for
246
247							# writing biosynML format
248	0						my %biosynml_for;
249	0						for my $cluster (keys %json_for) {
250
251							my ($c_id, $c_name, $c_begin, $c_end, $c_type)
252	0						= map { $json_for{$cluster}{$_} } qw(id name start end type);
	0
253
254	0						my $model_id = 'model id="' . $c_id . '"';
255
256	0						$biosynml_for{$model_id}{genecluster}{name} = $c_name;
257	0						$biosynml_for{$model_id}{genecluster}{type} = $c_type;
258	0						$biosynml_for{$model_id}{genecluster}{region}{begin} = $c_begin; # DNA coordinates
259	0						$biosynml_for{$model_id}{genecluster}{region}{end} = $c_end;
260
261							GENE:
262	0						for my $gene (keys %{ $json_for{$cluster}{genes} }) {
	0
263
264							my ($g_id, $g_name, $g_begin, $g_end, $g_sequence)
265	0						= map { $json_for{$cluster}{genes}{$gene}{$_} }
	0
266							qw(id name start end sequence)
267							;
268
269	0						my $attr_gene_id = 'gene id="' . $g_id . '"';
270	0						$biosynml_for{genelist}{$attr_gene_id}{gene_name} = $g_name;
271							$biosynml_for{genelist}{$attr_gene_id}{gene_location}{begin}
272	0						= $g_begin;
273
274							$biosynml_for{genelist}{$attr_gene_id}{gene_location}{end}
275	0						= $g_end;
276
277	0						$biosynml_for{genelist}{$attr_gene_id}{gene_qualifiers}{'qualifier'
278							. ' name="translation" ori="auto-annotation" style="genbank"'}
279							= $g_sequence
280							;
281
282	0						for my $domain (keys %{ $json_for{$cluster}{genes}{$gene}{domains}
283	0						}) {
284
285							my ($d_id, $dgene_id, $d_pbegin, $d_pend, $d_dbegin,
286							$d_dend,$d_type, $d_sequence)
287	0						= map { $json_for{$cluster}{genes}{$gene}{domains}{
288	0						$domain}{$_} }
289							qw(id gene_id prot_start prot_end
290							dna_begin dna_end type sequence)
291							;
292
293	0						my $attr_domain_id = 'domain id="' . $d_id .'"';
294
295	0						$biosynml_for{domainlist}{$attr_domain_id}
296							= {
297							nodeid => $d_id,
298							function => $d_type,
299							location => {
300							gene => {
301							'geneid source ="genelist"' => $dgene_id,
302							position => { begin => $d_dbegin, end => $d_dend, },
303							},
304							protein => {
305							sequence => $d_sequence,
306							position => { begin => $d_pbegin, end => $d_pend, },
307							},
308							},
309							};
310							}
311							}
312
313	0	0					if ($json_for{$cluster}{modules}) {
314
315	0						my $module_for = $json_for{$cluster}{modules};
316
317							MODULE:
318	0						for my $module (keys %{ $module_for }) {
	0
319
320							my $attr_module_id = 'module id="'
321	0						. $module_for->{$module}{id} .'"';
322
323							$biosynml_for{modulelist}{$attr_module_id}{$_}
324							= $module_for->{$module}{$_}
325	0						for keys %{ $module_for->{$module} }
	0
326							;
327							}
328							}
329							}
330
331							# ### %biosynml_for
332
333							# write XML file
334	0						my $conv = XML::Hash::XS->new(utf8 => 0, encoding => 'utf-8', indent => 4);
335	0						my $xmlstr = $conv->hash2xml(\%biosynml_for, utf8 => 1);
336
337							# correct artificial attributes
338	0						$xmlstr =~ s/(<\/[a-z\_]+).*?>/$1>/xmsg;
339
340	0						return($xmlstr);
341							}
342
343							sub is_cluster_type_ok {
344
345	0			0	0		my $self = shift;
346
347	0						my @filter_types = shift;
348
349	0						my @allowed_types = qw(
350							acyl_amino_acids amglyccycl arylpolyene bacteriocin butyrolactone
351							cyanobactin ectoine hserlactone hglE-KS indole ladderane lantipeptide
352							lassopeptide microviridin nrps nucleoside oligosaccharide otherks
353							phenazine phosphonate PKS proteusin PUFA resorcinol siderophore t1pks
354							t2pks t3pks terpene
355							);
356
357	0						for my $type (@filter_types) {
358
359	0	0					unless (grep { $type =~ m/$_/xmsi } @allowed_types) {
	0
360
361	0						croak 'Error: value "' . $type . '" from --types option is '
362							. 'incorrect. Please look allowed values with --help option';
363							}
364							}
365
366	0						return(1);
367							}
368
369
370							__PACKAGE__->meta->make_immutable;
371							1;
372
373							__END__
374
375							=pod
376
377							=head1 NAME
378
379							Bio::Palantir::Parser - front-end class for Bio::Palantir::Parser module, wich handles the parsing of biosynML.xml and regions.js antiSMASH reports
380
381							=head1 VERSION
382
383							version 0.200700
384
385							=head1 SYNOPSIS
386
387							#TODO
388
389							=head1 DESCRIPTION
390
391							This module implements classes and their methods for B<parsing antisMASH
392							reports>. The supported report formats are the F<biosynML.xml> file generated in
393							antiSMASH v3-4 (though the version 4 needs to be explicitely activated in the
394							options) or the F<regions.js> in the version 5.
395
396							The Biosynthetic Gene Cluster (BGC) information is hierarchically organized as
397							follows:
398
399							C<Root.pm>: contains the root of the BGC data structure
400
401							C<Cluster.pm>: contains attributes and methods for the BGC B<Cluster> level,
402							including an array of Gene objects
403
404							C<Gene.pm>: contains attributes and methods for the BGC B<Gene> level,
405							including an array of Domain objects (if NRPS/PKS BGCs)
406
407							C<Module.pm>: contains attributes and methods for the BGC B<Module> level
408							(generated by Palantir), including an array of Domain objects (this class is
409							parallel to Genes, as module can be overlapping 2 genes)
410
411							C<domain.pm>: contains attributes and methods for the BGC B<Domain> level,
412							including an array of Motif objects
413
414							C<Motif.pm>: contains attributes and methods for the BGC B<Motif> level
415
416							=head1 ATTRIBUTES
417
418							=head2 file
419
420							Path to biosynML.xml or regions.js antiSMASH report file to be parsed.
421
422							=head2 root
423
424							C<Bio::Palantir::Parser::Root> composed object
425
426							=head2 file
427
428							Path to a biosynML.xml or regions.js file
429
430							=head2 root
431
432							L<Bio::Palantir::Parser::Root> composed object
433
434							=head2 module_delineation
435
436							Module delineation method: generates modules from condensation or selection domains.
437
438							=head1 AUTHOR
439
440							Loic MEUNIER <lmeunier@uliege.be>
441
442							=head1 COPYRIGHT AND LICENSE
443
444							This software is copyright (c) 2019 by University of Liege / Unit of Eukaryotic Phylogenomics / Loic MEUNIER and Denis BAURAIN.
445
446							This is free software; you can redistribute it and/or modify it under
447							the same terms as the Perl 5 programming language system itself.
448
449							=cut