File Coverage

lib/Pheno/Ranker/IO.pm

Criterion	Covered	Total	%
statement	184	220	83.6
branch	44	70	62.8
condition	15	23	65.2
subroutine	31	34	91.1
pod	0	16	0.0
total	274	363	75.4

line	stmt	bran	cond	sub	pod	time	code
1							package Pheno::Ranker::IO;
2
3	5			5		32	use strict;
	5					25
	5					156
4	5			5		13	use warnings;
	5					7
	5					248
5	5			5		47	use autodie;
	5					7
	5					44
6	5			5		18406	use feature qw(say);
	5					8
	5					569
7	5			5		4049	use Path::Tiny;
	5					58917
	5					295
8	5			5		30	use File::Basename;
	5					5
	5					294
9	5			5		23	use File::Spec::Functions qw(catdir catfile);
	5					8
	5					253
10	5			5		22	use List::Util qw(any);
	5					6
	5					230
11	5			5		19	use Hash::Util qw(lock_hash);
	5					7
	5					31
12	5			5		2998	use IO::Uncompress::Gunzip qw(gunzip $GunzipError);
	5					181340
	5					519
13	5			5		2111	use YAML::XS qw(Load LoadFile DumpFile);
	5					13006
	5					295
14	5			5		3192	use JSON::XS;
	5					22368
	5					299
15							#use Data::Dumper;
16
17							#use Sort::Naturally qw(nsort);
18	5			5		28	use Exporter 'import';
	5					5
	5					247
19							our @EXPORT =
20							qw(serialize_hashes write_alignment io_yaml_or_json read_json read_yaml write_json write_array2txt array2object validate_json write_poi coverage_stats check_existence_of_include_terms append_and_rename_primary_key restructure_pxf_interpretations);
21	5			5		18	use constant DEVEL_MODE => 0;
	5					25
	5					13120
22
23							#########################
24							#########################
25							# SUBROUTINES FOR I/O #
26							#########################
27							#########################
28
29							sub serialize_hashes {
30
31	0			0	0	0	my $arg = shift;
32	0					0	my $data = $arg->{data};
33	0					0	my $export_basename = $arg->{export_basename};
34							write_json(
35							{ data => $data->{$_}, filepath => qq/$export_basename.$_.json/ } )
36	0					0	for keys %{$data};
	0					0
37	0					0	return 1;
38							}
39
40							sub write_alignment {
41
42	2			2	0	3	my $arg = shift;
43	2					6	my $basename = $arg->{align};
44	2					3	my $ascii = $arg->{ascii};
45	2					4	my $dataframe = $arg->{dataframe};
46	2					2	my $csv = $arg->{csv};
47	2					9	my %hash = (
48							'.txt' => $ascii,
49							'.csv' => $dataframe,
50							'.target.csv' => $csv
51							);
52
53	2					38	for my $key ( keys %hash ) {
54	6					15	my $output = $basename . $key;
55	6					28	write_array2txt( { filepath => $output, data => $hash{$key} } );
56							}
57	2					14	return 1;
58							}
59
60							sub io_yaml_or_json {
61
62	18			18	0	31	my $arg = shift;
63	18					25	my $file = $arg->{filepath};
64	18					23	my $mode = $arg->{mode};
65	18	50				44	my $data = $mode eq 'write' ? $arg->{data} : undef;
66
67							# Check if the file is gzipped
68	18	100				58	my $is_gz = $file =~ /\.gz$/ ? 1 : 0;
69
70							# Remove .gz for extension recognition if present
71	18	100				40	my $file_for_ext = $is_gz ? ($file =~ s/\.gz$//r) : $file;
72
73							# Allowed extensions
74	18					43	my @exts = qw(.yaml .yml .json);
75
76							# Use fileparse on the file name without the .gz suffix
77	18					952	my ( undef, undef, $ext ) = fileparse( $file_for_ext, @exts );
78	18					70	my $msg = qq(Can't recognize <$file> extension. Extensions allowed are: )
79							. join ',', @exts;
80	18	50		54		170	die $msg unless any { $_ eq $ext } @exts;
	54					96
81
82							# Unify extension by removing "a" and "."
83	18					76	$ext =~ tr/a.//d; # so ".yaml" or ".yml" become "yml" and ".json" becomes "json"
84
85							# Dispatch table for read/write operations
86	18					122	my $return = {
87							read => { json => \&read_json, yml => \&read_yaml },
88							write => { json => \&write_json, yml => \&write_yaml },
89							};
90
91							# Call the appropriate function based on the mode and extension
92							return $mode eq 'read'
93							? $return->{$mode}{$ext}->($file)
94	18	50				69	: $return->{$mode}{$ext}->({ filepath => $file, data => $data });
95							}
96
97							sub read_json {
98
99	21			21	0	24	my $file = shift;
100	21					22	my $str;
101	21	100				56	if ($file =~ /\.gz$/) {
102	2	50				13	gunzip $file => \$str
103							or die "gunzip failed for $file: $GunzipError\n";
104							}
105							else {
106	19					79	$str = path($file)->slurp;
107							}
108	21					78476	return decode_json($str);
109							}
110
111							sub read_yaml {
112	24			24	0	40	my $file = shift;
113	24					30	my $data;
114
115							# Check if the file ends with .gz
116	24	50				77	if ($file =~ /\.gz$/) {
117	0					0	my $yaml_str;
118	0	0				0	gunzip $file => \$yaml_str
119							or die "gunzip failed for $file: $GunzipError\n";
120							# Decode the YAML from the string
121	0					0	$data = Load($yaml_str);
122							}
123							else {
124							# Directly load from the file
125	24					118	$data = LoadFile($file);
126							}
127	24					5365	return $data;
128							}
129
130							sub write_json {
131
132	1			1	0	3	my $arg = shift;
133	1					3	my $file = $arg->{filepath};
134	1					2	my $json_data = $arg->{data};
135
136							# Note that canonical DOES not match the order of nsort from Sort::Naturally
137	1					700	my $json = JSON::XS->new->utf8->canonical->pretty->encode($json_data);
138	1					14	path($file)->spew($json);
139	1					6797	return 1;
140							}
141
142							sub write_yaml {
143
144	0			0	0	0	my $arg = shift;
145	0					0	my $file = $arg->{filepath};
146	0					0	my $json_data = $arg->{data};
147	0					0	local $YAML::XS::Boolean = 'JSON::PP';
148	0					0	DumpFile( $file, $json_data );
149	0					0	return 1;
150							}
151
152							sub write_array2txt {
153
154	8			8	0	10	my $arg = shift;
155	8					13	my $file = $arg->{filepath};
156	8					12	my $data = $arg->{data};
157
158							# Watch out for RAM usage!!!
159	8					30	path($file)->spew_utf8( join( "\n", @$data ) . "\n" );
160	8					29599	return 1;
161							}
162
163							sub write_poi {
164
165	0			0	0	0	my $arg = shift;
166	0					0	my $ref_data = $arg->{ref_data};
167	0					0	my $poi = $arg->{poi};
168	0					0	my $poi_out_dir = $arg->{poi_out_dir};
169	0					0	my $primary_key = $arg->{primary_key};
170	0					0	my $verbose = $arg->{verbose};
171	0					0	for my $name (@$poi) {
172	0					0	my ($match) = grep { $name eq $_->{$primary_key} } @$ref_data;
	0					0
173	0	0				0	if ($match) {
174	0					0	my $out = catfile( $poi_out_dir, "$name.json" );
175	0	0				0	say "Writting <$out>" if $verbose;
176	0					0	write_json( { filepath => $out, data => $match } );
177							}
178							else {
179	0					0	warn
180							"No individual found for <$name>. Are you sure you used the right prefix?\n";
181							}
182							}
183	0					0	return 1;
184							}
185
186							sub array2object {
187
188	2			2	0	5	my $data = shift;
189	2	50				9	if ( ref $data eq ref [] ) {
190	0					0	my $n = @$data;
191	0	0				0	if ( $n == 1 ) {
192	0					0	$data = $data->[0];
193							}
194							else {
195	0					0	die
196							"Sorry, your file has $n patients but only 1 patient is allowed with <-t>\n";
197							}
198							}
199	2					6	return $data;
200							}
201
202							sub validate_json {
203
204	18			18	0	56	my $file = shift;
205	18	100	66			126	my $data = ( $file && -f $file ) ? read_yaml($file) : undef;
206
207							# Premature return with undef if the file does not exist
208	18	100				112	return undef unless defined $data; #perlcritic severity 5
209
210							# schema for the weights file
211	5					46	my $schema = {
212							'$schema' => 'http://json-schema.org/draft-07/schema#',
213							'type' => 'object',
214							'patternProperties' => {
215							'^\w+([.:\w]*\w+)?$' => {
216							'type' => 'integer',
217							},
218							},
219							'additionalProperties' => JSON::XS::false,
220							};
221
222							# Load at runtime
223	5					1560	require JSON::Validator;
224
225							# Create object and load schema
226	5					75652	my $jv = JSON::Validator->new;
227
228							# Load schema in object
229	5					84	$jv->schema($schema);
230
231							# Validate data
232	5					24575	my @errors = $jv->validate($data);
233
234							# Show error(s) if any + die
235	5	100				2215	if (@errors) {
236	2					5	my $msg = join "\n", @errors;
237	2					92	die qq/$msg\n/;
238							}
239
240							# Lock config data (keys+values)
241	3					28	lock_hash(%$data);
242
243							# return data if ok
244	3					102	return $data;
245
246							}
247
248							sub coverage_stats {
249
250	16			16	0	18	my $data = shift;
251	16					22	my $coverage = {};
252
253	16					39	for my $item (@$data) {
254	456					767	for my $key ( keys %$item ) {
255
256							# Initialize key in coverage with 0 if not already present
257	2999		100			3045	$coverage->{$key} //= 0;
258
259							# Increment count only if value is not undef, not an empty hash, not an empty array,
260							# and not equal to 'NA' or 'NaN'
261	2999	50	100			8311	unless (
			66
			66
			66
			33
			33
262							!defined $item->{$key}
263	727					2399	\|\| ( ref $item->{$key} eq 'HASH' && !%{ $item->{$key} } )
264	1435					4673	\|\| ( ref $item->{$key} eq 'ARRAY' && !@{ $item->{$key} } )
265							\|\| $item->{$key} eq 'NA' # Check for 'NA'
266							\|\| $item->{$key} eq 'NaN'
267							) # Check for 'NaN'
268							{
269	2675					2532	$coverage->{$key}++;
270							}
271							}
272							}
273							return {
274	16					84	cohort_size => scalar @$data,
275							coverage_terms => $coverage
276							};
277							}
278
279							sub check_existence_of_include_terms {
280
281	16			16	0	26	my ( $coverage, $include_terms ) = @_;
282
283							# Return true if include_terms is empty
284	16	100				79	return 1 unless @$include_terms;
285
286							# Check for the existence of any term in include_terms within coverage
287							# Returns true if any term exists, false otherwise
288	2			2		13	return any { exists $coverage->{coverage_terms}{$_} } @$include_terms;
	2					8
289							}
290
291							sub append_and_rename_primary_key {
292
293	18			18	0	38	my $arg = shift;
294	18					32	my $ref_data = $arg->{ref_data};
295	18					34	my $append_prefixes = $arg->{append_prefixes};
296	18					27	my $primary_key = $arg->{primary_key};
297
298							# Premature return if @$ref_data == 1 (only 1 cohort)
299							# * IMPORTANT *
300							# $ref_data->[0] can be ARRAY or HASH
301							# We force HASH to be ARRAY
302	18	50				77	return ref $ref_data->[0] eq ref {} ? [ $ref_data->[0] ] : $ref_data->[0]
		100
303							if @$ref_data == 1;
304
305							# Count for prefixes
306	4					7	my $prefix_count = 1;
307
308							# We have to load into a new array data
309							# NB: for is a bit faster than map
310	4					6	my $data;
311	4					7	for my $item (@$ref_data) {
312
313							# Get prefix
314	2	50				6	my $prefix =
315							$append_prefixes->[ $prefix_count - 1 ]
316							? $append_prefixes->[ $prefix_count - 1 ] . '_'
317							: 'C' . $prefix_count . '_';
318
319							# ARRAY
320	2					3	my $item_count = 1;
321	2	100				24	if ( ref $item eq ref [] ) {
322	1					4	for my $individual (@$item) {
323	36					38	my $id = $individual->{$primary_key};
324	36					64	check_null_primary_key(
325							{
326							count => $item_count,
327							primary_key => $primary_key,
328							id => $id,
329							prefix => $prefix
330							}
331							);
332	36					43	$individual->{$primary_key} = $prefix . $id;
333	36					34	push @$data, $individual;
334	36					31	$item_count++;
335							}
336							}
337
338							# Object
339							else {
340
341							# Check if primary_key is defined
342	1					3	my $id = $item->{$primary_key};
343	1					4	check_null_primary_key(
344							{
345							count => $item_count,
346							primary_key => $primary_key,
347							id => $id,
348							prefix => $prefix
349							}
350							);
351	1					3	$item->{$primary_key} = $prefix . $id;
352	1					1	push @$data, $item;
353	1					2	$item_count++;
354							}
355	2					3	$prefix_count++;
356							}
357	4					9	return $data;
358							}
359
360							sub check_null_primary_key {
361
362	37			37	0	24	my $arg = shift;
363	37					36	my $id = $arg->{id};
364	37					29	my $count = $arg->{count};
365	37					22	my $primary_key = $arg->{primary_key};
366	37					27	my $prefix = $arg->{prefix};
367	37	50				39	die
368							"Sorry but the JSON document ${prefix}[$count] does not have the primary_key <$primary_key> defined\n"
369							unless defined $id;
370	37					32	return 1;
371							}
372
373							sub restructure_pxf_interpretations {
374
375	17			17	0	33	my ( $data, $self ) = @_;
376
377							# Premature return if the format is not 'PXF'
378	17	100				53	return unless $self->{format} eq 'PXF';
379
380							# Premature return if "interpretations" is excluded
381	2	50				3	return if (grep { $_ eq 'interpretations' } @{ $self->{exclude_terms} });
	0					0
	2					6
382
383	2	50				6	say "Restructuring in PXFs..." if defined $self->{verbose};
384
385							# Function to restructure individual interpretation
386							my $restructure_interpretation = sub {
387	2			2		3	my $interpretation = shift;
388	2					4	my $disease_id = $interpretation->{diagnosis}{disease}{id};
389							my $new_interpretation = {
390							progressStatus => $interpretation->{progressStatus},
391	2					5	genomicInterpretations => {}
392							};
393
394	2					1	foreach my $genomic_interpretation (
395	2					3	@{ $interpretation->{diagnosis}{genomicInterpretations} } )
396							{
397	3					3	my $gene_id;
398							my $interpretation_data;
399
400	3	100				5	if ( exists $genomic_interpretation->{variantInterpretation} ) {
		50
401							my $variant_interpretation =
402	2					2	$genomic_interpretation->{variantInterpretation};
403
404							# Check if geneContext with valueId exists
405	2	100				10	if (
		50
406							exists $variant_interpretation->{variationDescriptor}
407							{geneContext}{valueId} )
408							{
409							$gene_id = $variant_interpretation->{variationDescriptor}
410	1					2	{geneContext}{valueId};
411							}
412
413							# Check if id within variationDescriptor exists as an alternative
414							elsif (
415							exists $variant_interpretation->{variationDescriptor}{id} )
416							{
417							$gene_id =
418	1					1	$variant_interpretation->{variationDescriptor}{id};
419							}
420
421	2					3	$interpretation_data = $variant_interpretation;
422							}
423							elsif ( exists $genomic_interpretation->{geneDescriptor} ) {
424	1					2	$gene_id = $genomic_interpretation->{geneDescriptor}{valueId};
425							$interpretation_data =
426	1					2	$genomic_interpretation->{geneDescriptor};
427							}
428
429							$new_interpretation->{genomicInterpretations}{$gene_id} = {
430							interpretationStatus =>
431							$genomic_interpretation->{interpretationStatus},
432							(
433							exists $genomic_interpretation->{variantInterpretation}
434	3	100				12	? ( variantInterpretation => $interpretation_data )
435							: ( geneDescriptor => $interpretation_data )
436							)
437							};
438							}
439
440	2					3	return ( $disease_id, $new_interpretation );
441	2					10	};
442
443							# Helper function to process a data structure
444							my $process_data = sub {
445	4			4		3	my $data = shift;
446	4	100				10	return unless exists $data->{interpretations};
447
448	2					1	my $new_data = {};
449
450	2					2	foreach my $interpretation ( @{ $data->{interpretations} } ) {
	2					3
451	2					3	my ( $disease_id, $new_interpretation ) =
452							$restructure_interpretation->($interpretation);
453	2					4	$new_data->{$disease_id} = $new_interpretation;
454							}
455
456	2					8	$data->{interpretations} = $new_data;
457	2					7	};
458
459							# Process $data if it's an array or a single object
460	2	50				19	if ( ref($data) eq 'ARRAY' ) {
		0
461	2					4	foreach my $entry (@$data) {
462	4	50				10	$process_data->($entry) if ref($entry) eq 'HASH';
463							}
464							}
465							elsif ( ref($data) eq 'HASH' ) {
466	0					0	$process_data->($data);
467							}
468
469	2					26	return 1;
470							}
471