File Coverage

lib/Pheno/Ranker/IO.pm

Criterion	Covered	Total	%
statement	184	220	83.6
branch	44	70	62.8
condition	15	23	65.2
subroutine	31	34	91.1
pod	0	16	0.0
total	274	363	75.4

line	stmt	bran	cond	sub	pod	time	code
1							package Pheno::Ranker::IO;
2
3	5			5		42	use strict;
	5					23
	5					185
4	5			5		50	use warnings;
	5					8
	5					270
5	5			5		27	use autodie;
	5					7
	5					43
6	5			5		30618	use feature qw(say);
	5					10
	5					786
7	5			5		4895	use Path::Tiny;
	5					182140
	5					458
8	5			5		74	use File::Basename;
	5					11
	5					444
9	5			5		29	use File::Spec::Functions qw(catdir catfile);
	5					10
	5					335
10	5			5		32	use List::Util qw(any);
	5					8
	5					318
11	5			5		27	use Hash::Util qw(lock_hash);
	5					10
	5					51
12	5			5		4320	use IO::Uncompress::Gunzip qw(gunzip $GunzipError);
	5					251934
	5					782
13	5			5		2811	use YAML::XS qw(Load LoadFile DumpFile);
	5					18373
	5					415
14	5			5		4176	use JSON::XS;
	5					31962
	5					535
15
16							#use Data::Dumper;
17
18							#use Sort::Naturally qw(nsort);
19	5			5		42	use Exporter 'import';
	5					10
	5					311
20							our @EXPORT =
21							qw(serialize_hashes write_alignment io_yaml_or_json read_json read_yaml write_json write_array2txt array2object validate_json write_poi coverage_stats check_existence_of_include_terms append_and_rename_primary_key restructure_pxf_interpretations);
22	5			5		26	use constant DEVEL_MODE => 0;
	5					8
	5					18777
23
24							#########################
25							#########################
26							# SUBROUTINES FOR I/O #
27							#########################
28							#########################
29
30							sub serialize_hashes {
31	0			0	0	0	my $arg = shift;
32	0					0	my $data = $arg->{data};
33	0					0	my $export_basename = $arg->{export_basename};
34							write_json(
35							{ data => $data->{$_}, filepath => qq/$export_basename.$_.json/ } )
36	0					0	for keys %{$data};
	0					0
37	0					0	return 1;
38							}
39
40							sub write_alignment {
41	2			2	0	4	my $arg = shift;
42	2					4	my $basename = $arg->{align};
43	2					3	my $ascii = $arg->{ascii};
44	2					4	my $dataframe = $arg->{dataframe};
45	2					4	my $csv = $arg->{csv};
46	2					11	my %hash = (
47							'.txt' => $ascii,
48							'.csv' => $dataframe,
49							'.target.csv' => $csv
50							);
51
52	2					6	for my $key ( keys %hash ) {
53	6					29	my $output = $basename . $key;
54	6					48	write_array2txt( { filepath => $output, data => $hash{$key} } );
55							}
56	2					44	return 1;
57							}
58
59							sub io_yaml_or_json {
60	18			18	0	34	my $arg = shift;
61	18					42	my $file = $arg->{filepath};
62	18					53	my $mode = $arg->{mode};
63	18	50				58	my $data = $mode eq 'write' ? $arg->{data} : undef;
64
65							# Check if the file is gzipped
66	18	100				80	my $is_gz = $file =~ /\.gz$/ ? 1 : 0;
67
68							# Remove .gz for extension recognition if present
69	18	100				76	my $file_for_ext = $is_gz ? ( $file =~ s/\.gz$//r ) : $file;
70
71							# Allowed extensions
72	18					61	my @exts = qw(.yaml .yml .json);
73
74							# Use fileparse on the file name without the .gz suffix
75	18					1361	my ( undef, undef, $ext ) = fileparse( $file_for_ext, @exts );
76	18					93	my $msg = qq(Can't recognize <$file> extension. Extensions allowed are: )
77							. join ',', @exts;
78	18	50		54		171	die $msg unless any { $_ eq $ext } @exts;
	54					104
79
80							# Unify extension by removing "a" and "."
81	18					111	$ext =~
82							tr/a.//d; # so ".yaml" or ".yml" become "yml" and ".json" becomes "json"
83
84							# Dispatch table for read/write operations
85	18					218	my $return = {
86							read => { json => \&read_json, yml => \&read_yaml },
87							write => { json => \&write_json, yml => \&write_yaml },
88							};
89
90							# Call the appropriate function based on the mode and extension
91							return $mode eq 'read'
92							? $return->{$mode}{$ext}->($file)
93	18	50				115	: $return->{$mode}{$ext}->( { filepath => $file, data => $data } );
94							}
95
96							sub read_json {
97	22			22	0	44	my $file = shift;
98	22					35	my $str;
99	22	100				77	if ( $file =~ /\.gz$/ ) {
100	2	50				17	gunzip $file => \$str
101							or die "gunzip failed for $file: $GunzipError\n";
102							}
103							else {
104	20					110	$str = path($file)->slurp;
105							}
106	22					111777	return decode_json($str);
107							}
108
109							sub read_yaml {
110	24			24	0	58	my $file = shift;
111	24					45	my $data;
112
113							# Check if the file ends with .gz
114	24	50				129	if ( $file =~ /\.gz$/ ) {
115	0					0	my $yaml_str;
116	0	0				0	gunzip $file => \$yaml_str
117							or die "gunzip failed for $file: $GunzipError\n";
118
119							# Decode the YAML from the string
120	0					0	$data = Load($yaml_str);
121							}
122							else {
123							# Directly load from the file
124	24					185	$data = LoadFile($file);
125							}
126	24					7441	return $data;
127							}
128
129							sub write_json {
130	1			1	0	3	my $arg = shift;
131	1					4	my $file = $arg->{filepath};
132	1					3	my $json_data = $arg->{data};
133
134							# Note that canonical DOES not match the order of nsort from Sort::Naturally
135	1					1413	my $json = JSON::XS->new->utf8->canonical->pretty->encode($json_data);
136	1					18	path($file)->spew($json);
137	1					9788	return 1;
138							}
139
140							sub write_yaml {
141	0			0	0	0	my $arg = shift;
142	0					0	my $file = $arg->{filepath};
143	0					0	my $json_data = $arg->{data};
144	0					0	local $YAML::XS::Boolean = 'JSON::PP';
145	0					0	DumpFile( $file, $json_data );
146	0					0	return 1;
147							}
148
149							sub write_array2txt {
150	8			8	0	19	my $arg = shift;
151	8					18	my $file = $arg->{filepath};
152	8					13	my $data = $arg->{data};
153
154							# Watch out for RAM usage!!!
155	8					96	path($file)->spew_utf8( join( "\n", @$data ) . "\n" );
156	8					37655	return 1;
157							}
158
159							sub write_poi {
160	0			0	0	0	my $arg = shift;
161	0					0	my $ref_data = $arg->{ref_data};
162	0					0	my $poi = $arg->{poi};
163	0					0	my $poi_out_dir = $arg->{poi_out_dir};
164	0					0	my $primary_key = $arg->{primary_key};
165	0					0	my $verbose = $arg->{verbose};
166	0					0	for my $name (@$poi) {
167	0					0	my ($match) = grep { $name eq $_->{$primary_key} } @$ref_data;
	0					0
168	0	0				0	if ($match) {
169	0					0	my $out = catfile( $poi_out_dir, "$name.json" );
170	0	0				0	say "Writting <$out>" if $verbose;
171	0					0	write_json( { filepath => $out, data => $match } );
172							}
173							else {
174	0					0	warn
175							"No individual found for <$name>. Are you sure you used the right prefix?\n";
176							}
177							}
178	0					0	return 1;
179							}
180
181							sub array2object {
182	2			2	0	10	my $data = shift;
183	2	50				13	if ( ref $data eq ref [] ) {
184	0					0	my $n = @$data;
185	0	0				0	if ( $n == 1 ) {
186	0					0	$data = $data->[0];
187							}
188							else {
189	0					0	die
190							"Sorry, your file has $n patients but only 1 patient is allowed with <-t>\n";
191							}
192							}
193	2					11	return $data;
194							}
195
196							sub validate_json {
197	18			18	0	33	my $file = shift;
198	18	100	66			191	my $data = ( $file && -f $file ) ? read_yaml($file) : undef;
199
200							# Premature return with undef if the file does not exist
201	18	100				69	return undef unless defined $data; #perlcritic severity 5
202
203							# schema for the weights file
204	5					57	my $schema = {
205							'$schema' => 'http://json-schema.org/draft-07/schema#',
206							'type' => 'object',
207							'patternProperties' => {
208							'^\w+([.:\w]*\w+)?$' => {
209							'type' => 'integer',
210							},
211							},
212							'additionalProperties' => JSON::XS::false,
213							};
214
215							# Load at runtime
216	5					2198	require JSON::Validator;
217
218							# Create object and load schema
219	5					118433	my $jv = JSON::Validator->new;
220
221							# Load schema in object
222	5					114	$jv->schema($schema);
223
224							# Validate data
225	5					35231	my @errors = $jv->validate($data);
226
227							# Show error(s) if any + die
228	5	100				3367	if (@errors) {
229	2					12	my $msg = join "\n", @errors;
230	2					147	die qq/$msg\n/;
231							}
232
233							# Lock config data (keys+values)
234	3					20	lock_hash(%$data);
235
236							# return data if ok
237	3					137	return $data;
238
239							}
240
241							sub coverage_stats {
242	16			16	0	37	my ( $data, $format ) = @_;
243	16					25	my $coverage = {};
244
245	16					42	for my $item (@$data) {
246	456					1197	for my $key ( keys %$item ) {
247
248							# Initialize key in coverage with 0 if not already present
249	2999		100			4112	$coverage->{$key} //= 0;
250
251							# Increment count only if value is not undef, not an empty hash, not an empty array,
252							# and not equal to 'NA' or 'NaN'
253	2999	50	100			11130	unless (
			66
			66
			66
			33
			33
254							!defined $item->{$key}
255	727					3161	\|\| ( ref $item->{$key} eq 'HASH' && !%{ $item->{$key} } )
256	1435					6316	\|\| ( ref $item->{$key} eq 'ARRAY' && !@{ $item->{$key} } )
257							\|\| $item->{$key} eq 'NA' # Check for 'NA'
258							\|\| $item->{$key} eq 'NaN'
259							) # Check for 'NaN'
260							{
261	2675					3489	$coverage->{$key}++;
262							}
263							}
264							}
265							return {
266	16					138	format => $format,
267							cohort_size => scalar @$data,
268							coverage_terms => $coverage
269							};
270							}
271
272							sub check_existence_of_include_terms {
273	16			16	0	35	my ( $coverage, $include_terms ) = @_;
274
275							# Return true if include_terms is empty
276	16	100				73	return 1 unless @$include_terms;
277
278							# Check for the existence of any term in include_terms within coverage
279							# Returns true if any term exists, false otherwise
280	2			2		28	return any { exists $coverage->{coverage_terms}{$_} } @$include_terms;
	2					14
281							}
282
283							sub append_and_rename_primary_key {
284	18			18	0	41	my $arg = shift;
285	18					45	my $ref_data = $arg->{ref_data};
286	18					39	my $append_prefixes = $arg->{append_prefixes};
287	18					34	my $primary_key = $arg->{primary_key};
288
289							# Premature return if @$ref_data == 1 (only 1 cohort)
290							# * IMPORTANT *
291							# $ref_data->[0] can be ARRAY or HASH
292							# We force HASH to be ARRAY
293	18	50				126	return ref $ref_data->[0] eq ref {} ? [ $ref_data->[0] ] : $ref_data->[0]
		100
294							if @$ref_data == 1;
295
296							# Count for prefixes
297	4					9	my $prefix_count = 1;
298
299							# We have to load into a new array data
300							# NB: for is a bit faster than map
301	4					8	my $data;
302	4					12	for my $item (@$ref_data) {
303
304							# Get prefix
305	2	50				8	my $prefix =
306							$append_prefixes->[ $prefix_count - 1 ]
307							? $append_prefixes->[ $prefix_count - 1 ] . '_'
308							: 'C' . $prefix_count . '_';
309
310							# ARRAY
311	2					1	my $item_count = 1;
312	2	100				6	if ( ref $item eq ref [] ) {
313	1					2	for my $individual (@$item) {
314	36					42	my $id = $individual->{$primary_key};
315	36					69	check_null_primary_key(
316							{
317							count => $item_count,
318							primary_key => $primary_key,
319							id => $id,
320							prefix => $prefix
321							}
322							);
323	36					46	$individual->{$primary_key} = $prefix . $id;
324	36					25	push @$data, $individual;
325	36					35	$item_count++;
326							}
327							}
328
329							# Object
330							else {
331
332							# Check if primary_key is defined
333	1					2	my $id = $item->{$primary_key};
334	1					3	check_null_primary_key(
335							{
336							count => $item_count,
337							primary_key => $primary_key,
338							id => $id,
339							prefix => $prefix
340							}
341							);
342	1					2	$item->{$primary_key} = $prefix . $id;
343	1					2	push @$data, $item;
344	1					2	$item_count++;
345							}
346	2					3	$prefix_count++;
347							}
348	4					13	return $data;
349							}
350
351							sub check_null_primary_key {
352	37			37	0	27	my $arg = shift;
353	37					31	my $id = $arg->{id};
354	37					42	my $count = $arg->{count};
355	37					27	my $primary_key = $arg->{primary_key};
356	37					29	my $prefix = $arg->{prefix};
357	37	50				36	die
358							"Sorry but the JSON document ${prefix}[$count] does not have the primary_key <$primary_key> defined\n"
359							unless defined $id;
360	37					26	return 1;
361							}
362
363							sub restructure_pxf_interpretations {
364	17			17	0	41	my ( $data, $self ) = @_;
365
366							# Premature return if the format is not 'PXF'
367	17	100				74	return unless $self->{format} eq 'PXF';
368
369							# Premature return if "interpretations" is excluded
370	2	50				4	return if ( grep { $_ eq 'interpretations' } @{ $self->{exclude_terms} } );
	0					0
	2					9
371
372							say "Restructuring in PXFs..."
373	2	50				8	if defined $self->{verbose};
374
375							# Function to restructure individual interpretation
376							my $restructure_interpretation = sub {
377	2			2		2	my $interpretation = shift;
378	2					5	my $disease_id = $interpretation->{diagnosis}{disease}{id};
379							my $new_interpretation = {
380							progressStatus => $interpretation->{progressStatus},
381	2					4	genomicInterpretations => {}
382							};
383
384	2					3	foreach my $genomic_interpretation (
385	2					3	@{ $interpretation->{diagnosis}{genomicInterpretations} } )
386							{
387	3					4	my $gene_id;
388							my $interpretation_data;
389
390	3	100				7	if ( exists $genomic_interpretation->{variantInterpretation} ) {
		50
391							my $variant_interpretation =
392	2					3	$genomic_interpretation->{variantInterpretation};
393
394							# Check if geneContext with valueId exists
395	2	100				8	if (
		50
396							exists $variant_interpretation->{variationDescriptor}
397							{geneContext}{valueId} )
398							{
399							$gene_id = $variant_interpretation->{variationDescriptor}
400	1					2	{geneContext}{valueId};
401							}
402
403							# Check if id within variationDescriptor exists as an alternative
404							elsif (
405							exists $variant_interpretation->{variationDescriptor}{id} )
406							{
407							$gene_id =
408	1					2	$variant_interpretation->{variationDescriptor}{id};
409							}
410
411	2					3	$interpretation_data = $variant_interpretation;
412							}
413							elsif ( exists $genomic_interpretation->{geneDescriptor} ) {
414	1					2	$gene_id = $genomic_interpretation->{geneDescriptor}{valueId};
415							$interpretation_data =
416	1					2	$genomic_interpretation->{geneDescriptor};
417							}
418
419							$new_interpretation->{genomicInterpretations}{$gene_id} = {
420							interpretationStatus =>
421							$genomic_interpretation->{interpretationStatus},
422							(
423							exists $genomic_interpretation->{variantInterpretation}
424	3	100				12	? ( variantInterpretation => $interpretation_data )
425							: ( geneDescriptor => $interpretation_data )
426							)
427							};
428							}
429
430	2					4	return ( $disease_id, $new_interpretation );
431	2					29	};
432
433							# Helper function to process a data structure
434							my $process_data = sub {
435	4			4		7	my $data = shift;
436	4	100				14	return unless exists $data->{interpretations};
437
438	2					4	my $new_data = {};
439
440	2					2	foreach my $interpretation ( @{ $data->{interpretations} } ) {
	2					3
441	2					4	my ( $disease_id, $new_interpretation ) =
442							$restructure_interpretation->($interpretation);
443	2					5	$new_data->{$disease_id} = $new_interpretation;
444							}
445
446	2					9	$data->{interpretations} = $new_data;
447	2					11	};
448
449							# Process $data if it's an array or a single object
450	2	50				9	if ( ref($data) eq 'ARRAY' ) {
		0
451	2					7	foreach my $entry (@$data) {
452	4	50				14	$process_data->($entry) if ref($entry) eq 'HASH';
453							}
454							}
455							elsif ( ref($data) eq 'HASH' ) {
456	0					0	$process_data->($data);
457							}
458
459	2					27	return 1;
460							}
461