File Coverage

lib/Pheno/Ranker/Compare.pm

Criterion	Covered	Total	%
statement	315	378	83.3
branch	91	130	70.0
condition	34	53	64.1
subroutine	27	33	81.8
pod	0	17	0.0
total	467	611	76.4

line	stmt	bran	cond	sub	pod	time	code
1							package Pheno::Ranker::Compare;
2
3	5			5		32	use strict;
	5					9
	5					198
4	5			5		40	use warnings;
	5					9
	5					350
5	5			5		33	use autodie;
	5					10
	5					45
6	5			5		29438	use feature qw(say);
	5					11
	5					823
7	5			5		41	use List::Util qw(any shuffle first);
	5					9
	5					578
8	5			5		35	use Data::Dumper;
	5					7
	5					373
9	5			5		3952	use Sort::Naturally qw(nsort);
	5					27209
	5					484
10	5			5		2744	use MIME::Base64;
	5					4363
	5					422
11	5			5		3251	use Compress::Zlib qw(compress);
	5					110954
	5					677
12	5			5		3311	use Hash::Fold fold => { array_delimiter => ':' };
	5					210452
	5					45
13	5			5		5172	use Pheno::Ranker::Metrics;
	5					63
	5					1181
14
15	5			5		61	use Exporter 'import';
	5					106
	5					505
16							our @EXPORT =
17							qw(check_format cohort_comparison compare_and_rank create_glob_and_ref_hashes randomize_variables remap_hash create_binary_digit_string parse_hpo_json);
18
19	5			5		78	use constant DEVEL_MODE => 0;
	5					12
	5					43097
20
21							our %nomenclature = ();
22
23							sub check_format {
24	13			13	0	29	my $data = shift;
25	13	100				98	return exists $data->[0]{subject} ? 'PXF' : 'BFF';
26							}
27
28							sub cohort_comparison {
29	14			14	0	41	my ( $ref_binary_hash, $self ) = @_;
30	14					56	my $out_file = $self->{out_file};
31	14					45	my $similarity_metric = $self->{similarity_metric_cohort};
32
33							# Define limit #items for switching to whole matrix calculation
34	14					34	my $max_items = $self->{max_matrix_records_in_ram};
35
36							# Inform about the start of the comparison process
37							say "Performing COHORT comparison"
38	14	50	33			109	if ( $self->{debug} \|\| $self->{verbose} );
39
40							# Define the subroutine to be used
41	14					91	my %similarity_function = (
42							'hamming' => \&hd_fast,
43							'jaccard' => \&jaccard_similarity_formatted
44							);
45
46							# Define values for diagonal elements depending on metric
47	14					50	my %similarity_diagonal = (
48							'hamming' => 0,
49							'jaccard' => 1
50							);
51
52							# Use previous hashes to define stuff
53	14					27	my $metric = $similarity_function{$similarity_metric};
54	14					29	my $similarity_diagonal = $similarity_diagonal{$similarity_metric};
55
56							# Sorting keys of the hash
57	14					23	my @sorted_keys_ref_binary_hash = nsort( keys %{$ref_binary_hash} );
	14					800
58	14					64722	my $num_items = scalar @sorted_keys_ref_binary_hash;
59
60							# Define $switch for going from RAM to all calculations
61	14	50				67	my $switch = $num_items > $max_items ? 1 : 0;
62
63							say "RAM efficient mode is: "
64							. ( $switch ? "on" : "off" )
65							. " (max_matrix_records_in_ram: $max_items)"
66	14	0	33			117	if ( $self->{debug} \|\| $self->{verbose} );
		50
67
68							# Opening file for output
69	14					86	open( my $fh, '>:encoding(UTF-8)', $out_file );
70	14					19037	say $fh "\t", join "\t", @sorted_keys_ref_binary_hash;
71
72							# Initialize matrix for storing similarity
73	14					33	my @matrix;
74
75							# Iterate over items (I elements)
76	14					65	for my $i ( 0 .. $#sorted_keys_ref_binary_hash ) {
77							say "Calculating <"
78							. $sorted_keys_ref_binary_hash[$i]
79							. "> against the cohort..."
80	420	50				1168	if $self->{verbose};
81							my $str1 = $ref_binary_hash->{ $sorted_keys_ref_binary_hash[$i] }
82	420					887	{binary_digit_string_weighted};
83
84							# Print first column (w/o \t)
85	420					922	print $fh $sorted_keys_ref_binary_hash[$i];
86
87							# Iterate for pairwise comparisons (J elements)
88	420					1076	for my $j ( 0 .. $#sorted_keys_ref_binary_hash ) {
89							my $str2 = $ref_binary_hash->{ $sorted_keys_ref_binary_hash[$j] }
90	15028					29755	{binary_digit_string_weighted};
91	15028					30776	my $similarity;
92
93	15028	50				19610	if ($switch) {
94
95							# For large datasets compute upper and lower triange
96							my $str2 =
97							$ref_binary_hash->{ $sorted_keys_ref_binary_hash[$j] }
98	0					0	{binary_digit_string_weighted};
99	0	0				0	$similarity =
100							$i == $j ? $similarity_diagonal : $metric->( $str1, $str2 );
101							}
102							else {
103	15028	100				24111	if ( $i == $j ) {
		100
104
105							# Similarity for diagonal elements
106	420					523	$similarity = $similarity_diagonal;
107							}
108							elsif ( $j > $i ) {
109
110							# Compute similarity for large cohorts or upper triangle
111	7304					11783	$similarity = $metric->( $str1, $str2 );
112	7304					12883	$matrix[$i][$j] = $similarity;
113							}
114							else {
115							# Use precomputed similarity from lower triangle
116	7304					10027	$similarity = $matrix[$j][$i];
117							}
118							}
119
120							# Print a tab before each similarity
121	15028					31766	print $fh "\t", $similarity;
122							}
123
124	420					800	print $fh "\n";
125							}
126
127							# Close the file handle
128	14					115	close $fh;
129
130							# Inform about the completion of the matrix computation
131	14	50	33			26047	say "Matrix saved to <$out_file>" if ( $self->{debug} \|\| $self->{verbose} );
132	14					737	return 1;
133							}
134
135							sub compare_and_rank {
136	2			2	0	6	my $arg = shift;
137	2					6	my $glob_hash = $arg->{glob_hash};
138	2					7	my $ref_hash = $arg->{ref_hash};
139	2					6	my $tar_hash = $arg->{tar_hash};
140	2					5	my $ref_binary_hash = $arg->{ref_binary_hash};
141	2					7	my $tar_binary_hash = $arg->{tar_binary_hash};
142	2					7	my $weight = $arg->{weight};
143	2					6	my $self = $arg->{self};
144	2					8	my $sort_by = $self->{sort_by};
145	2					7	my $align = $self->{align};
146	2					5	my $max_out = $self->{max_out};
147
148							say "Performing COHORT(REF)-PATIENT(TAR) comparison"
149	2	50	33			22	if ( $self->{debug} \|\| $self->{verbose} );
150
151							# Hash for compiling metrics
152	2					6	my $score;
153
154							# Hash for stats
155							my $stat;
156
157							# Load TAR binary string
158	2					5	my ($tar) = keys %{$tar_binary_hash};
	2					7
159							my $tar_str_weighted =
160	2					6	$tar_binary_hash->{$tar}{binary_digit_string_weighted};
161
162							# Load TAR number of vars
163	2					3	my $target_vars = keys %{ $tar_hash->{$tar} };
	2					9
164
165	2					4	for my $key ( keys %{$ref_binary_hash} ) { # No need to sort
	2					29
166
167							# Load REF binary string
168							my $ref_str_weighted =
169	72					184	$ref_binary_hash->{$key}{binary_digit_string_weighted};
170	72	50				151	say "Comparing --- " if $self->{verbose};
171							say "REF:$ref_str_weighted\nTAR:$tar_str_weighted\n"
172	72	50	33			166	if ( defined $self->{debug} && $self->{debug} > 1 );
173
174							# Hamming
175							$score->{$key}{hamming} =
176	72					140	hd_fast( $ref_str_weighted, $tar_str_weighted );
177
178							# Intersect and Jaccard
179	72					157	( $score->{$key}{jaccard}, $score->{$key}{intersect} ) =
180							jaccard_similarity( $ref_str_weighted, $tar_str_weighted );
181
182							# Load REF number of vars
183	72					93	$score->{$key}{reference_vars} = keys %{ $ref_hash->{$key} };
	72					174
184
185							# Add values
186	72					88	push @{ $stat->{hamming_data} }, $score->{$key}{hamming};
	72					138
187	72					80	push @{ $stat->{jaccard_data} }, $score->{$key}{jaccard};
	72					155
188							}
189
190							# Stats are only computed once (no overhead)
191	2					19	$stat->{hamming_stats} = add_stats( $stat->{hamming_data} );
192	2					9	$stat->{jaccard_stats} = add_stats( $stat->{jaccard_data} );
193
194							# Initialize a few variables
195	2					20	my @headers = (
196							'RANK', 'REFERENCE(ID)',
197							'TARGET(ID)', 'FORMAT',
198							'LENGTH', 'WEIGHTED',
199							'HAMMING-DISTANCE', 'DISTANCE-Z-SCORE',
200							'DISTANCE-P-VALUE', 'DISTANCE-Z-SCORE(RAND)',
201							'JACCARD-INDEX', 'JACCARD-Z-SCORE',
202							'JACCARD-P-VALUE', 'REFERENCE-VARS',
203							'TARGET-VARS', 'INTERSECT',
204							'INTERSECT-RATE(%)', 'COMPLETENESS(%)'
205							);
206	2					11	my $header = join "\t", @headers;
207	2					5	my @results = $header;
208	2					3	my %info;
209	2					5	my $length_align = length($tar_str_weighted);
210	2	100				7	my $weight_bool = $weight ? 'True' : 'False';
211	2					5	my @alignments_ascii;
212							my $alignment_str_csv;
213	2					5	my @alignments_csv = join ';',
214							qw/id ref indicator tar weight hamming-distance json-path label/;
215
216							# The dataframe will have two header lines
217							# * IMPORTANT *
218							# nsort does not yield same results as canonical from JSON::XS
219							# NB: we're sorting here and in create_binary_digit_string()
220	2					4	my @sort_keys_glob_hash = sort keys %{$glob_hash};
	2					215
221
222							# Creating @labels array from {id,label}, or guess_label()
223							my @labels =
224	2	100				21	map { exists $nomenclature{$_} ? $nomenclature{$_} : guess_label($_) }
	318					606
225							@sort_keys_glob_hash;
226
227							# Die if #elements in arrays differ
228	2	50				8	die "Mismatch between variables and labels"
229							if @sort_keys_glob_hash != @labels;
230
231							# Labels
232							# NB: there is a comma in 'Serum Glutamic Pyruvic Transaminase, CTCAE'
233	2					49	my @dataframe = join ';', 'Id', @labels;
234
235							# Variables (json path)
236	2					42	push @dataframe, join ';', 'Id', @sort_keys_glob_hash;
237
238							# 0/1
239							push @dataframe, join ';', qq/T\|$tar/,
240	2					90	( split //, $tar_binary_hash->{$tar}{binary_digit_string} ); # Add Target
241
242							# Sort %score by value and load results
243	2					22	my $count = 1;
244	2					5	$max_out++; # to be able to start w/ ONE
245
246							# Start loop
247	2					25	for my $key (
248							sort {
249							$sort_by eq 'jaccard' #
250							? $score->{$b}{$sort_by}
251							<=> $score->{$a}{$sort_by} # 1 to 0 (similarity)
252							: $score->{$a}{$sort_by}
253	284	50				504	<=> $score->{$b}{$sort_by} # 0 to N (distance)
254							} keys %$score
255							)
256							{
257
258	72	50				251	say "$count: Creating alignment " if $self->{verbose};
259
260							# Create ASCII alignment
261							# NB: We need it here to get $n_00
262							my ( $n_00, $alignment_str_ascii, $alignment_arr_csv ) =
263							create_alignment(
264							{
265							ref_key => $key,
266							ref_str_weighted =>
267							$ref_binary_hash->{$key}{binary_digit_string_weighted},
268	72					671	tar_str_weighted => $tar_str_weighted,
269							glob_hash => $glob_hash,
270							sorted_keys_glob_hash => \@sort_keys_glob_hash,
271							labels => \@labels
272							}
273							);
274
275							# * IMPORTANT *
276							# The LENGTH of the alignment is based on the #variables in the REF-COHORT
277							# Compute estimated av and dev for binary_string of L = length_align - n_00
278							# Corrected length_align L = length_align - n_00
279	72					334	my $length_align_corrected = $length_align - $n_00;
280
281							#$estimated_average, $estimated_std_dev
282	72					405	( $stat->{hamming_stats}{mean_rnd}, $stat->{hamming_stats}{sd_rnd} ) =
283							estimate_hamming_stats($length_align_corrected);
284
285							# Compute a few stats
286							my $hamming_z_score = z_score(
287							$score->{$key}{hamming},
288							$stat->{hamming_stats}{mean},
289							$stat->{hamming_stats}{sd}
290	72					443	);
291							my $hamming_z_score_from_random = z_score(
292							$score->{$key}{hamming},
293							$stat->{hamming_stats}{mean_rnd},
294							$stat->{hamming_stats}{sd_rnd}
295	72					236	);
296
297							#my $hamming_p_value =
298							# p_value( $score->{$key}{hamming}, $length_align_corrected );
299	72					235	my $hamming_p_value_from_z_score =
300							p_value_from_z_score($hamming_z_score);
301							my $jaccard_z_score = z_score(
302							$score->{$key}{jaccard},
303							$stat->{jaccard_stats}{mean},
304							$stat->{jaccard_stats}{sd}
305	72					263	);
306	72					150	my $jaccard_p_value_from_z_score =
307							p_value_from_z_score( 1 - $jaccard_z_score );
308
309							# Compute Intersect-Rate (I/T) * 100
310							# Completeness (T/R) * 100
311	72					181	my $reference_vars = $score->{$key}{reference_vars};
312	72					146	my $intersect = $score->{$key}{intersect};
313	72	50				205	my $intersect_rate =
314							( $target_vars == 0 ) ? 0 : ( $intersect / $target_vars ) * 100;
315	72	50				154	my $completeness =
316							( $reference_vars == 0 ) ? 0 : ( $intersect / $reference_vars ) * 100;
317
318							# Create a hash with formats
319							my $format = {
320							'RANK' => { value => $count, format => undef },
321							'REFERENCE(ID)' => { value => $key, format => undef },
322							'TARGET(ID)' => { value => $tar, format => undef },
323							'FORMAT' => { value => $self->{format}, format => undef },
324							'WEIGHTED' => { value => $weight_bool, format => undef },
325							'LENGTH' => { value => $length_align_corrected, format => '%6d' },
326							'HAMMING-DISTANCE' =>
327							{ value => $score->{$key}{hamming}, format => '%4d' },
328							'DISTANCE-Z-SCORE' =>
329							{ value => $hamming_z_score, format => '%7.3f' },
330							'DISTANCE-P-VALUE' =>
331							{ value => $hamming_p_value_from_z_score, format => '%12.7f' },
332							'DISTANCE-Z-SCORE(RAND)' =>
333							{ value => $hamming_z_score_from_random, format => '%8.4f' },
334							'JACCARD-INDEX' =>
335	72					2596	{ value => $score->{$key}{jaccard}, format => '%7.3f' },
336							'JACCARD-Z-SCORE' =>
337							{ value => $jaccard_z_score, format => '%7.3f' },
338							'JACCARD-P-VALUE' =>
339							{ value => $jaccard_p_value_from_z_score, format => '%12.7f' },
340							'REFERENCE-VARS' => { value => $reference_vars, format => '%6d' },
341							'TARGET-VARS' => { value => $target_vars, format => '%6d' },
342							'INTERSECT' => { value => $intersect, format => '%6d' },
343							'INTERSECT-RATE(%)' =>
344							{ value => $intersect_rate, format => '%8.2f' },
345							'COMPLETENESS(%)' => { value => $completeness, format => '%8.2f' }
346							};
347
348							# Serialize results
349							my $tmp_str = join "\t", map {
350	72					277	defined $format->{$_}{format}
351							? sprintf( $format->{$_}{format}, $format->{$_}{value} )
352							: $format->{$_}{value}
353	1296	100				5466	} @headers;
354	72					343	push @results, $tmp_str;
355
356							# To save memory only load if --align
357	72	50				211	if ( defined $align ) {
358
359							# Add all of the above to @alignments_ascii
360	72					171	my $sep = ('-') x 80;
361	72					2334	push @alignments_ascii,
362							qq/#$header\n$tmp_str\n$sep\n$$alignment_str_ascii/;
363
364							# Add all of the above to @alignments_csv
365	72					2718	push @alignments_csv, @$alignment_arr_csv;
366
367							# Add data to dataframe
368							push @dataframe, join ';', qq/R\|$key/,
369	72					3880	( split //, $ref_binary_hash->{$key}{binary_digit_string} );
370
371							# Add values to info
372							$info{$key} = {
373							weighted => $weight_bool eq 'True'
374							? JSON::XS::true
375							: JSON::XS::false,
376							reference_id => $key,
377							target_id => $tar,
378							reference_binary_string =>
379							$ref_binary_hash->{$key}{binary_digit_string},
380							target_binary_string =>
381							$tar_binary_hash->{$key}{binary_digit_string},
382							reference_binary_string_weighted =>
383							$ref_binary_hash->{$key}{binary_digit_string_weighted},
384							target_binary_string_weighted =>
385							$tar_binary_hash->{$key}{binary_digit_string_weighted},
386							alignment_length => $length_align_corrected,
387							hamming_distance => $score->{$key}{hamming},
388							hamming_z_score => $hamming_z_score,
389							hamming_p_value => $hamming_p_value_from_z_score,
390							jaccard_similarity => $score->{$key}{jaccard},
391							jaccard_z_score => $jaccard_z_score,
392							jaccard_p_value => $jaccard_p_value_from_z_score,
393							jaccard_distance => 1 - $score->{$key}{jaccard},
394							format => $self->{format},
395	72	100				1104	alignment => $$alignment_str_ascii,
396							reference_vars => $reference_vars,
397							target_vars => $target_vars,
398							intersect => $intersect,
399							intersect_rate => $intersect_rate,
400							completeness => $completeness
401							};
402
403							}
404
405	72					4302	$count++;
406	72	100				1960	last if $count == $max_out;
407							}
408
409	2					210	return \@results, \%info, \@alignments_ascii, \@dataframe, \@alignments_csv;
410							}
411
412							sub create_alignment {
413
414							# NB: The alignment will use the weighted string
415	72			72	0	144	my $arg = shift;
416	72					143	my $ref_key = $arg->{ref_key};
417	72					117	my $binary_string1 = $arg->{ref_str_weighted};
418	72					150	my $binary_string2 = $arg->{tar_str_weighted};
419	72					110	my $sorted_keys_glob_hash = $arg->{sorted_keys_glob_hash};
420	72					131	my $labels = $arg->{labels};
421	72					94	my $glob_hash = $arg->{glob_hash};
422	72					130	my $length1 = length($binary_string1);
423	72					98	my $length2 = length($binary_string2);
424
425							# Check that l1 = l2
426	72	50				178	die "The binary strings must have the same length"
427							if ( $length1 != $length2 );
428
429							# Expand array to have weights as N-elements
430	72					208	my $recreated_array = recreate_array( $glob_hash, $sorted_keys_glob_hash );
431
432							# Initialize some variables
433	72					154	my $out_ascii = "REF -- TAR\n";
434	72					90	my @out_csv;
435	72					126	my $cum_distance = 0;
436	72					95	my $n_00 = 0;
437
438							# For loop with 2 variables
439							# i index for weighted
440							# j the number of variables
441	72					135	my ( $i, $j ) = ( 0, 0 );
442	72					146	for ( $i = 0 ; $i < $length1 ; $i++, $j++ ) {
443
444							# Load key and value
445	11448					17793	my $key = $recreated_array->[$i];
446	11448					19839	my $val = sprintf( "%3d", $glob_hash->{$key} );
447
448							# We have to keep track with $j
449	11448					14232	my $sorted_key = $sorted_keys_glob_hash->[$j];
450	11448					16106	my $label = $labels->[$j];
451
452							# Load chars
453	11448					15037	my $char1 = substr( $binary_string1, $i, 1 );
454	11448					13522	my $char2 = substr( $binary_string2, $i, 1 );
455	11448	100	100			27128	$n_00++ if ( $char1 == 0 && $char2 == 0 );
456
457							# Correct $i index by adding weights
458	11448					15382	$i = $i + $glob_hash->{$key} - 1;
459
460							# Load metrics
461	11448	100				19047	$cum_distance += $glob_hash->{$key} if $char1 ne $char2;
462	11448					16922	my $cum_distance_pretty = sprintf( "%3d", $cum_distance );
463	11448	100				16965	my $distance = $char1 eq $char2 ? 0 : $glob_hash->{$key};
464	11448					15535	my $distance_pretty = sprintf( "%3d", $distance );
465
466							# w = weight, d = distance, cd = cumul distance
467	11448					27245	my %format = (
468							'11' => '-----',
469							'10' => 'xxx--',
470							'01' => '--xxx',
471							'00' => ' '
472							);
473	11448					22462	$out_ascii .=
474							qq/$char1 $format{ $char1 . $char2 } $char2 \| (w:$val\|d:$distance_pretty\|cd:$cum_distance_pretty\|) $sorted_key ($label)\n/;
475	11448					50917	push @out_csv,
476							qq/$ref_key;$char1;$format{ $char1 . $char2 };$char2;$glob_hash->{$key};$distance;$sorted_key;$label/;
477
478							#REF(107:week_0_arm_1);indicator;TAR(125:week_0_arm_1);weight;hamming-distance;json-path
479							#0;;0;1;0;diseases.ageOfOnset.ageRange.end.iso8601duration.P16Y
480							#0;;0;1;0;diseases.ageOfOnset.ageRange.end.iso8601duration.P24Y
481							#1;-----;1;1;0;diseases.ageOfOnset.ageRange.end.iso8601duration.P39Y
482							}
483	72					1368	return $n_00, \$out_ascii, \@out_csv;
484							}
485
486							sub recreate_array {
487	72			72	0	150	my ( $glob_hash, $sorted_keys_glob_hash ) = @_;
488	72					119	my @recreated_array;
489	72					227	foreach my $key (@$sorted_keys_glob_hash) {
490	11448					20155	for ( my $i = 0 ; $i < $glob_hash->{$key} ; $i++ ) {
491	11736					24269	push @recreated_array, $key;
492							}
493							}
494	72					253	return \@recreated_array;
495							}
496
497							sub create_glob_and_ref_hashes {
498	15			15	0	39	my ( $array, $weight, $self ) = @_;
499	15					34	my $primary_key = $self->{primary_key};
500	15					25	my $glob_hash = {};
501	15					34	my $ref_hash_flattened;
502
503	15					24	my $count = 1;
504	15					31	for my $element ( @{$array} ) {
	15					33
505
506							# For consistency, we obtain the primary_key for both BFF/PXF
507							# from $_->{id} (not from subject.id)
508	456	50				1906	my $id = $element->{$primary_key}
509							or die
510							"Sorry but the JSON document [$count] does not have the primary_key <$primary_key> defined\n";
511
512							# Remapping hash
513	456	50				1561	say "Flattening and remapping ..." if $self->{verbose};
514
515	456					2630	my $ref_hash = remap_hash(
516							{
517							hash => $element,
518							weight => $weight,
519							self => $self
520							}
521							);
522
523							# * IMPORTANT *
524							# We eliminate keys with weight = 0 if defined $weight;
525	456	100				2897	prune_keys_with_weight_zero($ref_hash) if defined $weight;
526
527							# Load big hash ref_hash_flattened
528	456					1935	$ref_hash_flattened->{$id} = $ref_hash;
529
530							# The idea is to create a $glob_hash with unique key-values
531							# Duplicates will be automatically merged
532	456					32139	$glob_hash = { %$glob_hash, %$ref_hash };
533
534							# To keep track of array element indexes
535	456					4341	$count++;
536							}
537	15					112	return ( $glob_hash, $ref_hash_flattened );
538							}
539
540							sub randomize_variables {
541	0			0	0	0	my ( $glob_hash, $self ) = @_;
542	0					0	my $max = $self->{max_number_vars};
543	0					0	my $seed = $self->{seed};
544
545							# set random seed
546	0					0	srand($seed);
547
548							# Randomize
549							# NB:keys have to be sorted for srand to work!!!!
550							# perl -MList::Util=shuffle -E 'srand 123; say shuffle 1 .. 5'
551	0					0	my @items = ( shuffle sort keys %$glob_hash )[ 0 .. $max - 1 ];
552
553							# Create a new hash with a hash slice
554	0					0	my %new_glob_hash;
555	0					0	@new_glob_hash{@items} = @{$glob_hash}{@items};
	0					0
556
557							# return reference
558	0					0	return \%new_glob_hash;
559							}
560
561							sub prune_excluded_included {
562	458			458	0	969	my ( $hash, $self ) = @_;
563	458					618	my @included = @{ $self->{include_terms} };
	458					1323
564	458					688	my @excluded = @{ $self->{exclude_terms} };
	458					1134
565
566							# Die if we have both options at the same time
567	458	50	66			1346	die "Sorry, <--include> and <--exclude> are mutually exclusive\n"
568							if ( @included && @excluded );
569
570							# * IMPORTANT *
571							# Original $hash is modified
572
573							# INCLUDED
574	458	100				1194	if (@included) {
575	25					81	for my $key ( keys %$hash ) {
576	125	100		225		451	delete $hash->{$key} unless any { $_ eq $key } @included;
	225					586
577							}
578							}
579
580							# EXCLUDED
581	458	50				888	if (@excluded) {
582	0					0	for my $key (@excluded) {
583	0	0				0	delete $hash->{$key} if exists $hash->{$key};
584							}
585							}
586
587							# We will do nothing if @included = @excluded = [] (DEFAULT)
588	458					906	return 1;
589							}
590
591							sub set_excluded_phenotypicFeatures {
592	458			458	0	950	my ( $hash, $switch, $format ) = @_;
593
594							# Ensure phenotypicFeatures exist before processing
595	458	100				1199	return 1 unless exists $hash->{phenotypicFeatures};
596
597	320					666	foreach my $feature ( @{ $hash->{phenotypicFeatures} } ) {
	320					916
598
599							# Skip if 'excluded' is not set or false
600	787	100				3358	next unless $feature->{excluded};
601
602							# NB: remaining phenotypicFeatures:1.excluded
603							# will be discarded by $exclude_variables_regex_qr later
604	449	100				3355	if ($switch) {
605
606							# Determine the correct ID field based on the format
607	2	50				8	my $id_field = $format eq 'BFF' ? 'featureType' : 'type';
608
609							# Append '_excluded' to the appropriate ID
610	2					9	$feature->{$id_field}{id} .= '_excluded';
611							}
612							else {
613							# Remove the feature by setting it to undef
614	447					1428	$feature = undef;
615
616							# Due to properties being set to undef, it's possible for the coverage file to
617							# report phenotypicFeatures as 100% by all "excluded" = true
618							}
619							}
620
621	320					618	return 1;
622							}
623
624							sub remap_hash {
625	458			458	0	884	my $arg = shift;
626	458					1048	my $hash = $arg->{hash};
627	458					785	my $weight = $arg->{weight};
628	458					785	my $self = $arg->{self}; # $self from $arg
629	458					843	my $nodes = $self->{nodes};
630	458					767	my $edges = $self->{edges};
631	458					776	my $format = $self->{format};
632	458					1050	my $switch = $self->{retain_excluded_phenotypicFeatures};
633	458					624	my %out_hash;
634
635							# Do some pruning excluded / included
636	458					1803	prune_excluded_included( $hash, $self );
637
638							# * IMPORTANT *
639							# The user may include a term that:
640							# 1 - may not exist in any individual
641							# 2 - does not exist in some individuals
642							# If the term does not exist in a individual
643							# - a) -include-terms contains ANOTHER TERM THAT EXISTS
644							# %$hash will contain keys => OK
645							# - b) -include-terms only includes the term/terms not present
646							# %$hash = 0 , then we return {}, to avoid trouble w/ Fold.pm
647							#print Dumper $hash;
648	458	50				1091	return {} unless %$hash;
649
650							# A bit more pruning plus folding
651							# NB: Hash::Fold keeps white spaces on keys
652							#
653							# Options for 1D-array folding:
654							# A) Array to Hash then Fold
655							# B) Fold then Regex <=== CHOSEN
656							# - Avoids the need for deep cloning
657							# - Works across any JSON data structure (without specific key requirements)
658							# - BUT profiling shows it's ~5-10% slower than 'Array to Hash then Fold'
659							# - Does not accommodate specific remappings like 'interpretations.diagnosis.genomicInterpretations'
660	458					1130	set_excluded_phenotypicFeatures( $hash, $switch, $format );
661	458					2242	$hash = fold($hash);
662
663							# Load the hash that points to the hierarchy for ontology-term-id
664							# * IMPORTANT *
665							# - phenotypicFeatures.featureType.id => BFF
666							# - phenotypicFeatures.type.id => PXF
667	458					3746953	my $id_correspondence = $self->{id_correspondence};
668
669							# Load values for the for loop
670	458					1263	my $exclude_variables_regex_qr = $self->{exclude_variables_regex_qr};
671	458					2264	my $misc_regex_qr =
672							qr/1900-01-01\|NA0000\|NCIT:C126101\|P999Y\|P9999Y\|phenopacket_id/;
673
674							# Pre-compile a list of fixed scalar values to exclude into a hash for quick lookup
675							my %exclude_values =
676	458					1201	map { $_ => 1 }
	2290					5644
677							( 'NA', 'NaN', 'Fake', 'None:No matching concept', 'Not Available' );
678
679							# Now we proceed for each key
680	458					1105	for my $key ( keys %{$hash} ) {
	458					16580
681
682							# To see which ones were discarded
683							#say $key if !defined $hash->{$key};
684
685							# Discard undefined
686	87191	100				160227	next unless defined $hash->{$key};
687
688							# Discarding lines with 'low quality' keys (Time of regex profiled with :NYTProf: ms time)
689							# Some can be "rescued" by adding the ontology as ($1)
690							# NB1: We discard _labels too!!
691							# NB2: info\|metaData are always discarded
692							next
693	84392	100	100			480318	if ( defined $exclude_variables_regex_qr
694							&& $key =~ $exclude_variables_regex_qr );
695
696							# The user can turn on age related values
697							next
698							if ( ( $format eq 'PXF' \|\| $format eq 'BFF' )
699							&& $key =~ m/\.age(?!nt)\|onset/i
700	33663	100	100			301746	&& !$self->{age} ); # $self->{age} [0\|1]
			100
			66
701
702							# Load values
703	32154					49961	my $val = $hash->{$key};
704
705							# Discarding lines with unsupported val (Time profiled with :NYTProf: ms time)
706							next
707							if (
708							( ref($val) eq 'HASH'
709	1					5	&& !keys %{$val} ) # Discard {} (e.g.,subject.vitalStatus: {})
710	600					2194	\|\| ( ref($val) eq 'ARRAY' && !@{$val} ) # Discard []
711	32154	100	66			260457	\|\| exists $exclude_values{$val}
			66
			100
			66
			66
712							\|\| $val =~ $misc_regex_qr
713							);
714
715							# Add IDs to key
716	23564					44956	my $id_key = add_id2key( $key, $hash, $self );
717
718							# Finally add value to id_key
719	23564					38790	my $tmp_key_at_variable_level = $id_key . '.' . $val;
720
721							# Add HPO ascendants
722	23564	50	33			44162	if ( defined $edges && $val =~ /^HP:/ ) {
723	0					0	my $ascendants =
724							add_hpo_ascendants( $tmp_key_at_variable_level, $nodes, $edges );
725	0					0	$out_hash{$_} = 1 for @$ascendants; # weight 1 for now
726							}
727
728							##################
729							# Assign weights #
730							##################
731
732							# NB: mrueda (04-12-23) - it's ok if $weight == undef => NO AUTOVIVIFICATION!
733							# NB: We don't warn if user selection does not exist, just assign 1
734
735	23564					28324	my $tmp_key_at_term_level = $tmp_key_at_variable_level;
736
737							# If variable has . then capture $1
738	23564	50				47624	if ( $tmp_key_at_term_level =~ m/\./ ) {
739
740							# NB: For long str regex is faster than (split /\./, $foo)[0]
741	23564					48510	$tmp_key_at_term_level =~ m/^(\w+)\./;
742	23564					37334	$tmp_key_at_term_level = $1;
743							}
744
745	23564	100				34960	if ( defined $weight ) {
746
747							# * IMPORTANT *
748							# ORDER MATTERS !!!!
749							# We allow for assigning weights by TERM (e.g., 1D)
750							# but VARIABLE level takes precedence to TERM
751
752							$out_hash{$tmp_key_at_variable_level} =
753
754							# VARIABLE LEVEL
755							# NB: exists stringifies the weights
756							exists $weight->{$tmp_key_at_variable_level}
757							? $weight->{$tmp_key_at_variable_level} + 0 # coercing to number
758
759							# TERM LEVEL
760							: exists $weight->{$tmp_key_at_term_level}
761	5292	50				20546	? $weight->{$tmp_key_at_term_level} + 0 # coercing to number
		100
762
763							# NO WEIGHT
764							: 1;
765
766							}
767							else {
768
769							# Assign a weight of 1 if no users weights
770	18272					42492	$out_hash{$tmp_key_at_variable_level} = 1;
771
772							}
773
774							##############
775							# label Hash #
776							##############
777
778							# Finally we load the Nomenclature hash
779	23564					32364	my $label = $key;
780	23564					62246	$label =~ s/id/label/;
781							$nomenclature{$tmp_key_at_variable_level} = $hash->{$label}
782	23564	100				82654	if defined $hash->{$label};
783							}
784
785							# * IMPORTANT *
786							# We have to return an object {} when undef
787	458		50			32853	return \%out_hash // {};
788							}
789
790							sub add_hpo_ascendants {
791	0			0	0	0	my ( $key, $nodes, $edges ) = @_;
792
793							# First we obtain the ontology (0000539) from HP:0000539
794	0					0	$key =~ m/HP:(\w+)$/;
795	0					0	my $ontology = $1;
796
797							# We'll use it to build a string equivalent to a key from $edges
798	0					0	my $hpo_url = 'http://purl.obolibrary.org/obo/HP_';
799	0					0	my $hpo_key = $hpo_url . $ontology;
800
801							# We will include all ascendants in an array
802	0					0	my @ascendants;
803	0					0	for my $parent_id ( @{ $edges->{$hpo_key} } ) {
	0					0
804
805							# We have to create a copy to not modify the original $parent_id
806							# as it can appear in multiple individuals
807	0					0	my $copy_parent_id = $parent_id;
808	0					0	$copy_parent_id =~ m/\/(\w+)$/;
809	0					0	$copy_parent_id = $1;
810	0					0	$copy_parent_id =~ tr/_/:/;
811
812							# * IMPORTANT *
813							# We cannot add any label to the ascendants, otherwise they will
814							# not be matched by an indv down the tree
815							# Myopia
816							# Mild Myopia
817							# We want that 'Mild Myopia' matches 'Myopia', thus we can not add a label from 'Mild Myopia'
818							# Use the labels only for debug
819	0					0	my $asc_key = DEVEL_MODE ? $key . '.HPO_asc_DEBUG_ONLY' : $key;
820	0					0	$asc_key =~ s/HP:$ontology/$copy_parent_id/g;
821	0					0	push @ascendants, $asc_key;
822
823							# We finally add the label to %nomenclature
824	0					0	my $hpo_asc_str = $hpo_url
825							. $copy_parent_id; # 'http://purl.obolibrary.org/obo/HP_HP:0000539
826	0					0	$hpo_asc_str =~ s/HP://; # 0000539
827	0					0	$nomenclature{$asc_key} = $nodes->{$hpo_asc_str}{lbl};
828							}
829	0					0	return \@ascendants;
830							}
831
832							sub add_id2key {
833	23564			23564	0	39470	my ( $key, $hash, $self ) = @_;
834	23564					46568	my $id_correspondence = $self->{id_correspondence}{ $self->{format} };
835	23564					30256	my $array_regex_qr = $self->{array_regex_qr};
836	23564					31696	my $array_terms_regex_qr = $self->{array_terms_regex_qr};
837
838							#############
839							# OBJECTIVE #
840							#############
841
842							# This subroutine is important as it replaces the index (numeric) for a given
843							# array element by a selected ontology. It's done for all subkeys on that element
844
845							#"interventionsOrProcedures" : [
846							# {
847							# "bodySite" : {
848							# "id" : "NCIT:C12736",
849							# "label" : "intestine"
850							# },
851							# "procedureCode" : {
852							# "id" : "NCIT:C157823",
853							# "label" : "Colon Resection"
854							# }
855							# },
856							# {
857							# "bodySite" : {
858							# "id" : "NCIT:C12736",
859							# "label" : "intestine"
860							# },
861							# "procedureCode" : {
862							# "id" : "NCIT:C86074",
863							# "label" : "Hemicolectomy"
864							# }
865							# },
866							#]
867							#
868							# Will become:
869							#
870							#"interventionsOrProcedures.NCIT:C157823.bodySite.id.NCIT:C12736" : 1,
871							#"interventionsOrProcedures.NCIT:C157823.procedureCode.id.NCIT:C157823" : 1,
872							#"interventionsOrProcedures.NCIT:C86074.bodySite.id.NCIT:C12736" : 1,
873							#"interventionsOrProcedures.NCIT:C86074.procedureCode.id.NCIT:C86074" : 1,
874							#
875							# To make the replacement we use $id_correspondence, then we perform a regex
876							# to fetch the key parts
877
878							# Only proceed if $key is one of the array_terms
879	23564	100				97223	if ( $key =~ $array_terms_regex_qr ) {
880
881							# Now we use $array_regex_qr to capture $1, $2 and $3 for BFF/PXF
882							# NB: For others (e.g., MXF) we will have only $1 and $2
883	22358					82526	$key =~ $array_regex_qr;
884
885							#say "$array_regex_qr -- [$key] <$1> <$2> <$3>"; # $3 can be undefined
886
887	22358					28541	my ( $tmp_key, $val );
888
889							# Normal behaviour for BFF/PXF
890	22358	100				44769	if ( defined $3 ) {
891
892							# If id_correspondence is an array (e.g., medicalActions) we have to grep the right match
893	22228					24731	my $correspondence;
894	22228	50				57396	if ( ref $id_correspondence->{$1} eq ref [] ) {
895
896							# $1 $2 $3
897							# <0>
898	0					0	my $subkey = ( split /\./, $3 )[0]; # treatment
899							$correspondence =
900	0			0		0	first { $_ =~ m/^$subkey/ }
901	0					0	@{ $id_correspondence->{$1} }; # treatment.agent.id
	0					0
902							}
903							else {
904	22228					34022	$correspondence = $id_correspondence->{$1};
905							}
906
907							# Now that we know which is the term we use to find key-val in $hash
908	22228					54160	$tmp_key =
909							$1 . ':'
910							. $2 . '.'
911							. $correspondence; # medicalActions.0.treatment.agent.id
912	22228					37742	$val = $hash->{$tmp_key}; # DrugCentral:257
913	22228					53565	$key = join '.', $1, $val, $3
914							; # medicalActions.DrugCentral:257.treatment.routeOfAdministration.id
915							}
916
917							# MXF or similar (...we haven't encountered other regex yet)
918							else {
919
920	130					265	$tmp_key = $1 . ':' . $2;
921	130					139	$val = $hash->{$tmp_key};
922	130					184	$key = $1;
923							}
924							}
925
926							# $key = 'Bar:1' means that we have array but the user either:
927							# a) Made a mistake in the config
928							# b) Is not using the right config file
929							else {
930	1206	50				2947	die
931							"<$1> contains array elements but is not defined as an array in <$self->{config_file}>. Please check your syntax and configuration file.\n"
932							if $key =~ m/^(\w+):/;
933							}
934
935	23564					44595	return $key;
936							}
937
938							sub create_binary_digit_string {
939	17			17	0	53	my ( $export, $weight, $glob_hash, $cmp_hash ) = @_;
940	17					40	my %out_hash;
941
942							# * IMPORTANT *
943							# Being a nested for, keys %{$glob_hash} does not need sorting
944							# BUT, we sort to follow the same order as serialized (sorted)
945	17					42	my @sorted_keys_glob_hash = sort keys %{$glob_hash};
	17					1314
946
947							# IDs of each indidividual
948	17					137	for my $individual_id ( keys %{$cmp_hash} ) { # no need to sort
	17					141
949
950							# One-hot encoding = Representing categorical data as numerical
951	458					966	my ( $binary_str, $binary_str_weighted ) = ('') x 2;
952
953	458					716	for my $key (@sorted_keys_glob_hash) {
954	62270					78325	my $has_value = exists $cmp_hash->{$individual_id}{$key};
955	62270	100				80041	$binary_str .= $has_value ? '1' : '0';
956	62270	100				92453	if ( defined $weight ) {
957							$binary_str_weighted .=
958							$has_value
959							? ( '1' x $glob_hash->{$key} )
960	13396	100				26856	: ( '0' x $glob_hash->{$key} );
961							}
962							}
963
964							# If weight is not defined, simply assign the unweighted string once.
965	458	100				1025	$binary_str_weighted = $binary_str unless defined $weight;
966
967	458					1669	$out_hash{$individual_id}{binary_digit_string} = $binary_str;
968							$out_hash{$individual_id}{binary_digit_string_weighted} =
969	458					809	$binary_str_weighted;
970
971	458	50				922	if ( defined $export ) {
972
973							# Convert string => raw bytes > zlib-compres => Base64
974							$out_hash{$individual_id}{zlib_base64_binary_digit_string} =
975	0					0	binary_to_base64($binary_str);
976
977							$out_hash{$individual_id}{zlib_base64_binary_digit_string_weighted}
978							= defined $weight
979							? binary_to_base64($binary_str_weighted)
980	0	0				0	: $out_hash{$individual_id}{zlib_base64_binary_digit_string};
981							}
982
983							}
984	17					338	return \%out_hash;
985							}
986
987							sub parse_hpo_json {
988	0			0	0	0	my $data = shift;
989
990							# The file is a structured representation of the Human Phenotype Ontology (HPO) in JSON format.
991							# The HPO is structured into a directed acyclic graph (DAG)
992							# Here's a brief overview of the structure of the hpo.json file:
993							# - graphs: This key contains an array of ontology graphs. In the case of HPO, there is only one graph. The graph has two main keys:
994							# - nodes: An array of objects, each representing an HPO term. Each term object has the following keys:
995							# - id: The identifier of the term (e.g., "HP:0000118").
996							# - lbl: The label (name) of the term (e.g., "Phenotypic abnormality").
997							# - meta: Metadata associated with the term, including definition, synonyms, and other information.
998							# - type: The type of the term, usually "CLASS".
999							# - edges: An array of objects, each representing a relationship between two HPO terms. Each edge object has the following keys:
1000							# - sub: The subject (child) term ID (e.g., "HP:0000924").
1001							# - obj: The object (parent) term ID (e.g., "HP:0000118").
1002							# - pred: The predicate that describes the relationship between the subject and object terms, typically "is_a" in HPO.
1003							# - meta: This key contains metadata about the HPO ontology as a whole, such as version information, description, and other details.
1004
1005	0					0	my $graph = $data->{graphs}->[0];
1006	0					0	my %nodes = map { $_->{id} => $_ } @{ $graph->{nodes} };
	0					0
	0					0
1007	0					0	my %edges = ();
1008
1009	0					0	for my $edge ( @{ $graph->{edges} } ) {
	0					0
1010	0					0	my $child_id = $edge->{sub};
1011	0					0	my $parent_id = $edge->{obj};
1012	0					0	push @{ $edges{$child_id} }, $parent_id;
	0					0
1013							}
1014	0					0	return \%nodes, \%edges;
1015							}
1016
1017							sub prune_keys_with_weight_zero {
1018	97			97	0	190	my $hash_ref = shift;
1019
1020							# Iterate over the keys of the hash
1021	97					193	foreach my $key ( keys %{$hash_ref} ) {
	97					1102
1022
1023							# Delete the key if its value is 0
1024	5077	100				8834	delete $hash_ref->{$key} if $hash_ref->{$key} == 0;
1025							}
1026							}
1027
1028							sub guess_label {
1029	72			72	0	95	my $input_string = shift;
1030
1031	72	50				175	if (
1032							$input_string =~ /\. # Match a literal dot
1033							([^\.]+) # Match and capture everything except a dot
1034							$ # Anchor to the end of the string
1035							/x
1036							)
1037							{
1038	72					147	return $1;
1039							}
1040
1041							# If no dot is found, return the original string
1042	0						return $input_string;
1043							}
1044
1045							sub binary_to_base64 {
1046	0			0	0		my $binary_string = shift;
1047
1048							# Convert binary string (e.g. "0010...") to raw bytes
1049	0						my $raw_data = pack( "B*", $binary_string );
1050
1051							# Compress the raw data (note: compressing very short data may not save space)
1052	0						my $compressed = compress($raw_data);
1053
1054							# Base64 encode the compressed data, without any newline breaks
1055	0						return encode_base64( $compressed, "" );
1056							}
1057
1058							sub _base64_to_binary {
1059	0			0			my ( $b64_string, $original_length ) = @_;
1060
1061							# Decode the Base64 encoded compressed data
1062	0						my $compressed_data = decode_base64($b64_string);
1063
1064							# Decompress the data back to raw bytes
1065	0	0					my $raw_data = uncompress($compressed_data)
1066							or die "Decompression failed: $Compress::Zlib::gzerrno\n";
1067
1068							# Convert the raw bytes back into a binary string (sequence of 0s and 1s)
1069	0						my $binary_string = unpack( "B*", $raw_data );
1070
1071							# Trim the binary string to the original length to remove any padded bits
1072	0						return substr( $binary_string, 0, $original_length );
1073							}
1074
1075							1;