File Coverage

lib/Pheno/Ranker/Compare.pm

Criterion	Covered	Total	%
statement	307	359	85.5
branch	86	120	71.6
condition	34	53	64.1
subroutine	25	29	86.2
pod	0	16	0.0
total	452	577	78.3

line	stmt	bran	cond	sub	pod	time	code
1							package Pheno::Ranker::Compare;
2
3	5			5		23	use strict;
	5					6
	5					208
4	5			5		19	use warnings;
	5					7
	5					202
5	5			5		17	use autodie;
	5					5
	5					39
6	5			5		18177	use feature qw(say);
	5					7
	5					576
7	5			5		22	use List::Util qw(any shuffle first);
	5					5
	5					321
8	5			5		20	use Data::Dumper;
	5					8
	5					226
9	5			5		2743	use Sort::Naturally qw(nsort);
	5					19778
	5					362
10	5			5		2158	use Hash::Fold fold => { array_delimiter => ':' };
	5					150445
	5					36
11	5			5		3564	use Pheno::Ranker::Stats;
	5					13
	5					406
12
13	5			5		43	use Exporter 'import';
	5					7
	5					250
14							our @EXPORT =
15							qw(check_format cohort_comparison compare_and_rank create_glob_and_ref_hashes randomize_variables remap_hash create_binary_digit_string parse_hpo_json);
16
17	5			5		18	use constant DEVEL_MODE => 0;
	5					6
	5					24866
18
19							our %nomenclature = ();
20
21							sub check_format {
22
23	12			12	0	19	my $data = shift;
24	12	100				81	return exists $data->[0]{subject} ? 'PXF' : 'BFF';
25							}
26
27							sub cohort_comparison {
28
29	14			14	0	29	my ( $ref_binary_hash, $self ) = @_;
30	14					36	my $out_file = $self->{out_file};
31	14					27	my $similarity_metric = $self->{similarity_metric_cohort};
32
33							# Define limit #items for switching to whole matrix calculation
34	14					22	my $max_items = $self->{max_matrix_items_in_ram};
35
36							# Inform about the start of the comparison process
37							say "Performing COHORT comparison"
38	14	50	33			77	if ( $self->{debug} \|\| $self->{verbose} );
39
40							# Define the subroutine to be used
41	14					68	my %similarity_function = (
42							'hamming' => \&hd_fast,
43							'jaccard' => \&jaccard_similarity_formatted
44							);
45
46							# Define values for diagonal elements depending on metric
47	14					37	my %similarity_diagonal = (
48							'hamming' => 0,
49							'jaccard' => 1
50							);
51
52							# Use previous hashes to define stuff
53	14					21	my $metric = $similarity_function{$similarity_metric};
54	14					18	my $similarity_diagonal = $similarity_diagonal{$similarity_metric};
55
56							# Sorting keys of the hash
57	14					16	my @sorted_keys_ref_binary_hash = nsort( keys %{$ref_binary_hash} );
	14					645
58	14					44441	my $num_items = scalar @sorted_keys_ref_binary_hash;
59
60							# Define $switch for going from RAM to all calculations
61	14	50				43	my $switch = $num_items > $max_items ? 1 : 0;
62
63							say "RAM efficient mode is: "
64							. ( $switch ? "on" : "off" )
65							. " (max_matrix_items_in_ram: $max_items)"
66	14	0	33			81	if ( $self->{debug} \|\| $self->{verbose} );
		50
67
68							# Opening file for output
69	14					90	open( my $fh, '>:encoding(UTF-8)', $out_file );
70	14					14514	say $fh "\t", join "\t", @sorted_keys_ref_binary_hash;
71
72							# Initialize matrix for storing similarity
73	14					23	my @matrix;
74
75							# Iterate over items (I elements)
76	14					43	for my $i ( 0 .. $#sorted_keys_ref_binary_hash ) {
77							say "Calculating <"
78							. $sorted_keys_ref_binary_hash[$i]
79							. "> against the cohort..."
80	420	50				587	if $self->{verbose};
81							my $str1 = $ref_binary_hash->{ $sorted_keys_ref_binary_hash[$i] }
82	420					563	{binary_digit_string_weighted};
83
84							# Print first column (w/o \t)
85	420					498	print $fh $sorted_keys_ref_binary_hash[$i];
86
87							# Iterate for pairwise comparisons (J elements)
88	420					503	for my $j ( 0 .. $#sorted_keys_ref_binary_hash ) {
89							my $str2 = $ref_binary_hash->{ $sorted_keys_ref_binary_hash[$j] }
90	15028					14871	{binary_digit_string_weighted};
91	15028					10250	my $similarity;
92
93	15028	50				12445	if ($switch) {
94
95							# For large datasets compute upper and lower triange
96							my $str2 =
97							$ref_binary_hash->{ $sorted_keys_ref_binary_hash[$j] }
98	0					0	{binary_digit_string_weighted};
99	0	0				0	$similarity =
100							$i == $j ? $similarity_diagonal : $metric->( $str1, $str2 );
101							}
102							else {
103	15028	100				15256	if ( $i == $j ) {
		100
104
105							# Similarity for diagonal elements
106	420					351	$similarity = $similarity_diagonal;
107							}
108							elsif ( $j > $i ) {
109
110							# Compute similarity for large cohorts or upper triangle
111	7304					7124	$similarity = $metric->( $str1, $str2 );
112	7304					7456	$matrix[$i][$j] = $similarity;
113							}
114							else {
115							# Use precomputed similarity from lower triangle
116	7304					6318	$similarity = $matrix[$j][$i];
117							}
118							}
119
120							# Print a tab before each similarity
121	15028					20165	print $fh "\t", $similarity;
122							}
123
124	420					463	print $fh "\n";
125							}
126
127							# Close the file handle
128	14					65	close $fh;
129
130							# Inform about the completion of the matrix computation
131	14	50	33			6758	say "Matrix saved to <$out_file>" if ( $self->{debug} \|\| $self->{verbose} );
132	14					562	return 1;
133							}
134
135							sub compare_and_rank {
136
137	2			2	0	2	my $arg = shift;
138	2					4	my $glob_hash = $arg->{glob_hash};
139	2					4	my $ref_hash = $arg->{ref_hash};
140	2					3	my $tar_hash = $arg->{tar_hash};
141	2					3	my $ref_binary_hash = $arg->{ref_binary_hash};
142	2					4	my $tar_binary_hash = $arg->{tar_binary_hash};
143	2					2	my $weight = $arg->{weight};
144	2					5	my $self = $arg->{self};
145	2					4	my $sort_by = $self->{sort_by};
146	2					4	my $align = $self->{align};
147	2					5	my $max_out = $self->{max_out};
148
149							say "Performing COHORT(REF)-PATIENT(TAR) comparison"
150	2	50	33			12	if ( $self->{debug} \|\| $self->{verbose} );
151
152							# Hash for compiling metrics
153	2					2	my $score;
154
155							# Hash for stats
156							my $stat;
157
158							# Load TAR binary string
159	2					2	my ($tar) = keys %{$tar_binary_hash};
	2					5
160							my $tar_str_weighted =
161	2					3	$tar_binary_hash->{$tar}{binary_digit_string_weighted};
162
163							# Load TAR number of vars
164	2					4	my $target_vars = keys %{ $tar_hash->{$tar} };
	2					4
165
166	2					3	for my $key ( keys %{$ref_binary_hash} ) { # No need to sort
	2					16
167
168							# Load REF binary string
169							my $ref_str_weighted =
170	72					122	$ref_binary_hash->{$key}{binary_digit_string_weighted};
171	72	50				101	say "Comparing --- " if $self->{verbose};
172							say "REF:$ref_str_weighted\nTAR:$tar_str_weighted\n"
173	72	50	33			94	if ( defined $self->{debug} && $self->{debug} > 1 );
174
175							# Hamming
176							$score->{$key}{hamming} =
177	72					107	hd_fast( $ref_str_weighted, $tar_str_weighted );
178
179							# Intersect and Jaccard
180	72					89	( $score->{$key}{jaccard}, $score->{$key}{intersect} ) =
181							jaccard_similarity( $ref_str_weighted, $tar_str_weighted );
182
183							# Load REF number of vars
184	72					66	$score->{$key}{reference_vars} = keys %{ $ref_hash->{$key} };
	72					124
185
186							# Add values
187	72					53	push @{ $stat->{hamming_data} }, $score->{$key}{hamming};
	72					105
188	72					55	push @{ $stat->{jaccard_data} }, $score->{$key}{jaccard};
	72					101
189							}
190	2					14	$stat->{hamming_stats} = add_stats( $stat->{hamming_data} );
191	2					5	$stat->{jaccard_stats} = add_stats( $stat->{jaccard_data} );
192
193							# Initialize a few variables
194	2					118	my @headers = (
195							'RANK', 'REFERENCE(ID)',
196							'TARGET(ID)', 'FORMAT',
197							'LENGTH', 'WEIGHTED',
198							'HAMMING-DISTANCE', 'DISTANCE-Z-SCORE',
199							'DISTANCE-P-VALUE', 'DISTANCE-Z-SCORE(RAND)',
200							'JACCARD-INDEX', 'JACCARD-Z-SCORE',
201							'JACCARD-P-VALUE', 'REFERENCE-VARS',
202							'TARGET-VARS', 'INTERSECT',
203							'INTERSECT-RATE(%)', 'COMPLETENESS(%)'
204							);
205	2					8	my $header = join "\t", @headers;
206	2					4	my @results = $header;
207	2					2	my %info;
208	2					3	my $length_align = length($tar_str_weighted);
209	2	100				5	my $weight_bool = $weight ? 'True' : 'False';
210	2					3	my @alignments_ascii;
211							my $alignment_str_csv;
212	2					5	my @alignments_csv = join ';',
213							qw/id ref indicator tar weight hamming-distance json-path label/;
214
215							# The dataframe will have two header lines
216							# * IMPORTANT *
217							# nsort does not yield same results as canonical from JSON::XS
218							# NB: we're sorting here and in create_binary_digit_string()
219	2					2	my @sort_keys_glob_hash = sort keys %{$glob_hash};
	2					168
220
221							# Creating @labels array from {id,label}, or guess_label()
222							my @labels =
223	2	100				14	map { exists $nomenclature{$_} ? $nomenclature{$_} : guess_label($_) }
	318					417
224							@sort_keys_glob_hash;
225
226							# Die if #elements in arrays differ
227	2	50				6	die "Mismatch between variables and labels"
228							if @sort_keys_glob_hash != @labels;
229
230							# Labels
231							# NB: there is a comma in 'Serum Glutamic Pyruvic Transaminase, CTCAE'
232	2					31	my @dataframe = join ';', 'Id', @labels;
233
234							# Variables (json path)
235	2					26	push @dataframe, join ';', 'Id', @sort_keys_glob_hash;
236
237							# 0/1
238							push @dataframe, join ';', qq/T\|$tar/,
239	2					53	( split //, $tar_binary_hash->{$tar}{binary_digit_string} ); # Add Target
240
241							# Sort %score by value and load results
242	2					14	my $count = 1;
243	2					3	$max_out++; # to be able to start w/ ONE
244
245							# Start loop
246	2					21	for my $key (
247							sort {
248							$sort_by eq 'jaccard' #
249							? $score->{$b}{$sort_by} <=> $score->{$a}{$sort_by} # 1 to 0 (similarity)
250	292	50				367	: $score->{$a}{$sort_by} <=> $score->{$b}{$sort_by} # 0 to N (distance)
251							} keys %$score
252							)
253							{
254
255	72	50				142	say "$count: Creating alignment " if $self->{verbose};
256
257							# Create ASCII alignment
258							# NB: We need it here to get $n_00
259							my ( $n_00, $alignment_str_ascii, $alignment_arr_csv ) =
260							create_alignment(
261							{
262							ref_key => $key,
263							ref_str_weighted =>
264							$ref_binary_hash->{$key}{binary_digit_string_weighted},
265	72					367	tar_str_weighted => $tar_str_weighted,
266							glob_hash => $glob_hash,
267							sorted_keys_glob_hash => \@sort_keys_glob_hash,
268							labels => \@labels
269							}
270							);
271
272							# * IMPORTANT *
273							# The LENGTH of the alignment is based on the #variables in the REF-COHORT
274							# Compute estimated av and dev for binary_string of L = length_align - n_00
275							# Corrected length_align L = length_align - n_00
276	72					174	my $length_align_corrected = $length_align - $n_00;
277
278							#$estimated_average, $estimated_std_dev
279	72					184	( $stat->{hamming_stats}{mean_rnd}, $stat->{hamming_stats}{sd_rnd} ) =
280							estimate_hamming_stats($length_align_corrected);
281
282							# Compute a few stats
283							my $hamming_z_score = z_score(
284							$score->{$key}{hamming},
285							$stat->{hamming_stats}{mean},
286							$stat->{hamming_stats}{sd}
287	72					221	);
288							my $hamming_z_score_from_random = z_score(
289							$score->{$key}{hamming},
290							$stat->{hamming_stats}{mean_rnd},
291							$stat->{hamming_stats}{sd_rnd}
292	72					114	);
293
294							#my $hamming_p_value =
295							# p_value( $score->{$key}{hamming}, $length_align_corrected );
296	72					86	my $hamming_p_value_from_z_score =
297							p_value_from_z_score($hamming_z_score);
298							my $jaccard_z_score = z_score(
299							$score->{$key}{jaccard},
300							$stat->{jaccard_stats}{mean},
301							$stat->{jaccard_stats}{sd}
302	72					134	);
303	72					123	my $jaccard_p_value_from_z_score =
304							p_value_from_z_score( 1 - $jaccard_z_score );
305
306							# Compute Intersect-Rate (I/T) * 100
307							# Completeness (T/R) * 100
308	72					89	my $reference_vars = $score->{$key}{reference_vars};
309	72					87	my $intersect = $score->{$key}{intersect};
310	72	50				109	my $intersect_rate =
311							( $target_vars == 0 ) ? 0 : ( $intersect / $target_vars ) * 100;
312	72	50				88	my $completeness =
313							( $reference_vars == 0 ) ? 0 : ( $intersect / $reference_vars ) * 100;
314
315							# Create a hash with formats
316							my $format = {
317							'RANK' => { value => $count, format => undef },
318							'REFERENCE(ID)' => { value => $key, format => undef },
319							'TARGET(ID)' => { value => $tar, format => undef },
320							'FORMAT' => { value => $self->{format}, format => undef },
321							'WEIGHTED' => { value => $weight_bool, format => undef },
322							'LENGTH' => { value => $length_align_corrected, format => '%6d' },
323							'HAMMING-DISTANCE' =>
324							{ value => $score->{$key}{hamming}, format => '%4d' },
325							'DISTANCE-Z-SCORE' =>
326							{ value => $hamming_z_score, format => '%7.3f' },
327							'DISTANCE-P-VALUE' =>
328							{ value => $hamming_p_value_from_z_score, format => '%12.7f' },
329							'DISTANCE-Z-SCORE(RAND)' =>
330							{ value => $hamming_z_score_from_random, format => '%8.4f' },
331							'JACCARD-INDEX' =>
332	72					1113	{ value => $score->{$key}{jaccard}, format => '%7.3f' },
333							'JACCARD-Z-SCORE' =>
334							{ value => $jaccard_z_score, format => '%7.3f' },
335							'JACCARD-P-VALUE' =>
336							{ value => $jaccard_p_value_from_z_score, format => '%12.7f' },
337							'REFERENCE-VARS' => { value => $reference_vars, format => '%6d' },
338							'TARGET-VARS' => { value => $target_vars, format => '%6d' },
339							'INTERSECT' => { value => $intersect, format => '%6d' },
340							'INTERSECT-RATE(%)' =>
341							{ value => $intersect_rate, format => '%8.2f' },
342							'COMPLETENESS(%)' => { value => $completeness, format => '%8.2f' }
343							};
344
345							# Serialize results
346							my $tmp_str = join "\t", map {
347	72					161	defined $format->{$_}{format}
348							? sprintf( $format->{$_}{format}, $format->{$_}{value} )
349							: $format->{$_}{value}
350	1296	100				2916	} @headers;
351	72					171	push @results, $tmp_str;
352
353							# To save memory only load if --align
354	72	50				108	if ( defined $align ) {
355
356							# Add all of the above to @alignments_ascii
357	72					101	my $sep = ('-') x 80;
358	72					1952	push @alignments_ascii,
359							qq/#$header\n$tmp_str\n$sep\n$$alignment_str_ascii/;
360
361							# Add all of the above to @alignments_csv
362	72					1796	push @alignments_csv, @$alignment_arr_csv;
363
364							# Add data to dataframe
365							push @dataframe, join ';', qq/R\|$key/,
366	72					2054	( split //, $ref_binary_hash->{$key}{binary_digit_string} );
367
368							# Add values to info
369							$info{$key} = {
370							weighted => $weight_bool eq 'True'
371							? JSON::XS::true
372							: JSON::XS::false,
373							reference_id => $key,
374							target_id => $tar,
375							reference_binary_string =>
376							$ref_binary_hash->{$key}{binary_digit_string},
377							target_binary_string =>
378							$tar_binary_hash->{$key}{binary_digit_string},
379							reference_binary_string_weighted =>
380							$ref_binary_hash->{$key}{binary_digit_string_weighted},
381							target_binary_string_weighted =>
382							$tar_binary_hash->{$key}{binary_digit_string_weighted},
383							alignment_length => $length_align_corrected,
384							hamming_distance => $score->{$key}{hamming},
385							hamming_z_score => $hamming_z_score,
386							hamming_p_value => $hamming_p_value_from_z_score,
387							jaccard_similarity => $score->{$key}{jaccard},
388							jaccard_z_score => $jaccard_z_score,
389							jaccard_p_value => $jaccard_p_value_from_z_score,
390							jaccard_distance => 1 - $score->{$key}{jaccard},
391							format => $self->{format},
392	72	100				645	alignment => $$alignment_str_ascii,
393							reference_vars => $reference_vars,
394							target_vars => $target_vars,
395							intersect => $intersect,
396							intersect_rate => $intersect_rate,
397							completeness => $completeness
398							};
399
400							}
401
402	72					2448	$count++;
403	72	100				992	last if $count == $max_out;
404							}
405
406	2					144	return \@results, \%info, \@alignments_ascii, \@dataframe, \@alignments_csv;
407							}
408
409							sub create_alignment {
410
411							# NB: The alignment will use the weighted string
412	72			72	0	73	my $arg = shift;
413	72					76	my $ref_key = $arg->{ref_key};
414	72					74	my $binary_string1 = $arg->{ref_str_weighted};
415	72					61	my $binary_string2 = $arg->{tar_str_weighted};
416	72					70	my $sorted_keys_glob_hash = $arg->{sorted_keys_glob_hash};
417	72					75	my $labels = $arg->{labels};
418	72					70	my $glob_hash = $arg->{glob_hash};
419	72					72	my $length1 = length($binary_string1);
420	72					67	my $length2 = length($binary_string2);
421
422							# Check that l1 = l2
423	72	50				84	die "The binary strings must have the same length"
424							if ( $length1 != $length2 );
425
426							# Expand array to have weights as N-elements
427	72					97	my $recreated_array = recreate_array( $glob_hash, $sorted_keys_glob_hash );
428
429							# Initialize some variables
430	72					70	my $out_ascii = "REF -- TAR\n";
431	72					57	my @out_csv;
432	72					56	my $cum_distance = 0;
433	72					70	my $n_00 = 0;
434
435							# For loop with 2 variables
436							# i index for weighted
437							# j the number of variables
438	72					80	my ( $i, $j ) = ( 0, 0 );
439	72					98	for ( $i = 0 ; $i < $length1 ; $i++, $j++ ) {
440
441							# Load key and value
442	11448					9160	my $key = $recreated_array->[$i];
443	11448					11428	my $val = sprintf( "%3d", $glob_hash->{$key} );
444
445							# We have to keep track with $j
446	11448					8530	my $sorted_key = $sorted_keys_glob_hash->[$j];
447	11448					9195	my $label = $labels->[$j];
448
449							# Load chars
450	11448					9293	my $char1 = substr( $binary_string1, $i, 1 );
451	11448					8885	my $char2 = substr( $binary_string2, $i, 1 );
452	11448	100	100			16593	$n_00++ if ( $char1 == 0 && $char2 == 0 );
453
454							# Correct $i index by adding weights
455	11448					9136	$i = $i + $glob_hash->{$key} - 1;
456
457							# Load metrics
458	11448	100				11351	$cum_distance += $glob_hash->{$key} if $char1 ne $char2;
459	11448					10558	my $cum_distance_pretty = sprintf( "%3d", $cum_distance );
460	11448	100				10714	my $distance = $char1 eq $char2 ? 0 : $glob_hash->{$key};
461	11448					10013	my $distance_pretty = sprintf( "%3d", $distance );
462
463							# w = weight, d = distance, cd = cumul distance
464	11448					15928	my %format = (
465							'11' => '-----',
466							'10' => 'xxx--',
467							'01' => '--xxx',
468							'00' => ' '
469							);
470	11448					13065	$out_ascii .=
471							qq/$char1 $format{ $char1 . $char2 } $char2 \| (w:$val\|d:$distance_pretty\|cd:$cum_distance_pretty\|) $sorted_key ($label)\n/;
472	11448					26739	push @out_csv,
473							qq/$ref_key;$char1;$format{ $char1 . $char2 };$char2;$glob_hash->{$key};$distance;$sorted_key;$label/;
474
475							#REF(107:week_0_arm_1);indicator;TAR(125:week_0_arm_1);weight;hamming-distance;json-path
476							#0;;0;1;0;diseases.ageOfOnset.ageRange.end.iso8601duration.P16Y
477							#0;;0;1;0;diseases.ageOfOnset.ageRange.end.iso8601duration.P24Y
478							#1;-----;1;1;0;diseases.ageOfOnset.ageRange.end.iso8601duration.P39Y
479							}
480	72					579	return $n_00, \$out_ascii, \@out_csv;
481							}
482
483							sub recreate_array {
484
485	72			72	0	108	my ( $glob_hash, $sorted_keys_glob_hash ) = @_;
486	72					72	my @recreated_array;
487	72					100	foreach my $key (@$sorted_keys_glob_hash) {
488	11448					12979	for ( my $i = 0 ; $i < $glob_hash->{$key} ; $i++ ) {
489	11736					15359	push @recreated_array, $key;
490							}
491							}
492	72					115	return \@recreated_array;
493							}
494
495							sub create_glob_and_ref_hashes {
496
497	15			15	0	34	my ( $array, $weight, $self ) = @_;
498	15					29	my $primary_key = $self->{primary_key};
499	15					20	my $glob_hash = {};
500	15					16	my $ref_hash_flattened;
501
502	15					20	my $count = 1;
503	15					16	for my $element ( @{$array} ) {
	15					29
504
505							# For consistency, we obtain the primary_key for both BFF/PXF
506							# from $_->{id} (not from subject.id)
507	456	50				1137	my $id = $element->{$primary_key}
508							or die
509							"Sorry but the JSON document [$count] does not have the primary_key <$primary_key> defined\n";
510
511							# Remapping hash
512	456	50				789	say "Flattening and remapping ..." if $self->{verbose};
513
514	456					1262	my $ref_hash = remap_hash(
515							{
516							hash => $element,
517							weight => $weight,
518							self => $self
519							}
520							);
521
522							# * IMPORTANT *
523							# We eliminate keys with weight = 0 if defined $weight;
524	456	100				1396	prune_keys_with_weight_zero($ref_hash) if defined $weight;
525
526							# Load big hash ref_hash_flattened
527	456					1077	$ref_hash_flattened->{$id} = $ref_hash;
528
529							# The idea is to create a $glob_hash with unique key-values
530							# Duplicates will be automatically merged
531	456					18740	$glob_hash = { %$glob_hash, %$ref_hash };
532
533							# To keep track of array element indexes
534	456					2794	$count++;
535							}
536	15					85	return ( $glob_hash, $ref_hash_flattened );
537							}
538
539							sub randomize_variables {
540
541	0			0	0	0	my ( $glob_hash, $self ) = @_;
542	0					0	my $max = $self->{max_number_vars};
543	0					0	my $seed = $self->{seed};
544
545							# set random seed
546	0					0	srand($seed);
547
548							# Randomize
549							# NB:keys have to be sorted for srand to work!!!!
550							# perl -MList::Util=shuffle -E 'srand 123; say shuffle 1 .. 5'
551	0					0	my @items = ( shuffle sort keys %$glob_hash )[ 0 .. $max - 1 ];
552
553							# Create a new hash with a hash slice
554	0					0	my %new_glob_hash;
555	0					0	@new_glob_hash{@items} = @{$glob_hash}{@items};
	0					0
556
557							# return reference
558	0					0	return \%new_glob_hash;
559							}
560
561							sub prune_excluded_included {
562
563	458			458	0	570	my ( $hash, $self ) = @_;
564	458					391	my @included = @{ $self->{include_terms} };
	458					775
565	458					447	my @excluded = @{ $self->{exclude_terms} };
	458					668
566
567							# Die if we have both options at the same time
568	458	50	66			750	die "Sorry, <--include> and <--exclude> are mutually exclusive\n"
569							if ( @included && @excluded );
570
571							# * IMPORTANT *
572							# Original $hash is modified
573
574							# INCLUDED
575	458	100				558	if (@included) {
576	25					41	for my $key ( keys %$hash ) {
577	125	100		225		197	delete $hash->{$key} unless any { $_ eq $key } @included;
	225					290
578							}
579							}
580
581							# EXCLUDED
582	458	50				537	if (@excluded) {
583	0					0	for my $key (@excluded) {
584	0	0				0	delete $hash->{$key} if exists $hash->{$key};
585							}
586							}
587
588							# We will do nothing if @included = @excluded = [] (DEFAULT)
589	458					551	return 1;
590							}
591
592							sub set_excluded_phenotypicFeatures {
593
594	458			458	0	563	my ( $hash, $switch, $format ) = @_;
595
596							# Ensure phenotypicFeatures exist before processing
597	458	100				782	return 1 unless exists $hash->{phenotypicFeatures};
598
599	320					254	foreach my $feature ( @{ $hash->{phenotypicFeatures} } ) {
	320					647
600
601							# Skip if 'excluded' is not set or false
602	787	100				1916	next unless $feature->{excluded};
603
604							# NB: remaining phenotypicFeatures:1.excluded
605							# will be discarded by $exclude_variables_regex_qr later
606	449	100				2257	if ($switch) {
607
608							# Determine the correct ID field based on the format
609	2	50				5	my $id_field = $format eq 'BFF' ? 'featureType' : 'type';
610
611							# Append '_excluded' to the appropriate ID
612	2					6	$feature->{$id_field}{id} .= '_excluded';
613							}
614							else {
615							# Remove the feature by setting it to undef
616	447					911	$feature = undef;
617
618							# Due to properties being set to undef, it's possible for the coverage file to
619							# report phenotypicFeatures as 100% by all "excluded" = true
620							}
621							}
622
623	320					443	return 1;
624							}
625
626							sub remap_hash {
627
628	458			458	0	494	my $arg = shift;
629	458					458	my $hash = $arg->{hash};
630	458					429	my $weight = $arg->{weight};
631	458					478	my $self = $arg->{self}; # $self from $arg
632	458					491	my $nodes = $self->{nodes};
633	458					443	my $edges = $self->{edges};
634	458					463	my $format = $self->{format};
635	458					461	my $switch = $self->{retain_excluded_phenotypicFeatures};
636	458					375	my $out_hash;
637
638							# Do some pruning excluded / included
639	458					794	prune_excluded_included( $hash, $self );
640
641							# * IMPORTANT *
642							# The user may include a term that:
643							# 1 - may not exist in any individual
644							# 2 - does not exist in some individuals
645							# If the term does not exist in a individual
646							# - a) -include-terms contains ANOTHER TERM THAT EXISTS
647							# %$hash will contain keys => OK
648							# - b) -include-terms only includes the term/terms not present
649							# %$hash = 0 , then we return {}, to avoid trouble w/ Fold.pm
650							#print Dumper $hash;
651	458	50				663	return {} unless %$hash;
652
653							# A bit more pruning plus folding
654							# NB: Hash::Fold keeps white spaces on keys
655							#
656							# Options for 1D-array folding:
657							# A) Array to Hash then Fold
658							# B) Fold then Regex <=== CHOSEN
659							# - Avoids the need for deep cloning
660							# - Works across any JSON data structure (without specific key requirements)
661							# - BUT profiling shows it's ~5-10% slower than 'Array to Hash then Fold'
662							# - Does not accommodate specific remappings like 'interpretations.diagnosis.genomicInterpretations'
663	458					693	set_excluded_phenotypicFeatures( $hash, $switch, $format );
664	458					1160	$hash = fold($hash);
665
666							# Load the hash that points to the hierarchy for ontology-term-id
667							# * IMPORTANT *
668							# - phenotypicFeatures.featureType.id => BFF
669							# - phenotypicFeatures.type.id => PXF
670	458					2469727	my $id_correspondence = $self->{id_correspondence};
671
672							# Load values for the for loop
673	458					631	my $exclude_variables_regex_qr = $self->{exclude_variables_regex_qr};
674	458					1254	my $misc_regex_qr =
675							qr/1900-01-01\|NA0000\|NCIT:C126101\|P999Y\|P9999Y\|phenopacket_id/;
676
677							# Pre-compile a list of fixed scalar values to exclude into a hash for quick lookup
678							my %exclude_values =
679	458					768	map { $_ => 1 }
	2290					3612
680							( 'NA', 'NaN', 'Fake', 'None:No matching concept', 'Not Available' );
681
682							# Now we proceed for each key
683	458					608	for my $key ( keys %{$hash} ) {
	458					10713
684
685							# To see which ones were discarded
686							#say $key if !defined $hash->{$key};
687
688							# Discard undefined
689	87191	100				100931	next unless defined $hash->{$key};
690
691							# Discarding lines with 'low quality' keys (Time of regex profiled with :NYTProf: ms time)
692							# Some can be "rescued" by adding the ontology as ($1)
693							# NB1: We discard _labels too!!
694							# NB2: info\|metaData are always discarded
695							next
696	84392	100	100			312292	if ( defined $exclude_variables_regex_qr
697							&& $key =~ $exclude_variables_regex_qr );
698
699							# The user can turn on age related values
700							next
701							if ( ( $format eq 'PXF' \|\| $format eq 'BFF' )
702							&& $key =~ m/\.age(?!nt)\|onset/i
703	33663	100	100			195585	&& !$self->{age} ); # $self->{age} [0\|1]
			100
			66
704
705							# Load values
706	32154					31018	my $val = $hash->{$key};
707
708							# Discarding lines with unsupported val (Time profiled with :NYTProf: ms time)
709							next
710							if (
711	1					4	( ref($val) eq 'HASH' && !keys %{$val} ) # Discard {} (e.g.,subject.vitalStatus: {})
712	600					1436	\|\| ( ref($val) eq 'ARRAY' && !@{$val} ) # Discard []
713	32154	100	66			168888	\|\| exists $exclude_values{$val}
			66
			100
			66
			66
714							\|\| $val =~ $misc_regex_qr
715							);
716
717							# Add IDs to key
718	23564					27207	my $id_key = add_id2key( $key, $hash, $self );
719
720							# Finally add value to id_key
721	23564					23436	my $tmp_key_at_variable_level = $id_key . '.' . $val;
722
723							# Add HPO ascendants
724	23564	50	33			29247	if ( defined $edges && $val =~ /^HP:/ ) {
725	0					0	my $ascendants =
726							add_hpo_ascendants( $tmp_key_at_variable_level, $nodes, $edges );
727	0					0	$out_hash->{$_} = 1 for @$ascendants; # weight 1 for now
728							}
729
730							##################
731							# Assign weights #
732							##################
733
734							# NB: mrueda (04-12-23) - it's ok if $weight == undef => NO AUTOVIVIFICATION!
735							# NB: We don't warn if user selection does not exist, just assign 1
736
737	23564					19907	my $tmp_key_at_term_level = $tmp_key_at_variable_level;
738
739							# If variable has . then capture $1
740	23564	50				30484	if ( $tmp_key_at_term_level =~ m/\./ ) {
741
742							# NB: For long str regex is faster than (split /\./, $foo)[0]
743	23564					29967	$tmp_key_at_term_level =~ m/^(\w+)\./;
744	23564					23398	$tmp_key_at_term_level = $1;
745							}
746
747	23564	100				23501	if ( defined $weight ) {
748
749							# * IMPORTANT *
750							# ORDER MATTERS !!!!
751							# We allow for assigning weights by TERM (e.g., 1D)
752							# but VARIABLE level takes precedence to TERM
753
754							$out_hash->{$tmp_key_at_variable_level} =
755
756							# VARIABLE LEVEL
757							# NB: exists stringifies the weights
758							exists $weight->{$tmp_key_at_variable_level}
759							? $weight->{$tmp_key_at_variable_level} + 0 # coercing to number
760
761							# TERM LEVEL
762							: exists $weight->{$tmp_key_at_term_level}
763	5292	50				10923	? $weight->{$tmp_key_at_term_level} + 0 # coercing to number
		100
764
765							# NO WEIGHT
766							: 1;
767
768							}
769							else {
770
771							# Assign a weight of 1 if no users weights
772	18272					23385	$out_hash->{$tmp_key_at_variable_level} = 1;
773
774							}
775
776							##############
777							# label Hash #
778							##############
779
780							# Finally we load the Nomenclature hash
781	23564					19753	my $label = $key;
782	23564					36929	$label =~ s/id/label/;
783							$nomenclature{$tmp_key_at_variable_level} = $hash->{$label}
784	23564	100				50096	if defined $hash->{$label};
785							}
786
787							# * IMPORTANT *
788							# We have to return an object {} when undef
789	458		50			17731	return $out_hash // {};
790							}
791
792							sub add_hpo_ascendants {
793
794	0			0	0	0	my ( $key, $nodes, $edges ) = @_;
795
796							# First we obtain the ontology (0000539) from HP:0000539
797	0					0	$key =~ m/HP:(\w+)$/;
798	0					0	my $ontology = $1;
799
800							# We'll use it to build a string equivalent to a key from $edges
801	0					0	my $hpo_url = 'http://purl.obolibrary.org/obo/HP_';
802	0					0	my $hpo_key = $hpo_url . $ontology;
803
804							# We will include all ascendants in an array
805	0					0	my @ascendants;
806	0					0	for my $parent_id ( @{ $edges->{$hpo_key} } ) {
	0					0
807
808							# We have to create a copy to not modify the original $parent_id
809							# as it can appear in multiple individuals
810	0					0	my $copy_parent_id = $parent_id;
811	0					0	$copy_parent_id =~ m/\/(\w+)$/;
812	0					0	$copy_parent_id = $1;
813	0					0	$copy_parent_id =~ tr/_/:/;
814
815							# * IMPORTANT *
816							# We cannot add any label to the ascendants, otherwise they will
817							# not be matched by an indv down the tree
818							# Myopia
819							# Mild Myopia
820							# We want that 'Mild Myopia' matches 'Myopia', thus we can not add a label from 'Mild Myopia'
821							# Use the labels only for debug
822	0					0	my $asc_key = DEVEL_MODE ? $key . '.HPO_asc_DEBUG_ONLY' : $key;
823	0					0	$asc_key =~ s/HP:$ontology/$copy_parent_id/g;
824	0					0	push @ascendants, $asc_key;
825
826							# We finally add the label to %nomenclature
827	0					0	my $hpo_asc_str = $hpo_url . $copy_parent_id; # 'http://purl.obolibrary.org/obo/HP_HP:0000539
828	0					0	$hpo_asc_str =~ s/HP://; # 0000539
829	0					0	$nomenclature{$asc_key} = $nodes->{$hpo_asc_str}{lbl};
830							}
831	0					0	return \@ascendants;
832							}
833
834							sub add_id2key {
835
836	23564			23564	0	25658	my ( $key, $hash, $self ) = @_;
837	23564					26442	my $id_correspondence = $self->{id_correspondence}{ $self->{format} };
838	23564					19969	my $array_regex_qr = $self->{array_regex_qr};
839	23564					19230	my $array_terms_regex_qr = $self->{array_terms_regex_qr};
840
841							#############
842							# OBJECTIVE #
843							#############
844
845							# This subroutine is important as it replaces the index (numeric) for a given
846							# array element by a selected ontology. It's done for all subkeys on that element
847
848							#"interventionsOrProcedures" : [
849							# {
850							# "bodySite" : {
851							# "id" : "NCIT:C12736",
852							# "label" : "intestine"
853							# },
854							# "procedureCode" : {
855							# "id" : "NCIT:C157823",
856							# "label" : "Colon Resection"
857							# }
858							# },
859							# {
860							# "bodySite" : {
861							# "id" : "NCIT:C12736",
862							# "label" : "intestine"
863							# },
864							# "procedureCode" : {
865							# "id" : "NCIT:C86074",
866							# "label" : "Hemicolectomy"
867							# }
868							# },
869							#]
870							#
871							# Will become:
872							#
873							#"interventionsOrProcedures.NCIT:C157823.bodySite.id.NCIT:C12736" : 1,
874							#"interventionsOrProcedures.NCIT:C157823.procedureCode.id.NCIT:C157823" : 1,
875							#"interventionsOrProcedures.NCIT:C86074.bodySite.id.NCIT:C12736" : 1,
876							#"interventionsOrProcedures.NCIT:C86074.procedureCode.id.NCIT:C86074" : 1,
877							#
878							# To make the replacement we use $id_correspondence, then we perform a regex
879							# to fetch the key parts
880
881							# Only proceed if $key is one of the array_terms
882	23564	100				52398	if ( $key =~ $array_terms_regex_qr ) {
883
884							# Now we use $array_regex_qr to capture $1, $2 and $3 for BFF/PXF
885							# NB: For others (e.g., MXF) we will have only $1 and $2
886	22358					48296	$key =~ $array_regex_qr;
887
888							#say "$array_regex_qr -- [$key] <$1> <$2> <$3>"; # $3 can be undefined
889
890	22358					19020	my ( $tmp_key, $val );
891
892							# Normal behaviour for BFF/PXF
893	22358	100				26540	if ( defined $3 ) {
894
895							# If id_correspondence is an array (e.g., medicalActions) we have to grep the right match
896	22228					16258	my $correspondence;
897	22228	50				32197	if ( ref $id_correspondence->{$1} eq ref [] ) {
898
899							# $1 $2 $3
900							# <0>
901	0					0	my $subkey = ( split /\./, $3 )[0]; # treatment
902							$correspondence =
903	0			0		0	first { $_ =~ m/^$subkey/ } @{ $id_correspondence->{$1} }; # treatment.agent.id
	0					0
	0					0
904							}
905							else {
906	22228					21527	$correspondence = $id_correspondence->{$1};
907							}
908
909							# Now that we know which is the term we use to find key-val in $hash
910	22228					32410	$tmp_key = $1 . ':' . $2 . '.' . $correspondence; # medicalActions.0.treatment.agent.id
911	22228					22853	$val = $hash->{$tmp_key}; # DrugCentral:257
912	22228					33279	$key = join '.', $1, $val, $3; # medicalActions.DrugCentral:257.treatment.routeOfAdministration.id
913							}
914
915							# MXF or similar (...we haven't encountered other regex yet)
916							else {
917
918	130					223	$tmp_key = $1 . ':' . $2;
919	130					126	$val = $hash->{$tmp_key};
920	130					139	$key = $1;
921							}
922							}
923
924							# $key = 'Bar:1' means that we have array but the user either:
925							# a) Made a mistake in the config
926							# b) Is not using the right config file
927							else {
928	1206	50				1770	die
929							"<$1> contains array elements but is not defined as an array in <$self->{config_file}>. Please check your syntax and configuration file.\n"
930							if $key =~ m/^(\w+):/;
931							}
932
933	23564					29309	return $key;
934							}
935
936							sub create_binary_digit_string {
937
938	17			17	0	36	my ( $glob_hash, $cmp_hash ) = @_;
939	17					40	my $out_hash;
940
941							# * IMPORTANT *
942							# Being a nested for, keys %{$glob_hash} does not need sorting
943							# BUT, we sort to follow the same order as serialized (sorted)
944	17					25	my @sorted_keys_glob_hash = sort keys %{$glob_hash};
	17					998
945
946							# IDs of each indidividual
947	17					88	for my $individual_id ( keys %{$cmp_hash} ) { # no need to sort
	17					94
948
949							# One-hot encoding = Representing categorical data as numerical
950	458					518	my ( $binary_str, $binary_str_weighted ) = ('') x 2;
951	458					408	for my $key (@sorted_keys_glob_hash) {
952	62270					55502	my $ones = (1) x $glob_hash->{$key};
953	62270					52977	my $zeros = (0) x $glob_hash->{$key};
954	62270	100				64465	$binary_str .= exists $cmp_hash->{$individual_id}{$key} ? 1 : 0;
955							$binary_str_weighted .=
956	62270	100				68244	exists $cmp_hash->{$individual_id}{$key} ? $ones : $zeros;
957							}
958	458					785	$out_hash->{$individual_id}{binary_digit_string} = $binary_str;
959							$out_hash->{$individual_id}{binary_digit_string_weighted} =
960	458					554	$binary_str_weighted;
961							}
962	17					163	return $out_hash;
963							}
964
965							sub parse_hpo_json {
966
967	0			0	0	0	my $data = shift;
968
969							# The file is a structured representation of the Human Phenotype Ontology (HPO) in JSON format.
970							# The HPO is structured into a directed acyclic graph (DAG)
971							# Here's a brief overview of the structure of the hpo.json file:
972							# - graphs: This key contains an array of ontology graphs. In the case of HPO, there is only one graph. The graph has two main keys:
973							# - nodes: An array of objects, each representing an HPO term. Each term object has the following keys:
974							# - id: The identifier of the term (e.g., "HP:0000118").
975							# - lbl: The label (name) of the term (e.g., "Phenotypic abnormality").
976							# - meta: Metadata associated with the term, including definition, synonyms, and other information.
977							# - type: The type of the term, usually "CLASS".
978							# - edges: An array of objects, each representing a relationship between two HPO terms. Each edge object has the following keys:
979							# - sub: The subject (child) term ID (e.g., "HP:0000924").
980							# - obj: The object (parent) term ID (e.g., "HP:0000118").
981							# - pred: The predicate that describes the relationship between the subject and object terms, typically "is_a" in HPO.
982							# - meta: This key contains metadata about the HPO ontology as a whole, such as version information, description, and other details.
983
984	0					0	my $graph = $data->{graphs}->[0];
985	0					0	my %nodes = map { $_->{id} => $_ } @{ $graph->{nodes} };
	0					0
	0					0
986	0					0	my %edges = ();
987
988	0					0	for my $edge ( @{ $graph->{edges} } ) {
	0					0
989	0					0	my $child_id = $edge->{sub};
990	0					0	my $parent_id = $edge->{obj};
991	0					0	push @{ $edges{$child_id} }, $parent_id;
	0					0
992							}
993	0					0	return \%nodes, \%edges;
994							}
995
996							sub prune_keys_with_weight_zero {
997
998	97			97	0	104	my $hash_ref = shift;
999
1000							# Iterate over the keys of the hash
1001	97					105	foreach my $key ( keys %{$hash_ref} ) {
	97					694
1002
1003							# Delete the key if its value is 0
1004	5077	100				5608	delete $hash_ref->{$key} if $hash_ref->{$key} == 0;
1005							}
1006							}
1007
1008							sub guess_label {
1009
1010	72			72	0	56	my $input_string = shift;
1011
1012	72	50				117	if (
1013							$input_string =~ /\. # Match a literal dot
1014							([^\.]+) # Match and capture everything except a dot
1015							$ # Anchor to the end of the string
1016							/x
1017							)
1018							{
1019	72					121	return $1;
1020							}
1021
1022							# If no dot is found, return the original string
1023	0						return $input_string;
1024							}
1025
1026							1;