File Coverage

lib/Pheno/Ranker/Compare.pm
Criterion Covered Total %
statement 307 359 85.5
branch 86 120 71.6
condition 34 53 64.1
subroutine 25 29 86.2
pod 0 16 0.0
total 452 577 78.3


line stmt bran cond sub pod time code
1             package Pheno::Ranker::Compare;
2              
3 5     5   23 use strict;
  5         6  
  5         208  
4 5     5   19 use warnings;
  5         7  
  5         202  
5 5     5   17 use autodie;
  5         5  
  5         39  
6 5     5   18177 use feature qw(say);
  5         7  
  5         576  
7 5     5   22 use List::Util qw(any shuffle first);
  5         5  
  5         321  
8 5     5   20 use Data::Dumper;
  5         8  
  5         226  
9 5     5   2743 use Sort::Naturally qw(nsort);
  5         19778  
  5         362  
10 5     5   2158 use Hash::Fold fold => { array_delimiter => ':' };
  5         150445  
  5         36  
11 5     5   3564 use Pheno::Ranker::Stats;
  5         13  
  5         406  
12              
13 5     5   43 use Exporter 'import';
  5         7  
  5         250  
14             our @EXPORT =
15             qw(check_format cohort_comparison compare_and_rank create_glob_and_ref_hashes randomize_variables remap_hash create_binary_digit_string parse_hpo_json);
16              
17 5     5   18 use constant DEVEL_MODE => 0;
  5         6  
  5         24866  
18              
19             our %nomenclature = ();
20              
21             sub check_format {
22              
23 12     12 0 19 my $data = shift;
24 12 100       81 return exists $data->[0]{subject} ? 'PXF' : 'BFF';
25             }
26              
27             sub cohort_comparison {
28              
29 14     14 0 29 my ( $ref_binary_hash, $self ) = @_;
30 14         36 my $out_file = $self->{out_file};
31 14         27 my $similarity_metric = $self->{similarity_metric_cohort};
32              
33             # Define limit #items for switching to whole matrix calculation
34 14         22 my $max_items = $self->{max_matrix_items_in_ram};
35              
36             # Inform about the start of the comparison process
37             say "Performing COHORT comparison"
38 14 50 33     77 if ( $self->{debug} || $self->{verbose} );
39              
40             # Define the subroutine to be used
41 14         68 my %similarity_function = (
42             'hamming' => \&hd_fast,
43             'jaccard' => \&jaccard_similarity_formatted
44             );
45              
46             # Define values for diagonal elements depending on metric
47 14         37 my %similarity_diagonal = (
48             'hamming' => 0,
49             'jaccard' => 1
50             );
51              
52             # Use previous hashes to define stuff
53 14         21 my $metric = $similarity_function{$similarity_metric};
54 14         18 my $similarity_diagonal = $similarity_diagonal{$similarity_metric};
55              
56             # Sorting keys of the hash
57 14         16 my @sorted_keys_ref_binary_hash = nsort( keys %{$ref_binary_hash} );
  14         645  
58 14         44441 my $num_items = scalar @sorted_keys_ref_binary_hash;
59              
60             # Define $switch for going from RAM to all calculations
61 14 50       43 my $switch = $num_items > $max_items ? 1 : 0;
62              
63             say "RAM efficient mode is: "
64             . ( $switch ? "on" : "off" )
65             . " (max_matrix_items_in_ram: $max_items)"
66 14 0 33     81 if ( $self->{debug} || $self->{verbose} );
    50          
67              
68             # Opening file for output
69 14         90 open( my $fh, '>:encoding(UTF-8)', $out_file );
70 14         14514 say $fh "\t", join "\t", @sorted_keys_ref_binary_hash;
71              
72             # Initialize matrix for storing similarity
73 14         23 my @matrix;
74              
75             # Iterate over items (I elements)
76 14         43 for my $i ( 0 .. $#sorted_keys_ref_binary_hash ) {
77             say "Calculating <"
78             . $sorted_keys_ref_binary_hash[$i]
79             . "> against the cohort..."
80 420 50       587 if $self->{verbose};
81             my $str1 = $ref_binary_hash->{ $sorted_keys_ref_binary_hash[$i] }
82 420         563 {binary_digit_string_weighted};
83              
84             # Print first column (w/o \t)
85 420         498 print $fh $sorted_keys_ref_binary_hash[$i];
86              
87             # Iterate for pairwise comparisons (J elements)
88 420         503 for my $j ( 0 .. $#sorted_keys_ref_binary_hash ) {
89             my $str2 = $ref_binary_hash->{ $sorted_keys_ref_binary_hash[$j] }
90 15028         14871 {binary_digit_string_weighted};
91 15028         10250 my $similarity;
92              
93 15028 50       12445 if ($switch) {
94              
95             # For large datasets compute upper and lower triange
96             my $str2 =
97             $ref_binary_hash->{ $sorted_keys_ref_binary_hash[$j] }
98 0         0 {binary_digit_string_weighted};
99 0 0       0 $similarity =
100             $i == $j ? $similarity_diagonal : $metric->( $str1, $str2 );
101             }
102             else {
103 15028 100       15256 if ( $i == $j ) {
    100          
104              
105             # Similarity for diagonal elements
106 420         351 $similarity = $similarity_diagonal;
107             }
108             elsif ( $j > $i ) {
109              
110             # Compute similarity for large cohorts or upper triangle
111 7304         7124 $similarity = $metric->( $str1, $str2 );
112 7304         7456 $matrix[$i][$j] = $similarity;
113             }
114             else {
115             # Use precomputed similarity from lower triangle
116 7304         6318 $similarity = $matrix[$j][$i];
117             }
118             }
119              
120             # Print a tab before each similarity
121 15028         20165 print $fh "\t", $similarity;
122             }
123              
124 420         463 print $fh "\n";
125             }
126              
127             # Close the file handle
128 14         65 close $fh;
129              
130             # Inform about the completion of the matrix computation
131 14 50 33     6758 say "Matrix saved to <$out_file>" if ( $self->{debug} || $self->{verbose} );
132 14         562 return 1;
133             }
134              
135             sub compare_and_rank {
136              
137 2     2 0 2 my $arg = shift;
138 2         4 my $glob_hash = $arg->{glob_hash};
139 2         4 my $ref_hash = $arg->{ref_hash};
140 2         3 my $tar_hash = $arg->{tar_hash};
141 2         3 my $ref_binary_hash = $arg->{ref_binary_hash};
142 2         4 my $tar_binary_hash = $arg->{tar_binary_hash};
143 2         2 my $weight = $arg->{weight};
144 2         5 my $self = $arg->{self};
145 2         4 my $sort_by = $self->{sort_by};
146 2         4 my $align = $self->{align};
147 2         5 my $max_out = $self->{max_out};
148              
149             say "Performing COHORT(REF)-PATIENT(TAR) comparison"
150 2 50 33     12 if ( $self->{debug} || $self->{verbose} );
151              
152             # Hash for compiling metrics
153 2         2 my $score;
154              
155             # Hash for stats
156             my $stat;
157              
158             # Load TAR binary string
159 2         2 my ($tar) = keys %{$tar_binary_hash};
  2         5  
160             my $tar_str_weighted =
161 2         3 $tar_binary_hash->{$tar}{binary_digit_string_weighted};
162              
163             # Load TAR number of vars
164 2         4 my $target_vars = keys %{ $tar_hash->{$tar} };
  2         4  
165              
166 2         3 for my $key ( keys %{$ref_binary_hash} ) { # No need to sort
  2         16  
167              
168             # Load REF binary string
169             my $ref_str_weighted =
170 72         122 $ref_binary_hash->{$key}{binary_digit_string_weighted};
171 72 50       101 say "Comparing --- " if $self->{verbose};
172             say "REF:$ref_str_weighted\nTAR:$tar_str_weighted\n"
173 72 50 33     94 if ( defined $self->{debug} && $self->{debug} > 1 );
174              
175             # Hamming
176             $score->{$key}{hamming} =
177 72         107 hd_fast( $ref_str_weighted, $tar_str_weighted );
178              
179             # Intersect and Jaccard
180 72         89 ( $score->{$key}{jaccard}, $score->{$key}{intersect} ) =
181             jaccard_similarity( $ref_str_weighted, $tar_str_weighted );
182              
183             # Load REF number of vars
184 72         66 $score->{$key}{reference_vars} = keys %{ $ref_hash->{$key} };
  72         124  
185              
186             # Add values
187 72         53 push @{ $stat->{hamming_data} }, $score->{$key}{hamming};
  72         105  
188 72         55 push @{ $stat->{jaccard_data} }, $score->{$key}{jaccard};
  72         101  
189             }
190 2         14 $stat->{hamming_stats} = add_stats( $stat->{hamming_data} );
191 2         5 $stat->{jaccard_stats} = add_stats( $stat->{jaccard_data} );
192              
193             # Initialize a few variables
194 2         118 my @headers = (
195             'RANK', 'REFERENCE(ID)',
196             'TARGET(ID)', 'FORMAT',
197             'LENGTH', 'WEIGHTED',
198             'HAMMING-DISTANCE', 'DISTANCE-Z-SCORE',
199             'DISTANCE-P-VALUE', 'DISTANCE-Z-SCORE(RAND)',
200             'JACCARD-INDEX', 'JACCARD-Z-SCORE',
201             'JACCARD-P-VALUE', 'REFERENCE-VARS',
202             'TARGET-VARS', 'INTERSECT',
203             'INTERSECT-RATE(%)', 'COMPLETENESS(%)'
204             );
205 2         8 my $header = join "\t", @headers;
206 2         4 my @results = $header;
207 2         2 my %info;
208 2         3 my $length_align = length($tar_str_weighted);
209 2 100       5 my $weight_bool = $weight ? 'True' : 'False';
210 2         3 my @alignments_ascii;
211             my $alignment_str_csv;
212 2         5 my @alignments_csv = join ';',
213             qw/id ref indicator tar weight hamming-distance json-path label/;
214              
215             # The dataframe will have two header lines
216             # *** IMPORTANT ***
217             # nsort does not yield same results as canonical from JSON::XS
218             # NB: we're sorting here and in create_binary_digit_string()
219 2         2 my @sort_keys_glob_hash = sort keys %{$glob_hash};
  2         168  
220              
221             # Creating @labels array from {id,label}, or guess_label()
222             my @labels =
223 2 100       14 map { exists $nomenclature{$_} ? $nomenclature{$_} : guess_label($_) }
  318         417  
224             @sort_keys_glob_hash;
225              
226             # Die if #elements in arrays differ
227 2 50       6 die "Mismatch between variables and labels"
228             if @sort_keys_glob_hash != @labels;
229              
230             # Labels
231             # NB: there is a comma in 'Serum Glutamic Pyruvic Transaminase, CTCAE'
232 2         31 my @dataframe = join ';', 'Id', @labels;
233              
234             # Variables (json path)
235 2         26 push @dataframe, join ';', 'Id', @sort_keys_glob_hash;
236              
237             # 0/1
238             push @dataframe, join ';', qq/T|$tar/,
239 2         53 ( split //, $tar_binary_hash->{$tar}{binary_digit_string} ); # Add Target
240              
241             # Sort %score by value and load results
242 2         14 my $count = 1;
243 2         3 $max_out++; # to be able to start w/ ONE
244              
245             # Start loop
246 2         21 for my $key (
247             sort {
248             $sort_by eq 'jaccard' #
249             ? $score->{$b}{$sort_by} <=> $score->{$a}{$sort_by} # 1 to 0 (similarity)
250 292 50       367 : $score->{$a}{$sort_by} <=> $score->{$b}{$sort_by} # 0 to N (distance)
251             } keys %$score
252             )
253             {
254              
255 72 50       142 say "$count: Creating alignment " if $self->{verbose};
256              
257             # Create ASCII alignment
258             # NB: We need it here to get $n_00
259             my ( $n_00, $alignment_str_ascii, $alignment_arr_csv ) =
260             create_alignment(
261             {
262             ref_key => $key,
263             ref_str_weighted =>
264             $ref_binary_hash->{$key}{binary_digit_string_weighted},
265 72         367 tar_str_weighted => $tar_str_weighted,
266             glob_hash => $glob_hash,
267             sorted_keys_glob_hash => \@sort_keys_glob_hash,
268             labels => \@labels
269             }
270             );
271              
272             # *** IMPORTANT ***
273             # The LENGTH of the alignment is based on the #variables in the REF-COHORT
274             # Compute estimated av and dev for binary_string of L = length_align - n_00
275             # Corrected length_align L = length_align - n_00
276 72         174 my $length_align_corrected = $length_align - $n_00;
277              
278             #$estimated_average, $estimated_std_dev
279 72         184 ( $stat->{hamming_stats}{mean_rnd}, $stat->{hamming_stats}{sd_rnd} ) =
280             estimate_hamming_stats($length_align_corrected);
281              
282             # Compute a few stats
283             my $hamming_z_score = z_score(
284             $score->{$key}{hamming},
285             $stat->{hamming_stats}{mean},
286             $stat->{hamming_stats}{sd}
287 72         221 );
288             my $hamming_z_score_from_random = z_score(
289             $score->{$key}{hamming},
290             $stat->{hamming_stats}{mean_rnd},
291             $stat->{hamming_stats}{sd_rnd}
292 72         114 );
293              
294             #my $hamming_p_value =
295             # p_value( $score->{$key}{hamming}, $length_align_corrected );
296 72         86 my $hamming_p_value_from_z_score =
297             p_value_from_z_score($hamming_z_score);
298             my $jaccard_z_score = z_score(
299             $score->{$key}{jaccard},
300             $stat->{jaccard_stats}{mean},
301             $stat->{jaccard_stats}{sd}
302 72         134 );
303 72         123 my $jaccard_p_value_from_z_score =
304             p_value_from_z_score( 1 - $jaccard_z_score );
305              
306             # Compute Intersect-Rate (I/T) * 100
307             # Completeness (T/R) * 100
308 72         89 my $reference_vars = $score->{$key}{reference_vars};
309 72         87 my $intersect = $score->{$key}{intersect};
310 72 50       109 my $intersect_rate =
311             ( $target_vars == 0 ) ? 0 : ( $intersect / $target_vars ) * 100;
312 72 50       88 my $completeness =
313             ( $reference_vars == 0 ) ? 0 : ( $intersect / $reference_vars ) * 100;
314              
315             # Create a hash with formats
316             my $format = {
317             'RANK' => { value => $count, format => undef },
318             'REFERENCE(ID)' => { value => $key, format => undef },
319             'TARGET(ID)' => { value => $tar, format => undef },
320             'FORMAT' => { value => $self->{format}, format => undef },
321             'WEIGHTED' => { value => $weight_bool, format => undef },
322             'LENGTH' => { value => $length_align_corrected, format => '%6d' },
323             'HAMMING-DISTANCE' =>
324             { value => $score->{$key}{hamming}, format => '%4d' },
325             'DISTANCE-Z-SCORE' =>
326             { value => $hamming_z_score, format => '%7.3f' },
327             'DISTANCE-P-VALUE' =>
328             { value => $hamming_p_value_from_z_score, format => '%12.7f' },
329             'DISTANCE-Z-SCORE(RAND)' =>
330             { value => $hamming_z_score_from_random, format => '%8.4f' },
331             'JACCARD-INDEX' =>
332 72         1113 { value => $score->{$key}{jaccard}, format => '%7.3f' },
333             'JACCARD-Z-SCORE' =>
334             { value => $jaccard_z_score, format => '%7.3f' },
335             'JACCARD-P-VALUE' =>
336             { value => $jaccard_p_value_from_z_score, format => '%12.7f' },
337             'REFERENCE-VARS' => { value => $reference_vars, format => '%6d' },
338             'TARGET-VARS' => { value => $target_vars, format => '%6d' },
339             'INTERSECT' => { value => $intersect, format => '%6d' },
340             'INTERSECT-RATE(%)' =>
341             { value => $intersect_rate, format => '%8.2f' },
342             'COMPLETENESS(%)' => { value => $completeness, format => '%8.2f' }
343             };
344              
345             # Serialize results
346             my $tmp_str = join "\t", map {
347 72         161 defined $format->{$_}{format}
348             ? sprintf( $format->{$_}{format}, $format->{$_}{value} )
349             : $format->{$_}{value}
350 1296 100       2916 } @headers;
351 72         171 push @results, $tmp_str;
352              
353             # To save memory only load if --align
354 72 50       108 if ( defined $align ) {
355              
356             # Add all of the above to @alignments_ascii
357 72         101 my $sep = ('-') x 80;
358 72         1952 push @alignments_ascii,
359             qq/#$header\n$tmp_str\n$sep\n$$alignment_str_ascii/;
360              
361             # Add all of the above to @alignments_csv
362 72         1796 push @alignments_csv, @$alignment_arr_csv;
363              
364             # Add data to dataframe
365             push @dataframe, join ';', qq/R|$key/,
366 72         2054 ( split //, $ref_binary_hash->{$key}{binary_digit_string} );
367              
368             # Add values to info
369             $info{$key} = {
370             weighted => $weight_bool eq 'True'
371             ? JSON::XS::true
372             : JSON::XS::false,
373             reference_id => $key,
374             target_id => $tar,
375             reference_binary_string =>
376             $ref_binary_hash->{$key}{binary_digit_string},
377             target_binary_string =>
378             $tar_binary_hash->{$key}{binary_digit_string},
379             reference_binary_string_weighted =>
380             $ref_binary_hash->{$key}{binary_digit_string_weighted},
381             target_binary_string_weighted =>
382             $tar_binary_hash->{$key}{binary_digit_string_weighted},
383             alignment_length => $length_align_corrected,
384             hamming_distance => $score->{$key}{hamming},
385             hamming_z_score => $hamming_z_score,
386             hamming_p_value => $hamming_p_value_from_z_score,
387             jaccard_similarity => $score->{$key}{jaccard},
388             jaccard_z_score => $jaccard_z_score,
389             jaccard_p_value => $jaccard_p_value_from_z_score,
390             jaccard_distance => 1 - $score->{$key}{jaccard},
391             format => $self->{format},
392 72 100       645 alignment => $$alignment_str_ascii,
393             reference_vars => $reference_vars,
394             target_vars => $target_vars,
395             intersect => $intersect,
396             intersect_rate => $intersect_rate,
397             completeness => $completeness
398             };
399              
400             }
401              
402 72         2448 $count++;
403 72 100       992 last if $count == $max_out;
404             }
405              
406 2         144 return \@results, \%info, \@alignments_ascii, \@dataframe, \@alignments_csv;
407             }
408              
409             sub create_alignment {
410              
411             # NB: The alignment will use the weighted string
412 72     72 0 73 my $arg = shift;
413 72         76 my $ref_key = $arg->{ref_key};
414 72         74 my $binary_string1 = $arg->{ref_str_weighted};
415 72         61 my $binary_string2 = $arg->{tar_str_weighted};
416 72         70 my $sorted_keys_glob_hash = $arg->{sorted_keys_glob_hash};
417 72         75 my $labels = $arg->{labels};
418 72         70 my $glob_hash = $arg->{glob_hash};
419 72         72 my $length1 = length($binary_string1);
420 72         67 my $length2 = length($binary_string2);
421              
422             # Check that l1 = l2
423 72 50       84 die "The binary strings must have the same length"
424             if ( $length1 != $length2 );
425              
426             # Expand array to have weights as N-elements
427 72         97 my $recreated_array = recreate_array( $glob_hash, $sorted_keys_glob_hash );
428              
429             # Initialize some variables
430 72         70 my $out_ascii = "REF -- TAR\n";
431 72         57 my @out_csv;
432 72         56 my $cum_distance = 0;
433 72         70 my $n_00 = 0;
434              
435             # For loop with 2 variables
436             # i index for weighted
437             # j the number of variables
438 72         80 my ( $i, $j ) = ( 0, 0 );
439 72         98 for ( $i = 0 ; $i < $length1 ; $i++, $j++ ) {
440              
441             # Load key and value
442 11448         9160 my $key = $recreated_array->[$i];
443 11448         11428 my $val = sprintf( "%3d", $glob_hash->{$key} );
444              
445             # We have to keep track with $j
446 11448         8530 my $sorted_key = $sorted_keys_glob_hash->[$j];
447 11448         9195 my $label = $labels->[$j];
448              
449             # Load chars
450 11448         9293 my $char1 = substr( $binary_string1, $i, 1 );
451 11448         8885 my $char2 = substr( $binary_string2, $i, 1 );
452 11448 100 100     16593 $n_00++ if ( $char1 == 0 && $char2 == 0 );
453              
454             # Correct $i index by adding weights
455 11448         9136 $i = $i + $glob_hash->{$key} - 1;
456              
457             # Load metrics
458 11448 100       11351 $cum_distance += $glob_hash->{$key} if $char1 ne $char2;
459 11448         10558 my $cum_distance_pretty = sprintf( "%3d", $cum_distance );
460 11448 100       10714 my $distance = $char1 eq $char2 ? 0 : $glob_hash->{$key};
461 11448         10013 my $distance_pretty = sprintf( "%3d", $distance );
462              
463             # w = weight, d = distance, cd = cumul distance
464 11448         15928 my %format = (
465             '11' => '-----',
466             '10' => 'xxx--',
467             '01' => '--xxx',
468             '00' => ' '
469             );
470 11448         13065 $out_ascii .=
471             qq/$char1 $format{ $char1 . $char2 } $char2 | (w:$val|d:$distance_pretty|cd:$cum_distance_pretty|) $sorted_key ($label)\n/;
472 11448         26739 push @out_csv,
473             qq/$ref_key;$char1;$format{ $char1 . $char2 };$char2;$glob_hash->{$key};$distance;$sorted_key;$label/;
474              
475             #REF(107:week_0_arm_1);indicator;TAR(125:week_0_arm_1);weight;hamming-distance;json-path
476             #0;;0;1;0;diseases.ageOfOnset.ageRange.end.iso8601duration.P16Y
477             #0;;0;1;0;diseases.ageOfOnset.ageRange.end.iso8601duration.P24Y
478             #1;-----;1;1;0;diseases.ageOfOnset.ageRange.end.iso8601duration.P39Y
479             }
480 72         579 return $n_00, \$out_ascii, \@out_csv;
481             }
482              
483             sub recreate_array {
484              
485 72     72 0 108 my ( $glob_hash, $sorted_keys_glob_hash ) = @_;
486 72         72 my @recreated_array;
487 72         100 foreach my $key (@$sorted_keys_glob_hash) {
488 11448         12979 for ( my $i = 0 ; $i < $glob_hash->{$key} ; $i++ ) {
489 11736         15359 push @recreated_array, $key;
490             }
491             }
492 72         115 return \@recreated_array;
493             }
494              
495             sub create_glob_and_ref_hashes {
496              
497 15     15 0 34 my ( $array, $weight, $self ) = @_;
498 15         29 my $primary_key = $self->{primary_key};
499 15         20 my $glob_hash = {};
500 15         16 my $ref_hash_flattened;
501              
502 15         20 my $count = 1;
503 15         16 for my $element ( @{$array} ) {
  15         29  
504              
505             # For consistency, we obtain the primary_key for both BFF/PXF
506             # from $_->{id} (not from subject.id)
507 456 50       1137 my $id = $element->{$primary_key}
508             or die
509             "Sorry but the JSON document [$count] does not have the primary_key <$primary_key> defined\n";
510              
511             # Remapping hash
512 456 50       789 say "Flattening and remapping ..." if $self->{verbose};
513              
514 456         1262 my $ref_hash = remap_hash(
515             {
516             hash => $element,
517             weight => $weight,
518             self => $self
519             }
520             );
521              
522             # *** IMPORTANT ***
523             # We eliminate keys with weight = 0 if defined $weight;
524 456 100       1396 prune_keys_with_weight_zero($ref_hash) if defined $weight;
525              
526             # Load big hash ref_hash_flattened
527 456         1077 $ref_hash_flattened->{$id} = $ref_hash;
528              
529             # The idea is to create a $glob_hash with unique key-values
530             # Duplicates will be automatically merged
531 456         18740 $glob_hash = { %$glob_hash, %$ref_hash };
532              
533             # To keep track of array element indexes
534 456         2794 $count++;
535             }
536 15         85 return ( $glob_hash, $ref_hash_flattened );
537             }
538              
539             sub randomize_variables {
540              
541 0     0 0 0 my ( $glob_hash, $self ) = @_;
542 0         0 my $max = $self->{max_number_vars};
543 0         0 my $seed = $self->{seed};
544              
545             # set random seed
546 0         0 srand($seed);
547              
548             # Randomize
549             # NB:keys have to be sorted for srand to work!!!!
550             # perl -MList::Util=shuffle -E 'srand 123; say shuffle 1 .. 5'
551 0         0 my @items = ( shuffle sort keys %$glob_hash )[ 0 .. $max - 1 ];
552              
553             # Create a new hash with a hash slice
554 0         0 my %new_glob_hash;
555 0         0 @new_glob_hash{@items} = @{$glob_hash}{@items};
  0         0  
556              
557             # return reference
558 0         0 return \%new_glob_hash;
559             }
560              
561             sub prune_excluded_included {
562              
563 458     458 0 570 my ( $hash, $self ) = @_;
564 458         391 my @included = @{ $self->{include_terms} };
  458         775  
565 458         447 my @excluded = @{ $self->{exclude_terms} };
  458         668  
566              
567             # Die if we have both options at the same time
568 458 50 66     750 die "Sorry, <--include> and <--exclude> are mutually exclusive\n"
569             if ( @included && @excluded );
570              
571             # *** IMPORTANT ***
572             # Original $hash is modified
573              
574             # INCLUDED
575 458 100       558 if (@included) {
576 25         41 for my $key ( keys %$hash ) {
577 125 100   225   197 delete $hash->{$key} unless any { $_ eq $key } @included;
  225         290  
578             }
579             }
580              
581             # EXCLUDED
582 458 50       537 if (@excluded) {
583 0         0 for my $key (@excluded) {
584 0 0       0 delete $hash->{$key} if exists $hash->{$key};
585             }
586             }
587              
588             # We will do nothing if @included = @excluded = [] (DEFAULT)
589 458         551 return 1;
590             }
591              
592             sub set_excluded_phenotypicFeatures {
593              
594 458     458 0 563 my ( $hash, $switch, $format ) = @_;
595              
596             # Ensure phenotypicFeatures exist before processing
597 458 100       782 return 1 unless exists $hash->{phenotypicFeatures};
598              
599 320         254 foreach my $feature ( @{ $hash->{phenotypicFeatures} } ) {
  320         647  
600              
601             # Skip if 'excluded' is not set or false
602 787 100       1916 next unless $feature->{excluded};
603              
604             # NB: remaining phenotypicFeatures:1.excluded
605             # will be discarded by $exclude_variables_regex_qr later
606 449 100       2257 if ($switch) {
607              
608             # Determine the correct ID field based on the format
609 2 50       5 my $id_field = $format eq 'BFF' ? 'featureType' : 'type';
610              
611             # Append '_excluded' to the appropriate ID
612 2         6 $feature->{$id_field}{id} .= '_excluded';
613             }
614             else {
615             # Remove the feature by setting it to undef
616 447         911 $feature = undef;
617              
618             # Due to properties being set to undef, it's possible for the coverage file to
619             # report phenotypicFeatures as 100% by all "excluded" = true
620             }
621             }
622              
623 320         443 return 1;
624             }
625              
626             sub remap_hash {
627              
628 458     458 0 494 my $arg = shift;
629 458         458 my $hash = $arg->{hash};
630 458         429 my $weight = $arg->{weight};
631 458         478 my $self = $arg->{self}; # $self from $arg
632 458         491 my $nodes = $self->{nodes};
633 458         443 my $edges = $self->{edges};
634 458         463 my $format = $self->{format};
635 458         461 my $switch = $self->{retain_excluded_phenotypicFeatures};
636 458         375 my $out_hash;
637              
638             # Do some pruning excluded / included
639 458         794 prune_excluded_included( $hash, $self );
640              
641             # *** IMPORTANT ***
642             # The user may include a term that:
643             # 1 - may not exist in any individual
644             # 2 - does not exist in some individuals
645             # If the term does not exist in a individual
646             # - a) -include-terms contains ANOTHER TERM THAT EXISTS
647             # %$hash will contain keys => OK
648             # - b) -include-terms only includes the term/terms not present
649             # %$hash = 0 , then we return {}, to avoid trouble w/ Fold.pm
650             #print Dumper $hash;
651 458 50       663 return {} unless %$hash;
652              
653             # A bit more pruning plus folding
654             # NB: Hash::Fold keeps white spaces on keys
655             #
656             # Options for 1D-array folding:
657             # A) Array to Hash then Fold
658             # B) Fold then Regex <=== CHOSEN
659             # - Avoids the need for deep cloning
660             # - Works across any JSON data structure (without specific key requirements)
661             # - BUT profiling shows it's ~5-10% slower than 'Array to Hash then Fold'
662             # - Does not accommodate specific remappings like 'interpretations.diagnosis.genomicInterpretations'
663 458         693 set_excluded_phenotypicFeatures( $hash, $switch, $format );
664 458         1160 $hash = fold($hash);
665              
666             # Load the hash that points to the hierarchy for ontology-term-id
667             # *** IMPORTANT ***
668             # - phenotypicFeatures.featureType.id => BFF
669             # - phenotypicFeatures.type.id => PXF
670 458         2469727 my $id_correspondence = $self->{id_correspondence};
671              
672             # Load values for the for loop
673 458         631 my $exclude_variables_regex_qr = $self->{exclude_variables_regex_qr};
674 458         1254 my $misc_regex_qr =
675             qr/1900-01-01|NA0000|NCIT:C126101|P999Y|P9999Y|phenopacket_id/;
676              
677             # Pre-compile a list of fixed scalar values to exclude into a hash for quick lookup
678             my %exclude_values =
679 458         768 map { $_ => 1 }
  2290         3612  
680             ( 'NA', 'NaN', 'Fake', 'None:No matching concept', 'Not Available' );
681              
682             # Now we proceed for each key
683 458         608 for my $key ( keys %{$hash} ) {
  458         10713  
684              
685             # To see which ones were discarded
686             #say $key if !defined $hash->{$key};
687              
688             # Discard undefined
689 87191 100       100931 next unless defined $hash->{$key};
690              
691             # Discarding lines with 'low quality' keys (Time of regex profiled with :NYTProf: ms time)
692             # Some can be "rescued" by adding the ontology as ($1)
693             # NB1: We discard _labels too!!
694             # NB2: info|metaData are always discarded
695             next
696 84392 100 100     312292 if ( defined $exclude_variables_regex_qr
697             && $key =~ $exclude_variables_regex_qr );
698              
699             # The user can turn on age related values
700             next
701             if ( ( $format eq 'PXF' || $format eq 'BFF' )
702             && $key =~ m/\.age(?!nt)|onset/i
703 33663 100 100     195585 && !$self->{age} ); # $self->{age} [0|1]
      100        
      66        
704              
705             # Load values
706 32154         31018 my $val = $hash->{$key};
707              
708             # Discarding lines with unsupported val (Time profiled with :NYTProf: ms time)
709             next
710             if (
711 1         4 ( ref($val) eq 'HASH' && !keys %{$val} ) # Discard {} (e.g.,subject.vitalStatus: {})
712 600         1436 || ( ref($val) eq 'ARRAY' && !@{$val} ) # Discard []
713 32154 100 66     168888 || exists $exclude_values{$val}
      66        
      100        
      66        
      66        
714             || $val =~ $misc_regex_qr
715             );
716              
717             # Add IDs to key
718 23564         27207 my $id_key = add_id2key( $key, $hash, $self );
719              
720             # Finally add value to id_key
721 23564         23436 my $tmp_key_at_variable_level = $id_key . '.' . $val;
722              
723             # Add HPO ascendants
724 23564 50 33     29247 if ( defined $edges && $val =~ /^HP:/ ) {
725 0         0 my $ascendants =
726             add_hpo_ascendants( $tmp_key_at_variable_level, $nodes, $edges );
727 0         0 $out_hash->{$_} = 1 for @$ascendants; # weight 1 for now
728             }
729              
730             ##################
731             # Assign weights #
732             ##################
733              
734             # NB: mrueda (04-12-23) - it's ok if $weight == undef => NO AUTOVIVIFICATION!
735             # NB: We don't warn if user selection does not exist, just assign 1
736              
737 23564         19907 my $tmp_key_at_term_level = $tmp_key_at_variable_level;
738              
739             # If variable has . then capture $1
740 23564 50       30484 if ( $tmp_key_at_term_level =~ m/\./ ) {
741              
742             # NB: For long str regex is faster than (split /\./, $foo)[0]
743 23564         29967 $tmp_key_at_term_level =~ m/^(\w+)\./;
744 23564         23398 $tmp_key_at_term_level = $1;
745             }
746              
747 23564 100       23501 if ( defined $weight ) {
748              
749             # *** IMPORTANT ***
750             # ORDER MATTERS !!!!
751             # We allow for assigning weights by TERM (e.g., 1D)
752             # but VARIABLE level takes precedence to TERM
753              
754             $out_hash->{$tmp_key_at_variable_level} =
755              
756             # VARIABLE LEVEL
757             # NB: exists stringifies the weights
758             exists $weight->{$tmp_key_at_variable_level}
759             ? $weight->{$tmp_key_at_variable_level} + 0 # coercing to number
760              
761             # TERM LEVEL
762             : exists $weight->{$tmp_key_at_term_level}
763 5292 50       10923 ? $weight->{$tmp_key_at_term_level} + 0 # coercing to number
    100          
764              
765             # NO WEIGHT
766             : 1;
767              
768             }
769             else {
770              
771             # Assign a weight of 1 if no users weights
772 18272         23385 $out_hash->{$tmp_key_at_variable_level} = 1;
773              
774             }
775              
776             ##############
777             # label Hash #
778             ##############
779              
780             # Finally we load the Nomenclature hash
781 23564         19753 my $label = $key;
782 23564         36929 $label =~ s/id/label/;
783             $nomenclature{$tmp_key_at_variable_level} = $hash->{$label}
784 23564 100       50096 if defined $hash->{$label};
785             }
786              
787             # *** IMPORTANT ***
788             # We have to return an object {} when undef
789 458   50     17731 return $out_hash // {};
790             }
791              
792             sub add_hpo_ascendants {
793              
794 0     0 0 0 my ( $key, $nodes, $edges ) = @_;
795              
796             # First we obtain the ontology (0000539) from HP:0000539
797 0         0 $key =~ m/HP:(\w+)$/;
798 0         0 my $ontology = $1;
799              
800             # We'll use it to build a string equivalent to a key from $edges
801 0         0 my $hpo_url = 'http://purl.obolibrary.org/obo/HP_';
802 0         0 my $hpo_key = $hpo_url . $ontology;
803              
804             # We will include all ascendants in an array
805 0         0 my @ascendants;
806 0         0 for my $parent_id ( @{ $edges->{$hpo_key} } ) {
  0         0  
807              
808             # We have to create a copy to not modify the original $parent_id
809             # as it can appear in multiple individuals
810 0         0 my $copy_parent_id = $parent_id;
811 0         0 $copy_parent_id =~ m/\/(\w+)$/;
812 0         0 $copy_parent_id = $1;
813 0         0 $copy_parent_id =~ tr/_/:/;
814              
815             # *** IMPORTANT ***
816             # We cannot add any label to the ascendants, otherwise they will
817             # not be matched by an indv down the tree
818             # Myopia
819             # Mild Myopia
820             # We want that 'Mild Myopia' matches 'Myopia', thus we can not add a label from 'Mild Myopia'
821             # Use the labels only for debug
822 0         0 my $asc_key = DEVEL_MODE ? $key . '.HPO_asc_DEBUG_ONLY' : $key;
823 0         0 $asc_key =~ s/HP:$ontology/$copy_parent_id/g;
824 0         0 push @ascendants, $asc_key;
825              
826             # We finally add the label to %nomenclature
827 0         0 my $hpo_asc_str = $hpo_url . $copy_parent_id; # 'http://purl.obolibrary.org/obo/HP_HP:0000539
828 0         0 $hpo_asc_str =~ s/HP://; # 0000539
829 0         0 $nomenclature{$asc_key} = $nodes->{$hpo_asc_str}{lbl};
830             }
831 0         0 return \@ascendants;
832             }
833              
834             sub add_id2key {
835              
836 23564     23564 0 25658 my ( $key, $hash, $self ) = @_;
837 23564         26442 my $id_correspondence = $self->{id_correspondence}{ $self->{format} };
838 23564         19969 my $array_regex_qr = $self->{array_regex_qr};
839 23564         19230 my $array_terms_regex_qr = $self->{array_terms_regex_qr};
840              
841             #############
842             # OBJECTIVE #
843             #############
844              
845             # This subroutine is important as it replaces the index (numeric) for a given
846             # array element by a selected ontology. It's done for all subkeys on that element
847              
848             #"interventionsOrProcedures" : [
849             # {
850             # "bodySite" : {
851             # "id" : "NCIT:C12736",
852             # "label" : "intestine"
853             # },
854             # "procedureCode" : {
855             # "id" : "NCIT:C157823",
856             # "label" : "Colon Resection"
857             # }
858             # },
859             # {
860             # "bodySite" : {
861             # "id" : "NCIT:C12736",
862             # "label" : "intestine"
863             # },
864             # "procedureCode" : {
865             # "id" : "NCIT:C86074",
866             # "label" : "Hemicolectomy"
867             # }
868             # },
869             #]
870             #
871             # Will become:
872             #
873             #"interventionsOrProcedures.NCIT:C157823.bodySite.id.NCIT:C12736" : 1,
874             #"interventionsOrProcedures.NCIT:C157823.procedureCode.id.NCIT:C157823" : 1,
875             #"interventionsOrProcedures.NCIT:C86074.bodySite.id.NCIT:C12736" : 1,
876             #"interventionsOrProcedures.NCIT:C86074.procedureCode.id.NCIT:C86074" : 1,
877             #
878             # To make the replacement we use $id_correspondence, then we perform a regex
879             # to fetch the key parts
880              
881             # Only proceed if $key is one of the array_terms
882 23564 100       52398 if ( $key =~ $array_terms_regex_qr ) {
883              
884             # Now we use $array_regex_qr to capture $1, $2 and $3 for BFF/PXF
885             # NB: For others (e.g., MXF) we will have only $1 and $2
886 22358         48296 $key =~ $array_regex_qr;
887              
888             #say "$array_regex_qr -- [$key] <$1> <$2> <$3>"; # $3 can be undefined
889              
890 22358         19020 my ( $tmp_key, $val );
891              
892             # Normal behaviour for BFF/PXF
893 22358 100       26540 if ( defined $3 ) {
894              
895             # If id_correspondence is an array (e.g., medicalActions) we have to grep the right match
896 22228         16258 my $correspondence;
897 22228 50       32197 if ( ref $id_correspondence->{$1} eq ref [] ) {
898              
899             # $1 $2 $3
900             # <0>
901 0         0 my $subkey = ( split /\./, $3 )[0]; # treatment
902             $correspondence =
903 0     0   0 first { $_ =~ m/^$subkey/ } @{ $id_correspondence->{$1} }; # treatment.agent.id
  0         0  
  0         0  
904             }
905             else {
906 22228         21527 $correspondence = $id_correspondence->{$1};
907             }
908              
909             # Now that we know which is the term we use to find key-val in $hash
910 22228         32410 $tmp_key = $1 . ':' . $2 . '.' . $correspondence; # medicalActions.0.treatment.agent.id
911 22228         22853 $val = $hash->{$tmp_key}; # DrugCentral:257
912 22228         33279 $key = join '.', $1, $val, $3; # medicalActions.DrugCentral:257.treatment.routeOfAdministration.id
913             }
914              
915             # MXF or similar (...we haven't encountered other regex yet)
916             else {
917              
918 130         223 $tmp_key = $1 . ':' . $2;
919 130         126 $val = $hash->{$tmp_key};
920 130         139 $key = $1;
921             }
922             }
923              
924             # $key = 'Bar:1' means that we have array but the user either:
925             # a) Made a mistake in the config
926             # b) Is not using the right config file
927             else {
928 1206 50       1770 die
929             "<$1> contains array elements but is not defined as an array in <$self->{config_file}>. Please check your syntax and configuration file.\n"
930             if $key =~ m/^(\w+):/;
931             }
932              
933 23564         29309 return $key;
934             }
935              
936             sub create_binary_digit_string {
937              
938 17     17 0 36 my ( $glob_hash, $cmp_hash ) = @_;
939 17         40 my $out_hash;
940              
941             # *** IMPORTANT ***
942             # Being a nested for, keys %{$glob_hash} does not need sorting
943             # BUT, we sort to follow the same order as serialized (sorted)
944 17         25 my @sorted_keys_glob_hash = sort keys %{$glob_hash};
  17         998  
945              
946             # IDs of each indidividual
947 17         88 for my $individual_id ( keys %{$cmp_hash} ) { # no need to sort
  17         94  
948              
949             # One-hot encoding = Representing categorical data as numerical
950 458         518 my ( $binary_str, $binary_str_weighted ) = ('') x 2;
951 458         408 for my $key (@sorted_keys_glob_hash) {
952 62270         55502 my $ones = (1) x $glob_hash->{$key};
953 62270         52977 my $zeros = (0) x $glob_hash->{$key};
954 62270 100       64465 $binary_str .= exists $cmp_hash->{$individual_id}{$key} ? 1 : 0;
955             $binary_str_weighted .=
956 62270 100       68244 exists $cmp_hash->{$individual_id}{$key} ? $ones : $zeros;
957             }
958 458         785 $out_hash->{$individual_id}{binary_digit_string} = $binary_str;
959             $out_hash->{$individual_id}{binary_digit_string_weighted} =
960 458         554 $binary_str_weighted;
961             }
962 17         163 return $out_hash;
963             }
964              
965             sub parse_hpo_json {
966              
967 0     0 0 0 my $data = shift;
968              
969             # The file is a structured representation of the Human Phenotype Ontology (HPO) in JSON format.
970             # The HPO is structured into a directed acyclic graph (DAG)
971             # Here's a brief overview of the structure of the hpo.json file:
972             # - graphs: This key contains an array of ontology graphs. In the case of HPO, there is only one graph. The graph has two main keys:
973             # - nodes: An array of objects, each representing an HPO term. Each term object has the following keys:
974             # - id: The identifier of the term (e.g., "HP:0000118").
975             # - lbl: The label (name) of the term (e.g., "Phenotypic abnormality").
976             # - meta: Metadata associated with the term, including definition, synonyms, and other information.
977             # - type: The type of the term, usually "CLASS".
978             # - edges: An array of objects, each representing a relationship between two HPO terms. Each edge object has the following keys:
979             # - sub: The subject (child) term ID (e.g., "HP:0000924").
980             # - obj: The object (parent) term ID (e.g., "HP:0000118").
981             # - pred: The predicate that describes the relationship between the subject and object terms, typically "is_a" in HPO.
982             # - meta: This key contains metadata about the HPO ontology as a whole, such as version information, description, and other details.
983              
984 0         0 my $graph = $data->{graphs}->[0];
985 0         0 my %nodes = map { $_->{id} => $_ } @{ $graph->{nodes} };
  0         0  
  0         0  
986 0         0 my %edges = ();
987              
988 0         0 for my $edge ( @{ $graph->{edges} } ) {
  0         0  
989 0         0 my $child_id = $edge->{sub};
990 0         0 my $parent_id = $edge->{obj};
991 0         0 push @{ $edges{$child_id} }, $parent_id;
  0         0  
992             }
993 0         0 return \%nodes, \%edges;
994             }
995              
996             sub prune_keys_with_weight_zero {
997              
998 97     97 0 104 my $hash_ref = shift;
999              
1000             # Iterate over the keys of the hash
1001 97         105 foreach my $key ( keys %{$hash_ref} ) {
  97         694  
1002              
1003             # Delete the key if its value is 0
1004 5077 100       5608 delete $hash_ref->{$key} if $hash_ref->{$key} == 0;
1005             }
1006             }
1007              
1008             sub guess_label {
1009              
1010 72     72 0 56 my $input_string = shift;
1011              
1012 72 50       117 if (
1013             $input_string =~ /\. # Match a literal dot
1014             ([^\.]+) # Match and capture everything except a dot
1015             $ # Anchor to the end of the string
1016             /x
1017             )
1018             {
1019 72         121 return $1;
1020             }
1021              
1022             # If no dot is found, return the original string
1023 0           return $input_string;
1024             }
1025              
1026             1;