File Coverage

lib/Pheno/Ranker/Compare.pm
Criterion Covered Total %
statement 315 378 83.3
branch 91 130 70.0
condition 34 53 64.1
subroutine 27 33 81.8
pod 0 17 0.0
total 467 611 76.4


line stmt bran cond sub pod time code
1             package Pheno::Ranker::Compare;
2              
3 5     5   32 use strict;
  5         9  
  5         198  
4 5     5   40 use warnings;
  5         9  
  5         350  
5 5     5   33 use autodie;
  5         10  
  5         45  
6 5     5   29438 use feature qw(say);
  5         11  
  5         823  
7 5     5   41 use List::Util qw(any shuffle first);
  5         9  
  5         578  
8 5     5   35 use Data::Dumper;
  5         7  
  5         373  
9 5     5   3952 use Sort::Naturally qw(nsort);
  5         27209  
  5         484  
10 5     5   2744 use MIME::Base64;
  5         4363  
  5         422  
11 5     5   3251 use Compress::Zlib qw(compress);
  5         110954  
  5         677  
12 5     5   3311 use Hash::Fold fold => { array_delimiter => ':' };
  5         210452  
  5         45  
13 5     5   5172 use Pheno::Ranker::Metrics;
  5         63  
  5         1181  
14              
15 5     5   61 use Exporter 'import';
  5         106  
  5         505  
16             our @EXPORT =
17             qw(check_format cohort_comparison compare_and_rank create_glob_and_ref_hashes randomize_variables remap_hash create_binary_digit_string parse_hpo_json);
18              
19 5     5   78 use constant DEVEL_MODE => 0;
  5         12  
  5         43097  
20              
21             our %nomenclature = ();
22              
23             sub check_format {
24 13     13 0 29 my $data = shift;
25 13 100       98 return exists $data->[0]{subject} ? 'PXF' : 'BFF';
26             }
27              
28             sub cohort_comparison {
29 14     14 0 41 my ( $ref_binary_hash, $self ) = @_;
30 14         56 my $out_file = $self->{out_file};
31 14         45 my $similarity_metric = $self->{similarity_metric_cohort};
32              
33             # Define limit #items for switching to whole matrix calculation
34 14         34 my $max_items = $self->{max_matrix_records_in_ram};
35              
36             # Inform about the start of the comparison process
37             say "Performing COHORT comparison"
38 14 50 33     109 if ( $self->{debug} || $self->{verbose} );
39              
40             # Define the subroutine to be used
41 14         91 my %similarity_function = (
42             'hamming' => \&hd_fast,
43             'jaccard' => \&jaccard_similarity_formatted
44             );
45              
46             # Define values for diagonal elements depending on metric
47 14         50 my %similarity_diagonal = (
48             'hamming' => 0,
49             'jaccard' => 1
50             );
51              
52             # Use previous hashes to define stuff
53 14         27 my $metric = $similarity_function{$similarity_metric};
54 14         29 my $similarity_diagonal = $similarity_diagonal{$similarity_metric};
55              
56             # Sorting keys of the hash
57 14         23 my @sorted_keys_ref_binary_hash = nsort( keys %{$ref_binary_hash} );
  14         800  
58 14         64722 my $num_items = scalar @sorted_keys_ref_binary_hash;
59              
60             # Define $switch for going from RAM to all calculations
61 14 50       67 my $switch = $num_items > $max_items ? 1 : 0;
62              
63             say "RAM efficient mode is: "
64             . ( $switch ? "on" : "off" )
65             . " (max_matrix_records_in_ram: $max_items)"
66 14 0 33     117 if ( $self->{debug} || $self->{verbose} );
    50          
67              
68             # Opening file for output
69 14         86 open( my $fh, '>:encoding(UTF-8)', $out_file );
70 14         19037 say $fh "\t", join "\t", @sorted_keys_ref_binary_hash;
71              
72             # Initialize matrix for storing similarity
73 14         33 my @matrix;
74              
75             # Iterate over items (I elements)
76 14         65 for my $i ( 0 .. $#sorted_keys_ref_binary_hash ) {
77             say "Calculating <"
78             . $sorted_keys_ref_binary_hash[$i]
79             . "> against the cohort..."
80 420 50       1168 if $self->{verbose};
81             my $str1 = $ref_binary_hash->{ $sorted_keys_ref_binary_hash[$i] }
82 420         887 {binary_digit_string_weighted};
83              
84             # Print first column (w/o \t)
85 420         922 print $fh $sorted_keys_ref_binary_hash[$i];
86              
87             # Iterate for pairwise comparisons (J elements)
88 420         1076 for my $j ( 0 .. $#sorted_keys_ref_binary_hash ) {
89             my $str2 = $ref_binary_hash->{ $sorted_keys_ref_binary_hash[$j] }
90 15028         29755 {binary_digit_string_weighted};
91 15028         30776 my $similarity;
92              
93 15028 50       19610 if ($switch) {
94              
95             # For large datasets compute upper and lower triange
96             my $str2 =
97             $ref_binary_hash->{ $sorted_keys_ref_binary_hash[$j] }
98 0         0 {binary_digit_string_weighted};
99 0 0       0 $similarity =
100             $i == $j ? $similarity_diagonal : $metric->( $str1, $str2 );
101             }
102             else {
103 15028 100       24111 if ( $i == $j ) {
    100          
104              
105             # Similarity for diagonal elements
106 420         523 $similarity = $similarity_diagonal;
107             }
108             elsif ( $j > $i ) {
109              
110             # Compute similarity for large cohorts or upper triangle
111 7304         11783 $similarity = $metric->( $str1, $str2 );
112 7304         12883 $matrix[$i][$j] = $similarity;
113             }
114             else {
115             # Use precomputed similarity from lower triangle
116 7304         10027 $similarity = $matrix[$j][$i];
117             }
118             }
119              
120             # Print a tab before each similarity
121 15028         31766 print $fh "\t", $similarity;
122             }
123              
124 420         800 print $fh "\n";
125             }
126              
127             # Close the file handle
128 14         115 close $fh;
129              
130             # Inform about the completion of the matrix computation
131 14 50 33     26047 say "Matrix saved to <$out_file>" if ( $self->{debug} || $self->{verbose} );
132 14         737 return 1;
133             }
134              
135             sub compare_and_rank {
136 2     2 0 6 my $arg = shift;
137 2         6 my $glob_hash = $arg->{glob_hash};
138 2         7 my $ref_hash = $arg->{ref_hash};
139 2         6 my $tar_hash = $arg->{tar_hash};
140 2         5 my $ref_binary_hash = $arg->{ref_binary_hash};
141 2         7 my $tar_binary_hash = $arg->{tar_binary_hash};
142 2         7 my $weight = $arg->{weight};
143 2         6 my $self = $arg->{self};
144 2         8 my $sort_by = $self->{sort_by};
145 2         7 my $align = $self->{align};
146 2         5 my $max_out = $self->{max_out};
147              
148             say "Performing COHORT(REF)-PATIENT(TAR) comparison"
149 2 50 33     22 if ( $self->{debug} || $self->{verbose} );
150              
151             # Hash for compiling metrics
152 2         6 my $score;
153              
154             # Hash for stats
155             my $stat;
156              
157             # Load TAR binary string
158 2         5 my ($tar) = keys %{$tar_binary_hash};
  2         7  
159             my $tar_str_weighted =
160 2         6 $tar_binary_hash->{$tar}{binary_digit_string_weighted};
161              
162             # Load TAR number of vars
163 2         3 my $target_vars = keys %{ $tar_hash->{$tar} };
  2         9  
164              
165 2         4 for my $key ( keys %{$ref_binary_hash} ) { # No need to sort
  2         29  
166              
167             # Load REF binary string
168             my $ref_str_weighted =
169 72         184 $ref_binary_hash->{$key}{binary_digit_string_weighted};
170 72 50       151 say "Comparing --- " if $self->{verbose};
171             say "REF:$ref_str_weighted\nTAR:$tar_str_weighted\n"
172 72 50 33     166 if ( defined $self->{debug} && $self->{debug} > 1 );
173              
174             # Hamming
175             $score->{$key}{hamming} =
176 72         140 hd_fast( $ref_str_weighted, $tar_str_weighted );
177              
178             # Intersect and Jaccard
179 72         157 ( $score->{$key}{jaccard}, $score->{$key}{intersect} ) =
180             jaccard_similarity( $ref_str_weighted, $tar_str_weighted );
181              
182             # Load REF number of vars
183 72         93 $score->{$key}{reference_vars} = keys %{ $ref_hash->{$key} };
  72         174  
184              
185             # Add values
186 72         88 push @{ $stat->{hamming_data} }, $score->{$key}{hamming};
  72         138  
187 72         80 push @{ $stat->{jaccard_data} }, $score->{$key}{jaccard};
  72         155  
188             }
189              
190             # Stats are only computed once (no overhead)
191 2         19 $stat->{hamming_stats} = add_stats( $stat->{hamming_data} );
192 2         9 $stat->{jaccard_stats} = add_stats( $stat->{jaccard_data} );
193              
194             # Initialize a few variables
195 2         20 my @headers = (
196             'RANK', 'REFERENCE(ID)',
197             'TARGET(ID)', 'FORMAT',
198             'LENGTH', 'WEIGHTED',
199             'HAMMING-DISTANCE', 'DISTANCE-Z-SCORE',
200             'DISTANCE-P-VALUE', 'DISTANCE-Z-SCORE(RAND)',
201             'JACCARD-INDEX', 'JACCARD-Z-SCORE',
202             'JACCARD-P-VALUE', 'REFERENCE-VARS',
203             'TARGET-VARS', 'INTERSECT',
204             'INTERSECT-RATE(%)', 'COMPLETENESS(%)'
205             );
206 2         11 my $header = join "\t", @headers;
207 2         5 my @results = $header;
208 2         3 my %info;
209 2         5 my $length_align = length($tar_str_weighted);
210 2 100       7 my $weight_bool = $weight ? 'True' : 'False';
211 2         5 my @alignments_ascii;
212             my $alignment_str_csv;
213 2         5 my @alignments_csv = join ';',
214             qw/id ref indicator tar weight hamming-distance json-path label/;
215              
216             # The dataframe will have two header lines
217             # *** IMPORTANT ***
218             # nsort does not yield same results as canonical from JSON::XS
219             # NB: we're sorting here and in create_binary_digit_string()
220 2         4 my @sort_keys_glob_hash = sort keys %{$glob_hash};
  2         215  
221              
222             # Creating @labels array from {id,label}, or guess_label()
223             my @labels =
224 2 100       21 map { exists $nomenclature{$_} ? $nomenclature{$_} : guess_label($_) }
  318         606  
225             @sort_keys_glob_hash;
226              
227             # Die if #elements in arrays differ
228 2 50       8 die "Mismatch between variables and labels"
229             if @sort_keys_glob_hash != @labels;
230              
231             # Labels
232             # NB: there is a comma in 'Serum Glutamic Pyruvic Transaminase, CTCAE'
233 2         49 my @dataframe = join ';', 'Id', @labels;
234              
235             # Variables (json path)
236 2         42 push @dataframe, join ';', 'Id', @sort_keys_glob_hash;
237              
238             # 0/1
239             push @dataframe, join ';', qq/T|$tar/,
240 2         90 ( split //, $tar_binary_hash->{$tar}{binary_digit_string} ); # Add Target
241              
242             # Sort %score by value and load results
243 2         22 my $count = 1;
244 2         5 $max_out++; # to be able to start w/ ONE
245              
246             # Start loop
247 2         25 for my $key (
248             sort {
249             $sort_by eq 'jaccard' #
250             ? $score->{$b}{$sort_by}
251             <=> $score->{$a}{$sort_by} # 1 to 0 (similarity)
252             : $score->{$a}{$sort_by}
253 284 50       504 <=> $score->{$b}{$sort_by} # 0 to N (distance)
254             } keys %$score
255             )
256             {
257              
258 72 50       251 say "$count: Creating alignment " if $self->{verbose};
259              
260             # Create ASCII alignment
261             # NB: We need it here to get $n_00
262             my ( $n_00, $alignment_str_ascii, $alignment_arr_csv ) =
263             create_alignment(
264             {
265             ref_key => $key,
266             ref_str_weighted =>
267             $ref_binary_hash->{$key}{binary_digit_string_weighted},
268 72         671 tar_str_weighted => $tar_str_weighted,
269             glob_hash => $glob_hash,
270             sorted_keys_glob_hash => \@sort_keys_glob_hash,
271             labels => \@labels
272             }
273             );
274              
275             # *** IMPORTANT ***
276             # The LENGTH of the alignment is based on the #variables in the REF-COHORT
277             # Compute estimated av and dev for binary_string of L = length_align - n_00
278             # Corrected length_align L = length_align - n_00
279 72         334 my $length_align_corrected = $length_align - $n_00;
280              
281             #$estimated_average, $estimated_std_dev
282 72         405 ( $stat->{hamming_stats}{mean_rnd}, $stat->{hamming_stats}{sd_rnd} ) =
283             estimate_hamming_stats($length_align_corrected);
284              
285             # Compute a few stats
286             my $hamming_z_score = z_score(
287             $score->{$key}{hamming},
288             $stat->{hamming_stats}{mean},
289             $stat->{hamming_stats}{sd}
290 72         443 );
291             my $hamming_z_score_from_random = z_score(
292             $score->{$key}{hamming},
293             $stat->{hamming_stats}{mean_rnd},
294             $stat->{hamming_stats}{sd_rnd}
295 72         236 );
296              
297             #my $hamming_p_value =
298             # p_value( $score->{$key}{hamming}, $length_align_corrected );
299 72         235 my $hamming_p_value_from_z_score =
300             p_value_from_z_score($hamming_z_score);
301             my $jaccard_z_score = z_score(
302             $score->{$key}{jaccard},
303             $stat->{jaccard_stats}{mean},
304             $stat->{jaccard_stats}{sd}
305 72         263 );
306 72         150 my $jaccard_p_value_from_z_score =
307             p_value_from_z_score( 1 - $jaccard_z_score );
308              
309             # Compute Intersect-Rate (I/T) * 100
310             # Completeness (T/R) * 100
311 72         181 my $reference_vars = $score->{$key}{reference_vars};
312 72         146 my $intersect = $score->{$key}{intersect};
313 72 50       205 my $intersect_rate =
314             ( $target_vars == 0 ) ? 0 : ( $intersect / $target_vars ) * 100;
315 72 50       154 my $completeness =
316             ( $reference_vars == 0 ) ? 0 : ( $intersect / $reference_vars ) * 100;
317              
318             # Create a hash with formats
319             my $format = {
320             'RANK' => { value => $count, format => undef },
321             'REFERENCE(ID)' => { value => $key, format => undef },
322             'TARGET(ID)' => { value => $tar, format => undef },
323             'FORMAT' => { value => $self->{format}, format => undef },
324             'WEIGHTED' => { value => $weight_bool, format => undef },
325             'LENGTH' => { value => $length_align_corrected, format => '%6d' },
326             'HAMMING-DISTANCE' =>
327             { value => $score->{$key}{hamming}, format => '%4d' },
328             'DISTANCE-Z-SCORE' =>
329             { value => $hamming_z_score, format => '%7.3f' },
330             'DISTANCE-P-VALUE' =>
331             { value => $hamming_p_value_from_z_score, format => '%12.7f' },
332             'DISTANCE-Z-SCORE(RAND)' =>
333             { value => $hamming_z_score_from_random, format => '%8.4f' },
334             'JACCARD-INDEX' =>
335 72         2596 { value => $score->{$key}{jaccard}, format => '%7.3f' },
336             'JACCARD-Z-SCORE' =>
337             { value => $jaccard_z_score, format => '%7.3f' },
338             'JACCARD-P-VALUE' =>
339             { value => $jaccard_p_value_from_z_score, format => '%12.7f' },
340             'REFERENCE-VARS' => { value => $reference_vars, format => '%6d' },
341             'TARGET-VARS' => { value => $target_vars, format => '%6d' },
342             'INTERSECT' => { value => $intersect, format => '%6d' },
343             'INTERSECT-RATE(%)' =>
344             { value => $intersect_rate, format => '%8.2f' },
345             'COMPLETENESS(%)' => { value => $completeness, format => '%8.2f' }
346             };
347              
348             # Serialize results
349             my $tmp_str = join "\t", map {
350 72         277 defined $format->{$_}{format}
351             ? sprintf( $format->{$_}{format}, $format->{$_}{value} )
352             : $format->{$_}{value}
353 1296 100       5466 } @headers;
354 72         343 push @results, $tmp_str;
355              
356             # To save memory only load if --align
357 72 50       211 if ( defined $align ) {
358              
359             # Add all of the above to @alignments_ascii
360 72         171 my $sep = ('-') x 80;
361 72         2334 push @alignments_ascii,
362             qq/#$header\n$tmp_str\n$sep\n$$alignment_str_ascii/;
363              
364             # Add all of the above to @alignments_csv
365 72         2718 push @alignments_csv, @$alignment_arr_csv;
366              
367             # Add data to dataframe
368             push @dataframe, join ';', qq/R|$key/,
369 72         3880 ( split //, $ref_binary_hash->{$key}{binary_digit_string} );
370              
371             # Add values to info
372             $info{$key} = {
373             weighted => $weight_bool eq 'True'
374             ? JSON::XS::true
375             : JSON::XS::false,
376             reference_id => $key,
377             target_id => $tar,
378             reference_binary_string =>
379             $ref_binary_hash->{$key}{binary_digit_string},
380             target_binary_string =>
381             $tar_binary_hash->{$key}{binary_digit_string},
382             reference_binary_string_weighted =>
383             $ref_binary_hash->{$key}{binary_digit_string_weighted},
384             target_binary_string_weighted =>
385             $tar_binary_hash->{$key}{binary_digit_string_weighted},
386             alignment_length => $length_align_corrected,
387             hamming_distance => $score->{$key}{hamming},
388             hamming_z_score => $hamming_z_score,
389             hamming_p_value => $hamming_p_value_from_z_score,
390             jaccard_similarity => $score->{$key}{jaccard},
391             jaccard_z_score => $jaccard_z_score,
392             jaccard_p_value => $jaccard_p_value_from_z_score,
393             jaccard_distance => 1 - $score->{$key}{jaccard},
394             format => $self->{format},
395 72 100       1104 alignment => $$alignment_str_ascii,
396             reference_vars => $reference_vars,
397             target_vars => $target_vars,
398             intersect => $intersect,
399             intersect_rate => $intersect_rate,
400             completeness => $completeness
401             };
402              
403             }
404              
405 72         4302 $count++;
406 72 100       1960 last if $count == $max_out;
407             }
408              
409 2         210 return \@results, \%info, \@alignments_ascii, \@dataframe, \@alignments_csv;
410             }
411              
412             sub create_alignment {
413              
414             # NB: The alignment will use the weighted string
415 72     72 0 144 my $arg = shift;
416 72         143 my $ref_key = $arg->{ref_key};
417 72         117 my $binary_string1 = $arg->{ref_str_weighted};
418 72         150 my $binary_string2 = $arg->{tar_str_weighted};
419 72         110 my $sorted_keys_glob_hash = $arg->{sorted_keys_glob_hash};
420 72         131 my $labels = $arg->{labels};
421 72         94 my $glob_hash = $arg->{glob_hash};
422 72         130 my $length1 = length($binary_string1);
423 72         98 my $length2 = length($binary_string2);
424              
425             # Check that l1 = l2
426 72 50       178 die "The binary strings must have the same length"
427             if ( $length1 != $length2 );
428              
429             # Expand array to have weights as N-elements
430 72         208 my $recreated_array = recreate_array( $glob_hash, $sorted_keys_glob_hash );
431              
432             # Initialize some variables
433 72         154 my $out_ascii = "REF -- TAR\n";
434 72         90 my @out_csv;
435 72         126 my $cum_distance = 0;
436 72         95 my $n_00 = 0;
437              
438             # For loop with 2 variables
439             # i index for weighted
440             # j the number of variables
441 72         135 my ( $i, $j ) = ( 0, 0 );
442 72         146 for ( $i = 0 ; $i < $length1 ; $i++, $j++ ) {
443              
444             # Load key and value
445 11448         17793 my $key = $recreated_array->[$i];
446 11448         19839 my $val = sprintf( "%3d", $glob_hash->{$key} );
447              
448             # We have to keep track with $j
449 11448         14232 my $sorted_key = $sorted_keys_glob_hash->[$j];
450 11448         16106 my $label = $labels->[$j];
451              
452             # Load chars
453 11448         15037 my $char1 = substr( $binary_string1, $i, 1 );
454 11448         13522 my $char2 = substr( $binary_string2, $i, 1 );
455 11448 100 100     27128 $n_00++ if ( $char1 == 0 && $char2 == 0 );
456              
457             # Correct $i index by adding weights
458 11448         15382 $i = $i + $glob_hash->{$key} - 1;
459              
460             # Load metrics
461 11448 100       19047 $cum_distance += $glob_hash->{$key} if $char1 ne $char2;
462 11448         16922 my $cum_distance_pretty = sprintf( "%3d", $cum_distance );
463 11448 100       16965 my $distance = $char1 eq $char2 ? 0 : $glob_hash->{$key};
464 11448         15535 my $distance_pretty = sprintf( "%3d", $distance );
465              
466             # w = weight, d = distance, cd = cumul distance
467 11448         27245 my %format = (
468             '11' => '-----',
469             '10' => 'xxx--',
470             '01' => '--xxx',
471             '00' => ' '
472             );
473 11448         22462 $out_ascii .=
474             qq/$char1 $format{ $char1 . $char2 } $char2 | (w:$val|d:$distance_pretty|cd:$cum_distance_pretty|) $sorted_key ($label)\n/;
475 11448         50917 push @out_csv,
476             qq/$ref_key;$char1;$format{ $char1 . $char2 };$char2;$glob_hash->{$key};$distance;$sorted_key;$label/;
477              
478             #REF(107:week_0_arm_1);indicator;TAR(125:week_0_arm_1);weight;hamming-distance;json-path
479             #0;;0;1;0;diseases.ageOfOnset.ageRange.end.iso8601duration.P16Y
480             #0;;0;1;0;diseases.ageOfOnset.ageRange.end.iso8601duration.P24Y
481             #1;-----;1;1;0;diseases.ageOfOnset.ageRange.end.iso8601duration.P39Y
482             }
483 72         1368 return $n_00, \$out_ascii, \@out_csv;
484             }
485              
486             sub recreate_array {
487 72     72 0 150 my ( $glob_hash, $sorted_keys_glob_hash ) = @_;
488 72         119 my @recreated_array;
489 72         227 foreach my $key (@$sorted_keys_glob_hash) {
490 11448         20155 for ( my $i = 0 ; $i < $glob_hash->{$key} ; $i++ ) {
491 11736         24269 push @recreated_array, $key;
492             }
493             }
494 72         253 return \@recreated_array;
495             }
496              
497             sub create_glob_and_ref_hashes {
498 15     15 0 39 my ( $array, $weight, $self ) = @_;
499 15         34 my $primary_key = $self->{primary_key};
500 15         25 my $glob_hash = {};
501 15         34 my $ref_hash_flattened;
502              
503 15         24 my $count = 1;
504 15         31 for my $element ( @{$array} ) {
  15         33  
505              
506             # For consistency, we obtain the primary_key for both BFF/PXF
507             # from $_->{id} (not from subject.id)
508 456 50       1906 my $id = $element->{$primary_key}
509             or die
510             "Sorry but the JSON document [$count] does not have the primary_key <$primary_key> defined\n";
511              
512             # Remapping hash
513 456 50       1561 say "Flattening and remapping ..." if $self->{verbose};
514              
515 456         2630 my $ref_hash = remap_hash(
516             {
517             hash => $element,
518             weight => $weight,
519             self => $self
520             }
521             );
522              
523             # *** IMPORTANT ***
524             # We eliminate keys with weight = 0 if defined $weight;
525 456 100       2897 prune_keys_with_weight_zero($ref_hash) if defined $weight;
526              
527             # Load big hash ref_hash_flattened
528 456         1935 $ref_hash_flattened->{$id} = $ref_hash;
529              
530             # The idea is to create a $glob_hash with unique key-values
531             # Duplicates will be automatically merged
532 456         32139 $glob_hash = { %$glob_hash, %$ref_hash };
533              
534             # To keep track of array element indexes
535 456         4341 $count++;
536             }
537 15         112 return ( $glob_hash, $ref_hash_flattened );
538             }
539              
540             sub randomize_variables {
541 0     0 0 0 my ( $glob_hash, $self ) = @_;
542 0         0 my $max = $self->{max_number_vars};
543 0         0 my $seed = $self->{seed};
544              
545             # set random seed
546 0         0 srand($seed);
547              
548             # Randomize
549             # NB:keys have to be sorted for srand to work!!!!
550             # perl -MList::Util=shuffle -E 'srand 123; say shuffle 1 .. 5'
551 0         0 my @items = ( shuffle sort keys %$glob_hash )[ 0 .. $max - 1 ];
552              
553             # Create a new hash with a hash slice
554 0         0 my %new_glob_hash;
555 0         0 @new_glob_hash{@items} = @{$glob_hash}{@items};
  0         0  
556              
557             # return reference
558 0         0 return \%new_glob_hash;
559             }
560              
561             sub prune_excluded_included {
562 458     458 0 969 my ( $hash, $self ) = @_;
563 458         618 my @included = @{ $self->{include_terms} };
  458         1323  
564 458         688 my @excluded = @{ $self->{exclude_terms} };
  458         1134  
565              
566             # Die if we have both options at the same time
567 458 50 66     1346 die "Sorry, <--include> and <--exclude> are mutually exclusive\n"
568             if ( @included && @excluded );
569              
570             # *** IMPORTANT ***
571             # Original $hash is modified
572              
573             # INCLUDED
574 458 100       1194 if (@included) {
575 25         81 for my $key ( keys %$hash ) {
576 125 100   225   451 delete $hash->{$key} unless any { $_ eq $key } @included;
  225         586  
577             }
578             }
579              
580             # EXCLUDED
581 458 50       888 if (@excluded) {
582 0         0 for my $key (@excluded) {
583 0 0       0 delete $hash->{$key} if exists $hash->{$key};
584             }
585             }
586              
587             # We will do nothing if @included = @excluded = [] (DEFAULT)
588 458         906 return 1;
589             }
590              
591             sub set_excluded_phenotypicFeatures {
592 458     458 0 950 my ( $hash, $switch, $format ) = @_;
593              
594             # Ensure phenotypicFeatures exist before processing
595 458 100       1199 return 1 unless exists $hash->{phenotypicFeatures};
596              
597 320         666 foreach my $feature ( @{ $hash->{phenotypicFeatures} } ) {
  320         916  
598              
599             # Skip if 'excluded' is not set or false
600 787 100       3358 next unless $feature->{excluded};
601              
602             # NB: remaining phenotypicFeatures:1.excluded
603             # will be discarded by $exclude_variables_regex_qr later
604 449 100       3355 if ($switch) {
605              
606             # Determine the correct ID field based on the format
607 2 50       8 my $id_field = $format eq 'BFF' ? 'featureType' : 'type';
608              
609             # Append '_excluded' to the appropriate ID
610 2         9 $feature->{$id_field}{id} .= '_excluded';
611             }
612             else {
613             # Remove the feature by setting it to undef
614 447         1428 $feature = undef;
615              
616             # Due to properties being set to undef, it's possible for the coverage file to
617             # report phenotypicFeatures as 100% by all "excluded" = true
618             }
619             }
620              
621 320         618 return 1;
622             }
623              
624             sub remap_hash {
625 458     458 0 884 my $arg = shift;
626 458         1048 my $hash = $arg->{hash};
627 458         785 my $weight = $arg->{weight};
628 458         785 my $self = $arg->{self}; # $self from $arg
629 458         843 my $nodes = $self->{nodes};
630 458         767 my $edges = $self->{edges};
631 458         776 my $format = $self->{format};
632 458         1050 my $switch = $self->{retain_excluded_phenotypicFeatures};
633 458         624 my %out_hash;
634              
635             # Do some pruning excluded / included
636 458         1803 prune_excluded_included( $hash, $self );
637              
638             # *** IMPORTANT ***
639             # The user may include a term that:
640             # 1 - may not exist in any individual
641             # 2 - does not exist in some individuals
642             # If the term does not exist in a individual
643             # - a) -include-terms contains ANOTHER TERM THAT EXISTS
644             # %$hash will contain keys => OK
645             # - b) -include-terms only includes the term/terms not present
646             # %$hash = 0 , then we return {}, to avoid trouble w/ Fold.pm
647             #print Dumper $hash;
648 458 50       1091 return {} unless %$hash;
649              
650             # A bit more pruning plus folding
651             # NB: Hash::Fold keeps white spaces on keys
652             #
653             # Options for 1D-array folding:
654             # A) Array to Hash then Fold
655             # B) Fold then Regex <=== CHOSEN
656             # - Avoids the need for deep cloning
657             # - Works across any JSON data structure (without specific key requirements)
658             # - BUT profiling shows it's ~5-10% slower than 'Array to Hash then Fold'
659             # - Does not accommodate specific remappings like 'interpretations.diagnosis.genomicInterpretations'
660 458         1130 set_excluded_phenotypicFeatures( $hash, $switch, $format );
661 458         2242 $hash = fold($hash);
662              
663             # Load the hash that points to the hierarchy for ontology-term-id
664             # *** IMPORTANT ***
665             # - phenotypicFeatures.featureType.id => BFF
666             # - phenotypicFeatures.type.id => PXF
667 458         3746953 my $id_correspondence = $self->{id_correspondence};
668              
669             # Load values for the for loop
670 458         1263 my $exclude_variables_regex_qr = $self->{exclude_variables_regex_qr};
671 458         2264 my $misc_regex_qr =
672             qr/1900-01-01|NA0000|NCIT:C126101|P999Y|P9999Y|phenopacket_id/;
673              
674             # Pre-compile a list of fixed scalar values to exclude into a hash for quick lookup
675             my %exclude_values =
676 458         1201 map { $_ => 1 }
  2290         5644  
677             ( 'NA', 'NaN', 'Fake', 'None:No matching concept', 'Not Available' );
678              
679             # Now we proceed for each key
680 458         1105 for my $key ( keys %{$hash} ) {
  458         16580  
681              
682             # To see which ones were discarded
683             #say $key if !defined $hash->{$key};
684              
685             # Discard undefined
686 87191 100       160227 next unless defined $hash->{$key};
687              
688             # Discarding lines with 'low quality' keys (Time of regex profiled with :NYTProf: ms time)
689             # Some can be "rescued" by adding the ontology as ($1)
690             # NB1: We discard _labels too!!
691             # NB2: info|metaData are always discarded
692             next
693 84392 100 100     480318 if ( defined $exclude_variables_regex_qr
694             && $key =~ $exclude_variables_regex_qr );
695              
696             # The user can turn on age related values
697             next
698             if ( ( $format eq 'PXF' || $format eq 'BFF' )
699             && $key =~ m/\.age(?!nt)|onset/i
700 33663 100 100     301746 && !$self->{age} ); # $self->{age} [0|1]
      100        
      66        
701              
702             # Load values
703 32154         49961 my $val = $hash->{$key};
704              
705             # Discarding lines with unsupported val (Time profiled with :NYTProf: ms time)
706             next
707             if (
708             ( ref($val) eq 'HASH'
709 1         5 && !keys %{$val} ) # Discard {} (e.g.,subject.vitalStatus: {})
710 600         2194 || ( ref($val) eq 'ARRAY' && !@{$val} ) # Discard []
711 32154 100 66     260457 || exists $exclude_values{$val}
      66        
      100        
      66        
      66        
712             || $val =~ $misc_regex_qr
713             );
714              
715             # Add IDs to key
716 23564         44956 my $id_key = add_id2key( $key, $hash, $self );
717              
718             # Finally add value to id_key
719 23564         38790 my $tmp_key_at_variable_level = $id_key . '.' . $val;
720              
721             # Add HPO ascendants
722 23564 50 33     44162 if ( defined $edges && $val =~ /^HP:/ ) {
723 0         0 my $ascendants =
724             add_hpo_ascendants( $tmp_key_at_variable_level, $nodes, $edges );
725 0         0 $out_hash{$_} = 1 for @$ascendants; # weight 1 for now
726             }
727              
728             ##################
729             # Assign weights #
730             ##################
731              
732             # NB: mrueda (04-12-23) - it's ok if $weight == undef => NO AUTOVIVIFICATION!
733             # NB: We don't warn if user selection does not exist, just assign 1
734              
735 23564         28324 my $tmp_key_at_term_level = $tmp_key_at_variable_level;
736              
737             # If variable has . then capture $1
738 23564 50       47624 if ( $tmp_key_at_term_level =~ m/\./ ) {
739              
740             # NB: For long str regex is faster than (split /\./, $foo)[0]
741 23564         48510 $tmp_key_at_term_level =~ m/^(\w+)\./;
742 23564         37334 $tmp_key_at_term_level = $1;
743             }
744              
745 23564 100       34960 if ( defined $weight ) {
746              
747             # *** IMPORTANT ***
748             # ORDER MATTERS !!!!
749             # We allow for assigning weights by TERM (e.g., 1D)
750             # but VARIABLE level takes precedence to TERM
751              
752             $out_hash{$tmp_key_at_variable_level} =
753              
754             # VARIABLE LEVEL
755             # NB: exists stringifies the weights
756             exists $weight->{$tmp_key_at_variable_level}
757             ? $weight->{$tmp_key_at_variable_level} + 0 # coercing to number
758              
759             # TERM LEVEL
760             : exists $weight->{$tmp_key_at_term_level}
761 5292 50       20546 ? $weight->{$tmp_key_at_term_level} + 0 # coercing to number
    100          
762              
763             # NO WEIGHT
764             : 1;
765              
766             }
767             else {
768              
769             # Assign a weight of 1 if no users weights
770 18272         42492 $out_hash{$tmp_key_at_variable_level} = 1;
771              
772             }
773              
774             ##############
775             # label Hash #
776             ##############
777              
778             # Finally we load the Nomenclature hash
779 23564         32364 my $label = $key;
780 23564         62246 $label =~ s/id/label/;
781             $nomenclature{$tmp_key_at_variable_level} = $hash->{$label}
782 23564 100       82654 if defined $hash->{$label};
783             }
784              
785             # *** IMPORTANT ***
786             # We have to return an object {} when undef
787 458   50     32853 return \%out_hash // {};
788             }
789              
790             sub add_hpo_ascendants {
791 0     0 0 0 my ( $key, $nodes, $edges ) = @_;
792              
793             # First we obtain the ontology (0000539) from HP:0000539
794 0         0 $key =~ m/HP:(\w+)$/;
795 0         0 my $ontology = $1;
796              
797             # We'll use it to build a string equivalent to a key from $edges
798 0         0 my $hpo_url = 'http://purl.obolibrary.org/obo/HP_';
799 0         0 my $hpo_key = $hpo_url . $ontology;
800              
801             # We will include all ascendants in an array
802 0         0 my @ascendants;
803 0         0 for my $parent_id ( @{ $edges->{$hpo_key} } ) {
  0         0  
804              
805             # We have to create a copy to not modify the original $parent_id
806             # as it can appear in multiple individuals
807 0         0 my $copy_parent_id = $parent_id;
808 0         0 $copy_parent_id =~ m/\/(\w+)$/;
809 0         0 $copy_parent_id = $1;
810 0         0 $copy_parent_id =~ tr/_/:/;
811              
812             # *** IMPORTANT ***
813             # We cannot add any label to the ascendants, otherwise they will
814             # not be matched by an indv down the tree
815             # Myopia
816             # Mild Myopia
817             # We want that 'Mild Myopia' matches 'Myopia', thus we can not add a label from 'Mild Myopia'
818             # Use the labels only for debug
819 0         0 my $asc_key = DEVEL_MODE ? $key . '.HPO_asc_DEBUG_ONLY' : $key;
820 0         0 $asc_key =~ s/HP:$ontology/$copy_parent_id/g;
821 0         0 push @ascendants, $asc_key;
822              
823             # We finally add the label to %nomenclature
824 0         0 my $hpo_asc_str = $hpo_url
825             . $copy_parent_id; # 'http://purl.obolibrary.org/obo/HP_HP:0000539
826 0         0 $hpo_asc_str =~ s/HP://; # 0000539
827 0         0 $nomenclature{$asc_key} = $nodes->{$hpo_asc_str}{lbl};
828             }
829 0         0 return \@ascendants;
830             }
831              
832             sub add_id2key {
833 23564     23564 0 39470 my ( $key, $hash, $self ) = @_;
834 23564         46568 my $id_correspondence = $self->{id_correspondence}{ $self->{format} };
835 23564         30256 my $array_regex_qr = $self->{array_regex_qr};
836 23564         31696 my $array_terms_regex_qr = $self->{array_terms_regex_qr};
837              
838             #############
839             # OBJECTIVE #
840             #############
841              
842             # This subroutine is important as it replaces the index (numeric) for a given
843             # array element by a selected ontology. It's done for all subkeys on that element
844              
845             #"interventionsOrProcedures" : [
846             # {
847             # "bodySite" : {
848             # "id" : "NCIT:C12736",
849             # "label" : "intestine"
850             # },
851             # "procedureCode" : {
852             # "id" : "NCIT:C157823",
853             # "label" : "Colon Resection"
854             # }
855             # },
856             # {
857             # "bodySite" : {
858             # "id" : "NCIT:C12736",
859             # "label" : "intestine"
860             # },
861             # "procedureCode" : {
862             # "id" : "NCIT:C86074",
863             # "label" : "Hemicolectomy"
864             # }
865             # },
866             #]
867             #
868             # Will become:
869             #
870             #"interventionsOrProcedures.NCIT:C157823.bodySite.id.NCIT:C12736" : 1,
871             #"interventionsOrProcedures.NCIT:C157823.procedureCode.id.NCIT:C157823" : 1,
872             #"interventionsOrProcedures.NCIT:C86074.bodySite.id.NCIT:C12736" : 1,
873             #"interventionsOrProcedures.NCIT:C86074.procedureCode.id.NCIT:C86074" : 1,
874             #
875             # To make the replacement we use $id_correspondence, then we perform a regex
876             # to fetch the key parts
877              
878             # Only proceed if $key is one of the array_terms
879 23564 100       97223 if ( $key =~ $array_terms_regex_qr ) {
880              
881             # Now we use $array_regex_qr to capture $1, $2 and $3 for BFF/PXF
882             # NB: For others (e.g., MXF) we will have only $1 and $2
883 22358         82526 $key =~ $array_regex_qr;
884              
885             #say "$array_regex_qr -- [$key] <$1> <$2> <$3>"; # $3 can be undefined
886              
887 22358         28541 my ( $tmp_key, $val );
888              
889             # Normal behaviour for BFF/PXF
890 22358 100       44769 if ( defined $3 ) {
891              
892             # If id_correspondence is an array (e.g., medicalActions) we have to grep the right match
893 22228         24731 my $correspondence;
894 22228 50       57396 if ( ref $id_correspondence->{$1} eq ref [] ) {
895              
896             # $1 $2 $3
897             # <0>
898 0         0 my $subkey = ( split /\./, $3 )[0]; # treatment
899             $correspondence =
900 0     0   0 first { $_ =~ m/^$subkey/ }
901 0         0 @{ $id_correspondence->{$1} }; # treatment.agent.id
  0         0  
902             }
903             else {
904 22228         34022 $correspondence = $id_correspondence->{$1};
905             }
906              
907             # Now that we know which is the term we use to find key-val in $hash
908 22228         54160 $tmp_key =
909             $1 . ':'
910             . $2 . '.'
911             . $correspondence; # medicalActions.0.treatment.agent.id
912 22228         37742 $val = $hash->{$tmp_key}; # DrugCentral:257
913 22228         53565 $key = join '.', $1, $val, $3
914             ; # medicalActions.DrugCentral:257.treatment.routeOfAdministration.id
915             }
916              
917             # MXF or similar (...we haven't encountered other regex yet)
918             else {
919              
920 130         265 $tmp_key = $1 . ':' . $2;
921 130         139 $val = $hash->{$tmp_key};
922 130         184 $key = $1;
923             }
924             }
925              
926             # $key = 'Bar:1' means that we have array but the user either:
927             # a) Made a mistake in the config
928             # b) Is not using the right config file
929             else {
930 1206 50       2947 die
931             "<$1> contains array elements but is not defined as an array in <$self->{config_file}>. Please check your syntax and configuration file.\n"
932             if $key =~ m/^(\w+):/;
933             }
934              
935 23564         44595 return $key;
936             }
937              
938             sub create_binary_digit_string {
939 17     17 0 53 my ( $export, $weight, $glob_hash, $cmp_hash ) = @_;
940 17         40 my %out_hash;
941              
942             # *** IMPORTANT ***
943             # Being a nested for, keys %{$glob_hash} does not need sorting
944             # BUT, we sort to follow the same order as serialized (sorted)
945 17         42 my @sorted_keys_glob_hash = sort keys %{$glob_hash};
  17         1314  
946              
947             # IDs of each indidividual
948 17         137 for my $individual_id ( keys %{$cmp_hash} ) { # no need to sort
  17         141  
949              
950             # One-hot encoding = Representing categorical data as numerical
951 458         966 my ( $binary_str, $binary_str_weighted ) = ('') x 2;
952              
953 458         716 for my $key (@sorted_keys_glob_hash) {
954 62270         78325 my $has_value = exists $cmp_hash->{$individual_id}{$key};
955 62270 100       80041 $binary_str .= $has_value ? '1' : '0';
956 62270 100       92453 if ( defined $weight ) {
957             $binary_str_weighted .=
958             $has_value
959             ? ( '1' x $glob_hash->{$key} )
960 13396 100       26856 : ( '0' x $glob_hash->{$key} );
961             }
962             }
963              
964             # If weight is not defined, simply assign the unweighted string once.
965 458 100       1025 $binary_str_weighted = $binary_str unless defined $weight;
966              
967 458         1669 $out_hash{$individual_id}{binary_digit_string} = $binary_str;
968             $out_hash{$individual_id}{binary_digit_string_weighted} =
969 458         809 $binary_str_weighted;
970              
971 458 50       922 if ( defined $export ) {
972              
973             # Convert string => raw bytes > zlib-compres => Base64
974             $out_hash{$individual_id}{zlib_base64_binary_digit_string} =
975 0         0 binary_to_base64($binary_str);
976              
977             $out_hash{$individual_id}{zlib_base64_binary_digit_string_weighted}
978             = defined $weight
979             ? binary_to_base64($binary_str_weighted)
980 0 0       0 : $out_hash{$individual_id}{zlib_base64_binary_digit_string};
981             }
982              
983             }
984 17         338 return \%out_hash;
985             }
986              
987             sub parse_hpo_json {
988 0     0 0 0 my $data = shift;
989              
990             # The file is a structured representation of the Human Phenotype Ontology (HPO) in JSON format.
991             # The HPO is structured into a directed acyclic graph (DAG)
992             # Here's a brief overview of the structure of the hpo.json file:
993             # - graphs: This key contains an array of ontology graphs. In the case of HPO, there is only one graph. The graph has two main keys:
994             # - nodes: An array of objects, each representing an HPO term. Each term object has the following keys:
995             # - id: The identifier of the term (e.g., "HP:0000118").
996             # - lbl: The label (name) of the term (e.g., "Phenotypic abnormality").
997             # - meta: Metadata associated with the term, including definition, synonyms, and other information.
998             # - type: The type of the term, usually "CLASS".
999             # - edges: An array of objects, each representing a relationship between two HPO terms. Each edge object has the following keys:
1000             # - sub: The subject (child) term ID (e.g., "HP:0000924").
1001             # - obj: The object (parent) term ID (e.g., "HP:0000118").
1002             # - pred: The predicate that describes the relationship between the subject and object terms, typically "is_a" in HPO.
1003             # - meta: This key contains metadata about the HPO ontology as a whole, such as version information, description, and other details.
1004              
1005 0         0 my $graph = $data->{graphs}->[0];
1006 0         0 my %nodes = map { $_->{id} => $_ } @{ $graph->{nodes} };
  0         0  
  0         0  
1007 0         0 my %edges = ();
1008              
1009 0         0 for my $edge ( @{ $graph->{edges} } ) {
  0         0  
1010 0         0 my $child_id = $edge->{sub};
1011 0         0 my $parent_id = $edge->{obj};
1012 0         0 push @{ $edges{$child_id} }, $parent_id;
  0         0  
1013             }
1014 0         0 return \%nodes, \%edges;
1015             }
1016              
1017             sub prune_keys_with_weight_zero {
1018 97     97 0 190 my $hash_ref = shift;
1019              
1020             # Iterate over the keys of the hash
1021 97         193 foreach my $key ( keys %{$hash_ref} ) {
  97         1102  
1022              
1023             # Delete the key if its value is 0
1024 5077 100       8834 delete $hash_ref->{$key} if $hash_ref->{$key} == 0;
1025             }
1026             }
1027              
1028             sub guess_label {
1029 72     72 0 95 my $input_string = shift;
1030              
1031 72 50       175 if (
1032             $input_string =~ /\. # Match a literal dot
1033             ([^\.]+) # Match and capture everything except a dot
1034             $ # Anchor to the end of the string
1035             /x
1036             )
1037             {
1038 72         147 return $1;
1039             }
1040              
1041             # If no dot is found, return the original string
1042 0           return $input_string;
1043             }
1044              
1045             sub binary_to_base64 {
1046 0     0 0   my $binary_string = shift;
1047              
1048             # Convert binary string (e.g. "0010...") to raw bytes
1049 0           my $raw_data = pack( "B*", $binary_string );
1050              
1051             # Compress the raw data (note: compressing very short data may not save space)
1052 0           my $compressed = compress($raw_data);
1053              
1054             # Base64 encode the compressed data, without any newline breaks
1055 0           return encode_base64( $compressed, "" );
1056             }
1057              
1058             sub _base64_to_binary {
1059 0     0     my ( $b64_string, $original_length ) = @_;
1060              
1061             # Decode the Base64 encoded compressed data
1062 0           my $compressed_data = decode_base64($b64_string);
1063              
1064             # Decompress the data back to raw bytes
1065 0 0         my $raw_data = uncompress($compressed_data)
1066             or die "Decompression failed: $Compress::Zlib::gzerrno\n";
1067              
1068             # Convert the raw bytes back into a binary string (sequence of 0s and 1s)
1069 0           my $binary_string = unpack( "B*", $raw_data );
1070              
1071             # Trim the binary string to the original length to remove any padded bits
1072 0           return substr( $binary_string, 0, $original_length );
1073             }
1074              
1075             1;