File Coverage

lib/Pheno/Ranker/Align.pm
Criterion Covered Total %
statement 262 307 85.3
branch 65 88 73.8
condition 20 38 52.6
subroutine 23 26 88.4
pod 0 14 0.0
total 370 473 78.2


line stmt bran cond sub pod time code
1             package Pheno::Ranker::Align;
2              
3 3     3   21 use strict;
  3         5  
  3         130  
4 3     3   15 use warnings;
  3         5  
  3         82  
5 3     3   15 use autodie;
  3         4  
  3         20  
6 3     3   17303 use feature qw(say);
  3         10  
  3         257  
7 3     3   18 use List::Util qw(any shuffle);
  3         6  
  3         208  
8              
9             #use List::MoreUtils qw(duplicates);
10 3     3   18 use Data::Dumper;
  3         6  
  3         139  
11 3     3   1834 use Sort::Naturally qw(nsort);
  3         13351  
  3         211  
12 3     3   1505 use Hash::Fold fold => { array_delimiter => ':' };
  3         106585  
  3         24  
13 3     3   2455 use Pheno::Ranker::Stats;
  3         9  
  3         246  
14              
15 3     3   21 use Exporter 'import';
  3         5  
  3         138  
16             our @EXPORT =
17             qw(check_format cohort_comparison compare_and_rank create_glob_and_ref_hashes randomize_variables remap_hash create_binary_digit_string parse_hpo_json);
18              
19 3     3   15 use constant DEVEL_MODE => 0;
  3         6  
  3         13419  
20              
21             our %nomenclature = ();
22              
23             sub check_format {
24              
25 3     3 0 7 my $data = shift;
26 3 50       23 return exists $data->[0]{subject} ? 'PXF' : 'BFF';
27             }
28              
29             sub cohort_comparison {
30              
31 5     5 0 14 my ( $ref_binary_hash, $self ) = @_;
32 5         17 my $out_file = $self->{out_file};
33              
34             say "Performing COHORT comparison"
35 5 50 33     43 if ( $self->{debug} || $self->{verbose} );
36              
37             # *** IMPORTANT ***
38             # The ids/cohorts are naturally sorted (it won't match --append-prefixes!!!)
39 5         9 my @sorted_keys_ref_binary_hash = nsort( keys %{$ref_binary_hash} );
  5         260  
40              
41             # Print to $out_file
42             #
43 5         23610 open( my $fh, ">", $out_file );
44 5         7218 say $fh "\t", join "\t", @sorted_keys_ref_binary_hash;
45              
46             # NB: It's a symmetric matrix so we could just compute
47             # triangle. However, R needs the whole matrix
48              
49             # Initialize the 2D matrix
50 5         19 my @matrix;
51              
52             # I elements
53 5         30 for my $i ( 0 .. $#sorted_keys_ref_binary_hash ) {
54             say "Calculating <"
55             . $sorted_keys_ref_binary_hash[$i]
56             . "> against the cohort..."
57 163 50       332 if $self->{verbose};
58             my $str1 = $ref_binary_hash->{ $sorted_keys_ref_binary_hash[$i] }
59 163         235 {binary_digit_string_weighted};
60 163         241 print $fh $sorted_keys_ref_binary_hash[$i] . "\t";
61              
62             # J elements
63 163         257 for my $j ( 0 .. $#sorted_keys_ref_binary_hash ) {
64             my $str2 = $ref_binary_hash->{ $sorted_keys_ref_binary_hash[$j] }
65 5875         7525 {binary_digit_string_weighted};
66 5875         5109 my $distance;
67              
68 5875 100       7546 if ( $j < $i ) {
    100          
69              
70             # Use the distance from the lower triangle
71 2856         3175 $distance = $matrix[$j][$i];
72             }
73             elsif ( $j == $i ) {
74 163         190 $distance = 0; # Diagonal element
75             }
76             else {
77             # Compute the distance and store it in the matrix
78 2856         3869 $distance = hd_fast( $str1, $str2 );
79 2856         4140 $matrix[$i][$j] = $distance;
80             }
81              
82             # Fill out the other triangle
83 5875         6879 $matrix[$j][$i] = $distance;
84              
85 5875         8488 print $fh $distance . "\t";
86             }
87              
88 163         237 print $fh "\n";
89             }
90              
91 5         35 close $fh;
92 5 50 33     3266 say "Matrix saved to <$out_file>" if ( $self->{debug} || $self->{verbose} );
93 5         238 return 1;
94             }
95              
96             sub compare_and_rank {
97              
98 1     1 0 4 my $arg = shift;
99 1         3 my $glob_hash = $arg->{glob_hash};
100 1         3 my $ref_binary_hash = $arg->{ref_binary_hash};
101 1         3 my $tar_binary_hash = $arg->{tar_binary_hash};
102 1         2 my $weight = $arg->{weight};
103 1         84 my $self = $arg->{self};
104 1         4 my $sort_by = $self->{sort_by};
105 1         3 my $align = $self->{align};
106 1         3 my $max_out = $self->{max_out};
107              
108             say "Performing COHORT(REF)-PATIENT(TAR) comparison"
109 1 50 33     10 if ( $self->{debug} || $self->{verbose} );
110              
111             # Hash for compiling distances
112 1         3 my $score;
113              
114             # Hash for stats
115             my $stat;
116              
117             # Load TAR binary string
118 1         2 my ($tar) = keys %{$tar_binary_hash};
  1         4  
119             my $tar_str_weighted =
120 1         3 $tar_binary_hash->{$tar}{binary_digit_string_weighted};
121              
122 1         3 for my $key ( keys %{$ref_binary_hash} ) { # No need to sort
  1         15  
123              
124             # Load REF binary string
125             my $ref_str_weighted =
126 36         100 $ref_binary_hash->{$key}{binary_digit_string_weighted};
127 36 50       65 say "Comparing <id:$key> --- <id:$tar>" if $self->{verbose};
128             say "REF:$ref_str_weighted\nTAR:$tar_str_weighted\n"
129 36 50 33     64 if ( defined $self->{debug} && $self->{debug} > 1 );
130             $score->{$key}{hamming} =
131 36         83 hd_fast( $ref_str_weighted, $tar_str_weighted );
132             $score->{$key}{jaccard} =
133 36         66 jaccard_similarity( $ref_str_weighted, $tar_str_weighted );
134              
135             # Add values
136 36         46 push @{ $stat->{hamming_data} }, $score->{$key}{hamming};
  36         77  
137 36         42 push @{ $stat->{jaccard_data} }, $score->{$key}{jaccard};
  36         66  
138             }
139 1         9 $stat->{hamming_stats} = add_stats( $stat->{hamming_data} );
140 1         6 $stat->{jaccard_stats} = add_stats( $stat->{jaccard_data} );
141              
142             # Initialize a few variables
143 1         8 my @headers = (
144             'RANK', 'REFERENCE(ID)',
145             'TARGET(ID)', 'FORMAT',
146             'LENGTH', 'WEIGHTED',
147             'HAMMING-DISTANCE', 'DISTANCE-Z-SCORE',
148             'DISTANCE-P-VALUE', 'DISTANCE-Z-SCORE(RAND)',
149             'JACCARD-INDEX', 'JACCARD-Z-SCORE',
150             'JACCARD-P-VALUE'
151             );
152 1         6 my $header = join "\t", @headers;
153 1         4 my @results = $header;
154 1         3 my %info;
155 1         3 my $length_align = length($tar_str_weighted);
156 1 50       4 my $weight_bool = $weight ? 'True' : 'False';
157 1         3 my @alignments_ascii;
158             my $alignment_str_csv;
159 1         3 my @alignments_csv = join ';',
160             qw/id ref indicator tar weight hamming-distance json-path label/;
161              
162             # The dataframe will have two header lines
163             # *** IMPORTANT ***
164             # nsort does not yield same results as canonical from JSON::XS
165             # NB: we're sorting here and in create_binary_digit_string()
166 1         2 my @sort_keys_glob_hash = sort keys %{$glob_hash};
  1         134  
167 1 100       8 my @labels = map { exists $nomenclature{$_} ? $nomenclature{$_} : $_ }
  159         303  
168             @sort_keys_glob_hash;
169              
170             # Die if #elements in arrays differ
171 1 50       6 die "Mismatch between variables and labels"
172             if @sort_keys_glob_hash != @labels;
173              
174             # Labels
175             # NB: there is a comma in 'Serum Glutamic Pyruvic Transaminase, CTCAE'
176 1         21 my @dataframe = join ';', 'Id', @labels;
177              
178             # Variables (json path)
179 1         22 push @dataframe, join ';', 'Id', @sort_keys_glob_hash;
180              
181             # 0/1
182             push @dataframe, join ';', qq/T|$tar/,
183 1         49 ( split //, $tar_binary_hash->{$tar}{binary_digit_string} ); # Add Target
184              
185             # Sort %score by value and load results
186 1         10 my $count = 1;
187 1         2 $max_out++; # to be able to start w/ ONE
188              
189             # Start loop
190 1         12 for my $key (
191             sort {
192             $sort_by eq 'jaccard' #
193             ? $score->{$b}{$sort_by}
194             <=> $score->{$a}{$sort_by} # 1 to 0 (similarity)
195             : $score->{$a}{$sort_by}
196 144 50       223 <=> $score->{$b}{$sort_by} # 0 to N (distance)
197             } keys %$score
198             )
199             {
200              
201 36 50       88 say "$count: Creating alignment <id:$key>" if $self->{verbose};
202              
203             # Create ASCII alignment
204             # NB: We need it here to get $n_00
205             my ( $n_00, $alignment_str_ascii, $alignment_arr_csv ) =
206             create_alignment(
207             {
208             ref_key => $key,
209             ref_str_weighted =>
210             $ref_binary_hash->{$key}{binary_digit_string_weighted},
211 36         259 tar_str_weighted => $tar_str_weighted,
212             glob_hash => $glob_hash,
213             sorted_keys_glob_hash => \@sort_keys_glob_hash,
214             labels => \@labels
215             }
216             );
217              
218             # *** IMPORTANT ***
219             # The LENGTH of the alignment is based on the #variables in the REF-COHORT
220             # Compute estimated av and dev for binary_string of L = length_align - n_00
221             # Corrected length_align L = length_align - n_00
222 36         113 my $length_align_corrected = $length_align - $n_00;
223 36         124 ( $stat->{hamming_stats}{mean_rnd}, $stat->{hamming_stats}{sd_rnd} ) =
224             estimate_hamming_stats($length_align_corrected);
225              
226             # Compute a few stats
227             my $hamming_z_score = z_score(
228             $score->{$key}{hamming},
229             $stat->{hamming_stats}{mean},
230             $stat->{hamming_stats}{sd}
231 36         118 );
232             my $hamming_z_score_from_random = z_score(
233             $score->{$key}{hamming},
234             $stat->{hamming_stats}{mean_rnd},
235             $stat->{hamming_stats}{sd_rnd}
236 36         73 );
237              
238             #my $hamming_p_value =
239             # p_value( $score->{$key}{hamming}, $length_align_corrected );
240 36         61 my $hamming_p_value_from_z_score =
241             p_value_from_z_score($hamming_z_score);
242             my $jaccard_z_score = z_score(
243             $score->{$key}{jaccard},
244             $stat->{jaccard_stats}{mean},
245             $stat->{jaccard_stats}{sd}
246 36         98 );
247 36         76 my $jaccard_p_value_from_z_score =
248             p_value_from_z_score( 1 - $jaccard_z_score );
249              
250             # Create a hash with formats
251             my $format = {
252             'RANK' => { value => $count, format => undef },
253             'REFERENCE(ID)' => { value => $key, format => undef },
254             'TARGET(ID)' => { value => $tar, format => undef },
255             'FORMAT' => { value => $self->{format}, format => undef },
256             'WEIGHTED' => { value => $weight_bool, format => undef },
257             'LENGTH' => { value => $length_align_corrected, format => '%6d' },
258             'HAMMING-DISTANCE' =>
259             { value => $score->{$key}{hamming}, format => '%4d' },
260             'DISTANCE-Z-SCORE' =>
261             { value => $hamming_z_score, format => '%7.3f' },
262             'DISTANCE-P-VALUE' =>
263             { value => $hamming_p_value_from_z_score, format => '%12.7f' },
264             'DISTANCE-Z-SCORE(RAND)' =>
265             { value => $hamming_z_score_from_random, format => '%8.4f' },
266             'JACCARD-INDEX' =>
267 36         677 { value => $score->{$key}{jaccard}, format => '%7.3f' },
268             'JACCARD-Z-SCORE' =>
269             { value => $jaccard_z_score, format => '%7.3f' },
270             'JACCARD-P-VALUE' =>
271             { value => $jaccard_p_value_from_z_score, format => '%12.7f' },
272             };
273              
274             # Serialize results
275             my $tmp_str = join "\t", map {
276 36         88 defined $format->{$_}{format}
277             ? sprintf( $format->{$_}{format}, $format->{$_}{value} )
278             : $format->{$_}{value}
279 468 100       1629 } @headers;
280 36         112 push @results, $tmp_str;
281              
282             # To save memory only load if --align
283 36 50       70 if ( defined $align ) {
284              
285             # Add all of the above to @alignments_ascii
286 36         64 my $sep = ('-') x 80;
287 36         1100 push @alignments_ascii,
288             qq/#$header\n$tmp_str\n$sep\n$$alignment_str_ascii/;
289              
290             # Add all of the above to @alignments_csv
291 36         945 push @alignments_csv, @$alignment_arr_csv;
292              
293             # Add data to dataframe
294             push @dataframe, join ';', qq/R|$key/,
295 36         1451 ( split //, $ref_binary_hash->{$key}{binary_digit_string} );
296              
297             # Add values to info
298             $info{$key} = {
299             weighted => $weight_bool eq 'True'
300             ? JSON::XS::true
301             : JSON::XS::false,
302             reference_id => $key,
303             target_id => $tar,
304             reference_binary_string =>
305             $ref_binary_hash->{$key}{binary_digit_string},
306             target_binary_string =>
307             $tar_binary_hash->{$key}{binary_digit_string},
308             reference_binary_string_weighted =>
309             $ref_binary_hash->{$key}{binary_digit_string_weighted},
310             target_binary_string_weighted =>
311             $tar_binary_hash->{$key}{binary_digit_string_weighted},
312             alignment_length => $length_align_corrected,
313             hamming_distance => $score->{$key}{hamming},
314             hamming_z_score => $hamming_z_score,
315             hamming_p_value => $hamming_p_value_from_z_score,
316             jaccard_similarity => $score->{$key}{jaccard},
317             jaccard_z_score => $jaccard_z_score,
318             jaccard_p_value => $jaccard_p_value_from_z_score,
319             jaccard_distance => 1 - $score->{$key}{jaccard},
320             format => $self->{format},
321 36 50       438 alignment => $$alignment_str_ascii,
322             };
323              
324             }
325              
326 36         1560 $count++;
327 36 100       469 last if $count == $max_out;
328             }
329              
330 1         60 return \@results, \%info, \@alignments_ascii, \@dataframe, \@alignments_csv;
331             }
332              
333             sub create_alignment {
334              
335             # NB: The alignment will use the weighted string
336 36     36 0 61 my $arg = shift;
337 36         60 my $ref_key = $arg->{ref_key};
338 36         47 my $binary_string1 = $arg->{ref_str_weighted};
339 36         39 my $binary_string2 = $arg->{tar_str_weighted};
340 36         46 my $sorted_keys_glob_hash = $arg->{sorted_keys_glob_hash};
341 36         42 my $labels = $arg->{labels};
342 36         46 my $glob_hash = $arg->{glob_hash};
343 36         53 my $length1 = length($binary_string1);
344 36         46 my $length2 = length($binary_string2);
345              
346             # Check that l1 = l2
347 36 50       71 die "The binary strings must have the same length"
348             if ( $length1 != $length2 );
349              
350             # Expand array to have weights as N-elements
351 36         61 my $recreated_array = recreate_array( $glob_hash, $sorted_keys_glob_hash );
352              
353             # Initialize some variables
354 36         55 my $out_ascii = "REF -- TAR\n";
355 36         36 my @out_csv;
356 36         47 my $cum_distance = 0;
357 36         36 my $n_00 = 0;
358              
359             # For loop with 2 variables
360             # i index for weighted
361             # j the number of variables
362 36         55 my ( $i, $j ) = ( 0, 0 );
363 36         67 for ( $i = 0 ; $i < $length1 ; $i++, $j++ ) {
364              
365             # Load key and value
366 5724         7760 my $key = $recreated_array->[$i];
367 5724         9358 my $val = sprintf( "%3d", $glob_hash->{$key} );
368              
369             # We have to keep track with $j
370 5724         6087 my $sorted_key = $sorted_keys_glob_hash->[$j];
371 5724         5932 my $label = $labels->[$j];
372              
373             # Load chars
374 5724         6883 my $char1 = substr( $binary_string1, $i, 1 );
375 5724         6081 my $char2 = substr( $binary_string2, $i, 1 );
376 5724 100 100     11789 $n_00++ if ( $char1 == 0 && $char2 == 0 );
377              
378             # Correct $i index by adding weights
379 5724         6537 $i = $i + $glob_hash->{$key} - 1;
380              
381             # Load metrics
382 5724 100       8082 $cum_distance += $glob_hash->{$key} if $char1 ne $char2;
383 5724         8226 my $cum_distance_pretty = sprintf( "%3d", $cum_distance );
384 5724 100       8244 my $distance = $char1 eq $char2 ? 0 : $glob_hash->{$key};
385 5724         7398 my $distance_pretty = sprintf( "%3d", $distance );
386              
387             # w = weight, d = distance, cd = cumul distance
388 5724         11626 my %format = (
389             '11' => '-----',
390             '10' => 'xxx--',
391             '01' => '--xxx',
392             '00' => ' '
393             );
394 5724         15241 $out_ascii .=
395             qq/$char1 $format{ $char1 . $char2 } $char2 | (w:$val|d:$distance_pretty|cd:$cum_distance_pretty|) $sorted_key ($label)\n/;
396 5724         26134 push @out_csv,
397             qq/$ref_key;$char1;$format{ $char1 . $char2 };$char2;$glob_hash->{$key};$distance;$sorted_key;$label/;
398              
399             #REF(107:week_0_arm_1);indicator;TAR(125:week_0_arm_1);weight;hamming-distance;json-path
400             #0;;0;1;0;diseases.ageOfOnset.ageRange.end.iso8601duration.P16Y
401             #0;;0;1;0;diseases.ageOfOnset.ageRange.end.iso8601duration.P24Y
402             #1;-----;1;1;0;diseases.ageOfOnset.ageRange.end.iso8601duration.P39Y
403             }
404 36         354 return $n_00, \$out_ascii, \@out_csv;
405             }
406              
407             sub recreate_array {
408              
409 36     36 0 61 my ( $glob_hash, $sorted_keys_glob_hash ) = @_;
410 36         38 my @recreated_array;
411 36         64 foreach my $key (@$sorted_keys_glob_hash) {
412 5724         8491 for ( my $i = 0 ; $i < $glob_hash->{$key} ; $i++ ) {
413 6012         10328 push @recreated_array, $key;
414             }
415             }
416 36         73 return \@recreated_array;
417             }
418              
419             sub create_glob_and_ref_hashes {
420              
421 6     6 0 19 my ( $array, $weight, $self ) = @_;
422 6         19 my $primary_key = $self->{primary_key};
423 6         14 my $glob_hash = {};
424 6         9 my $ref_hash_flattened;
425              
426 6         14 for my $i ( @{$array} ) {
  6         18  
427              
428             # For consistency, we obtain the primary_key for both BFF/PXF
429             # from $_->{id} (not from subject.id)
430 199         502 my $id = $i->{$primary_key};
431 199 50       430 say "Flattening and remapping <id:$id> ..." if $self->{verbose};
432 199         675 my $ref_hash = remap_hash(
433             {
434             hash => $i,
435             weight => $weight,
436             self => $self
437             }
438             );
439 199         731 $ref_hash_flattened->{$id} = $ref_hash;
440              
441             # The idea is to create a $glob_hash with unique key-values
442             # Duplicates will be automatically merged
443 199         7991 $glob_hash = { %$glob_hash, %$ref_hash };
444             }
445 6         48 return ( $glob_hash, $ref_hash_flattened );
446             }
447              
448             sub randomize_variables {
449              
450 0     0 0 0 my ( $glob_hash, $self ) = @_;
451 0         0 my $max = $self->{max_number_var};
452 0         0 my $seed = $self->{seed};
453              
454             # set random seed
455 0         0 srand($seed);
456              
457             # Randomize
458             # NB:keys have to be sorted for srand to work!!!!
459             # perl -MList::Util=shuffle -E 'srand 123; say shuffle 1 .. 5'
460 0         0 my @items = ( shuffle sort keys %$glob_hash )[ 0 .. $max - 1 ];
461              
462             # Create a new hash with a hash slice
463 0         0 my %new_glob_hash;
464 0         0 @new_glob_hash{@items} = @{$glob_hash}{@items};
  0         0  
465              
466             # return reference
467 0         0 return \%new_glob_hash;
468             }
469              
470             sub prune_excluded_included {
471              
472 200     200 0 351 my ( $hash, $self ) = @_;
473 200         231 my @included = @{ $self->{include_terms} };
  200         414  
474 200         244 my @excluded = @{ $self->{exclude_terms} };
  200         310  
475              
476             # Die if we have both options at the same time
477 200 50 66     482 die "Sorry, <--include> and <--exclude> are mutually exclusive\n"
478             if ( @included && @excluded );
479              
480             # *** IMPORTANT ***
481             # Original $hash is modified
482              
483             # INCLUDED
484 200 100       366 if (@included) {
485 25         52 for my $key ( keys %$hash ) {
486 125 100   225   335 delete $hash->{$key} unless any { $_ eq $key } @included;
  225         442  
487             }
488             }
489              
490             # EXCLUDED
491 200 50       352 if (@excluded) {
492 0         0 for my $key (@excluded) {
493 0 0       0 delete $hash->{$key} if exists $hash->{$key};
494             }
495             }
496              
497             # We will do nothing if @included = @excluded = [] (DEFAULT)
498 200         286 return 1;
499             }
500              
501             sub undef_excluded_phenotypicFeatures {
502              
503 200     200 0 250 my $hash = shift;
504              
505             # Setting the property to undef (it will be discarded later)
506 200 100       448 if ( exists $hash->{phenotypicFeatures} ) {
507 71         261 @{ $hash->{phenotypicFeatures} } =
508 71 100       99 map { $_->{excluded} ? undef : $_ } @{ $hash->{phenotypicFeatures} };
  179         1412  
  71         293  
509             }
510              
511             # *** IMPORTANT ***
512             # Because of setting to undef the excludd properties, it can happen that in
513             # the stats file we have phenotypicFeatures = 100% but t hen it turns out
514             # that some individuals have phenotypicFeatures = {} (all excluded)
515 200         661 return $hash;
516             }
517              
518             sub remap_hash {
519              
520 200     200 0 335 my $arg = shift;
521 200         278 my $hash = $arg->{hash};
522 200         239 my $weight = $arg->{weight};
523 200         240 my $self = $arg->{self};
524 200         274 my $nodes = $self->{nodes};
525 200         240 my $edges = $self->{edges};
526 200         203 my $out_hash;
527              
528             # Do some pruning excluded / included
529 200         424 prune_excluded_included( $hash, $self );
530              
531             # *** IMPORTANT ***
532             # The user may include a term that:
533             # A - may not exist in any individual
534             # B - does not exist in some individuals
535             # If the term does not exist in a invidual
536             # - a) -include-terms contains ANOTHER TERM THAT EXISTS
537             # %$hash will contain keys => OK
538             # - b) -include-terms only includes the term/terms not present
539             # %$hash = 0 , then we return {}, to avoid trouble w/ Fold.pm
540             #print Dumper $hash;
541 200 50       407 return {} unless %$hash;
542              
543             # A bit more pruning plus collapsing
544 200         339 $hash = fold( undef_excluded_phenotypicFeatures($hash) );
545              
546             # Load the hash that points to the hierarchy for ontology
547             # *** IMPORTANT ***
548             # - phenotypicFeatures.featureType.id => BFF
549             # - phenotypicFeatures.type.id => PXF
550 200         885026 my $id_correspondence = $self->{id_correspondence};
551              
552             # Now we proceed for each key
553 200         288 for my $key ( keys %{$hash} ) {
  200         3573  
554              
555             # To see which ones were discarded
556             #say $key if !defined $hash->{$key};
557              
558             # Discard undefined
559 20603 100       34262 next unless defined $hash->{$key};
560              
561             # Discarding lines with 'low quality' keys (Time of regex profiled with :NYTProf: ms time)
562             # Some can be "rescued" by adding the ontology as ($1)
563             # NB1: We discard _labels too!!
564             # NB2: info|metaData are always discarded
565              
566 19847         23069 my $regex = $self->{exclude_properties_regex};
567             next
568 19847 100 100     92135 if ( $regex && $key =~ m/$regex/ )
569             ; # $regex has to be defined and be != ''
570              
571             # The user can turn on age related values
572 7976 100 66     28330 next if ( $key =~ m/age|onset/i && !$self->{age} ); # $self->{age} [0|1]
573              
574             # Load values
575 7563         9405 my $val = $hash->{$key};
576              
577             # Discarding lines with val (Time profiled with :NYTProf: ms time)
578             next
579 7563 100 33     39017 if ( $val eq 'NA'
      33        
      66        
580             || $val eq 'Fake'
581             || $val eq 'None:No matching concept'
582             || $val =~ m/1900-01-01|NA0000|P999Y|P9999Y|ARRAY|phenopacket_id/ );
583              
584             # Add IDs to key
585 5715         8912 my $id_key = add_id2key( $key, $hash, $self );
586              
587             # Finally add value to id_key
588 5715         11139 my $tmp_key = $id_key . '.' . $val;
589              
590             # Add HPO ascendants
591 5715 50 33     9955 if ( defined $edges && $val =~ /^HP:/ ) {
592 0         0 my $ascendants = add_hpo_ascendants( $tmp_key, $nodes, $edges );
593 0         0 $out_hash->{$_} = 1 for @$ascendants; # weight 1 for now
594             }
595              
596             ##################
597             # Assign weights #
598             ##################
599             # NB: mrueda (04-12-23) - it's ok if $weight == undef => NO AUTOVIVIFICATION!
600             # NB: We don't warn if it does not exist, just assign 1
601             # *** IMPORTANT *** 07-26-2023
602             # We allow for assigning weights by TERM (e.g., 1D)
603             # but VARIABLE level takes precedence to TERM
604              
605 5715         5936 my $tmp_key_at_term_level = $tmp_key;
606              
607             # If variable has . then capture $1
608 5715 50       10317 if ( $tmp_key_at_term_level =~ m/\./ ) {
609              
610             # NB: For long str regex is faster than (split /\./, $foo)[0]
611 5715         9891 $tmp_key_at_term_level =~ m/^(\w+)\./;
612 5715         9094 $tmp_key_at_term_level = $1;
613             }
614              
615             # ORDER MATTERS !!!!
616             $out_hash->{$tmp_key} =
617              
618             # VARIABLE LEVEL
619             exists $weight->{$tmp_key}
620             ? $weight->{$tmp_key}
621              
622             # TERM LEVEL
623             : exists $weight->{$tmp_key_at_term_level}
624 5715 50       14147 ? $weight->{$tmp_key_at_term_level}
    100          
625              
626             # NO WEIGHT
627             : 1;
628              
629             # Finally we load the Nomenclature hash
630 5715         6342 my $label = $key;
631 5715         13185 $label =~ s/id/label/;
632 5715 100       17650 $nomenclature{$tmp_key} = $hash->{$label} if defined $hash->{$label};
633             }
634              
635             # *** IMPORTANT ***
636             # We have to return an object {} when undef
637 200   50     4433 return $out_hash // {};
638             }
639              
640             sub add_hpo_ascendants {
641              
642 0     0 0 0 my ( $key, $nodes, $edges ) = @_;
643              
644             # First we obtain the ontology (0000539) from HP:0000539
645 0         0 $key =~ m/HP:(\w+)$/;
646 0         0 my $ontology = $1;
647              
648             # We'll use it to build a string equivalent to a key from $edges
649 0         0 my $hpo_url = 'http://purl.obolibrary.org/obo/HP_';
650 0         0 my $hpo_key = $hpo_url . $ontology;
651              
652             # We will include all ascendants in an array
653 0         0 my @ascendants;
654 0         0 for my $parent_id ( @{ $edges->{$hpo_key} } ) {
  0         0  
655              
656             # We have to create a copy to not modify the original $parent_id
657             # as it can appear in multiple individuals
658 0         0 my $copy_parent_id = $parent_id;
659 0         0 $copy_parent_id =~ m/\/(\w+)$/;
660 0         0 $copy_parent_id = $1;
661 0         0 $copy_parent_id =~ tr/_/:/;
662              
663             # *** IMPORTANT ***
664             # We cannot add any label to the ascendants, otherwise they will
665             # not be matched by an indv down the tree
666             # Myopia
667             # Mild Myopia
668             # We want that 'Mild Myopia' matches 'Myopia', thus we can not add a label from 'Mild Myopia'
669             # Use the labels only for debug
670 0         0 my $asc_key = DEVEL_MODE ? $key . '.HPO_asc_DEBUG_ONLY' : $key;
671 0         0 $asc_key =~ s/HP:$ontology/$copy_parent_id/g;
672 0         0 push @ascendants, $asc_key;
673              
674             # We finally add the label to %nomenclature
675 0         0 my $hpo_asc_str = $hpo_url
676             . $copy_parent_id; # 'http://purl.obolibrary.org/obo/HP_HP:0000539
677 0         0 $hpo_asc_str =~ s/HP://; # 0000539
678 0         0 $nomenclature{$asc_key} = $nodes->{$hpo_asc_str}{lbl};
679             }
680 0         0 return \@ascendants;
681             }
682              
683             sub add_id2key {
684              
685 5715     5715 0 8421 my ( $key, $hash, $self ) = @_;
686 5715         8485 my $id_correspondence = $self->{id_correspondence}{ $self->{format} };
687 5715         5708 my $array_terms_str = join '|', @{ $self->{array_terms} };
  5715         11642  
688 5715         7831 my $array_regex = $self->{array_regex};
689              
690 5715 100       19241 if ( $key =~ /$array_terms_str/ ) {
691 5094         14959 $key =~ m/$array_regex/;
692 5094         6176 my ( $tmp_key, $val );
693              
694             # Normal behaviour
695 5094 100       8942 if ( defined $3 ) {
696 4964         11548 $tmp_key = $1 . ':' . $2 . '.' . $id_correspondence->{$1};
697 4964         6868 $val = $hash->{$tmp_key};
698 4964         9405 $key = $1 . '.' . $val . '.' . $3;
699             }
700              
701             # Only two
702             else {
703              
704 130         254 $tmp_key = $1 . ':' . $2;
705 130         175 $val = $hash->{$tmp_key};
706 130         183 $key = $1;
707             }
708             }
709 5715         9011 return $key;
710             }
711              
712             sub create_binary_digit_string {
713              
714 7     7 0 24 my ( $glob_hash, $cmp_hash ) = @_;
715 7         17 my $out_hash;
716              
717             # *** IMPORTANT ***
718             # Being a nested for, keys %{$glob_hash} does not need sorting
719             # BUT, we sort to follow the same order as serialized (sorted)
720 7         15 my @sorted_keys_glob_hash = sort keys %{$glob_hash};
  7         474  
721              
722             # IDs of each indidividual
723 7         43 for my $individual_id ( keys %{$cmp_hash} ) { # no need to sort
  7         62  
724              
725             # One-hot encoding = Representing categorical data as numerical
726 200         319 my ( $binary_str, $binary_str_weighted ) = ('') x 2;
727 200         246 for my $key (@sorted_keys_glob_hash) {
728 21837         24713 my $ones = (1) x $glob_hash->{$key};
729 21837         23273 my $zeros = (0) x $glob_hash->{$key};
730 21837 100       29574 $binary_str .= exists $cmp_hash->{$individual_id}{$key} ? 1 : 0;
731             $binary_str_weighted .=
732 21837 100       31135 exists $cmp_hash->{$individual_id}{$key} ? $ones : $zeros;
733             }
734 200         478 $out_hash->{$individual_id}{binary_digit_string} = $binary_str;
735             $out_hash->{$individual_id}{binary_digit_string_weighted} =
736 200         333 $binary_str_weighted;
737             }
738 7         91 return $out_hash;
739             }
740              
741             sub parse_hpo_json {
742              
743 0     0 0   my $data = shift;
744              
745             # The <hp.json> file is a structured representation of the Human Phenotype Ontology (HPO) in JSON format.
746             # The HPO is structured into a directed acyclic graph (DAG)
747             # Here's a brief overview of the structure of the hpo.json file:
748             # - graphs: This key contains an array of ontology graphs. In the case of HPO, there is only one graph. The graph has two main keys:
749             # - nodes: An array of objects, each representing an HPO term. Each term object has the following keys:
750             # - id: The identifier of the term (e.g., "HP:0000118").
751             # - lbl: The label (name) of the term (e.g., "Phenotypic abnormality").
752             # - meta: Metadata associated with the term, including definition, synonyms, and other information.
753             # - type: The type of the term, usually "CLASS".
754             # - edges: An array of objects, each representing a relationship between two HPO terms. Each edge object has the following keys:
755             # - sub: The subject (child) term ID (e.g., "HP:0000924").
756             # - obj: The object (parent) term ID (e.g., "HP:0000118").
757             # - pred: The predicate that describes the relationship between the subject and object terms, typically "is_a" in HPO.
758             # - meta: This key contains metadata about the HPO ontology as a whole, such as version information, description, and other details.
759              
760 0           my $graph = $data->{graphs}->[0];
761 0           my %nodes = map { $_->{id} => $_ } @{ $graph->{nodes} };
  0            
  0            
762 0           my %edges = ();
763              
764 0           for my $edge ( @{ $graph->{edges} } ) {
  0            
765 0           my $child_id = $edge->{sub};
766 0           my $parent_id = $edge->{obj};
767 0           push @{ $edges{$child_id} }, $parent_id;
  0            
768             }
769 0           return \%nodes, \%edges;
770             }
771             1;