line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Pheno::Ranker::Align; |
2
|
|
|
|
|
|
|
|
3
|
3
|
|
|
3
|
|
21
|
use strict; |
|
3
|
|
|
|
|
5
|
|
|
3
|
|
|
|
|
130
|
|
4
|
3
|
|
|
3
|
|
15
|
use warnings; |
|
3
|
|
|
|
|
5
|
|
|
3
|
|
|
|
|
82
|
|
5
|
3
|
|
|
3
|
|
15
|
use autodie; |
|
3
|
|
|
|
|
4
|
|
|
3
|
|
|
|
|
20
|
|
6
|
3
|
|
|
3
|
|
17303
|
use feature qw(say); |
|
3
|
|
|
|
|
10
|
|
|
3
|
|
|
|
|
257
|
|
7
|
3
|
|
|
3
|
|
18
|
use List::Util qw(any shuffle); |
|
3
|
|
|
|
|
6
|
|
|
3
|
|
|
|
|
208
|
|
8
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
#use List::MoreUtils qw(duplicates); |
10
|
3
|
|
|
3
|
|
18
|
use Data::Dumper; |
|
3
|
|
|
|
|
6
|
|
|
3
|
|
|
|
|
139
|
|
11
|
3
|
|
|
3
|
|
1834
|
use Sort::Naturally qw(nsort); |
|
3
|
|
|
|
|
13351
|
|
|
3
|
|
|
|
|
211
|
|
12
|
3
|
|
|
3
|
|
1505
|
use Hash::Fold fold => { array_delimiter => ':' }; |
|
3
|
|
|
|
|
106585
|
|
|
3
|
|
|
|
|
24
|
|
13
|
3
|
|
|
3
|
|
2455
|
use Pheno::Ranker::Stats; |
|
3
|
|
|
|
|
9
|
|
|
3
|
|
|
|
|
246
|
|
14
|
|
|
|
|
|
|
|
15
|
3
|
|
|
3
|
|
21
|
use Exporter 'import'; |
|
3
|
|
|
|
|
5
|
|
|
3
|
|
|
|
|
138
|
|
16
|
|
|
|
|
|
|
our @EXPORT = |
17
|
|
|
|
|
|
|
qw(check_format cohort_comparison compare_and_rank create_glob_and_ref_hashes randomize_variables remap_hash create_binary_digit_string parse_hpo_json); |
18
|
|
|
|
|
|
|
|
19
|
3
|
|
|
3
|
|
15
|
use constant DEVEL_MODE => 0; |
|
3
|
|
|
|
|
6
|
|
|
3
|
|
|
|
|
13419
|
|
20
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
our %nomenclature = (); |
22
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
sub check_format { |
24
|
|
|
|
|
|
|
|
25
|
3
|
|
|
3
|
0
|
7
|
my $data = shift; |
26
|
3
|
50
|
|
|
|
23
|
return exists $data->[0]{subject} ? 'PXF' : 'BFF'; |
27
|
|
|
|
|
|
|
} |
28
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
sub cohort_comparison { |
30
|
|
|
|
|
|
|
|
31
|
5
|
|
|
5
|
0
|
14
|
my ( $ref_binary_hash, $self ) = @_; |
32
|
5
|
|
|
|
|
17
|
my $out_file = $self->{out_file}; |
33
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
say "Performing COHORT comparison" |
35
|
5
|
50
|
33
|
|
|
43
|
if ( $self->{debug} || $self->{verbose} ); |
36
|
|
|
|
|
|
|
|
37
|
|
|
|
|
|
|
# *** IMPORTANT *** |
38
|
|
|
|
|
|
|
# The ids/cohorts are naturally sorted (it won't match --append-prefixes!!!) |
39
|
5
|
|
|
|
|
9
|
my @sorted_keys_ref_binary_hash = nsort( keys %{$ref_binary_hash} ); |
|
5
|
|
|
|
|
260
|
|
40
|
|
|
|
|
|
|
|
41
|
|
|
|
|
|
|
# Print to $out_file |
42
|
|
|
|
|
|
|
# |
43
|
5
|
|
|
|
|
23610
|
open( my $fh, ">", $out_file ); |
44
|
5
|
|
|
|
|
7218
|
say $fh "\t", join "\t", @sorted_keys_ref_binary_hash; |
45
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
# NB: It's a symmetric matrix so we could just compute |
47
|
|
|
|
|
|
|
# triangle. However, R needs the whole matrix |
48
|
|
|
|
|
|
|
|
49
|
|
|
|
|
|
|
# Initialize the 2D matrix |
50
|
5
|
|
|
|
|
19
|
my @matrix; |
51
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
# I elements |
53
|
5
|
|
|
|
|
30
|
for my $i ( 0 .. $#sorted_keys_ref_binary_hash ) { |
54
|
|
|
|
|
|
|
say "Calculating <" |
55
|
|
|
|
|
|
|
. $sorted_keys_ref_binary_hash[$i] |
56
|
|
|
|
|
|
|
. "> against the cohort..." |
57
|
163
|
50
|
|
|
|
332
|
if $self->{verbose}; |
58
|
|
|
|
|
|
|
my $str1 = $ref_binary_hash->{ $sorted_keys_ref_binary_hash[$i] } |
59
|
163
|
|
|
|
|
235
|
{binary_digit_string_weighted}; |
60
|
163
|
|
|
|
|
241
|
print $fh $sorted_keys_ref_binary_hash[$i] . "\t"; |
61
|
|
|
|
|
|
|
|
62
|
|
|
|
|
|
|
# J elements |
63
|
163
|
|
|
|
|
257
|
for my $j ( 0 .. $#sorted_keys_ref_binary_hash ) { |
64
|
|
|
|
|
|
|
my $str2 = $ref_binary_hash->{ $sorted_keys_ref_binary_hash[$j] } |
65
|
5875
|
|
|
|
|
7525
|
{binary_digit_string_weighted}; |
66
|
5875
|
|
|
|
|
5109
|
my $distance; |
67
|
|
|
|
|
|
|
|
68
|
5875
|
100
|
|
|
|
7546
|
if ( $j < $i ) { |
|
|
100
|
|
|
|
|
|
69
|
|
|
|
|
|
|
|
70
|
|
|
|
|
|
|
# Use the distance from the lower triangle |
71
|
2856
|
|
|
|
|
3175
|
$distance = $matrix[$j][$i]; |
72
|
|
|
|
|
|
|
} |
73
|
|
|
|
|
|
|
elsif ( $j == $i ) { |
74
|
163
|
|
|
|
|
190
|
$distance = 0; # Diagonal element |
75
|
|
|
|
|
|
|
} |
76
|
|
|
|
|
|
|
else { |
77
|
|
|
|
|
|
|
# Compute the distance and store it in the matrix |
78
|
2856
|
|
|
|
|
3869
|
$distance = hd_fast( $str1, $str2 ); |
79
|
2856
|
|
|
|
|
4140
|
$matrix[$i][$j] = $distance; |
80
|
|
|
|
|
|
|
} |
81
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
# Fill out the other triangle |
83
|
5875
|
|
|
|
|
6879
|
$matrix[$j][$i] = $distance; |
84
|
|
|
|
|
|
|
|
85
|
5875
|
|
|
|
|
8488
|
print $fh $distance . "\t"; |
86
|
|
|
|
|
|
|
} |
87
|
|
|
|
|
|
|
|
88
|
163
|
|
|
|
|
237
|
print $fh "\n"; |
89
|
|
|
|
|
|
|
} |
90
|
|
|
|
|
|
|
|
91
|
5
|
|
|
|
|
35
|
close $fh; |
92
|
5
|
50
|
33
|
|
|
3266
|
say "Matrix saved to <$out_file>" if ( $self->{debug} || $self->{verbose} ); |
93
|
5
|
|
|
|
|
238
|
return 1; |
94
|
|
|
|
|
|
|
} |
95
|
|
|
|
|
|
|
|
96
|
|
|
|
|
|
|
sub compare_and_rank { |
97
|
|
|
|
|
|
|
|
98
|
1
|
|
|
1
|
0
|
4
|
my $arg = shift; |
99
|
1
|
|
|
|
|
3
|
my $glob_hash = $arg->{glob_hash}; |
100
|
1
|
|
|
|
|
3
|
my $ref_binary_hash = $arg->{ref_binary_hash}; |
101
|
1
|
|
|
|
|
3
|
my $tar_binary_hash = $arg->{tar_binary_hash}; |
102
|
1
|
|
|
|
|
2
|
my $weight = $arg->{weight}; |
103
|
1
|
|
|
|
|
84
|
my $self = $arg->{self}; |
104
|
1
|
|
|
|
|
4
|
my $sort_by = $self->{sort_by}; |
105
|
1
|
|
|
|
|
3
|
my $align = $self->{align}; |
106
|
1
|
|
|
|
|
3
|
my $max_out = $self->{max_out}; |
107
|
|
|
|
|
|
|
|
108
|
|
|
|
|
|
|
say "Performing COHORT(REF)-PATIENT(TAR) comparison" |
109
|
1
|
50
|
33
|
|
|
10
|
if ( $self->{debug} || $self->{verbose} ); |
110
|
|
|
|
|
|
|
|
111
|
|
|
|
|
|
|
# Hash for compiling distances |
112
|
1
|
|
|
|
|
3
|
my $score; |
113
|
|
|
|
|
|
|
|
114
|
|
|
|
|
|
|
# Hash for stats |
115
|
|
|
|
|
|
|
my $stat; |
116
|
|
|
|
|
|
|
|
117
|
|
|
|
|
|
|
# Load TAR binary string |
118
|
1
|
|
|
|
|
2
|
my ($tar) = keys %{$tar_binary_hash}; |
|
1
|
|
|
|
|
4
|
|
119
|
|
|
|
|
|
|
my $tar_str_weighted = |
120
|
1
|
|
|
|
|
3
|
$tar_binary_hash->{$tar}{binary_digit_string_weighted}; |
121
|
|
|
|
|
|
|
|
122
|
1
|
|
|
|
|
3
|
for my $key ( keys %{$ref_binary_hash} ) { # No need to sort |
|
1
|
|
|
|
|
15
|
|
123
|
|
|
|
|
|
|
|
124
|
|
|
|
|
|
|
# Load REF binary string |
125
|
|
|
|
|
|
|
my $ref_str_weighted = |
126
|
36
|
|
|
|
|
100
|
$ref_binary_hash->{$key}{binary_digit_string_weighted}; |
127
|
36
|
50
|
|
|
|
65
|
say "Comparing <id:$key> --- <id:$tar>" if $self->{verbose}; |
128
|
|
|
|
|
|
|
say "REF:$ref_str_weighted\nTAR:$tar_str_weighted\n" |
129
|
36
|
50
|
33
|
|
|
64
|
if ( defined $self->{debug} && $self->{debug} > 1 ); |
130
|
|
|
|
|
|
|
$score->{$key}{hamming} = |
131
|
36
|
|
|
|
|
83
|
hd_fast( $ref_str_weighted, $tar_str_weighted ); |
132
|
|
|
|
|
|
|
$score->{$key}{jaccard} = |
133
|
36
|
|
|
|
|
66
|
jaccard_similarity( $ref_str_weighted, $tar_str_weighted ); |
134
|
|
|
|
|
|
|
|
135
|
|
|
|
|
|
|
# Add values |
136
|
36
|
|
|
|
|
46
|
push @{ $stat->{hamming_data} }, $score->{$key}{hamming}; |
|
36
|
|
|
|
|
77
|
|
137
|
36
|
|
|
|
|
42
|
push @{ $stat->{jaccard_data} }, $score->{$key}{jaccard}; |
|
36
|
|
|
|
|
66
|
|
138
|
|
|
|
|
|
|
} |
139
|
1
|
|
|
|
|
9
|
$stat->{hamming_stats} = add_stats( $stat->{hamming_data} ); |
140
|
1
|
|
|
|
|
6
|
$stat->{jaccard_stats} = add_stats( $stat->{jaccard_data} ); |
141
|
|
|
|
|
|
|
|
142
|
|
|
|
|
|
|
# Initialize a few variables |
143
|
1
|
|
|
|
|
8
|
my @headers = ( |
144
|
|
|
|
|
|
|
'RANK', 'REFERENCE(ID)', |
145
|
|
|
|
|
|
|
'TARGET(ID)', 'FORMAT', |
146
|
|
|
|
|
|
|
'LENGTH', 'WEIGHTED', |
147
|
|
|
|
|
|
|
'HAMMING-DISTANCE', 'DISTANCE-Z-SCORE', |
148
|
|
|
|
|
|
|
'DISTANCE-P-VALUE', 'DISTANCE-Z-SCORE(RAND)', |
149
|
|
|
|
|
|
|
'JACCARD-INDEX', 'JACCARD-Z-SCORE', |
150
|
|
|
|
|
|
|
'JACCARD-P-VALUE' |
151
|
|
|
|
|
|
|
); |
152
|
1
|
|
|
|
|
6
|
my $header = join "\t", @headers; |
153
|
1
|
|
|
|
|
4
|
my @results = $header; |
154
|
1
|
|
|
|
|
3
|
my %info; |
155
|
1
|
|
|
|
|
3
|
my $length_align = length($tar_str_weighted); |
156
|
1
|
50
|
|
|
|
4
|
my $weight_bool = $weight ? 'True' : 'False'; |
157
|
1
|
|
|
|
|
3
|
my @alignments_ascii; |
158
|
|
|
|
|
|
|
my $alignment_str_csv; |
159
|
1
|
|
|
|
|
3
|
my @alignments_csv = join ';', |
160
|
|
|
|
|
|
|
qw/id ref indicator tar weight hamming-distance json-path label/; |
161
|
|
|
|
|
|
|
|
162
|
|
|
|
|
|
|
# The dataframe will have two header lines |
163
|
|
|
|
|
|
|
# *** IMPORTANT *** |
164
|
|
|
|
|
|
|
# nsort does not yield same results as canonical from JSON::XS |
165
|
|
|
|
|
|
|
# NB: we're sorting here and in create_binary_digit_string() |
166
|
1
|
|
|
|
|
2
|
my @sort_keys_glob_hash = sort keys %{$glob_hash}; |
|
1
|
|
|
|
|
134
|
|
167
|
1
|
100
|
|
|
|
8
|
my @labels = map { exists $nomenclature{$_} ? $nomenclature{$_} : $_ } |
|
159
|
|
|
|
|
303
|
|
168
|
|
|
|
|
|
|
@sort_keys_glob_hash; |
169
|
|
|
|
|
|
|
|
170
|
|
|
|
|
|
|
# Die if #elements in arrays differ |
171
|
1
|
50
|
|
|
|
6
|
die "Mismatch between variables and labels" |
172
|
|
|
|
|
|
|
if @sort_keys_glob_hash != @labels; |
173
|
|
|
|
|
|
|
|
174
|
|
|
|
|
|
|
# Labels |
175
|
|
|
|
|
|
|
# NB: there is a comma in 'Serum Glutamic Pyruvic Transaminase, CTCAE' |
176
|
1
|
|
|
|
|
21
|
my @dataframe = join ';', 'Id', @labels; |
177
|
|
|
|
|
|
|
|
178
|
|
|
|
|
|
|
# Variables (json path) |
179
|
1
|
|
|
|
|
22
|
push @dataframe, join ';', 'Id', @sort_keys_glob_hash; |
180
|
|
|
|
|
|
|
|
181
|
|
|
|
|
|
|
# 0/1 |
182
|
|
|
|
|
|
|
push @dataframe, join ';', qq/T|$tar/, |
183
|
1
|
|
|
|
|
49
|
( split //, $tar_binary_hash->{$tar}{binary_digit_string} ); # Add Target |
184
|
|
|
|
|
|
|
|
185
|
|
|
|
|
|
|
# Sort %score by value and load results |
186
|
1
|
|
|
|
|
10
|
my $count = 1; |
187
|
1
|
|
|
|
|
2
|
$max_out++; # to be able to start w/ ONE |
188
|
|
|
|
|
|
|
|
189
|
|
|
|
|
|
|
# Start loop |
190
|
1
|
|
|
|
|
12
|
for my $key ( |
191
|
|
|
|
|
|
|
sort { |
192
|
|
|
|
|
|
|
$sort_by eq 'jaccard' # |
193
|
|
|
|
|
|
|
? $score->{$b}{$sort_by} |
194
|
|
|
|
|
|
|
<=> $score->{$a}{$sort_by} # 1 to 0 (similarity) |
195
|
|
|
|
|
|
|
: $score->{$a}{$sort_by} |
196
|
144
|
50
|
|
|
|
223
|
<=> $score->{$b}{$sort_by} # 0 to N (distance) |
197
|
|
|
|
|
|
|
} keys %$score |
198
|
|
|
|
|
|
|
) |
199
|
|
|
|
|
|
|
{ |
200
|
|
|
|
|
|
|
|
201
|
36
|
50
|
|
|
|
88
|
say "$count: Creating alignment <id:$key>" if $self->{verbose}; |
202
|
|
|
|
|
|
|
|
203
|
|
|
|
|
|
|
# Create ASCII alignment |
204
|
|
|
|
|
|
|
# NB: We need it here to get $n_00 |
205
|
|
|
|
|
|
|
my ( $n_00, $alignment_str_ascii, $alignment_arr_csv ) = |
206
|
|
|
|
|
|
|
create_alignment( |
207
|
|
|
|
|
|
|
{ |
208
|
|
|
|
|
|
|
ref_key => $key, |
209
|
|
|
|
|
|
|
ref_str_weighted => |
210
|
|
|
|
|
|
|
$ref_binary_hash->{$key}{binary_digit_string_weighted}, |
211
|
36
|
|
|
|
|
259
|
tar_str_weighted => $tar_str_weighted, |
212
|
|
|
|
|
|
|
glob_hash => $glob_hash, |
213
|
|
|
|
|
|
|
sorted_keys_glob_hash => \@sort_keys_glob_hash, |
214
|
|
|
|
|
|
|
labels => \@labels |
215
|
|
|
|
|
|
|
} |
216
|
|
|
|
|
|
|
); |
217
|
|
|
|
|
|
|
|
218
|
|
|
|
|
|
|
# *** IMPORTANT *** |
219
|
|
|
|
|
|
|
# The LENGTH of the alignment is based on the #variables in the REF-COHORT |
220
|
|
|
|
|
|
|
# Compute estimated av and dev for binary_string of L = length_align - n_00 |
221
|
|
|
|
|
|
|
# Corrected length_align L = length_align - n_00 |
222
|
36
|
|
|
|
|
113
|
my $length_align_corrected = $length_align - $n_00; |
223
|
36
|
|
|
|
|
124
|
( $stat->{hamming_stats}{mean_rnd}, $stat->{hamming_stats}{sd_rnd} ) = |
224
|
|
|
|
|
|
|
estimate_hamming_stats($length_align_corrected); |
225
|
|
|
|
|
|
|
|
226
|
|
|
|
|
|
|
# Compute a few stats |
227
|
|
|
|
|
|
|
my $hamming_z_score = z_score( |
228
|
|
|
|
|
|
|
$score->{$key}{hamming}, |
229
|
|
|
|
|
|
|
$stat->{hamming_stats}{mean}, |
230
|
|
|
|
|
|
|
$stat->{hamming_stats}{sd} |
231
|
36
|
|
|
|
|
118
|
); |
232
|
|
|
|
|
|
|
my $hamming_z_score_from_random = z_score( |
233
|
|
|
|
|
|
|
$score->{$key}{hamming}, |
234
|
|
|
|
|
|
|
$stat->{hamming_stats}{mean_rnd}, |
235
|
|
|
|
|
|
|
$stat->{hamming_stats}{sd_rnd} |
236
|
36
|
|
|
|
|
73
|
); |
237
|
|
|
|
|
|
|
|
238
|
|
|
|
|
|
|
#my $hamming_p_value = |
239
|
|
|
|
|
|
|
# p_value( $score->{$key}{hamming}, $length_align_corrected ); |
240
|
36
|
|
|
|
|
61
|
my $hamming_p_value_from_z_score = |
241
|
|
|
|
|
|
|
p_value_from_z_score($hamming_z_score); |
242
|
|
|
|
|
|
|
my $jaccard_z_score = z_score( |
243
|
|
|
|
|
|
|
$score->{$key}{jaccard}, |
244
|
|
|
|
|
|
|
$stat->{jaccard_stats}{mean}, |
245
|
|
|
|
|
|
|
$stat->{jaccard_stats}{sd} |
246
|
36
|
|
|
|
|
98
|
); |
247
|
36
|
|
|
|
|
76
|
my $jaccard_p_value_from_z_score = |
248
|
|
|
|
|
|
|
p_value_from_z_score( 1 - $jaccard_z_score ); |
249
|
|
|
|
|
|
|
|
250
|
|
|
|
|
|
|
# Create a hash with formats |
251
|
|
|
|
|
|
|
my $format = { |
252
|
|
|
|
|
|
|
'RANK' => { value => $count, format => undef }, |
253
|
|
|
|
|
|
|
'REFERENCE(ID)' => { value => $key, format => undef }, |
254
|
|
|
|
|
|
|
'TARGET(ID)' => { value => $tar, format => undef }, |
255
|
|
|
|
|
|
|
'FORMAT' => { value => $self->{format}, format => undef }, |
256
|
|
|
|
|
|
|
'WEIGHTED' => { value => $weight_bool, format => undef }, |
257
|
|
|
|
|
|
|
'LENGTH' => { value => $length_align_corrected, format => '%6d' }, |
258
|
|
|
|
|
|
|
'HAMMING-DISTANCE' => |
259
|
|
|
|
|
|
|
{ value => $score->{$key}{hamming}, format => '%4d' }, |
260
|
|
|
|
|
|
|
'DISTANCE-Z-SCORE' => |
261
|
|
|
|
|
|
|
{ value => $hamming_z_score, format => '%7.3f' }, |
262
|
|
|
|
|
|
|
'DISTANCE-P-VALUE' => |
263
|
|
|
|
|
|
|
{ value => $hamming_p_value_from_z_score, format => '%12.7f' }, |
264
|
|
|
|
|
|
|
'DISTANCE-Z-SCORE(RAND)' => |
265
|
|
|
|
|
|
|
{ value => $hamming_z_score_from_random, format => '%8.4f' }, |
266
|
|
|
|
|
|
|
'JACCARD-INDEX' => |
267
|
36
|
|
|
|
|
677
|
{ value => $score->{$key}{jaccard}, format => '%7.3f' }, |
268
|
|
|
|
|
|
|
'JACCARD-Z-SCORE' => |
269
|
|
|
|
|
|
|
{ value => $jaccard_z_score, format => '%7.3f' }, |
270
|
|
|
|
|
|
|
'JACCARD-P-VALUE' => |
271
|
|
|
|
|
|
|
{ value => $jaccard_p_value_from_z_score, format => '%12.7f' }, |
272
|
|
|
|
|
|
|
}; |
273
|
|
|
|
|
|
|
|
274
|
|
|
|
|
|
|
# Serialize results |
275
|
|
|
|
|
|
|
my $tmp_str = join "\t", map { |
276
|
36
|
|
|
|
|
88
|
defined $format->{$_}{format} |
277
|
|
|
|
|
|
|
? sprintf( $format->{$_}{format}, $format->{$_}{value} ) |
278
|
|
|
|
|
|
|
: $format->{$_}{value} |
279
|
468
|
100
|
|
|
|
1629
|
} @headers; |
280
|
36
|
|
|
|
|
112
|
push @results, $tmp_str; |
281
|
|
|
|
|
|
|
|
282
|
|
|
|
|
|
|
# To save memory only load if --align |
283
|
36
|
50
|
|
|
|
70
|
if ( defined $align ) { |
284
|
|
|
|
|
|
|
|
285
|
|
|
|
|
|
|
# Add all of the above to @alignments_ascii |
286
|
36
|
|
|
|
|
64
|
my $sep = ('-') x 80; |
287
|
36
|
|
|
|
|
1100
|
push @alignments_ascii, |
288
|
|
|
|
|
|
|
qq/#$header\n$tmp_str\n$sep\n$$alignment_str_ascii/; |
289
|
|
|
|
|
|
|
|
290
|
|
|
|
|
|
|
# Add all of the above to @alignments_csv |
291
|
36
|
|
|
|
|
945
|
push @alignments_csv, @$alignment_arr_csv; |
292
|
|
|
|
|
|
|
|
293
|
|
|
|
|
|
|
# Add data to dataframe |
294
|
|
|
|
|
|
|
push @dataframe, join ';', qq/R|$key/, |
295
|
36
|
|
|
|
|
1451
|
( split //, $ref_binary_hash->{$key}{binary_digit_string} ); |
296
|
|
|
|
|
|
|
|
297
|
|
|
|
|
|
|
# Add values to info |
298
|
|
|
|
|
|
|
$info{$key} = { |
299
|
|
|
|
|
|
|
weighted => $weight_bool eq 'True' |
300
|
|
|
|
|
|
|
? JSON::XS::true |
301
|
|
|
|
|
|
|
: JSON::XS::false, |
302
|
|
|
|
|
|
|
reference_id => $key, |
303
|
|
|
|
|
|
|
target_id => $tar, |
304
|
|
|
|
|
|
|
reference_binary_string => |
305
|
|
|
|
|
|
|
$ref_binary_hash->{$key}{binary_digit_string}, |
306
|
|
|
|
|
|
|
target_binary_string => |
307
|
|
|
|
|
|
|
$tar_binary_hash->{$key}{binary_digit_string}, |
308
|
|
|
|
|
|
|
reference_binary_string_weighted => |
309
|
|
|
|
|
|
|
$ref_binary_hash->{$key}{binary_digit_string_weighted}, |
310
|
|
|
|
|
|
|
target_binary_string_weighted => |
311
|
|
|
|
|
|
|
$tar_binary_hash->{$key}{binary_digit_string_weighted}, |
312
|
|
|
|
|
|
|
alignment_length => $length_align_corrected, |
313
|
|
|
|
|
|
|
hamming_distance => $score->{$key}{hamming}, |
314
|
|
|
|
|
|
|
hamming_z_score => $hamming_z_score, |
315
|
|
|
|
|
|
|
hamming_p_value => $hamming_p_value_from_z_score, |
316
|
|
|
|
|
|
|
jaccard_similarity => $score->{$key}{jaccard}, |
317
|
|
|
|
|
|
|
jaccard_z_score => $jaccard_z_score, |
318
|
|
|
|
|
|
|
jaccard_p_value => $jaccard_p_value_from_z_score, |
319
|
|
|
|
|
|
|
jaccard_distance => 1 - $score->{$key}{jaccard}, |
320
|
|
|
|
|
|
|
format => $self->{format}, |
321
|
36
|
50
|
|
|
|
438
|
alignment => $$alignment_str_ascii, |
322
|
|
|
|
|
|
|
}; |
323
|
|
|
|
|
|
|
|
324
|
|
|
|
|
|
|
} |
325
|
|
|
|
|
|
|
|
326
|
36
|
|
|
|
|
1560
|
$count++; |
327
|
36
|
100
|
|
|
|
469
|
last if $count == $max_out; |
328
|
|
|
|
|
|
|
} |
329
|
|
|
|
|
|
|
|
330
|
1
|
|
|
|
|
60
|
return \@results, \%info, \@alignments_ascii, \@dataframe, \@alignments_csv; |
331
|
|
|
|
|
|
|
} |
332
|
|
|
|
|
|
|
|
333
|
|
|
|
|
|
|
sub create_alignment { |
334
|
|
|
|
|
|
|
|
335
|
|
|
|
|
|
|
# NB: The alignment will use the weighted string |
336
|
36
|
|
|
36
|
0
|
61
|
my $arg = shift; |
337
|
36
|
|
|
|
|
60
|
my $ref_key = $arg->{ref_key}; |
338
|
36
|
|
|
|
|
47
|
my $binary_string1 = $arg->{ref_str_weighted}; |
339
|
36
|
|
|
|
|
39
|
my $binary_string2 = $arg->{tar_str_weighted}; |
340
|
36
|
|
|
|
|
46
|
my $sorted_keys_glob_hash = $arg->{sorted_keys_glob_hash}; |
341
|
36
|
|
|
|
|
42
|
my $labels = $arg->{labels}; |
342
|
36
|
|
|
|
|
46
|
my $glob_hash = $arg->{glob_hash}; |
343
|
36
|
|
|
|
|
53
|
my $length1 = length($binary_string1); |
344
|
36
|
|
|
|
|
46
|
my $length2 = length($binary_string2); |
345
|
|
|
|
|
|
|
|
346
|
|
|
|
|
|
|
# Check that l1 = l2 |
347
|
36
|
50
|
|
|
|
71
|
die "The binary strings must have the same length" |
348
|
|
|
|
|
|
|
if ( $length1 != $length2 ); |
349
|
|
|
|
|
|
|
|
350
|
|
|
|
|
|
|
# Expand array to have weights as N-elements |
351
|
36
|
|
|
|
|
61
|
my $recreated_array = recreate_array( $glob_hash, $sorted_keys_glob_hash ); |
352
|
|
|
|
|
|
|
|
353
|
|
|
|
|
|
|
# Initialize some variables |
354
|
36
|
|
|
|
|
55
|
my $out_ascii = "REF -- TAR\n"; |
355
|
36
|
|
|
|
|
36
|
my @out_csv; |
356
|
36
|
|
|
|
|
47
|
my $cum_distance = 0; |
357
|
36
|
|
|
|
|
36
|
my $n_00 = 0; |
358
|
|
|
|
|
|
|
|
359
|
|
|
|
|
|
|
# For loop with 2 variables |
360
|
|
|
|
|
|
|
# i index for weighted |
361
|
|
|
|
|
|
|
# j the number of variables |
362
|
36
|
|
|
|
|
55
|
my ( $i, $j ) = ( 0, 0 ); |
363
|
36
|
|
|
|
|
67
|
for ( $i = 0 ; $i < $length1 ; $i++, $j++ ) { |
364
|
|
|
|
|
|
|
|
365
|
|
|
|
|
|
|
# Load key and value |
366
|
5724
|
|
|
|
|
7760
|
my $key = $recreated_array->[$i]; |
367
|
5724
|
|
|
|
|
9358
|
my $val = sprintf( "%3d", $glob_hash->{$key} ); |
368
|
|
|
|
|
|
|
|
369
|
|
|
|
|
|
|
# We have to keep track with $j |
370
|
5724
|
|
|
|
|
6087
|
my $sorted_key = $sorted_keys_glob_hash->[$j]; |
371
|
5724
|
|
|
|
|
5932
|
my $label = $labels->[$j]; |
372
|
|
|
|
|
|
|
|
373
|
|
|
|
|
|
|
# Load chars |
374
|
5724
|
|
|
|
|
6883
|
my $char1 = substr( $binary_string1, $i, 1 ); |
375
|
5724
|
|
|
|
|
6081
|
my $char2 = substr( $binary_string2, $i, 1 ); |
376
|
5724
|
100
|
100
|
|
|
11789
|
$n_00++ if ( $char1 == 0 && $char2 == 0 ); |
377
|
|
|
|
|
|
|
|
378
|
|
|
|
|
|
|
# Correct $i index by adding weights |
379
|
5724
|
|
|
|
|
6537
|
$i = $i + $glob_hash->{$key} - 1; |
380
|
|
|
|
|
|
|
|
381
|
|
|
|
|
|
|
# Load metrics |
382
|
5724
|
100
|
|
|
|
8082
|
$cum_distance += $glob_hash->{$key} if $char1 ne $char2; |
383
|
5724
|
|
|
|
|
8226
|
my $cum_distance_pretty = sprintf( "%3d", $cum_distance ); |
384
|
5724
|
100
|
|
|
|
8244
|
my $distance = $char1 eq $char2 ? 0 : $glob_hash->{$key}; |
385
|
5724
|
|
|
|
|
7398
|
my $distance_pretty = sprintf( "%3d", $distance ); |
386
|
|
|
|
|
|
|
|
387
|
|
|
|
|
|
|
# w = weight, d = distance, cd = cumul distance |
388
|
5724
|
|
|
|
|
11626
|
my %format = ( |
389
|
|
|
|
|
|
|
'11' => '-----', |
390
|
|
|
|
|
|
|
'10' => 'xxx--', |
391
|
|
|
|
|
|
|
'01' => '--xxx', |
392
|
|
|
|
|
|
|
'00' => ' ' |
393
|
|
|
|
|
|
|
); |
394
|
5724
|
|
|
|
|
15241
|
$out_ascii .= |
395
|
|
|
|
|
|
|
qq/$char1 $format{ $char1 . $char2 } $char2 | (w:$val|d:$distance_pretty|cd:$cum_distance_pretty|) $sorted_key ($label)\n/; |
396
|
5724
|
|
|
|
|
26134
|
push @out_csv, |
397
|
|
|
|
|
|
|
qq/$ref_key;$char1;$format{ $char1 . $char2 };$char2;$glob_hash->{$key};$distance;$sorted_key;$label/; |
398
|
|
|
|
|
|
|
|
399
|
|
|
|
|
|
|
#REF(107:week_0_arm_1);indicator;TAR(125:week_0_arm_1);weight;hamming-distance;json-path |
400
|
|
|
|
|
|
|
#0;;0;1;0;diseases.ageOfOnset.ageRange.end.iso8601duration.P16Y |
401
|
|
|
|
|
|
|
#0;;0;1;0;diseases.ageOfOnset.ageRange.end.iso8601duration.P24Y |
402
|
|
|
|
|
|
|
#1;-----;1;1;0;diseases.ageOfOnset.ageRange.end.iso8601duration.P39Y |
403
|
|
|
|
|
|
|
} |
404
|
36
|
|
|
|
|
354
|
return $n_00, \$out_ascii, \@out_csv; |
405
|
|
|
|
|
|
|
} |
406
|
|
|
|
|
|
|
|
407
|
|
|
|
|
|
|
sub recreate_array { |
408
|
|
|
|
|
|
|
|
409
|
36
|
|
|
36
|
0
|
61
|
my ( $glob_hash, $sorted_keys_glob_hash ) = @_; |
410
|
36
|
|
|
|
|
38
|
my @recreated_array; |
411
|
36
|
|
|
|
|
64
|
foreach my $key (@$sorted_keys_glob_hash) { |
412
|
5724
|
|
|
|
|
8491
|
for ( my $i = 0 ; $i < $glob_hash->{$key} ; $i++ ) { |
413
|
6012
|
|
|
|
|
10328
|
push @recreated_array, $key; |
414
|
|
|
|
|
|
|
} |
415
|
|
|
|
|
|
|
} |
416
|
36
|
|
|
|
|
73
|
return \@recreated_array; |
417
|
|
|
|
|
|
|
} |
418
|
|
|
|
|
|
|
|
419
|
|
|
|
|
|
|
sub create_glob_and_ref_hashes { |
420
|
|
|
|
|
|
|
|
421
|
6
|
|
|
6
|
0
|
19
|
my ( $array, $weight, $self ) = @_; |
422
|
6
|
|
|
|
|
19
|
my $primary_key = $self->{primary_key}; |
423
|
6
|
|
|
|
|
14
|
my $glob_hash = {}; |
424
|
6
|
|
|
|
|
9
|
my $ref_hash_flattened; |
425
|
|
|
|
|
|
|
|
426
|
6
|
|
|
|
|
14
|
for my $i ( @{$array} ) { |
|
6
|
|
|
|
|
18
|
|
427
|
|
|
|
|
|
|
|
428
|
|
|
|
|
|
|
# For consistency, we obtain the primary_key for both BFF/PXF |
429
|
|
|
|
|
|
|
# from $_->{id} (not from subject.id) |
430
|
199
|
|
|
|
|
502
|
my $id = $i->{$primary_key}; |
431
|
199
|
50
|
|
|
|
430
|
say "Flattening and remapping <id:$id> ..." if $self->{verbose}; |
432
|
199
|
|
|
|
|
675
|
my $ref_hash = remap_hash( |
433
|
|
|
|
|
|
|
{ |
434
|
|
|
|
|
|
|
hash => $i, |
435
|
|
|
|
|
|
|
weight => $weight, |
436
|
|
|
|
|
|
|
self => $self |
437
|
|
|
|
|
|
|
} |
438
|
|
|
|
|
|
|
); |
439
|
199
|
|
|
|
|
731
|
$ref_hash_flattened->{$id} = $ref_hash; |
440
|
|
|
|
|
|
|
|
441
|
|
|
|
|
|
|
# The idea is to create a $glob_hash with unique key-values |
442
|
|
|
|
|
|
|
# Duplicates will be automatically merged |
443
|
199
|
|
|
|
|
7991
|
$glob_hash = { %$glob_hash, %$ref_hash }; |
444
|
|
|
|
|
|
|
} |
445
|
6
|
|
|
|
|
48
|
return ( $glob_hash, $ref_hash_flattened ); |
446
|
|
|
|
|
|
|
} |
447
|
|
|
|
|
|
|
|
448
|
|
|
|
|
|
|
sub randomize_variables { |
449
|
|
|
|
|
|
|
|
450
|
0
|
|
|
0
|
0
|
0
|
my ( $glob_hash, $self ) = @_; |
451
|
0
|
|
|
|
|
0
|
my $max = $self->{max_number_var}; |
452
|
0
|
|
|
|
|
0
|
my $seed = $self->{seed}; |
453
|
|
|
|
|
|
|
|
454
|
|
|
|
|
|
|
# set random seed |
455
|
0
|
|
|
|
|
0
|
srand($seed); |
456
|
|
|
|
|
|
|
|
457
|
|
|
|
|
|
|
# Randomize |
458
|
|
|
|
|
|
|
# NB:keys have to be sorted for srand to work!!!! |
459
|
|
|
|
|
|
|
# perl -MList::Util=shuffle -E 'srand 123; say shuffle 1 .. 5' |
460
|
0
|
|
|
|
|
0
|
my @items = ( shuffle sort keys %$glob_hash )[ 0 .. $max - 1 ]; |
461
|
|
|
|
|
|
|
|
462
|
|
|
|
|
|
|
# Create a new hash with a hash slice |
463
|
0
|
|
|
|
|
0
|
my %new_glob_hash; |
464
|
0
|
|
|
|
|
0
|
@new_glob_hash{@items} = @{$glob_hash}{@items}; |
|
0
|
|
|
|
|
0
|
|
465
|
|
|
|
|
|
|
|
466
|
|
|
|
|
|
|
# return reference |
467
|
0
|
|
|
|
|
0
|
return \%new_glob_hash; |
468
|
|
|
|
|
|
|
} |
469
|
|
|
|
|
|
|
|
470
|
|
|
|
|
|
|
sub prune_excluded_included { |
471
|
|
|
|
|
|
|
|
472
|
200
|
|
|
200
|
0
|
351
|
my ( $hash, $self ) = @_; |
473
|
200
|
|
|
|
|
231
|
my @included = @{ $self->{include_terms} }; |
|
200
|
|
|
|
|
414
|
|
474
|
200
|
|
|
|
|
244
|
my @excluded = @{ $self->{exclude_terms} }; |
|
200
|
|
|
|
|
310
|
|
475
|
|
|
|
|
|
|
|
476
|
|
|
|
|
|
|
# Die if we have both options at the same time |
477
|
200
|
50
|
66
|
|
|
482
|
die "Sorry, <--include> and <--exclude> are mutually exclusive\n" |
478
|
|
|
|
|
|
|
if ( @included && @excluded ); |
479
|
|
|
|
|
|
|
|
480
|
|
|
|
|
|
|
# *** IMPORTANT *** |
481
|
|
|
|
|
|
|
# Original $hash is modified |
482
|
|
|
|
|
|
|
|
483
|
|
|
|
|
|
|
# INCLUDED |
484
|
200
|
100
|
|
|
|
366
|
if (@included) { |
485
|
25
|
|
|
|
|
52
|
for my $key ( keys %$hash ) { |
486
|
125
|
100
|
|
225
|
|
335
|
delete $hash->{$key} unless any { $_ eq $key } @included; |
|
225
|
|
|
|
|
442
|
|
487
|
|
|
|
|
|
|
} |
488
|
|
|
|
|
|
|
} |
489
|
|
|
|
|
|
|
|
490
|
|
|
|
|
|
|
# EXCLUDED |
491
|
200
|
50
|
|
|
|
352
|
if (@excluded) { |
492
|
0
|
|
|
|
|
0
|
for my $key (@excluded) { |
493
|
0
|
0
|
|
|
|
0
|
delete $hash->{$key} if exists $hash->{$key}; |
494
|
|
|
|
|
|
|
} |
495
|
|
|
|
|
|
|
} |
496
|
|
|
|
|
|
|
|
497
|
|
|
|
|
|
|
# We will do nothing if @included = @excluded = [] (DEFAULT) |
498
|
200
|
|
|
|
|
286
|
return 1; |
499
|
|
|
|
|
|
|
} |
500
|
|
|
|
|
|
|
|
501
|
|
|
|
|
|
|
sub undef_excluded_phenotypicFeatures { |
502
|
|
|
|
|
|
|
|
503
|
200
|
|
|
200
|
0
|
250
|
my $hash = shift; |
504
|
|
|
|
|
|
|
|
505
|
|
|
|
|
|
|
# Setting the property to undef (it will be discarded later) |
506
|
200
|
100
|
|
|
|
448
|
if ( exists $hash->{phenotypicFeatures} ) { |
507
|
71
|
|
|
|
|
261
|
@{ $hash->{phenotypicFeatures} } = |
508
|
71
|
100
|
|
|
|
99
|
map { $_->{excluded} ? undef : $_ } @{ $hash->{phenotypicFeatures} }; |
|
179
|
|
|
|
|
1412
|
|
|
71
|
|
|
|
|
293
|
|
509
|
|
|
|
|
|
|
} |
510
|
|
|
|
|
|
|
|
511
|
|
|
|
|
|
|
# *** IMPORTANT *** |
512
|
|
|
|
|
|
|
# Because of setting to undef the excludd properties, it can happen that in |
513
|
|
|
|
|
|
|
# the stats file we have phenotypicFeatures = 100% but t hen it turns out |
514
|
|
|
|
|
|
|
# that some individuals have phenotypicFeatures = {} (all excluded) |
515
|
200
|
|
|
|
|
661
|
return $hash; |
516
|
|
|
|
|
|
|
} |
517
|
|
|
|
|
|
|
|
518
|
|
|
|
|
|
|
sub remap_hash {

    # Flattens and filters one individual's data structure into a
    # { flattened_key_with_value => weight } hash used for the binary
    # digit string comparison. Also side-effects: fills the package-level
    # %nomenclature hash with labels for each generated key.
    #
    # Arguments (single hashref):
    #   hash   - the individual's (partially pruned) data hash
    #   weight - hashref of user-supplied weights (may be undef)
    #   self   - the Pheno::Ranker object (nodes/edges for HPO, regexes, flags)
    #
    # Returns: hashref (possibly {}), never undef.
    my $arg    = shift;
    my $hash   = $arg->{hash};
    my $weight = $arg->{weight};
    my $self   = $arg->{self};
    my $nodes  = $self->{nodes};
    my $edges  = $self->{edges};
    my $out_hash;

    # Do some pruning excluded / included
    prune_excluded_included( $hash, $self );

    # *** IMPORTANT ***
    # The user may include a term that:
    # A - may not exist in any individual
    # B - does not exist in some individuals
    # If the term does not exist in an individual
    # - a) --include-terms contains ANOTHER TERM THAT EXISTS
    #      %$hash will contain keys => OK
    # - b) --include-terms only includes the term/terms not present
    #      %$hash = 0 , then we return {}, to avoid trouble w/ Fold.pm
    #print Dumper $hash;
    return {} unless %$hash;

    # A bit more pruning plus collapsing (fold() flattens the nested hash
    # into dot/colon-delimited single-level keys; see Hash::Fold)
    $hash = fold( undef_excluded_phenotypicFeatures($hash) );

    # Load the hash that points to the hierarchy for ontology
    # *** IMPORTANT ***
    # - phenotypicFeatures.featureType.id => BFF
    # - phenotypicFeatures.type.id        => PXF
    my $id_correspondence = $self->{id_correspondence};

    # Now we proceed for each key
    for my $key ( keys %{$hash} ) {

        # To see which ones were discarded
        #say $key if !defined $hash->{$key};

        # Discard undefined
        next unless defined $hash->{$key};

        # Discarding lines with 'low quality' keys (time of regex profiled
        # with NYTProf). Some can be "rescued" by adding the ontology as ($1)
        # NB1: We discard _labels too!!
        # NB2: info|metaData are always discarded
        my $regex = $self->{exclude_properties_regex};
        next
          if ( $regex && $key =~ m/$regex/ )
          ;    # $regex has to be defined and be != ''

        # The user can turn on age related values
        next if ( $key =~ m/age|onset/i && !$self->{age} );   # $self->{age} [0|1]

        # Load values
        my $val = $hash->{$key};

        # Discarding lines with placeholder / sentinel values
        # (profiled with NYTProf)
        next
          if ( $val eq 'NA'
            || $val eq 'Fake'
            || $val eq 'None:No matching concept'
            || $val =~ m/1900-01-01|NA0000|P999Y|P9999Y|ARRAY|phenopacket_id/ );

        # Add IDs to key (replaces the array index with the element's id)
        my $id_key = add_id2key( $key, $hash, $self );

        # Finally add value to id_key
        my $tmp_key = $id_key . '.' . $val;

        # Add HPO ascendants (only when an hp.json graph was loaded)
        if ( defined $edges && $val =~ /^HP:/ ) {
            my $ascendants = add_hpo_ascendants( $tmp_key, $nodes, $edges );
            $out_hash->{$_} = 1 for @$ascendants;    # weight 1 for now
        }

        ##################
        # Assign weights #
        ##################
        # NB: mrueda (04-12-23) - it's ok if $weight == undef => NO AUTOVIVIFICATION!
        # NB: We don't warn if it does not exist, just assign 1
        # *** IMPORTANT *** 07-26-2023
        # We allow for assigning weights by TERM (e.g., 1D)
        # but VARIABLE level takes precedence over TERM
        my $tmp_key_at_term_level = $tmp_key;

        # If variable has . then capture $1 (the leading term)
        if ( $tmp_key_at_term_level =~ m/\./ ) {

            # NB: For long str, regex is faster than (split /\./, $foo)[0]
            $tmp_key_at_term_level =~ m/^(\w+)\./;
            $tmp_key_at_term_level = $1;
        }

        # ORDER MATTERS !!!! (variable level > term level > default of 1)
        $out_hash->{$tmp_key} =

          # VARIABLE LEVEL
          exists $weight->{$tmp_key} ? $weight->{$tmp_key}

          # TERM LEVEL
          : exists $weight->{$tmp_key_at_term_level}
          ? $weight->{$tmp_key_at_term_level}

          # NO WEIGHT
          : 1;

        # Finally we load the Nomenclature hash
        # NOTE(review): s/id/label/ replaces the FIRST occurrence of the
        # substring 'id' anywhere in the key, not only a trailing '.id' —
        # presumably fine for the expected key shapes; verify against callers.
        my $label = $key;
        $label =~ s/id/label/;
        $nomenclature{$tmp_key} = $hash->{$label} if defined $hash->{$label};
    }

    # *** IMPORTANT ***
    # We have to return an object {} when undef
    return $out_hash // {};
}
639
|
|
|
|
|
|
|
|
640
|
|
|
|
|
|
|
sub add_hpo_ascendants {

    # Given a flattened key that ends in an HP term (e.g. '...id.HP:0000539'),
    # return an arrayref of equivalent keys for every direct HPO parent of
    # that term. Also records each parent's human-readable label in the
    # package-level %nomenclature hash.
    my ( $key, $nodes, $edges ) = @_;

    # Extract the numeric part (e.g. 0000539) from the trailing HP:0000539
    $key =~ m/HP:(\w+)$/;
    my $hp_number = $1;

    # Build the purl-style identifier used as a key inside %$edges
    my $base_url  = 'http://purl.obolibrary.org/obo/HP_';
    my $graph_key = $base_url . $hp_number;

    my @ascendants;
    for my $parent_uri ( @{ $edges->{$graph_key} } ) {

        # Work on a copy: the original entry is shared across individuals
        # and must not be modified
        my $parent_term = $parent_uri;
        $parent_term =~ m/\/(\w+)$/;
        $parent_term = $1;
        $parent_term =~ tr/_/:/;    # HP_0000539 => HP:0000539

        # *** IMPORTANT ***
        # No label may be attached to an ascendant key, otherwise it would
        # never match the same (more general) term coming from another
        # individual further down the tree, e.g.:
        #   Myopia
        #   Mild Myopia
        # 'Mild Myopia' must match 'Myopia', so no 'Mild Myopia' label.
        # Labels are only appended in DEVEL_MODE, for debugging.
        my $asc_key = DEVEL_MODE ? $key . '.HPO_asc_DEBUG_ONLY' : $key;
        $asc_key =~ s/HP:$hp_number/$parent_term/g;
        push @ascendants, $asc_key;

        # Record the parent's label in %nomenclature
        my $node_key = $base_url . $parent_term;   # ...obo/HP_HP:0000539
        $node_key =~ s/HP://;                      # ...obo/HP_0000539
        $nomenclature{$asc_key} = $nodes->{$node_key}{lbl};
    }
    return \@ascendants;
}
682
|
|
|
|
|
|
|
|
683
|
|
|
|
|
|
|
sub add_id2key {

    # Rewrites a flattened array key so that it carries the element's
    # ontology id instead of its numeric array index, e.g.
    #   phenotypicFeatures:0.featureType.id
    #     => phenotypicFeatures.HP:0000539.featureType.id
    # Non-array keys are returned unchanged.
    my ( $key, $hash, $self ) = @_;

    my $term2id_path = $self->{id_correspondence}{ $self->{format} };
    my $terms_alt    = join '|', @{ $self->{array_terms} };
    my $split_re     = $self->{array_regex};

    # Leave keys that do not belong to any array term untouched
    return $key unless $key =~ /$terms_alt/;

    # Captures: $1 = term, $2 = array index, $3 = trailing path (may be absent)
    $key =~ m/$split_re/;

    if ( defined $3 ) {

        # Normal behaviour: look up the element's id via the term's id path
        my $lookup_key = $1 . ':' . $2 . '.' . $term2id_path->{$1};
        my $id_val     = $hash->{$lookup_key};
        $key = $1 . '.' . $id_val . '.' . $3;
    }
    else {

        # Only two components: collapse to the bare term
        my $lookup_key = $1 . ':' . $2;
        my $id_val     = $hash->{$lookup_key};
        $key = $1;
    }
    return $key;
}
711
|
|
|
|
|
|
|
|
712
|
|
|
|
|
|
|
sub create_binary_digit_string {

    # One-hot encodes each individual's keys against the global key space.
    #   $glob_hash - { key => weight } over ALL individuals
    #   $cmp_hash  - { individual_id => { key => weight } }
    # Returns { individual_id => { binary_digit_string          => '010...',
    #                              binary_digit_string_weighted => '0110...' } }
    # where the weighted string repeats each digit weight times.
    my ( $glob_hash, $cmp_hash ) = @_;
    my $result;

    # *** IMPORTANT ***
    # Being a nested for, keys %{$glob_hash} would not need sorting,
    # BUT we sort so every individual follows the same (serialized) order.
    my @global_keys = sort keys %{$glob_hash};

    # IDs of each individual
    for my $individual_id ( keys %{$cmp_hash} ) {    # no need to sort

        # One-hot encoding = representing categorical data as numerical
        my $plain    = '';
        my $weighted = '';
        for my $gkey (@global_keys) {
            my $digit = exists $cmp_hash->{$individual_id}{$gkey} ? '1' : '0';
            $plain    .= $digit;
            $weighted .= $digit x $glob_hash->{$gkey};    # repeat weight times
        }
        $result->{$individual_id}{binary_digit_string}          = $plain;
        $result->{$individual_id}{binary_digit_string_weighted} = $weighted;
    }
    return $result;
}
740
|
|
|
|
|
|
|
|
741
|
|
|
|
|
|
|
sub parse_hpo_json {

    # Parses a decoded <hp.json> structure (Human Phenotype Ontology in
    # obographs JSON). The ontology is a directed acyclic graph:
    #   - graphs[0]{nodes}: one object per HPO term
    #       id   - term identifier (e.g. "HP:0000118" as a purl URI)
    #       lbl  - term label (e.g. "Phenotypic abnormality")
    #       meta - definition, synonyms, etc.
    #       type - usually "CLASS"
    #   - graphs[0]{edges}: one object per relationship
    #       sub  - subject (child) term id
    #       obj  - object (parent) term id
    #       pred - predicate, typically "is_a"
    #   - meta: ontology-wide metadata (version, description, ...)
    #
    # Returns a two-element list: (\%nodes, \%edges) where
    #   %nodes maps node id => node object
    #   %edges maps child id => arrayref of parent ids
    my $data = shift;

    my $graph = $data->{graphs}->[0];

    # Index the nodes by their id for O(1) lookup
    my %nodes = map { $_->{id} => $_ } @{ $graph->{nodes} };

    # Accumulate each child's parents
    my %edges;
    for my $edge ( @{ $graph->{edges} } ) {
        push @{ $edges{ $edge->{sub} } }, $edge->{obj};
    }
    return \%nodes, \%edges;
}
771
|
|
|
|
|
|
|
1; |