| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package Pheno::Ranker::Compare; |
|
2
|
|
|
|
|
|
|
|
|
3
|
5
|
|
|
5
|
|
32
|
use strict; |
|
|
5
|
|
|
|
|
9
|
|
|
|
5
|
|
|
|
|
198
|
|
|
4
|
5
|
|
|
5
|
|
40
|
use warnings; |
|
|
5
|
|
|
|
|
9
|
|
|
|
5
|
|
|
|
|
350
|
|
|
5
|
5
|
|
|
5
|
|
33
|
use autodie; |
|
|
5
|
|
|
|
|
10
|
|
|
|
5
|
|
|
|
|
45
|
|
|
6
|
5
|
|
|
5
|
|
29438
|
use feature qw(say); |
|
|
5
|
|
|
|
|
11
|
|
|
|
5
|
|
|
|
|
823
|
|
|
7
|
5
|
|
|
5
|
|
41
|
use List::Util qw(any shuffle first); |
|
|
5
|
|
|
|
|
9
|
|
|
|
5
|
|
|
|
|
578
|
|
|
8
|
5
|
|
|
5
|
|
35
|
use Data::Dumper; |
|
|
5
|
|
|
|
|
7
|
|
|
|
5
|
|
|
|
|
373
|
|
|
9
|
5
|
|
|
5
|
|
3952
|
use Sort::Naturally qw(nsort); |
|
|
5
|
|
|
|
|
27209
|
|
|
|
5
|
|
|
|
|
484
|
|
|
10
|
5
|
|
|
5
|
|
2744
|
use MIME::Base64; |
|
|
5
|
|
|
|
|
4363
|
|
|
|
5
|
|
|
|
|
422
|
|
|
11
|
5
|
|
|
5
|
|
3251
|
use Compress::Zlib qw(compress); |
|
|
5
|
|
|
|
|
110954
|
|
|
|
5
|
|
|
|
|
677
|
|
|
12
|
5
|
|
|
5
|
|
3311
|
use Hash::Fold fold => { array_delimiter => ':' }; |
|
|
5
|
|
|
|
|
210452
|
|
|
|
5
|
|
|
|
|
45
|
|
|
13
|
5
|
|
|
5
|
|
5172
|
use Pheno::Ranker::Metrics; |
|
|
5
|
|
|
|
|
63
|
|
|
|
5
|
|
|
|
|
1181
|
|
|
14
|
|
|
|
|
|
|
|
|
15
|
5
|
|
|
5
|
|
61
|
use Exporter 'import'; |
|
|
5
|
|
|
|
|
106
|
|
|
|
5
|
|
|
|
|
505
|
|
|
16
|
|
|
|
|
|
|
our @EXPORT = |
|
17
|
|
|
|
|
|
|
qw(check_format cohort_comparison compare_and_rank create_glob_and_ref_hashes randomize_variables remap_hash create_binary_digit_string parse_hpo_json); |
|
18
|
|
|
|
|
|
|
|
|
19
|
5
|
|
|
5
|
|
78
|
use constant DEVEL_MODE => 0; |
|
|
5
|
|
|
|
|
12
|
|
|
|
5
|
|
|
|
|
43097
|
|
|
20
|
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
our %nomenclature = (); |
|
22
|
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
sub check_format { |
|
24
|
13
|
|
|
13
|
0
|
29
|
my $data = shift; |
|
25
|
13
|
100
|
|
|
|
98
|
return exists $data->[0]{subject} ? 'PXF' : 'BFF'; |
|
26
|
|
|
|
|
|
|
} |
|
27
|
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
sub cohort_comparison { |
|
29
|
14
|
|
|
14
|
0
|
41
|
my ( $ref_binary_hash, $self ) = @_; |
|
30
|
14
|
|
|
|
|
56
|
my $out_file = $self->{out_file}; |
|
31
|
14
|
|
|
|
|
45
|
my $similarity_metric = $self->{similarity_metric_cohort}; |
|
32
|
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
# Define limit #items for switching to whole matrix calculation |
|
34
|
14
|
|
|
|
|
34
|
my $max_items = $self->{max_matrix_records_in_ram}; |
|
35
|
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
# Inform about the start of the comparison process |
|
37
|
|
|
|
|
|
|
say "Performing COHORT comparison" |
|
38
|
14
|
50
|
33
|
|
|
109
|
if ( $self->{debug} || $self->{verbose} ); |
|
39
|
|
|
|
|
|
|
|
|
40
|
|
|
|
|
|
|
# Define the subroutine to be used |
|
41
|
14
|
|
|
|
|
91
|
my %similarity_function = ( |
|
42
|
|
|
|
|
|
|
'hamming' => \&hd_fast, |
|
43
|
|
|
|
|
|
|
'jaccard' => \&jaccard_similarity_formatted |
|
44
|
|
|
|
|
|
|
); |
|
45
|
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
# Define values for diagonal elements depending on metric |
|
47
|
14
|
|
|
|
|
50
|
my %similarity_diagonal = ( |
|
48
|
|
|
|
|
|
|
'hamming' => 0, |
|
49
|
|
|
|
|
|
|
'jaccard' => 1 |
|
50
|
|
|
|
|
|
|
); |
|
51
|
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
# Use previous hashes to define stuff |
|
53
|
14
|
|
|
|
|
27
|
my $metric = $similarity_function{$similarity_metric}; |
|
54
|
14
|
|
|
|
|
29
|
my $similarity_diagonal = $similarity_diagonal{$similarity_metric}; |
|
55
|
|
|
|
|
|
|
|
|
56
|
|
|
|
|
|
|
# Sorting keys of the hash |
|
57
|
14
|
|
|
|
|
23
|
my @sorted_keys_ref_binary_hash = nsort( keys %{$ref_binary_hash} ); |
|
|
14
|
|
|
|
|
800
|
|
|
58
|
14
|
|
|
|
|
64722
|
my $num_items = scalar @sorted_keys_ref_binary_hash; |
|
59
|
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
# Define $switch for going from RAM to all calculations |
|
61
|
14
|
50
|
|
|
|
67
|
my $switch = $num_items > $max_items ? 1 : 0; |
|
62
|
|
|
|
|
|
|
|
|
63
|
|
|
|
|
|
|
say "RAM efficient mode is: " |
|
64
|
|
|
|
|
|
|
. ( $switch ? "on" : "off" ) |
|
65
|
|
|
|
|
|
|
. " (max_matrix_records_in_ram: $max_items)" |
|
66
|
14
|
0
|
33
|
|
|
117
|
if ( $self->{debug} || $self->{verbose} ); |
|
|
|
50
|
|
|
|
|
|
|
67
|
|
|
|
|
|
|
|
|
68
|
|
|
|
|
|
|
# Opening file for output |
|
69
|
14
|
|
|
|
|
86
|
open( my $fh, '>:encoding(UTF-8)', $out_file ); |
|
70
|
14
|
|
|
|
|
19037
|
say $fh "\t", join "\t", @sorted_keys_ref_binary_hash; |
|
71
|
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
# Initialize matrix for storing similarity |
|
73
|
14
|
|
|
|
|
33
|
my @matrix; |
|
74
|
|
|
|
|
|
|
|
|
75
|
|
|
|
|
|
|
# Iterate over items (I elements) |
|
76
|
14
|
|
|
|
|
65
|
for my $i ( 0 .. $#sorted_keys_ref_binary_hash ) { |
|
77
|
|
|
|
|
|
|
say "Calculating <" |
|
78
|
|
|
|
|
|
|
. $sorted_keys_ref_binary_hash[$i] |
|
79
|
|
|
|
|
|
|
. "> against the cohort..." |
|
80
|
420
|
50
|
|
|
|
1168
|
if $self->{verbose}; |
|
81
|
|
|
|
|
|
|
my $str1 = $ref_binary_hash->{ $sorted_keys_ref_binary_hash[$i] } |
|
82
|
420
|
|
|
|
|
887
|
{binary_digit_string_weighted}; |
|
83
|
|
|
|
|
|
|
|
|
84
|
|
|
|
|
|
|
# Print first column (w/o \t) |
|
85
|
420
|
|
|
|
|
922
|
print $fh $sorted_keys_ref_binary_hash[$i]; |
|
86
|
|
|
|
|
|
|
|
|
87
|
|
|
|
|
|
|
# Iterate for pairwise comparisons (J elements) |
|
88
|
420
|
|
|
|
|
1076
|
for my $j ( 0 .. $#sorted_keys_ref_binary_hash ) { |
|
89
|
|
|
|
|
|
|
my $str2 = $ref_binary_hash->{ $sorted_keys_ref_binary_hash[$j] } |
|
90
|
15028
|
|
|
|
|
29755
|
{binary_digit_string_weighted}; |
|
91
|
15028
|
|
|
|
|
30776
|
my $similarity; |
|
92
|
|
|
|
|
|
|
|
|
93
|
15028
|
50
|
|
|
|
19610
|
if ($switch) { |
|
94
|
|
|
|
|
|
|
|
|
95
|
|
|
|
|
|
|
# For large datasets compute upper and lower triange |
|
96
|
|
|
|
|
|
|
my $str2 = |
|
97
|
|
|
|
|
|
|
$ref_binary_hash->{ $sorted_keys_ref_binary_hash[$j] } |
|
98
|
0
|
|
|
|
|
0
|
{binary_digit_string_weighted}; |
|
99
|
0
|
0
|
|
|
|
0
|
$similarity = |
|
100
|
|
|
|
|
|
|
$i == $j ? $similarity_diagonal : $metric->( $str1, $str2 ); |
|
101
|
|
|
|
|
|
|
} |
|
102
|
|
|
|
|
|
|
else { |
|
103
|
15028
|
100
|
|
|
|
24111
|
if ( $i == $j ) { |
|
|
|
100
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
|
|
105
|
|
|
|
|
|
|
# Similarity for diagonal elements |
|
106
|
420
|
|
|
|
|
523
|
$similarity = $similarity_diagonal; |
|
107
|
|
|
|
|
|
|
} |
|
108
|
|
|
|
|
|
|
elsif ( $j > $i ) { |
|
109
|
|
|
|
|
|
|
|
|
110
|
|
|
|
|
|
|
# Compute similarity for large cohorts or upper triangle |
|
111
|
7304
|
|
|
|
|
11783
|
$similarity = $metric->( $str1, $str2 ); |
|
112
|
7304
|
|
|
|
|
12883
|
$matrix[$i][$j] = $similarity; |
|
113
|
|
|
|
|
|
|
} |
|
114
|
|
|
|
|
|
|
else { |
|
115
|
|
|
|
|
|
|
# Use precomputed similarity from lower triangle |
|
116
|
7304
|
|
|
|
|
10027
|
$similarity = $matrix[$j][$i]; |
|
117
|
|
|
|
|
|
|
} |
|
118
|
|
|
|
|
|
|
} |
|
119
|
|
|
|
|
|
|
|
|
120
|
|
|
|
|
|
|
# Print a tab before each similarity |
|
121
|
15028
|
|
|
|
|
31766
|
print $fh "\t", $similarity; |
|
122
|
|
|
|
|
|
|
} |
|
123
|
|
|
|
|
|
|
|
|
124
|
420
|
|
|
|
|
800
|
print $fh "\n"; |
|
125
|
|
|
|
|
|
|
} |
|
126
|
|
|
|
|
|
|
|
|
127
|
|
|
|
|
|
|
# Close the file handle |
|
128
|
14
|
|
|
|
|
115
|
close $fh; |
|
129
|
|
|
|
|
|
|
|
|
130
|
|
|
|
|
|
|
# Inform about the completion of the matrix computation |
|
131
|
14
|
50
|
33
|
|
|
26047
|
say "Matrix saved to <$out_file>" if ( $self->{debug} || $self->{verbose} ); |
|
132
|
14
|
|
|
|
|
737
|
return 1; |
|
133
|
|
|
|
|
|
|
} |
|
134
|
|
|
|
|
|
|
|
|
135
|
|
|
|
|
|
|
sub compare_and_rank { |
|
136
|
2
|
|
|
2
|
0
|
6
|
my $arg = shift; |
|
137
|
2
|
|
|
|
|
6
|
my $glob_hash = $arg->{glob_hash}; |
|
138
|
2
|
|
|
|
|
7
|
my $ref_hash = $arg->{ref_hash}; |
|
139
|
2
|
|
|
|
|
6
|
my $tar_hash = $arg->{tar_hash}; |
|
140
|
2
|
|
|
|
|
5
|
my $ref_binary_hash = $arg->{ref_binary_hash}; |
|
141
|
2
|
|
|
|
|
7
|
my $tar_binary_hash = $arg->{tar_binary_hash}; |
|
142
|
2
|
|
|
|
|
7
|
my $weight = $arg->{weight}; |
|
143
|
2
|
|
|
|
|
6
|
my $self = $arg->{self}; |
|
144
|
2
|
|
|
|
|
8
|
my $sort_by = $self->{sort_by}; |
|
145
|
2
|
|
|
|
|
7
|
my $align = $self->{align}; |
|
146
|
2
|
|
|
|
|
5
|
my $max_out = $self->{max_out}; |
|
147
|
|
|
|
|
|
|
|
|
148
|
|
|
|
|
|
|
say "Performing COHORT(REF)-PATIENT(TAR) comparison" |
|
149
|
2
|
50
|
33
|
|
|
22
|
if ( $self->{debug} || $self->{verbose} ); |
|
150
|
|
|
|
|
|
|
|
|
151
|
|
|
|
|
|
|
# Hash for compiling metrics |
|
152
|
2
|
|
|
|
|
6
|
my $score; |
|
153
|
|
|
|
|
|
|
|
|
154
|
|
|
|
|
|
|
# Hash for stats |
|
155
|
|
|
|
|
|
|
my $stat; |
|
156
|
|
|
|
|
|
|
|
|
157
|
|
|
|
|
|
|
# Load TAR binary string |
|
158
|
2
|
|
|
|
|
5
|
my ($tar) = keys %{$tar_binary_hash}; |
|
|
2
|
|
|
|
|
7
|
|
|
159
|
|
|
|
|
|
|
my $tar_str_weighted = |
|
160
|
2
|
|
|
|
|
6
|
$tar_binary_hash->{$tar}{binary_digit_string_weighted}; |
|
161
|
|
|
|
|
|
|
|
|
162
|
|
|
|
|
|
|
# Load TAR number of vars |
|
163
|
2
|
|
|
|
|
3
|
my $target_vars = keys %{ $tar_hash->{$tar} }; |
|
|
2
|
|
|
|
|
9
|
|
|
164
|
|
|
|
|
|
|
|
|
165
|
2
|
|
|
|
|
4
|
for my $key ( keys %{$ref_binary_hash} ) { # No need to sort |
|
|
2
|
|
|
|
|
29
|
|
|
166
|
|
|
|
|
|
|
|
|
167
|
|
|
|
|
|
|
# Load REF binary string |
|
168
|
|
|
|
|
|
|
my $ref_str_weighted = |
|
169
|
72
|
|
|
|
|
184
|
$ref_binary_hash->{$key}{binary_digit_string_weighted}; |
|
170
|
72
|
50
|
|
|
|
151
|
say "Comparing --- " if $self->{verbose}; |
|
171
|
|
|
|
|
|
|
say "REF:$ref_str_weighted\nTAR:$tar_str_weighted\n" |
|
172
|
72
|
50
|
33
|
|
|
166
|
if ( defined $self->{debug} && $self->{debug} > 1 ); |
|
173
|
|
|
|
|
|
|
|
|
174
|
|
|
|
|
|
|
# Hamming |
|
175
|
|
|
|
|
|
|
$score->{$key}{hamming} = |
|
176
|
72
|
|
|
|
|
140
|
hd_fast( $ref_str_weighted, $tar_str_weighted ); |
|
177
|
|
|
|
|
|
|
|
|
178
|
|
|
|
|
|
|
# Intersect and Jaccard |
|
179
|
72
|
|
|
|
|
157
|
( $score->{$key}{jaccard}, $score->{$key}{intersect} ) = |
|
180
|
|
|
|
|
|
|
jaccard_similarity( $ref_str_weighted, $tar_str_weighted ); |
|
181
|
|
|
|
|
|
|
|
|
182
|
|
|
|
|
|
|
# Load REF number of vars |
|
183
|
72
|
|
|
|
|
93
|
$score->{$key}{reference_vars} = keys %{ $ref_hash->{$key} }; |
|
|
72
|
|
|
|
|
174
|
|
|
184
|
|
|
|
|
|
|
|
|
185
|
|
|
|
|
|
|
# Add values |
|
186
|
72
|
|
|
|
|
88
|
push @{ $stat->{hamming_data} }, $score->{$key}{hamming}; |
|
|
72
|
|
|
|
|
138
|
|
|
187
|
72
|
|
|
|
|
80
|
push @{ $stat->{jaccard_data} }, $score->{$key}{jaccard}; |
|
|
72
|
|
|
|
|
155
|
|
|
188
|
|
|
|
|
|
|
} |
|
189
|
|
|
|
|
|
|
|
|
190
|
|
|
|
|
|
|
# Stats are only computed once (no overhead) |
|
191
|
2
|
|
|
|
|
19
|
$stat->{hamming_stats} = add_stats( $stat->{hamming_data} ); |
|
192
|
2
|
|
|
|
|
9
|
$stat->{jaccard_stats} = add_stats( $stat->{jaccard_data} ); |
|
193
|
|
|
|
|
|
|
|
|
194
|
|
|
|
|
|
|
# Initialize a few variables |
|
195
|
2
|
|
|
|
|
20
|
my @headers = ( |
|
196
|
|
|
|
|
|
|
'RANK', 'REFERENCE(ID)', |
|
197
|
|
|
|
|
|
|
'TARGET(ID)', 'FORMAT', |
|
198
|
|
|
|
|
|
|
'LENGTH', 'WEIGHTED', |
|
199
|
|
|
|
|
|
|
'HAMMING-DISTANCE', 'DISTANCE-Z-SCORE', |
|
200
|
|
|
|
|
|
|
'DISTANCE-P-VALUE', 'DISTANCE-Z-SCORE(RAND)', |
|
201
|
|
|
|
|
|
|
'JACCARD-INDEX', 'JACCARD-Z-SCORE', |
|
202
|
|
|
|
|
|
|
'JACCARD-P-VALUE', 'REFERENCE-VARS', |
|
203
|
|
|
|
|
|
|
'TARGET-VARS', 'INTERSECT', |
|
204
|
|
|
|
|
|
|
'INTERSECT-RATE(%)', 'COMPLETENESS(%)' |
|
205
|
|
|
|
|
|
|
); |
|
206
|
2
|
|
|
|
|
11
|
my $header = join "\t", @headers; |
|
207
|
2
|
|
|
|
|
5
|
my @results = $header; |
|
208
|
2
|
|
|
|
|
3
|
my %info; |
|
209
|
2
|
|
|
|
|
5
|
my $length_align = length($tar_str_weighted); |
|
210
|
2
|
100
|
|
|
|
7
|
my $weight_bool = $weight ? 'True' : 'False'; |
|
211
|
2
|
|
|
|
|
5
|
my @alignments_ascii; |
|
212
|
|
|
|
|
|
|
my $alignment_str_csv; |
|
213
|
2
|
|
|
|
|
5
|
my @alignments_csv = join ';', |
|
214
|
|
|
|
|
|
|
qw/id ref indicator tar weight hamming-distance json-path label/; |
|
215
|
|
|
|
|
|
|
|
|
216
|
|
|
|
|
|
|
# The dataframe will have two header lines |
|
217
|
|
|
|
|
|
|
# *** IMPORTANT *** |
|
218
|
|
|
|
|
|
|
# nsort does not yield same results as canonical from JSON::XS |
|
219
|
|
|
|
|
|
|
# NB: we're sorting here and in create_binary_digit_string() |
|
220
|
2
|
|
|
|
|
4
|
my @sort_keys_glob_hash = sort keys %{$glob_hash}; |
|
|
2
|
|
|
|
|
215
|
|
|
221
|
|
|
|
|
|
|
|
|
222
|
|
|
|
|
|
|
# Creating @labels array from {id,label}, or guess_label() |
|
223
|
|
|
|
|
|
|
my @labels = |
|
224
|
2
|
100
|
|
|
|
21
|
map { exists $nomenclature{$_} ? $nomenclature{$_} : guess_label($_) } |
|
|
318
|
|
|
|
|
606
|
|
|
225
|
|
|
|
|
|
|
@sort_keys_glob_hash; |
|
226
|
|
|
|
|
|
|
|
|
227
|
|
|
|
|
|
|
# Die if #elements in arrays differ |
|
228
|
2
|
50
|
|
|
|
8
|
die "Mismatch between variables and labels" |
|
229
|
|
|
|
|
|
|
if @sort_keys_glob_hash != @labels; |
|
230
|
|
|
|
|
|
|
|
|
231
|
|
|
|
|
|
|
# Labels |
|
232
|
|
|
|
|
|
|
# NB: there is a comma in 'Serum Glutamic Pyruvic Transaminase, CTCAE' |
|
233
|
2
|
|
|
|
|
49
|
my @dataframe = join ';', 'Id', @labels; |
|
234
|
|
|
|
|
|
|
|
|
235
|
|
|
|
|
|
|
# Variables (json path) |
|
236
|
2
|
|
|
|
|
42
|
push @dataframe, join ';', 'Id', @sort_keys_glob_hash; |
|
237
|
|
|
|
|
|
|
|
|
238
|
|
|
|
|
|
|
# 0/1 |
|
239
|
|
|
|
|
|
|
push @dataframe, join ';', qq/T|$tar/, |
|
240
|
2
|
|
|
|
|
90
|
( split //, $tar_binary_hash->{$tar}{binary_digit_string} ); # Add Target |
|
241
|
|
|
|
|
|
|
|
|
242
|
|
|
|
|
|
|
# Sort %score by value and load results |
|
243
|
2
|
|
|
|
|
22
|
my $count = 1; |
|
244
|
2
|
|
|
|
|
5
|
$max_out++; # to be able to start w/ ONE |
|
245
|
|
|
|
|
|
|
|
|
246
|
|
|
|
|
|
|
# Start loop |
|
247
|
2
|
|
|
|
|
25
|
for my $key ( |
|
248
|
|
|
|
|
|
|
sort { |
|
249
|
|
|
|
|
|
|
$sort_by eq 'jaccard' # |
|
250
|
|
|
|
|
|
|
? $score->{$b}{$sort_by} |
|
251
|
|
|
|
|
|
|
<=> $score->{$a}{$sort_by} # 1 to 0 (similarity) |
|
252
|
|
|
|
|
|
|
: $score->{$a}{$sort_by} |
|
253
|
284
|
50
|
|
|
|
504
|
<=> $score->{$b}{$sort_by} # 0 to N (distance) |
|
254
|
|
|
|
|
|
|
} keys %$score |
|
255
|
|
|
|
|
|
|
) |
|
256
|
|
|
|
|
|
|
{ |
|
257
|
|
|
|
|
|
|
|
|
258
|
72
|
50
|
|
|
|
251
|
say "$count: Creating alignment " if $self->{verbose}; |
|
259
|
|
|
|
|
|
|
|
|
260
|
|
|
|
|
|
|
# Create ASCII alignment |
|
261
|
|
|
|
|
|
|
# NB: We need it here to get $n_00 |
|
262
|
|
|
|
|
|
|
my ( $n_00, $alignment_str_ascii, $alignment_arr_csv ) = |
|
263
|
|
|
|
|
|
|
create_alignment( |
|
264
|
|
|
|
|
|
|
{ |
|
265
|
|
|
|
|
|
|
ref_key => $key, |
|
266
|
|
|
|
|
|
|
ref_str_weighted => |
|
267
|
|
|
|
|
|
|
$ref_binary_hash->{$key}{binary_digit_string_weighted}, |
|
268
|
72
|
|
|
|
|
671
|
tar_str_weighted => $tar_str_weighted, |
|
269
|
|
|
|
|
|
|
glob_hash => $glob_hash, |
|
270
|
|
|
|
|
|
|
sorted_keys_glob_hash => \@sort_keys_glob_hash, |
|
271
|
|
|
|
|
|
|
labels => \@labels |
|
272
|
|
|
|
|
|
|
} |
|
273
|
|
|
|
|
|
|
); |
|
274
|
|
|
|
|
|
|
|
|
275
|
|
|
|
|
|
|
# *** IMPORTANT *** |
|
276
|
|
|
|
|
|
|
# The LENGTH of the alignment is based on the #variables in the REF-COHORT |
|
277
|
|
|
|
|
|
|
# Compute estimated av and dev for binary_string of L = length_align - n_00 |
|
278
|
|
|
|
|
|
|
# Corrected length_align L = length_align - n_00 |
|
279
|
72
|
|
|
|
|
334
|
my $length_align_corrected = $length_align - $n_00; |
|
280
|
|
|
|
|
|
|
|
|
281
|
|
|
|
|
|
|
#$estimated_average, $estimated_std_dev |
|
282
|
72
|
|
|
|
|
405
|
( $stat->{hamming_stats}{mean_rnd}, $stat->{hamming_stats}{sd_rnd} ) = |
|
283
|
|
|
|
|
|
|
estimate_hamming_stats($length_align_corrected); |
|
284
|
|
|
|
|
|
|
|
|
285
|
|
|
|
|
|
|
# Compute a few stats |
|
286
|
|
|
|
|
|
|
my $hamming_z_score = z_score( |
|
287
|
|
|
|
|
|
|
$score->{$key}{hamming}, |
|
288
|
|
|
|
|
|
|
$stat->{hamming_stats}{mean}, |
|
289
|
|
|
|
|
|
|
$stat->{hamming_stats}{sd} |
|
290
|
72
|
|
|
|
|
443
|
); |
|
291
|
|
|
|
|
|
|
my $hamming_z_score_from_random = z_score( |
|
292
|
|
|
|
|
|
|
$score->{$key}{hamming}, |
|
293
|
|
|
|
|
|
|
$stat->{hamming_stats}{mean_rnd}, |
|
294
|
|
|
|
|
|
|
$stat->{hamming_stats}{sd_rnd} |
|
295
|
72
|
|
|
|
|
236
|
); |
|
296
|
|
|
|
|
|
|
|
|
297
|
|
|
|
|
|
|
#my $hamming_p_value = |
|
298
|
|
|
|
|
|
|
# p_value( $score->{$key}{hamming}, $length_align_corrected ); |
|
299
|
72
|
|
|
|
|
235
|
my $hamming_p_value_from_z_score = |
|
300
|
|
|
|
|
|
|
p_value_from_z_score($hamming_z_score); |
|
301
|
|
|
|
|
|
|
my $jaccard_z_score = z_score( |
|
302
|
|
|
|
|
|
|
$score->{$key}{jaccard}, |
|
303
|
|
|
|
|
|
|
$stat->{jaccard_stats}{mean}, |
|
304
|
|
|
|
|
|
|
$stat->{jaccard_stats}{sd} |
|
305
|
72
|
|
|
|
|
263
|
); |
|
306
|
72
|
|
|
|
|
150
|
my $jaccard_p_value_from_z_score = |
|
307
|
|
|
|
|
|
|
p_value_from_z_score( 1 - $jaccard_z_score ); |
|
308
|
|
|
|
|
|
|
|
|
309
|
|
|
|
|
|
|
# Compute Intersect-Rate (I/T) * 100 |
|
310
|
|
|
|
|
|
|
# Completeness (T/R) * 100 |
|
311
|
72
|
|
|
|
|
181
|
my $reference_vars = $score->{$key}{reference_vars}; |
|
312
|
72
|
|
|
|
|
146
|
my $intersect = $score->{$key}{intersect}; |
|
313
|
72
|
50
|
|
|
|
205
|
my $intersect_rate = |
|
314
|
|
|
|
|
|
|
( $target_vars == 0 ) ? 0 : ( $intersect / $target_vars ) * 100; |
|
315
|
72
|
50
|
|
|
|
154
|
my $completeness = |
|
316
|
|
|
|
|
|
|
( $reference_vars == 0 ) ? 0 : ( $intersect / $reference_vars ) * 100; |
|
317
|
|
|
|
|
|
|
|
|
318
|
|
|
|
|
|
|
# Create a hash with formats |
|
319
|
|
|
|
|
|
|
my $format = { |
|
320
|
|
|
|
|
|
|
'RANK' => { value => $count, format => undef }, |
|
321
|
|
|
|
|
|
|
'REFERENCE(ID)' => { value => $key, format => undef }, |
|
322
|
|
|
|
|
|
|
'TARGET(ID)' => { value => $tar, format => undef }, |
|
323
|
|
|
|
|
|
|
'FORMAT' => { value => $self->{format}, format => undef }, |
|
324
|
|
|
|
|
|
|
'WEIGHTED' => { value => $weight_bool, format => undef }, |
|
325
|
|
|
|
|
|
|
'LENGTH' => { value => $length_align_corrected, format => '%6d' }, |
|
326
|
|
|
|
|
|
|
'HAMMING-DISTANCE' => |
|
327
|
|
|
|
|
|
|
{ value => $score->{$key}{hamming}, format => '%4d' }, |
|
328
|
|
|
|
|
|
|
'DISTANCE-Z-SCORE' => |
|
329
|
|
|
|
|
|
|
{ value => $hamming_z_score, format => '%7.3f' }, |
|
330
|
|
|
|
|
|
|
'DISTANCE-P-VALUE' => |
|
331
|
|
|
|
|
|
|
{ value => $hamming_p_value_from_z_score, format => '%12.7f' }, |
|
332
|
|
|
|
|
|
|
'DISTANCE-Z-SCORE(RAND)' => |
|
333
|
|
|
|
|
|
|
{ value => $hamming_z_score_from_random, format => '%8.4f' }, |
|
334
|
|
|
|
|
|
|
'JACCARD-INDEX' => |
|
335
|
72
|
|
|
|
|
2596
|
{ value => $score->{$key}{jaccard}, format => '%7.3f' }, |
|
336
|
|
|
|
|
|
|
'JACCARD-Z-SCORE' => |
|
337
|
|
|
|
|
|
|
{ value => $jaccard_z_score, format => '%7.3f' }, |
|
338
|
|
|
|
|
|
|
'JACCARD-P-VALUE' => |
|
339
|
|
|
|
|
|
|
{ value => $jaccard_p_value_from_z_score, format => '%12.7f' }, |
|
340
|
|
|
|
|
|
|
'REFERENCE-VARS' => { value => $reference_vars, format => '%6d' }, |
|
341
|
|
|
|
|
|
|
'TARGET-VARS' => { value => $target_vars, format => '%6d' }, |
|
342
|
|
|
|
|
|
|
'INTERSECT' => { value => $intersect, format => '%6d' }, |
|
343
|
|
|
|
|
|
|
'INTERSECT-RATE(%)' => |
|
344
|
|
|
|
|
|
|
{ value => $intersect_rate, format => '%8.2f' }, |
|
345
|
|
|
|
|
|
|
'COMPLETENESS(%)' => { value => $completeness, format => '%8.2f' } |
|
346
|
|
|
|
|
|
|
}; |
|
347
|
|
|
|
|
|
|
|
|
348
|
|
|
|
|
|
|
# Serialize results |
|
349
|
|
|
|
|
|
|
my $tmp_str = join "\t", map { |
|
350
|
72
|
|
|
|
|
277
|
defined $format->{$_}{format} |
|
351
|
|
|
|
|
|
|
? sprintf( $format->{$_}{format}, $format->{$_}{value} ) |
|
352
|
|
|
|
|
|
|
: $format->{$_}{value} |
|
353
|
1296
|
100
|
|
|
|
5466
|
} @headers; |
|
354
|
72
|
|
|
|
|
343
|
push @results, $tmp_str; |
|
355
|
|
|
|
|
|
|
|
|
356
|
|
|
|
|
|
|
# To save memory only load if --align |
|
357
|
72
|
50
|
|
|
|
211
|
if ( defined $align ) { |
|
358
|
|
|
|
|
|
|
|
|
359
|
|
|
|
|
|
|
# Add all of the above to @alignments_ascii |
|
360
|
72
|
|
|
|
|
171
|
my $sep = ('-') x 80; |
|
361
|
72
|
|
|
|
|
2334
|
push @alignments_ascii, |
|
362
|
|
|
|
|
|
|
qq/#$header\n$tmp_str\n$sep\n$$alignment_str_ascii/; |
|
363
|
|
|
|
|
|
|
|
|
364
|
|
|
|
|
|
|
# Add all of the above to @alignments_csv |
|
365
|
72
|
|
|
|
|
2718
|
push @alignments_csv, @$alignment_arr_csv; |
|
366
|
|
|
|
|
|
|
|
|
367
|
|
|
|
|
|
|
# Add data to dataframe |
|
368
|
|
|
|
|
|
|
push @dataframe, join ';', qq/R|$key/, |
|
369
|
72
|
|
|
|
|
3880
|
( split //, $ref_binary_hash->{$key}{binary_digit_string} ); |
|
370
|
|
|
|
|
|
|
|
|
371
|
|
|
|
|
|
|
# Add values to info |
|
372
|
|
|
|
|
|
|
$info{$key} = { |
|
373
|
|
|
|
|
|
|
weighted => $weight_bool eq 'True' |
|
374
|
|
|
|
|
|
|
? JSON::XS::true |
|
375
|
|
|
|
|
|
|
: JSON::XS::false, |
|
376
|
|
|
|
|
|
|
reference_id => $key, |
|
377
|
|
|
|
|
|
|
target_id => $tar, |
|
378
|
|
|
|
|
|
|
reference_binary_string => |
|
379
|
|
|
|
|
|
|
$ref_binary_hash->{$key}{binary_digit_string}, |
|
380
|
|
|
|
|
|
|
target_binary_string => |
|
381
|
|
|
|
|
|
|
$tar_binary_hash->{$key}{binary_digit_string}, |
|
382
|
|
|
|
|
|
|
reference_binary_string_weighted => |
|
383
|
|
|
|
|
|
|
$ref_binary_hash->{$key}{binary_digit_string_weighted}, |
|
384
|
|
|
|
|
|
|
target_binary_string_weighted => |
|
385
|
|
|
|
|
|
|
$tar_binary_hash->{$key}{binary_digit_string_weighted}, |
|
386
|
|
|
|
|
|
|
alignment_length => $length_align_corrected, |
|
387
|
|
|
|
|
|
|
hamming_distance => $score->{$key}{hamming}, |
|
388
|
|
|
|
|
|
|
hamming_z_score => $hamming_z_score, |
|
389
|
|
|
|
|
|
|
hamming_p_value => $hamming_p_value_from_z_score, |
|
390
|
|
|
|
|
|
|
jaccard_similarity => $score->{$key}{jaccard}, |
|
391
|
|
|
|
|
|
|
jaccard_z_score => $jaccard_z_score, |
|
392
|
|
|
|
|
|
|
jaccard_p_value => $jaccard_p_value_from_z_score, |
|
393
|
|
|
|
|
|
|
jaccard_distance => 1 - $score->{$key}{jaccard}, |
|
394
|
|
|
|
|
|
|
format => $self->{format}, |
|
395
|
72
|
100
|
|
|
|
1104
|
alignment => $$alignment_str_ascii, |
|
396
|
|
|
|
|
|
|
reference_vars => $reference_vars, |
|
397
|
|
|
|
|
|
|
target_vars => $target_vars, |
|
398
|
|
|
|
|
|
|
intersect => $intersect, |
|
399
|
|
|
|
|
|
|
intersect_rate => $intersect_rate, |
|
400
|
|
|
|
|
|
|
completeness => $completeness |
|
401
|
|
|
|
|
|
|
}; |
|
402
|
|
|
|
|
|
|
|
|
403
|
|
|
|
|
|
|
} |
|
404
|
|
|
|
|
|
|
|
|
405
|
72
|
|
|
|
|
4302
|
$count++; |
|
406
|
72
|
100
|
|
|
|
1960
|
last if $count == $max_out; |
|
407
|
|
|
|
|
|
|
} |
|
408
|
|
|
|
|
|
|
|
|
409
|
2
|
|
|
|
|
210
|
return \@results, \%info, \@alignments_ascii, \@dataframe, \@alignments_csv; |
|
410
|
|
|
|
|
|
|
} |
|
411
|
|
|
|
|
|
|
|
|
412
|
|
|
|
|
|
|
sub create_alignment { |
|
413
|
|
|
|
|
|
|
|
|
414
|
|
|
|
|
|
|
# NB: The alignment will use the weighted string |
|
415
|
72
|
|
|
72
|
0
|
144
|
my $arg = shift; |
|
416
|
72
|
|
|
|
|
143
|
my $ref_key = $arg->{ref_key}; |
|
417
|
72
|
|
|
|
|
117
|
my $binary_string1 = $arg->{ref_str_weighted}; |
|
418
|
72
|
|
|
|
|
150
|
my $binary_string2 = $arg->{tar_str_weighted}; |
|
419
|
72
|
|
|
|
|
110
|
my $sorted_keys_glob_hash = $arg->{sorted_keys_glob_hash}; |
|
420
|
72
|
|
|
|
|
131
|
my $labels = $arg->{labels}; |
|
421
|
72
|
|
|
|
|
94
|
my $glob_hash = $arg->{glob_hash}; |
|
422
|
72
|
|
|
|
|
130
|
my $length1 = length($binary_string1); |
|
423
|
72
|
|
|
|
|
98
|
my $length2 = length($binary_string2); |
|
424
|
|
|
|
|
|
|
|
|
425
|
|
|
|
|
|
|
# Check that l1 = l2 |
|
426
|
72
|
50
|
|
|
|
178
|
die "The binary strings must have the same length" |
|
427
|
|
|
|
|
|
|
if ( $length1 != $length2 ); |
|
428
|
|
|
|
|
|
|
|
|
429
|
|
|
|
|
|
|
# Expand array to have weights as N-elements |
|
430
|
72
|
|
|
|
|
208
|
my $recreated_array = recreate_array( $glob_hash, $sorted_keys_glob_hash ); |
|
431
|
|
|
|
|
|
|
|
|
432
|
|
|
|
|
|
|
# Initialize some variables |
|
433
|
72
|
|
|
|
|
154
|
my $out_ascii = "REF -- TAR\n"; |
|
434
|
72
|
|
|
|
|
90
|
my @out_csv; |
|
435
|
72
|
|
|
|
|
126
|
my $cum_distance = 0; |
|
436
|
72
|
|
|
|
|
95
|
my $n_00 = 0; |
|
437
|
|
|
|
|
|
|
|
|
438
|
|
|
|
|
|
|
# For loop with 2 variables |
|
439
|
|
|
|
|
|
|
# i index for weighted |
|
440
|
|
|
|
|
|
|
# j the number of variables |
|
441
|
72
|
|
|
|
|
135
|
my ( $i, $j ) = ( 0, 0 ); |
|
442
|
72
|
|
|
|
|
146
|
for ( $i = 0 ; $i < $length1 ; $i++, $j++ ) { |
|
443
|
|
|
|
|
|
|
|
|
444
|
|
|
|
|
|
|
# Load key and value |
|
445
|
11448
|
|
|
|
|
17793
|
my $key = $recreated_array->[$i]; |
|
446
|
11448
|
|
|
|
|
19839
|
my $val = sprintf( "%3d", $glob_hash->{$key} ); |
|
447
|
|
|
|
|
|
|
|
|
448
|
|
|
|
|
|
|
# We have to keep track with $j |
|
449
|
11448
|
|
|
|
|
14232
|
my $sorted_key = $sorted_keys_glob_hash->[$j]; |
|
450
|
11448
|
|
|
|
|
16106
|
my $label = $labels->[$j]; |
|
451
|
|
|
|
|
|
|
|
|
452
|
|
|
|
|
|
|
# Load chars |
|
453
|
11448
|
|
|
|
|
15037
|
my $char1 = substr( $binary_string1, $i, 1 ); |
|
454
|
11448
|
|
|
|
|
13522
|
my $char2 = substr( $binary_string2, $i, 1 ); |
|
455
|
11448
|
100
|
100
|
|
|
27128
|
$n_00++ if ( $char1 == 0 && $char2 == 0 ); |
|
456
|
|
|
|
|
|
|
|
|
457
|
|
|
|
|
|
|
# Correct $i index by adding weights |
|
458
|
11448
|
|
|
|
|
15382
|
$i = $i + $glob_hash->{$key} - 1; |
|
459
|
|
|
|
|
|
|
|
|
460
|
|
|
|
|
|
|
# Load metrics |
|
461
|
11448
|
100
|
|
|
|
19047
|
$cum_distance += $glob_hash->{$key} if $char1 ne $char2; |
|
462
|
11448
|
|
|
|
|
16922
|
my $cum_distance_pretty = sprintf( "%3d", $cum_distance ); |
|
463
|
11448
|
100
|
|
|
|
16965
|
my $distance = $char1 eq $char2 ? 0 : $glob_hash->{$key}; |
|
464
|
11448
|
|
|
|
|
15535
|
my $distance_pretty = sprintf( "%3d", $distance ); |
|
465
|
|
|
|
|
|
|
|
|
466
|
|
|
|
|
|
|
# w = weight, d = distance, cd = cumul distance |
|
467
|
11448
|
|
|
|
|
27245
|
my %format = ( |
|
468
|
|
|
|
|
|
|
'11' => '-----', |
|
469
|
|
|
|
|
|
|
'10' => 'xxx--', |
|
470
|
|
|
|
|
|
|
'01' => '--xxx', |
|
471
|
|
|
|
|
|
|
'00' => ' ' |
|
472
|
|
|
|
|
|
|
); |
|
473
|
11448
|
|
|
|
|
22462
|
$out_ascii .= |
|
474
|
|
|
|
|
|
|
qq/$char1 $format{ $char1 . $char2 } $char2 | (w:$val|d:$distance_pretty|cd:$cum_distance_pretty|) $sorted_key ($label)\n/; |
|
475
|
11448
|
|
|
|
|
50917
|
push @out_csv, |
|
476
|
|
|
|
|
|
|
qq/$ref_key;$char1;$format{ $char1 . $char2 };$char2;$glob_hash->{$key};$distance;$sorted_key;$label/; |
|
477
|
|
|
|
|
|
|
|
|
478
|
|
|
|
|
|
|
#REF(107:week_0_arm_1);indicator;TAR(125:week_0_arm_1);weight;hamming-distance;json-path |
|
479
|
|
|
|
|
|
|
#0;;0;1;0;diseases.ageOfOnset.ageRange.end.iso8601duration.P16Y |
|
480
|
|
|
|
|
|
|
#0;;0;1;0;diseases.ageOfOnset.ageRange.end.iso8601duration.P24Y |
|
481
|
|
|
|
|
|
|
#1;-----;1;1;0;diseases.ageOfOnset.ageRange.end.iso8601duration.P39Y |
|
482
|
|
|
|
|
|
|
} |
|
483
|
72
|
|
|
|
|
1368
|
return $n_00, \$out_ascii, \@out_csv; |
|
484
|
|
|
|
|
|
|
} |
|
485
|
|
|
|
|
|
|
|
|
486
|
|
|
|
|
|
|
sub recreate_array { |
|
487
|
72
|
|
|
72
|
0
|
150
|
my ( $glob_hash, $sorted_keys_glob_hash ) = @_; |
|
488
|
72
|
|
|
|
|
119
|
my @recreated_array; |
|
489
|
72
|
|
|
|
|
227
|
foreach my $key (@$sorted_keys_glob_hash) { |
|
490
|
11448
|
|
|
|
|
20155
|
for ( my $i = 0 ; $i < $glob_hash->{$key} ; $i++ ) { |
|
491
|
11736
|
|
|
|
|
24269
|
push @recreated_array, $key; |
|
492
|
|
|
|
|
|
|
} |
|
493
|
|
|
|
|
|
|
} |
|
494
|
72
|
|
|
|
|
253
|
return \@recreated_array; |
|
495
|
|
|
|
|
|
|
} |
|
496
|
|
|
|
|
|
|
|
|
497
|
|
|
|
|
|
|
sub create_glob_and_ref_hashes { |
|
498
|
15
|
|
|
15
|
0
|
39
|
my ( $array, $weight, $self ) = @_; |
|
499
|
15
|
|
|
|
|
34
|
my $primary_key = $self->{primary_key}; |
|
500
|
15
|
|
|
|
|
25
|
my $glob_hash = {}; |
|
501
|
15
|
|
|
|
|
34
|
my $ref_hash_flattened; |
|
502
|
|
|
|
|
|
|
|
|
503
|
15
|
|
|
|
|
24
|
my $count = 1; |
|
504
|
15
|
|
|
|
|
31
|
for my $element ( @{$array} ) { |
|
|
15
|
|
|
|
|
33
|
|
|
505
|
|
|
|
|
|
|
|
|
506
|
|
|
|
|
|
|
# For consistency, we obtain the primary_key for both BFF/PXF |
|
507
|
|
|
|
|
|
|
# from $_->{id} (not from subject.id) |
|
508
|
456
|
50
|
|
|
|
1906
|
my $id = $element->{$primary_key} |
|
509
|
|
|
|
|
|
|
or die |
|
510
|
|
|
|
|
|
|
"Sorry but the JSON document [$count] does not have the primary_key <$primary_key> defined\n"; |
|
511
|
|
|
|
|
|
|
|
|
512
|
|
|
|
|
|
|
# Remapping hash |
|
513
|
456
|
50
|
|
|
|
1561
|
say "Flattening and remapping ..." if $self->{verbose}; |
|
514
|
|
|
|
|
|
|
|
|
515
|
456
|
|
|
|
|
2630
|
my $ref_hash = remap_hash( |
|
516
|
|
|
|
|
|
|
{ |
|
517
|
|
|
|
|
|
|
hash => $element, |
|
518
|
|
|
|
|
|
|
weight => $weight, |
|
519
|
|
|
|
|
|
|
self => $self |
|
520
|
|
|
|
|
|
|
} |
|
521
|
|
|
|
|
|
|
); |
|
522
|
|
|
|
|
|
|
|
|
523
|
|
|
|
|
|
|
# *** IMPORTANT *** |
|
524
|
|
|
|
|
|
|
# We eliminate keys with weight = 0 if defined $weight; |
|
525
|
456
|
100
|
|
|
|
2897
|
prune_keys_with_weight_zero($ref_hash) if defined $weight; |
|
526
|
|
|
|
|
|
|
|
|
527
|
|
|
|
|
|
|
# Load big hash ref_hash_flattened |
|
528
|
456
|
|
|
|
|
1935
|
$ref_hash_flattened->{$id} = $ref_hash; |
|
529
|
|
|
|
|
|
|
|
|
530
|
|
|
|
|
|
|
# The idea is to create a $glob_hash with unique key-values |
|
531
|
|
|
|
|
|
|
# Duplicates will be automatically merged |
|
532
|
456
|
|
|
|
|
32139
|
$glob_hash = { %$glob_hash, %$ref_hash }; |
|
533
|
|
|
|
|
|
|
|
|
534
|
|
|
|
|
|
|
# To keep track of array element indexes |
|
535
|
456
|
|
|
|
|
4341
|
$count++; |
|
536
|
|
|
|
|
|
|
} |
|
537
|
15
|
|
|
|
|
112
|
return ( $glob_hash, $ref_hash_flattened ); |
|
538
|
|
|
|
|
|
|
} |
|
539
|
|
|
|
|
|
|
|
|
540
|
|
|
|
|
|
|
sub randomize_variables { |
|
541
|
0
|
|
|
0
|
0
|
0
|
my ( $glob_hash, $self ) = @_; |
|
542
|
0
|
|
|
|
|
0
|
my $max = $self->{max_number_vars}; |
|
543
|
0
|
|
|
|
|
0
|
my $seed = $self->{seed}; |
|
544
|
|
|
|
|
|
|
|
|
545
|
|
|
|
|
|
|
# set random seed |
|
546
|
0
|
|
|
|
|
0
|
srand($seed); |
|
547
|
|
|
|
|
|
|
|
|
548
|
|
|
|
|
|
|
# Randomize |
|
549
|
|
|
|
|
|
|
# NB:keys have to be sorted for srand to work!!!! |
|
550
|
|
|
|
|
|
|
# perl -MList::Util=shuffle -E 'srand 123; say shuffle 1 .. 5' |
|
551
|
0
|
|
|
|
|
0
|
my @items = ( shuffle sort keys %$glob_hash )[ 0 .. $max - 1 ]; |
|
552
|
|
|
|
|
|
|
|
|
553
|
|
|
|
|
|
|
# Create a new hash with a hash slice |
|
554
|
0
|
|
|
|
|
0
|
my %new_glob_hash; |
|
555
|
0
|
|
|
|
|
0
|
@new_glob_hash{@items} = @{$glob_hash}{@items}; |
|
|
0
|
|
|
|
|
0
|
|
|
556
|
|
|
|
|
|
|
|
|
557
|
|
|
|
|
|
|
# return reference |
|
558
|
0
|
|
|
|
|
0
|
return \%new_glob_hash; |
|
559
|
|
|
|
|
|
|
} |
|
560
|
|
|
|
|
|
|
|
|
561
|
|
|
|
|
|
|
sub prune_excluded_included { |
|
562
|
458
|
|
|
458
|
0
|
969
|
my ( $hash, $self ) = @_; |
|
563
|
458
|
|
|
|
|
618
|
my @included = @{ $self->{include_terms} }; |
|
|
458
|
|
|
|
|
1323
|
|
|
564
|
458
|
|
|
|
|
688
|
my @excluded = @{ $self->{exclude_terms} }; |
|
|
458
|
|
|
|
|
1134
|
|
|
565
|
|
|
|
|
|
|
|
|
566
|
|
|
|
|
|
|
# Die if we have both options at the same time |
|
567
|
458
|
50
|
66
|
|
|
1346
|
die "Sorry, <--include> and <--exclude> are mutually exclusive\n" |
|
568
|
|
|
|
|
|
|
if ( @included && @excluded ); |
|
569
|
|
|
|
|
|
|
|
|
570
|
|
|
|
|
|
|
# *** IMPORTANT *** |
|
571
|
|
|
|
|
|
|
# Original $hash is modified |
|
572
|
|
|
|
|
|
|
|
|
573
|
|
|
|
|
|
|
# INCLUDED |
|
574
|
458
|
100
|
|
|
|
1194
|
if (@included) { |
|
575
|
25
|
|
|
|
|
81
|
for my $key ( keys %$hash ) { |
|
576
|
125
|
100
|
|
225
|
|
451
|
delete $hash->{$key} unless any { $_ eq $key } @included; |
|
|
225
|
|
|
|
|
586
|
|
|
577
|
|
|
|
|
|
|
} |
|
578
|
|
|
|
|
|
|
} |
|
579
|
|
|
|
|
|
|
|
|
580
|
|
|
|
|
|
|
# EXCLUDED |
|
581
|
458
|
50
|
|
|
|
888
|
if (@excluded) { |
|
582
|
0
|
|
|
|
|
0
|
for my $key (@excluded) { |
|
583
|
0
|
0
|
|
|
|
0
|
delete $hash->{$key} if exists $hash->{$key}; |
|
584
|
|
|
|
|
|
|
} |
|
585
|
|
|
|
|
|
|
} |
|
586
|
|
|
|
|
|
|
|
|
587
|
|
|
|
|
|
|
# We will do nothing if @included = @excluded = [] (DEFAULT) |
|
588
|
458
|
|
|
|
|
906
|
return 1; |
|
589
|
|
|
|
|
|
|
} |
|
590
|
|
|
|
|
|
|
|
|
591
|
|
|
|
|
|
|
sub set_excluded_phenotypicFeatures { |
|
592
|
458
|
|
|
458
|
0
|
950
|
my ( $hash, $switch, $format ) = @_; |
|
593
|
|
|
|
|
|
|
|
|
594
|
|
|
|
|
|
|
# Ensure phenotypicFeatures exist before processing |
|
595
|
458
|
100
|
|
|
|
1199
|
return 1 unless exists $hash->{phenotypicFeatures}; |
|
596
|
|
|
|
|
|
|
|
|
597
|
320
|
|
|
|
|
666
|
foreach my $feature ( @{ $hash->{phenotypicFeatures} } ) { |
|
|
320
|
|
|
|
|
916
|
|
|
598
|
|
|
|
|
|
|
|
|
599
|
|
|
|
|
|
|
# Skip if 'excluded' is not set or false |
|
600
|
787
|
100
|
|
|
|
3358
|
next unless $feature->{excluded}; |
|
601
|
|
|
|
|
|
|
|
|
602
|
|
|
|
|
|
|
# NB: remaining phenotypicFeatures:1.excluded |
|
603
|
|
|
|
|
|
|
# will be discarded by $exclude_variables_regex_qr later |
|
604
|
449
|
100
|
|
|
|
3355
|
if ($switch) { |
|
605
|
|
|
|
|
|
|
|
|
606
|
|
|
|
|
|
|
# Determine the correct ID field based on the format |
|
607
|
2
|
50
|
|
|
|
8
|
my $id_field = $format eq 'BFF' ? 'featureType' : 'type'; |
|
608
|
|
|
|
|
|
|
|
|
609
|
|
|
|
|
|
|
# Append '_excluded' to the appropriate ID |
|
610
|
2
|
|
|
|
|
9
|
$feature->{$id_field}{id} .= '_excluded'; |
|
611
|
|
|
|
|
|
|
} |
|
612
|
|
|
|
|
|
|
else { |
|
613
|
|
|
|
|
|
|
# Remove the feature by setting it to undef |
|
614
|
447
|
|
|
|
|
1428
|
$feature = undef; |
|
615
|
|
|
|
|
|
|
|
|
616
|
|
|
|
|
|
|
# Due to properties being set to undef, it's possible for the coverage file to |
|
617
|
|
|
|
|
|
|
# report phenotypicFeatures as 100% by all "excluded" = true |
|
618
|
|
|
|
|
|
|
} |
|
619
|
|
|
|
|
|
|
} |
|
620
|
|
|
|
|
|
|
|
|
621
|
320
|
|
|
|
|
618
|
return 1; |
|
622
|
|
|
|
|
|
|
} |
|
623
|
|
|
|
|
|
|
|
|
624
|
|
|
|
|
|
|
sub remap_hash { |
|
625
|
458
|
|
|
458
|
0
|
884
|
my $arg = shift; |
|
626
|
458
|
|
|
|
|
1048
|
my $hash = $arg->{hash}; |
|
627
|
458
|
|
|
|
|
785
|
my $weight = $arg->{weight}; |
|
628
|
458
|
|
|
|
|
785
|
my $self = $arg->{self}; # $self from $arg |
|
629
|
458
|
|
|
|
|
843
|
my $nodes = $self->{nodes}; |
|
630
|
458
|
|
|
|
|
767
|
my $edges = $self->{edges}; |
|
631
|
458
|
|
|
|
|
776
|
my $format = $self->{format}; |
|
632
|
458
|
|
|
|
|
1050
|
my $switch = $self->{retain_excluded_phenotypicFeatures}; |
|
633
|
458
|
|
|
|
|
624
|
my %out_hash; |
|
634
|
|
|
|
|
|
|
|
|
635
|
|
|
|
|
|
|
# Do some pruning excluded / included |
|
636
|
458
|
|
|
|
|
1803
|
prune_excluded_included( $hash, $self ); |
|
637
|
|
|
|
|
|
|
|
|
638
|
|
|
|
|
|
|
# *** IMPORTANT *** |
|
639
|
|
|
|
|
|
|
# The user may include a term that: |
|
640
|
|
|
|
|
|
|
# 1 - may not exist in any individual |
|
641
|
|
|
|
|
|
|
# 2 - does not exist in some individuals |
|
642
|
|
|
|
|
|
|
# If the term does not exist in a individual |
|
643
|
|
|
|
|
|
|
# - a) -include-terms contains ANOTHER TERM THAT EXISTS |
|
644
|
|
|
|
|
|
|
# %$hash will contain keys => OK |
|
645
|
|
|
|
|
|
|
# - b) -include-terms only includes the term/terms not present |
|
646
|
|
|
|
|
|
|
# %$hash = 0 , then we return {}, to avoid trouble w/ Fold.pm |
|
647
|
|
|
|
|
|
|
#print Dumper $hash; |
|
648
|
458
|
50
|
|
|
|
1091
|
return {} unless %$hash; |
|
649
|
|
|
|
|
|
|
|
|
650
|
|
|
|
|
|
|
# A bit more pruning plus folding |
|
651
|
|
|
|
|
|
|
# NB: Hash::Fold keeps white spaces on keys |
|
652
|
|
|
|
|
|
|
# |
|
653
|
|
|
|
|
|
|
# Options for 1D-array folding: |
|
654
|
|
|
|
|
|
|
# A) Array to Hash then Fold |
|
655
|
|
|
|
|
|
|
# B) Fold then Regex <=== CHOSEN |
|
656
|
|
|
|
|
|
|
# - Avoids the need for deep cloning |
|
657
|
|
|
|
|
|
|
# - Works across any JSON data structure (without specific key requirements) |
|
658
|
|
|
|
|
|
|
# - BUT profiling shows it's ~5-10% slower than 'Array to Hash then Fold' |
|
659
|
|
|
|
|
|
|
# - Does not accommodate specific remappings like 'interpretations.diagnosis.genomicInterpretations' |
|
660
|
458
|
|
|
|
|
1130
|
set_excluded_phenotypicFeatures( $hash, $switch, $format ); |
|
661
|
458
|
|
|
|
|
2242
|
$hash = fold($hash); |
|
662
|
|
|
|
|
|
|
|
|
663
|
|
|
|
|
|
|
# Load the hash that points to the hierarchy for ontology-term-id |
|
664
|
|
|
|
|
|
|
# *** IMPORTANT *** |
|
665
|
|
|
|
|
|
|
# - phenotypicFeatures.featureType.id => BFF |
|
666
|
|
|
|
|
|
|
# - phenotypicFeatures.type.id => PXF |
|
667
|
458
|
|
|
|
|
3746953
|
my $id_correspondence = $self->{id_correspondence}; |
|
668
|
|
|
|
|
|
|
|
|
669
|
|
|
|
|
|
|
# Load values for the for loop |
|
670
|
458
|
|
|
|
|
1263
|
my $exclude_variables_regex_qr = $self->{exclude_variables_regex_qr}; |
|
671
|
458
|
|
|
|
|
2264
|
my $misc_regex_qr = |
|
672
|
|
|
|
|
|
|
qr/1900-01-01|NA0000|NCIT:C126101|P999Y|P9999Y|phenopacket_id/; |
|
673
|
|
|
|
|
|
|
|
|
674
|
|
|
|
|
|
|
# Pre-compile a list of fixed scalar values to exclude into a hash for quick lookup |
|
675
|
|
|
|
|
|
|
my %exclude_values = |
|
676
|
458
|
|
|
|
|
1201
|
map { $_ => 1 } |
|
|
2290
|
|
|
|
|
5644
|
|
|
677
|
|
|
|
|
|
|
( 'NA', 'NaN', 'Fake', 'None:No matching concept', 'Not Available' ); |
|
678
|
|
|
|
|
|
|
|
|
679
|
|
|
|
|
|
|
# Now we proceed for each key |
|
680
|
458
|
|
|
|
|
1105
|
for my $key ( keys %{$hash} ) { |
|
|
458
|
|
|
|
|
16580
|
|
|
681
|
|
|
|
|
|
|
|
|
682
|
|
|
|
|
|
|
# To see which ones were discarded |
|
683
|
|
|
|
|
|
|
#say $key if !defined $hash->{$key}; |
|
684
|
|
|
|
|
|
|
|
|
685
|
|
|
|
|
|
|
# Discard undefined |
|
686
|
87191
|
100
|
|
|
|
160227
|
next unless defined $hash->{$key}; |
|
687
|
|
|
|
|
|
|
|
|
688
|
|
|
|
|
|
|
# Discarding lines with 'low quality' keys (Time of regex profiled with :NYTProf: ms time) |
|
689
|
|
|
|
|
|
|
# Some can be "rescued" by adding the ontology as ($1) |
|
690
|
|
|
|
|
|
|
# NB1: We discard _labels too!! |
|
691
|
|
|
|
|
|
|
# NB2: info|metaData are always discarded |
|
692
|
|
|
|
|
|
|
next |
|
693
|
84392
|
100
|
100
|
|
|
480318
|
if ( defined $exclude_variables_regex_qr |
|
694
|
|
|
|
|
|
|
&& $key =~ $exclude_variables_regex_qr ); |
|
695
|
|
|
|
|
|
|
|
|
696
|
|
|
|
|
|
|
# The user can turn on age related values |
|
697
|
|
|
|
|
|
|
next |
|
698
|
|
|
|
|
|
|
if ( ( $format eq 'PXF' || $format eq 'BFF' ) |
|
699
|
|
|
|
|
|
|
&& $key =~ m/\.age(?!nt)|onset/i |
|
700
|
33663
|
100
|
100
|
|
|
301746
|
&& !$self->{age} ); # $self->{age} [0|1] |
|
|
|
|
100
|
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
701
|
|
|
|
|
|
|
|
|
702
|
|
|
|
|
|
|
# Load values |
|
703
|
32154
|
|
|
|
|
49961
|
my $val = $hash->{$key}; |
|
704
|
|
|
|
|
|
|
|
|
705
|
|
|
|
|
|
|
# Discarding lines with unsupported val (Time profiled with :NYTProf: ms time) |
|
706
|
|
|
|
|
|
|
next |
|
707
|
|
|
|
|
|
|
if ( |
|
708
|
|
|
|
|
|
|
( ref($val) eq 'HASH' |
|
709
|
1
|
|
|
|
|
5
|
&& !keys %{$val} ) # Discard {} (e.g.,subject.vitalStatus: {}) |
|
710
|
600
|
|
|
|
|
2194
|
|| ( ref($val) eq 'ARRAY' && !@{$val} ) # Discard [] |
|
711
|
32154
|
100
|
66
|
|
|
260457
|
|| exists $exclude_values{$val} |
|
|
|
|
66
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
712
|
|
|
|
|
|
|
|| $val =~ $misc_regex_qr |
|
713
|
|
|
|
|
|
|
); |
|
714
|
|
|
|
|
|
|
|
|
715
|
|
|
|
|
|
|
# Add IDs to key |
|
716
|
23564
|
|
|
|
|
44956
|
my $id_key = add_id2key( $key, $hash, $self ); |
|
717
|
|
|
|
|
|
|
|
|
718
|
|
|
|
|
|
|
# Finally add value to id_key |
|
719
|
23564
|
|
|
|
|
38790
|
my $tmp_key_at_variable_level = $id_key . '.' . $val; |
|
720
|
|
|
|
|
|
|
|
|
721
|
|
|
|
|
|
|
# Add HPO ascendants |
|
722
|
23564
|
50
|
33
|
|
|
44162
|
if ( defined $edges && $val =~ /^HP:/ ) { |
|
723
|
0
|
|
|
|
|
0
|
my $ascendants = |
|
724
|
|
|
|
|
|
|
add_hpo_ascendants( $tmp_key_at_variable_level, $nodes, $edges ); |
|
725
|
0
|
|
|
|
|
0
|
$out_hash{$_} = 1 for @$ascendants; # weight 1 for now |
|
726
|
|
|
|
|
|
|
} |
|
727
|
|
|
|
|
|
|
|
|
728
|
|
|
|
|
|
|
################## |
|
729
|
|
|
|
|
|
|
# Assign weights # |
|
730
|
|
|
|
|
|
|
################## |
|
731
|
|
|
|
|
|
|
|
|
732
|
|
|
|
|
|
|
# NB: mrueda (04-12-23) - it's ok if $weight == undef => NO AUTOVIVIFICATION! |
|
733
|
|
|
|
|
|
|
# NB: We don't warn if user selection does not exist, just assign 1 |
|
734
|
|
|
|
|
|
|
|
|
735
|
23564
|
|
|
|
|
28324
|
my $tmp_key_at_term_level = $tmp_key_at_variable_level; |
|
736
|
|
|
|
|
|
|
|
|
737
|
|
|
|
|
|
|
# If variable has . then capture $1 |
|
738
|
23564
|
50
|
|
|
|
47624
|
if ( $tmp_key_at_term_level =~ m/\./ ) { |
|
739
|
|
|
|
|
|
|
|
|
740
|
|
|
|
|
|
|
# NB: For long str regex is faster than (split /\./, $foo)[0] |
|
741
|
23564
|
|
|
|
|
48510
|
$tmp_key_at_term_level =~ m/^(\w+)\./; |
|
742
|
23564
|
|
|
|
|
37334
|
$tmp_key_at_term_level = $1; |
|
743
|
|
|
|
|
|
|
} |
|
744
|
|
|
|
|
|
|
|
|
745
|
23564
|
100
|
|
|
|
34960
|
if ( defined $weight ) { |
|
746
|
|
|
|
|
|
|
|
|
747
|
|
|
|
|
|
|
# *** IMPORTANT *** |
|
748
|
|
|
|
|
|
|
# ORDER MATTERS !!!! |
|
749
|
|
|
|
|
|
|
# We allow for assigning weights by TERM (e.g., 1D) |
|
750
|
|
|
|
|
|
|
# but VARIABLE level takes precedence to TERM |
|
751
|
|
|
|
|
|
|
|
|
752
|
|
|
|
|
|
|
$out_hash{$tmp_key_at_variable_level} = |
|
753
|
|
|
|
|
|
|
|
|
754
|
|
|
|
|
|
|
# VARIABLE LEVEL |
|
755
|
|
|
|
|
|
|
# NB: exists stringifies the weights |
|
756
|
|
|
|
|
|
|
exists $weight->{$tmp_key_at_variable_level} |
|
757
|
|
|
|
|
|
|
? $weight->{$tmp_key_at_variable_level} + 0 # coercing to number |
|
758
|
|
|
|
|
|
|
|
|
759
|
|
|
|
|
|
|
# TERM LEVEL |
|
760
|
|
|
|
|
|
|
: exists $weight->{$tmp_key_at_term_level} |
|
761
|
5292
|
50
|
|
|
|
20546
|
? $weight->{$tmp_key_at_term_level} + 0 # coercing to number |
|
|
|
100
|
|
|
|
|
|
|
762
|
|
|
|
|
|
|
|
|
763
|
|
|
|
|
|
|
# NO WEIGHT |
|
764
|
|
|
|
|
|
|
: 1; |
|
765
|
|
|
|
|
|
|
|
|
766
|
|
|
|
|
|
|
} |
|
767
|
|
|
|
|
|
|
else { |
|
768
|
|
|
|
|
|
|
|
|
769
|
|
|
|
|
|
|
# Assign a weight of 1 if no users weights |
|
770
|
18272
|
|
|
|
|
42492
|
$out_hash{$tmp_key_at_variable_level} = 1; |
|
771
|
|
|
|
|
|
|
|
|
772
|
|
|
|
|
|
|
} |
|
773
|
|
|
|
|
|
|
|
|
774
|
|
|
|
|
|
|
############## |
|
775
|
|
|
|
|
|
|
# label Hash # |
|
776
|
|
|
|
|
|
|
############## |
|
777
|
|
|
|
|
|
|
|
|
778
|
|
|
|
|
|
|
# Finally we load the Nomenclature hash |
|
779
|
23564
|
|
|
|
|
32364
|
my $label = $key; |
|
780
|
23564
|
|
|
|
|
62246
|
$label =~ s/id/label/; |
|
781
|
|
|
|
|
|
|
$nomenclature{$tmp_key_at_variable_level} = $hash->{$label} |
|
782
|
23564
|
100
|
|
|
|
82654
|
if defined $hash->{$label}; |
|
783
|
|
|
|
|
|
|
} |
|
784
|
|
|
|
|
|
|
|
|
785
|
|
|
|
|
|
|
# *** IMPORTANT *** |
|
786
|
|
|
|
|
|
|
# We have to return an object {} when undef |
|
787
|
458
|
|
50
|
|
|
32853
|
return \%out_hash // {}; |
|
788
|
|
|
|
|
|
|
} |
|
789
|
|
|
|
|
|
|
|
|
790
|
|
|
|
|
|
|
sub add_hpo_ascendants { |
|
791
|
0
|
|
|
0
|
0
|
0
|
my ( $key, $nodes, $edges ) = @_; |
|
792
|
|
|
|
|
|
|
|
|
793
|
|
|
|
|
|
|
# First we obtain the ontology (0000539) from HP:0000539 |
|
794
|
0
|
|
|
|
|
0
|
$key =~ m/HP:(\w+)$/; |
|
795
|
0
|
|
|
|
|
0
|
my $ontology = $1; |
|
796
|
|
|
|
|
|
|
|
|
797
|
|
|
|
|
|
|
# We'll use it to build a string equivalent to a key from $edges |
|
798
|
0
|
|
|
|
|
0
|
my $hpo_url = 'http://purl.obolibrary.org/obo/HP_'; |
|
799
|
0
|
|
|
|
|
0
|
my $hpo_key = $hpo_url . $ontology; |
|
800
|
|
|
|
|
|
|
|
|
801
|
|
|
|
|
|
|
# We will include all ascendants in an array |
|
802
|
0
|
|
|
|
|
0
|
my @ascendants; |
|
803
|
0
|
|
|
|
|
0
|
for my $parent_id ( @{ $edges->{$hpo_key} } ) { |
|
|
0
|
|
|
|
|
0
|
|
|
804
|
|
|
|
|
|
|
|
|
805
|
|
|
|
|
|
|
# We have to create a copy to not modify the original $parent_id |
|
806
|
|
|
|
|
|
|
# as it can appear in multiple individuals |
|
807
|
0
|
|
|
|
|
0
|
my $copy_parent_id = $parent_id; |
|
808
|
0
|
|
|
|
|
0
|
$copy_parent_id =~ m/\/(\w+)$/; |
|
809
|
0
|
|
|
|
|
0
|
$copy_parent_id = $1; |
|
810
|
0
|
|
|
|
|
0
|
$copy_parent_id =~ tr/_/:/; |
|
811
|
|
|
|
|
|
|
|
|
812
|
|
|
|
|
|
|
# *** IMPORTANT *** |
|
813
|
|
|
|
|
|
|
# We cannot add any label to the ascendants, otherwise they will |
|
814
|
|
|
|
|
|
|
# not be matched by an indv down the tree |
|
815
|
|
|
|
|
|
|
# Myopia |
|
816
|
|
|
|
|
|
|
# Mild Myopia |
|
817
|
|
|
|
|
|
|
# We want that 'Mild Myopia' matches 'Myopia', thus we can not add a label from 'Mild Myopia' |
|
818
|
|
|
|
|
|
|
# Use the labels only for debug |
|
819
|
0
|
|
|
|
|
0
|
my $asc_key = DEVEL_MODE ? $key . '.HPO_asc_DEBUG_ONLY' : $key; |
|
820
|
0
|
|
|
|
|
0
|
$asc_key =~ s/HP:$ontology/$copy_parent_id/g; |
|
821
|
0
|
|
|
|
|
0
|
push @ascendants, $asc_key; |
|
822
|
|
|
|
|
|
|
|
|
823
|
|
|
|
|
|
|
# We finally add the label to %nomenclature |
|
824
|
0
|
|
|
|
|
0
|
my $hpo_asc_str = $hpo_url |
|
825
|
|
|
|
|
|
|
. $copy_parent_id; # 'http://purl.obolibrary.org/obo/HP_HP:0000539 |
|
826
|
0
|
|
|
|
|
0
|
$hpo_asc_str =~ s/HP://; # 0000539 |
|
827
|
0
|
|
|
|
|
0
|
$nomenclature{$asc_key} = $nodes->{$hpo_asc_str}{lbl}; |
|
828
|
|
|
|
|
|
|
} |
|
829
|
0
|
|
|
|
|
0
|
return \@ascendants; |
|
830
|
|
|
|
|
|
|
} |
|
831
|
|
|
|
|
|
|
|
|
832
|
|
|
|
|
|
|
sub add_id2key { |
|
833
|
23564
|
|
|
23564
|
0
|
39470
|
my ( $key, $hash, $self ) = @_; |
|
834
|
23564
|
|
|
|
|
46568
|
my $id_correspondence = $self->{id_correspondence}{ $self->{format} }; |
|
835
|
23564
|
|
|
|
|
30256
|
my $array_regex_qr = $self->{array_regex_qr}; |
|
836
|
23564
|
|
|
|
|
31696
|
my $array_terms_regex_qr = $self->{array_terms_regex_qr}; |
|
837
|
|
|
|
|
|
|
|
|
838
|
|
|
|
|
|
|
############# |
|
839
|
|
|
|
|
|
|
# OBJECTIVE # |
|
840
|
|
|
|
|
|
|
############# |
|
841
|
|
|
|
|
|
|
|
|
842
|
|
|
|
|
|
|
# This subroutine is important as it replaces the index (numeric) for a given |
|
843
|
|
|
|
|
|
|
# array element by a selected ontology. It's done for all subkeys on that element |
|
844
|
|
|
|
|
|
|
|
|
845
|
|
|
|
|
|
|
#"interventionsOrProcedures" : [ |
|
846
|
|
|
|
|
|
|
# { |
|
847
|
|
|
|
|
|
|
# "bodySite" : { |
|
848
|
|
|
|
|
|
|
# "id" : "NCIT:C12736", |
|
849
|
|
|
|
|
|
|
# "label" : "intestine" |
|
850
|
|
|
|
|
|
|
# }, |
|
851
|
|
|
|
|
|
|
# "procedureCode" : { |
|
852
|
|
|
|
|
|
|
# "id" : "NCIT:C157823", |
|
853
|
|
|
|
|
|
|
# "label" : "Colon Resection" |
|
854
|
|
|
|
|
|
|
# } |
|
855
|
|
|
|
|
|
|
# }, |
|
856
|
|
|
|
|
|
|
# { |
|
857
|
|
|
|
|
|
|
# "bodySite" : { |
|
858
|
|
|
|
|
|
|
# "id" : "NCIT:C12736", |
|
859
|
|
|
|
|
|
|
# "label" : "intestine" |
|
860
|
|
|
|
|
|
|
# }, |
|
861
|
|
|
|
|
|
|
# "procedureCode" : { |
|
862
|
|
|
|
|
|
|
# "id" : "NCIT:C86074", |
|
863
|
|
|
|
|
|
|
# "label" : "Hemicolectomy" |
|
864
|
|
|
|
|
|
|
# } |
|
865
|
|
|
|
|
|
|
# }, |
|
866
|
|
|
|
|
|
|
#] |
|
867
|
|
|
|
|
|
|
# |
|
868
|
|
|
|
|
|
|
# Will become: |
|
869
|
|
|
|
|
|
|
# |
|
870
|
|
|
|
|
|
|
#"interventionsOrProcedures.NCIT:C157823.bodySite.id.NCIT:C12736" : 1, |
|
871
|
|
|
|
|
|
|
#"interventionsOrProcedures.NCIT:C157823.procedureCode.id.NCIT:C157823" : 1, |
|
872
|
|
|
|
|
|
|
#"interventionsOrProcedures.NCIT:C86074.bodySite.id.NCIT:C12736" : 1, |
|
873
|
|
|
|
|
|
|
#"interventionsOrProcedures.NCIT:C86074.procedureCode.id.NCIT:C86074" : 1, |
|
874
|
|
|
|
|
|
|
# |
|
875
|
|
|
|
|
|
|
# To make the replacement we use $id_correspondence, then we perform a regex |
|
876
|
|
|
|
|
|
|
# to fetch the key parts |
|
877
|
|
|
|
|
|
|
|
|
878
|
|
|
|
|
|
|
# Only proceed if $key is one of the array_terms |
|
879
|
23564
|
100
|
|
|
|
97223
|
if ( $key =~ $array_terms_regex_qr ) { |
|
880
|
|
|
|
|
|
|
|
|
881
|
|
|
|
|
|
|
# Now we use $array_regex_qr to capture $1, $2 and $3 for BFF/PXF |
|
882
|
|
|
|
|
|
|
# NB: For others (e.g., MXF) we will have only $1 and $2 |
|
883
|
22358
|
|
|
|
|
82526
|
$key =~ $array_regex_qr; |
|
884
|
|
|
|
|
|
|
|
|
885
|
|
|
|
|
|
|
#say "$array_regex_qr -- [$key] <$1> <$2> <$3>"; # $3 can be undefined |
|
886
|
|
|
|
|
|
|
|
|
887
|
22358
|
|
|
|
|
28541
|
my ( $tmp_key, $val ); |
|
888
|
|
|
|
|
|
|
|
|
889
|
|
|
|
|
|
|
# Normal behaviour for BFF/PXF |
|
890
|
22358
|
100
|
|
|
|
44769
|
if ( defined $3 ) { |
|
891
|
|
|
|
|
|
|
|
|
892
|
|
|
|
|
|
|
# If id_correspondence is an array (e.g., medicalActions) we have to grep the right match |
|
893
|
22228
|
|
|
|
|
24731
|
my $correspondence; |
|
894
|
22228
|
50
|
|
|
|
57396
|
if ( ref $id_correspondence->{$1} eq ref [] ) { |
|
895
|
|
|
|
|
|
|
|
|
896
|
|
|
|
|
|
|
# $1 $2 $3 |
|
897
|
|
|
|
|
|
|
# <0> |
|
898
|
0
|
|
|
|
|
0
|
my $subkey = ( split /\./, $3 )[0]; # treatment |
|
899
|
|
|
|
|
|
|
$correspondence = |
|
900
|
0
|
|
|
0
|
|
0
|
first { $_ =~ m/^$subkey/ } |
|
901
|
0
|
|
|
|
|
0
|
@{ $id_correspondence->{$1} }; # treatment.agent.id |
|
|
0
|
|
|
|
|
0
|
|
|
902
|
|
|
|
|
|
|
} |
|
903
|
|
|
|
|
|
|
else { |
|
904
|
22228
|
|
|
|
|
34022
|
$correspondence = $id_correspondence->{$1}; |
|
905
|
|
|
|
|
|
|
} |
|
906
|
|
|
|
|
|
|
|
|
907
|
|
|
|
|
|
|
# Now that we know which is the term we use to find key-val in $hash |
|
908
|
22228
|
|
|
|
|
54160
|
$tmp_key = |
|
909
|
|
|
|
|
|
|
$1 . ':' |
|
910
|
|
|
|
|
|
|
. $2 . '.' |
|
911
|
|
|
|
|
|
|
. $correspondence; # medicalActions.0.treatment.agent.id |
|
912
|
22228
|
|
|
|
|
37742
|
$val = $hash->{$tmp_key}; # DrugCentral:257 |
|
913
|
22228
|
|
|
|
|
53565
|
$key = join '.', $1, $val, $3 |
|
914
|
|
|
|
|
|
|
; # medicalActions.DrugCentral:257.treatment.routeOfAdministration.id |
|
915
|
|
|
|
|
|
|
} |
|
916
|
|
|
|
|
|
|
|
|
917
|
|
|
|
|
|
|
# MXF or similar (...we haven't encountered other regex yet) |
|
918
|
|
|
|
|
|
|
else { |
|
919
|
|
|
|
|
|
|
|
|
920
|
130
|
|
|
|
|
265
|
$tmp_key = $1 . ':' . $2; |
|
921
|
130
|
|
|
|
|
139
|
$val = $hash->{$tmp_key}; |
|
922
|
130
|
|
|
|
|
184
|
$key = $1; |
|
923
|
|
|
|
|
|
|
} |
|
924
|
|
|
|
|
|
|
} |
|
925
|
|
|
|
|
|
|
|
|
926
|
|
|
|
|
|
|
# $key = 'Bar:1' means that we have array but the user either: |
|
927
|
|
|
|
|
|
|
# a) Made a mistake in the config |
|
928
|
|
|
|
|
|
|
# b) Is not using the right config file |
|
929
|
|
|
|
|
|
|
else { |
|
930
|
1206
|
50
|
|
|
|
2947
|
die |
|
931
|
|
|
|
|
|
|
"<$1> contains array elements but is not defined as an array in <$self->{config_file}>. Please check your syntax and configuration file.\n" |
|
932
|
|
|
|
|
|
|
if $key =~ m/^(\w+):/; |
|
933
|
|
|
|
|
|
|
} |
|
934
|
|
|
|
|
|
|
|
|
935
|
23564
|
|
|
|
|
44595
|
return $key; |
|
936
|
|
|
|
|
|
|
} |
|
937
|
|
|
|
|
|
|
|
|
938
|
|
|
|
|
|
|
sub create_binary_digit_string { |
|
939
|
17
|
|
|
17
|
0
|
53
|
my ( $export, $weight, $glob_hash, $cmp_hash ) = @_; |
|
940
|
17
|
|
|
|
|
40
|
my %out_hash; |
|
941
|
|
|
|
|
|
|
|
|
942
|
|
|
|
|
|
|
# *** IMPORTANT *** |
|
943
|
|
|
|
|
|
|
# Being a nested for, keys %{$glob_hash} does not need sorting |
|
944
|
|
|
|
|
|
|
# BUT, we sort to follow the same order as serialized (sorted) |
|
945
|
17
|
|
|
|
|
42
|
my @sorted_keys_glob_hash = sort keys %{$glob_hash}; |
|
|
17
|
|
|
|
|
1314
|
|
|
946
|
|
|
|
|
|
|
|
|
947
|
|
|
|
|
|
|
# IDs of each indidividual |
|
948
|
17
|
|
|
|
|
137
|
for my $individual_id ( keys %{$cmp_hash} ) { # no need to sort |
|
|
17
|
|
|
|
|
141
|
|
|
949
|
|
|
|
|
|
|
|
|
950
|
|
|
|
|
|
|
# One-hot encoding = Representing categorical data as numerical |
|
951
|
458
|
|
|
|
|
966
|
my ( $binary_str, $binary_str_weighted ) = ('') x 2; |
|
952
|
|
|
|
|
|
|
|
|
953
|
458
|
|
|
|
|
716
|
for my $key (@sorted_keys_glob_hash) { |
|
954
|
62270
|
|
|
|
|
78325
|
my $has_value = exists $cmp_hash->{$individual_id}{$key}; |
|
955
|
62270
|
100
|
|
|
|
80041
|
$binary_str .= $has_value ? '1' : '0'; |
|
956
|
62270
|
100
|
|
|
|
92453
|
if ( defined $weight ) { |
|
957
|
|
|
|
|
|
|
$binary_str_weighted .= |
|
958
|
|
|
|
|
|
|
$has_value |
|
959
|
|
|
|
|
|
|
? ( '1' x $glob_hash->{$key} ) |
|
960
|
13396
|
100
|
|
|
|
26856
|
: ( '0' x $glob_hash->{$key} ); |
|
961
|
|
|
|
|
|
|
} |
|
962
|
|
|
|
|
|
|
} |
|
963
|
|
|
|
|
|
|
|
|
964
|
|
|
|
|
|
|
# If weight is not defined, simply assign the unweighted string once. |
|
965
|
458
|
100
|
|
|
|
1025
|
$binary_str_weighted = $binary_str unless defined $weight; |
|
966
|
|
|
|
|
|
|
|
|
967
|
458
|
|
|
|
|
1669
|
$out_hash{$individual_id}{binary_digit_string} = $binary_str; |
|
968
|
|
|
|
|
|
|
$out_hash{$individual_id}{binary_digit_string_weighted} = |
|
969
|
458
|
|
|
|
|
809
|
$binary_str_weighted; |
|
970
|
|
|
|
|
|
|
|
|
971
|
458
|
50
|
|
|
|
922
|
if ( defined $export ) { |
|
972
|
|
|
|
|
|
|
|
|
973
|
|
|
|
|
|
|
# Convert string => raw bytes > zlib-compres => Base64 |
|
974
|
|
|
|
|
|
|
$out_hash{$individual_id}{zlib_base64_binary_digit_string} = |
|
975
|
0
|
|
|
|
|
0
|
binary_to_base64($binary_str); |
|
976
|
|
|
|
|
|
|
|
|
977
|
|
|
|
|
|
|
$out_hash{$individual_id}{zlib_base64_binary_digit_string_weighted} |
|
978
|
|
|
|
|
|
|
= defined $weight |
|
979
|
|
|
|
|
|
|
? binary_to_base64($binary_str_weighted) |
|
980
|
0
|
0
|
|
|
|
0
|
: $out_hash{$individual_id}{zlib_base64_binary_digit_string}; |
|
981
|
|
|
|
|
|
|
} |
|
982
|
|
|
|
|
|
|
|
|
983
|
|
|
|
|
|
|
} |
|
984
|
17
|
|
|
|
|
338
|
return \%out_hash; |
|
985
|
|
|
|
|
|
|
} |
|
986
|
|
|
|
|
|
|
|
|
987
|
|
|
|
|
|
|
sub parse_hpo_json { |
|
988
|
0
|
|
|
0
|
0
|
0
|
my $data = shift; |
|
989
|
|
|
|
|
|
|
|
|
990
|
|
|
|
|
|
|
# The file is a structured representation of the Human Phenotype Ontology (HPO) in JSON format. |
|
991
|
|
|
|
|
|
|
# The HPO is structured into a directed acyclic graph (DAG) |
|
992
|
|
|
|
|
|
|
# Here's a brief overview of the structure of the hpo.json file: |
|
993
|
|
|
|
|
|
|
# - graphs: This key contains an array of ontology graphs. In the case of HPO, there is only one graph. The graph has two main keys: |
|
994
|
|
|
|
|
|
|
# - nodes: An array of objects, each representing an HPO term. Each term object has the following keys: |
|
995
|
|
|
|
|
|
|
# - id: The identifier of the term (e.g., "HP:0000118"). |
|
996
|
|
|
|
|
|
|
# - lbl: The label (name) of the term (e.g., "Phenotypic abnormality"). |
|
997
|
|
|
|
|
|
|
# - meta: Metadata associated with the term, including definition, synonyms, and other information. |
|
998
|
|
|
|
|
|
|
# - type: The type of the term, usually "CLASS". |
|
999
|
|
|
|
|
|
|
# - edges: An array of objects, each representing a relationship between two HPO terms. Each edge object has the following keys: |
|
1000
|
|
|
|
|
|
|
# - sub: The subject (child) term ID (e.g., "HP:0000924"). |
|
1001
|
|
|
|
|
|
|
# - obj: The object (parent) term ID (e.g., "HP:0000118"). |
|
1002
|
|
|
|
|
|
|
# - pred: The predicate that describes the relationship between the subject and object terms, typically "is_a" in HPO. |
|
1003
|
|
|
|
|
|
|
# - meta: This key contains metadata about the HPO ontology as a whole, such as version information, description, and other details. |
|
1004
|
|
|
|
|
|
|
|
|
1005
|
0
|
|
|
|
|
0
|
my $graph = $data->{graphs}->[0]; |
|
1006
|
0
|
|
|
|
|
0
|
my %nodes = map { $_->{id} => $_ } @{ $graph->{nodes} }; |
|
|
0
|
|
|
|
|
0
|
|
|
|
0
|
|
|
|
|
0
|
|
|
1007
|
0
|
|
|
|
|
0
|
my %edges = (); |
|
1008
|
|
|
|
|
|
|
|
|
1009
|
0
|
|
|
|
|
0
|
for my $edge ( @{ $graph->{edges} } ) { |
|
|
0
|
|
|
|
|
0
|
|
|
1010
|
0
|
|
|
|
|
0
|
my $child_id = $edge->{sub}; |
|
1011
|
0
|
|
|
|
|
0
|
my $parent_id = $edge->{obj}; |
|
1012
|
0
|
|
|
|
|
0
|
push @{ $edges{$child_id} }, $parent_id; |
|
|
0
|
|
|
|
|
0
|
|
|
1013
|
|
|
|
|
|
|
} |
|
1014
|
0
|
|
|
|
|
0
|
return \%nodes, \%edges; |
|
1015
|
|
|
|
|
|
|
} |
|
1016
|
|
|
|
|
|
|
|
|
1017
|
|
|
|
|
|
|
sub prune_keys_with_weight_zero { |
|
1018
|
97
|
|
|
97
|
0
|
190
|
my $hash_ref = shift; |
|
1019
|
|
|
|
|
|
|
|
|
1020
|
|
|
|
|
|
|
# Iterate over the keys of the hash |
|
1021
|
97
|
|
|
|
|
193
|
foreach my $key ( keys %{$hash_ref} ) { |
|
|
97
|
|
|
|
|
1102
|
|
|
1022
|
|
|
|
|
|
|
|
|
1023
|
|
|
|
|
|
|
# Delete the key if its value is 0 |
|
1024
|
5077
|
100
|
|
|
|
8834
|
delete $hash_ref->{$key} if $hash_ref->{$key} == 0; |
|
1025
|
|
|
|
|
|
|
} |
|
1026
|
|
|
|
|
|
|
} |
|
1027
|
|
|
|
|
|
|
|
|
1028
|
|
|
|
|
|
|
sub guess_label { |
|
1029
|
72
|
|
|
72
|
0
|
95
|
my $input_string = shift; |
|
1030
|
|
|
|
|
|
|
|
|
1031
|
72
|
50
|
|
|
|
175
|
if ( |
|
1032
|
|
|
|
|
|
|
$input_string =~ /\. # Match a literal dot |
|
1033
|
|
|
|
|
|
|
([^\.]+) # Match and capture everything except a dot |
|
1034
|
|
|
|
|
|
|
$ # Anchor to the end of the string |
|
1035
|
|
|
|
|
|
|
/x |
|
1036
|
|
|
|
|
|
|
) |
|
1037
|
|
|
|
|
|
|
{ |
|
1038
|
72
|
|
|
|
|
147
|
return $1; |
|
1039
|
|
|
|
|
|
|
} |
|
1040
|
|
|
|
|
|
|
|
|
1041
|
|
|
|
|
|
|
# If no dot is found, return the original string |
|
1042
|
0
|
|
|
|
|
|
return $input_string; |
|
1043
|
|
|
|
|
|
|
} |
|
1044
|
|
|
|
|
|
|
|
|
1045
|
|
|
|
|
|
|
sub binary_to_base64 { |
|
1046
|
0
|
|
|
0
|
0
|
|
my $binary_string = shift; |
|
1047
|
|
|
|
|
|
|
|
|
1048
|
|
|
|
|
|
|
# Convert binary string (e.g. "0010...") to raw bytes |
|
1049
|
0
|
|
|
|
|
|
my $raw_data = pack( "B*", $binary_string ); |
|
1050
|
|
|
|
|
|
|
|
|
1051
|
|
|
|
|
|
|
# Compress the raw data (note: compressing very short data may not save space) |
|
1052
|
0
|
|
|
|
|
|
my $compressed = compress($raw_data); |
|
1053
|
|
|
|
|
|
|
|
|
1054
|
|
|
|
|
|
|
# Base64 encode the compressed data, without any newline breaks |
|
1055
|
0
|
|
|
|
|
|
return encode_base64( $compressed, "" ); |
|
1056
|
|
|
|
|
|
|
} |
|
1057
|
|
|
|
|
|
|
|
|
1058
|
|
|
|
|
|
|
sub _base64_to_binary { |
|
1059
|
0
|
|
|
0
|
|
|
my ( $b64_string, $original_length ) = @_; |
|
1060
|
|
|
|
|
|
|
|
|
1061
|
|
|
|
|
|
|
# Decode the Base64 encoded compressed data |
|
1062
|
0
|
|
|
|
|
|
my $compressed_data = decode_base64($b64_string); |
|
1063
|
|
|
|
|
|
|
|
|
1064
|
|
|
|
|
|
|
# Decompress the data back to raw bytes |
|
1065
|
0
|
0
|
|
|
|
|
my $raw_data = uncompress($compressed_data) |
|
1066
|
|
|
|
|
|
|
or die "Decompression failed: $Compress::Zlib::gzerrno\n"; |
|
1067
|
|
|
|
|
|
|
|
|
1068
|
|
|
|
|
|
|
# Convert the raw bytes back into a binary string (sequence of 0s and 1s) |
|
1069
|
0
|
|
|
|
|
|
my $binary_string = unpack( "B*", $raw_data ); |
|
1070
|
|
|
|
|
|
|
|
|
1071
|
|
|
|
|
|
|
# Trim the binary string to the original length to remove any padded bits |
|
1072
|
0
|
|
|
|
|
|
return substr( $binary_string, 0, $original_length ); |
|
1073
|
|
|
|
|
|
|
} |
|
1074
|
|
|
|
|
|
|
|
|
1075
|
|
|
|
|
|
|
1; |