line
stmt
bran
cond
sub
pod
time
code
1
#!/usr/bin/env perl
2
#
3
# A script that performs semantic similarity in PXF|BFF data structures
4
#
5
# Last Modified: Feb/01/2025
6
#
7
# $VERSION taken from Pheno::Ranker
8
#
9
# Copyright (C) 2023-2025 Manuel Rueda - CNAG (manuel.rueda@cnag.eu)
10
#
11
# License: Artistic License 2.0
12
#
13
# If this program helps you in your research, please cite.
14
15
package main;
16
17
1
1
5616
use strict;
1
1
1
42
18
1
1
11
use warnings;
1
1
1
51
19
1
1
585
use autodie;
1
20787
1
4
20
1
1
8186
use feature qw(say);
1
5
1
202
21
1
1
863
use Getopt::Long qw(:config no_ignore_case);
1
17819
1
7
22
1
1
862
use Pod::Usage;
1
84143
1
147
23
1
1
781
use Data::Dumper;
1
9529
1
86
24
1
1
704
use Sys::Hostname;
1
1513
1
66
25
1
1
603
use POSIX qw(strftime);
1
6783
1
7
26
1
1
2789
use Term::ANSIColor qw(:constants);
1
11964
1
1457
27
1
1
672
use File::ShareDir::ProjectDistDir qw(dist_dir);
1
53476
1
9
28
1
1
1174
use FindBin qw($Bin);
1
1550
1
164
29
1
1
669
use lib "$Bin/../lib";
1
874
1
10
30
1
1
836
use Pheno::Ranker qw($VERSION write_json);
1
7
1
2111
31
32
# Defining a few variables
33
1
310523
# Default file names / basenames for the different outputs
my ( $out_file_cohort, $out_file_patient )     = ( 'matrix.txt', 'rank.txt' );
my ( $out_file_graph,  $out_file_graph_stats ) = ( 'graph.json', 'graph_stats.txt' );
my ( $export_basename, $align_basename )       = ( 'export',     'alignment' );
my $log_file = 'pheno-ranker-log.json';

# Initial values for the negatable flags (--color, --age) and for the
# 'cli' entry that is handed over to Pheno::Ranker
my ( $color, $age, $cli ) = ( 1, 0, 1 );
43
44
# Reading arguments
45
# Parse the command line. Option variables are declared in place; $age and
# $color are pre-initialised so that their negatable forms (--no-age,
# --no-color) start from a default value.
#
# Options declared with ':s' take an optional value: when the option is given
# without one the variable is set to '' (defined), which is how the rest of
# the script tells "given without a value" from "not given".
#
# Any parsing error prints the usage and exits with status 2.
GetOptions(
    'reference|r=s{1,}'          => \my @reference_files,            # array
    'target|t=s'                 => \my $target_file,                # string
    'weights|w=s'                => \my $weights_file,               # string
    'append-prefixes=s{1,}'      => \my @append_prefixes,            # array
    'out-file|o=s'               => \my $out_file_arg,               # string
    'max-out:i'                  => \my $max_out,                    # integer
    'max-number-vars:i'          => \my $max_number_vars,            # integer
    'include-hpo-ascendants'     => \my $include_hpo_ascendants,     # flag
    'export|e:s'                 => \my $export,                     # opt-string (defined)
    'align|a:s'                  => \my $align,                      # opt-string (defined)
    'cytoscape-json:s'           => \my $cytoscape_json,             # opt-string (defined)
    'graph-stats:s'              => \my $graph_stats,                # opt-string (defined)
    'sort-by=s'                  => \my $sort_by,                    # string
    'similarity-metric-cohort=s' => \my $similarity_metric_cohort,   # string
    'patients-of-interest|poi=s{1,}' => \my @patients_of_interest,   # array
    'poi-out-dir=s'              => \my $poi_out_dir,                # string
    'include-terms=s{1,11}'      => \my @include_terms,              # array
    'exclude-terms=s{1,11}'      => \my @exclude_terms,              # array
    'retain-excluded-phenotypicFeatures' =>
      \my $retain_excluded_phenotypicFeatures,                       # flag
    'prp|precomputed-ref-prefix=s' => \my $precomputed_ref_prefix,   # string
    'max-matrix-records-in-ram=i'  => \my $max_matrix_records_in_ram,    # integer
    'config=s'                   => \my $config_file,                # string
    'age!'                       => \$age,                           # flag
    'help|?'                     => \my $help,                       # flag
    'log:s'                      => \my $log,                        # opt-string (defined)
    'man'                        => \my $man,                        # flag
    'debug=i'                    => \my $debug,                      # integer

    # Explicit 'v' alias: without it '-v' is an ambiguous abbreviation of
    # 'verbose' and 'version'. Case is significant (no_ignore_case), so it
    # does not clash with '-V'.
    'verbose|v'                  => \my $verbose,                    # flag
    'color!'                     => \$color,                         # flag
    'version|V' => sub { say "$0 Version $VERSION"; exit; }
) or pod2usage(2);
78
1
50
2161
# Help / manual requests are served before any validation
pod2usage(1) if $help;
pod2usage( -verbose => 2, -exitval => 0 ) if $man;

# A reference cohort is required unless a precomputed prefix (--prp) is given
pod2usage(
    -message => "Please specify a reference-cohort(s) with <--r>\n",
    -exitval => 1
) unless ( @reference_files || $precomputed_ref_prefix );

# --graph-stats needs --cytoscape-json
pod2usage(
    -message =>
      "<--graph-stats> only works in conjunction with <--cytoscape-json>\n",
    -exitval => 1
) if ( defined $graph_stats && !defined $cytoscape_json );

# The weights file, when given, must be an existing plain file
pod2usage(
    -message => "Weights file <$weights_file> does not exist\n",
    -exitval => 1
) if ( defined $weights_file && !-f $weights_file );

# Set the name of the output: an explicit --out-file wins; otherwise the
# default depends on whether a target file was given
my $out_file = $out_file_arg
  // ( $target_file ? $out_file_patient : $out_file_cohort );
97
98
# Set import data
99
# Paths of the precomputed reference files. They remain undefined unless
# --prp / --precomputed-ref-prefix was given.
my (
    $glob_hash_file,       $ref_hash_file,
    $ref_binary_hash_file, $coverage_stats_file
);
if ( defined $precomputed_ref_prefix ) {

    # Check if any incompatible parameters are provided
    my $has_incompatible_options =
         @reference_files
      || @append_prefixes
      || $age
      || defined $include_hpo_ascendants
      || defined $retain_excluded_phenotypicFeatures
      || defined $weights_file;

    if ($has_incompatible_options) {

        # Names must match the option names accepted on the command line
        my $flags_str = join(
            "\n",
            '--reference',              '--age',
            '--include-hpo-ascendants', '--retain-excluded-phenotypicFeatures',
            '--weights',                '--append-prefixes'
        );
        pod2usage(
            -message =>
"Sorry, but the options\n$flags_str\nare incompatible with --prp <$precomputed_ref_prefix>\n",
            -exitval => 1,
        );
    }

    # Derive each file name from the prefix; resolve_file() picks the '.gz'
    # variant when only that one exists on disk
    $glob_hash_file = resolve_file( $precomputed_ref_prefix . '.glob_hash.json' );
    $ref_hash_file  = resolve_file( $precomputed_ref_prefix . '.ref_hash.json' );
    $ref_binary_hash_file =
      resolve_file( $precomputed_ref_prefix . '.ref_binary_hash.json' );
    $coverage_stats_file =
      resolve_file( $precomputed_ref_prefix . '.coverage_stats.json' );
}
141
142
# Set cytoscape-json and graph-stats logic: both are rejected when a target
# file is given, and both fall back to a default file name
for my $spec (
    [
        \$cytoscape_json,
        "<--cytoscape-json> only works in cohort-mode",
        $out_file_graph
    ],
    [
        \$graph_stats,
        "<--graph-stats> only works in cohort-mode",
        $out_file_graph_stats
    ],
  )
{
    my ( $option_ref, $message, $default ) = @{$spec};
    handle_option( $option_ref, $message, $target_file, $default );
}

# Turning color off if argument <--no-color>
$ENV{'ANSI_COLORS_DISABLED'} = 1 if !$color;

# Start printing to STDOUT
if ($verbose) {
    say BOLD CYAN program_header($VERSION), RESET;
}
155
156
######################
157
# START PHENO-RANKER #
158
######################
159
160
# Collect every setting in a single hashref; it is handed to the
# Pheno::Ranker constructor and, with --log, also written to the log file
my $data = {

    # Input
    reference_files        => \@reference_files,
    target_file            => $target_file,
    weights_file           => $weights_file,
    config_file            => $config_file,
    hpo_file               => undef,
    append_prefixes        => \@append_prefixes,
    precomputed_ref_prefix => $precomputed_ref_prefix,
    glob_hash_file         => $glob_hash_file,
    ref_hash_file          => $ref_hash_file,
    ref_binary_hash_file   => $ref_binary_hash_file,
    coverage_stats_file    => $coverage_stats_file,

    # Output
    out_file             => $out_file,
    align                => $align,
    align_basename       => $align_basename,
    export               => $export,
    export_basename      => $export_basename,
    cytoscape_json       => $cytoscape_json,
    graph_stats          => $graph_stats,
    patients_of_interest => \@patients_of_interest,
    poi_out_dir          => $poi_out_dir,

    # Processing parameters
    include_hpo_ascendants             => $include_hpo_ascendants,
    retain_excluded_phenotypicFeatures => $retain_excluded_phenotypicFeatures,
    include_terms                      => \@include_terms,
    exclude_terms                      => \@exclude_terms,
    max_out                            => $max_out,
    max_number_vars                    => $max_number_vars,
    max_matrix_records_in_ram          => $max_matrix_records_in_ram,
    sort_by                            => $sort_by,
    similarity_metric_cohort           => $similarity_metric_cohort,
    age => $age, # Solution, use ageRange in PXF/BFF, measures' values more difficult

    # Generic
    cli     => $cli,
    log     => $log,
    debug   => $debug,
    verbose => $verbose
};

# Build the object and run the analysis
my $ranker = Pheno::Ranker->new($data);
$ranker->run();

# Create log if <--log>; a bare --log (empty value) uses the default name
if ( defined $log ) {
    my $log_path = $log ? $log : $log_file;
    write_log( $log_path, $data, $VERSION );
}
207
208
####################
209
# END PHENO-RANKER #
210
####################
211
212
# handle_option( \$option, $message, $target_file, $default )
#
# Post-processes an option that takes an optional value.
#   $option_ref  - reference to the option variable (modified in place)
#   $message     - usage message shown when the option is not allowed
#   $target_file - when true, the option is rejected: pod2usage() prints
#                  $message and exits with status 1
#   $default     - value assigned when the option was given without a value
#
# Nothing happens when the option variable is undefined.
sub handle_option {
    my ( $option_ref, $message, $target_file, $default ) = @_;

    return unless defined $$option_ref;

    pod2usage( -message => $message, -exitval => 1 ) if $target_file;

    # Test the length rather than the truthiness so that an explicit value
    # of '0' is kept instead of being replaced by the default
    $$option_ref = $default unless length $$option_ref;
    return;
}
219
220
# resolve_file($base)
#
# Returns the path to use for $base: $base itself when it exists, otherwise
# "$base.gz" when that exists, otherwise $base unchanged (no warning or
# error is raised for a missing file).
sub resolve_file {
    my $base = shift;

    # The first candidate found on disk wins: plain file, then gzipped one
    for my $candidate ( $base, $base . '.gz' ) {
        return $candidate if -e $candidate;
    }

    # Neither exists: hand back the plain name
    return $base;
}
233
234
# write_log( $log, $data, $VERSION )
#
# Writes a JSON log file via write_json().
#   $log     - path of the file to write
#   $data    - hashref with the run settings; stored under the 'data' key,
#              and its 'verbose' entry controls the progress message
#   $VERSION - version string stored under info.version
#
# The 'info' section holds the date, the number of logical CPUs reported by
# the host, the hostname, an id built from the epoch time and the PID, the
# version and the user name taken from the environment.
sub write_log {
    my ( $log, $data, $VERSION ) = @_;

    # NB: Darwin does not have nproc to show #logical-cores, using sysctl instead
    my $os = $^O;
    my $cpu_output =
        lc($os) eq 'darwin'  ? qx{/usr/sbin/sysctl -n hw.logicalcpu}
      : lc($os) eq 'freebsd' ? qx{sysctl -n hw.ncpu}
      : $os eq 'MSWin32'     ? qx{wmic cpu get NumberOfLogicalProcessors}
      :                        qx{/usr/bin/nproc};

    # Take the first run of digits from the output. This drops the trailing
    # newline and, on Windows, the "NumberOfLogicalProcessors" header line.
    # When the command could not be run or printed no number, fall back to 1
    # on every platform instead of propagating undef.
    my ($threadshost) = ( $cpu_output // '' ) =~ /(\d+)/;
    $threadshost = 0 + ( $threadshost // 1 );    # coercing it to be a number

    my $info = {
        date        => ( strftime "%a %b %e %H:%M:%S %Y", localtime ),
        threadshost => $threadshost,
        hostname    => hostname(),
        id          => time . substr( "00000$$", -5 ),    # string
        version     => $VERSION,
        user        => $ENV{'LOGNAME'}
          || $ENV{'USER'}
          || $ENV{'USERNAME'}
          || 'dummy-user'
    };

    # Saving file
    say BOLD GREEN "Writing <$log> file\n" if $data->{verbose};
    return write_json(
        {
            filepath => $log,
            data     => { info => $info, data => $data }
        }
    );
}
276
277
# program_header($VERSION)
#
# Returns the multi-line banner shown in verbose mode, with $VERSION
# interpolated into the "Version:" line.
sub program_header {
    my $VERSION = shift;

    # Interpolating here-doc: the terminator must stay at column 0
    my $str = <<EOF;
****************************************
* Rank against cohort(s) (BFF/PXF) *
* - PHENO-RANKER - *
* Version: $VERSION *
* (C) 2023-2025 Manuel Rueda, PhD *
* The Artistic License 2.0 *
****************************************
EOF
    return $str;
}
290
291
=head1 NAME
292
293
pheno-ranker: A script that performs semantic similarity in PXF/BFF data structures and beyond (JSON|YAML)
294
295
=head1 SYNOPSIS
296
297
pheno-ranker -r <individuals.json> -t <patient.json> [-options]
298
299
Arguments:
300
* Cohort mode:
301
-r, --reference JSON/YAML BFF/PXF file(s) (array/object), supports .gz
302
303
* Patient mode:
304
-t, --target JSON/YAML BFF/PXF file (object or single-object array), supports .gz
305
306
Options:
307
-age Include age-related variables; excludes agent-like terms (BFF/PXF-only) [>no-age|age]
308
-a, --align [path/basename] Write alignment file(s). If not specified, default filenames are used [default: alignment.*]
309
-append-prefixes Prefixes for primary_key when #cohorts >= 2 [default: C]
310
-config YAML config file to modify default parameters [default: share/conf/config.yaml]
311
-cytoscape-json [file] Serializes the pairwise comparison matrix as an undirected graph in JSON, compatible with Cytoscape [default: graph.json]
312
-e, --export [path/basename] Export miscellaneous JSON files. If not specified, default filenames are used [default: export.*]
313
-exclude-terms Exclude BFF/PXF terms (e.g., --exclude-terms sex, id) or column names in JSON-derived from CSV
314
-graph-stats [file] Generates a text file with key graph metrics, for use with <-cytoscape-json> [default: graph_stats.txt]
315
-include-hpo-ascendants Include ascendant terms from the Human Phenotype Ontology (HPO)
316
-include-terms Include BFF/PXF terms (e.g., --include-terms diseases) or column names in JSON-derived from CSV
317
-max-matrix-records-in-ram In cohort mode, set max records before switching to RAM-efficient mode (default: 5000)
318
-max-number-vars Maximum number of variables for binary string [default: 10000]
319
-max-out Print only N comparisons [default: 50]
320
-o, --out-file Output file path [default: -r matrix.txt | -t rank.txt]
321
-poi, --patients-of-interest Export JSON files for the selected individual IDs during a dry-run
322
-poi-out-dir Directory for JSON files (used with --poi)
323
-prp, --precomputed-ref-prefix [path/basename] Use precomputed data for the reference cohort(s). No need to use --r
324
-retain-excluded-phenotypicFeatures Retains features set to "excluded": true by appending '_excluded' to their IDs
325
-similarity-metric-cohort Similarity metric for cohort mode [>hamming|jaccard]
326
-sort-by Sort by Hamming distance or Jaccard index [>hamming|jaccard]
327
-w, --weights YAML file with weights
328
329
Generic Options:
330
-debug Print debugging (from 1 to 5, being 5 max)
331
-h, --help Brief help message
332
-log Save log file [default: pheno-ranker-log.json]
333
-man Full documentation
334
-no-color Toggle color output [>color|no-color]
335
-v, --verbose Verbosity on
336
-V, --version Print version
337
338
=head1 SUMMARY
339
340
Pheno-Ranker is a lightweight, easy-to-install tool for performing semantic similarity analysis on phenotypic data in JSON/YAML formats, including Beacon v2 Models and Phenopackets v2. It also supports pre-processed CSV files prepared using the included CSV conversion utility.
341
342
=head1 INSTALLATION
343
344
If you plan to only use the C<pheno-ranker> CLI, we recommend installing it via CPAN. See details below.
345
346
=head2 Non containerized
347
348
The script runs on command-line Linux and it has been tested on Debian/RedHat/macOS based distributions (only showing commands for Debian). Perl 5 is installed by default on Linux,
349
but we will install a few CPAN modules with C<cpanminus>.
350
351
=head3 Method 1: From CPAN
352
353
First install system level dependencies:
354
355
sudo apt-get install cpanminus libperl-dev gcc make
356
357
Now you have to choose between one of the 2 options below:
358
359
B<Option 1:> System-level installation:
360
361
cpanm --notest --sudo Pheno::Ranker
362
pheno-ranker -h
363
364
B<Option 2:> Install Pheno-Ranker and the dependencies at C<~/perl5>
365
366
cpanm --local-lib=~/perl5 local::lib && eval $(perl -I ~/perl5/lib/perl5/ -Mlocal::lib)
367
cpanm --notest Pheno::Ranker
368
pheno-ranker --help
369
370
To ensure Perl recognizes your local modules every time you start a new terminal, you should type:
371
372
echo 'eval $(perl -I ~/perl5/lib/perl5/ -Mlocal::lib)' >> ~/.bashrc
373
374
To B<upgrade> to the newest version (showing commands for Option 2):
375
376
cpanm Pheno::Ranker
377
378
=head3 Method 2: From CPAN in a CONDA environment
379
380
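Please follow the Conda installation instructions in the project repository (L<https://github.com/cnag-biomedical-informatics/pheno-ranker>).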
381
382
=head3 Method 3: From GitHub
383
384
To clone the repository for the first time:
385
386
git clone https://github.com/cnag-biomedical-informatics/pheno-ranker.git
387
cd pheno-ranker
388
389
To update an existing clone, navigate to the repository folder and run:
390
391
git pull
392
393
Install system level dependencies:
394
395
sudo apt-get install cpanminus libperl-dev
396
397
Now you have to choose between one of the 2 options below:
398
399
B<Option 1:> Install dependencies (they're harmless to your system) as C<root>:
400
401
cpanm --notest --sudo --installdeps .
402
bin/pheno-ranker --help
403
404
B<Option 2:> Install the dependencies at C<~/perl5>:
405
406
cpanm --local-lib=~/perl5 local::lib && eval $(perl -I ~/perl5/lib/perl5/ -Mlocal::lib)
407
cpanm --notest --installdeps .
408
bin/pheno-ranker --help
409
410
To ensure Perl recognizes your local modules every time you start a new terminal, you should type:
411
412
echo 'eval $(perl -I ~/perl5/lib/perl5/ -Mlocal::lib)' >> ~/.bashrc
413
414
I<Optional:> If you want to use the Python-based utilities included in the repository:
415
416
sudo apt-get install python3-pip libzbar0
417
pip3 install -r requirements.txt
418
419
=head2 Containerized
420
421
=head3 Method 4: From Docker Hub
422
423
(Estimated Time: Approximately 10 seconds)
424
425
Download the latest version of the Docker image (supports both amd64 and arm64 architectures) from L<Docker Hub|https://hub.docker.com/r/manuelrueda/pheno-ranker> by executing:
426
427
docker pull manuelrueda/pheno-ranker:latest
428
docker image tag manuelrueda/pheno-ranker:latest cnag/pheno-ranker:latest
429
430
See additional instructions below.
431
432
=head3 Method 5: With Dockerfile
433
434
(Estimated Time: Approximately 1 minute)
435
436
Please download the C<Dockerfile> from the repo:
437
438
wget https://raw.githubusercontent.com/cnag-biomedical-informatics/pheno-ranker/main/Dockerfile
439
440
And then run:
441
442
# Docker Version 19.03 and Above (Supports buildx)
443
docker buildx build -t cnag/pheno-ranker:latest .
444
445
# Docker Version Older than 19.03 (Does Not Support buildx)
446
docker build -t cnag/pheno-ranker:latest .
447
448
=head3 Additional instructions for Methods 4 and 5
449
450
To run the container (detached) execute:
451
452
docker run -tid -e USERNAME=root --name pheno-ranker cnag/pheno-ranker:latest
453
454
To enter:
455
456
docker exec -ti pheno-ranker bash
457
458
The command-line executable can be found at:
459
460
/usr/share/pheno-ranker/bin/pheno-ranker
461
462
The default container user is C<root>, but you can also run the container as a non-root user with C<$UID=1000>.
463
464
docker run --user 1000 -tid --name pheno-ranker cnag/pheno-ranker:latest
465
466
=head3 Mounting volumes
467
468
Docker containers are fully isolated. If you need to mount a volume to the container please use the following syntax (C<-v host:container>).
469
Find an example below (note that you need to change the paths to match yours):
470
471
docker run -tid --volume /media/mrueda/4TBT/data:/data --name pheno-ranker-mount cnag/pheno-ranker:latest
472
473
Then I will do something like this:
474
475
# First I create an alias to simplify invocation (from the host)
476
alias pheno-ranker='docker exec -ti pheno-ranker-mount /usr/share/pheno-ranker/bin/pheno-ranker'
477
478
# Now I use the alias to run the command (note that I use the flag --o to specify the filepath)
479
pheno-ranker -r /data/individuals.json -o /data/matrix.txt
480
481
=head3 System requirements
482
483
- OS/ARCH supported: B<linux/amd64> and B<linux/arm64>.
484
- Ideally a Debian-based distribution (Ubuntu or Mint), but any other (e.g., CentOS, OpenSUSE) should do as well (untested).
485
(It should also work on macOS and Windows Server, but we are only providing information for Linux here)
486
* Perl 5 (>= 5.26 core; installed by default in most Linux distributions). Check the version with "perl -v".
487
* >= 4GB of RAM
488
* 1 core
489
* At least 16GB HDD
490
491
=head1 HOW TO RUN PHENO-RANKER
492
493
For executing pheno-ranker you will need a PXF/BFF file(s) in JSON|YAML format. The reference cohort must be a JSON array, where each individual data are consolidated in one object.
494
495
You can download examples from the project repository (L<https://github.com/cnag-biomedical-informatics/pheno-ranker>).
496
497
There are two modes of operation:
498
499
=over 4
500
501
=item Cohort mode:
502
503
B<Intra-cohort:> With C<--r> argument and 1 cohort.
504
505
B<Inter-cohort:> With C<--r> and multiple cohort files. It can be used in combination with C<--append-prefixes> to add prefixes to each individual id.
506
507
=item Patient Mode:
508
509
With C<-r> reference cohort(s) and C<--t> patient data.
510
511
=back
512
513
B<Examples:>
514
515
$ bin/pheno-ranker -r phenopackets.json # intra-cohort
516
517
$ bin/pheno-ranker -r phenopackets.yaml -o my_matrix.txt # intra-cohort
518
519
$ bin/pheno-ranker -r phenopackets.json -w weights.yaml --exclude-terms sex ethnicity exposures # intra-cohort with weights
520
521
$ $path/pheno-ranker -r individuals.json others.yaml --append-prefixes CANCER CONTROL # inter-cohort
522
523
$ $path/pheno-ranker -r individuals.json -t patient.yaml -max-out 100 # mode patient
524
525
526
=head2 COMMON ERRORS AND SOLUTIONS
527
528
* Error message: R plotting
529
Error in scan(file = file, what = what, sep = sep, quote = quote, dec = dec, :
530
line 1 did not have X elements
531
Calls: as.matrix -> read.table -> scan
532
Execution halted
533
Solution: Make sure that the values of your primary key (e.g., "id") do not contain spaces (e.g., "my fav id" must be "my_fav_id")
534
535
* Error message: Foo
536
Solution: Bar
537
538
=head1 CITATION
539
540
The author requests that any published work that utilizes C<Pheno-Ranker> includes a citation of the following reference:
541
542
Leist, I.C. et al., (2024). Pheno-Ranker: a toolkit for comparison of phenotypic data stored in GA4GH standards and beyond. I<BMC Bioinformatics>. DOI: 10.1186/s12859-024-05993-2
543
544
=head1 AUTHOR
545
546
Written by Manuel Rueda, PhD. Info about CNAG can be found at L<https://www.cnag.eu>.
547
548
=head1 COPYRIGHT AND LICENSE
549
550
This PERL file is copyrighted. See the LICENSE file included in this distribution.
551
552
=cut