line
stmt
bran
cond
sub
pod
time
code
1
#!/usr/bin/env perl
2
#
3
# A script that performs semantic similarity in PXF|BFF data structures
4
#
5
# Last Modified: Feb/01/2025
6
#
7
# $VERSION taken from Pheno::Ranker
8
#
9
# Copyright (C) 2023-2025 Manuel Rueda - CNAG (manuel.rueda@cnag.eu)
10
#
11
# License: Artistic License 2.0
12
#
13
# If this program helps you in your research, please cite.
14
15
package main;
16
17
1
1
3954
use strict;
1
2
1
31
18
1
1
7
use warnings;
1
1
1
35
19
1
1
372
use autodie;
1
13238
1
3
20
1
1
4368
use feature qw(say);
1
1
1
123
21
1
1
649
use Getopt::Long qw(:config no_ignore_case);
1
10913
1
5
22
1
1
583
use Pod::Usage;
1
51769
1
105
23
1
1
533
use Data::Dumper;
1
6032
1
56
24
1
1
512
use Sys::Hostname;
1
1054
1
54
25
1
1
415
use POSIX qw(strftime);
1
4114
1
3
26
1
1
1740
use Term::ANSIColor qw(:constants);
1
7121
1
828
27
1
1
432
use File::ShareDir::ProjectDistDir qw(dist_dir);
1
33413
1
7
28
1
1
817
use FindBin qw($Bin);
1
1042
1
127
29
1
1
466
use lib "$Bin/../lib";
1
558
1
5
30
1
1
544
use Pheno::Ranker qw($VERSION write_json);
1
4
1
1193
31
32
# Defining a few variables
33
1
162239
# Default file names / basenames and switches used by the rest of the script.
# (Stray coverage-counter lines that were interleaved here have been removed.)
my $out_file_cohort      = 'matrix.txt';               # default output when no target is given
my $out_file_patient     = 'rank.txt';                 # default output when a target is given
my $out_file_graph       = 'graph.json';               # fallback for --cytoscape-json without value
my $out_file_graph_stats = 'graph_stats.txt';          # fallback for --graph-stats without value
my $export_basename      = 'export';
my $align_basename       = 'alignment';
my $log_file             = 'pheno-ranker-log.json';    # fallback for --log without value
my $color                = 1;                          # negatable via --no-color
my $age                  = 0;                          # negatable via --age / --no-age
my $cli                  = 1;                          # forwarded as 'cli' to Pheno::Ranker
43
44
# Reading arguments
45
# Parse the command line (Getopt::Long, case-sensitive short options).
# Most destinations are declared in place with `\my`, so they remain
# undef / empty unless the corresponding option is supplied; $age and $color
# are the pre-declared defaults.
# Aliases `v` (verbose) and `prp` (precomputed-ref-prefix) are declared
# explicitly because the POD documents them: without an explicit alias `-v`
# is an ambiguous abbreviation of verbose/version, and `-prp` is not a prefix
# of any option name.
GetOptions(
    'reference|r=s{1,}'      => \my @reference_files,           # array
    'target|t=s'             => \my $target_file,               # string
    'weights|w=s'            => \my $weights_file,              # string
    'append-prefixes=s{1,}'  => \my @append_prefixes,           # array
    'out-file|o=s'           => \my $out_file_arg,              # string
    'max-out:i'              => \my $max_out,                   # integer
    'max-number-vars:i'      => \my $max_number_vars,           # integer
    'include-hpo-ascendants' => \my $include_hpo_ascendants,    # flag
    'export|e:s'             => \my $export,                    # opt-string (defined)
    'align|a:s'              => \my $align,                     # opt-string (defined)
    'cytoscape-json:s'       => \my $cytoscape_json,            # opt-string (defined)
    'graph-stats:s'          => \my $graph_stats,               # opt-string (defined)
    'sort-by=s'              => \my $sort_by,                   # string
    'similarity-metric-cohort=s'     => \my $similarity_metric_cohort,    # string
    'patients-of-interest|poi=s{1,}' => \my @patients_of_interest,        # array
    'poi-out-dir=s'                  => \my $poi_out_dir,                 # string
    'include-terms=s{1,11}'          => \my @include_terms,               # array
    'exclude-terms=s{1,11}'          => \my @exclude_terms,               # array
    'retain-excluded-phenotypicFeatures' =>
      \my $retain_excluded_phenotypicFeatures,                            # flag
    'precomputed-ref-prefix|prp=s' => \my $precomputed_ref_prefix,        # string
    'max-matrix-items-in-ram=i'    => \my $max_matrix_items_in_ram,       # integer
    'config=s'  => \my $config_file,                                      # string
    'age!'      => \$age,                                                 # flag
    'help|?'    => \my $help,                                             # flag
    'log:s'     => \my $log,                                              # opt-string (defined)
    'man'       => \my $man,                                              # flag
    'debug=i'   => \my $debug,                                            # integer
    'verbose|v' => \my $verbose,                                          # flag
    'color!'    => \$color,                                               # flag
    'version|V' => sub { say "$0 Version $VERSION"; exit; }
) or pod2usage(2);
78
1
50
2148
# Help / manual requests take precedence over any other validation.
pod2usage(1) if $help;
pod2usage( -verbose => 2, -exitval => 0 ) if $man;

# A reference is mandatory: either reference file(s) or a precomputed prefix.
pod2usage(
    -message => "Please specify a reference-cohort(s) with <--r>\n",
    -exitval => 1
) unless ( @reference_files || $precomputed_ref_prefix );

# The message names the option as it is actually declared (--graph-stats).
pod2usage(
    -message =>
      "<--graph-stats> only works in conjunction with <--cytoscape-json>\n",
    -exitval => 1
) if ( defined $graph_stats && !defined $cytoscape_json );

pod2usage(
    -message => "Weights file <$weights_file> does not exist\n",
    -exitval => 1
) if ( defined $weights_file && !-f $weights_file );

# Set the name of the output: an explicit --out-file wins; otherwise the
# default depends on whether a target file was given.
my $out_file = $out_file_arg
  // ( $target_file ? $out_file_patient : $out_file_cohort );
97
98
# Set import data
99
1
2
# File names of the precomputed reference data; they stay undef unless
# --precomputed-ref-prefix was given.
my ( $glob_hash_file, $ref_hash_file, $ref_binary_hash_file );
if ( defined $precomputed_ref_prefix ) {

    # Check if any incompatible parameters are provided
    my $has_incompatible_options =
         @reference_files
      || @append_prefixes
      || $age
      || defined $include_hpo_ascendants
      || defined $retain_excluded_phenotypicFeatures
      || defined $weights_file;

    if ($has_incompatible_options) {

        # The message uses the option names exactly as declared in GetOptions
        pod2usage(
            -message =>
"Sorry, but <--reference/age/include-hpo-ascendants/retain-excluded-phenotypicFeatures/weights/append-prefixes> are incompatible with <--precomputed-ref-prefix $precomputed_ref_prefix>\n",
            -exitval => 1
        );
    }

    # Generate file names based on precomputed_ref_prefix
    my $base_glob       = $precomputed_ref_prefix . '.glob_hash.json';
    my $base_ref        = $precomputed_ref_prefix . '.ref_hash.json';
    my $base_ref_binary = $precomputed_ref_prefix . '.ref_binary_hash.json';

    # Use the helper to check for .gz versions if needed.
    $glob_hash_file       = resolve_file($base_glob);
    $ref_hash_file        = resolve_file($base_ref);
    $ref_binary_hash_file = resolve_file($base_ref_binary);
}
129
130
# Set cytoscape-json logic
131
1
5
# Set cytoscape-json logic: rejected when a target file is given; an empty
# value falls back to the default graph file name.
handle_option(
    \$cytoscape_json,
    "<--cytoscape-json> only works in cohort-mode",
    $target_file, $out_file_graph
);

# Set graph-stats logic (same rules, different default file name)
handle_option(
    \$graph_stats,
    "<--graph-stats> only works in cohort-mode",
    $target_file, $out_file_graph_stats
);

# Turning color off if argument <--no-color>
$ENV{'ANSI_COLORS_DISABLED'} = 1 unless $color;

# Start printing to STDOUT
say BOLD CYAN program_header($VERSION), RESET if $verbose;
143
144
######################
145
# START PHENO-RANKER #
146
######################
147
148
# Load data as hashref
149
1
44
# Collect every parsed option (plus the derived file names) into the single
# hashref handed to Pheno::Ranker->new; write_log also receives this hashref.
my $data = {
    reference_files                    => \@reference_files,
    target_file                        => $target_file,
    weights_file                       => $weights_file,
    include_hpo_ascendants             => $include_hpo_ascendants,
    hpo_file                           => undef,
    align                              => $align,
    align_basename                     => $align_basename,
    export                             => $export,
    export_basename                    => $export_basename,
    out_file                           => $out_file,
    cytoscape_json                     => $cytoscape_json,
    graph_stats                        => $graph_stats,
    max_out                            => $max_out,
    max_number_vars                    => $max_number_vars,
    sort_by                            => $sort_by,
    similarity_metric_cohort           => $similarity_metric_cohort,
    patients_of_interest               => \@patients_of_interest,
    poi_out_dir                        => $poi_out_dir,
    include_terms                      => \@include_terms,
    exclude_terms                      => \@exclude_terms,
    retain_excluded_phenotypicFeatures => $retain_excluded_phenotypicFeatures,
    precomputed_ref_prefix             => $precomputed_ref_prefix,
    max_matrix_items_in_ram            => $max_matrix_items_in_ram,
    glob_hash_file                     => $glob_hash_file,
    ref_hash_file                      => $ref_hash_file,
    ref_binary_hash_file               => $ref_binary_hash_file,
    config_file                        => $config_file,
    age => $age, # Solution, use ageRange in PXF/BFF, measures' values more difficult
    cli             => $cli,
    append_prefixes => \@append_prefixes,
    log             => $log,
    debug           => $debug,
    verbose         => $verbose
};
184
185
# Create object
186
1
14
my $ranker = Pheno::Ranker->new($data);

# Run method
$ranker->run();

# Create log if <--log>.
# --log takes an optional value: it is '' when given without a file name, in
# which case the default log file is used. A length test (rather than a
# truthiness test) keeps a user-supplied name such as "0" intact.
write_log( ( length($log) ? $log : $log_file ), $data, $VERSION )
  if defined $log;
194
195
####################
196
# END PHENO-RANKER #
197
####################
198
199
# handle_option( \$option, $message, $target_file, $default )
#
# Post-processes an option that accepts an optional value.
#   $option_ref  - reference to the option variable; left untouched when undef
#   $message     - usage error shown when the option is combined with a target
#   $target_file - when true, the option is rejected via pod2usage (exit 1)
#   $default     - value stored when the option was given without a value
#
# The "no value" case is detected with length(), so an explicit value of "0"
# is kept instead of being replaced by the default.
sub handle_option {
    my ( $option_ref, $message, $target_file, $default ) = @_;

    # Option not present on the command line: nothing to do
    return unless defined $$option_ref;

    pod2usage( -message => $message, -exitval => 1 ) if $target_file;

    $$option_ref = $default unless length $$option_ref;
    return;
}
207
208
# resolve_file($base)
#
# Returns the path to use for $base:
#   - $base itself when it exists,
#   - "$base.gz" when only the gzipped variant exists,
#   - $base unchanged when neither exists (no warning or error is raised here,
#     so a missing file surfaces wherever the path is opened later).
sub resolve_file {
    my $base = shift;

    # If the base file exists, use it.
    return $base if -e $base;

    # If a gzipped version exists, use that.
    my $gz = $base . '.gz';
    return $gz if -e $gz;

    # Neither exists: hand back the base name as-is.
    return $base;
}
222
223
# write_log( $log, $data, $VERSION )
#
# Writes a JSON log through write_json containing:
#   info - run metadata (date, logical CPU count, hostname, id, version, user)
#   data - the $data hashref exactly as received
#
#   $log     - path of the log file to write
#   $data    - hashref of run parameters; $data->{verbose} enables a message
#   $VERSION - version string stored in the log
sub write_log {
    my ( $log, $data, $VERSION ) = @_;

    # NB: Darwin does not have nproc to show #logical-cores, using sysctl instead
    my $os      = $^O;
    my $raw_cpu =
        lc($os) eq 'darwin' ? qx{/usr/sbin/sysctl -n hw.logicalcpu}
      : $os eq 'MSWin32'    ? qx{wmic cpu get NumberOfLogicalProcessors}
      :                       qx{/usr/bin/nproc};

    # Extract the first integer from the command output. This strips the
    # trailing newline and, on Windows, the "NumberOfLogicalProcessors" header
    # line. When the command could not be run or printed no number, fall back
    # to 1 on every platform instead of coercing undef/'' to a number.
    my ($ncpuhost) = ( $raw_cpu // '' ) =~ /(\d+)/;
    $ncpuhost = 0 + ( $ncpuhost // 1 );    # coercing it to be a number

    my $info = {
        date     => strftime( "%a %b %e %H:%M:%S %Y", localtime ),
        ncpuhost => $ncpuhost,
        hostname => hostname(),
        id       => time() . substr( "00000$$", -5 ),    # string
        version  => $VERSION,
        user     => $ENV{'LOGNAME'}
          || $ENV{'USER'}
          || $ENV{'USERNAME'}
          || 'dummy-user'
    };

    # Saving file (RESET so the color does not leak into later output)
    say BOLD GREEN "Writing <$log> file\n", RESET if $data->{verbose};
    write_json(
        {
            filepath => $log,
            data     => { info => $info, data => $data }
        }
    );
}
265
266
# program_header($VERSION)
#
# Returns the multi-line banner, with $VERSION interpolated, that is printed
# at start-up in verbose mode.
#
# The heredoc operator (<<EOF;) is restored here; without it the assignment
# is not valid Perl.
# NOTE(review): the inner padding of the banner lines looks collapsed in this
# copy of the file - confirm the column alignment against the intended layout.
sub program_header {
    my $VERSION = shift;
    my $str     = <<EOF;
****************************************
* Rank against cohort(s) (BFF/PXF) *
* - PHENO-RANKER - *
* Version: $VERSION *
* (C) 2023-2025 Manuel Rueda, PhD *
* The Artistic License 2.0 *
****************************************
EOF
    return $str;
}
280
281
=head1 NAME
282
283
pheno-ranker: A script that performs semantic similarity in PXF/BFF data structures and beyond (JSON|YAML)
284
285
=head1 SYNOPSIS
286
287
pheno-ranker -r <individuals.json> -t <patient.json> [-options]
288
289
Arguments:
290
* Cohort mode:
291
-r, --reference JSON/YAML BFF/PXF file(s) (array/object), supports .gz
292
293
* Patient mode:
294
-t, --target JSON/YAML BFF/PXF file (object or single-object array), supports .gz
295
296
Options:
297
-age Include age-related variables; excludes agent-like terms (BFF/PXF-only) [>no-age|age]
298
-a, --align [path/basename] Write alignment file(s). If not specified, default filenames are used [default: alignment.*]
299
-append-prefixes Prefixes for primary_key when #cohorts >= 2 [default: C]
300
-config YAML config file to modify default parameters [default: share/conf/config.yaml]
301
-cytoscape-json [file] Serializes the pairwise comparison matrix as an undirected graph in JSON, compatible with Cytoscape [default: graph.json]
302
-e, --export [path/basename] Export miscellaneous JSON files. If not specified, default filenames are used [default: export.*]
303
-exclude-terms Exclude BFF/PXF terms (e.g., --exclude-terms sex, id) or column names in JSON-derived from CSV
304
-graph-stats [file] Generates a text file with key graph metrics, for use with <-cytoscape-json> [default: graph_stats.txt]
305
-prp, --precomputed-ref-prefix [path/basename] Use precomputed data for the reference cohort(s). No need to use --r
306
-include-hpo-ascendants Include ascendant terms from the Human Phenotype Ontology (HPO)
307
-include-terms Include BFF/PXF terms (e.g., --include-terms diseases) or column names in JSON-derived from CSV
308
-max-matrix-items-in-ram In cohort mode, set max items before switching to RAM-efficient mode [default: 5000]
309
-max-number-vars Maximum number of variables for binary string [default: 10000]
310
-max-out Print only N comparisons [default: 50]
311
-o, --out-file Output file path [default: -r matrix.txt | -t rank.txt]
312
-poi, --patients-of-interest Export JSON files for the selected individual IDs during a dry-run
313
-poi-out-dir Directory for JSON files (used with --poi)
314
-retain-excluded-phenotypicFeatures Retains features set to "excluded": true by appending '_excluded' to their IDs
315
-similarity-metric-cohort Similarity metric for cohort mode [>hamming|jaccard]
316
-sort-by Sort by Hamming distance or Jaccard index [>hamming|jaccard]
317
-w, --weights YAML file with weights
318
319
Generic Options:
320
-debug Print debugging (from 1 to 5, being 5 max)
321
-h, --help Brief help message
322
-log Save log file [default: pheno-ranker-log.json]
323
-man Full documentation
324
-no-color Toggle color output [>color|no-color]
325
-v, --verbose Verbosity on
326
-V, --version Print version
327
328
=head1 SUMMARY
329
330
Pheno-Ranker is a lightweight, easy-to-install tool for performing semantic similarity analysis on phenotypic data in JSON/YAML formats, including Beacon v2 Models and Phenopackets v2. It also supports pre-processed CSV files prepared using the included C utility.
331
332
=head1 INSTALLATION
333
334
If you plan to use only the C<pheno-ranker> CLI, we recommend installing it via CPAN. See details below.
335
336
=head2 Non containerized
337
338
The script runs on command-line Linux and it has been tested on Debian/RedHat/macOS based distributions (only showing commands for Debian). Perl 5 is installed by default on Linux,
339
but we will install a few CPAN modules with C<cpanminus>.
340
341
=head3 Method 1: From CPAN
342
343
First install system level dependencies:
344
345
sudo apt-get install cpanminus libperl-dev
346
347
Now you have to choose between one of the 2 options below:
348
349
B<Option 1:> System-level installation:
350
351
cpanm --notest --sudo Pheno::Ranker
352
pheno-ranker -h
353
354
B<Option 2:> Install Pheno-Ranker and the dependencies at C<~/perl5>
355
356
cpanm --local-lib=~/perl5 local::lib && eval $(perl -I ~/perl5/lib/perl5/ -Mlocal::lib)
357
cpanm --notest Pheno::Ranker
358
pheno-ranker --help
359
360
To ensure Perl recognizes your local modules every time you start a new terminal, you should type:
361
362
echo 'eval $(perl -I ~/perl5/lib/perl5/ -Mlocal::lib)' >> ~/.bashrc
363
364
=head3 Method 2: From CPAN in a CONDA environment
365
366
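Please follow the CONDA installation instructions in the project repository (L<https://github.com/cnag-biomedical-informatics/pheno-ranker>).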
367
368
=head3 Method 3: From GitHub
369
370
git clone https://github.com/cnag-biomedical-informatics/pheno-ranker.git
371
cd pheno-ranker
372
373
Install system level dependencies:
374
375
sudo apt-get install cpanminus libperl-dev
376
377
Now you have to choose between one of the 2 options below:
378
379
B<Option 1:> Install dependencies (they're harmless to your system) as C<sudo>:
380
381
cpanm --notest --sudo --installdeps .
382
bin/pheno-ranker --help
383
384
B<Option 2:> Install the dependencies at C<~/perl5>:
385
386
cpanm --local-lib=~/perl5 local::lib && eval $(perl -I ~/perl5/lib/perl5/ -Mlocal::lib)
387
cpanm --notest --installdeps .
388
bin/pheno-ranker --help
389
390
To ensure Perl recognizes your local modules every time you start a new terminal, you should type:
391
392
echo 'eval $(perl -I ~/perl5/lib/perl5/ -Mlocal::lib)' >> ~/.bashrc
393
394
I<Optional:> If you want to use the Python-based utilities included in the repository:
395
396
sudo apt-get install python3-pip libzbar0
397
pip3 install -r requirements.txt
398
399
=head2 Containerized
400
401
=head3 Method 4: From Docker Hub
402
403
Download the latest version of the Docker image (supports both amd64 and arm64 architectures) from L<Docker Hub|https://hub.docker.com/r/manuelrueda/pheno-ranker> by executing:
404
405
docker pull manuelrueda/pheno-ranker:latest
406
docker image tag manuelrueda/pheno-ranker:latest cnag/pheno-ranker:latest
407
408
See additional instructions below.
409
410
=head3 Method 5: With Dockerfile
411
412
Please download the C<Dockerfile> from the repo:
413
414
wget https://raw.githubusercontent.com/cnag-biomedical-informatics/pheno-ranker/main/Dockerfile
415
416
And then run:
417
418
# Docker Version 19.03 and Above (Supports buildx)
419
docker buildx build -t cnag/pheno-ranker:latest .
420
421
# Docker Version Older than 19.03 (Does Not Support buildx)
422
docker build -t cnag/pheno-ranker:latest .
423
424
=head3 Additional instructions for Methods 4 and 5
425
426
To run the container (detached) execute:
427
428
docker run -tid -e USERNAME=root --name pheno-ranker cnag/pheno-ranker:latest
429
430
To enter:
431
432
docker exec -ti pheno-ranker bash
433
434
The command-line executable can be found at:
435
436
/usr/share/pheno-ranker/bin/pheno-ranker
437
438
The default container user is C<root> but you can also run the container as a non-root user with C<$UID=1000>.
439
440
docker run --user 1000 -tid --name pheno-ranker cnag/pheno-ranker:latest
441
442
=head3 Mounting volumes
443
444
Docker containers are fully isolated. If you need to mount a volume to the container please use the following syntax (C<-v host:container>).
445
Find an example below (note that you need to change the paths to match yours):
446
447
docker run -tid --volume /media/mrueda/4TBT/data:/data --name pheno-ranker-mount cnag/pheno-ranker:latest
448
449
Then I will do something like this:
450
451
# First I create an alias to simplify invocation (from the host)
452
alias pheno-ranker='docker exec -ti pheno-ranker-mount /usr/share/pheno-ranker/bin/pheno-ranker'
453
454
# Now I use the alias to run the command (note that I use the flag --o to specify the filepath)
455
pheno-ranker -r /data/individuals.json -o /data/matrix.txt
456
457
=head3 System requirements
458
459
* Ideally a Debian-based distribution (Ubuntu or Mint), but any other (e.g., CentOS, OpenSUSE) should do as well.
460
(It should also work on macOS and Windows Server, but we are only providing information for Linux here)
461
* Perl 5 (>= 5.26 core; installed by default in most Linux distributions). Check the version with "perl -v".
462
* >= 4GB of RAM
463
* 1 core
464
* At least 16GB HDD
465
466
=head1 HOW TO RUN PHENO-RANKER
467
468
To execute pheno-ranker you will need one or more PXF/BFF files in JSON|YAML format. The reference cohort must be a JSON array, where each individual's data is consolidated in one object.
469
470
You can download examples from the project repository (L<https://github.com/cnag-biomedical-informatics/pheno-ranker>).
471
472
There are two modes of operation:
473
474
=over 4
475
476
=item Cohort mode:
477
478
B<Intra-cohort:> With C<--r> argument and 1 cohort.
479
480
B<Inter-cohort:> With C<--r> and multiple cohort files. It can be used in combination with C<--append-prefixes> to add prefixes to each individual id.
481
482
=item Patient Mode:
483
484
With C<-r> reference cohort(s) and C<--t> patient data.
485
486
=back
487
488
B<Examples:>
489
490
$ ./pheno-ranker -r phenopackets.json # intra-cohort
491
492
$ ./pheno-ranker -r phenopackets.yaml -o my_matrix.txt # intra-cohort
493
494
$ ./pheno-ranker -r phenopackets.json -w weights.yaml --exclude-terms sex ethnicity exposures # intra-cohort with weights
495
496
$ $path/pheno-ranker -r individuals.json others.yaml --append-prefixes CANCER CONTROL # inter-cohort
497
498
$ $path/pheno-ranker -r individuals.json -t patient.yaml -max-out 100 # mode patient
499
500
501
=head2 COMMON ERRORS AND SOLUTIONS
502
503
* Error message: R plotting
504
Error in scan(file = file, what = what, sep = sep, quote = quote, dec = dec, :
505
line 1 did not have X elements
506
Calls: as.matrix -> read.table -> scan
507
Execution halted
508
Solution: Make sure that the values of your primary key (e.g., "id") do not contain spaces (e.g., "my fav id" must be "my_fav_id")
509
510
* Error message: Foo
511
Solution: Bar
512
513
=head1 CITATION
514
515
The author requests that any published work that utilizes C<Pheno-Ranker> include a citation of the following reference:
516
517
Leist, I.C. et al., (2024). Pheno-Ranker: a toolkit for comparison of phenotypic data stored in GA4GH standards and beyond. I<BMC Bioinformatics>. DOI: 10.1186/s12859-024-05993-2
518
519
=head1 AUTHOR
520
521
Written by Manuel Rueda, PhD. Info about CNAG can be found at L<https://www.cnag.eu>.
522
523
=head1 COPYRIGHT AND LICENSE
524
525
This PERL file is copyrighted. See the LICENSE file included in this distribution.
526
527
=cut