File Coverage

bin/pheno-ranker

Criterion	Covered	Total	%
statement	73	101	72.2
branch	14	44	31.8
condition	4	19	21.0
subroutine	15	19	78.9
pod			n/a
total	106	183	57.9

line	stmt	bran	cond	sub	time	code
1						#!/usr/bin/env perl
2						#
3						# A script that performs semantic similarity in PXF\|BFF data structures
4						#
5						# Last Modified: Feb/01/2025
6						#
7						# $VERSION taken from Pheno::Ranker
8						#
9						# Copyright (C) 2023-2025 Manuel Rueda - CNAG (manuel.rueda@cnag.eu)
10						#
11						# License: Artistic License 2.0
12						#
13						# If this program helps you in your research, please cite.
14
15						package main;
16
17	1			1	3954	use strict;
	1				2
	1				31
18	1			1	7	use warnings;
	1				1
	1				35
19	1			1	372	use autodie;
	1				13238
	1				3
20	1			1	4368	use feature qw(say);
	1				1
	1				123
21	1			1	649	use Getopt::Long qw(:config no_ignore_case);
	1				10913
	1				5
22	1			1	583	use Pod::Usage;
	1				51769
	1				105
23	1			1	533	use Data::Dumper;
	1				6032
	1				56
24	1			1	512	use Sys::Hostname;
	1				1054
	1				54
25	1			1	415	use POSIX qw(strftime);
	1				4114
	1				3
26	1			1	1740	use Term::ANSIColor qw(:constants);
	1				7121
	1				828
27	1			1	432	use File::ShareDir::ProjectDistDir qw(dist_dir);
	1				33413
	1				7
28	1			1	817	use FindBin qw($Bin);
	1				1042
	1				127
29	1			1	466	use lib "$Bin/../lib";
	1				558
	1				5
30	1			1	544	use Pheno::Ranker qw($VERSION write_json);
	1				4
	1				1193
31
32						# Defining a few variables
33	1				162239	my $out_file_cohort = 'matrix.txt';
34	1				4	my $out_file_patient = 'rank.txt';
35	1				2	my $out_file_graph = 'graph.json';
36	1				2	my $out_file_graph_stats = 'graph_stats.txt';
37	1				2	my $export_basename = 'export';
38	1				1	my $align_basename = 'alignment';
39	1				3	my $log_file = 'pheno-ranker-log.json';
40	1				1	my $color = 1;
41	1				1	my $age = 0;
42	1				2	my $cli = 1;
43
44						# Reading arguments
45						GetOptions(
46						'reference\|r=s{1,}' => \my @reference_files, # array
47						'target\|t=s' => \my $target_file, # string
48						'weights\|w=s' => \my $weights_file, # string
49						'append-prefixes=s{1,}' => \my @append_prefixes, # array
50						'out-file\|o=s' => \my $out_file_arg, # string
51						'max-out:i' => \my $max_out, # integer
52						'max-number-vars:i' => \my $max_number_vars, # integer
53						'include-hpo-ascendants' => \my $include_hpo_ascendants, # flag
54						'export\|e:s' => \my $export, # opt-string (defined)
55						'align\|a:s' => \my $align, # opt-string (defined)
56						'cytoscape-json:s' => \my $cytoscape_json, # opt-string (defined)
57						'graph-stats:s' => \my $graph_stats, # opt-string (defined)
58						'sort-by=s' => \my $sort_by, # string
59						'similarity-metric-cohort=s' => \my $similarity_metric_cohort, # string
60						'patients-of-interest\|poi=s{1,}' => \my @patients_of_interest, # array
61						'poi-out-dir=s' => \my $poi_out_dir, # string
62						'include-terms=s{1,11}' => \my @include_terms, # array
63						'exclude-terms=s{1,11}' => \my @exclude_terms, # array
64						'retain-excluded-phenotypicFeatures' => \
65						my $retain_excluded_phenotypicFeatures, # flag
66						'precomputed-ref-prefix=s' => \my $precomputed_ref_prefix, # string
67						'max-matrix-items-in-ram=i' => \my $max_matrix_items_in_ram, # integer
68						'config=s' => \my $config_file, # string
69						'age!' => \$age, # flag
70						'help\|?' => \my $help, # flag
71						'log:s' => \my $log, # opt-string (defined)
72						'man' => \my $man, # flag
73						'debug=i' => \my $debug, # integer
74						'verbose\|' => \my $verbose, # flag
75						'color!' => \$color, # flag
76	0			0	0	'version\|V' => sub { say "$0 Version $VERSION"; exit; }
	0				0
77	1	50			13	) or pod2usage(2);
78	1	50			2148	pod2usage(1) if $help;
79	1	50			3	pod2usage( -verbose => 2, -exitval => 0 ) if $man;
80	1	50	33		3	pod2usage(
81						-message => "Please specify a reference-cohort(s) with <--r>\n",
82						-exitval => 1
83						) unless ( @reference_files \|\| $precomputed_ref_prefix );
84	1	50	33		6	pod2usage(
85						-message =>
86						"<--graph_stats> only works in conjunction with <--cytoscape-json>\n",
87						-exitval => 1
88						) if ( defined $graph_stats && !defined $cytoscape_json );
89	1	50	33		3	pod2usage(
90						-message => "Weights file <$weights_file> does not exist\n",
91						-exitval => 1
92						) if ( defined $weights_file && !-f $weights_file );
93
94						# Set the name of the output
95	1	50	33		6	my $out_file = $out_file_arg
96						// ( $target_file ? $out_file_patient : $out_file_cohort );
97
98						# Set import data
99	1				2	my ( $glob_hash_file, $ref_hash_file, $ref_binary_hash_file );
100	1	50			3	if ( defined $precomputed_ref_prefix ) {
101
102						# Check if any incompatible parameters are provided
103	0		0		0	my $has_incompatible_options =
104						@reference_files
105						\|\| @append_prefixes
106						\|\| $age
107						\|\| defined $include_hpo_ascendants
108						\|\| defined $retain_excluded_phenotypicFeatures
109						\|\| defined $weights_file;
110
111	0	0			0	if ($has_incompatible_options) {
112	0				0	pod2usage(
113						-message =>
114						"Sorry, but <--reference/age/hpo-ascendants/retain-excluded-phenotypicFeatures/weights/append-prefixes> are incompatible with --import <$precomputed_ref_prefix>\n",
115						-exitval => 1
116						);
117						}
118
119						# Generate file names based on precomputed_ref_prefix
120	0				0	my $base_glob = $precomputed_ref_prefix . '.glob_hash.json';
121	0				0	my $base_ref = $precomputed_ref_prefix . '.ref_hash.json';
122	0				0	my $base_ref_binary = $precomputed_ref_prefix . '.ref_binary_hash.json';
123
124						# Use the helper to check for .gz versions if needed.
125	0				0	$glob_hash_file = resolve_file($base_glob);
126	0				0	$ref_hash_file = resolve_file($base_ref);
127	0				0	$ref_binary_hash_file = resolve_file($base_ref_binary);
128						}
129
130						# Set cytoscape-json logic
131	1				5	handle_option( \$cytoscape_json, "<--cytoscape-json> only works in cohort-mode",
132						$target_file, $out_file_graph );
133
134						# Set graph-stats logic
135	1				2	handle_option( \$graph_stats, "<--graph-stats> only works in cohort-mode",
136						$target_file, $out_file_graph_stats );
137
138						# Turning color off if argument <--no-color>
139	1	50			2	$ENV{'ANSI_COLORS_DISABLED'} = 1 unless $color;
140
141						# Start printing to STDOUT
142	1	50			2	say BOLD CYAN program_header($VERSION), RESET if $verbose;
143
144						######################
145						# START PHENO-RANKER #
146						######################
147
148						# Load data as hashref
149	1				44	my $data = {
150						reference_files => \@reference_files,
151						target_file => $target_file,
152						weights_file => $weights_file,
153						include_hpo_ascendants => $include_hpo_ascendants,
154						hpo_file => undef,
155						align => $align,
156						align_basename => $align_basename,
157						export => $export,
158						export_basename => $export_basename,
159						out_file => $out_file,
160						cytoscape_json => $cytoscape_json,
161						graph_stats => $graph_stats,
162						max_out => $max_out,
163						max_number_vars => $max_number_vars,
164						sort_by => $sort_by,
165						similarity_metric_cohort => $similarity_metric_cohort,
166						patients_of_interest => \@patients_of_interest,
167						poi_out_dir => $poi_out_dir,
168						include_terms => \@include_terms,
169						exclude_terms => \@exclude_terms,
170						retain_excluded_phenotypicFeatures => $retain_excluded_phenotypicFeatures,
171						precomputed_ref_prefix => $precomputed_ref_prefix,
172						max_matrix_items_in_ram => $max_matrix_items_in_ram,
173						glob_hash_file => $glob_hash_file,
174						ref_hash_file => $ref_hash_file,
175						ref_binary_hash_file => $ref_binary_hash_file,
176						config_file => $config_file,
177						age => $age, # Solution, use ageRange in PXF/BFF, measures' values more difficult
178						cli => $cli,
179						append_prefixes => \@append_prefixes,
180						log => $log,
181						debug => $debug,
182						verbose => $verbose
183						};
184
185						# Create object
186	1				14	my $ranker = Pheno::Ranker->new($data);
187
188						# Run method
189	1				4	$ranker->run();
190
191						# Create log if <--log>
192	1	0			0	write_log( $log ? $log : $log_file, $data, $VERSION )
		50
193						if defined $log;
194
195						####################
196						# END PHENO-RANKER #
197						####################
198
199						sub handle_option {
200
201	2			2	5	my ( $option_ref, $message, $target_file, $default ) = @_;
202	2	50			3	if ( defined $$option_ref ) {
203	2	50			3	pod2usage( -message => $message, -exitval => 1 ) if $target_file;
204	2	50			4	$$option_ref = $$option_ref ? $$option_ref : $default;
205						}
206						}
207
208						sub resolve_file {
209
210	0			0		my $base = shift;
211
212						# If the base file exists, use it.
213	0	0				return $base if -e $base;
214
215						# If a gzipped version exists, use that.
216	0					my $gz = $base . '.gz';
217	0	0				return $gz if -e $gz;
218
219						# If neither exists, just return the base file (you might want to warn or error here)
220	0					return $base;
221						}
222
223						sub write_log {
224
225	0			0		my ( $log, $data, $VERSION ) = @_;
226
227						# NB: Darwin does not have nproc to show #logical-cores, using sysctl instead
228	0					my $os = $^O;
229						chomp(
230	0	0	0			my $ncpuhost =
		0
231						lc($os) eq 'darwin' ? qx{/usr/sbin/sysctl -n hw.logicalcpu}
232						: $os eq 'MSWin32' ? qx{wmic cpu get NumberOfLogicalProcessors}
233						: qx{/usr/bin/nproc} // 1
234						);
235
236						# For the Windows command, the result will also contain the string
237						# "NumberOfLogicalProcessors" which is the header of the output.
238						# So we need to extract the actual number from it:
239	0	0				if ( $os eq 'MSWin32' ) {
240	0					($ncpuhost) = $ncpuhost =~ /(\d+)/;
241						}
242	0					$ncpuhost = 0 + $ncpuhost; # coercing it to be a number
243
244						my $info = {
245						date => ( strftime "%a %b %e %H:%M:%S %Y", localtime ),
246						ncpuhost => $ncpuhost,
247						hostname => hostname,
248						id => time . substr( "00000$$", -5 ), # string
249						version => $VERSION,
250						user => $ENV{'LOGNAME'}
251						\|\| $ENV{'USER'}
252	0		0			\|\| $ENV{'USERNAME'}
253						\|\| 'dummy-user'
254						};
255
256						# Saving file
257	0	0				say BOLD GREEN "Writing <$log> file\n" if $data->{verbose};
258	0					write_json(
259						{
260						filepath => $log,
261						data => { info => $info, data => $data }
262						}
263						);
264						}
265
266						sub program_header {
267
268	0			0		my $VERSION = shift;
269	0					my $str = <<EOF;
270						****************************************
271						* Rank against cohort(s) (BFF/PXF) *
272						* - PHENO-RANKER - *
273						* Version: $VERSION *
274						* (C) 2023-2025 Manuel Rueda, PhD *
275						* The Artistic License 2.0 *
276						****************************************
277						EOF
278	0					return $str;
279						}
280
281						=head1 NAME
282
283						pheno-ranker: A script that performs semantic similarity in PXF/BFF data structures and beyond (JSON\|YAML)
284
285						=head1 SYNOPSIS
286
287						pheno-ranker -r <individuals.json> -t <patient.json> [-options]
288
289						Arguments:
290						* Cohort mode:
291						-r, --reference JSON/YAML BFF/PXF file(s) (array/object), supports .gz
292
293						* Patient mode:
294						-t, --target JSON/YAML BFF/PXF file (object or single-object array), supports .gz
295
296						Options:
297						-age Include age-related variables; excludes agent-like terms (BFF/PXF-only) [>no-age\|age]
298						-a, --align [path/basename] Write alignment file(s). If not specified, default filenames are used [default: alignment.*]
299						-append-prefixes Prefixes for primary_key when #cohorts >= 2 [default: C]
300						-config YAML config file to modify default parameters [default: share/conf/config.yaml]
301						-cytoscape-json [file] Serializes the pairwise comparison matrix as an undirected graph in JSON, compatible with Cytoscape [default: graph.json]
302						-e, --export [path/basename] Export miscellaneous JSON files. If not specified, default filenames are used [default: export.*]
303						-exclude-terms Exclude BFF/PXF terms (e.g., --exclude-terms sex, id) or column names in JSON-derived from CSV
304						-graph-stats [file] Generates a text file with key graph metrics, for use with <-cytoscape-json> [default: graph_stats.txt]
305						-precomputed-ref-prefix [path/basename] Use precomputed data for the reference cohort(s). No need to use --r
306						-include-hpo-ascendants Include ascendant terms from the Human Phenotype Ontology (HPO)
307						-include-terms Include BFF/PXF terms (e.g., --include-terms diseases) or column names in JSON-derived from CSV
308						-max-matrix-items-in-ram In cohort mode, set max items before switching to RAM-efficient mode (default: 5000)
309						-max-number-vars Maximum number of variables for binary string [default: 10000]
310						-max-out Print only N comparisons [default: 50]
311						-o, --out-file Output file path [default: -r matrix.txt \| -t rank.txt]
312						-poi, --patients-of-interest Export JSON files for the selected individual IDs during a dry-run
313						-poi-out-dir Directory for JSON files (used with --poi)
314						-retain-excluded-phenotypicFeatures Retains features set to "excluded": true by appending '_excluded' to their IDs
315						-similarity-metric-cohort Similarity metric for cohort mode [>hamming\|jaccard]
316						-sort-by Sort by Hamming distance or Jaccard index [>hamming\|jaccard]
317						-w, --weights YAML file with weights
318
319						Generic Options:
320						-debug Print debugging (from 1 to 5, being 5 max)
321						-h, --help Brief help message
322						-log Save log file [default: pheno-ranker-log.json]
323						-man Full documentation
324						-no-color Toggle color output [>color\|no-color]
325						-v, --verbose Verbosity on
326						-V, --version Print version
327
328						=head1 SUMMARY
329
330						Pheno-Ranker is a lightweight, easy-to-install tool for performing semantic similarity analysis on phenotypic data in JSON/YAML formats, including Beacon v2 Models and Phenopackets v2. It also supports pre-processed CSV files prepared using the included C<csv2pheno-ranker> utility.
331
332						=head1 INSTALLATION
333
334						If you plan to only use the C<pheno-ranker> CLI, we recommend installing it via CPAN. See details below.
335
336						=head2 Non containerized
337
338						The script runs on command-line Linux and it has been tested on Debian/RedHat/macOS based distributions (only showing commands for Debian). Perl 5 is installed by default on Linux,
339						but we will install a few CPAN modules with C<cpanminus>.
340
341						=head3 Method 1: From CPAN
342
343						First install system level dependencies:
344
345						sudo apt-get install cpanminus libperl-dev
346
347						Now you have to choose between one of the 2 options below:
348
349						B<Option 1:> System-level installation (uses C<sudo>):
350
351						cpanm --notest --sudo Pheno::Ranker
352						pheno-ranker -h
353
354						B<Option 2:> Installation in a private library at C<~/perl5> (no C<sudo> needed):
355
356						cpanm --local-lib=~/perl5 local::lib && eval $(perl -I ~/perl5/lib/perl5/ -Mlocal::lib)
357						cpanm --notest Pheno::Ranker
358						pheno-ranker --help
359
360						To ensure Perl recognizes your local modules every time you start a new terminal, you should type:
361
362						echo 'eval $(perl -I ~/perl5/lib/perl5/ -Mlocal::lib)' >> ~/.bashrc
363
364						=head3 Method 2: From CPAN in a CONDA environment
365
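366						Please follow L<the installation instructions in the project repository|https://github.com/cnag-biomedical-informatics/pheno-ranker>.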
367
368						=head3 Method 3: From GitHub
369
370						git clone https://github.com/cnag-biomedical-informatics/pheno-ranker.git
371						cd pheno-ranker
372
373						Install system level dependencies:
374
375						sudo apt-get install cpanminus libperl-dev
376
377						Now you have to choose between one of the 2 options below:
378
379						B<Option 1:> Install the dependencies system-wide (uses C<sudo>):
380
381						cpanm --notest --sudo --installdeps .
382						bin/pheno-ranker --help
383
384						B<Option 2:> Install the dependencies in a private library at C<~/perl5> (no C<sudo> needed):
385
386						cpanm --local-lib=~/perl5 local::lib && eval $(perl -I ~/perl5/lib/perl5/ -Mlocal::lib)
387						cpanm --notest --installdeps .
388						bin/pheno-ranker --help
389
390						To ensure Perl recognizes your local modules every time you start a new terminal, you should type:
391
392						echo 'eval $(perl -I ~/perl5/lib/perl5/ -Mlocal::lib)' >> ~/.bashrc
393
394						I<Optional:> If you want to use C<utils/barcode> or C<utils/bff_pxf_plot>:
395
396						sudo apt-get install python3-pip libzbar0
397						pip3 install -r requirements.txt
398
399						=head2 Containerized
400
401						=head3 Method 4: From Docker Hub
402
403						Download the latest version of the Docker image (supports both amd64 and arm64 architectures) from L<Docker Hub|https://hub.docker.com/r/manuelrueda/pheno-ranker> by executing:
404
405						docker pull manuelrueda/pheno-ranker:latest
406						docker image tag manuelrueda/pheno-ranker:latest cnag/pheno-ranker:latest
407
408						See additional instructions below.
409
410						=head3 Method 5: With Dockerfile
411
412						Please download the C<Dockerfile> from the repo:
413
414						wget https://raw.githubusercontent.com/cnag-biomedical-informatics/pheno-ranker/main/Dockerfile
415
416						And then run:
417
418						# Docker Version 19.03 and Above (Supports buildx)
419						docker buildx build -t cnag/pheno-ranker:latest .
420
421						# Docker Version Older than 19.03 (Does Not Support buildx)
422						docker build -t cnag/pheno-ranker:latest .
423
424						=head3 Additional instructions for Methods 4 and 5
425
426						To run the container (detached) execute:
427
428						docker run -tid -e USERNAME=root --name pheno-ranker cnag/pheno-ranker:latest
429
430						To enter:
431
432						docker exec -ti pheno-ranker bash
433
434						The command-line executable can be found at:
435
436						/usr/share/pheno-ranker/bin/pheno-ranker
437
438						The default container user is C<root> but you can also run the container as C<$UID=1000> (C<dockeruser>).
439
440						docker run --user 1000 -tid --name pheno-ranker cnag/pheno-ranker:latest
441
442						=head3 Mounting volumes
443
444						Docker containers are fully isolated. If you need to mount a volume to the container please use the following syntax (C<-v host:container>).
445						Find an example below (note that you need to change the paths to match yours):
446
447						docker run -tid --volume /media/mrueda/4TBT/data:/data --name pheno-ranker-mount cnag/pheno-ranker:latest
448
449						Then I will do something like this:
450
451						# First I create an alias to simplify invocation (from the host)
452						alias pheno-ranker='docker exec -ti pheno-ranker-mount /usr/share/pheno-ranker/bin/pheno-ranker'
453
454						# Now I use the alias to run the command (note that I use the flag --o to specify the filepath)
455						pheno-ranker -r /data/individuals.json -o /data/matrix.txt
456
457						=head3 System requirements
458
459						* Ideally a Debian-based distribution (Ubuntu or Mint), but any other (e.g., CentOS, OpenSUSE) should do as well.
460						(It should also work on macOS and Windows Server, but we are only providing information for Linux here)
461						* Perl 5 (>= 5.26 core; installed by default in most Linux distributions). Check the version with "perl -v".
462						* >= 4GB of RAM
463						* 1 core
464						* At least 16GB HDD
465
466						=head1 HOW TO RUN PHENO-RANKER
467
468						For executing pheno-ranker you will need a PXF/BFF file(s) in JSON\|YAML format. The reference cohort must be a JSON array, where each individual's data is consolidated in one object.
469
470						You can download examples from L<the project repository|https://github.com/cnag-biomedical-informatics/pheno-ranker>.
471
472						There are two modes of operation:
473
474						=over 4
475
476						=item Cohort mode:
477
478						B<Intra-cohort:> With C<--r> argument and 1 cohort.
479
480						B<Inter-cohort:> With C<--r> and multiple cohort files. It can be used in combination with C<--append-prefixes> to add prefixes to each individual id.
481
482						=item Patient Mode:
483
484						With C<-r> reference cohort(s) and C<--t> patient data.
485
486						=back
487
488						B<Examples:>
489
490						$ ./pheno-ranker -r phenopackets.json # intra-cohort
491
492						$ ./pheno-ranker -r phenopackets.yaml -o my_matrix.txt # intra-cohort
493
494						$ ./pheno-ranker -r phenopackets.json -w weights.yaml --exclude-terms sex ethnicity exposures # intra-cohort with weights
495
496						$ $path/pheno-ranker -r individuals.json others.yaml --append-prefixes CANCER CONTROL # inter-cohort
497
498						$ $path/pheno-ranker -r individuals.json -t patient.yaml -max-out 100 # mode patient
499
500
501						=head2 COMMON ERRORS AND SOLUTIONS
502
503						* Error message: R plotting
504						Error in scan(file = file, what = what, sep = sep, quote = quote, dec = dec, :
505						line 1 did not have X elements
506						Calls: as.matrix -> read.table -> scan
507						Execution halted
508						Solution: Make sure that the values of your primary key (e.g., "id") do not contain spaces (e.g., "my fav id" must be "my_fav_id")
509
510						* Error message: Foo
511						Solution: Bar
512
513						=head1 CITATION
514
515						The author requests that any published work that utilizes C<Pheno-Ranker> includes a cite to the following reference:
516
517						Leist, I.C. et al., (2024). Pheno-Ranker: a toolkit for comparison of phenotypic data stored in GA4GH standards and beyond. I<BMC Bioinformatics>. DOI: 10.1186/s12859-024-05993-2
518
519						=head1 AUTHOR
520
521						Written by Manuel Rueda, PhD. Info about CNAG can be found at L<https://www.cnag.eu>.
522
523						=head1 COPYRIGHT AND LICENSE
524
525						This PERL file is copyrighted. See the LICENSE file included in this distribution.
526
527						=cut