File Coverage

bin/pheno-ranker

Criterion	Covered	Total	%
statement	73	105	69.5
branch	14	46	30.4
condition	4	19	21.0
subroutine	15	19	78.9
pod			n/a
total	106	189	56.0

line	stmt	bran	cond	sub	time	code
1						#!/usr/bin/env perl
2						#
3						# A script that performs semantic similarity in PXF\|BFF data structures
4						#
5						# Last Modified: Feb/01/2025
6						#
7						# $VERSION taken from Pheno::Ranker
8						#
9						# Copyright (C) 2023-2025 Manuel Rueda - CNAG (manuel.rueda@cnag.eu)
10						#
11						# License: Artistic License 2.0
12						#
13						# If this program helps you in your research, please cite.
14
15						package main;
16
17	1			1	5616	use strict;
	1				1
	1				42
18	1			1	11	use warnings;
	1				1
	1				51
19	1			1	585	use autodie;
	1				20787
	1				4
20	1			1	8186	use feature qw(say);
	1				5
	1				202
21	1			1	863	use Getopt::Long qw(:config no_ignore_case);
	1				17819
	1				7
22	1			1	862	use Pod::Usage;
	1				84143
	1				147
23	1			1	781	use Data::Dumper;
	1				9529
	1				86
24	1			1	704	use Sys::Hostname;
	1				1513
	1				66
25	1			1	603	use POSIX qw(strftime);
	1				6783
	1				7
26	1			1	2789	use Term::ANSIColor qw(:constants);
	1				11964
	1				1457
27	1			1	672	use File::ShareDir::ProjectDistDir qw(dist_dir);
	1				53476
	1				9
28	1			1	1174	use FindBin qw($Bin);
	1				1550
	1				164
29	1			1	669	use lib "$Bin/../lib";
	1				874
	1				10
30	1			1	836	use Pheno::Ranker qw($VERSION write_json);
	1				7
	1				2111
31
# Defining a few variables: defaults that the command line may override
my $out_file_cohort      = 'matrix.txt';
my $out_file_patient     = 'rank.txt';
my $out_file_graph       = 'graph.json';
my $out_file_graph_stats = 'graph_stats.txt';
my $export_basename      = 'export';
my $align_basename       = 'alignment';
my $log_file             = 'pheno-ranker-log.json';
my $color                = 1;
my $age                  = 0;
my $cli                  = 1;

# Reading arguments
# NB: specs written with ':' (e.g. 'export|e:s') take an OPTIONAL value, so
# the variable is defined (possibly empty) whenever the option is present.
GetOptions(
    'reference|r=s{1,}'                  => \my @reference_files,                    # array
    'target|t=s'                         => \my $target_file,                        # string
    'weights|w=s'                        => \my $weights_file,                       # string
    'append-prefixes=s{1,}'              => \my @append_prefixes,                    # array
    'out-file|o=s'                       => \my $out_file_arg,                       # string
    'max-out:i'                          => \my $max_out,                            # integer
    'max-number-vars:i'                  => \my $max_number_vars,                    # integer
    'include-hpo-ascendants'             => \my $include_hpo_ascendants,             # flag
    'export|e:s'                         => \my $export,                             # opt-string (defined)
    'align|a:s'                          => \my $align,                              # opt-string (defined)
    'cytoscape-json:s'                   => \my $cytoscape_json,                     # opt-string (defined)
    'graph-stats:s'                      => \my $graph_stats,                        # opt-string (defined)
    'sort-by=s'                          => \my $sort_by,                            # string
    'similarity-metric-cohort=s'         => \my $similarity_metric_cohort,           # string
    'patients-of-interest|poi=s{1,}'     => \my @patients_of_interest,               # array
    'poi-out-dir=s'                      => \my $poi_out_dir,                        # string
    'include-terms=s{1,11}'              => \my @include_terms,                      # array
    'exclude-terms=s{1,11}'              => \my @exclude_terms,                      # array
    'retain-excluded-phenotypicFeatures' => \my $retain_excluded_phenotypicFeatures, # flag
    'prp|precomputed-ref-prefix=s'       => \my $precomputed_ref_prefix,             # string
    'max-matrix-records-in-ram=i'        => \my $max_matrix_records_in_ram,          # integer
    'config=s'                           => \my $config_file,                        # string
    'age!'                               => \$age,                                   # flag
    'help|?'                             => \my $help,                               # flag
    'log:s'                              => \my $log,                                # opt-string (defined)
    'man'                                => \my $man,                                # flag
    'debug=i'                            => \my $debug,                              # integer
    'verbose|'                           => \my $verbose,                            # flag
    'color!'                             => \$color,                                 # flag
    'version|V' => sub { say "$0 Version $VERSION"; exit; }
) or pod2usage(2);

# Help screens take precedence over any validation
pod2usage(1) if $help;
pod2usage( -verbose => 2, -exitval => 0 ) if $man;

# A reference is mandatory, either as file(s) or as precomputed data
pod2usage(
    -message => "Please specify a reference-cohort(s) with <--r>\n",
    -exitval => 1
) unless ( @reference_files || $precomputed_ref_prefix );

# The message now spells the option exactly as it is declared above
pod2usage(
    -message =>
      "<--graph-stats> only works in conjunction with <--cytoscape-json>\n",
    -exitval => 1
) if ( defined $graph_stats && !defined $cytoscape_json );

pod2usage(
    -message => "Weights file <$weights_file> does not exist\n",
    -exitval => 1
) if ( defined $weights_file && !-f $weights_file );
93
# Set the name of the output: an explicit --out-file wins; otherwise the
# default depends on the mode (patient mode when a target file was given).
my $out_file = $out_file_arg
  // ( $target_file ? $out_file_patient : $out_file_cohort );

# Set import data (the four variables stay undef unless --prp is used)
my (
    $glob_hash_file,       $ref_hash_file,
    $ref_binary_hash_file, $coverage_stats_file
);
if ( defined $precomputed_ref_prefix ) {

    # Check if any incompatible parameters are provided
    my $has_incompatible_options =
         @reference_files
      || @append_prefixes
      || $age
      || defined $include_hpo_ascendants
      || defined $retain_excluded_phenotypicFeatures
      || defined $weights_file;

    # Names as declared in GetOptions, so users can recognise the options
    my @incompatible_flags = (
        '--reference',              '--age',
        '--include-hpo-ascendants', '--retain-excluded-phenotypicFeatures',
        '--weights',                '--append-prefixes'
    );

    if ($has_incompatible_options) {
        my $flags_str = join( "\n", @incompatible_flags );
        pod2usage(
            -message =>
"Sorry, but the options\n$flags_str\nare incompatible with --prp <$precomputed_ref_prefix>\n",
            -exitval => 1,
        );
    }

    # Generate file names based on precomputed_ref_prefix
    my $base_glob           = $precomputed_ref_prefix . '.glob_hash.json';
    my $base_ref            = $precomputed_ref_prefix . '.ref_hash.json';
    my $base_ref_binary     = $precomputed_ref_prefix . '.ref_binary_hash.json';
    my $base_coverage_stats = $precomputed_ref_prefix . '.coverage_stats.json';

    # Use the helper to check for .gz versions if needed.
    $glob_hash_file       = resolve_file($base_glob);
    $ref_hash_file        = resolve_file($base_ref);
    $ref_binary_hash_file = resolve_file($base_ref_binary);
    $coverage_stats_file  = resolve_file($base_coverage_stats);
}
141
# Options that only make sense in cohort mode: each one is checked against
# the target file and, when given without a value, receives its default name.
for my $cohort_only (
    [
        \$cytoscape_json, "<--cytoscape-json> only works in cohort-mode",
        $out_file_graph
    ],
    [
        \$graph_stats, "<--graph-stats> only works in cohort-mode",
        $out_file_graph_stats
    ],
  )
{
    my ( $opt_ref, $complaint, $fallback_name ) = @{$cohort_only};
    handle_option( $opt_ref, $complaint, $target_file, $fallback_name );
}

# <--no-color> switches the ANSI escape sequences off
if ( !$color ) {
    $ENV{'ANSI_COLORS_DISABLED'} = 1;
}

# The banner is only shown in verbose mode
if ($verbose) {
    say BOLD CYAN program_header($VERSION), RESET;
}

######################
# START PHENO-RANKER #
######################

# Everything the ranker needs, gathered in a single hashref
my $data = {

    # Input
    reference_files        => \@reference_files,
    target_file            => $target_file,
    weights_file           => $weights_file,
    config_file            => $config_file,
    hpo_file               => undef,
    precomputed_ref_prefix => $precomputed_ref_prefix,
    glob_hash_file         => $glob_hash_file,
    ref_hash_file          => $ref_hash_file,
    ref_binary_hash_file   => $ref_binary_hash_file,
    coverage_stats_file    => $coverage_stats_file,

    # Output
    out_file        => $out_file,
    align           => $align,
    align_basename  => $align_basename,
    export          => $export,
    export_basename => $export_basename,
    cytoscape_json  => $cytoscape_json,
    graph_stats     => $graph_stats,
    poi_out_dir     => $poi_out_dir,

    # Behaviour
    include_hpo_ascendants             => $include_hpo_ascendants,
    retain_excluded_phenotypicFeatures => $retain_excluded_phenotypicFeatures,
    include_terms                      => \@include_terms,
    exclude_terms                      => \@exclude_terms,
    patients_of_interest               => \@patients_of_interest,
    append_prefixes                    => \@append_prefixes,
    similarity_metric_cohort           => $similarity_metric_cohort,
    sort_by                            => $sort_by,
    max_out                            => $max_out,
    max_number_vars                    => $max_number_vars,
    max_matrix_records_in_ram          => $max_matrix_records_in_ram,
    age => $age, # Solution, use ageRange in PXF/BFF, measures' values more difficult

    # Generic
    cli     => $cli,
    log     => $log,
    debug   => $debug,
    verbose => $verbose
};

# Build the object and launch the analysis
my $ranker = Pheno::Ranker->new($data);
$ranker->run();

# <--log> without a value falls back to the default log file name
if ( defined $log ) {
    my $log_path = $log ? $log : $log_file;
    write_log( $log_path, $data, $VERSION );
}
207
208						####################
209						# END PHENO-RANKER #
210						####################
211
# handle_option( $option_ref, $message, $target_file, $default )
#
# Post-processes an option that was declared with an optional value.
#   $option_ref  - reference to the scalar filled in by GetOptions
#   $message     - usage message printed when the option is not allowed
#   $target_file - when true, the option is rejected via pod2usage (exit 1)
#   $default     - value stored when the option was given without a value
#
# Nothing happens when the option is absent (the scalar is undef).
sub handle_option {
    my ( $option_ref, $message, $target_file, $default ) = @_;

    # Option not present on the command line
    return unless defined $$option_ref;

    pod2usage( -message => $message, -exitval => 1 ) if $target_file;

    # Only an empty value means "use the default"; testing plain truthiness
    # would also discard a legitimate value such as the file name "0".
    $$option_ref = $default unless length $$option_ref;
    return;
}
219
# resolve_file($base)
#
# Returns the first existing path among $base and "$base.gz" (in that order).
# When neither exists, $base is returned unchanged and no warning is issued.
sub resolve_file {
    my ($path) = @_;

    for my $candidate ( $path, $path . '.gz' ) {
        return $candidate if -e $candidate;
    }

    # Nothing on disk: hand back the plain name untouched
    return $path;
}
233
# write_log( $log, $data, $VERSION )
#
# Writes a JSON log through write_json().
#   $log     - path of the file to write
#   $data    - hashref with the run parameters; stored under the key 'data'
#              and consulted for the 'verbose' flag
#   $VERSION - version string stored in the 'info' section
#
# The 'info' section holds date, number of logical cores, hostname, an id
# built from the epoch time plus the (zero-padded) PID, version and user.
sub write_log {
    my ( $log, $data, $VERSION ) = @_;

    # NB: Darwin does not have nproc to show #logical-cores, using sysctl instead
    my $os = $^O;
    my $cpu_query_output =
        lc($os) eq 'darwin'  ? qx{/usr/sbin/sysctl -n hw.logicalcpu}
      : lc($os) eq 'freebsd' ? qx{sysctl -n hw.ncpu}
      : $os eq 'MSWin32'     ? qx{wmic cpu get NumberOfLogicalProcessors}
      :                        qx{/usr/bin/nproc};

    # Take the first run of digits on every platform. This skips the
    # "NumberOfLogicalProcessors" header printed on Windows and any trailing
    # newline. When the command is missing or prints nothing usable, fall
    # back to 1 instead of coercing an empty/undef value (warning, 0 cores).
    my ($threadshost) = ( $cpu_query_output // '' ) =~ /(\d+)/;
    $threadshost = defined $threadshost ? 0 + $threadshost : 1;    # number

    my $info = {
        date        => strftime( "%a %b %e %H:%M:%S %Y", localtime ),
        threadshost => $threadshost,
        hostname    => hostname(),
        id          => time() . substr( "00000$$", -5 ),            # string
        version     => $VERSION,
        user        => $ENV{'LOGNAME'}
          || $ENV{'USER'}
          || $ENV{'USERNAME'}
          || 'dummy-user'
    };

    # Saving file (RESET so the colour does not leak into later output)
    say BOLD GREEN "Writing <$log> file\n", RESET if $data->{verbose};
    write_json(
        {
            filepath => $log,
            data     => { info => $info, data => $data }
        }
    );
    return;
}
276
# program_header($VERSION)
#
# Returns the start-up banner as one newline-terminated string: a frame of
# asterisks, 40 columns wide, with each caption centred inside it. The
# padding is computed, so the frame stays aligned for any version length.
sub program_header {
    my $VERSION = shift;

    my $inner_width = 38;    # columns between the two side asterisks
    my @captions    = (
        'Rank against cohort(s) (BFF/PXF)',
        '- PHENO-RANKER -',
        "Version: $VERSION",
        '(C) 2023-2025 Manuel Rueda, PhD',
        'The Artistic License 2.0',
    );

    my $border = '*' x ( $inner_width + 2 );
    my @rows   = ($border);
    for my $caption (@captions) {

        # Over-long captions are not truncated, they simply get no padding
        my $slack = $inner_width - length $caption;
        $slack = 0 if $slack < 0;
        my $left = int( $slack / 2 );
        push @rows,
          '*' . ( ' ' x $left ) . $caption . ( ' ' x ( $slack - $left ) ) . '*';
    }
    push @rows, $border;

    return join( "\n", @rows ) . "\n";
}
290
291						=head1 NAME
292
293						pheno-ranker: A script that performs semantic similarity in PXF/BFF data structures and beyond (JSON|YAML)
294
295						=head1 SYNOPSIS
296
297						pheno-ranker -r <individuals.json> -t <patient.json> [-options]
298
299						Arguments:
300						* Cohort mode:
301						-r, --reference JSON/YAML BFF/PXF file(s) (array/object), supports .gz
302
303						* Patient mode:
304						-t, --target JSON/YAML BFF/PXF file (object or single-object array), supports .gz
305
306						Options:
307						-age Include age-related variables; excludes agent-like terms (BFF/PXF-only) [>no-age|age]
308						-a, --align [path/basename] Write alignment file(s). If not specified, default filenames are used [default: alignment.*]
309						-append-prefixes Prefixes for primary_key when #cohorts >= 2 [default: C]
310						-config YAML config file to modify default parameters [default: share/conf/config.yaml]
311						-cytoscape-json [file] Serializes the pairwise comparison matrix as an undirected graph in JSON, compatible with Cytoscape [default: graph.json]
312						-e, --export [path/basename] Export miscellaneous JSON files. If not specified, default filenames are used [default: export.*]
313						-exclude-terms Exclude BFF/PXF terms (e.g., --exclude-terms sex, id) or column names in JSON-derived from CSV
314						-graph-stats [file] Generates a text file with key graph metrics, for use with <-cytoscape-json> [default: graph_stats.txt]
315						-include-hpo-ascendants Include ascendant terms from the Human Phenotype Ontology (HPO)
316						-include-terms Include BFF/PXF terms (e.g., --include-terms diseases) or column names in JSON-derived from CSV
317						-max-matrix-records-in-ram In cohort mode, set max records before switching to RAM-efficient mode [default: 5000]
318						-max-number-vars Maximum number of variables for binary string [default: 10000]
319						-max-out Print only N comparisons [default: 50]
320						-o, --out-file Output file path [default: -r matrix.txt | -t rank.txt]
321						-poi, --patients-of-interest Export JSON files for the selected individual IDs during a dry-run
322						-poi-out-dir Directory for JSON files (used with --poi)
323						-prp, --precomputed-ref-prefix [path/basename] Use precomputed data for the reference cohort(s). No need to use --r
324						-retain-excluded-phenotypicFeatures Retains features set to "excluded": true by appending '_excluded' to their IDs
325						-similarity-metric-cohort Similarity metric for cohort mode [>hamming|jaccard]
326						-sort-by Sort by Hamming distance or Jaccard index [>hamming|jaccard]
327						-w, --weights YAML file with weights
328
329						Generic Options:
330						-debug Print debugging (from 1 to 5, being 5 max)
331						-h, --help Brief help message
332						-log Save log file [default: pheno-ranker-log.json]
333						-man Full documentation
334						-no-color Toggle color output [>color|no-color]
335						-v, --verbose Verbosity on
336						-V, --version Print version
337
338						=head1 SUMMARY
339
340						Pheno-Ranker is a lightweight, easy-to-install tool for performing semantic similarity analysis on phenotypic data in JSON/YAML formats, including Beacon v2 Models and Phenopackets v2. It also supports pre-processed CSV files prepared using the included C utility.
341
342						=head1 INSTALLATION
343
344						If you plan to use only the C<pheno-ranker> CLI, we recommend installing it via CPAN. See details below.
345
346						=head2 Non containerized
347
348						The script runs on command-line Linux and it has been tested on Debian/RedHat/macOS based distributions (only showing commands for Debian). Perl 5 is installed by default on Linux,
349						but we will install a few CPAN modules with C<cpanminus>.
350
351						=head3 Method 1: From CPAN
352
353						First install system level dependencies:
354
355						sudo apt-get install cpanminus libperl-dev gcc make
356
357						Now you have to choose between one of the 2 options below:
358
359						B<Option 1:> Install Pheno-Ranker (and its dependencies) at system level:
360
361						cpanm --notest --sudo Pheno::Ranker
362						pheno-ranker -h
363
364						B<Option 2:> Install Pheno-Ranker (and its dependencies) locally at C<~/perl5>:
365
366						cpanm --local-lib=~/perl5 local::lib && eval $(perl -I ~/perl5/lib/perl5/ -Mlocal::lib)
367						cpanm --notest Pheno::Ranker
368						pheno-ranker --help
369
370						To ensure Perl recognizes your local modules every time you start a new terminal, you should type:
371
372						echo 'eval $(perl -I ~/perl5/lib/perl5/ -Mlocal::lib)' >> ~/.bashrc
373
374						To B<update> to the newest version (showing commands for Option 2):
375
376						cpanm Pheno::Ranker
377
378						=head3 Method 2: From CPAN in a CONDA environment
379
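380						Please follow L<these instructions|https://cnag-biomedical-informatics.github.io/pheno-ranker/download-and-installation/>.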
381
382						=head3 Method 3: From GitHub
383
384						To clone the repository for the first time:
385
386						git clone https://github.com/cnag-biomedical-informatics/pheno-ranker.git
387						cd pheno-ranker
388
389						To update an existing clone, navigate to the repository folder and run:
390
391						git pull
392
393						Install system level dependencies:
394
395						sudo apt-get install cpanminus libperl-dev
396
397						Now you have to choose between one of the 2 options below:
398
399						B<Option 1:> Install the dependencies at system level:
400
401						cpanm --notest --sudo --installdeps .
402						bin/pheno-ranker --help
403
404						B<Option 2:> Install the dependencies locally at C<~/perl5>:
405
406						cpanm --local-lib=~/perl5 local::lib && eval $(perl -I ~/perl5/lib/perl5/ -Mlocal::lib)
407						cpanm --notest --installdeps .
408						bin/pheno-ranker --help
409
410						To ensure Perl recognizes your local modules every time you start a new terminal, you should type:
411
412						echo 'eval $(perl -I ~/perl5/lib/perl5/ -Mlocal::lib)' >> ~/.bashrc
413
414						I<Optional:> If you want to use C<utils/barcode> or C<utils/bff_pxf_plot>:
415
416						sudo apt-get install python3-pip libzbar0
417						pip3 install -r requirements.txt
418
419						=head2 Containerized
420
421						=head3 Method 4: From Docker Hub
422
423						(Estimated Time: Approximately 10 seconds)
424
425						Download the latest version of the Docker image (supports both amd64 and arm64 architectures) from L<Docker Hub|https://hub.docker.com/r/manuelrueda/pheno-ranker/tags> by executing:
426
427						docker pull manuelrueda/pheno-ranker:latest
428						docker image tag manuelrueda/pheno-ranker:latest cnag/pheno-ranker:latest
429
430						See additional instructions below.
431
432						=head3 Method 5: With Dockerfile
433
434						(Estimated Time: Approximately 1 minute)
435
436						Please download the C<Dockerfile> from the repo:
437
438						wget https://raw.githubusercontent.com/cnag-biomedical-informatics/pheno-ranker/main/Dockerfile
439
440						And then run:
441
442						# Docker Version 19.03 and Above (Supports buildx)
443						docker buildx build -t cnag/pheno-ranker:latest .
444
445						# Docker Version Older than 19.03 (Does Not Support buildx)
446						docker build -t cnag/pheno-ranker:latest .
447
448						=head3 Additional instructions for Methods 4 and 5
449
450						To run the container (detached) execute:
451
452						docker run -tid -e USERNAME=root --name pheno-ranker cnag/pheno-ranker:latest
453
454						To enter:
455
456						docker exec -ti pheno-ranker bash
457
458						The command-line executable can be found at:
459
460						/usr/share/pheno-ranker/bin/pheno-ranker
461
462						The default container user is C<root> but you can also run the container as C<$UID=1000> (C<dockeruser>).
463
464						docker run --user 1000 -tid --name pheno-ranker cnag/pheno-ranker:latest
465
466						=head3 Mounting volumes
467
468						Docker containers are fully isolated. If you need to mount a volume in the container, please use the following syntax (C<-v host:container>).
469						Find an example below (note that you need to change the paths to match yours):
470
471						docker run -tid --volume /media/mrueda/4TBT/data:/data --name pheno-ranker-mount cnag/pheno-ranker:latest
472
473						Then I will do something like this:
474
475						# First I create an alias to simplify invocation (from the host)
476						alias pheno-ranker='docker exec -ti pheno-ranker-mount /usr/share/pheno-ranker/bin/pheno-ranker'
477
478						# Now I use the alias to run the command (note that I use the flag --o to specify the filepath)
479						pheno-ranker -r /data/individuals.json -o /data/matrix.txt
480
481						=head3 System requirements
482
483						- OS/ARCH supported: B<linux/amd64> and B<linux/arm64>.
484						- Ideally a Debian-based distribution (Ubuntu or Mint), but any other (e.g., CentOS, OpenSUSE) should do as well (untested).
485						(It should also work on macOS and Windows Server, but we are only providing information for Linux here)
486						* Perl 5 (>= 5.26 core; installed by default in most Linux distributions). Check the version with "perl -v".
487						* >= 4GB of RAM
488						* 1 core
489						* At least 16GB HDD
490
491						=head1 HOW TO RUN PHENO-RANKER
492
493						To run pheno-ranker you will need one or more PXF/BFF files in JSON|YAML format. The reference cohort must be a JSON array in which each individual's data is consolidated in one object.
494
495						You can download examples from L<this location|https://github.com/cnag-biomedical-informatics/pheno-ranker/tree/main/t>.
496
497						There are two modes of operation:
498
499						=over 4
500
501						=item Cohort mode:
502
503						B<I<Intra-cohort:>> With C<--r> argument and 1 cohort.
504
505						B<I<Inter-cohort:>> With C<--r> and multiple cohort files. It can be used in combination with C<--append-prefixes> to add prefixes to each individual id.
506
507						=item Patient Mode:
508
509						With C<-r> reference cohort(s) and C<--t> patient data.
510
511						=back
512
513						B<Examples:>
514
515						$ bin/pheno-ranker -r phenopackets.json # intra-cohort
516
517						$ bin/pheno-ranker -r phenopackets.yaml -o my_matrix.txt # intra-cohort
518
519						$ bin/pheno-ranker -r phenopackets.json -w weights.yaml --exclude-terms sex ethnicity exposures # intra-cohort with weights
520
521						$ $path/pheno-ranker -r individuals.json others.yaml --append-prefixes CANCER CONTROL # inter-cohort
522
523						$ $path/pheno-ranker -r individuals.json -t patient.yaml -max-out 100 # mode patient
524
525
526						=head2 COMMON ERRORS AND SOLUTIONS
527
528						* Error message: R plotting
529						Error in scan(file = file, what = what, sep = sep, quote = quote, dec = dec, :
530						line 1 did not have X elements
531						Calls: as.matrix -> read.table -> scan
532						Execution halted
533						Solution: Make sure that the values of your primary key (e.g., "id") do not contain spaces (e.g., "my fav id" must be "my_fav_id")
534
535						* Error message: Foo
536						Solution: Bar
537
538						=head1 CITATION
539
540						The author requests that any published work that utilizes C<Pheno-Ranker> includes a citation of the following reference:
541
542						Leist, I.C. et al., (2024). Pheno-Ranker: a toolkit for comparison of phenotypic data stored in GA4GH standards and beyond. I<BMC Bioinformatics>. DOI: 10.1186/s12859-024-05993-2
543
544						=head1 AUTHOR
545
546						Written by Manuel Rueda, PhD. Info about CNAG can be found at L<https://www.cnag.eu>.
547
548						=head1 COPYRIGHT AND LICENSE
549
550						This PERL file is copyrighted. See the LICENSE file included in this distribution.
551
552						=cut