File Coverage

bin/pheno-ranker
Criterion Covered Total %
statement 73 105 69.5
branch 14 46 30.4
condition 4 19 21.0
subroutine 15 19 78.9
pod n/a
total 106 189 56.0


line stmt bran cond sub pod time code
1             #!/usr/bin/env perl
2             #
3             # A script that performs semantic similarity in PXF|BFF data structures
4             #
5             # Last Modified: Feb/01/2025
6             #
7             # $VERSION taken from Pheno::Ranker
8             #
9             # Copyright (C) 2023-2025 Manuel Rueda - CNAG (manuel.rueda@cnag.eu)
10             #
11             # License: Artistic License 2.0
12             #
13             # If this program helps you in your research, please cite.
14              
15             package main;
16              
17 1     1   5616 use strict;
  1         1  
  1         42  
18 1     1   11 use warnings;
  1         1  
  1         51  
19 1     1   585 use autodie;
  1         20787  
  1         4  
20 1     1   8186 use feature qw(say);
  1         5  
  1         202  
21 1     1   863 use Getopt::Long qw(:config no_ignore_case);
  1         17819  
  1         7  
22 1     1   862 use Pod::Usage;
  1         84143  
  1         147  
23 1     1   781 use Data::Dumper;
  1         9529  
  1         86  
24 1     1   704 use Sys::Hostname;
  1         1513  
  1         66  
25 1     1   603 use POSIX qw(strftime);
  1         6783  
  1         7  
26 1     1   2789 use Term::ANSIColor qw(:constants);
  1         11964  
  1         1457  
27 1     1   672 use File::ShareDir::ProjectDistDir qw(dist_dir);
  1         53476  
  1         9  
28 1     1   1174 use FindBin qw($Bin);
  1         1550  
  1         164  
29 1     1   669 use lib "$Bin/../lib";
  1         874  
  1         10  
30 1     1   836 use Pheno::Ranker qw($VERSION write_json);
  1         7  
  1         2111  
31              
32             # Defining a few variables
33 1         310523 my $out_file_cohort = 'matrix.txt';
34 1         3 my $out_file_patient = 'rank.txt';
35 1         3 my $out_file_graph = 'graph.json';
36 1         2 my $out_file_graph_stats = 'graph_stats.txt';
37 1         3 my $export_basename = 'export';
38 1         2 my $align_basename = 'alignment';
39 1         2 my $log_file = 'pheno-ranker-log.json';
40 1         2 my $color = 1;
41 1         1 my $age = 0;
42 1         2 my $cli = 1;
43              
44             # Reading arguments
45             GetOptions(
46             'reference|r=s{1,}' => \my @reference_files, # array
47             'target|t=s' => \my $target_file, # string
48             'weights|w=s' => \my $weights_file, # string
49             'append-prefixes=s{1,}' => \my @append_prefixes, # array
50             'out-file|o=s' => \my $out_file_arg, # string
51             'max-out:i' => \my $max_out, # integer
52             'max-number-vars:i' => \my $max_number_vars, # integer
53             'include-hpo-ascendants' => \my $include_hpo_ascendants, # flag
54             'export|e:s' => \my $export, # opt-string (defined)
55             'align|a:s' => \my $align, # opt-string (defined)
56             'cytoscape-json:s' => \my $cytoscape_json, # opt-string (defined)
57             'graph-stats:s' => \my $graph_stats, # opt-string (defined)
58             'sort-by=s' => \my $sort_by, # string
59             'similarity-metric-cohort=s' => \my $similarity_metric_cohort, # string
60             'patients-of-interest|poi=s{1,}' => \my @patients_of_interest, # array
61             'poi-out-dir=s' => \my $poi_out_dir, # string
62             'include-terms=s{1,11}' => \my @include_terms, # array
63             'exclude-terms=s{1,11}' => \my @exclude_terms, # array
64             'retain-excluded-phenotypicFeatures' => \
65             my $retain_excluded_phenotypicFeatures, # flag
66             'prp|precomputed-ref-prefix=s' => \my $precomputed_ref_prefix, # string
67             'max-matrix-records-in-ram=i' => \my $max_matrix_records_in_ram, # integer
68             'config=s' => \my $config_file, # string
69             'age!' => \$age, # flag
70             'help|?' => \my $help, # flag
71             'log:s' => \my $log, # opt-string (defined)
72             'man' => \my $man, # flag
73             'debug=i' => \my $debug, # integer
74             'verbose|' => \my $verbose, # flag
75             'color!' => \$color, # flag
76 0     0   0 'version|V' => sub { say "$0 Version $VERSION"; exit; }
  0         0  
77 1 50       13 ) or pod2usage(2);
78 1 50       2161 pod2usage(1) if $help;
79 1 50       3 pod2usage( -verbose => 2, -exitval => 0 ) if $man;
80 1 50 33     4 pod2usage(
81             -message => "Please specify a reference-cohort(s) with <--r>\n",
82             -exitval => 1
83             ) unless ( @reference_files || $precomputed_ref_prefix );
84 1 50 33     4 pod2usage(
85             -message =>
86             "<--graph_stats> only works in conjunction with <--cytoscape-json>\n",
87             -exitval => 1
88             ) if ( defined $graph_stats && !defined $cytoscape_json );
89 1 50 33     4 pod2usage(
90             -message => "Weights file <$weights_file> does not exist\n",
91             -exitval => 1
92             ) if ( defined $weights_file && !-f $weights_file );
93              
94             # Set the name of the output
95 1 50 33     4 my $out_file = $out_file_arg
96             // ( $target_file ? $out_file_patient : $out_file_cohort );
97              
98             # Set import data
99             my (
100 1         3 $glob_hash_file, $ref_hash_file,
101             $ref_binary_hash_file, $coverage_stats_file
102             );
103 1 50       2 if ( defined $precomputed_ref_prefix ) {
104              
105             # Check if any incompatible parameters are provided
106 0   0     0 my $has_incompatible_options =
107             @reference_files
108             || @append_prefixes
109             || $age
110             || defined $include_hpo_ascendants
111             || defined $retain_excluded_phenotypicFeatures
112             || defined $weights_file;
113              
114 0         0 my @incompatible_flags = (
115             '--reference', '--age',
116             '--hpo-ascendants', '--retain-excluded-phenotypicFeatures',
117             '--weights', '--append-prefixes'
118             );
119              
120 0 0       0 if ($has_incompatible_options) {
121 0         0 my $flags_str = join( "\n", @incompatible_flags );
122 0         0 pod2usage(
123             -message =>
124             "Sorry, but the options\n$flags_str\nare incompatible with --prp <$precomputed_ref_prefix>\n",
125             -exitval => 1,
126             );
127             }
128              
129             # Generate file names based on precomputed_ref_prefix
130 0         0 my $base_glob = $precomputed_ref_prefix . '.glob_hash.json';
131 0         0 my $base_ref = $precomputed_ref_prefix . '.ref_hash.json';
132 0         0 my $base_ref_binary = $precomputed_ref_prefix . '.ref_binary_hash.json';
133 0         0 my $base_coverage_stats = $precomputed_ref_prefix . '.coverage_stats.json';
134              
135             # Use the helper to check for .gz versions if needed.
136 0         0 $glob_hash_file = resolve_file($base_glob);
137 0         0 $ref_hash_file = resolve_file($base_ref);
138 0         0 $ref_binary_hash_file = resolve_file($base_ref_binary);
139 0         0 $coverage_stats_file = resolve_file($base_coverage_stats);
140             }
141              
142             # Set cytoscape-json logic
143 1         4 handle_option( \$cytoscape_json, "<--cytoscape-json> only works in cohort-mode",
144             $target_file, $out_file_graph );
145              
146             # Set graph-stats logic
147 1         3 handle_option( \$graph_stats, "<--graph-stats> only works in cohort-mode",
148             $target_file, $out_file_graph_stats );
149              
150             # Turning color off if argument <--no-color>
151 1 50       23 $ENV{'ANSI_COLORS_DISABLED'} = 1 unless $color;
152              
153             # Start printing to STDOUT
154 1 50       2 say BOLD CYAN program_header($VERSION), RESET if $verbose;
155              
156             ######################
157             # START PHENO-RANKER #
158             ######################
159              
160             # Load data as hashref
161 1         25 my $data = {
162             reference_files => \@reference_files,
163             target_file => $target_file,
164             weights_file => $weights_file,
165             include_hpo_ascendants => $include_hpo_ascendants,
166             hpo_file => undef,
167             align => $align,
168             align_basename => $align_basename,
169             export => $export,
170             export_basename => $export_basename,
171             out_file => $out_file,
172             cytoscape_json => $cytoscape_json,
173             graph_stats => $graph_stats,
174             max_out => $max_out,
175             max_number_vars => $max_number_vars,
176             sort_by => $sort_by,
177             similarity_metric_cohort => $similarity_metric_cohort,
178             patients_of_interest => \@patients_of_interest,
179             poi_out_dir => $poi_out_dir,
180             include_terms => \@include_terms,
181             exclude_terms => \@exclude_terms,
182             retain_excluded_phenotypicFeatures => $retain_excluded_phenotypicFeatures,
183             precomputed_ref_prefix => $precomputed_ref_prefix,
184             max_matrix_records_in_ram => $max_matrix_records_in_ram,
185             glob_hash_file => $glob_hash_file,
186             ref_hash_file => $ref_hash_file,
187             ref_binary_hash_file => $ref_binary_hash_file,
188             coverage_stats_file => $coverage_stats_file,
189             config_file => $config_file,
190             age => $age, # Solution, use ageRange in PXF/BFF, measures' values more difficult
191             cli => $cli,
192             append_prefixes => \@append_prefixes,
193             log => $log,
194             debug => $debug,
195             verbose => $verbose
196             };
197              
198             # Create object
199 1         11 my $ranker = Pheno::Ranker->new($data);
200              
201             # Run method
202 1         4 $ranker->run();
203              
204             # Create log if <--log>
205 1 0       145 write_log( $log ? $log : $log_file, $data, $VERSION )
    50          
206             if defined $log;
207              
208             ####################
209             # END PHENO-RANKER #
210             ####################
211              
212             sub handle_option {
213 2     2   49 my ( $option_ref, $message, $target_file, $default ) = @_;
214 2 50       4 if ( defined $$option_ref ) {
215 2 50       3 pod2usage( -message => $message, -exitval => 1 ) if $target_file;
216 2 50       4 $$option_ref = $$option_ref ? $$option_ref : $default;
217             }
218             }
219              
220             sub resolve_file {
221 0     0     my $base = shift;
222              
223             # If the base file exists, use it.
224 0 0         return $base if -e $base;
225              
226             # If a gzipped version exists, use that.
227 0           my $gz = $base . '.gz';
228 0 0         return $gz if -e $gz;
229              
230             # If neither exists, just return the base file (you might want to warn or error here)
231 0           return $base;
232             }
233              
234             sub write_log {
235 0     0     my ( $log, $data, $VERSION ) = @_;
236              
237             # NB: Darwin does not have nproc to show #logical-cores, using sysctl instead
238 0           my $os = $^O;
239             chomp(
240 0 0 0       my $threadshost =
    0          
    0          
241             lc($os) eq 'darwin' ? qx{/usr/sbin/sysctl -n hw.logicalcpu}
242             : lc($os) eq 'freebsd' ? qx{sysctl -n hw.ncpu}
243             : $os eq 'MSWin32' ? qx{wmic cpu get NumberOfLogicalProcessors}
244             : qx{/usr/bin/nproc} // 1
245             );
246              
247             # For the Windows command, the result will also contain the string
248             # "NumberOfLogicalProcessors" which is the header of the output.
249             # So we need to extract the actual number from it:
250 0 0         if ( $os eq 'MSWin32' ) {
251 0           ($threadshost) = $threadshost =~ /(\d+)/;
252             }
253 0           $threadshost = 0 + $threadshost; # coercing it to be a number
254              
255             my $info = {
256             date => ( strftime "%a %b %e %H:%M:%S %Y", localtime ),
257             threadshost => $threadshost,
258             hostname => hostname,
259             id => time . substr( "00000$$", -5 ), # string
260             version => $VERSION,
261             user => $ENV{'LOGNAME'}
262             || $ENV{'USER'}
263 0   0       || $ENV{'USERNAME'}
264             || 'dummy-user'
265             };
266              
267             # Saving file
268 0 0         say BOLD GREEN "Writing <$log> file\n" if $data->{verbose};
269 0           write_json(
270             {
271             filepath => $log,
272             data => { info => $info, data => $data }
273             }
274             );
275             }
276              
277             sub program_header {
278 0     0     my $VERSION = shift;
279 0           my $str = <<EOF;
280             ****************************************
281             * Rank against cohort(s) (BFF/PXF) *
282             * - PHENO-RANKER - *
283             * Version: $VERSION *
284             * (C) 2023-2025 Manuel Rueda, PhD *
285             * The Artistic License 2.0 *
286             ****************************************
287             EOF
288 0           return $str;
289             }
290              
291             =head1 NAME
292              
293             pheno-ranker: A script that performs semantic similarity in PXF/BFF data structures and beyond (JSON|YAML)
294              
295             =head1 SYNOPSIS
296              
297             pheno-ranker -r <individuals.json> -t <patient.json> [-options]
298              
299             Arguments:
300             * Cohort mode:
301             -r, --reference JSON/YAML BFF/PXF file(s) (array/object), supports .gz
302              
303             * Patient mode:
304             -t, --target JSON/YAML BFF/PXF file (object or single-object array), supports .gz
305              
306             Options:
307             -age Include age-related variables; excludes agent-like terms (BFF/PXF-only) [>no-age|age]
308             -a, --align [path/basename] Write alignment file(s). If not specified, default filenames are used [default: alignment.*]
309             -append-prefixes Prefixes for primary_key when #cohorts >= 2 [default: C]
310             -config YAML config file to modify default parameters [default: share/conf/config.yaml]
311             -cytoscape-json [file] Serializes the pairwise comparison matrix as an undirected graph in JSON, compatible with Cytoscape [default: graph.json]
312             -e, --export [path/basename] Export miscellaneous JSON files. If not specified, default filenames are used [default: export.*]
313             -exclude-terms Exclude BFF/PXF terms (e.g., --exclude-terms sex, id) or column names in JSON-derived from CSV
314             -graph-stats [file] Generates a text file with key graph metrics, for use with <-cytoscape-json> [default: graph_stats.txt]
315             -include-hpo-ascendants Include ascendant terms from the Human Phenotype Ontology (HPO)
316             -include-terms Include BFF/PXF terms (e.g., --include-terms diseases) or column names in JSON-derived from CSV
317             -max-matrix-records-in-ram In cohort mode, set max records before switching to RAM-efficient mode [default: 5000]
318             -max-number-vars Maximum number of variables for binary string [default: 10000]
319             -max-out Print only N comparisons [default: 50]
320             -o, --out-file Output file path [default: -r matrix.txt | -t rank.txt]
321             -poi, --patients-of-interest Export JSON files for the selected individual IDs during a dry-run
322             -poi-out-dir Directory for JSON files (used with --poi)
323             -prp, --precomputed-ref-prefix [path/basename] Use precomputed data for the reference cohort(s). No need to use --r
324             -retain-excluded-phenotypicFeatures Retains features set to "excluded": true by appending '_excluded' to their IDs
325             -similarity-metric-cohort Similarity metric for cohort mode [>hamming|jaccard]
326             -sort-by Sort by Hamming distance or Jaccard index [>hamming|jaccard]
327             -w, --weights YAML file with weights
328              
329             Generic Options:
330             -debug Print debugging (from 1 to 5, being 5 max)
331             -h, --help Brief help message
332             -log Save log file [default: pheno-ranker-log.json]
333             -man Full documentation
334             -no-color Toggle color output [>color|no-color]
335             -v, --verbose Verbosity on
336             -V, --version Print version
337              
338             =head1 SUMMARY
339              
340             Pheno-Ranker is a lightweight, easy-to-install tool for performing semantic similarity analysis on phenotypic data in JSON/YAML formats, including Beacon v2 Models and Phenopackets v2. It also supports pre-processed CSV files prepared using the included C<csv2pheno-ranker> utility.
341              
342             =head1 INSTALLATION
343              
344             If you plan to only use the C<pheno-ranker> CLI, we recommend installing it via CPAN. See details below.
345              
346             =head2 Non containerized
347              
348             The script runs on command-line Linux and it has been tested on Debian/RedHat/macOS based distributions (only showing commands for Debian). Perl 5 is installed by default on Linux,
349             but we will install a few CPAN modules with C<cpanminus>.
350              
351             =head3 Method 1: From CPAN
352              
353             First install system level dependencies:
354              
355             sudo apt-get install cpanminus libperl-dev gcc make
356              
357             Now you have to choose between one of the 2 options below:
358              
359             B<Option 1:> System-level installation:
360              
361             cpanm --notest --sudo Pheno::Ranker
362             pheno-ranker -h
363              
364             B<Option 2:> Install Pheno-Ranker in a "virtual environment" (at C<~/perl5>):
365              
366             cpanm --local-lib=~/perl5 local::lib && eval $(perl -I ~/perl5/lib/perl5/ -Mlocal::lib)
367             cpanm --notest Pheno::Ranker
368             pheno-ranker --help
369              
370             To ensure Perl recognizes your local modules every time you start a new terminal, you should type:
371              
372             echo 'eval $(perl -I ~/perl5/lib/perl5/ -Mlocal::lib)' >> ~/.bashrc
373              
374             To B<upgrade> to the newest version (showing commands for Option 2):
375            
376             cpanm Pheno::Ranker
377              
378             =head3 Method 2: From CPAN in a CONDA environment
379              
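380             Please follow the Conda installation instructions in the L<project documentation|https://github.com/cnag-biomedical-informatics/pheno-ranker>.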
381              
382             =head3 Method 3: From GitHub
383              
384             To clone the repository for the first time:
385              
386             git clone https://github.com/cnag-biomedical-informatics/pheno-ranker.git
387             cd pheno-ranker
388              
389             To update an existing clone, navigate to the repository folder and run:
390              
391             git pull
392              
393             Install system level dependencies:
394            
395             sudo apt-get install cpanminus libperl-dev
396              
397             Now you have to choose between one of the 2 options below:
398              
399             B<Option 1:> Install the dependencies system-wide (as C<sudo>):
400              
401             cpanm --notest --sudo --installdeps .
402             bin/pheno-ranker --help
403              
404             B<Option 2:> Install the dependencies in a "virtual environment" (at C<~/perl5>):
405              
406             cpanm --local-lib=~/perl5 local::lib && eval $(perl -I ~/perl5/lib/perl5/ -Mlocal::lib)
407             cpanm --notest --installdeps .
408             bin/pheno-ranker --help
409              
410             To ensure Perl recognizes your local modules every time you start a new terminal, you should type:
411              
412             echo 'eval $(perl -I ~/perl5/lib/perl5/ -Mlocal::lib)' >> ~/.bashrc
413              
414             I<Optional:> If you want to use C<utils/barcode> or C<utils/bff_pxf_plot>:
415              
416             sudo apt-get install python3-pip libzbar0
417             pip3 install -r requirements.txt
418              
419             =head2 Containerized
420              
421             =head3 Method 4: From Docker Hub
422              
423             (Estimated Time: Approximately 10 seconds)
424              
425             Download the latest version of the Docker image (supports both amd64 and arm64 architectures) from L<Docker Hub|https://hub.docker.com/r/manuelrueda/pheno-ranker> by executing:
426              
427             docker pull manuelrueda/pheno-ranker:latest
428             docker image tag manuelrueda/pheno-ranker:latest cnag/pheno-ranker:latest
429              
430             See additional instructions below.
431              
432             =head3 Method 5: With Dockerfile
433              
434             (Estimated Time: Approximately 1 minute)
435              
436             Please download the C<Dockerfile> from the repo:
437              
438             wget https://raw.githubusercontent.com/cnag-biomedical-informatics/pheno-ranker/main/Dockerfile
439              
440             And then run:
441              
442             # Docker Version 19.03 and Above (Supports buildx)
443             docker buildx build -t cnag/pheno-ranker:latest .
444              
445             # Docker Version Older than 19.03 (Does Not Support buildx)
446             docker build -t cnag/pheno-ranker:latest .
447              
448             =head3 Additional instructions for Methods 4 and 5
449              
450             To run the container (detached) execute:
451              
452             docker run -tid -e USERNAME=root --name pheno-ranker cnag/pheno-ranker:latest
453              
454             To enter:
455              
456             docker exec -ti pheno-ranker bash
457              
458             The command-line executable can be found at:
459              
460             /usr/share/pheno-ranker/bin/pheno-ranker
461              
462             The default container user is C<root> but you can also run the container as C<$UID=1000> (C<dockeruser>).
463              
464             docker run --user 1000 -tid --name pheno-ranker cnag/pheno-ranker:latest
465            
466             =head3 Mounting volumes
467              
468             Docker containers are fully isolated. If you need to mount a volume to the container please use the following syntax (C<-v host:container>).
469             Find an example below (note that you need to change the paths to match yours):
470              
471             docker run -tid --volume /media/mrueda/4TBT/data:/data --name pheno-ranker-mount cnag/pheno-ranker:latest
472              
473             Then I will do something like this:
474              
475             # First I create an alias to simplify invocation (from the host)
476             alias pheno-ranker='docker exec -ti pheno-ranker-mount /usr/share/pheno-ranker/bin/pheno-ranker'
477              
478             # Now I use the alias to run the command (note that I use the flag --o to specify the filepath)
479             pheno-ranker -r /data/individuals.json -o /data/matrix.txt
480              
481             =head3 System requirements
482              
483             - OS/ARCH supported: B<linux/amd64> and B<linux/arm64>.
484             - Ideally a Debian-based distribution (Ubuntu or Mint), but any other (e.g., CentOS, OpenSUSE) should do as well (untested).
485             (It should also work on macOS and Windows Server, but we are only providing information for Linux here)
486             * Perl 5 (>= 5.26 core; installed by default in most Linux distributions). Check the version with "perl -v".
487             * >= 4GB of RAM
488             * 1 core
489             * At least 16GB HDD
490              
491             =head1 HOW TO RUN PHENO-RANKER
492              
493             To run pheno-ranker you will need one or more PXF/BFF files in JSON|YAML format. The reference cohort must be a JSON array in which each individual's data are consolidated into one object.
494              
495             You can download examples from the L<GitHub repository|https://github.com/cnag-biomedical-informatics/pheno-ranker>.
496              
497             There are two modes of operation:
498              
499             =over 4
500              
501             =item Cohort mode:
502            
503             B<Intra-cohort:> With C<--r> argument and 1 cohort.
504              
505             B<Inter-cohort:> With C<--r> and multiple cohort files. It can be used in combination with C<--append-prefixes> to add prefixes to each individual id.
506              
507             =item Patient Mode:
508              
509             With C<-r> reference cohort(s) and C<--t> patient data.
510              
511             =back
512              
513             B<Examples:>
514              
515             $ bin/pheno-ranker -r phenopackets.json # intra-cohort
516              
517             $ bin/pheno-ranker -r phenopackets.yaml -o my_matrix.txt # intra-cohort
518              
519             $ bin/pheno-ranker -r phenopackets.json -w weights.yaml --exclude-terms sex ethnicity exposures # intra-cohort with weights
520              
521             $ $path/pheno-ranker -r individuals.json others.yaml --append-prefixes CANCER CONTROL # inter-cohort
522              
523             $ $path/pheno-ranker -r individuals.json -t patient.yaml -max-out 100 # mode patient
524              
525              
526             =head2 COMMON ERRORS AND SOLUTIONS
527              
528             * Error message: R plotting
529             Error in scan(file = file, what = what, sep = sep, quote = quote, dec = dec, :
530             line 1 did not have X elements
531             Calls: as.matrix -> read.table -> scan
532             Execution halted
533             Solution: Make sure that the values of your primary key (e.g., "id") do not contain spaces (e.g., "my fav id" must be "my_fav_id")
534              
535             * Error message: Foo
536             Solution: Bar
537              
538             =head1 CITATION
539              
540             The author requests that any published work that utilizes C<pheno-ranker> includes a citation to the following reference:
541              
542             Leist, I.C. et al., (2024). Pheno-Ranker: a toolkit for comparison of phenotypic data stored in GA4GH standards and beyond. I<BMC Bioinformatics>. DOI: 10.1186/s12859-024-05993-2
543              
544             =head1 AUTHOR
545              
546             Written by Manuel Rueda, PhD. Info about CNAG can be found at L<https://www.cnag.eu>.
547              
548             =head1 COPYRIGHT AND LICENSE
549              
550             This PERL file is copyrighted. See the LICENSE file included in this distribution.
551              
552             =cut