File Coverage

bin/pheno-ranker
Criterion Covered Total %
statement 73 101 72.2
branch 14 44 31.8
condition 4 19 21.0
subroutine 15 19 78.9
pod n/a
total 106 183 57.9


line stmt bran cond sub pod time code
1             #!/usr/bin/env perl
2             #
3             # A script that performs semantic similarity in PXF|BFF data structures
4             #
5             # Last Modified: Feb/01/2025
6             #
7             # $VERSION taken from Pheno::Ranker
8             #
9             # Copyright (C) 2023-2025 Manuel Rueda - CNAG (manuel.rueda@cnag.eu)
10             #
11             # License: Artistic License 2.0
12             #
13             # If this program helps you in your research, please cite.
14              
15             package main;
16              
17 1     1   3954 use strict;
  1         2  
  1         31  
18 1     1   7 use warnings;
  1         1  
  1         35  
19 1     1   372 use autodie;
  1         13238  
  1         3  
20 1     1   4368 use feature qw(say);
  1         1  
  1         123  
21 1     1   649 use Getopt::Long qw(:config no_ignore_case);
  1         10913  
  1         5  
22 1     1   583 use Pod::Usage;
  1         51769  
  1         105  
23 1     1   533 use Data::Dumper;
  1         6032  
  1         56  
24 1     1   512 use Sys::Hostname;
  1         1054  
  1         54  
25 1     1   415 use POSIX qw(strftime);
  1         4114  
  1         3  
26 1     1   1740 use Term::ANSIColor qw(:constants);
  1         7121  
  1         828  
27 1     1   432 use File::ShareDir::ProjectDistDir qw(dist_dir);
  1         33413  
  1         7  
28 1     1   817 use FindBin qw($Bin);
  1         1042  
  1         127  
29 1     1   466 use lib "$Bin/../lib";
  1         558  
  1         5  
30 1     1   544 use Pheno::Ranker qw($VERSION write_json);
  1         4  
  1         1193  
31              
32             # Defining a few variables
33 1         162239 my $out_file_cohort = 'matrix.txt';
34 1         4 my $out_file_patient = 'rank.txt';
35 1         2 my $out_file_graph = 'graph.json';
36 1         2 my $out_file_graph_stats = 'graph_stats.txt';
37 1         2 my $export_basename = 'export';
38 1         1 my $align_basename = 'alignment';
39 1         3 my $log_file = 'pheno-ranker-log.json';
40 1         1 my $color = 1;
41 1         1 my $age = 0;
42 1         2 my $cli = 1;
43              
44             # Reading arguments
45             GetOptions(
46             'reference|r=s{1,}' => \my @reference_files, # array
47             'target|t=s' => \my $target_file, # string
48             'weights|w=s' => \my $weights_file, # string
49             'append-prefixes=s{1,}' => \my @append_prefixes, # array
50             'out-file|o=s' => \my $out_file_arg, # string
51             'max-out:i' => \my $max_out, # integer
52             'max-number-vars:i' => \my $max_number_vars, # integer
53             'include-hpo-ascendants' => \my $include_hpo_ascendants, # flag
54             'export|e:s' => \my $export, # opt-string (defined)
55             'align|a:s' => \my $align, # opt-string (defined)
56             'cytoscape-json:s' => \my $cytoscape_json, # opt-string (defined)
57             'graph-stats:s' => \my $graph_stats, # opt-string (defined)
58             'sort-by=s' => \my $sort_by, # string
59             'similarity-metric-cohort=s' => \my $similarity_metric_cohort, # string
60             'patients-of-interest|poi=s{1,}' => \my @patients_of_interest, # array
61             'poi-out-dir=s' => \my $poi_out_dir, # string
62             'include-terms=s{1,11}' => \my @include_terms, # array
63             'exclude-terms=s{1,11}' => \my @exclude_terms, # array
64             'retain-excluded-phenotypicFeatures' => \
65             my $retain_excluded_phenotypicFeatures, # flag
66             'precomputed-ref-prefix=s' => \my $precomputed_ref_prefix, # string
67             'max-matrix-items-in-ram=i' => \my $max_matrix_items_in_ram, # integer
68             'config=s' => \my $config_file, # string
69             'age!' => \$age, # flag
70             'help|?' => \my $help, # flag
71             'log:s' => \my $log, # opt-string (defined)
72             'man' => \my $man, # flag
73             'debug=i' => \my $debug, # integer
74             'verbose|' => \my $verbose, # flag
75             'color!' => \$color, # flag
76 0     0   0 'version|V' => sub { say "$0 Version $VERSION"; exit; }
  0         0  
77 1 50       13 ) or pod2usage(2);
78 1 50       2148 pod2usage(1) if $help;
79 1 50       3 pod2usage( -verbose => 2, -exitval => 0 ) if $man;
80 1 50 33     3 pod2usage(
81             -message => "Please specify a reference-cohort(s) with <--r>\n",
82             -exitval => 1
83             ) unless ( @reference_files || $precomputed_ref_prefix );
84 1 50 33     6 pod2usage(
85             -message =>
86             "<--graph_stats> only works in conjunction with <--cytoscape-json>\n",
87             -exitval => 1
88             ) if ( defined $graph_stats && !defined $cytoscape_json );
89 1 50 33     3 pod2usage(
90             -message => "Weights file <$weights_file> does not exist\n",
91             -exitval => 1
92             ) if ( defined $weights_file && !-f $weights_file );
93              
94             # Set the name of the output
95 1 50 33     6 my $out_file = $out_file_arg
96             // ( $target_file ? $out_file_patient : $out_file_cohort );
97              
98             # Set import data
99 1         2 my ( $glob_hash_file, $ref_hash_file, $ref_binary_hash_file );
100 1 50       3 if ( defined $precomputed_ref_prefix ) {
101              
102             # Check if any incompatible parameters are provided
103 0   0     0 my $has_incompatible_options =
104             @reference_files
105             || @append_prefixes
106             || $age
107             || defined $include_hpo_ascendants
108             || defined $retain_excluded_phenotypicFeatures
109             || defined $weights_file;
110              
111 0 0       0 if ($has_incompatible_options) {
112 0         0 pod2usage(
113             -message =>
114             "Sorry, but <--reference/age/hpo-ascendants/retain-excluded-phenotypicFeatures/weights/append-prefixes> are incompatible with --import <$precomputed_ref_prefix>\n",
115             -exitval => 1
116             );
117             }
118              
119             # Generate file names based on precomputed_ref_prefix
120 0         0 my $base_glob = $precomputed_ref_prefix . '.glob_hash.json';
121 0         0 my $base_ref = $precomputed_ref_prefix . '.ref_hash.json';
122 0         0 my $base_ref_binary = $precomputed_ref_prefix . '.ref_binary_hash.json';
123              
124             # Use the helper to check for .gz versions if needed.
125 0         0 $glob_hash_file = resolve_file($base_glob);
126 0         0 $ref_hash_file = resolve_file($base_ref);
127 0         0 $ref_binary_hash_file = resolve_file($base_ref_binary);
128             }
129              
130             # Set cytoscape-json logic
131 1         5 handle_option( \$cytoscape_json, "<--cytoscape-json> only works in cohort-mode",
132             $target_file, $out_file_graph );
133              
134             # Set graph-stats logic
135 1         2 handle_option( \$graph_stats, "<--graph-stats> only works in cohort-mode",
136             $target_file, $out_file_graph_stats );
137              
138             # Turning color off if argument <--no-color>
139 1 50       2 $ENV{'ANSI_COLORS_DISABLED'} = 1 unless $color;
140              
141             # Start printing to STDOUT
142 1 50       2 say BOLD CYAN program_header($VERSION), RESET if $verbose;
143              
144             ######################
145             # START PHENO-RANKER #
146             ######################
147              
148             # Load data as hashref
149 1         44 my $data = {
150             reference_files => \@reference_files,
151             target_file => $target_file,
152             weights_file => $weights_file,
153             include_hpo_ascendants => $include_hpo_ascendants,
154             hpo_file => undef,
155             align => $align,
156             align_basename => $align_basename,
157             export => $export,
158             export_basename => $export_basename,
159             out_file => $out_file,
160             cytoscape_json => $cytoscape_json,
161             graph_stats => $graph_stats,
162             max_out => $max_out,
163             max_number_vars => $max_number_vars,
164             sort_by => $sort_by,
165             similarity_metric_cohort => $similarity_metric_cohort,
166             patients_of_interest => \@patients_of_interest,
167             poi_out_dir => $poi_out_dir,
168             include_terms => \@include_terms,
169             exclude_terms => \@exclude_terms,
170             retain_excluded_phenotypicFeatures => $retain_excluded_phenotypicFeatures,
171             precomputed_ref_prefix => $precomputed_ref_prefix,
172             max_matrix_items_in_ram => $max_matrix_items_in_ram,
173             glob_hash_file => $glob_hash_file,
174             ref_hash_file => $ref_hash_file,
175             ref_binary_hash_file => $ref_binary_hash_file,
176             config_file => $config_file,
177             age => $age, # Solution, use ageRange in PXF/BFF, measures' values more difficult
178             cli => $cli,
179             append_prefixes => \@append_prefixes,
180             log => $log,
181             debug => $debug,
182             verbose => $verbose
183             };
184              
185             # Create object
186 1         14 my $ranker = Pheno::Ranker->new($data);
187              
188             # Run method
189 1         4 $ranker->run();
190              
191             # Create log if <--log>
192 1 0       0 write_log( $log ? $log : $log_file, $data, $VERSION )
    50          
193             if defined $log;
194              
195             ####################
196             # END PHENO-RANKER #
197             ####################
198              
199             sub handle_option {
200              
201 2     2   5 my ( $option_ref, $message, $target_file, $default ) = @_;
202 2 50       3 if ( defined $$option_ref ) {
203 2 50       3 pod2usage( -message => $message, -exitval => 1 ) if $target_file;
204 2 50       4 $$option_ref = $$option_ref ? $$option_ref : $default;
205             }
206             }
207              
208             sub resolve_file {
209              
210 0     0     my $base = shift;
211              
212             # If the base file exists, use it.
213 0 0         return $base if -e $base;
214              
215             # If a gzipped version exists, use that.
216 0           my $gz = $base . '.gz';
217 0 0         return $gz if -e $gz;
218              
219             # If neither exists, just return the base file (you might want to warn or error here)
220 0           return $base;
221             }
222              
223             sub write_log {
224              
225 0     0     my ( $log, $data, $VERSION ) = @_;
226              
227             # NB: Darwin does not have nproc to show #logical-cores, using sysctl instead
228 0           my $os = $^O;
229             chomp(
230 0 0 0       my $ncpuhost =
    0          
231             lc($os) eq 'darwin' ? qx{/usr/sbin/sysctl -n hw.logicalcpu}
232             : $os eq 'MSWin32' ? qx{wmic cpu get NumberOfLogicalProcessors}
233             : qx{/usr/bin/nproc} // 1
234             );
235              
236             # For the Windows command, the result will also contain the string
237             # "NumberOfLogicalProcessors" which is the header of the output.
238             # So we need to extract the actual number from it:
239 0 0         if ( $os eq 'MSWin32' ) {
240 0           ($ncpuhost) = $ncpuhost =~ /(\d+)/;
241             }
242 0           $ncpuhost = 0 + $ncpuhost; # coercing it to be a number
243              
244             my $info = {
245             date => ( strftime "%a %b %e %H:%M:%S %Y", localtime ),
246             ncpuhost => $ncpuhost,
247             hostname => hostname,
248             id => time . substr( "00000$$", -5 ), # string
249             version => $VERSION,
250             user => $ENV{'LOGNAME'}
251             || $ENV{'USER'}
252 0   0       || $ENV{'USERNAME'}
253             || 'dummy-user'
254             };
255              
256             # Saving file
257 0 0         say BOLD GREEN "Writing <$log> file\n" if $data->{verbose};
258 0           write_json(
259             {
260             filepath => $log,
261             data => { info => $info, data => $data }
262             }
263             );
264             }
265              
266             sub program_header {
267              
268 0     0     my $VERSION = shift;
269 0           my $str = <<EOF;
270             ****************************************
271             * Rank against cohort(s) (BFF/PXF) *
272             * - PHENO-RANKER - *
273             * Version: $VERSION *
274             * (C) 2023-2025 Manuel Rueda, PhD *
275             * The Artistic License 2.0 *
276             ****************************************
277             EOF
278 0           return $str;
279             }
280              
281             =head1 NAME
282              
283             pheno-ranker: A script that performs semantic similarity in PXF/BFF data structures and beyond (JSON|YAML)
284              
285             =head1 SYNOPSIS
286              
287             pheno-ranker -r <individuals.json> -t <patient.json> [-options]
288              
289             Arguments:
290             * Cohort mode:
291             -r, --reference JSON/YAML BFF/PXF file(s) (array/object), supports .gz
292              
293             * Patient mode:
294             -t, --target JSON/YAML BFF/PXF file (object or single-object array), supports .gz
295              
296             Options:
297             -age Include age-related variables; excludes agent-like terms (BFF/PXF-only) [>no-age|age]
298             -a, --align [path/basename] Write alignment file(s). If not specified, default filenames are used [default: alignment.*]
299             -append-prefixes Prefixes for primary_key when #cohorts >= 2 [default: C]
300             -config YAML config file to modify default parameters [default: share/conf/config.yaml]
301             -cytoscape-json [file] Serializes the pairwise comparison matrix as an undirected graph in JSON, compatible with Cytoscape [default: graph.json]
302             -e, --export [path/basename] Export miscellaneous JSON files. If not specified, default filenames are used [default: export.*]
303             -exclude-terms Exclude BFF/PXF terms (e.g., --exclude-terms sex, id) or column names in JSON-derived from CSV
304             -graph-stats [file] Generates a text file with key graph metrics, for use with <-cytoscape-json> [default: graph_stats.txt]
305             --precomputed-ref-prefix [path/basename] Use precomputed data for the reference cohort(s). No need to use --r
306             -include-hpo-ascendants Include ascendant terms from the Human Phenotype Ontology (HPO)
307             -include-terms Include BFF/PXF terms (e.g., --include-terms diseases) or column names in JSON-derived from CSV
308             -max-matrix-items-in-ram In cohort mode, set max items before switching to RAM-efficient mode (default: 5000)
309             -max-number-vars Maximum number of variables for binary string [default: 10000]
310             -max-out Print only N comparisons [default: 50]
311             -o, --out-file Output file path [default: -r matrix.txt | -t rank.txt]
312             -poi, --patients-of-interest Export JSON files for the selected individual IDs during a dry-run
313             -poi-out-dir Directory for JSON files (used with --poi)
314             -retain-excluded-phenotypicFeatures Retains features set to "excluded": true by appending '_excluded' to their IDs
315             -similarity-metric-cohort Similarity metric for cohort mode [>hamming|jaccard]
316             -sort-by Sort by Hamming distance or Jaccard index [>hamming|jaccard]
317             -w, --weights YAML file with weights
318              
319             Generic Options:
320             -debug Print debugging (from 1 to 5, being 5 max)
321             -h, --help Brief help message
322             -log Save log file [default: pheno-ranker-log.json]
323             -man Full documentation
324             -no-color Toggle color output [>color|no-color]
325             -v, --verbose Verbosity on
326             -V, --version Print version
327              
328             =head1 SUMMARY
329              
330             Pheno-Ranker is a lightweight, easy-to-install tool for performing semantic similarity analysis on phenotypic data in JSON/YAML formats, including Beacon v2 Models and Phenopackets v2. It also supports pre-processed CSV files prepared using the included C utility.
331              
332             =head1 INSTALLATION
333              
334             If you plan to only use the C<pheno-ranker> CLI, we recommend installing it via CPAN. See details below.
335              
336             =head2 Non containerized
337              
338             The script runs on command-line Linux and it has been tested on Debian/RedHat/macOS based distributions (only showing commands for Debian). Perl 5 is installed by default on Linux,
339             but we will install a few CPAN modules with C<cpanminus>.
340              
341             =head3 Method 1: From CPAN
342              
343             First install system level dependencies:
344              
345             sudo apt-get install cpanminus libperl-dev
346              
347             Now you have to choose between one of the 2 options below:
348              
349             B<Option 1:> System-level installation (requires sudo):
350              
351             cpanm --notest --sudo Pheno::Ranker
352             pheno-ranker -h
353              
354             B<Option 2:> Installation in your home directory (~/perl5), without sudo:
355              
356             cpanm --local-lib=~/perl5 local::lib && eval $(perl -I ~/perl5/lib/perl5/ -Mlocal::lib)
357             cpanm --notest Pheno::Ranker
358             pheno-ranker --help
359              
360             To ensure Perl recognizes your local modules every time you start a new terminal, you should type:
361              
362             echo 'eval $(perl -I ~/perl5/lib/perl5/ -Mlocal::lib)' >> ~/.bashrc
363              
364             =head3 Method 2: From CPAN in a CONDA environment
365              
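366             Please follow the Conda installation instructions in the project documentation (L<https://github.com/cnag-biomedical-informatics/pheno-ranker>).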
367              
368             =head3 Method 3: From GitHub
369              
370             git clone https://github.com/cnag-biomedical-informatics/pheno-ranker.git
371             cd pheno-ranker
372              
373             Install system level dependencies:
374            
375             sudo apt-get install cpanminus libperl-dev
376              
377             Now you have to choose between one of the 2 options below:
378              
379             B<Option 1:> System-level installation (requires sudo):
380              
381             cpanm --notest --sudo --installdeps .
382             bin/pheno-ranker --help
383              
384             B<Option 2:> Installation in your home directory (~/perl5), without sudo:
385              
386             cpanm --local-lib=~/perl5 local::lib && eval $(perl -I ~/perl5/lib/perl5/ -Mlocal::lib)
387             cpanm --notest --installdeps .
388             bin/pheno-ranker --help
389              
390             To ensure Perl recognizes your local modules every time you start a new terminal, you should type:
391              
392             echo 'eval $(perl -I ~/perl5/lib/perl5/ -Mlocal::lib)' >> ~/.bashrc
393              
394             I<Optional:> If you want to use the bundled Python-based utilities, install their dependencies:
395              
396             sudo apt-get install python3-pip libzbar0
397             pip3 install -r requirements.txt
398              
399             =head2 Containerized
400              
401             =head3 Method 4: From Docker Hub
402              
403             Download the latest version of the Docker image (supports both amd64 and arm64 architectures) from L<Docker Hub|https://hub.docker.com/r/manuelrueda/pheno-ranker> by executing:
404              
405             docker pull manuelrueda/pheno-ranker:latest
406             docker image tag manuelrueda/pheno-ranker:latest cnag/pheno-ranker:latest
407              
408             See additional instructions below.
409              
410             =head3 Method 5: With Dockerfile
411              
412             Please download the C<Dockerfile> from the repo:
413              
414             wget https://raw.githubusercontent.com/cnag-biomedical-informatics/pheno-ranker/main/Dockerfile
415              
416             And then run:
417              
418             # Docker Version 19.03 and Above (Supports buildx)
419             docker buildx build -t cnag/pheno-ranker:latest .
420              
421             # Docker Version Older than 19.03 (Does Not Support buildx)
422             docker build -t cnag/pheno-ranker:latest .
423              
424             =head3 Additional instructions for Methods 4 and 5
425              
426             To run the container (detached) execute:
427              
428             docker run -tid -e USERNAME=root --name pheno-ranker cnag/pheno-ranker:latest
429              
430             To enter:
431              
432             docker exec -ti pheno-ranker bash
433              
434             The command-line executable can be found at:
435              
436             /usr/share/pheno-ranker/bin/pheno-ranker
437              
438             The default container user is C<root> but you can also run the container as C<$UID=1000> (C<dockeruser>).
439              
440             docker run --user 1000 -tid --name pheno-ranker cnag/pheno-ranker:latest
441            
442             =head3 Mounting volumes
443              
444             Docker containers are fully isolated. If you need to mount a volume to the container please use the following syntax (C<-v host:container>).
445             Find an example below (note that you need to change the paths to match yours):
446              
447             docker run -tid --volume /media/mrueda/4TBT/data:/data --name pheno-ranker-mount cnag/pheno-ranker:latest
448              
449             Then I will do something like this:
450              
451             # First I create an alias to simplify invocation (from the host)
452             alias pheno-ranker='docker exec -ti pheno-ranker-mount /usr/share/pheno-ranker/bin/pheno-ranker'
453              
454             # Now I use the alias to run the command (note that I use the flag --o to specify the filepath)
455             pheno-ranker -r /data/individuals.json -o /data/matrix.txt
456              
457             =head3 System requirements
458              
459             * Ideally a Debian-based distribution (Ubuntu or Mint), but any other (e.g., CentOS, OpenSUSE) should do as well.
460             (It should also work on macOS and Windows Server, but we are only providing information for Linux here)
461             * Perl 5 (>= 5.26 core; installed by default in most Linux distributions). Check the version with "perl -v".
462             * >= 4GB of RAM
463             * 1 core
464             * At least 16GB HDD
465              
466             =head1 HOW TO RUN PHENO-RANKER
467              
468             For executing pheno-ranker you will need a PXF/BFF file(s) in JSON|YAML format. The reference cohort must be a JSON array, where each individual's data are consolidated in one object.
469              
470             You can download examples from the project repository (L<https://github.com/cnag-biomedical-informatics/pheno-ranker>).
471              
472             There are two modes of operation:
473              
474             =over 4
475              
476             =item Cohort mode:
477            
478             B<Intra-cohort:> With C<--r> argument and 1 cohort.
479              
480             B<Inter-cohort:> With C<--r> and multiple cohort files. It can be used in combination with C<--append-prefixes> to add prefixes to each individual id.
481              
482             =item Patient Mode:
483              
484             With C<-r> reference cohort(s) and C<--t> patient data.
485              
486             =back
487              
488             B<Examples:>
489              
490             $ ./pheno-ranker -r phenopackets.json # intra-cohort
491              
492             $ ./pheno-ranker -r phenopackets.yaml -o my_matrix.txt # intra-cohort
493              
494             $ ./pheno-ranker -r phenopackets.json -w weights.yaml --exclude-terms sex ethnicity exposures # intra-cohort with weights
495              
496             $ $path/pheno-ranker -r individuals.json others.yaml --append-prefixes CANCER CONTROL # inter-cohort
497              
498             $ $path/pheno-ranker -r individuals.json -t patient.yaml -max-out 100 # mode patient
499              
500              
501             =head2 COMMON ERRORS AND SOLUTIONS
502              
503             * Error message: R plotting
504             Error in scan(file = file, what = what, sep = sep, quote = quote, dec = dec, :
505             line 1 did not have X elements
506             Calls: as.matrix -> read.table -> scan
507             Execution halted
508             Solution: Make sure that the values of your primary key (e.g., "id") do not contain spaces (e.g., "my fav id" must be "my_fav_id")
509              
510             * Error message: Foo
511             Solution: Bar
512              
513             =head1 CITATION
514              
515             The author requests that any published work that utilizes C<pheno-ranker> includes a citation of the following reference:
516              
517             Leist, I.C. et al., (2024). Pheno-Ranker: a toolkit for comparison of phenotypic data stored in GA4GH standards and beyond. I<BMC Bioinformatics>. DOI: 10.1186/s12859-024-05993-2
518              
519             =head1 AUTHOR
520              
521             Written by Manuel Rueda, PhD. Info about CNAG can be found at L<https://www.cnag.eu>.
522              
523             =head1 COPYRIGHT AND LICENSE
524              
525             This PERL file is copyrighted. See the LICENSE file included in this distribution.
526              
527             =cut