File Coverage

utils/csv2pheno_ranker/csv2pheno-ranker

Criterion	Covered	Total	%
statement	146	149	97.9
branch	21	46	45.6
condition	7	21	33.3
subroutine	24	25	96.0
pod			n/a
total	198	241	82.1

line	stmt	bran	cond	sub	time	code
1						#!/usr/bin/env perl
2						#
3						# An utility from Pheno-Ranker to convert a CSV to:
4						#
5						# 1 - Input file (JSON array of objects)
6						# 2 - Configuration file (needed for Pheno-Ranker)
7						#
8						# Last Modified: Mar/21/2024
9						#
10						# $VERSION taken from Pheno::Ranker
11						#
12						# Copyright (C) 2023-2024 Manuel Rueda - CNAG (manuel.rueda@cnag.eu)
13						#
14						# License: Artistic License 2.0
15						#
16						# If this program helps you in your research, please cite.
17
18	1			1	3658	use strict;
	1				1
	1				51
19	1			1	3	use warnings;
	1				4
	1				41
20	1			1	539	use Getopt::Long qw(:config no_ignore_case);
	1				10795
	1				4
21	1			1	561	use Pod::Usage;
	1				51675
	1				319
22
23						### Main #####
24	1				138182	process_csv();
25						##############
26	1				0	exit;
27
28						sub process_csv {
29	1			1	4	my $VERSION = '0.09';
30	1				3	my $array_sep = '\\|';
31
32						# Reading arguments
33						GetOptions(
34						'input\|i=s' => \my $input, # string
35						'primary-key-name=s' => \my $primary_key_name, # string
36						'generate-primary-key' => \my $generate_primary_key, # flag
37						'separator\|sep=s' => \my $sep, # str
38						'array-separator=s' => \$array_sep, # str
39						'output-dir=s' => \my $output_dir, # str
40						'help\|?' => \my $help, # flag
41						'man' => \my $man, # flag
42						'debug=i' => \my $debug, # integer
43						'verbose\|' => \my $verbose, # flag
44	0			0	0	'version\|V' => sub { print "$0 Version $VERSION\n"; exit; }
	0				0
45	1	50			15	) or pod2usage(2);
46	1	50			2955	pod2usage(1) if $help;
47	1	50			4	pod2usage( -verbose => 2, -exitval => 0 ) if $man;
48	1	50	33		43	pod2usage(
			33
49						-message =>
50						"Please specify a valid CSV\|TSV input with --i \n",
51						-exitval => 1
52						) unless ( $input && $input =~ m/\.(csv\|tsv)$/ && -f $input );
53	1	50	33		8	pod2usage(
54						-message => "Please specify a --primary-key-name \n",
55						-exitval => 1
56						) if ( $generate_primary_key && !$primary_key_name );
57	1	50	33		5	pod2usage(
58						-message => "Please specify a valid directory for --output-dir\n",
59						-exitval => 1
60						) if ( defined $output_dir && !-d $output_dir );
61
62						# Create object
63	1				18	my $csv = CSV2PhenoRanker->new(
64						{
65						input => $input,
66						primary_key_name => $primary_key_name,
67						generate_primary_key => $generate_primary_key,
68						output_dir => $output_dir,
69						sep => $sep,
70						array_sep => $array_sep,
71						debug => $debug,
72						verbose => $verbose
73						}
74						);
75
76						# Run method
77	1				5	$csv->run;
78						}
79
80						package CSV2PhenoRanker;
81
82	1			1	7	use strict;
	1				1
	1				16
83	1			1	2	use warnings;
	1				1
	1				60
84	1			1	393	use autodie;
	1				13409
	1				6
85	1			1	8281	use feature qw(say);
	1				3
	1				182
86	1			1	855	use Data::Dumper;
	1				9549
	1				94
87	1			1	1043	use Path::Tiny;
	1				17113
	1				126
88	1			1	10	use File::Basename;
	1				3
	1				96
89	1			1	655	use File::Spec::Functions qw(catdir catfile);
	1				1073
	1				127
90	1			1	555	use YAML::XS qw(LoadFile DumpFile);
	1				3818
	1				105
91	1			1	873	use JSON::XS;
	1				7206
	1				76
92	1			1	1649	use Text::CSV_XS;
	1				24860
	1				2020
93
94						sub new {
95	1			1	4	my ( $class, $self ) = @_;
96	1				3	bless $self, $class;
97	1				3	return $self;
98						}
99
100						sub run {
101	1			1	3	my $self = shift;
102
103						# Read the input file
104	1				5	my ( $data, $arrays, $non_arrays ) = $self->read_csv();
105
106						# Define output directory
107	1				62	my ( $name, $path, undef ) = fileparse( $self->{input}, qr/\.[^.]*/ );
108	1		33		9	my $output_dir = $self->{output_dir} // $path; # Use defined-or operator
109
110						# Write data as JSON
111	1				10	my $json_file = catfile( $output_dir, qq/${name}.json/ );
112	1	50			4	say "Writting <$json_file> " if $self->{verbose};
113	1				7	write_json( { filepath => $json_file, data => $data } );
114
115						# Load the configuration file data
116	1				7	my $config = $self->create_config( $arrays, $non_arrays );
117
118						# Write the configuration file as YAML
119	1				8	my $yaml_file = catfile( $output_dir, qq/${name}_config.yaml/ );
120	1	50			5	say "Writting <$yaml_file> " if $self->{verbose};
121	1				7	write_yaml( { filepath => $yaml_file, data => $config } );
122	1				26	return 1;
123						}
124
125						sub write_json {
126	1			1	3	my $arg = shift;
127	1				3	my $file = $arg->{filepath};
128	1				2	my $json_data = $arg->{data};
129
130						# Note that canonical DOES not match the order of nsort from Sort::Naturally
131	1				66	my $json = JSON::XS->new->utf8->canonical->pretty->encode($json_data);
132	1				11	path($file)->spew($json);
133	1				9159	return 1;
134						}
135
136						sub write_yaml {
137	1			1	3	my $arg = shift;
138	1				3	my $file = $arg->{filepath};
139	1				3	my $json_data = $arg->{data};
140	1				2	local $YAML::XS::Boolean = 'JSON::PP';
141	1				7	DumpFile( $file, $json_data );
142	1				471	return 1;
143						}
144
145						sub read_csv {
146	1			1	2	my $self = shift;
147	1				10	my $input = $self->{input};
148	1				3	my $primary_key_name = $self->{primary_key_name}; # has to be non-array
149	1				19	my $generate_primary_key = $self->{generate_primary_key};
150	1				2	my $sep = $self->{sep};
151	1				3	my $array_sep = $self->{array_sep};
152	1				22	my $array_sep_qr = qr/$array_sep/;
153
154						# Create a Text::CSV object with semicolon as the separator
155	1				6	my $csv = Text::CSV_XS->new(
156						{
157						binary => 1,
158						sep_char => define_separator( $input, $sep ),
159						auto_diag => 1
160						}
161						);
162
163						# Open filehandle
164	1				227	open my $fh, '<:encoding(utf-8)', $input;
165
166						# Parse the CSV data
167	1				5247	my $headers = $csv->getline($fh);
168
169						# Get rid of problematic characters on headers
170	1				89	$_ =~ tr/()//d for @$headers;
171
172						# Add $primary_key_name to headers if $generate_primary_key
173	1	50			8	if ($generate_primary_key) {
174
175						# Check that primary_key_name does not exist
176						die
177						"Primary key <$primary_key_name> already exists. Are you sure you need the <--generate-primary-key> flag?\n"
178	1	50			4	if ( grep { $_ eq $primary_key_name } @$headers );
	3				10
179
180						# Make it last element of the array
181	1	50			5	push @$headers, $primary_key_name if $generate_primary_key;
182						}
183
184						#####################
185						# START READING CSV #
186						#####################
187
188	1				3	my ( @rows, %array, %non_array );
189	1				3	my $count = 1;
190	1	50			11	say "Start reading <$input>" if $self->{verbose};
191	1				53	while ( my $row = $csv->getline($fh) ) {
192
193						# Print if verbose
194						say "Reading row $count..."
195	3	50	33		14	if ( $self->{verbose} && $count % 1_000 == 0 );
196
197						# Add id if $generate_primary_key
198	3	50			20	push @$row, 'PR_' . sprintf( "%08d", $count ) if $generate_primary_key;
199
200						# Load data
201	3				6	my %data;
202	3				19	@data{@$headers} = @$row;
203
204						# * IMPORTANT *
205						# Columns can consist of arrays or strings
206						# Here we load all as strings and we re-format array fields a posteriori
207
208	3				63	for my $key ( keys %data ) {
209
210						# Check array/non-array based on regex
211	12	100			110	$array{$key}++ if $data{$key} =~ $array_sep_qr;
212						}
213
214	3				9	push @rows, \%data;
215	3				139	$count++;
216						}
217	1				91	close $fh;
218	1	50			1302	say "Reading <$input> completed!" if $self->{verbose};
219
220						###################
221						# END READING CSV #
222						###################
223
224						# Load array/non-array
225	1				5	my @array = keys %array;
226
227						# Filter the original array to exclude elements found in the @array
228	1				4	my @non_array = grep { !$array{$_} } @$headers;
	4				13
229
230						# Re-arrange array variables
231	1				6	split_array_fields( \@rows, \@array, $array_sep );
232
233	1				25	return ( \@rows, \@array, \@non_array );
234						}
235
236						sub split_array_fields {
237	1			1	3	my ( $rows, $array, $array_sep ) = @_;
238
239						# Split array fields (comma-separated values) into an array_ref
240						# Modify the original data structure directly
241	1				4	for my $row (@$rows) {
242	3				6	for my $key (@$array) {
243	3				31	$row->{$key} = [ split /$array_sep/, $row->{$key} ];
244						}
245						}
246	1				3	return 1;
247						}
248
249						sub create_config {
250	1			1	4	my ( $self, $array, $non_array ) = @_;
251	1				3	my $primary_key_name = $self->{primary_key_name};
252
253	1				5	my @arrays = sort @$array;
254	1				5	my @non_arrays = sort @$non_array;
255
256						# Convert arrays to hashes for quick look up
257	1				4	my %arrays_hash = map { $_ => 1 } @arrays;
	1				5
258	1				3	my %non_arrays_hash = map { $_ => 1 } @non_arrays;
	3				9
259
260						# Set primary key
261	1	50			5	if ( defined $primary_key_name ) {
262						die "Primary-key <$primary_key_name> not found\n"
263						unless ( exists $arrays_hash{$primary_key_name}
264	1	50	33		12	\|\| exists $non_arrays_hash{$primary_key_name} );
265						}
266
267						else {
268	0	0			0	$primary_key_name = exists $non_arrays_hash{id} ? 'id' : $non_arrays[0];
269						}
270
271						# Default for non-arrays
272	1				7	my $config = {
273						format => 'CSV',
274						primary_key => $primary_key_name,
275						allowed_terms => [@non_arrays]
276						};
277
278						# Update for arrays
279	1	50			5	if ( scalar @arrays ) {
280
281						# NB: Can't use $array more than once in the hash ref below. Need to deref
282	1				21	$config->{array_terms} = [@arrays];
283
284						# @non_arrays, @arrays are sorted, but if we merge them we need to re-sort
285	1				6	$config->{allowed_terms} = [ sort @non_arrays, @arrays ];
286						$config->{id_correspondence} = {
287						CSV => [
288						map {
289	1				3	my $val = { $_ => $_ };
	1				3
290	1				6	$val
291						} @arrays
292						]
293						};
294
295						}
296	1				5	return $config;
297						}
298
299						sub define_separator {
300	1			1	4	my ( $filepath, $sep ) = @_;
301
302						# Define split record separator from file extension
303	1				3	my @exts = qw(.csv .tsv);
304	1				77	my ( $undef, undef, $ext ) = fileparse( $filepath, @exts );
305
306						# Defining separator character
307	1	0			5	my $separator =
		0
		50
308						defined($sep) ? $sep
309						: $ext eq '.csv' ? ','
310						: # Use comma for csv files
311						$ext eq '.tsv' ? "\t" : # Use tab for tsv files
312						','; # Default to comma if no extension match or $sep is undefined
313
314						# Return separator
315	1				19	return $separator;
316						}
317
318						1;
319
320						=head1 NAME
321
322						csv2pheno-ranker: A script to convert a CSV to an input suitable for Pheno-Ranker
323
324						=head1 SYNOPSIS
325
326						csv2pheno-ranker -i [-options]
327
328						Arguments:
329						-i, --input CSV file
330
331						Options:
332						-generate-primary-key Generates a primary key if absent. Use --primary-key-name to set its name
333						-primary-key-name Sets the name for the primary key. Must be a single, non-array field
334						-sep, --separator Delimiter for CSV fields [;] (e.g., --sep $'\t' for tabs)
335						-array-separator Delimiter for nested arrays [\|] (e.g., --array-separator ';' for semicolons)
336						-output-dir Specify the directory where output files will be stored. If not specified, outputs will be placed in the same directory as the input file
337
338						Generic Options:
339						-debug Print debugging (from 1 to 5, being 5 max)
340						-h, --help Brief help message
341						-man Full documentation
342						-v, --verbose Verbosity on
343						-V, --version Print version
344
345						=head1 DESCRIPTION
346
347						Numerous tools exist for CSV to JSON conversion, but our focus here was on creating JSON specifically for C. The script supports both basic CSV files and complex, comma-separated CSV files with nested fields, ensuring seamless C integration.
348
349						The script will create both a JSON file and the configuration file for C. Then, you can run C as:
350
351						$ pheno-ranker -r my_csv.json --config --my_csv_config.yaml
352
353						Note that we load all data in memory before dumping the JSON file. If you have a huge CSV (e.g.,>5M rows) please use a computer that has enough RAM.
354
355						=head1 SUMMARY
356
357						A script to convert a CSV to an input suitable for C
358
359						=head1 INSTALLATION
360
361						(only needed if you did not install C)
362
363						$ cpanm --installdeps .
364
365						=head3 System requirements
366
367						* Ideally a Debian-based distribution (Ubuntu or Mint), but any other (e.g., CentOs, OpenSuse) should do as well.
368						* Perl 5 (>= 5.10 core; installed by default in most Linux distributions). Check the version with "perl -v"
369						* 1GB of RAM.
370						* 1 core (it only uses one core per job).
371						* At least 1GB HDD.
372
373						=head1 HOW TO RUN CSV2PHENO-RANKER
374
375						The software requires a CSV file as the input and operates with default settings. By default, both the C file and the configuration file will be created in the same directory as the input file, and will share the same basename.
376
377						If you have columns with nested values make sure that you use C<--array-separator> to define the delimiting character (default is "\|").
378
379						If you want to change some parameters please take a look to the synopsis.
380
381						B
382
383						$ ./csv2pheno-ranker -i example.csv
384
385						$ ./csv2pheno-ranker -i example.csv --generate-primary-key --primary-key-name ID
386
387						$ ./csv2pheno-ranker -i example.csv --generate-primary-key --primary-key-name ID --output-dir /my-path --sep ';' --array-separator ','
388
389						=head2 COMMON ERRORS AND SOLUTIONS
390
391						* Error message: Foo
392						Solution: Bar
393
394						* Error message: Foo
395						Solution: Bar
396
397						=head1 AUTHOR
398
399						Written by Manuel Rueda, PhD. Info about CNAG can be found at L.
400
401						=head1 COPYRIGHT AND LICENSE
402
403						This PERL file is copyrighted. See the LICENSE file included in this distribution.
404
405						=cut