File Coverage

utils/csv2pheno_ranker/csv2pheno-ranker

Criterion	Covered	Total	%
statement	146	149	97.9
branch	21	46	45.6
condition	7	21	33.3
subroutine	24	25	96.0
pod			n/a
total	198	241	82.1

line	stmt	bran	cond	sub	time	code
1						#!/usr/bin/env perl
2						#
3						# An utility from Pheno-Ranker to convert a CSV to:
4						#
5						# 1 - Input file (JSON array of objects)
6						# 2 - Configuration file (needed for Pheno-Ranker)
7						#
8						# Last Modified: Mar/21/2024
9						#
10						# $VERSION taken from Pheno::Ranker
11						#
12						# Copyright (C) 2023-2024 Manuel Rueda - CNAG (manuel.rueda@cnag.eu)
13						#
14						# License: Artistic License 2.0
15						#
16						# If this program helps you in your research, please cite.
17
18	1			1	4176	use strict;
	1				1
	1				34
19	1			1	8	use warnings;
	1				1
	1				43
20	1			1	646	use Getopt::Long qw(:config no_ignore_case);
	1				11912
	1				4
21	1			1	605	use Pod::Usage;
	1				55781
	1				355
22
23						### Main #####
24	1				120470	process_csv();
25						##############
26	1				0	exit;
27
28						sub process_csv {
29
30	1			1	3	my $VERSION = '0.09';
31	1				2	my $array_sep = '\\|';
32
33						# Reading arguments
34						GetOptions(
35						'input\|i=s' => \my $input, # string
36						'primary-key-name=s' => \my $primary_key_name, # string
37						'generate-primary-key' => \my $generate_primary_key, # flag
38						'separator\|sep=s' => \my $sep, # str
39						'array-separator=s' => \$array_sep, # str
40						'output-dir=s' => \my $output_dir, # str
41						'help\|?' => \my $help, # flag
42						'man' => \my $man, # flag
43						'debug=i' => \my $debug, # integer
44						'verbose\|' => \my $verbose, # flag
45	0			0	0	'version\|V' => sub { print "$0 Version $VERSION\n"; exit; }
	0				0
46	1	50			12	) or pod2usage(2);
47	1	50			1298	pod2usage(1) if $help;
48	1	50			2	pod2usage( -verbose => 2, -exitval => 0 ) if $man;
49	1	50	33		45	pod2usage(
			33
50						-message =>
51						"Please specify a valid CSV\|TSV input with --i \n",
52						-exitval => 1
53						) unless ( $input && $input =~ m/\.(csv\|tsv)$/ && -f $input );
54	1	50	33		5	pod2usage(
55						-message => "Please specify a --primary-key-name \n",
56						-exitval => 1
57						) if ( $generate_primary_key && !$primary_key_name );
58	1	50	33		3	pod2usage(
59						-message => "Please specify a valid directory for --output-dir\n",
60						-exitval => 1
61						) if ( defined $output_dir && !-d $output_dir );
62
63						# Create object
64	1				16	my $csv = CSV2PhenoRanker->new(
65						{
66						input => $input,
67						primary_key_name => $primary_key_name,
68						generate_primary_key => $generate_primary_key,
69						output_dir => $output_dir,
70						sep => $sep,
71						array_sep => $array_sep,
72						debug => $debug,
73						verbose => $verbose
74						}
75						);
76
77						# Run method
78	1				4	$csv->run;
79						}
80
81						package CSV2PhenoRanker;
82
83	1			1	7	use strict;
	1				1
	1				18
84	1			1	3	use warnings;
	1				1
	1				35
85	1			1	422	use autodie;
	1				14366
	1				3
86	1			1	4515	use feature qw(say);
	1				5
	1				120
87	1			1	603	use Data::Dumper;
	1				6296
	1				62
88	1			1	794	use Path::Tiny;
	1				11579
	1				68
89	1			1	7	use File::Basename;
	1				1
	1				76
90	1			1	450	use File::Spec::Functions qw(catdir catfile);
	1				687
	1				66
91	1			1	359	use YAML::XS qw(LoadFile DumpFile);
	1				2625
	1				55
92	1			1	615	use JSON::XS;
	1				4467
	1				55
93	1			1	1239	use Text::CSV_XS;
	1				17850
	1				1286
94
95						sub new {
96	1			1	2	my ( $class, $self ) = @_;
97	1				1	bless $self, $class;
98	1				2	return $self;
99						}
100
101						sub run {
102
103	1			1	1	my $self = shift;
104
105						# Read the input file
106	1				4	my ( $data, $arrays, $non_arrays ) = $self->read_csv();
107
108						# Define output directory
109	1				50	my ( $name, $path, undef ) = fileparse( $self->{input}, qr/\.[^.]*/ );
110	1		33		6	my $output_dir = $self->{output_dir} // $path; # Use defined-or operator
111
112						# Write data as JSON
113	1				7	my $json_file = catfile( $output_dir, qq/${name}.json/ );
114	1	50			4	say "Writting <$json_file> " if $self->{verbose};
115	1				6	write_json( { filepath => $json_file, data => $data } );
116
117						# Load the configuration file data
118	1				5	my $config = $self->create_config( $arrays, $non_arrays );
119
120						# Write the configuration file as YAML
121	1				7	my $yaml_file = catfile( $output_dir, qq/${name}_config.yaml/ );
122	1	50			3	say "Writting <$yaml_file> " if $self->{verbose};
123	1				5	write_yaml( { filepath => $yaml_file, data => $config } );
124	1				19	return 1;
125						}
126
127						sub write_json {
128
129	1			1	1	my $arg = shift;
130	1				2	my $file = $arg->{filepath};
131	1				2	my $json_data = $arg->{data};
132
133						# Note that canonical DOES not match the order of nsort from Sort::Naturally
134	1				50	my $json = JSON::XS->new->utf8->canonical->pretty->encode($json_data);
135	1				10	path($file)->spew($json);
136	1				5605	return 1;
137						}
138
139						sub write_yaml {
140
141	1			1	1	my $arg = shift;
142	1				2	my $file = $arg->{filepath};
143	1				2	my $json_data = $arg->{data};
144	1				2	local $YAML::XS::Boolean = 'JSON::PP';
145	1				4	DumpFile( $file, $json_data );
146	1				384	return 1;
147						}
148
149						sub read_csv {
150
151	1			1	2	my $self = shift;
152	1				6	my $input = $self->{input};
153	1				2	my $primary_key_name = $self->{primary_key_name}; # has to be non-array
154	1				2	my $generate_primary_key = $self->{generate_primary_key};
155	1				1	my $sep = $self->{sep};
156	1				2	my $array_sep = $self->{array_sep};
157	1				15	my $array_sep_qr = qr/$array_sep/;
158
159						# Create a Text::CSV object with semicolon as the separator
160	1				4	my $csv = Text::CSV_XS->new(
161						{
162						binary => 1,
163						sep_char => define_separator( $input, $sep ),
164						auto_diag => 1
165						}
166						);
167
168						# Open filehandle
169	1				186	open my $fh, '<:encoding(utf-8)', $input;
170
171						# Parse the CSV data
172	1				3335	my $headers = $csv->getline($fh);
173
174						# Get rid of problematic characters on headers
175	1				39	$_ =~ tr/()//d for @$headers;
176
177						# Add $primary_key_name to headers if $generate_primary_key
178	1	50			3	if ($generate_primary_key) {
179
180						# Check that primary_key_name does not exist
181						die
182						"Primary key <$primary_key_name> already exists. Are you sure you need the <--generate-primary-key> flag?\n"
183	1	50			2	if ( grep { $_ eq $primary_key_name } @$headers );
	3				8
184
185						# Make it last element of the array
186	1	50			3	push @$headers, $primary_key_name if $generate_primary_key;
187						}
188
189						#####################
190						# START READING CSV #
191						#####################
192
193	1				2	my ( @rows, %array, %non_array );
194	1				1	my $count = 1;
195	1	50			7	say "Start reading <$input>" if $self->{verbose};
196	1				9	while ( my $row = $csv->getline($fh) ) {
197
198						# Print if verbose
199						say "Reading row $count..."
200	3	50	33		11	if ( $self->{verbose} && $count % 1_000 == 0 );
201
202						# Add id if $generate_primary_key
203	3	50			11	push @$row, 'PR_' . sprintf( "%08d", $count ) if $generate_primary_key;
204
205						# Load data
206	3				1	my %data;
207	3				10	@data{@$headers} = @$row;
208
209						# * IMPORTANT *
210						# Columns can consist of arrays or strings
211						# Here we load all as strings and we re-format array fields a posteriori
212
213	3				6	for my $key ( keys %data ) {
214
215						# Check array/non-array based on regex
216	12	100			29	$array{$key}++ if $data{$key} =~ $array_sep_qr;
217						}
218
219	3				6	push @rows, \%data;
220	3				28	$count++;
221						}
222	1				32	close $fh;
223	1	50			763	say "Reading <$input> completed!" if $self->{verbose};
224
225						###################
226						# END READING CSV #
227						###################
228
229						# Load array/non-array
230	1				3	my @array = keys %array;
231
232						# Filter the original array to exclude elements found in the @array
233	1				3	my @non_array = grep { !$array{$_} } @$headers;
	4				8
234
235						# Re-arrange array variables
236	1				5	split_array_fields( \@rows, \@array, $array_sep );
237
238	1				16	return ( \@rows, \@array, \@non_array );
239						}
240
241						sub split_array_fields {
242
243	1			1	2	my ( $rows, $array, $array_sep ) = @_;
244
245						# Split array fields (comma-separated values) into an array_ref
246						# Modify the original data structure directly
247	1				2	for my $row (@$rows) {
248	3				4	for my $key (@$array) {
249	3				22	$row->{$key} = [ split /$array_sep/, $row->{$key} ];
250						}
251						}
252	1				1	return 1;
253						}
254
255						sub create_config {
256
257	1			1	2	my ( $self, $array, $non_array ) = @_;
258	1				4	my $primary_key_name = $self->{primary_key_name};
259
260	1				3	my @arrays = sort @$array;
261	1				5	my @non_arrays = sort @$non_array;
262
263						# Convert arrays to hashes for quick look up
264	1				1	my %arrays_hash = map { $_ => 1 } @arrays;
	1				4
265	1				2	my %non_arrays_hash = map { $_ => 1 } @non_arrays;
	3				6
266
267						# Set primary key
268	1	50			3	if ( defined $primary_key_name ) {
269						die "Primary-key <$primary_key_name> not found\n"
270						unless ( exists $arrays_hash{$primary_key_name}
271	1	50	33		6	\|\| exists $non_arrays_hash{$primary_key_name} );
272						}
273
274						else {
275	0	0			0	$primary_key_name = exists $non_arrays_hash{id} ? 'id' : $non_arrays[0];
276						}
277
278						# Default for non-arrays
279	1				5	my $config = {
280						format => 'CSV',
281						primary_key => $primary_key_name,
282						allowed_terms => [@non_arrays]
283						};
284
285						# Update for arrays
286	1	50			18	if ( scalar @arrays ) {
287
288						# NB: Can't use $array more than once in the hash ref below. Need to deref
289	1				3	$config->{array_terms} = [@arrays];
290
291						# @non_arrays, @arrays are sorted, but if we merge them we need to re-sort
292	1				5	$config->{allowed_terms} = [ sort @non_arrays, @arrays ];
293						$config->{id_correspondence} = {
294						CSV => [
295						map {
296	1				14	my $val = { $_ => $_ };
	1				2
297	1				4	$val
298						} @arrays
299						]
300						};
301
302						}
303	1				4	return $config;
304						}
305
306						sub define_separator {
307
308	1			1	2	my ( $filepath, $sep ) = @_;
309
310						# Define split record separator from file extension
311	1				3	my @exts = qw(.csv .tsv);
312	1				62	my ( $undef, undef, $ext ) = fileparse( $filepath, @exts );
313
314						# Defining separator character
315	1	0			4	my $separator =
		0
		50
316						defined($sep) ? $sep
317						: $ext eq '.csv' ? ','
318						: # Use comma for csv files
319						$ext eq '.tsv' ? "\t" : # Use tab for tsv files
320						','; # Default to comma if no extension match or $sep is undefined
321
322						# Return separator
323	1				12	return $separator;
324						}
325
326						1;
327
328						=head1 NAME
329
330						csv2pheno-ranker: A script to convert a CSV to an input suitable for Pheno-Ranker
331
332						=head1 SYNOPSIS
333
334						csv2pheno-ranker -i [-options]
335
336						Arguments:
337						-i, --input CSV file
338
339						Options:
340						-generate-primary-key Generates a primary key if absent. Use --primary-key-name to set its name
341						-primary-key-name Sets the name for the primary key. Must be a single, non-array field
342						-sep, --separator Delimiter for CSV fields [;] (e.g., --sep $'\t' for tabs)
343						-array-separator Delimiter for nested arrays [\|] (e.g., --array-separator ';' for semicolons)
344						-output-dir Specify the directory where output files will be stored. If not specified, outputs will be placed in the same directory as the input file
345
346						Generic Options:
347						-debug Print debugging (from 1 to 5, being 5 max)
348						-h, --help Brief help message
349						-man Full documentation
350						-v, --verbose Verbosity on
351						-V, --version Print version
352
353						=head1 DESCRIPTION
354
355						Numerous tools exist for CSV to JSON conversion, but our focus here was on creating JSON specifically for C. The script supports both basic CSV files and complex, comma-separated CSV files with nested fields, ensuring seamless C integration.
356
357						The script will create both a JSON file and the configuration file for C. Then, you can run C as:
358
359						$ pheno-ranker -r my_csv.json --config --my_csv_config.yaml
360
361						Note that we load all data in memory before dumping the JSON file. If you have a huge CSV (e.g.,>5M rows) please use a computer that has enough RAM.
362
363						=head1 SUMMARY
364
365						A script to convert a CSV to an input suitable for C
366
367						=head1 INSTALLATION
368
369						(only needed if you did not install C)
370
371						$ cpanm --installdeps .
372
373						=head3 System requirements
374
375						* Ideally a Debian-based distribution (Ubuntu or Mint), but any other (e.g., CentOs, OpenSuse) should do as well.
376						* Perl 5 (>= 5.10 core; installed by default in most Linux distributions). Check the version with "perl -v"
377						* 1GB of RAM.
378						* 1 core (it only uses one core per job).
379						* At least 1GB HDD.
380
381						=head1 HOW TO RUN CSV2PHENO-RANKER
382
383						The software requires a CSV file as the input and operates with default settings. By default, both the C file and the configuration file will be created in the same directory as the input file, and will share the same basename.
384
385						If you have columns with nested values make sure that you use C<--array-separator> to define the delimiting character (default is "\|").
386
387						If you want to change some parameters please take a look to the synopsis.
388
389						B
390
391						$ ./csv2pheno-ranker -i example.csv
392
393						$ ./csv2pheno-ranker -i example.csv --generate-primary-key --primary-key-name ID
394
395						$ ./csv2pheno-ranker -i example.csv --generate-primary-key --primary-key-name ID --output-dir /my-path --sep ';' --array-separator ','
396
397						=head2 COMMON ERRORS AND SOLUTIONS
398
399						* Error message: Foo
400						Solution: Bar
401
402						* Error message: Foo
403						Solution: Bar
404
405						=head1 AUTHOR
406
407						Written by Manuel Rueda, PhD. Info about CNAG can be found at L.
408
409						=head1 COPYRIGHT AND LICENSE
410
411						This PERL file is copyrighted. See the LICENSE file included in this distribution.
412
413						=cut