| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
#!/usr/bin/env perl |
|
2
|
|
|
|
|
|
|
# |
|
3
|
|
|
|
|
|
|
#   A utility from Pheno-Ranker to convert a CSV to: |
|
4
|
|
|
|
|
|
|
# |
|
5
|
|
|
|
|
|
|
# 1 - Input file (JSON array of objects) |
|
6
|
|
|
|
|
|
|
# 2 - Configuration file (needed for Pheno-Ranker) |
|
7
|
|
|
|
|
|
|
# |
|
8
|
|
|
|
|
|
|
# Last Modified: Mar/21/2024 |
|
9
|
|
|
|
|
|
|
# |
|
10
|
|
|
|
|
|
|
# $VERSION taken from Pheno::Ranker |
|
11
|
|
|
|
|
|
|
# |
|
12
|
|
|
|
|
|
|
# Copyright (C) 2023-2024 Manuel Rueda - CNAG (manuel.rueda@cnag.eu) |
|
13
|
|
|
|
|
|
|
# |
|
14
|
|
|
|
|
|
|
# License: Artistic License 2.0 |
|
15
|
|
|
|
|
|
|
# |
|
16
|
|
|
|
|
|
|
# If this program helps you in your research, please cite. |
|
17
|
|
|
|
|
|
|
|
|
18
|
1
|
|
|
1
|
|
3658
|
use strict; |
|
|
1
|
|
|
|
|
1
|
|
|
|
1
|
|
|
|
|
51
|
|
|
19
|
1
|
|
|
1
|
|
3
|
use warnings; |
|
|
1
|
|
|
|
|
4
|
|
|
|
1
|
|
|
|
|
41
|
|
|
20
|
1
|
|
|
1
|
|
539
|
use Getopt::Long qw(:config no_ignore_case); |
|
|
1
|
|
|
|
|
10795
|
|
|
|
1
|
|
|
|
|
4
|
|
|
21
|
1
|
|
|
1
|
|
561
|
use Pod::Usage; |
|
|
1
|
|
|
|
|
51675
|
|
|
|
1
|
|
|
|
|
319
|
|
|
22
|
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
### Main #####
# Run the whole conversion, then leave with the default (success) status.
process_csv();
##############
exit 0;
|
27
|
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
# Command-line entry point: parses and validates the options, then hands
# them to a CSV2PhenoRanker object that performs the conversion.
#
# Leaves through pod2usage() when the options are invalid or when
# --help/--man is requested; otherwise returns whatever
# CSV2PhenoRanker::run returns.
sub process_csv {
    my $VERSION = '0.09';

    # NOTE: the array separator is interpolated into a regex later on
    # (read_csv / split_array_fields), hence the escaped default. A value
    # given on the command line is used as a regex too, so regex
    # metacharacters in it must be escaped by the caller.
    my $array_sep = '\|';

    # Reading arguments
    GetOptions(
        'input|i=s'            => \my $input,                   # string
        'primary-key-name=s'   => \my $primary_key_name,        # string
        'generate-primary-key' => \my $generate_primary_key,    # flag
        'separator|sep=s'      => \my $sep,                     # str
        'array-separator=s'    => \$array_sep,                  # str
        'output-dir=s'         => \my $output_dir,              # str
        'help|?'               => \my $help,                    # flag
        'man'                  => \my $man,                     # flag
        'debug=i'              => \my $debug,                   # integer

        # Explicit 'v' alias: with no_ignore_case, '-v' would otherwise only
        # be an abbreviation, and an ambiguous one (verbose / version)
        'verbose|v' => \my $verbose,                            # flag
        'version|V' => sub { print "$0 Version $VERSION\n"; exit; }
    ) or pod2usage(2);
    pod2usage(1)                              if $help;
    pod2usage( -verbose => 2, -exitval => 0 ) if $man;

    # The input must exist and carry a .csv or .tsv extension
    pod2usage(
        -message =>
          "Please specify a valid CSV|TSV input with --i \n",
        -exitval => 1
    ) unless ( $input && $input =~ m/\.(csv|tsv)$/ && -f $input );

    # A generated primary key needs a name
    pod2usage(
        -message => "Please specify a --primary-key-name \n",
        -exitval => 1
    ) if ( $generate_primary_key && !$primary_key_name );

    # When given, the output directory must already exist
    pod2usage(
        -message => "Please specify a valid directory for --output-dir\n",
        -exitval => 1
    ) if ( defined $output_dir && !-d $output_dir );

    # Create object
    my $csv = CSV2PhenoRanker->new(
        {
            input                => $input,
            primary_key_name     => $primary_key_name,
            generate_primary_key => $generate_primary_key,
            output_dir           => $output_dir,
            sep                  => $sep,
            array_sep            => $array_sep,
            debug                => $debug,
            verbose              => $verbose
        }
    );

    # Run method
    return $csv->run;
}
|
79
|
|
|
|
|
|
|
|
|
80
|
|
|
|
|
|
|
package CSV2PhenoRanker; |
|
81
|
|
|
|
|
|
|
|
|
82
|
1
|
|
|
1
|
|
7
|
use strict; |
|
|
1
|
|
|
|
|
1
|
|
|
|
1
|
|
|
|
|
16
|
|
|
83
|
1
|
|
|
1
|
|
2
|
use warnings; |
|
|
1
|
|
|
|
|
1
|
|
|
|
1
|
|
|
|
|
60
|
|
|
84
|
1
|
|
|
1
|
|
393
|
use autodie; |
|
|
1
|
|
|
|
|
13409
|
|
|
|
1
|
|
|
|
|
6
|
|
|
85
|
1
|
|
|
1
|
|
8281
|
use feature qw(say); |
|
|
1
|
|
|
|
|
3
|
|
|
|
1
|
|
|
|
|
182
|
|
|
86
|
1
|
|
|
1
|
|
855
|
use Data::Dumper; |
|
|
1
|
|
|
|
|
9549
|
|
|
|
1
|
|
|
|
|
94
|
|
|
87
|
1
|
|
|
1
|
|
1043
|
use Path::Tiny; |
|
|
1
|
|
|
|
|
17113
|
|
|
|
1
|
|
|
|
|
126
|
|
|
88
|
1
|
|
|
1
|
|
10
|
use File::Basename; |
|
|
1
|
|
|
|
|
3
|
|
|
|
1
|
|
|
|
|
96
|
|
|
89
|
1
|
|
|
1
|
|
655
|
use File::Spec::Functions qw(catdir catfile); |
|
|
1
|
|
|
|
|
1073
|
|
|
|
1
|
|
|
|
|
127
|
|
|
90
|
1
|
|
|
1
|
|
555
|
use YAML::XS qw(LoadFile DumpFile); |
|
|
1
|
|
|
|
|
3818
|
|
|
|
1
|
|
|
|
|
105
|
|
|
91
|
1
|
|
|
1
|
|
873
|
use JSON::XS; |
|
|
1
|
|
|
|
|
7206
|
|
|
|
1
|
|
|
|
|
76
|
|
|
92
|
1
|
|
|
1
|
|
1649
|
use Text::CSV_XS; |
|
|
1
|
|
|
|
|
24860
|
|
|
|
1
|
|
|
|
|
2020
|
|
|
93
|
|
|
|
|
|
|
|
|
94
|
|
|
|
|
|
|
# Constructor: blesses the caller-supplied hash ref of settings into $class
# and returns it (the very same reference, not a copy).
sub new {
    my ( $class, $self ) = @_;
    return bless $self, $class;
}
|
99
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
# Converts the input CSV/TSV into the two files used by Pheno-Ranker:
#   <output_dir>/<basename>.json         - the rows as a JSON array of objects
#   <output_dir>/<basename>_config.yaml  - the matching configuration
# Output goes to $self->{output_dir} when defined, otherwise to the directory
# of the input file. Returns 1; dies (from read_csv / create_config) when the
# input cannot be read or the primary key is not found.
sub run {
    my $self = shift;

    # Read the input file
    my ( $data, $arrays, $non_arrays ) = $self->read_csv();

    # Build the configuration before writing anything: create_config() dies
    # on an unknown primary key, and failing here avoids leaving a JSON file
    # behind without its configuration.
    my $config = $self->create_config( $arrays, $non_arrays );

    # Define output directory
    my ( $name, $path ) = fileparse( $self->{input}, qr/\.[^.]*/ );
    my $output_dir = $self->{output_dir} // $path;    # Use defined-or operator

    # Write data as JSON
    my $json_file = catfile( $output_dir, qq/${name}.json/ );
    say "Writing <$json_file> " if $self->{verbose};
    write_json( { filepath => $json_file, data => $data } );

    # Write the configuration file as YAML
    my $yaml_file = catfile( $output_dir, qq/${name}_config.yaml/ );
    say "Writing <$yaml_file> " if $self->{verbose};
    write_yaml( { filepath => $yaml_file, data => $config } );
    return 1;
}
|
124
|
|
|
|
|
|
|
|
|
125
|
|
|
|
|
|
|
# Encodes $arg->{data} as pretty-printed, key-sorted, UTF-8 JSON and writes
# the result to $arg->{filepath}. Returns 1.
sub write_json {
    my $arg = shift;
    my ( $filepath, $payload ) = @{$arg}{qw(filepath data)};

    # Note that canonical DOES not match the order of nsort from Sort::Naturally
    my $encoder = JSON::XS->new->utf8->canonical->pretty;
    path($filepath)->spew( $encoder->encode($payload) );
    return 1;
}
|
135
|
|
|
|
|
|
|
|
|
136
|
|
|
|
|
|
|
# Dumps $arg->{data} as YAML into $arg->{filepath} with YAML::XS::DumpFile.
# Returns 1.
sub write_yaml {
    my $arg = shift;
    my ( $filepath, $payload ) = @{$arg}{qw(filepath data)};

    # Set the YAML::XS Boolean option to 'JSON::PP' for this call only
    local $YAML::XS::Boolean = 'JSON::PP';
    DumpFile( $filepath, $payload );
    return 1;
}
|
144
|
|
|
|
|
|
|
|
|
145
|
|
|
|
|
|
|
# Reads $self->{input} with Text::CSV_XS and returns three array refs:
#   1. the rows, one hash ref per record keyed by header name (values of
#      array columns are turned into array refs by split_array_fields)
#   2. the names of the columns in which at least one value matched the
#      array separator (unordered)
#   3. the remaining column names, in header order
# Dies when no header line can be read, or when --generate-primary-key is
# used with a primary key name that already is a column.
sub read_csv {
    my $self                 = shift;
    my $input                = $self->{input};
    my $primary_key_name     = $self->{primary_key_name};    # has to be non-array
    my $generate_primary_key = $self->{generate_primary_key};
    my $sep                  = $self->{sep};
    my $array_sep            = $self->{array_sep};
    my $array_sep_qr         = qr/$array_sep/;

    # Create a Text::CSV_XS object; the field separator is the one given by
    # the user or, failing that, the one derived from the file extension
    # (see define_separator)
    my $csv = Text::CSV_XS->new(
        {
            binary    => 1,
            sep_char  => define_separator( $input, $sep ),
            auto_diag => 1
        }
    );

    # Open filehandle (failures are raised by autodie)
    open my $fh, '<:encoding(utf-8)', $input;

    # Parse the CSV data
    my $headers = $csv->getline($fh);

    # getline gives undef when there is nothing to read; stop here with a
    # readable message instead of failing on the dereference below
    die "Could not read a header line from <$input> (empty file?)\n"
      unless $headers;

    # Get rid of problematic characters on headers
    tr/()//d for @$headers;

    # Add $primary_key_name to headers if $generate_primary_key
    if ($generate_primary_key) {

        # Check that primary_key_name does not exist
        die
"Primary key <$primary_key_name> already exists. Are you sure you need the <--generate-primary-key> flag?\n"
          if ( grep { $_ eq $primary_key_name } @$headers );

        # Make it last element of the array
        push @$headers, $primary_key_name;
    }

    #####################
    # START READING CSV #
    #####################

    my ( @rows, %array );
    my $count = 1;
    say "Start reading <$input>" if $self->{verbose};
    while ( my $row = $csv->getline($fh) ) {

        # Print if verbose
        say "Reading row $count..."
          if ( $self->{verbose} && $count % 1_000 == 0 );

        # Load data
        my %data;
        @data{@$headers} = @$row;

        # Add id if $generate_primary_key. It is stored by name (not pushed
        # onto the row) so that it lands in the right column even when the
        # record has more or fewer fields than the header
        $data{$primary_key_name} = 'PR_' . sprintf( "%08d", $count )
          if $generate_primary_key;

        # *** IMPORTANT ***
        # Columns can consist of arrays or strings
        # Here we load all as strings and we re-format array fields a posteriori

        for my $key ( keys %data ) {

            # Check array/non-array based on regex. Values are undef when
            # the record is shorter than the header
            $array{$key}++
              if ( defined $data{$key} && $data{$key} =~ $array_sep_qr );
        }

        push @rows, \%data;
        $count++;
    }
    close $fh;
    say "Reading <$input> completed!" if $self->{verbose};

    ###################
    # END READING CSV #
    ###################

    # Load array/non-array
    my @array = keys %array;

    # Filter the original array to exclude elements found in the @array
    my @non_array = grep { !$array{$_} } @$headers;

    # Re-arrange array variables
    split_array_fields( \@rows, \@array, $array_sep );

    return ( \@rows, \@array, \@non_array );
}
|
235
|
|
|
|
|
|
|
|
|
236
|
|
|
|
|
|
|
# Replaces, in every row of @$rows, the string stored under each key listed
# in @$array by an array ref holding the pieces obtained by splitting that
# string on $array_sep (interpolated as a regex).
# The rows are modified in place; an undefined value becomes an empty array
# ref. Returns 1.
sub split_array_fields {
    my ( $rows, $array, $array_sep ) = @_;

    # Modify the original data structure directly
    for my $row (@$rows) {
        for my $key (@$array) {
            my $value = $row->{$key};

            # Skip split() on undef: same result (an empty list) without the
            # "uninitialized value" warning
            $row->{$key} =
              defined $value ? [ split /$array_sep/, $value ] : [];
        }
    }
    return 1;
}
|
248
|
|
|
|
|
|
|
|
|
249
|
|
|
|
|
|
|
# Builds the configuration hash ref that is later dumped as YAML.
#   $array     - array ref with the names of the array columns
#   $non_array - array ref with the names of the remaining columns
# The primary key is $self->{primary_key_name} when defined (dies if it is
# not one of the columns); otherwise 'id' if such a non-array column exists,
# else the first non-array column in sorted order.
# Keys set: format, primary_key, allowed_terms and, only when there are
# array columns, array_terms and id_correspondence.
sub create_config {
    my ( $self, $array, $non_array ) = @_;

    my @array_terms  = sort @$array;
    my @scalar_terms = sort @$non_array;

    # Look-up tables
    my %is_scalar = map { $_ => 1 } @scalar_terms;
    my %is_column = ( %is_scalar, map { $_ => 1 } @array_terms );

    # Set primary key
    my $primary_key = $self->{primary_key_name};
    if ( !defined $primary_key ) {
        $primary_key = $is_scalar{id} ? 'id' : $scalar_terms[0];
    }
    elsif ( !$is_column{$primary_key} ) {
        die "Primary-key <$primary_key> not found\n";
    }

    # Default for non-arrays
    my %config = (
        format        => 'CSV',
        primary_key   => $primary_key,
        allowed_terms => [@scalar_terms]
    );
    return \%config unless @array_terms;

    # Update for arrays.
    # NB: every list below is a fresh copy, the same array ref is never
    # stored twice in the structure
    $config{array_terms} = [@array_terms];

    # Both lists are sorted, but the merged list has to be sorted again
    $config{allowed_terms} = [ sort @scalar_terms, @array_terms ];

    # Each array column maps onto itself
    $config{id_correspondence} =
      { CSV => [ map +{ $_ => $_ }, @array_terms ] };

    return \%config;
}
|
298
|
|
|
|
|
|
|
|
|
299
|
|
|
|
|
|
|
# Chooses the field separator handed to Text::CSV_XS.
# A defined $sep always wins; otherwise the suffix of $filepath decides:
# tab for '.tsv', comma for '.csv' and for anything else.
sub define_separator {
    my ( $filepath, $sep ) = @_;

    # fileparse returns the matched suffix as its third element
    my $suffix = ( fileparse( $filepath, qw(.csv .tsv) ) )[2];

    # A user-supplied separator takes precedence over the extension
    return $sep if defined $sep;

    # Use tab for tsv files
    return "\t" if $suffix eq '.tsv';

    # Use comma for csv files, and as the default when nothing matched
    return ',';
}
|
317
|
|
|
|
|
|
|
|
|
318
|
|
|
|
|
|
|
1; |
|
319
|
|
|
|
|
|
|
|
|
320
|
|
|
|
|
|
|
=head1 NAME |
|
321
|
|
|
|
|
|
|
|
|
322
|
|
|
|
|
|
|
csv2pheno-ranker: A script to convert a CSV to an input suitable for Pheno-Ranker |
|
323
|
|
|
|
|
|
|
|
|
324
|
|
|
|
|
|
|
=head1 SYNOPSIS |
|
325
|
|
|
|
|
|
|
|
|
326
|
|
|
|
|
|
|
csv2pheno-ranker -i <input.csv> [-options] |
|
327
|
|
|
|
|
|
|
|
|
328
|
|
|
|
|
|
|
Arguments: |
|
329
|
|
|
|
|
|
|
-i, --input CSV file |
|
330
|
|
|
|
|
|
|
|
|
331
|
|
|
|
|
|
|
Options: |
|
332
|
|
|
|
|
|
|
-generate-primary-key Generates a primary key if absent. Use --primary-key-name to set its name |
|
333
|
|
|
|
|
|
|
-primary-key-name Sets the name for the primary key. Must be a single, non-array field |
|
334
|
|
|
|
|
|
|
-sep, --separator Delimiter for CSV fields [comma for .csv, tab for .tsv] (e.g., --sep $'\t' for tabs) |
|
335
|
|
|
|
|
|
|
-array-separator Delimiter for nested arrays [|] (e.g., --array-separator ';' for semicolons) |
|
336
|
|
|
|
|
|
|
-output-dir Specify the directory where output files will be stored. If not specified, outputs will be placed in the same directory as the input file |
|
337
|
|
|
|
|
|
|
|
|
338
|
|
|
|
|
|
|
Generic Options: |
|
339
|
|
|
|
|
|
|
-debug Print debugging (from 1 to 5, being 5 max) |
|
340
|
|
|
|
|
|
|
-h, --help Brief help message |
|
341
|
|
|
|
|
|
|
-man Full documentation |
|
342
|
|
|
|
|
|
|
-v, --verbose Verbosity on |
|
343
|
|
|
|
|
|
|
-V, --version Print version |
|
344
|
|
|
|
|
|
|
|
|
345
|
|
|
|
|
|
|
=head1 DESCRIPTION |
|
346
|
|
|
|
|
|
|
|
|
347
|
|
|
|
|
|
|
Numerous tools exist for CSV to JSON conversion, but our focus here was on creating JSON specifically for C<Pheno-Ranker>. The script supports both basic CSV files and complex, comma-separated CSV files with nested fields, ensuring seamless C<Pheno-Ranker> integration. |
|
348
|
|
|
|
|
|
|
|
|
349
|
|
|
|
|
|
|
The script will create both a JSON file and the configuration file for C<Pheno-Ranker>. Then, you can run C<Pheno-Ranker> as: |
|
350
|
|
|
|
|
|
|
|
|
351
|
|
|
|
|
|
|
$ pheno-ranker -r my_csv.json --config my_csv_config.yaml |
|
352
|
|
|
|
|
|
|
|
|
353
|
|
|
|
|
|
|
Note that we load all data in memory before dumping the JSON file. If you have a huge CSV (e.g.,>5M rows) please use a computer that has enough RAM. |
|
354
|
|
|
|
|
|
|
|
|
355
|
|
|
|
|
|
|
=head1 SUMMARY |
|
356
|
|
|
|
|
|
|
|
|
357
|
|
|
|
|
|
|
A script to convert a CSV to an input suitable for C<Pheno-Ranker> |
|
358
|
|
|
|
|
|
|
|
|
359
|
|
|
|
|
|
|
=head1 INSTALLATION |
|
360
|
|
|
|
|
|
|
|
|
361
|
|
|
|
|
|
|
(only needed if you did not install C<Pheno-Ranker>) |
|
362
|
|
|
|
|
|
|
|
|
363
|
|
|
|
|
|
|
$ cpanm --installdeps . |
|
364
|
|
|
|
|
|
|
|
|
365
|
|
|
|
|
|
|
=head3 System requirements |
|
366
|
|
|
|
|
|
|
|
|
367
|
|
|
|
|
|
|
* Ideally a Debian-based distribution (Ubuntu or Mint), but any other (e.g., CentOs, OpenSuse) should do as well. |
|
368
|
|
|
|
|
|
|
* Perl 5 (>= 5.10 core; installed by default in most Linux distributions). Check the version with "perl -v" |
|
369
|
|
|
|
|
|
|
* 1GB of RAM. |
|
370
|
|
|
|
|
|
|
* 1 core (it only uses one core per job). |
|
371
|
|
|
|
|
|
|
* At least 1GB HDD. |
|
372
|
|
|
|
|
|
|
|
|
373
|
|
|
|
|
|
|
=head1 HOW TO RUN CSV2PHENO-RANKER |
|
374
|
|
|
|
|
|
|
|
|
375
|
|
|
|
|
|
|
The software requires a CSV file as the input and operates with default settings. By default, both the C<JSON> file and the configuration file will be created in the same directory as the input file, and will share the same basename. |
|
376
|
|
|
|
|
|
|
|
|
377
|
|
|
|
|
|
|
If you have columns with nested values make sure that you use C<--array-separator> to define the delimiting character (default is "|"). |
|
378
|
|
|
|
|
|
|
|
|
379
|
|
|
|
|
|
|
If you want to change some parameters, please take a look at the synopsis. |
|
380
|
|
|
|
|
|
|
|
|
381
|
|
|
|
|
|
|
B<Examples:> |
|
382
|
|
|
|
|
|
|
|
|
383
|
|
|
|
|
|
|
$ ./csv2pheno-ranker -i example.csv |
|
384
|
|
|
|
|
|
|
|
|
385
|
|
|
|
|
|
|
$ ./csv2pheno-ranker -i example.csv --generate-primary-key --primary-key-name ID |
|
386
|
|
|
|
|
|
|
|
|
387
|
|
|
|
|
|
|
$ ./csv2pheno-ranker -i example.csv --generate-primary-key --primary-key-name ID --output-dir /my-path --sep ';' --array-separator ',' |
|
388
|
|
|
|
|
|
|
|
|
389
|
|
|
|
|
|
|
=head2 COMMON ERRORS AND SOLUTIONS |
|
390
|
|
|
|
|
|
|
|
|
391
|
|
|
|
|
|
|
* Error message: Foo |
|
392
|
|
|
|
|
|
|
Solution: Bar |
|
393
|
|
|
|
|
|
|
|
|
394
|
|
|
|
|
|
|
* Error message: Foo |
|
395
|
|
|
|
|
|
|
Solution: Bar |
|
396
|
|
|
|
|
|
|
|
|
397
|
|
|
|
|
|
|
=head1 AUTHOR |
|
398
|
|
|
|
|
|
|
|
|
399
|
|
|
|
|
|
|
Written by Manuel Rueda, PhD. Info about CNAG can be found at L<https://www.cnag.eu>. |
|
400
|
|
|
|
|
|
|
|
|
401
|
|
|
|
|
|
|
=head1 COPYRIGHT AND LICENSE |
|
402
|
|
|
|
|
|
|
|
|
403
|
|
|
|
|
|
|
This PERL file is copyrighted. See the LICENSE file included in this distribution. |
|
404
|
|
|
|
|
|
|
|
|
405
|
|
|
|
|
|
|
=cut |