package Convert::Pheno::CSV;

use strict;
use warnings;
use autodie;
use feature qw(say);
use File::Basename;
use Text::CSV_XS qw(csv);
use Sort::Naturally qw(nsort);
use List::Util qw(any);
use File::Spec::Functions qw(catdir);
use IO::Compress::Gzip qw($GzipError);
use IO::Uncompress::Gunzip qw($GunzipError);

#use Devel::Size qw(size total_size);
use Convert::Pheno;
use Convert::Pheno::OMOP;
use Convert::Pheno::IO;
use Convert::Pheno::Schema;
use Convert::Pheno::Mapping;
use Exporter 'import';
our @EXPORT =
  qw(read_csv read_csv_stream read_redcap_dict_and_mapping_file transpose_ohdsi_dictionary read_sqldump_stream read_sqldump sqldump2csv transpose_omop_data_structure open_filehandle load_exposures transpose_visit_occurrence);

use constant DEVEL_MODE => 0;

#########################
#########################
# SUBROUTINES FOR CSV #
#########################
#########################

sub read_redcap_dictionary {

    my $filepath = shift;

    # Define split record separator from file extension
    my ( $separator, $encoding ) = define_separator( $filepath, undef );

    # We'll create an HoH using the 'Variable / Field Name' column as the 1D key
    my $key = 'Variable / Field Name';

    # We'll be adding the key <_labels>. See sub add_labels
    my $labels = 'Choices, Calculations, OR Slider Labels';

    # Loading data directly from Text::CSV_XS
    # NB1: We want an HoH, whereas sub read_csv returns an AoH
    # NB2: By default the Text::CSV module treats all fields in a CSV file as strings, regardless of their actual data type.
    my $hoh = csv(
        in       => $filepath,
        sep_char => $separator,

        #binary    => 1,    # default
        auto_diag => 1,
        encoding  => $encoding,
        key       => $key,
        on_in     => sub { $_{_labels} = add_labels( $_{$labels} ) }
    );
    return $hoh;
}

sub add_labels {

    my $value = shift;

    # *** IMPORTANT ***
    # This sub can return undef, i.e., $_{labels} = undef
    # That's OK as we won't perform exists $_{_label}
    # Note that in $hoh (above) empty columns are key = ''.

    # Premature return if empty ('' = 0)
    return undef unless $value;    # perlcritic Severity: 5

    # We'll skip values that don't provide even number of key-values
    my @tmp = map { s/^\s//; s/\s+$//; $_; }
      ( split /\||,/, $value );    # perlcritic Severity: 5

    # Return undef for non-valid entries
    return @tmp % 2 == 0 ? {@tmp} : undef;
}
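
# Illustrative example (not executed; values are hypothetical): a REDCap
# 'Choices' string such as '1, Mild | 2, Moderate | 3, Severe' is split on
# '|' and ',' and returned as { 1 => 'Mild', 2 => 'Moderate', 3 => 'Severe' };
# a string that does not yield an even number of items returns undef.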

sub read_redcap_dict_and_mapping_file {

    my $arg = shift;

    # Read and load REDCap CSV dictionary
    my $data_redcap_dict = read_redcap_dictionary( $arg->{redcap_dictionary} );

    # Read and load mapping file
    my $data_mapping_file =
      io_yaml_or_json( { filepath => $arg->{mapping_file}, mode => 'read' } );

    # Validate mapping file against JSON schema
    my $jv = Convert::Pheno::Schema->new(
        {
            data        => $data_mapping_file,
            debug       => $arg->{self_validate_schema},
            schema_file => $arg->{schema_file}
        }
    );
    $jv->json_validate;

    # Return if successful
    return ( $data_redcap_dict, $data_mapping_file );
}

sub transpose_ohdsi_dictionary {

    my $data   = shift;
    my $column = 'concept_id';

    # The idea is the following:
    # $data comes as an array (from SQL/CSV)
    #
    # $VAR1 = [
    #          {
    #            'concept_class_id' => '4-char billing code',
    #            'concept_code' => 'K92.2',
    #            'concept_id' => 35208414,
    #            'concept_name' => 'Gastrointestinal hemorrhage, unspecified',
    #            'domain_id' => 'Condition',
    #            'invalid_reason' => undef,
    #            'standard_concept' => undef,
    #            'valid_end_date' => '2099-12-31',
    #            'valid_start_date' => '2007-01-01',
    #            'vocabulary_id' => 'ICD10CM'
    #          },
    #
    # and we convert it to hash to allow for quick searches by 'concept_id'
    #
    # $VAR1 = {
    #          '1107830' => {
    #                         'concept_class_id' => 'Ingredient',
    #                         'concept_code' => 28889,
    #                         'concept_id' => 1107830,
    #                         'concept_name' => 'Loratadine',
    #                         'domain_id' => 'Drug',
    #                         'invalid_reason' => undef,
    #                         'standard_concept' => 'S',
    #                         'valid_end_date' => '2099-12-31',
    #                         'valid_start_date' => '1970-01-01',
    #                         'vocabulary_id' => 'RxNorm'
    #                       },
    #
    # NB: We store all columns yet we'll use 4:
    # 'concept_id', 'concept_code', 'concept_name', 'vocabulary_id'
    # Note that we're duplicating @$data with $hoh
    #my $hoh = { map { $_->{$column} => $_ } @{$data} }; <-- map is slower than for
    my $hoh;
    for my $item ( @{$data} ) {
        $hoh->{ $item->{$column} } = $item;
    }

    #say "transpose_ohdsi_dictionary:", to_gb( total_size($hoh) ) if DEVEL_MODE;
    return $hoh;
}
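
# Illustrative usage (not executed; $ohdsi_dict is a hypothetical variable
# holding the hashref returned above):
#   $ohdsi_dict->{35208414}{concept_name};    # 'Gastrointestinal hemorrhage, unspecified'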

sub read_sqldump_stream {

    my $arg           = shift;
    my $filein        = $arg->{in};
    my $self          = $arg->{self};
    my $person        = $arg->{person};
    my $fileout       = $self->{out_file};
    my $switch        = 0;
    my @headers;
    my $table_name    = $self->{omop_tables}[0];
    my $table_name_lc = lc($table_name);

    # Open filehandles
    my $fh_in  = open_filehandle( $filein, 'r' );
    my $fh_out = open_filehandle( $fileout, 'a' );

    # Start printing the array
    #say $fh_out "[";

    # Now we start processing line by line
    my $count = 0;
    while ( my $line = <$fh_in> ) {

        # Only parsing $table_name_lc and discarding others
        # Note that double quotes are optional
        # - COPY "OMOP_cdm_eunomia".person
        # - COPY omop_cdm_eunomia_2.person
        if ( $line =~ /^COPY \"?(\w+)\"?\.$table_name_lc / ) {
            chomp $line;

            # Create an array to hold the column names for this table
            $line =~ s/[\(\),]//g;    # getting rid of (),
            @headers = split /\s+/, $line;

            # Discarding headers which are not terms/variables
            @headers = @headers[ 2 .. $#headers - 2 ];

            # Turning on the switch for later
            $switch++;

            # Jump one line
            $line = <$fh_in>;

        }

        # Loading the data if $switch
        if ($switch) {

            # get rid of \n
            chomp $line;

            # Order matters. We exit before loading
            last if $line =~ /^\\\.$/;

            # Splitting by tab is fine here
            my @fields = split /\t/, $line;

            # Using a tmp hashref to load all fields at once with a slice
            my $hash_slice;
            @{$hash_slice}{@headers} =
              map { dotify_and_coerce_number($_) } @fields;

            # Initialize $data each time
            # Adding them as an array element (AoH)
            die
"We could not find person_id:$hash_slice->{person_id}. Try increasing the #lines with --max-lines-sql\n"
              unless exists $person->{ $hash_slice->{person_id} };

            # Increase counter
            $count++;

            # Encode data
            my $encoded_data =
              encode_omop_stream( $table_name, $hash_slice, $person, $count,
                $self );

            # Only after encoding are we able to discard 'null'
            say $fh_out $encoded_data if $encoded_data ne 'null';

            # Print if verbose
            say "Rows processed: $count"
              if ( $self->{verbose} && $count % 10_000 == 0 );
        }
    }
    say "==============\nRows total: $count\n" if $self->{verbose};

    #say $fh_out "]";    # not needed

    # Closing filehandles
    close $fh_in;
    close $fh_out;
    return 1;
}
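
# Illustrative note on the COPY-header parsing above (the example line is an
# assumption modelled on the comments): given
#   COPY "OMOP_cdm_eunomia".person (person_id, gender_concept_id, ...) FROM stdin;
# removing '(', ')' and ',' and splitting on whitespace yields
# ('COPY', '"OMOP_cdm_eunomia".person', 'person_id', ..., 'FROM', 'stdin;'),
# so the slice @headers[ 2 .. $#headers - 2 ] keeps only the column names.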

sub encode_omop_stream {

    my ( $table_name, $hash_slice, $person, $count, $self ) = @_;

    # *** IMPORTANT ***
    # We only print person_id ONCE!!!
    my $person_id = $hash_slice->{person_id};
    my $data      = {
        $table_name => [$hash_slice],
        PERSON      => $count == 1
        ? $person->{$person_id}
        : {
            map { $_ => $person->{$person_id}{$_} }
              qw(person_id gender_concept_id birth_datetime)
        }
    };

    # Print line by line (->canonical has some overhead but needed for t/)
    return JSON::XS->new->utf8->canonical->encode(
        Convert::Pheno::omop2bff_stream_processing( $self, $data ) );
}
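
# Illustrative note (not executed): the intermediate $data handed to
# omop2bff_stream_processing() pairs the current row with a PERSON record,
# roughly (values are hypothetical)
#   { MEASUREMENT => [ { ...row... } ],
#     PERSON      => { person_id => 666, gender_concept_id => ..., birth_datetime => ... } }
# and the sub returns that individual's converted record as one JSON string.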

sub read_sqldump {

    my $arg      = shift;
    my $filepath = $arg->{in};
    my $self     = $arg->{self};

    # Before resorting to writing this subroutine I performed an exhaustive search on CPAN:
    # - Tested MySQL::Dump::Parser::XS but I could not make it work...
    # - App-MysqlUtils-0.022 has a CLI utility (mysql-sql-dump-extract-tables)
    # - Of course one can always use *nix tools (sed, grep, awk, etc.) or other programming languages...
    # Anyway, I ended up writing the parser myself...
    # The parser is based on reading COPY paragraphs from a PostgreSQL dump by using Perl's paragraph mode $/ = "";
    # NB: Each paragraph (TABLE) is loaded into memory. Not great for large files.

    # Define variables that modify what we load
    my $max_lines_sql = $self->{max_lines_sql};
    my @omop_tables   = @{ $self->{omop_tables} };

    # Set record separator to paragraph
    local $/ = "";

    #COPY "OMOP_cdm_eunomia".attribute_definition (attribute_definition_id, attribute_name, attribute_description, attribute_type_concept_id, attribute_syntax) FROM stdin;
    # ......
    # \.

    # Start reading the SQL dump
    my $fh = open_filehandle( $filepath, 'r' );

    # We'll store the data in the hashref $data
    my $data = {};

    # Process paragraphs
    while ( my $paragraph = <$fh> ) {

        # Discarding paragraphs not having m/^COPY/
        next unless $paragraph =~ m/^COPY/;

        # Load all lines into an array (via "\n")
        my @lines = split /\n/, $paragraph;
        next unless scalar @lines > 2;
        pop @lines;    # last line eq '\.'

        # First line contains the headers
        #COPY "OMOP_cdm_eunomia".attribute_definition (attribute_definition_id, attribute_name, ..., attribute_syntax) FROM stdin;
        $lines[0] =~ s/[\(\),]//g;    # getting rid of (),
        my @headers = split /\s+/, $lines[0];
        my $table_name =
          uc( ( split /\./, $headers[1] )[1] );    # ATTRIBUTE_DEFINITION

        # Discarding tables not present in @omop_tables:
        # This step improves RAM consumption
        next unless any { $_ eq $table_name } @omop_tables;

        # Say if verbose
        say "Processing table ... <$table_name>" if $self->{verbose};

        # Discarding first line
        shift @lines;

        # Discarding headers which are not terms/variables
        @headers = @headers[ 2 .. $#headers - 2 ];

        # Initializing $data->{$table_name} as an empty arrayref
        $data->{$table_name} = [];

        # Ad hoc counter for dev
        my $count = 0;

        # Processing line by line
        for my $line (@lines) {
            $count++;

            # Columns are separated by \t
            # NB: 'split' and 'Text::CSV' split to strings
            # We go with 'split'. Coercing a posteriori
            my @fields = split /\t/, $line;

            # Loading the fields like this:
            #
            # $VAR1 = {
            #           'PERSON' => [    # NB: This is the table name
            #                         {
            #                           'person_id' => 123,
            #                           'test' => 'abc'
            #                         },
            #                         {
            #                           'person_id' => 456,
            #                           'test' => 'def'
            #                         }
            #                       ]
            #         };

            # Using a tmp hashref to load all fields at once with a slice
            my $hash_slice;
            @{$hash_slice}{@headers} =
              map { dotify_and_coerce_number($_) } @fields;

            # Adding them as an array element (AoH)
            push @{ $data->{$table_name} }, $hash_slice;

            # Ad hoc filter to speed up development
            last if $count == $max_lines_sql;
            say "Rows processed: $count"
              if ( $self->{verbose} && $count % 1_000 == 0 );

        }

        # Print if verbose
        say "==============\nRows total: $count\n" if $self->{verbose};
    }
    close $fh;

    #say total_size($data) and die;
    return $data;
}

sub sqldump2csv {

    my ( $data, $dir ) = @_;

    # CSV sep character
    my $sep = "\t";

    # The idea is to save a CSV table for each key (table) of $data
    for my $table ( keys %{$data} ) {

        # File path for the CSV file
        my $filepath = catdir( $dir, "$table.csv" );

        # We get header fields from row[0] and nsort them
        # NB: The order will not be the same as that in <.sql>
        my @headers = nsort keys %{ $data->{$table}[0] };

        # Print data as CSV
        write_csv(
            {
                sep      => $sep,
                filepath => $filepath,
                headers  => \@headers,
                data     => $data->{$table}
            }
        );
    }
    return 1;
}

sub transpose_omop_data_structure {

    my $data = shift;

    # The situation is the following: $data comes in this format:
    #
    # $VAR1 = {
    #           'MEASUREMENT' => [
    #                              {
    #                                'measurement_concept_id' => 1,
    #                                'person_id' => 666
    #                              },
    #                              {
    #                                'measurement_concept_id' => 2,
    #                                'person_id' => 666
    #                              }
    #                            ],
    #           'PERSON' => [
    #                         {
    #                           'person_id' => 666
    #                         },
    #                         {
    #                           'person_id' => 1
    #                         }
    #                       ]
    #         };
    #
    # where all 'person_id' are together inside the TABLE_NAME.
    # But BFF "ideally" works at the individual level, so we are going to
    # transpose the data structure to end up with something like this
    # NB: MEASUREMENT and OBSERVATION (among others, i.e., CONDITION_OCCURRENCE, PROCEDURE_OCCURRENCE)
    # can have multiple values for one 'person_id' so they will be loaded as arrays
    #
    #
    # $VAR1 = {
    #           '1' => {
    #                    'PERSON' => {
    #                                  'person_id' => 1
    #                                }
    #                  },
    #           '666' => {
    #                      'MEASUREMENT' => [
    #                                         {
    #                                           'measurement_concept_id' => 1,
    #                                           'person_id' => 666
    #                                         },
    #                                         {
    #                                           'measurement_concept_id' => 2,
    #                                           'person_id' => 666
    #                                         }
    #                                       ],
    #                      'PERSON' => {
    #                                    'person_id' => 666
    #                                  }
    #                    }
    #         };

    my $omop_person_id = {};

    # Only performed for $omop_main_table
    for my $table ( @{ $omop_main_table->{$omop_version} } ) {    # global

        # Loop over tables
        for my $item ( @{ $data->{$table} } ) {

            if ( exists $item->{person_id} && $item->{person_id} ) {
                my $person_id = $item->{person_id};

                # {person_id} can have multiple rows in @omop_array_tables
                if ( any { $_ eq $table } @omop_array_tables ) {
                    push @{ $omop_person_id->{$person_id}{$table} },
                      $item;    # array
                }

                # {person_id} only has one value in a given table
                else {
                    $omop_person_id->{$person_id}{$table} = $item;    # scalar
                }
            }
        }
    }

    # To get back unused memory for later...
    $data = undef;

    # Finally we get rid of the 'person_id' key and return the values as an array
    #
    # $VAR1 = [
    #           {
    #             'PERSON' => {
    #                           'person_id' => 1
    #                         }
    #           },
    #           ------------------------------------------------
    #           {
    #             'MEASUREMENT' => [
    #                                {
    #                                  'measurement_concept_id' => 1,
    #                                  'person_id' => 666
    #                                },
    #                                {
    #                                  'measurement_concept_id' => 2,
    #                                  'person_id' => 666
    #                                }
    #                              ],
    #             'PERSON' => {
    #                           'person_id' => 666
    #                         }
    #           }
    #         ];
    # NB: We nsort keys to always have the same result but it's not needed
    # v1 - Easier but duplicates the data structure
    # my $aoh = [ map { $omop_person_id->{$_} } nsort keys %{$omop_person_id} ];
    # v2 - This version cleans memory after loading $aoh <=== Implemented
    my $aoh;
    for my $key ( nsort keys %{$omop_person_id} ) {
        push @{$aoh}, $omop_person_id->{$key};
        delete $omop_person_id->{$key};
    }
    if (DEVEL_MODE) {

        #say 'transpose_omop_data_structure(omop_person_id):',
        #  to_gb( total_size($omop_person_id) );
        #say 'transpose_omop_data_structure(map):', to_gb( total_size($aoh) );
    }
    return $aoh;
}

sub transpose_visit_occurrence {

    my $data = shift;    # arrayref

    # Going from
    # $VAR1 = [
    #           {
    #             'admitting_source_concept_id' => 0,
    #             'visit_occurrence_id' => 85,
    #             ...
    #           }
    #         ];

    # To
    # $VAR1 = {
    #           '85' => {
    #                     'admitting_source_concept_id' => 0,
    #                     'visit_occurrence_id' => 85,
    #                     ...
    #                   }
    #         };
    #my $hash = { map { $_->{visit_occurrence_id} => $_ } @$data };    # map is slower than for
    my $hash;
    for my $item (@$data) {
        my $key = $item->{visit_occurrence_id}
          ;    # otherwise $item->{visit_occurrence_id} goes from Int to Str in JSON and tests fail
        $hash->{$key} = $item;
    }
    return $hash;
}

sub read_csv {

    my $arg      = shift;
    my $filepath = $arg->{in};
    my $sep      = $arg->{sep};

    # Define split record separator from file extension
    my ( $separator, $encoding ) = define_separator( $filepath, $sep );

    # Transform $filepath into an AoH
    # Using Text::CSV_XS functional interface
    my $aoh = csv(
        in       => $filepath,
        sep_char => $separator,
        headers  => "auto",
        eol      => "\n",

        # binary    => 1,    # default
        encoding  => $encoding,
        auto_diag => 1
    );

    # $aoh = [
    #          {
    #            'abdominal_mass' => 0,
    #            'age_first_diagnosis' => 0,
    #            'alcohol' => 4,
    #          }, {},,,
    #        ]

    # Coercing the data before returning it
    for my $item (@$aoh) {
        for my $key ( keys %{$item} ) {
            $item->{$key} = dotify_and_coerce_number( $item->{$key} );
        }
    }

    return $aoh;
}
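
# Illustrative usage (not executed; the file name is hypothetical):
#   my $aoh = read_csv( { in => 'redcap_export.csv', sep => undef } );
# With sep => undef the separator is inferred from the extension by
# define_separator (';' for .csv), and every value goes through
# dotify_and_coerce_number before the AoH is returned.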

sub read_csv_stream {

    my $arg     = shift;
    my $filein  = $arg->{in};
    my $self    = $arg->{self};
    my $sep     = $arg->{sep};
    my $person  = $arg->{person};
    my $fileout = $self->{out_file};

    # Define split record separator
    my ( $separator, $encoding, $table_name ) =
      define_separator( $filein, $sep );
    my $table_name_lc = lc($table_name);

    # Using Text::CSV_XS OO interface
    my $csv = Text::CSV_XS->new(
        { binary => 1, auto_diag => 1, sep_char => $separator, eol => "\n" } );

    # Open filehandles
    my $fh_in  = open_filehandle( $filein, 'r' );
    my $fh_out = open_filehandle( $fileout, 'a' );

    # Get rid of \n on first line
    chomp( my $line = <$fh_in> );
    my @headers = split /$separator/, $line;

    my $hash_slice;
    my $count = 0;

    # *** IMPORTANT ***
    # On Feb-19-2023 I tested Parallel::ForkManager and:
    # 1 - The performance was by far slower than without it
    # 2 - We got SQLite errors from concurrent filehandles
    # Thus, it was not implemented

    while ( my $row = $csv->getline($fh_in) ) {

        # Load the values as a hash slice
        my $hash_slice;
        @{$hash_slice}{@headers} = map { dotify_and_coerce_number($_) } @$row;

        # Encode data
        my $encoded_data =
          encode_omop_stream( $table_name, $hash_slice, $person, $count,
            $self );

        # Only after encoding are we able to discard 'null'
        say $fh_out $encoded_data if $encoded_data ne 'null';

        # Increment $count
        $count++;
        say "Rows processed: $count"
          if ( $self->{verbose} && $count % 10_000 == 0 );
    }
    say "==============\nRows total: $count\n" if $self->{verbose};

    close $fh_in;
    close $fh_out;
    return 1;
}

sub write_csv {

    my $arg      = shift;
    my $sep      = $arg->{sep};
    my $aoh      = $arg->{data};
    my $filepath = $arg->{filepath};
    my $headers  = $arg->{headers};

    # Using Text::CSV_XS functional interface
    # NB: About speed:
    # https://metacpan.org/pod/Text::CSV#csv1
    csv(
        in       => $aoh,
        out      => $filepath,
        sep_char => $sep,
        eol      => "\n",

        #binary    => 1,    # default
        encoding => 'UTF-8',
        headers  => $arg->{headers}
    );
    return 1;
}
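
# Illustrative usage (not executed; values are hypothetical) mirroring the
# call made from sqldump2csv:
#   write_csv(
#       {
#           sep      => "\t",
#           filepath => 'PERSON.csv',
#           headers  => \@headers,
#           data     => $data->{PERSON}
#       }
#   );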

sub open_filehandle {

    my ( $filepath, $mode ) = @_;
    my $handle = $mode eq 'a' ? '>>' : $mode eq 'w' ? '>' : '<';
    my $fh;
    if ( $filepath =~ /\.gz$/ ) {
        if ( $mode eq 'a' || $mode eq 'w' ) {
            $fh = IO::Compress::Gzip->new( $filepath,
                Append => ( $mode eq 'a' ? 1 : 0 ) );
        }
        else {
            $fh = IO::Uncompress::Gunzip->new( $filepath, MultiStream => 1 );
        }
        binmode( $fh, ":encoding(UTF-8)" );
    }
    else {
        open $fh, qq($handle:encoding(UTF-8)), $filepath;
    }
    return $fh;
}
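
# Illustrative usage (not executed; file names are hypothetical):
#   my $fh_in  = open_filehandle( 'dump.sql.gz', 'r' );    # gunzip stream
#   my $fh_out = open_filehandle( 'out.json',    'a' );    # plain-text append
# Mode 'r' reads, 'w' writes, 'a' appends; .gz paths go through
# IO::Uncompress::Gunzip / IO::Compress::Gzip, everything else through open().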

sub define_separator {

    my ( $filepath, $sep ) = @_;

    # Define split record separator from file extension
    my @exts = map { $_, $_ . '.gz' } qw(.csv .tsv .sql .txt);
    my ( $table_name, undef, $ext ) = fileparse( $filepath, @exts );

    # Defining separator character
    my $separator =
        $sep              ? $sep
      : $ext eq '.csv'    ? ';'     # Note we don't use comma but semicolon
      : $ext eq '.csv.gz' ? ';'     # idem
      : $ext eq '.tsv'    ? "\t"
      : $ext eq '.tsv.gz' ? "\t"
      :                     "\t";

    my $encoding =
      $ext =~ m/\.gz/ ? ':gzip:encoding(utf-8)' : 'encoding(utf-8)';

    # Returns 3 values, but some callers only capture 2
    return ( $separator, $encoding, $table_name );
}
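
# Illustrative examples (not executed; file names are hypothetical):
#   define_separator( 'data.csv', undef );         # ( ';',  'encoding(utf-8)',       'data' )
#   define_separator( 'person.tsv.gz', undef );    # ( "\t", ':gzip:encoding(utf-8)', 'person' )
# An explicit $sep always takes precedence over the extension-based guess.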

sub to_gb {

    my $bytes = shift;

    # base 2 => 1,073,741,824
    my $gb = $bytes / 1_073_741_824;
    return sprintf( '%8.4f', $gb ) . ' GB';
}
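
# Illustrative example (not executed):
#   to_gb(2_147_483_648);    # '  2.0000 GB'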

sub load_exposures {

    my $data = read_csv( { in => shift, sep => "\t" } );

    # We will only use the key 'concept_id' and discard the rest
    # $VAR1 = {
    #           '4138352' => 1
    #         };
    my %hash = map { $_->{concept_id} => 1 } @$data;

    # Returning hashref
    return \%hash;
}
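
# Illustrative usage (not executed; the path is hypothetical):
#   my $exposures = load_exposures('concepts_candidates_2_exposure.csv');
#   say 'exposure' if exists $exposures->{4138352};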

1;