| line | stmt | bran | cond | sub | pod | time | code |
|
1
|
|
|
|
|
|
|
package Convert::Pheno::CSV; |
|
2
|
|
|
|
|
|
|
|
|
3
|
6
|
|
|
6
|
|
44
|
use strict; |
|
|
6
|
|
|
|
|
14
|
|
|
|
6
|
|
|
|
|
229
|
|
|
4
|
6
|
|
|
6
|
|
31
|
use warnings; |
|
|
6
|
|
|
|
|
12
|
|
|
|
6
|
|
|
|
|
165
|
|
|
5
|
6
|
|
|
6
|
|
36
|
use autodie; |
|
|
6
|
|
|
|
|
14
|
|
|
|
6
|
|
|
|
|
67
|
|
|
6
|
6
|
|
|
6
|
|
33881
|
use feature qw(say); |
|
|
6
|
|
|
|
|
16
|
|
|
|
6
|
|
|
|
|
621
|
|
|
7
|
6
|
|
|
6
|
|
45
|
use File::Basename; |
|
|
6
|
|
|
|
|
14
|
|
|
|
6
|
|
|
|
|
591
|
|
|
8
|
6
|
|
|
6
|
|
6999
|
use Text::CSV_XS qw(csv); |
|
|
6
|
|
|
|
|
108550
|
|
|
|
6
|
|
|
|
|
513
|
|
|
9
|
6
|
|
|
6
|
|
3437
|
use Sort::Naturally qw(nsort); |
|
|
6
|
|
|
|
|
30012
|
|
|
|
6
|
|
|
|
|
397
|
|
|
10
|
6
|
|
|
6
|
|
46
|
use List::Util qw(any); |
|
|
6
|
|
|
|
|
13
|
|
|
|
6
|
|
|
|
|
400
|
|
|
11
|
6
|
|
|
6
|
|
44
|
use File::Spec::Functions qw(catdir); |
|
|
6
|
|
|
|
|
10
|
|
|
|
6
|
|
|
|
|
344
|
|
|
12
|
6
|
|
|
6
|
|
5716
|
use IO::Compress::Gzip qw($GzipError); |
|
|
6
|
|
|
|
|
200632
|
|
|
|
6
|
|
|
|
|
682
|
|
|
13
|
6
|
|
|
6
|
|
4783
|
use IO::Uncompress::Gunzip qw($GunzipError); |
|
|
6
|
|
|
|
|
78885
|
|
|
|
6
|
|
|
|
|
660
|
|
|
14
|
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
#use Devel::Size qw(size total_size); |
|
16
|
6
|
|
|
6
|
|
42
|
use Convert::Pheno; |
|
|
6
|
|
|
|
|
11
|
|
|
|
6
|
|
|
|
|
200
|
|
|
17
|
6
|
|
|
6
|
|
3331
|
use Convert::Pheno::OMOP; |
|
|
6
|
|
|
|
|
21
|
|
|
|
6
|
|
|
|
|
737
|
|
|
18
|
6
|
|
|
6
|
|
2676
|
use Convert::Pheno::IO; |
|
|
6
|
|
|
|
|
15
|
|
|
|
6
|
|
|
|
|
385
|
|
|
19
|
6
|
|
|
6
|
|
2397
|
use Convert::Pheno::Schema; |
|
|
6
|
|
|
|
|
27
|
|
|
|
6
|
|
|
|
|
279
|
|
|
20
|
6
|
|
|
6
|
|
47
|
use Convert::Pheno::Mapping; |
|
|
6
|
|
|
|
|
13
|
|
|
|
6
|
|
|
|
|
796
|
|
|
21
|
6
|
|
|
6
|
|
40
|
use Exporter 'import'; |
|
|
6
|
|
|
|
|
17
|
|
|
|
6
|
|
|
|
|
427
|
|
|
22
|
|
|
|
|
|
|
our @EXPORT = |
|
23
|
|
|
|
|
|
|
qw(read_csv read_csv_stream read_redcap_dict_and_mapping_file transpose_ohdsi_dictionary read_sqldump_stream read_sqldump sqldump2csv transpose_omop_data_structure open_filehandle load_exposures transpose_visit_occurrence); |
|
24
|
|
|
|
|
|
|
|
|
25
|
6
|
|
|
6
|
|
47
|
use constant DEVEL_MODE => 0; |
|
|
6
|
|
|
|
|
22
|
|
|
|
6
|
|
|
|
|
23611
|
|
|
26
|
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
######################### |
|
28
|
|
|
|
|
|
|
######################### |
|
29
|
|
|
|
|
|
|
# SUBROUTINES FOR CSV # |
|
30
|
|
|
|
|
|
|
######################### |
|
31
|
|
|
|
|
|
|
######################### |
|
32
|
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
sub read_redcap_dictionary { |
|
34
|
|
|
|
|
|
|
|
|
35
|
13
|
|
|
13
|
0
|
35
|
my $filepath = shift; |
|
36
|
|
|
|
|
|
|
|
|
37
|
|
|
|
|
|
|
# Define split record separator from file extension |
|
38
|
13
|
|
|
|
|
68
|
my ( $separator, $encoding ) = define_separator( $filepath, undef ); |
|
39
|
|
|
|
|
|
|
|
|
40
|
|
|
|
|
|
|
# We'll create an HoH using the 'Variable / Field Name' column as the first-level key |
|
41
|
13
|
|
|
|
|
46
|
my $key = 'Variable / Field Name'; |
|
42
|
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
# We'll be adding the key <_labels>. See sub add_labels |
|
44
|
13
|
|
|
|
|
32
|
my $labels = 'Choices, Calculations, OR Slider Labels'; |
|
45
|
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
# Loading data directly from Text::CSV_XS |
|
47
|
|
|
|
|
|
|
# NB1: We want HoH and sub read_csv returns AoH |
|
48
|
|
|
|
|
|
|
# NB2: By default the Text::CSV module treats all fields in a CSV file as strings, regardless of their actual data type. |
|
49
|
|
|
|
|
|
|
my $hoh = csv( |
|
50
|
|
|
|
|
|
|
in => $filepath, |
|
51
|
|
|
|
|
|
|
sep_char => $separator, |
|
52
|
|
|
|
|
|
|
|
|
53
|
|
|
|
|
|
|
#binary => 1, # default |
|
54
|
|
|
|
|
|
|
auto_diag => 1, |
|
55
|
|
|
|
|
|
|
encoding => $encoding, |
|
56
|
|
|
|
|
|
|
key => $key, |
|
57
|
5083
|
|
|
5083
|
|
366706
|
on_in => sub { $_{_labels} = add_labels( $_{$labels} ) } |
|
58
|
13
|
|
|
|
|
220
|
); |
|
59
|
13
|
|
|
|
|
594
|
return $hoh; |
|
60
|
|
|
|
|
|
|
} |
|
61
|
|
|
|
|
|
|
|
|
62
|
|
|
|
|
|
|
sub add_labels { |
|
63
|
|
|
|
|
|
|
|
|
64
|
5083
|
|
|
5083
|
0
|
6134
|
my $value = shift; |
|
65
|
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
# *** IMPORTANT *** |
|
67
|
|
|
|
|
|
|
# This sub can return undef, i.e., $_{_labels} = undef |
|
68
|
|
|
|
|
|
|
# That's OK as we won't perform exists $_{_labels} |
|
69
|
|
|
|
|
|
|
# Note that in $hoh (above) empty columns are key = ''. |
|
70
|
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
# Early return if empty ('' evaluates to false) |
|
72
|
5083
|
100
|
|
|
|
9714
|
return undef unless $value; # perlcritic Severity: 5 |
|
73
|
|
|
|
|
|
|
|
|
74
|
|
|
|
|
|
|
# We'll skip values that don't provide an even number of key-value pairs |
|
75
|
2912
|
|
|
|
|
21939
|
my @tmp = map { s/^\s//; s/\s+$//; $_; } |
|
|
27053
|
|
|
|
|
41624
|
|
|
|
27053
|
|
|
|
|
35542
|
|
|
|
27053
|
|
|
|
|
37137
|
|
|
76
|
|
|
|
|
|
|
( split /\||,/, $value ); # perlcritic Severity: 5 |
|
77
|
|
|
|
|
|
|
|
|
78
|
|
|
|
|
|
|
# Return undef for non-valid entries |
|
79
|
2912
|
100
|
|
|
|
16062
|
return @tmp % 2 == 0 ? {@tmp} : undef; |
|
80
|
|
|
|
|
|
|
} |
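# --- Illustrative sketch (not in the original source): what add_labels is |
# --- expected to do with a typical REDCap 'Choices' string (values are hypothetical) |
#   my $labels = add_labels('1, Mild | 2, Moderate | 3, Severe'); |
#   # expected: { 1 => 'Mild', 2 => 'Moderate', 3 => 'Severe' } |
#   my $odd = add_labels('Mild | Moderate | Severe');   # 3 tokens (odd) => undef |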
|
81
|
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
sub read_redcap_dict_and_mapping_file { |
|
83
|
|
|
|
|
|
|
|
|
84
|
13
|
|
|
13
|
0
|
49
|
my $arg = shift; |
|
85
|
|
|
|
|
|
|
|
|
86
|
|
|
|
|
|
|
# Read and load REDCap CSV dictionary |
|
87
|
13
|
|
|
|
|
71
|
my $data_redcap_dict = read_redcap_dictionary( $arg->{redcap_dictionary} ); |
|
88
|
|
|
|
|
|
|
|
|
89
|
|
|
|
|
|
|
# Read and load mapping file |
|
90
|
|
|
|
|
|
|
my $data_mapping_file = |
|
91
|
13
|
|
|
|
|
159
|
io_yaml_or_json( { filepath => $arg->{mapping_file}, mode => 'read' } ); |
|
92
|
|
|
|
|
|
|
|
|
93
|
|
|
|
|
|
|
# Validate mapping file against JSON schema |
|
94
|
|
|
|
|
|
|
my $jv = Convert::Pheno::Schema->new( |
|
95
|
|
|
|
|
|
|
{ |
|
96
|
|
|
|
|
|
|
data => $data_mapping_file, |
|
97
|
|
|
|
|
|
|
debug => $arg->{self_validate_schema}, |
|
98
|
|
|
|
|
|
|
schema_file => $arg->{schema_file} |
|
99
|
|
|
|
|
|
|
} |
|
100
|
7
|
|
|
|
|
338
|
); |
|
101
|
7
|
|
|
|
|
70
|
$jv->json_validate; |
|
102
|
|
|
|
|
|
|
|
|
103
|
|
|
|
|
|
|
# Return if successful |
|
104
|
6
|
|
|
|
|
244
|
return ( $data_redcap_dict, $data_mapping_file ); |
|
105
|
|
|
|
|
|
|
} |
|
106
|
|
|
|
|
|
|
|
|
107
|
|
|
|
|
|
|
sub transpose_ohdsi_dictionary { |
|
108
|
|
|
|
|
|
|
|
|
109
|
3
|
|
|
3
|
0
|
7
|
my $data = shift; |
|
110
|
3
|
|
|
|
|
10
|
my $column = 'concept_id'; |
|
111
|
|
|
|
|
|
|
|
|
112
|
|
|
|
|
|
|
# The idea is the following: |
|
113
|
|
|
|
|
|
|
# $data comes as an array (from SQL/CSV) |
|
114
|
|
|
|
|
|
|
# |
|
115
|
|
|
|
|
|
|
# $VAR1 = [ |
|
116
|
|
|
|
|
|
|
# { |
|
117
|
|
|
|
|
|
|
# 'concept_class_id' => '4-char billing code', |
|
118
|
|
|
|
|
|
|
# 'concept_code' => 'K92.2', |
|
119
|
|
|
|
|
|
|
# 'concept_id' => 35208414, |
|
120
|
|
|
|
|
|
|
# 'concept_name' => 'Gastrointestinal hemorrhage, unspecified', |
|
121
|
|
|
|
|
|
|
# 'domain_id' => 'Condition', |
|
122
|
|
|
|
|
|
|
# 'invalid_reason' => undef, |
|
123
|
|
|
|
|
|
|
# 'standard_concept' => undef, |
|
124
|
|
|
|
|
|
|
# 'valid_end_date' => '2099-12-31', |
|
125
|
|
|
|
|
|
|
# 'valid_start_date' => '2007-01-01', |
|
126
|
|
|
|
|
|
|
# 'vocabulary_id' => 'ICD10CM' |
|
127
|
|
|
|
|
|
|
# }, |
|
128
|
|
|
|
|
|
|
# |
|
129
|
|
|
|
|
|
|
# and we convert it to hash to allow for quick searches by 'concept_id' |
|
130
|
|
|
|
|
|
|
# |
|
131
|
|
|
|
|
|
|
# $VAR1 = { |
|
132
|
|
|
|
|
|
|
# '1107830' => { |
|
133
|
|
|
|
|
|
|
# 'concept_class_id' => 'Ingredient', |
|
134
|
|
|
|
|
|
|
# 'concept_code' => 28889, |
|
135
|
|
|
|
|
|
|
# 'concept_id' => 1107830, |
|
136
|
|
|
|
|
|
|
# 'concept_name' => 'Loratadine', |
|
137
|
|
|
|
|
|
|
# 'domain_id' => 'Drug', |
|
138
|
|
|
|
|
|
|
# 'invalid_reason' => undef, |
|
139
|
|
|
|
|
|
|
# 'standard_concept' => 'S', |
|
140
|
|
|
|
|
|
|
# 'valid_end_date' => '2099-12-31', |
|
141
|
|
|
|
|
|
|
# 'valid_start_date' => '1970-01-01', |
|
142
|
|
|
|
|
|
|
# 'vocabulary_id' => 'RxNorm' |
|
143
|
|
|
|
|
|
|
# }, |
|
144
|
|
|
|
|
|
|
# |
|
145
|
|
|
|
|
|
|
# NB: We store all columns yet we'll use 4: |
|
146
|
|
|
|
|
|
|
# 'concept_id', 'concept_code', 'concept_name', 'vocabulary_id' |
|
147
|
|
|
|
|
|
|
# Note that we're duplicating @$data with $hoh |
|
148
|
|
|
|
|
|
|
#my $hoh = { map { $_->{$column} => $_ } @{$data} }; <--map is slower than for |
|
149
|
3
|
|
|
|
|
5
|
my $hoh; |
|
150
|
3
|
|
|
|
|
8
|
for my $item ( @{$data} ) { |
|
|
3
|
|
|
|
|
9
|
|
|
151
|
1332
|
|
|
|
|
4092
|
$hoh->{ $item->{$column} } = $item; |
|
152
|
|
|
|
|
|
|
} |
|
153
|
|
|
|
|
|
|
|
|
154
|
|
|
|
|
|
|
#say "transpose_ohdsi_dictionary:", to_gb( total_size($hoh) ) if DEVEL_MODE; |
|
155
|
3
|
|
|
|
|
18
|
return $hoh; |
|
156
|
|
|
|
|
|
|
} |
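# --- Illustrative usage sketch (variable names are hypothetical) --- |
#   my $dict  = transpose_ohdsi_dictionary($concept_rows);   # $concept_rows = AoH read from the CONCEPT table |
#   my $name  = $dict->{1107830}{concept_name};               # expected: 'Loratadine' (per the example above) |
#   my $vocab = $dict->{1107830}{vocabulary_id};               # expected: 'RxNorm' |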
|
157
|
|
|
|
|
|
|
|
|
158
|
|
|
|
|
|
|
sub read_sqldump_stream { |
|
159
|
|
|
|
|
|
|
|
|
160
|
1
|
|
|
1
|
0
|
2
|
my $arg = shift; |
|
161
|
1
|
|
|
|
|
4
|
my $filein = $arg->{in}; |
|
162
|
1
|
|
|
|
|
3
|
my $self = $arg->{self}; |
|
163
|
1
|
|
|
|
|
3
|
my $person = $arg->{person}; |
|
164
|
1
|
|
|
|
|
3
|
my $fileout = $self->{out_file}; |
|
165
|
1
|
|
|
|
|
2
|
my $switch = 0; |
|
166
|
1
|
|
|
|
|
2
|
my @headers; |
|
167
|
1
|
|
|
|
|
2
|
my $table_name = $self->{omop_tables}[0]; |
|
168
|
1
|
|
|
|
|
4
|
my $table_name_lc = lc($table_name); |
|
169
|
|
|
|
|
|
|
|
|
170
|
|
|
|
|
|
|
# Open filehandles |
|
171
|
1
|
|
|
|
|
4
|
my $fh_in = open_filehandle( $filein, 'r' ); |
|
172
|
1
|
|
|
|
|
4
|
my $fh_out = open_filehandle( $fileout, 'a' ); |
|
173
|
|
|
|
|
|
|
|
|
174
|
|
|
|
|
|
|
# Start printing the array |
|
175
|
|
|
|
|
|
|
#say $fh_out "["; |
|
176
|
|
|
|
|
|
|
|
|
177
|
|
|
|
|
|
|
# Now we start processing line by line |
|
178
|
1
|
|
|
|
|
3
|
my $count = 0; |
|
179
|
1
|
|
|
|
|
7
|
while ( my $line = <$fh_in> ) { |
|
180
|
|
|
|
|
|
|
|
|
181
|
|
|
|
|
|
|
# Only parsing $table_name_lc and discarding others |
|
182
|
|
|
|
|
|
|
# Note that double quotes are optional |
|
183
|
|
|
|
|
|
|
# - COPY "OMOP_cdm_eunomia".person |
|
184
|
|
|
|
|
|
|
# - COPY omop_cdm_eunomia_2.person |
|
185
|
386339
|
100
|
|
|
|
28189586
|
if ( $line =~ /^COPY \"?(\w+)\"?\.$table_name_lc / ) { |
|
186
|
1
|
|
|
|
|
8
|
chomp $line; |
|
187
|
|
|
|
|
|
|
|
|
188
|
|
|
|
|
|
|
# Create an array to hold the column names for this table |
|
189
|
1
|
|
|
|
|
20
|
$line =~ s/[\(\),]//g; # getting rid of (), |
|
190
|
1
|
|
|
|
|
29
|
@headers = split /\s+/, $line; |
|
191
|
|
|
|
|
|
|
|
|
192
|
|
|
|
|
|
|
# Discarding headers which are not terms/variables |
|
193
|
1
|
|
|
|
|
17
|
@headers = @headers[ 2 .. $#headers - 2 ]; |
|
194
|
|
|
|
|
|
|
|
|
195
|
|
|
|
|
|
|
# Turning on the switch for later |
|
196
|
1
|
|
|
|
|
2
|
$switch++; |
|
197
|
|
|
|
|
|
|
|
|
198
|
|
|
|
|
|
|
# Jump one line |
|
199
|
1
|
|
|
|
|
7
|
$line = <$fh_in>; |
|
200
|
|
|
|
|
|
|
|
|
201
|
|
|
|
|
|
|
} |
|
202
|
|
|
|
|
|
|
|
|
203
|
|
|
|
|
|
|
# Loading the data if $switch |
|
204
|
386339
|
100
|
|
|
|
1075195
|
if ($switch) { |
|
205
|
|
|
|
|
|
|
|
|
206
|
|
|
|
|
|
|
# get rid of \n |
|
207
|
67708
|
|
|
|
|
124272
|
chomp $line; |
|
208
|
|
|
|
|
|
|
|
|
209
|
|
|
|
|
|
|
# Order matters. We exit before loading |
|
210
|
67708
|
100
|
|
|
|
129730
|
last if $line =~ /^\\\.$/; |
|
211
|
|
|
|
|
|
|
|
|
212
|
|
|
|
|
|
|
# Splitting by tab is fine here |
|
213
|
67707
|
|
|
|
|
352948
|
my @fields = split /\t/, $line; |
|
214
|
|
|
|
|
|
|
|
|
215
|
|
|
|
|
|
|
# Using tmp hashref to load all fields at once with slice |
|
216
|
67707
|
|
|
|
|
93727
|
my $hash_slice; |
|
217
|
67707
|
|
|
|
|
803884
|
@{$hash_slice}{@headers} = |
|
218
|
67707
|
|
|
|
|
117943
|
map { dotify_and_coerce_number($_) } @fields; |
|
|
1421847
|
|
|
|
|
1999477
|
|
|
219
|
|
|
|
|
|
|
|
|
220
|
|
|
|
|
|
|
# Initialize $data each time |
|
221
|
|
|
|
|
|
|
# Abort unless the person_id was pre-loaded into $person |
|
222
|
|
|
|
|
|
|
die |
|
223
|
|
|
|
|
|
|
"We could not find person_id:$hash_slice->{person_id}. Try increasing the #lines with --max-lines-sql\n" |
|
224
|
67707
|
50
|
|
|
|
284195
|
unless exists $person->{ $hash_slice->{person_id} }; |
|
225
|
|
|
|
|
|
|
|
|
226
|
|
|
|
|
|
|
# Increase counter |
|
227
|
67707
|
|
|
|
|
92169
|
$count++; |
|
228
|
|
|
|
|
|
|
|
|
229
|
|
|
|
|
|
|
# Encode data |
|
230
|
67707
|
|
|
|
|
115793
|
my $encoded_data = |
|
231
|
|
|
|
|
|
|
encode_omop_stream( $table_name, $hash_slice, $person, $count, |
|
232
|
|
|
|
|
|
|
$self ); |
|
233
|
|
|
|
|
|
|
|
|
234
|
|
|
|
|
|
|
# Only after encoding are we able to discard 'null' |
|
235
|
67707
|
50
|
|
|
|
386565
|
say $fh_out $encoded_data if $encoded_data ne 'null'; |
|
236
|
|
|
|
|
|
|
|
|
237
|
|
|
|
|
|
|
# Print if verbose |
|
238
|
|
|
|
|
|
|
say "Rows processed: $count" |
|
239
|
67707
|
50
|
33
|
|
|
9323211
|
if ( $self->{verbose} && $count % 10_000 == 0 ); |
|
240
|
|
|
|
|
|
|
} |
|
241
|
|
|
|
|
|
|
} |
|
242
|
1
|
50
|
|
|
|
6
|
say "==============\nRows total: $count\n" if $self->{verbose}; |
|
243
|
|
|
|
|
|
|
|
|
244
|
|
|
|
|
|
|
#say $fh_out "]"; # not needed |
|
245
|
|
|
|
|
|
|
|
|
246
|
|
|
|
|
|
|
# Closing filehandles |
|
247
|
1
|
|
|
|
|
10
|
close $fh_in; |
|
248
|
1
|
|
|
|
|
254
|
close $fh_out; |
|
249
|
1
|
|
|
|
|
650
|
return 1; |
|
250
|
|
|
|
|
|
|
} |
|
251
|
|
|
|
|
|
|
|
|
252
|
|
|
|
|
|
|
sub encode_omop_stream { |
|
253
|
|
|
|
|
|
|
|
|
254
|
67707
|
|
|
67707
|
0
|
120570
|
my ( $table_name, $hash_slice, $person, $count, $self ) = @_; |
|
255
|
|
|
|
|
|
|
|
|
256
|
|
|
|
|
|
|
# *** IMPORTANT *** |
|
257
|
|
|
|
|
|
|
# We only print person_id ONCE!!! |
|
258
|
67707
|
|
|
|
|
94727
|
my $person_id = $hash_slice->{person_id}; |
|
259
|
|
|
|
|
|
|
my $data = { |
|
260
|
|
|
|
|
|
|
$table_name => [$hash_slice], |
|
261
|
|
|
|
|
|
|
PERSON => $count == 1 |
|
262
|
|
|
|
|
|
|
? $person->{$person_id} |
|
263
|
|
|
|
|
|
|
: { |
|
264
|
67707
|
100
|
|
|
|
164783
|
map { $_ => $person->{$person_id}{$_} } |
|
|
203118
|
|
|
|
|
596206
|
|
|
265
|
|
|
|
|
|
|
qw(person_id gender_concept_id birth_datetime) |
|
266
|
|
|
|
|
|
|
} |
|
267
|
|
|
|
|
|
|
}; |
|
268
|
|
|
|
|
|
|
|
|
269
|
|
|
|
|
|
|
# Print line by line (->canonical has some overhead but needed for t/) |
|
270
|
67707
|
|
|
|
|
428287
|
return JSON::XS->new->utf8->canonical->encode( |
|
271
|
|
|
|
|
|
|
Convert::Pheno::omop2bff_stream_processing( $self, $data ) ); |
|
272
|
|
|
|
|
|
|
} |
|
273
|
|
|
|
|
|
|
|
|
274
|
|
|
|
|
|
|
sub read_sqldump { |
|
275
|
|
|
|
|
|
|
|
|
276
|
3
|
|
|
3
|
0
|
8
|
my $arg = shift; |
|
277
|
3
|
|
|
|
|
12
|
my $filepath = $arg->{in}; |
|
278
|
3
|
|
|
|
|
8
|
my $self = $arg->{self}; |
|
279
|
|
|
|
|
|
|
|
|
280
|
|
|
|
|
|
|
# Before resorting to writing this subroutine I performed an exhaustive search on CPAN: |
|
281
|
|
|
|
|
|
|
# - Tested MySQL::Dump::Parser::XS but I could not make it work... |
|
282
|
|
|
|
|
|
|
# - App-MysqlUtils-0.022 has a CLI utility (mysql-sql-dump-extract-tables) |
|
283
|
|
|
|
|
|
|
# - Of course one can always use *nix tools (sed, grep, awk, etc) or other programming languages.... |
|
284
|
|
|
|
|
|
|
# Anyway, I ended up writing the parser myself... |
|
285
|
|
|
|
|
|
|
# The parser is based on reading COPY paragraphs from a PostgreSQL dump using Perl's paragraph mode ($/ = ""); |
|
286
|
|
|
|
|
|
|
# NB: Each paragraph (TABLE) is loaded into memory. Not great for large files. |
|
287
|
|
|
|
|
|
|
|
|
288
|
|
|
|
|
|
|
# Define variables that modify what we load |
|
289
|
3
|
|
|
|
|
8
|
my $max_lines_sql = $self->{max_lines_sql}; |
|
290
|
3
|
|
|
|
|
6
|
my @omop_tables = @{ $self->{omop_tables} }; |
|
|
3
|
|
|
|
|
12
|
|
|
291
|
|
|
|
|
|
|
|
|
292
|
|
|
|
|
|
|
# Set record separator to paragraph |
|
293
|
3
|
|
|
|
|
20
|
local $/ = ""; |
|
294
|
|
|
|
|
|
|
|
|
295
|
|
|
|
|
|
|
#COPY "OMOP_cdm_eunomia".attribute_definition (attribute_definition_id, attribute_name, attribute_description, attribute_type_concept_id, attribute_syntax) FROM stdin; |
|
296
|
|
|
|
|
|
|
# ...... |
|
297
|
|
|
|
|
|
|
# \. |
|
298
|
|
|
|
|
|
|
|
|
299
|
|
|
|
|
|
|
# Start reading the SQL dump |
|
300
|
3
|
|
|
|
|
12
|
my $fh = open_filehandle( $filepath, 'r' ); |
|
301
|
|
|
|
|
|
|
|
|
302
|
|
|
|
|
|
|
# We'll store the data in the hashref $data |
|
303
|
3
|
|
|
|
|
9
|
my $data = {}; |
|
304
|
|
|
|
|
|
|
|
|
305
|
|
|
|
|
|
|
# Process paragraphs |
|
306
|
3
|
|
|
|
|
1009
|
while ( my $paragraph = <$fh> ) { |
|
307
|
|
|
|
|
|
|
|
|
308
|
|
|
|
|
|
|
# Discarding paragraphs not having m/^COPY/ |
|
309
|
618
|
100
|
|
|
|
1680875
|
next unless $paragraph =~ m/^COPY/; |
|
310
|
|
|
|
|
|
|
|
|
311
|
|
|
|
|
|
|
# Load all lines into an array (via "\n") |
|
312
|
117
|
|
|
|
|
819406
|
my @lines = split /\n/, $paragraph; |
|
313
|
117
|
100
|
|
|
|
1241
|
next unless scalar @lines > 2; |
|
314
|
54
|
|
|
|
|
127
|
pop @lines; # last line eq '\.' |
|
315
|
|
|
|
|
|
|
|
|
316
|
|
|
|
|
|
|
# First line contains the headers |
|
317
|
|
|
|
|
|
|
#COPY "OMOP_cdm_eunomia".attribute_definition (attribute_definition_id, attribute_name, ..., attribute_syntax) FROM stdin; |
|
318
|
54
|
|
|
|
|
1127
|
$lines[0] =~ s/[\(\),]//g; # getting rid of (), |
|
319
|
54
|
|
|
|
|
843
|
my @headers = split /\s+/, $lines[0]; |
|
320
|
54
|
|
|
|
|
411
|
my $table_name = |
|
321
|
|
|
|
|
|
|
uc( ( split /\./, $headers[1] )[1] ); # ATTRIBUTE_DEFINITION |
|
322
|
|
|
|
|
|
|
|
|
323
|
|
|
|
|
|
|
# Discarding non @$omop_tables: |
|
324
|
|
|
|
|
|
|
# This step reduces RAM consumption |
|
325
|
54
|
100
|
|
283
|
|
748
|
next unless any { $_ eq $table_name } @omop_tables; |
|
|
283
|
|
|
|
|
64753
|
|
|
326
|
|
|
|
|
|
|
|
|
327
|
|
|
|
|
|
|
# Say if verbose |
|
328
|
19
|
50
|
|
|
|
157
|
say "Processing table ... <$table_name>" if $self->{verbose}; |
|
329
|
|
|
|
|
|
|
|
|
330
|
|
|
|
|
|
|
# Discarding first line |
|
331
|
19
|
|
|
|
|
37
|
shift @lines; |
|
332
|
|
|
|
|
|
|
|
|
333
|
|
|
|
|
|
|
# Discarding headers which are not terms/variables |
|
334
|
19
|
|
|
|
|
201
|
@headers = @headers[ 2 .. $#headers - 2 ]; |
|
335
|
|
|
|
|
|
|
|
|
336
|
|
|
|
|
|
|
# Initializing $data->{$table_name} as an empty arrayref |
|
337
|
19
|
|
|
|
|
114
|
$data->{$table_name} = []; |
|
338
|
|
|
|
|
|
|
|
|
339
|
|
|
|
|
|
|
# Ad hoc counter for dev |
|
340
|
19
|
|
|
|
|
65
|
my $count = 0; |
|
341
|
|
|
|
|
|
|
|
|
342
|
|
|
|
|
|
|
# Processing line by line |
|
343
|
19
|
|
|
|
|
176
|
for my $line (@lines) { |
|
344
|
12063
|
|
|
|
|
12171
|
$count++; |
|
345
|
|
|
|
|
|
|
|
|
346
|
|
|
|
|
|
|
# Columns are separated by \t |
|
347
|
|
|
|
|
|
|
# NB: both 'split' and 'Text::CSV' yield strings |
|
348
|
|
|
|
|
|
|
# We go with 'split'. Coercing a posteriori |
|
349
|
12063
|
|
|
|
|
66424
|
my @fields = split /\t/, $line; |
|
350
|
|
|
|
|
|
|
|
|
351
|
|
|
|
|
|
|
# Loading the fields like this: |
|
352
|
|
|
|
|
|
|
# |
|
353
|
|
|
|
|
|
|
# $VAR1 = { |
|
354
|
|
|
|
|
|
|
# 'PERSON' => [ # NB: This is the table name |
|
355
|
|
|
|
|
|
|
# { |
|
356
|
|
|
|
|
|
|
# 'person_id' => 123, |
|
357
|
|
|
|
|
|
|
# 'test' => 'abc' |
|
358
|
|
|
|
|
|
|
# }, |
|
359
|
|
|
|
|
|
|
# { |
|
360
|
|
|
|
|
|
|
# 'person_id' => 456, |
|
361
|
|
|
|
|
|
|
# 'test' => 'def' |
|
362
|
|
|
|
|
|
|
# } |
|
363
|
|
|
|
|
|
|
# ] |
|
364
|
|
|
|
|
|
|
# }; |
|
365
|
|
|
|
|
|
|
|
|
366
|
|
|
|
|
|
|
# Using tmp hashref to load all fields at once with slice |
|
367
|
12063
|
|
|
|
|
13582
|
my $hash_slice; |
|
368
|
12063
|
|
|
|
|
123790
|
@{$hash_slice}{@headers} = |
|
369
|
12063
|
|
|
|
|
15018
|
map { dotify_and_coerce_number($_) } @fields; |
|
|
197109
|
|
|
|
|
262791
|
|
|
370
|
|
|
|
|
|
|
|
|
371
|
|
|
|
|
|
|
# Adding them as an array element (AoH) |
|
372
|
12063
|
|
|
|
|
19747
|
push @{ $data->{$table_name} }, $hash_slice; |
|
|
12063
|
|
|
|
|
20655
|
|
|
373
|
|
|
|
|
|
|
|
|
374
|
|
|
|
|
|
|
# Ad hoc filter to speed up development |
|
375
|
12063
|
100
|
|
|
|
19373
|
last if $count == $max_lines_sql; |
|
376
|
|
|
|
|
|
|
say "Rows processed: $count" |
|
377
|
12049
|
50
|
33
|
|
|
35746
|
if ( $self->{verbose} && $count % 1_000 == 0 ); |
|
378
|
|
|
|
|
|
|
|
|
379
|
|
|
|
|
|
|
} |
|
380
|
|
|
|
|
|
|
|
|
381
|
|
|
|
|
|
|
# Print if verbose |
|
382
|
19
|
50
|
|
|
|
66941
|
say "==============\nRows total: $count\n" if $self->{verbose}; |
|
383
|
|
|
|
|
|
|
} |
|
384
|
3
|
|
|
|
|
41
|
close $fh; |
|
385
|
|
|
|
|
|
|
|
|
386
|
|
|
|
|
|
|
#say total_size($data) and die; |
|
387
|
3
|
|
|
|
|
2758
|
return $data; |
|
388
|
|
|
|
|
|
|
} |
|
389
|
|
|
|
|
|
|
|
|
390
|
|
|
|
|
|
|
sub sqldump2csv { |
|
391
|
|
|
|
|
|
|
|
|
392
|
0
|
|
|
0
|
0
|
0
|
my ( $data, $dir ) = @_; |
|
393
|
|
|
|
|
|
|
|
|
394
|
|
|
|
|
|
|
# CSV sep character |
|
395
|
0
|
|
|
|
|
0
|
my $sep = "\t"; |
|
396
|
|
|
|
|
|
|
|
|
397
|
|
|
|
|
|
|
# The idea is to save a CSV table for each key (table) in $data |
|
398
|
0
|
|
|
|
|
0
|
for my $table ( keys %{$data} ) { |
|
|
0
|
|
|
|
|
0
|
|
|
399
|
|
|
|
|
|
|
|
|
400
|
|
|
|
|
|
|
# File path for CSV file |
|
401
|
0
|
|
|
|
|
0
|
my $filepath = catdir( $dir, "$table.csv" ); |
|
402
|
|
|
|
|
|
|
|
|
403
|
|
|
|
|
|
|
# We get header fields from row[0] and nsort them |
|
404
|
|
|
|
|
|
|
# NB: The order will not be the same as that in <.sql> |
|
405
|
0
|
|
|
|
|
0
|
my @headers = nsort keys %{ $data->{$table}[0] }; |
|
|
0
|
|
|
|
|
0
|
|
|
406
|
|
|
|
|
|
|
|
|
407
|
|
|
|
|
|
|
# Print data as CSV |
|
408
|
|
|
|
|
|
|
write_csv( |
|
409
|
|
|
|
|
|
|
{ |
|
410
|
|
|
|
|
|
|
sep => $sep, |
|
411
|
|
|
|
|
|
|
filepath => $filepath, |
|
412
|
|
|
|
|
|
|
headers => \@headers, |
|
413
|
0
|
|
|
|
|
0
|
data => $data->{$table} |
|
414
|
|
|
|
|
|
|
} |
|
415
|
|
|
|
|
|
|
); |
|
416
|
|
|
|
|
|
|
} |
|
417
|
0
|
|
|
|
|
0
|
return 1; |
|
418
|
|
|
|
|
|
|
} |
|
419
|
|
|
|
|
|
|
|
|
420
|
|
|
|
|
|
|
sub transpose_omop_data_structure { |
|
421
|
|
|
|
|
|
|
|
|
422
|
2
|
|
|
2
|
0
|
5
|
my $data = shift; |
|
423
|
|
|
|
|
|
|
|
|
424
|
|
|
|
|
|
|
# The situation is the following: $data comes in this format: |
|
425
|
|
|
|
|
|
|
# |
|
426
|
|
|
|
|
|
|
#$VAR1 = { |
|
427
|
|
|
|
|
|
|
# 'MEASUREMENT' => [ |
|
428
|
|
|
|
|
|
|
# { |
|
429
|
|
|
|
|
|
|
# 'measurement_concept_id' => 1, |
|
430
|
|
|
|
|
|
|
# 'person_id' => 666 |
|
431
|
|
|
|
|
|
|
# }, |
|
432
|
|
|
|
|
|
|
# { |
|
433
|
|
|
|
|
|
|
# 'measurement_concept_id' => 2, |
|
434
|
|
|
|
|
|
|
# 'person_id' => 666 |
|
435
|
|
|
|
|
|
|
# } |
|
436
|
|
|
|
|
|
|
# ], |
|
437
|
|
|
|
|
|
|
# 'PERSON' => [ |
|
438
|
|
|
|
|
|
|
# { |
|
439
|
|
|
|
|
|
|
# 'person_id' => 666 |
|
440
|
|
|
|
|
|
|
# }, |
|
441
|
|
|
|
|
|
|
# { |
|
442
|
|
|
|
|
|
|
# 'person_id' => 1 |
|
443
|
|
|
|
|
|
|
# } |
|
444
|
|
|
|
|
|
|
# ] |
|
445
|
|
|
|
|
|
|
# }; |
|
446
|
|
|
|
|
|
|
|
|
447
|
|
|
|
|
|
|
# where all 'person_id' are together inside the TABLE_NAME. |
|
448
|
|
|
|
|
|
|
# But, BFF "ideally" works at the individual level so we are going to |
|
449
|
|
|
|
|
|
|
# transpose the data structure to end up into something like this |
|
450
|
|
|
|
|
|
|
# NB: MEASUREMENT and OBSERVATION (among others, i.e., CONDITION_OCCURRENCE, PROCEDURE_OCCURRENCE) |
|
451
|
|
|
|
|
|
|
# can have multiple values for one 'person_id' so they will be loaded as arrays |
|
452
|
|
|
|
|
|
|
# |
|
453
|
|
|
|
|
|
|
# |
|
454
|
|
|
|
|
|
|
#$VAR1 = { |
|
455
|
|
|
|
|
|
|
# '1' => { |
|
456
|
|
|
|
|
|
|
# 'PERSON' => { |
|
457
|
|
|
|
|
|
|
# 'person_id' => 1 |
|
458
|
|
|
|
|
|
|
# } |
|
459
|
|
|
|
|
|
|
# }, |
|
460
|
|
|
|
|
|
|
# '666' => { |
|
461
|
|
|
|
|
|
|
# 'MEASUREMENT' => [ |
|
462
|
|
|
|
|
|
|
# { |
|
463
|
|
|
|
|
|
|
# 'measurement_concept_id' => 1, |
|
464
|
|
|
|
|
|
|
# 'person_id' => 666 |
|
465
|
|
|
|
|
|
|
# }, |
|
466
|
|
|
|
|
|
|
# { |
|
467
|
|
|
|
|
|
|
# 'measurement_concept_id' => 2, |
|
468
|
|
|
|
|
|
|
# 'person_id' => 666 |
|
469
|
|
|
|
|
|
|
# } |
|
470
|
|
|
|
|
|
|
# ], |
|
471
|
|
|
|
|
|
|
# 'PERSON' => { |
|
472
|
|
|
|
|
|
|
# 'person_id' => 666 |
|
473
|
|
|
|
|
|
|
# } |
|
474
|
|
|
|
|
|
|
# } |
|
475
|
|
|
|
|
|
|
# }; |
|
476
|
|
|
|
|
|
|
|
|
477
|
2
|
|
|
|
|
7
|
my $omop_person_id = {}; |
|
478
|
|
|
|
|
|
|
|
|
479
|
|
|
|
|
|
|
# Only performed for $omop_main_table |
|
480
|
2
|
|
|
|
|
4
|
for my $table ( @{ $omop_main_table->{$omop_version} } ) { # global |
|
|
2
|
|
|
|
|
12
|
|
|
481
|
|
|
|
|
|
|
|
|
482
|
|
|
|
|
|
|
# Loop over tables |
|
483
|
30
|
|
|
|
|
31
|
for my $item ( @{ $data->{$table} } ) { |
|
|
30
|
|
|
|
|
85
|
|
|
484
|
|
|
|
|
|
|
|
|
485
|
6000
|
50
|
33
|
|
|
21476
|
if ( exists $item->{person_id} && $item->{person_id} ) { |
|
486
|
6000
|
|
|
|
|
6852
|
my $person_id = $item->{person_id}; |
|
487
|
|
|
|
|
|
|
|
|
488
|
|
|
|
|
|
|
# {person_id} can have multiple rows in @omop_array_tables |
|
489
|
6000
|
100
|
|
21000
|
|
12896
|
if ( any { $_ eq $table } @omop_array_tables ) { |
|
|
21000
|
|
|
|
|
22154
|
|
|
490
|
5000
|
|
|
|
|
4692
|
push @{ $omop_person_id->{$person_id}{$table} }, |
|
|
5000
|
|
|
|
|
14627
|
|
|
491
|
|
|
|
|
|
|
$item; # array |
|
492
|
|
|
|
|
|
|
} |
|
493
|
|
|
|
|
|
|
|
|
494
|
|
|
|
|
|
|
# {person_id} only has one value in a given table |
|
495
|
|
|
|
|
|
|
else { |
|
496
|
1000
|
|
|
|
|
3452
|
$omop_person_id->{$person_id}{$table} = $item; # scalar |
|
497
|
|
|
|
|
|
|
} |
|
498
|
|
|
|
|
|
|
} |
|
499
|
|
|
|
|
|
|
} |
|
500
|
|
|
|
|
|
|
} |
|
501
|
|
|
|
|
|
|
|
|
502
|
|
|
|
|
|
|
# Free unused memory for later use |
|
503
|
2
|
|
|
|
|
4
|
$data = undef; |
|
504
|
|
|
|
|
|
|
|
|
505
|
|
|
|
|
|
|
# Finally we get rid of the 'person_id' key and return values as an array |
|
506
|
|
|
|
|
|
|
# |
|
507
|
|
|
|
|
|
|
#$VAR1 = [ |
|
508
|
|
|
|
|
|
|
# { |
|
509
|
|
|
|
|
|
|
# 'PERSON' => { |
|
510
|
|
|
|
|
|
|
# 'person_id' => 1 |
|
511
|
|
|
|
|
|
|
# } |
|
512
|
|
|
|
|
|
|
# }, |
|
513
|
|
|
|
|
|
|
# ------------------------------------------------ |
|
514
|
|
|
|
|
|
|
# { |
|
515
|
|
|
|
|
|
|
# 'MEASUREMENT' => [ |
|
516
|
|
|
|
|
|
|
# { |
|
517
|
|
|
|
|
|
|
# 'measurement_concept_id' => 1, |
|
518
|
|
|
|
|
|
|
# 'person_id' => 666 |
|
519
|
|
|
|
|
|
|
# }, |
|
520
|
|
|
|
|
|
|
# { |
|
521
|
|
|
|
|
|
|
# 'measurement_concept_id' => 2, |
|
522
|
|
|
|
|
|
|
# 'person_id' => 666 |
|
523
|
|
|
|
|
|
|
# } |
|
524
|
|
|
|
|
|
|
# ], |
|
525
|
|
|
|
|
|
|
# 'PERSON' => { |
|
526
|
|
|
|
|
|
|
# 'person_id' => 666 |
|
527
|
|
|
|
|
|
|
# } |
|
528
|
|
|
|
|
|
|
# } |
|
529
|
|
|
|
|
|
|
# ]; |
|
530
|
|
|
|
|
|
|
# NB: We nsort keys so the result is deterministic, though it's not strictly needed |
|
531
|
|
|
|
|
|
|
# v1 - Easier but duplicates data structure |
|
532
|
|
|
|
|
|
|
# my $aoh = [ map { $omop_person_id->{$_} } nsort keys %{$omop_person_id} ]; |
|
533
|
|
|
|
|
|
|
# v2 - This version cleans memory after loading $aoh <=== Implemented |
|
534
|
2
|
|
|
|
|
6
|
my $aoh; |
|
535
|
2
|
|
|
|
|
5
|
for my $key ( nsort keys %{$omop_person_id} ) { |
|
|
2
|
|
|
|
|
366
|
|
|
536
|
1940
|
|
|
|
|
574042
|
push @{$aoh}, $omop_person_id->{$key}; |
|
|
1940
|
|
|
|
|
2827
|
|
|
537
|
1940
|
|
|
|
|
2092
|
delete $omop_person_id->{$key}; |
|
538
|
|
|
|
|
|
|
} |
|
539
|
2
|
|
|
|
|
1062
|
if (DEVEL_MODE) { |
|
540
|
|
|
|
|
|
|
|
|
541
|
|
|
|
|
|
|
#say 'transpose_omop_data_structure(omop_person_id):', |
|
542
|
|
|
|
|
|
|
# to_gb( total_size($omop_person_id) ); |
|
543
|
|
|
|
|
|
|
#say 'transpose_omop_data_structure(map):', to_gb( total_size($aoh) ); |
|
544
|
|
|
|
|
|
|
} |
|
545
|
2
|
|
|
|
|
24
|
return $aoh; |
|
546
|
|
|
|
|
|
|
} |
|
547
|
|
|
|
|
|
|
|
|
548
|
|
|
|
|
|
|
sub transpose_visit_occurrence { |
|
549
|
|
|
|
|
|
|
|
|
550
|
3
|
|
|
3
|
0
|
8
|
my $data = shift; # arrayref |
|
551
|
|
|
|
|
|
|
|
|
552
|
|
|
|
|
|
|
# Going from |
|
553
|
|
|
|
|
|
|
#$VAR1 = [ |
|
554
|
|
|
|
|
|
|
# { |
|
555
|
|
|
|
|
|
|
# 'admitting_source_concept_id' => 0, |
|
556
|
|
|
|
|
|
|
# 'visit_occurrence_id' => 85, |
|
557
|
|
|
|
|
|
|
# ... |
|
558
|
|
|
|
|
|
|
# } |
|
559
|
|
|
|
|
|
|
# ]; |
|
560
|
|
|
|
|
|
|
|
|
561
|
|
|
|
|
|
|
# To |
|
562
|
|
|
|
|
|
|
#$VAR1 = { |
|
563
|
|
|
|
|
|
|
# '85' => { |
|
564
|
|
|
|
|
|
|
# 'admitting_source_concept_id' => 0, |
|
565
|
|
|
|
|
|
|
# 'visit_occurrence_id' => 85, |
|
566
|
|
|
|
|
|
|
# ... |
|
567
|
|
|
|
|
|
|
# } |
|
568
|
|
|
|
|
|
|
# }; |
|
569
|
|
|
|
|
|
|
#my $hash = { map { $_->{visit_occurrence_id} => $_ } @$data }; # map is slower than for |
|
570
|
3
|
|
|
|
|
7
|
my $hash; |
|
571
|
3
|
|
|
|
|
10
|
for my $item (@$data) { |
|
572
|
|
|
|
|
|
|
my $key = $item->{visit_occurrence_id} |
|
573
|
2037
|
|
|
|
|
3181
|
; # otherwise $item->{visit_occurrence_id} goes from Int to Str in JSON and tests fail |
|
574
|
2037
|
|
|
|
|
3951
|
$hash->{$key} = $item; |
|
575
|
|
|
|
|
|
|
} |
|
576
|
3
|
|
|
|
|
28
|
return $hash; |
|
577
|
|
|
|
|
|
|
} |
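# --- Illustrative usage sketch (variable names are hypothetical) --- |
#   my $visits = transpose_visit_occurrence( $data->{VISIT_OCCURRENCE} ); |
#   my $visit  = $visits->{85};   # direct lookup by visit_occurrence_id (per the example above) |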
|
578
|
|
|
|
|
|
|
|
|
579
|
|
|
|
|
|
|
sub read_csv { |
|
580
|
|
|
|
|
|
|
|
|
581
|
15
|
|
|
15
|
0
|
36
|
my $arg = shift; |
|
582
|
15
|
|
|
|
|
49
|
my $filepath = $arg->{in}; |
|
583
|
15
|
|
|
|
|
41
|
my $sep = $arg->{sep}; |
|
584
|
|
|
|
|
|
|
|
|
585
|
|
|
|
|
|
|
# Define split record separator from file extension |
|
586
|
15
|
|
|
|
|
63
|
my ( $separator, $encoding ) = define_separator( $filepath, $sep ); |
|
587
|
|
|
|
|
|
|
|
|
588
|
|
|
|
|
|
|
# Transform $filepath into an AoH |
|
589
|
|
|
|
|
|
|
# Using Text::CSV_XS functional interface |
|
590
|
15
|
|
|
|
|
119
|
my $aoh = csv( |
|
591
|
|
|
|
|
|
|
in => $filepath, |
|
592
|
|
|
|
|
|
|
sep_char => $separator, |
|
593
|
|
|
|
|
|
|
headers => "auto", |
|
594
|
|
|
|
|
|
|
eol => "\n", |
|
595
|
|
|
|
|
|
|
|
|
596
|
|
|
|
|
|
|
# binary => 1, # default |
|
597
|
|
|
|
|
|
|
encoding => $encoding, |
|
598
|
|
|
|
|
|
|
auto_diag => 1 |
|
599
|
|
|
|
|
|
|
); |
|
600
|
|
|
|
|
|
|
|
|
601
|
|
|
|
|
|
|
# $aoh = [ |
|
602
|
|
|
|
|
|
|
# { |
|
603
|
|
|
|
|
|
|
# 'abdominal_mass' => 0, |
|
604
|
|
|
|
|
|
|
# 'age_first_diagnosis' => 0, |
|
605
|
|
|
|
|
|
|
# 'alcohol' => 4, |
|
606
|
|
|
|
|
|
|
# }, {},,, |
|
607
|
|
|
|
|
|
|
# ] |
|
608
|
|
|
|
|
|
|
|
|
609
|
|
|
|
|
|
|
# Coercing the data before returning it |
|
610
|
14
|
|
|
|
|
220200
|
for my $item (@$aoh) { |
|
611
|
1035
|
|
|
|
|
1300
|
for my $key ( keys %{$item} ) { |
|
|
1035
|
|
|
|
|
22704
|
|
|
612
|
177462
|
|
|
|
|
259069
|
$item->{$key} = dotify_and_coerce_number( $item->{$key} ); |
|
613
|
|
|
|
|
|
|
} |
|
614
|
|
|
|
|
|
|
} |
|
615
|
|
|
|
|
|
|
|
|
616
|
14
|
|
|
|
|
143
|
return $aoh; |
|
617
|
|
|
|
|
|
|
} |
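# --- Illustrative usage sketch (file path is hypothetical) --- |
#   my $aoh = read_csv( { in => 'redcap_export.csv', sep => undef } ); |
#   # expected: one hashref per row, values already run through dotify_and_coerce_number, |
#   # e.g. $aoh->[0]{age_first_diagnosis} |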
|
618
|
|
|
|
|
|
|
|
|
619
|
|
|
|
|
|
|
sub read_csv_stream { |
|
620
|
|
|
|
|
|
|
|
|
621
|
0
|
|
|
0
|
0
|
0
|
my $arg = shift; |
|
622
|
0
|
|
|
|
|
0
|
my $filein = $arg->{in}; |
|
623
|
0
|
|
|
|
|
0
|
my $self = $arg->{self}; |
|
624
|
0
|
|
|
|
|
0
|
my $sep = $arg->{sep}; |
|
625
|
0
|
|
|
|
|
0
|
my $person = $arg->{person}; |
|
626
|
0
|
|
|
|
|
0
|
my $fileout = $self->{out_file}; |
|
627
|
|
|
|
|
|
|
|
|
628
|
|
|
|
|
|
|
# Define split record separator |
|
629
|
0
|
|
|
|
|
0
|
my ( $separator, $encoding, $table_name ) = |
|
630
|
|
|
|
|
|
|
define_separator( $filein, $sep ); |
|
631
|
0
|
|
|
|
|
0
|
my $table_name_lc = lc($table_name); |
|
632
|
|
|
|
|
|
|
|
|
633
|
|
|
|
|
|
|
# Using Text::CSV_XS OO interface |
|
634
|
0
|
|
|
|
|
0
|
my $csv = Text::CSV_XS->new( |
|
635
|
|
|
|
|
|
|
{ binary => 1, auto_diag => 1, sep_char => $separator, eol => "\n" } ); |
|
636
|
|
|
|
|
|
|
|
|
637
|
|
|
|
|
|
|
# Open filehandles |
|
638
|
0
|
|
|
|
|
0
|
my $fh_in = open_filehandle( $filein, 'r' ); |
|
639
|
0
|
|
|
|
|
0
|
my $fh_out = open_filehandle( $fileout, 'a' ); |
|
640
|
|
|
|
|
|
|
|
|
641
|
|
|
|
|
|
|
# Get rid of \n on first line |
|
642
|
0
|
|
|
|
|
0
|
chomp( my $line = <$fh_in> ); |
|
643
|
0
|
|
|
|
|
0
|
my @headers = split /$separator/, $line; |
|
644
|
|
|
|
|
|
|
|
|
645
|
0
|
|
|
|
|
0
|
my $hash_slice; |
|
646
|
0
|
|
|
|
|
0
|
my $count = 0; |
|
647
|
|
|
|
|
|
|
|
|
648
|
|
|
|
|
|
|
# *** IMPORTANT *** |
|
649
|
|
|
|
|
|
|
# On Feb-19-2023 I tested Parallel::ForkManager and: |
|
650
|
|
|
|
|
|
|
# 1 - The performance was far slower than without it |
|
651
|
|
|
|
|
|
|
# 2 - We got SQLite errors from concurrent filehandles |
|
652
|
|
|
|
|
|
|
# Thus, it was not implemented |
|
653
|
|
|
|
|
|
|
|
|
654
|
0
|
|
|
|
|
0
|
while ( my $row = $csv->getline($fh_in) ) { |
|
655
|
|
|
|
|
|
|
|
|
656
|
|
|
|
|
|
|
# Load the values as a hash slice |
|
657
|
0
|
|
|
|
|
0
|
my $hash_slice; |
|
658
|
0
|
|
|
|
|
0
|
@{$hash_slice}{@headers} = map { dotify_and_coerce_number($_) } @$row; |
|
|
0
|
|
|
|
|
0
|
|
|
|
0
|
|
|
|
|
0
|
|
|
659
|
|
|
|
|
|
|
|
|
660
|
|
|
|
|
|
|
# Encode data |
|
661
|
0
|
|
|
|
|
0
|
my $encoded_data = |
|
662
|
|
|
|
|
|
|
encode_omop_stream( $table_name, $hash_slice, $person, $count, |
|
663
|
|
|
|
|
|
|
$self ); |
|
664
|
|
|
|
|
|
|
|
|
665
|
|
|
|
|
|
|
# Only after encoding are we able to discard 'null' |
|
666
|
0
|
0
|
|
|
|
0
|
say $fh_out $encoded_data if $encoded_data ne 'null'; |
|
667
|
|
|
|
|
|
|
|
|
668
|
|
|
|
|
|
|
# Increment $count |
|
669
|
0
|
|
|
|
|
0
|
$count++; |
|
670
|
|
|
|
|
|
|
say "Rows processed: $count" |
|
671
|
0
|
0
|
0
|
|
|
0
|
if ( $self->{verbose} && $count % 10_000 == 0 ); |
|
672
|
|
|
|
|
|
|
} |
|
673
|
0
|
0
|
|
|
|
0
|
say "==============\nRows total: $count\n" if $self->{verbose}; |
|
674
|
|
|
|
|
|
|
|
|
675
|
0
|
|
|
|
|
0
|
close $fh_in; |
|
676
|
0
|
|
|
|
|
0
|
close $fh_out; |
|
677
|
0
|
|
|
|
|
0
|
return 1; |
|
678
|
|
|
|
|
|
|
} |
|
679
|
|
|
|
|
|
|
|
|
680
|
|
|
|
|
|
|
sub write_csv { |
|
681
|
|
|
|
|
|
|
|
|
682
|
0
|
|
|
0
|
0
|
0
|
my $arg = shift; |
|
683
|
0
|
|
|
|
|
0
|
my $sep = $arg->{sep}; |
|
684
|
0
|
|
|
|
|
0
|
my $aoh = $arg->{data}; |
|
685
|
0
|
|
|
|
|
0
|
my $filepath = $arg->{filepath}; |
|
686
|
0
|
|
|
|
|
0
|
my $headers = $arg->{headers}; |
|
687
|
|
|
|
|
|
|
|
|
688
|
|
|
|
|
|
|
# Using Text::CSV_XS functional interface |
|
689
|
|
|
|
|
|
|
# NB: About speed: |
|
690
|
|
|
|
|
|
|
# https://metacpan.org/pod/Text::CSV#csv1 |
|
691
|
|
|
|
|
|
|
csv( |
|
692
|
|
|
|
|
|
|
in => $aoh, |
|
693
|
|
|
|
|
|
|
out => $filepath, |
|
694
|
|
|
|
|
|
|
sep_char => $sep, |
|
695
|
|
|
|
|
|
|
eol => "\n", |
|
696
|
|
|
|
|
|
|
|
|
697
|
|
|
|
|
|
|
#binary => 1, # default |
|
698
|
|
|
|
|
|
|
encoding => 'UTF-8', |
|
699
|
|
|
|
|
|
|
headers => $arg->{headers} |
|
700
|
0
|
|
|
|
|
0
|
); |
|
701
|
0
|
|
|
|
|
0
|
return 1; |
|
702
|
|
|
|
|
|
|
} |
|
703
|
|
|
|
|
|
|
|
|
704
|
|
|
|
|
|
|
sub open_filehandle { |
|
705
|
|
|
|
|
|
|
|
|
706
|
7
|
|
|
7
|
0
|
24
|
my ( $filepath, $mode ) = @_; |
|
707
|
7
|
50
|
|
|
|
37
|
my $handle = $mode eq 'a' ? '>>' : $mode eq 'w' ? '>' : '<'; |
|
|
|
100
|
|
|
|
|
|
|
708
|
7
|
|
|
|
|
13
|
my $fh; |
|
709
|
7
|
100
|
|
|
|
32
|
if ($filepath =~ /\.gz$/) { |
|
710
|
3
|
100
|
66
|
|
|
20
|
if ($mode eq 'a' || $mode eq 'w') { |
|
711
|
1
|
50
|
|
|
|
26
|
$fh = IO::Compress::Gzip->new($filepath, Append => ($mode eq 'a' ? 1 : 0)); |
|
712
|
|
|
|
|
|
|
} |
|
713
|
|
|
|
|
|
|
else { |
|
714
|
2
|
|
|
|
|
25
|
$fh = IO::Uncompress::Gunzip->new($filepath, MultiStream => 1); |
|
715
|
|
|
|
|
|
|
} |
|
716
|
3
|
|
|
|
|
8970
|
binmode($fh, ":encoding(UTF-8)"); |
|
717
|
|
|
|
|
|
|
} |
|
718
|
|
|
|
|
|
|
else { |
|
719
|
4
|
|
|
|
|
35
|
open $fh, qq($handle:encoding(UTF-8)), $filepath; |
|
720
|
|
|
|
|
|
|
} |
|
721
|
7
|
|
|
|
|
4710
|
return $fh; |
|
722
|
|
|
|
|
|
|
} |
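# --- Illustrative usage sketch (file paths are hypothetical) --- |
#   my $fh_txt = open_filehandle( 'dump.sql',    'r' );   # plain UTF-8 filehandle ('<') |
#   my $fh_gz  = open_filehandle( 'dump.sql.gz', 'r' );   # IO::Uncompress::Gunzip object |
#   my $fh_out = open_filehandle( 'out.json.gz', 'a' );   # IO::Compress::Gzip with Append => 1 |
#   while ( my $line = <$fh_gz> ) { ... }                  # all three support the usual readline/print |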
|
723
|
|
|
|
|
|
|
|
|
724
|
|
|
|
|
|
|
sub define_separator { |
|
725
|
|
|
|
|
|
|
|
|
726
|
28
|
|
|
28
|
0
|
116
|
my ( $filepath, $sep ) = @_; |
|
727
|
|
|
|
|
|
|
|
|
728
|
|
|
|
|
|
|
# Define split record separator from file extension |
|
729
|
28
|
|
|
|
|
99
|
my @exts = map { $_, $_ . '.gz' } qw(.csv .tsv .sql .txt); |
|
|
112
|
|
|
|
|
346
|
|
|
730
|
28
|
|
|
|
|
3122
|
my ( $table_name, undef, $ext ) = fileparse( $filepath, @exts ); |
|
731
|
|
|
|
|
|
|
|
|
732
|
|
|
|
|
|
|
# Defining separator character |
|
733
|
28
|
50
|
|
|
|
236
|
my $separator = |
|
|
|
50
|
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
734
|
|
|
|
|
|
|
$sep |
|
735
|
|
|
|
|
|
|
? $sep |
|
736
|
|
|
|
|
|
|
: $ext eq '.csv' ? ';' # Note we don't use comma but semicolon |
|
737
|
|
|
|
|
|
|
: $ext eq '.csv.gz' ? ';' # idem |
|
738
|
|
|
|
|
|
|
: $ext eq '.tsv' ? "\t" |
|
739
|
|
|
|
|
|
|
: $ext eq '.tsv.gz' ? "\t" |
|
740
|
|
|
|
|
|
|
: "\t"; |
|
741
|
|
|
|
|
|
|
|
|
742
|
28
|
50
|
|
|
|
143
|
my $encoding = |
|
743
|
|
|
|
|
|
|
$ext =~ m/\.gz/ ? ':gzip:encoding(utf-8)' : 'encoding(utf-8)'; |
|
744
|
|
|
|
|
|
|
|
|
745
|
|
|
|
|
|
|
# Returns 3 values; some callers only use the first 2 |
|
746
|
28
|
|
|
|
|
142
|
return ( $separator, $encoding, $table_name ); |
|
747
|
|
|
|
|
|
|
} |
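# --- Illustrative sketch (hypothetical file names) --- |
#   my ($sep, $enc, $table) = define_separator( 'person.csv',    undef ); |
#   # expected: ( ';', 'encoding(utf-8)', 'person' ) |
#   ($sep, $enc, $table)    = define_separator( 'person.tsv.gz', undef ); |
#   # expected: ( "\t", ':gzip:encoding(utf-8)', 'person' ) |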
|
748
|
|
|
|
|
|
|
|
|
749
|
|
|
|
|
|
|
sub to_gb { |
|
750
|
|
|
|
|
|
|
|
|
751
|
0
|
|
|
0
|
0
|
0
|
my $bytes = shift; |
|
752
|
|
|
|
|
|
|
|
|
753
|
|
|
|
|
|
|
# base 2 => 1,073,741,824 |
|
754
|
0
|
|
|
|
|
0
|
my $gb = $bytes / 1_073_741_824; |
|
755
|
0
|
|
|
|
|
0
|
return sprintf( '%8.4f', $gb ) . ' GB'; |
|
756
|
|
|
|
|
|
|
} |
|
757
|
|
|
|
|
|
|
|
|
758
|
|
|
|
|
|
|
sub load_exposures { |
|
759
|
|
|
|
|
|
|
|
|
760
|
3
|
|
|
3
|
0
|
29
|
my $data = read_csv( { in => shift, sep => "\t" } ); |
|
761
|
|
|
|
|
|
|
|
|
762
|
|
|
|
|
|
|
# We will only use the key 'concept_id' and discard the rest |
|
763
|
|
|
|
|
|
|
#$VAR1 = { |
|
764
|
|
|
|
|
|
|
# '4138352' => 1 |
|
765
|
|
|
|
|
|
|
# }; |
|
766
|
3
|
|
|
|
|
16
|
my %hash = map { $_->{concept_id} => 1 } @$data; |
|
|
639
|
|
|
|
|
1478
|
|
|
767
|
|
|
|
|
|
|
|
|
768
|
|
|
|
|
|
|
# Returning hashref |
|
769
|
3
|
|
|
|
|
713
|
return \%hash; |
|
770
|
|
|
|
|
|
|
} |
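# --- Illustrative usage sketch (file name is hypothetical) --- |
#   my $exposures = load_exposures('exposures.tsv'); |
#   # expected shape: { 4138352 => 1, ... }, so membership checks reduce to |
#   # exists $exposures->{$concept_id} |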
|
771
|
|
|
|
|
|
|
|
|
772
|
|
|
|
|
|
|
1; |