File Coverage

lib/Convert/Pheno/SQLite.pm
Criterion Covered Total %
statement 116 139 83.4
branch 15 36 41.6
condition 8 23 34.7
subroutine 17 18 94.4
pod 0 8 0.0
total 156 224 69.6


line stmt bran cond sub pod time code
1             package Convert::Pheno::SQLite;
2              
3 6     6   59 use strict;
  6         9  
  6         184  
4 6     6   27 use warnings;
  6         20  
  6         132  
5 6     6   25 use autodie;
  6         10  
  6         31  
6 6     6   32437 use feature qw(say);
  6         23  
  6         513  
7              
8             #use Carp qw(confess);
9 6     6   11031 use DBI;
  6         113683  
  6         460  
10 6     6   72 use File::Spec::Functions qw(catdir catfile);
  6         11  
  6         384  
11 6     6   48 use Data::Dumper;
  6         43  
  6         289  
12 6     6   3469 use Text::Similarity::Overlaps;
  6         51063  
  6         342  
13 6     6   47 use Exporter 'import';
  6         45  
  6         378  
14             our @EXPORT =
15             qw( $VERSION open_connections_SQLite close_connections_SQLite get_ontology);
16              
17             my @sqlites = qw(ncit icd10 ohdsi cdisc omim hpo);
18             my @matches = qw(exact_match full_text_search contains);
19 6     6   44 use constant DEVEL_MODE => 0;
  6         15  
  6         12749  
20              
21             ########################
22             ########################
23             # SUBROUTINES FOR DB #
24             ########################
25             ########################
26              
27             sub open_connections_SQLite {
28              
29 10     10 0 31 my $self = shift;
30              
31             # **********************
32             # *** IMPORTANT STEP ***
33             # **********************
34             # Well open ALL databases once (instead that on each call), regardless if they user has selected them.
35             # It imrpoves speed by 15%
36             # The only exception is for <ohdsi> that is the larger and may interfere in timings
37              
38             # Only open ohdsi.db if $self->{ohdsi_db}
39             my @databases =
40 10 50       67 $self->{ohdsi_db} ? @sqlites : grep { !m/ohdsi/ } @sqlites; # global
  60         195  
41              
42             # Open databases
43 10         24 my $dbh;
44             $dbh->{$_} = open_db_SQLite( $_, $self->{path_to_ohdsi_db} )
45 10         64 for (@databases);
46              
47             # Add $dbh HANDLE to $self
48 10         49 $self->{dbh} = $dbh; # Dynamically adding attributes (setter)
49              
50             # Prepare the query once
51 10         50 prepare_query_SQLite($self);
52              
53 10         47 return 1;
54             }
55              
56             sub close_connections_SQLite {
57              
58 10     10 0 27 my $self = shift;
59 10         29 my $dbh = $self->{dbh};
60              
61             # Check flag ohdsi_db
62             my @databases =
63 10 50       66 $self->{ohdsi_db} ? @sqlites : grep { !m/ohdsi/ } @sqlites; # global
  60         180  
64 10         67 close_db_SQLite( $dbh->{$_} ) for (@databases);
65 10         49 return 1;
66             }
67              
68             sub open_db_SQLite {
69              
70 50     50 0 133 my ( $ontology, $path_to_ohdsi_db ) = @_;
71              
72             # Search file in two dirs, except for ($ontology eq 'ohdsi' && defined $path_to_ohdsi_db)
73 50         118 my $filename = qq/$ontology.db/;
74 50 50 33     407 my $path =
75             ( $ontology eq 'ohdsi' && defined $path_to_ohdsi_db )
76             ? $path_to_ohdsi_db
77             : catdir( $Convert::Pheno::share_dir, 'db' );
78 50         211 my $dbfile = catfile( $path, $filename );
79 50 50       1235 die "Sorry we could not find <$dbfile> file\n" unless -f $dbfile;
80              
81             # Connect to the database
82 50         140 my $user = '';
83 50         72 my $passwd = '';
84 50         162 my $dsn = "dbi:SQLite:dbname=$dbfile";
85 50         524 my $dbh = DBI->connect(
86             $dsn, $user, $passwd,
87              
88             # PRAGMAs
89             {
90             PrintError => 0,
91             RaiseError => 1,
92             ReadOnly => 1,
93             AutoCommit => 1,
94             FetchHashKeyName => 'NAME_lc',
95             }
96             );
97              
98             # These extra PRAGMAs are supposed to speed-up queries??
99 50         60316 $dbh->do("PRAGMA synchronous = OFF");
100 50         12007 $dbh->do("PRAGMA cache_size = 800000");
101              
102 50         1374 return $dbh;
103             }
104              
105             sub close_db_SQLite {
106              
107 50     50 0 93 my $dbh = shift;
108 50         8275 $dbh->disconnect();
109 50         294 return 1;
110             }
111              
112             sub prepare_query_SQLite {
113              
114 10     10 0 34 my $self = shift;
115              
116             ###############
117             # EXPLANATION #
118             ###############
119             #
120             # Even though we did not gain a lot of speed (~15%), we decided to do the "prepare step" once, instead of on each query.
121             # Then, if we want to search in a different column than 'label' we also need to create that $sth
122             # To solve that we have created a nested sth->{ncit}{label}, sth->{icd10}{label}, sth->{ohdsi}{concept_id} and sth->{ohdsi}{label}
123             # On top of that, we add the "match" type, so that we can have other matches in the future if needed
124             # NB: In principle, is is possible to change the "prepare" during queries but we must revert it back to default after using it
125             # We recommend using small db such as ncit/icd10 as they're fast
126              
127             # Check flag ohdsi_db
128             my @databases =
129 10 50       60 $self->{ohdsi_db} ? @sqlites : grep { !m/ohdsi/ } @sqlites; # global
  60         175  
130              
131             # NB:
132             # dbh = "Database Handle"
133             # sth = "Statement Handle"
134              
135 10         38 for my $match (@matches) {
136 30         61 for my $ontology (@databases) { #global
137 150         215 for my $column ( 'label', 'concept_id' ) {
138              
139             # We only need to open 'concept_id' in ohdsi
140 300 100 66     1138 next if ( $column eq 'concept_id' && $ontology ne 'ohdsi' );
141              
142             ##############################
143             # Start building the queries #
144             ##############################
145              
146             # NCIT_table or NCIT_fts depending on type of match
147 150         354 my $db = uc($ontology) . '_table';
148 150         232 my $db_fts = uc($ontology) . '_fts';
149 150         272 my $dbh = $self->{dbh}{$ontology};
150 150         938 my %query_type = (
151              
152             # Regular queries
153             contains =>
154             qq(SELECT * FROM $db WHERE $column LIKE '%' || ? || '%' COLLATE NOCASE)
155             , # NOT USED
156              
157             #begins_with => qq(SELECT * FROM $db WHERE $column LIKE ? || '%' COLLATE NOCASE), # NOT USED
158             exact_match =>
159             qq(SELECT * FROM $db WHERE $column = ? COLLATE NOCASE),
160              
161             # **********************
162             # *** IMPORTANT STEP ***
163             # **********************
164              
165             # Full-text-search queries only on column <label> BUT IT CAN BE DONE ALL COLUMNS!!!!
166             # The speed of the FTS in $column == $db_fts
167             # FTS is 2x faster than 'contains'
168             # NOTE (Jan-2023): We don't check for misspelled words
169             # --> TO DO - Tricky --> https://www.sqlite.org/spellfix1.html
170             full_text_search =>
171             qq(SELECT * FROM $db_fts WHERE $column MATCH ?)
172             , # SINGLE COLUMN
173             #qq(SELECT * FROM $db_fts WHERE $db_fts MATCH ?), # ALL TABLE
174              
175             # SOUNDEX using TABLE_fts but only on column <label>
176             # soundex => qq(SELECT * FROM $db_fts WHERE SOUNDEX($column) = SOUNDEX(?)) # NOT USED
177             );
178              
179             # Prepare the query
180 150         863 my $sth = $dbh->prepare( $query_type{$match} );
181              
182             # Autovivification of $self->{sth}{$ontology}{$column}{$match}
183 150         22288 $self->{sth}{$ontology}{$column}{$match} =
184             $sth; # Dynamically adding nested attributes (setter)
185             }
186             }
187             }
188              
189             #print Dumper $self and die;
190 10         30 return 1;
191             }
192              
193             sub get_ontology {
194              
195             ###############
196             # START QUERY #
197             ###############
198              
199 206     206 0 348 my $arg = shift;
200 206         377 my $ontology = $arg->{ontology};
201 206         322 my $sth_column_ref = $arg->{sth_column_ref}; #it contains hashref
202 206         306 my $query = $arg->{query};
203 206         326 my $column = $arg->{column};
204 206         327 my $search = $arg->{search};
205 206         303 my $text_similarity_method = $arg->{text_similarity_method};
206 206         341 my $min_text_similarity_score = $arg->{min_text_similarity_score};
207 206         293 my $type_of_search = 'full_text_search'; # 'contains' and 'full_text_search'
208             #say $type_of_search;
209 206         286 say "QUERY <$query>" if DEVEL_MODE;
210              
211             # A) 'exact'
212             # - exact_match
213             # B) Mixed queries:
214             # 1 - exact_match
215             # if we don't get results
216             # 2 - contains
217             # for which we rank by similarity w/ Text:Similarity
218              
219 206 50       614 my $default_id =
220             $ontology eq 'hpo' ? 'HP:NA0000' : uc($ontology) . ':NA0000';
221 206         266 my $default_label = 'NA';
222              
223             # exact_match (always performed)
224             my ( $id, $label ) = execute_query_SQLite(
225             {
226             sth => $sth_column_ref->{exact_match}, # IMPORTANT STEP
227 206         1124 query => $query,
228             ontology => $ontology,
229             match => 'exact_match',
230             text_similarity_method => $text_similarity_method, # Not used here
231             min_text_similarity_score => $min_text_similarity_score
232             }
233             );
234              
235             # mixed
236 206 50 0     1202 if ( $search eq 'mixed' && ( !defined $id && !defined $label ) ) {
      33        
237             ( $id, $label ) = execute_query_SQLite(
238             {
239 0         0 sth => $sth_column_ref->{$type_of_search}, # IMPORTANT STEP
240             query => $query,
241             ontology => $ontology,
242             match => $type_of_search,
243             text_similarity_method => $text_similarity_method,
244             min_text_similarity_score => $min_text_similarity_score
245             }
246             );
247             }
248              
249             # Set defaults if undef
250 206   66     629 $id = $id // $default_id;
251 206   66     486 $label = $label // $default_label;
252              
253             #############
254             # END QUERY #
255             #############
256              
257 206         1205 return ( $id, $label );
258              
259             }
260              
261             sub execute_query_SQLite {
262              
263 206     206 0 419 my $arg = shift;
264 206         343 my $sth = $arg->{sth};
265 206         351 my $query = $arg->{query};
266 206         293 my $text_similarity_method = $arg->{text_similarity_method};
267 206         301 my $min_text_similarity_score = $arg->{min_text_similarity_score};
268 206         291 my $ontology = $arg->{ontology};
269 206         325 my $match = $arg->{match};
270              
271             # set $id and $label to undef
272 206         487 my ( $id, $label ) = ( undef, undef );
273              
274             # Premature return if $query eq ''
275 206 100       562 return ( $id, $label ) if $query eq '';
276              
277             # Columns in DBs
278             # *<ncit.db>, <icd10.db> and <cdisc.db> were pre-processed to have "id" and "label" columns only
279             # label [0]
280             # id [1]
281             #
282             # * <ohdsi.db> consists of 4 columns:
283             # concept_name => label [0]
284             # concept_code => id [1]
285             # concept_id => concept_id [2]
286             # vocabulary_id => vocabulary_id [3]
287              
288             # Define a hash for column position on databases
289             # We may encounter a situation where order of columns is different
290 204         4655 my $position = {};
291 204         3023 $position->{$_} = { label => 0, id => 1 } for (@sqlites);
292 204         462 my $id_column = $position->{$ontology}{id};
293 204         312 my $label_column = $position->{$ontology}{label};
294              
295             # **********************
296             # *** IMPORTANT STEP ***
297             # **********************
298             # full_text_search is supposed to be ONLY in text fields, but, for
299             # whatever reaon the binding of parameters e.g, '2 - mild' (starts w/ number)
300             # produce exceptions on SQLite. We'll be parsing them for ALL SEARCHES!!!
301              
302             # NB: Order matters in the changes below
303 204         614 $query =~ s/^\d+\s+\-\s+//; # for ALL SEARCHES!!!
304 204 50       489 $query =~ tr#_,-/# # if $match eq 'full_text_search'; # FTS
305 204         551 $query =~
306             tr/ //s; # remove duplicated spaces # for ALL SEARCHES!!!
307              
308             # Execute query
309 204         1606 $sth->bind_param( 1, $query )
310             ; # docstore.mik.ua/orelly/linux/dbi/ch05_03.htm
311 204         2328308 $sth->execute(); # eq to $sth->execute($query);
312              
313             # Prune 'hpo' ontology for being printed as HP:
314 204 50       1587 $ontology = 'hp' if $ontology eq 'hpo';
315              
316             # Process depending on typf of match
317 204 50       624 if ( $match eq 'exact_match' ) {
318              
319             # Parse query
320 204         2026431 while ( my $row = $sth->fetchrow_arrayref ) {
321 168 50       1912 $id =
322             $ontology ne 'ohdsi'
323             ? uc($ontology) . ':' . $row->[$id_column]
324             : $row->[3] . ':' . $row->[$id_column];
325 168         408 $label = $row->[$label_column];
326 168         514 last; # Note that sometimes we get more than one (they're discarded)
327             }
328             }
329             else {
330              
331             # Parse query w/ sub
332 0         0 ( $id, $label ) = text_similarity(
333             {
334             sth => $sth,
335             query => $query,
336             ontology => $ontology,
337             id_column => $id_column,
338             label_column => $label_column,
339             text_similarity_method => $text_similarity_method,
340             min_text_similarity_score => $min_text_similarity_score
341             }
342             );
343             }
344              
345             # Finish $sth
346 204         1437 $sth->finish();
347              
348             # We return results
349 204         2288 return ( $id, $label );
350             }
351              
352             sub text_similarity {
353              
354 0     0 0   my $arg = shift;
355 0           my $sth = $arg->{sth};
356 0           my $query = $arg->{query};
357 0           my $ontology = $arg->{ontology};
358 0           my $id_column = $arg->{id_column};
359 0           my $label_column = $arg->{label_column};
360 0           my $min_score = $arg->{min_text_similarity_score};
361 0           my $text_similarity_method = $arg->{text_similarity_method};
362 0 0 0       die "--text-similarity-method <$text_similarity_method> not allowed"
363             unless ( $text_similarity_method eq 'dice'
364             || $text_similarity_method eq 'cosine' );
365              
366             #say $text_similarity_method;
367              
368             # Create a new Text::Similarity object
369 0           my $ts = Text::Similarity::Overlaps->new();
370              
371             # Fetch the query results
372 0           my $data; # hashref
373 0           while ( my $row = $sth->fetchrow_arrayref() ) {
374              
375 0           say "---Checking <$row->[$label_column]>" if DEVEL_MODE;
376              
377             # We have a threshold to assign a result as valid
378 0           my ( $score, %scores ) =
379             $ts->getSimilarityStrings( $query, $row->[$label_column] );
380              
381             # Only load $data if dice >= $min_score;
382             $data->{ $row->[$label_column] } = {
383             id => $ontology ne 'ohdsi'
384             ? uc($ontology) . ':' . $row->[$id_column]
385             : $row->[3] . ':' . $row->[$id_column],
386             label => $row->[$label_column],
387             scores => {%scores},
388             query => $query
389             }
390 0 0         if $scores{$text_similarity_method} >= $min_score;
    0          
391             }
392              
393             # Sort the results by similarity score
394             #$Data::Dumper::Sortkeys = 1 ;
395             my @sorted_keys =
396             sort {
397             $data->{$b}{scores}{$text_similarity_method}
398 0           <=> $data->{$a}{scores}{$text_similarity_method}
399 0           } keys %{$data};
  0            
400              
401 0           print Dumper $data if DEVEL_MODE;
402 0 0 0       say "WINNER <$sorted_keys[0]>" if ( $sorted_keys[0] && DEVEL_MODE );
403              
404             # Return 1st element if present
405             return $sorted_keys[0]
406             ? ( $data->{ $sorted_keys[0] }{id}, $data->{ $sorted_keys[0] }{label} )
407 0 0         : ( undef, undef );
408             }
409             1;