File Coverage

lib/Convert/Pheno/SQLite.pm

Criterion	Covered	Total	%
statement	116	139	83.4
branch	15	36	41.6
condition	8	23	34.7
subroutine	17	18	94.4
pod	0	8	0.0
total	156	224	69.6

line	stmt	bran	cond	sub	pod	time	code
1							package Convert::Pheno::SQLite;
2
3	6			6		40	use strict;
	6					11
	6					180
4	6			6		26	use warnings;
	6					9
	6					145
5	6			6		25	use autodie;
	6					10
	6					33
6	6			6		32234	use feature qw(say);
	6					13
	6					456
7
8							#use Carp qw(confess);
9	6			6		11659	use DBI;
	6					112027
	6					399
10	6			6		86	use File::Spec::Functions qw(catdir catfile);
	6					19
	6					343
11	6			6		48	use Data::Dumper;
	6					15
	6					251
12	6			6		3568	use Text::Similarity::Overlaps;
	6					36415
	6					373
13	6			6		46	use Exporter 'import';
	6					11
	6					356
14							our @EXPORT =
15							qw( $VERSION open_connections_SQLite close_connections_SQLite get_ontology);
16
17							my @sqlites = qw(ncit icd10 ohdsi cdisc omim hpo);
18							my @matches = qw(exact_match full_text_search contains);
19	6			6		36	use constant DEVEL_MODE => 0;
	6					18
	6					11815
20
21							########################
22							########################
23							# SUBROUTINES FOR DB #
24							########################
25							########################
26
27							sub open_connections_SQLite {
28
29	10			10	0	31	my $self = shift;
30
31							# **********************
32							# * IMPORTANT STEP *
33							# **********************
34							# Well open ALL databases once (instead that on each call), regardless if they user has selected them.
35							# It imrpoves speed by 15%
36							# The only exception is for <ohdsi> that is the larger and may interfere in timings
37
38							# Only open ohdsi.db if $self->{ohdsi_db}
39							my @databases =
40	10	50				65	$self->{ohdsi_db} ? @sqlites : grep { !m/ohdsi/ } @sqlites; # global
	60					191
41
42							# Open databases
43	10					23	my $dbh;
44							$dbh->{$_} = open_db_SQLite( $_, $self->{path_to_ohdsi_db} )
45	10					77	for (@databases);
46
47							# Add $dbh HANDLE to $self
48	10					83	$self->{dbh} = $dbh; # Dynamically adding attributes (setter)
49
50							# Prepare the query once
51	10					50	prepare_query_SQLite($self);
52
53	10					39	return 1;
54							}
55
56							sub close_connections_SQLite {
57
58	10			10	0	29	my $self = shift;
59	10					27	my $dbh = $self->{dbh};
60
61							# Check flag ohdsi_db
62							my @databases =
63	10	50				61	$self->{ohdsi_db} ? @sqlites : grep { !m/ohdsi/ } @sqlites; # global
	60					165
64	10					93	close_db_SQLite( $dbh->{$_} ) for (@databases);
65	10					45	return 1;
66							}
67
68							sub open_db_SQLite {
69
70	50			50	0	132	my ( $ontology, $path_to_ohdsi_db ) = @_;
71
72							# Search file in two dirs, except for ($ontology eq 'ohdsi' && defined $path_to_ohdsi_db)
73	50					127	my $filename = qq/$ontology.db/;
74	50	50	33			407	my $path =
75							( $ontology eq 'ohdsi' && defined $path_to_ohdsi_db )
76							? $path_to_ohdsi_db
77							: catdir( $Convert::Pheno::share_dir, 'db' );
78	50					231	my $dbfile = catfile( $path, $filename );
79	50	50				1280	die "Sorry we could not find <$dbfile> file\n" unless -f $dbfile;
80
81							# Connect to the database
82	50					172	my $user = '';
83	50					75	my $passwd = '';
84	50					162	my $dsn = "dbi:SQLite:dbname=$dbfile";
85	50					537	my $dbh = DBI->connect(
86							$dsn, $user, $passwd,
87
88							# PRAGMAs
89							{
90							PrintError => 0,
91							RaiseError => 1,
92							ReadOnly => 1,
93							AutoCommit => 1,
94							FetchHashKeyName => 'NAME_lc',
95							}
96							);
97
98							# These extra PRAGMAs are supposed to speed-up queries??
99	50					60994	$dbh->do("PRAGMA synchronous = OFF");
100	50					13130	$dbh->do("PRAGMA cache_size = 800000");
101
102	50					1517	return $dbh;
103							}
104
105							sub close_db_SQLite {
106
107	50			50	0	97	my $dbh = shift;
108	50					8193	$dbh->disconnect();
109	50					267	return 1;
110							}
111
112							sub prepare_query_SQLite {
113
114	10			10	0	21	my $self = shift;
115
116							###############
117							# EXPLANATION #
118							###############
119							#
120							# Even though we did not gain a lot of speed (~15%), we decided to do the "prepare step" once, instead of on each query.
121							# Then, if we want to search in a different column than 'label' we also need to create that $sth
122							# To solve that we have created a nested sth->{ncit}{label}, sth->{icd10}{label}, sth->{ohdsi}{concept_id} and sth->{ohdsi}{label}
123							# On top of that, we add the "match" type, so that we can have other matches in the future if needed
124							# NB: In principle, is is possible to change the "prepare" during queries but we must revert it back to default after using it
125							# We recommend using small db such as ncit/icd10 as they're fast
126
127							# Check flag ohdsi_db
128							my @databases =
129	10	50				60	$self->{ohdsi_db} ? @sqlites : grep { !m/ohdsi/ } @sqlites; # global
	60					169
130
131							# NB:
132							# dbh = "Database Handle"
133							# sth = "Statement Handle"
134
135	10					49	for my $match (@matches) {
136	30					61	for my $ontology (@databases) { #global
137	150					235	for my $column ( 'label', 'concept_id' ) {
138
139							# We only need to open 'concept_id' in ohdsi
140	300	100	66			1176	next if ( $column eq 'concept_id' && $ontology ne 'ohdsi' );
141
142							##############################
143							# Start building the queries #
144							##############################
145
146							# NCIT_table or NCIT_fts depending on type of match
147	150					366	my $db = uc($ontology) . '_table';
148	150					224	my $db_fts = uc($ontology) . '_fts';
149	150					283	my $dbh = $self->{dbh}{$ontology};
150	150					1016	my %query_type = (
151
152							# Regular queries
153							contains =>
154							qq(SELECT * FROM $db WHERE $column LIKE '%' \|\| ? \|\| '%' COLLATE NOCASE)
155							, # NOT USED
156
157							#begins_with => qq(SELECT * FROM $db WHERE $column LIKE ? \|\| '%' COLLATE NOCASE), # NOT USED
158							exact_match =>
159							qq(SELECT * FROM $db WHERE $column = ? COLLATE NOCASE),
160
161							# **********************
162							# * IMPORTANT STEP *
163							# **********************
164
165							# Full-text-search queries only on column <label> BUT IT CAN BE DONE ALL COLUMNS!!!!
166							# The speed of the FTS in $column == $db_fts
167							# FTS is 2x faster than 'contains'
168							# NOTE (Jan-2023): We don't check for misspelled words
169							# --> TO DO - Tricky --> https://www.sqlite.org/spellfix1.html
170							full_text_search =>
171							qq(SELECT * FROM $db_fts WHERE $column MATCH ?)
172							, # SINGLE COLUMN
173							#qq(SELECT * FROM $db_fts WHERE $db_fts MATCH ?), # ALL TABLE
174
175							# SOUNDEX using TABLE_fts but only on column <label>
176							# soundex => qq(SELECT * FROM $db_fts WHERE SOUNDEX($column) = SOUNDEX(?)) # NOT USED
177							);
178
179							# Prepare the query
180	150					932	my $sth = $dbh->prepare( $query_type{$match} );
181
182							# Autovivification of $self->{sth}{$ontology}{$column}{$match}
183	150					24298	$self->{sth}{$ontology}{$column}{$match} =
184							$sth; # Dynamically adding nested attributes (setter)
185							}
186							}
187							}
188
189							#print Dumper $self and die;
190	10					41	return 1;
191							}
192
193							sub get_ontology {
194
195							###############
196							# START QUERY #
197							###############
198
199	206			206	0	294	my $arg = shift;
200	206					370	my $ontology = $arg->{ontology};
201	206					320	my $sth_column_ref = $arg->{sth_column_ref}; #it contains hashref
202	206					331	my $query = $arg->{query};
203	206					323	my $column = $arg->{column};
204	206					308	my $search = $arg->{search};
205	206					290	my $text_similarity_method = $arg->{text_similarity_method};
206	206					357	my $min_text_similarity_score = $arg->{min_text_similarity_score};
207	206					299	my $type_of_search = 'full_text_search'; # 'contains' and 'full_text_search'
208							#say $type_of_search;
209	206					253	say "QUERY <$query>" if DEVEL_MODE;
210
211							# A) 'exact'
212							# - exact_match
213							# B) Mixed queries:
214							# 1 - exact_match
215							# if we don't get results
216							# 2 - contains
217							# for which we rank by similarity w/ Text:Similarity
218
219	206	50				701	my $default_id =
220							$ontology eq 'hpo' ? 'HP:NA0000' : uc($ontology) . ':NA0000';
221	206					260	my $default_label = 'NA';
222
223							# exact_match (always performed)
224							my ( $id, $label ) = execute_query_SQLite(
225							{
226							sth => $sth_column_ref->{exact_match}, # IMPORTANT STEP
227	206					1153	query => $query,
228							ontology => $ontology,
229							match => 'exact_match',
230							text_similarity_method => $text_similarity_method, # Not used here
231							min_text_similarity_score => $min_text_similarity_score
232							}
233							);
234
235							# mixed
236	206	50	0			1100	if ( $search eq 'mixed' && ( !defined $id && !defined $label ) ) {
			33
237							( $id, $label ) = execute_query_SQLite(
238							{
239	0					0	sth => $sth_column_ref->{$type_of_search}, # IMPORTANT STEP
240							query => $query,
241							ontology => $ontology,
242							match => $type_of_search,
243							text_similarity_method => $text_similarity_method,
244							min_text_similarity_score => $min_text_similarity_score
245							}
246							);
247							}
248
249							# Set defaults if undef
250	206		66			719	$id = $id // $default_id;
251	206		66			626	$label = $label // $default_label;
252
253							#############
254							# END QUERY #
255							#############
256
257	206					1191	return ( $id, $label );
258
259							}
260
261							sub execute_query_SQLite {
262
263	206			206	0	359	my $arg = shift;
264	206					367	my $sth = $arg->{sth};
265	206					320	my $query = $arg->{query};
266	206					293	my $text_similarity_method = $arg->{text_similarity_method};
267	206					311	my $min_text_similarity_score = $arg->{min_text_similarity_score};
268	206					363	my $ontology = $arg->{ontology};
269	206					365	my $match = $arg->{match};
270
271							# set $id and $label to undef
272	206					377	my ( $id, $label ) = ( undef, undef );
273
274							# Premature return if $query eq ''
275	206	100				469	return ( $id, $label ) if $query eq '';
276
277							# Columns in DBs
278							# *<ncit.db>, <icd10.db> and <cdisc.db> were pre-processed to have "id" and "label" columns only
279							# label [0]
280							# id [1]
281							#
282							# * <ohdsi.db> consists of 4 columns:
283							# concept_name => label [0]
284							# concept_code => id [1]
285							# concept_id => concept_id [2]
286							# vocabulary_id => vocabulary_id [3]
287
288							# Define a hash for column position on databases
289							# We may encounter a situation where order of columns is different
290	204					4683	my $position = {};
291	204					3542	$position->{$_} = { label => 0, id => 1 } for (@sqlites);
292	204					460	my $id_column = $position->{$ontology}{id};
293	204					333	my $label_column = $position->{$ontology}{label};
294
295							# **********************
296							# * IMPORTANT STEP *
297							# **********************
298							# full_text_search is supposed to be ONLY in text fields, but, for
299							# whatever reaon the binding of parameters e.g, '2 - mild' (starts w/ number)
300							# produce exceptions on SQLite. We'll be parsing them for ALL SEARCHES!!!
301
302							# NB: Order matters in the changes below
303	204					685	$query =~ s/^\d+\s+\-\s+//; # for ALL SEARCHES!!!
304	204	50				473	$query =~ tr#_,-/# # if $match eq 'full_text_search'; # FTS
305	204					672	$query =~
306							tr/ //s; # remove duplicated spaces # for ALL SEARCHES!!!
307
308							# Execute query
309	204					2017	$sth->bind_param( 1, $query )
310							; # docstore.mik.ua/orelly/linux/dbi/ch05_03.htm
311	204					2404261	$sth->execute(); # eq to $sth->execute($query);
312
313							# Prune 'hpo' ontology for being printed as HP:
314	204	50				1649	$ontology = 'hp' if $ontology eq 'hpo';
315
316							# Process depending on typf of match
317	204	50				593	if ( $match eq 'exact_match' ) {
318
319							# Parse query
320	204					2070228	while ( my $row = $sth->fetchrow_arrayref ) {
321	168	50				2146	$id =
322							$ontology ne 'ohdsi'
323							? uc($ontology) . ':' . $row->[$id_column]
324							: $row->[3] . ':' . $row->[$id_column];
325	168					366	$label = $row->[$label_column];
326	168					616	last; # Note that sometimes we get more than one (they're discarded)
327							}
328							}
329							else {
330
331							# Parse query w/ sub
332	0					0	( $id, $label ) = text_similarity(
333							{
334							sth => $sth,
335							query => $query,
336							ontology => $ontology,
337							id_column => $id_column,
338							label_column => $label_column,
339							text_similarity_method => $text_similarity_method,
340							min_text_similarity_score => $min_text_similarity_score
341							}
342							);
343							}
344
345							# Finish $sth
346	204					1497	$sth->finish();
347
348							# We return results
349	204					2334	return ( $id, $label );
350							}
351
352							sub text_similarity {
353
354	0			0	0		my $arg = shift;
355	0						my $sth = $arg->{sth};
356	0						my $query = $arg->{query};
357	0						my $ontology = $arg->{ontology};
358	0						my $id_column = $arg->{id_column};
359	0						my $label_column = $arg->{label_column};
360	0						my $min_score = $arg->{min_text_similarity_score};
361	0						my $text_similarity_method = $arg->{text_similarity_method};
362	0	0	0				die "--text-similarity-method <$text_similarity_method> not allowed"
363							unless ( $text_similarity_method eq 'dice'
364							\|\| $text_similarity_method eq 'cosine' );
365
366							#say $text_similarity_method;
367
368							# Create a new Text::Similarity object
369	0						my $ts = Text::Similarity::Overlaps->new();
370
371							# Fetch the query results
372	0						my $data; # hashref
373	0						while ( my $row = $sth->fetchrow_arrayref() ) {
374
375	0						say "---Checking <$row->[$label_column]>" if DEVEL_MODE;
376
377							# We have a threshold to assign a result as valid
378	0						my ( $score, %scores ) =
379							$ts->getSimilarityStrings( $query, $row->[$label_column] );
380
381							# Only load $data if dice >= $min_score;
382							$data->{ $row->[$label_column] } = {
383							id => $ontology ne 'ohdsi'
384							? uc($ontology) . ':' . $row->[$id_column]
385							: $row->[3] . ':' . $row->[$id_column],
386							label => $row->[$label_column],
387							scores => {%scores},
388							query => $query
389							}
390	0	0					if $scores{$text_similarity_method} >= $min_score;
		0
391							}
392
393							# Sort the results by similarity score
394							#$Data::Dumper::Sortkeys = 1 ;
395							my @sorted_keys =
396							sort {
397							$data->{$b}{scores}{$text_similarity_method}
398	0						<=> $data->{$a}{scores}{$text_similarity_method}
399	0						} keys %{$data};
	0
400
401	0						print Dumper $data if DEVEL_MODE;
402	0	0	0				say "WINNER <$sorted_keys[0]>" if ( $sorted_keys[0] && DEVEL_MODE );
403
404							# Return 1st element if present
405							return $sorted_keys[0]
406							? ( $data->{ $sorted_keys[0] }{id}, $data->{ $sorted_keys[0] }{label} )
407	0	0					: ( undef, undef );
408							}
409							1;