line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
#UMLS::Association |
2
|
|
|
|
|
|
|
# |
3
|
|
|
|
|
|
|
# Perl module for scoring the semantic association of terms in the Unified |
4
|
|
|
|
|
|
|
# Medical Language System (UMLS). |
5
|
|
|
|
|
|
|
# |
6
|
|
|
|
|
|
|
# Copyright (c) 2015 |
7
|
|
|
|
|
|
|
# |
8
|
|
|
|
|
|
|
# Bridget T. McInnes, Virginia Commonwealth University |
9
|
|
|
|
|
|
|
# btmcinnes at vcu.edu |
10
|
|
|
|
|
|
|
# |
11
|
|
|
|
|
|
|
# Keith Herbert, Virginia Commonwealth University |
12
|
|
|
|
|
|
|
# herbertkb at vcu.edu |
13
|
|
|
|
|
|
|
# |
14
|
|
|
|
|
|
|
# Alexander D. McQuilkin, Virginia Commonwealth University |
15
|
|
|
|
|
|
|
# alexmcq99 at yahoo.com |
16
|
|
|
|
|
|
|
# |
17
|
|
|
|
|
|
|
# Sam Henry, Virginia Commonwealth University |
18
|
|
|
|
|
|
|
# henryst at vcu.edu |
19
|
|
|
|
|
|
|
# |
20
|
|
|
|
|
|
|
# This program is free software; you can redistribute it and/or |
21
|
|
|
|
|
|
|
# modify it under the terms of the GNU General Public License |
22
|
|
|
|
|
|
|
# as published by the Free Software Foundation; either version 2 |
23
|
|
|
|
|
|
|
# of the License, or (at your option) any later version. |
24
|
|
|
|
|
|
|
# |
25
|
|
|
|
|
|
|
# This program is distributed in the hope that it will be useful, |
26
|
|
|
|
|
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of |
27
|
|
|
|
|
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
28
|
|
|
|
|
|
|
# GNU General Public License for more details. |
29
|
|
|
|
|
|
|
# |
30
|
|
|
|
|
|
|
# You should have received a copy of the GNU General Public License |
31
|
|
|
|
|
|
|
# along with this program; if not, write to |
32
|
|
|
|
|
|
|
# |
33
|
|
|
|
|
|
|
# The Free Software Foundation, Inc., |
34
|
|
|
|
|
|
|
# 59 Temple Place - Suite 330, |
35
|
|
|
|
|
|
|
# Boston, MA 02111-1307, USA. |
36
|
|
|
|
|
|
|
|
37
|
|
|
|
|
|
|
package UMLS::Association::StatFinder; |
38
|
|
|
|
|
|
|
|
39
|
1
|
|
|
1
|
|
7
|
use Fcntl; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
214
|
|
40
|
1
|
|
|
1
|
|
6
|
use strict; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
19
|
|
41
|
1
|
|
|
1
|
|
4
|
use warnings; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
29
|
|
42
|
1
|
|
|
1
|
|
4
|
use DBI; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
30
|
|
43
|
1
|
|
|
1
|
|
5
|
use bytes; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
4
|
|
44
|
1
|
|
|
1
|
|
18
|
use File::Spec; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
5346
|
|
45
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
# error handling variables |
47
|
|
|
|
|
|
|
# Error reporting: the handler object is created in new(); $pkg is the
# package name handed to the handler whenever an error is raised.
my $errorhandler = '';

my $pkg = 'UMLS::Association::StatFinder';

# Debug settings.
#local(*DEBUG_FILE);

# NOTE: file-scoped option variables carry a _G suffix; the debug flag, the
#       error handler and all-caps constants are the exceptions.
my $debug = 0;         # true while debug tracing is enabled

# Options, populated from the parameter hash in _initialize().
my $assocDB_G;         # DBI handle of the association database
my $lta_G     = 0;     # 1 if using LTA, 0 otherwise
my $mwa_G     = 0;     # 1 if using MWA, 0 otherwise
my $vsa_G     = 0;     # 1 if using VSA, 0 otherwise
my $noOrder_G = 0;     # 1 if CUI order is ignored, 0 otherwise
my $matrix_G  = 0;     # matrix file name when a matrix file is used instead of the DB
66
|
|
|
|
|
|
|
|
67
|
|
|
|
|
|
|
###################################################################### |
68
|
|
|
|
|
|
|
# Initialization Functions |
69
|
|
|
|
|
|
|
###################################################################### |
70
|
|
|
|
|
|
|
# method to create a new UMLS::Association::StatFinder object |
71
|
|
|
|
|
|
|
# input : $params <- reference to hash of database parameters |
72
|
|
|
|
|
|
|
# output: $self |
73
|
|
|
|
|
|
|
sub new {
    # Constructor: blesses an empty hash into the requested class, creates
    # the file-scoped error handler and passes the options to _initialize().
    #   input : $className <- class to bless into (supplied by the -> call)
    #           $params    <- reference to a hash of options
    #   output: the new object
    my ($className, $params) = @_;
    my $self = bless {}, $className;

    # the error handler is shared by every method in this file
    $errorhandler = UMLS::Association::ErrorHandler->new();
    unless (defined $errorhandler) {
        print STDERR "The error handler did not get passed properly.\n";
        exit;
    }

    # debug tracing is always switched off at construction time
    $debug = 0;

    $self->_initialize($params);
    return $self;
}
94
|
|
|
|
|
|
|
|
95
|
|
|
|
|
|
|
# method to initialize the UMLS::Association::StatFinder object. |
96
|
|
|
|
|
|
|
# input : $parameters <- reference to a hash of database parameters |
97
|
|
|
|
|
|
|
# output: none, but $self is initialized |
98
|
|
|
|
|
|
|
sub _initialize {
    # Copies the options into the file-scoped option variables, connects to
    # the association database unless a matrix file was given, and rejects
    # option combinations that require a matrix file.
    #   input : $paramsRef <- reference to a hash of options; the keys read
    #                         here are lta, mwa, vsa, noorder and matrix. The
    #                         reference is also forwarded to _setDatabase().
    #                         May be undef, which means "no options".
    #   output: none, but the option variables are set
    my $self      = shift;
    my $paramsRef = shift;

    my $function = "_initialize";
    _debug($function);

    # validate the invocant before any method is called on it
    if (!defined $self || !ref $self) {
        $errorhandler->_error($pkg, $function, "", 2);
    }

    # a missing options hash is treated as empty (as _setDatabase does)
    # instead of dying on the dereference
    $paramsRef = {} if !defined $paramsRef;

    #set global variables using option hash
    $lta_G     = $paramsRef->{'lta'};
    $mwa_G     = $paramsRef->{'mwa'};
    $vsa_G     = $paramsRef->{'vsa'};
    $noOrder_G = $paramsRef->{'noorder'};
    $matrix_G  = $paramsRef->{'matrix'};

    #connect to the database of association scores
    if (!$matrix_G) {
        $self->_setDatabase($paramsRef);
    }

    #TODO, remove this once I have DB implemented
    #check that a matrix is specified for options (need to implement DB mode)
    if (!$matrix_G && $mwa_G) {
        $errorhandler->_error($pkg, $function, "MWA requires the --matrix option", 12);
    }
    if (!$matrix_G && $vsa_G) {
        $errorhandler->_error($pkg, $function, "VSA requires the --matrix option", 12);
    }
}
132
|
|
|
|
|
|
|
|
133
|
|
|
|
|
|
|
sub _debug {
    # Prints a trace line naming the function being entered, but only while
    # the file-scoped $debug flag is true.
    #   input : $function <- name of the function being entered
    my ($function) = @_;
    print STDERR "In UMLS::Association::StatFinder::$function\n" if $debug;
}
137
|
|
|
|
|
|
|
|
138
|
|
|
|
|
|
|
# method to set the association database |
139
|
|
|
|
|
|
|
# input : $params <- reference to a hash |
140
|
|
|
|
|
|
|
# output: none, but association database is set and initialized |
141
|
|
|
|
|
|
|
sub _setDatabase {
    # Opens the MySQL connection to the association database and stores the
    # handle in the file-scoped $assocDB_G.
    #   input : $params <- reference to a hash; the keys read here are
    #                      database, hostname, socket, port, username and
    #                      password. May be undef.
    #   output: none, but $assocDB_G is connected and configured
    my $self   = shift;
    my $params = shift;

    my $function = "_setDatabase";
    _debug($function);

    # check self
    if (!defined $self || !ref $self) {
        $errorhandler->_error($pkg, $function, "", 2);
    }

    # check the params
    $params = {} if !defined $params;

    # get the database connection parameters
    my $database = $params->{'database'};
    my $hostname = $params->{'hostname'};
    my $socket   = $params->{'socket'};
    my $port     = $params->{'port'};
    my $username = $params->{'username'};
    my $password = $params->{'password'};

    # set up defaults if the options were not passed
    if (!defined $database) { $database = "cuicounts"; }
    if (!defined $socket)   { $socket   = "/var/run/mysqld/mysqld.sock"; }
    if (!defined $hostname) { $hostname = "localhost"; }

    # initialize the database handler
    $assocDB_G = "";

    # create the database object...
    if (defined $username and defined $password) {
        if ($debug) { print STDERR "Connecting with username and password\n"; }
        my $dsn = "DBI:mysql:database=$database;mysql_socket=$socket;host=$hostname";
        # honour an explicitly supplied port; it used to be read and then
        # dropped, so the option had no effect
        $dsn .= ";port=$port" if defined $port;
        $assocDB_G = DBI->connect($dsn, $username, $password, {RaiseError => 0});
    }
    else {
        if ($debug) { print STDERR "Connecting using the my.cnf file\n"; }
        my $dsn = "DBI:mysql:umls;mysql_read_default_group=client;database=$database";
        $assocDB_G = DBI->connect($dsn);
    }

    # let the error handler inspect the handle for a connection error
    $errorhandler->_checkDbError($pkg, $function, $assocDB_G);

    # check that the db exists
    if (!$assocDB_G) { $errorhandler->_error($pkg, $function, "Error with db.", 3); }

    # set database parameters
    $assocDB_G->{'mysql_enable_utf8'} = 1;
    $assocDB_G->do('SET NAMES utf8');
    $assocDB_G->{mysql_auto_reconnect} = 1;
}
194
|
|
|
|
|
|
|
|
195
|
|
|
|
|
|
|
###################################################################### |
196
|
|
|
|
|
|
|
# public interface to get observed counts |
197
|
|
|
|
|
|
|
###################################################################### |
198
|
|
|
|
|
|
|
|
199
|
|
|
|
|
|
|
# Gets observed counts (n11, n1p, np1, npp) of the cui sets |
200
|
|
|
|
|
|
|
# input: $pairHashListRef - a ref to an array of pairHashes |
201
|
|
|
|
|
|
|
# output: \@allStatsRef - a ref to an array of observed counts 4-tuples |
202
|
|
|
|
|
|
|
# each 4-tuple consists of in order: |
203
|
|
|
|
|
|
|
# $n11, $n1p, $np1, and $npp |
204
|
|
|
|
|
|
|
# and they correspond to the observed counts of |
205
|
|
|
|
|
|
|
# each of the pairHashes passed in |
206
|
|
|
|
|
|
|
sub getObservedCounts {
    # Public entry point: returns the observed counts for every pairHash by
    # delegating to the back end selected by the option variables. LTA is
    # tried first, then MWA, then VSA; otherwise the matrix file is used
    # when one was given, and the database when not.
    #   input : $pairHashListRef <- ref to an array of pairHashes
    #   output: whatever the selected _getStats_* method returns; for the
    #           matrix and DB back ends this is a ref to an array holding
    #           one [$n11, $n1p, $np1, $npp] array ref per pairHash
    my ($self, $pairHashListRef) = @_;

    my $function = "getObservedCounts";
    unless (defined $self && ref $self) {
        $errorhandler->_error($pkg, $function, "", 2);
    }

    # exactly one back end runs; each call is made in scalar context
    my $allStatsRef =
          $lta_G    ? $self->_getStats_LTA($pairHashListRef)
        : $mwa_G    ? $self->_getStats_MWA($pairHashListRef)
        : $vsa_G    ? $self->_getStats_VSA($pairHashListRef)
        : $matrix_G ? $self->_getStats_matrix($pairHashListRef)
        :             $self->_getStats_DB($pairHashListRef);

    return $allStatsRef;
}
241
|
|
|
|
|
|
|
|
242
|
|
|
|
|
|
|
|
243
|
|
|
|
|
|
|
###################################################################### |
244
|
|
|
|
|
|
|
# functions to get statistical information about the cuis using a DB |
245
|
|
|
|
|
|
|
###################################################################### |
246
|
|
|
|
|
|
|
|
247
|
|
|
|
|
|
|
# gets N11, N1P, NP1, NPP for a pairHashList using a database |
248
|
|
|
|
|
|
|
# input : $pairHashListRef <- ref to a pairHashList |
249
|
|
|
|
|
|
|
# output: $\@data <- array ref containing array refs of four values |
250
|
|
|
|
|
|
|
# for each pair Hash, $n11, $n1p, $np1, and $npp |
251
|
|
|
|
|
|
|
sub _getStats_DB {
    # Collects n11, n1p, np1 and npp for every pairHash from the database.
    #   input : $pairHashListRef <- ref to an array of pairHashes, each with
    #                               'set1' and 'set2' entries
    #   output: ref to an array with one [$n11, $n1p, $np1, $npp] array ref
    #           per pairHash, in input order
    my ($self, $pairHashListRef) = @_;

    my $function = "_getStats_DB";
    unless (defined $self && ref $self) {
        $errorhandler->_error($pkg, $function, "", 2);
    }

    # npp does not depend on the pair, so it is fetched once
    my $npp = $self->_getNpp_DB();

    my @data;
    for my $pair (@{$pairHashListRef}) {
        my $n11 = $self->_getN11_DB($pair->{'set1'}, $pair->{'set2'});
        my $n1p = $self->_getN1p_DB($pair->{'set1'});
        my $np1 = $self->_getNp1_DB($pair->{'set2'});
        push @data, [$n11, $n1p, $np1, $npp];
    }

    return \@data;
}
280
|
|
|
|
|
|
|
|
281
|
|
|
|
|
|
|
# Gets N11 of the cui pair using a database |
282
|
|
|
|
|
|
|
# input: $cuis1Ref <- ref to an array of the first cuis in a set of cui pairs |
283
|
|
|
|
|
|
|
# $cuis2Ref <- ref to an array of the second cuis in a set of cui pairs |
284
|
|
|
|
|
|
|
# output: $n11 <- n11 of cui sets |
285
|
|
|
|
|
|
|
sub _getN11_DB {
    # Sums n_11 over the rows of table N_11 whose cui_1 is in the first set
    # and whose cui_2 is in the second set. When order is ignored
    # ($noOrder_G), rows with the two sets swapped are included as well.
    #   input : $cuis1Ref <- ref to an array of the first cuis of the pairs
    #           $cuis2Ref <- ref to an array of the second cuis of the pairs
    #   output: $n11 <- the summed count, or 0 when nothing matches
    my $self     = shift;
    my $cuis1Ref = shift;
    my $cuis2Ref = shift;

    #error checking
    my $function = "_getN11_DB";
    if (!defined $self || !ref $self) {
        $errorhandler->_error($pkg, $function, "", 2);
    }

    # work on copies so the caller's arrays are never touched
    my @cuis1 = @{ $cuis1Ref || [] };
    my @cuis2 = @{ $cuis2Ref || [] };

    # an empty set cannot co-occur with anything
    return 0 if !@cuis1 || !@cuis2;

    # one placeholder per cui; the values are bound by the driver rather
    # than interpolated into the SQL text
    my $in1 = join ', ', ('?') x @cuis1;
    my $in2 = join ', ', ('?') x @cuis2;

    my $queryString = "select SUM(n_11) from N_11 where "
                    . "(cui_1 in ($in1) and cui_2 in ($in2))";
    my @bind = (@cuis1, @cuis2);
    if ($noOrder_G) {
        #swap the positions of the cuis
        $queryString .= " or (cui_1 in ($in2) and cui_2 in ($in1))";
        push @bind, @cuis2, @cuis1;
    }

    #query the DB and return n11; a failed query (undef result) or a NULL
    # sum both count as 0
    my $rows = $assocDB_G->selectcol_arrayref($queryString, undef, @bind);
    my $n11  = $rows ? $rows->[0] : undef;
    return defined $n11 ? $n11 : 0;
}
339
|
|
|
|
|
|
|
|
340
|
|
|
|
|
|
|
# Method to return the np1 of a concept using a database |
341
|
|
|
|
|
|
|
# input : $cuis2Ref <- ref to an array of the second cuis in a set of cui pairs |
342
|
|
|
|
|
|
|
# output: $np1 <- number of times the cuis2Ref set occurs in second bigram position |
343
|
|
|
|
|
|
|
sub _getNp1_DB {
    # Sums n_11 over the rows of table N_11 whose cui_2 is in the given set.
    # When order is ignored ($noOrder_G), rows whose cui_1 is in the set are
    # included as well.
    #   input : $cuis2Ref <- ref to an array of the second cuis of the pairs
    #   output: $np1 <- the summed count, or -1 when there is no value
    my $self     = shift;
    my $cuis2Ref = shift;

    #error checking
    my $function = "_getNp1_DB";
    if (!defined $self || !ref $self) {
        $errorhandler->_error($pkg, $function, "", 2);
    }

    # work on a copy so the caller's array is never touched
    my @cuis2 = @{ $cuis2Ref || [] };

    # nothing to look up: report "no value" just like a NULL sum
    return -1 if !@cuis2;

    # one placeholder per cui; the values are bound by the driver rather
    # than interpolated into the SQL text
    my $in = join ', ', ('?') x @cuis2;

    my $queryString = "select SUM(n_11) from N_11 where cui_2 in ($in)";
    my @bind = @cuis2;
    if ($noOrder_G) {
        #add where cui2 is in the first position
        $queryString .= " or cui_1 in ($in)";
        push @bind, @cuis2;
    }

    #query the db to retrieve np1; a failed query (undef result) or a NULL
    # sum both yield -1
    my $rows = $assocDB_G->selectcol_arrayref($queryString, undef, @bind);
    my $np1  = $rows ? $rows->[0] : undef;
    return defined $np1 ? $np1 : -1;
}
380
|
|
|
|
|
|
|
|
381
|
|
|
|
|
|
|
# Method to return the n1p of a concept from a database |
382
|
|
|
|
|
|
|
# input : $cuis1Ref <- ref to an array of the first cuis in a set of cui pairs |
383
|
|
|
|
|
|
|
# output: $n1p <- number of times cuis in cuis1 set occurs in first bigram position |
384
|
|
|
|
|
|
|
sub _getN1p_DB {
    # Sums n_11 over the rows of table N_11 whose cui_1 is in the given set.
    # When order is ignored ($noOrder_G), rows whose cui_2 is in the set are
    # included as well.
    #   input : $cuis1Ref <- ref to an array of the first cuis of the pairs
    #   output: $n1p <- the summed count, or -1 when there is no value
    my $self     = shift;
    my $cuis1Ref = shift;

    #error checking
    my $function = "_getN1p_DB";
    if (!defined $self || !ref $self) {
        $errorhandler->_error($pkg, $function, "", 2);
    }

    # work on a copy so the caller's array is never touched
    my @cuis1 = @{ $cuis1Ref || [] };

    # nothing to look up: report "no value" just like a NULL sum
    return -1 if !@cuis1;

    # one placeholder per cui; the values are bound by the driver rather
    # than interpolated into the SQL text
    my $in = join ', ', ('?') x @cuis1;

    my $queryString = "select SUM(n_11) from N_11 where cui_1 in ($in)";
    my @bind = @cuis1;
    if ($noOrder_G) {
        #add where cui1 is in the second position
        $queryString .= " or cui_2 in ($in)";
        push @bind, @cuis1;
    }

    #query the db to retrieve n1p; a failed query (undef result) or a NULL
    # sum both yield -1
    my $rows = $assocDB_G->selectcol_arrayref($queryString, undef, @bind);
    my $n1p  = $rows ? $rows->[0] : undef;
    return defined $n1p ? $n1p : -1;
}
421
|
|
|
|
|
|
|
|
422
|
|
|
|
|
|
|
# Method to calculate npp from a DB |
423
|
|
|
|
|
|
|
# input : none |
424
|
|
|
|
|
|
|
# output: $npp |
425
|
|
|
|
|
|
|
sub _getNpp_DB {
    # Returns npp, the sum of all n_11 values in table N_11. The sum is
    # doubled when order is ignored ($noOrder_G).
    #   input : none
    #   output: $npp; error code 5 is reported through the error handler
    #           when the total is not positive
    my $self = shift;

    #error checking
    my $function = "_getNpp_DB";
    if (!defined $self || !ref $self) {
        $errorhandler->_error($pkg, $function, "", 2);
    }

    #get npp, the number of co-occurrences; a failed query (undef result)
    # or a NULL sum is treated as 0 so the check below reports it instead
    # of the arithmetic running on undef
    my $rows = $assocDB_G->selectcol_arrayref("select sum(N_11) from N_11");
    my $npp  = ($rows && defined $rows->[0]) ? $rows->[0] : 0;

    #update $npp for noOrder, since Cuis can be trailing or leading its 2x ordered npp
    if ($noOrder_G) {
        $npp *= 2;
    }

    #return npp
    if ($npp <= 0) { $errorhandler->_error($pkg, $function, "", 5); }
    return $npp;
}
446
|
|
|
|
|
|
|
|
447
|
|
|
|
|
|
|
######################################################################## |
448
|
|
|
|
|
|
|
# functions to get statistical information about the cuis using a matrix |
449
|
|
|
|
|
|
|
######################################################################## |
450
|
|
|
|
|
|
|
|
451
|
|
|
|
|
|
|
|
452
|
|
|
|
|
|
|
# Gets arrays of all first (leading) and second (trailing) cuis |
453
|
|
|
|
|
|
|
# This is used when retrieving data from a matrix flat file |
454
|
|
|
|
|
|
|
# input: $pairHashListRef - a ref to an array of pairHashes |
455
|
|
|
|
|
|
|
# output: (\@cuis1, \@cuis2) - two array refs, the first contains |
456
|
|
|
|
|
|
|
# all leading cuis in the dataset, the |
457
|
|
|
|
|
|
|
# second contains all trailing cuis in |
458
|
|
|
|
|
|
|
# the dataset. |
459
|
|
|
|
|
|
|
sub _getAllLeadingAndTrailingCuis {
    # Flattens the 'set1' and 'set2' entries of every pairHash into two
    # lists, keeping input order and duplicates.
    #   input : $pairHashListRef <- ref to an array of pairHashes
    #   output: (\@cuis1, \@cuis2) <- all 'set1' cuis and all 'set2' cuis
    my ($self, $pairHashListRef) = @_;

    my (@cuis1, @cuis2);
    for my $pair (@{$pairHashListRef}) {
        push @cuis1, $_ for @{ $pair->{'set1'} };
        push @cuis2, $_ for @{ $pair->{'set2'} };
    }

    return (\@cuis1, \@cuis2);
}
476
|
|
|
|
|
|
|
|
477
|
|
|
|
|
|
|
|
478
|
|
|
|
|
|
|
# gets N11, N1P, NP1, NPP for a pairHashList using a matrix |
479
|
|
|
|
|
|
|
# input : $pairHashListRef <- ref to a pairHashList |
480
|
|
|
|
|
|
|
# output: $\@data <- array ref containing array refs of four values |
481
|
|
|
|
|
|
|
# for each pair Hash, $n11, $n1p, $np1, and $npp |
482
|
|
|
|
|
|
|
sub _getStats_matrix {
    # Collects n11, n1p, np1 and npp for every pairHash from the matrix
    # file. The file is read once for all cuis of all pairs; the per-pair
    # values are then derived from those counts.
    #   input : $pairHashListRef <- ref to an array of pairHashes, each with
    #                               'set1' and 'set2' entries
    #   output: ref to an array with one [$n11, $n1p, $np1, $npp] array ref
    #           per pairHash, in input order
    my ($self, $pairHashListRef) = @_;

    my $function = "_getStats_matrix";
    unless (defined $self && ref $self) {
        $errorhandler->_error($pkg, $function, "", 2);
    }

    # one pass over the matrix for every cui that appears in any pair
    my ($cuis1Ref, $cuis2Ref) = $self->_getAllLeadingAndTrailingCuis($pairHashListRef);
    my $countsRef = $self->_getObservedCounts_matrix($cuis1Ref, $cuis2Ref);
    my ($n11AllRef, $n1pAllRef, $np1AllRef, $npp) = @{$countsRef}[0 .. 3];

    # when order is ignored a cui may lead or trail, so npp is doubled
    $npp *= 2 if $noOrder_G;

    my @data;
    for my $pair (@{$pairHashListRef}) {
        my ($set1, $set2) = ($pair->{'set1'}, $pair->{'set2'});
        my $n11 = $self->_getN11_matrix($set1, $set2, $n11AllRef);
        my $n1p = $self->_getN1p_matrix($set1, $n11AllRef, $n1pAllRef, $np1AllRef);
        my $np1 = $self->_getNp1_matrix($set2, $n11AllRef, $n1pAllRef, $np1AllRef);
        push @data, [$n11, $n1p, $np1, $npp];
    }

    return \@data;
}
520
|
|
|
|
|
|
|
|
521
|
|
|
|
|
|
|
#computes the observed counts for all combinations of the cuis passed in |
522
|
|
|
|
|
|
|
#doing this in a single function makes it so all values can be computed with a |
523
|
|
|
|
|
|
|
#single pass of the input file, making execution time much faster |
524
|
|
|
|
|
|
|
# input : $cuis1Ref <- ref to an array of the first cuis in a set of cui pairs |
525
|
|
|
|
|
|
|
# $cuis2Ref <- ref to an array of the second cuis in a set of cui pairs |
526
|
|
|
|
|
|
|
# output: $\@counts <- array ref containing four sets of values: |
527
|
|
|
|
|
|
|
# \%n11, \%n1p, \%np1, and $npp for the cui pairs |
528
|
|
|
|
|
|
|
# hashes are indexed: $n11{"$cui1,$cui2"}, $n1p{$cui}, |
529
|
|
|
|
|
|
|
# $np1{$cui} |
530
|
|
|
|
|
|
|
sub _getObservedCounts_matrix {
    # Reads the matrix file ($matrix_G) once and gathers every count needed
    # for the given cuis. Each line is expected to hold
    # "cui1<TAB>cui2<TAB>count".
    #   input : $cuis1Ref <- ref to an array of the first cuis of the pairs
    #           $cuis2Ref <- ref to an array of the second cuis of the pairs
    #   output: ref to an array (\%n11, \%n1p, \%np1, $npp), where
    #             $n11{"$cui1,$cui2"} is the count of that ordered pair,
    #             $n1p{$cui} sums the counts of lines with $cui first,
    #             $np1{$cui} sums the counts of lines with $cui second,
    #             $npp sums the counts of every line in the file.
    #           The hashes only cover lines that mention a requested cui.
    my $self     = shift;
    my $cuis1Ref = shift;
    my $cuis2Ref = shift;

    # a line is relevant when either of its cuis is in either input set, so
    # a single lookup hash over both sets is enough
    my %wanted = map { $_ => 1 } @{ $cuis1Ref || [] }, @{ $cuis2Ref || [] };

    #precalculate values for all cuis and cui pairs
    my %n11 = ();
    my %n1p = ();
    my %np1 = ();
    my $npp = 0;

    # three-argument open with a lexical handle: the file name is never
    # parsed for a mode and the handle cannot clash with another IN
    open(my $in, '<', $matrix_G) or die "Cannot open $matrix_G for input: $!\n";
    while (my $line = <$in>) {
        #get cuis and value from the line
        chomp $line;
        my ($cui1, $cui2, $num) = split /\t/, $line;

        # skip blank or malformed lines that lack the three fields
        next if !defined $num;

        #record any occurrence of any cui1 or 2, in case order is ignored
        if ($wanted{$cui1} || $wanted{$cui2}) {
            $n1p{$cui1} += $num;
            $np1{$cui2} += $num;
            # NOTE(review): a repeated pair overwrites the earlier n11 value
            # while n1p/np1 accumulate - confirm the matrix has unique pairs
            $n11{"$cui1,$cui2"} = $num;
        }

        #update npp
        $npp += $num;
    }
    close $in;

    #return counts
    my @counts = (\%n11, \%n1p, \%np1, $npp);
    return \@counts;
}
574
|
|
|
|
|
|
|
|
575
|
|
|
|
|
|
|
# Gets N11 of the cui pair using a matrix |
576
|
|
|
|
|
|
|
# input : $cuis1Ref <- ref to an array of the first cuis in a set of cui pairs |
577
|
|
|
|
|
|
|
# $cuis2Ref <- ref to an array of the second cuis in a set of cui pairs |
578
|
|
|
|
|
|
|
# $n11AllRef <- ref to an array containing n11 values for all possible |
579
|
|
|
|
|
|
|
# cui pairs of the cuis1 and cuis2, of the form |
580
|
|
|
|
|
|
|
# n11All{"$cui1,$cui2"}=value. See _getObservedCounts_matrix |
581
|
|
|
|
|
|
|
# output: $n11 <- frequency of co-occurrences of the cuis in the cui sets |
582
|
|
|
|
|
|
|
sub _getN11_matrix {
    # Sums the precomputed pair counts over every combination of a cui from
    # the first set with a cui from the second set. When order is ignored
    # ($noOrder_G), the counts of the reversed pairs are added as well.
    #   input : $cuis1Ref  <- ref to an array of the first cuis of the pairs
    #           $cuis2Ref  <- ref to an array of the second cuis of the pairs
    #           $n11AllRef <- ref to a hash of pair counts keyed "$cui1,$cui2"
    #   output: $n11 <- the summed count (0 when no pair has a count)
    my ($self, $cuis1Ref, $cuis2Ref, $n11AllRef) = @_;

    my $function = "_getN11_matrix";
    unless (defined $self && ref $self) {
        $errorhandler->_error($pkg, $function, "", 2);
    }

    # ordered pairs: the set-1 cui leads
    my $n11 = 0;
    for my $lead (@{$cuis1Ref}) {
        for my $trail (@{$cuis2Ref}) {
            my $count = $n11AllRef->{"$lead,$trail"};
            next unless defined $count;
            $n11 += $count;
        }
    }

    return $n11 unless $noOrder_G;

    # order ignored: add the pairs in which the set-2 cui leads
    for my $lead (@{$cuis1Ref}) {
        for my $trail (@{$cuis2Ref}) {
            my $count = $n11AllRef->{"$trail,$lead"};
            next unless defined $count;
            $n11 += $count;
        }
    }

    return $n11;
}
622
|
|
|
|
|
|
|
|
623
|
|
|
|
|
|
|
# gets N1P for a concept using a matrix |
624
|
|
|
|
|
|
|
# input : $cuis1Ref <- reference to an array containing the first cuis in a set of cui pairs |
625
|
|
|
|
|
|
|
# $n11AllRef <- ref to a hash containing n11 values for all cui pairs, |
626
|
|
|
|
|
|
|
# of the form n11All{"$cui1,$cui2"} = value. See _getObservedCounts_matrix |
627
|
|
|
|
|
|
|
# $n1pAllRef <- ref to an array containing n1p values for all cuis of cuis1 and cuis2, |
628
|
|
|
|
|
|
|
# of the form n1pAll{$cui} = value. See _getObservedCounts_matrix |
629
|
|
|
|
|
|
|
# $np1AllRef <- ref to a hash containing np1 values for all cuis of cuis1 and cuis2, |
630
|
|
|
|
|
|
|
# of the form np1All{$cui} = value. See _getObservedCounts_matrix |
631
|
|
|
|
|
|
|
# output: $n1p <- the number of times the set of concepts occurs in first position |
632
|
|
|
|
|
|
|
sub _getN1p_matrix { |
633
|
|
|
|
|
|
|
#grab parameters |
634
|
0
|
|
|
0
|
|
|
my $self = shift; |
635
|
0
|
|
|
|
|
|
my $cuis1Ref = shift; |
636
|
0
|
|
|
|
|
|
my $n11AllRef = shift; |
637
|
0
|
|
|
|
|
|
my $n1pAllRef = shift; |
638
|
0
|
|
|
|
|
|
my $np1AllRef = shift; |
639
|
|
|
|
|
|
|
|
640
|
|
|
|
|
|
|
#error checking |
641
|
0
|
|
|
|
|
|
my $function = "_getN1P_matrix"; |
642
|
0
|
0
|
0
|
|
|
|
if(!defined $self || !ref $self) { |
643
|
0
|
|
|
|
|
|
$errorhandler->_error($pkg, $function, "", 2); |
644
|
|
|
|
|
|
|
} |
645
|
|
|
|
|
|
|
|
646
|
|
|
|
|
|
|
#calculate n1p as the sum of n1p's for all cuis1 |
647
|
0
|
|
|
|
|
|
my $n1p = 0; |
648
|
0
|
|
|
|
|
|
foreach my $cui (@{$cuis1Ref}) { |
|
0
|
|
|
|
|
|
|
649
|
0
|
|
|
|
|
|
my $num = ${$n1pAllRef}{$cui}; |
|
0
|
|
|
|
|
|
|
650
|
0
|
0
|
|
|
|
|
if(defined $num) { |
651
|
0
|
|
|
|
|
|
$n1p += $num; |
652
|
|
|
|
|
|
|
} |
653
|
|
|
|
|
|
|
} |
654
|
|
|
|
|
|
|
|
655
|
|
|
|
|
|
|
#update values if ignoring word order |
656
|
0
|
0
|
|
|
|
|
if ($noOrder_G) { |
657
|
|
|
|
|
|
|
#add all np1's to n1p |
658
|
0
|
|
|
|
|
|
foreach my $cui (@{$cuis1Ref}) { |
|
0
|
|
|
|
|
|
|
659
|
0
|
|
|
|
|
|
my $num = ${$np1AllRef}{$cui}; |
|
0
|
|
|
|
|
|
|
660
|
0
|
0
|
|
|
|
|
if(defined $num) { |
661
|
0
|
|
|
|
|
|
$n1p += $num; |
662
|
|
|
|
|
|
|
} |
663
|
|
|
|
|
|
|
} |
664
|
|
|
|
|
|
|
|
665
|
|
|
|
|
|
|
#avoid double counting occurrences with self, subtract them |
666
|
0
|
|
|
|
|
|
foreach my $cui1(@{$cuis1Ref}) { |
|
0
|
|
|
|
|
|
|
667
|
0
|
|
|
|
|
|
foreach my $cui2(@{$cuis1Ref}) { |
|
0
|
|
|
|
|
|
|
668
|
0
|
|
|
|
|
|
my $val = ${$n11AllRef}{"$cui1,$cui2"}; |
|
0
|
|
|
|
|
|
|
669
|
0
|
0
|
|
|
|
|
if (defined $val) { |
670
|
0
|
|
|
|
|
|
$n1p -= $val; |
671
|
|
|
|
|
|
|
} |
672
|
|
|
|
|
|
|
} |
673
|
|
|
|
|
|
|
} |
674
|
|
|
|
|
|
|
} |
675
|
|
|
|
|
|
|
|
676
|
|
|
|
|
|
|
#set n1p to -1 if there are no values for it since this indicates |
677
|
|
|
|
|
|
|
# there is not enough information to calculate the score |
678
|
0
|
0
|
|
|
|
|
if ($n1p == 0) { |
679
|
0
|
|
|
|
|
|
$n1p = -1; |
680
|
|
|
|
|
|
|
} |
681
|
|
|
|
|
|
|
|
682
|
|
|
|
|
|
|
#return the value |
683
|
0
|
|
|
|
|
|
return $n1p; |
684
|
|
|
|
|
|
|
} |
685
|
|
|
|
|
|
|
|
686
|
|
|
|
|
|
|
# Gets NP1 for a set of concepts using counts that were read from a matrix.
# input : $cuis2Ref  <- ref to an array containing the second cuis in a set of
#                       cui pairs
#         $n11AllRef <- ref to a hash of pair counts, keyed by the string
#                       "$cui1,$cui2"
#         $n1pAllRef <- ref to a hash of n1p values, of the form
#                       n1pAll{$cui} = value
#         $np1AllRef <- ref to a hash of np1 values, of the form
#                       np1All{$cui} = value
# output: $np1 <- the number of times the set of concepts occurs in second
#                 position (in either position when order is ignored), or
#                 -1 when that total is 0
sub _getNp1_matrix {
    #grab parameters
    my $self = shift;
    my $cuis2Ref = shift;
    my $n11AllRef = shift;
    my $n1pAllRef = shift;
    my $np1AllRef = shift;

    #error checking, same guard as the other methods of this package
    my $function = "_getNp1_matrix";
    if (!defined $self || !ref $self) {
        $errorhandler->_error($pkg, $function, "", 2);
    }

    #calculate np1 as the sum of np1's for all cuis2
    my $np1 = 0;
    foreach my $cui (@{$cuis2Ref}) {
        my $num = ${$np1AllRef}{$cui};
        if (defined $num) {
            $np1 += $num;
        }
    }

    #update values if ignoring word order
    if ($noOrder_G) {
        #add all n1p's to np1
        foreach my $cui (@{$cuis2Ref}) {
            my $num = ${$n1pAllRef}{$cui};
            if (defined $num) {
                $np1 += $num;
            }
        }

        #a pair made of two cuis of this set was just counted once through
        # np1 and once through n1p, so subtract it to avoid double counting
        foreach my $cui1 (@{$cuis2Ref}) {
            foreach my $cui2 (@{$cuis2Ref}) {
                my $val = ${$n11AllRef}{"$cui1,$cui2"};
                if (defined $val) {
                    $np1 -= $val;
                }
            }
        }
    }

    #set np1 to -1 if there are no values for it since this indicates
    # there is not enough information to calculate the score
    if ($np1 == 0) {
        $np1 = -1;
    }

    #return the value
    return $np1;
}
742
|
|
|
|
|
|
|
|
743
|
|
|
|
|
|
|
|
744
|
|
|
|
|
|
|
######################################################################## |
745
|
|
|
|
|
|
|
# functions to get statistical information about the cuis LTA, MWA, VSA |
746
|
|
|
|
|
|
|
######################################################################## |
747
|
|
|
|
|
|
|
# Builds the contingency table values for Linking Term Association (LTA).
# For every pairHash the counts are taken over linking cuis: n1p and np1 are
# the number of distinct cuis co-occurring with set 1 and with set 2, n11 is
# the number of cuis co-occurring with both, and npp is the number of unique
# cuis in the dataset.
# input : $pairHashListRef <- ref to a pairHashList
# output: \@data <- array ref holding one array ref per pairHash, in the
#                   order of the pairHashList, each containing
#                   ($n11, $n1p, $np1, $npp)
sub _getStats_LTA {
    my ($self, $pairHashListRef) = @_;

    #error checking
    my $function = "_getStats_LTA";
    unless (defined $self && ref $self) {
        $errorhandler->_error($pkg, $function, "", 2);
    }

    #read the co-occurrence hashes and the dataset-wide totals
    my ($linked1ListRef, $linked2ListRef, $numCooccurrences, $numUniqueCuis)
        = $self->_readMatrixValues_Linking($pairHashListRef);

    #for LTA, npp is the number of unique cuis in the dataset
    my $npp = $numUniqueCuis;

    #one row of counts per pairHash
    my @data = ();
    my $pairCount = scalar @{$pairHashListRef};
    foreach my $index (0 .. $pairCount - 1) {
        my $linked1Ref = ${$linked1ListRef}[$index];
        my $linked2Ref = ${$linked2ListRef}[$index];

        #number of linking cuis on each side
        my $n1p = scalar keys %{$linked1Ref};
        my $np1 = scalar keys %{$linked2Ref};

        #number of linking cuis shared by both sides
        my $n11 = scalar grep { exists ${$linked2Ref}{$_} } keys %{$linked1Ref};

        push @data, [$n11, $n1p, $np1, $npp];
    }

    return \@data;
}
795
|
|
|
|
|
|
|
|
796
|
|
|
|
|
|
|
|
797
|
|
|
|
|
|
|
# Builds the contingency table values for Minimum Weight Association (MWA).
# For every pairHash, n1p and np1 are the summed co-occurrence values of the
# cuis linked to set 1 and to set 2, n11 is the sum over the shared linking
# cuis of the smaller of their two values, and npp is the total number of
# co-occurrences in the dataset.
# input : $pairHashListRef <- ref to a pairHashList
# output: \@data <- array ref holding one array ref per pairHash, in the
#                   order of the pairHashList, each containing
#                   ($n11, $n1p, $np1, $npp)
sub _getStats_MWA {
    my ($self, $pairHashListRef) = @_;

    #error checking
    my $function = "_getStats_MWA";
    unless (defined $self && ref $self) {
        $errorhandler->_error($pkg, $function, "", 2);
    }

    #read the co-occurrence hashes and the dataset-wide totals
    my ($weights1ListRef, $weights2ListRef, $numCooccurrences, $numUniqueCuis)
        = $self->_readMatrixValues_Linking($pairHashListRef);

    #for MWA, npp is the number of co-occurrences in the dataset
    my $npp = $numCooccurrences;

    #one row of counts per pairHash
    my @data = ();
    my $pairCount = scalar @{$pairHashListRef};
    foreach my $index (0 .. $pairCount - 1) {
        my $weights1Ref = ${$weights1ListRef}[$index];
        my $weights2Ref = ${$weights2ListRef}[$index];

        #total weight on each side
        my $n1p = 0;
        foreach my $weight (values %{$weights1Ref}) {
            $n1p += $weight;
        }
        my $np1 = 0;
        foreach my $weight (values %{$weights2Ref}) {
            $np1 += $weight;
        }

        #for each shared linking cui add the smaller of its two weights
        my $n11 = 0;
        foreach my $cui (keys %{$weights1Ref}) {
            next unless exists ${$weights2Ref}{$cui};
            my $weight1 = ${$weights1Ref}{$cui};
            my $weight2 = ${$weights2Ref}{$cui};
            $n11 += ($weight2 < $weight1) ? $weight2 : $weight1;
        }

        push @data, [$n11, $n1p, $np1, $npp];
    }

    return \@data;
}
859
|
|
|
|
|
|
|
|
860
|
|
|
|
|
|
|
|
861
|
|
|
|
|
|
|
# Builds the contingency table values for Vector Set Association (VSA).
# Each original pairHash is replaced by a new pairHash whose set1 and set2 are
# the cuis that co-occur with the original set1 and set2. The stats of those
# new pairHashes are then used as the stats of the original pairs, so the
# association is measured between the sets of co-occurring terms.
# input : $pairHashListRef <- ref to a pairHashList
# output: $allStatsRef <- whatever _getStats_matrix (matrix mode) or
#                         _getStats_DB (database mode) returns for the new
#                         pairHashList
sub _getStats_VSA {
    my ($self, $pairHashListRef) = @_;

    #error checking
    my $function = "_getStats_VSA";
    unless (defined $self && ref $self) {
        $errorhandler->_error($pkg, $function, "", 2);
    }

    #read the co-occurrence hashes (the totals are not needed here)
    my ($cooccurrences1ListRef, $cooccurrences2ListRef,
        $numCooccurrences, $numUniqueCuis)
        = $self->_readMatrixValues_Linking($pairHashListRef);

    #turn each pair of co-occurrence hashes into a pairHash of cui lists
    my @newPairHashList = ();
    my $pairCount = scalar @{$pairHashListRef};
    foreach my $index (0 .. $pairCount - 1) {
        my @set1 = keys %{${$cooccurrences1ListRef}[$index]};
        my @set2 = keys %{${$cooccurrences2ListRef}[$index]};
        push @newPairHashList, { 'set1' => \@set1, 'set2' => \@set2 };
    }

    #get the stats of the new pairHashes from the configured data source
    my $allStatsRef = $matrix_G
        ? $self->_getStats_matrix(\@newPairHashList)
        : $self->_getStats_DB(\@newPairHashList);

    return $allStatsRef;
}
920
|
|
|
|
|
|
|
|
921
|
|
|
|
|
|
|
|
922
|
|
|
|
|
|
|
|
923
|
|
|
|
|
|
|
# Gets co-occurrence data for each of the pairHashes in the pairHashList
# and gets global stats: the total number of co-occurrences in the dataset
# and the number of unique cuis in the dataset. The co-occurrence data is
# returned as one co-occurrences hash for set1 and one for set2 of every
# pairHash. Each co-occurrences hash is:
#        $cooccurrences1{$cui2} = $val
# There is no distinction between different cuis of cuis1.
# The data comes from the matrix file when $matrix_G is set and from the
# database otherwise.
# input : $pairHashListRef <- ref to a pairHashList
# output: \@cooccurrences1List <- array ref, one co-occurrences hash ref per
#                                 pairHash for its set1
#         \@cooccurrences2List <- array ref, one co-occurrences hash ref per
#                                 pairHash for its set2
#         $totalCooccurrences  <- the total number of co-occurrences in the
#                                 dataset
#         $totalUniqueCuis     <- the number of unique cuis in the dataset
sub _readMatrixValues_Linking {
    #grab parameters
    my $self = shift;
    my $pairHashListRef = shift;

    #error checking
    my $function = "_readMatrixValues_Linking";
    if (!defined $self || !ref $self) {
        $errorhandler->_error($pkg, $function, "", 2);
    }

    #Get co-occurrences with each set of CUIs
    # for each set of cuis we find a list of cuis that co-occur with that set
    # this is done for cuis1 and cuis2. Once these two lists of co-occurring
    # cuis are retrieved, LTA can be calculated from the overlap of
    # co-occurrences.
    my @cooccurrences1List = ();
    my @cooccurrences2List = ();
    my $totalCooccurrences = 0;
    my $totalUniqueCuis = 0;
    if ($matrix_G) {
        #get observed counts for all data in one pass over the matrix
        (my $cuis1Ref, my $cuis2Ref)
            = $self->_getAllLeadingAndTrailingCuis($pairHashListRef);
        (my $n1pAllRef, my $np1AllRef, $totalCooccurrences, $totalUniqueCuis)
            = $self->_getObserved_matrix_Linking($cuis1Ref, $cuis2Ref);

        #get co-occurrence data for each pairHash
        foreach my $pairHashRef (@{$pairHashListRef}) {
            (my $cooccurrences1Ref, my $cooccurrences2Ref)
                = $self->_getCUICooccurrences_matrix(
                    ${$pairHashRef}{'set1'}, ${$pairHashRef}{'set2'},
                    $n1pAllRef, $np1AllRef);

            push @cooccurrences1List, $cooccurrences1Ref;
            push @cooccurrences2List, $cooccurrences2Ref;
        }
    }
    else {
        #get total co-occurrences
        $totalCooccurrences = $self->_getNpp_DB();

        #get the number of unique cuis. UNION merges the cui_1 and cui_2
        # columns and drops duplicates, so cuis that only ever occur in the
        # second position are counted too (the matrix mode counts both
        # columns as well)
        my $uniqueCuisQuery = "SELECT COUNT(*) FROM "
            . "(SELECT cui_1 AS cui FROM N_11 "
            . "UNION SELECT cui_2 AS cui FROM N_11) AS names";
        $totalUniqueCuis
            = shift @{$assocDB_G->selectcol_arrayref($uniqueCuisQuery)};

        #TODO, check this with MWA now ...will need to code it
        #get co-occurrence data for each pair hash
        foreach my $pairHashRef (@{$pairHashListRef}) {
            (my $cooccurrences1Ref, my $cooccurrences2Ref)
                = $self->_getCUICooccurrences_DB(
                    ${$pairHashRef}{'set1'}, ${$pairHashRef}{'set2'});

            push @cooccurrences1List, $cooccurrences1Ref;
            push @cooccurrences2List, $cooccurrences2Ref;
        }
    }

    return (\@cooccurrences1List, \@cooccurrences2List,
            $totalCooccurrences, $totalUniqueCuis);
}
988
|
|
|
|
|
|
|
|
989
|
|
|
|
|
|
|
|
990
|
|
|
|
|
|
|
# Computes the observed co-occurrences for all combinations of the cuis
# passed in. Doing this in a single function means all values are computed
# with a single pass over the matrix file ($matrix_G), which makes execution
# much faster. Each line of the file is read as three tab separated fields:
# leading cui, trailing cui, co-occurrence count.
# input : $cuis1Ref <- ref to an array of the first cuis in a set of cui pairs
#         $cuis2Ref <- ref to an array of the second cuis in a set of cui pairs
# output: \%n1pAll <- a ref to a hash of hashes that contains co-occurrence
#                     data organized as:
#                     matrix{leadingCUI}{trailingCUI} = cooccurrencecount
#         \%np1All <- a ref to a hash of hashes that contains co-occurrence
#                     data organized as:
#                     matrix{trailingCUI}{leadingCUI} = cooccurrencecount
#         $cooccurrenceCount <- the total number of co-occurrences in
#                               the dataset
#         $numUniqueCuis <- the number of unique cuis in the dataset
sub _getObserved_matrix_Linking {
    #grab parameters
    my $self = shift;
    my $cuis1Ref = shift;
    my $cuis2Ref = shift;

    #error checking, same guard as the other methods of this package
    my $function = "_getObserved_matrix_Linking";
    if (!defined $self || !ref $self) {
        $errorhandler->_error($pkg, $function, "", 2);
    }

    #convert cui arrays to hashes, makes looping thru
    # the file faster
    my %cuis1 = ();
    foreach my $cui (@{$cuis1Ref}) {
        $cuis1{$cui} = 1;
    }
    my %cuis2 = ();
    foreach my $cui (@{$cuis2Ref}) {
        $cuis2{$cui} = 1;
    }

    #get stats
    my %n1pAll = ();
    my %np1All = ();
    my %uniqueCuis = ();
    my $cooccurrenceCount = 0;

    #three-arg open on a lexical handle: the file name can never be taken
    # as an open mode and no global filehandle is clobbered
    open my $matrixFh, '<', $matrix_G
        or die "Cannot open matrix_G for input: $matrix_G\n";
    while (my $line = <$matrixFh>) {
        #get cuis and value from the line
        chomp $line;
        my ($cui1, $cui2, $num) = split /\t/, $line;

        #update unique cui lists
        $uniqueCuis{$cui1} = 1;
        $uniqueCuis{$cui2} = 1;

        #update co-occurrence count
        $cooccurrenceCount += $num;

        #update n1pAll and np1All. These just record data
        # so we record any possible co-occurrence that matters
        # with or without order mattering so just check
        # if a CUI of interest is anywhere on the line
        if (exists $cuis1{$cui1} || exists $cuis2{$cui2}
            || exists $cuis1{$cui2} || exists $cuis2{$cui1}) {

            #the inner hashes are created on first use (autovivification)
            $n1pAll{$cui1}{$cui2} = $num;
            $np1All{$cui2}{$cui1} = $num;
        }
    }
    close $matrixFh;

    #return the observed values
    return (\%n1pAll, \%np1All, $cooccurrenceCount, (scalar keys %uniqueCuis));
}
1073
|
|
|
|
|
|
|
|
1074
|
|
|
|
|
|
|
|
1075
|
|
|
|
|
|
|
# Gets hashes of CUIs that co-occur with the sets of cuis1 and cuis2 using
# data read from a matrix. This is the first step in computing linking term
# associations.
# input : $cuis1Ref <- ref to an array of the first cuis in a set of cui pairs
#         $cuis2Ref <- ref to an array of the second cuis in a set of cui pairs
#         $n1pAllRef <- a ref to a hash of hashes that contains co-occurrence
#                       data organized as:
#                       matrix{leadingCUI}{trailingCUI} = cooccurrencecount
#         $np1AllRef <- a ref to a hash of hashes that contains co-occurrence
#                       data organized as:
#                       matrix{trailingCUI}{leadingCUI} = cooccurrencecount
# output: \%cooccurrences1 <- hash ref, keys are co-occurring cuis with cuis1,
#                             values are the co-occurrence count
#         \%cooccurrences2 <- hash ref, keys are co-occurring cuis with cuis2,
#                             values are the co-occurrence count
sub _getCUICooccurrences_matrix {
    #grab parameters
    my $self = shift;
    my $cuis1Ref = shift;
    my $cuis2Ref = shift;
    my $n1pAllRef = shift;
    my $np1AllRef = shift;

    #error checking (the reported name now matches the sub name)
    my $function = "_getCUICooccurrences_matrix";
    if (!defined $self || !ref $self) {
        $errorhandler->_error($pkg, $function, "", 2);
    }

    #copies matrix{$cui}{$other} into target{$other} for every cui of a set.
    # NOTE(review): when the same $other is reached more than once (through
    # several cuis of the set, or through both directions when order is
    # ignored) the later count replaces the earlier one instead of being
    # added to it - this is the original behavior; confirm it is what MWA
    # expects
    my $collect = sub {
        my ($targetRef, $cuisRef, $matrixRef) = @_;
        foreach my $cui (@{$cuisRef}) {
            my $rowRef = ${$matrixRef}{$cui};
            next if (!defined $rowRef);
            foreach my $other (keys %{$rowRef}) {
                ${$targetRef}{$other} = ${$rowRef}{$other};
            }
        }
        return;
    };

    #get lists of explicitly co-occurring CUIs for each concept
    #add trailing cui co-occurrences to cui1Data
    my %cooccurrences1;
    $collect->(\%cooccurrences1, $cuis1Ref, $n1pAllRef);

    #add leading cui co-occurrences to cui2Data
    my %cooccurrences2;
    $collect->(\%cooccurrences2, $cuis2Ref, $np1AllRef);

    #add more CUIs if order doesn't matter
    if ($noOrder_G) {
        #add leading cui co-occurrences to cui1Data
        $collect->(\%cooccurrences1, $cuis1Ref, $np1AllRef);

        #add trailing cui co-occurrences to cui2Data
        $collect->(\%cooccurrences2, $cuis2Ref, $n1pAllRef);
    }

    return (\%cooccurrences1, \%cooccurrences2);
}
1146
|
|
|
|
|
|
|
|
1147
|
|
|
|
|
|
|
|
1148
|
|
|
|
|
|
|
# Gets hashes of CUIs that co-occur with the sets of cuis1 and cuis2 using
# the N_11 table of the database. This is the first step in computing linking
# term associations. Only rows with n_11 > 0 are considered. The cui arrays
# passed in are never modified, and no query is run for an empty set.
# input : $cuis1Ref <- ref to an array of the first cuis in a set of cui pairs
#         $cuis2Ref <- ref to an array of the second cuis in a set of cui pairs
# output: \%cooccurrences1 <- hash ref, keys are co-occurring cuis with cuis1,
#                             values are 1
#         \%cooccurrences2 <- hash ref, keys are co-occurring cuis with cuis2,
#                             values are 1
sub _getCUICooccurrences_DB {
    #grab parameters
    my $self = shift;
    my $cuis1Ref = shift;
    my $cuis2Ref = shift;

    #error checking
    my $function = "_getCUICooccurrences_DB";
    if (!defined $self || !ref $self) {
        $errorhandler->_error($pkg, $function, "", 2);
    }

    #adds to the target hash every cui found in $returnColumn of a row whose
    # $matchColumn holds one of the cuis of the set. The column names are
    # only ever the literals passed below; the cuis themselves are sent as
    # bind values so they can never alter the SQL statement
    my $addPartners = sub {
        my ($targetRef, $cuisRef, $matchColumn, $returnColumn) = @_;
        my @setCuis = defined $cuisRef ? @{$cuisRef} : ();
        return if (scalar @setCuis == 0);

        my $placeholders = join(', ', ('?') x scalar(@setCuis));
        my $query = "SELECT N_11.$returnColumn FROM N_11 "
            . "WHERE N_11.$matchColumn IN ($placeholders) AND N_11.n_11 > 0";
        my $partnersRef
            = $assocDB_G->selectcol_arrayref($query, undef, @setCuis);
        foreach my $partner (@{$partnersRef}) {
            ${$targetRef}{$partner} = 1;
        }
        return;
    };

    #get hashes of co-occurring CUIs
    my %cooccurrences1 = ();
    my %cooccurrences2 = ();

    #get cuis, where concept 1 is the leading cui
    $addPartners->(\%cooccurrences1, $cuis1Ref, 'cui_1', 'cui_2');

    #get cuis, where concept 2 is the trailing cui
    $addPartners->(\%cooccurrences2, $cuis2Ref, 'cui_2', 'cui_1');

    #add additional cuis if order doesn't matter
    if ($noOrder_G) {
        #get cuis, where concept 1 is the trailing cui
        $addPartners->(\%cooccurrences1, $cuis1Ref, 'cui_2', 'cui_1');

        #get cuis, where concept 2 is the leading cui
        $addPartners->(\%cooccurrences2, $cuis2Ref, 'cui_1', 'cui_2');
    }

    #return the cui co-occurrences
    return (\%cooccurrences1, \%cooccurrences2);
}
1238
|
|
|
|
|
|
|
|
1239
|
|
|
|
|
|
|
|
1240
|
|
|
|
|
|
|
=comment |
1241
|
|
|
|
|
|
|
# Gets hashes of CUIs that co-occur with the sets of cuis1 and cuis2 using |
1242
|
|
|
|
|
|
|
# a database. This is the first step in computing linking term associations |
1243
|
|
|
|
|
|
|
# input : $cuis1Ref <- ref to an array of the first cuis in a set of cui pairs |
1244
|
|
|
|
|
|
|
# $cuis2Ref <- ref to an array of the second cuis in a set of cui pairs |
1245
|
|
|
|
|
|
|
# output: \%cooccurrences1 <- hash ref, keys are co-occurring cuis with cui 1, |
1246
|
|
|
|
|
|
|
# values are 1 |
1247
|
|
|
|
|
|
|
# \%cooccurrences2 <- hash ref, keys are co-occurring cuis with cui 2, |
1248
|
|
|
|
|
|
|
# values are 1 |
1249
|
|
|
|
|
|
|
sub _getCUICooccurrences_DB { |
1250
|
|
|
|
|
|
|
#grab parameters |
1251
|
|
|
|
|
|
|
my $self = shift; |
1252
|
|
|
|
|
|
|
my $cuis1Ref = shift; |
1253
|
|
|
|
|
|
|
my $cuis2Ref = shift; |
1254
|
|
|
|
|
|
|
|
1255
|
|
|
|
|
|
|
#error checking |
1256
|
|
|
|
|
|
|
my $function = "_getCUICooccurrences_DB"; |
1257
|
|
|
|
|
|
|
if(!defined $self || !ref $self) { |
1258
|
|
|
|
|
|
|
$errorhandler->_error($pkg, $function, "", 2); |
1259
|
|
|
|
|
|
|
} |
1260
|
|
|
|
|
|
|
|
1261
|
|
|
|
|
|
|
#get hashes of co-occurring CUIs |
1262
|
|
|
|
|
|
|
my %cooccurrences1 = (); |
1263
|
|
|
|
|
|
|
my %cooccurrences2 = (); |
1264
|
|
|
|
|
|
|
|
1265
|
|
|
|
|
|
|
#query DB to get cuis, where concept 1 is the leading cui |
1266
|
|
|
|
|
|
|
my $firstCui = shift @{$cuis1Ref}; |
1267
|
|
|
|
|
|
|
my $query = "SELECT * FROM N_11 WHERE (N_11.cui_1 = '$firstCui' "; |
1268
|
|
|
|
|
|
|
foreach my $cui (@{$cuis1Ref}) { |
1269
|
|
|
|
|
|
|
$query .= "OR N_11.cui_1 = '$cui' "; |
1270
|
|
|
|
|
|
|
} |
1271
|
|
|
|
|
|
|
$query .= ") AND N_11.n_11 > 0;"; |
1272
|
|
|
|
|
|
|
my $sth = $assocDB_G->prepare($query); |
1273
|
|
|
|
|
|
|
$sth->execute(); |
1274
|
|
|
|
|
|
|
my @rows = @{$sth->fetchall_arrayref()}; |
1275
|
|
|
|
|
|
|
unshift @{$cuis1Ref}, $firstCui; |
1276
|
|
|
|
|
|
|
|
1277
|
|
|
|
|
|
|
#turn CUIs into a hash of cui1's cooccurrences |
1278
|
|
|
|
|
|
|
foreach my $rowRef (@rows) { |
1279
|
|
|
|
|
|
|
print STDERR join(' ', @{$rowRef})."\n"; |
1280
|
|
|
|
|
|
|
} |
1281
|
|
|
|
|
|
|
#TODO - the query above works and returns every matching N_11 row; the rows are only printed to STDERR for now and still need to be used to fill the co-occurrence hashes as needed. |
1282
|
|
|
|
|
|
|
|
1283
|
|
|
|
|
|
|
|
1284
|
|
|
|
|
|
|
my @cuis; |
1285
|
|
|
|
|
|
|
#query DB to get cuis, where concept 2 is the trailing cui |
1286
|
|
|
|
|
|
|
$firstCui = shift @{$cuis2Ref}; |
1287
|
|
|
|
|
|
|
$query = "SELECT N_11.cui_1 FROM N_11 WHERE (N_11.cui_2 = '$firstCui' "; |
1288
|
|
|
|
|
|
|
foreach my $cui (@{$cuis2Ref}) { |
1289
|
|
|
|
|
|
|
$query .= "OR N_11.cui_2 = '$cui' "; |
1290
|
|
|
|
|
|
|
} |
1291
|
|
|
|
|
|
|
$query .= ") AND N_11.n_11 > 0;"; |
1292
|
|
|
|
|
|
|
@cuis = @{$assocDB_G->selectcol_arrayref($query)}; |
1293
|
|
|
|
|
|
|
unshift @{$cuis2Ref}, $firstCui; |
1294
|
|
|
|
|
|
|
|
1295
|
|
|
|
|
|
|
#turn CUIs into a hash of cui2's co-occurrences |
1296
|
|
|
|
|
|
|
foreach my $cui (@cuis) { |
1297
|
|
|
|
|
|
|
$cooccurrences2{$cui} = 1; |
1298
|
|
|
|
|
|
|
} |
1299
|
|
|
|
|
|
|
|
1300
|
|
|
|
|
|
|
#add additional cuis if order doesn't matter |
1301
|
|
|
|
|
|
|
if($noOrder_G) { |
1302
|
|
|
|
|
|
|
#get cuis, where concept 1 is the trailing cui |
1303
|
|
|
|
|
|
|
$firstCui = shift @{$cuis1Ref}; |
1304
|
|
|
|
|
|
|
my $query = "SELECT N_11.cui_1 FROM N_11 WHERE (N_11.cui_2 = '$firstCui' "; |
1305
|
|
|
|
|
|
|
foreach my $cui (@{$cuis1Ref}) { |
1306
|
|
|
|
|
|
|
$query .= "OR N_11.cui_2 = '$cui' "; |
1307
|
|
|
|
|
|
|
} |
1308
|
|
|
|
|
|
|
$query .= ") AND N_11.n_11 > 0;"; |
1309
|
|
|
|
|
|
|
@cuis = @{$assocDB_G->selectcol_arrayref($query)}; |
1310
|
|
|
|
|
|
|
unshift @{$cuis1Ref}, $firstCui; |
1311
|
|
|
|
|
|
|
|
1312
|
|
|
|
|
|
|
#add cuis to the hash of cui1's co-occurrences |
1313
|
|
|
|
|
|
|
foreach my $cui (@cuis) { |
1314
|
|
|
|
|
|
|
$cooccurrences1{$cui} = 1; |
1315
|
|
|
|
|
|
|
} |
1316
|
|
|
|
|
|
|
|
1317
|
|
|
|
|
|
|
#get cuis, where concept 2 is the leading cui |
1318
|
|
|
|
|
|
|
$firstCui = shift @{$cuis2Ref}; |
1319
|
|
|
|
|
|
|
$query = "SELECT N_11.cui_2 FROM N_11 WHERE (N_11.cui_1 = '$firstCui' "; |
1320
|
|
|
|
|
|
|
foreach my $cui (@{$cuis2Ref}) { |
1321
|
|
|
|
|
|
|
$query .= "OR N_11.cui_1 = '$cui' "; |
1322
|
|
|
|
|
|
|
} |
1323
|
|
|
|
|
|
|
$query .= ") AND N_11.n_11 > 0;"; |
1324
|
|
|
|
|
|
|
@cuis = @{$assocDB_G->selectcol_arrayref($query)}; |
1325
|
|
|
|
|
|
|
unshift @{$cuis2Ref}, $firstCui; |
1326
|
|
|
|
|
|
|
|
1327
|
|
|
|
|
|
|
#add cuis to the hash of cui2's co-occurrences |
1328
|
|
|
|
|
|
|
foreach my $cui (@cuis) { |
1329
|
|
|
|
|
|
|
$cooccurrences2{$cui} = 1; |
1330
|
|
|
|
|
|
|
} |
1331
|
|
|
|
|
|
|
} |
1332
|
|
|
|
|
|
|
|
1333
|
|
|
|
|
|
|
#return the cui co-occurrences |
1334
|
|
|
|
|
|
|
return (\%cooccurrences1, \%cooccurrences2); |
1335
|
|
|
|
|
|
|
} |
1336
|
|
|
|
|
|
|
=cut |
1337
|
|
|
|
|
|
|
|
1338
|
|
|
|
|
|
|
1; |
1339
|
|
|
|
|
|
|
|
1340
|
|
|
|
|
|
|
__END__ |