line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
#UMLS::Association |
2
|
|
|
|
|
|
|
# |
3
|
|
|
|
|
|
|
# Perl module for scoring the semantic association of terms in the Unified |
4
|
|
|
|
|
|
|
# Medical Language System (UMLS). |
5
|
|
|
|
|
|
|
# |
6
|
|
|
|
|
|
|
# Copyright (c) 2015 |
7
|
|
|
|
|
|
|
# |
8
|
|
|
|
|
|
|
# Sam Henry, Virginia Commonwealth University |
9
|
|
|
|
|
|
|
# henryst at vcu.edu |
10
|
|
|
|
|
|
|
# |
11
|
|
|
|
|
|
|
# Bridget McInnes, Virginia Commonwealth University |
12
|
|
|
|
|
|
|
# btmcinnes at vcu.edu |
13
|
|
|
|
|
|
|
# |
14
|
|
|
|
|
|
|
# Keith Herbert, Virginia Commonwealth University |
15
|
|
|
|
|
|
|
# herbertkb at vcu.edu |
16
|
|
|
|
|
|
|
# |
17
|
|
|
|
|
|
|
# Alexander D. McQuilkin, Virginia Commonwealth University |
18
|
|
|
|
|
|
|
# alexmcq99 at yahoo.com |
19
|
|
|
|
|
|
|
# |
20
|
|
|
|
|
|
|
# This program is free software; you can redistribute it and/or |
21
|
|
|
|
|
|
|
# modify it under the terms of the GNU General Public License |
22
|
|
|
|
|
|
|
# as published by the Free Software Foundation; either version 2 |
23
|
|
|
|
|
|
|
# of the License, or (at your option) any later version. |
24
|
|
|
|
|
|
|
# |
25
|
|
|
|
|
|
|
# This program is distributed in the hope that it will be useful, |
26
|
|
|
|
|
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of |
27
|
|
|
|
|
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
28
|
|
|
|
|
|
|
# GNU General Public License for more details. |
29
|
|
|
|
|
|
|
# |
30
|
|
|
|
|
|
|
# You should have received a copy of the GNU General Public License |
31
|
|
|
|
|
|
|
# along with this program; if not, write to |
32
|
|
|
|
|
|
|
# |
33
|
|
|
|
|
|
|
# The Free Software Foundation, Inc., |
34
|
|
|
|
|
|
|
# 59 Temple Place - Suite 330, |
35
|
|
|
|
|
|
|
# Boston, MA 02111-1307, USA. |
36
|
|
|
|
|
|
|
package UMLS::Association::StatFinder; |
37
|
|
|
|
|
|
|
|
38
|
1
|
|
|
1
|
|
4
|
use Fcntl; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
120
|
|
39
|
1
|
|
|
1
|
|
4
|
use strict; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
13
|
|
40
|
1
|
|
|
1
|
|
3
|
use warnings; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
22
|
|
41
|
1
|
|
|
1
|
|
3
|
use bytes; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
8
|
|
42
|
1
|
|
|
1
|
|
15
|
use File::Spec; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
25
|
|
43
|
|
|
|
|
|
|
|
44
|
1
|
|
|
1
|
|
299
|
use UMLS::Association::Measures::Direct; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
19
|
|
45
|
1
|
|
|
1
|
|
280
|
use UMLS::Association::Measures::LTA; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
18
|
|
46
|
1
|
|
|
1
|
|
308
|
use UMLS::Association::Measures::MWA; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
18
|
|
47
|
1
|
|
|
1
|
|
283
|
use UMLS::Association::Measures::SBC; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
17
|
|
48
|
1
|
|
|
1
|
|
272
|
use UMLS::Association::Measures::LSA; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
17
|
|
49
|
1
|
|
|
1
|
|
292
|
use UMLS::Association::Measures::WSA; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
1237
|
|
50
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
# error handling variables |
52
|
|
|
|
|
|
|
my $errorhandler = ""; |
53
|
|
|
|
|
|
|
my $pkg = "UMLS::Association::StatFinder"; |
54
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
#NOTE: every global variable is followed by a _G with the |
56
|
|
|
|
|
|
|
# exception of debug error handler, and constants which are all caps |
57
|
|
|
|
|
|
|
# global variables |
58
|
|
|
|
|
|
|
my $debug = 0; #in debug mode or not |
59
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
#global options variables |
61
|
|
|
|
|
|
|
my $lta_G = 0; #1 or 0 is using lta or not |
62
|
|
|
|
|
|
|
my $mwa_G = 0; #1 or 0 if using mwa or not |
63
|
|
|
|
|
|
|
my $lsa_G = 0; #1 or 0 if using lsa or not |
64
|
|
|
|
|
|
|
my $sbc_G = 0; #1 or 0 if using sbc or not |
65
|
|
|
|
|
|
|
my $wsa_G = 0; #1 or 0 if using wsa or not |
66
|
|
|
|
|
|
|
my $noOrder_G = 0; #1 or 0 if noOrder is enabled or not |
67
|
|
|
|
|
|
|
my $matrix_G = 0; #matrix file name is using a matrix file rather than DB |
68
|
|
|
|
|
|
|
my $params_G; #stores all params (if needed) |
69
|
|
|
|
|
|
|
|
70
|
|
|
|
|
|
|
###################################################################### |
71
|
|
|
|
|
|
|
# Initialization Functions |
72
|
|
|
|
|
|
|
###################################################################### |
73
|
|
|
|
|
|
|
# method to create a new UMLS::Association::StatFinder object |
74
|
|
|
|
|
|
|
# input : $params <- reference to hash of database parameters |
75
|
|
|
|
|
|
|
# output: $self |
76
|
|
|
|
|
|
|
sub new { |
77
|
|
|
|
|
|
|
#grab params and create self |
78
|
24
|
|
|
24
|
0
|
24
|
my $self = {}; |
79
|
24
|
|
|
|
|
24
|
my $className = shift; |
80
|
24
|
|
|
|
|
23
|
my $params = shift; |
81
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
#bless the object. |
83
|
24
|
|
|
|
|
40
|
bless($self, $className); |
84
|
|
|
|
|
|
|
|
85
|
|
|
|
|
|
|
#initialize error handler |
86
|
24
|
|
|
|
|
33
|
$errorhandler = UMLS::Association::ErrorHandler->new(); |
87
|
24
|
50
|
|
|
|
34
|
if(! defined $errorhandler) { |
88
|
0
|
|
|
|
|
0
|
print STDERR "The error handler did not get passed properly.\n"; |
89
|
0
|
|
|
|
|
0
|
exit; |
90
|
|
|
|
|
|
|
} |
91
|
|
|
|
|
|
|
|
92
|
|
|
|
|
|
|
# initialize the object. |
93
|
24
|
|
|
|
|
24
|
$debug = 0; |
94
|
24
|
|
|
|
|
39
|
$self->_initialize($params); |
95
|
24
|
|
|
|
|
33
|
return $self; |
96
|
|
|
|
|
|
|
} |
97
|
|
|
|
|
|
|
|
98
|
|
|
|
|
|
|
# method to initialize the UMLS::Association::StatFinder object. |
99
|
|
|
|
|
|
|
# input : $parameters <- reference to a hash of database parameters |
100
|
|
|
|
|
|
|
# output: none, but $self is initialized |
101
|
|
|
|
|
|
|
sub _initialize { |
102
|
|
|
|
|
|
|
#grab parameters |
103
|
24
|
|
|
24
|
|
34
|
my $self = shift; |
104
|
24
|
|
|
|
|
20
|
my $paramsRef = shift; |
105
|
24
|
|
|
|
|
31
|
my %params = %{$paramsRef}; |
|
24
|
|
|
|
|
81
|
|
106
|
|
|
|
|
|
|
|
107
|
|
|
|
|
|
|
#set global variables using option hash |
108
|
24
|
|
|
|
|
39
|
$lta_G = $params{'lta'}; |
109
|
24
|
|
|
|
|
20
|
$mwa_G = $params{'mwa'}; |
110
|
24
|
|
|
|
|
19
|
$lsa_G = $params{'lsa'}; |
111
|
24
|
|
|
|
|
21
|
$sbc_G = $params{'sbc'}; |
112
|
24
|
|
|
|
|
23
|
$wsa_G = $params{'wsa'}; |
113
|
24
|
|
|
|
|
22
|
$noOrder_G = $params{'noorder'}; |
114
|
24
|
|
|
|
|
22
|
$matrix_G = $params{'matrix'}; |
115
|
24
|
|
|
|
|
22
|
$params_G = $paramsRef; |
116
|
|
|
|
|
|
|
|
117
|
|
|
|
|
|
|
#error checking |
118
|
24
|
|
|
|
|
21
|
my $function = "_initialize"; |
119
|
24
|
|
|
|
|
34
|
&_debug($function); |
120
|
24
|
50
|
33
|
|
|
54
|
if(!defined $self || !ref $self) { |
121
|
0
|
|
|
|
|
0
|
$errorhandler->_error($pkg, $function, "", 2); |
122
|
|
|
|
|
|
|
} |
123
|
24
|
50
|
|
|
|
46
|
if (!defined $matrix_G) { |
124
|
0
|
|
|
|
|
0
|
die ("ERROR: co-occurrence matrix must be defined\n"); |
125
|
|
|
|
|
|
|
} |
126
|
|
|
|
|
|
|
} |
127
|
|
|
|
|
|
|
|
128
|
|
|
|
|
|
|
sub _debug { |
129
|
24
|
|
|
24
|
|
21
|
my $function = shift; |
130
|
24
|
50
|
|
|
|
31
|
if($debug) { print STDERR "In UMLS::Association::StatFinder::$function\n"; } |
|
0
|
|
|
|
|
0
|
|
131
|
|
|
|
|
|
|
} |
132
|
|
|
|
|
|
|
|
133
|
|
|
|
|
|
|
###################################################################### |
134
|
|
|
|
|
|
|
# public interface to get observed counts |
135
|
|
|
|
|
|
|
###################################################################### |
136
|
|
|
|
|
|
|
|
137
|
|
|
|
|
|
|
# Gets observed counts (n11, n1p, np1, npp) of the cui sets |
138
|
|
|
|
|
|
|
# input: $pairHashListRef - a ref to an array of pairHashes |
139
|
|
|
|
|
|
|
# output: \@allStatsRef - a ref to an array of observed counts 4-tuples |
140
|
|
|
|
|
|
|
# each 4-tuple consists of in order: |
141
|
|
|
|
|
|
|
# $n11, $n1p, $np1, and $npp |
142
|
|
|
|
|
|
|
# and they correspond to the observed counts of |
143
|
|
|
|
|
|
|
# each of the pairHashes passed in |
144
|
|
|
|
|
|
|
sub getObservedCounts { |
145
|
|
|
|
|
|
|
#grab parameters |
146
|
24
|
|
|
24
|
0
|
24
|
my $self = shift; |
147
|
24
|
|
|
|
|
19
|
my $pairHashListRef = shift; |
148
|
|
|
|
|
|
|
|
149
|
|
|
|
|
|
|
#error checking |
150
|
24
|
|
|
|
|
22
|
my $function = "getObservedCounts"; |
151
|
24
|
50
|
33
|
|
|
53
|
if(!defined $self || !ref $self) { |
152
|
0
|
|
|
|
|
0
|
$errorhandler->_error($pkg, $function, "", 2); |
153
|
|
|
|
|
|
|
} |
154
|
|
|
|
|
|
|
|
155
|
|
|
|
|
|
|
#calculate n11, n1p, np1, npp |
156
|
24
|
|
|
|
|
23
|
my $allStatsRef = -1; |
157
|
24
|
100
|
|
|
|
41
|
if ($lta_G) { |
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
158
|
4
|
|
|
|
|
5
|
$allStatsRef = &UMLS::Association::Measures::LTA::getStats($pairHashListRef, $matrix_G, $noOrder_G); |
159
|
|
|
|
|
|
|
} |
160
|
|
|
|
|
|
|
elsif ($mwa_G) { |
161
|
4
|
|
|
|
|
6
|
$allStatsRef = &UMLS::Association::Measures::MWA::getStats($pairHashListRef, $matrix_G, $noOrder_G); |
162
|
|
|
|
|
|
|
} |
163
|
|
|
|
|
|
|
elsif ($lsa_G) { |
164
|
5
|
|
|
|
|
8
|
$allStatsRef = &UMLS::Association::Measures::LSA::getStats($pairHashListRef, $matrix_G, $noOrder_G); |
165
|
|
|
|
|
|
|
} |
166
|
|
|
|
|
|
|
elsif ($sbc_G) { |
167
|
4
|
|
|
|
|
8
|
$allStatsRef = &UMLS::Association::Measures::SBC::getStats($pairHashListRef, $matrix_G, $noOrder_G); |
168
|
|
|
|
|
|
|
} |
169
|
|
|
|
|
|
|
elsif ($wsa_G) { |
170
|
2
|
|
|
|
|
3
|
$allStatsRef = &UMLS::Association::Measures::WSA::getStats($pairHashListRef, $matrix_G, $noOrder_G, $params_G); |
171
|
|
|
|
|
|
|
} |
172
|
|
|
|
|
|
|
else { |
173
|
5
|
|
|
|
|
8
|
$allStatsRef = &UMLS::Association::Measures::Direct::getStats($pairHashListRef, $matrix_G, $noOrder_G); |
174
|
|
|
|
|
|
|
} |
175
|
|
|
|
|
|
|
|
176
|
|
|
|
|
|
|
#return a reference to a list of stats for each pairHash |
177
|
24
|
|
|
|
|
45
|
return $allStatsRef; |
178
|
|
|
|
|
|
|
} |
179
|
|
|
|
|
|
|
|
180
|
|
|
|
|
|
|
####NOTE: fine for direct, LTA, and MWA |
181
|
|
|
|
|
|
|
#NOT for SBC and LSA |
182
|
|
|
|
|
|
|
#Reads in the matrix values that are needed |
183
|
|
|
|
|
|
|
# anything not in the pairHashListRef is counted |
184
|
|
|
|
|
|
|
# towards a universal source and/or universal sink node |
185
|
|
|
|
|
|
|
# Input: $pairHashListRef - reference to the pairHashList |
186
|
|
|
|
|
|
|
# $matrixFileName - fileName of the matrix |
187
|
|
|
|
|
|
|
# Output: \%matrix - matrix as read in. Matrix is stored |
188
|
|
|
|
|
|
|
# as a hash of hashes, where hash{node} |
189
|
|
|
|
|
|
|
# contains all a hash of all outgoing edges |
190
|
|
|
|
|
|
|
# from that node, such that: |
191
|
|
|
|
|
|
|
# ${$hash{$source}}{$target} = $weight |
192
|
|
|
|
|
|
|
sub readInMatrix { |
193
|
18
|
|
|
18
|
0
|
15
|
my $pairHashListRef = shift; |
194
|
18
|
|
|
|
|
27
|
my $matrixFileName = shift; |
195
|
|
|
|
|
|
|
|
196
|
|
|
|
|
|
|
#convert the pairhash list to a list of leading and |
197
|
|
|
|
|
|
|
# trailing cuis |
198
|
18
|
|
|
|
|
17
|
my %leadingCuis = (); |
199
|
18
|
|
|
|
|
18
|
my %trailingCuis = (); |
200
|
18
|
|
|
|
|
18
|
my %allCuis = (); |
201
|
18
|
|
|
|
|
16
|
foreach my $pairHashRef(@{$pairHashListRef}) { |
|
18
|
|
|
|
|
43
|
|
202
|
25
|
|
|
|
|
22
|
foreach my $cui(@{${$pairHashRef}{'set1'}}) { |
|
25
|
|
|
|
|
49
|
|
|
25
|
|
|
|
|
33
|
|
203
|
52
|
|
|
|
|
49
|
$leadingCuis{$cui} = 1; |
204
|
52
|
|
|
|
|
56
|
$allCuis{$cui} = 1; |
205
|
|
|
|
|
|
|
} |
206
|
25
|
|
|
|
|
23
|
foreach my $cui(@{${$pairHashRef}{'set2'}}) { |
|
25
|
|
|
|
|
21
|
|
|
25
|
|
|
|
|
25
|
|
207
|
48
|
|
|
|
|
47
|
$trailingCuis{$cui} = 1; |
208
|
48
|
|
|
|
|
51
|
$allCuis{$cui} = 1; |
209
|
|
|
|
|
|
|
} |
210
|
|
|
|
|
|
|
} |
211
|
|
|
|
|
|
|
|
212
|
|
|
|
|
|
|
#Initalize the matrix, and ensure there is a source node |
213
|
18
|
|
|
|
|
17
|
my %matrix = (); |
214
|
18
|
|
|
|
|
18
|
my %sourceHash = (); |
215
|
18
|
|
|
|
|
22
|
$matrix{'source'} = \%sourceHash; |
216
|
18
|
|
|
|
|
19
|
${$matrix{'source'}}{'sink'} = 0; |
|
18
|
|
|
|
|
22
|
|
217
|
|
|
|
|
|
|
|
218
|
|
|
|
|
|
|
#also get a count of vocabulary when reading |
219
|
18
|
|
|
|
|
29
|
my %vocabulary = (); |
220
|
|
|
|
|
|
|
|
221
|
|
|
|
|
|
|
#### Read in matrix, longest execution time, so keep this fast |
222
|
|
|
|
|
|
|
#read in all matrix values associated with all |
223
|
|
|
|
|
|
|
# cuis needed. For co-occurrences outside of the |
224
|
|
|
|
|
|
|
# needed set, created a universal source and sink |
225
|
18
|
50
|
|
|
|
450
|
open IN, $matrix_G or die "Cannot open $matrix_G for input: $!\n"; |
226
|
18
|
|
|
|
|
199
|
while (my $line = ) { |
227
|
|
|
|
|
|
|
#get cuis and value from the line |
228
|
179
|
|
|
|
|
192
|
chomp $line; |
229
|
179
|
|
|
|
|
346
|
my ($cui1, $cui2, $num) = split /\t/, $line; |
230
|
|
|
|
|
|
|
|
231
|
|
|
|
|
|
|
#update the vocabulary |
232
|
179
|
|
|
|
|
212
|
$vocabulary{$cui1} = 1; |
233
|
179
|
|
|
|
|
178
|
$vocabulary{$cui2} = 1; |
234
|
|
|
|
|
|
|
|
235
|
|
|
|
|
|
|
#if cui1 or cui2 are not cuis of interest, replace |
236
|
|
|
|
|
|
|
# them with universal source/sink |
237
|
179
|
100
|
100
|
|
|
275
|
if (!exists $allCuis{$cui1} && !exists $allCuis{$cui2}) { |
238
|
33
|
|
|
|
|
33
|
$cui1 = 'source'; |
239
|
33
|
|
|
|
|
27
|
$cui2 = 'sink'; |
240
|
|
|
|
|
|
|
} |
241
|
|
|
|
|
|
|
|
242
|
|
|
|
|
|
|
#record the co-occurrence as a directed weighted |
243
|
|
|
|
|
|
|
# weighted edge list ${hash{n1}}{n2}=$val; |
244
|
179
|
100
|
|
|
|
205
|
if (!exists $matrix{$cui1}) { |
245
|
103
|
|
|
|
|
96
|
my %emptyHash = (); |
246
|
103
|
|
|
|
|
130
|
$matrix{$cui1} = \%emptyHash; |
247
|
|
|
|
|
|
|
} |
248
|
179
|
|
|
|
|
157
|
${$matrix{$cui1}}{$cui2} += $num; |
|
179
|
|
|
|
|
423
|
|
249
|
|
|
|
|
|
|
} |
250
|
18
|
|
|
|
|
111
|
close IN; |
251
|
|
|
|
|
|
|
##### END reading in matrix |
252
|
|
|
|
|
|
|
|
253
|
18
|
|
|
|
|
106
|
return \%matrix, (scalar keys %vocabulary); |
254
|
|
|
|
|
|
|
} |
255
|
|
|
|
|
|
|
|
256
|
|
|
|
|
|
|
|
257
|
|
|
|
|
|
|
#gets a new pair hash list, that is the linking terms of the |
258
|
|
|
|
|
|
|
# original pair hashes |
259
|
|
|
|
|
|
|
# Input: |
260
|
|
|
|
|
|
|
# $pairHashListRef - ref to an array of pairHashes |
261
|
|
|
|
|
|
|
# $matrixFileName - the fileName of the co-occurrence matrix |
262
|
|
|
|
|
|
|
# $noOrder - 1 if order is enforced, 0 if not |
263
|
|
|
|
|
|
|
# $recordStats - 1 if stats are to be recorded (n1p for all, np1 for all, npp) |
264
|
|
|
|
|
|
|
# $recordCounts - 1 if unique counts are to be used rather than co-occurrence |
265
|
|
|
|
|
|
|
# counts (i.e. npp = vocab size, n1p = count of co-occurring |
266
|
|
|
|
|
|
|
# terms rather than npp = all co-occurrences, n1p = sum of |
267
|
|
|
|
|
|
|
# co-occurrences for a term). |
268
|
|
|
|
|
|
|
# Output: |
269
|
|
|
|
|
|
|
# output varies whether or not stats are recorded. |
270
|
|
|
|
|
|
|
# If stats are not recorded: |
271
|
|
|
|
|
|
|
# \@newPairHashList - an array of pair hashes, where each pairHash is |
272
|
|
|
|
|
|
|
# the linking terms, B_A and B_C of the original pair hash at that |
273
|
|
|
|
|
|
|
# index (order/NoOrder is accounted for in this list) |
274
|
|
|
|
|
|
|
# If stats are recorded: |
275
|
|
|
|
|
|
|
# \%n1p - n1p{cui} = n1p for that term (order enforced |
276
|
|
|
|
|
|
|
# \%np1 - np1{cui} = np1 for that term (order enforced |
277
|
|
|
|
|
|
|
# $npp - npp for this dataset (term counts or co-occurrence counts) |
278
|
|
|
|
|
|
|
# \%matrix - the co-occurrence matrix for all A, B_A, B_C, and C terms |
279
|
|
|
|
|
|
|
# this is essentially all possibly N11's (order enforced) |
280
|
|
|
|
|
|
|
# \@newPairHashList - same as above |
281
|
|
|
|
|
|
|
#NOTE: LSA and SBC do not need anything but newPairHashList |
282
|
|
|
|
|
|
|
# LTA needs npp (with counts), newPairHashList |
283
|
|
|
|
|
|
|
# MWA needs n1p, np1, npp, matrix, newPairHashList |
284
|
|
|
|
|
|
|
# ...I kept everything in this method because calculating the newPairHashList |
285
|
|
|
|
|
|
|
# of linking terms is fairly complicated and I didn't want to replicate code |
286
|
|
|
|
|
|
|
# to make bug fixes/updates easier. For minor speed iprovements, and |
287
|
|
|
|
|
|
|
# understandability, we could create 3 seperate methods, one for each of the |
288
|
|
|
|
|
|
|
# required sets of info needed |
289
|
|
|
|
|
|
|
sub getLinkingTermsPairHashList { |
290
|
17
|
|
|
17
|
0
|
19
|
my $pairHashListRef = shift; |
291
|
17
|
|
|
|
|
16
|
my $matrixFileName = shift; |
292
|
17
|
|
|
|
|
15
|
my $noOrder = shift; |
293
|
17
|
|
|
|
|
17
|
my $recordStats = shift; |
294
|
17
|
|
|
|
|
16
|
my $recordCounts = shift; |
295
|
|
|
|
|
|
|
|
296
|
|
|
|
|
|
|
##################################### |
297
|
|
|
|
|
|
|
#create data structures to make line reading fast |
298
|
|
|
|
|
|
|
##################################### |
299
|
17
|
|
|
|
|
15
|
my %allCuis = (); #a hash of all cuis we need to grab hash{cui}=1 |
300
|
17
|
|
|
|
|
13
|
my %set1Cuis = (); #hash{cui} = list of pairHash indeces containing the cui in set1 |
301
|
17
|
|
|
|
|
14
|
my %set2Cuis = (); #hash{cui} = list of pairHash indeces containing the cui in set2 |
302
|
17
|
|
|
|
|
17
|
my @newPairHashList = (); #the linking term pairHashList |
303
|
17
|
|
|
|
|
16
|
for (my $i = 0; $i < scalar @{$pairHashListRef}; $i++) { |
|
42
|
|
|
|
|
65
|
|
304
|
25
|
|
|
|
|
21
|
my $pairHashRef = ${$pairHashListRef}[$i]; |
|
25
|
|
|
|
|
25
|
|
305
|
|
|
|
|
|
|
|
306
|
|
|
|
|
|
|
#initialize set1 index |
307
|
25
|
|
|
|
|
25
|
my %set1Hash = (); |
308
|
25
|
|
|
|
|
22
|
foreach my $cui(@{${$pairHashRef}{'set1'}}) { |
|
25
|
|
|
|
|
29
|
|
|
25
|
|
|
|
|
36
|
|
309
|
33
|
50
|
|
|
|
53
|
if (!exists $set1Cuis{$cui}) { |
310
|
33
|
|
|
|
|
29
|
my @emptyArray = (); |
311
|
33
|
|
|
|
|
42
|
$set1Cuis{$cui} = \@emptyArray; |
312
|
33
|
|
|
|
|
57
|
$allCuis{$cui} = 1; |
313
|
|
|
|
|
|
|
} |
314
|
33
|
|
|
|
|
32
|
push @{$set1Cuis{$cui}}, $i; |
|
33
|
|
|
|
|
44
|
|
315
|
|
|
|
|
|
|
} |
316
|
|
|
|
|
|
|
#initialize set2 index |
317
|
25
|
|
|
|
|
25
|
my %set2Hash = (); |
318
|
25
|
|
|
|
|
19
|
foreach my $cui(@{${$pairHashRef}{'set2'}}) { |
|
25
|
|
|
|
|
51
|
|
|
25
|
|
|
|
|
30
|
|
319
|
41
|
50
|
|
|
|
54
|
if (!exists $set2Cuis{$cui}) { |
320
|
41
|
|
|
|
|
37
|
my @emptyArray = (); |
321
|
41
|
|
|
|
|
44
|
$set2Cuis{$cui} = \@emptyArray; |
322
|
41
|
|
|
|
|
42
|
$allCuis{$cui}=1; |
323
|
|
|
|
|
|
|
} |
324
|
41
|
|
|
|
|
34
|
push @{$set2Cuis{$cui}}, $i; |
|
41
|
|
|
|
|
46
|
|
325
|
|
|
|
|
|
|
} |
326
|
|
|
|
|
|
|
|
327
|
|
|
|
|
|
|
#initialize the parallel linking set pairHash |
328
|
25
|
|
|
|
|
24
|
my %newPairHash = (); |
329
|
25
|
|
|
|
|
22
|
my @set1Array = (); |
330
|
25
|
|
|
|
|
51
|
$newPairHash{'set1'} = \@set1Array; |
331
|
25
|
|
|
|
|
22
|
my @set2Array = (); |
332
|
25
|
|
|
|
|
24
|
$newPairHash{'set2'} = \@set2Array; |
333
|
25
|
|
|
|
|
38
|
push @newPairHashList, \%newPairHash; |
334
|
|
|
|
|
|
|
} |
335
|
|
|
|
|
|
|
|
336
|
|
|
|
|
|
|
################################## |
337
|
|
|
|
|
|
|
##### Initialize values for recording stats while reading the matrix |
338
|
|
|
|
|
|
|
################################## |
339
|
17
|
|
|
|
|
27
|
my %n1p = (); |
340
|
17
|
|
|
|
|
15
|
my %np1 = (); |
341
|
17
|
|
|
|
|
16
|
my $npp = 0; |
342
|
17
|
|
|
|
|
14
|
my %vocab = (); |
343
|
17
|
|
|
|
|
17
|
my %matrix = (); |
344
|
|
|
|
|
|
|
# We want to make sure that n1p and np1 have a value for |
345
|
|
|
|
|
|
|
# all set1 and set2 terms, so initialize to 0 |
346
|
|
|
|
|
|
|
# ...and we want to do this here, so that it is only done once |
347
|
17
|
|
|
|
|
17
|
for (my $i = 0; $i < scalar @{$pairHashListRef}; $i++) { |
|
42
|
|
|
|
|
47
|
|
348
|
25
|
|
|
|
|
25
|
my $pairHashRef = ${$pairHashListRef}[$i]; |
|
25
|
|
|
|
|
22
|
|
349
|
25
|
|
|
|
|
32
|
foreach my $cui(@{${$pairHashRef}{'set1'}}) { |
|
25
|
|
|
|
|
19
|
|
|
25
|
|
|
|
|
26
|
|
350
|
33
|
|
|
|
|
36
|
$n1p{$cui}=0 |
351
|
|
|
|
|
|
|
} |
352
|
25
|
|
|
|
|
30
|
foreach my $cui(@{${$pairHashRef}{'set2'}}) { |
|
25
|
|
|
|
|
20
|
|
|
25
|
|
|
|
|
30
|
|
353
|
41
|
|
|
|
|
44
|
$np1{$cui}=0; |
354
|
|
|
|
|
|
|
} |
355
|
|
|
|
|
|
|
} |
356
|
|
|
|
|
|
|
|
357
|
|
|
|
|
|
|
#################################### |
358
|
|
|
|
|
|
|
#### File Read in, where most execution time is spent, keep this fast |
359
|
|
|
|
|
|
|
#################################### |
360
|
|
|
|
|
|
|
#read in the linking sets from the matrix |
361
|
17
|
50
|
|
|
|
464
|
open IN, $matrixFileName or die "Cannot open $matrixFileName for input: $!\n"; |
362
|
17
|
|
|
|
|
49
|
my @vals; |
363
|
17
|
|
|
|
|
186
|
while (my $line = ) { |
364
|
|
|
|
|
|
|
#get cuis and value from the line |
365
|
200
|
|
|
|
|
371
|
@vals = (split /\t/, $line); |
366
|
|
|
|
|
|
|
#$cui1 = $vals[0] |
367
|
|
|
|
|
|
|
#$cui2 = $vals[1] |
368
|
|
|
|
|
|
|
#$num = $vals[2] |
369
|
200
|
|
|
|
|
217
|
chomp $vals[2]; |
370
|
|
|
|
|
|
|
|
371
|
|
|
|
|
|
|
#update npp and vocab if needed |
372
|
200
|
100
|
|
|
|
211
|
if ($recordStats) { |
373
|
92
|
100
|
|
|
|
102
|
if ($recordCounts) { |
374
|
46
|
|
|
|
|
41
|
$vals[2] = 1; |
375
|
46
|
|
|
|
|
48
|
$vocab{$vals[0]} = 1; |
376
|
46
|
|
|
|
|
44
|
$vocab{$vals[1]} = 1; |
377
|
|
|
|
|
|
|
} |
378
|
92
|
|
|
|
|
90
|
$npp += $vals[2]; |
379
|
|
|
|
|
|
|
} |
380
|
|
|
|
|
|
|
|
381
|
|
|
|
|
|
|
#If either of the CUIs are in the pairHashList, then record |
382
|
|
|
|
|
|
|
# their co-occurrences, and additional stats if needed |
383
|
200
|
100
|
100
|
|
|
381
|
if (exists $allCuis{$vals[0]} || exists $allCuis{$vals[1]}) { |
384
|
|
|
|
|
|
|
#add any cui1 linking terms if needed |
385
|
164
|
100
|
|
|
|
189
|
if (exists $set1Cuis{$vals[0]}) { |
386
|
|
|
|
|
|
|
#cui1 is in one more or more set1s |
387
|
|
|
|
|
|
|
# add to the linking term (cui2) to each pairhash |
388
|
|
|
|
|
|
|
# that cui1 is in |
389
|
74
|
|
|
|
|
63
|
foreach my $index (@{$set1Cuis{$vals[0]}}) { |
|
74
|
|
|
|
|
92
|
|
390
|
74
|
|
|
|
|
68
|
push @{${$newPairHashList[$index]}{'set1'}}, $vals[1]; |
|
74
|
|
|
|
|
58
|
|
|
74
|
|
|
|
|
129
|
|
391
|
|
|
|
|
|
|
} |
392
|
|
|
|
|
|
|
} |
393
|
|
|
|
|
|
|
|
394
|
|
|
|
|
|
|
#add any cui2 linking terms if needed |
395
|
164
|
100
|
|
|
|
189
|
if (exists $set2Cuis{$vals[1]}) { |
396
|
|
|
|
|
|
|
#cui2 is in one more or more set2s |
397
|
|
|
|
|
|
|
# add to the linking term (cui1) to each pairhash |
398
|
|
|
|
|
|
|
# that cui2 is in |
399
|
96
|
|
|
|
|
73
|
foreach my $index (@{$set2Cuis{$vals[1]}}) { |
|
96
|
|
|
|
|
124
|
|
400
|
96
|
|
|
|
|
69
|
push @{${$newPairHashList[$index]}{'set2'}}, $vals[0]; |
|
96
|
|
|
|
|
77
|
|
|
96
|
|
|
|
|
135
|
|
401
|
|
|
|
|
|
|
} |
402
|
|
|
|
|
|
|
} |
403
|
|
|
|
|
|
|
|
404
|
|
|
|
|
|
|
#record n1p and np1 and npp if needed |
405
|
164
|
100
|
|
|
|
180
|
if ($recordStats) { |
406
|
|
|
|
|
|
|
#n1p and np1 are recorded for MWA only |
407
|
80
|
|
|
|
|
91
|
$n1p{$vals[0]} += $vals[2]; |
408
|
80
|
|
|
|
|
79
|
$np1{$vals[1]} += $vals[2]; |
409
|
|
|
|
|
|
|
|
410
|
|
|
|
|
|
|
#The matrix must be recorded to calculate MWA |
411
|
80
|
50
|
66
|
|
|
128
|
if (exists $set1Cuis{$vals[0]} || exists $set2Cuis{$vals[1]}) { |
412
|
80
|
100
|
|
|
|
93
|
if (!defined $matrix{$vals[0]}) { |
413
|
52
|
|
|
|
|
49
|
my %emptyHash = (); |
414
|
52
|
|
|
|
|
65
|
$matrix{$vals[0]} = \%emptyHash; |
415
|
|
|
|
|
|
|
} |
416
|
80
|
|
|
|
|
68
|
${$matrix{$vals[0]}}{$vals[1]} = $vals[2]; |
|
80
|
|
|
|
|
109
|
|
417
|
|
|
|
|
|
|
} |
418
|
|
|
|
|
|
|
} |
419
|
|
|
|
|
|
|
|
420
|
|
|
|
|
|
|
#add cuis if order doesnt matter |
421
|
164
|
100
|
|
|
|
262
|
if ($noOrder) { |
422
|
|
|
|
|
|
|
#NOTE: this is the same as above, but with cui1 |
423
|
|
|
|
|
|
|
# and cui2 swapped. I didn't make this a seperate |
424
|
|
|
|
|
|
|
# method becuase method calls are slow, and |
425
|
|
|
|
|
|
|
# iterating thru lines should be fast |
426
|
84
|
|
|
|
|
77
|
my $temp = $vals[0]; |
427
|
84
|
|
|
|
|
71
|
$vals[0] = $vals[1]; |
428
|
84
|
|
|
|
|
74
|
$vals[1] = $temp; |
429
|
|
|
|
|
|
|
# ...literally copy and paste below |
430
|
|
|
|
|
|
|
|
431
|
|
|
|
|
|
|
#add any cui1 linking terms if needed |
432
|
84
|
100
|
|
|
|
95
|
if (exists $set1Cuis{$vals[0]}) { |
433
|
|
|
|
|
|
|
#cui1 is in one more or more set1s |
434
|
|
|
|
|
|
|
# add to the linking term (cui2) to each pairhash |
435
|
|
|
|
|
|
|
# that cui1 is in |
436
|
3
|
|
|
|
|
3
|
foreach my $index (@{$set1Cuis{$vals[0]}}) { |
|
3
|
|
|
|
|
5
|
|
437
|
3
|
|
|
|
|
3
|
push @{${$newPairHashList[$index]}{'set1'}}, $vals[1]; |
|
3
|
|
|
|
|
3
|
|
|
3
|
|
|
|
|
6
|
|
438
|
|
|
|
|
|
|
} |
439
|
|
|
|
|
|
|
} |
440
|
|
|
|
|
|
|
|
441
|
|
|
|
|
|
|
#add any cui2 linking terms if needed |
442
|
84
|
100
|
|
|
|
188
|
if (exists $set2Cuis{$vals[1]}) { |
443
|
|
|
|
|
|
|
#cui2 is in one more or more set2s |
444
|
|
|
|
|
|
|
# add to the linking term (cui1) to each pairhash |
445
|
|
|
|
|
|
|
# that cui2 is in |
446
|
13
|
|
|
|
|
12
|
foreach my $index (@{$set2Cuis{$vals[1]}}) { |
|
13
|
|
|
|
|
13
|
|
447
|
13
|
|
|
|
|
14
|
push @{${$newPairHashList[$index]}{'set2'}}, $vals[0]; |
|
13
|
|
|
|
|
11
|
|
|
13
|
|
|
|
|
46
|
|
448
|
|
|
|
|
|
|
} |
449
|
|
|
|
|
|
|
} |
450
|
|
|
|
|
|
|
} |
451
|
|
|
|
|
|
|
} |
452
|
|
|
|
|
|
|
} |
453
|
|
|
|
|
|
|
###### DONE reading in the file and constructing linking term sets |
454
|
|
|
|
|
|
|
|
455
|
|
|
|
|
|
|
################################# |
456
|
|
|
|
|
|
|
# Remove Duplicates |
457
|
|
|
|
|
|
|
################################## |
458
|
|
|
|
|
|
|
#due to how cuis are added to the pair hashes |
459
|
|
|
|
|
|
|
# it is possible to have duplicate cuis in each |
460
|
|
|
|
|
|
|
# remove any duplicates |
461
|
17
|
|
|
|
|
28
|
for (my $i = 0; $i < scalar @newPairHashList; $i++) { |
462
|
25
|
|
|
|
|
17
|
my $set1Ref = ${$newPairHashList[$i]}{'set1'}; |
|
25
|
|
|
|
|
34
|
|
463
|
25
|
|
|
|
|
23
|
my $set2Ref = ${$newPairHashList[$i]}{'set2'}; |
|
25
|
|
|
|
|
21
|
|
464
|
|
|
|
|
|
|
|
465
|
|
|
|
|
|
|
#remove duplicates from set1 |
466
|
25
|
|
|
|
|
27
|
my %set1 = (); |
467
|
25
|
|
|
|
|
21
|
foreach my $term (@{$set1Ref}) { |
|
25
|
|
|
|
|
29
|
|
468
|
77
|
|
|
|
|
95
|
$set1{$term} = 1; |
469
|
|
|
|
|
|
|
} |
470
|
25
|
|
|
|
|
43
|
my @set1Terms = keys %set1; |
471
|
25
|
|
|
|
|
25
|
${$newPairHashList[$i]}{'set1'} = \@set1Terms; |
|
25
|
|
|
|
|
26
|
|
472
|
|
|
|
|
|
|
|
473
|
|
|
|
|
|
|
#remove duplicates from set2 |
474
|
25
|
|
|
|
|
36
|
my %set2 = (); |
475
|
25
|
|
|
|
|
21
|
foreach my $term (@{$set2Ref}) { |
|
25
|
|
|
|
|
25
|
|
476
|
109
|
|
|
|
|
105
|
$set2{$term} = 1; |
477
|
|
|
|
|
|
|
} |
478
|
25
|
|
|
|
|
36
|
my @set2Terms = keys %set2; |
479
|
25
|
|
|
|
|
31
|
${$newPairHashList[$i]}{'set2'} = \@set2Terms; |
|
25
|
|
|
|
|
74
|
|
480
|
|
|
|
|
|
|
} |
481
|
|
|
|
|
|
|
|
482
|
|
|
|
|
|
|
################################# |
483
|
|
|
|
|
|
|
# Return values |
484
|
|
|
|
|
|
|
################################## |
485
|
|
|
|
|
|
|
#return the pair hash list of linking sets |
486
|
17
|
100
|
|
|
|
22
|
if ($recordStats) { |
487
|
8
|
100
|
|
|
|
11
|
if ($recordCounts) { |
488
|
4
|
|
|
|
|
5
|
$npp = scalar keys %vocab; |
489
|
|
|
|
|
|
|
} |
490
|
8
|
|
|
|
|
40
|
return \%n1p, \%np1, $npp, \%matrix, \@newPairHashList; |
491
|
|
|
|
|
|
|
} |
492
|
|
|
|
|
|
|
else { |
493
|
9
|
|
|
|
|
35
|
return \@newPairHashList; |
494
|
|
|
|
|
|
|
} |
495
|
|
|
|
|
|
|
} |
496
|
|
|
|
|
|
|
|
497
|
|
|
|
|
|
|
|
498
|
|
|
|
|
|
|
1; |
499
|
|
|
|
|
|
|
__END__ |