| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
#UMLS::Association |
|
2
|
|
|
|
|
|
|
# |
|
3
|
|
|
|
|
|
|
# Perl module for scoring the semantic association of terms in the Unified |
|
4
|
|
|
|
|
|
|
# Medical Language System (UMLS). |
|
5
|
|
|
|
|
|
|
# |
|
6
|
|
|
|
|
|
|
# Copyright (c) 2015 |
|
7
|
|
|
|
|
|
|
# |
|
8
|
|
|
|
|
|
|
# Sam Henry, Virginia Commonwealth University |
|
9
|
|
|
|
|
|
|
# henryst at vcu.edu |
|
10
|
|
|
|
|
|
|
# |
|
11
|
|
|
|
|
|
|
# Bridget McInnes, Virginia Commonwealth University |
|
12
|
|
|
|
|
|
|
# btmcinnes at vcu.edu |
|
13
|
|
|
|
|
|
|
# |
|
14
|
|
|
|
|
|
|
# Keith Herbert, Virginia Commonwealth University |
|
15
|
|
|
|
|
|
|
# herbertkb at vcu.edu |
|
16
|
|
|
|
|
|
|
# |
|
17
|
|
|
|
|
|
|
# Alexander D. McQuilkin, Virginia Commonwealth University |
|
18
|
|
|
|
|
|
|
# alexmcq99 at yahoo.com |
|
19
|
|
|
|
|
|
|
# |
|
20
|
|
|
|
|
|
|
# This program is free software; you can redistribute it and/or |
|
21
|
|
|
|
|
|
|
# modify it under the terms of the GNU General Public License |
|
22
|
|
|
|
|
|
|
# as published by the Free Software Foundation; either version 2 |
|
23
|
|
|
|
|
|
|
# of the License, or (at your option) any later version. |
|
24
|
|
|
|
|
|
|
# |
|
25
|
|
|
|
|
|
|
# This program is distributed in the hope that it will be useful, |
|
26
|
|
|
|
|
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
27
|
|
|
|
|
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|
28
|
|
|
|
|
|
|
# GNU General Public License for more details. |
|
29
|
|
|
|
|
|
|
# |
|
30
|
|
|
|
|
|
|
# You should have received a copy of the GNU General Public License |
|
31
|
|
|
|
|
|
|
# along with this program; if not, write to |
|
32
|
|
|
|
|
|
|
# |
|
33
|
|
|
|
|
|
|
# The Free Software Foundation, Inc., |
|
34
|
|
|
|
|
|
|
# 59 Temple Place - Suite 330, |
|
35
|
|
|
|
|
|
|
# Boston, MA 02111-1307, USA. |
|
36
|
|
|
|
|
|
|
package UMLS::Association::StatFinder; |
|
37
|
|
|
|
|
|
|
|
|
38
|
1
|
|
|
1
|
|
4
|
use Fcntl; |
|
|
1
|
|
|
|
|
1
|
|
|
|
1
|
|
|
|
|
120
|
|
|
39
|
1
|
|
|
1
|
|
4
|
use strict; |
|
|
1
|
|
|
|
|
1
|
|
|
|
1
|
|
|
|
|
13
|
|
|
40
|
1
|
|
|
1
|
|
3
|
use warnings; |
|
|
1
|
|
|
|
|
1
|
|
|
|
1
|
|
|
|
|
22
|
|
|
41
|
1
|
|
|
1
|
|
3
|
use bytes; |
|
|
1
|
|
|
|
|
1
|
|
|
|
1
|
|
|
|
|
8
|
|
|
42
|
1
|
|
|
1
|
|
15
|
use File::Spec; |
|
|
1
|
|
|
|
|
1
|
|
|
|
1
|
|
|
|
|
25
|
|
|
43
|
|
|
|
|
|
|
|
|
44
|
1
|
|
|
1
|
|
299
|
use UMLS::Association::Measures::Direct; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
19
|
|
|
45
|
1
|
|
|
1
|
|
280
|
use UMLS::Association::Measures::LTA; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
18
|
|
|
46
|
1
|
|
|
1
|
|
308
|
use UMLS::Association::Measures::MWA; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
18
|
|
|
47
|
1
|
|
|
1
|
|
283
|
use UMLS::Association::Measures::SBC; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
17
|
|
|
48
|
1
|
|
|
1
|
|
272
|
use UMLS::Association::Measures::LSA; |
|
|
1
|
|
|
|
|
1
|
|
|
|
1
|
|
|
|
|
17
|
|
|
49
|
1
|
|
|
1
|
|
292
|
use UMLS::Association::Measures::WSA; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
1237
|
|
|
50
|
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
# error handling variables |
|
52
|
|
|
|
|
|
|
my $errorhandler = ""; |
|
53
|
|
|
|
|
|
|
my $pkg = "UMLS::Association::StatFinder"; |
|
54
|
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
#NOTE: every global variable is followed by a _G with the |
|
56
|
|
|
|
|
|
|
# exception of debug error handler, and constants which are all caps |
|
57
|
|
|
|
|
|
|
# global variables |
|
58
|
|
|
|
|
|
|
my $debug = 0; #in debug mode or not |
|
59
|
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
#global options variables |
|
61
|
|
|
|
|
|
|
my $lta_G = 0; #1 or 0 is using lta or not |
|
62
|
|
|
|
|
|
|
my $mwa_G = 0; #1 or 0 if using mwa or not |
|
63
|
|
|
|
|
|
|
my $lsa_G = 0; #1 or 0 if using lsa or not |
|
64
|
|
|
|
|
|
|
my $sbc_G = 0; #1 or 0 if using sbc or not |
|
65
|
|
|
|
|
|
|
my $wsa_G = 0; #1 or 0 if using wsa or not |
|
66
|
|
|
|
|
|
|
my $noOrder_G = 0; #1 or 0 if noOrder is enabled or not |
|
67
|
|
|
|
|
|
|
my $matrix_G = 0; #matrix file name is using a matrix file rather than DB |
|
68
|
|
|
|
|
|
|
my $params_G; #stores all params (if needed) |
|
69
|
|
|
|
|
|
|
|
|
70
|
|
|
|
|
|
|
###################################################################### |
|
71
|
|
|
|
|
|
|
# Initialization Functions |
|
72
|
|
|
|
|
|
|
###################################################################### |
|
73
|
|
|
|
|
|
|
# method to create a new UMLS::Association::StatFinder object |
|
74
|
|
|
|
|
|
|
# input : $params <- reference to hash of database parameters |
|
75
|
|
|
|
|
|
|
# output: $self |
|
76
|
|
|
|
|
|
|
sub new { |
|
77
|
|
|
|
|
|
|
#grab params and create self |
|
78
|
24
|
|
|
24
|
0
|
24
|
my $self = {}; |
|
79
|
24
|
|
|
|
|
24
|
my $className = shift; |
|
80
|
24
|
|
|
|
|
23
|
my $params = shift; |
|
81
|
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
#bless the object. |
|
83
|
24
|
|
|
|
|
40
|
bless($self, $className); |
|
84
|
|
|
|
|
|
|
|
|
85
|
|
|
|
|
|
|
#initialize error handler |
|
86
|
24
|
|
|
|
|
33
|
$errorhandler = UMLS::Association::ErrorHandler->new(); |
|
87
|
24
|
50
|
|
|
|
34
|
if(! defined $errorhandler) { |
|
88
|
0
|
|
|
|
|
0
|
print STDERR "The error handler did not get passed properly.\n"; |
|
89
|
0
|
|
|
|
|
0
|
exit; |
|
90
|
|
|
|
|
|
|
} |
|
91
|
|
|
|
|
|
|
|
|
92
|
|
|
|
|
|
|
# initialize the object. |
|
93
|
24
|
|
|
|
|
24
|
$debug = 0; |
|
94
|
24
|
|
|
|
|
39
|
$self->_initialize($params); |
|
95
|
24
|
|
|
|
|
33
|
return $self; |
|
96
|
|
|
|
|
|
|
} |
|
97
|
|
|
|
|
|
|
|
|
98
|
|
|
|
|
|
|
# method to initialize the UMLS::Association::StatFinder object. |
|
99
|
|
|
|
|
|
|
# input : $parameters <- reference to a hash of database parameters |
|
100
|
|
|
|
|
|
|
# output: none, but $self is initialized |
|
101
|
|
|
|
|
|
|
sub _initialize { |
|
102
|
|
|
|
|
|
|
#grab parameters |
|
103
|
24
|
|
|
24
|
|
34
|
my $self = shift; |
|
104
|
24
|
|
|
|
|
20
|
my $paramsRef = shift; |
|
105
|
24
|
|
|
|
|
31
|
my %params = %{$paramsRef}; |
|
|
24
|
|
|
|
|
81
|
|
|
106
|
|
|
|
|
|
|
|
|
107
|
|
|
|
|
|
|
#set global variables using option hash |
|
108
|
24
|
|
|
|
|
39
|
$lta_G = $params{'lta'}; |
|
109
|
24
|
|
|
|
|
20
|
$mwa_G = $params{'mwa'}; |
|
110
|
24
|
|
|
|
|
19
|
$lsa_G = $params{'lsa'}; |
|
111
|
24
|
|
|
|
|
21
|
$sbc_G = $params{'sbc'}; |
|
112
|
24
|
|
|
|
|
23
|
$wsa_G = $params{'wsa'}; |
|
113
|
24
|
|
|
|
|
22
|
$noOrder_G = $params{'noorder'}; |
|
114
|
24
|
|
|
|
|
22
|
$matrix_G = $params{'matrix'}; |
|
115
|
24
|
|
|
|
|
22
|
$params_G = $paramsRef; |
|
116
|
|
|
|
|
|
|
|
|
117
|
|
|
|
|
|
|
#error checking |
|
118
|
24
|
|
|
|
|
21
|
my $function = "_initialize"; |
|
119
|
24
|
|
|
|
|
34
|
&_debug($function); |
|
120
|
24
|
50
|
33
|
|
|
54
|
if(!defined $self || !ref $self) { |
|
121
|
0
|
|
|
|
|
0
|
$errorhandler->_error($pkg, $function, "", 2); |
|
122
|
|
|
|
|
|
|
} |
|
123
|
24
|
50
|
|
|
|
46
|
if (!defined $matrix_G) { |
|
124
|
0
|
|
|
|
|
0
|
die ("ERROR: co-occurrence matrix must be defined\n"); |
|
125
|
|
|
|
|
|
|
} |
|
126
|
|
|
|
|
|
|
} |
|
127
|
|
|
|
|
|
|
|
|
128
|
|
|
|
|
|
|
sub _debug { |
|
129
|
24
|
|
|
24
|
|
21
|
my $function = shift; |
|
130
|
24
|
50
|
|
|
|
31
|
if($debug) { print STDERR "In UMLS::Association::StatFinder::$function\n"; } |
|
|
0
|
|
|
|
|
0
|
|
|
131
|
|
|
|
|
|
|
} |
|
132
|
|
|
|
|
|
|
|
|
133
|
|
|
|
|
|
|
###################################################################### |
|
134
|
|
|
|
|
|
|
# public interface to get observed counts |
|
135
|
|
|
|
|
|
|
###################################################################### |
|
136
|
|
|
|
|
|
|
|
|
137
|
|
|
|
|
|
|
# Gets observed counts (n11, n1p, np1, npp) of the cui sets |
|
138
|
|
|
|
|
|
|
# input: $pairHashListRef - a ref to an array of pairHashes |
|
139
|
|
|
|
|
|
|
# output: \@allStatsRef - a ref to an array of observed counts 4-tuples |
|
140
|
|
|
|
|
|
|
# each 4-tuple consists of in order: |
|
141
|
|
|
|
|
|
|
# $n11, $n1p, $np1, and $npp |
|
142
|
|
|
|
|
|
|
# and they correspond to the observed counts of |
|
143
|
|
|
|
|
|
|
# each of the pairHashes passed in |
|
144
|
|
|
|
|
|
|
sub getObservedCounts { |
|
145
|
|
|
|
|
|
|
#grab parameters |
|
146
|
24
|
|
|
24
|
0
|
24
|
my $self = shift; |
|
147
|
24
|
|
|
|
|
19
|
my $pairHashListRef = shift; |
|
148
|
|
|
|
|
|
|
|
|
149
|
|
|
|
|
|
|
#error checking |
|
150
|
24
|
|
|
|
|
22
|
my $function = "getObservedCounts"; |
|
151
|
24
|
50
|
33
|
|
|
53
|
if(!defined $self || !ref $self) { |
|
152
|
0
|
|
|
|
|
0
|
$errorhandler->_error($pkg, $function, "", 2); |
|
153
|
|
|
|
|
|
|
} |
|
154
|
|
|
|
|
|
|
|
|
155
|
|
|
|
|
|
|
#calculate n11, n1p, np1, npp |
|
156
|
24
|
|
|
|
|
23
|
my $allStatsRef = -1; |
|
157
|
24
|
100
|
|
|
|
41
|
if ($lta_G) { |
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
158
|
4
|
|
|
|
|
5
|
$allStatsRef = &UMLS::Association::Measures::LTA::getStats($pairHashListRef, $matrix_G, $noOrder_G); |
|
159
|
|
|
|
|
|
|
} |
|
160
|
|
|
|
|
|
|
elsif ($mwa_G) { |
|
161
|
4
|
|
|
|
|
6
|
$allStatsRef = &UMLS::Association::Measures::MWA::getStats($pairHashListRef, $matrix_G, $noOrder_G); |
|
162
|
|
|
|
|
|
|
} |
|
163
|
|
|
|
|
|
|
elsif ($lsa_G) { |
|
164
|
5
|
|
|
|
|
8
|
$allStatsRef = &UMLS::Association::Measures::LSA::getStats($pairHashListRef, $matrix_G, $noOrder_G); |
|
165
|
|
|
|
|
|
|
} |
|
166
|
|
|
|
|
|
|
elsif ($sbc_G) { |
|
167
|
4
|
|
|
|
|
8
|
$allStatsRef = &UMLS::Association::Measures::SBC::getStats($pairHashListRef, $matrix_G, $noOrder_G); |
|
168
|
|
|
|
|
|
|
} |
|
169
|
|
|
|
|
|
|
elsif ($wsa_G) { |
|
170
|
2
|
|
|
|
|
3
|
$allStatsRef = &UMLS::Association::Measures::WSA::getStats($pairHashListRef, $matrix_G, $noOrder_G, $params_G); |
|
171
|
|
|
|
|
|
|
} |
|
172
|
|
|
|
|
|
|
else { |
|
173
|
5
|
|
|
|
|
8
|
$allStatsRef = &UMLS::Association::Measures::Direct::getStats($pairHashListRef, $matrix_G, $noOrder_G); |
|
174
|
|
|
|
|
|
|
} |
|
175
|
|
|
|
|
|
|
|
|
176
|
|
|
|
|
|
|
#return a reference to a list of stats for each pairHash |
|
177
|
24
|
|
|
|
|
45
|
return $allStatsRef; |
|
178
|
|
|
|
|
|
|
} |
|
179
|
|
|
|
|
|
|
|
|
180
|
|
|
|
|
|
|
####NOTE: fine for direct, LTA, and MWA |
|
181
|
|
|
|
|
|
|
#NOT for SBC and LSA |
|
182
|
|
|
|
|
|
|
#Reads in the matrix values that are needed |
|
183
|
|
|
|
|
|
|
# anything not in the pairHashListRef is counted |
|
184
|
|
|
|
|
|
|
# towards a universal source and/or universal sink node |
|
185
|
|
|
|
|
|
|
# Input: $pairHashListRef - reference to the pairHashList |
|
186
|
|
|
|
|
|
|
# $matrixFileName - fileName of the matrix |
|
187
|
|
|
|
|
|
|
# Output: \%matrix - matrix as read in. Matrix is stored |
|
188
|
|
|
|
|
|
|
# as a hash of hashes, where hash{node} |
|
189
|
|
|
|
|
|
|
# contains all a hash of all outgoing edges |
|
190
|
|
|
|
|
|
|
# from that node, such that: |
|
191
|
|
|
|
|
|
|
# ${$hash{$source}}{$target} = $weight |
|
192
|
|
|
|
|
|
|
sub readInMatrix { |
|
193
|
18
|
|
|
18
|
0
|
15
|
my $pairHashListRef = shift; |
|
194
|
18
|
|
|
|
|
27
|
my $matrixFileName = shift; |
|
195
|
|
|
|
|
|
|
|
|
196
|
|
|
|
|
|
|
#convert the pairhash list to a list of leading and |
|
197
|
|
|
|
|
|
|
# trailing cuis |
|
198
|
18
|
|
|
|
|
17
|
my %leadingCuis = (); |
|
199
|
18
|
|
|
|
|
18
|
my %trailingCuis = (); |
|
200
|
18
|
|
|
|
|
18
|
my %allCuis = (); |
|
201
|
18
|
|
|
|
|
16
|
foreach my $pairHashRef(@{$pairHashListRef}) { |
|
|
18
|
|
|
|
|
43
|
|
|
202
|
25
|
|
|
|
|
22
|
foreach my $cui(@{${$pairHashRef}{'set1'}}) { |
|
|
25
|
|
|
|
|
49
|
|
|
|
25
|
|
|
|
|
33
|
|
|
203
|
52
|
|
|
|
|
49
|
$leadingCuis{$cui} = 1; |
|
204
|
52
|
|
|
|
|
56
|
$allCuis{$cui} = 1; |
|
205
|
|
|
|
|
|
|
} |
|
206
|
25
|
|
|
|
|
23
|
foreach my $cui(@{${$pairHashRef}{'set2'}}) { |
|
|
25
|
|
|
|
|
21
|
|
|
|
25
|
|
|
|
|
25
|
|
|
207
|
48
|
|
|
|
|
47
|
$trailingCuis{$cui} = 1; |
|
208
|
48
|
|
|
|
|
51
|
$allCuis{$cui} = 1; |
|
209
|
|
|
|
|
|
|
} |
|
210
|
|
|
|
|
|
|
} |
|
211
|
|
|
|
|
|
|
|
|
212
|
|
|
|
|
|
|
#Initalize the matrix, and ensure there is a source node |
|
213
|
18
|
|
|
|
|
17
|
my %matrix = (); |
|
214
|
18
|
|
|
|
|
18
|
my %sourceHash = (); |
|
215
|
18
|
|
|
|
|
22
|
$matrix{'source'} = \%sourceHash; |
|
216
|
18
|
|
|
|
|
19
|
${$matrix{'source'}}{'sink'} = 0; |
|
|
18
|
|
|
|
|
22
|
|
|
217
|
|
|
|
|
|
|
|
|
218
|
|
|
|
|
|
|
#also get a count of vocabulary when reading |
|
219
|
18
|
|
|
|
|
29
|
my %vocabulary = (); |
|
220
|
|
|
|
|
|
|
|
|
221
|
|
|
|
|
|
|
#### Read in matrix, longest execution time, so keep this fast |
|
222
|
|
|
|
|
|
|
#read in all matrix values associated with all |
|
223
|
|
|
|
|
|
|
# cuis needed. For co-occurrences outside of the |
|
224
|
|
|
|
|
|
|
# needed set, created a universal source and sink |
|
225
|
18
|
50
|
|
|
|
450
|
open IN, $matrix_G or die "Cannot open $matrix_G for input: $!\n"; |
|
226
|
18
|
|
|
|
|
199
|
while (my $line = ) { |
|
227
|
|
|
|
|
|
|
#get cuis and value from the line |
|
228
|
179
|
|
|
|
|
192
|
chomp $line; |
|
229
|
179
|
|
|
|
|
346
|
my ($cui1, $cui2, $num) = split /\t/, $line; |
|
230
|
|
|
|
|
|
|
|
|
231
|
|
|
|
|
|
|
#update the vocabulary |
|
232
|
179
|
|
|
|
|
212
|
$vocabulary{$cui1} = 1; |
|
233
|
179
|
|
|
|
|
178
|
$vocabulary{$cui2} = 1; |
|
234
|
|
|
|
|
|
|
|
|
235
|
|
|
|
|
|
|
#if cui1 or cui2 are not cuis of interest, replace |
|
236
|
|
|
|
|
|
|
# them with universal source/sink |
|
237
|
179
|
100
|
100
|
|
|
275
|
if (!exists $allCuis{$cui1} && !exists $allCuis{$cui2}) { |
|
238
|
33
|
|
|
|
|
33
|
$cui1 = 'source'; |
|
239
|
33
|
|
|
|
|
27
|
$cui2 = 'sink'; |
|
240
|
|
|
|
|
|
|
} |
|
241
|
|
|
|
|
|
|
|
|
242
|
|
|
|
|
|
|
#record the co-occurrence as a directed weighted |
|
243
|
|
|
|
|
|
|
# weighted edge list ${hash{n1}}{n2}=$val; |
|
244
|
179
|
100
|
|
|
|
205
|
if (!exists $matrix{$cui1}) { |
|
245
|
103
|
|
|
|
|
96
|
my %emptyHash = (); |
|
246
|
103
|
|
|
|
|
130
|
$matrix{$cui1} = \%emptyHash; |
|
247
|
|
|
|
|
|
|
} |
|
248
|
179
|
|
|
|
|
157
|
${$matrix{$cui1}}{$cui2} += $num; |
|
|
179
|
|
|
|
|
423
|
|
|
249
|
|
|
|
|
|
|
} |
|
250
|
18
|
|
|
|
|
111
|
close IN; |
|
251
|
|
|
|
|
|
|
##### END reading in matrix |
|
252
|
|
|
|
|
|
|
|
|
253
|
18
|
|
|
|
|
106
|
return \%matrix, (scalar keys %vocabulary); |
|
254
|
|
|
|
|
|
|
} |
|
255
|
|
|
|
|
|
|
|
|
256
|
|
|
|
|
|
|
|
|
257
|
|
|
|
|
|
|
#gets a new pair hash list, that is the linking terms of the |
|
258
|
|
|
|
|
|
|
# original pair hashes |
|
259
|
|
|
|
|
|
|
# Input: |
|
260
|
|
|
|
|
|
|
# $pairHashListRef - ref to an array of pairHashes |
|
261
|
|
|
|
|
|
|
# $matrixFileName - the fileName of the co-occurrence matrix |
|
262
|
|
|
|
|
|
|
# $noOrder - 1 if order is enforced, 0 if not |
|
263
|
|
|
|
|
|
|
# $recordStats - 1 if stats are to be recorded (n1p for all, np1 for all, npp) |
|
264
|
|
|
|
|
|
|
# $recordCounts - 1 if unique counts are to be used rather than co-occurrence |
|
265
|
|
|
|
|
|
|
# counts (i.e. npp = vocab size, n1p = count of co-occurring |
|
266
|
|
|
|
|
|
|
# terms rather than npp = all co-occurrences, n1p = sum of |
|
267
|
|
|
|
|
|
|
# co-occurrences for a term). |
|
268
|
|
|
|
|
|
|
# Output: |
|
269
|
|
|
|
|
|
|
# output varies whether or not stats are recorded. |
|
270
|
|
|
|
|
|
|
# If stats are not recorded: |
|
271
|
|
|
|
|
|
|
# \@newPairHashList - an array of pair hashes, where each pairHash is |
|
272
|
|
|
|
|
|
|
# the linking terms, B_A and B_C of the original pair hash at that |
|
273
|
|
|
|
|
|
|
# index (order/NoOrder is accounted for in this list) |
|
274
|
|
|
|
|
|
|
# If stats are recorded: |
|
275
|
|
|
|
|
|
|
# \%n1p - n1p{cui} = n1p for that term (order enforced |
|
276
|
|
|
|
|
|
|
# \%np1 - np1{cui} = np1 for that term (order enforced |
|
277
|
|
|
|
|
|
|
# $npp - npp for this dataset (term counts or co-occurrence counts) |
|
278
|
|
|
|
|
|
|
# \%matrix - the co-occurrence matrix for all A, B_A, B_C, and C terms |
|
279
|
|
|
|
|
|
|
# this is essentially all possibly N11's (order enforced) |
|
280
|
|
|
|
|
|
|
# \@newPairHashList - same as above |
|
281
|
|
|
|
|
|
|
#NOTE: LSA and SBC do not need anything but newPairHashList |
|
282
|
|
|
|
|
|
|
# LTA needs npp (with counts), newPairHashList |
|
283
|
|
|
|
|
|
|
# MWA needs n1p, np1, npp, matrix, newPairHashList |
|
284
|
|
|
|
|
|
|
# ...I kept everything in this method because calculating the newPairHashList |
|
285
|
|
|
|
|
|
|
# of linking terms is fairly complicated and I didn't want to replicate code |
|
286
|
|
|
|
|
|
|
# to make bug fixes/updates easier. For minor speed iprovements, and |
|
287
|
|
|
|
|
|
|
# understandability, we could create 3 seperate methods, one for each of the |
|
288
|
|
|
|
|
|
|
# required sets of info needed |
|
289
|
|
|
|
|
|
|
sub getLinkingTermsPairHashList { |
|
290
|
17
|
|
|
17
|
0
|
19
|
my $pairHashListRef = shift; |
|
291
|
17
|
|
|
|
|
16
|
my $matrixFileName = shift; |
|
292
|
17
|
|
|
|
|
15
|
my $noOrder = shift; |
|
293
|
17
|
|
|
|
|
17
|
my $recordStats = shift; |
|
294
|
17
|
|
|
|
|
16
|
my $recordCounts = shift; |
|
295
|
|
|
|
|
|
|
|
|
296
|
|
|
|
|
|
|
##################################### |
|
297
|
|
|
|
|
|
|
#create data structures to make line reading fast |
|
298
|
|
|
|
|
|
|
##################################### |
|
299
|
17
|
|
|
|
|
15
|
my %allCuis = (); #a hash of all cuis we need to grab hash{cui}=1 |
|
300
|
17
|
|
|
|
|
13
|
my %set1Cuis = (); #hash{cui} = list of pairHash indeces containing the cui in set1 |
|
301
|
17
|
|
|
|
|
14
|
my %set2Cuis = (); #hash{cui} = list of pairHash indeces containing the cui in set2 |
|
302
|
17
|
|
|
|
|
17
|
my @newPairHashList = (); #the linking term pairHashList |
|
303
|
17
|
|
|
|
|
16
|
for (my $i = 0; $i < scalar @{$pairHashListRef}; $i++) { |
|
|
42
|
|
|
|
|
65
|
|
|
304
|
25
|
|
|
|
|
21
|
my $pairHashRef = ${$pairHashListRef}[$i]; |
|
|
25
|
|
|
|
|
25
|
|
|
305
|
|
|
|
|
|
|
|
|
306
|
|
|
|
|
|
|
#initialize set1 index |
|
307
|
25
|
|
|
|
|
25
|
my %set1Hash = (); |
|
308
|
25
|
|
|
|
|
22
|
foreach my $cui(@{${$pairHashRef}{'set1'}}) { |
|
|
25
|
|
|
|
|
29
|
|
|
|
25
|
|
|
|
|
36
|
|
|
309
|
33
|
50
|
|
|
|
53
|
if (!exists $set1Cuis{$cui}) { |
|
310
|
33
|
|
|
|
|
29
|
my @emptyArray = (); |
|
311
|
33
|
|
|
|
|
42
|
$set1Cuis{$cui} = \@emptyArray; |
|
312
|
33
|
|
|
|
|
57
|
$allCuis{$cui} = 1; |
|
313
|
|
|
|
|
|
|
} |
|
314
|
33
|
|
|
|
|
32
|
push @{$set1Cuis{$cui}}, $i; |
|
|
33
|
|
|
|
|
44
|
|
|
315
|
|
|
|
|
|
|
} |
|
316
|
|
|
|
|
|
|
#initialize set2 index |
|
317
|
25
|
|
|
|
|
25
|
my %set2Hash = (); |
|
318
|
25
|
|
|
|
|
19
|
foreach my $cui(@{${$pairHashRef}{'set2'}}) { |
|
|
25
|
|
|
|
|
51
|
|
|
|
25
|
|
|
|
|
30
|
|
|
319
|
41
|
50
|
|
|
|
54
|
if (!exists $set2Cuis{$cui}) { |
|
320
|
41
|
|
|
|
|
37
|
my @emptyArray = (); |
|
321
|
41
|
|
|
|
|
44
|
$set2Cuis{$cui} = \@emptyArray; |
|
322
|
41
|
|
|
|
|
42
|
$allCuis{$cui}=1; |
|
323
|
|
|
|
|
|
|
} |
|
324
|
41
|
|
|
|
|
34
|
push @{$set2Cuis{$cui}}, $i; |
|
|
41
|
|
|
|
|
46
|
|
|
325
|
|
|
|
|
|
|
} |
|
326
|
|
|
|
|
|
|
|
|
327
|
|
|
|
|
|
|
#initialize the parallel linking set pairHash |
|
328
|
25
|
|
|
|
|
24
|
my %newPairHash = (); |
|
329
|
25
|
|
|
|
|
22
|
my @set1Array = (); |
|
330
|
25
|
|
|
|
|
51
|
$newPairHash{'set1'} = \@set1Array; |
|
331
|
25
|
|
|
|
|
22
|
my @set2Array = (); |
|
332
|
25
|
|
|
|
|
24
|
$newPairHash{'set2'} = \@set2Array; |
|
333
|
25
|
|
|
|
|
38
|
push @newPairHashList, \%newPairHash; |
|
334
|
|
|
|
|
|
|
} |
|
335
|
|
|
|
|
|
|
|
|
336
|
|
|
|
|
|
|
################################## |
|
337
|
|
|
|
|
|
|
##### Initialize values for recording stats while reading the matrix |
|
338
|
|
|
|
|
|
|
################################## |
|
339
|
17
|
|
|
|
|
27
|
my %n1p = (); |
|
340
|
17
|
|
|
|
|
15
|
my %np1 = (); |
|
341
|
17
|
|
|
|
|
16
|
my $npp = 0; |
|
342
|
17
|
|
|
|
|
14
|
my %vocab = (); |
|
343
|
17
|
|
|
|
|
17
|
my %matrix = (); |
|
344
|
|
|
|
|
|
|
# We want to make sure that n1p and np1 have a value for |
|
345
|
|
|
|
|
|
|
# all set1 and set2 terms, so initialize to 0 |
|
346
|
|
|
|
|
|
|
# ...and we want to do this here, so that it is only done once |
|
347
|
17
|
|
|
|
|
17
|
for (my $i = 0; $i < scalar @{$pairHashListRef}; $i++) { |
|
|
42
|
|
|
|
|
47
|
|
|
348
|
25
|
|
|
|
|
25
|
my $pairHashRef = ${$pairHashListRef}[$i]; |
|
|
25
|
|
|
|
|
22
|
|
|
349
|
25
|
|
|
|
|
32
|
foreach my $cui(@{${$pairHashRef}{'set1'}}) { |
|
|
25
|
|
|
|
|
19
|
|
|
|
25
|
|
|
|
|
26
|
|
|
350
|
33
|
|
|
|
|
36
|
$n1p{$cui}=0 |
|
351
|
|
|
|
|
|
|
} |
|
352
|
25
|
|
|
|
|
30
|
foreach my $cui(@{${$pairHashRef}{'set2'}}) { |
|
|
25
|
|
|
|
|
20
|
|
|
|
25
|
|
|
|
|
30
|
|
|
353
|
41
|
|
|
|
|
44
|
$np1{$cui}=0; |
|
354
|
|
|
|
|
|
|
} |
|
355
|
|
|
|
|
|
|
} |
|
356
|
|
|
|
|
|
|
|
|
357
|
|
|
|
|
|
|
#################################### |
|
358
|
|
|
|
|
|
|
#### File Read in, where most execution time is spent, keep this fast |
|
359
|
|
|
|
|
|
|
#################################### |
|
360
|
|
|
|
|
|
|
#read in the linking sets from the matrix |
|
361
|
17
|
50
|
|
|
|
464
|
open IN, $matrixFileName or die "Cannot open $matrixFileName for input: $!\n"; |
|
362
|
17
|
|
|
|
|
49
|
my @vals; |
|
363
|
17
|
|
|
|
|
186
|
while (my $line = ) { |
|
364
|
|
|
|
|
|
|
#get cuis and value from the line |
|
365
|
200
|
|
|
|
|
371
|
@vals = (split /\t/, $line); |
|
366
|
|
|
|
|
|
|
#$cui1 = $vals[0] |
|
367
|
|
|
|
|
|
|
#$cui2 = $vals[1] |
|
368
|
|
|
|
|
|
|
#$num = $vals[2] |
|
369
|
200
|
|
|
|
|
217
|
chomp $vals[2]; |
|
370
|
|
|
|
|
|
|
|
|
371
|
|
|
|
|
|
|
#update npp and vocab if needed |
|
372
|
200
|
100
|
|
|
|
211
|
if ($recordStats) { |
|
373
|
92
|
100
|
|
|
|
102
|
if ($recordCounts) { |
|
374
|
46
|
|
|
|
|
41
|
$vals[2] = 1; |
|
375
|
46
|
|
|
|
|
48
|
$vocab{$vals[0]} = 1; |
|
376
|
46
|
|
|
|
|
44
|
$vocab{$vals[1]} = 1; |
|
377
|
|
|
|
|
|
|
} |
|
378
|
92
|
|
|
|
|
90
|
$npp += $vals[2]; |
|
379
|
|
|
|
|
|
|
} |
|
380
|
|
|
|
|
|
|
|
|
381
|
|
|
|
|
|
|
#If either of the CUIs are in the pairHashList, then record |
|
382
|
|
|
|
|
|
|
# their co-occurrences, and additional stats if needed |
|
383
|
200
|
100
|
100
|
|
|
381
|
if (exists $allCuis{$vals[0]} || exists $allCuis{$vals[1]}) { |
|
384
|
|
|
|
|
|
|
#add any cui1 linking terms if needed |
|
385
|
164
|
100
|
|
|
|
189
|
if (exists $set1Cuis{$vals[0]}) { |
|
386
|
|
|
|
|
|
|
#cui1 is in one more or more set1s |
|
387
|
|
|
|
|
|
|
# add to the linking term (cui2) to each pairhash |
|
388
|
|
|
|
|
|
|
# that cui1 is in |
|
389
|
74
|
|
|
|
|
63
|
foreach my $index (@{$set1Cuis{$vals[0]}}) { |
|
|
74
|
|
|
|
|
92
|
|
|
390
|
74
|
|
|
|
|
68
|
push @{${$newPairHashList[$index]}{'set1'}}, $vals[1]; |
|
|
74
|
|
|
|
|
58
|
|
|
|
74
|
|
|
|
|
129
|
|
|
391
|
|
|
|
|
|
|
} |
|
392
|
|
|
|
|
|
|
} |
|
393
|
|
|
|
|
|
|
|
|
394
|
|
|
|
|
|
|
#add any cui2 linking terms if needed |
|
395
|
164
|
100
|
|
|
|
189
|
if (exists $set2Cuis{$vals[1]}) { |
|
396
|
|
|
|
|
|
|
#cui2 is in one more or more set2s |
|
397
|
|
|
|
|
|
|
# add to the linking term (cui1) to each pairhash |
|
398
|
|
|
|
|
|
|
# that cui2 is in |
|
399
|
96
|
|
|
|
|
73
|
foreach my $index (@{$set2Cuis{$vals[1]}}) { |
|
|
96
|
|
|
|
|
124
|
|
|
400
|
96
|
|
|
|
|
69
|
push @{${$newPairHashList[$index]}{'set2'}}, $vals[0]; |
|
|
96
|
|
|
|
|
77
|
|
|
|
96
|
|
|
|
|
135
|
|
|
401
|
|
|
|
|
|
|
} |
|
402
|
|
|
|
|
|
|
} |
|
403
|
|
|
|
|
|
|
|
|
404
|
|
|
|
|
|
|
#record n1p and np1 and npp if needed |
|
405
|
164
|
100
|
|
|
|
180
|
if ($recordStats) { |
|
406
|
|
|
|
|
|
|
#n1p and np1 are recorded for MWA only |
|
407
|
80
|
|
|
|
|
91
|
$n1p{$vals[0]} += $vals[2]; |
|
408
|
80
|
|
|
|
|
79
|
$np1{$vals[1]} += $vals[2]; |
|
409
|
|
|
|
|
|
|
|
|
410
|
|
|
|
|
|
|
#The matrix must be recorded to calculate MWA |
|
411
|
80
|
50
|
66
|
|
|
128
|
if (exists $set1Cuis{$vals[0]} || exists $set2Cuis{$vals[1]}) { |
|
412
|
80
|
100
|
|
|
|
93
|
if (!defined $matrix{$vals[0]}) { |
|
413
|
52
|
|
|
|
|
49
|
my %emptyHash = (); |
|
414
|
52
|
|
|
|
|
65
|
$matrix{$vals[0]} = \%emptyHash; |
|
415
|
|
|
|
|
|
|
} |
|
416
|
80
|
|
|
|
|
68
|
${$matrix{$vals[0]}}{$vals[1]} = $vals[2]; |
|
|
80
|
|
|
|
|
109
|
|
|
417
|
|
|
|
|
|
|
} |
|
418
|
|
|
|
|
|
|
} |
|
419
|
|
|
|
|
|
|
|
|
420
|
|
|
|
|
|
|
#add cuis if order doesnt matter |
|
421
|
164
|
100
|
|
|
|
262
|
if ($noOrder) { |
|
422
|
|
|
|
|
|
|
#NOTE: this is the same as above, but with cui1 |
|
423
|
|
|
|
|
|
|
# and cui2 swapped. I didn't make this a seperate |
|
424
|
|
|
|
|
|
|
# method becuase method calls are slow, and |
|
425
|
|
|
|
|
|
|
# iterating thru lines should be fast |
|
426
|
84
|
|
|
|
|
77
|
my $temp = $vals[0]; |
|
427
|
84
|
|
|
|
|
71
|
$vals[0] = $vals[1]; |
|
428
|
84
|
|
|
|
|
74
|
$vals[1] = $temp; |
|
429
|
|
|
|
|
|
|
# ...literally copy and paste below |
|
430
|
|
|
|
|
|
|
|
|
431
|
|
|
|
|
|
|
#add any cui1 linking terms if needed |
|
432
|
84
|
100
|
|
|
|
95
|
if (exists $set1Cuis{$vals[0]}) { |
|
433
|
|
|
|
|
|
|
#cui1 is in one more or more set1s |
|
434
|
|
|
|
|
|
|
# add to the linking term (cui2) to each pairhash |
|
435
|
|
|
|
|
|
|
# that cui1 is in |
|
436
|
3
|
|
|
|
|
3
|
foreach my $index (@{$set1Cuis{$vals[0]}}) { |
|
|
3
|
|
|
|
|
5
|
|
|
437
|
3
|
|
|
|
|
3
|
push @{${$newPairHashList[$index]}{'set1'}}, $vals[1]; |
|
|
3
|
|
|
|
|
3
|
|
|
|
3
|
|
|
|
|
6
|
|
|
438
|
|
|
|
|
|
|
} |
|
439
|
|
|
|
|
|
|
} |
|
440
|
|
|
|
|
|
|
|
|
441
|
|
|
|
|
|
|
#add any cui2 linking terms if needed |
|
442
|
84
|
100
|
|
|
|
188
|
if (exists $set2Cuis{$vals[1]}) { |
|
443
|
|
|
|
|
|
|
#cui2 is in one more or more set2s |
|
444
|
|
|
|
|
|
|
# add to the linking term (cui1) to each pairhash |
|
445
|
|
|
|
|
|
|
# that cui2 is in |
|
446
|
13
|
|
|
|
|
12
|
foreach my $index (@{$set2Cuis{$vals[1]}}) { |
|
|
13
|
|
|
|
|
13
|
|
|
447
|
13
|
|
|
|
|
14
|
push @{${$newPairHashList[$index]}{'set2'}}, $vals[0]; |
|
|
13
|
|
|
|
|
11
|
|
|
|
13
|
|
|
|
|
46
|
|
|
448
|
|
|
|
|
|
|
} |
|
449
|
|
|
|
|
|
|
} |
|
450
|
|
|
|
|
|
|
} |
|
451
|
|
|
|
|
|
|
} |
|
452
|
|
|
|
|
|
|
} |
|
453
|
|
|
|
|
|
|
###### DONE reading in the file and constructing linking term sets |
|
454
|
|
|
|
|
|
|
|
|
455
|
|
|
|
|
|
|
################################# |
|
456
|
|
|
|
|
|
|
# Remove Duplicates |
|
457
|
|
|
|
|
|
|
################################## |
|
458
|
|
|
|
|
|
|
#due to how cuis are added to the pair hashes |
|
459
|
|
|
|
|
|
|
# it is possible to have duplicate cuis in each |
|
460
|
|
|
|
|
|
|
# remove any duplicates |
|
461
|
17
|
|
|
|
|
28
|
for (my $i = 0; $i < scalar @newPairHashList; $i++) { |
|
462
|
25
|
|
|
|
|
17
|
my $set1Ref = ${$newPairHashList[$i]}{'set1'}; |
|
|
25
|
|
|
|
|
34
|
|
|
463
|
25
|
|
|
|
|
23
|
my $set2Ref = ${$newPairHashList[$i]}{'set2'}; |
|
|
25
|
|
|
|
|
21
|
|
|
464
|
|
|
|
|
|
|
|
|
465
|
|
|
|
|
|
|
#remove duplicates from set1 |
|
466
|
25
|
|
|
|
|
27
|
my %set1 = (); |
|
467
|
25
|
|
|
|
|
21
|
foreach my $term (@{$set1Ref}) { |
|
|
25
|
|
|
|
|
29
|
|
|
468
|
77
|
|
|
|
|
95
|
$set1{$term} = 1; |
|
469
|
|
|
|
|
|
|
} |
|
470
|
25
|
|
|
|
|
43
|
my @set1Terms = keys %set1; |
|
471
|
25
|
|
|
|
|
25
|
${$newPairHashList[$i]}{'set1'} = \@set1Terms; |
|
|
25
|
|
|
|
|
26
|
|
|
472
|
|
|
|
|
|
|
|
|
473
|
|
|
|
|
|
|
#remove duplicates from set2 |
|
474
|
25
|
|
|
|
|
36
|
my %set2 = (); |
|
475
|
25
|
|
|
|
|
21
|
foreach my $term (@{$set2Ref}) { |
|
|
25
|
|
|
|
|
25
|
|
|
476
|
109
|
|
|
|
|
105
|
$set2{$term} = 1; |
|
477
|
|
|
|
|
|
|
} |
|
478
|
25
|
|
|
|
|
36
|
my @set2Terms = keys %set2; |
|
479
|
25
|
|
|
|
|
31
|
${$newPairHashList[$i]}{'set2'} = \@set2Terms; |
|
|
25
|
|
|
|
|
74
|
|
|
480
|
|
|
|
|
|
|
} |
|
481
|
|
|
|
|
|
|
|
|
482
|
|
|
|
|
|
|
################################# |
|
483
|
|
|
|
|
|
|
# Return values |
|
484
|
|
|
|
|
|
|
################################## |
|
485
|
|
|
|
|
|
|
#return the pair hash list of linking sets |
|
486
|
17
|
100
|
|
|
|
22
|
if ($recordStats) { |
|
487
|
8
|
100
|
|
|
|
11
|
if ($recordCounts) { |
|
488
|
4
|
|
|
|
|
5
|
$npp = scalar keys %vocab; |
|
489
|
|
|
|
|
|
|
} |
|
490
|
8
|
|
|
|
|
40
|
return \%n1p, \%np1, $npp, \%matrix, \@newPairHashList; |
|
491
|
|
|
|
|
|
|
} |
|
492
|
|
|
|
|
|
|
else { |
|
493
|
9
|
|
|
|
|
35
|
return \@newPairHashList; |
|
494
|
|
|
|
|
|
|
} |
|
495
|
|
|
|
|
|
|
} |
|
496
|
|
|
|
|
|
|
|
|
497
|
|
|
|
|
|
|
|
|
498
|
|
|
|
|
|
|
1; |
|
499
|
|
|
|
|
|
|
__END__ |