File Coverage

blib/lib/UMLS/Association/Measures/WSA.pm

Criterion	Covered	Total	%
statement	161	161	100.0
branch	23	24	95.8
condition	8	9	88.8
subroutine	6	6	100.0
pod	0	1	0.0
total	198	201	98.5

line	stmt	bran	cond	sub	pod	time	code
1							#UMLS::Association::Measures::WSA
2							# Computes the Weighted Set Association (WSA) between two sets of terms.
3							# WSA finds the shared linking terms between A and C and weights those
4							# terms based on their association to A. Each B term therefore has a
5							# weight relative to its association with A, which is multiplied by
6							# its n11,n1p,np1 to make more associated terms more or less important.
7							# The shared B to C set associaiton is then found using the weighted B
8							# terms to produce the final association score.
9	1			1		4	use strict;
	1					1
	1					18
10	1			1		3	use warnings;
	1					1
	1					808
11
12							package UMLS::Association::Measures::WSA;
13
14
15							# Gets stats (n11,n1p,np1,npp) for each pairHash in the pairHashList
16							# using linking set association (LSA)
17							# Input:
18							# $pairHashListRef - ref to an array of pairHashes
19							# $matrixFileName - the fileName of the co-occurrence matrix
20							# $noOrder - 1 if order is enforced, 0 if not
21							# $paramsRef - the params used to create UMLS::Association which
22							# are used when finding the A to B weights
23							# Output:
24							# \@statsList - ref to an array of \@stats, refs to arrays
25							# containing the ordered values: n11, n1p, np1, npp
26							# for each of the pair hashes. The index of the
27							# \@statsList corresponds to the index of the pairHash
28							# in the input $pairHashListRef
29							sub getStats {
30	2			2	0	2	my $pairHashListRef = shift;
31	2					2	my $matrixFileName = shift;
32	2					3	my $noOrder = shift;
33	2					2	my $paramsRef = shift;
34
35							################## STEP 1 #######################
36							######### Find the linking (B) terms ###########
37							#################################################
38							#read in the matrix - FILE READ ONE
39	2					3	my ($matrixRef, $vocabSize) = &UMLS::Association::StatFinder::readInMatrix($pairHashListRef, $matrixFileName);
40
41							#construct A to shared B pair hashListRef
42	2					3	my @newPairHashList = ();
43	2					2	my @sharedCoocs = ();
44	2					2	foreach my $pairHashRef (@{$pairHashListRef}) {
	2					2
45							#get the linking terms and shared linking terms
46							#MATRIX PASS 1
47	2					4	my ($set1CoocRef, $set2CoocRef) = &_getLinkingTermSets(
48							$pairHashRef, $matrixRef, $noOrder);
49	2					3	my $sharedCoocRef = &_getSharedLinkingTerms(
50							$set1CoocRef, $set2CoocRef);
51	2					2	my @sharedTerms = keys %{$sharedCoocRef};
	2					3
52	2					3	push @sharedCoocs, \@sharedTerms;
53
54							#construct pair hashes
55	2					2	foreach my $term (keys %{$sharedCoocRef}) {
	2					2
56	5					5	my %newPairHash = ();
57	5					5	$newPairHash{'set1'} = ${$pairHashRef}{'set1'};
	5					7
58
59	5					4	my @set2 = ();
60	5					5	push @set2, $term;
61	5					6	$newPairHash{'set2'} = \@set2;
62	5					7	push @newPairHashList, \%newPairHash;
63							}
64							}
65							#Now we have a new pair hash ref which we will get associations
66							# for. This is setA to each B linking term. It is arranged
67							# such that you iterate over the shared terms of each pair hash
68							# to get A to each B for that term (e.g. pairHash1 has 10 linking
69							# terms, the first 10 pairHashes are A a single B. The B terms
70							# are ordered in the sharedCoocs Array of Arrays
71							#Doing it in this manner allows for WSA to be calculated in 3
72							# file reads and number pair hashes + 2 passes of the matrix
73
74							################## STEP 2 #######################
75							####### Find the Weight of each B term ########
76							#################################################
77							#get A to shared B associations for all possible linking B terms
78							# FILE READ 2 - MATRIX PASS 2 (calculateAssociation_pairHashList)
79	2					2	my %optionsHash = ();
80	2	100	66			8	if (defined $noOrder && $noOrder > 0) { $optionsHash{'noorder'} = 1; }
	1					1
81	2					2	$optionsHash{'measure'} = ${$paramsRef}{'measure'};
	2					3
82	2					2	$optionsHash{'matrix'} = ${$paramsRef}{'matrix'};
	2					3
83	2					11	my $assoc = UMLS::Association->new(\%optionsHash);
84	2					3	my $aToBScoresRef = $assoc->_calculateAssociation_pairHashList(\@newPairHashList, ${$paramsRef}{'measure'});
	2					5
85
86							#Normalize the weights unless told not to
87	2					2	my $weightIterator = 0;
88	2					3	my $reweightIterator = 0;
89	2	50				4	if (!$paramsRef->{'nonorm'}) {
90							#normalize the weights for each pairhash
91	2					2	for (my $i = 0; $i < scalar @{$pairHashListRef}; $i++) {
	4					5
92							#get all set B term weights as a hash{term}=weight
93	2					3	my %pairWeights = ();
94	2					2	foreach my $term (@{$sharedCoocs[$i]}) {
	2					2
95	5					4	$pairWeights{$term} = ${$aToBScoresRef}[$weightIterator];
	5					5
96	5					5	$weightIterator++;
97							}
98
99							#scale the weights between 0 and 1 (weight/sum), so weight
100							# becomes a percentage of the total weights
101							# I need to keep weights <= 1 to maintain correctness
102							# of stats (npp in particular, but others as well?)
103	2					3	my $sum = 0;
104	2					3	foreach my $cui (keys %pairWeights) {
105	5					6	$sum += $pairWeights{$cui};
106							}
107	2					3	foreach my $cui (keys %pairWeights) {
108	5					5	${$aToBScoresRef}[$reweightIterator] /= $sum;
	5					4
109	5					7	$reweightIterator++;
110							}
111							}
112							##### Now we have the normalized weights
113							}
114
115							#So now we have the weights for all B terms and for each pair hash. Next
116							# step is to weight the subgraph using these weights for each
117							# pairhash and then calculate the B to C direct assocition
118							# using each of those re-weighted sub graphs
119
120							################## STEP 3 #######################
121							####### Find the WSA between B and C ##########
122							#################################################
123							# Create the B to C pair hash and read in the matrix of B to C terms
124							# MATRIX READ 3 - reqiured because of links between the linking set terms
125							# (e.g. edge 3->4 in sample4. This becomes a source sink if matrix isnt
126							# read in again
127	2					3	my @bToCPairHashList = ();
128	2					2	for (my $i = 0; $i < scalar @{$pairHashListRef}; $i++) {
	4					5
129							#construct the B to C pair Hash
130	2					2	my %pairHash = ();
131	2					2	$pairHash{'set1'} = $sharedCoocs[$i];
132	2					2	$pairHash{'set2'} = ${${$pairHashListRef}[$i]}{'set2'};
	2					2
	2					2
133	2					4	push @bToCPairHashList, \%pairHash;
134							}
135	2					3	($matrixRef, $vocabSize) = &UMLS::Association::StatFinder::readInMatrix(\@bToCPairHashList, $matrixFileName);
136
137							# MATRIX PASS +numPairHashes - to calculate WSA we need to
138							# reweight the matrix differently for each pairHash
139							#get WSA Stats (n11,n1p,np1,npp) for each pairHash
140	2					6	$weightIterator = 0;
141	2					3	my @statsList = ();
142	2					2	for (my $i = 0; $i < scalar @{$pairHashListRef}; $i++) {
	4					7
143							#get all set B term weights as a hash{term}=weight
144	2					2	my %weights = ();
145	2					2	foreach my $term (@{$sharedCoocs[$i]}) {
	2					4
146	5					3	$weights{$term} = ${$aToBScoresRef}[$weightIterator];
	5					7
147	5					5	$weightIterator++;
148							}
149
150							#get the weighted subgraph
151	2					4	my $weightedSubGraphRef = &_constructWeightedSubGraph($matrixRef, $bToCPairHashList[$i], \%weights);
152
153							#calculate n11, n1p, np1, npp, using the weights specific to
154							# this pair hash, and save the results
155	2					4	my ($n1pRef, $np1Ref, $npp) = &UMLS::Association::Measures::Direct::_getAllCounts($weightedSubGraphRef);
156	2					5	push @statsList, &UMLS::Association::Measures::Direct::_statsFromAllCounts(
157							$weightedSubGraphRef, $n1pRef, $np1Ref, $npp, $noOrder, $bToCPairHashList[$i], \%weights);
158							}
159
160							#return the stats list, an array of array refs
161							# each array ref conatins four values:
162							# n11, n1p, np1, and npp for the pair hash at
163							# the corresponding index in the pairHashList
164	2					10	return \@statsList;
165							}
166
167
168
169							##################################################
170							# Sub Graph Construction
171							##################################################
172							#builds a subgraph relevant to this pair hash this includes adding
173							# cuis in other pair hashes to the universal source/sink, and collapsing
174							# edges to create set-nodes rather than cui nodes This also takes care of
175							# noOrder weights contains - hash{term} = weight
176							# Input:
177							# $matrixRef - ref to a matrix from which we construct a subgraph
178							# $pairHashRef - ref to a pairHash
179							# $weightsRef - ref to a hash{cui} = weight of that cui
180							# Output:
181							# \%subGraph - a weighted subgraph for this pairHash
182							sub _constructWeightedSubGraph {
183	2			2		3	my $matrixRef = shift;
184	2					2	my $pairHashRef = shift;
185	2					2	my $weightsRef = shift;
186
187							#convert the pair hash to two hashes of cuis
188	2					2	my %set1 = ();
189	2					2	foreach my $key (@{${$pairHashRef}{'set1'}}) {
	2					2
	2					4
190	5					5	$set1{$key} = 1;
191							}
192	2					3	my %set2 = ();
193	2					2	foreach my $key (@{${$pairHashRef}{'set2'}}) {
	2					2
	2					4
194	4					5	$set2{$key} = 1;
195							}
196
197							# Restrict graph to nodes in this pairhash. That is,
198							# set any nodes outside of sets1 and 2 to be the
199							# universal source and sink
200							#initalize the sub graph
201	2					2	my %subGraph = ();
202	2					3	my %emptyHash = ();
203	2					3	$subGraph{'source'} = \%emptyHash;
204
205							#loop through all source and targets, and if not in
206							# either of the sets, replace with the universal
207							# sink or source
208	2					3	foreach my $source (keys %{$matrixRef}) {
	2					5
209							#convert source to the universal source
210							# node if it is not in this pair hash
211	15					15	my $newSource = $source;
212	15	100	100			28	if (!exists $set1{$source} && !exists $set2{$source}) {
213	6					6	$newSource = 'source';
214							}
215
216							#go through all targets for this source
217	15					16	foreach my $target (keys %{${$matrixRef}{$source}}) {
	15					10
	15					23
218							#convert to universal sink if node is
219							# not in this pair hash
220	18					17	my $newTarget = $target;
221	18	100	100			28	if (!exists $set1{$target} && !exists $set2{$target}) {
222	5					11	$newTarget = 'sink';
223							}
224
225							#weights the value (if both source and target
226							# have weights, then weight is the their product)
227	18					17	my $value = ${${$matrixRef}{$source}}{$target};
	18					13
	18					20
228	18	100				16	if (defined ${$weightsRef}{$source}){
	18					20
229	7					8	$value *= ${$weightsRef}{$source};
	7					7
230							}
231	18	100				16	if (defined ${$weightsRef}{$target}) {
	18					20
232	9					7	$value *= ${$weightsRef}{$target};
	9					10
233							}
234
235							#add the value to the subgraph
236	18					18	${$subGraph{$newSource}}{$newTarget} += $value;
	18					23
237							}
238							}
239							#At this point, the sub graph has been converted, such that
240							# it contains only the nodes in this pair hash. All other nodes
241							# have been converted to the universal source and univerals sink
242
243							#return the subgraph
244	2					5	return \%subGraph;
245							}
246
247
248							##################################################
249							# Linking Set Acquisition
250							##################################################
251							# Find the linking terms (direct co-occurrences) between sets 1 and
252							# sets 2 and outputs them as co-occurrence hashes (hash{cui}=1)
253							# Input:
254							# $pairHashRef - ref to a pairHash
255							# $matrixRef - ref to the read in co-occurrence matrix
256							# $noOrder - 1 if order is enforced, 0 if not
257							# Output:
258							# \%set1Cooc - a hash{cui}=1 of all of set 1's direct co-occurrences
259							# (order/noOrder is accounted for)
260							# \%set2Cooc - a hash{cui}=1 of all of set 2's direct co-occurrences
261							# (order/noOrder is accounted for)
262							sub _getLinkingTermSets {
263	2			2		3	my $pairHashRef = shift;
264	2					2	my $matrixRef = shift;
265	2					2	my $noOrder = shift;
266
267							#convert pair hash to sets 1 and 2 hashes
268	2					3	my %set1 = ();
269	2					2	foreach my $node (@{${$pairHashRef}{'set1'}}) {
	2					1
	2					4
270	4					6	$set1{$node} = 1;
271							}
272	2					2	my %set2 = ();
273	2					2	foreach my $node (@{${$pairHashRef}{'set2'}}) {
	2					1
	2					3
274	4					4	$set2{$node} = 1;
275							}
276
277							#get all co-occurring terms with set1 and set2
278	2					3	my %set1Cooc = ();
279	2					3	my %set2Cooc = ();
280							#check all nodes in the dataset
281	2					2	foreach my $source (keys %{$matrixRef}) {
	2					4
282	16					16	foreach my $target (keys %{${$matrixRef}{$source}}) {
	16					12
	16					23
283							#add co-occurrences to set1 and set2
284	18	100				21	if (exists $set1{$source}) {
285	6					7	$set1Cooc{$target} = 1;
286							}
287	18	100				20	if (exists $set2{$target}) {
288	4					4	$set2Cooc{$source} = 1;
289							}
290
291							#if noorder, add co-occurrences
292							# to set1 and set2
293	18	100				21	if ($noOrder) {
294	9	100				11	if (exists $set1{$target}) {
295	1					1	$set1Cooc{$source} = 1;
296							}
297	9	100				11	if (exists $set2{$source}) {
298	2					2	$set2Cooc{$target} = 1;
299							}
300							}
301							}
302							}
303
304							#return the two co-occurring sets
305	2					4	return (\%set1Cooc, \%set2Cooc);
306							}
307
308							# Finds the shared co-occurrences between the two input co-occurrence hashes
309							# Input:
310							# \%set1Cooc - a hash{cui}=1 of all of set 1's direct co-occurrences
311							# (order/noOrder is accounted for)
312							# \%set2Cooc - a hash{cui}=1 of all of set 2's direct co-occurrences
313							# (order/noOrder is accounted for)
314							# Output:
315							# \%sharedCooc - a hash{cui}=1 of the shared co-occurrences between
316							# the input co-occurrence hashes
317							sub _getSharedLinkingTerms {
318	2			2		2	my $set1CoocRef = shift;
319	2					2	my $set2CoocRef = shift;
320
321							#get the shared linking terms between
322							# set1 and set2 co-occurrences
323	2					2	my %sharedCooc = ();
324	2					2	foreach my $node (keys %{$set1CoocRef}) {
	2					3
325	6	100				6	if (defined ${$set2CoocRef}{$node}) {
	6					7
326	5					6	$sharedCooc{$node} = 1;
327							}
328							}
329	2					3	return \%sharedCooc;
330							}
331
332
333							1;