blib/lib/GO/AnnotationProvider/AnnotationParser.pm | |||
---|---|---|---|
Criterion | Covered | Total | % |
statement | 152 | 321 | 47.3 |
branch | 45 | 154 | 29.2 |
condition | 13 | 62 | 20.9 |
subroutine | 24 | 37 | 64.8 |
pod | 22 | 22 | 100.0 |
total | 256 | 596 | 42.9 |
line | stmt | bran | cond | sub | pod | time | code |
---|---|---|---|---|---|---|---|
1 | package GO::AnnotationProvider::AnnotationParser; | ||||||
2 | |||||||
3 | # File : AnnotationParser.pm | ||||||
4 | # Authors : Elizabeth Boyle; Gavin Sherlock | ||||||
5 | # Date Begun : Summer 2001 | ||||||
6 | # Rewritten : September 25th 2002 | ||||||
7 | |||||||
8 | # $Id: AnnotationParser.pm,v 1.35 2008/05/13 23:06:16 sherlock Exp $ | ||||||
9 | |||||||
10 | # Copyright (c) 2003 Gavin Sherlock; Stanford University | ||||||
11 | |||||||
12 | # Permission is hereby granted, free of charge, to any person | ||||||
13 | # obtaining a copy of this software and associated documentation files | ||||||
14 | # (the "Software"), to deal in the Software without restriction, | ||||||
15 | # including without limitation the rights to use, copy, modify, merge, | ||||||
16 | # publish, distribute, sublicense, and/or sell copies of the Software, | ||||||
17 | # and to permit persons to whom the Software is furnished to do so, | ||||||
18 | # subject to the following conditions: | ||||||
19 | |||||||
20 | # The above copyright notice and this permission notice shall be | ||||||
21 | # included in all copies or substantial portions of the Software. | ||||||
22 | |||||||
23 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | ||||||
24 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | ||||||
25 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | ||||||
26 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | ||||||
27 | # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | ||||||
28 | # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | ||||||
29 | # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||||||
30 | # SOFTWARE. | ||||||
31 | |||||||
32 | =pod | ||||||
33 | |||||||
34 | =head1 NAME | ||||||
35 | |||||||
36 | GO::AnnotationProvider::AnnotationParser - parses a gene annotation file | ||||||
37 | |||||||
38 | =head1 SYNOPSIS | ||||||
39 | |||||||
40 | GO::AnnotationProvider::AnnotationParser - reads a Gene Ontology gene | ||||||
41 | associations file, and provides methods by which to retrieve the GO | ||||||
42 | annotations for an annotated entity. Note, it is case | ||||||
43 | insensitive, with some caveats - see documentation below. | ||||||
44 | |||||||
45 | my $annotationParser = GO::AnnotationProvider::AnnotationParser->new(annotationFile => "data/gene_association.sgd"); | ||||||
46 | |||||||
47 | my $geneName = "AAT2"; | ||||||
48 | |||||||
49 | print "GO associations for gene: ", join (" ", $annotationParser->goIdsByName(name => $geneName, | ||||||
50 | aspect => 'P')), "\n"; | ||||||
51 | |||||||
52 | print "Database ID for gene: ", $annotationParser->databaseIdByName($geneName), "\n"; | ||||||
53 | |||||||
54 | print "Database name: ", $annotationParser->databaseName(), "\n"; | ||||||
55 | |||||||
56 | print "Standard name for gene: ", $annotationParser->standardNameByName($geneName), "\n"; | ||||||
57 | |||||||
58 | my $i; | ||||||
59 | |||||||
60 | my @geneNames = $annotationParser->allStandardNames(); | ||||||
61 | |||||||
62 | foreach $i (0..10) { | ||||||
63 | |||||||
64 | print "$geneNames[$i]\n"; | ||||||
65 | |||||||
66 | } | ||||||
67 | |||||||
68 | =head1 DESCRIPTION | ||||||
69 | |||||||
70 | GO::AnnotationProvider::AnnotationParser is a concrete subclass of | ||||||
71 | GO::AnnotationProvider, and creates a data structure mapping gene | ||||||
72 | names to GO annotations by parsing a file of annotations provided by | ||||||
73 | the Gene Ontology Consortium. | ||||||
74 | |||||||
75 | This package provides object methods for retrieving GO annotations | ||||||
76 | that have been parsed from a 'gene associations' file, provided by | ||||||
77 | the gene ontology consortium. The format for the file is: | ||||||
78 | |||||||
79 | Lines beginning with a '!' character are comment lines. | ||||||
80 | |||||||
81 | Column Cardinality Contents | ||||||
82 | ------ ----------- ------------------------------------------------------------- | ||||||
83 | 0 1 Database abbreviation for the source of annotation (e.g. SGD) | ||||||
84 | 1 1 Database identifier of the annotated entity | ||||||
85 | 2 1 Standard name of the annotated entity | ||||||
86 | 3 0,1 NOT (if a gene is specifically NOT annotated to the term) | ||||||
87 | 4 1 GOID of the annotation | ||||||
88 | 5 1,n Reference(s) for the annotation | ||||||
89 | 6 1 Evidence code for the annotation | ||||||
90 | 7 0,n With or From (a bit mysterious) | ||||||
91 | 8 1 Aspect of the Annotation (C, F, P) | ||||||
92 | 9 0,1 Name of the product being annotated | ||||||
93 | 10 0,n Alias(es) of the annotated product | ||||||
94 | 11 1 type of annotated entity (one of gene, transcript, protein) | ||||||
95 | 12 1,2 taxonomic id of the organism encoding and/or using the product | ||||||
96 | 13 1 Date of annotation YYYYMMDD | ||||||
97 | 14 1 Assigned_by : The database which made the annotation | ||||||
98 | |||||||
99 | Columns are separated by tabs. For those entries with a cardinality | ||||||
100 | greater than 1, multiple entries are pipe , |, delimited. | ||||||
101 | |||||||
102 | Further details can be found at: | ||||||
103 | |||||||
104 | http://www.geneontology.org/doc/GO.annotation.html#file | ||||||
105 | |||||||
106 | The following assumptions about the file are made (and should be true): | ||||||
107 | |||||||
108 | 1. All aliases appear for all entries of a given annotated product | ||||||
109 | 2. The database identifiers are unique, in that two different | ||||||
110 | entities cannot have the same database id. | ||||||
111 | |||||||
112 | =head1 TODO | ||||||
113 | |||||||
114 | Also see the TODO list in the parent, GO::AnnotationProvider. | ||||||
115 | |||||||
116 | 1. Add in methods that will allow retrieval of evidence codes with | ||||||
117 | the annotations for a particular entity. | ||||||
118 | |||||||
119 | 2. Add in methods that return all the annotated entities for a | ||||||
120 | particular GOID. | ||||||
121 | |||||||
122 | 3. Add in the ability to request only annotations either including | ||||||
123 | or excluding particular evidence codes. Such evidence codes | ||||||
124 | could be provided as an anonymous array as the value of a named | ||||||
125 | argument. | ||||||
126 | |||||||
127 | 4. Same as number 3, except allow the retrieval of annotated | ||||||
128 | entities for a particular GOID, based on inclusion or exclusion | ||||||
129 | of certain evidence codes. | ||||||
130 | |||||||
131 | These first four items will require a reworking of how data are | ||||||
132 | stored on the backend, and thus the parsing code itself, though it | ||||||
133 | should not affect any of the already existing API. | ||||||
134 | |||||||
135 | 5. Instead of 'use'ing Storable, 'require' it instead, only at the | ||||||
136 | point of use, which will mean that AnnotationParser can be | ||||||
137 | happily used in the absence of Storable, just without those | ||||||
138 | functions that need it. | ||||||
139 | |||||||
140 | 6. Extend the ValidateFile class method to check that an entity | ||||||
141 | should never be annotated to the same node twice, with the same | ||||||
142 | evidence, with the same reference. | ||||||
143 | |||||||
144 | 7. An additional checker, that uses an AnnotationProvider in | ||||||
145 | conjunction with an OntologyProvider, would be useful, that | ||||||
146 | checks that some of the annotations themselves are valid, ie | ||||||
147 | that no entities are annotated to the 'unknown' node in a | ||||||
148 | particular aspect, and also to another node within that same | ||||||
149 | aspect. Can annotations be redundant? ie, if an entity is | ||||||
150 | annotated to a node, and an ancestor of the node, is that | ||||||
151 | annotation redundant? Does it depend on the evidence codes and | ||||||
152 | references? Or are such annotations reinforcing? These things | ||||||
153 | are useful to consider when formulating the confidence which can | ||||||
154 | be attributed to an annotation. | ||||||
155 | |||||||
156 | =cut | ||||||
157 | |||||||
158 | 2 | 2 | 221291 | use strict; | |||
2 | 6 | ||||||
2 | 109 | ||||||
159 | 2 | 2 | 14 | use warnings; | |||
2 | 4 | ||||||
2 | 2399 | ||||||
160 | 2 | 2 | 18 | use diagnostics; | |||
2 | 5 | ||||||
2 | 18 | ||||||
161 | |||||||
162 | 2 | 2 | 6881 | use Storable qw (nstore); | |||
2 | 9108 | ||||||
2 | 158 | ||||||
163 | 2 | 2 | 1721 | use IO::File; | |||
2 | 19875 | ||||||
2 | 294 | ||||||
164 | |||||||
165 | 2 | 2 | 16 | use vars qw (@ISA $PACKAGE $VERSION); | |||
2 | 5 | ||||||
2 | 115 | ||||||
166 | |||||||
167 | 2 | 2 | 3515 | use GO::AnnotationProvider; | |||
2 | 5 | ||||||
2 | 10147 | ||||||
168 | @ISA = qw (GO::AnnotationProvider); | ||||||
169 | |||||||
170 | $PACKAGE = "GO::AnnotationProvider::AnnotationParser"; | ||||||
171 | $VERSION = "0.15"; | ||||||
172 | |||||||
173 | # CLASS Attributes | ||||||
174 | # | ||||||
175 | # These should be considered as constants, and are initialized here | ||||||
176 | |||||||
177 | my $DEBUG = 0; | ||||||
178 | |||||||
179 | # constants for instance attribute name | ||||||
180 | |||||||
181 | |||||||
182 | my $kDatabaseName = $PACKAGE.'::__databaseName'; # stores the name of the annotating database | ||||||
183 | my $kFileName = $PACKAGE.'::__fileName'; # stores the name of the file used to instantiate the object | ||||||
184 | my $kNameToIdMapInsensitive = $PACKAGE.'::__nameToIdMapInsensitive'; # stores a case insensitive map of all unambiguous names for a gene to the database id | ||||||
185 | my $kNameToIdMapSensitive = $PACKAGE.'::__nameToIdMapSensitive'; # stores a case sensitive map of all names where a particular casing is unambiguous for a gene to the database id | ||||||
186 | my $kAmbiguousNames = $PACKAGE.'::__ambiguousNames'; # stores the database id's for all ambiguous names | ||||||
187 | my $kIdToStandardName = $PACKAGE.'::__idToStandardName'; # stores a map of database id's to standard names of all entities | ||||||
188 | my $kStandardNameToId = $PACKAGE.'::__StandardNameToId'; # stores a map of standard names to their database id's | ||||||
189 | my $kUcIdToId = $PACKAGE.'::__ucIdToId'; # stores a map of uppercased databaseIds to the databaseId | ||||||
190 | my $kUcStdNameToStdName = $PACKAGE.'::__ucStdNameToStdName'; # stores a map of uppercased standard names to the standard name | ||||||
191 | my $kNameToCount = $PACKAGE.'::__nameToCount'; # stores a case sensitive map of the number of times a name has been seen | ||||||
192 | my $kGoids = $PACKAGE.'::__goids'; # stores all the goid annotations | ||||||
193 | my $kNumAnnotatedGenes = $PACKAGE.'::__numAnnotatedGenes'; # stores number of genes with annotations, per aspect | ||||||
194 | |||||||
195 | my $kAmbiguousNamesSensitive = $PACKAGE.'::__ambiguousNamesSensitive'; # names (case sensitive) that are ambiguous | ||||||
196 | |||||||
197 | my $kTotalNumAnnotatedGenes = $PACKAGE.'::__totalNumAnnotatedGenes'; # total number of annotated genes | ||||||
198 | |||||||
199 | # constants to describe what is in which column in the annotation file | ||||||
200 | |||||||
201 | my $kDatabaseNameColumn = 0; | ||||||
202 | my $kDatabaseIdColumn = 1; | ||||||
203 | my $kStandardNameColumn = 2; | ||||||
204 | my $kNotColumn = 3; | ||||||
205 | my $kGoidColumn = 4; | ||||||
206 | my $kReferenceColumn = 5; | ||||||
207 | my $kEvidenceColumn = 6; | ||||||
208 | my $kWithColumn = 7; | ||||||
209 | my $kAspectColumn = 8; | ||||||
210 | my $kNameColumn = 9; | ||||||
211 | my $kAliasesColumn = 10; | ||||||
212 | my $kEntityTypeColumn = 11; | ||||||
213 | my $kTaxonomicIDColumn = 12; | ||||||
214 | my $kDateColumn = 13; | ||||||
215 | my $kAssignedByColumn = 14; | ||||||
216 | |||||||
217 | # the following hash of anonymous arrays indicates for each column | ||||||
218 | # what the maximum and minimum number of entries per column can be. | ||||||
219 | # If no maximum is indicated, then the maximum is equal to the | ||||||
220 | # minimum, and exactly that number of entries must exist. | ||||||
221 | |||||||
222 | my %kColumnsToCardinality = ($kDatabaseNameColumn => [1 ], | ||||||
223 | $kDatabaseIdColumn => [1 ], | ||||||
224 | $kStandardNameColumn => [1 ], | ||||||
225 | $kNotColumn => [0, 1], | ||||||
226 | $kGoidColumn => [1 ], | ||||||
227 | $kReferenceColumn => [1, "n"], | ||||||
228 | $kEvidenceColumn => [1 ], | ||||||
229 | $kWithColumn => [0, "n"], | ||||||
230 | $kAspectColumn => [1 ], | ||||||
231 | $kNameColumn => [0, 1], | ||||||
232 | $kAliasesColumn => [0, "n"], | ||||||
233 | $kEntityTypeColumn => [1 ], | ||||||
234 | $kTaxonomicIDColumn => [1, 2], | ||||||
235 | $kDateColumn => [1 ], | ||||||
236 | $kAssignedByColumn => [1 ]); | ||||||
237 | |||||||
238 | my $kNumColumnsInFile = scalar keys %kColumnsToCardinality; | ||||||
239 | |||||||
240 | =pod | ||||||
241 | |||||||
242 | =head1 Class Methods | ||||||
243 | |||||||
244 | =cut | ||||||
245 | |||||||
246 | ############################################################################ | ||||||
247 | sub Usage{ | ||||||
248 | ############################################################################ | ||||||
249 | =pod | ||||||
250 | |||||||
251 | =head2 Usage | ||||||
252 | |||||||
253 | This class method simply prints out a usage statement, along with an | ||||||
254 | error message, if one was passed in. | ||||||
255 | |||||||
256 | Usage : | ||||||
257 | |||||||
258 | GO::AnnotationProvider::AnnotationParser->Usage(); | ||||||
259 | |||||||
260 | =cut | ||||||
261 | |||||||
262 | 0 | 0 | 1 | 0 | my ($class, $message) = @_; | ||
263 | |||||||
264 | 0 | 0 | 0 | defined $message && print $message."\n\n"; | |||
265 | |||||||
266 | 0 | 0 | print 'The constructor expects one of two arguments, either a | ||||
267 | \'annotationFile\' argument, or and \'objectFile\' argument. When | ||||||
268 | instantiated with an annotationFile argument, it expects it to | ||||||
269 | correspond to an annotation file created by one of the GO consortium | ||||||
270 | members, according to their file format. When instantiated with an | ||||||
271 | objectFile argument, it expects to open a previously created | ||||||
272 | annotationParser object that has been serialized to disk (see the | ||||||
273 | serializeToDisk method). | ||||||
274 | |||||||
275 | Usage: | ||||||
276 | |||||||
277 | my $annotationParser = '.$PACKAGE.'->new(annotationFile => $file); | ||||||
278 | |||||||
279 | my $annotationParser = '.$PACKAGE.'->new(objectFile => $file); | ||||||
280 | '; | ||||||
281 | |||||||
282 | } | ||||||
283 | |||||||
284 | ############################################################################ | ||||||
285 | sub ValidateFile{ | ||||||
286 | ############################################################################ | ||||||
287 | =pod | ||||||
288 | |||||||
289 | =head2 ValidateFile | ||||||
290 | |||||||
291 | This class method reads an annotation file, and returns a reference to | ||||||
292 | an array of errors that are present within the file. The errors are | ||||||
293 | simply strings, each beginning with "Line $lineNo : " where $lineNo is | ||||||
294 | the number of the line in the file where the error was found. | ||||||
295 | |||||||
296 | Usage: | ||||||
297 | |||||||
298 | my $errorsRef = GO::AnnotationProvider::AnnotationParser->ValidateFile(annotationFile => $file); | ||||||
299 | |||||||
300 | =cut | ||||||
301 | |||||||
302 | 0 | 0 | 1 | 0 | my ($class, %args) = @_; | ||
303 | |||||||
304 | 0 | 0 | 0 | my $file = $args{'annotationFile'} || $class->_handleMissingArgument(argument => 'annotationFile'); | |||
305 | |||||||
306 | 0 | 0 | 0 | my $annotationsFh = IO::File->new($file, q{<} )|| die "$PACKAGE cannot open $file : $!"; | |||
307 | |||||||
308 | 0 | 0 | my (@errors, @line); | ||||
309 | |||||||
310 | 0 | 0 | my ($databaseId, $standardName, $aliases); | ||||
311 | 0 | 0 | my (%idToName, %idToAliases); | ||||
312 | |||||||
313 | 0 | 0 | my $lineNo = 0; | ||||
314 | |||||||
315 | 0 | 0 | while (<$annotationsFh>){ | ||||
316 | |||||||
317 | 0 | 0 | ++$lineNo; | ||||
318 | |||||||
319 | 0 | 0 | 0 | next if $_ =~ m/^!/; # skip comment lines | |||
320 | |||||||
321 | 0 | 0 | chomp; | ||||
322 | |||||||
323 | 0 | 0 | 0 | next unless $_; # skip an empty line, if there is one | |||
324 | |||||||
325 | 0 | 0 | @line = split("\t", $_, -1); | ||||
326 | |||||||
327 | 0 | 0 | 0 | if (scalar @line != $kNumColumnsInFile){ # doesn't have the correct number of columns | |||
328 | |||||||
329 | 0 | 0 | push (@errors, "Line $lineNo has ". scalar @line. "columns, instead of $kNumColumnsInFile."); | ||||
330 | |||||||
331 | } | ||||||
332 | |||||||
333 | 0 | 0 | $class->__CheckCardinalityOfColumns(\@errors, \@line, $lineNo); | ||||
334 | |||||||
335 | # now want to deal with sanity checks... | ||||||
336 | |||||||
337 | 0 | 0 | ($databaseId, $standardName, $aliases) = @line[$kDatabaseIdColumn, $kStandardNameColumn, $kAliasesColumn]; | ||||
338 | |||||||
339 | 0 | 0 | 0 | next if ($databaseId eq ""); # will have given incorrect cardinality, but nothing more we can do with it | |||
340 | |||||||
341 | 0 | 0 | 0 | if (!exists $idToName{$databaseId}){ | |||
0 | |||||||
342 | |||||||
343 | 0 | 0 | $idToName{$databaseId} = $standardName; | ||||
344 | |||||||
345 | }elsif ($idToName{$databaseId} ne $standardName){ | ||||||
346 | |||||||
347 | 0 | 0 | push (@errors, "Line $lineNo : $databaseId has more than one standard name : $idToName{$databaseId} and $standardName."); | ||||
348 | |||||||
349 | } | ||||||
350 | |||||||
351 | 0 | 0 | 0 | if (!exists $idToAliases{$databaseId}){ | |||
0 | |||||||
352 | |||||||
353 | 0 | 0 | $idToAliases{$databaseId} = $aliases; | ||||
354 | |||||||
355 | }elsif($idToAliases{$databaseId} ne $aliases){ | ||||||
356 | |||||||
357 | 0 | 0 | push (@errors, "Line $lineNo : $databaseId has more than one collections of aliases : $idToAliases{$databaseId} and $aliases."); | ||||
358 | |||||||
359 | } | ||||||
360 | |||||||
361 | } | ||||||
362 | |||||||
363 | 0 | 0 | 0 | $annotationsFh->close || die "$PACKAGE cannot close $file : $!"; | |||
364 | |||||||
365 | 0 | 0 | return \@errors; | ||||
366 | |||||||
367 | } | ||||||
368 | |||||||
369 | ############################################################################ | ||||||
370 | sub __CheckCardinalityOfColumns{ | ||||||
371 | ############################################################################ | ||||||
372 | # This method checks the cardinality of each column on a line | ||||||
373 | # | ||||||
374 | # Usage: | ||||||
375 | # | ||||||
376 | # $class->__CheckCardinalityOfColumns(\@errors, \@line, $lineNo); | ||||||
377 | |||||||
378 | 0 | 0 | 0 | my ($class, $errorsRef, $lineRef, $lineNo) = @_; | |||
379 | |||||||
380 | 0 | 0 | my ($cardinality, $min, $max); | ||||
381 | |||||||
382 | 0 | 0 | foreach my $column (sort {$a<=>$b} keys %kColumnsToCardinality){ | ||||
0 | 0 | ||||||
383 | |||||||
384 | 0 | 0 | ($min, $max) = @{$kColumnsToCardinality{$column}}[0,1]; | ||||
0 | 0 | ||||||
385 | |||||||
386 | 0 | 0 | $cardinality = $class->__GetCardinality($lineRef->[$column], $errorsRef, $lineNo); | ||||
387 | |||||||
388 | 0 | 0 | 0 | if (!defined $max){ # must have a defined number of entries | |||
389 | |||||||
390 | 0 | 0 | 0 | if ($cardinality != $min){ | |||
391 | |||||||
392 | 0 | 0 | push (@{$errorsRef}, "Line $lineNo : column $column has a cardinality of $cardinality, instead of $min."); | ||||
0 | 0 | ||||||
393 | |||||||
394 | } | ||||||
395 | |||||||
396 | }else{ # there's a range of allowed number of entries | ||||||
397 | |||||||
398 | 0 | 0 | 0 | 0 | if ($cardinality < $min){ # check if less than minimum | ||
0 | |||||||
399 | |||||||
400 | 0 | 0 | push (@{$errorsRef}, "Line $lineNo : column $column has a cardinality of $cardinality, which is less than the required $min."); | ||||
0 | 0 | ||||||
401 | |||||||
402 | }elsif ($kColumnsToCardinality{$column}->[1] ne 'n' && | ||||||
403 | $cardinality > $max){ # check if more than maximum | ||||||
404 | |||||||
405 | 0 | 0 | push (@{$errorsRef}, "Line $lineNo : column $column has a cardinality of $cardinality, which is more than the allowed $max."); | ||||
0 | 0 | ||||||
406 | |||||||
407 | } | ||||||
408 | |||||||
409 | } | ||||||
410 | |||||||
411 | } | ||||||
412 | |||||||
413 | } | ||||||
414 | |||||||
415 | ############################################################################ | ||||||
416 | sub __GetCardinality{ | ||||||
417 | ############################################################################ | ||||||
418 | # This private method returns an integer that indicates the | ||||||
419 | # cardinality of a text string, where multiple entries are assumed to | ||||||
420 | # be separated by the pipe character (|). In addition, it checks | ||||||
421 | # whether there are null or whitespace only entries. | ||||||
422 | # | ||||||
423 | # Usage: | ||||||
424 | # | ||||||
425 | # my $cardinality = $class->__GetCardinality($string, $errorsRef, $lineNo); | ||||||
426 | |||||||
427 | 0 | 0 | 0 | my ($class, $string, $errorsRef, $lineNo) = @_; | |||
428 | |||||||
429 | 0 | 0 | my $cardinality; | ||||
430 | |||||||
431 | 0 | 0 | 0 | 0 | if (!defined $string || $string eq ""){ | ||
432 | |||||||
433 | 0 | 0 | $cardinality = 0; | ||||
434 | |||||||
435 | }else{ | ||||||
436 | |||||||
437 | 0 | 0 | my @entries = split(/\|/, $string, -1); | ||||
438 | |||||||
439 | 0 | 0 | foreach my $entry (@entries){ | ||||
440 | |||||||
441 | 0 | 0 | 0 | if (!defined $entry){ | |||
0 | |||||||
442 | |||||||
443 | 0 | 0 | push (@{$errorsRef}, "Line $lineNo : There is an undefined value in the string $string."); | ||||
0 | 0 | ||||||
444 | |||||||
445 | }elsif ($entry =~ /^\s+$/){ | ||||||
446 | |||||||
447 | 0 | 0 | push (@{$errorsRef}, "Line $lineNo : There is a white-space only value in the string $string."); | ||||
0 | 0 | ||||||
448 | |||||||
449 | } | ||||||
450 | |||||||
451 | } | ||||||
452 | |||||||
453 | 0 | 0 | $cardinality = scalar @entries; | ||||
454 | |||||||
455 | } | ||||||
456 | |||||||
457 | 0 | 0 | return $cardinality; | ||||
458 | |||||||
459 | } | ||||||
460 | |||||||
461 | ############################################################################ | ||||||
462 | # | ||||||
463 | # Constructor, and initialization methods. | ||||||
464 | # | ||||||
465 | # All initialization methods are private, except, of course, for the | ||||||
466 | # new() method. | ||||||
467 | # | ||||||
468 | ############################################################################ | ||||||
469 | |||||||
470 | ############################################################################ | ||||||
471 | sub new{ | ||||||
472 | ############################################################################ | ||||||
473 | =pod | ||||||
474 | |||||||
475 | =head1 Constructor | ||||||
476 | |||||||
477 | =head2 new | ||||||
478 | |||||||
479 | This is the constructor for an AnnotationParser object. | ||||||
480 | |||||||
481 | The constructor expects one of two arguments, either an | ||||||
482 | 'annotationFile' argument, or an 'objectFile' argument. When | ||||||
483 | instantiated with an annotationFile argument, it expects it to | ||||||
484 | correspond to an annotation file created by one of the GO consortium | ||||||
485 | members, according to their file format. When instantiated with an | ||||||
486 | objectFile argument, it expects to open a previously created | ||||||
487 | annotationParser object that has been serialized to disk (see the | ||||||
488 | serializeToDisk method). | ||||||
489 | |||||||
490 | Usage: | ||||||
491 | |||||||
492 | my $annotationParser = GO::AnnotationProvider::AnnotationParser->new(annotationFile => $file); | ||||||
493 | |||||||
494 | my $annotationParser = GO::AnnotationProvider::AnnotationParser->new(objectFile => $file); | ||||||
495 | |||||||
496 | =cut | ||||||
497 | |||||||
498 | |||||||
499 | 3 | 3 | 1 | 73 | my ($class, %args) = @_; | ||
500 | |||||||
501 | 3 | 6 | my $self; | ||||
502 | |||||||
503 | 3 | 50 | 27 | if (exists($args{'annotationFile'})){ | |||
0 | |||||||
504 | |||||||
505 | 3 | 6 | $self = {}; | ||||
506 | |||||||
507 | 3 | 9 | bless $self, $class; | ||||
508 | |||||||
509 | 3 | 16 | $self->__init($args{'annotationFile'}); | ||||
510 | |||||||
511 | }elsif (exists($args{'objectFile'})){ | ||||||
512 | |||||||
513 | 0 | 0 | 0 | $self = Storable::retrieve($args{'objectFile'}) || die "Could not instantiate $PACKAGE object from objectFile : $!"; | |||
514 | |||||||
515 | 0 | 0 | $self->__setFile($args{'objectFile'}); | ||||
516 | |||||||
517 | }else{ | ||||||
518 | |||||||
519 | 0 | 0 | $class->Usage("An annotationFile or objectFile argument must be provided."); | ||||
520 | 0 | 0 | die; | ||||
521 | |||||||
522 | } | ||||||
523 | |||||||
524 | # now, we have to make some alteration to some hashes to support | ||||||
525 | # our API for case insensitivity. The API says that if a name is | ||||||
526 | # supplied that would otherwise be ambiguous, but has a unique | ||||||
527 | # casing, then we will accept it as that unique cased version. | ||||||
528 | # Thus, we need to make sure that our $kNameToIdMapSensitive hash | ||||||
529 | # only tracks those names that were unique in a particular case | ||||||
530 | |||||||
531 | 3 | 2223 | foreach my $name (keys %{$self->{$kNameToCount}}){ | ||||
3 | 22829 | ||||||
532 | |||||||
533 | # go through the hash that has a count of each name | ||||||
534 | |||||||
535 | 40383 | 100 | 100 | 233133 | if ($self->{$kNameToCount}{$name} > 1 || exists $self->{$kNameToIdMapInsensitive}{uc($name)}){ | ||
536 | |||||||
537 | # if it was seen more than once, or is known to be unique | ||||||
538 | # in a case insensitive fashion, then delete it. This | ||||||
539 | # will leave just those that are unique in a case | ||||||
540 | # sensitive fashion | ||||||
541 | |||||||
542 | 40368 | 102206 | delete $self->{$kNameToIdMapSensitive}{$name}; | ||||
543 | |||||||
544 | } | ||||||
545 | |||||||
546 | } | ||||||
547 | |||||||
548 | 3 | 7426 | return ($self); | ||||
549 | |||||||
550 | } | ||||||
551 | |||||||
552 | ############################################################################ | ||||||
553 | sub __init{ | ||||||
554 | ############################################################################ | ||||||
555 | # This private method initializes the object by reading in the data | ||||||
556 | # from the annotation file. | ||||||
557 | # | ||||||
558 | # Usage : | ||||||
559 | # | ||||||
560 | # $self->__init($file); | ||||||
561 | # | ||||||
562 | |||||||
563 | 3 | 3 | 6 | my ($self, $file) = @_; | |||
564 | |||||||
565 | 3 | 17 | $self->__setFile($file); | ||||
566 | |||||||
567 | 3 | 50 | 29 | my $annotationsFh = IO::File->new($file, q{<} )|| die "$PACKAGE cannot open $file : $!"; | |||
568 | |||||||
569 | # now read through annotations file | ||||||
570 | |||||||
571 | 3 | 446 | my (@line, $databaseId, $goid, $aspect, $standardName, $aliases); | ||||
572 | |||||||
573 | 3 | 91 | while (<$annotationsFh>){ | ||||
574 | |||||||
575 | 70620 | 100 | 138348 | next if $_ =~ m/^!/; # skip commented lines | |||
576 | |||||||
577 | 70543 | 87500 | chomp; | ||||
578 | |||||||
579 | 70543 | 50 | 133381 | next unless $_; # skip an empty line, if there is one | |||
580 | |||||||
581 | 70543 | 653737 | @line = split("\t", $_, -1); | ||||
582 | |||||||
583 | 70543 | 100 | 254001 | next if $line[$kNotColumn] eq 'NOT'; # skip annotations NOT to a GOID | |||
584 | |||||||
585 | 70387 | 125340 | ($databaseId, $goid, $aspect) = @line[$kDatabaseIdColumn, $kGoidColumn, $kAspectColumn]; | ||||
586 | 70387 | 94770 | ($standardName, $aliases) = @line[$kStandardNameColumn, $kAliasesColumn]; | ||||
587 | |||||||
588 | 70387 | 50 | 122047 | if ($databaseId eq ""){ | |||
589 | |||||||
590 | 0 | 0 | print "On line $. there is a missing databaseId, so it will be ignored.\n"; | ||||
591 | 0 | 0 | next; | ||||
592 | |||||||
593 | } | ||||||
594 | |||||||
595 | # record the source of the annotation | ||||||
596 | |||||||
597 | 70387 | 100 | 167118 | $self->{$kDatabaseName} = $line[$kDatabaseNameColumn] if (!exists($self->{$kDatabaseName})); | |||
598 | |||||||
599 | # now map the standard name and all aliases to the database id | ||||||
600 | |||||||
601 | 70387 | 136305 | $self->__mapNamesToDatabaseId($databaseId, $standardName, $aliases); | ||||
602 | |||||||
603 | # and store the GOID | ||||||
604 | |||||||
605 | 70387 | 134687 | $self->__storeGOID($databaseId, $goid, $aspect); | ||||
606 | |||||||
607 | } | ||||||
608 | |||||||
609 | 3 | 50 | 32 | $annotationsFh->close || die "AnnotationParser can't close $file: $!"; | |||
610 | |||||||
611 | # now count up how many annotated things we have | ||||||
612 | |||||||
613 | 3 | 139 | foreach my $databaseId (keys %{$self->{$kGoids}}){ | ||||
3 | 6383 | ||||||
614 | |||||||
615 | 12949 | 20401 | $self->{$kTotalNumAnnotatedGenes}++; | ||||
616 | |||||||
617 | 12949 | 12281 | foreach my $aspect (keys %{$self->{$kGoids}{$databaseId}}){ | ||||
12949 | 56705 | ||||||
618 | |||||||
619 | 38475 | 79955 | $self->{$kNumAnnotatedGenes}{$aspect}++; | ||||
620 | |||||||
621 | } | ||||||
622 | |||||||
623 | } | ||||||
624 | |||||||
625 | } | ||||||
626 | |||||||
627 | ############################################################################ | ||||||
628 | sub __setFile{ | ||||||
629 | ############################################################################ | ||||||
630 | # This method sets the name of the file used for construction. | ||||||
631 | # | ||||||
632 | # Usage: | ||||||
633 | # | ||||||
634 | # $self->__setFile($file); | ||||||
635 | # | ||||||
636 | |||||||
637 | 3 | 3 | 7 | my ($self, $file) = @_; | |||
638 | |||||||
639 | 3 | 27 | $self->{$kFileName} = $file; | ||||
640 | |||||||
641 | } | ||||||
642 | |||||||
643 | ############################################################################ | ||||||
644 | sub __mapNamesToDatabaseId{ | ||||||
645 | ############################################################################ | ||||||
646 | # This private method maps all names and aliases to the databaseId of | ||||||
647 | # an entity. It also maps the databaseId to itself, to facilitate a | ||||||
648 | # single way of mapping any identifier to the database id. | ||||||
649 | # | ||||||
650 | # This mapping is done so that it can be queried in a case insensitive | ||||||
651 | # fashion, and thus allow clients to be able to retrieve annotations | ||||||
652 | # without necessarily knowing the correct casing of any particular | ||||||
653 | # identifier. | ||||||
654 | # | ||||||
655 | # We have to keep the following considerations in mind: | ||||||
656 | # | ||||||
657 | # 1. Any identifier may be non-unique with respect to casing, that is, | ||||||
658 | # it is possible that there is ABC1 and abc1 | ||||||
659 | # | ||||||
660 | # 2. We want to be able to return names and identifiers in their correct | ||||||
661 | # casing, irrespective of the casing that is provided in the query | ||||||
662 | # | ||||||
663 | # 3. In the situation when a name that is ambiguous when considered case | ||||||
664 | # insensitively is provided, we should check to see whether that casing | ||||||
665 | # corresponds to a known correct casing, and assume that that is the one | ||||||
666 | # that they meant. | ||||||
667 | # | ||||||
668 | # Usage : | ||||||
669 | # | ||||||
670 | # $self->__mapNamesToDatabaseId($databaseId, $standardName, $aliases); | ||||||
671 | # | ||||||
672 | # where $aliases is a pipe-delimited list of aliases | ||||||
673 | |||||||
674 | 70387 | 70387 | 104485 | my ($self, $databaseId, $standardName, $aliases) = @_; | |||
675 | |||||||
676 | 70387 | 100 | 189957 | if (exists $self->{$kIdToStandardName}{$databaseId}){ # we've already seen this databaseId | |||
677 | |||||||
678 | 57438 | 50 | 136470 | if ($self->{$kIdToStandardName}{$databaseId} ne $standardName){ | |||
679 | |||||||
680 | # there is a problem in the file - there should only be | ||||||
681 | # one standard name for a given database id, so we'll die | ||||||
682 | # here | ||||||
683 | |||||||
684 | 0 | 0 | die "databaseId $databaseId maps to more than one standard name : $self->{$kIdToStandardName}{$databaseId} ; $standardName\n"; | ||||
685 | |||||||
686 | }else{ | ||||||
687 | |||||||
688 | # we can simply return, as we've already processed | ||||||
689 | # information for this databaseId | ||||||
690 | |||||||
691 | 57438 | 84234 | return; | ||||
692 | |||||||
693 | } | ||||||
694 | |||||||
695 | } | ||||||
696 | |||||||
697 | # we haven't seen this databaseId before, so process the data | ||||||
698 | |||||||
699 | 12949 | 28330 | my @aliases = split(/\|/, $aliases); | ||||
700 | |||||||
701 | 12949 | 15109 | my %seen; # sometimes an alias will be the same as the standard name | ||||
702 | |||||||
703 | 12949 | 18472 | foreach my $name ($databaseId, $standardName, @aliases){ | ||||
704 | |||||||
705 | # here, we simply store, in case sensitive fashion, a mapping | ||||||
706 | # of the name to databaseId. Later, this map will be | ||||||
707 | # modified, so it only contains those names where the case | ||||||
708 | # sensitive version is unique. We need this map to fulfill | ||||||
709 | # the API requirements that if databaseIdByName() is called | ||||||
710 | # with a name that is ambiguous, but the casing is unique, | ||||||
711 | # then it will correctly determine the casing match | ||||||
712 | |||||||
713 | 43917 | 150040 | $self->{$kNameToIdMapSensitive}{$name} = $databaseId; | ||||
714 | |||||||
715 | 43917 | 54621 | my $ucName = uc($name); # cache uppercased version for efficiency | ||||
716 | |||||||
717 | # occasionally, a standard name is also listed in the aliases, | ||||||
718 | # so we will skip the name if we've already seen it. | ||||||
719 | |||||||
720 | # note that for now, we are doing this case sensitively - it | ||||||
721 | # is possible that a gene is referred to by the same name | ||||||
722 | # twice but with different casing - however, if those are the | ||||||
723 | # only times that those particular versions are seen, then | ||||||
724 | # they will still be treated unambiguously. | ||||||
725 | |||||||
726 | 43917 | 100 | 83029 | next if exists ($seen{$name}); | |||
727 | |||||||
728 | # let's keep a count of every time a name with the same casing | ||||||
729 | # is seen, across all genes | ||||||
730 | |||||||
731 | 40689 | 99678 | $self->{$kNameToCount}{$name}++; | ||||
732 | |||||||
733 | # now we have to deal with the name, depending on whether we | ||||||
734 | # newly determine it is ambiguous, whether we already know | ||||||
735 | # that name is ambiguous, or whether (so far) the name appears | ||||||
736 | # to be unique | ||||||
737 | |||||||
738 | # for something to be newly ambiguous, the case insensitive | ||||||
739 | # version of its name must have been seen associated with some | ||||||
740 | # other database id already. | ||||||
741 | |||||||
742 | # if the case insensitive version of the name has already been | ||||||
743 | # seen with the same database id, it is still not ambiguous | ||||||
744 | |||||||
745 | 40689 | 100 | 100 | 185066 | if (exists $self->{$kNameToIdMapInsensitive}{$ucName} && $self->{$kNameToIdMapInsensitive}{$ucName} ne $databaseId){ | ||
100 | |||||||
746 | |||||||
747 | # so record what it maps to | ||||||
748 | |||||||
749 | # current databaseId | ||||||
750 | |||||||
751 | 277 | 376 | push (@{$self->{$kAmbiguousNames}{$ucName}}, $databaseId); | ||||
277 | 1214 | ||||||
752 | |||||||
753 | # and previously seen databaseId | ||||||
754 | |||||||
755 | 277 | 425 | push (@{$self->{$kAmbiguousNames}{$ucName}}, $self->{$kNameToIdMapInsensitive}{$ucName}); | ||||
277 | 912 | ||||||
756 | |||||||
757 | # and now delete the previously seen databaseId from the unambiguous mapping | ||||||
758 | |||||||
759 | 277 | 837 | delete $self->{$kNameToIdMapInsensitive}{$ucName}; | ||||
760 | |||||||
761 | }elsif (exists $self->{$kAmbiguousNames}{$ucName}){ # we already know it's ambiguous | ||||||
762 | |||||||
763 | # so add in this new databaseId | ||||||
764 | |||||||
765 | 36 | 47 | push (@{$self->{$kAmbiguousNames}{$ucName}}, $databaseId); | ||||
36 | 141 | ||||||
766 | |||||||
767 | }else{ # otherwise simply map it unambiguously for now, as we haven't seen the name before | ||||||
768 | |||||||
769 | 40376 | 97840 | $self->{$kNameToIdMapInsensitive}{$ucName} = $databaseId; | ||||
770 | |||||||
771 | } | ||||||
772 | |||||||
773 | 40689 | 77922 | $seen{$name} = undef; # remember that we've seen the name for this row | ||||
774 | |||||||
775 | } | ||||||
776 | |||||||
777 | # now we need to record some useful mappings | ||||||
778 | |||||||
779 | # map databaseId and standardName to each other - these should | ||||||
780 | # always be unique when treated case sensitively | ||||||
781 | |||||||
782 | 12949 | 37602 | $self->{$kIdToStandardName}{$databaseId} = $standardName; # record the standard name for the database id | ||||
783 | 12949 | 33134 | $self->{$kStandardNameToId}{$standardName} = $databaseId; # also make the reverse look up | ||||
784 | |||||||
785 | # Now map upper cased versions of the databaseId and name to their original form | ||||||
786 | # These are not guaranteed to be unique, so we use arrays instead | ||||||
787 | |||||||
788 | 12949 | 12683 | push (@{$self->{$kUcIdToId}{uc($databaseId)}}, $databaseId); | ||||
12949 | 43808 | ||||||
789 | 12949 | 14721 | push (@{$self->{$kUcStdNameToStdName}{uc($standardName)}}, $standardName); | ||||
12949 | 63755 | ||||||
790 | |||||||
791 | } | ||||||
792 | |||||||
793 | ############################################################################ | ||||||
794 | sub __storeGOID{ | ||||||
795 | ############################################################################ | ||||||
796 | # This private method stores a GOID for a given databaseId, on a per | ||||||
797 | # aspect basis, in a hash. | ||||||
798 | # | ||||||
799 | # Usage: | ||||||
800 | # | ||||||
801 | # $self->__storeGOID($databaseId, $goid, $aspect); | ||||||
802 | # | ||||||
803 | |||||||
804 | 70387 | 70387 | 98564 | my ($self, $databaseId, $goid, $aspect) = @_; | |||
805 | |||||||
806 | 70387 | 393007 | $self->{$kGoids}{$databaseId}{$aspect}{$goid} = undef; | ||||
807 | |||||||
808 | } | ||||||
809 | |||||||
810 | =pod | ||||||
811 | |||||||
812 | =head1 Public instance methods | ||||||
813 | |||||||
814 | =head1 Some methods dealing with ambiguous names | ||||||
815 | |||||||
816 | Because there are many names by which an annotated entity may be | ||||||
817 | referred to, that are non-unique, there exist a set of methods for | ||||||
818 | determining whether a name is ambiguous, and to what database | ||||||
819 | identifiers such ambiguous names may refer. | ||||||
820 | |||||||
821 | Note, that the AnnotationParser is now case insensitive, but with some | ||||||
822 | caveats. For instance, you can use 'cdc6' to retrieve data for CDC6. | ||||||
823 | However, if one gene has been referred to as abc1, and another | ||||||
824 | referred to as ABC1, then these are treated as different, and | ||||||
825 | unambiguous. However, the text 'Abc1' would be considered ambiguous, | ||||||
826 | because it could refer to either. On the other hand, if a single gene | ||||||
827 | is referred to as XYZ1 and xyz1, and no other genes have that name (in | ||||||
828 | any casing), then Xyz1 would still be considered unambiguous. | ||||||
829 | |||||||
830 | =cut | ||||||
831 | |||||||
832 | ############################################################################## | ||||||
833 | sub nameIsAmbiguous{ | ||||||
834 | ############################################################################## | ||||||
835 | |||||||
836 | =pod | ||||||
837 | |||||||
838 | =head2 nameIsAmbiguous | ||||||
839 | |||||||
840 | This public method returns a boolean to indicate whether a name is | ||||||
841 | ambiguous, i.e. whether the name might map to more than one entity (and | ||||||
842 | therefore more than one databaseId). | ||||||
843 | |||||||
844 | NB: API change: | ||||||
845 | |||||||
846 | nameIsAmbiguous is now case insensitive - that is, if there is a name | ||||||
847 | that is used twice using different casing, that will be treated as | ||||||
848 | ambiguous. Previous versions would have not treated these as | ||||||
849 | ambiguous. In the case that a name is provided in a certain casing, | ||||||
850 | which was encountered only once, then it will be treated as | ||||||
851 | unambiguous. This is the price of wanting a case insensitive | ||||||
852 | annotation parser... | ||||||
853 | |||||||
854 | Usage: | ||||||
855 | |||||||
856 | if ($annotationParser->nameIsAmbiguous($name)){ | ||||||
857 | |||||||
858 | do something useful....or not.... | ||||||
859 | |||||||
860 | } | ||||||
861 | |||||||
862 | =cut | ||||||
863 | |||||||
864 | 106406 | 106406 | 1 | 148303 | my ($self, $name) = @_; | ||
865 | |||||||
866 | 106406 | 50 | 191864 | die "You must supply a name to nameIsAmbiguous" if !defined ($name); | |||
867 | |||||||
868 | # a name might appear in the hash of ambiguous names - however, | ||||||
869 | # it is possible that the provided name matches the case of one of | ||||||
870 | # the provided versions exactly, and thus may not be ambiguous | ||||||
871 | |||||||
872 | # of course, it is also possible that there were actually more than | ||||||
873 | # one copy of that alias, with exactly the same casing, which would | ||||||
874 | # be ambiguous | ||||||
875 | |||||||
876 | # thus, we need to find out whether the provided name matches the case | ||||||
877 | # of something exactly, which refers to only one entity | ||||||
878 | |||||||
879 | # a name being ambiguous boils down to whether it has been seen | ||||||
880 | # more than once in that exact case, or in the case that it has | ||||||
881 | # not been seen at all in that exact case, whether it is ambiguous | ||||||
882 | # in upper case form. | ||||||
883 | |||||||
884 | 106406 | 121246 | my $isAmbiguous; | ||||
885 | |||||||
886 | 106406 | 100 | 416688 | if (!exists $self->{$kNameToCount}{$name}){ | |||
100 | |||||||
887 | |||||||
888 | # we haven't seen this casing at all, so see if it's ambiguous | ||||||
889 | # in the uppercased version | ||||||
890 | |||||||
891 | 438 | 1345 | $isAmbiguous = exists $self->{$kAmbiguousNames}{uc($name)}; | ||||
892 | |||||||
893 | }elsif ($self->{$kNameToCount}{$name} > 1){ | ||||||
894 | |||||||
895 | # we've seen this exact casing more than once, so it has to be | ||||||
896 | # ambiguous | ||||||
897 | |||||||
898 | 137 | 127 | $isAmbiguous = 1; | ||||
899 | |||||||
900 | }else{ | ||||||
901 | |||||||
902 | # it must only have ever been seen once in this exact casing, | ||||||
903 | # so it's unambiguous | ||||||
904 | |||||||
905 | 105831 | 137534 | $isAmbiguous = 0; | ||||
906 | |||||||
907 | } | ||||||
908 | |||||||
909 | 106406 | 324102 | return $isAmbiguous; | ||||
910 | |||||||
911 | } | ||||||
912 | |||||||
913 | ############################################################################ | ||||||
914 | sub databaseIdsForAmbiguousName{ | ||||||
915 | ############################################################################ | ||||||
916 | =pod | ||||||
917 | |||||||
918 | =head2 databaseIdsForAmbiguousName | ||||||
919 | |||||||
920 | This public method returns an array of database identifiers for an | ||||||
921 | ambiguous name. If the name is not ambiguous, an empty list will be | ||||||
922 | returned. | ||||||
923 | |||||||
924 | NB: API change: | ||||||
925 | |||||||
926 | databaseIdsForAmbiguousName is now case insensitive - that is, if | ||||||
927 | there is a name that is used twice using different casing, that will | ||||||
928 | be treated as ambiguous. Previous versions would have not treated | ||||||
929 | these as ambiguous. However, if the name provided is of the exact | ||||||
930 | casing as a name that appeared only once with that exact casing, then | ||||||
931 | it is treated as unambiguous. This is the price of wanting a case | ||||||
932 | insensitive annotation parser... | ||||||
933 | |||||||
934 | Usage: | ||||||
935 | |||||||
936 | my @databaseIds = $annotationParser->databaseIdsForAmbiguousName($name); | ||||||
937 | |||||||
938 | =cut | ||||||
939 | |||||||
940 | 2 | 2 | 1 | 4 | my ($self, $name) = @_; | ||
941 | |||||||
942 | 2 | 50 | 8 | die "You must supply a name to databaseIdsForAmbiguousName" if !defined ($name); | |||
943 | |||||||
944 | 2 | 50 | 6 | if ($self->nameIsAmbiguous($name)){ | |||
945 | |||||||
946 | 2 | 3 | return @{$self->{$kAmbiguousNames}{uc($name)}}; | ||||
2 | 13 | ||||||
947 | |||||||
948 | }else{ | ||||||
949 | |||||||
950 | 0 | 0 | return (); | ||||
951 | |||||||
952 | } | ||||||
953 | |||||||
954 | } | ||||||
955 | |||||||
956 | ############################################################################ | ||||||
957 | sub ambiguousNames{ | ||||||
958 | ############################################################################ | ||||||
959 | =pod | ||||||
960 | |||||||
961 | =head2 ambiguousNames | ||||||
962 | |||||||
963 | This method returns an array of names, which from the annotation file | ||||||
964 | have been deemed to be ambiguous. | ||||||
965 | |||||||
966 | Note - even though we have made the annotation parser case | ||||||
967 | insensitive, if something appeared in the annotations file as BLAH1 | ||||||
968 | and blah1, we would not deem either of these to be ambiguous. | ||||||
969 | However, if it appeared as blah1 twice, referring to two different | ||||||
970 | genes, then blah1 would be ambiguous. | ||||||
971 | |||||||
972 | Usage: | ||||||
973 | |||||||
974 | my @ambiguousNames = $annotationParser->ambiguousNames(); | ||||||
975 | |||||||
976 | =cut | ||||||
977 | |||||||
978 | 1 | 1 | 1 | 443 | my $self = shift; | ||
979 | |||||||
980 | # we can simply generate a list of case-sensitive names that have | ||||||
981 | # appeared more than once - we'll cache them so they don't have to | ||||||
982 | # be recalculated in the event that they're asked for again | ||||||
983 | |||||||
984 | 1 | 50 | 8 | if (!exists ($self->{$kAmbiguousNamesSensitive})){ | |||
985 | |||||||
986 | 1 | 3 | my @names; | ||||
987 | |||||||
988 | 1 | 2 | foreach my $name (keys %{$self->{$kNameToCount}}){ | ||||
1 | 8385 | ||||||
989 | |||||||
990 | 20180 | 100 | 49694 | push(@names, $name) if ($self->{$kNameToCount}{$name} > 1); | |||
991 | |||||||
992 | } | ||||||
993 | |||||||
994 | 1 | 3091 | $self->{$kAmbiguousNamesSensitive} = \@names; | ||||
995 | |||||||
996 | } | ||||||
997 | |||||||
998 | 1 | 4 | return @{$self->{$kAmbiguousNamesSensitive}}; | ||||
1 | 49 | ||||||
999 | |||||||
1000 | } | ||||||
1001 | |||||||
1002 | =pod | ||||||
1003 | |||||||
1004 | =head1 Methods for retrieving GO annotations for entities | ||||||
1005 | |||||||
1006 | =cut | ||||||
1007 | |||||||
1008 | ############################################################################ | ||||||
1009 | sub goIdsByDatabaseId{ | ||||||
1010 | ############################################################################ | ||||||
1011 | =pod | ||||||
1012 | |||||||
1013 | =head2 goIdsByDatabaseId | ||||||
1014 | |||||||
1015 | This public method returns a reference to an array of GOIDs that are | ||||||
1016 | associated with the supplied databaseId for a specific aspect. If no | ||||||
1017 | annotations are associated with that databaseId in that aspect, then a | ||||||
1018 | reference to an empty array will be returned. If the databaseId is | ||||||
1019 | not recognized, then undef will be returned. In the case that a | ||||||
1020 | databaseId is ambiguous (for instance the same databaseId exists but | ||||||
1021 | with different casings) then if the supplied database id matches the | ||||||
1022 | exact case of one of those supplied, then that is the one it will be | ||||||
1023 | treated as. In the case where the databaseId matches none of the | ||||||
1024 | possibilities by case, then a fatal error will occur, because the | ||||||
1025 | provided databaseId was ambiguous. | ||||||
1026 | |||||||
1027 | Usage: | ||||||
1028 | |||||||
1029 | my $goidsRef = $annotationParser->goIdsByDatabaseId(databaseId => $databaseId, | ||||||
1030 | aspect => $aspect); | ||||||
1031 | |||||||
1032 | =cut | ||||||
1033 | |||||||
1034 | 19434 | 19434 | 1 | 60100 | my ($self, %args) = @_; | ||
1035 | |||||||
1036 | 19434 | 33 | 52739 | my $aspect = $args{'aspect'} || $self->_handleMissingArgument(argument => 'aspect'); | |||
1037 | 19434 | 33 | 43253 | my $databaseId = $args{'databaseId'} || $self->_handleMissingArgument(argument => 'databaseId'); | |||
1038 | |||||||
1039 | 19434 | 22411 | my $mappedId; # will store the id as listed in the annotations file | ||||
1040 | |||||||
1041 | 19434 | 50 | 67659 | if (exists $self->{$kUcIdToId}{uc($databaseId)}){ # we recognize it | |||
1042 | |||||||
1043 | 19434 | 100 | 35353 | if (scalar (@{$self->{$kUcIdToId}{uc($databaseId)}}) == 1){ | |||
19434 | 64529 | ||||||
1044 | |||||||
1045 | # it's unambiguous | ||||||
1046 | |||||||
1047 | 19432 | 57853 | $mappedId = $self->{$kUcIdToId}{uc($databaseId)}[0]; | ||||
1048 | |||||||
1049 | }else{ | ||||||
1050 | |||||||
1051 | # it may be ambiguous, but we'll check to see if the provided one | ||||||
1052 | # is of exactly the correct case | ||||||
1053 | |||||||
1054 | 2 | 3 | foreach my $id (@{$self->{$kUcIdToId}{uc($databaseId)}}){ | ||||
2 | 7 | ||||||
1055 | |||||||
1056 | 3 | 100 | 8 | if ($databaseId eq $id){ # we have a match | |||
1057 | |||||||
1058 | 2 | 3 | $mappedId = $id; | ||||
1059 | 2 | 3 | last; | ||||
1060 | |||||||
1061 | } | ||||||
1062 | |||||||
1063 | } | ||||||
1064 | |||||||
1065 | 2 | 50 | 6 | if (!defined $mappedId){ | |||
1066 | |||||||
1067 | # we got no perfect match, so it's ambiguous, and we die | ||||||
1068 | |||||||
1069 | 0 | 0 | die "$databaseId is ambiguous as a databaseId, and could be used to refer to one of:\n\n". | ||||
1070 | 0 | 0 | join("\n", @{$self->{$kUcIdToId}{uc($databaseId)}}); | ||||
1071 | |||||||
1072 | } | ||||||
1073 | |||||||
1074 | } | ||||||
1075 | |||||||
1076 | }else{ # we don't recognize it | ||||||
1077 | |||||||
1078 | 0 | 0 | return ; # note return here | ||||
1079 | |||||||
1080 | } | ||||||
1081 | |||||||
1082 | # if we get here, then we have a recognized, and unambiguous database id | ||||||
1083 | |||||||
1084 | 19434 | 48870 | return $self->_goIdsByMappedDatabaseId(databaseId => $mappedId, | ||||
1085 | aspect => $aspect); | ||||||
1086 | |||||||
1087 | } | ||||||
1088 | |||||||
1089 | ############################################################################ | ||||||
1090 | sub _goIdsByMappedDatabaseId{ | ||||||
1091 | ############################################################################ | ||||||
1092 | # This protected method returns a reference to an array of GOIDs that | ||||||
1093 | # are associated with the supplied databaseId for a specific aspect. | ||||||
1094 | # If no annotations are associated with that databaseId in that | ||||||
1095 | # aspect, then a reference to an empty array will be returned. If the | ||||||
1096 | # databaseId is not recognized, then undef will be returned. The | ||||||
1097 | # supplied databaseId must NOT be ambiguous, i.e. it must be a real | ||||||
1098 | # databaseId known to exist. If it is possibly ambiguous, use the | ||||||
1099 | # goIdsByDatabaseId method instead. | ||||||
1100 | # | ||||||
1101 | # Usage: | ||||||
1102 | # | ||||||
1103 | # my $goidsRef = $annotationParser->_goIdsByMappedDatabaseId(databaseId => $databaseId, | ||||||
1104 | # aspect => $aspect); | ||||||
1105 | |||||||
1106 | |||||||
1107 | 19434 | 19434 | 53761 | my ($self, %args) = @_; | |||
1108 | |||||||
1109 | 19434 | 33 | 45607 | my $aspect = $args{'aspect'} || $self->_handleMissingArgument(argument => 'aspect'); | |||
1110 | 19434 | 33 | 39439 | my $mappedId = $args{'databaseId'} || $self->_handleMissingArgument(argument => 'databaseId'); | |||
1111 | |||||||
1112 | 19434 | 100 | 77637 | if (exists $self->{$kGoids}{$mappedId}{$aspect}){ # it has annotations | |||
1113 | |||||||
1114 | 18903 | 24652 | return [keys %{$self->{$kGoids}{$mappedId}{$aspect}}]; | ||||
18903 | 155797 | ||||||
1115 | |||||||
1116 | }else{ # it has no annotations | ||||||
1117 | |||||||
1118 | 531 | 2749 | return []; # reference to empty array | ||||
1119 | |||||||
1120 | } | ||||||
1121 | |||||||
1122 | } | ||||||
1123 | |||||||
1124 | ############################################################################ | ||||||
1125 | sub goIdsByStandardName{ | ||||||
1126 | ############################################################################ | ||||||
1127 | =pod | ||||||
1128 | |||||||
1129 | =head2 goIdsByStandardName | ||||||
1130 | |||||||
1131 | This public method returns a reference to an array of GOIDs that are | ||||||
1132 | associated with the supplied standardName for a specific aspect. If | ||||||
1133 | no annotations are associated with the entity with that standard name | ||||||
1134 | in that aspect, then a reference to an empty list will be returned. | ||||||
1135 | If the supplied name is not used as a standard name, then undef will | ||||||
1136 | be returned. In the case that the supplied standardName is ambiguous | ||||||
1137 | (for instance the same standardName exists but with different casings) | ||||||
1138 | then if the supplied standardName matches the exact case of one of | ||||||
1139 | those supplied, then that is the one it will be treated as. In the | ||||||
1140 | case where the standardName matches none of the possibilities by case, | ||||||
1141 | then a fatal error will occur, because the provided standardName was | ||||||
1142 | ambiguous. | ||||||
1143 | |||||||
1144 | Usage: | ||||||
1145 | |||||||
1146 | my $goidsRef = $annotationParser->goIdsByStandardName(standardName =>$standardName, | ||||||
1147 | aspect => $aspect); | ||||||
1148 | |||||||
1149 | =cut | ||||||
1150 | |||||||
1151 | 0 | 0 | 1 | 0 | my ($self, %args) = @_; | ||
1152 | |||||||
1153 | 0 | 0 | 0 | my $aspect = $args{'aspect'} || $self->_handleMissingArgument(argument => 'aspect'); | |||
1154 | 0 | 0 | 0 | my $standardName = $args{'standardName'} || $self->_handleMissingArgument(argument => 'standardName'); | |||
1155 | |||||||
1156 | # now we have to determine if the standardName is ambiguous or not | ||||||
1157 | |||||||
1158 | # first, return if there is no standard name for the supplied string | ||||||
1159 | |||||||
1160 | 0 | 0 | 0 | return undef if !exists $self->{$kUcStdNameToStdName}{uc($standardName)}; | |||
1161 | |||||||
1162 | # now see if we have 1 or more mappings | ||||||
1163 | |||||||
1164 | 0 | 0 | my $mappedName; | ||||
1165 | |||||||
1166 | 0 | 0 | 0 | if (scalar @{$self->{$kUcStdNameToStdName}{uc($standardName)}} == 1){ | |||
0 | 0 | ||||||
1167 | |||||||
1168 | # we have a single mapping | ||||||
1169 | |||||||
1170 | 0 | 0 | $mappedName = $self->{$kUcStdNameToStdName}{uc($standardName)}[0]; | ||||
1171 | |||||||
1172 | }else{ | ||||||
1173 | |||||||
1174 | # there's more than one, so see if the case matched exactly | ||||||
1175 | |||||||
1176 | 0 | 0 | foreach my $name (@{$self->{$kUcStdNameToStdName}{uc($standardName)}}){ | ||||
0 | 0 | ||||||
1177 | |||||||
1178 | 0 | 0 | 0 | if ($name eq $standardName){ | |||
1179 | |||||||
1180 | 0 | 0 | $mappedName = $name; | ||||
1181 | 0 | 0 | last; | ||||
1182 | |||||||
1183 | } | ||||||
1184 | |||||||
1185 | } | ||||||
1186 | |||||||
1187 | 0 | 0 | 0 | if (!defined $mappedName){ | |||
1188 | |||||||
1189 | # we got no perfect match, so it's ambiguous, and we die | ||||||
1190 | |||||||
1191 | 0 | 0 | die "$standardName is ambiguous as a standardName, and could be used to refer to one of:\n\n". | ||||
1192 | 0 | 0 | join("\n", @{$self->{$kUcStdNameToStdName}{uc($standardName)}}); | ||||
1193 | |||||||
1194 | } | ||||||
1195 | |||||||
1196 | } | ||||||
1197 | |||||||
1198 | # now we're here, we know we have a mapped standard name, which | ||||||
1199 | # must thus map to a databaseId | ||||||
1200 | |||||||
1201 | 0 | 0 | my $databaseId = $self->_databaseIdByMappedStandardName($mappedName); | ||||
1202 | |||||||
1203 | 0 | 0 | return $self->_goIdsByMappedDatabaseId(databaseId => $databaseId, | ||||
1204 | aspect => $aspect); | ||||||
1205 | |||||||
1206 | } | ||||||
1207 | |||||||
1208 | ############################################################################ | ||||||
1209 | sub goIdsByName{ | ||||||
1210 | ############################################################################ | ||||||
1211 | =pod | ||||||
1212 | |||||||
1213 | =head2 goIdsByName | ||||||
1214 | |||||||
1215 | This public method returns a reference to an array of GO IDs that are | ||||||
1216 | associated with the supplied name for a specific aspect. If there are | ||||||
1217 | no GO associations for the entity corresponding to the supplied name | ||||||
1218 | in the provided aspect, then a reference to an empty list will be | ||||||
1219 | returned. If the supplied name does not correspond to any entity, | ||||||
1220 | then undef will be returned. Because the name can be any of the | ||||||
1221 | databaseId, the standard name, or any of the aliases, it is possible | ||||||
1222 | that the name might be ambiguous. Clients of this object should first | ||||||
1223 | test whether the name they are using is ambiguous, using the | ||||||
1224 | nameIsAmbiguous() method, and handle it accordingly. If an ambiguous | ||||||
1225 | name is supplied, then it will die. | ||||||
1226 | |||||||
1227 | NB: API change: | ||||||
1228 | |||||||
1229 | goIdsByName is now case insensitive - that is, if there is a name that | ||||||
1230 | is used twice using different casing, that will be treated as | ||||||
1231 | ambiguous. Previous versions would have not treated these as | ||||||
1232 | ambiguous. This is the price of wanting a case insensitive annotation | ||||||
1233 | parser. In the event that a name is provided that is ambiguous | ||||||
1234 | because of case, if it matches exactly the case of one of the possible | ||||||
1235 | matches, it will be treated unambiguously. | ||||||
1236 | |||||||
1237 | Usage: | ||||||
1238 | |||||||
1239 | my $goidsRef = $annotationParser->goIdsByName(name => $name, | ||||||
1240 | aspect => $aspect); | ||||||
1241 | |||||||
1242 | =cut | ||||||
1243 | |||||||
1244 | 0 | 0 | 1 | 0 | my ($self, %args) = @_; | ||
1245 | |||||||
1246 | 0 | 0 | 0 | my $aspect = $args{'aspect'} || $self->_handleMissingArgument(argument => 'aspect'); | |||
1247 | 0 | 0 | 0 | my $name = $args{'name'} || $self->_handleMissingArgument(argument => 'name'); | |||
1248 | |||||||
1249 | 0 | 0 | 0 | die "You have supplied an ambiguous name to goIdsByName" if ($self->nameIsAmbiguous($name)); | |||
1250 | |||||||
1251 | # if we get here, the name is not ambiguous, so it's safe to call | ||||||
1252 | # databaseIdByName | ||||||
1253 | |||||||
1254 | 0 | 0 | my $databaseId = $self->databaseIdByName($name); | ||||
1255 | |||||||
1256 | 0 | 0 | 0 | return undef if !defined $databaseId; # there is no such name | |||
1257 | |||||||
1258 | # we should have a databaseId in the correct casing now | ||||||
1259 | |||||||
1260 | 0 | 0 | return $self->_goIdsByMappedDatabaseId(databaseId => $databaseId, | ||||
1261 | aspect => $aspect); | ||||||
1262 | |||||||
1263 | } | ||||||
1264 | |||||||
1265 | =pod | ||||||
1266 | |||||||
1267 | =head1 Methods for mapping different types of name to each other | ||||||
1268 | |||||||
1269 | =cut | ||||||
1270 | |||||||
1271 | ############################################################################ | ||||||
1272 | sub standardNameByDatabaseId{ | ||||||
1273 | ############################################################################ | ||||||
1274 | =pod | ||||||
1275 | |||||||
1276 | =head2 standardNameByDatabaseId | ||||||
1277 | |||||||
1278 | This method returns the standard name for a database id. | ||||||
1279 | |||||||
1280 | NB: API change | ||||||
1281 | |||||||
1282 | standardNameByDatabaseId is now case insensitive - that is, if there | ||||||
1283 | is a databaseId that is used twice (or more) using different casing, | ||||||
1284 | it will be treated as ambiguous. Previous versions would have not | ||||||
1285 | treated these as ambiguous. This is the price of wanting a case | ||||||
1286 | insensitive annotation parser. In the event that a name is provided | ||||||
1287 | that is ambiguous because of case, if it matches exactly the case of | ||||||
1288 | one of the possible matches, it will be treated unambiguously. | ||||||
1289 | |||||||
1290 | Usage: | ||||||
1291 | |||||||
1292 | my $standardName = $annotationParser->standardNameByDatabaseId($databaseId); | ||||||
1293 | |||||||
1294 | =cut | ||||||
1295 | |||||||
1296 | 0 | 0 | 1 | 0 | my ($self, $databaseId) = @_; | ||
1297 | |||||||
1298 | 0 | 0 | 0 | die "You must supply a databaseId to standardNameByDatabaseId" if !defined ($databaseId); | |||
1299 | |||||||
1300 | # first return if there is no databaseId for the supplied string | ||||||
1301 | |||||||
1302 | 0 | 0 | 0 | return undef if (!exists $self->{$kUcIdToId}{uc($databaseId)}); | |||
1303 | |||||||
1304 | # now, check whether it's ambiguous as a databaseId | ||||||
1305 | |||||||
1306 | 0 | 0 | my $mappedId; | ||||
1307 | |||||||
1308 | 0 | 0 | 0 | if (scalar(@{$self->{$kUcIdToId}{uc($databaseId)}}) == 1){ | |||
0 | 0 | ||||||
1309 | |||||||
1310 | # we have a single mapping | ||||||
1311 | |||||||
1312 | 0 | 0 | $mappedId = $self->{$kUcIdToId}{uc($databaseId)}[0]; | ||||
1313 | |||||||
1314 | }else{ | ||||||
1315 | |||||||
1316 | # there's more than one, so see if the provided case matches | ||||||
1317 | # exactly one of them | ||||||
1318 | |||||||
1319 | 0 | 0 | foreach my $id (@{$self->{$kUcIdToId}{uc($databaseId)}}){ | ||||
0 | 0 | ||||||
1320 | |||||||
1321 | 0 | 0 | 0 | if ($databaseId eq $id){ | |||
1322 | |||||||
1323 | 0 | 0 | $mappedId = $id; | ||||
1324 | 0 | 0 | last; | ||||
1325 | |||||||
1326 | } | ||||||
1327 | |||||||
1328 | } | ||||||
1329 | |||||||
1330 | 0 | 0 | 0 | if (!defined $mappedId){ | |||
1331 | |||||||
1332 | # we got no perfect match, so it's ambiguous, and we die | ||||||
1333 | |||||||
1334 | 0 | 0 | die "$databaseId is ambiguous as a databaseId, and could be used to refer to one of:\n\n". | ||||
1335 | 0 | 0 | join("\n", @{$self->{$kUcIdToId}{uc($databaseId)}}); | ||||
1336 | |||||||
1337 | } | ||||||
1338 | |||||||
1339 | } | ||||||
1340 | |||||||
1341 | |||||||
1342 | 0 | 0 | return ($self->{$kIdToStandardName}{$mappedId}); | ||||
1343 | |||||||
1344 | } | ||||||
1345 | |||||||
1346 | ############################################################################ | ||||||
sub databaseIdByStandardName{
############################################################################
=pod

=head2 databaseIdByStandardName

This method returns the database id for a standard name.  If the
supplied string is not used as a standard name (in any casing), then
undef is returned.

NB: API change

databaseIdByStandardName is now case insensitive - that is, if there
is a standard name that is used twice (or more) using different
casing, it will be treated as ambiguous.  Previous versions would not
have treated these as ambiguous.  This is the price of wanting a case
insensitive annotation parser.  In the event that a name is provided
that is ambiguous because of case, if it matches exactly the case of
one of the possible matches, it will be treated unambiguously.  If it
matches none of them exactly, the method will die, listing the
possible standard names.

Usage:

    my $databaseId = $annotationParser->databaseIdByStandardName($standardName);

=cut

    my ($self, $standardName) = @_;

    die "You must supply a standardName to databaseIdByStandardName" if !defined ($standardName);

    # first return if there is no standard name for the supplied string

    return undef if (!exists $self->{$kUcStdNameToStdName}{uc($standardName)});

    # these are all the casings under which this name has been recorded

    my @candidates = @{$self->{$kUcStdNameToStdName}{uc($standardName)}};

    # now see if it's ambiguous or not

    my $mappedStandardName;

    if (scalar(@candidates) == 1){

        # it's not ambiguous

        $mappedStandardName = $candidates[0];

    }else{

        # there's more than one, so see if the supplied name matches
        # the case of one of them exactly

        foreach my $candidate (@candidates){

            if ($standardName eq $candidate){

                $mappedStandardName = $candidate;
                last;

            }

        }

        if (!defined $mappedStandardName){

            # we got no perfect match, so it's ambiguous, and we die

            die "$standardName is ambiguous as a standard name, and could be used to refer to one of:\n\n".
                join("\n", @candidates);

        }

    }

    # the lookup has to use the name in its recorded casing, not the
    # casing the client happened to supply, otherwise a query that
    # differs only in case would map to nothing

    return ($self->{$kStandardNameToId}{$mappedStandardName});

}
1417 | |||||||
1418 | ############################################################################ | ||||||
sub _databaseIdByMappedStandardName{
############################################################################
# This protected method looks up the database id for a standard name.
# The caller is expected to supply a standard name that is already
# known to be non-ambiguous and that is in the correct casing; no case
# folding or ambiguity checking is done here.
#
# Usage:
#
#    my $databaseId = $annotationParser->_databaseIdByMappedStandardName($standardName);
#

    my $self         = shift;
    my $standardName = shift;

    if (!defined $standardName){

        die "You must supply a standardName to _databaseIdByMappedStandardName";

    }

    my $databaseId = $self->{$kStandardNameToId}{$standardName};

    return $databaseId;

}
1436 | |||||||
1437 | ############################################################################ | ||||||
sub databaseIdByName{
############################################################################
=pod

=head2 databaseIdByName

This method returns the database id for any identifier for a gene
(e.g. by databaseId itself, by standard name, or by alias).  If the
supplied name is ambiguous, then the program will die, so clients
should call the nameIsAmbiguous() method before using this method.
If the name does not map to any databaseId, then undef will be
returned.

NB: API change

databaseIdByName is now case insensitive - that is, if there is a name
that is used twice using different casing, that will be treated as
ambiguous.  Previous versions would not have treated these as
ambiguous.  This is the price of wanting a case insensitive annotation
parser.  In the event that a name is provided that is ambiguous
because of case, if it matches exactly the case of one of the possible
matches, it will be treated unambiguously.

Usage:

    my $databaseId = $annotationParser->databaseIdByName($name);

=cut

    my $self = shift;
    my $name = shift;

    if (!defined $name){

        die "You must supply a name to databaseIdByName";

    }

    if ($self->nameIsAmbiguous($name)){

        die "You have supplied an ambiguous name to databaseIdByName";

    }

    # the case insensitive unique mapping is preferred

    my $databaseId = $self->{$kNameToIdMapInsensitive}{uc($name)};

    # if that gave nothing, fall back on the case sensitive mapping

    if (!$databaseId){

        $databaseId = $self->{$kNameToIdMapSensitive}{$name};

    }

    return $databaseId;

}
1481 | |||||||
1482 | ############################################################################ | ||||||
sub standardNameByName{
############################################################################
=pod

=head2 standardNameByName

This public method returns the standard name for the gene specified
by the given name.  Because a name may be ambiguous, the
nameIsAmbiguous() method should be called first.  If an ambiguous name
is supplied, then it will die with an appropriate error message.  If
the name does not map to a standard name, then undef will be returned.

NB: API change

standardNameByName is now case insensitive - that is, if there is a
name that is used twice using different casing, that will be treated
as ambiguous.  Previous versions would not have treated these as
ambiguous.  This is the price of wanting a case insensitive annotation
parser.

Usage:

    my $standardName = $annotationParser->standardNameByName($name);

=cut

    my $self = shift;
    my $name = shift;

    if (!defined $name){

        die "You must supply a name to standardNameByName";

    }

    if ($self->nameIsAmbiguous($name)){

        die "You have supplied an ambiguous name to standardNameByName";

    }

    # go via the database id, as that is what standard names are keyed on

    my $databaseId = $self->databaseIdByName($name);

    # no database id means that there can be no standard name

    return undef if !defined $databaseId;

    return $self->{$kIdToStandardName}{$databaseId};

}
1528 | |||||||
1529 | =pod | ||||||
1530 | |||||||
1531 | =head1 Other methods relating to names | ||||||
1532 | |||||||
1533 | =cut | ||||||
1534 | |||||||
1535 | ############################################################################ | ||||||
sub nameIsStandardName{
############################################################################
=pod

=head2 nameIsStandardName

This method returns a boolean to indicate whether the supplied name is
used as a standard name.

NB : API change.

This is now case insensitive.  If you provide abC1, and ABc1 is a
standard name, then it will return true.

Usage :

    if ($annotationParser->nameIsStandardName($name)){

        # do something

    }

=cut

    my $self = shift;
    my $name = shift;

    if (!defined $name){

        die "You must supply a name to nameIsStandardName";

    }

    # standard names are indexed by their upper cased form

    my $ucName = uc($name);

    return exists $self->{$kUcStdNameToStdName}{$ucName};

}
1567 | |||||||
1568 | ############################################################################ | ||||||
sub nameIsDatabaseId{
############################################################################
=pod

=head2 nameIsDatabaseId

This method returns a boolean to indicate whether the supplied name is
used as a database id.

NB : API change.

This is now case insensitive.  If you provide abC1, and ABc1 is a
database id, then it will return true.

Usage :

    if ($annotationParser->nameIsDatabaseId($name)){

        # do something

    }

=cut

    my $self       = shift;
    my $databaseId = shift;

    if (!defined $databaseId){

        die "You must supply a potential databaseId to nameIsDatabaseId";

    }

    # database ids are indexed by their upper cased form

    my $ucDatabaseId = uc($databaseId);

    return exists $self->{$kUcIdToId}{$ucDatabaseId};

}
1601 | |||||||
1602 | ############################################################################ | ||||||
sub nameIsAnnotated{
############################################################################
=pod

=head2 nameIsAnnotated

This method returns a boolean to indicate whether the supplied name has any
annotations, either when considered as a databaseId, a standardName, or
an alias.  If an aspect is also supplied, then it indicates whether that
name has any annotations in that aspect only.

NB: API change.

This is now case insensitive.  If you provide abC1, and ABc1 has
annotation, then it will return true.

Usage :

    if ($annotationParser->nameIsAnnotated(name => $name)){

        # blah

    }

or:

    if ($annotationParser->nameIsAnnotated(name   => $name,
                                           aspect => $aspect)){

        # blah

    }


=cut

    my ($self, %args) = @_;

    my $name = $args{'name'} || die "You must supply a name to nameIsAnnotated";

    my $aspect = $args{'aspect'};

    my $isAnnotated = 0;

    my $ucName = uc($name);

    if (!defined ($aspect)){ # if there's no aspect

        # any name that we know about, whether it maps uniquely or is
        # ambiguous, counts as being annotated

        $isAnnotated = (exists ($self->{$kNameToIdMapInsensitive}{$ucName}) || exists ($self->{$kAmbiguousNames}{$ucName}));

    }else{

        if ($self->nameIsDatabaseId($name) && @{$self->goIdsByDatabaseId(databaseId => $name,
                                                                         aspect     => $aspect)}){

            $isAnnotated = 1;

        }elsif ($self->nameIsStandardName($name) && @{$self->goIdsByStandardName(standardName => $name,
                                                                                 aspect       => $aspect)}){

            $isAnnotated = 1;

        }elsif (!$self->nameIsAmbiguous($name)){

            my $goidsRef = $self->goIdsByName(name   => $name,
                                              aspect => $aspect);

            if (defined $goidsRef && @{$goidsRef}){

                $isAnnotated = 1;

            }

        }else { # an ambiguous name, not already found to be annotated above

            # check each of the database ids that the ambiguous name
            # may refer to - note that the query has to be made with
            # the database id itself, not with the ambiguous name

            foreach my $databaseId ($self->databaseIdsForAmbiguousName($name)){

                my $goidsRef = $self->goIdsByDatabaseId(databaseId => $databaseId,
                                                        aspect     => $aspect);

                if (defined $goidsRef && @{$goidsRef}){

                    $isAnnotated = 1;
                    last; # as soon as we know, we can finish

                }

            }

        }

    }

    return $isAnnotated;

}
1697 | |||||||
1698 | =pod | ||||||
1699 | |||||||
1700 | =head1 Other public methods | ||||||
1701 | |||||||
1702 | =cut | ||||||
1703 | |||||||
1704 | ############################################################################ | ||||||
sub databaseName{
############################################################################
=pod

=head2 databaseName

This method returns the name of the annotating authority, as found in
the file that was supplied to the constructor.

Usage :

    my $databaseName = $annotationParser->databaseName();

=cut

    my ($self) = @_;

    my $databaseName = $self->{$kDatabaseName};

    return $databaseName;

}
1725 | |||||||
1726 | ############################################################################ | ||||||
sub numAnnotatedGenes{
############################################################################
=pod

=head2 numAnnotatedGenes

This method returns the number of entities in the annotation file that
have annotations in the supplied aspect.  If no aspect is provided,
then it will return the number of genes with an annotation in at least
one aspect of GO.

Usage:

    my $numAnnotatedGenes = $annotationParser->numAnnotatedGenes();

    my $numAnnotatedGenes = $annotationParser->numAnnotatedGenes($aspect);

=cut

    my $self   = shift;
    my $aspect = shift;

    # with no aspect, the client wants the overall total

    return $self->{$kTotalNumAnnotatedGenes} if !defined $aspect;

    # otherwise they get the count for just the requested aspect

    return $self->{$kNumAnnotatedGenes}{$aspect};

}
1759 | |||||||
1760 | ############################################################################ | ||||||
sub allDatabaseIds{
############################################################################
=pod

=head2 allDatabaseIds

This public method returns an array of all the database identifiers.

Usage:

    my @databaseIds = $annotationParser->allDatabaseIds();

=cut

    my ($self) = @_;

    # the database ids are the keys of the id to standard name mapping

    return keys %{ $self->{$kIdToStandardName} };

}
1780 | |||||||
1781 | ############################################################################ | ||||||
sub allStandardNames{
############################################################################
=pod

=head2 allStandardNames

This public method returns an array of all the standard names.

Usage:

    my @standardNames = $annotationParser->allStandardNames();

=cut

    my ($self) = @_;

    # the standard names are the keys of the standard name to id mapping

    return keys %{ $self->{$kStandardNameToId} };

}
1801 | |||||||
1802 | =pod | ||||||
1803 | |||||||
1804 | =head1 Methods to do with files | ||||||
1805 | |||||||
1806 | =cut | ||||||
1807 | |||||||
1808 | ############################################################################ | ||||||
sub file{
############################################################################
=pod

=head2 file

This method returns the name of the file that was used to instantiate
the object.

Usage:

    my $file = $annotationParser->file;

=cut

    my ($self) = @_;

    return $self->{$kFileName};

}
1827 | |||||||
1828 | ############################################################################ | ||||||
sub serializeToDisk{
############################################################################
=pod

=head2 serializeToDisk

This public method saves the current state of the Annotation Parser
Object to a file, using the Storable package.  The data are saved in
network order for portability, just in case.  The name of the object
file is returned.  By default, the name of the original file will be
used to make the name of the object file (including the full path from
where the file came), or the client can instead supply their own
filename.

Usage:

    my $fileName = $annotationParser->serializeToDisk;

    my $fileName = $annotationParser->serializeToDisk(filename => $filename);

=cut

    my $self = shift;
    my %args = @_;

    my $fileName;

    if (!exists ($args{'filename'})){

        # no name was supplied, so we derive one from the file that
        # was used to instantiate ourselves

        $fileName = $self->file;

        # add a .obj suffix to the name, unless it already has one
        # (i.e. we were ourselves instantiated from an object file)

        $fileName .= ".obj" unless $fileName =~ /\.obj$/;

    }else{

        # the client supplied their own filename

        $fileName = $args{'filename'};

    }

    nstore($self, $fileName) || die "$PACKAGE could not serialize itself to $fileName : $!";

    return $fileName;

}
1876 | |||||||
1877 | 1; # to keep perl happy | ||||||
1878 | |||||||
1879 | ############################################################################ | ||||||
1880 | # MORE P O D D O C U M E N T A T I O N # | ||||||
1881 | ############################################################################ | ||||||
1882 | |||||||
1883 | =pod | ||||||
1884 | |||||||
1885 | =head1 Modifications | ||||||
1886 | |||||||
1887 | CVS info is listed here: | ||||||
1888 | |||||||
1889 | # $Author: sherlock $ | ||||||
1890 | # $Date: 2008/05/13 23:06:16 $ | ||||||
1891 | # $Log: AnnotationParser.pm,v $ | ||||||
1892 | # Revision 1.35 2008/05/13 23:06:16 sherlock | ||||||
1893 | # updated to fix bug with querying with a name that was unambiguous when | ||||||
1894 | # taking its casing into account. | ||||||
1895 | # | ||||||
1896 | # Revision 1.34 2007/03/18 03:09:05 sherlock | ||||||
1897 | # couple of PerlCritic suggested improvements, and an extra check to | ||||||
1898 | # make sure that the cardinality between standard names and database ids | ||||||
1899 | # is 1:1 | ||||||
1900 | # | ||||||
1901 | # Revision 1.33 2006/07/28 00:02:14 sherlock | ||||||
1902 | # fixed a couple of typos | ||||||
1903 | # | ||||||
1904 | # Revision 1.32 2004/07/28 17:12:10 sherlock | ||||||
1905 | # bumped version | ||||||
1906 | # | ||||||
1907 | # Revision 1.31 2004/07/28 17:03:49 sherlock | ||||||
1908 | # fixed bugs when calling goidsByDatabaseId instead of goIdsByDatabaseId | ||||||
1909 | # on lines 1592 and 1617 - thanks to lfriedl@cs.umass.edu for spotting this. | ||||||
1910 | # | ||||||
1911 | # Revision 1.30 2003/11/26 18:44:28 sherlock | ||||||
1912 | # finished making all the changes that were required to make it case | ||||||
1913 | # insensitive, and modified POD accordingly. It appears to all work as | ||||||
1914 | # expected... | ||||||
1915 | # | ||||||
1916 | # Revision 1.29 2003/11/22 00:05:05 sherlock | ||||||
1917 | # made a very large number of changes to make much of it | ||||||
1918 | # case-insensitive, such that using CDC6 or cdc6 amounts to the same | ||||||
1919 | # query, as long as both versions of that name don't exist in the | ||||||
1920 | # annotations file. Still needs a little work to allow names that are | ||||||
1921 | # potentially ambiguous to be not ambiguous, if their casing matches | ||||||
1922 | # exactly one form of the name that has been seen. Have started to | ||||||
1923 | # update test suite to check all the case insensitive stuff, but is not | ||||||
1924 | # yet finished. | ||||||
1925 | # | ||||||
1926 | # | ||||||
1927 | |||||||
1928 | =head1 AUTHORS | ||||||
1929 | |||||||
1930 | Elizabeth Boyle, ell@mit.edu | ||||||
1931 | |||||||
1932 | Gavin Sherlock, sherlock@genome.stanford.edu | ||||||
1933 | |||||||
1934 | =cut |