| blib/lib/GO/AnnotationProvider/AnnotationParser.pm | |||
|---|---|---|---|
| Criterion | Covered | Total | % |
| statement | 152 | 321 | 47.3 |
| branch | 45 | 154 | 29.2 |
| condition | 13 | 62 | 20.9 |
| subroutine | 24 | 37 | 64.8 |
| pod | 22 | 22 | 100.0 |
| total | 256 | 596 | 42.9 |
| line | stmt | bran | cond | sub | pod | time | code |
|---|---|---|---|---|---|---|---|
| 1 | package GO::AnnotationProvider::AnnotationParser; | ||||||
| 2 | |||||||
| 3 | # File : AnnotationParser.pm | ||||||
| 4 | # Authors : Elizabeth Boyle; Gavin Sherlock | ||||||
| 5 | # Date Begun : Summer 2001 | ||||||
| 6 | # Rewritten : September 25th 2002 | ||||||
| 7 | |||||||
| 8 | # $Id: AnnotationParser.pm,v 1.35 2008/05/13 23:06:16 sherlock Exp $ | ||||||
| 9 | |||||||
| 10 | # Copyright (c) 2003 Gavin Sherlock; Stanford University | ||||||
| 11 | |||||||
| 12 | # Permission is hereby granted, free of charge, to any person | ||||||
| 13 | # obtaining a copy of this software and associated documentation files | ||||||
| 14 | # (the "Software"), to deal in the Software without restriction, | ||||||
| 15 | # including without limitation the rights to use, copy, modify, merge, | ||||||
| 16 | # publish, distribute, sublicense, and/or sell copies of the Software, | ||||||
| 17 | # and to permit persons to whom the Software is furnished to do so, | ||||||
| 18 | # subject to the following conditions: | ||||||
| 19 | |||||||
| 20 | # The above copyright notice and this permission notice shall be | ||||||
| 21 | # included in all copies or substantial portions of the Software. | ||||||
| 22 | |||||||
| 23 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | ||||||
| 24 | # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | ||||||
| 25 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | ||||||
| 26 | # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS | ||||||
| 27 | # BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN | ||||||
| 28 | # ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN | ||||||
| 29 | # CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | ||||||
| 30 | # SOFTWARE. | ||||||
| 31 | |||||||
| 32 | =pod | ||||||
| 33 | |||||||
| 34 | =head1 NAME | ||||||
| 35 | |||||||
| 36 | GO::AnnotationProvider::AnnotationParser - parses a gene annotation file | ||||||
| 37 | |||||||
| 38 | =head1 SYNOPSIS | ||||||
| 39 | |||||||
| 40 | GO::AnnotationProvider::AnnotationParser - reads a Gene Ontology gene | ||||||
| 41 | associations file, and provides methods by which to retrieve the GO | ||||||
| 42 | annotations for an annotated entity. Note, it is case | ||||||
| 43 | insensitive, with some caveats - see documentation below. | ||||||
| 44 | |||||||
| 45 | my $annotationParser = GO::AnnotationProvider::AnnotationParser->new(annotationFile => "data/gene_association.sgd"); | ||||||
| 46 | |||||||
| 47 | my $geneName = "AAT2"; | ||||||
| 48 | |||||||
| 49 | print "GO associations for gene: ", join (" ", $annotationParser->goIdsByName(name => $geneName, | ||||||
| 50 | aspect => 'P')), "\n"; | ||||||
| 51 | |||||||
| 52 | print "Database ID for gene: ", $annotationParser->databaseIdByName($geneName), "\n"; | ||||||
| 53 | |||||||
| 54 | print "Database name: ", $annotationParser->databaseName(), "\n"; | ||||||
| 55 | |||||||
| 56 | print "Standard name for gene: ", $annotationParser->standardNameByName($geneName), "\n"; | ||||||
| 57 | |||||||
| 58 | my $i; | ||||||
| 59 | |||||||
| 60 | my @geneNames = $annotationParser->allStandardNames(); | ||||||
| 61 | |||||||
| 62 | foreach $i (0..10) { | ||||||
| 63 | |||||||
| 64 | print "$geneNames[$i]\n"; | ||||||
| 65 | |||||||
| 66 | } | ||||||
| 67 | |||||||
| 68 | =head1 DESCRIPTION | ||||||
| 69 | |||||||
| 70 | GO::AnnotationProvider::AnnotationParser is a concrete subclass of | ||||||
| 71 | GO::AnnotationProvider, and creates a data structure mapping gene | ||||||
| 72 | names to GO annotations by parsing a file of annotations provided by | ||||||
| 73 | the Gene Ontology Consortium. | ||||||
| 74 | |||||||
| 75 | This package provides object methods for retrieving GO annotations | ||||||
| 76 | that have been parsed from a 'gene associations' file, provided by | ||||||
| 77 | the gene ontology consortium. The format for the file is: | ||||||
| 78 | |||||||
| 79 | Lines beginning with a '!' character are comment lines. | ||||||
| 80 | |||||||
| 81 | Column Cardinality Contents | ||||||
| 82 | ------ ----------- ------------------------------------------------------------- | ||||||
| 83 | 0 1 Database abbreviation for the source of annotation (e.g. SGD) | ||||||
| 84 | 1 1 Database identifier of the annotated entity | ||||||
| 85 | 2 1 Standard name of the annotated entity | ||||||
| 86 | 3 0,1 NOT (if a gene is specifically NOT annotated to the term) | ||||||
| 87 | 4 1 GOID of the annotation | ||||||
| 88 | 5 1,n Reference(s) for the annotation | ||||||
| 89 | 6 1 Evidence code for the annotation | ||||||
| 90 | 7 0,n With or From (a bit mysterious) | ||||||
| 91 | 8 1 Aspect of the Annotation (C, F, P) | ||||||
| 92 | 9 0,1 Name of the product being annotated | ||||||
| 93 | 10 0,n Alias(es) of the annotated product | ||||||
| 94 | 11 1 type of annotated entity (one of gene, transcript, protein) | ||||||
| 95 | 12 1,2 taxonomic id of the organism encoding and/or using the product | ||||||
| 96 | 13 1 Date of annotation YYYYMMDD | ||||||
| 97 | 14 1 Assigned_by : The database which made the annotation | ||||||
| 98 | |||||||
| 99 | Columns are separated by tabs. For those entries with a cardinality | ||||||
| 100 | greater than 1, multiple entries are pipe (|) delimited. | ||||||
| 101 | |||||||
| 102 | Further details can be found at: | ||||||
| 103 | |||||||
| 104 | http://www.geneontology.org/doc/GO.annotation.html#file | ||||||
| 105 | |||||||
| 106 | The following assumptions about the file are made (and should be true): | ||||||
| 107 | |||||||
| 108 | 1. All aliases appear for all entries of a given annotated product | ||||||
| 109 | 2. The database identifiers are unique, in that two different | ||||||
| 110 | entities cannot have the same database id. | ||||||
| 111 | |||||||
| 112 | =head1 TODO | ||||||
| 113 | |||||||
| 114 | Also see the TODO list in the parent, GO::AnnotationProvider. | ||||||
| 115 | |||||||
| 116 | 1. Add in methods that will allow retrieval of evidence codes with | ||||||
| 117 | the annotations for a particular entity. | ||||||
| 118 | |||||||
| 119 | 2. Add in methods that return all the annotated entities for a | ||||||
| 120 | particular GOID. | ||||||
| 121 | |||||||
| 122 | 3. Add in the ability to request only annotations either including | ||||||
| 123 | or excluding particular evidence codes. Such evidence codes | ||||||
| 124 | could be provided as an anonymous array as the value of a named | ||||||
| 125 | argument. | ||||||
| 126 | |||||||
| 127 | 4. Same as number 3, except allow the retrieval of annotated | ||||||
| 128 | entities for a particular GOID, based on inclusion or exclusion | ||||||
| 129 | of certain evidence codes. | ||||||
| 130 | |||||||
| 131 | These first four items will require a reworking of how data are | ||||||
| 132 | stored on the backend, and thus the parsing code itself, though it | ||||||
| 133 | should not affect any of the already existing API. | ||||||
| 134 | |||||||
| 135 | 5. Instead of 'use'ing Storable, 'require' it instead, only at the | ||||||
| 136 | point of use, which will mean that AnnotationParser can be | ||||||
| 137 | happily used in the absence of Storable, just without those | ||||||
| 138 | functions that need it. | ||||||
| 139 | |||||||
| 140 | 6. Extend the ValidateFile class method to check that an entity | ||||||
| 141 | should never be annotated to the same node twice, with the same | ||||||
| 142 | evidence, with the same reference. | ||||||
| 143 | |||||||
| 144 | 7. An additional checker, that uses an AnnotationProvider in | ||||||
| 145 | conjunction with an OntologyProvider, would be useful, that | ||||||
| 146 | checks that some of the annotations themselves are valid, ie | ||||||
| 147 | that no entities are annotated to the 'unknown' node in a | ||||||
| 148 | particular aspect, and also to another node within that same | ||||||
| 149 | aspect. Can annotations be redundant? ie, if an entity is | ||||||
| 150 | annotated to a node, and an ancestor of the node, is that | ||||||
| 151 | annotation redundant? Does it depend on the evidence codes and | ||||||
| 152 | references? Or are such annotations reinforcing? These things | ||||||
| 153 | are useful to consider when formulating the confidence which can | ||||||
| 154 | be attributed to an annotation. | ||||||
| 155 | |||||||
| 156 | =cut | ||||||
| 157 | |||||||
| 158 | 2 | 2 | 221291 | use strict; | |||
| 2 | 6 | ||||||
| 2 | 109 | ||||||
| 159 | 2 | 2 | 14 | use warnings; | |||
| 2 | 4 | ||||||
| 2 | 2399 | ||||||
| 160 | 2 | 2 | 18 | use diagnostics; | |||
| 2 | 5 | ||||||
| 2 | 18 | ||||||
| 161 | |||||||
| 162 | 2 | 2 | 6881 | use Storable qw (nstore); | |||
| 2 | 9108 | ||||||
| 2 | 158 | ||||||
| 163 | 2 | 2 | 1721 | use IO::File; | |||
| 2 | 19875 | ||||||
| 2 | 294 | ||||||
| 164 | |||||||
| 165 | 2 | 2 | 16 | use vars qw (@ISA $PACKAGE $VERSION); | |||
| 2 | 5 | ||||||
| 2 | 115 | ||||||
| 166 | |||||||
| 167 | 2 | 2 | 3515 | use GO::AnnotationProvider; | |||
| 2 | 5 | ||||||
| 2 | 10147 | ||||||
| 168 | @ISA = qw (GO::AnnotationProvider); | ||||||
| 169 | |||||||
| 170 | $PACKAGE = "GO::AnnotationProvider::AnnotationParser"; | ||||||
| 171 | $VERSION = "0.15"; | ||||||
| 172 | |||||||
| 173 | # CLASS Attributes | ||||||
| 174 | # | ||||||
| 175 | # These should be considered as constants, and are initialized here | ||||||
| 176 | |||||||
| 177 | my $DEBUG = 0; | ||||||
| 178 | |||||||
| 179 | # constants for instance attribute name | ||||||
| 180 | |||||||
| 181 | |||||||
| 182 | my $kDatabaseName = $PACKAGE.'::__databaseName'; # stores the name of the annotating database | ||||||
| 183 | my $kFileName = $PACKAGE.'::__fileName'; # stores the name of the file used to instantiate the object | ||||||
| 184 | my $kNameToIdMapInsensitive = $PACKAGE.'::__nameToIdMapInsensitive'; # stores a case insensitive map of all unambiguous names for a gene to the database id | ||||||
| 185 | my $kNameToIdMapSensitive = $PACKAGE.'::__nameToIdMapSensitive'; # stores a case sensitive map of all names where a particular casing is unambiguous for a gene to the database id | ||||||
| 186 | my $kAmbiguousNames = $PACKAGE.'::__ambiguousNames'; # stores the database id's for all ambiguous names | ||||||
| 187 | my $kIdToStandardName = $PACKAGE.'::__idToStandardName'; # stores a map of database id's to standard names of all entities | ||||||
| 188 | my $kStandardNameToId = $PACKAGE.'::__StandardNameToId'; # stores a map of standard names to their database id's | ||||||
| 189 | my $kUcIdToId = $PACKAGE.'::__ucIdToId'; # stores a map of uppercased databaseIds to the databaseId | ||||||
| 190 | my $kUcStdNameToStdName = $PACKAGE.'::__ucStdNameToStdName'; # stores a map of uppercased standard names to the standard name | ||||||
| 191 | my $kNameToCount = $PACKAGE.'::__nameToCount'; # stores a case sensitive map of the number of times a name has been seen | ||||||
| 192 | my $kGoids = $PACKAGE.'::__goids'; # stores all the goid annotations | ||||||
| 193 | my $kNumAnnotatedGenes = $PACKAGE.'::__numAnnotatedGenes'; # stores number of genes with annotations, per aspect | ||||||
| 194 | |||||||
| 195 | my $kAmbiguousNamesSensitive = $PACKAGE.'::__ambiguousNamesSensitive'; # names (case sensitive) that are ambiguous | ||||||
| 196 | |||||||
| 197 | my $kTotalNumAnnotatedGenes = $PACKAGE.'::__totalNumAnnotatedGenes'; # total number of annotated genes | ||||||
| 198 | |||||||
| 199 | # constants to describe what is in which column in the annotation file | ||||||
| 200 | |||||||
| 201 | my $kDatabaseNameColumn = 0; | ||||||
| 202 | my $kDatabaseIdColumn = 1; | ||||||
| 203 | my $kStandardNameColumn = 2; | ||||||
| 204 | my $kNotColumn = 3; | ||||||
| 205 | my $kGoidColumn = 4; | ||||||
| 206 | my $kReferenceColumn = 5; | ||||||
| 207 | my $kEvidenceColumn = 6; | ||||||
| 208 | my $kWithColumn = 7; | ||||||
| 209 | my $kAspectColumn = 8; | ||||||
| 210 | my $kNameColumn = 9; | ||||||
| 211 | my $kAliasesColumn = 10; | ||||||
| 212 | my $kEntityTypeColumn = 11; | ||||||
| 213 | my $kTaxonomicIDColumn = 12; | ||||||
| 214 | my $kDateColumn = 13; | ||||||
| 215 | my $kAssignedByColumn = 14; | ||||||
| 216 | |||||||
| 217 | # the following hash of anonymous arrays indicates for each column | ||||||
| 218 | # what the maximum and minimum number of entries per column can be. | ||||||
| 219 | # If no maximum is indicated, then the maximum is equal to the | ||||||
| 220 | # minimum, and exactly that number of entries must exist. | ||||||
| 221 | |||||||
| 222 | my %kColumnsToCardinality = ($kDatabaseNameColumn => [1 ], | ||||||
| 223 | $kDatabaseIdColumn => [1 ], | ||||||
| 224 | $kStandardNameColumn => [1 ], | ||||||
| 225 | $kNotColumn => [0, 1], | ||||||
| 226 | $kGoidColumn => [1 ], | ||||||
| 227 | $kReferenceColumn => [1, "n"], | ||||||
| 228 | $kEvidenceColumn => [1 ], | ||||||
| 229 | $kWithColumn => [0, "n"], | ||||||
| 230 | $kAspectColumn => [1 ], | ||||||
| 231 | $kNameColumn => [0, 1], | ||||||
| 232 | $kAliasesColumn => [0, "n"], | ||||||
| 233 | $kEntityTypeColumn => [1 ], | ||||||
| 234 | $kTaxonomicIDColumn => [1, 2], | ||||||
| 235 | $kDateColumn => [1 ], | ||||||
| 236 | $kAssignedByColumn => [1 ]); | ||||||
| 237 | |||||||
| 238 | my $kNumColumnsInFile = scalar keys %kColumnsToCardinality; | ||||||
| 239 | |||||||
| 240 | =pod | ||||||
| 241 | |||||||
| 242 | =head1 Class Methods | ||||||
| 243 | |||||||
| 244 | =cut | ||||||
| 245 | |||||||
| 246 | ############################################################################ | ||||||
| 247 | sub Usage{ | ||||||
| 248 | ############################################################################ | ||||||
| 249 | =pod | ||||||
| 250 | |||||||
| 251 | =head2 Usage | ||||||
| 252 | |||||||
| 253 | This class method simply prints out a usage statement, along with an | ||||||
| 254 | error message, if one was passed in. | ||||||
| 255 | |||||||
| 256 | Usage : | ||||||
| 257 | |||||||
| 258 | GO::AnnotationProvider::AnnotationParser->Usage(); | ||||||
| 259 | |||||||
| 260 | =cut | ||||||
| 261 | |||||||
| 262 | 0 | 0 | 1 | 0 | my ($class, $message) = @_; | ||
| 263 | |||||||
| 264 | 0 | 0 | 0 | defined $message && print $message."\n\n"; | |||
| 265 | |||||||
| 266 | 0 | 0 | print 'The constructor expects one of two arguments, either a | ||||
| 267 | \'annotationFile\' argument, or and \'objectFile\' argument. When | ||||||
| 268 | instantiated with an annotationFile argument, it expects it to | ||||||
| 269 | correspond to an annotation file created by one of the GO consortium | ||||||
| 270 | members, according to their file format. When instantiated with an | ||||||
| 271 | objectFile argument, it expects to open a previously created | ||||||
| 272 | annotationParser object that has been serialized to disk (see the | ||||||
| 273 | serializeToDisk method). | ||||||
| 274 | |||||||
| 275 | Usage: | ||||||
| 276 | |||||||
| 277 | my $annotationParser = '.$PACKAGE.'->new(annotationFile => $file); | ||||||
| 278 | |||||||
| 279 | my $annotationParser = '.$PACKAGE.'->new(objectFile => $file); | ||||||
| 280 | '; | ||||||
| 281 | |||||||
| 282 | } | ||||||
| 283 | |||||||
| 284 | ############################################################################ | ||||||
| 285 | sub ValidateFile{ | ||||||
| 286 | ############################################################################ | ||||||
| 287 | =pod | ||||||
| 288 | |||||||
| 289 | =head2 ValidateFile | ||||||
| 290 | |||||||
| 291 | This class method reads an annotation file, and returns a reference to | ||||||
| 292 | an array of errors that are present within the file. The errors are | ||||||
| 293 | simply strings, each beginning with "Line $lineNo : " where $lineNo is | ||||||
| 294 | the number of the line in the file where the error was found. | ||||||
| 295 | |||||||
| 296 | Usage: | ||||||
| 297 | |||||||
| 298 | my $errorsRef = GO::AnnotationProvider::AnnotationParser->ValidateFile(annotationFile => $file); | ||||||
| 299 | |||||||
| 300 | =cut | ||||||
| 301 | |||||||
| 302 | 0 | 0 | 1 | 0 | my ($class, %args) = @_; | ||
| 303 | |||||||
| 304 | 0 | 0 | 0 | my $file = $args{'annotationFile'} || $class->_handleMissingArgument(argument => 'annotationFile'); | |||
| 305 | |||||||
| 306 | 0 | 0 | 0 | my $annotationsFh = IO::File->new($file, q{<} )|| die "$PACKAGE cannot open $file : $!"; | |||
| 307 | |||||||
| 308 | 0 | 0 | my (@errors, @line); | ||||
| 309 | |||||||
| 310 | 0 | 0 | my ($databaseId, $standardName, $aliases); | ||||
| 311 | 0 | 0 | my (%idToName, %idToAliases); | ||||
| 312 | |||||||
| 313 | 0 | 0 | my $lineNo = 0; | ||||
| 314 | |||||||
| 315 | 0 | 0 | while (<$annotationsFh>){ | ||||
| 316 | |||||||
| 317 | 0 | 0 | ++$lineNo; | ||||
| 318 | |||||||
| 319 | 0 | 0 | 0 | next if $_ =~ m/^!/; # skip comment lines | |||
| 320 | |||||||
| 321 | 0 | 0 | chomp; | ||||
| 322 | |||||||
| 323 | 0 | 0 | 0 | next unless $_; # skip an empty line, if there is one | |||
| 324 | |||||||
| 325 | 0 | 0 | @line = split("\t", $_, -1); | ||||
| 326 | |||||||
| 327 | 0 | 0 | 0 | if (scalar @line != $kNumColumnsInFile){ # doesn't have the correct number of columns | |||
| 328 | |||||||
| 329 | 0 | 0 | push (@errors, "Line $lineNo has ". scalar @line. "columns, instead of $kNumColumnsInFile."); | ||||
| 330 | |||||||
| 331 | } | ||||||
| 332 | |||||||
| 333 | 0 | 0 | $class->__CheckCardinalityOfColumns(\@errors, \@line, $lineNo); | ||||
| 334 | |||||||
| 335 | # now want to deal with sanity checks... | ||||||
| 336 | |||||||
| 337 | 0 | 0 | ($databaseId, $standardName, $aliases) = @line[$kDatabaseIdColumn, $kStandardNameColumn, $kAliasesColumn]; | ||||
| 338 | |||||||
| 339 | 0 | 0 | 0 | next if ($databaseId eq ""); # will have given incorrect cardinality, but nothing more we can do with it | |||
| 340 | |||||||
| 341 | 0 | 0 | 0 | if (!exists $idToName{$databaseId}){ | |||
| 0 | |||||||
| 342 | |||||||
| 343 | 0 | 0 | $idToName{$databaseId} = $standardName; | ||||
| 344 | |||||||
| 345 | }elsif ($idToName{$databaseId} ne $standardName){ | ||||||
| 346 | |||||||
| 347 | 0 | 0 | push (@errors, "Line $lineNo : $databaseId has more than one standard name : $idToName{$databaseId} and $standardName."); | ||||
| 348 | |||||||
| 349 | } | ||||||
| 350 | |||||||
| 351 | 0 | 0 | 0 | if (!exists $idToAliases{$databaseId}){ | |||
| 0 | |||||||
| 352 | |||||||
| 353 | 0 | 0 | $idToAliases{$databaseId} = $aliases; | ||||
| 354 | |||||||
| 355 | }elsif($idToAliases{$databaseId} ne $aliases){ | ||||||
| 356 | |||||||
| 357 | 0 | 0 | push (@errors, "Line $lineNo : $databaseId has more than one collections of aliases : $idToAliases{$databaseId} and $aliases."); | ||||
| 358 | |||||||
| 359 | } | ||||||
| 360 | |||||||
| 361 | } | ||||||
| 362 | |||||||
| 363 | 0 | 0 | 0 | $annotationsFh->close || die "$PACKAGE cannot close $file : $!"; | |||
| 364 | |||||||
| 365 | 0 | 0 | return \@errors; | ||||
| 366 | |||||||
| 367 | } | ||||||
| 368 | |||||||
| 369 | ############################################################################ | ||||||
| 370 | sub __CheckCardinalityOfColumns{ | ||||||
| 371 | ############################################################################ | ||||||
| 372 | # This method checks the cardinality of each column on a line | ||||||
| 373 | # | ||||||
| 374 | # Usage: | ||||||
| 375 | # | ||||||
| 376 | # $class->__CheckCardinalityOfColumns(\@errors, \@line, $lineNo); | ||||||
| 377 | |||||||
| 378 | 0 | 0 | 0 | my ($class, $errorsRef, $lineRef, $lineNo) = @_; | |||
| 379 | |||||||
| 380 | 0 | 0 | my ($cardinality, $min, $max); | ||||
| 381 | |||||||
| 382 | 0 | 0 | foreach my $column (sort {$a<=>$b} keys %kColumnsToCardinality){ | ||||
| 0 | 0 | ||||||
| 383 | |||||||
| 384 | 0 | 0 | ($min, $max) = @{$kColumnsToCardinality{$column}}[0,1]; | ||||
| 0 | 0 | ||||||
| 385 | |||||||
| 386 | 0 | 0 | $cardinality = $class->__GetCardinality($lineRef->[$column], $errorsRef, $lineNo); | ||||
| 387 | |||||||
| 388 | 0 | 0 | 0 | if (!defined $max){ # must have a defined number of entries | |||
| 389 | |||||||
| 390 | 0 | 0 | 0 | if ($cardinality != $min){ | |||
| 391 | |||||||
| 392 | 0 | 0 | push (@{$errorsRef}, "Line $lineNo : column $column has a cardinality of $cardinality, instead of $min."); | ||||
| 0 | 0 | ||||||
| 393 | |||||||
| 394 | } | ||||||
| 395 | |||||||
| 396 | }else{ # there's a range of allowed number of entries | ||||||
| 397 | |||||||
| 398 | 0 | 0 | 0 | 0 | if ($cardinality < $min){ # check if less than minimum | ||
| 0 | |||||||
| 399 | |||||||
| 400 | 0 | 0 | push (@{$errorsRef}, "Line $lineNo : column $column has a cardinality of $cardinality, which is less than the required $min."); | ||||
| 0 | 0 | ||||||
| 401 | |||||||
| 402 | }elsif ($kColumnsToCardinality{$column}->[1] ne 'n' && | ||||||
| 403 | $cardinality > $max){ # check if more than maximum | ||||||
| 404 | |||||||
| 405 | 0 | 0 | push (@{$errorsRef}, "Line $lineNo : column $column has a cardinality of $cardinality, which is more than the allowed $max."); | ||||
| 0 | 0 | ||||||
| 406 | |||||||
| 407 | } | ||||||
| 408 | |||||||
| 409 | } | ||||||
| 410 | |||||||
| 411 | } | ||||||
| 412 | |||||||
| 413 | } | ||||||
| 414 | |||||||
| 415 | ############################################################################ | ||||||
| 416 | sub __GetCardinality{ | ||||||
| 417 | ############################################################################ | ||||||
| 418 | # This private method returns an integer that indicates the | ||||||
| 419 | # cardinality of a text string, where multiple entries are assumed to | ||||||
| 420 | # be separated by the pipe character (|). In addition, it checks | ||||||
| 421 | # whether there are null or whitespace only entries. | ||||||
| 422 | # | ||||||
| 423 | # Usage: | ||||||
| 424 | # | ||||||
| 425 | # my $cardinality = $class->__GetCardinality($string); | ||||||
| 426 | |||||||
| 427 | 0 | 0 | 0 | my ($class, $string, $errorsRef, $lineNo) = @_; | |||
| 428 | |||||||
| 429 | 0 | 0 | my $cardinality; | ||||
| 430 | |||||||
| 431 | 0 | 0 | 0 | 0 | if (!defined $string || $string eq ""){ | ||
| 432 | |||||||
| 433 | 0 | 0 | $cardinality = 0; | ||||
| 434 | |||||||
| 435 | }else{ | ||||||
| 436 | |||||||
| 437 | 0 | 0 | my @entries = split(/\|/, $string, -1); | ||||
| 438 | |||||||
| 439 | 0 | 0 | foreach my $entry (@entries){ | ||||
| 440 | |||||||
| 441 | 0 | 0 | 0 | if (!defined $entry){ | |||
| 0 | |||||||
| 442 | |||||||
| 443 | 0 | 0 | push (@{$errorsRef}, "Line $lineNo : There is an undefined value in the string $string."); | ||||
| 0 | 0 | ||||||
| 444 | |||||||
| 445 | }elsif ($entry =~ /^\s+$/){ | ||||||
| 446 | |||||||
| 447 | 0 | 0 | push (@{$errorsRef}, "Line $lineNo : There is a white-space only value in the string $string."); | ||||
| 0 | 0 | ||||||
| 448 | |||||||
| 449 | } | ||||||
| 450 | |||||||
| 451 | } | ||||||
| 452 | |||||||
| 453 | 0 | 0 | $cardinality = scalar @entries; | ||||
| 454 | |||||||
| 455 | } | ||||||
| 456 | |||||||
| 457 | 0 | 0 | return $cardinality; | ||||
| 458 | |||||||
| 459 | } | ||||||
| 460 | |||||||
| 461 | ############################################################################ | ||||||
| 462 | # | ||||||
| 463 | # Constructor, and initialization methods. | ||||||
| 464 | # | ||||||
| 465 | # All initialization methods are private, except, of course, for the | ||||||
| 466 | # new() method. | ||||||
| 467 | # | ||||||
| 468 | ############################################################################ | ||||||
| 469 | |||||||
| 470 | ############################################################################ | ||||||
| 471 | sub new{ | ||||||
| 472 | ############################################################################ | ||||||
| 473 | =pod | ||||||
| 474 | |||||||
| 475 | =head1 Constructor | ||||||
| 476 | |||||||
| 477 | =head2 new | ||||||
| 478 | |||||||
| 479 | This is the constructor for an AnnotationParser object. | ||||||
| 480 | |||||||
| 481 | The constructor expects one of two arguments, either an | ||||||
| 482 | 'annotationFile' argument, or an 'objectFile' argument. When | ||||||
| 483 | instantiated with an annotationFile argument, it expects it to | ||||||
| 484 | correspond to an annotation file created by one of the GO consortium | ||||||
| 485 | members, according to their file format. When instantiated with an | ||||||
| 486 | objectFile argument, it expects to open a previously created | ||||||
| 487 | annotationParser object that has been serialized to disk (see the | ||||||
| 488 | serializeToDisk method). | ||||||
| 489 | |||||||
| 490 | Usage: | ||||||
| 491 | |||||||
| 492 | my $annotationParser = GO::AnnotationProvider::AnnotationParser->new(annotationFile => $file); | ||||||
| 493 | |||||||
| 494 | my $annotationParser = GO::AnnotationProvider::AnnotationParser->new(objectFile => $file); | ||||||
| 495 | |||||||
| 496 | =cut | ||||||
| 497 | |||||||
| 498 | |||||||
| 499 | 3 | 3 | 1 | 73 | my ($class, %args) = @_; | ||
| 500 | |||||||
| 501 | 3 | 6 | my $self; | ||||
| 502 | |||||||
| 503 | 3 | 50 | 27 | if (exists($args{'annotationFile'})){ | |||
| 0 | |||||||
| 504 | |||||||
| 505 | 3 | 6 | $self = {}; | ||||
| 506 | |||||||
| 507 | 3 | 9 | bless $self, $class; | ||||
| 508 | |||||||
| 509 | 3 | 16 | $self->__init($args{'annotationFile'}); | ||||
| 510 | |||||||
| 511 | }elsif (exists($args{'objectFile'})){ | ||||||
| 512 | |||||||
| 513 | 0 | 0 | 0 | $self = Storable::retrieve($args{'objectFile'}) || die "Could not instantiate $PACKAGE object from objectFile : $!"; | |||
| 514 | |||||||
| 515 | 0 | 0 | $self->__setFile($args{'objectFile'}); | ||||
| 516 | |||||||
| 517 | }else{ | ||||||
| 518 | |||||||
| 519 | 0 | 0 | $class->Usage("An annotationFile or objectFile argument must be provided."); | ||||
| 520 | 0 | 0 | die; | ||||
| 521 | |||||||
| 522 | } | ||||||
| 523 | |||||||
| 524 | # now, we have to make some alteration to some hashes to support | ||||||
| 525 | # our API for case insensitivity. The API says that if a name is | ||||||
| 526 | # supplied that would otherwise be ambiguous, but has a unique | ||||||
| 527 | # casing, then we will accept it as that unique cased version. | ||||||
| 528 | # Thus, we need to make sure that our $kNameToIdMapSensitive hash | ||||||
| 529 | # only tracks those names that were unique in a particular case | ||||||
| 530 | |||||||
| 531 | 3 | 2223 | foreach my $name (keys %{$self->{$kNameToCount}}){ | ||||
| 3 | 22829 | ||||||
| 532 | |||||||
| 533 | # go through the hash that has a count of each name | ||||||
| 534 | |||||||
| 535 | 40383 | 100 | 100 | 233133 | if ($self->{$kNameToCount}{$name} > 1 || exists $self->{$kNameToIdMapInsensitive}{uc($name)}){ | ||
| 536 | |||||||
| 537 | # if it was seen more than once, or is known to be unique | ||||||
| 538 | # in a case insensitive fashion, then delete it. This | ||||||
| 539 | # will leave just those that are unique in a case | ||||||
| 540 | # sensitive fashion | ||||||
| 541 | |||||||
| 542 | 40368 | 102206 | delete $self->{$kNameToIdMapSensitive}{$name}; | ||||
| 543 | |||||||
| 544 | } | ||||||
| 545 | |||||||
| 546 | } | ||||||
| 547 | |||||||
| 548 | 3 | 7426 | return ($self); | ||||
| 549 | |||||||
| 550 | } | ||||||
| 551 | |||||||
| 552 | ############################################################################ | ||||||
| 553 | sub __init{ | ||||||
| 554 | ############################################################################ | ||||||
| 555 | # This private method initializes the object by reading in the data | ||||||
| 556 | # from the annotation file. | ||||||
| 557 | # | ||||||
| 558 | # Usage : | ||||||
| 559 | # | ||||||
| 560 | # $self->__init($file); | ||||||
| 561 | # | ||||||
| 562 | |||||||
| 563 | 3 | 3 | 6 | my ($self, $file) = @_; | |||
| 564 | |||||||
| 565 | 3 | 17 | $self->__setFile($file); | ||||
| 566 | |||||||
| 567 | 3 | 50 | 29 | my $annotationsFh = IO::File->new($file, q{<} )|| die "$PACKAGE cannot open $file : $!"; | |||
| 568 | |||||||
| 569 | # now read through annotations file | ||||||
| 570 | |||||||
| 571 | 3 | 446 | my (@line, $databaseId, $goid, $aspect, $standardName, $aliases); | ||||
| 572 | |||||||
| 573 | 3 | 91 | while (<$annotationsFh>){ | ||||
| 574 | |||||||
| 575 | 70620 | 100 | 138348 | next if $_ =~ m/^!/; # skip commented lines | |||
| 576 | |||||||
| 577 | 70543 | 87500 | chomp; | ||||
| 578 | |||||||
| 579 | 70543 | 50 | 133381 | next unless $_; # skip an empty line, if there is one | |||
| 580 | |||||||
| 581 | 70543 | 653737 | @line = split("\t", $_, -1); | ||||
| 582 | |||||||
| 583 | 70543 | 100 | 254001 | next if $line[$kNotColumn] eq 'NOT'; # skip annotations NOT to a GOID | |||
| 584 | |||||||
| 585 | 70387 | 125340 | ($databaseId, $goid, $aspect) = @line[$kDatabaseIdColumn, $kGoidColumn, $kAspectColumn]; | ||||
| 586 | 70387 | 94770 | ($standardName, $aliases) = @line[$kStandardNameColumn, $kAliasesColumn]; | ||||
| 587 | |||||||
| 588 | 70387 | 50 | 122047 | if ($databaseId eq ""){ | |||
| 589 | |||||||
| 590 | 0 | 0 | print "On line $. there is a missing databaseId, so it will be ignored.\n"; | ||||
| 591 | 0 | 0 | next; | ||||
| 592 | |||||||
| 593 | } | ||||||
| 594 | |||||||
| 595 | # record the source of the annotation | ||||||
| 596 | |||||||
| 597 | 70387 | 100 | 167118 | $self->{$kDatabaseName} = $line[$kDatabaseNameColumn] if (!exists($self->{$kDatabaseName})); | |||
| 598 | |||||||
| 599 | # now map the standard name and all aliases to the database id | ||||||
| 600 | |||||||
| 601 | 70387 | 136305 | $self->__mapNamesToDatabaseId($databaseId, $standardName, $aliases); | ||||
| 602 | |||||||
| 603 | # and store the GOID | ||||||
| 604 | |||||||
| 605 | 70387 | 134687 | $self->__storeGOID($databaseId, $goid, $aspect); | ||||
| 606 | |||||||
| 607 | } | ||||||
| 608 | |||||||
| 609 | 3 | 50 | 32 | $annotationsFh->close || die "AnnotationParser can't close $file: $!"; | |||
| 610 | |||||||
| 611 | # now count up how many annotated things we have | ||||||
| 612 | |||||||
| 613 | 3 | 139 | foreach my $databaseId (keys %{$self->{$kGoids}}){ | ||||
| 3 | 6383 | ||||||
| 614 | |||||||
| 615 | 12949 | 20401 | $self->{$kTotalNumAnnotatedGenes}++; | ||||
| 616 | |||||||
| 617 | 12949 | 12281 | foreach my $aspect (keys %{$self->{$kGoids}{$databaseId}}){ | ||||
| 12949 | 56705 | ||||||
| 618 | |||||||
| 619 | 38475 | 79955 | $self->{$kNumAnnotatedGenes}{$aspect}++; | ||||
| 620 | |||||||
| 621 | } | ||||||
| 622 | |||||||
| 623 | } | ||||||
| 624 | |||||||
| 625 | } | ||||||
| 626 | |||||||
| 627 | ############################################################################ | ||||||
| 628 | sub __setFile{ | ||||||
| 629 | ############################################################################ | ||||||
| 630 | # This method sets the name of the file used for construction. | ||||||
| 631 | # | ||||||
| 632 | # Usage: | ||||||
| 633 | # | ||||||
| 634 | # $self->__setFile($file); | ||||||
| 635 | # | ||||||
| 636 | |||||||
| 637 | 3 | 3 | 7 | my ($self, $file) = @_; | |||
| 638 | |||||||
| 639 | 3 | 27 | $self->{$kFileName} = $file; | ||||
| 640 | |||||||
| 641 | } | ||||||
| 642 | |||||||
| 643 | ############################################################################ | ||||||
| 644 | sub __mapNamesToDatabaseId{ | ||||||
| 645 | ############################################################################ | ||||||
| 646 | # This private method maps all names and aliases to the databaseId of | ||||||
| 647 | # an entity. It also maps the databaseId to itself, to facilitate a | ||||||
| 648 | # single way of mapping any identifier to the database id. | ||||||
| 649 | # | ||||||
| 650 | # This mapping is done so that it can be queried in a case insensitive | ||||||
| 651 | # fashion, and thus allow clients to be able to retrieve annotations | ||||||
| 652 | # without necessarily knowing the correct casing of any particular | ||||||
| 653 | # identifier. | ||||||
| 654 | # | ||||||
| 655 | # We have to keep the following considerations in mind: | ||||||
| 656 | # | ||||||
| 657 | # 1. Any identifier may be non-unique with respect to casing, that is, | ||||||
| 658 | # it is possible that there is ABC1 and abc1 | ||||||
| 659 | # | ||||||
| 660 | # 2. We want to be able to return names and identifiers in their correct | ||||||
| 661 | # casing, irrespective of the casing that is provided in the query | ||||||
| 662 | # | ||||||
| 663 | # 3. In the situation when a name that is ambiguous when considered case | ||||||
| 664 | # insensitively is provided, we should check to see whether that casing | ||||||
| 665 | # corresponds to a known correct casing, and assume that that is the one | ||||||
| 666 | # that they meant. | ||||||
| 667 | # | ||||||
| 668 | # Usage : | ||||||
| 669 | # | ||||||
| 670 | # $self->__mapNamesToDatabaseId($databaseId, $standardName, $aliases); | ||||||
| 671 | # | ||||||
| 672 | # where $aliases is a pipe-delimited list of aliases | ||||||
| 673 | |||||||
| 674 | 70387 | 70387 | 104485 | my ($self, $databaseId, $standardName, $aliases) = @_; | |||
| 675 | |||||||
| 676 | 70387 | 100 | 189957 | if (exists $self->{$kIdToStandardName}{$databaseId}){ # we've already seen this databaseId | |||
| 677 | |||||||
| 678 | 57438 | 50 | 136470 | if ($self->{$kIdToStandardName}{$databaseId} ne $standardName){ | |||
| 679 | |||||||
| 680 | # there is a problem in the file - there should only be | ||||||
| 681 | # one standard name for a given database id, so we'll die | ||||||
| 682 | # here | ||||||
| 683 | |||||||
| 684 | 0 | 0 | die "databaseId $databaseId maps to more than one standard name : $self->{$kIdToStandardName}{$databaseId} ; $standardName\n"; | ||||
| 685 | |||||||
| 686 | }else{ | ||||||
| 687 | |||||||
| 688 | # we can simply return, as we've already processed | ||||||
| 689 | # information for this databaseId | ||||||
| 690 | |||||||
| 691 | 57438 | 84234 | return; | ||||
| 692 | |||||||
| 693 | } | ||||||
| 694 | |||||||
| 695 | } | ||||||
| 696 | |||||||
| 697 | # we haven't seen this databaseId before, so process the data | ||||||
| 698 | |||||||
| 699 | 12949 | 28330 | my @aliases = split(/\|/, $aliases); | ||||
| 700 | |||||||
| 701 | 12949 | 15109 | my %seen; # sometimes an alias will be the same as the standard name | ||||
| 702 | |||||||
| 703 | 12949 | 18472 | foreach my $name ($databaseId, $standardName, @aliases){ | ||||
| 704 | |||||||
| 705 | # here, we simply store, in case sensitive fashion, a mapping | ||||||
| 706 | # of the name to databaseId. Later, this map will be | ||||||
| 707 | # modified, so it only contains those names where the case | ||||||
| 708 | # sensitive version is unique. We need this map to fulfill | ||||||
| 709 | # the API requirements that if databaseIdByName() is called | ||||||
| 710 | # with a name that is ambiguous, but the casing is unique, | ||||||
| 711 | # then it will correctly determine the casing match | ||||||
| 712 | |||||||
| 713 | 43917 | 150040 | $self->{$kNameToIdMapSensitive}{$name} = $databaseId; | ||||
| 714 | |||||||
| 715 | 43917 | 54621 | my $ucName = uc($name); # cache uppercased version for efficiency | ||||
| 716 | |||||||
| 717 | # occasionally, a standard name is also listed in the aliases, | ||||||
| 718 | # so we will skip the name if we've already seen it. | ||||||
| 719 | |||||||
| 720 | # note that for now, we are doing this case sensitively - it | ||||||
| 721 | # is possible that a gene is referred to by the same name | ||||||
| 722 | # twice but with different casing - however, if those are the | ||||||
| 723 | # only times that those particular versions are seen, then | ||||||
| 724 | # they will still be treated unambiguously. | ||||||
| 725 | |||||||
| 726 | 43917 | 100 | 83029 | next if exists ($seen{$name}); | |||
| 727 | |||||||
| 728 | # let's keep a count of every time a name with the same casing | ||||||
| 729 | # is seen, across all genes | ||||||
| 730 | |||||||
| 731 | 40689 | 99678 | $self->{$kNameToCount}{$name}++; | ||||
| 732 | |||||||
| 733 | # now we have to deal with the name, depending on whether we | ||||||
| 734 | # newly determine it is ambiguous, whether we already know | ||||||
| 735 | # that name is ambiguous, or whether (so far) the name appears | ||||||
| 736 | # to be unique | ||||||
| 737 | |||||||
| 738 | # for something to be newly ambiguous, the case insensitive | ||||||
| 739 | # version of its name must have been seen associated with some | ||||||
| 740 | # other database id already. | ||||||
| 741 | |||||||
| 742 | # if the case insensitive version of the name has already been | ||||||
| 743 | # seen with the same database id, it is still not ambiguous | ||||||
| 744 | |||||||
| 745 | 40689 | 100 | 100 | 185066 | if (exists $self->{$kNameToIdMapInsensitive}{$ucName} && $self->{$kNameToIdMapInsensitive}{$ucName} ne $databaseId){ | ||
| 100 | |||||||
| 746 | |||||||
| 747 | # so record what it maps to | ||||||
| 748 | |||||||
| 749 | # current databaseId | ||||||
| 750 | |||||||
| 751 | 277 | 376 | push (@{$self->{$kAmbiguousNames}{$ucName}}, $databaseId); | ||||
| 277 | 1214 | ||||||
| 752 | |||||||
| 753 | # and previously seen databaseId | ||||||
| 754 | |||||||
| 755 | 277 | 425 | push (@{$self->{$kAmbiguousNames}{$ucName}}, $self->{$kNameToIdMapInsensitive}{$ucName}); | ||||
| 277 | 912 | ||||||
| 756 | |||||||
| 757 | # and now delete the previously seen databaseId from the unambiguous mapping | ||||||
| 758 | |||||||
| 759 | 277 | 837 | delete $self->{$kNameToIdMapInsensitive}{$ucName}; | ||||
| 760 | |||||||
| 761 | }elsif (exists $self->{$kAmbiguousNames}{$ucName}){ # we already know it's ambiguous | ||||||
| 762 | |||||||
| 763 | # so add in this new databaseId | ||||||
| 764 | |||||||
| 765 | 36 | 47 | push (@{$self->{$kAmbiguousNames}{$ucName}}, $databaseId); | ||||
| 36 | 141 | ||||||
| 766 | |||||||
| 767 | }else{ # otherwise simply map it unambiguously for now, as we haven't seen the name before | ||||||
| 768 | |||||||
| 769 | 40376 | 97840 | $self->{$kNameToIdMapInsensitive}{$ucName} = $databaseId; | ||||
| 770 | |||||||
| 771 | } | ||||||
| 772 | |||||||
| 773 | 40689 | 77922 | $seen{$name} = undef; # remember that we've seen the name for this row | ||||
| 774 | |||||||
| 775 | } | ||||||
| 776 | |||||||
| 777 | # now we need to record some useful mappings | ||||||
| 778 | |||||||
| 779 | # map databaseId and standardName to each other - these should | ||||||
| 780 | # always be unique when treated case sensitively | ||||||
| 781 | |||||||
| 782 | 12949 | 37602 | $self->{$kIdToStandardName}{$databaseId} = $standardName; # record the standard name for the database id | ||||
| 783 | 12949 | 33134 | $self->{$kStandardNameToId}{$standardName} = $databaseId; # also make the reverse look up | ||||
| 784 | |||||||
| 785 | # Now map upper cased versions of the databaseId and name to their original form | ||||||
| 786 | # These are not guaranteed to be unique, so we use arrays instead | ||||||
| 787 | |||||||
| 788 | 12949 | 12683 | push (@{$self->{$kUcIdToId}{uc($databaseId)}}, $databaseId); | ||||
| 12949 | 43808 | ||||||
| 789 | 12949 | 14721 | push (@{$self->{$kUcStdNameToStdName}{uc($standardName)}}, $standardName); | ||||
| 12949 | 63755 | ||||||
| 790 | |||||||
| 791 | } | ||||||
| 792 | |||||||
| 793 | ############################################################################ | ||||||
| 794 | sub __storeGOID{ | ||||||
| 795 | ############################################################################ | ||||||
| 796 | # This private method stores a GOID for a given databaseId, on a per | ||||||
| 797 | # aspect basis, in a hash. | ||||||
| 798 | # | ||||||
| 799 | # Usage: | ||||||
| 800 | # | ||||||
| 801 | # $self->__storeGOID($databaseId, $goid, $aspect); | ||||||
| 802 | # | ||||||
| 803 | |||||||
| 804 | 70387 | 70387 | 98564 | my ($self, $databaseId, $goid, $aspect) = @_; | |||
| 805 | |||||||
| 806 | 70387 | 393007 | $self->{$kGoids}{$databaseId}{$aspect}{$goid} = undef; | ||||
| 807 | |||||||
| 808 | } | ||||||
| 809 | |||||||
| 810 | =pod | ||||||
| 811 | |||||||
| 812 | =head1 Public instance methods | ||||||
| 813 | |||||||
| 814 | =head1 Some methods dealing with ambiguous names | ||||||
| 815 | |||||||
| 816 | Because there are many names by which an annotated entity may be | ||||||
| 817 | referred to, that are non-unique, there exists a set of methods for | ||||||
| 818 | determining whether a name is ambiguous, and to what database | ||||||
| 819 | identifiers such ambiguous names may refer. | ||||||
| 820 | |||||||
| 821 | Note, that the AnnotationParser is now case insensitive, but with some | ||||||
| 822 | caveats. For instance, you can use 'cdc6' to retrieve data for CDC6. | ||||||
| 823 | However, if one gene has been referred to as abc1, and another | ||||||
| 824 | referred to as ABC1, then these are treated as different, and | ||||||
| 825 | unambiguous. However, the text 'Abc1' would be considered ambiguous, | ||||||
| 826 | because it could refer to either. On the other hand, if a single gene | ||||||
| 827 | is referred to as XYZ1 and xyz1, and no other genes have that name (in | ||||||
| 828 | any casing), then Xyz1 would still be considered unambiguous. | ||||||
| 829 | |||||||
| 830 | =cut | ||||||
| 831 | |||||||
| 832 | ############################################################################## | ||||||
| 833 | sub nameIsAmbiguous{ | ||||||
| 834 | ############################################################################## | ||||||
| 835 | |||||||
| 836 | =pod | ||||||
| 837 | |||||||
| 838 | =head2 nameIsAmbiguous | ||||||
| 839 | |||||||
| 840 | This public method returns a boolean to indicate whether a name is | ||||||
| 841 | ambiguous, i.e. whether the name might map to more than one entity (and | ||||||
| 842 | therefore more than one databaseId). | ||||||
| 843 | |||||||
| 844 | NB: API change: | ||||||
| 845 | |||||||
| 846 | nameIsAmbiguous is now case insensitive - that is, if there is a name | ||||||
| 847 | that is used twice using different casing, that will be treated as | ||||||
| 848 | ambiguous. Previous versions would not have treated these as | ||||||
| 849 | ambiguous. In the case that a name is provided in a certain casing, | ||||||
| 850 | which was encountered only once, then it will be treated as | ||||||
| 851 | unambiguous. This is the price of wanting a case insensitive | ||||||
| 852 | annotation parser... | ||||||
| 853 | |||||||
| 854 | Usage: | ||||||
| 855 | |||||||
| 856 | if ($annotationParser->nameIsAmbiguous($name)){ | ||||||
| 857 | |||||||
| 858 | do something useful....or not.... | ||||||
| 859 | |||||||
| 860 | } | ||||||
| 861 | |||||||
| 862 | =cut | ||||||
| 863 | |||||||
| 864 | 106406 | 106406 | 1 | 148303 | my ($self, $name) = @_; | ||
| 865 | |||||||
| 866 | 106406 | 50 | 191864 | die "You must supply a name to nameIsAmbiguous" if !defined ($name); | |||
| 867 | |||||||
| 868 | # a name might appear in the hash of ambiguous names - however, | ||||||
| 869 | # it is possible that the provided name matches the case of one of | ||||||
| 870 | # the provided versions exactly, and thus may not be ambiguous | ||||||
| 871 | |||||||
| 872 | # of course, it is also possible that there was actually more than | ||||||
| 873 | # one copy of that alias, with exactly the same casing, which would | ||||||
| 874 | # be ambiguous | ||||||
| 875 | |||||||
| 876 | # thus, we need to find out whether the provided name matches the case | ||||||
| 877 | # of something exactly, which refers to only one entity | ||||||
| 878 | |||||||
| 879 | # a name being ambiguous boils down to whether it has been seen | ||||||
| 880 | # more than once in that exact case, or in the case that it has | ||||||
| 881 | # not been seen at all in that exact case, whether it is ambiguous | ||||||
| 882 | # in upper case form. | ||||||
| 883 | |||||||
| 884 | 106406 | 121246 | my $isAmbiguous; | ||||
| 885 | |||||||
| 886 | 106406 | 100 | 416688 | if (!exists $self->{$kNameToCount}{$name}){ | |||
| 100 | |||||||
| 887 | |||||||
| 888 | # we haven't seen this casing at all, so see if it's ambiguous | ||||||
| 889 | # in the uppercased version | ||||||
| 890 | |||||||
| 891 | 438 | 1345 | $isAmbiguous = exists $self->{$kAmbiguousNames}{uc($name)}; | ||||
| 892 | |||||||
| 893 | }elsif ($self->{$kNameToCount}{$name} > 1){ | ||||||
| 894 | |||||||
| 895 | # we've seen this exact casing more than once, so it has to be | ||||||
| 896 | # ambiguous | ||||||
| 897 | |||||||
| 898 | 137 | 127 | $isAmbiguous = 1; | ||||
| 899 | |||||||
| 900 | }else{ | ||||||
| 901 | |||||||
| 902 | # it must only have ever been seen once in this exact casing, | ||||||
| 903 | # so it's unambiguous | ||||||
| 904 | |||||||
| 905 | 105831 | 137534 | $isAmbiguous = 0; | ||||
| 906 | |||||||
| 907 | } | ||||||
| 908 | |||||||
| 909 | 106406 | 324102 | return $isAmbiguous; | ||||
| 910 | |||||||
| 911 | } | ||||||
| 912 | |||||||
| 913 | ############################################################################ | ||||||
| 914 | sub databaseIdsForAmbiguousName{ | ||||||
| 915 | ############################################################################ | ||||||
| 916 | =pod | ||||||
| 917 | |||||||
| 918 | =head2 databaseIdsForAmbiguousName | ||||||
| 919 | |||||||
| 920 | This public method returns an array of database identifiers for an | ||||||
| 921 | ambiguous name. If the name is not ambiguous, an empty list will be | ||||||
| 922 | returned. | ||||||
| 923 | |||||||
| 924 | NB: API change: | ||||||
| 925 | |||||||
| 926 | databaseIdsForAmbiguousName is now case insensitive - that is, if | ||||||
| 927 | there is a name that is used twice using different casing, that will | ||||||
| 928 | be treated as ambiguous. Previous versions would not have treated | ||||||
| 929 | these as ambiguous. However, if the name provided is of the exact | ||||||
| 930 | casing as a name that appeared only once with that exact casing, then | ||||||
| 931 | it is treated as unambiguous. This is the price of wanting a case | ||||||
| 932 | insensitive annotation parser... | ||||||
| 933 | |||||||
| 934 | Usage: | ||||||
| 935 | |||||||
| 936 | my @databaseIds = $annotationParser->databaseIdsForAmbiguousName($name); | ||||||
| 937 | |||||||
| 938 | =cut | ||||||
| 939 | |||||||
| 940 | 2 | 2 | 1 | 4 | my ($self, $name) = @_; | ||
| 941 | |||||||
| 942 | 2 | 50 | 8 | die "You must supply a name to databaseIdsForAmbiguousName" if !defined ($name); | |||
| 943 | |||||||
| 944 | 2 | 50 | 6 | if ($self->nameIsAmbiguous($name)){ | |||
| 945 | |||||||
| 946 | 2 | 3 | return @{$self->{$kAmbiguousNames}{uc($name)}}; | ||||
| 2 | 13 | ||||||
| 947 | |||||||
| 948 | }else{ | ||||||
| 949 | |||||||
| 950 | 0 | 0 | return (); | ||||
| 951 | |||||||
| 952 | } | ||||||
| 953 | |||||||
| 954 | } | ||||||
| 955 | |||||||
| 956 | ############################################################################ | ||||||
| 957 | sub ambiguousNames{ | ||||||
| 958 | ############################################################################ | ||||||
| 959 | =pod | ||||||
| 960 | |||||||
| 961 | =head2 ambiguousNames | ||||||
| 962 | |||||||
| 963 | This method returns an array of names, which from the annotation file | ||||||
| 964 | have been deemed to be ambiguous. | ||||||
| 965 | |||||||
| 966 | Note - even though we have made the annotation parser case | ||||||
| 967 | insensitive, if something appeared in the annotations file as BLAH1 | ||||||
| 968 | and blah1, we would not deem either of these to be ambiguous. | ||||||
| 969 | However, if it appeared as blah1 twice, referring to two different | ||||||
| 970 | genes, then blah1 would be ambiguous. | ||||||
| 971 | |||||||
| 972 | Usage: | ||||||
| 973 | |||||||
| 974 | my @ambiguousNames = $annotationParser->ambiguousNames(); | ||||||
| 975 | |||||||
| 976 | =cut | ||||||
| 977 | |||||||
| 978 | 1 | 1 | 1 | 443 | my $self = shift; | ||
| 979 | |||||||
| 980 | # we can simply generate a list of case-sensitive names that have | ||||||
| 981 | # appeared more than once - we'll cache them so they don't have to | ||||||
| 982 | # be recalculated in the event that they're asked for again | ||||||
| 983 | |||||||
| 984 | 1 | 50 | 8 | if (!exists ($self->{$kAmbiguousNamesSensitive})){ | |||
| 985 | |||||||
| 986 | 1 | 3 | my @names; | ||||
| 987 | |||||||
| 988 | 1 | 2 | foreach my $name (keys %{$self->{$kNameToCount}}){ | ||||
| 1 | 8385 | ||||||
| 989 | |||||||
| 990 | 20180 | 100 | 49694 | push(@names, $name) if ($self->{$kNameToCount}{$name} > 1); | |||
| 991 | |||||||
| 992 | } | ||||||
| 993 | |||||||
| 994 | 1 | 3091 | $self->{$kAmbiguousNamesSensitive} = \@names; | ||||
| 995 | |||||||
| 996 | } | ||||||
| 997 | |||||||
| 998 | 1 | 4 | return @{$self->{$kAmbiguousNamesSensitive}}; | ||||
| 1 | 49 | ||||||
| 999 | |||||||
| 1000 | } | ||||||
| 1001 | |||||||
| 1002 | =pod | ||||||
| 1003 | |||||||
| 1004 | =head1 Methods for retrieving GO annotations for entities | ||||||
| 1005 | |||||||
| 1006 | =cut | ||||||
| 1007 | |||||||
| 1008 | ############################################################################ | ||||||
| 1009 | sub goIdsByDatabaseId{ | ||||||
| 1010 | ############################################################################ | ||||||
| 1011 | =pod | ||||||
| 1012 | |||||||
| 1013 | =head2 goIdsByDatabaseId | ||||||
| 1014 | |||||||
| 1015 | This public method returns a reference to an array of GOIDs that are | ||||||
| 1016 | associated with the supplied databaseId for a specific aspect. If no | ||||||
| 1017 | annotations are associated with that databaseId in that aspect, then a | ||||||
| 1018 | reference to an empty array will be returned. If the databaseId is | ||||||
| 1019 | not recognized, then undef will be returned. In the case that a | ||||||
| 1020 | databaseId is ambiguous (for instance the same databaseId exists but | ||||||
| 1021 | with different casings) then if the supplied database id matches the | ||||||
| 1022 | exact case of one of those supplied, then that is the one it will be | ||||||
| 1023 | treated as. In the case where the databaseId matches none of the | ||||||
| 1024 | possibilities by case, then a fatal error will occur, because the | ||||||
| 1025 | provided databaseId was ambiguous. | ||||||
| 1026 | |||||||
| 1027 | Usage: | ||||||
| 1028 | |||||||
| 1029 | my $goidsRef = $annotationParser->goIdsByDatabaseId(databaseId => $databaseId, | ||||||
| 1030 | aspect => <P\|F\|C>); | ||||||
| 1031 | |||||||
| 1032 | =cut | ||||||
| 1033 | |||||||
| 1034 | 19434 | 19434 | 1 | 60100 | my ($self, %args) = @_; | ||
| 1035 | |||||||
| 1036 | 19434 | 33 | 52739 | my $aspect = $args{'aspect'} || $self->_handleMissingArgument(argument => 'aspect'); | |||
| 1037 | 19434 | 33 | 43253 | my $databaseId = $args{'databaseId'} || $self->_handleMissingArgument(argument => 'databaseId'); | |||
| 1038 | |||||||
| 1039 | 19434 | 22411 | my $mappedId; # will store the id as listed in the annotations file | ||||
| 1040 | |||||||
| 1041 | 19434 | 50 | 67659 | if (exists $self->{$kUcIdToId}{uc($databaseId)}){ # we recognize it | |||
| 1042 | |||||||
| 1043 | 19434 | 100 | 35353 | if (scalar (@{$self->{$kUcIdToId}{uc($databaseId)}}) == 1){ | |||
| 19434 | 64529 | ||||||
| 1044 | |||||||
| 1045 | # it's unambiguous | ||||||
| 1046 | |||||||
| 1047 | 19432 | 57853 | $mappedId = $self->{$kUcIdToId}{uc($databaseId)}[0]; | ||||
| 1048 | |||||||
| 1049 | }else{ | ||||||
| 1050 | |||||||
| 1051 | # it may be ambiguous, but we'll check to see if the provided one | ||||||
| 1052 | # is of exactly the correct case | ||||||
| 1053 | |||||||
| 1054 | 2 | 3 | foreach my $id (@{$self->{$kUcIdToId}{uc($databaseId)}}){ | ||||
| 2 | 7 | ||||||
| 1055 | |||||||
| 1056 | 3 | 100 | 8 | if ($databaseId eq $id){ # we have a match | |||
| 1057 | |||||||
| 1058 | 2 | 3 | $mappedId = $id; | ||||
| 1059 | 2 | 3 | last; | ||||
| 1060 | |||||||
| 1061 | } | ||||||
| 1062 | |||||||
| 1063 | } | ||||||
| 1064 | |||||||
| 1065 | 2 | 50 | 6 | if (!defined $mappedId){ | |||
| 1066 | |||||||
| 1067 | # we got no perfect match, so it's ambiguous, and we die | ||||||
| 1068 | |||||||
| 1069 | 0 | 0 | die "$databaseId is ambiguous as a databaseId, and could be used to refer to one of:\n\n". | ||||
| 1070 | 0 | 0 | join("\n", @{$self->{$kUcIdToId}{uc($databaseId)}}); | ||||
| 1071 | |||||||
| 1072 | } | ||||||
| 1073 | |||||||
| 1074 | } | ||||||
| 1075 | |||||||
| 1076 | }else{ # we don't recognize it | ||||||
| 1077 | |||||||
| 1078 | 0 | 0 | return ; # note return here | ||||
| 1079 | |||||||
| 1080 | } | ||||||
| 1081 | |||||||
| 1082 | # if we get here, then we have a recognized, and unambiguous database id | ||||||
| 1083 | |||||||
| 1084 | 19434 | 48870 | return $self->_goIdsByMappedDatabaseId(databaseId => $mappedId, | ||||
| 1085 | aspect => $aspect); | ||||||
| 1086 | |||||||
| 1087 | } | ||||||
| 1088 | |||||||
| 1089 | ############################################################################ | ||||||
| 1090 | sub _goIdsByMappedDatabaseId{ | ||||||
| 1091 | ############################################################################ | ||||||
| 1092 | # This protected method returns a reference to an array of GOIDs that | ||||||
| 1093 | # are associated with the supplied databaseId for a specific aspect. | ||||||
| 1094 | # If no annotations are associated with that databaseId in that | ||||||
| 1095 | # aspect, then a reference to an empty array will be returned. If the | ||||||
| 1096 | # databaseId is not recognized, then undef will be returned. The | ||||||
| 1097 | # supplied databaseId must NOT be ambiguous, i.e. it must be a real | ||||||
| 1098 | # databaseId known to exist. If it is possibly ambiguous, use the | ||||||
| 1099 | # goIdsByDatabaseId method instead. | ||||||
| 1100 | # | ||||||
| 1101 | # Usage: | ||||||
| 1102 | # | ||||||
| 1103 | # my $goidsRef = $annotationParser->_goIdsByMappedDatabaseId(databaseId => $databaseId, | ||||||
| 1104 | # aspect => <P\|F\|C>); | ||||||
| 1105 | |||||||
| 1106 | |||||||
| 1107 | 19434 | 19434 | 53761 | my ($self, %args) = @_; | |||
| 1108 | |||||||
| 1109 | 19434 | 33 | 45607 | my $aspect = $args{'aspect'} || $self->_handleMissingArgument(argument => 'aspect'); | |||
| 1110 | 19434 | 33 | 39439 | my $mappedId = $args{'databaseId'} || $self->_handleMissingArgument(argument => 'databaseId'); | |||
| 1111 | |||||||
| 1112 | 19434 | 100 | 77637 | if (exists $self->{$kGoids}{$mappedId}{$aspect}){ # it has annotations | |||
| 1113 | |||||||
| 1114 | 18903 | 24652 | return [keys %{$self->{$kGoids}{$mappedId}{$aspect}}]; | ||||
| 18903 | 155797 | ||||||
| 1115 | |||||||
| 1116 | }else{ # it has no annotations | ||||||
| 1117 | |||||||
| 1118 | 531 | 2749 | return []; # reference to empty array | ||||
| 1119 | |||||||
| 1120 | } | ||||||
| 1121 | |||||||
| 1122 | } | ||||||
| 1123 | |||||||
| 1124 | ############################################################################ | ||||||
| 1125 | sub goIdsByStandardName{ | ||||||
| 1126 | ############################################################################ | ||||||
| 1127 | =pod | ||||||
| 1128 | |||||||
| 1129 | =head2 goIdsByStandardName | ||||||
| 1130 | |||||||
| 1131 | This public method returns a reference to an array of GOIDs that are | ||||||
| 1132 | associated with the supplied standardName for a specific aspect. If | ||||||
| 1133 | no annotations are associated with the entity with that standard name | ||||||
| 1134 | in that aspect, then a reference to an empty list will be returned. | ||||||
| 1135 | If the supplied name is not used as a standard name, then undef will | ||||||
| 1136 | be returned. In the case that the supplied standardName is ambiguous | ||||||
| 1137 | (for instance the same standardName exists but with different casings) | ||||||
| 1138 | then if the supplied standardName matches the exact case of one of | ||||||
| 1139 | those supplied, then that is the one it will be treated as. In the | ||||||
| 1140 | case where the standardName matches none of the possibilities by case, | ||||||
| 1141 | then a fatal error will occur, because the provided standardName was | ||||||
| 1142 | ambiguous. | ||||||
| 1143 | |||||||
| 1144 | Usage: | ||||||
| 1145 | |||||||
| 1146 | my $goidsRef = $annotationParser->goIdsByStandardName(standardName =>$standardName, | ||||||
| 1147 | aspect => <P\|F\|C>); | ||||||
| 1148 | |||||||
| 1149 | =cut | ||||||
| 1150 | |||||||
| 1151 | 0 | 0 | 1 | 0 | my ($self, %args) = @_; | ||
| 1152 | |||||||
| 1153 | 0 | 0 | 0 | my $aspect = $args{'aspect'} || $self->_handleMissingArgument(argument => 'aspect'); | |||
| 1154 | 0 | 0 | 0 | my $standardName = $args{'standardName'} || $self->_handleMissingArgument(argument => 'standardName'); | |||
| 1155 | |||||||
| 1156 | # now we have to determine if the standardName is ambiguous or not | ||||||
| 1157 | |||||||
| 1158 | # first, return if there is no standard name for the supplied string | ||||||
| 1159 | |||||||
| 1160 | 0 | 0 | 0 | return undef if !exists $self->{$kUcStdNameToStdName}{uc($standardName)}; | |||
| 1161 | |||||||
| 1162 | # now see if we have 1 or more mappings | ||||||
| 1163 | |||||||
| 1164 | 0 | 0 | my $mappedName; | ||||
| 1165 | |||||||
| 1166 | 0 | 0 | 0 | if (scalar @{$self->{$kUcStdNameToStdName}{uc($standardName)}} == 1){ | |||
| 0 | 0 | ||||||
| 1167 | |||||||
| 1168 | # we have a single mapping | ||||||
| 1169 | |||||||
| 1170 | 0 | 0 | $mappedName = $self->{$kUcStdNameToStdName}{uc($standardName)}[0]; | ||||
| 1171 | |||||||
| 1172 | }else{ | ||||||
| 1173 | |||||||
| 1174 | # there's more than one, so see if the case matched exactly | ||||||
| 1175 | |||||||
| 1176 | 0 | 0 | foreach my $name (@{$self->{$kUcStdNameToStdName}{uc($standardName)}}){ | ||||
| 0 | 0 | ||||||
| 1177 | |||||||
| 1178 | 0 | 0 | 0 | if ($name eq $standardName){ | |||
| 1179 | |||||||
| 1180 | 0 | 0 | $mappedName = $name; | ||||
| 1181 | 0 | 0 | last; | ||||
| 1182 | |||||||
| 1183 | } | ||||||
| 1184 | |||||||
| 1185 | } | ||||||
| 1186 | |||||||
| 1187 | 0 | 0 | 0 | if (!defined $mappedName){ | |||
| 1188 | |||||||
| 1189 | # we got no perfect match, so it's ambiguous, and we die | ||||||
| 1190 | |||||||
| 1191 | 0 | 0 | die "$standardName is ambiguous as a standardName, and could be used to refer to one of:\n\n". | ||||
| 1192 | 0 | 0 | join("\n", @{$self->{$kUcStdNameToStdName}{uc($standardName)}}); | ||||
| 1193 | |||||||
| 1194 | } | ||||||
| 1195 | |||||||
| 1196 | } | ||||||
| 1197 | |||||||
| 1198 | # now we're here, we know we have a mapped standard name, which | ||||||
| 1199 | # must thus map to a databaseId | ||||||
| 1200 | |||||||
| 1201 | 0 | 0 | my $databaseId = $self->_databaseIdByMappedStandardName($mappedName); | ||||
| 1202 | |||||||
| 1203 | 0 | 0 | return $self->_goIdsByMappedDatabaseId(databaseId => $databaseId, | ||||
| 1204 | aspect => $aspect); | ||||||
| 1205 | |||||||
| 1206 | } | ||||||
| 1207 | |||||||
| 1208 | ############################################################################ | ||||||
| 1209 | sub goIdsByName{ | ||||||
| 1210 | ############################################################################ | ||||||
| 1211 | =pod | ||||||
| 1212 | |||||||
| 1213 | =head2 goIdsByName | ||||||
| 1214 | |||||||
| 1215 | This public method returns a reference to an array of GO IDs that are | ||||||
| 1216 | associated with the supplied name for a specific aspect. If there are | ||||||
| 1217 | no GO associations for the entity corresponding to the supplied name | ||||||
| 1218 | in the provided aspect, then a reference to an empty list will be | ||||||
| 1219 | returned. If the supplied name does not correspond to any entity, | ||||||
| 1220 | then undef will be returned. Because the name can be any of the | ||||||
| 1221 | databaseId, the standard name, or any of the aliases, it is possible | ||||||
| 1222 | that the name might be ambiguous. Clients of this object should first | ||||||
| 1223 | test whether the name they are using is ambiguous, using the | ||||||
| 1224 | nameIsAmbiguous() method, and handle it accordingly. If an ambiguous | ||||||
| 1225 | name is supplied, then it will die. | ||||||
| 1226 | |||||||
| 1227 | NB: API change: | ||||||
| 1228 | |||||||
| 1229 | goIdsByName is now case insensitive - that is, if there is a name that | ||||||
| 1230 | is used twice using different casing, that will be treated as | ||||||
| 1231 | ambiguous. Previous versions would not have treated these as | ||||||
| 1232 | ambiguous. This is the price of wanting a case insensitive annotation | ||||||
| 1233 | parser. In the event that a name is provided that is ambiguous | ||||||
| 1234 | because of case, if it matches exactly the case of one of the possible | ||||||
| 1235 | matches, it will be treated unambiguously. | ||||||
| 1236 | |||||||
| 1237 | Usage: | ||||||
| 1238 | |||||||
| 1239 | my $goidsRef = $annotationParser->goIdsByName(name => $name, | ||||||
| 1240 | aspect => <P\|F\|C>); | ||||||
| 1241 | |||||||
| 1242 | =cut | ||||||
| 1243 | |||||||
| 1244 | 0 | 0 | 1 | 0 | my ($self, %args) = @_; | ||
| 1245 | |||||||
| 1246 | 0 | 0 | 0 | my $aspect = $args{'aspect'} || $self->_handleMissingArgument(argument => 'aspect'); | |||
| 1247 | 0 | 0 | 0 | my $name = $args{'name'} || $self->_handleMissingArgument(argument => 'name'); | |||
| 1248 | |||||||
| 1249 | 0 | 0 | 0 | die "You have supplied an ambiguous name to goIdsByName" if ($self->nameIsAmbiguous($name)); | |||
| 1250 | |||||||
| 1251 | # if we get here, the name is not ambiguous, so it's safe to call | ||||||
| 1252 | # databaseIdByName | ||||||
| 1253 | |||||||
| 1254 | 0 | 0 | my $databaseId = $self->databaseIdByName($name); | ||||
| 1255 | |||||||
| 1256 | 0 | 0 | 0 | return undef if !defined $databaseId; # there is no such name | |||
| 1257 | |||||||
| 1258 | # we should have a databaseId in the correct casing now | ||||||
| 1259 | |||||||
| 1260 | 0 | 0 | return $self->_goIdsByMappedDatabaseId(databaseId => $databaseId, | ||||
| 1261 | aspect => $aspect); | ||||||
| 1262 | |||||||
| 1263 | } | ||||||
| 1264 | |||||||
| 1265 | =pod | ||||||
| 1266 | |||||||
| 1267 | =head1 Methods for mapping different types of name to each other | ||||||
| 1268 | |||||||
| 1269 | =cut | ||||||
| 1270 | |||||||
| 1271 | ############################################################################ | ||||||
| 1272 | sub standardNameByDatabaseId{ | ||||||
| 1273 | ############################################################################ | ||||||
| 1274 | =pod | ||||||
| 1275 | |||||||
| 1276 | =head2 standardNameByDatabaseId | ||||||
| 1277 | |||||||
| 1278 | This method returns the standard name for a database id. | ||||||
| 1279 | |||||||
| 1280 | NB: API change | ||||||
| 1281 | |||||||
| 1282 | standardNameByDatabaseId is now case insensitive - that is, if there | ||||||
| 1283 | is a databaseId that is used twice (or more) using different casing, | ||||||
| 1284 | it will be treated as ambiguous. Previous versions would not have | ||||||
| 1285 | treated these as ambiguous. This is the price of wanting a case | ||||||
| 1286 | insensitive annotation parser. In the event that a name is provided | ||||||
| 1287 | that is ambiguous because of case, if it matches exactly the case of | ||||||
| 1288 | one of the possible matches, it will be treated unambiguously. | ||||||
| 1289 | |||||||
| 1290 | Usage: | ||||||
| 1291 | |||||||
| 1292 | my $standardName = $annotationParser->standardNameByDatabaseId($databaseId); | ||||||
| 1293 | |||||||
| 1294 | =cut | ||||||
| 1295 | |||||||
| 1296 | 0 | 0 | 1 | 0 | my ($self, $databaseId) = @_; | ||
| 1297 | |||||||
| 1298 | 0 | 0 | 0 | die "You must supply a databaseId to standardNameByDatabaseId" if !defined ($databaseId); | |||
| 1299 | |||||||
| 1300 | # first return if there is no databaseId for the supplied string | ||||||
| 1301 | |||||||
| 1302 | 0 | 0 | 0 | return undef if (!exists $self->{$kUcIdToId}{uc($databaseId)}); | |||
| 1303 | |||||||
| 1304 | # now, check whether it's ambiguous as a databaseId | ||||||
| 1305 | |||||||
| 1306 | 0 | 0 | my $mappedId; | ||||
| 1307 | |||||||
| 1308 | 0 | 0 | 0 | if (scalar(@{$self->{$kUcIdToId}{uc($databaseId)}}) == 1){ | |||
| 0 | 0 | ||||||
| 1309 | |||||||
| 1310 | # we have a single mapping | ||||||
| 1311 | |||||||
| 1312 | 0 | 0 | $mappedId = $self->{$kUcIdToId}{uc($databaseId)}[0]; | ||||
| 1313 | |||||||
| 1314 | }else{ | ||||||
| 1315 | |||||||
| 1316 | # there's more than one, so see if the provided case matches | ||||||
| 1317 | # exactly one of them | ||||||
| 1318 | |||||||
| 1319 | 0 | 0 | foreach my $id (@{$self->{$kUcIdToId}{uc($databaseId)}}){ | ||||
| 0 | 0 | ||||||
| 1320 | |||||||
| 1321 | 0 | 0 | 0 | if ($databaseId eq $id){ | |||
| 1322 | |||||||
| 1323 | 0 | 0 | $mappedId = $id; | ||||
| 1324 | 0 | 0 | last; | ||||
| 1325 | |||||||
| 1326 | } | ||||||
| 1327 | |||||||
| 1328 | } | ||||||
| 1329 | |||||||
| 1330 | 0 | 0 | 0 | if (!defined $mappedId){ | |||
| 1331 | |||||||
| 1332 | # we got no perfect match, so it's ambiguous, and we die | ||||||
| 1333 | |||||||
| 1334 | 0 | 0 | die "$databaseId is ambiguous as a databaseId, and could be used to refer to one of:\n\n". | ||||
| 1335 | 0 | 0 | join("\n", @{$self->{$kUcIdToId}{uc($databaseId)}}); | ||||
| 1336 | |||||||
| 1337 | } | ||||||
| 1338 | |||||||
| 1339 | } | ||||||
| 1340 | |||||||
| 1341 | |||||||
| 1342 | 0 | 0 | return ($self->{$kIdToStandardName}{$mappedId}); | ||||
| 1343 | |||||||
| 1344 | } | ||||||
| 1345 | |||||||
| 1346 | ############################################################################ | ||||||
sub databaseIdByStandardName{
############################################################################
=pod

=head2 databaseIdByStandardName

This method returns the database id for a standard name, or undef if
the supplied string is not used as a standard name.

NB: API change

databaseIdByStandardName is now case insensitive - that is, if there
is a standard name that is used twice (or more) using different
casing, it will be treated as ambiguous.  Previous versions would have
not treated these as ambiguous.  This is the price of wanting a case
insensitive annotation parser.  In the event that a name is provided
that is ambiguous because of case, if it matches exactly the case of
one of the possible matches, it will be treated unambiguously.  If it
matches none of them exactly, the method dies, listing the candidates.

Usage:

    my $databaseId = $annotationParser->databaseIdByStandardName($standardName);

=cut

    my ($self, $standardName) = @_;

    die "You must supply a standardName to databaseIdByStandardName" if !defined ($standardName);

    my $ucStandardName = uc($standardName);

    # first return if there is no standard name for the supplied string

    return undef if (!exists $self->{$kUcStdNameToStdName}{$ucStandardName});

    # the standard names stored under the upper-cased form of the query

    my @candidates = @{$self->{$kUcStdNameToStdName}{$ucStandardName}};

    my $mappedStandardName;

    if (scalar(@candidates) == 1){

        # it's not ambiguous

        $mappedStandardName = $candidates[0];

    }else{

        # there's more than one, so see if the supplied name matches
        # the case of one of them exactly

        foreach my $candidate (@candidates){

            if ($standardName eq $candidate){

                $mappedStandardName = $candidate;
                last;

            }

        }

        if (!defined $mappedStandardName){

            die "$standardName is ambiguous as a standard name, and could be used to refer to one of:\n\n".
                join("\n", @candidates);

        }

    }

    # The lookup has to use the mapped name, which carries the stored
    # casing.  Using the name as supplied by the client made any query
    # whose casing differed from the stored standard name return undef,
    # defeating the case insensitivity established above.

    return ($self->{$kStandardNameToId}{$mappedStandardName});

}
| 1417 | |||||||
| 1418 | ############################################################################ | ||||||
sub _databaseIdByMappedStandardName{
############################################################################
# Protected method that looks up the database id for a standard name.
# The caller must already have resolved the standard name, so that it
# is non-ambiguous and in the correct casing - no case folding or
# ambiguity checking is done here.
#
# Dies if no standard name is supplied.
#
# Usage:
#
#    my $databaseId = $annotationParser->_databaseIdByMappedStandardName($standardName);
#

    my $self         = shift;
    my $standardName = shift;

    if (!defined ($standardName)){

        die "You must supply a standardName to _databaseIdByMappedStandardName";

    }

    my $databaseId = $self->{$kStandardNameToId}{$standardName};

    return ($databaseId);

}
| 1436 | |||||||
| 1437 | ############################################################################ | ||||||
sub databaseIdByName{
############################################################################
=pod

=head2 databaseIdByName

Returns the database id for a gene, given any identifier for that gene
(the databaseId itself, its standard name, or an alias).  The method
dies if the name is ambiguous, so clients should call the
nameIsAmbiguous() method before using this one.  If the name does not
map to any databaseId, undef is returned.

NB: API change

databaseIdByName is now case insensitive - that is, if there is a name
that is used twice using different casing, that will be treated as
ambiguous.  Previous versions would have not treated these as
ambiguous.  This is the price of wanting a case insensitive annotation
parser.  In the event that a name is provided that is ambiguous
because of case, if it matches exactly the case of one of the possible
matches, it will be treated unambiguously.

Usage:

    my $databaseId = $annotationParser->databaseIdByName($name);

=cut

    my ($self, $name) = @_;

    die "You must supply a name to databaseIdByName" if !defined ($name);

    if ($self->nameIsAmbiguous($name)){

        die "You have supplied an ambiguous name to databaseIdByName";

    }

    # the case insensitive unique mapping is preferred...

    my $databaseId = $self->{$kNameToIdMapInsensitive}{uc($name)};

    # ...and the case sensitive mapping is only consulted if that gave
    # us nothing

    if (!$databaseId){

        $databaseId = $self->{$kNameToIdMapSensitive}{$name};

    }

    return $databaseId;

}
| 1481 | |||||||
| 1482 | ############################################################################ | ||||||
sub standardNameByName{
############################################################################
=pod

=head2 standardNameByName

This public method returns the standard name for the gene specified by
the given name.  Because a name may be ambiguous, the
nameIsAmbiguous() method should be called first.  If an ambiguous name
is supplied, the method dies with an appropriate error message.  If
the name does not map to a standard name, undef is returned.

NB: API change

standardNameByName is now case insensitive - that is, if there is a
name that is used twice using different casing, that will be treated
as ambiguous.  Previous versions would have not treated these as
ambiguous.  This is the price of wanting a case insensitive annotation
parser.

Usage:

    my $standardName = $annotationParser->standardNameByName($name);

=cut

    my ($self, $name) = @_;

    die "You must supply a name to standardNameByName" if !defined ($name);

    # checked here, rather than left to databaseIdByName, so that the
    # error message names this method

    die "You have supplied an ambiguous name to standardNameByName" if ($self->nameIsAmbiguous($name));

    # go via the databaseId, which every kind of name maps to

    my $databaseId = $self->databaseIdByName($name);

    return undef if (!defined $databaseId);

    return $self->{$kIdToStandardName}{$databaseId};

}
| 1528 | |||||||
| 1529 | =pod | ||||||
| 1530 | |||||||
| 1531 | =head1 Other methods relating to names | ||||||
| 1532 | |||||||
| 1533 | =cut | ||||||
| 1534 | |||||||
| 1535 | ############################################################################ | ||||||
sub nameIsStandardName{
############################################################################
=pod

=head2 nameIsStandardName

Returns a boolean that indicates whether the supplied name is used as
a standard name.  Dies if no name is supplied.

NB : API change.

This is now case insensitive.  If you provide abC1, and ABc1 is a
standard name, then it will return true.

Usage :

    if ($annotationParser->nameIsStandardName($name)){

        # do something

    }

=cut

    my $self = shift;
    my $name = shift;

    die "You must supply a name to nameIsStandardName" if !defined($name);

    # standard names are indexed by their upper-cased form

    my $ucName = uc($name);

    return exists $self->{$kUcStdNameToStdName}{$ucName};

}
| 1567 | |||||||
| 1568 | ############################################################################ | ||||||
sub nameIsDatabaseId{
############################################################################
=pod

=head2 nameIsDatabaseId

Returns a boolean that indicates whether the supplied name is used as
a database id.  Dies if no name is supplied.

NB : API change.

This is now case insensitive.  If you provide abC1, and ABc1 is a
database id, then it will return true.

Usage :

    if ($annotationParser->nameIsDatabaseId($name)){

        # do something

    }

=cut

    my $self       = shift;
    my $databaseId = shift;

    die "You must supply a potential databaseId to nameIsDatabaseId" if !defined($databaseId);

    # database ids are indexed by their upper-cased form

    my $ucId = uc($databaseId);

    return exists $self->{$kUcIdToId}{$ucId};

}
| 1601 | |||||||
| 1602 | ############################################################################ | ||||||
sub nameIsAnnotated{
############################################################################
=pod

=head2 nameIsAnnotated

This method returns a boolean to indicate whether the supplied name has any
annotations, either when considered as a databaseId, a standardName, or
an alias.  If an aspect is also supplied, then it indicates whether that
name has any annotations in that aspect only.  For an ambiguous name
with an aspect, it returns true as soon as any one of the databaseIds
that the name may refer to has annotations in that aspect.

Dies if no name is supplied.

NB: API change.

This is now case insensitive.  If you provide abC1, and ABc1 has
annotation, then it will return true.

Usage :

    if ($annotationParser->nameIsAnnotated(name => $name)){

        # blah

    }

or:

    if ($annotationParser->nameIsAnnotated(name   => $name,
                                           aspect => $aspect)){

        # blah

    }

=cut

    my ($self, %args) = @_;

    my $name = $args{'name'} || die "You must supply a name to nameIsAnnotated";

    my $aspect = $args{'aspect'};

    my $isAnnotated = 0;

    my $ucName = uc($name);

    # The goIdsBy* methods hand back an array reference, but may hand
    # back undef instead.  Dereferencing undef is fatal, so every result
    # goes through this check before it is treated as a list.

    my $hasGoIds = sub {

        my ($goIdsRef) = @_;

        return (defined $goIdsRef && @{$goIdsRef}) ? 1 : 0;

    };

    if (!defined ($aspect)){ # if there's no aspect

        # any name we know about, whether unique or ambiguous, counts

        $isAnnotated = (exists ($self->{$kNameToIdMapInsensitive}{$ucName}) || exists ($self->{$kAmbiguousNames}{$ucName}));

    }else{

        if ($self->nameIsDatabaseId($name) &&
            $hasGoIds->(scalar($self->goIdsByDatabaseId(databaseId => $name,
                                                        aspect     => $aspect)))){

            $isAnnotated = 1;

        }elsif ($self->nameIsStandardName($name) &&
                $hasGoIds->(scalar($self->goIdsByStandardName(standardName => $name,
                                                              aspect       => $aspect)))){

            $isAnnotated = 1;

        }elsif (!$self->nameIsAmbiguous($name)){

            my $goidsRef = $self->goIdsByName(name   => $name,
                                              aspect => $aspect);

            if ($hasGoIds->($goidsRef)){

                $isAnnotated = 1;

            }

        }else { # MUST be an ambiguous name, that's not used as a standard name

            foreach my $databaseId ($self->databaseIdsForAmbiguousName($name)){

                # query with the candidate databaseId, not with the
                # ambiguous name itself - passing the name meant that
                # none of the candidates was ever actually examined

                if ($hasGoIds->(scalar($self->goIdsByDatabaseId(databaseId => $databaseId,
                                                                aspect     => $aspect)))){

                    $isAnnotated = 1;
                    last; # as soon as we know, we can finish

                }

            }

        }

    }

    return $isAnnotated;

}
| 1697 | |||||||
| 1698 | =pod | ||||||
| 1699 | |||||||
| 1700 | =head1 Other public methods | ||||||
| 1701 | |||||||
| 1702 | =cut | ||||||
| 1703 | |||||||
| 1704 | ############################################################################ | ||||||
sub databaseName{
############################################################################
=pod

=head2 databaseName

Returns the name of the annotating authority, as found in the file
that was supplied to the constructor.

Usage :

    my $databaseName = $annotationParser->databaseName();

=cut

    my ($self) = @_;

    return $self->{$kDatabaseName};

}
| 1725 | |||||||
| 1726 | ############################################################################ | ||||||
sub numAnnotatedGenes{
############################################################################
=pod

=head2 numAnnotatedGenes

Returns the number of entities in the annotation file that have
annotations in the supplied aspect.  If no aspect is supplied, it
returns instead the number of genes that have an annotation in at
least one aspect of GO.

Usage:

    my $numAnnotatedGenes = $annotationParser->numAnnotatedGenes();

    my $numAnnotatedGenes = $annotationParser->numAnnotatedGenes($aspect);

=cut

    my ($self, $aspect) = @_;

    # without an aspect, we report the total across all aspects

    return $self->{$kTotalNumAnnotatedGenes} if (!defined ($aspect));

    return $self->{$kNumAnnotatedGenes}{$aspect};

}
| 1759 | |||||||
| 1760 | ############################################################################ | ||||||
sub allDatabaseIds{
############################################################################
=pod

=head2 allDatabaseIds

This public method returns an array of all the database identifiers,
in no particular order.

Usage:

    my @databaseIds = $annotationParser->allDatabaseIds();

=cut

    my ($self) = @_;

    # the database ids are the keys of the id to standard name mapping

    return keys %{$self->{$kIdToStandardName}};

}
| 1780 | |||||||
| 1781 | ############################################################################ | ||||||
sub allStandardNames{
############################################################################
=pod

=head2 allStandardNames

This public method returns an array of all the standard names, in no
particular order.

Usage:

    my @standardNames = $annotationParser->allStandardNames();

=cut

    my ($self) = @_;

    # the standard names are the keys of the standard name to id mapping

    return keys %{$self->{$kStandardNameToId}};

}
| 1801 | |||||||
| 1802 | =pod | ||||||
| 1803 | |||||||
| 1804 | =head1 Methods to do with files | ||||||
| 1805 | |||||||
| 1806 | =cut | ||||||
| 1807 | |||||||
| 1808 | ############################################################################ | ||||||
sub file{
############################################################################
=pod

=head2 file

Returns the name of the file that was used to instantiate the object.

Usage:

    my $file = $annotationParser->file;

=cut

    my ($self) = @_;

    return $self->{$kFileName};

}
| 1827 | |||||||
| 1828 | ############################################################################ | ||||||
sub serializeToDisk{
############################################################################
=pod

=head2 serializeToDisk

This public method saves the current state of the Annotation Parser
Object to a file, using the Storable package.  The data are saved in
network order for portability, just in case.  The name of the object
file is returned.  By default, the name of the object file is derived
from the name of the original file (including the full path from where
the file came), with a .obj suffix added unless it already has one.
Alternatively, the client can supply their own filename, which is used
exactly as given.

Usage:

    my $fileName = $annotationParser->serializeToDisk;

    my $fileName = $annotationParser->serializeToDisk(filename => $filename);

=cut

    my ($self, %args) = @_;

    my $clientSuppliedName = exists ($args{'filename'});

    # either they supply their own filename, or we start from the name
    # of the file used to instantiate ourselves

    my $fileName = $clientSuppliedName ? $args{'filename'} : $self->file;

    # a name we derive ourselves gets a .obj suffix, unless we were
    # instantiated from an object file, in which case it has one already

    if (!$clientSuppliedName && $fileName !~ /\.obj$/){

        $fileName .= ".obj";

    }

    unless (nstore ($self, $fileName)){

        die "$PACKAGE could not serialize itself to $fileName : $!";

    }

    return ($fileName);

}
| 1876 | |||||||
| 1877 | 1; # to keep perl happy | ||||||
| 1878 | |||||||
| 1879 | ############################################################################ | ||||||
| 1880 | # MORE P O D D O C U M E N T A T I O N # | ||||||
| 1881 | ############################################################################ | ||||||
| 1882 | |||||||
| 1883 | =pod | ||||||
| 1884 | |||||||
| 1885 | =head1 Modifications | ||||||
| 1886 | |||||||
| 1887 | CVS info is listed here: | ||||||
| 1888 | |||||||
| 1889 | # $Author: sherlock $ | ||||||
| 1890 | # $Date: 2008/05/13 23:06:16 $ | ||||||
| 1891 | # $Log: AnnotationParser.pm,v $ | ||||||
| 1892 | # Revision 1.35 2008/05/13 23:06:16 sherlock | ||||||
| 1893 | # updated to fix bug with querying with a name that was unambiguous when | ||||||
| 1894 | # taking its casing into account. | ||||||
| 1895 | # | ||||||
| 1896 | # Revision 1.34 2007/03/18 03:09:05 sherlock | ||||||
| 1897 | # couple of PerlCritic suggested improvements, and an extra check to | ||||||
| 1898 | # make sure that the cardinality between standard names and database ids | ||||||
| 1899 | # is 1:1 | ||||||
| 1900 | # | ||||||
| 1901 | # Revision 1.33 2006/07/28 00:02:14 sherlock | ||||||
| 1902 | # fixed a couple of typos | ||||||
| 1903 | # | ||||||
| 1904 | # Revision 1.32 2004/07/28 17:12:10 sherlock | ||||||
| 1905 | # bumped version | ||||||
| 1906 | # | ||||||
| 1907 | # Revision 1.31 2004/07/28 17:03:49 sherlock | ||||||
| 1908 | # fixed bugs when calling goidsByDatabaseId instead of goIdsByDatabaseId | ||||||
| 1909 | # on lines 1592 and 1617 - thanks to lfriedl@cs.umass.edu for spotting this. | ||||||
| 1910 | # | ||||||
| 1911 | # Revision 1.30 2003/11/26 18:44:28 sherlock | ||||||
| 1912 | # finished making all the changes that were required to make it case | ||||||
| 1913 | # insensitive, and modified POD accordingly. It appears to all work as | ||||||
| 1914 | # expected... | ||||||
| 1915 | # | ||||||
| 1916 | # Revision 1.29 2003/11/22 00:05:05 sherlock | ||||||
| 1917 | # made a very large number of changes to make much of it | ||||||
| 1918 | # case-insensitive, such that using CDC6 or cdc6 amounts to the same | ||||||
| 1919 | # query, as long as both versions of that name don't exist in the | ||||||
| 1920 | # annotations file. Still needs a little work to allow names that are | ||||||
| 1921 | # potentially ambiguous to be not ambiguous, if their casing matches | ||||||
| 1922 | # exactly one form of the name that has been seen. Have started to | ||||||
| 1923 | # update test suite to check all the case insensitive stuff, but is not | ||||||
| 1924 | # yet finished. | ||||||
| 1925 | # | ||||||
| 1926 | # | ||||||
| 1927 | |||||||
| 1928 | =head1 AUTHORS | ||||||
| 1929 | |||||||
| 1930 | Elizabeth Boyle, ell@mit.edu | ||||||
| 1931 | |||||||
| 1932 | Gavin Sherlock, sherlock@genome.stanford.edu | ||||||
| 1933 | |||||||
| 1934 | =cut |