File Coverage

blib/lib/OBO/APO/NCBIToRDF.pm
Criterion Covered Total %
statement 78 78 100.0
branch 8 12 66.6
condition n/a
subroutine 8 8 100.0
pod 2 3 66.6
total 96 101 95.0


line stmt bran cond sub pod time code
1             package OBO::APO::NCBIToRDF;
2              
3             =head1 NAME
4              
5             OBO::APO::NCBIToRDF - A NCBI taxonomy dump to RDF converter.
6              
7             =head1 DESCRIPTION
8              
9             Converts NCBI taxonomy dump files (names and nodes) to a RDF graph.
10              
11             NCBI taxonomy dump files can be obtained from ftp://ftp.ncbi.nih.gov/pub/taxonomy/
12              
13             The method 'work' gets the names file, the nodes file, a file handler for the RDF graph, a base URI and a namespace.
14              
15             =head1 AUTHOR
16              
17             Mikel Egana Aranguren
18             mikel.egana.aranguren@gmail.com
19              
20             =head1 COPYRIGHT AND LICENSE
21              
22             Copyright (C) 2008 by Mikel Egana Aranguren
23              
24             This library is free software; you can redistribute it and/or modify
25             it under the same terms as Perl itself, either Perl version 5.8.7 or,
26             at your option, any later version of Perl 5 you may have available.
27              
28             =cut
29              
30 1     1   4195 use strict;
  1         2  
  1         33  
31 1     1   4 use warnings;
  1         1  
  1         30  
32 1     1   3 use Carp;
  1         1  
  1         45  
33              
34 1     1   426 use open qw(:std :utf8); # Make All I/O Default to UTF-8
  1         911  
  1         4  
35              
36             sub new {
37 1     1 0 10 my $class = shift;
38 1         1 my $self = {};
39            
40 1         2 bless ($self, $class);
41 1         3 return $self;
42             }
43              
44             =head2 work
45              
46             Usage - $NCBIToRDF->work($NCBINamesFilePath,$NCBINodesFilePath,$RDF_file_handler,$base_URI,$namespace)
47             Returns - RDF file handler
48             Args - The paths to the NCBI names and nodes files, a file handler for the new RDF file, the base URI and the namespace
49             Function - Converts NCBI nodes and NCBI names to an RDF graph.
50            
51             =cut
52             #vlmir
53             # Arguments
54             # 1. Full path to the names.dmp file
55             # 2. Full path to the nodes.dmp file
56             # 3. File handle for writing RDF
57             # 4. base URI (e.g. 'http://www.semantic-systems-biology.org/')
58             # 5. name space (e.g. 'SSB')
59             #vlmir
60              
61             sub work {
62              
63            
64 1     1 1 199 my $self = shift;
65              
66             # TODO: have a thorough look into: ftp://ftp.ncbi.nih.gov/pub/taxonomy/taxdump_readme.txt
67              
68             # Get the arguments
69             # my ($NCBInodesFileName,$NCBInamesFileName,$file_handle ) = @_;
70 1         3 my ($NCBInamesFileName,$NCBInodesFileName, $file_handle, $base, $namespace ) = @_; #vlmir
71              
72             # For the ID
73             # $path_to_assoc_file =~ /.*\/(.*)/; # get what is after the slash in the path...
74             # my $f_name = $1;
75             # (my $prefix_id = $f_name) =~ s/\.goa//;
76             # $prefix_id =~ s/\./_/g;
77              
78             # TODO: set all the NS and URI via arguments
79             # my $default_URL = "http://www.semantic-systems-biology.org/";
80 1         1 my $default_URL = $base; #vlmir
81 1         2 my $NS = $namespace; #vlmir
82 1         3 my $ns = lc ($NS);
83 1         1 my $rdf_subnamespace = "taxon";
84            
85 1         4 my $obo_ns = $default_URL.$NS."#"; #$default_URL."OBO#";
86 1         2 my $ncbi_ns = $default_URL.$NS."#"; #$default_URL."NCBI#";
87              
88             # Preamble of RDF file
89 1         10 print $file_handle "<?xml version=\"1.0\"?>\n";
90 1         2 print $file_handle "<rdf:RDF\n";
91 1         2 print $file_handle "\txmlns:rdf=\"http://www.w3.org/1999/02/22-rdf-syntax-ns#\"\n";
92 1         1 print $file_handle "\txmlns:rdfs=\"http://www.w3.org/2000/01/rdf-schema#\"\n";
93 1         3 print $file_handle "\txmlns:".$ns."=\"".$ncbi_ns."\">\n";
94             #print $file_handle "\txmlns:obo=\"".$obo_ns."\">\n";
95              
96 1         3 my %nodes = ();
97 1         2 my %names = ();
98              
99             # Open and parse names file (we want groups 1 and 2 only if group 4 is scientific name)
100 1 50       27 open(NCBInamesFile, $NCBInamesFileName) || croak("can't open file: $!");
101 1         26 my @mynamelines = <NCBInamesFile>;
102 1         3 foreach my $theline (@mynamelines){
103 22 50       127 if ($theline =~ /(.+)\|(.+)\|(.+)\|(.+)\|/){
104 22         23 my $childid = $1;
105 22         23 my $childname = $2;
106 22         17 my $nametype = $4;
107 22         44 $childid =~ s/\s//g;
108 22         48 $nametype =~ s/\s//g;
109 22 100       36 if($nametype eq 'scientificname'){
110 10         18 $childname =~ s/^\s+//;
111 10         23 $childname =~ s/\s+$//;
112 10         22 $names{$childid} = $childname;
113             }
114             }
115             }
116 1         6 close(NCBInamesFile);
117              
118             # Open and parse the nodes file
119 1 50       24 open(NCBInodesFile, $NCBInodesFileName) || croak("can't open file: $!");
120 1         22 my @mynodelines =<NCBInodesFile>;
121 1         3 foreach my $theline (@mynodelines){
122 8 50       8430 if ($theline =~ /(.+)\|(.+)\|(.+)\|(.+)\|(.+)\|(.+)\|(.+)\|(.+)\|(.+)\|(.+)\|(.+)\|(.+)\|(.+)\|/){
123 8         23 my $child = $1;
124 8         9 my $parent = $2;
125 8         9 my $rank = $3;
126 8         30 $child =~ s/\s//g;
127 8         20 $parent =~ s/\s//g;
128 8         16 $rank =~ s/\s//g;
129 8         15 $nodes{$child} = $parent;
130 8         27 print $file_handle "\t<",$ns,":".$rdf_subnamespace." rdf:about=\"#"."NCBI"."_".$child."\">\n";
131 8         27 print $file_handle "\t\t<rdfs:label xml:lang=\"en\">".&char_hex_http($names{$child})."</rdfs:label>\n";
132 8         15 print $file_handle "\t\t<".$ns.":name xml:lang=\"en\">".&char_hex_http($names{$child})."</".$ns.":name>\n";
133 8         16 print $file_handle "\t\t<".$ns.":rank>".&char_hex_http($rank)."</".$ns.":rank>\n";
134              
135 8 100       17 unless ($child eq "1"){
136 7         14 print $file_handle "\t\t<".$ns.":is_a rdf:resource=\"#"."NCBI"."_".$parent."\"/>\n";
137             }
138              
139 8         18 print $file_handle "\t</",$ns,":".$rdf_subnamespace.">\n";
140             }
141             }
142 1         45 close(NCBInodesFile);
143              
144 1         3 print $file_handle "</rdf:RDF>\n\n";
145 1         6 print $file_handle "";
146              
147 1         14 return $file_handle;
148             }
149              
150             sub __date {
151 1     1   51 my ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime(time);
152 1         12 my $result = sprintf "%02d:%02d:%4d %02d:%02d", $mday,$mon+1,$year+1900,$hour,$min; # e.g. 11:05:2008 12:52
153             }
154              
155             =head2 char_hex_http
156              
157             Usage - char_hex_http($seq)
158             Returns - the sequence with the hexadecimal representation for the http special characters (the argument itself is also modified in place)
159             Args - the sequence of characters
160             Function - Transforms a http character to its equivalent one in hexadecimal. E.g. : -> %3A
161            
162             =cut
163              
164              
165             sub char_hex_http {
166 24     24 1 26 $_[0] =~ s/:/%3A/g;
167 24         20 $_[0] =~ s/;/%3B/g;
168 24         18 $_[0] =~ s/</%3C/g;
169 24         16 $_[0] =~ s/=/%3D/g;
170 24         18 $_[0] =~ s/>/%3E/g;
171 24         17 $_[0] =~ s/\?/%3F/g;
172            
173             #number sign # 23 # --> # # --> #
174             #dollar sign $ 24 $ --> $ $ --> $
175             #percent sign % 25 % --> % % --> %
176              
177 24         22 $_[0] =~ s/\//%2F/g;
178 24         15 $_[0] =~ s/&/%26/g;
179              
180 24         56 return $_[0];
181             }
182             1;