File Coverage

blib/lib/NanoB2B/NER/Metaman.pm
Criterion Covered Total %
statement 12 123 9.7
branch 0 20 0.0
condition n/a
subroutine 4 13 30.7
pod 0 8 0.0
total 16 164 9.7


line stmt bran cond sub pod time code
1             #!/usr/bin/perl
2             # NanoB2B-NER::NER::Metaman
3             #
4             # Turns file lines into MetaMap lines
5             # Version 1.0
6             #
7             # Program by Milk
8              
9             package NanoB2B::NER::Metaman;
10              
11 1     1   6 use NanoB2B::UniversalRoutines;
  1         2  
  1         26  
12 1     1   385 use MetaMap::DataStructures;
  1         8121  
  1         24  
13 1     1   11 use strict;
  1         2  
  1         16  
14 1     1   4 use warnings;
  1         2  
  1         971  
15              
16             #### GLOBAL VARIABLES ####
17              
18             #option variables
19             my $debug = 1;
20             my $program_dir = "";
21             my $metamap_arguments = "-q";
22             my $fileIndex = 0;
23              
24             #datastructure variables
25             my %params = ();
26             my $dataStructures = MetaMap::DataStructures->new(\%params);
27              
28             #universal subroutines object
29             my %uniParams = ();
30             my $uniSub;
31              
32             #hash object for later
33             my %metamapHash = ();
34              
35              
36              
37             #### A BACKSTORY IS CREATED ####
38              
39             # construction method to create a new Metaman object
40             # input : $directory <-- the name of the directory for the files
41             # \$index <-- the index to start metamapping from in the set of files
42             # \$debug <-- run the program with debug print statements
43             # output : $self <-- an instance of the Metaman object
44             sub new {
45             #grab class and parameters
46 0     0 0   my $self = {};
47 0           my $class = shift;
48 0 0         return undef if(ref $class);
49 0           my $params = shift;
50              
51             #bless this object
52 0           bless $self, $class;
53 0           $self->_init($params);
54              
55 0           $uniParams{'debug'} = $self->{debug};
56 0           $uniSub = NanoB2B::UniversalRoutines->new(\%uniParams);
57              
58 0           return $self;
59             }
60             # method to initialize the NanoB2B::NER::Metaman object.
61             # input : $parameters <- reference to a hash
62             # output:
63             sub _init {
64 0     0     my $self = shift;
65 0           my $params = shift;
66              
67 0 0         $params = {} if(!defined $params);
68              
69             # get some of the parameters
70 0           my $diroption = $params->{'directory'};
71 0           my $debugoption = $params->{'debug'};
72 0           my $fileindexoption = $params->{'fileIndex'};
73 0           my $metamapargoption = $params->{'metamap_arguments'};
74              
75             #set the global variables
76 0 0         if(defined $debugoption){$debug = $debugoption;}
  0            
77 0 0         if(defined $diroption){$program_dir = $diroption;}
  0            
78 0 0         if(defined $fileindexoption){$fileIndex = $fileindexoption;}
  0            
79 0 0         if(defined $metamapargoption){$metamap_arguments = $metamapargoption;}
  0            
80             }
81              
82             #### TO THE METAMOBILE! ####
83              
84             # imports the data, cleans the lines, runs through metamap, and exports the results to a file
85             # input : $file <-- name of the file to run through metamap
86             # output :
87             sub meta_file{
88 0     0 0   my $self = shift;
89 0           my $file = shift;
90              
91             #define and reset temp var
92 0           my $indexer = 0;
93 0           %metamapHash = ();
94              
95             #get the name of the file
96 0           my @n = split '/', $file;
97 0           my $l = @n;
98 0           my $filename = $n[$l - 1];
99 0           $filename = lc($filename);
100              
101             #import the data from the file
102 0           my $FILE;
103 0 0         open ($FILE, "$program_dir/$file") || die ("what is this 'dir/file' you speak of?\n");
104 0           my @fileLines = <$FILE>;
105 0           foreach my $l(@fileLines){
106 0           $l = lc($l);
107             }
108 0           $uniSub->printColorDebug("on_red", "$filename");
109              
110             #get the total num of lines
111 0           my $totalLines = 0;
112 0           $totalLines = @fileLines;
113 0           $uniSub->printColorDebug("red", "Lines: $totalLines\n");
114              
115             #clean it up for two separate sets
116 0           my @cleanLines = untagSet($filename, \@fileLines);
117              
118             #metamap all the lines --> metamaphash
119 0           $uniSub->printColorDebug("blue", "*Metamapping the lines into a hashtable....\n");
120 0           $indexer = 0;
121 0           my $total = @cleanLines;
122 0           foreach my $line (@cleanLines){
123             #printColorDebug("on_blue", "LINE: $line\n");
124 0           my $lnnum = $indexer + 1;
125 0           $uniSub->printColorDebug("green", "$filename - MM Line $lnnum / $total...\n");
126 0           my $mm = metaLine($self, $line, $filename);
127 0           $metamapHash{$indexer} = $mm;
128 0           $indexer++;
129             }
130              
131             #export the metamap data to a separate file
132 0           exportMetaData($filename);
133             }
134              
135             #### CLEANS THE LINE ####
136              
137             # cleans the line without getting rid of tags
138             # input : $input <-- the line to clean
139             # output :
140             sub cleanWords{
141 0     0 0   my $input = shift;
142              
143 0           $input =~ s/[^a-zA-z0-9\:\.\s<>&#;\*\/\,]/ /g; #get rid of non-ascii
144 0           $input =~ s/([0-9]+(\.[0-9]*)?)-[0-9]+(\.[0-9]*)?/RANGE/g; #get rid of range num (#-#)
145 0           $input =~ s/[0-9]+\.?[0-9]+/NUM/g; #get rid of normal num (# or #.#)
146 0           $input =~ s/\s?=\s?/eq/g; #get rid of =
147 0           $input =~ s///g; #get rid of
148 0           $input =~ s/[\*\/]//g; #get rid of *
149             #$input =~ s/[,\)\(\\\'\/\=\*\-]/ /g;
150 0           $input =~ s/\s\+/_/g; #get rid of _+ space
151 0           $input =~ s/\s+\.\s+/ /g; #get rid of _._ periods
152 0           $input =~ s/\.\s+/ /g; #get rid of ._ space
153 0           $input =~ s/\s+/ /g; #get rid of excessive blank space
154 0           return $input;
155             }
156              
157             # returns clean line with no tags or retaggings
158             # input : $line <-- the line to remove the tags from
159             # $id <-- the entity tag name (ex. )
160             # output : $input <-- the line untagged
161             sub untag{
162 0     0 0   my $line = shift;
163 0           my $id = shift;
164              
165 0           my $input = lc($line);
166 0           $id = lc($id);
167 0           $input =~ s/ //g;
168 0           $input =~ s/ //g;
169 0           $input =~ s/ //g;
170 0           $input = cleanWords($input);
171 0           return $input;
172             }
173              
174             # returns a clean set of lines
175             # input : $filename <-- the name of the file/tag for the entities
176             # @lines <-- the set of lines from the file
177             # output : @clean_set <-- the line set untagged
178             sub untagSet{
179 0     0 0   my $filename = shift;
180 0           my $lines_ref = shift;
181 0           my @lines = @$lines_ref;
182              
183 0           my @clean_set = ();
184 0           foreach my $line(@lines){
185 0           my $cl = untag($line, $filename);
186 0           push @clean_set, $cl;
187             }
188 0           return @clean_set;
189             }
190              
191             ###### METAMAPS THE LINE ######
192             #metamaps a single line
193             # input : $line <-- the line to run through metamap
194             # $name <-- the name of the file/tag for the entities
195             # output : $meta <-- the metamap output for the line
196             sub metaLine{
197 0     0 0   my $self = shift;
198 0           my $line = shift;
199 0           my $name = shift;
200              
201             #make a makeshift file to put the line
202 0           open IN, ">", "input" || die ("No input file...$!");
203 0           my $clean_line = untag($line, $name);
204 0           chomp($clean_line);
205              
206 0           print IN "$clean_line\n";
207 0           close IN;
208              
209             #analyze using nlm's program
210 0           my $meta = `metamap $metamap_arguments < input`;
211 0           return $meta;
212             }
213              
214             #metamaps an entire set of lines
215             # input : $name <-- the name of the file/tag for the entities
216             # @lines <-- the set of lines to run through metamap
217             # output : @set <-- the set of metamapped lines
218             sub metaSet{
219 0     0 0   my $name = shift;
220 0           my $lines_ref = shift;
221 0           my @lines = @$lines_ref;
222              
223 0           my @set = ();
224 0           foreach my $l (@lines){
225 0           my $ml = metaLine($l, $name);
226 0           push @set, $ml;
227             }
228 0           return @set;
229             }
230              
231             #exports metamap hashtable data by printing it to a file
232             # input : $name <-- the name of the file/tag for the entities
233             # output : (META) <-- a file with the metamap data stored in _META of the files' directory
234             sub exportMetaData{
235 0     0 0   my $name = shift;
236              
237 0           my $META;
238             #create a directory to save hashtable data
239 0 0         if($program_dir ne ""){
240 0           my $subdir = "_METAMAPS";
241 0           $uniSub->make_path("$program_dir/$subdir");
242 0 0         open($META, ">", ("$program_dir/$subdir/" . $name . "_meta")) || die ("Um...");
243             }else{
244 0 0         open($META, ">", ($name . "_meta")) || die ("Agh!");
245             }
246              
247             #print metamap data to the file
248 0           foreach my $key (sort { $a <=> $b } keys %metamapHash){
  0            
249 0           my $mm = $metamapHash{$key};
250 0           $uniSub->print2File($META, $mm);
251             }
252 0           close $META;
253             }
254              
255             1;