| line | stmt | bran | cond | sub | pod | time | code | 
| 1 |  |  |  |  |  |  | #!/usr/bin/perl | 
| 2 |  |  |  |  |  |  | # NanoB2B-NER::NER::Metaman | 
| 3 |  |  |  |  |  |  | # | 
| 4 |  |  |  |  |  |  | # Turns file lines into MetaMap lines | 
| 5 |  |  |  |  |  |  | # Version 1.0 | 
| 6 |  |  |  |  |  |  | # | 
| 7 |  |  |  |  |  |  | # Program by Milk | 
| 8 |  |  |  |  |  |  |  | 
| 9 |  |  |  |  |  |  | package NanoB2B::NER::Metaman; | 
| 10 |  |  |  |  |  |  |  | 
| 11 | 1 |  |  | 1 |  | 9 | use NanoB2B::UniversalRoutines; | 
|  | 1 |  |  |  |  | 3 |  | 
|  | 1 |  |  |  |  | 38 |  | 
| 12 | 1 |  |  | 1 |  | 608 | use MetaMap::DataStructures; | 
|  | 1 |  |  |  |  | 12959 |  | 
|  | 1 |  |  |  |  | 37 |  | 
| 13 | 1 |  |  | 1 |  | 24 | use strict; | 
|  | 1 |  |  |  |  | 4 |  | 
|  | 1 |  |  |  |  | 30 |  | 
| 14 | 1 |  |  | 1 |  | 8 | use warnings; | 
|  | 1 |  |  |  |  | 3 |  | 
|  | 1 |  |  |  |  | 1689 |  | 
| 15 |  |  |  |  |  |  |  | 
| 16 |  |  |  |  |  |  | ####          GLOBAL VARIABLES           #### | 
| 17 |  |  |  |  |  |  |  | 
| 18 |  |  |  |  |  |  | #option variables | 
| 19 |  |  |  |  |  |  | my $debug = 1; | 
| 20 |  |  |  |  |  |  | my $program_dir = ""; | 
| 21 |  |  |  |  |  |  | my $metamap_arguments = "-q"; | 
| 22 |  |  |  |  |  |  | my $fileIndex = 0; | 
| 23 |  |  |  |  |  |  |  | 
| 24 |  |  |  |  |  |  | #datastructure variables | 
| 25 |  |  |  |  |  |  | my %params = (); | 
| 26 |  |  |  |  |  |  | my $dataStructures = MetaMap::DataStructures->new(\%params); | 
| 27 |  |  |  |  |  |  |  | 
| 28 |  |  |  |  |  |  | #universal subroutines object | 
| 29 |  |  |  |  |  |  | my %uniParams = (); | 
| 30 |  |  |  |  |  |  | my $uniSub; | 
| 31 |  |  |  |  |  |  |  | 
| 32 |  |  |  |  |  |  | #hash object for later | 
| 33 |  |  |  |  |  |  | my %metamapHash = (); | 
| 34 |  |  |  |  |  |  |  | 
| 35 |  |  |  |  |  |  |  | 
| 36 |  |  |  |  |  |  |  | 
| 37 |  |  |  |  |  |  | ####      A BACKSTORY IS CREATED     #### | 
| 38 |  |  |  |  |  |  |  | 
| 39 |  |  |  |  |  |  | # construction method to create a new Metaman object | 
| 40 |  |  |  |  |  |  | # input  : $directory <-- the name of the directory for the files | 
| 41 |  |  |  |  |  |  | #		   \$index     <-- the index to start metamapping from in the set of files | 
| 42 |  |  |  |  |  |  | #		   \$debug     <-- run the program with debug print statements | 
| 43 |  |  |  |  |  |  | # output : $self      <-- an instance of the Metaman object | 
| 44 |  |  |  |  |  |  | sub new { | 
| 45 |  |  |  |  |  |  | #grab class and parameters | 
| 46 | 0 |  |  | 0 | 0 |  | my $self = {}; | 
| 47 | 0 |  |  |  |  |  | my $class = shift; | 
| 48 | 0 | 0 |  |  |  |  | return undef if(ref $class); | 
| 49 | 0 |  |  |  |  |  | my $params = shift; | 
| 50 |  |  |  |  |  |  |  | 
| 51 |  |  |  |  |  |  | #bless this object | 
| 52 | 0 |  |  |  |  |  | bless $self, $class; | 
| 53 | 0 |  |  |  |  |  | $self->_init($params); | 
| 54 |  |  |  |  |  |  |  | 
| 55 | 0 |  |  |  |  |  | $uniParams{'debug'} = $self->{debug}; | 
| 56 | 0 |  |  |  |  |  | $uniSub = NanoB2B::UniversalRoutines->new(\%uniParams); | 
| 57 |  |  |  |  |  |  |  | 
| 58 | 0 |  |  |  |  |  | return $self; | 
| 59 |  |  |  |  |  |  | } | 
| 60 |  |  |  |  |  |  | #  method to initialize the NanoB2B::NER::Metaman object. | 
| 61 |  |  |  |  |  |  | #  input : $parameters <- reference to a hash | 
| 62 |  |  |  |  |  |  | #  output: | 
| 63 |  |  |  |  |  |  | sub _init { | 
| 64 | 0 |  |  | 0 |  |  | my $self = shift; | 
| 65 | 0 |  |  |  |  |  | my $params = shift; | 
| 66 |  |  |  |  |  |  |  | 
| 67 | 0 | 0 |  |  |  |  | $params = {} if(!defined $params); | 
| 68 |  |  |  |  |  |  |  | 
| 69 |  |  |  |  |  |  | #  get some of the parameters | 
| 70 | 0 |  |  |  |  |  | my $diroption = $params->{'directory'}; | 
| 71 | 0 |  |  |  |  |  | my $debugoption = $params->{'debug'}; | 
| 72 | 0 |  |  |  |  |  | my $fileindexoption = $params->{'fileIndex'}; | 
| 73 | 0 |  |  |  |  |  | my $metamapargoption = $params->{'metamap_arguments'}; | 
| 74 |  |  |  |  |  |  |  | 
| 75 |  |  |  |  |  |  | #set the global variables | 
| 76 | 0 | 0 |  |  |  |  | if(defined $debugoption){$debug = $debugoption;} | 
|  | 0 |  |  |  |  |  |  | 
| 77 | 0 | 0 |  |  |  |  | if(defined $diroption){$program_dir = $diroption;} | 
|  | 0 |  |  |  |  |  |  | 
| 78 | 0 | 0 |  |  |  |  | if(defined $fileindexoption){$fileIndex = $fileindexoption;} | 
|  | 0 |  |  |  |  |  |  | 
| 79 | 0 | 0 |  |  |  |  | if(defined $metamapargoption){$metamap_arguments = $metamapargoption;} | 
|  | 0 |  |  |  |  |  |  | 
| 80 |  |  |  |  |  |  | } | 
| 81 |  |  |  |  |  |  |  | 
| 82 |  |  |  |  |  |  | ####     TO THE METAMOBILE!    #### | 
| 83 |  |  |  |  |  |  |  | 
| 84 |  |  |  |  |  |  | # imports the data, cleans the lines, runs through metamap, and exports the results to a file | 
| 85 |  |  |  |  |  |  | # input  : $file <-- name of the file to run through metamap | 
| 86 |  |  |  |  |  |  | # output : | 
| 87 |  |  |  |  |  |  | sub meta_file{ | 
| 88 | 0 |  |  | 0 | 0 |  | my $self = shift; | 
| 89 | 0 |  |  |  |  |  | my $file = shift; | 
| 90 |  |  |  |  |  |  |  | 
| 91 |  |  |  |  |  |  | #define and reset temp var | 
| 92 | 0 |  |  |  |  |  | my $indexer = 0; | 
| 93 | 0 |  |  |  |  |  | %metamapHash = (); | 
| 94 |  |  |  |  |  |  |  | 
| 95 |  |  |  |  |  |  | #get the name of the file | 
| 96 | 0 |  |  |  |  |  | my @n = split '/', $file; | 
| 97 | 0 |  |  |  |  |  | my $l = @n; | 
| 98 | 0 |  |  |  |  |  | my $filename = $n[$l - 1]; | 
| 99 | 0 |  |  |  |  |  | $filename = lc($filename); | 
| 100 |  |  |  |  |  |  |  | 
| 101 |  |  |  |  |  |  | #import the data from the file | 
| 102 | 0 |  |  |  |  |  | my $FILE; | 
| 103 | 0 | 0 |  |  |  |  | open ($FILE, "$program_dir/$file") || die ("what is this 'dir/file' you speak of?\n"); | 
| 104 | 0 |  |  |  |  |  | my @fileLines = <$FILE>; | 
| 105 | 0 |  |  |  |  |  | foreach my $l(@fileLines){ | 
| 106 | 0 |  |  |  |  |  | $l = lc($l); | 
| 107 |  |  |  |  |  |  | } | 
| 108 | 0 |  |  |  |  |  | $uniSub->printColorDebug("on_red", "$filename"); | 
| 109 |  |  |  |  |  |  |  | 
| 110 |  |  |  |  |  |  | #get the total num of lines | 
| 111 | 0 |  |  |  |  |  | my $totalLines = 0; | 
| 112 | 0 |  |  |  |  |  | $totalLines = @fileLines; | 
| 113 | 0 |  |  |  |  |  | $uniSub->printColorDebug("red", "Lines: $totalLines\n"); | 
| 114 |  |  |  |  |  |  |  | 
| 115 |  |  |  |  |  |  | #clean it up for two separate sets | 
| 116 | 0 |  |  |  |  |  | my @cleanLines = untagSet($filename, \@fileLines); | 
| 117 |  |  |  |  |  |  |  | 
| 118 |  |  |  |  |  |  | #metamap all the lines --> metamaphash | 
| 119 | 0 |  |  |  |  |  | $uniSub->printColorDebug("blue", "*Metamapping the lines into a hashtable....\n"); | 
| 120 | 0 |  |  |  |  |  | $indexer = 0; | 
| 121 | 0 |  |  |  |  |  | my $total = @cleanLines; | 
| 122 | 0 |  |  |  |  |  | foreach my $line (@cleanLines){ | 
| 123 |  |  |  |  |  |  | #printColorDebug("on_blue", "LINE: $line\n"); | 
| 124 | 0 |  |  |  |  |  | my $lnnum = $indexer + 1; | 
| 125 | 0 |  |  |  |  |  | $uniSub->printColorDebug("green", "$filename - MM Line $lnnum / $total...\n"); | 
| 126 | 0 |  |  |  |  |  | my $mm = metaLine($self, $line, $filename); | 
| 127 | 0 |  |  |  |  |  | $metamapHash{$indexer} = $mm; | 
| 128 | 0 |  |  |  |  |  | $indexer++; | 
| 129 |  |  |  |  |  |  | } | 
| 130 |  |  |  |  |  |  |  | 
| 131 |  |  |  |  |  |  | #export the metamap data to a separate file | 
| 132 | 0 |  |  |  |  |  | exportMetaData($filename); | 
| 133 |  |  |  |  |  |  | } | 
| 134 |  |  |  |  |  |  |  | 
| 135 |  |  |  |  |  |  | ####      CLEANS THE LINE     #### | 
| 136 |  |  |  |  |  |  |  | 
| 137 |  |  |  |  |  |  | # cleans the line without getting rid of tags | 
| 138 |  |  |  |  |  |  | # input  : $input <-- the line to clean | 
| 139 |  |  |  |  |  |  | # output : | 
| 140 |  |  |  |  |  |  | sub cleanWords{ | 
| 141 | 0 |  |  | 0 | 0 |  | my $input = shift; | 
| 142 |  |  |  |  |  |  |  | 
| 143 | 0 |  |  |  |  |  | $input =~ s/[^a-zA-z0-9\:\.\s<>\*\/\,]/ /g; 	#get rid of non-ascii | 
| 144 | 0 |  |  |  |  |  | $input =~ s/([0-9]+(\.[0-9]*)?)-[0-9]+(\.[0-9]*)?/RANGE/g;		#get rid of range num (#-#) | 
| 145 | 0 |  |  |  |  |  | $input =~ s/[0-9]+\.?[0-9]+/NUM/g;				#get rid of normal num (# or #.#) | 
| 146 | 0 |  |  |  |  |  | $input =~ s/\s?=\s?/eq/g;						#get rid of = | 
| 147 | 0 |  |  |  |  |  | $input =~ s///g;					#get rid of | 
| 148 | 0 |  |  |  |  |  | $input =~ s/[\*\/]//g;							#get rid of * | 
| 149 |  |  |  |  |  |  | #$input =~ s/[,\)\(\\\'\/\=\*\-]/ /g; | 
| 150 | 0 |  |  |  |  |  | $input =~ s/\s\+/_/g;							#get rid of _+ space | 
| 151 | 0 |  |  |  |  |  | $input =~ s/\s+\.\s+/ /g;						#get rid of _._ periods | 
| 152 | 0 |  |  |  |  |  | $input =~ s/\.\s+/ /g;							#get rid of ._ space | 
| 153 | 0 |  |  |  |  |  | $input =~ s/\s+/ /g;							#get rid of excessive blank space | 
| 154 | 0 |  |  |  |  |  | return $input; | 
| 155 |  |  |  |  |  |  | } | 
| 156 |  |  |  |  |  |  |  | 
| 157 |  |  |  |  |  |  | # returns clean line with no tags or retaggings | 
| 158 |  |  |  |  |  |  | # input  : $line  <-- the line to remove the tags from | 
| 159 |  |  |  |  |  |  | #		   $id    <-- the entity tag name (ex. ) | 
| 160 |  |  |  |  |  |  | # output : $input <-- the line untagged | 
| 161 |  |  |  |  |  |  | sub untag{ | 
| 162 | 0 |  |  | 0 | 0 |  | my $line = shift; | 
| 163 | 0 |  |  |  |  |  | my $id = shift; | 
| 164 |  |  |  |  |  |  |  | 
| 165 | 0 |  |  |  |  |  | my $input = lc($line); | 
| 166 | 0 |  |  |  |  |  | $id = lc($id); | 
| 167 | 0 |  |  |  |  |  | $input =~ s/ //g; | 
| 168 | 0 |  |  |  |  |  | $input =~ s/ //g; | 
| 169 | 0 |  |  |  |  |  | $input =~ s/ //g; | 
| 170 | 0 |  |  |  |  |  | $input = cleanWords($input); | 
| 171 | 0 |  |  |  |  |  | return $input; | 
| 172 |  |  |  |  |  |  | } | 
| 173 |  |  |  |  |  |  |  | 
| 174 |  |  |  |  |  |  | # returns a clean set of lines | 
| 175 |  |  |  |  |  |  | # input  : $filename  <-- the name of the file/tag for the entities | 
| 176 |  |  |  |  |  |  | #		   @lines     <-- the set of lines from the file | 
| 177 |  |  |  |  |  |  | # output : @clean_set <-- the line set untagged | 
| 178 |  |  |  |  |  |  | sub untagSet{ | 
| 179 | 0 |  |  | 0 | 0 |  | my $filename = shift; | 
| 180 | 0 |  |  |  |  |  | my $lines_ref = shift; | 
| 181 | 0 |  |  |  |  |  | my @lines = @$lines_ref; | 
| 182 |  |  |  |  |  |  |  | 
| 183 | 0 |  |  |  |  |  | my @clean_set = (); | 
| 184 | 0 |  |  |  |  |  | foreach my $line(@lines){ | 
| 185 | 0 |  |  |  |  |  | my $cl = untag($line, $filename); | 
| 186 | 0 |  |  |  |  |  | push @clean_set, $cl; | 
| 187 |  |  |  |  |  |  | } | 
| 188 | 0 |  |  |  |  |  | return @clean_set; | 
| 189 |  |  |  |  |  |  | } | 
| 190 |  |  |  |  |  |  |  | 
| 191 |  |  |  |  |  |  | ######    METAMAPS THE LINE   ###### | 
| 192 |  |  |  |  |  |  | #metamaps a single line | 
| 193 |  |  |  |  |  |  | # input  : $line     <-- the line to run through metamap | 
| 194 |  |  |  |  |  |  | #		   $name     <-- the name of the file/tag for the entities | 
| 195 |  |  |  |  |  |  | # output : $meta     <-- the metamap output for the line | 
| 196 |  |  |  |  |  |  | sub metaLine{ | 
| 197 | 0 |  |  | 0 | 0 |  | my $self = shift; | 
| 198 | 0 |  |  |  |  |  | my $line = shift; | 
| 199 | 0 |  |  |  |  |  | my $name = shift; | 
| 200 |  |  |  |  |  |  |  | 
| 201 |  |  |  |  |  |  | #make a makeshift file to put the line | 
| 202 | 0 |  |  |  |  |  | open IN, ">", "input" || die ("No input file...$!"); | 
| 203 | 0 |  |  |  |  |  | my $clean_line = untag($line, $name); | 
| 204 | 0 |  |  |  |  |  | chomp($clean_line); | 
| 205 |  |  |  |  |  |  |  | 
| 206 | 0 |  |  |  |  |  | print IN "$clean_line\n"; | 
| 207 | 0 |  |  |  |  |  | close IN; | 
| 208 |  |  |  |  |  |  |  | 
| 209 |  |  |  |  |  |  | #analyze using nlm's program | 
| 210 | 0 |  |  |  |  |  | my $meta = `metamap $metamap_arguments < input`; | 
| 211 | 0 |  |  |  |  |  | return $meta; | 
| 212 |  |  |  |  |  |  | } | 
| 213 |  |  |  |  |  |  |  | 
| 214 |  |  |  |  |  |  | #metamaps an entire set of lines | 
| 215 |  |  |  |  |  |  | # input  : $name     <-- the name of the file/tag for the entities | 
| 216 |  |  |  |  |  |  | #		   @lines    <-- the set of lines to run through metamap | 
| 217 |  |  |  |  |  |  | # output : @set      <-- the set of metamapped lines | 
| 218 |  |  |  |  |  |  | sub metaSet{ | 
| 219 | 0 |  |  | 0 | 0 |  | my $name = shift; | 
| 220 | 0 |  |  |  |  |  | my $lines_ref = shift; | 
| 221 | 0 |  |  |  |  |  | my @lines = @$lines_ref; | 
| 222 |  |  |  |  |  |  |  | 
| 223 | 0 |  |  |  |  |  | my @set = (); | 
| 224 | 0 |  |  |  |  |  | foreach my $l (@lines){ | 
| 225 | 0 |  |  |  |  |  | my $ml = metaLine($l, $name); | 
| 226 | 0 |  |  |  |  |  | push @set, $ml; | 
| 227 |  |  |  |  |  |  | } | 
| 228 | 0 |  |  |  |  |  | return @set; | 
| 229 |  |  |  |  |  |  | } | 
| 230 |  |  |  |  |  |  |  | 
| 231 |  |  |  |  |  |  | #exports metamap hashtable data by printing it to a file | 
| 232 |  |  |  |  |  |  | # input  : $name     <-- the name of the file/tag for the entities | 
| 233 |  |  |  |  |  |  | # output : (META)    <-- a file with the metamap data stored in _META of the files' directory | 
| 234 |  |  |  |  |  |  | sub exportMetaData{ | 
| 235 | 0 |  |  | 0 | 0 |  | my $name = shift; | 
| 236 |  |  |  |  |  |  |  | 
| 237 | 0 |  |  |  |  |  | my $META; | 
| 238 |  |  |  |  |  |  | #create a directory to save hashtable data | 
| 239 | 0 | 0 |  |  |  |  | if($program_dir ne ""){ | 
| 240 | 0 |  |  |  |  |  | my $subdir = "_METAMAPS"; | 
| 241 | 0 |  |  |  |  |  | $uniSub->make_path("$program_dir/$subdir"); | 
| 242 | 0 | 0 |  |  |  |  | open($META, ">", ("$program_dir/$subdir/" . $name . "_meta")) || die ("Um..."); | 
| 243 |  |  |  |  |  |  | }else{ | 
| 244 | 0 | 0 |  |  |  |  | open($META, ">", ($name . "_meta")) || die ("Agh!"); | 
| 245 |  |  |  |  |  |  | } | 
| 246 |  |  |  |  |  |  |  | 
| 247 |  |  |  |  |  |  | #print metamap data to the file | 
| 248 | 0 |  |  |  |  |  | foreach my $key (sort { $a <=> $b } keys %metamapHash){ | 
|  | 0 |  |  |  |  |  |  | 
| 249 | 0 |  |  |  |  |  | my $mm = $metamapHash{$key}; | 
| 250 | 0 |  |  |  |  |  | $uniSub->print2File($META, $mm); | 
| 251 |  |  |  |  |  |  | } | 
| 252 | 0 |  |  |  |  |  | close $META; | 
| 253 |  |  |  |  |  |  | } | 
| 254 |  |  |  |  |  |  |  | 
| 255 |  |  |  |  |  |  | 1; |