| line | stmt | bran | cond | sub | pod | time | code | 
| 1 |  |  |  |  |  |  | #!/usr/bin/perl | 
| 2 |  |  |  |  |  |  | # NanoB2B-NER::NER::Arffman | 
| 3 |  |  |  |  |  |  | # | 
| 4 |  |  |  |  |  |  | # Creates ARFF files from annotated files | 
| 5 |  |  |  |  |  |  | # Version 1.5 | 
| 6 |  |  |  |  |  |  | # | 
| 7 |  |  |  |  |  |  | # Program by Milk | 
| 8 |  |  |  |  |  |  |  | 
| 9 |  |  |  |  |  |  | package NanoB2B::NER::Arffman; | 
| 10 |  |  |  |  |  |  |  | 
| 11 | 1 |  |  | 1 |  | 9 | use NanoB2B::UniversalRoutines; | 
|  | 1 |  |  |  |  | 3 |  | 
|  | 1 |  |  |  |  | 38 |  | 
| 12 | 1 |  |  | 1 |  | 9 | use MetaMap::DataStructures; | 
|  | 1 |  |  |  |  | 3 |  | 
|  | 1 |  |  |  |  | 35 |  | 
| 13 | 1 |  |  | 1 |  | 8 | use File::Path qw(make_path);			#makes sub directories | 
|  | 1 |  |  |  |  | 2 |  | 
|  | 1 |  |  |  |  | 81 |  | 
| 14 | 1 |  |  | 1 |  | 9 | use List::MoreUtils qw(uniq); | 
|  | 1 |  |  |  |  | 3 |  | 
|  | 1 |  |  |  |  | 13 |  | 
| 15 |  |  |  |  |  |  |  | 
| 16 | 1 |  |  | 1 |  | 756 | use strict; | 
|  | 1 |  |  |  |  | 3 |  | 
|  | 1 |  |  |  |  | 32 |  | 
| 17 | 1 |  |  | 1 |  | 8 | use warnings; | 
|  | 1 |  |  |  |  | 3 |  | 
|  | 1 |  |  |  |  | 10006 |  | 
| 18 |  |  |  |  |  |  |  | 
| 19 |  |  |  |  |  |  | #option variables | 
| 20 |  |  |  |  |  |  | my $debug = 1; | 
| 21 |  |  |  |  |  |  | my $program_dir = ""; | 
| 22 |  |  |  |  |  |  | my $fileIndex = 0; | 
| 23 |  |  |  |  |  |  | my $stopwords_file; | 
| 24 |  |  |  |  |  |  | my $prefix = 3; | 
| 25 |  |  |  |  |  |  | my $suffix = 3; | 
| 26 |  |  |  |  |  |  | my $bucketsNum = 10; | 
| 27 |  |  |  |  |  |  | my $is_cui = 0; | 
| 28 |  |  |  |  |  |  | my $sparse_matrix = 0; | 
| 29 |  |  |  |  |  |  | my $wcs = ""; | 
| 30 |  |  |  |  |  |  |  | 
| 31 |  |  |  |  |  |  | #datastructure object | 
| 32 |  |  |  |  |  |  | my %params = (); | 
| 33 |  |  |  |  |  |  | my $dataStructures = MetaMap::DataStructures->new(\%params); | 
| 34 |  |  |  |  |  |  |  | 
| 35 |  |  |  |  |  |  | #universal subroutines object | 
| 36 |  |  |  |  |  |  | my %uniParams = (); | 
| 37 |  |  |  |  |  |  | my $uniSub; | 
| 38 |  |  |  |  |  |  |  | 
| 39 |  |  |  |  |  |  | #other general global variables | 
| 40 |  |  |  |  |  |  | my @allBuckets; | 
| 41 |  |  |  |  |  |  | my %fileHash; | 
| 42 |  |  |  |  |  |  | my %metamapHash; | 
| 43 |  |  |  |  |  |  | my %tokenHash; | 
| 44 |  |  |  |  |  |  | my %conceptHash; | 
| 45 |  |  |  |  |  |  | my %posHash; | 
| 46 |  |  |  |  |  |  | my %semHash; | 
| 47 |  |  |  |  |  |  | my %cuiHash; | 
| 48 |  |  |  |  |  |  | my %orthoHash; | 
| 49 |  |  |  |  |  |  | my @features; | 
| 50 |  |  |  |  |  |  | my $selfId = "_self"; | 
| 51 |  |  |  |  |  |  | my $entId = "_e"; | 
| 52 |  |  |  |  |  |  | my $morphID = "_m"; | 
| 53 |  |  |  |  |  |  |  | 
| 54 |  |  |  |  |  |  | my $stopRegex; | 
| 55 |  |  |  |  |  |  |  | 
| 56 |  |  |  |  |  |  | ####      A HERO IS BORN     #### | 
| 57 |  |  |  |  |  |  |  | 
| 58 |  |  |  |  |  |  | # construction method to create a new Arffman object | 
| 59 |  |  |  |  |  |  | # input  : $directory     <-- the name of the directory for the files | 
| 60 |  |  |  |  |  |  | #		   $name 		  <-- name of the file to examine | 
| 61 |  |  |  |  |  |  | #		   $features      <-- the list of features to use [e.g. "ortho morph text pos cui sem"] | 
| 62 |  |  |  |  |  |  | #		   $bucketsNum    <-- the number of buckets to use for k-fold cross validation | 
| 63 |  |  |  |  |  |  | #		   \$debug         <-- run the program with debug print statements | 
| 64 |  |  |  |  |  |  | #		   \$prefix        <-- the number of letters to look at the beginning of each word | 
| 65 |  |  |  |  |  |  | #		   \$suffix        <-- the number of letters to look at the end of each word | 
| 66 |  |  |  |  |  |  | #		   \$index         <-- the index to start metamapping from in the set of files | 
| 67 |  |  |  |  |  |  | #		   \$no_stopwords  <-- exclude examining stop words [imported from the stop word list] | 
| 68 |  |  |  |  |  |  | # output : $self          <-- an instance of the Arffman object | 
| 69 |  |  |  |  |  |  | sub new { | 
| 70 |  |  |  |  |  |  | #grab class and parameters | 
| 71 | 0 |  |  | 0 | 0 |  | my $self = {}; | 
| 72 | 0 |  |  |  |  |  | my $class = shift; | 
| 73 | 0 | 0 |  |  |  |  | return undef if(ref $class); | 
| 74 | 0 |  |  |  |  |  | my $params = shift; | 
| 75 |  |  |  |  |  |  |  | 
| 76 |  |  |  |  |  |  | #reset all arrays and hashes | 
| 77 | 0 |  |  |  |  |  | @allBuckets = (); | 
| 78 | 0 |  |  |  |  |  | %fileHash = (); | 
| 79 | 0 |  |  |  |  |  | %metamapHash = (); | 
| 80 | 0 |  |  |  |  |  | %tokenHash = (); | 
| 81 | 0 |  |  |  |  |  | %conceptHash = (); | 
| 82 | 0 |  |  |  |  |  | %posHash = (); | 
| 83 | 0 |  |  |  |  |  | %semHash = (); | 
| 84 | 0 |  |  |  |  |  | %cuiHash = (); | 
| 85 | 0 |  |  |  |  |  | %orthoHash = (); | 
| 86 | 0 |  |  |  |  |  | @features = (); | 
| 87 |  |  |  |  |  |  |  | 
| 88 |  |  |  |  |  |  | #bless this object | 
| 89 | 0 |  |  |  |  |  | bless $self, $class; | 
| 90 | 0 |  |  |  |  |  | $self->_init($params); | 
| 91 | 0 |  |  |  |  |  | @allBuckets = (1..$bucketsNum); | 
| 92 |  |  |  |  |  |  |  | 
| 93 |  |  |  |  |  |  | #retrieve parameters for universal-routines | 
| 94 | 0 |  |  |  |  |  | $uniParams{'debug'} = $debug; | 
| 95 | 0 |  |  |  |  |  | $uniSub = NanoB2B::UniversalRoutines->new(\%uniParams); | 
| 96 |  |  |  |  |  |  |  | 
| 97 |  |  |  |  |  |  | #return the object | 
| 98 | 0 |  |  |  |  |  | return $self; | 
| 99 |  |  |  |  |  |  | } | 
| 100 |  |  |  |  |  |  |  | 
| 101 |  |  |  |  |  |  | #  method to initialize the NanoB2B::NER::Arffman object. | 
| 102 |  |  |  |  |  |  | #  input : $parameters <- reference to a hash | 
| 103 |  |  |  |  |  |  | #  output: | 
| 104 |  |  |  |  |  |  | sub _init { | 
| 105 | 0 |  |  | 0 |  |  | my $self = shift; | 
| 106 | 0 |  |  |  |  |  | my $params = shift; | 
| 107 |  |  |  |  |  |  |  | 
| 108 | 0 | 0 |  |  |  |  | $params = {} if(!defined $params); | 
| 109 |  |  |  |  |  |  |  | 
| 110 |  |  |  |  |  |  | #  get some of the parameters | 
| 111 | 0 |  |  |  |  |  | my $diroption = $params->{'directory'}; | 
| 112 | 0 |  |  |  |  |  | my $ftsoption = $params->{'features'}; | 
| 113 | 0 |  |  |  |  |  | my $bucketsNumoption = $params->{'bucketsNum'}; | 
| 114 | 0 |  |  |  |  |  | my $debugoption = $params->{'debug'}; | 
| 115 | 0 |  |  |  |  |  | my $prefixoption = $params->{'prefix'}; | 
| 116 | 0 |  |  |  |  |  | my $suffixoption = $params->{'suffix'}; | 
| 117 | 0 |  |  |  |  |  | my $indexoption = $params->{'index'}; | 
| 118 | 0 |  |  |  |  |  | my $stopwordoption = $params->{'stopwords'}; | 
| 119 | 0 |  |  |  |  |  | my $iscuioption = $params->{'is_cui'}; | 
| 120 | 0 |  |  |  |  |  | my $sparsematrixoption = $params->{'sparse_matrix'}; | 
| 121 | 0 |  |  |  |  |  | my $wcsoption = $params->{'wcs'}; | 
| 122 |  |  |  |  |  |  |  | 
| 123 |  |  |  |  |  |  | #set the global variables | 
| 124 | 0 | 0 |  |  |  |  | if(defined $debugoption){$debug = $debugoption;} | 
|  | 0 |  |  |  |  |  |  | 
| 125 | 0 | 0 |  |  |  |  | if(defined $diroption){$program_dir = $diroption;} | 
|  | 0 |  |  |  |  |  |  | 
| 126 | 0 | 0 |  |  |  |  | if(defined $indexoption){$fileIndex = $indexoption;} | 
|  | 0 |  |  |  |  |  |  | 
| 127 | 0 | 0 |  |  |  |  | if(defined $stopwordoption){$stopwords_file = $stopwordoption;} | 
|  | 0 |  |  |  |  |  |  | 
| 128 | 0 | 0 |  |  |  |  | if(defined $iscuioption){$is_cui = $iscuioption;} | 
|  | 0 |  |  |  |  |  |  | 
| 129 | 0 | 0 |  |  |  |  | if(defined $sparsematrixoption){$sparse_matrix = $sparsematrixoption;} | 
|  | 0 |  |  |  |  |  |  | 
| 130 | 0 | 0 |  |  |  |  | if(defined $prefixoption){$prefix = $prefixoption;} | 
|  | 0 |  |  |  |  |  |  | 
| 131 | 0 | 0 |  |  |  |  | if(defined $suffixoption){$suffix = $suffixoption;} | 
|  | 0 |  |  |  |  |  |  | 
| 132 | 0 | 0 |  |  |  |  | if(defined $wcsoption){$wcs = $wcsoption;} | 
|  | 0 |  |  |  |  |  |  | 
| 133 | 0 | 0 |  |  |  |  | if(defined $bucketsNumoption){$bucketsNum = $bucketsNumoption;} | 
|  | 0 |  |  |  |  |  |  | 
| 134 | 0 | 0 |  |  |  |  | if(defined $ftsoption){@features = split(' ', $ftsoption);} | 
|  | 0 |  |  |  |  |  |  | 
| 135 |  |  |  |  |  |  | } | 
| 136 |  |  |  |  |  |  |  | 
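
The constructor above only stores options; the real work happens in `arff_file`. A minimal usage sketch, assuming the module is installed, with placeholder directory/file names and the parameter keys read in `_init` (not taken from this coverage run):

```perl
use strict;
use warnings;
use NanoB2B::NER::Arffman;

# parameter keys mirror those read in _init; the values are placeholders
my %params = (
	directory  => "my_data_dir",                   # hypothetical directory of annotated files
	features   => "ortho morph text pos cui sem",  # feature string, split on spaces
	bucketsNum => 10,                              # k for k-fold cross validation
	debug      => 1,
	prefix     => 3,
	suffix     => 3,
);

my $arffman = NanoB2B::NER::Arffman->new(\%params);

# build the train/test ARFF files for one annotated file (placeholder name)
$arffman->arff_file("my_annotated_file");
```
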
| 137 |  |  |  |  |  |  |  | 
| 138 |  |  |  |  |  |  | #######		ARFFMAN AND THE METHODS OF MADNESS		##### | 
| 139 |  |  |  |  |  |  |  | 
| 140 |  |  |  |  |  |  |  | 
| 141 |  |  |  |  |  |  | # opens a single file and runs it through the process of creating buckets, | 
| 142 |  |  |  |  |  |  | # extracting tokens and concepts, and creating arff files based on the features given | 
| 143 |  |  |  |  |  |  | # input  : $file <-- the name of the file to make into arff files | 
| 144 |  |  |  |  |  |  | # output : a set of arff files | 
| 145 |  |  |  |  |  |  | sub arff_file{ | 
| 146 | 0 |  |  | 0 | 0 |  | my $self = shift; | 
| 147 | 0 |  |  |  |  |  | my $file = shift; | 
| 148 |  |  |  |  |  |  |  | 
| 149 |  |  |  |  |  |  | #define and reset temp var | 
| 150 | 0 |  |  |  |  |  | my $indexer = 0; | 
| 151 | 0 |  |  |  |  |  | %fileHash = (); | 
| 152 | 0 |  |  |  |  |  | %metamapHash = (); | 
| 153 | 0 |  |  |  |  |  | %tokenHash = (); | 
| 154 | 0 |  |  |  |  |  | %conceptHash = (); | 
| 155 | 0 |  |  |  |  |  | %posHash = (); | 
| 156 | 0 |  |  |  |  |  | %semHash = (); | 
| 157 | 0 |  |  |  |  |  | %cuiHash = (); | 
| 158 |  |  |  |  |  |  |  | 
| 159 |  |  |  |  |  |  | #get the name of the file | 
| 160 | 0 |  |  |  |  |  | my @n = split '/', $file; | 
| 161 | 0 |  |  |  |  |  | my $l = @n; | 
| 162 | 0 |  |  |  |  |  | my $filename = $n[$l - 1]; | 
| 163 | 0 |  |  |  |  |  | $filename = lc($filename); | 
| 164 |  |  |  |  |  |  |  | 
| 165 | 0 |  |  |  |  |  | my $FILE; | 
| 166 | 0 | 0 |  |  |  |  | open ($FILE, "$program_dir/$file") || die ("what is this '$program_dir/$filename' you speak of?\n"); | 
| 167 | 0 |  |  |  |  |  | my @fileLines = <$FILE>; | 
| 168 | 0 |  |  |  |  |  | my @orthoLines = @fileLines; | 
| 169 |  |  |  |  |  |  | #my @orthoLines = ["Hi! I'm Milk", "I have a hamster named Scott", "I like pizza"]; | 
| 170 | 0 |  |  |  |  |  | foreach my $l(@fileLines){ | 
| 171 | 0 |  |  |  |  |  | $l = lc($l); | 
| 172 |  |  |  |  |  |  | } | 
| 173 | 0 |  |  |  |  |  | $uniSub->printColorDebug("on_red", "$filename"); | 
| 174 |  |  |  |  |  |  | #$uniSub->printColorDebug("on_cyan", "*** $wcs ***"); | 
| 175 |  |  |  |  |  |  |  | 
| 176 |  |  |  |  |  |  | #get the total num of lines | 
| 177 | 0 |  |  |  |  |  | my $totalLines = 0; | 
| 178 | 0 |  |  |  |  |  | $totalLines = @fileLines; | 
| 179 | 0 |  |  |  |  |  | $uniSub->printColorDebug("red", "Lines: $totalLines\n"); | 
| 180 |  |  |  |  |  |  |  | 
| 181 |  |  |  |  |  |  | #clean it up for two separate sets | 
| 182 | 0 |  |  |  |  |  | my @tagSet = retagSet($filename, \@fileLines); | 
| 183 | 0 |  |  |  |  |  | my @cleanLines = untagSet($filename, \@fileLines); | 
| 184 |  |  |  |  |  |  |  | 
| 185 |  |  |  |  |  |  | #get the orthographic based lines | 
| 186 |  |  |  |  |  |  | #my @orthoLines = ; | 
| 187 | 0 |  |  |  |  |  | @orthoLines = retagSetOrtho(\@orthoLines); | 
| 188 |  |  |  |  |  |  |  | 
| 189 |  |  |  |  |  |  | #$uniSub->printColorDebug("red", "TAG SET: "); | 
| 190 |  |  |  |  |  |  | #$uniSub->printArr(", ", \@tagSet); | 
| 191 |  |  |  |  |  |  |  | 
| 192 |  |  |  |  |  |  | #######     ASSIGN THE VALUES TO HASHTABLES TO KEEP TRACK OF THEM    ####### | 
| 193 |  |  |  |  |  |  |  | 
| 194 |  |  |  |  |  |  | #put all the lines in a file hash | 
| 195 | 0 |  |  |  |  |  | $uniSub->printColorDebug("blue", "*Putting all the file lines into a hashtable....\n"); | 
| 196 | 0 |  |  |  |  |  | $indexer = 0; | 
| 197 | 0 |  |  |  |  |  | foreach my $line (@tagSet){ | 
| 198 | 0 |  |  |  |  |  | $fileHash{$indexer} = $line; | 
| 199 | 0 |  |  |  |  |  | $indexer++; | 
| 200 |  |  |  |  |  |  | } | 
| 201 |  |  |  |  |  |  |  | 
| 202 |  |  |  |  |  |  | #put the orthographic lines in a hash | 
| 203 | 0 |  |  |  |  |  | $indexer = 0; | 
| 204 | 0 |  |  |  |  |  | foreach my $line (@orthoLines){ | 
| 205 |  |  |  |  |  |  | #$uniSub->printColorDebug("red", "$line\n"); | 
| 206 | 0 |  |  |  |  |  | $orthoHash{$indexer} = $line; | 
| 207 | 0 |  |  |  |  |  | $indexer++; | 
| 208 |  |  |  |  |  |  | } | 
| 209 |  |  |  |  |  |  |  | 
| 210 |  |  |  |  |  |  | #import the hashtables from saved data | 
| 211 | 0 |  |  |  |  |  | importMetaData($filename); | 
| 212 |  |  |  |  |  |  |  | 
| 213 |  |  |  |  |  |  | #tokenize all the lines --> tokenhash | 
| 214 | 0 |  |  |  |  |  | $uniSub->printColorDebug("blue", "*Tokenizing the lines into a hashtable....\n"); | 
| 215 | 0 |  |  |  |  |  | $indexer = 0; | 
| 216 | 0 |  |  |  |  |  | my $totalTokens = 0; | 
| 217 | 0 |  |  |  |  |  | my $totalConcepts = 0; | 
| 218 | 0 |  |  |  |  |  | foreach my $line (@cleanLines){ | 
| 219 |  |  |  |  |  |  | #acquire the necessary variables | 
| 220 | 0 |  |  |  |  |  | my $special_ID = "$indexer.ti.1"; | 
| 221 | 0 |  |  |  |  |  | my $meta = $metamapHash{$indexer}; | 
| 222 |  |  |  |  |  |  |  | 
| 223 |  |  |  |  |  |  | #create citation first | 
| 224 | 0 |  |  |  |  |  | $dataStructures->createFromTextWithId($meta, $special_ID); | 
| 225 | 0 |  |  |  |  |  | my $citation = $dataStructures->getCitationWithId($special_ID); | 
| 226 |  |  |  |  |  |  |  | 
| 227 |  |  |  |  |  |  | #get tokens | 
| 228 | 0 |  |  |  |  |  | my @tokensOut = $citation->getOrderedTokens(); | 
| 229 |  |  |  |  |  |  | #double array - extract the inner one | 
| 230 | 0 |  |  |  |  |  | my @tokens = (); | 
| 231 | 0 |  |  |  |  |  | foreach my $tt (@tokensOut){ | 
| 232 | 0 |  |  |  |  |  | my @newSet = @$tt; | 
| 233 | 0 |  |  |  |  |  | push (@tokens, @newSet); | 
| 234 |  |  |  |  |  |  | } | 
| 235 | 0 |  |  |  |  |  | my $tnum = @tokens; | 
| 236 | 0 |  |  |  |  |  | $totalTokens += $tnum; | 
| 237 | 0 |  |  |  |  |  | push (@{$tokenHash{$indexer}}, @tokens); | 
|  | 0 |  |  |  |  |  |  | 
| 238 |  |  |  |  |  |  |  | 
| 239 |  |  |  |  |  |  | #get concepts | 
| 240 | 0 |  |  |  |  |  | my @conceptsOut = $citation->getOrderedConcepts(); | 
| 241 |  |  |  |  |  |  | #double array - extract the inner one | 
| 242 | 0 |  |  |  |  |  | my @concepts = (); | 
| 243 | 0 |  |  |  |  |  | foreach my $cc (@conceptsOut){ | 
| 244 | 0 |  |  |  |  |  | my @newSet = @$cc; | 
| 245 | 0 |  |  |  |  |  | push (@concepts, @newSet); | 
| 246 |  |  |  |  |  |  | } | 
| 247 | 0 |  |  |  |  |  | my $cnum = @concepts; | 
| 248 | 0 |  |  |  |  |  | $totalConcepts += $cnum; | 
| 249 | 0 |  |  |  |  |  | push (@{$conceptHash{$indexer}}, @concepts); | 
|  | 0 |  |  |  |  |  |  | 
| 250 |  |  |  |  |  |  |  | 
| 251 |  |  |  |  |  |  | #create the ordered POS lines | 
| 252 | 0 |  |  |  |  |  | my @posOrder = orderedTokenPOS($cleanLines[$indexer], \@tokens); | 
| 253 | 0 |  |  |  |  |  | my $posOrderLine = join " ", @posOrder; | 
| 254 | 0 |  |  |  |  |  | $posHash{$indexer} = $posOrderLine; | 
| 255 |  |  |  |  |  |  |  | 
| 256 |  |  |  |  |  |  | #create the ordered semantic type lines | 
| 257 | 0 |  |  |  |  |  | my @semantics = getConceptSets("sem", $line, \@concepts); | 
| 258 | 0 |  |  |  |  |  | my $semanticsLine = join " ", @semantics; | 
| 259 | 0 |  |  |  |  |  | $semHash{$indexer} = $semanticsLine; | 
| 260 |  |  |  |  |  |  |  | 
| 261 |  |  |  |  |  |  | #create the ordered cui lines | 
| 262 | 0 |  |  |  |  |  | my @cuis = getConceptSets("cui", $line, \@concepts); | 
| 263 | 0 |  |  |  |  |  | my $cuiLine = join " ", @cuis; | 
| 264 | 0 |  |  |  |  |  | $cuiHash{$indexer} = $cuiLine; | 
| 265 |  |  |  |  |  |  |  | 
| 266 |  |  |  |  |  |  |  | 
| 267 |  |  |  |  |  |  | #increment to the next set | 
| 268 | 0 |  |  |  |  |  | $indexer++; | 
| 269 |  |  |  |  |  |  | } | 
| 270 | 0 |  |  |  |  |  | $uniSub->printColorDebug("red", "TOKENS: $totalTokens\n"); | 
| 271 | 0 |  |  |  |  |  | $uniSub->printColorDebug("red", "CONCEPTS: $totalConcepts\n"); | 
| 272 |  |  |  |  |  |  |  | 
| 273 |  |  |  |  |  |  |  | 
| 274 |  |  |  |  |  |  | #######           BUCKET SORTING - TRAIN AND TEST DATA            ####### | 
| 275 |  |  |  |  |  |  |  | 
| 276 |  |  |  |  |  |  | #sort the lines to buckets | 
| 277 | 0 |  |  |  |  |  | $uniSub->printColorDebug("blue", "*Making buckets....\n"); | 
| 278 | 0 |  |  |  |  |  | my %buckets = (); | 
| 279 | 0 |  |  |  |  |  | %buckets = sort2Buckets($totalLines, $bucketsNum); | 
| 280 |  |  |  |  |  |  |  | 
| 281 | 0 |  |  |  |  |  | $uniSub->printColorDebug("blue", "*Making train and test files....\n"); | 
| 282 |  |  |  |  |  |  |  | 
| 283 | 0 |  |  |  |  |  | zhu_li($filename, \%buckets); | 
| 284 |  |  |  |  |  |  |  | 
| 285 | 0 |  |  |  |  |  | $uniSub->printDebug("\n"); | 
| 286 |  |  |  |  |  |  |  | 
| 287 |  |  |  |  |  |  | } | 
| 288 |  |  |  |  |  |  |  | 
| 289 |  |  |  |  |  |  |  | 
| 290 |  |  |  |  |  |  |  | 
| 291 |  |  |  |  |  |  | ######################              LINE MANIPULATION               ##################### | 
| 292 |  |  |  |  |  |  |  | 
| 293 |  |  |  |  |  |  | ######    RETAGS THE LINE    ###### | 
| 294 |  |  |  |  |  |  |  | 
| 295 |  |  |  |  |  |  | # turns the tagged entity words into special words with <> for the context words | 
| 296 |  |  |  |  |  |  | # input  : $input <-- the line to retag | 
| 297 |  |  |  |  |  |  | #		   $id    <-- the id within the tag to look for | 
| 298 |  |  |  |  |  |  | # output : $new_line <-- the retagged line | 
| 299 |  |  |  |  |  |  | sub retag{ | 
| 300 | 0 |  |  | 0 | 0 |  | my $input = shift; | 
| 301 | 0 |  |  |  |  |  | my $id = shift; | 
| 302 |  |  |  |  |  |  |  | 
| 303 | 0 |  |  |  |  |  | $id = lc($id); | 
| 304 | 0 |  |  |  |  |  | my $line = lc($input); | 
| 305 |  |  |  |  |  |  |  | 
| 306 |  |  |  |  |  |  | #get rid of any tags | 
| 307 | 0 |  |  |  |  |  | my @words = split (" ", $line); | 
| 308 | 0 |  |  |  |  |  | my @newSet = (); | 
| 309 | 0 |  |  |  |  |  | my $charact = 0; | 
| 310 | 0 |  |  |  |  |  | foreach my $word (@words){ | 
| 311 | 0 | 0 |  |  |  |  | if($charact){ | 
| 312 | 0 | 0 |  |  |  |  | if($word eq ""){ | 
| 313 | 0 |  |  |  |  |  | $charact = 0; | 
| 314 |  |  |  |  |  |  | }else{ | 
| 315 | 0 |  |  |  |  |  | my $charWord = "$word"."$entId"; | 
| 316 | 0 |  |  |  |  |  | push @newSet, $charWord; | 
| 317 |  |  |  |  |  |  | } | 
| 318 |  |  |  |  |  |  | }else{ | 
| 319 | 0 | 0 |  |  |  |  | if($word eq ""){ | 
| 320 | 0 |  |  |  |  |  | $charact = 1; | 
| 321 |  |  |  |  |  |  | }else{ | 
| 322 | 0 |  |  |  |  |  | push @newSet, $word; | 
| 323 |  |  |  |  |  |  | } | 
| 324 |  |  |  |  |  |  | } | 
| 325 |  |  |  |  |  |  | } | 
| 326 |  |  |  |  |  |  |  | 
| 327 |  |  |  |  |  |  | #clean up the new line | 
| 328 | 0 |  |  |  |  |  | my $new_line = join " ", @newSet; | 
| 329 | 0 |  |  |  |  |  | $new_line =~s/\b$entId\b//og; | 
| 330 | 0 |  |  |  |  |  | $new_line = $uniSub->cleanWords($new_line); | 
| 331 | 0 |  |  |  |  |  | return $new_line; | 
| 332 |  |  |  |  |  |  | } | 
| 333 |  |  |  |  |  |  |  | 
| 334 |  |  |  |  |  |  | # turns the tagged entity words in the entire file into special words with <> for the context words | 
| 335 |  |  |  |  |  |  | # input  : $name  <-- the name of the file to use as the id tag | 
| 336 |  |  |  |  |  |  | #		   @lines   <-- the set of lines to retag | 
| 337 |  |  |  |  |  |  | # output : @tagSet <-- set of retagged lines | 
| 338 |  |  |  |  |  |  | sub retagSet{ | 
| 339 | 0 |  |  | 0 | 0 |  | my $name = shift; | 
| 340 | 0 |  |  |  |  |  | my $lines_ref = shift; | 
| 341 | 0 |  |  |  |  |  | my @lines = @$lines_ref; | 
| 342 |  |  |  |  |  |  |  | 
| 343 | 0 |  |  |  |  |  | my @tagSet = (); | 
| 344 | 0 |  |  |  |  |  | foreach my $line (@lines){ | 
| 345 |  |  |  |  |  |  | #retag the line | 
| 346 | 0 |  |  |  |  |  | chomp($line); | 
| 347 | 0 |  |  |  |  |  | my $tag_line = retag($line, $name); | 
| 348 |  |  |  |  |  |  |  | 
| 349 |  |  |  |  |  |  | #add it to the set | 
| 350 | 0 |  |  |  |  |  | push @tagSet, $tag_line; | 
| 351 |  |  |  |  |  |  | } | 
| 352 | 0 |  |  |  |  |  | return @tagSet; | 
| 353 |  |  |  |  |  |  | } | 
| 354 |  |  |  |  |  |  |  | 
| 355 |  |  |  |  |  |  | #returns clean line with no tags or retaggings | 
| 356 |  |  |  |  |  |  | # input  : $line <-- the line to untag | 
| 357 |  |  |  |  |  |  | # 	     : $id   <-- the id label to look for | 
| 358 |  |  |  |  |  |  | # output : $input <-- untagged input line | 
| 359 |  |  |  |  |  |  | sub untag{ | 
| 360 | 0 |  |  | 0 | 0 |  | my $line = shift; | 
| 361 | 0 |  |  |  |  |  | my $id = shift; | 
| 362 |  |  |  |  |  |  |  | 
| 363 | 0 |  |  |  |  |  | my $input = lc($line); | 
| 364 | 0 |  |  |  |  |  | $id = lc($id); | 
| 365 | 0 |  |  |  |  |  | $input =~ s/ //og; | 
| 366 | 0 |  |  |  |  |  | $input =~ s/ //og; | 
| 367 | 0 |  |  |  |  |  | $input = $uniSub->cleanWords($input); | 
| 368 | 0 |  |  |  |  |  | return $input; | 
| 369 |  |  |  |  |  |  | } | 
| 370 |  |  |  |  |  |  | #returns a clean set of lines | 
| 371 |  |  |  |  |  |  | # input  : $filename <-- the name of the file for use in the id tag | 
| 372 |  |  |  |  |  |  | #		 : @lines    <-- the set of lines to untag | 
| 373 |  |  |  |  |  |  | # output : @clean_set  <-- untagged set of lines | 
| 374 |  |  |  |  |  |  | sub untagSet{ | 
| 375 | 0 |  |  | 0 | 0 |  | my $filename = shift; | 
| 376 | 0 |  |  |  |  |  | my $lines_ref = shift; | 
| 377 | 0 |  |  |  |  |  | my @lines = @$lines_ref; | 
| 378 |  |  |  |  |  |  |  | 
| 379 | 0 |  |  |  |  |  | my @clean_set = (); | 
| 380 | 0 |  |  |  |  |  | foreach my $line(@lines){ | 
| 381 | 0 |  |  |  |  |  | my $cl = untag($line, $filename); | 
| 382 | 0 |  |  |  |  |  | push @clean_set, $cl; | 
| 383 |  |  |  |  |  |  | } | 
| 384 | 0 |  |  |  |  |  | return @clean_set; | 
| 385 |  |  |  |  |  |  | } | 
| 386 |  |  |  |  |  |  |  | 
| 387 |  |  |  |  |  |  |  | 
| 388 |  |  |  |  |  |  | #import metamap hashtable data | 
| 389 |  |  |  |  |  |  | # input  : $name  <-- the name of the file to import from | 
| 390 |  |  |  |  |  |  | # output : (hashmap of metamap lines) | 
| 391 |  |  |  |  |  |  | sub importMetaData{ | 
| 392 | 0 |  |  | 0 | 0 |  | my $name = shift; | 
| 393 |  |  |  |  |  |  |  | 
| 394 |  |  |  |  |  |  | #create a directory to save hashtable data | 
| 395 | 0 |  |  |  |  |  | my $META; | 
| 396 | 0 |  |  |  |  |  | my $subdir = "_METAMAPS"; | 
| 397 | 0 | 0 |  |  |  |  | open($META, "<", ("$program_dir/$subdir/" . $name . "_meta")) || die ("HAHA No such thing!"); | 
| 398 |  |  |  |  |  |  |  | 
| 399 |  |  |  |  |  |  |  | 
| 400 |  |  |  |  |  |  | #import metamap data from the file | 
| 401 | 0 |  |  |  |  |  | my @metaLines = <$META>; | 
| 402 | 0 |  |  |  |  |  | my $metaCombo = join("", @metaLines); | 
| 403 | 0 |  |  |  |  |  | my @newMetaLines = split("\n\n", $metaCombo); | 
| 404 | 0 |  |  |  |  |  | my $t = @newMetaLines; | 
| 405 | 0 |  |  |  |  |  | $uniSub->printColorDebug("red", "META LINES: $t\n"); | 
| 406 | 0 |  |  |  |  |  | my $key = 0; | 
| 407 | 0 |  |  |  |  |  | foreach my $mm (@newMetaLines){ | 
| 408 | 0 |  |  |  |  |  | $metamapHash{$key} = $mm; | 
| 409 | 0 |  |  |  |  |  | $key++; | 
| 410 |  |  |  |  |  |  | } | 
| 411 | 0 |  |  |  |  |  | close $META; | 
| 412 |  |  |  |  |  |  | } | 
| 413 |  |  |  |  |  |  |  | 
| 414 |  |  |  |  |  |  | #####     FOR THE ORTHO SET     ##### | 
| 415 |  |  |  |  |  |  |  | 
| 416 |  |  |  |  |  |  | #turns the tagged entity words into special words with <> for the context words | 
| 417 |  |  |  |  |  |  | # input  : $input  <-- the line to retag | 
| 418 |  |  |  |  |  |  | # output : $new_line <-- the retagged line | 
| 419 |  |  |  |  |  |  | sub retagOrtho{ | 
| 420 | 0 |  |  | 0 | 0 |  | my $input = shift; | 
| 421 | 0 |  |  |  |  |  | my $line = $input; | 
| 422 |  |  |  |  |  |  |  | 
| 423 |  |  |  |  |  |  | #get rid of any tags | 
| 424 | 0 |  |  |  |  |  | my @words = split (" ", $line); | 
| 425 | 0 |  |  |  |  |  | my @newSet = (); | 
| 426 | 0 |  |  |  |  |  | my $charact = 0; | 
| 427 | 0 |  |  |  |  |  | foreach my $word (@words){ | 
| 428 | 0 | 0 |  |  |  |  | if($charact){ | 
| 429 | 0 | 0 |  |  |  |  | if($word =~//){ | 
| 430 | 0 |  |  |  |  |  | $charact = 0; | 
| 431 |  |  |  |  |  |  | }else{ | 
| 432 | 0 |  |  |  |  |  | my $charWord = "$word"."$entId"; | 
| 433 | 0 |  |  |  |  |  | push @newSet, $charWord; | 
| 434 |  |  |  |  |  |  | } | 
| 435 |  |  |  |  |  |  | }else{ | 
| 436 | 0 | 0 |  |  |  |  | if($word =~//g){ | 
| 437 | 0 |  |  |  |  |  | $charact = 1; | 
| 438 |  |  |  |  |  |  | }else{ | 
| 439 | 0 |  |  |  |  |  | push @newSet, $word; | 
| 440 |  |  |  |  |  |  | } | 
| 441 |  |  |  |  |  |  | } | 
| 442 |  |  |  |  |  |  | } | 
| 443 |  |  |  |  |  |  |  | 
| 444 |  |  |  |  |  |  | #clean up the new line | 
| 445 | 0 |  |  |  |  |  | my $new_line = join " ", @newSet; | 
| 446 | 0 |  |  |  |  |  | $new_line =~s/\b$entId\b//g; | 
| 447 | 0 |  |  |  |  |  | $new_line = noASCIIOrtho($new_line); | 
| 448 | 0 |  |  |  |  |  | return $new_line; | 
| 449 |  |  |  |  |  |  | } | 
| 450 |  |  |  |  |  |  | #turns the tagged entity words in the entire file into special words with <> for the context words | 
| 451 |  |  |  |  |  |  | # input  : @lines  <-- the set of lines to retag | 
| 452 |  |  |  |  |  |  | # output : @tagSet <-- the set of retagged lines | 
| 453 |  |  |  |  |  |  | sub retagSetOrtho{ | 
| 454 | 0 |  |  | 0 | 0 |  | my $lines_ref = shift; | 
| 455 | 0 |  |  |  |  |  | my @lines = @$lines_ref; | 
| 456 |  |  |  |  |  |  |  | 
| 457 | 0 |  |  |  |  |  | my @tagSet = (); | 
| 458 | 0 |  |  |  |  |  | foreach my $line (@lines){ | 
| 459 |  |  |  |  |  |  | #retag the line | 
| 460 | 0 |  |  |  |  |  | chomp($line); | 
| 461 | 0 |  |  |  |  |  | my $tag_line = retagOrtho($line); | 
| 462 |  |  |  |  |  |  |  | 
| 463 |  |  |  |  |  |  | #add it to the set | 
| 464 | 0 |  |  |  |  |  | push @tagSet, $tag_line; | 
| 465 |  |  |  |  |  |  | } | 
| 466 | 0 |  |  |  |  |  | return @tagSet; | 
| 467 |  |  |  |  |  |  | } | 
| 468 |  |  |  |  |  |  |  | 
| 469 |  |  |  |  |  |  | #cleans the line without getting rid of tags | 
| 470 |  |  |  |  |  |  | # input  : $line     <-- line to clean up | 
| 471 |  |  |  |  |  |  | # output : $new_in   <-- the cleaned line | 
| 472 |  |  |  |  |  |  | sub noASCIIOrtho{ | 
| 473 | 0 |  |  | 0 | 0 |  | my $line = shift; | 
| 474 |  |  |  |  |  |  |  | 
| 475 | 0 |  |  |  |  |  | my $new_in = $line; | 
| 476 | 0 |  |  |  |  |  | $new_in =~ s/[^[:ascii:]]//g; | 
| 477 | 0 |  |  |  |  |  | return $new_in | 
| 478 |  |  |  |  |  |  | } | 
| 479 |  |  |  |  |  |  |  | 
| 480 |  |  |  |  |  |  |  | 
| 481 |  |  |  |  |  |  | #######################      TOKENS AND CONCEPT MANIPULATION       ####################### | 
| 482 |  |  |  |  |  |  |  | 
| 483 |  |  |  |  |  |  |  | 
| 484 |  |  |  |  |  |  | #gets rid of any special tokens | 
| 485 |  |  |  |  |  |  | # input  : $text     <-- the token text to fix | 
| 486 |  |  |  |  |  |  | # output : $tokenText <-- a cleaned up token | 
| 487 |  |  |  |  |  |  | sub cleanToken{ | 
| 488 | 0 |  |  | 0 | 0 |  | my $text = shift; | 
| 489 |  |  |  |  |  |  |  | 
| 490 | 0 |  |  |  |  |  | my $tokenText = $text; | 
| 491 |  |  |  |  |  |  |  | 
| 492 |  |  |  |  |  |  | #fix "# . #" tokens | 
| 493 | 0 | 0 |  |  |  |  | if($tokenText =~ /\d+\s\.\s\d+/o){ | 
| 494 | 0 |  |  |  |  |  | $tokenText =~s/\s\.\s/\./og; | 
| 495 |  |  |  |  |  |  | } | 
| 496 |  |  |  |  |  |  |  | 
| 497 |  |  |  |  |  |  | #fix "__ \' __" tokens | 
| 498 | 0 | 0 |  |  |  |  | if($tokenText =~ /\w+\s\\\'\s\w+/o){ | 
| 499 | 0 |  |  |  |  |  | $tokenText =~s/\s\\\'\s//og; | 
| 500 |  |  |  |  |  |  | } | 
| 501 |  |  |  |  |  |  |  | 
| 502 | 0 | 0 |  |  |  |  | if($tokenText =~ /[^a-zA-Z0-9]/o){ | 
| 503 | 0 |  |  |  |  |  | $tokenText = ""; | 
| 504 |  |  |  |  |  |  | } | 
| 505 |  |  |  |  |  |  |  | 
| 506 | 0 |  |  |  |  |  | return $tokenText; | 
| 507 |  |  |  |  |  |  | } | 
| 508 |  |  |  |  |  |  |  | 
| 509 |  |  |  |  |  |  | #grabs the part-of-speech part of the token that's matched up with the bucket tokens | 
| 510 |  |  |  |  |  |  | # input  : @buktTokens     <-- the set of tokens from the specific bucket[s] | 
| 511 |  |  |  |  |  |  | # output : @posTokens 	   <-- the part-of-speech tokens for the bucket | 
| 512 |  |  |  |  |  |  | sub getTokensPOS{ | 
| 513 | 0 |  |  | 0 | 0 |  | my $bucketTokens_ref = shift; | 
| 514 | 0 |  |  |  |  |  | my @buktTokens = @$bucketTokens_ref; | 
| 515 |  |  |  |  |  |  |  | 
| 516 |  |  |  |  |  |  | #finds the part of speech tokens | 
| 517 | 0 |  |  |  |  |  | my @posTokens = (); | 
| 518 | 0 |  |  |  |  |  | foreach my $token (@buktTokens){ | 
| 519 | 0 |  |  |  |  |  | my $pos = $token->{posTag}; | 
| 520 | 0 |  |  |  |  |  | push(@posTokens, $pos); | 
| 521 |  |  |  |  |  |  | } | 
| 522 | 0 |  |  |  |  |  | return @posTokens; | 
| 523 |  |  |  |  |  |  | } | 
| 524 |  |  |  |  |  |  |  | 
| 525 |  |  |  |  |  |  | #gets the positions of the POS words from a line | 
| 526 |  |  |  |  |  |  | # input  : $cleanLine     <-- the line to pinpoint the POS tokens to | 
| 527 |  |  |  |  |  |  | #		 : @tokens        <-- the set of tokens to use as reference | 
| 528 |  |  |  |  |  |  | # output : @orderPOS 	   <-- the part-of-speech tokens for the bucket | 
| 529 |  |  |  |  |  |  | sub orderedTokenPOS{ | 
| 530 | 0 |  |  | 0 | 0 |  | my $cleanLine = shift; | 
| 531 | 0 |  |  |  |  |  | my $tokens_ref = shift; | 
| 532 | 0 |  |  |  |  |  | my @tokens = @$tokens_ref; | 
| 533 |  |  |  |  |  |  |  | 
| 534 |  |  |  |  |  |  | #$uniSub->printColorDebug("on_red", "TAG : $tagLine\n"); | 
| 535 | 0 |  |  |  |  |  | my @lineWords = split " ", $cleanLine; | 
| 536 | 0 |  |  |  |  |  | my @orderPOS = (); | 
| 537 |  |  |  |  |  |  |  | 
| 538 |  |  |  |  |  |  | #make the connection between the word and the pos | 
| 539 | 0 |  |  |  |  |  | my %text2Pos = (); | 
| 540 | 0 |  |  |  |  |  | my @txtTokens = (); | 
| 541 | 0 |  |  |  |  |  | foreach my $token (@tokens){ | 
| 542 | 0 |  |  |  |  |  | my $txt = $token->{text}; | 
| 543 | 0 |  |  |  |  |  | $txt = cleanToken($txt); | 
| 544 |  |  |  |  |  |  | #$uniSub->printColorDebug("on_green", $txt); | 
| 545 | 0 |  |  |  |  |  | push @txtTokens, $txt; | 
| 546 | 0 |  |  |  |  |  | my $pos = $token->{posTag}; | 
| 547 | 0 |  |  |  |  |  | $text2Pos{$txt} = $pos; | 
| 548 |  |  |  |  |  |  | } | 
| 549 |  |  |  |  |  |  |  | 
| 550 |  |  |  |  |  |  | #associate each tagged word with it | 
| 551 | 0 |  |  |  |  |  | foreach my $word (@lineWords){ | 
| 552 | 0 | 0 |  |  |  |  | if($uniSub->inArr($word, \@txtTokens)){ | 
| 553 | 0 |  |  |  |  |  | my $newPos = $text2Pos{$word}; | 
| 554 | 0 |  |  |  |  |  | push @orderPOS, $newPos; | 
| 555 |  |  |  |  |  |  | }else{ | 
| 556 | 0 |  |  |  |  |  | push @orderPOS, "undef"; | 
| 557 |  |  |  |  |  |  | } | 
| 558 |  |  |  |  |  |  | } | 
| 559 |  |  |  |  |  |  |  | 
| 560 | 0 |  |  |  |  |  | return @orderPOS; | 
| 561 |  |  |  |  |  |  | } | 
| 562 |  |  |  |  |  |  |  | 
| 563 |  |  |  |  |  |  | #gets the tagged parts of the concepts | 
| 564 |  |  |  |  |  |  | # input  : $type     	<-- what kind of concepts you want to extract [e.g. "sem", "cui"] | 
| 565 |  |  |  |  |  |  | #		 : $line        <-- the line to pinpoint the concepts to | 
| 566 |  |  |  |  |  |  | #		 : @concepts    <-- the total set of concepts to use | 
| 567 |  |  |  |  |  |  | # output : @conceptSet 	   <-- the set of concepts used within the line | 
| 568 |  |  |  |  |  |  | sub getConceptSets{ | 
| 569 | 0 |  |  | 0 | 0 |  | my $type = shift; | 
| 570 | 0 |  |  |  |  |  | my $line = shift; | 
| 571 | 0 |  |  |  |  |  | my $concepts_ref = shift; | 
| 572 | 0 |  |  |  |  |  | my @concepts = @$concepts_ref; | 
| 573 |  |  |  |  |  |  |  | 
| 574 | 0 |  |  |  |  |  | $line = lc($line); | 
| 575 |  |  |  |  |  |  |  | 
| 576 |  |  |  |  |  |  | #assign each concept by their text name | 
| 577 | 0 |  |  |  |  |  | my @conceptsTxt = (); | 
| 578 | 0 |  |  |  |  |  | foreach my $concept (@concepts){ | 
| 579 | 0 |  |  |  |  |  | my $ohboi = @$concept[0]; | 
| 580 | 0 |  |  |  |  |  | my $name = lc($ohboi->{text}); | 
| 581 | 0 |  |  |  |  |  | push (@conceptsTxt, $name); | 
| 582 |  |  |  |  |  |  | } | 
| 583 |  |  |  |  |  |  |  | 
| 584 |  |  |  |  |  |  | #make a clean set of text words | 
| 585 | 0 |  |  |  |  |  | my @txtIn = split / /, $line; | 
| 586 | 0 |  |  |  |  |  | my @clean_txt = (); | 
| 587 | 0 |  |  |  |  |  | foreach my $word (@txtIn){ | 
| 588 | 0 |  |  |  |  |  | push @clean_txt, $word; | 
| 589 |  |  |  |  |  |  | } | 
| 590 |  |  |  |  |  |  |  | 
| 591 | 0 |  |  |  |  |  | my $totCon = @conceptsTxt; | 
| 592 |  |  |  |  |  |  | #get the set needed | 
| 593 | 0 |  |  |  |  |  | my @conceptSet = (); | 
| 594 | 0 |  |  |  |  |  | for(my $f = 0; $f < $totCon; $f++){ | 
| 595 | 0 |  |  |  |  |  | my @concept = @{$concepts[$f]}; | 
|  | 0 |  |  |  |  |  |  | 
| 596 | 0 |  |  |  |  |  | my $txtCon = $conceptsTxt[$f]; | 
| 597 |  |  |  |  |  |  |  | 
| 598 |  |  |  |  |  |  |  | 
| 599 | 0 |  |  |  |  |  | foreach my $cc (@concept){ | 
| 600 |  |  |  |  |  |  |  | 
| 601 |  |  |  |  |  |  | #get the right items | 
| 602 | 0 |  |  |  |  |  | my @items = (); | 
| 603 | 0 | 0 |  |  |  |  | if($type eq "sem"){ | 
|  |  | 0 |  |  |  |  |  | 
|  |  | 0 |  |  |  |  |  | 
| 604 | 0 |  |  |  |  |  | my $s = $cc->{semanticTypes}; | 
| 605 | 0 |  |  |  |  |  | @items = split /,/, $s; | 
| 606 |  |  |  |  |  |  | }elsif($type eq "cui"){ | 
| 607 | 0 |  |  |  |  |  | my $c = $cc->{cui}; | 
| 608 | 0 |  |  |  |  |  | @items = split /,/, $c; | 
| 609 |  |  |  |  |  |  | }elsif($type eq "text"){ | 
| 610 | 0 |  |  |  |  |  | my $t = $cc->{text}; | 
| 611 | 0 |  |  |  |  |  | @items = split /,/, $t; | 
| 612 |  |  |  |  |  |  | } | 
| 613 |  |  |  |  |  |  |  | 
| 614 |  |  |  |  |  |  | #add to the concept set | 
| 615 | 0 |  |  |  |  |  | push @conceptSet, @items; | 
| 616 |  |  |  |  |  |  | } | 
| 617 |  |  |  |  |  |  | } | 
| 618 | 0 |  |  |  |  |  | return @conceptSet; | 
| 619 |  |  |  |  |  |  | } | 
| 620 |  |  |  |  |  |  |  | 
| 621 |  |  |  |  |  |  |  | 
| 622 |  |  |  |  |  |  | #retrieves the feature for a single word | 
| 623 |  |  |  |  |  |  | # input  : $word     	<-- the word to extract the features from | 
| 624 |  |  |  |  |  |  | #		 : $type        <-- what type of feature to extract [e.g. "pos", "sem", "cui"] | 
| 625 |  |  |  |  |  |  | # output : if "pos"		<-- a scalar part-of-speech value | 
| 626 |  |  |  |  |  |  | #		 : else			<-- an array of semantic or cui values (a single text value can have more than one of these) | 
| 627 |  |  |  |  |  |  | sub getFeature{ | 
| 628 | 0 |  |  | 0 | 0 |  | my $word = shift; | 
| 629 | 0 |  |  |  |  |  | my $type = shift; | 
| 630 |  |  |  |  |  |  |  | 
| 631 |  |  |  |  |  |  | #if retrieving pos tag | 
| 632 | 0 | 0 | 0 |  |  |  | if($type eq "pos"){ | 
|  |  | 0 |  |  |  |  |  | 
| 633 |  |  |  |  |  |  | #get the token and its pos tag | 
| 634 | 0 |  |  |  |  |  | foreach my $key (sort keys %tokenHash){ | 
| 635 | 0 |  |  |  |  |  | foreach my $token(@{$tokenHash{$key}}){ | 
|  | 0 |  |  |  |  |  |  | 
| 636 | 0 |  |  |  |  |  | my $tokenTxt = $token->{text}; | 
| 637 | 0 | 0 |  |  |  |  | if($tokenTxt eq $word){ | 
| 638 | 0 |  |  |  |  |  | return $token->{posTag}; | 
| 639 |  |  |  |  |  |  | } | 
| 640 |  |  |  |  |  |  | } | 
| 641 |  |  |  |  |  |  | } | 
| 642 | 0 |  |  |  |  |  | return ""; | 
| 643 |  |  |  |  |  |  | }elsif($type eq "sem" or $type eq "cui"){ | 
| 644 |  |  |  |  |  |  | #get the concept and its cui or sem tag | 
| 645 | 0 |  |  |  |  |  | foreach my $key (sort keys %conceptHash){ | 
| 646 | 0 |  |  |  |  |  | foreach my $concept (@{$conceptHash{$key}}){ | 
|  | 0 |  |  |  |  |  |  | 
| 647 | 0 |  |  |  |  |  | my $ohboi = @$concept[0]; | 
| 648 | 0 |  |  |  |  |  | my $name = lc($ohboi->{text}); | 
| 649 | 0 | 0 |  |  |  |  | if($name eq $word){ | 
| 650 | 0 | 0 |  |  |  |  | if($type eq "sem"){ | 
|  |  | 0 |  |  |  |  |  | 
| 651 | 0 |  |  |  |  |  | my @semArr = (); | 
| 652 | 0 |  |  |  |  |  | foreach my $cc (@$concept){push(@semArr, $cc->{semanticTypes});} | 
|  | 0 |  |  |  |  |  |  | 
| 653 | 0 |  |  |  |  |  | return @semArr; | 
| 654 |  |  |  |  |  |  | }elsif($type eq "cui"){ | 
| 655 | 0 |  |  |  |  |  | my @cuiArr = (); | 
| 656 | 0 |  |  |  |  |  | foreach my $cc (@$concept){push(@cuiArr, $cc->{cui});} | 
|  | 0 |  |  |  |  |  |  | 
| 657 | 0 |  |  |  |  |  | return @cuiArr; | 
| 658 |  |  |  |  |  |  | } | 
| 659 |  |  |  |  |  |  | } | 
| 660 |  |  |  |  |  |  | } | 
| 661 |  |  |  |  |  |  | } | 
| 662 | 0 |  |  |  |  |  | return ""; | 
| 663 |  |  |  |  |  |  | } | 
| 664 | 0 |  |  |  |  |  | return ""; | 
| 665 |  |  |  |  |  |  | } | 
| 666 |  |  |  |  |  |  |  | 
| 667 |  |  |  |  |  |  | ######################      BUCKETS - TRAIN AND TEST ARFF FILES     ##################### | 
| 668 |  |  |  |  |  |  |  | 
| 669 |  |  |  |  |  |  |  | 
| 670 |  |  |  |  |  |  | #sorts the keys from the hashmaps into buckets so that certain values can be accessed | 
| 671 |  |  |  |  |  |  | # input  : $keyAmt     	    <-- the number of lines or "keys" to divvy up into the buckets | 
| 672 |  |  |  |  |  |  | #		 : $bucketNum       <-- how many buckets to use | 
| 673 |  |  |  |  |  |  | # output : %bucketList		<-- the set of buckets with keys in them | 
| 674 |  |  |  |  |  |  | sub sort2Buckets{ | 
| 675 | 0 |  |  | 0 | 0 |  | my $keyAmt = shift; | 
| 676 | 0 |  |  |  |  |  | my $bucketNum = shift; | 
| 677 |  |  |  |  |  |  |  | 
| 678 |  |  |  |  |  |  | #create sets | 
| 679 | 0 |  |  |  |  |  | my @keySet = (0..$keyAmt - 1);						#set of keys | 
| 680 | 0 |  |  |  |  |  | my %bucketList = ();							#all of the buckets | 
| 681 |  |  |  |  |  |  |  | 
| 682 |  |  |  |  |  |  | #add some buckets to the bucket list | 
| 683 | 0 |  |  |  |  |  | for(my $a = 1; $a <= $bucketNum; $a++){ | 
| 684 | 0 |  |  |  |  |  | $bucketList{$a} = []; | 
| 685 |  |  |  |  |  |  | } | 
| 686 |  |  |  |  |  |  |  | 
| 687 |  |  |  |  |  |  | #sort the lines into buckets | 
| 688 | 0 |  |  |  |  |  | my $bucketId = 1; | 
| 689 | 0 |  |  |  |  |  | foreach my $key (@keySet){ | 
| 690 | 0 |  |  |  |  |  | push (@{$bucketList{$bucketId}}, $key);	#add the line to the bucket | 
|  | 0 |  |  |  |  |  |  | 
| 691 |  |  |  |  |  |  |  | 
| 692 |  |  |  |  |  |  | #reset the id if at the max value | 
| 693 | 0 | 0 |  |  |  |  | if($bucketId == $bucketNum){ | 
| 694 | 0 |  |  |  |  |  | $bucketId = 1; | 
| 695 |  |  |  |  |  |  | }else{ | 
| 696 | 0 |  |  |  |  |  | $bucketId++; | 
| 697 |  |  |  |  |  |  | } | 
| 698 |  |  |  |  |  |  | } | 
| 699 |  |  |  |  |  |  |  | 
| 700 |  |  |  |  |  |  | #return the list of buckets | 
| 701 | 0 |  |  |  |  |  | return %bucketList; | 
| 702 |  |  |  |  |  |  | } | 
| 703 |  |  |  |  |  |  |  | 
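
`sort2Buckets` above deals the line keys `0..$keyAmt-1` into the buckets round-robin. A standalone sketch (not part of the module) reproducing that assignment for 25 lines and 10 buckets, to make the resulting split concrete:

```perl
use strict;
use warnings;

# round-robin assignment as in sort2Buckets: 25 line keys into 10 buckets
my %bucketList = map { $_ => [] } (1 .. 10);
my $bucketId = 1;
for my $key (0 .. 24) {
	push @{ $bucketList{$bucketId} }, $key;            # add the key to the current bucket
	$bucketId = ($bucketId == 10) ? 1 : $bucketId + 1;  # wrap back to bucket 1 at the end
}

# bucket 1 ends up with keys 0, 10, 20; bucket 2 with 1, 11, 21; and so on
print "bucket $_: @{ $bucketList{$_} }\n" for sort { $a <=> $b } keys %bucketList;
```
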
| 704 |  |  |  |  |  |  | ######################               ARFF STUFF              ##################### | 
| 705 |  |  |  |  |  |  | #makes arff files for ortho, morpho, text, pos, cui, and sem attributes | 
| 706 |  |  |  |  |  |  |  | 
| 707 |  |  |  |  |  |  | #zhu li!! Do the thing!! | 
| 708 |  |  |  |  |  |  | # input  : $name     	    <-- the name of the file | 
| 709 |  |  |  |  |  |  | #		 : %bucketList       <-- the set of buckets with keys in them | 
| 710 |  |  |  |  |  |  | # output : (n arff files; n = # of buckets x (train and test) x # of features being used) | 
| 711 |  |  |  |  |  |  | sub zhu_li{ | 
| 712 | 0 |  |  | 0 | 0 |  | my $name = shift; | 
| 713 | 0 |  |  |  |  |  | my $bucketList_ref = shift; | 
| 714 | 0 |  |  |  |  |  | my %buckets = %$bucketList_ref; | 
| 715 |  |  |  |  |  |  |  | 
| 716 |  |  |  |  |  |  | #grab the attributes | 
| 717 | 0 |  |  |  |  |  | my %attrSets = (); | 
| 718 | 0 |  |  |  |  |  | $uniSub->printColorDebug("bold green", "Retrieving attributes...\n"); | 
| 719 | 0 |  |  |  |  |  | foreach my $item(@features){ | 
| 720 | 0 |  |  |  |  |  | $uniSub->printColorDebug("bright_green", "\t$item attr\n"); | 
| 721 | 0 |  |  |  |  |  | my %setOfAttr = grabAttr($name, $item, \%buckets); | 
| 722 | 0 |  |  |  |  |  | $attrSets{$item} = \%setOfAttr;						#gets both the vector and arff based attributes | 
| 723 |  |  |  |  |  |  | } | 
| 724 |  |  |  |  |  |  |  | 
| 725 | 0 | 0 |  |  |  |  | if(defined $stopwords_file){ | 
| 726 | 0 |  |  |  |  |  | $stopRegex = stop($stopwords_file); | 
| 727 |  |  |  |  |  |  | } | 
| 728 |  |  |  |  |  |  |  | 
| 729 |  |  |  |  |  |  | #let's make some vectors! | 
| 730 | 0 |  |  |  |  |  | $uniSub->printColorDebug("bold yellow", "Making Vectors...\n-------------------\n"); | 
| 731 | 0 |  |  |  |  |  | my @curFeatSet = (); | 
| 732 | 0 |  |  |  |  |  | my $abbrev = ""; | 
| 733 |  |  |  |  |  |  |  | 
| 734 |  |  |  |  |  |  | #run based on wcs | 
| 735 | 0 |  |  |  |  |  | my $wcs_bucket; | 
| 736 |  |  |  |  |  |  | my $wcs_feature; | 
| 737 | 0 |  |  |  |  |  | my $wcs_found = 0; | 
| 738 | 0 | 0 |  |  |  |  | if($wcs){ | 
| 739 | 0 |  |  |  |  |  | my @wcs_parts = split("-", $wcs); | 
| 740 | 0 |  |  |  |  |  | $wcs_feature = $wcs_parts[1]; | 
| 741 | 0 |  |  |  |  |  | $wcs_bucket = $wcs_parts[0]; | 
| 742 |  |  |  |  |  |  | } | 
| 743 |  |  |  |  |  |  |  | 
| 744 |  |  |  |  |  |  |  | 
| 745 |  |  |  |  |  |  | #iteratively add on the features [e.g. o, om, omt, omtp, omtpc, omtpcs] | 
| 746 | 0 |  |  |  |  |  | foreach my $feature (@features){ | 
| 747 | 0 |  |  |  |  |  | $uniSub->printColorDebug("yellow", "** $feature ** \n"); | 
| 748 | 0 |  |  |  |  |  | push(@curFeatSet, $feature); | 
| 749 | 0 |  |  |  |  |  | $abbrev .= substr($feature, 0, 1);		#add to abbreviations for the name | 
| 750 |  |  |  |  |  |  |  | 
| 751 |  |  |  |  |  |  | #$uniSub->printColorDebug("on_red", "$wcs - $wcs_found - $abbrev vs. $wcs_feature"); | 
| 752 | 0 | 0 | 0 |  |  |  | if(($wcs) && (!$wcs_found) && ($abbrev ne $wcs_feature)){ | 
|  |  |  | 0 |  |  |  |  | 
| 753 | 0 |  |  |  |  |  | print("**SKIP** \n"); | 
| 754 | 0 |  |  |  |  |  | next; | 
| 755 |  |  |  |  |  |  | } | 
| 756 |  |  |  |  |  |  |  | 
| 757 |  |  |  |  |  |  | #go through each bucket | 
| 758 | 0 |  |  |  |  |  | foreach my $bucket (sort keys %buckets){ | 
| 759 | 0 | 0 | 0 |  |  |  | if(($wcs) && (!$wcs_found) && ($bucket != $wcs_bucket)){ | 
|  |  |  | 0 |  |  |  |  | 
| 760 | 0 |  |  |  |  |  | print("\t**SKIP**\n"); | 
| 761 | 0 |  |  |  |  |  | next; | 
| 762 |  |  |  |  |  |  | }else{ | 
| 763 | 0 |  |  |  |  |  | $wcs_found = 1; | 
| 764 |  |  |  |  |  |  | } | 
| 765 |  |  |  |  |  |  |  | 
| 766 | 0 |  |  |  |  |  | my @range = $uniSub->bully($bucketsNum, $bucket); | 
| 767 |  |  |  |  |  |  |  | 
| 768 | 0 |  |  |  |  |  | $uniSub->printColorDebug("on_green", "BUCKET #$bucket"); | 
| 769 |  |  |  |  |  |  | #retrieve the vector attributes to use | 
| 770 | 0 |  |  |  |  |  | my %vecAttrSet = (); | 
| 771 | 0 |  |  |  |  |  | foreach my $curItem(@curFeatSet){ | 
| 772 | 0 | 0 |  |  |  |  | if($curItem eq "ortho"){ | 
| 773 | 0 |  |  |  |  |  | $vecAttrSet{$curItem} = (); | 
| 774 |  |  |  |  |  |  | }else{ | 
| 775 |  |  |  |  |  |  | #get outer layer (tpcs) | 
| 776 | 0 |  |  |  |  |  | my $a_ref = $attrSets{$curItem}; | 
| 777 | 0 |  |  |  |  |  | my %a = %$a_ref; | 
| 778 |  |  |  |  |  |  |  | 
| 779 |  |  |  |  |  |  | #get inner layer (vector) | 
| 780 | 0 |  |  |  |  |  | my $b_ref = $a{vector}; | 
| 781 | 0 |  |  |  |  |  | my %b = %$b_ref; | 
| 782 |  |  |  |  |  |  |  | 
| 783 |  |  |  |  |  |  | #foreach my $key (sort keys %b){print "$key\n";} | 
| 784 |  |  |  |  |  |  |  | 
| 785 |  |  |  |  |  |  | #finally get the bucket layer (1..$bucketNum) based on range | 
| 786 | 0 |  |  |  |  |  | my $c_ref = $b{$bucket}; | 
| 787 | 0 |  |  |  |  |  | my @c = @$c_ref; | 
| 788 | 0 |  |  |  |  |  | $vecAttrSet{$curItem} = \@c; | 
| 789 |  |  |  |  |  |  | } | 
| 790 |  |  |  |  |  |  | } | 
| 791 |  |  |  |  |  |  |  | 
| 792 |  |  |  |  |  |  | ### TRAIN ### | 
| 793 | 0 |  |  |  |  |  | $uniSub->printColorDebug("bold blue", "\ttraining...\n"); | 
| 794 |  |  |  |  |  |  | #retrieve the lines to use | 
| 795 | 0 |  |  |  |  |  | my @lineSetTrain = (); | 
| 796 | 0 |  |  |  |  |  | my @bucketSetTrain = (); | 
| 797 | 0 |  |  |  |  |  | foreach my $num (@range){push(@bucketSetTrain, @{$buckets{$num}});} | 
|  | 0 |  |  |  |  |  |  | 
|  | 0 |  |  |  |  |  |  | 
| 798 | 0 |  |  |  |  |  | foreach my $key (@bucketSetTrain){push(@lineSetTrain, $orthoHash{$key});} | 
|  | 0 |  |  |  |  |  |  | 
| 799 |  |  |  |  |  |  |  | 
| 800 |  |  |  |  |  |  | #make the vector | 
| 801 | 0 |  |  |  |  |  | my @vectorSetTrain = vectorMaker(\@lineSetTrain, \@curFeatSet, \%vecAttrSet); | 
| 802 | 0 |  |  |  |  |  | $uniSub->printDebug("\n"); | 
| 803 |  |  |  |  |  |  |  | 
| 804 |  |  |  |  |  |  | ### TEST ### | 
| 805 | 0 |  |  |  |  |  | $uniSub->printColorDebug("bold magenta", "\ttesting...\n"); | 
| 806 |  |  |  |  |  |  | #retrieve the lines to use | 
| 807 | 0 |  |  |  |  |  | my @lineSetTest = (); | 
| 808 | 0 |  |  |  |  |  | my @bucketSetTest = (); | 
| 809 | 0 |  |  |  |  |  | push(@bucketSetTest, @{$buckets{$bucket}}); | 
|  | 0 |  |  |  |  |  |  | 
| 810 | 0 |  |  |  |  |  | foreach my $key (@bucketSetTest){push(@lineSetTest, $orthoHash{$key});} | 
|  | 0 |  |  |  |  |  |  | 
| 811 |  |  |  |  |  |  |  | 
| 812 |  |  |  |  |  |  | #make the vector | 
| 813 | 0 |  |  |  |  |  | my @vectorSetTest = vectorMaker(\@lineSetTest, \@curFeatSet, \%vecAttrSet); | 
| 814 | 0 |  |  |  |  |  | $uniSub->printDebug("\n"); | 
| 815 |  |  |  |  |  |  |  | 
| 816 |  |  |  |  |  |  | ### ARFF ### | 
| 817 |  |  |  |  |  |  | #retrieve the arff attributes to use | 
| 818 | 0 |  |  |  |  |  | my @arffAttrSet = (); | 
| 819 | 0 |  |  |  |  |  | foreach my $curItem(@curFeatSet){ | 
| 820 | 0 | 0 |  |  |  |  | if($curItem eq "ortho"){ | 
| 821 |  |  |  |  |  |  | #get outer layer (ortho) | 
| 822 | 0 |  |  |  |  |  | my $a_ref = $attrSets{$curItem}; | 
| 823 | 0 |  |  |  |  |  | my %a = %$a_ref; | 
| 824 |  |  |  |  |  |  | #get the values from ortho | 
| 825 | 0 |  |  |  |  |  | push(@arffAttrSet, @{$a{arff}}); | 
|  | 0 |  |  |  |  |  |  | 
| 826 |  |  |  |  |  |  | }else{ | 
| 827 |  |  |  |  |  |  | #get outer layer (mtpcs) | 
| 828 | 0 |  |  |  |  |  | my $a_ref = $attrSets{$curItem}; | 
| 829 | 0 |  |  |  |  |  | my %a = %$a_ref; | 
| 830 |  |  |  |  |  |  |  | 
| 831 |  |  |  |  |  |  | #get inner layer (arff) | 
| 832 | 0 |  |  |  |  |  | my $b_ref = $a{arff}; | 
| 833 | 0 |  |  |  |  |  | my %b = %$b_ref; | 
| 834 |  |  |  |  |  |  |  | 
| 835 |  |  |  |  |  |  | #finally get the bucket layer (1..$bucketNum) based on range | 
| 836 | 0 |  |  |  |  |  | my $c_ref = $b{$bucket}; | 
| 837 | 0 |  |  |  |  |  | my @c = @$c_ref; | 
| 838 | 0 |  |  |  |  |  | push(@arffAttrSet, @c); | 
| 839 |  |  |  |  |  |  | } | 
| 840 |  |  |  |  |  |  | } | 
| 841 |  |  |  |  |  |  |  | 
| 842 |  |  |  |  |  |  |  | 
| 843 | 0 |  |  |  |  |  | $uniSub->printColorDebug("bright_yellow", "\tmaking arff files...\n"); | 
| 844 | 0 |  |  |  |  |  | $uniSub->printColorDebug("bright_red", "\t\tARFF TRAIN\n"); | 
| 845 | 0 |  |  |  |  |  | createARFF($name, $bucket, $abbrev, "train", \@arffAttrSet, \@vectorSetTrain); | 
| 846 | 0 |  |  |  |  |  | $uniSub->printColorDebug("bright_red", "\t\tARFF TEST\n"); | 
| 847 | 0 |  |  |  |  |  | createARFF($name, $bucket, $abbrev, "test", \@arffAttrSet, \@vectorSetTest); | 
| 848 |  |  |  |  |  |  | } | 
| 849 |  |  |  |  |  |  | } | 
| 850 |  |  |  |  |  |  |  | 
| 851 |  |  |  |  |  |  | } | 
| 852 |  |  |  |  |  |  |  | 
| 853 |  |  |  |  |  |  | #create the arff file | 
| 854 |  |  |  |  |  |  | # input  : $name     	    <-- the name of the file | 
| 855 |  |  |  |  |  |  | #		 : $bucket   		<-- the index of the bucket you're testing [e.g. bucket #1] | 
| 856 |  |  |  |  |  |  | #        : $abbrev          <-- the abbreviation label for the set of features | 
| 857 |  |  |  |  |  |  | #        : $type            <-- train or test ARFF? | 
| 858 |  |  |  |  |  |  | #        : @attrARFFSet     <-- the set of attributes exclusively for printing to the arff file | 
| 859 |  |  |  |  |  |  | #		 : @vecSec          <-- the set of vectors created | 
| 860 |  |  |  |  |  |  | # output : (an arff file) | 
| 861 |  |  |  |  |  |  | sub createARFF{ | 
| 862 | 0 |  |  | 0 | 0 |  | my $name = shift; | 
| 863 | 0 |  |  |  |  |  | my $bucket = shift; | 
| 864 | 0 |  |  |  |  |  | my $abbrev = shift; | 
| 865 | 0 |  |  |  |  |  | my $type = shift; | 
| 866 | 0 |  |  |  |  |  | my $attr_ref = shift; | 
| 867 | 0 |  |  |  |  |  | my $vec_ref = shift; | 
| 868 |  |  |  |  |  |  |  | 
| 869 | 0 |  |  |  |  |  | my $typeDir = "_$type"; | 
| 870 | 0 |  |  |  |  |  | my $ARFF; | 
| 871 |  |  |  |  |  |  | #print to files | 
| 872 | 0 |  |  |  |  |  | $uniSub->printColorDebug("bold cyan", "\t\tcreating $name/$abbrev - BUCKET #$bucket $type ARFF...\n"); | 
| 873 | 0 | 0 |  |  |  |  | if($program_dir ne ""){ | 
| 874 | 0 |  |  |  |  |  | my $subdir = "_ARFF"; | 
| 875 | 0 |  |  |  |  |  | my $arffdir = $name . "_ARFF"; | 
| 876 | 0 |  |  |  |  |  | my $featdir = "_$abbrev"; | 
| 877 | 0 |  |  |  |  |  | make_path("$program_dir/$subdir/$arffdir/$featdir/$typeDir"); | 
| 878 | 0 | 0 |  |  |  |  | open($ARFF, ">", ("$program_dir/$subdir/$arffdir/$featdir/$typeDir/" . $name . "_$type-" . $bucket .".arff")) || die ("OMG?!?!"); | 
| 879 |  |  |  |  |  |  | }else{ | 
| 880 | 0 |  |  |  |  |  | my $arffdir = $name . "_ARFF"; | 
| 881 | 0 |  |  |  |  |  | my $featdir = "_$abbrev"; | 
| 882 | 0 |  |  |  |  |  | make_path("$arffdir/$featdir/$typeDir"); | 
| 883 | 0 | 0 |  |  |  |  | open($ARFF, ">", ("$arffdir/$featdir/$typeDir/" . $name . "_$type-" . $bucket .".arff")) || die ("What?!?!"); | 
| 884 |  |  |  |  |  |  | } | 
| 885 |  |  |  |  |  |  |  | 
| 886 |  |  |  |  |  |  | #get the attr and vector set | 
| 887 | 0 |  |  |  |  |  | my @attrARFFSet = @$attr_ref; | 
| 888 | 0 |  |  |  |  |  | my @vecSet = @$vec_ref; | 
| 889 |  |  |  |  |  |  |  | 
| 890 |  |  |  |  |  |  | #get format for the file | 
| 891 | 0 |  |  |  |  |  | my $relation = "\@RELATION $name"; | 
| 892 | 0 |  |  |  |  |  | my @printAttr = makeAttrData(\@attrARFFSet); | 
| 893 | 0 |  |  |  |  |  | my $entity = "\@ATTRIBUTE Entity {Yes, No}";	#set if the entity word or not | 
| 894 | 0 |  |  |  |  |  | my $data = "\@DATA"; | 
| 895 |  |  |  |  |  |  |  | 
| 896 |  |  |  |  |  |  | #print everything to the file | 
| 897 | 0 |  |  |  |  |  | $uniSub->printDebug("\t\tprinting to file...\n"); | 
| 898 | 0 |  |  |  |  |  | $uniSub->print2File($ARFF, $relation); | 
| 899 | 0 |  |  |  |  |  | foreach my $a(@printAttr){$uniSub->print2File($ARFF, $a);} | 
|  | 0 |  |  |  |  |  |  | 
| 900 | 0 |  |  |  |  |  | $uniSub->print2File($ARFF, $entity); | 
| 901 | 0 |  |  |  |  |  | $uniSub->print2File($ARFF, $data); | 
| 902 | 0 |  |  |  |  |  | foreach my $d(@vecSet){$uniSub->print2File($ARFF, $d);} | 
|  | 0 |  |  |  |  |  |  | 
| 903 | 0 |  |  |  |  |  | close $ARFF; | 
| 904 |  |  |  |  |  |  | } | 
| 905 |  |  |  |  |  |  |  | 
| 906 |  |  |  |  |  |  | ######################               VECTOR THINGIES              ##################### | 
| 907 |  |  |  |  |  |  |  | 
| 908 |  |  |  |  |  |  |  | 
| 909 |  |  |  |  |  |  | #makes vectors from a set | 
| 910 |  |  |  |  |  |  | # input  : @txtLineSet 		<-- the retagged text lines to make vectors out of | 
| 911 |  |  |  |  |  |  | #		 : @featureList		<-- the list of features to make the vectors out of [e.g. (ortho, morph, text)] | 
| 912 |  |  |  |  |  |  | #		 : @attrs  			<-- the attributes to use to make the vectors | 
| 913 |  |  |  |  |  |  | # output : @setVectors		<-- the vectors for each word in all of the lines | 
| 914 |  |  |  |  |  |  | sub vectorMaker{ | 
| 915 | 0 |  |  | 0 | 0 |  | my $set_ref = shift; | 
| 916 | 0 |  |  |  |  |  | my $feat_ref = shift; | 
| 917 | 0 |  |  |  |  |  | my $attrib_ref = shift; | 
| 918 | 0 |  |  |  |  |  | my @txtLineSet = @$set_ref; | 
| 919 | 0 |  |  |  |  |  | my @featureList = @$feat_ref; | 
| 920 | 0 |  |  |  |  |  | my %attrs = %$attrib_ref; | 
| 921 |  |  |  |  |  |  |  | 
| 922 | 0 |  |  |  |  |  | my @setVectors = (); | 
| 923 |  |  |  |  |  |  | #go through each line of the set | 
| 924 | 0 |  |  |  |  |  | my $setLen = @txtLineSet; | 
| 925 |  |  |  |  |  |  |  | 
| 926 | 0 |  |  |  |  |  | for(my $l = 0; $l < $setLen; $l++){ | 
| 927 | 0 |  |  |  |  |  | my $line = $txtLineSet[$l]; | 
| 928 | 0 |  |  |  |  |  | my @words = split(' ', $line); | 
| 929 |  |  |  |  |  |  | #$uniSub->printArr(", ", \@words); | 
| 930 |  |  |  |  |  |  | #print "\n"; | 
| 931 | 0 |  |  |  |  |  | my $wordLen = @words; | 
| 932 |  |  |  |  |  |  | #go through each word | 
| 933 | 0 |  |  |  |  |  | for(my $a = 0; $a < $wordLen; $a++){ | 
| 934 |  |  |  |  |  |  |  | 
| 935 | 0 |  |  |  |  |  | $| = 1; | 
| 936 |  |  |  |  |  |  |  | 
| 937 | 0 |  |  |  |  |  | my $wordOrig = $words[$a]; | 
| 938 |  |  |  |  |  |  | #make the words for comparison | 
| 939 | 0 |  |  |  |  |  | my $word = $words[$a]; | 
| 940 | 0 |  |  |  |  |  | my $prevWord = ""; | 
| 941 | 0 |  |  |  |  |  | my $nextWord = ""; | 
| 942 |  |  |  |  |  |  |  | 
| 943 |  |  |  |  |  |  | #show progress | 
| 944 | 0 |  |  |  |  |  | my $l2 = $l + 1; | 
| 945 | 0 |  |  |  |  |  | my $a2 = $a + 1; | 
| 946 | 0 |  |  |  |  |  | $uniSub->printDebug("\r" . "\t\tLine - $l2/$setLen ------ Word - $a2/$wordLen  ----  "); | 
| 947 |  |  |  |  |  |  |  | 
| 948 | 0 |  |  |  |  |  | my $smlword = substr($word, 0, 8); | 
| 949 | 0 | 0 |  |  |  |  | if(length($word) > 8){ | 
| 950 | 0 |  |  |  |  |  | $smlword .= "..."; | 
| 951 |  |  |  |  |  |  | } | 
| 952 |  |  |  |  |  |  |  | 
| 953 | 0 | 0 |  |  |  |  | if($word =~/$entId/o){ | 
| 954 | 0 |  |  |  |  |  | $uniSub->printColorDebug("red", "$smlword!                "); | 
| 955 |  |  |  |  |  |  | }else{ | 
| 956 | 0 |  |  |  |  |  | $uniSub->printDebug("$smlword!                    ") | 
| 957 |  |  |  |  |  |  | } | 
| 958 |  |  |  |  |  |  |  | 
| 959 | 0 |  |  |  |  |  | my @word_cuis = getFeature($word, "cui"); | 
| 960 | 0 |  |  |  |  |  | my $ncui = $word_cuis[0]; | 
| 961 |  |  |  |  |  |  | #$uniSub->printColorDebug("red", "\n\t\t$word - $ncui\n"); | 
| 962 |  |  |  |  |  |  |  | 
| 963 |  |  |  |  |  |  | #check if it's a stopword | 
| 964 | 0 | 0 | 0 |  |  |  | if(($stopwords_file and $word=~/$stopRegex/o) || ($is_cui and $word_cuis[0] eq "") || ($word eq "." || $word eq ",")){ | 
|  |  |  | 0 |  |  |  |  | 
|  |  |  | 0 |  |  |  |  | 
|  |  |  | 0 |  |  |  |  | 
|  |  |  | 0 |  |  |  |  | 
| 965 |  |  |  |  |  |  | #$uniSub->printColorDebug("on_red", "\t\tSKIP!"); | 
| 966 | 0 |  |  |  |  |  | next; | 
| 967 |  |  |  |  |  |  | } | 
| 968 |  |  |  |  |  |  |  | 
| 969 | 0 | 0 |  |  |  |  | if($a > 0){$prevWord = $words[$a - 1];} | 
|  | 0 |  |  |  |  |  |  | 
| 970 | 0 | 0 |  |  |  |  | if($a < ($wordLen - 1)){$nextWord = $words[$a + 1];} | 
|  | 0 |  |  |  |  |  |  | 
| 971 |  |  |  |  |  |  |  | 
| 972 |  |  |  |  |  |  |  | 
| 973 |  |  |  |  |  |  |  | 
| 974 |  |  |  |  |  |  | #get rid of tag if necessary | 
| 975 | 0 |  |  |  |  |  | $prevWord =~s/$entId//og; | 
| 976 | 0 |  |  |  |  |  | $nextWord =~s/$entId//og; | 
| 977 | 0 |  |  |  |  |  | $word =~s/$entId//og; | 
| 978 |  |  |  |  |  |  |  | 
| 979 | 0 |  |  |  |  |  | my $vec = ""; | 
| 980 |  |  |  |  |  |  | #use each set of attributes | 
| 981 | 0 |  |  |  |  |  | foreach my $item(@featureList){ | 
| 982 | 0 |  |  |  |  |  | my $addVec = ""; | 
| 983 | 0 | 0 |  |  |  |  | if($item eq "ortho"){$addVec = orthoVec($word);} | 
|  | 0 | 0 |  |  |  |  |  | 
|  |  | 0 |  |  |  |  |  | 
|  |  | 0 |  |  |  |  |  | 
|  |  | 0 |  |  |  |  |  | 
|  |  | 0 |  |  |  |  |  | 
| 984 | 0 |  |  |  |  |  | elsif($item eq "morph"){$addVec = morphVec($word, \@{$attrs{"morph"}});} | 
|  | 0 |  |  |  |  |  |  | 
| 985 | 0 |  |  |  |  |  | elsif($item eq "text"){$addVec = textVec($word, $prevWord, $nextWord, \@{$attrs{"text"}});} | 
|  | 0 |  |  |  |  |  |  | 
| 986 | 0 |  |  |  |  |  | elsif($item eq "pos"){$addVec = posVec($word, $prevWord, $nextWord, \@{$attrs{"pos"}});} | 
|  | 0 |  |  |  |  |  |  | 
| 987 | 0 |  |  |  |  |  | elsif($item eq "cui"){$addVec = cuiVec($word, $prevWord, $nextWord, \@{$attrs{"cui"}});} | 
|  | 0 |  |  |  |  |  |  | 
| 988 | 0 |  |  |  |  |  | elsif($item eq "sem"){$addVec = semVec($word, $prevWord, $nextWord, \@{$attrs{"sem"}});} | 
|  | 0 |  |  |  |  |  |  | 
| 989 |  |  |  |  |  |  |  | 
| 990 |  |  |  |  |  |  |  | 
| 991 | 0 |  |  |  |  |  | $vec .= $addVec; | 
| 992 |  |  |  |  |  |  |  | 
| 993 |  |  |  |  |  |  | } | 
| 994 |  |  |  |  |  |  |  | 
| 995 |  |  |  |  |  |  | #convert binary to sparse if specified | 
| 996 | 0 | 0 |  |  |  |  | if($sparse_matrix){ | 
| 997 | 0 |  |  |  |  |  | $vec = convert2Sparse($vec); | 
| 998 |  |  |  |  |  |  | #$uniSub->printColorDebug("red", "$vec\n"); | 
| 999 |  |  |  |  |  |  | } | 
| 1000 |  |  |  |  |  |  |  | 
| 1001 |  |  |  |  |  |  | #check if the word is an entity or not | 
| 1002 |  |  |  |  |  |  | #$uniSub->printColorDebug("red", "\n$wordOrig\n"); | 
| 1003 | 0 | 0 |  |  |  |  | $vec .= (($wordOrig =~/\b[\S]+(_e)\b/) ? "Yes " : "No "); | 
| 1004 |  |  |  |  |  |  |  | 
| 1005 |  |  |  |  |  |  | #close it if using sparse matrix | 
| 1006 | 0 | 0 |  |  |  |  | if($sparse_matrix){ | 
| 1007 | 0 |  |  |  |  |  | $vec .= "}"; | 
| 1008 |  |  |  |  |  |  | } | 
| 1009 |  |  |  |  |  |  |  | 
| 1010 |  |  |  |  |  |  | #finally add the word back and add the entire vector to the set | 
| 1011 | 0 |  |  |  |  |  | $vec .= "\%$word"; | 
| 1012 |  |  |  |  |  |  |  | 
| 1013 | 0 | 0 |  |  |  |  | if($word ne ""){ | 
| 1014 | 0 |  |  |  |  |  | push(@setVectors, $vec); | 
| 1015 |  |  |  |  |  |  | } | 
| 1016 |  |  |  |  |  |  | } | 
| 1017 |  |  |  |  |  |  | } | 
| 1018 |  |  |  |  |  |  |  | 
| 1019 | 0 |  |  |  |  |  | return @setVectors; | 
| 1020 |  |  |  |  |  |  | } | 
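The loop above dispatches each requested feature type to its own vector builder, concatenates the pieces, appends the Yes/No entity class, and finally tacks the word itself on after a "%" (which ARFF reads as a comment). Below is a minimal, self-contained sketch of that assembly; the feature subs (demo_ortho, demo_text) and the sample token are made up for illustration and are not the module's own code.

```perl
#!/usr/bin/perl
# Illustrative sketch: concatenate per-feature sub-vectors into one ARFF data row,
# append the class label, and keep the word as a trailing '%' comment.
use strict;
use warnings;

sub demo_ortho { my $w = shift; return ($w =~ /^[A-Z]/ ? "1, " : "0, "); }   # stand-in feature
sub demo_text  { my $w = shift; return (lc($w) eq "gold" ? "1, " : "0, "); } # stand-in feature

my $word   = "Gold_e";                      # "_e" marks an entity token
my $is_ent = ($word =~ /_e\b/) ? "Yes" : "No";
(my $clean = $word) =~ s/_e//g;             # strip the entity tag before featurizing

my $vec = "";
$vec .= demo_ortho($clean);                 # each feature type appends its own bits
$vec .= demo_text($clean);
$vec .= "$is_ent ";                         # class label comes last
$vec .= "\%$clean";                         # ARFF treats '%' as a comment marker

print "$vec\n";                             # prints: 1, 1, Yes %Gold
```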
| 1021 |  |  |  |  |  |  |  | 
| 1022 |  |  |  |  |  |  | #makes the orthographic based part of the vector | 
| 1023 |  |  |  |  |  |  | # input  : $word     	    <-- the word to analyze | 
| 1024 |  |  |  |  |  |  | # output : $strVec			<-- the orthographic vector string | 
| 1025 |  |  |  |  |  |  | sub orthoVec{ | 
| 1026 | 0 |  |  | 0 | 0 |  | my $word = shift; | 
| 1027 |  |  |  |  |  |  |  | 
| 1028 |  |  |  |  |  |  | ##  CHECKS  ## | 
| 1029 | 0 |  |  |  |  |  | my $strVec = ""; | 
| 1030 | 0 |  |  |  |  |  | my $addon = ""; | 
| 1031 |  |  |  |  |  |  |  | 
| 1032 |  |  |  |  |  |  | #check if first letter capital | 
| 1033 | 0 | 0 |  |  |  |  | $addon = ($word =~ /\b([A-Z])\w+\b/og ? 1 : 0); | 
| 1034 | 0 |  |  |  |  |  | $strVec .= "$addon, "; | 
| 1035 |  |  |  |  |  |  |  | 
| 1036 |  |  |  |  |  |  | #check if a single letter word | 
| 1037 | 0 | 0 |  |  |  |  | $addon = (length($word) == 1 ? 1 : 0); | 
| 1038 | 0 |  |  |  |  |  | $strVec .= "$addon, "; | 
| 1039 |  |  |  |  |  |  |  | 
| 1040 |  |  |  |  |  |  | #check if all capital letters | 
| 1041 | 0 | 0 |  |  |  |  | $addon = ($word =~ /\b[A-Z]+\b/og ? 1 : 0); | 
| 1042 | 0 |  |  |  |  |  | $strVec .= "$addon, "; | 
| 1043 |  |  |  |  |  |  |  | 
| 1044 |  |  |  |  |  |  | #check if contains a digit | 
| 1045 | 0 | 0 |  |  |  |  | $addon = ($word =~ /[0-9]+/og ? 1 : 0); | 
| 1046 | 0 |  |  |  |  |  | $strVec .= "$addon, "; | 
| 1047 |  |  |  |  |  |  |  | 
| 1048 |  |  |  |  |  |  | #check if all digits | 
| 1049 | 0 | 0 |  |  |  |  | $addon = ($word =~ /\b[0-9]+\b/og ? 1 : 0); | 
| 1050 | 0 |  |  |  |  |  | $strVec .= "$addon, "; | 
| 1051 |  |  |  |  |  |  |  | 
| 1052 |  |  |  |  |  |  | #check if contains a hyphen | 
| 1053 | 0 | 0 |  |  |  |  | $addon = ($word =~ /-/og ? 1 : 0); | 
| 1054 | 0 |  |  |  |  |  | $strVec .= "$addon, "; | 
| 1055 |  |  |  |  |  |  |  | 
| 1056 |  |  |  |  |  |  | #check if contains punctuation | 
| 1057 | 0 | 0 |  |  |  |  | $addon = ($word =~ /[^a-zA-Z0-9\s]/og ? 1 : 0); | 
| 1058 | 0 |  |  |  |  |  | $strVec .= "$addon, "; | 
| 1059 |  |  |  |  |  |  |  | 
| 1060 | 0 |  |  |  |  |  | return $strVec; | 
| 1061 |  |  |  |  |  |  | } | 
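A standalone sketch of the seven orthographic checks (first-letter capital, single character, all caps, has digit, all digits, hyphen, punctuation), each contributing one 0/1 flag. The regexes here are assumed equivalents written for a self-contained example, not copies of the module's patterns.

```perl
#!/usr/bin/perl
# Sketch of orthographic binary features: one 0/1 flag per surface-form test.
use strict;
use warnings;

sub ortho_flags {
    my $word = shift;
    my @flags = (
        $word =~ /^[A-Z]\w+/      ? 1 : 0,   # first letter capitalized
        length($word) == 1        ? 1 : 0,   # single character
        $word =~ /^[A-Z]+$/       ? 1 : 0,   # all capital letters
        $word =~ /[0-9]/          ? 1 : 0,   # contains a digit
        $word =~ /^[0-9]+$/       ? 1 : 0,   # all digits
        $word =~ /-/              ? 1 : 0,   # contains a hyphen
        $word =~ /[^a-zA-Z0-9\s]/ ? 1 : 0,   # contains punctuation
    );
    return join(", ", @flags) . ", ";
}

print ortho_flags("Gold"),  "\n";   # 1, 0, 0, 0, 0, 0, 0,
print ortho_flags("NP-40"), "\n";   # 1, 0, 0, 1, 0, 1, 1,
```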
| 1062 |  |  |  |  |  |  |  | 
| 1063 |  |  |  |  |  |  | #makes the morphological based part of the vector | 
| 1064 |  |  |  |  |  |  | # input  : $word     	    <-- the word to analyze | 
| 1065 |  |  |  |  |  |  | #		 : @attrs 			<-- the set of morphological attributes to use | 
| 1066 |  |  |  |  |  |  | # output : $strVec			<-- the morphological vector string | 
| 1067 |  |  |  |  |  |  | sub morphVec{ | 
| 1068 | 0 |  |  | 0 | 0 |  | my $word = shift; | 
| 1069 | 0 |  |  |  |  |  | my $attrs_ref = shift; | 
| 1070 | 0 |  |  |  |  |  | my @attrs = @$attrs_ref; | 
| 1071 |  |  |  |  |  |  |  | 
| 1072 | 0 |  |  |  |  |  | my $strVec = ""; | 
| 1073 |  |  |  |  |  |  |  | 
| 1074 | 0 |  |  |  |  |  | my $preWord = substr($word, 0, $prefix); | 
| 1075 | 0 |  |  |  |  |  | my $sufWord = substr($word, -$suffix); | 
| 1076 |  |  |  |  |  |  |  | 
| 1077 | 0 |  |  |  |  |  | foreach my $a (@attrs){ | 
| 1078 | 0 | 0 |  |  |  |  | if($a eq $preWord){ | 
|  |  | 0 |  |  |  |  |  | 
| 1079 | 0 |  |  |  |  |  | $strVec .= "1, "; | 
| 1080 |  |  |  |  |  |  | }elsif($a eq $sufWord){ | 
| 1081 | 0 |  |  |  |  |  | $strVec .= "1, "; | 
| 1082 |  |  |  |  |  |  | }else{ | 
| 1083 | 0 |  |  |  |  |  | $strVec .= "0, "; | 
| 1084 |  |  |  |  |  |  | } | 
| 1085 |  |  |  |  |  |  | } | 
| 1086 |  |  |  |  |  |  |  | 
| 1087 | 0 |  |  |  |  |  | return $strVec; | 
| 1088 |  |  |  |  |  |  |  | 
| 1089 |  |  |  |  |  |  | } | 
| 1090 |  |  |  |  |  |  |  | 
| 1091 |  |  |  |  |  |  | #makes the text based part of the vector | 
| 1092 |  |  |  |  |  |  | # input  : $w     	    	<-- the word to analyze | 
| 1093 |  |  |  |  |  |  | #        : $pw     	    	<-- the previous word | 
| 1094 |  |  |  |  |  |  | #        : $nw     	    	<-- the next word | 
| 1095 |  |  |  |  |  |  | #		 : @attrbts 		<-- the set of text attributes to use | 
| 1096 |  |  |  |  |  |  | # output : $strVec			<-- the text vector string | 
| 1097 |  |  |  |  |  |  | sub textVec{ | 
| 1098 | 0 |  |  | 0 | 0 |  | my $w = shift; | 
| 1099 | 0 |  |  |  |  |  | my $pw = shift; | 
| 1100 | 0 |  |  |  |  |  | my $nw = shift; | 
| 1101 | 0 |  |  |  |  |  | my $at_ref = shift; | 
| 1102 | 0 |  |  |  |  |  | my @attrbts = @$at_ref; | 
| 1103 |  |  |  |  |  |  |  | 
| 1104 | 0 |  |  |  |  |  | my $strVec = ""; | 
| 1105 |  |  |  |  |  |  |  | 
| 1106 |  |  |  |  |  |  | #clean the words | 
| 1107 | 0 |  |  |  |  |  | $w = $uniSub->cleanWords($w); | 
| 1108 | 0 |  |  |  |  |  | $pw = $uniSub->cleanWords($pw); | 
| 1109 | 0 |  |  |  |  |  | $nw = $uniSub->cleanWords($nw); | 
| 1110 |  |  |  |  |  |  |  | 
| 1111 |  |  |  |  |  |  | #check if the word is the attribute or the words adjacent to it are the attribute | 
| 1112 | 0 |  |  |  |  |  | foreach my $a(@attrbts){ | 
| 1113 |  |  |  |  |  |  |  | 
| 1114 | 0 |  |  |  |  |  | my $pair = ""; | 
| 1115 | 0 | 0 |  |  |  |  | $pair .= ($w eq $a ? "1, " : "0, "); | 
| 1116 | 0 | 0 | 0 |  |  |  | $pair .= (($pw eq $a or $nw eq $a) ? "1, " : "0, "); | 
| 1117 | 0 |  |  |  |  |  | $strVec .= $pair; | 
| 1118 |  |  |  |  |  |  | } | 
| 1119 |  |  |  |  |  |  |  | 
| 1120 | 0 |  |  |  |  |  | return $strVec; | 
| 1121 |  |  |  |  |  |  | } | 
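The text features use a paired encoding: for every attribute word there is one bit for "the current word matches" and one for "the previous or next word matches". A minimal sketch of that pattern, with an invented attribute list and token window:

```perl
#!/usr/bin/perl
# Sketch of the paired text-feature encoding (current word vs. adjacent words).
use strict;
use warnings;

my @attrbts = qw(nanoparticle gold exposure);             # illustrative attributes
my ($pw, $w, $nw) = ("gold", "nanoparticle", "exposure"); # previous, current, next word

my $strVec = "";
foreach my $a (@attrbts) {
    $strVec .= ($w eq $a) ? "1, " : "0, ";                 # the word itself
    $strVec .= ($pw eq $a or $nw eq $a) ? "1, " : "0, ";   # an adjacent word
}

print "$strVec\n";   # 1, 0, 0, 1, 0, 1,
```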
| 1122 |  |  |  |  |  |  |  | 
| 1123 |  |  |  |  |  |  | #makes the part of speech based part of the vector | 
| 1124 |  |  |  |  |  |  | # input  : $w     	    	<-- the word to analyze | 
| 1125 |  |  |  |  |  |  | #        : $pw     	    	<-- the previous word | 
| 1126 |  |  |  |  |  |  | #        : $nw     	    	<-- the next word | 
| 1127 |  |  |  |  |  |  | #		 : @attrbts 		<-- the set of pos attributes to use | 
| 1128 |  |  |  |  |  |  | # output : $strVec			<-- the pos vector string | 
| 1129 |  |  |  |  |  |  | sub posVec{ | 
| 1130 | 0 |  |  | 0 | 0 |  | my $w = shift; | 
| 1131 | 0 |  |  |  |  |  | my $pw = shift; | 
| 1132 | 0 |  |  |  |  |  | my $nw = shift; | 
| 1133 | 0 |  |  |  |  |  | my $at_ref = shift; | 
| 1134 | 0 |  |  |  |  |  | my @attrbts = @$at_ref; | 
| 1135 |  |  |  |  |  |  |  | 
| 1136 |  |  |  |  |  |  | #clean the words | 
| 1137 | 0 |  |  |  |  |  | $w = $uniSub->cleanWords($w); | 
| 1138 | 0 |  |  |  |  |  | $pw = $uniSub->cleanWords($pw); | 
| 1139 | 0 |  |  |  |  |  | $nw = $uniSub->cleanWords($nw); | 
| 1140 |  |  |  |  |  |  |  | 
| 1141 |  |  |  |  |  |  | #alter the words to make them pos types | 
| 1142 | 0 |  |  |  |  |  | $w = getFeature($w, "pos"); | 
| 1143 | 0 |  |  |  |  |  | $pw = getFeature($pw, "pos"); | 
| 1144 | 0 |  |  |  |  |  | $nw = getFeature($nw, "pos"); | 
| 1145 |  |  |  |  |  |  |  | 
| 1146 | 0 |  |  |  |  |  | my $strVec = ""; | 
| 1147 |  |  |  |  |  |  |  | 
| 1148 |  |  |  |  |  |  | #check if the word is the attribute or the words adjacent to it are the attribute | 
| 1149 | 0 |  |  |  |  |  | foreach my $a(@attrbts){ | 
| 1150 | 0 |  |  |  |  |  | my $pair = ""; | 
| 1151 | 0 | 0 |  |  |  |  | $pair .= ($w eq $a ? "1, " : "0, "); | 
| 1152 | 0 | 0 | 0 |  |  |  | $pair .= (($pw eq $a or $nw eq $a) ? "1, " : "0, "); | 
| 1153 | 0 |  |  |  |  |  | $strVec .= $pair; | 
| 1154 |  |  |  |  |  |  | } | 
| 1155 |  |  |  |  |  |  |  | 
| 1156 | 0 |  |  |  |  |  | return $strVec; | 
| 1157 |  |  |  |  |  |  | } | 
| 1158 |  |  |  |  |  |  |  | 
| 1159 |  |  |  |  |  |  | #makes the cui based part of the vector | 
| 1160 |  |  |  |  |  |  | # input  : $w     	    	<-- the word to analyze | 
| 1161 |  |  |  |  |  |  | #        : $pw     	    	<-- the previous word | 
| 1162 |  |  |  |  |  |  | #        : $nw     	    	<-- the next word | 
| 1163 |  |  |  |  |  |  | #		 : @attrbts 		<-- the set of cui attributes to use | 
| 1164 |  |  |  |  |  |  | # output : $strVec			<-- the cui vector string | 
| 1165 |  |  |  |  |  |  | sub cuiVec{ | 
| 1166 | 0 |  |  | 0 | 0 |  | my $w = shift; | 
| 1167 | 0 |  |  |  |  |  | my $pw = shift; | 
| 1168 | 0 |  |  |  |  |  | my $nw = shift; | 
| 1169 | 0 |  |  |  |  |  | my $at_ref = shift; | 
| 1170 | 0 |  |  |  |  |  | my @attrbts = @$at_ref; | 
| 1171 |  |  |  |  |  |  |  | 
| 1172 |  |  |  |  |  |  | #clean the words | 
| 1173 | 0 |  |  |  |  |  | $w = $uniSub->cleanWords($w); | 
| 1174 | 0 |  |  |  |  |  | $pw = $uniSub->cleanWords($pw); | 
| 1175 | 0 |  |  |  |  |  | $nw = $uniSub->cleanWords($nw); | 
| 1176 |  |  |  |  |  |  |  | 
| 1177 |  |  |  |  |  |  | #alter the words to make them cui types | 
| 1178 | 0 |  |  |  |  |  | my @wArr = getFeature($w, "cui"); | 
| 1179 | 0 |  |  |  |  |  | my @pwArr = getFeature($pw, "cui"); | 
| 1180 | 0 |  |  |  |  |  | my @nwArr = getFeature($nw, "cui"); | 
| 1181 |  |  |  |  |  |  |  | 
| 1182 | 0 |  |  |  |  |  | my $strVec = ""; | 
| 1183 |  |  |  |  |  |  | #check if the word is the attribute or the words adjacent to it are the attribute | 
| 1184 | 0 |  |  |  |  |  | foreach my $a(@attrbts){ | 
| 1185 | 0 |  |  |  |  |  | my $pair = ""; | 
| 1186 | 0 | 0 |  |  |  |  | $pair .= ($uniSub->inArr($a, \@wArr) ? "1, " : "0, "); | 
| 1187 | 0 | 0 | 0 |  |  |  | $pair .= (($uniSub->inArr($a, \@pwArr) or $uniSub->inArr($a, \@nwArr)) ? "1, " : "0, "); | 
| 1188 | 0 |  |  |  |  |  | $strVec .= $pair; | 
| 1189 |  |  |  |  |  |  | } | 
| 1190 |  |  |  |  |  |  |  | 
| 1191 | 0 |  |  |  |  |  | return $strVec; | 
| 1192 |  |  |  |  |  |  | } | 
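The CUI (and semantic-type) variant differs from the text version in that a word can map to several concept identifiers, so each attribute is tested by list membership rather than string equality. In this sketch a plain grep stands in for the module's $uniSub->inArr helper, and the CUI values are placeholders:

```perl
#!/usr/bin/perl
# Sketch of membership-based (multi-valued) CUI features.
use strict;
use warnings;

sub in_arr { my ($x, $arr) = @_; return grep { $_ eq $x } @$arr; }   # stands in for inArr

my @attrbts = qw(C0001 C0002 C0003);   # illustrative CUI attributes
my @wCuis   = qw(C0002);               # CUIs of the current word
my @pwCuis  = qw(C0001);               # CUIs of the previous word
my @nwCuis  = ();                      # CUIs of the next word

my $strVec = "";
foreach my $a (@attrbts) {
    $strVec .= in_arr($a, \@wCuis) ? "1, " : "0, ";
    $strVec .= (in_arr($a, \@pwCuis) or in_arr($a, \@nwCuis)) ? "1, " : "0, ";
}

print "$strVec\n";   # 0, 1, 1, 0, 0, 0,
```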
| 1193 |  |  |  |  |  |  |  | 
| 1194 |  |  |  |  |  |  | #makes the semantic based part of the vector | 
| 1195 |  |  |  |  |  |  | # input  : $w     	    	<-- the word to analyze | 
| 1196 |  |  |  |  |  |  | #        : $pw     	    	<-- the previous word | 
| 1197 |  |  |  |  |  |  | #        : $nw     	    	<-- the next word | 
| 1198 |  |  |  |  |  |  | #		 : @attrbts 		<-- the set of sem attributes to use | 
| 1199 |  |  |  |  |  |  | # output : $strVec			<-- the sem vector string | 
| 1200 |  |  |  |  |  |  | sub semVec{ | 
| 1201 | 0 |  |  | 0 | 0 |  | my $w = shift; | 
| 1202 | 0 |  |  |  |  |  | my $pw = shift; | 
| 1203 | 0 |  |  |  |  |  | my $nw = shift; | 
| 1204 | 0 |  |  |  |  |  | my $at_ref = shift; | 
| 1205 | 0 |  |  |  |  |  | my @attrbts = @$at_ref; | 
| 1206 |  |  |  |  |  |  |  | 
| 1207 |  |  |  |  |  |  | #clean the words | 
| 1208 | 0 |  |  |  |  |  | $w = $uniSub->cleanWords($w); | 
| 1209 | 0 |  |  |  |  |  | $pw = $uniSub->cleanWords($pw); | 
| 1210 | 0 |  |  |  |  |  | $nw = $uniSub->cleanWords($nw); | 
| 1211 |  |  |  |  |  |  |  | 
| 1212 |  |  |  |  |  |  | #alter the words to make them sem types | 
| 1213 | 0 |  |  |  |  |  | my @wArr = getFeature($w, "sem"); | 
| 1214 | 0 |  |  |  |  |  | my @pwArr = getFeature($pw, "sem"); | 
| 1215 | 0 |  |  |  |  |  | my @nwArr = getFeature($nw, "sem"); | 
| 1216 |  |  |  |  |  |  |  | 
| 1217 | 0 |  |  |  |  |  | my $strVec = ""; | 
| 1218 |  |  |  |  |  |  |  | 
| 1219 |  |  |  |  |  |  | #check if the word is the attribute or the words adjacent to it are the attribute | 
| 1220 | 0 |  |  |  |  |  | foreach my $a(@attrbts){ | 
| 1221 |  |  |  |  |  |  | #lowercase the attribute to undo the uppercase semantic-type label | 
| 1222 | 0 |  |  |  |  |  | $a = lc($a); | 
| 1223 |  |  |  |  |  |  |  | 
| 1224 | 0 |  |  |  |  |  | my $pair = ""; | 
| 1225 | 0 | 0 |  |  |  |  | $pair .= ($uniSub->inArr($a, \@wArr) ? "1, " : "0, "); | 
| 1226 | 0 | 0 | 0 |  |  |  | $pair .= (($uniSub->inArr($a, \@pwArr) or $uniSub->inArr($a, \@nwArr)) ? "1, " : "0, "); | 
| 1227 | 0 |  |  |  |  |  | $strVec .= $pair; | 
| 1228 |  |  |  |  |  |  | } | 
| 1229 | 0 |  |  |  |  |  | return $strVec; | 
| 1230 |  |  |  |  |  |  | } | 
| 1231 |  |  |  |  |  |  |  | 
| 1232 |  |  |  |  |  |  | #converts a binary vector to a sparse vector | 
| 1233 |  |  |  |  |  |  | sub convert2Sparse{ | 
| 1234 | 0 |  |  | 0 | 0 |  | my $bin_vec = shift; | 
| 1235 | 0 |  |  |  |  |  | my @vals = split(", ", $bin_vec); | 
| 1236 | 0 |  |  |  |  |  | my $numVals = @vals; | 
| 1237 |  |  |  |  |  |  |  | 
| 1238 | 0 |  |  |  |  |  | my $sparse_vec = "{"; | 
| 1239 | 0 |  |  |  |  |  | for(my $c=0;$c<$numVals;$c++){ | 
| 1240 | 0 |  |  |  |  |  | my $curVal = $vals[$c]; | 
| 1241 |  |  |  |  |  |  |  | 
| 1242 | 0 | 0 |  |  |  |  | if(($curVal eq "1")){ | 
| 1243 | 0 |  |  |  |  |  | $sparse_vec .= "$c $curVal, "; | 
| 1244 |  |  |  |  |  |  | #$uniSub->printColorDebug("red", "$c $curVal, "); | 
| 1245 |  |  |  |  |  |  | } | 
| 1246 |  |  |  |  |  |  | } | 
| 1247 | 0 |  |  |  |  |  | $sparse_vec .= "$numVals, "; | 
| 1248 |  |  |  |  |  |  |  | 
| 1249 | 0 |  |  |  |  |  | return $sparse_vec; | 
| 1250 |  |  |  |  |  |  | } | 
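A self-contained sketch of the dense-to-sparse conversion: only attributes whose value is 1 are written as "index value" pairs inside braces, which is the ARFF sparse instance format. Note that the module itself leaves the closing brace to the caller so the class value can be appended first; the helper below (to_sparse) is illustrative and closes the brace itself.

```perl
#!/usr/bin/perl
# Sketch: convert a dense comma-separated 0/1 vector into ARFF sparse notation.
use strict;
use warnings;

sub to_sparse {
    my $bin_vec = shift;
    my @vals    = split(", ", $bin_vec);
    my @pairs;
    for my $i (0 .. $#vals) {
        push @pairs, "$i $vals[$i]" if $vals[$i] eq "1";   # keep non-zero entries only
    }
    return "{" . join(", ", @pairs) . "}";
}

print to_sparse("0, 1, 0, 0, 1, 1"), "\n";   # {1 1, 4 1, 5 1}
```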
| 1251 |  |  |  |  |  |  |  | 
| 1252 |  |  |  |  |  |  |  | 
| 1253 |  |  |  |  |  |  | ######################               ATTRIBUTE BASED METHODS              ##################### | 
| 1254 |  |  |  |  |  |  |  | 
| 1255 |  |  |  |  |  |  | #gets the attributes based on the feature type | 
| 1256 |  |  |  |  |  |  | # input  : $name, $feature 	<-- the name of the file being processed and the feature type [e.g. ortho, morph, text] | 
| 1257 |  |  |  |  |  |  | #		 : %buckets 		<-- the bucket key set | 
| 1258 |  |  |  |  |  |  | # output : %vecARFFattr		<-- the vector set of attributes and arff set of attributes | 
| 1259 |  |  |  |  |  |  | sub grabAttr{ | 
| 1260 | 0 |  |  | 0 | 0 |  | my $name = shift; | 
| 1261 | 0 |  |  |  |  |  | my $feature = shift; | 
| 1262 | 0 |  |  |  |  |  | my $buckets_ref = shift; | 
| 1263 | 0 |  |  |  |  |  | my %buckets = %$buckets_ref; | 
| 1264 |  |  |  |  |  |  |  | 
| 1265 | 0 |  |  |  |  |  | my %vecARFFattr = (); | 
| 1266 | 0 | 0 |  |  |  |  | if($feature eq "ortho"){ | 
|  |  | 0 |  |  |  |  |  | 
| 1267 | 0 |  |  |  |  |  | my @vecSet = (); | 
| 1268 | 0 |  |  |  |  |  | my @arffSet = ("first_letter_capital", | 
| 1269 |  |  |  |  |  |  | "single_character", | 
| 1270 |  |  |  |  |  |  | "all_capital", | 
| 1271 |  |  |  |  |  |  | "has_digit", | 
| 1272 |  |  |  |  |  |  | "all_digit", | 
| 1273 |  |  |  |  |  |  | "has_hyphen", | 
| 1274 |  |  |  |  |  |  | "has_punctuation"); | 
| 1275 | 0 |  |  |  |  |  | $vecARFFattr{vector} = \@vecSet; | 
| 1276 | 0 |  |  |  |  |  | $vecARFFattr{arff} = \@arffSet; | 
| 1277 | 0 |  |  |  |  |  | return %vecARFFattr; | 
| 1278 |  |  |  |  |  |  | }elsif($feature eq "morph"){ | 
| 1279 | 0 |  |  |  |  |  | my %bucketAttr = (); | 
| 1280 | 0 |  |  |  |  |  | my %bucketAttrARFF = (); | 
| 1281 |  |  |  |  |  |  |  | 
| 1282 |  |  |  |  |  |  | #get the attributes for each bucket | 
| 1283 | 0 |  |  |  |  |  | foreach my $testBucket (@allBuckets){ | 
| 1284 | 0 |  |  |  |  |  | my @range = $uniSub->bully($bucketsNum, $testBucket); | 
| 1285 | 0 |  |  |  |  |  | $uniSub->printDebug("\t\t$name BUCKET #$testBucket/$feature MORPHO attributes...\n"); | 
| 1286 |  |  |  |  |  |  |  | 
| 1287 |  |  |  |  |  |  | #get attributes [ unique and deluxe ] | 
| 1288 | 0 |  |  |  |  |  | my @attr = getMorphoAttributes(\@range, \%buckets); | 
| 1289 | 0 |  |  |  |  |  | @attr = uniq(@attr);						#make unique forms | 
| 1290 | 0 |  |  |  |  |  | $bucketAttr{$testBucket} = \@attr; | 
| 1291 |  |  |  |  |  |  |  | 
| 1292 | 0 |  |  |  |  |  | my @attrARFF = @attr; | 
| 1293 | 0 |  |  |  |  |  | foreach my $a(@attrARFF){$a .= $morphID;} | 
|  | 0 |  |  |  |  |  |  | 
| 1294 | 0 |  |  |  |  |  | $bucketAttrARFF{$testBucket} = \@attrARFF; | 
| 1295 |  |  |  |  |  |  | } | 
| 1296 |  |  |  |  |  |  |  | 
| 1297 |  |  |  |  |  |  | #add to overall | 
| 1298 | 0 |  |  |  |  |  | $vecARFFattr{vector} = \%bucketAttr; | 
| 1299 | 0 |  |  |  |  |  | $vecARFFattr{arff} = \%bucketAttrARFF; | 
| 1300 |  |  |  |  |  |  |  | 
| 1301 | 0 |  |  |  |  |  | return %vecARFFattr; | 
| 1302 |  |  |  |  |  |  | }else{ | 
| 1303 | 0 |  |  |  |  |  | my %bucketAttr = (); | 
| 1304 | 0 |  |  |  |  |  | my %bucketAttrARFF = (); | 
| 1305 |  |  |  |  |  |  |  | 
| 1306 |  |  |  |  |  |  | #get the attributes for each bucket | 
| 1307 | 0 |  |  |  |  |  | foreach my $testBucket (@allBuckets){ | 
| 1308 | 0 |  |  |  |  |  | my @range = $uniSub->bully($bucketsNum, $testBucket); | 
| 1309 | 0 |  |  |  |  |  | $uniSub->printDebug("\t\t$name BUCKET #$testBucket/$feature attributes...\n"); | 
| 1310 |  |  |  |  |  |  |  | 
| 1311 |  |  |  |  |  |  | #get attributes [ unique and deluxe ] | 
| 1312 | 0 |  |  |  |  |  | my @attr = getRangeAttributes($feature, \@range, \%buckets); | 
| 1313 | 0 |  |  |  |  |  | @attr = uniq(@attr);						#make unique forms | 
| 1314 | 0 |  |  |  |  |  | $bucketAttr{$testBucket} = \@attr; | 
| 1315 |  |  |  |  |  |  |  | 
| 1316 | 0 |  |  |  |  |  | my @attrARFF = getAttrDelux($feature, \@attr); | 
| 1317 | 0 |  |  |  |  |  | $bucketAttrARFF{$testBucket} = \@attrARFF; | 
| 1318 |  |  |  |  |  |  | } | 
| 1319 |  |  |  |  |  |  |  | 
| 1320 |  |  |  |  |  |  | #add to overall | 
| 1321 | 0 |  |  |  |  |  | $vecARFFattr{vector} = \%bucketAttr; | 
| 1322 | 0 |  |  |  |  |  | $vecARFFattr{arff} = \%bucketAttrARFF; | 
| 1323 |  |  |  |  |  |  |  | 
| 1324 | 0 |  |  |  |  |  | return %vecARFFattr; | 
| 1325 |  |  |  |  |  |  | } | 
| 1326 |  |  |  |  |  |  | } | 
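For the non-ortho features, grabAttr collects a separate attribute list per test bucket so that attributes always come from the training buckets only. The exact behavior of $uniSub->bully is not shown in this section; the sketch below assumes it returns "all buckets except the test bucket" and uses a plain grep in its place, with placeholder bucket contents.

```perl
#!/usr/bin/perl
# Sketch of leave-one-bucket-out attribute collection for k-fold cross validation.
use strict;
use warnings;
use List::MoreUtils qw(uniq);

# buckets of training tokens (placeholder data)
my %buckets = (
    1 => [qw(gold exposure gold)],
    2 => [qw(nanoparticle dose)],
    3 => [qw(exposure silver)],
);

my %bucketAttr;
foreach my $testBucket (sort keys %buckets) {
    my @range = grep { $_ != $testBucket } sort keys %buckets;   # training buckets only
    my @attr  = map { @{ $buckets{$_} } } @range;                # pool their tokens
    $bucketAttr{$testBucket} = [ uniq(@attr) ];                  # unique attribute list
}

foreach my $b (sort keys %bucketAttr) {
    print "bucket $b trains on: @{ $bucketAttr{$b} }\n";
}
```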
| 1327 |  |  |  |  |  |  |  | 
| 1328 |  |  |  |  |  |  |  | 
| 1329 |  |  |  |  |  |  |  | 
| 1330 |  |  |  |  |  |  | #makes an array with unique elements | 
| 1331 |  |  |  |  |  |  | # input  : @orig_arr <-- the original array w/ repeats | 
| 1332 |  |  |  |  |  |  | # output : @new_arr  <-- same array but w/o repeats | 
| 1333 |  |  |  |  |  |  | sub makeUniq{ | 
| 1334 | 0 |  |  | 0 | 0 |  | my $orig_arr_ref = shift; | 
| 1335 | 0 |  |  |  |  |  | my @orig_arr = @$orig_arr_ref; | 
| 1336 |  |  |  |  |  |  |  | 
| 1337 | 0 |  |  |  |  |  | my @new_arr = (); | 
| 1338 | 0 |  |  |  |  |  | foreach my $t (@orig_arr){ | 
| 1339 | 0 | 0 | 0 |  |  |  | unless($uniSub->inArr($t, \@new_arr) or $t =~/\s+/o or $t =~/\b$entId\b/o or length($t) == 0){ | 
|  |  |  | 0 |  |  |  |  | 
|  |  |  | 0 |  |  |  |  | 
| 1340 | 0 |  |  |  |  |  | push @new_arr, $t; | 
| 1341 |  |  |  |  |  |  | } | 
| 1342 |  |  |  |  |  |  | } | 
| 1343 | 0 |  |  |  |  |  | @new_arr = grep { $_ ne '' } @new_arr; | 
|  | 0 |  |  |  |  |  |  | 
| 1344 | 0 |  |  |  |  |  | return @new_arr; | 
| 1345 |  |  |  |  |  |  | } | 
| 1346 |  |  |  |  |  |  |  | 
| 1347 |  |  |  |  |  |  |  | 
| 1348 |  |  |  |  |  |  | #returns the attribute values of a range of buckets | 
| 1349 |  |  |  |  |  |  | # input  : $type     	    <-- the feature type [e.g. ortho, morph, text] | 
| 1350 |  |  |  |  |  |  | #		 : @bucketRange     <-- the range of the buckets to use [e.g.(1-8,10) out of 10 buckets; use "$uniSub->bully" subroutine in UniversalRoutines.pm] | 
| 1351 |  |  |  |  |  |  | #		 : %buckets 		<-- the bucket key set | 
| 1352 |  |  |  |  |  |  | # output : @attributes		<-- the set of attributes for the specific type and range | 
| 1353 |  |  |  |  |  |  | sub getRangeAttributes{ | 
| 1354 | 0 |  |  | 0 | 0 |  | my $type = shift; | 
| 1355 | 0 |  |  |  |  |  | my $bucketRange_ref = shift; | 
| 1356 | 0 |  |  |  |  |  | my $buckets_ref = shift; | 
| 1357 | 0 |  |  |  |  |  | my @bucketRange = @$bucketRange_ref; | 
| 1358 | 0 |  |  |  |  |  | my %buckets = %$buckets_ref; | 
| 1359 |  |  |  |  |  |  |  | 
| 1360 |  |  |  |  |  |  | #collect all the necessary keys | 
| 1361 | 0 |  |  |  |  |  | my @keyRing = (); | 
| 1362 | 0 |  |  |  |  |  | foreach my $bucket (sort { $a <=> $b } keys %buckets){ | 
|  | 0 |  |  |  |  |  |  | 
| 1363 | 0 | 0 |  |  |  |  | if($uniSub->inArr($bucket, \@bucketRange)){ | 
| 1364 | 0 |  |  |  |  |  | my @keys = @{$buckets{$bucket}}; | 
|  | 0 |  |  |  |  |  |  | 
| 1365 | 0 |  |  |  |  |  | push @keyRing, @keys; | 
| 1366 |  |  |  |  |  |  | } | 
| 1367 |  |  |  |  |  |  | } | 
| 1368 |  |  |  |  |  |  |  | 
| 1369 |  |  |  |  |  |  | #get the tokens for each associated key | 
| 1370 | 0 |  |  |  |  |  | my @bucketTokens = (); | 
| 1371 | 0 |  |  |  |  |  | foreach my $key (@keyRing){ | 
| 1372 | 0 |  |  |  |  |  | push @bucketTokens, @{$tokenHash{$key}}; | 
|  | 0 |  |  |  |  |  |  | 
| 1373 |  |  |  |  |  |  | } | 
| 1374 |  |  |  |  |  |  |  | 
| 1375 |  |  |  |  |  |  | #get the concepts for each associated key | 
| 1376 | 0 |  |  |  |  |  | my @bucketConcepts = (); | 
| 1377 | 0 | 0 |  |  |  |  | if($type eq "sem"){ | 
|  |  | 0 |  |  |  |  |  | 
| 1378 | 0 |  |  |  |  |  | foreach my $key (@keyRing){ | 
| 1379 | 0 |  |  |  |  |  | push @bucketConcepts, $semHash{$key}; | 
| 1380 |  |  |  |  |  |  | } | 
| 1381 |  |  |  |  |  |  | }elsif($type eq "cui"){ | 
| 1382 | 0 |  |  |  |  |  | foreach my $key (@keyRing){ | 
| 1383 | 0 |  |  |  |  |  | push @bucketConcepts, $cuiHash{$key}; | 
| 1384 |  |  |  |  |  |  | } | 
| 1385 |  |  |  |  |  |  | } | 
| 1386 |  |  |  |  |  |  |  | 
| 1387 |  |  |  |  |  |  |  | 
| 1388 |  |  |  |  |  |  | #get the relevant values from the tokens and concepts | 
| 1389 | 0 |  |  |  |  |  | my @attributes = (); | 
| 1390 | 0 | 0 | 0 |  |  |  | if($type eq "text" or $type eq "pos"){					#get the text attributes | 
|  |  | 0 | 0 |  |  |  |  | 
| 1391 | 0 |  |  |  |  |  | my @tokenWords = (); | 
| 1392 | 0 |  |  |  |  |  | foreach my $token(@bucketTokens){ | 
| 1393 | 0 |  |  |  |  |  | my $tokenText = $token->{text}; | 
| 1394 |  |  |  |  |  |  |  | 
| 1395 |  |  |  |  |  |  | #add to the tokens | 
| 1396 | 0 | 0 | 0 |  |  |  | if($tokenText =~ /\w+\s\w+/o){ | 
|  |  | 0 | 0 |  |  |  |  | 
| 1397 | 0 |  |  |  |  |  | my @tokenText2 = split(" ", $tokenText); | 
| 1398 | 0 |  |  |  |  |  | push @tokenWords, @tokenText2; | 
| 1399 |  |  |  |  |  |  | }elsif($tokenText ne "." and $tokenText ne "-" and !($tokenText =~ /[^a-zA-Z0-9]/)){ | 
| 1400 | 0 |  |  |  |  |  | push @tokenWords, $tokenText; | 
| 1401 |  |  |  |  |  |  | } | 
| 1402 |  |  |  |  |  |  |  | 
| 1403 |  |  |  |  |  |  | #clean up the text | 
| 1404 | 0 |  |  |  |  |  | foreach my $toky(@tokenWords){ | 
| 1405 | 0 |  |  |  |  |  | $toky = cleanToken($toky); | 
| 1406 |  |  |  |  |  |  | } | 
| 1407 |  |  |  |  |  |  |  | 
| 1408 |  |  |  |  |  |  | } | 
| 1409 |  |  |  |  |  |  | #gets the tokens for the attributes and vector analysis | 
| 1410 | 0 | 0 |  |  |  |  | if($type eq "text"){ | 
|  |  | 0 |  |  |  |  |  | 
| 1411 | 0 |  |  |  |  |  | @attributes = @tokenWords; | 
| 1412 |  |  |  |  |  |  | }elsif($type eq "pos"){ | 
| 1413 | 0 |  |  |  |  |  | @attributes = getTokensPOS(\@bucketTokens, \@tokenWords, \@keyRing); | 
| 1414 |  |  |  |  |  |  | } | 
| 1415 |  |  |  |  |  |  |  | 
| 1416 |  |  |  |  |  |  | } | 
| 1417 |  |  |  |  |  |  | #get the concept-based attributes | 
| 1418 |  |  |  |  |  |  | elsif($type eq "sem" or $type eq "cui"){ | 
| 1419 | 0 |  |  |  |  |  | my @conWords = (); | 
| 1420 | 0 |  |  |  |  |  | foreach my $conFeat(@bucketConcepts){ | 
| 1421 | 0 |  |  |  |  |  | my @conLine = split / /, $conFeat; | 
| 1422 | 0 |  |  |  |  |  | push @conWords, @conLine; | 
| 1423 |  |  |  |  |  |  | } | 
| 1424 | 0 |  |  |  |  |  | @attributes = uniq (@conWords); | 
| 1425 |  |  |  |  |  |  |  | 
| 1426 |  |  |  |  |  |  | #uppercase the semantic types to differentiate them from other attributes | 
| 1427 | 0 | 0 |  |  |  |  | if($type eq "sem"){ | 
| 1428 | 0 |  |  |  |  |  | foreach my $a (@attributes){$a = uc($a);} | 
|  | 0 |  |  |  |  |  |  | 
| 1429 |  |  |  |  |  |  | } | 
| 1430 |  |  |  |  |  |  |  | 
| 1431 |  |  |  |  |  |  | } | 
| 1432 |  |  |  |  |  |  | #my $a = @attributes; | 
| 1433 |  |  |  |  |  |  | #$uniSub->printColorDebug("red", "$type ATTR: #$a\n"); | 
| 1434 |  |  |  |  |  |  | #printArr("\n", @attributes); | 
| 1435 |  |  |  |  |  |  |  | 
| 1436 | 0 |  |  |  |  |  | return @attributes; | 
| 1437 |  |  |  |  |  |  | } | 
| 1438 |  |  |  |  |  |  |  | 
| 1439 |  |  |  |  |  |  | #makes the arff version attributes - makes a copy of each attribute but with "_self" at the end | 
| 1440 |  |  |  |  |  |  | # input  : $f     			<-- the feature type (used for special features like POS and morph) | 
| 1441 |  |  |  |  |  |  | #		 : @attrs 		    <-- the attributes to ready for arff output | 
| 1442 |  |  |  |  |  |  | # output : @attrDelux		<-- the delux-arff attribute set | 
| 1443 |  |  |  |  |  |  | sub getAttrDelux{ | 
| 1444 | 0 |  |  | 0 | 0 |  | my $f = shift; | 
| 1445 | 0 |  |  |  |  |  | my $attr_ref = shift; | 
| 1446 | 0 |  |  |  |  |  | my @attr = @$attr_ref; | 
| 1447 |  |  |  |  |  |  |  | 
| 1448 |  |  |  |  |  |  | #add the _self copy | 
| 1449 | 0 |  |  |  |  |  | my @attrDelux = (); | 
| 1450 | 0 |  |  |  |  |  | foreach my $word (@attr){ | 
| 1451 |  |  |  |  |  |  | #check if it is a special feature type (POS attributes get a suffix) | 
| 1452 | 0 | 0 |  |  |  |  | if($f eq "pos"){ | 
| 1453 | 0 |  |  |  |  |  | $word = ($word . "_POS"); | 
| 1454 |  |  |  |  |  |  | } | 
| 1455 | 0 |  |  |  |  |  | $word =~s/$entId//g; | 
| 1456 |  |  |  |  |  |  |  | 
| 1457 |  |  |  |  |  |  | #add the copy and then the original | 
| 1458 | 0 |  |  |  |  |  | my $copy = "$word" . "$selfId"; | 
| 1459 | 0 | 0 |  |  |  |  | if(!$uniSub->inArr($word, \@attrDelux)){ | 
| 1460 | 0 |  |  |  |  |  | push (@attrDelux, $copy); | 
| 1461 | 0 |  |  |  |  |  | push(@attrDelux, $word); | 
| 1462 |  |  |  |  |  |  | } | 
| 1463 |  |  |  |  |  |  | } | 
| 1464 | 0 |  |  |  |  |  | return @attrDelux; | 
| 1465 |  |  |  |  |  |  | } | 
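A short sketch of the "_self" doubling used for the ARFF header: every attribute appears twice, once for "the word itself" and once for "an adjacent word", and POS attributes additionally get a "_POS" suffix so they cannot collide with text attributes. The attr_delux helper and sample attribute lists below are illustrative, not the module's code.

```perl
#!/usr/bin/perl
# Sketch of ARFF attribute doubling: a "_self" copy next to the plain form.
use strict;
use warnings;

sub attr_delux {
    my ($feature, @attr) = @_;
    my @delux;
    foreach my $word (@attr) {
        $word .= "_POS" if $feature eq "pos";       # mark POS attributes
        next if grep { $_ eq $word } @delux;        # skip duplicates
        push @delux, $word . "_self", $word;        # copy first, then the original
    }
    return @delux;
}

print join(", ", attr_delux("text", qw(gold dose))), "\n";
# gold_self, gold, dose_self, dose
print join(", ", attr_delux("pos",  qw(NN JJ))), "\n";
# NN_POS_self, NN_POS, JJ_POS_self, JJ_POS
```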
| 1466 |  |  |  |  |  |  |  | 
| 1467 |  |  |  |  |  |  | #returns the lines from a range of buckets | 
| 1468 |  |  |  |  |  |  | # input  : $type     	    <-- the feature type [e.g. ortho, morph, text] | 
| 1469 |  |  |  |  |  |  | #		 : @bucketRange     <-- the range of the buckets to use [e.g.(1-8,10) out of 10 buckets; use "$uniSub->bully" subroutine in UniversalRoutines.pm] | 
| 1470 |  |  |  |  |  |  | #		 : %buckets 		<-- the bucket key set | 
| 1471 |  |  |  |  |  |  | # output : @bucketLines		<-- the lines for the specific type and bucket keys based on the range | 
| 1472 |  |  |  |  |  |  | sub getRangeLines{ | 
| 1473 | 0 |  |  | 0 | 0 |  | my $type = shift; | 
| 1474 | 0 |  |  |  |  |  | my $bucketRange_ref = shift; | 
| 1475 | 0 |  |  |  |  |  | my $buckets_ref = shift; | 
| 1476 | 0 |  |  |  |  |  | my @bucketRange = @$bucketRange_ref; | 
| 1477 | 0 |  |  |  |  |  | my %buckets = %$buckets_ref; | 
| 1478 |  |  |  |  |  |  |  | 
| 1479 |  |  |  |  |  |  | #collect all the necessary keys | 
| 1480 | 0 |  |  |  |  |  | my @keyRing = (); | 
| 1481 | 0 |  |  |  |  |  | foreach my $bucket (sort { $a <=> $b } keys %buckets){ | 
|  | 0 |  |  |  |  |  |  | 
| 1482 | 0 |  |  |  |  |  | my @bucKeys = @{$buckets{$bucket}}; | 
|  | 0 |  |  |  |  |  |  | 
| 1483 | 0 | 0 |  |  |  |  | if($uniSub->inArr($bucket, \@bucketRange)){ | 
| 1484 | 0 |  |  |  |  |  | push @keyRing, @bucKeys; | 
| 1485 |  |  |  |  |  |  | } | 
| 1486 |  |  |  |  |  |  | } | 
| 1487 |  |  |  |  |  |  |  | 
| 1488 | 0 |  |  |  |  |  | my @bucketLines = (); | 
| 1489 |  |  |  |  |  |  | #get the lines for each associated key | 
| 1490 | 0 | 0 |  |  |  |  | if($type eq "text"){ | 
|  |  | 0 |  |  |  |  |  | 
|  |  | 0 |  |  |  |  |  | 
|  |  | 0 |  |  |  |  |  | 
| 1491 |  |  |  |  |  |  | #[line based] | 
| 1492 | 0 |  |  |  |  |  | foreach my $key (@keyRing){ | 
| 1493 | 0 |  |  |  |  |  | my $line = $fileHash{$key}; | 
| 1494 | 0 |  |  |  |  |  | push @bucketLines, $line; | 
| 1495 |  |  |  |  |  |  | } | 
| 1496 |  |  |  |  |  |  | } | 
| 1497 |  |  |  |  |  |  | elsif($type eq "pos"){ | 
| 1498 | 0 |  |  |  |  |  | foreach my $key (@keyRing){ | 
| 1499 | 0 |  |  |  |  |  | my $line = $posHash{$key}; | 
| 1500 | 0 |  |  |  |  |  | push @bucketLines, $line; | 
| 1501 |  |  |  |  |  |  | } | 
| 1502 |  |  |  |  |  |  | } | 
| 1503 |  |  |  |  |  |  | elsif($type eq "sem"){ | 
| 1504 | 0 |  |  |  |  |  | foreach my $key (@keyRing){ | 
| 1505 | 0 |  |  |  |  |  | my $line = $semHash{$key}; | 
| 1506 | 0 |  |  |  |  |  | push @bucketLines, $line; | 
| 1507 |  |  |  |  |  |  | } | 
| 1508 |  |  |  |  |  |  | } | 
| 1509 |  |  |  |  |  |  | elsif($type eq "cui"){ | 
| 1510 | 0 |  |  |  |  |  | foreach my $key (@keyRing){ | 
| 1511 | 0 |  |  |  |  |  | my $line = $cuiHash{$key}; | 
| 1512 | 0 |  |  |  |  |  | push @bucketLines, $line; | 
| 1513 |  |  |  |  |  |  | } | 
| 1514 |  |  |  |  |  |  | } | 
| 1515 |  |  |  |  |  |  |  | 
| 1516 | 0 |  |  |  |  |  | return @bucketLines; | 
| 1517 |  |  |  |  |  |  | } | 
| 1518 |  |  |  |  |  |  | #looks at the prefix and suffix lengths and returns the prefix and suffix substrings of each word found in the bucket text set | 
| 1519 |  |  |  |  |  |  | # input  : @bucketRange     <-- the range of the buckets to use [e.g.(1-8,10) out of 10 buckets; use "$uniSub->bully" subroutine in UniversalRoutines.pm] | 
| 1520 |  |  |  |  |  |  | #		 : %buckets 		<-- the bucket key set | 
| 1521 |  |  |  |  |  |  | # output : @attributes		<-- the morphological attribute set | 
| 1522 |  |  |  |  |  |  | sub getMorphoAttributes{ | 
| 1523 | 0 |  |  | 0 | 0 |  | my $bucketRange_ref = shift; | 
| 1524 | 0 |  |  |  |  |  | my $buckets_ref = shift; | 
| 1525 | 0 |  |  |  |  |  | my @bucketRange = @$bucketRange_ref; | 
| 1526 | 0 |  |  |  |  |  | my %buckets = %$buckets_ref; | 
| 1527 |  |  |  |  |  |  |  | 
| 1528 |  |  |  |  |  |  | #collect all the necessary keys | 
| 1529 | 0 |  |  |  |  |  | my @keyRing = (); | 
| 1530 | 0 |  |  |  |  |  | foreach my $bucket (sort { $a <=> $b } keys %buckets){ | 
|  | 0 |  |  |  |  |  |  | 
| 1531 | 0 | 0 |  |  |  |  | if($uniSub->inArr($bucket, \@bucketRange)){ | 
| 1532 | 0 |  |  |  |  |  | my @keys = @{$buckets{$bucket}}; | 
|  | 0 |  |  |  |  |  |  | 
| 1533 | 0 |  |  |  |  |  | push @keyRing, @keys; | 
| 1534 |  |  |  |  |  |  | } | 
| 1535 |  |  |  |  |  |  | } | 
| 1536 |  |  |  |  |  |  |  | 
| 1537 | 0 |  |  |  |  |  | my @bucketLines = (); | 
| 1538 |  |  |  |  |  |  | #get the lines for each associated key | 
| 1539 | 0 |  |  |  |  |  | foreach my $key (@keyRing){ | 
| 1540 | 0 |  |  |  |  |  | my $line = $fileHash{$key}; | 
| 1541 | 0 |  |  |  |  |  | push @bucketLines, $line; | 
| 1542 |  |  |  |  |  |  | } | 
| 1543 |  |  |  |  |  |  |  | 
| 1544 |  |  |  |  |  |  | #get each word from each line | 
| 1545 | 0 |  |  |  |  |  | my @wordSet = (); | 
| 1546 | 0 |  |  |  |  |  | foreach my $line (@bucketLines){ | 
| 1547 | 0 |  |  |  |  |  | my @words = split(" ", $line); | 
| 1548 | 0 |  |  |  |  |  | push(@wordSet, @words); | 
| 1549 |  |  |  |  |  |  | } | 
| 1550 |  |  |  |  |  |  |  | 
| 1551 |  |  |  |  |  |  | #get the prefix and suffix from each word | 
| 1552 | 0 |  |  |  |  |  | my @attributes = (); | 
| 1553 | 0 |  |  |  |  |  | foreach my $word (@wordSet){ | 
| 1554 | 0 |  |  |  |  |  | $word =~s/$entId//g; | 
| 1555 | 0 |  |  |  |  |  | push(@attributes, substr($word, 0, $prefix));									#add the word's prefix | 
| 1556 | 0 |  |  |  |  |  | push(@attributes, substr($word, -$suffix));		#add the word's suffix | 
| 1557 |  |  |  |  |  |  | } | 
| 1558 |  |  |  |  |  |  |  | 
| 1559 |  |  |  |  |  |  | #my $a = @attributes; | 
| 1560 |  |  |  |  |  |  | #$uniSub->printColorDebug("red", "$type ATTR: #$a\n"); | 
| 1561 |  |  |  |  |  |  | #printArr("\n", @attributes); | 
| 1562 |  |  |  |  |  |  |  | 
| 1563 | 0 |  |  |  |  |  | return @attributes; | 
| 1564 |  |  |  |  |  |  | } | 
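A self-contained sketch of the morphological attribute extraction: every word in the training lines contributes its first $prefix and last $suffix characters (3 and 3 here, matching the module's defaults). The sample lines are made up.

```perl
#!/usr/bin/perl
# Sketch: collect prefix/suffix substrings of every training word as attributes.
use strict;
use warnings;
use List::MoreUtils qw(uniq);

my ($prefix, $suffix) = (3, 3);
my @lines = ("gold nanoparticles were injected", "silver exposure");

my @attributes;
foreach my $line (@lines) {
    foreach my $word (split " ", $line) {
        push @attributes, substr($word, 0, $prefix);   # word prefix
        push @attributes, substr($word, -$suffix);     # word suffix
    }
}
@attributes = uniq(@attributes);

print "@attributes\n";
# gol old nan les wer ere inj ted sil ver exp ure
```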
| 1565 |  |  |  |  |  |  |  | 
| 1566 |  |  |  |  |  |  | #formats attributes for the ARFF file | 
| 1567 |  |  |  |  |  |  | # input  : @set    		    <-- the attribute set | 
| 1568 |  |  |  |  |  |  | # output : @attributes  	<-- the arff formatted attributes | 
| 1569 |  |  |  |  |  |  | sub makeAttrData{ | 
| 1570 | 0 |  |  | 0 | 0 |  | my $set_ref = shift; | 
| 1571 | 0 |  |  |  |  |  | my @set = @$set_ref; | 
| 1572 |  |  |  |  |  |  |  | 
| 1573 | 0 |  |  |  |  |  | my @attributes = (); | 
| 1574 | 0 |  |  |  |  |  | foreach my $attr (@set){ | 
| 1575 | 0 |  |  |  |  |  | push (@attributes, "\@ATTRIBUTE $attr NUMERIC"); | 
| 1576 |  |  |  |  |  |  | } | 
| 1577 |  |  |  |  |  |  |  | 
| 1578 | 0 |  |  |  |  |  | return @attributes; | 
| 1579 |  |  |  |  |  |  | } | 
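makeAttrData only emits the NUMERIC attribute declarations; for context, here is a hedged sketch of how such lines would sit in a complete ARFF header. The relation name, class attribute, and attribute names are illustrative and not taken from this module.

```perl
#!/usr/bin/perl
# Sketch of an ARFF header built around NUMERIC attribute declarations.
use strict;
use warnings;

my @attrs = qw(gold_self gold dose_self dose);
my @header;

push @header, "\@RELATION demo";
push @header, map { "\@ATTRIBUTE $_ NUMERIC" } @attrs;
push @header, "\@ATTRIBUTE entity {Yes,No}";   # class attribute
push @header, "\@DATA";

print "$_\n" for @header;
```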
| 1580 |  |  |  |  |  |  |  | 
| 1581 |  |  |  |  |  |  | ##builds a single stopword regular expression from a stoplist file of /regex/ entries | 
| 1582 |  |  |  |  |  |  | sub stop { | 
| 1583 |  |  |  |  |  |  |  | 
| 1584 | 0 |  |  | 0 | 0 |  | my $stopfile = shift; | 
| 1585 |  |  |  |  |  |  |  | 
| 1586 | 0 |  |  |  |  |  | my $stop_regex = ""; | 
| 1587 | 0 |  |  |  |  |  | my $stop_mode = "AND"; | 
| 1588 |  |  |  |  |  |  |  | 
| 1589 | 0 | 0 |  |  |  |  | open ( STP, $stopfile ) || | 
| 1590 |  |  |  |  |  |  | die ("Couldn't open the stoplist file $stopfile\n"); | 
| 1591 |  |  |  |  |  |  |  | 
| 1592 | 0 |  |  |  |  |  | while ( <STP> ) { | 
| 1593 | 0 |  |  |  |  |  | chomp; | 
| 1594 |  |  |  |  |  |  |  | 
| 1595 | 0 | 0 |  |  |  |  | if(/\@stop.mode\s*=\s*(\w+)\s*$/) { | 
| 1596 | 0 |  |  |  |  |  | $stop_mode=$1; | 
| 1597 | 0 | 0 |  |  |  |  | if(!($stop_mode=~/^(AND|and|OR|or)$/)) { | 
| 1598 | 0 |  |  |  |  |  | print STDERR "Requested Stop Mode $1 is not supported.\n"; | 
| 1599 | 0 |  |  |  |  |  | exit; | 
| 1600 |  |  |  |  |  |  | } | 
| 1601 | 0 |  |  |  |  |  | next; | 
| 1602 |  |  |  |  |  |  | } | 
| 1603 |  |  |  |  |  |  |  | 
| 1604 |  |  |  |  |  |  | # accepting Perl regexes from the stop file | 
| 1605 | 0 |  |  |  |  |  | s/^\s+//; | 
| 1606 | 0 |  |  |  |  |  | s/\s+$//; | 
| 1607 |  |  |  |  |  |  |  | 
| 1608 |  |  |  |  |  |  | #handle blank lines | 
| 1609 | 0 | 0 |  |  |  |  | if(/^\s*$/) { next; } | 
|  | 0 |  |  |  |  |  |  | 
| 1610 |  |  |  |  |  |  |  | 
| 1611 |  |  |  |  |  |  | #check if a valid Perl Regex | 
| 1612 | 0 | 0 |  |  |  |  | if(!(/^\//)) { | 
| 1613 | 0 |  |  |  |  |  | print STDERR "Stop token regular expression <$_> should start with '/'\n"; | 
| 1614 | 0 |  |  |  |  |  | exit; | 
| 1615 |  |  |  |  |  |  | } | 
| 1616 | 0 | 0 |  |  |  |  | if(!(/\/$/)) { | 
| 1617 | 0 |  |  |  |  |  | print STDERR "Stop token regular expression <$_> should end with '/'\n"; | 
| 1618 | 0 |  |  |  |  |  | exit; | 
| 1619 |  |  |  |  |  |  | } | 
| 1620 |  |  |  |  |  |  |  | 
| 1621 |  |  |  |  |  |  | #remove the slashes from the beginning and end | 
| 1622 | 0 |  |  |  |  |  | s/^\///; | 
| 1623 | 0 |  |  |  |  |  | s/\/$//; | 
| 1624 |  |  |  |  |  |  |  | 
| 1625 |  |  |  |  |  |  | #form a single big regex | 
| 1626 | 0 |  |  |  |  |  | $stop_regex.="(".$_.")|"; | 
| 1627 |  |  |  |  |  |  | } | 
| 1628 |  |  |  |  |  |  |  | 
| 1629 | 0 | 0 |  |  |  |  | if(length($stop_regex)<=0) { | 
| 1630 | 0 |  |  |  |  |  | print STDERR "No valid Perl Regular Expression found in Stop file $stopfile\n"; | 
| 1631 | 0 |  |  |  |  |  | exit; | 
| 1632 |  |  |  |  |  |  | } | 
| 1633 |  |  |  |  |  |  |  | 
| 1634 | 0 |  |  |  |  |  | chop $stop_regex; | 
| 1635 |  |  |  |  |  |  |  | 
| 1636 |  |  |  |  |  |  | # making AND a default stop mode | 
| 1637 | 0 | 0 |  |  |  |  | if(!defined $stop_mode) { | 
| 1638 | 0 |  |  |  |  |  | $stop_mode="AND"; | 
| 1639 |  |  |  |  |  |  | } | 
| 1640 |  |  |  |  |  |  |  | 
| 1641 | 0 |  |  |  |  |  | close STP; | 
| 1642 |  |  |  |  |  |  |  | 
| 1643 | 0 |  |  |  |  |  | return $stop_regex; | 
| 1644 |  |  |  |  |  |  | } | 
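The stop() routine strips the surrounding slashes from each /regex/ entry and ORs the pieces into one combined pattern, which the vectorization loop then matches against each word. A minimal sketch of that build-and-use cycle, with in-memory stoplist entries standing in for a real stop file:

```perl
#!/usr/bin/perl
# Sketch: build one combined stopword regex from /regex/ entries and skip matches.
use strict;
use warnings;

my @stoplist_lines = ('/^the$/', '/^of$/', '/^and$/');   # placeholder stoplist entries

my $stop_regex = "";
foreach (@stoplist_lines) {
    s/^\///;                          # strip leading slash
    s/\/$//;                          # strip trailing slash
    $stop_regex .= "(" . $_ . ")|";   # OR the pieces together
}
chop $stop_regex;                     # drop the trailing '|'

foreach my $word (qw(the gold and dose)) {
    next if $word =~ /$stop_regex/;   # skip stopwords, as the vector loop does
    print "keep: $word\n";            # keep: gold / keep: dose
}
```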
| 1645 |  |  |  |  |  |  |  | 
| 1646 |  |  |  |  |  |  | 1; |