#!/usr/bin/perl
# NanoB2B-NER::NER::Arffman
#
# Creates ARFF files from annotated files
# Version 1.9
#
# Program by Milk

package NanoB2B::NER::Arffman;

use NanoB2B::UniversalRoutines;
use MetaMap::DataStructures;
use File::Path qw(make_path);	#makes sub directories
use List::MoreUtils qw(uniq);

use strict;
use warnings;

#option variables
my $debug = 1;
my $program_dir = "";
my $fileIndex = 0;
my $stopwords_file;
my $prefix = 3;
my $suffix = 3;
my $bucketsNum = 10;
my $is_cui = 0;
my $sparse_matrix = 0;
my $wcs = "";

#datastructure object
my %params = ();
my $dataStructures = MetaMap::DataStructures->new(\%params);

#universal subroutines object
my %uniParams = ();
my $uniSub;

#other general global variables
my @allBuckets;
my %fileHash;
my %metamapHash;
my %featHash;
my %orthoHash;
my @features;
my $selfId = "_self";
my $entId = "_e";
my $morphID = "_m";

my $stopRegex;

#### A HERO IS BORN ####

# construction method to create a new Arffman object
# input  : $directory     <-- the name of the directory for the files
#          $name          <-- name of the file to examine
#          $features      <-- the list of features to use [e.g. "ortho morph text pos cui sem"]
#          $bucketsNum    <-- the number of buckets to use for k-fold cross validation
#          \$debug        <-- run the program with debug print statements
#          \$prefix       <-- the number of letters to look at the beginning of each word
#          \$suffix       <-- the number of letters to look at the end of each word
#          \$index        <-- the index to start metamapping from in the set of files
#          \$no_stopwords <-- exclude examining stop words [imported from the stop word list]
# output : $self          <-- an instance of the Arffman object
sub new {
	#grab class and parameters
	my $self = {};
	my $class = shift;
	return undef if(ref $class);
	my $params = shift;

	#reset all arrays and hashes
	@allBuckets = ();
	%fileHash = ();
	%metamapHash = ();
	%featHash = ();
	%orthoHash = ();
	@features = ();

	#bless this object - hallelujah
	bless $self, $class;
	$self->_init($params);
	@allBuckets = (1..$bucketsNum);

	#retrieve parameters for universal-routines
	$uniParams{'debug'} = $debug;
	$uniSub = NanoB2B::UniversalRoutines->new(\%uniParams);

	#return the object
	return $self;
}

# method to initialize the NanoB2B::NER::Arffman object.
# input  : $parameters <- reference to a hash
# output :
sub _init {
	my $self = shift;
	my $params = shift;

	$params = {} if(!defined $params);

	# get some of the parameters
	my $diroption = $params->{'directory'};
	my $ftsoption = $params->{'features'};
	my $bucketsNumoption = $params->{'bucketsNum'};
	my $debugoption = $params->{'debug'};
	my $prefixoption = $params->{'prefix'};
	my $suffixoption = $params->{'suffix'};
	my $indexoption = $params->{'index'};
	my $stopwordoption = $params->{'stopwords'};
	my $iscuioption = $params->{'is_cui'};
	my $sparsematrixoption = $params->{'sparse_matrix'};
	my $wcsoption = $params->{'wcs'};

	#set the global variables
	if(defined $debugoption){$debug = $debugoption;}
	if(defined $diroption){$program_dir = $diroption;}
	if(defined $indexoption){$fileIndex = $indexoption;}
	if(defined $stopwordoption){$stopwords_file = $stopwordoption;}
	if(defined $iscuioption){$is_cui = $iscuioption;}
	if(defined $sparsematrixoption){$sparse_matrix = $sparsematrixoption;}
	if(defined $prefixoption){$prefix = $prefixoption;}
	if(defined $suffixoption){$suffix = $suffixoption;}
	if(defined $wcsoption){$wcs = $wcsoption;}
	if(defined $bucketsNumoption){$bucketsNum = $bucketsNumoption;}
	if(defined $ftsoption){@features = split(' ', $ftsoption);}
}
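
# A minimal usage sketch (illustrative, not part of the module): the hash keys
# below mirror the options read in _init; the directory and file names are
# hypothetical placeholders.
#
#   use NanoB2B::NER::Arffman;
#   my $arffman = NanoB2B::NER::Arffman->new({
#       directory  => "nanob2b_data",                    # assumed path
#       features   => "ortho morph text pos cui sem",
#       bucketsNum => 10,
#       debug      => 1,
#   });
#   $arffman->arff_file("annotated_file_1");             # assumed file name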

####### ARFFMAN AND THE METHODS OF MADNESS #####


# opens a single file and runs it through the process of creating buckets
# extracting tokens and concepts, and creating arff files based on the features given
# input  : $file <-- the name of the file to make into arff files
# output : a set of arff files
sub arff_file{
	my $self = shift;
	my $file = shift;

	#define and reset temp var
	my $indexer = 0;
	%fileHash = ();
	%metamapHash = ();
	%featHash = ();
	%orthoHash = ();

	#get the name of the file
	my @n = split '/', $file;
	my $l = @n;
	my $filename = $n[$l - 1];
	$filename = lc($filename);

	my $FILE;
	open ($FILE, "$program_dir/$file") || die ("what is this '$program_dir/$filename' you speak of?\n$!");
	my @fileLines = <$FILE>;
	my @orthoLines = @fileLines;
	foreach my $l(@fileLines){
		$l = lc($l);
	}
	$uniSub->printColorDebug("on_red", "$filename");
	#$uniSub->printColorDebug("on_cyan", "*** $wcs ***");

	#get the total num of lines
	my $totalLines = 0;
	$totalLines = @fileLines;
	$uniSub->printColorDebug("red", "Lines: $totalLines\n");

	#clean it up for two separate sets
	my @tagSet = retagSet($filename, \@fileLines);
	my @cleanLines = untagSet($filename, \@fileLines);

	#get the orthographic based lines
	#my @orthoLines = ;
	@orthoLines = retagSetOrtho($file, \@orthoLines);

	#$uniSub->printColorDebug("red", "TAG SET: ");
	#$uniSub->printArr(", ", \@tagSet);

	####### ASSIGN THE VALUES TO HASHTABLES TO KEEP TRACK OF THEM #######

	#put all the lines in a file hash
	$uniSub->printColorDebug("blue", "*Putting all the file lines into a hashtable....\n");
	$indexer = 0;
	foreach my $line (@tagSet){
		$fileHash{$indexer} = $line;
		$indexer++;
	}

	#put the orthographic lines in a hash
	$indexer = 0;
	foreach my $line (@orthoLines){
		#$uniSub->printColorDebug("red", "$line\n");
		$orthoHash{$indexer} = $line;
		$indexer++;
	}

	#import the hashtables from saved data
	importMetaData($filename);

	#tokenize all the lines --> tokenhash
	$uniSub->printColorDebug("blue", "*Tokenizing the lines into a hashtable....\n");
	$indexer = 0;
	my $totalTokens = 0;
	my $totalConcepts = 0;
	foreach my $line (@cleanLines){
		$| = 1;
		$uniSub->printColorDebug("bright_blue", "\r" . "\t\tLine - $indexer/$totalLines");

		#acquire the necessary variables
		my $special_ID = "1.ti.1";
		my $meta = $metamapHash{$indexer};

		#create citation first
		$dataStructures->createFromTextWithId($meta, $special_ID);
		my $citation = $dataStructures->getCitationWithId($special_ID);

		#get tokens
		my @tokens = @{$citation->getOrderedTokens()};
		my $tnum = @tokens;
		$totalTokens += $tnum;

		#get concepts
		my @concepts = @{$citation->getOrderedConcepts()};
		my $cnum = @concepts;
		$totalConcepts += $cnum;

		#grab EVERYTHING

		#get tokens first
		foreach my $token (@tokens){

			#print "token ";
			#get text and pos
			my $super_text = lc($token->{text});

			foreach my $text (split(" ", $super_text)){	#split just in case its a compound word
				#check if already there
				if($featHash{$text}){
					next;
				}

				my $pos = $token->{posTag};
				$featHash{$text}->{pos} = $pos;
			}
		}
		#get concepts second
		foreach my $cc (@concepts){
			foreach my $concept (@{$cc}[0]){	#two levels for some strange reason
				#print "token ";
				my $super_text = lc($concept->{text});
				for my $text (split(" ", $super_text)){	#split just in case its a compound word
					#grab original hash
					my $hash_ref = $featHash{$text};
					if (!$hash_ref || $hash_ref->{cui} || $hash_ref->{sem}){
						next;
					}

					#get sem and cuis v_1
					#$hash_ref->{cui} = $concept->{cui};
					#$hash_ref->{sem} = $concept->{semanticTypes};

					#print "$text --> ";
					#$uniSub->printArr(", ", \@sem);

					#get sem and cuis v_2
					if($hash_ref->{cui}){
						$hash_ref->{cui} = ($hash_ref->{cui} . $concept->{cui});
					}else{
						$hash_ref->{cui} = $concept->{cui};
					}

					if($hash_ref->{sem}){
						$hash_ref->{sem} = ($hash_ref->{sem} . $concept->{semanticTypes});
					}else{
						$hash_ref->{sem} = $concept->{semanticTypes};
					}

					#print "$text --> ";
					#$uniSub->printArr(", ", \@sem);
				}
			}
		}

		#increment to the next set
		$indexer++;
	}

	#clean up everything
	foreach my $key(keys %featHash){
		my $hash_ref = $featHash{$key};
		my %hash = %$hash_ref;
		if($hash{cui}){
			$hash{cui} = join(',', uniq(split(',', $hash{cui})));
		}
		if($hash{sem}){
			$hash{sem} = join(',', uniq(split(',', $hash{sem})));
		}
	}

	#print it back
	print("\n");
	#hash_debug();

	$uniSub->printColorDebug("red", "TOKENS: $totalTokens\n");
	$uniSub->printColorDebug("red", "CONCEPTS: $totalConcepts\n");

	#exit;


	####### BUCKET SORTING - TRAIN AND TEST DATA #######

	#sort the lines to buckets
	my %buckets = ();
	if($bucketsNum > 1){
		$uniSub->printColorDebug("blue", "*Making buckets....\n");
		%buckets = sort2Buckets($totalLines, $bucketsNum);
	}else{
		push(@{$buckets{1}}, (0..($totalLines-1)));
	}

	#$uniSub->printArr(" ", $buckets{1});


	$uniSub->printColorDebug("blue", "*Making train and test files....\n");

	process_file($filename, \%buckets);

	$uniSub->printDebug("\n");

}

#print out the contents of the feature hash
sub hash_debug{
	foreach my $key (sort keys %featHash){
		my $text = $key;
		my $hash_ref = $featHash{$key};
		my %hash = %$hash_ref;

		my $pos = "";
		my $cui = "";
		my $sem = "";

		if($hash{pos}){
			$pos = $hash{pos};
		}
		if($hash{cui}){
			$cui = join(',', @{$hash{cui}});
		}
		if($hash{sem}){
			$sem = join(',', @{$hash{sem}});
		}

		print "$text - $pos - $cui - $sem\n";
	}
}


###################### LINE MANIPULATION #####################

###### RETAGS THE LINE ######

# turns the tagged entity words into special words with <> for the context words
# input  : $input <-- the line to retag
#          $id <-- the id within the tag to look for
# output : (.arff files)
sub retag{
	my $input = shift;
	my $id = shift;

	$id = lc($id);
	my $line = lc($input);

	#get rid of any tags
	my @words = split (" ", $line);
	my @newSet = ();
	my $charact = 0;
	foreach my $word (@words){
		#if word is currently in the annotation
		if($charact){
			if($word =~//){	#reached the end
				$charact = 0;
			}else{	#continue adding extension to the annotation set
				my $charWord = "$word"."$entId";
				push @newSet, $charWord;
			}
		}else{
			if($word =~//){	#start annotation set
				$charact = 1;
			}else{	#continue as normal
				push @newSet, $word;
			}
		}
	}

	#clean up the new line
	my $new_line = join " ", @newSet;
	$new_line =~s/\b$entId\b//g;
	$new_line = $uniSub->cleanWords($new_line);
	return $new_line;
}

# turns the tagged entity words in the entire file into special words with <> for the context words
# input  : $name <-- the name of the file to use as the id tag
#          @lines <-- the set of lines to retag
# output : @tagSet <-- set of retagged lines
sub retagSet{
	my $name = shift;
	my $lines_ref = shift;
	my @lines = @$lines_ref;

	my @tagSet = ();
	foreach my $line (@lines){
		#retag the line
		chomp($line);
		my $tag_line = retag($line, $name);

		#add it to the set
		push @tagSet, $tag_line;
	}
	return @tagSet;
}

#returns clean line with no tags or retaggings
# input  : $line <-- the line to untag
#        : $id <-- the id label to look for
# output : $input <-- untagged input line
sub untag{
	my $line = shift;
	my $id = shift;

	#remove and
	my $input = lc($line);
	$id = lc($id);
	$input =~ s///g;
	$input =~ s///g;
	$input = $uniSub->cleanWords($input);
	return $input;
}
#returns a clean set of lines
# input  : $filename <-- the name of the file for use in the id tag
#        : @lines <-- the set of lines to untag
# output : @clean_set <-- untagged set of lines
sub untagSet{
	my $filename = shift;
	my $lines_ref = shift;
	my @lines = @$lines_ref;

	my @clean_set = ();
	#run untagger on each line
	foreach my $line(@lines){
		my $cl = untag($line, $filename);
		push @clean_set, $cl;
	}
	return @clean_set;
}


#import metamap hashtable data
# input  : $name <-- the name of the file to import from
# output : (hashmap of metamap lines)
sub importMetaData{
	my $name = shift;

	#create a directory to save hashtable data
	my $META;
	my $subdir = "_METAMAPS";
	open($META, "<", ("$program_dir/$subdir/" . $name . "_meta")) || die ("Metamap file for " . $name . " not found! Check the _METAMAP folder in the file directory\n$!");

	#import metamap data from the file
	my @metaLines = <$META>;
	my $metaCombo = join("", @metaLines);
	my @newMetaLines = split("\n\n", $metaCombo);
	my $t = @newMetaLines;
	$uniSub->printColorDebug("red", "META LINES: $t\n");
	my $key = 0;
	#assign each metamap line to a hash indexer
	foreach my $mm (@newMetaLines){
		$metamapHash{$key} = $mm;
		$key++;
	}
	close $META;
}

##### FOR THE ORTHO SET #####

#turns the tagged entity words into special words with <> for the context words
# input  : $input <-- the line to retag
# output : $new_line <-- the retagged line
sub retagOrtho{
	my $filename = shift;
	my $input = shift;
	my $line = $input;

	#get rid of any tags
	my @words = split (" ", $line);
	my @newSet = ();
	my $charact = 0;
	foreach my $word (@words){
		if($charact){	#currently annotating
			if($word =~//){	#stop annotation set
				$charact = 0;
			}else{
				my $charWord = "$word"."$entId";	#add extension to annotation word
				push @newSet, $charWord;
			}
		}else{
			if($word =~//){	#start annotating
				$charact = 1;
			}else{	#continue as normal
				push @newSet, $word;
			}
		}
	}

	#clean up the new line
	my $new_line = join " ", @newSet;
	$new_line =~s/\s$entId\b//g;
	$new_line = noASCIIOrtho($new_line);
	return $new_line;
}
#turns the tagged entity words in the entire file into special words with <> for the context words
# input  : @lines <-- the set of lines to retag
# output : @tagSet <-- the retagged line
sub retagSetOrtho{
	my $file = shift;
	my $lines_ref = shift;
	my @lines = @$lines_ref;

	my @n = split '/', $file;
	my $l = @n;
	my $filename = $n[$l - 1];

	my @tagSet = ();
	foreach my $line (@lines){
		#retag the line
		chomp($line);
		my $tag_line = retagOrtho($filename, $line);

		#add it to the set
		push @tagSet, $tag_line;
	}
	return @tagSet;
}

#cleans the line without getting rid of tags
# input  : $line <-- line to clean up
# output : $new_in <-- the cleaned line
sub noASCIIOrtho{
	my $line = shift;

	my $new_in = $line;
	$new_in =~ s/[^[:ascii:]]//g;	#remove any words that do not contain ASCII characters
	return $new_in;
}


####################### TOKENS AND CONCEPT MANIPULATION #######################


#gets rid of any special tokens
# input  : $text <-- the token text to fix
# output : $tokenText <-- a cleaned up token
sub cleanToken{
	my $text = shift;

	my $tokenText = $text;

	#fix "# . #" tokens, fix "__ \' __" tokens, remove any non word based characters
	$tokenText =~s/\s\.\s/\./og;
	$tokenText =~s/\s\\\'\s//og;
	$tokenText =~s/[^a-zA-Z0-9]//og;

	return $tokenText;
}

#retrieves the feature for a single word
# input  : $word <-- the word to extract the features from
#        : $type <-- what type of feature to extract [e.g. "pos", "sem", "cui"]
# output : if "pos" <-- a scalar part-of-speech value
#        : else <-- an array of semantic or cui values (a single text value can have more than one of these)
sub getFeature{
	my $orig_word = shift;
	my $type = shift;

	#clean up the word
	my $word = lc($orig_word);
	$word =~s/[^a-zA-Z0-9\s]//;

	if(!$featHash{$word}){
		return "";
	}

	my $hash_ref = $featHash{$word};
	my %hash = %$hash_ref;

	if($hash{$type}){
		return $hash{$type};
	}
	return "";
}

###################### BUCKETS - TRAIN AND TEST ARFF FILES #####################


#sorts the keys from the hashmaps into buckets so that certain values can be accessed
# input  : $keyAmt <-- the number of lines or "keys" to divvy up into the buckets
#        : $bucketNum <-- how many buckets to use
# output : %bucketList <-- the set of buckets with keys in them
sub sort2Buckets{
	my $keyAmt = shift;
	my $bucketNum = shift;

	#create sets
	my @keySet = (0..$keyAmt - 1);	#set of keys
	my %bucketList = ();	#all of the buckets

	#add some buckets to the bucket list
	for(my $a = 1; $a <= $bucketNum; $a++){
		$bucketList{$a} = [];
	}

	#sort the lines into buckets
	my $bucketId = 1;
	foreach my $key (@keySet){
		push (@{$bucketList{$bucketId}}, $key);	#add the line to the bucket

		#reset the id if at the max value
		if($bucketId == $bucketNum){
			$bucketId = 1;
		}else{
			$bucketId++;
		}
	}

	#return the list of buckets
	return %bucketList;
}
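
# Worked example (illustrative): with $keyAmt = 7 and $bucketNum = 3 the keys
# 0..6 are dealt out round-robin, so sort2Buckets(7, 3) returns
#   1 => [0, 3, 6],  2 => [1, 4],  3 => [2, 5]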

###################### ARFF STUFF #####################
#makes arff files for ortho, morpho, text, pos, cui, and sem attributes

# Processes the file by retrieving attributes, making vectors, and splitting into buckets
# (formerly known as 'zhu li!! Do the thing!!')
# input  : $name <-- the name of the file
#        : %bucketList <-- the set of buckets with keys in them
# output : (n arff files; n = # of buckets x (train and test) x # of features being used)
sub process_file{
	my $name = shift;
	my $bucketList_ref = shift;
	my %buckets = %$bucketList_ref;

	#grab the attributes
	my %attrSets = ();
	$uniSub->printColorDebug("bold green", "Retrieving attributes...\n");
	foreach my $item(@features){
		$uniSub->printColorDebug("bright_green", "\t$item attr\n");
		my %setOfAttr = grabAttr($name, $item, \%buckets);
		$attrSets{$item} = \%setOfAttr;	#gets both the vector and arff based attributes
	}

	#contain the stop words regular expressions if the parameter was defined
	if(defined $stopwords_file){
		$stopRegex = stop($stopwords_file);
	}

	#let's make some vectors!
	$uniSub->printColorDebug("bold yellow", "Making Vectors...\n-------------------\n");
	my @curFeatSet = ();
	my $abbrev = "";

	#run based on wcs
	my $wcs_bucket;
	my $wcs_feature;
	my $wcs_found = 0;
	if($wcs){
		my @wcs_parts = split("-", $wcs);
		$wcs_feature = $wcs_parts[1];
		$wcs_bucket = $wcs_parts[0];
	}


	#iteratively add on the features [e.g. o, om, omt, omtp, omtpc, omtpcs]
	foreach my $feature (@features){
		$uniSub->printColorDebug("yellow", "** $feature ** \n");
		push(@curFeatSet, $feature);
		$abbrev .= substr($feature, 0, 1);	#add to abbreviations for the name

		#$uniSub->printColorDebug("on_red", "$wcs - $wcs_found - $abbrev vs. $wcs_feature");
		if(($wcs) && (!$wcs_found) && ($abbrev ne $wcs_feature)){
			print("**SKIP** \n");
			next;
		}

		#go through each bucket
		foreach my $bucket (sort keys %buckets){
			#if wcs parameter defined - skip these buckets until the specified bucket and feature is reached
			if(($wcs) && (!$wcs_found) && ($bucket != $wcs_bucket)){
				print("\t**SKIP**\n");
				next;
			}else{
				$wcs_found = 1;
			}

			#make train-test bucket indexes
			my @range = ();
			if($bucketsNum > 1){
				@range = $uniSub->bully($bucketsNum, $bucket);
			}else{
				@range = (1);
			}

			$uniSub->printColorDebug("on_green", "BUCKET #$bucket");
			#retrieve the vector attributes to use
			my %vecAttrSet = ();
			foreach my $curItem(@curFeatSet){
				if($curItem eq "ortho"){
					$vecAttrSet{$curItem} = ();
				}else{
					#get outer layer (tpcs)
					my $a_ref = $attrSets{$curItem};
					my %a = %$a_ref;

					#get inner layer (vector)
					my $b_ref = $a{vector};
					my %b = %$b_ref;

					#foreach my $key (sort keys %b){print "$key\n";}

					#finally get the bucket layer (1..$bucketNum) based on range
					my $c_ref = $b{$bucket};
					my @c = @$c_ref;
					$vecAttrSet{$curItem} = \@c;
				}
			}

			### TRAIN ###
			$uniSub->printColorDebug("bold blue", "\ttraining...\n");
			#retrieve the lines to use
			my @lineSetTrain = ();
			my @bucketSetTrain = ();
			foreach my $num (@range){push(@bucketSetTrain, @{$buckets{$num}});}
			foreach my $key (@bucketSetTrain){push(@lineSetTrain, $orthoHash{$key});}

			#make the vector
			my @vectorSetTrain = vectorMaker(\@lineSetTrain, \@curFeatSet, \%vecAttrSet);
			$uniSub->printDebug("\n");

			### TEST ###
			my @vectorSetTest = ();
			if($bucketsNum > 1){	#skip this if only 1 bucket being used (train bucket)
				$uniSub->printColorDebug("bold magenta", "\ttesting...\n");
				#retrieve the lines to use
				my @lineSetTest = ();
				my @bucketSetTest = ();
				push(@bucketSetTest, @{$buckets{$bucket}});
				foreach my $key (@bucketSetTest){push(@lineSetTest, $orthoHash{$key});}

				#make the vector
				@vectorSetTest = vectorMaker(\@lineSetTest, \@curFeatSet, \%vecAttrSet);
				$uniSub->printDebug("\n");
			}


			### ARFF ###
			#retrieve the arff attributes to use
			my @arffAttrSet = ();
			foreach my $curItem(@curFeatSet){
				if($curItem eq "ortho"){
					#get outer layer (ortho)
					my $a_ref = $attrSets{$curItem};
					my %a = %$a_ref;
					#get the values from ortho
					push(@arffAttrSet, @{$a{arff}});
				}else{
					#get outer layer (mtpcs)
					my $a_ref = $attrSets{$curItem};
					my %a = %$a_ref;

					#get inner layer (arff)
					my $b_ref = $a{arff};
					my %b = %$b_ref;

					#finally get the bucket layer (1..$bucketNum) based on range
					my $c_ref = $b{$bucket};
					my @c = @$c_ref;
					push(@arffAttrSet, @c);
				}
			}

			#create the arff files for the test and train features
			$uniSub->printColorDebug("bright_yellow", "\tmaking arff files...\n");
			$uniSub->printColorDebug("bright_red", "\t\tARFF TRAIN\n");
			createARFF($name, $bucket, $abbrev, "train", \@arffAttrSet, \@vectorSetTrain);
			if($bucketsNum > 1){
				$uniSub->printColorDebug("bright_red", "\t\tARFF TEST\n");
				createARFF($name, $bucket, $abbrev, "test", \@arffAttrSet, \@vectorSetTest);
			}

		}
	}
}


#create the arff file
# input  : $name <-- the name of the file
#        : $bucket <-- the index of the bucket you're testing [e.g. bucket #1]
#        : $abbrev <-- the abbreviation label for the set of features
#        : $type <-- train or test ARFF?
#        : @attrARFFSet <-- the set of attributes exclusively for printing to the arff file
#        : @vecSec <-- the set of vectors created
# output : (an arff file)
sub createARFF{
	my $name = shift;
	my $bucket = shift;
	my $abbrev = shift;
	my $type = shift;
	my $attr_ref = shift;
	my $vec_ref = shift;

	my $typeDir = "_$type";
	my $ARFF;
	#print to files
	$uniSub->printColorDebug("bold cyan", "\t\tcreating $name/$abbrev - BUCKET #$bucket $type ARFF...\n");
	if($program_dir ne ""){
		my $subdir = "_ARFF";
		my $arffdir = $name . "_ARFF";
		my $featdir = "_$abbrev";
		make_path("$program_dir/$subdir/$arffdir/$featdir/$typeDir");
		open($ARFF, ">", ("$program_dir/$subdir/$arffdir/$featdir/$typeDir/" . $name . "_$type-" . $bucket .".arff")) || die ("Cannot write ARFF file to the directory! Check permissions!\n$!");
	}else{
		my $arffdir = $name . "_ARFF";
		my $featdir = "_$abbrev";
		make_path("$arffdir/$featdir/$typeDir");
		open($ARFF, ">", ("$arffdir/$featdir/$typeDir/" . $name . "_$type-" . $bucket .".arff")) || die ("Cannot write ARFF file to the directory! Check permissions!\n$!");
	}

	#get the attr and vector set
	my @attrARFFSet = @$attr_ref;
	my @vecSet = @$vec_ref;

	#get format for the file
	my $relation = "\@RELATION $name";
	my @printAttr = makeAttrData(\@attrARFFSet);
	my $entity = "\@ATTRIBUTE Entity {No, Yes}";	#set if the entity word or not
	my $data = "\@DATA";

	#print everything to the file
	$uniSub->printDebug("\t\tprinting to file...\n");
	$uniSub->print2File($ARFF, $relation);
	foreach my $a(@printAttr){$uniSub->print2File($ARFF, $a);}
	$uniSub->print2File($ARFF, $entity);
	$uniSub->print2File($ARFF, $data);
	foreach my $d(@vecSet){$uniSub->print2File($ARFF, $d);}
	close $ARFF;
}
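
# Sketch of the resulting file layout (illustrative; the attribute declarations
# in the middle are whatever makeAttrData produces for the chosen features):
#
#   @RELATION <name>
#   ...attribute lines from makeAttrData...
#   @ATTRIBUTE Entity {No, Yes}
#   @DATA
#   ...one comma-separated vector per word, as built by vectorMaker...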

###################### VECTOR THINGIES #####################


#makes vectors from a set
# input  : @txtLineSet <-- the retagged text lines to make vectors out of
#        : @featureList <-- the list of features to make the vectors out of [e.g. (ortho, morph, text)]
#        : @attrs <-- the attributes to use to make the vectors
# output : @setVectors <-- the vectors for each word in all of the lines
sub vectorMaker{
	my $set_ref = shift;
	my $feat_ref = shift;
	my $attrib_ref = shift;
	my @txtLineSet = @$set_ref;
	my @featureList = @$feat_ref;
	my %attrs = %$attrib_ref;

	my @setVectors = ();
	#go through each line of the set
	my $setLen = @txtLineSet;

	for(my $l = 0; $l < $setLen; $l++){
		my $line = $txtLineSet[$l];
		#$uniSub->printColorDebug("on_red", $line);
		my @words = split(' ', $line);
		#$uniSub->printArr(", ", \@words);
		#print "\n";
		my $wordLen = @words;

		#go through each word
		for(my $a = 0; $a < $wordLen; $a++){

			$| = 1;

			my $wordOrig = $words[$a];
			#make the words for comparison
			my $word = $words[$a];
			my $prevWord = "";
			my $nextWord = "";

			#show progress
			my $l2 = $l + 1;
			my $a2 = $a + 1;
			$uniSub->printDebug("\r" . "\t\tLine - $l2/$setLen ------ Word - $a2/$wordLen ---- ");

			#cut-off a word in the display if it is too long
			#(longer than 8 characters)
			my $smlword = substr($word, 0, 8);
			if(length($word) > 8){
				$smlword .= "...";
			}

			#distinguish entity words from normal words
			if($word =~/$entId/o){
				$uniSub->printColorDebug("red", "$smlword! ");
			}else{
				$uniSub->printDebug("$smlword! ");
			}

			my @word_cuis = getFeature($word, "cui");
			my $ncui = $word_cuis[0];
			#$uniSub->printColorDebug("red", "\n\t\t$word - $ncui\n");

			#check if it's a stopword
			if(($stopwords_file and $word=~/$stopRegex/o) || ($is_cui and $word_cuis[0] eq "")){
				#$uniSub->printColorDebug("on_red", "\t$word\tSKIP!");
				if(!($word =~/[a-zA-Z0-9\_\-]+$entId/)){
					next;
				}
			}

			#if a weird character word - skip
			if((length($word) eq 1) and ($word =~/[^a-zA-Z0-9\_\-]/)){
				next;
			}

			#get the word before and after the current word
			if($a > 0){$prevWord = $words[$a - 1];}
			if($a < ($wordLen - 1)){$nextWord = $words[$a + 1];}

			#get rid of tag if necessary
			$prevWord =~s/$entId//og;
			$nextWord =~s/$entId//og;
			$word =~s/$entId//og;

			my $vec = "";
			#use each set of attributes
			foreach my $item(@featureList){
				my $addVec = "";
				if($item eq "ortho"){$addVec = orthoVec($word);}
				elsif($item eq "morph"){$addVec = morphVec($word, \@{$attrs{"morph"}});}
				elsif($item eq "text"){$addVec = textVec($word, $prevWord, $nextWord, \@{$attrs{"text"}});}
				elsif($item eq "pos"){$addVec = posVec($word, $prevWord, $nextWord, \@{$attrs{"pos"}});}
				elsif($item eq "cui"){$addVec = cuiVec($word, $prevWord, $nextWord, \@{$attrs{"cui"}});}
				elsif($item eq "sem"){$addVec = semVec($word, $prevWord, $nextWord, \@{$attrs{"sem"}});}

				$vec .= $addVec;

			}

			#convert binary to sparse if specified
			if($sparse_matrix){
				$vec = convert2Sparse($vec);
				#$uniSub->printColorDebug("red", "$vec\n");
			}

			#check if the word is an entity or not
			#$uniSub->printColorDebug("red", "\n$wordOrig\n");
			$vec .= (($wordOrig =~/\b[\S]+($entId)\b/) ? "Yes " : "No ");

			#close it if using sparse matrix
			if($sparse_matrix){
				$vec .= "}";
			}

			#finally add the word back and add the entire vector to the set
			$vec .= "\%$word";
			if($word ne ""){
				push(@setVectors, $vec);
			}
		}
	}

	return @setVectors;
}

#makes the orthographic based part of the vector
# input  : $word <-- the word to analyze
# output : $strVec <-- the orthographic vector string
sub orthoVec{
	my $word = shift;

	## CHECKS ##
	my $strVec = "";
	my $addon = "";

	#check if first letter capital
	$addon = ($word =~ /\b([A-Z])\w+\b/o ? 1 : 0);
	$strVec .= "$addon, ";

	#check if a single letter word
	$addon = (length($word) == 1 ? 1 : 0);
	$strVec .= "$addon, ";

	#check if all capital letters
	$addon = ($word =~ /\b[A-Z]+\b/o ? 1 : 0);
	$strVec .= "$addon, ";

	#check if contains a digit
	$addon = ($word =~ /[0-9]+/o ? 1 : 0);
	$strVec .= "$addon, ";

	#check if all digits
	$addon = ($word =~ /\b[0-9]+\b/o ? 1 : 0);
	$strVec .= "$addon, ";

	#check if contains a hyphen
	$addon = ($word =~ /-/o ? 1 : 0);
	$strVec .= "$addon, ";

	#check if contains punctuation
	$addon = ($word =~ /[^a-zA-Z0-9\s]/o ? 1 : 0);
	$strVec .= "$addon, ";

	return $strVec;
}
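
# Worked example (illustrative): orthoVec("DNA") runs the seven checks above in
# order -- first-letter-capital and all-capital match, the rest do not -- and
# returns "1, 0, 1, 0, 0, 0, 0, ".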

#makes the morphological based part of the vector
# input  : $word <-- the word to analyze
#        : @attrs <-- the set of morphological attributes to use
# output : $strVec <-- the morphological vector string
sub morphVec{
	my $word = shift;
	my $attrs_ref = shift;
	my @attrs = @$attrs_ref;

	my $strVec = "";

	#grab the first # characters and the last # characters
	my $preWord = substr($word, 0, $prefix);
	my $sufWord = substr($word, -$suffix);

	#compare and build a binary vector
	foreach my $a (@attrs){
		if($a eq $preWord){
			$strVec .= "1, ";
		}elsif($a eq $sufWord){
			$strVec .= "1, ";
		}else{
			$strVec .= "0, ";
		}
	}

	return $strVec;

}
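
# Worked example (illustrative): with the default $prefix = 3 and $suffix = 3,
# "protein" gives $preWord = "pro" and $sufWord = "ein", so against the
# attribute list ("pro", "ein", "nan") morphVec returns "1, 1, 0, ".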

#makes the text based part of the vector
# input  : $w <-- the word to analyze
#        : $pw <-- the previous word
#        : $nw <-- the next word
#        : @attrbts <-- the set of text attributes to use
# output : $strVec <-- the text vector string
sub textVec{
	my $w = shift;
	my $pw = shift;
	my $nw = shift;
	my $at_ref = shift;
	my @attrbts = @$at_ref;

	my $strVec = "";

	#clean the words
	$w = $uniSub->cleanWords($w);
	$pw = $uniSub->cleanWords($pw);
	$nw = $uniSub->cleanWords($nw);

	#check if the word is the attribute or the words adjacent it are the attribute
	foreach my $a(@attrbts){

		my $pair = "";
		$pair .= ($w eq $a ? "1, " : "0, ");
		$pair .= (($pw eq $a or $nw eq $a) ? "1, " : "0, ");
		$strVec .= $pair;
	}

	return $strVec;
}
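
# Worked example (illustrative, assuming cleanWords leaves these plain words
# unchanged): for $w = "gold", $pw = "the", $nw = "particle" and attributes
# ("gold", "particle") the pairs are "1, 0, " (the word itself matches, its
# neighbours do not) and "0, 1, " (only the next word matches), so textVec
# returns "1, 0, 0, 1, ".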
1090
|
|
|
|
|
|
|
|
1091
|
|
|
|
|
|
|
#makes the part of speech based part of the vector |
1092
|
|
|
|
|
|
|
# input : $w <-- the word to analyze |
1093
|
|
|
|
|
|
|
# : $pw <-- the previous word |
1094
|
|
|
|
|
|
|
# : $nw <-- the next word |
1095
|
|
|
|
|
|
|
# : @attrbts <-- the set of pos attributes to use |
1096
|
|
|
|
|
|
|
# output : $strVec <-- the pos vector string |
1097
|
|
|
|
|
|
|
sub posVec{ |
1098
|
0
|
|
|
0
|
0
|
|
my $w = shift; |
1099
|
0
|
|
|
|
|
|
my $pw = shift; |
1100
|
0
|
|
|
|
|
|
my $nw = shift; |
1101
|
0
|
|
|
|
|
|
my $at_ref = shift; |
1102
|
0
|
|
|
|
|
|
my @attrbts = @$at_ref; |
1103
|
|
|
|
|
|
|
|
1104
|
|
|
|
|
|
|
#clean the words |
1105
|
0
|
|
|
|
|
|
$w = $uniSub->cleanWords($w); |
1106
|
0
|
|
|
|
|
|
$pw = $uniSub->cleanWords($pw); |
1107
|
0
|
|
|
|
|
|
$nw = $uniSub->cleanWords($nw); |
1108
|
|
|
|
|
|
|
|
1109
|
|
|
|
|
|
|
#alter the words to make them pos types |
1110
|
0
|
|
|
|
|
|
$w = getFeature($w, "pos"); |
1111
|
0
|
|
|
|
|
|
$pw = getFeature($pw, "pos"); |
1112
|
0
|
|
|
|
|
|
$nw = getFeature($nw, "pos"); |
1113
|
|
|
|
|
|
|
|
1114
|
0
|
|
|
|
|
|
my $strVec = ""; |
1115
|
|
|
|
|
|
|
|
1116
|
|
|
|
|
|
|
#check if the word is the attribute or the words adjacent it are the attribute |
1117
|
0
|
|
|
|
|
|
foreach my $a(@attrbts){ |
1118
|
0
|
|
|
|
|
|
my $pair = ""; |
1119
|
0
|
0
|
|
|
|
|
$pair .= ($w eq $a ? "1, " : "0, "); |
1120
|
0
|
0
|
0
|
|
|
|
$pair .= (($pw eq $a or $nw eq $a) ? "1, " : "0, "); |
1121
|
0
|
|
|
|
|
|
$strVec .= $pair; |
1122
|
|
|
|
|
|
|
} |
1123
|
|
|
|
|
|
|
|
1124
|
0
|
|
|
|
|
|
return $strVec; |
1125
|
|
|
|
|
|
|
} |
1126
|
|
|
|
|
|
|
|
1127
|
|
|
|
|
|
|
#makes the cui based part of the vector |
1128
|
|
|
|
|
|
|
# input : $w <-- the word to analyze |
1129
|
|
|
|
|
|
|
# : $pw <-- the previous word |
1130
|
|
|
|
|
|
|
# : $nw <-- the next word |
1131
|
|
|
|
|
|
|
# : @attrbts <-- the set of cui attributes to use |
1132
|
|
|
|
|
|
|
# output : $strVec <-- the cui vector string |
1133
|
|
|
|
|
|
|
sub cuiVec{ |
1134
|
0
|
|
|
0
|
0
|
|
my $w = shift; |
1135
|
0
|
|
|
|
|
|
my $pw = shift; |
1136
|
0
|
|
|
|
|
|
my $nw = shift; |
1137
|
0
|
|
|
|
|
|
my $at_ref = shift; |
1138
|
0
|
|
|
|
|
|
my @attrbts = @$at_ref; |
1139
|
|
|
|
|
|
|
|
1140
|
|
|
|
|
|
|
#clean the words |
1141
|
0
|
|
|
|
|
|
$w = $uniSub->cleanWords($w); |
1142
|
0
|
|
|
|
|
|
$pw = $uniSub->cleanWords($pw); |
1143
|
0
|
|
|
|
|
|
$nw = $uniSub->cleanWords($nw); |
1144
|
|
|
|
|
|
|
|
1145
|
|
|
|
|
|
|
#alter the words to make them cui types |
1146
|
0
|
|
|
|
|
|
my @wArr = getFeature($w, "cui"); |
1147
|
0
|
|
|
|
|
|
my @pwArr = getFeature($pw, "cui"); |
1148
|
0
|
|
|
|
|
|
my @nwArr = getFeature($nw, "cui"); |
1149
|
|
|
|
|
|
|
|
1150
|
0
|
|
|
|
|
|
my $strVec = ""; |
1151
|
|
|
|
|
|
|
#check if the word is the attribute or the words adjacent it are the attribute |
1152
|
0
|
|
|
|
|
|
foreach my $a(@attrbts){ |
1153
|
0
|
|
|
|
|
|
my $pair = ""; |
1154
|
0
|
0
|
|
|
|
|
$pair .= ($uniSub->inArr($a, \@wArr) ? "1, " : "0, "); |
1155
|
0
|
0
|
0
|
|
|
|
$pair .= (($uniSub->inArr($a, \@pwArr) or $uniSub->inArr($a, \@nwArr)) ? "1, " : "0, "); |
1156
|
0
|
|
|
|
|
|
$strVec .= $pair; |
1157
|
|
|
|
|
|
|
} |
1158
|
|
|
|
|
|
|
|
1159
|
0
|
|
|
|
|
|
return $strVec; |
1160
|
|
|
|
|
|
|
} |
1161
|
|
|
|
|
|
|
|
1162
|
|
|
|
|
|
|
#makes the semantic based part of the vector |
1163
|
|
|
|
|
|
|
# input : $w <-- the word to analyze |
1164
|
|
|
|
|
|
|
# : $pw <-- the previous word |
1165
|
|
|
|
|
|
|
# : $nw <-- the next word |
1166
|
|
|
|
|
|
|
# : @attrbts <-- the set of sem attributes to use |
1167
|
|
|
|
|
|
|
# output : $strVec <-- the sem vector string |
1168
|
|
|
|
|
|
|
sub semVec{ |
1169
|
0
|
|
|
0
|
0
|
|
my $w = shift; |
1170
|
0
|
|
|
|
|
|
my $pw = shift; |
1171
|
0
|
|
|
|
|
|
my $nw = shift; |
1172
|
0
|
|
|
|
|
|
my $at_ref = shift; |
1173
|
0
|
|
|
|
|
|
my @attrbts = @$at_ref; |
1174
|
|
|
|
|
|
|
|
1175
|
|
|
|
|
|
|
#clean the words |
1176
|
0
|
|
|
|
|
|
$w = $uniSub->cleanWords($w); |
1177
|
0
|
|
|
|
|
|
$pw = $uniSub->cleanWords($pw); |
1178
|
0
|
|
|
|
|
|
$nw = $uniSub->cleanWords($nw); |
1179
|
|
|
|
|
|
|
|
1180
|
|
|
|
|
|
|
#alter the words to make them sem types |
1181
|
0
|
|
|
|
|
|
my @wArr = getFeature($w, "sem"); |
1182
|
0
|
|
|
|
|
|
my @pwArr = getFeature($pw, "sem"); |
1183
|
0
|
|
|
|
|
|
my @nwArr = getFeature($nw, "sem"); |
1184
|
|
|
|
|
|
|
|
1185
|
0
|
|
|
|
|
|
my $strVec = ""; |
1186
|
|
|
|
|
|
|
|
1187
|
|
|
|
|
|
|
#check if the word is the attribute or the words adjacent it are the attribute |
1188
|
0
|
|
|
|
|
|
foreach my $a(@attrbts){ |
1189
|
|
|
|
|
|
|
#remove "sem" label |
1190
|
0
|
|
|
|
|
|
$a = lc($a); |
1191
|
|
|
|
|
|
|
|
1192
|
0
|
|
|
|
|
|
my $pair = ""; |
1193
|
0
|
0
|
|
|
|
|
$pair .= ($uniSub->inArr($a, \@wArr) ? "1, " : "0, "); |
1194
|
0
|
0
|
0
|
|
|
|
$pair .= (($uniSub->inArr($a, \@pwArr) or $uniSub->inArr($a, \@nwArr)) ? "1, " : "0, "); |
1195
|
0
|
|
|
|
|
|
$strVec .= $pair; |
1196
|
|
|
|
|
|
|
} |
1197
|
0
|
|
|
|
|
|
return $strVec; |
1198
|
|
|
|
|
|
|
} |
#converts a binary vector to a sparse vector
sub convert2Sparse{
    my $bin_vec = shift;
    my @vals = split(", ", $bin_vec);
    my $numVals = @vals;

    my $sparse_vec = "{";
    for(my $c=0;$c<$numVals;$c++){
        my $curVal = $vals[$c];

        #if a non-zero value is found at the index - add it to the final
        if(($curVal eq "1")){
            $sparse_vec .= "$c $curVal, ";
            #$uniSub->printColorDebug("red", "$c $curVal, ");
        }
    }
    $sparse_vec .= "$numVals, ";

    return $sparse_vec;
}
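# Worked example (hypothetical input): only the indices holding a "1" are
# kept, in WEKA-style "{index value, ...}" sparse notation.
#
#   my $sparse = convert2Sparse("1, 0, 0, 1");
#   # @vals = ("1", "0", "0", "1"), so $sparse eq "{0 1, 3 1, 4, "
#   # (the closing brace and class label are presumably appended by the caller
#   #  elsewhere in the module)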
###################### ATTRIBUTE BASED METHODS #####################

#gets the attributes based on the item
# input : $name <-- the name of the file being processed (used for debug output)
#       : $feature <-- the feature type [e.g. ortho, morph, text]
#       : %buckets <-- the bucket key set
# output : %vecARFFattr <-- the vector set of attributes and arff set of attributes
sub grabAttr{
    my $name = shift;
    my $feature = shift;
    my $buckets_ref = shift;
    my %buckets = %$buckets_ref;

    my %vecARFFattr = ();
    #no importing of attributes is needed for ortho since they all check the same features
    if($feature eq "ortho"){
        my @vecSet = ();
        my @arffSet = ("first_letter_capital",    #output for the ARFF file
                       "single_character",
                       "all_capital",
                       "has_digit",
                       "all_digit",
                       "has_hyphen",
                       "has_punctuation");
        $vecARFFattr{vector} = \@vecSet;
        $vecARFFattr{arff} = \@arffSet;
        return %vecARFFattr;
    }
    #the morphological attributes look at the prefix and suffix of a word not the adjacencies
    elsif($feature eq "morph"){
        my %bucketAttr = ();
        my %bucketAttrARFF = ();

        #get the attributes for each bucket
        foreach my $testBucket (@allBuckets){
            my @range = ();
            if($bucketsNum > 1){
                @range = $uniSub->bully($bucketsNum, $testBucket);
            }else{
                @range = (1);
            }
            $uniSub->printDebug("\t\t$name BUCKET #$testBucket/$feature MORPHO attributes...\n");

            #get attributes [ unique and deluxe ]
            my @attr = getMorphoAttributes(\@range, \%buckets);
            @attr = uniq(@attr);    #make unique forms
            $bucketAttr{$testBucket} = \@attr;

            my @attrARFF = @attr;
            foreach my $a(@attrARFF){$a .= $morphID;}
            $bucketAttrARFF{$testBucket} = \@attrARFF;
        }

        #add to overall
        $vecARFFattr{vector} = \%bucketAttr;
        $vecARFFattr{arff} = \%bucketAttrARFF;

        return %vecARFFattr;
    }
    #text, part-of-speech, semantics, cui attributes
    else{
        my %bucketAttr = ();
        my %bucketAttrARFF = ();

        #get the attributes for each bucket
        foreach my $testBucket (@allBuckets){
            my @range = ();
            if($bucketsNum > 1){
                @range = $uniSub->bully($bucketsNum, $testBucket);
            }else{
                @range = (1);
            }
            $uniSub->printDebug("\t\t$name BUCKET #$testBucket/$feature attributes...\n");

            #get attributes [ unique and deluxe ]
            my @attr = getRangeAttributes($feature, \@range, \%buckets);
            $bucketAttr{$testBucket} = \@attr;

            my @attrARFF = getAttrDelux($feature, \@attr);
            $bucketAttrARFF{$testBucket} = \@attrARFF;
        }

        #add to overall
        $vecARFFattr{vector} = \%bucketAttr;
        $vecARFFattr{arff} = \%bucketAttrARFF;

        return %vecARFFattr;
    }
}
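# Usage sketch (hypothetical): %buckets is assumed to map a bucket number to
# the array of line keys assigned to it by the k-fold split done earlier in
# the module.
#
#   my %buckets = (1 => ["k1", "k2"], 2 => ["k3", "k4"]);      # assumed toy split
#   my %attrSets = grabAttr("myfile", "text", \%buckets);
#   my %perBucketVec  = %{$attrSets{vector}};   # bucket => attribute list for vectors
#   my %perBucketARFF = %{$attrSets{arff}};     # bucket => "_self"-expanded ARFF attributes
#   # for "ortho" the arff entry is instead the fixed seven orthographic attribute names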
#returns the attribute values of a range of buckets
# input : $type <-- the feature type [e.g. ortho, morph, text]
#       : @bucketRange <-- the range of the buckets to use [e.g.(1-8,10) out of 10 buckets; use "$uniSub->bully" subroutine in UniversalRoutines.pm]
#       : %buckets <-- the bucket key set
# output : @attributes <-- the set of attributes for the specific type and range
sub getRangeAttributes{
    my $type = shift;
    my $bucketRange_ref = shift;
    my $buckets_ref = shift;
    my @bucketRange = @$bucketRange_ref;
    my %buckets = %$buckets_ref;

    #collect all the necessary keys
    my @keyRing = ();
    foreach my $bucket (sort { $a <=> $b } keys %buckets){
        if($uniSub->inArr($bucket, \@bucketRange)){
            my @keys = @{$buckets{$bucket}};
            push @keyRing, @keys;
        }
    }

    #get the lines
    my @bucketLines = ();
    foreach my $key (@keyRing){
        push(@bucketLines, $fileHash{$key});
    }

    #gather the attributes based on the words in the lines
    my @attributes = ();
    foreach my $line (@bucketLines){              #in each line
        foreach my $o_word (split(' ', $line)){   #each word
            my $word = $o_word;
            $word =~s/$entId//;        #remove the annotation marker
            $word =~s///;              #remove the tag
            $word =~s///;              #remove the tag
            $word =~s/[\:\.\-]//;      #remove any : . - characters

            #if word is empty - skip
            if($word eq ""){
                next;
            }

            #if only looking for text attributes - use the actual word
            if($type eq "text"){
                push(@attributes, $word);
                next;
            }

            #skip if the word has no entry in the feature hash
            if(!$featHash{$word}){
                next;
            }
            #grab attributes if it does
            my $hash_ref = $featHash{$word};
            my %hash = %$hash_ref;

            if($hash{$type}){
                push(@attributes, split(',', $hash{$type}));
            }
        }
    }

    @attributes = uniq(@attributes);
    #$uniSub->printArr(",", \@attributes);
    return @attributes;
}
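# Usage sketch (hypothetical): collects every "cui" value recorded in
# %featHash for the words appearing in the training buckets, deduplicated
# with uniq().
#
#   my @trainRange = $uniSub->bully($bucketsNum, 10);   # assumed to return the other nine buckets
#   my @cuiAttrs   = getRangeAttributes("cui", \@trainRange, \%buckets);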
#makes the arff version attributes - makes a copy of each attribute but with "_self" at the end
# input : $f <-- the feature type (used for special features like POS and morph)
#       : @attrs <-- the attributes to ready for arff output
# output : @attrDelux <-- the delux-arff attribute set
sub getAttrDelux{
    my $f = shift;
    my $attr_ref = shift;
    my @attr = @$attr_ref;

    #add the _self copy
    my @attrDelux = ();
    foreach my $word (@attr){
        #check if certain type of feature
        if($f eq "pos"){
            $word = ($word . "_POS");
        }elsif($f eq "sem"){
            $word = uc $word;
        }
        $word =~s/$entId//g;

        #add the copy and then the original
        my $copy = "$word" . "$selfId";
        if(!$uniSub->inArr($word, \@attrDelux)){
            push (@attrDelux, $copy);
            push(@attrDelux, $word);
        }
    }
    return @attrDelux;
}
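# Worked example (hypothetical input): part-of-speech attributes get a "_POS"
# suffix, and each attribute is emitted twice - once with the "_self" marker
# for the target word and once plain for the adjacent words.
#
#   my @arffAttrs = getAttrDelux("pos", ["NN", "VB"]);
#   # @arffAttrs = ("NN_POS_self", "NN_POS", "VB_POS_self", "VB_POS")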
#looks at the prefix # and suffix # and returns a substring of each word found in the bucket text set
# input : @bucketRange <-- the range of the buckets to use [e.g.(1-8,10) out of 10 buckets; use "$uniSub->bully" subroutine in UniversalRoutines.pm]
#       : %buckets <-- the bucket key set
# output : @attributes <-- the morphological attribute set
sub getMorphoAttributes{
    my $bucketRange_ref = shift;
    my $buckets_ref = shift;
    my @bucketRange = @$bucketRange_ref;
    my %buckets = %$buckets_ref;

    #collect all the necessary keys
    my @keyRing = ();
    foreach my $bucket (sort { $a <=> $b } keys %buckets){
        if($uniSub->inArr($bucket, \@bucketRange)){
            my @keys = @{$buckets{$bucket}};
            push @keyRing, @keys;
        }
    }

    my @bucketLines = ();
    #get the lines for each associated key
    foreach my $key (@keyRing){
        my $line = $fileHash{$key};
        push @bucketLines, $line;
    }

    #get each word from each line
    my @wordSet = ();
    foreach my $line (@bucketLines){
        my @words = split(" ", $line);
        push(@wordSet, @words);
    }

    #get the prefix and suffix from each word
    my @attributes = ();
    foreach my $word (@wordSet){
        $word =~s/$entId//g;
        push(@attributes, substr($word, 0, $prefix));   #add the word's prefix
        push(@attributes, substr($word, -$suffix));     #add the word's suffix
    }

    #my $a = @attributes;
    #$uniSub->printColorDebug("red", "$type ATTR: #$a\n");
    #printArr("\n", @attributes);

    return @attributes;
}
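# Worked example (hypothetical): with $prefix = 3 and $suffix = 3 (for example),
# a bucket line containing "aspirin dose" contributes
#   "asp", "rin"   (prefix/suffix of "aspirin")
#   "dos", "ose"   (prefix/suffix of "dose")
# grabAttr() then uniq()s the list and appends $morphID to the ARFF copies.
#
#   my @morphAttrs = getMorphoAttributes([1, 2], \%buckets);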
#formats attributes for the ARFF file
# input : @set <-- the attribute set
# output : @attributes <-- the arff formatted attributes
sub makeAttrData{
    my $set_ref = shift;
    my @set = @$set_ref;

    my @attributes = ();
    foreach my $attr (@set){
        push (@attributes, "\@ATTRIBUTE $attr NUMERIC");
    }

    return @attributes;
}
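# Worked example (hypothetical input):
#
#   my @arffLines = makeAttrData(["has_digit", "all_capital"]);
#   # @arffLines = ("@ATTRIBUTE has_digit NUMERIC",
#   #               "@ATTRIBUTE all_capital NUMERIC")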
##new stoplist function - by Dr. McInnes
#generates a regex expression searching for the stop list words
sub stop {

    my $stopfile = shift;

    my $stop_regex = "";
    my $stop_mode = "AND";

    open ( STP, $stopfile ) ||
        die ("Couldn't open the stoplist file $stopfile\n$!");

    while ( <STP> ) {
        chomp;

        if(/\@stop.mode\s*=\s*(\w+)\s*$/) {
            $stop_mode=$1;
            if(!($stop_mode=~/^(AND|and|OR|or)$/)) {
                print STDERR "Requested Stop Mode $1 is not supported.\n";
                exit;
            }
            next;
        }

        # accepting Perl regexes from the stop file
        s/^\s+//;
        s/\s+$//;

        #handle blank lines
        if(/^\s*$/) { next; }

        #check if a valid Perl regex
        if(!(/^\//)) {
            print STDERR "Stop token regular expression <$_> should start with '/'\n";
            exit;
        }
        if(!(/\/$/)) {
            print STDERR "Stop token regular expression <$_> should end with '/'\n";
            exit;
        }

        #remove the / s from beginning and end
        s/^\///;
        s/\/$//;

        #form a single big regex
        $stop_regex.="(".$_.")|";
    }

    if(length($stop_regex)<=0) {
        print STDERR "No valid Perl regular expression found in stop file $stopfile";
        exit;
    }

    chop $stop_regex;

    # making AND the default stop mode
    if(!defined $stop_mode) {
        $stop_mode="AND";
    }

    close STP;

    return $stop_regex;
}
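# Usage sketch (hypothetical stop file): given a file containing the lines
#   /\bthe\b/
#   /\ban?\b/
# stop() returns the single alternation "(\bthe\b)|(\ban?\b)", which the
# caller can then match against tokens, e.g.
#
#   my $regex = stop("stoplist.txt");
#   my $is_stopword = ("the" =~ /$regex/) ? 1 : 0;    # 1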
1;