File Coverage

blib/lib/Word2vec/Word2vec.pm
Criterion Covered Total %
statement 401 1177 34.0
branch 182 924 19.7
condition 22 291 7.5
subroutine 88 93 94.6
pod 77 85 90.5
total 770 2570 29.9


line stmt bran cond sub pod time code
1             #!usr/bin/perl
2              
3             ######################################################################################
4             # #
5             # Author: Clint Cuffy #
6             # Date: 06/16/2016 #
7             # Revised: 09/04/2017 #
8             # UMLS Similarity Word2Vec Executable Interface Module #
9             # #
10             ######################################################################################
11             # #
12             # Description: #
13             # ============ #
14             # Perl "word2vec" executable interface for UMLS Similarity #
15             # Features: #
16             # ========= #
17             # Supports Word2Vec Training Using Standard Options #
18             # Conversion of Word2Vec Binary Format To Plain Text And Vice Versa #
19             # Cosine Similarity Between Two Words #
20             # Summed Cosine Similarity #
21             # Average Cosine Similarity #
22             # Multi-Word Cosine Similarity #
23             # Manipulation of Word Vectors (Addition/Subtraction/Average) #
24             # #
25             ######################################################################################
26              
27              
28             package Word2vec::Word2vec;
29              
30 4     4   44217 use strict;
  4         8  
  4         105  
31 4     4   17 use warnings;
  4         8  
  4         89  
32              
33             # Standard Package(s)
34 4     4   19 use Cwd;
  4         5  
  4         235  
35 4     4   1287 use Encode qw( decode encode );
  4         28685  
  4         258  
36              
37              
38 4     4   29 use vars qw($VERSION);
  4         9  
  4         206  
39              
40             $VERSION = '0.03';
41              
42              
43             ######################################################################################
44             # Constructor
45             ######################################################################################
46              
47             BEGIN
48       4     {
49             # CONSTRUCTOR : DO SOMETHING HERE
50             }
51              
52              
53             ######################################################################################
54             # Deconstructor
55             ######################################################################################
56              
57             END
58       4     {
59             # DECONSTRUCTOR : DO SOMETHING HERE
60             }
61              
62              
63             ######################################################################################
64             # new Class Operator
65             ######################################################################################
66              
67             sub new
68             {
69 1     1 1 72 my $class = shift;
70 1         16 my $self = {
71             # Private Member Variables
72             _debugLog => shift, # Boolean (Binary): 0 = False, 1 = True
73             _writeLog => shift, # Boolean (Binary): 0 = False, 1 = True
74             _trainFileName => shift, # String
75             _outputFileName => shift, # String
76             _wordVecSize => shift, # Int
77             _windowSize => shift, # Int
78             _sample => shift, # Float
79             _hSoftMax => shift, # Int
80             _negative => shift, # Int
81             _numOfThreads => shift, # Int
82             _numOfIterations => shift, # Int
83             _minCount => shift, # Int
84             _alpha => shift, # Float
85             _classes => shift, # Int
86             _debug => shift, # Int
87             _binaryOutput => shift, # Boolean (Binary): 0 = False, 1 = True
88             _saveVocab => shift, # String (File Name To Save To)
89             _readVocab => shift, # String (File Name To Read From)
90             _useCBOW => shift, # Boolean (Binary): 0 = Use Skip-Gram Model, 1 = Use CBOW (Default)
91             _workingDir => shift, # String
92             _word2VecExeDir => shift, # String
93             _hashRefOfWordVectors => shift, # Hash Reference of Word2Vec Vectors
94             _overwriteOldFile => shift, # Boolean (Binary): 0 = False, 1 = True
95             _sparseVectorMode => shift, # Boolean (Binary): 0 = False, 1 = True
96             _vectorLength => shift, # Int
97             _numberOfWords => shift, # Int
98             _minimizeMemoryUsage => shift, # Boolean (Binary): 0 = False, 1 = True
99             };
100              
101             # Set debug log variable to false if not defined
102 1 50       5 $self->{ _debugLog } = 0 if !defined ( $self->{ _debugLog } );
103 1 50       3 $self->{ _writeLog } = 0 if !defined ( $self->{ _writeLog } );
104 1 50       4 $self->{ _trainFileName } = "" if !defined ( $self->{ _trainFileName } );
105 1 50       4 $self->{ _outputFileName } = "" if !defined ( $self->{ _outputFileName } );
106 1 50       4 $self->{ _wordVecSize } = 100 if !defined ( $self->{ _wordVecSize } );
107 1 50       3 $self->{ _windowSize } = 5 if !defined ( $self->{ _windowSize } );
108 1 50       3 $self->{ _sample } = 0.001 if !defined ( $self->{ _sample } );
109 1 50       3 $self->{ _hSoftMax } = 0 if !defined ( $self->{ _hSoftMax } );
110 1 50       3 $self->{ _negative } = 5 if !defined ( $self->{ _negative } );
111 1 50       3 $self->{ _numOfThreads } = 12 if !defined ( $self->{ _numOfThreads } );
112 1 50       3 $self->{ _numOfIterations } = 5 if !defined ( $self->{ _numOfIterations } );
113 1 50       4 $self->{ _minCount } = 5 if !defined ( $self->{ _minCount } );
114 1 50       2 $self->{ _classes } = 0 if !defined ( $self->{ _classes } );
115 1 50       3 $self->{ _debug } = 2 if !defined ( $self->{ _debug } );
116 1 50       3 $self->{ _binaryOutput } = 1 if !defined ( $self->{ _binaryOutput } );
117 1 50       3 $self->{ _saveVocab } = "" if !defined ( $self->{ _saveVocab } );
118 1 50       3 $self->{ _readVocab } = "" if !defined ( $self->{ _readVocab } );
119 1 50       2 $self->{ _useCBOW } = 1 if !defined ( $self->{ _useCBOW } );
120              
121 1 50 33     6 $self->{ _alpha } = 0.05 if ( !defined ( $self->{ _alpha } ) && $self->{ _useCBOW } == 1 );
122 1 50 33     4 $self->{ _alpha } = 0.025 if ( !defined ( $self->{ _alpha } ) && $self->{ _useCBOW } == 0 );
123              
124 1 50       12 $self->{ _workingDir } = Cwd::getcwd() if !defined ( $self->{ _workingDir } );
125              
126 1         2 my %hash = ();
127 1 50       3 $self->{ _hashRefOfWordVectors } = \%hash if !defined ( $self->{ _hashRefOfWordVectors } );
128 1 50       3 $self->{ _overwriteOldFile } = 0 if !defined $self->{ _overwriteOldFile };
129 1 50       2 $self->{ _sparseVectorMode } = 0 if !defined $self->{ _sparseVectorMode };
130 1 50       3 $self->{ _vectorLength } = 0 if !defined $self->{ _vectorLength };
131 1 50       2 $self->{ _numberOfWords } = 0 if !defined $self->{ _numberOfWords };
132 1 50       3 $self->{ _minimizeMemoryUsage } = 1 if !defined $self->{ _minimizeMemoryUsage };
133              
134              
135             # Try To Locate Word2Vec Executable Files Path
136 1         3 for my $dir ( @INC )
137             {
138 11 50       45 $self->{ _word2VecExeDir } = "$dir/External/Word2vec" if ( -e "$dir/External/Word2vec" ); # Test Directory
139 11 50       36 $self->{ _word2VecExeDir } = "$dir/../External/Word2vec" if ( -e "$dir/../External/Word2vec" ); # Dev Directory
140 11 50       33 $self->{ _word2VecExeDir } = "$dir/../../External/Word2vec" if ( -e "$dir/../../External/Word2vec" ); # Dev Directory
141 11 100       52 $self->{ _word2VecExeDir } = "$dir/Word2vec/External/Word2vec" if ( -e "$dir/Word2vec/External/Word2vec" ); # Release Directory
142             }
143              
144             # Open File Handler if checked variable is true
145 1 50       6 if( $self->{ _writeLog } )
146             {
147 0         0 open( $self->{ _fileHandle }, '>:encoding(UTF-8)', 'Word2vecLog.txt' );
148 0         0 $self->{ _fileHandle }->autoflush( 1 ); # Auto-flushes writes to log file
149             }
150              
151 1         6 bless $self, $class;
152              
153 1         5 $self->WriteLog( "New - Debug On" );
154 1 50       5 $self->WriteLog( "New - Word2Vec Executable Directory Found" ) if defined( $self->{ _word2VecExeDir } );
155 1 50       5 $self->WriteLog( "New - Setting Word2Vec Executable Directory To: \"" . $self->{ _word2VecExeDir } . "\"" ) if defined( $self->{ _word2VecExeDir } );
156              
157 1         2 return $self;
158             }
159              
160              
161             ######################################################################################
162             # DESTROY
163             ######################################################################################
164              
165             sub DESTROY
166             {
167 1     1   8 my ( $self ) = @_;
168              
169             # Close FileHandle
170 1 50       73 close( $self->{ _fileHandle } ) if( $self->{ _fileHandle } );
171             }
172              
173              
174             ######################################################################################
175             # Module Functions
176             ######################################################################################
177              
178             sub ExecuteTraining
179             {
180 2     2 1 6 my ( $self, $trainFilePath, $outputFilePath, $vectorSize, $windowSize, $minCount, $sample, $negative, $alpha, $hs, $binary, $numOfThreads, $iterations, $useCBOW, $classes, $readVocab, $saveVocab, $debug, $overwrite ) = @_;
181              
182             # Pre-Training Check(s)
183 2         5 my $executableFileDir = $self->GetWord2VecExeDir() . "/word2vec";
184 2 50       5 $executableFileDir .= ".exe" if $self->GetOSType() eq "MSWin32";
185              
186             # Override Train File Path Member Variable With Specified Train File Parameter
187 2 50       12 $self->WriteLog( "ExecuteTraining - \"TrainFilePath\" Parameter Specified / Overriding Member Variable" ) if defined( $trainFilePath );
188 2 50       4 $trainFilePath = $self->GetTrainFilePath() if !defined( $trainFilePath );
189              
190             # Override Output File Path Member Variable With Specified Train File Parameter
191 2 50       9 $self->WriteLog( "ExecuteTraining - \"OutputFilePath\" Parameter Specified / Overriding Member Variable" ) if defined( $outputFilePath );
192 2 50       5 $outputFilePath = $self->GetOutputFilePath() if !defined( $outputFilePath );
193              
194             # Override Overwrite Member Variable With Specified Train File Parameter
195 2 50       4 $self->WriteLog( "ExecuteTraining - \"Overwrite\" Parameter Specified / Overriding Member Variable" ) if defined( $overwrite );
196 2 50       7 $overwrite = $self->GetOverwriteOldFile() if !defined( $overwrite );
197              
198             # Check For 'word2vec' Executable and trainFile
199 2 50       41 $self->WriteLog( "ExecuteTraining - Error: \"word2vec\" Executable File Cannot Be Found" ) if !( -e "$executableFileDir" );
200 2 50       13 return -1 if !( -e "$executableFileDir" );
201 0 0       0 $self->WriteLog( "ExecuteTraining - Error: Training File Not Found" ) if !( -e "$trainFilePath" );
202 0 0       0 $self->WriteLog( "ExecuteTraining - Error: Training File Size = 0 bytes - No Data In Training File" ) if ( -z "$trainFilePath" );
203 0 0 0     0 return -1 if !( -e "$trainFilePath" ) || ( -z "$trainFilePath" );
204              
205             # Checks To See If Training Is Set To Use CBOW or Skip-Gram Model
206 0 0       0 $self->WriteLog( "ExecuteTraining - Attn: Continuous Bag Of Words Model = 0, Using Skip-Gram Model" ) if $self->GetUseCBOW() == 0;
207              
208             # Checks For Existing Output File And Returns -1 If Overwrite Option Is Not Enabled
209 0 0 0     0 $self->WriteLog( "ExecuteTraining - Warning: \"$outputFilePath\" Already Exists - Canceling Training" ) if ( -e "$outputFilePath" && $overwrite == 0 );
210 0 0 0     0 $self->WriteLog( "ExecuteTraining - Try Enabling \"Overwrite\" Option or Delete \"$outputFilePath\" In Working Directory" ) if ( -e "$outputFilePath" && $overwrite == 0 );
211 0 0 0     0 return -1 if ( -e "$outputFilePath" && $overwrite == 0 );
212              
213             # Fetch Other Training Parameters
214 0 0       0 $self->WriteLog( "ExecuteTraining - \"VectorSize\" Parameter Defined / Overriding Member Variable" ) if defined( $vectorSize );
215 0 0       0 $vectorSize = $self->GetWordVecSize() if !defined( $vectorSize );
216              
217 0 0       0 $self->WriteLog( "ExecuteTraining - \"WindowSize\" Parameter Defined / Overriding Member Variable" ) if defined( $windowSize );
218 0 0       0 $windowSize = $self->GetWindowSize() if !defined( $windowSize );
219              
220 0 0       0 $self->WriteLog( "ExecuteTraining - \"Min-Count\" Parameter Defined / Overriding Member Variable" ) if defined( $minCount );
221 0 0       0 $minCount = $self->GetMinCount() if !defined( $minCount );
222              
223 0 0       0 $self->WriteLog( "ExecuteTraining - \"Sample\" Parameter Defined / Overriding Member Variable" ) if defined( $sample );
224 0 0       0 $sample = $self->GetSample() if !defined( $sample );
225              
226 0 0       0 $self->WriteLog( "ExecuteTraining - \"Negative\" Parameter Defined / Overriding Member Variable" ) if defined( $negative );
227 0 0       0 $negative = $self->GetNegative() if !defined( $negative );
228              
229 0 0       0 $self->WriteLog( "ExecuteTraining - \"Alpha\" Parameter Defined / Overriding Member Variable" ) if defined( $alpha );
230 0 0       0 $alpha = $self->GetAlpha() if !defined( $alpha );
231              
232 0 0       0 $self->WriteLog( "ExecuteTraining - \"HSoftMax\" Parameter Defined / Overriding Member Variable" ) if defined( $hs );
233 0 0       0 $hs = $self->GetHSoftMax() if !defined( $hs );
234              
235 0 0       0 $self->WriteLog( "ExecuteTraining - \"Binary\" Parameter Defined / Overriding Member Variable" ) if defined( $binary );
236 0 0       0 $binary = $self->GetBinaryOutput() if !defined( $binary );
237              
238 0 0       0 $self->WriteLog( "ExecuteTraining - \"NumOfThreads\" Parameter Defined / Overriding Member Variable" ) if defined( $numOfThreads );
239 0 0       0 $numOfThreads = $self->GetNumOfThreads() if !defined( $numOfThreads );
240              
241 0 0       0 $self->WriteLog( "ExecuteTraining - \"Iterations\" Parameter Defined / Overriding Member Variable" ) if defined( $iterations );
242 0 0       0 $iterations = $self->GetNumOfIterations() if !defined( $iterations );
243              
244 0 0       0 $self->WriteLog( "ExecuteTraining - \"CBOW\" Parameter Defined / Overriding Member Variable" ) if defined( $useCBOW );
245 0 0       0 $useCBOW = $self->GetUseCBOW() if !defined( $useCBOW );
246              
247 0 0       0 $self->WriteLog( "ExecuteTraining - \"Classes\" Parameter Defined / Overriding Member Variable" ) if defined( $classes );
248 0 0       0 $classes = $self->GetClasses() if !defined( $classes );
249              
250 0 0       0 $self->WriteLog( "ExecuteTraining - \"ReadVocab\" Parameter Defined / Overriding Member Variable" ) if defined( $readVocab );
251 0 0       0 $readVocab = $self->GetReadVocabFilePath() if !defined( $readVocab );
252              
253 0 0       0 $self->WriteLog( "ExecuteTraining - \"SaveVocab\" Parameter Defined / Overriding Member Variable" ) if defined( $saveVocab );
254 0 0       0 $saveVocab = $self->GetSaveVocabFilePath() if !defined( $saveVocab );
255              
256 0 0       0 $self->WriteLog( "ExecuteTraining - \"Debug\" Parameter Defined / Overriding Member Variable" ) if defined( $debug );
257 0 0       0 $debug = $self->GetDebugTraining() if !defined( $debug );
258              
259             # Setting Up Command String
260 0         0 my $command = "\"$executableFileDir\" ";
261 0         0 $command .= ( "-train \"" . $trainFilePath . "\" " );
262 0         0 $command .= ( "-output \"" . $outputFilePath . "\" " );
263 0         0 $command .= ( "-size " . $vectorSize . " " );
264 0         0 $command .= ( "-window " . $windowSize . " " );
265 0         0 $command .= ( "-sample " . $sample . " " );
266 0         0 $command .= ( "-hs " . $hs . " " );
267 0         0 $command .= ( "-negative " . $negative . " " );
268 0         0 $command .= ( "-threads " . $numOfThreads . " " );
269 0         0 $command .= ( "-iter " . $iterations . " " );
270 0         0 $command .= ( "-min-count " . $minCount . " " );
271 0         0 $command .= ( "-alpha " . $alpha . " " );
272 0         0 $command .= ( "-classes " . $classes . " " );
273 0         0 $command .= ( "-binary " . $binary . " " );
274 0         0 $command .= ( "-cbow " . $useCBOW . " " );
275 0 0 0     0 $command .= ( "-read-vocab " . $readVocab . " " ) if ( defined( $readVocab ) && $readVocab ne "" );
276 0 0 0     0 $command .= ( "-save-vocab " . $saveVocab . " " ) if ( defined( $saveVocab ) && $saveVocab ne "" );
277 0         0 $command .= ( "-debug " . $debug . " " );
278              
279 0         0 $self->WriteLog( "Executing Command: $command" );
280              
281             # Execute External System Command To Train "word2vec"
282             # Execute command without capturing program output
283 0         0 my $result = system( "$command" );
284              
285 0         0 print "\n";
286              
287             # Post-Training Check(s)
288 0 0       0 $self->WriteLog( "ExecuteTraining - Error: Unable To Spawn Executable File - Try Running '--clean' Command And Re-compile Executables" ) if ( $result == 65280 );
289              
290 0 0       0 $self->WriteLog( "ExecuteTraining - Error: Word2Vec Output File Does Not Exist" ) if !( -e "$outputFilePath" );
291 0 0       0 $self->WriteLog( "ExecuteTraining - Error: Word2Vec Output File Size = Zero" ) if ( -z "$outputFilePath" );
292 0 0 0     0 $result = -1 if ( !( -e "$outputFilePath" ) || ( -z "$outputFilePath" ) );
293              
294 0 0 0     0 $self->WriteLog( "ExecuteTraining - Training Successful" ) if $result == 0 && ( -e "$outputFilePath" );
295 0 0       0 $self->WriteLog( "ExecuteTraining - Training Unsuccessful" ) if $result != 0;
296              
297 0         0 return $result;
298             }
299              
300             sub ExecuteStringTraining
301             {
302 1     1 1 758 my ( $self, $trainingStr, $outputFilePath, $vectorSize, $windowSize, $minCount, $sample, $negative, $alpha, $hs, $binary,
303             $numOfThreads, $iterations, $useCBOW, $classes, $readVocab, $saveVocab, $debug, $overwrite ) = @_;
304              
305             # Check(s)
306 1 50       4 $self->WriteLog( "ExecuteStringTraining - Error: Training String Is Not Defined" ) if !defined( $trainingStr );
307 1 50       3 return -1 if !defined( $trainingStr );
308              
309 1 50       3 $self->WriteLog( "ExecuteStringTraining - Error: Training String Is Empty" ) if ( $trainingStr eq "" );
310 1 50       3 return -1 if ( $trainingStr eq "" );
311              
312             # Save Training String To Temporary File
313 1         2 my $result = 0;
314              
315 1         2 $self->WriteLog( "ExecuteStringTraining - Saving Training String To Temporary File At Working Directory: \"" . $self->GetWorkingDir() . "\"" );
316              
317 1         3 my $tempFilePath = $self->GetWorkingDir() . "/w2vtemp.txt";
318 1 50       68 open( my $fileHandle, ">:encoding(utf8)", "$tempFilePath" ) or $result = -1;
319              
320 1 50       41 $self->WriteLog( "ExecuteStringTraining - Error Creating File Handle : $!" ) if ( $result == -1 );
321 1 50       2 return -1 if ( $result == -1 );
322              
323             # Print Training String Data To File
324 1 50       55 print( $fileHandle "$trainingStr" ) if defined( $fileHandle );
325              
326 1         14 close( $fileHandle );
327 1         3 undef( $fileHandle );
328              
329 1         4 $self->WriteLog( "ExecuteStringTraining - Temporary Training String File Saved" );
330              
331 1         4 $result = $self->ExecuteTraining( $tempFilePath, $outputFilePath, $vectorSize, $windowSize,
332             $minCount, $sample, $negative, $alpha, $hs, $binary, $numOfThreads,
333             $iterations, $useCBOW, $classes, $readVocab, $saveVocab, $debug, $overwrite );
334              
335 1         3 $self->WriteLog( "ExecuteStringTraining - Removing Temporary Training String Data File" );
336 1         49 unlink( $tempFilePath );
337              
338 1 50       4 $self->WriteLog( "ExecuteStringTraining - Finished" ) if ( $result == 0 );
339 1 50 33     5 $self->WriteLog( "ExecuteStringTraining - Finished With Errors" ) if ( $result == -1 && $self->GetWriteLog() == 0 );
340 1 50 33     5 $self->WriteLog( "ExecuteStringTraining - Finished With Errors / See Log File For Details" ) if ( $result == -1 && $self->GetWriteLog() == 1 ) ;
341              
342 1         5 return $result;
343             }
344              
345             sub ComputeCosineSimilarity
346             {
347 1     1 1 5 my ( $self, $wordA, $wordB ) = @_;
348              
349             # Check(s)
350 1 50 33     5 print( "Error: Dictionary Is Empty / No Vector Data In Memory\n" ) if ( $self->GetDebugLog() == 0 && $self->IsVectorDataInMemory() == 0 );
351 1 50       4 $self->WriteLog( "ComputeCosineSimilarity - Error: Dictionary Is Empty / No Vector Data In Memory" ) if $self->IsVectorDataInMemory() == 0;
352 1 50       3 return undef if ( $self->IsVectorDataInMemory() == 0 );
353              
354 0 0 0     0 $self->WriteLog( "ComputeCosineSimilarity - Error: Function Requires Two Arguments (Words)" ) if !defined ( $wordA ) || !defined ( $wordB );
355 0 0 0     0 return undef if !defined ( $wordA ) || !defined ( $wordB );
356              
357 0         0 $self->WriteLog( "ComputeCosineSimilarity - Computing Cosine Similarity Of Words: \"$wordA\" and \"$wordB\"" );
358              
359 0         0 my @wordAVtr = ();
360 0         0 my @wordBVtr = ();
361              
362              
363             # Search Dictionary For Specified Words
364 0         0 my $wordAData = $self->GetWordVector( $wordA );
365 0         0 my $wordBData = $self->GetWordVector( $wordB );
366 0 0       0 @wordAVtr = split( ' ', $wordAData ) if defined( $wordAData );
367 0 0       0 @wordBVtr = split( ' ', $wordBData ) if defined( $wordBData );
368              
369             # Post Search Check(s)
370 0 0       0 $self->WriteLog( "ComputeCosineSimilarity - Error: \"$wordA\" Not In Dictionary" ) if @wordAVtr == 0;
371 0 0       0 $self->WriteLog( "ComputeCosineSimilarity - Error: \"$wordB\" Not In Dictionary" ) if @wordBVtr == 0;
372 0 0 0     0 return undef if @wordAVtr == 0 || @wordBVtr == 0;
373              
374             # Remove Word From Vector To Compute Cosine Similarity Based On Vector Values
375 0         0 shift( @wordAVtr );
376 0         0 shift( @wordBVtr );
377 0         0 my $wordAVtrSize = @wordAVtr;
378 0         0 my $wordBVtrSize = @wordBVtr;
379              
380             # Check(s)
381 0 0       0 $wordAVtrSize = 0 if !defined( $wordAVtrSize );
382 0 0       0 $wordBVtrSize = 0 if !defined( $wordBVtrSize );
383              
384 0         0 $self->WriteLog( "ComputeCosineSimilarity - Words Present In Dictionary" );
385              
386             # Cosine Similarity => cos(angle) = -> ->
387             # A * B
388             # -------------------
389             # -> ->
390             # || A || * || B ||
391             #
392             # Explanation: Dot Product Of VectorA By VectorB, Divided By The Square Root Of Dot Product Of Vector A Multiplied By Square Root Of Dot Product Of Vector B
393              
394 0         0 my $dpA = 0;
395 0         0 my $dpB = 0;
396 0         0 my $ldpA = 0;
397 0         0 my $ldpB = 0;
398 0         0 my $dpAB = 0;
399              
400             # Compute Dot Product Of VectorA
401 0         0 for my $value ( @wordAVtr )
402             {
403 0         0 $dpA += ( $value * $value );
404             }
405              
406             # Compute Dot Product Of VectorB
407 0         0 for my $value ( @wordBVtr )
408             {
409 0         0 $dpB += ( $value * $value );
410             }
411              
412             # Compute $ldpA & $ldpB
413 0         0 $ldpA = sqrt( $dpA );
414 0         0 $ldpB = sqrt( $dpB );
415              
416             # Compute Cosine Similarity Between Vector A & Vector B
417 0         0 for( my $i = 0; $i < $wordAVtrSize; $i++ )
418             {
419             # Compute Value If Not Dividing By Zero
420 0 0 0     0 $dpAB += ( ( $wordAVtr[$i] / $ldpA ) * ( $wordBVtr[$i] / $ldpB ) ) if ( $ldpA != 0 && $ldpB != 0 );
421             }
422              
423             # Return Value Cosine Similarity Value Rounded To Six Decimal Places
424 0         0 return sprintf( "%.6f", $dpAB );
425             }
426              
427             sub ComputeAvgOfWordsCosineSimilarity
428             {
429 1     1 1 3 my ( $self, $wordA, $wordB ) = @_;
430              
431             # Check(s)
432 1 50 33     4 print( "Error: Dictionary Is Empty / No Vector Data In Memory\n" ) if ( $self->GetDebugLog() == 0 && $self->IsVectorDataInMemory() == 0 );
433 1 50       7 $self->WriteLog( "ComputeAvgOfWordsCosineSimilarity - Error: Dictionary Is Empty / No Vector Data In Memory" ) if $self->IsVectorDataInMemory() == 0;
434 1 50       4 return undef if ( $self->IsVectorDataInMemory() == 0 );
435              
436 0 0 0     0 $self->WriteLog( "ComputeAvgOfWordsCosineSimilarity - Error: Function Requires Two Arguments (Words)" ) if !defined ( $wordA ) || !defined ( $wordB );
437 0 0 0     0 return undef if !defined ( $wordA ) || !defined ( $wordB );
438              
439 0 0 0     0 $self->WriteLog( "ComputeAvgOfWordsCosineSimilarity - Error: One Or More Arguments Consisting Of Empty String" ) if ( $wordA eq "" || $wordB eq "" );
440 0 0 0     0 return undef if ( $wordA eq "" || $wordB eq "" );
441              
442              
443 0         0 my @wordAAry = split( ' ', $wordA );
444 0         0 my @wordBAry = split( ' ', $wordB );
445              
446             # Check(s)
447 0 0 0     0 $self->WriteLog( "ComputeAvgOfWordsCosineSimilarity - Error: One Or More Arguments Contains No Data" ) if ( @wordAAry == 0 || @wordBAry == 0 );
448 0 0 0     0 return undef if ( @wordAAry == 0 || @wordBAry == 0 );
449              
450 0         0 $wordA = $self->ComputeAverageOfWords( \@wordAAry );
451 0         0 $wordB = $self->ComputeAverageOfWords( \@wordBAry );
452              
453             # Check(s)
454 0 0       0 $self->WriteLog( "ComputeAvgOfWordsCosineSimilarity - Unable To Compute Average Of Word(s): \"@wordAAry\"" ) if !defined( $wordA );
455 0 0       0 $self->WriteLog( "ComputeAvgOfWordsCosineSimilarity - Unable To Compute Average Of Word(s): \"@wordBAry\"" ) if !defined( $wordB );
456 0 0 0     0 return undef if !defined( $wordA ) || !defined( $wordB );
457              
458 0         0 my @avgAVtr = split( ' ', $wordA );
459 0         0 my @avgBVtr = split( ' ', $wordB );
460 0         0 my $avgAVtrSize = @avgAVtr;
461 0         0 my $avgBVtrSize = @avgBVtr;
462              
463             # Check(s)
464 0 0       0 $avgAVtrSize = 0 if !defined( $avgAVtrSize );
465 0 0       0 $avgBVtrSize = 0 if !defined( $avgBVtrSize );
466              
467 0         0 undef( $wordA );
468 0         0 undef( $wordB );
469              
470             # Compute Cosine Similarity Between Word Averages
471              
472             # Cosine Similarity => cos(angle) = -> ->
473             # A * B
474             # -------------------
475             # -> ->
476             # || A || * || B ||
477             #
478             # Explanation: Dot Product Of VectorA By VectorB, Divided By The Square Root Of Dot Product Of Vector A Multiplied By Square Root Of Dot Product Of Vector B
479              
480 0         0 my $dpA = 0;
481 0         0 my $dpB = 0;
482 0         0 my $ldpA = 0;
483 0         0 my $ldpB = 0;
484 0         0 my $dpAB = 0;
485              
486             # Compute Dot Product Of VectorA
487 0         0 for my $value ( @avgAVtr )
488             {
489 0         0 $dpA += ( $value * $value );
490             }
491              
492             # Compute Dot Product Of VectorB
493 0         0 for my $value ( @avgBVtr )
494             {
495 0         0 $dpB += ( $value * $value );
496             }
497              
498             # Compute $ldpA & $ldpB
499 0         0 $ldpA = sqrt( $dpA );
500 0         0 $ldpB = sqrt( $dpB );
501              
502             # Compute Cosine Similarity Between Vector A & Vector B
503 0         0 for( my $i = 0; $i < $avgAVtrSize; $i++ )
504             {
505             # Compute Value If Not Dividing By Zero
506 0 0 0     0 $dpAB += ( ( $avgAVtr[$i] / $ldpA ) * ( $avgBVtr[$i] / $ldpB ) ) if ( $ldpA != 0 && $ldpB != 0 );
507             }
508              
509             # Return Value Cosine Similarity Value Rounded To Six Decimal Places
510 0         0 return sprintf( "%.6f", $dpAB );
511             }
512              
513             sub ComputeMultiWordCosineSimilarity
514             {
515 2     2 1 6 my ( $self, $wordA, $wordB, $allWordsMustExist ) = @_;
516              
517             # Check(s)
518 2 50 33     6 print( "Error: Dictionary Is Empty / No Vector Data In Memory\n" ) if ( $self->GetDebugLog() == 0 && $self->IsVectorDataInMemory() == 0 );
519 2 50       7 $self->WriteLog( "ComputeMultiWordCosineSimilarity - Error: Dictionary Is Empty / No Vector Data In Memory" ) if $self->IsVectorDataInMemory() == 0;
520 2 50       6 return undef if ( $self->IsVectorDataInMemory() == 0 );
521              
522 0 0 0     0 $self->WriteLog( "ComputeMultiWordCosineSimilarity - Error: Function Requires Two Arguments (Words)" ) if !defined ( $wordA ) || !defined ( $wordB );
523 0 0 0     0 return undef if !defined ( $wordA ) || !defined ( $wordB );
524              
525 0 0       0 $self->WriteLog( "ComputeMultiWordCosineSimilarity - Warning: \"All Words Must Exist\" Parameter Not Specified / Default = False" ) if !defined( $allWordsMustExist );
526 0 0       0 $allWordsMustExist = 0 if !defined( $allWordsMustExist );
527              
528 0         0 $self->WriteLog( "ComputeMultiWordCosineSimilarity - Computing Cosine Similarity Of Words: \"$wordA\" and \"$wordB\"" );
529              
530 0         0 my @wordAVtr = ();
531 0         0 my @wordBVtr = ();
532              
533              
534             # Split Words To Check For Existence In Dictionary
535 0         0 my @wordAAry = split( ' ', $wordA );
536 0         0 my @wordBAry = split( ' ', $wordB );
537 0         0 my $wordsFoundA = "";
538 0         0 my $wordsFoundB = "";
539              
540             # Check(s)
541 0 0 0     0 $self->WriteLog( "ComputeMultiWordCosineSimilarity - Error: One Or More Arguments Contains No Data" ) if ( @wordAAry == 0 || @wordBAry == 0 );
542 0 0 0     0 return undef if ( @wordAAry == 0 || @wordBAry == 0 );
543              
544             # Search Dictionary For Specified Words
545 0         0 for my $word ( @wordAAry )
546             {
547 0         0 my $wordData = $self->GetWordVector( $word );
548              
549 0 0       0 if( defined( $wordData ) )
550             {
551 0         0 my @wordVtr = split( ' ', $wordData );
552 0         0 push( @wordAVtr, [ @wordVtr ] );
553 0         0 $wordsFoundA .= ( " " . $word );
554             }
555             }
556              
557 0         0 for my $word ( @wordBAry )
558             {
559 0         0 my $wordData = $self->GetWordVector( $word );
560              
561 0 0       0 if( defined( $wordData ) )
562             {
563 0         0 my @wordVtr = split( ' ', $wordData );
564 0         0 push( @wordBVtr, [ @wordVtr ] );
565 0         0 $wordsFoundB .= ( " " . $word );
566             }
567             }
568              
569              
570             # Post Search Check(s)
571 0         0 my $error = 0;
572 0         0 for( my $i = 0; $i < @wordAAry; $i++ )
573             {
574 0 0       0 $self->WriteLog( "ComputeMultiWordCosineSimilarity - Error: \"" . $wordAAry[$i] . "\" Not In Dictionary" ) if index( $wordsFoundA, $wordAAry[$i] ) == -1;
575 0 0 0     0 $error = 1 if index( $wordsFoundA, $wordAAry[$i] ) == -1 && $allWordsMustExist == 1;
576             }
577              
578 0         0 for( my $i = 0; $i < @wordBAry; $i++ )
579             {
580 0 0       0 $self->WriteLog( "ComputeMultiWordCosineSimilarity - Error: \"" . $wordBAry[$i] . "\" Not In Dictionary" ) if index( $wordsFoundB, $wordBAry[$i] ) == -1;
581 0 0 0     0 $error = 1 if index( $wordsFoundB, $wordBAry[$i] ) == -1 && $allWordsMustExist == 1;
582             }
583              
584 0 0 0     0 $self->WriteLog( "ComputeMultiWordCosineSimilarity - Error: Comparing Empty String / No Found Words" ) if ( $wordsFoundA eq "" || $wordsFoundB eq "" );
585 0 0 0     0 $error = 1 if ( $wordsFoundA eq "" || $wordsFoundB eq "" );
586              
587 0 0       0 return undef if $error != 0;
588              
589              
590 0         0 $self->WriteLog( "ComputeMultiWordCosineSimilarity - Words Present In Dictionary" );
591              
592             # Remove Words From Word Vectors
593 0         0 for( my $i = 0; $i < @wordAVtr; $i++ )
594             {
595 0         0 my @tempAry = @{ $wordAVtr[$i] };
  0         0  
596 0         0 shift( @tempAry );
597 0         0 $wordAVtr[$i] = \@tempAry;
598             }
599              
600 0         0 for( my $i = 0; $i < @wordBVtr; $i++ )
601             {
602 0         0 my @tempAry = @{ $wordBVtr[$i] };
  0         0  
603 0         0 shift( @tempAry );
604 0         0 $wordBVtr[$i] = \@tempAry;
605             }
606              
607              
608             # Compute Sum Of Compound Words
609 0         0 my @wordASumAry = ();
610 0         0 my @wordBSumAry = ();
611              
612 0         0 my $wordVtrASize = @{ $wordAVtr[0] };
  0         0  
613 0         0 my $wordVtrBSize = @{ $wordBVtr[0] };
  0         0  
614              
615 0         0 for( my $i = 0; $i < $wordVtrASize; $i++ )
616             {
617 0         0 my $value = 0;
618              
619 0         0 for my $aryRef ( @wordAVtr )
620             {
621 0         0 $value += $aryRef->[$i];
622             }
623              
624 0         0 push( @wordASumAry, $value );
625             }
626              
627 0         0 for( my $i = 0; $i < $wordVtrBSize; $i++ )
628             {
629 0         0 my $value = 0;
630              
631 0         0 for my $aryRef ( @wordBVtr )
632             {
633 0         0 $value += $aryRef->[$i];
634             }
635              
636 0         0 push( @wordBSumAry, $value );
637             }
638              
639              
640             # Cosine Similarity => cos(angle) = -> ->
641             # A * B
642             # -------------------
643             # -> ->
644             # || A || * || B ||
645             #
646             # Explanation: Dot Product Of VectorA By VectorB, Divided By The Square Root Of Dot Product Of Vector A Multiplied By Square Root Of Dot Product Of Vector B
647              
648 0         0 my $dpA = 0;
649 0         0 my $dpB = 0;
650 0         0 my $ldpA = 0;
651 0         0 my $ldpB = 0;
652 0         0 my $dpAB = 0;
653              
654             # Compute Dot Product Of VectorA
655 0         0 for my $value ( @wordASumAry )
656             {
657 0         0 $dpA += ( $value * $value );
658             }
659              
660             # Compute Dot Product Of VectorB
661 0         0 for my $value ( @wordBSumAry )
662             {
663 0         0 $dpB += ( $value * $value );
664             }
665              
666             # Compute $ldpA & $ldpB
667 0         0 $ldpA = sqrt( $dpA );
668 0         0 $ldpB = sqrt( $dpB );
669              
670             # Compute Cosine Similarity Between Vector A & Vector B
671 0         0 for( my $i = 0; $i < $wordVtrASize; $i++ )
672             {
673             # Compute Value If Not Dividing By Zero
674 0 0 0     0 $dpAB += ( ( $wordASumAry[$i] / $ldpA ) * ( $wordBSumAry[$i] / $ldpB ) ) if ( $ldpA != 0 && $ldpB != 0 );
675             }
676              
677             # Return Value Cosine Similarity Value Rounded To Six Decimal Places
678 0         0 return sprintf( "%.6f", $dpAB );
679             }
680              
681             sub ComputeCosineSimilarityOfWordVectors
682             {
683 1     1 1 5 my ( $self, $wordAData, $wordBData ) = @_;
684              
685             # Check(s)
686 1 50 33     5 $self->WriteLog( "ComputeCosineSimilarityOfWordVectors - Error: Function Requires Two Arguments (Word Vectors)" ) if !defined ( $wordAData ) || !defined ( $wordBData );
687 1 50 33     11 return undef if !defined ( $wordAData ) || !defined ( $wordBData );
688              
689 0 0 0     0 $self->WriteLog( "ComputeCosineSimilarityOfWordVectors - Error: One Or More Word Vectors Consist Of No Data" ) if ( $wordAData eq "" || $wordBData eq "" );
690 0 0 0     0 return undef if ( $wordAData eq "" || $wordBData eq "" );
691              
692 0         0 $self->WriteLog( "ComputeCosineSimilarityOfWordVectors - Computing Cosine Similarity Of Word Vectors: \"$wordAData\" and \"$wordBData\"" );
693              
694 0         0 my @wordAVtr = split( ' ', $wordAData );
695 0         0 my @wordBVtr = split( ' ', $wordBData );
696              
697 0         0 undef( $wordAData );
698 0         0 undef( $wordBData );
699              
700 0         0 my $wordAVtrSize = @wordAVtr;
701 0         0 my $wordBVtrSize = @wordBVtr;
702              
703             # Check(s)
704 0 0       0 $wordAVtrSize = 0 if !defined( $wordAVtrSize );
705 0 0       0 $wordBVtrSize = 0 if !defined( $wordBVtrSize );
706              
707 0         0 $self->WriteLog( "ComputeCosineSimilarityOfWordVectors - Words Present In Dictionary" );
708              
709             # Cosine Similarity => cos(angle) = -> ->
710             # A * B
711             # -------------------
712             # -> ->
713             # || A || * || B ||
714             #
715             # Explanation: Dot Product Of VectorA By VectorB, Divided By The Square Root Of Dot Product Of Vector A Multiplied By Square Root Of Dot Product Of Vector B
716              
717 0         0 my $dpA = 0;
718 0         0 my $dpB = 0;
719 0         0 my $ldpA = 0;
720 0         0 my $ldpB = 0;
721 0         0 my $dpAB = 0;
722              
723             # Compute Dot Product Of VectorA
724 0         0 for my $value ( @wordAVtr )
725             {
726 0         0 $dpA += ( $value * $value );
727             }
728              
729             # Compute Dot Product Of VectorB
730 0         0 for my $value ( @wordBVtr )
731             {
732 0         0 $dpB += ( $value * $value );
733             }
734              
735             # Compute $ldpA & $ldpB
736 0         0 $ldpA = sqrt( $dpA );
737 0         0 $ldpB = sqrt( $dpB );
738              
739             # Compute Cosine Similarity Between Vector A & Vector B
740 0         0 for( my $i = 0; $i < $wordAVtrSize; $i++ )
741             {
742             # Compute Value If Not Dividing By Zero
743 0 0 0     0 $dpAB += ( ( $wordAVtr[$i] / $ldpA ) * ( $wordBVtr[$i] / $ldpB ) ) if ( $ldpA != 0 && $ldpB != 0 );
744             }
745              
746             # Return Value Cosine Similarity Value Rounded To Six Decimal Places
747 0         0 return sprintf( "%.6f", $dpAB );
748             }
749              
750             sub CosSimWithUserInput
751             {
752 0     0 1 0 my ( $self ) = @_;
753              
754             # Check
755 0 0 0     0 print( "Error: Dictionary Is Empty / No Vector Data In Memory\n" ) if ( $self->GetDebugLog() == 0 && $self->IsVectorDataInMemory() == 0 );
756 0 0       0 $self->WriteLog( "CosSimWithUserInput - Error: Dictionary Is Empty / No Vector Data In Memory" ) if $self->IsVectorDataInMemory() == 0;
757 0 0       0 return undef if ( $self->IsVectorDataInMemory() == 0 );
758              
759 0         0 my $exit = 0;
760              
761 0         0 $self->WriteLog( "Input (Type \"EXIT\" to exit): ", 0 );
762 0 0       0 print( "Input (Type \"EXIT\" to exit): " ) if $self->GetDebugLog() == 0;
763              
764 0         0 while ( my $input = )
765             {
766 0         0 chomp( $input );
767 0 0       0 return if $input eq "EXIT";
768              
769 0         0 my @wordAry = split( ' ', $input );
770 0 0 0     0 $self->WriteLog( "Warning: Requires two words for input - ex \"man woman\"" ) if @wordAry == 0 || @wordAry == 1;
771 0 0 0     0 $self->WriteLog( "Input (Type \"EXIT\" to exit): ", 0 ) if @wordAry == 0 || @wordAry == 1;
772              
773             # Print Data To Console When DebugLog == 0
774 0 0 0     0 print( "Warning: Requires two words for input - ex \"man woman\" \n" ) if ( $self->GetDebugLog == 0 && ( @wordAry == 0 || @wordAry == 1 ) );
      0        
775 0 0 0     0 print( "Input (Type \"EXIT\" to exit): " ) if ( $self->GetDebugLog == 0 && ( @wordAry == 0 || @wordAry == 1 ) );
      0        
776 0 0 0     0 next if ( @wordAry == 0 || @wordAry == 1 );
777              
778 0         0 my $value = $self->ComputeCosineSimilarity( $wordAry[0], $wordAry[1] );
779 0 0       0 $self->WriteLog( "Result: $value" ) if defined ( $value );
780 0         0 $self->WriteLog( "Input (Type \"EXIT\" to exit): ", 0 );
781              
782             # Print Data To Console When DebugLog == 0
783 0 0 0     0 print( "Error: One Or More Words Not Present In Dictionary\n" ) if ( !defined ( $value ) && $self->GetDebugLog() == 0 );
784 0 0 0     0 print( "Result: $value\n" ) if ( defined ( $value ) && $self->GetDebugLog == 0 );
785 0 0       0 print( "Input (Type \"EXIT\" to exit): " ) if $self->GetDebugLog == 0;
786             }
787             }
788              
789             sub MultiWordCosSimWithUserInput
790             {
791 0     0 1 0 my ( $self ) = @_;
792              
793             # Check
794 0 0 0     0 print( "Error: Dictionary Is Empty / No Vector Data In Memory\n" ) if ( $self->GetDebugLog() == 0 && $self->IsVectorDataInMemory() == 0 );
795 0 0       0 $self->WriteLog( "CosSimWithUserInput - Error: Dictionary Is Empty / No Vector Data In Memory" ) if $self->IsVectorDataInMemory() == 0;
796 0 0       0 return undef if ( $self->IsVectorDataInMemory() == 0 );
797              
798 0         0 my $exit = 0;
799              
800 0         0 $self->WriteLog( "Input (Type \"EXIT\" to exit): ", 0 );
801 0 0       0 print( "Input (Type \"EXIT\" to exit): " ) if $self->GetDebugLog() == 0;
802              
803 0         0 while ( my $input = )
804             {
805 0         0 chomp( $input );
806 0 0       0 return if $input eq "EXIT";
807              
808 0         0 my @wordAry = split( ' ', $input );
809 0 0 0     0 $self->WriteLog( "Warning: Requires two words for input - ex \"man woman\"" ) if @wordAry == 0 || @wordAry == 1;
810 0 0 0     0 $self->WriteLog( "Input (Type \"EXIT\" to exit): ", 0 ) if @wordAry == 0 || @wordAry == 1;
811              
812             # Print Data To Console When DebugLog == 0
813 0 0 0     0 print( "Warning: Requires two words for input - ex \"man woman\"\n" ) if ( $self->GetDebugLog == 0 && ( @wordAry == 0 || @wordAry == 1 ) );
      0        
814 0 0 0     0 print( "Input (Type \"EXIT\" to exit): " ) if ( $self->GetDebugLog == 0 && ( @wordAry == 0 || @wordAry == 1 ) );
      0        
815 0 0 0     0 next if @wordAry == 0 || @wordAry == 1;
816              
817 0         0 my @wordArg1 = split( ':', $wordAry[0] );
818 0         0 my @wordArg2 = split( ':', $wordAry[1] );
819 0         0 my $arg1 = join( ' ', @wordArg1 );
820 0         0 my $arg2 = join( ' ', @wordArg2 );
821 0         0 my $value = $self->ComputeMultiWordCosineSimilarity( $arg1, $arg2 );
822 0 0       0 $self->WriteLog( "Result: $value" ) if defined ( $value );
823 0         0 $self->WriteLog( "Input (Type \"EXIT\" to exit): ", 0 );
824              
825             # Print Data To Console When DebugLog == 0
826 0 0 0     0 print( "Error: One Or More Words Not Present In Dictionary\n" ) if ( !defined ( $value ) && $self->GetDebugLog() == 0 );
827 0 0 0     0 print( "Result: $value\n" ) if ( defined ( $value ) && $self->GetDebugLog() == 0 );
828 0 0       0 print( "Input (Type \"EXIT\" to exit): " ) if $self->GetDebugLog() == 0;
829             }
830             }
831              
832             sub ComputeAverageOfWords
833             {
834 1     1 1 3 my ( $self, $wordAryRef ) = @_;
835              
836             # Check(s)
837 1 50 33     4 print( "Error: Dictionary Is Empty / No Vector Data In Memory\n" ) if ( $self->GetDebugLog() == 0 && $self->IsVectorDataInMemory() == 0 );
838 1 50       3 $self->WriteLog( "ComputeAverageOfWords - Error: Dictionary Is Empty / No Vector Data In Memory" ) if $self->IsVectorDataInMemory() == 0;
839 1 50       3 return undef if ( $self->IsVectorDataInMemory() == 0 );
840              
841 0 0       0 $self->WriteLog( "Error: Method Requires Array Reference Argument / Argument Not Defined" ) if !defined( $wordAryRef );
842 0 0       0 return undef if !defined( $wordAryRef );
843              
844 0         0 my @wordAry = @{ $wordAryRef };
  0         0  
845              
846 0         0 my @foundWords = ();
847 0         0 my @foundWordData = ();
848 0         0 my @resultAry = ();
849              
850 0         0 my $wordDataSize = 0;
851              
852 0         0 $self->WriteLog( "ComputeAverageOfWords - Locating Words In Vocabulary/Dictionary" );
853              
854             # Normal Memory Usage Mode
855 0 0       0 if( $self->GetMinimizeMemoryUsage() == 0 )
856             {
857             # Find Words
858 0         0 for my $word ( @wordAry )
859             {
860             # Dense Vector Data Algorithm
861 0 0       0 if( $self->GetSparseVectorMode() == 0 )
862             {
863             # Fetch Word From Vocabulary/Dictionary
864 0         0 my $result = $self->GetWordVector( $word );
865              
866             # Store Found Word
867 0 0       0 push( @foundWords, $word ) if defined( $result );
868              
869             # Store Found Word Vector Data
870 0 0       0 my @wordData = split( ' ', $result ) if defined( $result );
871 0 0       0 push( @foundWordData, [ @wordData ] ) if @wordData > 0;
872              
873 0 0 0     0 $wordDataSize = @wordData - 1 if $wordDataSize == 0 && defined( $result );
874             }
875             # Sparse Vector Data Algorithm
876             else
877             {
878             # Fetch Word From Vocabulary/Dictionary
879 0         0 my $result = $self->GetWordVector( $word, 1 );
880              
881             # Store Found Word
882 0 0       0 push( @foundWords, $word ) if defined( $result );
883              
884             # Store Found Word Vector Data
885 0 0       0 push( @foundWordData, $self->ConvertRawSparseTextToVectorDataHash( $result ) ) if defined( $result );
886              
887 0 0 0     0 $wordDataSize = $self->GetVectorLength() if $wordDataSize == 0 && defined( $result );
888             }
889             }
890              
891 0         0 $self->WriteLog( "ComputeAverageOfWords - Found: \"" . @foundWords . "\" Of \"" . @wordAry . "\" Words" );
892 0 0       0 $self->WriteLog( "ComputeAverageOfWords - Computing Average Of Found Word(s): @foundWords" ) if @foundWords > 0;
893              
894             # Clear Found Words (Strings)
895 0         0 undef( @foundWords );
896 0         0 @foundWords = ();
897              
898             # Compute Average Of Vector Data For Found Words,
899             # Sum Values Of All Found Word Vectors / Dense Vector Format
900 0 0       0 if( $self->GetSparseVectorMode() == 0 )
901             {
902 0         0 for( my $i = 0; $i < $wordDataSize; $i++ )
903             {
904 0         0 my $value = 0;
905              
906 0         0 for( my $j = 0; $j < @foundWordData; $j++ )
907             {
908 0         0 $value += $foundWordData[$j]->[$i+1];
909             }
910              
911             # Compute Average
912 0         0 $value /= @foundWordData;
913              
914             # Round Decimal Places Greater Than Six
915 0         0 $value = sprintf( "%.6f", $value );
916              
917             # Store Value In Resulting Array
918 0         0 push( @resultAry, $value );
919             }
920             }
921             # Sum Values Of All Found Word Vectors / Sparse Vector Format
922             else
923             {
924             # Create And Zero Fill The Result Vector
925 0         0 @resultAry = ( "0.000000" ) x $wordDataSize;
926              
927 0         0 for( my $i = 0; $i < @foundWordData; $i++ )
928             {
929 0         0 for my $key ( keys( %{ $foundWordData[$i] } ) )
  0         0  
930             {
931 0         0 $resultAry[$key-1] += sprintf( "%.6f", $foundWordData[$i]->{$key} );
932             }
933             }
934              
935             # Compute Average Of All Result Vector Elements
936 0 0       0 if( @foundWordData > 1 )
937             {
938 0         0 for( my $i = 0; $i < @resultAry; $i++ )
939             {
940 0         0 $resultAry[$i] /= @foundWordData;
941 0         0 $resultAry[$i] = sprintf( "%.6f", $resultAry[$i] );
942             }
943             }
944             }
945              
946             # Clear Vector Data For Found Words
947 0 0       0 if( $self->GetSparseVectorMode() == 0 )
948             {
949 0         0 for( my $i = 0; $i < @foundWordData; $i++ )
950             {
951 0         0 $foundWordData[$i] = [];
952             }
953             }
954             else
955             {
956 0         0 for( my $i = 0; $i < @foundWordData; $i++ )
957             {
958 0         0 $foundWordData[$i] = {};
959             }
960             }
961              
962             # Clear Found Word Data
963 0         0 undef( @foundWordData );
964 0         0 @foundWordData = ();
965             }
966             # Minimal Memory Usage Mode
967             else
968             {
969             # Find Words
970 0         0 for my $word ( @wordAry )
971             {
972             # Dense Vector Format / Minimal Memory Usage Mode
973 0 0       0 if( $self->GetSparseVectorMode() == 0 )
974             {
975             # Fetch Word From Vocabulary/Dictionary
976 0         0 my $result = $self->GetWordVector( $word );
977              
978 0 0       0 next if !defined( $result );
979              
980             # Store Found Word
981 0 0       0 push( @foundWords, $word ) if defined( $result );
982              
983             # Split Found Word Vector Data Into An Array
984 0 0       0 my @wordData = split( ' ', $result ) if defined( $result );
985              
986             # Set Word Vector Length
987 0 0 0     0 $wordDataSize = @wordData - 1 if ( $wordDataSize == 0 && defined( $result ) );
988              
989             # Create And Zero Fill The Result Vector If Not Already Done
990 0 0 0     0 @resultAry = ( "0.000000" ) x $wordDataSize if ( @resultAry == 0 && @resultAry != $wordDataSize );
991              
992 0         0 for( my $i = 1; $i < @wordData; $i++ )
993             {
994 0         0 my $value = $wordData[$i];
995              
996             # Round Decimal Places Greater Than Six
997 0         0 $value = sprintf( "%.6f", $value );
998              
999 0         0 $resultAry[$i-1] += $value;
1000             }
1001              
1002 0 0 0     0 $result = "" if ( defined( $result ) && $result ne "" );
1003              
1004 0         0 undef( @wordData );
1005 0         0 @wordData = ();
1006             }
1007             # Sparse Vector Format / Minimal Memory Usage Mode
1008             else
1009             {
1010             # Create And Zero Fill The Result Vector If Not Already Done
1011 0 0       0 @resultAry = ( "0.000000" ) x $self->GetVectorLength() if @resultAry == 0;
1012              
1013             # Fetch Word From Vocabulary/Dictionary
1014 0         0 my $result = $self->GetWordVector( $word, 1 );
1015              
1016             # Store Found Word
1017 0 0       0 push( @foundWords, $word ) if defined( $result );
1018              
1019             # Store Found Word Vector Data
1020 0 0       0 my $wordData = $self->ConvertRawSparseTextToVectorDataHash( $result ) if defined( $result );
1021              
1022             # Copy Hash Element Data To Defined Array Indices
1023 0         0 for my $key ( keys( %{ $wordData } ) )
  0         0  
1024             {
1025 0         0 $resultAry[$key-1] += sprintf( "%.6f", $wordData->{$key} );
1026             }
1027              
1028             # Clear Hash Data
1029 0         0 $wordData = {};
1030 0         0 undef( %{ $wordData } );
  0         0  
1031 0         0 $result = "";
1032             }
1033             }
1034              
1035 0         0 $self->WriteLog( "ComputeAverageOfWords - Found: \"" . @foundWords . "\" Of \"" . @wordAry . "\" Words" );
1036 0 0       0 $self->WriteLog( "ComputeAverageOfWords - Computing Average Of Found Word(s): @foundWords" ) if @foundWords > 0;
1037              
1038             # Compute Average Of All Result Vector Elements
1039 0 0       0 if( @foundWords > 1 )
1040             {
1041 0         0 for( my $i = 0; $i < @resultAry; $i++ )
1042             {
1043 0         0 $resultAry[$i] /= @foundWords;
1044 0         0 $resultAry[$i] = sprintf( "%.6f", $resultAry[$i] );
1045             }
1046             }
1047              
1048             # Clear Found Words (Strings)
1049 0         0 undef( @foundWords );
1050 0         0 @foundWords = ();
1051             }
1052              
1053 0 0       0 $self->WriteLog( "ComputeAverageOfWords - Complete" ) if @resultAry > 0;
1054 0 0       0 $self->WriteLog( "ComputeAverageOfWords - Completed With Errors" ) if @resultAry == 0;
1055              
1056 0 0       0 my $returnStr = join( ' ', @resultAry ) if @resultAry > 0;
1057 0 0       0 $returnStr = undef if @resultAry == 0;
1058 0         0 undef( @resultAry );
1059 0         0 return $returnStr;
1060             }
1061              
1062             sub AddTwoWords
1063             {
1064 1     1 1 4 my ( $self, $wordA, $wordB ) = @_;
1065              
1066             # Check(s)
1067 1 50 33     4 print( "Error: Dictionary Is Empty / No Vector Data In Memory\n" ) if ( $self->GetDebugLog() == 0 && $self->IsVectorDataInMemory() == 0 );
1068 1 50       10 $self->WriteLog( "AddTwoWords - Error: Dictionary Is Empty / No Vector Data In Memory" ) if $self->IsVectorDataInMemory() == 0;
1069 1 50       3 return undef if ( $self->IsVectorDataInMemory() == 0 );
1070              
1071 0 0 0     0 $self->WriteLog( "AddTwoWords - Error: Function Requires Two Arguments (Word Vectors)" ) if !defined ( $wordA ) || !defined ( $wordB );
1072 0 0 0     0 return undef if !defined ( $wordA ) || !defined ( $wordB );
1073              
1074 0         0 my $wordAData = $self->GetWordVector( $wordA );
1075 0         0 my $wordBData = $self->GetWordVector( $wordB );
1076              
1077 0 0       0 $self->WriteLog( "AddTwoWords - Error: \"$wordA\" Not In Dictionary" ) if !defined( $wordAData );
1078 0 0       0 $self->WriteLog( "AddTwoWords - Error: \"$wordB\" Not In Dictionary" ) if !defined( $wordBData );
1079 0 0 0     0 return undef if !defined( $wordAData ) || !defined( $wordBData );
1080              
1081 0         0 my @wordAVtr = split( ' ', $wordAData );
1082 0         0 my @wordBVtr = split( ' ', $wordBData );
1083              
1084             # More Check(s)
1085 0 0       0 $self->WriteLog( "AddTwoWords - Cannot Add Two Word Vectors / Vtr Sizes Not Equal" ) if ( @wordAVtr != @wordBVtr ) ;
1086 0 0       0 return undef if ( @wordAVtr != @wordBVtr );
1087              
1088             # Remove Word From Word Vector (First Element)
1089 0         0 shift( @wordAVtr );
1090 0         0 shift( @wordBVtr );
1091              
1092 0         0 $self->WriteLog( "AddTwoWords - Adding Two Word Vectors" );
1093              
1094 0         0 my @resultVtr = ();
1095              
1096 0         0 for( my $i = 0; $i < @wordAVtr; $i++ )
1097             {
1098 0         0 push( @resultVtr, $wordAVtr[$i] + $wordBVtr[$i] );
1099             }
1100              
1101 0         0 my $resultStr = join( ' ', @resultVtr );
1102 0         0 undef( @resultVtr );
1103              
1104 0         0 $self->WriteLog( "AddTwoWords - Complete" );
1105              
1106 0         0 return $resultStr;
1107             }
1108              
1109             sub SubtractTwoWords
1110             {
1111 1     1 1 4 my ( $self, $wordA, $wordB ) = @_;
1112              
1113             # Check(s)
1114 1 50 33     2 print( "Error: Dictionary Is Empty / No Vector Data In Memory\n" ) if ( $self->GetDebugLog() == 0 && $self->IsVectorDataInMemory() == 0 );
1115 1 50       4 $self->WriteLog( "AddTwoWords - Error: Dictionary Is Empty / No Vector Data In Memory" ) if $self->IsVectorDataInMemory() == 0;
1116 1 50       2 return undef if ( $self->IsVectorDataInMemory() == 0 );
1117              
1118 0 0 0     0 $self->WriteLog( "SubtractTwoWords - Error: Function Requires Two Arguments (Word Vectors)" ) if !defined ( $wordA ) || !defined ( $wordB );
1119 0 0 0     0 return undef if !defined ( $wordA ) || !defined ( $wordB );
1120              
1121 0         0 my $wordAData = $self->GetWordVector( $wordA );
1122 0         0 my $wordBData = $self->GetWordVector( $wordB );
1123              
1124 0 0       0 $self->WriteLog( "SubtractTwoWords - Error: \"$wordA\" Not In Dictionary" ) if !defined( $wordAData );
1125 0 0       0 $self->WriteLog( "SubtractTwoWords - Error: \"$wordB\" Not In Dictionary" ) if !defined( $wordBData );
1126 0 0 0     0 return undef if !defined( $wordAData ) || !defined( $wordBData );
1127              
1128 0         0 my @wordAVtr = split( ' ', $wordAData );
1129 0         0 my @wordBVtr = split( ' ', $wordBData );
1130              
1131             # More Check(s)
1132 0 0       0 $self->WriteLog( "SubtractTwoWords - Cannot Add Two Word Vectors / Vtr Sizes Not Equal" ) if ( @wordAVtr != @wordBVtr ) ;
1133 0 0       0 return undef if ( @wordAVtr != @wordBVtr );
1134              
1135             # Remove Word From Word Vector (First Element)
1136 0         0 shift( @wordAVtr );
1137 0         0 shift( @wordBVtr );
1138              
1139 0         0 $self->WriteLog( "SubtractTwoWords - Subtracting Two Word Vectors" );
1140              
1141 0         0 my @resultVtr = ();
1142              
1143 0         0 for( my $i = 0; $i < @wordAVtr; $i++ )
1144             {
1145 0         0 push( @resultVtr, $wordAVtr[$i] - $wordBVtr[$i] );
1146             }
1147              
1148 0         0 my $resultStr = join( ' ', @resultVtr );
1149 0         0 undef( @resultVtr );
1150              
1151 0         0 $self->WriteLog( "SubtractTwoWords - Complete" );
1152              
1153 0         0 return $resultStr;
1154             }
1155              
1156             sub AddTwoWordVectors
1157             {
1158 1     1 1 3 my ( $self, $wordA, $wordB ) = @_;
1159              
1160             # Check(s)
1161 1 50 33     6 $self->WriteLog( "AddTwoWordVectors - Error: Function Requires Two Arguments (Word Vectors)" ) if !defined ( $wordA ) || !defined ( $wordB );
1162 1 50 33     6 return undef if !defined ( $wordA ) || !defined ( $wordB );
1163              
1164 0         0 my @wordAVtr = split( ' ', $wordA );
1165 0         0 my @wordBVtr = split( ' ', $wordB );
1166              
1167             # More Check(s)
1168 0 0       0 $self->WriteLog( "AddTwoWordVectors - Cannot Add Two Word Vectors / Vtr Sizes Not Equal" ) if ( @wordAVtr != @wordBVtr ) ;
1169 0 0       0 return undef if ( @wordAVtr != @wordBVtr );
1170              
1171 0         0 $self->WriteLog( "AddTwoWordVectors - Adding Two Word Vectors" );
1172              
1173 0         0 my @resultVtr = ();
1174              
1175 0         0 for( my $i = 0; $i < @wordAVtr; $i++ )
1176             {
1177 0         0 push( @resultVtr, $wordAVtr[$i] + $wordBVtr[$i] );
1178             }
1179              
1180 0         0 my $resultStr = join( ' ', @resultVtr );
1181 0         0 undef( @resultVtr );
1182              
1183 0         0 $self->WriteLog( "AddTwoWordVectors - Complete" );
1184              
1185 0         0 return $resultStr;
1186             }
1187              
1188             sub SubtractTwoWordVectors
1189             {
1190 1     1 1 3 my ( $self, $wordA, $wordB ) = @_;
1191              
1192             # Check(s)
1193 1 50 33     6 $self->WriteLog( "SubtractTwoWordVectors - Error: Function Requires Two Arguments (Word Vectors)" ) if !defined ( $wordA ) || !defined ( $wordB );
1194 1 50 33     6 return undef if !defined ( $wordA ) || !defined ( $wordB );
1195              
1196 0         0 my @wordAVtr = split( ' ', $wordA );
1197 0         0 my @wordBVtr = split( ' ', $wordB );
1198              
1199             # More Check(s)
1200 0 0       0 $self->WriteLog( "SubtractTwoWordVectors - Cannot Subtract Two Word Vectors / Vtr Sizes Not Equal" ) if ( @wordAVtr != @wordBVtr ) ;
1201 0 0       0 return undef if ( @wordAVtr != @wordBVtr );
1202              
1203 0         0 $self->WriteLog( "SubtractTwoWordVectors - Subtracting Two Word Vectors" );
1204              
1205 0         0 my @resultVtr = ();
1206              
1207 0         0 for( my $i = 0; $i < @wordAVtr; $i++ )
1208             {
1209 0         0 push( @resultVtr, $wordAVtr[$i] - $wordBVtr[$i] );
1210             }
1211              
1212 0         0 my $resultStr = join( ' ', @resultVtr );
1213 0         0 undef( @resultVtr );
1214              
1215 0         0 $self->WriteLog( "SubtractTwoWordVectors - Complete" );
1216              
1217 0         0 return $resultStr;
1218             }
1219              
1220             sub AverageOfTwoWordVectors
1221             {
1222 1     1 1 4 my ( $self, $wordA, $wordB ) = @_;
1223              
1224             # Check(s)
1225 1 50 33     6 $self->WriteLog( "AverageOfTwoWordVectors - Error: Function Requires Two Arguments (Word Vectors)" ) if !defined ( $wordA ) || !defined ( $wordB );
1226 1 50 33     7 return undef if !defined ( $wordA ) || !defined ( $wordB );
1227              
1228 0         0 my @wordAVtr = split( ' ', $wordA );
1229 0         0 my @wordBVtr = split( ' ', $wordB );
1230              
1231             # More Check(s)
1232 0 0       0 $self->WriteLog( "AverageOfTwoWordVectors - Cannot Compute Average Of Word Vectors / Vtr Sizes Not Equal" ) if ( @wordAVtr != @wordBVtr ) ;
1233 0 0       0 return undef if ( @wordAVtr != @wordBVtr );
1234              
1235 0         0 $self->WriteLog( "AverageOfTwoWordVectors - Averaging Two Word Vectors" );
1236              
1237 0         0 my @resultVtr = ();
1238              
1239 0         0 for( my $i = 0; $i < @wordAVtr; $i++ )
1240             {
1241 0         0 push( @resultVtr, ( $wordAVtr[$i] - $wordBVtr[$i] ) / 2 );
1242             }
1243              
1244 0         0 my $resultStr = join( ' ', @resultVtr );
1245 0         0 undef( @resultVtr );
1246              
1247 0         0 $self->WriteLog( "AverageOfTwoWordVectors - Complete" );
1248              
1249 0         0 return $resultStr;
1250             }
1251              
1252             sub GetWordVector
1253             {
1254 5     5 1 13 my ( $self, $searchWord, $returnRawSparseText ) = @_;
1255              
1256 5 50       14 $returnRawSparseText = 1 if defined( $returnRawSparseText );
1257 5 50       14 $returnRawSparseText = 0 if !defined( $returnRawSparseText );
1258              
1259             # Check(s)
1260 5 50 33     11 print( "Error: Dictionary Is Empty / No Vector Data In Memory\n" ) if ( $self->GetDebugLog() == 0 && $self->IsVectorDataInMemory() == 0 );
1261 5 50       13 $self->WriteLog( "GetWordVector - Error: No Vector Data In Memory - Cannot Fetch Word Vector Data" ) if ( $self->IsVectorDataInMemory() == 0 );
1262 5 50       9 return undef if ( $self->IsVectorDataInMemory() == 0 );
1263              
1264 0         0 my $wordVectorData = $self->GetVocabularyHash->{ $searchWord };
1265              
1266 0 0       0 $self->WriteLog( "GetWordVector - Warning: \"$searchWord\" Not Found In Dictionary" ) if !defined( $wordVectorData );
1267              
1268 0 0       0 return undef if !defined( $wordVectorData );
1269              
1270 0         0 my $returnStr = "";
1271              
1272             # Convert Sparse Format To Regular Format
1273 0 0       0 if( $self->GetSparseVectorMode() == 1 )
1274             {
1275 0 0       0 if( $returnRawSparseText == 1 )
1276             {
1277 0         0 return $searchWord . " " . $wordVectorData;
1278             }
1279              
1280 0         0 my $vectorSize = $self->GetVectorLength();
1281              
1282             # Check
1283 0 0       0 $self->WriteLog( "GetWordVector - Error: Cannot Convert Sparse Data To Dense Format / Vector Length = 0 - Expects Vector Length >= 1" ) if ( $vectorSize == 0 );
1284 0 0       0 return undef if ( $vectorSize == 0 );
1285              
1286 0         0 my @data = split( ' ', $wordVectorData );
1287              
1288             # Make Array Of Vector Size With All Zeros
1289 0 0       0 my @wordVector = ( "0.000000" ) x $vectorSize if ( $vectorSize != 0 );
1290              
1291 0         0 for( my $i = 0; $i < @data; $i++ )
1292             {
1293             # If The Index ($i) Is Even, Then The Element Is An Index
1294 0 0       0 my $index = $data[$i] if ( $i % 2 == 0 );
1295              
1296             # If The Index Is Defined, Then Next Element Is An Index Element
1297 0 0       0 my $element = $data[$i+1] if defined( $index );
1298              
1299             # Assign The Correct Index Element To The Specified Index
1300 0 0 0     0 $wordVector[$index] = $element if defined( $index ) && defined( $element );
1301             }
1302              
1303             # Assign New Standard Format Word Vector To $returnStr
1304 0         0 $returnStr = $searchWord . " " . join( ' ', @wordVector );
1305              
1306             # Clear Array
1307 0         0 undef( @data );
1308 0         0 @data = ();
1309 0         0 undef( @wordVector );
1310 0         0 @wordVector = ();
1311             }
1312             else
1313             {
1314 0         0 $returnStr = $searchWord . " " . $wordVectorData;
1315             }
1316              
1317 0         0 return $returnStr;
1318             }
1319              
1320             sub IsVectorDataInMemory
1321             {
1322 52     52 1 93 my ( $self ) = @_;
1323              
1324 52         65 my $numberOfWordsInMemory = keys %{ $self->GetVocabularyHash() };
  52         91  
1325 52 50       101 return 1 if $numberOfWordsInMemory > 0;
1326              
1327 52         500 return 0;
1328             }
1329              
1330             sub IsWordOrCUIVectorData
1331             {
1332 4     4 1 8 my ( $self ) = @_;
1333              
1334             # Check(s)
1335 4 50       10 $self->WriteLog( "isWordOrCUIVectorData - Error: No Vector Vocabulary Data In Memory" ) if $self->IsVectorDataInMemory() == 0;
1336 4 50       7 return undef if $self->IsVectorDataInMemory() == 0;
1337              
1338 0         0 my @vocabularyWords = keys %{ $self->GetVocabularyHash() };
  0         0  
1339 0         0 @vocabularyWords = sort( @vocabularyWords );
1340              
1341             # Choose Random Word, Avoiding First Three Vector Elements
1342 0         0 my $term = $vocabularyWords[ rand( @vocabularyWords - 2 ) + 2 ];
1343              
1344             # Clean Up
1345 0         0 undef( @vocabularyWords );
1346              
1347             # Perform Check
1348 0         0 $term = lc( $term );
1349 0         0 my @terms = split( 'c', $term );
1350              
1351             # Return Word Term If There Are Not Two Elements After Splitting
1352 0 0       0 return "word" if( @terms != 2 );
1353              
1354             # If $term Is CUI, Then First Element Should Be Empty String
1355 0 0       0 return "word" if ( $terms[0] ne "" );
1356              
1357             # Remove Numbers From Second Element
1358 0         0 $terms[1] =~ s/[0-9]//g;
1359              
1360             # If $term Is CUI, Then After Removing All Number From Second Element An Empty String Is All That Is Left
1361 0 0       0 return "word" if ( $terms[1] ne "" );
1362              
1363 0         0 return "cui";
1364             }
1365              
1366             sub IsVectorDataSorted
1367             {
1368 0     0 1 0 my ( $self, $aryRef ) = @_;
1369              
1370 0 0       0 my $vocabHashRef = $self->GetVocabularyHash() if !defined( $aryRef );
1371 0 0       0 $vocabHashRef = $aryRef if defined( $aryRef );
1372              
1373 0 0       0 $self->WriteLog( "IsVectorDataSorted - Error: No Vector Data In Memory" ) if ( keys %{ $vocabHashRef } == 0 );
  0         0  
1374 0 0       0 return -1 if ( keys %{ $vocabHashRef } == 0 );
  0         0  
1375              
1376 0         0 my $numOfWords = $self->GetNumberOfWords();
1377 0         0 my $vectorLength = $self->GetVectorLength();
1378              
1379 0 0 0     0 return 1 if defined( $vocabHashRef->{ $numOfWords } ) && $vocabHashRef->{ $numOfWords } eq "$vectorLength #\$\@RTED#";
1380 0         0 return 0;
1381             }
1382              
1383             sub CheckWord2VecDataFileType
1384             {
1385 3     3 1 10 my ( $self, $fileDir ) = @_;
1386              
1387             # Check(s)
1388 3 50       22 $self->WriteLog( "CheckWord2VecDataFileType - Error: File Path Not Defined" ) if !defined( $fileDir );
1389 3 50       9 return undef if !defined( $fileDir );
1390              
1391 3 100       35 $self->WriteLog( "CheckWord2VecDataFileType - Error: File Cannot Be Found / Does Not Exist" ) if !( -e $fileDir );
1392 3 100       36 return undef if !( -e $fileDir );
1393              
1394              
1395             # Check Word Vector File Format
1396 2         4 my $fileType = "";
1397 2         4 my $numOfWordVectors = 0;
1398 2         4 my $sizeOfVectors = 0;
1399 2         2 my $sparseVectorsFlag = 0;
1400              
1401 2 50       43 open( my $fh, "<:", "$fileDir" ) or $self->WriteLog( "CheckWord2VecDataFileType - Error Opening File : $!" );
1402              
1403 2         9 for( my $i = 0; $i < 2; $i++ )
1404             {
1405 4         46 my $data = <$fh>;
1406              
1407             # Store Number Of Word Vectors And Vector Size
1408 4 100       12 if( $i == 0 )
1409             {
1410 2         67 my @dimensionsAry = split( ' ', $data );
1411              
1412             # Fetch Number Of Word Vectors
1413 2 50       12 $numOfWordVectors = $dimensionsAry[0] if ( @dimensionsAry >= 2 );
1414              
1415             # Fetch Size Of Vectors
1416 2 50       7 $sizeOfVectors = $dimensionsAry[1] if ( @dimensionsAry >= 2 );
1417              
1418             # Skip First Line (First Line Is Always Plain Text Format)
1419 2         9 next;
1420             }
1421              
1422             # Check Second Line Of File To Determine Whether File Is Text Or Binary Format
1423 2         4 my $oldData = $data;
1424 2         12 my $newData = Encode::decode( "utf8", $data, Encode::FB_QUIET );
1425 2 50       52 $fileType = "text" if length( $oldData ) == length( $newData );
1426 2 50       25 $fileType = "binary" if length( $oldData ) != length( $newData );
1427              
1428             # Check Second Line For Sparse Vector
1429 2 50       8 my @dataAry = split( ' ', $oldData ) if defined( $oldData );
1430 2 50 33     15 $sparseVectorsFlag = 1 if defined( $oldData ) && ( @dataAry - 1 != $sizeOfVectors );
1431             }
1432              
1433             # Read A Couple Lines To Determine Whether Vectors Are 'Sparse' Or 'Full' Plain Vectors
1434 2 50       8 if( $fileType eq "text" )
1435             {
1436 2 50       8 my $checkLength = 50 if ( $numOfWordVectors > 50 );
1437 2 50       7 $checkLength = $numOfWordVectors if ( $numOfWordVectors < 50 );
1438              
1439             # Read Data From File To Check For Sparse Vectors
1440 2         11 for( my $i = 0; $i < $checkLength - 2; $i++ )
1441             {
1442 0         0 my $data = <$fh>;
1443 0 0       0 my @dataAry = split( ' ', $data ) if defined( $data );
1444 0 0 0     0 $sparseVectorsFlag = 1 if defined( $data ) && ( @dataAry - 1 != $sizeOfVectors );
1445             }
1446              
1447 2 50       6 $fileType = "sparsetext" if ( $sparseVectorsFlag == 1 );
1448             }
1449              
1450 2         15 close( $fh );
1451 2         9 undef( $fh );
1452              
1453 2         14 return $fileType;
1454             }
1455              
1456             sub ReadTrainedVectorDataFromFile
1457             {
1458 4     4 1 9 my ( $self, $fileDir, $searchWord ) = @_;
1459              
1460 4         19 $self->WriteLog( "ReadTrainedVectorDataFromFile - Reading File \"$fileDir\"" );
1461              
1462             # Check(s)
1463 4 50       10 $self->WriteLog( "ReadTrainedVectorDataFromFile - Error: Directory Not Defined" ) if !defined ( $fileDir );
1464 4 50       9 return -1 if !defined ( $fileDir );
1465              
1466 4 100       36 $self->WriteLog( "ReadTrainedVectorDataFromFile - Error: Directory/File Does Not Exist" ) if !( -e "$fileDir" );
1467 4 100       22 return -1 if !( -e "$fileDir" );
1468              
1469 1 50       8 $self->WriteLog( "ReadTrainedVectorDataFromFile - Error: Vector Data File Size = 0 bytes / File Contains No Data" ) if ( -z "$fileDir" );
1470 1 50       8 return -1 if ( -z "$fileDir" );
1471              
1472 0         0 my $numberOfWordsInMemory = $self->GetNumberOfWords();
1473 0 0       0 $self->WriteLog( "ReadTrainedVectorDataFromFile - Error: Module Already Contains Vector Training Data In Memory" ) if $numberOfWordsInMemory > 0;
1474 0 0       0 return -1 if $numberOfWordsInMemory > 0;
1475              
1476 0 0       0 $self->WriteLog( "ReadTrainedVectorDataFromFile - Searching For Word \"$searchWord\" In Vector Data File \"$fileDir\"" ) if defined( $searchWord );
1477 0 0       0 $self->WriteLog( "ReadTrainedVectorDataFromFile - Warning: Vector Data Will Be Cleared From Memory After Search Is Complete" ) if defined ( $searchWord );
1478              
1479             # Check To See If File Data Is Binary Or Text
1480 0         0 my $fileType = $self->CheckWord2VecDataFileType( $fileDir );
1481              
1482             # Check
1483 0 0       0 $self->WriteLog( "ReadTrainedVectorDataFromFile - Error: Unable To Determine Vector Data Format" ) if !defined( $fileType );
1484 0 0       0 return -1 if !defined( $fileType );
1485              
1486 0 0       0 $self->WriteLog( "ReadTrainedVectorDataFromFile - Detected File Type As \"Plain Text Format\"" ) if $fileType eq "text" ;
1487 0 0       0 $self->WriteLog( "ReadTrainedVectorDataFromFile - Detected File Type As \"Sparse Vector Text Format\"" ) if $fileType eq "sparsetext" ;
1488 0 0       0 $self->WriteLog( "ReadTrainedVectorDataFromFile - Detected File Type As \"Word2Vec Binary Format\"" ) if $fileType eq "binary" ;
1489              
1490 0 0       0 $self->WriteLog( "ReadTrainedVectorDataFromFile - Setting \"Sparse Vector Mode\" = True" ) if $fileType eq "sparsetext" ;
1491 0 0       0 $self->SetSparseVectorMode( 1 ) if $fileType eq "sparsetext";
1492 0 0       0 $self->SetSparseVectorMode( 0 ) if $fileType ne "sparsetext";
1493              
1494 0         0 $self->WriteLog( "ReadTrainedVectorDataFromFile - Reading Data" );
1495              
1496              
1497             # Read Trained Vector Data From File To Memory
1498 0         0 my $fileHandle;
1499              
1500             # Read Plain Text Data Format From File
1501 0 0       0 if ( $fileType eq "text" )
    0          
    0          
1502             {
1503 0         0 my $lineCount = 0;
1504 0         0 open( $fileHandle, '<:encoding(UTF-8)', "$fileDir" );
1505              
1506 0         0 while( my $row = <$fileHandle> )
1507             {
1508 0         0 chomp $row;
1509 0         0 $row = lc( $row );
1510              
1511             # Progress Percent Indicator - Print Percentage Of File Loaded
1512 0 0       0 print( int( ( $lineCount / $self->GetNumberOfWords() ) * 100 ) . "%" ) if ( $self->GetNumberOfWords() > 0 );
1513              
1514             # Skip If Line Is Empty
1515 0 0       0 next if( length( $row ) == 0 );
1516              
1517 0 0       0 if( $lineCount == 0 )
1518             {
1519 0         0 my @data = split( ' ', $row );
1520              
1521             # Check(s)
1522 0 0       0 if( @data < 2 )
1523             {
1524 0         0 $self->WriteLog( "ReadTrainedVectorDataFromFile - Error: File Does Not Contain Header Information / NumOfWords & VectorLength" );
1525 0         0 close( $fileHandle );
1526 0         0 return -1;
1527             }
1528              
1529 0         0 $self->SetNumberOfWords( $data[0] );
1530 0         0 $self->SetVectorLength( $data[1] );
1531             }
1532              
1533             # Search For Search Word And Return If Found
1534 0 0       0 if ( defined( $searchWord ) )
1535             {
1536 0         0 my @data = split( ' ', $row );
1537              
1538 0 0       0 if ( $data[0] eq $searchWord )
1539             {
1540 0         0 $self->WriteLog( "ReadTrainedVectorDataFromFile - Search Word Found / Clearing Variables" );
1541 0         0 $self->ClearVocabularyHash();
1542 0         0 close( $fileHandle );
1543 0         0 return join( ' ', @data );
1544             }
1545             }
1546             # Store Vector Data In Memory
1547             else
1548             {
1549 0         0 $self->AddWordVectorToVocabHash( $row );
1550             }
1551              
1552             # Progress Percent Indicator - Return To Beginning Of Line
1553 0 0       0 print( "\r" ) if ( $self->GetNumberOfWords() > 0 );
1554              
1555 0         0 $lineCount++;
1556             }
1557              
1558 0         0 close( $fileHandle );
1559             }
1560             # Read Spare Text Format From File
1561             elsif( $fileType eq "sparsetext" )
1562             {
1563 0         0 my $lineCount = 0;
1564 0         0 my $numOfWordVectors = 0;
1565 0         0 my $vectorSize = 0;
1566              
1567 0         0 open( $fileHandle, '<:encoding(UTF-8)', "$fileDir" );
1568              
1569 0         0 while( my $row = <$fileHandle> )
1570             {
1571 0         0 chomp $row;
1572 0         0 $row = lc( $row );
1573              
1574             # Progress Percent Indicator - Print Percentage Of File Loaded
1575 0 0       0 print( int( ( $lineCount / $self->GetNumberOfWords() ) * 100 ) . "%" ) if ( $self->GetNumberOfWords() > 0 );
1576              
1577             # Skip If Line Is Empty
1578 0 0       0 next if( length( $row ) == 0 );
1579              
1580             # Skip First Line ( First Line Holds Number Of Word Vectors And Vector Size / Is Always Even )
1581 0 0       0 if( $lineCount == 0 )
    0          
1582             {
1583 0         0 my @data = split( ' ', $row );
1584              
1585             # Check(s)
1586 0 0       0 if( @data < 2 )
1587             {
1588 0         0 $self->WriteLog( "ReadTrainedVectorDataFromFile - Error: File Does Not Contain Header Information / NumOfWords" );
1589 0         0 close( $fileHandle );
1590 0         0 return -1;
1591             }
1592              
1593 0         0 $numOfWordVectors = $data[0];
1594 0         0 $vectorSize = $data[1] - 1;
1595              
1596 0         0 $self->SetNumberOfWords( $numOfWordVectors );
1597 0         0 $self->SetVectorLength( $vectorSize + 1 );
1598              
1599             }
1600             elsif( $lineCount > 0 )
1601             {
1602 0         0 my @data = split( ' ', $row );
1603              
1604             # If Array Size Is Odd, Then Error Out
1605             # Explanation: ie. - $dataAry[1] = "heart 1 0.002323 4 0.124342 16 0.005610 17"
1606             # There Are Four Indices And Three Index Elements, There Should Be
1607             # One Index Per Index Element. A Proper Sparse Vector Should Look As Follows.
1608             # ie. - $dataAry[1] = "heart 1 0.002323 4 0.124342 16 0.005610 17 0.846613"
1609             # With The Word Included In The Word Vector, The Vector Size Should Always
1610             # Be Odd By Nature.
1611             #
1612 0 0 0     0 if ( @data > 2 && @data % 2 == 0 )
1613             {
1614 0         0 $self->WriteLog( "ReadTrainedVectorDataFromFile - Error: Improper Sparse Vector Format - Index/Index Element Number Mis-Match" );
1615 0         0 $self->WriteLog( "ReadTrainedVectorDataFromFile - Occured At Line #$lineCount: \"$row\"" );
1616 0         0 $self->WriteLog( "ReadTrainedVectorDataFromFile - Clearing Vocabulary Array" );
1617 0         0 $self->ClearVocabularyHash();
1618 0         0 return -1;
1619             }
1620              
1621             # Fetch String Word In First Element
1622 0 0       0 $self->WriteLog( "ReadTrainedVectorDataFromFile - Error: First Element Of Data Array (Word) Not Defined - Line: $lineCount" ) if !defined( $data[0] );
1623 0 0       0 return -1 if !defined( $data[0] );
1624              
1625             # Clear Array
1626 0         0 @data = ();
1627             }
1628              
1629             # Search For Search Word And Return If Found
1630 0 0       0 if ( defined( $searchWord ) )
1631             {
1632 0         0 my @data = split( ' ', $row );
1633              
1634 0 0       0 if ( $data[0] eq $searchWord )
1635             {
1636 0         0 $self->WriteLog( "ReadTrainedVectorDataFromFile - Search Word Found / Clearing Variables" );
1637 0         0 $self->ClearVocabularyHash();
1638 0         0 close( $fileHandle );
1639 0         0 return join( ' ', @data );
1640             }
1641             }
1642             # Store Vector Data In Memory
1643             else
1644             {
1645 0         0 $self->AddWordVectorToVocabHash( $row );
1646             }
1647              
1648             # Progress Percent Indicator - Return To Beginning Of Line
1649 0 0       0 print( "\r" ) if ( $self->GetNumberOfWords() > 0 );
1650              
1651 0         0 $lineCount++;
1652             }
1653              
1654 0         0 close( $fileHandle );
1655             }
1656             # Read Word2Vec Binary Data Format From File
1657             elsif( $fileType eq "binary" )
1658             {
1659 0         0 open( $fileHandle, "$fileDir" );
1660 0         0 binmode $fileHandle;
1661              
1662 0         0 my $buffer = "";
1663 0         0 my $word = "";
1664 0         0 my $wordVectorData = "";
1665              
1666             # Fetch "Number Of Words" and "Word Vector Size" From First Line
1667 0         0 my $row = <$fileHandle>;
1668 0         0 chomp( $row );
1669              
1670             # Skip If Line Is Empty
1671 0 0       0 next if( length( $row ) == 0 );
1672              
1673 0         0 my @strAry = split( ' ', $row );
1674              
1675             # Check(s)
1676 0 0       0 return if @strAry < 2;
1677              
1678              
1679 0         0 my $wordCount = $strAry[0];
1680 0         0 my $wordSize = $strAry[1];
1681 0         0 my $count = 1;
1682 0         0 $word = "";
1683              
1684 0         0 $self->SetNumberOfWords( $wordCount );
1685 0         0 $self->SetVectorLength( $wordSize );
1686              
1687             # Add Word Count & Word Vector Size To Memory
1688 0         0 $self->AddWordVectorToVocabHash( "$row" );
1689              
1690             # Begin Fetching Data From File
1691 0         0 while( $count < $wordCount + 1 )
1692             {
1693 0         0 my $cont = 1;
1694              
1695             # Progress Percent Indicator - Print Percentage Of File Loaded
1696 0 0       0 print( int( ( $count / $self->GetNumberOfWords() ) * 100 ) . "%" ) if ( $self->GetNumberOfWords() > 0 );
1697              
1698             # Fetch Word
1699 0         0 while( $cont == 1 )
1700             {
1701             # Fetch Word
1702 0         0 chomp( $buffer = getc( $fileHandle ) );
1703 0 0 0     0 $word .= $buffer if $buffer ne " " && defined( $buffer );
1704              
1705             # Check(s)
1706 0 0       0 $cont = 0 if eof;
1707 0 0       0 $cont = 0 if $buffer eq " ";
1708 0 0       0 $self->WriteLog( "ReadTrainedVectorDataFromFile - ERROR: Unexpectedly Reached End Of File" ) if eof;
1709 0 0       0 $self->WriteLog( " Expected Word Count / Vector Size") if eof;
1710 0 0       0 $self->WriteLog( " $wordCount / $wordSize" ) if eof;
1711 0 0       0 $self->WriteLog( " Current Word Count" ) if eof;
1712 0 0       0 $self->WriteLog( " $count" ) if eof;
1713 0 0       0 $count = $wordCount + 1 if eof;
1714 0 0       0 next if eof;
1715             }
1716              
1717             # Fetch Word Vector Float Values
1718 0         0 for( my $i = 0; $i < $wordSize; $i++ )
1719             {
1720             # Read Specified Bytes Amount From File
1721 0         0 read( $fileHandle, $buffer, 4 ); # Assumes size of floating point is 4 bytes
1722 0         0 chomp( $buffer );
1723              
1724             # Check(s)
1725 0 0 0     0 $i = $wordSize + 1 if !defined( $buffer ) || $buffer eq 0;
1726 0 0 0     0 next if !defined( $buffer ) || $buffer eq 0;
1727              
1728 0 0 0     0 if( defined( $buffer ) && $buffer ne "" )
1729             {
1730             # Convert Binary Values To Float
1731 0         0 $buffer = unpack( "f", $buffer ); # Unpacks/convert 4 byte string to floating point
1732 0         0 $wordVectorData .= ( " " . sprintf( "%.6f", $buffer ) ); # Round Decimal At Sixth Place
1733             }
1734             }
1735              
1736             # Word Vector = Word + WordVectorData
1737 0         0 $word .= $wordVectorData;
1738              
1739             # Search For Search Word And Return If Found
1740 0 0       0 if ( defined( $searchWord ) )
1741             {
1742 0         0 my @data = split( ' ', $word );
1743              
1744 0 0       0 if ( $data[0] eq $searchWord )
1745             {
1746 0         0 $self->WriteLog( "ReadTrainedVectorDataFromFile - Search Word Found / Clearing Variables" );
1747 0         0 $self->ClearVocabularyHash();
1748 0         0 close( $fileHandle );
1749 0         0 return join( ' ', @data );
1750             }
1751             }
1752             # Store Vector Data In Memory
1753             else
1754             {
1755             # Add Word Vector To Memory
1756 0 0       0 $self->AddWordVectorToVocabHash( $word ) if $word ne "";
1757             }
1758              
1759             # Clear Variables
1760 0         0 $word = "";
1761 0         0 $wordVectorData = "";
1762 0         0 $buffer = "";
1763              
1764 0         0 $count++;
1765              
1766             # Progress Percent Indicator - Return To Beginning Of Line
1767 0 0       0 print( "\r" ) if ( $self->GetNumberOfWords() > 0 );
1768             }
1769              
1770 0         0 close( $fileHandle );
1771             }
1772              
1773 0 0       0 my $numberOfWords = keys %{ $self->GetVocabularyHash() } if defined( $self->GetVocabularyHash() );
  0         0  
1774 0 0       0 $numberOfWords = 0 if !defined( $numberOfWords );
1775 0         0 $self->WriteLog( "ReadTrainedVectorDataFromFile - Reading Data Complete" );
1776 0         0 $self->WriteLog( "ReadTrainedVectorDataFromFile - $numberOfWords Word Vectors Stored In Memory" );
1777              
1778             # Used To Print New Line For Progress Percent Indicator
1779 0         0 print( "\n" );
1780              
1781             # Cannot Find Search Word In File
1782 0 0       0 return -1 if ( defined( $searchWord ) );
1783              
1784 0         0 return 0;
1785             }
1786              
1787             sub SaveTrainedVectorDataToFile
1788             {
1789 3     3 1 8 my ( $self, $savePath, $saveFormat ) = @_;
1790              
1791             # Check(s)
1792 3 50       8 $self->WriteLog( "SaveTrainedVectorDataToFile - Error: No Save Path Defined" ) if !defined( $savePath );
1793 3 50       7 return -1 if !defined ( $savePath );
1794              
1795 3 50       8 $saveFormat = 0 if !defined ( $saveFormat );
1796              
1797             # Save Data To File
1798 3         4 my $fileHandle;
1799              
1800             # Save Vector Data In Plain Text Format
1801 3 100       15 if ( $saveFormat == 0 )
    100          
    50          
1802             {
1803 1         5 $self->WriteLog( "SaveTrainedVectorDataToFile - Saving Word2Vec Data To Text File: \"$savePath\"" );
1804              
1805 1 50       92 open( $fileHandle, ">:encoding(utf8)", "$savePath" ) or return -1;
1806 1         60 my $vocabHashRef = $self->GetVocabularyHash();
1807 1         2 my @dataAry = sort( keys %{ $vocabHashRef } );
  1         3  
1808              
1809 1 50       5 if( $self->GetSparseVectorMode() == 1 )
1810             {
1811 0         0 my $numOfWords = $self->GetNumberOfWords();
1812 0         0 my $vectorSize = $self->GetVectorLength();
1813              
1814 0         0 for( my $i = 0; $i < @dataAry; $i++ )
1815             {
1816             # Progress Percent Indicator - Print Percentage Of File Loaded
1817 0 0       0 print( int( ( $i / $numOfWords ) * 100 ) . "%" ) if ( $numOfWords > 0 );
1818              
1819 0         0 my $wordVectorData = $dataAry[$i] . " " . $vocabHashRef->{ $dataAry[$i] };
1820              
1821             # Check(s)
1822 0 0       0 $self->WriteLog( "SaveTrainedVectorDataToFile - Warning: Word Vector Contains No Data / Empty String - Line: $i" ) if ( $wordVectorData eq "" );
1823 0 0       0 next if ( $wordVectorData eq "" );
1824              
1825 0 0       0 if( $i == 0 )
1826             {
1827 0         0 print( $fileHandle "$wordVectorData\n" )
1828             }
1829             else
1830             {
1831 0         0 my @data = split( ' ', $wordVectorData );
1832              
1833             # Get Word
1834 0         0 my $word = $data[0];
1835              
1836             # Make Array Of Vector Size With All Zeros
1837 0 0       0 my @wordVector = ( "0.000000" ) x $vectorSize if ( $vectorSize != 0 );
1838              
1839 0         0 for( my $j = 1; $j < @data; $j++ )
1840             {
1841             # If The Index ($i) Is Odd, Then The Element Is An Index
1842 0 0       0 my $index = $data[$j] if ( $j % 2 == 1 );
1843              
1844             # If The Index Is Defined, Then Next Element Is An Index Element
1845 0 0       0 my $element = $data[$j+1] if defined( $index );
1846              
1847             # Assign The Correct Index Element To The Specified Index
1848 0 0 0     0 $wordVector[$index] = $element if defined( $index ) && defined( $element );
1849             }
1850              
1851             # Generate Regular Formatted Word Vector
1852 0         0 $word = $word . " " . join( ' ', @wordVector );
1853              
1854             # Print Dictionary/Vocabulary Vector Data To File
1855 0         0 print( $fileHandle "$word \n" );
1856              
1857             # Clear Array
1858 0         0 @data = ();
1859 0         0 @wordVector = ();
1860             }
1861              
1862             # Progress Percent Indicator - Return To Beginning Of Line
1863 0 0       0 print( "\r" ) if ( $numOfWords > 0 );
1864             }
1865             }
1866             else
1867             {
1868             # Get Number Of Word Vectors and Vector Array Size
1869 1         20 my $numOfWords = $self->GetNumberOfWords();
1870 1         3 my $vectorSize = $self->GetVectorLength();
1871              
1872             # Print Dictionary/Vocabulary Vector Data To File
1873 1         6 for( my $i = 0; $i < @dataAry; $i++ )
1874             {
1875             # Progress Percent Indicator - Print Percentage Of File Loaded
1876 0 0       0 print( int( ( $i / $numOfWords ) * 100 ) . "%" ) if ( $numOfWords > 0 );
1877              
1878 0         0 my $data = $dataAry[$i] . " " . $vocabHashRef->{ $dataAry[$i] };
1879 0 0       0 print( $fileHandle "$data\n" ) if ( $i == 0 );
1880 0 0       0 print( $fileHandle "$data \n" ) if ( $i > 0 );
1881              
1882             # Progress Percent Indicator - Return To Beginning Of Line
1883 0 0       0 print( "\r" ) if ( $numOfWords > 0 );
1884             }
1885             }
1886              
1887 1         11 close( $fileHandle );
1888 1         4 undef( $fileHandle );
1889              
1890 1         2 $self->WriteLog( "SaveTrainedVectorDataToFile - File Saved" );
1891             }
1892             # Save Vector Data In Word2Vec Binary Format
1893             elsif ( $saveFormat == 1 )
1894             {
1895 1         5 $self->WriteLog( "SaveTrainedVectorDataToFile - Saving Word2Vec Data To Binary File: \"$savePath\"" );
1896              
1897             # Get Vocabulary and Vector Sizes
1898 1         3 my $vocabHashRef = $self->GetVocabularyHash();
1899 1         2 my @dataAry = sort( keys %{ $vocabHashRef } );
  1         3  
1900              
1901             # Check(s)
1902 1 50       4 $self->WriteLog( "SaveTrainedVectorDataToFile - Error: No Word2Vec Vector Data In Memory / Vocabulary Size == 0" ) if @dataAry == 0;
1903 1 50       6 return -1 if @dataAry == 0;
1904              
1905 0 0       0 open( $fileHandle, ">:raw", "$savePath" ) or return -1;
1906 0         0 binmode( $fileHandle ); # Not necessary as ":raw" implies binmode.
1907              
1908 0         0 my $headerStr = $dataAry[0] . " " . $vocabHashRef->{ $dataAry[0] };
1909 0         0 my @headerAry = split( ' ', $headerStr );
1910 0 0       0 return -1 if ( @headerAry < 2 );
1911              
1912 0         0 my $numOfWords = $headerAry[0];
1913 0         0 my $windowSize = $headerAry[1];
1914 0         0 @headerAry = ();
1915 0         0 undef( @headerAry );
1916              
1917             # Print Vocabulary and Windows Sizes To File With Line Feed
1918 0         0 print( $fileHandle "$headerStr\n" );
1919              
1920             # Print Word2Vec Vocabulary and Vector Data To File With Line Feed(s)
1921 0         0 for( my $i = 0; $i < @dataAry; $i++ )
1922             {
1923             # Progress Percent Indicator - Print Percentage Of File Loaded
1924 0 0       0 print( int( ( $i / $numOfWords ) * 100 ) . "%" ) if ( $numOfWords > 0 );
1925              
1926 0         0 my $data = $dataAry[$i] . " " . $vocabHashRef->{ $dataAry[$i] };
1927              
1928             # Check(s)
1929 0 0       0 next if ( $i == 0 );
1930              
1931             # Convert Sparse Vector Data To Dense Vector Format
1932 0 0       0 if ( $self->GetSparseVectorMode() == 1 )
1933             {
1934 0         0 my @tempAry = split( ' ', $data );
1935 0         0 my $word = $tempAry[0];
1936 0         0 @tempAry = ();
1937 0         0 @tempAry = @{ $self->ConvertRawSparseTextToVectorDataAry( $data ) };
  0         0  
1938 0         0 $data = "$word " . join( ' ', @tempAry );
1939 0         0 undef( @tempAry );
1940             }
1941              
1942 0         0 my @ary = split( ' ', $data );
1943 0 0       0 next if @ary < $windowSize;
1944              
1945             # Separate "Word" From "Vector Data"
1946 0         0 my $word = shift( @ary ) . " ";
1947 0         0 my $arySize = @ary;
1948              
1949             # Print Word To File
1950 0         0 print( $fileHandle $word );
1951              
1952             # Print Word Vector Data To File
1953 0         0 for my $value ( @ary )
1954             {
1955 0         0 print( $fileHandle pack( 'f', $value ) ); # Packs String Data In Decimal Binary Format
1956             }
1957              
1958             # Add Line Feed To End Of Word + Vector Data
1959 0         0 print( $fileHandle "\n" );
1960              
1961             # Progress Percent Indicator - Return To Beginning Of Line
1962 0 0       0 print( "\r" ) if ( $numOfWords > 0 );
1963             }
1964              
1965 0         0 close( $fileHandle );
1966 0         0 undef( $fileHandle );
1967              
1968 0         0 $self->WriteLog( "SaveTrainedVectorDataToFile - File Saved" );
1969             }
1970             # Save Vectors In Sparse Vector Format
1971             elsif ( $saveFormat == 2 )
1972             {
1973 1         8 $self->WriteLog( "SaveTrainedVectorDataToFile - Saving Word2Vec Data To Sparse Text File: \"$savePath\"" );
1974              
1975 1 50       59 open( $fileHandle, ">:encoding(utf8)", "$savePath" ) or return -1;
1976 1         47 my $vocabHashRef = $self->GetVocabularyHash();
1977 1         2 my @dataAry = sort( keys( %{ $vocabHashRef } ) );
  1         3  
1978              
1979 1 50       4 if( $self->GetSparseVectorMode() == 1 )
1980             {
1981 0         0 for my $data ( @dataAry )
1982             {
1983 0         0 print( $fileHandle $data . " " . $vocabHashRef->{ $data } . "\n" );
1984             }
1985             }
1986             else
1987             {
1988             # Get Number Of Word Vectors and Vector Array Size
1989 1         3 my $numOfWords = $self->GetNumberOfWords();
1990 1         3 my $vectorSize = $self->GetVectorLength();
1991              
1992             # Print Dictionary/Vocabulary Vector Data To File
1993 1         5 for( my $i = 0; $i < @dataAry; $i++ )
1994             {
1995             # Progress Percent Indicator - Print Percentage Of File Loaded
1996 0 0       0 print( int( ( $i / $numOfWords ) * 100 ) . "%" ) if ( $numOfWords > 0 );
1997              
1998 0         0 my $data = $dataAry[$i] . " " . $vocabHashRef->{ $dataAry[$i] };
1999 0 0       0 print( $fileHandle "$data\n" ) if ( $i == 0 );
2000              
2001 0 0 0     0 if( $i > 0 && defined( $data ) )
2002             {
2003 0         0 my @wordAry = split( ' ', $data );
2004              
2005 0         0 my $word = $wordAry[0];
2006              
2007             # Print The Vector Word To The File
2008 0         0 print( $fileHandle "$word" );
2009              
2010             # Print Vector Data To File
2011 0         0 for( my $j = 1; $j < @wordAry; $j++ )
2012             {
2013 0         0 my $index = $j - 1;
2014 0         0 my $value = $wordAry[$j];
2015 0 0       0 print( $fileHandle " $index $value" ) if ( $value != 0 );
2016             }
2017              
2018 0         0 print( $fileHandle " \n" );
2019             }
2020              
2021             # Progress Percent Indicator - Return To Beginning Of Line
2022 0 0       0 print( "\r" ) if ( $numOfWords > 0 );
2023             }
2024             }
2025              
2026 1         8 close( $fileHandle );
2027 1         3 undef( $fileHandle );
2028              
2029 1         3 $self->WriteLog( "SaveTrainedVectorDataToFile - File Saved" );
2030             }
2031              
2032             # Used To Print New Line For Progress Percent Indicator
2033 2         52 print( "\n" );
2034              
2035 2         14 return 0;
2036             }
2037              
2038             sub StringsAreEqual
2039             {
2040 2     2 1 5 my ( $self , $strA, $strB ) = @_;
2041              
2042 2         6 $strA = lc( $strA );
2043 2         4 $strB = lc( $strB );
2044              
2045 2 100       12 return 0 if length( $strA ) != length( $strB );
2046 1 50       5 return 0 if index( $strA, $strB ) != 0;
2047              
2048 1         4 return 1;
2049             }
2050              
2051             sub RemoveWordFromWordVectorString
2052             {
2053 3     3 1 14 my ( $self, $dataStr ) = @_;
2054              
2055             # Check(s)
2056 3 50       11 return undef if !defined( $dataStr );
2057              
2058 0         0 my @tempAry = split( ' ', $dataStr, 2 );
2059 0         0 $dataStr = $tempAry[1];
2060              
2061 0         0 undef( @tempAry );
2062              
2063 0         0 return $dataStr;
2064             }
2065              
2066             sub ConvertRawSparseTextToVectorDataAry
2067             {
2068 1     1 1 3 my ( $self, $rawSparseText ) = @_;
2069              
2070             # Check(s)
2071 1 50       4 $self->WriteLog( "ConvertRawSparseTextToVectorDataAry - Error: No Sparse Text Defined" ) if !defined( $rawSparseText );
2072 1 50       4 return () if !defined( $rawSparseText );
2073              
2074 1 50       3 $self->WriteLog( "ConvertRawSparseTextToVectorDataAry - Error: Sparse Text String Empty" ) if ( $rawSparseText eq "" );
2075 1 50       4 return () if ( $rawSparseText eq "" );
2076              
2077 1         4 my $vectorSize = $self->GetVectorLength();
2078              
2079 1 50       5 $self->WriteLog( "ConvertRawSparseTextToVectorDataAry - Error: Vector Size == 0" ) if ( $vectorSize == 0 );
2080 1 50       29 return () if ( $vectorSize == 0 );
2081              
2082             # Begin Data Conversion
2083 0         0 my @data = split( ' ', $rawSparseText );
2084              
2085             # Make Array Of Vector Size With All Zeros
2086 0         0 my @wordVector = ( "0.000000" ) x $vectorSize;
2087              
2088 0         0 for( my $i = 0; $i < @data; $i++ )
2089             {
2090             # Skip First Element / First Element Contains Word
2091 0 0       0 next if $i == 0;
2092              
2093             # If The Index ($i) Is Odd, Then The Element Is An Index
2094 0 0       0 my $index = $data[$i] if ( $i % 2 == 1 );
2095              
2096             # If The Index Is Defined, Then Next Element Is An Index Element
2097 0 0       0 my $element = $data[$i+1] if defined( $index );
2098              
2099             # Assign The Correct Index Element To The Specified Index
2100 0 0 0     0 $wordVector[$index] = $element if defined( $index ) && defined( $element );
2101             }
2102              
2103             # Clear Data
2104 0         0 undef( @data );
2105 0         0 @data = ();
2106 0         0 $rawSparseText = undef;
2107              
2108 0         0 return \@wordVector;
2109             }
2110              
2111             sub ConvertRawSparseTextToVectorDataHash
2112             {
2113 0     0 1 0 my ( $self, $rawSparseText ) = @_;
2114              
2115             # Check(s)
2116 0 0       0 $self->WriteLog( "ConvertRawSparseTextToVectorDataAry - Error: No Sparse Text Defined" ) if !defined( $rawSparseText );
2117 0 0       0 return () if !defined( $rawSparseText );
2118              
2119 0 0       0 $self->WriteLog( "ConvertRawSparseTextToVectorDataAry - Error: Sparse Text String Empty" ) if ( $rawSparseText eq "" );
2120 0 0       0 return () if ( $rawSparseText eq "" );
2121              
2122 0         0 my $vectorSize = $self->GetVectorLength();
2123              
2124 0 0       0 $self->WriteLog( "ConvertRawSparseTextToVectorDataAry - Error: Vector Size == 0" ) if ( $vectorSize == 0 );
2125 0 0       0 return () if ( $vectorSize == 0 );
2126              
2127             # Begin Data Conversion
2128 0         0 my @data = split( ' ', $rawSparseText );
2129              
2130 0         0 my %wordHash;
2131              
2132 0         0 for( my $i = 0; $i < @data; $i++ )
2133             {
2134             # Skip First Element / First Element Contains Word
2135 0 0       0 next if $i == 0;
2136              
2137             # If The Index ($i) Is Odd, Then The Element Is An Index
2138 0 0       0 my $index = $data[$i] if ( $i % 2 == 1 );
2139              
2140             # If The Index Is Defined, Then Next Element Is An Index Element
2141 0 0       0 my $element = $data[$i+1] if defined( $index );
2142              
2143             # Assign The Correct Index Element To The Specified Index
2144 0 0 0     0 $wordHash{$index} = $element if defined( $index ) && defined( $element );
2145             }
2146              
2147             # Clear Data
2148 0         0 undef( @data );
2149 0         0 @data = ();
2150 0         0 $rawSparseText = undef;
2151              
2152 0         0 return \%wordHash;
2153             }
2154              
2155             sub GetOSType
2156             {
2157 2     2 1 4 my ( $self ) = @_;
2158 2         9 return $^O;
2159             }
2160              
2161              
2162             ######################################################################################
2163             # Accessors
2164             ######################################################################################
2165              
2166             sub GetDebugLog
2167             {
2168 64     64 1 498 my ( $self ) = @_;
2169 64 50       125 $self->{ _debugLog } = 0 if !defined ( $self->{ _debugLog } );
2170 64         172 return $self->{ _debugLog };
2171             }
2172              
2173             sub GetWriteLog
2174             {
2175 54     54 1 72 my ( $self ) = @_;
2176 54 50       103 $self->{ _writeLog } = 0 if !defined ( $self->{ _writeLog } );
2177 54         112 return $self->{ _writeLog };
2178             }
2179              
2180             sub GetFileHandle
2181             {
2182 1     1 1 3 my ( $self ) = @_;
2183 1 50       5 $self->{ _fileHandle } = undef if !defined ( $self->{ _fileHandle } );
2184 1         3 return $self->{ _fileHandle };
2185             }
2186              
2187             sub GetTrainFilePath
2188             {
2189 2     2 1 4 my ( $self ) = @_;
2190 2 50       9 $self->{ _trainFileName } = "" if !defined ( $self->{ _trainFileName } );
2191 2         7 return $self->{ _trainFileName };
2192             }
2193              
2194             sub GetOutputFilePath
2195             {
2196 2     2 1 4 my ( $self ) = @_;
2197 2 50       6 $self->{ _outputFileName } = "" if !defined ( $self->{ _outputFileName } );
2198 2         7 return $self->{ _outputFileName };
2199             }
2200              
2201             sub GetWordVecSize
2202             {
2203 2     2 1 4 my ( $self ) = @_;
2204 2 50       6 $self->{ _wordVecSize } = 100 if !defined ( $self->{ _wordVecSize } );
2205 2         6 return $self->{ _wordVecSize };
2206             }
2207              
2208             sub GetWindowSize
2209             {
2210 2     2 1 5 my ( $self ) = @_;
2211 2 50       6 $self->{ _windowSize } = 5 if !defined ( $self->{ _windowSize } );
2212 2         6 return $self->{ _windowSize };
2213             }
2214              
2215             sub GetSample
2216             {
2217 2     2 1 4 my ( $self ) = @_;
2218 2 50       6 $self->{ _sample } = 0.001 if !defined ( $self->{ _sample } );
2219 2         11 return $self->{ _sample };
2220             }
2221              
2222             sub GetHSoftMax
2223             {
2224 2     2 1 3 my ( $self ) = @_;
2225 2 50       6 $self->{ _hSoftMax } = 0 if !defined ( $self->{ _hSoftMax } );
2226 2         6 return $self->{ _hSoftMax };
2227             }
2228              
2229             sub GetNegative
2230             {
2231 2     2 1 5 my ( $self ) = @_;
2232 2 50       6 $self->{ _negative } = 5 if !defined ( $self->{ _negative } );
2233 2         6 return $self->{ _negative };
2234             }
2235              
2236             sub GetNumOfThreads
2237             {
2238 2     2 1 4 my ( $self ) = @_;
2239 2 50       6 $self->{ _numOfThreads } = 12 if !defined ( $self->{ _numOfThreads } );
2240 2         5 return $self->{ _numOfThreads };
2241             }
2242              
2243             sub GetNumOfIterations
2244             {
2245 2     2 1 3 my ( $self ) = @_;
2246 2 50       6 $self->{ _numOfIterations } = 5 if !defined ( $self->{ _numOfIterations } );
2247 2         7 return $self->{ _numOfIterations };
2248             }
2249              
2250             sub GetMinCount
2251             {
2252 2     2 1 3 my ( $self ) = @_;
2253 2 50       14 $self->{ _minCount } = 5 if !defined ( $self->{ _minCount } );
2254 2         6 return $self->{ _minCount };
2255             }
2256              
2257             sub GetAlpha
2258             {
2259 3     3 1 7 my ( $self ) = @_;
2260 3 50 33     8 $self->{ _alpha } = 0.05 if ( !defined ( $self->{ _alpha } ) && $self->GetUseCBOW() == 1 );
2261 3 50 33     7 $self->{ _alpha } = 0.025 if ( !defined ( $self->{ _alpha } ) && $self->GetUseCBOW() == 0 );
2262 3         8 return $self->{ _alpha };
2263             }
2264              
2265             sub GetClasses
2266             {
2267 2     2 1 207 my ( $self ) = @_;
2268 2 50       6 $self->{ _classes } = 0 if !defined ( $self->{ _classes } );
2269 2         6 return $self->{ _classes };
2270             }
2271              
2272             sub GetDebugTraining
2273             {
2274 2     2 1 5 my ( $self ) = @_;
2275 2 50       5 $self->{ _debug } = 2 if !defined ( $self->{ _debug } );
2276 2         6 return $self->{ _debug };
2277             }
2278              
2279             sub GetBinaryOutput
2280             {
2281 2     2 1 4 my ( $self ) = @_;
2282 2 50       5 $self->{ _binaryOutput } = 1 if !defined ( $self->{ _binaryOutput } );
2283 2         7 return $self->{ _binaryOutput };
2284             }
2285              
2286             sub GetSaveVocabFilePath
2287             {
2288 2     2 1 3 my ( $self ) = @_;
2289 2 50       5 $self->{ _saveVocab } = "" if !defined ( $self->{ _saveVocab } );
2290 2         7 return $self->{ _saveVocab };
2291             }
2292              
2293             sub GetReadVocabFilePath
2294             {
2295 2     2 1 3 my ( $self ) = @_;
2296 2 50       6 $self->{ _readVocab } = "" if !defined ( $self->{ _readVocab } );
2297 2         7 return $self->{ _readVocab };
2298             }
2299              
2300             sub GetUseCBOW
2301             {
2302 2     2 1 4 my ( $self ) = @_;
2303 2 50       6 $self->{ _useCBOW } = 1 if !defined ( $self->{ _useCBOW } );
2304 2         6 return $self->{ _useCBOW };
2305             }
2306              
2307             sub GetWorkingDir
2308             {
2309 5     5 1 11 my ( $self ) = @_;
2310 5 50       10 $self->{ _workingDir } = Cwd::getcwd() if !defined ( $self->{ _workingDir } );
2311 5         31 return $self->{ _workingDir };
2312             }
2313              
2314             sub GetWord2VecExeDir
2315             {
2316 5     5 1 9 my ( $self ) = @_;
2317 5 50       11 $self->{ _word2VecExeDir } = "" if !defined( $self->{ _word2VecExeDir } );
2318 5         17 return $self->{ _word2VecExeDir };
2319             }
2320              
2321             sub GetVocabularyHash
2322             {
2323 57     57 1 84 my ( $self ) = @_;
2324 57 50       113 $self->{ _hashRefOfWordVectors } = undef if !defined ( $self->{ _hashRefOfWordVectors } );
2325 57         111 return $self->{ _hashRefOfWordVectors };
2326             }
2327              
2328             sub GetOverwriteOldFile
2329             {
2330 4     4 1 5 my ( $self ) = @_;
2331 4 50       10 $self->{ _overwriteOldFile } = 0 if !defined ( $self->{ _overwriteOldFile } );
2332 4         8 return $self->{ _overwriteOldFile };
2333             }
2334              
2335             sub GetSparseVectorMode
2336             {
2337 4     4 0 9 my ( $self ) = @_;
2338 4 50       10 $self->{ _sparseVectorMode } = 0 if !defined ( $self->{ _sparseVectorMode } );
2339 4         18 return $self->{ _sparseVectorMode };
2340             }
2341              
2342             sub GetVectorLength
2343             {
2344 5     5 0 10 my ( $self ) = @_;
2345 5 50       13 $self->{ _vectorLength } = 0 if !defined ( $self->{ _vectorLength } );
2346 5         12 return $self->{ _vectorLength };
2347             }
2348              
2349             sub GetNumberOfWords
2350             {
2351 4     4 0 6 my ( $self ) = @_;
2352 4 50       11 $self->{ _numberOfWords } = 0 if !defined ( $self->{ _numberOfWords } );
2353 4         11 return $self->{ _numberOfWords };
2354             }
2355              
2356             sub GetMinimizeMemoryUsage
2357             {
2358 2     2 0 4 my ( $self ) = @_;
2359 2 50       6 $self->{ _minimizeMemoryUsage } = 1 if !defined ( $self->{ _minimizeMemoryUsage } );
2360 2         6 return $self->{ _minimizeMemoryUsage };
2361             }
2362              
2363              
2364             ######################################################################################
2365             # Mutators
2366             ######################################################################################
2367              
2368             sub SetTrainFilePath
2369             {
2370 2     2 1 5 my ( $self, $str ) = @_;
2371 2         5 return $self->{ _trainFileName } = $str;
2372             }
2373              
2374             sub SetOutputFilePath
2375             {
2376 2     2 1 3 my ( $self, $str ) = @_;
2377 2         5 return $self->{ _outputFileName } = $str;
2378             }
2379              
2380             sub SetWordVecSize
2381             {
2382 2     2 1 5 my ( $self, $value ) = @_;
2383 2         3 return $self->{ _wordVecSize } = $value;
2384             }
2385              
2386             sub SetWindowSize
2387             {
2388 2     2 1 3 my ( $self, $value ) = @_;
2389 2         4 return $self->{ _windowSize } = $value;
2390             }
2391              
2392             sub SetSample
2393             {
2394 2     2 1 4 my ( $self, $value ) = @_;
2395 2         3 return $self->{ _sample } = $value;
2396             }
2397              
2398             sub SetHSoftMax
2399             {
2400 2     2 1 4 my ( $self, $value ) = @_;
2401 2         4 return $self->{ _hSoftMax } = $value;
2402             }
2403              
2404             sub SetNegative
2405             {
2406 2     2 1 4 my ( $self, $value ) = @_;
2407 2         4 return $self->{ _negative } = $value;
2408             }
2409              
2410             sub SetNumOfThreads
2411             {
2412 2     2 1 5 my ( $self, $value ) = @_;
2413 2         4 return $self->{ _numOfThreads } = $value;
2414             }
2415              
2416             sub SetNumOfIterations
2417             {
2418 2     2 1 4 my ( $self, $value ) = @_;
2419 2         4 return $self->{ _numOfIterations } = $value;
2420             }
2421              
2422             sub SetMinCount
2423             {
2424 2     2 1 3 my ( $self, $value ) = @_;
2425 2         4 return $self->{ _minCount } = $value;
2426             }
2427              
2428             sub SetAlpha
2429             {
2430 2     2 1 6 my ( $self, $value ) = @_;
2431 2         4 return $self->{ _alpha } = $value;
2432             }
2433              
2434             sub SetClasses
2435             {
2436 2     2 1 5 my ( $self, $value ) = @_;
2437 2         4 return $self->{ _classes } = $value;
2438             }
2439              
2440             sub SetDebugTraining
2441             {
2442 1     1 1 3 my ( $self, $value ) = @_;
2443 1         3 return $self->{ _debug } = $value;
2444             }
2445              
2446             sub SetBinaryOutput
2447             {
2448 1     1 1 3 my ( $self, $value ) = @_;
2449 1         2 return $self->{ _binaryOutput } = $value;
2450             }
2451              
2452             sub SetSaveVocabFilePath
2453             {
2454 2     2 1 5 my ( $self, $str ) = @_;
2455 2         3 return $self->{ _saveVocab } = $str;
2456             }
2457              
2458             sub SetReadVocabFilePath
2459             {
2460 2     2 1 5 my ( $self, $str ) = @_;
2461 2         4 return $self->{ _readVocab } = $str;
2462             }
2463              
2464             sub SetUseCBOW
2465             {
2466 2     2 1 3 my ( $self, $value ) = @_;
2467 2         4 return $self->{ _useCBOW } = $value;
2468             }
2469              
2470             sub SetWorkingDir
2471             {
2472 2     2 1 7 my ( $self, $dir ) = @_;
2473 2         4 return $self->{ _workingDir } = $dir;
2474             }
2475              
2476             sub SetWord2VecExeDir
2477             {
2478 2     2 1 5 my ( $self, $dir ) = @_;
2479 2         4 return $self->{ _word2VecExeDir } = $dir;
2480             }
2481              
2482             sub SetVocabularyHash
2483             {
2484 1     1 1 2 my ( $self, $ref ) = @_;
2485 1 50       3 return if !defined( $ref );
2486 1         2 return $self->{ _hashRefOfWordVectors } = $ref;
2487             }
2488              
2489             sub ClearVocabularyHash
2490             {
2491 4     4 1 9 my ( $self ) = @_;
2492              
2493 4         13 $self->SetNumberOfWords( 0 );
2494 4         13 $self->SetVectorLength( 0 );
2495              
2496 4         7 undef( %{ $self->{ _hashRefOfWordVectors } } );
  4         10  
2497              
2498 4         8 my %hash;
2499 4         10 return $self->{ _hashRefOfWordVectors } = \%hash;
2500             }
2501              
2502             sub AddWordVectorToVocabHash
2503             {
2504 0     0 1 0 my ( $self, $wordVectorStr ) = @_;
2505 0 0       0 return if !defined( $wordVectorStr );
2506 0         0 my @tempAry = split( ' ', $wordVectorStr, 2 );
2507              
2508             # Check(s)
2509 0 0       0 return if !defined( $self->{ _hashRefOfWordVectors } );
2510 0 0       0 return if ( @tempAry != 2 );
2511              
2512 0         0 $self->{ _hashRefOfWordVectors }->{ $tempAry[0] } = $tempAry[1];
2513             }
2514              
2515             sub SetOverwriteOldFile
2516             {
2517 1     1 1 3 my ( $self, $temp ) = @_;
2518 1         2 return $self->{ _overwriteOldFile } = $temp;
2519             }
2520              
2521             sub SetSparseVectorMode
2522             {
2523 2     2 0 5 my ( $self, $temp ) = @_;
2524 2         5 return $self->{ _sparseVectorMode } = $temp;
2525             }
2526              
2527             sub SetVectorLength
2528             {
2529 6     6 0 23 my ( $self, $temp ) = @_;
2530 6         11 return $self->{ _vectorLength } = $temp;
2531             }
2532              
2533             sub SetNumberOfWords
2534             {
2535 6     6 0 11 my ( $self, $temp ) = @_;
2536 6         12 return $self->{ _numberOfWords } = $temp;
2537             }
2538              
2539             sub SetMinimizeMemoryUsage
2540             {
2541 2     2 0 3 my ( $self, $temp ) = @_;
2542 2 100       8 $self->WriteLog( "SetMinimalMemoryUsage - Normal Memory Mode Enabled" ) if ( $temp == 0 );
2543 2 100       6 $self->WriteLog( "SetMinimalMemoryUsage - Low Memory Mode Enabled" ) if ( $temp == 1 );
2544 2         3 return $self->{ _minimizeMemoryUsage } = $temp;
2545             }
2546              
2547              
2548             ######################################################################################
2549             # Debug Functions
2550             ######################################################################################
2551              
2552             sub GetTime
2553             {
2554 1     1 1 3 my ( $self ) = @_;
2555 1         32 my( $sec, $min, $hour ) = localtime();
2556              
2557 1 50       5 if( $hour < 10 )
2558             {
2559 1         3 $hour = "0$hour";
2560             }
2561              
2562 1 50       3 if( $min < 10 )
2563             {
2564 1         2 $min = "0$min";
2565             }
2566              
2567 1 50       3 if( $sec < 10 )
2568             {
2569 0         0 $sec = "0$sec";
2570             }
2571              
2572 1         4 return "$hour:$min:$sec";
2573             }
2574              
2575             sub GetDate
2576             {
2577 1     1 1 4 my ( $self ) = @_;
2578 1         12 my ( $sec, $min, $hour, $mday, $mon, $year ) = localtime();
2579              
2580 1         2 $mon += 1;
2581 1         25 $year += 1900;
2582              
2583 1         6 return "$mon/$mday/$year";
2584             }
2585              
2586             sub WriteLog
2587             {
2588 51     51 1 80 my ( $self ) = shift;
2589 51         69 my $string = shift;
2590 51         68 my $printNewLine = shift;
2591              
2592 51 50       83 return if !defined ( $string );
2593 51 50       84 $printNewLine = 1 if !defined ( $printNewLine );
2594              
2595              
2596 51 50       81 if( $self->GetDebugLog() )
2597             {
2598 0 0       0 if( ref ( $self ) ne "Word2vec::Word2vec" )
2599             {
2600 0         0 print( GetDate() . " " . GetTime() . " - Word2vec: Cannot Call WriteLog() From Outside Module!\n" );
2601 0         0 return;
2602             }
2603              
2604 0 0       0 $string = "" if !defined ( $string );
2605 0         0 print GetDate() . " " . GetTime() . " - Word2vec::$string";
2606 0 0       0 print "\n" if( $printNewLine != 0 );
2607             }
2608              
2609 51 50       96 if( $self->GetWriteLog() )
2610             {
2611 0 0         if( ref ( $self ) ne "Word2vec::Word2vec" )
2612             {
2613 0           print( GetDate() . " " . GetTime() . " - Word2vec: Cannot Call WriteLog() From Outside Module!\n" );
2614 0           return;
2615             }
2616              
2617 0           my $fileHandle = $self->GetFileHandle();
2618              
2619 0 0         if( defined( $fileHandle ) )
2620             {
2621 0           print( $fileHandle GetDate() . " " . GetTime() . " - Word2vec::$string" );
2622 0 0         print( $fileHandle "\n" ) if( $printNewLine != 0 );
2623             }
2624             }
2625             }
2626              
2627             #################### All Modules Are To Output "1"(True) at EOF ######################
2628             1;
2629              
2630              
2631             =head1 NAME
2632              
2633             Word2vec::Word2vec - word2vec wrapper module.
2634              
2635             =head1 SYNOPSIS
2636              
2637             # Parameters: Enabled Debug Logging, Disabled Write Logging
2638             my $w2v = Word2vec::Word2vec->new( 1, 0 ); # Note: Specifiying no parameters implies default settings.
2639              
2640             $w2v->SetTrainFilePath( "textCorpus.txt" );
2641             $w2v->SetOutputFilePath( "vectors.bin" );
2642             $w2v->SetWordVecSize( 200 );
2643             $w2v->SetWindowSize( 8 );
2644             $w2v->SetSample( 0.0001 );
2645             $w2v->SetNegative( 25 );
2646             $w2v->SetHSoftMax( 0 );
2647             $w2v->SetBinaryOutput( 0 );
2648             $w2v->SetNumOfThreads( 20 );
2649             $w2v->SetNumOfIterations( 12 );
2650             $w2v->SetUseCBOW( 1 );
2651             $w2v->SetOverwriteOldFile( 0 );
2652              
2653             $w2v->ExecuteTraining();
2654              
2655             undef( $w2v );
2656              
2657             # or
2658              
2659             use Word2vec::Word2vec;
2660              
2661             my $w2v = Word2vec::Word2vec->new(); # Note: Specifying no parameters implies default settings.
2662              
2663             $w2v->ExecuteTraining( $trainFilePath, $outputFilePath, $vectorSize, $windowSize, $minCount, $sample, $negative,
2664             $alpha, $hs, $binary, $numOfThreads, $iterations, $useCBOW, $classes, $readVocab,
2665             $saveVocab, $debug, $overwrite );
2666              
2667             undef( $w2v );
2668              
2669             =head1 DESCRIPTION
2670              
2671             Word2vec::Word2vec is a word2vec package tool that trains text corpus data using the word2vec tool, provides multiple avenues for cosine
2672             similarity computation, manipulation of word vectors and conversion of word2vec's binary format to human readable text.
2673              
2674             =head2 Main Functions
2675              
2676             =head3 new
2677              
2678             Description:
2679              
2680             Returns a new "Word2vec::Word2vec" module object.
2681              
2682             Note: Specifying no parameters implies default options.
2683              
2684             Default Parameters:
2685             debugLog = 0
2686             writeLog = 0
2687             trainFileName = ""
2688             outputFileName = ""
2689             wordVecSize = 100
2690             sample = 5
2691             hSoftMax = 0
2692             negative = 5
2693             numOfThreads = 12
2694             numOfIterations = 5
2695             minCount = 5
2696             alpha = 0.05 (CBOW) or 0.025 (Skip-Gram)
2697             classes = 0
2698             debug = 2
2699             binaryOutput = 1
2700             saveVocab = ""
2701             readVocab = ""
2702             useCBOW = 1
2703             workingDir = Current Directory
2704             hashRefOfWordVectors = ()
2705             overwriteOldFile = 0
2706              
2707             Input:
2708              
2709             $debugLog -> Instructs module to print debug statements to the console. (1 = True / 0 = False)
2710             $writeLog -> Instructs module to print debug statements to a log file. (1 = True / 0 = False)
2711             $trainFileName -> Specifies the training text corpus file path. (String)
2712             $outputFileName -> Specifies the word2vec post training output file path. (String)
2713             $wordVecSize -> Specifies word2vec word vector parameter size.(Integer)
2714             $sample -> Specifies word2vec sample parameter value. (Integer)
2715             $hSoftMax -> Specifies word2vec HSoftMax parameter value. (Integer)
2716             $negative -> Specifies word2vec negative parameter value. (Integer)
2717             $numOfThreads -> Specifies word2vec number of threads parameter value. (Integer)
2718             $numOfIterations -> Specifies word2vec number of iterations parameter value. (Integer)
2719             $minCount -> Specifies word2vec min-count parameter value. (Integer)
2720             $alpha -> Specifies word2vec alpha parameter value. (Integer)
2721             $classes -> Specifies word2vec classes parameter value. (Integer)
2722             $debug -> Specifies word2vec debug training parameter value. (Integer: '0' = No Debug, '1' = Debug, '2' = Even more debug info)
2723             $binaryOutput -> Specifies word2vec binary output mode parameter value. (Integer: '1' = Binary, '0' = Plain Text)
2724             $saveVocab -> Specifies word2vec save vocabulary file path. (String)
2725             $readVocab -> Specifies word2vec read vocabulary file path. (String)
2726             $useCBOW -> Specifies word2vec CBOW algorithm parameter value. (Integer: '1' = CBOW, '0' = Skip-Gram)
2727             $workingDir -> Specifies module working directory. (String)
2728             $hashRefOfWordVectors -> Storage location for loaded word2vec trained vector data file in memory. (Hash)
2729             $overwriteOldFile -> Instructs the module to either overwrite any existing data with the same output file name and path. ( '1' or '0' )
2730              
2731             Note: It is not recommended to specify all new() parameters, as it has not been thoroughly tested.
2732              
2733             Output:
2734              
2735             Word2vec::Word2vec object.
2736              
2737             Example:
2738              
2739             use Word2vec::Word2vec;
2740              
2741             my $w2v = Word2vec::Word2vec->new();
2742              
2743             undef( $w2v );
2744              
2745             =head3 DESTROY
2746              
2747             Description:
2748              
2749             Removes member variables and file handle from memory.
2750              
2751             Input:
2752              
2753             None
2754              
2755             Output:
2756              
2757             None
2758              
2759             Example:
2760              
2761             use Word2vec::Word2vec;
2762              
2763             my $w2v = Word2vec::Word2vec->new();
2764             $w2v->DESTROY();
2765              
2766             undef( $w2v );
2767              
2768             =head3 ExecuteTraining
2769              
2770             Executes word2vec training based on parameters. Parameter variables have higher precedence
2771             than member variables. Any parameter specified will override its respective member variable.
2772              
2773             Note: If no parameters are specified, this module executes word2vec training based on preset
2774             member variables. Returns string regarding training status.
2775              
2776             Input:
2777              
2778             $trainFilePath -> Specifies word2vec text corpus training file in a given path. (String)
2779             $outputFilePath -> Specifies word2vec trained output data file name and save path. (String)
2780             $vectorSize -> Size of word2vec word vectors. (Integer)
2781             $windowSize -> Maximum skip length between words. (Integer)
2782             $minCount -> Disregard words that appear less than $minCount times. (Integer)
2783             $sample -> Threshold for occurrence of words. Those that appear with higher frequency in the training data will be randomly down-sampled. (Float)
2784             $negative -> Number of negative examples. (Integer)
2785             $alpha -> Set that start learning rate. (Float)
2786             $hs -> Hierarchical Soft-max (Integer)
2787             $binary -> Save trained data as binary mode. (Integer)
2788             $numOfThreads -> Number of word2vec training threads. (Integer)
2789             $iterations -> Number of training iterations to run prior to completion of training. (Integer)
2790             $useCBOW -> Enable Continuous Bag Of Words model or Skip-Gram model. (Integer)
2791             $classes -> Output word classes rather than word vectors. (Integer)
2792             $readVocab -> Read vocabulary from file path without constructing from training data. (String)
2793             $saveVocab -> Save vocabulary to file path. (String)
2794             $debug -> Set word2vec debug mode. (Integer)
2795             $overwrite -> Instructs the module to either overwrite any existing text corpus files or append to the existing file. ( '1' = True / '0' = False )
2796              
2797             Note: It is not recommended to specify all new() parameters, as it has not been thoroughly tested.
2798              
2799             Output:
2800              
2801             $value -> '0' = Successful / '-1' = Un-successful
2802              
2803             Example:
2804              
2805             use Word2vec::Word2vec;
2806              
2807             my $w2v = Word2vec::Word2vec->new();
2808             $w2v->SetTrainFilePath( "textcorpus.txt" );
2809             $w2v->SetOutputFilePath( "vectors.bin" );
2810             $w2v->SetWordVecSize( 200 );
2811             $w2v->SetWindowSize( 8 );
2812             $w2v->SetSample( 0.0001 );
2813             $w2v->SetNegative( 25 );
2814             $w2v->SetHSoftMax( 0 );
2815             $w2v->SetBinaryOutput( 0 );
2816             $w2v->SetNumOfThreads( 20 );
2817             $w2v->SetNumOfIterations( 15 );
2818             $w2v->SetUseCBOW( 1 );
2819             $w2v->SetOverwriteOldFile( 0 );
2820             $w2v->ExecuteTraining();
2821              
2822             undef( $w2v );
2823              
2824             # or
2825              
2826             use Word2vec::Word2vec;
2827              
2828             my $w2v = Word2vec::Word2vec->new();
2829             $w2v->ExecuteTraining( "textcorpus.txt", "vectors.bin", 200, 8, 5, 0.001, 25, 0.05, 0, 0, 20, 15, 1, 0, "", "", 2, 0 );
2830              
2831             undef( $w2v );
2832              
2833             =head3 ExecuteStringTraining
2834              
2835             Executes word2vec training based on parameters. Parameter variables have higher precedence
2836             than member variables. Any parameter specified will override its respective member variable.
2837              
2838             Note: If no parameters are specified, this module executes word2vec training based on preset
2839             member variables. Returns string regarding training status.
2840              
2841             Input:
2842              
2843             $trainingStr -> String to train with word2vec.
2844             $outputFilePath -> Specifies word2vec trained output data file name and save path. (String)
2845             $vectorSize -> Size of word2vec word vectors. (Integer)
2846             $windowSize -> Maximum skip length between words. (Integer)
2847             $minCount -> Disregard words that appear less than $minCount times. (Integer)
2848             $sample -> Threshold for occurrence of words. Those that appear with higher frequency in the training data will be randomly down-sampled. (Float)
2849             $negative -> Number of negative examples. (Integer)
2850             $alpha -> Set that start learning rate. (Float)
2851             $hs -> Hierarchical Soft-max (Integer)
2852             $binary -> Save trained data as binary mode. (Integer)
2853             $numOfThreads -> Number of word2vec training threads. (Integer)
2854             $iterations -> Number of training iterations to run prior to completion of training. (Integer)
2855             $useCBOW -> Enable Continuous Bag Of Words model or Skip-Gram model. (Integer)
2856             $classes -> Output word classes rather than word vectors. (Integer)
2857             $readVocab -> Read vocabulary from file path without constructing from training data. (String)
2858             $saveVocab -> Save vocabulary to file path. (String)
2859             $debug -> Set word2vec debug mode. (Integer)
2860             $overwrite -> Instructs the module to either overwrite any existing text corpus files or append to the existing file. ( '1' = True / '0' = False )
2861              
2862             Note: It is not recommended to specify all new() parameters, as it has not been thoroughly tested.
2863              
2864             Output:
2865              
2866             $value -> '0' = Successful / '-1' = Un-successful
2867              
2868             Example:
2869              
2870             use Word2vec::Word2vec;
2871              
2872             my $w2v = Word2vec::Word2vec->new();
2873             $w2v->SetOutputFilePath( "vectors.bin" );
2874             $w2v->SetWordVecSize( 200 );
2875             $w2v->SetWindowSize( 8 );
2876             $w2v->SetSample( 0.0001 );
2877             $w2v->SetNegative( 25 );
2878             $w2v->SetHSoftMax( 0 );
2879             $w2v->SetBinaryOutput( 0 );
2880             $w2v->SetNumOfThreads( 20 );
2881             $w2v->SetNumOfIterations( 15 );
2882             $w2v->SetUseCBOW( 1 );
2883             $w2v->SetOverwriteOldFile( 0 );
2884             $w2v->ExecuteStringTraining( "string to train here" );
2885              
2886             undef( $w2v );
2887              
2888             # or
2889              
2890             use Word2vec::Word2vec;
2891              
2892             my $w2v = Word2vec::Word2vec->new();
2893             $w2v->ExecuteStringTraining( "string to train here", "vectors.bin", 200, 8, 5, 0.001, 25, 0.05, 0, 0, 20, 15, 1, 0, "", "", 2, 0 );
2894              
2895             undef( $w2v );
2896              
2897             =head3 ComputeCosineSimilarity
2898              
2899             Description:
2900              
2901             Computes cosine similarity between two words using trained word2vec vector data. Returns
2902             float value or undefined if one or more words are not in the dictionary.
2903              
2904             Note: Supports single words only and requires vector data to be in memory with ReadTrainedVectorDataFromFile() prior to function execution.
2905              
2906             Input:
2907              
2908             $string -> Single string word
2909             $string -> Single string word
2910              
2911             Output:
2912              
2913             $value -> Float or Undefined
2914              
2915             Example:
2916              
2917             use Word2vec::Word2vec;
2918              
2919             my $w2v = Word2vec::Word2vec->new();
2920             $w2v->ReadTrainedVectorDataFromFile( "samples/samplevectors.bin" );
2921             print "Cosine similarity between words: \"of\" and \"the\": " . $w2v->ComputeCosineSimilarity( "of", "the" ) . "\n";
2922              
2923             undef( $w2v );
2924              
2925             =head3 ComputeAvgOfWordsCosineSimilarity
2926              
2927             Description:
2928              
2929             Computes cosine similarity between two words or compound words using trained word2vec vector data.
2930             Returns float value or undefined.
2931              
2932             Note: Supports multiple words concatenated by ' ' and requires vector data to be in memory prior
2933             to method execution. This method will not error out when a word is not located within the dictionary.
2934             It will take the average of all found words for each parameter then cosine similarity of both word vectors.
2935              
2936             Input:
2937              
2938             $string -> string of single or multiple words separated by ' ' (space).
2939             $string -> string of single or multiple words separated by ' ' (space).
2940              
2941             Output:
2942              
2943             $value -> Float or Undefined
2944              
2945             Example:
2946              
2947             use Word2vec::Word2vec;
2948              
2949             my $w2v = Word2vec::Word2vec->new();
2950             $w2v->ReadTrainedVectorDataFromFile( "samples/samplevectors.bin" );
2951             print "Cosine similarity between words: \"heart attack\" and \"acute myocardial infarction\": " .
2952             $w2v->ComputeAvgOfWordsCosineSimilarity( "heart attack", "acute myocardial infarction" ) . "\n";
2953              
2954             undef( $w2v );
2955              
2956             =head3 ComputeMultiWordCosineSimilarity
2957              
2958             Description:
2959              
2960             Computes cosine similarity between two words or compound words using trained word2vec vector data.
2961              
2962             Note: Supports multiple words concatenated by ' ' (space) and requires vector data to be in memory prior to method execution.
2963             If $allWordsMustExist is set to true, this function will error out when a specified word is not found and return undefined.
2964              
2965             Input:
2966              
2967             $string -> string of single or multiple words separated by ' ' (space).
2968             $string -> string of single or multiple words separated by ' ' (space).
2969             $allWordsMustExist -> 1 = True, 0 or undef = False
2970              
2971             Output:
2972              
2973             $value -> Float or Undefined
2974              
2975             Example:
2976              
2977             use Word2vec::Word2vec;
2978              
2979             my $w2v = Word2vec::Word2vec->new();
2980             $w2v->ReadTrainedVectorDataFromFile( "samples/samplevectors.bin" );
2981             print "Cosine similarity between words: \"heart attack\" and \"acute myocardial infarction\": " .
2982             $w2v->ComputeMultiWordCosineSimilarity( "heart attack", "acute myocardial infarction" ) . "\n";
2983              
2984             undef( $w2v );
2985              
2986             =head3 ComputeCosineSimilarityOfWordVectors
2987              
2988             Description:
2989              
2990             Computes cosine similarity between two word vectors.
2991             Returns float value or undefined if one or more words are not in the dictionary.
2992              
2993             Note: Function parameters require actual word vector data with words removed.
2994              
2995             Input:
2996              
2997             $string -> string of word vector representation data separated by ' ' (space).
2998             $string -> string of word vector representation data separated by ' ' (space).
2999              
3000             Output:
3001              
3002             $value -> Float or Undefined
3003              
3004             Example:
3005              
3006             use Word2vec::Word2vec;
3007              
3008             my $word2vec = Word2vec::Word2vec->new();
3009             $word2vec->ReadTrainedVectorDataFromFile( "samples/samplevectors.bin" );
3010             my $vectorAData = $word2vec->GetWordVector( "heart" );
3011             my $vectorBData = $word2vec->GetWordVector( "attack" );
3012              
3013             # Remove Words From Data
3014             $vectorAData = RemoveWordFromWordVectorString( $vectorAData );
3015             $vectorBData = RemoveWordFromWordVectorString( $vectorBData );
3016              
3017             print "Cosine similarity between words: \"heart\" and \"attack\": " .
3018             $word2vec->ComputeCosineSimilarityOfWordVectors( $vectorAData, $vectorBData ) . "\n";
3019              
3020             undef( $word2vec );
3021              
3022             =head3 CosSimWithUserInput
3023              
3024             Description:
3025              
3026             Computes cosine similarity between two words using trained word2vec vector data based on user input.
3027              
3028             Note: No compound word support.
3029              
3030             Warning: Requires vector data to be in memory prior to method execution.
3031              
3032             Input:
3033              
3034             None
3035              
3036             Output:
3037              
3038             None
3039              
3040             Example:
3041              
3042             use Word2vec::Word2vec;
3043              
3044             my $w2v = Word2vec::Word2vec->new();
3045             $w2v->ReadTrainedVectorDataFromFile( "samples/samplevectors.bin" );
3046             $w2v->CosSimWIthUserInputTest();
3047              
3048             undef( $w2v );
3049              
3050             =head3 MultiWordCosSimWithUserInput
3051              
3052             Description:
3053              
3054             Computes cosine similarity between two words or compound words using trained word2vec vector data based on user input.
3055              
3056             Note: Supports multiple words concatenated by ':'.
3057              
3058             Warning: Requires vector data to be in memory prior to method execution.
3059              
3060             Input:
3061              
3062             None
3063              
3064             Output:
3065              
3066             None
3067              
3068             Example:
3069              
3070             use Word2vec::Word2vec;
3071              
3072             my $w2v = Word2vec::Word2vec->new();
3073             $w2v->ReadTrainedVectorDataFromFile( "samples/samplevectors.bin" );
3074             $w2v->MultiWordCosSimWithUserInput();
3075              
3076             undef( $w2v );
3077              
3078              
3079             =head3 ComputeAverageOfWords
3080              
3081             Description:
3082              
3083             Computes cosine similarity average of all found words given an array reference parameter of
3084             plain text words. Returns average values (string) or undefined.
3085              
3086             Warning: Requires vector data to be in memory prior to method execution.
3087              
3088             Input:
3089              
3090             $arrayReference -> Array reference of words
3091              
3092             Output:
3093              
3094             $string -> String of word2vec word average values
3095              
3096             Example:
3097              
3098             use Word2vec::Word2vec;
3099              
3100             my $w2v = Word2vec::Word2vec->new();
3101             $w2v->ReadTrainedVectorDataFromFile( "sample/samplevectors.bin" );
3102             my $data = $w2v->ComputeAverageOfWords( "of", "the", "and" );
3103             print( "Computed Average Of Words: $data" ) if defined( $data );
3104              
3105             undef( $w2v );
3106              
3107             =head3 AddTwoWords
3108              
3109             Description:
3110              
3111             Adds two word vectors and returns the result.
3112              
3113             Warning: This method also requires vector data to be in memory prior to method execution.
3114              
3115             Input:
3116              
3117             $string -> Word to add
3118             $string -> Word to add
3119              
3120             Output:
3121              
3122             $string -> String of word2vec summed word values
3123              
3124             Example:
3125              
3126             use Word2vec::Word2vec;
3127              
3128             my $w2v = Word2vec::Word2vec->new();
3129             $w2v->ReadTrainedVectorDataFromFile( "sample/samplevectors.bin" );
3130              
3131             my $data = $w2v->AddTwoWords( "heart", "attack" );
3132             print( "Computed Sum Of Words: $data" ) if defined( $data );
3133              
3134             undef( $w2v );
3135              
3136             =head3 SubtractTwoWords
3137              
3138             Description:
3139              
3140             Subtracts two word vectors and returns the result.
3141              
3142             Warning: This method also requires vector data to be in memory prior to method execution.
3143              
3144             Input:
3145              
3146             $string -> Word to subtract
3147             $string -> Word to subtract
3148              
3149             Output:
3150              
3151             $string -> String of word2vec difference between word values
3152              
3153             Example:
3154              
3155             use Word2vec::Word2vec;
3156              
3157             my $w2v = Word2vec::Word2vec->new();
3158             $w2v->ReadTrainedVectorDataFromFile( "sample/samplevectors.bin" );
3159              
3160             my $data = $w2v->SubtractTwoWords( "king", "man" );
3161             print( "Computed Difference Of Words: $data" ) if defined( $data );
3162              
3163             undef( $w2v );
3164              
3165             =head3 AddTwoWordVectors
3166              
3167             Description:
3168              
3169             Adds two vector data strings and returns the result.
3170              
3171             Warning: Text word must be removed from vector data prior to calling this method. This method
3172             also requires vector data to be in memory prior to method execution.
3173              
3174             Input:
3175              
3176             $string -> Word2vec word vector data (with string word removed)
3177             $string -> Word2vec word vector data (with string word removed)
3178              
3179             Output:
3180              
3181             $string -> String of word2vec summed word values
3182              
3183             Example:
3184              
3185             use Word2vec::Word2vec;
3186              
3187             my $w2v = Word2vec::Word2vec->new();
3188             $w2v->ReadTrainedVectorDataFromFile( "sample/samplevectors.bin" );
3189             my $wordAData = $w2v->GetWordVector( "of" );
3190             my $wordBData = $w2v->GetWordVector( "the" );
3191              
3192             # Removing Words From Vector Data Array
3193             $wordAData = RemoveWordFromWordVectorString( $wordAData );
3194             $wordBData = RemoveWordFromWordVectorString( $wordBData );
3195              
3196             my $data = $w2v->AddTwoWordVectors( $wordAData, $wordBData );
3197             print( "Computed Sum Of Words: $data" ) if defined( $data );
3198              
3199             undef( $w2v );
3200              
3201             =head3 SubtractTwoWordVectors
3202              
3203             Description:
3204              
3205             Subtracts two vector data strings and returns the result.
3206              
3207             Warning: Text word must be removed from vector data prior to calling this method. This method
3208             also requires vector data to be in memory prior to method execution.
3209              
3210             Input:
3211              
3212             $string -> Word2vec word vector data (with string word removed)
3213             $string -> Word2vec word vector data (with string word removed)
3214              
3215             Output:
3216              
3217             $string -> String of word2vec difference between word values
3218              
3219             Example:
3220              
3221             use Word2vec::Word2vec;
3222              
3223             my $w2v = Word2vec::Word2vec->new();
3224             $w2v->ReadTrainedVectorDataFromFile( "sample/samplevectors.bin" );
3225             my $wordAData = $w2v->GetWordVector( "of" );
3226             my $wordBData = $w2v->GetWordVector( "the" );
3227              
3228             # Removing Words From Vector Data Array
3229             $wordAData = RemoveWordFromWordVectorString( $wordAData );
3230             $wordBData = RemoveWordFromWordVectorString( $wordBData );
3231              
3232             my $data = $w2v->SubtractTwoWordVectors( $wordAData, $wordBData );
3233             print( "Computed Difference Of Words: $data" ) if defined( $data );
3234              
3235             undef( $w2v );
3236              
3237             =head3 AverageOfTwoWordVectors
3238              
3239             Description:
3240              
3241             Computes the average of two vectors and returns the result.
3242              
3243             Warning: Text word must be removed from vector data prior to calling this method. This method
3244             also requires vector data to be in memory prior to method execution.
3245              
3246             Input:
3247              
3248             $string -> Word2vec word vector data (with string word removed)
3249             $string -> Word2vec word vector data (with string word removed)
3250              
3251             Output:
3252              
3253             $string -> String of word2vec average between word values
3254              
3255             Example:
3256              
3257             use Word2vec::Word2vec;
3258              
3259             my $w2v = Word2vec::Word2vec->new();
3260             $w2v->ReadTrainedVectorDataFromFile( "sample/samplevectors.bin" );
3261             my $wordAData = $w2v->GetWordVector( "of" );
3262             my $wordBData = $w2v->GetWordVector( "the" );
3263              
3264             # Removing Words From Vector Data Array
3265             $wordAData = RemoveWordFromWordVectorString( $wordAData );
3266             $wordBData = RemoveWordFromWordVectorString( $wordBData );
3267              
3268             my $data = $w2v->AverageOfTwoWordVectors( $wordAData, $wordBData );
3269             print( "Computed Difference Of Words: $data" ) if defined( $data );
3270              
3271             undef( $w2v );
3272              
3273             =head3 GetWordVector
3274              
3275             Description:
3276              
3277             Searches dictionary in memory for the specified string argument and returns the vector data.
3278             Returns undefined if not found.
3279              
3280             Warning: Requires vector data to be in memory prior to method execution.
3281              
3282             Input:
3283              
3284             $string -> Word to locate in word2vec vocabulary/dictionary
3285              
3286             Output:
3287              
3288             $string -> Found word2vec word + word vector data or undefined.
3289              
3290             Example:
3291              
3292             use Word2vec::Word2vec;
3293              
3294             my $w2v = Word2vec::Word2vec->new();
3295             $w2v->ReadTrainedVectorDataFromFile( "sample/samplevectors.bin" );
3296             my $wordData = $w2v->GetWordVector( "of" );
3297             print( "Word2vec Word Data: $wordData\n" ) if defined( $wordData );
3298              
3299             undef( $w2v );
3300              
3301             =head3 IsVectorDataInMemory
3302              
3303             Description:
3304              
3305             Checks to see if vector data has been loaded in memory.
3306              
3307             Input:
3308              
3309             None
3310              
3311             Output:
3312              
3313             $value -> '1' = True / '0' = False
3314              
3315             Example:
3316              
3317             use Word2vec::Word2vec;
3318              
3319             my $w2v = Word2vec::Word2vec->new();
3320             my $result = $w2v->IsVectorDataInMemory();
3321              
3322             print( "No vector data in memory\n" ) if $result == 0;
3323             print( "Yes vector data in memory\n" ) if $result == 1;
3324              
3325             $w2v->ReadTrainedVectorDataFromFile( "samples/samplevectors.bin" );
3326              
3327             print( "No vector data in memory\n" ) if $result == 0;
3328             print( "Yes vector data in memory\n" ) if $result == 1;
3329              
3330             undef( $w2v );
3331              
3332             =head3 IsWordOrCUIVectorData
3333              
3334             Description:
3335              
3336             Checks to see if vector data consists of word or CUI terms.
3337              
3338             Input:
3339              
3340             None
3341              
3342             Output:
3343              
3344             $string -> 'cui', 'word' or undef
3345              
3346             Example:
3347              
3348             use Word2vec::Word2vec;
3349              
3350             my $w2v = Word2vec::Word2vec->new();
3351             $w2v->ReadTrainedVectorDataFromFile( "samples/samplevectors.bin" );
3352             my $isWordOrCUIData = $w2v->IsWordOrCUIVectorData();
3353              
3354             print( "Vector Data Consists Of \"$isWordOrCUIData\" Terms\n" ) if defined( $isWordOrCUIData );
3355             print( "Cannot Determine Type Of Terms\n" ) if !defined( $isWordOrCUIData );
3356              
3357             undef( $w2v );
3358              
3359             =head3 IsVectorDataSorted
3360              
3361             Description:
3362              
3363             Checks to see if vector data header is signed as sorted in memory.
3364              
3365             Input:
3366              
3367             None
3368              
3369             Output:
3370              
3371             $value -> '1' = True / '0' = False
3372              
3373             Example:
3374              
3375             use Word2vec::Word2vec;
3376              
3377             my $w2v = Word2vec::Word2vec->new();
3378             $w2v->ReadTrainedVectorDataFromFile( "samples/samplevectors.bin" );
3379              
3380             my $result = $w2v->IsVectorDataSorted();
3381              
3382             print( "No vector data is not sorted\n" ) if $result == 0;
3383             print( "Yes vector data is sorted\n" ) if $result == 1;
3384              
3385             undef( $w2v );
3386              
3387             =head3 CheckWord2VecDataFileType
3388              
3389             Description:
3390              
3391             Checks specified file to see if vector data is in binary or plain text format. Returns 'text'
3392             for plain text and 'binary' for binary data.
3393              
3394             Input:
3395              
3396             $string -> File path
3397              
3398             Output:
3399              
3400             $string -> File Type ( "text" = Plain text file / "binary" = Binary data file )
3401              
3402             Example:
3403              
3404             use Word2vec::Word2vec;
3405              
3406             my $w2v = Word2vec::Word2vec->new();
3407             my $fileType = $w2v->CheckWord2VecDataFileType( "samples/samplevectors.bin" );
3408              
3409             print( "FileType: $fileType\n" ) if defined( $fileType );
3410              
3411             undef( $fileType );
3412              
3413             =head3 ReadTrainedVectorDataFromFile
3414              
3415             Description:
3416              
3417             Reads trained vector data from file path in memory or searches for vector data from file. This function supports and
3418             automatically detects word2vec binary, plain text and sparse vector data formats.
3419              
3420             Note: If search word is undefined, the entire vector file is loaded in memory. If a search word is defined only the vector data is returned or undef.
3421              
3422             Input:
3423              
3424             $string -> Word2vec trained vector data file path
3425             $searchWord -> Searches trained vector data file for specific word vector
3426              
3427             Output:
3428              
3429             $value -> '0' = Successful / '-1' = Un-successful
3430              
3431             Example:
3432              
3433             # Loading data in memory
3434             use Word2vec::Word2vec;
3435              
3436             my $w2v = Word2vec::Word2vec->new();
3437             my $result = $w2v->ReadTrainedVectorDataFromFile( "samples/samplevectors.bin" );
3438              
3439             print( "Success Loading Data\n" ) if $result == 0;
3440             print( "Un-successful, Data Not Loaded\n" ) if $result == -1;
3441              
3442             undef( $w2v );
3443              
3444             # or
3445              
3446             # Searching vector data file for a specific word vector
3447             use Word2vec::Word2vec;
3448              
3449             my $w2v = Word2vec::Word2vec->new();
3450             my $result = $w2v->ReadTrainedVectorDataFromFile( "samples/samplevectors.bin", "medical" );
3451              
3452             print( "Found Vector Data In File\n" ) if $result != -1;
3453             print( "Vector Data Not Found\n" ) if $result == -1;
3454              
3455             undef( $w2v );
3456              
3457             =head3 SaveTrainedVectorDataToFile
3458              
3459             Description:
3460              
3461             Saves trained vector data at the location specified. Defining 'binaryFormat' parameter will
3462             save in word2vec's binary format.
3463              
3464             Input:
3465              
3466             $string -> Save Path
3467             $binaryFormat -> Integer ( '1' = Save data in word2vec binary format / '0' = Save as plain text )
3468              
3469             Note: Leaving $binaryFormat as undefined will save the file in plain text format.
3470              
3471             Warning: If the vector data is stored as a binary search tree, this method will error out gracefully.
3472              
3473             Output:
3474              
3475             $value -> '0' = Successful / '-1' = Un-successful
3476              
3477             Example:
3478              
3479             use Word2vec::Word2vec;
3480              
3481             my $w2v = Word2vec::Word2vec->new();
3482              
3483             # Instruct the module to store the method as an array, not a BST.
3484             $w2v->ReadTrainedVectorDataFromFile( "samples/samplevectors.bin" );
3485             $w2v->SaveTrainedVectorDataToFile( "samples/newvectors.bin" );
3486              
3487             undef( $w2v );
3488              
3489             =head3 StringsAreEqual
3490              
3491             Description:
3492              
3493             Compares two strings to check for equality, ignoring case-sensitivity.
3494              
3495             Note: This method is not case-sensitive. ie. "string" equals "StRiNg"
3496              
3497             Input:
3498              
3499             $string -> String to compare
3500             $string -> String to compare
3501              
3502             Output:
3503              
3504             $value -> '1' = Strings are equal / '0' = Strings are not equal
3505              
3506             Example:
3507              
3508             use Word2vec::Word2vec;
3509              
3510             my $w2v = Word2vec::Word2vec->new();
3511             my $result = $w2v->StringsAreEqual( "hello world", "HeLlO wOrLd" );
3512              
3513             print( "Strings are equal!\n" )if $result == 1;
3514             print( "Strings are not equal!\n" ) if $result == 0;
3515              
3516             undef( $w2v );
3517              
3518             =head3 RemoveWordFromWordVectorString
3519              
3520             Description:
3521              
3522             Given a vector data string as input, it removed the vector word from its data returning only data.
3523              
3524             Input:
3525              
3526             $string -> Vector word & data string.
3527              
3528             Output:
3529              
3530             $string -> Vector data string.
3531              
3532             Example:
3533              
3534             use Word2vec::Word2vec;
3535              
3536             my $w2v = Word2vec::Word2vec->new();
3537             my $str = "cookie 1 0.234 9 0.0002 13 0.234 17 -0.0023 19 1.0000";
3538              
3539             my $vectorData = $w2v->RemoveWordFromWordVectorString( $str );
3540              
3541             print( "Success!\n" ) if length( vectorData ) < length( $str );
3542              
3543             undef( $w2v );
3544              
3545             =head3 ConvertRawSparseTextToVectorDataAry
3546              
3547             Description:
3548              
3549             Converts sparse vector string to a dense vector format data array.
3550              
3551             Input:
3552              
3553             $string -> Vector data string.
3554              
3555             Output:
3556              
3557             $arrayReference -> Reference to array of vector data.
3558              
3559             Example:
3560              
3561             use Word2vec::Word2vec;
3562              
3563             my $w2v = Word2vec::Word2vec->new();
3564             my $str = "cookie 1 0.234 9 0.0002 13 0.234 17 -0.0023 19 1.0000";
3565              
3566             my @vectorData = @{ $w2v->ConvertRawSparseTextToVectorDataAry( $str ) };
3567              
3568             print( "Data conversion successful!\n" ) if @vectorData > 0;
3569             print( "Data conversion un-successful!\n" ) if @vectorData == 0;
3570              
3571             undef( $w2v );
3572              
3573             =head3 ConvertRawSparseTextToVectorDataHash
3574              
3575             Description:
3576              
3577             Converts sparse vector string to a dense vector format data hash.
3578              
3579             Input:
3580              
3581             $string -> Vector data string.
3582              
3583             Output:
3584              
3585             $hashReference -> Reference to array of hash data.
3586              
3587             Example:
3588              
3589             use Word2vec::Word2vec;
3590              
3591             my $w2v = Word2vec::Word2vec->new();
3592             my $str = "cookie 1 0.234 9 0.0002 13 0.234 17 -0.0023 19 1.0000";
3593              
3594             my %vectorData = %{ $w2v->ConvertRawSparseTextToVectorDataHash( $str ) };
3595              
3596             print( "Data conversion successful!\n" ) if ( keys %vectorData ) > 0;
3597             print( "Data conversion un-successful!\n" ) if ( keys %vectorData ) == 0;
3598              
3599             undef( $w2v );
3600              
3601             =head3 GetOSType
3602              
3603             Description:
3604              
3605             Returns (string) operating system type.
3606              
3607             Input:
3608              
3609             None
3610              
3611             Output:
3612              
3613             $string -> Operating System String
3614              
3615             Example:
3616              
3617             use Word2vec::Word2vec;
3618              
3619             my $w2v = Word2vec::Word2vec->new();
3620             my $os = $w2v->GetOSType();
3621              
3622             print( "Operating System: $os\n" );
3623              
3624             undef( $w2v );
3625              
3626             =head2 Accessor Functions
3627              
3628             =head3 GetDebugLog
3629              
3630             Description:
3631              
3632             Returns the _debugLog member variable set during Word2vec::Word2vec object initialization of new function.
3633              
3634             Input:
3635              
3636             None
3637              
3638             Output:
3639              
3640             $value -> '0' = False, '1' = True
3641              
3642             Example:
3643              
3644             use Word2vec::Word2vec;
3645              
3646             my $w2v = Word2vec::Word2vec->new()
3647             my $debugLog = $w2v->GetDebugLog();
3648              
3649             print( "Debug Logging Enabled\n" ) if $debugLog == 1;
3650             print( "Debug Logging Disabled\n" ) if $debugLog == 0;
3651              
3652              
3653             undef( $w2v );
3654              
3655             =head3 GetWriteLog
3656              
3657             Description:
3658              
3659             Returns the _writeLog member variable set during Word2vec::Word2vec object initialization of new function.
3660              
3661             Input:
3662              
3663             None
3664              
3665             Output:
3666              
3667             $value -> '0' = False, '1' = True
3668              
3669             Example:
3670              
3671             use Word2vec::Word2vec;
3672              
3673             my $w2v = Word2vec::Word2vec->new();
3674             my $writeLog = $w2v->GetWriteLog();
3675              
3676             print( "Write Logging Enabled\n" ) if $writeLog == 1;
3677             print( "Write Logging Disabled\n" ) if $writeLog == 0;
3678              
3679             undef( $w2v );
3680              
3681             =head3 GetFileHandle
3682              
3683             Description:
3684              
3685             Returns the _fileHandle member variable set during Word2vec::Word2vec object instantiation of new function.
3686              
3687             Warning: This is a private function. File handle is used by WriteLog() method. Do not manipulate this file handle as errors can result.
3688              
3689             Input:
3690              
3691             None
3692              
3693             Output:
3694              
3695             $fileHandle -> Returns file handle for WriteLog() method or undefined.
3696              
3697             Example:
3698              
3699             use Word2vec::Word2vec;
3700              
3701             my $w2v = Word2vec::Word2vec->new();
3702             my $fileHandle = $w2v->GetFileHandle();
3703              
3704             undef( $w2v );
3705              
3706             =head3 GetTrainFilePath
3707              
3708             Description:
3709              
3710             Returns the _trainFilePath member variable set during Word2vec::Word2vec object instantiation of new function.
3711              
3712             Input:
3713              
3714             None
3715              
3716             Output:
3717              
3718             $string -> Returns word2vec training text corpus file path.
3719              
3720             Example:
3721              
3722             use Word2vec::Word2vec;
3723              
3724             my $w2v = Word2vec::Word2vec->new();
3725             my $filePath = $w2v->GetTrainFilePath();
3726             print( "Training File Path: $filePath\n" );
3727              
3728             undef( $w2v );
3729              
3730             =head3 GetOutputFilePath
3731              
3732             Description:
3733              
3734             Returns the _outputFilePath member variable set during Word2vec::Word2vec object instantiation of new function.
3735              
3736             Input:
3737              
3738             None
3739              
3740             Output:
3741              
3742             $string -> Returns post word2vec training output file path.
3743              
3744             Example:
3745              
3746             use Word2vec::Word2vec;
3747              
3748             my $w2v = Word2vec::Word2vec->new();
3749             my $filePath = $w2v->GetOutputFilePath();
3750             print( "File Path: $filePath\n" );
3751              
3752             undef( $w2v );
3753              
3754             =head3 GetWordVecSize
3755              
3756             Description:
3757              
3758             Returns the _wordVecSize member variable set during Word2vec::Word2vec object instantiation of new function.
3759              
3760             Input:
3761              
3762             None
3763              
3764             Output:
3765              
3766             $value -> Returns (integer) size of word2vec word vectors. Default value = 100
3767              
3768             Example:
3769              
3770             use Word2vec::Word2vec;
3771              
3772             my $w2v = Word2vec::Word2vec->new();
3773             my $value = $w2v->GetWordVecSize();
3774             print( "Word Vector Size: $value\n" );
3775              
3776             undef( $w2v );
3777              
3778             =head3 GetWindowSize
3779              
3780             Description:
3781              
3782             Returns the _windowSize member variable set during Word2vec::Word2vec object instantiation of new function.
3783              
3784             Input:
3785              
3786             None
3787              
3788             Output:
3789              
3790             $value -> Returns (integer) word2vec window size. Default value = 5
3791              
3792             Example:
3793              
3794             use Word2vec::Word2vec;
3795              
3796             my $w2v = Word2vec::Word2vec->new();
3797             my $value = $w2v->GetWindowSize();
3798             print( "Window Size: $value\n" );
3799              
3800             undef( $w2v );
3801              
3802             =head3 GetSample
3803              
3804             Description:
3805              
3806             Returns the _sample member variable set during Word2vec::Word2vec object instantiation of new function.
3807              
3808             Input:
3809              
3810             None
3811              
3812             Output:
3813              
3814             $value -> Returns (integer) word2vec sample size. Default value = 0.001
3815              
3816             Example:
3817              
3818             use Word2vec::Word2vec;
3819              
3820             my $w2v = Word2vec::Word2vec->new();
3821             my $value = $w2v->GetSample();
3822             print( "Sample: $value\n" );
3823              
3824             undef( $w2v );
3825              
3826             =head3 GetHSoftMax
3827              
3828             Description:
3829              
3830             Returns the _hSoftMax member variable set during Word2vec::Word2vec object instantiation of new function.
3831              
3832             Input:
3833              
3834             None
3835              
3836             Output:
3837              
3838             $value -> Returns (integer) word2vec HSoftMax value. Default = 0
3839              
3840             Example:
3841              
3842             use Word2vec::Word2vec;
3843              
3844             my $w2v = Word2vec::Word2vec->new();
3845             my $value = $w2v->GetHSoftMax();
3846             print( "HSoftMax: $value\n" );
3847              
3848             undef( $w2v );
3849              
3850             =head3 GetNegative
3851              
3852             Description:
3853              
3854             Returns the _negative member variable set during Word2vec::Word2vec object instantiation of new function.
3855              
3856             Input:
3857              
3858             None
3859              
3860             Output:
3861              
3862             $value -> Returns (integer) word2vec negative value. Default = 5
3863              
3864             Example:
3865              
3866             use Word2vec::Word2vec;
3867              
3868             my $w2v = Word2vec::Word2vec->new();
3869             my $value = $w2v->GetNegative();
3870             print( "Negative: $value\n" );
3871              
3872             undef( $w2v );
3873              
3874             =head3 GetNumOfThreads
3875              
3876             Description:
3877              
3878             Returns the _numOfThreads member variable set during Word2vec::Word2vec object instantiation of new function.
3879              
3880             Input:
3881              
3882             None
3883              
3884             Output:
3885              
3886             $value -> Returns (integer) word2vec number of threads to use during training. Default = 12
3887              
3888             Example:
3889              
3890             use Word2vec::Word2vec;
3891              
3892             my $w2v = Word2vec::Word2vec->new();
3893             my $value = $w2v->GetNumOfThreads();
3894             print( "Number of threads: $value\n" );
3895              
3896             undef( $w2v );
3897              
3898             =head3 GetNumOfIterations
3899              
3900             Description:
3901              
3902             Returns the _iterations member variable set during Word2vec::Word2vec object instantiation of new function.
3903              
3904             Input:
3905              
3906             None
3907              
3908             Output:
3909              
3910             $value -> Returns (integer) word2vec number of word2vec iterations. Default = 5
3911              
3912             Example:
3913              
3914             use Word2vec::Word2vec;
3915              
3916             my $w2v = Word2vec::Word2vec->new();
3917             my $value = $w2v->GetNumOfIterations();
3918             print( "Number of iterations: $value\n" );
3919              
3920             undef( $w2v );
3921              
3922             =head3 GetMinCount
3923              
3924             Description:
3925              
3926             Returns the _minCount member variable set during Word2vec::Word2vec object instantiation of new function.
3927              
3928             Input:
3929              
3930             None
3931              
3932             Output:
3933              
3934             $value -> Returns (integer) word2vec min-count value. Default = 5
3935              
3936             Example:
3937              
3938             use Word2vec::Word2vec;
3939              
3940             my $w2v = Word2vec::Word2vec->new();
3941             my $value = $w2v->GetMinCount();
3942             print( "Min Count: $value\n" );
3943              
3944             undef( $w2v );
3945              
3946             =head3 GetAlpha
3947              
3948             Description:
3949              
3950             Returns the _alpha member variable set during Word2vec::Word2vec object instantiation of new function.
3951              
3952             Input:
3953              
3954             None
3955              
3956             Output:
3957              
3958             $value -> Returns (integer) word2vec alpha value. Default = 0.05 for CBOW and 0.025 for Skip-Gram.
3959              
3960             Example:
3961              
3962             use Word2vec::Word2vec;
3963              
3964             my $w2v = Word2vec::Word2vec->new();
3965             my $value = $w2v->GetAlpha();
3966             print( "Alpha: $value\n" );
3967              
3968             undef( $w2v );
3969              
3970             =head3 GetClasses
3971              
3972             Description:
3973              
3974             Returns the _classes member variable set during Word2vec::Word2vec object instantiation of new function.
3975              
3976             Input:
3977              
3978             None
3979              
3980             Output:
3981              
3982             $value -> Returns (integer) word2vec classes value. Default = 0
3983              
3984             Example:
3985              
3986             use Word2vec::Word2vec;
3987              
3988             my $w2v = Word2vec::Word2vec->new();
3989             my $value = $w2v->GetClasses();
3990             print( "Classes: $value\n" );
3991              
3992             undef( $w2v );
3993              
3994             =head3 GetDebugTraining
3995              
3996             Description:
3997              
3998             Returns the _debug member variable set during Word2vec::Word2vec object instantiation of new function.
3999              
4000             Note: 0 = No debug output, 1 = Enable debug output, 2 = Even more debug output
4001              
4002             Input:
4003              
4004             None
4005              
4006             Output:
4007              
4008             $value -> Returns (integer) word2vec debug value. Default = 2
4009              
4010             Example:
4011              
4012             use Word2vec::Word2vec;
4013              
4014             my $w2v = Word2vec::Word2vec->new();
4015             my $value = $w2v->GetDebugTraining();
4016             print( "Debug: $value\n" );
4017              
4018             undef( $w2v );
4019              
4020             =head3 GetBinaryOutput
4021              
4022             Description:
4023              
4024             Returns the _binaryOutput member variable set during Word2vec::Word2vec object instantiation of new function.
4025              
4026             Note: 1 = Save trained vector data in binary format, 2 = Save trained vector data in plain text format.
4027              
4028             Input:
4029              
4030             None
4031              
4032             Output:
4033              
4034             $value -> Returns (integer) word2vec binary flag. Default = 0
4035              
4036             Example:
4037              
4038             use Word2vec::Word2vec;
4039              
4040             my $w2v = Word2vec::Word2vec->new();
4041             my $value = $w2v->GetBinaryOutput();
4042             print( "Binary Output: $value\n" );
4043              
4044             undef( $w2v );
4045              
4046             =head3 GetReadVocabFilePath
4047              
4048             Description:
4049              
4050             Returns the _readVocab member variable set during Word2vec::Word2vec object instantiation of new function.
4051              
4052             Input:
4053              
4054             None
4055              
4056             Output:
4057              
4058             $string -> Returns (string) word2vec read vocabulary file name or empty string if not set.
4059              
4060             Example:
4061              
4062             use Word2vec::Word2vec;
4063              
4064             my $w2v = Word2vec::Word2vec->new();
4065             my $str = $w2v->GetReadVocabFilePath();
4066             print( "Read Vocab File Path: $str\n" );
4067              
4068             undef( $w2v );
4069              
4070             =head3 GetSaveVocabFilePath
4071              
4072             Description:
4073              
4074             Returns the _saveVocab member variable set during Word2vec::Word2vec object instantiation of new function.
4075              
4076             Input:
4077              
4078             None
4079              
4080             Output:
4081              
4082             $string -> Returns (string) word2vec save vocabulary file name or empty string if not set.
4083              
4084             Example:
4085              
4086             use Word2vec::Word2vec;
4087              
4088             my $w2v = Word2vec::Word2vec->new();
4089             my $str = $w2v->GetSaveVocabFilePath();
4090             print( "Save Vocab File Path: $str\n" );
4091              
4092             undef( $w2v );
4093              
4094             =head3 GetUseCBOW
4095              
4096             Description:
4097              
4098             Returns the _useCBOW member variable set during Word2vec::Word2vec object instantiation of new function.
4099              
4100             Note: 0 = Skip-Gram Model, 1 = Continuous Bag Of Words Model.
4101              
4102             Input:
4103              
4104             None
4105              
4106             Output:
4107              
4108             $value -> Returns (integer) word2vec Continuous-Bag-Of-Words flag. Default = 1
4109              
4110             Example:
4111              
4112             use Word2vec::Word2vec;
4113              
4114             my $w2v = Word2vec::Word2vec->new();
4115             my $value = $w2v->GetUseCBOW();
4116             print( "Use CBOW?: $value\n" );
4117              
4118             undef( $w2v );
4119              
4120             =head3 GetWorkingDir
4121              
4122             Description:
4123              
4124             Returns the _workingDir member variable set during Word2vec::Word2vec object instantiation of new function.
4125              
4126             Input:
4127              
4128             None
4129              
4130             Output:
4131              
4132             $value -> Returns (string) working directory path or current directory if not specified.
4133              
4134             Example:
4135              
4136             use Word2vec::Word2vec;
4137              
4138             my $w2v = Word2vec::Word2vec->new();
4139             my $str = $w2v->GetWorkingDir();
4140             print( "Working Directory: $str\n" );
4141              
4142             undef( $w2v );
4143              
4144             =head3 GetWord2VecExeDir
4145              
4146             Description:
4147              
4148             Returns the _word2VecExeDir member variable set during Word2vec::Word2vec object instantiation of new function.
4149              
4150             Input:
4151              
4152             None
4153              
4154             Output:
4155              
4156             $value -> Returns (string) word2vec executable directory path or empty string if not specified.
4157              
4158             Example:
4159              
4160             use Word2vec::Word2vec;
4161              
4162             my $w2v = Word2vec::Word2vec->new();
4163             my $str = $w2v->GetWord2VecExeDir();
4164             print( "Word2Vec Executable File Directory: $str\n" );
4165              
4166             undef( $w2v );
4167              
4168             =head3 GetVocabularyHash
4169              
4170             Description:
4171              
4172             Returns the _hashRefOfWordVectors member variable set during Word2vec::Word2vec object instantiation of new function.
4173              
4174             Input:
4175              
4176             None
4177              
4178             Output:
4179              
4180             $value -> Returns array of vocabulary/dictionary words. (Word2vec trained data in memory)
4181              
4182             Example:
4183              
4184             use Word2vec::Word2vec;
4185              
4186             my $w2v = Word2vec::Word2vec->new();
4187             my @vocabulary = $w2v->GetVocabularyHash();
4188              
4189             undef( $w2v );
4190              
4191             =head3 GetOverwriteOldFile
4192              
4193             Description:
4194              
4195             Returns the _overwriteOldFile member variable set during Word2vec::Word2vec object instantiation of new function.
4196              
4197             Input:
4198              
4199             None
4200              
4201             Output:
4202              
4203             $value -> Returns 1 = True or 0 = False.
4204              
4205             Example:
4206              
4207             use Word2vec::Word2vec;
4208              
4209             my $w2v = Word2vec::Word2vec->new();
4210             my $value = $w2v->GetOverwriteOldFile();
4211             print( "Overwrite Exiting File?: $value\n" );
4212              
4213             undef( $w2v );
4214              
4215             =head2 Mutator Functions
4216              
4217             =head3 SetTrainFilePath
4218              
4219             Description:
4220              
4221             Sets member variable to string parameter. Sets training file path.
4222              
4223             Input:
4224              
4225             $string -> Text corpus training file path
4226              
4227             Output:
4228              
4229             None
4230              
4231             Example:
4232              
4233             use Word2vec::Word2vec;
4234              
4235             my $w2v = Word2vec::Word2vec->new();
4236             $w2v->SetTrainFilePath( "samples/textcorpus.txt" );
4237              
4238             undef( $w2v );
4239              
4240             =head3 SetOutputFilePath
4241              
4242             Description:
4243              
4244             Sets member variable to string parameter. Sets output file path.
4245              
4246             Input:
4247              
4248             $string -> Post word2vec training save file path
4249              
4250             Output:
4251              
4252             None
4253              
4254             Example:
4255              
4256             use Word2vec::Word2vec;
4257              
4258             my $w2v = Word2vec::Word2vec->new();
4259             $w2v->SetOutputFilePath( "samples/tempvectors.bin" );
4260              
4261             undef( $w2v );
4262              
4263             =head3 SetWordVecSize
4264              
4265             Description:
4266              
4267             Sets member variable to integer parameter. Sets word2vec word vector size.
4268              
4269             Input:
4270              
4271             $value -> Word2vec word vector size
4272              
4273             Output:
4274              
4275             None
4276              
4277             Example:
4278              
4279             use Word2vec::Word2vec;
4280              
4281             my $w2v = Word2vec::Word2vec->new();
4282             $w2v->SetWordVecSize( 100 );
4283              
4284             undef( $w2v );
4285              
4286             =head3 SetWindowSize
4287              
4288             Description:
4289              
4290             Sets member variable to integer parameter. Sets word2vec window size.
4291              
4292             Input:
4293              
4294             $value -> Word2vec window size
4295              
4296             Output:
4297              
4298             None
4299              
4300             Example:
4301              
4302             use Word2vec::Word2vec;
4303              
4304             my $w2v = Word2vec::Word2vec->new();
4305             $w2v->SetWindowSize( 8 );
4306              
4307             undef( $w2v );
4308              
4309             =head3 SetSample
4310              
4311             Description:
4312              
4313             Sets member variable to integer parameter. Sets word2vec sample size.
4314              
4315             Input:
4316              
4317             $value -> Word2vec sample size
4318              
4319             Output:
4320              
4321             None
4322              
4323             Example:
4324              
4325             use Word2vec::Word2vec;
4326              
4327             my $w2v = Word2vec::Word2vec->new();
4328             $w2v->SetSample( 3 );
4329              
4330             undef( $w2v );
4331              
4332             =head3 SetHSoftMax
4333              
4334             Description:
4335              
4336             Sets member variable to integer parameter. Sets word2vec HSoftMax value.
4337              
4338             Input:
4339              
4340             $value -> Word2vec HSoftMax size
4341              
4342             Output:
4343              
4344             None
4345              
4346             Example:
4347              
4348             use Word2vec::Word2vec;
4349              
4350             my $w2v = Word2vec::Word2vec->new();
4351             $w2v->SetHSoftMax( 12 );
4352              
4353             undef( $w2v );
4354              
4355             =head3 SetNegative
4356              
4357             Description:
4358              
4359             Sets member variable to integer parameter. Sets word2vec negative value.
4360              
4361             Input:
4362              
4363             $value -> Word2vec negative value
4364              
4365             Output:
4366              
4367             None
4368              
4369             Example:
4370              
4371             use Word2vec::Word2vec;
4372              
4373             my $w2v = Word2vec::Word2vec->new();
4374             $w2v->SetNegative( 12 );
4375              
4376             undef( $w2v );
4377              
4378             =head3 SetNumOfThreads
4379              
4380             Description:
4381              
4382             Sets member variable to integer parameter. Sets word2vec number of training threads to specified value.
4383              
4384             Input:
4385              
4386             $value -> Word2vec number of threads value
4387              
4388             Output:
4389              
4390             None
4391              
4392             Example:
4393              
4394             use Word2vec::Word2vec;
4395              
4396             my $w2v = Word2vec::Word2vec->new();
4397             $w2v->SetNumOfThreads( 12 );
4398              
4399             undef( $w2v );
4400              
4401             =head3 SetNumOfIterations
4402              
4403             Description:
4404              
4405             Sets member variable to integer parameter. Sets word2vec iterations value.
4406              
4407             Input:
4408              
4409             $value -> Word2vec number of iterations value
4410              
4411             Output:
4412              
4413             None
4414              
4415             Example:
4416              
4417             use Word2vec::Word2vec;
4418              
4419             my $w2v = Word2vec::Word2vec->new();
4420             $w2v->SetNumOfIterations( 12 );
4421              
4422             undef( $w2v );
4423              
4424             =head3 SetMinCount
4425              
4426             Description:
4427              
4428             Sets member variable to integer parameter. Sets word2vec min-count value.
4429              
4430             Input:
4431              
4432             $value -> Word2vec min-count value
4433              
4434             Output:
4435              
4436             None
4437              
4438             Example:
4439              
4440             use Word2vec::Word2vec;
4441              
4442             my $w2v = Word2vec::Word2vec->new();
4443             $w2v->SetMinCount( 7 );
4444              
4445             undef( $w2v );
4446              
4447             =head3 SetAlpha
4448              
4449             Description:
4450              
4451             Sets member variable to float parameter. Sets word2vec alpha value.
4452              
4453             Input:
4454              
4455             $value -> Word2vec alpha value. (Float)
4456              
4457             Output:
4458              
4459             None
4460              
4461             Example:
4462              
4463             use Word2vec::Word2vec;
4464              
4465             my $w2v = Word2vec::Word2vec->new();
4466             $w2v->SetAlpha( 0.0012 );
4467              
4468             undef( $w2v );
4469              
4470             =head3 SetClasses
4471              
4472             Description:
4473              
4474             Sets member variable to integer parameter. Sets word2vec classes value.
4475              
4476             Input:
4477              
4478             $value -> Word2vec classes value.
4479              
4480             Output:
4481              
4482             None
4483              
4484             Example:
4485              
4486             use Word2vec::Word2vec;
4487              
4488             my $w2v = Word2vec::Word2vec->new();
4489             $w2v->SetClasses( 0 );
4490              
4491             undef( $w2v );
4492              
4493             =head3 SetDebugTraining
4494              
4495             Description:
4496              
4497             Sets member variable to integer parameter. Sets word2vec debug parameter value.
4498              
4499             Input:
4500              
4501             $value -> Word2vec debug training value.
4502              
4503             Output:
4504              
4505             None
4506              
4507             Example:
4508              
4509             use Word2vec::Word2vec;
4510              
4511             my $w2v = Word2vec::Word2vec->new();
4512             $w2v->SetDebugTraining( 0 );
4513              
4514             undef( $w2v );
4515              
4516             =head3 SetBinaryOutput
4517              
4518             Description:
4519              
4520             Sets member variable to integer parameter. Sets word2vec binary parameter value.
4521              
4522             Input:
4523              
4524             $value -> Word2vec binary output mode value. ( '1' = Binary Output / '0' = Plain Text )
4525              
4526             Output:
4527              
4528             None
4529              
4530             Example:
4531              
4532             use Word2vec::Word2vec;
4533              
4534             my $w2v = Word2vec::Word2vec->new();
4535             $w2v->SetBinaryOutput( 1 );
4536              
4537             undef( $w2v );
4538              
4539             =head3 SetSaveVocabFilePath
4540              
4541             Description:
4542              
4543             Sets member variable to string parameter. Sets word2vec save vocabulary file name.
4544              
4545             Input:
4546              
4547             $string -> Word2vec save vocabulary file name and path.
4548              
4549             Output:
4550              
4551             None
4552              
4553             Example:
4554              
4555             use Word2vec::Word2vec;
4556              
4557             my $w2v = Word2vec::Word2vec->new();
4558             $w2v->SetSaveVocabFilePath( "samples/vocab.txt" );
4559              
4560             undef( $w2v );
4561              
4562             =head3 SetReadVocabFilePath
4563              
4564             Description:
4565              
4566             Sets member variable to string parameter. Sets word2vec read vocabulary file name.
4567              
4568             Input:
4569              
4570             $string -> Word2vec read vocabulary file name and path.
4571              
4572             Output:
4573              
4574             None
4575              
4576             Example:
4577              
4578             use Word2vec::Word2vec;
4579              
4580             my $w2v = Word2vec::Word2vec->new();
4581             $w2v->SetReadVocabFilePath( "samples/vocab.txt" );
4582              
4583             undef( $w2v );
4584              
4585             =head3 SetUseCBOW
4586              
4587             Description:
4588              
4589             Sets member variable to integer parameter. Sets word2vec CBOW parameter value.
4590              
4591             Input:
4592              
4593             $value -> Word2vec CBOW mode value.
4594              
4595             Output:
4596              
4597             None
4598              
4599             Example:
4600              
4601             use Word2vec::Word2vec;
4602              
4603             my $w2v = Word2vec::Word2vec->new();
4604             $w2v->SetUseCBOW( 1 );
4605              
4606             undef( $w2v );
4607              
4608             =head3 SetWorkingDir
4609              
4610             Description:
4611              
4612             Sets member variable to string parameter. Sets working directory.
4613              
4614             Input:
4615              
4616             $string -> Working directory
4617              
4618             Output:
4619              
4620             None
4621              
4622             Example:
4623              
4624             use Word2vec::Word2vec;
4625              
4626             my $w2v = Word2vec::Word2vec->new();
4627             $w2v->SetWorkingDir( "/samples" );
4628              
4629             undef( $w2v );
4630              
4631             =head3 SetWord2VecExeDir
4632              
4633             Description:
4634              
4635             Sets member variable to string parameter. Sets word2vec executable file directory.
4636              
4637             Input:
4638              
4639             $string -> Word2vec directory
4640              
4641             Output:
4642              
4643             None
4644              
4645             Example:
4646              
4647             use Word2vec::Word2vec;
4648              
4649             my $w2v = Word2vec::Word2vec->new();
4650             $w2v->SetWord2VecExeDir( "/word2vec" );
4651              
4652             undef( $w2v );
4653              
4654             =head3 SetVocabularyHash
4655              
4656             Description:
4657              
4658             Sets vocabulary/dictionary array to de-referenced array reference parameter.
4659              
4660             Warning: This will overwrite any existing vocabulary/dictionary array data.
4661              
4662             Input:
4663              
4664             $arrayReference -> Vocabulary/Dictionary array reference of word2vec word vectors.
4665              
4666             Output:
4667              
4668             None
4669              
4670             Example:
4671              
4672             use Word2vec::Word2vec;
4673              
4674             my $w2v = Word2vec::Word2vec->new();
4675             $w2v->ReadTrainedVectorDataFromFile( "samples/samplevectors.bin" );
4676             my @vocab = $w2v->GetVocabularyHash();
4677             $w2v->SetVocabularyHash( \@vocab );
4678              
4679             undef( $w2v );
4680              
4681             =head3 ClearVocabularyHash
4682              
4683             Description:
4684              
4685             Clears vocabulary/dictionary array.
4686              
4687             Input:
4688              
4689             None
4690              
4691             Output:
4692              
4693             None
4694              
4695             Example:
4696              
4697             use Word2vec::Word2vec;
4698              
4699             my $w2v = Word2vec::Word2vec->new();
4700             $w2v->ClearVocabularyHash();
4701              
4702             undef( $w2v );
4703              
4704             =head3 AddWordVectorToVocabHash
4705              
4706             Description:
4707              
4708             Adds word vector string to vocabulary/dictionary.
4709              
4710             Input:
4711              
4712             $string -> Word2vec word vector string
4713              
4714             Output:
4715              
4716             None
4717              
4718             Example:
4719              
4720             use Word2vec::Word2vec;
4721              
4722             my $w2v = Word2vec::Word2vec->new();
4723              
4724             # Note: This is representational data of word2vec's word vector format and not actual data.
4725             $w2v->AddWordVectorToVocabHash( "of 0.4346 -0.1235 0.5789 0.2347 -0.0056 -0.0001" );
4726              
4727             undef( $w2v );
4728              
4729             =head3 SetOverwriteOldFile
4730              
4731             Description:
4732              
4733             Sets member variable to integer parameter. Enables overwriting output file if one already exists.
4734              
4735             Input:
4736              
4737             $value -> '1' = Overwrite exiting file / '0' = Graceful termination when file with same name exists
4738              
4739             Output:
4740              
4741             None
4742              
4743             Example:
4744              
4745             use Word2vec::Word2vec;
4746              
4747             my $w2v = Word2vec::Word2vec->new();
4748             $w2v->SetOverwriteOldFile( 1 );
4749              
4750             undef( $w2v );
4751              
4752             =head2 Debug Functions
4753              
4754             =head3 GetTime
4755              
4756             Description:
4757              
4758             Returns current time string in "Hour:Minute:Second" format.
4759              
4760             Input:
4761              
4762             None
4763              
4764             Output:
4765              
4766             $string -> XX:XX:XX ("Hour:Minute:Second")
4767              
4768             Example:
4769              
4770             use Word2vec::Word2vec:
4771              
4772             my $w2v = Word2vec::Word2vec->new();
4773             my $time = $w2v->GetTime();
4774              
4775             print( "Current Time: $time\n" ) if defined( $time );
4776              
4777             undef( $w2v );
4778              
4779             =head3 GetDate
4780              
4781             Description:
4782              
4783             Returns current month, day and year string in "Month/Day/Year" format.
4784              
4785             Input:
4786              
4787             None
4788              
4789             Output:
4790              
4791             $string -> XX/XX/XXXX ("Month/Day/Year")
4792              
4793             Example:
4794              
4795             use Word2vec::Word2vec:
4796              
4797             my $w2v = Word2vec::Word2vec->new();
4798             my $date = $w2v->GetDate();
4799              
4800             print( "Current Date: $date\n" ) if defined( $date );
4801              
4802             undef( $w2v );
4803              
4804             =head3 WriteLog
4805              
4806             Description:
4807              
4808             Prints passed string parameter to the console, log file or both depending on user options.
4809              
4810             Note: printNewLine parameter prints a new line character following the string if the parameter
4811             is undefined and does not if parameter is 0.
4812              
4813             Input:
4814              
4815             $string -> String to print to the console/log file.
4816             $value -> 0 = Do not print newline character after string, all else prints new line character including 'undef'.
4817              
4818             Output:
4819              
4820             None
4821              
4822             Example:
4823              
4824             use Word2vec::Word2vec:
4825              
4826             my $w2v = Word2vec::Word2vec->new();
4827             $w2v->WriteLog( "Hello World" );
4828              
4829             undef( $w2v );
4830              
4831             =head1 Author
4832              
4833             Clint Cuffy, Virginia Commonwealth University
4834              
4835             =head1 COPYRIGHT
4836              
4837             Copyright (c) 2016
4838              
4839             Bridget T McInnes, Virginia Commonwealth University
4840             btmcinnes at vcu dot edu
4841              
4842             Clint Cuffy, Virginia Commonwealth University
4843             cuffyca at vcu dot edu
4844              
4845             This program is free software; you can redistribute it and/or modify it
4846             under the terms of the GNU General Public License as published by the Free
4847             Software Foundation; either version 2 of the License, or (at your option)
4848             any later version.
4849              
4850             This program is distributed in the hope that it will be useful, but WITHOUT
4851             ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
4852             FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
4853              
4854             You should have received a copy of the GNU General Public License along with
4855             this program; if not, write to:
4856              
4857             The Free Software Foundation, Inc.,
4858             59 Temple Place - Suite 330,
4859             Boston, MA 02111-1307, USA.
4860              
4861             =cut