File Coverage

blib/lib/Word2vec/Word2vec.pm
Criterion Covered Total %
statement 396 1104 35.8
branch 180 846 21.2
condition 22 267 8.2
subroutine 87 92 94.5
pod 75 84 89.2
total 760 2393 31.7


line stmt bran cond sub pod time code
1             #!usr/bin/perl
2              
3             ######################################################################################
4             # #
5             # Author: Clint Cuffy #
6             # Date: 06/16/2016 #
7             # Revised: 04/01/2017 #
8             # UMLS Similarity Word2Vec Executable Interface Module #
9             # #
10             ######################################################################################
11             # #
12             # Description: #
13             # ============ #
14             # Perl "word2vec" executable interface for UMLS Similarity #
15             # Features: #
16             # ========= #
17             # Supports Word2Vec Training Using Standard Options #
18             # Conversion of Word2Vec Binary Format To Plain Text And Vice Versa #
19             # Cosine Similarity Between Two Words #
20             # Summed Cosine Similarity #
21             # Average Cosine Similarity #
22             # Multi-Word Cosine Similarity #
23             # Manipulation of Word Vectors (Addition/Subtraction/Average) #
24             # #
25             ######################################################################################
26              
27              
28             package Word2vec::Word2vec;
29              
30 4     4   10453 use strict;
  4         6  
  4         102  
31 4     4   12 use warnings;
  4         5  
  4         94  
32              
33             # Standard Package(s)
34 4     4   12 use Cwd;
  4         3  
  4         298  
35 4     4   1686 use Encode qw( decode encode );
  4         29184  
  4         292  
36              
37              
38 4     4   23 use vars qw($VERSION);
  4         4  
  4         186  
39              
40             $VERSION = '0.02';
41              
42              
43             ######################################################################################
44             # Constructor
45             ######################################################################################
46              
47             BEGIN
48       4     {
49             # CONSTRUCTOR : DO SOMETHING HERE
50             }
51              
52              
53             ######################################################################################
54             # Deconstructor
55             ######################################################################################
56              
57             END
58       4     {
59             # DECONSTRUCTOR : DO SOMETHING HERE
60             }
61              
62              
63             ######################################################################################
64             # new Class Operator
65             ######################################################################################
66              
67             sub new
68             {
69 1     1 1 10 my $class = shift;
70 1         22 my $self = {
71             # Private Member Variables
72             _debugLog => shift, # Boolean (Binary): 0 = False, 1 = True
73             _writeLog => shift, # Boolean (Binary): 0 = False, 1 = True
74             _trainFileName => shift, # String
75             _outputFileName => shift, # String
76             _wordVecSize => shift, # Int
77             _windowSize => shift, # Int
78             _sample => shift, # Float
79             _hSoftMax => shift, # Int
80             _negative => shift, # Int
81             _numOfThreads => shift, # Int
82             _numOfIterations => shift, # Int
83             _minCount => shift, # Int
84             _alpha => shift, # Float
85             _classes => shift, # Int
86             _debug => shift, # Int
87             _binaryOutput => shift, # Boolean (Binary): 0 = False, 1 = True
88             _saveVocab => shift, # String (File Name To Save To)
89             _readVocab => shift, # String (File Name To Read From)
90             _useCBOW => shift, # Boolean (Binary): 0 = Use Skip-Gram Model, 1 = Use CBOW (Default)
91             _workingDir => shift, # String
92             _word2VecExeDir => shift, # String
93             _hashRefOfWordVectors => shift, # Hash Reference of Word2Vec Vectors
94             _overwriteOldFile => shift, # Boolean (Binary): 0 = False, 1 = True
95             _sparseVectorMode => shift, # Boolean (Binary): 0 = False, 1 = True
96             _vectorLength => shift, # Int
97             _numberOfWords => shift, # Int
98             _minimizeMemoryUsage => shift, # Boolean (Binary): 0 = False, 1 = True
99             };
100              
101             # Set debug log variable to false if not defined
102 1 50       5 $self->{ _debugLog } = 0 if !defined ( $self->{ _debugLog } );
103 1 50       3 $self->{ _writeLog } = 0 if !defined ( $self->{ _writeLog } );
104 1 50       4 $self->{ _trainFileName } = "" if !defined ( $self->{ _trainFileName } );
105 1 50       4 $self->{ _outputFileName } = "" if !defined ( $self->{ _outputFileName } );
106 1 50       4 $self->{ _wordVecSize } = 100 if !defined ( $self->{ _wordVecSize } );
107 1 50       3 $self->{ _windowSize } = 5 if !defined ( $self->{ _windowSize } );
108 1 50       4 $self->{ _sample } = 0.001 if !defined ( $self->{ _sample } );
109 1 50       4 $self->{ _hSoftMax } = 0 if !defined ( $self->{ _hSoftMax } );
110 1 50       3 $self->{ _negative } = 5 if !defined ( $self->{ _negative } );
111 1 50       2 $self->{ _numOfThreads } = 12 if !defined ( $self->{ _numOfThreads } );
112 1 50       3 $self->{ _numOfIterations } = 5 if !defined ( $self->{ _numOfIterations } );
113 1 50       3 $self->{ _minCount } = 5 if !defined ( $self->{ _minCount } );
114 1 50       2 $self->{ _classes } = 0 if !defined ( $self->{ _classes } );
115 1 50       3 $self->{ _debug } = 2 if !defined ( $self->{ _debug } );
116 1 50       2 $self->{ _binaryOutput } = 1 if !defined ( $self->{ _binaryOutput } );
117 1 50       6 $self->{ _saveVocab } = "" if !defined ( $self->{ _saveVocab } );
118 1 50       3 $self->{ _readVocab } = "" if !defined ( $self->{ _readVocab } );
119 1 50       3 $self->{ _useCBOW } = 1 if !defined ( $self->{ _useCBOW } );
120              
121 1 50 33     8 $self->{ _alpha } = 0.05 if ( !defined ( $self->{ _alpha } ) && $self->{ _useCBOW } == 1 );
122 1 50 33     3 $self->{ _alpha } = 0.025 if ( !defined ( $self->{ _alpha } ) && $self->{ _useCBOW } == 0 );
123              
124 1 50       8 $self->{ _workingDir } = Cwd::getcwd() if !defined ( $self->{ _workingDir } );
125              
126 1         3 my %hash = ();
127 1 50       2 $self->{ _hashRefOfWordVectors } = \%hash if !defined ( $self->{ _hashRefOfWordVectors } );
128 1 50       3 $self->{ _overwriteOldFile } = 0 if !defined $self->{ _overwriteOldFile };
129 1 50       2 $self->{ _sparseVectorMode } = 0 if !defined $self->{ _sparseVectorMode };
130 1 50       3 $self->{ _vectorLength } = 0 if !defined $self->{ _vectorLength };
131 1 50       3 $self->{ _numberOfWords } = 0 if !defined $self->{ _numberOfWords };
132 1 50       8 $self->{ _minimizeMemoryUsage } = 1 if !defined $self->{ _minimizeMemoryUsage };
133              
134              
135             # Try To Locate Word2Vec Executable Files Path
136 1         3 for my $dir ( @INC )
137             {
138 11 50       119 $self->{ _word2VecExeDir } = "$dir/External/Word2vec" if ( -e "$dir/External/Word2vec" ); # Test Directory
139 11 50       74 $self->{ _word2VecExeDir } = "$dir/../External/Word2vec" if ( -e "$dir/../External/Word2vec" ); # Dev Directory
140 11 50       75 $self->{ _word2VecExeDir } = "$dir/../../External/Word2vec" if ( -e "$dir/../../External/Word2vec" ); # Dev Directory
141 11 100       97 $self->{ _word2VecExeDir } = "$dir/Word2vec/External/Word2vec" if ( -e "$dir/Word2vec/External/Word2vec" ); # Release Directory
142             }
143              
144             # Open File Handler if checked variable is true
145 1 50       4 if( $self->{ _writeLog } )
146             {
147 0         0 open( $self->{ _fileHandle }, '>:encoding(UTF-8)', 'Word2vecLog.txt' );
148 0         0 $self->{ _fileHandle }->autoflush( 1 ); # Auto-flushes writes to log file
149             }
150              
151 1         3 bless $self, $class;
152              
153 1         5 $self->WriteLog( "New - Debug On" );
154 1 50       5 $self->WriteLog( "New - Word2Vec Executable Directory Found" ) if defined( $self->{ _word2VecExeDir } );
155 1 50       6 $self->WriteLog( "New - Setting Word2Vec Executable Directory To: \"" . $self->{ _word2VecExeDir } . "\"" ) if defined( $self->{ _word2VecExeDir } );
156              
157 1         2 return $self;
158             }
159              
160              
161             ######################################################################################
162             # DESTROY
163             ######################################################################################
164              
165             sub DESTROY
166             {
167 1     1   3 my ( $self ) = @_;
168              
169             # Close FileHandle
170 1 50       49 close( $self->{ _fileHandle } ) if( $self->{ _fileHandle } );
171             }
172              
173              
174             ######################################################################################
175             # Module Functions
176             ######################################################################################
177              
178             sub ExecuteTraining
179             {
180 2     2 1 4 my ( $self, $trainFilePath, $outputFilePath, $vectorSize, $windowSize, $minCount, $sample, $negative, $alpha, $hs, $binary, $numOfThreads, $iterations, $useCBOW, $classes, $readVocab, $saveVocab, $debug, $overwrite ) = @_;
181              
182             # Pre-Training Check(s)
183 2         3 my $executableFileDir = $self->GetWord2VecExeDir() . "/word2vec";
184 2 50       4 $executableFileDir .= ".exe" if $self->GetOSType() eq "MSWin32";
185              
186             # Override Train File Path Member Variable With Specified Train File Parameter
187 2 50       6 $self->WriteLog( "ExecuteTraining - \"TrainFilePath\" Parameter Specified / Overriding Member Variable" ) if defined( $trainFilePath );
188 2 50       4 $trainFilePath = $self->GetTrainFilePath() if !defined( $trainFilePath );
189              
190             # Override Output File Path Member Variable With Specified Train File Parameter
191 2 50       5 $self->WriteLog( "ExecuteTraining - \"OutputFilePath\" Parameter Specified / Overriding Member Variable" ) if defined( $outputFilePath );
192 2 50       3 $outputFilePath = $self->GetOutputFilePath() if !defined( $outputFilePath );
193              
194             # Override Overwrite Member Variable With Specified Train File Parameter
195 2 50       10 $self->WriteLog( "ExecuteTraining - \"Overwrite\" Parameter Specified / Overriding Member Variable" ) if defined( $overwrite );
196 2 50       7 $overwrite = $self->GetOverwriteOldFile() if !defined( $overwrite );
197              
198             # Check For 'word2vec' Executable and trainFile
199 2 50       79 $self->WriteLog( "ExecuteTraining - Error: \"word2vec\" Executable File Cannot Be Found" ) if !( -e "$executableFileDir" );
200 2 50       30 return -1 if !( -e "$executableFileDir" );
201 0 0       0 $self->WriteLog( "ExecuteTraining - Error: Training File Not Found" ) if !( -e "$trainFilePath" );
202 0 0       0 $self->WriteLog( "ExecuteTraining - Error: Training File Size = 0 bytes - No Data In Training File" ) if ( -z "$trainFilePath" );
203 0 0 0     0 return -1 if !( -e "$trainFilePath" ) || ( -z "$trainFilePath" );
204              
205             # Checks To See If Training Is Set To Use CBOW or Skip-Gram Model
206 0 0       0 $self->WriteLog( "ExecuteTraining - Attn: Continuous Bag Of Words Model = 0, Using Skip-Gram Model" ) if $self->GetUseCBOW() == 0;
207              
208             # Checks For Existing Output File And Returns -1 If Overwrite Option Is Not Enabled
209 0 0 0     0 $self->WriteLog( "ExecuteTraining - Warning: \"$outputFilePath\" Already Exists - Canceling Training" ) if ( -e "$outputFilePath" && $overwrite == 0 );
210 0 0 0     0 $self->WriteLog( "ExecuteTraining - Try Enabling \"Overwrite\" Option or Delete \"$outputFilePath\" In Working Directory" ) if ( -e "$outputFilePath" && $overwrite == 0 );
211 0 0 0     0 return -1 if ( -e "$outputFilePath" && $overwrite == 0 );
212              
213             # Fetch Other Training Parameters
214 0 0       0 $self->WriteLog( "ExecuteTraining - \"VectorSize\" Parameter Defined / Overriding Member Variable" ) if defined( $vectorSize );
215 0 0       0 $vectorSize = $self->GetWordVecSize() if !defined( $vectorSize );
216              
217 0 0       0 $self->WriteLog( "ExecuteTraining - \"WindowSize\" Parameter Defined / Overriding Member Variable" ) if defined( $windowSize );
218 0 0       0 $windowSize = $self->GetWindowSize() if !defined( $windowSize );
219              
220 0 0       0 $self->WriteLog( "ExecuteTraining - \"Min-Count\" Parameter Defined / Overriding Member Variable" ) if defined( $minCount );
221 0 0       0 $minCount = $self->GetMinCount() if !defined( $minCount );
222              
223 0 0       0 $self->WriteLog( "ExecuteTraining - \"Sample\" Parameter Defined / Overriding Member Variable" ) if defined( $sample );
224 0 0       0 $sample = $self->GetSample() if !defined( $sample );
225              
226 0 0       0 $self->WriteLog( "ExecuteTraining - \"Negative\" Parameter Defined / Overriding Member Variable" ) if defined( $negative );
227 0 0       0 $negative = $self->GetNegative() if !defined( $negative );
228              
229 0 0       0 $self->WriteLog( "ExecuteTraining - \"Alpha\" Parameter Defined / Overriding Member Variable" ) if defined( $alpha );
230 0 0       0 $alpha = $self->GetAlpha() if !defined( $alpha );
231              
232 0 0       0 $self->WriteLog( "ExecuteTraining - \"HSoftMax\" Parameter Defined / Overriding Member Variable" ) if defined( $hs );
233 0 0       0 $hs = $self->GetHSoftMax() if !defined( $hs );
234              
235 0 0       0 $self->WriteLog( "ExecuteTraining - \"Binary\" Parameter Defined / Overriding Member Variable" ) if defined( $binary );
236 0 0       0 $binary = $self->GetBinaryOutput() if !defined( $binary );
237              
238 0 0       0 $self->WriteLog( "ExecuteTraining - \"NumOfThreads\" Parameter Defined / Overriding Member Variable" ) if defined( $numOfThreads );
239 0 0       0 $numOfThreads = $self->GetNumOfThreads() if !defined( $numOfThreads );
240              
241 0 0       0 $self->WriteLog( "ExecuteTraining - \"Iterations\" Parameter Defined / Overriding Member Variable" ) if defined( $iterations );
242 0 0       0 $iterations = $self->GetNumOfIterations() if !defined( $iterations );
243              
244 0 0       0 $self->WriteLog( "ExecuteTraining - \"CBOW\" Parameter Defined / Overriding Member Variable" ) if defined( $useCBOW );
245 0 0       0 $useCBOW = $self->GetUseCBOW() if !defined( $useCBOW );
246              
247 0 0       0 $self->WriteLog( "ExecuteTraining - \"Classes\" Parameter Defined / Overriding Member Variable" ) if defined( $classes );
248 0 0       0 $classes = $self->GetClasses() if !defined( $classes );
249              
250 0 0       0 $self->WriteLog( "ExecuteTraining - \"ReadVocab\" Parameter Defined / Overriding Member Variable" ) if defined( $readVocab );
251 0 0       0 $readVocab = $self->GetReadVocabFilePath() if !defined( $readVocab );
252              
253 0 0       0 $self->WriteLog( "ExecuteTraining - \"SaveVocab\" Parameter Defined / Overriding Member Variable" ) if defined( $saveVocab );
254 0 0       0 $saveVocab = $self->GetSaveVocabFilePath() if !defined( $saveVocab );
255              
256 0 0       0 $self->WriteLog( "ExecuteTraining - \"Debug\" Parameter Defined / Overriding Member Variable" ) if defined( $debug );
257 0 0       0 $debug = $self->GetDebugTraining() if !defined( $debug );
258              
259             # Setting Up Command String
260 0         0 my $command = "\"$executableFileDir\" ";
261 0         0 $command .= ( "-train \"" . $trainFilePath . "\" " );
262 0         0 $command .= ( "-output \"" . $outputFilePath . "\" " );
263 0         0 $command .= ( "-size " . $vectorSize . " " );
264 0         0 $command .= ( "-window " . $windowSize . " " );
265 0         0 $command .= ( "-sample " . $sample . " " );
266 0         0 $command .= ( "-hs " . $hs . " " );
267 0         0 $command .= ( "-negative " . $negative . " " );
268 0         0 $command .= ( "-threads " . $numOfThreads . " " );
269 0         0 $command .= ( "-iter " . $iterations . " " );
270 0         0 $command .= ( "-min-count " . $minCount . " " );
271 0         0 $command .= ( "-alpha " . $alpha . " " );
272 0         0 $command .= ( "-classes " . $classes . " " );
273 0         0 $command .= ( "-binary " . $binary . " " );
274 0         0 $command .= ( "-cbow " . $useCBOW . " " );
275 0 0 0     0 $command .= ( "-read-vocab " . $readVocab . " " ) if ( defined( $readVocab ) && $readVocab ne "" );
276 0 0 0     0 $command .= ( "-save-vocab " . $saveVocab . " " ) if ( defined( $saveVocab ) && $saveVocab ne "" );
277 0         0 $command .= ( "-debug " . $debug . " " );
278              
279 0         0 $self->WriteLog( "Executing Command: $command" );
280              
281             # Execute External System Command To Train "word2vec"
282             # Execute command without capturing program output
283 0         0 my $result = system( "$command" );
284              
285 0         0 print "\n";
286              
287             # Post-Training Check(s)
288 0 0       0 $self->WriteLog( "ExecuteTraining - Error: Unable To Spawn Executable File - Try Running '--clean' Command And Re-compile Executables" ) if ( $result == 65280 );
289              
290 0 0       0 $self->WriteLog( "ExecuteTraining - Error: Word2Vec Output File Does Not Exist" ) if !( -e "$outputFilePath" );
291 0 0       0 $self->WriteLog( "ExecuteTraining - Error: Word2Vec Output File Size = Zero" ) if ( -z "$outputFilePath" );
292 0 0 0     0 $result = -1 if ( !( -e "$outputFilePath" ) || ( -z "$outputFilePath" ) );
293              
294 0 0 0     0 $self->WriteLog( "ExecuteTraining - Training Successful" ) if $result == 0 && ( -e "$outputFilePath" );
295 0 0       0 $self->WriteLog( "ExecuteTraining - Training Unsuccessful" ) if $result != 0;
296              
297 0         0 return $result;
298             }
299              
300             sub ExecuteStringTraining
301             {
302 1     1 1 1091 my ( $self, $trainingStr, $outputFilePath, $vectorSize, $windowSize, $minCount, $sample, $negative, $alpha, $hs, $binary,
303             $numOfThreads, $iterations, $useCBOW, $classes, $readVocab, $saveVocab, $debug, $overwrite ) = @_;
304              
305             # Check(s)
306 1 50       4 $self->WriteLog( "ExecuteStringTraining - Error: Training String Is Not Defined" ) if !defined( $trainingStr );
307 1 50       3 return -1 if !defined( $trainingStr );
308              
309 1 50       4 $self->WriteLog( "ExecuteStringTraining - Error: Training String Is Empty" ) if ( $trainingStr eq "" );
310 1 50       2 return -1 if ( $trainingStr eq "" );
311              
312             # Save Training String To Temporary File
313 1         2 my $result = 0;
314              
315 1         2 $self->WriteLog( "ExecuteStringTraining - Saving Training String To Temporary File At Working Directory: \"" . $self->GetWorkingDir() . "\"" );
316              
317 1         3 my $tempFilePath = $self->GetWorkingDir() . "/w2vtemp.txt";
318 1 50       94 open( my $fileHandle, ">:encoding(utf8)", "$tempFilePath" ) or $result = -1;
319              
320 1 50       32 $self->WriteLog( "ExecuteStringTraining - Error Creating File Handle : $!" ) if ( $result == -1 );
321 1 50       3 return -1 if ( $result == -1 );
322              
323             # Print Training String Data To File
324 1 50       65 print( $fileHandle "$trainingStr" ) if defined( $fileHandle );
325              
326 1         16 close( $fileHandle );
327 1         3 undef( $fileHandle );
328              
329 1         3 $self->WriteLog( "ExecuteStringTraining - Temporary Training String File Saved" );
330              
331 1         4 $result = $self->ExecuteTraining( $tempFilePath, $outputFilePath, $vectorSize, $windowSize,
332             $minCount, $sample, $negative, $alpha, $hs, $binary, $numOfThreads,
333             $iterations, $useCBOW, $classes, $readVocab, $saveVocab, $debug, $overwrite );
334              
335 1         3 $self->WriteLog( "ExecuteStringTraining - Removing Temporary Training String Data File" );
336 1         67 unlink( $tempFilePath );
337              
338 1 50       4 $self->WriteLog( "ExecuteStringTraining - Finished" ) if ( $result == 0 );
339 1 50 33     6 $self->WriteLog( "ExecuteStringTraining - Finished With Errors" ) if ( $result == -1 && $self->GetWriteLog() == 0 );
340 1 50 33     6 $self->WriteLog( "ExecuteStringTraining - Finished With Errors / See Log File For Details" ) if ( $result == -1 && $self->GetWriteLog() == 1 ) ;
341              
342 1         4 return $result;
343             }
344              
345             sub ComputeCosineSimilarity
346             {
347 1     1 1 2 my ( $self, $wordA, $wordB ) = @_;
348              
349             # Check(s)
350 1 50 33     3 print( "Error: Dictionary Is Empty / No Vector Data In Memory\n" ) if ( $self->GetDebugLog() == 0 && $self->IsVectorDataInMemory() == 0 );
351 1 50       3 $self->WriteLog( "ComputeCosineSimilarity - Error: Dictionary Is Empty / No Vector Data In Memory" ) if $self->IsVectorDataInMemory() == 0;
352 1 50       2 return undef if ( $self->IsVectorDataInMemory() == 0 );
353            
354 0 0 0     0 $self->WriteLog( "ComputeCosineSimilarity - Error: Function Requires Two Arguments (Words)" ) if !defined ( $wordA ) || !defined ( $wordB );
355 0 0 0     0 return undef if !defined ( $wordA ) || !defined ( $wordB );
356              
357 0         0 $self->WriteLog( "ComputeCosineSimilarity - Computing Cosine Similarity Of Words: \"$wordA\" and \"$wordB\"" );
358              
359 0         0 my @wordAVtr = ();
360 0         0 my @wordBVtr = ();
361              
362              
363             # Search Dictionary For Specified Words
364 0         0 my $wordAData = $self->GetWordVector( $wordA );
365 0         0 my $wordBData = $self->GetWordVector( $wordB );
366 0 0       0 @wordAVtr = split( ' ', $wordAData ) if defined( $wordAData );
367 0 0       0 @wordBVtr = split( ' ', $wordBData ) if defined( $wordBData );
368              
369             # Post Search Check(s)
370 0 0       0 $self->WriteLog( "ComputeCosineSimilarity - Error: \"$wordA\" Not In Dictionary" ) if @wordAVtr == 0;
371 0 0       0 $self->WriteLog( "ComputeCosineSimilarity - Error: \"$wordB\" Not In Dictionary" ) if @wordBVtr == 0;
372 0 0 0     0 return undef if @wordAVtr == 0 || @wordBVtr == 0;
373              
374             # Remove Word From Vector To Compute Cosine Similarity Based On Vector Values
375 0         0 shift( @wordAVtr );
376 0         0 shift( @wordBVtr );
377 0         0 my $wordAVtrSize = @wordAVtr;
378 0         0 my $wordBVtrSize = @wordBVtr;
379              
380             # Check(s)
381 0 0       0 $wordAVtrSize = 0 if !defined( $wordAVtrSize );
382 0 0       0 $wordBVtrSize = 0 if !defined( $wordBVtrSize );
383              
384 0         0 $self->WriteLog( "ComputeCosineSimilarity - Words Present In Dictionary" );
385              
386             # Cosine Similarity => cos(angle) = -> ->
387             # A * B
388             # -------------------
389             # -> ->
390             # || A || * || B ||
391             #
392             # Explanation: Dot Product Of VectorA By VectorB, Divided By The Square Root Of Dot Product Of Vector A Multiplied By Square Root Of Dot Product Of Vector B
393              
394 0         0 my $dpA = 0;
395 0         0 my $dpB = 0;
396 0         0 my $ldpA = 0;
397 0         0 my $ldpB = 0;
398 0         0 my $dpAB = 0;
399              
400             # Compute Dot Product Of VectorA
401 0         0 for my $value ( @wordAVtr )
402             {
403 0         0 $dpA += ( $value * $value );
404             }
405              
406             # Compute Dot Product Of VectorB
407 0         0 for my $value ( @wordBVtr )
408             {
409 0         0 $dpB += ( $value * $value );
410             }
411              
412             # Compute $ldpA & $ldpB
413 0         0 $ldpA = sqrt( $dpA );
414 0         0 $ldpB = sqrt( $dpB );
415              
416             # Compute Cosine Similarity Between Vector A & Vector B
417 0         0 for( my $i = 0; $i < $wordAVtrSize; $i++ )
418             {
419             # Compute Value If Not Dividing By Zero
420 0 0 0     0 $dpAB += ( ( $wordAVtr[$i] / $ldpA ) * ( $wordBVtr[$i] / $ldpB ) ) if ( $ldpA != 0 && $ldpB != 0 );
421             }
422              
423             # Return Value Cosine Similarity Value Rounded To Six Decimal Places
424 0         0 return sprintf( "%.6f", $dpAB );
425             }
426              
427             sub ComputeAvgOfWordsCosineSimilarity
428             {
429 1     1 1 2 my ( $self, $wordA, $wordB ) = @_;
430              
431             # Check(s)
432 1 50 33     2 print( "Error: Dictionary Is Empty / No Vector Data In Memory\n" ) if ( $self->GetDebugLog() == 0 && $self->IsVectorDataInMemory() == 0 );
433 1 50       2 $self->WriteLog( "ComputeAvgOfWordsCosineSimilarity - Error: Dictionary Is Empty / No Vector Data In Memory" ) if $self->IsVectorDataInMemory() == 0;
434 1 50       2 return undef if ( $self->IsVectorDataInMemory() == 0 );
435            
436 0 0 0     0 $self->WriteLog( "ComputeAvgOfWordsCosineSimilarity - Error: Function Requires Two Arguments (Words)" ) if !defined ( $wordA ) || !defined ( $wordB );
437 0 0 0     0 return undef if !defined ( $wordA ) || !defined ( $wordB );
438              
439 0 0 0     0 $self->WriteLog( "ComputeAvgOfWordsCosineSimilarity - Error: One Or More Arguments Consisting Of Empty String" ) if ( $wordA eq "" || $wordB eq "" );
440 0 0 0     0 return undef if ( $wordA eq "" || $wordB eq "" );
441              
442              
443 0         0 my @wordAAry = split( ' ', $wordA );
444 0         0 my @wordBAry = split( ' ', $wordB );
445              
446 0         0 $wordA = $self->ComputeAverageOfWords( \@wordAAry );
447 0         0 $wordB = $self->ComputeAverageOfWords( \@wordBAry );
448              
449             # Check(s)
450 0 0       0 $self->WriteLog( "ComputeAvgOfWordsCosineSimilarity - Unable To Compute Average Of Word(s): \"@wordAAry\"" ) if !defined( $wordA );
451 0 0       0 $self->WriteLog( "ComputeAvgOfWordsCosineSimilarity - Unable To Compute Average Of Word(s): \"@wordBAry\"" ) if !defined( $wordB );
452 0 0 0     0 return undef if !defined( $wordA ) || !defined( $wordB );
453              
454 0         0 my @avgAVtr = split( ' ', $wordA );
455 0         0 my @avgBVtr = split( ' ', $wordB );
456 0         0 my $avgAVtrSize = @avgAVtr;
457 0         0 my $avgBVtrSize = @avgBVtr;
458              
459             # Check(s)
460 0 0       0 $avgAVtrSize = 0 if !defined( $avgAVtrSize );
461 0 0       0 $avgBVtrSize = 0 if !defined( $avgBVtrSize );
462              
463 0         0 undef( $wordA );
464 0         0 undef( $wordB );
465              
466             # Compute Cosine Similarity Between Word Averages
467              
468             # Cosine Similarity => cos(angle) = -> ->
469             # A * B
470             # -------------------
471             # -> ->
472             # || A || * || B ||
473             #
474             # Explanation: Dot Product Of VectorA By VectorB, Divided By The Square Root Of Dot Product Of Vector A Multiplied By Square Root Of Dot Product Of Vector B
475              
476 0         0 my $dpA = 0;
477 0         0 my $dpB = 0;
478 0         0 my $ldpA = 0;
479 0         0 my $ldpB = 0;
480 0         0 my $dpAB = 0;
481              
482             # Compute Dot Product Of VectorA
483 0         0 for my $value ( @avgAVtr )
484             {
485 0         0 $dpA += ( $value * $value );
486             }
487              
488             # Compute Dot Product Of VectorB
489 0         0 for my $value ( @avgBVtr )
490             {
491 0         0 $dpB += ( $value * $value );
492             }
493              
494             # Compute $ldpA & $ldpB
495 0         0 $ldpA = sqrt( $dpA );
496 0         0 $ldpB = sqrt( $dpB );
497              
498             # Compute Cosine Similarity Between Vector A & Vector B
499 0         0 for( my $i = 0; $i < $avgAVtrSize; $i++ )
500             {
501             # Compute Value If Not Dividing By Zero
502 0 0 0     0 $dpAB += ( ( $avgAVtr[$i] / $ldpA ) * ( $avgBVtr[$i] / $ldpB ) ) if ( $ldpA != 0 && $ldpB != 0 );
503             }
504              
505             # Return Value Cosine Similarity Value Rounded To Six Decimal Places
506 0         0 return sprintf( "%.6f", $dpAB );
507             }
508              
509             sub ComputeMultiWordCosineSimilarity
510             {
511 2     2 1 4 my ( $self, $wordA, $wordB ) = @_;
512              
513             # Check(s)
514 2 50 33     3 print( "Error: Dictionary Is Empty / No Vector Data In Memory\n" ) if ( $self->GetDebugLog() == 0 && $self->IsVectorDataInMemory() == 0 );
515 2 50       4 $self->WriteLog( "ComputeMultiWordCosineSimilarity - Error: Dictionary Is Empty / No Vector Data In Memory" ) if $self->IsVectorDataInMemory() == 0;
516 2 50       3 return undef if ( $self->IsVectorDataInMemory() == 0 );
517            
518 0 0 0     0 $self->WriteLog( "ComputeMultiWordCosineSimilarity - Error: Function Requires Two Arguments (Words)" ) if !defined ( $wordA ) || !defined ( $wordB );
519 0 0 0     0 return undef if !defined ( $wordA ) || !defined ( $wordB );
520              
521 0         0 $self->WriteLog( "ComputeMultiWordCosineSimilarity - Computing Cosine Similarity Of Words: \"$wordA\" and \"$wordB\"" );
522              
523 0         0 my @wordAVtr = ();
524 0         0 my @wordBVtr = ();
525              
526              
527             # Split Words To Check For Existence In Dictionary
528 0         0 my @wordAAry = split( ' ', $wordA );
529 0         0 my @wordBAry = split( ' ', $wordB );
530 0         0 my $wordsFoundA = "";
531 0         0 my $wordsFoundB = "";
532              
533             # Search Dictionary For Specified Words
534 0         0 for my $word ( @wordAAry )
535             {
536 0         0 my $wordData = $self->GetWordVector( $word );
537              
538 0 0       0 if( defined( $wordData ) )
539             {
540 0         0 my @wordVtr = split( ' ', $wordData );
541 0         0 push( @wordAVtr, [ @wordVtr ] );
542 0         0 $wordsFoundA .= ( " " . $word );
543             }
544             }
545              
546 0         0 for my $word ( @wordBAry )
547             {
548 0         0 my $wordData = $self->GetWordVector( $word );
549              
550 0 0       0 if( defined( $wordData ) )
551             {
552 0         0 my @wordVtr = split( ' ', $wordData );
553 0         0 push( @wordBVtr, [ @wordVtr ] );
554 0         0 $wordsFoundB .= ( " " . $word );
555             }
556             }
557              
558              
559             # Post Search Check(s)
560 0         0 my $error = 0;
561 0         0 for( my $i = 0; $i < @wordAAry; $i++ )
562             {
563 0 0       0 $self->WriteLog( "ComputeMultiWordCosineSimilarity - Error: \"" . $wordAAry[$i] . "\" Not In Dictionary" ) if index( $wordsFoundA, $wordAAry[$i] ) == -1;
564 0 0       0 $error = 1 if index( $wordsFoundA, $wordAAry[$i] ) == -1;
565             }
566              
567 0         0 for( my $i = 0; $i < @wordBAry; $i++ )
568             {
569 0 0       0 $self->WriteLog( "ComputeMultiWordCosineSimilarity - Error: \"" . $wordBAry[$i] . "\" Not In Dictionary" ) if index( $wordsFoundB, $wordBAry[$i] ) == -1;
570 0 0       0 $error = 1 if index( $wordsFoundB, $wordBAry[$i] ) == -1;
571             }
572              
573 0 0       0 return undef if $error != 0;
574              
575              
576 0         0 $self->WriteLog( "ComputeMultiWordCosineSimilarity - Words Present In Dictionary" );
577              
578             # Remove Words From Word Vectors
579 0         0 for( my $i = 0; $i < @wordAVtr; $i++ )
580             {
581 0         0 my @tempAry = @{ $wordAVtr[$i] };
  0         0  
582 0         0 shift( @tempAry );
583 0         0 $wordAVtr[$i] = \@tempAry;
584             }
585              
586 0         0 for( my $i = 0; $i < @wordBVtr; $i++ )
587             {
588 0         0 my @tempAry = @{ $wordBVtr[$i] };
  0         0  
589 0         0 shift( @tempAry );
590 0         0 $wordBVtr[$i] = \@tempAry;
591             }
592              
593              
594             # Compute Sum Of Compound Words
595 0         0 my @wordASumAry = ();
596 0         0 my @wordBSumAry = ();
597              
598 0         0 my $wordVtrASize = @{ $wordAVtr[0] };
  0         0  
599 0         0 my $wordVtrBSize = @{ $wordBVtr[0] };
  0         0  
600              
601 0         0 for( my $i = 0; $i < $wordVtrASize; $i++ )
602             {
603 0         0 my $value = 0;
604              
605 0         0 for my $aryRef ( @wordAVtr )
606             {
607 0         0 $value += $aryRef->[$i];
608             }
609              
610 0         0 push( @wordASumAry, $value );
611             }
612              
613 0         0 for( my $i = 0; $i < $wordVtrBSize; $i++ )
614             {
615 0         0 my $value = 0;
616              
617 0         0 for my $aryRef ( @wordBVtr )
618             {
619 0         0 $value += $aryRef->[$i];
620             }
621              
622 0         0 push( @wordBSumAry, $value );
623             }
624              
625              
626             # Cosine Similarity => cos(angle) = -> ->
627             # A * B
628             # -------------------
629             # -> ->
630             # || A || * || B ||
631             #
632             # Explanation: Dot Product Of VectorA By VectorB, Divided By The Square Root Of Dot Product Of Vector A Multiplied By Square Root Of Dot Product Of Vector B
633              
634 0         0 my $dpA = 0;
635 0         0 my $dpB = 0;
636 0         0 my $ldpA = 0;
637 0         0 my $ldpB = 0;
638 0         0 my $dpAB = 0;
639              
640             # Compute Dot Product Of VectorA
641 0         0 for my $value ( @wordASumAry )
642             {
643 0         0 $dpA += ( $value * $value );
644             }
645              
646             # Compute Dot Product Of VectorB
647 0         0 for my $value ( @wordBSumAry )
648             {
649 0         0 $dpB += ( $value * $value );
650             }
651              
652             # Compute $ldpA & $ldpB
653 0         0 $ldpA = sqrt( $dpA );
654 0         0 $ldpB = sqrt( $dpB );
655              
656             # Compute Cosine Similarity Between Vector A & Vector B
657 0         0 for( my $i = 0; $i < $wordVtrASize; $i++ )
658             {
659             # Compute Value If Not Dividing By Zero
660 0 0 0     0 $dpAB += ( ( $wordASumAry[$i] / $ldpA ) * ( $wordBSumAry[$i] / $ldpB ) ) if ( $ldpA != 0 && $ldpB != 0 );
661             }
662              
663             # Return Value Cosine Similarity Value Rounded To Six Decimal Places
664 0         0 return sprintf( "%.6f", $dpAB );
665             }
666              
667             sub ComputeCosineSimilarityOfWordVectors
668             {
669 1     1 1 4 my ( $self, $wordAData, $wordBData ) = @_;
670              
671             # Check(s)
672 1 50 33     4 $self->WriteLog( "ComputeCosineSimilarityOfWordVectors - Error: Function Requires Two Arguments (Word Vectors)" ) if !defined ( $wordAData ) || !defined ( $wordBData );
673 1 50 33     9 return undef if !defined ( $wordAData ) || !defined ( $wordBData );
674              
675 0 0 0     0 $self->WriteLog( "ComputeCosineSimilarityOfWordVectors - Error: One Or More Word Vectors Consist Of No Data" ) if ( $wordAData eq "" || $wordBData eq "" );
676 0 0 0     0 return undef if ( $wordAData eq "" || $wordBData eq "" );
677              
678 0         0 $self->WriteLog( "ComputeCosineSimilarityOfWordVectors - Computing Cosine Similarity Of Word Vectors: \"$wordAData\" and \"$wordBData\"" );
679              
680 0         0 my @wordAVtr = split( ' ', $wordAData );
681 0         0 my @wordBVtr = split( ' ', $wordBData );
682              
683 0         0 undef( $wordAData );
684 0         0 undef( $wordBData );
685              
686 0         0 my $wordAVtrSize = @wordAVtr;
687 0         0 my $wordBVtrSize = @wordBVtr;
688              
689             # Check(s)
690 0 0       0 $wordAVtrSize = 0 if !defined( $wordAVtrSize );
691 0 0       0 $wordBVtrSize = 0 if !defined( $wordBVtrSize );
692              
693 0         0 $self->WriteLog( "ComputeCosineSimilarityOfWordVectors - Words Present In Dictionary" );
694              
695             # Cosine Similarity => cos(angle) = -> ->
696             # A * B
697             # -------------------
698             # -> ->
699             # || A || * || B ||
700             #
701             # Explanation: Dot Product Of VectorA By VectorB, Divided By The Square Root Of Dot Product Of Vector A Multiplied By Square Root Of Dot Product Of Vector B
702              
703 0         0 my $dpA = 0;
704 0         0 my $dpB = 0;
705 0         0 my $ldpA = 0;
706 0         0 my $ldpB = 0;
707 0         0 my $dpAB = 0;
708              
709             # Compute Dot Product Of VectorA
710 0         0 for my $value ( @wordAVtr )
711             {
712 0         0 $dpA += ( $value * $value );
713             }
714              
715             # Compute Dot Product Of VectorB
716 0         0 for my $value ( @wordBVtr )
717             {
718 0         0 $dpB += ( $value * $value );
719             }
720              
721             # Compute $ldpA & $ldpB
722 0         0 $ldpA = sqrt( $dpA );
723 0         0 $ldpB = sqrt( $dpB );
724              
725             # Compute Cosine Similarity Between Vector A & Vector B
726 0         0 for( my $i = 0; $i < $wordAVtrSize; $i++ )
727             {
728             # Compute Value If Not Dividing By Zero
729 0 0 0     0 $dpAB += ( ( $wordAVtr[$i] / $ldpA ) * ( $wordBVtr[$i] / $ldpB ) ) if ( $ldpA != 0 && $ldpB != 0 );
730             }
731              
732             # Return Value Cosine Similarity Value Rounded To Six Decimal Places
733 0         0 return sprintf( "%.6f", $dpAB );
734             }
735              
736             sub CosSimWithUserInput
737             {
738 0     0 1 0 my ( $self ) = @_;
739            
740             # Check
741 0 0 0     0 print( "Error: Dictionary Is Empty / No Vector Data In Memory\n" ) if ( $self->GetDebugLog() == 0 && $self->IsVectorDataInMemory() == 0 );
742 0 0       0 $self->WriteLog( "CosSimWithUserInput - Error: Dictionary Is Empty / No Vector Data In Memory" ) if $self->IsVectorDataInMemory() == 0;
743 0 0       0 return undef if ( $self->IsVectorDataInMemory() == 0 );
744              
745 0         0 my $exit = 0;
746              
747 0         0 $self->WriteLog( "Input (Type \"EXIT\" to exit): ", 0 );
748 0 0       0 print( "Input (Type \"EXIT\" to exit): " ) if $self->GetDebugLog() == 0;
749              
750 0         0 while ( my $input = )
751             {
752 0         0 chomp( $input );
753 0 0       0 return if $input eq "EXIT";
754              
755 0         0 my @wordAry = split( ' ', $input );
756 0 0 0     0 $self->WriteLog( "Warning: Requires two words for input - ex \"man woman\"" ) if @wordAry == 0 || @wordAry == 1;
757 0 0 0     0 $self->WriteLog( "Input (Type \"EXIT\" to exit): ", 0 ) if @wordAry == 0 || @wordAry == 1;
758              
759             # Print Data To Console When DebugLog == 0
760 0 0 0     0 print( "Warning: Requires two words for input - ex \"man woman\" \n" ) if ( $self->GetDebugLog == 0 && ( @wordAry == 0 || @wordAry == 1 ) );
      0        
761 0 0 0     0 print( "Input (Type \"EXIT\" to exit): " ) if ( $self->GetDebugLog == 0 && ( @wordAry == 0 || @wordAry == 1 ) );
      0        
762 0 0 0     0 next if ( @wordAry == 0 || @wordAry == 1 );
763              
764 0         0 my $value = $self->ComputeCosineSimilarity( $wordAry[0], $wordAry[1] );
765 0 0       0 $self->WriteLog( "Result: $value" ) if defined ( $value );
766 0         0 $self->WriteLog( "Input (Type \"EXIT\" to exit): ", 0 );
767              
768             # Print Data To Console When DebugLog == 0
769 0 0 0     0 print( "Error: One Or More Words Not Present In Dictionary\n" ) if ( !defined ( $value ) && $self->GetDebugLog() == 0 );
770 0 0 0     0 print( "Result: $value\n" ) if ( defined ( $value ) && $self->GetDebugLog == 0 );
771 0 0       0 print( "Input (Type \"EXIT\" to exit): " ) if $self->GetDebugLog == 0;
772             }
773             }
774              
775             sub MultiWordCosSimWithUserInput
776             {
777 0     0 1 0 my ( $self ) = @_;
778            
779             # Check
780 0 0 0     0 print( "Error: Dictionary Is Empty / No Vector Data In Memory\n" ) if ( $self->GetDebugLog() == 0 && $self->IsVectorDataInMemory() == 0 );
781 0 0       0 $self->WriteLog( "CosSimWithUserInput - Error: Dictionary Is Empty / No Vector Data In Memory" ) if $self->IsVectorDataInMemory() == 0;
782 0 0       0 return undef if ( $self->IsVectorDataInMemory() == 0 );
783              
784 0         0 my $exit = 0;
785              
786 0         0 $self->WriteLog( "Input (Type \"EXIT\" to exit): ", 0 );
787 0 0       0 print( "Input (Type \"EXIT\" to exit): " ) if $self->GetDebugLog() == 0;
788              
789 0         0 while ( my $input = )
790             {
791 0         0 chomp( $input );
792 0 0       0 return if $input eq "EXIT";
793              
794 0         0 my @wordAry = split( ' ', $input );
795 0 0 0     0 $self->WriteLog( "Warning: Requires two words for input - ex \"man woman\"" ) if @wordAry == 0 || @wordAry == 1;
796 0 0 0     0 $self->WriteLog( "Input (Type \"EXIT\" to exit): ", 0 ) if @wordAry == 0 || @wordAry == 1;
797              
798             # Print Data To Console When DebugLog == 0
799 0 0 0     0 print( "Warning: Requires two words for input - ex \"man woman\"\n" ) if ( $self->GetDebugLog == 0 && ( @wordAry == 0 || @wordAry == 1 ) );
      0        
800 0 0 0     0 print( "Input (Type \"EXIT\" to exit): " ) if ( $self->GetDebugLog == 0 && ( @wordAry == 0 || @wordAry == 1 ) );
      0        
801 0 0 0     0 next if @wordAry == 0 || @wordAry == 1;
802              
803 0         0 my @wordArg1 = split( ':', $wordAry[0] );
804 0         0 my @wordArg2 = split( ':', $wordAry[1] );
805 0         0 my $arg1 = join( ' ', @wordArg1 );
806 0         0 my $arg2 = join( ' ', @wordArg2 );
807 0         0 my $value = $self->ComputeMultiWordCosineSimilarity( $arg1, $arg2 );
808 0 0       0 $self->WriteLog( "Result: $value" ) if defined ( $value );
809 0         0 $self->WriteLog( "Input (Type \"EXIT\" to exit): ", 0 );
810              
811             # Print Data To Console When DebugLog == 0
812 0 0 0     0 print( "Error: One Or More Words Not Present In Dictionary\n" ) if ( !defined ( $value ) && $self->GetDebugLog() == 0 );
813 0 0 0     0 print( "Result: $value\n" ) if ( defined ( $value ) && $self->GetDebugLog() == 0 );
814 0 0       0 print( "Input (Type \"EXIT\" to exit): " ) if $self->GetDebugLog() == 0;
815             }
816             }
817              
818             sub ComputeAverageOfWords
819             {
820 1     1 1 1 my ( $self, $wordAryRef ) = @_;
821              
822             # Check(s)
823 1 50 33     2 print( "Error: Dictionary Is Empty / No Vector Data In Memory\n" ) if ( $self->GetDebugLog() == 0 && $self->IsVectorDataInMemory() == 0 );
824 1 50       3 $self->WriteLog( "ComputeAverageOfWords - Error: Dictionary Is Empty / No Vector Data In Memory" ) if $self->IsVectorDataInMemory() == 0;
825 1 50       2 return undef if ( $self->IsVectorDataInMemory() == 0 );
826            
827 0 0       0 $self->WriteLog( "Error: Method Requires Array Reference Argument / Argument Not Defined" ) if !defined( $wordAryRef );
828 0 0       0 return undef if !defined( $wordAryRef );
829              
830 0         0 my @wordAry = @{ $wordAryRef };
  0         0  
831              
832 0         0 my @foundWords = ();
833 0         0 my @foundWordData = ();
834 0         0 my @resultAry = ();
835              
836 0         0 my $wordDataSize = 0;
837              
838 0         0 $self->WriteLog( "ComputeAverageOfWords - Locating Words In Vocabulary/Dictionary" );
839              
840             # Normal Memory Usage Mode
841 0 0       0 if( $self->GetMinimizeMemoryUsage() == 0 )
842             {
843             # Find Words
844 0         0 for my $word ( @wordAry )
845             {
846             # Dense Vector Data Algorithm
847 0 0       0 if( $self->GetSparseVectorMode() == 0 )
848             {
849             # Fetch Word From Vocabulary/Dictionary
850 0         0 my $result = $self->GetWordVector( $word );
851              
852             # Store Found Word
853 0 0       0 push( @foundWords, $word ) if defined( $result );
854              
855             # Store Found Word Vector Data
856 0 0       0 my @wordData = split( ' ', $result ) if defined( $result );
857 0 0       0 push( @foundWordData, [ @wordData ] ) if @wordData > 0;
858              
859 0 0 0     0 $wordDataSize = @wordData - 1 if $wordDataSize == 0 && defined( $result );
860             }
861             # Sparse Vector Data Algorithm
862             else
863             {
864             # Fetch Word From Vocabulary/Dictionary
865 0         0 my $result = $self->GetWordVector( $word, 1 );
866              
867             # Store Found Word
868 0 0       0 push( @foundWords, $word ) if defined( $result );
869              
870             # Store Found Word Vector Data
871 0 0       0 push( @foundWordData, $self->ConvertRawSparseTextToVectorDataHash( $result ) ) if defined( $result );
872              
873 0 0 0     0 $wordDataSize = $self->GetVectorLength() if $wordDataSize == 0 && defined( $result );
874             }
875             }
876              
877 0         0 $self->WriteLog( "ComputeAverageOfWords - Found: \"" . @foundWords . "\" Of \"" . @wordAry . "\" Words" );
878 0 0       0 $self->WriteLog( "ComputeAverageOfWords - Computing Average Of Found Word(s): @foundWords" ) if @foundWords > 0;
879              
880             # Clear Found Words (Strings)
881 0         0 undef( @foundWords );
882 0         0 @foundWords = ();
883              
884             # Compute Average Of Vector Data For Found Words,
885             # Sum Values Of All Found Word Vectors / Dense Vector Format
886 0 0       0 if( $self->GetSparseVectorMode() == 0 )
887             {
888 0         0 for( my $i = 0; $i < $wordDataSize; $i++ )
889             {
890 0         0 my $value = 0;
891              
892 0         0 for( my $j = 0; $j < @foundWordData; $j++ )
893             {
894 0         0 $value += $foundWordData[$j]->[$i+1];
895             }
896              
897             # Compute Average
898 0         0 $value /= @foundWordData;
899              
900             # Round Decimal Places Greater Than Six
901 0         0 $value = sprintf( "%.6f", $value );
902              
903             # Store Value In Resulting Array
904 0         0 push( @resultAry, $value );
905             }
906             }
907             # Sum Values Of All Found Word Vectors / Sparse Vector Format
908             else
909             {
910             # Create And Zero Fill The Result Vector
911 0         0 @resultAry = ( "0.000000" ) x $wordDataSize;
912              
913 0         0 for( my $i = 0; $i < @foundWordData; $i++ )
914             {
915 0         0 for my $key ( keys( %{ $foundWordData[$i] } ) )
  0         0  
916             {
917 0         0 $resultAry[$key-1] += sprintf( "%.6f", $foundWordData[$i]->{$key} );
918             }
919             }
920              
921             # Compute Average Of All Result Vector Elements
922 0 0       0 if( @foundWordData > 1 )
923             {
924 0         0 for( my $i = 0; $i < @resultAry; $i++ )
925             {
926 0         0 $resultAry[$i] /= @foundWordData;
927 0         0 $resultAry[$i] = sprintf( "%.6f", $resultAry[$i] );
928             }
929             }
930             }
931              
932             # Clear Vector Data For Found Words
933 0 0       0 if( $self->GetSparseVectorMode() == 0 )
934             {
935 0         0 for( my $i = 0; $i < @foundWordData; $i++ )
936             {
937 0         0 $foundWordData[$i] = [];
938             }
939             }
940             else
941             {
942 0         0 for( my $i = 0; $i < @foundWordData; $i++ )
943             {
944 0         0 $foundWordData[$i] = {};
945             }
946             }
947              
948             # Clear Found Word Data
949 0         0 undef( @foundWordData );
950 0         0 @foundWordData = ();
951             }
952             # Minimal Memory Usage Mode
953             else
954             {
955             # Find Words
956 0         0 for my $word ( @wordAry )
957             {
958             # Dense Vector Format / Minimal Memory Usage Mode
959 0 0       0 if( $self->GetSparseVectorMode() == 0 )
960             {
961             # Fetch Word From Vocabulary/Dictionary
962 0         0 my $result = $self->GetWordVector( $word );
963              
964 0 0       0 next if !defined( $result );
965              
966             # Store Found Word
967 0 0       0 push( @foundWords, $word ) if defined( $result );
968              
969             # Split Found Word Vector Data Into An Array
970 0 0       0 my @wordData = split( ' ', $result ) if defined( $result );
971              
972             # Set Word Vector Length
973 0 0 0     0 $wordDataSize = @wordData - 1 if ( $wordDataSize == 0 && defined( $result ) );
974              
975             # Create And Zero Fill The Result Vector If Not Already Done
976 0 0 0     0 @resultAry = ( "0.000000" ) x $wordDataSize if ( @resultAry == 0 && @resultAry != $wordDataSize );
977              
978 0         0 for( my $i = 1; $i < @wordData; $i++ )
979             {
980 0         0 my $value = $wordData[$i];
981              
982             # Round Decimal Places Greater Than Six
983 0         0 $value = sprintf( "%.6f", $value );
984              
985 0         0 $resultAry[$i-1] += $value;
986             }
987              
988 0 0 0     0 $result = "" if ( defined( $result ) && $result ne "" );
989              
990 0         0 undef( @wordData );
991 0         0 @wordData = ();
992             }
993             # Sparse Vector Format / Minimal Memory Usage Mode
994             else
995             {
996             # Create And Zero Fill The Result Vector If Not Already Done
997 0 0       0 @resultAry = ( "0.000000" ) x $self->GetVectorLength() if @resultAry == 0;
998              
999             # Fetch Word From Vocabulary/Dictionary
1000 0         0 my $result = $self->GetWordVector( $word, 1 );
1001              
1002             # Store Found Word
1003 0 0       0 push( @foundWords, $word ) if defined( $result );
1004              
1005             # Store Found Word Vector Data
1006 0 0       0 my $wordData = $self->ConvertRawSparseTextToVectorDataHash( $result ) if defined( $result );
1007              
1008             # Copy Hash Element Data To Defined Array Indices
1009 0         0 for my $key ( keys( %{ $wordData } ) )
  0         0  
1010             {
1011 0         0 $resultAry[$key-1] += sprintf( "%.6f", $wordData->{$key} );
1012             }
1013              
1014             # Clear Hash Data
1015 0         0 $wordData = {};
1016 0         0 undef( %{ $wordData } );
  0         0  
1017 0         0 $result = "";
1018             }
1019             }
1020              
1021 0         0 $self->WriteLog( "ComputeAverageOfWords - Found: \"" . @foundWords . "\" Of \"" . @wordAry . "\" Words" );
1022 0 0       0 $self->WriteLog( "ComputeAverageOfWords - Computing Average Of Found Word(s): @foundWords" ) if @foundWords > 0;
1023              
1024             # Compute Average Of All Result Vector Elements
1025 0 0       0 if( @foundWords > 1 )
1026             {
1027 0         0 for( my $i = 0; $i < @resultAry; $i++ )
1028             {
1029 0         0 $resultAry[$i] /= @foundWords;
1030 0         0 $resultAry[$i] = sprintf( "%.6f", $resultAry[$i] );
1031             }
1032             }
1033              
1034             # Clear Found Words (Strings)
1035 0         0 undef( @foundWords );
1036 0         0 @foundWords = ();
1037             }
1038              
1039 0 0       0 $self->WriteLog( "ComputeAverageOfWords - Complete" ) if @resultAry > 0;
1040 0 0       0 $self->WriteLog( "ComputeAverageOfWords - Completed With Errors" ) if @resultAry == 0;
1041              
1042 0 0       0 my $returnStr = join( ' ', @resultAry ) if @resultAry > 0;
1043 0 0       0 $returnStr = undef if @resultAry == 0;
1044 0         0 undef( @resultAry );
1045 0         0 return $returnStr;
1046             }
1047              
1048             sub AddTwoWords
1049             {
1050 1     1 1 2 my ( $self, $wordA, $wordB ) = @_;
1051              
1052             # Check(s)
1053 1 50 33     2 print( "Error: Dictionary Is Empty / No Vector Data In Memory\n" ) if ( $self->GetDebugLog() == 0 && $self->IsVectorDataInMemory() == 0 );
1054 1 50       3 $self->WriteLog( "AddTwoWords - Error: Dictionary Is Empty / No Vector Data In Memory" ) if $self->IsVectorDataInMemory() == 0;
1055 1 50       1 return undef if ( $self->IsVectorDataInMemory() == 0 );
1056            
1057 0 0 0     0 $self->WriteLog( "AddTwoWords - Error: Function Requires Two Arguments (Word Vectors)" ) if !defined ( $wordA ) || !defined ( $wordB );
1058 0 0 0     0 return undef if !defined ( $wordA ) || !defined ( $wordB );
1059              
1060 0         0 my $wordAData = $self->GetWordVector( $wordA );
1061 0         0 my $wordBData = $self->GetWordVector( $wordB );
1062              
1063 0 0       0 $self->WriteLog( "AddTwoWords - Error: \"$wordA\" Not In Dictionary" ) if !defined( $wordAData );
1064 0 0       0 $self->WriteLog( "AddTwoWords - Error: \"$wordB\" Not In Dictionary" ) if !defined( $wordBData );
1065 0 0 0     0 return undef if !defined( $wordAData ) || !defined( $wordBData );
1066              
1067 0         0 my @wordAVtr = split( ' ', $wordAData );
1068 0         0 my @wordBVtr = split( ' ', $wordBData );
1069              
1070             # More Check(s)
1071 0 0       0 $self->WriteLog( "AddTwoWords - Cannot Add Two Word Vectors / Vtr Sizes Not Equal" ) if ( @wordAVtr != @wordBVtr ) ;
1072 0 0       0 return undef if ( @wordAVtr != @wordBVtr );
1073              
1074             # Remove Word From Word Vector (First Element)
1075 0         0 shift( @wordAVtr );
1076 0         0 shift( @wordBVtr );
1077              
1078 0         0 $self->WriteLog( "AddTwoWords - Adding Two Word Vectors" );
1079              
1080 0         0 my @resultVtr = ();
1081              
1082 0         0 for( my $i = 0; $i < @wordAVtr; $i++ )
1083             {
1084 0         0 push( @resultVtr, $wordAVtr[$i] + $wordBVtr[$i] );
1085             }
1086              
1087 0         0 my $resultStr = join( ' ', @resultVtr );
1088 0         0 undef( @resultVtr );
1089              
1090 0         0 $self->WriteLog( "AddTwoWords - Complete" );
1091              
1092 0         0 return $resultStr;
1093             }
1094              
1095             sub SubtractTwoWords
1096             {
1097 1     1 1 2 my ( $self, $wordA, $wordB ) = @_;
1098              
1099             # Check(s)
1100 1 50 33     2 print( "Error: Dictionary Is Empty / No Vector Data In Memory\n" ) if ( $self->GetDebugLog() == 0 && $self->IsVectorDataInMemory() == 0 );
1101 1 50       3 $self->WriteLog( "AddTwoWords - Error: Dictionary Is Empty / No Vector Data In Memory" ) if $self->IsVectorDataInMemory() == 0;
1102 1 50       2 return undef if ( $self->IsVectorDataInMemory() == 0 );
1103            
1104 0 0 0     0 $self->WriteLog( "SubtractTwoWords - Error: Function Requires Two Arguments (Word Vectors)" ) if !defined ( $wordA ) || !defined ( $wordB );
1105 0 0 0     0 return undef if !defined ( $wordA ) || !defined ( $wordB );
1106              
1107 0         0 my $wordAData = $self->GetWordVector( $wordA );
1108 0         0 my $wordBData = $self->GetWordVector( $wordB );
1109              
1110 0 0       0 $self->WriteLog( "SubtractTwoWords - Error: \"$wordA\" Not In Dictionary" ) if !defined( $wordAData );
1111 0 0       0 $self->WriteLog( "SubtractTwoWords - Error: \"$wordB\" Not In Dictionary" ) if !defined( $wordBData );
1112 0 0 0     0 return undef if !defined( $wordAData ) || !defined( $wordBData );
1113              
1114 0         0 my @wordAVtr = split( ' ', $wordAData );
1115 0         0 my @wordBVtr = split( ' ', $wordBData );
1116              
1117             # More Check(s)
1118 0 0       0 $self->WriteLog( "SubtractTwoWords - Cannot Add Two Word Vectors / Vtr Sizes Not Equal" ) if ( @wordAVtr != @wordBVtr ) ;
1119 0 0       0 return undef if ( @wordAVtr != @wordBVtr );
1120              
1121             # Remove Word From Word Vector (First Element)
1122 0         0 shift( @wordAVtr );
1123 0         0 shift( @wordBVtr );
1124              
1125 0         0 $self->WriteLog( "SubtractTwoWords - Subtracting Two Word Vectors" );
1126              
1127 0         0 my @resultVtr = ();
1128              
1129 0         0 for( my $i = 0; $i < @wordAVtr; $i++ )
1130             {
1131 0         0 push( @resultVtr, $wordAVtr[$i] - $wordBVtr[$i] );
1132             }
1133              
1134 0         0 my $resultStr = join( ' ', @resultVtr );
1135 0         0 undef( @resultVtr );
1136              
1137 0         0 $self->WriteLog( "SubtractTwoWords - Complete" );
1138              
1139 0         0 return $resultStr;
1140             }
1141              
1142             sub AddTwoWordVectors
1143             {
1144 1     1 1 6 my ( $self, $wordA, $wordB ) = @_;
1145              
1146             # Check(s)
1147 1 50 33     5 $self->WriteLog( "AddTwoWordVectors - Error: Function Requires Two Arguments (Word Vectors)" ) if !defined ( $wordA ) || !defined ( $wordB );
1148 1 50 33     5 return undef if !defined ( $wordA ) || !defined ( $wordB );
1149              
1150 0         0 my @wordAVtr = split( ' ', $wordA );
1151 0         0 my @wordBVtr = split( ' ', $wordB );
1152              
1153             # More Check(s)
1154 0 0       0 $self->WriteLog( "AddTwoWordVectors - Cannot Add Two Word Vectors / Vtr Sizes Not Equal" ) if ( @wordAVtr != @wordBVtr ) ;
1155 0 0       0 return undef if ( @wordAVtr != @wordBVtr );
1156              
1157 0         0 $self->WriteLog( "AddTwoWordVectors - Adding Two Word Vectors" );
1158              
1159 0         0 my @resultVtr = ();
1160              
1161 0         0 for( my $i = 0; $i < @wordAVtr; $i++ )
1162             {
1163 0         0 push( @resultVtr, $wordAVtr[$i] + $wordBVtr[$i] );
1164             }
1165              
1166 0         0 my $resultStr = join( ' ', @resultVtr );
1167 0         0 undef( @resultVtr );
1168              
1169 0         0 $self->WriteLog( "AddTwoWordVectors - Complete" );
1170              
1171 0         0 return $resultStr;
1172             }
1173              
1174             sub SubtractTwoWordVectors
1175             {
1176 1     1 1 3 my ( $self, $wordA, $wordB ) = @_;
1177              
1178             # Check(s)
1179 1 50 33     7 $self->WriteLog( "SubtractTwoWordVectors - Error: Function Requires Two Arguments (Word Vectors)" ) if !defined ( $wordA ) || !defined ( $wordB );
1180 1 50 33     8 return undef if !defined ( $wordA ) || !defined ( $wordB );
1181              
1182 0         0 my @wordAVtr = split( ' ', $wordA );
1183 0         0 my @wordBVtr = split( ' ', $wordB );
1184              
1185             # More Check(s)
1186 0 0       0 $self->WriteLog( "SubtractTwoWordVectors - Cannot Subtract Two Word Vectors / Vtr Sizes Not Equal" ) if ( @wordAVtr != @wordBVtr ) ;
1187 0 0       0 return undef if ( @wordAVtr != @wordBVtr );
1188              
1189 0         0 $self->WriteLog( "SubtractTwoWordVectors - Subtracting Two Word Vectors" );
1190              
1191 0         0 my @resultVtr = ();
1192              
1193 0         0 for( my $i = 0; $i < @wordAVtr; $i++ )
1194             {
1195 0         0 push( @resultVtr, $wordAVtr[$i] - $wordBVtr[$i] );
1196             }
1197              
1198 0         0 my $resultStr = join( ' ', @resultVtr );
1199 0         0 undef( @resultVtr );
1200              
1201 0         0 $self->WriteLog( "SubtractTwoWordVectors - Complete" );
1202              
1203 0         0 return $resultStr;
1204             }
1205              
1206             sub AverageOfTwoWordVectors
1207             {
1208 1     1 1 3 my ( $self, $wordA, $wordB ) = @_;
1209              
1210             # Check(s)
1211 1 50 33     8 $self->WriteLog( "AverageOfTwoWordVectors - Error: Function Requires Two Arguments (Word Vectors)" ) if !defined ( $wordA ) || !defined ( $wordB );
1212 1 50 33     7 return undef if !defined ( $wordA ) || !defined ( $wordB );
1213              
1214 0         0 my @wordAVtr = split( ' ', $wordA );
1215 0         0 my @wordBVtr = split( ' ', $wordB );
1216              
1217             # More Check(s)
1218 0 0       0 $self->WriteLog( "AverageOfTwoWordVectors - Cannot Compute Average Of Word Vectors / Vtr Sizes Not Equal" ) if ( @wordAVtr != @wordBVtr ) ;
1219 0 0       0 return undef if ( @wordAVtr != @wordBVtr );
1220              
1221 0         0 $self->WriteLog( "AverageOfTwoWordVectors - Averaging Two Word Vectors" );
1222              
1223 0         0 my @resultVtr = ();
1224              
1225 0         0 for( my $i = 0; $i < @wordAVtr; $i++ )
1226             {
1227 0         0 push( @resultVtr, ( $wordAVtr[$i] - $wordBVtr[$i] ) / 2 );
1228             }
1229              
1230 0         0 my $resultStr = join( ' ', @resultVtr );
1231 0         0 undef( @resultVtr );
1232              
1233 0         0 $self->WriteLog( "AverageOfTwoWordVectors - Complete" );
1234              
1235 0         0 return $resultStr;
1236             }
1237              
1238             sub GetWordVector
1239             {
1240 5     5 1 10 my ( $self, $searchWord, $returnRawSparseText ) = @_;
1241              
1242 5 50       11 $returnRawSparseText = 1 if defined( $returnRawSparseText );
1243 5 50       11 $returnRawSparseText = 0 if !defined( $returnRawSparseText );
1244              
1245             # Check(s)
1246 5 50 33     7 print( "Error: Dictionary Is Empty / No Vector Data In Memory\n" ) if ( $self->GetDebugLog() == 0 && $self->IsVectorDataInMemory() == 0 );
1247 5 50       9 $self->WriteLog( "GetWordVector - Error: No Vector Data In Memory - Cannot Fetch Word Vector Data" ) if ( $self->IsVectorDataInMemory() == 0 );
1248 5 50       6 return undef if ( $self->IsVectorDataInMemory() == 0 );
1249              
1250 0         0 my $wordVectorData = $self->GetVocabularyHash->{ $searchWord };
1251              
1252 0 0       0 $self->WriteLog( "GetWordVector - Warning: \"$searchWord\" Not Found In Dictionary" ) if !defined( $wordVectorData );
1253              
1254 0 0       0 return undef if !defined( $wordVectorData );
1255              
1256 0         0 my $returnStr = "";
1257              
1258             # Convert Sparse Format To Regular Format
1259 0 0       0 if( $self->GetSparseVectorMode() == 1 )
1260             {
1261 0 0       0 if( $returnRawSparseText == 1 )
1262             {
1263 0         0 return $searchWord . " " . $wordVectorData;
1264             }
1265              
1266 0         0 my $vectorSize = $self->GetVectorLength();
1267 0         0 my @data = split( ' ', $wordVectorData );
1268              
1269             # Make Array Of Vector Size With All Zeros
1270 0 0       0 my @wordVector = ( "0.000000" ) x $vectorSize if ( $vectorSize != 0 );
1271              
1272 0         0 for( my $i = 0; $i < @data; $i++ )
1273             {
1274             # If The Index ($i) Is Even, Then The Element Is An Index
1275 0 0       0 my $index = $data[$i] if ( $i % 2 == 0 );
1276              
1277             # If The Index Is Defined, Then Next Element Is An Index Element
1278 0 0       0 my $element = $data[$i+1] if defined( $index );
1279              
1280             # Assign The Correct Index Element To The Specified Index
1281 0 0 0     0 $wordVector[$index] = $element if defined( $index ) && defined( $element );
1282             }
1283              
1284             # Assign New Standard Format Word Vector To $returnStr
1285 0         0 $returnStr = $searchWord . " " . join( ' ', @wordVector );
1286              
1287             # Clear Array
1288 0         0 undef( @data );
1289 0         0 @data = ();
1290 0         0 undef( @wordVector );
1291 0         0 @wordVector = ();
1292             }
1293             else
1294             {
1295 0         0 $returnStr = $searchWord . " " . $wordVectorData;
1296             }
1297              
1298 0         0 return $returnStr;
1299             }
1300              
1301             sub IsVectorDataInMemory
1302             {
1303 37     37 1 33 my ( $self ) = @_;
1304              
1305 37         19 my $numberOfWordsInMemory = keys %{ $self->GetVocabularyHash() };
  37         37  
1306 37 50       48 return 1 if $numberOfWordsInMemory > 0;
1307              
1308 37         284 return 0;
1309             }
1310              
1311             sub IsVectorDataSorted
1312             {
1313 0     0 1 0 my ( $self, $aryRef ) = @_;
1314              
1315 0 0       0 my $vocabHashRef = $self->GetVocabularyHash() if !defined( $aryRef );
1316 0 0       0 $vocabHashRef = $aryRef if defined( $aryRef );
1317              
1318 0 0       0 $self->WriteLog( "IsVectorDataSorted - Error: No Vector Data In Memory" ) if ( keys %{ $vocabHashRef } == 0 );
  0         0  
1319 0 0       0 return -1 if ( keys %{ $vocabHashRef } == 0 );
  0         0  
1320              
1321 0         0 my $numOfWords = $self->GetNumberOfWords();
1322 0         0 my $vectorLength = $self->GetVectorLength();
1323              
1324 0 0 0     0 return 1 if defined( $vocabHashRef->{ $numOfWords } ) && $vocabHashRef->{ $numOfWords } eq "$vectorLength #\$\@RTED#";
1325 0         0 return 0;
1326             }
1327              
1328             sub CheckWord2VecDataFileType
1329             {
1330 3     3 1 4 my ( $self, $fileDir ) = @_;
1331              
1332             # Check(s)
1333 3 50       5 $self->WriteLog( "CheckWord2VecDataFileType - Error: File Path Not Defined" ) if !defined( $fileDir );
1334 3 50       12 return undef if !defined( $fileDir );
1335              
1336 3 100       35 $self->WriteLog( "CheckWord2VecDataFileType - Error: File Cannot Be Found / Does Not Exist" ) if !( -e $fileDir );
1337 3 100       31 return undef if !( -e $fileDir );
1338              
1339              
1340             # Check Word Vector File Format
1341 2         3 my $fileType = "";
1342 2         2 my $numOfWordVectors = 0;
1343 2         3 my $sizeOfVectors = 0;
1344 2         1 my $sparseVectorsFlag = 0;
1345              
1346 2 50       40 open( my $fh, "<:", "$fileDir" ) or $self->WriteLog( "CheckWord2VecDataFileType - Error Opening File : $!" );
1347              
1348 2         6 for( my $i = 0; $i < 2; $i++ )
1349             {
1350 4         36 my $data = <$fh>;
1351              
1352             # Store Number Of Word Vectors And Vector Size
1353 4 100       8 if( $i == 0 )
1354             {
1355 2         61 my @dimensionsAry = split( ' ', $data );
1356              
1357             # Fetch Number Of Word Vectors
1358 2 50       6 $numOfWordVectors = $dimensionsAry[0] if ( @dimensionsAry >= 2 );
1359              
1360             # Fetch Size Of Vectors
1361 2 50       4 $sizeOfVectors = $dimensionsAry[1] if ( @dimensionsAry >= 2 );
1362              
1363             # Skip First Line (First Line Is Always Plain Text Format)
1364 2         5 next;
1365             }
1366              
1367             # Check Second Line Of File To Determine Whether File Is Text Or Binary Format
1368 2         4 my $oldData = $data;
1369 2         8 my $newData = Encode::decode( "utf8", $data, Encode::FB_QUIET );
1370 2 50       43 $fileType = "text" if length( $oldData ) == length( $newData );
1371 2 50       15 $fileType = "binary" if length( $oldData ) != length( $newData );
1372              
1373             # Check Second Line For Sparse Vector
1374 2 50       5 my @dataAry = split( ' ', $oldData ) if defined( $oldData );
1375 2 50 33     7 $sparseVectorsFlag = 1 if defined( $oldData ) && ( @dataAry - 1 != $sizeOfVectors );
1376             }
1377              
1378             # Read A Couple Lines To Determine Whether Vectors Are 'Sparse' Or 'Full' Plain Vectors
1379 2 50       5 if( $fileType eq "text" )
1380             {
1381 2 50       3 my $checkLength = 50 if ( $numOfWordVectors > 50 );
1382 2 50       5 $checkLength = $numOfWordVectors if ( $numOfWordVectors < 50 );
1383              
1384             # Read Data From File To Check For Sparse Vectors
1385 2         6 for( my $i = 0; $i < $checkLength - 2; $i++ )
1386             {
1387 0         0 my $data = <$fh>;
1388 0 0       0 my @dataAry = split( ' ', $data ) if defined( $data );
1389 0 0 0     0 $sparseVectorsFlag = 1 if defined( $data ) && ( @dataAry - 1 != $sizeOfVectors );
1390             }
1391              
1392 2 50       4 $fileType = "sparsetext" if ( $sparseVectorsFlag == 1 );
1393             }
1394              
1395 2         10 close( $fh );
1396 2         4 undef( $fh );
1397              
1398 2         9 return $fileType;
1399             }
1400              
1401             sub ReadTrainedVectorDataFromFile
1402             {
1403 4     4 1 5 my ( $self, $fileDir ) = @_;
1404              
1405 4         10 $self->WriteLog( "ReadTrainedVectorDataFromFile - Reading File \"$fileDir\"" );
1406              
1407             # Check(s)
1408 4 50       7 $self->WriteLog( "ReadTrainedVectorDataFromFile - Error: Directory Not Defined" ) if !defined ( $fileDir );
1409 4 50       7 return -1 if !defined ( $fileDir );
1410              
1411 4 100       32 $self->WriteLog( "ReadTrainedVectorDataFromFile - Error: Directory/File Does Not Exist" ) if !( -e "$fileDir" );
1412 4 100       19 return -1 if !( -e "$fileDir" );
1413              
1414 1 50       9 $self->WriteLog( "ReadTrainedVectorDataFromFile - Error: Vector Data File Size = 0 bytes / File Contains No Data" ) if ( -z "$fileDir" );
1415 1 50       8 return -1 if ( -z "$fileDir" );
1416              
1417 0         0 my $numberOfWordsInMemory = $self->GetNumberOfWords();
1418 0 0       0 $self->WriteLog( "ReadTrainedVectorDataFromFile - Error: Module Already Contains Vector Training Data In Memory" ) if $numberOfWordsInMemory > 0;
1419 0 0       0 return -1 if $numberOfWordsInMemory > 0;
1420              
1421              
1422             # Check To See If File Data Is Binary Or Text
1423 0         0 my $fileType = $self->CheckWord2VecDataFileType( $fileDir );
1424              
1425             # Check
1426 0 0       0 $self->WriteLog( "ReadTrainedVectorDataFromFile - Error: Unable To Determine Vector Data Format" ) if !defined( $fileType );
1427 0 0       0 return -1 if !defined( $fileType );
1428              
1429 0 0       0 $self->WriteLog( "ReadTrainedVectorDataFromFile - Detected File Type As \"Plain Text Format\"" ) if $fileType eq "text" ;
1430 0 0       0 $self->WriteLog( "ReadTrainedVectorDataFromFile - Detected File Type As \"Sparse Vector Text Format\"" ) if $fileType eq "sparsetext" ;
1431 0 0       0 $self->WriteLog( "ReadTrainedVectorDataFromFile - Detected File Type As \"Word2Vec Binary Format\"" ) if $fileType eq "binary" ;
1432              
1433 0 0       0 $self->WriteLog( "ReadTrainedVectorDataFromFile - Setting \"Sparse Vector Mode\" = True" ) if $fileType eq "sparsetext" ;
1434 0 0       0 $self->SetSparseVectorMode( 1 ) if $fileType eq "sparsetext";
1435 0 0       0 $self->SetSparseVectorMode( 0 ) if $fileType ne "sparsetext";
1436              
1437 0         0 $self->WriteLog( "ReadTrainedVectorDataFromFile - Reading Data" );
1438              
1439              
1440             # Read Trained Vector Data From File To Memory
1441 0         0 my $fileHandle;
1442              
1443             # Read Plain Text Data Format From File
1444 0 0       0 if ( $fileType eq "text" )
    0          
    0          
1445             {
1446 0         0 my $lineCount = 0;
1447 0         0 open( $fileHandle, '<:encoding(UTF-8)', "$fileDir" );
1448              
1449 0         0 while( my $row = <$fileHandle> )
1450             {
1451 0         0 chomp $row;
1452 0         0 $row = lc( $row );
1453              
1454 0 0       0 if( $lineCount == 0 )
1455             {
1456 0         0 my @data = split( ' ', $row );
1457              
1458 0 0       0 $self->WriteLog( "ReadTrainedVectorDataFromFile - Error: File Does Not Contain Header Information / NumOfWords & VectorLength" ) if ( @data < 2 );
1459 0 0       0 close( $fileHandle ) if ( @data < 2 );
1460 0 0       0 return -1 if ( @data < 2 );
1461              
1462 0         0 $self->SetNumberOfWords( $data[0] );
1463 0         0 $self->SetVectorLength( $data[1] );
1464             }
1465              
1466 0         0 $self->AddWordVectorToVocabHash( $row );
1467 0         0 $lineCount++;
1468             }
1469              
1470 0         0 close( $fileHandle );
1471             }
1472             # Read Spare Text Format From File
1473             elsif( $fileType eq "sparsetext" )
1474             {
1475 0         0 my $lineCount = 0;
1476 0         0 my $numOfWordVectors = 0;
1477 0         0 my $vectorSize = 0;
1478              
1479 0         0 open( $fileHandle, '<:encoding(UTF-8)', "$fileDir" );
1480              
1481 0         0 while( my $row = <$fileHandle> )
1482             {
1483 0         0 chomp $row;
1484              
1485             # Skip First Line ( First Line Holds Number Of Word Vectors And Vector Size / Is Always Even )
1486 0 0       0 if( $lineCount == 0 )
    0          
1487             {
1488 0         0 my @data = split( ' ', $row );
1489              
1490 0 0       0 if( @data >= 2 )
1491             {
1492 0         0 $numOfWordVectors = $data[0];
1493 0         0 $vectorSize = $data[1] - 1;
1494              
1495 0         0 $self->SetNumberOfWords( $numOfWordVectors );
1496 0         0 $self->SetVectorLength( $vectorSize + 1 );
1497             }
1498              
1499             }
1500             elsif( $lineCount > 0 )
1501             {
1502 0         0 my @data = split( ' ', $row );
1503              
1504             # If Array Size Is Odd, Then Error Out
1505             # Explanation: ie. - $dataAry[1] = "heart 1 0.002323 4 0.124342 16 0.005610 17"
1506             # There Are Four Indices And Three Index Elements, There Should Be
1507             # One Index Per Index Element. A Proper Sparse Vector Should Look As Follows.
1508             # ie. - $dataAry[1] = "heart 1 0.002323 4 0.124342 16 0.005610 17 0.846613"
1509             # With The Word Included In The Word Vector, The Vector Size Should Always
1510             # Be Odd By Nature.
1511             #
1512 0 0 0     0 if ( @data > 2 && @data % 2 == 0 )
1513             {
1514 0         0 $self->WriteLog( "ReadTrainedVectorDataFromFile - Error: Improper Sparse Vector Format - Index/Index Element Number Mis-Match" );
1515 0         0 $self->WriteLog( "ReadTrainedVectorDataFromFile - Occured At Line #$lineCount: \"$row\"" );
1516 0         0 $self->WriteLog( "ReadTrainedVectorDataFromFile - Clearing Vocabulary Array" );
1517 0         0 $self->ClearVocabularyHash();
1518 0         0 return -1;
1519             }
1520              
1521             # Fetch String Word In First Element
1522 0 0       0 $self->WriteLog( "ReadTrainedVectorDataFromFile - Error: First Element Of Data Array (Word) Not Defined - Line: $lineCount" ) if !defined( $data[0] );
1523 0 0       0 return -1 if !defined( $data[0] );
1524              
1525             # Clear Array
1526 0         0 @data = ();
1527             }
1528              
1529 0         0 $self->AddWordVectorToVocabHash( $row );
1530              
1531 0         0 $lineCount++;
1532             }
1533              
1534 0         0 close( $fileHandle );
1535             }
1536             # Read Word2Vec Binary Data Format From File
1537             elsif( $fileType eq "binary" )
1538             {
1539 0         0 open( $fileHandle, "$fileDir" );
1540 0         0 binmode $fileHandle;
1541              
1542 0         0 my $buffer = "";
1543 0         0 my $word = "";
1544 0         0 my $wordVectorData = "";
1545              
1546             # Fetch "Number Of Words" and "Word Vector Size" From First Line
1547 0         0 my $row = <$fileHandle>;
1548 0         0 chomp( $row );
1549 0         0 my @strAry = split( ' ', $row );
1550              
1551             # Check(s)
1552 0 0       0 return if @strAry < 2;
1553              
1554              
1555 0         0 my $wordCount = $strAry[0];
1556 0         0 my $wordSize = $strAry[1];
1557 0         0 my $count = 1;
1558 0         0 $word = "";
1559              
1560 0         0 $self->SetNumberOfWords( $wordCount );
1561 0         0 $self->SetVectorLength( $wordSize );
1562              
1563             # Add Word Count & Word Vector Size To Memory
1564 0         0 $self->AddWordVectorToVocabHash( "$row" );
1565              
1566             # Begin Fetching Data From File
1567 0         0 while( $count < $wordCount + 1 )
1568             {
1569 0         0 my $cont = 1;
1570              
1571             # Fetch Word
1572 0         0 while( $cont == 1 )
1573             {
1574             # Fetch Word
1575 0         0 chomp( $buffer = getc( $fileHandle ) );
1576 0 0 0     0 $word .= $buffer if $buffer ne " " && defined( $buffer );
1577              
1578             # Check(s)
1579 0 0       0 $cont = 0 if eof;
1580 0 0       0 $cont = 0 if $buffer eq " ";
1581 0 0       0 $self->WriteLog( "ReadTrainedVectorDataFromFile - ERROR: Unexpectedly Reached End Of File" ) if eof;
1582 0 0       0 $self->WriteLog( " Expected Word Count / Vector Size") if eof;
1583 0 0       0 $self->WriteLog( " $wordCount / $wordSize" ) if eof;
1584 0 0       0 $self->WriteLog( " Current Word Count" ) if eof;
1585 0 0       0 $self->WriteLog( " $count" ) if eof;
1586 0 0       0 $count = $wordCount + 1 if eof;
1587 0 0       0 next if eof;
1588             }
1589              
1590             # Fetch Word Vector Float Values
1591 0         0 for( my $i = 0; $i < $wordSize; $i++ )
1592             {
1593             # Read Specified Bytes Amount From File
1594 0         0 read( $fileHandle, $buffer, 4 ); # Assumes size of floating point is 4 bytes
1595 0         0 chomp( $buffer );
1596              
1597             # Check(s)
1598 0 0 0     0 $i = $wordSize + 1 if !defined( $buffer ) || $buffer eq 0;
1599 0 0 0     0 next if !defined( $buffer ) || $buffer eq 0;
1600              
1601 0 0 0     0 if( defined( $buffer ) && $buffer ne "" )
1602             {
1603             # Convert Binary Values To Float
1604 0         0 $buffer = unpack( "f", $buffer ); # Unpacks/convert 4 byte string to floating point
1605 0         0 $wordVectorData .= ( " " . sprintf( "%.6f", $buffer ) ); # Round Decimal At Sixth Place
1606             }
1607             }
1608              
1609             # Word Vector = Word + WordVectorData
1610 0         0 $word .= $wordVectorData;
1611              
1612             # Add Word Vector To Memory
1613 0 0       0 $self->AddWordVectorToVocabHash( $word ) if $word ne "";
1614              
1615             # Clear Variables
1616 0         0 $word = "";
1617 0         0 $wordVectorData = "";
1618 0         0 $buffer = "";
1619              
1620 0         0 $count++;
1621             }
1622              
1623 0         0 close( $fileHandle );
1624             }
1625              
1626 0 0       0 my $numberOfWords = keys %{ $self->GetVocabularyHash() } if defined( $self->GetVocabularyHash() );
  0         0  
1627 0 0       0 $numberOfWords = 0 if !defined( $numberOfWords );
1628 0         0 $self->WriteLog( "ReadTrainedVectorDataFromFile - Reading Data Complete" );
1629 0         0 $self->WriteLog( "ReadTrainedVectorDataFromFile - $numberOfWords Word Vectors Stored In Memory" );
1630              
1631 0         0 return 0;
1632             }
1633              
1634             sub SaveTrainedVectorDataToFile
1635             {
1636 3     3 1 4 my ( $self, $savePath, $saveFormat ) = @_;
1637              
1638             # Check(s)
1639 3 50       5 $self->WriteLog( "SaveTrainedVectorDataToFile - Error: No Save Path Defined" ) if !defined( $savePath );
1640 3 50       5 return -1 if !defined ( $savePath );
1641              
1642 3 50       6 $saveFormat = 0 if !defined ( $saveFormat );
1643              
1644             # Save Data To File
1645 3         2 my $fileHandle;
1646              
1647             # Save Vector Data In Plain Text Format
1648 3 100       11 if ( $saveFormat == 0 )
    100          
    50          
1649             {
1650 1         4 $self->WriteLog( "SaveTrainedVectorDataToFile - Saving Word2Vec Data To Text File: \"$savePath\"" );
1651              
1652 1 50       51 open( $fileHandle, ">:encoding(utf8)", "$savePath" ) or return -1;
1653 1         34 my $vocabHashRef = $self->GetVocabularyHash();
1654 1         2 my @dataAry = sort( keys %{ $vocabHashRef } );
  1         3  
1655              
1656 1 50       3 if( $self->GetSparseVectorMode() == 1 )
1657             {
1658 0         0 my $numOfWords = $self->GetNumberOfWords();
1659 0         0 my $vectorSize = $self->GetVectorLength();
1660              
1661 0         0 for( my $i = 0; $i < @dataAry; $i++ )
1662             {
1663 0         0 my $wordVectorData = $dataAry[$i] . " " . $vocabHashRef->{ $dataAry[$i] };
1664              
1665             # Check(s)
1666 0 0       0 $self->WriteLog( "SaveTrainedVectorDataToFile - Warning: Word Vector Contains No Data / Empty String - Line: $i" ) if ( $wordVectorData eq "" );
1667 0 0       0 next if ( $wordVectorData eq "" );
1668              
1669 0 0       0 if( $i == 0 )
1670             {
1671 0         0 print( $fileHandle "$wordVectorData\n" )
1672             }
1673             else
1674             {
1675 0         0 my @data = split( ' ', $wordVectorData );
1676              
1677             # Get Word
1678 0         0 my $word = $data[0];
1679              
1680             # Make Array Of Vector Size With All Zeros
1681 0 0       0 my @wordVector = ( "0.000000" ) x $vectorSize if ( $vectorSize != 0 );
1682              
1683 0         0 for( my $j = 1; $j < @data; $j++ )
1684             {
1685             # If The Index ($i) Is Odd, Then The Element Is An Index
1686 0 0       0 my $index = $data[$j] if ( $j % 2 == 1 );
1687              
1688             # If The Index Is Defined, Then Next Element Is An Index Element
1689 0 0       0 my $element = $data[$j+1] if defined( $index );
1690              
1691             # Assign The Correct Index Element To The Specified Index
1692 0 0 0     0 $wordVector[$index] = $element if defined( $index ) && defined( $element );
1693             }
1694              
1695             # Generate Regular Formatted Word Vector
1696 0         0 $word = $word . " " . join( ' ', @wordVector );
1697              
1698             # Print Dictionary/Vocabulary Vector Data To File
1699 0         0 print( $fileHandle "$word \n" );
1700              
1701             # Clear Array
1702 0         0 @data = ();
1703 0         0 @wordVector = ();
1704             }
1705             }
1706             }
1707             else
1708             {
1709             # Get Number Of Word Vectors and Vector Array Size
1710 1         3 my $numOfWords = $self->GetNumberOfWords();
1711 1         2 my $vectorSize = $self->GetVectorLength();
1712              
1713             # Print Dictionary/Vocabulary Vector Data To File
1714 1         5 for( my $i = 0; $i < @dataAry; $i++ )
1715             {
1716 0         0 my $data = $dataAry[$i] . " " . $vocabHashRef->{ $dataAry[$i] };
1717 0 0       0 print( $fileHandle "$data\n" ) if ( $i == 0 );
1718 0 0       0 print( $fileHandle "$data \n" ) if ( $i > 0 );
1719             }
1720             }
1721              
1722 1         7 close( $fileHandle );
1723 1         3 undef( $fileHandle );
1724              
1725 1         2 $self->WriteLog( "SaveTrainedVectorDataToFile - File Saved" );
1726             }
1727             # Save Vector Data In Word2Vec Binary Format
1728             elsif ( $saveFormat == 1 )
1729             {
1730 1         4 $self->WriteLog( "SaveTrainedVectorDataToFile - Saving Word2Vec Data To Binary File: \"$savePath\"" );
1731              
1732             # Get Vocabulary and Vector Sizes
1733 1         3 my $vocabHashRef = $self->GetVocabularyHash();
1734 1         2 my @dataAry = sort( keys %{ $vocabHashRef } );
  1         4  
1735              
1736             # Check(s)
1737 1 50       4 $self->WriteLog( "SaveTrainedVectorDataToFile - Error: No Word2Vec Vector Data In Memory / Vocabulary Size == 0" ) if @dataAry == 0;
1738 1 50       5 return -1 if @dataAry == 0;
1739              
1740 0 0       0 open( $fileHandle, ">:raw", "$savePath" ) or return -1;
1741 0         0 binmode( $fileHandle ); # Not necessary as ":raw" implies binmode.
1742              
1743 0         0 my $headerStr = $dataAry[0] . " " . $vocabHashRef->{ $dataAry[0] };
1744 0         0 my @headerAry = split( ' ', $headerStr );
1745 0 0       0 return -1 if ( @headerAry < 2 );
1746              
1747 0         0 my $numOfWords = $headerAry[0];
1748 0         0 my $windowSize = $headerAry[1];
1749 0         0 @headerAry = ();
1750 0         0 undef( @headerAry );
1751              
1752             # Print Vocabulary and Windows Sizes To File With Line Feed
1753 0         0 print( $fileHandle "$headerStr\n" );
1754              
1755             # Print Word2Vec Vocabulary and Vector Data To File With Line Feed(s)
1756 0         0 for( my $i = 0; $i < @dataAry; $i++ )
1757             {
1758 0         0 my $data = $dataAry[$i] . " " . $vocabHashRef->{ $dataAry[$i] };
1759              
1760             # Check(s)
1761 0 0       0 next if ( $i == 0 );
1762              
1763             # Convert Sparse Vector Data To Dense Vector Format
1764 0 0       0 if ( $self->GetSparseVectorMode() == 1 )
1765             {
1766 0         0 my @tempAry = split( ' ', $data );
1767 0         0 my $word = $tempAry[0];
1768 0         0 @tempAry = ();
1769 0         0 @tempAry = @{ $self->ConvertRawSparseTextToVectorDataAry( $data ) };
  0         0  
1770 0         0 $data = "$word " . join( ' ', @tempAry );
1771 0         0 undef( @tempAry );
1772             }
1773              
1774 0         0 my @ary = split( ' ', $data );
1775 0 0       0 next if @ary < $windowSize;
1776              
1777             # Separate "Word" From "Vector Data"
1778 0         0 my $word = shift( @ary ) . " ";
1779 0         0 my $arySize = @ary;
1780              
1781             # Print Word To File
1782 0         0 print( $fileHandle $word );
1783              
1784             # Print Word Vector Data To File
1785 0         0 for my $value ( @ary )
1786             {
1787 0         0 print( $fileHandle pack( 'f', $value ) ); # Packs String Data In Decimal Binary Format
1788             }
1789              
1790             # Add Line Feed To End Of Word + Vector Data
1791 0         0 print( $fileHandle "\n" );
1792             }
1793              
1794 0         0 close( $fileHandle );
1795 0         0 undef( $fileHandle );
1796              
1797 0         0 $self->WriteLog( "SaveTrainedVectorDataToFile - File Saved" );
1798             }
1799             # Save Vectors In Sparse Vector Format
1800             elsif ( $saveFormat == 2 )
1801             {
1802 1         9 $self->WriteLog( "SaveTrainedVectorDataToFile - Saving Word2Vec Data To Sparse Text File: \"$savePath\"" );
1803              
1804 1 50       48 open( $fileHandle, ">:encoding(utf8)", "$savePath" ) or return -1;
1805 1         33 my $vocabHashRef = $self->GetVocabularyHash();
1806 1         1 my @dataAry = sort( keys( %{ $vocabHashRef } ) );
  1         4  
1807              
1808 1 50       3 if( $self->GetSparseVectorMode() == 1 )
1809             {
1810 0         0 for my $data ( @dataAry )
1811             {
1812 0         0 print( $fileHandle $data . " " . $vocabHashRef->{ $data } . "\n" );
1813             }
1814             }
1815             else
1816             {
1817             # Get Number Of Word Vectors and Vector Array Size
1818 1         2 my $numOfWords = $self->GetNumberOfWords();
1819 1         3 my $vectorSize = $self->GetVectorLength();
1820              
1821             # Print Dictionary/Vocabulary Vector Data To File
1822 1         5 for( my $i = 0; $i < @dataAry; $i++ )
1823             {
1824 0         0 my $data = $dataAry[$i] . " " . $vocabHashRef->{ $dataAry[$i] };
1825 0 0       0 print( $fileHandle "$data\n" ) if ( $i == 0 );
1826              
1827 0 0 0     0 if( $i > 0 && defined( $data ) )
1828             {
1829 0         0 my @wordAry = split( ' ', $data );
1830              
1831 0         0 my $word = $wordAry[0];
1832              
1833             # Print The Vector Word To The File
1834 0         0 print( $fileHandle "$word" );
1835              
1836             # Print Vector Data To File
1837 0         0 for( my $j = 1; $j < @wordAry; $j++ )
1838             {
1839 0         0 my $index = $j - 1;
1840 0         0 my $value = $wordAry[$j];
1841 0 0       0 print( $fileHandle " $index $value" ) if ( $value != 0 );
1842             }
1843              
1844 0         0 print( $fileHandle " \n" );
1845             }
1846             }
1847             }
1848              
1849 1         7 close( $fileHandle );
1850 1         2 undef( $fileHandle );
1851              
1852 1         2 $self->WriteLog( "SaveTrainedVectorDataToFile - File Saved" );
1853             }
1854              
1855 2         7 return 0;
1856             }
1857              
1858             sub StringsAreEqual
1859             {
1860 2     2 1 5 my ( $self , $strA, $strB ) = @_;
1861              
1862 2         5 $strA = lc( $strA );
1863 2         3 $strB = lc( $strB );
1864              
1865 2 100       10 return 0 if length( $strA ) != length( $strB );
1866 1 50       8 return 0 if index( $strA, $strB ) != 0;
1867              
1868 1         4 return 1;
1869             }
1870              
1871             sub RemoveWordFromWordVectorString
1872             {
1873 3     3 0 11 my ( $self, $dataStr ) = @_;
1874              
1875             # Check(s)
1876 3 50       9 return undef if !defined( $dataStr );
1877              
1878 0         0 my @tempAry = split( ' ', $dataStr, 2 );
1879 0         0 $dataStr = $tempAry[1];
1880              
1881 0         0 undef( @tempAry );
1882              
1883 0         0 return $dataStr;
1884             }
1885              
1886             sub ConvertRawSparseTextToVectorDataAry
1887             {
1888 1     1 1 2 my ( $self, $rawSparseText ) = @_;
1889              
1890             # Check(s)
1891 1 50       5 $self->WriteLog( "ConvertRawSparseTextToVectorDataAry - Error: No Sparse Text Defined" ) if !defined( $rawSparseText );
1892 1 50       3 return () if !defined( $rawSparseText );
1893              
1894 1 50       5 $self->WriteLog( "ConvertRawSparseTextToVectorDataAry - Error: Sparse Text String Empty" ) if ( $rawSparseText eq "" );
1895 1 50       4 return () if ( $rawSparseText eq "" );
1896              
1897 1         4 my $vectorSize = $self->GetVectorLength();
1898              
1899 1 50       5 $self->WriteLog( "ConvertRawSparseTextToVectorDataAry - Error: Vector Size == 0" ) if ( $vectorSize == 0 );
1900 1 50       59 return () if ( $vectorSize == 0 );
1901              
1902             # Begin Data Conversion
1903 0         0 my @data = split( ' ', $rawSparseText );
1904              
1905             # Make Array Of Vector Size With All Zeros
1906 0         0 my @wordVector = ( "0.000000" ) x $vectorSize;
1907              
1908 0         0 for( my $i = 0; $i < @data; $i++ )
1909             {
1910             # Skip First Element / First Element Contains Word
1911 0 0       0 next if $i == 0;
1912              
1913             # If The Index ($i) Is Odd, Then The Element Is An Index
1914 0 0       0 my $index = $data[$i] if ( $i % 2 == 1 );
1915              
1916             # If The Index Is Defined, Then Next Element Is An Index Element
1917 0 0       0 my $element = $data[$i+1] if defined( $index );
1918              
1919             # Assign The Correct Index Element To The Specified Index
1920 0 0 0     0 $wordVector[$index] = $element if defined( $index ) && defined( $element );
1921             }
1922              
1923             # Clear Data
1924 0         0 undef( @data );
1925 0         0 @data = ();
1926 0         0 $rawSparseText = undef;
1927              
1928 0         0 return \@wordVector;
1929             }
1930              
1931             sub ConvertRawSparseTextToVectorDataHash
1932             {
1933 0     0 1 0 my ( $self, $rawSparseText ) = @_;
1934              
1935             # Check(s)
1936 0 0       0 $self->WriteLog( "ConvertRawSparseTextToVectorDataAry - Error: No Sparse Text Defined" ) if !defined( $rawSparseText );
1937 0 0       0 return () if !defined( $rawSparseText );
1938              
1939 0 0       0 $self->WriteLog( "ConvertRawSparseTextToVectorDataAry - Error: Sparse Text String Empty" ) if ( $rawSparseText eq "" );
1940 0 0       0 return () if ( $rawSparseText eq "" );
1941              
1942 0         0 my $vectorSize = $self->GetVectorLength();
1943              
1944 0 0       0 $self->WriteLog( "ConvertRawSparseTextToVectorDataAry - Error: Vector Size == 0" ) if ( $vectorSize == 0 );
1945 0 0       0 return () if ( $vectorSize == 0 );
1946              
1947             # Begin Data Conversion
1948 0         0 my @data = split( ' ', $rawSparseText );
1949              
1950 0         0 my %wordHash;
1951              
1952 0         0 for( my $i = 0; $i < @data; $i++ )
1953             {
1954             # Skip First Element / First Element Contains Word
1955 0 0       0 next if $i == 0;
1956              
1957             # If The Index ($i) Is Odd, Then The Element Is An Index
1958 0 0       0 my $index = $data[$i] if ( $i % 2 == 1 );
1959              
1960             # If The Index Is Defined, Then Next Element Is An Index Element
1961 0 0       0 my $element = $data[$i+1] if defined( $index );
1962              
1963             # Assign The Correct Index Element To The Specified Index
1964 0 0 0     0 $wordHash{$index} = $element if defined( $index ) && defined( $element );
1965             }
1966              
1967             # Clear Data
1968 0         0 undef( @data );
1969 0         0 @data = ();
1970 0         0 $rawSparseText = undef;
1971              
1972 0         0 return \%wordHash;
1973             }
1974              
1975             sub GetOSType
1976             {
1977 2     2 1 3 my ( $self ) = @_;
1978 2         8 return $^O;
1979             }
1980              
1981              
1982             ######################################################################################
1983             # Accessors
1984             ######################################################################################
1985              
1986             sub GetDebugLog
1987             {
1988 60     60 1 356 my ( $self ) = @_;
1989 60 50       93 $self->{ _debugLog } = 0 if !defined ( $self->{ _debugLog } );
1990 60         131 return $self->{ _debugLog };
1991             }
1992              
1993             sub GetWriteLog
1994             {
1995 50     50 1 46 my ( $self ) = @_;
1996 50 50       69 $self->{ _writeLog } = 0 if !defined ( $self->{ _writeLog } );
1997 50         87 return $self->{ _writeLog };
1998             }
1999              
2000             sub GetFileHandle
2001             {
2002 1     1 1 1 my ( $self ) = @_;
2003 1 50       6 $self->{ _fileHandle } = undef if !defined ( $self->{ _fileHandle } );
2004 1         3 return $self->{ _fileHandle };
2005             }
2006              
2007             sub GetTrainFilePath
2008             {
2009 2     2 1 3 my ( $self ) = @_;
2010 2 50       6 $self->{ _trainFileName } = "" if !defined ( $self->{ _trainFileName } );
2011 2         5 return $self->{ _trainFileName };
2012             }
2013              
2014             sub GetOutputFilePath
2015             {
2016 2     2 1 3 my ( $self ) = @_;
2017 2 50       7 $self->{ _outputFileName } = "" if !defined ( $self->{ _outputFileName } );
2018 2         8 return $self->{ _outputFileName };
2019             }
2020              
2021             sub GetWordVecSize
2022             {
2023 2     2 1 3 my ( $self ) = @_;
2024 2 50       6 $self->{ _wordVecSize } = 100 if !defined ( $self->{ _wordVecSize } );
2025 2         6 return $self->{ _wordVecSize };
2026             }
2027              
2028             sub GetWindowSize
2029             {
2030 2     2 1 2 my ( $self ) = @_;
2031 2 50       5 $self->{ _windowSize } = 5 if !defined ( $self->{ _windowSize } );
2032 2         5 return $self->{ _windowSize };
2033             }
2034              
2035             sub GetSample
2036             {
2037 2     2 1 3 my ( $self ) = @_;
2038 2 50       6 $self->{ _sample } = 0.001 if !defined ( $self->{ _sample } );
2039 2         6 return $self->{ _sample };
2040             }
2041              
2042             sub GetHSoftMax
2043             {
2044 2     2 1 3 my ( $self ) = @_;
2045 2 50       5 $self->{ _hSoftMax } = 0 if !defined ( $self->{ _hSoftMax } );
2046 2         6 return $self->{ _hSoftMax };
2047             }
2048              
2049             sub GetNegative
2050             {
2051 2     2 1 3 my ( $self ) = @_;
2052 2 50       5 $self->{ _negative } = 5 if !defined ( $self->{ _negative } );
2053 2         7 return $self->{ _negative };
2054             }
2055              
2056             sub GetNumOfThreads
2057             {
2058 2     2 1 4 my ( $self ) = @_;
2059 2 50       12 $self->{ _numOfThreads } = 12 if !defined ( $self->{ _numOfThreads } );
2060 2         8 return $self->{ _numOfThreads };
2061             }
2062              
2063             sub GetNumOfIterations
2064             {
2065 2     2 1 2 my ( $self ) = @_;
2066 2 50       7 $self->{ _numOfIterations } = 5 if !defined ( $self->{ _numOfIterations } );
2067 2         4 return $self->{ _numOfIterations };
2068             }
2069              
2070             sub GetMinCount
2071             {
2072 2     2 1 3 my ( $self ) = @_;
2073 2 50       5 $self->{ _minCount } = 5 if !defined ( $self->{ _minCount } );
2074 2         8 return $self->{ _minCount };
2075             }
2076              
2077             sub GetAlpha
2078             {
2079 3     3 1 4 my ( $self ) = @_;
2080 3 50 33     10 $self->{ _alpha } = 0.05 if ( !defined ( $self->{ _alpha } ) && $self->GetUseCBOW() == 1 );
2081 3 50 33     7 $self->{ _alpha } = 0.025 if ( !defined ( $self->{ _alpha } ) && $self->GetUseCBOW() == 0 );
2082 3         7 return $self->{ _alpha };
2083             }
2084              
2085             sub GetClasses
2086             {
2087 2     2 1 158 my ( $self ) = @_;
2088 2 50       5 $self->{ _classes } = 0 if !defined ( $self->{ _classes } );
2089 2         5 return $self->{ _classes };
2090             }
2091              
2092             sub GetDebugTraining
2093             {
2094 2     2 1 3 my ( $self ) = @_;
2095 2 50       5 $self->{ _debug } = 2 if !defined ( $self->{ _debug } );
2096 2         5 return $self->{ _debug };
2097             }
2098              
2099             sub GetBinaryOutput
2100             {
2101 2     2 1 3 my ( $self ) = @_;
2102 2 50       4 $self->{ _binaryOutput } = 1 if !defined ( $self->{ _binaryOutput } );
2103 2         6 return $self->{ _binaryOutput };
2104             }
2105              
2106             sub GetSaveVocabFilePath
2107             {
2108 2     2 1 2 my ( $self ) = @_;
2109 2 50       14 $self->{ _saveVocab } = "" if !defined ( $self->{ _saveVocab } );
2110 2         6 return $self->{ _saveVocab };
2111             }
2112              
2113             sub GetReadVocabFilePath
2114             {
2115 2     2 1 3 my ( $self ) = @_;
2116 2 50       5 $self->{ _readVocab } = "" if !defined ( $self->{ _readVocab } );
2117 2         7 return $self->{ _readVocab };
2118             }
2119              
2120             sub GetUseCBOW
2121             {
2122 2     2 1 4 my ( $self ) = @_;
2123 2 50       6 $self->{ _useCBOW } = 1 if !defined ( $self->{ _useCBOW } );
2124 2         6 return $self->{ _useCBOW };
2125             }
2126              
2127             sub GetWorkingDir
2128             {
2129 5     5 1 5 my ( $self ) = @_;
2130 5 50       10 $self->{ _workingDir } = Cwd::getcwd() if !defined ( $self->{ _workingDir } );
2131 5         22 return $self->{ _workingDir };
2132             }
2133              
2134             sub GetWord2VecExeDir
2135             {
2136 5     5 1 8 my ( $self ) = @_;
2137 5 50       9 $self->{ _word2VecExeDir } = "" if !defined( $self->{ _word2VecExeDir } );
2138 5         12 return $self->{ _word2VecExeDir };
2139             }
2140              
2141             sub GetVocabularyHash
2142             {
2143 42     42 1 33 my ( $self ) = @_;
2144 42 50       66 $self->{ _hashRefOfWordVectors } = undef if !defined ( $self->{ _hashRefOfWordVectors } );
2145 42         55 return $self->{ _hashRefOfWordVectors };
2146             }
2147              
2148             sub GetOverwriteOldFile
2149             {
2150 4     4 1 4 my ( $self ) = @_;
2151 4 50       8 $self->{ _overwriteOldFile } = 0 if !defined ( $self->{ _overwriteOldFile } );
2152 4         6 return $self->{ _overwriteOldFile };
2153             }
2154              
2155             sub GetSparseVectorMode
2156             {
2157 4     4 0 3 my ( $self ) = @_;
2158 4 50       8 $self->{ _sparseVectorMode } = 0 if !defined ( $self->{ _sparseVectorMode } );
2159 4         10 return $self->{ _sparseVectorMode };
2160             }
2161              
2162             sub GetVectorLength
2163             {
2164 5     5 0 6 my ( $self ) = @_;
2165 5 50       12 $self->{ _vectorLength } = 0 if !defined ( $self->{ _vectorLength } );
2166 5         10 return $self->{ _vectorLength };
2167             }
2168              
2169             sub GetNumberOfWords
2170             {
2171 4     4 0 5 my ( $self ) = @_;
2172 4 50       8 $self->{ _numberOfWords } = 0 if !defined ( $self->{ _numberOfWords } );
2173 4         7 return $self->{ _numberOfWords };
2174             }
2175              
2176             sub GetMinimizeMemoryUsage
2177             {
2178 2     2 0 3 my ( $self ) = @_;
2179 2 50       8 $self->{ _minimizeMemoryUsage } = 1 if !defined ( $self->{ _minimizeMemoryUsage } );
2180 2         5 return $self->{ _minimizeMemoryUsage };
2181             }
2182              
2183              
2184             ######################################################################################
2185             # Mutators
2186             ######################################################################################
2187              
2188             sub SetTrainFilePath
2189             {
2190 2     2 1 4 my ( $self, $str ) = @_;
2191 2         5 return $self->{ _trainFileName } = $str;
2192             }
2193              
2194             sub SetOutputFilePath
2195             {
2196 2     2 1 4 my ( $self, $str ) = @_;
2197 2         5 return $self->{ _outputFileName } = $str;
2198             }
2199              
2200             sub SetWordVecSize
2201             {
2202 2     2 1 4 my ( $self, $value ) = @_;
2203 2         3 return $self->{ _wordVecSize } = $value;
2204             }
2205              
2206             sub SetWindowSize
2207             {
2208 2     2 1 5 my ( $self, $value ) = @_;
2209 2         3 return $self->{ _windowSize } = $value;
2210             }
2211              
2212             sub SetSample
2213             {
2214 2     2 1 3 my ( $self, $value ) = @_;
2215 2         3 return $self->{ _sample } = $value;
2216             }
2217              
2218             sub SetHSoftMax
2219             {
2220 2     2 1 2 my ( $self, $value ) = @_;
2221 2         4 return $self->{ _hSoftMax } = $value;
2222             }
2223              
2224             sub SetNegative
2225             {
2226 2     2 1 3 my ( $self, $value ) = @_;
2227 2         3 return $self->{ _negative } = $value;
2228             }
2229              
2230             sub SetNumOfThreads
2231             {
2232 2     2 1 3 my ( $self, $value ) = @_;
2233 2         6 return $self->{ _numOfThreads } = $value;
2234             }
2235              
2236             sub SetNumOfIterations
2237             {
2238 2     2 1 3 my ( $self, $value ) = @_;
2239 2         4 return $self->{ _numOfIterations } = $value;
2240             }
2241              
2242             sub SetMinCount
2243             {
2244 2     2 1 3 my ( $self, $value ) = @_;
2245 2         3 return $self->{ _minCount } = $value;
2246             }
2247              
2248             sub SetAlpha
2249             {
2250 2     2 1 5 my ( $self, $value ) = @_;
2251 2         4 return $self->{ _alpha } = $value;
2252             }
2253              
2254             sub SetClasses
2255             {
2256 2     2 1 3 my ( $self, $value ) = @_;
2257 2         3 return $self->{ _classes } = $value;
2258             }
2259              
2260             sub SetDebugTraining
2261             {
2262 3     3 1 4 my ( $self, $value ) = @_;
2263 3         4 return $self->{ _debug } = $value;
2264             }
2265              
2266             sub SetBinaryOutput
2267             {
2268 1     1 1 1 my ( $self, $value ) = @_;
2269 1         2 return $self->{ _binaryOutput } = $value;
2270             }
2271              
2272             sub SetSaveVocabFilePath
2273             {
2274 2     2 1 2 my ( $self, $str ) = @_;
2275 2         3 return $self->{ _saveVocab } = $str;
2276             }
2277              
2278             sub SetReadVocabFilePath
2279             {
2280 2     2 1 3 my ( $self, $str ) = @_;
2281 2         2 return $self->{ _readVocab } = $str;
2282             }
2283              
2284             sub SetUseCBOW
2285             {
2286 2     2 1 3 my ( $self, $value ) = @_;
2287 2         3 return $self->{ _useCBOW } = $value;
2288             }
2289              
2290             sub SetWorkingDir
2291             {
2292 2     2 1 6 my ( $self, $dir ) = @_;
2293 2         3 return $self->{ _workingDir } = $dir;
2294             }
2295              
2296             sub SetWord2VecExeDir
2297             {
2298 2     2 1 8 my ( $self, $dir ) = @_;
2299 2         4 return $self->{ _word2VecExeDir } = $dir;
2300             }
2301              
2302             sub SetVocabularyHash
2303             {
2304 1     1 1 1 my ( $self, $ref ) = @_;
2305 1 50       4 return if !defined( $ref );
2306 1         2 return $self->{ _hashRefOfWordVectors } = $ref;
2307             }
2308              
2309             sub ClearVocabularyHash
2310             {
2311 4     4 1 5 my ( $self ) = @_;
2312              
2313 4         7 $self->SetNumberOfWords( 0 );
2314 4         6 $self->SetVectorLength( 0 );
2315              
2316 4         2 undef( %{ $self->{ _hashRefOfWordVectors } } );
  4         9  
2317              
2318 4         4 my %hash;
2319 4         7 return $self->{ _hashRefOfWordVectors } = \%hash;
2320             }
2321              
2322             sub AddWordVectorToVocabHash
2323             {
2324 0     0 1 0 my ( $self, $wordVectorStr ) = @_;
2325 0 0       0 return if !defined( $wordVectorStr );
2326 0         0 my @tempAry = split( ' ', $wordVectorStr, 2 );
2327              
2328             # Check(s)
2329 0 0       0 return if !defined( $self->{ _hashRefOfWordVectors } );
2330 0 0       0 return if ( @tempAry != 2 );
2331              
2332 0         0 $self->{ _hashRefOfWordVectors }->{ $tempAry[0] } = $tempAry[1];
2333             }
2334              
2335             sub SetOverwriteOldFile
2336             {
2337 1     1 1 2 my ( $self, $temp ) = @_;
2338 1         1 return $self->{ _overwriteOldFile } = $temp;
2339             }
2340              
2341             sub SetSparseVectorMode
2342             {
2343 2     2 0 2 my ( $self, $temp ) = @_;
2344 2         4 return $self->{ _sparseVectorMode } = $temp;
2345             }
2346              
2347             sub SetVectorLength
2348             {
2349 6     6 0 5 my ( $self, $temp ) = @_;
2350 6         6 return $self->{ _vectorLength } = $temp;
2351             }
2352              
2353             sub SetNumberOfWords
2354             {
2355 6     6 0 6 my ( $self, $temp ) = @_;
2356 6         8 return $self->{ _numberOfWords } = $temp;
2357             }
2358              
2359             sub SetMinimizeMemoryUsage
2360             {
2361 2     2 0 7 my ( $self, $temp ) = @_;
2362 2 100       7 $self->WriteLog( "SetMinimalMemoryUsage - Normal Memory Mode Enabled" ) if ( $temp == 0 );
2363 2 100       4 $self->WriteLog( "SetMinimalMemoryUsage - Low Memory Mode Enabled" ) if ( $temp == 1 );
2364 2         4 return $self->{ _minimizeMemoryUsage } = $temp;
2365             }
2366              
2367              
2368             ######################################################################################
2369             # Debug Functions
2370             ######################################################################################
2371              
2372             sub GetTime
2373             {
2374 1     1 1 1 my ( $self ) = @_;
2375 1         151 my( $sec, $min, $hour ) = localtime();
2376              
2377 1 50       16 if( $hour < 10 )
2378             {
2379 0         0 $hour = "0$hour";
2380             }
2381              
2382 1 50       3 if( $min < 10 )
2383             {
2384 1         4 $min = "0$min";
2385             }
2386              
2387 1 50       4 if( $sec < 10 )
2388             {
2389 0         0 $sec = "0$sec";
2390             }
2391              
2392 1         4 return "$hour:$min:$sec";
2393             }
2394              
2395             sub GetDate
2396             {
2397 1     1 1 27 my ( $self ) = @_;
2398 1         24 my ( $sec, $min, $hour, $mday, $mon, $year ) = localtime();
2399              
2400 1         3 $mon += 1;
2401 1         2 $year += 1900;
2402              
2403 1         5 return "$mon/$mday/$year";
2404             }
2405              
2406             sub WriteLog
2407             {
2408 47     47 1 40 my ( $self ) = shift;
2409 47         43 my $string = shift;
2410 47         45 my $printNewLine = shift;
2411              
2412 47 50       64 return if !defined ( $string );
2413 47 50       66 $printNewLine = 1 if !defined ( $printNewLine );
2414              
2415              
2416 47 50       57 if( $self->GetDebugLog() )
2417             {
2418 0 0       0 if( ref ( $self ) ne "Word2vec::Word2vec" )
2419             {
2420 0         0 print( GetDate() . " " . GetTime() . " - Word2vec: Cannot Call WriteLog() From Outside Module!\n" );
2421 0         0 return;
2422             }
2423              
2424 0 0       0 $string = "" if !defined ( $string );
2425 0         0 print GetDate() . " " . GetTime() . " - Word2vec::$string";
2426 0 0       0 print "\n" if( $printNewLine != 0 );
2427             }
2428              
2429 47 50       56 if( $self->GetWriteLog() )
2430             {
2431 0 0         if( ref ( $self ) ne "Word2vec::Word2vec" )
2432             {
2433 0           print( GetDate() . " " . GetTime() . " - Word2vec: Cannot Call WriteLog() From Outside Module!\n" );
2434 0           return;
2435             }
2436              
2437 0           my $fileHandle = $self->GetFileHandle();
2438              
2439 0 0         if( defined( $fileHandle ) )
2440             {
2441 0           print( $fileHandle GetDate() . " " . GetTime() . " - Word2vec::$string" );
2442 0 0         print( $fileHandle "\n" ) if( $printNewLine != 0 );
2443             }
2444             }
2445             }
2446              
2447             #################### All Modules Are To Output "1"(True) at EOF ######################
2448             1;
2449              
2450              
2451             =head1 NAME
2452              
2453             Word2vec::Word2vec - word2vec wrapper module.
2454              
2455             =head1 SYNOPSIS
2456              
2457             # Parameters: Enabled Debug Logging, Disabled Write Logging
2458             my $w2v = Word2vec::Word2vec->new( 1, 0 ); # Note: Specifiying no parameters implies default settings.
2459              
2460             $w2v->SetTrainFilePath( "textCorpus.txt" );
2461             $w2v->SetOutputFilePath( "vectors.bin" );
2462             $w2v->SetWordVecSize( 200 );
2463             $w2v->SetWindowSize( 8 );
2464             $w2v->SetSample( 0.0001 );
2465             $w2v->SetNegative( 25 );
2466             $w2v->SetHSoftMax( 0 );
2467             $w2v->SetBinaryOutput( 0 );
2468             $w2v->SetNumOfThreads( 20 );
2469             $w2v->SetNumOfIterations( 12 );
2470             $w2v->SetUseCBOW( 1 );
2471             $w2v->SetOverwriteOldFile( 0 );
2472              
2473             $w2v->ExecuteTraining();
2474              
2475             undef( $w2v );
2476              
2477             # or
2478              
2479             use Word2vec::Word2vec;
2480              
2481             my $w2v = Word2vec::Word2vec->new(); # Note: Specifying no parameters implies default settings.
2482              
2483             $w2v->ExecuteTraining( $trainFilePath, $outputFilePath, $vectorSize, $windowSize, $minCount, $sample, $negative,
2484             $alpha, $hs, $binary, $numOfThreads, $iterations, $useCBOW, $classes, $readVocab,
2485             $saveVocab, $debug, $overwrite );
2486              
2487             undef( $w2v );
2488              
2489             =head1 DESCRIPTION
2490              
2491             Word2vec::Word2vec is a word2vec package tool that trains text corpus data using the word2vec tool, provides multiple avenues for cosine
2492             similarity computation, manipulation of word vectors and conversion of word2vec's binary format to human readable text.
2493              
2494             =head2 Main Functions
2495              
2496             =head3 new
2497              
2498             Description:
2499              
2500             Returns a new "Word2vec::Word2vec" module object.
2501              
2502             Note: Specifying no parameters implies default options.
2503              
2504             Default Parameters:
2505             debugLog = 0
2506             writeLog = 0
2507             trainFileName = ""
2508             outputFileName = ""
2509             wordVecSize = 100
2510             sample = 5
2511             hSoftMax = 0
2512             negative = 5
2513             numOfThreads = 12
2514             numOfIterations = 5
2515             minCount = 5
2516             alpha = 0.05 (CBOW) or 0.025 (Skip-Gram)
2517             classes = 0
2518             debug = 2
2519             binaryOutput = 1
2520             saveVocab = ""
2521             readVocab = ""
2522             useCBOW = 1
2523             workingDir = Current Directory
2524             hashRefOfWordVectors = ()
2525             overwriteOldFile = 0
2526              
2527             Input:
2528              
2529             $debugLog -> Instructs module to print debug statements to the console. (1 = True / 0 = False)
2530             $writeLog -> Instructs module to print debug statements to a log file. (1 = True / 0 = False)
2531             $trainFileName -> Specifies the training text corpus file path. (String)
2532             $outputFileName -> Specifies the word2vec post training output file path. (String)
2533             $wordVecSize -> Specifies word2vec word vector parameter size.(Integer)
2534             $sample -> Specifies word2vec sample parameter value. (Integer)
2535             $hSoftMax -> Specifies word2vec HSoftMax parameter value. (Integer)
2536             $negative -> Specifies word2vec negative parameter value. (Integer)
2537             $numOfThreads -> Specifies word2vec number of threads parameter value. (Integer)
2538             $numOfIterations -> Specifies word2vec number of iterations parameter value. (Integer)
2539             $minCount -> Specifies word2vec min-count parameter value. (Integer)
2540             $alpha -> Specifies word2vec alpha parameter value. (Integer)
2541             $classes -> Specifies word2vec classes parameter value. (Integer)
2542             $debug -> Specifies word2vec debug training parameter value. (Integer: '0' = No Debug, '1' = Debug, '2' = Even more debug info)
2543             $binaryOutput -> Specifies word2vec binary output mode parameter value. (Integer: '1' = Binary, '0' = Plain Text)
2544             $saveVocab -> Specifies word2vec save vocabulary file path. (String)
2545             $readVocab -> Specifies word2vec read vocabulary file path. (String)
2546             $useCBOW -> Specifies word2vec CBOW algorithm parameter value. (Integer: '1' = CBOW, '0' = Skip-Gram)
2547             $workingDir -> Specifies module working directory. (String)
2548             $hashRefOfWordVectors -> Storage location for loaded word2vec trained vector data file in memory. (Hash)
2549             $overwriteOldFile -> Instructs the module to either overwrite any existing data with the same output file name and path. ( '1' or '0' )
2550              
2551             Note: It is not recommended to specify all new() parameters, as it has not been thoroughly tested.
2552              
2553             Output:
2554              
2555             Word2vec::Word2vec object.
2556              
2557             Example:
2558              
2559             use Word2vec::Word2vec;
2560              
2561             my $w2v = Word2vec::Word2vec->new();
2562              
2563             undef( $w2v );
2564              
2565             =head3 DESTROY
2566              
2567             Description:
2568              
2569             Removes member variables and file handle from memory.
2570              
2571             Input:
2572              
2573             None
2574              
2575             Output:
2576              
2577             None
2578              
2579             Example:
2580              
2581             use Word2vec::Word2vec;
2582              
2583             my $w2v = Word2vec::Word2vec->new();
2584             $w2v->DESTROY();
2585              
2586             undef( $w2v );
2587              
2588             =head3 ExecuteTraining
2589              
2590             Executes word2vec training based on parameters. Parameter variables have higher precedence
2591             than member variables. Any parameter specified will override its respective member variable.
2592              
2593             Note: If no parameters are specified, this module executes word2vec training based on preset
2594             member variables. Returns string regarding training status.
2595              
2596             Input:
2597              
2598             $trainFilePath -> Specifies word2vec text corpus training file in a given path. (String)
2599             $outputFilePath -> Specifies word2vec trained output data file name and save path. (String)
2600             $vectorSize -> Size of word2vec word vectors. (Integer)
2601             $windowSize -> Maximum skip length between words. (Integer)
2602             $minCount -> Disregard words that appear less than $minCount times. (Integer)
2603             $sample -> Threshold for occurrence of words. Those that appear with higher frequency in the training data will be randomly down-sampled. (Float)
2604             $negative -> Number of negative examples. (Integer)
2605             $alpha -> Set that start learning rate. (Float)
2606             $hs -> Hierarchical Soft-max (Integer)
2607             $binary -> Save trained data as binary mode. (Integer)
2608             $numOfThreads -> Number of word2vec training threads. (Integer)
2609             $iterations -> Number of training iterations to run prior to completion of training. (Integer)
2610             $useCBOW -> Enable Continuous Bag Of Words model or Skip-Gram model. (Integer)
2611             $classes -> Output word classes rather than word vectors. (Integer)
2612             $readVocab -> Read vocabulary from file path without constructing from training data. (String)
2613             $saveVocab -> Save vocabulary to file path. (String)
2614             $debug -> Set word2vec debug mode. (Integer)
2615             $overwrite -> Instructs the module to either overwrite any existing text corpus files or append to the existing file. ( '1' = True / '0' = False )
2616              
2617             Note: It is not recommended to specify all new() parameters, as it has not been thoroughly tested.
2618              
2619             Output:
2620              
2621             $value -> '0' = Successful / '-1' = Un-successful
2622              
2623             Example:
2624              
2625             use Word2vec::Word2vec;
2626              
2627             my $w2v = Word2vec::Word2vec->new();
2628             $w2v->SetTrainFilePath( "textcorpus.txt" );
2629             $w2v->SetOutputFilePath( "vectors.bin" );
2630             $w2v->SetWordVecSize( 200 );
2631             $w2v->SetWindowSize( 8 );
2632             $w2v->SetSample( 0.0001 );
2633             $w2v->SetNegative( 25 );
2634             $w2v->SetHSoftMax( 0 );
2635             $w2v->SetBinaryOutput( 0 );
2636             $w2v->SetNumOfThreads( 20 );
2637             $w2v->SetNumOfIterations( 15 );
2638             $w2v->SetUseCBOW( 1 );
2639             $w2v->SetOverwriteOldFile( 0 );
2640             $w2v->ExecuteTraining();
2641              
2642             undef( $w2v );
2643              
2644             # or
2645              
2646             use Word2vec::Word2vec;
2647              
2648             my $w2v = Word2vec::Word2vec->new();
2649             $w2v->ExecuteTraining( "textcorpus.txt", "vectors.bin", 200, 8, 5, 0.001, 25, 0.05, 0, 0, 20, 15, 1, 0, "", "", 2, 0 );
2650              
2651             undef( $w2v );
2652              
2653             =head3 ExecuteStringTraining
2654              
2655             Executes word2vec training based on parameters. Parameter variables have higher precedence
2656             than member variables. Any parameter specified will override its respective member variable.
2657              
2658             Note: If no parameters are specified, this module executes word2vec training based on preset
2659             member variables. Returns string regarding training status.
2660              
2661             Input:
2662              
2663             $trainingStr -> String to train with word2vec.
2664             $outputFilePath -> Specifies word2vec trained output data file name and save path. (String)
2665             $vectorSize -> Size of word2vec word vectors. (Integer)
2666             $windowSize -> Maximum skip length between words. (Integer)
2667             $minCount -> Disregard words that appear less than $minCount times. (Integer)
2668             $sample -> Threshold for occurrence of words. Those that appear with higher frequency in the training data will be randomly down-sampled. (Float)
2669             $negative -> Number of negative examples. (Integer)
2670             $alpha -> Set that start learning rate. (Float)
2671             $hs -> Hierarchical Soft-max (Integer)
2672             $binary -> Save trained data as binary mode. (Integer)
2673             $numOfThreads -> Number of word2vec training threads. (Integer)
2674             $iterations -> Number of training iterations to run prior to completion of training. (Integer)
2675             $useCBOW -> Enable Continuous Bag Of Words model or Skip-Gram model. (Integer)
2676             $classes -> Output word classes rather than word vectors. (Integer)
2677             $readVocab -> Read vocabulary from file path without constructing from training data. (String)
2678             $saveVocab -> Save vocabulary to file path. (String)
2679             $debug -> Set word2vec debug mode. (Integer)
2680             $overwrite -> Instructs the module to either overwrite any existing text corpus files or append to the existing file. ( '1' = True / '0' = False )
2681              
2682             Note: It is not recommended to specify all new() parameters, as it has not been thoroughly tested.
2683              
2684             Output:
2685              
2686             $value -> '0' = Successful / '-1' = Un-successful
2687              
2688             Example:
2689              
2690             use Word2vec::Word2vec;
2691              
2692             my $w2v = Word2vec::Word2vec->new();
2693             $w2v->SetOutputFilePath( "vectors.bin" );
2694             $w2v->SetWordVecSize( 200 );
2695             $w2v->SetWindowSize( 8 );
2696             $w2v->SetSample( 0.0001 );
2697             $w2v->SetNegative( 25 );
2698             $w2v->SetHSoftMax( 0 );
2699             $w2v->SetBinaryOutput( 0 );
2700             $w2v->SetNumOfThreads( 20 );
2701             $w2v->SetNumOfIterations( 15 );
2702             $w2v->SetUseCBOW( 1 );
2703             $w2v->SetOverwriteOldFile( 0 );
2704             $w2v->ExecuteStringTraining( "string to train here" );
2705              
2706             undef( $w2v );
2707              
2708             # or
2709              
2710             use Word2vec::Word2vec;
2711              
2712             my $w2v = Word2vec::Word2vec->new();
2713             $w2v->ExecuteStringTraining( "string to train here", "vectors.bin", 200, 8, 5, 0.001, 25, 0.05, 0, 0, 20, 15, 1, 0, "", "", 2, 0 );
2714              
2715             undef( $w2v );
2716              
2717             =head3 ComputeCosineSimilarity
2718              
2719             Description:
2720              
2721             Computes cosine similarity between two words using trained word2vec vector data. Returns
2722             float value or undefined if one or more words are not in the dictionary.
2723              
2724             Note: Supports single words only and requires vector data to be in memory with ReadTrainedVectorDataFromFile() prior to function execution.
2725              
2726             Input:
2727              
2728             $string -> Single string word
2729             $string -> Single string word
2730              
2731             Output:
2732              
2733             $value -> Float or Undefined
2734              
2735             Example:
2736              
2737             use Word2vec::Word2vec;
2738              
2739             my $w2v = Word2vec::Word2vec->new();
2740             $w2v->ReadTrainedVectorDataFromFile( "samples/samplevectors.bin" );
2741             print "Cosine similarity between words: \"of\" and \"the\": " . $w2v->ComputeCosineSimilarity( "of", "the" ) . "\n";
2742              
2743             undef( $w2v );
2744              
2745             =head3 ComputeAvgOfWordsCosineSimilarity
2746              
2747             Description:
2748              
2749             Computes cosine similarity between two words or compound words using trained word2vec vector data.
2750             Returns float value or undefined.
2751              
2752             Note: Supports multiple words concatenated by ' ' and requires vector data to be in memory prior
2753             to method execution. This method will not error out when a word is not located within the dictionary.
2754             It will take the average of all found words for each parameter then cosine similarity of both word vectors.
2755              
2756             Input:
2757              
2758             $string -> string of single or multiple words separated by ' ' (space).
2759             $string -> string of single or multiple words separated by ' ' (space).
2760              
2761             Output:
2762              
2763             $value -> Float or Undefined
2764              
2765             Example:
2766              
2767             use Word2vec::Word2vec;
2768              
2769             my $w2v = Word2vec::Word2vec->new();
2770             $w2v->ReadTrainedVectorDataFromFile( "samples/samplevectors.bin" );
2771             print "Cosine similarity between words: \"heart attack\" and \"acute myocardial infarction\": " .
2772             $w2v->ComputeAvgOfWordsCosineSimilarity( "heart attack", "acute myocardial infarction" ) . "\n";
2773              
2774             undef( $w2v );
2775              
2776             =head3 ComputeMultiWordCosineSimilarity
2777              
2778             Description:
2779              
2780             Computes cosine similarity between two words or compound words using trained word2vec vector data.
2781             Returns float value or undefined if one or more words are not in the dictionary.
2782              
2783             Note: Supports multiple words concatenated by ' ' and requires vector data to be in memory prior to method execution.
2784             This function will error out when a specified word is not found and return undefined.
2785              
2786             Input:
2787              
2788             $string -> string of single or multiple words separated by ' ' (space).
2789             $string -> string of single or multiple words separated by ' ' (space).
2790              
2791             Output:
2792              
2793             $value -> Float or Undefined
2794              
2795             Example:
2796              
2797             use Word2vec::Word2vec;
2798              
2799             my $w2v = Word2vec::Word2vec->new();
2800             $w2v->ReadTrainedVectorDataFromFile( "samples/samplevectors.bin" );
2801             print "Cosine similarity between words: \"heart attack\" and \"acute myocardial infarction\": " .
2802             $w2v->ComputeMultiWordCosineSimilarity( "heart attack", "acute myocardial infarction" ) . "\n";
2803              
2804             undef( $w2v );
2805              
2806             =head3 ComputeCosineSimilarityOfWordVectors
2807              
2808             Description:
2809              
2810             Computes cosine similarity between two word vectors.
2811             Returns float value or undefined if one or more words are not in the dictionary.
2812              
2813             Note: Function parameters require actual word vector data with words removed.
2814              
2815             Input:
2816              
2817             $string -> string of word vector representation data separated by ' ' (space).
2818             $string -> string of word vector representation data separated by ' ' (space).
2819              
2820             Output:
2821              
2822             $value -> Float or Undefined
2823              
2824             Example:
2825              
2826             use Word2vec::Word2vec;
2827              
2828             my $word2vec = Word2vec::Word2vec->new();
2829             $word2vec->ReadTrainedVectorDataFromFile( "samples/samplevectors.bin" );
2830             my $vectorAData = $word2vec->GetWordVector( "heart" );
2831             my $vectorBData = $word2vec->GetWordVector( "attack" );
2832              
2833             # Remove Words From Data
2834             $vectorAData = RemoveWordFromWordVectorString( $vectorAData );
2835             $vectorBData = RemoveWordFromWordVectorString( $vectorBData );
2836              
2837             print "Cosine similarity between words: \"heart\" and \"attack\": " .
2838             $word2vec->ComputeCosineSimilarityOfWordVectors( $vectorAData, $vectorBData ) . "\n";
2839              
2840             undef( $word2vec );
2841              
2842             =head3 CosSimWithUserInput
2843              
2844             Description:
2845              
2846             Computes cosine similarity between two words using trained word2vec vector data based on user input.
2847              
2848             Note: No compound word support.
2849              
2850             Warning: Requires vector data to be in memory prior to method execution.
2851              
2852             Input:
2853              
2854             None
2855              
2856             Output:
2857              
2858             None
2859              
2860             Example:
2861              
2862             use Word2vec::Word2vec;
2863              
2864             my $w2v = Word2vec::Word2vec->new();
2865             $w2v->ReadTrainedVectorDataFromFile( "samples/samplevectors.bin" );
2866             $w2v->CosSimWIthUserInputTest();
2867              
2868             undef( $w2v );
2869              
2870             =head3 MultiWordCosSimWithUserInput
2871              
2872             Description:
2873              
2874             Computes cosine similarity between two words or compound words using trained word2vec vector data based on user input.
2875              
2876             Note: Supports multiple words concatenated by ':'.
2877              
2878             Warning: Requires vector data to be in memory prior to method execution.
2879              
2880             Input:
2881              
2882             None
2883              
2884             Output:
2885              
2886             None
2887              
2888             Example:
2889              
2890             use Word2vec::Word2vec;
2891              
2892             my $w2v = Word2vec::Word2vec->new();
2893             $w2v->ReadTrainedVectorDataFromFile( "samples/samplevectors.bin" );
2894             $w2v->MultiWordCosSimWithUserInput();
2895              
2896             undef( $w2v );
2897              
2898              
2899             =head3 ComputeAverageOfWords
2900              
2901             Description:
2902              
2903             Computes cosine similarity average of all found words given an array reference parameter of
2904             plain text words. Returns average values (string) or undefined.
2905              
2906             Warning: Requires vector data to be in memory prior to method execution.
2907              
2908             Input:
2909              
2910             $arrayReference -> Array reference of words
2911              
2912             Output:
2913              
2914             $string -> String of word2vec word average values
2915              
2916             Example:
2917              
2918             use Word2vec::Word2vec;
2919              
2920             my $w2v = Word2vec::Word2vec->new();
2921             $w2v->ReadTrainedVectorDataFromFile( "sample/samplevectors.bin" );
2922             my $data = $w2v->ComputeAverageOfWords( "of", "the", "and" );
2923             print( "Computed Average Of Words: $data" ) if defined( $data );
2924              
2925             undef( $w2v );
2926              
2927             =head3 AddTwoWords
2928              
2929             Description:
2930              
2931             Adds two word vectors and returns the result.
2932              
2933             Warning: This method also requires vector data to be in memory prior to method execution.
2934              
2935             Input:
2936              
2937             $string -> Word to add
2938             $string -> Word to add
2939              
2940             Output:
2941              
2942             $string -> String of word2vec summed word values
2943              
2944             Example:
2945              
2946             use Word2vec::Word2vec;
2947              
2948             my $w2v = Word2vec::Word2vec->new();
2949             $w2v->ReadTrainedVectorDataFromFile( "sample/samplevectors.bin" );
2950              
2951             my $data = $w2v->AddTwoWords( "heart", "attack" );
2952             print( "Computed Sum Of Words: $data" ) if defined( $data );
2953              
2954             undef( $w2v );
2955              
2956             =head3 SubtractTwoWords
2957              
2958             Description:
2959              
2960             Subtracts two word vectors and returns the result.
2961              
2962             Warning: This method also requires vector data to be in memory prior to method execution.
2963              
2964             Input:
2965              
2966             $string -> Word to subtract
2967             $string -> Word to subtract
2968              
2969             Output:
2970              
2971             $string -> String of word2vec difference between word values
2972              
2973             Example:
2974              
2975             use Word2vec::Word2vec;
2976              
2977             my $w2v = Word2vec::Word2vec->new();
2978             $w2v->ReadTrainedVectorDataFromFile( "sample/samplevectors.bin" );
2979              
2980             my $data = $w2v->SubtractTwoWords( "king", "man" );
2981             print( "Computed Difference Of Words: $data" ) if defined( $data );
2982              
2983             undef( $w2v );
2984              
2985             =head3 AddTwoWordVectors
2986              
2987             Description:
2988              
2989             Adds two vector data strings and returns the result.
2990              
2991             Warning: Text word must be removed from vector data prior to calling this method. This method
2992             also requires vector data to be in memory prior to method execution.
2993              
2994             Input:
2995              
2996             $string -> Word2vec word vector data (with string word removed)
2997             $string -> Word2vec word vector data (with string word removed)
2998              
2999             Output:
3000              
3001             $string -> String of word2vec summed word values
3002              
3003             Example:
3004              
3005             use Word2vec::Word2vec;
3006              
3007             my $w2v = Word2vec::Word2vec->new();
3008             $w2v->ReadTrainedVectorDataFromFile( "sample/samplevectors.bin" );
3009             my $wordAData = $w2v->GetWordVector( "of" );
3010             my $wordBData = $w2v->GetWordVector( "the" );
3011              
3012             # Removing Words From Vector Data Array
3013             $wordAData = RemoveWordFromWordVectorString( $wordAData );
3014             $wordBData = RemoveWordFromWordVectorString( $wordBData );
3015              
3016             my $data = $w2v->AddTwoWordVectors( $wordAData, $wordBData );
3017             print( "Computed Sum Of Words: $data" ) if defined( $data );
3018              
3019             undef( $w2v );
3020              
3021             =head3 SubtractTwoWordVectors
3022              
3023             Description:
3024              
3025             Subtracts two vector data strings and returns the result.
3026              
3027             Warning: Text word must be removed from vector data prior to calling this method. This method
3028             also requires vector data to be in memory prior to method execution.
3029              
3030             Input:
3031              
3032             $string -> Word2vec word vector data (with string word removed)
3033             $string -> Word2vec word vector data (with string word removed)
3034              
3035             Output:
3036              
3037             $string -> String of word2vec difference between word values
3038              
3039             Example:
3040              
3041             use Word2vec::Word2vec;
3042              
3043             my $w2v = Word2vec::Word2vec->new();
3044             $w2v->ReadTrainedVectorDataFromFile( "sample/samplevectors.bin" );
3045             my $wordAData = $w2v->GetWordVector( "of" );
3046             my $wordBData = $w2v->GetWordVector( "the" );
3047              
3048             # Removing Words From Vector Data Array
3049             $wordAData = RemoveWordFromWordVectorString( $wordAData );
3050             $wordBData = RemoveWordFromWordVectorString( $wordBData );
3051              
3052             my $data = $w2v->SubtractTwoWordVectors( $wordAData, $wordBData );
3053             print( "Computed Difference Of Words: $data" ) if defined( $data );
3054              
3055             undef( $w2v );
3056              
3057             =head3 AverageOfTwoWordVectors
3058              
3059             Description:
3060              
3061             Computes the average of two vectors and returns the result.
3062              
3063             Warning: Text word must be removed from vector data prior to calling this method. This method
3064             also requires vector data to be in memory prior to method execution.
3065              
3066             Input:
3067              
3068             $string -> Word2vec word vector data (with string word removed)
3069             $string -> Word2vec word vector data (with string word removed)
3070              
3071             Output:
3072              
3073             $string -> String of word2vec average between word values
3074              
3075             Example:
3076              
3077             use Word2vec::Word2vec;
3078              
3079             my $w2v = Word2vec::Word2vec->new();
3080             $w2v->ReadTrainedVectorDataFromFile( "sample/samplevectors.bin" );
3081             my $wordAData = $w2v->GetWordVector( "of" );
3082             my $wordBData = $w2v->GetWordVector( "the" );
3083              
3084             # Removing Words From Vector Data Array
3085             $wordAData = RemoveWordFromWordVectorString( $wordAData );
3086             $wordBData = RemoveWordFromWordVectorString( $wordBData );
3087              
3088             my $data = $w2v->AverageOfTwoWordVectors( $wordAData, $wordBData );
3089             print( "Computed Difference Of Words: $data" ) if defined( $data );
3090              
3091             undef( $w2v );
3092              
3093             =head3 GetWordVector
3094              
3095             Description:
3096              
3097             Searches dictionary in memory for the specified string argument and returns the vector data.
3098             Returns undefined if not found.
3099              
3100             Warning: Requires vector data to be in memory prior to method execution.
3101              
3102             Input:
3103              
3104             $string -> Word to locate in word2vec vocabulary/dictionary
3105              
3106             Output:
3107              
3108             $string -> Found word2vec word + word vector data or undefined.
3109              
3110             Example:
3111              
3112             use Word2vec::Word2vec;
3113              
3114             my $w2v = Word2vec::Word2vec->new();
3115             $w2v->ReadTrainedVectorDataFromFile( "sample/samplevectors.bin" );
3116             my $wordData = $w2v->GetWordVector( "of" );
3117             print( "Word2vec Word Data: $wordData\n" ) if defined( $wordData );
3118              
3119             undef( $w2v );
3120              
3121             =head3 IsVectorDataInMemory
3122              
3123             Description:
3124              
3125             Checks to see if vector data has been loaded in memory.
3126              
3127             Input:
3128              
3129             None
3130              
3131             Output:
3132              
3133             $value -> '1' = True / '0' = False
3134              
3135             Example:
3136              
3137             use Word2vec::Word2vec;
3138              
3139             my $w2v = Word2vec::Word2vec->new();
3140             my $result = $w2v->IsVectorDataInMemory();
3141              
3142             print( "No vector data in memory\n" ) if $result == 0;
3143             print( "Yes vector data in memory\n" ) if $result == 1;
3144              
3145             $w2v->ReadTrainedVectorDataFromFile( "samples/samplevectors.bin" );
3146              
3147             print( "No vector data in memory\n" ) if $result == 0;
3148             print( "Yes vector data in memory\n" ) if $result == 1;
3149              
3150             undef( $w2v );
3151              
3152             =head3 IsVectorDataSorted
3153              
3154             Description:
3155              
3156             Checks to see if vector data header is signed as sorted in memory.
3157              
3158             Input:
3159              
3160             None
3161              
3162             Output:
3163              
3164             $value -> '1' = True / '0' = False
3165              
3166             Example:
3167              
3168             use Word2vec::Word2vec;
3169              
3170             my $w2v = Word2vec::Word2vec->new();
3171             $w2v->ReadTrainedVectorDataFromFile( "samples/samplevectors.bin" );
3172              
3173             my $result = $w2v->IsVectorDataSorted();
3174              
3175             print( "No vector data is not sorted\n" ) if $result == 0;
3176             print( "Yes vector data is sorted\n" ) if $result == 1;
3177              
3178             undef( $w2v );
3179              
3180             =head3 CheckWord2VecDataFileType
3181              
3182             Description:
3183              
3184             Checks specified file to see if vector data is in binary or plain text format. Returns 'text'
3185             for plain text and 'binary' for binary data.
3186              
3187             Input:
3188              
3189             $string -> File path
3190              
3191             Output:
3192              
3193             $string -> File Type ( "text" = Plain text file / "binary" = Binary data file )
3194              
3195             Example:
3196              
3197             use Word2vec::Word2vec;
3198              
3199             my $w2v = Word2vec::Word2vec->new();
3200             my $fileType = $w2v->CheckWord2VecDataFileType( "samples/samplevectors.bin" );
3201              
3202             print( "FileType: $fileType\n" ) if defined( $fileType );
3203              
3204             undef( $fileType );
3205              
3206             =head3 ReadTrainedVectorDataFromFile
3207              
3208             Description:
3209              
3210             Reads trained vector data from file path in memory.
3211              
3212             Input:
3213              
3214             $string -> Word2vec trained vector data file path
3215              
3216             Output:
3217              
3218             $value -> '0' = Successful / '-1' = Un-successful
3219              
3220             Example:
3221              
3222             # Loading data in a Binary Search Tree
3223             use Word2vec::Word2vec;
3224              
3225             my $w2v = Word2vec::Word2vec->new();
3226             my $result = $w2v->ReadTrainedVectorDataFromFile( "samples/samplevectors.bin" );
3227              
3228             print( "Success Loading Data\n" ) if $result == 0;
3229             print( "Un-successful, Data Not Loaded\n" ) if $result == -1;
3230              
3231             undef( $w2v );
3232              
3233             # or
3234              
3235             # Loading data in an array
3236             use Word2vec::Word2vec;
3237              
3238             my $w2v = Word2vec::Word2vec->new();
3239             my $result = $w2v->ReadTrainedVectorDataFromFile( "samples/samplevectors.bin" );
3240              
3241             print( "Success Loading Data\n" ) if $result == 0;
3242             print( "Un-successful, Data Not Loaded\n" ) if $result == -1;
3243              
3244             undef( $w2v );
3245              
3246             =head3 SaveTrainedVectorDataToFile
3247              
3248             Description:
3249              
3250             Saves trained vector data at the location specified. Defining 'binaryFormat' parameter will
3251             save in word2vec's binary format.
3252              
3253             Input:
3254              
3255             $string -> Save Path
3256             $binaryFormat -> Integer ( '1' = Save data in word2vec binary format / '0' = Save as plain text )
3257              
3258             Note: Leaving $binaryFormat as undefined will save the file in plain text format.
3259              
3260             Warning: If the vector data is stored as a binary search tree, this method will error out gracefully.
3261              
3262             Output:
3263              
3264             $value -> '0' = Successful / '-1' = Un-successful
3265              
3266             Example:
3267              
3268             use Word2vec::Word2vec;
3269              
3270             my $w2v = Word2vec::Word2vec->new();
3271            
3272             $w2v->ReadTrainedVectorDataFromFile( "samples/samplevectors.bin" );
3273             $w2v->SaveTrainedVectorDataToFile( "samples/newvectors.bin" );
3274              
3275             undef( $w2v );
3276              
3277             =head3 StringsAreEqual
3278              
3279             Description:
3280              
3281             Compares two strings to check for equality, ignoring case-sensitivity.
3282              
3283             Note: This method is not case-sensitive. ie. "string" equals "StRiNg"
3284              
3285             Input:
3286              
3287             $string -> String to compare
3288             $string -> String to compare
3289              
3290             Output:
3291              
3292             $value -> '1' = Strings are equal / '0' = Strings are not equal
3293              
3294             Example:
3295              
3296             use Word2vec::Word2vec;
3297              
3298             my $w2v = Word2vec::Word2vec->new();
3299             my $result = $w2v->StringsAreEqual( "hello world", "HeLlO wOrLd" );
3300              
3301             print( "Strings are equal!\n" )if $result == 1;
3302             print( "Strings are not equal!\n" ) if $result == 0;
3303              
3304             undef( $w2v );
3305              
3306             =head3 ConvertRawSparseTextToVectorDataAry
3307              
3308             Description:
3309              
3310             Converts sparse vector string to a dense vector format data array.
3311              
3312             Input:
3313              
3314             $string -> Vector data string.
3315              
3316             Output:
3317              
3318             $arrayReference -> Reference to array of vector data.
3319              
3320             Example:
3321              
3322             use Word2vec::Word2vec;
3323              
3324             my $w2v = Word2vec::Word2vec->new();
3325             my $str = "cookie 1 0.234 9 0.0002 13 0.234 17 -0.0023 19 1.0000";
3326              
3327             my @vectorData = @{ $w2v->ConvertRawSparseTextToVectorDataAry( $str ) };
3328              
3329             print( "Data conversion successful!\n" ) if @vectorData > 0;
3330             print( "Data conversion un-successful!\n" ) if @vectorData == 0;
3331              
3332             undef( $w2v );
3333              
3334             =head3 ConvertRawSparseTextToVectorDataHash
3335              
3336             Description:
3337              
3338             Converts sparse vector string to a dense vector format data hash.
3339              
3340             Input:
3341              
3342             $string -> Vector data string.
3343              
3344             Output:
3345              
3346             $hashReference -> Reference to array of hash data.
3347              
3348             Example:
3349              
3350             use Word2vec::Word2vec;
3351              
3352             my $w2v = Word2vec::Word2vec->new();
3353             my $str = "cookie 1 0.234 9 0.0002 13 0.234 17 -0.0023 19 1.0000";
3354              
3355             my %vectorData = %{ $w2v->ConvertRawSparseTextToVectorDataHash( $str ) };
3356              
3357             print( "Data conversion successful!\n" ) if ( keys %vectorData ) > 0;
3358             print( "Data conversion un-successful!\n" ) if ( keys %vectorData ) == 0;
3359              
3360             undef( $w2v );
3361              
3362             =head3 GetOSType
3363              
3364             Description:
3365              
3366             Returns (string) operating system type.
3367              
3368             Input:
3369              
3370             None
3371              
3372             Output:
3373              
3374             $string -> Operating System String
3375              
3376             Example:
3377              
3378             use Word2vec::Word2vec;
3379              
3380             my $w2v = Word2vec::Word2vec->new();
3381             my $os = $w2v->GetOSType();
3382              
3383             print( "Operating System: $os\n" );
3384              
3385             undef( $w2v );
3386              
3387             =head2 Accessor Functions
3388              
3389             =head3 GetDebugLog
3390              
3391             Description:
3392              
3393             Returns the _debugLog member variable set during Word2vec::Word2vec object initialization of new function.
3394              
3395             Input:
3396              
3397             None
3398              
3399             Output:
3400              
3401             $value -> '0' = False, '1' = True
3402              
3403             Example:
3404              
3405             use Word2vec::Word2vec;
3406              
3407             my $w2v = Word2vec::Word2vec->new()
3408             my $debugLog = $w2v->GetDebugLog();
3409              
3410             print( "Debug Logging Enabled\n" ) if $debugLog == 1;
3411             print( "Debug Logging Disabled\n" ) if $debugLog == 0;
3412              
3413              
3414             undef( $w2v );
3415              
3416             =head3 GetWriteLog
3417              
3418             Description:
3419              
3420             Returns the _writeLog member variable set during Word2vec::Word2vec object initialization of new function.
3421              
3422             Input:
3423              
3424             None
3425              
3426             Output:
3427              
3428             $value -> '0' = False, '1' = True
3429              
3430             Example:
3431              
3432             use Word2vec::Word2vec;
3433              
3434             my $w2v = Word2vec::Word2vec->new();
3435             my $writeLog = $w2v->GetWriteLog();
3436              
3437             print( "Write Logging Enabled\n" ) if $writeLog == 1;
3438             print( "Write Logging Disabled\n" ) if $writeLog == 0;
3439              
3440             undef( $w2v );
3441              
3442             =head3 GetFileHandle
3443              
3444             Description:
3445              
3446             Returns the _fileHandle member variable set during Word2vec::Word2vec object instantiation of new function.
3447              
3448             Warning: This is a private function. File handle is used by WriteLog() method. Do not manipulate this file handle as errors can result.
3449              
3450             Input:
3451              
3452             None
3453              
3454             Output:
3455              
3456             $fileHandle -> Returns file handle for WriteLog() method or undefined.
3457              
3458             Example:
3459              
3460             use Word2vec::Word2vec;
3461              
3462             my $w2v = Word2vec::Word2vec->new();
3463             my $fileHandle = $w2v->GetFileHandle();
3464              
3465             undef( $w2v );
3466              
3467             =head3 GetTrainFilePath
3468              
3469             Description:
3470              
3471             Returns the _trainFilePath member variable set during Word2vec::Word2vec object instantiation of new function.
3472              
3473             Input:
3474              
3475             None
3476              
3477             Output:
3478              
3479             $string -> Returns word2vec training text corpus file path.
3480              
3481             Example:
3482              
3483             use Word2vec::Word2vec;
3484              
3485             my $w2v = Word2vec::Word2vec->new();
3486             my $filePath = $w2v->GetTrainFilePath();
3487             print( "Training File Path: $filePath\n" );
3488              
3489             undef( $w2v );
3490              
3491             =head3 GetOutputFilePath
3492              
3493             Description:
3494              
3495             Returns the _outputFilePath member variable set during Word2vec::Word2vec object instantiation of new function.
3496              
3497             Input:
3498              
3499             None
3500              
3501             Output:
3502              
3503             $string -> Returns post word2vec training output file path.
3504              
3505             Example:
3506              
3507             use Word2vec::Word2vec;
3508              
3509             my $w2v = Word2vec::Word2vec->new();
3510             my $filePath = $w2v->GetOutputFilePath();
3511             print( "File Path: $filePath\n" );
3512              
3513             undef( $w2v );
3514              
3515             =head3 GetWordVecSize
3516              
3517             Description:
3518              
3519             Returns the _wordVecSize member variable set during Word2vec::Word2vec object instantiation of new function.
3520              
3521             Input:
3522              
3523             None
3524              
3525             Output:
3526              
3527             $value -> Returns (integer) size of word2vec word vectors. Default value = 100
3528              
3529             Example:
3530              
3531             use Word2vec::Word2vec;
3532              
3533             my $w2v = Word2vec::Word2vec->new();
3534             my $value = $w2v->GetWordVecSize();
3535             print( "Word Vector Size: $value\n" );
3536              
3537             undef( $w2v );
3538              
3539             =head3 GetWindowSize
3540              
3541             Description:
3542              
3543             Returns the _windowSize member variable set during Word2vec::Word2vec object instantiation of new function.
3544              
3545             Input:
3546              
3547             None
3548              
3549             Output:
3550              
3551             $value -> Returns (integer) word2vec window size. Default value = 5
3552              
3553             Example:
3554              
3555             use Word2vec::Word2vec;
3556              
3557             my $w2v = Word2vec::Word2vec->new();
3558             my $value = $w2v->GetWindowSize();
3559             print( "Window Size: $value\n" );
3560              
3561             undef( $w2v );
3562              
3563             =head3 GetSample
3564              
3565             Description:
3566              
3567             Returns the _sample member variable set during Word2vec::Word2vec object instantiation of new function.
3568              
3569             Input:
3570              
3571             None
3572              
3573             Output:
3574              
3575             $value -> Returns (integer) word2vec sample size. Default value = 0.001
3576              
3577             Example:
3578              
3579             use Word2vec::Word2vec;
3580              
3581             my $w2v = Word2vec::Word2vec->new();
3582             my $value = $w2v->GetSample();
3583             print( "Sample: $value\n" );
3584              
3585             undef( $w2v );
3586              
3587             =head3 GetHSoftMax
3588              
3589             Description:
3590              
3591             Returns the _hSoftMax member variable set during Word2vec::Word2vec object instantiation of new function.
3592              
3593             Input:
3594              
3595             None
3596              
3597             Output:
3598              
3599             $value -> Returns (integer) word2vec HSoftMax value. Default = 0
3600              
3601             Example:
3602              
3603             use Word2vec::Word2vec;
3604              
3605             my $w2v = Word2vec::Word2vec->new();
3606             my $value = $w2v->GetHSoftMax();
3607             print( "HSoftMax: $value\n" );
3608              
3609             undef( $w2v );
3610              
3611             =head3 GetNegative
3612              
3613             Description:
3614              
3615             Returns the _negative member variable set during Word2vec::Word2vec object instantiation of new function.
3616              
3617             Input:
3618              
3619             None
3620              
3621             Output:
3622              
3623             $value -> Returns (integer) word2vec negative value. Default = 5
3624              
3625             Example:
3626              
3627             use Word2vec::Word2vec;
3628              
3629             my $w2v = Word2vec::Word2vec->new();
3630             my $value = $w2v->GetNegative();
3631             print( "Negative: $value\n" );
3632              
3633             undef( $w2v );
3634              
3635             =head3 GetNumOfThreads
3636              
3637             Description:
3638              
3639             Returns the _numOfThreads member variable set during Word2vec::Word2vec object instantiation of new function.
3640              
3641             Input:
3642              
3643             None
3644              
3645             Output:
3646              
3647             $value -> Returns (integer) word2vec number of threads to use during training. Default = 12
3648              
3649             Example:
3650              
3651             use Word2vec::Word2vec;
3652              
3653             my $w2v = Word2vec::Word2vec->new();
3654             my $value = $w2v->GetNumOfThreads();
3655             print( "Number of threads: $value\n" );
3656              
3657             undef( $w2v );
3658              
3659             =head3 GetNumOfIterations
3660              
3661             Description:
3662              
3663             Returns the _iterations member variable set during Word2vec::Word2vec object instantiation of new function.
3664              
3665             Input:
3666              
3667             None
3668              
3669             Output:
3670              
3671             $value -> Returns (integer) word2vec number of word2vec iterations. Default = 5
3672              
3673             Example:
3674              
3675             use Word2vec::Word2vec;
3676              
3677             my $w2v = Word2vec::Word2vec->new();
3678             my $value = $w2v->GetNumOfIterations();
3679             print( "Number of iterations: $value\n" );
3680              
3681             undef( $w2v );
3682              
3683             =head3 GetMinCount
3684              
3685             Description:
3686              
3687             Returns the _minCount member variable set during Word2vec::Word2vec object instantiation of new function.
3688              
3689             Input:
3690              
3691             None
3692              
3693             Output:
3694              
3695             $value -> Returns (integer) word2vec min-count value. Default = 5
3696              
3697             Example:
3698              
3699             use Word2vec::Word2vec;
3700              
3701             my $w2v = Word2vec::Word2vec->new();
3702             my $value = $w2v->GetMinCount();
3703             print( "Min Count: $value\n" );
3704              
3705             undef( $w2v );
3706              
3707             =head3 GetAlpha
3708              
3709             Description:
3710              
3711             Returns the _alpha member variable set during Word2vec::Word2vec object instantiation of new function.
3712              
3713             Input:
3714              
3715             None
3716              
3717             Output:
3718              
3719             $value -> Returns (integer) word2vec alpha value. Default = 0.05 for CBOW and 0.025 for Skip-Gram.
3720              
3721             Example:
3722              
3723             use Word2vec::Word2vec;
3724              
3725             my $w2v = Word2vec::Word2vec->new();
3726             my $value = $w2v->GetAlpha();
3727             print( "Alpha: $value\n" );
3728              
3729             undef( $w2v );
3730              
3731             =head3 GetClasses
3732              
3733             Description:
3734              
3735             Returns the _classes member variable set during Word2vec::Word2vec object instantiation of new function.
3736              
3737             Input:
3738              
3739             None
3740              
3741             Output:
3742              
3743             $value -> Returns (integer) word2vec classes value. Default = 0
3744              
3745             Example:
3746              
3747             use Word2vec::Word2vec;
3748              
3749             my $w2v = Word2vec::Word2vec->new();
3750             my $value = $w2v->GetClasses();
3751             print( "Classes: $value\n" );
3752              
3753             undef( $w2v );
3754              
3755             =head3 GetDebugTraining
3756              
3757             Description:
3758              
3759             Returns the _debug member variable set during Word2vec::Word2vec object instantiation of new function.
3760              
3761             Note: 0 = No debug output, 1 = Enable debug output, 2 = Even more debug output
3762              
3763             Input:
3764              
3765             None
3766              
3767             Output:
3768              
3769             $value -> Returns (integer) word2vec debug value. Default = 2
3770              
3771             Example:
3772              
3773             use Word2vec::Word2vec;
3774              
3775             my $w2v = Word2vec::Word2vec->new();
3776             my $value = $w2v->GetDebugTraining();
3777             print( "Debug: $value\n" );
3778              
3779             undef( $w2v );
3780              
3781             =head3 GetBinaryOutput
3782              
3783             Description:
3784              
3785             Returns the _binaryOutput member variable set during Word2vec::Word2vec object instantiation of new function.
3786              
3787             Note: 1 = Save trained vector data in binary format, 2 = Save trained vector data in plain text format.
3788              
3789             Input:
3790              
3791             None
3792              
3793             Output:
3794              
3795             $value -> Returns (integer) word2vec binary flag. Default = 0
3796              
3797             Example:
3798              
3799             use Word2vec::Word2vec;
3800              
3801             my $w2v = Word2vec::Word2vec->new();
3802             my $value = $w2v->GetBinaryOutput();
3803             print( "Binary Output: $value\n" );
3804              
3805             undef( $w2v );
3806              
3807             =head3 GetReadVocabFilePath
3808              
3809             Description:
3810              
3811             Returns the _readVocab member variable set during Word2vec::Word2vec object instantiation of new function.
3812              
3813             Input:
3814              
3815             None
3816              
3817             Output:
3818              
3819             $string -> Returns (string) word2vec read vocabulary file name or empty string if not set.
3820              
3821             Example:
3822              
3823             use Word2vec::Word2vec;
3824              
3825             my $w2v = Word2vec::Word2vec->new();
3826             my $str = $w2v->GetReadVocabFilePath();
3827             print( "Read Vocab File Path: $str\n" );
3828              
3829             undef( $w2v );
3830              
3831             =head3 GetSaveVocabFilePath
3832              
3833             Description:
3834              
3835             Returns the _saveVocab member variable set during Word2vec::Word2vec object instantiation of new function.
3836              
3837             Input:
3838              
3839             None
3840              
3841             Output:
3842              
3843             $string -> Returns (string) word2vec save vocabulary file name or empty string if not set.
3844              
3845             Example:
3846              
3847             use Word2vec::Word2vec;
3848              
3849             my $w2v = Word2vec::Word2vec->new();
3850             my $str = $w2v->GetSaveVocabFilePath();
3851             print( "Save Vocab File Path: $str\n" );
3852              
3853             undef( $w2v );
3854              
3855             =head3 GetUseCBOW
3856              
3857             Description:
3858              
3859             Returns the _useCBOW member variable set during Word2vec::Word2vec object instantiation of new function.
3860              
3861             Note: 0 = Skip-Gram Model, 1 = Continuous Bag Of Words Model.
3862              
3863             Input:
3864              
3865             None
3866              
3867             Output:
3868              
3869             $value -> Returns (integer) word2vec Continuous-Bag-Of-Words flag. Default = 1
3870              
3871             Example:
3872              
3873             use Word2vec::Word2vec;
3874              
3875             my $w2v = Word2vec::Word2vec->new();
3876             my $value = $w2v->GetUseCBOW();
3877             print( "Use CBOW?: $value\n" );
3878              
3879             undef( $w2v );
3880              
3881             =head3 GetWorkingDir
3882              
3883             Description:
3884              
3885             Returns the _workingDir member variable set during Word2vec::Word2vec object instantiation of new function.
3886              
3887             Input:
3888              
3889             None
3890              
3891             Output:
3892              
3893             $value -> Returns (string) working directory path or current directory if not specified.
3894              
3895             Example:
3896              
3897             use Word2vec::Word2vec;
3898              
3899             my $w2v = Word2vec::Word2vec->new();
3900             my $str = $w2v->GetWorkingDir();
3901             print( "Working Directory: $str\n" );
3902              
3903             undef( $w2v );
3904              
3905             =head3 GetWord2VecExeDir
3906              
3907             Description:
3908              
3909             Returns the _word2VecExeDir member variable set during Word2vec::Word2vec object instantiation of new function.
3910              
3911             Input:
3912              
3913             None
3914              
3915             Output:
3916              
3917             $value -> Returns (string) word2vec executable directory path or empty string if not specified.
3918              
3919             Example:
3920              
3921             use Word2vec::Word2vec;
3922              
3923             my $w2v = Word2vec::Word2vec->new();
3924             my $str = $w2v->GetWord2VecExeDir();
3925             print( "Word2Vec Executable File Directory: $str\n" );
3926              
3927             undef( $w2v );
3928              
3929             =head3 GetVocabularyHash
3930              
3931             Description:
3932              
3933             Returns the _hashRefOfWordVectors member variable set during Word2vec::Word2vec object instantiation of new function.
3934              
3935             Input:
3936              
3937             None
3938              
3939             Output:
3940              
3941             $value -> Returns array of vocabulary/dictionary words. (Word2vec trained data in memory)
3942              
3943             Example:
3944              
3945             use Word2vec::Word2vec;
3946              
3947             my $w2v = Word2vec::Word2vec->new();
3948             my @vocabulary = $w2v->GetVocabularyHash();
3949              
3950             undef( $w2v );
3951              
3952             =head3 GetOverwriteOldFile
3953              
3954             Description:
3955              
3956             Returns the _overwriteOldFile member variable set during Word2vec::Word2vec object instantiation of new function.
3957              
3958             Input:
3959              
3960             None
3961              
3962             Output:
3963              
3964             $value -> Returns 1 = True or 0 = False.
3965              
3966             Example:
3967              
3968             use Word2vec::Word2vec;
3969              
3970             my $w2v = Word2vec::Word2vec->new();
3971             my $value = $w2v->GetOverwriteOldFile();
3972             print( "Overwrite Exiting File?: $value\n" );
3973              
3974             undef( $w2v );
3975              
3976             =head2 Mutator Functions
3977              
3978             =head3 SetTrainFilePath
3979              
3980             Description:
3981              
3982             Sets member variable to string parameter. Sets training file path.
3983              
3984             Input:
3985              
3986             $string -> Text corpus training file path
3987              
3988             Output:
3989              
3990             None
3991              
3992             Example:
3993              
3994             use Word2vec::Word2vec;
3995              
3996             my $w2v = Word2vec::Word2vec->new();
3997             $w2v->SetTrainFilePath( "samples/textcorpus.txt" );
3998              
3999             undef( $w2v );
4000              
4001             =head3 SetOutputFilePath
4002              
4003             Description:
4004              
4005             Sets member variable to string parameter. Sets output file path.
4006              
4007             Input:
4008              
4009             $string -> Post word2vec training save file path
4010              
4011             Output:
4012              
4013             None
4014              
4015             Example:
4016              
4017             use Word2vec::Word2vec;
4018              
4019             my $w2v = Word2vec::Word2vec->new();
4020             $w2v->SetOutputFilePath( "samples/tempvectors.bin" );
4021              
4022             undef( $w2v );
4023              
4024             =head3 SetWordVecSize
4025              
4026             Description:
4027              
4028             Sets member variable to integer parameter. Sets word2vec word vector size.
4029              
4030             Input:
4031              
4032             $value -> Word2vec word vector size
4033              
4034             Output:
4035              
4036             None
4037              
4038             Example:
4039              
4040             use Word2vec::Word2vec;
4041              
4042             my $w2v = Word2vec::Word2vec->new();
4043             $w2v->SetWordVecSize( 100 );
4044              
4045             undef( $w2v );
4046              
4047             =head3 SetWindowSize
4048              
4049             Description:
4050              
4051             Sets member variable to integer parameter. Sets word2vec window size.
4052              
4053             Input:
4054              
4055             $value -> Word2vec window size
4056              
4057             Output:
4058              
4059             None
4060              
4061             Example:
4062              
4063             use Word2vec::Word2vec;
4064              
4065             my $w2v = Word2vec::Word2vec->new();
4066             $w2v->SetWindowSize( 8 );
4067              
4068             undef( $w2v );
4069              
4070             =head3 SetSample
4071              
4072             Description:
4073              
4074             Sets member variable to integer parameter. Sets word2vec sample size.
4075              
4076             Input:
4077              
4078             $value -> Word2vec sample size
4079              
4080             Output:
4081              
4082             None
4083              
4084             Example:
4085              
4086             use Word2vec::Word2vec;
4087              
4088             my $w2v = Word2vec::Word2vec->new();
4089             $w2v->SetSample( 3 );
4090              
4091             undef( $w2v );
4092              
4093             =head3 SetHSoftMax
4094              
4095             Description:
4096              
4097             Sets member variable to integer parameter. Sets word2vec HSoftMax value.
4098              
4099             Input:
4100              
4101             $value -> Word2vec HSoftMax size
4102              
4103             Output:
4104              
4105             None
4106              
4107             Example:
4108              
4109             use Word2vec::Word2vec;
4110              
4111             my $w2v = Word2vec::Word2vec->new();
4112             $w2v->SetHSoftMax( 12 );
4113              
4114             undef( $w2v );
4115              
4116             =head3 SetNegative
4117              
4118             Description:
4119              
4120             Sets member variable to integer parameter. Sets word2vec negative value.
4121              
4122             Input:
4123              
4124             $value -> Word2vec negative value
4125              
4126             Output:
4127              
4128             None
4129              
4130             Example:
4131              
4132             use Word2vec::Word2vec;
4133              
4134             my $w2v = Word2vec::Word2vec->new();
4135             $w2v->SetNegative( 12 );
4136              
4137             undef( $w2v );
4138              
4139             =head3 SetNumOfThreads
4140              
4141             Description:
4142              
4143             Sets member variable to integer parameter. Sets word2vec number of training threads to specified value.
4144              
4145             Input:
4146              
4147             $value -> Word2vec number of threads value
4148              
4149             Output:
4150              
4151             None
4152              
4153             Example:
4154              
4155             use Word2vec::Word2vec;
4156              
4157             my $w2v = Word2vec::Word2vec->new();
4158             $w2v->SetNumOfThreads( 12 );
4159              
4160             undef( $w2v );
4161              
4162             =head3 SetNumOfIterations
4163              
4164             Description:
4165              
4166             Sets member variable to integer parameter. Sets word2vec iterations value.
4167              
4168             Input:
4169              
4170             $value -> Word2vec number of iterations value
4171              
4172             Output:
4173              
4174             None
4175              
4176             Example:
4177              
4178             use Word2vec::Word2vec;
4179              
4180             my $w2v = Word2vec::Word2vec->new();
4181             $w2v->SetNumOfIterations( 12 );
4182              
4183             undef( $w2v );
4184              
4185             =head3 SetMinCount
4186              
4187             Description:
4188              
4189             Sets member variable to integer parameter. Sets word2vec min-count value.
4190              
4191             Input:
4192              
4193             $value -> Word2vec min-count value
4194              
4195             Output:
4196              
4197             None
4198              
4199             Example:
4200              
4201             use Word2vec::Word2vec;
4202              
4203             my $w2v = Word2vec::Word2vec->new();
4204             $w2v->SetMinCount( 7 );
4205              
4206             undef( $w2v );
4207              
4208             =head3 SetAlpha
4209              
4210             Description:
4211              
4212             Sets member variable to float parameter. Sets word2vec alpha value.
4213              
4214             Input:
4215              
4216             $value -> Word2vec alpha value. (Float)
4217              
4218             Output:
4219              
4220             None
4221              
4222             Example:
4223              
4224             use Word2vec::Word2vec;
4225              
4226             my $w2v = Word2vec::Word2vec->new();
4227             $w2v->SetAlpha( 0.0012 );
4228              
4229             undef( $w2v );
4230              
4231             =head3 SetClasses
4232              
4233             Description:
4234              
4235             Sets member variable to integer parameter. Sets word2vec classes value.
4236              
4237             Input:
4238              
4239             $value -> Word2vec classes value.
4240              
4241             Output:
4242              
4243             None
4244              
4245             Example:
4246              
4247             use Word2vec::Word2vec;
4248              
4249             my $w2v = Word2vec::Word2vec->new();
4250             $w2v->SetClasses( 0 );
4251              
4252             undef( $w2v );
4253              
4254             =head3 SetDebugTraining
4255              
4256             Description:
4257              
4258             Sets member variable to integer parameter. Sets word2vec debug parameter value.
4259              
4260             Input:
4261              
4262             $value -> Word2vec debug training value.
4263              
4264             Output:
4265              
4266             None
4267              
4268             Example:
4269              
4270             use Word2vec::Word2vec;
4271              
4272             my $w2v = Word2vec::Word2vec->new();
4273             $w2v->SetDebugTraining( 0 );
4274              
4275             undef( $w2v );
4276              
4277             =head3 SetBinaryOutput
4278              
4279             Description:
4280              
4281             Sets member variable to integer parameter. Sets word2vec binary parameter value.
4282              
4283             Input:
4284              
4285             $value -> Word2vec binary output mode value. ( '1' = Binary Output / '0' = Plain Text )
4286              
4287             Output:
4288              
4289             None
4290              
4291             Example:
4292              
4293             use Word2vec::Word2vec;
4294              
4295             my $w2v = Word2vec::Word2vec->new();
4296             $w2v->SetBinaryOutput( 1 );
4297              
4298             undef( $w2v );
4299              
4300             =head3 SetSaveVocabFilePath
4301              
4302             Description:
4303              
4304             Sets member variable to string parameter. Sets word2vec save vocabulary file name.
4305              
4306             Input:
4307              
4308             $string -> Word2vec save vocabulary file name and path.
4309              
4310             Output:
4311              
4312             None
4313              
4314             Example:
4315              
4316             use Word2vec::Word2vec;
4317              
4318             my $w2v = Word2vec::Word2vec->new();
4319             $w2v->SetSaveVocabFilePath( "samples/vocab.txt" );
4320              
4321             undef( $w2v );
4322              
4323             =head3 SetReadVocabFilePath
4324              
4325             Description:
4326              
4327             Sets member variable to string parameter. Sets word2vec read vocabulary file name.
4328              
4329             Input:
4330              
4331             $string -> Word2vec read vocabulary file name and path.
4332              
4333             Output:
4334              
4335             None
4336              
4337             Example:
4338              
4339             use Word2vec::Word2vec;
4340              
4341             my $w2v = Word2vec::Word2vec->new();
4342             $w2v->SetReadVocabFilePath( "samples/vocab.txt" );
4343              
4344             undef( $w2v );
4345              
4346             =head3 SetUseCBOW
4347              
4348             Description:
4349              
4350             Sets member variable to integer parameter. Sets word2vec CBOW parameter value.
4351              
4352             Input:
4353              
4354             $value -> Word2vec CBOW mode value.
4355              
4356             Output:
4357              
4358             None
4359              
4360             Example:
4361              
4362             use Word2vec::Word2vec;
4363              
4364             my $w2v = Word2vec::Word2vec->new();
4365             $w2v->SetUseCBOW( 1 );
4366              
4367             undef( $w2v );
4368              
4369             =head3 SetWorkingDir
4370              
4371             Description:
4372              
4373             Sets member variable to string parameter. Sets working directory.
4374              
4375             Input:
4376              
4377             $string -> Working directory
4378              
4379             Output:
4380              
4381             None
4382              
4383             Example:
4384              
4385             use Word2vec::Word2vec;
4386              
4387             my $w2v = Word2vec::Word2vec->new();
4388             $w2v->SetWorkingDir( "/samples" );
4389              
4390             undef( $w2v );
4391              
4392             =head3 SetWord2VecExeDir
4393              
4394             Description:
4395              
4396             Sets member variable to string parameter. Sets word2vec executable file directory.
4397              
4398             Input:
4399              
4400             $string -> Word2vec directory
4401              
4402             Output:
4403              
4404             None
4405              
4406             Example:
4407              
4408             use Word2vec::Word2vec;
4409              
4410             my $w2v = Word2vec::Word2vec->new();
4411             $w2v->SetWord2VecExeDir( "/word2vec" );
4412              
4413             undef( $w2v );
4414              
4415             =head3 SetVocabularyHash
4416              
4417             Description:
4418              
4419             Sets vocabulary/dictionary array to de-referenced array reference parameter.
4420              
4421             Warning: This will overwrite any existing vocabulary/dictionary array data.
4422              
4423             Input:
4424              
4425             $arrayReference -> Vocabulary/Dictionary array reference of word2vec word vectors.
4426              
4427             Output:
4428              
4429             None
4430              
4431             Example:
4432              
4433             use Word2vec::Word2vec;
4434              
4435             my $w2v = Word2vec::Word2vec->new();
4436             $w2v->ReadTrainedVectorDataFromFile( "samples/samplevectors.bin" );
4437             my @vocab = $w2v->GetVocabularyHash();
4438             $w2v->SetVocabularyHash( \@vocab );
4439              
4440             undef( $w2v );
4441              
4442             =head3 ClearVocabularyHash
4443              
4444             Description:
4445              
4446             Clears vocabulary/dictionary array.
4447              
4448             Input:
4449              
4450             None
4451              
4452             Output:
4453              
4454             None
4455              
4456             Example:
4457              
4458             use Word2vec::Word2vec;
4459              
4460             my $w2v = Word2vec::Word2vec->new();
4461             $w2v->ClearVocabularyHash();
4462              
4463             undef( $w2v );
4464              
4465             =head3 AddWordVectorToVocabHash
4466              
4467             Description:
4468              
4469             Adds word vector string to vocabulary/dictionary.
4470              
4471             Input:
4472              
4473             $string -> Word2vec word vector string
4474              
4475             Output:
4476              
4477             None
4478              
4479             Example:
4480              
4481             use Word2vec::Word2vec;
4482              
4483             my $w2v = Word2vec::Word2vec->new();
4484              
4485             # Note: This is representational data of word2vec's word vector format and not actual data.
4486             $w2v->AddWordVectorToVocabHash( "of 0.4346 -0.1235 0.5789 0.2347 -0.0056 -0.0001" );
4487              
4488             undef( $w2v );
4489              
4490             =head3 SetOverwriteOldFile
4491              
4492             Description:
4493              
4494             Sets member variable to integer parameter. Enables overwriting output file if one already exists.
4495              
4496             Input:
4497              
4498             $value -> '1' = Overwrite exiting file / '0' = Graceful termination when file with same name exists
4499              
4500             Output:
4501              
4502             None
4503              
4504             Example:
4505              
4506             use Word2vec::Word2vec;
4507              
4508             my $w2v = Word2vec::Word2vec->new();
4509             $w2v->SetOverwriteOldFile( 1 );
4510              
4511             undef( $w2v );
4512              
4513             =head2 Debug Functions
4514              
4515             =head3 GetTime
4516              
4517             Description:
4518              
4519             Returns current time string in "Hour:Minute:Second" format.
4520              
4521             Input:
4522              
4523             None
4524              
4525             Output:
4526              
4527             $string -> XX:XX:XX ("Hour:Minute:Second")
4528              
4529             Example:
4530              
4531             use Word2vec::Word2vec:
4532              
4533             my $w2v = Word2vec::Word2vec->new();
4534             my $time = $w2v->GetTime();
4535              
4536             print( "Current Time: $time\n" ) if defined( $time );
4537              
4538             undef( $w2v );
4539              
4540             =head3 GetDate
4541              
4542             Description:
4543              
4544             Returns current month, day and year string in "Month/Day/Year" format.
4545              
4546             Input:
4547              
4548             None
4549              
4550             Output:
4551              
4552             $string -> XX/XX/XXXX ("Month/Day/Year")
4553              
4554             Example:
4555              
4556             use Word2vec::Word2vec:
4557              
4558             my $w2v = Word2vec::Word2vec->new();
4559             my $date = $w2v->GetDate();
4560              
4561             print( "Current Date: $date\n" ) if defined( $date );
4562              
4563             undef( $w2v );
4564              
4565             =head3 WriteLog
4566              
4567             Description:
4568              
4569             Prints passed string parameter to the console, log file or both depending on user options.
4570              
4571             Note: printNewLine parameter prints a new line character following the string if the parameter
4572             is undefined and does not if parameter is 0.
4573              
4574             Input:
4575              
4576             $string -> String to print to the console/log file.
4577             $value -> 0 = Do not print newline character after string, all else prints new line character including 'undef'.
4578              
4579             Output:
4580              
4581             None
4582              
4583             Example:
4584              
4585             use Word2vec::Word2vec:
4586              
4587             my $w2v = Word2vec::Word2vec->new();
4588             $w2v->WriteLog( "Hello World" );
4589              
4590             undef( $w2v );
4591              
4592             =head1 Author
4593              
4594             Clint Cuffy, Virginia Commonwealth University
4595              
4596             =head1 COPYRIGHT
4597              
4598             Copyright (c) 2016
4599              
4600             Bridget T McInnes, Virginia Commonwealth University
4601             btmcinnes at vcu dot edu
4602              
4603             Clint Cuffy, Virginia Commonwealth University
4604             cuffyca at vcu dot edu
4605              
4606             This program is free software; you can redistribute it and/or modify it
4607             under the terms of the GNU General Public License as published by the Free
4608             Software Foundation; either version 2 of the License, or (at your option)
4609             any later version.
4610              
4611             This program is distributed in the hope that it will be useful, but WITHOUT
4612             ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
4613             FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
4614              
4615             You should have received a copy of the GNU General Public License along with
4616             this program; if not, write to:
4617              
4618             The Free Software Foundation, Inc.,
4619             59 Temple Place - Suite 330,
4620             Boston, MA 02111-1307, USA.
4621              
4622             =cut