line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
#!/usr/bin/perl |
2
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
###################################################################################### |
4
|
|
|
|
|
|
|
# # |
5
|
|
|
|
|
|
|
# Author: Clint Cuffy # |
6
|
|
|
|
|
|
|
# Date: 06/16/2016 # |
7
|
|
|
|
|
|
|
# Revised: 10/10/2017 # |
8
|
|
|
|
|
|
|
# UMLS Similarity - Medline XML-To-Word2Vec Input Format Conversion Module # |
9
|
|
|
|
|
|
|
# # |
10
|
|
|
|
|
|
|
###################################################################################### |
11
|
|
|
|
|
|
|
# # |
12
|
|
|
|
|
|
|
# Description: # |
13
|
|
|
|
|
|
|
# ============ # |
14
|
|
|
|
|
|
|
# Perl Medline XML-To-Word2Vec Input Format Conversion Module # |
15
|
|
|
|
|
|
|
# for the "word2vec" package. # |
16
|
|
|
|
|
|
|
# Features: # |
17
|
|
|
|
|
|
|
# ========= # |
18
|
|
|
|
|
|
|
# Supports Parsing Individual Files or Directories # |
19
|
|
|
|
|
|
|
# Plain XML files or .gz XML files (extracts and processes in RAM) # |
20
|
|
|
|
|
|
|
# Include results by specified Date Ranges: 00/00/0000 Format # |
21
|
|
|
|
|
|
|
# Include results by title, abstract or both per article # |
22
|
|
|
|
|
|
|
# Multi-Threading Support - Divides work by number of threads # |
23
|
|
|
|
|
|
|
# Text Compoundify # |
24
|
|
|
|
|
|
|
# # |
25
|
|
|
|
|
|
|
###################################################################################### |
26
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
package Word2vec::Xmltow2v; |
29
|
|
|
|
|
|
|
|
30
|
4
|
|
|
4
|
|
43624
|
use strict; |
|
4
|
|
|
|
|
15
|
|
|
4
|
|
|
|
|
119
|
|
31
|
4
|
|
|
4
|
|
18
|
use warnings; |
|
4
|
|
|
|
|
5
|
|
|
4
|
|
|
|
|
98
|
|
32
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
# Standard Package(s) |
34
|
4
|
|
|
4
|
|
1439
|
use utf8; |
|
4
|
|
|
|
|
54
|
|
|
4
|
|
|
|
|
19
|
|
35
|
4
|
|
|
4
|
|
1445
|
use threads; |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
use threads::shared; |
37
|
|
|
|
|
|
|
use IO::Uncompress::Gunzip qw(gunzip $GunzipError); |
38
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
# CPAN Package(s) |
40
|
|
|
|
|
|
|
use Cwd; |
41
|
|
|
|
|
|
|
use File::Type; |
42
|
|
|
|
|
|
|
use Text::Unidecode; |
43
|
|
|
|
|
|
|
use XML::Twig; |
44
|
|
|
|
|
|
|
use Sys::CpuAffinity; |
45
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
# Word2Vec Utility Package(s) |
47
|
|
|
|
|
|
|
use Word2vec::Bst; |
48
|
|
|
|
|
|
|
|
49
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
# Module version. Declared with "our" rather than the obsolete "use vars" pragma.
our $VERSION = '0.021';

# Global Variables
# Every variable below is declared ":shared" (threads::shared) so that the
# worker threads started by ConvertMedlineXMLToW2V() see the same values.
my $debugLock         :shared;    # Lock variable (not referenced in the visible part of this file)
my $writeLock         :shared;    # Lock variable (not referenced in the visible part of this file)
my $queueLock         :shared;    # Locked by _ThreadedConvert() while it takes a job from @xmlJobQueue
my $appendLock        :shared;    # Lock variable (not referenced in the visible part of this file)
my @xmlJobQueue       :shared;    # File names waiting to be parsed by the worker threads
my $totalJobCount     :shared;    # Number of files queued by ConvertMedlineXMLToW2V()
my $finishedJobCount  :shared;    # Incremented each time a worker thread takes a file from the queue
my $preCompWordCount  :shared;    # Reported as "Number Of Words (Before Compounding)"
my $postCompWordCount :shared;    # Reported as "Number Of Words (After Compounding)"
my $compoundWordCount :shared;    # Reported as "Number Of Compound Words"
66
|
|
|
|
|
|
|
|
67
|
|
|
|
|
|
|
|
68
|
|
|
|
|
|
|
###################################################################################### |
69
|
|
|
|
|
|
|
# Constructor |
70
|
|
|
|
|
|
|
###################################################################################### |
71
|
|
|
|
|
|
|
|
72
|
|
|
|
|
|
|
BEGIN |
73
|
|
|
|
|
|
|
{ |
74
|
|
|
|
|
|
|
# Compile-time (BEGIN) hook: placeholder, no initialization is performed here |
75
|
|
|
|
|
|
|
} |
76
|
|
|
|
|
|
|
|
77
|
|
|
|
|
|
|
|
78
|
|
|
|
|
|
|
###################################################################################### |
79
|
|
|
|
|
|
|
# Destructor |
80
|
|
|
|
|
|
|
###################################################################################### |
81
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
END |
83
|
|
|
|
|
|
|
{ |
84
|
|
|
|
|
|
|
# Program-exit (END) hook: placeholder, no clean-up is performed here |
85
|
|
|
|
|
|
|
} |
86
|
|
|
|
|
|
|
|
87
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
###################################################################################### |
89
|
|
|
|
|
|
|
# new Class Operator |
90
|
|
|
|
|
|
|
###################################################################################### |
91
|
|
|
|
|
|
|
|
92
|
|
|
|
|
|
|
# Creates a new Word2vec::Xmltow2v object.
#
# All arguments are positional and optional; an undefined argument falls back
# to the default assigned below. Order of arguments:
#   debugLog, writeLog, storeTitle, storeAbstract, quickParse, compoundifyText,
#   storeAsSentencePerLine, numOfThreads, workingDir, savePath, beginDate,
#   endDate, xmlStringToParse, textCorpusStr, fileHandle, twigHandler,
#   parsedCount, tempDate, tempStr, compoundWordAry, compoundWordBST,
#   maxCompoundWordLength, overwriteExistingFile, compoundWordCount
#
# Side effects:
#   - Resets the thread-shared job queue and counters.
#   - When writeLog is true, opens 'Xmltow2vLog.txt' (current directory) for
#     writing. If the file cannot be opened a warning is emitted and log
#     writing is disabled instead of continuing with an unopened handle.
#   - Replaces any supplied twigHandler with a freshly built XML::Twig parser
#     whose handlers depend on the quickParse setting.
#   - If an XML string was supplied it is parsed immediately.
#
# Returns the blessed object.
sub new
{
    my $class = shift;
    my $self = {
        # Private Member Variables
        _debugLog               => shift,               # Boolean (Binary): 0 = False, 1 = True
        _writeLog               => shift,               # Boolean (Binary): 0 = False, 1 = True
        _storeTitle             => shift,               # Boolean (Binary): 0 = False, 1 = True
        _storeAbstract          => shift,               # Boolean (Binary): 0 = False, 1 = True
        _quickParse             => shift,               # Boolean (Binary): 0 = False, 1 = True
        _compoundifyText        => shift,               # Boolean (Binary): 0 = False, 1 = True
        _storeAsSentencePerLine => shift,               # Boolean (Binary): 0 = False, 1 = True
        _numOfThreads           => shift,               # Integer
        _workingDir             => shift,               # String
        _savePath               => shift,               # String
        _beginDate              => shift,               # String Format: Month/Day/Year
        _endDate                => shift,               # String Format: Month/Day/Year
        _xmlStringToParse       => shift,               # String
        _textCorpusStr          => shift,               # String
        _fileHandle             => shift,               # File Handle
        _twigHandler            => shift,               # XML::Twig parser (always rebuilt below)
        _parsedCount            => shift,               # Int
        _tempDate               => shift,               # String (Temporary Placeholder)
        _tempStr                => shift,               # String (Temporary Placeholder)
        _compoundWordAry        => shift,               # Array Of Compound Words
        _compoundWordBST        => shift,               # Binary Search Tree Reference
        _maxCompoundWordLength  => shift,               # Integer
        _overwriteExistingFile  => shift,               # Integer
        _compoundWordCount      => shift,               # Integer
    };

    # Apply defaults for every argument that was not supplied
    $self->{ _debugLog }               = 0                              if !defined ( $self->{ _debugLog } );
    $self->{ _writeLog }               = 0                              if !defined ( $self->{ _writeLog } );
    $self->{ _storeTitle }             = 1                              if !defined ( $self->{ _storeTitle } );
    $self->{ _storeAbstract }          = 1                              if !defined ( $self->{ _storeAbstract } );
    $self->{ _quickParse }             = 0                              if !defined ( $self->{ _quickParse } );
    $self->{ _compoundifyText }        = 0                              if !defined ( $self->{ _compoundifyText } );
    $self->{ _storeAsSentencePerLine } = 0                              if !defined ( $self->{ _storeAsSentencePerLine } );
    $self->{ _numOfThreads }           = Sys::CpuAffinity::getNumCpus() if !defined ( $self->{ _numOfThreads } );
    $self->{ _workingDir }             = Cwd::getcwd()                  if !defined ( $self->{ _workingDir } );
    $self->{ _savePath }               = Cwd::getcwd()                  if !defined ( $self->{ _savePath } );
    $self->{ _beginDate }              = "00/00/0000"                   if !defined ( $self->{ _beginDate } );
    $self->{ _endDate }                = "99/99/9999"                   if !defined ( $self->{ _endDate } );
    $self->{ _xmlStringToParse }       = "(null)"                       if !defined ( $self->{ _xmlStringToParse } );
    $self->{ _textCorpusStr }          = ""                             if !defined ( $self->{ _textCorpusStr } );
    $self->{ _twigHandler }            = 0                              if !defined ( $self->{ _twigHandler } );
    $self->{ _parsedCount }            = 0                              if !defined ( $self->{ _parsedCount } );
    $self->{ _tempDate }               = ""                             if !defined ( $self->{ _tempDate } );
    $self->{ _tempStr }                = ""                             if !defined ( $self->{ _tempStr } );
    $self->{ _outputFileName }         = "textcorpus.txt"               if !defined ( $self->{ _outputFileName } );
    $self->{ _compoundWordAry }        = []                             if !defined ( $self->{ _compoundWordAry } );
    $self->{ _compoundWordBST }        = Word2vec::Bst->new()           if !defined ( $self->{ _compoundWordBST } );
    $self->{ _maxCompoundWordLength }  = 20                             if !defined ( $self->{ _maxCompoundWordLength } );
    $self->{ _overwriteExistingFile }  = 0                              if !defined ( $self->{ _overwriteExistingFile } );

    # Initialize Thread Safe Counting Variables
    @xmlJobQueue       = ();
    $totalJobCount     = 0;
    $finishedJobCount  = 0;
    $compoundWordCount = 0;
    $preCompWordCount  = 0;
    $postCompWordCount = 0;

    # Open the log file when log writing was requested
    if( $self->{ _writeLog } )
    {
        require IO::Handle;     # Provides the autoflush() method on lexical file handles

        if( open( $self->{ _fileHandle }, '>:utf8', 'Xmltow2vLog.txt' ) )
        {
            $self->{ _fileHandle }->autoflush( 1 );             # Auto-flushes writes to log
        }
        else
        {
            # Do not keep a handle that was never opened; fall back to "no log file"
            warn( "Word2vec::Xmltow2v::new - Warning: Unable To Open 'Xmltow2vLog.txt': $! - Log Writing Disabled\n" );
            $self->{ _fileHandle } = undef;
            $self->{ _writeLog }   = 0;
        }
    }

    # Declare XML parser
    # Quick Parse Method(s): Much Faster With Less Hardware Requirements and Accuracy
    if( $self->{ _quickParse } == 1 )
    {
        $self->{ _twigHandler } = XML::Twig->new(
            twig_handlers =>
            {
                'DateCreated'   => sub { _QuickParseDateCreated( @_, $self ) },
                'Journal'       => sub { _QuickParseJournal( @_, $self ) },
                'Article'       => sub { _QuickParseArticle( @_, $self ) },
                'OtherAbstract' => sub { _QuickParseOtherAbstract( @_, $self ) },
            },
        );
    }
    # Default Parse Method: Much Slower With High RAM Requirements and Better Accuracy
    else
    {
        $self->{ _twigHandler } = XML::Twig->new(
            twig_handlers =>
            {
                'MedlineCitationSet' => sub { _ParseMedlineCitationSet( @_, $self ) },
            },
        );
    }

    bless $self, $class;

    $self->WriteLog( "New - Debug On" );
    $self->WriteLog( "New - QuickParse Enabled" ) if( $self->{ _quickParse } == 1 );

    # Parse the XML string passed to the constructor, if any
    if( $self->{ _xmlStringToParse } ne "(null)" )
    {
        if( $self->_CheckForNullData ( $self->{ _xmlStringToParse } ) )
        {
            $self->WriteLog( "New - Error: XML String is null" );
        }
        else
        {
            $self->{ _twigHandler }->parse( $self->{ _xmlStringToParse } );
        }
    }
    else
    {
        $self->WriteLog( "New - No XML String Argument To Parse" );
    }

    return $self;
}
212
|
|
|
|
|
|
|
|
213
|
|
|
|
|
|
|
|
214
|
|
|
|
|
|
|
###################################################################################### |
215
|
|
|
|
|
|
|
# DESTROY |
216
|
|
|
|
|
|
|
###################################################################################### |
217
|
|
|
|
|
|
|
|
218
|
|
|
|
|
|
|
# Object destructor: closes the log file handle stored in the object, if any.
sub DESTROY
{
    my $self      = shift;
    my $logHandle = $self->{ _fileHandle };

    # Nothing to close when no handle was stored
    close( $logHandle ) if( $logHandle );
}
225
|
|
|
|
|
|
|
|
226
|
|
|
|
|
|
|
|
227
|
|
|
|
|
|
|
###################################################################################### |
228
|
|
|
|
|
|
|
# Module Functions |
229
|
|
|
|
|
|
|
###################################################################################### |
230
|
|
|
|
|
|
|
|
231
|
|
|
|
|
|
|
# Converts Medline XML into a plain text corpus and saves it to the save path.
#
# Parameters:
#   $dir - Path of a single XML file, or of a directory holding ".xml" / ".gz"
#          files. Defaults to the object's working directory when undefined.
#
# A single file is read, parsed and saved in the calling thread. For a
# directory, every matching file name is placed on the shared job queue and
# GetNumOfThreads() worker threads running _ThreadedConvert() are started
# and joined.
#
# Returns -1 when the date check fails, when both StoreTitle and StoreAbstract
# are 0, when the single file could not be read, or when the directory cannot
# be opened. Returns 0 otherwise (including when $dir is neither a file nor a
# directory, which is only logged).
sub ConvertMedlineXMLToW2V
{
    my ( $self, $dir ) = @_;
    $dir = $self->GetWorkingDir() if !defined ( $dir );

    my $result = $self->_DateCheck();

    # Check(s)
    if( $result == -1 )
    {
        $self->WriteLog( "ConvertMedlineXMLToW2v - Error: Date Check Failed" );
        return -1;
    }

    if( $self->GetStoreTitle() == 0 && $self->GetStoreAbstract() == 0 )
    {
        $self->WriteLog( "ConvertMedlineXMLToW2V - Error: StoreTitle and StoreAbstract Variables Set To 0 - No Data Will Be Extracted" );
        return -1;
    }

    # Check To See If Overwrite Existing File Option Is Enabled And Overwrite
    if( $self->GetOverwriteExistingFile() == 1 )
    {
        $self->WriteLog( "ConvertMedlineXMLToW2V - Overwrite Existing File Option Enabled" );

        if( -e $self->GetSavePath() )
        {
            $self->WriteLog( "ConvertMedlineXMLToW2V - Existing File Found - Removing Existing File" );
            unlink( $self->GetSavePath() );
        }
    }

    my $isFileOrDir = $self->IsFileOrDirectory( $dir );

    # Process File In Working Directory
    if( $isFileOrDir eq "file" )
    {
        $self->SetXMLStringToParse( $self->_ReadXMLDataFromFile( $dir ) );
        return -1 if ( $self->GetXMLStringToParse() eq "(null)" );

        $self->WriteLog( "ConvertMedlineXMLToW2V - Parsing XML File: $dir" );
        $self->_ParseXMLString( $self->GetXMLStringToParse() );
        $self->_SaveTextCorpusToFile( $self->GetSavePath() );
        $self->WriteLog( "ConvertMedlineXMLToW2V - Parsing Complete" );
    }
    # Process All Files In Directory
    elsif( $isFileOrDir eq "dir" )
    {
        $self->WriteLog( "ConvertMedlineXMLToW2V - No File Specified/Using Directory Option" );
        $self->WriteLog( "ConvertMedlineXMLToW2V - Obtaining File(s) In Directory" );

        # Read File Name(s) From Specified Directory
        my $dirHandle;

        if( !opendir( $dirHandle, "$dir" ) )
        {
            $self->WriteLog( "ConvertMedlineXMLToW2V - Error: Can't open $dir: $!" );
            return -1;
        }

        for my $file ( readdir( $dirHandle ) )
        {
            # Queue each name containing ".xml" or ".gz" exactly once. Two separate
            # pushes would queue a name such as "a.gz.xml" twice.
            push( @xmlJobQueue, $file ) if ( index( $file, ".xml" ) != -1 || index( $file, ".gz" ) != -1 );
        }

        closedir $dirHandle;
        undef $dirHandle;

        # Set Total Job Count And Restart The Progress Counter For This Run
        $totalJobCount    = @xmlJobQueue;
        $finishedJobCount = 0;

        $self->WriteLog( "ConvertMedlineXMLToW2V - Parsing $totalJobCount File(s)" );
        $self->WriteLog( "ConvertMedlineXMLToW2V - Starting Worker Thread(s) / Compiling Text Corpus" );

        # Start Thread(s)
        for( my $i = 0; $i < $self->GetNumOfThreads(); $i++ )
        {
            my $thread = threads->create( "_ThreadedConvert", $self, $dir );
        }

        # Join All Running Threads Prior To Termination
        my @threadAry = threads->list();

        for my $thread ( @threadAry )
        {
            $thread->join() if ( $thread->is_running() || $thread->is_joinable() );
        }

        # Summary goes to STDOUT only when debug logging is off
        if( $self->GetDebugLog() == 0 )
        {
            print( "Parsed $finishedJobCount of $totalJobCount Files\n" );
            print( "Number Of Compound Words: $compoundWordCount\n" );
            print( "Number Of Words (Before Compounding): $preCompWordCount\n" );
            print( "Number Of Words (After Compounding): $postCompWordCount\n" );
        }

        $self->WriteLog( "ConvertMedlineXMLToW2V - Parsed $finishedJobCount of $totalJobCount Files" );
        $self->WriteLog( "ConvertMedlineXMLToW2V - Number Of Compound Words: $compoundWordCount" );
        $self->WriteLog( "ConvertMedlineXMLToW2V - Number Of Words (Before Compounding): $preCompWordCount" );
        $self->WriteLog( "ConvertMedlineXMLToW2V - Number Of Words (After Compounding): $postCompWordCount" );
        $self->WriteLog( "ConvertMedlineXMLToW2V - Parsing Complete" );

        # Clean Up
        # These must be method calls: without the invocant the object's temporary
        # string and text corpus are never cleared.
        $self->ClearTempStr();
        $self->ClearTextCorpusStr();
        $totalJobCount     = 0;
        $finishedJobCount  = 0;
        $preCompWordCount  = 0;
        $compoundWordCount = 0;
        $postCompWordCount = 0;
    }
    else
    {
        $self->WriteLog( "ConvertMedlineXMLToW2V - Unknown Parameter Type: Not File Or Directory" );
    }

    return 0;
}
329
|
|
|
|
|
|
|
|
330
|
|
|
|
|
|
|
# Worker thread body started by ConvertMedlineXMLToW2V().
#
# Repeatedly takes one file name from the shared job queue (while holding
# $queueLock), reads and parses "$dir/$file", appends the resulting text corpus
# to the save path, and clears the in-memory corpus. Stops when the queue is
# empty.
#
# Parameters:
#   $dir - Directory containing the queued file names.
#
# Returns 1 when the queue was already empty at start-up (thread not needed),
# 0 after the queue has been drained.
sub _ThreadedConvert
{
    my ( $self, $dir ) = @_;

    my $tid = threads->tid();

    if( @xmlJobQueue == 0 )
    {
        $self->WriteLog( "_ThreadedConvert - Warning: Requested Thread $tid Not Needed/Threads Exceed Work Load - Terminating Thread" );
        return 1;
    }

    $self->WriteLog( "_ThreadedConvert - Starting Thread: $tid" );
    $self->WriteLog( "_ThreadedConvert - Thread $tid Parsing File(s) In Job Queue" );

    while( 1 )
    {
        my $file;

        # Prevent Other Threads From Reading Shared Job Queue (Array) At The Same Time
        {
            lock( $queueLock );

            # Fetch A File Name To Parse. shift() removes the first queued name and
            # avoids delete() on array elements, which leaves holes in the array.
            $file = shift( @xmlJobQueue );

            # Increment Parsed File Counter
            $finishedJobCount++ if defined( $file );
        }

        # Exit The Main Loop Once The Queue Is Empty
        last if !defined( $file );

        print( "Thread $tid: Parsing $file\n" ) if ( !$self->GetDebugLog() );
        $self->WriteLog( "_ThreadedConvert - Thread $tid: Processing File: $file" );
        $self->SetXMLStringToParse( $self->_ReadXMLDataFromFile( "$dir/$file" ) );
        $self->WriteLog( "_ThreadedConvert - Thread $tid: Parsing XML Data" );
        $self->_ParseXMLString( $self->GetXMLStringToParse() );
        $self->WriteLog( "_ThreadedConvert - Thread $tid: Parsed $file" );
        print( "Thread $tid: Parsed $file\n" ) if ( !$self->GetDebugLog() );
        $self->_SaveTextCorpusToFile( $self->GetSavePath(), 1 );
        $self->ClearTextCorpusStr();
    }

    $self->WriteLog( "_ThreadedConvert - Thread $tid Finished - Terminating" );

    return 0;
}
391
|
|
|
|
|
|
|
|
392
|
|
|
|
|
|
|
# Parses an XML string with the object's XML::Twig parser.
#
# Parameters:
#   $string - XML data; an undefined value is treated as an empty string.
#
# Returns -1 when _CheckParseRequirements() rejects the request or when
# _CheckForNullData() reports a "(null)" string, otherwise 0 after parsing.
sub _ParseXMLString
{
    my ( $self, $string ) = @_;
    $string = "" if !defined ( $string );

    # The requirement check returns a numeric code, so compare numerically
    return -1 if ( $self->_CheckParseRequirements( $string ) == -1 );

    # NOTE: Stripping the XML declaration via _RemoveXMLVersion() is currently disabled
    #$self->_RemoveXMLVersion( \$string );

    if( $self->_CheckForNullData( $string ) )
    {
        $self->WriteLog( "_ParseXMLString - Cannot Parse (null) string" );
        return -1;
    }

    $self->{ _twigHandler }->parse( $string );
    $self->WriteLog( "_ParseXMLString: Released PubmedArticle from memory" );

    # Print how many entries were parsed
    $self->WriteLog( "_ParseXMLString: Parsed " . $self->GetParsedCount() . " entries" );

    return 0;
}
421
|
|
|
|
|
|
|
|
422
|
|
|
|
|
|
|
# Verifies that there is something to parse and a parser to parse it with.
#
# Parameters:
#   $string - XML data about to be parsed (undefined counts as empty).
#
# Returns -1 (after logging the reason) when the string is empty or the twig
# handler equals 0, otherwise 0.
sub _CheckParseRequirements
{
    my $self   = shift;
    my $string = shift;
    $string = "" unless defined( $string );

    # Guard: empty input
    if( $string eq "" )
    {
        $self->WriteLog( "_CheckParseRequirements - Error: Nothing To Parse" );
        return -1;
    }

    # Guard: parser was never created
    if( $self->GetTwigHandler() == 0 )
    {
        $self->WriteLog( "_CheckParseRequirements - Error: Unable To Parse XML Data/TwigHandler = (null)" );
        return -1;
    }

    return 0;
}
440
|
|
|
|
|
|
|
|
441
|
|
|
|
|
|
|
# Checks to see if Medline XML data in memory is a null string |
442
|
|
|
|
|
|
|
# Reports whether the given data is the "(null)" placeholder string.
#
# Parameters:
#   $temp - String to inspect.
#
# Returns 1 when $temp begins with "(null)", otherwise 0 (also for undef).
#
# The position returned by index() is compared directly. Assigning the result
# of "index(...) != -1" to a variable stores the boolean rather than the
# position, which makes a later "== 0" test unreachable.
sub _CheckForNullData
{
    my ( $self, $temp ) = @_;
    my $nullStr = "(null)";

    return 0 if !defined( $temp );

    # Return True only when the placeholder is found at the very start
    return ( index( $temp, $nullStr ) == 0 ) ? 1 : 0;
}
456
|
|
|
|
|
|
|
|
457
|
|
|
|
|
|
|
# Removes the XML Version string prior to parsing the XML string |
458
|
|
|
|
|
|
|
# Removes the XML declaration and DOCTYPE lines from an XML string, in place.
#
# Parameters:
#   $temp - Reference to the scalar holding the XML text. The referenced
#           string is replaced by the filtered text.
#
# Every line containing the XML version marker or "!DOCTYPE" is dropped. The
# remaining lines are re-joined with a "\n" after each line.
sub _RemoveXMLVersion
{
    my ( $self, $temp ) = @_;

    # Checking For XML Version
    # NOTE(review): this literal was truncated in the reviewed copy of the file;
    # '<?xml version=' is a reconstruction - confirm against the original source.
    my $xmlVersion = '<?xml version=';
    my $docType    = '!DOCTYPE';

    my $newXMLString = "";

    foreach my $line ( split( /\n/, ${$temp} ) )
    {
        # Skip the declaration / DOCTYPE lines, keep everything else
        next if ( index( $line, $xmlVersion ) != -1 || index( $line, $docType ) != -1 );

        $newXMLString .= ( $line . "\n" );
    }

    ${$temp} = $newXMLString;
}
479
|
|
|
|
|
|
|
|
480
|
|
|
|
|
|
|
# XML::Twig handler for the 'MedlineCitationSet' element (default parse mode).
#
# Parameters:
#   $twigSelf - The XML::Twig parser invoking the handler (unused).
#   $root     - The MedlineCitationSet element.
#   $self     - The Word2vec::Xmltow2v object (appended by the closure in new()).
#
# Each child element is parsed into the object's temporary string/date. When
# the date lies in the configured range the string (compoundified first, if
# that option is on) is appended to the text corpus. The placeholders are
# cleared and the child is purged before moving on.
sub _ParseMedlineCitationSet
{
    my ( $twigSelf, $root, $self ) = @_;

    foreach my $citation ( $root->children() )
    {
        # Parse XML Data
        my $wasParsed = $self->_ParseMedlineArticle( $citation );

        # Compoundify String If Option Is Enabled
        if( $self->GetCompoundifyText() == 1 && ( $self->IsDateInSpecifiedRange( $self->GetTempDate(), $self->GetBeginDate(), $self->GetEndDate() ) == 1 ) )
        {
            my $compounded = $self->CompoundifyString( lc( $self->GetTempStr() ) );

            # Append Article Data To Text Corpus
            $self->AppendStrToTextCorpus( $compounded );
        }
        elsif( $self->IsDateInSpecifiedRange( $self->GetTempDate(), $self->GetBeginDate(), $self->GetEndDate() ) == 1 )
        {
            # Append Article Data To Text Corpus
            $self->AppendStrToTextCorpus( $self->GetTempStr() );
        }

        # Clear string placeholders
        $self->ClearTempStr();
        $self->ClearTempDate();

        # Increment Parsed Counter
        $self->{ _parsedCount }++ if ( $wasParsed == 1 );

        # Release the stored XML section from memory (not fully tested)
        $citation->purge() if defined( $citation );
    }

    # Release the stored XML section from memory (not fully tested)
    $root->purge();
    $self->WriteLog( "_ParseMedlineCitationSet: Released PubmedArticleSet group from memory" );
}
524
|
|
|
|
|
|
|
|
525
|
|
|
|
|
|
|
# Dispatches each child element of one citation to the matching parser.
#
# Parameters:
#   $medlineArticle - Element whose children are inspected.
#
# 'Article' and 'OtherAbstract' children are handed to their parsers,
# 'DateCreated' is parsed and stored as the temporary date, and any other tag
# is only logged. Every child is purged after it has been handled.
#
# Always returns 1.
sub _ParseMedlineArticle
{
    my ( $self, $medlineArticle ) = @_;

    for my $section ( $medlineArticle->children() )
    {
        my $tagName = $section->tag();

        if( $tagName eq "Article" )
        {
            $self->_ParseArticle( $section );
        }
        elsif( $tagName eq "DateCreated" )
        {
            $self->SetTempDate( $self->_ParseDateCreated( $section ) );
        }
        elsif( $tagName eq "OtherAbstract" )
        {
            $self->_ParseOtherAbstract( $section );
        }
        else
        {
            $self->WriteLog( "_ParseMedlineArticle - (New Data Found) - Tag: " . $tagName . ", Field: " . $section->field() );
        }

        # Release article from memory
        $section->purge();
    }

    return 1;
}
557
|
|
|
|
|
|
|
|
558
|
|
|
|
|
|
|
# Builds a "Month/Day/Year" string from the children of a DateCreated element.
#
# Parameters:
#   $article - Element whose 'Day', 'Month' and 'Year' children hold the date.
#
# A component that is missing, undefined or empty falls back to "00" (day,
# month) or "0000" (year), so the result always has three components.
#
# Returns the date string, e.g. "06/16/2016".
sub _ParseDateCreated
{
    my ( $self, $article ) = @_;

    my ( $month, $day, $year );

    for my $date ( $article->children() )
    {
        my $tagName = $date->tag();

        $day   = $date->field() if ( $tagName eq "Day" );
        $month = $date->field() if ( $tagName eq "Month" );
        $year  = $date->field() if ( $tagName eq "Year" );
    }

    # Check(s): substitute placeholders for absent or empty components
    $day   = "00"   if ( !defined ( $day )   || $day   eq "" );
    $month = "00"   if ( !defined ( $month ) || $month eq "" );
    $year  = "0000" if ( !defined ( $year )  || $year  eq "" );

    $self->WriteLog( "_ParseDateCreated - Month: $month, Day: $day, Year: $year " );

    return "$month/$day/$year";
}
584
|
|
|
|
|
|
|
|
585
|
|
|
|
|
|
|
# Extracts title and abstract text from the children of an Article element.
#
# Parameters:
#   $article - The Article element.
#
# 'Journal' children are delegated to _ParseJournal(). 'ArticleTitle' and
# 'Abstract' text is passed through Text::Unidecode, chomped and appended to
# the temporary string when the matching StoreTitle / StoreAbstract option
# equals 1. Every other tag is only logged.
sub _ParseArticle
{
    my ( $self, $article ) = @_;

    for my $node ( $article->children() )
    {
        my $tagName = $node->tag();

        if( $tagName eq "Journal" )
        {
            $self->_ParseJournal( $node );
            next;
        }

        if( $tagName eq "ArticleTitle" || $tagName eq "Abstract" )
        {
            my $text = Text::Unidecode::unidecode( $node->field() );
            chomp( $text );

            # Store String: titles obey StoreTitle, abstracts obey StoreAbstract
            my $storeFlag = ( $tagName eq "ArticleTitle" ) ? $self->GetStoreTitle() : $self->GetStoreAbstract();
            $self->AppendToTempStr( $text ) if ( $storeFlag == 1 );

            $self->WriteLog( "_ParseArticle - Tag: " . $tagName . ", Field: " . $text );
            next;
        }

        $self->WriteLog( "_ParseArticle - (New Tag Found) - Tag: " . $tagName . ", Field: " . $node->field() );
    }
}
623
|
|
|
|
|
|
|
|
624
|
|
|
|
|
|
|
# Extracts the journal title from the children of a Journal element.
#
# Parameters:
#   $journalRoot - The Journal element.
#
# 'Title' text is passed through Text::Unidecode, chomped and appended to the
# temporary string when StoreTitle equals 1. Every other tag is only logged.
sub _ParseJournal
{
    my ( $self, $journalRoot ) = @_;

    for my $node ( $journalRoot->children() )
    {
        my $tagName = $node->tag();

        # Anything but the title is merely reported
        if( $tagName ne "Title" )
        {
            $self->WriteLog( "_ParseJournal - (New Tag Found) - Tag: " . $tagName . ", Field: " . $node->field() );
            next;
        }

        my $text = Text::Unidecode::unidecode( $node->field() );
        chomp( $text );

        # Store String
        $self->AppendToTempStr( $text ) if ( $self->GetStoreTitle() == 1 );

        $self->WriteLog( "_ParseJournal - Tag: " . $tagName . ", Field: " . $text );
    }
}
648
|
|
|
|
|
|
|
|
649
|
|
|
|
|
|
|
# Extracts abstract text from the children of an OtherAbstract element.
#
# Parameters:
#   $abstractRoot - The OtherAbstract element.
#
# 'AbstractText' text is passed through Text::Unidecode, chomped and appended
# to the temporary string when StoreAbstract equals 1. Every other tag is only
# logged.
sub _ParseOtherAbstract
{
    my ( $self, $abstractRoot ) = @_;

    for my $node ( $abstractRoot->children() )
    {
        my $tagName = $node->tag();

        # Anything but the abstract text is merely reported
        if( $tagName ne "AbstractText" )
        {
            $self->WriteLog( "_ParseOtherAbstract - (New Tag Found) - Tag: " . $tagName . ", Field: " . $node->field() );
            next;
        }

        my $text = Text::Unidecode::unidecode( $node->field() );
        chomp( $text );

        # Store String
        $self->AppendToTempStr( $text ) if ( $self->GetStoreAbstract() == 1 );

        $self->WriteLog( "_ParseOtherAbstract - Tag: " . $tagName . ", Field: " . $text );
    }
}
673
|
|
|
|
|
|
|
|
674
|
|
|
|
|
|
|
# Extracts the Month/Day/Year children of a date element and stores them on
# the object as "Month/Day/Year" via SetTempDate(). The previously stored
# date is cleared first and the element is purged afterwards.
#
# A component whose tag is missing (or whose field is undefined/empty) falls
# back to "00" ( month, day ) or "0000" ( year ), so the stored string always
# has three non-empty parts.
#
# Parameters : $twigSelf - Unused Here (presumably the XML parser object passing the element - TODO confirm)
#              $article  - Element Object Providing children() And purge()
#              $self     - This Module's Object
sub _QuickParseDateCreated
{
    my ( $twigSelf, $article, $self ) = @_;

    my $month = undef;
    my $day   = undef;
    my $year  = undef;

    # Clear Old Date
    $self->ClearTempDate();

    my @dateAry = $article->children();

    for my $date ( @dateAry )
    {
        $day   = $date->field() if ( $date->tag() eq "Day" );
        $month = $date->field() if ( $date->tag() eq "Month" );
        $year  = $date->field() if ( $date->tag() eq "Year" );
    }

    # Check(s) - Substitute Defaults For Missing Or Empty Components
    $day   = "00"   if !defined ( $day )   || $day   eq "";
    $month = "00"   if !defined ( $month ) || $month eq "";
    $year  = "0000" if !defined ( $year )  || $year  eq "";

    $self->WriteLog( "_QuickParseDateCreated - Month: $month, Day: $day, Year: $year " );

    $self->SetTempDate( "$month/$day/$year" );

    # Free Memory
    $article->purge();
}
706
|
|
|
|
|
|
|
|
707
|
|
|
|
|
|
|
# Handles one journal element: collects its "Title" children into the
# temporary string ( when title storage is enabled ), then - if the stored
# temp date lies inside the configured begin/end range - appends the
# collected text to the text corpus, compoundified and lower-cased when
# GetCompoundifyText() == 1. The temporary string is always cleared and the
# element purged at the end.
#
# Parameters : $twigSelf    - Unused Here
#              $journalRoot - Element Object Providing children() And purge()
#              $self        - This Module's Object
sub _QuickParseJournal
{
    my ( $twigSelf, $journalRoot, $self ) = @_;

    foreach my $node ( $journalRoot->children() )
    {
        # Unhandled Tag - Report It And Move On
        if( $node->tag() ne "Title" )
        {
            $self->WriteLog( "_QuickParseJournal - (New Tag Found) - Tag: " . $node->tag() . ", Field: " . $node->field() );
            next;
        }

        my $title = Text::Unidecode::unidecode( $node->field() );
        chomp( $title );

        # Store String
        $self->AppendToTempStr( $title ) if ( $self->GetStoreTitle() == 1 );

        $self->WriteLog( "_QuickParseJournal - Tag: " . $node->tag() . ", Field: " . $title );
    }

    # Date Range Test - Evaluated Lazily So It Runs Exactly When Each Branch Below Asks For It
    my $dateIsInRange = sub
    {
        return $self->IsDateInSpecifiedRange( $self->GetTempDate(), $self->GetBeginDate(), $self->GetEndDate() ) == 1;
    };

    # Compoundify String If Option Is Enabled
    if( $self->GetCompoundifyText() == 1 && $dateIsInRange->() )
    {
        my $compounded = $self->CompoundifyString( lc( $self->GetTempStr() ) );

        # Append Article Data To Text Corpus
        $self->AppendStrToTextCorpus( $compounded );
    }
    elsif( $dateIsInRange->() )
    {
        # Append Article Data To Text Corpus
        $self->AppendStrToTextCorpus( $self->GetTempStr() );
    }

    # Clear string placeholders
    $self->ClearTempStr();

    # Free Memory
    $journalRoot->purge();
}
751
|
|
|
|
|
|
|
|
752
|
|
|
|
|
|
|
# Handles one article element: "ArticleTitle" children are stored when title
# storage is enabled, "Abstract" children when abstract storage is enabled;
# both are transliterated to ASCII, chomped and logged. Other tags are only
# logged. If the stored temp date lies inside the configured begin/end range
# the collected text is appended to the text corpus ( compoundified and
# lower-cased when GetCompoundifyText() == 1 ). The temporary string is
# always cleared and the element purged at the end.
#
# Parameters : $twigSelf - Unused Here
#              $article  - Element Object Providing children() And purge()
#              $self     - This Module's Object
sub _QuickParseArticle
{
    my ( $twigSelf, $article, $self ) = @_;

    foreach my $node ( $article->children() )
    {
        my $tagName = $node->tag();
        my $isTitle = ( $tagName eq "ArticleTitle" );

        # Unhandled Tag - Report It And Move On
        if( !$isTitle && $tagName ne "Abstract" )
        {
            $self->WriteLog( "_QuickParseArticle - (New Tag Found) - Tag: " . $node->tag() . ", Field: " . $node->field() );
            next;
        }

        my $text = Text::Unidecode::unidecode( $node->field() );
        chomp( $text );

        # Store String - Titles And Abstracts Are Governed By Separate Options
        my $storeFlag = $isTitle ? $self->GetStoreTitle() : $self->GetStoreAbstract();
        $self->AppendToTempStr( $text ) if ( $storeFlag == 1 );

        $self->WriteLog( "_QuickParseArticle - Tag: " . $node->tag() . ", Field: " . $text );
    }

    # Date Range Test - Evaluated Lazily So It Runs Exactly When Each Branch Below Asks For It
    my $dateIsInRange = sub
    {
        return $self->IsDateInSpecifiedRange( $self->GetTempDate(), $self->GetBeginDate(), $self->GetEndDate() ) == 1;
    };

    # Compoundify String If Option Is Enabled
    if( $self->GetCompoundifyText() == 1 && $dateIsInRange->() )
    {
        my $compounded = $self->CompoundifyString( lc( $self->GetTempStr() ) );

        # Append Article Data To Text Corpus
        $self->AppendStrToTextCorpus( $compounded );
    }
    elsif( $dateIsInRange->() )
    {
        # Append Article Data To Text Corpus
        $self->AppendStrToTextCorpus( $self->GetTempStr() );
    }

    # Clear string placeholders
    $self->ClearTempStr();

    # Free Memory
    $article->purge();
}
806
|
|
|
|
|
|
|
|
807
|
|
|
|
|
|
|
# Handles one "other abstract" element: collects its "AbstractText" children
# into the temporary string ( when abstract storage is enabled ), then - if
# the stored temp date lies inside the configured begin/end range - appends
# the collected text to the text corpus, compoundified and lower-cased when
# GetCompoundifyText() == 1. The temporary string is always cleared and the
# element purged at the end.
#
# Parameters : $twigSelf     - Unused Here
#              $abstractRoot - Element Object Providing children() And purge()
#              $self         - This Module's Object
sub _QuickParseOtherAbstract
{
    my ( $twigSelf, $abstractRoot, $self ) = @_;

    foreach my $node ( $abstractRoot->children() )
    {
        # Unhandled Tag - Report It And Move On
        if( $node->tag() ne "AbstractText" )
        {
            $self->WriteLog( "_QuickParseOtherAbstract - (New Tag Found) - Tag: " . $node->tag() . ", Field: " . $node->field() );
            next;
        }

        my $abstractText = Text::Unidecode::unidecode( $node->field() );
        chomp( $abstractText );

        # Store String
        $self->AppendToTempStr( $abstractText ) if ( $self->GetStoreAbstract() == 1 );

        $self->WriteLog( "_QuickParseOtherAbstract - Tag: " . $node->tag() . ", Field: " . $abstractText );
    }

    # Date Range Test - Evaluated Lazily So It Runs Exactly When Each Branch Below Asks For It
    my $dateIsInRange = sub
    {
        return $self->IsDateInSpecifiedRange( $self->GetTempDate(), $self->GetBeginDate(), $self->GetEndDate() ) == 1;
    };

    # Compoundify String If Option Is Enabled
    if( $self->GetCompoundifyText() == 1 && $dateIsInRange->() )
    {
        my $compounded = $self->CompoundifyString( lc( $self->GetTempStr() ) );

        # Append Article Data To Text Corpus
        $self->AppendStrToTextCorpus( $compounded );
    }
    elsif( $dateIsInRange->() )
    {
        # Append Article Data To Text Corpus
        $self->AppendStrToTextCorpus( $self->GetTempStr() );
    }

    # Clear string placeholders
    $self->ClearTempStr();

    # Free Memory
    $abstractRoot->purge();
}
851
|
|
|
|
|
|
|
|
852
|
|
|
|
|
|
|
# Builds the compound word binary search tree from the compound word array
# currently held by the object, installs the resulting root node on the tree
# object and then clears the compound word array.
#
# Returns : 0 On Success, -1 When The Compound Word Array Is Empty
sub CreateCompoundWordBST
{
    my ( $self ) = @_;

    $self->WriteLog( "CreateCompoundWordBST - Creating Binary Search Tree From Compound Word Array" );

    my $tree      = $self->GetCompoundWordBST();
    my @wordList  = $self->GetCompoundWordAry();
    my $wordCount = scalar( @wordList );

    # Check(s)
    if( $wordCount == 0 )
    {
        $self->WriteLog( "CreateCompoundWordBST - Error: Cannot Create BST / Compound Word Array Is Empty - Have You Read The Compound Word File To Memory?" );
        return -1;
    }

    # Build The Tree Over The Whole Index Range Of The Word List
    my $treeRoot = $tree->CreateBST( \@wordList, 0, $wordCount - 1, undef );
    $tree->SetRootNode( $treeRoot );

    # Clean-Up
    $self->ClearCompoundWordAry();

    $self->WriteLog( "CreateCompoundWordBST - Compound Word Binary Search Tree Created" );

    return 0;
}
876
|
|
|
|
|
|
|
|
877
|
|
|
|
|
|
|
# Rewrites a whitespace separated string so that every known compound word
# ( as located by _CompoundifySearch ) has its words joined with '_'.
# Words that do not start a compound word are copied through unchanged.
# Every emitted token is followed by a single space, so a non-empty result
# ends with a trailing space.
#
# Also increments the file-level counters $compoundWordCount ( once per
# compound word found ) and $postCompWordCount ( once per emitted token );
# both are declared outside this sub.
#
# Parameters : $str - String To Compoundify
# Returns    : Compoundified String, Or "(null)" When $str Is Undefined
sub CompoundifyString
{
    my ( $self, $str ) = @_;

    return "(null)" if !defined ( $str );

    $self->WriteLog( "CompoundifyString - Compoundifying String - $str" );

    my @strAry                = split( ' ', $str );
    my $arySize               = @strAry;
    my $maxCompoundWordLength = $self->GetMaxCompoundWordLength();

    $str = "";

    for( my $i = 0; $i < $arySize; $i++ )
    {
        # Search Window: Current Word Plus Up To $maxCompoundWordLength Following Words.
        # Clamp To The Last Valid Index: A Window Reaching Past The End Of The Array
        # Would Add An Undefined Word, Which Puts A Trailing Space On The Search
        # String And Makes A Compound Word At The End Of The Text Fail To Match.
        my $lastIndex = $i + $maxCompoundWordLength;
        $lastIndex = $arySize - 1 if ( $lastIndex > $arySize - 1 );
        my @tempAry = @strAry[$i..$lastIndex];

        my $node = $self->_CompoundifySearch( \@tempAry, undef, $strAry[$i], 0 );

        # Compound Word(s) Found
        if( defined( $node ) )
        {
            # Split Compound Word Data And Set Next Index After Located Compound Word(s)
            my @nodeDataAry = split( ' ', $node->data );
            $i += @nodeDataAry - 1;

            # Add Compound Words To The Return String
            $str .= join( '_', @nodeDataAry ) . " ";

            # Increment Compound Word Counter
            $compoundWordCount++;
        }
        # No Compound Word(s) Found
        else
        {
            # Add Single Word At Array Index To Return String
            $str .= $strAry[$i] . " ";
        }

        # Increment Word Counter
        $postCompWordCount++;
    }

    $self->WriteLog( "CompoundifyString - Compounded String - $str" );

    return $str;
}
935
|
|
|
|
|
|
|
|
936
|
|
|
|
|
|
|
# Recursive helper for CompoundifyString. Starting from $searchStr, it keeps
# appending the next word of the window while the tree's "contains" search
# still finds something, remembering the last exact match as $oldNode, and
# finally returns the node whose data equals the words consumed ( or undef
# when there is no such node ).
#
# Parameters : $strAryRef - Array Reference Of Candidate Words ( Search Window )
#              $oldNode   - Last Exact Match Found So Far ( Or undef )
#              $searchStr - Space Separated Words Accumulated So Far
#              $index     - Index Into The Window Of The Last Word In $searchStr
# Returns    : Matching Node Object, Or undef
sub _CompoundifySearch
{
    my ( $self, $strAryRef, $oldNode, $searchStr, $index ) = @_;

    # Check(s)
    return undef unless defined( $strAryRef ) && defined( $searchStr ) && defined( $index );

    my $wordCount = scalar( @{ $strAryRef } );
    my $tree      = $self->GetCompoundWordBST();

    my $match = $tree->BSTContainsSearch( $tree->GetRootNode(), $searchStr );

    if( defined( $match ) && $index < $wordCount )
    {
        $index++;

        # Make Sure Returned Node Data Is Equal With Search String Or Return Old Node
        $match = $tree->BSTExactSearch( $tree->GetRootNode(), $searchStr );
        $match = $oldNode if !defined( $match );

        # More Words Available - Extend The Search String And Recurse
        if( $index < $wordCount )
        {
            $searchStr .= ( " " . $strAryRef->[$index] );
            return $self->_CompoundifySearch( $strAryRef, $match, $searchStr, $index );
        }
    }

    # Post Check(s)
    $match = undef if defined( $match ) && ( $match->data ne $searchStr );

    # Keep The Old Node Only When Its Words Are A Leading Run Of The Search String
    if( defined( $oldNode ) )
    {
        my @searchWords = split( ' ', $searchStr );
        my @nodeWords   = split( ' ', $oldNode->data );

        if( @searchWords > @nodeWords )
        {
            my $searchPrefix = join( ' ', @searchWords[ 0 .. $#nodeWords ] );
            my $nodeText     = join( ' ', @nodeWords );
            $oldNode = undef if $searchPrefix ne $nodeText;
        }
        elsif( @searchWords == @nodeWords )
        {
            $oldNode = undef if $oldNode->data ne $searchStr;
        }
        else
        {
            $oldNode = undef;
        }
    }

    # Bug Fix: If Search Word Found At First Array Index And Second Word Not Found.
    #          Prevent Invalid Data From Being Returned.
    return undef if !defined( $match ) && $index == 1;

    return defined( $match ) ? $match : $oldNode;
}
1000
|
|
|
|
|
|
|
|
1001
|
|
|
|
|
|
|
# Reads a compound word list ( one entry per line ) from a UTF-8 text file,
# cleans every line with RemoveSpecialCharactersFromString(), sorts the
# result and stores it with SetCompoundWordAry(). Compoundify Text is
# switched on when at least one line was read and off otherwise.
#
# Parameters : $fileDir                      - Path Of The Compound Word File
#              $autoSetMaxCompoundWordLength - When Defined, The Max Compound Word Length Is
#                                              Reset To 0 And Raised To The Largest Word Count Seen
# Returns    : 0 On Success, -1 When The Path Is Undefined, Does Not Exist, Cannot Be
#              Opened, Or The Max Compound Word Length Exceeds 100
sub ReadCompoundWordDataFromFile
{
    my ( $self, $fileDir, $autoSetMaxCompoundWordLength ) = @_;

    $self->WriteLog( "ReadCompoundWordDataFromFile - Error: Directory Not Defined" ) if !defined ( $fileDir );
    return -1 if !defined ( $fileDir );

    $self->WriteLog( "ReadCompoundWordDataFromFile - Error: Directory/File Does Not Exist" ) if !( -e "$fileDir" );
    return -1 if !( -e "$fileDir" );

    $self->WriteLog( "ReadCompoundWordDataFromFile - Reading Compound Word File: \"$fileDir\"" );

    my @dataAry = ();

    # Read Data From File To Memory - Existence Alone Does Not Guarantee The File Is Readable
    my $fileHandle = undef;

    if( !open( $fileHandle, '<:encoding(UTF-8)', "$fileDir" ) )
    {
        $self->WriteLog( "ReadCompoundWordDataFromFile - Error: Unable To Open File \"$fileDir\" - $!" );
        return -1;
    }

    # Prepare Max Compound Word Length
    $self->SetMaxCompoundWordLength( 0 ) if defined ( $autoSetMaxCompoundWordLength );

    while( my $row = <$fileHandle> )
    {
        chomp( $row );
        $row = $self->RemoveSpecialCharactersFromString( $row );
        push( @dataAry, $row );

        # Find Max Compound Word Length
        my @words = split( ' ', $row );
        my $size  = @words;
        $self->SetMaxCompoundWordLength( $size ) if defined( $autoSetMaxCompoundWordLength ) && ( $self->GetMaxCompoundWordLength() < $size );
    }

    close( $fileHandle );

    $self->WriteLog( "ReadCompoundWordDataFromFile - Error: Compound Word Length > 100" ) if ( $self->GetMaxCompoundWordLength() > 100 );
    return -1 if ( $self->GetMaxCompoundWordLength() > 100 );

    $self->WriteLog( "ReadCompoundWordDataFromFile - Auto Set Max Compound Word Length To \"" . $self->GetMaxCompoundWordLength() . "\"") if defined ( $autoSetMaxCompoundWordLength );
    $self->WriteLog( "ReadCompoundWordDataFromFile - Reading Complete" );
    $self->WriteLog( "ReadCompoundWordDataFromFile - Sorting Compound Word List" );

    @dataAry = sort( @dataAry );
    $self->SetCompoundWordAry( \@dataAry );

    $self->WriteLog( "ReadCompoundWordDataFromFile - Stored " . @dataAry . " Compound Words In Memory" ) if ( @dataAry > 0 );
    $self->WriteLog( "ReadCompoundWordDataFromFile - Detected Compound Word Array Data / Auto-Setting Compoundify Text = 1" ) if @dataAry > 0;
    $self->SetCompoundifyText( 1 ) if ( @dataAry > 0 );

    $self->WriteLog( "ReadCompoundwordDataFromFile - No Compound Word Array Data Detected / Auto-Setting Compoundify Text = 0" ) if @dataAry == 0;
    $self->SetCompoundifyText( 0 ) if ( @dataAry == 0 );

    $self->WriteLog( "ReadCompoundWordDataFromFile - Sorting Complete" );

    return 0;
}
1057
|
|
|
|
|
|
|
|
1058
|
|
|
|
|
|
|
# Writes the compound word array held by the object to a UTF-8 text file,
# one compound word per line. An existing file is overwritten.
#
# Parameters : $savePath - Destination File Path
# Returns    : 0 On Success, -1 When No Path Is Given Or The File Cannot Be
#              Opened / Completely Written
sub SaveCompoundWordListToFile
{
    my ( $self, $savePath ) = @_;

    $self->WriteLog( "SaveCompoundWordListToFile - Error: Save Path Not Specified" ) if !defined( $savePath );
    return -1 if !defined( $savePath );

    $self->WriteLog( "SaveCompoundWordListToFile - Saving Compound Word List To \"$savePath\"" );

    # Create File Handle - Report Failure Instead Of Silently Writing Nothing
    my $fileHandle = undef;

    if( !open( $fileHandle, '>:encoding(UTF-8)', "$savePath" ) )
    {
        $self->WriteLog( "SaveCompoundWordListToFile - Error: Unable To Open File \"$savePath\" - $!" );
        return -1;
    }

    # Write Data To File
    for my $compoundWord ( $self->GetCompoundWordAry() )
    {
        print( $fileHandle "$compoundWord\n" );
    }

    # Buffered Write Errors Only Surface When The Handle Is Closed
    if( !close( $fileHandle ) )
    {
        $self->WriteLog( "SaveCompoundWordListToFile - Error: Unable To Finish Writing File \"$savePath\" - $!" );
        return -1;
    }

    $self->WriteLog( "SaveCompoundWordListToFile - Compound Word List Saved To \"$savePath\"" );

    return 0;
}
1083
|
|
|
|
|
|
|
|
1084
|
|
|
|
|
|
|
# Reads a UTF-8 text file into a single string. Line endings are removed
# and every line is prefixed with one space, so "a\nb\n" becomes " a b".
#
# Parameters : $fileDir - Path Of The File To Read
# Returns    : File Content As One String, Or "(null)" When The Path Is
#              Undefined, Does Not Exist Or Cannot Be Opened
sub ReadTextFromFile
{
    my ( $self, $fileDir ) = @_;

    $self->WriteLog( "ReadTextFromFile - Error: Directory Not Defined" ) if !defined ( $fileDir );
    return "(null)" if !defined ( $fileDir );

    $self->WriteLog( "ReadTextFromFile - Error: Directory/File Does Not Exist" ) if !( -e "$fileDir" );
    return "(null)" if !( -e "$fileDir" );

    my $str = "";

    # Read Data From File To Memory - Existence Alone Does Not Guarantee The File Is Readable
    my $fileHandle = undef;

    if( !open( $fileHandle, '<:encoding(UTF-8)', "$fileDir" ) )
    {
        $self->WriteLog( "ReadTextFromFile - Error: Unable To Open File \"$fileDir\" - $!" );
        return "(null)";
    }

    while( my $row = <$fileHandle> )
    {
        chomp $row;
        $str .= " $row";
    }

    close( $fileHandle );

    $self->WriteLog( "ReadTextFromFile - Reading Complete" );

    return $str;
}
1111
|
|
|
|
|
|
|
|
1112
|
|
|
|
|
|
|
# Writes a string to a UTF-8 text file, overwriting any existing file.
#
# Parameters : $savePath - Destination File Path
#              $str      - Text To Write ( Undefined Is Written As An Empty String )
# Returns    : 0 On Success, -1 When No Path Is Given Or The File Cannot Be
#              Opened / Completely Written
sub SaveTextToFile
{
    my ( $self, $savePath, $str ) = @_;

    $self->WriteLog( "SaveTextToFile - Error: No Save Path Specified" ) if !defined( $savePath );
    return -1 if !defined( $savePath );

    $self->WriteLog( "SaveTextToFile - Saving Data To \"$savePath\"" );

    $str = "" if !defined( $str );

    # Create File Handle ( Always Overwrites ) - Report Failure Instead Of Silently Writing Nothing
    my $fileHandle = undef;

    if( !open( $fileHandle, '>:encoding(UTF-8)', "$savePath" ) )
    {
        $self->WriteLog( "SaveTextToFile - Error: Unable To Open File \"$savePath\" - $!" );
        return -1;
    }

    # Write Data To File
    print( $fileHandle "$str" );

    # Buffered Write Errors Only Surface When The Handle Is Closed
    if( !close( $fileHandle ) )
    {
        $self->WriteLog( "SaveTextToFile - Error: Unable To Finish Writing File \"$savePath\" - $!" );
        return -1;
    }

    $self->WriteLog( "SaveTextToFile - File Saved To \"$savePath\"" );

    return 0;
}
1137
|
|
|
|
|
|
|
|
1138
|
|
|
|
|
|
|
# Loads the content of an XML file into memory. Paths containing ".gz" are
# decompressed in memory with IO::Uncompress::Gunzip; any other file is read
# as UTF-8 text line by line, with every line terminated by "\n".
#
# Parameters : $fileDir - Path Of The ( Optionally GZip Compressed ) XML File
# Returns    : File Content As One String, Or "(null)" When The Path Is
#              Undefined, Does Not Exist Or Cannot Be Opened
# Dies       : "gunzip failed\n" When Decompression Fails
sub _ReadXMLDataFromFile
{
    my ( $self, $fileDir ) = @_;

    $self->WriteLog( "_ReadXMLDataFromFile - Error: Directory Not Defined" ) if !defined ( $fileDir );
    return "(null)" if !defined ( $fileDir );

    $self->WriteLog( "_ReadXMLDataFromFile - Error: Directory/File Does Not Exist" ) if !( -e "$fileDir" );
    return "(null)" if !( -e "$fileDir" );

    my $data = "";

    # Extract XML File From GZip To Memory
    if ( index( $fileDir, ".gz" ) != -1 )
    {
        IO::Uncompress::Gunzip::gunzip( "$fileDir" => \$data ) or die "gunzip failed\n";
    }
    # Read XML Data From File To Memory
    else
    {
        # Existence Alone Does Not Guarantee The File Is Readable
        my $fileHandle = undef;

        if( !open( $fileHandle, '<:encoding(UTF-8)', "$fileDir" ) )
        {
            $self->WriteLog( "_ReadXMLDataFromFile - Error: Unable To Open File \"$fileDir\" - $!" );
            return "(null)";
        }

        while( my $row = <$fileHandle> )
        {
            chomp $row;
            $data .= "$row\n";
        }

        close( $fileHandle );
    }

    $self->WriteLog( "_ReadXMLDataFromFile - Reading Data Complete/Data Stored" );

    return $data;
}
1173
|
|
|
|
|
|
|
|
1174
|
|
|
|
|
|
|
# Writes the object's text corpus string to a UTF-8 file after collapsing
# runs of spaces to a single space. The whole operation runs while holding
# the file-level $writeLock ( declared outside this sub ) so that only one
# thread writes at a time.
#
# Parameters : $savePath     - Destination File Path
#              $appendToFile - 0 = Overwrite The File, 1 = Append To The File; When Undefined
#                              The Value Of GetOverwriteExistingFile() Is Used
#                              NOTE(review): the getter's name suggests the opposite meaning
#                              ( overwrite = 1 ) - confirm the intended mapping against callers
# Returns    : 1 On Success ( Note: Not 0 Like The Other Save Routines ), -1 When No
#              Path Is Given, $appendToFile Is Neither 0 Nor 1, Or The File Cannot Be
#              Opened / Completely Written
sub _SaveTextCorpusToFile
{
    my ( $self, $savePath, $appendToFile ) = @_;

    # Prevent Other Threads From Writing At The Same Time
    {
        lock( $writeLock );

        $self->WriteLog( "_SaveTextCorpusToFile - Error: No Save Path Specified" ) if !defined( $savePath );
        return -1 if !defined( $savePath );

        $appendToFile = $self->GetOverwriteExistingFile() if !defined ( $appendToFile );

        # Check(s) - Any Other Value Would Leave The File Handle Unopened
        if( !defined( $appendToFile ) || ( $appendToFile != 0 && $appendToFile != 1 ) )
        {
            $self->WriteLog( "_SaveTextCorpusToFile - Error: Append Option Must Be 0 Or 1" );
            return -1;
        }

        $self->WriteLog( "_SaveTextCorpusToFile - Saving Text Corpus To \"$savePath\"" );

        # Over write file if $appendToFile == 0 / Append to file if $appendToFile == 1
        my $openMode   = ( $appendToFile == 1 ) ? '>>:encoding(UTF-8)' : '>:encoding(UTF-8)';
        my $fileHandle = undef;

        if( !open( $fileHandle, $openMode, "$savePath" ) )
        {
            $self->WriteLog( "_SaveTextCorpusToFile - Error: Unable To Open File \"$savePath\" - $!" );
            return -1;
        }

        # Write Data To File
        my $str = $self->GetTextCorpusStr();

        # Remove Extra Spaces In Text Corpus String
        $str =~ s/ +/ /g;

        print( $fileHandle $str );

        # Buffered Write Errors Only Surface When The Handle Is Closed
        if( !close( $fileHandle ) )
        {
            $self->WriteLog( "_SaveTextCorpusToFile - Error: Unable To Finish Writing File \"$savePath\" - $!" );
            return -1;
        }

        $self->WriteLog( "_SaveTextCorpusToFile - Text Corpus Saved To \"$savePath\"" );
    }

    return 1;
}
1214
|
|
|
|
|
|
|
|
1215
|
|
|
|
|
|
|
# Tests whether a date lies inside an inclusive begin/end date range. All
# three dates use the "Month/Day/Year" format and are compared numerically,
# year first, then month, then day.
#
# Parameters : $date      - Date To Test
#              $beginDate - Range Start ( Defaults To GetBeginDate() When Undefined )
#              $endDate   - Range End   ( Defaults To GetEndDate() When Undefined )
# Returns    : 1 When $date Is Inside The Range, 0 When It Is Outside, Undefined,
#              Malformed ( Not Exactly Three '/' Separated Parts ) Or Any Component Is Negative
sub IsDateInSpecifiedRange
{
    my ( $self, $date, $beginDate, $endDate ) = @_;

    if( !defined ( $date ) )
    {
        $self->WriteLog( "Error: Date Not Specified To Check Against Date Range" );
        return 0;
    }

    # Fall Back To The Object's Configured Range For Missing Bounds ( Warn First, Then Assign )
    my $useDefaultBegin = !defined ( $beginDate );
    my $useDefaultEnd   = !defined ( $endDate );

    $self->WriteLog( "Warning - BeginDate Parameter Not Specified - Using Default Value: " . $self->GetBeginDate() ) if $useDefaultBegin;
    $self->WriteLog( "Warning - EndDate Parameter Not Specified - Using Default Value: " . $self->GetEndDate() ) if $useDefaultEnd;
    $beginDate = $self->GetBeginDate() if $useDefaultBegin;
    $endDate   = $self->GetEndDate()   if $useDefaultEnd;

    # Split Everything Up Front, Then Validate In Order: Date, Begin Date, End Date
    my @candidates = (
        [ $date,      [ split( '/', $date ) ] ],
        [ $beginDate, [ split( '/', $beginDate ) ] ],
        [ $endDate,   [ split( '/', $endDate ) ] ],
    );

    # Check(s)
    for my $candidate ( @candidates )
    {
        my ( $rawDate, $parts ) = @{ $candidate };
        next if @{ $parts } == 3;

        $self->WriteLog( "Invalid Date Format - Requested Format: Month/Day/Year : Specified Format - $rawDate" );
        return 0;
    }

    my ( $dateMonth,  $dateDay,  $dateYear )  = @{ $candidates[0][1] };
    my ( $beginMonth, $beginDay, $beginYear ) = @{ $candidates[1][1] };
    my ( $endMonth,   $endDay,   $endYear )   = @{ $candidates[2][1] };

    # Check(s) - No Negative Components Allowed
    for my $component ( $dateYear,  $beginYear,  $endYear,
                        $dateMonth, $beginMonth, $endMonth,
                        $dateDay,   $beginDay,   $endDay )
    {
        return 0 if $component < 0;
    }

    # Date Comparison - Year, Then Month, Then Day
    my $isBeforeBegin = $dateYear < $beginYear
                     || ( $dateYear == $beginYear && $dateMonth < $beginMonth )
                     || ( $dateYear == $beginYear && $dateMonth == $beginMonth && $dateDay < $beginDay );

    my $isAfterEnd    = $dateYear > $endYear
                     || ( $dateYear == $endYear && $dateMonth > $endMonth )
                     || ( $dateYear == $endYear && $dateMonth == $endMonth && $dateDay > $endDay );

    return 0 if $isBeforeBegin || $isAfterEnd;

    return 1;
}
1273
|
|
|
|
|
|
|
|
1274
|
|
|
|
|
|
|
# Classifies a file system path.
#
# Parameters : $path - Path To Inspect
# Returns    : "file" For A Plain File, "dir" For A Directory, "unknown" When The
#              Path Is Undefined, Does Not Exist, Or Exists But Is Neither
#              ( e.g. A Device Or Socket )
sub IsFileOrDirectory
{
    my ( $self, $path ) = @_;

    # Check(s)
    return "unknown" if !defined( $path );
    return "unknown" if !( -e $path );

    return "file" if ( -f $path );
    return "dir"  if ( -d $path );

    # Exists But Is Neither A Plain File Nor A Directory - Return An Explicit
    # Value Instead Of Falling Off The End Of The Sub
    return "unknown";
}
1285
|
|
|
|
|
|
|
|
1286
|
|
|
|
|
|
|
# Normalizes a string for use in the text corpus: lower-cases it, removes
# "'s", turns hyphens into spaces, replaces every character other than
# 'a'-'z', carriage return and line feed with a ( squeezed ) space and
# rewrites line endings to the form used by the operating system reported by
# GetOSType(). When sentence-per-line storage is enabled, periods become new
# lines and only leading white space is trimmed; when it is disabled, white
# space is trimmed at both ends and collapsed in between.
#
# Parameters : $str - String To Clean ( Undefined Is Treated As An Empty String )
# Returns    : Cleaned String
sub RemoveSpecialCharactersFromString
{
    my ( $self, $str ) = @_;

    $str = "" if !defined( $str );

    my $sentencePerLine = $self->GetStoreAsSentencePerLine();

    $str = lc( $str );                                  # Convert all characters to lowercase
    $str =~ s/ +/ /g;                                   # Remove duplicate white spaces between words
    $str =~ s/'s//g;                                    # Remove "'s" characters (Apostrophe 's')
    $str =~ s/-/ /g;                                    # Replace all hyphen characters to spaces
    $str =~ s/\./\n/g if ( $sentencePerLine == 1 );     # Convert Period To New Line Character
    $str =~ tr/a-z\015\012/ /cs;                        # Remove all characters except 'a' to 'z' and new-line characters

    # Convert String Line Ending Suitable To The Target.
    # Default To Line Feed: With An Empty Replacement Every New Line Would Be
    # Deleted On Any Other OS ( e.g. "darwin" ), Fusing Adjacent Words And Sentences.
    my $lineEnding = "\012";
    my $os         = $self->GetOSType();

    if( defined( $os ) )
    {
        $lineEnding = "\015\012" if ( $os eq "MSWin32" );
        $lineEnding = "\012"     if ( $os eq "linux" );
        $lineEnding = "\015"     if ( $os eq "MacOS" );
    }

    $str =~ s/(\015\012|\012|\015)/$lineEnding/g;

    # Removes Spaces At Left Side Of String
    $str =~ s/^\s+// if ( $sentencePerLine == 1 );

    # Removes Spaces At Both Ends Of String And More Than Once Space In-Between Ends
    $str =~ s/^\s+|\s(?=\s)|\s+$//g if ( $sentencePerLine == 0 );

    return $str;
}
1315
|
|
|
|
|
|
|
|
1316
|
|
|
|
|
|
|
# Determines the type of a file by handing its path to a freshly created
# File::Type object.
#
# Parameters : $filePath - Path Of The File To Inspect
# Returns    : Whatever File::Type's checktype_filename() Reports For The Path
sub GetFileType
{
    my ( $self, $filePath ) = @_;

    # The Detector Object Is Temporary And Released As Soon As The Lookup Finishes
    return scalar( File::Type->new()->checktype_filename( $filePath ) );
}
1326
|
|
|
|
|
|
|
|
1327
|
|
|
|
|
|
|
# Validates the configured begin/end date range.
#
# The dates are read through GetBeginDate()/GetEndDate() and may be written
# as "Month/Day/Year" or "Month-Day-Year". Dash-delimited dates are rewritten
# in slash-delimited form through SetBeginDate()/SetEndDate().
#
# Output: 0 when both dates are well formed and the begin date does not come
#         after the end date, -1 otherwise (the reason is sent to WriteLog()).
sub _DateCheck
{
    my ( $self ) = @_;

    my $beginDate = $self->GetBeginDate();
    my $endDate   = $self->GetEndDate();

    # Check(s)
    if( !defined( $beginDate ) )
    {
        $self->WriteLog( "_DateCheck - Error: Begin Date Not Defined" );
        return -1;
    }

    if( !defined( $endDate ) )
    {
        $self->WriteLog( "_DateCheck - Error: End Date Not Defined" );
        return -1;
    }

    # Parse Begin Date ("/" takes precedence when both delimiters are present)
    my $delimiter = "";
    $delimiter = "-" if index( $beginDate, "-" ) != -1;
    $delimiter = "/" if index( $beginDate, "/" ) != -1;

    if( $delimiter eq "" )
    {
        $self->WriteLog( "_DateCheck - Error: Begin Date Improper Format" );
        return -1;
    }

    my @bDateAry = split( $delimiter, $beginDate );

    # Check For Default Begin Date And Adjust Accordingly
    # (the field count is tested first so missing fields are never compared)
    if( @bDateAry == 3 && $bDateAry[0] == 0 && $bDateAry[1] == 0 && $bDateAry[2] == 0 )
    {
        @bDateAry = ( 1, 1, 0 );
    }

    # Set Date In Proper Format
    if( $delimiter eq "-" )
    {
        $beginDate = join( '/', @bDateAry );
        $self->SetBeginDate( $beginDate );
    }

    # Parse End Date
    $delimiter = "";
    $delimiter = "-" if index( $endDate, "-" ) != -1;
    $delimiter = "/" if index( $endDate, "/" ) != -1;

    if( $delimiter eq "" )
    {
        $self->WriteLog( "_DateCheck - Error: End Date Improper Format" );
        return -1;
    }

    my @eDateAry = split( $delimiter, $endDate );

    # Check For Default End Date And Adjust Accordingly
    if( @eDateAry == 3 && $eDateAry[0] == 99 && $eDateAry[1] == 99 && $eDateAry[2] == 9999 )
    {
        @eDateAry = ( 12, 31, 9999 );
    }

    # Set Date In Proper Format
    if( $delimiter eq "-" )
    {
        $endDate = join( '/', @eDateAry );
        $self->SetEndDate( $endDate );
    }

    # Basic Checks
    $self->WriteLog( "_DateCheck - Error: Begin Date Not Specified In \"Month/Day/Year\" or \"Month-Day-Year\" Format" ) if ( @bDateAry != 3 );
    $self->WriteLog( "_DateCheck - Error: End Date Not Specified In \"Month/Day/Year\" or \"Month-Day-Year\" Format" ) if ( @eDateAry != 3 );
    return -1 if ( @bDateAry != 3 ) || ( @eDateAry != 3 );

    # Range check per field: [ array index, field label, minimum, maximum ]
    my @fieldRanges = ( [ 0, "Month", 1, 12 ], [ 1, "Day", 1, 31 ], [ 2, "Year", 0, 9999 ] );

    for my $fieldRange ( @fieldRanges )
    {
        my ( $index, $label, $min, $max ) = @{ $fieldRange };

        my $beginBad = ( $bDateAry[$index] < $min || $bDateAry[$index] > $max );
        my $endBad   = ( $eDateAry[$index] < $min || $eDateAry[$index] > $max );

        $self->WriteLog( "_DateCheck - Error: Incorrect Begin Date ${label} Value - Expected Value: ${min}-${max} / Specified Value: " . $bDateAry[$index] ) if ( $beginBad );
        $self->WriteLog( "_DateCheck - Error: Incorrect End Date ${label} Value - Expected Value: ${min}-${max} / Specified Value: " . $eDateAry[$index] ) if ( $endBad );
        return -1 if ( $beginBad || $endBad );
    }

    # Advanced Checks (begin date must not come after end date)
    if( $bDateAry[2] > $eDateAry[2] )
    {
        $self->WriteLog( "_DateCheck - Error: Begin Date Year > End Date Year" );
        return -1;
    }

    if( $bDateAry[2] == $eDateAry[2] && $bDateAry[0] > $eDateAry[0] )
    {
        $self->WriteLog( "_DateCheck - Error: Years Equal, Begin Date Month > End Date Month" );
        return -1;
    }

    if( $bDateAry[2] == $eDateAry[2] && $bDateAry[0] == $eDateAry[0] && $bDateAry[1] > $eDateAry[1] )
    {
        $self->WriteLog( "_DateCheck - Error: Years And Months Equal, Begin Date Day > End Date Day" );
        return -1;
    }

    return 0;
}
1421
|
|
|
|
|
|
|
|
1422
|
|
|
|
|
|
|
# Returns Perl's operating system identifier ($^O) for the running interpreter.
sub GetOSType
{
    my $self   = shift;
    my $osName = $^O;

    return $osName;
}
1427
|
|
|
|
|
|
|
|
1428
|
|
|
|
|
|
|
|
1429
|
|
|
|
|
|
|
###################################################################################### |
1430
|
|
|
|
|
|
|
# Accessors |
1431
|
|
|
|
|
|
|
###################################################################################### |
1432
|
|
|
|
|
|
|
|
1433
|
|
|
|
|
|
|
# Returns the console debug-output flag, storing the default (0) when unset.
sub GetDebugLog
{
    my $self = shift;

    unless( defined( $self->{ _debugLog } ) )
    {
        $self->{ _debugLog } = 0;
    }

    return $self->{ _debugLog };
}
1439
|
|
|
|
|
|
|
|
1440
|
|
|
|
|
|
|
# Returns the log-file output flag, storing the default (0) when unset.
sub GetWriteLog
{
    my $self = shift;

    unless( defined( $self->{ _writeLog } ) )
    {
        $self->{ _writeLog } = 0;
    }

    return $self->{ _writeLog };
}
1446
|
|
|
|
|
|
|
|
1447
|
|
|
|
|
|
|
# Returns the "store article titles" flag, storing the default (1) when unset.
sub GetStoreTitle
{
    my $self = shift;

    if( !defined( $self->{ _storeTitle } ) )
    {
        $self->{ _storeTitle } = 1;
    }

    return $self->{ _storeTitle };
}
1453
|
|
|
|
|
|
|
|
1454
|
|
|
|
|
|
|
# Returns the "store article abstracts" flag, storing the default (1) when unset.
sub GetStoreAbstract
{
    my $self = shift;

    if( !defined( $self->{ _storeAbstract } ) )
    {
        $self->{ _storeAbstract } = 1;
    }

    return $self->{ _storeAbstract };
}
1460
|
|
|
|
|
|
|
|
1461
|
|
|
|
|
|
|
# Returns the quick-parse flag, storing the default (0) when unset.
sub GetQuickParse
{
    my $self = shift;

    unless( defined( $self->{ _quickParse } ) )
    {
        $self->{ _quickParse } = 0;
    }

    return $self->{ _quickParse };
}
1467
|
|
|
|
|
|
|
|
1468
|
|
|
|
|
|
|
# Returns the compoundify-text flag, storing the default (0) when unset.
sub GetCompoundifyText
{
    my $self = shift;

    unless( defined( $self->{ _compoundifyText } ) )
    {
        $self->{ _compoundifyText } = 0;
    }

    return $self->{ _compoundifyText };
}
1474
|
|
|
|
|
|
|
|
1475
|
|
|
|
|
|
|
# Returns the sentence-per-line flag, storing the default (0) when unset.
sub GetStoreAsSentencePerLine
{
    my $self = shift;

    if( !defined( $self->{ _storeAsSentencePerLine } ) )
    {
        $self->{ _storeAsSentencePerLine } = 0;
    }

    return $self->{ _storeAsSentencePerLine };
}
1481
|
|
|
|
|
|
|
|
1482
|
|
|
|
|
|
|
# Returns the configured thread count. When none is stored yet the value
# reported by Sys::CpuAffinity::getNumCpus() is stored and returned.
sub GetNumOfThreads
{
    my $self = shift;

    unless( defined( $self->{ _numOfThreads } ) )
    {
        $self->{ _numOfThreads } = Sys::CpuAffinity::getNumCpus();
    }

    return $self->{ _numOfThreads };
}
1488
|
|
|
|
|
|
|
|
1489
|
|
|
|
|
|
|
# Returns the working directory, storing Cwd::getcwd() as the value when unset.
sub GetWorkingDir
{
    my $self = shift;

    unless( defined( $self->{ _workingDir } ) )
    {
        $self->{ _workingDir } = Cwd::getcwd();
    }

    return $self->{ _workingDir };
}
1495
|
|
|
|
|
|
|
|
1496
|
|
|
|
|
|
|
# Returns the save path, storing the placeholder "(null)" when unset.
sub GetSavePath
{
    my $self = shift;

    if( !defined( $self->{ _savePath } ) )
    {
        $self->{ _savePath } = "(null)";
    }

    return $self->{ _savePath };
}
1502
|
|
|
|
|
|
|
|
1503
|
|
|
|
|
|
|
# Returns the begin date, storing the default "00/00/0000" when unset.
sub GetBeginDate
{
    my $self = shift;

    if( !defined( $self->{ _beginDate } ) )
    {
        $self->{ _beginDate } = "00/00/0000";
    }

    return $self->{ _beginDate };
}
1509
|
|
|
|
|
|
|
|
1510
|
|
|
|
|
|
|
# Returns the end date, storing the default "99/99/9999" when unset.
sub GetEndDate
{
    my $self = shift;

    if( !defined( $self->{ _endDate } ) )
    {
        $self->{ _endDate } = "99/99/9999";
    }

    return $self->{ _endDate };
}
1516
|
|
|
|
|
|
|
|
1517
|
|
|
|
|
|
|
# Returns the XML string held for parsing, storing the placeholder "(null)" when unset.
sub GetXMLStringToParse
{
    my $self = shift;

    unless( defined( $self->{ _xmlStringToParse } ) )
    {
        $self->{ _xmlStringToParse } = "(null)";
    }

    return $self->{ _xmlStringToParse };
}
1523
|
|
|
|
|
|
|
|
1524
|
|
|
|
|
|
|
# Returns the in-memory text corpus string, storing "" when unset.
sub GetTextCorpusStr
{
    my $self = shift;

    unless( defined( $self->{ _textCorpusStr } ) )
    {
        $self->{ _textCorpusStr } = "";
    }

    return $self->{ _textCorpusStr };
}
1530
|
|
|
|
|
|
|
|
1531
|
|
|
|
|
|
|
# Returns the stored log file handle, or undef when none is set. The hash
# slot is created (holding undef) if it did not exist yet.
sub GetFileHandle
{
    my $self = shift;

    unless( defined( $self->{ _fileHandle } ) )
    {
        $self->{ _fileHandle } = undef;
    }

    return $self->{ _fileHandle };
}
1537
|
|
|
|
|
|
|
|
1538
|
|
|
|
|
|
|
# Returns the stored twig handler, storing the placeholder "(null)" when unset.
sub GetTwigHandler
{
    my $self = shift;

    if( !defined( $self->{ _twigHandler } ) )
    {
        $self->{ _twigHandler } = "(null)";
    }

    return $self->{ _twigHandler };
}
1544
|
|
|
|
|
|
|
|
1545
|
|
|
|
|
|
|
# Returns the parsed-article counter, storing the default (0) when unset.
sub GetParsedCount
{
    my $self = shift;

    unless( defined( $self->{ _parsedCount } ) )
    {
        $self->{ _parsedCount } = 0;
    }

    return $self->{ _parsedCount };
}
1551
|
|
|
|
|
|
|
|
1552
|
|
|
|
|
|
|
# Returns the temporary article text buffer, storing "" when unset.
sub GetTempStr
{
    my $self = shift;

    unless( defined( $self->{ _tempStr } ) )
    {
        $self->{ _tempStr } = "";
    }

    return $self->{ _tempStr };
}
1558
|
|
|
|
|
|
|
|
1559
|
|
|
|
|
|
|
# Returns the temporary article date buffer, storing "" when unset.
sub GetTempDate
{
    my $self = shift;

    unless( defined( $self->{ _tempDate } ) )
    {
        $self->{ _tempDate } = "";
    }

    return $self->{ _tempDate };
}
1565
|
|
|
|
|
|
|
|
1566
|
|
|
|
|
|
|
# Returns the stored compound words as a list (element count in scalar context).
#
# When nothing is stored yet an empty array reference is created first.
# Assigning "()" to the slot would leave it undefined, and dereferencing an
# undefined value as an array is fatal under "use strict".
sub GetCompoundWordAry
{
    my ( $self ) = @_;
    $self->{ _compoundWordAry } = [] if !defined ( $self->{ _compoundWordAry } );
    return @{ $self->{ _compoundWordAry } };
}
1572
|
|
|
|
|
|
|
|
1573
|
|
|
|
|
|
|
# Returns the compound word search tree, creating a new Word2vec::Bst
# object and storing it when none is set.
sub GetCompoundWordBST
{
    my $self = shift;

    unless( defined( $self->{ _compoundWordBST } ) )
    {
        $self->{ _compoundWordBST } = Word2vec::Bst->new();
    }

    return $self->{ _compoundWordBST };
}
1579
|
|
|
|
|
|
|
|
1580
|
|
|
|
|
|
|
# Returns the maximum compound word length, storing the default (20) when unset.
sub GetMaxCompoundWordLength
{
    my $self = shift;

    if( !defined( $self->{ _maxCompoundWordLength } ) )
    {
        $self->{ _maxCompoundWordLength } = 20;
    }

    return $self->{ _maxCompoundWordLength };
}
1586
|
|
|
|
|
|
|
|
1587
|
|
|
|
|
|
|
# Returns the overwrite-existing-file flag, storing the default (0) when unset.
sub GetOverwriteExistingFile
{
    my $self = shift;

    unless( defined( $self->{ _overwriteExistingFile } ) )
    {
        $self->{ _overwriteExistingFile } = 0;
    }

    return $self->{ _overwriteExistingFile };
}
1593
|
|
|
|
|
|
|
|
1594
|
|
|
|
|
|
|
|
1595
|
|
|
|
|
|
|
###################################################################################### |
1596
|
|
|
|
|
|
|
# Mutators |
1597
|
|
|
|
|
|
|
###################################################################################### |
1598
|
|
|
|
|
|
|
|
1599
|
|
|
|
|
|
|
# Stores the "store article titles" flag and returns the stored value.
sub SetStoreTitle
{
    my $self  = shift;
    my $value = shift;

    $self->{ _storeTitle } = $value;

    return $self->{ _storeTitle };
}
1604
|
|
|
|
|
|
|
|
1605
|
|
|
|
|
|
|
# Stores the "store article abstracts" flag and returns the stored value.
sub SetStoreAbstract
{
    my $self  = shift;
    my $value = shift;

    $self->{ _storeAbstract } = $value;

    return $self->{ _storeAbstract };
}
1610
|
|
|
|
|
|
|
|
1611
|
|
|
|
|
|
|
# Stores the working directory and returns the stored value.
sub SetWorkingDir
{
    my $self       = shift;
    my $workingDir = shift;

    $self->{ _workingDir } = $workingDir;

    return $self->{ _workingDir };
}
1616
|
|
|
|
|
|
|
|
1617
|
|
|
|
|
|
|
# Stores the text corpus save path and returns the stored value.
sub SetSavePath
{
    my $self     = shift;
    my $savePath = shift;

    $self->{ _savePath } = $savePath;

    return $self->{ _savePath };
}
1622
|
|
|
|
|
|
|
|
1623
|
|
|
|
|
|
|
# Stores the quick-parse flag and returns the stored value.
sub SetQuickParse
{
    my $self  = shift;
    my $value = shift;

    $self->{ _quickParse } = $value;

    return $self->{ _quickParse };
}
1628
|
|
|
|
|
|
|
|
1629
|
|
|
|
|
|
|
# Stores the compoundify-text flag and returns the stored value.
sub SetCompoundifyText
{
    my $self  = shift;
    my $value = shift;

    $self->{ _compoundifyText } = $value;

    return $self->{ _compoundifyText };
}
1634
|
|
|
|
|
|
|
|
1635
|
|
|
|
|
|
|
# Stores the sentence-per-line flag and returns the stored value.
sub SetStoreAsSentencePerLine
{
    my $self  = shift;
    my $value = shift;

    $self->{ _storeAsSentencePerLine } = $value;

    return $self->{ _storeAsSentencePerLine };
}
1640
|
|
|
|
|
|
|
|
1641
|
|
|
|
|
|
|
# Stores the worker thread count and returns the stored value. A negative
# value is reported through WriteLog() and replaced by the result of
# Sys::CpuAffinity::getNumCpus().
sub SetNumOfThreads
{
    my ( $self, $value ) = @_;

    # Check
    if( $value < 0 )
    {
        $self->WriteLog( "SetNumOfThreads - Warning: Number Of Threads Value < 0 / Setting Default Value" );
        $value = Sys::CpuAffinity::getNumCpus();
    }

    $self->{ _numOfThreads } = $value;

    return $self->{ _numOfThreads };
}
1651
|
|
|
|
|
|
|
|
1652
|
|
|
|
|
|
|
# Stores the begin date string and returns the stored value.
sub SetBeginDate
{
    my $self = shift;
    my $date = shift;

    $self->{ _beginDate } = $date;

    return $self->{ _beginDate };
}
1657
|
|
|
|
|
|
|
|
1658
|
|
|
|
|
|
|
# Stores the end date string and returns the stored value.
sub SetEndDate
{
    my $self = shift;
    my $date = shift;

    $self->{ _endDate } = $date;

    return $self->{ _endDate };
}
1663
|
|
|
|
|
|
|
|
1664
|
|
|
|
|
|
|
# Stores the XML string to parse and returns the stored value.
sub SetXMLStringToParse
{
    my $self      = shift;
    my $xmlString = shift;

    $self->{ _xmlStringToParse } = $xmlString;

    return $self->{ _xmlStringToParse };
}
1669
|
|
|
|
|
|
|
|
1670
|
|
|
|
|
|
|
# Replaces the in-memory text corpus string and returns the stored value.
sub SetTextCorpusStr
{
    my $self   = shift;
    my $corpus = shift;

    $self->{ _textCorpusStr } = $corpus;

    return $self->{ _textCorpusStr };
}
1675
|
|
|
|
|
|
|
|
1676
|
|
|
|
|
|
|
# Appends $str to the in-memory text corpus while holding $appendLock.
#
# Input : $str -> Text to append. Undefined or empty strings are ignored.
#
# The text is whitespace-normalized according to GetStoreAsSentencePerLine()
# and is followed by a single space unless it already ends in a new line.
sub AppendStrToTextCorpus
{
    my ( $self, $str ) = @_;

    # Test definedness first so an undefined argument never reaches "eq"
    return if ( !defined( $str ) || $str eq "" );

    # Prevent Other Threads From Appending Data At The Same Time
    {
        lock( $appendLock );

        # Removes Spaces At Left Side Of String
        $str =~ s/^\s+// if ( $self->GetStoreAsSentencePerLine() == 1 );

        # Removes Spaces At Both Ends Of String And More Than Once Space In-Between Ends
        $str =~ s/^\s+|\s(?=\s)|\s+$//g if ( $self->GetStoreAsSentencePerLine() == 0 );

        # Append string to text corpus
        if( substr( $str, -1 ) eq "\n" )
        {
            $self->{ _textCorpusStr } .= "$str" ;
        }
        else
        {
            $self->{ _textCorpusStr } .= "$str ";
        }
    }
}
1703
|
|
|
|
|
|
|
|
1704
|
|
|
|
|
|
|
# Resets the in-memory text corpus to the empty string and returns it.
sub ClearTextCorpusStr
{
    my $self = shift;

    $self->{ _textCorpusStr } = "";

    return $self->{ _textCorpusStr };
}
1709
|
|
|
|
|
|
|
|
1710
|
|
|
|
|
|
|
# Cleans $str and stores it as the temporary article text, returning the
# stored value.
#
# The text goes through RemoveSpecialCharactersFromString() and then through
# Text::Unidecode::unidecode().
# NOTE(review): the first call already replaces every character outside
# a-z/CR/LF with a space, so unidecode() presumably has nothing left to
# transliterate - confirm whether the two calls were meant to run in the
# opposite order.
sub SetTempStr
{
    my ( $self, $str ) = @_;

    my $cleaned = $self->RemoveSpecialCharactersFromString( $str );
    $cleaned    = Text::Unidecode::unidecode( $cleaned );

    $self->{ _tempStr } = $cleaned;

    return $self->{ _tempStr };
}
1720
|
|
|
|
|
|
|
|
1721
|
|
|
|
|
|
|
# Cleans $str, adds its word count to $preCompWordCount and appends it to the
# temporary article text. Returns the updated temporary string.
#
# A single space is appended after the text unless it ends in a new line.
sub AppendToTempStr
{
    my ( $self, $str ) = @_;

    # Clean the text (special character removal, then Text::Unidecode)
    my $text = $self->RemoveSpecialCharactersFromString( $str );
    $text    = Text::Unidecode::unidecode( $text );

    my $sentencePerLine = $self->GetStoreAsSentencePerLine();

    if( $sentencePerLine == 1 )
    {
        # Removes Spaces At Left Side Of String
        $text =~ s/^\s+//;
    }
    elsif( $sentencePerLine == 0 )
    {
        # Removes Spaces At Both Ends Of String And More Than Once Space In-Between Ends
        $text =~ s/^\s+|\s(?=\s)|\s+$//g;
    }

    # Increment Word Counter
    my @tokens = split( ' ', $text );
    $preCompWordCount += scalar( @tokens );

    # Append String To Temp String (no separator after a trailing new line)
    my $separator = ( $text =~ /\n\z/ ) ? "" : " ";

    return $self->{ _tempStr } .= $text . $separator;
}
1744
|
|
|
|
|
|
|
|
1745
|
|
|
|
|
|
|
# Resets the temporary article text to the empty string and returns it.
sub ClearTempStr
{
    my $self = shift;

    $self->{ _tempStr } = "";

    return $self->{ _tempStr };
}
1750
|
|
|
|
|
|
|
|
1751
|
|
|
|
|
|
|
# Stores the temporary article date and returns the stored value.
sub SetTempDate
{
    my $self = shift;
    my $date = shift;

    $self->{ _tempDate } = $date;

    return $self->{ _tempDate };
}
1756
|
|
|
|
|
|
|
|
1757
|
|
|
|
|
|
|
# Resets the temporary article date to the empty string and returns it.
sub ClearTempDate
{
    my $self = shift;

    $self->{ _tempDate } = "";

    return $self->{ _tempDate };
}
1762
|
|
|
|
|
|
|
|
1763
|
|
|
|
|
|
|
# Replaces the stored compound word list with a copy of @{ $aryRef }.
#
# Input : $aryRef -> Array reference of compound words. An undefined value is
#                    treated as an empty list.
# Output: The stored words in list context, their count in scalar context.
#
# A warning is logged when a non-empty list is being replaced.
sub SetCompoundWordAry
{
    my ( $self, $aryRef ) = @_;

    $aryRef = [] if !defined( $aryRef );

    # The slot may not exist yet; dereferencing undef here is fatal under "use strict"
    my $current = $self->{ _compoundWordAry };
    my $hasData = ( defined( $current ) && @{ $current } > 0 );

    if( $hasData )
    {
        $self->WriteLog( "Warning: Setting CompoundWordArray when array is already defined - Clearing Previous Array" );
        undef( $self->{ _compoundWordAry } );
    }

    return @{ $self->{ _compoundWordAry } } = @{ $aryRef };
}
1770
|
|
|
|
|
|
|
|
1771
|
|
|
|
|
|
|
# Discards the stored compound word list and leaves a new, empty array in
# its place.
sub ClearCompoundWordAry
{
    my $self = shift;

    # Drop the reference to the previous array
    $self->{ _compoundWordAry } = undef;

    # The assignment autovivifies a new array for the slot
    return @{ $self->{ _compoundWordAry } } = ();
}
1777
|
|
|
|
|
|
|
|
1778
|
|
|
|
|
|
|
# Stores $bst as the compound word search tree and returns the stored value.
# If a tree is already stored, a warning is logged, DESTROY() is called on
# the old tree and the slot is cleared before the new tree is stored.
sub SetCompoundWordBST
{
    my ( $self, $bst ) = @_;

    if( defined( $self->{ _compoundWordBST } ) )
    {
        $self->WriteLog( "Warning: Setting CompoundWordBST when BST is already defined - Clearing Previous BST" );
        $self->{ _compoundWordBST }->DESTROY();
        undef( $self->{ _compoundWordBST } );
    }

    $self->{ _compoundWordBST } = $bst;

    return $self->{ _compoundWordBST };
}
1786
|
|
|
|
|
|
|
|
1787
|
|
|
|
|
|
|
# Drops the stored compound word search tree. Returns the (now undefined) slot value.
sub ClearCompoundWordBST
{
    my $self = shift;

    $self->{ _compoundWordBST } = undef;

    return $self->{ _compoundWordBST };
}
1793
|
|
|
|
|
|
|
|
1794
|
|
|
|
|
|
|
# Stores the maximum compound word length and returns the stored value.
sub SetMaxCompoundWordLength
{
    my $self  = shift;
    my $value = shift;

    $self->{ _maxCompoundWordLength } = $value;

    return $self->{ _maxCompoundWordLength };
}
1799
|
|
|
|
|
|
|
|
1800
|
|
|
|
|
|
|
# Stores the overwrite-existing-file flag and returns the stored value.
sub SetOverwriteExistingFile
{
    my $self  = shift;
    my $value = shift;

    $self->{ _overwriteExistingFile } = $value;

    return $self->{ _overwriteExistingFile };
}
1805
|
|
|
|
|
|
|
|
1806
|
|
|
|
|
|
|
|
1807
|
|
|
|
|
|
|
###################################################################################### |
1808
|
|
|
|
|
|
|
# Debug Functions |
1809
|
|
|
|
|
|
|
###################################################################################### |
1810
|
|
|
|
|
|
|
|
1811
|
|
|
|
|
|
|
# Returns the current local time as "HH:MM:SS", each field zero-padded to
# two digits.
sub GetTime
{
    my $self = shift;

    my ( $seconds, $minutes, $hours ) = localtime();

    return sprintf( "%02d:%02d:%02d", $hours, $minutes, $seconds );
}
1822
|
|
|
|
|
|
|
|
1823
|
|
|
|
|
|
|
# Returns the current local date as "Month/Day/Year" (no zero padding).
sub GetDate
{
    my $self = shift;

    my @now = localtime();

    # localtime() reports a 0-based month and the year as an offset from 1900
    my $day   = $now[3];
    my $month = $now[4] + 1;
    my $year  = $now[5] + 1900;

    return join( "/", $month, $day, $year );
}
1833
|
|
|
|
|
|
|
|
1834
|
|
|
|
|
|
|
# Writes a time-stamped log line to the console and/or the log file handle.
#
# Input : $string       -> Message text. Nothing happens when it is undefined.
#         $printNewLine -> Appends "\n" unless it is 0. (Default: 1)
#
# Console output happens when GetDebugLog() is true, file output when
# GetWriteLog() is true and GetFileHandle() returns a defined handle.
# $debugLock is held for the rest of the call.
sub WriteLog
{
    my ( $self, $string, $printNewLine ) = @_;

    return unless defined( $string );
    $printNewLine = 1 unless defined( $printNewLine );

    # Prevent Other Threads From Writing At The Same Time
    lock( $debugLock );

    my $isModuleObject = ( ref( $self ) eq "Word2vec::Xmltow2v" );

    if( $self->GetDebugLog() )
    {
        unless( $isModuleObject )
        {
            print( GetDate() . " " . GetTime() . " - xmltow2v: Cannot Call WriteLog() From Outside Module!\n" );
            return;
        }

        print GetDate() . " " . GetTime() . " - xmltow2v::$string";
        print "\n" if( $printNewLine != 0 );
    }

    if( $self->GetWriteLog() )
    {
        unless( $isModuleObject )
        {
            print( GetDate() . " " . GetTime() . " - xmltow2v: Cannot Call WriteLog() From Outside Module!\n" );
            return;
        }

        my $logHandle = $self->GetFileHandle();

        if( defined( $logHandle ) )
        {
            print( $logHandle GetDate() . " " . GetTime() . " - xmltow2v::$string" );
            print( $logHandle "\n" ) if( $printNewLine != 0 );
        }
    }
}
1876
|
|
|
|
|
|
|
|
1877
|
|
|
|
|
|
|
#################### All Modules Are To Output "1"(True) at EOF ###################### |
1878
|
|
|
|
|
|
|
1; |
1879
|
|
|
|
|
|
|
|
1880
|
|
|
|
|
|
|
|
1881
|
|
|
|
|
|
|
=head1 NAME |
1882
|
|
|
|
|
|
|
|
1883
|
|
|
|
|
|
|
Word2vec::Xmltow2v - Medline XML-To-W2V Module. |
1884
|
|
|
|
|
|
|
|
1885
|
|
|
|
|
|
|
=head1 SYNOPSIS |
1886
|
|
|
|
|
|
|
|
1887
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
1888
|
|
|
|
|
|
|
|
1889
|
|
|
|
|
|
|
# Parameters: Debug Output = True, Write Log = False, StoreTitle = True, StoreAbstract = True, Quick Parse = True, CompoundifyText = True, Use Multi-Threading (Default = 1 Thread Per CPU Core) |
1890
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new( 1, 0, 1, 1, 1, 1, 2 ); # Note: Specifying no parameters implies default settings. |
1891
|
|
|
|
|
|
|
$xmlconv->SetWorkingDir( "Medline/XML/Directory/Here" ); |
1892
|
|
|
|
|
|
|
$xmlconv->SetSavePath( "textcorpus.txt" ); |
1893
|
|
|
|
|
|
|
$xmlconv->SetStoreTitle( 1 ); |
1894
|
|
|
|
|
|
|
$xmlconv->SetStoreAbstract( 1 ); |
1895
|
|
|
|
|
|
|
$xmlconv->SetBeginDate( "01/01/2004" ); |
1896
|
|
|
|
|
|
|
$xmlconv->SetEndDate( "08/13/2016" ); |
1897
|
|
|
|
|
|
|
$xmlconv->SetOverwriteExistingFile( 1 ); |
1898
|
|
|
|
|
|
|
|
1899
|
|
|
|
|
|
|
# If Compound Word File Exists, Store It In Memory And Create Compound Word Binary Search Tree |
1900
|
|
|
|
|
|
|
$xmlconv->ReadCompoundWordDataFromFile( "compoundword.txt", 1 ); |
1901
|
|
|
|
|
|
|
$xmlconv->CreateCompoundWordBST(); |
1902
|
|
|
|
|
|
|
|
1903
|
|
|
|
|
|
|
# Parse XML Files or Directory Of Files |
1904
|
|
|
|
|
|
|
$xmlconv->ConvertMedlineXMLToW2V( "/xmlDirectory/" ); |
1905
|
|
|
|
|
|
|
undef( $xmlconv ); |
1906
|
|
|
|
|
|
|
|
1907
|
|
|
|
|
|
|
=head1 DESCRIPTION |
1908
|
|
|
|
|
|
|
|
1909
|
|
|
|
|
|
|
Word2vec::Xmltow2v is an XML-to-text module which converts Medline XML article title |
1910
|
|
|
|
|
|
|
and abstract data, given a date range, into a plain text corpus for use |
1911
|
|
|
|
|
|
|
with Word2vec::Interface. It also "compoundifies" during text corpus compilation |
1912
|
|
|
|
|
|
|
given a compound word file. |
1913
|
|
|
|
|
|
|
|
1914
|
|
|
|
|
|
|
=head2 Main Functions |
1915
|
|
|
|
|
|
|
|
1916
|
|
|
|
|
|
|
=head3 new |
1917
|
|
|
|
|
|
|
|
1918
|
|
|
|
|
|
|
Description: |
1919
|
|
|
|
|
|
|
|
1920
|
|
|
|
|
|
|
Returns a new 'Word2vec::Xmltow2v' module object. |
1921
|
|
|
|
|
|
|
|
1922
|
|
|
|
|
|
|
Note: Specifying no parameters implies default options. |
1923
|
|
|
|
|
|
|
|
1924
|
|
|
|
|
|
|
Default Parameters: |
1925
|
|
|
|
|
|
|
debugLog = 0 |
1926
|
|
|
|
|
|
|
writeLog = 0 |
1927
|
|
|
|
|
|
|
storeTitle = 1 |
1928
|
|
|
|
|
|
|
storeAbstract = 1 |
1929
|
|
|
|
|
|
|
quickParse = 0 |
1930
|
|
|
|
|
|
|
compoundifyText = 0 |
1931
|
|
|
|
|
|
|
storeAsSentencePerLine = 0 |
1932
|
|
|
|
|
|
|
numOfThreads = Number of CPUs/CPU cores (1 thread per core/CPU) |
1933
|
|
|
|
|
|
|
workingDir = Current Directory |
1934
|
|
|
|
|
|
|
savePath = Current Directory |
1935
|
|
|
|
|
|
|
beginDate = "00/00/0000" |
1936
|
|
|
|
|
|
|
endDate = "99/99/9999" |
1937
|
|
|
|
|
|
|
xmlStringToParse = "(null)" |
1938
|
|
|
|
|
|
|
textCorpusString = "" |
1939
|
|
|
|
|
|
|
twigHandler = 0 |
1940
|
|
|
|
|
|
|
parsedCount = 0 |
1941
|
|
|
|
|
|
|
tempDate = "" |
1942
|
|
|
|
|
|
|
tempStr = "" |
1943
|
|
|
|
|
|
|
outputFileName = "textcorpus.txt" |
1944
|
|
|
|
|
|
|
compoundWordAry = () |
1945
|
|
|
|
|
|
|
compoundWordBST = Word2vec::Bst->new() |
1946
|
|
|
|
|
|
|
maxCompoundWordLength = 0 |
1947
|
|
|
|
|
|
|
overwriteExistingFile = 0 |
1948
|
|
|
|
|
|
|
|
1949
|
|
|
|
|
|
|
Input: |
1950
|
|
|
|
|
|
|
|
1951
|
|
|
|
|
|
|
$debugLog -> Instructs module to print debug statements to the console. (1 = True / 0 = False) |
1952
|
|
|
|
|
|
|
$writeLog -> Instructs module to print debug statements to a log file. (1 = True / 0 = False) |
1953
|
|
|
|
|
|
|
$storeTitle -> Instructs module to store Medline article titles during text corpus compilation. (1 = True / 0 = False) |
1954
|
|
|
|
|
|
|
$storeAbstract -> Instructs module to store Medline article abstracts during text corpus compilation. (1 = True / 0 = False) |
1955
|
|
|
|
|
|
|
$quickParse -> Instructs module to utilize quick XML parsing Functions for known Medline article title and abstract tags. (1 = True / 0 = False) |
1956
|
|
|
|
|
|
|
$compoundifyText -> Instructs module to compoundify text on the fly given a compound word file. This is automatically set |
1957
|
|
|
|
|
|
|
when reading the compound word file to memory regardless of user setting. (1 = True / 0 = False) |
1958
|
|
|
|
|
|
|
$storeAsSentencePerLine -> Instructs module to store parsed Medline data either as one long single sentence or as separate sentences on new lines, split on the period character. (1 = True / 0 = False) |
1959
|
|
|
|
|
|
|
$numOfThreads -> Specifies the number of worker threads which parse Medline XML files simultaneously to create the text corpus. |
1960
|
|
|
|
|
|
|
This speeds up text corpus generation by the number of physical cores present on a given machine. (Positive integer value) |
1961
|
|
|
|
|
|
|
i.e. Using four threads on an Intel i7 machine makes text corpus generation roughly four times faster than running single-threaded. |
1962
|
|
|
|
|
|
|
$workingDir -> Specifies the current working directory. (String) |
1963
|
|
|
|
|
|
|
$savePath -> Specifies the save path for text corpus generation. (String) |
1964
|
|
|
|
|
|
|
$beginDate -> Specifies the beginning date range for Medline article text corpus composition. (Format: XX/XX/XXXX) |
1965
|
|
|
|
|
|
|
$endDate -> Specifies the ending date range for Medline article text corpus composition. (Format: XX/XX/XXXX) |
1966
|
|
|
|
|
|
|
$xmlStringToParse -> Storage location for the current Medline XML file in memory. (String) |
1967
|
|
|
|
|
|
|
$textCorpusString -> Temporary storage location for text corpus generation in memory. (String) |
1968
|
|
|
|
|
|
|
$twigHandler -> XML::Twig object location. |
1969
|
|
|
|
|
|
|
$parsedCount -> Number of parsed Medline articles during text corpus generation. |
1970
|
|
|
|
|
|
|
$tempDate -> Temporary storage location for current Medline article date during text corpus compilation. |
1971
|
|
|
|
|
|
|
$tempStr -> Temporary storage location for current Medline article title/abstract during text corpus compilation. |
1972
|
|
|
|
|
|
|
$outputFileName -> Output file path/name. |
1973
|
|
|
|
|
|
|
$compoundWordAry -> Storage location for compound words, used to compoundify text. (Array) <- Deprecated |
1974
|
|
|
|
|
|
|
$compoundWordBST -> Storage location for compound words, used to compoundify text. (Binary Search Tree) <- Supersedes '$compoundWordAry' |
1975
|
|
|
|
|
|
|
$maxCompoundWordLength -> Maximum number of words able to be compoundified in one phrase. ie "six_sea_snakes_were_sailing" = 5 compoundified words. |
1976
|
|
|
|
|
|
|
The compounding algorithm will attempt to compoundify no more than this set value, even-though the compound word list could |
1977
|
|
|
|
|
|
|
possibly contain larger compounded phrases. |
1978
|
|
|
|
|
|
|
$overwriteExistingFile -> Instructs the module to either overwrite any existing text corpus files or append to the existing file. |
1979
|
|
|
|
|
|
|
|
1980
|
|
|
|
|
|
|
Note: It is not recommended to specify all new() parameters, as it has not been thoroughly tested. Maximum recommended parameters to be specified include: |
1981
|
|
|
|
|
|
|
"debugLog, writeLog, storeTitle, storeAbstract, quickParse, compoundifyText, numOfThreads, workingDir, savePath, beginDate, endDate" |
1982
|
|
|
|
|
|
|
|
1983
|
|
|
|
|
|
|
Output: |
1984
|
|
|
|
|
|
|
|
1985
|
|
|
|
|
|
|
Word2vec::Xmltow2v object. |
1986
|
|
|
|
|
|
|
|
1987
|
|
|
|
|
|
|
Example: |
1988
|
|
|
|
|
|
|
|
1989
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
1990
|
|
|
|
|
|
|
|
1991
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); # Note: Specifying no parameters implies default settings as listed above. |
1992
|
|
|
|
|
|
|
|
1993
|
|
|
|
|
|
|
undef( $xmlconv ); |
1994
|
|
|
|
|
|
|
|
1995
|
|
|
|
|
|
|
# Or |
1996
|
|
|
|
|
|
|
|
1997
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
1998
|
|
|
|
|
|
|
|
1999
|
|
|
|
|
|
|
# Parameters: Debug Output = True, Write Log = False, StoreTitle = True, StoreAbstract = True, Quick Parse = True, CompoundifyText = True, Use Multi-Threading (2 Threads) |
2000
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new( 1, 0, 1, 1, 1, 1, 2 ); |
2001
|
|
|
|
|
|
|
|
2002
|
|
|
|
|
|
|
undef( $xmlconv ); |
2003
|
|
|
|
|
|
|
|
2004
|
|
|
|
|
|
|
=head3 DESTROY |
2005
|
|
|
|
|
|
|
|
2006
|
|
|
|
|
|
|
Description: |
2007
|
|
|
|
|
|
|
|
2008
|
|
|
|
|
|
|
Removes module objects and variables from memory. |
2009
|
|
|
|
|
|
|
|
2010
|
|
|
|
|
|
|
Input: |
2011
|
|
|
|
|
|
|
|
2012
|
|
|
|
|
|
|
None |
2013
|
|
|
|
|
|
|
|
2014
|
|
|
|
|
|
|
Output: |
2015
|
|
|
|
|
|
|
|
2016
|
|
|
|
|
|
|
None |
2017
|
|
|
|
|
|
|
|
2018
|
|
|
|
|
|
|
Example: |
2019
|
|
|
|
|
|
|
|
2020
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
2021
|
|
|
|
|
|
|
|
2022
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
2023
|
|
|
|
|
|
|
|
2024
|
|
|
|
|
|
|
$xmlconv->DESTROY(); |
2025
|
|
|
|
|
|
|
undef( $xmlconv ); |
2026
|
|
|
|
|
|
|
|
2027
|
|
|
|
|
|
|
=head3 ConvertMedlineXMLToW2V |
2028
|
|
|
|
|
|
|
|
2029
|
|
|
|
|
|
|
Description: |
2030
|
|
|
|
|
|
|
|
2031
|
|
|
|
|
|
|
Parses specified parameter Medline XML file or directory of files, creating a text corpus. Returns 0 if successful or -1 if an error occurs. |
2032
|
|
|
|
|
|
|
|
2033
|
|
|
|
|
|
|
Note: Supports plain Medline XML or gun-zipped XML files. |
2034
|
|
|
|
|
|
|
|
2035
|
|
|
|
|
|
|
Input: |
2036
|
|
|
|
|
|
|
|
2037
|
|
|
|
|
|
|
$filePath -> XML file path to parse. (This can be a single file or directory of XML/XML.gz files). |
2038
|
|
|
|
|
|
|
|
2039
|
|
|
|
|
|
|
Output: |
2040
|
|
|
|
|
|
|
|
2041
|
|
|
|
|
|
|
$value -> '0' = Successful / '-1' = Un-Successful |
2042
|
|
|
|
|
|
|
|
2043
|
|
|
|
|
|
|
Example: |
2044
|
|
|
|
|
|
|
|
2045
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
2046
|
|
|
|
|
|
|
|
2047
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); # Note: Specifying no parameters implies default settings |
2048
|
|
|
|
|
|
|
$xmlconv->SetSavePath( "testCorpus.txt" ); |
2049
|
|
|
|
|
|
|
$xmlconv->SetStoreTitle( 1 ); |
2050
|
|
|
|
|
|
|
$xmlconv->SetStoreAbstract( 1 ); |
2051
|
|
|
|
|
|
|
$xmlconv->SetBeginDate( "01/01/2004" ); |
2052
|
|
|
|
|
|
|
$xmlconv->SetEndDate( "08/13/2016" ); |
2053
|
|
|
|
|
|
|
$xmlconv->SetOverwriteExistingFile( 1 ); |
2054
|
|
|
|
|
|
|
$xmlconv->ConvertMedlineXMLToW2V( "/xmlDirectory/" ); |
2055
|
|
|
|
|
|
|
undef( $xmlconv ); |
2056
|
|
|
|
|
|
|
|
2057
|
|
|
|
|
|
|
|
2058
|
|
|
|
|
|
|
=head3 _ThreadedConvert |
2059
|
|
|
|
|
|
|
|
2060
|
|
|
|
|
|
|
Description: |
2061
|
|
|
|
|
|
|
|
2062
|
|
|
|
|
|
|
Multi-Threaded Medline XML to text corpus conversion function. |
2063
|
|
|
|
|
|
|
|
2064
|
|
|
|
|
|
|
Input: |
2065
|
|
|
|
|
|
|
|
2066
|
|
|
|
|
|
|
$directory -> File directory or directory of files to parse. |
2067
|
|
|
|
|
|
|
|
2068
|
|
|
|
|
|
|
Output: |
2069
|
|
|
|
|
|
|
|
2070
|
|
|
|
|
|
|
$value -> '0' = Successful / '-1' = Un-successful |
2071
|
|
|
|
|
|
|
|
2072
|
|
|
|
|
|
|
Example: |
2073
|
|
|
|
|
|
|
|
2074
|
|
|
|
|
|
|
Warning: This is a private function called by 'ConvertMedlineXMLToW2V()'. It should not be called outside of xmltow2v module. |
2075
|
|
|
|
|
|
|
|
2076
|
|
|
|
|
|
|
=head3 _ParseXMLString |
2077
|
|
|
|
|
|
|
|
2078
|
|
|
|
|
|
|
Description: |
2079
|
|
|
|
|
|
|
|
2080
|
|
|
|
|
|
|
Parses passed string parameter for Medline XML article title and abstract data and appends found data to the text corpus. |
2081
|
|
|
|
|
|
|
|
2082
|
|
|
|
|
|
|
Input: |
2083
|
|
|
|
|
|
|
|
2084
|
|
|
|
|
|
|
$string -> Medline XML string data to parse. |
2085
|
|
|
|
|
|
|
|
2086
|
|
|
|
|
|
|
Output: |
2087
|
|
|
|
|
|
|
|
2088
|
|
|
|
|
|
|
None |
2089
|
|
|
|
|
|
|
|
2090
|
|
|
|
|
|
|
Example: |
2091
|
|
|
|
|
|
|
|
2092
|
|
|
|
|
|
|
Warning: This is a private function called by "ConvertMedlineXMLToW2V()" and "_ThreadedConvert()". It should not be called outside of xmltow2v module. |
2093
|
|
|
|
|
|
|
|
2094
|
|
|
|
|
|
|
=head3 _CheckParseRequirements |
2095
|
|
|
|
|
|
|
|
2096
|
|
|
|
|
|
|
Description: |
2097
|
|
|
|
|
|
|
|
2098
|
|
|
|
|
|
|
Checks passed string parameter to see if it contains relevant data and XML::Twig handler is initialized. |
2099
|
|
|
|
|
|
|
|
2100
|
|
|
|
|
|
|
Input: |
2101
|
|
|
|
|
|
|
|
2102
|
|
|
|
|
|
|
$string -> String data to check |
2103
|
|
|
|
|
|
|
|
2104
|
|
|
|
|
|
|
Output: |
2105
|
|
|
|
|
|
|
|
2106
|
|
|
|
|
|
|
$value -> '0' = Successful / '-1' = Un-successful |
2107
|
|
|
|
|
|
|
|
2108
|
|
|
|
|
|
|
Example: |
2109
|
|
|
|
|
|
|
|
2110
|
|
|
|
|
|
|
Warning: This is a private function called by "_ParseXMLString()". It should not be called outside of xmltow2v module. |
2111
|
|
|
|
|
|
|
|
2112
|
|
|
|
|
|
|
=head3 _CheckForNullData |
2113
|
|
|
|
|
|
|
|
2114
|
|
|
|
|
|
|
Description: |
2115
|
|
|
|
|
|
|
|
2116
|
|
|
|
|
|
|
Checks passed string parameter for "(null)" string. |
2117
|
|
|
|
|
|
|
|
2118
|
|
|
|
|
|
|
Input: |
2119
|
|
|
|
|
|
|
|
2120
|
|
|
|
|
|
|
$string -> String data to be checked. |
2121
|
|
|
|
|
|
|
|
2122
|
|
|
|
|
|
|
Output: |
2123
|
|
|
|
|
|
|
|
2124
|
|
|
|
|
|
|
$value -> '1' = True/Null data or '0' = False/Valid data |
2125
|
|
|
|
|
|
|
|
2126
|
|
|
|
|
|
|
Example: |
2127
|
|
|
|
|
|
|
|
2128
|
|
|
|
|
|
|
Warning: This is a private function called by "new()" and "_ParseXMLString()". It should not be called outside of xmltow2v module. |
2129
|
|
|
|
|
|
|
|
2130
|
|
|
|
|
|
|
=head3 _RemoveXMLVersion |
2131
|
|
|
|
|
|
|
|
2132
|
|
|
|
|
|
|
Description: |
2133
|
|
|
|
|
|
|
|
2134
|
|
|
|
|
|
|
Removes the XML Version string prior to parsing the XML string data. (Deprecated) |
2135
|
|
|
|
|
|
|
|
2136
|
|
|
|
|
|
|
Input: |
2137
|
|
|
|
|
|
|
|
2138
|
|
|
|
|
|
|
$string -> Medline XML string data |
2139
|
|
|
|
|
|
|
|
2140
|
|
|
|
|
|
|
Output: |
2141
|
|
|
|
|
|
|
|
2142
|
|
|
|
|
|
|
None |
2143
|
|
|
|
|
|
|
|
2144
|
|
|
|
|
|
|
Example: |
2145
|
|
|
|
|
|
|
|
2146
|
|
|
|
|
|
|
Warning: This is a private function called by "new()" and "_ParseXMLString()". It should not be called outside of xmltow2v module. |
2147
|
|
|
|
|
|
|
|
2148
|
|
|
|
|
|
|
=head3 _ParseMedlineCitationSet |
2149
|
|
|
|
|
|
|
|
2150
|
|
|
|
|
|
|
Description: |
2151
|
|
|
|
|
|
|
|
2152
|
|
|
|
|
|
|
Parses 'MedlineCitationSet' tag data in Medline XML file. |
2153
|
|
|
|
|
|
|
|
2154
|
|
|
|
|
|
|
Input: |
2155
|
|
|
|
|
|
|
|
2156
|
|
|
|
|
|
|
$twigHandler -> XML::Twig handler |
2157
|
|
|
|
|
|
|
$root -> Beginning of XML directory to parse. ( Directory in Medline XML string data ) |
2158
|
|
|
|
|
|
|
|
2159
|
|
|
|
|
|
|
Output: |
2160
|
|
|
|
|
|
|
|
2161
|
|
|
|
|
|
|
None |
2162
|
|
|
|
|
|
|
|
2163
|
|
|
|
|
|
|
Example: |
2164
|
|
|
|
|
|
|
|
2165
|
|
|
|
|
|
|
Warning: This is a private function and is called by xmltow2v's XML::Twig handler. It should not be called outside of xmltow2v module. |
2166
|
|
|
|
|
|
|
|
2167
|
|
|
|
|
|
|
=head3 _ParseMedlineArticle |
2168
|
|
|
|
|
|
|
|
2169
|
|
|
|
|
|
|
Description: |
2170
|
|
|
|
|
|
|
|
2171
|
|
|
|
|
|
|
Parses 'MedlineArticle' tag data in Medline XML file. |
2172
|
|
|
|
|
|
|
|
2173
|
|
|
|
|
|
|
Input: |
2174
|
|
|
|
|
|
|
|
2175
|
|
|
|
|
|
|
$medlineArticle -> Current Medline article directory in XML data (XML::Twig directory) |
2176
|
|
|
|
|
|
|
|
2177
|
|
|
|
|
|
|
Output: |
2178
|
|
|
|
|
|
|
|
2179
|
|
|
|
|
|
|
$value -> '1' = Finished parsing Medline article. |
2180
|
|
|
|
|
|
|
|
2181
|
|
|
|
|
|
|
Example: |
2182
|
|
|
|
|
|
|
|
2183
|
|
|
|
|
|
|
Warning: This is a private function and is called by xmltow2v's XML::Twig handler. It should not be called outside of xmltow2v module. |
2184
|
|
|
|
|
|
|
|
2185
|
|
|
|
|
|
|
=head3 _ParseDateCreated |
2186
|
|
|
|
|
|
|
|
2187
|
|
|
|
|
|
|
Description: |
2188
|
|
|
|
|
|
|
|
2189
|
|
|
|
|
|
|
Parses 'DateCreated' tag data in Medline XML file. |
2190
|
|
|
|
|
|
|
|
2191
|
|
|
|
|
|
|
Input: |
2192
|
|
|
|
|
|
|
|
2193
|
|
|
|
|
|
|
$article -> Current Medline article in XML data (XML::Twig directory) |
2194
|
|
|
|
|
|
|
|
2195
|
|
|
|
|
|
|
Output: |
2196
|
|
|
|
|
|
|
|
2197
|
|
|
|
|
|
|
$date -> 'XX/XX/XXXX' (Month/Day/Year) |
2198
|
|
|
|
|
|
|
|
2199
|
|
|
|
|
|
|
Example: |
2200
|
|
|
|
|
|
|
|
2201
|
|
|
|
|
|
|
Warning: This is a private function and is called by xmltow2v's XML::Twig handler. It should not be called outside of xmltow2v module. |
2202
|
|
|
|
|
|
|
|
2203
|
|
|
|
|
|
|
=head3 _ParseArticle |
2204
|
|
|
|
|
|
|
|
2205
|
|
|
|
|
|
|
Description: |
2206
|
|
|
|
|
|
|
|
2207
|
|
|
|
|
|
|
Parses 'Article' tag data in Medline XML file. Fetches 'ArticleTitle', 'Journal' and 'Abstract' XML tags. |
2208
|
|
|
|
|
|
|
|
2209
|
|
|
|
|
|
|
Input: |
2210
|
|
|
|
|
|
|
|
2211
|
|
|
|
|
|
|
$article -> Current Medline article in XML data (XML::Twig directory) |
2212
|
|
|
|
|
|
|
|
2213
|
|
|
|
|
|
|
Output: |
2214
|
|
|
|
|
|
|
|
2215
|
|
|
|
|
|
|
None |
2216
|
|
|
|
|
|
|
|
2217
|
|
|
|
|
|
|
Example: |
2218
|
|
|
|
|
|
|
|
2219
|
|
|
|
|
|
|
Warning: This is a private function and is called by xmltow2v's XML::Twig handler. It should not be called outside of xmltow2v module. |
2220
|
|
|
|
|
|
|
|
2221
|
|
|
|
|
|
|
=head3 _ParseJournal |
2222
|
|
|
|
|
|
|
|
2223
|
|
|
|
|
|
|
Description: |
2224
|
|
|
|
|
|
|
|
2225
|
|
|
|
|
|
|
Parses 'Journal' tag data in Medline XML file. Fetches 'Title' XML tag. |
2226
|
|
|
|
|
|
|
|
2227
|
|
|
|
|
|
|
Input: |
2228
|
|
|
|
|
|
|
|
2229
|
|
|
|
|
|
|
$journalRoot -> Current Medline journal directory in XML data (XML::Twig directory) |
2230
|
|
|
|
|
|
|
|
2231
|
|
|
|
|
|
|
Output: |
2232
|
|
|
|
|
|
|
|
2233
|
|
|
|
|
|
|
None |
2234
|
|
|
|
|
|
|
|
2235
|
|
|
|
|
|
|
Example: |
2236
|
|
|
|
|
|
|
|
2237
|
|
|
|
|
|
|
Warning: This is a private function and is called by xmltow2v's XML::Twig handler. It should not be called outside of xmltow2v module. |
2238
|
|
|
|
|
|
|
|
2239
|
|
|
|
|
|
|
=head3 _ParseOtherAbstract |
2240
|
|
|
|
|
|
|
|
2241
|
|
|
|
|
|
|
Description: |
2242
|
|
|
|
|
|
|
|
2243
|
|
|
|
|
|
|
Parses 'Abstract' tag data in Medline XML file. Fetches 'AbstractText' XML tag. |
2244
|
|
|
|
|
|
|
|
2245
|
|
|
|
|
|
|
Input: |
2246
|
|
|
|
|
|
|
|
2247
|
|
|
|
|
|
|
$abstractRoot -> Current Medline abstract directory in XML data (XML::Twig directory) |
2248
|
|
|
|
|
|
|
|
2249
|
|
|
|
|
|
|
Output: |
2250
|
|
|
|
|
|
|
|
2251
|
|
|
|
|
|
|
None |
2252
|
|
|
|
|
|
|
|
2253
|
|
|
|
|
|
|
Example: |
2254
|
|
|
|
|
|
|
|
2255
|
|
|
|
|
|
|
Warning: This is a private function and is called by xmltow2v's XML::Twig handler. It should not be called outside of xmltow2v module. |
2256
|
|
|
|
|
|
|
|
2257
|
|
|
|
|
|
|
=head3 _QuickParseDateCreated |
2258
|
|
|
|
|
|
|
|
2259
|
|
|
|
|
|
|
Description: |
2260
|
|
|
|
|
|
|
|
2261
|
|
|
|
|
|
|
Parses 'DateCreated' tag data in Medline XML file. Used when 'QuickParse' member variable is enabled. Sets $tempDate member variable to parsed 'DateCreated' tag data. |
2262
|
|
|
|
|
|
|
|
2263
|
|
|
|
|
|
|
Input: |
2264
|
|
|
|
|
|
|
|
2265
|
|
|
|
|
|
|
$twigHandler -> 'XML::Twig' handler |
2266
|
|
|
|
|
|
|
$article -> Current Medline article directory in XML data (XML::Twig directory) |
2267
|
|
|
|
|
|
|
|
2268
|
|
|
|
|
|
|
Output: |
2269
|
|
|
|
|
|
|
|
2270
|
|
|
|
|
|
|
None |
2271
|
|
|
|
|
|
|
|
2272
|
|
|
|
|
|
|
Example: |
2273
|
|
|
|
|
|
|
|
2274
|
|
|
|
|
|
|
Warning: This is a private function and is called by xmltow2v's XML::Twig handler. It should not be called outside of xmltow2v module. |
2275
|
|
|
|
|
|
|
|
2276
|
|
|
|
|
|
|
=head3 _QuickParseJournal |
2277
|
|
|
|
|
|
|
|
2278
|
|
|
|
|
|
|
Description: |
2279
|
|
|
|
|
|
|
|
2280
|
|
|
|
|
|
|
Parses 'Journal' tag data in Medline XML file. Fetches 'Title' XML tag. Used when 'QuickParse' member variable is enabled. |
2281
|
|
|
|
|
|
|
Sets $tempStr to parsed data and stores in text corpus. |
2282
|
|
|
|
|
|
|
|
2283
|
|
|
|
|
|
|
Input: |
2284
|
|
|
|
|
|
|
|
2285
|
|
|
|
|
|
|
$twigHandler -> 'XML::Twig' handler. |
2286
|
|
|
|
|
|
|
$journalRoot -> Current Medline journal directory in XML data (XML::Twig directory) |
2287
|
|
|
|
|
|
|
|
2288
|
|
|
|
|
|
|
Output: |
2289
|
|
|
|
|
|
|
|
2290
|
|
|
|
|
|
|
None |
2291
|
|
|
|
|
|
|
|
2292
|
|
|
|
|
|
|
Example: |
2293
|
|
|
|
|
|
|
|
2294
|
|
|
|
|
|
|
Warning: This is a private function and is called by xmltow2v's XML::Twig handler. It should not be called outside of xmltow2v module. |
2295
|
|
|
|
|
|
|
|
2296
|
|
|
|
|
|
|
=head3 _QuickParseArticle |
2297
|
|
|
|
|
|
|
|
2298
|
|
|
|
|
|
|
Description: |
2299
|
|
|
|
|
|
|
|
2300
|
|
|
|
|
|
|
Parses 'Article' tag data in Medline XML file. Fetches 'ArticleTitle' and 'Abstract' XML tags. Used when 'QuickParse' member variable is enabled. |
2301
|
|
|
|
|
|
|
Sets $tempStr to parsed data and stores in text corpus. |
2302
|
|
|
|
|
|
|
|
2303
|
|
|
|
|
|
|
Input: |
2304
|
|
|
|
|
|
|
|
2305
|
|
|
|
|
|
|
$twigHandler -> 'XML::Twig' handler. |
2306
|
|
|
|
|
|
|
$article -> Current Medline article directory in XML data (XML::Twig directory) |
2307
|
|
|
|
|
|
|
|
2308
|
|
|
|
|
|
|
Output: |
2309
|
|
|
|
|
|
|
|
2310
|
|
|
|
|
|
|
None |
2311
|
|
|
|
|
|
|
|
2312
|
|
|
|
|
|
|
Example: |
2313
|
|
|
|
|
|
|
|
2314
|
|
|
|
|
|
|
Warning: This is a private function and is called by xmltow2v's XML::Twig handler. It should not be called outside of xmltow2v module. |
2315
|
|
|
|
|
|
|
|
2316
|
|
|
|
|
|
|
=head3 _QuickParseOtherAbstract |
2317
|
|
|
|
|
|
|
|
2318
|
|
|
|
|
|
|
Description: |
2319
|
|
|
|
|
|
|
|
2320
|
|
|
|
|
|
|
Parses 'Abstract' tag data in Medline XML file. Fetches 'AbstractText' XML tag. Used when 'QuickParse' member variable is enabled. |
2321
|
|
|
|
|
|
|
Sets $tempStr to parsed data and stores in text corpus. |
2322
|
|
|
|
|
|
|
|
2323
|
|
|
|
|
|
|
Input: |
2324
|
|
|
|
|
|
|
|
2325
|
|
|
|
|
|
|
$twigHandler -> 'XML::Twig' handler. |
2326
|
|
|
|
|
|
|
$abstractRoot -> Current Medline abstract directory in XML data (XML::Twig directory) |
2327
|
|
|
|
|
|
|
|
2328
|
|
|
|
|
|
|
Output: |
2329
|
|
|
|
|
|
|
|
2330
|
|
|
|
|
|
|
None |
2331
|
|
|
|
|
|
|
|
2332
|
|
|
|
|
|
|
Example: |
2333
|
|
|
|
|
|
|
|
2334
|
|
|
|
|
|
|
Warning: This is a private function and is called by xmltow2v's XML::Twig handler. It should not be called outside of xmltow2v module. |
2335
|
|
|
|
|
|
|
|
2336
|
|
|
|
|
|
|
=head3 CreateCompoundWordBST |
2337
|
|
|
|
|
|
|
|
2338
|
|
|
|
|
|
|
Description: |
2339
|
|
|
|
|
|
|
|
2340
|
|
|
|
|
|
|
Creates a binary search tree using compound word data in memory and stores root node. This also clears the compound word array afterwards. |
2341
|
|
|
|
|
|
|
|
2342
|
|
|
|
|
|
|
Warning: Compound word file must be loaded into memory using ReadCompoundWordDataFromFile() prior to calling this method. This function |
2343
|
|
|
|
|
|
|
will also delete the compound word array upon completion as it will no longer be necessary. |
2344
|
|
|
|
|
|
|
|
2345
|
|
|
|
|
|
|
Input: |
2346
|
|
|
|
|
|
|
|
2347
|
|
|
|
|
|
|
None |
2348
|
|
|
|
|
|
|
|
2349
|
|
|
|
|
|
|
Output: |
2350
|
|
|
|
|
|
|
|
2351
|
|
|
|
|
|
|
$value -> '0' = Successful / '-1' = Un-successful |
2352
|
|
|
|
|
|
|
|
2353
|
|
|
|
|
|
|
Example: |
2354
|
|
|
|
|
|
|
|
2355
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
2356
|
|
|
|
|
|
|
|
2357
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
2358
|
|
|
|
|
|
|
$xmlconv->ReadCompoundWordDataFromFile( "samples/compoundword.txt" ); |
2359
|
|
|
|
|
|
|
$xmlconv->CreateCompoundWordBST(); |
2360
|
|
|
|
|
|
|
|
2361
|
|
|
|
|
|
|
=head3 CompoundifyString |
2362
|
|
|
|
|
|
|
|
2363
|
|
|
|
|
|
|
Description: |
2364
|
|
|
|
|
|
|
|
2365
|
|
|
|
|
|
|
Compoundifies string parameter based on compound word data in memory using the compound word binary search tree. |
2366
|
|
|
|
|
|
|
|
2367
|
|
|
|
|
|
|
Warning: Compound word file must be loaded into memory using ReadCompoundWordDataFromFile() prior to calling this method. |
2368
|
|
|
|
|
|
|
|
2369
|
|
|
|
|
|
|
Input: |
2370
|
|
|
|
|
|
|
|
2371
|
|
|
|
|
|
|
$string -> String to compoundify |
2372
|
|
|
|
|
|
|
|
2373
|
|
|
|
|
|
|
Output: |
2374
|
|
|
|
|
|
|
|
2375
|
|
|
|
|
|
|
$string -> Compounded string or "(null)" if string parameter is not defined. |
2376
|
|
|
|
|
|
|
|
2377
|
|
|
|
|
|
|
Example: |
2378
|
|
|
|
|
|
|
|
2379
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
2380
|
|
|
|
|
|
|
|
2381
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
2382
|
|
|
|
|
|
|
$xmlconv->ReadCompoundWordDataFromFile( "samples/compoundword.txt" ); |
2383
|
|
|
|
|
|
|
$xmlconv->CreateCompoundWordBST(); |
2384
|
|
|
|
|
|
|
my $compoundedString = $xmlconv->CompoundifyString( "String to compoundify" ); |
2385
|
|
|
|
|
|
|
print( "Compounded String: $compoundedString\n" ); |
2386
|
|
|
|
|
|
|
|
2387
|
|
|
|
|
|
|
undef( $xmlconv ); |
2388
|
|
|
|
|
|
|
|
2389
|
|
|
|
|
|
|
=head3 _CompoundifySearch |
2390
|
|
|
|
|
|
|
|
2391
|
|
|
|
|
|
|
Description: |
2392
|
|
|
|
|
|
|
|
2393
|
|
|
|
|
|
|
Recursive method used by CompoundifyString() to fetch compound word data in binary search tree. |
2394
|
|
|
|
|
|
|
|
2395
|
|
|
|
|
|
|
Warning: This function requires specific parameters and should not be called outside of CompoundifyString() method. |
2396
|
|
|
|
|
|
|
|
2397
|
|
|
|
|
|
|
Input: |
2398
|
|
|
|
|
|
|
|
2399
|
|
|
|
|
|
|
$stringArrayRef -> Array reference containing string data |
2400
|
|
|
|
|
|
|
$oldNode -> Last 'Word2vec::Node' data match was found |
2401
|
|
|
|
|
|
|
$searchStr -> Search phrase |
2402
|
|
|
|
|
|
|
$index -> Current string array index |
2403
|
|
|
|
|
|
|
|
2404
|
|
|
|
|
|
|
Output: |
2405
|
|
|
|
|
|
|
|
2406
|
|
|
|
|
|
|
Word2vec::Node -> Last node containing positive search phrase match |
2407
|
|
|
|
|
|
|
|
2408
|
|
|
|
|
|
|
Example: |
2409
|
|
|
|
|
|
|
|
2410
|
|
|
|
|
|
|
Warning: This is a private function and is called by 'CompoundifyString()'. It should not be called outside of xmltow2v module. |
2411
|
|
|
|
|
|
|
|
2412
|
|
|
|
|
|
|
=head3 ReadCompoundWordDataFromFile |
2413
|
|
|
|
|
|
|
|
2414
|
|
|
|
|
|
|
Description: |
2415
|
|
|
|
|
|
|
|
2416
|
|
|
|
|
|
|
Reads compound word file and stores in memory. $autoSetMaxCompWordLength parameter is not required to be set. This |
2417
|
|
|
|
|
|
|
parameter instructs the method to auto set the maximum compound word length dependent on the longest compound word found. |
2418
|
|
|
|
|
|
|
|
2419
|
|
|
|
|
|
|
Note: $autoSetMaxCompWordLength options: defined = True and Undefined = False. |
2420
|
|
|
|
|
|
|
|
2421
|
|
|
|
|
|
|
Input: |
2422
|
|
|
|
|
|
|
|
2423
|
|
|
|
|
|
|
$filePath -> Compound word file path |
2424
|
|
|
|
|
|
|
$autoSetMaxCompWordLength -> Maximum length of a given compoundified phrase the module's compoundify algorithm will permit. |
2425
|
|
|
|
|
|
|
|
2426
|
|
|
|
|
|
|
Note: Calling this method with $autoSetMaxCompWordLength defined will automatically set the maxCompoundWordLength variable to the longest compound phrase. |
2427
|
|
|
|
|
|
|
|
2428
|
|
|
|
|
|
|
Output: |
2429
|
|
|
|
|
|
|
|
2430
|
|
|
|
|
|
|
$value -> '0' = Successful / '-1' = Un-successful |
2431
|
|
|
|
|
|
|
|
2432
|
|
|
|
|
|
|
Example: |
2433
|
|
|
|
|
|
|
|
2434
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
2435
|
|
|
|
|
|
|
|
2436
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
2437
|
|
|
|
|
|
|
$xmlconv->ReadCompoundWordDataFromFile( "samples/compoundword.txt", 1 ); |
2438
|
|
|
|
|
|
|
|
2439
|
|
|
|
|
|
|
undef( $xmlconv ); |
2440
|
|
|
|
|
|
|
|
2441
|
|
|
|
|
|
|
=head3 SaveCompoundWordListToFile |
2442
|
|
|
|
|
|
|
|
2443
|
|
|
|
|
|
|
Description: |
2444
|
|
|
|
|
|
|
|
2445
|
|
|
|
|
|
|
Saves compound word data in memory to a specified file location. |
2446
|
|
|
|
|
|
|
|
2447
|
|
|
|
|
|
|
Input: |
2448
|
|
|
|
|
|
|
|
2449
|
|
|
|
|
|
|
$savePath -> Path to save compound word list to file. |
2450
|
|
|
|
|
|
|
|
2451
|
|
|
|
|
|
|
Output: |
2452
|
|
|
|
|
|
|
|
2453
|
|
|
|
|
|
|
$value -> '0' = Successful / '-1' = Un-successful |
2454
|
|
|
|
|
|
|
|
2455
|
|
|
|
|
|
|
Example: |
2456
|
|
|
|
|
|
|
|
2457
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
2458
|
|
|
|
|
|
|
|
2459
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
2460
|
|
|
|
|
|
|
$xmlconv->ReadCompoundWordDataFromFile( "samples/compoundword.txt" ); |
2461
|
|
|
|
|
|
|
$xmlconv->SaveCompoundWordListToFile( "samples/newcompoundword.txt" ); |
2462
|
|
|
|
|
|
|
undef( $xmlconv ); |
2463
|
|
|
|
|
|
|
|
2464
|
|
|
|
|
|
|
=head3 ReadTextFromFile |
2465
|
|
|
|
|
|
|
|
2466
|
|
|
|
|
|
|
Description: |
2467
|
|
|
|
|
|
|
|
2468
|
|
|
|
|
|
|
Reads a plain text file with utf8 encoding into memory. Returns string data if successful and "(null)" if unsuccessful. |
2469
|
|
|
|
|
|
|
|
2470
|
|
|
|
|
|
|
Input: |
2471
|
|
|
|
|
|
|
|
2472
|
|
|
|
|
|
|
$filePath -> Text file to read into memory |
2473
|
|
|
|
|
|
|
|
2474
|
|
|
|
|
|
|
Output: |
2475
|
|
|
|
|
|
|
|
2476
|
|
|
|
|
|
|
$string -> String data if successful or "(null)" if un-successful. |
2477
|
|
|
|
|
|
|
|
2478
|
|
|
|
|
|
|
Example: |
2479
|
|
|
|
|
|
|
|
2480
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
2481
|
|
|
|
|
|
|
|
2482
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
2483
|
|
|
|
|
|
|
my $textData = $xmlconv->ReadTextFromFile( "samples/textcorpus.txt" ); |
2484
|
|
|
|
|
|
|
print( "Text Data: $textData\n" ); |
2485
|
|
|
|
|
|
|
undef( $xmlconv ); |
2486
|
|
|
|
|
|
|
|
2487
|
|
|
|
|
|
|
=head3 SaveTextToFile |
2488
|
|
|
|
|
|
|
|
2489
|
|
|
|
|
|
|
Description: |
2490
|
|
|
|
|
|
|
|
2491
|
|
|
|
|
|
|
Saves a plain text file with utf8 encoding in a specified location. |
2492
|
|
|
|
|
|
|
|
2493
|
|
|
|
|
|
|
Input: |
2494
|
|
|
|
|
|
|
|
2495
|
|
|
|
|
|
|
$savePath -> Path to save string data. |
2496
|
|
|
|
|
|
|
$string -> String to save |
2497
|
|
|
|
|
|
|
|
2498
|
|
|
|
|
|
|
Output: |
2499
|
|
|
|
|
|
|
|
2500
|
|
|
|
|
|
|
$value -> '0' = Successful / '-1' = Un-successful |
2501
|
|
|
|
|
|
|
|
2502
|
|
|
|
|
|
|
Example: |
2503
|
|
|
|
|
|
|
|
2504
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
2505
|
|
|
|
|
|
|
|
2506
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
2507
|
|
|
|
|
|
|
my $result = $xmlconv->SaveTextToFile( "text.txt", "Hello world!" ); |
2508
|
|
|
|
|
|
|
|
2509
|
|
|
|
|
|
|
print( "File saved\n" ) if $result == 0; |
2510
|
|
|
|
|
|
|
print( "File unable to save\n" ) if $result == -1; |
2511
|
|
|
|
|
|
|
|
2512
|
|
|
|
|
|
|
undef( $xmlconv ); |
2513
|
|
|
|
|
|
|
|
2514
|
|
|
|
|
|
|
=head3 _ReadXMLDataFromFile |
2515
|
|
|
|
|
|
|
|
2516
|
|
|
|
|
|
|
Description: |
2517
|
|
|
|
|
|
|
|
2518
|
|
|
|
|
|
|
Reads an XML file from a specified location. Returns string in memory if successful and "(null)" if unsuccessful. |
2519
|
|
|
|
|
|
|
|
2520
|
|
|
|
|
|
|
Input: |
2521
|
|
|
|
|
|
|
|
2522
|
|
|
|
|
|
|
$filePath -> File to read given path |
2523
|
|
|
|
|
|
|
|
2524
|
|
|
|
|
|
|
Output: |
2525
|
|
|
|
|
|
|
|
2526
|
|
|
|
|
|
|
$value -> '0' = Successful / '-1' = Un-successful |
2527
|
|
|
|
|
|
|
|
2528
|
|
|
|
|
|
|
Example: |
2529
|
|
|
|
|
|
|
|
2530
|
|
|
|
|
|
|
Warning: This is a private function and is called by XML::Twig parsing functions. It should not be called outside of xmltow2v module. |
2531
|
|
|
|
|
|
|
|
2532
|
|
|
|
|
|
|
=head3 _SaveTextCorpusToFile |
2533
|
|
|
|
|
|
|
|
2534
|
|
|
|
|
|
|
Description: |
2535
|
|
|
|
|
|
|
|
2536
|
|
|
|
|
|
|
Saves text corpus data to specified file path. This method will append to any existing file if $appendToFile parameter |
2537
|
|
|
|
|
|
|
is defined or "overwrite" option is disabled. Enabling "overwrite" option will overwrite any existing files. |
2538
|
|
|
|
|
|
|
|
2539
|
|
|
|
|
|
|
Input: |
2540
|
|
|
|
|
|
|
|
2541
|
|
|
|
|
|
|
$savePath -> Path to save the text corpus |
2542
|
|
|
|
|
|
|
$appendToFile -> Specifies whether the module will overwrite any existing data or append to existing text corpus data. |
2543
|
|
|
|
|
|
|
|
2544
|
|
|
|
|
|
|
Note: Leaving this variable undefined will fetch the "Overwrite" member variable and set the value to this parameter. |
2545
|
|
|
|
|
|
|
|
2546
|
|
|
|
|
|
|
Output: |
2547
|
|
|
|
|
|
|
|
2548
|
|
|
|
|
|
|
$value -> '0' = Successful / '-1' = Un-successful |
2549
|
|
|
|
|
|
|
|
2550
|
|
|
|
|
|
|
Example: |
2551
|
|
|
|
|
|
|
|
2552
|
|
|
|
|
|
|
Warning: This is a private function and is called by XML::Twig parsing functions. It should not be called outside of xmltow2v module. |
2553
|
|
|
|
|
|
|
|
2554
|
|
|
|
|
|
|
=head3 IsDateInSpecifiedRange |
2555
|
|
|
|
|
|
|
|
2556
|
|
|
|
|
|
|
Description: |
2557
|
|
|
|
|
|
|
|
2558
|
|
|
|
|
|
|
Checks to see if $date is within $beginDate and $endDate range. Returns 1 if true and 0 if false. |
2559
|
|
|
|
|
|
|
|
2560
|
|
|
|
|
|
|
Note: Date Format: XX/XX/XXXX (Month/Day/Year) |
2561
|
|
|
|
|
|
|
|
2562
|
|
|
|
|
|
|
Input: |
2563
|
|
|
|
|
|
|
|
2564
|
|
|
|
|
|
|
$date -> Date to check against minimum and maximum date range. (String) |
2565
|
|
|
|
|
|
|
$beginDate -> Minimum date range (String) |
2566
|
|
|
|
|
|
|
$endDate -> Maximum date range (String) |
2567
|
|
|
|
|
|
|
|
2568
|
|
|
|
|
|
|
Output: |
2569
|
|
|
|
|
|
|
|
2570
|
|
|
|
|
|
|
$value -> '1' = True/Date is within specified range Or '0' = False/Date is not within specified range. |
2571
|
|
|
|
|
|
|
|
2572
|
|
|
|
|
|
|
Example: |
2573
|
|
|
|
|
|
|
|
2574
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
2575
|
|
|
|
|
|
|
|
2576
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
2577
|
|
|
|
|
|
|
print( "Is \"01/01/2004\" within the date range: \"02/21/1985\" to \"08/13/2016\"?\n" ); |
2578
|
|
|
|
|
|
|
print( "Yes\n" ) if $xmlconv->IsDateInSpecifiedRange( "01/01/2004", "02/21/1985", "08/13/2016" ) == 1; |
2579
|
|
|
|
|
|
|
print( "No\n" ) if $xmlconv->IsDateInSpecifiedRange( "01/01/2004", "02/21/1985", "08/13/2016" ) == 0; |
2580
|
|
|
|
|
|
|
|
2581
|
|
|
|
|
|
|
undef( $xmlconv ); |
2582
|
|
|
|
|
|
|
|
2583
|
|
|
|
|
|
|
=head3 IsFileOrDirectory |
2584
|
|
|
|
|
|
|
|
2585
|
|
|
|
|
|
|
Description: |
2586
|
|
|
|
|
|
|
|
2587
|
|
|
|
|
|
|
Checks to see if specified path is a file or directory. |
2588
|
|
|
|
|
|
|
|
2589
|
|
|
|
|
|
|
Input: |
2590
|
|
|
|
|
|
|
|
2591
|
|
|
|
|
|
|
$path -> File or directory path. (String) |
2592
|
|
|
|
|
|
|
|
2593
|
|
|
|
|
|
|
Output: |
2594
|
|
|
|
|
|
|
|
2595
|
|
|
|
|
|
|
$string -> Returns: "file" = file, "dir" = directory and "unknown" if the path is not a file or directory (undefined). |
2596
|
|
|
|
|
|
|
|
2597
|
|
|
|
|
|
|
Example: |
2598
|
|
|
|
|
|
|
|
2599
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
2600
|
|
|
|
|
|
|
|
2601
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
2602
|
|
|
|
|
|
|
my $path = "path/to/a/directory"; |
2603
|
|
|
|
|
|
|
|
2604
|
|
|
|
|
|
|
print( "Is \"$path\" a file or directory? " . $xmlconv->IsFileOrDirectory( $path ) . "\n" ); |
2605
|
|
|
|
|
|
|
|
2606
|
|
|
|
|
|
|
$path = "path/to/a/file.file"; |
2607
|
|
|
|
|
|
|
|
2608
|
|
|
|
|
|
|
print( "Is \"$path\" a file or directory? " . $xmlconv->IsFileOrDirectory( $path ) . "\n" ); |
2609
|
|
|
|
|
|
|
|
2610
|
|
|
|
|
|
|
undef( $xmlconv ); |
2611
|
|
|
|
|
|
|
|
2612
|
|
|
|
|
|
|
=head3 RemoveSpecialCharactersFromString |
2613
|
|
|
|
|
|
|
|
2614
|
|
|
|
|
|
|
Description: |
2615
|
|
|
|
|
|
|
|
2616
|
|
|
|
|
|
|
Removes special characters from string parameter, removes extra spaces and converts text to lowercase. |
2617
|
|
|
|
|
|
|
|
2618
|
|
|
|
|
|
|
Note: This method is called when parsing and compiling Medline title/abstract data. |
2619
|
|
|
|
|
|
|
|
2620
|
|
|
|
|
|
|
Input: |
2621
|
|
|
|
|
|
|
|
2622
|
|
|
|
|
|
|
$string -> String passed to remove special characters from and convert to lowercase. |
2623
|
|
|
|
|
|
|
|
2624
|
|
|
|
|
|
|
Output: |
2625
|
|
|
|
|
|
|
|
2626
|
|
|
|
|
|
|
$string -> String with all special characters removed and converted to lowercase. |
2627
|
|
|
|
|
|
|
|
2628
|
|
|
|
|
|
|
Example: |
2629
|
|
|
|
|
|
|
|
2630
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
2631
|
|
|
|
|
|
|
|
2632
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
2633
|
|
|
|
|
|
|
|
2634
|
|
|
|
|
|
|
my $str = "Heart Attack is$ an!@ also KNOWN as an Acute MYOCARDIAL inFARCTion!"; |
2635
|
|
|
|
|
|
|
|
2636
|
|
|
|
|
|
|
print( "Original String: $str\n" ); |
2637
|
|
|
|
|
|
|
|
2638
|
|
|
|
|
|
|
$str = $xmlconv->RemoveSpecialCharactersFromString( $str ); |
2639
|
|
|
|
|
|
|
|
2640
|
|
|
|
|
|
|
print( "Modified String: $str\n" ); |
2641
|
|
|
|
|
|
|
|
2642
|
|
|
|
|
|
|
undef( $xmlconv ); |
2643
|
|
|
|
|
|
|
|
2644
|
|
|
|
|
|
|
=head3 GetFileType |
2645
|
|
|
|
|
|
|
|
2646
|
|
|
|
|
|
|
Description: |
2647
|
|
|
|
|
|
|
|
2648
|
|
|
|
|
|
|
Returns file data type (string). |
2649
|
|
|
|
|
|
|
|
2650
|
|
|
|
|
|
|
Input: |
2651
|
|
|
|
|
|
|
|
2652
|
|
|
|
|
|
|
$filePath -> File to check located at file path |
2653
|
|
|
|
|
|
|
|
2654
|
|
|
|
|
|
|
Output: |
2655
|
|
|
|
|
|
|
|
2656
|
|
|
|
|
|
|
$string -> File type |
2657
|
|
|
|
|
|
|
|
2658
|
|
|
|
|
|
|
Example: |
2659
|
|
|
|
|
|
|
|
2660
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
2661
|
|
|
|
|
|
|
|
2662
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
2663
|
|
|
|
|
|
|
my $fileType = $xmlconv->GetFileType( "samples/textcorpus.txt" ); |
2664
|
|
|
|
|
|
|
|
2665
|
|
|
|
|
|
|
undef( $xmlconv ); |
2666
|
|
|
|
|
|
|
|
2667
|
|
|
|
|
|
|
=head3 _DateCheck |
2668
|
|
|
|
|
|
|
|
2669
|
|
|
|
|
|
|
Description: |
2670
|
|
|
|
|
|
|
|
2671
|
|
|
|
|
|
|
Checks specified begin and end date strings for formatting and logic errors. |
2672
|
|
|
|
|
|
|
|
2673
|
|
|
|
|
|
|
Input: |
2674
|
|
|
|
|
|
|
|
2675
|
|
|
|
|
|
|
None |
2676
|
|
|
|
|
|
|
|
2677
|
|
|
|
|
|
|
Output: |
2678
|
|
|
|
|
|
|
|
2679
|
|
|
|
|
|
|
$value -> "0" = Passed Checks / "-1" = Failed Checks |
2680
|
|
|
|
|
|
|
|
2681
|
|
|
|
|
|
|
Example: |
2682
|
|
|
|
|
|
|
|
2683
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
2684
|
|
|
|
|
|
|
|
2685
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
2686
|
|
|
|
|
|
|
print "Passed Date Checks\n" if ( $xmlconv->_DateCheck() == 0 ); |
2687
|
|
|
|
|
|
|
print "Failed Date Checks\n" if ( $xmlconv->_DateCheck() == -1 ); |
2688
|
|
|
|
|
|
|
|
2689
|
|
|
|
|
|
|
undef( $xmlconv ); |
2690
|
|
|
|
|
|
|
|
2691
|
|
|
|
|
|
|
=head2 Accessor Functions |
2692
|
|
|
|
|
|
|
|
2693
|
|
|
|
|
|
|
=head3 GetDebugLog |
2694
|
|
|
|
|
|
|
|
2695
|
|
|
|
|
|
|
Description: |
2696
|
|
|
|
|
|
|
|
2697
|
|
|
|
|
|
|
Returns the _debugLog member variable set during Word2vec::Xmltow2v object initialization of new function. |
2698
|
|
|
|
|
|
|
|
2699
|
|
|
|
|
|
|
Input: |
2700
|
|
|
|
|
|
|
|
2701
|
|
|
|
|
|
|
None |
2702
|
|
|
|
|
|
|
|
2703
|
|
|
|
|
|
|
Output: |
2704
|
|
|
|
|
|
|
|
2705
|
|
|
|
|
|
|
$value -> '0' = False, '1' = True |
2706
|
|
|
|
|
|
|
|
2707
|
|
|
|
|
|
|
Example: |
2708
|
|
|
|
|
|
|
|
2709
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
2710
|
|
|
|
|
|
|
|
2711
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
2712
|
|
|
|
|
|
|
my $debugLog = $xmlconv->GetDebugLog(); |
2713
|
|
|
|
|
|
|
|
2714
|
|
|
|
|
|
|
print( "Debug Logging Enabled\n" ) if $debugLog == 1; |
2715
|
|
|
|
|
|
|
print( "Debug Logging Disabled\n" ) if $debugLog == 0; |
2716
|
|
|
|
|
|
|
|
2717
|
|
|
|
|
|
|
|
2718
|
|
|
|
|
|
|
undef( $xmlconv ); |
2719
|
|
|
|
|
|
|
|
2720
|
|
|
|
|
|
|
=head3 GetWriteLog |
2721
|
|
|
|
|
|
|
|
2722
|
|
|
|
|
|
|
Description: |
2723
|
|
|
|
|
|
|
|
2724
|
|
|
|
|
|
|
Returns the _writeLog member variable set during Word2vec::Xmltow2v object initialization of new function. |
2725
|
|
|
|
|
|
|
|
2726
|
|
|
|
|
|
|
Input: |
2727
|
|
|
|
|
|
|
|
2728
|
|
|
|
|
|
|
None |
2729
|
|
|
|
|
|
|
|
2730
|
|
|
|
|
|
|
Output: |
2731
|
|
|
|
|
|
|
|
2732
|
|
|
|
|
|
|
$value -> '0' = False, '1' = True |
2733
|
|
|
|
|
|
|
|
2734
|
|
|
|
|
|
|
Example: |
2735
|
|
|
|
|
|
|
|
2736
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
2737
|
|
|
|
|
|
|
|
2738
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
2739
|
|
|
|
|
|
|
my $writeLog = $xmlconv->GetWriteLog(); |
2740
|
|
|
|
|
|
|
|
2741
|
|
|
|
|
|
|
print( "Write Logging Enabled\n" ) if $writeLog == 1; |
2742
|
|
|
|
|
|
|
print( "Write Logging Disabled\n" ) if $writeLog == 0; |
2743
|
|
|
|
|
|
|
|
2744
|
|
|
|
|
|
|
undef( $xmlconv ); |
2745
|
|
|
|
|
|
|
|
2746
|
|
|
|
|
|
|
=head3 GetStoreTitle |
2747
|
|
|
|
|
|
|
|
2748
|
|
|
|
|
|
|
Description: |
2749
|
|
|
|
|
|
|
|
2750
|
|
|
|
|
|
|
Returns the _storeTitle member variable set during Word2vec::Xmltow2v object instantiation of new function. |
2751
|
|
|
|
|
|
|
|
2752
|
|
|
|
|
|
|
Input: |
2753
|
|
|
|
|
|
|
|
2754
|
|
|
|
|
|
|
None |
2755
|
|
|
|
|
|
|
|
2756
|
|
|
|
|
|
|
Output: |
2757
|
|
|
|
|
|
|
|
2758
|
|
|
|
|
|
|
$value -> '1' = True / '0' = False |
2759
|
|
|
|
|
|
|
|
2760
|
|
|
|
|
|
|
Example: |
2761
|
|
|
|
|
|
|
|
2762
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
2763
|
|
|
|
|
|
|
|
2764
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
2765
|
|
|
|
|
|
|
my $storeTitle = $xmlconv->GetStoreTitle(); |
2766
|
|
|
|
|
|
|
|
2767
|
|
|
|
|
|
|
print( "Store Title Option: Enabled\n" ) if $storeTitle == 1; |
2768
|
|
|
|
|
|
|
print( "Store Title Option: Disabled\n" ) if $storeTitle == 0; |
2769
|
|
|
|
|
|
|
|
2770
|
|
|
|
|
|
|
undef( $xmlconv ); |
2771
|
|
|
|
|
|
|
|
2772
|
|
|
|
|
|
|
=head3 GetStoreAbstract |
2773
|
|
|
|
|
|
|
|
2774
|
|
|
|
|
|
|
Description: |
2775
|
|
|
|
|
|
|
|
2776
|
|
|
|
|
|
|
Returns the _storeAbstract member variable set during Word2vec::Xmltow2v object instantiation of new function. |
2777
|
|
|
|
|
|
|
|
2778
|
|
|
|
|
|
|
Input: |
2779
|
|
|
|
|
|
|
|
2780
|
|
|
|
|
|
|
None |
2781
|
|
|
|
|
|
|
|
2782
|
|
|
|
|
|
|
Output: |
2783
|
|
|
|
|
|
|
|
2784
|
|
|
|
|
|
|
$value -> '1' = True / '0' = False |
2785
|
|
|
|
|
|
|
|
2786
|
|
|
|
|
|
|
Example: |
2787
|
|
|
|
|
|
|
|
2788
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
2789
|
|
|
|
|
|
|
|
2790
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
2791
|
|
|
|
|
|
|
my $storeAbstract = $xmlconv->GetStoreAbstract(); |
2792
|
|
|
|
|
|
|
|
2793
|
|
|
|
|
|
|
print( "Store Abstract Option: Enabled\n" ) if $storeAbstract == 1; |
2794
|
|
|
|
|
|
|
print( "Store Abstract Option: Disabled\n" ) if $storeAbstract == 0; |
2795
|
|
|
|
|
|
|
|
2796
|
|
|
|
|
|
|
undef( $xmlconv ); |
2797
|
|
|
|
|
|
|
|
2798
|
|
|
|
|
|
|
=head3 GetQuickParse |
2799
|
|
|
|
|
|
|
|
2800
|
|
|
|
|
|
|
Description: |
2801
|
|
|
|
|
|
|
|
2802
|
|
|
|
|
|
|
Returns the _quickParse member variable set during Word2vec::Xmltow2v object instantiation of new function. |
2803
|
|
|
|
|
|
|
|
2804
|
|
|
|
|
|
|
Input: |
2805
|
|
|
|
|
|
|
|
2806
|
|
|
|
|
|
|
None |
2807
|
|
|
|
|
|
|
|
2808
|
|
|
|
|
|
|
Output: |
2809
|
|
|
|
|
|
|
|
2810
|
|
|
|
|
|
|
$value -> '1' = True / '0' = False |
2811
|
|
|
|
|
|
|
|
2812
|
|
|
|
|
|
|
Example: |
2813
|
|
|
|
|
|
|
|
2814
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
2815
|
|
|
|
|
|
|
|
2816
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
2817
|
|
|
|
|
|
|
my $quickParse = $xmlconv->GetQuickParse(); |
2818
|
|
|
|
|
|
|
|
2819
|
|
|
|
|
|
|
print( "Quick Parse Option: Enabled\n" ) if $quickParse == 1; |
2820
|
|
|
|
|
|
|
print( "Quick Parse Option: Disabled\n" ) if $quickParse == 0; |
2821
|
|
|
|
|
|
|
|
2822
|
|
|
|
|
|
|
undef( $xmlconv ); |
2823
|
|
|
|
|
|
|
|
2824
|
|
|
|
|
|
|
=head3 GetCompoundifyText |
2825
|
|
|
|
|
|
|
|
2826
|
|
|
|
|
|
|
Description: |
2827
|
|
|
|
|
|
|
|
2828
|
|
|
|
|
|
|
Returns the _compoundifyText member variable set during Word2vec::Xmltow2v object instantiation of new function. |
2829
|
|
|
|
|
|
|
|
2830
|
|
|
|
|
|
|
Input: |
2831
|
|
|
|
|
|
|
|
2832
|
|
|
|
|
|
|
None |
2833
|
|
|
|
|
|
|
|
2834
|
|
|
|
|
|
|
Output: |
2835
|
|
|
|
|
|
|
|
2836
|
|
|
|
|
|
|
$value -> '1' = True / '0' = False |
2837
|
|
|
|
|
|
|
|
2838
|
|
|
|
|
|
|
Example: |
2839
|
|
|
|
|
|
|
|
2840
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
2841
|
|
|
|
|
|
|
|
2842
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
2843
|
|
|
|
|
|
|
my $compoundify = $xmlconv->GetCompoundifyText(); |
2844
|
|
|
|
|
|
|
|
2845
|
|
|
|
|
|
|
print( "Compoundify Text Option: Enabled\n" ) if $compoundify == 1; |
2846
|
|
|
|
|
|
|
print( "Compoundify Text Option: Disabled\n" ) if $compoundify == 0; |
2847
|
|
|
|
|
|
|
|
2848
|
|
|
|
|
|
|
undef( $xmlconv ); |
2849
|
|
|
|
|
|
|
|
2850
|
|
|
|
|
|
|
=head3 GetStoreAsSentencePerLine |
2851
|
|
|
|
|
|
|
|
2852
|
|
|
|
|
|
|
Description: |
2853
|
|
|
|
|
|
|
|
2854
|
|
|
|
|
|
|
Returns the _storeAsSentencePerLine member variable set during Word2vec::Xmltow2v object instantiation of new function. |
2855
|
|
|
|
|
|
|
|
2856
|
|
|
|
|
|
|
Input: |
2857
|
|
|
|
|
|
|
|
2858
|
|
|
|
|
|
|
None |
2859
|
|
|
|
|
|
|
|
2860
|
|
|
|
|
|
|
Output: |
2861
|
|
|
|
|
|
|
|
2862
|
|
|
|
|
|
|
$value -> '1' = True / '0' = False |
2863
|
|
|
|
|
|
|
|
2864
|
|
|
|
|
|
|
Example: |
2865
|
|
|
|
|
|
|
|
2866
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
2867
|
|
|
|
|
|
|
|
2868
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
2869
|
|
|
|
|
|
|
my $storeAsSentencePerLine = $xmlconv->GetStoreAsSentencePerLine(); |
2870
|
|
|
|
|
|
|
|
2871
|
|
|
|
|
|
|
print( "Store As Sentence Per Line: Enabled\n" ) if $storeAsSentencePerLine == 1; |
2872
|
|
|
|
|
|
|
print( "Store As Sentence Per Line: Disabled\n" ) if $storeAsSentencePerLine == 0; |
2873
|
|
|
|
|
|
|
|
2874
|
|
|
|
|
|
|
undef( $xmlconv ); |
2875
|
|
|
|
|
|
|
|
2876
|
|
|
|
|
|
|
=head3 GetNumOfThreads |
2877
|
|
|
|
|
|
|
|
2878
|
|
|
|
|
|
|
Description: |
2879
|
|
|
|
|
|
|
|
2880
|
|
|
|
|
|
|
Returns the _numOfThreads member variable set during Word2vec::Xmltow2v object instantiation of new function. |
2881
|
|
|
|
|
|
|
|
2882
|
|
|
|
|
|
|
Input: |
2883
|
|
|
|
|
|
|
|
2884
|
|
|
|
|
|
|
None |
2885
|
|
|
|
|
|
|
|
2886
|
|
|
|
|
|
|
Output: |
2887
|
|
|
|
|
|
|
|
2888
|
|
|
|
|
|
|
$value -> Number of threads |
2889
|
|
|
|
|
|
|
|
2890
|
|
|
|
|
|
|
Example: |
2891
|
|
|
|
|
|
|
|
2892
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
2893
|
|
|
|
|
|
|
|
2894
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
2895
|
|
|
|
|
|
|
my $numOfThreads = $xmlconv->GetNumOfThreads(); |
2896
|
|
|
|
|
|
|
|
2897
|
|
|
|
|
|
|
print( "Number of threads: $numOfThreads\n" ); |
2898
|
|
|
|
|
|
|
|
2899
|
|
|
|
|
|
|
undef( $xmlconv ); |
2900
|
|
|
|
|
|
|
|
2901
|
|
|
|
|
|
|
=head3 GetWorkingDir |
2902
|
|
|
|
|
|
|
|
2903
|
|
|
|
|
|
|
Description: |
2904
|
|
|
|
|
|
|
|
2905
|
|
|
|
|
|
|
Returns the _workingDir member variable set during Word2vec::Xmltow2v object instantiation of new function. |
2906
|
|
|
|
|
|
|
|
2907
|
|
|
|
|
|
|
Input: |
2908
|
|
|
|
|
|
|
|
2909
|
|
|
|
|
|
|
None |
2910
|
|
|
|
|
|
|
|
2911
|
|
|
|
|
|
|
Output: |
2912
|
|
|
|
|
|
|
|
2913
|
|
|
|
|
|
|
$string -> Working directory string |
2914
|
|
|
|
|
|
|
|
2915
|
|
|
|
|
|
|
Example: |
2916
|
|
|
|
|
|
|
|
2917
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
2918
|
|
|
|
|
|
|
|
2919
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
2920
|
|
|
|
|
|
|
my $workingDirectory = $xmlconv->GetWorkingDir(); |
2921
|
|
|
|
|
|
|
|
2922
|
|
|
|
|
|
|
print( "Working Directory: $workingDirectory\n" ); |
2923
|
|
|
|
|
|
|
|
2924
|
|
|
|
|
|
|
undef( $xmlconv ); |
2925
|
|
|
|
|
|
|
|
2926
|
|
|
|
|
|
|
=head3 GetSavePath |
2927
|
|
|
|
|
|
|
|
2928
|
|
|
|
|
|
|
Description: |
2929
|
|
|
|
|
|
|
|
2930
|
|
|
|
|
|
|
Returns the _saveDir member variable set during Word2vec::Xmltow2v object instantiation of new function. |
2931
|
|
|
|
|
|
|
|
2932
|
|
|
|
|
|
|
Input: |
2933
|
|
|
|
|
|
|
|
2934
|
|
|
|
|
|
|
None |
2935
|
|
|
|
|
|
|
|
2936
|
|
|
|
|
|
|
Output: |
2937
|
|
|
|
|
|
|
|
2938
|
|
|
|
|
|
|
$string -> Save directory string |
2939
|
|
|
|
|
|
|
|
2940
|
|
|
|
|
|
|
Example: |
2941
|
|
|
|
|
|
|
|
2942
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
2943
|
|
|
|
|
|
|
|
2944
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
2945
|
|
|
|
|
|
|
my $savePath = $xmlconv->GetSavePath(); |
2946
|
|
|
|
|
|
|
|
2947
|
|
|
|
|
|
|
print( "Save Directory: $savePath\n" ); |
2948
|
|
|
|
|
|
|
|
2949
|
|
|
|
|
|
|
undef( $xmlconv ); |
2950
|
|
|
|
|
|
|
|
2951
|
|
|
|
|
|
|
=head3 GetBeginDate |
2952
|
|
|
|
|
|
|
|
2953
|
|
|
|
|
|
|
Description: |
2954
|
|
|
|
|
|
|
|
2955
|
|
|
|
|
|
|
Returns the _beginDate member variable set during Word2vec::Xmltow2v object instantiation of new function. |
2956
|
|
|
|
|
|
|
|
2957
|
|
|
|
|
|
|
Input: |
2958
|
|
|
|
|
|
|
|
2959
|
|
|
|
|
|
|
None |
2960
|
|
|
|
|
|
|
|
2961
|
|
|
|
|
|
|
Output: |
2962
|
|
|
|
|
|
|
|
2963
|
|
|
|
|
|
|
$date -> Beginning date range - Format: XX/XX/XXXX (Mon/Day/Year) |
2964
|
|
|
|
|
|
|
|
2965
|
|
|
|
|
|
|
Example: |
2966
|
|
|
|
|
|
|
|
2967
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
2968
|
|
|
|
|
|
|
|
2969
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
2970
|
|
|
|
|
|
|
my $date = $xmlconv->GetBeginDate(); |
2971
|
|
|
|
|
|
|
|
2972
|
|
|
|
|
|
|
print( "Date: $date\n" ); |
2973
|
|
|
|
|
|
|
|
2974
|
|
|
|
|
|
|
undef( $xmlconv ); |
2975
|
|
|
|
|
|
|
|
2976
|
|
|
|
|
|
|
=head3 GetEndDate |
2977
|
|
|
|
|
|
|
|
2978
|
|
|
|
|
|
|
Description: |
2979
|
|
|
|
|
|
|
|
2980
|
|
|
|
|
|
|
Returns the _endDate member variable set during Word2vec::Xmltow2v object instantiation of new function. |
2981
|
|
|
|
|
|
|
|
2982
|
|
|
|
|
|
|
Input: |
2983
|
|
|
|
|
|
|
|
2984
|
|
|
|
|
|
|
None |
2985
|
|
|
|
|
|
|
|
2986
|
|
|
|
|
|
|
Output: |
2987
|
|
|
|
|
|
|
|
2988
|
|
|
|
|
|
|
$date -> End date range - Format: XX/XX/XXXX (Mon/Day/Year). |
2989
|
|
|
|
|
|
|
|
2990
|
|
|
|
|
|
|
Example: |
2991
|
|
|
|
|
|
|
|
2992
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
2993
|
|
|
|
|
|
|
|
2994
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
2995
|
|
|
|
|
|
|
my $date = $xmlconv->GetEndDate(); |
2996
|
|
|
|
|
|
|
|
2997
|
|
|
|
|
|
|
print( "Date: $date\n" ); |
2998
|
|
|
|
|
|
|
|
2999
|
|
|
|
|
|
|
undef( $xmlconv ); |
3000
|
|
|
|
|
|
|
|
3001
|
|
|
|
|
|
|
=head3 GetXMLStringToParse |
3002
|
|
|
|
|
|
|
|
3003
|
|
|
|
|
|
|
Returns the XML data (string) to be parsed. |
3004
|
|
|
|
|
|
|
|
3005
|
|
|
|
|
|
|
Description: |
3006
|
|
|
|
|
|
|
|
3007
|
|
|
|
|
|
|
Returns the _xmlStringToParse member variable set during Word2vec::Xmltow2v object instantiation of new function. |
3008
|
|
|
|
|
|
|
|
3009
|
|
|
|
|
|
|
Input: |
3010
|
|
|
|
|
|
|
|
3011
|
|
|
|
|
|
|
None |
3012
|
|
|
|
|
|
|
|
3013
|
|
|
|
|
|
|
Output: |
3014
|
|
|
|
|
|
|
|
3015
|
|
|
|
|
|
|
$string -> Medline XML data string |
3016
|
|
|
|
|
|
|
|
3017
|
|
|
|
|
|
|
Example: |
3018
|
|
|
|
|
|
|
|
3019
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
3020
|
|
|
|
|
|
|
|
3021
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
3022
|
|
|
|
|
|
|
my $xmlStr = $xmlconv->GetXMLStringToParse(); |
3023
|
|
|
|
|
|
|
|
3024
|
|
|
|
|
|
|
print( "XML String: $xmlStr\n" ); |
3025
|
|
|
|
|
|
|
|
3026
|
|
|
|
|
|
|
undef( $xmlconv ); |
3027
|
|
|
|
|
|
|
|
3028
|
|
|
|
|
|
|
=head3 GetTextCorpusStr |
3029
|
|
|
|
|
|
|
|
3030
|
|
|
|
|
|
|
Description: |
3031
|
|
|
|
|
|
|
|
3032
|
|
|
|
|
|
|
Returns the _textCorpusStr member variable set during Word2vec::Xmltow2v object instantiation of new function. |
3033
|
|
|
|
|
|
|
|
3034
|
|
|
|
|
|
|
Input: |
3035
|
|
|
|
|
|
|
|
3036
|
|
|
|
|
|
|
None |
3037
|
|
|
|
|
|
|
|
3038
|
|
|
|
|
|
|
Output: |
3039
|
|
|
|
|
|
|
|
3040
|
|
|
|
|
|
|
$string -> Text corpus string |
3041
|
|
|
|
|
|
|
|
3042
|
|
|
|
|
|
|
Example: |
3043
|
|
|
|
|
|
|
|
3044
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
3045
|
|
|
|
|
|
|
|
3046
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
3047
|
|
|
|
|
|
|
my $str = $xmlconv->GetTextCorpusStr(); |
3048
|
|
|
|
|
|
|
|
3049
|
|
|
|
|
|
|
print( "Text Corpus: $str\n" ); |
3050
|
|
|
|
|
|
|
|
3051
|
|
|
|
|
|
|
undef( $xmlconv ); |
3052
|
|
|
|
|
|
|
|
3053
|
|
|
|
|
|
|
=head3 GetFileHandle |
3054
|
|
|
|
|
|
|
|
3055
|
|
|
|
|
|
|
Description: |
3056
|
|
|
|
|
|
|
|
3057
|
|
|
|
|
|
|
Returns the _fileHandle member variable set during Word2vec::Xmltow2v object instantiation of new function. |
3058
|
|
|
|
|
|
|
|
3059
|
|
|
|
|
|
|
Warning: This is a private function. File handle is used by WriteLog() method. Do not manipulate this file handle as errors can result. |
3060
|
|
|
|
|
|
|
|
3061
|
|
|
|
|
|
|
Input: |
3062
|
|
|
|
|
|
|
|
3063
|
|
|
|
|
|
|
None |
3064
|
|
|
|
|
|
|
|
3065
|
|
|
|
|
|
|
Output: |
3066
|
|
|
|
|
|
|
|
3067
|
|
|
|
|
|
|
$fileHandle -> Returns file handle for WriteLog() method. |
3068
|
|
|
|
|
|
|
|
3069
|
|
|
|
|
|
|
Example: |
3070
|
|
|
|
|
|
|
|
3071
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
3072
|
|
|
|
|
|
|
|
3073
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
3074
|
|
|
|
|
|
|
my $fileHandle = $xmlconv->GetFileHandle(); |
3075
|
|
|
|
|
|
|
|
3076
|
|
|
|
|
|
|
undef( $xmlconv ); |
3077
|
|
|
|
|
|
|
|
3078
|
|
|
|
|
|
|
=head3 GetTwigHandler |
3079
|
|
|
|
|
|
|
|
3080
|
|
|
|
|
|
|
Returns XML::Twig handler. |
3081
|
|
|
|
|
|
|
|
3082
|
|
|
|
|
|
|
Description: |
3083
|
|
|
|
|
|
|
|
3084
|
|
|
|
|
|
|
Returns the _twigHandler member variable set during Word2vec::Xmltow2v object instantiation of new function. |
3085
|
|
|
|
|
|
|
|
3086
|
|
|
|
|
|
|
Warning: This is a private function and should not be called or manipulated. |
3087
|
|
|
|
|
|
|
|
3088
|
|
|
|
|
|
|
Input: |
3089
|
|
|
|
|
|
|
|
3090
|
|
|
|
|
|
|
None |
3091
|
|
|
|
|
|
|
|
3092
|
|
|
|
|
|
|
Output: |
3093
|
|
|
|
|
|
|
|
3094
|
|
|
|
|
|
|
$twigHandler -> XML::Twig handler. |
3095
|
|
|
|
|
|
|
|
3096
|
|
|
|
|
|
|
Example: |
3097
|
|
|
|
|
|
|
|
3098
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
3099
|
|
|
|
|
|
|
|
3100
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
3101
|
|
|
|
|
|
|
my $xmlHandler = $xmlconv->GetTwigHandler(); |
3102
|
|
|
|
|
|
|
|
3103
|
|
|
|
|
|
|
undef( $xmlconv ); |
3104
|
|
|
|
|
|
|
|
3105
|
|
|
|
|
|
|
=head3 GetParsedCount |
3106
|
|
|
|
|
|
|
|
3107
|
|
|
|
|
|
|
Description: |
3108
|
|
|
|
|
|
|
|
3109
|
|
|
|
|
|
|
Returns the _parsedCount member variable set during Word2vec::Xmltow2v object instantiation of new function. |
3110
|
|
|
|
|
|
|
|
3111
|
|
|
|
|
|
|
Input: |
3112
|
|
|
|
|
|
|
|
3113
|
|
|
|
|
|
|
None |
3114
|
|
|
|
|
|
|
|
3115
|
|
|
|
|
|
|
Output: |
3116
|
|
|
|
|
|
|
|
3117
|
|
|
|
|
|
|
$value -> Number of parsed Medline articles. |
3118
|
|
|
|
|
|
|
|
3119
|
|
|
|
|
|
|
Example: |
3120
|
|
|
|
|
|
|
|
3121
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
3122
|
|
|
|
|
|
|
|
3123
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
3124
|
|
|
|
|
|
|
my $numOfParsed = $xmlconv->GetParsedCount(); |
3125
|
|
|
|
|
|
|
|
3126
|
|
|
|
|
|
|
print( "Number of parsed Medline articles: $numOfParsed\n" ); |
3127
|
|
|
|
|
|
|
|
3128
|
|
|
|
|
|
|
undef( $xmlconv ); |
3129
|
|
|
|
|
|
|
|
3130
|
|
|
|
|
|
|
=head3 GetTempStr |
3131
|
|
|
|
|
|
|
|
3132
|
|
|
|
|
|
|
Description: |
3133
|
|
|
|
|
|
|
|
3134
|
|
|
|
|
|
|
Returns the _tempStr member variable set during Word2vec::Xmltow2v object instantiation of new function. |
3135
|
|
|
|
|
|
|
|
3136
|
|
|
|
|
|
|
Warning: This is a private function and should not be called or manipulated. Used by module as a temporary storage |
3137
|
|
|
|
|
|
|
location for parsed Medline 'Title' and 'Abstract' flag string data. |
3138
|
|
|
|
|
|
|
|
3139
|
|
|
|
|
|
|
Input: |
3140
|
|
|
|
|
|
|
|
3141
|
|
|
|
|
|
|
None |
3142
|
|
|
|
|
|
|
|
3143
|
|
|
|
|
|
|
Output: |
3144
|
|
|
|
|
|
|
|
3145
|
|
|
|
|
|
|
$string -> Temporary string storage location. |
3146
|
|
|
|
|
|
|
|
3147
|
|
|
|
|
|
|
Example: |
3148
|
|
|
|
|
|
|
|
3149
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
3150
|
|
|
|
|
|
|
|
3151
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
3152
|
|
|
|
|
|
|
my $tempStr = $xmlconv->GetTempStr(); |
3153
|
|
|
|
|
|
|
|
3154
|
|
|
|
|
|
|
print( "Temp String: $tempStr\n" ); |
3155
|
|
|
|
|
|
|
|
3156
|
|
|
|
|
|
|
undef( $xmlconv ); |
3157
|
|
|
|
|
|
|
|
3158
|
|
|
|
|
|
|
=head3 GetTempDate |
3159
|
|
|
|
|
|
|
|
3160
|
|
|
|
|
|
|
Description: |
3161
|
|
|
|
|
|
|
|
3162
|
|
|
|
|
|
|
Returns the _tempDate member variable set during Word2vec::Xmltow2v object instantiation of new function. |
3163
|
|
|
|
|
|
|
Used by module as a temporary storage location for parsed Medline 'DateCreated' flag string data. |
3164
|
|
|
|
|
|
|
|
3165
|
|
|
|
|
|
|
Input: |
3166
|
|
|
|
|
|
|
|
3167
|
|
|
|
|
|
|
None |
3168
|
|
|
|
|
|
|
|
3169
|
|
|
|
|
|
|
Output: |
3170
|
|
|
|
|
|
|
|
3171
|
|
|
|
|
|
|
$date -> Date string - Format: XX/XX/XXXX (Mon/Day/Year). |
3172
|
|
|
|
|
|
|
|
3173
|
|
|
|
|
|
|
Example: |
3174
|
|
|
|
|
|
|
|
3175
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
3176
|
|
|
|
|
|
|
|
3177
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
3178
|
|
|
|
|
|
|
my $date = $xmlconv->GetTempDate(); |
3179
|
|
|
|
|
|
|
|
3180
|
|
|
|
|
|
|
print( "Temp Date: $date\n" ); |
3181
|
|
|
|
|
|
|
|
3182
|
|
|
|
|
|
|
undef( $xmlconv ); |
3183
|
|
|
|
|
|
|
|
3184
|
|
|
|
|
|
|
=head3 GetCompoundWordAry |
3185
|
|
|
|
|
|
|
|
3186
|
|
|
|
|
|
|
Description: |
3187
|
|
|
|
|
|
|
|
3188
|
|
|
|
|
|
|
Returns the _compoundWordAry member array reference set during Word2vec::Xmltow2v object instantiation of new function. |
3189
|
|
|
|
|
|
|
|
3190
|
|
|
|
|
|
|
Warning: Compound word data must be loaded in memory first via ReadCompoundWordDataFromFile(). |
3191
|
|
|
|
|
|
|
|
3192
|
|
|
|
|
|
|
Input: |
3193
|
|
|
|
|
|
|
|
3194
|
|
|
|
|
|
|
None |
3195
|
|
|
|
|
|
|
|
3196
|
|
|
|
|
|
|
Output: |
3197
|
|
|
|
|
|
|
|
3198
|
|
|
|
|
|
|
$arrayReference -> Compound word array reference. |
3199
|
|
|
|
|
|
|
|
3200
|
|
|
|
|
|
|
Example: |
3201
|
|
|
|
|
|
|
|
3202
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
3203
|
|
|
|
|
|
|
|
3204
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
3205
|
|
|
|
|
|
|
my $arrayReference = $xmlconv->GetCompoundWordAry(); |
3206
|
|
|
|
|
|
|
my @compoundWord = @{ $arrayReference }; |
3207
|
|
|
|
|
|
|
|
3208
|
|
|
|
|
|
|
print( "Compound Word Array: @compoundWord\n" ); |
3209
|
|
|
|
|
|
|
|
3210
|
|
|
|
|
|
|
undef( $xmlconv ); |
3211
|
|
|
|
|
|
|
|
3212
|
|
|
|
|
|
|
=head3 GetCompoundWordBST |
3213
|
|
|
|
|
|
|
|
3214
|
|
|
|
|
|
|
Description: |
3215
|
|
|
|
|
|
|
|
3216
|
|
|
|
|
|
|
Returns the _compoundWordBST member variable set during Word2vec::Xmltow2v object instantiation of new function. |
3217
|
|
|
|
|
|
|
|
3218
|
|
|
|
|
|
|
Input: |
3219
|
|
|
|
|
|
|
|
3220
|
|
|
|
|
|
|
None |
3221
|
|
|
|
|
|
|
|
3222
|
|
|
|
|
|
|
Output: |
3223
|
|
|
|
|
|
|
|
3224
|
|
|
|
|
|
|
$bst -> Compound word binary search tree. |
3225
|
|
|
|
|
|
|
|
3226
|
|
|
|
|
|
|
Example: |
3227
|
|
|
|
|
|
|
|
3228
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
3229
|
|
|
|
|
|
|
|
3230
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
3231
|
|
|
|
|
|
|
my $bst = $xmlconv->GetCompoundWordBST(); |
3232
|
|
|
|
|
|
|
|
3233
|
|
|
|
|
|
|
undef( $xmlconv ); |
3234
|
|
|
|
|
|
|
|
3235
|
|
|
|
|
|
|
=head3 GetMaxCompoundWordLength |
3236
|
|
|
|
|
|
|
|
3237
|
|
|
|
|
|
|
Description: |
3238
|
|
|
|
|
|
|
|
3239
|
|
|
|
|
|
|
Returns the _maxCompoundWordLength member variable set during Word2vec::Xmltow2v object instantiation of new function. |
3240
|
|
|
|
|
|
|
|
3241
|
|
|
|
|
|
|
Note: If not defined, it is automatically set to and returns 20. |
3242
|
|
|
|
|
|
|
|
3243
|
|
|
|
|
|
|
Input: |
3244
|
|
|
|
|
|
|
|
3245
|
|
|
|
|
|
|
None |
3246
|
|
|
|
|
|
|
|
3247
|
|
|
|
|
|
|
Output: |
3248
|
|
|
|
|
|
|
|
3249
|
|
|
|
|
|
|
$value -> Maximum number of compound words in a given phrase. |
3250
|
|
|
|
|
|
|
|
3251
|
|
|
|
|
|
|
Example: |
3252
|
|
|
|
|
|
|
|
3253
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
3254
|
|
|
|
|
|
|
|
3255
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
3256
|
|
|
|
|
|
|
my $compoundWordLength = $xmlconv->GetMaxCompoundWordLength(); |
3257
|
|
|
|
|
|
|
|
3258
|
|
|
|
|
|
|
print( "Maximum Compound Word Length: $compoundWordLength\n" ); |
3259
|
|
|
|
|
|
|
|
3260
|
|
|
|
|
|
|
undef( $xmlconv ); |
3261
|
|
|
|
|
|
|
|
3262
|
|
|
|
|
|
|
=head3 GetOverwriteExistingFile |
3263
|
|
|
|
|
|
|
|
3264
|
|
|
|
|
|
|
Description: |
3265
|
|
|
|
|
|
|
|
3266
|
|
|
|
|
|
|
Returns the _overwriteExistingFile member variable set during Word2vec::Xmltow2v object instantiation of new function. |
3267
|
|
|
|
|
|
|
Enables overwriting of existing text corpus if set to '1' or appends to the existing text corpus if set to '0'. |
3268
|
|
|
|
|
|
|
|
3269
|
|
|
|
|
|
|
Input: |
3270
|
|
|
|
|
|
|
|
3271
|
|
|
|
|
|
|
None |
3272
|
|
|
|
|
|
|
|
3273
|
|
|
|
|
|
|
Output: |
3274
|
|
|
|
|
|
|
|
3275
|
|
|
|
|
|
|
$value -> '1' = Overwrite existing file / '0' = Append to existing file. |
3276
|
|
|
|
|
|
|
|
3277
|
|
|
|
|
|
|
Example: |
3278
|
|
|
|
|
|
|
|
3279
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
3280
|
|
|
|
|
|
|
|
3281
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
3282
|
|
|
|
|
|
|
my $overwriteExistingFile = $xmlconv->GetOverwriteExistingFile(); |
3283
|
|
|
|
|
|
|
|
3284
|
|
|
|
|
|
|
print( "Overwrite Existing File? YES\n" ) if ( $overwriteExistingFile == 1 ); |
3285
|
|
|
|
|
|
|
print( "Overwrite Existing File? NO\n" ) if ( $overwriteExistingFile == 0 ); |
3286
|
|
|
|
|
|
|
|
3287
|
|
|
|
|
|
|
undef( $xmlconv ); |
3288
|
|
|
|
|
|
|
|
3289
|
|
|
|
|
|
|
=head2 Mutator Functions |
3290
|
|
|
|
|
|
|
|
3291
|
|
|
|
|
|
|
=head3 SetStoreTitle |
3292
|
|
|
|
|
|
|
|
3293
|
|
|
|
|
|
|
Description: |
3294
|
|
|
|
|
|
|
|
3295
|
|
|
|
|
|
|
Sets member variable to passed integer parameter. Instructs module to store article title if true or omit if false. |
3296
|
|
|
|
|
|
|
|
3297
|
|
|
|
|
|
|
Input: |
3298
|
|
|
|
|
|
|
|
3299
|
|
|
|
|
|
|
$value -> '1' = Store Titles / '0' = Omit Titles |
3300
|
|
|
|
|
|
|
|
3301
|
|
|
|
|
|
|
Output: |
3302
|
|
|
|
|
|
|
|
3303
|
|
|
|
|
|
|
None |
3304
|
|
|
|
|
|
|
|
3305
|
|
|
|
|
|
|
Example: |
3306
|
|
|
|
|
|
|
|
3307
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
3308
|
|
|
|
|
|
|
|
3309
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
3310
|
|
|
|
|
|
|
$xmlconv->SetStoreTitle( 1 ); |
3311
|
|
|
|
|
|
|
|
3312
|
|
|
|
|
|
|
undef( $xmlconv ); |
3313
|
|
|
|
|
|
|
|
3314
|
|
|
|
|
|
|
=head3 SetStoreAbstract |
3315
|
|
|
|
|
|
|
|
3316
|
|
|
|
|
|
|
Description: |
3317
|
|
|
|
|
|
|
|
3318
|
|
|
|
|
|
|
Sets member variable to passed integer parameter. Instructs module to store article abstracts if true or omit if false. |
3319
|
|
|
|
|
|
|
|
3320
|
|
|
|
|
|
|
Input: |
3321
|
|
|
|
|
|
|
|
3322
|
|
|
|
|
|
|
$value -> '1' = Store Abstracts / '0' = Omit Abstracts |
3323
|
|
|
|
|
|
|
|
3324
|
|
|
|
|
|
|
Output: |
3325
|
|
|
|
|
|
|
|
3326
|
|
|
|
|
|
|
None |
3327
|
|
|
|
|
|
|
|
3328
|
|
|
|
|
|
|
Example: |
3329
|
|
|
|
|
|
|
|
3330
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
3331
|
|
|
|
|
|
|
|
3332
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
3333
|
|
|
|
|
|
|
$xmlconv->SetStoreAbstract( 1 ); |
3334
|
|
|
|
|
|
|
|
3335
|
|
|
|
|
|
|
undef( $xmlconv ); |
3336
|
|
|
|
|
|
|
|
3337
|
|
|
|
|
|
|
=head3 SetWorkingDir |
3338
|
|
|
|
|
|
|
|
3339
|
|
|
|
|
|
|
Description: |
3340
|
|
|
|
|
|
|
|
3341
|
|
|
|
|
|
|
Sets member variable to passed string parameter. Represents the working directory. |
3342
|
|
|
|
|
|
|
|
3343
|
|
|
|
|
|
|
Input: |
3344
|
|
|
|
|
|
|
|
3345
|
|
|
|
|
|
|
$string -> Working directory string |
3346
|
|
|
|
|
|
|
|
3347
|
|
|
|
|
|
|
Output: |
3348
|
|
|
|
|
|
|
|
3349
|
|
|
|
|
|
|
None |
3350
|
|
|
|
|
|
|
|
3351
|
|
|
|
|
|
|
Example: |
3352
|
|
|
|
|
|
|
|
3353
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
3354
|
|
|
|
|
|
|
|
3355
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
3356
|
|
|
|
|
|
|
$xmlconv->SetWorkingDir( "/samples/" ); |
3357
|
|
|
|
|
|
|
|
3358
|
|
|
|
|
|
|
undef( $xmlconv ); |
3359
|
|
|
|
|
|
|
|
3360
|
|
|
|
|
|
|
=head3 SetSavePath |
3361
|
|
|
|
|
|
|
|
3362
|
|
|
|
|
|
|
Description: |
3363
|
|
|
|
|
|
|
|
3364
|
|
|
|
|
|
|
Sets member variable to passed string parameter. Represents the text corpus save path. |
3365
|
|
|
|
|
|
|
|
3366
|
|
|
|
|
|
|
Input: |
3367
|
|
|
|
|
|
|
|
3368
|
|
|
|
|
|
|
$string -> Text corpus save path |
3369
|
|
|
|
|
|
|
|
3370
|
|
|
|
|
|
|
Output: |
3371
|
|
|
|
|
|
|
|
3372
|
|
|
|
|
|
|
None |
3373
|
|
|
|
|
|
|
|
3374
|
|
|
|
|
|
|
Example: |
3375
|
|
|
|
|
|
|
|
3376
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
3377
|
|
|
|
|
|
|
|
3378
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
3379
|
|
|
|
|
|
|
$xmlconv->SetSavePath( "samples/textcorpus.txt" ); |
3380
|
|
|
|
|
|
|
|
3381
|
|
|
|
|
|
|
undef( $xmlconv ); |
3382
|
|
|
|
|
|
|
|
3383
|
|
|
|
|
|
|
=head3 SetQuickParse |
3384
|
|
|
|
|
|
|
|
3385
|
|
|
|
|
|
|
Description: |
3386
|
|
|
|
|
|
|
|
3387
|
|
|
|
|
|
|
Sets member variable to passed integer parameter. Instructs module to utilize quick parse |
3388
|
|
|
|
|
|
|
routines to speed up text corpus compilation. This method is somewhat less accurate due to its non-exhaustive nature. |
3389
|
|
|
|
|
|
|
|
3390
|
|
|
|
|
|
|
Input: |
3391
|
|
|
|
|
|
|
|
3392
|
|
|
|
|
|
|
$value -> '1' = Enable Quick Parse / '0' = Disable Quick Parse |
3393
|
|
|
|
|
|
|
|
3394
|
|
|
|
|
|
|
Output: |
3395
|
|
|
|
|
|
|
|
3396
|
|
|
|
|
|
|
None |
3397
|
|
|
|
|
|
|
|
3398
|
|
|
|
|
|
|
Example: |
3399
|
|
|
|
|
|
|
|
3400
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
3401
|
|
|
|
|
|
|
|
3402
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
3403
|
|
|
|
|
|
|
$xmlconv->SetQuickParse( 1 ); |
3404
|
|
|
|
|
|
|
|
3405
|
|
|
|
|
|
|
undef( $xmlconv ); |
3406
|
|
|
|
|
|
|
|
3407
|
|
|
|
|
|
|
=head3 SetCompoundifyText |
3408
|
|
|
|
|
|
|
|
3409
|
|
|
|
|
|
|
Description: |
3410
|
|
|
|
|
|
|
|
3411
|
|
|
|
|
|
|
Sets member variable to passed integer parameter. Instructs module to utilize 'compoundify' option if true. |
3412
|
|
|
|
|
|
|
|
3413
|
|
|
|
|
|
|
Warning: This requires compound word data to be loaded into memory with ReadCompoundWordDataFromFile() method prior |
3414
|
|
|
|
|
|
|
to executing text corpus compilation. |
3415
|
|
|
|
|
|
|
|
3416
|
|
|
|
|
|
|
Input: |
3417
|
|
|
|
|
|
|
|
3418
|
|
|
|
|
|
|
$value -> '1' = Compoundify text / '0' = Do not compoundify text |
3419
|
|
|
|
|
|
|
|
3420
|
|
|
|
|
|
|
Output: |
3421
|
|
|
|
|
|
|
|
3422
|
|
|
|
|
|
|
None |
3423
|
|
|
|
|
|
|
|
3424
|
|
|
|
|
|
|
Example: |
3425
|
|
|
|
|
|
|
|
3426
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
3427
|
|
|
|
|
|
|
|
3428
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
3429
|
|
|
|
|
|
|
$xmlconv->SetCompoundifyText( 1 ); |
3430
|
|
|
|
|
|
|
|
3431
|
|
|
|
|
|
|
undef( $xmlconv ); |
3432
|
|
|
|
|
|
|
|
3433
|
|
|
|
|
|
|
=head3 SetStoreAsSentencePerLine |
3434
|
|
|
|
|
|
|
|
3435
|
|
|
|
|
|
|
Description: |
3436
|
|
|
|
|
|
|
|
3437
|
|
|
|
|
|
|
Sets member variable to passed integer parameter. Instructs module to utilize 'storeAsSentencePerLine' option if true. |
3438
|
|
|
|
|
|
|
|
3439
|
|
|
|
|
|
|
Input: |
3440
|
|
|
|
|
|
|
|
3441
|
|
|
|
|
|
|
$value -> '1' = Store as sentence per line / '0' = Do not store as sentence per line |
3442
|
|
|
|
|
|
|
|
3443
|
|
|
|
|
|
|
Output: |
3444
|
|
|
|
|
|
|
|
3445
|
|
|
|
|
|
|
None |
3446
|
|
|
|
|
|
|
|
3447
|
|
|
|
|
|
|
Example: |
3448
|
|
|
|
|
|
|
|
3449
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
3450
|
|
|
|
|
|
|
|
3451
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
3452
|
|
|
|
|
|
|
$xmlconv->SetStoreAsSentencePerLine( 1 ); |
3453
|
|
|
|
|
|
|
|
3454
|
|
|
|
|
|
|
undef( $xmlconv ); |
3455
|
|
|
|
|
|
|
|
3456
|
|
|
|
|
|
|
=head3 SetNumOfThreads |
3457
|
|
|
|
|
|
|
|
3458
|
|
|
|
|
|
|
Description: |
3459
|
|
|
|
|
|
|
|
3460
|
|
|
|
|
|
|
Sets member variable to passed integer parameter. Sets the requested number of threads to parse Medline XML files |
3461
|
|
|
|
|
|
|
and compile the text corpus. |
3462
|
|
|
|
|
|
|
|
3463
|
|
|
|
|
|
|
Input: |
3464
|
|
|
|
|
|
|
|
3465
|
|
|
|
|
|
|
$value -> Integer (Positive value) |
3466
|
|
|
|
|
|
|
|
3467
|
|
|
|
|
|
|
Output: |
3468
|
|
|
|
|
|
|
|
3469
|
|
|
|
|
|
|
None |
3470
|
|
|
|
|
|
|
|
3471
|
|
|
|
|
|
|
Example: |
3472
|
|
|
|
|
|
|
|
3473
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
3474
|
|
|
|
|
|
|
|
3475
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
3476
|
|
|
|
|
|
|
$xmlconv->SetNumOfThreads( 4 ); |
3477
|
|
|
|
|
|
|
|
3478
|
|
|
|
|
|
|
undef( $xmlconv ); |
3479
|
|
|
|
|
|
|
|
3480
|
|
|
|
|
|
|
=head3 SetBeginDate |
3481
|
|
|
|
|
|
|
|
3482
|
|
|
|
|
|
|
Description: |
3483
|
|
|
|
|
|
|
|
3484
|
|
|
|
|
|
|
Sets member variable to passed string parameter. Sets beginning date range for earliest articles to store, by |
3485
|
|
|
|
|
|
|
'DateCreated' Medline tag, within the text corpus during compilation. |
3486
|
|
|
|
|
|
|
|
3487
|
|
|
|
|
|
|
Note: Expected format - "XX/XX/XXXX" (Mon/Day/Year) |
3488
|
|
|
|
|
|
|
|
3489
|
|
|
|
|
|
|
Input: |
3490
|
|
|
|
|
|
|
|
3491
|
|
|
|
|
|
|
$string -> Date string - Format: "XX/XX/XXXX" |
3492
|
|
|
|
|
|
|
|
3493
|
|
|
|
|
|
|
Output: |
3494
|
|
|
|
|
|
|
|
3495
|
|
|
|
|
|
|
None |
3496
|
|
|
|
|
|
|
|
3497
|
|
|
|
|
|
|
Example: |
3498
|
|
|
|
|
|
|
|
3499
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
3500
|
|
|
|
|
|
|
|
3501
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
3502
|
|
|
|
|
|
|
$xmlconv->SetBeginDate( "01/01/2004" ); |
3503
|
|
|
|
|
|
|
|
3504
|
|
|
|
|
|
|
undef( $xmlconv ); |
3505
|
|
|
|
|
|
|
|
3506
|
|
|
|
|
|
|
=head3 SetEndDate |
3507
|
|
|
|
|
|
|
|
3508
|
|
|
|
|
|
|
Description: |
3509
|
|
|
|
|
|
|
|
3510
|
|
|
|
|
|
|
Sets member variable to passed string parameter. Sets ending date range for latest article to store, by |
3511
|
|
|
|
|
|
|
'DateCreated' Medline tag, within the text corpus during compilation. |
3512
|
|
|
|
|
|
|
|
3513
|
|
|
|
|
|
|
Note: Expected format - "XX/XX/XXXX" (Mon/Day/Year) |
3514
|
|
|
|
|
|
|
|
3515
|
|
|
|
|
|
|
Input: |
3516
|
|
|
|
|
|
|
|
3517
|
|
|
|
|
|
|
$string -> Date string - Format: "XX/XX/XXXX" |
3518
|
|
|
|
|
|
|
|
3519
|
|
|
|
|
|
|
Output: |
3520
|
|
|
|
|
|
|
|
3521
|
|
|
|
|
|
|
None |
3522
|
|
|
|
|
|
|
|
3523
|
|
|
|
|
|
|
Example: |
3524
|
|
|
|
|
|
|
|
3525
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
3526
|
|
|
|
|
|
|
|
3527
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
3528
|
|
|
|
|
|
|
$xmlconv->SetEndDate( "08/13/2016" ); |
3529
|
|
|
|
|
|
|
|
3530
|
|
|
|
|
|
|
undef( $xmlconv ); |
3531
|
|
|
|
|
|
|
|
3532
|
|
|
|
|
|
|
=head3 SetXMLStringToParse |
3533
|
|
|
|
|
|
|
|
3534
|
|
|
|
|
|
|
Description: |
3535
|
|
|
|
|
|
|
|
3536
|
|
|
|
|
|
|
Sets member variable to passed string parameter. This string normally consists of Medline XML data to be |
3537
|
|
|
|
|
|
|
parsed for text corpus compilation. |
3538
|
|
|
|
|
|
|
|
3539
|
|
|
|
|
|
|
Warning: This is a private function and should not be called or manipulated. |
3540
|
|
|
|
|
|
|
|
3541
|
|
|
|
|
|
|
Input: |
3542
|
|
|
|
|
|
|
|
3543
|
|
|
|
|
|
|
$string -> String |
3544
|
|
|
|
|
|
|
|
3545
|
|
|
|
|
|
|
Output: |
3546
|
|
|
|
|
|
|
|
3547
|
|
|
|
|
|
|
None |
3548
|
|
|
|
|
|
|
|
3549
|
|
|
|
|
|
|
Example: |
3550
|
|
|
|
|
|
|
|
3551
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
3552
|
|
|
|
|
|
|
|
3553
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
3554
|
|
|
|
|
|
|
$xmlconv->SetXMLStringToParse( "Hello World!" ); |
3555
|
|
|
|
|
|
|
|
3556
|
|
|
|
|
|
|
undef( $xmlconv ); |
3557
|
|
|
|
|
|
|
|
3558
|
|
|
|
|
|
|
=head3 SetTextCorpusStr |
3559
|
|
|
|
|
|
|
|
3560
|
|
|
|
|
|
|
Description: |
3561
|
|
|
|
|
|
|
|
3562
|
|
|
|
|
|
|
Sets member variable to passed string parameter. Overwrites any stored text corpus data in memory to the string parameter. |
3563
|
|
|
|
|
|
|
|
3564
|
|
|
|
|
|
|
Warning: This is a private function and should not be called or manipulated. |
3565
|
|
|
|
|
|
|
|
3566
|
|
|
|
|
|
|
Input: |
3567
|
|
|
|
|
|
|
|
3568
|
|
|
|
|
|
|
$string -> String |
3569
|
|
|
|
|
|
|
|
3570
|
|
|
|
|
|
|
Output: |
3571
|
|
|
|
|
|
|
|
3572
|
|
|
|
|
|
|
None |
3573
|
|
|
|
|
|
|
|
3574
|
|
|
|
|
|
|
Example: |
3575
|
|
|
|
|
|
|
|
3576
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
3577
|
|
|
|
|
|
|
|
3578
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
3579
|
|
|
|
|
|
|
$xmlconv->SetTextCorpusStr( "Hello World!" ); |
3580
|
|
|
|
|
|
|
|
3581
|
|
|
|
|
|
|
undef( $xmlconv ); |
3582
|
|
|
|
|
|
|
|
3583
|
|
|
|
|
|
|
=head3 AppendStrToTextCorpus |
3584
|
|
|
|
|
|
|
|
3585
|
|
|
|
|
|
|
Description: |
3586
|
|
|
|
|
|
|
|
3587
|
|
|
|
|
|
|
Sets member variable to passed string parameter. Appends string parameter to text corpus string in memory. |
3588
|
|
|
|
|
|
|
|
3589
|
|
|
|
|
|
|
Warning: This is a private function and should not be called or manipulated. |
3590
|
|
|
|
|
|
|
|
3591
|
|
|
|
|
|
|
Input: |
3592
|
|
|
|
|
|
|
|
3593
|
|
|
|
|
|
|
$string -> String |
3594
|
|
|
|
|
|
|
|
3595
|
|
|
|
|
|
|
Output: |
3596
|
|
|
|
|
|
|
|
3597
|
|
|
|
|
|
|
None |
3598
|
|
|
|
|
|
|
|
3599
|
|
|
|
|
|
|
Example: |
3600
|
|
|
|
|
|
|
|
3601
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
3602
|
|
|
|
|
|
|
|
3603
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
3604
|
|
|
|
|
|
|
$xmlconv->AppendStrToTextCorpus( "Hello World!" ); |
3605
|
|
|
|
|
|
|
|
3606
|
|
|
|
|
|
|
undef( $xmlconv ); |
3607
|
|
|
|
|
|
|
|
3608
|
|
|
|
|
|
|
=head3 ClearTextCorpus |
3609
|
|
|
|
|
|
|
|
3610
|
|
|
|
|
|
|
Description: |
3611
|
|
|
|
|
|
|
|
3612
|
|
|
|
|
|
|
Clears text corpus data in memory. |
3613
|
|
|
|
|
|
|
|
3614
|
|
|
|
|
|
|
Warning: This is a private function and should not be called or manipulated. |
3615
|
|
|
|
|
|
|
|
3616
|
|
|
|
|
|
|
Input: |
3617
|
|
|
|
|
|
|
|
3618
|
|
|
|
|
|
|
None |
3619
|
|
|
|
|
|
|
|
3620
|
|
|
|
|
|
|
Output: |
3621
|
|
|
|
|
|
|
|
3622
|
|
|
|
|
|
|
None |
3623
|
|
|
|
|
|
|
|
3624
|
|
|
|
|
|
|
Example: |
3625
|
|
|
|
|
|
|
|
3626
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
3627
|
|
|
|
|
|
|
|
3628
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
3629
|
|
|
|
|
|
|
$xmlconv->ClearTextCorpus(); |
3630
|
|
|
|
|
|
|
|
3631
|
|
|
|
|
|
|
undef( $xmlconv ); |
3632
|
|
|
|
|
|
|
|
3633
|
|
|
|
|
|
|
=head3 SetTempStr |
3634
|
|
|
|
|
|
|
|
3635
|
|
|
|
|
|
|
Description: |
3636
|
|
|
|
|
|
|
|
3637
|
|
|
|
|
|
|
Sets member variable to passed string parameter. Sets temporary member string to passed string parameter. |
3638
|
|
|
|
|
|
|
(Temporary placeholder for Medline Title and Abstract data). |
3639
|
|
|
|
|
|
|
|
3640
|
|
|
|
|
|
|
Note: This removes special characters and converts all characters to lowercase. |
3641
|
|
|
|
|
|
|
|
3642
|
|
|
|
|
|
|
Warning: This is a private function and should not be called or manipulated. |
3643
|
|
|
|
|
|
|
|
3644
|
|
|
|
|
|
|
Input: |
3645
|
|
|
|
|
|
|
|
3646
|
|
|
|
|
|
|
$string -> String |
3647
|
|
|
|
|
|
|
|
3648
|
|
|
|
|
|
|
Output: |
3649
|
|
|
|
|
|
|
|
3650
|
|
|
|
|
|
|
None |
3651
|
|
|
|
|
|
|
|
3652
|
|
|
|
|
|
|
Example: |
3653
|
|
|
|
|
|
|
|
3654
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
3655
|
|
|
|
|
|
|
|
3656
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
3657
|
|
|
|
|
|
|
$xmlconv->SetTempStr( "Hello World!" ); |
3658
|
|
|
|
|
|
|
|
3659
|
|
|
|
|
|
|
undef( $xmlconv ); |
3660
|
|
|
|
|
|
|
|
3661
|
|
|
|
|
|
|
=head3 AppendToTempStr |
3662
|
|
|
|
|
|
|
|
3663
|
|
|
|
|
|
|
Description: |
3664
|
|
|
|
|
|
|
|
3665
|
|
|
|
|
|
|
Appends string parameter to temporary member string in memory. |
3666
|
|
|
|
|
|
|
|
3667
|
|
|
|
|
|
|
Note: This removes special characters and converts all characters to lowercase. |
3668
|
|
|
|
|
|
|
|
3669
|
|
|
|
|
|
|
Warning: This is a private function and should not be called or manipulated. |
3670
|
|
|
|
|
|
|
|
3671
|
|
|
|
|
|
|
Input: |
3672
|
|
|
|
|
|
|
|
3673
|
|
|
|
|
|
|
$string -> String |
3674
|
|
|
|
|
|
|
|
3675
|
|
|
|
|
|
|
Output: |
3676
|
|
|
|
|
|
|
|
3677
|
|
|
|
|
|
|
None |
3678
|
|
|
|
|
|
|
|
3679
|
|
|
|
|
|
|
Example: |
3680
|
|
|
|
|
|
|
|
3681
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
3682
|
|
|
|
|
|
|
|
3683
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
3684
|
|
|
|
|
|
|
$xmlconv->AppendToTempStr( "Hello World!" ); |
3685
|
|
|
|
|
|
|
|
3686
|
|
|
|
|
|
|
undef( $xmlconv ); |
3687
|
|
|
|
|
|
|
|
3688
|
|
|
|
|
|
|
=head3 ClearTempStr |
3689
|
|
|
|
|
|
|
|
3690
|
|
|
|
|
|
|
Clears the temporary string storage in memory. |
3691
|
|
|
|
|
|
|
|
3692
|
|
|
|
|
|
|
Warning: This is a private function and should not be called or manipulated. |
3693
|
|
|
|
|
|
|
|
3694
|
|
|
|
|
|
|
Input: |
3695
|
|
|
|
|
|
|
|
3696
|
|
|
|
|
|
|
None |
3697
|
|
|
|
|
|
|
|
3698
|
|
|
|
|
|
|
Output: |
3699
|
|
|
|
|
|
|
|
3700
|
|
|
|
|
|
|
None |
3701
|
|
|
|
|
|
|
|
3702
|
|
|
|
|
|
|
Example: |
3703
|
|
|
|
|
|
|
|
3704
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
3705
|
|
|
|
|
|
|
|
3706
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
3707
|
|
|
|
|
|
|
$xmlconv->ClearTempStr(); |
3708
|
|
|
|
|
|
|
|
3709
|
|
|
|
|
|
|
undef( $xmlconv ); |
3710
|
|
|
|
|
|
|
|
3711
|
|
|
|
|
|
|
=head3 SetTempDate |
3712
|
|
|
|
|
|
|
|
3713
|
|
|
|
|
|
|
Description: |
3714
|
|
|
|
|
|
|
|
3715
|
|
|
|
|
|
|
Sets member variable to passed string parameter. Sets temporary date string to passed string. |
3716
|
|
|
|
|
|
|
|
3717
|
|
|
|
|
|
|
Note: Date Format - "XX/XX/XXXX" (Mon/Day/Year) |
3718
|
|
|
|
|
|
|
|
3719
|
|
|
|
|
|
|
Warning: This is a private function and should not be called or manipulated. |
3720
|
|
|
|
|
|
|
|
3721
|
|
|
|
|
|
|
Input: |
3722
|
|
|
|
|
|
|
|
3723
|
|
|
|
|
|
|
$string -> Date string - Format: "XX/XX/XXXX" |
3724
|
|
|
|
|
|
|
|
3725
|
|
|
|
|
|
|
Output: |
3726
|
|
|
|
|
|
|
|
3727
|
|
|
|
|
|
|
None |
3728
|
|
|
|
|
|
|
|
3729
|
|
|
|
|
|
|
Example: |
3730
|
|
|
|
|
|
|
|
3731
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
3732
|
|
|
|
|
|
|
|
3733
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
3734
|
|
|
|
|
|
|
$xmlconv->SetTempDate( "08/13/2016" ); |
3735
|
|
|
|
|
|
|
|
3736
|
|
|
|
|
|
|
undef( $xmlconv ); |
3737
|
|
|
|
|
|
|
|
3738
|
|
|
|
|
|
|
=head3 ClearTempDate |
3739
|
|
|
|
|
|
|
|
3740
|
|
|
|
|
|
|
Description: |
3741
|
|
|
|
|
|
|
|
3742
|
|
|
|
|
|
|
Clears the temporary date storage location in memory. |
3743
|
|
|
|
|
|
|
|
3744
|
|
|
|
|
|
|
Warning: This is a private function and should not be called or manipulated. |
3745
|
|
|
|
|
|
|
|
3746
|
|
|
|
|
|
|
Input: |
3747
|
|
|
|
|
|
|
|
3748
|
|
|
|
|
|
|
None |
3749
|
|
|
|
|
|
|
|
3750
|
|
|
|
|
|
|
Output: |
3751
|
|
|
|
|
|
|
|
3752
|
|
|
|
|
|
|
None |
3753
|
|
|
|
|
|
|
|
3754
|
|
|
|
|
|
|
Example: |
3755
|
|
|
|
|
|
|
|
3756
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
3757
|
|
|
|
|
|
|
|
3758
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
3759
|
|
|
|
|
|
|
$xmlconv->ClearTempDate(); |
3760
|
|
|
|
|
|
|
|
3761
|
|
|
|
|
|
|
undef( $xmlconv ); |
3762
|
|
|
|
|
|
|
|
3763
|
|
|
|
|
|
|
=head3 SetCompoundWordAry |
3764
|
|
|
|
|
|
|
|
3765
|
|
|
|
|
|
|
Description: |
3766
|
|
|
|
|
|
|
|
3767
|
|
|
|
|
|
|
Sets member variable to de-referenced passed array reference parameter. Stores compound word array by |
3768
|
|
|
|
|
|
|
de-referencing array reference parameter. |
3769
|
|
|
|
|
|
|
|
3770
|
|
|
|
|
|
|
Note: Clears previous data if existing. |
3771
|
|
|
|
|
|
|
|
3772
|
|
|
|
|
|
|
Warning: This is a private function and should not be called or manipulated. |
3773
|
|
|
|
|
|
|
|
3774
|
|
|
|
|
|
|
Input: |
3775
|
|
|
|
|
|
|
|
3776
|
|
|
|
|
|
|
$arrayReference -> Array reference of compound words |
3777
|
|
|
|
|
|
|
|
3778
|
|
|
|
|
|
|
Output: |
3779
|
|
|
|
|
|
|
|
3780
|
|
|
|
|
|
|
None |
3781
|
|
|
|
|
|
|
|
3782
|
|
|
|
|
|
|
Example: |
3783
|
|
|
|
|
|
|
|
3784
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
3785
|
|
|
|
|
|
|
|
3786
|
|
|
|
|
|
|
my @compoundWordAry = ( "big dog", "respiratory failure", "seven large masses" ); |
3787
|
|
|
|
|
|
|
|
3788
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
3789
|
|
|
|
|
|
|
$xmlconv->SetCompoundWordAry( \@compoundWordAry ); |
3790
|
|
|
|
|
|
|
|
3791
|
|
|
|
|
|
|
undef( $xmlconv ); |
3792
|
|
|
|
|
|
|
|
3793
|
|
|
|
|
|
|
=head3 ClearCompoundWordAry |
3794
|
|
|
|
|
|
|
|
3795
|
|
|
|
|
|
|
Description: |
3796
|
|
|
|
|
|
|
|
3797
|
|
|
|
|
|
|
Clears compound word array in memory. |
3798
|
|
|
|
|
|
|
|
3799
|
|
|
|
|
|
|
Warning: This is a private function and should not be called or manipulated. |
3800
|
|
|
|
|
|
|
|
3801
|
|
|
|
|
|
|
Input: |
3802
|
|
|
|
|
|
|
|
3803
|
|
|
|
|
|
|
None |
3804
|
|
|
|
|
|
|
|
3805
|
|
|
|
|
|
|
Output: |
3806
|
|
|
|
|
|
|
|
3807
|
|
|
|
|
|
|
None |
3808
|
|
|
|
|
|
|
|
3809
|
|
|
|
|
|
|
Example: |
3810
|
|
|
|
|
|
|
|
3811
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
3812
|
|
|
|
|
|
|
|
3813
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
3814
|
|
|
|
|
|
|
$xmlconv->ClearCompoundWordAry(); |
3815
|
|
|
|
|
|
|
|
3816
|
|
|
|
|
|
|
undef( $xmlconv ); |
3817
|
|
|
|
|
|
|
|
3818
|
|
|
|
|
|
|
=head3 SetCompoundWordBST |
3819
|
|
|
|
|
|
|
|
3820
|
|
|
|
|
|
|
Description: |
3821
|
|
|
|
|
|
|
|
3822
|
|
|
|
|
|
|
Sets member variable to passed Word2vec::Bst parameter. Sets compound word binary search tree to passed binary tree parameter. |
3823
|
|
|
|
|
|
|
|
3824
|
|
|
|
|
|
|
Note: Un-defines previous binary tree if existing. |
3825
|
|
|
|
|
|
|
|
3826
|
|
|
|
|
|
|
Warning: This is a private function and should not be called or manipulated. |
3827
|
|
|
|
|
|
|
|
3828
|
|
|
|
|
|
|
Input: |
3829
|
|
|
|
|
|
|
|
3830
|
|
|
|
|
|
|
Word2vec::Bst -> Binary Search Tree |
3831
|
|
|
|
|
|
|
|
3832
|
|
|
|
|
|
|
Output: |
3833
|
|
|
|
|
|
|
|
3834
|
|
|
|
|
|
|
None |
3835
|
|
|
|
|
|
|
|
3836
|
|
|
|
|
|
|
Example: |
3837
|
|
|
|
|
|
|
|
3838
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
3839
|
|
|
|
|
|
|
|
3840
|
|
|
|
|
|
|
my @compoundWordAry = ( "big dog", "respiratory failure", "seven large masses" ); |
3841
|
|
|
|
|
|
|
@compoundWordAry = sort( @compoundWordAry ); |
3842
|
|
|
|
|
|
|
|
3843
|
|
|
|
|
|
|
my $arySize = @compoundWordAry; |
3844
|
|
|
|
|
|
|
|
3845
|
|
|
|
|
|
|
my $bst = Word2vec::Bst->new(); |
3846
|
|
|
|
|
|
|
$bst->CreateTree( \@compoundWordAry, 0, $arySize, undef ); |
3847
|
|
|
|
|
|
|
|
3848
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
3849
|
|
|
|
|
|
|
$xmlconv->SetCompoundWordBST( $bst ); |
3850
|
|
|
|
|
|
|
|
3851
|
|
|
|
|
|
|
undef( $xmlconv ); |
3852
|
|
|
|
|
|
|
|
3853
|
|
|
|
|
|
|
=head3 ClearCompoundWordBST |
3854
|
|
|
|
|
|
|
|
3855
|
|
|
|
|
|
|
Description: |
3856
|
|
|
|
|
|
|
|
3857
|
|
|
|
|
|
|
Clears/Un-defines existing compound word binary search tree from memory. |
3858
|
|
|
|
|
|
|
|
3859
|
|
|
|
|
|
|
Warning: This is a private function and should not be called or manipulated. |
3860
|
|
|
|
|
|
|
|
3861
|
|
|
|
|
|
|
Input: |
3862
|
|
|
|
|
|
|
|
3863
|
|
|
|
|
|
|
None |
3864
|
|
|
|
|
|
|
|
3865
|
|
|
|
|
|
|
Output: |
3866
|
|
|
|
|
|
|
|
3867
|
|
|
|
|
|
|
None |
3868
|
|
|
|
|
|
|
|
3869
|
|
|
|
|
|
|
Example: |
3870
|
|
|
|
|
|
|
|
3871
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
3872
|
|
|
|
|
|
|
|
3873
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
3874
|
|
|
|
|
|
|
$xmlconv->ClearCompoundWordBST(); |
3875
|
|
|
|
|
|
|
|
3876
|
|
|
|
|
|
|
undef( $xmlconv ); |
3877
|
|
|
|
|
|
|
|
3878
|
|
|
|
|
|
|
=head3 SetMaxCompoundWordLength |
3879
|
|
|
|
|
|
|
|
3880
|
|
|
|
|
|
|
Description: |
3881
|
|
|
|
|
|
|
|
3882
|
|
|
|
|
|
|
Sets member variable to passed integer parameter. Sets maximum number of compound words in a phrase for comparison. |
3883
|
|
|
|
|
|
|
|
3884
|
|
|
|
|
|
|
i.e. "medical campus of Virginia Commonwealth University" can be interpreted as a compound word of 6 words. |
3885
|
|
|
|
|
|
|
Setting this variable to 3 will only attempt compoundifying a maximum amount of three words. |
3886
|
|
|
|
|
|
|
The result would be "medical_campus_of Virginia commonwealth university" even though an exact representation |
3887
|
|
|
|
|
|
|
of this compounded string can exist. Setting this variable to 6 will result in compounding all six words if |
3888
|
|
|
|
|
|
|
they exist in the compound word array/bst. |
3889
|
|
|
|
|
|
|
|
3890
|
|
|
|
|
|
|
Warning: This is a private function and should not be called or manipulated. |
3891
|
|
|
|
|
|
|
|
3892
|
|
|
|
|
|
|
Input: |
3893
|
|
|
|
|
|
|
|
3894
|
|
|
|
|
|
|
$value -> Integer |
3895
|
|
|
|
|
|
|
|
3896
|
|
|
|
|
|
|
Output: |
3897
|
|
|
|
|
|
|
|
3898
|
|
|
|
|
|
|
None |
3899
|
|
|
|
|
|
|
|
3900
|
|
|
|
|
|
|
Example: |
3901
|
|
|
|
|
|
|
|
3902
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
3903
|
|
|
|
|
|
|
|
3904
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
3905
|
|
|
|
|
|
|
$xmlconv->SetMaxCompoundWordLength( 8 ); |
3906
|
|
|
|
|
|
|
|
3907
|
|
|
|
|
|
|
undef( $xmlconv ); |
3908
|
|
|
|
|
|
|
|
3909
|
|
|
|
|
|
|
=head3 SetOverwriteExistingFile |
3910
|
|
|
|
|
|
|
|
3911
|
|
|
|
|
|
|
Description: |
3912
|
|
|
|
|
|
|
|
3913
|
|
|
|
|
|
|
Sets member variable to passed integer parameter. Sets option to overwrite existing text corpus during compilation |
3914
|
|
|
|
|
|
|
if 1 or append to existing text corpus if 0. |
3915
|
|
|
|
|
|
|
|
3916
|
|
|
|
|
|
|
Input: |
3917
|
|
|
|
|
|
|
|
3918
|
|
|
|
|
|
|
$value -> '1' = Overwrite existing text corpus / '0' = Append to existing text corpus during compilation. |
3919
|
|
|
|
|
|
|
|
3920
|
|
|
|
|
|
|
Output: |
3921
|
|
|
|
|
|
|
|
3922
|
|
|
|
|
|
|
None |
3923
|
|
|
|
|
|
|
|
3924
|
|
|
|
|
|
|
Example: |
3925
|
|
|
|
|
|
|
|
3926
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
3927
|
|
|
|
|
|
|
|
3928
|
|
|
|
|
|
|
my $xmltow2v = Word2vec::Xmltow2v->new(); |
3929
|
|
|
|
|
|
|
$xmltow2v->SetOverwriteExistingFile( 1 ); |
3930
|
|
|
|
|
|
|
|
3931
|
|
|
|
|
|
|
undef( $xmltow2v ); |
3932
|
|
|
|
|
|
|
|
3933
|
|
|
|
|
|
|
=head2 Debug Functions |
3934
|
|
|
|
|
|
|
|
3935
|
|
|
|
|
|
|
=head3 GetTime |
3936
|
|
|
|
|
|
|
|
3937
|
|
|
|
|
|
|
Description: |
3938
|
|
|
|
|
|
|
|
3939
|
|
|
|
|
|
|
Returns current time string in "Hour:Minute:Second" format. |
3940
|
|
|
|
|
|
|
|
3941
|
|
|
|
|
|
|
Input: |
3942
|
|
|
|
|
|
|
|
3943
|
|
|
|
|
|
|
None |
3944
|
|
|
|
|
|
|
|
3945
|
|
|
|
|
|
|
Output: |
3946
|
|
|
|
|
|
|
|
3947
|
|
|
|
|
|
|
$string -> XX:XX:XX ("Hour:Minute:Second") |
3948
|
|
|
|
|
|
|
|
3949
|
|
|
|
|
|
|
Example: |
3950
|
|
|
|
|
|
|
|
3951
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
3952
|
|
|
|
|
|
|
|
3953
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
3954
|
|
|
|
|
|
|
my $time = $xmlconv->GetTime(); |
3955
|
|
|
|
|
|
|
|
3956
|
|
|
|
|
|
|
print( "Current Time: $time\n" ) if defined( $time ); |
3957
|
|
|
|
|
|
|
|
3958
|
|
|
|
|
|
|
undef( $xmlconv ); |
3959
|
|
|
|
|
|
|
|
3960
|
|
|
|
|
|
|
=head3 GetDate |
3961
|
|
|
|
|
|
|
|
3962
|
|
|
|
|
|
|
Description: |
3963
|
|
|
|
|
|
|
|
3964
|
|
|
|
|
|
|
Returns current month, day and year string in "Month/Day/Year" format. |
3965
|
|
|
|
|
|
|
|
3966
|
|
|
|
|
|
|
Input: |
3967
|
|
|
|
|
|
|
|
3968
|
|
|
|
|
|
|
None |
3969
|
|
|
|
|
|
|
|
3970
|
|
|
|
|
|
|
Output: |
3971
|
|
|
|
|
|
|
|
3972
|
|
|
|
|
|
|
$string -> XX/XX/XXXX ("Month/Day/Year") |
3973
|
|
|
|
|
|
|
|
3974
|
|
|
|
|
|
|
Example: |
3975
|
|
|
|
|
|
|
|
3976
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
3977
|
|
|
|
|
|
|
|
3978
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
3979
|
|
|
|
|
|
|
my $date = $xmlconv->GetDate(); |
3980
|
|
|
|
|
|
|
|
3981
|
|
|
|
|
|
|
print( "Current Date: $date\n" ) if defined( $date ); |
3982
|
|
|
|
|
|
|
|
3983
|
|
|
|
|
|
|
undef( $xmlconv ); |
3984
|
|
|
|
|
|
|
|
3985
|
|
|
|
|
|
|
=head3 WriteLog |
3986
|
|
|
|
|
|
|
|
3987
|
|
|
|
|
|
|
Description: |
3988
|
|
|
|
|
|
|
|
3989
|
|
|
|
|
|
|
Prints passed string parameter to the console, log file or both depending on user options. |
3990
|
|
|
|
|
|
|
|
3991
|
|
|
|
|
|
|
Note: printNewLine parameter prints a new line character following the string if the parameter |
3992
|
|
|
|
|
|
|
is undefined and does not if parameter is 0. |
3993
|
|
|
|
|
|
|
|
3994
|
|
|
|
|
|
|
Input: |
3995
|
|
|
|
|
|
|
|
3996
|
|
|
|
|
|
|
$string -> String to print to the console/log file. |
3997
|
|
|
|
|
|
|
$value -> 0 = Do not print newline character after string, all else prints new line character including 'undef'. |
3998
|
|
|
|
|
|
|
|
3999
|
|
|
|
|
|
|
Output: |
4000
|
|
|
|
|
|
|
|
4001
|
|
|
|
|
|
|
None |
4002
|
|
|
|
|
|
|
|
4003
|
|
|
|
|
|
|
Example: |
4004
|
|
|
|
|
|
|
|
4005
|
|
|
|
|
|
|
use Word2vec::Xmltow2v; |
4006
|
|
|
|
|
|
|
|
4007
|
|
|
|
|
|
|
my $xmlconv = Word2vec::Xmltow2v->new(); |
4008
|
|
|
|
|
|
|
$xmlconv->WriteLog( "Hello World" ); |
4009
|
|
|
|
|
|
|
|
4010
|
|
|
|
|
|
|
undef( $xmlconv ); |
4011
|
|
|
|
|
|
|
|
4012
|
|
|
|
|
|
|
=head1 Author |
4013
|
|
|
|
|
|
|
|
4014
|
|
|
|
|
|
|
Clint Cuffy, Virginia Commonwealth University |
4015
|
|
|
|
|
|
|
|
4016
|
|
|
|
|
|
|
=head1 COPYRIGHT |
4017
|
|
|
|
|
|
|
|
4018
|
|
|
|
|
|
|
Copyright (c) 2016 |
4019
|
|
|
|
|
|
|
|
4020
|
|
|
|
|
|
|
Bridget T McInnes, Virginia Commonwealth University |
4021
|
|
|
|
|
|
|
btmcinnes at vcu dot edu |
4022
|
|
|
|
|
|
|
|
4023
|
|
|
|
|
|
|
Clint Cuffy, Virginia Commonwealth University |
4024
|
|
|
|
|
|
|
cuffyca at vcu dot edu |
4025
|
|
|
|
|
|
|
|
4026
|
|
|
|
|
|
|
This program is free software; you can redistribute it and/or modify it |
4027
|
|
|
|
|
|
|
under the terms of the GNU General Public License as published by the Free |
4028
|
|
|
|
|
|
|
Software Foundation; either version 2 of the License, or (at your option) |
4029
|
|
|
|
|
|
|
any later version. |
4030
|
|
|
|
|
|
|
|
4031
|
|
|
|
|
|
|
This program is distributed in the hope that it will be useful, but WITHOUT |
4032
|
|
|
|
|
|
|
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
4033
|
|
|
|
|
|
|
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. |
4034
|
|
|
|
|
|
|
|
4035
|
|
|
|
|
|
|
You should have received a copy of the GNU General Public License along with |
4036
|
|
|
|
|
|
|
this program; if not, write to: |
4037
|
|
|
|
|
|
|
|
4038
|
|
|
|
|
|
|
The Free Software Foundation, Inc., |
4039
|
|
|
|
|
|
|
59 Temple Place - Suite 330, |
4040
|
|
|
|
|
|
|
Boston, MA 02111-1307, USA. |
4041
|
|
|
|
|
|
|
|
4042
|
|
|
|
|
|
|
=cut |