line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Text::Mining::Base; |
2
|
6
|
|
|
6
|
|
54497
|
use Class::Std; |
|
6
|
|
|
|
|
113922
|
|
|
6
|
|
|
|
|
42
|
|
3
|
6
|
|
|
6
|
|
6929
|
use Class::Std::Utils; |
|
6
|
|
|
|
|
41756
|
|
|
6
|
|
|
|
|
42
|
|
4
|
6
|
|
|
6
|
|
3554
|
use DBIx::MySperqlOO; |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
5
|
|
|
|
|
|
|
use File::Spec; |
6
|
|
|
|
|
|
|
use YAML qw(DumpFile LoadFile); |
7
|
|
|
|
|
|
|
use Module::Runtime qw(use_module); |
8
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
use warnings; |
10
|
|
|
|
|
|
|
use strict; |
11
|
|
|
|
|
|
|
use Carp; |
12
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
use version; our $VERSION = qv('0.0.8'); |
14
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
# Names of the per-user configuration and status files.  Both are joined
# onto $ENV{HOME} by get_config_filename() / get_status_filename().
our ( $config_filename, $status_filename ) = ( '.corpus/config', '.tm-status' );
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
{ |
19
|
|
|
|
|
|
|
# Inside-out attribute storage (Class::Std), keyed by object ident.
# All four hashes are filled in by BUILD from the loaded configuration.
my %library_dbh_of  : ATTR();    # handle built from the 'library' config entry
my %analysis_dbh_of : ATTR();    # handle built from the 'analysis' config entry
my %root_dir_of     : ATTR();    # 'root_dir' config entry
my %root_url_of     : ATTR();    # 'root_url' config entry
23
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
# Accessor: returns the entry of %library_dbh_of belonging to this object.
sub library {
    my $self = shift;
    return $library_dbh_of{ ident $self };
}
25
|
|
|
|
|
|
|
# Accessor: returns the entry of %analysis_dbh_of belonging to this object.
sub analysis {
    my $self = shift;
    return $analysis_dbh_of{ ident $self };
}
26
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
# Accessor: returns the root URL stored for this object.
sub get_root_url {
    my $self = shift;
    return $root_url_of{ ident $self };
}
28
|
|
|
|
|
|
|
# Accessor: returns the root directory stored for this object.
sub get_root_dir {
    my $self = shift;
    return $root_dir_of{ ident $self };
}
29
|
|
|
|
|
|
|
# Builds the document directory path for one corpus:
#   <root_dir>/documents/corpus_<corpus_id>
# The path is assembled with literal '/' separators; nothing is created
# or checked on disk.
sub get_data_dir {
    my ( $self, $corpus_id ) = @_;
    my $root = $self->get_root_dir();
    return join '/', $root, 'documents', "corpus_$corpus_id";
}
30
|
|
|
|
|
|
|
# Returns the path of the configuration file: $config_filename joined
# onto $ENV{HOME} with File::Spec.  Arguments are ignored.
sub get_config_filename {
    my @parts = ( $ENV{HOME}, $config_filename );
    return File::Spec->catfile(@parts);
}
31
|
|
|
|
|
|
|
# Returns the path of the status log: $status_filename joined onto
# $ENV{HOME} with File::Spec.  Arguments are ignored.
sub get_status_filename {
    my @parts = ( $ENV{HOME}, $status_filename );
    return File::Spec->catfile(@parts);
}
32
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
# Class::Std initializer.  Loads the configuration via _load_config() and
# fills the four attribute hashes for the new object:
#   root_dir / root_url  - copied straight from the configuration
#   library  / analysis  - DBIx::MySperqlOO objects built from the
#                          configuration entries of the same name
# $arg_ref is accepted (Class::Std passes it) but not used.
sub BUILD {
    my ( $self, $ident, $arg_ref ) = @_;

    my $settings = $self->_load_config();

    # Plain values first ...
    $root_dir_of{$ident} = $settings->{root_dir};
    $root_url_of{$ident} = $settings->{root_url};

    # ... then one database object per configured connection.
    my $library_db = DBIx::MySperqlOO->new( $settings->{library} );
    $library_dbh_of{$ident} = $library_db;

    my $analysis_db = DBIx::MySperqlOO->new( $settings->{analysis} );
    $analysis_dbh_of{$ident} = $analysis_db;

    return;
}
46
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
# Looks up a corpus id by name in the library database.
#
#   $arg_ref->{corpus_name}  name to look up (undef is treated as '')
#
# Returns the first value produced by sqlexec() for the query, or undef
# when nothing comes back.
#
# The name is caller-supplied, so it is passed through _sql_escape()
# before being placed inside the quoted SQL literal; previously it was
# concatenated verbatim, which allowed SQL injection.
sub get_corpus_id_from_name {
    my ( $self, $arg_ref ) = @_;

    my $corpus_name = defined $arg_ref->{corpus_name} ? $arg_ref->{corpus_name} : '';
    my $quoted_name = $self->_sql_escape($corpus_name);

    my $sql = "select corpus_id from corpuses where corpus_name = '" . $quoted_name . "'";
    my ($corpus_id) = $self->library()->sqlexec( $sql, '@' );
    return $corpus_id;
}
53
|
|
|
|
|
|
|
|
54
|
|
|
|
|
|
|
# Reads the YAML configuration file named by get_config_filename() and
# returns whatever YAML's LoadFile() yields for it.
sub _load_config {
    my $self = shift;
    my $config_path = $self->get_config_filename();
    return LoadFile($config_path);
}
58
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
# Returns the last '/'-separated component of $url (undef when there is
# none).  Trailing slashes are ignored because split drops trailing
# empty fields.
sub _parse_file_name {
    my ( $self, $url ) = @_;
    my @segments = split m{/}, $url;
    return $segments[-1];
}
64
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
# Downloads one URL into a directory with wget and reports the size of
# the resulting file.
#
#   $arg_ref->{target_dir}  directory to download into (required)
#   $arg_ref->{url}         URL to fetch (required)
#   $arg_ref->{tries}       value for wget --tries (default 2)
#
# Returns the size in bytes of "<target_dir>/<last URL component>", or
# '0' when an argument is missing or no such file can be stat'ed.
#
# wget is run through list-form system(), so the URL and directory are
# passed as single arguments and never seen by a shell.  The previous
# backtick form interpolated both into a shell command line, which
# allowed command injection and broke URLs containing '&', '?' or ';'.
sub _download_file {
    my ( $self, $arg_ref ) = @_;

    my $target_dir = defined $arg_ref->{target_dir} ? $arg_ref->{target_dir} : '';
    my $url        = defined $arg_ref->{url}        ? $arg_ref->{url}        : '';
    my $tries      = defined $arg_ref->{tries}      ? $arg_ref->{tries}      : 2;

    return '0' unless $target_dir && $url;

    my $file_name = $self->_parse_file_name($url);

    # '--' ends option parsing so a URL starting with '-' is not read as
    # an option.  The exit status is deliberately not inspected: as
    # before, success is judged from the file size below.
    system( 'wget', "--tries=$tries", "--directory-prefix=$target_dir", '--', $url );

    my @stat = stat("$target_dir/$file_name");
    return $stat[7] || '0';
}
79
|
|
|
|
|
|
|
|
80
|
|
|
|
|
|
|
# Backslash-escapes every single quote, double quote and backslash in
# $string.  False values (undef, '', '0') are returned untouched.
sub _sql_escape {
    my ( $self, $string ) = @_;
    return $string unless $string;
    $string =~ s{ ( ['"\\] ) }{\\$1}gx;
    return $string;
}
85
|
|
|
|
|
|
|
|
86
|
|
|
|
|
|
|
# Converts text to a form for use in SQL: first _html_unescape(), then
# _sql_escape() on the result.
sub _html_to_sql {
    my ( $self, $string ) = @_;
    my $plain   = $self->_html_unescape($string);
    my $escaped = $self->_sql_escape($plain);
    return $escaped;
}
92
|
|
|
|
|
|
|
|
93
|
|
|
|
|
|
|
# Replaces quote characters in $string with HTML character references:
#   '  ->  &#39;
#   "  ->  &quot;
# Only the two quote characters are handled; '&', '<' and '>' are left
# as they are.  undef is returned unchanged.
#
# NOTE(review): the visible source replaced each quote with itself (a
# no-op), presumably because the entities were decoded somewhere along
# the way - confirm the intended entity for the apostrophe.
sub _html_escape {
    my ( $self, $string ) = @_;
    return $string unless defined $string;
    $string =~ s/'/&#39;/g;
    $string =~ s/"/&quot;/g;
    return $string;
}
99
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
# Percent-encodes four characters in $string: space, single quote and
# the two curly braces.  All other characters pass through unchanged.
sub _html_encode {
    my ( $self, $string ) = @_;
    my %code_for = (
        q{ } => '%20',
        q{'} => '%27',
        '{'  => '%7B',
        '}'  => '%7D',
    );
    $string =~ s/([ '{}])/$code_for{$1}/g;
    return $string;
}
108
|
|
|
|
|
|
|
|
109
|
|
|
|
|
|
|
# Reverses the quote escaping and the space percent-encoding:
#   &#39; or &apos;  ->  '
#   &quot;           ->  "
#   %20              ->  space
# undef is returned unchanged.
#
# NOTE(review): the visible source replaced each quote with itself (a
# no-op), presumably because the entities were decoded somewhere along
# the way - confirm the entity spellings expected in stored data.
sub _html_unescape {
    my ( $self, $string ) = @_;
    return $string unless defined $string;
    $string =~ s/&(?:#39|apos);/'/g;
    $string =~ s/&quot;/"/g;
    $string =~ s/%20/ /g;
    return $string;
}
116
|
|
|
|
|
|
|
|
117
|
|
|
|
|
|
|
# Rewrites the first run of ten digits in $string as "(AAA) BBB-CCCC".
# Strings without ten consecutive digits are returned unchanged.
sub _phone_format {
    my ( $self, $string ) = @_;
    $string =~ s{ (\d{3}) (\d{3}) (\d{4}) }{($1) $2-$3}x;
    return $string;
}
122
|
|
|
|
|
|
|
|
123
|
|
|
|
|
|
|
# Strips every non-digit character from $string and returns what is left.
sub _phone_unformat {
    my ( $self, $string ) = @_;
    ( my $digits = $string ) =~ s/\D//g;
    return $digits;
}
128
|
|
|
|
|
|
|
|
129
|
|
|
|
|
|
|
# Inserts thousands separators into the integer part of a number
# (Perl Cookbook 2.17).  The string is reversed so the digits can be
# grouped from the right; the negative lookahead keeps digits that sit
# after a decimal point (before it, once reversed) from being grouped.
sub _commify {
    my ( $self, $string ) = @_;
    my $backwards = reverse $string;
    $backwards =~ s{ (\d\d\d) (?=\d) (?!\d*\.) }{$1,}gx;
    return scalar reverse $backwards;
}
135
|
|
|
|
|
|
|
|
136
|
|
|
|
|
|
|
# Lists the non-directory entries of $root_dir.
#
# Returns the bare entry names (no path, in readdir order).  Entries
# whose name starts with '.' and entries that cannot be stat'ed are
# skipped.  Returns 0 when the directory cannot be opened.
#
# Changes from the earlier version: a lexical directory handle that is
# closed again (the bareword DIR handle was left open), and the -d file
# test instead of checking whether the decimal st_mode starts with "1".
sub _get_files {
    my ( $self, $root_dir ) = @_;

    opendir( my $dh, $root_dir ) or return 0;
    my @nodes = readdir $dh;
    closedir $dh;

    my @files;
    foreach my $node (@nodes) {
        next if $node =~ m/^\./;

        my $pathnode = $root_dir . '/' . $node;
        next unless -e $pathnode;    # unreadable / vanished entry

        # '_' reuses the stat done by -e above.
        push @files, $node unless -d _;
    }
    return @files;
}
158
|
|
|
|
|
|
|
|
159
|
|
|
|
|
|
|
# Lists the sub-directories of $path.
#
#   $path        directory to scan
#   $nestedflag  when true, descend into every sub-directory as well
#
# Returns full paths ("$path/$name"), sorted by name at each level, with
# the contents of a directory listed directly after the directory
# itself.  Entries whose name starts with '.' are skipped.  Returns 0
# when $path cannot be opened.
#
# Changes from the earlier version:
#   * recursion goes through $self->_get_dirs(); the old code called
#     &GetDirs, a name not defined in the visible source
#   * a lexical directory handle, closed before recursing, replaces the
#     global DIR handle that was shared by every level and never closed
#   * -d replaces the "decimal st_mode starts with 1" test, which also
#     warned when stat failed
sub _get_dirs {
    my ( $self, $path, $nestedflag ) = @_;

    opendir( my $dh, $path ) or return 0;
    my @nodes = readdir $dh;
    closedir $dh;
    @nodes = sort @nodes;

    my @dirs;
    foreach my $node (@nodes) {

        # Drop any dirs (or files) that start with a period
        next if $node =~ m/^\./;

        my $newpath = $path . '/' . $node;
        next unless -d $newpath;

        push @dirs, $newpath;

        if ($nestedflag) {

            # An unreadable sub-directory yields the failure value 0;
            # real paths always contain '/', so filtering on truth
            # keeps that marker out of the result.
            my @subnodes = grep { $_ } $self->_get_dirs( $newpath, $nestedflag );
            push @dirs, @subnodes;
        }
    }
    return @dirs;
}
191
|
|
|
|
|
|
|
|
192
|
|
|
|
|
|
|
# Reads a whole file into one string.
#
# Returns the file contents, or undef when the file does not exist,
# cannot be opened, or is empty.  Open and close failures are recorded
# through _status().
#
# Unlike the earlier version, a failed open now returns straight away
# instead of going on to read from and close a handle that was never
# opened.
sub _get_file_text {
    my ( $self, $path_file_name ) = @_;

    my $text;
    return $text unless -e $path_file_name;

    my $IN;
    if ( !open( $IN, '<', $path_file_name ) ) {
        $self->_status("(Get) Cannot open $path_file_name: $!");
        return $text;
    }

    while ( my $line = <$IN> ) {
        $text .= $line;
    }

    close($IN) or $self->_status("(Get) Cannot close $path_file_name: $!");
    return $text;
}
202
|
|
|
|
|
|
|
|
203
|
|
|
|
|
|
|
# Writes $text to $path_file_name, replacing any previous contents.
#
# Returns 1 when the text was written and the file closed cleanly, and
# a false value otherwise.  Failures are recorded through _status().
# An undefined $text is written as the empty string.
#
# The earlier version wrote "print {$OUT} $text || ...", where "||"
# binds to $text rather than to print: write errors were never seen,
# and a false $text such as "0" was replaced by the result of the error
# handler.  The print result is now checked explicitly, and nothing is
# written after a failed open.
sub _set_file_text {
    my ( $self, $path_file_name, $text ) = @_;

    my $OUT;
    if ( !open( $OUT, '>', $path_file_name ) ) {
        $self->_status("(Set) Cannot open $path_file_name: $!");
        return;
    }

    my $payload = defined $text ? $text : '';
    my $written = print {$OUT} $payload;
    $self->_status("(Set) Cannot write $path_file_name: $!") unless $written;

    # Buffered write errors can surface only at close time.
    my $closed = close($OUT);
    $self->_status("(Set) Cannot close $path_file_name: $!") unless $closed;

    return unless $written && $closed;
    return 1;
}
209
|
|
|
|
|
|
|
|
210
|
|
|
|
|
|
|
# Appends $text to $path_file_name (the file is created if missing).
#
# Returns 1 when the text was written and the file closed cleanly, and
# a false value otherwise.  Failures are recorded through _status().
# An undefined $text is written as the empty string.
#
# The earlier version wrote "print {$OUT} $text || ...", where "||"
# binds to $text rather than to print: write errors were never seen,
# and a false $text such as "0" was replaced by the result of the error
# handler.  The print result is now checked explicitly, and nothing is
# written after a failed open.
sub _add_file_text {
    my ( $self, $path_file_name, $text ) = @_;

    my $OUT;
    if ( !open( $OUT, '>>', $path_file_name ) ) {
        $self->_status("(Add) Cannot open $path_file_name: $!");
        return;
    }

    my $payload = defined $text ? $text : '';
    my $written = print {$OUT} $payload;
    $self->_status("(Add) Cannot write $path_file_name: $!") unless $written;

    # Buffered write errors can surface only at close time.
    my $closed = close($OUT);
    $self->_status("(Add) Cannot close $path_file_name: $!") unless $closed;

    return unless $written && $closed;
    return 1;
}
216
|
|
|
|
|
|
|
|
217
|
|
|
|
|
|
|
# Appends one line of the form " STATUS: <msg> \n" to the status file
# named by get_status_filename().
#
# Returns nothing.  Croaks when the file cannot be opened, written or
# closed.
#
# The checks use the low-precedence "or".  With "||" the write check was
# attached to the message string, which is always true, so a failed
# print could never be reported.
sub _status {
    my ( $self, $msg ) = @_;

    my $status_file = $self->get_status_filename();

    open( my $OUT, '>>', $status_file )
        or croak("(Status) Cannot open $status_file: $!");
    print {$OUT} " STATUS: $msg \n"
        or croak("(Status) Cannot write $status_file: $!");
    close($OUT)
        or croak("(Status) Cannot close $status_file: $!");

    return;
}
225
|
|
|
|
|
|
|
} |
226
|
|
|
|
|
|
|
|
227
|
|
|
|
|
|
|
1; # Magic true value required at end of module |
228
|
|
|
|
|
|
|
__END__ |