line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
## -*- Mode: CPerl -*- |
2
|
|
|
|
|
|
|
## File: DiaColloDB::Corpus::Filters.pm |
3
|
|
|
|
|
|
|
## Author: Bryan Jurish <moocow@cpan.org> |
4
|
|
|
|
|
|
|
## Description: collocation db, source corpus content filters |
5
|
|
|
|
|
|
|
|
6
|
|
|
|
|
|
|
package DiaColloDB::Corpus::Filters; |
7
|
1
|
|
|
1
|
|
8
|
use DiaColloDB::Persistent; |
|
1
|
|
|
|
|
5
|
|
|
1
|
|
|
|
|
33
|
|
8
|
1
|
|
|
1
|
|
7
|
use Exporter; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
38
|
|
9
|
1
|
|
|
1
|
|
131
|
use strict; |
|
1
|
|
|
|
|
21
|
|
|
1
|
|
|
|
|
937
|
|
10
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
##============================================================================== |
12
|
|
|
|
|
|
|
## Administrivia |
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
our @ISA = qw(Exporter DiaColloDB::Persistent); |
15
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
our @NAMES = qw(pgood pbad wgood wbad lgood lbad); |
17
|
|
|
|
|
|
|
our @FILES = map {$_."file"} @NAMES; |
18
|
|
|
|
|
|
|
our %EXPORT_TAGS = |
19
|
|
|
|
|
|
|
( |
20
|
|
|
|
|
|
|
'names' => [qw(@NAMES)], |
21
|
|
|
|
|
|
|
'defaults' => [map {uc($_)."_DEFAULT"} @NAMES], |
22
|
|
|
|
|
|
|
); |
23
|
|
|
|
|
|
|
$EXPORT_TAGS{all} = [@{$EXPORT_TAGS{names}},@{$EXPORT_TAGS{defaults}}]; |
24
|
|
|
|
|
|
|
our @EXPORT_OK = @{$EXPORT_TAGS{all}}; |
25
|
|
|
|
|
|
|
our @EXPORT = qw(); |
26
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
##============================================================================== |
28
|
|
|
|
|
|
|
## Defaults (formerly in DiaColloDB.pm) |
29
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
## $PGOOD_DEFAULT |
31
|
|
|
|
|
|
|
## + default positive pos regex for document parsing |
32
|
|
|
|
|
|
|
## + don't use qr// here, since Storable doesn't like pre-compiled Regexps |
33
|
|
|
|
|
|
|
our $PGOOD_DEFAULT = q/^(?:N|TRUNC|VV|ADJ)/; #ITJ|FM|XY |
34
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
## $PBAD_DEFAULT |
36
|
|
|
|
|
|
|
## + default negative pos regex for document parsing |
37
|
|
|
|
|
|
|
our $PBAD_DEFAULT = undef; |
38
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
## $WGOOD_DEFAULT |
40
|
|
|
|
|
|
|
## + default positive word regex for document parsing |
41
|
|
|
|
|
|
|
our $WGOOD_DEFAULT = q/[[:alpha:]]/; |
42
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
## $WBAD_DEFAULT |
44
|
|
|
|
|
|
|
## + default negative word regex for document parsing |
45
|
|
|
|
|
|
|
our $WBAD_DEFAULT = q/[\.]/; |
46
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
## $LGOOD_DEFAULT |
48
|
|
|
|
|
|
|
## + default positive lemma regex for document parsing |
49
|
|
|
|
|
|
|
our $LGOOD_DEFAULT = undef; |
50
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
## $LBAD_DEFAULT |
52
|
|
|
|
|
|
|
## + default negative lemma regex for document parsing |
53
|
|
|
|
|
|
|
our $LBAD_DEFAULT = undef; |
54
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
##============================================================================== |
56
|
|
|
|
|
|
|
## Methods |
57
|
|
|
|
|
|
|
|
58
|
|
|
|
|
|
|
## $filters = CLASS_OR_OBJECT->new(%opts) |
59
|
|
|
|
|
|
|
## + simple HASH-ref wrapping filters |
60
|
|
|
|
|
|
|
sub new { |
61
|
0
|
|
|
0
|
1
|
|
my $that = shift; |
62
|
|
|
|
|
|
|
my $filters = bless({ |
63
|
|
|
|
|
|
|
pgood => $PGOOD_DEFAULT, |
64
|
|
|
|
|
|
|
pbad => $PBAD_DEFAULT, |
65
|
|
|
|
|
|
|
wgood => $WGOOD_DEFAULT, |
66
|
|
|
|
|
|
|
wbad => $WBAD_DEFAULT, |
67
|
|
|
|
|
|
|
lgood => $LGOOD_DEFAULT, |
68
|
|
|
|
|
|
|
lbad => $LBAD_DEFAULT, |
69
|
0
|
|
0
|
|
|
|
(map {($_=>undef)} @FILES), |
|
0
|
|
|
|
|
|
|
70
|
|
|
|
|
|
|
@_, |
71
|
|
|
|
|
|
|
}, ref($that)||$that); |
72
|
0
|
|
|
|
|
|
return $filters; |
73
|
|
|
|
|
|
|
} |
74
|
|
|
|
|
|
|
|
75
|
|
|
|
|
|
|
## $filters = $CLASS_OR_OBJECT->null() |
76
|
|
|
|
|
|
|
sub null { |
77
|
0
|
|
|
0
|
1
|
|
my $that = shift; |
78
|
0
|
|
0
|
|
|
|
return bless({},ref($that)||$that) |
79
|
|
|
|
|
|
|
} |
80
|
|
|
|
|
|
|
|
81
|
|
|
|
|
|
|
## $filters = $filters->clear() |
82
|
|
|
|
|
|
|
sub clear { |
83
|
0
|
|
|
0
|
1
|
|
my $filters = shift; |
84
|
0
|
|
|
|
|
|
$_ = undef foreach (values %$filters); |
85
|
0
|
|
|
|
|
|
return $filters; |
86
|
|
|
|
|
|
|
} |
87
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
## $bool = $filters->empty() |
89
|
|
|
|
|
|
|
## + returns true iff all filters are undefined |
90
|
|
|
|
|
|
|
sub isnull { |
91
|
0
|
|
|
0
|
1
|
|
return !grep {$_} @{$_[0]}{@NAMES,@FILES}; |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
92
|
|
|
|
|
|
|
} |
93
|
|
|
|
|
|
|
|
94
|
|
|
|
|
|
|
## $bool = $filters1->equal($filters2) |
95
|
|
|
|
|
|
|
## $bool = PACKAGE->equal($filters1,$filters2) |
96
|
|
|
|
|
|
|
## + returns true iff filters are equal |
97
|
|
|
|
|
|
|
sub equal { |
98
|
0
|
|
|
0
|
1
|
|
my $that = shift; |
99
|
0
|
0
|
0
|
|
|
|
my ($f1,$f2) = map {($_//{})} (@_ > 1 ? @_ : ($that,shift)); |
|
0
|
|
|
|
|
|
|
100
|
0
|
|
0
|
|
|
|
return !grep {($f1->{$_}//'') ne ($f2->{$_}//'')} @NAMES,@FILES; |
|
0
|
|
0
|
|
|
|
|
101
|
|
|
|
|
|
|
} |
102
|
|
|
|
|
|
|
|
103
|
|
|
|
|
|
|
## \%name2obj = $filters->compile() |
104
|
|
|
|
|
|
|
## \%name2obj = PACKAGE->compile(\%filters) |
105
|
|
|
|
|
|
|
## + returns HASH-ref of compiled filter regexes and (stop|go)-hashes |
106
|
|
|
|
|
|
|
## ${NAME} => $REGEX, |
107
|
|
|
|
|
|
|
## ${NAME}file => \%HASHREF, |
108
|
|
|
|
|
|
|
sub compile { |
109
|
0
|
|
|
0
|
1
|
|
my $that = shift; |
110
|
0
|
0
|
|
|
|
|
my $filters = @_ ? shift : $that; |
111
|
|
|
|
|
|
|
return { |
112
|
|
|
|
|
|
|
##-- compile: filter regexes |
113
|
0
|
|
|
|
|
|
(map {($_=>qr{$filters->{$_}})} grep {$filters->{$_}} @NAMES), |
|
0
|
|
|
|
|
|
|
114
|
|
|
|
|
|
|
|
115
|
|
|
|
|
|
|
##-- compile: filter list-files |
116
|
0
|
|
|
|
|
|
(map {($_=>$that->loadListFile($filters->{$_}))} grep {$filters->{$_}} @FILES), |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
117
|
|
|
|
|
|
|
}; |
118
|
|
|
|
|
|
|
} |
119
|
|
|
|
|
|
|
|
120
|
|
|
|
|
|
|
## \%line2undef = $coldb->loadListFile($filename_or_undef) |
121
|
|
|
|
|
|
|
sub loadListFile { |
122
|
0
|
|
|
0
|
1
|
|
my ($that,$file) = @_; |
123
|
0
|
0
|
0
|
|
|
|
return undef if (($file//'') eq ''); |
124
|
0
|
0
|
|
|
|
|
CORE::open(my $fh,"<$file") |
125
|
|
|
|
|
|
|
or $that->logconfess("loadListFile(): open failed for '$file': $!"); |
126
|
0
|
|
|
|
|
|
my $h = {}; |
127
|
0
|
|
|
|
|
|
while (defined($_=<$fh>)) { |
128
|
0
|
|
|
|
|
|
chomp; |
129
|
0
|
0
|
|
|
|
|
next if (/^\s*(?:\#.*)$/); ##-- skip comments and blank lines |
130
|
0
|
|
|
|
|
|
$h->{$_} = undef; |
131
|
|
|
|
|
|
|
} |
132
|
0
|
|
|
|
|
|
CORE::close($file); |
133
|
0
|
|
|
|
|
|
return $h; |
134
|
|
|
|
|
|
|
} |
135
|
|
|
|
|
|
|
|
136
|
|
|
|
|
|
|
|
137
|
|
|
|
|
|
|
##============================================================================== |
138
|
|
|
|
|
|
|
## Footer |
139
|
|
|
|
|
|
|
1; |
140
|
|
|
|
|
|
|
|
141
|
|
|
|
|
|
|
__END__ |
142
|
|
|
|
|
|
|
|
143
|
|
|
|
|
|
|
|
144
|
|
|
|
|
|
|
|
145
|
|
|
|
|
|
|
|