| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
## -*- Mode: CPerl -*- |
|
2
|
|
|
|
|
|
|
## File: DiaColloDB::Corpus::Filters.pm |
|
3
|
|
|
|
|
|
|
## Author: Bryan Jurish <moocow@cpan.org> |
|
4
|
|
|
|
|
|
|
## Description: collocation db, source corpus content filters |
|
5
|
|
|
|
|
|
|
|
|
6
|
|
|
|
|
|
|
package DiaColloDB::Corpus::Filters; |
|
7
|
1
|
|
|
1
|
|
7
|
use DiaColloDB::Persistent; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
24
|
|
|
8
|
1
|
|
|
1
|
|
4
|
use Exporter; |
|
|
1
|
|
|
|
|
3
|
|
|
|
1
|
|
|
|
|
14
|
|
|
9
|
1
|
|
|
1
|
|
76
|
use strict; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
710
|
|
|
10
|
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
##============================================================================== |
|
12
|
|
|
|
|
|
|
## Administrivia |
|
13
|
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
our @ISA = qw(Exporter DiaColloDB::Persistent); |
|
15
|
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
our @NAMES = qw(pgood pbad wgood wbad lgood lbad); |
|
17
|
|
|
|
|
|
|
our @FILES = map {$_."file"} @NAMES; |
|
18
|
|
|
|
|
|
|
our %EXPORT_TAGS = |
|
19
|
|
|
|
|
|
|
( |
|
20
|
|
|
|
|
|
|
'names' => [qw(@NAMES)], |
|
21
|
|
|
|
|
|
|
'defaults' => [map {uc($_)."_DEFAULT"} @NAMES], |
|
22
|
|
|
|
|
|
|
); |
|
23
|
|
|
|
|
|
|
$EXPORT_TAGS{all} = [@{$EXPORT_TAGS{names}},@{$EXPORT_TAGS{defaults}}]; |
|
24
|
|
|
|
|
|
|
our @EXPORT_OK = @{$EXPORT_TAGS{all}}; |
|
25
|
|
|
|
|
|
|
our @EXPORT = qw(); |
|
26
|
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
##============================================================================== |
|
28
|
|
|
|
|
|
|
## Defaults (formerly in DiaColloDB.pm) |
|
29
|
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
## $PGOOD_DEFAULT |
|
31
|
|
|
|
|
|
|
## + default positive pos regex for document parsing |
|
32
|
|
|
|
|
|
|
## + don't use qr// here, since Storable doesn't like pre-compiled Regexps |
|
33
|
|
|
|
|
|
|
our $PGOOD_DEFAULT = q/^(?:N|TRUNC|VV|ADJ)/; #ITJ|FM|XY |
|
34
|
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
## $PBAD_DEFAULT |
|
36
|
|
|
|
|
|
|
## + default negative pos regex for document parsing |
|
37
|
|
|
|
|
|
|
our $PBAD_DEFAULT = undef; |
|
38
|
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
## $WGOOD_DEFAULT |
|
40
|
|
|
|
|
|
|
## + default positive word regex for document parsing |
|
41
|
|
|
|
|
|
|
our $WGOOD_DEFAULT = q/[[:alpha:]]/; |
|
42
|
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
## $WBAD_DEFAULT |
|
44
|
|
|
|
|
|
|
## + default negative word regex for document parsing |
|
45
|
|
|
|
|
|
|
our $WBAD_DEFAULT = q/[\.]/; |
|
46
|
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
## $LGOOD_DEFAULT |
|
48
|
|
|
|
|
|
|
## + default positive lemma regex for document parsing |
|
49
|
|
|
|
|
|
|
our $LGOOD_DEFAULT = undef; |
|
50
|
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
## $LBAD_DEFAULT |
|
52
|
|
|
|
|
|
|
## + default negative lemma regex for document parsing |
|
53
|
|
|
|
|
|
|
our $LBAD_DEFAULT = undef; |
|
54
|
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
##============================================================================== |
|
56
|
|
|
|
|
|
|
## Methods |
|
57
|
|
|
|
|
|
|
|
|
58
|
|
|
|
|
|
|
## $filters = CLASS_OR_OBJECT->new(%opts) |
|
59
|
|
|
|
|
|
|
## + simple HASH-ref wrapping filters |
|
60
|
|
|
|
|
|
|
sub new { |
|
61
|
0
|
|
|
0
|
1
|
|
my $that = shift; |
|
62
|
|
|
|
|
|
|
my $filters = bless({ |
|
63
|
|
|
|
|
|
|
pgood => $PGOOD_DEFAULT, |
|
64
|
|
|
|
|
|
|
pbad => $PBAD_DEFAULT, |
|
65
|
|
|
|
|
|
|
wgood => $WGOOD_DEFAULT, |
|
66
|
|
|
|
|
|
|
wbad => $WBAD_DEFAULT, |
|
67
|
|
|
|
|
|
|
lgood => $LGOOD_DEFAULT, |
|
68
|
|
|
|
|
|
|
lbad => $LBAD_DEFAULT, |
|
69
|
0
|
|
0
|
|
|
|
(map {($_=>undef)} @FILES), |
|
|
0
|
|
|
|
|
|
|
|
70
|
|
|
|
|
|
|
@_, |
|
71
|
|
|
|
|
|
|
}, ref($that)||$that); |
|
72
|
0
|
|
|
|
|
|
return $filters; |
|
73
|
|
|
|
|
|
|
} |
|
74
|
|
|
|
|
|
|
|
|
75
|
|
|
|
|
|
|
## $filters = $CLASS_OR_OBJECT->null() |
|
76
|
|
|
|
|
|
|
sub null { |
|
77
|
0
|
|
|
0
|
1
|
|
my $that = shift; |
|
78
|
0
|
|
0
|
|
|
|
return bless({},ref($that)||$that) |
|
79
|
|
|
|
|
|
|
} |
|
80
|
|
|
|
|
|
|
|
|
81
|
|
|
|
|
|
|
## $filters = $filters->clear() |
|
82
|
|
|
|
|
|
|
sub clear { |
|
83
|
0
|
|
|
0
|
1
|
|
my $filters = shift; |
|
84
|
0
|
|
|
|
|
|
$_ = undef foreach (values %$filters); |
|
85
|
0
|
|
|
|
|
|
return $filters; |
|
86
|
|
|
|
|
|
|
} |
|
87
|
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
## $bool = $filters->empty() |
|
89
|
|
|
|
|
|
|
## + returns true iff all filters are undefined |
|
90
|
|
|
|
|
|
|
sub isnull { |
|
91
|
0
|
|
|
0
|
1
|
|
return !grep {$_} @{$_[0]}{@NAMES,@FILES}; |
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
92
|
|
|
|
|
|
|
} |
|
93
|
|
|
|
|
|
|
|
|
94
|
|
|
|
|
|
|
## $bool = $filters1->equal($filters2) |
|
95
|
|
|
|
|
|
|
## $bool = PACKAGE->equal($filters1,$filters2) |
|
96
|
|
|
|
|
|
|
## + returns true iff filters are equal |
|
97
|
|
|
|
|
|
|
sub equal { |
|
98
|
0
|
|
|
0
|
1
|
|
my $that = shift; |
|
99
|
0
|
0
|
0
|
|
|
|
my ($f1,$f2) = map {($_//{})} (@_ > 1 ? @_ : ($that,shift)); |
|
|
0
|
|
|
|
|
|
|
|
100
|
0
|
|
0
|
|
|
|
return !grep {($f1->{$_}//'') ne ($f2->{$_}//'')} @NAMES,@FILES; |
|
|
0
|
|
0
|
|
|
|
|
|
101
|
|
|
|
|
|
|
} |
|
102
|
|
|
|
|
|
|
|
|
103
|
|
|
|
|
|
|
## \%name2obj = $filters->compile() |
|
104
|
|
|
|
|
|
|
## \%name2obj = PACKAGE->compile(\%filters) |
|
105
|
|
|
|
|
|
|
## + returns HASH-ref of compiled filter regexes and (stop|go)-hashes |
|
106
|
|
|
|
|
|
|
## ${NAME} => $REGEX, |
|
107
|
|
|
|
|
|
|
## ${NAME}file => \%HASHREF, |
|
108
|
|
|
|
|
|
|
sub compile { |
|
109
|
0
|
|
|
0
|
1
|
|
my $that = shift; |
|
110
|
0
|
0
|
|
|
|
|
my $filters = @_ ? shift : $that; |
|
111
|
|
|
|
|
|
|
return { |
|
112
|
|
|
|
|
|
|
##-- compile: filter regexes |
|
113
|
0
|
|
|
|
|
|
(map {($_=>qr{$filters->{$_}})} grep {$filters->{$_}} @NAMES), |
|
|
0
|
|
|
|
|
|
|
|
114
|
|
|
|
|
|
|
|
|
115
|
|
|
|
|
|
|
##-- compile: filter list-files |
|
116
|
0
|
|
|
|
|
|
(map {($_=>$that->loadListFile($filters->{$_}))} grep {$filters->{$_}} @FILES), |
|
|
0
|
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
117
|
|
|
|
|
|
|
}; |
|
118
|
|
|
|
|
|
|
} |
|
119
|
|
|
|
|
|
|
|
|
120
|
|
|
|
|
|
|
## \%line2undef = $coldb->loadListFile($filename_or_undef) |
|
121
|
|
|
|
|
|
|
sub loadListFile { |
|
122
|
0
|
|
|
0
|
1
|
|
my ($that,$file) = @_; |
|
123
|
0
|
0
|
0
|
|
|
|
return undef if (($file//'') eq ''); |
|
124
|
0
|
0
|
|
|
|
|
CORE::open(my $fh,"<$file") |
|
125
|
|
|
|
|
|
|
or $that->logconfess("loadListFile(): open failed for '$file': $!"); |
|
126
|
0
|
|
|
|
|
|
my $h = {}; |
|
127
|
0
|
|
|
|
|
|
while (defined($_=<$fh>)) { |
|
128
|
0
|
|
|
|
|
|
chomp; |
|
129
|
0
|
0
|
|
|
|
|
next if (/^\s*(?:\#.*)$/); ##-- skip comments and blank lines |
|
130
|
0
|
|
|
|
|
|
$h->{$_} = undef; |
|
131
|
|
|
|
|
|
|
} |
|
132
|
0
|
|
|
|
|
|
CORE::close($file); |
|
133
|
0
|
|
|
|
|
|
return $h; |
|
134
|
|
|
|
|
|
|
} |
|
135
|
|
|
|
|
|
|
|
|
136
|
|
|
|
|
|
|
|
|
137
|
|
|
|
|
|
|
##============================================================================== |
|
138
|
|
|
|
|
|
|
## Footer |
|
139
|
|
|
|
|
|
|
1; |
|
140
|
|
|
|
|
|
|
|
|
141
|
|
|
|
|
|
|
__END__ |
|
142
|
|
|
|
|
|
|
|
|
143
|
|
|
|
|
|
|
|
|
144
|
|
|
|
|
|
|
|
|
145
|
|
|
|
|
|
|
|