line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
## -*- Mode: CPerl -*- |
2
|
|
|
|
|
|
|
## File: DiaColloDB::Document::DDCTabs.pm |
3
|
|
|
|
|
|
|
## Author: Bryan Jurish <moocow@cpan.org> |
4
|
|
|
|
|
|
|
## Description: collocation db, source document, DDC tab-dump |
5
|
|
|
|
|
|
|
|
6
|
|
|
|
|
|
|
package DiaColloDB::Document::DDCTabs; |
7
|
1
|
|
|
1
|
|
7
|
use DiaColloDB::Document; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
30
|
|
8
|
1
|
|
|
1
|
|
5
|
use IO::File; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
35
|
|
9
|
1
|
|
|
1
|
|
187
|
use strict; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
1133
|
|
10
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
##============================================================================== |
12
|
|
|
|
|
|
|
## Globals & Constants |
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
our @ISA = qw(DiaColloDB::Document); |
15
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
##============================================================================== |
17
|
|
|
|
|
|
|
## Constructors etc. |
18
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
## $doc = CLASS_OR_OBJECT->new(%args) |
20
|
|
|
|
|
|
|
## + %args, object structure: |
21
|
|
|
|
|
|
|
## ( |
22
|
|
|
|
|
|
|
## ##-- parsing options |
23
|
|
|
|
|
|
|
## eosre => $re, ##-- EOS regex (empty or undef for file-breaks only; default='^$') |
24
|
|
|
|
|
|
|
## utf8 => $bool, ##-- enable utf8 parsing? (default=1) |
25
|
|
|
|
|
|
|
## trimPND => $bool, ##-- create trimmed "pnd" meta-attribute? (default=1) |
26
|
|
|
|
|
|
|
## trimAuthor => $bool, ##-- trim "author" meta-attribute (eliminate DTA PNDs)? (default=1) |
27
|
|
|
|
|
|
|
## trimGenre => $bool, ##-- create trimmed "genre" meta-attribute? (default=1) |
28
|
|
|
|
|
|
|
## foreign => $bool, ##-- alias for trimAuthor=0 trimPND=0 trimGenre=0 |
29
|
|
|
|
|
|
|
## ## |
30
|
|
|
|
|
|
|
## ##-- document data |
31
|
|
|
|
|
|
|
## date =>$date, ##-- year |
32
|
|
|
|
|
|
|
## wf =>$iw, ##-- index-field for $word attribute (default=0) |
33
|
|
|
|
|
|
|
## pf =>$ip, ##-- index-field for $pos attribute (default=1) |
34
|
|
|
|
|
|
|
## lf =>$il, ##-- index-field for $lemma attribute (default=2) |
35
|
|
|
|
|
|
|
## pagef =>$ipage, ##-- index-field for $page attribute (default=undef:none) |
36
|
|
|
|
|
|
|
## tokens =>\@tokens, ##-- tokens, including undef for EOS |
37
|
|
|
|
|
|
|
## meta =>\%meta, ##-- document metadata (e.g. author, title, collection, ...) |
38
|
|
|
|
|
|
|
## ## + may also generate special $meta->{genre} as 1st component of $meta->{textClass} if available |
39
|
|
|
|
|
|
|
## ) |
40
|
|
|
|
|
|
|
## + each token in @tokens is a HASH-ref {w=>$word,p=>$pos,l=>$lemma,...} |
41
|
|
|
|
|
|
|
## + default attribute positions ($iw,$ip,$il,$ipage) are overridden doc lines '%%$DDC:index[INDEX]=LONGNAME w' etc if present |
42
|
|
|
|
|
|
|
sub new { |
43
|
0
|
|
|
0
|
1
|
|
my $that = shift; |
44
|
0
|
|
|
|
|
|
my $doc = $that->SUPER::new( |
45
|
|
|
|
|
|
|
utf8=>1, |
46
|
|
|
|
|
|
|
trimPND=>1, |
47
|
|
|
|
|
|
|
trimAuthor=>1, |
48
|
|
|
|
|
|
|
trimGenre=>1, |
49
|
|
|
|
|
|
|
eosre=>qr{^$}, |
50
|
|
|
|
|
|
|
wf=>0, |
51
|
|
|
|
|
|
|
pf=>1, |
52
|
|
|
|
|
|
|
lf=>2, |
53
|
|
|
|
|
|
|
pagef=>undef, |
54
|
|
|
|
|
|
|
@_, ##-- user arguments |
55
|
|
|
|
|
|
|
); |
56
|
0
|
|
|
|
|
|
return $doc; |
57
|
|
|
|
|
|
|
} |
58
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
##============================================================================== |
60
|
|
|
|
|
|
|
## API: I/O |
61
|
|
|
|
|
|
|
|
62
|
|
|
|
|
|
|
## $ext = $doc->extension() |
63
|
|
|
|
|
|
|
## + default extension, for Corpus::Compiled |
64
|
|
|
|
|
|
|
sub extension { |
65
|
0
|
|
|
0
|
0
|
|
return '.tabs'; |
66
|
|
|
|
|
|
|
} |
67
|
|
|
|
|
|
|
|
68
|
|
|
|
|
|
|
##-------------------------------------------------------------- |
69
|
|
|
|
|
|
|
## API: I/O: parse |
70
|
|
|
|
|
|
|
|
71
|
|
|
|
|
|
|
## $bool = $doc->fromFile($filename_or_fh, %opts) |
72
|
|
|
|
|
|
|
## + parse tokens from $filename_or_fh |
73
|
|
|
|
|
|
|
## + %opts : clobbers %$doc |
74
|
|
|
|
|
|
|
sub fromFile { |
75
|
0
|
|
|
0
|
1
|
|
my ($doc,$file,%opts) = @_; |
76
|
0
|
0
|
|
|
|
|
$doc = $doc->new() if (!ref($doc)); |
77
|
0
|
|
|
|
|
|
@$doc{keys %opts} = values %opts; |
78
|
0
|
0
|
|
|
|
|
$doc->{label} = ref($file) ? "$file" : $file; |
79
|
0
|
0
|
|
|
|
|
my $fh = ref($file) ? $file : IO::File->new("<$file"); |
80
|
0
|
0
|
|
|
|
|
$doc->logconfess("fromFile(): cannot open file '$file': $!") if (!ref($fh)); |
81
|
0
|
0
|
|
|
|
|
binmode($fh,':utf8') if ($doc->{utf8}); |
82
|
|
|
|
|
|
|
|
83
|
0
|
|
0
|
|
|
|
my ($wf,$pf,$lf,$pagef) = map {($_//-1)} @$doc{qw(wf pf lf pagef)}; |
|
0
|
|
|
|
|
|
|
84
|
0
|
|
|
|
|
|
my $tokens = $doc->{tokens}; |
85
|
0
|
|
|
|
|
|
@$tokens = qw(); |
86
|
0
|
|
|
|
|
|
my $meta = $doc->{meta}; |
87
|
0
|
|
|
|
|
|
%$meta = qw(); |
88
|
0
|
|
|
|
|
|
my $eos = undef; |
89
|
0
|
|
|
|
|
|
my $eosre = $doc->{eosre}; |
90
|
0
|
0
|
0
|
|
|
|
$eosre = qr{$eosre} if ($eosre && !ref($eosre)); |
91
|
0
|
|
|
|
|
|
my $last_was_eos = 1; |
92
|
0
|
|
|
|
|
|
my $is_eos = 0; |
93
|
0
|
|
|
|
|
|
my $curpage = ''; |
94
|
0
|
|
|
|
|
|
my ($w,$p,$l,$page); |
95
|
0
|
|
|
|
|
|
while (defined($_=<$fh>)) { |
96
|
0
|
|
|
|
|
|
chomp; |
97
|
0
|
0
|
0
|
|
|
|
if (/^%%/) { |
|
|
0
|
|
|
|
|
|
98
|
0
|
0
|
|
|
|
|
if (/^%%(?:\$DDC:meta\.date_|\$?date)=([0-9]+)/) { |
99
|
0
|
|
|
|
|
|
$doc->{date} = $1; |
100
|
|
|
|
|
|
|
} |
101
|
0
|
0
|
0
|
|
|
|
if (/^%%\$DDC:meta\.([^=]+)=(.*)$/) { |
|
|
0
|
0
|
|
|
|
|
|
|
0
|
0
|
|
|
|
|
|
|
0
|
0
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
102
|
0
|
|
|
|
|
|
$meta->{$1} = $2; |
103
|
|
|
|
|
|
|
} |
104
|
|
|
|
|
|
|
elsif (/^%%\$DDC:index\[([0-9]+)\]=Token\b/ || /^%%\$DDC:index\[([0-9]+)\]=\S+ w$/) { |
105
|
0
|
|
|
|
|
|
$wf = $doc->{wf} = $1; |
106
|
|
|
|
|
|
|
} |
107
|
|
|
|
|
|
|
elsif (/^%%\$DDC:index\[([0-9]+)\]=Pos\b/ || /^%%\$DDC:index\[([0-9]+)\]=\S+ p$/) { |
108
|
0
|
|
|
|
|
|
$pf = $doc->{pf} = $1; |
109
|
|
|
|
|
|
|
} |
110
|
|
|
|
|
|
|
elsif (/^%%\$DDC:index\[([0-9]+)\]=Lemma\b/ || /^%%\$DDC:index\[([0-9]+)\]=\S+ l$/) { |
111
|
0
|
|
|
|
|
|
$lf = $doc->{lf} = $1; |
112
|
|
|
|
|
|
|
} |
113
|
|
|
|
|
|
|
elsif (/^%%\$DDC:index\[([0-9]+)\]=Pos\b/ || /^%%\$DDC:index\[([0-9]+)\]=\S+ page$/) { |
114
|
0
|
|
|
|
|
|
$pagef = $doc->{pagef} = $1; |
115
|
|
|
|
|
|
|
} |
116
|
|
|
|
|
|
|
elsif (/^%%\$DDC:BREAK.([^=\[\]]+)/) { |
117
|
0
|
|
|
|
|
|
push(@$tokens,"#$1"); |
118
|
|
|
|
|
|
|
} |
119
|
|
|
|
|
|
|
elsif (/^%%\$DDC:PAGE=/) { |
120
|
0
|
|
|
|
|
|
push(@$tokens,"#page"); |
121
|
|
|
|
|
|
|
} |
122
|
0
|
0
|
0
|
|
|
|
if ($eosre && $_ =~ $eosre) { |
123
|
0
|
0
|
|
|
|
|
push(@$tokens,$eos) if (!$last_was_eos); |
124
|
0
|
|
|
|
|
|
$last_was_eos = 1; |
125
|
|
|
|
|
|
|
} |
126
|
0
|
|
|
|
|
|
next; |
127
|
|
|
|
|
|
|
} |
128
|
|
|
|
|
|
|
elsif ($eosre && $_ =~ $eosre) { |
129
|
0
|
0
|
|
|
|
|
push(@$tokens,$eos) if (!$last_was_eos); |
130
|
0
|
|
|
|
|
|
$last_was_eos = 1; |
131
|
0
|
|
|
|
|
|
next; |
132
|
|
|
|
|
|
|
} |
133
|
0
|
|
|
|
|
|
($w,$p,$l,$page) = (split(/\t/,$_))[$wf,$pf,$lf,$pagef]; |
134
|
|
|
|
|
|
|
|
135
|
|
|
|
|
|
|
##-- honor dta-style $page index |
136
|
0
|
0
|
0
|
|
|
|
if ($pagef > 0 && $page ne $curpage) { |
137
|
0
|
|
|
|
|
|
push(@$tokens, "#page"); |
138
|
0
|
|
|
|
|
|
$curpage = $page; |
139
|
|
|
|
|
|
|
} |
140
|
|
|
|
|
|
|
|
141
|
|
|
|
|
|
|
##-- add token |
142
|
0
|
|
0
|
|
|
|
push(@$tokens, {w=>($w//''), p=>($p//''), l=>($l//'')}); |
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
143
|
0
|
|
|
|
|
|
$last_was_eos = 0; |
144
|
|
|
|
|
|
|
} |
145
|
0
|
0
|
|
|
|
|
push(@$tokens,$eos) if (!$last_was_eos); |
146
|
|
|
|
|
|
|
|
147
|
0
|
0
|
|
|
|
|
if (!$doc->{foreign}) { |
148
|
|
|
|
|
|
|
##-- hack: compute top-level $meta->{genre} from $meta->{textClass} if requested |
149
|
0
|
|
0
|
|
|
|
$meta->{genre} //= $meta->{textClass}; |
150
|
|
|
|
|
|
|
$meta->{genre} =~ s/\:.*$// |
151
|
0
|
0
|
0
|
|
|
|
if ($doc->{trimGenre} && defined($meta->{genre})); |
152
|
|
|
|
|
|
|
|
153
|
|
|
|
|
|
|
##-- hack: compute/trim top-level $meta->{pnd} if requested |
154
|
0
|
|
0
|
|
|
|
$meta->{pnd} //= $meta->{author}; |
155
|
0
|
0
|
0
|
|
|
|
if ($doc->{trimPND} && defined($meta->{pnd})) { |
156
|
0
|
|
|
|
|
|
$meta->{pnd} = join(' ', ($meta->{pnd} =~ m/\#[0-9a-zA-Z]+/g)); |
157
|
0
|
0
|
0
|
|
|
|
delete($meta->{pnd}) if (($meta->{pnd}//'') eq ''); |
158
|
|
|
|
|
|
|
} |
159
|
|
|
|
|
|
|
|
160
|
|
|
|
|
|
|
##-- hack: trim top-level $meta->{author} if requested |
161
|
|
|
|
|
|
|
$meta->{author} =~ s/\s*\([^\)]*\)$// |
162
|
0
|
0
|
0
|
|
|
|
if ($doc->{trimAuthor} && defined($meta->{author})); |
163
|
|
|
|
|
|
|
} |
164
|
|
|
|
|
|
|
|
165
|
0
|
0
|
|
|
|
|
$fh->close() if (!ref($file)); |
166
|
0
|
|
|
|
|
|
return $doc; |
167
|
|
|
|
|
|
|
} |
168
|
|
|
|
|
|
|
|
169
|
|
|
|
|
|
|
##============================================================================== |
170
|
|
|
|
|
|
|
## Footer |
171
|
|
|
|
|
|
|
1; |
172
|
|
|
|
|
|
|
|
173
|
|
|
|
|
|
|
__END__ |
174
|
|
|
|
|
|
|
|
175
|
|
|
|
|
|
|
|
176
|
|
|
|
|
|
|
|
177
|
|
|
|
|
|
|
|