File Coverage

blib/lib/DiaColloDB/Document/DDCTabs.pm

Criterion	Covered	Total	%
statement	9	70	12.8
branch	0	54	0.0
condition	0	49	0.0
subroutine	3	6	50.0
pod	2	3	66.6
total	14	182	7.6

line	stmt	bran	cond	sub	pod	time	code
1							## -- Mode: CPerl --
2							## File: DiaColloDB::Document::DDCTabs.pm
3							## Author: Bryan Jurish <moocow@cpan.org>
4							## Description: collocation db, source document, DDC tab-dump
5
6							package DiaColloDB::Document::DDCTabs;
7	1			1		7	use DiaColloDB::Document;
	1					2
	1					23
8	1			1		5	use IO::File;
	1					1
	1					29
9	1			1		150	use strict;
	1					2
	1					950
10
11							##==============================================================================
12							## Globals & Constants
13
14							our @ISA = qw(DiaColloDB::Document);
15
16							##==============================================================================
17							## Constructors etc.
18
19							## $doc = CLASS_OR_OBJECT->new(%args)
20							## + %args, object structure:
21							## (
22							## ##-- parsing options
23							## eosre => $re, ##-- EOS regex (empty or undef for file-breaks only; default='^$')
24							## utf8 => $bool, ##-- enable utf8 parsing? (default=1)
25							## trimPND => $bool, ##-- create trimmed "pnd" meta-attribute? (default=1)
26							## trimAuthor => $bool, ##-- trim "author" meta-attribute (eliminate DTA PNDs)? (default=1)
27							## trimGenre => $bool, ##-- create trimmed "genre" meta-attribute? (default=1)
28							## foreign => $bool, ##-- alias for trimAuthor=0 trimPND=0 trimGenre=0
29							## ##
30							## ##-- document data
31							## date =>$date, ##-- year
32							## wf =>$iw, ##-- index-field for $word attribute (default=0)
33							## pf =>$ip, ##-- index-field for $pos attribute (default=1)
34							## lf =>$il, ##-- index-field for $lemma attribute (default=2)
35							## pagef =>$ipage, ##-- index-field for $page attribute (default=undef:none)
36							## tokens =>\@tokens, ##-- tokens, including undef for EOS
37							## meta =>\%meta, ##-- document metadata (e.g. author, title, collection, ...)
38							## ## + may also generate special $meta->{genre} as 1st component of $meta->{textClass} if available
39							## )
40							## + each token in @tokens is a HASH-ref {w=>$word,p=>$pos,l=>$lemma,...}
41							## + default attribute positions ($iw,$ip,$il,$ipage) are overridden doc lines '%%$DDC:index[INDEX]=LONGNAME w' etc if present
42							sub new {
43	0			0	1		my $that = shift;
44	0						my $doc = $that->SUPER::new(
45							utf8=>1,
46							trimPND=>1,
47							trimAuthor=>1,
48							trimGenre=>1,
49							eosre=>qr{^$},
50							wf=>0,
51							pf=>1,
52							lf=>2,
53							pagef=>undef,
54							@_, ##-- user arguments
55							);
56	0						return $doc;
57							}
58
59							##==============================================================================
60							## API: I/O
61
62							## $ext = $doc->extension()
63							## + default extension, for Corpus::Compiled
64							sub extension {
65	0			0	0		return '.tabs';
66							}
67
68							##--------------------------------------------------------------
69							## API: I/O: parse
70
71							## $bool = $doc->fromFile($filename_or_fh, %opts)
72							## + parse tokens from $filename_or_fh
73							## + %opts : clobbers %$doc
74							sub fromFile {
75	0			0	1		my ($doc,$file,%opts) = @_;
76	0	0					$doc = $doc->new() if (!ref($doc));
77	0						@$doc{keys %opts} = values %opts;
78	0	0					$doc->{label} = ref($file) ? "$file" : $file;
79	0	0					my $fh = ref($file) ? $file : IO::File->new("<$file");
80	0	0					$doc->logconfess("fromFile(): cannot open file '$file': $!") if (!ref($fh));
81	0	0					binmode($fh,':utf8') if ($doc->{utf8});
82
83	0		0				my ($wf,$pf,$lf,$pagef) = map {($_//-1)} @$doc{qw(wf pf lf pagef)};
	0
84	0						my $tokens = $doc->{tokens};
85	0						@$tokens = qw();
86	0						my $meta = $doc->{meta};
87	0						%$meta = qw();
88	0						my $eos = undef;
89	0						my $eosre = $doc->{eosre};
90	0	0	0				$eosre = qr{$eosre} if ($eosre && !ref($eosre));
91	0						my $last_was_eos = 1;
92	0						my $is_eos = 0;
93	0						my $curpage = '';
94	0						my ($w,$p,$l,$page);
95	0						while (defined($_=<$fh>)) {
96	0						chomp;
97	0	0	0				if (/^%%/) {
		0
98	0	0					if (/^%%(?:\$DDC:meta\.date_\|\$?date)=([0-9]+)/) {
99	0						$doc->{date} = $1;
100							}
101	0	0	0				if (/^%%\$DDC:meta\.([^=]+)=(.*)$/) {
		0	0
		0	0
		0	0
		0
		0
		0
102	0						$meta->{$1} = $2;
103							}
104							elsif (/^%%\$DDC:index\[([0-9]+)\]=Token\b/ \|\| /^%%\$DDC:index\[([0-9]+)\]=\S+ w$/) {
105	0						$wf = $doc->{wf} = $1;
106							}
107							elsif (/^%%\$DDC:index\[([0-9]+)\]=Pos\b/ \|\| /^%%\$DDC:index\[([0-9]+)\]=\S+ p$/) {
108	0						$pf = $doc->{pf} = $1;
109							}
110							elsif (/^%%\$DDC:index\[([0-9]+)\]=Lemma\b/ \|\| /^%%\$DDC:index\[([0-9]+)\]=\S+ l$/) {
111	0						$lf = $doc->{lf} = $1;
112							}
113							elsif (/^%%\$DDC:index\[([0-9]+)\]=Pos\b/ \|\| /^%%\$DDC:index\[([0-9]+)\]=\S+ page$/) {
114	0						$pagef = $doc->{pagef} = $1;
115							}
116							elsif (/^%%\$DDC:BREAK.([^=\[\]]+)/) {
117	0						push(@$tokens,"#$1");
118							}
119							elsif (/^%%\$DDC:PAGE=/) {
120	0						push(@$tokens,"#page");
121							}
122	0	0	0				if ($eosre && $_ =~ $eosre) {
123	0	0					push(@$tokens,$eos) if (!$last_was_eos);
124	0						$last_was_eos = 1;
125							}
126	0						next;
127							}
128							elsif ($eosre && $_ =~ $eosre) {
129	0	0					push(@$tokens,$eos) if (!$last_was_eos);
130	0						$last_was_eos = 1;
131	0						next;
132							}
133	0						($w,$p,$l,$page) = (split(/\t/,$_))[$wf,$pf,$lf,$pagef];
134
135							##-- honor dta-style $page index
136	0	0	0				if ($pagef > 0 && $page ne $curpage) {
137	0						push(@$tokens, "#page");
138	0						$curpage = $page;
139							}
140
141							##-- add token
142	0		0				push(@$tokens, {w=>($w//''), p=>($p//''), l=>($l//'')});
			0
			0
143	0						$last_was_eos = 0;
144							}
145	0	0					push(@$tokens,$eos) if (!$last_was_eos);
146
147	0	0					if (!$doc->{foreign}) {
148							##-- hack: compute top-level $meta->{genre} from $meta->{textClass} if requested
149	0		0				$meta->{genre} //= $meta->{textClass};
150							$meta->{genre} =~ s/\:.*$//
151	0	0	0				if ($doc->{trimGenre} && defined($meta->{genre}));
152
153							##-- hack: compute/trim top-level $meta->{pnd} if requested
154	0		0				$meta->{pnd} //= $meta->{author};
155	0	0	0				if ($doc->{trimPND} && defined($meta->{pnd})) {
156	0						$meta->{pnd} = join(' ', ($meta->{pnd} =~ m/\#[0-9a-zA-Z]+/g));
157	0	0	0				delete($meta->{pnd}) if (($meta->{pnd}//'') eq '');
158							}
159
160							##-- hack: trim top-level $meta->{author} if requested
161							$meta->{author} =~ s/\s$[^$]\)$//
162	0	0	0				if ($doc->{trimAuthor} && defined($meta->{author}));
163							}
164
165	0	0					$fh->close() if (!ref($file));
166	0						return $doc;
167							}
168
169							##==============================================================================
170							## Footer
171							1;
172
173							__END__
174
175
176
177