line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
2
|
|
|
2
|
|
93951
|
use warnings; |
|
2
|
|
|
|
|
9
|
|
|
2
|
|
|
|
|
59
|
|
2
|
2
|
|
|
2
|
|
8
|
use strict; |
|
2
|
|
|
|
|
4
|
|
|
2
|
|
|
|
|
54
|
|
3
|
|
|
|
|
|
|
|
4
|
|
|
|
|
|
|
package Parse::GutenbergRoget; |
5
|
2
|
|
|
2
|
|
583
|
use parent qw(Exporter); |
|
2
|
|
|
|
|
438
|
|
|
2
|
|
|
|
|
8
|
|
6
|
|
|
|
|
|
|
|
7
|
2
|
|
|
2
|
|
73
|
use Carp (); |
|
2
|
|
|
|
|
4
|
|
|
2
|
|
|
|
|
22
|
|
8
|
2
|
|
|
2
|
|
1271
|
use Text::CSV_XS; |
|
2
|
|
|
|
|
27110
|
|
|
2
|
|
|
|
|
1855
|
|
9
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
our @EXPORT = qw(parse_roget); ## no critic Export |
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
=head1 NAME |
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
Parse::GutenbergRoget - parse Project Gutenberg's Roget's Thesaurus |
15
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
=head1 VERSION |
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
version 0.023 |
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
=cut |
21
|
|
|
|
|
|
|
|
22
|
|
|
|
|
|
|
our $VERSION = '0.023'; |
23
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
=head1 SYNOPSIS |
25
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
use Parse::GutenbergRoget |
27
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
my %section = parse_roget("./roget15a.txt"); |
29
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
print $section{1}[0][0]{text}; # existence |
31
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
=head1 DESCRIPTION |
33
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
A Roget's Thesaurus is more than the simple synonym/antonym finder included in |
35
|
|
|
|
|
|
|
many dictionary sets. It organizes words into semantically realted categories, |
36
|
|
|
|
|
|
|
so that words with related meanings can be found in proximity to one another, |
37
|
|
|
|
|
|
|
with the level of proximity indicating the level of similarity. |
38
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
Project Gutenberg has produced an etext of the 1911 edition of Roget's |
40
|
|
|
|
|
|
|
Thesaurus, and later began to revise it, in 1991. While it's not the best |
41
|
|
|
|
|
|
|
Roget-style thesaurus available, it's the best public domain electronic |
42
|
|
|
|
|
|
|
thesaurus datasource I've found. |
43
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
This module parses the file's contents into a Perl data structure, which can |
45
|
|
|
|
|
|
|
then be stored in systems for searching and browsing it. This module does |
46
|
|
|
|
|
|
|
I implement those systems. |
47
|
|
|
|
|
|
|
|
48
|
|
|
|
|
|
|
The code is not complete. This means that everything that can be parsed is not |
49
|
|
|
|
|
|
|
yet being parsed. It's important to realize that not everything is going to be |
50
|
|
|
|
|
|
|
parseable. There are too many typos and broken rules which, due to the lousy |
51
|
|
|
|
|
|
|
nature of the rules, create ambiguity. For a description of these rules see |
52
|
|
|
|
|
|
|
L"RULES"> below. |
53
|
|
|
|
|
|
|
|
54
|
|
|
|
|
|
|
=head1 FUNCTIONS |
55
|
|
|
|
|
|
|
|
56
|
|
|
|
|
|
|
=head2 C<< parse_roget($filename) >> |
57
|
|
|
|
|
|
|
|
58
|
|
|
|
|
|
|
This function, exported by default, will attempt to open, read, and parse the |
59
|
|
|
|
|
|
|
named file as a Project Gutenberg Roget's Thesaurus. It has only been tested |
60
|
|
|
|
|
|
|
with C, which is not included in the distribution, because it's |
61
|
|
|
|
|
|
|
too big. |
62
|
|
|
|
|
|
|
|
63
|
|
|
|
|
|
|
It returns a hash with the following structure: |
64
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
%section = ( |
66
|
|
|
|
|
|
|
... |
67
|
|
|
|
|
|
|
'100a' => { |
68
|
|
|
|
|
|
|
major => 100, # major and minor form section identity |
69
|
|
|
|
|
|
|
minor => 'a', |
70
|
|
|
|
|
|
|
name => 'Fraction', |
71
|
|
|
|
|
|
|
comments => [ 'Less than one' ], |
72
|
|
|
|
|
|
|
subsections => [ |
73
|
|
|
|
|
|
|
{ |
74
|
|
|
|
|
|
|
type => 'N', # these entries are nouns |
75
|
|
|
|
|
|
|
groups => [ |
76
|
|
|
|
|
|
|
{ entries => [ |
77
|
|
|
|
|
|
|
{ text => 'fraction' }, |
78
|
|
|
|
|
|
|
{ text => 'fractional part' } |
79
|
|
|
|
|
|
|
] }, |
80
|
|
|
|
|
|
|
{ entries => [ { text => 'part &c. 51' } ] } |
81
|
|
|
|
|
|
|
] |
82
|
|
|
|
|
|
|
}, |
83
|
|
|
|
|
|
|
{ |
84
|
|
|
|
|
|
|
type => 'Adj', |
85
|
|
|
|
|
|
|
groups => [ { entries => [ ... ] } ] |
86
|
|
|
|
|
|
|
} |
87
|
|
|
|
|
|
|
] |
88
|
|
|
|
|
|
|
} |
89
|
|
|
|
|
|
|
... |
90
|
|
|
|
|
|
|
); |
91
|
|
|
|
|
|
|
|
92
|
|
|
|
|
|
|
This structure isn't pretty or perfect, and is subject to change. All of its |
93
|
|
|
|
|
|
|
elements are shown here, except for one exception, which is the next likely |
94
|
|
|
|
|
|
|
subject for change: flags. Entries may have flags, in addition to text, which |
95
|
|
|
|
|
|
|
note things like "French" or "archaic". Entries (or possibly groups) will also |
96
|
|
|
|
|
|
|
gain cross reference attribues, replacing the ugly "&c. XX" text. I'd also |
97
|
|
|
|
|
|
|
like to deal with references to other subsections, which come in the form "&c. |
98
|
|
|
|
|
|
|
Adj." There isn't any reason for these to be needed, I think. |
99
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
=cut |
101
|
|
|
|
|
|
|
|
102
|
|
|
|
|
|
|
sub parse_roget { |
103
|
0
|
|
|
0
|
1
|
|
my ($filename) = @_; |
104
|
0
|
|
|
|
|
|
my %section = parse_sections($filename); |
105
|
0
|
|
|
|
|
|
bloom_sections(\%section); |
106
|
0
|
|
|
|
|
|
return %section; |
107
|
|
|
|
|
|
|
} |
108
|
|
|
|
|
|
|
|
109
|
|
|
|
|
|
|
=head2 C<< parse_sections($filename) >> |
110
|
|
|
|
|
|
|
|
111
|
|
|
|
|
|
|
This function is used internally by C to read the named file, |
112
|
|
|
|
|
|
|
returning the above structure, parsed only to the section level. |
113
|
|
|
|
|
|
|
|
114
|
|
|
|
|
|
|
=cut |
115
|
|
|
|
|
|
|
|
116
|
|
|
|
|
|
|
sub parse_sections { |
117
|
0
|
|
|
0
|
1
|
|
my ($filename) = @_; |
118
|
|
|
|
|
|
|
|
119
|
0
|
0
|
|
|
|
|
open my $roget, '<', $filename |
120
|
|
|
|
|
|
|
or Carp::croak "couldn't open $filename: $!"; |
121
|
|
|
|
|
|
|
|
122
|
0
|
|
|
|
|
|
my $previous_section; |
123
|
|
|
|
|
|
|
my %section; |
124
|
|
|
|
|
|
|
|
125
|
0
|
|
|
|
|
|
my $peeked_line; |
126
|
0
|
|
|
|
|
|
my ($in_newheader, $in_longcomment); |
127
|
|
|
|
|
|
|
|
128
|
0
|
|
0
|
|
|
|
while (my $line = ($peeked_line || <$roget>)) { |
129
|
0
|
|
|
|
|
|
undef $peeked_line; |
130
|
|
|
|
|
|
|
|
131
|
0
|
|
|
|
|
|
chomp $line; |
132
|
0
|
0
|
|
|
|
|
next unless $line; |
133
|
0
|
0
|
|
|
|
|
next if ($line =~ /^#/); # comment |
134
|
|
|
|
|
|
|
|
135
|
0
|
0
|
|
|
|
|
if ($line =~ /^\s*<--/) { $in_longcomment = 1; } |
|
0
|
|
|
|
|
|
|
136
|
0
|
0
|
|
|
|
|
if ($line =~ /-->$/) { $in_longcomment = 0; next; } |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
137
|
0
|
0
|
|
|
|
|
next if $in_longcomment; |
138
|
|
|
|
|
|
|
|
139
|
0
|
0
|
|
|
|
|
if ($line =~ /^%/) { |
140
|
0
|
|
|
|
|
|
$in_newheader = not $in_newheader; |
141
|
0
|
|
|
|
|
|
next; |
142
|
|
|
|
|
|
|
} |
143
|
0
|
0
|
|
|
|
|
next if $in_newheader; |
144
|
|
|
|
|
|
|
|
145
|
0
|
|
|
|
|
|
$line =~ s/^\s+//; |
146
|
|
|
|
|
|
|
|
147
|
0
|
|
|
|
|
|
until ($peeked_line) { |
148
|
0
|
|
|
|
|
|
$peeked_line = <$roget>; |
149
|
0
|
0
|
|
|
|
|
last unless defined $peeked_line; |
150
|
0
|
|
|
|
|
|
chomp $peeked_line; |
151
|
0
|
0
|
0
|
|
|
|
if ($peeked_line and $peeked_line !~ /^\s{4}/ |
|
|
|
0
|
|
|
|
|
152
|
|
|
|
|
|
|
and $peeked_line !~ /^(?:#|%|<--)/) |
153
|
|
|
|
|
|
|
{ |
154
|
0
|
0
|
|
|
|
|
$line .= q{ } unless (substr($line, -1, 1) eq q{-}); |
155
|
0
|
|
|
|
|
|
$line .= $peeked_line; |
156
|
0
|
|
|
|
|
|
undef $peeked_line; |
157
|
0
|
0
|
|
|
|
|
if ($line =~ /[^,]+,[^.]+\.\s{4}/) { |
158
|
0
|
|
|
|
|
|
($line, $peeked_line) = split /\s{4}/, $line, 2; |
159
|
|
|
|
|
|
|
} |
160
|
|
|
|
|
|
|
} |
161
|
|
|
|
|
|
|
} |
162
|
|
|
|
|
|
|
|
163
|
0
|
|
|
|
|
|
my ($sec, $title, $newline) = |
164
|
|
|
|
|
|
|
($line =~ /^#?(\d+[a-z]?). (.*?)(?:--(.*))?$/); |
165
|
0
|
0
|
0
|
|
|
|
$line = ($newline||'') if ($sec); |
166
|
|
|
|
|
|
|
|
167
|
0
|
0
|
|
|
|
|
if ($sec) { |
168
|
0
|
|
|
|
|
|
(my($comment_beginning), $title, my($comment_end)) = |
169
|
|
|
|
|
|
|
($title =~ /(?:\[(.+?)\.?\])?\s*([^.]+)\.?\s*(?:\[(.+?)\.?\])?/); |
170
|
0
|
|
|
|
|
|
$title =~ s/(^\s+|\s{2,}|\s+$)//g; |
171
|
|
|
|
|
|
|
$section{$sec} = { |
172
|
|
|
|
|
|
|
name => $title, |
173
|
|
|
|
|
|
|
subsections => [ { text => $line||'' } ], |
174
|
0
|
|
0
|
|
|
|
comments => [ grep { defined $_ } ($comment_beginning, $comment_end) ] |
|
0
|
|
|
|
|
|
|
175
|
|
|
|
|
|
|
}; |
176
|
0
|
|
|
|
|
|
@{$section{$sec}}{qw[major minor]} = ($sec =~ /^(\d+)(.*)$/); |
|
0
|
|
|
|
|
|
|
177
|
0
|
0
|
|
|
|
|
Carp::confess "couldn't parse section: $sec" unless $section{$sec}{major}; |
178
|
0
|
|
|
|
|
|
$previous_section = $sec; |
179
|
|
|
|
|
|
|
} else { |
180
|
0
|
|
0
|
|
|
|
$section{$previous_section}{subsections} ||= []; |
181
|
0
|
|
|
|
|
|
push @{$section{$previous_section}{subsections}}, { text => $line }; |
|
0
|
|
|
|
|
|
|
182
|
|
|
|
|
|
|
} |
183
|
|
|
|
|
|
|
} |
184
|
0
|
|
|
|
|
|
return %section; |
185
|
|
|
|
|
|
|
|
186
|
|
|
|
|
|
|
} |
187
|
|
|
|
|
|
|
|
188
|
|
|
|
|
|
|
=head2 C<< bloom_sections(\%sections) >> |
189
|
|
|
|
|
|
|
|
190
|
|
|
|
|
|
|
Given a reference to the section hash, this subroutine expands the sections |
191
|
|
|
|
|
|
|
into subsections, groups, and entries. |
192
|
|
|
|
|
|
|
|
193
|
|
|
|
|
|
|
=cut |
194
|
|
|
|
|
|
|
|
195
|
|
|
|
|
|
|
sub bloom_sections { |
196
|
0
|
|
|
0
|
1
|
|
my ($section) = @_; |
197
|
|
|
|
|
|
|
|
198
|
0
|
|
|
|
|
|
my $decomma = Text::CSV_XS->new; |
199
|
0
|
|
|
|
|
|
my $desemi = Text::CSV_XS->new({sep_char => q{;}}); |
200
|
|
|
|
|
|
|
|
201
|
0
|
|
|
|
|
|
my $types = qr/(Adj|Adv|Int|N|Phr|Pron|V)/; |
202
|
|
|
|
|
|
|
|
203
|
0
|
|
|
|
|
|
for (values %$section) { |
204
|
0
|
|
|
|
|
|
my $previous_subsection; |
205
|
0
|
|
|
|
|
|
for my $subsection (@{$_->{subsections}}) { |
|
0
|
|
|
|
|
|
|
206
|
0
|
|
|
|
|
|
$subsection->{text} =~ s/\.$//; |
207
|
0
|
|
|
|
|
|
$subsection->{text} =~ s/ {2,}/ /g; |
208
|
0
|
|
|
|
|
|
$subsection->{text} =~ s/(^\s+|\s+$)//g; |
209
|
|
|
|
|
|
|
|
210
|
0
|
0
|
|
|
|
|
if (my ($type) = ($subsection->{text} =~ /^$types\./)) { |
|
|
0
|
|
|
|
|
|
211
|
0
|
|
|
|
|
|
$subsection->{text} =~ s/^$type\.//; |
212
|
0
|
|
|
|
|
|
$subsection->{type} = $type; |
213
|
|
|
|
|
|
|
} elsif ($previous_subsection) { |
214
|
0
|
|
|
|
|
|
$subsection->{type} = $previous_subsection->{type}; |
215
|
|
|
|
|
|
|
} else { |
216
|
0
|
|
|
|
|
|
$subsection->{type} = 'UNKNOWN'; |
217
|
|
|
|
|
|
|
} |
218
|
|
|
|
|
|
|
|
219
|
0
|
|
|
|
|
|
$desemi->parse(delete $subsection->{text}); |
220
|
0
|
|
|
|
|
|
$subsection->{groups} = [ map { { text => $_ } } $desemi->fields ]; |
|
0
|
|
|
|
|
|
|
221
|
|
|
|
|
|
|
|
222
|
0
|
|
|
|
|
|
for my $group (@{$subsection->{groups}}) { |
|
0
|
|
|
|
|
|
|
223
|
0
|
|
|
|
|
|
$decomma->parse(delete $group->{text}); |
224
|
0
|
|
|
|
|
|
$group->{entries} = [ map { { text => $_, flags => [] } } $decomma->fields ]; |
|
0
|
|
|
|
|
|
|
225
|
|
|
|
|
|
|
|
226
|
0
|
|
|
|
|
|
for (@{$group->{entries}}) { |
|
0
|
|
|
|
|
|
|
227
|
0
|
|
0
|
|
|
|
$_->{text}||= 'UNPARSED'; |
228
|
0
|
0
|
|
|
|
|
if ($_->{text} =~ s/\[obs3\]//) { |
229
|
0
|
|
|
|
|
|
push @{$_->{flags}}, 'archaic? (1991)'; |
|
0
|
|
|
|
|
|
|
230
|
|
|
|
|
|
|
} |
231
|
0
|
0
|
|
|
|
|
if ($_->{text} =~ s/|!//) { |
232
|
0
|
|
|
|
|
|
push @{$_->{flags}}, 'obsolete (1991)'; |
|
0
|
|
|
|
|
|
|
233
|
|
|
|
|
|
|
} |
234
|
0
|
0
|
|
|
|
|
if ($_->{text} =~ s/|//) { |
235
|
0
|
|
|
|
|
|
push @{$_->{flags}}, 'obsolete (1911)'; |
|
0
|
|
|
|
|
|
|
236
|
|
|
|
|
|
|
} |
237
|
0
|
|
|
|
|
|
$_->{text} =~ s/(^\s+|\s+$)//; |
238
|
|
|
|
|
|
|
} |
239
|
|
|
|
|
|
|
} |
240
|
0
|
|
|
|
|
|
$previous_subsection = $subsection; |
241
|
|
|
|
|
|
|
} |
242
|
|
|
|
|
|
|
} |
243
|
|
|
|
|
|
|
} |
244
|
|
|
|
|
|
|
|
245
|
|
|
|
|
|
|
=head1 THE FILE |
246
|
|
|
|
|
|
|
|
247
|
|
|
|
|
|
|
=over 4 |
248
|
|
|
|
|
|
|
|
249
|
|
|
|
|
|
|
=item * The thesaurus file is plain text, in 7-bit ASCII. |
250
|
|
|
|
|
|
|
|
251
|
|
|
|
|
|
|
=item * Lines with a C<#> as their first character are comments. |
252
|
|
|
|
|
|
|
|
253
|
|
|
|
|
|
|
=item * Lines beginning with C<< <-- >> begin multi-line comments. |
254
|
|
|
|
|
|
|
|
255
|
|
|
|
|
|
|
=item * Lines ending with C<< --> >> end multi-line comments. |
256
|
|
|
|
|
|
|
|
257
|
|
|
|
|
|
|
These multi-line comments were originally used for including the page numbers |
258
|
|
|
|
|
|
|
from the original text. Later editors used them (instead of C<#> comments) to |
259
|
|
|
|
|
|
|
mark their editorial notes. |
260
|
|
|
|
|
|
|
|
261
|
|
|
|
|
|
|
There exists one situation in C where the C<< <-- >> occurs |
262
|
|
|
|
|
|
|
outside of position zero in a line. |
263
|
|
|
|
|
|
|
|
264
|
|
|
|
|
|
|
=item * A line containing only C<%> begins or ends new "supersection" data. |
265
|
|
|
|
|
|
|
|
266
|
|
|
|
|
|
|
So, if we wanted to begin a new supersection for "States of Mind", we might |
267
|
|
|
|
|
|
|
have the following: |
268
|
|
|
|
|
|
|
|
269
|
|
|
|
|
|
|
% |
270
|
|
|
|
|
|
|
STATES OF MIND |
271
|
|
|
|
|
|
|
% |
272
|
|
|
|
|
|
|
|
273
|
|
|
|
|
|
|
Unfortunately, there is almost no consistency in how these supersections are |
274
|
|
|
|
|
|
|
organized. Some entries declare new sections "SECTION VII. CHANGE" and |
275
|
|
|
|
|
|
|
immediately begin new subsupersections, "1. SIMPLE CHANGE". Others just give |
276
|
|
|
|
|
|
|
headings: "Present Events" |
277
|
|
|
|
|
|
|
|
278
|
|
|
|
|
|
|
Then there is this excerpt: |
279
|
|
|
|
|
|
|
|
280
|
|
|
|
|
|
|
% |
281
|
|
|
|
|
|
|
CLASS II |
282
|
|
|
|
|
|
|
WORDS RELATING TO SPACE |
283
|
|
|
|
|
|
|
|
284
|
|
|
|
|
|
|
SECTION I. SPACE IN GENERAL |
285
|
|
|
|
|
|
|
|
286
|
|
|
|
|
|
|
1. ABSTRACT SPACE |
287
|
|
|
|
|
|
|
% |
288
|
|
|
|
|
|
|
|
289
|
|
|
|
|
|
|
I think the only thing to do is ignore all this crap, so that's what is done. |
290
|
|
|
|
|
|
|
|
291
|
|
|
|
|
|
|
=item * Lines do not exceed 79 characters in width. |
292
|
|
|
|
|
|
|
|
293
|
|
|
|
|
|
|
=item * New lines begin in column 6. |
294
|
|
|
|
|
|
|
|
295
|
|
|
|
|
|
|
Some, though, begin in column 5, 6, or 8. A line starting in column 4 or less |
296
|
|
|
|
|
|
|
is always a continuation, unless it's in 0. (I'm not considering comments.) |
297
|
|
|
|
|
|
|
|
298
|
|
|
|
|
|
|
Also, new lines might begin in any column, when a period that does not follow |
299
|
|
|
|
|
|
|
the declaration of a subsection type is followed by at least four spaces. |
300
|
|
|
|
|
|
|
|
301
|
|
|
|
|
|
|
=item * Lines in column 0 are continuation. |
302
|
|
|
|
|
|
|
|
303
|
|
|
|
|
|
|
Unless the previous line ended with a period. |
304
|
|
|
|
|
|
|
|
305
|
|
|
|
|
|
|
Continuations are appended to the continued line. Unless the continued line |
306
|
|
|
|
|
|
|
ended with a hyphen, a space is used to join the lines. |
307
|
|
|
|
|
|
|
|
308
|
|
|
|
|
|
|
There is at least one case where a word is split without a hyphen, appearing as |
309
|
|
|
|
|
|
|
"crocu\ns". This word is left broken in the parsed text. |
310
|
|
|
|
|
|
|
|
311
|
|
|
|
|
|
|
=item * New lines beginning with a C<#> (not in column 0) begin new sections. |
312
|
|
|
|
|
|
|
|
313
|
|
|
|
|
|
|
The C<#> is omitted in one case, section 252a. |
314
|
|
|
|
|
|
|
|
315
|
|
|
|
|
|
|
=item * The C<#> in a section heading is followed by the section identifier. |
316
|
|
|
|
|
|
|
|
317
|
|
|
|
|
|
|
The section identifier is a series of numbers followed by an optional letter |
318
|
|
|
|
|
|
|
and terminated by a period. |
319
|
|
|
|
|
|
|
|
320
|
|
|
|
|
|
|
=item * The section header is followed by a name and comments. |
321
|
|
|
|
|
|
|
|
322
|
|
|
|
|
|
|
The comments are marked off by square brackets, and may occur before or after |
323
|
|
|
|
|
|
|
the name, or both. |
324
|
|
|
|
|
|
|
|
325
|
|
|
|
|
|
|
The name is all the rest of the text between the comments and before a |
326
|
|
|
|
|
|
|
terminating C<-->. |
327
|
|
|
|
|
|
|
|
328
|
|
|
|
|
|
|
=item * Once section headers are removed, every new line is a new subsection. |
329
|
|
|
|
|
|
|
|
330
|
|
|
|
|
|
|
=item * Subsections may begin with a type declaration. |
331
|
|
|
|
|
|
|
|
332
|
|
|
|
|
|
|
A type declaration indicates that all entries in the subsection are of one |
333
|
|
|
|
|
|
|
part of speech: adjective, adverb, noun, interjection, phrase, and so on. |
334
|
|
|
|
|
|
|
|
335
|
|
|
|
|
|
|
=item * Subsections with no declared type are of the type of the previous subsection. |
336
|
|
|
|
|
|
|
|
337
|
|
|
|
|
|
|
Despite this rule, some subsections have no type. I file them as UNKNOWN. |
338
|
|
|
|
|
|
|
|
339
|
|
|
|
|
|
|
=item * Subsections are divided into groups by semicolons. |
340
|
|
|
|
|
|
|
|
341
|
|
|
|
|
|
|
Semicolons don't divide subsections, if they occur within double quotes. The |
342
|
|
|
|
|
|
|
quotes can be part of the line, and need not quote the entire group. |
343
|
|
|
|
|
|
|
|
344
|
|
|
|
|
|
|
In other words, this is a valid subsection, consisting of three groups: |
345
|
|
|
|
|
|
|
|
346
|
|
|
|
|
|
|
... |
347
|
|
|
|
|
|
|
Int. go for it; "go ahead; make my day" [Dirty Harry]; give it a try, |
348
|
|
|
|
|
|
|
take a shot. |
349
|
|
|
|
|
|
|
... |
350
|
|
|
|
|
|
|
|
351
|
|
|
|
|
|
|
=item * Groups are divided into entries by commas. |
352
|
|
|
|
|
|
|
|
353
|
|
|
|
|
|
|
Commas don't divide groups, if they occur within double quotes. The quotes can |
354
|
|
|
|
|
|
|
be just part of the entry, and need not quote the entire entry. |
355
|
|
|
|
|
|
|
|
356
|
|
|
|
|
|
|
Some groups include the entry-breaking comma inside the quotes, like this one: |
357
|
|
|
|
|
|
|
|
358
|
|
|
|
|
|
|
Phr. it must go no further, it will go no further; don't tell a |
359
|
|
|
|
|
|
|
soul;"tell it not in Gath,"nobody the wiser; alitur vitium vivitque |
360
|
|
|
|
|
|
|
tegendo[Lat][obs3]; "let it be tenable in your silence still"[Hamlet]. |
361
|
|
|
|
|
|
|
|
362
|
|
|
|
|
|
|
The comma after "Gath" should be on the outside. |
363
|
|
|
|
|
|
|
|
364
|
|
|
|
|
|
|
=item * Flags may follow the text of an entry. |
365
|
|
|
|
|
|
|
|
366
|
|
|
|
|
|
|
Flags are either text enclosed in square brackets, as "[Lat]" or the special |
367
|
|
|
|
|
|
|
identifiers C<|!> or C<|>. These flags provide metadata about the entry, like |
368
|
|
|
|
|
|
|
its language of origin or the domain of jargon in which it is relevant. C<|> |
369
|
|
|
|
|
|
|
indicates that a word was obsolete in the 1911 edition, and C<|!> indicates |
370
|
|
|
|
|
|
|
words that were no longer used as indicated by 1991. (A third obsolete status |
371
|
|
|
|
|
|
|
is indicated by the flag "[obs3]" meaning "entirely archaic as of 1991.") |
372
|
|
|
|
|
|
|
|
373
|
|
|
|
|
|
|
Sometimes, flags occur outside of quotes, and sometimes they're inside. The |
374
|
|
|
|
|
|
|
example text above shows flags outside of the Hamlet quote, but this section |
375
|
|
|
|
|
|
|
(which really exists) is a counter-example: |
376
|
|
|
|
|
|
|
|
377
|
|
|
|
|
|
|
Phr. noli me tangere[Lat]; nemo me impune lacessit[Lat]; don't tread |
378
|
|
|
|
|
|
|
on me; don't you dare; don't even think of it; "Go ahead, make my day!" |
379
|
|
|
|
|
|
|
[Dirty Harry]. |
380
|
|
|
|
|
|
|
|
381
|
|
|
|
|
|
|
=item * Cross-references may follow the entry. |
382
|
|
|
|
|
|
|
|
383
|
|
|
|
|
|
|
I haven't determined whether any entries have both a cross-reference I |
384
|
|
|
|
|
|
|
flags. |
385
|
|
|
|
|
|
|
|
386
|
|
|
|
|
|
|
A cross-reference is in the form "c&. 123" where 123 is a section, or "c&. n." |
387
|
|
|
|
|
|
|
where "n" refers to the noun-type subsections for this section. |
388
|
|
|
|
|
|
|
|
389
|
|
|
|
|
|
|
Unfortunately, cross-references are not always followed by commas or |
390
|
|
|
|
|
|
|
semicolons, meaning that they sometimes seem to appear in the middle of an |
391
|
|
|
|
|
|
|
entry, as follows: |
392
|
|
|
|
|
|
|
|
393
|
|
|
|
|
|
|
miss, miss one's aim, miss the mark, miss one's footing, miss stays; |
394
|
|
|
|
|
|
|
slip, trip, stumble; make a slip &c., n. blunder &c. 495, make a mess of, |
395
|
|
|
|
|
|
|
... |
396
|
|
|
|
|
|
|
|
397
|
|
|
|
|
|
|
The cross-reference after "make a slip" should have a comma after the "n." but |
398
|
|
|
|
|
|
|
does not, so it appears to be the middle of an entry beginning after "stumble;" |
399
|
|
|
|
|
|
|
and ending before "make a mess of." |
400
|
|
|
|
|
|
|
|
401
|
|
|
|
|
|
|
=back |
402
|
|
|
|
|
|
|
|
403
|
|
|
|
|
|
|
=head1 TODO |
404
|
|
|
|
|
|
|
|
405
|
|
|
|
|
|
|
Well, a good first step would be a TODO section. |
406
|
|
|
|
|
|
|
|
407
|
|
|
|
|
|
|
I'll write some tests that will only run if you put a C file in |
408
|
|
|
|
|
|
|
the right place. I'll also try the tests with previous revisions of the file. |
409
|
|
|
|
|
|
|
|
410
|
|
|
|
|
|
|
I'm also tempted to produce newer revisions on my own, after I contact the |
411
|
|
|
|
|
|
|
address listed in the file. The changes would just be to eliminate anomalies |
412
|
|
|
|
|
|
|
that prevent parsing. Distraction by shiny objects may prevent this goal. |
413
|
|
|
|
|
|
|
|
414
|
|
|
|
|
|
|
The flags and cross reference bits above will be implemented. |
415
|
|
|
|
|
|
|
|
416
|
|
|
|
|
|
|
The need for Text::CSV_XS may be eliminated. |
417
|
|
|
|
|
|
|
|
418
|
|
|
|
|
|
|
Entries with internal quoting (especially common in phrases) will no longer |
419
|
|
|
|
|
|
|
become UNPARSED. |
420
|
|
|
|
|
|
|
|
421
|
|
|
|
|
|
|
I'll try to eliminate more UNKNOWN subsection types. |
422
|
|
|
|
|
|
|
|
423
|
|
|
|
|
|
|
=head1 AUTHOR |
424
|
|
|
|
|
|
|
|
425
|
|
|
|
|
|
|
Ricardo Signes, C<< >> |
426
|
|
|
|
|
|
|
|
427
|
|
|
|
|
|
|
=head1 BUGS |
428
|
|
|
|
|
|
|
|
429
|
|
|
|
|
|
|
This software has been abandoned. Any bugs, you're on your own. Or look to |
430
|
|
|
|
|
|
|
take it over. |
431
|
|
|
|
|
|
|
|
432
|
|
|
|
|
|
|
=head1 COPYRIGHT |
433
|
|
|
|
|
|
|
|
434
|
|
|
|
|
|
|
Copyright 2004 Ricardo Signes, All Rights Reserved. |
435
|
|
|
|
|
|
|
|
436
|
|
|
|
|
|
|
This program is free software; you can redistribute it and/or modify it |
437
|
|
|
|
|
|
|
under the same terms as Perl itself. |
438
|
|
|
|
|
|
|
|
439
|
|
|
|
|
|
|
=cut |
440
|
|
|
|
|
|
|
|
441
|
|
|
|
|
|
|
1; |