line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
|
2
|
|
|
|
|
|
|
# Time-stamp: "2004-04-03 20:20:51 ADT" |
3
|
|
|
|
|
|
|
require 5; |
4
|
|
|
|
|
|
|
package Text::Shoebox; |
5
|
4
|
|
|
4
|
|
13572
|
use strict; |
|
4
|
|
|
|
|
8
|
|
|
4
|
|
|
|
|
155
|
|
6
|
4
|
|
|
4
|
|
4489
|
use integer; # we don't need noninteger math in here |
|
4
|
|
|
|
|
195
|
|
|
4
|
|
|
|
|
25
|
|
7
|
4
|
|
|
4
|
|
144
|
use Carp qw(carp croak); |
|
4
|
|
|
|
|
14
|
|
|
4
|
|
|
|
|
306
|
|
8
|
4
|
|
|
4
|
|
24
|
use vars qw(@ISA @EXPORT $Debug $VERSION %p); |
|
4
|
|
|
|
|
13
|
|
|
4
|
|
|
|
|
12959
|
|
9
|
|
|
|
|
|
|
require Exporter; |
10
|
|
|
|
|
|
|
require UNIVERSAL; |
11
|
|
|
|
|
|
|
@ISA = qw(Exporter); |
12
|
|
|
|
|
|
|
@EXPORT = qw(read_sf write_sf are_hw_keys_uniform are_hw_values_unique); |
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
$Debug = 0 unless defined $Debug; |
15
|
|
|
|
|
|
|
$VERSION = "1.02"; |
16
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
=head1 NAME |
18
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
Text::Shoebox - read and write SIL Shoebox Standard Format (.sf) files |
20
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
=head1 SYNOPSIS |
22
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
use Text::Shoebox; |
24
|
|
|
|
|
|
|
my $lex = []; |
25
|
|
|
|
|
|
|
foreach my $file (@ARGV) { |
26
|
|
|
|
|
|
|
read_sf( |
27
|
|
|
|
|
|
|
from_file => $file, into => $lex, |
28
|
|
|
|
|
|
|
) or warn "read from $file failed\n"; |
29
|
|
|
|
|
|
|
} |
30
|
|
|
|
|
|
|
print scalar(@$lex), " entries read.\n"; |
31
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
die "hw field-names differ\n" |
33
|
|
|
|
|
|
|
unless are_hw_keys_uniform($lex); |
34
|
|
|
|
|
|
|
warn "hw field-values aren't unique\n" |
35
|
|
|
|
|
|
|
unless are_hw_values_unique($lex); |
36
|
|
|
|
|
|
|
|
37
|
|
|
|
|
|
|
write_sf(from => $lex, to_file => "merged.sf") |
38
|
|
|
|
|
|
|
or die "Couldn't write to merged.sf: $!"; |
39
|
|
|
|
|
|
|
|
40
|
|
|
|
|
|
|
=head1 DESCRIPTION |
41
|
|
|
|
|
|
|
|
42
|
|
|
|
|
|
|
The Summer Institute of Linguistics (C) makes a |
43
|
|
|
|
|
|
|
piece of free software called "the Linguist's Shoebox", or just |
44
|
|
|
|
|
|
|
"Shoebox" for short. It's a simple database program generally used |
45
|
|
|
|
|
|
|
for making lexicon databases (altho it can also be used for databases |
46
|
|
|
|
|
|
|
of field notes, etc.). |
47
|
|
|
|
|
|
|
|
48
|
|
|
|
|
|
|
Shoebox can export its databases to SF (Standard Format) files, a |
49
|
|
|
|
|
|
|
simple text format. Reading and writing those SF files is what this |
50
|
|
|
|
|
|
|
Perl module, Text::Shoebox, is for. (I have heard that Standard Format |
51
|
|
|
|
|
|
|
predates Shoebox quite a bit, and is used by other programs. If you |
52
|
|
|
|
|
|
|
use SF files with something other than Shoebox, I'd be interested in |
53
|
|
|
|
|
|
|
hearing about it, particularly about whether such files and |
54
|
|
|
|
|
|
|
Text::Shoebox are happily compatible.) |
55
|
|
|
|
|
|
|
|
56
|
|
|
|
|
|
|
=head1 OBJECT-ORIENTED INTERFACE |
57
|
|
|
|
|
|
|
|
58
|
|
|
|
|
|
|
This module provides a functional interface. If you want an |
59
|
|
|
|
|
|
|
object-oriented interface, with a bit more convenience, then see |
60
|
|
|
|
|
|
|
the classes L and L. |
61
|
|
|
|
|
|
|
|
62
|
|
|
|
|
|
|
|
63
|
|
|
|
|
|
|
=head1 FUNCTIONS |
64
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
=over |
66
|
|
|
|
|
|
|
|
67
|
|
|
|
|
|
|
=item $lex_lol = read_sf(...options...) |
68
|
|
|
|
|
|
|
|
69
|
|
|
|
|
|
|
Reads entries in Standard Format from the source specified. If no |
70
|
|
|
|
|
|
|
entries were read, returns false. Otherwise, returns a reference to |
71
|
|
|
|
|
|
|
the array that the entries were added to (which will be a new array, |
72
|
|
|
|
|
|
|
unless the "into" option is set). If there's an I/O error while reading |
73
|
|
|
|
|
|
|
(like if you specify an unreadable file), then this routine dies. |
74
|
|
|
|
|
|
|
|
75
|
|
|
|
|
|
|
The options are: |
76
|
|
|
|
|
|
|
|
77
|
|
|
|
|
|
|
=over |
78
|
|
|
|
|
|
|
|
79
|
|
|
|
|
|
|
=item from_file => STRING |
80
|
|
|
|
|
|
|
|
81
|
|
|
|
|
|
|
This specifies that the source of the SF data is a file, whose |
82
|
|
|
|
|
|
|
filespec is given. |
83
|
|
|
|
|
|
|
|
84
|
|
|
|
|
|
|
=item from_handle => FILEHANDLE |
85
|
|
|
|
|
|
|
|
86
|
|
|
|
|
|
|
This specifies that the source of the SF data is a given filehandle. |
87
|
|
|
|
|
|
|
(Examples of filehandle values: a global filehandle passed either |
88
|
|
|
|
|
|
|
like C<*MYFH{IO}> or C<*MYFH>; or an object value from an IO class like |
89
|
|
|
|
|
|
|
IO::Socket or IO::Handle.) |
90
|
|
|
|
|
|
|
|
91
|
|
|
|
|
|
|
The filehandle isn't closed when all its data is read. |
92
|
|
|
|
|
|
|
|
93
|
|
|
|
|
|
|
=item rs => STRING |
94
|
|
|
|
|
|
|
|
95
|
|
|
|
|
|
|
This specifies that the given string should be used as the record |
96
|
|
|
|
|
|
|
separator (newline string) for the data source being read from. |
97
|
|
|
|
|
|
|
|
98
|
|
|
|
|
|
|
If the SF source is specified by a "from_file" option, and you don't |
99
|
|
|
|
|
|
|
specify an "rs" option, then Text::Shoebox will try guessing the line |
100
|
|
|
|
|
|
|
format of the file by reading the first 2K of the file and looking for |
101
|
|
|
|
|
|
|
a CRLF ("\cm\cj"), an LF ("\cj"), or a CR ("\cm"). If you need to |
102
|
|
|
|
|
|
|
stop it from trying to guess, just stipulate an "rs" value of C<$/>. |
103
|
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
If the SF source is specified by a "from_handle" option, and you don't |
105
|
|
|
|
|
|
|
specify an "rs" option, then Text::Shoebox will just use the value in |
106
|
|
|
|
|
|
|
the Perl variable C<$/> (the global RS value). |
107
|
|
|
|
|
|
|
|
108
|
|
|
|
|
|
|
=item into => ARRAYREF |
109
|
|
|
|
|
|
|
|
110
|
|
|
|
|
|
|
If this option is stipulated, then entries read will be pushed to the |
111
|
|
|
|
|
|
|
end of the array specified. Otherwise the entries will be put into a |
112
|
|
|
|
|
|
|
new array. |
113
|
|
|
|
|
|
|
|
114
|
|
|
|
|
|
|
=back |
115
|
|
|
|
|
|
|
|
116
|
|
|
|
|
|
|
Example use: |
117
|
|
|
|
|
|
|
|
118
|
|
|
|
|
|
|
use Text::Shoebox; |
119
|
|
|
|
|
|
|
my $lexicon = read_sf(from_file => 'foo.sf') |
120
|
|
|
|
|
|
|
|| die "No entries?"; |
121
|
|
|
|
|
|
|
print scalar(@$lexicon), " entries read.\n"; |
122
|
|
|
|
|
|
|
print "First entry has ", |
123
|
|
|
|
|
|
|
@{ $lexicon->[0] } / 2 , " fields.\n"; |
124
|
|
|
|
|
|
|
|
125
|
|
|
|
|
|
|
=cut |
126
|
|
|
|
|
|
|
|
127
|
|
|
|
|
|
|
sub read_sf { |
128
|
21
|
|
|
21
|
1
|
735
|
my(%options) = @_; |
129
|
|
|
|
|
|
|
|
130
|
21
|
|
|
|
|
26
|
my($target); |
131
|
21
|
100
|
|
|
|
56
|
if(exists $options{'into'} ) { |
132
|
12
|
|
|
|
|
25
|
$target = $options{'into'}; |
133
|
|
|
|
|
|
|
} else { |
134
|
9
|
|
|
|
|
15
|
$target = []; |
135
|
|
|
|
|
|
|
} |
136
|
|
|
|
|
|
|
|
137
|
21
|
|
|
|
|
92
|
local $/ = $/; |
138
|
21
|
|
|
|
|
26
|
my($fh, $to_close); |
139
|
21
|
50
|
|
|
|
77
|
if( exists $options{'from_handle'}) { |
|
|
50
|
|
|
|
|
|
140
|
0
|
|
|
|
|
0
|
$fh = $options{'from_handle'}; |
141
|
0
|
0
|
|
|
|
0
|
$/ = $options{'rs'} if exists $options{'rs'}; |
142
|
|
|
|
|
|
|
# otherwise use default $/ value |
143
|
|
|
|
|
|
|
} elsif(exists $options{'from_file'}) { |
144
|
21
|
|
|
|
|
53
|
local *IN; |
145
|
21
|
|
|
|
|
35
|
my $from_file = $options{'from_file'}; |
146
|
21
|
50
|
|
|
|
957
|
open(IN, "<$from_file") or croak "Can't read-open $from_file: $!"; |
147
|
21
|
|
|
|
|
56
|
binmode(IN); |
148
|
21
|
|
|
|
|
41
|
$fh = *IN{IO}; |
149
|
21
|
|
|
|
|
31
|
$to_close = 1; |
150
|
|
|
|
|
|
|
|
151
|
21
|
100
|
|
|
|
51
|
if(exists $options{'rs'}) { |
152
|
11
|
|
|
|
|
37
|
$/ = $options{'rs'}; |
153
|
|
|
|
|
|
|
} else { |
154
|
10
|
|
|
|
|
13
|
my $chunk; |
155
|
10
|
|
|
|
|
297
|
read($fh, $chunk, 2048); # should be plenty long enough! |
156
|
10
|
|
|
|
|
77
|
seek($fh, 0,0); # rewind |
157
|
|
|
|
|
|
|
|
158
|
|
|
|
|
|
|
# All the NL formats I know about... |
159
|
10
|
50
|
33
|
|
|
136
|
if(defined $chunk and $chunk =~ m<(\cm\cj|\cm|\cj)>s) { |
160
|
10
|
|
|
|
|
50
|
$/ = $1; |
161
|
|
|
|
|
|
|
} else { |
162
|
0
|
0
|
|
|
|
0
|
print "Couldn't set \$/ for some reason.\n" if $Debug; |
163
|
|
|
|
|
|
|
# Otherwise let it stand. |
164
|
|
|
|
|
|
|
} |
165
|
|
|
|
|
|
|
} |
166
|
|
|
|
|
|
|
} else { |
167
|
0
|
|
|
|
|
0
|
croak "read_sf needs an option specifying input source"; |
168
|
|
|
|
|
|
|
} |
169
|
|
|
|
|
|
|
|
170
|
21
|
50
|
|
|
|
50
|
print "\$/ is ", unpack("H*", $/), "\n" if $Debug; |
171
|
|
|
|
|
|
|
|
172
|
|
|
|
|
|
|
#my $lines_so_far = 0; |
173
|
21
|
|
|
|
|
28
|
my $line; # current line |
174
|
|
|
|
|
|
|
my $hw_field; # set from the first real field name we see |
175
|
0
|
|
|
|
|
0
|
my @new_entries; # to fill up with new things from this file |
176
|
21
|
|
|
|
|
25
|
my $Debug = $Debug; # lexical for speed |
177
|
21
|
|
|
|
|
26
|
my $last_field_was_comment = 0; |
178
|
|
|
|
|
|
|
|
179
|
21
|
|
|
|
|
248
|
while(defined($line = <$fh>)) { |
180
|
210
|
|
|
|
|
328
|
chomp($line); |
181
|
|
|
|
|
|
|
#next if !defined($hw_field) and |
182
|
|
|
|
|
|
|
## ++$lines_so_far == 1 and |
183
|
210
|
50
|
66
|
|
|
757
|
if(length $line > 1 and substr($line,0,2) eq '\_') { |
184
|
0
|
|
|
|
|
0
|
$last_field_was_comment = 1; |
185
|
0
|
|
|
|
|
0
|
next; |
186
|
|
|
|
|
|
|
} |
187
|
|
|
|
|
|
|
|
188
|
210
|
100
|
|
|
|
551
|
if($line =~ m<^\\(\S+) ?(.*)>s) { # It's a normal "\foo val" line... |
189
|
|
|
|
|
|
|
# This is the typical line in typical .sf files |
190
|
|
|
|
|
|
|
|
191
|
|
|
|
|
|
|
# That RE matches either "\foo" or "\foo bar..." |
192
|
|
|
|
|
|
|
# (Because the \S+ stops either at end-of-string, or at ' '. |
193
|
|
|
|
|
|
|
# Note that in either case, $2 is never undef. |
194
|
|
|
|
|
|
|
|
195
|
105
|
50
|
|
|
|
197
|
print "<$line> parses as <$1> = <$2>\n" if $Debug > 1; |
196
|
105
|
|
|
|
|
108
|
$last_field_was_comment = 0; |
197
|
105
|
100
|
|
|
|
172
|
if(@new_entries) { |
198
|
84
|
100
|
|
|
|
162
|
if($1 eq $hw_field) { # it's a non-initial new entry |
199
|
|
|
|
|
|
|
# A new entry means that the preceding entry's last value got |
200
|
|
|
|
|
|
|
# one too many \n's at the end. So chop it. |
201
|
|
|
|
|
|
|
# (Assumes "\n" is a single byte long; safe, I hope.) |
202
|
21
|
50
|
|
|
|
69
|
chop($new_entries[-1][-1]) |
203
|
|
|
|
|
|
|
if substr($new_entries[-1][-1], -1, 1) eq "\n"; |
204
|
|
|
|
|
|
|
|
205
|
|
|
|
|
|
|
# Start off a new entry |
206
|
21
|
|
|
|
|
116
|
push @new_entries, [$1, $2]; |
207
|
|
|
|
|
|
|
} else { |
208
|
63
|
|
|
|
|
61
|
push @{$new_entries[-1]}, $1, $2; |
|
63
|
|
|
|
|
380
|
|
209
|
|
|
|
|
|
|
# This is all that happens to typical lines: |
210
|
|
|
|
|
|
|
# Just tack more items to the end of the last entry. |
211
|
|
|
|
|
|
|
} |
212
|
|
|
|
|
|
|
} else { # No entries seen yet |
213
|
21
|
|
|
|
|
42
|
$hw_field = $1; |
214
|
|
|
|
|
|
|
# First field ever seen (ignoring _sh). |
215
|
|
|
|
|
|
|
# That must be the headword field! Note it as such. |
216
|
|
|
|
|
|
|
|
217
|
|
|
|
|
|
|
# Now start off a new entry (the first, it so happens) |
218
|
21
|
|
|
|
|
145
|
push @new_entries, [$1, $2]; |
219
|
|
|
|
|
|
|
} |
220
|
|
|
|
|
|
|
|
221
|
|
|
|
|
|
|
} else { # It's a continuation line... |
222
|
105
|
50
|
|
|
|
190
|
next if $last_field_was_comment; # just toss this. |
223
|
|
|
|
|
|
|
|
224
|
105
|
50
|
|
|
|
159
|
print "<$line> is a continuation line.\n" if $Debug > 1; |
225
|
105
|
50
|
|
|
|
163
|
if(@new_entries) { # expected case! |
226
|
105
|
|
|
|
|
122
|
$line =~ s<^ \\><\\>s; |
227
|
|
|
|
|
|
|
# Continuations starting with '\' get a leading space put on |
228
|
|
|
|
|
|
|
# the front them -- so take it off. (Even tho it could have |
229
|
|
|
|
|
|
|
# originated as a real ' \'.) |
230
|
|
|
|
|
|
|
|
231
|
105
|
|
|
|
|
533
|
$new_entries[-1][-1] .= "\n" . $line; |
232
|
|
|
|
|
|
|
# So, yes, newline in the file ($/) turns into "\n". |
233
|
|
|
|
|
|
|
# Tack this line onto the end of the last value in the last new entry |
234
|
|
|
|
|
|
|
|
235
|
|
|
|
|
|
|
} else { # most unexpected -- continuation of nothing! |
236
|
0
|
0
|
|
|
|
0
|
warn "line \"$line\" is a continuation, but of what?" |
237
|
|
|
|
|
|
|
if $line =~ m<\S>s; |
238
|
|
|
|
|
|
|
# (but forgive such things if they're pure whitespace) |
239
|
|
|
|
|
|
|
} |
240
|
|
|
|
|
|
|
} # end of continuation line |
241
|
|
|
|
|
|
|
} # end while() over the lines |
242
|
|
|
|
|
|
|
|
243
|
21
|
50
|
|
|
|
402
|
close($fh) if $to_close; |
244
|
|
|
|
|
|
|
|
245
|
21
|
50
|
|
|
|
56
|
print "All read: ", scalar(@new_entries), " entries read.\n" if $Debug; |
246
|
|
|
|
|
|
|
|
247
|
21
|
50
|
|
|
|
84
|
return 0 unless @new_entries; |
248
|
|
|
|
|
|
|
|
249
|
21
|
|
|
|
|
42
|
push @$target, @new_entries; |
250
|
|
|
|
|
|
|
|
251
|
21
|
|
|
|
|
117
|
return $target; |
252
|
|
|
|
|
|
|
} |
253
|
|
|
|
|
|
|
|
254
|
|
|
|
|
|
|
#-------------------------------------------------------------------------- |
255
|
|
|
|
|
|
|
|
256
|
|
|
|
|
|
|
=item write_sf(...options...) |
257
|
|
|
|
|
|
|
|
258
|
|
|
|
|
|
|
This writes the given lexicon, in Standard Format, to the destination |
259
|
|
|
|
|
|
|
specified. If all entries were written, returns true; otherwise (in |
260
|
|
|
|
|
|
|
case of an IO error), returns false, in which case you should |
261
|
|
|
|
|
|
|
check C<$!>. Note that this routine I die in the case of |
262
|
|
|
|
|
|
|
an I/O error, so you should always check the return value of this |
263
|
|
|
|
|
|
|
function, as with: |
264
|
|
|
|
|
|
|
|
265
|
|
|
|
|
|
|
write_sf(...) || die "Couldn't write: $!"; |
266
|
|
|
|
|
|
|
|
267
|
|
|
|
|
|
|
The options are: |
268
|
|
|
|
|
|
|
|
269
|
|
|
|
|
|
|
=over |
270
|
|
|
|
|
|
|
|
271
|
|
|
|
|
|
|
=item from => ARRAYREF |
272
|
|
|
|
|
|
|
|
273
|
|
|
|
|
|
|
This option must be present, to specify the lexicon that you want to |
274
|
|
|
|
|
|
|
write out. |
275
|
|
|
|
|
|
|
|
276
|
|
|
|
|
|
|
=item to_file => STRING |
277
|
|
|
|
|
|
|
|
278
|
|
|
|
|
|
|
This specifies that the SF data is to be written to the file |
279
|
|
|
|
|
|
|
specified. (Note that the file is opened in overwrite mode, not |
280
|
|
|
|
|
|
|
append mode.) |
281
|
|
|
|
|
|
|
|
282
|
|
|
|
|
|
|
=item to_handle => FILEHANDLE |
283
|
|
|
|
|
|
|
|
284
|
|
|
|
|
|
|
This specifies that the destination for the SF data is the given |
285
|
|
|
|
|
|
|
filehandle. |
286
|
|
|
|
|
|
|
|
287
|
|
|
|
|
|
|
The filehandle isn't closed when all the data is written to it. |
288
|
|
|
|
|
|
|
|
289
|
|
|
|
|
|
|
=item rs => STRING |
290
|
|
|
|
|
|
|
|
291
|
|
|
|
|
|
|
This specifies that the given string should be used as the record |
292
|
|
|
|
|
|
|
separator (newline string) for the SF data written. |
293
|
|
|
|
|
|
|
|
294
|
|
|
|
|
|
|
If not specified, defaults to "\n". |
295
|
|
|
|
|
|
|
|
296
|
|
|
|
|
|
|
=back |
297
|
|
|
|
|
|
|
|
298
|
|
|
|
|
|
|
=cut |
299
|
|
|
|
|
|
|
|
300
|
|
|
|
|
|
|
sub write_sf { |
301
|
11
|
|
|
11
|
1
|
1183
|
my(%options) = @_; |
302
|
11
|
|
|
|
|
15
|
my $from; |
303
|
11
|
50
|
|
|
|
28
|
if(exists $options{'from'}) { |
304
|
11
|
|
|
|
|
21
|
$from = $options{'from'}; |
305
|
|
|
|
|
|
|
} else { |
306
|
0
|
0
|
0
|
|
|
0
|
croak("'from' should be a reference") |
307
|
|
|
|
|
|
|
unless defined $from and ref $from; |
308
|
|
|
|
|
|
|
} |
309
|
|
|
|
|
|
|
|
310
|
11
|
|
|
|
|
15
|
my($fh, $to_close); |
311
|
11
|
50
|
|
|
|
39
|
if(exists $options{'to_handle'}) { |
|
|
50
|
|
|
|
|
|
312
|
0
|
|
|
|
|
0
|
$fh = $options{'to_handle'}; |
313
|
0
|
0
|
|
|
|
0
|
print "Writing to $fh from object $from\n" if $Debug; |
314
|
|
|
|
|
|
|
} elsif(exists $options{'to_file'}) { |
315
|
|
|
|
|
|
|
# passed a filespec |
316
|
11
|
|
|
|
|
28
|
local *OUT; |
317
|
11
|
|
|
|
|
20
|
my $dest = $options{'to_file'}; |
318
|
11
|
50
|
|
|
|
27
|
print "Writing to $dest from object $from\n" if $Debug; |
319
|
11
|
50
|
|
|
|
1313
|
open(OUT, ">$dest") or return 0; |
320
|
11
|
|
|
|
|
30
|
$fh = *OUT{IO}; |
321
|
11
|
|
|
|
|
51
|
binmode($fh); |
322
|
|
|
|
|
|
|
} else { |
323
|
0
|
|
|
|
|
0
|
croak "write_sf needs either a to_handle or a to_file option"; |
324
|
|
|
|
|
|
|
} |
325
|
|
|
|
|
|
|
|
326
|
11
|
|
|
|
|
28
|
my $nl; |
327
|
11
|
100
|
|
|
|
36
|
if(exists $options{'rs'}) { |
328
|
9
|
|
|
|
|
17
|
$nl = $options{'rs'}; |
329
|
|
|
|
|
|
|
# Some sanity checks: |
330
|
9
|
50
|
|
|
|
24
|
croak "rs can't be undef" unless defined $nl; |
331
|
9
|
50
|
|
|
|
21
|
croak "rs can't be empty-string" unless length $nl; |
332
|
9
|
50
|
|
|
|
22
|
croak "rs can't be a reference" if ref $nl; |
333
|
|
|
|
|
|
|
} else { |
334
|
2
|
|
|
|
|
5
|
$nl = "\n"; |
335
|
|
|
|
|
|
|
} |
336
|
|
|
|
|
|
|
|
337
|
11
|
|
|
|
|
24
|
my $qnl = quotemeta $nl; |
338
|
|
|
|
|
|
|
|
339
|
11
|
|
|
|
|
28
|
my $nl_is_weird = 0; |
340
|
11
|
100
|
|
|
|
63
|
$nl_is_weird = 1 unless $nl =~ m<^[\cm\cj]+$>s; |
341
|
|
|
|
|
|
|
|
342
|
11
|
|
|
|
|
18
|
my $am_first_entry = 1; |
343
|
11
|
|
|
|
|
14
|
my($k, $v, $i, $i_entry, $e); # scratch vars |
344
|
|
|
|
|
|
|
|
345
|
|
|
|
|
|
|
Entry: |
346
|
11
|
|
|
|
|
51
|
for($i_entry = 0; $i_entry < @$from; ++$i_entry) { |
347
|
22
|
50
|
66
|
|
|
166
|
unless(defined( |
|
|
|
33
|
|
|
|
|
348
|
|
|
|
|
|
|
$e = $from->[$i_entry] # copy the entry ref |
349
|
|
|
|
|
|
|
) and ( |
350
|
|
|
|
|
|
|
ref $e eq 'ARRAY' |
351
|
|
|
|
|
|
|
or UNIVERSAL::isa($e, 'ARRAY') |
352
|
|
|
|
|
|
|
) |
353
|
|
|
|
|
|
|
) { |
354
|
0
|
0
|
|
|
|
0
|
print "Skipping $e -- not an entry\n" if $Debug; |
355
|
0
|
|
|
|
|
0
|
Carp::cluck "Skipping $e -- not an entry"; |
356
|
0
|
|
|
|
|
0
|
next Entry; |
357
|
|
|
|
|
|
|
} |
358
|
22
|
50
|
|
|
|
57
|
unless(@$e) { |
359
|
0
|
0
|
|
|
|
0
|
print "Skipping $e -- a null entry\n" if $Debug; |
360
|
0
|
|
|
|
|
0
|
next Entry; |
361
|
|
|
|
|
|
|
} |
362
|
|
|
|
|
|
|
|
363
|
22
|
100
|
|
|
|
37
|
if($am_first_entry) { |
364
|
11
|
|
|
|
|
17
|
$am_first_entry = undef; # do nothing but turn it off. |
365
|
|
|
|
|
|
|
} else { # print a NL before every entry except the first |
366
|
11
|
50
|
|
|
|
47
|
return 0 unless print $fh $nl; |
367
|
|
|
|
|
|
|
} |
368
|
|
|
|
|
|
|
|
369
|
|
|
|
|
|
|
Field: |
370
|
22
|
|
|
|
|
64
|
for($i = 0; $i < @$e; $i += 2) { # iterate across keys |
371
|
55
|
50
|
33
|
|
|
254
|
unless(defined( |
372
|
|
|
|
|
|
|
$k = $e->[$i] # copy the key |
373
|
|
|
|
|
|
|
) and length $k |
374
|
|
|
|
|
|
|
) { |
375
|
0
|
|
|
|
|
0
|
next Field; |
376
|
|
|
|
|
|
|
} |
377
|
|
|
|
|
|
|
|
378
|
55
|
100
|
|
|
|
84
|
if($nl_is_weird) { |
379
|
10
|
|
|
|
|
43
|
$k =~ s<$qnl><>g; # basic attempt at sanity. |
380
|
10
|
|
|
|
|
15
|
$k =~ tr< ><>d; |
381
|
|
|
|
|
|
|
# Up to the user to keep [\cm\cj\t] out of the keys! |
382
|
|
|
|
|
|
|
} else { |
383
|
45
|
|
|
|
|
77
|
$k =~ tr<\cm\cj\t ><>d; # basic sanity for any sane NL value |
384
|
|
|
|
|
|
|
} |
385
|
|
|
|
|
|
|
|
386
|
55
|
50
|
|
|
|
683
|
unless(length $k) { |
387
|
0
|
0
|
|
|
|
0
|
carp "Key field in lexicon->[ $i_entry ][ $i ] is null!\n" if $Debug; |
388
|
0
|
|
|
|
|
0
|
next Field; |
389
|
|
|
|
|
|
|
} |
390
|
|
|
|
|
|
|
|
391
|
55
|
50
|
|
|
|
147
|
if(defined( |
392
|
|
|
|
|
|
|
$v = $e->[1 + $i] # copy value |
393
|
|
|
|
|
|
|
)) { |
394
|
55
|
100
|
|
|
|
108
|
if(length $v) { |
395
|
44
|
|
|
|
|
67
|
$v =~ s<\n\\><\n \\>g; |
396
|
44
|
100
|
|
|
|
126
|
$v =~ s<\n><$nl>g if $nl ne "\n"; # swap NL |
397
|
|
|
|
|
|
|
} |
398
|
|
|
|
|
|
|
} else { |
399
|
0
|
|
|
|
|
0
|
$v = ''; |
400
|
|
|
|
|
|
|
} |
401
|
|
|
|
|
|
|
|
402
|
55
|
100
|
|
|
|
412
|
return 0 unless # return if there's an error in printing |
|
|
50
|
|
|
|
|
|
403
|
|
|
|
|
|
|
length($v) ? (print $fh "\\", $k, ' ', $v, $nl) # "\foo bar" + NL |
404
|
|
|
|
|
|
|
: (print $fh "\\", $k, $nl) # "\foo" + NL |
405
|
|
|
|
|
|
|
; |
406
|
|
|
|
|
|
|
} |
407
|
|
|
|
|
|
|
} |
408
|
11
|
50
|
|
|
|
39
|
close($fh) if $to_close; |
409
|
11
|
|
|
|
|
644
|
return 1; |
410
|
|
|
|
|
|
|
} |
411
|
|
|
|
|
|
|
|
412
|
|
|
|
|
|
|
#-------------------------------------------------------------------------- |
413
|
|
|
|
|
|
|
|
414
|
|
|
|
|
|
|
=item are_hw_keys_uniform($lol) |
415
|
|
|
|
|
|
|
|
416
|
|
|
|
|
|
|
This function returns true iff all the entries in the lexicon have the |
417
|
|
|
|
|
|
|
same key for their headword fields (i.e., the first field per record). |
418
|
|
|
|
|
|
|
This will always be true if you read the lexicon from one file; but if |
419
|
|
|
|
|
|
|
you read it from several, it's possible that the different files have |
420
|
|
|
|
|
|
|
different keys marking headword fields. |
421
|
|
|
|
|
|
|
|
422
|
|
|
|
|
|
|
=cut |
423
|
|
|
|
|
|
|
|
424
|
|
|
|
|
|
|
sub are_hw_keys_uniform { |
425
|
1
|
50
|
|
1
|
1
|
188
|
carp('Wrong number of arguments to are_hw_keys_uniform'), return 0 |
426
|
|
|
|
|
|
|
unless @_ == 1; |
427
|
1
|
|
|
|
|
3
|
my $lex = $_[0]; |
428
|
1
|
50
|
0
|
|
|
8
|
$Debug && carp('Argument to are_hw_keys_uniform isn\'t a listref'), return 0 |
|
|
|
33
|
|
|
|
|
429
|
|
|
|
|
|
|
unless defined $lex and ref $lex eq 'ARRAY'; |
430
|
1
|
50
|
0
|
|
|
3
|
$Debug && carp('Empty lexicon to are_hw_keys_uniform'), return 0 |
431
|
|
|
|
|
|
|
unless @$lex; |
432
|
|
|
|
|
|
|
|
433
|
1
|
|
|
|
|
3
|
my($hw_key, $e, $i); |
434
|
1
|
|
|
|
|
6
|
for(my $i = 0; $i < @$lex; ++$i) { |
435
|
2
|
50
|
|
|
|
3
|
next unless @{$e = $lex->[$i]}; # just skip null entries, I guess. |
|
2
|
|
|
|
|
6
|
|
436
|
2
|
50
|
0
|
|
|
7
|
$Debug && carp("Entry $i has an undef headword"), return 0 |
437
|
|
|
|
|
|
|
unless defined $e->[0]; |
438
|
2
|
100
|
|
|
|
4
|
if(defined($hw_key)) { |
439
|
1
|
50
|
|
|
|
6
|
if($e->[0] ne $hw_key) { |
440
|
0
|
0
|
|
|
|
0
|
carp("Entry $i\'s hw key \"" . $e->[0] . |
441
|
|
|
|
|
|
|
"\" differs from previous hw key \"$hw_key\"") if $Debug; |
442
|
0
|
|
|
|
|
0
|
return 0; |
443
|
|
|
|
|
|
|
} |
444
|
|
|
|
|
|
|
} else { |
445
|
1
|
|
|
|
|
4
|
$hw_key = $e->[0]; |
446
|
|
|
|
|
|
|
} |
447
|
|
|
|
|
|
|
} |
448
|
1
|
50
|
0
|
|
|
3
|
$Debug && carp("Entry $i\'s hw key \"" . $e->[0]), return 0 |
449
|
|
|
|
|
|
|
unless defined $hw_key; |
450
|
|
|
|
|
|
|
|
451
|
1
|
|
|
|
|
4
|
return 1; # all fine. |
452
|
|
|
|
|
|
|
} |
453
|
|
|
|
|
|
|
|
454
|
|
|
|
|
|
|
#-------------------------------------------------------------------------- |
455
|
|
|
|
|
|
|
|
456
|
|
|
|
|
|
|
=item are_hw_values_unique($lex_lol) |
457
|
|
|
|
|
|
|
|
458
|
|
|
|
|
|
|
This function returns true iff all the headword values in all non-null |
459
|
|
|
|
|
|
|
entries in the lexicon $lol are unique -- i.e., if no two (or more) |
460
|
|
|
|
|
|
|
entries have the same values for their headword fields. I don't know |
461
|
|
|
|
|
|
|
if uniqueness is a requirement for SF lexicons that you'd want to |
462
|
|
|
|
|
|
|
import into Shoebox, but some tasks you put lexicons to might require |
463
|
|
|
|
|
|
|
it. |
464
|
|
|
|
|
|
|
|
465
|
|
|
|
|
|
|
=cut |
466
|
|
|
|
|
|
|
|
467
|
|
|
|
|
|
|
sub are_hw_values_unique { |
468
|
2
|
|
|
2
|
1
|
4
|
my %seen; |
469
|
2
|
|
|
|
|
14
|
foreach my $e (@{$_[0]}) { |
|
2
|
|
|
|
|
5
|
|
470
|
5
|
50
|
66
|
|
|
38
|
return 0 if @$e and $seen{ defined($e->[1]) ? $e->[1] : '' }++; |
|
|
100
|
|
|
|
|
|
471
|
|
|
|
|
|
|
} |
472
|
1
|
|
|
|
|
5
|
return 1; # no duplicates found |
473
|
|
|
|
|
|
|
} |
474
|
|
|
|
|
|
|
|
475
|
|
|
|
|
|
|
#-------------------------------------------------------------------------- |
476
|
|
|
|
|
|
|
%p = ( |
477
|
|
|
|
|
|
|
( map {; (chr($_), sprintf('\x%02X',$_)) } 0.. 255 ), |
478
|
|
|
|
|
|
|
"\a" => '\a', # ding! |
479
|
|
|
|
|
|
|
"\b" => '\b', # BS |
480
|
|
|
|
|
|
|
"\e" => '\e', # ESC |
481
|
|
|
|
|
|
|
"\f" => '\f', # FF |
482
|
|
|
|
|
|
|
"\t" => '\t', # tab |
483
|
|
|
|
|
|
|
"\cm" => '\cm', |
484
|
|
|
|
|
|
|
"\cj" => '\cj', |
485
|
|
|
|
|
|
|
"\n" => '\n', # presumably overrides one of either \cm or \cj |
486
|
|
|
|
|
|
|
'"' => '\"', |
487
|
|
|
|
|
|
|
'\\' => '\\\\', |
488
|
|
|
|
|
|
|
'$' => '\\$', |
489
|
|
|
|
|
|
|
'@' => '\\@', |
490
|
|
|
|
|
|
|
'%' => '\\%', |
491
|
|
|
|
|
|
|
'#' => '\\#', |
492
|
|
|
|
|
|
|
); |
493
|
|
|
|
|
|
|
|
494
|
|
|
|
|
|
|
sub _dump { |
495
|
0
|
|
|
0
|
|
|
my $lol = $_[0]; |
496
|
|
|
|
|
|
|
|
497
|
0
|
|
|
|
|
|
print "[ #", scalar(@$lol), " entries...\n"; |
498
|
|
|
|
|
|
|
|
499
|
0
|
|
|
|
|
|
my $safe; |
500
|
0
|
|
|
|
|
|
my $toggle = 0; |
501
|
0
|
|
|
|
|
|
foreach my $e (@$lol) { |
502
|
0
|
0
|
0
|
|
|
|
next unless defined $e and ref $e and UNIVERSAL::isa($e, 'ARRAY'); |
|
|
|
0
|
|
|
|
|
503
|
0
|
|
|
|
|
|
print " [ "; |
504
|
0
|
|
|
|
|
|
foreach my $v (@$e) { |
505
|
0
|
|
|
|
|
|
($safe = $v) =~ |
506
|
|
|
|
|
|
|
s<([^\x20\x21\x23\x27-\x3F\x41-\x5B\x5D-\x7E\xA1-\xFE])> |
507
|
0
|
|
|
|
|
|
<$p{$1}>eg; |
508
|
0
|
0
|
|
|
|
|
print( |
509
|
|
|
|
|
|
|
($toggle ^= 1) ? qq{"$safe" => } : qq{"$safe", \n } |
510
|
|
|
|
|
|
|
); |
511
|
|
|
|
|
|
|
} |
512
|
0
|
|
|
|
|
|
print "],\n"; |
513
|
|
|
|
|
|
|
} |
514
|
0
|
|
|
|
|
|
print "];\n"; |
515
|
|
|
|
|
|
|
} |
516
|
|
|
|
|
|
|
|
517
|
|
|
|
|
|
|
#-------------------------------------------------------------------------- |
518
|
|
|
|
|
|
|
|
519
|
|
|
|
|
|
|
=back |
520
|
|
|
|
|
|
|
|
521
|
|
|
|
|
|
|
=head1 A NOTE ABOUT VALIDITY |
522
|
|
|
|
|
|
|
|
523
|
|
|
|
|
|
|
I make very few assumptions about what characters can be in a field |
524
|
|
|
|
|
|
|
key in SF files. Just now, I happen to assume they can't start with |
525
|
|
|
|
|
|
|
an underscore (lest they be considered comments), and can't contain |
526
|
|
|
|
|
|
|
any whitespace characters. |
527
|
|
|
|
|
|
|
|
528
|
|
|
|
|
|
|
I make essentially no assumptions about what can be in a field value, |
529
|
|
|
|
|
|
|
except that there can be no newline followed immediately by a |
530
|
|
|
|
|
|
|
backslash. (Any newline-backslash sequence in turned into |
531
|
|
|
|
|
|
|
newline-space-backslash.) |
532
|
|
|
|
|
|
|
|
533
|
|
|
|
|
|
|
You should be aware that Shoebox, or whatever other programs use SF |
534
|
|
|
|
|
|
|
files, may have a I more restricted view of what can be in a |
535
|
|
|
|
|
|
|
field key or value. |
536
|
|
|
|
|
|
|
|
537
|
|
|
|
|
|
|
=head1 SEE ALSO |
538
|
|
|
|
|
|
|
|
539
|
|
|
|
|
|
|
L |
540
|
|
|
|
|
|
|
|
541
|
|
|
|
|
|
|
L |
542
|
|
|
|
|
|
|
|
543
|
|
|
|
|
|
|
=head1 COPYRIGHT |
544
|
|
|
|
|
|
|
|
545
|
|
|
|
|
|
|
Copyright 2000-2004, Sean M. Burke C, all rights |
546
|
|
|
|
|
|
|
reserved. This program is free software; you can redistribute it |
547
|
|
|
|
|
|
|
and/or modify it under the same terms as Perl itself. |
548
|
|
|
|
|
|
|
|
549
|
|
|
|
|
|
|
=head1 AUTHOR |
550
|
|
|
|
|
|
|
|
551
|
|
|
|
|
|
|
Sean M. Burke, C |
552
|
|
|
|
|
|
|
|
553
|
|
|
|
|
|
|
Please contact me if you find that this module is not behaving |
554
|
|
|
|
|
|
|
correctly. I've tested it only on Shoebox files I generate on my own. |
555
|
|
|
|
|
|
|
|
556
|
|
|
|
|
|
|
I hasten to point out, incidentally, that I am not in any way |
557
|
|
|
|
|
|
|
affiliated with the Summer Institute of Linguistics. |
558
|
|
|
|
|
|
|
|
559
|
|
|
|
|
|
|
=cut |
560
|
|
|
|
|
|
|
|
561
|
|
|
|
|
|
|
1; |
562
|
|
|
|
|
|
|
|
563
|
|
|
|
|
|
|
__END__ |