line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Lingua::PT::Abbrev; |
2
|
|
|
|
|
|
|
|
3
|
2
|
|
|
2
|
|
31233
|
use warnings; |
|
2
|
|
|
|
|
4
|
|
|
2
|
|
|
|
|
60
|
|
4
|
2
|
|
|
2
|
|
8
|
use strict; |
|
2
|
|
|
|
|
2
|
|
|
2
|
|
|
|
|
1811
|
|
5
|
|
|
|
|
|
|
|
6
|
|
|
|
|
|
|
=encoding ISO-8859-1 |
7
|
|
|
|
|
|
|
|
8
|
|
|
|
|
|
|
=head1 NAME |
9
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
Lingua::PT::Abbrev - An abbreviations dictionary manager for NLP |
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
=head1 VERSION |
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
Version 0.10 |
15
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
=cut |
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
our $VERSION = '0.10'; |
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
# dic => system dict |
21
|
|
|
|
|
|
|
# cdic => custom dict |
22
|
|
|
|
|
|
|
# sdic => session dict |
23
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
=head1 SYNOPSIS |
25
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
use Lingua::PT::Abbrev; |
27
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
my $dic = Lingua::PT::Abbrev->new; |
29
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
my $exp = $dic -> expand("sr"); |
31
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
$text = $dic -> text_expand($text); |
33
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
=head1 ABSTRACT |
35
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
This module handles a built-in abbreviations dictionary, and a user |
37
|
|
|
|
|
|
|
customized abbreviations dictionary. It provides handy functions for |
38
|
|
|
|
|
|
|
NLP processing. |
39
|
|
|
|
|
|
|
|
40
|
|
|
|
|
|
|
=head1 FUNCTIONS |
41
|
|
|
|
|
|
|
|
42
|
|
|
|
|
|
|
=head2 new |
43
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
This is the Lingua::PT::Abbrev dictionaries constructor. You don't |
45
|
|
|
|
|
|
|
need to pass it any parameter, unless you want to maintain a personal |
46
|
|
|
|
|
|
|
dictionary. In that case, pass the path to your personal dictionary |
47
|
|
|
|
|
|
|
file. |
48
|
|
|
|
|
|
|
|
49
|
|
|
|
|
|
|
The dictionary file is a text file, one abbreviation by line, as: |
50
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
sr senhor |
52
|
|
|
|
|
|
|
sra senhora |
53
|
|
|
|
|
|
|
dr doutor |
54
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
=cut |
56
|
|
|
|
|
|
|
|
57
|
|
|
|
|
|
|
sub new { |
58
|
2
|
|
|
2
|
1
|
18
|
my $class = shift; |
59
|
2
|
|
50
|
|
|
10
|
my $custom = shift || undef; |
60
|
2
|
|
|
|
|
6
|
my $self = bless { custom => $custom }, $class; |
61
|
|
|
|
|
|
|
|
62
|
2
|
|
|
|
|
8
|
$self->_load_dictionary; |
63
|
2
|
50
|
|
|
|
6
|
$self->_load_dictionary($custom) if ($custom); |
64
|
|
|
|
|
|
|
|
65
|
2
|
|
|
|
|
4
|
return $self; |
66
|
|
|
|
|
|
|
} |
67
|
|
|
|
|
|
|
|
68
|
|
|
|
|
|
|
sub _load_dictionary { |
69
|
2
|
|
|
2
|
|
3
|
my $self = shift; |
70
|
2
|
|
50
|
|
|
8
|
my $file = shift || undef; |
71
|
|
|
|
|
|
|
|
72
|
2
|
|
|
|
|
3
|
local $_; |
73
|
|
|
|
|
|
|
|
74
|
2
|
50
|
|
|
|
5
|
if ($file) { |
75
|
0
|
0
|
|
|
|
0
|
open C, $file or die; |
76
|
0
|
|
|
|
|
0
|
while() { |
77
|
0
|
|
|
|
|
0
|
chomp; |
78
|
0
|
0
|
|
|
|
0
|
next if m!^\s*$!; |
79
|
0
|
|
|
|
|
0
|
($a,$b) = split /\s+/, lc; |
80
|
0
|
|
|
|
|
0
|
$self->{cdic}{$a} = $b; |
81
|
|
|
|
|
|
|
} |
82
|
0
|
|
|
|
|
0
|
close C; |
83
|
|
|
|
|
|
|
} else { |
84
|
2
|
|
|
|
|
6
|
my $f = _find_file(); |
85
|
2
|
50
|
|
|
|
54
|
open D, $f or die "Cannot open file $f: $!\n"; |
86
|
2
|
|
|
|
|
36
|
while() { |
87
|
216
|
|
|
|
|
171
|
chomp; |
88
|
216
|
|
|
|
|
197
|
s/\s*#.*//; # delete comments |
89
|
216
|
50
|
|
|
|
380
|
next if m!^\s*$!; |
90
|
216
|
|
|
|
|
359
|
($a,$b) = split /\s+/, lc; |
91
|
216
|
|
|
|
|
1349
|
$self->{dic}{$a} = $b; |
92
|
|
|
|
|
|
|
} |
93
|
2
|
|
|
|
|
15
|
close D; |
94
|
|
|
|
|
|
|
} |
95
|
|
|
|
|
|
|
} |
96
|
|
|
|
|
|
|
|
97
|
|
|
|
|
|
|
=head2 expand |
98
|
|
|
|
|
|
|
|
99
|
|
|
|
|
|
|
Given an abbreviation, this method expands it. For expanding |
100
|
|
|
|
|
|
|
abbreviations in a text use C<>, a lot faster. |
101
|
|
|
|
|
|
|
|
102
|
|
|
|
|
|
|
Returns undef if the abbreviation is not known. |
103
|
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
=cut |
105
|
|
|
|
|
|
|
|
106
|
|
|
|
|
|
|
sub expand { |
107
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
108
|
0
|
|
|
|
|
0
|
my $abbrev = lc(shift); |
109
|
0
|
|
|
|
|
0
|
$abbrev =~ s!\.$!!; |
110
|
0
|
|
0
|
|
|
0
|
return $self->_expand($abbrev) || undef; |
111
|
|
|
|
|
|
|
} |
112
|
|
|
|
|
|
|
|
113
|
|
|
|
|
|
|
sub _exists { |
114
|
0
|
|
|
0
|
|
0
|
my $self = shift; |
115
|
0
|
|
|
|
|
0
|
my $word = shift; |
116
|
0
|
|
0
|
|
|
0
|
return exists($self->{dic}{$word}) || |
117
|
|
|
|
|
|
|
exists($self->{cdic}{$word}) || |
118
|
|
|
|
|
|
|
exists($self->{sdic}{$word}) |
119
|
|
|
|
|
|
|
} |
120
|
|
|
|
|
|
|
|
121
|
|
|
|
|
|
|
sub _expand { |
122
|
0
|
|
|
0
|
|
0
|
my $self = shift; |
123
|
0
|
|
|
|
|
0
|
my $word = shift; |
124
|
0
|
|
0
|
|
|
0
|
return $self->{sdic}{$word} || |
125
|
|
|
|
|
|
|
$self->{cdic}{$word} || |
126
|
|
|
|
|
|
|
$self->{dic}{$word}; |
127
|
|
|
|
|
|
|
} |
128
|
|
|
|
|
|
|
|
129
|
|
|
|
|
|
|
=head2 text_expand |
130
|
|
|
|
|
|
|
|
131
|
|
|
|
|
|
|
Given a text, this method expands all known abbreviations |
132
|
|
|
|
|
|
|
|
133
|
|
|
|
|
|
|
=cut |
134
|
|
|
|
|
|
|
|
135
|
|
|
|
|
|
|
sub text_expand { |
136
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
137
|
0
|
|
|
|
|
0
|
my $text = shift; |
138
|
|
|
|
|
|
|
|
139
|
0
|
|
|
|
|
0
|
$text =~ s{((\w+)\.)}{ |
140
|
0
|
0
|
|
|
|
0
|
$self->_expand(lc($2))||$1 |
141
|
|
|
|
|
|
|
}eg; |
142
|
|
|
|
|
|
|
|
143
|
0
|
|
|
|
|
0
|
return $text; |
144
|
|
|
|
|
|
|
} |
145
|
|
|
|
|
|
|
|
146
|
|
|
|
|
|
|
=head2 add |
147
|
|
|
|
|
|
|
|
148
|
|
|
|
|
|
|
Use this method to add an abbreviation to your current dictionary. |
149
|
|
|
|
|
|
|
|
150
|
|
|
|
|
|
|
$abrev -> add( "dr" => "Doutor" ); |
151
|
|
|
|
|
|
|
|
152
|
|
|
|
|
|
|
=cut |
153
|
|
|
|
|
|
|
|
154
|
|
|
|
|
|
|
sub add { |
155
|
0
|
|
|
0
|
1
|
0
|
my ($self,$abr,$exp) = @_; |
156
|
0
|
0
|
0
|
|
|
0
|
return undef unless $abr and $exp; |
157
|
0
|
|
|
|
|
0
|
$self->{cdic}{lc($abr)} = lc($exp); |
158
|
|
|
|
|
|
|
} |
159
|
|
|
|
|
|
|
|
160
|
|
|
|
|
|
|
=head2 session_add |
161
|
|
|
|
|
|
|
|
162
|
|
|
|
|
|
|
Use this method to add an abbreviation to your session dictionary. |
163
|
|
|
|
|
|
|
|
164
|
|
|
|
|
|
|
$abrev -> session_add( "dr" => "Doutor" ); |
165
|
|
|
|
|
|
|
|
166
|
|
|
|
|
|
|
=cut |
167
|
|
|
|
|
|
|
|
168
|
|
|
|
|
|
|
sub session_add { |
169
|
0
|
|
|
0
|
1
|
0
|
my ($self,$abr,$exp) = @_; |
170
|
0
|
0
|
0
|
|
|
0
|
return undef unless $abr and $exp; |
171
|
0
|
|
|
|
|
0
|
$self->{sdic}{lc($abr)} = lc($exp); |
172
|
|
|
|
|
|
|
} |
173
|
|
|
|
|
|
|
|
174
|
|
|
|
|
|
|
=head2 save |
175
|
|
|
|
|
|
|
|
176
|
|
|
|
|
|
|
This method saves the custom dictionary |
177
|
|
|
|
|
|
|
|
178
|
|
|
|
|
|
|
=cut |
179
|
|
|
|
|
|
|
|
180
|
|
|
|
|
|
|
sub save { |
181
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
182
|
0
|
0
|
|
|
|
0
|
open DIC, ">$self->{custom}" or die; |
183
|
0
|
|
|
|
|
0
|
for (keys %{$self->{cdic}}) { |
|
0
|
|
|
|
|
0
|
|
184
|
0
|
|
|
|
|
0
|
print DIC "$_ $self->{cdic}{$_}\n"; |
185
|
|
|
|
|
|
|
} |
186
|
0
|
|
|
|
|
0
|
close DIC; |
187
|
|
|
|
|
|
|
} |
188
|
|
|
|
|
|
|
|
189
|
|
|
|
|
|
|
|
190
|
|
|
|
|
|
|
=head2 regexp |
191
|
|
|
|
|
|
|
|
192
|
|
|
|
|
|
|
This method returns a regular expression matching all abbreviations. |
193
|
|
|
|
|
|
|
Pass as option an hash table for configuration. |
194
|
|
|
|
|
|
|
|
195
|
|
|
|
|
|
|
The key C<> is used to define a regular expression not |
196
|
|
|
|
|
|
|
containing the final dot. |
197
|
|
|
|
|
|
|
|
198
|
|
|
|
|
|
|
=cut |
199
|
|
|
|
|
|
|
|
200
|
|
|
|
|
|
|
sub regexp { |
201
|
1
|
|
|
1
|
1
|
6
|
my $self = shift; |
202
|
1
|
|
|
|
|
2
|
my %conf = @_; |
203
|
1
|
50
|
|
|
|
3
|
if ($conf{nodot}) { |
204
|
0
|
|
|
|
|
0
|
my $re = "(?:".join("|",keys %{$self->{dic}}, keys %{$self->{cdic}}, keys %{$self->{sdic}}).")"; |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
205
|
0
|
|
|
|
|
0
|
return qr/$re/i; |
206
|
|
|
|
|
|
|
} else { |
207
|
1
|
|
|
|
|
2
|
my $re = "(?:".join("|",keys %{$self->{dic}}, keys %{$self->{cdic}}, keys %{$self->{sdic}}).")\\."; |
|
1
|
|
|
|
|
12
|
|
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
11
|
|
208
|
1
|
|
|
|
|
258
|
return qr/$re/i; |
209
|
|
|
|
|
|
|
} |
210
|
|
|
|
|
|
|
} |
211
|
|
|
|
|
|
|
|
212
|
|
|
|
|
|
|
|
213
|
|
|
|
|
|
|
sub _find_file { |
214
|
2
|
|
|
2
|
|
6
|
my @files = grep { -e $_ } map { "$_/Lingua/PT/Abbrev/abbrev.dat" } @INC; |
|
22
|
|
|
|
|
248
|
|
|
22
|
|
|
|
|
29
|
|
215
|
2
|
|
|
|
|
7
|
return $files[0]; |
216
|
|
|
|
|
|
|
} |
217
|
|
|
|
|
|
|
|
218
|
|
|
|
|
|
|
=head1 AUTHOR |
219
|
|
|
|
|
|
|
|
220
|
|
|
|
|
|
|
Alberto Simões, C<< >> |
221
|
|
|
|
|
|
|
|
222
|
|
|
|
|
|
|
=head1 BUGS |
223
|
|
|
|
|
|
|
|
224
|
|
|
|
|
|
|
Please report any bugs or feature requests to |
225
|
|
|
|
|
|
|
C, or through the web interface at |
226
|
|
|
|
|
|
|
L. I will be notified, and then you'll automatically |
227
|
|
|
|
|
|
|
be notified of progress on your bug as I make changes. |
228
|
|
|
|
|
|
|
|
229
|
|
|
|
|
|
|
=head1 COPYRIGHT & LICENSE |
230
|
|
|
|
|
|
|
|
231
|
|
|
|
|
|
|
Copyright 2004-2005 Alberto Simões, All Rights Reserved. |
232
|
|
|
|
|
|
|
|
233
|
|
|
|
|
|
|
This program is free software; you can redistribute it and/or modify it |
234
|
|
|
|
|
|
|
under the same terms as Perl itself. |
235
|
|
|
|
|
|
|
|
236
|
|
|
|
|
|
|
=cut |
237
|
|
|
|
|
|
|
|
238
|
|
|
|
|
|
|
1; # End of Lingua::PT::Abbrev |
239
|
|
|
|
|
|
|
|