line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Lingua::PT::Abbrev; |
2
|
|
|
|
|
|
|
|
3
|
2
|
|
|
2
|
|
55767
|
use warnings; |
|
2
|
|
|
|
|
5
|
|
|
2
|
|
|
|
|
67
|
|
4
|
2
|
|
|
2
|
|
11
|
use strict; |
|
2
|
|
|
|
|
4
|
|
|
2
|
|
|
|
|
2675
|
|
5
|
|
|
|
|
|
|
|
6
|
|
|
|
|
|
|
=encoding ISO-8859-1 |
7
|
|
|
|
|
|
|
|
8
|
|
|
|
|
|
|
=head1 NAME |
9
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
Lingua::PT::Abbrev - An abbreviations dictionary manager for NLP |
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
=head1 VERSION |
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
Version 0.09 |
15
|
|
|
|
|
|
|
|
16
|
|
|
|
|
|
|
=cut |
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
our $VERSION = '0.09'; |
19
|
|
|
|
|
|
|
|
20
|
|
|
|
|
|
|
# dic => system dict |
21
|
|
|
|
|
|
|
# cdic => custom dict |
22
|
|
|
|
|
|
|
# sdic => session dict |
23
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
=head1 SYNOPSIS |
25
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
use Lingua::PT::Abbrev; |
27
|
|
|
|
|
|
|
|
28
|
|
|
|
|
|
|
my $dic = Lingua::PT::Abbrev->new; |
29
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
my $exp = $dic -> expand("sr"); |
31
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
$text = $dic -> text_expand($text); |
33
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
=head1 ABSTRACT |
35
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
This module handles a built-in abbreviations dictionary, and a user |
37
|
|
|
|
|
|
|
customized abbreviations dictionary. It provides handy functions for |
38
|
|
|
|
|
|
|
NLP processing. |
39
|
|
|
|
|
|
|
|
40
|
|
|
|
|
|
|
=head1 FUNCTIONS |
41
|
|
|
|
|
|
|
|
42
|
|
|
|
|
|
|
=head2 new |
43
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
This is the Lingua::PT::Abbrev dictionaries constructor. You don't |
45
|
|
|
|
|
|
|
need to pass it any parameter, unless you want to maintain a personal |
46
|
|
|
|
|
|
|
dictionary. In that case, pass the path to your personal dictionary |
47
|
|
|
|
|
|
|
file. |
48
|
|
|
|
|
|
|
|
49
|
|
|
|
|
|
|
The dictionary file is a text file, one abbreviation by line, as: |
50
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
sr senhor |
52
|
|
|
|
|
|
|
sra senhora |
53
|
|
|
|
|
|
|
dr doutor |
54
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
=cut |
56
|
|
|
|
|
|
|
|
57
|
|
|
|
|
|
|
sub new { |
58
|
2
|
|
|
2
|
1
|
23
|
my $class = shift; |
59
|
2
|
|
50
|
|
|
16
|
my $custom = shift || undef; |
60
|
2
|
|
|
|
|
10
|
my $self = bless { custom => $custom }, $class; |
61
|
|
|
|
|
|
|
|
62
|
2
|
|
|
|
|
13
|
$self->_load_dictionary; |
63
|
2
|
50
|
|
|
|
8
|
$self->_load_dictionary($custom) if ($custom); |
64
|
|
|
|
|
|
|
|
65
|
2
|
|
|
|
|
9
|
return $self; |
66
|
|
|
|
|
|
|
} |
67
|
|
|
|
|
|
|
|
68
|
|
|
|
|
|
|
sub _load_dictionary { |
69
|
2
|
|
|
2
|
|
5
|
my $self = shift; |
70
|
2
|
|
50
|
|
|
12
|
my $file = shift || undef; |
71
|
|
|
|
|
|
|
|
72
|
2
|
50
|
|
|
|
9
|
if ($file) { |
73
|
0
|
0
|
|
|
|
0
|
open C, $file or die; |
74
|
0
|
|
|
|
|
0
|
while() { |
75
|
0
|
|
|
|
|
0
|
chomp; |
76
|
0
|
0
|
|
|
|
0
|
next if m!^\s*$!; |
77
|
0
|
|
|
|
|
0
|
($a,$b) = split /\s+/, lc; |
78
|
0
|
|
|
|
|
0
|
$self->{cdic}{$a} = $b; |
79
|
|
|
|
|
|
|
} |
80
|
0
|
|
|
|
|
0
|
close C; |
81
|
|
|
|
|
|
|
} else { |
82
|
2
|
|
|
|
|
10
|
my $f = _find_file(); |
83
|
2
|
50
|
|
|
|
110
|
open D, $f or die "Cannot open file $f: $!\n"; |
84
|
2
|
|
|
|
|
82
|
while() { |
85
|
216
|
|
|
|
|
273
|
chomp; |
86
|
216
|
|
|
|
|
286
|
s/\s*#.*//; # delete comments |
87
|
216
|
50
|
|
|
|
548
|
next if m!^\s*$!; |
88
|
216
|
|
|
|
|
605
|
($a,$b) = split /\s+/, lc; |
89
|
216
|
|
|
|
|
2075
|
$self->{dic}{$a} = $b; |
90
|
|
|
|
|
|
|
} |
91
|
2
|
|
|
|
|
30
|
close D; |
92
|
|
|
|
|
|
|
} |
93
|
|
|
|
|
|
|
} |
94
|
|
|
|
|
|
|
|
95
|
|
|
|
|
|
|
=head2 expand |
96
|
|
|
|
|
|
|
|
97
|
|
|
|
|
|
|
Given an abbreviation, this method expands it. For expanding |
98
|
|
|
|
|
|
|
abbreviations in a text use C<>, a lot faster. |
99
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
Returns undef if the abbreviation is not known. |
101
|
|
|
|
|
|
|
|
102
|
|
|
|
|
|
|
=cut |
103
|
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
sub expand { |
105
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
106
|
0
|
|
|
|
|
0
|
my $abbrev = lc(shift); |
107
|
0
|
|
|
|
|
0
|
$abbrev =~ s!\.$!!; |
108
|
0
|
|
0
|
|
|
0
|
return $self->_expand($abbrev) || undef; |
109
|
|
|
|
|
|
|
} |
110
|
|
|
|
|
|
|
|
111
|
|
|
|
|
|
|
sub _exists { |
112
|
0
|
|
|
0
|
|
0
|
my $self = shift; |
113
|
0
|
|
|
|
|
0
|
my $word = shift; |
114
|
0
|
|
0
|
|
|
0
|
return exists($self->{dic}{$word}) || |
115
|
|
|
|
|
|
|
exists($self->{cdic}{$word}) || |
116
|
|
|
|
|
|
|
exists($self->{sdic}{$word}) |
117
|
|
|
|
|
|
|
} |
118
|
|
|
|
|
|
|
|
119
|
|
|
|
|
|
|
sub _expand { |
120
|
0
|
|
|
0
|
|
0
|
my $self = shift; |
121
|
0
|
|
|
|
|
0
|
my $word = shift; |
122
|
0
|
|
0
|
|
|
0
|
return $self->{sdic}{$word} || |
123
|
|
|
|
|
|
|
$self->{cdic}{$word} || |
124
|
|
|
|
|
|
|
$self->{dic}{$word}; |
125
|
|
|
|
|
|
|
} |
126
|
|
|
|
|
|
|
|
127
|
|
|
|
|
|
|
=head2 text_expand |
128
|
|
|
|
|
|
|
|
129
|
|
|
|
|
|
|
Given a text, this method expands all known abbreviations |
130
|
|
|
|
|
|
|
|
131
|
|
|
|
|
|
|
=cut |
132
|
|
|
|
|
|
|
|
133
|
|
|
|
|
|
|
sub text_expand { |
134
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
135
|
0
|
|
|
|
|
0
|
my $text = shift; |
136
|
|
|
|
|
|
|
|
137
|
0
|
|
|
|
|
0
|
$text =~ s{((\w+)\.)}{ |
138
|
0
|
0
|
|
|
|
0
|
$self->_expand(lc($2))||$1 |
139
|
|
|
|
|
|
|
}eg; |
140
|
|
|
|
|
|
|
|
141
|
0
|
|
|
|
|
0
|
return $text; |
142
|
|
|
|
|
|
|
} |
143
|
|
|
|
|
|
|
|
144
|
|
|
|
|
|
|
=head2 add |
145
|
|
|
|
|
|
|
|
146
|
|
|
|
|
|
|
Use this method to add an abbreviation to your current dictionary. |
147
|
|
|
|
|
|
|
|
148
|
|
|
|
|
|
|
$abrev -> add( "dr" => "Doutor" ); |
149
|
|
|
|
|
|
|
|
150
|
|
|
|
|
|
|
=cut |
151
|
|
|
|
|
|
|
|
152
|
|
|
|
|
|
|
sub add { |
153
|
0
|
|
|
0
|
1
|
0
|
my ($self,$abr,$exp) = @_; |
154
|
0
|
0
|
0
|
|
|
0
|
return undef unless $abr and $exp; |
155
|
0
|
|
|
|
|
0
|
$self->{cdic}{lc($abr)} = lc($exp); |
156
|
|
|
|
|
|
|
} |
157
|
|
|
|
|
|
|
|
158
|
|
|
|
|
|
|
=head2 session_add |
159
|
|
|
|
|
|
|
|
160
|
|
|
|
|
|
|
Use this method to add an abbreviation to your session dictionary. |
161
|
|
|
|
|
|
|
|
162
|
|
|
|
|
|
|
$abrev -> session_add( "dr" => "Doutor" ); |
163
|
|
|
|
|
|
|
|
164
|
|
|
|
|
|
|
=cut |
165
|
|
|
|
|
|
|
|
166
|
|
|
|
|
|
|
sub session_add { |
167
|
0
|
|
|
0
|
1
|
0
|
my ($self,$abr,$exp) = @_; |
168
|
0
|
0
|
0
|
|
|
0
|
return undef unless $abr and $exp; |
169
|
0
|
|
|
|
|
0
|
$self->{sdic}{lc($abr)} = lc($exp); |
170
|
|
|
|
|
|
|
} |
171
|
|
|
|
|
|
|
|
172
|
|
|
|
|
|
|
=head2 save |
173
|
|
|
|
|
|
|
|
174
|
|
|
|
|
|
|
This method saves the custom dictionary |
175
|
|
|
|
|
|
|
|
176
|
|
|
|
|
|
|
=cut |
177
|
|
|
|
|
|
|
|
178
|
|
|
|
|
|
|
sub save { |
179
|
0
|
|
|
0
|
1
|
0
|
my $self = shift; |
180
|
0
|
0
|
|
|
|
0
|
open DIC, ">$self->{custom}" or die; |
181
|
0
|
|
|
|
|
0
|
for (keys %{$self->{cdic}}) { |
|
0
|
|
|
|
|
0
|
|
182
|
0
|
|
|
|
|
0
|
print DIC "$_ $self->{cdic}{$_}\n"; |
183
|
|
|
|
|
|
|
} |
184
|
0
|
|
|
|
|
0
|
close DIC; |
185
|
|
|
|
|
|
|
} |
186
|
|
|
|
|
|
|
|
187
|
|
|
|
|
|
|
|
188
|
|
|
|
|
|
|
=head2 regexp |
189
|
|
|
|
|
|
|
|
190
|
|
|
|
|
|
|
This method returns a regular expression matching all abbreviations. |
191
|
|
|
|
|
|
|
Pass as option an hash table for configuration. |
192
|
|
|
|
|
|
|
|
193
|
|
|
|
|
|
|
The key C<> is used to define a regular expression not |
194
|
|
|
|
|
|
|
containing the final dot. |
195
|
|
|
|
|
|
|
|
196
|
|
|
|
|
|
|
=cut |
197
|
|
|
|
|
|
|
|
198
|
|
|
|
|
|
|
sub regexp { |
199
|
1
|
|
|
1
|
1
|
6
|
my $self = shift; |
200
|
1
|
|
|
|
|
3
|
my %conf = @_; |
201
|
1
|
50
|
|
|
|
4
|
if ($conf{nodot}) { |
202
|
0
|
|
|
|
|
0
|
my $re = "(?:".join("|",keys %{$self->{dic}}, keys %{$self->{cdic}}, keys %{$self->{sdic}}).")"; |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
203
|
0
|
|
|
|
|
0
|
return qr/$re/i; |
204
|
|
|
|
|
|
|
} else { |
205
|
1
|
|
|
|
|
2
|
my $re = "(?:".join("|",keys %{$self->{dic}}, keys %{$self->{cdic}}, keys %{$self->{sdic}}).")\\."; |
|
1
|
|
|
|
|
17
|
|
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
15
|
|
206
|
1
|
|
|
|
|
502
|
return qr/$re/i; |
207
|
|
|
|
|
|
|
} |
208
|
|
|
|
|
|
|
} |
209
|
|
|
|
|
|
|
|
210
|
|
|
|
|
|
|
|
211
|
|
|
|
|
|
|
sub _find_file { |
212
|
2
|
|
|
2
|
|
8
|
my @files = grep { -e $_ } map { "$_/Lingua/PT/Abbrev/abbrev.dat" } @INC; |
|
22
|
|
|
|
|
621
|
|
|
22
|
|
|
|
|
51
|
|
213
|
2
|
|
|
|
|
9
|
return $files[0]; |
214
|
|
|
|
|
|
|
} |
215
|
|
|
|
|
|
|
|
216
|
|
|
|
|
|
|
=head1 AUTHOR |
217
|
|
|
|
|
|
|
|
218
|
|
|
|
|
|
|
Alberto Simões, C<< >> |
219
|
|
|
|
|
|
|
|
220
|
|
|
|
|
|
|
=head1 BUGS |
221
|
|
|
|
|
|
|
|
222
|
|
|
|
|
|
|
Please report any bugs or feature requests to |
223
|
|
|
|
|
|
|
C, or through the web interface at |
224
|
|
|
|
|
|
|
L. I will be notified, and then you'll automatically |
225
|
|
|
|
|
|
|
be notified of progress on your bug as I make changes. |
226
|
|
|
|
|
|
|
|
227
|
|
|
|
|
|
|
=head1 COPYRIGHT & LICENSE |
228
|
|
|
|
|
|
|
|
229
|
|
|
|
|
|
|
Copyright 2004-2005 Alberto Simões, All Rights Reserved. |
230
|
|
|
|
|
|
|
|
231
|
|
|
|
|
|
|
This program is free software; you can redistribute it and/or modify it |
232
|
|
|
|
|
|
|
under the same terms as Perl itself. |
233
|
|
|
|
|
|
|
|
234
|
|
|
|
|
|
|
=cut |
235
|
|
|
|
|
|
|
|
236
|
|
|
|
|
|
|
1; # End of Lingua::PT::Abbrev |
237
|
|
|
|
|
|
|
|