| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package Lingua::PT::ProperNames; |
|
2
|
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
#require Exporter; |
|
4
|
3
|
|
|
3
|
|
102307
|
use locale; |
|
|
3
|
|
|
|
|
502
|
|
|
|
3
|
|
|
|
|
19
|
|
|
5
|
3
|
|
|
3
|
|
2816
|
use IO::String; |
|
|
3
|
|
|
|
|
15654
|
|
|
|
3
|
|
|
|
|
106
|
|
|
6
|
3
|
|
|
3
|
|
28
|
use warnings; |
|
|
3
|
|
|
|
|
12
|
|
|
|
3
|
|
|
|
|
105
|
|
|
7
|
3
|
|
|
3
|
|
14
|
use strict; |
|
|
3
|
|
|
|
|
4
|
|
|
|
3
|
|
|
|
|
155
|
|
|
8
|
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
=encoding ISO-8859-1 |
|
10
|
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
=head1 NAME |
|
12
|
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
Lingua::PT::ProperNames - Simple module to extract proper names from Portuguese Text |
|
14
|
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
=head1 Version |
|
16
|
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
Version 0.10 |
|
18
|
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
=cut |
|
20
|
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
our $VERSION = '0.10'; |
|
22
|
3
|
|
|
3
|
|
16
|
use base 'Exporter'; |
|
|
3
|
|
|
|
|
5
|
|
|
|
3
|
|
|
|
|
1491
|
|
|
23
|
|
|
|
|
|
|
our @EXPORT = qw/getPN printPN printPNstring forPN forPNstring/; |
|
24
|
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
our ($em, $np1, $np, $prof, $sep1, $sep2, %vazia, @stopw); |
|
26
|
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
BEGIN {
    # Compile-time initialisation of the package-level patterns and word
    # lists used by every extraction routine in this module.

    # $np1: a single "name atom" -- a run of dotted initials, one of the
    # abbreviations Sr./Dr./St./Sto./Sta., or a capitalised word optionally
    # extended with apostrophe- or hyphen-joined parts.
    $np1 = qr{(?:(?: [A-ZÈÉÚÓÁÂ][.])+
              | [sS]r[.]
              | [dD]r[.]
              | St[oa]?[.]
              | [A-ZÈÉÚÓÁÂ]\w+(?:[\'\-]\w+)*
              )}x;

    # $np: a full proper name -- one atom followed by any number of further
    # atoms, each optionally introduced by a particle (de/da/do/du(s), d',
    # "de la"/"de les"..., van/von).
    #if ($e) {
    #$np= qr{$np1(?:\s+(?:d[eao]s?\s+|e\s+)?$np1)*};
    #} else {
    $np= qr{$np1
            (?: \s+ (?:d[eaou]s?\s+
                      | d'
                      | de \s+ l[ae]s? \s+
                      | v[oa]n\s+
                      )?
                $np1)*
           }x;
    #}

    # Words that may open a sentence capitalised without being part of a name.
    @stopw = qw{
              no com se em segundo a o os as na nos nas do das dos da tanto
              para de desde mas quando esta sem nem só apenas mesmo até uma uns um
              pela por pelo pelas pelos depois ao sobre como umas já enquanto aos
              também amanhã ontem embora essa nesse olhe hoje não eu ele eles
              primeiro simplesmente era foi é será são seja nosso nossa nossos nossas
              chama-se chamam-se subtitui resta diz salvo disse diz vamos entra entram
              aqui começou lá seu vinham passou quanto sou vi onde este então temos
              num aquele tivemos

              en la pour le
             };

    # Alternation of profession nouns, used when the "prof" option is set.
    $prof = join("|", qw{
                        astrólogo astrónomo advogado actor
                        baterista
                        cantor compositor
                        dramaturgo
                        engenheiro escritor
                        filósofo flautista físico
                        investigador
                        jogador
                        matemático médico ministro músico
                        pianista poeta professor
                        químico
                        teólogo
                       });

    # Connectors allowed between a profession and the name that follows it.
    # qw{} does not understand quotes, so the two-word phrase must be given
    # as an ordinary string; \s+ lets it match across any run of whitespace.
    $sep1 = join("|", 'chamado', 'conhecido\s+como');

    # Adjectives allowed between "Name," and the profession that follows it.
    $sep2 = join("|", qw{brilhante conhecido reputado popular});

    # Hash view of @stopw, to make it easy to test whether a word is a stopword.
    @vazia{@stopw} = (@stopw);

    # Locative prepositions (em, no/na/nos/nas) that introduce place names.
    $em = '\b(?:[Ee]m|[nN][oa]s?)';
}
|
82
|
|
|
|
|
|
|
|
|
83
|
|
|
|
|
|
|
=head1 Synopsis |
|
84
|
|
|
|
|
|
|
|
|
85
|
|
|
|
|
|
|
This module contains simple Perl-based functions to detect and extract |
|
86
|
|
|
|
|
|
|
proper names from Portuguese text. |
|
87
|
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
use Lingua::PT::ProperNames; |
|
89
|
|
|
|
|
|
|
|
|
90
|
|
|
|
|
|
|
|
|
91
|
|
|
|
|
|
|
printPN(@options); |
|
92
|
|
|
|
|
|
|
printPNstring({ %options... } ,$textstring); |
|
93
|
|
|
|
|
|
|
printPNstring([ @options... ] ,$textstring); |
|
94
|
|
|
|
|
|
|
|
|
95
|
|
|
|
|
|
|
forPN( sub{my ($pn, $context)=@_;... } ) ; |
|
96
|
|
|
|
|
|
|
forPN( {t=>"double"}, |
|
97
|
|
|
|
|
|
|
sub{my ($pn, $context)=@_;... }, sub{...} ) ; |
|
98
|
|
|
|
|
|
|
$outstr = forPN($instr, sub{my ($pn, $context)=@_;... }, ... ) ; |
|
99
|
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
forPNstring(sub{my ($pn, $context)=@_;... }, |
|
101
|
|
|
|
|
|
|
$textstring, regsep) ; |
|
102
|
|
|
|
|
|
|
|
|
103
|
|
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
my $pndict = Lingua::PT::ProperNames->new; |
|
105
|
|
|
|
|
|
|
|
|
106
|
|
|
|
|
|
|
=head1 Functions related to ProperNames dictionary |
|
107
|
|
|
|
|
|
|
|
|
108
|
|
|
|
|
|
|
=head2 new |
|
109
|
|
|
|
|
|
|
|
|
110
|
|
|
|
|
|
|
Creates a new ProperNames dictionary |
|
111
|
|
|
|
|
|
|
|
|
112
|
|
|
|
|
|
|
=cut |
|
113
|
|
|
|
|
|
|
|
|
114
|
|
|
|
|
|
|
sub new {
    # Constructor: builds an empty object blessed into the invoking class,
    # fills it through _load_dictionary (called with no file argument) and
    # returns it.
    my ($class) = @_;
    # my $filename = shift;

    my $dictionary = {};
    bless $dictionary, $class;

    $dictionary->_load_dictionary;

    return $dictionary;
}
|
122
|
|
|
|
|
|
|
|
|
123
|
|
|
|
|
|
|
sub _load_dictionary {
    # Populates the object's dictionaries from a file.
    #
    #   $self - the dictionary object being filled
    #   $file - optional path to a custom word list
    #
    # With $file: every non-blank line becomes a key of $self->{cdic}, mapped
    # to itself.  Without it: the file located by _find_file() is read, each
    # non-blank line is split on whitespace into (name, prob, type) and stored
    # as $self->{dic}{name} = { type => ..., prob => ... }.
    #
    # Dies if the file cannot be located or opened.
    my $self = shift;
    my $file = shift || undef;

    if ($file) {
        open my $custom, '<', $file or die "Cannot open file $file: $!\n";
        while (my $line = <$custom>) {
            chomp $line;
            next if $line =~ m!^\s*$!;
            $self->{cdic}{$line} = $line;
        }
        close $custom;
    } else {
        my $f = _find_file();
        # _find_file() yields undef when no directory provides the data file.
        die "Cannot find Lingua/PT/ProperNames/names.dat\n" unless defined $f;
        open my $default, '<', $f or die "Cannot open file $f: $!\n";
        while (my $line = <$default>) {
            chomp $line;
            next if $line =~ m!^\s*$!;
            my ($nome,$prob,$type) = split /\s+/, $line;
            $self->{dic}{$nome} = {type=>$type,prob=>$prob};
        }
        close $default;
    }
}
|
147
|
|
|
|
|
|
|
|
|
148
|
|
|
|
|
|
|
sub _exists {
    # Returns true when $word is a key of any of the object's dictionaries
    # (dic, cdic or sdic), false otherwise.
    #
    # The alternatives are joined with '||' rather than 'or': 'return' binds
    # tighter than 'or', so with 'or' only the first test would ever be
    # returned.
    my ($self, $word) = @_;

    return exists($self->{dic}{$word})
        || exists($self->{cdic}{$word})
        || exists($self->{sdic}{$word});
}
|
155
|
|
|
|
|
|
|
|
|
156
|
|
|
|
|
|
|
=head2 is_name |
|
157
|
|
|
|
|
|
|
|
|
158
|
|
|
|
|
|
|
This method checks if a name exists in the Names dictionary as a Given Name. |
|
159
|
|
|
|
|
|
|
|
|
160
|
|
|
|
|
|
|
=cut |
|
161
|
|
|
|
|
|
|
|
|
162
|
|
|
|
|
|
|
sub is_name {
    # Delegates to _exists with the caller's arguments (object, word) and
    # hands its answer straight back.
    my @lookup = @_;
    return _exists(@lookup);
}
|
165
|
|
|
|
|
|
|
|
|
166
|
|
|
|
|
|
|
=head2 is_surname |
|
167
|
|
|
|
|
|
|
|
|
168
|
|
|
|
|
|
|
This method checks if a name exists in the Names dictionary as a |
|
169
|
|
|
|
|
|
|
Surname. |
|
170
|
|
|
|
|
|
|
|
|
171
|
|
|
|
|
|
|
=cut |
|
172
|
|
|
|
|
|
|
|
|
173
|
|
|
|
|
|
|
sub is_surname {
    # True when the word is known to _exists and _type reports the type
    # "apelido" for it; otherwise the false value produced by whichever
    # check failed.
    my @lookup = @_;

    my $known = _exists(@lookup);
    return $known unless $known;

    return _type(@lookup) eq "apelido";
}
|
176
|
|
|
|
|
|
|
|
|
177
|
|
|
|
|
|
|
sub _type {
    # Returns the 'type' recorded for $word in the first dictionary (dic,
    # then cdic, then sdic) that contains it, or undef when the word is in
    # none of them or its entry carries no type.
    #
    # Entries stored as plain strings rather than hash references have no
    # type field; they yield undef instead of being dereferenced.
    my ($self, $word) = @_;

    for my $dict (qw(dic cdic sdic)) {
        next unless exists($self->{$dict}{$word});
        my $entry = $self->{$dict}{$word};
        return ref($entry) eq 'HASH' ? $entry->{type} : undef;
    }

    # Explicit undef (not a bare return) keeps the original one-value result.
    return undef;
}
|
190
|
|
|
|
|
|
|
|
|
191
|
|
|
|
|
|
|
|
|
192
|
|
|
|
|
|
|
=head1 Detecting Proper Names |
|
193
|
|
|
|
|
|
|
|
|
194
|
|
|
|
|
|
|
=head2 forPN |
|
195
|
|
|
|
|
|
|
|
|
196
|
|
|
|
|
|
|
Substitutes every proper name found on STDIN with the result of calling a user-supplied function |
|
197
|
|
|
|
|
|
|
with arguments ($propername,$context). The result is sent to STDOUT. |
|
198
|
|
|
|
|
|
|
|
|
199
|
|
|
|
|
|
|
Usage: |
|
200
|
|
|
|
|
|
|
|
|
201
|
|
|
|
|
|
|
forPN({options...}, sub{ propername processor...}) |
|
202
|
|
|
|
|
|
|
|
|
203
|
|
|
|
|
|
|
Optionally you can define input or output files: |
|
204
|
|
|
|
|
|
|
|
|
205
|
|
|
|
|
|
|
forPN({in=> "inputfile", out => "outputfile" }, sub{...}) |
|
206
|
|
|
|
|
|
|
|
|
207
|
|
|
|
|
|
|
Also, C<<{t => "double"}>> helps to treat in a special way |
|
208
|
|
|
|
|
|
|
names after punctuation (".", etc). |
|
209
|
|
|
|
|
|
|
With this option you must provide two functions: one for standard proper names |
|
210
|
|
|
|
|
|
|
and one for names after punctuation. |
|
211
|
|
|
|
|
|
|
|
|
212
|
|
|
|
|
|
|
forPN({t=>"double"}, sub{...}, sub{...}) |
|
213
|
|
|
|
|
|
|
|
|
214
|
|
|
|
|
|
|
You can also define the record (paragraph) separator: |
|
215
|
|
|
|
|
|
|
|
|
216
|
|
|
|
|
|
|
forPN({sep=>"\n", t=>"normal"}, sub{...}) ## each line is a par. |
|
217
|
|
|
|
|
|
|
forPN({sep=>""}, sub{...}) ## par. empty lines |
|
218
|
|
|
|
|
|
|
|
|
219
|
|
|
|
|
|
|
=cut |
|
220
|
|
|
|
|
|
|
|
|
221
|
|
|
|
|
|
|
|
|
222
|
|
|
|
|
|
|
sub forPN{
    # Rewrites a text, replacing each proper name with the value returned by
    # a callback invoked as $callback->($propername, $context), where
    # $context is the unmodified input record the name was found in.
    #
    # Arguments, in order:
    #   {options}  optional hash ref:
    #                in  => input file name
    #                out => output file name
    #                sep => input record separator (default "", paragraphs)
    #                t   => "normal" (default) or "double"
    #   $string    optional plain scalar used as the input text
    #   $f         code ref applied to ordinary proper names
    #   $f1        code ref applied to names that follow the start of the
    #              record, a dash or sentence punctuation (t => "double"
    #              only); when it returns undef the first word is left alone
    #              and the rest of the name is handed to $f
    #
    # Input comes from 'in', else from $string, else from STDIN.  Output goes
    # to 'out', else (string input) it is collected and returned, else it is
    # printed to the currently selected handle.
    #
    # Dies when $f (or $f1 in "double" mode) is not a code reference, or when
    # a named file cannot be opened.
    my %opt = (sep => "", t => "normal" );

    %opt = (%opt , %{shift(@_)}) if ref($_[0]) eq "HASH";

    # Left undefined unless a text argument was really supplied, so that the
    # STDIN fallback below can be reached.
    my $instring;
    $instring = shift(@_) if ! ref($_[0]);

    my ($f,$f1) = @_;
    my $m="\x01";                  # marker character wrapped around candidates
    my $old;                       # handle selected before output redirection
    my ($in_fh, $out_fh, $string_out);

    die "invalid parameter to 'forPN'" unless ref($f) eq "CODE";

    if ($opt{t} eq "double") {
        die "invalid parameter ". ref($f1) unless ref($f1) eq "CODE";
    }

    local $/ = $opt{sep};          # input record separator=one or more empty lines

    if (defined $opt{in}) {
        open $in_fh, '<', $opt{in} or die "cant open $opt{in}\n";
    } elsif (defined $instring) {  ## input is a string (1st parameter)
        $in_fh = IO::String->new($instring);
    } else {
        $in_fh = *STDIN;
    }

    if (defined $opt{out}) {
        open $out_fh, '>', $opt{out} or die "cant create $opt{out}\n";
        $old = select($out_fh);
    } elsif (defined $instring) {  ## input is a string (1st parameter)
        $string_out = IO::String->new();
        $old = select($string_out);
    }

    while (<$in_fh>) {
        my $ctx = $_;
        if ($opt{t} eq "double") {

            # 1. wrap every candidate name in marker parentheses
            s{($np)}{$m($1$m)}g;

            # 2. candidates in a doubtful position go to $f1
            s{(^\s*
              | [-]\s+
              | [.!?]\s*
              ) $m\( ($np) $m\)
             }{
               my ($aux1,$aux2,$aux3)= ($1,$2, $f1->($2,$ctx));
               defined($aux3) ? $aux1 . $aux3
                              : $aux1 . _tryright($aux2);
             }xge;

            # 3. every candidate still marked goes to $f
            s{$m\(($np)$m\)}{ $f->($1,$ctx) }ge;

        } else {
            # A name only counts when preceded by a word or by punctuation.
            s{( \w+\s+
              | [\«\»,:()'`"]\s*
              ) ($np)
             }{$1 . $f->($2,$ctx) }xge;
        }
        print;
    }

    close $in_fh if defined $opt{in};

    if (defined $opt{out}) {
        select $old;
        close $out_fh;
    } elsif (defined $instring) {  ## input is a string (1st parameter)
        # Give the caller its output handle back before returning the text.
        select $old;
        return ${$string_out->string_ref()};
    }
}
|
291
|
|
|
|
|
|
|
|
|
292
|
|
|
|
|
|
|
=head2 forPNstring |
|
293
|
|
|
|
|
|
|
|
|
294
|
|
|
|
|
|
|
forPNstring( $funref, "textstring" [, regSeparator] ) |
|
295
|
|
|
|
|
|
|
|
|
296
|
|
|
|
|
|
|
Substitutes every proper name with the result of C<< $funref->($propername, $context) >> in the text string. |
|
297
|
|
|
|
|
|
|
|
|
298
|
|
|
|
|
|
|
=cut |
|
299
|
|
|
|
|
|
|
|
|
300
|
|
|
|
|
|
|
sub forPNstring {
    # Applies a callback to every proper name in a string and returns the
    # rewritten string.
    #
    #   $f    - code ref, called as $f->($propername, $context) where
    #           $context is the untouched piece the name was found in
    #   $text - the text to process
    #   $sep  - separator used to split the text (as a pattern) and appended
    #           again after every piece; defaults to "\n"
    #
    # Dies when $f is not a code reference.
    my $f = shift;
    die "invalid parameter to 'forPNstring': function expected" unless ref($f) eq "CODE";

    my ($text, $sep) = @_;
    $sep ||= "\n";

    my @rebuilt;
    for (split(/$sep/,$text)) {
        my $ctx = $_;
        s/(\w+\s+|[\«\»,()'`i"]\s*)($np)/$1 . $f->($2,$ctx)/ge ;
        push @rebuilt, "$_$sep";
    }

    return join('', @rebuilt);
}
|
313
|
|
|
|
|
|
|
|
|
314
|
|
|
|
|
|
|
=head2 printPNstring |
|
315
|
|
|
|
|
|
|
|
|
316
|
|
|
|
|
|
|
printPNstring("oco") |
|
317
|
|
|
|
|
|
|
|
|
318
|
|
|
|
|
|
|
=cut |
|
319
|
|
|
|
|
|
|
|
|
320
|
|
|
|
|
|
|
sub printPNstring{
    # Prints a report of the proper names found in a text string to the
    # currently selected output handle.
    #
    # Call as printPNstring($text), printPNstring({opt => 1, ...}, $text) or
    # printPNstring([qw(opt ...)], $text).  Options read here:
    #   oco  - list names by decreasing number of occurrences
    #   comp - merge related names through _compacta
    #   prof - also report professions found next to names
    #   em   - also report names that follow em/no/na/nos/nas
    my $text = shift;
    my %opt = ();

    if (ref($text) eq "HASH") { %opt = %$text ; $text = shift; }
    elsif(ref($text) eq "ARRAY"){ @opt{@$text} = @$text; $text = shift; }

    my (%profissao, %names, %namesduv, %gnames);

    for ($text) {
        # chomp, not chop: only a trailing newline may go, never the last
        # character of a text that does not end in one.
        chomp;
        s/\n/ /g;
        # "doubtful" candidates: after sentence punctuation or brackets
        for (m/[.?!:;"]\s+($np1\s+$np)/gxs) { $namesduv{$_}++ }
        for (m![)>(]\s*($np1\s+$np)!gxs)   { $namesduv{$_}++ }
        # ordinary candidates: after a word character, a guillemet or a comma
        for (m/(?:[\w\«\»,]\s+)($np)/gxs)   { $names{$_}++ }
        if ($opt{em}) {
            for (/$em\s+($np)/g) { $gnames{$_}++ }
        }
        if ($opt{prof}) {
            # "<profession> [connector] <Name>"
            while(/\b($prof)\s+(?:(?:$sep1)\s+)?($np)/g)
              { $profissao{$2} = $1 }
            # "<Name>, [adjective] <profession>"
            while(/(?:[\w\«\»,]\s+|[(])($np),\s*(?:(?:$sep2)\s+)?($prof)/g)
              { $profissao{$1} = $2 }
        }
    }

    # handling of "doubtful" names = proper name at the start of a sentence
    #

    for (keys %namesduv) {
        if (/^(\w+)/ && $vazia{lc($1)}) {   # example: "Como Jose Manuel"
            s/^\w+\s*//;                    # drop the first word
            $names{$_}++
        } else {
            $names{$_}++
        }
    }

    for (keys %names) {
        if (/^(\w+)/ && $vazia{lc($1)}) {   # example: "Como Jose Manuel"
            my $ant = $_;
            s/^\w+\s*//;                    # drop the first word
            $names{$_} += $names{$ant};     # fold the count into the short form
            delete $names{$ant}
        }
    }

    if ($opt{oco}) {
        for (sort {$names{$b} <=> $names{$a}} keys %names ) {
            printf("%60s - %d\n", $_ ,$names{$_});
        }
    } else {
        if ($opt{comp}) {
            my @l = sort _compara keys %names;
            _compacta(\%names, @l)
        } else {
            for (sort _compara keys %names ) {
                printf("%60s - %d\n", $_ ,$names{$_});
            }
        }
        if ($opt{prof}) {
            print "\nProfissões\n";
            for (keys %profissao) {
                print "$_ -- $profissao{$_}"
            }
        }
        if ($opt{em}) {
            print "\nGeograficos\n";
            for (sort _compara keys %gnames ) {
                printf("%60s - %d\n", $_ ,$gnames{$_})
            }
        }
    }
}
|
394
|
|
|
|
|
|
|
|
|
395
|
|
|
|
|
|
|
|
|
396
|
|
|
|
|
|
|
=head2 getPN |
|
397
|
|
|
|
|
|
|
|
|
398
|
|
|
|
|
|
|
=cut |
|
399
|
|
|
|
|
|
|
|
|
400
|
|
|
|
|
|
|
sub getPN {
    # Reads the files named in @ARGV (or STDIN) paragraph by paragraph and
    # returns a hash (as a flat list) mapping each proper name found to its
    # number of occurrences.
    #
    # Arguments are option names, each stored as a true flag.  'em' and
    # 'prof' make the routine collect place names and professions as well,
    # but only the names hash is returned.
    local $/ = "";           # input record separator=1 or more empty lines

    my %opt;
    @opt{@_} = @_;
    my (%profissao, %names, %namesduv, %gnames);

    while (<>) {
        # chomp, not chop: in paragraph mode it strips the trailing newlines
        # and never eats the last character of a final paragraph that has no
        # newline after it.
        chomp;
        s/\n/ /g;
        # "doubtful" candidates: after sentence punctuation or brackets
        for (/[.?!:;"]\s+($np1\s+$np)/g) { $namesduv{$_}++;}
        for (/[)>(]\s*($np1\s+$np)/g)    { $namesduv{$_}++;}
        # ordinary candidates: after a word character, a guillemet or a comma
        for (/(?:[\w\«\»,]\s+)($np)/g)    { $names{$_}++;}
        if ($opt{em}) {
            for (/$em\s+($np)/g) { $gnames{$_}++;}
        }
        if ($opt{prof}) {
            while(/\b($prof)\s+(?:(?:$sep1)\s+)?($np)/g)
              { $profissao{$2} = $1 }
            while(/(?:[\w\«\»,]\s+|[(])($np),\s*(?:(?:$sep2)\s+)?($prof)/g)
              { $profissao{$1} = $2 }
        }
    }

    # handling of "doubtful" names = proper name at the start of a sentence
    #

    for (keys %namesduv) {
        if(/^(\w+)/ && $vazia{lc($1)}) {    # example: "Como Jose Manuel"
            s/^\w+\s*//;                    # drop the first word
            $names{$_}++
        } else {
            $names{$_}++
        }
    }
    return (%names)
}
|
436
|
|
|
|
|
|
|
|
|
437
|
|
|
|
|
|
|
|
|
438
|
|
|
|
|
|
|
=head2 printPN |
|
439
|
|
|
|
|
|
|
|
|
440
|
|
|
|
|
|
|
printPN("oco") |
|
441
|
|
|
|
|
|
|
|
|
442
|
|
|
|
|
|
|
printPN - extrai os nomes próprios dum texto. |
|
443
|
|
|
|
|
|
|
-comp junta certos nomes: Fermat + Pierre de Fermat = (Pierre de) Fermat |
|
444
|
|
|
|
|
|
|
-prof |
|
445
|
|
|
|
|
|
|
-e "Sebastiao e Silva" "e" como pertencente a PN |
|
446
|
|
|
|
|
|
|
-em "em Famalicão" como pertencente a PN |
|
447
|
|
|
|
|
|
|
|
|
448
|
|
|
|
|
|
|
|
|
449
|
|
|
|
|
|
|
=cut |
|
450
|
|
|
|
|
|
|
|
|
451
|
|
|
|
|
|
|
sub printPN{
    # Reads the files named in @ARGV (or STDIN) paragraph by paragraph and
    # prints a report of the proper names found to the currently selected
    # output handle.
    #
    # Arguments are option names, each stored as a true flag:
    #   oco  - list names by decreasing number of occurrences
    #   comp - merge related names through _compacta
    #   prof - also report professions found next to names
    #   em   - also report names that follow em/no/na/nos/nas
    local $/ = "";           # input record separator=1 or more empty lines

    my %opt;
    @opt{@_} = @_;
    my (%profissao, %names, %namesduv, %gnames);

    while (<>) {
        # chomp, not chop: in paragraph mode it strips the trailing newlines
        # and never eats the last character of a final paragraph that has no
        # newline after it.
        chomp;
        s/\n/ /g;
        # "doubtful" candidates: after sentence punctuation or brackets
        for (/[.?!:;"]\s+($np1\s+$np)/g) { $namesduv{$_}++ }
        for (/[)>(]\s*($np1\s+$np)/g)    { $namesduv{$_}++ }
        # ordinary candidates: after a word character, a guillemet or a comma
        for (/(?:[\w\«\»,]\s+)($np)/g)    { $names{$_}++ }
        if ($opt{em}) {
            for (/$em\s+($np)/g) { $gnames{$_}++ }
        }
        if ($opt{prof}) {
            while(/\b($prof)\s+(?:(?:$sep1)\s+)?($np)/g)
              { $profissao{$2} = $1 }
            while(/(?:[\w\«\»,]\s+|[(])($np),\s*(?:(?:$sep2)\s+)?($prof)/g)
              { $profissao{$1} = $2 }
        }
    }

    # handling of "doubtful" names = proper name at the start of a sentence
    #

    for (keys %namesduv){
        if(/^(\w+)/ && $vazia{lc($1)} )     # example: "Como Jose Manuel"
        {s/^\w+\s*//;                       # drop the first word
         $names{$_}++;}
        else
        { $names{$_}++;}
    }

    ##### Not quite sure whether this is useful...

    for (keys %names){
        if(/^(\w+)/ && $vazia{lc($1)} )     # example: "Como Jose Manuel"
        { my $ant = $_;
          s/^\w+\s*//;                      # drop the first word
          $names{$_}+=$names{$ant};         # fold the count into the short form
          delete $names{$ant};}
    }

    if($opt{oco}){
        for (sort {$names{$b} <=> $names{$a}} keys %names )
        {printf("%6d - %s\n",$names{$_}, $_ );}
    }
    else
    {
        if($opt{comp}){my @l = sort _compara keys %names;
                       _compacta(\%names, @l); }
        else{for (sort _compara keys %names )
             {printf("%60s - %d\n", $_ ,$names{$_});} }

        if($opt{prof}){print "\nProfissões\n";
                       for (keys %profissao){print "$_ -- $profissao{$_}";} }

        if($opt{em}){print "\nGeograficos\n";
                     for (sort _compara keys %gnames )
                     {printf("%60s - %d\n", $_ ,$gnames{$_});} }
    }
}
|
515
|
|
|
|
|
|
|
|
|
516
|
|
|
|
|
|
|
|
|
517
|
|
|
|
|
|
|
|
|
518
|
|
|
|
|
|
|
## |
|
519
|
|
|
|
|
|
|
# Auxiliary stuff |
|
520
|
|
|
|
|
|
|
|
|
521
|
|
|
|
|
|
|
sub _tryright{
    # Keeps the first word of a candidate name unmarked and re-marks, with
    # the "\x01(" ... "\x01)" delimiters used by forPN, every proper name
    # found in the remainder.  Returns the input unchanged when it contains
    # no word at all.
    #
    # The match is anchored at both ends and lets '.' cross newlines, so
    # nothing before the first word -- and no earlier line of a name that
    # spans a line break -- is dropped from the result.
    my $text = shift;
    return $text unless $text =~ /\A(\W*\w+)(.*)\z/s;
    my ($w,$r) = ($1,$2);
    my $m = "\x01";
    $r =~ s{($np)}{$m($1$m)}g;
    return "$w$r";
}
|
529
|
|
|
|
|
|
|
|
|
530
|
|
|
|
|
|
|
|
|
531
|
|
|
|
|
|
|
sub _compacta{
    # Prints "name - count" entries for a sorted list of names, folding each
    # name into the previous one when it is that name with extra leading
    # words ("Fermat" then "Pierre de Fermat" prints "(Pierre de) Fermat"
    # with the two counts added).
    #
    #   $names - hash ref: name => number of occurrences
    #   @_     - the names, in the order produced by sorting with _compara
    #
    # Prints nothing for an empty list.
    # NOTE(review): entries are printed without a separator or newline, as
    # before -- confirm whether callers rely on that.
    my ($names, @sorted) = @_;
    return unless @sorted;

    my $p = shift @sorted;     # previous name in the list
    my $r = $p;                # label being accumulated
    my $q = $names->{$p};      # count being accumulated

    # Iterating (rather than shifting in a while condition) keeps a false
    # element such as "" from ending the loop early.
    for my $s (@sorted) {
        # \Q..\E matches the previous name literally; \z requires it to be
        # the complete tail, so "Ana Silvano" is not folded into "Silva".
        if ($s =~ /^(.+) \Q$p\E\z/) {
            $r = "($1) $r";
            $q += $names->{$s};
        }
        else {
            print "$r - $q";
            $r = $s;
            $q = $names->{$s};
        }
        $p = $s;
    }
    print "$r - $q";
}
|
547
|
|
|
|
|
|
|
|
|
548
|
|
|
|
|
|
|
sub _compara {
    # Sort comparator: orders names by their words taken in reverse order
    # (last word first), comparing the package variables $a and $b.
    my $left  = join(" ", reverse(split(" ", $a)));
    my $right = join(" ", reverse(split(" ", $b)));
    return $left cmp $right;
}
|
552
|
|
|
|
|
|
|
|
|
553
|
|
|
|
|
|
|
sub _find_file {
    # Returns the path of the first Lingua/PT/ProperNames/names.dat that
    # exists under an @INC entry, or undef when none does.
    for my $dir (@INC) {
        my $candidate = "$dir/Lingua/PT/ProperNames/names.dat";
        return $candidate if -e $candidate;
    }
    # Explicit undef keeps the single-value result of the original lookup.
    return undef;
}
|
557
|
|
|
|
|
|
|
|
|
558
|
|
|
|
|
|
|
=head1 Author |
|
559
|
|
|
|
|
|
|
|
|
560
|
|
|
|
|
|
|
José João Almeida, C<< >> |
|
561
|
|
|
|
|
|
|
|
|
562
|
|
|
|
|
|
|
Alberto Simões, C<< >> |
|
563
|
|
|
|
|
|
|
|
|
564
|
|
|
|
|
|
|
=head1 Bugs |
|
565
|
|
|
|
|
|
|
|
|
566
|
|
|
|
|
|
|
Please report any bugs or feature requests to |
|
567
|
|
|
|
|
|
|
C, or through the web interface at |
|
568
|
|
|
|
|
|
|
L. I will be notified, and then you'll automatically |
|
569
|
|
|
|
|
|
|
be notified of progress on your bug as I make changes. |
|
570
|
|
|
|
|
|
|
|
|
571
|
|
|
|
|
|
|
=head1 COPYRIGHT & LICENSE |
|
572
|
|
|
|
|
|
|
|
|
573
|
|
|
|
|
|
|
Copyright 2004-2008 Projecto Natura, All Rights Reserved. |
|
574
|
|
|
|
|
|
|
|
|
575
|
|
|
|
|
|
|
This program is free software; you can redistribute it and/or modify it |
|
576
|
|
|
|
|
|
|
under the same terms as Perl itself. |
|
577
|
|
|
|
|
|
|
|
|
578
|
|
|
|
|
|
|
=cut |
|
579
|
|
|
|
|
|
|
|
|
580
|
|
|
|
|
|
|
1; # End of Lingua::PT::ProperNames |
|
581
|
|
|
|
|
|
|
|