line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Lingua::PT::ProperNames; |
2
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
#require Exporter; |
4
|
3
|
|
|
3
|
|
102307
|
use locale; |
|
3
|
|
|
|
|
502
|
|
|
3
|
|
|
|
|
19
|
|
5
|
3
|
|
|
3
|
|
2816
|
use IO::String; |
|
3
|
|
|
|
|
15654
|
|
|
3
|
|
|
|
|
106
|
|
6
|
3
|
|
|
3
|
|
28
|
use warnings; |
|
3
|
|
|
|
|
12
|
|
|
3
|
|
|
|
|
105
|
|
7
|
3
|
|
|
3
|
|
14
|
use strict; |
|
3
|
|
|
|
|
4
|
|
|
3
|
|
|
|
|
155
|
|
8
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
=encoding ISO-8859-1 |
10
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
=head1 NAME |
12
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
Lingua::PT::ProperNames - Simple module to extract proper names from Portuguese Text |
14
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
=head1 Version |
16
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
Version 0.10 |
18
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
=cut |
20
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
our $VERSION = '0.10'; |
22
|
3
|
|
|
3
|
|
16
|
use base 'Exporter'; |
|
3
|
|
|
|
|
5
|
|
|
3
|
|
|
|
|
1491
|
|
23
|
|
|
|
|
|
|
our @EXPORT = qw/getPN printPN printPNstring forPN forPNstring/; |
24
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
# Package-level pattern fragments and word lists shared by the extraction
# routines of this module.  They are filled in inside BEGIN so that they are
# already available while the rest of the file is being compiled.
#
#   $np1    - one token of a proper name (initials, a title abbreviation or
#             a capitalised word)
#   $np     - a full proper name: $np1 tokens optionally joined by particles
#   @stopw  - lower-case words that may open a sentence capitalised without
#             being part of a name
#   %vazia  - lookup table built from @stopw
#   $prof   - alternation of profession nouns
#   $sep1   - alternation of connectors allowed between a profession and a name
#   $sep2   - alternation of adjectives allowed between a name and a profession
#   $em     - pattern source for the prepositions tested by the "em" option
our ($em, $np1, $np, $prof, $sep1, $sep2, %vazia, @stopw);

BEGIN {

    $np1 = qr{(?:(?: [A-ZÈÉÚÓÁÂ][.])+
             | [sS]r[.]
             | [dD]r[.]
             | St[oa]?[.]
             | [A-ZÈÉÚÓÁÂ]\w+(?:[\'\-]\w+)*
             )}x;

    # A variant that also accepted "e" as a joining particle used to be
    # selectable here; it is disabled:
    #   $np= qr{$np1(?:\s+(?:d[eao]s?\s+|e\s+)?$np1)*};
    $np = qr{$np1
             (?: \s+ (?:d[eaou]s?\s+
                       | d'
                       | de \s+ l[ae]s? \s+
                       | v[oa]n\s+
                     )?
                 $np1)*
            }x;

    @stopw = qw{
        no com se em segundo a o os as na nos nas do das dos da tanto
        para de desde mas quando esta sem nem só apenas mesmo até uma uns um
        pela por pelo pelas pelos depois ao sobre como umas já enquanto aos
        também amanhã ontem embora essa nesse olhe hoje não eu ele eles
        primeiro simplesmente era foi é será são seja nosso nossa nossos nossas
        chama-se chamam-se subtitui resta diz salvo disse diz vamos entra entram
        aqui começou lá seu vinham passou quanto sou vi onde este então temos
        num aquele tivemos

        en la pour le
    };

    $prof = join("|", qw{
        astrólogo astrónomo advogado actor
        baterista
        cantor compositor
        dramaturgo
        engenheiro escritor
        filósofo flautista físico
        investigador
        jogador
        matemático médico ministro músico
        pianista poeta professor
        químico
        teólogo
    });

    # qw{} splits on whitespace and does not understand quotes, so the
    # two-word connector has to be written as an ordinary string; \s+ keeps
    # it usable whether or not the enclosing pattern is compiled with /x.
    $sep1 = join("|", 'chamado', 'conhecido\s+como');

    $sep2 = join("|", qw{brilhante conhecido reputado popular});

    # Hash form of @stopw, so that testing a word is a single lookup.
    @vazia{@stopw} = (@stopw);

    $em = '\b(?:[Ee]m|[nN][oa]s?)';
}
82
|
|
|
|
|
|
|
|
83
|
|
|
|
|
|
|
=head1 Synopsis |
84
|
|
|
|
|
|
|
|
85
|
|
|
|
|
|
|
This module contains simple Perl-based functions to detect and extract |
86
|
|
|
|
|
|
|
proper names from Portuguese text. |
87
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
use Lingua::PT::ProperNames; |
89
|
|
|
|
|
|
|
|
90
|
|
|
|
|
|
|
|
91
|
|
|
|
|
|
|
printPN(@options); |
92
|
|
|
|
|
|
|
printPNstring({ %options... } ,$textstring); |
93
|
|
|
|
|
|
|
printPNstring([ @options... ] ,$textstring); |
94
|
|
|
|
|
|
|
|
95
|
|
|
|
|
|
|
forPN( sub{my ($pn, $contex)=@_;... } ) ; |
96
|
|
|
|
|
|
|
forPN( {t=>"double"}, |
97
|
|
|
|
|
|
|
sub{my ($pn, $contex)=@_;... }, sub{...} ) ; |
98
|
|
|
|
|
|
|
$outstr = forPN($instr, sub{my ($pn, $contex)=@_;... }, ... ) ; |
99
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
forPNstring(sub{my ($pn, $contex)=@_;... }, |
101
|
|
|
|
|
|
|
$textstring, regsep) ; |
102
|
|
|
|
|
|
|
|
103
|
|
|
|
|
|
|
|
104
|
|
|
|
|
|
|
my $pndict = Lingua::PT::ProperNames->new; |
105
|
|
|
|
|
|
|
|
106
|
|
|
|
|
|
|
=head1 Functions related to ProperNames dictionary |
107
|
|
|
|
|
|
|
|
108
|
|
|
|
|
|
|
=head2 new |
109
|
|
|
|
|
|
|
|
110
|
|
|
|
|
|
|
Creates a new ProperNames dictionary |
111
|
|
|
|
|
|
|
|
112
|
|
|
|
|
|
|
=cut |
113
|
|
|
|
|
|
|
|
114
|
|
|
|
|
|
|
# Constructor: blesses an empty hash into the requested class and fills it
# through _load_dictionary (called without a file name) before returning it.
sub new {
    my ($class) = @_;

    my $self = {};
    bless $self, $class;

    $self->_load_dictionary;

    return $self;
}
122
|
|
|
|
|
|
|
|
123
|
|
|
|
|
|
|
# Fills the object's dictionaries from a file.
#
#   $self->_load_dictionary($file)
#       reads $file, one entry per line, and stores every non-blank line in
#       $self->{cdic}, keyed by and mapped to the line itself.
#   $self->_load_dictionary()
#       reads the file located by _find_file(); every non-blank line is split
#       on whitespace into (name, prob, type) and stored as
#       $self->{dic}{name} = { type => ..., prob => ... }.
#
# Dies when the file cannot be found or opened.  Returns nothing.
sub _load_dictionary {
    my ($self, $file) = @_;

    if ($file) {
        open my $fh, '<', $file or die "Cannot open file $file: $!\n";
        while (my $line = <$fh>) {
            chomp $line;
            next if $line =~ m!^\s*$!;
            $self->{cdic}{$line} = $line;
        }
        close $fh;
    }
    else {
        my $f = _find_file();

        # _find_file returns undef when no directory of @INC holds the file.
        defined $f
            or die "Cannot find Lingua/PT/ProperNames/names.dat in \@INC\n";

        open my $fh, '<', $f or die "Cannot open file $f: $!\n";
        while (my $line = <$fh>) {
            chomp $line;
            next if $line =~ m!^\s*$!;
            my ($nome, $prob, $type) = split /\s+/, $line;
            $self->{dic}{$nome} = { type => $type, prob => $prob };
        }
        close $fh;
    }

    return;
}
147
|
|
|
|
|
|
|
|
148
|
|
|
|
|
|
|
# Returns true when $word is a key of any of the object's dictionaries
# (dic, cdic or sdic), false otherwise.
sub _exists {
    my ($self, $word) = @_;

    # "||" rather than "or": "return" binds tighter than "or", so with the
    # low-precedence operator only the first lookup would ever be returned.
    return exists($self->{dic}{$word})
        || exists($self->{cdic}{$word})
        || exists($self->{sdic}{$word});
}
155
|
|
|
|
|
|
|
|
156
|
|
|
|
|
|
|
=head2 is_name |
157
|
|
|
|
|
|
|
|
158
|
|
|
|
|
|
|
This method checks if a name exists in the Names dictionary as a Given Name. |
159
|
|
|
|
|
|
|
|
160
|
|
|
|
|
|
|
=cut |
161
|
|
|
|
|
|
|
|
162
|
|
|
|
|
|
|
# Public lookup: hands its arguments (object, word) to _exists and returns
# that result unchanged.
# NOTE(review): the POD says "as a Given Name", but no type check is made
# here - confirm which is intended.
sub is_name {
    my @args = @_;
    return _exists(@args);
}
165
|
|
|
|
|
|
|
|
166
|
|
|
|
|
|
|
=head2 is_surname |
167
|
|
|
|
|
|
|
|
168
|
|
|
|
|
|
|
This method checks if a name exists in the Names dictionary as a |
169
|
|
|
|
|
|
|
Surname. |
170
|
|
|
|
|
|
|
|
171
|
|
|
|
|
|
|
=cut |
172
|
|
|
|
|
|
|
|
173
|
|
|
|
|
|
|
# True when the word is known to _exists and _type reports it as "apelido".
# When the word is unknown, the false value from _exists is returned as is.
sub is_surname {
    my $known = _exists(@_);
    return $known unless $known;
    return _type(@_) eq "apelido";
}
176
|
|
|
|
|
|
|
|
177
|
|
|
|
|
|
|
# Returns the "type" recorded for $word in the first dictionary (dic, then
# cdic, then sdic) that contains it, or undef when the word is unknown or
# its entry carries no type.
sub _type {
    my ($self, $word) = @_;

    for my $dict (qw(dic cdic sdic)) {
        next unless exists $self->{$dict}{$word};

        my $entry = $self->{$dict}{$word};

        # cdic entries are stored as plain strings, not hashes; treating
        # such a string as a hash reference is fatal under "strict refs".
        return ref($entry) eq 'HASH' ? $entry->{type} : undef;
    }

    # An explicit undef (not a bare return) keeps the result a single value
    # in list context, as it has always been.
    return undef;
}
190
|
|
|
|
|
|
|
|
191
|
|
|
|
|
|
|
|
192
|
|
|
|
|
|
|
=head1 Detecting Proper Names |
193
|
|
|
|
|
|
|
|
194
|
|
|
|
|
|
|
=head2 forPN |
195
|
|
|
|
|
|
|
|
196
|
|
|
|
|
|
|
Substitutes all Proper Names found on STDIN by the result of calling a user-supplied function |
197
|
|
|
|
|
|
|
with arguments ($propername,$context). The result is sent to STDOUT. |
198
|
|
|
|
|
|
|
|
199
|
|
|
|
|
|
|
Usage: |
200
|
|
|
|
|
|
|
|
201
|
|
|
|
|
|
|
forPN({options...}, sub{ propername processor...}) |
202
|
|
|
|
|
|
|
|
203
|
|
|
|
|
|
|
Optionally you can define input or output files: |
204
|
|
|
|
|
|
|
|
205
|
|
|
|
|
|
|
forPN({in=> "inputfile", out => "outputfile" }, sub{...}) |
206
|
|
|
|
|
|
|
|
207
|
|
|
|
|
|
|
Also, C<<{t => "double"}>> helps to treat in a special way |
208
|
|
|
|
|
|
|
names after punctuation (".", etc). |
209
|
|
|
|
|
|
|
With this option you must provide 2 functions: one for standard Proper Names |
210
|
|
|
|
|
|
|
and one for names after punctuation. |
211
|
|
|
|
|
|
|
|
212
|
|
|
|
|
|
|
forPN({t=>"double"}, sub{...}, sub{...}) |
213
|
|
|
|
|
|
|
|
214
|
|
|
|
|
|
|
You can also define record paragraph separator |
215
|
|
|
|
|
|
|
|
216
|
|
|
|
|
|
|
forPN({sep=>"\n", t=>"normal"}, sub{...}) ## each line is a par. |
217
|
|
|
|
|
|
|
forPN({sep=>""}, sub{...}) ## par. empty lines |
218
|
|
|
|
|
|
|
|
219
|
|
|
|
|
|
|
=cut |
220
|
|
|
|
|
|
|
|
221
|
|
|
|
|
|
|
|
222
|
|
|
|
|
|
|
# Replaces every proper name of the input by the value returned by a
# callback.
#
#   forPN( [\%opt,] [$instring,] $f [, $f1] )
#
# Options (%opt):
#   in   - name of the file to read (default: $instring if given, else STDIN)
#   out  - name of the file to write
#   sep  - input record separator used while reading (default "", i.e.
#          paragraph mode)
#   t    - "normal" (default) or "double"; "double" requires $f1
#
# $f is called as $f->($name, $record) for each proper name.  With
# t => "double", names found at the start of a record or right after
# "-", ".", "!" or "?" are handed to $f1 first; when $f1 returns undef the
# name is re-examined by _tryright and then by $f.
#
# Returns the rewritten text when $instring was given and no "out" file was
# requested; otherwise the text goes to the "out" file or to the currently
# selected output handle and nothing is returned.
# Dies on a non-code callback or when a file cannot be opened/closed.
sub forPN {
    my %opt = (sep => "", t => "normal");
    %opt = (%opt, %{ shift(@_) }) if ref($_[0]) eq "HASH";

    # Stays undefined unless the caller really passed an input string;
    # an always-defined default would make the STDIN/STDOUT mode unreachable.
    my $instring;
    $instring = shift(@_) if @_ && !ref($_[0]);

    my ($f, $f1) = @_;
    my $m = "\x01";    # marker character wrapped around candidate names

    die "invalid parameter to 'forPN'" unless ref($f) eq "CODE";
    if ($opt{t} eq "double") {
        die "invalid parameter " . ref($f1) unless ref($f1) eq "CODE";
    }

    local $/ = $opt{sep};
    local $_;          # the read loop below must not clobber the caller's $_

    my $in;
    if (defined $opt{in}) {
        open $in, '<', $opt{in} or die "cant open $opt{in}\n";
    }
    elsif (defined $instring) {
        $in = IO::String->new($instring);
    }
    else {
        $in = *STDIN;
    }

    # Output is produced with a plain "print", so the destination is made
    # the selected handle; $old remembers what has to be restored.
    my ($out, $capture, $old);
    if (defined $opt{out}) {
        open $out, '>', $opt{out} or die "cant create $opt{out}\n";
        $old = select($out);
    }
    elsif (defined $instring) {
        $capture = IO::String->new();
        $old     = select($capture);
    }

    # The loop runs under eval so that the previously selected handle is
    # restored even when a callback dies.
    my $ok = eval {
        while (<$in>) {
            my $ctx = $_;
            if ($opt{t} eq "double") {

                s{($np)}{$m($1$m)}g;
                s{(^\s*
                  | [-]\s+
                  | [.!?]\s*
                  ) $m\( ($np) $m\)
                 }{
                    my ($aux1,$aux2,$aux3)= ($1,$2, $f1->($2,$ctx));
                    if (defined($aux3)){$aux1 . $aux3}
                    else {$aux1 . _tryright($aux2)} }xge;

                s{$m\(($np)$m\)}{ $f->($1,$ctx) }ge;

            }
            else {
                s{( \w+\s+
                  | [\«\»,:()'`"]\s*
                  ) ($np)
                 }{$1 . $f->($2,$ctx) }xge;
            }
            print;
        }
        1;
    };
    my $err = $@;

    select($old) if defined $old;
    close $in if defined $opt{in};
    die $err unless $ok;

    if (defined $out) {
        # Buffered write errors only surface when the handle is closed.
        close $out or die "cant close $opt{out}: $!\n";
        return;
    }
    return ${ $capture->string_ref() } if defined $capture;
    return;
}
291
|
|
|
|
|
|
|
|
292
|
|
|
|
|
|
|
=head2 forPNstring |
293
|
|
|
|
|
|
|
|
294
|
|
|
|
|
|
|
forPNstring( $funref, "textstring" [, regSeparator] ) |
295
|
|
|
|
|
|
|
|
296
|
|
|
|
|
|
|
Substitutes every proper name found in the text string by the result of calling C<$funref> on it, and returns the resulting string. |
297
|
|
|
|
|
|
|
|
298
|
|
|
|
|
|
|
=cut |
299
|
|
|
|
|
|
|
|
300
|
|
|
|
|
|
|
# Splits $text on the pattern $sep (default "\n"), replaces each proper name
# of every piece by $f->($name, $piece), and returns the pieces glued back
# together, each one followed by $sep.
# Dies when the first argument is not a code reference.
sub forPNstring {
    my ($f, @rest) = @_;
    die "invalid parameter to 'forPNstring': function expected" unless ref($f) eq "CODE";

    my ($text, $sep) = @rest;
    $sep = "\n" unless $sep;

    # NOTE(review): the character class below contains a lone "i" that the
    # corresponding class in forPN does not have - possibly a typo; confirm.
    return join '', map {
        my $ctx = $_;
        s/(\w+\s+|[\«\»,()'`i"]\s*)($np)/$1 . $f->($2,$ctx)/ge ;
        "$_$sep";
    } split(/$sep/, $text);
}
313
|
|
|
|
|
|
|
|
314
|
|
|
|
|
|
|
=head2 printPNstring |
315
|
|
|
|
|
|
|
|
316
|
|
|
|
|
|
|
printPNstring("oco") |
317
|
|
|
|
|
|
|
|
318
|
|
|
|
|
|
|
=cut |
319
|
|
|
|
|
|
|
|
320
|
|
|
|
|
|
|
# Prints the proper names found in a text string, with their counts.
#
#   printPNstring( [\%opt | \@opt,] $text )
#
# Options (hash keys, or elements of the array form):
#   oco  - list sorted by decreasing count
#   comp - merge related names through _compacta
#   prof - also collect and print profession/name pairs
#   em   - also collect and print names following the $em prepositions
#
# Output goes to the currently selected handle.  Returns nothing.
sub printPNstring {
    my $text = shift;
    my %opt  = ();

    if (ref($text) eq "HASH") {
        %opt  = %$text;
        $text = shift;
    }
    elsif (ref($text) eq "ARRAY") {
        @opt{@$text} = @$text;
        $text = shift;
    }
    $text = '' unless defined $text;

    my (%profissao, %names, %namesduv, %gnames);

    for ($text) {
        # Drop one trailing newline only; an unconditional chop would eat
        # the last letter of a text that does not end in a newline.
        s/\n\z//;
        s/\n/ /g;
        for (m/[.?!:;"]\s+($np1\s+$np)/gxs) { $namesduv{$_}++ }
        for (m![)>(]\s*($np1\s+$np)!gxs) { $namesduv{$_}++ }
        for (m/(?:[\w\«\»,]\s+)($np)/gxs) { $names{$_}++ }
        if ($opt{em}) {
            for (/$em\s+($np)/g) { $gnames{$_}++ }
        }
        if ($opt{prof}) {
            while (/\b($prof)\s+(?:(?:$sep1)\s+)?($np)/g) {
                $profissao{$2} = $1;
            }
            while (/(?:[\w\«\»,]\s+|[(])($np),\s*(?:(?:$sep2)\s+)?($prof)/g) {
                $profissao{$1} = $2;
            }
        }
    }

    # "Doubtful" names = proper name at the start of a sentence: when the
    # first word is a stopword (e.g. "Como Jose Manuel") it is removed
    # before the name is counted.
    for (keys %namesduv) {
        s/^\w+\s*// if /^(\w+)/ && $vazia{lc($1)};
        $names{$_}++;
    }

    # Same clean-up for names collected inside sentences: the count moves
    # to the name without its leading stopword.
    for (keys %names) {
        next unless /^(\w+)/ && $vazia{lc($1)};
        my $ant = $_;
        s/^\w+\s*//;
        # A "name" that was nothing but a stopword is simply dropped instead
        # of being recorded under the empty string.
        $names{$_} += $names{$ant} if length $_;
        delete $names{$ant};
    }

    if ($opt{oco}) {
        for (sort { $names{$b} <=> $names{$a} } keys %names) {
            printf("%60s - %d\n", $_ ,$names{$_});
        }
    }
    else {
        if ($opt{comp}) {
            my @l = sort _compara keys %names;
            _compacta(\%names, @l);
        }
        else {
            for (sort _compara keys %names) {
                printf("%60s - %d\n", $_ ,$names{$_});
            }
        }
        if ($opt{prof}) {
            print "\nProfissões\n";
            # NOTE(review): no newline is printed after each pair - confirm
            # whether the caller is expected to set $\ .
            for (keys %profissao) {
                print "$_ -- $profissao{$_}";
            }
        }
        if ($opt{em}) {
            print "\nGeograficos\n";
            for (sort _compara keys %gnames) {
                printf("%60s - %d\n", $_ ,$gnames{$_});
            }
        }
    }
    return;
}
394
|
|
|
|
|
|
|
|
395
|
|
|
|
|
|
|
|
396
|
|
|
|
|
|
|
=head2 getPN |
397
|
|
|
|
|
|
|
|
398
|
|
|
|
|
|
|
=cut |
399
|
|
|
|
|
|
|
|
400
|
|
|
|
|
|
|
# Reads the files named in @ARGV (or STDIN) paragraph by paragraph and
# returns a flat (name => count) list of the proper names found.
#
#   my %names = getPN(@options);
#
# NOTE(review): the "em" and "prof" options are accepted and their matches
# are collected, but only the plain name counts are returned.
sub getPN {
    local $/ = "";    # input record separator = one or more empty lines
    local $_;         # the read loop must not clobber the caller's $_

    my %opt;
    @opt{@_} = @_;
    my (%profissao, %names, %namesduv, %gnames);

    while (<>) {
        # Drop one trailing newline only; an unconditional chop would eat
        # the last letter of a final paragraph lacking a newline.
        s/\n\z//;
        s/\n/ /g;
        for (/[.?!:;"]\s+($np1\s+$np)/g) { $namesduv{$_}++ }
        for (/[)>(]\s*($np1\s+$np)/g) { $namesduv{$_}++ }
        for (/(?:[\w\«\»,]\s+)($np)/g) { $names{$_}++ }
        if ($opt{em}) {
            for (/$em\s+($np)/g) { $gnames{$_}++ }
        }
        if ($opt{prof}) {
            while (/\b($prof)\s+(?:(?:$sep1)\s+)?($np)/g) {
                $profissao{$2} = $1;
            }
            while (/(?:[\w\«\»,]\s+|[(])($np),\s*(?:(?:$sep2)\s+)?($prof)/g) {
                $profissao{$1} = $2;
            }
        }
    }

    # "Doubtful" names = proper name at the start of a sentence: when the
    # first word is a stopword (e.g. "Como Jose Manuel") it is removed
    # before the name is counted.
    for (keys %namesduv) {
        s/^\w+\s*// if /^(\w+)/ && $vazia{lc($1)};
        $names{$_}++;
    }

    return (%names);
}
436
|
|
|
|
|
|
|
|
437
|
|
|
|
|
|
|
|
438
|
|
|
|
|
|
|
=head2 printPN |
439
|
|
|
|
|
|
|
|
440
|
|
|
|
|
|
|
printPN("oco") |
441
|
|
|
|
|
|
|
|
442
|
|
|
|
|
|
|
printPN - extrai os nomes próprios dum texto. |
443
|
|
|
|
|
|
|
-comp junta certos nomes: Fermat + Pierre de Fermat = (Pierre de) Fermat |
444
|
|
|
|
|
|
|
-prof |
445
|
|
|
|
|
|
|
-e "Sebastiao e Silva" "e" como pertencente a PN |
446
|
|
|
|
|
|
|
-em "em Famalicão" como pertencente a PN |
447
|
|
|
|
|
|
|
|
448
|
|
|
|
|
|
|
|
449
|
|
|
|
|
|
|
=cut |
450
|
|
|
|
|
|
|
|
451
|
|
|
|
|
|
|
# Reads the files named in @ARGV (or STDIN) paragraph by paragraph and
# prints the proper names found, with their counts.
#
#   printPN(@options);
#
# Options (plain strings):
#   oco  - list sorted by decreasing count
#   comp - merge related names through _compacta
#   prof - also collect and print profession/name pairs
#   em   - also collect and print names following the $em prepositions
#
# Output goes to the currently selected handle.  Returns nothing.
sub printPN {
    local $/ = "";    # input record separator = one or more empty lines
    local $_;         # the read loop must not clobber the caller's $_

    my %opt;
    @opt{@_} = @_;
    my (%profissao, %names, %namesduv, %gnames);

    while (<>) {
        # Drop one trailing newline only; an unconditional chop would eat
        # the last letter of a final paragraph lacking a newline.
        s/\n\z//;
        s/\n/ /g;
        for (/[.?!:;"]\s+($np1\s+$np)/g) { $namesduv{$_}++ }
        for (/[)>(]\s*($np1\s+$np)/g) { $namesduv{$_}++ }
        for (/(?:[\w\«\»,]\s+)($np)/g) { $names{$_}++ }
        if ($opt{em}) {
            for (/$em\s+($np)/g) { $gnames{$_}++ }
        }
        if ($opt{prof}) {
            while (/\b($prof)\s+(?:(?:$sep1)\s+)?($np)/g) {
                $profissao{$2} = $1;
            }
            while (/(?:[\w\«\»,]\s+|[(])($np),\s*(?:(?:$sep2)\s+)?($prof)/g) {
                $profissao{$1} = $2;
            }
        }
    }

    # "Doubtful" names = proper name at the start of a sentence: when the
    # first word is a stopword (e.g. "Como Jose Manuel") it is removed
    # before the name is counted.
    for (keys %namesduv) {
        s/^\w+\s*// if /^(\w+)/ && $vazia{lc($1)};
        $names{$_}++;
    }

    # The original author was unsure whether this pass is useful: the same
    # clean-up applied to names collected inside sentences, moving the count
    # to the name without its leading stopword.
    for (keys %names) {
        next unless /^(\w+)/ && $vazia{lc($1)};
        my $ant = $_;
        s/^\w+\s*//;
        # A "name" that was nothing but a stopword is simply dropped instead
        # of being recorded under the empty string.
        $names{$_} += $names{$ant} if length $_;
        delete $names{$ant};
    }

    if ($opt{oco}) {
        for (sort { $names{$b} <=> $names{$a} } keys %names) {
            printf("%6d - %s\n",$names{$_}, $_ );
        }
    }
    else {
        if ($opt{comp}) {
            my @l = sort _compara keys %names;
            _compacta(\%names, @l);
        }
        else {
            for (sort _compara keys %names) {
                printf("%60s - %d\n", $_ ,$names{$_});
            }
        }

        if ($opt{prof}) {
            print "\nProfissões\n";
            # NOTE(review): no newline is printed after each pair - confirm
            # whether the caller is expected to set $\ .
            for (keys %profissao) {
                print "$_ -- $profissao{$_}";
            }
        }

        if ($opt{em}) {
            print "\nGeograficos\n";
            for (sort _compara keys %gnames) {
                printf("%60s - %d\n", $_ ,$gnames{$_});
            }
        }
    }
    return;
}
515
|
|
|
|
|
|
|
|
516
|
|
|
|
|
|
|
|
517
|
|
|
|
|
|
|
|
518
|
|
|
|
|
|
|
## |
519
|
|
|
|
|
|
|
# Auxiliary stuff |
520
|
|
|
|
|
|
|
|
521
|
|
|
|
|
|
|
# Given a candidate name, keeps its first matched word as plain text and
# wraps every proper name found in the text after it between "\x01(" and
# "\x01)".  Input in which the pattern does not match is returned untouched.
sub _tryright {
    my ($candidate) = @_;

    if ($candidate =~ /(\w+)(.*)$/) {
        my $first = $1;
        my $tail  = $2;
        my $m     = "\x01";
        $tail =~ s{($np)}{$m($1$m)}g;
        return "$first$tail";
    }

    return $candidate;
}
529
|
|
|
|
|
|
|
|
530
|
|
|
|
|
|
|
|
531
|
|
|
|
|
|
|
# Prints a compacted list of names with their accumulated counts.
#
#   _compacta(\%count_of, @names)
#
# @names is walked in order.  A name that consists of some prefix, a space
# and the previous name is folded into the current group: the prefix is
# shown in parentheses and the counts are added (e.g. "Fermat" followed by
# "Pierre de Fermat" gives "(Pierre de) Fermat").  Any other name closes the
# current group, which is printed as "group - count", and starts a new one.
# Nothing is printed for an empty list.  Returns nothing.
sub _compacta {
    my ($names, @list) = @_;
    return unless @list;

    my $p = shift @list;      # previous name seen
    my $r = $p;               # text of the group being built
    my $q = $names->{$p};     # accumulated count of that group

    for my $s (@list) {
        # \Q...\E: names contain regex metacharacters such as "." and must
        # be matched literally.
        if ($s =~ /^(.+) \Q$p\E/) {
            $r = "($1) $r";
            $q += $names->{$s};
        }
        else {
            print "$r - $q";
            $r = $s;
            $q = $names->{$s};
        }
        $p = $s;
    }
    print "$r - $q";
    return;
}
547
|
|
|
|
|
|
|
|
548
|
|
|
|
|
|
|
# Sort comparator: orders two names ($a, $b) by their words taken in
# reverse order, so names sharing the same last word end up together.
sub _compara {
    my $left  = join(" ", reverse(split(" ", $a)));
    my $right = join(" ", reverse(split(" ", $b)));
    return $left cmp $right;
}
552
|
|
|
|
|
|
|
|
553
|
|
|
|
|
|
|
# Looks for Lingua/PT/ProperNames/names.dat under every entry of @INC and
# returns the first path that exists, or undef when none does.
sub _find_file {
    my @found;
    for my $dir (@INC) {
        my $candidate = "$dir/Lingua/PT/ProperNames/names.dat";
        push @found, $candidate if -e $candidate;
    }
    return $found[0];
}
557
|
|
|
|
|
|
|
|
558
|
|
|
|
|
|
|
=head1 Author |
559
|
|
|
|
|
|
|
|
560
|
|
|
|
|
|
|
José João Almeida, C<< >> |
561
|
|
|
|
|
|
|
|
562
|
|
|
|
|
|
|
Alberto Simões, C<< >> |
563
|
|
|
|
|
|
|
|
564
|
|
|
|
|
|
|
=head1 Bugs |
565
|
|
|
|
|
|
|
|
566
|
|
|
|
|
|
|
Please report any bugs or feature requests to |
567
|
|
|
|
|
|
|
C, or through the web interface at |
568
|
|
|
|
|
|
|
L. I will be notified, and then you'll automatically |
569
|
|
|
|
|
|
|
be notified of progress on your bug as I make changes. |
570
|
|
|
|
|
|
|
|
571
|
|
|
|
|
|
|
=head1 COPYRIGHT & LICENSE |
572
|
|
|
|
|
|
|
|
573
|
|
|
|
|
|
|
Copyright 2004-2008 Projecto Natura, All Rights Reserved. |
574
|
|
|
|
|
|
|
|
575
|
|
|
|
|
|
|
This program is free software; you can redistribute it and/or modify it |
576
|
|
|
|
|
|
|
under the same terms as Perl itself. |
577
|
|
|
|
|
|
|
|
578
|
|
|
|
|
|
|
=cut |
579
|
|
|
|
|
|
|
|
580
|
|
|
|
|
|
|
1; # End of Lingua::PT::ProperNames |
581
|
|
|
|
|
|
|
|