File Coverage

blib/lib/Text/Statistics/Cyrillic.pm

Criterion	Covered	Total	%
statement	18	213	8.4
branch	0	66	0.0
condition			n/a
subroutine	6	7	85.7
pod	0	1	0.0
total	24	287	8.3

line	stmt	bran	sub	pod	time	code
1						package Text::Statistics::Cyrillic;
2
3	1		1		32230	use strict;
	1				3
	1				46
4	1		1		6	no warnings;
	1				2
	1				78
5
6						require Exporter;
7
8						our @ISA = qw(Exporter);
9
10						=head1 NAME
11
12						Text::Statistics::Cyrillic - Performs statistical corpora analysis
13
14						=head1 VERSION
15
16						Version 0.04
17
18						=cut
19
20						our $VERSION = '0.04';
21	1		1		29	use 5.006;
	1				9
	1				57
22	1		1		1100	use Text::ParseWords;
	1				1512
	1				81
23	1		1		1096	use utf8;
	1				10
	1				6
24	1		1		45	use base 'Exporter';
	1				2
	1				3882
25
26						our @EXPORT = qw(cyrillic);
27						our @words;
28						our $tokens;
29						our @out11;
30						our @outtott;
31						our $tokensgeral;
32
33						=head1 SYNOPSIS
34
35						Text::Statistics::Cyrillic creates a seven column CSV file output with one line each
36						token per text given as input a corpus that files names follows '
37						1 (1). txt', '1 (2). txt', ..., '1 (n).txt' or
38						1 $([1-9]\|[1-9][0-9]+)$\.txt
39						Columns stores statistical information:
40						(1) number of word forms in document d;
41						(2) number of tokens in d;
42						(3) Id number of d, ie., n;
43						(4) frequency of term t in d;
44						(5) corpus frequency of t ;
45						(6) document frequency of t (number of documents where t occurs at least once);
46						(7) t, UTF8 latin coded token-string
47
48						Main output file name is '1 (n + 5).txt' and it is stored in the same directory as
49						the corpus itself, toghether with residual files on each input file with .txu and .txv extensions.
50
51						Example:
52
53						use Text::Statistics::Cyrillic;
54						&cyrillic("4"); #3 (4-1) texts will be analised.
55
56						=head1 EXPORT
57
58						&cyrillic();
59
60						=cut
61
62						sub cyrillic{
63
64	0		0	0		print "inicio de programa, aguardes", "\n";
65	0					my $min = 1; #número do arquivo inicial
66	0					our $max=shift;
67
68	0					my $dif = $max - $min;
69	0					my $tempo = 1;
70
71	0					while ($tempo < 3){ #limita o procedimenento aos ciclos inicial e meta-dado
72	0					my $nome4 = "1 ($max).txt"; #arquivo de mescla para a obtenção automática de df
73	0					my $nome5 = "registro1 ($max).txt"; #arquivos de log dos dados e dos metadados
74	0	0				open (our $result, ">", $nome4) \|\| die "Não posso escrever $nome4: $!";
75	0	0				open (my $registro, ">", $nome5) \|\| die "Não posso escrever $nome5: $!";
76	0					my $num = $min; #número do arquivo inicial
77	0					my $maximo = $max; #número do arquivo final + 1
78	0					while ($num < $maximo){
79
80	0					my $nome1 = "1 ($num).txt"; #arquivos de texto
81	0					my $nome2 = "1 ($num).txu"; #\sToken\n
82	0					my $nome3 = "1 ($num).txv"; #Número do arquivo,Frequencia,\sType
83
84	0					my $i = 1; #primeira string
85	0					my $reg = /\r\n/; #necessário para a limpeza em UTF-8
86	0					my $reg2 = /\s/; #devido a um erro conhecido (cf. www.unicode.org,
87						#http://unicode.org/reports/tr13/tr13-5.html)
88
89						###início módulo de tokenização
90
91	0	0				open (my $in, "<", $nome1) \|\| die "Can not open", $nome1, ": $!";
92	0	0				open (my $out, ">", $nome2) \|\| die "Can not write", $nome2, ": $!";
93	0					print "inicio de tokenização, aguardes", "\n";
94
95	0					while (1) {
96	0					my $line = <$in>;
97	0					our $tokens = $i;
98	0	0				last unless $line;
99	0					for ($line) {
100	0					s/[-]\|[Ͽ-Ͽ]\|[Ա-��]/ /g;; #separadores exclusivamente de alfabeto cirílico
101						}
102	0					@words = &shellwords(' ', 0, $line); #separador anterior: \s+
103	0					foreach (@words) {
104	0	0				unless ($_ eq "s+"\|$_ eq "0"\|$_ eq $reg\|$_ eq $reg2){ #limpeza final
105	0					print $out " $_\n";
106	0					$i++;
107						}
108						}
109						}
110	0					close $in;
111
112	0	0				if ($tempo < 2){
113	0					our $tokensgeral = $tokensgeral + $tokens;
114						}
115	0					close $out;
116	0					print "fim de tokenização", "\n";
117	0					print "início de typeficação, aguardes", "\n";
118
119						# início módulo de contagem de frequência ("typeficação")
120
121	0					my $ii = ($i - 1); #úlitma string processada no módulo anterior - 1
122	0	0				open (my $in2, "<", $nome2) \|\| die "Can not open $nome2: $!";
123	0	0				open (my $out2, ">", $nome3) \|\| die "Can not write $nome3: $!";
124	0					our @lista = <$in2>;
125	0					my $controle2 = 0;
126	0					my $types = 0;
127	0					while ($controle2 < $ii){
128	0					our $inicio = -1;
129	0					my $controle = 0; #freqüência dos termos
130	0					my $pesquisa = $lista[$controle2]; #termos pesquisado
131	0					while (1){
132	0	0				last unless ($lista[$inicio]);
133	0					foreach ($lista[$inicio]){
134	0					$inicio++;
135	0	0				if ($lista[$inicio] =~ /$pesquisa/i){ #localiza a palavra
136	0					$controle++; #acrescenta um "feijão"
137						}
138						}
139						}
140	0	0				if ($controle < $ii){
141	0					$types++;
142	0					print $out2 "$num,$controle,$pesquisa";
143	0					print $result $num, ",", $controle, ",", $pesquisa; #não deu
144						}
145	0					for (@lista){
146	0					s/$pesquisa/\n/i; #limpa o que já foi calculado, para minimizar os esforços.
147						}
148	0					$controle2++;
149						}
150	0					print "Foram encontrados ", $types, " types no arquivo ", $nome1, "!", "\n";
151	0					print $registro $types, ",", $ii, ",", $nome1, "\n";
152	0					close $in2;
153	0					close $out2;
154	0					$num++;
155						}
156	0					close $result;
157	0					$tempo++; #acrescenta um "feijão" ao tempo
158	0					$min = $max; #alteram o intervalo de alvos
159	0					$max++; #para a extração dos meta-dados
160	0					print "fim de typeficação", "\n";
161	0					print "início de primeira contagem, aguardes", "\n";
162
163						# início do módulo de frequencia da coleção
164
165	0	0				if ($tempo == 3){
166	0					do{
167	0					$num = $num - 1;
168	0					my $nome1 = "1 ($num).txv";
169	0					my $nome2 = "1 ($num).txt";
170
171	0					$num = $num + 2;
172	0					my $nome3 = "1 ($num).txt";
173
174	0	0				open (my $in1, "<", $nome1) \|\| die "Não posso abrir $nome1: $!";
175	0	0				open (my $in2, "<", $nome2) \|\| die "Não posso abrir $nome2: $!";
176	0	0				open (my $out1, ">", $nome3) \|\| die "Não posso escrever $nome3: $!";
177
178	0					my @in1 = <$in1>;
179	0					my @in2 = <$in2>;
180
181	0					my $tempo1 = 0;
182
183	0					while ($in1[$tempo1]){
184	0					my $linha1 = $in1[$tempo1];
185	0					for ($linha1){
186	0					s/.+,.+, / /g;
187						}
188	0					my $tempo2 = 0;
189	0					my $cont = 0;
190	0					while ($in2[$tempo2]){
191	0					my $linha2 = $in2[$tempo2];
192	0	0				if ($linha2 =~ /.+,.+,$linha1/i){
193	0					for ($linha2){
194	0					s/[^0-9]/ /ig;
195	0					s/[0-9]+\s//;
196						}
197	0					for ($linha2){
198	0					$cont = $cont + $linha2;
199						}
200						}
201	0					$tempo2++;
202						}
203	0					$out11[$tempo1] = "$cont,$linha1"; #ok
204	0					$tempo1++;
205						}
206	0					close $in1;
207	0					close $in2;
208	0					print $out1 @out11;
209	0					close $out1;
210	0					print "fim de primeira contagem", "\n"; };
211
212						#inicio modulo de unificação tf df cf
213						#onde se cria o arquivo cf,df, termo, penúltimo na lista txt.
214
215	0					do{
216	0					print "início de terceira contagem, aguardes", "\n";
217	0					my $numm = $num - 2;
218	0					my $nome2 = "1 ($num).txt";
219	0	0				open (my $incf, "<", $nome2) \|\| die "Não posso abrir $nome2: $!";
220	0					$num = $num - 1;
221	0					$nome2 = "1 ($num).txt";
222	0	0				open (my $indf, "<", $nome2) \|\| die "Não posso abrir $nome2: $!";
223	0					$num = $num + 2;
224	0					$nome2 = "1 ($num).txt";
225	0	0				open (my $out, ">", $nome2) \|\| die "Não posso abrir $nome2: $!";
226
227	0					my @lista1 = <$indf>;
228	0					my @lista2 = <$incf>;
229	0					my $linha = 0;
230
231	0					while(1){
232	0	0				last unless ($lista1[$linha]);
233	0					for ($lista1[$linha]){
234	0					s/$numm,//i;
235						}
236	0					for ($lista2[$linha]){
237	0					s/, .+//;
238	0					s/\n//;
239						}
240	0					print $out "$lista2[$linha],$lista1[$linha]";
241	0					$linha = $linha + 1;
242						}
243	0					close $out;
244	0					close $incf;
245	0					close $indf;
246	0					print "fim de terceira contagem", "\n";
247
248						#inicio módulo de união final - doc, tf, cf, df, termo, cria o último txt
249
250	0					print "inicio de unificação, aguardes", "\n";
251	0	0				open (my $incfdf, "<", $nome2) \|\| die "Não posso abrir $nome2: $!";
252	0					$num = $num - 3;
253	0					$nome2 = "1 ($num).txt";
254	0	0				open (my $intf, "<", $nome2) \|\| die "Não posso abrir $nome2: $!";
255	0					$num = $num + 4;
256	0					$nome2 = "1 ($num).txt";
257	0	0				open (my $outtot, ">", $nome2) \|\| die "Não posso abrir $nome2: $!"; #arquivo de união final
258						#texto, tf, cf, df, termo
259	0					my @listatf = <$intf>;
260	0					my @listadf = <$incfdf>;
261	0					my @listadf1 = @listadf;
262	0					my $linhatf = 0;
263
264	0					while (1){
265	0	0				last unless ($listatf[$linhatf]);
266	0					my $linhadf = 0;
267	0					while (1){
268	0	0				last unless ($listadf[$linhadf]);
269	0					for ($listadf[$linhadf]){
270	0					s/.+,.+,//;
271						}
272	0	0				if ($listatf[$linhatf] =~ /.+,.+,$listadf[$linhadf]/i){ #localiza a linha em df na qual ocorre
273						#o termo de cada linha de tf
274	0					for ($listatf[$linhatf]){
275	0					s/, .+\n/,/i;
276						}
277	0					$outtott[$listadf[$linhatf]] = "$listatf[$linhatf]$listadf1[$linhadf]";
278						}
279	0					$linhadf++;
280						}
281	0					print $outtot @outtott;
282	0					$linhatf++;
283						}
284	0					print "fim de unificação", "\n";
285	0					close $outtot;
286	0					close $intf;
287	0					close $incfdf;
288	0					print "inicio de unificação para Okapi BM 25", "\n";
289
290						#início módulo de unificação de frequencia total de ocorrências por documento (para Okapi BM 25)
291
292	0					my $znum = $num;
293	0	0				open (my $zincinco, "<", "1 ($znum).txt") \|\| die "Não posso escrever registro1 ($znum).txt: $!";
294	0					my @zin2 = <$zincinco>;
295	0					my @zin3 = @zin2;
296	0					$znum++;
297	0	0				open (my $zoutx, ">", "1 ($znum).txt") \|\| die "Não posso escrever 1 ($znum).txt: $!";
298	0					$znum = $znum - 5;
299	0	0				open (my $zregistro, "<", "registro1 ($znum).txt") \|\| die "Não posso escrever registro1 ($znum).txt: $!";
300	0					my @zin1 = <$zregistro>;
301	0					my $zindex = 0;
302	0					my $zinic = 0;
303	0					while (1){
304	0	0				last unless ($zin1[$zinic]);
305	0					my $zlinhac = 0;
306	0					for ($zin1[$zinic]){
307	0					s/1 .+\n//;
308						}
309	0					my $zinicc = $zinic + 1;
310	0					while (1){
311	0	0				last unless ($zin2[$zlinhac]);
312	0					for ($zin3[$zlinhac]){
313	0					s/,.+\n//;
314						}
315	0	0				if ("$zin3[$zlinhac]\n" =~ /$zinicc\n/){
316	0					print $zoutx "$zin1[$zinic]$zin2[$zlinhac]";
317	0					$zindex++;
318						}
319	0					$zlinhac++;
320						}
321	0					$zinic++;
322						}
323						};
324	0					$tokensgeral = $tokensgeral - $dif;
325	0					print "Neste corpus há ", $tokensgeral, " tokens!", "\n"; #exportar esta informação para o último registro
326						}
327						}
328	0					print "\n", "fim de programa";
329						}
330						=head1 AUTHOR
331
332						Rodrigo Panchiniak Fernandes, C<< >>
333
334						=head1 BUGS
335
336						Please report any bugs or feature requests to
337						C, or through the web interface at
338						L.
339						I will be notified, and then you'll automatically be notified of progress on
340						your bug as I make changes.
341
342						=head1 SUPPORT
343
344						You can find documentation for this module with the perldoc command.
345
346						perldoc Text::Statistics::Cyrillic
347
348						You can also look for information at:
349
350						=over 4
351
352						=item * AnnoCPAN: Annotated CPAN documentation
353
354						L
355
356						=item * CPAN Ratings
357
358						L
359
360						=item * RT: CPAN's request tracker
361
362						L
363
364						=item * Search CPAN
365
366						L
367
368						=back
369
370						=head1 ACKNOWLEDGEMENTS
371
372						Alberto Manuel Brandão Simões
373
374						=head1 COPYRIGHT & LICENSE
375
376						Copyright 2007 Rodrigo Panchiniak Fernandes, all rights reserved.
377
378						This program is free software; you can redistribute it and/or modify it
379						under the same terms as Perl itself.
380
381						This code was written under CAPES BEX-09323-5
382						=cut
383
384						1; # End of Text::Statistics::Cyrillic
385						__END__