File Coverage

blib/lib/Text/Perfide/BookCleaner.pm

Criterion	Covered	Total	%
statement	10	12	83.3
branch			n/a
condition			n/a
subroutine	4	4	100.0
pod			n/a
total	14	16	87.5

line	stmt	sub	time	code
1				package Text::Perfide::BookCleaner;
2
3
4	1	1	49370	use warnings;
	1		2
	1		38
5	1	1	6	use strict;
	1		2
	1		35
6	1	1	2250	use utf8;
	1		18
	1		6
7	1	1	841	use Text::Perfide::BookCleaner::Aux;
	0
	0
8				use Roman;
9
10				use base 'Exporter';
11				our @EXPORT = (qw/gettxt pages sections paragraphs footnotes chars writefile commit paux_pnum_pbr paux_pnum_nopbr paux_pbr paux_hef/);
12
13
14				=head1 NAME
15
16				Text::Perfide::BookCleaner - A module for processing books in plain text formats.
17
18				=head1 VERSION
19
20				Version 0.01_01
21
22				=cut
23
24				our $VERSION = '0.01_01';
25
26				our($commit,$minhf,$simplify,$normpar);
27				my ($v1,$v2);
28
29				$minhf //= 5;
30				$normpar //= 1;
31				my $hthreashold = $minhf;
32				my $fthreashold = $minhf;
33
34				my $cp1252_simplification = $simplify;
35				my $utf_simplification = $simplify;
36
37				my $aux = Text::Perfide::BookCleaner::Aux->new();
38
39				#my @_sect = ();
40				push(@{$aux->_sect},qw(__BEGIN__ __END__ )) unless $commit;
41
42				my $sectpatt = '(?:'. join("\|", ( map {s/([^_])_([^_])/$1 $2/g;$_} @{$aux->_sect})) .')' ;
43				my $sectpattalone = '(?:'. join("\|", ( map {s/([^_])_([^_])/$1 $2/g;$_} @{$aux->ALONE})) .')' ;
44				my $sectpattaloneornum = '(?:'. join("\|", ( map {s/([^_])_([^_])/$1 $2/g;$_} @{$aux->ALONE_OR_NUMBER})) .')' ;
45				my $sectpattnumr = '(?:'. join("\|", ( map {s/([^_])_([^_])/$1 $2/g;$_} @{$aux->NUMBER_RIG})) .')' ;
46				my $ord = '(?:'. join("\|", ( map {s/([^_])_([^_])/$1 $2/g;$_} @{$aux->NUMERAL})) .')' ;
47
48				my $nrom = qr{(?:\b(?:[IVXLC]+\|x\|v\|xv)\b)}; ## Roman Number
49
50				my $pb = qr{(?: ?_pb\d*_)}; ## page break mark
51				my $fn = qr{(?:_fn\d*_)}; ## footnote mark
52				my $hyph = qr{[-‒–—―]}; ## hyphens
53				my $snline = qr{(?:\n)};
54				my $nline = qr{(?:\n[ \t]*)}; ## \n
55				my $nline2 = qr{(?:$pb\|$nline)$nline}; ## pageBreak \n or \n\n
56				my $eopar = qr{[.:!?’'"…»]}; ##
57
58
59
60				=head1 SYNOPSIS
61
62				Quick summary of what the module does.
63
64				Perhaps a little code snippet.
65
66				use Text::Perfide::BookCleaner;
67
68				my $foo = Text::Perfide::BookCleaner->new();
69				...
70
71				=head1 EXPORT
72
73				A list of functions that can be exported. You can delete this section
74				if you don't export anything, such as for a purely object-oriented module.
75
76				=head1 FUNCTIONS
77
78				=head2 gettxt
79
80				Opens a text file and returns its contents.
81
82				Optionally, the file encoding may be defined. Default encoding is UTF-8.
83
84				Removes all ^M characters.
85
86				=cut
87
88				sub gettxt{
89				my ($file,$enc) = @_;
90				local $/;
91				undef $/;
92				open(F,$file) or die "Could not open file $file";
93				if($enc){ binmode(F,":encoding($enc)")}
94				else { binmode(F,":utf8");}
95				my $txt=;
96				close F;
97				$txt =~ s/[ \t\cM]+\n/\n/g;
98				return $txt;
99				}
100
101				=head2 pages
102
103				Extracts and removes from text page breaks, headers and footers.
104
105				=cut
106
107				sub pages{ ## pages, head, foot
108				my $txt = shift;
109				my %dig = ();
110				my $pbnum = 0;
111				my $pbtab = {};
112				my %head = ();
113				my %foot = ();
114
115				my ($dig1,$dig2,$dig3,$dig4);
116				($dig1,$txt) = paux_pnum_pbr ($txt,$pbnum,$pbtab);
117				($dig2,$txt) = paux_pnum_nopbr ($txt,$pbnum,$pbtab);
118				($dig3,$txt) = paux_pbr ($txt,$pbnum,$pbtab);
119				($dig4,$txt) = paux_hef ($txt,$pbnum,$pbtab,\%head,\%foot);
120
121				@dig{keys %$dig1} = values %$dig1;
122				@dig{keys %$dig2} = values %$dig2;
123				@dig{keys %$dig3} = values %$dig3;
124				@dig{keys %$dig4} = values %$dig4;
125				return (\%dig,$txt,$pbtab);
126				}
127
128				=head2 paux_pnum_pbr
129
130				Removes pagenumbers + pagebreaks
131
132				=cut
133
134				sub paux_pnum_pbr {
135				my ($txt,$pbnum,$pbtab) = @_;
136				my %dig = ();
137				$dig{pagnum_ctrL} = ($txt =~ s{$hyph?\s\b(\d{1,3})\s$hyph?\s*\cL}{
138				my $pn = $1;
139				$pbnum++;
140				$pbtab->{$pbnum}{'np'} = $pn;
141				$pbtab->{$pbnum}{'ctrlL'} = 1;
142				' _pb'.$pbnum.'_';
143				}ge);
144				return (\%dig,$txt);
145				}
146
147
148
149				=head2 paux_pnum_nopbr
150
151				Removes pagenumbers with no pagebreaks
152
153				=cut
154
155				sub paux_pnum_nopbr {
156				my ($txt,$pbnum,$pbtab) = @_;
157				my %dig = ();
158				$dig{pbNoCtrlL} = ($txt =~ s{\n\s$hyph?\s\b(\d{1,3})\s$hyph?\s\n}{
159				my $pn = $1;
160				$pbnum++;
161				$pbtab->{$pbnum}{'np'} = $pn;
162				$pbtab->{$pbnum}{'ctrlL'} = 0;
163				' _pb'.$pbnum.'_'."\n";
164				}ge);
165				return (\%dig,$txt);
166				}
167
168
169				=head2 paux_pbr
170
171				Removes single page breaks
172
173				=cut
174
175				sub paux_pbr {
176				my ($txt,$pbnum,$pbtab) = @_;
177				my %dig = ();
178				$dig{ctrL} = ($txt =~ s{\cL}{
179				$pbnum++;
180				$pbtab->{$pbnum}{'ctrlL'} = 1;
181				' _pb'.$pbnum.'_';
182				}ge);
183				return (\%dig,$txt);
184
185				}
186
187				=head2 paux_hef
188
189				Counts and removes headers and footers
190
191				=cut
192
193				sub paux_hef {
194				my ($txt,$pbnum,$pbtab,$headref,$footref) = @_;
195				my (%head,%foot) = (%$headref,%$footref);
196				my %dig = ();
197
198				### has headers and/or footers
199
200				### contagem de headers e footers
201				$txt =~ s{(\n+(.)\n\s_pb(\d+)_(.)\n+)}{
202				my ($x,$b,$pbnum,$c)=($1,$2,$3,$4);
203				my $header = _n($c);
204				my $footer = _n($b);
205				$foot{$footer}++ if $footer !~ /^\s*$/;
206				$head{$header}++ if $header !~ /^\s*$/;
207				"$x";
208				}ge ;
209
210				### remove headers e footers
211				$txt =~ s{(\n+(.)\n\s_pb(\d+)_(.)\n*)}{
212				my ($x,$b,$pbnum,$c)=($1,$2,$3,$4);
213				my $header = _n($c);
214				my $footer = _n($b);
215				my $result = '';
216				if ($foot{$footer} and $foot{$footer} > $fthreashold )
217				{ $pbtab->{$pbnum}{'foot'} = $footer }
218				else { $result = "\n$b"; }
219				$result.=" _pb$pbnum"."_";
220				if ($head{$header} and $head{$header} > $hthreashold )
221				{ $pbtab->{$pbnum}{'head'} = $header }
222				else { $result.= "\n$c" if $c; }
223				"$result\n";
224				}ge ;
225
226				my @aux = map
227				{($head{$_} > 5)? ("($_) = $head{$_}"):()}
228				(sort {$head{$b} <=> $head{$a}} keys %head );
229				$dig{headers}= [@aux] if @aux;
230
231				@aux = map
232				{($foot{$_} > 5)? ("($_) = $foot{$_}"):()}
233				(sort {$foot{$b} <=> $foot{$a}} keys %foot );
234				$dig{footers}= [@aux] if @aux;
235
236				### has pagenumber (line with just a number)
237				$dig{line_with_num} = ($txt =~ s{\n[ \t]*(\d{1,3})\n}{\n_n_\n}g );
238
239				return (\%dig,$txt);
240				}
241
242				=head2 sections
243
244				Detects section titles and breaks.
245
246				=cut
247
248				sub sections{ ## secções
249				my $txt = shift;
250				my %dig=();
251
252				### numeração romana
253				$dig{sectionsRom } = ($txt =~ s{$nline($nrom)(?:\s*[.])?$snline}{
254				my $norm1 = arabic($1); $norm1 //= $1;
255				"\n_sec+Rom:$norm1"."_ $1\n"}gei );
256				## $dig{num_rom2} = ($txt =~ s{$nline2($nrom)([^'´’])}{\n\n_rom2:$1_'$2'}g );
257
258				### secções variadas e ao molho
259				$dig{sectionsN} = ($txt =~ s{$nline2($sectpatt)\s+(\d+\|$nrom)}{
260				my $norm1 = $aux->dicnorm->{lc($1)}; $norm1 //= $1;
261				my $norm2 = arabic($2);
262				$norm2 //= $2;
263				"\n\n_sec+N:$norm1=$norm2"."_ $1 $2"}gei );
264				$dig{sectionsN} += ($txt =~ s{$nline2(\d+\|$nrom)\s+($sectpattaloneornum)$snline}{
265				my $norm1 = arabic($1);
266				$norm1 //= $1;
267				my $norm2 = $aux->dicnorm->{lc($2)}; $norm2 //= $2;
268				"\n\n_sec+N:$norm2=$norm1"."_ $1 $2\n"}gei );
269
270				$dig{sectionsNAN} = ($txt =~ s{$nline2($sectpattaloneornum)\s+(\d+\|$nrom)}{
271				my $norm1 = $aux->dicnorm->{lc($1)}; $norm1 //= $1;
272				my $norm2 = arabic($2);
273				$norm2 //= $2;
274				"\n\n_sec+NAN:$norm1=$norm2"."_ $1 $2"}gei );
275
276				$dig{sectionsNNR} = ($txt =~ s{$nline2($sectpattnumr\s+(\d+\|$nrom))}{
277				my $norm1 = $aux->dicnorm->{lc($1)}; $norm1 //= $1;
278				my $norm2 = arabic($2);
279				$norm2 //= $2;
280				"\n\n_sec+NNR:${norm1}=${norm2}"."_ $1 $2"}gei );
281
282				$dig{sectionsNA} = ($txt =~ s{$nline2($sectpattalone)$snline}{
283				my $norm = $aux->dicnorm->{lc($1)}; $norm //= $1;
284				"\n\n_sec+NA:${norm}_ $1\n"}gei );
285
286				# $dig{sectionsN}+= ($txt =~ s{$nline($sectpatt\s+(\d+\|$nrom))}{\n_sec-N:$1_}gi );
287
288				$dig{sectionsO} = ($txt =~ s{$nline2($sectpatt)\s+($ord)$snline}{
289				my $norm1 = $aux->dicnorm->{lc($1)}; $norm1 //= $1;
290				my $norm2 = $aux->dicnorm->{lc($2)}; $norm2 //= $2;
291				"\n\n_sec+O:$norm1=$norm2"."_ $1 $2\n"
292				}gei
293				);
294				$dig{sectionsO} += ($txt =~ s{$nline2($sectpattaloneornum)\s+($ord)$snline}{
295				my $norm1 = $aux->dicnorm->{lc($1)}; $norm1 //= $1;
296				my $norm2 = $aux->dicnorm->{lc($2)}; $norm2 //= $2;
297				"\n\n_sec+O:$norm1=$norm2"."_ $1 $2\n"
298				}gei );
299
300				$dig{sections} = ($txt =~ s{$nline2($sectpatt)\b}{
301				my $norm = $aux->dicnorm->{lc($1)}; $norm //= $1;
302				"\n\n_sec:${norm}_ $1"}gei );
303
304				$dig{sectionsO} += ($txt =~ s{$nline2($ord)\s+($sectpatt)\b}{
305				my $norm1 = $aux->dicnorm->{lc($1)}; $norm1 //= $1;
306				my $norm2 = $aux->dicnorm->{lc($2)}; $norm2 //= $2;
307				"\n\n_sec+O:$norm2=$norm1"."_ $1 $2"
308				}gei );
309				# $dig{sectionsO}+= ($txt =~ s{$nline($ord)\s+($sectpatt)\b}{\n_sec-O:$2=$1_}gi );
310
311				$dig{sectionsHR } = ($txt =~ s{$nline(\S)\1{30,}$snline}{\n_sec+HR:$1_\n}g );
312
313				$dig{sectionsHTML} = ($txt =~ s{(.*?)}{\n_sec+HTML$1_ $2}gi );
314				$dig{sectionsHTML} += ($txt =~ s{(.*?)}{\n_sec+HTMLtit_ $1}gi );
315
316				return (\%dig,$txt);
317				}
318
319				=head2 paragraphs
320
321				Detects and normalizes paragraph notation.
322
323				=cut
324
325				sub paragraphs{ ## paragrafos
326				##TODO something is not working. testing with _FR_15 and diff with mkbookclear's output gives different results
327
328				my $txt = shift;
329				my %dig=(word_per_indent => 100000); # infinity
330				$dig{emptylines}=0;
331
332				### calculating empty lines
333				while($txt =~ m{\n(\s)\n(_sec.\s*)?}g){ $dig{emptylines}++ unless $2 }
334				for($txt =~ m{(\S.*\n)}g) { $dig{lines}++ }
335				for($txt =~ m{($eopar\n)}g) { $dig{lines_w_pont}++ }
336
337				my %indent=();
338				### calculating indentations
339				while($txt =~ m{\S\n([ \t]*)\S}g){ $indent{length($1)}++; }
340				### debug $txt =~ s{(\S\n)([ \t]*)(\S)}{$1=indent$2=$3}g;
341				my @aux = map
342				{($_ != 1 && $_ < 10 && $indent{$_} > 10)? ([$_ ,$indent{$_}]):()}
343				(sort {$indent{$b} <=> $indent{$a}} keys %indent );
344				### print Dumper(\@aux);
345
346				### how many words? 5
347				$dig{words}= ($txt =~ s{(\S+)}{$1}g );
348
349				$dig{word_per_emptyline}= $dig{words} / (1+ $dig{emptylines} );
350				$dig{word_per_line} = $dig{words} / ( $dig{lines} );
351				$dig{word_per_indent} = $dig{words} / ( $aux[1][1] )
352				if (defined($aux[0]) && $aux[0][0] == 0 && $aux[1]) ;
353
354				if($dig{word_per_emptyline} > 150 &&
355				$dig{word_per_indent} > 10 &&
356				$dig{word_per_indent} < 100 ){ $dig{To_be_indented}=1 ;
357				$txt =~ s{((?:$pb\|$eopar)\n)([ \t]{2,10}\S)}{$1\n$2}g if $normpar;
358				}
359				elsif($dig{word_per_emptyline} > 150 &&
360				$dig{word_per_line} > 10 &&
361				$dig{word_per_line} < 100 &&
362				$dig{lines_w_pont} / $dig{lines} > 0.6 ){ $dig{To_be_lineseparated}=1 ;
363				$txt =~ s{((?:$eopar)\n)([ \t]*\S)}{$1\n$2}g if $normpar;
364				}
365				return (\%dig,$txt);
366				}
367
368				=head2 footnotes
369
370				Detects and removes footnotes.
371
372				=cut
373
374				sub footnotes{ ## footnotes
375				my %dig=();
376				my $txt = shift;
377				my $footnotes = {};
378				my $fnnum = 0;
379
380				my $fn1 = qr{<<(\d+)>>};
381				my $fn2 = qr{\[(\d+)\]};
382				my $fn3 = qr{\^(\d+)};
383				my $fns = qr{(?:$fn1\|$fn2\|$fn3)};
384				my $end = qr{(\n\n)\|$fns};
385
386				$dig{fn_ext} = $txt =~ s{^\s$fns.?\n\n}{
387				my $fn = $1;
388				$fn //= $2;
389				$fn //= $3;
390				$fnnum++;
391
392				'_fne'.$fnnum."_\n";
393				}gmse;
394
395				# $dig{fn_ext} = $txt =~ s{^\s$fns.?$end}{
396				# my $fn = $1;
397				# $fn //= $2;
398				# $fn //= $3;
399				# $fnnum++;
400
401				# my $fn_end = $4;
402				# $fn_end //= $5;
403				# $fn_end //= $6;
404				# $fn_end //= $7;
405				# '_fne'.$fnnum.'_'."\n$fn_end";
406				# }gmse;
407
408				$dig{fn_refs} = $txt =~ s{$fn1\|$fn2\|$fn3}{
409				my $fn = $1;
410				$fn //= $2;
411				$fn //= $3;
412				$fnnum++;
413				'_fnr'.$fnnum.'_';
414				}ge;
415
416				#$dig{fn_refs} .= $txt =~ s{\[(\d+)\]}{
417				# my $fn = $1;
418				# $fnnum++;
419				# '_fn'.$fnnum.'_';
420				#}ge;
421
422				# utf8::encode($txt); ##TODO isto e' boa ideia?
423				return (\%dig,$txt,$footnotes);
424				}
425
426				=head2 chars
427
428				Several character-level operations: replacing non-ISO characters
429
430				=cut
431
432				sub chars{ ## char level
433				my %dig=();
434				my $txt=shift;
435
436				### _ vs -
437				$dig{under_vs_hifen} =($txt =~ s{(\b_\b)}{_-_}g);
438
439				$dig{fix_retic} = ($txt =~ s{\. \. ?\.\|\. ?\. \.}{...}g);
440
441				### has word transliniations?
442				### $dig{word_tr}= ($txt =~ s{(\w)-(\n[ \t]*)(\S+)}{$1$3!!!!$2}g );
443
444				## Strange char
445				$dig{charNonIso}= ($txt =~ s{([\x80-\x99])}{$1_cp1252_}g );
446
447				$dig{char_dig}+= ($txt =~ s{ﬁ}{fi}g );
448				$dig{char_dig}+= ($txt =~ s{ﬂ}{fl}g );
449				$dig{char_dig}+= ($txt =~ s{ﬀ}{ff}g );
450
451				## Utf simplification...
452
453				if($cp1252_simplification){
454				$txt =~ s/\x85/…/g;
455				$txt =~ s/\x80/€/g;
456				$txt =~ s/\x8C/OE/g;
457				$txt =~ s/\x91/‘/g ; # / LEFT SINGLE QUOTATION MARK
458				$txt =~ s/\x92/’/g ; # / RIGHT SINGLE QUOTATION MARK
459				$txt =~ s/\x93/“/g ; # / LEFT DOUBLE QUOTATION MARK
460				$txt =~ s/\x94/”/g ; # / RIGHT DOUBLE QUOTATION MARK
461				$txt =~ s/\x95/•/g ; # / BULLET
462				$txt =~ s/\x96/-/g ; # / EN DASH
463				$txt =~ s/\x97/-/g ; # / EM DASH
464				}
465
466
467				if($utf_simplification){
468				$txt =~ s/…/.../g; # 8230 => "..." , # …
469				# 8364 => " Euros " , # €
470				$txt =~ s/’/'/g; # 8217 => "'" , # ’ 226?
471				$txt =~ s/‘/'/g; # 8216 => "'" , # ‘
472				$txt =~ s/“/"/g; # 8220 => "\"" , # “
473				$txt =~ s/”/"/g; # 8221 => "\"" , # ”
474
475				$txt =~ s/‐/-/g; # 8208 => "-" , # ‐
476				$txt =~ s/‑/-/g; # 8209 => "-" , # ‑
477				$txt =~ s/–/--/g; # 8211 => "--" , # –
478				$txt =~ s/—/--/g; # 8212 => "--" , # —
479
480				$txt =~ s/—/--/g; # 8212 => "--" , # —
481				$txt =~ s/⇒/=>/g; #
482				$txt =~ s/→/->/g; #
483				$txt =~ s/•/*/g; #
484				}
485
486				my $digref;
487				($digref,$txt) = translin($txt);
488				@dig{keys %$digref} = values %$digref;
489				return (\%dig,$txt);
490				}
491
492				=head2 translin
493
494				Deals with translineations (words split across lines caused by line-wrapping)
495				and transpaginations (same situation but for pages).
496
497				=cut
498
499				sub translin {
500				my %dig = ();
501				my $txt = shift;
502
503				my $init_s = qr/ {1,6}\|\t\t?/; # Spaces at the beginning of line -- max 6 spaces or 2 tabs
504				my $lower = qr{[a-z]}; # Lowercase letters -- #TODO accentuated/russian chars
505
506				my $tlpat = qr{(\S\w)-\n($init_s?)($lower\S)}; # Normal translineation
507				my $dtlpat = qr{(\S\w)-\n($init_s?)-($lower\S)}; # Double hifen translineation
508				my $tppat = qr{(\S\w)-(\s_pb\d+_)\n($init_s?)($lower\S)}; # Transpagination
509				my $dtppat = qr{(\S\w)-(\s_pb\d+_)\n($init_s?)-($lower\S)}; # Double hifen transpagination
510
511				$dig{translin} = ($txt =~ s{$tlpat} {"\n$2$1$3"}gei );
512				$dig{translin} += ($txt =~ s{$dtlpat}{"\n$2$1-$3"}gei );
513				$dig{transpag} = ($txt =~ s{$tppat} {"$2\n$3$1$4"}gei );
514				$dig{transpag} += ($txt =~ s{$dtppat}{"$2\n$3$1-$4"}gei );
515
516				return (\%dig,$txt);
517				#return $txt;
518				}
519
520
521				=head2 commit
522
523				Returns a text with all changes commited (removes marks left by other functions).
524
525				=cut
526
527				sub commit{
528				#my %dig=();
529				my $txt=shift;
530				$txt =~ s/^.*?\n__BEGIN__\n//s;
531				$txt =~ s/\n__END__\n.*$/\n/s;
532				$txt =~ s/$pb/ /g;
533				$txt =~ s/$fn/ /g;
534				$txt =~ s/_cp1252_//g;
535				return $txt;
536				}
537
538				=head2 writefile
539
540				Writes text in file pointed by given file descriptor (default enconding UTF8).
541
542				=cut
543
544				sub writefile{
545				my ($txt,$fd)=@_;
546				$fd = *STDOUT unless $fd;
547				binmode($fd,":utf8");
548				print $fd $txt;
549				}
550
551				## Auxiliary functions
552
553				sub _n{ my $a = shift;
554				$a =~ s/\s+/ /g;
555				$a =~ s/\d+/_NUM_/g;
556				$a;
557				}
558
559
560				1;
561
562				__END__