line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Text::Perfide::BookSync; |
2
|
|
|
|
|
|
|
|
3
|
7
|
|
|
7
|
|
244974
|
use 5.006; |
|
7
|
|
|
|
|
31
|
|
|
7
|
|
|
|
|
356
|
|
4
|
7
|
|
|
7
|
|
43
|
use strict; |
|
7
|
|
|
|
|
11
|
|
|
7
|
|
|
|
|
262
|
|
5
|
7
|
|
|
7
|
|
39
|
use warnings; |
|
7
|
|
|
|
|
17
|
|
|
7
|
|
|
|
|
229
|
|
6
|
7
|
|
|
7
|
|
23493
|
use Data::Dumper; |
|
7
|
|
|
|
|
128000
|
|
|
7
|
|
|
|
|
564
|
|
7
|
7
|
|
|
7
|
|
69
|
use List::Util qw/min/; |
|
7
|
|
|
|
|
15
|
|
|
7
|
|
|
|
|
842
|
|
8
|
7
|
|
|
7
|
|
6541
|
use HTML::Auto qw/matrix h v/; |
|
7
|
|
|
|
|
320352
|
|
|
7
|
|
|
|
|
1236
|
|
9
|
7
|
|
|
7
|
|
85
|
use File::Basename; |
|
7
|
|
|
|
|
15
|
|
|
7
|
|
|
|
|
452
|
|
10
|
7
|
|
|
7
|
|
10175
|
use utf8::all; |
|
7
|
|
|
|
|
715293
|
|
|
7
|
|
|
|
|
60
|
|
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
=head1 NAME |
14
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
Text::Perfide::BookSync - Synchronize books in plain text format. |
16
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
=head1 VERSION |
18
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
Version 0.01_09 |
20
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
=cut |
22
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
our $VERSION = '0.01_09'; |
24
|
|
|
|
|
|
|
|
25
|
7
|
|
|
7
|
|
38362
|
use base 'Exporter'; |
|
7
|
|
|
|
|
19
|
|
|
7
|
|
|
|
|
55847
|
|
26
|
|
|
|
|
|
|
our @EXPORT = (qw/ htmlmatrix |
27
|
|
|
|
|
|
|
marksync |
28
|
|
|
|
|
|
|
splitchunks |
29
|
|
|
|
|
|
|
calchunks |
30
|
|
|
|
|
|
|
populate |
31
|
|
|
|
|
|
|
moreinfosecs |
32
|
|
|
|
|
|
|
moreinfochunks |
33
|
|
|
|
|
|
|
/); |
34
|
|
|
|
|
|
|
|
35
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
=head1 SYNOPSIS |
37
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
Text::Perfide::BookSync performs a structural alignment at section level of |
39
|
|
|
|
|
|
|
books in plain text format. The books have to be previously annotated by |
40
|
|
|
|
|
|
|
Text::Perfide::BookCleaner. |
41
|
|
|
|
|
|
|
|
42
|
|
|
|
|
|
|
=head1 EXPORT |
43
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
=head1 SUBROUTINES/METHODS |
45
|
|
|
|
|
|
|
=cut |
46
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
our($split,$mark,$noclean,$html,$rm,$num,$pfile,$dump); |
48
|
|
|
|
|
|
|
$rm //= 0; |
49
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
=head2 htmlmatrix |
51
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
Generates an HTML file containing a matrix showing the matches between sections |
53
|
|
|
|
|
|
|
of two books. |
54
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
=cut |
56
|
|
|
|
|
|
|
|
57
|
|
|
|
|
|
|
sub htmlmatrix{ |
58
|
0
|
|
|
0
|
1
|
0
|
my ($chunks,$tabsec,$options) = @_; |
59
|
0
|
|
|
|
|
0
|
my (@lines,@cols); |
60
|
0
|
|
|
|
|
0
|
my ($l,$c)=(0,0); |
61
|
0
|
|
|
|
|
0
|
my $h = scalar @{$tabsec->{left}{secs}}; |
|
0
|
|
|
|
|
0
|
|
62
|
0
|
|
|
|
|
0
|
my $w = scalar @{$tabsec->{right}{secs}}; |
|
0
|
|
|
|
|
0
|
|
63
|
|
|
|
|
|
|
|
64
|
0
|
|
|
|
|
0
|
my $data = [ map { [(undef)x$w] } (undef)x$h ]; |
|
0
|
|
|
|
|
0
|
|
65
|
0
|
|
|
|
|
0
|
my $more_info = { left => [], right => [] }; |
66
|
|
|
|
|
|
|
|
67
|
0
|
|
|
|
|
0
|
my $ccount = 0; |
68
|
0
|
|
|
|
|
0
|
for my $chun (@$chunks){ |
69
|
0
|
|
|
|
|
0
|
my $ls = scalar @{$chun->{left}{secs}}; |
|
0
|
|
|
|
|
0
|
|
70
|
0
|
|
|
|
|
0
|
my $rs = scalar @{$chun->{right}{secs}}; |
|
0
|
|
|
|
|
0
|
|
71
|
0
|
|
|
|
|
0
|
for(my $i=0; $i<$ls; $i++){ |
72
|
0
|
|
|
|
|
0
|
push @lines, $tabsec->{left}{secs}[$l+$i]{id}; |
73
|
0
|
|
|
|
|
0
|
push @{$more_info->{left}},$tabsec->{left}{secs}[$l+$i]{title}; |
|
0
|
|
|
|
|
0
|
|
74
|
|
|
|
|
|
|
} |
75
|
0
|
|
|
|
|
0
|
for(my $j=0; $j<$rs; $j++){ |
76
|
0
|
|
|
|
|
0
|
push @cols, $tabsec->{right}{secs}[$c+$j]{id}; |
77
|
0
|
|
|
|
|
0
|
push @{$more_info->{right}},$tabsec->{right}{secs}[$c+$j]{title}; |
|
0
|
|
|
|
|
0
|
|
78
|
|
|
|
|
|
|
} |
79
|
0
|
|
|
|
|
0
|
my $style; |
80
|
0
|
|
|
|
|
0
|
my $wcmp = _numcmp($chun->{left}{wc},$chun->{right}{wc}); |
81
|
0
|
0
|
0
|
|
|
0
|
if ($wcmp < 1.1 && $wcmp > 0.9){ $style = 'background: green'; } |
|
0
|
0
|
0
|
|
|
0
|
|
|
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
82
|
0
|
|
|
|
|
0
|
elsif ($wcmp > 1.1 && $wcmp < 1.5 or $wcmp < 0.9 && $wcmp > 0.5){ $style = 'background: yellow'; } |
83
|
0
|
|
|
|
|
0
|
else { $style = 'background: red'; } |
84
|
|
|
|
|
|
|
|
85
|
0
|
|
|
|
|
0
|
for(my $i=0; $i<$ls; $i++){ |
86
|
0
|
|
|
|
|
0
|
for(my $j=0; $j<$rs; $j++){ |
87
|
0
|
|
|
|
|
0
|
$data->[$l+$i][$c+$j] = { |
88
|
|
|
|
|
|
|
v => $ccount, |
89
|
|
|
|
|
|
|
a => { style => $style }, |
90
|
|
|
|
|
|
|
more_info => matrix( |
91
|
|
|
|
|
|
|
[], # column titles |
92
|
|
|
|
|
|
|
[$lines[$l+$i],$cols[$c+$j]], # line titles |
93
|
|
|
|
|
|
|
[[ $more_info->{left}[$l+$i]], |
94
|
|
|
|
|
|
|
[$more_info->{right}[$c+$j]]], |
95
|
|
|
|
|
|
|
{ css => ".more_info th.empty { display:none; }\n"}, |
96
|
|
|
|
|
|
|
), |
97
|
|
|
|
|
|
|
}; |
98
|
|
|
|
|
|
|
} |
99
|
|
|
|
|
|
|
} |
100
|
0
|
|
|
|
|
0
|
$l+=$ls; |
101
|
0
|
|
|
|
|
0
|
$c+=$rs; |
102
|
0
|
|
|
|
|
0
|
$ccount++; |
103
|
|
|
|
|
|
|
} |
104
|
0
|
|
|
|
|
0
|
my ($fileL,$fileR) = (basename($tabsec->{left}{file}),basename($tabsec->{right}{file})); |
105
|
|
|
|
|
|
|
|
106
|
0
|
|
|
|
|
0
|
my $dir = $options->{dir}; |
107
|
0
|
0
|
|
|
|
0
|
if(defined($dir)) { $dir.= "/"; } |
|
0
|
|
|
|
|
0
|
|
108
|
0
|
|
|
|
|
0
|
else { $dir = ""; } |
109
|
|
|
|
|
|
|
|
110
|
0
|
0
|
|
|
|
0
|
open my $html,'>',"$dir$fileL"."_$fileR.html" or die "Could not open file '$dir${fileL}_$fileR.html' for writing!"; |
111
|
0
|
|
|
|
|
0
|
my $m = matrix(\@cols,\@lines,$data); |
112
|
0
|
|
|
|
|
0
|
print $html v(h($m)); |
113
|
|
|
|
|
|
|
} |
114
|
|
|
|
|
|
|
|
115
|
|
|
|
|
|
|
=head2 marksync |
116
|
|
|
|
|
|
|
|
117
|
|
|
|
|
|
|
Given two files FILEL and FILER, creates new versions of these files |
118
|
|
|
|
|
|
|
(FILEL.sync and FILER.sync) with synchronization tags marking the |
119
|
|
|
|
|
|
|
points where the texts synchronize. |
120
|
|
|
|
|
|
|
|
121
|
|
|
|
|
|
|
=cut |
122
|
|
|
|
|
|
|
|
123
|
|
|
|
|
|
|
sub marksync{ |
124
|
0
|
|
|
0
|
1
|
0
|
my ($chunks,$tabsec,$fileL,$fileR,$options) = @_; |
125
|
0
|
|
|
|
|
0
|
my ($dirL,$dirR); |
126
|
0
|
|
|
|
|
0
|
my $dir = $options->{dir}; |
127
|
0
|
|
|
|
|
0
|
my ($fL,$fR); |
128
|
0
|
0
|
|
|
|
0
|
if(defined($dir)){ |
129
|
0
|
|
|
|
|
0
|
$fL = "$dir/".basename($fileL); |
130
|
0
|
|
|
|
|
0
|
$fR = "$dir/".basename($fileR); |
131
|
|
|
|
|
|
|
} |
132
|
|
|
|
|
|
|
else { |
133
|
0
|
|
|
|
|
0
|
$fL = $fileL; |
134
|
0
|
|
|
|
|
0
|
$fR = $fileR; |
135
|
|
|
|
|
|
|
} |
136
|
|
|
|
|
|
|
|
137
|
0
|
0
|
|
|
|
0
|
open my $syncl,'>',"$fL.sync" or die "Could not open file '$fileL.sync'"; |
138
|
0
|
0
|
|
|
|
0
|
open my $syncr,'>',"$fR.sync" or die "Could not open file '$fileR.sync'"; |
139
|
|
|
|
|
|
|
|
140
|
0
|
0
|
|
|
|
0
|
push @{$options->{outlist}},"$fL.sync\t$fR.sync" if defined($options->{outfile}); |
|
0
|
|
|
|
|
0
|
|
141
|
|
|
|
|
|
|
|
142
|
|
|
|
|
|
|
# Print first section mark |
143
|
0
|
|
0
|
|
|
0
|
my $rm = $options->{rm} // 0; |
144
|
0
|
0
|
|
|
|
0
|
print $syncl qq/\n/ unless $rm>0; |
145
|
0
|
0
|
|
|
|
0
|
print $syncr qq/\n/ unless $rm>0; |
146
|
|
|
|
|
|
|
|
147
|
|
|
|
|
|
|
|
148
|
0
|
0
|
|
|
|
0
|
open my $fhL, '<', $fL or die; |
149
|
0
|
0
|
|
|
|
0
|
open my $fhR, '<', $fR or die; |
150
|
0
|
|
|
|
|
0
|
my $t; |
151
|
|
|
|
|
|
|
|
152
|
|
|
|
|
|
|
# For each chunk |
153
|
0
|
|
|
|
|
0
|
for(my $i=$rm; $i < @$chunks; $i++){ |
154
|
|
|
|
|
|
|
|
155
|
0
|
|
|
|
|
0
|
my ($ltext,$rtext); |
156
|
|
|
|
|
|
|
|
157
|
|
|
|
|
|
|
# Get the chunk's start offset and end offset (LEFT) |
158
|
0
|
|
|
|
|
0
|
my $l_start = $chunks->[$i]{left}{start}; |
159
|
0
|
|
|
|
|
0
|
my $l_length = $chunks->[$i]{left}{end} - $chunks->[$i]{left}{start}; |
160
|
|
|
|
|
|
|
# Get the chunk's text (LEFT) |
161
|
0
|
|
|
|
|
0
|
seek($fhL,0,0); |
162
|
0
|
|
|
|
|
0
|
read($fhL,$ltext,$l_start); |
163
|
0
|
|
|
|
|
0
|
read($fhL,$ltext,$l_length); |
164
|
|
|
|
|
|
|
|
165
|
|
|
|
|
|
|
# Get the chunk's start offset and end offset (RIGHT) |
166
|
0
|
|
|
|
|
0
|
my $r_start = $chunks->[$i]{right}{start}; |
167
|
0
|
|
|
|
|
0
|
my $r_length = $chunks->[$i]{right}{end} - $chunks->[$i]{right}{start}; |
168
|
|
|
|
|
|
|
# Get the chunk's text (REFT) |
169
|
0
|
|
|
|
|
0
|
seek($fhR,0,0); |
170
|
0
|
|
|
|
|
0
|
read($fhR,$rtext,$r_start); |
171
|
0
|
|
|
|
|
0
|
read($fhR,$rtext,$r_length); |
172
|
|
|
|
|
|
|
|
173
|
|
|
|
|
|
|
# if both sides start with section mark, put sync mark |
174
|
0
|
0
|
0
|
|
|
0
|
if ($ltext =~ /^\s*_sec[^:]*:([^_]+)_/ and $rtext =~ /^\s*_sec[^:]*:([^_]+)_/){ |
175
|
0
|
|
|
|
|
0
|
$ltext =~ s/^\s*_sec[^:]*:([^_]+)_\n?/\n/; |
176
|
0
|
|
|
|
|
0
|
$rtext =~ s/^\s*_sec[^:]*:([^_]+)_\n?/\n/; |
177
|
|
|
|
|
|
|
} |
178
|
|
|
|
|
|
|
|
179
|
|
|
|
|
|
|
# Clean section marks left (unless $noclean is defined) |
180
|
0
|
0
|
|
|
|
0
|
$ltext =~ s/_sec[^:]*:([^_]+)_//g unless $options->{noclean}; |
181
|
0
|
0
|
|
|
|
0
|
$rtext =~ s/_sec[^:]*:([^_]+)_//g unless $options->{noclean}; |
182
|
|
|
|
|
|
|
|
183
|
|
|
|
|
|
|
# Print to .sync file |
184
|
0
|
|
|
|
|
0
|
print $syncl $ltext; |
185
|
0
|
|
|
|
|
0
|
print $syncr $rtext; |
186
|
|
|
|
|
|
|
} |
187
|
|
|
|
|
|
|
|
188
|
0
|
|
|
|
|
0
|
print $syncl qq{\n}; |
189
|
0
|
|
|
|
|
0
|
print $syncr qq{\n}; |
190
|
|
|
|
|
|
|
|
191
|
0
|
|
|
|
|
0
|
close $syncl; |
192
|
0
|
|
|
|
|
0
|
close $syncr; |
193
|
|
|
|
|
|
|
} |
194
|
|
|
|
|
|
|
|
195
|
|
|
|
|
|
|
=head2 splitchunks |
196
|
|
|
|
|
|
|
|
197
|
|
|
|
|
|
|
Given two files FILEL and FILER, splits them by their synchronization points, |
198
|
|
|
|
|
|
|
storing each chunk in a file, where each FILEL.cXX matches FILER.cXX. |
199
|
|
|
|
|
|
|
|
200
|
|
|
|
|
|
|
=cut |
201
|
|
|
|
|
|
|
|
202
|
|
|
|
|
|
|
sub splitchunks{ |
203
|
0
|
|
|
0
|
1
|
0
|
my ($chunks,$fileL,$fileR) = @_; |
204
|
0
|
|
|
|
|
0
|
my $ch=1; |
205
|
0
|
0
|
|
|
|
0
|
open my $fhL, '<', $fileL or die; |
206
|
0
|
0
|
|
|
|
0
|
open my $fhR, '<', $fileR or die; |
207
|
0
|
|
|
|
|
0
|
for my $c (@$chunks){ |
208
|
0
|
|
|
|
|
0
|
my $id = sprintf("%.3d",int($ch)); |
209
|
0
|
|
|
|
|
0
|
my ($t,$fout); |
210
|
|
|
|
|
|
|
|
211
|
0
|
|
|
|
|
0
|
my $l_start = $c->{left}{start}; |
212
|
0
|
|
|
|
|
0
|
my $l_length = $c->{left}{end} - $c->{left}{start}; |
213
|
0
|
|
|
|
|
0
|
seek($fhL,0,0); |
214
|
0
|
|
|
|
|
0
|
read($fhL,$t,$l_start); # because seek only works in bytes |
215
|
0
|
|
|
|
|
0
|
read($fhL,$t,$l_length); |
216
|
0
|
0
|
|
|
|
0
|
open $fout, '>', "$fileL.c$id" or die; |
217
|
0
|
|
|
|
|
0
|
print $fout $t; |
218
|
0
|
|
|
|
|
0
|
close $fout; |
219
|
|
|
|
|
|
|
|
220
|
0
|
|
|
|
|
0
|
my $r_start = $c->{right}{start}; |
221
|
0
|
|
|
|
|
0
|
my $r_length = $c->{right}{end} - $c->{right}{start}; |
222
|
0
|
|
|
|
|
0
|
seek($fhR,0,0); |
223
|
0
|
|
|
|
|
0
|
read($fhR,$t,$r_start); # because seek only works in bytes |
224
|
0
|
|
|
|
|
0
|
read($fhR,$t,$r_length); |
225
|
0
|
0
|
|
|
|
0
|
open $fout, '>', "$fileR.c$id" or die; |
226
|
0
|
|
|
|
|
0
|
print $fout $t; |
227
|
0
|
|
|
|
|
0
|
close $fout; |
228
|
|
|
|
|
|
|
|
229
|
0
|
|
|
|
|
0
|
$ch++; |
230
|
|
|
|
|
|
|
} |
231
|
|
|
|
|
|
|
} |
232
|
|
|
|
|
|
|
|
233
|
|
|
|
|
|
|
=head2 calchunks |
234
|
|
|
|
|
|
|
|
235
|
|
|
|
|
|
|
Calculates chunks for a given pair of files. A chunk is a set of consecutive |
236
|
|
|
|
|
|
|
sections, which are grouped in order to match the corresponding chunk. |
237
|
|
|
|
|
|
|
|
238
|
|
|
|
|
|
|
=cut |
239
|
|
|
|
|
|
|
|
240
|
|
|
|
|
|
|
sub calchunks{ |
241
|
6
|
|
|
6
|
1
|
445
|
my ($tabsec,$fileL,$fileR,$options) = @_; |
242
|
6
|
|
|
|
|
311
|
my $fL = basename($fileL); |
243
|
6
|
|
|
|
|
130
|
my $fR = basename($fileR); |
244
|
|
|
|
|
|
|
|
245
|
6
|
|
|
|
|
18
|
my $dir = $options->{dir}; |
246
|
6
|
50
|
|
|
|
26
|
if(defined($dir)) { $dir.= "/"; } |
|
0
|
|
|
|
|
0
|
|
247
|
6
|
|
|
|
|
17
|
else { $dir = ""; } |
248
|
|
|
|
|
|
|
|
249
|
6
|
50
|
|
|
|
972
|
open my $secsL,'>',"$dir$fL.secs" or die "Can't open file '$dir$fL.secs' for writing!"; |
250
|
6
|
50
|
|
|
|
891
|
open my $secsR,'>',"$dir$fR.secs" or die "Can't open file '$dir$fR.secs' for writing!"; |
251
|
6
|
50
|
|
|
|
357
|
if ($options->{num}){ ## Compare only section numbers |
252
|
0
|
|
|
|
|
0
|
map {my $x = $_->{id}; $x =~ s/.*=//; print $secsL $x,"\n"} @{$tabsec->{left}{secs}}; |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
253
|
0
|
|
|
|
|
0
|
map {my $x = $_->{id}; $x =~ s/.*=//; print $secsR $x,"\n"} @{$tabsec->{right}{secs}}; |
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
|
0
|
|
|
|
|
0
|
|
254
|
|
|
|
|
|
|
} |
255
|
|
|
|
|
|
|
else{ |
256
|
6
|
|
|
|
|
14
|
map {print $secsL $_->{id},"\n"} @{$tabsec->{left}{secs}}; |
|
119
|
|
|
|
|
337
|
|
|
6
|
|
|
|
|
26
|
|
257
|
6
|
|
|
|
|
15
|
map {print $secsR $_->{id},"\n"} @{$tabsec->{right}{secs}}; |
|
190
|
|
|
|
|
505
|
|
|
6
|
|
|
|
|
22
|
|
258
|
|
|
|
|
|
|
} |
259
|
|
|
|
|
|
|
|
260
|
6
|
|
|
|
|
27
|
my ($l,$r) = (-1,-1); |
261
|
6
|
|
|
|
|
31
|
my $diff_file = "$dir${fL}_$fR.diff"; |
262
|
6
|
|
|
|
|
43406
|
qx{diff -y "$dir$fL.secs" "$dir$fR.secs" > '$diff_file'}; |
263
|
6
|
|
|
|
|
61978
|
open my $diff,"<", "$diff_file"; |
264
|
6
|
|
|
|
|
906
|
my $chunks = []; |
265
|
|
|
|
|
|
|
|
266
|
6
|
|
|
|
|
494
|
while(<$diff>){ |
267
|
223
|
|
|
|
|
826
|
chomp; |
268
|
223
|
|
|
|
|
1215
|
my @a = split /\t+/; |
269
|
223
|
100
|
|
|
|
2321
|
if ($a[1] =~ /^\s*<$/) { |
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
270
|
33
|
|
|
|
|
42
|
$l++; |
271
|
33
|
|
|
|
|
42
|
push @{$chunks->[-1]{left}{secs}}, $l; |
|
33
|
|
|
|
|
158
|
|
272
|
|
|
|
|
|
|
} |
273
|
|
|
|
|
|
|
elsif ($a[1] =~ /^\s*>$/) { |
274
|
104
|
|
|
|
|
237
|
$r++; |
275
|
104
|
|
|
|
|
189
|
push @{$chunks->[-1]{right}{secs}}, $r; |
|
104
|
|
|
|
|
691
|
|
276
|
|
|
|
|
|
|
} |
277
|
|
|
|
|
|
|
elsif ($a[1] =~ /^\s*[|]$/) { |
278
|
31
|
|
|
|
|
51
|
$l++; |
279
|
31
|
|
|
|
|
44
|
$r++; |
280
|
31
|
|
|
|
|
27
|
push @{$chunks->[-1]{left}{secs}}, $l; |
|
31
|
|
|
|
|
90
|
|
281
|
31
|
|
|
|
|
5663
|
push @{$chunks->[-1]{right}{secs}}, $r; |
|
31
|
|
|
|
|
131
|
|
282
|
|
|
|
|
|
|
} |
283
|
|
|
|
|
|
|
else{ |
284
|
55
|
|
|
|
|
93
|
$l++; $r++; |
|
55
|
|
|
|
|
92
|
|
285
|
55
|
100
|
|
|
|
177
|
if($chunks->[-1]){ |
286
|
49
|
|
|
|
|
226
|
$chunks->[-1]{left}{end} = $tabsec->{left}{secs}[$l]{start}; |
287
|
49
|
|
|
|
|
163
|
$chunks->[-1]{right}{end} = $tabsec->{right}{secs}[$r]{start}; |
288
|
|
|
|
|
|
|
} |
289
|
55
|
|
|
|
|
837
|
push @$chunks, { |
290
|
|
|
|
|
|
|
left => { |
291
|
|
|
|
|
|
|
start => $tabsec->{left}{secs}[$l]{start}, |
292
|
|
|
|
|
|
|
secs => [] }, |
293
|
|
|
|
|
|
|
right => { |
294
|
|
|
|
|
|
|
start => $tabsec->{right}{secs}[$r]{start}, |
295
|
|
|
|
|
|
|
secs => [] } |
296
|
|
|
|
|
|
|
}; |
297
|
55
|
|
|
|
|
103
|
push @{$chunks->[-1]{left}{secs}}, $l; |
|
55
|
|
|
|
|
216
|
|
298
|
55
|
|
|
|
|
92
|
push @{$chunks->[-1]{right}{secs}}, $r; |
|
55
|
|
|
|
|
288
|
|
299
|
|
|
|
|
|
|
} |
300
|
|
|
|
|
|
|
} |
301
|
|
|
|
|
|
|
|
302
|
6
|
50
|
|
|
|
1022
|
unlink("$fL.secs","$fR.secs",$diff_file) unless defined($options->{dump}); |
303
|
|
|
|
|
|
|
|
304
|
6
|
|
|
|
|
55
|
$chunks->[-1]{left}{end} = $tabsec->{left}{secs}[-1]{end}; |
305
|
6
|
|
|
|
|
51
|
$chunks->[-1]{right}{end} = $tabsec->{right}{secs}[-1]{end}; |
306
|
|
|
|
|
|
|
# if(defined($options->{dump})){ |
307
|
|
|
|
|
|
|
# open my $cf, '>', "$dir${fL}_$fR.chunks"; |
308
|
|
|
|
|
|
|
# print $cf Dumper($chunks); |
309
|
|
|
|
|
|
|
# close $cf; |
310
|
|
|
|
|
|
|
# } |
311
|
6
|
|
|
|
|
1983
|
return $chunks; |
312
|
|
|
|
|
|
|
} |
313
|
|
|
|
|
|
|
|
314
|
|
|
|
|
|
|
=head2 populate |
315
|
|
|
|
|
|
|
|
316
|
|
|
|
|
|
|
From a given file in which sections have been delimited with |
317
|
|
|
|
|
|
|
Text::Perfide::BookCleaner, creates and returns a list containing information |
318
|
|
|
|
|
|
|
about the sections of this file: id, start offset and end offset. |
319
|
|
|
|
|
|
|
|
320
|
|
|
|
|
|
|
=cut |
321
|
|
|
|
|
|
|
|
322
|
|
|
|
|
|
|
sub populate{ |
323
|
12
|
|
|
12
|
1
|
907
|
my ($file) = shift; |
324
|
12
|
|
|
|
|
31
|
my (@idlist,$text); |
325
|
12
|
50
|
|
|
|
914
|
open my $fh, '<', $file or die "Could not open file '$file'"; |
326
|
12
|
|
|
|
|
2413
|
$text = join '',<$fh>; |
327
|
|
|
|
|
|
|
|
328
|
12
|
|
|
|
|
194067
|
push @idlist, { 'id' => 'begin', 'start' => 0, title => 'begin' }; |
329
|
12
|
|
|
|
|
730
|
while($text =~ /(_sec.*:)(.*?_)/g){ |
330
|
297
|
|
|
|
|
600397
|
$idlist[-1]{end} = $-[0]-1; |
331
|
297
|
|
|
|
|
881153
|
push @idlist,{ |
332
|
|
|
|
|
|
|
'id' => $2, |
333
|
|
|
|
|
|
|
'start' => $-[0], |
334
|
|
|
|
|
|
|
}; |
335
|
|
|
|
|
|
|
|
336
|
|
|
|
|
|
|
# Get title |
337
|
297
|
|
|
|
|
646643
|
my $subs = substr($text,$-[0],200); |
338
|
|
|
|
|
|
|
# if ($subs =~ /.*\n*(.|\n){5,100}?(?=\n)/pg) { $idlist[-1]->{title} = ${^MATCH}; } |
339
|
297
|
100
|
|
|
|
4374
|
if ($subs =~ /.*\s+([^\n]{1,100})/g) { $idlist[-1]->{title} = ${^MATCH}; } |
|
295
|
|
|
|
|
10514
|
|
340
|
2
|
|
|
|
|
30
|
else { $idlist[-1]->{title} = '' } |
341
|
|
|
|
|
|
|
} |
342
|
12
|
|
|
|
|
44
|
$idlist[-1]{end} = length $text; |
343
|
12
|
|
|
|
|
749
|
return \@idlist; |
344
|
|
|
|
|
|
|
} |
345
|
|
|
|
|
|
|
|
346
|
|
|
|
|
|
|
=head2 moreinfosecs |
347
|
|
|
|
|
|
|
|
348
|
|
|
|
|
|
|
Calculates metrics on each pair of sections (length in words, ...) |
349
|
|
|
|
|
|
|
|
350
|
|
|
|
|
|
|
=cut |
351
|
|
|
|
|
|
|
|
352
|
|
|
|
|
|
|
sub moreinfosecs{ |
353
|
6
|
|
|
6
|
1
|
528
|
my ($tabsec,$options) = @_; |
354
|
6
|
|
|
|
|
33
|
my ($fileL,$fileR) = ($tabsec->{left}{file},$tabsec->{right}{file}); |
355
|
6
|
|
|
|
|
538
|
open my $fhL, '<', $fileL; |
356
|
6
|
|
|
|
|
637
|
open my $fhR, '<', $fileR; |
357
|
6
|
|
|
|
|
272
|
my $t; |
358
|
|
|
|
|
|
|
|
359
|
6
|
|
|
|
|
17
|
for my $sec (@{$tabsec->{left}{secs}}){ |
|
6
|
|
|
|
|
29
|
|
360
|
119
|
|
|
|
|
1530
|
my $start = $sec->{start}; |
361
|
119
|
|
|
|
|
395
|
my $length = $sec->{end} - $sec->{start}; |
362
|
119
|
|
|
|
|
8631
|
seek($fhL,0,0); |
363
|
119
|
|
|
|
|
10284
|
read($fhL,$t,$start); # because seek only works in bytes |
364
|
119
|
|
|
|
|
1064811
|
read($fhL,$t,$length); |
365
|
119
|
|
|
|
|
126597
|
$sec->{wc} = split /\s+/,$t; |
366
|
|
|
|
|
|
|
} |
367
|
|
|
|
|
|
|
|
368
|
6
|
|
|
|
|
149
|
for my $sec (@{$tabsec->{right}{secs}}){ |
|
6
|
|
|
|
|
38
|
|
369
|
190
|
|
|
|
|
651
|
my $start = $sec->{start}; |
370
|
190
|
|
|
|
|
890
|
my $length = $sec->{end} - $sec->{start}; |
371
|
190
|
|
|
|
|
5810
|
seek($fhL,0,0); |
372
|
190
|
|
|
|
|
9017
|
read($fhL,$t,$start); # because seek only works in bytes |
373
|
190
|
|
|
|
|
1684866
|
read($fhL,$t,$length); |
374
|
190
|
|
|
|
|
79658
|
$sec->{wc} = split /\s+/,$t; |
375
|
|
|
|
|
|
|
} |
376
|
|
|
|
|
|
|
|
377
|
6
|
|
|
|
|
27
|
my $dir = $options->{dir}; |
378
|
6
|
50
|
|
|
|
44
|
if(defined($dir)) { $dir.= "/"; } |
|
0
|
|
|
|
|
0
|
|
379
|
6
|
|
|
|
|
21
|
else { $dir = ""; } |
380
|
6
|
|
|
|
|
630
|
my ($fL,$fR) = (basename($tabsec->{left}{file}), basename($tabsec->{right}{file})); |
381
|
|
|
|
|
|
|
|
382
|
6
|
50
|
|
|
|
366
|
_dump2file($tabsec,"$dir${fL}_$fR.tabsec") if ($options->{dump}); |
383
|
|
|
|
|
|
|
} |
384
|
|
|
|
|
|
|
|
385
|
|
|
|
|
|
|
=head2 moreinfochunks |
386
|
|
|
|
|
|
|
|
387
|
|
|
|
|
|
|
Calculates metrics on each pair of chunks (length in words, ...) |
388
|
|
|
|
|
|
|
|
389
|
|
|
|
|
|
|
=cut |
390
|
|
|
|
|
|
|
|
391
|
|
|
|
|
|
|
sub moreinfochunks{ |
392
|
6
|
|
|
6
|
1
|
288
|
my ($chunks,$tabsec,$options) = @_; |
393
|
6
|
|
|
|
|
46
|
for my $chun (@$chunks){ |
394
|
55
|
|
|
|
|
75
|
my $sum; |
395
|
55
|
|
|
|
|
87
|
map { $sum+= $tabsec->{left}{secs}[$_]{wc} } @{$chun->{left}{secs}}; |
|
119
|
|
|
|
|
255
|
|
|
55
|
|
|
|
|
117
|
|
396
|
55
|
|
|
|
|
114
|
$chun->{left}{wc} = $sum; |
397
|
55
|
|
|
|
|
155
|
$sum=0; |
398
|
55
|
|
|
|
|
84
|
map { $sum+= $tabsec->{right}{secs}[$_]{wc} } @{$chun->{right}{secs}}; |
|
190
|
|
|
|
|
564
|
|
|
55
|
|
|
|
|
104
|
|
399
|
55
|
|
|
|
|
135
|
$chun->{right}{wc} = $sum; |
400
|
|
|
|
|
|
|
} |
401
|
6
|
|
|
|
|
38
|
my $dir = $options->{dir}; |
402
|
6
|
50
|
|
|
|
46
|
if(defined($dir)) { $dir.= "/"; } |
|
0
|
|
|
|
|
0
|
|
403
|
6
|
|
|
|
|
40
|
else { $dir = ""; } |
404
|
6
|
|
|
|
|
961
|
my ($fL,$fR) = (basename($tabsec->{left}{file}), basename($tabsec->{right}{file})); |
405
|
|
|
|
|
|
|
|
406
|
6
|
50
|
|
|
|
64
|
_dump2file($chunks,"$dir${fL}_$fR.chunks") if ($options->{dump}); |
407
|
|
|
|
|
|
|
} |
408
|
|
|
|
|
|
|
|
409
|
|
|
|
|
|
|
=head2 load_localrc |
410
|
|
|
|
|
|
|
=cut |
411
|
|
|
|
|
|
|
|
412
|
|
|
|
|
|
|
sub load_localrc { |
413
|
0
|
|
|
0
|
1
|
|
my $localrc = shift; |
414
|
0
|
0
|
|
|
|
|
die "Could not find configuration file '$localrc'" unless -e $localrc; |
415
|
0
|
|
|
|
|
|
return do $localrc; |
416
|
|
|
|
|
|
|
} |
417
|
|
|
|
|
|
|
|
418
|
|
|
|
|
|
|
sub _dump2file { |
419
|
0
|
|
|
0
|
|
|
my ($ref,$filename) = @_; |
420
|
0
|
0
|
|
|
|
|
open my $fn, '>', $filename or die "Could not open file '$filename' for writing."; |
421
|
0
|
|
|
|
|
|
print $fn Dumper($ref); |
422
|
0
|
|
|
|
|
|
close $fn; |
423
|
|
|
|
|
|
|
} |
424
|
|
|
|
|
|
|
|
425
|
|
|
|
|
|
|
sub _numcmp{ |
426
|
0
|
|
|
0
|
|
|
my ($a,$b) = @_; |
427
|
0
|
0
|
|
|
|
|
return _numcmp($b,$a) if ($a>$b); |
428
|
0
|
0
|
0
|
|
|
|
return 1 if ($a==0 and $b==0); |
429
|
0
|
0
|
|
|
|
|
return 10 if ($b==0); |
430
|
0
|
|
|
|
|
|
return $a/$b; |
431
|
|
|
|
|
|
|
} |
432
|
|
|
|
|
|
|
|
433
|
|
|
|
|
|
|
=head1 AUTHOR |
434
|
|
|
|
|
|
|
|
435
|
|
|
|
|
|
|
Andre Santos, C<< >> |
436
|
|
|
|
|
|
|
Jose Joao Almeida, C<< >> |
437
|
|
|
|
|
|
|
|
438
|
|
|
|
|
|
|
=head1 BUGS |
439
|
|
|
|
|
|
|
|
440
|
|
|
|
|
|
|
Please report any bugs or feature requests to C, or through |
441
|
|
|
|
|
|
|
the web interface at L. I will be notified, and then you'll |
442
|
|
|
|
|
|
|
automatically be notified of progress on your bug as I make changes. |
443
|
|
|
|
|
|
|
|
444
|
|
|
|
|
|
|
=head1 ACKNOWLEDGEMENTS |
445
|
|
|
|
|
|
|
|
446
|
|
|
|
|
|
|
|
447
|
|
|
|
|
|
|
=head1 LICENSE AND COPYRIGHT |
448
|
|
|
|
|
|
|
|
449
|
|
|
|
|
|
|
Copyright 2011 Project Natura. |
450
|
|
|
|
|
|
|
|
451
|
|
|
|
|
|
|
This program is free software; you can redistribute it and/or modify it |
452
|
|
|
|
|
|
|
under the terms of either: the GNU General Public License as published |
453
|
|
|
|
|
|
|
by the Free Software Foundation; or the Artistic License. |
454
|
|
|
|
|
|
|
|
455
|
|
|
|
|
|
|
See http://dev.perl.org/licenses/ for more information. |
456
|
|
|
|
|
|
|
|
457
|
|
|
|
|
|
|
|
458
|
|
|
|
|
|
|
=cut |
459
|
|
|
|
|
|
|
|
460
|
|
|
|
|
|
|
1; # End of Text::Perfide::BookSync |