| blib/lib/Mail/Digest/Tools.pm | |||
|---|---|---|---|
| Criterion | Covered | Total | % | 
| statement | 572 | 854 | 66.9 | 
| branch | 166 | 422 | 39.3 | 
| condition | 9 | 68 | 13.2 | 
| subroutine | 30 | 34 | 88.2 | 
| pod | 7 | 7 | 100.0 | 
| total | 784 | 1385 | 56.6 | 
| line | stmt | bran | cond | sub | pod | time | code | 
|---|---|---|---|---|---|---|---|
| 1 | package Mail::Digest::Tools; | ||||||
| 2 | $VERSION = 2.12; # 05/14/2011 | ||||||
| 3 | 2 | 2 | 49382 | use strict; | |||
| 2 | 7 | ||||||
| 2 | 159 | ||||||
| 4 | 2 | 2 | 14 | use warnings; | |||
| 2 | 4 | ||||||
| 2 | 66 | ||||||
| 5 | 2 | 2 | 3589 | use Time::Local; | |||
| 2 | 5002 | ||||||
| 2 | 32009 | ||||||
| 6 | our @ISA = ("Exporter"); | ||||||
| 7 | our @EXPORT_OK = qw( | ||||||
| 8 | process_new_digests | ||||||
| 9 | reprocess_ALL_digests | ||||||
| 10 | reply_to_digest_message | ||||||
| 11 | repair_message_order | ||||||
| 12 | consolidate_threads_multiple | ||||||
| 13 | consolidate_threads_single | ||||||
| 14 | delete_deletables | ||||||
| 15 | ); | ||||||
| 16 | our %EXPORT_TAGS = ( | ||||||
| 17 | all => \@EXPORT_OK, | ||||||
| 18 | ); | ||||||
| 19 | |||||||
| 20 | ########################## Package Variables ################################### | ||||||
| 21 | |||||||
| 22 | our %month30 = map {$_, 1} (4,6,9,11); | ||||||
| 23 | our %month31 = map {$_, 1} (1,3,5,7,8,10,12); | ||||||
| 24 | our %unix = map {$_, 1} | ||||||
| 25 | qw| Unix linux darwin freebsd netbsd openbsd mirbsd cygwin solaris |; | ||||||
| 26 | |||||||
| 27 | ############################### Initializer ################################### | ||||||
| 28 | |||||||
| 29 | sub _config_check { | ||||||
| 30 | 6 | 6 | 24 | my ($config_in_ref, $config_out_ref) = @_; | |||
| 31 | 0 | 0 | die "Cannot find ${$config_out_ref}{'dir_digest'}: $!" | ||||
| 6 | 314 | ||||||
| 32 | 6 | 50 | 104 | unless (-d ${$config_out_ref}{'dir_digest'}); | |||
| 33 | 6 | 178 | die "Missing threads directory: $!" | ||||
| 34 | 6 | 50 | 16 | unless (-d ${$config_out_ref}{'dir_threads'}); | |||
| 35 | 6 | 87 | die "Except for '\n' newline, backslashes are not permitted\n in Thread Message Delimiter: $!" | ||||
| 36 | 6 | 50 | 24 | if (${$config_out_ref}{'thread_msg_delimiter'} =~ /\\[^n]|\\$/); | |||
| 37 | # to do: | ||||||
| 38 | # here do error checking on other digest.data info that is | ||||||
| 39 | # absolutely necessary for all conceivable uses of Mail::Digest::Tools | ||||||
| 40 | } | ||||||
| 41 | |||||||
| 42 | ############################ Public Methods #################################### | ||||||
| 43 | |||||||
| 44 | sub process_new_digests { | ||||||
| 45 | 2 | 2 | 1 | 86320 | my ($config_in_ref, $config_out_ref) = @_; | ||
| 46 | 2 | 27 | _config_check($config_in_ref, $config_out_ref); | ||||
| 47 | 2 | 8 | my $choice = _start_new_only(${$config_out_ref}{'title'}); | ||||
| 2 | 28 | ||||||
| 48 | 2 | 1157 | _main_processor($config_in_ref, $config_out_ref, $choice); | ||||
| 49 | } | ||||||
| 50 | |||||||
| 51 | sub reprocess_ALL_digests { | ||||||
| 52 | 0 | 0 | 1 | 0 | my ($config_in_ref, $config_out_ref) = @_; | ||
| 53 | 0 | 0 | _config_check($config_in_ref, $config_out_ref); | ||||
| 54 | 0 | 0 | my $choice = _start_ALL(${$config_out_ref}{'title'}); | ||||
| 0 | 0 | ||||||
| 55 | 0 | 0 | _main_processor($config_in_ref, $config_out_ref, $choice); | ||||
| 56 | } | ||||||
| 57 | |||||||
| 58 | sub reply_to_digest_message { | ||||||
| 59 | 2 | 2 | 1 | 65539 | my ($config_in_ref, $config_out_ref, | ||
| 60 | $dig_number, $dig_entry, $dir_for_reply) = @_; | ||||||
| 61 | 2 | 15 | _config_check($config_in_ref, $config_out_ref); | ||||
| 62 | 2 | 12 | my $digests_ref = _get_digest_list( | ||||
| 63 | $config_in_ref, | ||||||
| 64 | $config_out_ref, | ||||||
| 65 | ); | ||||||
| 66 | 2 | 11 | my $digest_verified = _identify_target_digest( | ||||
| 67 | $config_in_ref, | ||||||
| 68 | $config_out_ref, | ||||||
| 69 | $dig_number, | ||||||
| 70 | $dig_entry, | ||||||
| 71 | $digests_ref | ||||||
| 72 | ); | ||||||
| 73 | 2 | 90 | my $replyfile = _strip_down_for_reply( | ||||
| 74 | $config_in_ref, | ||||||
| 75 | $config_out_ref, | ||||||
| 76 | $digest_verified, | ||||||
| 77 | $dig_entry, | ||||||
| 78 | $dir_for_reply, | ||||||
| 79 | ); | ||||||
| 80 | 2 | 16 | return $replyfile; | ||||
| 81 | } | ||||||
| 82 | |||||||
| 83 | sub repair_message_order { | ||||||
| 84 | # But what about todays_topics.txt? It will be out of order as well. | ||||||
| 85 | 2 | 2 | 1 | 6339 | my ($config_in_ref, $config_out_ref, $error_date_ref) = @_; | ||
| 86 | 2 | 11 | _config_check($config_in_ref, $config_out_ref); | ||||
| 87 | 2 | 5 | local $_; | ||||
| 88 | 2 | 19 | my $date_threshold = _verify_date($error_date_ref); | ||||
| 89 | 2 | 313 | my $delimiter = ${$config_out_ref}{'thread_msg_delimiter'}; | ||||
| 2 | 9 | ||||||
| 90 | 2 | 10 | my $dir_threads = ${$config_out_ref}{'dir_threads'}; | ||||
| 2 | 6 | ||||||
| 91 | 2 | 4 | my (@threadfiles, @resorted_threadfiles); | ||||
| 92 | 2 | 50 | 49 | chdir $dir_threads or die "Unable to change to $dir_threads: $!"; | |||
| 93 | 2 | 50 | 87 | opendir DIR, $dir_threads or die "Unable to open $dir_threads: $!"; | |||
| 94 | 2 | 51 | @threadfiles = grep {! m/^\./ } readdir DIR; | ||||
| 8 | 42 | ||||||
| 95 | 2 | 50 | 27 | closedir DIR or die "Unable to close $dir_threads: $!"; | |||
| 96 | 2 | 15 | foreach my $in (@threadfiles) { | ||||
| 97 | 4 | 6 | my (@msgids); | ||||
| 98 | my (%messages); | ||||||
| 99 | 4 | 53 | my $mtime = (stat($in))[9]; | ||||
| 100 | 4 | 50 | 15 | if ($date_threshold < $mtime) { | |||
| 101 | 4 | 13 | my $msgs_ref = _get_array_of_messages($in, $delimiter); | ||||
| 102 | 4 | 7 | foreach my $msg (@{$msgs_ref}) { | ||||
| 4 | 11 | ||||||
| 103 | 16 | 179 | my @lines = split(/\n/, $msg); | ||||
| 104 | 16 | 33 | my ($ln); | ||||
| 105 | 16 | 38 | while (defined($ln = shift(@lines))) { | ||||
| 106 | 32 | 100 | 142 | if ($ln =~ /^Message: ([\d_]+)$/) { | |||
| 107 | 16 | 39 | push(@msgids, $1); | ||||
| 108 | 16 | 45 | $messages{$1} = $msg; | ||||
| 109 | 16 | 65 | last; | ||||
| 110 | } | ||||||
| 111 | } | ||||||
| 112 | } | ||||||
| 113 | 4 | 8 | my ($need_resort_flag); | ||||
| 114 | 4 | 15 | for (my $el = 1; $el <= $#msgids; $el++) { | ||||
| 115 | 6 | 100 | 25 | if ($msgids[$el] lt $msgids[$el-1]) { | |||
| 116 | 4 | 6 | $need_resort_flag++; | ||||
| 117 | 4 | 6 | last; | ||||
| 118 | } | ||||||
| 119 | } | ||||||
| 120 | 4 | 50 | 11 | if ($need_resort_flag) { | |||
| 121 | 4 | 10 | my $out = "$in.bak"; | ||||
| 122 | 4 | 50 | 361 | open OUT, ">$out" or die "Couldn't open $out for writing: $!"; | |||
| 123 | 4 | 29 | foreach my $msg (sort keys %messages) { | ||||
| 124 | 16 | 62 | print OUT $messages{$msg}, $delimiter; | ||||
| 125 | } | ||||||
| 126 | 4 | 50 | 170 | close OUT or die "Couldn't close $out after writing: $!"; | |||
| 127 | 4 | 50 | 369 | rename($out, $in) or die "Couldn't rename $out to $in: $!"; | |||
| 128 | 4 | 32 | push(@resorted_threadfiles, $in); | ||||
| 129 | } | ||||||
| 130 | } | ||||||
| 131 | } | ||||||
| 132 | 2 | 50 | 12 | if (@resorted_threadfiles) { | |||
| 133 | 2 | 254 | print "Message order has been re-sorted in\n"; | ||||
| 134 | 2 | 285 | print " $_\n" foreach @resorted_threadfiles; | ||||
| 135 | } | ||||||
| 136 | } | ||||||
| 137 | |||||||
| 138 | sub consolidate_threads_multiple { | ||||||
| 139 | 0 | 0 | 1 | 0 | my ($config_in_ref, $config_out_ref); | ||
| 140 | 0 | 0 | $config_in_ref = shift; | ||||
| 141 | 0 | 0 | $config_out_ref = shift; | ||||
| 142 | 0 | 0 | 0 | my $first_common_letters = defined $_[0] ? $_[0] : 20; | |||
| 143 | 0 | 0 | my $delimiter = ${$config_out_ref}{'thread_msg_delimiter'}; | ||||
| 0 | 0 | ||||||
| 144 | 0 | 0 | my $dir_threads = ${$config_out_ref}{'dir_threads'}; | ||||
| 0 | 0 | ||||||
| 145 | 0 | 0 | local $_; | ||||
| 146 | 0 | 0 | my (@threadfiles, %threadstubs, %stubs_for_consol); | ||||
| 147 | 0 | 0 | 0 | chdir $dir_threads or die "Unable to change to $dir_threads: $!"; | |||
| 148 | 0 | 0 | 0 | opendir DIR, $dir_threads or die "Unable to open $dir_threads: $!"; | |||
| 149 | 0 | 0 | @threadfiles = map {/(.*)\.thr\.txt$/} readdir DIR; | ||||
| 0 | 0 | ||||||
| 150 | 0 | 0 | 0 | closedir DIR or die "Unable to close $dir_threads: $!"; | |||
| 151 | 0 | 0 | foreach (@threadfiles) { | ||||
| 152 | 0 | 0 | my $stub = substr($_, 0, $first_common_letters); | ||||
| 153 | 0 | 0 | push @{$threadstubs{$stub}}, "$_.thr.txt"; | ||||
| 0 | 0 | ||||||
| 154 | } | ||||||
| 155 | 0 | 0 | my ($k,$v, $consolcount); | ||||
| 156 | 0 | 0 | CONSOL: while ( ($k,$v) = each(%threadstubs)) { | ||||
| 157 | 0 | 0 | 0 | if (@{$v} > 1) { | |||
| 0 | 0 | ||||||
| 158 | 0 | 0 | $consolcount++; | ||||
| 159 | 0 | 0 | print "Candidates for consolidation:\n"; | ||||
| 160 | 0 | 0 | foreach my $thrfile (@{$v}) { | ||||
| 0 | 0 | ||||||
| 161 | 0 | 0 | print " $thrfile\n"; | ||||
| 162 | } | ||||||
| 163 | 0 | 0 | while () { | ||||
| 164 | 0 | 0 | my ($selection); | ||||
| 165 | 0 | 0 | print "\nTo consolidate, type YES: "; | ||||
| 166 | 0 | 0 | chomp ($selection = <>); | ||||
| 167 | 0 | 0 | 0 | if ($selection eq 'YES') { | |||
| 168 | 0 | 0 | print "\n Files will be consolidated\n\n"; | ||||
| 169 | 0 | 0 | $stubs_for_consol{$k} = $v; | ||||
| 170 | } else { | ||||||
| 171 | 0 | 0 | print "\n Files will not be consolidated\n\n"; | ||||
| 172 | } | ||||||
| 173 | 0 | 0 | next CONSOL; | ||||
| 174 | } | ||||||
| 175 | } | ||||||
| 176 | } | ||||||
| 177 | 0 | 0 | 0 | unless ($consolcount) { | |||
| 178 | 0 | 0 | warn "\nAnalysis of the first $first_common_letters letters of each file in\n $dir_threads\n shows no candidates for consolidation. Please hard-code\n names of files you wish to consolidate as arguments to\n \&consolidate_threads_single:\n $!"; | ||||
| 179 | } | ||||||
| 180 | 0 | 0 | foreach my $k (keys %stubs_for_consol) { | ||||
| 181 | 0 | 0 | consolidate_threads_single( | ||||
| 182 | $config_in_ref, | ||||||
| 183 | $config_out_ref, | ||||||
| 184 | 0 | 0 | \@{$stubs_for_consol{$k}} | ||||
| 185 | ); | ||||||
| 186 | } | ||||||
| 187 | } | ||||||
| 188 | |||||||
| 189 | sub consolidate_threads_single { | ||||||
| 190 | 2 | 2 | 1 | 25666 | my ($config_in_ref, $config_out_ref, $filesref) = @_; | ||
| 191 | 2 | 5 | my $delimiter = ${$config_out_ref}{'thread_msg_delimiter'}; | ||||
| 2 | 10 | ||||||
| 192 | 2 | 4 | my $dir_threads = ${$config_out_ref}{'dir_threads'}; | ||||
| 2 | 7 | ||||||
| 193 | 2 | 4 | local $_; | ||||
| 194 | 2 | 5 | my (%messages, @superseded); | ||||
| 195 | 2 | 4 | foreach my $in (@{$filesref}) { | ||||
| 2 | 10 | ||||||
| 196 | 4 | 50 | 107 | unless ($in =~ /^$dir_threads/) { | |||
| 197 | 4 | 15 | $in = "$dir_threads/$in"; | ||||
| 198 | } | ||||||
| 199 | 4 | 18 | my $msgs_ref = _get_array_of_messages($in, $delimiter); | ||||
| 200 | 4 | 9 | foreach my $msg (@{$msgs_ref}) { | ||||
| 4 | 12 | ||||||
| 201 | 9 | 1303 | my @lines = split(/\n/, $msg); | ||||
| 202 | 9 | 26 | my ($ln); | ||||
| 203 | 9 | 29 | while (defined($ln = shift(@lines))) { | ||||
| 204 | 18 | 100 | 84 | if ($ln =~ /^Message: ([\d_]+)$/) { | |||
| 205 | 9 | 50 | 39 | die "Message $1 already exists: $!" | |||
| 206 | if (exists $messages{$1}); | ||||||
| 207 | 9 | 45 | $messages{$1} = [ $msg, $in ]; | ||||
| 208 | 9 | 51 | last; | ||||
| 209 | } | ||||||
| 210 | } | ||||||
| 211 | } | ||||||
| 212 | 4 | 16 | push(@superseded, $in); | ||||
| 213 | } | ||||||
| 214 | 2 | 21 | my @msgids = sort keys %messages; | ||||
| 215 | 2 | 11 | my $first_in_thread = "$messages{$msgids[0]}[1]"; | ||||
| 216 | 2 | 7 | my $out = $first_in_thread . '.bak'; | ||||
| 217 | 2 | 50 | 244 | open OUT, ">$out" or die "Couldn't open $out for writing: $!"; | |||
| 218 | 2 | 16 | foreach (sort keys %messages) { | ||||
| 219 | 9 | 38 | print OUT $messages{$_}[0], $delimiter; | ||||
| 220 | } | ||||||
| 221 | 2 | 50 | 116 | close OUT or die "Couldn't close $out after writing: $!"; | |||
| 222 | 2 | 8 | foreach (@superseded) { | ||||
| 223 | 4 | 50 | 352 | rename($_, $_ . '.DELETABLE') or die "Couldn't rename $_: $!"; | |||
| 224 | } | ||||||
| 225 | 2 | 50 | 158 | rename($out, $first_in_thread) | |||
| 226 | || die "Couldn't rename $out to $first_in_thread: $!"; | ||||||
| 227 | } | ||||||
| 228 | |||||||
| 229 | sub delete_deletables { | ||||||
| 230 | 2 | 2 | 1 | 8262 | my $config_out_ref = shift; | ||
| 231 | 2 | 6 | my $dir_threads = ${$config_out_ref}{'dir_threads'}; | ||||
| 2 | 10 | ||||||
| 232 | 2 | 4 | local $_; | ||||
| 233 | 2 | 4 | my (@deletables); | ||||
| 234 | 2 | 50 | 906 | chdir $dir_threads or die "Unable to change to $dir_threads: $!"; | |||
| 235 | 2 | 50 | 5286 | opendir DIR, $dir_threads or die "Unable to open $dir_threads: $!"; | |||
| 236 | 2 | 41 | @deletables = grep { /\.DELETABLE$/ } readdir DIR; | ||||
| 10 | 53 | ||||||
| 237 | 2 | 50 | 33 | closedir DIR or die "Unable to close $dir_threads: $!"; | |||
| 238 | 2 | 26 | foreach (@deletables) { | ||||
| 239 | 4 | 251 | print "Deleting $_\n"; | ||||
| 240 | 4 | 50 | 539 | unlink $_ or die "Couldn't unlink $_: $!"; | |||
| 241 | } | ||||||
| 242 | } | ||||||
| 243 | |||||||
| 244 | ############################ Private Methods ################################### | ||||||
| 245 | |||||||
| 246 | sub _start_new_only { | ||||||
| 247 | 2 | 2 | 15 | my $full_title = shift; | |||
| 248 | 2 | 309 | print "\nProcessing new $full_title digest files only!\n\n"; | ||||
| 249 | 2 | 19 | return ''; | ||||
| 250 | } | ||||||
| 251 | |||||||
| 252 | sub _start_ALL { | ||||||
| 253 | # prints screen prompts which ask user to choose between | ||||||
| 254 | # default version (process newly arrived digests only) and | ||||||
| 255 | # full version (process or re-process all digests) | ||||||
| 256 | 0 | 0 | 0 | my $full_title = shift @_; | |||
| 257 | 0 | 0 | my ($choice); | ||||
| 258 | 0 | 0 | print "\n " . uc($full_title) . "\n"; | ||||
| 259 | 0 | 0 | print < | ||||
| 260 | |||||||
| 261 | By default, this program processes only NEWLY ARRIVED | ||||||
| 262 | $full_title files found in this directory. Messages in | ||||||
| 263 | these new digests are sorted and appended to the appropriate | ||||||
| 264 | ".thr.txt" files in the "Threads" subdirectory. | ||||||
| 265 | |||||||
| 266 | However, by choosing method 'reprocess_ALL_digests()' you have | ||||||
| 267 | indicated that you wish to process ALL digest files found in this | ||||||
| 268 | directory -- regardless of whether or not they have previously been | ||||||
| 269 | processed. This is recommended ONLY for initialization and testing | ||||||
| 270 | of this program. | ||||||
| 271 | |||||||
| 272 | Since this will wipe out all threads files ('.thr.txt') as well -- | ||||||
| 273 | including threads files for which you no longer have their source | ||||||
| 274 | digest files -- please confirm that this is your intent by typing | ||||||
| 275 | ALL at the prompt. | ||||||
| 276 | |||||||
| 277 | |||||||
| 278 | GOT IT? | ||||||
| 279 | |||||||
| 280 | XQ18 | ||||||
| 281 | |||||||
| 282 | 0 | 0 | print qq{Hit 'Enter' -- or, to process ALL digests in this directory, | ||||
| 283 | type 'ALL' and hit 'Enter': }; | ||||||
| 284 | 0 | 0 | chomp ($choice = | ||||
| 285 | 0 | 0 | 0 | if ($choice eq 'ALL') { | |||
| 286 | 0 | 0 | print qq{ | ||||
| 287 | You have chosen to WIPE OUT all '.thr.txt' files currently | ||||||
| 288 | existing in the 'Threads' subdirectory and reprocess all | ||||||
| 289 | $full_title digest files from scratch. | ||||||
| 290 | |||||||
| 291 | Please re-confirm your choice by once again typing 'ALL' | ||||||
| 292 | and hitting 'Enter': }; | ||||||
| 293 | |||||||
| 294 | 0 | 0 | chomp (my $confirm = | ||||
| 295 | 0 | 0 | 0 | if ($choice eq $confirm) { | |||
| 296 | 0 | 0 | print "\n Processing ALL digests in this directory!\n"; | ||||
| 297 | } else { | ||||||
| 298 | 0 | 0 | die "\n Choice not confirmed; exiting program. $!\n"; | ||||
| 299 | } | ||||||
| 300 | } else { | ||||||
| 301 | 0 | 0 | print "\n Processing new digest files only!\n"; | ||||
| 302 | 0 | 0 | $choice = ''; | ||||
| 303 | } | ||||||
| 304 | 0 | 0 | print "\n"; | ||||
| 305 | 0 | 0 | return $choice; | ||||
| 306 | } | ||||||
| 307 | |||||||
| 308 | sub _main_processor { | ||||||
| 309 | 2 | 2 | 8 | my ($config_in_ref, $config_out_ref, $choice) = @_; | |||
| 310 | |||||||
| 311 | 2 | 25 | my $recentref = _archive_or_kill($config_out_ref); | ||||
| 312 | |||||||
| 313 | 2 | 77 | my $digests_ref = _get_digest_list($config_in_ref, $config_out_ref); | ||||
| 314 | |||||||
| 315 | 2 | 12 | my $in_out_ref = _prep_source_file( | ||||
| 316 | $config_in_ref, $config_out_ref, $digests_ref); #v1.94 | ||||||
| 317 | |||||||
| 318 | 2 | 24 | $in_out_ref = _get_log_data($config_out_ref, $choice, $in_out_ref); | ||||
| 319 | |||||||
| 320 | 2 | 4 | my ($message_count, $thread_count); | ||||
| 321 | 2 | 20 | ($in_out_ref, $message_count, $thread_count) = _strip_down( | ||||
| 322 | $in_out_ref, | ||||||
| 323 | $config_in_ref, | ||||||
| 324 | $config_out_ref, | ||||||
| 325 | $recentref, | ||||||
| 326 | ); | ||||||
| 327 | |||||||
| 328 | 2 | 22 | _update_all_topics($choice, $config_out_ref, $in_out_ref); | ||||
| 329 | |||||||
| 330 | 2 | 11 | _print_results( | ||||
| 331 | scalar(keys %$in_out_ref), | ||||||
| 332 | $message_count, | ||||||
| 333 | $config_out_ref, | ||||||
| 334 | $thread_count, | ||||||
| 335 | ); | ||||||
| 336 | } | ||||||
| 337 | |||||||
| 338 | sub _archive_or_kill { | ||||||
| 339 | 2 | 2 | 9 | my $config_out_ref = shift; | |||
| 340 | 2 | 10 | my $dir_threads = ${$config_out_ref}{'dir_threads'}; | ||||
| 2 | 8 | ||||||
| 341 | 2 | 8 | my $trigger = ${$config_out_ref}{'archive_kill_trigger'}; | ||||
| 2 | 153 | ||||||
| 342 | 2 | 19 | my $threshold = defined ${$config_out_ref}{'archive_kill_days'} | ||||
| 2 | 8 | ||||||
| 343 | 2 | 50 | 4 | ? ${$config_out_ref}{'archive_kill_days'} | |||
| 344 | : 14; # v1.95 | ||||||
| 345 | 2 | 10 | my ($thr, %recent, %nonrecent, $recentref); | ||||
| 346 | 2 | 50 | 87 | chdir($dir_threads) || die "cannot chdir to $dir_threads $!"; | |||
| 347 | 2 | 50 | 116 | opendir THR, $dir_threads or die "cannot open $dir_threads: $!"; | |||
| 348 | 2 | 102 | while ($thr = readdir THR) { | ||||
| 349 | 6 | 50 | 33 | 46 | next unless ( ($thr =~ /\.thr\.txt$/) and (-f $thr) ); | ||
| 350 | 0 | 0 | 0 | if ($trigger == 0) { | |||
| 351 | 0 | 0 | $recent{$thr}++; | ||||
| 352 | } else { | ||||||
| 353 | 0 | 0 | 0 | -M $thr <= $threshold # v1.95 | |||
| 354 | ? $recent{$thr}++ | ||||||
| 355 | : $nonrecent{$thr}++; | ||||||
| 356 | } | ||||||
| 357 | } | ||||||
| 358 | 2 | 50 | 36 | closedir THR or die "Cannot close $dir_threads: $!"; | |||
| 359 | 2 | 50 | 10 | return \%recent if ($trigger == 0); | |||
| 360 | 2 | 50 | 8 | if ($trigger == 1) { | |||
| 0 | |||||||
| 361 | 2 | 17 | _archive_old_files($config_out_ref, \%nonrecent); | ||||
| 362 | } elsif ($trigger == -1) { | ||||||
| 363 | 0 | 0 | _kill_old_files($config_out_ref, \%nonrecent); | ||||
| 364 | } else { | ||||||
| 365 | 0 | 0 | die "$trigger is invalid value for archive_kill_trigger: $!"; | ||||
| 366 | } | ||||||
| 367 | 2 | 19 | return \%recent; | ||||
| 368 | } | ||||||
| 369 | |||||||
| 370 | sub _archive_old_files { | ||||||
| 371 | 2 | 2 | 8 | my ($config_out_ref, $nonrecentref) = @_; | |||
| 372 | 2 | 3 | my $dir_threads = ${$config_out_ref}{'dir_threads'}; | ||||
| 2 | 8 | ||||||
| 373 | 2 | 82 | my $archfile = defined ${$config_out_ref}{'archived_today'} | ||||
| 2 | 9 | ||||||
| 374 | 2 | 50 | 12 | ? ${$config_out_ref}{'archived_today'} | |||
| 375 | 0 | 0 | : "${$config_out_ref}{'dir_digest'}/archived_today.txt"; | ||||
| 376 | 2 | 9 | my $dir_archive_top = ${$config_out_ref}{'dir_archive_top'}; | ||||
| 2 | 7 | ||||||
| 377 | 2 | 50 | 67 | die "Missing top archive directory: $!" unless (-d $dir_archive_top); | |||
| 378 | 2 | 17 | foreach ('a'..'z') { | ||||
| 379 | 52 | 50 | 3285 | die "Missing archive subdirectory $_: $!" unless (-d "$dir_archive_top/$_"); | |||
| 380 | } | ||||||
| 381 | 2 | 50 | 67 | die "Missing archive subdirectory 'other': $!" unless (-d "$dir_archive_top/other"); | |||
| 382 | |||||||
| 383 | 2 | 50 | 237 | open ARCH, ">$archfile" or die "Couldn't open $archfile for writing: $!"; | |||
| 384 | 2 | 144 | print ARCH 'Archived today (', scalar(localtime), "):\n"; | ||||
| 385 | 2 | 225 | print ARCH '-' x 41, "\n"; | ||||
| 386 | |||||||
| 387 | 2 | 6 | my ($thr, $archstr); | ||||
| 388 | 2 | 6 | my $toarchive = 0; | ||||
| 389 | 2 | 5 | foreach $thr (sort keys %{$nonrecentref}) { | ||||
| 2 | 19 | ||||||
| 390 | 0 | 0 | my $initial = lc(substr $thr, 0, 1); | ||||
| 391 | 0 | 0 | print "Archiving: $thr\n"; | ||||
| 392 | 0 | 0 | $archstr .= $thr . "\n"; | ||||
| 393 | 0 | 0 | 0 | if ($initial =~ /[a-zA-Z]/) { | |||
| 394 | 0 | 0 | 0 | rename($thr, "$dir_archive_top/$initial/$thr") or die "Couldn't move $thr: $!"; | |||
| 395 | } else { | ||||||
| 396 | 0 | 0 | 0 | rename($thr, "$dir_archive_top/other/$thr") or die "Couldn't move $thr: $!"; | |||
| 397 | } | ||||||
| 398 | 0 | 0 | $toarchive++; | ||||
| 399 | 0 | 0 | 0 | print "$toarchive files archived\n\n" if ($toarchive % 100 == 0); | |||
| 400 | } | ||||||
| 401 | 2 | 2244 | print "$toarchive files archived\n\n"; | ||||
| 402 | 2 | 50 | 18 | $toarchive ? print ARCH $archstr : print ARCH "[None.]\n"; | |||
| 403 | 2 | 50 | 303 | close ARCH or die "Couldn't close $archfile after writing: $!"; | |||
| 404 | } | ||||||
| 405 | |||||||
| 406 | sub _kill_old_files { | ||||||
| 407 | 0 | 0 | 0 | my ($config_out_ref, $nonrecentref) = @_; | |||
| 408 | 0 | 0 | my $dir_threads = ${$config_out_ref}{'dir_threads'}; | ||||
| 0 | 0 | ||||||
| 409 | 0 | 0 | my $killfile = defined ${$config_out_ref}{'deleted_today'} | ||||
| 0 | 0 | ||||||
| 410 | 0 | 0 | 0 | ? ${$config_out_ref}{'deleted_today'} | |||
| 411 | 0 | 0 | : "${$config_out_ref}{'dir_digest'}/deleted_today.txt"; # v1.95 | ||||
| 412 | 0 | 0 | 0 | open KILL, ">$killfile" or die "Couldn't open $killfile for writing: $!"; | |||
| 413 | 0 | 0 | print KILL 'Deleted today (', scalar(localtime), "):\n"; | ||||
| 414 | 0 | 0 | print KILL '-' x 40, "\n"; | ||||
| 415 | |||||||
| 416 | 0 | 0 | my ($thr, $killstr); | ||||
| 417 | 0 | 0 | my $tokill = 0; | ||||
| 418 | 0 | 0 | foreach $thr (sort keys %{$nonrecentref}) { | ||||
| 0 | 0 | ||||||
| 419 | 0 | 0 | print "Unlinking: $thr\n"; | ||||
| 420 | 0 | 0 | $killstr .= $thr . "\n"; | ||||
| 421 | 0 | 0 | 0 | unlink $thr or die "Couldn't unlink $thr: $!"; | |||
| 422 | 0 | 0 | $tokill++; | ||||
| 423 | 0 | 0 | 0 | print "$tokill files deleted\n" if ($tokill % 100 == 0); | |||
| 424 | } | ||||||
| 425 | 0 | 0 | print "$tokill files deleted\n"; | ||||
| 426 | 0 | 0 | 0 | $tokill ? print KILL $killstr : print KILL "[None.]\n"; | |||
| 427 | 0 | 0 | 0 | close KILL or die "Couldn't close $killfile after writing: $!"; | |||
| 428 | } | ||||||
| 429 | |||||||
| 430 | sub _get_digest_list { | ||||||
| 431 | 4 | 4 | 11 | my ($config_in_ref, $config_out_ref) = @_; | |||
| 432 | 4 | 50 | 8 | opendir(DIR, ${$config_out_ref}{'dir_digest'}) || die "no ${$config_out_ref}{'dir_digest'}?: $!"; | |||
| 0 | 0 | ||||||
| 4 | 218 | ||||||
| 433 | 8 | 42 | my @digests = | ||||
| 434 | 40 | 45 | sort { lc($a) cmp lc($b) } | ||||
| 435 | 4 | 81 | grep { /${$config_in_ref}{'grep_formula'}/ } | ||||
| 40 | 486 | ||||||
| 436 | readdir(DIR); | ||||||
| 437 | 4 | 50 | 82 | closedir(DIR) || die "Could not close ${$config_out_ref}{'dir_digest'}: $!"; | |||
| 0 | 0 | ||||||
| 438 | 4 | 17 | return \@digests; | ||||
| 439 | } | ||||||
| 440 | |||||||
| 441 | sub _prep_source_file { | ||||||
| 442 | 2 | 2 | 6 | my ($config_in_ref, $config_out_ref, $digests_ref) = @_; # v1.94 | |||
| 443 | # %in_out: hash of all instances in directory of a given digest, | ||||||
| 444 | # value refers to digest's title and its message topics | ||||||
| 445 | 2 | 5 | my (%in_out, $id); | ||||
| 446 | 2 | 6 | foreach (@{$digests_ref}) { | ||||
| 2 | 9 | ||||||
| 447 | 5 | 9 | $_ =~ m/${$config_in_ref}{'pattern_target'}/; | ||||
| 5 | 83 | ||||||
| 448 | 5 | 7 | $id = eval(${$config_out_ref}{'id_format'}); # v1.94 | ||||
| 5 | 1016 | ||||||
| 449 | 5 | 36 | $in_out{$id} = [ $_ ]; | ||||
| 450 | } | ||||||
| 451 | 2 | 7 | return \%in_out; | ||||
| 452 | } | ||||||
| 453 | |||||||
| 454 | sub _identify_target_digest { | ||||||
| 455 | 2 | 2 | 7 | my ($config_in_ref, $config_out_ref, | |||
| 456 | $dig_number, $dig_entry, $digests_ref) = @_; | ||||||
| 457 | 2 | 3 | my ($hit); | ||||
| 458 | 2 | 6 | foreach my $digfile (@{$digests_ref}) { | ||||
| 2 | 18 | ||||||
| 459 | 5 | 10 | $digfile =~ m/${$config_in_ref}{'pattern_target'}/; | ||||
| 5 | 74 | ||||||
| 460 | 5 | 100 | 33 | 39 | if (defined $2) { | ||
| 50 | |||||||
| 461 | 2 | 100 | 12 | next unless ($2 == $dig_number); | |||
| 462 | 1 | 3 | $hit = $digfile; | ||||
| 463 | 1 | 4 | last; | ||||
| 464 | } elsif ((defined $1) and (! defined $2)) { | ||||||
| 465 | 3 | 100 | 11 | next unless ($1 == $dig_number); | |||
| 466 | 1 | 3 | $hit = $digfile; | ||||
| 467 | 1 | 4 | last; | ||||
| 468 | } else { | ||||||
| 469 | 0 | 0 | die "Could'nt process digest filename to identify target digest: $!"; | ||||
| 470 | } | ||||||
| 471 | } | ||||||
| 472 | 2 | 50 | 12 | if (defined $hit) { | |||
| 473 | 2 | 2855 | return $hit; | ||||
| 474 | } else { | ||||||
| 475 | 0 | 0 | print STDERR "No ${$config_out_ref}{'title'} digest numbered $dig_number could be found in directory\n"; | ||||
| 0 | 0 | ||||||
| 476 | 0 | 0 | print STDERR " ${$config_out_ref}{'dir_digest'}\n"; | ||||
| 0 | 0 | ||||||
| 477 | 0 | 0 | exit 0; | ||||
| 478 | } | ||||||
| 479 | } | ||||||
| 480 | |||||||
| 481 | sub _get_log_data { | ||||||
| 482 | 2 | 2 | 6 | my ($config_out_ref, $choice, $in_out_ref) = @_; | |||
| 483 | 2 | 6 | my $dir_digest = ${$config_out_ref}{'dir_digest'}; | ||||
| 2 | 6 | ||||||
| 484 | 2 | 4 | my $dir_threads = ${$config_out_ref}{'dir_threads'}; | ||||
| 2 | 9 | ||||||
| 485 | 2 | 4 | my $logfile = ${$config_out_ref}{'digests_log'}; | ||||
| 2 | 7 | ||||||
| 486 | 2 | 9 | my $readfile = defined ${$config_out_ref}{'digests_read'} # new in 1.95 | ||||
| 2 | 6 | ||||||
| 487 | 2 | 50 | 4 | ? ${$config_out_ref}{'digests_read'} | |||
| 488 | : "$dir_digest/digests_read.txt"; | ||||||
| 489 | |||||||
| 490 | # hash which pulls in data from an external log file that | ||||||
| 491 | # records which digests have been previously processed | ||||||
| 492 | 2 | 6 | my (%hashlog); | ||||
| 493 | 2 | 50 | 205 | open(LOG, $logfile) || die "cannot open $logfile for reading: $!"; | |||
| 494 | 2 | 66 | while ( | ||||
| 495 | 0 | 0 | chomp; | ||||
| 496 | 0 | 0 | my @entrydata = split(/;/); | ||||
| 497 | 0 | 0 | $hashlog{$entrydata[0]} = [ @entrydata[1..$#entrydata] ]; | ||||
| 498 | } | ||||||
| 499 | 2 | 50 | 30 | close(LOG) || die "cannot close $logfile: $!"; | |||
| 500 | |||||||
| 501 | 2 | 13 | foreach ( sort keys %$in_out_ref ) { | ||||
| 502 | # if this is 1st time this digest has been seen for processing ... | ||||||
| 503 | 5 | 50 | 15 | if (! exists $hashlog{$_}) { | |||
| 504 | 5 | 131 | $hashlog{$_}[1] = $hashlog{$_}[0] = scalar localtime; | ||||
| 505 | |||||||
| 506 | # if this digest has been seen for processing already ... | ||||||
| 507 | } else { | ||||||
| 508 | |||||||
| 509 | # either we're going to re-process every digest ... | ||||||
| 510 | 0 | 0 | 0 | if ($choice eq 'ALL') { | |||
| 511 | 0 | 0 | 0 | chdir($dir_threads) || die "cannot chdir to $dir_threads $!"; | |||
| 512 | 0 | 0 | my ($thrfile); | ||||
| 513 | 0 | 0 | 0 | opendir(THREADS, $dir_threads) || die "no $dir_threads?: $!"; | |||
| 514 | 0 | 0 | while ($thrfile = readdir(THREADS) ) { | ||||
| 515 | 0 | 0 | 0 | next unless $thrfile =~ /\.thr\.txt$/; | |||
| 516 | 0 | 0 | 0 | unlink $thrfile || warn "having trouble deleting $thrfile: $!"; | |||
| 517 | } | ||||||
| 518 | 0 | 0 | 0 | closedir(THREADS) or die "Couldn't close $dir_threads: $!"; | |||
| 519 | 0 | 0 | 0 | chdir($dir_digest) || die "cannot chdir to $dir_digest $!"; | |||
| 520 | 0 | 0 | $hashlog{$_}[1] = scalar localtime; | ||||
| 521 | |||||||
| 522 | # or we're only going to process new digest files | ||||||
| 523 | } else { | ||||||
| 524 | 0 | 0 | delete ${$in_out_ref}{$_}; | ||||
| 0 | 0 | ||||||
| 525 | } | ||||||
| 526 | } | ||||||
| 527 | } | ||||||
| 528 | 2 | 11 | _update_digests_log(\%hashlog, $logfile); | ||||
| 529 | 2 | 19 | _update_digests_read( | ||||
| 530 | 2 | 11 | ${$config_out_ref}{'title'}, | ||||
| 531 | \%hashlog, | ||||||
| 532 | $readfile, # new in v1.95 | ||||||
| 533 | 2 | 50 | 3 | ) if ${$config_out_ref}{'digests_read_flag'}; | |||
| 534 | 2 | 10 | return ($in_out_ref); | ||||
| 535 | } | ||||||
| 536 | |||||||
| 537 | sub _update_digests_log { # must be supplied with ref to %hashlog | ||||||
| 538 | 2 | 2 | 5 | my ($hashlog_ref, $logfile) = @_; | |||
| 539 | 2 | 5 | my ($logstring); | ||||
| 540 | 2 | 11 | foreach ( sort keys %$hashlog_ref ) { | ||||
| 541 | # $logstring .= $_ . ';' . ${%$hashlog_ref}{$_}[0] . ';' . | ||||||
| 542 | # ${%$hashlog_ref}{$_}[1]. "\n"; | ||||||
| 543 | 5 | 299 | $logstring .= $_ . ';' . ${$hashlog_ref}{$_}[0] . ';' . | ||||
| 5 | 19 | ||||||
| 544 | 5 | 9 | ${$hashlog_ref}{$_}[1]. "\n"; | ||||
| 545 | } | ||||||
| 546 | 2 | 50 | 156 | open(LOG, ">$logfile") || die "cannot open $logfile for writing: $!"; | |||
| 547 | 2 | 25 | print LOG $logstring; | ||||
| 548 | 2 | 50 | 81 | close(LOG) || die "cannot close $logfile: $!"; | |||
| 549 | } | ||||||
| 550 | |||||||
| 551 | sub _update_digests_read { # must be supplied with $title and ref to %hashlog | ||||||
| 552 | 2 | 2 | 7 | my ($title, $hashlog_ref, $readfile) = @_; | |||
| 553 | 2 | 10 | my $readstring = ''; | ||||
| 554 | 2 | 9 | $readstring .= "$title Digest\n"; | ||||
| 555 | 2 | 12 | foreach ( sort keys %$hashlog_ref ) { | ||||
| 556 | 5 | 11 | $readstring .= "\n$_:\n"; | ||||
| 557 | # $readstring .= " first processed at ${%$hashlog_ref}{$_}[0]\n"; | ||||||
| 558 | # $readstring .= " most recently processed at ${%$hashlog_ref}{$_}[1]\n"; | ||||||
| 559 | 5 | 8 | $readstring .= " first processed at ${$hashlog_ref}{$_}[0]\n"; | ||||
| 5 | 17 | ||||||
| 560 | 5 | 7 | $readstring .= " most recently processed at ${$hashlog_ref}{$_}[1]\n"; | ||||
| 5 | 17 | ||||||
| 561 | } | ||||||
| 562 | 2 | 50 | 220 | open(READ, ">$readfile") || die "cannot open $readfile for writing: $!"; | |||
| 563 | 2 | 9 | print READ $readstring; | ||||
| 564 | 2 | 50 | 66 | close(READ) || die "can't close $readfile:$!"; | |||
| 565 | } | ||||||
| 566 | |||||||
| 567 | sub _strip_down { | ||||||
| 568 | 2 | 2 | 6 | my ($in_out_ref, $config_in_ref, $config_out_ref, $recentref) = @_; | |||
| 569 | 2 | 5 | my $MIME_cleanup_flag = ${$config_in_ref}{'MIME_cleanup_flag'}; | ||||
| 2 | 5 | ||||||
| 570 | 2 | 4 | my $topics_intro = ${$config_in_ref}{'topics_intro'}; | ||||
| 2 | 7 | ||||||
| 571 | 2 | 5 | my $post_topics_delimiter = ${$config_in_ref}{'post_topics_delimiter'}; | ||||
| 2 | 6 | ||||||
| 572 | 2 | 4 | my $source_msg_delimiter = ${$config_in_ref}{'source_msg_delimiter'}; | ||||
| 2 | 7 | ||||||
| 573 | 0 | 0 | my $subject_constant = ${$config_in_ref}{'subject_constant'} | ||||
| 2 | 11 | ||||||
| 574 | 2 | 50 | 51 | if (defined ${$config_in_ref}{'subject_constant'}); | |||
| 575 | 2 | 3 | my $archive_kill_trigger = ${$config_out_ref}{'archive_kill_trigger'}; | ||||
| 2 | 5 | ||||||
| 576 | 2 | 5 | my $dir_digest = ${$config_out_ref}{'dir_digest'}; | ||||
| 2 | 4 | ||||||
| 577 | 2 | 5 | my $dir_threads = ${$config_out_ref}{'dir_threads'}; | ||||
| 2 | 6 | ||||||
| 578 | 2 | 3 | my $thread_msg_delimiter = ${$config_out_ref}{'thread_msg_delimiter'}; | ||||
| 2 | 5 | ||||||
| 579 | 0 | 0 | my $optional_fields_ref = ${$config_out_ref}{'optional_fields'} | ||||
| 2 | 9 | ||||||
| 580 | 2 | 50 | 4 | if (defined ${$config_out_ref}{'optional_fields'}); | |||
| 581 | 2 | 5 | my $MIME_cleanup_log_flag = ${$config_out_ref}{'MIME_cleanup_log_flag'} | ||||
| 2 | 9 | ||||||
| 582 | 2 | 50 | 5 | if (defined ${$config_out_ref}{'MIME_cleanup_log_flag'}); | |||
| 583 | |||||||
| 584 | 2 | 4 | my (%recent, $mimelog, %optional_fields); | ||||
| 585 | 2 | 50 | 15 | %recent = defined $recentref ? %$recentref : (); | |||
| 586 | 2 | 50 | 9 | if (defined $optional_fields_ref) { | |||
| 587 | 0 | 0 | my $i = 0; | ||||
| 588 | 0 | 0 | foreach my $opt (@{$optional_fields_ref}) { | ||||
| 0 | 0 | ||||||
| 589 | 0 | 0 | my $longkey = $opt . '_style_flag'; | ||||
| 590 | 0 | 0 | 0 | if (defined ${$config_in_ref}{$longkey}) { | |||
| 0 | 0 | ||||||
| 591 | 0 | 0 | 0 | next unless (${$config_in_ref}{$longkey} =~ /\^(.*?):/); | |||
| 0 | 0 | ||||||
| 592 | 0 | 0 | $optional_fields{$i} = [ $opt, $1 ]; | ||||
| 593 | 0 | 0 | $i++; | ||||
| 594 | } else { | ||||||
| 595 | 0 | 0 | warn "WARNING:\n '$opt' is not available as a header field for digest ${$config_out_ref}{'title'}\n"; | ||||
| 0 | 0 | ||||||
| 596 | } | ||||||
| 597 | } | ||||||
| 598 | } | ||||||
| 599 | |||||||
| 600 | # Analysis of source message delimiter: | ||||||
| 601 | 2 | 12 | my $delimiter_core = | ||||
| 602 | substr( $source_msg_delimiter, 0, index($source_msg_delimiter, "\n") ); | ||||||
| 603 | |||||||
| 604 | 2 | 6 | my $message_count = 0; | ||||
| 605 | 2 | 5 | my %seen = (); | ||||
| 606 | 2 | 4 | my $seen_ref = \%seen; | ||||
| 607 | 2 | 3 | my ($output_ref); | ||||
| 608 | 2 | 100 | 8 | if ($MIME_cleanup_flag) { | |||
| 609 | 1 | 4 | $mimelog = defined ${$config_out_ref}{'mimelog'} # v1.96 | ||||
| 1 | 2 | ||||||
| 610 | 1 | 50 | 1 | ? ${$config_out_ref}{'mimelog'} | |||
| 611 | 0 | 0 | : "${$config_out_ref}{'dir_digest'}/mimelog.txt"; | ||||
| 612 | 1 | 50 | 4 | if ($MIME_cleanup_log_flag) { | |||
| 613 | 1 | 50 | 72 | open MIME, ">$mimelog" or die "Couldn't open $mimelog for writing: $!"; | |||
| 614 | 1 | 25 | print MIME < | ||||
| 615 | Processed Problem | ||||||
| 616 | |||||||
| 617 | MIMELOG | ||||||
| 618 | } | ||||||
| 619 | } | ||||||
| 620 | 2 | 50 | 94 | chdir($dir_digest) || die "cannot chdir to $dir_digest $!"; | |||
| 621 | 2 | 58 | foreach my $digest_no ( sort keys %$in_out_ref ) { | ||||
| 622 | 5 | 11 | my (@newfile, %messages_sorted_by_thread); | ||||
| 623 | 5 | 8 | my $file = ${$in_out_ref}{$digest_no}[0]; | ||||
| 5 | 18 | ||||||
| 624 | 5 | 13 | my ($bigstr, $digest_head, $digest_bal, @digest_header, @digest_balance); | ||||
| 625 | 5 | 50 | 413 | open(IN, $file) || die "cannot open $file for reading: $!"; | |||
| 626 | { | ||||||
| 627 | 5 | 18 | local $/ = undef; | ||||
| 5 | 31 | ||||||
| 628 | 5 | 285 | $bigstr = | ||||
| 629 | } | ||||||
| 630 | 5 | 50 | 75 | close (IN) || die "can't close $file:$!"; | |||
| 631 | |||||||
| 632 | 5 | 50 | 366 | if ($bigstr =~ /(.*?)$post_topics_delimiter(.*)/s) { | |||
| 633 | 5 | 24 | $digest_head = $1; | ||||
| 634 | 5 | 71 | $digest_bal = $2; | ||||
| 635 | } else { | ||||||
| 636 | 0 | 0 | die "Couldn't extract: $!"; | ||||
| 637 | } | ||||||
| 638 | |||||||
| 639 | 5 | 91 | @digest_header = split(/\n/, $digest_head); | ||||
| 640 | |||||||
| 641 | 5 | 309 | @digest_balance = split(/$source_msg_delimiter/, $digest_bal); | ||||
| 642 | 5 | 15 | pop @digest_balance; | ||||
| 643 | 5 | 14 | $message_count += scalar(@digest_balance); | ||||
| 644 | |||||||
| 645 | # extract topics listing | ||||||
| 646 | 5 | 34 | $in_out_ref = _prepare_todays_topics( | ||||
| 647 | \@digest_header, | ||||||
| 648 | $topics_intro, | ||||||
| 649 | $delimiter_core, | ||||||
| 650 | $in_out_ref, | ||||||
| 651 | $digest_no, | ||||||
| 652 | ); | ||||||
| 653 | |||||||
| 654 | # process each message in a digest file | ||||||
| 655 | 5 | 20 | foreach my $el (@digest_balance) { | ||||
| 656 | # analyze message's header | ||||||
| 657 | 43 | 97 | my $header_ref = _analyze_message_header( | ||||
| 658 | $el, $config_in_ref, $config_out_ref | ||||||
| 659 | ); | ||||||
| 660 | # clean up message's title to eliminate characters | ||||||
| 661 | # forbidden as filenames on this system | ||||||
| 662 | 43 | 823 | my $thread = _clean_up_thread_title( | ||||
| 663 | 43 | 224 | ${$header_ref}{'subject'}, $subject_constant); | ||||
| 664 | 43 | 90 | my $full_id = $digest_no . '_' . ${$header_ref}{'message_no'}; | ||||
| 43 | 129 | ||||||
| 665 | 43 | 130 | my $thread_full_id = lc($thread . $full_id); | ||||
| 666 | |||||||
| 667 | # clean up message's text to eliminate MIME multiparts | ||||||
| 668 | 43 | 171 | my $text = _analyze_message_body( | ||||
| 669 | $el, $MIME_cleanup_flag, $full_id, $MIME_cleanup_log_flag); | ||||||
| 670 | |||||||
| 671 | |||||||
| 672 | # add info to hash from which output will be generated | ||||||
| 673 | 43 | 691 | $messages_sorted_by_thread{$thread_full_id} = [ | ||||
| 674 | $thread, | ||||||
| 675 | $full_id, | ||||||
| 676 | $header_ref, | ||||||
| 677 | $text, | ||||||
| 678 | ]; | ||||||
| 679 | } | ||||||
| 680 | # prepare output for this digest file | ||||||
| 681 | 5 | 63 | foreach ( sort keys %messages_sorted_by_thread ) { | ||||
| 682 | 43 | 150 | ($seen_ref, $output_ref) = _prepare_output_string( | ||||
| 683 | \%messages_sorted_by_thread, | ||||||
| 684 | $seen_ref, | ||||||
| 685 | $dir_threads, | ||||||
| 686 | $thread_msg_delimiter, | ||||||
| 687 | $output_ref, | ||||||
| 688 | \%optional_fields, # new in v1.67 | ||||||
| 689 | ); | ||||||
| 690 | } | ||||||
| 691 | } | ||||||
| 692 | 2 | 100 | 13 | if ($MIME_cleanup_log_flag) { | |||
| 693 | 1 | 50 | 106 | close MIME, ">$mimelog" or die "Couldn't close $mimelog after writing: $!"; | |||
| 694 | } | ||||||
| 695 | |||||||
| 696 | # If I am not archiving a particular digest, then I would never be calling a | ||||||
| 697 | # thread file for that digest back from the archive. | ||||||
| 698 | # Hence, I can simply append. | ||||||
| 699 | 2 | 50 | 50 | 39 | if ($archive_kill_trigger == 0 or $archive_kill_trigger = -1) { | ||
| 0 | |||||||
| 700 | 2 | 6 | foreach (keys %{$output_ref}) { | ||||
| 2 | 11 | ||||||
| 701 | 11 | 50 | 2364 | open(NOARCH, ">>$_") || die "cannot open $_ for appending: $!"; | |||
| 702 | 11 | 29 | print NOARCH ${$output_ref}{$_}; | ||||
| 11 | 82 | ||||||
| 703 | 11 | 50 | 748 | close(NOARCH) || die "can't close $_: $!"; | |||
| 704 | } | ||||||
| 705 | } elsif ($archive_kill_trigger == 1) { | ||||||
| 706 | 0 | 0 | my $fromarchive = 0; | ||||
| 707 | 0 | 0 | my $dearchfile = defined ${$config_out_ref}{'de_archived_today'} | ||||
| 0 | 0 | ||||||
| 708 | 0 | 0 | 0 | ? ${$config_out_ref}{'de_archived_today'} | |||
| 709 | 0 | 0 | : "${$config_out_ref}{'dir_digest'}/de_archived_today.txt"; | ||||
| 710 | 0 | 0 | my $dir_archive_top = ${$config_out_ref}{'dir_archive_top'}; | ||||
| 0 | 0 | ||||||
| 711 | 0 | 0 | my ($dearchstr); | ||||
| 712 | 0 | 0 | 0 | open DEARCH, ">$dearchfile" | |||
| 713 | or die "Couldn't open $dearchfile for writing: $!"; | ||||||
| 714 | 0 | 0 | print DEARCH 'De-archived today (', scalar(localtime), "):\n"; | ||||
| 715 | 0 | 0 | print DEARCH '-' x 44, "\n"; | ||||
| 716 | |||||||
| 717 | # 1st: See if recent thread exists; if so, open for appending | ||||||
| 718 | # 2nd: See if archive thread exists; | ||||||
| 719 | # if so, move from archive to current and open for appending | ||||||
| 720 | # [of course, if a thread has not been active for 14 days, | ||||||
| 721 | # we may wish to treat a message | ||||||
| 722 | # with the same name as a temporarily new thread and only append it | ||||||
| 723 | # when archiving once it's stale ] | ||||||
| 724 | # 3rd: If no recent/archive thread can be found, open new file for writing | ||||||
| 725 | |||||||
| 726 | 0 | 0 | foreach (keys %{$output_ref}) { | ||||
| 0 | 0 | ||||||
| 727 | 0 | 0 | my ($stub); | ||||
| 728 | 0 | 0 | 0 | if ($_ =~ m|[/\\]([^/\\]*)$|) { | |||
| 729 | 0 | 0 | $stub = $1; | ||||
| 730 | } else { | ||||||
| 731 | 0 | 0 | die "Couldn't extract stub from $_: $!"; | ||||
| 732 | } | ||||||
| 733 | 0 | 0 | 0 | if ($recent{$stub}) { | |||
| 734 | 0 | 0 | 0 | open(OUT2, ">>$_") || die "cannot open $_ for appending: $!"; | |||
| 735 | } else { | ||||||
| 736 | 0 | 0 | my ($initial, $newstub); | ||||
| 737 | 0 | 0 | $initial = lc(substr $stub, 0, 1); | ||||
| 738 | 0 | 0 | $newstub = "$dir_threads/$stub"; | ||||
| 739 | 0 | 0 | 0 | 0 | if ( ($initial =~ /[a-zA-Z]/) and | ||
| 0 | |||||||
| 740 | (-f "$dir_archive_top/$initial/$stub") ) { | ||||||
| 741 | 0 | 0 | 0 | rename("$dir_archive_top/$initial/$stub", $newstub ) or | |||
| 742 | die "Couldn't de-archive $stub: $!"; | ||||||
| 743 | 0 | 0 | print "De-archiving: $stub\n"; | ||||
| 744 | 0 | 0 | $dearchstr .= $stub . "\n"; | ||||
| 745 | 0 | 0 | $fromarchive++; | ||||
| 746 | 0 | 0 | 0 | open(OUT2, ">>$newstub") || | |||
| 747 | die "cannot open $newstub for appending: $!"; | ||||||
| 748 | } elsif (-f "$dir_archive_top/other/$stub") { | ||||||
| 749 | 0 | 0 | 0 | rename("$dir_archive_top/other/$stub", $newstub ) or | |||
| 750 | die "Couldn't de-archive $stub: $!"; | ||||||
| 751 | 0 | 0 | print "De-archiving: $stub\n"; | ||||
| 752 | 0 | 0 | $dearchstr .= $stub . "\n"; | ||||
| 753 | 0 | 0 | $fromarchive++; | ||||
| 754 | 0 | 0 | 0 | open(OUT2, ">>$newstub") || | |||
| 755 | die "cannot open $newstub for appending: $!"; | ||||||
| 756 | } else { | ||||||
| 757 | 0 | 0 | 0 | open(OUT2, ">$_") || die "cannot open $_ for writing: $!"; | |||
| 758 | } | ||||||
| 759 | } | ||||||
| 760 | 0 | 0 | print OUT2 ${$output_ref}{$_}; | ||||
| 0 | 0 | ||||||
| 761 | 0 | 0 | 0 | close(OUT2) || die "can't close $_: $!"; | |||
| 762 | } | ||||||
| 763 | 0 | 0 | print "$fromarchive files de-archived\n"; | ||||
| 764 | 0 | 0 | 0 | $fromarchive ? print DEARCH $dearchstr : print DEARCH "[None.]\n"; | |||
| 765 | 0 | 0 | 0 | close DEARCH or die "Couldn't close $dearchfile after writing: $!"; | |||
| 766 | } else { | ||||||
| 767 | 0 | 0 | die "Bad value for archive/kill trigger: $!"; | ||||
| 768 | } | ||||||
| 769 | 2 | 7 | return ($in_out_ref, $message_count, scalar(keys %{$seen_ref})); | ||||
| 2 | 91 | ||||||
| 770 | } | ||||||
| 771 | |||||||
| 772 | sub _strip_down_for_reply { | ||||||
| 773 | 2 | 2 | 20 | my ($config_in_ref, $config_out_ref, | |||
| 774 | $digest_verified, $dig_entry, $dir_for_reply) = @_; | ||||||
| 775 | 2 | 5 | my $MIME_cleanup_flag = ${$config_in_ref}{'MIME_cleanup_flag'}; | ||||
| 2 | 12 | ||||||
| 776 | 2 | 3 | my $post_topics_delimiter = ${$config_in_ref}{'post_topics_delimiter'}; | ||||
| 2 | 7 | ||||||
| 777 | 2 | 4 | my $source_msg_delimiter = ${$config_in_ref}{'source_msg_delimiter'}; | ||||
| 2 | 7 | ||||||
| 778 | 0 | 0 | my $subject_constant = ${$config_in_ref}{'subject_constant'} | ||||
| 2 | 20 | ||||||
| 779 | 2 | 50 | 12 | if (defined ${$config_in_ref}{'subject_constant'}); | |||
| 780 | 2 | 5 | my $dir_digest = ${$config_out_ref}{'dir_digest'}; | ||||
| 2 | 7 | ||||||
| 781 | |||||||
| 782 | 2 | 50 | 74 | chdir($dir_digest) || die "cannot chdir to $dir_digest $!"; | |||
| 783 | |||||||
| 784 | # slurp the digest file in, splitting on message delimiters | ||||||
| 785 | # so that each message is an array element | ||||||
| 786 | 2 | 5 | my ($bigstr, $digest_head, $digest_bal, @digest_header, @digest_balance); | ||||
| 787 | 2 | 50 | 393 | open(IN, $digest_verified) || | |||
| 788 | die "cannot open $digest_verified for reading: $!"; | ||||||
| 789 | { | ||||||
| 790 | 2 | 6 | local $/ = undef; | ||||
| 2 | 16 | ||||||
| 791 | 2 | 350 | $bigstr = | ||||
| 792 | } | ||||||
| 793 | 2 | 50 | 46 | close (IN) || die "can't close $digest_verified:$!"; | |||
| 794 | |||||||
| 795 | 2 | 50 | 110 | if ($bigstr =~ /(.*?)$post_topics_delimiter(.*)/s) { | |||
| 796 | 2 | 12 | $digest_head = $1; | ||||
| 797 | 2 | 48 | $digest_bal = $2; | ||||
| 798 | } else { | ||||||
| 799 | 0 | 0 | die "Couldn't extract: $!"; | ||||
| 800 | } | ||||||
| 801 | |||||||
| 802 | 2 | 167 | @digest_balance = split(/$source_msg_delimiter/, $digest_bal); | ||||
| 803 | 2 | 8 | pop @digest_balance; | ||||
| 804 | |||||||
| 805 | 2 | 10 | my ($el, $replyfile); | ||||
| 806 | 2 | 12 | while (defined ($el = shift @digest_balance)) { | ||||
| 807 | # analyze message's header | ||||||
| 808 | 17 | 44 | my $header_ref = | ||||
| 809 | _analyze_message_header($el, $config_in_ref, $config_out_ref); # v1.94 | ||||||
| 810 | 17 | 100 | 3166 | next unless (${$header_ref}{'message_no'} == $dig_entry); | |||
| 17 | 128 | ||||||
| 811 | |||||||
| 812 | # clean up message's title to eliminate characters | ||||||
| 813 | # forbidden as filenames on this system | ||||||
| 814 | 2 | 12 | my $thread = _clean_up_thread_title( | ||||
| 815 | 2 | 51 | ${$header_ref}{'subject'}, $subject_constant); | ||||
| 816 | 2 | 66 | $replyfile = "$dir_for_reply/${thread}.reply.txt"; | ||||
| 817 | |||||||
| 818 | # clean up message's text to eliminate MIME multiparts | ||||||
| 819 | 2 | 11 | my $text = _analyze_message_body($el, $MIME_cleanup_flag, undef, 0); | ||||
| 820 | 2 | 22 | my @lines = split(/\n/, $text); | ||||
| 821 | 2 | 4 | my ($replytext); | ||||
| 822 | 2 | 21 | foreach my $l (@lines) { | ||||
| 823 | 29 | 30 | chomp($l); | ||||
| 824 | 29 | 51 | $replytext .= '> ' . $l . "\n"; | ||||
| 825 | } | ||||||
| 826 | |||||||
| 827 | # print reply | ||||||
| 828 | 2 | 48 | my $old_fh = select(REPLY); | ||||
| 829 | 2 | 50 | 297 | open REPLY, ">$replyfile" or die "Couldn't open $replyfile: $!"; | |||
| 830 | 2 | 50 | 4 | if (defined ${$header_ref}{'reply_to'}) { | |||
| 2 | 100 | 11 | |||||
| 2 | 11 | ||||||
| 831 | 0 | 0 | print "Reply-To:\n"; | ||||
| 832 | 0 | 0 | print "${$header_ref}{'reply_to'}\n\n"; | ||||
| 0 | 0 | ||||||
| 833 | } elsif (defined ${$header_ref}{'to'}) { | ||||||
| 834 | 1 | 9 | print "To:\n"; | ||||
| 835 | 1 | 2 | print "${$header_ref}{'to'}\n\n"; | ||||
| 1 | 4 | ||||||
| 836 | } | ||||||
| 837 | 2 | 50 | 3 | if (defined ${$header_ref}{'subject'}) { | |||
| 2 | 10 | ||||||
| 838 | 2 | 5 | my ($subject_clean); | ||||
| 839 | 2 | 50 | 3 | if (${$header_ref}{'subject'} =~ | |||
| 2 | 18 | ||||||
| 840 | /^(?:(Re2?|RE2?|re2?|FWD?|Fwd?|AW):?\s+)*(.*)$/) { | ||||||
| 841 | 2 | 128 | $subject_clean = $2; | ||||
| 842 | } else { | ||||||
| 843 | 0 | 0 | $subject_clean = ${$header_ref}{'subject'}; | ||||
| 0 | 0 | ||||||
| 844 | } | ||||||
| 845 | 2 | 8 | print "Subject:\n"; | ||||
| 846 | 2 | 6 | print "$subject_clean\n\n"; | ||||
| 847 | } | ||||||
| 848 | 2 | 4 | print "On ${$header_ref}{'date'}, ${$header_ref}{'from'} wrote:\n\n"; | ||||
| 2 | 7 | ||||||
| 2 | 6 | ||||||
| 849 | 2 | 5 | print $replytext; | ||||
| 850 | 2 | 3 | print "\n"; | ||||
| 851 | 2 | 50 | 106622 | close REPLY or die "Couldn't close $replyfile: $!"; | |||
| 852 | 2 | 23 | select $old_fh; | ||||
| 853 | 2 | 22 | last; | ||||
| 854 | } | ||||||
| 855 | 2 | 18 | return $replyfile; | ||||
| 856 | } | ||||||
| 857 | |||||||
| 858 | sub _prepare_todays_topics { | ||||||
| 859 | 5 | 5 | 15 | my ($digest_header_ref, $topics_intro, | |||
| 860 | $delimiter_core, $in_out_ref, $digest_no) = @_; | ||||||
| 861 | 5 | 10 | my $counter = 0; | ||||
| 862 | 5 | 17 | my @todays_topics = (); # empty out @todays_topics | ||||
| 863 | 5 | 9 | foreach ( @{$digest_header_ref} ) { | ||||
| 5 | 19 | ||||||
| 864 | 138 | 100 | 1257 | if (m/^$topics_intro/) { # digest-specific | |||
| 865 | 5 | 7 | $counter = 1; | ||||
| 866 | } | ||||||
| 867 | 138 | 100 | 261 | if ($counter == 1) { | |||
| 868 | 79 | 100 | 1058 | if (m/^$topics_intro|^$/) { next; } # digest-specific | |||
| 10 | 50 | 27 | |||||
| 869 | elsif ($_ !~ m/$delimiter_core/) | ||||||
| 870 | 69 | 283 | { push (@todays_topics, $_); } | ||||
| 871 | 0 | 0 | else { last; } | ||||
| 872 | } | ||||||
| 873 | # ${%$in_out_ref}{$digest_no}[1] = [ @todays_topics ]; | ||||||
| 874 | # # Note: this is 1st point at which ${%$in_out_ref}{$digest_no}[1] | ||||||
| 875 | 128 | 410 | ${$in_out_ref}{$digest_no}[1] = [ @todays_topics ]; | ||||
| 128 | 767 | ||||||
| 876 | # Note: this is 1st point at which ${$in_out_ref}{$digest_no}[1] | ||||||
| 877 | # gets meaningful content | ||||||
| 878 | } | ||||||
| 879 | 5 | 124 | return $in_out_ref; | ||||
| 880 | } | ||||||
| 881 | |||||||
| 882 | sub _analyze_message_header { | ||||||
| 883 | 60 | 60 | 106 | my ($el, $config_in_ref, $config_out_ref) = @_; # v1.94 | |||
| 884 | 60 | 848 | my @all = split(/\n/, $el); | ||||
| 885 | 60 | 227 | my ($hl, @lines); | ||||
| 886 | 60 | 160 | while (defined ($hl = shift(@all)) ) { | ||||
| 887 | 371 | 100 | 5197 | last if $hl =~ /^\s*$/; | |||
| 888 | 311 | 1359 | push(@lines, $hl); | ||||
| 889 | } | ||||||
| 890 | 60 | 69 | my (%header, %init, $last_analyzed); | ||||
| 891 | 60 | 66 | foreach my $key (keys %{$config_in_ref}) { | ||||
| 60 | 276 | ||||||
| 892 | 840 | 100 | 4751 | next unless ($key =~ /_style_flag$/); | |||
| 893 | 480 | 2515 | my ($shortkey); | ||||
| 894 | 480 | 50 | 1905 | if ($key =~ /(.*)_style_flag$/) { | |||
| 895 | 480 | 854 | $shortkey = $1; | ||||
| 896 | } else { | ||||||
| 897 | 0 | 0 | warn "Problem in analyzing message header: $!"; | ||||
| 898 | } | ||||||
| 899 | 480 | 100 | 537 | $init{$shortkey}++ unless defined ${$config_in_ref}{$key}; | |||
| 480 | 1915 | ||||||
| 900 | } | ||||||
| 901 | 60 | 697 | foreach (@lines) { | ||||
| 902 | 311 | 575 | chomp; | ||||
| 903 | 311 | 624 | my ($matched); | ||||
| 904 | 311 | 100 | 1224 | unless ($init{'message'}) { | |||
| 905 | 60 | 50 | 64 | if (/${$config_in_ref}{'message_style_flag'}/) { | |||
| 60 | 566 | ||||||
| 906 | 60 | 6434 | $header{'message_no'} = | ||||
| 907 | 60 | 66 | eval(${$config_out_ref}{'output_id_format'}); | ||||
| 908 | 60 | 267 | $init{'message'}++; | ||||
| 909 | 60 | 82 | $last_analyzed = 'message'; | ||||
| 910 | 60 | 88 | $matched++; | ||||
| 911 | } | ||||||
| 912 | } | ||||||
| 913 | 311 | 100 | 724 | unless ($init{'from'}) { | |||
| 914 | 173 | 100 | 268 | if (/${$config_in_ref}{'from_style_flag'}/) { | |||
| 173 | 1084 | ||||||
| 915 | 60 | 231 | $header{'from'} = $1; | ||||
| 916 | 60 | 94 | $init{'from'}++; | ||||
| 917 | 60 | 71 | $last_analyzed = 'from'; | ||||
| 918 | 60 | 223 | $matched++; | ||||
| 919 | } | ||||||
| 920 | } | ||||||
| 921 | 311 | 100 | 789 | unless ($init{'subject'}) { | |||
| 922 | 260 | 100 | 323 | if (/${$config_in_ref}{'subject_style_flag'}/) { | |||
| 260 | 1190 | ||||||
| 923 | 60 | 234 | $header{'subject'} = $1; | ||||
| 924 | 60 | 246 | $init{'subject'}++; | ||||
| 925 | 60 | 71 | $last_analyzed = 'subject'; | ||||
| 926 | 60 | 77 | $matched++; | ||||
| 927 | } | ||||||
| 928 | } | ||||||
| 929 | 311 | 100 | 18910 | unless ($init{'to'}) { | |||
| 930 | 125 | 100 | 2369 | if (/${$config_in_ref}{'to_style_flag'}/) { | |||
| 125 | 425 | ||||||
| 931 | 31 | 86 | $header{'to'} = $1; | ||||
| 932 | 31 | 43 | $init{'to'}++; | ||||
| 933 | 31 | 47 | $last_analyzed = 'to'; | ||||
| 934 | 31 | 34 | $matched++; | ||||
| 935 | } | ||||||
| 936 | } | ||||||
| 937 | 311 | 100 | 719 | unless ($init{'reply_to'}) { | |||
| 938 | 163 | 100 | 155 | if (/${$config_in_ref}{'reply_to_style_flag'}/) { | |||
| 163 | 491 | ||||||
| 939 | 8 | 23 | $header{'reply_to'} = $1; | ||||
| 940 | 8 | 14 | $init{'reply_to'}++; | ||||
| 941 | 8 | 10 | $last_analyzed = 'reply_to'; | ||||
| 942 | 8 | 10 | $matched++; | ||||
| 943 | } | ||||||
| 944 | } | ||||||
| 945 | 311 | 100 | 711 | unless ($init{'cc'}) { | |||
| 946 | 169 | 100 | 154 | if (/${$config_in_ref}{'cc_style_flag'}/i) { | |||
| 169 | 573 | ||||||
| 947 | 17 | 58 | $header{'cc'} = $1; | ||||
| 948 | 17 | 24 | $init{'cc'}++; | ||||
| 949 | 17 | 26 | $last_analyzed = 'cc'; | ||||
| 950 | 17 | 25 | $matched++; | ||||
| 951 | } | ||||||
| 952 | } | ||||||
| 953 | 311 | 100 | 2106 | unless ($init{'date'}) { | |||
| 954 | 195 | 100 | 577 | if (/${$config_in_ref}{'date_style_flag'}/) { | |||
| 195 | 905 | ||||||
| 955 | 60 | 204 | $header{'date'} = $1; | ||||
| 956 | 60 | 99 | $init{'date'}++; | ||||
| 957 | 60 | 73 | $last_analyzed = 'date'; | ||||
| 958 | 60 | 143 | $matched++; | ||||
| 959 | } | ||||||
| 960 | } | ||||||
| 961 | 311 | 100 | 641 | unless ($init{'org'}) { | |||
| 962 | 195 | 50 | 175 | if (/${$config_in_ref}{'org_style_flag'}/) { | |||
| 195 | 543 | ||||||
| 963 | 0 | 0 | $header{'org'} = $1; | ||||
| 964 | 0 | 0 | $init{'org'}++; | ||||
| 965 | 0 | 0 | $last_analyzed = 'org'; | ||||
| 966 | 0 | 0 | $matched++; | ||||
| 967 | } | ||||||
| 968 | } | ||||||
| 969 | 311 | 100 | 773 | unless ($matched) { | |||
| 970 | 15 | 50 | 41 | if ($last_analyzed ne 'subject') { | |||
| 971 | 15 | 58 | $_ =~ s/^\s+//; | ||||
| 972 | 15 | 51 | $header{$last_analyzed} .= "\n" . ' ' x 14 . $_; | ||||
| 973 | } | ||||||
| 974 | } | ||||||
| 975 | } | ||||||
| 976 | 60 | 670 | return \%header; | ||||
| 977 | } | ||||||
| 978 | |||||||
| 979 | sub _clean_up_thread_title { | ||||||
| 980 | 45 | 45 | 72 | my $subj = shift; | |||
| 981 | 45 | 50 | 101 | my $subject_constant = shift if defined $_[0]; | |||
| 982 | 45 | 87 | my ($thread, @thread); | ||||
| 983 | 45 | 50 | 160 | $subj = "No subject" unless $subj; #messages on some lists can be subject-less | |||
| 984 | |||||||
| 985 | 45 | 372 | $subj =~ | ||||
| 986 | /^(?:(Re\d?|RE\d?|re\d?|Re\[\d?\]|RE\[\d?\]|re\[\d?\]|FWD?|Fwd?|AW):?\s+)*(.*)$/; | ||||||
| 987 | 45 | 128 | $thread = $2; | ||||
| 988 | 45 | 50 | 33 | 159 | if (defined $subject_constant and $thread =~ /^$subject_constant\s+(.*)/) { | ||
| 989 | 0 | 0 | $thread = $1; | ||||
| 990 | } | ||||||
| 991 | 45 | 445 | @thread = split(//, $thread); | ||||
| 992 | 45 | 50 | 568 | if ($^O eq 'MSWin32') { | |||
| 993 | 0 | 0 | $thread = join("", (grep m/[^*|\\:"<>?\/]/, @thread) ); #" | ||||
| 994 | } | ||||||
| 995 | 45 | 50 | 224 | if ($unix{$^O}) { # v2.08 | |||
| 996 | 45 | 770 | $thread = join("", (grep m/[^\/]/, @thread) ); | ||||
| 997 | } | ||||||
| 998 | # squish repeated periods anywhere in file name | ||||||
| 999 | 45 | 80 | $thread =~ tr/././s; | ||||
| 1000 | # Win32 allows periods in file names, | ||||||
| 1001 | # but I don't want any periods or spaces immediately before '.thr.txt' | ||||||
| 1002 | # or at the beginning of the file name | ||||||
| 1003 | 45 | 206 | $thread =~ s/[.\s]+$//; | ||||
| 1004 | 45 | 95 | $thread =~ s/^[.\s]+//; | ||||
| 1005 | # squish repeated whitespace anywhere in file name (EPP, Item 20, p. 76) | ||||||
| 1006 | 45 | 65 | $thread =~ tr/ \n\r\t\f/ /s; | ||||
| 1007 | 45 | 50 | 119 | $thread = '[Illegal subject]' unless $thread; | |||
| 1008 | 45 | 437 | return $thread; | ||||
| 1009 | } | ||||||
| 1010 | |||||||
| 1011 | sub _analyze_message_body { | ||||||
| 1012 | 45 | 45 | 89 | my ($el, $MIME_cleanup_flag, $postid, $MIME_cleanup_log_flag) = @_; | |||
| 1013 | 45 | 499 | my @chunks = split(/\n{2,}/, $el); | ||||
| 1014 | 45 | 100 | 256 | return join("\n\n", @chunks[1 .. ($#chunks)] ) | |||
| 1015 | unless $MIME_cleanup_flag; | ||||||
| 1016 | 18 | 23 | my (@nextparts); | ||||
| 1017 | 18 | 50 | 104 | if ($chunks[1] =~ /Content-Type:\smultipart\/alternative/o) { | |||
| 50 | |||||||
| 50 | |||||||
| 1018 | # New in v1.84 1/23/04 | ||||||
| 1019 | 0 | 0 | for (my $i=1; $i<=$#chunks; $i++) { | ||||
| 1020 | 0 | 0 | 0 | push(@nextparts, $i) if ($chunks[$i] =~ /Content-Type:/); | |||
| 1021 | } | ||||||
| 1022 | 0 | 0 | 0 | if (@nextparts == 4) { | |||
| 1023 | 0 | 0 | 0 | print MIME "$postid CASE I\n" if $MIME_cleanup_log_flag; | |||
| 1024 | 0 | 0 | splice @chunks, $nextparts[2], $nextparts[3] - $nextparts[2] + 1; | ||||
| 1025 | 0 | 0 | splice @chunks, 1, 2; | ||||
| 1026 | 0 | 0 | return join("\n\n", @chunks[1 .. ($#chunks-1)] ); | ||||
| 1027 | } else { | ||||||
| 1028 | 0 | 0 | 0 | print MIME ' ' x 30, | |||
| 1029 | "$postid; count: ", sprintf("%3d", scalar(@nextparts)), " CASE I\n" | ||||||
| 1030 | if $MIME_cleanup_log_flag; | ||||||
| 1031 | 0 | 0 | return join("\n\n", @chunks ); | ||||
| 1032 | } | ||||||
| 1033 | } elsif ($chunks[1] =~ /--Apple-Mail-/o) { # New in v1.85 1/23/04 | ||||||
| 1034 | 0 | 0 | for (my $i=1; $i<=$#chunks; $i++) { | ||||
| 1035 | 0 | 0 | 0 | push(@nextparts, $i) if ($chunks[$i] =~ /--Apple-Mail-/o); | |||
| 1036 | } | ||||||
| 1037 | 0 | 0 | 0 | 0 | if (@nextparts == 3 or @nextparts == 4) { | ||
| 1038 | 0 | 0 | 0 | print MIME "$postid CASE J\n" if $MIME_cleanup_log_flag; | |||
| 1039 | 0 | 0 | my ($fragment); | ||||
| 1040 | 0 | 0 | 0 | if (@nextparts == 4) { | |||
| 1041 | 0 | 0 | splice @chunks, $nextparts[-1], 1; | ||||
| 1042 | } | ||||||
| 1043 | 0 | 0 | 0 | if ($chunks[$nextparts[1]] =~ /(.*?)--Apple-Mail-/os) { | |||
| 1044 | 0 | 0 | $fragment = $1; | ||||
| 1045 | } | ||||||
| 1046 | 0 | 0 | splice @chunks, $nextparts[1]; | ||||
| 1047 | 0 | 0 | 0 | push @chunks, $fragment if ($fragment); | |||
| 1048 | 0 | 0 | splice @chunks, $nextparts[0], 1; | ||||
| 1049 | 0 | 0 | return join("\n\n", @chunks[1 .. $#chunks] ); | ||||
| 1050 | } else { | ||||||
| 1051 | 0 | 0 | 0 | print MIME ' ' x 30, "$postid; count: ", | |||
| 1052 | sprintf("%3d", scalar(@nextparts)), " CASE J\n" | ||||||
| 1053 | if $MIME_cleanup_log_flag; | ||||||
| 1054 | 0 | 0 | return join("\n\n", @chunks ); | ||||
| 1055 | } | ||||||
| 1056 | } elsif ($chunks[1] !~ /^This.+?message.+?MIME format/o) { | ||||||
| 1057 | 18 | 153 | return join("\n\n", @chunks[1 .. ($#chunks)] ); | ||||
| 1058 | } else { | ||||||
| 1059 | 0 | 0 | 0 | if ($chunks[1] =~ /--=_alternative/) { | |||
| 0 | |||||||
| 1060 | 0 | 0 | for (my $i=1; $i<=$#chunks; $i++) { | ||||
| 1061 | 0 | 0 | 0 | push(@nextparts, $i) if ($chunks[$i] =~ /--=_alternative/); | |||
| 1062 | } | ||||||
| 1063 | 0 | 0 | 0 | if (@nextparts == 3) { | |||
| 1064 | 0 | 0 | 0 | print MIME "$postid CASE A\n" if $MIME_cleanup_log_flag; | |||
| 1065 | 0 | 0 | splice @chunks, | ||||
| 1066 | $nextparts[1] + 1, $nextparts[2] - $nextparts[1] + 1; | ||||||
| 1067 | 0 | 0 | $nextparts[1] =~ /^(.*\n)--=_alternative/s; | ||||
| 1068 | 0 | 0 | my $fragment = $1; | ||||
| 1069 | 0 | 0 | $chunks[$nextparts[1]] = $fragment; | ||||
| 1070 | 0 | 0 | splice @chunks, 1, 2; | ||||
| 1071 | 0 | 0 | return join("\n\n", @chunks[1 .. ($#chunks-1)] ); | ||||
| 1072 | } else { | ||||||
| 1073 | 0 | 0 | 0 | print MIME ' ' x 30, "$postid; count: ", | |||
| 1074 | sprintf("%3d", scalar(@nextparts)), " CASE B\n" | ||||||
| 1075 | if $MIME_cleanup_log_flag; | ||||||
| 1076 | 0 | 0 | return join("\n\n", @chunks ); | ||||
| 1077 | } | ||||||
| 1078 | } elsif ($chunks[1] =~ /cryptographically\ssigned/) { | ||||||
| 1079 | 0 | 0 | 0 | print MIME "$postid CASE H\n" if $MIME_cleanup_log_flag; | |||
| 1080 | 0 | 0 | splice @chunks, -3, 2; | ||||
| 1081 | 0 | 0 | splice @chunks, 1, 2; | ||||
| 1082 | 0 | 0 | return join("\n\n", @chunks[1 .. ($#chunks-1)] ); | ||||
| 1083 | } else { | ||||||
| 1084 | 0 | 0 | for (my $i=2; $i<=$#chunks; $i++) { | ||||
| 1085 | 0 | 0 | 0 | push(@nextparts, $i) if ( | |||
| 1086 | $chunks[$i] =~ /-{4,6}[_\s]?=_NextPart| | ||||||
| 1087 | --Boundary_| | ||||||
| 1088 | --------------InterScan_NT_MIME_Boundary/x | ||||||
| 1089 | ); | ||||||
| 1090 | } | ||||||
| 1091 | 0 | 0 | 0 | 0 | if (@nextparts == 3) { | ||
| 0 | 0 | ||||||
| 0 | |||||||
| 0 | |||||||
| 1092 | 0 | 0 | 0 | print MIME "$postid CASE C\n" if $MIME_cleanup_log_flag; | |||
| 1093 | 0 | 0 | splice @chunks, $nextparts[1], $nextparts[2] - $nextparts[1] + 1; | ||||
| 1094 | 0 | 0 | splice @chunks, 1, 2; | ||||
| 1095 | 0 | 0 | return join("\n\n", @chunks[1 .. ($#chunks-1)] ); | ||||
| 1096 | } elsif (@nextparts == 1) { | ||||||
| 1097 | 0 | 0 | 0 | print MIME "$postid CASE D\n" if $MIME_cleanup_log_flag; | |||
| 1098 | 0 | 0 | splice @chunks, 1, 1; | ||||
| 1099 | 0 | 0 | return join("\n\n", @chunks[1 .. ($#chunks-1)] ); | ||||
| 1100 | } elsif (@nextparts == 5 or @nextparts == 6) { | ||||||
| 1101 | 0 | 0 | 0 | print MIME "$postid CASE E\n" if $MIME_cleanup_log_flag; | |||
| 1102 | 0 | 0 | splice @chunks, $nextparts[2], $nextparts[-1] - $nextparts[2] + 1; | ||||
| 1103 | 0 | 0 | splice @chunks, 1, 3; | ||||
| 1104 | 0 | 0 | return join("\n\n", @chunks[1 .. ($#chunks-1)] ); | ||||
| 1105 | } elsif (@nextparts == 7 or @nextparts == 8) { | ||||||
| 1106 | 0 | 0 | 0 | print MIME "$postid CASE F\n" if $MIME_cleanup_log_flag; | |||
| 1107 | 0 | 0 | splice @chunks, $nextparts[3], $nextparts[-1] - $nextparts[3] + 1; | ||||
| 1108 | 0 | 0 | splice @chunks, 1, 3; | ||||
| 1109 | 0 | 0 | return join("\n\n", @chunks[1 .. ($#chunks-1)] ); | ||||
| 1110 | } else { | ||||||
| 1111 | 0 | 0 | 0 | print MIME ' ' x 30, "$postid; count: ", | |||
| 1112 | sprintf("%3d", scalar(@nextparts)), " CASE G\n" | ||||||
| 1113 | if $MIME_cleanup_log_flag; | ||||||
| 1114 | 0 | 0 | return join("\n\n", @chunks[1 .. ($#chunks-1)] ); | ||||
| 1115 | } | ||||||
| 1116 | } | ||||||
| 1117 | } | ||||||
| 1118 | } | ||||||
| 1119 | |||||||
| 1120 | sub _prepare_output_string { | ||||||
| 1121 | 43 | 43 | 83 | my ($threads_hash_ref, $seen_ref, $dir_threads, $thread_msg_delimiter, | |||
| 1122 | $output_ref, $optional_fields_ref) = @_; | ||||||
| 1123 | 43 | 56 | my %messages = %{$threads_hash_ref}; | ||||
| 43 | 3101 | ||||||
| 1124 | 43 | 91 | my %seen = %{$seen_ref}; | ||||
| 43 | 247 | ||||||
| 1125 | 43 | 71 | my (%output, %opt_fields); | ||||
| 1126 | 43 | 100 | 102 | %output = %{$output_ref} if defined $output_ref; | |||
| 41 | 177 | ||||||
| 1127 | 43 | 62 | %opt_fields = %{$optional_fields_ref}; | ||||
| 43 | 86 | ||||||
| 1128 | 43 | 52 | my ($pathsep, $out, $lc_out, $outstr); | ||||
| 1129 | 43 | 50 | 428 | $pathsep = ($^O eq 'MSWin32') ? "\\" : '/'; | |||
| 1130 | 43 | 130 | $out = $dir_threads . $pathsep . $messages{$_}[0] . '.thr.txt'; | ||||
| 1131 | 43 | 117 | $lc_out = lc($out); | ||||
| 1132 | 43 | 94 | $seen{$lc_out}++; | ||||
| 1133 | 43 | 219 | $outstr = "Thread: $messages{$_}[0]\n"; | ||||
| 1134 | 43 | 84 | $outstr .= "Message: $messages{$_}[1]\n"; | ||||
| 1135 | 43 | 110 | $outstr .= "From: $messages{$_}[2]{'from'}\n"; | ||||
| 1136 | 43 | 142 | foreach my $i (sort keys %opt_fields) { | ||||
| 1137 | 0 | 0 | 0 | next unless (defined $messages{$_}[2]{$opt_fields{$i}[0]}); | |||
| 1138 | 0 | 0 | my $space = 13 - length($opt_fields{$i}[1]); | ||||
| 1139 | 0 | 0 | $outstr .= $opt_fields{$i}[1] . ':' . ' ' x $space . | ||||
| 1140 | "$messages{$_}[2]{$opt_fields{$i}[0]}\n"; | ||||||
| 1141 | } | ||||||
| 1142 | 43 | 195 | $outstr .= 'Text:' . "\n\n" . $messages{$_}[3] . "\n"; | ||||
| 1143 | 43 | 85 | $outstr .= "\n"; | ||||
| 1144 | 43 | 50 | 439 | $outstr .= "$thread_msg_delimiter" | |||
| 1145 | unless (! defined $thread_msg_delimiter); | ||||||
| 1146 | 43 | 244 | $output{$out} .= $outstr; | ||||
| 1147 | 43 | 683 | return \%seen, \%output; | ||||
| 1148 | } | ||||||
| 1149 | |||||||
| 1150 | sub _update_all_topics { | ||||||
| 1151 | 2 | 2 | 7 | my ($choice, $config_out_ref, $in_out_ref) = @_; | |||
| 1152 | 2 | 4 | my $title = ${$config_out_ref}{'title'}; | ||||
| 2 | 9 | ||||||
| 1153 | 2 | 12 | my $topicsfile = defined ${$config_out_ref}{'todays_topics'} # v1.96 | ||||
| 2 | 6 | ||||||
| 1154 | 2 | 50 | 5 | ? ${$config_out_ref}{'todays_topics'} | |||
| 1155 | 0 | 0 | : "${$config_out_ref}{'dir_digest'}/todays_topics.txt"; | ||||
| 1156 | 2 | 5 | my ($topic, $topicstring); | ||||
| 1157 | 2 | 50 | 12 | if ($choice eq 'ALL') { | |||
| 1158 | 0 | 0 | $topicstring = "$title Digest: Today's Topics\n"; | ||||
| 1159 | 0 | 0 | foreach ( sort keys %$in_out_ref ) { | ||||
| 1160 | 0 | 0 | $topicstring .= "\n${$in_out_ref}{$_}[0]\n"; | ||||
| 0 | 0 | ||||||
| 1161 | 0 | 0 | foreach $topic ( @{${$in_out_ref}{$_}[1]} ) { | ||||
| 0 | 0 | ||||||
| 0 | 0 | ||||||
| 1162 | 0 | 0 | $topicstring .= "$topic\n"; | ||||
| 1163 | } | ||||||
| 1164 | } | ||||||
| 1165 | 0 | 0 | 0 | open(TOPICS, ">$topicsfile") | |||
| 1166 | || die "cannot open $topicsfile for writing: $!"; | ||||||
| 1167 | 0 | 0 | print TOPICS $topicstring; | ||||
| 1168 | 0 | 0 | 0 | close(TOPICS) || die "can't close $topicsfile:$!"; | |||
| 1169 | } else { | ||||||
| 1170 | 2 | 14 | $topicstring = ''; | ||||
| 1171 | 2 | 17 | foreach ( sort keys %$in_out_ref ) { | ||||
| 1172 | 5 | 9 | $topicstring .= "\n${$in_out_ref}{$_}[0]\n"; | ||||
| 5 | 25 | ||||||
| 1173 | 5 | 8 | foreach $topic ( @{${$in_out_ref}{$_}[1]} ) { | ||||
| 5 | 6 | ||||||
| 5 | 19 | ||||||
| 1174 | 69 | 121 | $topicstring .= "$topic\n"; | ||||
| 1175 | } | ||||||
| 1176 | } | ||||||
| 1177 | 2 | 50 | 115 | open(TOPICS, ">>$topicsfile") | |||
| 1178 | || die "cannot open $topicsfile for appending: $!"; | ||||||
| 1179 | 2 | 13 | print TOPICS $topicstring; | ||||
| 1180 | 2 | 50 | 87 | close(TOPICS) || die "can't close $topicsfile:$!"; | |||
| 1181 | } | ||||||
| 1182 | } | ||||||
| 1183 | |||||||
| 1184 | sub _print_results { | ||||||
| 1185 | 2 | 2 | 6 | my ($total_digests_processed, $message_count, | |||
| 1186 | $config_out_ref, $thread_count) = @_; | ||||||
| 1187 | 2 | 12 | print < | ||||
| 1188 | |||||||
| 1189 | |||||||
| 1190 | RESULTS | ||||||
| 1191 | |||||||
| 1192 | 2 | 3097 | Digests processed:\t\t$total_digests_processed | ||||
| 1193 | Messages processed:\t\t$message_count | ||||||
| 1194 | Threads directory:\t\t${$config_out_ref}{'dir_threads'} | ||||||
| 1195 | Threads created/modified:\t$thread_count | ||||||
| 1196 | XQ19 | ||||||
| 1197 | } | ||||||
| 1198 | |||||||
| 1199 | sub _verify_date { | ||||||
| 1200 | 2 | 2 | 6 | my $dateref = shift; | |||
| 1201 | 2 | 21 | die "Incorrect date specification: $!" | ||||
| 1202 | unless ( | ||||||
| 1203 | 2 | 20 | (exists ${$dateref}{'year'}) && | ||||
| 1204 | 2 | 12 | (exists ${$dateref}{'month'}) && | ||||
| 1205 | 2 | 50 | 33 | 4 | (exists ${$dateref}{'day'}) | ||
| 33 | |||||||
| 1206 | ); | ||||||
| 1207 | 0 | 0 | die "${$dateref}{'year'} is incorrect year specification: $!" | ||||
| 2 | 13 | ||||||
| 1208 | 2 | 50 | 4 | unless (1900 <= ${$dateref}{'year'}); | |||
| 1209 | 0 | 0 | die "${$dateref}{'month'} is incorrect month specification: $!" | ||||
| 2 | 17 | ||||||
| 1210 | unless ( | ||||||
| 1211 | 2 | 21 | (1 <= ${$dateref}{'month'}) && | ||||
| 1212 | 2 | 50 | 33 | 4 | (${$dateref}{'month'} <= 12) | ||
| 1213 | ); | ||||||
| 1214 | 0 | 0 | die "${$dateref}{'day'} is incorrect day of month specification: $!" | ||||
| 2 | 20 | ||||||
| 1215 | unless ( | ||||||
| 1216 | ( | ||||||
| 1217 | 2 | 12 | ${$dateref}{'day'} >= 1 and | ||||
| 1218 | 0 | 0 | ${$dateref}{'day'} <= 28 | ||||
| 1219 | ) | ||||||
| 1220 | || | ||||||
| 1221 | ( | ||||||
| 1222 | 0 | 0 | $month31{${$dateref}{'month'}} and | ||||
| 1223 | 0 | 0 | ${$dateref}{'day'} >= 29 and | ||||
| 1224 | 0 | 0 | ${$dateref}{'day'} <= 31 | ||||
| 1225 | ) | ||||||
| 1226 | || | ||||||
| 1227 | ( | ||||||
| 1228 | 0 | 0 | $month30{${$dateref}{'month'}} and | ||||
| 1229 | 0 | 0 | ${$dateref}{'day'} >= 29 and | ||||
| 1230 | 0 | 0 | ${$dateref}{'day'} <= 30 | ||||
| 1231 | ) | ||||||
| 1232 | || | ||||||
| 1233 | ( | ||||||
| 1234 | 0 | 0 | ${$dateref}{'month'} == 2 and | ||||
| 1235 | ${$dateref}{'day'} == 29 and | ||||||
| 1236 | ( | ||||||
| 1237 | ${$dateref}{'year'} % 400 == 0 or | ||||||
| 1238 | ( | ||||||
| 1239 | ${$dateref}{'year'} % 100 != 0 and | ||||||
| 1240 | 2 | 0 | 33 | 4 | ${$dateref}{'year'} % 4 == 0 | ||
| 0 | |||||||
| 0 | |||||||
| 33 | |||||||
| 0 | |||||||
| 0 | |||||||
| 0 | |||||||
| 0 | |||||||
| 0 | |||||||
| 0 | |||||||
| 0 | |||||||
| 1241 | ) | ||||||
| 1242 | ) | ||||||
| 1243 | ) | ||||||
| 1244 | ); | ||||||
| 1245 | 2 | 5 | return timelocal( | ||||
| 1246 | 0, 0, 0, | ||||||
| 1247 | 2 | 6 | ${$dateref}{'day'}, | ||||
| 1248 | 2 | 14 | ${$dateref}{'month'} - 1, | ||||
| 1249 | 2 | 5 | ${$dateref}{'year'} | ||||
| 1250 | ); | ||||||
| 1251 | } | ||||||
| 1252 | |||||||
| 1253 | sub _get_array_of_messages { | ||||||
| 1254 | 8 | 8 | 23 | my ($in, $delimiter) = @_; | |||
| 1255 | 8 | 11 | my ($fh, $bigstr); | ||||
| 1256 | 8 | 50 | 486 | open $fh, $in or die "Couldn't open $in for reading: $!"; | |||
| 1257 | { | ||||||
| 1258 | 8 | 16 | local $/ = undef; | ||||
| 8 | 31 | ||||||
| 1259 | 8 | 215 | $bigstr = <$fh>; | ||||
| 1260 | } | ||||||
| 1261 | 8 | 50 | 232 | close $fh or die "Couldn't close $in after reading: $!"; | |||
| 1262 | 8 | 135 | my @messages = split(/$delimiter/, $bigstr); | ||||
| 1263 | 8 | 52 | return \@messages; | ||||
| 1264 | } | ||||||
| 1265 | |||||||
| 1266 | 1; | ||||||
| 1267 | |||||||
| 1268 | ############################ DOCUMENTATION ##################################### | ||||||
| 1269 | |||||||
| 1270 | =head1 NAME | ||||||
| 1271 | |||||||
| 1272 | Mail::Digest::Tools - Tools for digest versions of mailing lists | ||||||
| 1273 | |||||||
| 1274 | =head1 VERSION | ||||||
| 1275 | |||||||
| 1276 | This document refers to version 2.12 of digest.pl, released May 14, 2011. | ||||||
| 1277 | |||||||
| 1278 | =head1 SYNOPSIS | ||||||
| 1279 | |||||||
| 1280 | use Mail::Digest::Tools qw( | ||||||
| 1281 | process_new_digests | ||||||
| 1282 | reprocess_ALL_digests | ||||||
| 1283 | reply_to_digest_message | ||||||
| 1284 | repair_message_order | ||||||
| 1285 | consolidate_threads_multiple | ||||||
| 1286 | consolidate_threads_single | ||||||
| 1287 | delete_deletables | ||||||
| 1288 | ); | ||||||
| 1289 | |||||||
| 1290 | C<%config_in> and C<%config_out> are two configuration hashes whose setup | ||||||
| 1291 | is discussed in detail below. | ||||||
| 1292 | |||||||
| 1293 | process_new_digests(\%config_in, \%config_out); | ||||||
| 1294 | |||||||
| 1295 | reprocess_ALL_digests(\%config_in, \%config_out); | ||||||
| 1296 | |||||||
| 1297 | $full_reply_file = reply_to_digest_message( | ||||||
| 1298 | \%config_in, | ||||||
| 1299 | \%config_out, | ||||||
| 1300 | $digest_number, | ||||||
| 1301 | $digest_entry, | ||||||
| 1302 | $directory_for_reply, | ||||||
| 1303 | ); | ||||||
| 1304 | |||||||
| 1305 | repair_message_order( | ||||||
| 1306 | \%config_in, | ||||||
| 1307 | \%config_out, | ||||||
| 1308 | { | ||||||
| 1309 | year => 2004, | ||||||
| 1310 | month => 01, | ||||||
| 1311 | day => 27, | ||||||
| 1312 | } | ||||||
| 1313 | ); | ||||||
| 1314 | |||||||
| 1315 | consolidate_threads_multiple( | ||||||
| 1316 | \%config_in, | ||||||
| 1317 | \%config_out, | ||||||
| 1318 | $first_common_letters, # optional integer argument; defaults to 20 | ||||||
| 1319 | ); | ||||||
| 1320 | |||||||
| 1321 | consolidate_threads_single( | ||||||
| 1322 | \%config_in, | ||||||
| 1323 | \%config_out, | ||||||
| 1324 | [ | ||||||
| 1325 | 'first_dummy_file_for_consolidation.thr.txt', | ||||||
| 1326 | 'second_dummy_file_for_consolidation.thr.txt', | ||||||
| 1327 | ], | ||||||
| 1328 | ); | ||||||
| 1329 | |||||||
| 1330 | delete_deletables(\%config_out); | ||||||
| 1331 | |||||||
| 1332 | =head1 DESCRIPTION | ||||||
| 1333 | |||||||
| 1334 | Mail::Digest::Tools provides useful tools for processing mail which an | ||||||
| 1335 | individual receives in a 'daily digest' version from a mailing list. | ||||||
| 1336 | Digest versions of mailing lists are provided by a variety of mail processing | ||||||
| 1337 | programs and by a variety of list hosts. Within the Perl community, digest | ||||||
| 1338 | versions of mailing lists are offered by such sponsors as Active State, | ||||||
| 1339 | Sourceforge, Yahoo! Groups and London.pm. However, you do not have to be | ||||||
| 1340 | interested in Perl to make use of Mail::Digest::Tools.  Mail from I | ||||||
| 1341 | the thousands of Yahoo! Groups, for example, may be processed with this module. | ||||||
| 1342 | |||||||
| 1343 | If, when you receive e-mail from the digest version of a mailing list, you | ||||||
| 1344 | simply read the digest in an e-mail client and then discard it, you may stop | ||||||
| 1345 | reading here. If, however, you wish to read or store such mail by subject, | ||||||
| 1346 | read on. As printed in a normal web browser, this document contains 40 | ||||||
| 1347 | pages of documentation. You are urged to print this documentation out and | ||||||
| 1348 | study it before using this module. | ||||||
| 1349 | |||||||
| 1350 | To understand how to use Mail::Digest::Tools, we will first take a look at a | ||||||
| 1351 | typical mailing list digest. We will then sketch how that digest looks once | ||||||
| 1352 | processed by Mail::Digest::Tool. We will then discuss Mail::Digest::Tool's | ||||||
| 1353 | exportable functions. Next, we will study how to prepare the two configuration | ||||||
| 1354 | hashes which hold the configuration data. Finally, we will provide some tips | ||||||
| 1355 | for everyday use of Mail::Digest::Tools. | ||||||
| 1356 | |||||||
| 1357 | =head1 A TYPICAL MAILING LIST DIGEST | ||||||
| 1358 | |||||||
| 1359 | Here is a dummied-up version of a typical mailing list digest as it appears | ||||||
| 1360 | once saved to a plain-text file. For illustrative purposes, let us suppose | ||||||
| 1361 | that the file is named: 'Perl-Win32-Users Digest, Vol 1 Issue 9999.txt' | ||||||
| 1362 | |||||||
| 1363 | Send Perl-Win32-Users mailing list submissions to | ||||||
| 1364 | perl-win32-users@listserv.ActiveState.com | ||||||
| 1365 | |||||||
| 1366 | When replying, please edit your Subject line so it is more specific | ||||||
| 1367 | than "Re: Contents of Perl-Win32-Users digest..." | ||||||
| 1368 | |||||||
| 1369 | Today's Topics: | ||||||
| 1370 | |||||||
| 1371 | 1. Introducing Mail::Digest::Tools (James E Keenan) | ||||||
| 1372 | 2. A Different Discussion (steve) | ||||||
| 1373 | 3. Re: Introducing Mail::Digest::Tools (David H Adler) | ||||||
| 1374 | |||||||
| 1375 | ---------------------------------------------------------------------- | ||||||
| 1376 | |||||||
| 1377 | Message: 1 | ||||||
| 1378 | From: "James E Keenan" | ||||||
| 1379 | To: | ||||||
| 1380 | Subject: Introducing Mail::Digest::Tools | ||||||
| 1381 | Date: Sat, 31 Jan 2004 14:10:20 -0600 | ||||||
| 1382 | |||||||
| 1383 | Mail::Digest::Tools is the greatest thing since sliced bread. | ||||||
| 1384 | Go download it now! | ||||||
| 1385 | |||||||
| 1386 | ------------------------------ | ||||||
| 1387 | |||||||
| 1388 | Message: 2 | ||||||
| 1389 | From: "steve" | ||||||
| 1390 | To: | ||||||
| 1391 | Subject: A Different Discussion | ||||||
| 1392 | Date: Sat, 31 Jan 2004 14:40:20 -0600 | ||||||
| 1393 | |||||||
| 1394 | This is a new topic. I am not discussing Mail::Digest::Tools in this | ||||||
| 1395 | submission. | ||||||
| 1396 | |||||||
| 1397 | ------------------------------ | ||||||
| 1398 | |||||||
| 1399 | Message: 3 | ||||||
| 1400 | From: "David H Adler" | ||||||
| 1401 | To: | ||||||
| 1402 | Subject: Re: Introducing Mail::Digest::Tools | ||||||
| 1403 | Date: Sat, 31 Jan 2004 14:50:20 -0600 | ||||||
| 1404 | |||||||
| 1405 | Jim, what's this nonsense about sliced bread. Weren't you on the Atkins | ||||||
| 1406 | diet? Unlike beer, sliced bread is Off Topic. | ||||||
| 1407 | |||||||
| 1408 | ------------------------------ | ||||||
| 1409 | |||||||
| 1410 | _______________________________________________ | ||||||
| 1411 | Perl-Win32-Users mailing list | ||||||
| 1412 | Perl-Win32-Users@listserv.ActiveState.com | ||||||
| 1413 | To unsubscribe: http://listserv.ActiveState.com/mailman/mysubs | ||||||
| 1414 | |||||||
| 1415 | End of Perl-Win32-Users Digest | ||||||
| 1416 | |||||||
| 1417 | Note that the digest has an I | ||||||
| 1418 | the digest has its own structure. | ||||||
| 1419 | |||||||
| 1420 | The digest's overall structure consists of: | ||||||
| 1421 | |||||||
| 1422 | =over 4 | ||||||
| 1423 | |||||||
| 1424 | =item * | ||||||
| 1425 | |||||||
| 1426 | I | ||||||
| 1427 | |||||||
| 1428 | The digest header consists of one or more paragraphs providing instructions | ||||||
| 1429 | on how to subscribe, post messages, unsubscribe and contact the list | ||||||
| 1430 | administrator. | ||||||
| 1431 | |||||||
| 1432 | In processing a digest, Mail::Digest::Tools generally discards the digest | ||||||
| 1433 | header. | ||||||
| 1434 | |||||||
| 1435 | =item * | ||||||
| 1436 | |||||||
| 1437 | I | ||||||
| 1438 | |||||||
| 1439 | Next, each daily digest contains a list of the subjects of the messages found | ||||||
| 1440 | in that particular digest. This list is introduced by a paragraph such as: | ||||||
| 1441 | |||||||
| 1442 | Today's Topics | ||||||
| 1443 | |||||||
| 1444 | and is followed by a numbered list of the message subjects and authors. Some | ||||||
| 1445 | digests break the authors into two lines for names and e-mail addresses. | ||||||
| 1446 | Others, such as the example above, list only names. | ||||||
| 1447 | |||||||
| 1448 | When Mail::Digest::Tools process a digest, it extracts the list of topics as a | ||||||
| 1449 | single chunk and appends it to a file containing the topics from all previous | ||||||
| 1450 | digests which the user has similarly processed. | ||||||
| 1451 | |||||||
| 1452 | =item * | ||||||
| 1453 | |||||||
| 1454 | I | ||||||
| 1455 | |||||||
| 1456 | The list of topics is separated from the first message by a string of | ||||||
| 1457 | characters which the list sponsor has, we hope, determined is not likely to | ||||||
| 1458 | occur in the text of any message posted to that list. In the example above, | ||||||
| 1459 | the source message delimiter is the string: | ||||||
| 1460 | |||||||
| 1461 | ---------------------------------------------------------------------- | ||||||
| 1462 | |||||||
| 1463 | followed by two C<\n> newlines (so that the delimiter is a paragraph unto | ||||||
| 1464 | itself). Other digests may use a two-line delimiter such as: | ||||||
| 1465 | |||||||
| 1466 | _______________________________________________________ | ||||||
| 1467 | _______________________________________________________ | ||||||
| 1468 | |||||||
| 1469 | or | ||||||
| 1470 | |||||||
| 1471 | --__--__-- | ||||||
| 1472 | |||||||
| 1473 | =item * | ||||||
| 1474 | |||||||
| 1475 | I | ||||||
| 1476 | |||||||
| 1477 | Most mailing list digests use the same string to delimit individual messages | ||||||
| 1478 | within the digest that they use to delimit the list of today's topics from the | ||||||
| 1479 | very first message in the digest. (The author tracked one digest for more | ||||||
| 1480 | than three-and-a-half years that used the same string for both functions -- | ||||||
| 1481 | only to see that digest's provider change its format while this module was | ||||||
| 1482 | being prepared for CPAN!) But the digest may use a different string to | ||||||
| 1483 | separate individual messages from each other. In the sample digest above, | ||||||
| 1484 | the source message delimiter is the string: | ||||||
| 1485 | |||||||
| 1486 | ------------------------------ | ||||||
| 1487 | |||||||
| 1488 | followed by two C<\n> newlines (so that the delimiter is a paragraph unto | ||||||
| 1489 | itself). | ||||||
| 1490 | |||||||
| 1491 | As we shall see below, correctly identifying the post-topics delimiter and | ||||||
| 1492 | source message delimiter used in a particular digest is essential to correct | ||||||
| 1493 | configuration of Mail::Digest::Tools, as the module will repeatedly C | ||||||
| 1494 | digests on this delimiter. | ||||||
| 1495 | |||||||
| 1496 | =item * | ||||||
| 1497 | |||||||
| 1498 | I | ||||||
| 1499 | |||||||
| 1500 | Individual messages have their own structure. | ||||||
| 1501 | |||||||
| 1502 | =over 4 | ||||||
| 1503 | |||||||
| 1504 | =item * | ||||||
| 1505 | |||||||
| 1506 | I | ||||||
| 1507 | |||||||
| 1508 | In addition to normal mail headers, a message in a digest must have a | ||||||
| 1509 | message number representing its position within that day's digest. So a | ||||||
| 1510 | message in a digest will typically have some or all of the following headers: | ||||||
| 1511 | |||||||
| 1512 | Message: | ||||||
| 1513 | From: | ||||||
| 1514 | Organization: | ||||||
| 1515 | Reply-To: | ||||||
| 1516 | To: | ||||||
| 1517 | CC: | ||||||
| 1518 | Date: | ||||||
| 1519 | Subject: | ||||||
| 1520 | |||||||
| 1521 | =item * | ||||||
| 1522 | |||||||
| 1523 | I | ||||||
| 1524 | |||||||
| 1525 | One of more paragraphs of text, frequently including citations from earlier | ||||||
| 1526 | postings to the mailing list. | ||||||
| 1527 | |||||||
| 1528 | The main objective of Mail::Digest::Tools is to extract headers and bodies | ||||||
| 1529 | from particular digest entries and to append them to plain-text files which | ||||||
| 1530 | hold all postings on a particular subject. See discussion of | ||||||
| 1531 | C | ||||||
| 1532 | |||||||
| 1533 | Many mailing lists allow subscribers to post in either plain-text or HTML. | ||||||
| 1534 | Some allow users to post attachments; others do not. Others still | ||||||
| 1535 | incorporate the attachments into the message body, often using 'multipart | ||||||
| 1536 | MIME' format. Regrettably, certain mailing list digest programs fail to | ||||||
| 1537 | eliminate redundant MIME parts before posting a message to a digest. This | ||||||
| 1538 | leads to severe bloat once Mail::Digest::Tools extracts a message's content | ||||||
| 1539 | and posts it to a thread file. Mail::Digest::Tools, however, provides its | ||||||
| 1540 | users with the option of stripping redundant MIME parts from a message | ||||||
| 1541 | before posting. | ||||||
| 1542 | |||||||
| 1543 | =item * | ||||||
| 1544 | |||||||
| 1545 | I | ||||||
| 1546 | |||||||
| 1547 | As discussed above, each message within a digest is delimited by a string | ||||||
| 1548 | which may or may not be the same string which separates the list of Today's | ||||||
| 1549 | Topics from the first message in the digest. | ||||||
| 1550 | |||||||
| 1551 | =back | ||||||
| 1552 | |||||||
| 1553 | =item * | ||||||
| 1554 | |||||||
| 1555 | I | ||||||
| 1556 | |||||||
| 1557 | The digest footer consists of one or more paragraphs containing | ||||||
| 1558 | additional information on the digest and signaling the end of the digest. It | ||||||
| 1559 | follows the source message delimiter corresponding to the last message in a | ||||||
| 1560 | particular digest. | ||||||
| 1561 | |||||||
| 1562 | In processing a given digest, Mail::Digest::Tools generally discards the | ||||||
| 1563 | digest footer. | ||||||
| 1564 | |||||||
| 1565 | =back | ||||||
| 1566 | |||||||
| 1567 | =head2 The Typical Digest After Processing with Mail::Digest::Tools | ||||||
| 1568 | |||||||
| 1569 | Using the dummy messages provided above, typical use of Mail::Digest::Tools | ||||||
| 1570 | would produce (in a bare-bones configuration) the following results: | ||||||
| 1571 | |||||||
| 1572 | =over 4 | ||||||
| 1573 | |||||||
| 1574 | =item * | ||||||
| 1575 | |||||||
| 1576 | Two plain-text 'thread' files holding the ongoing discussion of each topic: | ||||||
| 1577 | |||||||
| 1578 | =over 4 | ||||||
| 1579 | |||||||
| 1580 | =item * | ||||||
| 1581 | |||||||
| 1582 | F | ||||||
| 1583 | |||||||
| 1584 | Thread: Introducing Mail::Digest::Tools | ||||||
| 1585 | Message: 001_9999_001 | ||||||
| 1586 | From:         "James E Keenan" | ||||||
| 1587 | Text: | ||||||
| 1588 | |||||||
| 1589 | Mail::Digest::Tools is the greatest thing since sliced bread. | ||||||
| 1590 | Go download it now! | ||||||
| 1591 | |||||||
| 1592 | --__--__-- | ||||||
| 1593 | |||||||
| 1594 | Thread: Introducing Mail::Digest::Tools | ||||||
| 1595 | Message: 001_9999_003 | ||||||
| 1596 | From: "David H Adler" | ||||||
| 1597 | Text: | ||||||
| 1598 | |||||||
| 1599 | Jim, what's this nonsense about sliced bread. Weren't you on the Atkins | ||||||
| 1600 | diet? Unlike beer, sliced bread is Off Topic. | ||||||
| 1601 | |||||||
| 1602 | --__--__-- | ||||||
| 1603 | |||||||
| 1604 | =item * | ||||||
| 1605 | |||||||
| 1606 | F | ||||||
| 1607 | |||||||
| 1608 | Thread: A Different Discussion | ||||||
| 1609 | Message: 001_9999_002 | ||||||
| 1610 | From: "steve" | ||||||
| 1611 | Text: | ||||||
| 1612 | |||||||
| 1613 | This is a new topic. I am not discussing Mail::Digest::Tools in this | ||||||
| 1614 | submission. | ||||||
| 1615 | |||||||
| 1616 | --__--__-- | ||||||
| 1617 | |||||||
| 1618 | =back | ||||||
| 1619 | |||||||
| 1620 | =item * | ||||||
| 1621 | |||||||
| 1622 | A new entry at the end of file F | ||||||
| 1623 | |||||||
| 1624 | Today's Topics | ||||||
| 1625 | |||||||
| 1626 | ... | ||||||
| 1627 | |||||||
| 1628 | Perl-Win32-Users digest, Vol 1 #9999 - 3 msgs.txt | ||||||
| 1629 | 1. Introducing Mail::Digest::Tools (James E Keenan) | ||||||
| 1630 | 2. A Different Discussion (steve) | ||||||
| 1631 | 3. Re: Introducing Mail::Digest::Tools (David H Adler) | ||||||
| 1632 | |||||||
| 1633 | =item * | ||||||
| 1634 | |||||||
| 1635 | A new entry at the end of file F | ||||||
| 1636 | |||||||
| 1637 | 001_9999;Fri Feb 6 18:57:41 2004;Fri Feb 6 18:57:41 2004 | ||||||
| 1638 | |||||||
| 1639 | =back | ||||||
| 1640 | |||||||
| 1641 | =head1 FUNCTIONS | ||||||
| 1642 | |||||||
| 1643 | Mail::Digest::Tools exports no functions by default. Each of its current | ||||||
| 1644 | seven functions is imported only on request by your script. | ||||||
| 1645 | |||||||
| 1646 | In everyday use, you will probably call just I | ||||||
| 1647 | exportable functions in a particular Perl script. Typically, you will import | ||||||
| 1648 | the function as described in the SYNOPSIS above, populate two configuration | ||||||
| 1649 | hashes, and finally call the one function you have imported. | ||||||
| 1650 | |||||||
| 1651 | As will become evident, the most challenging part of using Mail::Digest::Tools | ||||||
| 1652 | is I | ||||||
| 1653 | of configuration files from which the two configuration hashes passed as | ||||||
| 1654 | arguments to the various Mail::Digest::Tools functions are drawn. | ||||||
| 1655 | |||||||
| 1656 | More on those configuration hashes later. For now, let's look at the | ||||||
| 1657 | exportable functions. | ||||||
| 1658 | |||||||
| 1659 | =head2 C | ||||||
| 1660 | |||||||
| 1661 | process_new_digests(\%config_in, \%config_out); | ||||||
| 1662 | |||||||
| 1663 | C | ||||||
| 1664 | use most frequently on a daily basis. Based on information supplied in the | ||||||
| 1665 | two configuration hashes passed to it as arguments, C | ||||||
| 1666 | does the following: | ||||||
| 1667 | |||||||
| 1668 | =over 4 | ||||||
| 1669 | |||||||
| 1670 | =item * | ||||||
| 1671 | |||||||
| 1672 | Validates the configuration data. | ||||||
| 1673 | |||||||
| 1674 | =item * | ||||||
| 1675 | |||||||
| 1676 | Conducts an analysis of the directory in which thread files for a given | ||||||
| 1677 | digest are stored to determine are old enough: | ||||||
| 1678 | |||||||
| 1679 | =over 4 | ||||||
| 1680 | |||||||
| 1681 | =item * | ||||||
| 1682 | |||||||
| 1683 | I | ||||||
| 1684 | configuration file that you wish to archive older threads in a subdirectory | ||||||
| 1685 | |||||||
| 1686 | =item * | ||||||
| 1687 | |||||||
| 1688 | I | ||||||
| 1689 | I | ||||||
| 1690 | |||||||
| 1691 | =back | ||||||
| 1692 | |||||||
| 1693 | =item * | ||||||
| 1694 | |||||||
| 1695 | Conducts an analysis of the directory in which digest files (I | ||||||
| 1696 | plain-text versions of mailing list digests you have received) are stored to | ||||||
| 1697 | determine which digest files are new and need processing and which have | ||||||
| 1698 | previously been processed. | ||||||
| 1699 | |||||||
| 1700 | =item * | ||||||
| 1701 | |||||||
| 1702 | Updates a log file to put a timestamp on the processing of the new digest | ||||||
| 1703 | file or files. Based on options set in the configuration file, this function | ||||||
| 1704 | may also update a more human-readable version of this log file. | ||||||
| 1705 | |||||||
| 1706 | =item * | ||||||
| 1707 | |||||||
| 1708 | Opens each of the digest files identified as needing processing and proceeds | ||||||
| 1709 | to 'strip down' those files. This 'stripping down' includes the following: | ||||||
| 1710 | |||||||
| 1711 | =over 4 | ||||||
| 1712 | |||||||
| 1713 | =item * | ||||||
| 1714 | |||||||
| 1715 | The digest file's name is analyzed to extract the digest's number as issued by | ||||||
| 1716 | the provider's mailing list program. This number is used to form part of the | ||||||
| 1717 | unique identifier which Mail::Digest::Tools assigns to each message within | ||||||
| 1718 | each digest. | ||||||
| 1719 | |||||||
| 1720 | =item * | ||||||
| 1721 | |||||||
| 1722 | The list of today's topics in the digest is extracted and appended to a | ||||||
| 1723 | permanent log file of such topics. | ||||||
| 1724 | |||||||
| 1725 | =item * | ||||||
| 1726 | |||||||
| 1727 | The digest's contents are split into individual messages. Each message, in | ||||||
| 1728 | turn, is split into headers and body. | ||||||
| 1729 | |||||||
| 1730 | =item * | ||||||
| 1731 | |||||||
| 1732 | If you have requested in the configuration file that superfluous multipart | ||||||
| 1733 | MIME content be purged from messages before posting to thread files, this | ||||||
| 1734 | purging is now conducted. | ||||||
| 1735 | |||||||
| 1736 | =item * | ||||||
| 1737 | |||||||
| 1738 | Each message is appended to an appropriate, plain-text thread file which | ||||||
| 1739 | holds the ongoing discussion of that topic. The following factors are taken | ||||||
| 1740 | into consideration: | ||||||
| 1741 | |||||||
| 1742 | =over 4 | ||||||
| 1743 | |||||||
| 1744 | =item * | ||||||
| 1745 | |||||||
| 1746 | The name of the thread file is derived from the message's subject, though | ||||||
| 1747 | characters in the message's subject which would not be valid in file names | ||||||
| 1748 | on your operating system are skipped over. | ||||||
| 1749 | |||||||
| 1750 | =item * | ||||||
| 1751 | |||||||
| 1752 | To the greatest extent possible, extraneous words in a message's subject | ||||||
| 1753 | such as 'Re:' or 'Fwd:' are deleted so that all relevant postings on a given | ||||||
| 1754 | subject can be included in a single thread file. (Should this not succeed | ||||||
| 1755 | and a new thread file beginning with 'Re:' or some similar term be created, | ||||||
| 1756 | you can fix this later by using Mail::Digest::Tool's | ||||||
| 1757 | C | ||||||
| 1758 | |||||||
| 1759 | =back | ||||||
| 1760 | |||||||
| 1761 | =item * | ||||||
| 1762 | |||||||
| 1763 | A brief summation of results is printed to standard output. | ||||||
| 1764 | |||||||
| 1765 | =back | ||||||
| 1766 | |||||||
| 1767 | =back | ||||||
| 1768 | |||||||
| 1769 | =head2 C | ||||||
| 1770 | |||||||
| 1771 | reprocess_ALL_digests(\%config_in, \%config_out); | ||||||
| 1772 | |||||||
| 1773 | C | ||||||
| 1774 | should use ONLY when you are setting up and fine-tuning Mail::Digest::Tools | ||||||
| 1775 | to process a given digest -- and you should NEVER use it thereafter! | ||||||
| 1776 | |||||||
| 1777 | Why? Read on! | ||||||
| 1778 | |||||||
| 1779 | C | ||||||
| 1780 | C | ||||||
| 1781 | directory in which you store such digests -- not just on those previously | ||||||
| 1782 | processed. But in the process it does not merely append new messages to | ||||||
| 1783 | already existing thread files, leaving older thread files untouched. Instead, | ||||||
| 1784 | C | ||||||
| 1785 | rebuilds it from scratch. | ||||||
| 1786 | |||||||
| 1787 | That's cool if you have retained all instances of a given digest which you | ||||||
| 1788 | wish to process into thread files. But if you've thrown out older instances | ||||||
| 1789 | of a given digest and call C | ||||||
| 1790 | to process the messages contained in those discarded digests. The message | ||||||
| 1791 | sources are gone. That's cool once you're certain that you've got a given | ||||||
| 1792 | digest configured just the way you want it -- but not until that moment. | ||||||
| 1793 | |||||||
| 1794 | =over 4 | ||||||
| 1795 | |||||||
| 1796 | =item * Example | ||||||
| 1797 | |||||||
| 1798 | Let's make this more concrete. Suppose that you have begun to subscribe to | ||||||
| 1799 | the digest version of the London Perlmongers mailing list. When you receive | ||||||
| 1800 | e-mails from this provider, you store them in a directory whose contents look | ||||||
| 1801 | like this: | ||||||
| 1802 | |||||||
| 1803 | london.pm digest, Vol 1 #1856 - 7 msgs.txt | ||||||
| 1804 | london.pm digest, Vol 1 #1857 - 18 msgs.txt | ||||||
| 1805 | london.pm digest, Vol 1 #1858 - 15 msgs.txt | ||||||
| 1806 | london.pm digest, Vol 1 #1859 - 17 msgs.txt | ||||||
| 1807 | london.pm digest, Vol 1 #1860 - 11 msgs.txt | ||||||
| 1808 | |||||||
| 1809 | Initially, you decide that you want to post the messages in these digests | ||||||
| 1810 | to thread files that are discarded after three days. You set up your | ||||||
| 1811 | configuration files to do precisely this. (See below for how this is done.) | ||||||
| 1812 | You then write a script which calls | ||||||
| 1813 | |||||||
| 1814 | reprocess_ALL_digests(\%config_in, \%config_out); | ||||||
| 1815 | |||||||
| 1816 | Three days go by. One or two new london.pm digests arrive each day. You | ||||||
| 1817 | want to process only the newly arrived files, so each day you simply call: | ||||||
| 1818 | |||||||
| 1819 | process_new_digests(\%config_in, \%config_out); | ||||||
| 1820 | |||||||
| 1821 | and on Day 4 Mail::Digest::Tools starts to notify you on standard output | ||||||
| 1822 | that it is discarding thread files which have not been changed (I | ||||||
| 1823 | received new postings) in three days. | ||||||
| 1824 | |||||||
| 1825 | But then you decide that London.pm's contributors are the most witty and | ||||||
| 1826 | erudite Perlmongers anywhere and you wish to archive their contributions | ||||||
| 1827 | until the end of time (or until the first production release of | ||||||
| 1828 | Perl 6, whichever comes first). Fortunately, you've still got all your | ||||||
| 1829 | London.pm digest files going back to the beginning of your subscription. | ||||||
| 1830 | You make appropriate changes to your configuration setup to say, ''Instead | ||||||
| 1831 | of killing these thread files after 3 days of inactivity, archive them after | ||||||
| 1832 | 3 days instead.'' (Again, we'll see how to do this below.) You then call: | ||||||
| 1833 | |||||||
| 1834 | reprocess_ALL_digests(\%config_in, \%config_out); | ||||||
| 1835 | |||||||
| 1836 | one last time. All your previously existing thread files are wiped out, and | ||||||
| 1837 | all your London.pm digests are reprocessed from scratch. But that's okay, | ||||||
| 1838 | because you've decided to live with your configuration decisions. So you | ||||||
| 1839 | can now begin to discard older digest files and process newly arrived files | ||||||
| 1840 | only with | ||||||
| 1841 | |||||||
| 1842 | process_new_digests(\%config_in, \%config_out); | ||||||
| 1843 | |||||||
| 1844 | Your London.pm thread archive grows exponentially, and you live happily ever | ||||||
| 1845 | after. | ||||||
| 1846 | |||||||
| 1847 | =back | ||||||
| 1848 | |||||||
| 1849 | The ALL CAPS in C | ||||||
| 1850 | Mail::Digest::Tools function is very powerful, but potentially very dangerous. | ||||||
| 1851 | You are also alerted to this danger by this screen prompt which appears when | ||||||
| 1852 | you call this function: | ||||||
| 1853 | |||||||
| 1854 | By default, this program processes only NEWLY ARRIVED | ||||||
| 1855 | [London.pm/other digest] files found in this directory. Messages in | ||||||
| 1856 | these new digests are sorted and appended to the appropriate | ||||||
| 1857 | '.thr.txt' files in the 'Threads' subdirectory. | ||||||
| 1858 | |||||||
| 1859 | However, by choosing method 'reprocess_ALL_digests()' you have | ||||||
| 1860 | indicated that you wish to process ALL digest files found in this | ||||||
| 1861 | directory -- regardless of whether or not they have previously been | ||||||
| 1862 | processed. This is recommended ONLY for initialization and testing | ||||||
| 1863 | of this program. | ||||||
| 1864 | |||||||
| 1865 | Since this will wipe out all threads files ('.thr.txt') as well -- | ||||||
| 1866 | including threads files for which you no longer have their source | ||||||
| 1867 | digest files -- please confirm that this is your intent by typing | ||||||
| 1868 | ALL at the prompt. | ||||||
| 1869 | |||||||
| 1870 | |||||||
| 1871 | GOT IT? | ||||||
| 1872 | |||||||
| 1873 | To proceed, you must type C | ||||||
| 1874 | yet another prompt: | ||||||
| 1875 | |||||||
| 1876 | You have chosen to WIPE OUT all '.thr.txt' files currently | ||||||
| 1877 | existing in the 'Threads' subdirectory and reprocess all | ||||||
| 1878 | [London.pm/other digest] digest files from scratch. | ||||||
| 1879 | |||||||
| 1880 | Please re-confirm your choice by once again typing 'ALL' | ||||||
| 1881 | and hitting [Enter]: | ||||||
| 1882 | |||||||
| 1883 | You must again type C | ||||||
| 1884 | digests.  Should you fail to type C | ||||||
| 1885 | script will default to C | ||||||
| 1886 | arrived digest files. | ||||||
| 1887 | |||||||
| 1888 | =head2 C | ||||||
| 1889 | |||||||
| 1890 | $full_reply_file = reply_to_digest_message( | ||||||
| 1891 | \%config_in, | ||||||
| 1892 | \%config_out, | ||||||
| 1893 | $digest_number, | ||||||
| 1894 | $digest_entry, | ||||||
| 1895 | $directory_for_reply, | ||||||
| 1896 | ); | ||||||
| 1897 | |||||||
| 1898 | Once you have begun to follow discussion threads on a mailing list with the | ||||||
| 1899 | aid of Mail::Digest::Tools, you may wish to join the discussion and reply to | ||||||
| 1900 | a message. | ||||||
| 1901 | |||||||
| 1902 | If you tried to do this by hitting the 'Reply' button in your e-mail client, | ||||||
| 1903 | you would probably end up with a 'Subject' line in your e-mail that looked | ||||||
| 1904 | this: | ||||||
| 1905 | |||||||
| 1906 | Re: london.pm digest, Vol 1 #1814 - 2 msgs | ||||||
| 1907 | |||||||
| 1908 | Needless to say, this is tacky. So tacky that many mailing list digest | ||||||
| 1909 | programs insert this message into each digest's headers: | ||||||
| 1910 | |||||||
| 1911 | When replying, please edit your Subject line so it is more specific | ||||||
| 1912 | than "Re: Contents of london.pm digest, Vol 1, #xxxx..." | ||||||
| 1913 | |||||||
| 1914 | You don't want to be tacky; you want to be lazy. You want Perl to do the | ||||||
| 1915 | work of initiating an e-mail with a meaningful subject header for you. | ||||||
| 1916 | Mail::Digest::Tool's C | ||||||
| 1917 | a plain-text file for you that has a meaningful subject line and prepends | ||||||
| 1918 | each line of the body of the message with C<\> >. You then open this | ||||||
| 1919 | plain-text file, edit it to reply to its contents, copy-and-paste it into | ||||||
| 1920 | your e-mail client, and send it. | ||||||
| 1921 | |||||||
| 1922 | The arguments passed to C | ||||||
| 1923 | |||||||
| 1924 | =over 4 | ||||||
| 1925 | |||||||
| 1926 | =item * | ||||||
| 1927 | |||||||
| 1928 | a reference to the 'in' configuration hash | ||||||
| 1929 | |||||||
| 1930 | =item * | ||||||
| 1931 | |||||||
| 1932 | a reference to the 'out' configuration hash | ||||||
| 1933 | |||||||
| 1934 | =item * | ||||||
| 1935 | |||||||
| 1936 | the number of the digest containing the message to which you are replying | ||||||
| 1937 | |||||||
| 1938 | =item * | ||||||
| 1939 | |||||||
| 1940 | the number of the message to which you are replying within that digest | ||||||
| 1941 | |||||||
| 1942 | =item * | ||||||
| 1943 | |||||||
| 1944 | a path to the directory in which you want the plain-text reply file to be | ||||||
| 1945 | created | ||||||
| 1946 | |||||||
| 1947 | =back | ||||||
| 1948 | |||||||
| 1949 | =over 4 | ||||||
| 1950 | |||||||
| 1951 | =item * Example | ||||||
| 1952 | |||||||
| 1953 | Suppose that you wished to reply to message #2 in London.pm digest #1814: | ||||||
| 1954 | |||||||
| 1955 | Message: 2 | ||||||
| 1956 | From:     James E Keenan | ||||||
| 1957 | To:       London Perlmongers | ||||||
| 1958 | Date: Fri, 2 Jan 2004 23:41:01 -0500 | ||||||
| 1959 | Subject: re: language courses | ||||||
| 1960 | Reply-To: london.pm@london.pm.org | ||||||
| 1961 | |||||||
| 1962 | On Fri, 2 Jan 2004 22:38:40 +0000 (GMT), Ali Young wrote concerning: | ||||||
| 1963 | language courses | ||||||
| 1964 | |||||||
| 1965 | > Depends what you count as useful. Learning Esperanto means that you | ||||||
| 1966 | > can read the current London.pm website. | ||||||
| 1967 | |||||||
| 1968 | BTW, wasn't the Esperanto on the website supposed to expire on 31 Dec? | ||||||
| 1969 | |||||||
| 1970 | Jim Keenan | ||||||
| 1971 | Brooklyn, NY | ||||||
| 1972 | |||||||
| 1973 | You would call the function as follows: | ||||||
| 1974 | |||||||
| 1975 | $full_reply_file = reply_to_digest_message( | ||||||
| 1976 | \%config_in, | ||||||
| 1977 | \%config_out, | ||||||
| 1978 | 1814, | ||||||
| 1979 | 2, | ||||||
| 1980 | '/home/jimk/mail/digest/london', | ||||||
| 1981 | ); | ||||||
| 1982 | |||||||
| 1983 | Mail::Digest::Tools will then create a plain-text file which you can use as | ||||||
| 1984 | the first draft of your reply. It will print this screen prompt: | ||||||
| 1985 | |||||||
| 1986 | To complete reply, edit text in: | ||||||
| 1987 | /home/jimk/mail/digest/london/language_courses.reply.txt | ||||||
| 1988 | |||||||
| 1989 | When you open F | ||||||
| 1990 | like this: | ||||||
| 1991 | |||||||
| 1992 | Reply-To: | ||||||
| 1993 | london.pm@london.pm.org | ||||||
| 1994 | |||||||
| 1995 | Subject: | ||||||
| 1996 | language courses | ||||||
| 1997 | |||||||
| 1998 | On Fri, 2 Jan 2004 23:41:01 -0500, James E Keenan | ||||||
| 1999 |  | ||||||
| 2000 | |||||||
| 2001 | > On Fri, 2 Jan 2004 22:38:40 +0000 (GMT), Ali Young wrote concerning: | ||||||
| 2002 | > language courses | ||||||
| 2003 | > | ||||||
| 2004 | > > Depends what you count as useful. Learning Esperanto means that you | ||||||
| 2005 | > can | ||||||
| 2006 | > > read the current London.pm website. | ||||||
| 2007 | > | ||||||
| 2008 | > BTW, wasn't the Esperanto on the website supposed to expire on 31 Dec? | ||||||
| 2009 | > | ||||||
| 2010 | > Jim Keenan | ||||||
| 2011 | > Brooklyn, NY | ||||||
| 2012 | > | ||||||
| 2013 | |||||||
| 2014 | The 'Reply-To' and 'Subject' paragraphs are provided simply to give you | ||||||
| 2015 | something to cut-and-paste into a GUI e-mail client. The 'Reply-To' | ||||||
| 2016 | paragraph will only appear if in C<%config_in> the key | ||||||
| 2017 | C | ||||||
| 2018 | |||||||
| 2019 | You edit this plain-text file, pop it into the body of your e-mail | ||||||
| 2020 | window and send it. Not elegant, but it at least gives you a first draft. | ||||||
| 2021 | |||||||
| 2022 | =back | ||||||
| 2023 | |||||||
| 2024 | =head2 C | ||||||
| 2025 | |||||||
| 2026 | repair_message_order( | ||||||
| 2027 | \%config_in, | ||||||
| 2028 | \%config_out, | ||||||
| 2029 | { | ||||||
| 2030 | year => 2004, | ||||||
| 2031 | month => 01, | ||||||
| 2032 | day => 27, | ||||||
| 2033 | } | ||||||
| 2034 | ); | ||||||
| 2035 | |||||||
| 2036 | From time to time you may receive digest versions of mailing lists out of | ||||||
| 2037 | chronological/numerical sequence. This is especially true when e-mail | ||||||
| 2038 | traffic is being disrupted by worms or viruses. You may discover that you | ||||||
| 2039 | have received and processed | ||||||
| 2040 | |||||||
| 2041 | london.pm digest, Vol 1 #1856 - 7 msgs | ||||||
| 2042 | london.pm digest, Vol 1 #1858 - 15 msgs | ||||||
| 2043 | |||||||
| 2044 | before realizing that you were missing | ||||||
| 2045 | |||||||
| 2046 | london.pm digest, Vol 1 #1857 - 18 msgs | ||||||
| 2047 | |||||||
| 2048 | If you were to now process digest 1857 with C | ||||||
| 2049 | from that digest would be appended to their respective thread files I | ||||||
| 2050 | messages from digest 1858. Since the whole point of Mail::Digest::Tools is to | ||||||
| 2051 | be able to read a discussion thread in chronological order, this would not be | ||||||
| 2052 | desirable. | ||||||
| 2053 | |||||||
| 2054 | Fortunately, you can fix this problem as follows: | ||||||
| 2055 | |||||||
| 2056 | =over 4 | ||||||
| 2057 | |||||||
| 2058 | =item * Apply C | ||||||
| 2059 | |||||||
| 2060 | Call C | ||||||
| 2061 | go ahead and call it on digest 1857 even though it creates thread files with | ||||||
| 2062 | messages out of chronological order. | ||||||
| 2063 | |||||||
| 2064 | =item * Determine date where need for repair begins | ||||||
| 2065 | |||||||
| 2066 | Examine the timestamps on your digest files for the date of the first digest | ||||||
| 2067 | you received out of sequence. In the above example, that would be the date | ||||||
| 2068 | of digest 1858. Since digest files were received out of proper sequence on or | ||||||
| 2069 | after that date, all thread files generated after that date may have | ||||||
| 2070 | out-of-sequence messages and need re-ordering. | ||||||
| 2071 | |||||||
| 2072 | =item * Apply C | ||||||
| 2073 | |||||||
| 2074 | Call C | ||||||
| 2075 | |||||||
| 2076 | =over 4 | ||||||
| 2077 | |||||||
| 2078 | =item * | ||||||
| 2079 | |||||||
| 2080 | a reference to the 'in' configuration hash | ||||||
| 2081 | |||||||
| 2082 | =item * | ||||||
| 2083 | |||||||
| 2084 | a reference to the 'out' configuration hash | ||||||
| 2085 | |||||||
| 2086 | =item * | ||||||
| 2087 | |||||||
| 2088 | a reference to an anonymous hash whose keys are C | ||||||
| 2089 | the values for which keys are the elements of the repair date. | ||||||
| 2090 | |||||||
| 2091 | =back | ||||||
| 2092 | |||||||
| 2093 | Mail::Digest::Tools will examine all thread files from midnight local time on | ||||||
| 2094 | that date. Where messages have been posted to the thread files out of proper | ||||||
| 2095 | sequence, they will be reposted in the correct order. The thread file with | ||||||
| 2096 | the correct sequence will overwrite the file with the incorrect sequence. | ||||||
| 2097 | |||||||
| 2098 | =back | ||||||
| 2099 | |||||||
| 2100 | =head2 C | ||||||
| 2101 | |||||||
| 2102 | consolidate_threads_multiple( | ||||||
| 2103 | \%config_in, | ||||||
| 2104 | \%config_out, | ||||||
| 2105 | ); | ||||||
| 2106 | |||||||
| 2107 | or | ||||||
| 2108 | |||||||
| 2109 | consolidate_threads_multiple( | ||||||
| 2110 | \%config_in, | ||||||
| 2111 | \%config_out, | ||||||
| 2112 | $first_common_letters, # optional integer argument | ||||||
| 2113 | ); | ||||||
| 2114 | |||||||
| 2115 | As described above, Mail::Digest::Tool's C | ||||||
| 2116 | will, to the greatest extent possible, delete extraneous words such as 'Re:' | ||||||
| 2117 | or 'Fwd:' from a message's subject so that all relevant postings on a given | ||||||
| 2118 | subject can be included in a single thread file. What happens when this is | ||||||
| 2119 | not sufficient? For example, suppose someone posts a message to a list with a | ||||||
| 2120 | slightly misspelled or altered subject line: | ||||||
| 2121 | |||||||
| 2122 | =over 4 | ||||||
| 2123 | |||||||
| 2124 | =item * Original thread file: | ||||||
| 2125 | |||||||
| 2126 | Help telnetting to remote host through CGI.thr.txt | ||||||
| 2127 | |||||||
| 2128 | =item * Thread file created due to altered subject line: | ||||||
| 2129 | |||||||
| 2130 | Help telnetting to remote host thru CGI.thr.txt | ||||||
| 2131 | |||||||
| 2132 | =back | ||||||
| 2133 | |||||||
| 2134 | Mail::Digest::Tools offers two functions to address this problem. | ||||||
| 2135 | C | ||||||
| 2136 | first. This function presumes that people who re-type e-mail subject lines | ||||||
| 2137 | when replying tend to type the first several words correctly, then make errors | ||||||
| 2138 | or alterations toward the end of the subject line.  If the first I | ||||||
| 2139 | of the subject line of two or more messages are identical, there is a strong | ||||||
| 2140 | chance that the messages are discussing the same topic and should be posted to | ||||||
| 2141 | the same discussion thread.  Mail::Digest::Tool's default value for I | ||||||
| 2142 | 20, but you can set a different value for a particular digest by passing an | ||||||
| 2143 | optional third argument as shown above.  C | ||||||
| 2144 | accordingly: | ||||||
| 2145 | |||||||
| 2146 | =over 4 | ||||||
| 2147 | |||||||
| 2148 | =item * | ||||||
| 2149 | |||||||
| 2150 | Makes a list of all thread files for a particular digest. | ||||||
| 2151 | |||||||
| 2152 | =item * | ||||||
| 2153 | |||||||
| 2154 | Identifies groups of thread files whose names share the first 20 letters. | ||||||
| 2155 | |||||||
| 2156 | =item * | ||||||
| 2157 | |||||||
| 2158 | Displays a prompt on standard output asking you whether you wish to | ||||||
| 2159 | consolidate the files in each such group: | ||||||
| 2160 | |||||||
| 2161 | Candidates for consolidation: | ||||||
| 2162 | Help telnetting to remote host through CGI.thr.txt | ||||||
| 2163 | Help telnetting to remote host thru CGI.thr.txt | ||||||
| 2164 | |||||||
| 2165 | To consolidate, type YES: | ||||||
| 2166 | |||||||
| 2167 | =over 4 | ||||||
| 2168 | |||||||
| 2169 | =item * | ||||||
| 2170 | |||||||
| 2171 | If you type C | ||||||
| 2172 | thread file whose name will be derived from the Subject line of the very first | ||||||
| 2173 | posting to the discussion thread. Standard output will display: | ||||||
| 2174 | |||||||
| 2175 | Files will be consolidated | ||||||
| 2176 | |||||||
| 2177 | =item * | ||||||
| 2178 | |||||||
| 2179 | If you type anything other than C | ||||||
| 2180 | then the files will not be consolidated and standard output will display: | ||||||
| 2181 | |||||||
| 2182 | Files will not be consolidated | ||||||
| 2183 | |||||||
| 2184 | =item * | ||||||
| 2185 | |||||||
| 2186 | If the files are consolidated, the original thread files will not automatically | ||||||
| 2187 | be deleted. Rather, they are renamed with the extension C<.DELETABLE>. | ||||||
| 2188 | |||||||
| 2189 | Help telnetting to remote host through CGI.thr.txt.DELETABLE | ||||||
| 2190 | Help telnetting to remote host thru CGI.thr.txt.DELETABLE | ||||||
| 2191 | |||||||
| 2192 | This is a safety precaution. The user can then delete the deletable files | ||||||
| 2193 | by calling the C | ||||||
| 2194 | |||||||
| 2195 | =back | ||||||
| 2196 | |||||||
| 2197 | =item * | ||||||
| 2198 | |||||||
| 2199 | If there are no files in the threads directory which share the first 20 letters | ||||||
| 2200 | in common (or the first I | ||||||
| 2201 | argument), then you are warned at standard output: | ||||||
| 2202 | |||||||
| 2203 | Analysis of the first 20 letters of each file in | ||||||
| 2204 | [threads directory] | ||||||
| 2205 | shows no candidates for consolidation. Please hard-code | ||||||
| 2206 | names of files you wish to consolidate as arguments to | ||||||
| 2207 | &consolidate_threads_single | ||||||
| 2208 | |||||||
| 2209 | =back | ||||||
| 2210 | |||||||
| 2211 | =head2 C | ||||||
| 2212 | |||||||
| 2213 | consolidate_threads_single( | ||||||
| 2214 | \%config_in, | ||||||
| 2215 | \%config_out, | ||||||
| 2216 | [ | ||||||
| 2217 | 'first_dummy_file_for_consolidation.thr.txt', | ||||||
| 2218 | 'second_dummy_file_for_consolidation.thr.txt', | ||||||
| 2219 | ], | ||||||
| 2220 | ); | ||||||
| 2221 | |||||||
| 2222 | Suppose that the thread files which you wish to consolidate have names whose | ||||||
| 2223 | spelling diverges before the 21st letter. The algorithm which | ||||||
| 2224 | C | ||||||
| 2225 | rationale for consolidation. This could happen when someone tries to change | ||||||
| 2226 | the subject of discussion from: | ||||||
| 2227 | |||||||
| 2228 | Best book for extreme Newbie to programming | ||||||
| 2229 | |||||||
| 2230 | to: | ||||||
| 2231 | |||||||
| 2232 | De incunabula nostra (Was Best book for extreme Newbie to programming) | ||||||
| 2233 | |||||||
| 2234 | I | ||||||
| 2235 | anonymous array. Pass a reference to that anonymous array as the third | ||||||
| 2236 | argument to C | ||||||
| 2237 | |||||||
| 2238 | As with C | ||||||
| 2239 | will bear the name of the source file containing the very first posting to | ||||||
| 2240 | the discussion thread. The files so consolidated will not automatically be | ||||||
| 2241 | deleted. Rather, they will be renamed with the extension C<.DELETABLE> as a | ||||||
| 2242 | safety precaution and left for you to delete with C | ||||||
| 2243 | |||||||
| 2244 | =head2 C | ||||||
| 2245 | |||||||
| 2246 | delete_deletables(\%config_out); | ||||||
| 2247 | |||||||
| 2248 | Mail::Digest::Tools function C | ||||||
| 2249 | either C | ||||||
| 2250 | Unlike all other public functions provided by Mail::Digest::Tools, | ||||||
| 2251 | C | ||||||
| 2252 | two configuration hashes, I | ||||||
| 2253 | function simply changes to the directory where thread files for a given | ||||||
| 2254 | digest are stored and deletes all files with the extension C<.DELETABLE>. | ||||||
| 2255 | |||||||
| 2256 | =head1 CONFIGURATION SETUP OVERVIEW | ||||||
| 2257 | |||||||
| 2258 | To use a Mail::Digest::Tool function, you need to answer two fundamental | ||||||
| 2259 | questions: | ||||||
| 2260 | |||||||
| 2261 | =over 4 | ||||||
| 2262 | |||||||
| 2263 | =item 1 | ||||||
| 2264 | |||||||
| 2265 | What internal structure has the mailing list sponsor provided for a given | ||||||
| 2266 | digest? | ||||||
| 2267 | |||||||
| 2268 | =item 2 | ||||||
| 2269 | |||||||
| 2270 | How do I want to structure the results of applying Mail::Digest::Tools to a | ||||||
| 2271 | particular digest on my system? | ||||||
| 2272 | |||||||
| 2273 | =back | ||||||
| 2274 | |||||||
| 2275 | Each of these two questions breaks down into sub-parts. Their answers | ||||||
| 2276 | supply you with the information with which you will construct the two | ||||||
| 2277 | configuration hashes passed to most Mail::Digest::Tools functions. | ||||||
| 2278 | Let us take each in turn. | ||||||
| 2279 | |||||||
| 2280 | =head1 C<%config_in>: THE INTERNAL STRUCTURE OF A DIGEST | ||||||
| 2281 | |||||||
| 2282 | The best way to learn about the internal structure of a mailing list digest | ||||||
| 2283 | (other than to study the application which created the digest in the first | ||||||
| 2284 | place) is to accumulate several instances of the digest on your system in a | ||||||
| 2285 | directory devoted to that purpose. Examine the way the digest's filename is | ||||||
| 2286 | formed. Then examine the digest file itself. You will soon pick up a feel | ||||||
| 2287 | for the structure of the digest, which will guide you in configuring | ||||||
| 2288 | Mail::Digest::Tools for your system. That configuration will take the form | ||||||
| 2289 | of a Perl hash which, for illustrative purposes, we shall here call | ||||||
| 2290 | C<%xxx_config_in> where C | ||||||
| 2291 | |||||||
| 2292 | For heuristic purposes we will examine the characteristics of two mailing | ||||||
| 2293 | list digests which the author has been following and archiving for several | ||||||
| 2294 | years: ActiveState's 'Perl-Win32-Users' digest and Yahoo! Groups' Perl | ||||||
| 2295 | Beginners group digest. | ||||||
| 2296 | |||||||
| 2297 | =head2 Analysis of Digest's File Name | ||||||
| 2298 | |||||||
| 2299 | We must study a digest's file name in order to be able to write a pattern | ||||||
| 2300 | with which we will be able to distinguish a digest file from any non-digest | ||||||
| 2301 | file sitting in the same directory, as well as to be able to extract the | ||||||
| 2302 | digest number from that file name. | ||||||
| 2303 | |||||||
| 2304 | Once saved as plain-text files, Perl-Win32-Users digest files typically look | ||||||
| 2305 | like this in a directory: | ||||||
| 2306 | |||||||
| 2307 | Perl-Win32-Users Digest, Vol 1 Issue 1771.txt | ||||||
| 2308 | Perl-Win32-Users Digest, Vol 1 Issue 1772.txt | ||||||
| 2309 | |||||||
| 2310 | Similarly, the Perl Beginner digest files look like this: | ||||||
| 2311 | |||||||
| 2312 | [PBML] Digest Number 1491.txt | ||||||
| 2313 | [PBML] Digest Number 1492.txt | ||||||
| 2314 | |||||||
| 2315 | To correctly identify Perl-Win32-Users digest files from any other files in | ||||||
| 2316 | the same directory, we compose a string which would form the core of a Perl | ||||||
| 2317 | regular expression, I | ||||||
| 2318 | delimiters. Internally, Mail::Digest::Tools passes the file name through a | ||||||
| 2319 | C | ||||||
| 2320 | |||||||
| 2321 | %pw32u_config_in = ( | ||||||
| 2322 | grep_formula => 'Perl-Win32-Users Digest', | ||||||
| 2323 | ... | ||||||
| 2324 | ); | ||||||
| 2325 | |||||||
| 2326 | The equivalent pattern for the Perl Beginners digest would be: | ||||||
| 2327 | |||||||
| 2328 | %pbml_config_in = ( | ||||||
| 2329 | grep_formula => '\[PBML\]', | ||||||
| 2330 | ... | ||||||
| 2331 | ); | ||||||
| 2332 | |||||||
| 2333 | Note that the C<[> and C<]> characters have to be escaped with a C<\> | ||||||
| 2334 | backslash because they are normally metacharacters inside Perl regular | ||||||
| 2335 | expressions. | ||||||
| 2336 | |||||||
| 2337 | We next have to extract the digest number from the digest's file name. | ||||||
| 2338 | Certain mailing list programs give individual digests both a 'Volume' number | ||||||
| 2339 | as well as an individual digest number. Perl-Win32-Users typifies this. In | ||||||
| 2340 | the example above we need to capture both the C<1> as volume number and C<1771> | ||||||
| 2341 | as digest number. The next key in our configuration hash is called | ||||||
| 2342 | C | ||||||
| 2343 | |||||||
| 2344 | %pw32u_config_in = ( | ||||||
| 2345 | grep_formula => 'Perl-Win32-Users Digest', | ||||||
| 2346 | pattern_target => '.*Vol\s(\d+),\sIssue\s(\d+)\.txt', | ||||||
| 2347 | ... | ||||||
| 2348 | ); | ||||||
| 2349 | |||||||
| 2350 | Note the two sets of capturing parentheses. | ||||||
| 2351 | |||||||
| 2352 | Other digests, such as those at Yahoo! Groups, dispense with a volume number | ||||||
| 2353 | and simply increment each digest number: | ||||||
| 2354 | |||||||
| 2355 | %pbml_config_in = ( | ||||||
| 2356 | grep_formula => '\[PBML\]', | ||||||
| 2357 | pattern_target => '.*\s(\d+)\.txt$', | ||||||
| 2358 | ... | ||||||
| 2359 | ); | ||||||
| 2360 | |||||||
| 2361 | Note that this C | ||||||
| 2362 | parentheses. | ||||||
| 2363 | |||||||
| 2364 | =head2 Analysis of Digest's Internal Structure | ||||||
| 2365 | |||||||
| 2366 | A digest's internal structure is discussed in detail above (see | ||||||
| 2367 | 'A TYPICAL MAILING LIST DIGEST'). Here we need to identify two | ||||||
| 2368 | characteristics: the way the digest introduces its list of today's topics | ||||||
| 2369 | and the string it uses to delimit the list of today's topics from the first | ||||||
| 2370 | individual message in the digest and all subsequent messages from one another. | ||||||
| 2371 | Continuing with our two examples from above, we provide values for keys | ||||||
| 2372 | C | ||||||
| 2373 | |||||||
| 2374 | %pw32u_config_in = ( | ||||||
| 2375 | grep_formula => 'Perl-Win32-Users digest', | ||||||
| 2376 | pattern_target => '.*Vol\s(\d+),\sIssue\s(\d+)\.txt', | ||||||
| 2377 | topics_intro => 'Today\'s Topics:', | ||||||
| 2378 | source_msg_delimiter => "--__--__--\n\n", | ||||||
| 2379 | ... | ||||||
| 2380 | ); | ||||||
| 2381 | |||||||
| 2382 | Note the escaped C<'> apostrophe character in the value for key | ||||||
| 2383 | C | ||||||
| 2384 | |||||||
| 2385 | %pbml_config_in = ( | ||||||
| 2386 | grep_formula => '\[PBML\]', | ||||||
| 2387 | pattern_target => '.*\s(\d+)\.txt$', | ||||||
| 2388 | topics_intro => 'Topics in this digest:', | ||||||
| 2389 | source_msg_delimiter => "________________________________________________________________________\n________________________________________________________________________\n\n", | ||||||
| 2390 | ... | ||||||
| 2391 | ); | ||||||
| 2392 | |||||||
| 2393 | Note that the values provided for the respective C | ||||||
| 2394 | had to be double-quoted strings. That's because all such delimiters include | ||||||
| 2395 | two or more C<\n> newline characters so that they form paragraphs unto | ||||||
| 2396 | themselves. Unless indicated otherwise, the values for all other values in | ||||||
| 2397 | the configuration hash are single-quoted strings. | ||||||
| 2398 | |||||||
| 2399 | Note: In early 2004, while Mail::Digest::Tools was being prepared for its | ||||||
| 2400 | initial distribution on CPAN, ActiveState changed certain features in the | ||||||
| 2401 | daily digest versions of its mailing lists. Hence, the code example presented | ||||||
| 2402 | above should not be 'copied-and-pasted' into a configuration hash with which | ||||||
| 2403 | you, the user, might follow the current Perl-Win32-Users digest. In | ||||||
| 2404 | particular, the source message delimiter was changed to a string of 30 | ||||||
| 2405 | hyphens followed by 2 C<\n> newline characters: | ||||||
| 2406 | |||||||
| 2407 | "------------------------------\n\n" | ||||||
| 2408 | |||||||
| 2409 | However, since it is not unheard of for contributors to a mailing list to use | ||||||
| 2410 | such a string of hyphens within their postings or signatures, using a string | ||||||
| 2411 | of hyphens is not a particularly apt choice for a source message delimiter. | ||||||
| 2412 | In this particular case, the author is getting better (but not fully tested) | ||||||
| 2413 | results by including an additional newline I | ||||||
| 2414 | order to more uniquely identify the source message delimiter: | ||||||
| 2415 | |||||||
| 2416 | "\n------------------------------\n\n" | ||||||
| 2417 | |||||||
| 2418 | =head2 Analysis of Individual Messages | ||||||
| 2419 | |||||||
| 2420 | The internal structure of an individual message within a digest is also | ||||||
| 2421 | discussed in detail above. Here we need to identify patterns with which we | ||||||
| 2422 | can extract the content of the message's headers. | ||||||
| 2423 | |||||||
| 2424 | Certain mailing list digest programs allow a wide variety of headers to appear | ||||||
| 2425 | in digested messages. The Perl-Win32-Users digest typifies this. Each | ||||||
| 2426 | message in a Perl-Win32_Users digest I | ||||||
| 2427 | for the message's author, recipients, subject and date. | ||||||
| 2428 | |||||||
| 2429 | Message: 1 | ||||||
| 2430 | From: Chris Smithson | ||||||
| 2431 | To: "'Carter Kraus'" | ||||||
| 2432 | "Perl-Win32-Users (E-mail)" | ||||||
| 2433 | Subject: RE: OO Perl Issue. | ||||||
| 2434 | Date: Wed, 4 Feb 2004 14:17:24 -0600 | ||||||
| 2435 | |||||||
| 2436 | But a message in this digest may have additional headers for the author's | ||||||
| 2437 | organization, reply address and/or carbon-copy recipients. | ||||||
| 2438 | |||||||
| 2439 | Message: 5 | ||||||
| 2440 | Date: Wed, 4 Feb 2004 15:15:44 -0800 | ||||||
| 2441 | From: Sam Spade | ||||||
| 2442 | Organization: Some Web Address | ||||||
| 2443 | Reply-To: Sam Spade | ||||||
| 2444 | To: "Time" | ||||||
| 2445 | CC: "Perl List" | ||||||
| 2446 | Subject: Re: New IE Update causes script problems | ||||||
| 2447 | |||||||
| 2448 | Patterns are easily developed to capture this information and store it in the | ||||||
| 2449 | configuration hash: | ||||||
| 2450 | |||||||
| 2451 | %pw32u_config_in = ( | ||||||
| 2452 | grep_formula => 'Perl-Win32-Users digest', | ||||||
| 2453 | pattern_target => '.*Vol\s(\d+),\sIssue\s(\d+)\.txt', | ||||||
| 2454 | topics_intro => 'Today\'s Topics:', | ||||||
| 2455 | source_msg_delimiter => "--__--__--\n\n", | ||||||
| 2456 | message_style_flag => '^Message:\s+(\d+)$', | ||||||
| 2457 | from_style_flag => '^From:\s+(.+)$', | ||||||
| 2458 | org_style_flag => '^Organization:\s+(.+)$', | ||||||
| 2459 | to_style_flag => '^To:\s+(.+)$', | ||||||
| 2460 | cc_style_flag => '^CC:\s+(.+)$', | ||||||
| 2461 | subject_style_flag => '^Subject:\s+(.+)$', | ||||||
| 2462 | date_style_flag => '^Date:\s+(.+)$', | ||||||
| 2463 | reply_to_style_flag => '^Reply-To:\s+(.+)$', | ||||||
| 2464 | ... | ||||||
| 2465 | ); | ||||||
| 2466 | |||||||
| 2467 | Other mailing list digest programs allow much fewer headers in digested | ||||||
| 2468 | messages. The Yahoo! Groups digests such as Perl Beginner typify this. | ||||||
| 2469 | |||||||
| 2470 | Message: 4 | ||||||
| 2471 | Date: Sun, 7 Dec 2003 19:24:03 +1100 | ||||||
| 2472 | From: Philip Streets | ||||||
| 2473 | Subject: RH9.0, perl 5.8.2 and qmail-localfilter question | ||||||
| 2474 | |||||||
| 2475 | The patterns developed to capture this information and store it in the | ||||||
| 2476 | configuration hash would be as follows: | ||||||
| 2477 | |||||||
| 2478 | %pbml_config_in = ( | ||||||
| 2479 | grep_formula => '\[PBML\]', | ||||||
| 2480 | pattern_target => '.*\s(\d+)\.txt$', | ||||||
| 2481 | topics_intro => 'Topics in this digest:', | ||||||
| 2482 | source_msg_delimiter => "________________________________________________________________________\n________________________________________________________________________\n\n", | ||||||
| 2483 | message_style_flag => '^Message:\s+(\d+)$', | ||||||
| 2484 | from_style_flag => '^\s+From:\s+(.+)$', | ||||||
| 2485 | subject_style_flag => '^Subject:\s+(.+)$', | ||||||
| 2486 | date_style_flag => '^\s+Date:\s+(.+)$', | ||||||
| 2487 | ... | ||||||
| 2488 | ); | ||||||
| 2489 | |||||||
| 2490 | Note that this pattern is written to expect 1 or more whitespaces at the | ||||||
| 2491 | beginning of the C | ||||||
| 2492 | |||||||
| 2493 | We could -- but do not need to -- add the following key-value pairs to the | ||||||
| 2494 | C<%pbml_config_in> hash. | ||||||
| 2495 | |||||||
| 2496 | org_style_flag => undef, | ||||||
| 2497 | to_style_flag => undef, | ||||||
| 2498 | cc_style_flag => undef, | ||||||
| 2499 | reply_to_style_flag => undef, | ||||||
| 2500 | |||||||
| 2501 | =head2 Inspection of Messages for Multipart MIME Content | ||||||
| 2502 | |||||||
| 2503 | Certain mailing lists allow subscribers to post messages in either plain-text | ||||||
| 2504 | or HTML. Certain lists allow subscribers to post attachments; others do not. | ||||||
| 2505 | When it comes to preparing digests of these messages, the programs which | ||||||
| 2506 | different lists take lead to different results. The most annoying situation | ||||||
| 2507 | occurs when a list allows a subscriber to post in 'multipart MIME format' and | ||||||
| 2508 | then fails to strip out the redundant HTML part after printing the needed | ||||||
| 2509 | plain-text part. | ||||||
| 2510 | |||||||
| 2511 | I | ||||||
| 2512 | list digest. (ActiveState changed the format of its digests in early 2004 to | ||||||
| 2513 | strip out HTML attachments. Hence, the following code no longer accurately | ||||||
| 2514 | represents what a subscriber to an ActiveState digest will see. Other mailing | ||||||
| 2515 | lists still suffer from MIME bloat, however, so treat the following code as | ||||||
| 2516 | illustrative.) The message begins: | ||||||
| 2517 | |||||||
| 2518 | Message: 1 | ||||||
| 2519 | To: Perl-Win32-Users@activestate.com | ||||||
| 2520 | Subject: Can not tie STDOUT to scolled Tk widget | ||||||
| 2521 | From: John_Wonderman@some.web.address.ca | ||||||
| 2522 | Date: Thu, 15 Jan 2004 16:25:17 -0500 | ||||||
| 2523 | This is a multipart message in MIME format. | ||||||
| 2524 | --=_alternative 00750F0485256E1C_= | ||||||
| 2525 | Content-Type: text/plain; charset="US-ASCII" | ||||||
| 2526 | Hi; | ||||||
| 2527 | I am trying to implement a scrolling text widget to capture output for for | ||||||
| 2528 | at tk app. Without scrolling: | ||||||
| 2529 | my $text = $mw->Text(-width => 78, | ||||||
| 2530 | -height => 32, | ||||||
| 2531 | -wrap => 'word', | ||||||
| 2532 | -font => ['Courier New','11'] | ||||||
| 2533 | )->pack(-side => 'bottom', | ||||||
| 2534 | -expand => 1, | ||||||
| 2535 | -fill => 'both', | ||||||
| 2536 | ); | ||||||
| 2537 | ... | ||||||
| 2538 | |||||||
| 2539 | When the plain-text part of the message is finished, it is then repeated in | ||||||
| 2540 | HTML: | ||||||
| 2541 | |||||||
| 2542 | --=_alternative 00750F0485256E1C_= | ||||||
| 2543 | Content-Type: text/html; charset="US-ASCII" | ||||||
| 2544 | Hi; | ||||||
| 2545 | I am trying to implement a scrolling text | ||||||
| 2546 | widget to capture output for for at tk app. Without scrolling: | ||||||
| 2547 | my $text = $mw->Text(-width | ||||||
| 2548 | => 78, | ||||||
| 2549 |  | ||||||
| 2550 | -height => 32, | ||||||
| 2551 |  | ||||||
| 2552 | -wrap => 'word', | ||||||
| 2553 |  | ||||||
| 2554 | -font => ['Courier New','11'] | ||||||
| 2555 | )->pack(-side => | ||||||
| 2556 | 'bottom', | ||||||
| 2557 |  | ||||||
| 2558 | -expand => 1, | ||||||
| 2559 |  | ||||||
| 2560 | -fill => 'both', | ||||||
| 2561 | |||||||
| 2562 | There is no reason to retain this bloat in your thread file. The digest | ||||||
| 2563 | providers should have stripped it out, but the program they were using failed | ||||||
| 2564 | to do so. Other digests, such as those at Yahoo! Groups, eliminate all this | ||||||
| 2565 | blather. | ||||||
| 2566 | |||||||
| 2567 | Now, with Mail::Digest::Tools, you can eliminate much of the bloat yourself. | ||||||
| 2568 | After examining 6-10 instances of a particular mailing list digest, you should | ||||||
| 2569 | be able to determine whether the digest needs a dose of digital castor oil or | ||||||
| 2570 | not, and you set key C | ||||||
| 2571 | unnecessary multipart MIME content, you set this flag to C<1>; otherwise, to | ||||||
| 2572 | C<0>. | ||||||
| 2573 | |||||||
| 2574 | And with that you have completed your analysis of the internal structure of a | ||||||
| 2575 | given digest and entered the relevant information into the first configuration | ||||||
| 2576 | hash: | ||||||
| 2577 | |||||||
| 2578 | %pw32u_config_in = ( | ||||||
| 2579 | grep_formula => 'Perl-Win32-Users digest', | ||||||
| 2580 | pattern_target => '.*Vol\s(\d+),\sIssue\s(\d+)\.txt', | ||||||
| 2581 | topics_intro => 'Today\'s Topics:', | ||||||
| 2582 | source_msg_delimiter => "--__--__--\n\n", | ||||||
| 2583 | message_style_flag => '^Message:\s+(\d+)$', | ||||||
| 2584 | from_style_flag => '^From:\s+(.+)$', | ||||||
| 2585 | org_style_flag => '^Organization:\s+(.+)$', | ||||||
| 2586 | to_style_flag => '^To:\s+(.+)$', | ||||||
| 2587 | cc_style_flag => '^CC:\s+(.+)$', | ||||||
| 2588 | subject_style_flag => '^Subject:\s+(.+)$', | ||||||
| 2589 | date_style_flag => '^Date:\s+(.+)$', | ||||||
| 2590 | reply_to_style_flag => '^Reply-To:\s+(.+)$', | ||||||
| 2591 | MIME_cleanup_flag => 1, | ||||||
| 2592 | ); | ||||||
| 2593 | |||||||
| 2594 | %pbml_config_in = ( | ||||||
| 2595 | grep_formula => '\[PBML\]', | ||||||
| 2596 | pattern_target => '.*\s(\d+)\.txt$', | ||||||
| 2597 | topics_intro => 'Topics in this digest:', | ||||||
| 2598 | source_msg_delimiter => "________________________________________________________________________\n________________________________________________________________________\n\n", | ||||||
| 2599 | message_style_flag => '^Message:\s+(\d+)$', | ||||||
| 2600 | from_style_flag => '^\s+From:\s+(.+)$', | ||||||
| 2601 | subject_style_flag => '^Subject:\s+(.+)$', | ||||||
| 2602 | date_style_flag => '^\s+Date:\s+(.+)$', | ||||||
| 2603 | MIME_cleanup_flag => 0, | ||||||
| 2604 | ); | ||||||
| 2605 | |||||||
| 2606 | =head1 C<%config_out>: HOW TO PROCESS A DIGEST ON YOUR SYSTEM | ||||||
| 2607 | |||||||
| 2608 | C<%config_in> holds the answers to the question: What internal structure has | ||||||
| 2609 | the mailing list sponsor provided for a given digest? In contrast, | ||||||
| 2610 | C<%config_out> will hold the answer to this question: How do I want to | ||||||
| 2611 | structure the results of applying Mail::Digest::Tools to a particular digest | ||||||
| 2612 | on my system? | ||||||
| 2613 | |||||||
| 2614 | For purpose of illustration, we will continue to assume that we are processing | ||||||
| 2615 | digest files received from the Perl-Win32-Users and Perl Beginner lists. We | ||||||
| 2616 | will make slightly different choices as to how we process those digest files | ||||||
| 2617 | so as to illustrate different options available from Mail::Digest::Tools. | ||||||
| 2618 | |||||||
| 2619 | We shall also assume that we going to place the scripts from which we call | ||||||
| 2620 | Mail::Digest::Tools functions in the directory I | ||||||
| 2621 | which we store the digest files once they have been saved as plain-text files. | ||||||
| 2622 | If we call this directory C | ||||||
| 2623 | then we will have a directory structure that starts out like this: | ||||||
| 2624 | |||||||
| 2625 | digest/ | ||||||
| 2626 | process_new.pl | ||||||
| 2627 | process_ALL.pl | ||||||
| 2628 | reply_digest_message.pl | ||||||
| 2629 | repair_digest_order.pl | ||||||
| 2630 | consolidate_threads.pl | ||||||
| 2631 | deletables.pl | ||||||
| 2632 | pw32u/ | ||||||
| 2633 | Perl-Win32-Users Digest, Vol 1 Issue 1771.txt | ||||||
| 2634 | Perl-Win32-Users Digest, Vol 1 Issue 1772.txt | ||||||
| 2635 | pbml/ | ||||||
| 2636 | [PBML] Digest Number 1491.txt | ||||||
| 2637 | [PBML] Digest Number 1492.txt | ||||||
| 2638 | |||||||
| 2639 | =head2 Required C<%config_out> Keys | ||||||
| 2640 | |||||||
| 2641 | There are 9 keys which are required in C<%config_out> in order for | ||||||
| 2642 | Mail::Digest::Tools to function properly. They correspond to 9 decisions | ||||||
| 2643 | which you must make in setting up a Mail::Digest::Tools configuration on | ||||||
| 2644 | your system. | ||||||
| 2645 | |||||||
| 2646 | =over 4 | ||||||
| 2647 | |||||||
| 2648 | =item 1 Title | ||||||
| 2649 | |||||||
| 2650 | Each digest must be given a title which is used whenever Mail::Digest::Tools | ||||||
| 2651 | needs to prompt or warn you on standard output. The key which holds this | ||||||
| 2652 | information in C<%config_out> must be called C | ||||||
| 2653 | element should be sensible. | ||||||
| 2654 | |||||||
| 2655 | %pw32u_config_out = ( | ||||||
| 2656 | title => 'Perl-Win32-Users', | ||||||
| 2657 | ... | ||||||
| 2658 | ); | ||||||
| 2659 | |||||||
| 2660 | %pbml_config_out = ( | ||||||
| 2661 | title => 'Perl Beginner', | ||||||
| 2662 | ... | ||||||
| 2663 | ); | ||||||
| 2664 | |||||||
| 2665 | =item 2 Digest Directory | ||||||
| 2666 | |||||||
| 2667 | For each digest a directory must be designated where individual digest files | ||||||
| 2668 | are stored in plain-text format. The key which holds this information in | ||||||
| 2669 | C<%config_out> must be called C | ||||||
| 2670 | directories are named relative to the 'current' directory (C<..>), | ||||||
| 2671 | I | ||||||
| 2672 | Mail::Digest::Function is stored. | ||||||
| 2673 | |||||||
| 2674 | %pw32u_config_out = ( | ||||||
| 2675 | title => 'Perl-Win32-Users', | ||||||
| 2676 | dir_digest => "../pw32u", | ||||||
| 2677 | ... | ||||||
| 2678 | ); | ||||||
| 2679 | |||||||
| 2680 | %pbml_config_out = ( | ||||||
| 2681 | title => 'Perl Beginner', | ||||||
| 2682 | dir_digest => "../pbml", | ||||||
| 2683 | ... | ||||||
| 2684 | ); | ||||||
| 2685 | |||||||
| 2686 | =item 3 Threads Directory | ||||||
| 2687 | |||||||
| 2688 | For each digest a directory must be designated where the thread files created | ||||||
| 2689 | by use of Mail::Digest::Tools functions are stored. The key which holds this | ||||||
| 2690 | information in C<%config_out> must be called C | ||||||
| 2691 | below the threads directory is a subdirectory of the digest directory, but | ||||||
| 2692 | you may make other choices. | ||||||
| 2693 | |||||||
| 2694 | %pw32u_config_out = ( | ||||||
| 2695 | title => 'Perl-Win32-Users', | ||||||
| 2696 | dir_digest => "../pw32u", | ||||||
| 2697 | dir_threads => "../pw32u/Threads", | ||||||
| 2698 | ... | ||||||
| 2699 | ); | ||||||
| 2700 | |||||||
| 2701 | %pbml_config_out = ( | ||||||
| 2702 | title => 'Perl Beginner', | ||||||
| 2703 | dir_digest => "../pbml", | ||||||
| 2704 | dir_threads => "../pbml/Threads", | ||||||
| 2705 | ... | ||||||
| 2706 | ); | ||||||
| 2707 | |||||||
| 2708 | =item 4 Digests Log File | ||||||
| 2709 | |||||||
| 2710 | For each digest a file must be kept which logs whether a given digest file | ||||||
| 2711 | has already been processed or not and, if so, when. The key which holds this | ||||||
| 2712 | information in C<%config_out> must be called C | ||||||
| 2713 | found convenient to keep this file in the digests directory, but you may make | ||||||
| 2714 | other choices. | ||||||
| 2715 | |||||||
| 2716 | %pw32u_config_out = ( | ||||||
| 2717 | title => 'Perl-Win32-Users', | ||||||
| 2718 | dir_digest => "../pw32u", | ||||||
| 2719 | dir_threads => "../pw32u/Threads", | ||||||
| 2720 | digests_log => "../pw32u/digests_log.txt", | ||||||
| 2721 | ... | ||||||
| 2722 | ); | ||||||
| 2723 | |||||||
| 2724 | %pbml_config_out = ( | ||||||
| 2725 | title => 'Perl Beginner', | ||||||
| 2726 | dir_digest => "../pbml", | ||||||
| 2727 | dir_threads => "../pbml/Threads", | ||||||
| 2728 | digests_log => "../pbml/digests_log.txt", | ||||||
| 2729 | ... | ||||||
| 2730 | ); | ||||||
| 2731 | |||||||
| 2732 | =item 5 Today's Topics | ||||||
| 2733 | |||||||
| 2734 | For each digest a file must be kept which holds an ongoing record of the | ||||||
| 2735 | list of topics found in each individual digest file. The key which holds this | ||||||
| 2736 | information in C<%config_out> must be called | ||||||
| 2737 | found convenient to keep this file in the digests directory, but you may make | ||||||
| 2738 | other choices. | ||||||
| 2739 | |||||||
| 2740 | %pw32u_config_out = ( | ||||||
| 2741 | title => 'Perl-Win32-Users', | ||||||
| 2742 | dir_digest => "../pw32u", | ||||||
| 2743 | dir_threads => "../pw32u/Threads", | ||||||
| 2744 | digests_log => "../pw32u/digests_log.txt", | ||||||
| 2745 | todays_topics => "../pw32u/todays_topics.txt", | ||||||
| 2746 | ... | ||||||
| 2747 | ); | ||||||
| 2748 | |||||||
| 2749 | %pbml_config_out = ( | ||||||
| 2750 | title => 'Perl Beginner', | ||||||
| 2751 | dir_digest => "../pbml", | ||||||
| 2752 | dir_threads => "../pbml/Threads", | ||||||
| 2753 | digests_log => "../pbml/digests_log.txt", | ||||||
| 2754 | todays_topics => "../pbml/todays_topics.txt", | ||||||
| 2755 | ... | ||||||
| 2756 | ); | ||||||
| 2757 | |||||||
| 2758 | =item 6 Format for Identifying Digest Number in Output | ||||||
| 2759 | |||||||
| 2760 | For each digest you must choose how to format the number(s) of the individual | ||||||
| 2761 | digest file being processed when messages from that file are written to a | ||||||
| 2762 | threads file. What you are doing here is formatting the information captured | ||||||
| 2763 | by the C | ||||||
| 2764 | You express this choice as a single-quoted string which formats the data | ||||||
| 2765 | captured by Perl regular expression which in C | ||||||
| 2766 | formatting is done via the Perl C | ||||||
| 2767 | is assigned to be the value of C<%config_out> key | ||||||
| 2768 | |||||||
| 2769 | We saw above that digests from the Perl-Win32-Users list carried both a volume | ||||||
| 2770 | number and an individual digest number. | ||||||
| 2771 | |||||||
| 2772 | Perl-Win32-Users Digest, Vol 1 Issue 1771.txt | ||||||
| 2773 | Perl-Win32-Users Digest, Vol 1 Issue 1772.txt | ||||||
| 2774 | |||||||
| 2775 | Both numbers were captured by the Perl regular expression in | ||||||
| 2776 | C<%pw32u_config_in> key | ||||||
| 2777 | |||||||
| 2778 | '.*Vol\s(\d+),\sIssue\s(\d+)\.txt', | ||||||
| 2779 | |||||||
| 2780 | Here we have chosen to format the volume number as a 3-digit, 0-padded number | ||||||
| 2781 | and the individual digest number as a 4-digit, 0-padded number. We then join | ||||||
| 2782 | these two data with an underscore. | ||||||
| 2783 | |||||||
| 2784 | %pw32u_config_out = ( | ||||||
| 2785 | title => 'Perl-Win32-Users', | ||||||
| 2786 | dir_digest => "../pw32u", | ||||||
| 2787 | dir_threads => "../pw32u/Threads", | ||||||
| 2788 | digests_log => "../pw32u/digests_log.txt", | ||||||
| 2789 | todays_topics => "../pw32u/todays_topics.txt", | ||||||
| 2790 | id_format => 'sprintf("%03d",$1) . \'_\' . sprintf("%04d",$2)', | ||||||
| 2791 | ... | ||||||
| 2792 | ); | ||||||
| 2793 | |||||||
| 2794 | We saw above that digests from the Perl Beginners list carried only an | ||||||
| 2795 | digest number -- no volume number. | ||||||
| 2796 | |||||||
| 2797 | [PBML] Digest Number 1491.txt | ||||||
| 2798 | [PBML] Digest Number 1492.txt | ||||||
| 2799 | |||||||
| 2800 | This number was captured by the Perl regular expression in C<%pbml_config_in> | ||||||
| 2801 | key | ||||||
| 2802 | |||||||
| 2803 | '.*\s(\d+)\.txt$' | ||||||
| 2804 | |||||||
| 2805 | Here we have chosen to format the digest number as a 5-digit, 0-padded number. | ||||||
| 2806 | |||||||
| 2807 | %pbml_config_out = ( | ||||||
| 2808 | title => 'Perl Beginner', | ||||||
| 2809 | dir_digest => "../pbml", | ||||||
| 2810 | dir_threads => "../pbml/Threads", | ||||||
| 2811 | digests_log => "../pbml/digests_log.txt", | ||||||
| 2812 | todays_topics => "../pbml/todays_topics.txt", | ||||||
| 2813 | id_format => 'sprintf("%05d",$1)', | ||||||
| 2814 | ... | ||||||
| 2815 | ); | ||||||
| 2816 | |||||||
| 2817 | Note that if you allow for a 4-digit number, the highest numbered digest you | ||||||
| 2818 | can process off a given mailing list will be C<9999>. If you allow for a | ||||||
| 2819 | 5-digit number, the upper limit will be C<99999>. The latter should be | ||||||
| 2820 | sufficient for a lifetime even for a mailing list (I | ||||||
| 2821 | generates 3 or 4 digest files per day or over 1000 per year. | ||||||
| 2822 | |||||||
| 2823 | =item 7 Format for Numbering Individual Messages in Output | ||||||
| 2824 | |||||||
| 2825 | For each digest you must choose how to format the number which the digest | ||||||
| 2826 | assigns to its individual messages. Experience suggests that 2 digits should | ||||||
| 2827 | be more than sufficient to format this number, as all digests which the author | ||||||
| 2828 | has observed have fewer than 100 entries. However, below we have arbitrarily | ||||||
| 2829 | decided to allow for up to 9999 entries in a given digest. As with the digest | ||||||
| 2830 | number, the formatting is accomplished via the Perl C | ||||||
| 2831 | The result is stored in a C<%config_out> key which must be called | ||||||
| 2832 | C | ||||||
| 2833 | |||||||
| 2834 | %pw32u_config_out = ( | ||||||
| 2835 | title => 'Perl-Win32-Users', | ||||||
| 2836 | dir_digest => "../pw32u", | ||||||
| 2837 | dir_threads => "../pw32u/Threads", | ||||||
| 2838 | digests_log => "../pw32u/digests_log.txt", | ||||||
| 2839 | todays_topics => "../pw32u/todays_topics.txt", | ||||||
| 2840 | id_format => 'sprintf("%03d",$1) . | ||||||
| 2841 | \'_\' . sprintf("%04d",$2)', | ||||||
| 2842 | output_id_format => 'sprintf("%04d",$1)', | ||||||
| 2843 | ... | ||||||
| 2844 | ); | ||||||
| 2845 | |||||||
| 2846 | %pbml_config_out = ( | ||||||
| 2847 | title => 'Perl Beginner', | ||||||
| 2848 | dir_digest => "../pbml", | ||||||
| 2849 | dir_threads => "../pbml/Threads", | ||||||
| 2850 | digests_log => "../pbml/digests_log.txt", | ||||||
| 2851 | todays_topics => "../pbml/todays_topics.txt", | ||||||
| 2852 | id_format => 'sprintf("%05d",$1)', | ||||||
| 2853 | output_id_format => 'sprintf("%04d",$1)', | ||||||
| 2854 | ... | ||||||
| 2855 | ); | ||||||
| 2856 | |||||||
| 2857 | =item 8 Thread Message Delimiter | ||||||
| 2858 | |||||||
| 2859 | For each digest you must compose a string which will separate one message in | ||||||
| 2860 | a threads file from its successor. This string must be double-quoted and | ||||||
| 2861 | assigned to C<%config_out> key C | ||||||
| 2862 | string should terminate in two or more C<\n\n> newline characters so that the | ||||||
| 2863 | delimiter is always a paragraph unto itself. | ||||||
| 2864 | |||||||
| 2865 | This delimiter may -- or may not -- be the same string which the mailing list | ||||||
| 2866 | provider uses to separate messages in the digest files themselves. In other | ||||||
| 2867 | words, you may choose to use the same string for C | ||||||
| 2868 | C<%config_out> as you reported the list provider used in C<%config_in> key | ||||||
| 2869 | C | ||||||
| 2870 | |||||||
| 2871 | In the example below we make the C | ||||||
| 2872 | Perl-Win32-Users to be the same as its C | ||||||
| 2873 | |||||||
| 2874 | %pw32u_config_out = ( | ||||||
| 2875 | title => 'Perl-Win32-Users', | ||||||
| 2876 | dir_digest => "../pw32u", | ||||||
| 2877 | dir_threads => "../pw32u/Threads", | ||||||
| 2878 | digests_log => "../pw32u/digests_log.txt", | ||||||
| 2879 | todays_topics => "../pw32u/todays_topics.txt", | ||||||
| 2880 | id_format => 'sprintf("%03d",$1) . | ||||||
| 2881 | \'_\' . sprintf("%04d",$2)', | ||||||
| 2882 | output_id_format => 'sprintf("%04d",$1)', | ||||||
| 2883 | thread_msg_delimiter => "--__--__--\n\n", | ||||||
| 2884 | ... | ||||||
| 2885 | ); | ||||||
| 2886 | |||||||
| 2887 | Note: In light of the earlier discussion of the changes ActiveState made | ||||||
| 2888 | to its mailing list digests in early 2004, the reader is cautioned that the | ||||||
| 2889 | code above should not be directly 'copied-and-pasted' into a configuration | ||||||
| 2890 | hash with which you might follow an ActiveState mailing list. Treat it as | ||||||
| 2891 | educational. In particular, the author is now testing the following as a | ||||||
| 2892 | setting for C<$pw32u_config_out{'thread_msg_delimiter'}>: | ||||||
| 2893 | |||||||
| 2894 | "\n--__--__--\n\n", | ||||||
| 2895 | |||||||
| 2896 | For threads generated by appling Mail::Digest::Tools to the Perl | ||||||
| 2897 | Beginners list, we choose an output message delimiter which differs from the | ||||||
| 2898 | source message delimiter. | ||||||
| 2899 | |||||||
| 2900 | %pbml_config_out = ( | ||||||
| 2901 | title => 'Perl Beginner', | ||||||
| 2902 | dir_digest => "../pbml", | ||||||
| 2903 | dir_threads => "../pbml/Threads", | ||||||
| 2904 | digests_log => "../pbml/digests_log.txt", | ||||||
| 2905 | todays_topics => "../pbml/todays_topics.txt", | ||||||
| 2906 | id_format => 'sprintf("%05d",$1)', | ||||||
| 2907 | output_id_format => 'sprintf("%04d",$1)', | ||||||
| 2908 | thread_msg_delimiter => "_*_*_*_*_*_\n_*_*_*_*_*_\n\n\n", | ||||||
| 2909 | ... | ||||||
| 2910 | ); | ||||||
| 2911 | |||||||
| 2912 | Whatever choice you make for the C | ||||||
| 2913 | unlikely to occur within the text of a message and should terminate in two or | ||||||
| 2914 | more newlines. | ||||||
| 2915 | |||||||
| 2916 | =item 9 Archive or Delete Threads? | ||||||
| 2917 | |||||||
| 2918 | For each digest you process with Mail::Digest::Tools, you must decide whether | ||||||
| 2919 | to retain the resulting thread files in an archive them in a separate | ||||||
| 2920 | directory after a specified period of time, to delete them from disk | ||||||
| 2921 | after a specified period of time, or to do neither and allow them to | ||||||
| 2922 | accumulate indefinitely in the threads directory. Your decision is represented | ||||||
| 2923 | as the value of C<%config_out> key | ||||||
| 2924 | be expressed as one of three numerical values: | ||||||
| 2925 | |||||||
| 2926 | 0 Thread files are neither archived nor deleted | ||||||
| 2927 | |||||||
| 2928 | 1 Thread files are archived in a separate directory (or directories) | ||||||
| 2929 | after the number of days specified by key 'archive_kill_days' | ||||||
| 2930 | (see below) | ||||||
| 2931 | |||||||
| 2932 | -1    Thread files are deleted after I | ||||||
| 2933 | 'archive_kill_days' | ||||||
| 2934 | |||||||
| 2935 | In the examples below we have chosen to archive all threads generated by the | ||||||
| 2936 | Perl-Win32-Users list but to kill all threads generated by the Perl Beginner | ||||||
| 2937 | list after a number of days whose specification we shall come to shortly. | ||||||
| 2938 | |||||||
| 2939 | %pw32u_config_out = ( | ||||||
| 2940 | title => 'Perl-Win32-Users', | ||||||
| 2941 | dir_digest => "../pw32u", | ||||||
| 2942 | dir_threads => "../pw32u/Threads", | ||||||
| 2943 | digests_log => "../pw32u/digests_log.txt", | ||||||
| 2944 | todays_topics => "../pw32u/todays_topics.txt", | ||||||
| 2945 | id_format => 'sprintf("%03d",$1) . \'_\' . | ||||||
| 2946 | sprintf("%04d",$2)', | ||||||
| 2947 | output_id_format => 'sprintf("%04d",$1)', | ||||||
| 2948 | thread_msg_delimiter => "--__--__--\n\n", | ||||||
| 2949 | archive_kill_trigger => 1, | ||||||
| 2950 | ... | ||||||
| 2951 | ); | ||||||
| 2952 | |||||||
| 2953 | %pbml_config_out = ( | ||||||
| 2954 | title => 'Perl Beginner', | ||||||
| 2955 | dir_digest => "../pbml", | ||||||
| 2956 | dir_threads => "../pbml/Threads", | ||||||
| 2957 | digests_log => "../pbml/digests_log.txt", | ||||||
| 2958 | todays_topics => "../pbml/todays_topics.txt", | ||||||
| 2959 | id_format => 'sprintf("%05d",$1)', | ||||||
| 2960 | output_id_format => 'sprintf("%04d",$1)', | ||||||
| 2961 | thread_msg_delimiter => "_*_*_*_*_*_\n_*_*_*_*_*_\n\n\n", | ||||||
| 2962 | archive_kill_trigger => -1, | ||||||
| 2963 | ... | ||||||
| 2964 | ); | ||||||
| 2965 | |||||||
| 2966 | =back | ||||||
| 2967 | |||||||
| 2968 | This completes the 9 required keys for C<%config_out>. We now turn to keys | ||||||
| 2969 | which are either optional or which are required if you have assigned a value | ||||||
| 2970 | of C<1> or C<-1> to key C | ||||||
| 2971 | |||||||
| 2972 | =head2 Optional C<%config_out> Keys | ||||||
| 2973 | |||||||
| 2974 | =over 4 | ||||||
| 2975 | |||||||
| 2976 | =item * Digests Read File | ||||||
| 2977 | |||||||
| 2978 | As an option, Mail::Digest::Tools offers file to log which instances of a | ||||||
| 2979 | particular digest have previously been processed which is more | ||||||
| 2980 | human-readable than the file named in C<%config_out> key C | ||||||
| 2981 | That file logs a digest as follows: | ||||||
| 2982 | |||||||
| 2983 | 001_9999;Fri Feb 6 18:57:41 2004;Fri Feb 6 18:57:41 2004 | ||||||
| 2984 | |||||||
| 2985 | It is probably easier to read this data like this: | ||||||
| 2986 | |||||||
| 2987 | 09999: | ||||||
| 2988 | first processed at Fri Feb 6 18:57:41 2004 | ||||||
| 2989 | most recently processed at Fri Feb 6 18:57:41 2004 | ||||||
| 2990 | |||||||
| 2991 | To choose this option you need to set I | ||||||
| 2992 | |||||||
| 2993 | =over 4 | ||||||
| 2994 | |||||||
| 2995 | =item 1 C | ||||||
| 2996 | |||||||
| 2997 | This must be assigned a true value such as C<1>. This tells | ||||||
| 2998 | Mail::Digest::Tools that you indeed want a 'digests read' file. | ||||||
| 2999 | |||||||
| 3000 | =item 2 C | ||||||
| 3001 | |||||||
| 3002 | This should be assigned the name of the 'digests read' file, but it will | ||||||
| 3003 | default to a file F | ||||||
| 3004 | C | ||||||
| 3005 | |||||||
| 3006 | =back | ||||||
| 3007 | |||||||
| 3008 | Adding these keys to our C<%config_out>, we get: | ||||||
| 3009 | |||||||
| 3010 | %pw32u_config_out = ( | ||||||
| 3011 | title => 'Perl-Win32-Users', | ||||||
| 3012 | dir_digest => "../pw32u", | ||||||
| 3013 | dir_threads => "../pw32u/Threads", | ||||||
| 3014 | digests_log => "../pw32u/digests_log.txt", | ||||||
| 3015 | todays_topics => "../pw32u/todays_topics.txt", | ||||||
| 3016 | id_format => 'sprintf("%03d",$1) . \'_\' . | ||||||
| 3017 | sprintf("%04d",$2)', | ||||||
| 3018 | output_id_format => 'sprintf("%04d",$1)', | ||||||
| 3019 | thread_msg_delimiter => "--__--__--\n\n", | ||||||
| 3020 | archive_kill_trigger => 1, | ||||||
| 3021 | digests_read_flag => 1, | ||||||
| 3022 | digests_read => "../pw32u/digests_read.txt", | ||||||
| 3023 | ... | ||||||
| 3024 | ); | ||||||
| 3025 | |||||||
| 3026 | %pbml_config_out = ( | ||||||
| 3027 | title => 'Perl Beginner', | ||||||
| 3028 | dir_digest => "../pbml", | ||||||
| 3029 | dir_threads => "../pbml/Threads", | ||||||
| 3030 | digests_log => "../pbml/digests_log.txt", | ||||||
| 3031 | todays_topics => "../pbml/todays_topics.txt", | ||||||
| 3032 | id_format => 'sprintf("%05d",$1)', | ||||||
| 3033 | output_id_format => 'sprintf("%04d",$1)', | ||||||
| 3034 | thread_msg_delimiter => "_*_*_*_*_*_\n_*_*_*_*_*_\n\n\n", | ||||||
| 3035 | archive_kill_trigger => -1, | ||||||
| 3036 | digests_read_flag => 1, | ||||||
| 3037 | digests_read => "../pbml/digests_read.txt", | ||||||
| 3038 | ... | ||||||
| 3039 | ); | ||||||
| 3040 | |||||||
| 3041 | =item * Keys Needed When Archiving Thread Files | ||||||
| 3042 | |||||||
| 3043 | If, as discussed above, you have assigned the value C<1> to the | ||||||
| 3044 | C< | ||||||
| 3045 | will archive older thread files, I | ||||||
| 3046 | directory specified in key C | ||||||
| 3047 | thread file has not been modified in a specified number of days. If new | ||||||
| 3048 | messages need to be posted to a thread file which has been archived, that | ||||||
| 3049 | file will be de-archived and brought back to the C | ||||||
| 3050 | Thread files which are either archived or de-archived via a call to | ||||||
| 3051 | C | ||||||
| 3052 | appropriately named files. | ||||||
| 3053 | |||||||
| 3054 | Hence, the keys you will need to define when archiving thread files are: | ||||||
| 3055 | |||||||
| 3056 | =over 4 | ||||||
| 3057 | |||||||
| 3058 | =item 1 C | ||||||
| 3059 | |||||||
| 3060 | This key must be assigned the number of days after which a thread file sitting | ||||||
| 3061 | in the C | ||||||
| 3062 | specified, will default to 14 days. | ||||||
| 3063 | |||||||
| 3064 | =item 2 C | ||||||
| 3065 | |||||||
| 3066 | This key must be assigned the name of the I | ||||||
| 3067 | the directory at the top of a tree of archive directories. | ||||||
| 3068 | |||||||
| 3069 | When you track a particular mailing list digest for a number of years, the | ||||||
| 3070 | number of different thread files can grow to enormous proportions. For | ||||||
| 3071 | example, the author has tracked over 10,000 distinct thread files from the | ||||||
| 3072 | Perl-Win32-Users list over a three-and-a-half year period. 10,000 files in a | ||||||
| 3073 | single directory is completely unwieldy and slows directory read-times | ||||||
| 3074 | tremendously. Mail::Digest::Tools therefore by default provides a tree of | ||||||
| 3075 | archive directories: a top directory which contains no thread files but | ||||||
| 3076 | instead holds 27 subdirectories , one for each letter of the English alphabet | ||||||
| 3077 | and one for thread files which start with any other character (guaranteed to | ||||||
| 3078 | work with ASCII only; not tested with other character sets). | ||||||
| 3079 | |||||||
| 3080 | dir_archive_top | ||||||
| 3081 | a | ||||||
| 3082 | b | ||||||
| 3083 | c | ||||||
| 3084 | ... | ||||||
| 3085 | z | ||||||
| 3086 | other | ||||||
| 3087 | |||||||
| 3088 | The user gets to choose where to place the top archive directory but the 27 | ||||||
| 3089 | subdirectories are automatically placed beneath that one. The top archive | ||||||
| 3090 | directory is the value assigned to C<%config_out> key C | ||||||
| 3091 | |||||||
| 3092 | =item 3 C | ||||||
| 3093 | |||||||
| 3094 | This key should be assigned the name of a file which will log any and all | ||||||
| 3095 | files archived by a single call to C | ||||||
| 3096 | C | ||||||
| 3097 | an ongoing log; it only shows what happened today.) If not assigned a value, | ||||||
| 3098 | it will default to a file called F | ||||||
| 3099 | directory named by key C | ||||||
| 3100 | |||||||
| 3101 | =item 4 C | ||||||
| 3102 | |||||||
| 3103 | This key should be assigned the name of a file which will log any and all | ||||||
| 3104 | files de-archived by a single call to C | ||||||
| 3105 | C | ||||||
| 3106 | an ongoing log; it only shows what happened today.) If not assigned a value, | ||||||
| 3107 | it will default to a file called F | ||||||
| 3108 | directory named by key C | ||||||
| 3109 | |||||||
| 3110 | =item 5 C | ||||||
| 3111 | |||||||
| 3112 | This key is reserved for future use. In the current version of | ||||||
| 3113 | Mail::Digest::Tools it does not need to be set, but, should you be obsessive | ||||||
| 3114 | about this, set it to C<0>. | ||||||
| 3115 | |||||||
| 3116 | =back | ||||||
| 3117 | |||||||
| 3118 | Adding these keys to our sample C<%config_out> hashes, we get: | ||||||
| 3119 | |||||||
| 3120 | %pw32u_config_out = ( | ||||||
| 3121 | title => 'Perl-Win32-Users', | ||||||
| 3122 | dir_digest => "../pw32u", | ||||||
| 3123 | dir_threads => "../pw32u/Threads", | ||||||
| 3124 | digests_log => "../pw32u/digests_log.txt", | ||||||
| 3125 | todays_topics => "../pw32u/todays_topics.txt", | ||||||
| 3126 | id_format => 'sprintf("%03d",$1) . \'_\' . | ||||||
| 3127 | sprintf("%04d",$2)', | ||||||
| 3128 | output_id_format => 'sprintf("%04d",$1)', | ||||||
| 3129 | thread_msg_delimiter => "--__--__--\n\n", | ||||||
| 3130 | archive_kill_trigger => 1, | ||||||
| 3131 | digests_read_flag => 1, | ||||||
| 3132 | digests_read => "../pw32u/digests_read.txt", | ||||||
| 3133 | archive_kill_days => 14, | ||||||
| 3134 | dir_archive_top => "../pw32u/Threads/archive", | ||||||
| 3135 | archived_today => "../pw32u/archived_today.txt", | ||||||
| 3136 | de_archived_today => "../pw32u/de_archived_today.txt", | ||||||
| 3137 | ... | ||||||
| 3138 | ); | ||||||
| 3139 | |||||||
| 3140 | %pbml_config_out = ( | ||||||
| 3141 | title => 'Perl Beginner', | ||||||
| 3142 | dir_digest => "../pbml", | ||||||
| 3143 | dir_threads => "../pbml/Threads", | ||||||
| 3144 | digests_log => "../pbml/digests_log.txt", | ||||||
| 3145 | todays_topics => "../pbml/todays_topics.txt", | ||||||
| 3146 | id_format => 'sprintf("%05d",$1)', | ||||||
| 3147 | output_id_format => 'sprintf("%04d",$1)', | ||||||
| 3148 | thread_msg_delimiter => "_*_*_*_*_*_\n_*_*_*_*_*_\n\n\n", | ||||||
| 3149 | archive_kill_trigger => -1, | ||||||
| 3150 | digests_read_flag => 1, | ||||||
| 3151 | digests_read => "../pbml/digests_read.txt", | ||||||
| 3152 | ... | ||||||
| 3153 | ); | ||||||
| 3154 | |||||||
| 3155 | Note that since in our example we chose I | ||||||
| 3156 | the Perl Beginner list -- as evinced by the assignment of C<-1> to key | ||||||
| 3157 | C | ||||||
| 3158 | C | ||||||
| 3159 | C<%pbml_config_out>. | ||||||
| 3160 | |||||||
| 3161 | =item * Keys Needed When Deleting Thread Files | ||||||
| 3162 | |||||||
| 3163 | The keys needed for C<%config_out> when you have chosen to delete thread | ||||||
| 3164 | files after a specified interval parallel those you would have needed if you | ||||||
| 3165 | had chosen to archive those files instead. | ||||||
| 3166 | |||||||
| 3167 | =over 4 | ||||||
| 3168 | |||||||
| 3169 | =item 1 C | ||||||
| 3170 | |||||||
| 3171 | This key must be assigned the number of days after which a thread file sitting | ||||||
| 3172 | in the C | ||||||
| 3173 | to 14 days. | ||||||
| 3174 | |||||||
| 3175 | =item 2 C | ||||||
| 3176 | |||||||
| 3177 | This key should be assigned the name of a file which will log any and all | ||||||
| 3178 | files deleted by a single call to C | ||||||
| 3179 | C | ||||||
| 3180 | an ongoing log; it only shows what happened today.) If not assigned a value, | ||||||
| 3181 | it will default to a file called F | ||||||
| 3182 | directory named by key C | ||||||
| 3183 | |||||||
| 3184 | =back | ||||||
| 3185 | |||||||
| 3186 | Adding these keys to our sample C<%config_out> hashes, we get: | ||||||
| 3187 | |||||||
| 3188 | %pw32u_config_out = ( | ||||||
| 3189 | title => 'Perl-Win32-Users', | ||||||
| 3190 | dir_digest => "../pw32u", | ||||||
| 3191 | dir_threads => "../pw32u/Threads", | ||||||
| 3192 | digests_log => "../pw32u/digests_log.txt", | ||||||
| 3193 | todays_topics => "../pw32u/todays_topics.txt", | ||||||
| 3194 | id_format => 'sprintf("%03d",$1) . \'_\' . | ||||||
| 3195 | sprintf("%04d",$2)', | ||||||
| 3196 | output_id_format => 'sprintf("%04d",$1)', | ||||||
| 3197 | thread_msg_delimiter => "--__--__--\n\n", | ||||||
| 3198 | archive_kill_trigger => 1, | ||||||
| 3199 | digests_read_flag => 1, | ||||||
| 3200 | digests_read => "../pw32u/digests_read.txt", | ||||||
| 3201 | archive_kill_days => 14, | ||||||
| 3202 | dir_archive_top => "../pw32u/Threads/archive", | ||||||
| 3203 | archived_today => "../pw32u/archived_today.txt", | ||||||
| 3204 | de_archived_today => "../pw32u/de_archived_today.txt", | ||||||
| 3205 | ... | ||||||
| 3206 | ); | ||||||
| 3207 | |||||||
| 3208 | %pbml_config_out = ( | ||||||
| 3209 | title => 'Perl Beginner', | ||||||
| 3210 | dir_digest => "../pbml", | ||||||
| 3211 | dir_threads => "../pbml/Threads", | ||||||
| 3212 | digests_log => "../pbml/digests_log.txt", | ||||||
| 3213 | todays_topics => "../pbml/todays_topics.txt", | ||||||
| 3214 | id_format => 'sprintf("%05d",$1)', | ||||||
| 3215 | output_id_format => 'sprintf("%04d",$1)', | ||||||
| 3216 | thread_msg_delimiter => "_*_*_*_*_*_\n_*_*_*_*_*_\n\n\n", | ||||||
| 3217 | archive_kill_trigger => -1, | ||||||
| 3218 | digests_read_flag => 1, | ||||||
| 3219 | digests_read => "../pbml/digests_read.txt", | ||||||
| 3220 | archive_kill_days => 14, | ||||||
| 3221 | deleted_today => "../pbml/deleted_today.txt", | ||||||
| 3222 | ... | ||||||
| 3223 | ); | ||||||
| 3224 | |||||||
| 3225 | Note that since in our example we chose to archive thread files from | ||||||
| 3226 | the Perl-Win32-Users list -- as evinced by the assignment of C<1> to key | ||||||
| 3227 | C | ||||||
| 3228 | C | ||||||
| 3229 | |||||||
| 3230 | =item * Keys Needed When Stripping Multipart MIME Content from Thread Files | ||||||
| 3231 | |||||||
| 3232 | Recall from above that you had to study a given digest to determine whether or | ||||||
| 3233 | not it contained multipart MIME content in need of stripping out. If a digest, | ||||||
| 3234 | such as the ActiveState Perl-Win32-Users digest, contained a lot of such bloat, | ||||||
| 3235 | you set key C | ||||||
| 3236 | the other hand, the mailing list provider stripped out the multipart MIME | ||||||
| 3237 | content before distributing the digest, you set that key to a value of C<0>. | ||||||
| 3238 | |||||||
| 3239 | Mail::Digest::Tools will automatically strip out multipart MIME content once | ||||||
| 3240 | you have set C | ||||||
| 3241 | is:  Do I want to view a log of which messages processed in a I | ||||||
| 3242 | C | ||||||
| 3243 | content stripped out -- or not? If so, you must set two keys in | ||||||
| 3244 | C<%config_out>: | ||||||
| 3245 | |||||||
| 3246 | =over 4 | ||||||
| 3247 | |||||||
| 3248 | =item 1 C | ||||||
| 3249 | |||||||
| 3250 | This key must be set to a true value such as C<1>. | ||||||
| 3251 | |||||||
| 3252 | =item 2 C | ||||||
| 3253 | |||||||
| 3254 | This key should be assigned the name of the 'mimelog' file, but if you do not | ||||||
| 3255 | specify a value it will default to a file F | ||||||
| 3256 | directory named by key C | ||||||
| 3257 | |||||||
| 3258 | =back | ||||||
| 3259 | |||||||
| 3260 | The logfile so created looks like this: | ||||||
| 3261 | |||||||
| 3262 | Processed Problem | ||||||
| 3263 | |||||||
| 3264 | 001_1775_0003 CASE C | ||||||
| 3265 | 001_1775_0015 CASE C | ||||||
| 3266 | 001_1775_0018 CASE C | ||||||
| 3267 | 001_1775_0021 CASE E | ||||||
| 3268 | |||||||
| 3269 | where items in the 'Processed' column were either (a) successfully stripped of | ||||||
| 3270 | multipart MIME content by Mail::Digest::Tools as specified by the internal rule | ||||||
| 3271 | denoted by the 'CASE'; or (b) were recognized by Mail::Digest::Tools as | ||||||
| 3272 | containing multipart MIME content that could not be stripped out. | ||||||
| 3273 | |||||||
| 3274 | This is relatively esoteric and probably of interest mainly to the module's | ||||||
| 3275 | developer. So if you are not interested in this feature set | ||||||
| 3276 | C | ||||||
| 3277 | Mail::Digest::Tools will still do its best to strip out extraneous multipart | ||||||
| 3278 | MIME content. | ||||||
| 3279 | |||||||
| 3280 | Our sample C<%config_out> hashes are now complete. They look like this: | ||||||
| 3281 | |||||||
| 3282 | %pw32u_config_out = ( | ||||||
| 3283 | title => 'Perl-Win32-Users', | ||||||
| 3284 | dir_digest => "../pw32u", | ||||||
| 3285 | dir_threads => "../pw32u/Threads", | ||||||
| 3286 | digests_log => "../pw32u/digests_log.txt", | ||||||
| 3287 | todays_topics => "../pw32u/todays_topics.txt", | ||||||
| 3288 | id_format => 'sprintf("%03d",$1) . \'_\' . | ||||||
| 3289 | sprintf("%04d",$2)', | ||||||
| 3290 | output_id_format => 'sprintf("%04d",$1)', | ||||||
| 3291 | thread_msg_delimiter => "--__--__--\n\n", | ||||||
| 3292 | archive_kill_trigger => 1, | ||||||
| 3293 | digests_read_flag => 1, | ||||||
| 3294 | digests_read => "../pw32u/digests_read.txt", | ||||||
| 3295 | archive_kill_days => 14, | ||||||
| 3296 | dir_archive_top => "../pw32u/Threads/archive", | ||||||
| 3297 | archived_today => "../pw32u/archived_today.txt", | ||||||
| 3298 | de_archived_today => "../pw32u/de_archived_today.txt", | ||||||
| 3299 | mimelog => "../pw32u/mimelog.txt", | ||||||
| 3300 | MIME_cleanup_log_flag => 1, | ||||||
| 3301 | ); | ||||||
| 3302 | |||||||
| 3303 | %pbml_config_out = ( | ||||||
| 3304 | title => 'Perl Beginner', | ||||||
| 3305 | dir_digest => "../pbml", | ||||||
| 3306 | dir_threads => "../pbml/Threads", | ||||||
| 3307 | digests_log => "../pbml/digests_log.txt", | ||||||
| 3308 | todays_topics => "../pbml/todays_topics.txt", | ||||||
| 3309 | id_format => 'sprintf("%05d",$1)', | ||||||
| 3310 | output_id_format => 'sprintf("%04d",$1)', | ||||||
| 3311 | thread_msg_delimiter => "_*_*_*_*_*_\n_*_*_*_*_*_\n\n\n", | ||||||
| 3312 | archive_kill_trigger => -1, | ||||||
| 3313 | digests_read_flag => 1, | ||||||
| 3314 | digests_read => "../pbml/digests_read.txt", | ||||||
| 3315 | archive_kill_days => 14, | ||||||
| 3316 | deleted_today => "../pbml/deleted_today.txt", | ||||||
| 3317 | ); | ||||||
| 3318 | |||||||
| 3319 | Note that C<%pbml_config_out> does not have C | ||||||
| 3320 | C | ||||||
| 3321 | Beginners mailing list Yahoo! Groups strips out unnecessary multipart | ||||||
| 3322 | MIME content before sending the digest to you. | ||||||
| 3323 | |||||||
| 3324 | =back | ||||||
| 3325 | |||||||
| 3326 | =head1 HELPFUL HINTS | ||||||
| 3327 | |||||||
| 3328 | ... in which the module author shares what he has learned using | ||||||
| 3329 | Mail::Digest::Tools and its predecessors since August 2000. | ||||||
| 3330 | |||||||
| 3331 | =head2 Initial Configuration and Testing | ||||||
| 3332 | |||||||
| 3333 | As mentioned above, if you are considering creating a local archive of threads | ||||||
| 3334 | originating in daily digest versions of a mailing list, you should first | ||||||
| 3335 | accumulate 6-10 instances of such digests and both: | ||||||
| 3336 | |||||||
| 3337 | =over 4 | ||||||
| 3338 | |||||||
| 3339 | =item 1 | ||||||
| 3340 | |||||||
| 3341 | study the internal structure of the digest -- needed to develop a | ||||||
| 3342 | C<%config_in> for the digest; and | ||||||
| 3343 | |||||||
| 3344 | =item 2 | ||||||
| 3345 | |||||||
| 3346 | carefully consider how you wish to structure the output from the module's | ||||||
| 3347 | use on your system -- needed to develop C<%config_out> for the digest | ||||||
| 3348 | |||||||
| 3349 | =back | ||||||
| 3350 | |||||||
| 3351 | Once you have developed the initial configuration, you should call | ||||||
| 3352 | C | ||||||
| 3353 | if the results are what you want.  If they are I | ||||||
| 3354 | need to think about what you should change in C<%config_in> and/or | ||||||
| 3355 | C<%config_out>.  Make those changes, then call C | ||||||
| 3356 | again. Repeat as needed, making sure not to delete any of the digest files | ||||||
| 3357 | you are using as sources until you are completely satisfied with your | ||||||
| 3358 | configuration. | ||||||
| 3359 | |||||||
| 3360 | Once, however, you I | ||||||
| 3361 | C | ||||||
| 3362 | C | ||||||
| 3363 | regenerate threads containing messages from digests you have deleted over | ||||||
| 3364 | time). | ||||||
| 3365 | |||||||
| 3366 | =head2 Where to Store the Configuration Hashes | ||||||
| 3367 | |||||||
| 3368 | As mentioned above, you will probably find it convenient to write separate | ||||||
| 3369 | Perl scripts to call each one of Mail::Digest::Tool's public functions. You | ||||||
| 3370 | could code C<%config_in> and C<%config_out> in each of those scripts just | ||||||
| 3371 | before the respective function calls. But that would violate the principle of | ||||||
| 3372 | 'Repeated Code Is a Mistake' and multiply maintenance problems. It's far | ||||||
| 3373 | better to code the two configuration hashes in a separate plain-text file and | ||||||
| 3374 | 'require' that file into your script. That way, any changes you make in the | ||||||
| 3375 | configuration will be automatically picked up by each script that calls a | ||||||
| 3376 | Mail::Digest::Tools function. | ||||||
| 3377 | |||||||
| 3378 | Here is an example of such a file holding the configuration hashes governing | ||||||
| 3379 | use of the Perl-Win32-Users digest, along with a script making use of that file. | ||||||
| 3380 | |||||||
| 3381 | # file: pw32u.digest.data | ||||||
| 3382 | $topdir = "E:/Digest/pw32u"; | ||||||
| 3383 | %config_in = ( | ||||||
| 3384 | grep_formula => 'Perl-Win32-Users digest', | ||||||
| 3385 | pattern_target => '.*Vol\s(\d+),\sIssue\s(\d+)\.txt', | ||||||
| 3386 | # next element's value must be double-quoted | ||||||
| 3387 | source_msg_delimiter => "--__--__--\n\n", | ||||||
| 3388 | topics_intro => 'Today\'s Topics:', | ||||||
| 3389 | message_style_flag => '^Message:\s+(\d+)$', | ||||||
| 3390 | from_style_flag => '^From:\s+(.+)$', | ||||||
| 3391 | org_style_flag => '^Organization:\s+(.+)$', | ||||||
| 3392 | to_style_flag => '^To:\s+(.+)$', | ||||||
| 3393 | cc_style_flag => '^CC:\s+(.+)$', | ||||||
| 3394 | subject_style_flag => '^Subject:\s+(.+)$', | ||||||
| 3395 | date_style_flag => '^Date:\s+(.+)$', | ||||||
| 3396 | reply_to_style_flag => '^Reply-To:\s+(.+)$', | ||||||
| 3397 | MIME_cleanup_flag => 1, | ||||||
| 3398 | ); | ||||||
| 3399 | |||||||
| 3400 | %config_out = ( | ||||||
| 3401 | title => 'Perl-Win32-Users', | ||||||
| 3402 | dir_digest => $topdir, | ||||||
| 3403 | dir_threads => "$topdir/Threads", | ||||||
| 3404 | dir_archive_top => "$topdir/Threads/archive", | ||||||
| 3405 | archived_today => "$topdir/archived_today.txt", | ||||||
| 3406 | de_archived_today => "$topdir/de_archived_today.txt", | ||||||
| 3407 | deleted_today => "$topdir/deleted_today.txt", | ||||||
| 3408 | digests_log => "$topdir/digests_log.txt", | ||||||
| 3409 | digests_read => "$topdir/digests_read.txt", | ||||||
| 3410 | todays_topics => "$topdir/todays_topics.txt", | ||||||
| 3411 | mimelog => "$topdir/mimelog.txt", | ||||||
| 3412 | id_format => 'sprintf("%03d",$1) . \'_\' . | ||||||
| 3413 | sprintf("%04d",$2)', | ||||||
| 3414 | output_id_format => 'sprintf("%04d",$1)', | ||||||
| 3415 | MIME_cleanup_log_flag => 1, | ||||||
| 3416 | # next element's value must be double-quoted | ||||||
| 3417 | thread_msg_delimiter => "--__--__--\n\n", | ||||||
| 3418 | archive_kill_trigger => 1, | ||||||
| 3419 | archive_kill_days => 14, | ||||||
| 3420 | digests_read_flag => 1, | ||||||
| 3421 | archive_config => 0, | ||||||
| 3422 | ); | ||||||
| 3423 | |||||||
| 3424 | # script: dig.pl | ||||||
| 3425 | # USAGE: perl dig.pl | ||||||
| 3426 | #!/usr/bin/perl | ||||||
| 3427 | use strict; | ||||||
| 3428 | use warnings; | ||||||
| 3429 | use Mail::Digest::Tools qw( process_new_digests ); | ||||||
| 3430 | |||||||
| 3431 | our (%config_in, %config_out); | ||||||
| 3432 | my $data_file = 'pw32u.digest.data'; | ||||||
| 3433 | require $data_file; | ||||||
| 3434 | |||||||
| 3435 | process_new_digests(\%config_in, \%config_out); | ||||||
| 3436 | |||||||
| 3437 | print "\nFinished\n"; | ||||||
| 3438 | |||||||
| 3439 | =head2 Maintaining Local Archives of More than One Digest | ||||||
| 3440 | |||||||
| 3441 | The module author has maintained local archives of more than a half dozen | ||||||
| 3442 | different mailing list digests over the past several years. He has found it | ||||||
| 3443 | convenient to maintain the configuration information for I | ||||||
| 3444 | he is following at a given time in a I | ||||||
| 3445 | advantage to this approach is that if two digests share a similar internal | ||||||
| 3446 | structure (perhaps due to being generated by the same mailing list program or | ||||||
| 3447 | list provider) and if the user chooses to structure the output from the two | ||||||
| 3448 | digests in similar or identical ways, then getting the configuration hashes | ||||||
| 3449 | becomes much easier and the potential for error is reduced. | ||||||
| 3450 | |||||||
| 3451 | Here is a sample directory and file structure for maintaining archives of | ||||||
| 3452 | two different digests on a Win32 system: | ||||||
| 3453 | |||||||
| 3454 | digest/ | ||||||
| 3455 | digest.data | ||||||
| 3456 | process_new.pl | ||||||
| 3457 | process_ALL.pl | ||||||
| 3458 | reply_digest_message.pl | ||||||
| 3459 | repair_digest_order.pl | ||||||
| 3460 | consolidate_threads.pl | ||||||
| 3461 | deletables.pl | ||||||
| 3462 | pw32u/ | ||||||
| 3463 | Perl-Win32-Users Digest, Vol 1 Issue 1771.txt | ||||||
| 3464 | Perl-Win32-Users Digest, Vol 1 Issue 1772.txt | ||||||
| 3465 | digest_log.txt | ||||||
| 3466 | digest_read.txt | ||||||
| 3467 | mimelog.txt | ||||||
| 3468 | Threads/ | ||||||
| 3469 | pbml/ | ||||||
| 3470 | [PBML] Digest Number 1491.txt | ||||||
| 3471 | [PBML] Digest Number 1492.txt | ||||||
| 3472 | digest_log.txt | ||||||
| 3473 | Threads/ | ||||||
| 3474 | |||||||
| 3475 | File F | ||||||
| 3476 | |||||||
| 3477 | # digest.data | ||||||
| 3478 | $topdir = "E:/Digest"; | ||||||
| 3479 | %digest_structure = ( | ||||||
| 3480 | pbml => { | ||||||
| 3481 | grep_formula => '\[PBML\]', | ||||||
| 3482 | pattern_target => '.*\s(\d+)\.txt$', | ||||||
| 3483 | ... | ||||||
| 3484 | }, | ||||||
| 3485 | pw32u => { | ||||||
| 3486 | grep_formula => 'Perl-Win32-Users digest', | ||||||
| 3487 | pattern_target => '.*Vol\s(\d+),\sIssue\s(\d+)\.txt', | ||||||
| 3488 | ... | ||||||
| 3489 | }, | ||||||
| 3490 | ); | ||||||
| 3491 | %digest_output_format = ( | ||||||
| 3492 | pbml => { | ||||||
| 3493 | title => 'Perl Beginner', | ||||||
| 3494 | dir_digest => "$topdir/pbml", | ||||||
| 3495 | dir_threads => "$topdir/pbml/Threads", | ||||||
| 3496 | ... | ||||||
| 3497 | }, | ||||||
| 3498 | pw32u => { | ||||||
| 3499 | title => 'Perl-Win32-Users', | ||||||
| 3500 | dir_digest => "$topdir/pw32u", | ||||||
| 3501 | dir_threads => "$topdir/pw32u/Threads", | ||||||
| 3502 | ... | ||||||
| 3503 | }, | ||||||
| 3504 | ); | ||||||
| 3505 | |||||||
| 3506 | To accomodate this slightly more complex structure in the configuration file, | ||||||
| 3507 | the calling script might be modified as follows: | ||||||
| 3508 | |||||||
| 3509 | # script: dig.pl | ||||||
| 3510 | # USAGE: perl dig.pl [short-name for digest] | ||||||
| 3511 | #!/usr/bin/perl | ||||||
| 3512 | use Mail::Digest::Tools qw( process_new_digests ); | ||||||
| 3513 | |||||||
| 3514 | my ($this_key, %config_in, %config_out); | ||||||
| 3515 | # variables imported from $data_file | ||||||
| 3516 | our (%digest_structure, %digest_output_format); | ||||||
| 3517 | |||||||
| 3518 | my $data_file = 'digest.data'; | ||||||
| 3519 | require $data_file; | ||||||
| 3520 | |||||||
| 3521 | $this_key = shift @ARGV; | ||||||
| 3522 | die "\n The command-line argument you typed: $this_key\n does not call an accessible digest$!" | ||||||
| 3523 | unless (defined $digest_structure{$this_key} | ||||||
| 3524 | and defined $digest_output_format{$this_key}); | ||||||
| 3525 | |||||||
| 3526 | my ($k,$v); | ||||||
| 3527 | while ( ($k, $v) = each %{$digest_structure{$this_key}} ) { | ||||||
| 3528 | $config_in{$k} = $v; | ||||||
| 3529 | } | ||||||
| 3530 | while ( ($k, $v) = each %{$digest_output_format{$this_key}} ) { | ||||||
| 3531 | $config_out{$k} = $v; | ||||||
| 3532 | } | ||||||
| 3533 | |||||||
| 3534 | process_new_digests(\%config_in, \%config_out); | ||||||
| 3535 | |||||||
| 3536 | print "\nFinished\n"; | ||||||
| 3537 | |||||||
| 3538 | =head2 Getting Your Mail to the Right Place on Your System | ||||||
| 3539 | |||||||
| 3540 | For several years the module author used the scripts which were predecessors | ||||||
| 3541 | to Mail::Digest::Tools on a Win32 system where mail was read with Microsoft | ||||||
| 3542 | Outlook Express. He would do a "File/Save as.." on an instance of a digest, | ||||||
| 3543 | select text format (*.txt) and save it to an appropriate directory. Later, | ||||||
| 3544 | the author used the shareware e-mail client Poco, in which the same operation | ||||||
| 3545 | was accomplished by highlighting a file and keying "Ctrl+S". | ||||||
| 3546 | |||||||
| 3547 | But as the number of digests the author was tracking grew, this procedure | ||||||
| 3548 | became more and more tedious. Fortunately, about that time the author was | ||||||
| 3549 | assigned to write a review of the second edition of the Perl Cookbook, and he | ||||||
| 3550 | learned how to use the Net::POP3 module to receive his e-mail directly. So | ||||||
| 3551 | now he uses a Perl script to get all his digests and save them as text files | ||||||
| 3552 | to appropriate directories -- and then lets a GUI e-mail client take care of | ||||||
| 3553 | the rest. | ||||||
| 3554 | |||||||
| 3555 | Here is a script which more or less accomplishes this: | ||||||
| 3556 | |||||||
| 3557 | # script: get_digests.pl | ||||||
| 3558 | #!/usr/bin/perl | ||||||
| 3559 | use strict; | ||||||
| 3560 | use warnings; | ||||||
| 3561 | use Net::POP3; | ||||||
| 3562 | use Term::ReadKey; | ||||||
| 3563 | |||||||
| 3564 | my ($site, $username, $password); | ||||||
| 3565 | my ($verref, $pop3, $messagesref, $undeleted, $msgnum, $message); | ||||||
| 3566 | my ($k,$v); | ||||||
| 3567 | my ($oldfh, $output); | ||||||
| 3568 | |||||||
| 3569 | my %digests = ( | ||||||
| 3570 | 'pbml' => "E:/Digest/pbml", | ||||||
| 3571 | 'pw32u' => "E:/Digest/pw32u", | ||||||
| 3572 | 'london' => "E:/Digest/london", | ||||||
| 3573 | ); | ||||||
| 3574 | |||||||
| 3575 | $site = 'pop3.someISP.com'; | ||||||
| 3576 | $username = 'myuserid'; | ||||||
| 3577 | |||||||
| 3578 | $pop3 = Net::POP3->new($site) | ||||||
| 3579 | or die "Couldn't open connection to $site: $!"; | ||||||
| 3580 | |||||||
| 3581 | print "Enter password for $username at $site: "; | ||||||
| 3582 | ReadMode('noecho'); | ||||||
| 3583 | $password = ReadLine(0); | ||||||
| 3584 | chomp $password; | ||||||
| 3585 | ReadMode(0); | ||||||
| 3586 | print "\n"; | ||||||
| 3587 | |||||||
| 3588 | defined ($pop3->login($username, $password)) | ||||||
| 3589 | or die "Can't authenticate: $!"; | ||||||
| 3590 | |||||||
| 3591 | $messagesref = $pop3->list | ||||||
| 3592 | or die "Can't get list of undeleted messages: $!"; | ||||||
| 3593 | |||||||
| 3594 | while ( ($k,$v) = each %$messagesref ) { | ||||||
| 3595 | my ($messageref, $line, %headers); | ||||||
| 3596 | print "$k:\t$v\n"; | ||||||
| 3597 | $messageref = $pop3->top($k); | ||||||
| 3598 | local $_; | ||||||
| 3599 | foreach (@$messageref) { | ||||||
| 3600 | chomp; | ||||||
| 3601 | last if (/^\s*$/); | ||||||
| 3602 | next unless (/^\s*(Date:|From:|Subject:|To:)/); | ||||||
| 3603 | if (/^\s*Date:\s*(.*)/) { | ||||||
| 3604 | $headers{'Date'} = $1; | ||||||
| 3605 | } | ||||||
| 3606 | if (/^\s*From:\s*(.*)/) { | ||||||
| 3607 | $headers{'From'} = $1; | ||||||
| 3608 | } | ||||||
| 3609 | if (/^\s*Subject:\s*(.*)/) { | ||||||
| 3610 | $headers{'Subject'} = $1; | ||||||
| 3611 | } | ||||||
| 3612 | if (/^\s*To:\s*(.*)/) { | ||||||
| 3613 | $headers{'To'} = $1; | ||||||
| 3614 | } | ||||||
| 3615 | } | ||||||
| 3616 | if ($headers{'Subject'} =~ /^\[PBML\]/) { | ||||||
| 3617 | get_digest($pop3, $k, 'pbml', $headers{'Subject'}); | ||||||
| 3618 | } | ||||||
| 3619 | if ($headers{'Subject'} =~ /^Perl-Win32-Users/) { | ||||||
| 3620 | get_digest($pop3, $k, 'pw32u', $headers{'Subject'}); | ||||||
| 3621 | } | ||||||
| 3622 | if ($headers{'Subject'} =~ /^london\.pm/) { | ||||||
| 3623 | get_digest($pop3, $k, 'london', $headers{'Subject'}); | ||||||
| 3624 | } | ||||||
| 3625 | } | ||||||
| 3626 | |||||||
| 3627 | $pop3->quit() or die "Couldn't quit cleanly: $!"; | ||||||
| 3628 | |||||||
| 3629 | print "Finished!\n"; | ||||||
| 3630 | |||||||
| 3631 | sub get_digest { | ||||||
| 3632 | my ($pop3, $msgnum, $digest, $subj) = @_; | ||||||
| 3633 | print "Retrieving $msgnum: $subj"; | ||||||
| 3634 | my $message = | ||||||
| 3635 | $pop3->get($msgnum) or die "Couldn't get message $msgnum: $!"; | ||||||
| 3636 | if ($message) { | ||||||
| 3637 | print "\n"; | ||||||
| 3638 | my $digestfile = "$digests{$digest}/$subj.txt"; | ||||||
| 3639 | _print_message($digestfile, $message); | ||||||
| 3640 | print "Marking $msgnum for deletion\n";; | ||||||
| 3641 | $pop3->delete($msgnum) or die "Couldn't delete message $msgnum: $!"; | ||||||
| 3642 | } else { | ||||||
| 3643 | print "Failed: $!\n"; | ||||||
| 3644 | } | ||||||
| 3645 | } | ||||||
| 3646 | |||||||
| 3647 | sub _print_message { | ||||||
| 3648 | my ($digestfile, $message) = @_; | ||||||
| 3649 | my @lines = @{$message}; | ||||||
| 3650 | my $counter = 0; | ||||||
| 3651 | open(FH, ">$digestfile") | ||||||
| 3652 | or die "Couldn't open $digestfile for writing: $!"; | ||||||
| 3653 | for (my $i = 0; $i<=$#lines; $i++) { | ||||||
| 3654 | chomp($lines[$i]); | ||||||
| 3655 | # Identify the first blank line in the digest, | ||||||
| 3656 | # i.e., the end of the headers | ||||||
| 3657 | if ($lines[$i] =~ /^$/) { | ||||||
| 3658 | $counter = $i; | ||||||
| 3659 | last; | ||||||
| 3660 | } | ||||||
| 3661 | }; | ||||||
| 3662 | # Transfer digest to appropriate directory, skipping over digest header | ||||||
| 3663 | # so as to start just above Today's Topics | ||||||
| 3664 | foreach my $line (@lines[$counter+1 .. $#lines]) { | ||||||
| 3665 | chomp($line); | ||||||
| 3666 | # For some reason the $pop3->get() puts a single whitespace at the | ||||||
| 3667 | # start of most (all but the first?) lines | ||||||
| 3668 | # That has to be cleaned up so digest.pl can correctly process | ||||||
| 3669 | # header info and identify beginning of Today's Topics | ||||||
| 3670 | if ($line =~ /^\s(.*)/) { | ||||||
| 3671 | print FH $1, "\n"; | ||||||
| 3672 | } else { | ||||||
| 3673 | print FH $line, "\n"; | ||||||
| 3674 | } | ||||||
| 3675 | } | ||||||
| 3676 | close FH or die "Couldn't close after writing: $!"; | ||||||
| 3677 | } | ||||||
| 3678 | |||||||
| 3679 | No promise is made that this script or any script contained in this | ||||||
| 3680 | documentation will work correctly on your system. Hack it up to get it to | ||||||
| 3681 | work the way you want it to. | ||||||
| 3682 | |||||||
| 3683 | =head1 ASSUMPTIONS AND QUALIFICATIONS | ||||||
| 3684 | |||||||
| 3685 | =over 4 | ||||||
| 3686 | |||||||
| 3687 | =item 1 No Change in Mailing List Digest Software | ||||||
| 3688 | |||||||
| 3689 | The main assumption on which Mail::Digest::Tools depends for its success is | ||||||
| 3690 | that the provider of a particular digest continues to use the same mailing | ||||||
| 3691 | list software to produce the digest. If the provider changes his/her software, | ||||||
| 3692 | you must modify Mail::Digest::Tools' configuration data accordingly. | ||||||
| 3693 | |||||||
| 3694 | =item 2 Digest Must Be One E-mail Without Attachments | ||||||
| 3695 | |||||||
| 3696 | At its current stage of development Mail::Digest::Tools is only applicable to | ||||||
| 3697 | mailing list digests which arrive as one continuous file.  It is C | ||||||
| 3698 | applicable to digests (e.g., Cygwin, module-authors@perl.org) which are | ||||||
| 3699 | supplied in a format consisting of (a) one file with instructions and a table | ||||||
| 3700 | of contents and (b) all the individual messages provided as e-mail attachments. | ||||||
| 3701 | |||||||
| 3702 | =item 3 Perl 5.6+ Only | ||||||
| 3703 | |||||||
| 3704 | The program was created with Perl 5.6. Certain features, such as the use of | ||||||
| 3705 | the C | ||||||
| 3706 | account for pre-5.6 features are left as an exercise for the user. | ||||||
| 3707 | |||||||
| 3708 | =item 4 Time::Local | ||||||
| 3709 | |||||||
| 3710 | Mail::Digest::Tools internally uses Perl core extension Time::Local. If at | ||||||
| 3711 | some future point this module is not included as part of a Perl core | ||||||
| 3712 | distribution, you would have to install it manually from CPAN. | ||||||
| 3713 | |||||||
| 3714 | =back | ||||||
| 3715 | |||||||
| 3716 | =head1 HISTORY AND FUTURE DEVELOPMENT | ||||||
| 3717 | |||||||
| 3718 | =head2 PRE-CPAN HISTORY | ||||||
| 3719 | |||||||
| 3720 | ActiveState maintains Perl for Windows-based platforms and also maintains a | ||||||
| 3721 | variety of mailing lists for users of its Windows-compatible versions of Perl. | ||||||
| 3722 | Subscribers to these lists can receive messages either as individual e-mails | ||||||
| 3723 | or as part of a daily digest which contains a listing of the day's topics and | ||||||
| 3724 | the complete text of each message. The messages are often best followed as | ||||||
| 3725 | discussion 'threads' which may extend over several days' worth of digests. | ||||||
| 3726 | |||||||
| 3727 | In June of 2000, however, ActiveState had to temporarily take its mailing lists | ||||||
| 3728 | off-line for technical reasons. When these lists were restored to service, | ||||||
| 3729 | their archive capacities were not immediately restored. I had just begun my | ||||||
| 3730 | study of Perl and had come to enjoy reading the Perl-Win32-Users digest. As | ||||||
| 3731 | I set off for the Yet Another Perl Conference in Pittsburgh, I shouted out, | ||||||
| 3732 | 'I want my Perl-Win32-Users digest!'  I wrote a Perl script called C | ||||||
| 3733 | to fill that gap. | ||||||
| 3734 | |||||||
| 3735 | ActiveState has since restored archiving capacity to their lists. For reasons | ||||||
| 3736 | that would perhaps best be explored in a psychotherapeutic context, however, I | ||||||
| 3737 | had become attached to my local archive of the 'pw32u' list, so I continued to | ||||||
| 3738 | maintain this program and fine-tune its coding. | ||||||
| 3739 | |||||||
| 3740 | In early 2001 it became apparent that this program could be applied to a wide | ||||||
| 3741 | variety of mailing list digests -- not just those provided by ActiveState. In | ||||||
| 3742 | particular, valuable digests provided by Yahoo Groups (formerly E-groups) such | ||||||
| 3743 | as NT Emacs Users, Perl 5 Porters and Perl Beginners could also be archived if | ||||||
| 3744 | C | ||||||
| 3745 | began to track several other digests. I was able to use the archive I had | ||||||
| 3746 | developed as a window into one part of the Perl community in a Lightning Talk | ||||||
| 3747 | I gave at YAPC::North America in Montreal in June 2001, ''An Index of | ||||||
| 3748 | Incivility in the Perl Community.'' | ||||||
| 3749 | |||||||
| 3750 | Maintaining C | ||||||
| 3751 | Perl. Along the way I incorporated my first profiler into the script -- and | ||||||
| 3752 | then discarded it. Some of the subroutines I had written for early versions of | ||||||
| 3753 | the program had applicability to other scripts -- and thus was born my first | ||||||
| 3754 | module -- also since discarded. By July 2003 I was up to version 1.3. | ||||||
| 3755 | Following a suggestion by Uri Guttman at the YAPC::EU conference held in Paris | ||||||
| 3756 | in July 2003, wherever possible the use of separate | ||||||
| 3757 | print statements for each line to be printed was eliminated in favor of | ||||||
| 3758 | concatenating strings to be printed into much larger strings which could be | ||||||
| 3759 | printed all at once. This revision reduced the number of times filehandles | ||||||
| 3760 | had to be opened for writing. A given thread file was now opened only once | ||||||
| 3761 | per call of this program, rather than once for each message in each digest | ||||||
| 3762 | processed per call of the program. | ||||||
| 3763 | |||||||
| 3764 | Various other improvements, such as the possibility of stripping out | ||||||
| 3765 | unnecessary multipart MIME content and the introduction of subdirectories | ||||||
| 3766 | for archiving, were made in late 2003. At that point I | ||||||
| 3767 | decided to transform the script into a full-fledged Perl module. At first I | ||||||
| 3768 | tried out an object-oriented structure (with which I was familiar from my first | ||||||
| 3769 | two CPAN modules, F | ||||||
| 3770 | necessitated one constructor and one method call per typical script, but since | ||||||
| 3771 | the constructor did nothing but some cursory validation of the configuration | ||||||
| 3772 | data, it was mostly superfluous. Hence, I jettisoned the OO structure in favor | ||||||
| 3773 | of a functional approach. The result: Mail::Digest::Tools. | ||||||
| 3774 | |||||||
| 3775 | =head2 CPAN | ||||||
| 3776 | |||||||
| 3777 | After these revisions, I was up to version 1.96. Why revert to a lower | ||||||
| 3778 | version number at this point? That is why Mail::Digest::Tools makes its CPAN | ||||||
| 3779 | debut in version 2.04. | ||||||
| 3780 | |||||||
| 3781 | v1.97 (2/18/2004): Dealing with problem that Win32 and Unix/Linux may create | ||||||
| 3782 | different thread names for the same set of source messages because they have | ||||||
| 3783 | different lists of characters forbidden in file names. This became a problem | ||||||
| 3784 | while writing tests for C | ||||||
| 3785 | the names of thread files created via that function more difficult to predict. | ||||||
| 3786 | Tests adjusted appropriately. | ||||||
| 3787 | |||||||
| 3788 | v1.98 (2/19/2004): Eliminated suspect uses of C modifier on regexes. | ||||||
| 3789 | This was causing problems when I called C | ||||||
| 3790 | different types of digests in the same script. Also, eliminated code | ||||||
| 3791 | referring to DOS (I | ||||||
| 3792 | DOS filenames) as I have no way to test this module on a DOS box. | ||||||
| 3793 | |||||||
| 3794 | v1.99 (2/22/2004): ActiveState introduced a new format for its | ||||||
| 3795 | Perl-Win32-Users digest -- the digest which originally inspired the creation | ||||||
| 3796 | of this module's predecessor in 2000. One aspect of this new format was a | ||||||
| 3797 | clear improvement: HTML attachments are now stripped before messages are | ||||||
| 3798 | posted to the digest, so multipart MIME content has either been reduced | ||||||
| 3799 | considerably or eliminated altogether. But another aspect of this new | ||||||
| 3800 | format upset code going back four years: The delimiter immediately | ||||||
| 3801 | following Today's Topics is now different from the delimiters separating each | ||||||
| 3802 | message in the digest. Working around this appeared to be surprisingly | ||||||
| 3803 | difficult, especially since this revision had to be done in the middle of | ||||||
| 3804 | writing a test suite for CPAN distribution. A new key has been added to the | ||||||
| 3805 | C<%config_in> hash for each digest: | ||||||
| 3806 | |||||||
| 3807 | $config_in{'post_topics_delimiter'} | ||||||
| 3808 | |||||||
| 3809 | v2.00 (2/23/2004): Testing conducted after the last revision revealed a bug | ||||||
| 3810 | going back several versions in the internal subroutine stripping multipart | ||||||
| 3811 | MIME content.  The last paragraph of each message which did I | ||||||
| 3812 | content was being stripped off. The offending code was found within | ||||||
| 3813 | C<_analyze_message_body()>. (The author recently learned of the CPAN | ||||||
| 3814 | module F | ||||||
| 3815 | the hand-rolled subroutine used within Mail::Digest::Tools, but a full study | ||||||
| 3816 | of its possibilities will be deferred to a later version. Also in this | ||||||
| 3817 | version, POD was rewritten to reflect the introduction of the post-topics | ||||||
| 3818 | delimiter. | ||||||
| 3819 | |||||||
| 3820 | v2.01 (2/24/2004): Backslashes (except as part of C<\n> newline characters) | ||||||
| 3821 | are prohibited in C<%config_out> key C | ||||||
| 3822 | because in the test suite that key's value is used as a variable inside a | ||||||
| 3823 | regular expression which in turn is used as an argument to C | ||||||
| 3824 | Preliminary investigation suggests that to work around the backslash | ||||||
| 3825 | metacharacter in that situation would be very time-consuming. | ||||||
| 3826 | |||||||
| 3827 | v2.02 (2/26/2004):  Revised C | ||||||
| 3828 | subroutine C<_strip_down_for_reply> to reflect distinction between post-topics | ||||||
| 3829 | delimiter and source message delimiter. | ||||||
| 3830 | |||||||
| 3831 | v2.03 (3/04/2004):  Fixed bug in C | ||||||
| 3832 | Extensive reworking of test suite. | ||||||
| 3833 | |||||||
| 3834 | v2.04 (3/05/2004): No changes in module. Refinement of test suite only. | ||||||
| 3835 | |||||||
| 3836 | v2.05 (3/07/2004): Fixed accidental deletion of incrementation of | ||||||
| 3837 | C<$message_count> in C<_strip_down()>. | ||||||
| 3838 | |||||||
| 3839 | v2.06 (3/10/2004): Correction of errors in test suite. Elimination of use of List::Compare in test suite. | ||||||
| 3840 | |||||||
| 3841 | v2.07 (3/11/2004): Correction of error in t/03.t | ||||||
| 3842 | |||||||
| 3843 | v2.08 (3/11/2004): Correction in _clean_up_thread_title and in tests. | ||||||
| 3844 | |||||||
| 3845 | v2.10 (3/15/2004): Corrections to README and documentation only. | ||||||
| 3846 | |||||||
| 3847 | v2.11 (10/23/2004): Fixed several errors which resulted in "Bizarre copy of hash in leave" error when running test suite under Devel::Cover. | ||||||
| 3848 | |||||||
| 3849 | v2.12 (05/14/2011): Added 'mirbsd' to list of Unixish-OSes. | ||||||
| 3850 | |||||||
| 3851 | =head1 AUTHOR | ||||||
| 3852 | |||||||
| 3853 | James E. Keenan (F | ||||||
| 3854 | |||||||
| 3855 | Creation date: August 21, 2000. | ||||||
| 3856 | Last modification date: May 14, 2011. | ||||||
| 3857 | Copyright (c) 2000-2011 James E. Keenan. United States. All rights reserved. | ||||||
| 3858 | |||||||
| 3859 | This software is distributed with absolutely no warranty, express or implied. | ||||||
| 3860 | Use it at your own risk. This is free software which you may distribute under | ||||||
| 3861 | the same terms as Perl itself. | ||||||
| 3862 | |||||||
| 3863 | =cut | ||||||
| 3864 | |||||||
| 3865 | |||||||
| 3866 | |||||||
| 3867 |