File Coverage

blib/lib/File/Mork.pm

Criterion	Covered	Total	%
statement	137	153	89.5
branch	45	68	66.1
condition	10	19	52.6
subroutine	15	17	88.2
pod	5	8	62.5
total	212	265	80.0

line	stmt	bran	cond	sub	pod	time	code
1							package File::Mork;
2
3	3			3		44929	use strict;
	3					6
	3					96
4	3			3		16	use vars qw($VERSION $ERROR);
	3					5
	3					194
5	3			3		2380	use POSIX qw(strftime);
	3					24548
	3					17
6	3			3		6929	use Encode;
	3					33700
	3					7322
7
8							$VERSION = "0.4";
9
10							=head1 NAME
11
12							File::Mork - a module to read Mozilla URL history files
13
14							=head1 SYNOPSIS
15
16							my $mork = File::Mork->new($filename, verbose => 1)
17							\|\| die $File::Mork::ERROR."\n";
18
19
20							foreach my $entry ($mork->entries) {
21							while (my($key,$val) = each %$entry) {
22							printf ("%14s = %s\n", $key, $val);
23							}
24							}
25
26							=head1 DESCRIPTION
27
28							This is a module that can read the Mozilla URL history file -- normally
29							$HOME/.mozilla/default/*.slt/history.dat -- and extract the id, url,
30							name, hostname, first visted dat, last visited date and visit count.
31
32							To find your history file it might be worth using B
33							which has some platform-independent code for finding the profiles of
34							various Mozilla-isms (including Firefox, Camino, K-Meleon, etc.).
35
36							=cut
37
38							=head1 METHODS
39
40							=head2 new [opts]
41
42							Takes a filename and parses that file.
43
44							Returns C on error, setting C<$File::Mork::Error>.
45
46							Takes an optional hash of options
47
48							=over 4
49
50							=item
51
52							verbose
53
54							A value up to 3 - defines the level of verbosity
55
56							=item
57
58							age
59
60							A ctime which forces C to only parse entries later than this.
61
62							=back
63
64							=cut
65
66							sub new {
67	2			2	1	423	my ($class, $file, %opts) = @_;
68	2					7	my $self = bless \%opts, $class;
69
70	2		50			22	$self->{verbose} \|\|= 0;
71
72	2	100				9	unless ($self->parse($file)) {
73	1					2	$ERROR = $self->{error};
74	1					12	return;
75							}
76
77	1					8	return $self;
78							}
79
80
81							##########################################################################
82							# Define the messy regexen up here
83							##########################################################################
84
85							my $top_level_comment = qr@//.*\n@;
86
87							my $key_table_re = qr/ < \s* < # "< <"
88							$ a=c $ > # "(a=c)>"
89							(?> ([^>]) ) > \s # Grab anything that's not ">"
90							/sx;
91
92							my $value_table_re = qr/ < ( .?\) )> \s /sx;
93
94							my $table_re = qr/ \{ -? # "{" or "{-"
95							[\da-f]+ : # hex, ":"
96							(?> .*?\{ ) # Eat up to a {...
97							((?> .*?\} ) # and then the closing }...
98							(?> .*?\} )) # Finally, grab the table section
99							\s* /six;
100
101							my $row_re = qr/ ( (?> \[ [^]]* \] # "["..."]"
102							\s*)+ ) # Perhaps repeated many times
103							/sx;
104
105							my $section_begin_re = qr/ \@\$\$\{ # "@$${"
106							([\dA-F]+) # hex
107							\{\@ \s* # "{@"
108							/six;
109
110
111							my $crlf = qr/\x0a\x0d\|\x0d\x0a\|\x0a\|\x0d/; # We are liberal in what we accept.
112							# But then, so is a six dollar whore.
113
114							=head2 parse
115
116							Internal method to parse the file. Obviously.
117
118							=cut
119
120							sub parse {
121	2			2	1	6	my ($self, $file) = @_;
122
123	2	50				9	$self->{since} = ($self->{age} ? time() - $self->{age} : 0);
124	2					6	$self->{section} = "top level";
125	2					5	$self->{section_end_re} = undef;
126
127
128
129							##########################################################################
130							# Read in the file.
131							##########################################################################
132
133	2					6	local $/ = undef;
134	2					7	local *IN;
135
136	2					4	$self->{file} = $file;
137	2					5	$self->{total} = 0;
138	2					4	$self->{skipped} = 0;
139
140	2	100				80	unless (open (IN, $file)) {
141	1					15	$self->{error} = "Couldn't open $file : $!";
142	1					8	return;
143							}
144
145	1					6	$self->debug("reading ...",1);
146	1					27	my $body = ;
147	1					7	close IN;
148
149
150	1					95	$body =~ s/($crlf)/\n/gs; # Windows Mozilla uses \r\n
151							# Presumably Mac Mozilla is similarly dumb
152
153	1					6	$body =~ s/\\\\/\$5C/gs; # Sometimes backslash is quoted with a
154							# backslash; convert to hex.
155	1					14	$body =~ s/\\\)/\$29/gs; # close-paren is quoted with a backslash;
156							# convert to hex.
157	1					12	$body =~ s/\\\n//gs; # backslash at end of line is continuation.
158
159							##########################################################################
160							# Figure out what we're looking at, and parse it.
161							##########################################################################
162
163	1					4	$self->debug("parsing ...",1);
164	1					3	pos($body) = 0;
165	1					4	my $length = length($body);
166
167	1					9	while( pos($body) < $length ) {
168	10					16	my $section_end_re = $self->{section_end_re};
169							# Key table
170
171	10	100	66			289	if ( $body =~ m/\G$key_table_re/gc ) {
		100
		100
		50
		100
		100
		50
172	1	50				6	return unless $self->parse_key_table($1);
173
174							# Values
175							} elsif ( $body =~ m/\G$value_table_re/gco ) {
176	2	50				8	return unless $self->parse_value_table($1);
177
178							# Table
179							} elsif ( $body =~ m/\G$table_re/gco ) {
180	2	50				8	return unless $self->parse_table($1);
181
182							# Rows (-> table)
183							} elsif ( $body =~ m/\G$row_re/gco ) {
184	0	0				0	return unless $self->parse_table($1);
185
186							# Section begin
187							} elsif ( $body =~ m/\G$section_begin_re/gco ) {
188	2					4	my $section = $1;
189	2					43	$self->{section_end_re} = qr/\@\$\$\}$section\}\@\s*/s;
190	2					8	$self->{section} = $section;
191							# Section end
192							} elsif ( $section_end_re && $body =~ m/\G$section_end_re/gc ) {
193	2					5	$self->{section_end_re} = undef;
194	2					11	$self->{section} = "top level";
195
196							# Comment
197							} elsif ( $body =~ m/\G$top_level_comment/gco ) {
198							#no-op
199
200							} else {
201							# $body =~ m/\G (.{0,300}) /gcsx; print "<$1>\n";
202	0					0	return $self->error($self->{section}.": Cannot parse");
203							}
204							}
205
206	1	50				4	if($self->{section_end_re}) {
207	0					0	return $self->error("Unterminated section ".$self->{section});
208							}
209
210
211	1					7	$self->debug("sorting...",1);
212
213	7					29	my @entries = map { File::Mork::Entry->new(%$_) }
214							sort { $b->{LastVisitDate} <=>
215	1					2	$a->{LastVisitDate} } values(%{$self->{row_hash}});
	13					20
	1					11
216
217	1					8	$self->debug("done! (".$self->{total}." total, ".$self->{skipped}." skipped)",1);
218
219	1					3	for (qw(key_table val_table row_hash total skipped)) {
220	5					26	$self->{$_} = undef;
221							}
222
223	1					4	$self->{entries} = \@entries;
224	1					11	return 1;
225							}
226
227							=head2 entries
228
229							Return a list of C objects sorted by B.
230
231							=cut
232
233							sub entries {
234	1			1	1	3	return @{$_[0]->{entries}};
	1					13
235							}
236
237
238							##########################################################################
239							# parse a row and column table
240							##########################################################################
241
242							sub parse_table {
243	2			2	0	8	my($self, $table_part) = (@_);
244
245	2					5	$self->debug("",3);
246
247							# Assumption: no relevant spaces in values in this section
248	2					18	$table_part =~ s/\s+//g;
249
250							# print $table_part; #exit(0);
251
252							# Grab each complete [...] block
253	2					12	while( $table_part =~ m/\G [^[]* \[ # find a "["
254							( [^]]+ ) \] # capture up to "]"
255							/gcx ) {
256	7					16	$_ = $1;
257
258	7					42	my ($id, @cells) = split (m/[()]+/s);
259
260	7	50				21	next unless scalar(@cells);
261
262							# Trim junk
263	7					12	$id =~ s/^-//;
264	7					12	$id =~ s/:.*//;
265
266	7	50				31	my %hash = ($self->{row_hash}->{$id}) ? %{$self->{row_hash}->{$id}} :
	0					0
267							( 'ID' => $id,
268							'LastVisitDate' => 0 );
269
270	7					13	foreach (@cells) {
271	35	50				71	next unless $_;
272
273	35					129	my ($keyi, $which, $vali) =
274							m/^\^ ([-\dA-F]+)
275							([\^=])
276							(.*)
277							$/xi;
278
279	35	50				76	return $self->error("unparsable cell: $_\n") unless defined ($vali);
280
281							# If the key isn't in the key table, ignore it
282							#
283	35					66	my $key = $self->{key_table}->{$keyi};
284	35	50				69	next unless defined($key);
285
286							my $val = ($which eq '='
287							? $vali
288	35	100				78	: $self->{val_table}->{$vali});
289
290	35	100	100			140	if ($key eq 'LastVisitDate' \|\| $key eq 'FirstVisitDate') {
291	12					23	$val = int ($val / 1000000); # we don't need milliseconds, dude.
292							}
293
294	35					92	$hash{$key} = $val;
295							#print "$id: $key -> $val\n";
296							}
297
298
299	7	50	0			20	if ($self->{age} && ($hash{LastVisitDate} \|\| $self->{since}) < $self->{since}) {
			33
300	0					0	$self->debug("skipping old: $hash{LastVisitDate} $hash{URL}",3);
301	0					0	$self->{skipped}++;
302	0					0	next;
303							}
304
305	7					12	$self->{total}++;
306	7					44	$self->{row_hash}->{$id} = \%hash;
307							}
308	2					11	return 1;
309							}
310
311
312							##########################################################################
313							# parse a values table
314							##########################################################################
315
316							sub parse_value_table {
317	2			2	0	7	my($self, $val_part) = (@_);
318
319	2	50				7	return 1 unless $val_part;
320
321	2					57	my @pairs = split (m/$([^$]+)\)/, $val_part);
322	2					4	$val_part = undef;
323
324	2					4	$self->debug("",3);
325
326	2					4	foreach (@pairs) {
327	58	100				151	next unless (m/[^\s]/s);
328	29					112	my ($key, $val) = m/([\dA-F])[\t\n ]=[\t\n ](.)/i;
329
330	29	50				67	if (! defined ($val)) {
331	0					0	$self->debug($self->{section}.": unparsable val: $_");
332	0					0	next;
333							}
334
335							# recognize the byte order of UTF-16 encoding
336	29	100	66			83	if (! defined ($self->{byte_order}) && $val =~ m/(?:BE\|LE)$/) {
337	1					3	$self->{byte_order} = $val;
338							}
339
340							# Assume that URLs and LastVisited are never hexilated; so
341							# don't bother unhexilating if we won't be using Name, etc.
342	29	100				74	if($val =~ m/\$/) {
343	6	50				13	if ( defined $self->{byte_order} ) {
344	6					12	my $encoding = 'UTF-16' . $self->{byte_order};
345	6					26	$val =~ s/\$([\dA-F]{2})/chr(hex($1))/ge;
	319					858
346	6					21	$val = encode_utf8(decode($encoding, $val));
347							}
348							else {
349							# Approximate wchar_t -> ASCII and remove NULs
350	0					0	$val =~ s/\$00//g; # faster if we remove these first
351	0					0	$val =~ s/\$([\dA-F]{2})/chr(hex($1))/ge;
	0					0
352							}
353							}
354
355	29					333163	$self->{val_table}->{$key} = $val;
356	29					111	$self->debug($self->{section}.": val $key = \"$val\"", 3);
357							}
358	2					23	return 1;
359							}
360
361
362							##########################################################################
363							# parse a key table
364							##########################################################################
365
366							sub parse_key_table {
367	1			1	0	23	my ($self, $key_table) = (@_);
368
369	1					4	$self->debug("",3);
370
371	1					6	$key_table =~ s@\s+//.*$@@gm;
372
373	1					20	my @pairs = split (m/$([^$]+)\)/s, $key_table);
374	1					2	$key_table = undef;
375
376	1					3	foreach (@pairs) {
377	26	100				67	next unless (m/[^\s]/s);
378	13					49	my ($key, $val) = m/([\dA-F]+)\s=\s(.*)/i;
379	13	50				29	return $self->error ("unparsable key: $_") unless defined ($val);
380
381							# savie the other fields that we aren't interested in.
382	13					36	$self->{key_table}->{$key} = $val;
383	13					41	$self->debug($self->{section}.": key $key = \"$val\"",3);
384							}
385	1					8	return 1;
386							}
387
388
389							=head2 error
390
391							Internal method to set the internal error message
392
393							=cut
394
395							sub error {
396	0			0	1	0	my ($self, $message) = @_;
397	0					0	$self->{error} = $self->{file}.": $message";
398	0					0	return undef;
399							}
400
401							=head2 debug
402
403							Internal method to print out a debug message if it's a higher priority
404							than the the current verbosity level.
405
406							=cut
407
408							sub debug {
409	51			51	1	79	my ($self, $message, $level) = @_;
410	51		50			101	$level \|\|= 0;
411	51	50				163	return if $self->{verbose} < $level;
412	0	0				0	print STDERR "".(($message eq "")? "\n" : $self->{file}.": $message\n" );
413							}
414
415
416							=head1 THE UGLY TRUTH LAID BARE
417
418							I
419
420							In Netscape Navigator 1.0 through 4.0, the history.db file was just a
421							Berkeley DBM file. You could trivially bind to it from Perl, and pull
422							out the URLs and last-access time. In Mozilla, this has been replaced
423							with a "Mork" database for which no tools exist.
424
425							Let me make it clear that McCusker is a complete barking lunatic.
426							This is just about the stupidest file format I've ever seen.
427
428							http://www.mozilla.org/mailnews/arch/mork/primer.txt
429							http://jwz.livejournal.com/312657.html
430							http://www.jwz.org/doc/mailsum.html
431							http://bugzilla.mozilla.org/show_bug.cgi?id=241438
432
433							In brief, let's count its sins:
434
435							=over 4
436
437							=item
438
439							Two different numerical namespaces that overlap.
440
441							=item
442
443							It can't decide what kind of character-quoting syntax to use:
444							Backslash? Hex encoding with dollar-sign?
445
446							=item
447
448							C++ line comments are allowed sometimes, but sometimes // is just a
449							pair of characters in a URL.
450
451							=item
452
453							It goes to all this serious compression effort (two different
454							string-interning hash tables) and then writes out Unicode strings
455							without using UTF-8: writes out the unpacked wchar_t characters!
456
457							=item
458
459							Worse, it hex-encodes each wchar_t with a 3-byte encoding, meaning the
460							file size will be 3x or 6x (depending on whether whchar_t is 2 bytes or
461							4 bytes.)
462
463							=item
464
465							It masquerades as a "textual" file format when in fact it's just
466							another binary-blob file, except that it represents all its magic
467							numbers in ASCII. It's not human-readable, it's not hand-editable, so
468							the only benefit there is to the fact that it uses short lines and
469							doesn't use binary characters is that it makes the file bigger. Oh wait,
470							my mistake, that isn't actually a benefit at all.
471
472							=back
473
474							Pure comedy.
475
476
477							=head1 AUTHOR
478
479							Module-ised by Simon Wistow
480
481							based on
482
483							http://www.jwz.org/hacks/mork.pl
484
485							Created: 3-Mar-2004 by Jamie Zawinski, Anonymous, and Jacob Post.
486
487
488							=head1 COPYRIGHT
489
490							Copyright � 2004 Jamie Zawinski
491
492							=head1 LICENSE
493
494							Permission to use, copy, modify, distribute, and sell this software and its
495							documentation for any purpose is hereby granted without fee, provided that
496							the above copyright notice appear in all copies and that both that
497							copyright notice and this permission notice appear in supporting
498							documentation. No representations are made about the suitability of this
499							software for any purpose. It is provided "as is" without express or
500							implied warranty.
501
502							=head1 BUGS
503
504							Might be a bit memory heavy? Could do with an iterator interface.
505
506							Can't write Mork dbs.
507
508							=head1 SEE ALSO
509
510							http://www.livejournal.com/users/jwz/312657.html
511
512							http://www.erys.org/resume/netscape/mork/jwz.html
513
514							=cut
515
516
517							package File::Mork::Entry;
518	3			3		25	use strict;
	3					6
	3					85
519	3			3		15	use vars qw($AUTOLOAD);
	3					10
	3					541
520
521							=head1 NAME
522
523							File::Mork::Entry - an single entry in a mork DB
524
525							=head1 METHODS
526
527							All methods except C take an optional argument to set them.
528
529							=head2 new <%opts>
530
531							blesses C<%opts> into the class File::Mork::Entry
532
533							=cut
534
535							sub new {
536	7			7		25	my ($class, %self) = @_;
537	7					23	return bless \%self, $class;
538							}
539
540
541							=head2 ID
542
543							The internal id of the entry
544
545							=head2 URL
546
547							The url visited
548
549							=head2 NAME
550
551							The name of the url visited
552
553							=head2 Hostname
554
555							The hostname of the url visited
556
557							=head2 FirstVisitDate
558
559							The first time this url was visited as a C
560
561							=head2 LastVisitDate
562
563							The last time this url was visited as a C
564
565							=head2 Hidden
566
567							Whether this URL is hidden from the history list or not
568
569							=head2 VisitCount
570
571							The number of times this url has been visited
572
573							=head2 ByteOrder
574
575							The byte order - this is associated with ID number 1.
576
577							=cut
578
579				0			sub DESTROY { }
580
581							sub AUTOLOAD {
582	14			14		3023	my $self = shift;
583	14					33	my $attr = $AUTOLOAD;
584	14					88	$attr =~ s/.*:://;
585
586	14	50				61	$self->{$attr} = $_[0] if @_;
587	14					105	return $self->{$attr};
588							}
589
590							1;