File Coverage

blib/lib/Parse/IRCLog.pm

Criterion	Covered	Total	%
statement	51	51	100.0
branch	14	16	87.5
condition	3	3	100.0
subroutine	11	11	100.0
pod	6	6	100.0
total	85	87	97.7

line	stmt	bran	cond	sub	pod	time	code
1	4			4		3527	use strict;
	4					10
	4					163
2	4			4		20	use warnings;
	4					7
	4					224
3							package Parse::IRCLog;
4							# ABSTRACT: parse internet relay chat logs
5							$Parse::IRCLog::VERSION = '1.106';
6	4			4		22	use Carp ();
	4					9
	4					59
7	4			4		2081	use Parse::IRCLog::Result;
	4					10
	4					97
8	4			4		3386	use Symbol ();
	4					3829
	4					2692
9
10							# =head1 SYNOPSIS
11							#
12							# use Parse::IRCLog;
13							#
14							# $result = Parse::IRCLog->parse("perl-2004-02-01.log");
15							#
16							# my %to_print = ( msg => 1, action => 1 );
17							#
18							# for ($result->events) {
19							# next unless $to_print{ $_->{type} };
20							# print "$_->{nick}: $_->{text}\n";
21							# }
22							#
23							# =head1 DESCRIPTION
24							#
25							# This module provides a simple framework to parse IRC logs in arbitrary formats.
26							#
27							# A parser has a set of regular expressions for matching different events that
28							# occur in an IRC log, such as "msg" and "action" events. Each line in the log
29							# is matched against these rules and a result object, representing the event
30							# stream, is returned.
31							#
32							# The rule set, described in greated detail below, can be customized by
33							# subclassing Parse::IRCLog. In this way, Parse::IRCLog can provide a generic
34							# interface for log analysis across many log formats, including custom formats.
35							#
36							# Normally, the C method is used to create a result set without storing a
37							# parser object, but a parser may be created and reused.
38							#
39							# =method new
40							#
41							# This method constructs a new parser (with C<< $class->construct >>) and
42							# initializes it (with C<< $obj->init >>). Construction and initialization are
43							# separated for ease of subclassing initialization for future pipe dreams like
44							# guessing what ruleset to use.
45							#
46							# =cut
47
48							sub new {
49	5			5	1	1222	my $class = shift;
50	5	100				182	Carp::croak "new is a class method" if ref $class;
51
52	4					14	$class->construct->init;
53							}
54
55							# =method construct
56							#
57							# The parser constructor just returns a new, empty parser object. It should be a
58							# blessed hashref.
59							#
60							# =cut
61
62	4			4	1	16	sub construct { bless {} => shift; }
63
64							# =method init
65							#
66							# The initialization method configures the object, loading its ruleset.
67							#
68							# =cut
69
70							sub init {
71	4			4	1	8	my $self = shift;
72	4					10	$self->{patterns} = $self->patterns;
73	4					21	$self;
74							}
75
76							# =method patterns
77							#
78							# This method returns a reference to a hash of regular expressions, which are
79							# used to parse the logs. Only a few, so far, are required by the parser,
80							# although internally a few more are used to break down the task of parsing
81							# lines.
82							#
83							# C matches an action; that is, the result of /ME in IRC. It should
84							# return the following matches:
85							#
86							# $1 - timestamp
87							# $2 - nick prefix
88							# $3 - nick
89							# $4 - the action
90							#
91							# C matches a message; that is, the result of /MSG (or "normal talking") in
92							# IRC. It should return the following matches:
93							#
94							# $1 - timestamp
95							# $2 - nick prefix
96							# $3 - nick
97							# $3 - channel
98							# $5 - the action
99							#
100							# Read the source for a better idea as to how these regexps break down. Oh, and
101							# for what it's worth, the default patterns are based on my boring, default irssi
102							# configuration. Expect more rulesets to be included in future distributions.
103							#
104							# =cut
105
106							sub patterns {
107	17			17	1	921	my ($self) = @_;
108
109	17	100	100			222	return $self->{patterns} if ref $self and defined $self->{patterns};
110
111	5					9	my $p;
112
113							# nick and chan are (mostly) specified in RFC2812, section 2.3.1
114
115	5					22	my $letter = qr/[\x41-\x5A\x61-\x7A]/; # A-Z / a-z
116	5					14	my $digit = qr/[\x30-\x39]/; # 0-9
117	5					24	my $special = qr/[\x5B-\x60\x7B-\x7D]/; # [\]^_`{\|}
118
119	5					191	$p->{nick} = qr/( (?: $letter \| $special )
120							(?: $letter \| $digit \| $special \| - )* )/x;
121
122	5					18	my $channelid = qr/[A-Z0-9]{5}/;
123	5					15	my $chanstring = qr/[^\x00\a\r\n ,:]*/;
124
125	5					214	$p->{chan} = qr/( (?: \# \| \+ \| !$channelid \| & ) $chanstring
126							(?: :$chanstring )? )/x;
127
128							# the other regexes are more relevant to the way irssi formats logs
129
130	5					297	$p->{nick_container} = qr/
131							<
132							\s*
133							([+%@])?
134							\s*
135							$p->{nick}
136							(?:
137							:
138							$p->{chan}
139							)?
140							\s*
141							>
142							/x;
143
144	5					20	$p->{timestamp} = qr/\[?(\d\d:\d\d(?::\d\d)?)?\]?/;
145
146	5					27	$p->{action_leader} = qr/\*/;
147
148	5					415	$p->{msg} = qr/
149							$p->{timestamp}
150							\s*
151							$p->{nick_container}
152							\s+
153							(.+)
154							/x;
155
156	5					257	$p->{action} = qr/
157							$p->{timestamp}
158							\s*
159							$p->{action_leader}
160							\s+
161							([%@])?
162							\s*
163							$p->{nick}
164							\s
165							(.+)
166							/x;
167
168	5	100				38	$self->{patterns} = $p if ref $self;
169	5					32	$p;
170							}
171
172							# =method parse
173							#
174							# my $result = $parser->parse($file)
175							#
176							# This method parses the file named and returns a Parse::IRCLog::Result object
177							# representing the results. The C method can be called on a parser object
178							# or on the class. If called on the class, a parser will be instantiated for the
179							# method call and discarded when C returns.
180							#
181							# =cut
182
183							sub parse {
184	1			1	1	890	my $self = shift;
185	1	50				7	$self = $self->new unless ref $self;
186
187	1					4	my $symbol = Symbol::gensym;
188	1	50				64	open $symbol, "<", $_[0] or Carp::croak "couldn't open $_[0]: $!";
189
190	1					2	my @events;
191	1					27	push @events, $self->parse_line($_) while (<$symbol>);
192	1					9	Parse::IRCLog::Result->new(@events);
193							}
194
195							# =method parse_line
196							#
197							# my $info = $parser->parse_line($line);
198							#
199							# This method is used internally by C to turn each line into an event.
200							# While it could someday be made slick, it's adequate for now. It attempts to
201							# match each line against the required patterns from the C result and
202							# if successful returns a hashref describing the event.
203							#
204							# If no match can be found, an "unknown" event is returned.
205							#
206							# =cut
207
208							sub parse_line {
209	9			9	1	16	my ($self, $line) = @_;
210	9	100				22	if ($line) {
211	7	100				14	return { type => 'msg', timestamp => $1, nick_prefix => $2, nick => $3, text => $5 }
212							if $line =~ $self->patterns->{msg};
213	5	100				28	return { type => 'action', timestamp => $1, nick_prefix => $2, nick => $3, text => $4 }
214							if $line =~ $self->patterns->{action};
215							}
216	5					40	return { type => 'unknown', text => $line };
217							}
218
219							# =head1 TODO
220							#
221							# Write a few example subclasses for common log formats.
222							#
223							# Add a few more default event types: join, part, nick. Others?
224							#
225							# Possibly make the C sub an module, to allow subclassing to override
226							# only one or two patterns. For example, to use the default C pattern but
227							# override the C or C. This sounds like a very
228							# good idea, actually, now that I write it down.
229							#
230							# =cut
231
232							1;
233
234							__END__