File Coverage

blib/lib/Parse/IRCLog.pm
Criterion Covered Total %
statement 51 51 100.0
branch 14 16 87.5
condition 3 3 100.0
subroutine 11 11 100.0
pod 6 6 100.0
total 85 87 97.7


line stmt bran cond sub pod time code
1 4     4   3527 use strict;
  4         10  
  4         163  
2 4     4   20 use warnings;
  4         7  
  4         224  
3             package Parse::IRCLog;
4             # ABSTRACT: parse internet relay chat logs
5             $Parse::IRCLog::VERSION = '1.106';
6 4     4   22 use Carp ();
  4         9  
  4         59  
7 4     4   2081 use Parse::IRCLog::Result;
  4         10  
  4         97  
8 4     4   3386 use Symbol ();
  4         3829  
  4         2692  
9              
10             # =head1 SYNOPSIS
11             #
12             # use Parse::IRCLog;
13             #
14             # $result = Parse::IRCLog->parse("perl-2004-02-01.log");
15             #
16             # my %to_print = ( msg => 1, action => 1 );
17             #
18             # for ($result->events) {
19             # next unless $to_print{ $_->{type} };
20             # print "$_->{nick}: $_->{text}\n";
21             # }
22             #
23             # =head1 DESCRIPTION
24             #
25             # This module provides a simple framework to parse IRC logs in arbitrary formats.
26             #
27             # A parser has a set of regular expressions for matching different events that
28             # occur in an IRC log, such as "msg" and "action" events. Each line in the log
29             # is matched against these rules and a result object, representing the event
30             # stream, is returned.
31             #
32             # The rule set, described in greated detail below, can be customized by
33             # subclassing Parse::IRCLog. In this way, Parse::IRCLog can provide a generic
34             # interface for log analysis across many log formats, including custom formats.
35             #
36             # Normally, the C method is used to create a result set without storing a
37             # parser object, but a parser may be created and reused.
38             #
39             # =method new
40             #
41             # This method constructs a new parser (with C<< $class->construct >>) and
42             # initializes it (with C<< $obj->init >>). Construction and initialization are
43             # separated for ease of subclassing initialization for future pipe dreams like
44             # guessing what ruleset to use.
45             #
46             # =cut
47              
48             sub new {
49 5     5 1 1222 my $class = shift;
50 5 100       182 Carp::croak "new is a class method" if ref $class;
51              
52 4         14 $class->construct->init;
53             }
54              
55             # =method construct
56             #
57             # The parser constructor just returns a new, empty parser object. It should be a
58             # blessed hashref.
59             #
60             # =cut
61              
62 4     4 1 16 sub construct { bless {} => shift; }
63              
64             # =method init
65             #
66             # The initialization method configures the object, loading its ruleset.
67             #
68             # =cut
69              
70             sub init {
71 4     4 1 8 my $self = shift;
72 4         10 $self->{patterns} = $self->patterns;
73 4         21 $self;
74             }
75              
76             # =method patterns
77             #
78             # This method returns a reference to a hash of regular expressions, which are
79             # used to parse the logs. Only a few, so far, are required by the parser,
80             # although internally a few more are used to break down the task of parsing
81             # lines.
82             #
83             # C matches an action; that is, the result of /ME in IRC. It should
84             # return the following matches:
85             #
86             # $1 - timestamp
87             # $2 - nick prefix
88             # $3 - nick
89             # $4 - the action
90             #
91             # C matches a message; that is, the result of /MSG (or "normal talking") in
92             # IRC. It should return the following matches:
93             #
94             # $1 - timestamp
95             # $2 - nick prefix
96             # $3 - nick
97             # $3 - channel
98             # $5 - the action
99             #
100             # Read the source for a better idea as to how these regexps break down. Oh, and
101             # for what it's worth, the default patterns are based on my boring, default irssi
102             # configuration. Expect more rulesets to be included in future distributions.
103             #
104             # =cut
105              
106             sub patterns {
107 17     17 1 921 my ($self) = @_;
108              
109 17 100 100     222 return $self->{patterns} if ref $self and defined $self->{patterns};
110              
111 5         9 my $p;
112              
113             # nick and chan are (mostly) specified in RFC2812, section 2.3.1
114              
115 5         22 my $letter = qr/[\x41-\x5A\x61-\x7A]/; # A-Z / a-z
116 5         14 my $digit = qr/[\x30-\x39]/; # 0-9
117 5         24 my $special = qr/[\x5B-\x60\x7B-\x7D]/; # [\]^_`{|}
118              
119 5         191 $p->{nick} = qr/( (?: $letter | $special )
120             (?: $letter | $digit | $special | - )* )/x;
121              
122 5         18 my $channelid = qr/[A-Z0-9]{5}/;
123 5         15 my $chanstring = qr/[^\x00\a\r\n ,:]*/;
124              
125 5         214 $p->{chan} = qr/( (?: \# | \+ | !$channelid | & ) $chanstring
126             (?: :$chanstring )? )/x;
127              
128             # the other regexes are more relevant to the way irssi formats logs
129              
130 5         297 $p->{nick_container} = qr/
131             <
132             \s*
133             ([+%@])?
134             \s*
135             $p->{nick}
136             (?:
137             :
138             $p->{chan}
139             )?
140             \s*
141             >
142             /x;
143              
144 5         20 $p->{timestamp} = qr/\[?(\d\d:\d\d(?::\d\d)?)?\]?/;
145              
146 5         27 $p->{action_leader} = qr/\*/;
147              
148 5         415 $p->{msg} = qr/
149             $p->{timestamp}
150             \s*
151             $p->{nick_container}
152             \s+
153             (.+)
154             /x;
155              
156 5         257 $p->{action} = qr/
157             $p->{timestamp}
158             \s*
159             $p->{action_leader}
160             \s+
161             ([%@])?
162             \s*
163             $p->{nick}
164             \s
165             (.+)
166             /x;
167              
168 5 100       38 $self->{patterns} = $p if ref $self;
169 5         32 $p;
170             }
171              
172             # =method parse
173             #
174             # my $result = $parser->parse($file)
175             #
176             # This method parses the file named and returns a Parse::IRCLog::Result object
177             # representing the results. The C method can be called on a parser object
178             # or on the class. If called on the class, a parser will be instantiated for the
179             # method call and discarded when C returns.
180             #
181             # =cut
182              
183             sub parse {
184 1     1 1 890 my $self = shift;
185 1 50       7 $self = $self->new unless ref $self;
186              
187 1         4 my $symbol = Symbol::gensym;
188 1 50       64 open $symbol, "<", $_[0] or Carp::croak "couldn't open $_[0]: $!";
189              
190 1         2 my @events;
191 1         27 push @events, $self->parse_line($_) while (<$symbol>);
192 1         9 Parse::IRCLog::Result->new(@events);
193             }
194              
195             # =method parse_line
196             #
197             # my $info = $parser->parse_line($line);
198             #
199             # This method is used internally by C to turn each line into an event.
200             # While it could someday be made slick, it's adequate for now. It attempts to
201             # match each line against the required patterns from the C result and
202             # if successful returns a hashref describing the event.
203             #
204             # If no match can be found, an "unknown" event is returned.
205             #
206             # =cut
207              
208             sub parse_line {
209 9     9 1 16 my ($self, $line) = @_;
210 9 100       22 if ($line) {
211 7 100       14 return { type => 'msg', timestamp => $1, nick_prefix => $2, nick => $3, text => $5 }
212             if $line =~ $self->patterns->{msg};
213 5 100       28 return { type => 'action', timestamp => $1, nick_prefix => $2, nick => $3, text => $4 }
214             if $line =~ $self->patterns->{action};
215             }
216 5         40 return { type => 'unknown', text => $line };
217             }
218              
219             # =head1 TODO
220             #
221             # Write a few example subclasses for common log formats.
222             #
223             # Add a few more default event types: join, part, nick. Others?
224             #
225             # Possibly make the C sub an module, to allow subclassing to override
226             # only one or two patterns. For example, to use the default C pattern but
227             # override the C or C. This sounds like a very
228             # good idea, actually, now that I write it down.
229             #
230             # =cut
231              
232             1;
233              
234             __END__