| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
4
|
|
|
4
|
|
3527
|
use strict; |
|
|
4
|
|
|
|
|
10
|
|
|
|
4
|
|
|
|
|
163
|
|
|
2
|
4
|
|
|
4
|
|
20
|
use warnings; |
|
|
4
|
|
|
|
|
7
|
|
|
|
4
|
|
|
|
|
224
|
|
|
3
|
|
|
|
|
|
|
package Parse::IRCLog; |
|
4
|
|
|
|
|
|
|
# ABSTRACT: parse internet relay chat logs |
|
5
|
|
|
|
|
|
|
$Parse::IRCLog::VERSION = '1.106'; |
|
6
|
4
|
|
|
4
|
|
22
|
use Carp (); |
|
|
4
|
|
|
|
|
9
|
|
|
|
4
|
|
|
|
|
59
|
|
|
7
|
4
|
|
|
4
|
|
2081
|
use Parse::IRCLog::Result; |
|
|
4
|
|
|
|
|
10
|
|
|
|
4
|
|
|
|
|
97
|
|
|
8
|
4
|
|
|
4
|
|
3386
|
use Symbol (); |
|
|
4
|
|
|
|
|
3829
|
|
|
|
4
|
|
|
|
|
2692
|
|
|
9
|
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
# =head1 SYNOPSIS |
|
11
|
|
|
|
|
|
|
# |
|
12
|
|
|
|
|
|
|
# use Parse::IRCLog; |
|
13
|
|
|
|
|
|
|
# |
|
14
|
|
|
|
|
|
|
# $result = Parse::IRCLog->parse("perl-2004-02-01.log"); |
|
15
|
|
|
|
|
|
|
# |
|
16
|
|
|
|
|
|
|
# my %to_print = ( msg => 1, action => 1 ); |
|
17
|
|
|
|
|
|
|
# |
|
18
|
|
|
|
|
|
|
# for ($result->events) { |
|
19
|
|
|
|
|
|
|
# next unless $to_print{ $_->{type} }; |
|
20
|
|
|
|
|
|
|
# print "$_->{nick}: $_->{text}\n"; |
|
21
|
|
|
|
|
|
|
# } |
|
22
|
|
|
|
|
|
|
# |
|
23
|
|
|
|
|
|
|
# =head1 DESCRIPTION |
|
24
|
|
|
|
|
|
|
# |
|
25
|
|
|
|
|
|
|
# This module provides a simple framework to parse IRC logs in arbitrary formats. |
|
26
|
|
|
|
|
|
|
# |
|
27
|
|
|
|
|
|
|
# A parser has a set of regular expressions for matching different events that |
|
28
|
|
|
|
|
|
|
# occur in an IRC log, such as "msg" and "action" events. Each line in the log |
|
29
|
|
|
|
|
|
|
# is matched against these rules and a result object, representing the event |
|
30
|
|
|
|
|
|
|
# stream, is returned. |
|
31
|
|
|
|
|
|
|
# |
|
32
|
|
|
|
|
|
|
# The rule set, described in greater detail below, can be customized by |
|
33
|
|
|
|
|
|
|
# subclassing Parse::IRCLog. In this way, Parse::IRCLog can provide a generic |
|
34
|
|
|
|
|
|
|
# interface for log analysis across many log formats, including custom formats. |
|
35
|
|
|
|
|
|
|
# |
|
36
|
|
|
|
|
|
|
# Normally, the C<parse> method is used to create a result set without storing a |
|
37
|
|
|
|
|
|
|
# parser object, but a parser may be created and reused. |
|
38
|
|
|
|
|
|
|
# |
|
39
|
|
|
|
|
|
|
# =method new |
|
40
|
|
|
|
|
|
|
# |
|
41
|
|
|
|
|
|
|
# This method constructs a new parser (with C<< $class->construct >>) and |
|
42
|
|
|
|
|
|
|
# initializes it (with C<< $obj->init >>). Construction and initialization are |
|
43
|
|
|
|
|
|
|
# separated for ease of subclassing initialization for future pipe dreams like |
|
44
|
|
|
|
|
|
|
# guessing what ruleset to use. |
|
45
|
|
|
|
|
|
|
# |
|
46
|
|
|
|
|
|
|
# =cut |
|
47
|
|
|
|
|
|
|
|
|
48
|
|
|
|
|
|
|
# Class-method constructor: builds an empty parser via construct and hands it
# to init, returning whatever init returns.  Croaks when invoked on an
# instance instead of a class name.
sub new {
    my ($class) = @_;

    if (ref $class) {
        Carp::croak "new is a class method";
    }

    # Construction and initialization are separate steps so that a subclass
    # can override either one independently.
    my $parser = $class->construct;
    return $parser->init;
}
|
54
|
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
# =method construct |
|
56
|
|
|
|
|
|
|
# |
|
57
|
|
|
|
|
|
|
# The parser constructor just returns a new, empty parser object. It should be a |
|
58
|
|
|
|
|
|
|
# blessed hashref. |
|
59
|
|
|
|
|
|
|
# |
|
60
|
|
|
|
|
|
|
# =cut |
|
61
|
|
|
|
|
|
|
|
|
62
|
4
|
|
|
4
|
1
|
16
|
# Return a new, empty parser object: an empty hashref blessed into the
# invocant's class.  No configuration happens here; see init for that.
sub construct {
    my ($class) = @_;
    return bless {}, $class;
}
|
63
|
|
|
|
|
|
|
|
|
64
|
|
|
|
|
|
|
# =method init |
|
65
|
|
|
|
|
|
|
# |
|
66
|
|
|
|
|
|
|
# The initialization method configures the object, loading its ruleset. |
|
67
|
|
|
|
|
|
|
# |
|
68
|
|
|
|
|
|
|
# =cut |
|
69
|
|
|
|
|
|
|
|
|
70
|
|
|
|
|
|
|
# Configure a freshly constructed parser: fetch the ruleset from the patterns
# method and store it under the object's "patterns" key.  Returns the object
# itself so the call can be chained.
sub init {
    my ($self) = @_;

    my $ruleset = $self->patterns;
    $self->{patterns} = $ruleset;

    return $self;
}
|
75
|
|
|
|
|
|
|
|
|
76
|
|
|
|
|
|
|
# =method patterns |
|
77
|
|
|
|
|
|
|
# |
|
78
|
|
|
|
|
|
|
# This method returns a reference to a hash of regular expressions, which are |
|
79
|
|
|
|
|
|
|
# used to parse the logs. Only a few, so far, are required by the parser, |
|
80
|
|
|
|
|
|
|
# although internally a few more are used to break down the task of parsing |
|
81
|
|
|
|
|
|
|
# lines. |
|
82
|
|
|
|
|
|
|
# |
|
83
|
|
|
|
|
|
|
# C<action> matches an action; that is, the result of /ME in IRC. It should |
|
84
|
|
|
|
|
|
|
# return the following matches: |
|
85
|
|
|
|
|
|
|
# |
|
86
|
|
|
|
|
|
|
# $1 - timestamp |
|
87
|
|
|
|
|
|
|
# $2 - nick prefix |
|
88
|
|
|
|
|
|
|
# $3 - nick |
|
89
|
|
|
|
|
|
|
# $4 - the action |
|
90
|
|
|
|
|
|
|
# |
|
91
|
|
|
|
|
|
|
# C<msg> matches a message; that is, the result of /MSG (or "normal talking") in |
|
92
|
|
|
|
|
|
|
# IRC. It should return the following matches: |
|
93
|
|
|
|
|
|
|
# |
|
94
|
|
|
|
|
|
|
# $1 - timestamp |
|
95
|
|
|
|
|
|
|
# $2 - nick prefix |
|
96
|
|
|
|
|
|
|
# $3 - nick |
|
97
|
|
|
|
|
|
|
# $4 - channel |
|
98
|
|
|
|
|
|
|
# $5 - the message text |
|
99
|
|
|
|
|
|
|
# |
|
100
|
|
|
|
|
|
|
# Read the source for a better idea as to how these regexps break down. Oh, and |
|
101
|
|
|
|
|
|
|
# for what it's worth, the default patterns are based on my boring, default irssi |
|
102
|
|
|
|
|
|
|
# configuration. Expect more rulesets to be included in future distributions. |
|
103
|
|
|
|
|
|
|
# |
|
104
|
|
|
|
|
|
|
# =cut |
|
105
|
|
|
|
|
|
|
|
|
106
|
|
|
|
|
|
|
# Return a reference to the hash of regular expressions used to parse logs.
#
# Called on an object that already holds a ruleset, the stored ruleset is
# returned unchanged.  Otherwise the default ruleset is built; when the
# invocant is an object, the new ruleset is also stored on it.
#
# Keys produced: nick, chan, nick_container, timestamp, action_leader, msg,
# and action.
sub patterns {
    my $self = shift;

    if (ref $self) {
        return $self->{patterns} if defined $self->{patterns};
    }

    my %rule;

    # nick and chan are (mostly) specified in RFC2812, section 2.3.1

    my $alpha   = qr/[\x41-\x5A\x61-\x7A]/;    # A-Z / a-z
    my $numeral = qr/[\x30-\x39]/;             # 0-9
    my $punct   = qr/[\x5B-\x60\x7B-\x7D]/;    # [\]^_`{|}

    $rule{nick} = qr/( (?: $alpha | $punct )
                       (?: $alpha | $numeral | $punct | - )* )/x;

    my $chan_id   = qr/[A-Z0-9]{5}/;
    my $chan_body = qr/[^\x00\a\r\n ,:]*/;

    $rule{chan} = qr/( (?: \# | \+ | !$chan_id | & ) $chan_body
                       (?: :$chan_body )? )/x;

    # the other regexes are more relevant to the way irssi formats logs

    # Captures, in order: nick prefix, nick, channel (optional).
    $rule{nick_container} = qr/
        <
        \s*
        ([+%@])?
        \s*
        $rule{nick}
        (?:
            :
            $rule{chan}
        )?
        \s*
        >
    /x;

    $rule{timestamp} = qr/\[?(\d\d:\d\d(?::\d\d)?)?\]?/;

    $rule{action_leader} = qr/\*/;

    # Captures, in order: timestamp, nick prefix, nick, channel, text.
    $rule{msg} = qr/
        $rule{timestamp}
        \s*
        $rule{nick_container}
        \s+
        (.+)
    /x;

    # Captures, in order: timestamp, nick prefix, nick, text.
    $rule{action} = qr/
        $rule{timestamp}
        \s*
        $rule{action_leader}
        \s+
        ([%@])?
        \s*
        $rule{nick}
        \s
        (.+)
    /x;

    if (ref $self) {
        $self->{patterns} = \%rule;
    }

    return \%rule;
}
|
171
|
|
|
|
|
|
|
|
|
172
|
|
|
|
|
|
|
# =method parse |
|
173
|
|
|
|
|
|
|
# |
|
174
|
|
|
|
|
|
|
# my $result = $parser->parse($file) |
|
175
|
|
|
|
|
|
|
# |
|
176
|
|
|
|
|
|
|
# This method parses the file named and returns a Parse::IRCLog::Result object |
|
177
|
|
|
|
|
|
|
# representing the results. The C<parse> method can be called on a parser object |
|
178
|
|
|
|
|
|
|
# or on the class. If called on the class, a parser will be instantiated for the |
|
179
|
|
|
|
|
|
|
# method call and discarded when C<parse> returns. |
|
180
|
|
|
|
|
|
|
# |
|
181
|
|
|
|
|
|
|
# =cut |
|
182
|
|
|
|
|
|
|
|
|
183
|
|
|
|
|
|
|
# Parse the named log file and return a Parse::IRCLog::Result built from one
# event per input line (each line is passed to parse_line unmodified,
# including its trailing newline).
#
# May be called on a parser object or on the class; when called on the class,
# a parser is created with new for the duration of the call.
#
# Croaks if no filename is given or if the file cannot be opened.
sub parse {
    my ($self, $filename) = @_;

    Carp::croak "no filename given to parse"
        unless defined $filename and length $filename;

    $self = $self->new unless ref $self;

    # A lexical filehandle is released on every exit path, including an
    # exception thrown from parse_line.
    open my $fh, "<", $filename
        or Carp::croak "couldn't open $filename: $!";

    # Read into a lexical instead of $_: a bare "while (<$fh>)" assigns to the
    # global $_ without localizing it and would clobber the caller's value.
    my @events;
    while (my $line = <$fh>) {
        push @events, $self->parse_line($line);
    }

    close $fh;

    return Parse::IRCLog::Result->new(@events);
}
|
194
|
|
|
|
|
|
|
|
|
195
|
|
|
|
|
|
|
# =method parse_line |
|
196
|
|
|
|
|
|
|
# |
|
197
|
|
|
|
|
|
|
# my $info = $parser->parse_line($line); |
|
198
|
|
|
|
|
|
|
# |
|
199
|
|
|
|
|
|
|
# This method is used internally by C<parse> to turn each line into an event. |
|
200
|
|
|
|
|
|
|
# While it could someday be made slick, it's adequate for now. It attempts to |
|
201
|
|
|
|
|
|
|
# match each line against the required patterns from the C<patterns> result and |
|
202
|
|
|
|
|
|
|
# if successful returns a hashref describing the event. |
|
203
|
|
|
|
|
|
|
# |
|
204
|
|
|
|
|
|
|
# If no match can be found, an "unknown" event is returned. |
|
205
|
|
|
|
|
|
|
# |
|
206
|
|
|
|
|
|
|
# =cut |
|
207
|
|
|
|
|
|
|
|
|
208
|
|
|
|
|
|
|
# Turn one log line into an event hashref.
#
# The line is tried against the "msg" pattern first and the "action" pattern
# second, both obtained from the patterns method.  A match yields a hashref
# with type, timestamp, nick_prefix, nick, and text keys.  A false line (undef,
# empty string, "0") or a line matching neither pattern yields an event of
# type "unknown" whose text is the line itself.
sub parse_line {
    my $self = shift;
    my $line = shift;

    # Nothing to match against: skip the pattern lookup entirely.
    return { type => 'unknown', text => $line } unless $line;

    if ($line =~ $self->patterns->{msg}) {
        # msg captures: $4 is the channel, so the text is in $5
        return {
            type        => 'msg',
            timestamp   => $1,
            nick_prefix => $2,
            nick        => $3,
            text        => $5,
        };
    }

    if ($line =~ $self->patterns->{action}) {
        return {
            type        => 'action',
            timestamp   => $1,
            nick_prefix => $2,
            nick        => $3,
            text        => $4,
        };
    }

    return { type => 'unknown', text => $line };
}
|
218
|
|
|
|
|
|
|
|
|
219
|
|
|
|
|
|
|
# =head1 TODO |
|
220
|
|
|
|
|
|
|
# |
|
221
|
|
|
|
|
|
|
# Write a few example subclasses for common log formats. |
|
222
|
|
|
|
|
|
|
# |
|
223
|
|
|
|
|
|
|
# Add a few more default event types: join, part, nick. Others? |
|
224
|
|
|
|
|
|
|
# |
|
225
|
|
|
|
|
|
|
# Possibly make the C<patterns> sub a module, to allow subclassing to override |
|
226
|
|
|
|
|
|
|
# only one or two patterns. For example, to use the default C<nick> pattern but |
|
227
|
|
|
|
|
|
|
# override the C<nick_container> or C<action_leader>. This sounds like a very |
|
228
|
|
|
|
|
|
|
# good idea, actually, now that I write it down. |
|
229
|
|
|
|
|
|
|
# |
|
230
|
|
|
|
|
|
|
# =cut |
|
231
|
|
|
|
|
|
|
|
|
232
|
|
|
|
|
|
|
1; |
|
233
|
|
|
|
|
|
|
|
|
234
|
|
|
|
|
|
|
__END__ |