line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
4
|
|
|
4
|
|
3527
|
use strict; |
|
4
|
|
|
|
|
10
|
|
|
4
|
|
|
|
|
163
|
|
2
|
4
|
|
|
4
|
|
20
|
use warnings; |
|
4
|
|
|
|
|
7
|
|
|
4
|
|
|
|
|
224
|
|
3
|
|
|
|
|
|
|
package Parse::IRCLog; |
4
|
|
|
|
|
|
|
# ABSTRACT: parse internet relay chat logs |
5
|
|
|
|
|
|
|
$Parse::IRCLog::VERSION = '1.106'; |
6
|
4
|
|
|
4
|
|
22
|
use Carp (); |
|
4
|
|
|
|
|
9
|
|
|
4
|
|
|
|
|
59
|
|
7
|
4
|
|
|
4
|
|
2081
|
use Parse::IRCLog::Result; |
|
4
|
|
|
|
|
10
|
|
|
4
|
|
|
|
|
97
|
|
8
|
4
|
|
|
4
|
|
3386
|
use Symbol (); |
|
4
|
|
|
|
|
3829
|
|
|
4
|
|
|
|
|
2692
|
|
9
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
# =head1 SYNOPSIS |
11
|
|
|
|
|
|
|
# |
12
|
|
|
|
|
|
|
# use Parse::IRCLog; |
13
|
|
|
|
|
|
|
# |
14
|
|
|
|
|
|
|
# $result = Parse::IRCLog->parse("perl-2004-02-01.log"); |
15
|
|
|
|
|
|
|
# |
16
|
|
|
|
|
|
|
# my %to_print = ( msg => 1, action => 1 ); |
17
|
|
|
|
|
|
|
# |
18
|
|
|
|
|
|
|
# for ($result->events) { |
19
|
|
|
|
|
|
|
# next unless $to_print{ $_->{type} }; |
20
|
|
|
|
|
|
|
# print "$_->{nick}: $_->{text}\n"; |
21
|
|
|
|
|
|
|
# } |
22
|
|
|
|
|
|
|
# |
23
|
|
|
|
|
|
|
# =head1 DESCRIPTION |
24
|
|
|
|
|
|
|
# |
25
|
|
|
|
|
|
|
# This module provides a simple framework to parse IRC logs in arbitrary formats. |
26
|
|
|
|
|
|
|
# |
27
|
|
|
|
|
|
|
# A parser has a set of regular expressions for matching different events that |
28
|
|
|
|
|
|
|
# occur in an IRC log, such as "msg" and "action" events. Each line in the log |
29
|
|
|
|
|
|
|
# is matched against these rules and a result object, representing the event |
30
|
|
|
|
|
|
|
# stream, is returned. |
31
|
|
|
|
|
|
|
# |
32
|
|
|
|
|
|
|
# The rule set, described in greated detail below, can be customized by |
33
|
|
|
|
|
|
|
# subclassing Parse::IRCLog. In this way, Parse::IRCLog can provide a generic |
34
|
|
|
|
|
|
|
# interface for log analysis across many log formats, including custom formats. |
35
|
|
|
|
|
|
|
# |
36
|
|
|
|
|
|
|
# Normally, the C method is used to create a result set without storing a |
37
|
|
|
|
|
|
|
# parser object, but a parser may be created and reused. |
38
|
|
|
|
|
|
|
# |
39
|
|
|
|
|
|
|
# =method new |
40
|
|
|
|
|
|
|
# |
41
|
|
|
|
|
|
|
# This method constructs a new parser (with C<< $class->construct >>) and |
42
|
|
|
|
|
|
|
# initializes it (with C<< $obj->init >>). Construction and initialization are |
43
|
|
|
|
|
|
|
# separated for ease of subclassing initialization for future pipe dreams like |
44
|
|
|
|
|
|
|
# guessing what ruleset to use. |
45
|
|
|
|
|
|
|
# |
46
|
|
|
|
|
|
|
# =cut |
47
|
|
|
|
|
|
|
|
48
|
|
|
|
|
|
|
sub new { |
49
|
5
|
|
|
5
|
1
|
1222
|
my $class = shift; |
50
|
5
|
100
|
|
|
|
182
|
Carp::croak "new is a class method" if ref $class; |
51
|
|
|
|
|
|
|
|
52
|
4
|
|
|
|
|
14
|
$class->construct->init; |
53
|
|
|
|
|
|
|
} |
54
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
# =method construct |
56
|
|
|
|
|
|
|
# |
57
|
|
|
|
|
|
|
# The parser constructor just returns a new, empty parser object. It should be a |
58
|
|
|
|
|
|
|
# blessed hashref. |
59
|
|
|
|
|
|
|
# |
60
|
|
|
|
|
|
|
# =cut |
61
|
|
|
|
|
|
|
|
62
|
4
|
|
|
4
|
1
|
16
|
sub construct { bless {} => shift; } |
63
|
|
|
|
|
|
|
|
64
|
|
|
|
|
|
|
# =method init |
65
|
|
|
|
|
|
|
# |
66
|
|
|
|
|
|
|
# The initialization method configures the object, loading its ruleset. |
67
|
|
|
|
|
|
|
# |
68
|
|
|
|
|
|
|
# =cut |
69
|
|
|
|
|
|
|
|
70
|
|
|
|
|
|
|
sub init { |
71
|
4
|
|
|
4
|
1
|
8
|
my $self = shift; |
72
|
4
|
|
|
|
|
10
|
$self->{patterns} = $self->patterns; |
73
|
4
|
|
|
|
|
21
|
$self; |
74
|
|
|
|
|
|
|
} |
75
|
|
|
|
|
|
|
|
76
|
|
|
|
|
|
|
# =method patterns |
77
|
|
|
|
|
|
|
# |
78
|
|
|
|
|
|
|
# This method returns a reference to a hash of regular expressions, which are |
79
|
|
|
|
|
|
|
# used to parse the logs. Only a few, so far, are required by the parser, |
80
|
|
|
|
|
|
|
# although internally a few more are used to break down the task of parsing |
81
|
|
|
|
|
|
|
# lines. |
82
|
|
|
|
|
|
|
# |
83
|
|
|
|
|
|
|
# C matches an action; that is, the result of /ME in IRC. It should |
84
|
|
|
|
|
|
|
# return the following matches: |
85
|
|
|
|
|
|
|
# |
86
|
|
|
|
|
|
|
# $1 - timestamp |
87
|
|
|
|
|
|
|
# $2 - nick prefix |
88
|
|
|
|
|
|
|
# $3 - nick |
89
|
|
|
|
|
|
|
# $4 - the action |
90
|
|
|
|
|
|
|
# |
91
|
|
|
|
|
|
|
# C matches a message; that is, the result of /MSG (or "normal talking") in |
92
|
|
|
|
|
|
|
# IRC. It should return the following matches: |
93
|
|
|
|
|
|
|
# |
94
|
|
|
|
|
|
|
# $1 - timestamp |
95
|
|
|
|
|
|
|
# $2 - nick prefix |
96
|
|
|
|
|
|
|
# $3 - nick |
97
|
|
|
|
|
|
|
# $3 - channel |
98
|
|
|
|
|
|
|
# $5 - the action |
99
|
|
|
|
|
|
|
# |
100
|
|
|
|
|
|
|
# Read the source for a better idea as to how these regexps break down. Oh, and |
101
|
|
|
|
|
|
|
# for what it's worth, the default patterns are based on my boring, default irssi |
102
|
|
|
|
|
|
|
# configuration. Expect more rulesets to be included in future distributions. |
103
|
|
|
|
|
|
|
# |
104
|
|
|
|
|
|
|
# =cut |
105
|
|
|
|
|
|
|
|
106
|
|
|
|
|
|
|
sub patterns { |
107
|
17
|
|
|
17
|
1
|
921
|
my ($self) = @_; |
108
|
|
|
|
|
|
|
|
109
|
17
|
100
|
100
|
|
|
222
|
return $self->{patterns} if ref $self and defined $self->{patterns}; |
110
|
|
|
|
|
|
|
|
111
|
5
|
|
|
|
|
9
|
my $p; |
112
|
|
|
|
|
|
|
|
113
|
|
|
|
|
|
|
# nick and chan are (mostly) specified in RFC2812, section 2.3.1 |
114
|
|
|
|
|
|
|
|
115
|
5
|
|
|
|
|
22
|
my $letter = qr/[\x41-\x5A\x61-\x7A]/; # A-Z / a-z |
116
|
5
|
|
|
|
|
14
|
my $digit = qr/[\x30-\x39]/; # 0-9 |
117
|
5
|
|
|
|
|
24
|
my $special = qr/[\x5B-\x60\x7B-\x7D]/; # [\]^_`{|} |
118
|
|
|
|
|
|
|
|
119
|
5
|
|
|
|
|
191
|
$p->{nick} = qr/( (?: $letter | $special ) |
120
|
|
|
|
|
|
|
(?: $letter | $digit | $special | - )* )/x; |
121
|
|
|
|
|
|
|
|
122
|
5
|
|
|
|
|
18
|
my $channelid = qr/[A-Z0-9]{5}/; |
123
|
5
|
|
|
|
|
15
|
my $chanstring = qr/[^\x00\a\r\n ,:]*/; |
124
|
|
|
|
|
|
|
|
125
|
5
|
|
|
|
|
214
|
$p->{chan} = qr/( (?: \# | \+ | !$channelid | & ) $chanstring |
126
|
|
|
|
|
|
|
(?: :$chanstring )? )/x; |
127
|
|
|
|
|
|
|
|
128
|
|
|
|
|
|
|
# the other regexes are more relevant to the way irssi formats logs |
129
|
|
|
|
|
|
|
|
130
|
5
|
|
|
|
|
297
|
$p->{nick_container} = qr/ |
131
|
|
|
|
|
|
|
< |
132
|
|
|
|
|
|
|
\s* |
133
|
|
|
|
|
|
|
([+%@])? |
134
|
|
|
|
|
|
|
\s* |
135
|
|
|
|
|
|
|
$p->{nick} |
136
|
|
|
|
|
|
|
(?: |
137
|
|
|
|
|
|
|
: |
138
|
|
|
|
|
|
|
$p->{chan} |
139
|
|
|
|
|
|
|
)? |
140
|
|
|
|
|
|
|
\s* |
141
|
|
|
|
|
|
|
> |
142
|
|
|
|
|
|
|
/x; |
143
|
|
|
|
|
|
|
|
144
|
5
|
|
|
|
|
20
|
$p->{timestamp} = qr/\[?(\d\d:\d\d(?::\d\d)?)?\]?/; |
145
|
|
|
|
|
|
|
|
146
|
5
|
|
|
|
|
27
|
$p->{action_leader} = qr/\*/; |
147
|
|
|
|
|
|
|
|
148
|
5
|
|
|
|
|
415
|
$p->{msg} = qr/ |
149
|
|
|
|
|
|
|
$p->{timestamp} |
150
|
|
|
|
|
|
|
\s* |
151
|
|
|
|
|
|
|
$p->{nick_container} |
152
|
|
|
|
|
|
|
\s+ |
153
|
|
|
|
|
|
|
(.+) |
154
|
|
|
|
|
|
|
/x; |
155
|
|
|
|
|
|
|
|
156
|
5
|
|
|
|
|
257
|
$p->{action} = qr/ |
157
|
|
|
|
|
|
|
$p->{timestamp} |
158
|
|
|
|
|
|
|
\s* |
159
|
|
|
|
|
|
|
$p->{action_leader} |
160
|
|
|
|
|
|
|
\s+ |
161
|
|
|
|
|
|
|
([%@])? |
162
|
|
|
|
|
|
|
\s* |
163
|
|
|
|
|
|
|
$p->{nick} |
164
|
|
|
|
|
|
|
\s |
165
|
|
|
|
|
|
|
(.+) |
166
|
|
|
|
|
|
|
/x; |
167
|
|
|
|
|
|
|
|
168
|
5
|
100
|
|
|
|
38
|
$self->{patterns} = $p if ref $self; |
169
|
5
|
|
|
|
|
32
|
$p; |
170
|
|
|
|
|
|
|
} |
171
|
|
|
|
|
|
|
|
172
|
|
|
|
|
|
|
# =method parse |
173
|
|
|
|
|
|
|
# |
174
|
|
|
|
|
|
|
# my $result = $parser->parse($file) |
175
|
|
|
|
|
|
|
# |
176
|
|
|
|
|
|
|
# This method parses the file named and returns a Parse::IRCLog::Result object |
177
|
|
|
|
|
|
|
# representing the results. The C method can be called on a parser object |
178
|
|
|
|
|
|
|
# or on the class. If called on the class, a parser will be instantiated for the |
179
|
|
|
|
|
|
|
# method call and discarded when C returns. |
180
|
|
|
|
|
|
|
# |
181
|
|
|
|
|
|
|
# =cut |
182
|
|
|
|
|
|
|
|
183
|
|
|
|
|
|
|
sub parse { |
184
|
1
|
|
|
1
|
1
|
890
|
my $self = shift; |
185
|
1
|
50
|
|
|
|
7
|
$self = $self->new unless ref $self; |
186
|
|
|
|
|
|
|
|
187
|
1
|
|
|
|
|
4
|
my $symbol = Symbol::gensym; |
188
|
1
|
50
|
|
|
|
64
|
open $symbol, "<", $_[0] or Carp::croak "couldn't open $_[0]: $!"; |
189
|
|
|
|
|
|
|
|
190
|
1
|
|
|
|
|
2
|
my @events; |
191
|
1
|
|
|
|
|
27
|
push @events, $self->parse_line($_) while (<$symbol>); |
192
|
1
|
|
|
|
|
9
|
Parse::IRCLog::Result->new(@events); |
193
|
|
|
|
|
|
|
} |
194
|
|
|
|
|
|
|
|
195
|
|
|
|
|
|
|
# =method parse_line |
196
|
|
|
|
|
|
|
# |
197
|
|
|
|
|
|
|
# my $info = $parser->parse_line($line); |
198
|
|
|
|
|
|
|
# |
199
|
|
|
|
|
|
|
# This method is used internally by C to turn each line into an event. |
200
|
|
|
|
|
|
|
# While it could someday be made slick, it's adequate for now. It attempts to |
201
|
|
|
|
|
|
|
# match each line against the required patterns from the C result and |
202
|
|
|
|
|
|
|
# if successful returns a hashref describing the event. |
203
|
|
|
|
|
|
|
# |
204
|
|
|
|
|
|
|
# If no match can be found, an "unknown" event is returned. |
205
|
|
|
|
|
|
|
# |
206
|
|
|
|
|
|
|
# =cut |
207
|
|
|
|
|
|
|
|
208
|
|
|
|
|
|
|
sub parse_line { |
209
|
9
|
|
|
9
|
1
|
16
|
my ($self, $line) = @_; |
210
|
9
|
100
|
|
|
|
22
|
if ($line) { |
211
|
7
|
100
|
|
|
|
14
|
return { type => 'msg', timestamp => $1, nick_prefix => $2, nick => $3, text => $5 } |
212
|
|
|
|
|
|
|
if $line =~ $self->patterns->{msg}; |
213
|
5
|
100
|
|
|
|
28
|
return { type => 'action', timestamp => $1, nick_prefix => $2, nick => $3, text => $4 } |
214
|
|
|
|
|
|
|
if $line =~ $self->patterns->{action}; |
215
|
|
|
|
|
|
|
} |
216
|
5
|
|
|
|
|
40
|
return { type => 'unknown', text => $line }; |
217
|
|
|
|
|
|
|
} |
218
|
|
|
|
|
|
|
|
219
|
|
|
|
|
|
|
# =head1 TODO |
220
|
|
|
|
|
|
|
# |
221
|
|
|
|
|
|
|
# Write a few example subclasses for common log formats. |
222
|
|
|
|
|
|
|
# |
223
|
|
|
|
|
|
|
# Add a few more default event types: join, part, nick. Others? |
224
|
|
|
|
|
|
|
# |
225
|
|
|
|
|
|
|
# Possibly make the C sub an module, to allow subclassing to override |
226
|
|
|
|
|
|
|
# only one or two patterns. For example, to use the default C pattern but |
227
|
|
|
|
|
|
|
# override the C or C. This sounds like a very |
228
|
|
|
|
|
|
|
# good idea, actually, now that I write it down. |
229
|
|
|
|
|
|
|
# |
230
|
|
|
|
|
|
|
# =cut |
231
|
|
|
|
|
|
|
|
232
|
|
|
|
|
|
|
1; |
233
|
|
|
|
|
|
|
|
234
|
|
|
|
|
|
|
__END__ |