File Coverage

blib/lib/Message/Style.pm

Criterion	Covered	Total	%
statement	6	91	6.5
branch	0	52	0.0
condition	0	18	0.0
subroutine	2	3	66.6
pod	1	1	100.0
total	9	165	5.4

line	stmt	bran	cond	sub	pod	time	code
1							package Message::Style;
2							require 5.005;
3	1			1		12094	use strict;
	1					3
	1					49
4	1			1		5	use vars qw( $VERSION @ISA );
	1					1
	1					1211
5							# $Id: Style.pm,v 1.3 2004/10/26 15:53:37 abuse Exp $
6							$VERSION = do { my @r=(q$Revision: 1.3 $=~/\d+/g); sprintf "%d."."%03d"x$#r,@r };
7
8							#use Carp;
9							#use Data::Dumper;
10
11							=head1 NAME
12
13							Message::Style - Perl module to perform stylistic analysis of messages
14
15							=head1 SYNOPSIS
16
17							use Message::Style;
18
19							my $score=Message::Style::score(\@article);
20							# or
21							my $score=Message::Style::score(@article);
22
23							=head1 DESCRIPTION
24
25							This Perl library does an analysis of a RFC2822 format message
26							(typically email messages or Usenet posts) and produces a score that,
27							in the author's opinion, gives a good indication as to whether the
28							poster is a fsckwit, and therefore whether their message should be
29							ignored.
30
31							=head1 SCORING MECHANISM
32
33							This script takes a Usenet article (or other RFC822 formatted text)
34							and attempts to identify whether the sender is a fsckwit. It does this
35							by analysing quoting style, line length, spelling, and various other
36							criteria.
37
38							There are several things that are annoying about Usenet posts, the
39							scores are related to the "cost" of these. There are Byte Points
40							(bandwidth wasted in transmission of pointless material) and Line
41							Points (time wasted scrolling through pointless material). These, and
42							their justifications are:
43
44							=over 2
45
46							=item 1
47
48							Article has excessively long lines.
49
50							Long lines are wrapped by some newsreaders, truncated by others, or a
51							horizontal scrollbar is presented. Whatever the case, these cause
52							extra effort for the reader to scroll. A Line Point is given for every
53							block of 80 chars (or part) beyond char 80.
54
55							=item 2
56
57							Article is not completely in plain text.
58
59							Non-plain Content-Type, e.g. text/html, or a non-text Content-Encoding
60							is unreadable to many. Byte Points are given for the entire article.
61
62							=item 3
63
64							Article has a very large signature.
65
66							Signatures are generally a waste of bandwidth, and long ones need to
67							be paged through. It is considered bad form to have a signature larger
68							than the McQuary limit of 80x4. Because of that, Byte Points and Line
69							Points scored for every character and line outside the 80x4 box.
70
71							=item 4
72
73							Article contains a Big Ugly ASCII Graphic (BUAG)
74
75							BUAGs are those annoying graphics that always seem to come with "cute"
76							extralong signatures. These are warned of, but not scored since
77							they've already been accounted for in 3 (and also because BUAGs in the
78							body of the message are sometimes useful.)
79
80							=item 5
81
82							Article has incorrectly-formatted quoted material.
83
84							A quote is expected to precede the original material. Scoring is based
85							upon this. The first four lines of the quoted material doesn't score
86							at all. The original material is then counted for lines and bytes, and
87							half of each is also allowed for quoted material. Beyond that, Byte
88							and Line scores are applied. Top-posted articles are expected to score
89							badly from this heuristic.
90
91							=cut
92
93							# =item 6.
94							#
95							#Capitalisation. Score Byte and Line points (the latter by
96							#heuristic) for each capitalised letter beyond 20% of the original
97							#material. (Not yet done).
98
99							=back
100
101							In addition, Byte and Line scores are multipled by the number of
102							newsgroups crossposted to.
103
104							For final scoring, a Line point equals 40 Byte points.
105
106							=head1 FUNCTIONS
107
108							=over 4
109
110							=item B
111
112							my $score=Message::Style::score(@article);
113
114							Performs a scoring operation on the article, and returns the score.
115
116							=cut
117
118							sub score {
119	0	0		0	1		my $aref=ref $_[0] ? $_[0] : \@_;
120
121							# warning, neophyte code, only recently dredged up from the
122							# archives, marginally cleaned-up, and turned into a CPANable module
123	0						my @article=();
124	0						my @header=();
125	0						my %fault=();
126	0						my %header=();
127	0						my %meta=();
128	0						my $t="";
129	0						my ($lscore, $bscore)=(0,0);
130
131	0						while(@$aref) {
132	0						$_=shift @$aref;
133	0						chomp;
134	0	0					last unless length;
135	0						push @header, $_;
136							}
137
138	0						@article=@$aref;
139	0						chomp @article;
140
141							# Firstly, the header is parsed. Folded lines are unfolded, and a
142							# hash of header names vs. values is created. Dupes and duff
143							# headers are noted.
144	0						foreach(reverse @header) {
145							# Join folded lines
146	0						$t="$_$t";
147	0	0					unless(/^[\t\ ]/) { # not folded
148	0						chomp $t;
149	0	0					if($t=~/^([A-Za-z0-9-]+)\: (.*)$/) {
150	0	0					$fault{"Duplicated header: $1: $2"}++ if(exists $header{$1});
151	0						$header{lc $1}=$2;
152	0						$t="";
153							} else {
154	0						$fault{"No colon-space in header ($t)"}++;
155							}
156							}
157							}
158
159							# Check if this is a plain text posting or something else.
160
161	0	0	0				if(defined $header{'content-type'}
162							and $header{'content-type'}!~/^text\/plain/i) {
163	0						$fault{"Non plaintext content: $header{'content-type'}"}++;
164	0						$meta{isbinary}++;
165							}
166
167	0	0	0				if(defined $header{'content-transfer-encoding'}
168							and $header{'content-transfer-encoding'}!~/^(7bit\|8bit\|quoted-printable)/i) {
169	0						$fault{"Non plaintext encoding: $header{'content-transfer-encoding'}"}++;
170	0						$meta{isbinary}++;
171							}
172
173	0						foreach my $line (@article) {
174							# @words is a list of words in this line
175	0						my @words=grep { $_ ne '' } split(/\s+/, $line);
	0
176	0						my $len=length $line; # For speed
177
178							# Check for indentation, $qlevel contains level of indentation.
179							# 0=original material, >=1 is quoted
180	0						my $qlevel=0;
181	0						$_=$line;
182	0						s/\s+//g;
183	0	0					$qlevel=length $1 if /(^\>*)/;
184
185							# Check for long lines
186	0	0					if(length($line)>80) {
187	0	0					$meta{toolong}++ if $len>80;
188	0	0	0				$meta{maxlen}=$len
189							unless exists $meta{maxlen} and $meta{maxlen}>$len;
190	0						$lscore+=int($len/80);
191							}
192
193	0	0					if(scalar @words) { # Nonblank line
194	0	0					if($words[0]=~/^\>/) { # Quoted material
195	0						$meta{qlines}++;
196	0						$meta{qwords}+=scalar @words;
197	0						$meta{qchars}+=length;
198							} else { # "Original" material
199	0						$meta{olines}++;
200	0						$meta{owords}+=scalar @words;
201	0						$meta{ochars}+=length;
202	0						foreach(@words) { # Crude check for BUAGs
203	0	0					if(/[^A-Za-z0-9]{3,}/) {
204	0	0					next if m#(\.{3,3}\|://)#;
205	0						$meta{buag}++;
206	0						last;
207							}
208							}
209							}
210							}
211
212							# Check for and count signature
213	0	0					if(exists $meta{hassig}) {
214	0						$meta{siglines}++;
215	0	0	0				if(exists $meta{siglines} && $meta{siglines}>4) {
		0
216	0						$lscore++;
217	0						$bscore+=$len;
218							} elsif($len>80) {
219	0						$fault{'Wide signature'}++;
220	0						$lscore+=$len-80;
221							}
222							}
223	0	0					$meta{hassig}++ if(/^--\ ?$/);
224	0	0					$fault{'Broken sigsep'}++
225							if($line eq '--');
226
227							# if(/-----BEGIN PGP SIGNATURE-----/
228							# .. /-----END PGP SIGNATURE-----/) {
229							# $fault{'PGP signature'}++;
230							# next;
231							# }
232
233							}
234
235							# Let's start moaning
236	0	0	0				if(exists $meta{siglines} and $meta{siglines}>4) {
237	0						$fault{"Signature is $meta{siglines} lines, should be four at most"}++;
238							# Score already applied
239							};
240
241	0	0	0				if(exists $meta{buag} and $meta{buag}>10) {
242	0						$fault{"Large BUAG/nontext present"}++;
243							# No score, just a warning
244							}
245
246	0	0					if(exists $meta{toolong}) {
247	0						$fault{"Overlong lines ($meta{toolong} of them), longest is $meta{maxlen} chars"}++;
248							# Score already applied
249							}
250
251	0	0					if(exists $meta{isbinary}) {
252							# Apply score to whole article
253	0						map { $bscore+=length; $lscore++ } @header;
	0
	0
254	0						map { $bscore+=length; $lscore++ } @article;
	0
	0
255							}
256
257	0						my $groups=1;
258	0	0					$groups=($header{newsgroups}=~tr[,][,])+1
259							if defined $header{newsgroups};
260	0						my $score=int(($lscore40+$bscore)sqrt $groups);
261	0						my $name=$header{from};
262
263	0	0					if($lscore\|$bscore) {
264	0						$fault{"Score: $score"}++;
265	0						$fault{"Lscore: $lscore, Bscore: $bscore, Groups: $groups"}++;
266							}
267
268							# You may correctly assume that this code was ripped from something
269							# that used %fault, even though this package doesn't do anything
270							# useful with it.
271
272							# carp Dumper \@article, \%fault, \%meta;
273
274	0						return $score;
275
276							# end neophyte code. May @DEITY smile upon me one day to have the
277							# tuits to clean up the code and turn it into a useful package.
278							}
279
280							=back
281
282							=head1 WARNINGS
283
284							This module is basically the result of ripping out the core of a
285							really nasty script I wrote early in my Perl career and wrapping the
286							minimum around it to pass CPAN muster. So the code is a bit crufty,
287							although it does certainly work and has heard of strict and warn.
288
289							It was however reasonably well-tested at the time thanks to plenty of
290							fsckwit source material on birmingham.misc / uk.local.birmingham.
291
292							=head1 SEE ALSO
293
294							=head1 AUTHOR
295
296							All code and documentation by Peter Corlett .
297
298							=head1 COPYRIGHT
299
300							Copyright (C) 2000-2004 Peter Corlett . All
301							rights reserved.
302
303							This program is free software; you can redistribute it and/or modify
304							it under the same terms as Perl itself.
305
306							=head1 SUPPORT / WARRANTY
307
308							This is free software. IT COMES WITHOUT WARRANTY OF ANY KIND.
309
310							=cut
311
312							1;
313