| line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
|
1
|
|
|
|
|
|
|
package WWW::Blog::Identify; |
|
2
|
|
|
|
|
|
|
|
|
3
|
1
|
|
|
1
|
|
561716
|
use strict; |
|
|
1
|
|
|
|
|
2
|
|
|
|
1
|
|
|
|
|
36
|
|
|
4
|
1
|
|
|
1
|
|
7
|
use warnings; |
|
|
1
|
|
|
|
|
3
|
|
|
|
1
|
|
|
|
|
2133
|
|
|
5
|
|
|
|
|
|
|
|
|
6
|
|
|
|
|
|
|
require Exporter; |
|
7
|
|
|
|
|
|
|
|
|
8
|
|
|
|
|
|
|
our @ISA = qw(Exporter); |
|
9
|
|
|
|
|
|
|
our @EXPORT_OK = qw/identify/; |
|
10
|
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
our $VERSION = '0.06'; |
|
13
|
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
sub identify { |
|
16
|
0
|
|
|
0
|
1
|
|
my ($url, $text) = @_; |
|
17
|
|
|
|
|
|
|
|
|
18
|
0
|
|
|
|
|
|
$url = lc( $url ); |
|
19
|
0
|
|
|
|
|
|
local $_ = $url; |
|
20
|
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
# patterns ordered roughly in terms of frequency |
|
22
|
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
# |
|
25
|
|
|
|
|
|
|
# URL CHECKING |
|
26
|
|
|
|
|
|
|
# |
|
27
|
|
|
|
|
|
|
|
|
28
|
0
|
0
|
|
|
|
|
return "blogspot" if /\.blogspot\.com/o; |
|
29
|
0
|
0
|
|
|
|
|
return "blogger" if m|\.blogger\.com/|o; |
|
30
|
|
|
|
|
|
|
|
|
31
|
0
|
0
|
|
|
|
|
return "blogger (br)" if m|\.blogger\.com\.br|o; # Brazilian Blogger |
|
32
|
0
|
0
|
|
|
|
|
return 'terra' if m|weblogger\.(terra\.)?com\.br/|o; |
|
33
|
0
|
0
|
|
|
|
|
return "diaryland" if /\.diaryland\./o; |
|
34
|
0
|
0
|
|
|
|
|
return "livejournal" if /\.livejournal\.com/o; |
|
35
|
0
|
0
|
|
|
|
|
return "journalspace" if /\.journalspace\.com/o; |
|
36
|
0
|
0
|
|
|
|
|
return "blogalia" if /\.blogalia\.com/o; |
|
37
|
0
|
0
|
|
|
|
|
return "pitas" if /\.pitas\.com/o; |
|
38
|
0
|
0
|
|
|
|
|
return "persianblog" if /\.persianblog\.com/o; # Farsi |
|
39
|
0
|
0
|
|
|
|
|
return "persianlog" if /\bpersianlog\.com/o; # Farsi |
|
40
|
0
|
0
|
|
|
|
|
return "diaryhub" if /\.diaryhub\.(?:com|net)\/?$/io; # Thai |
|
41
|
|
|
|
|
|
|
|
|
42
|
0
|
0
|
|
|
|
|
return "radio" if /radio.weblogs\.com/o; |
|
43
|
0
|
0
|
|
|
|
|
return "radio" if /blogs.law.harvard.edu/o; |
|
44
|
0
|
0
|
|
|
|
|
return "radio" if /\.blogs.it\b/o; |
|
45
|
|
|
|
|
|
|
|
|
46
|
0
|
0
|
|
|
|
|
return "manila" if /\.manilasites\.com/o; |
|
47
|
0
|
0
|
|
|
|
|
return "manila" if /\.editthispage\.com/o; |
|
48
|
0
|
0
|
|
|
|
|
return "manila" if m|\.weblogger\.com/|o; |
|
49
|
0
|
0
|
|
|
|
|
return "manila" if m|\.weblogs\.com/|o; |
|
50
|
|
|
|
|
|
|
|
|
51
|
0
|
0
|
|
|
|
|
return "20six" if m|\.20six\.|o; |
|
52
|
0
|
0
|
|
|
|
|
return "typepad" if m|\.typepad\.|o; |
|
53
|
|
|
|
|
|
|
|
|
54
|
0
|
0
|
|
|
|
|
return "twoday" if /\.twoday\.net/o; |
|
55
|
0
|
0
|
|
|
|
|
return "salon" if /blogs\.salon\.com/o; |
|
56
|
0
|
0
|
|
|
|
|
return "splinder" if /\.splinder\.it/o; # Italy |
|
57
|
0
|
0
|
|
|
|
|
return "diarist" if /\.diarist\.com/o; |
|
58
|
0
|
0
|
|
|
|
|
return "antville" if /\.antville\.org/o; |
|
59
|
0
|
0
|
|
|
|
|
return 'bloggingnetwork' if m|\.bloggingnetwork\.com/blogs|o; |
|
60
|
0
|
0
|
|
|
|
|
return "crimsonblog" if /\.crimsonblog\./o; |
|
61
|
0
|
0
|
|
|
|
|
return "skyblog" if /\.skyblog\.com/o; # French |
|
62
|
|
|
|
|
|
|
|
|
63
|
0
|
0
|
|
|
|
|
return "blog.pl (polish)" if /\.blog\.pl/o; |
|
64
|
0
|
0
|
|
|
|
|
return "e-blog.pl (polish)" if /\.e-blog\.pl/o; |
|
65
|
0
|
0
|
|
|
|
|
return "weblog.pl (polish)" if /\.weblog\.pl/o; |
|
66
|
|
|
|
|
|
|
|
|
67
|
0
|
0
|
|
|
|
|
return "twoday" if /\.twoday\.net/o; |
|
68
|
0
|
0
|
|
|
|
|
return "monblogue" if /\.monblogue\.com/o; |
|
69
|
0
|
0
|
|
|
|
|
return 'joueb' if m|joueb\.com/|o; # France |
|
70
|
0
|
0
|
|
|
|
|
return 'blogstudio' if m|\.blogstudio\.com/|o; |
|
71
|
0
|
0
|
|
|
|
|
return 'blog-city' if m|blog-city\.com/|o; |
|
72
|
0
|
0
|
|
|
|
|
return 'blogsky' if m|\.blogsky\.com/|o; # English and Persian |
|
73
|
0
|
0
|
|
|
|
|
return 'u-blog' if m|u-blog\.net/|o; # France |
|
74
|
0
|
0
|
|
|
|
|
return 'barrapunto' if m|\bbarrapunto\.com/index\.pl|o; # Spain |
|
75
|
0
|
0
|
|
|
|
|
return 'blig' if m|\.blig\.(?:ig.)?com\.br|o; # Brazil |
|
76
|
0
|
0
|
|
|
|
|
return 'g-blog' if m|g-blog\.net/|o; |
|
77
|
0
|
0
|
|
|
|
|
return 'babelogue' if m|babelogue\.citypages\.com|io; |
|
78
|
0
|
0
|
|
|
|
|
return 'jevon' if m|\.jevon\.org/|io; |
|
79
|
0
|
0
|
|
|
|
|
return 'tripod' if m|\.tripod\.com/|io; |
|
80
|
|
|
|
|
|
|
|
|
81
|
0
|
0
|
|
|
|
|
return 'xanga' if m|\.xanga\.com|o; |
|
82
|
|
|
|
|
|
|
# |
|
83
|
|
|
|
|
|
|
# CONTENT CHECKING |
|
84
|
|
|
|
|
|
|
# |
|
85
|
|
|
|
|
|
|
|
|
86
|
0
|
|
|
|
|
|
local $_ = $text; |
|
87
|
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
# First, check META tags |
|
89
|
|
|
|
|
|
|
|
|
90
|
0
|
0
|
|
|
|
|
return "postnuke" if m|CONTENT="Post-?Nuke|io; # Nuke is nice enough to use META tags |
|
91
|
0
|
0
|
|
|
|
|
return "php-nuke" if m|CONTENT="PHP-?Nuke|io; |
|
92
|
0
|
0
|
|
|
|
|
return "microsoft" if m|]+Content=['"]Microsoft Visual|io; |
|
93
|
0
|
0
|
|
|
|
|
return "nucleus" if m|]+content=['"]Nucleus|io; |
|
94
|
0
|
0
|
|
|
|
|
return "greymatter" if m|]+content=['"]Greymatter|io; |
|
95
|
0
|
0
|
|
|
|
|
return "land down under" if m|]+content=['"]Land Down Under|io; |
|
96
|
|
|
|
|
|
|
|
|
97
|
|
|
|
|
|
|
# Next, check actual content |
|
98
|
|
|
|
|
|
|
|
|
99
|
0
|
0
|
|
|
|
|
return "movable type" if m|cgi-bin/mt|o; |
|
100
|
0
|
0
|
|
|
|
|
return "movable type" if m|Powered by.*Move?able ?Type|io; # common typo is 'Moveable' |
|
101
|
0
|
0
|
|
|
|
|
return "movable type" if m|mtblog.gif|io; |
|
102
|
0
|
0
|
|
|
|
|
return "movable type" if m|move?abletype.gif|o; |
|
103
|
0
|
0
|
|
|
|
|
return "movable type" if m!function Open(Trackback|Comments)\s+\(c\)!o; # default MT JavaScript |
|
104
|
|
|
|
|
|
|
|
|
105
|
0
|
0
|
|
|
|
|
return "blogger pro" if m|powered_by_blogger_pro[0-9]*\.gif|io; |
|
106
|
0
|
0
|
|
|
|
|
return "blogger pro" if m|powered by:?
107
|
|
|
|
|
|
|
|
|
108
|
0
|
0
|
|
|
|
|
return "blogger" if m|bloggerbutton[0-9]+.gif|io; |
|
109
|
0
|
0
|
|
|
|
|
return "blogger" if m|bloggertemplate[^.]+.gif|io; |
|
110
|
0
|
0
|
|
|
|
|
return "blogger" if m|blogger_bluelong.gif|o; |
|
111
|
0
|
0
|
|
|
|
|
return "blogger" if m|powered by (
112
|
|
|
|
|
|
|
|
|
113
|
0
|
0
|
|
|
|
|
return "radio" if m|img src="http://radio.weblogs.com|io; |
|
114
|
0
|
0
|
|
|
|
|
return "radio" if m|http://radio.xmlstoragesystem.com/weblogStats|oi; |
|
115
|
0
|
0
|
|
|
|
|
return "radio" if m|images/radioUserLand|oi; |
|
116
|
0
|
0
|
|
|
|
|
return "radio" if m|xmlCoffeeCup|oi; |
|
117
|
|
|
|
|
|
|
|
|
118
|
0
|
0
|
|
|
|
|
return "manila" if m|thisIsAManilaSite|oi; |
|
119
|
|
|
|
|
|
|
|
|
120
|
0
|
0
|
|
|
|
|
return "cafelog" if m!function b2(?:open|comment)!o; # default cafelog JavaScript |
|
121
|
0
|
0
|
|
|
|
|
return "cafelog" if m|powered by (
122
|
|
|
|
|
|
|
|
|
123
|
0
|
0
|
|
|
|
|
return "pivot" if m||io; |
|
124
|
0
|
0
|
|
|
|
|
return "pivot" if m|pivot-?banner[^.]*.gif|io; |
|
125
|
|
|
|
|
|
|
|
|
126
|
0
|
0
|
|
|
|
|
return "textpattern" if m|txp_slug|o; |
|
127
|
0
|
0
|
|
|
|
|
return "blosxom" if /blosxom\.gif/o; |
|
128
|
|
|
|
|
|
|
|
|
129
|
0
|
0
|
|
|
|
|
return "slogger" if /Created by Slogger/io; |
|
130
|
|
|
|
|
|
|
|
|
131
|
0
|
0
|
|
|
|
|
return "greymatter" if /gm-icon.gif/o; |
|
132
|
0
|
0
|
|
|
|
|
return "greymatter" if /Powered by Greymatter/io; |
|
133
|
|
|
|
|
|
|
|
|
134
|
0
|
0
|
|
|
|
|
return "pMachine" if m|alt="[^"]+ pMachine|io; # This can be "Powered by" or "Gemaakt mit", for example |
|
135
|
0
|
0
|
|
|
|
|
return "pMachine" if m|powered by (?:
136
|
0
|
0
|
|
|
|
|
return "pMachine" if m|pmachine.gif|io; |
|
137
|
|
|
|
|
|
|
|
|
138
|
0
|
0
|
|
|
|
|
return "psychoblogger" if m|Powered by (?:]+>)?Psychoblogger|io; |
|
139
|
0
|
0
|
|
|
|
|
return "WebCrimson" if m|Powered by (?:]+>)?WebCrimson|io; |
|
140
|
|
|
|
|
|
|
|
|
141
|
|
|
|
|
|
|
# Tests of last resort |
|
142
|
0
|
|
|
|
|
|
my @blog_count = $text =~ /\bblog\b/gi; |
|
143
|
|
|
|
|
|
|
|
|
144
|
0
|
0
|
|
|
|
|
return "suspected by URL" if $url =~ /[\W\-_](?:we)?blog/o; |
|
145
|
0
|
0
|
|
|
|
|
return "suspected by URL" if $url =~ /\bbitacoras\b/i; |
|
146
|
0
|
0
|
|
|
|
|
return "suspected by rss" if $text =~ /\brss\b/i; |
|
147
|
0
|
0
|
|
|
|
|
return "suspected by content" if scalar @blog_count > 5; |
|
148
|
|
|
|
|
|
|
|
|
149
|
0
|
|
|
|
|
|
return; |
|
150
|
|
|
|
|
|
|
|
|
151
|
|
|
|
|
|
|
} |
|
152
|
|
|
|
|
|
|
|
|
153
|
|
|
|
|
|
|
1; |
|
154
|
|
|
|
|
|
|
|
|
155
|
|
|
|
|
|
|
__END__ |