line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package WWW::Blog::Identify; |
2
|
|
|
|
|
|
|
|
3
|
1
|
|
|
1
|
|
561716
|
use strict; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
36
|
|
4
|
1
|
|
|
1
|
|
7
|
use warnings; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
2133
|
|
5
|
|
|
|
|
|
|
|
6
|
|
|
|
|
|
|
require Exporter; |
7
|
|
|
|
|
|
|
|
8
|
|
|
|
|
|
|
our @ISA = qw(Exporter); |
9
|
|
|
|
|
|
|
our @EXPORT_OK = qw/identify/; |
10
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
our $VERSION = '0.06'; |
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
sub identify { |
16
|
0
|
|
|
0
|
1
|
|
my ($url, $text) = @_; |
17
|
|
|
|
|
|
|
|
18
|
0
|
|
|
|
|
|
$url = lc( $url ); |
19
|
0
|
|
|
|
|
|
local $_ = $url; |
20
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
# patterns ordered roughly in terms of frequency |
22
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
# |
25
|
|
|
|
|
|
|
# URL CHECKING |
26
|
|
|
|
|
|
|
# |
27
|
|
|
|
|
|
|
|
28
|
0
|
0
|
|
|
|
|
return "blogspot" if /\.blogspot\.com/o; |
29
|
0
|
0
|
|
|
|
|
return "blogger" if m|\.blogger\.com/|o; |
30
|
|
|
|
|
|
|
|
31
|
0
|
0
|
|
|
|
|
return "blogger (br)" if m|\.blogger\.com\.br|o; # Brazilian Blogger |
32
|
0
|
0
|
|
|
|
|
return 'terra' if m|weblogger\.(terra\.)?com\.br/|o; |
33
|
0
|
0
|
|
|
|
|
return "diaryland" if /\.diaryland\./o; |
34
|
0
|
0
|
|
|
|
|
return "livejournal" if /\.livejournal\.com/o; |
35
|
0
|
0
|
|
|
|
|
return "journalspace" if /\.journalspace\.com/o; |
36
|
0
|
0
|
|
|
|
|
return "blogalia" if /\.blogalia\.com/o; |
37
|
0
|
0
|
|
|
|
|
return "pitas" if /\.pitas\.com/o; |
38
|
0
|
0
|
|
|
|
|
return "persianblog" if /\.persianblog\.com/o; # Farsi |
39
|
0
|
0
|
|
|
|
|
return "persianlog" if /\bpersianlog\.com/o; # Farsi |
40
|
0
|
0
|
|
|
|
|
return "diaryhub" if /\.diaryhub\.(?:com|net)\/?$/io; # Thai |
41
|
|
|
|
|
|
|
|
42
|
0
|
0
|
|
|
|
|
return "radio" if /radio.weblogs\.com/o; |
43
|
0
|
0
|
|
|
|
|
return "radio" if /blogs.law.harvard.edu/o; |
44
|
0
|
0
|
|
|
|
|
return "radio" if /\.blogs.it\b/o; |
45
|
|
|
|
|
|
|
|
46
|
0
|
0
|
|
|
|
|
return "manila" if /\.manilasites\.com/o; |
47
|
0
|
0
|
|
|
|
|
return "manila" if /\.editthispage\.com/o; |
48
|
0
|
0
|
|
|
|
|
return "manila" if m|\.weblogger\.com/|o; |
49
|
0
|
0
|
|
|
|
|
return "manila" if m|\.weblogs\.com/|o; |
50
|
|
|
|
|
|
|
|
51
|
0
|
0
|
|
|
|
|
return "20six" if m|\.20six\.|o; |
52
|
0
|
0
|
|
|
|
|
return "typepad" if m|\.typepad\.|o; |
53
|
|
|
|
|
|
|
|
54
|
0
|
0
|
|
|
|
|
return "twoday" if /\.twoday\.net/o; |
55
|
0
|
0
|
|
|
|
|
return "salon" if /blogs\.salon\.com/o; |
56
|
0
|
0
|
|
|
|
|
return "splinder" if /\.splinder\.it/o; # Italy |
57
|
0
|
0
|
|
|
|
|
return "diarist" if /\.diarist\.com/o; |
58
|
0
|
0
|
|
|
|
|
return "antville" if /\.antville\.org/o; |
59
|
0
|
0
|
|
|
|
|
return 'bloggingnetwork' if m|\.bloggingnetwork\.com/blogs|o; |
60
|
0
|
0
|
|
|
|
|
return "crimsonblog" if /\.crimsonblog\./o; |
61
|
0
|
0
|
|
|
|
|
return "skyblog" if /\.skyblog\.com/o; # French |
62
|
|
|
|
|
|
|
|
63
|
0
|
0
|
|
|
|
|
return "blog.pl (polish)" if /\.blog\.pl/o; |
64
|
0
|
0
|
|
|
|
|
return "e-blog.pl (polish)" if /\.e-blog\.pl/o; |
65
|
0
|
0
|
|
|
|
|
return "weblog.pl (polish)" if /\.weblog\.pl/o; |
66
|
|
|
|
|
|
|
|
67
|
0
|
0
|
|
|
|
|
return "twoday" if /\.twoday\.net/o; |
68
|
0
|
0
|
|
|
|
|
return "monblogue" if /\.monblogue\.com/o; |
69
|
0
|
0
|
|
|
|
|
return 'joueb' if m|joueb\.com/|o; # France |
70
|
0
|
0
|
|
|
|
|
return 'blogstudio' if m|\.blogstudio\.com/|o; |
71
|
0
|
0
|
|
|
|
|
return 'blog-city' if m|blog-city\.com/|o; |
72
|
0
|
0
|
|
|
|
|
return 'blogsky' if m|\.blogsky\.com/|o; # English and Persian |
73
|
0
|
0
|
|
|
|
|
return 'u-blog' if m|u-blog\.net/|o; # France |
74
|
0
|
0
|
|
|
|
|
return 'barrapunto' if m|\bbarrapunto\.com/index\.pl|o; # Spain |
75
|
0
|
0
|
|
|
|
|
return 'blig' if m|\.blig\.(?:ig.)?com\.br|o; # Brazil |
76
|
0
|
0
|
|
|
|
|
return 'g-blog' if m|g-blog\.net/|o; |
77
|
0
|
0
|
|
|
|
|
return 'babelogue' if m|babelogue\.citypages\.com|io; |
78
|
0
|
0
|
|
|
|
|
return 'jevon' if m|\.jevon\.org/|io; |
79
|
0
|
0
|
|
|
|
|
return 'tripod' if m|\.tripod\.com/|io; |
80
|
|
|
|
|
|
|
|
81
|
0
|
0
|
|
|
|
|
return 'xanga' if m|\.xanga\.com|o; |
82
|
|
|
|
|
|
|
# |
83
|
|
|
|
|
|
|
# CONTENT CHECKING |
84
|
|
|
|
|
|
|
# |
85
|
|
|
|
|
|
|
|
86
|
0
|
|
|
|
|
|
local $_ = $text; |
87
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
# First, check META tags |
89
|
|
|
|
|
|
|
|
90
|
0
|
0
|
|
|
|
|
return "postnuke" if m|CONTENT="Post-?Nuke|io; # Nuke is nice enough to use META tags |
91
|
0
|
0
|
|
|
|
|
return "php-nuke" if m|CONTENT="PHP-?Nuke|io; |
92
|
0
|
0
|
|
|
|
|
return "microsoft" if m|]+Content=['"]Microsoft Visual|io; |
93
|
0
|
0
|
|
|
|
|
return "nucleus" if m|]+content=['"]Nucleus|io; |
94
|
0
|
0
|
|
|
|
|
return "greymatter" if m|]+content=['"]Greymatter|io; |
95
|
0
|
0
|
|
|
|
|
return "land down under" if m|]+content=['"]Land Down Under|io; |
96
|
|
|
|
|
|
|
|
97
|
|
|
|
|
|
|
# Next, check actual content |
98
|
|
|
|
|
|
|
|
99
|
0
|
0
|
|
|
|
|
return "movable type" if m|cgi-bin/mt|o; |
100
|
0
|
0
|
|
|
|
|
return "movable type" if m|Powered by.*Move?able ?Type|io; # common typo is 'Moveable' |
101
|
0
|
0
|
|
|
|
|
return "movable type" if m|mtblog.gif|io; |
102
|
0
|
0
|
|
|
|
|
return "movable type" if m|move?abletype.gif|o; |
103
|
0
|
0
|
|
|
|
|
return "movable type" if m!function Open(Trackback|Comments)\s+\(c\)!o; # default MT JavaScript |
104
|
|
|
|
|
|
|
|
105
|
0
|
0
|
|
|
|
|
return "blogger pro" if m|powered_by_blogger_pro[0-9]*\.gif|io; |
106
|
0
|
0
|
|
|
|
|
return "blogger pro" if m|powered by:?
107
|
|
|
|
|
|
|
|
108
|
0
|
0
|
|
|
|
|
return "blogger" if m|bloggerbutton[0-9]+.gif|io; |
109
|
0
|
0
|
|
|
|
|
return "blogger" if m|bloggertemplate[^.]+.gif|io; |
110
|
0
|
0
|
|
|
|
|
return "blogger" if m|blogger_bluelong.gif|o; |
111
|
0
|
0
|
|
|
|
|
return "blogger" if m|powered by (
112
|
|
|
|
|
|
|
|
113
|
0
|
0
|
|
|
|
|
return "radio" if m|img src="http://radio.weblogs.com|io; |
114
|
0
|
0
|
|
|
|
|
return "radio" if m|http://radio.xmlstoragesystem.com/weblogStats|oi; |
115
|
0
|
0
|
|
|
|
|
return "radio" if m|images/radioUserLand|oi; |
116
|
0
|
0
|
|
|
|
|
return "radio" if m|xmlCoffeeCup|oi; |
117
|
|
|
|
|
|
|
|
118
|
0
|
0
|
|
|
|
|
return "manila" if m|thisIsAManilaSite|oi; |
119
|
|
|
|
|
|
|
|
120
|
0
|
0
|
|
|
|
|
return "cafelog" if m!function b2(?:open|comment)!o; # default cafelog JavaScript |
121
|
0
|
0
|
|
|
|
|
return "cafelog" if m|powered by (
122
|
|
|
|
|
|
|
|
123
|
0
|
0
|
|
|
|
|
return "pivot" if m||io; |
124
|
0
|
0
|
|
|
|
|
return "pivot" if m|pivot-?banner[^.]*.gif|io; |
125
|
|
|
|
|
|
|
|
126
|
0
|
0
|
|
|
|
|
return "textpattern" if m|txp_slug|o; |
127
|
0
|
0
|
|
|
|
|
return "blosxom" if /blosxom\.gif/o; |
128
|
|
|
|
|
|
|
|
129
|
0
|
0
|
|
|
|
|
return "slogger" if /Created by Slogger/io; |
130
|
|
|
|
|
|
|
|
131
|
0
|
0
|
|
|
|
|
return "greymatter" if /gm-icon.gif/o; |
132
|
0
|
0
|
|
|
|
|
return "greymatter" if /Powered by Greymatter/io; |
133
|
|
|
|
|
|
|
|
134
|
0
|
0
|
|
|
|
|
return "pMachine" if m|alt="[^"]+ pMachine|io; # This can be "Powered by" or "Gemaakt mit", for example |
135
|
0
|
0
|
|
|
|
|
return "pMachine" if m|powered by (?:
136
|
0
|
0
|
|
|
|
|
return "pMachine" if m|pmachine.gif|io; |
137
|
|
|
|
|
|
|
|
138
|
0
|
0
|
|
|
|
|
return "psychoblogger" if m|Powered by (?:]+>)?Psychoblogger|io; |
139
|
0
|
0
|
|
|
|
|
return "WebCrimson" if m|Powered by (?:]+>)?WebCrimson|io; |
140
|
|
|
|
|
|
|
|
141
|
|
|
|
|
|
|
# Tests of last resort |
142
|
0
|
|
|
|
|
|
my @blog_count = $text =~ /\bblog\b/gi; |
143
|
|
|
|
|
|
|
|
144
|
0
|
0
|
|
|
|
|
return "suspected by URL" if $url =~ /[\W\-_](?:we)?blog/o; |
145
|
0
|
0
|
|
|
|
|
return "suspected by URL" if $url =~ /\bbitacoras\b/i; |
146
|
0
|
0
|
|
|
|
|
return "suspected by rss" if $text =~ /\brss\b/i; |
147
|
0
|
0
|
|
|
|
|
return "suspected by content" if scalar @blog_count > 5; |
148
|
|
|
|
|
|
|
|
149
|
0
|
|
|
|
|
|
return; |
150
|
|
|
|
|
|
|
|
151
|
|
|
|
|
|
|
} |
152
|
|
|
|
|
|
|
|
153
|
|
|
|
|
|
|
1; |
154
|
|
|
|
|
|
|
|
155
|
|
|
|
|
|
|
__END__ |