line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package WWW::HtmlUnit; |
2
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
=head1 NAME |
4
|
|
|
|
|
|
|
|
5
|
|
|
|
|
|
|
WWW::HtmlUnit - Inline::Java based wrapper of the HtmlUnit v2.14 library |
6
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
=head1 SYNOPSIS |
8
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
use WWW::HtmlUnit; |
10
|
|
|
|
|
|
|
my $webClient = WWW::HtmlUnit->new; |
11
|
|
|
|
|
|
|
my $page = $webClient->getPage("http://google.com/"); |
12
|
|
|
|
|
|
|
my $f = $page->getFormByName('f'); |
13
|
|
|
|
|
|
|
my $submit = $f->getInputByName("btnG"); |
14
|
|
|
|
|
|
|
my $query = $f->getInputByName("q"); |
15
|
|
|
|
|
|
|
$page = $query->type("HtmlUnit"); |
16
|
|
|
|
|
|
|
$page = $query->type("\n"); |
17
|
|
|
|
|
|
|
|
18
|
|
|
|
|
|
|
my $content = $page->asXml; |
19
|
|
|
|
|
|
|
print "Result:\n$content\n\n"; |
20
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
=head1 DESCRIPTION |
22
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
This is a wrapper around the HtmlUnit library. It includes the HtmlUnit jar itself and its dependencies. All this library really does is find the jars and load them up using L<Inline::Java>. |
24
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
The reason all this is interesting? HtmlUnit has very good javascript support, so you can automate, scrape, or test javascript-required websites. |
26
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
See especially the HtmlUnit documentation on their site for deeper API documentation, L. |
28
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
=head1 INSTALLING |
30
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
There is one special thing that I've run into when installing L<Inline::Java>, and thus L<WWW::HtmlUnit>, which is telling the installer where to find your java home. It turns out this is really easy: just define the JAVA_HOME environment variable before you start your CPAN shell / installer. On Debian/Ubuntu, I do: |
32
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
sudo apt-get install default-jdk |
34
|
|
|
|
|
|
|
sudo JAVA_HOME=/usr/lib/jvm/default-java cpanm WWW::HtmlUnit |
35
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
and everything works the way I want! |
37
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
=head1 DOCUMENTATION |
39
|
|
|
|
|
|
|
|
40
|
|
|
|
|
|
|
You can get the bulk of the documentation directly from the L. Since WWW::HtmlUnit is mostly a wrapper around the real Java API, what you actually have to do is translate some of the java notation into perl notation. Mostly this is replacing '.' with '->'. |
41
|
|
|
|
|
|
|
|
42
|
|
|
|
|
|
|
Key classes that you might want to look at: |
43
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
=over 4 |
45
|
|
|
|
|
|
|
|
46
|
|
|
|
|
|
|
=item WebClient |
47
|
|
|
|
|
|
|
|
48
|
|
|
|
|
|
|
Represents a web browser. This is what C<< WWW::HtmlUnit->new >> returns. |
49
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
=item HtmlPage |
51
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
A single HTML Page. |
53
|
|
|
|
|
|
|
|
54
|
|
|
|
|
|
|
=item HtmlElement |
55
|
|
|
|
|
|
|
|
56
|
|
|
|
|
|
|
An individual HTML element (node). |
57
|
|
|
|
|
|
|
|
58
|
|
|
|
|
|
|
=back |
59
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
Also see L<WWW::HtmlUnit::Sweet> for a way to pretend that HtmlUnit works a little like L<WWW::Mechanize>, but not really. |
61
|
|
|
|
|
|
|
|
62
|
|
|
|
|
|
|
=cut |
63
|
|
|
|
|
|
|
|
64
|
4
|
|
|
4
|
|
62559
|
use strict; |
|
4
|
|
|
|
|
9
|
|
|
4
|
|
|
|
|
147
|
|
65
|
4
|
|
|
4
|
|
26
|
use warnings; |
|
4
|
|
|
|
|
6
|
|
|
4
|
|
|
|
|
2088
|
|
66
|
|
|
|
|
|
|
|
67
|
|
|
|
|
|
|
our $VERSION = '0.22'; |
68
|
|
|
|
|
|
|
|
69
|
|
|
|
|
|
|
# Locate the directory holding the .jar files bundled with this module.
#
# The directory is derived from the module's own file path by replacing
# the trailing ".pm" with "/jar", so ".../WWW/HtmlUnit.pm" maps to
# ".../WWW/HtmlUnit/jar".
#
# May be called as a plain function or as a class method; any arguments
# are ignored.
#
# Returns the jar directory path as a string.
sub find_jar_path {
    # %INC only has an entry when the module was loaded through the usual
    # use/require mechanism. Fall back to this file's own path so the
    # substitution below never operates on undef.
    my $path = $INC{'WWW/HtmlUnit.pm'};
    $path = __FILE__ unless defined $path;

    $path =~ s{\.pm$}{/jar};
    return $path;
}
75
|
|
|
|
|
|
|
|
76
|
|
|
|
|
|
|
# Pick the separator placed between entries of a Java classpath for the
# given operating system name (a value such as $^O holds).
#
# The name is compared exactly instead of being searched for "win":
# a substring match also hits "darwin" (macOS), which must get ":".
# "MSWin32" and "cygwin" keep the ";" they have always received here.
#
# Returns ";" or ":".
sub _classpath_separator_for {
    my ($os_name) = @_;
    return $os_name =~ /\A(?:MSWin32|cygwin)\z/ ? ";" : ":";
}

# Separator used when joining jar paths into a CLASSPATH string.
our $classpath_separator = _classpath_separator_for($^O);
77
|
|
|
|
|
|
|
# Build the classpath string covering every jar bundled with this module.
#
# Each file name listed below is prefixed with the directory reported by
# find_jar_path(), and the resulting paths are glued together with
# $classpath_separator.
#
# Returns the classpath as a single string.
sub collect_default_jars {
    my @bundled_jars = qw(
        commons-codec-1.9.jar
        commons-collections-3.2.1.jar
        commons-io-2.4.jar
        commons-lang3-3.2.1.jar
        commons-logging-1.1.3.jar
        cssparser-0.9.13.jar
        htmlunit-2.14.jar
        htmlunit-confirmhandler-2.8.jar
        htmlunit-core-js-2.14.jar
        httpclient-4.3.2.jar
        httpcore-4.3.1.jar
        httpmime-4.3.2.jar
        jetty-http-8.1.14.v20131031.jar
        jetty-io-8.1.14.v20131031.jar
        jetty-util-8.1.14.v20131031.jar
        jetty-websocket-8.1.14.v20131031.jar
        nekohtml-1.9.20.jar
        sac-1.3.jar
        serializer-2.7.1.jar
        xalan-2.7.1.jar
        xercesImpl-2.11.0.jar
        xml-apis-1.4.01.jar
    );

    my $jar_dir = find_jar_path();

    my @jar_files;
    for my $jar_name (@bundled_jars) {
        push @jar_files, "$jar_dir/$jar_name";
    }

    return join($classpath_separator, @jar_files);
}
104
|
|
|
|
|
|
|
|
105
|
|
|
|
|
|
|
=head1 MODULE IMPORT PARAMETERS |
106
|
|
|
|
|
|
|
|
107
|
|
|
|
|
|
|
In general, any parameters you pass while importing ('use'-ing) L<WWW::HtmlUnit> will be passed on to L<Inline::Java>. A handy one is the 'DIRECTORY' parameter, for example. A few parameters are handled specially, however. |
108
|
|
|
|
|
|
|
|
109
|
|
|
|
|
|
|
If you need to include extra .jar files, and/or if you want to study more java classes, you can do: |
110
|
|
|
|
|
|
|
|
111
|
|
|
|
|
|
|
use WWW::HtmlUnit |
112
|
|
|
|
|
|
|
jars => ['/path/to/blah.jar'], |
113
|
|
|
|
|
|
|
study => ['class.to.study']; |
114
|
|
|
|
|
|
|
|
115
|
|
|
|
|
|
|
and the jars will be added to the list of jars for L<Inline::Java> to autostudy, and the classes will be added to the list of classes for L<Inline::Java> to immediately study. A class must be on the study list to be directly instantiated. |
116
|
|
|
|
|
|
|
|
117
|
|
|
|
|
|
|
Whether you ask for it or not, WebClient, BrowserVersion, CollectingAlertHandler, and ClickConfirmHandler (each in the com.gargoylesoftware.htmlunit package) and Cookie (in the com.gargoylesoftware.htmlunit.util package) are studied. You can get to studied classes by adding WWW::HtmlUnit:: to their package name. So, you could make a cookie like this: |
118
|
|
|
|
|
|
|
|
119
|
|
|
|
|
|
|
my $cookie = WWW::HtmlUnit::com::gargoylesoftware::htmlunit::util::Cookie->new($name, $value); |
120
|
|
|
|
|
|
|
$webClient->getCookieManager->addCookie($cookie); |
121
|
|
|
|
|
|
|
|
122
|
|
|
|
|
|
|
Which is, incidentally, just the sort of thing that I should wrap in WWW::HtmlUnit::Sweet or elsewhere, 'cause that is UGLY! |
123
|
|
|
|
|
|
|
|
124
|
|
|
|
|
|
|
=cut |
125
|
|
|
|
|
|
|
|
126
|
|
|
|
|
|
|
# Handle "use WWW::HtmlUnit ...": load Inline and configure it for Java
# with the bundled jars on the classpath.
#
# Parameters are a key/value list. Two keys are consumed here and are
# never forwarded to Inline:
#   jars  - arrayref of extra .jar paths appended to the classpath
#   study - arrayref of extra Java class names added to the study list
# Every remaining key/value pair is handed to Inline->import unchanged,
# after the settings built here.
#
# Returns whatever Inline->import returns.
sub import {
    my ($class, %parameters) = @_;

    # delete hands back the removed value, so the special keys are pulled
    # out of %parameters whether or not they carry a usable value.
    my $extra_jars  = delete $parameters{'jars'};
    my $extra_study = delete $parameters{'study'};

    # Collect classpath entries as a list and join once at the end, so no
    # dangling separator (an empty classpath entry, which a JVM may
    # interpret as the current directory) is produced when there are no
    # custom jars.
    my @classpath = (collect_default_jars());
    push @classpath, @{$extra_jars} if $extra_jars;

    # Classes that are always studied, followed by any the caller asked for.
    my @STUDY = (
        'com.gargoylesoftware.htmlunit.WebClient',
        'com.gargoylesoftware.htmlunit.BrowserVersion',
        'com.gargoylesoftware.htmlunit.util.Cookie',
        'com.gargoylesoftware.htmlunit.CollectingAlertHandler',
        'com.gargoylesoftware.htmlunit.ClickConfirmHandler',
    );
    push @STUDY, @{$extra_study} if $extra_study;

    # Inline is loaded at import time rather than at compile time of this
    # file.
    require Inline;
    Inline->import(
        Java      => 'STUDY',
        STUDY     => \@STUDY,
        AUTOSTUDY => 1,
        CLASSPATH => join($classpath_separator, @classpath),
        %parameters
    );
}
156
|
|
|
|
|
|
|
|
157
|
|
|
|
|
|
|
=head1 METHODS |
158
|
|
|
|
|
|
|
|
159
|
|
|
|
|
|
|
=head2 $webClient = WWW::HtmlUnit->new($browser_name) |
160
|
|
|
|
|
|
|
|
161
|
|
|
|
|
|
|
This is just a shortcut for |
162
|
|
|
|
|
|
|
|
163
|
|
|
|
|
|
|
$webClient = WWW::HtmlUnit::com::gargoylesoftware::htmlunit::WebClient->new; |
164
|
|
|
|
|
|
|
|
165
|
|
|
|
|
|
|
The optional $browser_name allows you to specify which browser version to pass to the WebClient->new method. You could pass "FIREFOX_3" for example, to make the engine especially try to emulate Firefox 3 quirks, I imagine. |
166
|
|
|
|
|
|
|
|
167
|
|
|
|
|
|
|
=cut |
168
|
|
|
|
|
|
|
|
169
|
|
|
|
|
|
|
# Create a WebClient, optionally emulating a specific browser.
#
# Parameters:
#   $class   - invocant (unused beyond being the method receiver)
#   $version - optional name of a static member of the studied
#              BrowserVersion class; when false, the WebClient is built
#              with no arguments
#
# Returns the object produced by WebClient->new.
#
# Dies (via Carp::croak) when $version is not a plain identifier or when
# no BrowserVersion member of that name is defined.
sub new {
    my ($class, $version) = @_;

    return WWW::HtmlUnit::com::gargoylesoftware::htmlunit::WebClient->new
        unless $version;

    # The name ends up inside a package-variable name, so only a plain
    # identifier is accepted. This replaces a string eval that would have
    # executed arbitrary Perl embedded in $version.
    if ($version !~ /\A[A-Za-z_][A-Za-z0-9_]*\z/) {
        require Carp;
        Carp::croak("Invalid browser version name '$version'");
    }

    # Look the member up as a package variable by name.
    my $browser_version = do {
        no strict 'refs';
        ${"WWW::HtmlUnit::com::gargoylesoftware::htmlunit::BrowserVersion::$version"};
    };

    # Fail loudly instead of silently handing undef to the constructor.
    if (!defined $browser_version) {
        require Carp;
        Carp::croak("Unknown browser version '$version'");
    }

    return WWW::HtmlUnit::com::gargoylesoftware::htmlunit::WebClient->new($browser_version);
}
178
|
|
|
|
|
|
|
|
179
|
|
|
|
|
|
|
=head1 DEPENDENCIES |
180
|
|
|
|
|
|
|
|
181
|
|
|
|
|
|
|
When installed using the CPAN shell, all dependencies besides java itself will be installed. This includes the HtmlUnit jar files, and in fact those files make up the bulk of the distribution, byte-wise. |
182
|
|
|
|
|
|
|
|
183
|
|
|
|
|
|
|
=head1 TIPS |
184
|
|
|
|
|
|
|
|
185
|
|
|
|
|
|
|
=head2 Working with java list/collections |
186
|
|
|
|
|
|
|
|
187
|
|
|
|
|
|
|
When you get a java list, it is actually an object-thingie. You gotta call C<< ->toArray() >> on it, and then you'll get a lovely perl arrayref, which is most likely what you wanted in the first place. I am open to suggestions for a mass work-around for this. |
188
|
|
|
|
|
|
|
|
189
|
|
|
|
|
|
|
|
190
|
|
|
|
|
|
|
=head2 HTTP Authentication |
191
|
|
|
|
|
|
|
|
192
|
|
|
|
|
|
|
my $credentialsProvider = $webClient->getCredentialsProvider; |
193
|
|
|
|
|
|
|
$credentialsProvider->addCredentials($username, $password); |
194
|
|
|
|
|
|
|
|
195
|
|
|
|
|
|
|
=head2 Disable SSL certificate checking |
196
|
|
|
|
|
|
|
|
197
|
|
|
|
|
|
|
$webClient->setUseInsecureSSL(1); |
198
|
|
|
|
|
|
|
|
199
|
|
|
|
|
|
|
=head2 Handling alerts and confirmations |
200
|
|
|
|
|
|
|
|
201
|
|
|
|
|
|
|
We (thanks lungching!) wrote a wee bit of java to make this easy. Though I admit that it could be a bit more... perlish. For a full example, see L. |
202
|
|
|
|
|
|
|
|
203
|
|
|
|
|
|
|
my $alert_handler = WWW::HtmlUnit::com::gargoylesoftware::htmlunit::CollectingAlertHandler->new(); |
204
|
|
|
|
|
|
|
$webClient->setAlertHandler($alert_handler); |
205
|
|
|
|
|
|
|
# ... |
206
|
|
|
|
|
|
|
my $alert_arrayref = $alert_handler->getCollectedAlerts->toArray(); |
207
|
|
|
|
|
|
|
|
208
|
|
|
|
|
|
|
=head1 TODO |
209
|
|
|
|
|
|
|
|
210
|
|
|
|
|
|
|
=over 4 |
211
|
|
|
|
|
|
|
|
212
|
|
|
|
|
|
|
=item * Capture HtmlUnit output to a variable |
213
|
|
|
|
|
|
|
|
214
|
|
|
|
|
|
|
=item * Use that to have a quiet-mode |
215
|
|
|
|
|
|
|
|
216
|
|
|
|
|
|
|
=item * Document lungching's confirmation handler code, automate build |
217
|
|
|
|
|
|
|
|
218
|
|
|
|
|
|
|
=back |
219
|
|
|
|
|
|
|
|
220
|
|
|
|
|
|
|
=head1 SEE ALSO |
221
|
|
|
|
|
|
|
|
222
|
|
|
|
|
|
|
L, L, L |
223
|
|
|
|
|
|
|
|
224
|
|
|
|
|
|
|
=head1 AUTHOR |
225
|
|
|
|
|
|
|
|
226
|
|
|
|
|
|
|
Brock Wilcox - http://thelackthereof.org/ |
227
|
|
|
|
|
|
|
|
228
|
|
|
|
|
|
|
=head1 COPYRIGHT |
229
|
|
|
|
|
|
|
|
230
|
|
|
|
|
|
|
Copyright (c) 2009-2014 Brock Wilcox. All rights |
231
|
|
|
|
|
|
|
reserved. This program is free software; you can redistribute it and/or |
232
|
|
|
|
|
|
|
modify it under the same terms as Perl itself. |
233
|
|
|
|
|
|
|
|
234
|
|
|
|
|
|
|
HtmlUnit library includes the following copyright: |
235
|
|
|
|
|
|
|
|
236
|
|
|
|
|
|
|
/* |
237
|
|
|
|
|
|
|
* Copyright (c) 2002-2014 Gargoyle Software Inc. |
238
|
|
|
|
|
|
|
* |
239
|
|
|
|
|
|
|
* Licensed under the Apache License, Version 2.0 (the "License"); |
240
|
|
|
|
|
|
|
* you may not use this file except in compliance with the License. |
241
|
|
|
|
|
|
|
* You may obtain a copy of the License at |
242
|
|
|
|
|
|
|
* http://www.apache.org/licenses/LICENSE-2.0 |
243
|
|
|
|
|
|
|
* |
244
|
|
|
|
|
|
|
* Unless required by applicable law or agreed to in writing, software |
245
|
|
|
|
|
|
|
* distributed under the License is distributed on an "AS IS" BASIS, |
246
|
|
|
|
|
|
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
247
|
|
|
|
|
|
|
* See the License for the specific language governing permissions and |
248
|
|
|
|
|
|
|
* limitations under the License. |
249
|
|
|
|
|
|
|
*/ |
250
|
|
|
|
|
|
|
|
251
|
|
|
|
|
|
|
=cut |
252
|
|
|
|
|
|
|
|
253
|
|
|
|
|
|
|
1; |
254
|
|
|
|
|
|
|
|