File Coverage

blib/lib/WebService/LOC/CongRec/Page.pm

Criterion	Covered	Total	%
statement	4	6	66.6
branch			n/a
condition			n/a
subroutine	2	2	100.0
pod			n/a
total	6	8	75.0

line	stmt	sub	time	code
1	1	1	3264	use 5.12.0;
	1		4
	1		70
2
3				package WebService::LOC::CongRec::Page;
4				our $VERSION = '0.1_04';
5	1	1	217	use Moose 1.13;
	0
	0
6				with 'MooseX::Log::Log4perl';
7
8				use HTML::Strip;
9				use Data::Dumper;
10
11				=head1 DESCRIPTION
12
13				A single page from the Congressional Record on THOMAS.
14
15				The URL is not persistent, but is along the lines of:
16				http://thomas.loc.gov/cgi-bin/query/D?r111:15:./temp/~r111h782Bg::
17
18				=cut
19
20				=head1 ATTRIBUTES
21
22				=over 1
23
24				=item mech
25
26				A WWW::Mechanize object that we can use to grab the page from THOMAS.
27
28				=cut
29
30				has 'mech' => (    # user agent object; BUILD calls ->get and ->content on it
31				is => 'rw',
32				isa => 'Object',    # any blessed object passes; the constraint does not require WWW::Mechanize specifically
33				required => 1,    # must be passed to the constructor
34				);
35
36				=item url
37
38				The page URL.
39
40				=cut
41
42				has 'url' => (    # address that BUILD fetches through mech
43				is => 'ro',    # read-only accessor
44				isa => 'Str',
45				required => 1,    # must be passed to the constructor
46				);
47
48				=item pageID
49
50				This page's ID.
51
52				=cut
53
54				has 'pageID' => (    # set by BUILD from a "[Page: X123]" marker line, when one is present
55				is => 'rw',
56				isa => 'Str',    # no default is declared, so it stays unset if no marker line matches
57				);
58
59				=item summary
60
61				This page's summary.
62
63				=cut
64
65				has 'summary' => (    # set by BUILD from a line of the form <b>...</b><br/>
66				is => 'rw',
67				isa => 'Str',    # no default is declared, so it stays unset if no such line matches
68				);
69
70				=item content
71
72				This page's content.
73
74				=cut
75
76				has 'content' => (    # tag-stripped page text, built up by BUILD one line at a time
77				is => 'rw',
78				isa => 'Str',
79				default => '',    # starts empty so BUILD can append to it without an undef check
80				);
81
82				=back
83
84				=cut
85
86				sub BUILD {    # Moose post-construction hook: fetch url via mech, then fill summary, pageID and content
87				my ($self) = @_;
88
89				my $tagStripper = HTML::Strip->new();
90				$self->mech->get($self->url);    # return value is not checked here
91
92				my @lines = split /\n/, $self->mech->content;
93				foreach my $line (@lines) {
94				# Summary line doesn't have a <p> leader, so it is tested before the <p> filter below
95				if ($line =~ m!^<b>(.+)</b><br/>!) {
96				$self->summary($1);
97				next;
98				}
99				# Skip every line that does not start with <p>, and every line that starts with <p>---
100				next if ($line !~ /^<p>/ || $line =~ /^<p>---/);
101
102				# Page ID: one of the letters H, S or E followed by 1-6 digits, e.g. "[Page: H1234]"
103				if ($line =~ m!^<p><center><pre>\[Page: ([HSE]\d{1,6})\] <b>!) {
104				$self->pageID($1);
105				next;
106				}
107
108				# Line of actual content: everything after the leading <p>
109				if ($line =~ m/^<p>(.*)$/) {
110				my $text = $tagStripper->parse($1) . "\n";    # strip markup, keep one output line per input line
111
112				# Strip non-breaking spaces
113				$text =~ s/\xA0//g;
114
115				$self->content($self->content . $text);    # append to what has been collected so far
116
117				$tagStripper->eof();    # tell the stripper this fragment is finished before the next parse call
118				}
119				}
120				}
121
122				1;