File Coverage

blib/lib/Bio/WebService/LANL/SequenceLocator.pm

Criterion	Covered	Total	%
statement	29	193	15.0
branch	0	78	0.0
condition	0	36	0.0
subroutine	10	26	38.4
pod	1	6	16.6
total	40	339	11.8

line	stmt	bran	cond	sub	pod	time	code
1	1			1		916	use strictures 1;
	1					1150
	1					37
2	1			1		522	use utf8;
	1					7
	1					5
3	1			1		31	use 5.018;
	1					2
4
5							=head1 NAME
6
7							Bio::WebService::LANL::SequenceLocator - Locate sequences within HIV using LANL's web tool
8
9							=head1 SYNOPSIS
10
11							use Bio::WebService::LANL::SequenceLocator;
12
13							my $locator = Bio::WebService::LANL::SequenceLocator->new(
14							agent_string => 'Your Organization - you@example.com',
15							);
16							my @sequences = $locator->find([
17							"agcaatcagatggtcagccaaaattgccctatagtgcagaacatccaggggcaagtggtacatcaggccatatcacctagaactttaaatgca",
18							]);
19
20							See L below.
21
22							=head1 DESCRIPTION
23
24							This library provides simple programmatic access to
25							L
26							web tool and is also used to power
27							L
28							for the same tool (via L).
29
30							Nearly all of the information output by LANL's sequence locator is parsed and
31							provided by this library, though the results do vary slightly depending on the
32							base type of the query sequence. Multiple query sequences can be located at
33							the same time and results will be returned for all.
34
35							Results are extracted from both tab-delimited files provided by LANL as well as
36							the HTML itself.
37
38							=head1 EXAMPLE RESULTS
39
40							# Using @sequences from the SYNOPSIS above
41							use JSON;
42							print encode_json(\@sequences);
43
44							__END__
45							[
46							{
47							"query" : "sequence_1",
48							"query_sequence" : "AGCAATCAGATGGTCAGCCAAAATTGCCCTATAGTGCAGAACATCCAGGGGCAAGTGGTACATCAGGCCATATCACCTAGAACTTTAAATGCA",
49							"base_type" : "nucleotide",
50							"reverse_complement" : "0",
51							"alignment" : "\n Query AGCAATCAGA TGGTCAGCCA AAATTGCCCT ATAGTGCAGA ACATCCAGGG 50\n :::::::: ::::::::: ::::: :::: :::::::::: :::::::::: \n HXB2 AGCAATCA-- -GGTCAGCCA AAATTACCCT ATAGTGCAGA ACATCCAGGG 1208\n\n Query GCAAGTGGTA CATCAGGCCA TATCACCTAG AACTTTAAAT GCA 93\n :::: ::::: :::::::::: :::::::::: :::::::::: ::: \n HXB2 GCAAATGGTA CATCAGGCCA TATCACCTAG AACTTTAAAT GCA 1251\n\n ",
52							"hxb2_sequence" : "AGCAATCA---GGTCAGCCAAAATTACCCTATAGTGCAGAACATCCAGGGGCAAATGGTACATCAGGCCATATCACCTAGAACTTTAAATGCA",
53							"similarity_to_hxb2" : "94.6",
54							"start" : "373",
55							"end" : "462",
56							"genome_start" : "1162",
57							"genome_end" : "1251",
58							"polyprotein" : "Gag",
59							"region_names" : [
60							"Gag",
61							"p17",
62							"p24"
63							],
64							"regions" : [
65							{
66							"cds" : "Gag",
67							"aa_from_protein_start" : [ "125", "154" ],
68							"na_from_cds_start" : [ "373", "462" ],
69							"na_from_hxb2_start" : [ "1162", "1251" ],
70							"na_from_query_start" : [ "1", "93" ],
71							"protein_translation" : "SNQMVSQNCPIVQNIQGQVVHQAISPRTLNA"
72							},
73							{
74							"cds" : "p17",
75							"aa_from_protein_start" : [ "125", "132" ],
76							"na_from_cds_start" : [ "373", "396" ],
77							"na_from_hxb2_start" : [ "1162", "1185" ],
78							"na_from_query_start" : [ "1", "27" ],
79							"protein_translation" : "SNQMVSQNC"
80							},
81							{
82							"cds" : "p24",
83							"aa_from_protein_start" : [ "1", "22" ],
84							"na_from_cds_start" : [ "1", "66" ],
85							"na_from_hxb2_start" : [ "1186", "1251" ],
86							"na_from_query_start" : [ "28", "93" ],
87							"protein_translation" : "PIVQNIQGQVVHQAISPRTLNA"
88							}
89							]
90							}
91							]
92
93							=cut
94
95							package Bio::WebService::LANL::SequenceLocator;
96
97	1			1		410	use Moo;
	1					9457
	1					4
98	1			1		1457	use Data::Dumper;
	1					4327
	1					47
99	1			1		399	use HTML::LinkExtor;
	1					5618
	1					28
100	1			1		496	use HTML::TableExtract;
	1					5506
	1					5
101	1			1		413	use HTML::TokeParser;
	1					1562
	1					23
102	1			1		403	use HTTP::Request::Common;
	1					16182
	1					63
103	1			1		448	use List::AllUtils qw< pairwise part min max >;
	1					9420
	1					2206
104
105							our $VERSION = 20170324;
106
107							=head1 METHODS
108
109							=head2 new
110
111							Returns a new instance of this class. An optional parameter C
112							should be provided to identify yourself to LANL out of politeness. See the
113							L for an example.
114
115							=cut
116
117							has agent_string => (
118							is => 'ro',
119							lazy => 1,
120	0			0			builder => sub { '' },
121							);
122
123							has agent => (
124							is => 'ro',
125							lazy => 1,
126							builder => sub {
127	0			0			require LWP::UserAgent;
128	0						my $self = shift;
129	0						my $agent = LWP::UserAgent->new(
130							agent => join(" ", __PACKAGE__ . "/$VERSION", $self->agent_string),
131							);
132	0						$agent->env_proxy;
133	0						return $agent;
134							},
135							);
136
137							has lanl_base => (
138							is => 'ro',
139							lazy => 1,
140	0			0			builder => sub { 'https://www.hiv.lanl.gov' },
141							);
142
143							has lanl_endpoint => (
144							is => 'ro',
145							lazy => 1,
146	0			0			builder => sub { shift->lanl_base . '/cgi-bin/LOCATE/locate.cgi' },
147							);
148
149							has _bogus_slug => (
150							is => 'ro',
151							default => sub { 'BOGUS_SEQ_SO_TABULAR_FILES_ARE_LINKED_IN_OUTPUT' },
152							);
153
154							sub _request {
155	0			0			my $self = shift;
156	0						my $req = shift;
157	0						my $response = $self->agent->request($req);
158
159	0	0					if (not $response->is_success) {
160	0						warn sprintf "Request failed: %s %s -> %s\n",
161							$req->method, $req->uri, $response->status_line;
162	0						return;
163							}
164
165	0						return $response->decoded_content;
166							}
167
168							=head2 find
169
170							Takes an array ref of sequence strings. Sequences may be in amino acids or
171							nucleotides and mixed freely. Sequences should not be in FASTA format.
172
173							If sequence bases are not clearly nucleotides or clearly amino acids, LANL
174							seems to default to nucleotides. This can be an issue for some sequences since
175							the full alphabet for nucleotides overlaps with the alphabet for amino acids.
176							To overcome this problem, you may specify C<< base => 'nucleotide' >>
177							or C<< base => 'amino acid' >> after the array ref of sequences. This forces
178							every sequence to be interpreted as nucleotides or amino acids, so you cannot
179							mix base types in your sequences if you use this option. C, C, and
180							C are accepted aliases for C. C, C, C,
181							and C are accepted aliases for C.
182
183							Returns a list of hashrefs when called in list context, otherwise returns an
184							arrayref of hashrefs.
185
186							See L for the structure of the data returned.
187
188							=cut
189
190							sub find {
191	0			0	1		my ($self, $sequences, %args) = @_;
192
193	0	0					my $content = $self->submit_sequences($sequences, %args)
194							or return;
195
196	0						return $self->parse_html($content);
197							}
198
199							sub submit_sequences {
200	0			0	0		my ($self, $sequences, %args) = @_;
201
202	0	0					if (defined $args{base}) {
203	0						my $base = lc $args{base};
204	0	0					if ($base =~ /^n(uc(leotides?)?)?$/i) {
		0
205	0						$args{base} = 1;
206							} elsif ($base =~ /^(a(mino( acids?)?)?\|aa)$/i) {
207	0						$args{base} = 0;
208							} else {
209	0						warn "Unknown base type <$args{base}>, ignoring";
210	0						delete $args{base};
211							}
212							}
213
214							# Submit multiple sequences at once using FASTA
215							my $fasta = join "\n", map {
216	0						("> sequence_$_", $sequences->[$_ - 1])
	0
217							} 1 .. @$sequences;
218
219							# LANL only presents the parseable table.txt we want if there's more
220							# than a single sequence. We always add it so we can reliably skip it.
221	0						$fasta .= "\n> " . $self->_bogus_slug . "\n";
222
223							return $self->_request(
224							POST $self->lanl_endpoint,
225							Content_Type => 'form-data',
226							Content => [
227							organism => 'HIV',
228							DoReverseComplement => 0,
229							seq_input => $fasta,
230							(defined $args{base}
231							? ( base => $args{base} )
232	0	0					: ()),
233							],
234							);
235							}
236
237							sub parse_html {
238	0			0	0		my ($self, $content) = @_;
239
240							# Fetch and parse the two tables provided as links which removes the need
241							# to parse all of the HTML.
242	0						my @results = $self->parse_tsv($content);
243
244							# Now parse the table data from the HTML
245	0						my @tables = $self->parse_tables($content);
246
247							# Extract the alignments, parsing the HTML a third time!
248	0						my @alignments = $self->parse_alignments($content);
249
250	0	0	0				unless (@results and @tables and @alignments) {
			0
251	0						warn "Didn't find all three of TSV, tables, and alignments!\n";
252	0						warn "TSV: ", scalar @results, "\n";
253	0						warn "HTML tables: ", scalar @tables, "\n";
254	0						warn "HTML alignments: ", scalar @alignments, "\n";
255	0						warn "Content:\n$content\n", "=" x 80, "\n";
256	0						return;
257							}
258
259	0	0	0				unless (@results == @tables and @results == @alignments) {
260	0						warn "Tab-delimited results count doesn't match parsed HTML result count. Bug!\n";
261	0						warn "TSV: ", scalar @results, "\n";
262	0						warn "HTML tables: ", scalar @tables, "\n";
263	0						warn "HTML alignments: ", scalar @alignments, "\n";
264	0						warn "Content:\n$content\n", "=" x 80, "\n";
265	0						return;
266							}
267
268							@results = pairwise {
269							my $new = {
270							%$a,
271							base_type => $b->{base_type},
272							regions => $b->{rows},
273	0			0			region_names => [ map { $_->{cds} } @{$b->{rows}} ],
	0
	0
274							};
275	0						delete $new->{$_} for qw(protein protein_start protein_end);
276	0						$new;
277	0						} @results, @tables;
278
279	0			0			@results = pairwise { +{ %$b, %$a } } @results, @alignments;
	0
280
281							# Fill in genome start/end for amino acid sequences
282	0						for my $r (@results) {
283	0	0					next unless $r->{base_type} eq 'amino acid';
284
285	0	0	0				if ($r->{genome_start} or $r->{genome_end}) {
286	0						warn "Amino acid sequence with genome start/end already?!",
287							" query <$r->{query_sequence}>";
288	0						next;
289							}
290
291	0						$r->{genome_start} = min map { $_->{na_from_hxb2_start}[0] } @{$r->{regions}};
	0
	0
292	0						$r->{genome_end} = max map { $_->{na_from_hxb2_start}[1] } @{$r->{regions}};
	0
	0
293							}
294
295	0	0					return wantarray ? @results : \@results;
296							}
297
298							sub parse_tsv {
299	0			0	0		my ($self, $content) = @_;
300	0						my @results;
301							my %urls;
302
303							my $extract = HTML::LinkExtor->new(
304							sub {
305	0			0			my ($tag, %attr) = @_;
306	0	0	0				return unless $tag eq 'a' and $attr{href};
307	0	0					return unless $attr{href} =~ m{/(table\|simple_results)\.txt$};
308	0						$urls{$1} = $attr{href};
309							},
310	0						$self->lanl_base,
311							);
312	0						$extract->parse($content);
313
314	0						for my $table_name (qw(table simple_results)) {
315	0	0					next unless $urls{$table_name};
316	0	0					my $table = $self->_request(GET $urls{$table_name})
317							or next;
318
319	0						my (@these_results, %seen);
320	0						my @lines = split "\n", $table;
321							my @fields = map {
322	0						s/^SeqName$/query/; # standard key
	0
323	0						s/(?<=[a-z])(?=[A-Z])/_/g; # undo CamelCase
324	0						s/ +/_/g; # no spaces
325	0						y/A-Z/a-z/; # normalize to lowercase
326							# Account for the same field twice in the same data table
327	0	0					if ($seen{$_}++) {
328							$_ = /^(start\|end)$/
329							? "protein_$_"
330	0	0					: join "_", $_, $seen{$_};
331							}
332	0						$_;
333							} split "\t", shift @lines;
334
335	0						for (@lines) {
336	0						my @values = split "\t";
337	0						my %data;
338	0						@data{@fields} = @values;
339
340	0	0					next if $data{query} eq $self->_bogus_slug;
341
342							$data{query_sequence} =~ s/\s+//g
343	0	0					if $data{query_sequence};
344	0						push @these_results, \%data;
345							}
346
347							# Merge with existing results, if any
348							@results = @results
349	0	0		0			? pairwise { +{ %$a, %$b } } @results, @these_results
	0
350							: @these_results;
351							}
352
353	0						return @results;
354							}
355
356							sub parse_tables {
357	0			0	0		my ($self, $content) = @_;
358	0						my @tables;
359
360	0						my %columns_for = (
361							'amino acid' => [
362							"CDS" => "cds",
363							"AA position relative to protein start in HXB2" => "aa_from_protein_start",
364							"AA position relative to query sequence start" => "aa_from_query_start",
365							"AA position relative to polyprotein start in HXB2" => "aa_from_polyprotein_start",
366							"NA position relative to CDS start in HXB2" => "aa_from_cds_start",
367							"NA position relative to HXB2 genome start" => "na_from_hxb2_start",
368							],
369							'nucleotide' => [
370							"CDS" => "cds",
371							"Nucleotide position relative to CDS start in HXB2" => "na_from_cds_start",
372							"Nucleotide position relative to query sequence start" => "na_from_query_start",
373							"Nucleotide position relative to HXB2 genome start" => "na_from_hxb2_start",
374							"Amino Acid position relative to protein start in HXB2" => "aa_from_protein_start",
375							],
376							);
377
378	0						for my $base_type (sort keys %columns_for) {
379							my ($their_cols, $our_cols) = part {
380	0			0			state $i = 0;
381	0						$i++ % 2
382	0						} @{ $columns_for{$base_type} };
	0
383
384	0						my $extract = HTML::TableExtract->new( headers => $their_cols );
385	0						$extract->parse($content);
386
387							# Examine all matching tables
388	0						for my $table ($extract->tables) {
389	0						my %table = (
390							coords => [$table->coords],
391							base_type => $base_type,
392							columns => $our_cols,
393							rows => [],
394							);
395	0						for my $row ($table->rows) {
396	0	0					@$row = map { defined $_ ? s/^\s+\|\s*$//gr : $_ } @$row;
	0
397
398							# An empty row with only a sequence string in the first column.
399	0	0	0				if ( $row->[0]
			0
400							and $row->[0] =~ /^[A-Za-z]+$/
401	0	0					and not grep { defined and length } @$row[1 .. scalar @$row - 1])
402							{
403	0						$table{rows}->[-1]{protein_translation} = $row->[0];
404	0						next;
405							}
406
407							# Not all rows are data, some are informational sentences.
408	0	0					next if grep { not defined } @$row;
	0
409
410	0						my %row;
411							@row{@$our_cols} =
412	0	0	0				map { ($_ and $_ eq "NA") ? undef : $_ }
413	0	0	0				map { ($_ and /(\d+) → (\d+)/) ? [$1, $2] : $_ }
	0
414							@$row;
415
416	0						push @{$table{rows}}, \%row;
	0
417							}
418							push @tables, \%table
419	0	0					if @{$table{rows}};
	0
420							}
421							}
422
423							# Sort by depth, then within each depth by count
424							@tables = sort {
425	0						$a->{coords}[0] <=> $b->{coords}[0]
426	0	0					or $a->{coords}[1] <=> $b->{coords}[1]
427							} @tables;
428
429	0	0					if (@tables > 1) {
430	0	0	0				unless ( $tables[-1]->{rows}[0]{na_from_query_start} eq "1 →"
431							and $tables[-1]->{rows}[0]{protein_translation} eq "X") {
432	0						warn "Last table appears to be real!? It should be the bogus table of the bogus sequence.";
433	0						warn "Table is ", Dumper($tables[-1]), "\n";
434	0						return;
435							} else {
436	0						pop @tables;
437							}
438							}
439
440	0						return @tables;
441							}
442
443							sub parse_alignments {
444	0			0	0		my ($self, $content) = @_;
445	0						my @alignments;
446
447	0						my $doc = HTML::TokeParser->new(
448							\$content,
449							unbroken_text => 1,
450							);
451
452	0						my $expect_alignment = 0;
453
454	0						while (my $tag = $doc->get_tag("b", "pre")) {
455	0						my $name = lc $tag->[0];
456	0						my $text = $doc->get_text;
457	0	0					next unless defined $text;
458
459							# s are preceeded by a bold header, which we use as an indicator
460	0	0					if ($name eq 'b') {
		0
461	0						$expect_alignment = $text =~ /Alignment\s+of\s+the\s+query\s+sequence\s+to\s+HXB2/i;
462							} elsif ($name eq 'pre') {
463	0	0	0				if ($text =~ /^\sQuery\b/m and $text =~ /^\sHXB2\b/m) {
		0	0
464	0						push @alignments, $text;
465	0	0					warn "Not expecting alignment, but found one‽"
466							unless $expect_alignment;
467							}
468							elsif ($text =~ /^\s+$/ and $expect_alignment) {
469	0						push @alignments, undef; # We appear to have found an unaligned sequence.
470							}
471	0						$expect_alignment = 0;
472							}
473							}
474
475	0	0					if (defined $alignments[-1]) {
476	0						warn "Last alignment is non-null! It should be the empty alignment of the bogus sequence.";
477	0						warn "Alignment is <$alignments[-1]>\n";
478	0						return;
479							} else {
480	0						pop @alignments;
481							}
482
483	0						my @results;
484	0						for (@alignments) {
485	0						my @hxb2;
486	0	0					if (defined) {
487	0						push @hxb2, $1 =~ s/\s+//gr
488							while /^\sHXB2\b\s+(.+?)(?:\s+\d+\|\s)$/gm;
489							}
490	0	0					push @results, {
491							alignment => $_,
492							hxb2_sequence => @hxb2 ? join("", @hxb2) : undef,
493							};
494							}
495
496	0						return @results;
497							}
498
499							=head1 AUTHOR
500
501							Thomas Sibley Etrsibley@uw.eduE
502
503							=head1 COPYRIGHT
504
505							Copyright 2014 by the Mullins Lab, Department of Microbiology, University of
506							Washington.
507
508							=head1 LICENSE
509
510							Licensed under the same terms as Perl 5 itself.
511
512							=cut
513
514							42;