File Coverage

blib/lib/Bio/WebService/LANL/SequenceLocator.pm
Criterion Covered Total %
statement 29 193 15.0
branch 0 78 0.0
condition 0 36 0.0
subroutine 10 26 38.4
pod 1 6 16.6
total 40 339 11.8


line stmt bran cond sub pod time code
1 1     1   916 use strictures 1;
  1         1150  
  1         37  
2 1     1   522 use utf8;
  1         7  
  1         5  
3 1     1   31 use 5.018;
  1         2  
4              
5             =head1 NAME
6              
7             Bio::WebService::LANL::SequenceLocator - Locate sequences within HIV using LANL's web tool
8              
9             =head1 SYNOPSIS
10              
11             use Bio::WebService::LANL::SequenceLocator;
12            
13             my $locator = Bio::WebService::LANL::SequenceLocator->new(
14             agent_string => 'Your Organization - you@example.com',
15             );
16             my @sequences = $locator->find([
17             "agcaatcagatggtcagccaaaattgccctatagtgcagaacatccaggggcaagtggtacatcaggccatatcacctagaactttaaatgca",
18             ]);
19              
20             See L below.
21              
22             =head1 DESCRIPTION
23              
24             This library provides simple programmatic access to
25             L
26             web tool and is also used to power
27             L
28             for the same tool (via L).
29              
30             Nearly all of the information output by LANL's sequence locator is parsed and
31             provided by this library, though the results do vary slightly depending on the
32             base type of the query sequence. Multiple query sequences can be located at
33             the same time and results will be returned for all.
34              
35             Results are extracted from both tab-delimited files provided by LANL as well as
36             the HTML itself.
37              
38             =head1 EXAMPLE RESULTS
39              
40             # Using @sequences from the SYNOPSIS above
41             use JSON;
42             print encode_json(\@sequences);
43            
44             __END__
45             [
46             {
47             "query" : "sequence_1",
48             "query_sequence" : "AGCAATCAGATGGTCAGCCAAAATTGCCCTATAGTGCAGAACATCCAGGGGCAAGTGGTACATCAGGCCATATCACCTAGAACTTTAAATGCA",
49             "base_type" : "nucleotide",
50             "reverse_complement" : "0",
51             "alignment" : "\n Query AGCAATCAGA TGGTCAGCCA AAATTGCCCT ATAGTGCAGA ACATCCAGGG 50\n :::::::: ::::::::: ::::: :::: :::::::::: :::::::::: \n HXB2 AGCAATCA-- -GGTCAGCCA AAATTACCCT ATAGTGCAGA ACATCCAGGG 1208\n\n Query GCAAGTGGTA CATCAGGCCA TATCACCTAG AACTTTAAAT GCA 93\n :::: ::::: :::::::::: :::::::::: :::::::::: ::: \n HXB2 GCAAATGGTA CATCAGGCCA TATCACCTAG AACTTTAAAT GCA 1251\n\n ",
52             "hxb2_sequence" : "AGCAATCA---GGTCAGCCAAAATTACCCTATAGTGCAGAACATCCAGGGGCAAATGGTACATCAGGCCATATCACCTAGAACTTTAAATGCA",
53             "similarity_to_hxb2" : "94.6",
54             "start" : "373",
55             "end" : "462",
56             "genome_start" : "1162",
57             "genome_end" : "1251",
58             "polyprotein" : "Gag",
59             "region_names" : [
60             "Gag",
61             "p17",
62             "p24"
63             ],
64             "regions" : [
65             {
66             "cds" : "Gag",
67             "aa_from_protein_start" : [ "125", "154" ],
68             "na_from_cds_start" : [ "373", "462" ],
69             "na_from_hxb2_start" : [ "1162", "1251" ],
70             "na_from_query_start" : [ "1", "93" ],
71             "protein_translation" : "SNQMVSQNCPIVQNIQGQVVHQAISPRTLNA"
72             },
73             {
74             "cds" : "p17",
75             "aa_from_protein_start" : [ "125", "132" ],
76             "na_from_cds_start" : [ "373", "396" ],
77             "na_from_hxb2_start" : [ "1162", "1185" ],
78             "na_from_query_start" : [ "1", "27" ],
79             "protein_translation" : "SNQMVSQNC"
80             },
81             {
82             "cds" : "p24",
83             "aa_from_protein_start" : [ "1", "22" ],
84             "na_from_cds_start" : [ "1", "66" ],
85             "na_from_hxb2_start" : [ "1186", "1251" ],
86             "na_from_query_start" : [ "28", "93" ],
87             "protein_translation" : "PIVQNIQGQVVHQAISPRTLNA"
88             }
89             ]
90             }
91             ]
92              
93             =cut
94              
95             package Bio::WebService::LANL::SequenceLocator;
96              
97 1     1   410 use Moo;
  1         9457  
  1         4  
98 1     1   1457 use Data::Dumper;
  1         4327  
  1         47  
99 1     1   399 use HTML::LinkExtor;
  1         5618  
  1         28  
100 1     1   496 use HTML::TableExtract;
  1         5506  
  1         5  
101 1     1   413 use HTML::TokeParser;
  1         1562  
  1         23  
102 1     1   403 use HTTP::Request::Common;
  1         16182  
  1         63  
103 1     1   448 use List::AllUtils qw< pairwise part min max >;
  1         9420  
  1         2206  
104              
105             our $VERSION = 20170324;
106              
107             =head1 METHODS
108              
109             =head2 new
110              
111             Returns a new instance of this class. An optional parameter C
112             should be provided to identify yourself to LANL out of politeness. See the
113             L for an example.
114              
115             =cut
116              
117             has agent_string => (
118             is => 'ro',
119             lazy => 1,
120 0     0     builder => sub { '' },
121             );
122              
123             has agent => (
124             is => 'ro',
125             lazy => 1,
126             builder => sub {
127 0     0     require LWP::UserAgent;
128 0           my $self = shift;
129 0           my $agent = LWP::UserAgent->new(
130             agent => join(" ", __PACKAGE__ . "/$VERSION", $self->agent_string),
131             );
132 0           $agent->env_proxy;
133 0           return $agent;
134             },
135             );
136              
137             has lanl_base => (
138             is => 'ro',
139             lazy => 1,
140 0     0     builder => sub { 'https://www.hiv.lanl.gov' },
141             );
142              
143             has lanl_endpoint => (
144             is => 'ro',
145             lazy => 1,
146 0     0     builder => sub { shift->lanl_base . '/cgi-bin/LOCATE/locate.cgi' },
147             );
148              
149             has _bogus_slug => (
150             is => 'ro',
151             default => sub { 'BOGUS_SEQ_SO_TABULAR_FILES_ARE_LINKED_IN_OUTPUT' },
152             );
153              
154             sub _request {
155 0     0     my $self = shift;
156 0           my $req = shift;
157 0           my $response = $self->agent->request($req);
158              
159 0 0         if (not $response->is_success) {
160 0           warn sprintf "Request failed: %s %s -> %s\n",
161             $req->method, $req->uri, $response->status_line;
162 0           return;
163             }
164              
165 0           return $response->decoded_content;
166             }
167              
168             =head2 find
169              
170             Takes an array ref of sequence strings. Sequences may be in amino acids or
171             nucleotides and mixed freely. Sequences should not be in FASTA format.
172              
173             If sequence bases are not clearly nucleotides or clearly amino acids, LANL
174             seems to default to nucleotides. This can be an issue for some sequences since
175             the full alphabet for nucleotides overlaps with the alphabet for amino acids.
176             To overcome this problem, you may specify C<< base => 'nucleotide' >>
177             or C<< base => 'amino acid' >> after the array ref of sequences. This forces
178             every sequence to be interpreted as nucleotides or amino acids, so you cannot
179             mix base types in your sequences if you use this option. C, C, and
180             C are accepted aliases for C. C, C, C,
181             and C are accepted aliases for C.
182              
183             Returns a list of hashrefs when called in list context, otherwise returns an
184             arrayref of hashrefs.
185              
186             See L for the structure of the data returned.
187              
188             =cut
189              
190             sub find {
191 0     0 1   my ($self, $sequences, %args) = @_;
192              
193 0 0         my $content = $self->submit_sequences($sequences, %args)
194             or return;
195              
196 0           return $self->parse_html($content);
197             }
198              
199             sub submit_sequences {
200 0     0 0   my ($self, $sequences, %args) = @_;
201              
202 0 0         if (defined $args{base}) {
203 0           my $base = lc $args{base};
204 0 0         if ($base =~ /^n(uc(leotides?)?)?$/i) {
    0          
205 0           $args{base} = 1;
206             } elsif ($base =~ /^(a(mino( acids?)?)?|aa)$/i) {
207 0           $args{base} = 0;
208             } else {
209 0           warn "Unknown base type <$args{base}>, ignoring";
210 0           delete $args{base};
211             }
212             }
213              
214             # Submit multiple sequences at once using FASTA
215             my $fasta = join "\n", map {
216 0           ("> sequence_$_", $sequences->[$_ - 1])
  0            
217             } 1 .. @$sequences;
218              
219             # LANL only presents the parseable table.txt we want if there's more
220             # than a single sequence. We always add it so we can reliably skip it.
221 0           $fasta .= "\n> " . $self->_bogus_slug . "\n";
222              
223             return $self->_request(
224             POST $self->lanl_endpoint,
225             Content_Type => 'form-data',
226             Content => [
227             organism => 'HIV',
228             DoReverseComplement => 0,
229             seq_input => $fasta,
230             (defined $args{base}
231             ? ( base => $args{base} )
232 0 0         : ()),
233             ],
234             );
235             }
236              
237             sub parse_html {
238 0     0 0   my ($self, $content) = @_;
239              
240             # Fetch and parse the two tables provided as links which removes the need
241             # to parse all of the HTML.
242 0           my @results = $self->parse_tsv($content);
243              
244             # Now parse the table data from the HTML
245 0           my @tables = $self->parse_tables($content);
246              
247             # Extract the alignments, parsing the HTML a third time!
248 0           my @alignments = $self->parse_alignments($content);
249              
250 0 0 0       unless (@results and @tables and @alignments) {
      0        
251 0           warn "Didn't find all three of TSV, tables, and alignments!\n";
252 0           warn "TSV: ", scalar @results, "\n";
253 0           warn "HTML tables: ", scalar @tables, "\n";
254 0           warn "HTML alignments: ", scalar @alignments, "\n";
255 0           warn "Content:\n$content\n", "=" x 80, "\n";
256 0           return;
257             }
258              
259 0 0 0       unless (@results == @tables and @results == @alignments) {
260 0           warn "Tab-delimited results count doesn't match parsed HTML result count. Bug!\n";
261 0           warn "TSV: ", scalar @results, "\n";
262 0           warn "HTML tables: ", scalar @tables, "\n";
263 0           warn "HTML alignments: ", scalar @alignments, "\n";
264 0           warn "Content:\n$content\n", "=" x 80, "\n";
265 0           return;
266             }
267              
268             @results = pairwise {
269             my $new = {
270             %$a,
271             base_type => $b->{base_type},
272             regions => $b->{rows},
273 0     0     region_names => [ map { $_->{cds} } @{$b->{rows}} ],
  0            
  0            
274             };
275 0           delete $new->{$_} for qw(protein protein_start protein_end);
276 0           $new;
277 0           } @results, @tables;
278              
279 0     0     @results = pairwise { +{ %$b, %$a } } @results, @alignments;
  0            
280              
281             # Fill in genome start/end for amino acid sequences
282 0           for my $r (@results) {
283 0 0         next unless $r->{base_type} eq 'amino acid';
284              
285 0 0 0       if ($r->{genome_start} or $r->{genome_end}) {
286 0           warn "Amino acid sequence with genome start/end already?!",
287             " query <$r->{query_sequence}>";
288 0           next;
289             }
290              
291 0           $r->{genome_start} = min map { $_->{na_from_hxb2_start}[0] } @{$r->{regions}};
  0            
  0            
292 0           $r->{genome_end} = max map { $_->{na_from_hxb2_start}[1] } @{$r->{regions}};
  0            
  0            
293             }
294              
295 0 0         return wantarray ? @results : \@results;
296             }
297              
298             sub parse_tsv {
299 0     0 0   my ($self, $content) = @_;
300 0           my @results;
301             my %urls;
302              
303             my $extract = HTML::LinkExtor->new(
304             sub {
305 0     0     my ($tag, %attr) = @_;
306 0 0 0       return unless $tag eq 'a' and $attr{href};
307 0 0         return unless $attr{href} =~ m{/(table|simple_results)\.txt$};
308 0           $urls{$1} = $attr{href};
309             },
310 0           $self->lanl_base,
311             );
312 0           $extract->parse($content);
313              
314 0           for my $table_name (qw(table simple_results)) {
315 0 0         next unless $urls{$table_name};
316 0 0         my $table = $self->_request(GET $urls{$table_name})
317             or next;
318              
319 0           my (@these_results, %seen);
320 0           my @lines = split "\n", $table;
321             my @fields = map {
322 0           s/^SeqName$/query/; # standard key
  0            
323 0           s/(?<=[a-z])(?=[A-Z])/_/g; # undo CamelCase
324 0           s/ +/_/g; # no spaces
325 0           y/A-Z/a-z/; # normalize to lowercase
326             # Account for the same field twice in the same data table
327 0 0         if ($seen{$_}++) {
328             $_ = /^(start|end)$/
329             ? "protein_$_"
330 0 0         : join "_", $_, $seen{$_};
331             }
332 0           $_;
333             } split "\t", shift @lines;
334              
335 0           for (@lines) {
336 0           my @values = split "\t";
337 0           my %data;
338 0           @data{@fields} = @values;
339              
340 0 0         next if $data{query} eq $self->_bogus_slug;
341              
342             $data{query_sequence} =~ s/\s+//g
343 0 0         if $data{query_sequence};
344 0           push @these_results, \%data;
345             }
346              
347             # Merge with existing results, if any
348             @results = @results
349 0 0   0     ? pairwise { +{ %$a, %$b } } @results, @these_results
  0            
350             : @these_results;
351             }
352              
353 0           return @results;
354             }
355              
356             sub parse_tables {
357 0     0 0   my ($self, $content) = @_;
358 0           my @tables;
359              
360 0           my %columns_for = (
361             'amino acid' => [
362             "CDS" => "cds",
363             "AA position relative to protein start in HXB2" => "aa_from_protein_start",
364             "AA position relative to query sequence start" => "aa_from_query_start",
365             "AA position relative to polyprotein start in HXB2" => "aa_from_polyprotein_start",
366             "NA position relative to CDS start in HXB2" => "aa_from_cds_start",
367             "NA position relative to HXB2 genome start" => "na_from_hxb2_start",
368             ],
369             'nucleotide' => [
370             "CDS" => "cds",
371             "Nucleotide position relative to CDS start in HXB2" => "na_from_cds_start",
372             "Nucleotide position relative to query sequence start" => "na_from_query_start",
373             "Nucleotide position relative to HXB2 genome start" => "na_from_hxb2_start",
374             "Amino Acid position relative to protein start in HXB2" => "aa_from_protein_start",
375             ],
376             );
377              
378 0           for my $base_type (sort keys %columns_for) {
379             my ($their_cols, $our_cols) = part {
380 0     0     state $i = 0;
381 0           $i++ % 2
382 0           } @{ $columns_for{$base_type} };
  0            
383              
384 0           my $extract = HTML::TableExtract->new( headers => $their_cols );
385 0           $extract->parse($content);
386              
387             # Examine all matching tables
388 0           for my $table ($extract->tables) {
389 0           my %table = (
390             coords => [$table->coords],
391             base_type => $base_type,
392             columns => $our_cols,
393             rows => [],
394             );
395 0           for my $row ($table->rows) {
396 0 0         @$row = map { defined $_ ? s/^\s+|\s*$//gr : $_ } @$row;
  0            
397              
398             # An empty row with only a sequence string in the first column.
399 0 0 0       if ( $row->[0]
      0        
400             and $row->[0] =~ /^[A-Za-z]+$/
401 0 0         and not grep { defined and length } @$row[1 .. scalar @$row - 1])
402             {
403 0           $table{rows}->[-1]{protein_translation} = $row->[0];
404 0           next;
405             }
406              
407             # Not all rows are data, some are informational sentences.
408 0 0         next if grep { not defined } @$row;
  0            
409              
410 0           my %row;
411             @row{@$our_cols} =
412 0 0 0       map { ($_ and $_ eq "NA") ? undef : $_ }
413 0 0 0       map { ($_ and /(\d+) → (\d+)/) ? [$1, $2] : $_ }
  0            
414             @$row;
415              
416 0           push @{$table{rows}}, \%row;
  0            
417             }
418             push @tables, \%table
419 0 0         if @{$table{rows}};
  0            
420             }
421             }
422              
423             # Sort by depth, then within each depth by count
424             @tables = sort {
425 0           $a->{coords}[0] <=> $b->{coords}[0]
426 0 0         or $a->{coords}[1] <=> $b->{coords}[1]
427             } @tables;
428              
429 0 0         if (@tables > 1) {
430 0 0 0       unless ( $tables[-1]->{rows}[0]{na_from_query_start} eq "1 →"
431             and $tables[-1]->{rows}[0]{protein_translation} eq "X") {
432 0           warn "Last table appears to be real!? It should be the bogus table of the bogus sequence.";
433 0           warn "Table is ", Dumper($tables[-1]), "\n";
434 0           return;
435             } else {
436 0           pop @tables;
437             }
438             }
439              
440 0           return @tables;
441             }
442              
443             sub parse_alignments {
444 0     0 0   my ($self, $content) = @_;
445 0           my @alignments;
446              
447 0           my $doc = HTML::TokeParser->new(
448             \$content,
449             unbroken_text => 1,
450             );
451              
452 0           my $expect_alignment = 0;
453              
454 0           while (my $tag = $doc->get_tag("b", "pre")) {
455 0           my $name = lc $tag->[0];
456 0           my $text = $doc->get_text;
457 0 0         next unless defined $text;
458              
459             #
s are preceeded by a bold header, which we use as an indicator 
460 0 0         if ($name eq 'b') {
    0          
461 0           $expect_alignment = $text =~ /Alignment\s+of\s+the\s+query\s+sequence\s+to\s+HXB2/i;
462             } elsif ($name eq 'pre') {
463 0 0 0       if ($text =~ /^\s*Query\b/m and $text =~ /^\s*HXB2\b/m) {
    0 0        
464 0           push @alignments, $text;
465 0 0         warn "Not expecting alignment, but found one‽"
466             unless $expect_alignment;
467             }
468             elsif ($text =~ /^\s+$/ and $expect_alignment) {
469 0           push @alignments, undef; # We appear to have found an unaligned sequence.
470             }
471 0           $expect_alignment = 0;
472             }
473             }
474              
475 0 0         if (defined $alignments[-1]) {
476 0           warn "Last alignment is non-null! It should be the empty alignment of the bogus sequence.";
477 0           warn "Alignment is <$alignments[-1]>\n";
478 0           return;
479             } else {
480 0           pop @alignments;
481             }
482              
483 0           my @results;
484 0           for (@alignments) {
485 0           my @hxb2;
486 0 0         if (defined) {
487 0           push @hxb2, $1 =~ s/\s+//gr
488             while /^\s*HXB2\b\s+(.+?)(?:\s+\d+|\s*)$/gm;
489             }
490 0 0         push @results, {
491             alignment => $_,
492             hxb2_sequence => @hxb2 ? join("", @hxb2) : undef,
493             };
494             }
495              
496 0           return @results;
497             }
498              
499             =head1 AUTHOR
500              
501             Thomas Sibley Etrsibley@uw.eduE
502              
503             =head1 COPYRIGHT
504              
505             Copyright 2014 by the Mullins Lab, Department of Microbiology, University of
506             Washington.
507              
508             =head1 LICENSE
509              
510             Licensed under the same terms as Perl 5 itself.
511              
512             =cut
513              
514             42;