File Coverage

blib/lib/Bio/MUST/Apps/FortyTwo/OrgProcessor.pm
Criterion Covered Total %
statement 66 351 18.8
branch 0 132 0.0
condition 0 64 0.0
subroutine 22 30 73.3
pod 0 1 0.0
total 88 578 15.2


line stmt bran cond sub pod time code
1             package Bio::MUST::Apps::FortyTwo::OrgProcessor;
2             # ABSTRACT: Internal class for forty-two tool
3             $Bio::MUST::Apps::FortyTwo::OrgProcessor::VERSION = '0.213470';
4 1     1   1045 use Moose;
  1         4  
  1         10  
5 1     1   7078 use namespace::autoclean;
  1         2  
  1         15  
6              
7 1     1   107 use autodie;
  1         2  
  1         11  
8 1     1   4857 use feature qw(say);
  1         3  
  1         115  
9              
10 1     1   8 use Smart::Comments -ENV;
  1         2  
  1         12  
11              
12 1     1   5636 use Carp;
  1         4  
  1         98  
13 1     1   6 use Const::Fast;
  1         2  
  1         13  
14 1     1   68 use List::AllUtils qw(each_array pairmap);
  1         2  
  1         47  
15 1     1   6 use Path::Class qw(file);
  1         2  
  1         37  
16 1     1   6 use Tie::IxHash;
  1         2  
  1         37  
17              
18 1     1   5 use Bio::MUST::Core;
  1         33  
  1         26  
19 1     1   6 use Bio::MUST::Core::Constants qw(:gaps :ncbi);
  1         2  
  1         204  
20 1     1   6 use Bio::MUST::Core::Utils qw(:filenames);
  1         2  
  1         85  
21 1     1   7 use Bio::MUST::Drivers;
  1         2  
  1         23  
22 1     1   5 use aliased 'Bio::MUST::Core::Ali';
  1         1  
  1         9  
23 1     1   249 use aliased 'Bio::MUST::Core::Ali::Temporary';
  1         2  
  1         4  
24 1     1   155 use aliased 'Bio::MUST::Core::IdList';
  1         18  
  1         3  
25 1     1   173 use aliased 'Bio::MUST::Core::Seq';
  1         2  
  1         3  
26 1     1   164 use aliased 'Bio::MUST::Core::SeqId';
  1         1  
  1         4  
27 1     1   159 use aliased 'Bio::MUST::Core::SeqMask';
  1         3  
  1         3  
28 1     1   184 use aliased 'Bio::MUST::Drivers::Cap3';
  1         3  
  1         4  
29 1     1   172 use aliased 'Bio::MUST::Apps::Debrief42::TaxReport::NewSeq';
  1         2  
  1         4  
30              
31             with 'Bio::MUST::Apps::Roles::OrgProcable';
32              
33              
34             # org
35              
36             # banks
37              
38             # code
39              
40              
41             has 'ali_proc' => (
42             is => 'ro',
43             isa => 'Bio::MUST::Apps::FortyTwo::AliProcessor',
44             required => 1,
45             );
46              
47              
48             has 'tax_filter' => (
49             is => 'ro',
50             isa => 'Maybe[ArrayRef[Str]]',
51             );
52              
53              
54             has '_tax_filter' => (
55             is => 'ro',
56             isa => 'Maybe[Bio::MUST::Core::Taxonomy::Filter]',
57             init_arg => undef,
58             writer => '_set_tax_filter',
59             handles => [ qw(is_allowed) ],
60             );
61              
62              
63             has $_ . '_seqs' => (
64             is => 'ro',
65             isa => 'Bio::MUST::Core::Ali::Temporary',
66             init_arg => undef,
67             lazy => 1,
68             builder => '_build_' . $_ . '_seqs',
69             ) for qw(homologous orthologous);
70              
71              
72             has 'aligned_seqs' => (
73             is => 'ro',
74             isa => 'Bio::MUST::Core::Ali',
75             init_arg => undef,
76             lazy => 1,
77             builder => '_build_aligned_seqs',
78             );
79              
80              
81             has '_' . $_ . '_para_scores' => (
82             traits => ['Hash'],
83             is => 'ro',
84             isa => 'HashRef[Num]',
85             init_arg => undef,
86             lazy => 1,
87             builder => '_build_' . $_ . '_scores',
88             handles => {
89             $_ . '_score_for' => 'get',
90             },
91             ) for qw(para tol);
92              
93              
94             has '_count_for' => (
95             is => 'ro',
96             isa => 'HashRef[HashRef[Num]]',
97             init_arg => undef,
98             default => sub { {} },
99             );
100              
101              
102             ## no critic (ProhibitUnusedPrivateSubroutines)
103              
104             sub _build_homologous_seqs {
105 0     0     my $self = shift;
106              
107             ##### [ORG] Searching for homologues...
108              
109 0           my $ap = $self->ali_proc;
110 0           my $rp = $ap->run_proc;
111              
112 0   0       my $args = $rp->blast_args_for('homologues') // {};
113 0           $args->{-outfmt} = 6;
114             $args->{-max_target_seqs} = 10000
115 0 0         unless defined $args->{-max_target_seqs};
116              
117 0           my @seqs;
118 0           for my $bank ($self->all_banks) {
119              
120             ###### [ORG] Processing BANK: $bank
121 0           my $query_seqs = $ap->query_seqs;
122 0           my $file = file( $rp->bank_dir, $bank );
123 0           my $blastdb = Bio::MUST::Drivers::Blast::Database->new( file => $file );
124 0 0 0       $args->{-db_gencode} = $self->code # if TBLASTN
125             if $query_seqs->type eq 'prot' && $blastdb->type eq 'nucl';
126 0           my $parser = $blastdb->blast($query_seqs, $args);
127             ###### [ORG] TBLASTN (or BLASTP/N): $bank . q{ } . $parser->filename
128              
129             # tie might help making 42 completely deterministic
130 0           tie my %mask_for, 'Tie::IxHash';
131 0           my %strand_for;
132              
133             # collect hit ids, masks and strands covered by queries
134 0           $self->_collect_hsps(
135             parser => $parser,
136             mask_for => \%mask_for,
137             strand_for => \%strand_for,
138             );
139 0 0         $parser->remove unless $rp->debug_mode;
140              
141             # fetch (and optionally trim) hits to their max range
142 0           my @hits = $self->_fetch_and_trim_hits(
143             blastdb => $blastdb,
144             mask_for => \%mask_for,
145             strand_for => \%strand_for,
146             );
147             ######## [DEBUG] hits: display( map { $_->full_id . ' => ' . $_->seq } @hits )
148              
149             # add seqs from current bank to homologous seqs
150 0           push @seqs, @hits;
151             }
152              
153             # build BLAST query file from homologous seqs
154 0           return Temporary->new( seqs => \@seqs );
155             }
156              
157              
158             sub _build_orthologous_seqs {
159 0     0     my $self = shift;
160              
161 0           my $ap = $self->ali_proc;
162 0           my $rp = $ap->run_proc;
163              
164 0           my $seqs;
165              
166 0 0         unless ($rp->ref_brh eq 'off') {
167             ##### [ORG] Identifying orthologues among homologues...
168              
169 0   0       my $args = $rp->blast_args_for('orthologues') // {};
170 0           $args->{-outfmt} = 6;
171 0           $args->{-max_target_seqs} = 1;
172              
173             # tie might help making 42 completely deterministic
174 0           tie my %count_for, 'Tie::IxHash';
175 0           for my $ref_org ($ap->all_ref_orgs) {
176              
177 0           my $homologous_seqs = $self->homologous_seqs;
178 0           my $blastdb = $rp->ref_blastdb_for($ref_org);
179 0 0 0       $args->{-query_gencode} = $self->code # if BLASTX
180             if $homologous_seqs->type eq 'nucl' && $blastdb->type eq 'prot';
181 0           my $parser = $blastdb->blast($homologous_seqs, $args);
182             ##### [ORG] BLASTX (or BLASTP/N): $ref_org . q{ } . $parser->filename
183              
184 0           my $best_hits = $ap->best_hits_for($ref_org);
185 0           while (my $hsp = $parser->next_query) {
186 0           my $candidate = $homologous_seqs->long_id_for( $hsp->query_id );
187 0           my $in_best_hits = $best_hits->is_listed( $hsp->hit_id );
188             ######## [DEBUG]: $candidate . ( $in_best_hits && ' [=BRH=]' )
189 0 0         $count_for{$candidate}++ if $in_best_hits;
190             }
191 0 0         $parser->remove unless $rp->debug_mode;
192             }
193              
194             ######## [DEBUG] BRH counts: display( pairmap { "$a => $b" } %count_for )
195              
196             # keep only homologues that are orthologous for all effective ref_orgs
197 0           my $ref_org_n = $ap->count_ref_orgs;
198             my $orthologues = IdList->new(
199 0           ids => [ grep { $count_for{$_} == $ref_org_n } keys %count_for ]
  0            
200             );
201 0           $seqs = $orthologues->filtered_ali( $self->homologous_seqs );
202             }
203              
204             else {
205             ##### [ORG] Skipping orthology assessment!
206 0           $seqs = $self->homologous_seqs->seqs;
207             }
208              
209             # optionally merge orthologues before aligning
210             # Note: this is always disabled in metagenomic mode
211 0 0         unless ($rp->run_mode eq 'metagenomic') { # TODO: warn user?
212 0 0         if ($rp->merge_orthologues eq 'on') {
213             ##### [ORG] pre-merge orthologues: display( map { $_->full_id } $seqs->all_seq_ids )
214 0           $self->_compress_seqs($seqs);
215             }
216             }
217              
218             # build BLAST query file from orthologous seqs
219 0           return Temporary->new( seqs => $seqs );
220             }
221              
222              
223             sub _build_para_scores {
224 0     0     my $self = shift;
225              
226 0           my $ap = $self->ali_proc;
227 0           my $rp = $ap->run_proc;
228              
229             # ignore if no .para file in use
230 0           my $blastdb = $ap->para_blastdb;
231 0 0         return unless $blastdb;
232              
233 0   0       my $args = $rp->blast_args_for('templates') // {};
234 0           $args->{-outfmt} = 6;
235 0           $args->{-max_target_seqs} = 1;
236              
237 0           my $orthologous_seqs = $self->orthologous_seqs;
238 0 0 0       $args->{-query_gencode} = $self->code # if BLASTX
239             if $orthologous_seqs->type eq 'nucl' && $blastdb->type eq 'prot';
240 0           my $parser = $blastdb->blast($orthologous_seqs, $args);
241             ##### [ORG] BLASTX (or BLASTP/N): 'PARA ' . $parser->filename
242              
243 0           my %para_score_for;
244 0           while (my $hsp = $parser->next_query) {
245 0           my $transcript_acc = $orthologous_seqs->long_id_for( $hsp->query_id );
246 0           my $hit_id = $blastdb->long_id_for( $hsp->hit_id );
247 0           my $score = $hsp->bit_score;
248             ######## [DEBUG] orthologue: $transcript_acc
249             ######## [DEBUG] PARA hit: $hit_id
250             ######## [DEBUG] score: $score
251 0           $para_score_for{$transcript_acc} = $score;
252             }
253 0 0         $parser->remove unless $rp->debug_mode;
254              
255 0           return \%para_score_for;
256             }
257              
258              
259             sub _build_tol_scores {
260 0     0     my $self = shift;
261              
262 0           my $ap = $self->ali_proc;
263 0           my $rp = $ap->run_proc;
264              
265             # ignore if no tol_check in use
266 0           my $blastdb = $rp->tol_blastdb;
267 0 0         return unless $blastdb;
268              
269 0   0       my $args = $rp->blast_args_for('templates') // {};
270 0           $args->{-outfmt} = 5;
271 0           $args->{-max_target_seqs} = 250;
272              
273 0           my $orthologous_seqs = $self->orthologous_seqs;
274 0 0 0       $args->{-query_gencode} = $self->code # if BLASTX
275             if $orthologous_seqs->type eq 'nucl' && $blastdb->type eq 'prot';
276 0           my $parser = $blastdb->blast($orthologous_seqs, $args);
277             ##### [ORG] XML BLASTX (or BLASTP/N): 'TOL ' . $parser->filename
278              
279             # abort if no hit
280 0           my $bo = $parser->blast_output;
281 0 0         return unless $bo;
282              
283 0           my %tol_score_for;
284              
285             ORTHOLOGUE:
286 0           for my $orthologue ($bo->all_iterations) {
287 0 0         next ORTHOLOGUE unless $orthologue->count_hits;
288             # Note: this should never happen...
289              
290 0           my $query_def = $orthologue->query_def;
291 0           my $transcript_acc = $orthologous_seqs->long_id_for($query_def);
292             ######## [DEBUG] orthologue: $transcript_acc
293              
294             TOL_HIT:
295 0           for my $hit ($orthologue->all_hits) {
296              
297             # check taxonomy of hit: skip SELF-like hits
298 0           my $hit_id = SeqId->new( full_id => $hit->id );
299             ######## [DEBUG] TOL hit: $hit_id->full_id
300 0 0         next TOL_HIT if $self->is_allowed($hit_id);
301              
302             # fetch score for first non-SELF-like hit
303 0           my $score = $hit->get_hsp(0)->bit_score;
304             ######## [DEBUG] score: $score
305 0           $tol_score_for{$transcript_acc} = $score;
306 0           next ORTHOLOGUE;
307             }
308             }
309 0 0         $parser->remove unless $rp->debug_mode;
310              
311 0           return \%tol_score_for;
312             }
313              
314              
315             sub _build_aligned_seqs { ## no critic (ProhibitExcessComplexity)
316 0     0     my $self = shift;
317              
318             ##### [ORG] Aligning orthologues...
319              
320 0           my $ap = $self->ali_proc;
321 0           my $rp = $ap->run_proc;
322              
323 0   0       my $args = $rp->blast_args_for('templates') // {};
324 0           $args->{-outfmt} = 5;
325 0           $args->{-max_target_seqs} = $rp->tax_max_hits * 2;
326              
327 0           my $orthologous_seqs = $self->orthologous_seqs;
328 0           my $blastdb = $ap->blastdb;
329 0 0 0       $args->{-query_gencode} = $self->code # if BLASTX
330             if $orthologous_seqs->type eq 'nucl' && $blastdb->type eq 'prot';
331 0           my $parser = $blastdb->blast($orthologous_seqs, $args);
332             ##### [ORG] XML BLASTX (or BLASTP/N): $parser->filename
333              
334 0           my $aligned_seqs = Ali->new();
335              
336             # abort if no hit
337 0           my $bo = $parser->blast_output;
338 0 0         return $aligned_seqs unless $bo;
339              
340             ORTHOLOGUE:
341 0           for my $orthologue ($bo->all_iterations) {
342 0 0         unless ($orthologue->count_hits) {
343             ###### [ORG] skipped orthologue due to lack of significant template
344 0           next ORTHOLOGUE;
345             } # TODO: investigate why this should happen at all...
346              
347             # fetch tax_line from orthologue
348             # Note: .para check is also done in this sub (hence the next below)
349 0           my $tax_line = $self->_fetch_tax_line_for_transcript($orthologue);
350 0 0         next ORTHOLOGUE unless $tax_line;
351              
352             # extract contam_org and transcript_id from tax_line...
353 0           my $contam_org = $tax_line->contam_org;
354 0           my $transcript_id = $tax_line->seq_id . '%s'; # chunk placeholder
355 0 0         $transcript_id .= "...$contam_org" if $contam_org;
356 0           $transcript_id .= '#NEW#';
357              
358             # ... and use it "as is" to store tax_line in tax_report (no chunk tag)
359 0           my $tax_line_id = sprintf $transcript_id, q{};
360 0           $ap->set_tax_line( $tax_line_id => $tax_line );
361              
362             # extract transcript_seq from tax_line
363 0           my $transcript_seq = Seq->new(
364             seq_id => $tax_line_id, # same id without alignment
365             seq => $tax_line->seq
366             );
367              
368             # skip both orthologue alignment and integration if metagenomic mode
369 0 0         next ORTHOLOGUE if $rp->run_mode eq 'metagenomic';
370              
371             # optionally add transcript_seq as is (without alignment)
372 0 0         if ($rp->aligner_mode eq 'off') {
373              
374             ####### [ORG] Adding: $transcript_seq->full_id
375              
376 0           $aligned_seqs->add_seq($transcript_seq);
377 0           next ORTHOLOGUE;
378             }
379              
380             # build a template_seq list as long as query coverage improves
381             # exonerate will try to use the longest template for alignment
382             # while BLAST will use each hit in turn (possibly as a fall-back)
383 0           my @templates;
384             my @template_seqs;
385 0           my $best_coverage = 0;
386              
387             TEMPLATE:
388 0           for my $template ($orthologue->all_hits) {
389              
390             # fetch template full_id
391 0           my $template_id = SeqId->new(
392             full_id => $blastdb->long_id_for($template->def)
393             );
394              
395             # optionally skip template if from same org as the orthologue
396 0 0         if ($rp->ali_skip_self eq 'on') {
397 0 0         if ($template_id->full_org eq $transcript_seq->full_org) {
398             ###### [ORG] skipped same-org template due to ali_skip_self
399 0           next TEMPLATE;
400             }
401             }
402              
403             # compute query coverage by template HSPs
404 0           my $mask = SeqMask->empty_mask( $orthologue->query_len );
405             $mask->mark_block($_->query_start, $_->query_end)
406 0           for $template->all_hsps;
407 0           my $coverage = $mask->coverage;
408              
409 0 0         last TEMPLATE if $coverage
410             < $best_coverage * $rp->ali_cover_mul;
411 0           $best_coverage = $coverage;
412              
413             ######## [DEBUG] template: $template_id->full_id
414             ######## [DEBUG] coverage: sprintf '%.2f', $coverage
415              
416             # fetch and cache aligned template seq from Ali
417             # Note: we emulate a fast get_seq_with_id using Ali lookup
418 0           push @templates, $template;
419 0           push @template_seqs, $ap->ali->get_seq(
420             $ap->lookup->index_for( $template_id->full_id )
421             );
422             }
423              
424             # Note: as we (optionally) skip templates, we may run out of templates
425 0 0         unless (@templates) {
426             ###### [ORG] skipped orthologue due to lack of suitable template
427 0           next ORTHOLOGUE;
428             }
429              
430             # flag tracking exonerate failure (for exoblast mode)
431 0           my $failure;
432              
433 0 0         if ($rp->aligner_mode =~ m/exonerate|exoblast/xms) {
434              
435             # use longest template for exonerate alignment
436 0           my $template_seq = $template_seqs[-1];
437              
438             # align transcript on template protein and get its translation
439             # Note: in exonerate the translated dna_seq is called 'target_seq'
440 0           my $exo = Bio::MUST::Drivers::Exonerate::Aligned->new(
441             dna_seq => $transcript_seq,
442             pep_seq => $template_seq->clone->degap,
443             code => $self->code,
444             );
445              
446             # get new_seq from exonerate report
447             # build new_id from transcript_id and exonerate model used
448 0           my $new_id = sprintf $transcript_id, '.E.' . $exo->model;
449 0           my $new_seq = $exo->target_seq;
450 0           $new_seq->set_seq_id($new_id);
451              
452             ####### [ORG] Adding: $new_seq->full_id
453              
454 0 0         unless ($new_seq->seq_len) {
455 0 0         my $fate = $rp->aligner_mode eq 'exoblast'
456             ? 'will retry with BLAST' : 'discarded'
457             ;
458             ####### [ORG] exonerate failure: $fate
459 0           $failure = 1;
460             }
461              
462             else {
463             # align new_seq on template_seq using subject_seq as a guide
464 0           $aligned_seqs->add_seq(
465             $ap->integrator->align(
466             new_seq => $new_seq,
467             subject => $exo->query_seq,
468             template => $template_seq,
469             start => $exo->query_start,
470             )
471             );
472              
473             ######## [DEBUG] aligned with exonerate model: $exo->model
474             }
475             }
476              
477 0 0 0       if ( ($rp->aligner_mode eq 'blast')
      0        
478             || ($rp->aligner_mode eq 'exoblast' && $failure) ) {
479              
480             TEMPLATE:
481 0           for my $template (@templates) {
482              
483             # use each template in turn for BLAST alignment
484 0           my $template_seq = shift @template_seqs;
485 0 0         last TEMPLATE unless $template_seq;
486              
487             HSP:
488 0           for my $hsp ($template->all_hsps) {
489              
490             # build HSP id from transcript_id and template/HSP ranks
491 0           my $hsp_id = sprintf $transcript_id,
492             '.H' . $template->num . '.' . $hsp->num;
493              
494             # build HSP seq from BLASTX report
495 0           (my $hsp_seq = $hsp->qseq) =~ s{\*}{$FRAMESHIFT}xmsg;
496 0           my $new_seq = Seq->new(
497             seq_id => $hsp_id,
498             seq => $hsp_seq
499             );
500              
501             # fetch aligned template seq from HSP (= subject)
502 0           my $subject_seq = Seq->new(
503             seq_id => $template_seq->seq_id,
504             seq => $hsp->hseq
505             );
506              
507             # reverse complement seqs if template on reverse in BLASTN
508 0 0 0       if ($ap->query_seqs->type eq 'nucl'
      0        
509             && $blastdb->type eq 'nucl'
510             && $hsp->hit_strand == -1) {
511 0           $new_seq = $new_seq->reverse_complemented_seq;
512 0           $subject_seq = $subject_seq->reverse_complemented_seq;
513             }
514              
515             ####### [ORG] Adding: $new_seq->full_id
516              
517             # align new_seq on template_seq using subject_seq as a guide
518             $aligned_seqs->add_seq(
519 0           $ap->integrator->align(
520             new_seq => $new_seq,
521             subject => $subject_seq,
522             template => $template_seq,
523             start => $hsp->hit_start,
524             )
525             );
526              
527             ######## [DEBUG] aligned with BLAST
528             }
529             }
530             }
531             }
532              
533 0 0         $parser->remove unless $rp->debug_mode;
534              
535 0           return $aligned_seqs;
536             }
537              
538             ## use critic
539              
540              
541             sub _compress_seqs {
542 0     0     my $self = shift;
543 0           my $ali = shift;
544              
545 0           my $rp = $self->ali_proc->run_proc;
546              
547             # first fetch all seqs...
548             # ... then clear existing seqs
549 0           my @seqs2cap = $ali->all_seqs;
550 0           $ali->_set_seqs( [] );
551              
552             # proceed only if at least one seq
553 0 0         return unless @seqs2cap;
554              
555             # Note: this might seem sub-optimal in case of singletons
556             # but we follow as closely as possible the logic of compress-db.pl
557              
558             # proceed only if at least two seqs
559             # otherwise add lone seq
560 0 0         if (@seqs2cap < 2) {
561 0           $ali->add_seq( shift @seqs2cap );
562 0           return;
563             }
564              
565             # TODO: add debugging comments?
566              
567             # try to cap seqs
568 0           my $cap = Cap3->new(
569             seqs => \@seqs2cap,
570             cap3_args => {
571             -p => $rp->merge_min_ident * 100.0, # CAP3 expects percents
572             -o => $rp->merge_min_len,
573             },
574             );
575              
576             # add singlet seqs
577 0           my @singlets = $cap->all_singlets;
578 0           $ali->add_seq($_) for @singlets;
579              
580             # proceed only if contigs of seqs
581 0           my @contigs = $cap->all_contigs;
582 0 0         return unless @contigs;
583              
584 0           for my $contig (@contigs) {
585 0           my @ids = map { $_->full_id }
586 0           @{ $cap->seq_ids_for( $contig->full_id ) };
  0            
587 0           my $contig_id = shift(@ids) . ':::+' . @ids;
588 0           my $contig_seq = $contig->seq;
589              
590             # add contig seq
591 0           $ali->add_seq(
592             Seq->new( seq_id => $contig_id, seq => $contig_seq )
593             );
594             }
595              
596             # Note: we do not need to return anything as we modified $ali in place
597 0           return;
598             }
599              
600              
601             sub _fetch_tax_line_for_transcript {
602 0     0     my $self = shift;
603 0           my $orthologue = shift;
604              
605 0           my $ap = $self->ali_proc;
606 0           my $rp = $ap->run_proc;
607              
608             # fetch transcript accession (= query id) from BLAST report
609 0           my $query_def = $orthologue->query_def;
610 0           my $transcript_acc = $self->orthologous_seqs->long_id_for($query_def);
611              
612             # extract range and strand (if any) from transcript accession...
613             # Note: this mostly makes sense for genomic contigs/scaffolds
614             # ... and extract potential tail '+N' (if orthologues have been merged)
615             # Note: this looks convoluted but it has to work in many different setups
616             # depending on the value of trim_homologues and merge_orthologues:
617             # - off/off: acc
618             # - off/on: acc:::+N
619             # - on /off: acc:::start:::end:::strand
620             # - on /on: acc:::start:::end:::strand:::+N
621 0           my @fields = split /:::/xms, $transcript_acc;
622 0           my ($strip_acc, $start, $end, $strand, $more) = @fields;
623 0 0 0       ($more, $start) = ($start, undef)
      0        
624             if !defined $end && defined $start && $start =~ m/\A \+\d+ \z/xms;
625              
626             # set acc to transcript accession
627 0           my ($first_chunk) = $strip_acc =~ m/^(\S+)/xms;
628 0           my @parts = split /\|/xms, $first_chunk;
629 0           my $acc = pop @parts;
630             ###### [ORG] orthologous transcript: $acc
631              
632             # fetch score for best template
633 0           my $temp_score = $orthologue->get_hit(0)->get_hsp(0)->bit_score;
634              
635             # check again for paralogy if .para file in use
636 0 0         if ($ap->para_blastdb) {
637 0   0       my $para_score = $self->para_score_for($transcript_acc) // 0;
638 0 0         if ($para_score > $temp_score) {
639             ###### [ORG] rejected for PARA[logy]: "$para_score > $temp_score"
640 0           return;
641             }
642             }
643              
644             # check for contamination if tol_check in use
645 0 0         if ($rp->tol_blastdb ) {
646 0   0       my $tol_score = $self->tol_score_for($transcript_acc) // 0;
647 0 0         if ($tol_score > $temp_score) {
648             ###### [ORG] rejected due to TOL check: "$tol_score > $temp_score"
649 0           return;
650             }
651             }
652              
653             # prebuild SeqId list for all templates
654             # this is needed for family affiliation (first only)
655             # ... and for assessing template taxonomy (all)
656 0           my $blastdb = $self->ali_proc->blastdb;
657 0           my @templates = $orthologue->all_hits;
658             my @template_ids = map {
659 0           SeqId->new( full_id => $blastdb->long_id_for( $_->def ) )
  0            
660             } @templates;
661              
662             # set family to first template family (if any)
663 0   0       my $family = $template_ids[0]->family // q{};
664 0 0         $family .= '-' if $family;
665              
666             # set org to current (bank) org
667 0           my $org = $self->org;
668 0           my $contam_org = q{};
669              
670             # setup tax_report data
671 0           my $common_tax;
672             my $lca;
673 0           my $lineage;
674 0           my $tag = q{};
675 0           my $top_score = 0;
676 0           my $rel_n = 0;
677 0           my $mean_len;
678             my $mean_ident;
679              
680             # optionally analyze template taxonomy
681             # ... to build tax_reports of additions
682             # ... to tag potential contaminations
683 0 0 0       if ( $rp->tax_reports eq 'on' || $self->_tax_filter ) {
684 0           my @relatives;
685 0           my $best_score = 0;
686 0           my $sum_len = 0;
687 0           my $sum_ident = 0;
688              
689 0           my $tax = $rp->tax;
690 0           my $ea = each_array @templates, @template_ids;
691              
692             # accumulate relatives for LCA inference
693             # Note: three logical modes can be distinguished.
694             # These should be implemented in the config file.
695             # 1. best-hit
696             # - tax_max_hits: 1
697             # - filter on tax_min_score or tax_min_len / tax_min_ident
698             # 2. classic LCA
699             # - tax_max_hits: to be specified
700             # - filter on tax_min_score or tax_min_len / tax_min_ident
701             # 3. MEGAN-like LCA
702             # - tax_max_hits: should be ignored
703             # - filter on tax_min_score / tax_score_mul
704             # however all criteria can be simultaneously activated
705              
706             REL:
707 0           while ( my ($template, $template_id) = $ea->() ) {
708              
709             # accumulate at most tax_max_hits relatives for LCA inference
710 0 0         last REL if @relatives >= $rp->tax_max_hits;
711              
712             # ensure that match to template is good enough...
713             # by default all these thresholds are disabled
714 0           my $hsp = $template->get_hsp(0);
715              
716             # ... MEGAN-like mode (and top_score needed for one-on-one)
717 0           my $score = $hsp->bit_score;
718 0           $top_score = List::AllUtils::max($top_score, $score);
719             # Note: we do not use last due to non-monotonic score decrease
720              
721             # no need to accumulate relatives if no NCBI Taxonomy is available
722 0 0         last REL unless $tax;
723              
724 0 0         next REL if $score < $rp->tax_min_score;
725 0 0         next REL if $score < $rp->tax_score_mul * $best_score;
726              
727             # ... classic or best-hit mode
728 0           my $len = $hsp->align_len;
729 0 0         next REL if $len < $rp->tax_min_len;
730 0           my $ident = $hsp->identity / $len;
731 0 0         next REL if $ident < $rp->tax_min_ident;
732              
733             # try to determine taxonomy of template
734             # if possible add template to list of relatives...
735             # ... and set best_score for MEGAN-like mode
736 0           my @taxonomy = $tax->fetch_lineage($template_id);
737 0 0         if (@taxonomy) {
738             # Note: contrary to ref_score_mul the best_score never changes
739 0   0       $best_score ||= $score;
740 0           $sum_len += $len;
741 0           $sum_ident += $ident;
742 0           push @relatives, \@taxonomy;
743             ######## [DEBUG] hit: $template_id->full_id
744             ######## [DEBUG] length: $len
745             ######## [DEBUG] identity: sprintf '%.2f', $ident
746             ######## [DEBUG] lineage: join '; ', @taxonomy
747             }
748             }
749              
750             ######## [DEBUG] relatives: display( map { join '; ', @$_ } @relatives )
751              
752 0 0         if (@relatives < $rp->tax_min_hits) {
753             ###### [ORG] tagged as unclassified due to lack of relatives
754 0           $common_tax = [ 'unclassified sequences' ];
755 0 0         if ($self->_tax_filter) {
756 0           $tag = 'u#';
757             }
758             }
759              
760             else {
761             # infer LCA from all relatives...
762 0           $common_tax = $tax->compute_lca(@relatives);
763 0           $lineage = join '; ', @{$common_tax};
  0            
764             ######## [DEBUG] LCA lineage: $lineage
765 0           $lca = $common_tax->[-1];
766             ###### [ORG] affiliated to: $lca
767              
768 0 0         if ($rp->tax_reports eq 'on') {
769 0           $rel_n = @relatives;
770 0           $mean_len = sprintf '%.2f', $sum_len / $rel_n;
771 0           $mean_ident = sprintf '%.2f', $sum_ident / $rel_n;
772             }
773              
774             # optionally tag potential contamination based on LCA
775             # Note: LCAs are used "as is" because is_allowed handles them
776             # Note: tol_check 'on' disables positive taxonomic filtering
777 0 0 0       if ($self->_tax_filter && $rp->tol_check eq 'off') {
778             ######## [DEBUG] pass tax_filter: $self->is_allowed($common_tax)
779 0 0         unless ( $self->is_allowed($common_tax) ) {
780             ###### [ORG] tagged as contaminated
781 0           $tag = 'c#';
782             $contam_org = join '_',
783 0   0       map { $_ // () } SeqId->parse_ncbi_name($lca);
  0            
784             } # delete undef species/strains
785             }
786             }
787             }
788              
789             # check for multiple orthologues derived from the same transcript
790             # Note: this mostly makes sense for genomic contigs/scaffolds
791 0           my $acc_u = $acc;
792 0           my $count = ++$self->_count_for->{$org}{$acc};
793 0 0         if ($count > 1) { # direct access to private attr
794 0           carp "[ORG] Note: more than one orthologue extracted from $acc;"
795             . " appending .$count to accession.";
796 0           $acc_u .= ".$count";
797             }
798              
799             # add potential accession tail '+N' (if orthologues have been merged)
800 0 0         $acc_u .= $more if $more;
801              
802             # fetch orthologous transcript and give it nearly complete full_id
803 0           my $transcript_seq
804             = $self->orthologous_seqs->get_seq_with_id($transcript_acc)->seq;
805 0 0 0       my $transcript_id
806             = $org =~ $PKEYONLY || $org =~ $GCAONLY ? $org . '|' . $acc_u
807             : $family . $tag . $org . '@' . $acc_u
808             ;
809              
810             # build tax_report line for transcript
811 0           my $tax_line = NewSeq->new(
812             seq_id => $transcript_id,
813             contam_org => $contam_org,
814             top_score => $top_score,
815             rel_n => $rel_n,
816             mean_len => $mean_len,
817             mean_ident => $mean_ident,
818             lca => $lca,
819             lineage => $lineage,
820             acc => $acc, # Note: not $acc_u here!
821             start => $start,
822             end => $end,
823             strand => $strand,
824             seq => $transcript_seq,
825             );
826              
827 0           return $tax_line;
828             }
829              
830              
831             sub BUILD {
832 0     0 0   my $self = shift;
833              
834 0           my $ap = $self->ali_proc;
835 0           my $rp = $ap->run_proc;
836              
837             # optionally build _tax_filter from tax_filter (specs) and tax attrs
838             # Note: cannot use builder/coercion because of the need for a tax object
839 0 0         if ($self->tax_filter) {
840 0           my $tax = $rp->tax;
841 0 0         $self->_set_tax_filter( $tax->tax_filter( $self->tax_filter ) )
842             if $tax; # warning already issued at this stage
843             }
844              
845             ##### [ORG] homologues: display( $self->homologous_seqs->all_long_ids )
846 0 0         return unless $self->homologous_seqs->all_long_ids;
847              
848             ##### [ORG] orthologues: display( $self->orthologous_seqs->all_long_ids )
849 0 0         return unless $self->orthologous_seqs->all_long_ids;
850              
851             ##### [ORG] aligned: display( map { $_->full_id } $self->aligned_seqs->all_seq_ids )
852 0 0         return unless $self->aligned_seqs->all_seq_ids;
853              
854             ##### [ORG] Adding aligned seqs to file...
855 0           $ap->ali->add_seq( $self->aligned_seqs->all_seqs );
856              
857 0           return;
858             }
859              
860              
861             __PACKAGE__->meta->make_immutable;
862             1;
863              
864             __END__
865              
866             =pod
867              
868             =head1 NAME
869              
870             Bio::MUST::Apps::FortyTwo::OrgProcessor - Internal class for forty-two tool
871              
872             =head1 VERSION
873              
874             version 0.213470
875              
876             =head1 AUTHOR
877              
878             Denis BAURAIN <denis.baurain@uliege.be>
879              
880             =head1 COPYRIGHT AND LICENSE
881              
882             This software is copyright (c) 2013 by University of Liege / Unit of Eukaryotic Phylogenomics / Denis BAURAIN.
883              
884             This is free software; you can redistribute it and/or modify it under
885             the same terms as the Perl 5 programming language system itself.
886              
887             =cut