File Coverage

blib/lib/Bio/MUST/Apps/FortyTwo/OrgProcessor.pm
Criterion Covered Total %
statement 66 350 18.8
branch 0 132 0.0
condition 0 64 0.0
subroutine 22 30 73.3
pod 0 1 0.0
total 88 577 15.2


line stmt bran cond sub pod time code
1             package Bio::MUST::Apps::FortyTwo::OrgProcessor;
2             # ABSTRACT: Internal class for forty-two tool
3             $Bio::MUST::Apps::FortyTwo::OrgProcessor::VERSION = '0.210570';
4 1     1   762 use Moose;
  1         3  
  1         7  
5 1     1   6871 use namespace::autoclean;
  1         3  
  1         7  
6              
7 1     1   93 use autodie;
  1         3  
  1         8  
8 1     1   5536 use feature qw(say);
  1         3  
  1         86  
9              
10 1     1   8 use Smart::Comments -ENV;
  1         3  
  1         8  
11              
12 1     1   6316 use Carp;
  1         3  
  1         64  
13 1     1   6 use Const::Fast;
  1         2  
  1         9  
14 1     1   64 use List::AllUtils qw(each_array pairmap);
  1         3  
  1         50  
15 1     1   7 use Path::Class qw(file);
  1         2  
  1         52  
16 1     1   7 use Tie::IxHash;
  1         2  
  1         29  
17              
18 1     1   5 use Bio::MUST::Core;
  1         23  
  1         26  
19 1     1   7 use Bio::MUST::Core::Constants qw(:gaps :ncbi);
  1         2  
  1         191  
20 1     1   7 use Bio::MUST::Core::Utils qw(:filenames);
  1         2  
  1         93  
21 1     1   7 use Bio::MUST::Drivers;
  1         2  
  1         24  
22 1     1   6 use aliased 'Bio::MUST::Core::Ali';
  1         2  
  1         6  
23 1     1   194 use aliased 'Bio::MUST::Core::Ali::Temporary';
  1         2  
  1         5  
24 1     1   215 use aliased 'Bio::MUST::Core::IdList';
  1         17  
  1         4  
25 1     1   201 use aliased 'Bio::MUST::Core::Seq';
  1         2  
  1         5  
26 1     1   188 use aliased 'Bio::MUST::Core::SeqId';
  1         2  
  1         4  
27 1     1   189 use aliased 'Bio::MUST::Core::SeqMask';
  1         2  
  1         4  
28 1     1   198 use aliased 'Bio::MUST::Drivers::Cap3';
  1         2  
  1         5  
29 1     1   207 use aliased 'Bio::MUST::Apps::Debrief42::TaxReport::NewSeq';
  1         2  
  1         3  
30              
31             with 'Bio::MUST::Apps::Roles::OrgProcable';
32              
33              
34             # org
35              
36             # banks
37              
38             # code
39              
40              
41             has 'ali_proc' => (
42             is => 'ro',
43             isa => 'Bio::MUST::Apps::FortyTwo::AliProcessor',
44             required => 1,
45             );
46              
47              
48             has 'tax_filter' => (
49             is => 'ro',
50             isa => 'Maybe[ArrayRef[Str]]',
51             );
52              
53              
54             has '_tax_filter' => (
55             is => 'ro',
56             isa => 'Maybe[Bio::MUST::Core::Taxonomy::Filter]',
57             init_arg => undef,
58             writer => '_set_tax_filter',
59             handles => [ qw(is_allowed) ],
60             );
61              
62              
63             has $_ . '_seqs' => (
64             is => 'ro',
65             isa => 'Bio::MUST::Core::Ali::Temporary',
66             init_arg => undef,
67             lazy => 1,
68             builder => '_build_' . $_ . '_seqs',
69             ) for qw(homologous orthologous);
70              
71              
72             has 'aligned_seqs' => (
73             is => 'ro',
74             isa => 'Bio::MUST::Core::Ali',
75             init_arg => undef,
76             lazy => 1,
77             builder => '_build_aligned_seqs',
78             );
79              
80              
81             has '_' . $_ . '_para_scores' => (
82             traits => ['Hash'],
83             is => 'ro',
84             isa => 'HashRef[Num]',
85             init_arg => undef,
86             lazy => 1,
87             builder => '_build_' . $_ . '_scores',
88             handles => {
89             $_ . '_score_for' => 'get',
90             },
91             ) for qw(para tol);
92              
93              
94             has '_count_for' => (
95             is => 'ro',
96             isa => 'HashRef[HashRef[Num]]',
97             init_arg => undef,
98             default => sub { {} },
99             );
100              
101              
102             ## no critic (ProhibitUnusedPrivateSubroutines)
103              
104             sub _build_homologous_seqs {
105 0     0     my $self = shift;
106              
107             ##### [ORG] Searching for homologues...
108              
109 0           my $ap = $self->ali_proc;
110 0           my $rp = $ap->run_proc;
111              
112 0   0       my $args = $rp->blast_args_for('homologues') // {};
113 0           $args->{-outfmt} = 6;
114             $args->{-max_target_seqs} = 10000
115 0 0         unless defined $args->{-max_target_seqs};
116              
117 0           my @seqs;
118 0           for my $bank ($self->all_banks) {
119              
120             ###### [ORG] Processing BANK: $bank
121 0           my $query_seqs = $ap->query_seqs;
122 0           my $file = file( $rp->bank_dir, $bank );
123 0           my $blastdb = Bio::MUST::Drivers::Blast::Database->new( file => $file );
124 0 0 0       $args->{-db_gencode} = $self->code # if TBLASTN
125             if $query_seqs->type eq 'prot' && $blastdb->type eq 'nucl';
126 0           my $parser = $blastdb->blast($query_seqs, $args);
127             ###### [ORG] TBLASTN (or BLASTP/N): $bank . q{ } . $parser->filename
128              
129             # tie might help making 42 completely deterministic
130 0           tie my %mask_for, 'Tie::IxHash';
131 0           my %strand_for;
132              
133             # collect hit ids, masks and strands covered by queries
134 0           $self->_collect_hsps(
135             parser => $parser,
136             mask_for => \%mask_for,
137             strand_for => \%strand_for,
138             );
139 0 0         $parser->remove unless $rp->debug_mode;
140              
141             # fetch (and optionally trim) hits to their max range
142 0           my @hits = $self->_fetch_and_trim_hits(
143             blastdb => $blastdb,
144             mask_for => \%mask_for,
145             strand_for => \%strand_for,
146             );
147             ######## [DEBUG] hits: display( map { $_->full_id . ' => ' . $_->seq } @hits )
148              
149             # add seqs from current bank to homologous seqs
150 0           push @seqs, @hits;
151             }
152              
153             # build BLAST query file from homologous seqs
154 0           return Temporary->new( seqs => \@seqs );
155             }
156              
157              
158             sub _build_orthologous_seqs {
159 0     0     my $self = shift;
160              
161 0           my $ap = $self->ali_proc;
162 0           my $rp = $ap->run_proc;
163              
164 0 0         if ($rp->ref_brh eq 'off') {
165             ##### [ORG] Skipping orthology assessment!
166 0           return $self->homologous_seqs;
167             }
168              
169             ##### [ORG] Identifying orthologues among homologues...
170              
171 0   0       my $args = $rp->blast_args_for('orthologues') // {};
172 0           $args->{-outfmt} = 6;
173 0           $args->{-max_target_seqs} = 1;
174              
175             # tie might help making 42 completely deterministic
176 0           tie my %count_for, 'Tie::IxHash';
177 0           for my $ref_org ($ap->all_ref_orgs) {
178              
179 0           my $homologous_seqs = $self->homologous_seqs;
180 0           my $blastdb = $rp->ref_blastdb_for($ref_org);
181 0 0 0       $args->{-query_gencode} = $self->code # if BLASTX
182             if $homologous_seqs->type eq 'nucl' && $blastdb->type eq 'prot';
183 0           my $parser = $blastdb->blast($homologous_seqs, $args);
184             ##### [ORG] BLASTX (or BLASTP/N): $ref_org . q{ } . $parser->filename
185              
186 0           my $best_hits = $ap->best_hits_for($ref_org);
187 0           while (my $hsp = $parser->next_query) {
188 0           my $candidate = $homologous_seqs->long_id_for( $hsp->query_id );
189 0           my $in_best_hits = $best_hits->is_listed( $hsp->hit_id );
190             ######## [DEBUG]: $candidate . ( $in_best_hits && ' [=BRH=]' )
191 0 0         $count_for{$candidate}++ if $in_best_hits;
192             }
193 0 0         $parser->remove unless $rp->debug_mode;
194             }
195              
196             ######## [DEBUG] BRH counts: display( pairmap { "$a => $b" } %count_for )
197              
198             # keep only homologues that are orthologous for all effective ref_orgs
199 0           my $ref_org_n = $ap->count_ref_orgs;
200             my $orthologues = IdList->new(
201 0           ids => [ grep { $count_for{$_} == $ref_org_n } keys %count_for ]
  0            
202             );
203 0           my $seqs = $orthologues->filtered_ali( $self->homologous_seqs );
204              
205             # optionally merge orthologues before aligning
206             # Note: this is always disabled in metagenomic mode
207 0 0         unless ($rp->run_mode eq 'metagenomic') { # TODO: warn user?
208 0 0         if ($rp->merge_orthologues eq 'on') {
209             ##### [ORG] pre-merge orthologues: display( map { $_->full_id } $seqs->all_seq_ids )
210 0           $self->_compress_seqs($seqs);
211             }
212             }
213              
214             # build BLAST query file from orthologous seqs
215 0           return Temporary->new( seqs => $seqs );
216             }
217              
218              
219             sub _build_para_scores {
220 0     0     my $self = shift;
221              
222 0           my $ap = $self->ali_proc;
223 0           my $rp = $ap->run_proc;
224              
225             # ignore if no .para file in use
226 0           my $blastdb = $ap->para_blastdb;
227 0 0         return unless $blastdb;
228              
229 0   0       my $args = $rp->blast_args_for('templates') // {};
230 0           $args->{-outfmt} = 6;
231 0           $args->{-max_target_seqs} = 1;
232              
233 0           my $orthologous_seqs = $self->orthologous_seqs;
234 0 0 0       $args->{-query_gencode} = $self->code # if BLASTX
235             if $orthologous_seqs->type eq 'nucl' && $blastdb->type eq 'prot';
236 0           my $parser = $blastdb->blast($orthologous_seqs, $args);
237             ##### [ORG] BLASTX (or BLASTP/N): 'PARA ' . $parser->filename
238              
239 0           my %para_score_for;
240 0           while (my $hsp = $parser->next_query) {
241 0           my $transcript_acc = $orthologous_seqs->long_id_for( $hsp->query_id );
242 0           my $hit_id = $blastdb->long_id_for( $hsp->hit_id );
243 0           my $score = $hsp->bit_score;
244             ######## [DEBUG] orthologue: $transcript_acc
245             ######## [DEBUG] PARA hit: $hit_id
246             ######## [DEBUG] score: $score
247 0           $para_score_for{$transcript_acc} = $score;
248             }
249 0 0         $parser->remove unless $rp->debug_mode;
250              
251 0           return \%para_score_for;
252             }
253              
254              
255             sub _build_tol_scores {
256 0     0     my $self = shift;
257              
258 0           my $ap = $self->ali_proc;
259 0           my $rp = $ap->run_proc;
260              
261             # ignore if no tol_check in use
262 0           my $blastdb = $rp->tol_blastdb;
263 0 0         return unless $blastdb;
264              
265 0   0       my $args = $rp->blast_args_for('templates') // {};
266 0           $args->{-outfmt} = 5;
267 0           $args->{-max_target_seqs} = 250;
268              
269 0           my $orthologous_seqs = $self->orthologous_seqs;
270 0 0 0       $args->{-query_gencode} = $self->code # if BLASTX
271             if $orthologous_seqs->type eq 'nucl' && $blastdb->type eq 'prot';
272 0           my $parser = $blastdb->blast($orthologous_seqs, $args);
273             ##### [ORG] XML BLASTX (or BLASTP/N): 'TOL ' . $parser->filename
274              
275             # abort if no hit
276 0           my $bo = $parser->blast_output;
277 0 0         return unless $bo;
278              
279 0           my %tol_score_for;
280              
281             ORTHOLOGUE:
282 0           for my $orthologue ($bo->all_iterations) {
283 0 0         next ORTHOLOGUE unless $orthologue->count_hits;
284             # Note: this should never happen...
285              
286 0           my $query_def = $orthologue->query_def;
287 0           my $transcript_acc = $orthologous_seqs->long_id_for($query_def);
288             ######## [DEBUG] orthologue: $transcript_acc
289              
290             TOL_HIT:
291 0           for my $hit ($orthologue->all_hits) {
292              
293             # check taxonomy of hit: skip SELF-like hits
294 0           my $hit_id = SeqId->new( full_id => $hit->id );
295             ######## [DEBUG] TOL hit: $hit_id->full_id
296 0 0         next TOL_HIT if $self->is_allowed($hit_id);
297              
298             # fetch score for first non-SELF-like hit
299 0           my $score = $hit->get_hsp(0)->bit_score;
300             ######## [DEBUG] score: $score
301 0           $tol_score_for{$transcript_acc} = $score;
302 0           next ORTHOLOGUE;
303             }
304             }
305 0 0         $parser->remove unless $rp->debug_mode;
306              
307 0           return \%tol_score_for;
308             }
309              
310              
311             sub _build_aligned_seqs { ## no critic (ProhibitExcessComplexity)
312 0     0     my $self = shift;
313              
314             ##### [ORG] Aligning orthologues...
315              
316 0           my $ap = $self->ali_proc;
317 0           my $rp = $ap->run_proc;
318              
319 0   0       my $args = $rp->blast_args_for('templates') // {};
320 0           $args->{-outfmt} = 5;
321 0           $args->{-max_target_seqs} = $rp->tax_max_hits * 2;
322              
323 0           my $orthologous_seqs = $self->orthologous_seqs;
324 0           my $blastdb = $ap->blastdb;
325 0 0 0       $args->{-query_gencode} = $self->code # if BLASTX
326             if $orthologous_seqs->type eq 'nucl' && $blastdb->type eq 'prot';
327 0           my $parser = $blastdb->blast($orthologous_seqs, $args);
328             ##### [ORG] XML BLASTX (or BLASTP/N): $parser->filename
329              
330 0           my $aligned_seqs = Ali->new();
331              
332             # abort if no hit
333 0           my $bo = $parser->blast_output;
334 0 0         return $aligned_seqs unless $bo;
335              
336             ORTHOLOGUE:
337 0           for my $orthologue ($bo->all_iterations) {
338 0 0         unless ($orthologue->count_hits) {
339             ###### [ORG] skipped orthologue due to lack of significant template
340 0           next ORTHOLOGUE;
341             } # TODO: investigate why this should happen at all...
342              
343             # fetch tax_line from orthologue
344             # Note: .para check is also done in this sub (hence the next below)
345 0           my $tax_line = $self->_fetch_tax_line_for_transcript($orthologue);
346 0 0         next ORTHOLOGUE unless $tax_line;
347              
348             # extract contam_org and transcript_id from tax_line...
349 0           my $contam_org = $tax_line->contam_org;
350 0           my $transcript_id = $tax_line->seq_id . '%s'; # chunk placeholder
351 0 0         $transcript_id .= "...$contam_org" if $contam_org;
352 0           $transcript_id .= '#NEW#';
353              
354             # ... and use it "as is" to store tax_line in tax_report (no chunk tag)
355 0           my $tax_line_id = sprintf $transcript_id, q{};
356 0           $ap->set_tax_line( $tax_line_id => $tax_line );
357              
358             # extract transcript_seq from tax_line
359 0           my $transcript_seq = Seq->new(
360             seq_id => $tax_line_id, # same id without alignment
361             seq => $tax_line->seq
362             );
363              
364             # skip both orthologue alignment and integration if metagenomic mode
365 0 0         next ORTHOLOGUE if $rp->run_mode eq 'metagenomic';
366              
367             # optionally add transcript_seq as is (without alignment)
368 0 0         if ($rp->aligner_mode eq 'off') {
369              
370             ####### [ORG] Adding: $transcript_seq->full_id
371              
372 0           $aligned_seqs->add_seq($transcript_seq);
373 0           next ORTHOLOGUE;
374             }
375              
376             # build a template_seq list as long as query coverage improves
377             # exonerate will try to use the longest template for alignment
378             # while BLAST will use each hit in turn (possibly as a fall-back)
379 0           my @templates;
380             my @template_seqs;
381 0           my $best_coverage = 0;
382              
383             TEMPLATE:
384 0           for my $template ($orthologue->all_hits) {
385              
386             # fetch template full_id
387 0           my $template_id = SeqId->new(
388             full_id => $blastdb->long_id_for($template->def)
389             );
390              
391             # optionally skip template if from same org as the orthologue
392 0 0         if ($rp->ali_skip_self eq 'on') {
393 0 0         if ($template_id->full_org eq $transcript_seq->full_org) {
394             ###### [ORG] skipped same-org template due to ali_skip_self
395 0           next TEMPLATE;
396             }
397             }
398              
399             # compute query coverage by template HSPs
400 0           my $mask = SeqMask->empty_mask( $orthologue->query_len );
401             $mask->mark_block($_->query_start, $_->query_end)
402 0           for $template->all_hsps;
403 0           my $coverage = $mask->coverage;
404              
405 0 0         last TEMPLATE if $coverage
406             < $best_coverage * $rp->ali_cover_mul;
407 0           $best_coverage = $coverage;
408              
409             ######## [DEBUG] template: $template_id->full_id
410             ######## [DEBUG] coverage: sprintf '%.2f', $coverage
411              
412             # fetch and cache aligned template seq from Ali
413             # Note: we emulate a fast get_seq_with_id using Ali lookup
414 0           push @templates, $template;
415 0           push @template_seqs, $ap->ali->get_seq(
416             $ap->lookup->index_for( $template_id->full_id )
417             );
418             }
419              
420             # Note: as we (optionally) skip templates, we may run out of templates
421 0 0         unless (@templates) {
422             ###### [ORG] skipped orthologue due to lack of suitable template
423 0           next ORTHOLOGUE;
424             }
425              
426             # flag tracking exonerate failure (for exoblast mode)
427 0           my $failure;
428              
429 0 0         if ($rp->aligner_mode =~ m/exonerate|exoblast/xms) {
430              
431             # use longest template for exonerate alignment
432 0           my $template_seq = $template_seqs[-1];
433              
434             # align transcript on template protein and get its translation
435             # Note: in exonerate the translated dna_seq is called 'target_seq'
436 0           my $exo = Bio::MUST::Drivers::Exonerate::Aligned->new(
437             dna_seq => $transcript_seq,
438             pep_seq => $template_seq->clone->degap,
439             code => $self->code,
440             );
441              
442             # get new_seq from exonerate report
443             # build new_id from transcript_id and exonerate model used
444 0           my $new_id = sprintf $transcript_id, '.E.' . $exo->model;
445 0           my $new_seq = $exo->target_seq;
446 0           $new_seq->set_seq_id($new_id);
447              
448             ####### [ORG] Adding: $new_seq->full_id
449              
450 0 0         unless ($new_seq->seq_len) {
451 0 0         my $fate = $rp->aligner_mode eq 'exoblast'
452             ? 'will retry with BLAST' : 'discarded'
453             ;
454             ####### [ORG] exonerate failure: $fate
455 0           $failure = 1;
456             }
457              
458             else {
459             # align new_seq on template_seq using subject_seq as a guide
460 0           $aligned_seqs->add_seq(
461             $ap->integrator->align(
462             new_seq => $new_seq,
463             subject => $exo->query_seq,
464             template => $template_seq,
465             start => $exo->query_start,
466             )
467             );
468              
469             ######## [DEBUG] aligned with exonerate model: $exo->model
470             }
471             }
472              
473 0 0 0       if ( ($rp->aligner_mode eq 'blast')
      0        
474             || ($rp->aligner_mode eq 'exoblast' && $failure) ) {
475              
476             TEMPLATE:
477 0           for my $template (@templates) {
478              
479             # use each template in turn for BLAST alignment
480 0           my $template_seq = shift @template_seqs;
481 0 0         last TEMPLATE unless $template_seq;
482              
483             HSP:
484 0           for my $hsp ($template->all_hsps) {
485              
486             # build HSP id from transcript_id and template/HSP ranks
487 0           my $hsp_id = sprintf $transcript_id,
488             '.H' . $template->num . '.' . $hsp->num;
489              
490             # build HSP seq from BLASTX report
491 0           (my $hsp_seq = $hsp->qseq) =~ s{\*}{$FRAMESHIFT}xmsg;
492 0           my $new_seq = Seq->new(
493             seq_id => $hsp_id,
494             seq => $hsp_seq
495             );
496              
497             # fetch aligned template seq from HSP (= subject)
498 0           my $subject_seq = Seq->new(
499             seq_id => $template_seq->seq_id,
500             seq => $hsp->hseq
501             );
502              
503             # reverse complement seqs if template on reverse in BLASTN
504 0 0 0       if ($ap->query_seqs->type eq 'nucl'
      0        
505             && $blastdb->type eq 'nucl'
506             && $hsp->hit_strand == -1) {
507 0           $new_seq = $new_seq->reverse_complemented_seq;
508 0           $subject_seq = $subject_seq->reverse_complemented_seq;
509             }
510              
511             ####### [ORG] Adding: $new_seq->full_id
512              
513             # align new_seq on template_seq using subject_seq as a guide
514             $aligned_seqs->add_seq(
515 0           $ap->integrator->align(
516             new_seq => $new_seq,
517             subject => $subject_seq,
518             template => $template_seq,
519             start => $hsp->hit_start,
520             )
521             );
522              
523             ######## [DEBUG] aligned with BLAST
524             }
525             }
526             }
527             }
528              
529 0 0         $parser->remove unless $rp->debug_mode;
530              
531 0           return $aligned_seqs;
532             }
533              
534             ## use critic
535              
536              
537             sub _compress_seqs {
538 0     0     my $self = shift;
539 0           my $ali = shift;
540              
541 0           my $rp = $self->ali_proc->run_proc;
542              
543             # first fetch all seqs...
544             # ... then clear existing seqs
545 0           my @seqs2cap = $ali->all_seqs;
546 0           $ali->_set_seqs( [] );
547              
548             # proceed only if at least one seq
549 0 0         return unless @seqs2cap;
550              
551             # Note: this might seem sub-optimal in case of singletons
552             # but we follow as closely as possible the logic of compress-db.pl
553              
554             # proceed only if at least two seqs
555             # otherwise add lone seq
556 0 0         if (@seqs2cap < 2) {
557 0           $ali->add_seq( shift @seqs2cap );
558 0           return;
559             }
560              
561             # TODO: add debugging comments?
562              
563             # try to cap seqs
564 0           my $cap = Cap3->new(
565             seqs => \@seqs2cap,
566             cap3_args => {
567             -p => $rp->merge_min_ident * 100.0, # CAP3 expects percents
568             -o => $rp->merge_min_len,
569             },
570             );
571              
572             # add singlet seqs
573 0           my @singlets = $cap->all_singlets;
574 0           $ali->add_seq($_) for @singlets;
575              
576             # proceed only if contigs of seqs
577 0           my @contigs = $cap->all_contigs;
578 0 0         return unless @contigs;
579              
580 0           for my $contig (@contigs) {
581 0           my @ids = map { $_->full_id }
582 0           @{ $cap->seq_ids_for( $contig->full_id ) };
  0            
583 0           my $contig_id = shift(@ids) . ':::+' . @ids;
584 0           my $contig_seq = $contig->seq;
585              
586             # add contig seq
587 0           $ali->add_seq(
588             Seq->new( seq_id => $contig_id, seq => $contig_seq )
589             );
590             }
591              
592             # Note: we do not need to return anything as we modified $ali in place
593 0           return;
594             }
595              
596              
597             sub _fetch_tax_line_for_transcript {
598 0     0     my $self = shift;
599 0           my $orthologue = shift;
600              
601 0           my $ap = $self->ali_proc;
602 0           my $rp = $ap->run_proc;
603              
604             # fetch transcript accession (= query id) from BLAST report
605 0           my $query_def = $orthologue->query_def;
606 0           my $transcript_acc = $self->orthologous_seqs->long_id_for($query_def);
607              
608             # extract range and strand (if any) from transcript accession...
609             # Note: this mostly makes sense for genomic contigs/scaffolds
610             # ... and extract potential tail '+N' (if orthologues have been merged)
611             # Note: this looks convoluted but it has to work in many different setups
612             # depending on the value of trim_homologues and merge_orthologues:
613             # - off/off: acc
614             # - off/on: acc:::+N
615             # - on /off: acc:::start:::end:::strand
616             # - on /on: acc:::start:::end:::strand:::+N
617 0           my @fields = split /:::/xms, $transcript_acc;
618 0           my ($strip_acc, $start, $end, $strand, $more) = @fields;
619 0 0 0       ($more, $start) = ($start, undef)
      0        
620             if !defined $end && defined $start && $start =~ m/\A \+\d+ \z/xms;
621              
622             # set acc to transcript accession
623 0           my ($first_chunk) = $strip_acc =~ m/^(\S+)/xms;
624 0           my @parts = split /\|/xms, $first_chunk;
625 0           my $acc = pop @parts;
626             ###### [ORG] orthologous transcript: $acc
627              
628             # fetch score for best template
629 0           my $temp_score = $orthologue->get_hit(0)->get_hsp(0)->bit_score;
630              
631             # check again for paralogy if .para file in use
632 0 0         if ($ap->para_blastdb) {
633 0   0       my $para_score = $self->para_score_for($transcript_acc) // 0;
634 0 0         if ($para_score > $temp_score) {
635             ###### [ORG] rejected for PARA[logy]: "$para_score > $temp_score"
636 0           return;
637             }
638             }
639              
640             # check for contamination if tol_check in use
641 0 0         if ($rp->tol_blastdb ) {
642 0   0       my $tol_score = $self->tol_score_for($transcript_acc) // 0;
643 0 0         if ($tol_score > $temp_score) {
644             ###### [ORG] rejected due to TOL check: "$tol_score > $temp_score"
645 0           return;
646             }
647             }
648              
649             # prebuild SeqId list for all templates
650             # this is needed for family affiliation (first only)
651             # ... and for assessing template taxonomy (all)
652 0           my $blastdb = $self->ali_proc->blastdb;
653 0           my @templates = $orthologue->all_hits;
654             my @template_ids = map {
655 0           SeqId->new( full_id => $blastdb->long_id_for( $_->def ) )
  0            
656             } @templates;
657              
658             # set family to first template family (if any)
659 0   0       my $family = $template_ids[0]->family // q{};
660 0 0         $family .= '-' if $family;
661              
662             # set org to current (bank) org
663 0           my $org = $self->org;
664 0           my $contam_org = q{};
665              
666             # setup tax_report data
667 0           my $common_tax;
668             my $lca;
669 0           my $lineage;
670 0           my $tag = q{};
671 0           my $top_score = 0;
672 0           my $rel_n = 0;
673 0           my $mean_len;
674             my $mean_ident;
675              
676             # optionally analyze template taxonomy
677             # ... to build tax_reports of additions
678             # ... to tag potential contaminations
679 0 0 0       if ( $rp->tax_reports eq 'on' || $self->_tax_filter ) {
680 0           my @relatives;
681 0           my $best_score = 0;
682 0           my $sum_len = 0;
683 0           my $sum_ident = 0;
684              
685 0           my $tax = $rp->tax;
686 0           my $ea = each_array @templates, @template_ids;
687              
688             # accumulate relatives for LCA inference
689             # Note: three logical modes can be distinguished.
690             # These should be implemented in the config file.
691             # 1. best-hit
692             # - tax_max_hits: 1
693             # - filter on tax_min_score or tax_min_len / tax_min_ident
694             # 2. classic LCA
695             # - tax_max_hits: to be specified
696             # - filter on tax_min_score or tax_min_len / tax_min_ident
697             # 3. MEGAN-like LCA
698             # - tax_max_hits: should be ignored
699             # - filter on tax_min_score / tax_score_mul
700             # however all criteria can be simultaneously activated
701              
702             REL:
703 0           while ( my ($template, $template_id) = $ea->() ) {
704              
705             # accumulate at most tax_max_hits relatives for LCA inference
706 0 0         last REL if @relatives >= $rp->tax_max_hits;
707              
708             # ensure that match to template is good enough...
709             # by default all these thresholds are disabled
710 0           my $hsp = $template->get_hsp(0);
711              
712             # ... MEGAN-like mode (and top_score needed for one-on-one)
713 0           my $score = $hsp->bit_score;
714 0           $top_score = List::AllUtils::max($top_score, $score);
715             # Note: we do not use last due to non-monotonic score decrease
716              
717             # no need to accumulate relatives if no NCBI Taxonomy is available
718 0 0         last REL unless $tax;
719              
720 0 0         next REL if $score < $rp->tax_min_score;
721 0 0         next REL if $score < $rp->tax_score_mul * $best_score;
722              
723             # ... classic or best-hit mode
724 0           my $len = $hsp->align_len;
725 0 0         next REL if $len < $rp->tax_min_len;
726 0           my $ident = $hsp->identity / $len;
727 0 0         next REL if $ident < $rp->tax_min_ident;
728              
729             # try to determine taxonomy of template
730             # if possible add template to list of relatives...
731             # ... and set best_score for MEGAN-like mode
732 0           my @taxonomy = $tax->fetch_lineage($template_id);
733 0 0         if (@taxonomy) {
734             # Note: contrary to ref_score_mul the best_score never changes
735 0   0       $best_score ||= $score;
736 0           $sum_len += $len;
737 0           $sum_ident += $ident;
738 0           push @relatives, \@taxonomy;
739             ######## [DEBUG] hit: $template_id->full_id
740             ######## [DEBUG] length: $len
741             ######## [DEBUG] identity: sprintf '%.2f', $ident
742             ######## [DEBUG] lineage: join '; ', @taxonomy
743             }
744             }
745              
746             ######## [DEBUG] relatives: display( map { join '; ', @$_ } @relatives )
747              
748 0 0         if (@relatives < $rp->tax_min_hits) {
749             ###### [ORG] tagged as unclassified due to lack of relatives
750 0           $common_tax = [ 'unclassified sequences' ];
751 0 0         if ($self->_tax_filter) {
752 0           $tag = 'u#';
753             }
754             }
755              
756             else {
757             # infer LCA from all relatives...
758 0           $common_tax = $tax->compute_lca(@relatives);
759 0           $lineage = join '; ', @{$common_tax};
  0            
760             ######## [DEBUG] LCA lineage: $lineage
761 0           $lca = $common_tax->[-1];
762             ###### [ORG] affiliated to: $lca
763              
764 0 0         if ($rp->tax_reports eq 'on') {
765 0           $rel_n = @relatives;
766 0           $mean_len = sprintf '%.2f', $sum_len / $rel_n;
767 0           $mean_ident = sprintf '%.2f', $sum_ident / $rel_n;
768             }
769              
770             # optionally tag potential contamination based on LCA
771             # Note: LCAs are used "as is" because is_allowed handles them
772             # Note: tol_check 'on' disables positive taxonomic filtering
773 0 0 0       if ($self->_tax_filter && $rp->tol_check eq 'off') {
774             ######## [DEBUG] pass tax_filter: $self->is_allowed($common_tax)
775 0 0         unless ( $self->is_allowed($common_tax) ) {
776             ###### [ORG] tagged as contaminated
777 0           $tag = 'c#';
778             $contam_org = join '_',
779 0   0       map { $_ // () } SeqId->parse_ncbi_name($lca);
  0            
780             } # delete undef species/strains
781             }
782             }
783             }
784              
785             # check for multiple orthologues derived from the same transcript
786             # Note: this mostly makes sense for genomic contigs/scaffolds
787 0           my $acc_u = $acc;
788 0           my $count = ++$self->_count_for->{$org}{$acc};
789 0 0         if ($count > 1) { # direct access to private attr
790 0           carp "[ORG] Note: more than one orthologue extracted from $acc;"
791             . " appending .$count to accession.";
792 0           $acc_u .= ".$count";
793             }
794              
795             # add potential accession tail '+N' (if orthologues have been merged)
796 0 0         $acc_u .= $more if $more;
797              
798             # fetch orthologous transcript and give it nearly complete full_id
799 0           my $transcript_seq
800             = $self->orthologous_seqs->get_seq_with_id($transcript_acc)->seq;
801 0 0 0       my $transcript_id
802             = $org =~ $PKEYONLY || $org =~ $GCAONLY ? $org . '|' . $acc_u
803             : $family . $tag . $org . '@' . $acc_u
804             ;
805              
806             # build tax_report line for transcript
807 0           my $tax_line = NewSeq->new(
808             seq_id => $transcript_id,
809             contam_org => $contam_org,
810             top_score => $top_score,
811             rel_n => $rel_n,
812             mean_len => $mean_len,
813             mean_ident => $mean_ident,
814             lca => $lca,
815             lineage => $lineage,
816             acc => $acc, # Note: not $acc_u here!
817             start => $start,
818             end => $end,
819             strand => $strand,
820             seq => $transcript_seq,
821             );
822              
823 0           return $tax_line;
824             }
825              
826              
827             sub BUILD {
828 0     0 0   my $self = shift;
829              
830 0           my $ap = $self->ali_proc;
831 0           my $rp = $ap->run_proc;
832              
833             # optionally build _tax_filter from tax_filter (specs) and tax attrs
834             # Note: cannot use builder/coercion because of the need for a tax object
835 0 0         if ($self->tax_filter) {
836 0           my $tax = $rp->tax;
837 0 0         $self->_set_tax_filter( $tax->tax_filter( $self->tax_filter ) )
838             if $tax; # warning already issued at this stage
839             }
840              
841             ##### [ORG] homologues: display( $self->homologous_seqs->all_long_ids )
842 0 0         return unless $self->homologous_seqs->all_long_ids;
843              
844             ##### [ORG] orthologues: display( $self->orthologous_seqs->all_long_ids )
845 0 0         return unless $self->orthologous_seqs->all_long_ids;
846              
847             ##### [ORG] aligned: display( map { $_->full_id } $self->aligned_seqs->all_seq_ids )
848 0 0         return unless $self->aligned_seqs->all_seq_ids;
849              
850             ##### [ORG] Adding aligned seqs to file...
851 0           $ap->ali->add_seq( $self->aligned_seqs->all_seqs );
852              
853 0           return;
854             }
855              
856              
857             __PACKAGE__->meta->make_immutable;
858             1;
859              
860             __END__
861              
862             =pod
863              
864             =head1 NAME
865              
866             Bio::MUST::Apps::FortyTwo::OrgProcessor - Internal class for forty-two tool
867              
868             =head1 VERSION
869              
870             version 0.210570
871              
872             =head1 AUTHOR
873              
874             Denis BAURAIN <denis.baurain@uliege.be>
875              
876             =head1 COPYRIGHT AND LICENSE
877              
878             This software is copyright (c) 2013 by University of Liege / Unit of Eukaryotic Phylogenomics / Denis BAURAIN.
879              
880             This is free software; you can redistribute it and/or modify it under
881             the same terms as the Perl 5 programming language system itself.
882              
883             =cut