File Coverage

blib/lib/Bio/MUST/Apps/FortyTwo/OrgProcessor.pm

Criterion	Covered	Total	%
statement	66	350	18.8
branch	0	132	0.0
condition	0	64	0.0
subroutine	22	30	73.3
pod	0	1	0.0
total	88	577	15.2

line	stmt	bran	cond	sub	pod	time	code
1							package Bio::MUST::Apps::FortyTwo::OrgProcessor;
2							# ABSTRACT: Internal class for forty-two tool
3							$Bio::MUST::Apps::FortyTwo::OrgProcessor::VERSION = '0.210570';
4	1			1		762	use Moose;
	1					3
	1					7
5	1			1		6871	use namespace::autoclean;
	1					3
	1					7
6
7	1			1		93	use autodie;
	1					3
	1					8
8	1			1		5536	use feature qw(say);
	1					3
	1					86
9
10	1			1		8	use Smart::Comments -ENV;
	1					3
	1					8
11
12	1			1		6316	use Carp;
	1					3
	1					64
13	1			1		6	use Const::Fast;
	1					2
	1					9
14	1			1		64	use List::AllUtils qw(each_array pairmap);
	1					3
	1					50
15	1			1		7	use Path::Class qw(file);
	1					2
	1					52
16	1			1		7	use Tie::IxHash;
	1					2
	1					29
17
18	1			1		5	use Bio::MUST::Core;
	1					23
	1					26
19	1			1		7	use Bio::MUST::Core::Constants qw(:gaps :ncbi);
	1					2
	1					191
20	1			1		7	use Bio::MUST::Core::Utils qw(:filenames);
	1					2
	1					93
21	1			1		7	use Bio::MUST::Drivers;
	1					2
	1					24
22	1			1		6	use aliased 'Bio::MUST::Core::Ali';
	1					2
	1					6
23	1			1		194	use aliased 'Bio::MUST::Core::Ali::Temporary';
	1					2
	1					5
24	1			1		215	use aliased 'Bio::MUST::Core::IdList';
	1					17
	1					4
25	1			1		201	use aliased 'Bio::MUST::Core::Seq';
	1					2
	1					5
26	1			1		188	use aliased 'Bio::MUST::Core::SeqId';
	1					2
	1					4
27	1			1		189	use aliased 'Bio::MUST::Core::SeqMask';
	1					2
	1					4
28	1			1		198	use aliased 'Bio::MUST::Drivers::Cap3';
	1					2
	1					5
29	1			1		207	use aliased 'Bio::MUST::Apps::Debrief42::TaxReport::NewSeq';
	1					2
	1					3
30
31							with 'Bio::MUST::Apps::Roles::OrgProcable';
32
33
34							# org
35
36							# banks
37
38							# code
39
40
41							has 'ali_proc' => (
42							is => 'ro',
43							isa => 'Bio::MUST::Apps::FortyTwo::AliProcessor',
44							required => 1,
45							);
46
47
48							has 'tax_filter' => (
49							is => 'ro',
50							isa => 'Maybe[ArrayRef[Str]]',
51							);
52
53
54							has '_tax_filter' => (
55							is => 'ro',
56							isa => 'Maybe[Bio::MUST::Core::Taxonomy::Filter]',
57							init_arg => undef,
58							writer => '_set_tax_filter',
59							handles => [ qw(is_allowed) ],
60							);
61
62
63							has $_ . '_seqs' => (
64							is => 'ro',
65							isa => 'Bio::MUST::Core::Ali::Temporary',
66							init_arg => undef,
67							lazy => 1,
68							builder => '_build_' . $_ . '_seqs',
69							) for qw(homologous orthologous);
70
71
72							has 'aligned_seqs' => (
73							is => 'ro',
74							isa => 'Bio::MUST::Core::Ali',
75							init_arg => undef,
76							lazy => 1,
77							builder => '_build_aligned_seqs',
78							);
79
80
81							has '_' . $_ . '_para_scores' => (
82							traits => ['Hash'],
83							is => 'ro',
84							isa => 'HashRef[Num]',
85							init_arg => undef,
86							lazy => 1,
87							builder => '_build_' . $_ . '_scores',
88							handles => {
89							$_ . '_score_for' => 'get',
90							},
91							) for qw(para tol);
92
93
94							has '_count_for' => (
95							is => 'ro',
96							isa => 'HashRef[HashRef[Num]]',
97							init_arg => undef,
98							default => sub { {} },
99							);
100
101
102							## no critic (ProhibitUnusedPrivateSubroutines)
103
104							sub _build_homologous_seqs {
105	0			0			my $self = shift;
106
107							##### [ORG] Searching for homologues...
108
109	0						my $ap = $self->ali_proc;
110	0						my $rp = $ap->run_proc;
111
112	0		0				my $args = $rp->blast_args_for('homologues') // {};
113	0						$args->{-outfmt} = 6;
114							$args->{-max_target_seqs} = 10000
115	0	0					unless defined $args->{-max_target_seqs};
116
117	0						my @seqs;
118	0						for my $bank ($self->all_banks) {
119
120							###### [ORG] Processing BANK: $bank
121	0						my $query_seqs = $ap->query_seqs;
122	0						my $file = file( $rp->bank_dir, $bank );
123	0						my $blastdb = Bio::MUST::Drivers::Blast::Database->new( file => $file );
124	0	0	0				$args->{-db_gencode} = $self->code # if TBLASTN
125							if $query_seqs->type eq 'prot' && $blastdb->type eq 'nucl';
126	0						my $parser = $blastdb->blast($query_seqs, $args);
127							###### [ORG] TBLASTN (or BLASTP/N): $bank . q{ } . $parser->filename
128
129							# tie might help making 42 completely deterministic
130	0						tie my %mask_for, 'Tie::IxHash';
131	0						my %strand_for;
132
133							# collect hit ids, masks and strands covered by queries
134	0						$self->_collect_hsps(
135							parser => $parser,
136							mask_for => \%mask_for,
137							strand_for => \%strand_for,
138							);
139	0	0					$parser->remove unless $rp->debug_mode;
140
141							# fetch (and optionally trim) hits to their max range
142	0						my @hits = $self->_fetch_and_trim_hits(
143							blastdb => $blastdb,
144							mask_for => \%mask_for,
145							strand_for => \%strand_for,
146							);
147							######## [DEBUG] hits: display( map { $_->full_id . ' => ' . $_->seq } @hits )
148
149							# add seqs from current bank to homologous seqs
150	0						push @seqs, @hits;
151							}
152
153							# build BLAST query file from homologous seqs
154	0						return Temporary->new( seqs => \@seqs );
155							}
156
157
158							sub _build_orthologous_seqs {
159	0			0			my $self = shift;
160
161	0						my $ap = $self->ali_proc;
162	0						my $rp = $ap->run_proc;
163
164	0	0					if ($rp->ref_brh eq 'off') {
165							##### [ORG] Skipping orthology assessment!
166	0						return $self->homologous_seqs;
167							}
168
169							##### [ORG] Identifying orthologues among homologues...
170
171	0		0				my $args = $rp->blast_args_for('orthologues') // {};
172	0						$args->{-outfmt} = 6;
173	0						$args->{-max_target_seqs} = 1;
174
175							# tie might help making 42 completely deterministic
176	0						tie my %count_for, 'Tie::IxHash';
177	0						for my $ref_org ($ap->all_ref_orgs) {
178
179	0						my $homologous_seqs = $self->homologous_seqs;
180	0						my $blastdb = $rp->ref_blastdb_for($ref_org);
181	0	0	0				$args->{-query_gencode} = $self->code # if BLASTX
182							if $homologous_seqs->type eq 'nucl' && $blastdb->type eq 'prot';
183	0						my $parser = $blastdb->blast($homologous_seqs, $args);
184							##### [ORG] BLASTX (or BLASTP/N): $ref_org . q{ } . $parser->filename
185
186	0						my $best_hits = $ap->best_hits_for($ref_org);
187	0						while (my $hsp = $parser->next_query) {
188	0						my $candidate = $homologous_seqs->long_id_for( $hsp->query_id );
189	0						my $in_best_hits = $best_hits->is_listed( $hsp->hit_id );
190							######## [DEBUG]: $candidate . ( $in_best_hits && ' [=BRH=]' )
191	0	0					$count_for{$candidate}++ if $in_best_hits;
192							}
193	0	0					$parser->remove unless $rp->debug_mode;
194							}
195
196							######## [DEBUG] BRH counts: display( pairmap { "$a => $b" } %count_for )
197
198							# keep only homologues that are orthologous for all effective ref_orgs
199	0						my $ref_org_n = $ap->count_ref_orgs;
200							my $orthologues = IdList->new(
201	0						ids => [ grep { $count_for{$_} == $ref_org_n } keys %count_for ]
	0
202							);
203	0						my $seqs = $orthologues->filtered_ali( $self->homologous_seqs );
204
205							# optionally merge orthologues before aligning
206							# Note: this is always disabled in metagenomic mode
207	0	0					unless ($rp->run_mode eq 'metagenomic') { # TODO: warn user?
208	0	0					if ($rp->merge_orthologues eq 'on') {
209							##### [ORG] pre-merge orthologues: display( map { $_->full_id } $seqs->all_seq_ids )
210	0						$self->_compress_seqs($seqs);
211							}
212							}
213
214							# build BLAST query file from orthologous seqs
215	0						return Temporary->new( seqs => $seqs );
216							}
217
218
219							sub _build_para_scores {
220	0			0			my $self = shift;
221
222	0						my $ap = $self->ali_proc;
223	0						my $rp = $ap->run_proc;
224
225							# ignore if no .para file in use
226	0						my $blastdb = $ap->para_blastdb;
227	0	0					return unless $blastdb;
228
229	0		0				my $args = $rp->blast_args_for('templates') // {};
230	0						$args->{-outfmt} = 6;
231	0						$args->{-max_target_seqs} = 1;
232
233	0						my $orthologous_seqs = $self->orthologous_seqs;
234	0	0	0				$args->{-query_gencode} = $self->code # if BLASTX
235							if $orthologous_seqs->type eq 'nucl' && $blastdb->type eq 'prot';
236	0						my $parser = $blastdb->blast($orthologous_seqs, $args);
237							##### [ORG] BLASTX (or BLASTP/N): 'PARA ' . $parser->filename
238
239	0						my %para_score_for;
240	0						while (my $hsp = $parser->next_query) {
241	0						my $transcript_acc = $orthologous_seqs->long_id_for( $hsp->query_id );
242	0						my $hit_id = $blastdb->long_id_for( $hsp->hit_id );
243	0						my $score = $hsp->bit_score;
244							######## [DEBUG] orthologue: $transcript_acc
245							######## [DEBUG] PARA hit: $hit_id
246							######## [DEBUG] score: $score
247	0						$para_score_for{$transcript_acc} = $score;
248							}
249	0	0					$parser->remove unless $rp->debug_mode;
250
251	0						return \%para_score_for;
252							}
253
254
255							sub _build_tol_scores {
256	0			0			my $self = shift;
257
258	0						my $ap = $self->ali_proc;
259	0						my $rp = $ap->run_proc;
260
261							# ignore if no tol_check in use
262	0						my $blastdb = $rp->tol_blastdb;
263	0	0					return unless $blastdb;
264
265	0		0				my $args = $rp->blast_args_for('templates') // {};
266	0						$args->{-outfmt} = 5;
267	0						$args->{-max_target_seqs} = 250;
268
269	0						my $orthologous_seqs = $self->orthologous_seqs;
270	0	0	0				$args->{-query_gencode} = $self->code # if BLASTX
271							if $orthologous_seqs->type eq 'nucl' && $blastdb->type eq 'prot';
272	0						my $parser = $blastdb->blast($orthologous_seqs, $args);
273							##### [ORG] XML BLASTX (or BLASTP/N): 'TOL ' . $parser->filename
274
275							# abort if no hit
276	0						my $bo = $parser->blast_output;
277	0	0					return unless $bo;
278
279	0						my %tol_score_for;
280
281							ORTHOLOGUE:
282	0						for my $orthologue ($bo->all_iterations) {
283	0	0					next ORTHOLOGUE unless $orthologue->count_hits;
284							# Note: this should never happen...
285
286	0						my $query_def = $orthologue->query_def;
287	0						my $transcript_acc = $orthologous_seqs->long_id_for($query_def);
288							######## [DEBUG] orthologue: $transcript_acc
289
290							TOL_HIT:
291	0						for my $hit ($orthologue->all_hits) {
292
293							# check taxonomy of hit: skip SELF-like hits
294	0						my $hit_id = SeqId->new( full_id => $hit->id );
295							######## [DEBUG] TOL hit: $hit_id->full_id
296	0	0					next TOL_HIT if $self->is_allowed($hit_id);
297
298							# fetch score for first non-SELF-like hit
299	0						my $score = $hit->get_hsp(0)->bit_score;
300							######## [DEBUG] score: $score
301	0						$tol_score_for{$transcript_acc} = $score;
302	0						next ORTHOLOGUE;
303							}
304							}
305	0	0					$parser->remove unless $rp->debug_mode;
306
307	0						return \%tol_score_for;
308							}
309
310
311							sub _build_aligned_seqs { ## no critic (ProhibitExcessComplexity)
312	0			0			my $self = shift;
313
314							##### [ORG] Aligning orthologues...
315
316	0						my $ap = $self->ali_proc;
317	0						my $rp = $ap->run_proc;
318
319	0		0				my $args = $rp->blast_args_for('templates') // {};
320	0						$args->{-outfmt} = 5;
321	0						$args->{-max_target_seqs} = $rp->tax_max_hits * 2;
322
323	0						my $orthologous_seqs = $self->orthologous_seqs;
324	0						my $blastdb = $ap->blastdb;
325	0	0	0				$args->{-query_gencode} = $self->code # if BLASTX
326							if $orthologous_seqs->type eq 'nucl' && $blastdb->type eq 'prot';
327	0						my $parser = $blastdb->blast($orthologous_seqs, $args);
328							##### [ORG] XML BLASTX (or BLASTP/N): $parser->filename
329
330	0						my $aligned_seqs = Ali->new();
331
332							# abort if no hit
333	0						my $bo = $parser->blast_output;
334	0	0					return $aligned_seqs unless $bo;
335
336							ORTHOLOGUE:
337	0						for my $orthologue ($bo->all_iterations) {
338	0	0					unless ($orthologue->count_hits) {
339							###### [ORG] skipped orthologue due to lack of significant template
340	0						next ORTHOLOGUE;
341							} # TODO: investigate why this should happen at all...
342
343							# fetch tax_line from orthologue
344							# Note: .para check is also done in this sub (hence the next below)
345	0						my $tax_line = $self->_fetch_tax_line_for_transcript($orthologue);
346	0	0					next ORTHOLOGUE unless $tax_line;
347
348							# extract contam_org and transcript_id from tax_line...
349	0						my $contam_org = $tax_line->contam_org;
350	0						my $transcript_id = $tax_line->seq_id . '%s'; # chunk placeholder
351	0	0					$transcript_id .= "...$contam_org" if $contam_org;
352	0						$transcript_id .= '#NEW#';
353
354							# ... and use it "as is" to store tax_line in tax_report (no chunk tag)
355	0						my $tax_line_id = sprintf $transcript_id, q{};
356	0						$ap->set_tax_line( $tax_line_id => $tax_line );
357
358							# extract transcript_seq from tax_line
359	0						my $transcript_seq = Seq->new(
360							seq_id => $tax_line_id, # same id without alignment
361							seq => $tax_line->seq
362							);
363
364							# skip both orthologue alignment and integration if metagenomic mode
365	0	0					next ORTHOLOGUE if $rp->run_mode eq 'metagenomic';
366
367							# optionally add transcript_seq as is (without alignment)
368	0	0					if ($rp->aligner_mode eq 'off') {
369
370							####### [ORG] Adding: $transcript_seq->full_id
371
372	0						$aligned_seqs->add_seq($transcript_seq);
373	0						next ORTHOLOGUE;
374							}
375
376							# build a template_seq list as long as query coverage improves
377							# exonerate will try to use the longest template for alignment
378							# while BLAST will use each hit in turn (possibly as a fall-back)
379	0						my @templates;
380							my @template_seqs;
381	0						my $best_coverage = 0;
382
383							TEMPLATE:
384	0						for my $template ($orthologue->all_hits) {
385
386							# fetch template full_id
387	0						my $template_id = SeqId->new(
388							full_id => $blastdb->long_id_for($template->def)
389							);
390
391							# optionally skip template if from same org as the orthologue
392	0	0					if ($rp->ali_skip_self eq 'on') {
393	0	0					if ($template_id->full_org eq $transcript_seq->full_org) {
394							###### [ORG] skipped same-org template due to ali_skip_self
395	0						next TEMPLATE;
396							}
397							}
398
399							# compute query coverage by template HSPs
400	0						my $mask = SeqMask->empty_mask( $orthologue->query_len );
401							$mask->mark_block($_->query_start, $_->query_end)
402	0						for $template->all_hsps;
403	0						my $coverage = $mask->coverage;
404
405	0	0					last TEMPLATE if $coverage
406							< $best_coverage * $rp->ali_cover_mul;
407	0						$best_coverage = $coverage;
408
409							######## [DEBUG] template: $template_id->full_id
410							######## [DEBUG] coverage: sprintf '%.2f', $coverage
411
412							# fetch and cache aligned template seq from Ali
413							# Note: we emulate a fast get_seq_with_id using Ali lookup
414	0						push @templates, $template;
415	0						push @template_seqs, $ap->ali->get_seq(
416							$ap->lookup->index_for( $template_id->full_id )
417							);
418							}
419
420							# Note: as we (optionally) skip templates, we may run out of templates
421	0	0					unless (@templates) {
422							###### [ORG] skipped orthologue due to lack of suitable template
423	0						next ORTHOLOGUE;
424							}
425
426							# flag tracking exonerate failure (for exoblast mode)
427	0						my $failure;
428
429	0	0					if ($rp->aligner_mode =~ m/exonerate\|exoblast/xms) {
430
431							# use longest template for exonerate alignment
432	0						my $template_seq = $template_seqs[-1];
433
434							# align transcript on template protein and get its translation
435							# Note: in exonerate the translated dna_seq is called 'target_seq'
436	0						my $exo = Bio::MUST::Drivers::Exonerate::Aligned->new(
437							dna_seq => $transcript_seq,
438							pep_seq => $template_seq->clone->degap,
439							code => $self->code,
440							);
441
442							# get new_seq from exonerate report
443							# build new_id from transcript_id and exonerate model used
444	0						my $new_id = sprintf $transcript_id, '.E.' . $exo->model;
445	0						my $new_seq = $exo->target_seq;
446	0						$new_seq->set_seq_id($new_id);
447
448							####### [ORG] Adding: $new_seq->full_id
449
450	0	0					unless ($new_seq->seq_len) {
451	0	0					my $fate = $rp->aligner_mode eq 'exoblast'
452							? 'will retry with BLAST' : 'discarded'
453							;
454							####### [ORG] exonerate failure: $fate
455	0						$failure = 1;
456							}
457
458							else {
459							# align new_seq on template_seq using subject_seq as a guide
460	0						$aligned_seqs->add_seq(
461							$ap->integrator->align(
462							new_seq => $new_seq,
463							subject => $exo->query_seq,
464							template => $template_seq,
465							start => $exo->query_start,
466							)
467							);
468
469							######## [DEBUG] aligned with exonerate model: $exo->model
470							}
471							}
472
473	0	0	0				if ( ($rp->aligner_mode eq 'blast')
			0
474							\|\| ($rp->aligner_mode eq 'exoblast' && $failure) ) {
475
476							TEMPLATE:
477	0						for my $template (@templates) {
478
479							# use each template in turn for BLAST alignment
480	0						my $template_seq = shift @template_seqs;
481	0	0					last TEMPLATE unless $template_seq;
482
483							HSP:
484	0						for my $hsp ($template->all_hsps) {
485
486							# build HSP id from transcript_id and template/HSP ranks
487	0						my $hsp_id = sprintf $transcript_id,
488							'.H' . $template->num . '.' . $hsp->num;
489
490							# build HSP seq from BLASTX report
491	0						(my $hsp_seq = $hsp->qseq) =~ s{\*}{$FRAMESHIFT}xmsg;
492	0						my $new_seq = Seq->new(
493							seq_id => $hsp_id,
494							seq => $hsp_seq
495							);
496
497							# fetch aligned template seq from HSP (= subject)
498	0						my $subject_seq = Seq->new(
499							seq_id => $template_seq->seq_id,
500							seq => $hsp->hseq
501							);
502
503							# reverse complement seqs if template on reverse in BLASTN
504	0	0	0				if ($ap->query_seqs->type eq 'nucl'
			0
505							&& $blastdb->type eq 'nucl'
506							&& $hsp->hit_strand == -1) {
507	0						$new_seq = $new_seq->reverse_complemented_seq;
508	0						$subject_seq = $subject_seq->reverse_complemented_seq;
509							}
510
511							####### [ORG] Adding: $new_seq->full_id
512
513							# align new_seq on template_seq using subject_seq as a guide
514							$aligned_seqs->add_seq(
515	0						$ap->integrator->align(
516							new_seq => $new_seq,
517							subject => $subject_seq,
518							template => $template_seq,
519							start => $hsp->hit_start,
520							)
521							);
522
523							######## [DEBUG] aligned with BLAST
524							}
525							}
526							}
527							}
528
529	0	0					$parser->remove unless $rp->debug_mode;
530
531	0						return $aligned_seqs;
532							}
533
534							## use critic
535
536
537							sub _compress_seqs {
538	0			0			my $self = shift;
539	0						my $ali = shift;
540
541	0						my $rp = $self->ali_proc->run_proc;
542
543							# first fetch all seqs...
544							# ... then clear existing seqs
545	0						my @seqs2cap = $ali->all_seqs;
546	0						$ali->_set_seqs( [] );
547
548							# proceed only if at least one seq
549	0	0					return unless @seqs2cap;
550
551							# Note: this might seem sub-optimal in case of singletons
552							# but we follow as closely as possible the logic of compress-db.pl
553
554							# proceed only if at least two seqs
555							# otherwise add lone seq
556	0	0					if (@seqs2cap < 2) {
557	0						$ali->add_seq( shift @seqs2cap );
558	0						return;
559							}
560
561							# TODO: add debugging comments?
562
563							# try to cap seqs
564	0						my $cap = Cap3->new(
565							seqs => \@seqs2cap,
566							cap3_args => {
567							-p => $rp->merge_min_ident * 100.0, # CAP3 expects percents
568							-o => $rp->merge_min_len,
569							},
570							);
571
572							# add singlet seqs
573	0						my @singlets = $cap->all_singlets;
574	0						$ali->add_seq($_) for @singlets;
575
576							# proceed only if contigs of seqs
577	0						my @contigs = $cap->all_contigs;
578	0	0					return unless @contigs;
579
580	0						for my $contig (@contigs) {
581	0						my @ids = map { $_->full_id }
582	0						@{ $cap->seq_ids_for( $contig->full_id ) };
	0
583	0						my $contig_id = shift(@ids) . ':::+' . @ids;
584	0						my $contig_seq = $contig->seq;
585
586							# add contig seq
587	0						$ali->add_seq(
588							Seq->new( seq_id => $contig_id, seq => $contig_seq )
589							);
590							}
591
592							# Note: we do not need to return anything as we modified $ali in place
593	0						return;
594							}
595
596
597							sub _fetch_tax_line_for_transcript {
598	0			0			my $self = shift;
599	0						my $orthologue = shift;
600
601	0						my $ap = $self->ali_proc;
602	0						my $rp = $ap->run_proc;
603
604							# fetch transcript accession (= query id) from BLAST report
605	0						my $query_def = $orthologue->query_def;
606	0						my $transcript_acc = $self->orthologous_seqs->long_id_for($query_def);
607
608							# extract range and strand (if any) from transcript accession...
609							# Note: this mostly makes sense for genomic contigs/scaffolds
610							# ... and extract potential tail '+N' (if orthologues have been merged)
611							# Note: this looks convoluted but it has to work in many different setups
612							# depending on the value of trim_homologues and merge_orthologues:
613							# - off/off: acc
614							# - off/on: acc:::+N
615							# - on /off: acc:::start:::end:::strand
616							# - on /on: acc:::start:::end:::strand:::+N
617	0						my @fields = split /:::/xms, $transcript_acc;
618	0						my ($strip_acc, $start, $end, $strand, $more) = @fields;
619	0	0	0				($more, $start) = ($start, undef)
			0
620							if !defined $end && defined $start && $start =~ m/\A \+\d+ \z/xms;
621
622							# set acc to transcript accession
623	0						my ($first_chunk) = $strip_acc =~ m/^(\S+)/xms;
624	0						my @parts = split /\\|/xms, $first_chunk;
625	0						my $acc = pop @parts;
626							###### [ORG] orthologous transcript: $acc
627
628							# fetch score for best template
629	0						my $temp_score = $orthologue->get_hit(0)->get_hsp(0)->bit_score;
630
631							# check again for paralogy if .para file in use
632	0	0					if ($ap->para_blastdb) {
633	0		0				my $para_score = $self->para_score_for($transcript_acc) // 0;
634	0	0					if ($para_score > $temp_score) {
635							###### [ORG] rejected for PARA[logy]: "$para_score > $temp_score"
636	0						return;
637							}
638							}
639
640							# check for contamination if tol_check in use
641	0	0					if ($rp->tol_blastdb ) {
642	0		0				my $tol_score = $self->tol_score_for($transcript_acc) // 0;
643	0	0					if ($tol_score > $temp_score) {
644							###### [ORG] rejected due to TOL check: "$tol_score > $temp_score"
645	0						return;
646							}
647							}
648
649							# prebuild SeqId list for all templates
650							# this is needed for family affiliation (first only)
651							# ... and for assessing template taxonomy (all)
652	0						my $blastdb = $self->ali_proc->blastdb;
653	0						my @templates = $orthologue->all_hits;
654							my @template_ids = map {
655	0						SeqId->new( full_id => $blastdb->long_id_for( $_->def ) )
	0
656							} @templates;
657
658							# set family to first template family (if any)
659	0		0				my $family = $template_ids[0]->family // q{};
660	0	0					$family .= '-' if $family;
661
662							# set org to current (bank) org
663	0						my $org = $self->org;
664	0						my $contam_org = q{};
665
666							# setup tax_report data
667	0						my $common_tax;
668							my $lca;
669	0						my $lineage;
670	0						my $tag = q{};
671	0						my $top_score = 0;
672	0						my $rel_n = 0;
673	0						my $mean_len;
674							my $mean_ident;
675
676							# optionally analyze template taxonomy
677							# ... to build tax_reports of additions
678							# ... to tag potential contaminations
679	0	0	0				if ( $rp->tax_reports eq 'on' \|\| $self->_tax_filter ) {
680	0						my @relatives;
681	0						my $best_score = 0;
682	0						my $sum_len = 0;
683	0						my $sum_ident = 0;
684
685	0						my $tax = $rp->tax;
686	0						my $ea = each_array @templates, @template_ids;
687
688							# accumulate relatives for LCA inference
689							# Note: three logical modes can be distinguished.
690							# These should be implemented in the config file.
691							# 1. best-hit
692							# - tax_max_hits: 1
693							# - filter on tax_min_score or tax_min_len / tax_min_ident
694							# 2. classic LCA
695							# - tax_max_hits: to be specified
696							# - filter on tax_min_score or tax_min_len / tax_min_ident
697							# 3. MEGAN-like LCA
698							# - tax_max_hits: should be ignored
699							# - filter on tax_min_score / tax_score_mul
700							# however all criteria can be simultaneously activated
701
702							REL:
703	0						while ( my ($template, $template_id) = $ea->() ) {
704
705							# accumulate at most tax_max_hits relatives for LCA inference
706	0	0					last REL if @relatives >= $rp->tax_max_hits;
707
708							# ensure that match to template is good enough...
709							# by default all these thresholds are disabled
710	0						my $hsp = $template->get_hsp(0);
711
712							# ... MEGAN-like mode (and top_score needed for one-on-one)
713	0						my $score = $hsp->bit_score;
714	0						$top_score = List::AllUtils::max($top_score, $score);
715							# Note: we do not use last due to non-monotonic score decrease
716
717							# no need to accumulate relatives if no NCBI Taxonomy is available
718	0	0					last REL unless $tax;
719
720	0	0					next REL if $score < $rp->tax_min_score;
721	0	0					next REL if $score < $rp->tax_score_mul * $best_score;
722
723							# ... classic or best-hit mode
724	0						my $len = $hsp->align_len;
725	0	0					next REL if $len < $rp->tax_min_len;
726	0						my $ident = $hsp->identity / $len;
727	0	0					next REL if $ident < $rp->tax_min_ident;
728
729							# try to determine taxonomy of template
730							# if possible add template to list of relatives...
731							# ... and set best_score for MEGAN-like mode
732	0						my @taxonomy = $tax->fetch_lineage($template_id);
733	0	0					if (@taxonomy) {
734							# Note: contrary to ref_score_mul the best_score never changes
735	0		0				$best_score \|\|= $score;
736	0						$sum_len += $len;
737	0						$sum_ident += $ident;
738	0						push @relatives, \@taxonomy;
739							######## [DEBUG] hit: $template_id->full_id
740							######## [DEBUG] length: $len
741							######## [DEBUG] identity: sprintf '%.2f', $ident
742							######## [DEBUG] lineage: join '; ', @taxonomy
743							}
744							}
745
746							######## [DEBUG] relatives: display( map { join '; ', @$_ } @relatives )
747
748	0	0					if (@relatives < $rp->tax_min_hits) {
749							###### [ORG] tagged as unclassified due to lack of relatives
750	0						$common_tax = [ 'unclassified sequences' ];
751	0	0					if ($self->_tax_filter) {
752	0						$tag = 'u#';
753							}
754							}
755
756							else {
757							# infer LCA from all relatives...
758	0						$common_tax = $tax->compute_lca(@relatives);
759	0						$lineage = join '; ', @{$common_tax};
	0
760							######## [DEBUG] LCA lineage: $lineage
761	0						$lca = $common_tax->[-1];
762							###### [ORG] affiliated to: $lca
763
764	0	0					if ($rp->tax_reports eq 'on') {
765	0						$rel_n = @relatives;
766	0						$mean_len = sprintf '%.2f', $sum_len / $rel_n;
767	0						$mean_ident = sprintf '%.2f', $sum_ident / $rel_n;
768							}
769
770							# optionally tag potential contamination based on LCA
771							# Note: LCAs are used "as is" because is_allowed handles them
772							# Note: tol_check 'on' disables positive taxonomic filtering
773	0	0	0				if ($self->_tax_filter && $rp->tol_check eq 'off') {
774							######## [DEBUG] pass tax_filter: $self->is_allowed($common_tax)
775	0	0					unless ( $self->is_allowed($common_tax) ) {
776							###### [ORG] tagged as contaminated
777	0						$tag = 'c#';
778							$contam_org = join '_',
779	0		0				map { $_ // () } SeqId->parse_ncbi_name($lca);
	0
780							} # delete undef species/strains
781							}
782							}
783							}
784
785							# check for multiple orthologues derived from the same transcript
786							# Note: this mostly makes sense for genomic contigs/scaffolds
787	0						my $acc_u = $acc;
788	0						my $count = ++$self->_count_for->{$org}{$acc};
789	0	0					if ($count > 1) { # direct access to private attr
790	0						carp "[ORG] Note: more than one orthologue extracted from $acc;"
791							. " appending .$count to accession.";
792	0						$acc_u .= ".$count";
793							}
794
795							# add potential accession tail '+N' (if orthologues have been merged)
796	0	0					$acc_u .= $more if $more;
797
798							# fetch orthologous transcript and give it nearly complete full_id
799	0						my $transcript_seq
800							= $self->orthologous_seqs->get_seq_with_id($transcript_acc)->seq;
801	0	0	0				my $transcript_id
802							= $org =~ $PKEYONLY \|\| $org =~ $GCAONLY ? $org . '\|' . $acc_u
803							: $family . $tag . $org . '@' . $acc_u
804							;
805
806							# build tax_report line for transcript
807	0						my $tax_line = NewSeq->new(
808							seq_id => $transcript_id,
809							contam_org => $contam_org,
810							top_score => $top_score,
811							rel_n => $rel_n,
812							mean_len => $mean_len,
813							mean_ident => $mean_ident,
814							lca => $lca,
815							lineage => $lineage,
816							acc => $acc, # Note: not $acc_u here!
817							start => $start,
818							end => $end,
819							strand => $strand,
820							seq => $transcript_seq,
821							);
822
823	0						return $tax_line;
824							}
825
826
827							sub BUILD {
828	0			0	0		my $self = shift;
829
830	0						my $ap = $self->ali_proc;
831	0						my $rp = $ap->run_proc;
832
833							# optionally build _tax_filter from tax_filter (specs) and tax attrs
834							# Note: cannot use builder/coercion because of the need for a tax object
835	0	0					if ($self->tax_filter) {
836	0						my $tax = $rp->tax;
837	0	0					$self->_set_tax_filter( $tax->tax_filter( $self->tax_filter ) )
838							if $tax; # warning already issued at this stage
839							}
840
841							##### [ORG] homologues: display( $self->homologous_seqs->all_long_ids )
842	0	0					return unless $self->homologous_seqs->all_long_ids;
843
844							##### [ORG] orthologues: display( $self->orthologous_seqs->all_long_ids )
845	0	0					return unless $self->orthologous_seqs->all_long_ids;
846
847							##### [ORG] aligned: display( map { $_->full_id } $self->aligned_seqs->all_seq_ids )
848	0	0					return unless $self->aligned_seqs->all_seq_ids;
849
850							##### [ORG] Adding aligned seqs to file...
851	0						$ap->ali->add_seq( $self->aligned_seqs->all_seqs );
852
853	0						return;
854							}
855
856
857							__PACKAGE__->meta->make_immutable;
858							1;
859
860							__END__
861
862							=pod
863
864							=head1 NAME
865
866							Bio::MUST::Apps::FortyTwo::OrgProcessor - Internal class for forty-two tool
867
868							=head1 VERSION
869
870							version 0.210570
871
872							=head1 AUTHOR
873
874							Denis BAURAIN <denis.baurain@uliege.be>
875
876							=head1 COPYRIGHT AND LICENSE
877
878							This software is copyright (c) 2013 by University of Liege / Unit of Eukaryotic Phylogenomics / Denis BAURAIN.
879
880							This is free software; you can redistribute it and/or modify it under
881							the same terms as the Perl 5 programming language system itself.
882
883							=cut