File Coverage

blib/lib/Bio/MUST/Apps/FortyTwo/OrgProcessor.pm

Criterion	Covered	Total	%
statement	66	351	18.8
branch	0	132	0.0
condition	0	64	0.0
subroutine	22	30	73.3
pod	0	1	0.0
total	88	578	15.2

line	stmt	bran	cond	sub	pod	time	code
1							package Bio::MUST::Apps::FortyTwo::OrgProcessor;
2							# ABSTRACT: Internal class for forty-two tool
3							$Bio::MUST::Apps::FortyTwo::OrgProcessor::VERSION = '0.213470';
4	1			1		1045	use Moose;
	1					4
	1					10
5	1			1		7078	use namespace::autoclean;
	1					2
	1					15
6
7	1			1		107	use autodie;
	1					2
	1					11
8	1			1		4857	use feature qw(say);
	1					3
	1					115
9
10	1			1		8	use Smart::Comments -ENV;
	1					2
	1					12
11
12	1			1		5636	use Carp;
	1					4
	1					98
13	1			1		6	use Const::Fast;
	1					2
	1					13
14	1			1		68	use List::AllUtils qw(each_array pairmap);
	1					2
	1					47
15	1			1		6	use Path::Class qw(file);
	1					2
	1					37
16	1			1		6	use Tie::IxHash;
	1					2
	1					37
17
18	1			1		5	use Bio::MUST::Core;
	1					33
	1					26
19	1			1		6	use Bio::MUST::Core::Constants qw(:gaps :ncbi);
	1					2
	1					204
20	1			1		6	use Bio::MUST::Core::Utils qw(:filenames);
	1					2
	1					85
21	1			1		7	use Bio::MUST::Drivers;
	1					2
	1					23
22	1			1		5	use aliased 'Bio::MUST::Core::Ali';
	1					1
	1					9
23	1			1		249	use aliased 'Bio::MUST::Core::Ali::Temporary';
	1					2
	1					4
24	1			1		155	use aliased 'Bio::MUST::Core::IdList';
	1					18
	1					3
25	1			1		173	use aliased 'Bio::MUST::Core::Seq';
	1					2
	1					3
26	1			1		164	use aliased 'Bio::MUST::Core::SeqId';
	1					1
	1					4
27	1			1		159	use aliased 'Bio::MUST::Core::SeqMask';
	1					3
	1					3
28	1			1		184	use aliased 'Bio::MUST::Drivers::Cap3';
	1					3
	1					4
29	1			1		172	use aliased 'Bio::MUST::Apps::Debrief42::TaxReport::NewSeq';
	1					2
	1					4
30
31							with 'Bio::MUST::Apps::Roles::OrgProcable';
32
33
34							# org
35
36							# banks
37
38							# code
39
40
41							has 'ali_proc' => (
42							is => 'ro',
43							isa => 'Bio::MUST::Apps::FortyTwo::AliProcessor',
44							required => 1,
45							);
46
47
48							has 'tax_filter' => (
49							is => 'ro',
50							isa => 'Maybe[ArrayRef[Str]]',
51							);
52
53
54							has '_tax_filter' => (
55							is => 'ro',
56							isa => 'Maybe[Bio::MUST::Core::Taxonomy::Filter]',
57							init_arg => undef,
58							writer => '_set_tax_filter',
59							handles => [ qw(is_allowed) ],
60							);
61
62
63							has $_ . '_seqs' => (
64							is => 'ro',
65							isa => 'Bio::MUST::Core::Ali::Temporary',
66							init_arg => undef,
67							lazy => 1,
68							builder => '_build_' . $_ . '_seqs',
69							) for qw(homologous orthologous);
70
71
72							has 'aligned_seqs' => (
73							is => 'ro',
74							isa => 'Bio::MUST::Core::Ali',
75							init_arg => undef,
76							lazy => 1,
77							builder => '_build_aligned_seqs',
78							);
79
80
81							has '_' . $_ . '_para_scores' => (
82							traits => ['Hash'],
83							is => 'ro',
84							isa => 'HashRef[Num]',
85							init_arg => undef,
86							lazy => 1,
87							builder => '_build_' . $_ . '_scores',
88							handles => {
89							$_ . '_score_for' => 'get',
90							},
91							) for qw(para tol);
92
93
94							has '_count_for' => (
95							is => 'ro',
96							isa => 'HashRef[HashRef[Num]]',
97							init_arg => undef,
98							default => sub { {} },
99							);
100
101
102							## no critic (ProhibitUnusedPrivateSubroutines)
103
104							sub _build_homologous_seqs {
105	0			0			my $self = shift;
106
107							##### [ORG] Searching for homologues...
108
109	0						my $ap = $self->ali_proc;
110	0						my $rp = $ap->run_proc;
111
112	0		0				my $args = $rp->blast_args_for('homologues') // {};
113	0						$args->{-outfmt} = 6;
114							$args->{-max_target_seqs} = 10000
115	0	0					unless defined $args->{-max_target_seqs};
116
117	0						my @seqs;
118	0						for my $bank ($self->all_banks) {
119
120							###### [ORG] Processing BANK: $bank
121	0						my $query_seqs = $ap->query_seqs;
122	0						my $file = file( $rp->bank_dir, $bank );
123	0						my $blastdb = Bio::MUST::Drivers::Blast::Database->new( file => $file );
124	0	0	0				$args->{-db_gencode} = $self->code # if TBLASTN
125							if $query_seqs->type eq 'prot' && $blastdb->type eq 'nucl';
126	0						my $parser = $blastdb->blast($query_seqs, $args);
127							###### [ORG] TBLASTN (or BLASTP/N): $bank . q{ } . $parser->filename
128
129							# tie might help making 42 completely deterministic
130	0						tie my %mask_for, 'Tie::IxHash';
131	0						my %strand_for;
132
133							# collect hit ids, masks and strands covered by queries
134	0						$self->_collect_hsps(
135							parser => $parser,
136							mask_for => \%mask_for,
137							strand_for => \%strand_for,
138							);
139	0	0					$parser->remove unless $rp->debug_mode;
140
141							# fetch (and optionally trim) hits to their max range
142	0						my @hits = $self->_fetch_and_trim_hits(
143							blastdb => $blastdb,
144							mask_for => \%mask_for,
145							strand_for => \%strand_for,
146							);
147							######## [DEBUG] hits: display( map { $_->full_id . ' => ' . $_->seq } @hits )
148
149							# add seqs from current bank to homologous seqs
150	0						push @seqs, @hits;
151							}
152
153							# build BLAST query file from homologous seqs
154	0						return Temporary->new( seqs => \@seqs );
155							}
156
157
158							sub _build_orthologous_seqs {
159	0			0			my $self = shift;
160
161	0						my $ap = $self->ali_proc;
162	0						my $rp = $ap->run_proc;
163
164	0						my $seqs;
165
166	0	0					unless ($rp->ref_brh eq 'off') {
167							##### [ORG] Identifying orthologues among homologues...
168
169	0		0				my $args = $rp->blast_args_for('orthologues') // {};
170	0						$args->{-outfmt} = 6;
171	0						$args->{-max_target_seqs} = 1;
172
173							# tie might help making 42 completely deterministic
174	0						tie my %count_for, 'Tie::IxHash';
175	0						for my $ref_org ($ap->all_ref_orgs) {
176
177	0						my $homologous_seqs = $self->homologous_seqs;
178	0						my $blastdb = $rp->ref_blastdb_for($ref_org);
179	0	0	0				$args->{-query_gencode} = $self->code # if BLASTX
180							if $homologous_seqs->type eq 'nucl' && $blastdb->type eq 'prot';
181	0						my $parser = $blastdb->blast($homologous_seqs, $args);
182							##### [ORG] BLASTX (or BLASTP/N): $ref_org . q{ } . $parser->filename
183
184	0						my $best_hits = $ap->best_hits_for($ref_org);
185	0						while (my $hsp = $parser->next_query) {
186	0						my $candidate = $homologous_seqs->long_id_for( $hsp->query_id );
187	0						my $in_best_hits = $best_hits->is_listed( $hsp->hit_id );
188							######## [DEBUG]: $candidate . ( $in_best_hits && ' [=BRH=]' )
189	0	0					$count_for{$candidate}++ if $in_best_hits;
190							}
191	0	0					$parser->remove unless $rp->debug_mode;
192							}
193
194							######## [DEBUG] BRH counts: display( pairmap { "$a => $b" } %count_for )
195
196							# keep only homologues that are orthologous for all effective ref_orgs
197	0						my $ref_org_n = $ap->count_ref_orgs;
198							my $orthologues = IdList->new(
199	0						ids => [ grep { $count_for{$_} == $ref_org_n } keys %count_for ]
	0
200							);
201	0						$seqs = $orthologues->filtered_ali( $self->homologous_seqs );
202							}
203
204							else {
205							##### [ORG] Skipping orthology assessment!
206	0						$seqs = $self->homologous_seqs->seqs;
207							}
208
209							# optionally merge orthologues before aligning
210							# Note: this is always disabled in metagenomic mode
211	0	0					unless ($rp->run_mode eq 'metagenomic') { # TODO: warn user?
212	0	0					if ($rp->merge_orthologues eq 'on') {
213							##### [ORG] pre-merge orthologues: display( map { $_->full_id } $seqs->all_seq_ids )
214	0						$self->_compress_seqs($seqs);
215							}
216							}
217
218							# build BLAST query file from orthologous seqs
219	0						return Temporary->new( seqs => $seqs );
220							}
221
222
223							sub _build_para_scores {
224	0			0			my $self = shift;
225
226	0						my $ap = $self->ali_proc;
227	0						my $rp = $ap->run_proc;
228
229							# ignore if no .para file in use
230	0						my $blastdb = $ap->para_blastdb;
231	0	0					return unless $blastdb;
232
233	0		0				my $args = $rp->blast_args_for('templates') // {};
234	0						$args->{-outfmt} = 6;
235	0						$args->{-max_target_seqs} = 1;
236
237	0						my $orthologous_seqs = $self->orthologous_seqs;
238	0	0	0				$args->{-query_gencode} = $self->code # if BLASTX
239							if $orthologous_seqs->type eq 'nucl' && $blastdb->type eq 'prot';
240	0						my $parser = $blastdb->blast($orthologous_seqs, $args);
241							##### [ORG] BLASTX (or BLASTP/N): 'PARA ' . $parser->filename
242
243	0						my %para_score_for;
244	0						while (my $hsp = $parser->next_query) {
245	0						my $transcript_acc = $orthologous_seqs->long_id_for( $hsp->query_id );
246	0						my $hit_id = $blastdb->long_id_for( $hsp->hit_id );
247	0						my $score = $hsp->bit_score;
248							######## [DEBUG] orthologue: $transcript_acc
249							######## [DEBUG] PARA hit: $hit_id
250							######## [DEBUG] score: $score
251	0						$para_score_for{$transcript_acc} = $score;
252							}
253	0	0					$parser->remove unless $rp->debug_mode;
254
255	0						return \%para_score_for;
256							}
257
258
259							sub _build_tol_scores {
260	0			0			my $self = shift;
261
262	0						my $ap = $self->ali_proc;
263	0						my $rp = $ap->run_proc;
264
265							# ignore if no tol_check in use
266	0						my $blastdb = $rp->tol_blastdb;
267	0	0					return unless $blastdb;
268
269	0		0				my $args = $rp->blast_args_for('templates') // {};
270	0						$args->{-outfmt} = 5;
271	0						$args->{-max_target_seqs} = 250;
272
273	0						my $orthologous_seqs = $self->orthologous_seqs;
274	0	0	0				$args->{-query_gencode} = $self->code # if BLASTX
275							if $orthologous_seqs->type eq 'nucl' && $blastdb->type eq 'prot';
276	0						my $parser = $blastdb->blast($orthologous_seqs, $args);
277							##### [ORG] XML BLASTX (or BLASTP/N): 'TOL ' . $parser->filename
278
279							# abort if no hit
280	0						my $bo = $parser->blast_output;
281	0	0					return unless $bo;
282
283	0						my %tol_score_for;
284
285							ORTHOLOGUE:
286	0						for my $orthologue ($bo->all_iterations) {
287	0	0					next ORTHOLOGUE unless $orthologue->count_hits;
288							# Note: this should never happen...
289
290	0						my $query_def = $orthologue->query_def;
291	0						my $transcript_acc = $orthologous_seqs->long_id_for($query_def);
292							######## [DEBUG] orthologue: $transcript_acc
293
294							TOL_HIT:
295	0						for my $hit ($orthologue->all_hits) {
296
297							# check taxonomy of hit: skip SELF-like hits
298	0						my $hit_id = SeqId->new( full_id => $hit->id );
299							######## [DEBUG] TOL hit: $hit_id->full_id
300	0	0					next TOL_HIT if $self->is_allowed($hit_id);
301
302							# fetch score for first non-SELF-like hit
303	0						my $score = $hit->get_hsp(0)->bit_score;
304							######## [DEBUG] score: $score
305	0						$tol_score_for{$transcript_acc} = $score;
306	0						next ORTHOLOGUE;
307							}
308							}
309	0	0					$parser->remove unless $rp->debug_mode;
310
311	0						return \%tol_score_for;
312							}
313
314
315							sub _build_aligned_seqs { ## no critic (ProhibitExcessComplexity)
316	0			0			my $self = shift;
317
318							##### [ORG] Aligning orthologues...
319
320	0						my $ap = $self->ali_proc;
321	0						my $rp = $ap->run_proc;
322
323	0		0				my $args = $rp->blast_args_for('templates') // {};
324	0						$args->{-outfmt} = 5;
325	0						$args->{-max_target_seqs} = $rp->tax_max_hits * 2;
326
327	0						my $orthologous_seqs = $self->orthologous_seqs;
328	0						my $blastdb = $ap->blastdb;
329	0	0	0				$args->{-query_gencode} = $self->code # if BLASTX
330							if $orthologous_seqs->type eq 'nucl' && $blastdb->type eq 'prot';
331	0						my $parser = $blastdb->blast($orthologous_seqs, $args);
332							##### [ORG] XML BLASTX (or BLASTP/N): $parser->filename
333
334	0						my $aligned_seqs = Ali->new();
335
336							# abort if no hit
337	0						my $bo = $parser->blast_output;
338	0	0					return $aligned_seqs unless $bo;
339
340							ORTHOLOGUE:
341	0						for my $orthologue ($bo->all_iterations) {
342	0	0					unless ($orthologue->count_hits) {
343							###### [ORG] skipped orthologue due to lack of significant template
344	0						next ORTHOLOGUE;
345							} # TODO: investigate why this should happen at all...
346
347							# fetch tax_line from orthologue
348							# Note: .para check is also done in this sub (hence the next below)
349	0						my $tax_line = $self->_fetch_tax_line_for_transcript($orthologue);
350	0	0					next ORTHOLOGUE unless $tax_line;
351
352							# extract contam_org and transcript_id from tax_line...
353	0						my $contam_org = $tax_line->contam_org;
354	0						my $transcript_id = $tax_line->seq_id . '%s'; # chunk placeholder
355	0	0					$transcript_id .= "...$contam_org" if $contam_org;
356	0						$transcript_id .= '#NEW#';
357
358							# ... and use it "as is" to store tax_line in tax_report (no chunk tag)
359	0						my $tax_line_id = sprintf $transcript_id, q{};
360	0						$ap->set_tax_line( $tax_line_id => $tax_line );
361
362							# extract transcript_seq from tax_line
363	0						my $transcript_seq = Seq->new(
364							seq_id => $tax_line_id, # same id without alignment
365							seq => $tax_line->seq
366							);
367
368							# skip both orthologue alignment and integration if metagenomic mode
369	0	0					next ORTHOLOGUE if $rp->run_mode eq 'metagenomic';
370
371							# optionally add transcript_seq as is (without alignment)
372	0	0					if ($rp->aligner_mode eq 'off') {
373
374							####### [ORG] Adding: $transcript_seq->full_id
375
376	0						$aligned_seqs->add_seq($transcript_seq);
377	0						next ORTHOLOGUE;
378							}
379
380							# build a template_seq list as long as query coverage improves
381							# exonerate will try to use the longest template for alignment
382							# while BLAST will use each hit in turn (possibly as a fall-back)
383	0						my @templates;
384							my @template_seqs;
385	0						my $best_coverage = 0;
386
387							TEMPLATE:
388	0						for my $template ($orthologue->all_hits) {
389
390							# fetch template full_id
391	0						my $template_id = SeqId->new(
392							full_id => $blastdb->long_id_for($template->def)
393							);
394
395							# optionally skip template if from same org as the orthologue
396	0	0					if ($rp->ali_skip_self eq 'on') {
397	0	0					if ($template_id->full_org eq $transcript_seq->full_org) {
398							###### [ORG] skipped same-org template due to ali_skip_self
399	0						next TEMPLATE;
400							}
401							}
402
403							# compute query coverage by template HSPs
404	0						my $mask = SeqMask->empty_mask( $orthologue->query_len );
405							$mask->mark_block($_->query_start, $_->query_end)
406	0						for $template->all_hsps;
407	0						my $coverage = $mask->coverage;
408
409	0	0					last TEMPLATE if $coverage
410							< $best_coverage * $rp->ali_cover_mul;
411	0						$best_coverage = $coverage;
412
413							######## [DEBUG] template: $template_id->full_id
414							######## [DEBUG] coverage: sprintf '%.2f', $coverage
415
416							# fetch and cache aligned template seq from Ali
417							# Note: we emulate a fast get_seq_with_id using Ali lookup
418	0						push @templates, $template;
419	0						push @template_seqs, $ap->ali->get_seq(
420							$ap->lookup->index_for( $template_id->full_id )
421							);
422							}
423
424							# Note: as we (optionally) skip templates, we may run out of templates
425	0	0					unless (@templates) {
426							###### [ORG] skipped orthologue due to lack of suitable template
427	0						next ORTHOLOGUE;
428							}
429
430							# flag tracking exonerate failure (for exoblast mode)
431	0						my $failure;
432
433	0	0					if ($rp->aligner_mode =~ m/exonerate\|exoblast/xms) {
434
435							# use longest template for exonerate alignment
436	0						my $template_seq = $template_seqs[-1];
437
438							# align transcript on template protein and get its translation
439							# Note: in exonerate the translated dna_seq is called 'target_seq'
440	0						my $exo = Bio::MUST::Drivers::Exonerate::Aligned->new(
441							dna_seq => $transcript_seq,
442							pep_seq => $template_seq->clone->degap,
443							code => $self->code,
444							);
445
446							# get new_seq from exonerate report
447							# build new_id from transcript_id and exonerate model used
448	0						my $new_id = sprintf $transcript_id, '.E.' . $exo->model;
449	0						my $new_seq = $exo->target_seq;
450	0						$new_seq->set_seq_id($new_id);
451
452							####### [ORG] Adding: $new_seq->full_id
453
454	0	0					unless ($new_seq->seq_len) {
455	0	0					my $fate = $rp->aligner_mode eq 'exoblast'
456							? 'will retry with BLAST' : 'discarded'
457							;
458							####### [ORG] exonerate failure: $fate
459	0						$failure = 1;
460							}
461
462							else {
463							# align new_seq on template_seq using subject_seq as a guide
464	0						$aligned_seqs->add_seq(
465							$ap->integrator->align(
466							new_seq => $new_seq,
467							subject => $exo->query_seq,
468							template => $template_seq,
469							start => $exo->query_start,
470							)
471							);
472
473							######## [DEBUG] aligned with exonerate model: $exo->model
474							}
475							}
476
477	0	0	0				if ( ($rp->aligner_mode eq 'blast')
			0
478							\|\| ($rp->aligner_mode eq 'exoblast' && $failure) ) {
479
480							TEMPLATE:
481	0						for my $template (@templates) {
482
483							# use each template in turn for BLAST alignment
484	0						my $template_seq = shift @template_seqs;
485	0	0					last TEMPLATE unless $template_seq;
486
487							HSP:
488	0						for my $hsp ($template->all_hsps) {
489
490							# build HSP id from transcript_id and template/HSP ranks
491	0						my $hsp_id = sprintf $transcript_id,
492							'.H' . $template->num . '.' . $hsp->num;
493
494							# build HSP seq from BLASTX report
495	0						(my $hsp_seq = $hsp->qseq) =~ s{\*}{$FRAMESHIFT}xmsg;
496	0						my $new_seq = Seq->new(
497							seq_id => $hsp_id,
498							seq => $hsp_seq
499							);
500
501							# fetch aligned template seq from HSP (= subject)
502	0						my $subject_seq = Seq->new(
503							seq_id => $template_seq->seq_id,
504							seq => $hsp->hseq
505							);
506
507							# reverse complement seqs if template on reverse in BLASTN
508	0	0	0				if ($ap->query_seqs->type eq 'nucl'
			0
509							&& $blastdb->type eq 'nucl'
510							&& $hsp->hit_strand == -1) {
511	0						$new_seq = $new_seq->reverse_complemented_seq;
512	0						$subject_seq = $subject_seq->reverse_complemented_seq;
513							}
514
515							####### [ORG] Adding: $new_seq->full_id
516
517							# align new_seq on template_seq using subject_seq as a guide
518							$aligned_seqs->add_seq(
519	0						$ap->integrator->align(
520							new_seq => $new_seq,
521							subject => $subject_seq,
522							template => $template_seq,
523							start => $hsp->hit_start,
524							)
525							);
526
527							######## [DEBUG] aligned with BLAST
528							}
529							}
530							}
531							}
532
533	0	0					$parser->remove unless $rp->debug_mode;
534
535	0						return $aligned_seqs;
536							}
537
538							## use critic
539
540
541							sub _compress_seqs {
542	0			0			my $self = shift;
543	0						my $ali = shift;
544
545	0						my $rp = $self->ali_proc->run_proc;
546
547							# first fetch all seqs...
548							# ... then clear existing seqs
549	0						my @seqs2cap = $ali->all_seqs;
550	0						$ali->_set_seqs( [] );
551
552							# proceed only if at least one seq
553	0	0					return unless @seqs2cap;
554
555							# Note: this might seem sub-optimal in case of singletons
556							# but we follow as closely as possible the logic of compress-db.pl
557
558							# proceed only if at least two seqs
559							# otherwise add lone seq
560	0	0					if (@seqs2cap < 2) {
561	0						$ali->add_seq( shift @seqs2cap );
562	0						return;
563							}
564
565							# TODO: add debugging comments?
566
567							# try to cap seqs
568	0						my $cap = Cap3->new(
569							seqs => \@seqs2cap,
570							cap3_args => {
571							-p => $rp->merge_min_ident * 100.0, # CAP3 expects percents
572							-o => $rp->merge_min_len,
573							},
574							);
575
576							# add singlet seqs
577	0						my @singlets = $cap->all_singlets;
578	0						$ali->add_seq($_) for @singlets;
579
580							# proceed only if contigs of seqs
581	0						my @contigs = $cap->all_contigs;
582	0	0					return unless @contigs;
583
584	0						for my $contig (@contigs) {
585	0						my @ids = map { $_->full_id }
586	0						@{ $cap->seq_ids_for( $contig->full_id ) };
	0
587	0						my $contig_id = shift(@ids) . ':::+' . @ids;
588	0						my $contig_seq = $contig->seq;
589
590							# add contig seq
591	0						$ali->add_seq(
592							Seq->new( seq_id => $contig_id, seq => $contig_seq )
593							);
594							}
595
596							# Note: we do not need to return anything as we modified $ali in place
597	0						return;
598							}
599
600
601							sub _fetch_tax_line_for_transcript {
602	0			0			my $self = shift;
603	0						my $orthologue = shift;
604
605	0						my $ap = $self->ali_proc;
606	0						my $rp = $ap->run_proc;
607
608							# fetch transcript accession (= query id) from BLAST report
609	0						my $query_def = $orthologue->query_def;
610	0						my $transcript_acc = $self->orthologous_seqs->long_id_for($query_def);
611
612							# extract range and strand (if any) from transcript accession...
613							# Note: this mostly makes sense for genomic contigs/scaffolds
614							# ... and extract potential tail '+N' (if orthologues have been merged)
615							# Note: this looks convoluted but it has to work in many different setups
616							# depending on the value of trim_homologues and merge_orthologues:
617							# - off/off: acc
618							# - off/on: acc:::+N
619							# - on /off: acc:::start:::end:::strand
620							# - on /on: acc:::start:::end:::strand:::+N
621	0						my @fields = split /:::/xms, $transcript_acc;
622	0						my ($strip_acc, $start, $end, $strand, $more) = @fields;
623	0	0	0				($more, $start) = ($start, undef)
			0
624							if !defined $end && defined $start && $start =~ m/\A \+\d+ \z/xms;
625
626							# set acc to transcript accession
627	0						my ($first_chunk) = $strip_acc =~ m/^(\S+)/xms;
628	0						my @parts = split /\\|/xms, $first_chunk;
629	0						my $acc = pop @parts;
630							###### [ORG] orthologous transcript: $acc
631
632							# fetch score for best template
633	0						my $temp_score = $orthologue->get_hit(0)->get_hsp(0)->bit_score;
634
635							# check again for paralogy if .para file in use
636	0	0					if ($ap->para_blastdb) {
637	0		0				my $para_score = $self->para_score_for($transcript_acc) // 0;
638	0	0					if ($para_score > $temp_score) {
639							###### [ORG] rejected for PARA[logy]: "$para_score > $temp_score"
640	0						return;
641							}
642							}
643
644							# check for contamination if tol_check in use
645	0	0					if ($rp->tol_blastdb ) {
646	0		0				my $tol_score = $self->tol_score_for($transcript_acc) // 0;
647	0	0					if ($tol_score > $temp_score) {
648							###### [ORG] rejected due to TOL check: "$tol_score > $temp_score"
649	0						return;
650							}
651							}
652
653							# prebuild SeqId list for all templates
654							# this is needed for family affiliation (first only)
655							# ... and for assessing template taxonomy (all)
656	0						my $blastdb = $self->ali_proc->blastdb;
657	0						my @templates = $orthologue->all_hits;
658							my @template_ids = map {
659	0						SeqId->new( full_id => $blastdb->long_id_for( $_->def ) )
	0
660							} @templates;
661
662							# set family to first template family (if any)
663	0		0				my $family = $template_ids[0]->family // q{};
664	0	0					$family .= '-' if $family;
665
666							# set org to current (bank) org
667	0						my $org = $self->org;
668	0						my $contam_org = q{};
669
670							# setup tax_report data
671	0						my $common_tax;
672							my $lca;
673	0						my $lineage;
674	0						my $tag = q{};
675	0						my $top_score = 0;
676	0						my $rel_n = 0;
677	0						my $mean_len;
678							my $mean_ident;
679
680							# optionally analyze template taxonomy
681							# ... to build tax_reports of additions
682							# ... to tag potential contaminations
683	0	0	0				if ( $rp->tax_reports eq 'on' \|\| $self->_tax_filter ) {
684	0						my @relatives;
685	0						my $best_score = 0;
686	0						my $sum_len = 0;
687	0						my $sum_ident = 0;
688
689	0						my $tax = $rp->tax;
690	0						my $ea = each_array @templates, @template_ids;
691
692							# accumulate relatives for LCA inference
693							# Note: three logical modes can be distinguished.
694							# These should be implemented in the config file.
695							# 1. best-hit
696							# - tax_max_hits: 1
697							# - filter on tax_min_score or tax_min_len / tax_min_ident
698							# 2. classic LCA
699							# - tax_max_hits: to be specified
700							# - filter on tax_min_score or tax_min_len / tax_min_ident
701							# 3. MEGAN-like LCA
702							# - tax_max_hits: should be ignored
703							# - filter on tax_min_score / tax_score_mul
704							# however all criteria can be simultaneously activated
705
706							REL:
707	0						while ( my ($template, $template_id) = $ea->() ) {
708
709							# accumulate at most tax_max_hits relatives for LCA inference
710	0	0					last REL if @relatives >= $rp->tax_max_hits;
711
712							# ensure that match to template is good enough...
713							# by default all these thresholds are disabled
714	0						my $hsp = $template->get_hsp(0);
715
716							# ... MEGAN-like mode (and top_score needed for one-on-one)
717	0						my $score = $hsp->bit_score;
718	0						$top_score = List::AllUtils::max($top_score, $score);
719							# Note: we do not use last due to non-monotonic score decrease
720
721							# no need to accumulate relatives if no NCBI Taxonomy is available
722	0	0					last REL unless $tax;
723
724	0	0					next REL if $score < $rp->tax_min_score;
725	0	0					next REL if $score < $rp->tax_score_mul * $best_score;
726
727							# ... classic or best-hit mode
728	0						my $len = $hsp->align_len;
729	0	0					next REL if $len < $rp->tax_min_len;
730	0						my $ident = $hsp->identity / $len;
731	0	0					next REL if $ident < $rp->tax_min_ident;
732
733							# try to determine taxonomy of template
734							# if possible add template to list of relatives...
735							# ... and set best_score for MEGAN-like mode
736	0						my @taxonomy = $tax->fetch_lineage($template_id);
737	0	0					if (@taxonomy) {
738							# Note: contrary to ref_score_mul the best_score never changes
739	0		0				$best_score \|\|= $score;
740	0						$sum_len += $len;
741	0						$sum_ident += $ident;
742	0						push @relatives, \@taxonomy;
743							######## [DEBUG] hit: $template_id->full_id
744							######## [DEBUG] length: $len
745							######## [DEBUG] identity: sprintf '%.2f', $ident
746							######## [DEBUG] lineage: join '; ', @taxonomy
747							}
748							}
749
750							######## [DEBUG] relatives: display( map { join '; ', @$_ } @relatives )
751
752	0	0					if (@relatives < $rp->tax_min_hits) {
753							###### [ORG] tagged as unclassified due to lack of relatives
754	0						$common_tax = [ 'unclassified sequences' ];
755	0	0					if ($self->_tax_filter) {
756	0						$tag = 'u#';
757							}
758							}
759
760							else {
761							# infer LCA from all relatives...
762	0						$common_tax = $tax->compute_lca(@relatives);
763	0						$lineage = join '; ', @{$common_tax};
	0
764							######## [DEBUG] LCA lineage: $lineage
765	0						$lca = $common_tax->[-1];
766							###### [ORG] affiliated to: $lca
767
768	0	0					if ($rp->tax_reports eq 'on') {
769	0						$rel_n = @relatives;
770	0						$mean_len = sprintf '%.2f', $sum_len / $rel_n;
771	0						$mean_ident = sprintf '%.2f', $sum_ident / $rel_n;
772							}
773
774							# optionally tag potential contamination based on LCA
775							# Note: LCAs are used "as is" because is_allowed handles them
776							# Note: tol_check 'on' disables positive taxonomic filtering
777	0	0	0				if ($self->_tax_filter && $rp->tol_check eq 'off') {
778							######## [DEBUG] pass tax_filter: $self->is_allowed($common_tax)
779	0	0					unless ( $self->is_allowed($common_tax) ) {
780							###### [ORG] tagged as contaminated
781	0						$tag = 'c#';
782							$contam_org = join '_',
783	0		0				map { $_ // () } SeqId->parse_ncbi_name($lca);
	0
784							} # delete undef species/strains
785							}
786							}
787							}
788
789							# check for multiple orthologues derived from the same transcript
790							# Note: this mostly makes sense for genomic contigs/scaffolds
791	0						my $acc_u = $acc;
792	0						my $count = ++$self->_count_for->{$org}{$acc};
793	0	0					if ($count > 1) { # direct access to private attr
794	0						carp "[ORG] Note: more than one orthologue extracted from $acc;"
795							. " appending .$count to accession.";
796	0						$acc_u .= ".$count";
797							}
798
799							# add potential accession tail '+N' (if orthologues have been merged)
800	0	0					$acc_u .= $more if $more;
801
802							# fetch orthologous transcript and give it nearly complete full_id
803	0						my $transcript_seq
804							= $self->orthologous_seqs->get_seq_with_id($transcript_acc)->seq;
805	0	0	0				my $transcript_id
806							= $org =~ $PKEYONLY \|\| $org =~ $GCAONLY ? $org . '\|' . $acc_u
807							: $family . $tag . $org . '@' . $acc_u
808							;
809
810							# build tax_report line for transcript
811	0						my $tax_line = NewSeq->new(
812							seq_id => $transcript_id,
813							contam_org => $contam_org,
814							top_score => $top_score,
815							rel_n => $rel_n,
816							mean_len => $mean_len,
817							mean_ident => $mean_ident,
818							lca => $lca,
819							lineage => $lineage,
820							acc => $acc, # Note: not $acc_u here!
821							start => $start,
822							end => $end,
823							strand => $strand,
824							seq => $transcript_seq,
825							);
826
827	0						return $tax_line;
828							}
829
830
831							sub BUILD {
832	0			0	0		my $self = shift;
833
834	0						my $ap = $self->ali_proc;
835	0						my $rp = $ap->run_proc;
836
837							# optionally build _tax_filter from tax_filter (specs) and tax attrs
838							# Note: cannot use builder/coercion because of the need for a tax object
839	0	0					if ($self->tax_filter) {
840	0						my $tax = $rp->tax;
841	0	0					$self->_set_tax_filter( $tax->tax_filter( $self->tax_filter ) )
842							if $tax; # warning already issued at this stage
843							}
844
845							##### [ORG] homologues: display( $self->homologous_seqs->all_long_ids )
846	0	0					return unless $self->homologous_seqs->all_long_ids;
847
848							##### [ORG] orthologues: display( $self->orthologous_seqs->all_long_ids )
849	0	0					return unless $self->orthologous_seqs->all_long_ids;
850
851							##### [ORG] aligned: display( map { $_->full_id } $self->aligned_seqs->all_seq_ids )
852	0	0					return unless $self->aligned_seqs->all_seq_ids;
853
854							##### [ORG] Adding aligned seqs to file...
855	0						$ap->ali->add_seq( $self->aligned_seqs->all_seqs );
856
857	0						return;
858							}
859
860
861							__PACKAGE__->meta->make_immutable;
862							1;
863
864							__END__
865
866							=pod
867
868							=head1 NAME
869
870							Bio::MUST::Apps::FortyTwo::OrgProcessor - Internal class for forty-two tool
871
872							=head1 VERSION
873
874							version 0.213470
875
876							=head1 AUTHOR
877
878							Denis BAURAIN <denis.baurain@uliege.be>
879
880							=head1 COPYRIGHT AND LICENSE
881
882							This software is copyright (c) 2013 by University of Liege / Unit of Eukaryotic Phylogenomics / Denis BAURAIN.
883
884							This is free software; you can redistribute it and/or modify it under
885							the same terms as the Perl 5 programming language system itself.
886
887							=cut