| line | stmt | bran | cond | sub | pod | time | code | 
| 1 |  |  |  |  |  |  | package App::Sandy::Simulator; | 
| 2 |  |  |  |  |  |  | # ABSTRACT: Class responsible to make the simulation | 
| 3 |  |  |  |  |  |  |  | 
| 4 | 6 |  |  | 6 |  | 97186 | use App::Sandy::Base 'class'; | 
|  | 6 |  |  |  |  | 17 |  | 
|  | 6 |  |  |  |  | 64 |  | 
| 5 | 6 |  |  | 6 |  | 131 | use App::Sandy::Seq::SingleEnd; | 
|  | 6 |  |  |  |  | 12 |  | 
|  | 6 |  |  |  |  | 270 |  | 
| 6 | 6 |  |  | 6 |  | 40 | use App::Sandy::Seq::PairedEnd; | 
|  | 6 |  |  |  |  | 864 |  | 
|  | 6 |  |  |  |  | 265 |  | 
| 7 | 6 |  |  | 6 |  | 2885 | use App::Sandy::InterlaceProcesses; | 
|  | 6 |  |  |  |  | 2311 |  | 
|  | 6 |  |  |  |  | 241 |  | 
| 8 | 6 |  |  | 6 |  | 532 | use App::Sandy::RNG; | 
|  | 6 |  |  |  |  | 13 |  | 
|  | 6 |  |  |  |  | 394 |  | 
| 9 | 6 |  |  | 6 |  | 2965 | use App::Sandy::WeightedRaffle; | 
|  | 6 |  |  |  |  | 2117 |  | 
|  | 6 |  |  |  |  | 261 |  | 
| 10 | 6 |  |  | 6 |  | 496 | use App::Sandy::PieceTable; | 
|  | 6 |  |  |  |  | 541 |  | 
|  | 6 |  |  |  |  | 175 |  | 
| 11 | 6 |  |  | 6 |  | 3214 | use App::Sandy::DB::Handle::Expression; | 
|  | 6 |  |  |  |  | 2038 |  | 
|  | 6 |  |  |  |  | 240 |  | 
| 12 | 6 |  |  | 6 |  | 3051 | use App::Sandy::DB::Handle::Variation; | 
|  | 6 |  |  |  |  | 2317 |  | 
|  | 6 |  |  |  |  | 243 |  | 
| 13 | 6 |  |  | 6 |  | 51 | use List::Util 'min'; | 
|  | 6 |  |  |  |  | 12 |  | 
|  | 6 |  |  |  |  | 532 |  | 
| 14 | 6 |  |  | 6 |  | 3651 | use File::Cat 'cat'; | 
|  | 6 |  |  |  |  | 3364 |  | 
|  | 6 |  |  |  |  | 460 |  | 
| 15 | 6 |  |  | 6 |  | 4248 | use Parallel::ForkManager; | 
|  | 6 |  |  |  |  | 341540 |  | 
|  | 6 |  |  |  |  | 56536 |  | 
| 16 |  |  |  |  |  |  |  | 
| 17 |  |  |  |  |  |  | with qw/App::Sandy::Role::IO App::Sandy::Role::SeqID/; | 
| 18 |  |  |  |  |  |  |  | 
| 19 |  |  |  |  |  |  | our $VERSION = '0.25'; # VERSION | 
| 20 |  |  |  |  |  |  |  | 
| 21 |  |  |  |  |  |  | has 'argv' => ( | 
| 22 |  |  |  |  |  |  | is       => 'ro', | 
| 23 |  |  |  |  |  |  | isa      => 'ArrayRef[Str]', | 
| 24 |  |  |  |  |  |  | required => 1 | 
| 25 |  |  |  |  |  |  | ); | 
| 26 |  |  |  |  |  |  |  | 
| 27 |  |  |  |  |  |  | has 'truncate' => ( | 
| 28 |  |  |  |  |  |  | is       => 'ro', | 
| 29 |  |  |  |  |  |  | isa      => 'Bool', | 
| 30 |  |  |  |  |  |  | required => 1 | 
| 31 |  |  |  |  |  |  | ); | 
| 32 |  |  |  |  |  |  |  | 
| 33 |  |  |  |  |  |  | has 'seed' => ( | 
| 34 |  |  |  |  |  |  | is        => 'ro', | 
| 35 |  |  |  |  |  |  | isa       => 'Int', | 
| 36 |  |  |  |  |  |  | required  => 1 | 
| 37 |  |  |  |  |  |  | ); | 
| 38 |  |  |  |  |  |  |  | 
| 39 |  |  |  |  |  |  | has 'jobs' => ( | 
| 40 |  |  |  |  |  |  | is         => 'ro', | 
| 41 |  |  |  |  |  |  | isa        => 'My:IntGt0', | 
| 42 |  |  |  |  |  |  | required   => 1 | 
| 43 |  |  |  |  |  |  | ); | 
| 44 |  |  |  |  |  |  |  | 
| 45 |  |  |  |  |  |  | has 'prefix' => ( | 
| 46 |  |  |  |  |  |  | is         => 'ro', | 
| 47 |  |  |  |  |  |  | isa        => 'Str', | 
| 48 |  |  |  |  |  |  | required   => 1 | 
| 49 |  |  |  |  |  |  | ); | 
| 50 |  |  |  |  |  |  |  | 
| 51 |  |  |  |  |  |  | has 'join_paired_ends' => ( | 
| 52 |  |  |  |  |  |  | is         => 'ro', | 
| 53 |  |  |  |  |  |  | isa        => 'Bool', | 
| 54 |  |  |  |  |  |  | required   => 1 | 
| 55 |  |  |  |  |  |  | ); | 
| 56 |  |  |  |  |  |  |  | 
| 57 |  |  |  |  |  |  | has 'output_format' => ( | 
| 58 |  |  |  |  |  |  | is          => 'ro', | 
| 59 |  |  |  |  |  |  | isa         => 'My:Format', | 
| 60 |  |  |  |  |  |  | required    => 1 | 
| 61 |  |  |  |  |  |  | ); | 
| 62 |  |  |  |  |  |  |  | 
| 63 |  |  |  |  |  |  | has 'compression_level' => ( | 
| 64 |  |  |  |  |  |  | is          => 'ro', | 
| 65 |  |  |  |  |  |  | isa         => 'My:Level', | 
| 66 |  |  |  |  |  |  | required    => 1 | 
| 67 |  |  |  |  |  |  | ); | 
| 68 |  |  |  |  |  |  |  | 
| 69 |  |  |  |  |  |  | has 'fasta_file' => ( | 
| 70 |  |  |  |  |  |  | is         => 'ro', | 
| 71 |  |  |  |  |  |  | isa        => 'My:Fasta', | 
| 72 |  |  |  |  |  |  | required   => 1 | 
| 73 |  |  |  |  |  |  | ); | 
| 74 |  |  |  |  |  |  |  | 
| 75 |  |  |  |  |  |  | has 'coverage' => ( | 
| 76 |  |  |  |  |  |  | is         => 'ro', | 
| 77 |  |  |  |  |  |  | isa        => 'My:NumGt0', | 
| 78 |  |  |  |  |  |  | required   => 0 | 
| 79 |  |  |  |  |  |  | ); | 
| 80 |  |  |  |  |  |  |  | 
| 81 |  |  |  |  |  |  | has 'number_of_reads' => ( | 
| 82 |  |  |  |  |  |  | is         => 'ro', | 
| 83 |  |  |  |  |  |  | isa        => 'My:IntGt0', | 
| 84 |  |  |  |  |  |  | required   => 0 | 
| 85 |  |  |  |  |  |  | ); | 
| 86 |  |  |  |  |  |  |  | 
| 87 |  |  |  |  |  |  | has 'count_loops_by' => ( | 
| 88 |  |  |  |  |  |  | is         => 'ro', | 
| 89 |  |  |  |  |  |  | isa        => 'My:CountLoopBy', | 
| 90 |  |  |  |  |  |  | required   => 1 | 
| 91 |  |  |  |  |  |  | ); | 
| 92 |  |  |  |  |  |  |  | 
| 93 |  |  |  |  |  |  | has 'strand_bias' => ( | 
| 94 |  |  |  |  |  |  | is         => 'ro', | 
| 95 |  |  |  |  |  |  | isa        => 'My:StrandBias', | 
| 96 |  |  |  |  |  |  | required   => 1 | 
| 97 |  |  |  |  |  |  | ); | 
| 98 |  |  |  |  |  |  |  | 
| 99 |  |  |  |  |  |  | has 'seqid_weight' => ( | 
| 100 |  |  |  |  |  |  | is         => 'ro', | 
| 101 |  |  |  |  |  |  | isa        => 'My:SeqIdWeight', | 
| 102 |  |  |  |  |  |  | required   => 1 | 
| 103 |  |  |  |  |  |  | ); | 
| 104 |  |  |  |  |  |  |  | 
| 105 |  |  |  |  |  |  | has 'expression_matrix' => ( | 
| 106 |  |  |  |  |  |  | is         => 'ro', | 
| 107 |  |  |  |  |  |  | isa        => 'Str', | 
| 108 |  |  |  |  |  |  | required   => 0 | 
| 109 |  |  |  |  |  |  | ); | 
| 110 |  |  |  |  |  |  |  | 
| 111 |  |  |  |  |  |  | has 'genomic_variation' => ( | 
| 112 |  |  |  |  |  |  | is         => 'ro', | 
| 113 |  |  |  |  |  |  | isa        => 'ArrayRef[Str]', | 
| 114 |  |  |  |  |  |  | required   => 0 | 
| 115 |  |  |  |  |  |  | ); | 
| 116 |  |  |  |  |  |  |  | 
| 117 |  |  |  |  |  |  | has '_genomic_variation_names' => ( | 
| 118 |  |  |  |  |  |  | is         => 'ro', | 
| 119 |  |  |  |  |  |  | isa        => 'Maybe[Str]', | 
| 120 |  |  |  |  |  |  | builder    => '_build_genomic_variation_names', | 
| 121 |  |  |  |  |  |  | lazy_build => 1 | 
| 122 |  |  |  |  |  |  | ); | 
| 123 |  |  |  |  |  |  |  | 
| 124 |  |  |  |  |  |  | has 'seq' => ( | 
| 125 |  |  |  |  |  |  | is         => 'ro', | 
| 126 |  |  |  |  |  |  | isa        => 'App::Sandy::Seq::SingleEnd | App::Sandy::Seq::PairedEnd', | 
| 127 |  |  |  |  |  |  | required   => 1, | 
| 128 |  |  |  |  |  |  | handles    => [ qw{ sprint_seq gen_sam_header gen_eof_marker } ] | 
| 129 |  |  |  |  |  |  | ); | 
| 130 |  |  |  |  |  |  |  | 
| 131 |  |  |  |  |  |  | has '_fasta' => ( | 
| 132 |  |  |  |  |  |  | is         => 'ro', | 
| 133 |  |  |  |  |  |  | isa        => 'My:IdxFasta', | 
| 134 |  |  |  |  |  |  | builder    => '_build_fasta', | 
| 135 |  |  |  |  |  |  | lazy_build => 1 | 
| 136 |  |  |  |  |  |  | ); | 
| 137 |  |  |  |  |  |  |  | 
| 138 |  |  |  |  |  |  | has '_fasta_tree' => ( | 
| 139 |  |  |  |  |  |  | traits     => ['Hash'], | 
| 140 |  |  |  |  |  |  | is         => 'ro', | 
| 141 |  |  |  |  |  |  | isa        => 'HashRef[ArrayRef]', | 
| 142 |  |  |  |  |  |  | default    => sub { {} }, | 
| 143 |  |  |  |  |  |  | handles    => { | 
| 144 |  |  |  |  |  |  | _set_fasta_tree    => 'set', | 
| 145 |  |  |  |  |  |  | _get_fasta_tree    => 'get', | 
| 146 |  |  |  |  |  |  | _exists_fasta_tree => 'exists', | 
| 147 |  |  |  |  |  |  | _fasta_tree_pairs  => 'kv', | 
| 148 |  |  |  |  |  |  | _has_no_fasta_tree => 'is_empty' | 
| 149 |  |  |  |  |  |  | } | 
| 150 |  |  |  |  |  |  | ); | 
| 151 |  |  |  |  |  |  |  | 
| 152 |  |  |  |  |  |  | has '_fasta_rtree' => ( | 
| 153 |  |  |  |  |  |  | traits     => ['Hash'], | 
| 154 |  |  |  |  |  |  | is         => 'ro', | 
| 155 |  |  |  |  |  |  | isa        => 'HashRef[Str]', | 
| 156 |  |  |  |  |  |  | default    => sub { {} }, | 
| 157 |  |  |  |  |  |  | handles    => { | 
| 158 |  |  |  |  |  |  | _set_fasta_rtree    => 'set', | 
| 159 |  |  |  |  |  |  | _get_fasta_rtree    => 'get', | 
| 160 |  |  |  |  |  |  | _delete_fasta_rtree => 'delete', | 
| 161 |  |  |  |  |  |  | _exists_fasta_rtree => 'exists', | 
| 162 |  |  |  |  |  |  | _fasta_rtree_pairs  => 'kv', | 
| 163 |  |  |  |  |  |  | _has_no_fasta_rtree => 'is_empty' | 
| 164 |  |  |  |  |  |  | } | 
| 165 |  |  |  |  |  |  | ); | 
| 166 |  |  |  |  |  |  |  | 
| 167 |  |  |  |  |  |  | has '_fasta_blacklist' => ( | 
| 168 |  |  |  |  |  |  | is         => 'ro', | 
| 169 |  |  |  |  |  |  | isa        => 'My:FastaBlackList', | 
| 170 |  |  |  |  |  |  | builder    => '_build_fasta_blacklist', | 
| 171 |  |  |  |  |  |  | lazy_build => 1 | 
| 172 |  |  |  |  |  |  | ); | 
| 173 |  |  |  |  |  |  |  | 
| 174 |  |  |  |  |  |  | has '_seqname' => ( | 
| 175 |  |  |  |  |  |  | traits     => ['Hash'], | 
| 176 |  |  |  |  |  |  | is         => 'ro', | 
| 177 |  |  |  |  |  |  | isa        => 'HashRef[Str]', | 
| 178 |  |  |  |  |  |  | default    => sub { {} }, | 
| 179 |  |  |  |  |  |  | handles    => { | 
| 180 |  |  |  |  |  |  | _set_seqname => 'set', | 
| 181 |  |  |  |  |  |  | _get_seqname => 'get' | 
| 182 |  |  |  |  |  |  | } | 
| 183 |  |  |  |  |  |  | ); | 
| 184 |  |  |  |  |  |  |  | 
| 185 |  |  |  |  |  |  | has '_piece_table' => ( | 
| 186 |  |  |  |  |  |  | is         => 'ro', | 
| 187 |  |  |  |  |  |  | isa        => 'HashRef[HashRef[My:PieceTable]]', | 
| 188 |  |  |  |  |  |  | builder    => '_build_piece_table', | 
| 189 |  |  |  |  |  |  | lazy_build => 1 | 
| 190 |  |  |  |  |  |  | ); | 
| 191 |  |  |  |  |  |  |  | 
| 192 |  |  |  |  |  |  | has '_strand' => ( | 
| 193 |  |  |  |  |  |  | is         => 'ro', | 
| 194 |  |  |  |  |  |  | isa        => 'CodeRef', | 
| 195 |  |  |  |  |  |  | builder    => '_build_strand', | 
| 196 |  |  |  |  |  |  | lazy_build => 1 | 
| 197 |  |  |  |  |  |  | ); | 
| 198 |  |  |  |  |  |  |  | 
| 199 |  |  |  |  |  |  | has '_seqid_raffle' => ( | 
| 200 |  |  |  |  |  |  | is         => 'ro', | 
| 201 |  |  |  |  |  |  | isa        => 'CodeRef', | 
| 202 |  |  |  |  |  |  | builder    => '_build_seqid_raffle', | 
| 203 |  |  |  |  |  |  | lazy_build => 1 | 
| 204 |  |  |  |  |  |  | ); | 
| 205 |  |  |  |  |  |  |  | 
| 206 |  |  |  |  |  |  | sub BUILD { | 
| 207 | 20 |  |  | 20 | 0 | 40 | my $self = shift; | 
| 208 |  |  |  |  |  |  |  | 
| 209 |  |  |  |  |  |  | # If seqid_weight is 'count', then expression_matrix must be defined | 
| 210 | 20 | 50 | 33 |  |  | 575 | if ($self->seqid_weight eq 'count' and not defined $self->expression_matrix) { | 
| 211 | 0 |  |  |  |  | 0 | croak "seqid_weight=count requires a expression_matrix\n"; | 
| 212 |  |  |  |  |  |  | } | 
| 213 |  |  |  |  |  |  |  | 
| 214 |  |  |  |  |  |  | # If count_loops_by is 'coverage', then coverage must be defined. Else if | 
| 215 |  |  |  |  |  |  | # it is equal to 'number_of_reads', then number_of_reads must be defined | 
| 216 | 20 | 50 | 33 |  |  | 525 | if ($self->count_loops_by eq 'coverage' and not defined $self->coverage) { | 
|  |  | 50 | 33 |  |  |  |  | 
| 217 | 0 |  |  |  |  | 0 | croak "count_loops_by=coverage requires a coverage number\n"; | 
| 218 |  |  |  |  |  |  | } elsif ($self->count_loops_by eq 'number_of_reads' and not defined $self->number_of_reads) { | 
| 219 | 0 |  |  |  |  | 0 | croak "count_loops_by=number_of_reads requires a number_of_reads number\n"; | 
| 220 |  |  |  |  |  |  | } | 
| 221 |  |  |  |  |  |  |  | 
| 222 |  |  |  |  |  |  | ## Just to ensure that the lazy attributes are built before &new returns | 
| 223 | 20 |  |  |  |  | 610 | $self->_piece_table; | 
| 224 | 20 |  |  |  |  | 525 | $self->_seqid_raffle; | 
| 225 | 20 |  |  |  |  | 510 | $self->_fasta; | 
| 226 | 20 |  |  |  |  | 530 | $self->_fasta_blacklist; | 
| 227 | 20 |  |  |  |  | 535 | $self->_strand; | 
| 228 |  |  |  |  |  |  | } | 
| 229 |  |  |  |  |  |  |  | 
| 230 |  |  |  |  |  |  | sub _build_strand { | 
| 231 | 20 |  |  | 20 |  | 25 | my $self = shift; | 
| 232 | 20 |  |  |  |  | 30 | my $strand_sub; | 
| 233 |  |  |  |  |  |  |  | 
| 234 | 20 | 50 |  |  |  | 525 | if ($self->strand_bias eq 'plus') { | 
|  |  | 50 |  |  |  |  |  | 
|  |  | 50 |  |  |  |  |  | 
| 235 | 0 |  |  | 0 |  | 0 | $strand_sub = sub {1}; | 
|  | 0 |  |  |  |  | 0 |  | 
| 236 |  |  |  |  |  |  | } elsif ($self->strand_bias eq 'minus') { | 
| 237 | 0 |  |  | 0 |  | 0 | $strand_sub = sub {0}; | 
|  | 0 |  |  |  |  | 0 |  | 
| 238 |  |  |  |  |  |  | } elsif ($self->strand_bias eq 'random') { | 
| 239 |  |  |  |  |  |  | # Use App::Sandy::RNG | 
| 240 | 20 |  |  | 6840 |  | 140 | $strand_sub = sub { $_[0]->get_n(2) }; | 
|  | 6840 |  |  |  |  | 37489 |  | 
| 241 |  |  |  |  |  |  | } else { | 
| 242 | 0 |  |  |  |  | 0 | croak sprintf "Unknown option '%s' for strand bias\n", | 
| 243 |  |  |  |  |  |  | $self->strand_bias; | 
| 244 |  |  |  |  |  |  | } | 
| 245 |  |  |  |  |  |  |  | 
| 246 | 20 |  |  |  |  | 495 | return $strand_sub; | 
| 247 |  |  |  |  |  |  | } | 
| 248 |  |  |  |  |  |  |  | 
| 249 |  |  |  |  |  |  | sub _clean_fasta_id { | 
| 250 | 100 |  |  | 100 |  | 180 | my ($self, $id_ref) = @_; | 
| 251 |  |  |  |  |  |  |  | 
| 252 |  |  |  |  |  |  | # Remove '>' | 
| 253 | 100 |  |  |  |  | 310 | $$id_ref =~ s/^>//; | 
| 254 |  |  |  |  |  |  |  | 
| 255 |  |  |  |  |  |  | # Chomp | 
| 256 | 100 |  |  |  |  | 325 | $$id_ref =~ s/^\s+|\s+$//g; | 
| 257 |  |  |  |  |  |  |  | 
| 258 |  |  |  |  |  |  | # Remove id version | 
| 259 | 100 |  |  |  |  | 220 | $$id_ref =~ s/\.\d+$//; | 
| 260 |  |  |  |  |  |  | } | 
| 261 |  |  |  |  |  |  |  | 
| 262 |  |  |  |  |  |  | sub _index_fasta { | 
| 263 | 20 |  |  | 20 |  | 45 | my $self = shift; | 
| 264 | 20 |  |  |  |  | 550 | my $fasta = $self->fasta_file; | 
| 265 |  |  |  |  |  |  |  | 
| 266 | 20 |  |  |  |  | 90 | my $fh = $self->with_open_r($fasta); | 
| 267 |  |  |  |  |  |  |  | 
| 268 |  |  |  |  |  |  | # indexed_genome = ID => (seq, len) | 
| 269 | 20 |  |  |  |  | 65 | my %indexed_fasta; | 
| 270 |  |  |  |  |  |  |  | 
| 271 |  |  |  |  |  |  | # >ID|PID as in gencode transcripts | 
| 272 |  |  |  |  |  |  | my %fasta_rtree; | 
| 273 | 20 |  |  |  |  | 0 | my $id; | 
| 274 |  |  |  |  |  |  |  | 
| 275 | 20 |  |  |  |  | 780 | while (<$fh>) { | 
| 276 | 860 |  |  |  |  | 1355 | chomp; | 
| 277 | 860 | 50 |  |  |  | 1455 | next if /^;/; | 
| 278 | 860 | 100 |  |  |  | 1545 | if (/^>/) { | 
| 279 | 100 |  |  |  |  | 340 | my @fields = split /\|/; | 
| 280 | 100 |  |  |  |  | 165 | $id = $fields[0]; | 
| 281 |  |  |  |  |  |  |  | 
| 282 |  |  |  |  |  |  | # Remove '>', chomp and remove id version | 
| 283 | 100 |  |  |  |  | 295 | $self->_clean_fasta_id(\$id); | 
| 284 |  |  |  |  |  |  |  | 
| 285 |  |  |  |  |  |  | # Seq ID standardization in order to manage comparations | 
| 286 |  |  |  |  |  |  | # between chr1, Chr1, CHR1, 1 etc; | 
| 287 | 100 |  |  |  |  | 325 | my $std_id = $self->with_std_seqid($id); | 
| 288 | 100 |  |  |  |  | 3780 | $self->_set_seqname( | 
| 289 |  |  |  |  |  |  | $id     => $std_id, | 
| 290 |  |  |  |  |  |  | $std_id => $id | 
| 291 |  |  |  |  |  |  | ); | 
| 292 |  |  |  |  |  |  |  | 
| 293 |  |  |  |  |  |  | # It is necessary to catch gene -> transcript relation | 
| 294 |  |  |  |  |  |  | # # TODO: Make a hash tarit for indexed fasta | 
| 295 | 100 | 50 |  |  |  | 540 | if (defined $fields[1]) { | 
| 296 | 0 |  |  |  |  | 0 | my $pid = $fields[1]; | 
| 297 |  |  |  |  |  |  |  | 
| 298 |  |  |  |  |  |  | # Chomp and remove id version | 
| 299 | 0 |  |  |  |  | 0 | $self->_clean_fasta_id(\$pid); | 
| 300 |  |  |  |  |  |  |  | 
| 301 | 0 |  |  |  |  | 0 | $fasta_rtree{$id} = $pid; | 
| 302 |  |  |  |  |  |  | } | 
| 303 |  |  |  |  |  |  | } else { | 
| 304 | 760 | 50 |  |  |  | 1160 | die "Error reading fasta file '$fasta': Not defined id" | 
| 305 |  |  |  |  |  |  | unless defined $id; | 
| 306 | 760 |  |  |  |  | 2465 | $indexed_fasta{$id}{seq} .= uc($_); | 
| 307 |  |  |  |  |  |  | } | 
| 308 |  |  |  |  |  |  | } | 
| 309 |  |  |  |  |  |  |  | 
| 310 | 20 |  |  |  |  | 125 | for (keys %indexed_fasta) { | 
| 311 | 100 |  |  |  |  | 200 | $indexed_fasta{$_}{size} = length $indexed_fasta{$_}{seq}; | 
| 312 |  |  |  |  |  |  | } | 
| 313 |  |  |  |  |  |  |  | 
| 314 | 20 | 50 |  |  |  | 60 | unless (%indexed_fasta) { | 
| 315 | 0 |  |  |  |  | 0 | die "Error parsing '$fasta'. Maybe the file is empty\n"; | 
| 316 |  |  |  |  |  |  | } | 
| 317 |  |  |  |  |  |  |  | 
| 318 |  |  |  |  |  |  | $fh->close | 
| 319 | 20 | 50 |  |  |  | 160 | or die "Cannot close file $fasta: $!\n"; | 
| 320 |  |  |  |  |  |  |  | 
| 321 | 20 | 50 |  |  |  | 545 | $self->_set_fasta_rtree(%fasta_rtree) if %fasta_rtree; | 
| 322 | 20 |  |  |  |  | 110 | return \%indexed_fasta; | 
| 323 |  |  |  |  |  |  | } | 
| 324 |  |  |  |  |  |  |  | 
| 325 |  |  |  |  |  |  | sub _build_fasta { | 
| 326 | 20 |  |  | 20 |  | 35 | my $self = shift; | 
| 327 | 20 |  |  |  |  | 530 | my $fasta = $self->fasta_file; | 
| 328 |  |  |  |  |  |  |  | 
| 329 | 20 |  |  |  |  | 110 | log_msg ":: Indexing fasta file '$fasta' ..."; | 
| 330 | 20 |  |  |  |  | 85 | my $indexed_fasta = $self->_index_fasta; | 
| 331 |  |  |  |  |  |  |  | 
| 332 |  |  |  |  |  |  | # Validate genome about the read size required | 
| 333 | 20 |  |  |  |  | 105 | log_msg ":: Validating fasta file '$fasta' ..."; | 
| 334 |  |  |  |  |  |  | # Entries to remove | 
| 335 | 20 |  |  |  |  | 30 | my @blacklist; | 
| 336 |  |  |  |  |  |  |  | 
| 337 | 20 | 50 |  |  |  | 695 | unless ($self->truncate) { | 
| 338 | 20 |  |  |  |  | 70 | for my $id (keys %$indexed_fasta) { | 
| 339 | 100 |  |  |  |  | 170 | my $index_size = $indexed_fasta->{$id}{size}; | 
| 340 | 100 |  |  |  |  | 2365 | my $class = ref $self->seq; | 
| 341 |  |  |  |  |  |  |  | 
| 342 | 100 | 100 |  |  |  | 255 | if ($class eq 'App::Sandy::Seq::SingleEnd') { | 
|  |  | 50 |  |  |  |  |  | 
| 343 | 50 |  |  |  |  | 1975 | my $read_mean = $self->seq->read_mean; | 
| 344 | 50 | 50 |  |  |  | 135 | if ($index_size < $read_mean) { | 
| 345 | 0 |  |  |  |  | 0 | log_msg ":: Parsing fasta file '$fasta': Seqid sequence length (>$id => $index_size) lesser than required read mean ($read_mean)"; | 
| 346 | 0 |  |  |  |  | 0 | delete $indexed_fasta->{$id}; | 
| 347 | 0 |  |  |  |  | 0 | push @blacklist => $id; | 
| 348 |  |  |  |  |  |  | } | 
| 349 |  |  |  |  |  |  | } elsif ($class eq 'App::Sandy::Seq::PairedEnd') { | 
| 350 | 50 |  |  |  |  | 1400 | my $fragment_mean = $self->seq->fragment_mean; | 
| 351 | 50 | 50 |  |  |  | 190 | if ($index_size < $fragment_mean) { | 
| 352 | 0 |  |  |  |  | 0 | log_msg ":: Parsing fasta file '$fasta': Seqid sequence length (>$id => $index_size) lesser than required fragment mean ($fragment_mean)"; | 
| 353 | 0 |  |  |  |  | 0 | delete $indexed_fasta->{$id}; | 
| 354 | 0 |  |  |  |  | 0 | push @blacklist => $id; | 
| 355 |  |  |  |  |  |  | } | 
| 356 |  |  |  |  |  |  | } else { | 
| 357 | 0 |  |  |  |  | 0 | croak "Unknown option '$class' for sequencing type\n"; | 
| 358 |  |  |  |  |  |  | } | 
| 359 |  |  |  |  |  |  | } | 
| 360 |  |  |  |  |  |  | } | 
| 361 |  |  |  |  |  |  |  | 
| 362 | 20 | 50 |  |  |  | 75 | unless (%$indexed_fasta) { | 
| 363 | 0 |  |  |  |  | 0 | die sprintf "Fasta file '%s' has no valid entry\n" => $self->fasta_file; | 
| 364 |  |  |  |  |  |  | } | 
| 365 |  |  |  |  |  |  |  | 
| 366 |  |  |  |  |  |  | # If fasta_rtree has entries | 
| 367 | 20 | 50 |  |  |  | 755 | unless ($self->_has_no_fasta_rtree) { | 
| 368 |  |  |  |  |  |  | # Remove no valid entries from id -> pid relation | 
| 369 | 0 | 0 |  |  |  | 0 | $self->_delete_fasta_rtree(@blacklist) if @blacklist; | 
| 370 |  |  |  |  |  |  | } | 
| 371 |  |  |  |  |  |  |  | 
| 372 | 20 |  |  |  |  | 520 | return $indexed_fasta; | 
| 373 |  |  |  |  |  |  | } | 
| 374 |  |  |  |  |  |  |  | 
| 375 |  |  |  |  |  |  | sub _populate_fasta_tree { | 
| 376 | 20 |  |  | 20 |  | 45 | my $self = shift; | 
| 377 |  |  |  |  |  |  |  | 
| 378 |  |  |  |  |  |  | # If fasta_rtree has entries | 
| 379 | 20 | 50 |  |  |  | 770 | unless ($self->_has_no_fasta_rtree) { | 
| 380 |  |  |  |  |  |  | # Build parent -> child ids relation | 
| 381 | 0 |  |  |  |  | 0 | my %fasta_tree; | 
| 382 |  |  |  |  |  |  |  | 
| 383 |  |  |  |  |  |  | # Reverse fasta_rtree to pid -> \@ids | 
| 384 | 0 |  |  |  |  | 0 | for my $pair ($self->_fasta_rtree_pairs) { | 
| 385 | 0 |  |  |  |  | 0 | my ($id, $pid) = (@$pair); | 
| 386 | 0 |  |  |  |  | 0 | push @{ $fasta_tree{$pid} } => $id; | 
|  | 0 |  |  |  |  | 0 |  | 
| 387 |  |  |  |  |  |  | } | 
| 388 |  |  |  |  |  |  |  | 
| 389 | 0 |  |  |  |  | 0 | $self->_set_fasta_tree(%fasta_tree); | 
| 390 |  |  |  |  |  |  | } | 
| 391 |  |  |  |  |  |  | } | 
| 392 |  |  |  |  |  |  |  | 
| 393 |  |  |  |  |  |  | sub _build_fasta_blacklist { | 
| 394 | 20 |  |  | 20 |  | 30 | my $self = shift; | 
| 395 |  |  |  |  |  |  |  | 
| 396 | 20 |  |  |  |  | 85 | log_msg ":: Index unidentified (NNN..) regions"; | 
| 397 |  |  |  |  |  |  |  | 
| 398 | 20 |  |  |  |  | 490 | my $indexed_fasta = $self->_fasta; | 
| 399 | 20 |  |  |  |  | 560 | my $class = ref $self->seq; | 
| 400 |  |  |  |  |  |  |  | 
| 401 | 20 |  |  |  |  | 45 | my %fasta_blacklist; | 
| 402 |  |  |  |  |  |  | my @blacklist_id; | 
| 403 |  |  |  |  |  |  |  | 
| 404 | 20 |  |  |  |  | 70 | for my $id (keys %$indexed_fasta) { | 
| 405 | 100 |  |  |  |  | 1485 | my $seq = $indexed_fasta->{$id}{seq}; | 
| 406 | 100 |  |  |  |  | 180 | my $seq_len = $indexed_fasta->{$id}{size}; | 
| 407 |  |  |  |  |  |  |  | 
| 408 | 100 |  |  |  |  | 130 | my $offset = 0; | 
| 409 | 100 |  |  |  |  | 170 | my ($st, $en) = (0, 0); | 
| 410 | 100 |  |  |  |  | 145 | my $pos = 0; | 
| 411 | 100 |  |  |  |  | 115 | my $init = 0; | 
| 412 | 100 |  |  |  |  | 135 | my $len = 0; | 
| 413 | 100 |  |  |  |  | 110 | my @stack; | 
| 414 |  |  |  |  |  |  |  | 
| 415 | 100 |  |  |  |  | 275 | while (($pos = index $seq, "N", $offset) != -1) { | 
| 416 | 0 | 0 |  |  |  | 0 | unless ($init) { | 
| 417 | 0 |  |  |  |  | 0 | $st = $pos; | 
| 418 | 0 |  |  |  |  | 0 | $en = $pos; | 
| 419 | 0 |  |  |  |  | 0 | $init = 1; | 
| 420 |  |  |  |  |  |  | } | 
| 421 |  |  |  |  |  |  |  | 
| 422 | 0 | 0 |  |  |  | 0 | if ($en < ($pos - 1)) { | 
| 423 | 0 |  |  |  |  | 0 | push @stack => [$st, $en]; | 
| 424 | 0 |  |  |  |  | 0 | $len += $en - $st + 1; | 
| 425 | 0 |  |  |  |  | 0 | $st = $pos; | 
| 426 |  |  |  |  |  |  | } | 
| 427 |  |  |  |  |  |  |  | 
| 428 | 0 |  |  |  |  | 0 | $en = $pos; | 
| 429 | 0 |  |  |  |  | 0 | $offset = $pos + 1; | 
| 430 |  |  |  |  |  |  | } | 
| 431 |  |  |  |  |  |  |  | 
| 432 | 100 | 50 |  |  |  | 170 | if ($init) { | 
| 433 | 0 |  |  |  |  | 0 | push @stack => [$st, $en]; | 
| 434 | 0 |  |  |  |  | 0 | $len += $en - $st + 1; | 
| 435 |  |  |  |  |  |  | } | 
| 436 |  |  |  |  |  |  |  | 
| 437 | 100 | 100 |  |  |  | 2625 | my $read_len = $class eq 'App::Sandy::Seq::SingleEnd' | 
| 438 |  |  |  |  |  |  | ? $self->seq->read_mean | 
| 439 |  |  |  |  |  |  | : $self->seq->fragment_mean; | 
| 440 |  |  |  |  |  |  |  | 
| 441 | 100 | 50 |  |  |  | 190 | if ($len > ($seq_len - $read_len)) { | 
| 442 | 0 |  |  |  |  | 0 | log_msg ":: Seqid '$id' has too much NNN and will not be used"; | 
| 443 | 0 |  |  |  |  | 0 | delete $indexed_fasta->{$id}; | 
| 444 | 0 |  |  |  |  | 0 | push @blacklist_id => $id; | 
| 445 |  |  |  |  |  |  | } else { | 
| 446 | 100 |  |  |  |  | 1165 | $fasta_blacklist{$id} = \@stack; | 
| 447 |  |  |  |  |  |  | } | 
| 448 |  |  |  |  |  |  | } | 
| 449 |  |  |  |  |  |  |  | 
| 450 | 20 | 50 |  |  |  | 75 | unless (%$indexed_fasta) { | 
| 451 | 0 |  |  |  |  | 0 | die sprintf "Fasta file '%s' has too much NNN regions\n" => $self->fasta_file; | 
| 452 |  |  |  |  |  |  | } | 
| 453 |  |  |  |  |  |  |  | 
| 454 |  |  |  |  |  |  | # If fasta_rtree has entries | 
| 455 | 20 | 50 |  |  |  | 750 | unless ($self->_has_no_fasta_rtree) { | 
| 456 |  |  |  |  |  |  | # Remove no valid entries from id -> pid relation | 
| 457 | 0 | 0 |  |  |  | 0 | $self->_delete_fasta_rtree(@blacklist_id) if @blacklist_id; | 
| 458 |  |  |  |  |  |  | } | 
| 459 |  |  |  |  |  |  |  | 
| 460 | 20 |  |  |  |  | 540 | return \%fasta_blacklist; | 
| 461 |  |  |  |  |  |  | } | 
| 462 |  |  |  |  |  |  |  | 
| 463 |  |  |  |  |  |  | sub _retrieve_expression_matrix { | 
| 464 | 0 |  |  | 0 |  | 0 | my $self = shift; | 
| 465 |  |  |  |  |  |  |  | 
| 466 | 0 |  |  |  |  | 0 | my $expression = App::Sandy::DB::Handle::Expression->new; | 
| 467 | 0 |  |  |  |  | 0 | my $indexed_file = $expression->retrievedb($self->expression_matrix); | 
| 468 |  |  |  |  |  |  |  | 
| 469 |  |  |  |  |  |  | # Remove transcript id version | 
| 470 | 0 |  |  |  |  | 0 | for my $id (keys %$indexed_file) { | 
| 471 | 0 |  |  |  |  | 0 | my $old_id = $id; | 
| 472 | 0 |  |  |  |  | 0 | $self->_clean_fasta_id(\$id); | 
| 473 | 0 | 0 |  |  |  | 0 | $indexed_file->{$id} = delete $indexed_file->{$old_id} | 
| 474 |  |  |  |  |  |  | if $old_id ne $id; | 
| 475 |  |  |  |  |  |  | } | 
| 476 |  |  |  |  |  |  |  | 
| 477 | 0 |  |  |  |  | 0 | return $indexed_file; | 
| 478 |  |  |  |  |  |  | } | 
| 479 |  |  |  |  |  |  |  | 
| 480 |  |  |  |  |  |  | sub _build_seqid_raffle { | 
| 481 | 20 |  |  | 20 |  | 85 | my $self = shift; | 
| 482 |  |  |  |  |  |  |  | 
| 483 |  |  |  |  |  |  | # Get the piece table | 
| 484 | 20 |  |  |  |  | 545 | my $piece_table = $self->_piece_table; | 
| 485 |  |  |  |  |  |  |  | 
| 486 |  |  |  |  |  |  | # The builded function | 
| 487 | 20 |  |  |  |  | 45 | my $seqid_sub; | 
| 488 |  |  |  |  |  |  |  | 
| 489 | 20 | 50 |  |  |  | 515 | if ($self->seqid_weight eq 'same') { | 
|  |  | 50 |  |  |  |  |  | 
|  |  | 50 |  |  |  |  |  | 
| 490 | 0 |  |  | 0 |  | 0 | my ($keys, $weights) = $self->_populate_key_weight($piece_table, sub { 1 }); | 
|  | 0 |  |  |  |  | 0 |  | 
| 491 |  |  |  |  |  |  |  | 
| 492 |  |  |  |  |  |  | # If weight == 1 means that there are 2 keys for | 
| 493 |  |  |  |  |  |  | # the same seq_id. | 
| 494 |  |  |  |  |  |  | # If weight == 2 means that there is only one key | 
| 495 |  |  |  |  |  |  | # for the seq_id, so I double that key | 
| 496 | 0 |  |  |  |  | 0 | for (my $i = 0; $i < @$weights; $i++) { | 
| 497 | 0 | 0 |  |  |  | 0 | if ($weights->[$i] > 1) { | 
| 498 | 0 |  |  |  |  | 0 | push @$keys => $keys->[$i]; | 
| 499 |  |  |  |  |  |  | } | 
| 500 |  |  |  |  |  |  | } | 
| 501 |  |  |  |  |  |  |  | 
| 502 | 0 |  |  |  |  | 0 | my $keys_size = scalar @$keys; | 
| 503 |  |  |  |  |  |  | # Use App::Sandy::RNG | 
| 504 | 0 |  |  | 0 |  | 0 | $seqid_sub = sub { $keys->[$_[0]->get_n($keys_size)] }; | 
|  | 0 |  |  |  |  | 0 |  | 
| 505 |  |  |  |  |  |  | } elsif ($self->seqid_weight eq 'count') { | 
| 506 |  |  |  |  |  |  | # Catch expression-matrix entry from database | 
| 507 | 0 |  |  |  |  | 0 | my $indexed_file = $self->_retrieve_expression_matrix; | 
| 508 |  |  |  |  |  |  |  | 
| 509 |  |  |  |  |  |  | # Catch indexed fasta | 
| 510 | 0 |  |  |  |  | 0 | my $indexed_fasta = $self->_fasta; | 
| 511 |  |  |  |  |  |  |  | 
| 512 |  |  |  |  |  |  | # Validate expression_matrix | 
| 513 | 0 |  |  |  |  | 0 | for my $id (keys %$indexed_file) { | 
| 514 |  |  |  |  |  |  | # If not exists into indexed_fasta, it must then exist into fasta_tree | 
| 515 | 0 | 0 | 0 |  |  | 0 | unless (exists $piece_table->{$id} || $self->_exists_fasta_tree($id)) { | 
| 516 | 0 |  |  |  |  | 0 | log_msg sprintf ":: Ignoring seqid '%s' from expression-matrix '%s': It is not found into the indexed fasta" | 
| 517 |  |  |  |  |  |  | => $id, $self->expression_matrix; | 
| 518 | 0 |  |  |  |  | 0 | delete $indexed_file->{$id}; | 
| 519 |  |  |  |  |  |  | } | 
| 520 |  |  |  |  |  |  | } | 
| 521 |  |  |  |  |  |  |  | 
| 522 | 0 | 0 |  |  |  | 0 | unless (%$indexed_file) { | 
| 523 | 0 |  |  |  |  | 0 | die sprintf "No valid seqid entry of the expression-matrix '%s' is recorded into the indexed fasta\n" | 
| 524 |  |  |  |  |  |  | => $self->expression_matrix; | 
| 525 |  |  |  |  |  |  | } | 
| 526 |  |  |  |  |  |  |  | 
| 527 | 0 |  |  |  |  | 0 | my (%ptable_ind, %ptable_cluster); | 
| 528 |  |  |  |  |  |  |  | 
| 529 |  |  |  |  |  |  | # Split indexed_file seq_ids between those | 
| 530 |  |  |  |  |  |  | # into piece_table and those that represents a cluster | 
| 531 |  |  |  |  |  |  | # of seq_ids as in gene -> transcript relationship | 
| 532 | 0 |  |  |  |  | 0 | for my $seq_id (keys %$indexed_file) { | 
| 533 | 0 | 0 |  |  |  | 0 | if (exists $piece_table->{$seq_id}) { | 
| 534 | 0 |  |  |  |  | 0 | $ptable_ind{$seq_id} = $piece_table->{$seq_id}; | 
| 535 |  |  |  |  |  |  |  | 
| 536 |  |  |  |  |  |  | } else { | 
| 537 | 0 |  |  |  |  | 0 | my $ids = $self->_get_fasta_tree($seq_id); | 
| 538 |  |  |  |  |  |  |  | 
| 539 |  |  |  |  |  |  | # Bug catcher | 
| 540 | 0 | 0 |  |  |  | 0 | unless (@$ids) { | 
| 541 | 0 |  |  |  |  | 0 | croak "seq_id '$seq_id' not found into piece_table"; | 
| 542 |  |  |  |  |  |  | } | 
| 543 |  |  |  |  |  |  |  | 
| 544 | 0 |  |  |  |  | 0 | $ptable_cluster{$seq_id} = $ids; | 
| 545 |  |  |  |  |  |  | } | 
| 546 |  |  |  |  |  |  | } | 
| 547 |  |  |  |  |  |  |  | 
| 548 |  |  |  |  |  |  | # Let's calculate the weight taking in acount | 
| 549 |  |  |  |  |  |  | # the size  increase/decrease | 
| 550 |  |  |  |  |  |  | my $calc_ind_weight = sub { | 
| 551 | 0 |  |  | 0 |  | 0 | my ($seq_id, $type) = @_; | 
| 552 |  |  |  |  |  |  |  | 
| 553 | 0 |  |  |  |  | 0 | my $counts = $indexed_file->{$seq_id}; | 
| 554 | 0 |  |  |  |  | 0 | my $size = $piece_table->{$seq_id}{$type}{size}; | 
| 555 | 0 |  |  |  |  | 0 | my $fasta_size = $indexed_fasta->{$seq_id}{size}; | 
| 556 |  |  |  |  |  |  |  | 
| 557 |  |  |  |  |  |  | # Correct the weight according to the | 
| 558 |  |  |  |  |  |  | # genomic variation change by the ratio | 
| 559 |  |  |  |  |  |  | # between the table size and fasta size | 
| 560 | 0 |  |  |  |  | 0 | my $factor = $size / $fasta_size; | 
| 561 |  |  |  |  |  |  |  | 
| 562 | 0 |  |  |  |  | 0 | return $counts * $factor; | 
| 563 | 0 |  |  |  |  | 0 | }; | 
| 564 |  |  |  |  |  |  |  | 
| 565 | 0 |  |  |  |  | 0 | my ($keys, $weights); | 
| 566 |  |  |  |  |  |  |  | 
| 567 | 0 | 0 |  |  |  | 0 | if (%ptable_ind) { | 
| 568 | 0 |  |  |  |  | 0 | ($keys, $weights) = $self->_populate_key_weight(\%ptable_ind, | 
| 569 |  |  |  |  |  |  | $calc_ind_weight); | 
| 570 |  |  |  |  |  |  | } | 
| 571 |  |  |  |  |  |  |  | 
| 572 |  |  |  |  |  |  | # If there are seq_id cluster like, then its is | 
| 573 |  |  |  |  |  |  | # time to calculate these weights | 
| 574 | 0 |  |  |  |  | 0 | for my $seq_id (sort keys %ptable_cluster) { | 
| 575 | 0 |  |  |  |  | 0 | my %ptable; | 
| 576 |  |  |  |  |  |  |  | 
| 577 |  |  |  |  |  |  | # Slice piece_table hash | 
| 578 | 0 |  |  |  |  | 0 | my $ids = $ptable_cluster{$seq_id}; | 
| 579 | 0 |  |  |  |  | 0 | @ptable{@$ids} = @$piece_table{@$ids}; | 
| 580 |  |  |  |  |  |  |  | 
| 581 |  |  |  |  |  |  | # total size among all ids of cluster | 
| 582 | 0 |  |  |  |  | 0 | my %total; | 
| 583 |  |  |  |  |  |  |  | 
| 584 |  |  |  |  |  |  | # Calculate the total size by type | 
| 585 | 0 |  |  |  |  | 0 | for my $type_h (values %ptable) { | 
| 586 | 0 |  |  |  |  | 0 | for my $type (keys %$type_h) { | 
| 587 | 0 |  |  |  |  | 0 | $total{$type} += $type_h->{$type}{size}; | 
| 588 |  |  |  |  |  |  | } | 
| 589 |  |  |  |  |  |  | } | 
| 590 |  |  |  |  |  |  |  | 
| 591 |  |  |  |  |  |  | # Calculate the weight taking in acount the size increase/decrease | 
| 592 |  |  |  |  |  |  | # and the ratio between the total size by type and the table size. | 
| 593 |  |  |  |  |  |  | # The problem here is that I must divide the 'counts' for some 'seq_id' | 
| 594 |  |  |  |  |  |  | # among all ids that belong to it | 
| 595 |  |  |  |  |  |  | my $calc_cluster_weight = sub { | 
| 596 | 0 |  |  | 0 |  | 0 | my ($id, $type) = @_; | 
| 597 |  |  |  |  |  |  |  | 
| 598 | 0 |  |  |  |  | 0 | my $counts = $indexed_file->{$seq_id}; | 
| 599 | 0 |  |  |  |  | 0 | my $size = $piece_table->{$id}{$type}{size}; | 
| 600 | 0 |  |  |  |  | 0 | my $fasta_size = $indexed_fasta->{$id}{size}; | 
| 601 |  |  |  |  |  |  |  | 
| 602 |  |  |  |  |  |  | # Divide the counts among all ids | 
| 603 | 0 |  |  |  |  | 0 | my $ratio = $size / $total{$type}; | 
| 604 |  |  |  |  |  |  |  | 
| 605 |  |  |  |  |  |  | # Correct the weight according to the size | 
| 606 | 0 |  |  |  |  | 0 | my $factor = $size / $fasta_size; | 
| 607 |  |  |  |  |  |  |  | 
| 608 | 0 |  |  |  |  | 0 | return $counts * $factor * $ratio; | 
| 609 | 0 |  |  |  |  | 0 | }; | 
| 610 |  |  |  |  |  |  |  | 
| 611 | 0 |  |  |  |  | 0 | my ($k, $w) = $self->_populate_key_weight(\%ptable, | 
| 612 |  |  |  |  |  |  | $calc_cluster_weight); | 
| 613 |  |  |  |  |  |  |  | 
| 614 | 0 |  |  |  |  | 0 | push @$keys => @$k; | 
| 615 | 0 |  |  |  |  | 0 | push @$weights => @$w; | 
| 616 |  |  |  |  |  |  | } | 
| 617 |  |  |  |  |  |  |  | 
| 618 | 0 | 0 | 0 |  |  | 0 | unless (@$keys && @$weights) { | 
| 619 | 0 |  |  |  |  | 0 | croak "No keys weights have been set"; | 
| 620 |  |  |  |  |  |  | } | 
| 621 |  |  |  |  |  |  |  | 
| 622 |  |  |  |  |  |  | # It is very necessary in order | 
| 623 |  |  |  |  |  |  | # to avoid truncation of numbers | 
| 624 |  |  |  |  |  |  | # between zero and one | 
| 625 | 0 |  |  |  |  | 0 | $self->_round_weight($weights); | 
| 626 |  |  |  |  |  |  |  | 
| 627 | 0 |  |  |  |  | 0 | my $raffler = App::Sandy::WeightedRaffle->new( | 
| 628 |  |  |  |  |  |  | 'weights' => $weights, | 
| 629 |  |  |  |  |  |  | 'keys'    => $keys | 
| 630 |  |  |  |  |  |  | ); | 
| 631 |  |  |  |  |  |  |  | 
| 632 |  |  |  |  |  |  | # Use App::Sandy::Rand | 
| 633 | 0 |  |  | 0 |  | 0 | $seqid_sub = sub { $raffler->weighted_raffle($_[0]) }; | 
|  | 0 |  |  |  |  | 0 |  | 
| 634 |  |  |  |  |  |  | } elsif ($self->seqid_weight eq 'length') { | 
| 635 |  |  |  |  |  |  | my $calc_weight = sub { | 
| 636 | 100 |  |  | 100 |  | 165 | my ($seq_id, $type) = @_; | 
| 637 | 100 |  |  |  |  | 190 | return $piece_table->{$seq_id}{$type}{size}; | 
| 638 | 20 |  |  |  |  | 120 | }; | 
| 639 |  |  |  |  |  |  |  | 
| 640 | 20 |  |  |  |  | 70 | my ($keys, $weights) = $self->_populate_key_weight($piece_table, | 
| 641 |  |  |  |  |  |  | $calc_weight); | 
| 642 |  |  |  |  |  |  |  | 
| 643 |  |  |  |  |  |  | # Just in case ... | 
| 644 | 20 |  |  |  |  | 65 | $self->_round_weight($weights); | 
| 645 |  |  |  |  |  |  |  | 
| 646 | 20 |  |  |  |  | 780 | my $raffler = App::Sandy::WeightedRaffle->new( | 
| 647 |  |  |  |  |  |  | weights => $weights, | 
| 648 |  |  |  |  |  |  | keys    => $keys | 
| 649 |  |  |  |  |  |  | ); | 
| 650 |  |  |  |  |  |  |  | 
| 651 |  |  |  |  |  |  | # Use App::Sandy::Rand | 
| 652 | 20 |  |  | 6840 |  | 215 | $seqid_sub = sub { $raffler->weighted_raffle($_[0]) }; | 
|  | 6840 |  |  |  |  | 25781 |  | 
| 653 |  |  |  |  |  |  | } else { | 
| 654 | 0 |  |  |  |  | 0 | croak sprintf "Unknown option '%s' for seqid_weight\n", | 
| 655 |  |  |  |  |  |  | $self->seqid_weight; | 
| 656 |  |  |  |  |  |  | } | 
| 657 |  |  |  |  |  |  |  | 
| 658 | 20 |  |  |  |  | 590 | return $seqid_sub; | 
| 659 |  |  |  |  |  |  | } | 
| 660 |  |  |  |  |  |  |  | 
| 661 |  |  |  |  |  |  | sub _round_weight { | 
| 662 | 20 |  |  | 20 |  | 50 | my ($self, $weights) = @_; | 
| 663 |  |  |  |  |  |  |  | 
| 664 | 20 |  |  |  |  | 70 | my $min = min @$weights; | 
| 665 |  |  |  |  |  |  |  | 
| 666 | 20 | 50 |  |  |  | 60 | if ($min <= 0) { | 
| 667 | 0 |  |  |  |  | 0 | croak "min weight le to zero: $min"; | 
| 668 |  |  |  |  |  |  | } | 
| 669 |  |  |  |  |  |  |  | 
| 670 | 20 | 50 |  |  |  | 60 | my $factor = $min < 1 | 
| 671 |  |  |  |  |  |  | ? (1 / $min) | 
| 672 |  |  |  |  |  |  | : 1; | 
| 673 |  |  |  |  |  |  |  | 
| 674 | 20 |  |  |  |  | 40 | for my $weight (@$weights) { | 
| 675 | 100 |  |  |  |  | 190 | $weight = int($weight * $factor + 0.5); | 
| 676 |  |  |  |  |  |  | } | 
| 677 |  |  |  |  |  |  | } | 
| 678 |  |  |  |  |  |  |  | 
| 679 |  |  |  |  |  |  | sub _populate_key_weight { | 
| 680 | 20 |  |  | 20 |  | 50 | my ($self, $piece_table, $calc_weight) = @_; | 
| 681 |  |  |  |  |  |  |  | 
| 682 | 20 |  |  |  |  | 35 | my (@keys, @weights); | 
| 683 |  |  |  |  |  |  |  | 
| 684 |  |  |  |  |  |  | # It needs to be sorted in order to the | 
| 685 |  |  |  |  |  |  | # seed works | 
| 686 | 20 |  |  |  |  | 105 | for my $seq_id (sort keys %$piece_table) { | 
| 687 | 100 |  |  |  |  | 155 | my $type_h = $piece_table->{$seq_id}; | 
| 688 |  |  |  |  |  |  |  | 
| 689 |  |  |  |  |  |  | # If there is no alternative seq_id, then | 
| 690 |  |  |  |  |  |  | # set a factor to correct the size. | 
| 691 |  |  |  |  |  |  | # It is necessary because the seq_ids with | 
| 692 |  |  |  |  |  |  | # alternative and reference will double its | 
| 693 |  |  |  |  |  |  | # own coverage | 
| 694 | 100 | 50 |  |  |  | 195 | my $factor = scalar keys %$type_h == 1 | 
| 695 |  |  |  |  |  |  | ? 2 | 
| 696 |  |  |  |  |  |  | : 1; | 
| 697 |  |  |  |  |  |  |  | 
| 698 | 100 |  |  |  |  | 215 | for my $type (sort keys %$type_h) { | 
| 699 |  |  |  |  |  |  |  | 
| 700 | 100 |  |  |  |  | 290 | my %key = ( | 
| 701 |  |  |  |  |  |  | 'seq_id' => $seq_id, | 
| 702 |  |  |  |  |  |  | 'type'   => $type | 
| 703 |  |  |  |  |  |  | ); | 
| 704 |  |  |  |  |  |  |  | 
| 705 | 100 |  |  |  |  | 150 | my $weight = $calc_weight->($seq_id, $type); | 
| 706 |  |  |  |  |  |  |  | 
| 707 | 100 |  |  |  |  | 160 | push @keys => \%key; | 
| 708 | 100 |  |  |  |  | 230 | push @weights => $weight * $factor; | 
| 709 |  |  |  |  |  |  | } | 
| 710 |  |  |  |  |  |  | } | 
| 711 |  |  |  |  |  |  |  | 
| 712 | 20 |  |  |  |  | 80 | return (\@keys, \@weights); | 
| 713 |  |  |  |  |  |  | } | 
| 714 |  |  |  |  |  |  |  | 
| 715 |  |  |  |  |  |  | sub _build_genomic_variation_names { | 
| 716 | 20 |  |  | 20 |  | 40 | my $self = shift; | 
| 717 | 20 | 50 |  |  |  | 665 | if ($self->genomic_variation) { | 
| 718 | 0 |  |  |  |  | 0 | return sprintf "[%s]", => join ", ", @{ $self->genomic_variation }; | 
|  | 0 |  |  |  |  | 0 |  | 
| 719 |  |  |  |  |  |  | } | 
| 720 |  |  |  |  |  |  | } | 
| 721 |  |  |  |  |  |  |  | 
| 722 |  |  |  |  |  |  | sub _retrieve_genomic_variation { | 
| 723 | 0 |  |  | 0 |  | 0 | my $self = shift; | 
| 724 | 0 |  |  |  |  | 0 | my $variation = App::Sandy::DB::Handle::Variation->new; | 
| 725 | 0 |  |  |  |  | 0 | return $variation->retrievedb($self->genomic_variation); | 
| 726 |  |  |  |  |  |  | } | 
| 727 |  |  |  |  |  |  |  | 
| 728 |  |  |  |  |  |  | sub _build_piece_table { | 
| 729 | 20 |  |  | 20 |  | 45 | my $self = shift; | 
| 730 |  |  |  |  |  |  |  | 
| 731 | 20 |  |  |  |  | 590 | my $genomic_variation = $self->_genomic_variation_names; | 
| 732 | 20 |  |  |  |  | 30 | my $indexed_snv; | 
| 733 |  |  |  |  |  |  |  | 
| 734 |  |  |  |  |  |  | # Retrieve genomic variation if the user provided it | 
| 735 | 20 | 50 |  |  |  | 50 | if (defined $genomic_variation) { | 
| 736 | 0 |  |  |  |  | 0 | $indexed_snv = $self->_retrieve_genomic_variation; | 
| 737 | 0 |  |  |  |  | 0 | log_msg ":: Validate genomic variation '$genomic_variation' against indexed fasta ..."; | 
| 738 | 0 |  |  |  |  | 0 | $self->_validate_indexed_snv_against_fasta($indexed_snv); | 
| 739 |  |  |  |  |  |  | } | 
| 740 |  |  |  |  |  |  |  | 
| 741 |  |  |  |  |  |  | # Catch index fasta | 
| 742 | 20 |  |  |  |  | 525 | my $indexed_fasta = $self->_fasta; | 
| 743 |  |  |  |  |  |  |  | 
| 744 |  |  |  |  |  |  | # Build piece table | 
| 745 | 20 |  |  |  |  | 35 | my %piece_table; | 
| 746 |  |  |  |  |  |  |  | 
| 747 |  |  |  |  |  |  | # Let's construct the piece_table | 
| 748 | 20 |  |  |  |  | 75 | log_msg ":: Build piece table ..."; | 
| 749 |  |  |  |  |  |  |  | 
| 750 | 20 |  |  |  |  | 150 | while (my ($seq_id, $fasta_h) = each %$indexed_fasta) { | 
| 751 | 100 |  |  |  |  | 170 | my $seq = \$fasta_h->{seq}; | 
| 752 | 100 |  |  |  |  | 3415 | my $std_seq_id = $self->_get_seqname($seq_id); | 
| 753 |  |  |  |  |  |  |  | 
| 754 |  |  |  |  |  |  | # Initialize piece tables for $seq_id ref | 
| 755 | 100 |  |  |  |  | 2710 | $piece_table{$seq_id}{ref}{table} = App::Sandy::PieceTable->new(orig => $seq); | 
| 756 |  |  |  |  |  |  |  | 
| 757 |  |  |  |  |  |  | # If there is indexed_snv for seq_id, then construct the piece table with it | 
| 758 | 100 | 50 | 33 |  |  | 465 | if (defined $indexed_snv && defined $indexed_snv->{$std_seq_id}) { | 
| 759 | 0 |  |  |  |  | 0 | my $snvs = $indexed_snv->{$std_seq_id}; | 
| 760 |  |  |  |  |  |  |  | 
| 761 |  |  |  |  |  |  | # Filter only the homozygotic snvs to feed reference seq_id | 
| 762 | 0 |  |  |  |  | 0 | my @snvs_homo = grep { $_->{plo} eq 'HO' } @$snvs; | 
|  | 0 |  |  |  |  | 0 |  | 
| 763 |  |  |  |  |  |  |  | 
| 764 | 0 | 0 |  |  |  | 0 | if (@snvs_homo) { | 
| 765 |  |  |  |  |  |  | # Populate reference seq_id | 
| 766 | 0 |  |  |  |  | 0 | $self->_populate_piece_table($piece_table{$seq_id}{ref}{table}, \@snvs_homo); | 
| 767 |  |  |  |  |  |  | } | 
| 768 |  |  |  |  |  |  |  | 
| 769 |  |  |  |  |  |  | # Initialize piece tables for $seq_id alt | 
| 770 | 0 |  |  |  |  | 0 | $piece_table{$seq_id}{alt}{table} = App::Sandy::PieceTable->new(orig => $seq); | 
| 771 |  |  |  |  |  |  |  | 
| 772 |  |  |  |  |  |  | # Populate alternative seq_id | 
| 773 | 0 |  |  |  |  | 0 | $self->_populate_piece_table($piece_table{$seq_id}{alt}{table}, $snvs); | 
| 774 |  |  |  |  |  |  | } | 
| 775 |  |  |  |  |  |  | } | 
| 776 |  |  |  |  |  |  |  | 
| 777 |  |  |  |  |  |  | # Initialize the logical offsets and valodate the | 
| 778 |  |  |  |  |  |  | # new size due to the genomic variation | 
| 779 |  |  |  |  |  |  |  | 
| 780 | 20 |  |  |  |  | 60 | my @blacklist; | 
| 781 |  |  |  |  |  |  |  | 
| 782 | 20 |  |  |  |  | 155 | for my $seq_id (keys %piece_table) { | 
| 783 | 100 |  |  |  |  | 185 | my $type_h = delete $piece_table{$seq_id}; | 
| 784 |  |  |  |  |  |  |  | 
| 785 | 100 |  |  |  |  | 280 | for my $type (keys %$type_h) { | 
| 786 | 100 |  |  |  |  | 190 | my $table_h = delete $type_h->{$type}; | 
| 787 | 100 |  |  |  |  | 175 | my $table = $table_h->{table}; | 
| 788 |  |  |  |  |  |  |  | 
| 789 |  |  |  |  |  |  | # Initialize the logical offset | 
| 790 | 100 |  |  |  |  | 330 | $table->calculate_logical_offset; | 
| 791 |  |  |  |  |  |  |  | 
| 792 |  |  |  |  |  |  | # Get the new size | 
| 793 | 100 |  |  |  |  | 2555 | my $new_size = $table->logical_len; | 
| 794 |  |  |  |  |  |  |  | 
| 795 | 100 | 50 |  |  |  | 2540 | unless ($self->truncate) { | 
| 796 | 100 |  |  |  |  | 2365 | my $class = ref $self->seq; | 
| 797 |  |  |  |  |  |  |  | 
| 798 | 100 | 100 |  |  |  | 360 | if ($class eq 'App::Sandy::Seq::SingleEnd') { | 
|  |  | 50 |  |  |  |  |  | 
| 799 | 50 | 50 |  |  |  | 1185 | if ($new_size < $self->seq->read_mean) { | 
| 800 | 0 |  |  |  |  | 0 | log_msg ":: Skip '$seq_id:$type': So many deletions resulted in a sequence lesser than the required read-mean"; | 
| 801 | 0 |  |  |  |  | 0 | next; | 
| 802 |  |  |  |  |  |  | } | 
| 803 |  |  |  |  |  |  | } elsif ($class eq 'App::Sandy::Seq::PairedEnd') { | 
| 804 | 50 | 50 |  |  |  | 1175 | if ($new_size < $self->seq->fragment_mean) { | 
| 805 | 0 |  |  |  |  | 0 | log_msg ":: Skip '$seq_id:$type': So many deletions resulted in a sequence lesser than the required fragment mean"; | 
| 806 | 0 |  |  |  |  | 0 | next; | 
| 807 |  |  |  |  |  |  | } | 
| 808 |  |  |  |  |  |  | } else { | 
| 809 | 0 |  |  |  |  | 0 | die "No valid options for 'seq'"; | 
| 810 |  |  |  |  |  |  | } | 
| 811 |  |  |  |  |  |  | } | 
| 812 |  |  |  |  |  |  |  | 
| 813 |  |  |  |  |  |  | # If all's right | 
| 814 | 100 |  |  |  |  | 300 | $table_h->{size} = $new_size; | 
| 815 | 100 |  |  |  |  | 330 | $type_h->{$type} = $table_h; | 
| 816 |  |  |  |  |  |  | } | 
| 817 |  |  |  |  |  |  |  | 
| 818 |  |  |  |  |  |  | # if there is at least one type, | 
| 819 |  |  |  |  |  |  | # then return it to the piece_table | 
| 820 | 100 | 50 |  |  |  | 235 | if (%$type_h) { | 
| 821 | 100 |  |  |  |  | 230 | $piece_table{$seq_id} = $type_h; | 
| 822 |  |  |  |  |  |  |  | 
| 823 |  |  |  |  |  |  | # else, just remove it! | 
| 824 |  |  |  |  |  |  | } else { | 
| 825 | 0 |  |  |  |  | 0 | push @blacklist => $seq_id; | 
| 826 |  |  |  |  |  |  | } | 
| 827 |  |  |  |  |  |  | } | 
| 828 |  |  |  |  |  |  |  | 
| 829 | 20 | 50 |  |  |  | 110 | unless (%piece_table) { | 
| 830 | 0 |  |  |  |  | 0 | die "All fasta entries were removed due to deletions. ", | 
| 831 |  |  |  |  |  |  | "Please, verify the genomic variation '$genomic_variation'\n"; | 
| 832 |  |  |  |  |  |  | } | 
| 833 |  |  |  |  |  |  |  | 
| 834 |  |  |  |  |  |  | # If fasta_rtree has entries | 
| 835 | 20 | 50 |  |  |  | 740 | unless ($self->_has_no_fasta_rtree) { | 
| 836 |  |  |  |  |  |  | # Remove no valid entries from id -> pid relation | 
| 837 | 0 | 0 |  |  |  | 0 | $self->_delete_fasta_rtree(@blacklist) if @blacklist; | 
| 838 |  |  |  |  |  |  | } | 
| 839 |  |  |  |  |  |  |  | 
| 840 |  |  |  |  |  |  | # Make the id -> pid relationship | 
| 841 | 20 |  |  |  |  | 85 | $self->_populate_fasta_tree; | 
| 842 |  |  |  |  |  |  |  | 
| 843 |  |  |  |  |  |  | # HASH -> SEQ_ID -> @(REF @ALT) -> @(TABLE SIZE) | 
| 844 | 20 |  |  |  |  | 530 | return \%piece_table; | 
| 845 |  |  |  |  |  |  | } | 
| 846 |  |  |  |  |  |  |  | 
| 847 |  |  |  |  |  |  | sub _populate_piece_table { | 
| 848 | 0 |  |  | 0 |  | 0 | my ($self, $table, $snvs) = @_; | 
| 849 |  |  |  |  |  |  |  | 
| 850 | 0 |  |  |  |  | 0 | for my $snv (@$snvs) { | 
| 851 |  |  |  |  |  |  | # If there is an ID, make sure that it is not a comma, colon | 
| 852 |  |  |  |  |  |  | # separated list. Else, make sure to keep the ref/alt length | 
| 853 |  |  |  |  |  |  | # to max 25+25+1=51 | 
| 854 |  |  |  |  |  |  | my $annot = defined $snv->{id} && $snv->{id} ne '.' | 
| 855 |  |  |  |  |  |  | ? sprintf "%d:%s" => $snv->{pos} + 1, (split(/[,;]/, $snv->{id}))[0] | 
| 856 | 0 | 0 | 0 |  |  | 0 | : sprintf "%d:%.25s/%.25s" => $snv->{pos} + 1, $snv->{ref}, $snv->{alt}; | 
| 857 |  |  |  |  |  |  |  | 
| 858 |  |  |  |  |  |  | # Insertion | 
| 859 | 0 | 0 |  |  |  | 0 | if ($snv->{ref} eq '-') { | 
|  |  | 0 |  |  |  |  |  | 
| 860 | 0 |  |  |  |  | 0 | $table->insert(\$snv->{alt}, $snv->{pos}, $annot); | 
| 861 |  |  |  |  |  |  |  | 
| 862 |  |  |  |  |  |  | # Deletion | 
| 863 |  |  |  |  |  |  | } elsif ($snv->{alt} eq '-') { | 
| 864 | 0 |  |  |  |  | 0 | $table->delete($snv->{pos}, length $snv->{ref}, $annot); | 
| 865 |  |  |  |  |  |  |  | 
| 866 |  |  |  |  |  |  | # Change | 
| 867 |  |  |  |  |  |  | } else { | 
| 868 | 0 |  |  |  |  | 0 | $table->change(\$snv->{alt}, $snv->{pos}, length $snv->{ref}, $annot); | 
| 869 |  |  |  |  |  |  | } | 
| 870 |  |  |  |  |  |  | } | 
| 871 |  |  |  |  |  |  | } | 
| 872 |  |  |  |  |  |  |  | 
| 873 |  |  |  |  |  |  | sub _validate_indexed_snv_against_fasta { | 
| 874 | 0 |  |  | 0 |  | 0 | my ($self, $indexed_snv) = @_; | 
| 875 |  |  |  |  |  |  |  | 
| 876 | 0 |  |  |  |  | 0 | my $indexed_fasta = $self->_fasta; | 
| 877 | 0 |  |  |  |  | 0 | my $genomic_variation = $self->_genomic_variation_names; | 
| 878 |  |  |  |  |  |  |  | 
| 879 | 0 |  |  |  |  | 0 | for my $std_seq_id (keys %$indexed_snv) { | 
| 880 | 0 |  |  |  |  | 0 | my $snvs = delete $indexed_snv->{$std_seq_id}; | 
| 881 | 0 |  |  |  |  | 0 | my $seq_id = $self->_get_seqname($std_seq_id); | 
| 882 |  |  |  |  |  |  |  | 
| 883 | 0 | 0 | 0 |  |  | 0 | unless (defined $seq_id && exists $indexed_fasta->{$seq_id}) { | 
| 884 | 0 |  |  |  |  | 0 | next; | 
| 885 |  |  |  |  |  |  | } | 
| 886 |  |  |  |  |  |  |  | 
| 887 | 0 |  |  |  |  | 0 | my $seq = \$indexed_fasta->{$seq_id}{seq}; | 
| 888 | 0 |  |  |  |  | 0 | my $size = $indexed_fasta->{$seq_id}{size}; | 
| 889 |  |  |  |  |  |  |  | 
| 890 | 0 |  |  |  |  | 0 | my @saved_snvs; | 
| 891 |  |  |  |  |  |  |  | 
| 892 | 0 |  |  |  |  | 0 | for my $snv (@$snvs) { | 
| 893 |  |  |  |  |  |  | # Insertions may accur until one base after the | 
| 894 |  |  |  |  |  |  | # end of the sequence, not more | 
| 895 | 0 | 0 | 0 |  |  | 0 | if (($snv->{ref} eq '-' && $snv->{pos} > $size) || ($snv->{ref} ne '-' && $snv->{pos} >= $size)) { | 
|  |  | 0 | 0 |  |  |  |  | 
|  |  |  | 0 |  |  |  |  | 
| 896 |  |  |  |  |  |  | log_msg sprintf ":: In validating '%s': Position, %s/%s at %s:%d, outside fasta sequence", | 
| 897 | 0 |  |  |  |  | 0 | $genomic_variation, $snv->{ref}, $snv->{alt}, $seq_id, $snv->{pos} + 1; | 
| 898 |  |  |  |  |  |  |  | 
| 899 |  |  |  |  |  |  | # Next snv | 
| 900 | 0 |  |  |  |  | 0 | next; | 
| 901 |  |  |  |  |  |  | # Deletions and changes. Just verify if the reference exists | 
| 902 |  |  |  |  |  |  | } elsif ($snv->{ref} ne '-') { | 
| 903 | 0 |  |  |  |  | 0 | my $ref = substr $$seq, $snv->{pos}, length($snv->{ref}); | 
| 904 |  |  |  |  |  |  |  | 
| 905 | 0 | 0 |  |  |  | 0 | if (uc($ref) ne uc($snv->{ref})) { | 
| 906 |  |  |  |  |  |  | log_msg sprintf ":: In validating '%s': Not found reference '%s' at fasta position %s:%d", | 
| 907 | 0 |  |  |  |  | 0 | $genomic_variation, $snv->{ref}, $seq_id, $snv->{pos} + 1; | 
| 908 |  |  |  |  |  |  |  | 
| 909 |  |  |  |  |  |  | # Next snv | 
| 910 | 0 |  |  |  |  | 0 | next; | 
| 911 |  |  |  |  |  |  | } | 
| 912 |  |  |  |  |  |  | } | 
| 913 |  |  |  |  |  |  |  | 
| 914 | 0 |  |  |  |  | 0 | push @saved_snvs  => $snv; | 
| 915 |  |  |  |  |  |  | } | 
| 916 |  |  |  |  |  |  |  | 
| 917 | 0 | 0 |  |  |  | 0 | if (@saved_snvs) { | 
| 918 | 0 |  |  |  |  | 0 | $indexed_snv->{$std_seq_id} = [@saved_snvs]; | 
| 919 |  |  |  |  |  |  | } | 
| 920 |  |  |  |  |  |  | } | 
| 921 |  |  |  |  |  |  | } | 
| 922 |  |  |  |  |  |  |  | 
| 923 |  |  |  |  |  |  | sub _calculate_number_of_reads { | 
| 924 | 8 |  |  | 8 |  | 19 | my $self = shift; | 
| 925 | 8 |  |  |  |  | 16 | my $number_of_reads; | 
| 926 |  |  |  |  |  |  |  | 
| 927 | 8 | 50 |  |  |  | 239 | if ($self->count_loops_by eq 'coverage') { | 
|  |  | 0 |  |  |  |  |  | 
| 928 |  |  |  |  |  |  | # It is needed to calculate the genome size | 
| 929 | 8 |  |  |  |  | 259 | my $fasta = $self->_fasta; | 
| 930 | 8 |  |  |  |  | 27 | my $fasta_size = 0; | 
| 931 | 8 |  |  |  |  | 29 | $fasta_size += $fasta->{$_}{size} for keys %{ $fasta }; | 
|  | 8 |  |  |  |  | 101 |  | 
| 932 | 8 |  |  |  |  | 239 | $number_of_reads = int(($fasta_size * $self->coverage) / $self->seq->read_mean); | 
| 933 |  |  |  |  |  |  | # In case it is paired-end read, divide the number of reads by 2 because | 
| 934 |  |  |  |  |  |  | # App::Sandy::Seq::PairedEnd class returns 2 reads at time | 
| 935 | 8 | 100 |  |  |  | 215 | $number_of_reads = int($number_of_reads / 2) | 
| 936 |  |  |  |  |  |  | if ref($self->seq) eq 'App::Sandy::Seq::PairedEnd'; | 
| 937 |  |  |  |  |  |  | } elsif ($self->count_loops_by eq 'number-of-reads') { | 
| 938 | 0 |  |  |  |  | 0 | $number_of_reads = $self->number_of_reads; | 
| 939 |  |  |  |  |  |  | } else { | 
| 940 | 0 |  |  |  |  | 0 | croak sprintf "Unknown option '%s' for calculating the number of reads\n", | 
| 941 |  |  |  |  |  |  | $self->count_loops_by; | 
| 942 |  |  |  |  |  |  | } | 
| 943 |  |  |  |  |  |  |  | 
| 944 |  |  |  |  |  |  | # Maybe the number_of_reads is zero. It may occur due to the low coverage and/or fasta_file size | 
| 945 | 8 | 50 |  |  |  | 30 | if ($number_of_reads <= 0) { | 
| 946 | 0 |  |  |  |  | 0 | die "The computed number of reads is equal to zero.\n" . | 
| 947 |  |  |  |  |  |  | "It may occur due to the low coverage, fasta-file sequence size or number of reads directly passed by the user\n"; | 
| 948 |  |  |  |  |  |  | } | 
| 949 |  |  |  |  |  |  |  | 
| 950 | 8 |  |  |  |  | 24 | return $number_of_reads; | 
| 951 |  |  |  |  |  |  | } | 
| 952 |  |  |  |  |  |  |  | 
| 953 |  |  |  |  |  |  | sub _calculate_parent_count { | 
| 954 | 0 |  |  | 0 |  | 0 | my ($self, $counter_ref) = @_; | 
| 955 | 0 | 0 |  |  |  | 0 | return if $self->_has_no_fasta_rtree; | 
| 956 |  |  |  |  |  |  |  | 
| 957 | 0 |  |  |  |  | 0 | my %parent_count; | 
| 958 |  |  |  |  |  |  |  | 
| 959 | 0 |  |  |  |  | 0 | while (my ($id, $count) = each %$counter_ref) { | 
| 960 | 0 |  |  |  |  | 0 | my $pid = $self->_get_fasta_rtree($id); | 
| 961 | 0 | 0 |  |  |  | 0 | $parent_count{$pid} += $count if defined $pid; | 
| 962 |  |  |  |  |  |  | } | 
| 963 |  |  |  |  |  |  |  | 
| 964 | 0 |  |  |  |  | 0 | return \%parent_count; | 
| 965 |  |  |  |  |  |  | } | 
| 966 |  |  |  |  |  |  |  | 
| 967 |  |  |  |  |  |  | sub run_simulation { | 
| 968 | 8 |  |  | 8 | 0 | 3764 | my $self = shift; | 
| 969 | 8 |  |  |  |  | 334 | my $piece_table = $self->_piece_table; | 
| 970 |  |  |  |  |  |  |  | 
| 971 |  |  |  |  |  |  | # Calculate the number of reads to be generated | 
| 972 | 8 |  |  |  |  | 63 | my $number_of_reads = $self->_calculate_number_of_reads; | 
| 973 |  |  |  |  |  |  |  | 
| 974 |  |  |  |  |  |  | # Function that returns strand by strand_bias | 
| 975 | 8 |  |  |  |  | 253 | my $strand = $self->_strand; | 
| 976 |  |  |  |  |  |  |  | 
| 977 |  |  |  |  |  |  | # Function that returns seqid by seqid_weight | 
| 978 | 8 |  |  |  |  | 265 | my $seqid = $self->_seqid_raffle; | 
| 979 |  |  |  |  |  |  |  | 
| 980 |  |  |  |  |  |  | # Count file to be generated | 
| 981 | 8 |  |  |  |  | 249 | my %count_file = ( | 
| 982 |  |  |  |  |  |  | transcripts => $self->prefix . '_abundance_transcripts.tsv', | 
| 983 |  |  |  |  |  |  | genes       => $self->prefix . '_abundance_genes.tsv', | 
| 984 |  |  |  |  |  |  | coverage    => $self->prefix . '_coverage.tsv' | 
| 985 |  |  |  |  |  |  | ); | 
| 986 |  |  |  |  |  |  |  | 
| 987 |  |  |  |  |  |  | # Main files | 
| 988 | 8 |  |  |  |  | 218 | my %files = ( | 
| 989 |  |  |  |  |  |  | bam                  => [ | 
| 990 |  |  |  |  |  |  | $self->prefix . '.bam' | 
| 991 |  |  |  |  |  |  | ], | 
| 992 |  |  |  |  |  |  | sam                  => [ | 
| 993 |  |  |  |  |  |  | $self->prefix . '.sam' | 
| 994 |  |  |  |  |  |  | ], | 
| 995 |  |  |  |  |  |  | single_fastq         => [ | 
| 996 |  |  |  |  |  |  | $self->prefix . '_R1_001.fastq' | 
| 997 |  |  |  |  |  |  | ], | 
| 998 |  |  |  |  |  |  | single_fastq_gz      => [ | 
| 999 |  |  |  |  |  |  | $self->prefix . '_R1_001.fastq.gz' | 
| 1000 |  |  |  |  |  |  | ], | 
| 1001 |  |  |  |  |  |  | join_paired_fastq    => [ | 
| 1002 |  |  |  |  |  |  | $self->prefix . '.fastq' | 
| 1003 |  |  |  |  |  |  | ], | 
| 1004 |  |  |  |  |  |  | join_paired_fastq_gz => [ | 
| 1005 |  |  |  |  |  |  | $self->prefix . '.fastq.gz' | 
| 1006 |  |  |  |  |  |  | ], | 
| 1007 |  |  |  |  |  |  | paired_fastq         => [ | 
| 1008 |  |  |  |  |  |  | $self->prefix . '_R1_001.fastq', | 
| 1009 |  |  |  |  |  |  | $self->prefix . '_R2_001.fastq' | 
| 1010 |  |  |  |  |  |  | ], | 
| 1011 |  |  |  |  |  |  | paired_fastq_gz      => [ | 
| 1012 |  |  |  |  |  |  | $self->prefix . '_R1_001.fastq.gz', | 
| 1013 |  |  |  |  |  |  | $self->prefix . '_R2_001.fastq.gz' | 
| 1014 |  |  |  |  |  |  | ] | 
| 1015 |  |  |  |  |  |  | ); | 
| 1016 |  |  |  |  |  |  |  | 
| 1017 |  |  |  |  |  |  | # Set the file class in order to know | 
| 1018 |  |  |  |  |  |  | # how to deal with all files options | 
| 1019 | 8 |  |  |  |  | 222 | my $seq_class = ref $self->seq; | 
| 1020 | 8 |  |  |  |  | 271 | my $output_format = $self->output_format; | 
| 1021 | 8 |  |  |  |  | 22 | my $file_class; | 
| 1022 |  |  |  |  |  |  |  | 
| 1023 |  |  |  |  |  |  | # This mess is necessary to catch the | 
| 1024 |  |  |  |  |  |  | # right value into the %files hash | 
| 1025 | 8 | 50 |  |  |  | 113 | if ($output_format =~ /(sam|bam)/) { | 
|  |  | 50 |  |  |  |  |  | 
| 1026 | 0 |  |  |  |  | 0 | $file_class = $output_format; | 
| 1027 |  |  |  |  |  |  | } elsif ($output_format =~ /fastq/) { | 
| 1028 | 8 | 100 |  |  |  | 39 | if ($seq_class eq 'App::Sandy::Seq::SingleEnd') { | 
|  |  | 50 |  |  |  |  |  | 
| 1029 | 5 |  |  |  |  | 15 | $file_class = 'single_fastq'; | 
| 1030 |  |  |  |  |  |  | } elsif ($seq_class eq 'App::Sandy::Seq::PairedEnd') { | 
| 1031 | 3 |  |  |  |  | 15 | $file_class = 'paired_fastq'; | 
| 1032 | 3 | 50 |  |  |  | 150 | $file_class = "join_$file_class" if $self->join_paired_ends; | 
| 1033 |  |  |  |  |  |  | } else { | 
| 1034 | 0 |  |  |  |  | 0 | croak "Something wrong with the seq class: $seq_class"; | 
| 1035 |  |  |  |  |  |  | } | 
| 1036 | 8 | 50 |  |  |  | 30 | if ($output_format eq 'fastq.gz') { | 
| 1037 | 0 |  |  |  |  | 0 | $file_class .= '_gz'; | 
| 1038 |  |  |  |  |  |  | } | 
| 1039 |  |  |  |  |  |  | } else { | 
| 1040 | 0 |  |  |  |  | 0 | croak "Something wrong with the output format: $output_format"; | 
| 1041 |  |  |  |  |  |  | } | 
| 1042 |  |  |  |  |  |  |  | 
| 1043 |  |  |  |  |  |  | # Forks | 
| 1044 | 8 |  |  |  |  | 253 | my $number_of_jobs = $self->jobs; | 
| 1045 | 8 |  |  |  |  | 159 | my $pm = Parallel::ForkManager->new($number_of_jobs); | 
| 1046 |  |  |  |  |  |  |  | 
| 1047 |  |  |  |  |  |  | # Parent child pids | 
| 1048 | 8 |  |  |  |  | 27375 | my $parent_pid = $$; | 
| 1049 | 8 |  |  |  |  | 24 | my @child_pid; | 
| 1050 |  |  |  |  |  |  |  | 
| 1051 |  |  |  |  |  |  | # Temporary files tracker | 
| 1052 |  |  |  |  |  |  | my @tmp_files; | 
| 1053 |  |  |  |  |  |  |  | 
| 1054 |  |  |  |  |  |  | # Run in parent right after creating child process | 
| 1055 |  |  |  |  |  |  | $pm->run_on_start( | 
| 1056 |  |  |  |  |  |  | sub { | 
| 1057 | 10 |  |  | 10 |  | 31570 | my $pid = shift; | 
| 1058 | 10 |  |  |  |  | 303 | push @child_pid => $pid; | 
| 1059 |  |  |  |  |  |  | } | 
| 1060 | 8 |  |  |  |  | 112 | ); | 
| 1061 |  |  |  |  |  |  |  | 
| 1062 |  |  |  |  |  |  | # Count the overall cumulative number of reads for each seqid | 
| 1063 | 8 |  |  |  |  | 65 | my %counters; | 
| 1064 |  |  |  |  |  |  |  | 
| 1065 |  |  |  |  |  |  | # Run in parent right after finishing child process | 
| 1066 |  |  |  |  |  |  | $pm->run_on_finish( | 
| 1067 |  |  |  |  |  |  | sub { | 
| 1068 | 8 |  |  | 8 |  | 40076476 | my ($pid, $exit_code, $ident, $exit_signal, $core_dump, $counter_ref) = @_; | 
| 1069 | 8 |  |  |  |  | 121 | while (my ($seqid, $count) = each %$counter_ref) { | 
| 1070 | 40 |  |  |  |  | 267 | $counters{$seqid} += $count; | 
| 1071 |  |  |  |  |  |  | } | 
| 1072 |  |  |  |  |  |  | } | 
| 1073 | 8 |  |  |  |  | 69 | ); | 
| 1074 |  |  |  |  |  |  |  | 
| 1075 | 8 | 50 |  |  |  | 166 | log_msg sprintf ":: Creating %d child %s ...", | 
| 1076 |  |  |  |  |  |  | $number_of_jobs, $number_of_jobs == 1 ? "job" : "jobs"; | 
| 1077 |  |  |  |  |  |  |  | 
| 1078 | 8 |  |  |  |  | 35 | for my $tid (1..$number_of_jobs) { | 
| 1079 |  |  |  |  |  |  | #------------------------------------------------------------------------------- | 
| 1080 |  |  |  |  |  |  | # Inside parent | 
| 1081 |  |  |  |  |  |  | #------------------------------------------------------------------------------- | 
| 1082 | 14 |  |  |  |  | 661 | log_msg ":: Creating job $tid ..."; | 
| 1083 | 14 |  |  |  |  | 28 | my @files_t = map { "$_.${parent_pid}.part$tid" } @{ $files{$file_class} }; | 
|  | 19 |  |  |  |  | 159 |  | 
|  | 14 |  |  |  |  | 188 |  | 
| 1084 | 14 |  |  |  |  | 78 | push @tmp_files => @files_t; | 
| 1085 | 14 | 100 |  |  |  | 113 | my $pid = $pm->start and next; | 
| 1086 |  |  |  |  |  |  |  | 
| 1087 |  |  |  |  |  |  | #------------------------------------------------------------------------------- | 
| 1088 |  |  |  |  |  |  | # Inside child | 
| 1089 |  |  |  |  |  |  | #------------------------------------------------------------------------------- | 
| 1090 |  |  |  |  |  |  | # Intelace child/parent processes | 
| 1091 | 4 |  |  |  |  | 19924 | my $sig = App::Sandy::InterlaceProcesses->new(foreign_pid => [$parent_pid]); | 
| 1092 |  |  |  |  |  |  |  | 
| 1093 |  |  |  |  |  |  | # Get the FASTA blacklist regions ID => @{ TUPLE } | 
| 1094 | 4 |  |  |  |  | 421 | my $fasta_blacklist = $self->_fasta_blacklist; | 
| 1095 |  |  |  |  |  |  |  | 
| 1096 |  |  |  |  |  |  | # Set child RNG | 
| 1097 | 4 |  |  |  |  | 302 | my $rng = App::Sandy::RNG->new($self->seed + $tid); | 
| 1098 |  |  |  |  |  |  |  | 
| 1099 |  |  |  |  |  |  | # Calculate the number of reads to this job and correct this local index | 
| 1100 |  |  |  |  |  |  | # to the global index | 
| 1101 | 4 |  |  |  |  | 129 | my $number_of_reads_t = int($number_of_reads/$number_of_jobs); | 
| 1102 | 4 |  |  |  |  | 57 | my $last_read_idx = $number_of_reads_t * $tid; | 
| 1103 | 4 |  |  |  |  | 64 | my $idx = $last_read_idx - $number_of_reads_t + 1; | 
| 1104 |  |  |  |  |  |  |  | 
| 1105 |  |  |  |  |  |  | # If it is the last job, make it work on the leftover reads of int() truncation | 
| 1106 | 4 | 100 |  |  |  | 62 | $last_read_idx += $number_of_reads % $number_of_jobs | 
| 1107 |  |  |  |  |  |  | if $tid == $number_of_jobs; | 
| 1108 |  |  |  |  |  |  |  | 
| 1109 | 4 |  |  |  |  | 247 | log_msg "  => Job $tid: Working on sequences from $idx to $last_read_idx"; | 
| 1110 |  |  |  |  |  |  |  | 
| 1111 |  |  |  |  |  |  | # Create temporary files | 
| 1112 | 4 |  |  |  |  | 40 | log_msg "  => Job $tid: Creating temporary file: @files_t"; | 
| 1113 |  |  |  |  |  |  |  | 
| 1114 |  |  |  |  |  |  | # And here we go ... | 
| 1115 | 4 |  |  |  |  | 26 | my @fhs; | 
| 1116 |  |  |  |  |  |  |  | 
| 1117 |  |  |  |  |  |  | # Set the right filehandle format | 
| 1118 | 4 | 50 |  |  |  | 219 | if ($output_format =~ /^(sam|fastq)$/) { | 
|  |  | 0 |  |  |  |  |  | 
|  |  | 0 |  |  |  |  |  | 
| 1119 | 4 |  |  |  |  | 38 | @fhs = map { $self->with_open_w($_, 0) } @files_t; | 
|  | 6 |  |  |  |  | 268 |  | 
| 1120 |  |  |  |  |  |  | } elsif ($output_format eq 'fastq.gz') { | 
| 1121 | 0 |  |  |  |  | 0 | @fhs = map { $self->with_open_w($_, $self->compression_level) } @files_t; | 
|  | 0 |  |  |  |  | 0 |  | 
| 1122 |  |  |  |  |  |  | } elsif ($output_format eq 'bam') { | 
| 1123 | 0 |  |  |  |  | 0 | @fhs = map { $self->with_open_bam_w($_, $self->compression_level) } @files_t; | 
|  | 0 |  |  |  |  | 0 |  | 
| 1124 |  |  |  |  |  |  | } else { | 
| 1125 | 0 |  |  |  |  | 0 | croak "Something wrong with the output format: $file_class"; | 
| 1126 |  |  |  |  |  |  | } | 
| 1127 |  |  |  |  |  |  |  | 
| 1128 |  |  |  |  |  |  | # sprint_seq gives two entries for paired-emd, so | 
| 1129 |  |  |  |  |  |  | # if it is a bam|sam|join-paired-ends, it is necessary | 
| 1130 |  |  |  |  |  |  | # to copy the filehandle in order to print both entries | 
| 1131 |  |  |  |  |  |  | # to the same file | 
| 1132 | 4 | 50 | 66 |  |  | 120 | if ($seq_class eq 'App::Sandy::Seq::PairedEnd' | 
| 1133 |  |  |  |  |  |  | && $file_class =~ /(sam|bam|join)/) { | 
| 1134 | 0 |  |  |  |  | 0 | $fhs[1] = $fhs[0]; | 
| 1135 |  |  |  |  |  |  | } | 
| 1136 |  |  |  |  |  |  |  | 
| 1137 |  |  |  |  |  |  | # Count the cumulative number of reads for each seqid | 
| 1138 | 4 |  |  |  |  | 22 | my %counter; | 
| 1139 |  |  |  |  |  |  |  | 
| 1140 |  |  |  |  |  |  | # If the output format is 'bam|sam' and it is the first job, then | 
| 1141 |  |  |  |  |  |  | # write the header | 
| 1142 | 4 | 50 | 33 |  |  | 81 | if ($output_format =~ /^(sam|bam)$/ && $tid == 1) { | 
| 1143 | 0 |  |  |  |  | 0 | my $header_ref = $self->gen_sam_header($self->argv); | 
| 1144 | 0 |  |  |  |  | 0 | print {$fhs[0]} "$$header_ref"; | 
|  | 0 |  |  |  |  | 0 |  | 
| 1145 |  |  |  |  |  |  | } | 
| 1146 |  |  |  |  |  |  |  | 
| 1147 |  |  |  |  |  |  | # Run simulation in child | 
| 1148 | 4 |  | 66 |  |  | 445 | for (my $i = $idx; $i <= $last_read_idx and not $sig->signal_catched; $i++) { | 
| 1149 | 6840 |  |  |  |  | 15660 | my $id = $seqid->($rng); | 
| 1150 | 6840 |  |  |  |  | 16968 | my $blacklist = $fasta_blacklist->{$id->{seq_id}}; | 
| 1151 | 6840 |  |  |  |  | 15738 | my $ptable = $piece_table->{$id->{seq_id}}{$id->{type}}; | 
| 1152 | 6840 |  |  |  |  | 9693 | my @seq_entry; | 
| 1153 |  |  |  |  |  |  | try { | 
| 1154 |  |  |  |  |  |  | @seq_entry = $self->sprint_seq($tid, $i, $id->{seq_id}, $id->{type}, | 
| 1155 | 6840 |  |  | 6840 |  | 434086 | $ptable->{table}, $ptable->{size}, $strand->($rng), $rng, $blacklist); | 
| 1156 |  |  |  |  |  |  | } catch { | 
| 1157 | 0 |  |  | 0 |  | 0 | die "Not defined entry for seqid '>$id->{seq_id}' at job $tid: $_"; | 
| 1158 |  |  |  |  |  |  | } finally { | 
| 1159 | 6840 | 50 |  | 6840 |  | 143378 | unless (@_) { | 
| 1160 | 6840 |  |  |  |  | 18645 | for my $fh_idx (0..$#fhs) { | 
| 1161 | 9120 |  |  |  |  | 19306 | $counter{$id->{seq_id}}++; | 
| 1162 | 9120 |  |  |  |  | 11998 | print {$fhs[$fh_idx]} "${$seq_entry[$fh_idx]}"; | 
|  | 9120 |  |  |  |  | 16510 |  | 
|  | 9120 |  |  |  |  | 44405 |  | 
| 1163 |  |  |  |  |  |  | } | 
| 1164 |  |  |  |  |  |  | } | 
| 1165 | 6840 |  |  |  |  | 53388 | }; | 
| 1166 |  |  |  |  |  |  | } | 
| 1167 |  |  |  |  |  |  |  | 
| 1168 | 4 |  |  |  |  | 123 | log_msg "  => Job $tid: Writing and closing file: @files_t"; | 
| 1169 |  |  |  |  |  |  |  | 
| 1170 |  |  |  |  |  |  | # Close temporary files | 
| 1171 |  |  |  |  |  |  | # Get index from @files_t in order to avoid | 
| 1172 |  |  |  |  |  |  | # close the same filehandle twice - When the | 
| 1173 |  |  |  |  |  |  | # position 1-N is a copy | 
| 1174 | 4 |  |  |  |  | 14 | for my $fh_idx (0..$#files_t) { | 
| 1175 | 6 |  |  |  |  | 284 | close $fhs[$fh_idx]; | 
| 1176 |  |  |  |  |  |  | } | 
| 1177 |  |  |  |  |  |  |  | 
| 1178 |  |  |  |  |  |  | # If it is a bam and it is the last loop, then | 
| 1179 |  |  |  |  |  |  | # write a eof marker | 
| 1180 | 4 | 50 | 33 |  |  | 49 | if ($output_format eq 'bam' && $tid == $number_of_jobs) { | 
| 1181 | 0 |  |  |  |  | 0 | $self->gen_eof_marker($files_t[0]); | 
| 1182 |  |  |  |  |  |  | } | 
| 1183 |  |  |  |  |  |  |  | 
| 1184 |  |  |  |  |  |  | # Child exit | 
| 1185 | 4 |  |  |  |  | 82 | log_msg "  => Job $tid is finished"; | 
| 1186 | 4 |  |  |  |  | 129 | $pm->finish(0, \%counter); | 
| 1187 |  |  |  |  |  |  | } | 
| 1188 |  |  |  |  |  |  |  | 
| 1189 |  |  |  |  |  |  | # Back to parent | 
| 1190 |  |  |  |  |  |  | # Interlace parent/child(s) processes | 
| 1191 | 4 |  |  |  |  | 1254 | my $sig = App::Sandy::InterlaceProcesses->new(foreign_pid => \@child_pid); | 
| 1192 | 4 |  |  |  |  | 74 | $pm->wait_all_children; | 
| 1193 |  |  |  |  |  |  |  | 
| 1194 | 4 | 50 |  |  |  | 598 | if ($sig->signal_catched) { | 
| 1195 | 0 |  |  |  |  | 0 | log_msg ":: Termination signal received!"; | 
| 1196 |  |  |  |  |  |  | } | 
| 1197 |  |  |  |  |  |  |  | 
| 1198 | 4 |  |  |  |  | 118 | log_msg ":: Saving the work ..."; | 
| 1199 |  |  |  |  |  |  |  | 
| 1200 |  |  |  |  |  |  | # Concatenate all temporary files | 
| 1201 | 4 |  |  |  |  | 36 | log_msg ":: Concatenate all temporary files"; | 
| 1202 |  |  |  |  |  |  |  | 
| 1203 |  |  |  |  |  |  | # Save time. Rename tmp_file (1,2) | 
| 1204 | 4 |  |  |  |  | 13 | for my $file (@{ $files{$file_class} }) { | 
|  | 4 |  |  |  |  | 80 |  | 
| 1205 | 5 |  |  |  |  | 76 | my $tmp = shift @tmp_files; | 
| 1206 | 5 |  |  |  |  | 63 | log_msg "  => Concatenating $tmp to $file ..."; | 
| 1207 | 5 | 50 |  |  |  | 487 | rename $tmp => $file | 
| 1208 |  |  |  |  |  |  | or die "Cannot create '$file': $!\n"; | 
| 1209 |  |  |  |  |  |  | } | 
| 1210 |  |  |  |  |  |  |  | 
| 1211 |  |  |  |  |  |  | # Append to renamed tmp files | 
| 1212 | 4 |  |  |  |  | 45 | my @fh = map { $self->with_open_a($_) } @{ $files{$file_class} }; | 
|  | 5 |  |  |  |  | 174 |  | 
|  | 4 |  |  |  |  | 30 |  | 
| 1213 |  |  |  |  |  |  |  | 
| 1214 | 4 |  |  |  |  | 44 | for my $i (0..$#tmp_files) { | 
| 1215 | 5 |  |  |  |  | 27 | my $fh_idx = $i % scalar @fh; | 
| 1216 |  |  |  |  |  |  |  | 
| 1217 | 5 |  |  |  |  | 91 | log_msg "  => Concatenating $tmp_files[$i] to $files{$file_class}[$fh_idx] ..."; | 
| 1218 | 5 | 50 |  |  |  | 118 | cat $tmp_files[$i] => $fh[$fh_idx] | 
| 1219 |  |  |  |  |  |  | or die "Cannot concatenate $tmp_files[$i] to $files{$file_class}[$fh_idx]: $!\n"; | 
| 1220 |  |  |  |  |  |  |  | 
| 1221 |  |  |  |  |  |  | # Clean up the mess | 
| 1222 | 5 | 50 |  |  |  | 54438 | unlink $tmp_files[$i] | 
| 1223 |  |  |  |  |  |  | or die "Cannot remove temporary file '$tmp_files[$i]': $!\n"; | 
| 1224 |  |  |  |  |  |  | } | 
| 1225 |  |  |  |  |  |  |  | 
| 1226 |  |  |  |  |  |  | # Close files | 
| 1227 | 4 |  |  |  |  | 25 | log_msg ":: Writing and closing output file: @{ $files{$file_class} }"; | 
|  | 4 |  |  |  |  | 82 |  | 
| 1228 | 4 |  |  |  |  | 23 | for my $fh_idx (0..$#fh) { | 
| 1229 | 5 | 50 |  |  |  | 133 | close $fh[$fh_idx] | 
| 1230 |  |  |  |  |  |  | or die "Cannot write file $files{$file_class}[$fh_idx]: $!\n"; | 
| 1231 |  |  |  |  |  |  | } | 
| 1232 |  |  |  |  |  |  |  | 
| 1233 | 4 | 50 |  |  |  | 396 | if ($self->count_loops_by eq 'number-of-reads') { | 
| 1234 |  |  |  |  |  |  | # It is necessary to correct the abundance according to | 
| 1235 |  |  |  |  |  |  | # fragment sequencing end | 
| 1236 | 0 | 0 |  |  |  | 0 | my $count_factor = ref($self->seq) eq 'App::Sandy::Seq::PairedEnd' | 
| 1237 |  |  |  |  |  |  | ? 2 | 
| 1238 |  |  |  |  |  |  | : 1; | 
| 1239 |  |  |  |  |  |  |  | 
| 1240 |  |  |  |  |  |  | # Save transcripts | 
| 1241 | 0 |  |  |  |  | 0 | log_msg ":: Saving transcripts count"; | 
| 1242 | 0 |  |  |  |  | 0 | my $fh = $self->with_open_w($count_file{transcripts}, 0); | 
| 1243 |  |  |  |  |  |  |  | 
| 1244 | 0 |  |  |  |  | 0 | log_msg "  => Writing counts to $count_file{transcripts} ..."; | 
| 1245 | 0 |  |  |  |  | 0 | for my $id (sort keys %counters) { | 
| 1246 | 0 |  |  |  |  | 0 | printf {$fh} "%s\t%d\n" => $id, | 
| 1247 | 0 |  |  |  |  | 0 | int($counters{$id} / $count_factor); | 
| 1248 |  |  |  |  |  |  | } | 
| 1249 |  |  |  |  |  |  |  | 
| 1250 |  |  |  |  |  |  | # Close transcripts file | 
| 1251 | 0 |  |  |  |  | 0 | log_msg ":; Writing and closing $count_file{transcripts} ..."; | 
| 1252 | 0 | 0 |  |  |  | 0 | close $fh | 
| 1253 |  |  |  |  |  |  | or die "Cannot write file $count_file{transcripts}: $!\n"; | 
| 1254 |  |  |  |  |  |  |  | 
| 1255 |  |  |  |  |  |  | # Calculate 'gene' like expression | 
| 1256 | 0 |  |  |  |  | 0 | my $parent_count = $self->_calculate_parent_count(\%counters); | 
| 1257 |  |  |  |  |  |  |  | 
| 1258 | 0 | 0 |  |  |  | 0 | if (%$parent_count) { | 
| 1259 |  |  |  |  |  |  | # Save genes | 
| 1260 | 0 |  |  |  |  | 0 | log_msg ":: Saving genes count"; | 
| 1261 | 0 |  |  |  |  | 0 | my $fh = $self->with_open_w($count_file{genes}, 0); | 
| 1262 |  |  |  |  |  |  |  | 
| 1263 | 0 |  |  |  |  | 0 | log_msg "  => Writing counts to $count_file{genes} ..."; | 
| 1264 | 0 |  |  |  |  | 0 | for my $id (sort keys %$parent_count) { | 
| 1265 | 0 |  |  |  |  | 0 | printf {$fh} "%s\t%d\n" => $id, | 
| 1266 | 0 |  |  |  |  | 0 | int($parent_count->{$id} / $count_factor); | 
| 1267 |  |  |  |  |  |  | } | 
| 1268 |  |  |  |  |  |  |  | 
| 1269 |  |  |  |  |  |  | # Close genes file | 
| 1270 | 0 |  |  |  |  | 0 | log_msg ":; Writing and closing $count_file{genes} ..."; | 
| 1271 | 0 | 0 |  |  |  | 0 | close $fh | 
| 1272 |  |  |  |  |  |  | or die "Cannot write file $count_file{genes}: $!\n"; | 
| 1273 |  |  |  |  |  |  | } | 
| 1274 |  |  |  |  |  |  | } else { | 
| 1275 |  |  |  |  |  |  | # Save coverage | 
| 1276 | 4 |  |  |  |  | 51 | log_msg ":: Saving coverage count"; | 
| 1277 | 4 |  |  |  |  | 100 | my $fh = $self->with_open_w($count_file{coverage}, 0); | 
| 1278 |  |  |  |  |  |  |  | 
| 1279 | 4 |  |  |  |  | 43 | log_msg "  => Writing counts to $count_file{coverage} ..."; | 
| 1280 | 4 |  |  |  |  | 70 | for my $id (sort keys %counters) { | 
| 1281 | 20 |  |  |  |  | 50 | printf {$fh} "%s\t%d\n" => $id, $counters{$id}; | 
|  | 20 |  |  |  |  | 113 |  | 
| 1282 |  |  |  |  |  |  | } | 
| 1283 |  |  |  |  |  |  |  | 
| 1284 |  |  |  |  |  |  | # Close coverage file | 
| 1285 | 4 |  |  |  |  | 42 | log_msg ":; Writing and closing $count_file{coverage} ..."; | 
| 1286 | 4 | 50 |  |  |  | 400 | close $fh | 
| 1287 |  |  |  |  |  |  | or die "Cannot write file $count_file{coverage}: $!\n"; | 
| 1288 |  |  |  |  |  |  | } | 
| 1289 |  |  |  |  |  |  | } | 
| 1290 |  |  |  |  |  |  |  | 
| 1291 |  |  |  |  |  |  | __END__ | 
| 1292 |  |  |  |  |  |  |  | 
| 1293 |  |  |  |  |  |  | =pod | 
| 1294 |  |  |  |  |  |  |  | 
| 1295 |  |  |  |  |  |  | =encoding UTF-8 | 
| 1296 |  |  |  |  |  |  |  | 
| 1297 |  |  |  |  |  |  | =head1 NAME | 
| 1298 |  |  |  |  |  |  |  | 
| 1299 |  |  |  |  |  |  | App::Sandy::Simulator - Class responsible to make the simulation | 
| 1300 |  |  |  |  |  |  |  | 
| 1301 |  |  |  |  |  |  | =head1 VERSION | 
| 1302 |  |  |  |  |  |  |  | 
| 1303 |  |  |  |  |  |  | version 0.25 | 
| 1304 |  |  |  |  |  |  |  | 
| 1305 |  |  |  |  |  |  | =head1 AUTHORS | 
| 1306 |  |  |  |  |  |  |  | 
| 1307 |  |  |  |  |  |  | =over 4 | 
| 1308 |  |  |  |  |  |  |  | 
| 1309 |  |  |  |  |  |  | =item * | 
| 1310 |  |  |  |  |  |  |  | 
| 1311 |  |  |  |  |  |  | Thiago L. A. Miller <tmiller@mochsl.org.br> | 
| 1312 |  |  |  |  |  |  |  | 
| 1313 |  |  |  |  |  |  | =item * | 
| 1314 |  |  |  |  |  |  |  | 
| 1315 |  |  |  |  |  |  | J. Leonel Buzzo <lbuzzo@mochsl.org.br> | 
| 1316 |  |  |  |  |  |  |  | 
| 1317 |  |  |  |  |  |  | =item * | 
| 1318 |  |  |  |  |  |  |  | 
| 1319 |  |  |  |  |  |  | Felipe R. C. dos Santos <fsantos@mochsl.org.br> | 
| 1320 |  |  |  |  |  |  |  | 
| 1321 |  |  |  |  |  |  | =item * | 
| 1322 |  |  |  |  |  |  |  | 
| 1323 |  |  |  |  |  |  | Helena B. Conceição <hconceicao@mochsl.org.br> | 
| 1324 |  |  |  |  |  |  |  | 
| 1325 |  |  |  |  |  |  | =item * | 
| 1326 |  |  |  |  |  |  |  | 
| 1327 |  |  |  |  |  |  | Rodrigo Barreiro <rbarreiro@mochsl.org.br> | 
| 1328 |  |  |  |  |  |  |  | 
| 1329 |  |  |  |  |  |  | =item * | 
| 1330 |  |  |  |  |  |  |  | 
| 1331 |  |  |  |  |  |  | Gabriela Guardia <gguardia@mochsl.org.br> | 
| 1332 |  |  |  |  |  |  |  | 
| 1333 |  |  |  |  |  |  | =item * | 
| 1334 |  |  |  |  |  |  |  | 
| 1335 |  |  |  |  |  |  | Fernanda Orpinelli <forpinelli@mochsl.org.br> | 
| 1336 |  |  |  |  |  |  |  | 
| 1337 |  |  |  |  |  |  | =item * | 
| 1338 |  |  |  |  |  |  |  | 
| 1339 |  |  |  |  |  |  | Rafael Mercuri <rmercuri@mochsl.org.br> | 
| 1340 |  |  |  |  |  |  |  | 
| 1341 |  |  |  |  |  |  | =item * | 
| 1342 |  |  |  |  |  |  |  | 
| 1343 |  |  |  |  |  |  | Rodrigo Barreiro <rbarreiro@mochsl.org.br> | 
| 1344 |  |  |  |  |  |  |  | 
| 1345 |  |  |  |  |  |  | =item * | 
| 1346 |  |  |  |  |  |  |  | 
| 1347 |  |  |  |  |  |  | Pedro A. F. Galante <pgalante@mochsl.org.br> | 
| 1348 |  |  |  |  |  |  |  | 
| 1349 |  |  |  |  |  |  | =back | 
| 1350 |  |  |  |  |  |  |  | 
| 1351 |  |  |  |  |  |  | =head1 COPYRIGHT AND LICENSE | 
| 1352 |  |  |  |  |  |  |  | 
| 1353 |  |  |  |  |  |  | This software is Copyright (c) 2023 by Teaching and Research Institute from Sírio-Libanês Hospital. | 
| 1354 |  |  |  |  |  |  |  | 
| 1355 |  |  |  |  |  |  | This is free software, licensed under: | 
| 1356 |  |  |  |  |  |  |  | 
| 1357 |  |  |  |  |  |  | The GNU General Public License, Version 3, June 2007 | 
| 1358 |  |  |  |  |  |  |  | 
| 1359 |  |  |  |  |  |  | =cut |