File Coverage

blib/lib/RestrictionDigest.pm
Criterion Covered Total %
statement 45 1604 2.8
branch 0 620 0.0
condition 0 96 0.0
subroutine 15 43 34.8
pod n/a
total 60 2363 2.5


line stmt bran cond sub pod time code
1            
2             package RestrictionDigest;
3            
4 1     1   16068 use 5.8.0;
  1         3  
  1         49  
5 1     1   5 use strict;
  1         1  
  1         37  
6 1     1   3 use warnings FATAL => 'all';
  1         5  
  1         74  
7            
8            
9            
10             =head1 NAME
11            
12             RestrictionDigest - A simulation tool for reducing the genome with one DNA endonuclease or a pair DNA endonucleases!
13            
14             =head1 VERSION
15            
16             Version 0.01
17            
18             =cut
19            
20             our $VERSION = '0.01';
21            
22             =head1 DESCRIPTION
23            
24            
25             Next Generation Sequencing (NGS) is developing quickly and many cost-saving approaches are developed based on NGS. As sequencing
26             the whole genome is expensive and sometimes unnecessary, sequencing the representative part of the whole genome becomes practicable
27             and attractive. The most famous two approaches are Genotyping By Sequencing (GBS) and Restrict site Associated DNA sequencing (RAD).
28             These two approaches can reduce the ratio of sequenced part of the whole genome which will save a lot of money. Both of them
29             utilize tpye II DNA endonuclease to digest the whole genome. In order to select the correct enzyme or enzyme-pair, we developed
30             this perl module: RestrictionDigest.
31            
32             RestrictionDigest is used for the simulation of single/double-enzyme GBS/RAD approach. The most important thing of representative sequencing
33             is choosing the right enzymes to digest the whole genome. The appropriate enzymes should satisfy several criteria including: the
34             recognition sites of these two enzymes are evenly dispersed on each single chromsome; the recognition sites of these two enzymes
35             are not or rarely located at the repetitive region of the genome; the fragments produced by two-enzyme digestion are neither too long
36             nor too short, they should be in the suitable range which is most effective for PCR amplification; the complexity reducing rate
37             shoule be in the suitable range(1%-10%). RestrictionDigest can produce several result files including the fragments seqence and their
38             locations on the reference. The ratio of these fragments on the reference seqences are also produced. If User can provide the gff
39             annotation file, RestrictionDigest can also provide the coverage ratio of these fragments on gene region, even CDS, INTRON, UTR region,
40             and intergenic region.
41            
42             =cut
43            
44            
45            
46            
47            
48             package RestrictionDigest::Double;
49            
50 1     1   7 use 5.8.0;
  1         3  
  1         27  
51 1     1   4 use strict;
  1         1  
  1         25  
52 1     1   4 use warnings FATAL => 'all';
  1         2  
  1         66  
53             =head1 NAME
54            
55             RestrictionDigest::Double - for double-enzyme simulation.
56            
57             =head1 VERSION
58            
59             Version 0.01
60            
61             =cut
62            
63             our $VERSION = '0.01';
64            
65            
66            
67             =head1 SYNOPSIS
68            
69             RestrictionDigest::Double is used for the simulation of double-enzyme GBS/RAD approach.
70            
71            
72             use RestrictionDigest;
73            
74             $digest = RestrictionDigest::Double->new();
75            
76             $digest->add_ref(-reference=>'Full path of the reference file');
77            
78             $digest->new_enzyme(-enzyme_name=>'Ncol',-recognition_site=>'C|CATGG');
79            
80             $digest->add_enzyme_pair(-front_enzyme=>'EcoRI',-behind_enzyme=>'HinfI');
81            
82             $digest->add_output_dir(-output_dir=>'Full path of the output directory');
83            
84             $digest->double_digest();
85            
86            
87            
88            
89             =cut
90            
91 1     1   484 use IO::File;
  1         7089  
  1         109  
92 1     1   9 use File::Basename;
  1         1  
  1         55  
93            
94 1     1   4 use vars qw(%fields %enzyme_list);
  1         1  
  1         9319  
95            
96             # Define the fields to be used as identifiers.
97             %fields = (
98             ref => undef,
99             gff => undef,
100             SNPs => undef,
101             enzyme1 => undef,
102             enzyme2 => undef,
103             output_dir=> undef,
104             range_start => 201,
105             range_end => 500,
106             lengths_distribution_start =>100,
107             lengths_distribution_end => 1000,
108             lengths_distribution_step => 50,
109             sequence_type => '100PE',
110             sequence_end => 'front_enzyme',
111             );
112            
113             # Make the endonuclease enzyme-container--a hash.
114             %enzyme_list=(
115             EcoRI => 'G|AATTC',
116             HinfI => 'G|ANTC',
117             AluI => 'AG|CT',
118             AvaII => 'G|GWCC',
119             BamHI => 'G|GATCC',
120             HindIII => 'A|AGCTT',
121             MseI => 'T|TAA',
122             MspI => 'C|CGG',
123             NdeI => 'CA|TATG',
124             PstI => 'CTGCA|G',
125             SalI => 'G|TCGAC',
126             XbaI => 'T|CTAGA',
127             XhoI => 'C|TCGAG',
128             NlaIII => 'NCATG|N',
129             AatII => 'GACGT|C',
130             AccI => 'GT|MKAC',
131             Acc65I => 'G|GTACC',
132             AciI => 'C|CGC',
133             AclI => 'AA|CGTT',
134             AfeI => 'AGC|GCT',
135             AflII => 'C|TTAAGA',
136             AflIII => 'A|CRYGT',
137             AgeI => 'A|CCGGT',
138             AlwNI => 'CAGNNN|CTG',
139             ApaI => 'GGGCC|C',
140             ApaLI => 'G|TGCAC',
141             ApeKI => 'G|CWGC',
142             ApoI => 'R|AATTY',
143             AscI => 'GG|CGCGCC',
144             AseI => 'AT|TAAT',
145             AvaI => 'C|YCGRG',
146             BclI => 'T|GATCA',
147             AvrII => 'C|CTAGG',
148             BaeGI => 'GKGCM|C',
149             BanI => 'G|GYRCC',
150             BanII => 'GRGCY|C',
151             BbvCI => 'CC|TCAGC',
152             BsaAI => 'YAC|GTR',
153             BsaJI => 'C|CNNGG',
154             BsaWI => 'W|CCGGW',
155            
156            
157            
158             );
159            
160             =head1 SUBROUTINES/METHODS
161            
162             =head2 new
163            
164             $digest=RestrictionDigest::Double->new();
165             Create a new object.
166            
167             =cut
168            
169             sub new {
170 0     0     my $class = shift;
171 0           my $self = {
172             %fields,
173             };
174 0           bless ($self, $class);
175 0           return $self;
176             }
177            
178             =head2 add_ref
179            
180             $digest->add_ref(-reference=>'Full path of the reference file');
181             Add the reference file to the RestrictionDigest variable.
182            
183             =cut
184            
185             sub add_ref {
186 0     0     my $self=shift;
187 0           my %parameters=@_;
188 0           for(keys %parameters){
189 0 0         if($_=~/^-reference$/){
190 0           $self->{ref}=$parameters{$_};
191             }
192             else{
193 0           die "Unacceptable parameters in the method , please perldoc RestrictionDigest for example or read README for more help!\n";
194             }
195             }
196            
197             # Check the format of reference
198 0           my $ref_fh=IO::File->new("$self->{ref}",'r');
199 0           my $line_num=0;
200 0           while(<$ref_fh>){
201 0           chomp;
202 0           $line_num++;
203 0 0         if($line_num % 2 != 0){
204 0 0         unless($_=~/^>\S+$/){
205 0           die "The format of reference file is not allowed, please convert it.\nYou can find the example format in the README!\n";
206             }
207             }
208 0 0         if($line_num % 2 ==0 ){
209 0 0         unless($_=~/^[AGTCNRYMKSWHBVD]+$/i){
210 0           die"The sequences of reference contain some charactors other than AGTCNRYMKSWHBVD, please correct that!\n";
211             }
212             }
213             }
214 0           $ref_fh->close;
215             }
216            
217             =head2 add_enzyme_pair
218            
219             $digest->add_enzyme_pair(-front_enzyme=>'EcoRI', -behind_enzyme=>'HinfI');
220             Add two enzymes to the RestrictionDigest variable.
221             The names of the two enzymes are case-insensitive.
222            
223             =cut
224            
225            
226            
227             sub add_enzyme_pair {
228            
229 0     0     my $self=shift;
230 0           my %parameters=@_;
231 0           for (keys %parameters){
232 0 0         if($_=~/^-front_enzyme$/){
    0          
233 0           $self->{enzyme1}=$parameters{$_};
234             }
235             elsif($_=~/^-behind_enzyme$/){
236 0           $self->{enzyme2}=$parameters{$_};
237             }
238             else{
239 0           die"Unacceptable parameters in the method , please perldoc RestrictionDigest for example or read README for more help!\n";
240             }
241             }
242            
243             # Check the existence of enzymes in the enzyme-container.
244 0           my ($enzyme1_exists, $enzyme2_exists)=qw(0 0);
245 0           foreach(keys %enzyme_list){
246 0 0         if($_=~/^$self->{enzyme1}$/i){
247 0           $enzyme1_exists++;
248             }
249 0 0         if($_=~/^$self->{enzyme2}$/i){
250 0           $enzyme2_exists++;
251             }
252             }
253 0 0         if($enzyme1_exists >0 ){
254            
255             }else{
256 0           die "The front restrict endonuclease User provides does not exist.\nHowever User can add this enzyme and its recognition sites to enzyme-container via the method 'new_enzyme' .\n";
257             }
258 0 0         if($enzyme2_exists >0 ){
259            
260             }else{
261 0           die "The behind restrict endonuclease User provides does not exist.\nHowever User can add this enzyme and its recognition sites to enzyme-container via the method 'new_enzyme' .\n";
262             }
263             }
264            
265             =head2 new_enzyme
266            
267             $digest->new_enzyme(-enzyme_name=>'EcoRI', -recognition_site=>'G|AATTC');
268            
269             If the enzyme User wants to use does not exists in the default enzyme-pool, the enzyme can be added to the enzyme-pool.
270            
271             For now, the enzymes in the default enzyme pool are as follows:
272             EcoRI,HinfI,AluI,AvaII,BamHI,HindIII,MseI,MspI,NdeI,PstI,SalI,XbaI,XhoI,NlaIII.
273            
274             Adding new enzyme(s) to the enzyme-pool is easy but the enzyme added to the pool is temporary. The added enzyme(s) is active
275             only in the class context to which the 'new_method' belongs.
276            
277             =cut
278            
279             sub new_enzyme {
280 0     0     my $self=shift;
281 0           my %parameters=@_;
282 0           my $new_enzyme_name;
283             my $new_enzyme_site;
284 0           for (keys %parameters){
285 0 0         if($_=~/^-enzyme_name$/){
    0          
286 0           $new_enzyme_name=$parameters{$_};
287             }
288             elsif($_=~/^-recognition_site$/){
289 0           $new_enzyme_site=$parameters{$_};
290 0 0         unless($new_enzyme_site=~/|/){
291 0           die "The 'recognition_site' must contain a cut flag '|'. Please perldoc RestrictionDigest for example or read README for moer help\n";
292             }
293             }
294             else{
295 0           die"Unacceptable parameters in the method , please perldoc RestrictionDigest for example or read README for more help\n";
296             }
297             }
298 0           $enzyme_list{$new_enzyme_name}=$new_enzyme_site;
299             }
300            
301            
302             =head2 change_range
303            
304             $digest->change_range(-start=>201, -end=>500);
305             Usually, not all fragments are needed by the scientists. The fragments in particular range are more attractive to them.
306             RestrictionDigest provides the sequences and the positions on the reference of these fragments in range. The default range is
307             [201bp, 500]. User can change this range via the method .
308            
309            
310             =cut
311            
312             sub change_range {
313 0     0     my $self=shift;
314 0           my %parameters=@_;
315 0           for(keys %parameters){
316 0 0         if($_=~/^-start$/){
    0          
317 0           $self->{range_start}=$parameters{$_};
318             }
319             elsif($_=~/^-end$/){
320 0           $self->{range_end}=$parameters{$_};
321             }
322             else{
323 0           die"Unacceptable parameters in the method ,please perldoc RestrictionDigest for example or read README for more help.\n";
324             }
325             }
326 0 0         if($self->{range_start} < $self->{range_end} ) {
327            
328             }else{
329 0           die "The first parameter of the method must be smaller than the second parameter\n";
330             }
331             }
332            
333             =head2 change_lengths_distribution_parameters
334            
335             $digest->change_lengths_distribution_parameters(-front=>100,-behind=>1000,-step=>50);
336             This program will give a simple summary of the lengths of all fragments after digestion. By default, three scopes, which are
337             <=100bp, 101bp-1000bp and >1000bp, are given. The number of the fragments whose length fallen in these scopes are given. More
338             fine sorted, the 101bp-1000bp scope is split by the step of 50bp. These three parameters: the front edge and the behind edge
339             of the scope and the step can be changed via the method 'change_lengths_distribution_parameters';
340             =cut
341            
342             sub change_lengths_distribution_parameters {
343 0     0     my $self=shift;
344 0           my %parameters=@_;
345 0           for (keys %parameters){
346 0 0         if($_=~/^-front$/){
    0          
    0          
347 0           $self->{lengths_distribution_start}=$parameters{$_};
348             }
349             elsif($_=~/^-behind$/){
350 0           $self->{lengths_distribution_end}=$parameters{$_};
351             }
352             elsif($_=~/^-step$/){
353 0           $self->{lengths_distribution_step}=$parameters{$_};
354             }
355             else{
356 0           die"Unacceptable parameters in the method , please perldoc RestrictionDigest for example or read README for more help!\n";
357             }
358             }
359             }
360            
361            
362             =head2 add_output_dir
363            
364             $digest->add_output_dir(-output_dir=>'Full path of the output directory');
365             Adds the directory into which the result files are put by the program.
366             The directory must be in the absolute path format.
367            
368             =cut
369            
370            
371             sub add_output_dir {
372 0     0     my $self=shift;
373 0           my %parameters=@_;
374 0           for(keys %parameters){
375 0 0         if($_=~/^-output_dir$/){
376 0           $self->{output_dir}=$parameters{$_};
377             }
378             else{
379 0           die "Unacceptable parameter in the method , please perldoc RestrictionDigest for example or read README for more help!\n";
380             }
381             }
382 0 0         if(-d $self->{output_dir} ) {
383             }else{
384 0           die "The output directory User provides does not exist.\n";
385             }
386             }
387            
388             =head2 double_digest
389            
390             $digest->double_digest();
391             Execute the digestion process.
392             This process will produce several result files which will be located in the output directory
393             added through the add_output_dir method.
394             This process will take some time. It depends on the size of the reference file.
395             During the process, Prompting messages will be output to the STDOUT
396             like "Digestion: >ScaffoldName is done!".
397            
398             =cut
399            
400             sub double_digest {
401            
402 0     0     my $self=shift;
403 0           my $ref=$self->{ref};
404 0 0         if($ref){
405 0           my $name_reference=basename($ref);
406 0           print "The reference file User provides is:\t",$name_reference,"\n";
407             }else{
408 0           die "No reference file provied! Please perldoc RestrictionDigest for example.\n";
409             }
410 0           my $front_enzyme=$self->{enzyme1};
411 0 0         if($front_enzyme){
412 0           print "The front restrict endonuclease User provides is:\t", $front_enzyme, "\n";
413             }
414             else{
415 0           die"No front restrict endonuclease provided! Please perldoc RestrictionDigest for example.\n";
416             }
417 0           my $later_enzyme=$self->{enzyme2};
418 0 0         if($later_enzyme){
419 0           print "The behind restrict endonuclease User provides is:\t", $later_enzyme, "\n";
420             }
421             else{
422 0           die "No behind restrict endonuclease provided! Please perldoc RestrictionDigest for example.\n";
423             }
424 0           my $range1=$self->{range_start};
425 0           my $range2=$self->{range_end};
426 0           print "The seperate range of the fragments User wants to output is:\t$range1 to $range2 bp.\n";
427 0           my $output_dir=$self->{output_dir};
428 0 0         if($output_dir){
429 0           print "The output directory User provides is:\t",$output_dir, "\n";
430             }
431             else{
432 0           die "No output directory provided! Please perldoc RestrictionDigest for example.\n";
433             }
434 0           $output_dir=~s/^(.+)\/?$/$1/;
435            
436             # Make the file handle to the reference file.
437 0           my $ref_fh=IO::File->new("$ref",'r');
438            
439             # Get the basename of reference,not include the path.
440 0           my $name_reference=basename($ref);
441            
442             # Make the file handles of result files.
443 0           my $all_seq_file_fh=IO::File->new(">>$output_dir/seq_frags_${name_reference}_by_${front_enzyme}_and_${later_enzyme}");
444 0           my $all_loc_file_fh=IO::File->new(">>$output_dir/position_frags_${name_reference}_by_${front_enzyme}_and_${later_enzyme}");
445 0           my $seq_in_range_file_fh=IO::File->new(">>$output_dir/seq_frags_in_range_${name_reference}_by_${front_enzyme}_and_${later_enzyme}");
446 0           my $loc_in_range_file_fh=IO::File->new(">>$output_dir/position_frags_in_range_${name_reference}_by_${front_enzyme}_and_${later_enzyme}");
447 0           my $reduced_ratio_every_scaffold_file_fh=IO::File->new(">>$output_dir/reduced_ratio_every_scaffold_${name_reference}_by_${front_enzyme}_and_${later_enzyme}");
448 0           my $summary_digestion_fh=IO::File->new(">>$output_dir/digestion_summary_${name_reference}_by_${front_enzyme}_and_${later_enzyme}");
449            
450             # Get the base-compositions of these two enzymes which User inputs.
451 0           my ($front_seq,$later_seq);
452 0           foreach(keys %enzyme_list){
453 0 0         if($_=~/^$front_enzyme$/i){
454 0           $front_seq=$enzyme_list{$_};
455             }
456 0 0         if($_=~/^$later_enzyme$/i){
457 0           $later_seq=$enzyme_list{$_};
458             }
459             }
460            
461             #get the relative location of cutting of the recognation sequences; $front_enzyme_cut_loc and $later_enzyme_cut_loc are two relative locations we wanted;
462 0           my @front_enzyme_rec_seq=split //, $front_seq;
463 0           my @later_enzyme_rec_seq=split //, $later_seq;
464 0           my @front_enzyme_cut_seq;
465             my @later_enzyme_cut_seq;
466 0           my $front_enzyme_rec_start=0;
467 0           my $later_enzyme_rec_start=0;
468 0           my $front_enzyme_cut_loc;
469             my $later_enzyme_cut_loc;
470 0           foreach(@front_enzyme_rec_seq){
471 0           $front_enzyme_rec_start++;
472 0 0         if($_=~/\|/){
473 0           $front_enzyme_cut_loc=$front_enzyme_rec_start;
474             }
475             else{
476 0           push @front_enzyme_cut_seq,$_;
477             }
478             }
479 0           foreach(@later_enzyme_rec_seq){
480 0           $later_enzyme_rec_start++;
481 0 0         if($_=~/\|/){
482 0           $later_enzyme_cut_loc=$later_enzyme_rec_start;
483             }
484             else{
485 0           push @later_enzyme_cut_seq, $_;
486             }
487             }
488            
489             # Replace the degenerate bases of front enzyme;
490 0           my $num_of_front_enzyme_cut_seq=@front_enzyme_cut_seq;
491 0           for(@front_enzyme_cut_seq){
492 0 0         if($_=~/N/){$_="[ATCG]";}
  0            
493 0 0         if($_=~/R/){$_="[AG]";}
  0            
494 0 0         if($_=~/Y/){$_="[CT]";}
  0            
495 0 0         if($_=~/M/){$_="[AC]";}
  0            
496 0 0         if($_=~/K/){$_="[GT]";}
  0            
497 0 0         if($_=~/S/){$_="[GC]";}
  0            
498 0 0         if($_=~/W/){$_="[AT]";}
  0            
499 0 0         if($_=~/H/){$_="[ATC]";}
  0            
500 0 0         if($_=~/B/){$_="[GTC]";}
  0            
501 0 0         if($_=~/V/){$_="[GAC]";}
  0            
502 0 0         if($_=~/D/){$_="[GAT]";}
  0            
503             }
504 0           my $seq_string_front_enzyme=join '', @front_enzyme_cut_seq;
505            
506            
507            
508             # Replace the degenerate bases of later enzyme;
509 0           my $num_of_later_enzyme_cut_seq=@later_enzyme_cut_seq;
510 0           foreach(@later_enzyme_cut_seq){
511 0 0         if($_=~/N/){$_="[ATCG]";}
  0            
512 0 0         if($_=~/R/){$_="[AG]";}
  0            
513 0 0         if($_=~/Y/){$_="[CT]";}
  0            
514 0 0         if($_=~/M/){$_="[AC]";}
  0            
515 0 0         if($_=~/K/){$_="[GT]";}
  0            
516 0 0         if($_=~/S/){$_="[GC]";}
  0            
517 0 0         if($_=~/W/){$_="[AT]";}
  0            
518 0 0         if($_=~/H/){$_="[ATC]";}
  0            
519 0 0         if($_=~/B/){$_="[GTC]";}
  0            
520 0 0         if($_=~/V/){$_="[GAC]";}
  0            
521 0 0         if($_=~/D/){$_="[GAT]";}
  0            
522             }
523 0           my $seq_string_later_enzyme=join '', @later_enzyme_cut_seq;
524            
525            
526            
527             # Make the scalars and arrays used in the whole programme;
528 0           my $rate_overall=0;
529 0           my $rate_in_range=0;
530 0           my @lengths_of_fragments=();
531 0           my @lengths_fragments_in_range=();
532 0           my $overall_fragments_length=0;
533 0           my $overall_fragments_in_range_length=0;
534 0           my $overall_length_of_scfd=0;
535            
536             ########## CIRCLES START HERE!###########
537            
538             # Process one scaffold one circle;
539 0           my $scaffold_name; my $scaffold_seq;
540 0           while(<$ref_fh>){
541 0           chomp;
542 0 0         if($_=~/^>/){
543 0           $scaffold_name=$_;
544             }
545             else{
546 0           print"Digesting $scaffold_name...\t";
547 0           $scaffold_seq=$_;
548 0           $reduced_ratio_every_scaffold_file_fh->print("$scaffold_name\t");
549 0           my $length_of_scafd=length $scaffold_seq;
550            
551             # Set the arrays which contain cut locations of front enzyme and later enzyme;
552 0           my @front_enzyme_locs=();
553            
554 0           my @later_enzyme_locs=();
555            
556 0           my $seq_for_search=$scaffold_seq;
557             # Find locs of recognition of front enzyme in this scaffold;the locs are meaningful in this array; not in the human context;
558 0           my $string=$seq_string_front_enzyme;
559 0 0         if($string=~/^(\w+)$/){
    0          
    0          
    0          
    0          
    0          
560 0           my $loc_in_array;
561 0           $loc_in_array=index($seq_for_search,$string);
562 0 0         unless($loc_in_array==-1){
563 0           push @front_enzyme_locs, $loc_in_array;
564             }
565 0           while($loc_in_array!=-1){
566 0           $loc_in_array=index($seq_for_search, $string, $loc_in_array+1);
567 0 0         unless($loc_in_array==-1){
568 0           push @front_enzyme_locs, $loc_in_array;
569             }
570             }
571             }
572             elsif($string=~/(\w+)\[(\w+)\](\w+)/){
573 0           my $breast_bases=$1;
574 0           my $back_bases=$3;
575 0           my $generate_bases=$2;
576 0           my @generate_bases=split //, $generate_bases;
577 0           my $seq_for_search=$scaffold_seq;
578 0           for my $one_base(@generate_bases){
579 0           $string=$breast_bases.$one_base.$back_bases;
580 0           my $loc_in_array;
581 0           $loc_in_array=index($seq_for_search,$string);
582 0 0         unless($loc_in_array==-1){
583 0           push @front_enzyme_locs, $loc_in_array;
584             }
585 0           while($loc_in_array!=-1){
586 0           $loc_in_array=index($seq_for_search, $string, $loc_in_array+1);
587 0 0         unless($loc_in_array==-1){
588 0           push @front_enzyme_locs, $loc_in_array;
589             }
590             }
591             }
592             }
593            
594             elsif($string=~/^\[(\w+)\](\w+)\[(\w+)\]$/){
595 0           my $g1_bases=$1;
596 0           my $middle=$2;
597 0           my $g2_bases=$3;
598 0           my @g1_bases=split //, $g1_bases;
599 0           my @g2_bases=split //, $g2_bases;
600 0           for my $one_base(@g1_bases){
601 0           for my $two_base(@g2_bases){
602 0           $string=$one_base.$middle.$two_base;
603 0           my $loc_in_array;
604 0           $loc_in_array=index($seq_for_search,$string);
605 0 0         unless($loc_in_array==-1){
606 0           push @front_enzyme_locs, $loc_in_array;
607             }
608 0           while($loc_in_array!=-1){
609 0           $loc_in_array=index($seq_for_search, $string, $loc_in_array+1);
610 0 0         unless($loc_in_array==-1){
611 0           push @front_enzyme_locs, $loc_in_array;
612             }
613             }
614             }
615             }
616             }
617            
618             elsif($string=~/(\w+)\[(\w+)\](\w+)\[(\w+)\](\w+)/){
619 0           my $breast_bases=$1;
620 0           my $g1_bases=$2;
621 0           my $middle=$3;
622 0           my $back_bases=$5;
623 0           my $g2_bases=$4;
624 0           my @g1_bases=split //, $g1_bases;
625 0           my @g2_bases=split //, $g2_bases;
626 0           for my $one_base(@g1_bases){
627 0           for my $two_base(@g2_bases){
628 0           $string=$breast_bases.$one_base.$middle.$two_base.$back_bases;
629 0           my $loc_in_array;
630 0           $loc_in_array=index($seq_for_search,$string);
631 0 0         unless($loc_in_array==-1){
632 0           push @front_enzyme_locs, $loc_in_array;
633             }
634 0           while($loc_in_array!=-1){
635 0           $loc_in_array=index($seq_for_search, $string, $loc_in_array+1);
636 0 0         unless($loc_in_array==-1){
637 0           push @front_enzyme_locs, $loc_in_array;
638             }
639             }
640             }
641             }
642             }
643             elsif($string=~/(\w+)\[(\w+)\]\[(\w+)\]\[(\w+)\](\w+)/){
644 0           my $breast_bases=$1;
645 0           my $g1_bases=$2;
646 0           my $g2_bases=$3;
647 0           my $back_bases=$5;
648 0           my $g3_bases=$4;
649 0           my @g1_bases=split //, $g1_bases;
650 0           my @g2_bases=split //, $g2_bases;
651 0           my @g3_bases=split //, $g3_bases;
652 0           for my $one_base(@g1_bases){
653 0           for my $two_base(@g2_bases){
654 0           for my $three_base(@g3_bases){
655 0           $string=$breast_bases.$one_base.$two_base.$three_base.$back_bases;
656 0           my $loc_in_array;
657 0           $loc_in_array=index($seq_for_search,$string);
658 0 0         unless($loc_in_array==-1){
659 0           push @front_enzyme_locs, $loc_in_array;
660             }
661 0           while($loc_in_array!=-1){
662 0           $loc_in_array=index($seq_for_search, $string, $loc_in_array+1);
663 0 0         unless($loc_in_array==-1){
664 0           push @front_enzyme_locs, $loc_in_array;
665             }
666             }
667             }
668             }
669             }
670             }
671             elsif($string=~/(\w+)\[(\w+)\]\[(\w+)\](\w+)/){
672 0           my $breast_bases=$1;
673 0           my $g1_bases=$2;
674 0           my $g2_bases=$3;
675 0           my $back_bases=$4;
676 0           my @g1_bases=split //, $g1_bases;
677 0           my @g2_bases=split //, $g2_bases;
678 0           for my $one_base(@g1_bases){
679 0           for my $two_base(@g2_bases){
680 0           $string=$breast_bases.$one_base.$two_base.$back_bases;
681 0           my $loc_in_array;
682 0           $loc_in_array=index($seq_for_search,$string);
683 0 0         unless($loc_in_array==-1){
684 0           push @front_enzyme_locs, $loc_in_array;
685             }
686 0           while($loc_in_array!=-1){
687 0           $loc_in_array=index($seq_for_search, $string, $loc_in_array+1);
688 0 0         unless($loc_in_array==-1){
689 0           push @front_enzyme_locs, $loc_in_array;
690             }
691             }
692             }
693             }
694             }
695            
696            
697            
698            
699             # Find locs of recognition of later enzyme in this scaffold;the locs are meaningful in this array; not in the human context;
700            
701 0           $string=$seq_string_later_enzyme;
702 0 0         if($string=~/^(\w+)$/){
    0          
    0          
    0          
    0          
    0          
703 0           my $loc_in_array;
704 0           $loc_in_array=index($seq_for_search,$string);
705 0 0         unless($loc_in_array==-1){
706 0           push @later_enzyme_locs, $loc_in_array;
707             }
708 0           while($loc_in_array!=-1){
709 0           $loc_in_array=index($seq_for_search, $string, $loc_in_array+1);
710 0 0         unless($loc_in_array==-1){
711 0           push @later_enzyme_locs, $loc_in_array;
712             }
713             }
714             }
715             elsif($string=~/(\w+)\[(\w+)\](\w+)/){
716 0           my $breast_bases=$1;
717 0           my $back_bases=$3;
718 0           my $generate_bases=$2;
719 0           my @generate_bases=split //, $generate_bases;
720 0           for my $one_base(@generate_bases){
721 0           $string=$breast_bases.$one_base.$back_bases;
722 0           my $loc_in_array;
723 0           $loc_in_array=index($seq_for_search,$string);
724 0 0         unless($loc_in_array==-1){
725 0           push @later_enzyme_locs, $loc_in_array;
726             }
727 0           while($loc_in_array!=-1){
728 0           $loc_in_array=index($seq_for_search, $string, $loc_in_array+1);
729 0 0         unless($loc_in_array==-1){
730 0           push @later_enzyme_locs, $loc_in_array;
731             }
732             }
733             }
734             }
735            
736             elsif($string=~/^\[(\w+)\](\w+)\[(\w+)\]$/){
737 0           my $g1_bases=$1;
738 0           my $middle=$2;
739 0           my $g2_bases=$3;
740 0           my @g1_bases=split //, $g1_bases;
741 0           my @g2_bases=split //, $g2_bases;
742 0           for my $one_base(@g1_bases){
743 0           for my $two_base(@g2_bases){
744 0           $string=$one_base.$middle.$two_base;
745 0           my $loc_in_array;
746 0           $loc_in_array=index($seq_for_search,$string);
747 0 0         unless($loc_in_array==-1){
748 0           push @later_enzyme_locs, $loc_in_array;
749             }
750 0           while($loc_in_array!=-1){
751 0           $loc_in_array=index($seq_for_search, $string, $loc_in_array+1);
752 0 0         unless($loc_in_array==-1){
753 0           push @later_enzyme_locs, $loc_in_array;
754             }
755             }
756             }
757             }
758             }
759            
760             elsif($string=~/(\w+)\[(\w+)\](\w+)\[(\w+)\](\w+)/){
761 0           my $breast_bases=$1;
762 0           my $g1_bases=$2;
763 0           my $middle=$3;
764 0           my $back_bases=$5;
765 0           my $g2_bases=$4;
766 0           my @g1_bases=split //, $g1_bases;
767 0           my @g2_bases=split //, $g2_bases;
768 0           for my $one_base(@g1_bases){
769 0           for my $two_base(@g2_bases){
770 0           $string=$breast_bases.$one_base.$middle.$two_base.$back_bases;
771 0           my $loc_in_array;
772 0           $loc_in_array=index($seq_for_search,$string);
773 0 0         unless($loc_in_array==-1){
774 0           push @later_enzyme_locs, $loc_in_array;
775             }
776 0           while($loc_in_array!=-1){
777 0           $loc_in_array=index($seq_for_search, $string, $loc_in_array+1);
778 0 0         unless($loc_in_array==-1){
779 0           push @later_enzyme_locs, $loc_in_array;
780             }
781             }
782             }
783             }
784             }
785             elsif($string=~/(\w+)\[(\w+)\]\[(\w+)\]\[(\w+)\](\w+)/){
786 0           my $breast_bases=$1;
787 0           my $g1_bases=$2;
788 0           my $g2_bases=$3;
789 0           my $back_bases=$5;
790 0           my $g3_bases=$4;
791 0           my @g1_bases=split //, $g1_bases;
792 0           my @g2_bases=split //, $g2_bases;
793 0           my @g3_bases=split //, $g3_bases;
794 0           for my $one_base(@g1_bases){
795 0           for my $two_base(@g2_bases){
796 0           for my $three_base(@g3_bases){
797 0           $string=$breast_bases.$one_base.$two_base.$three_base.$back_bases;
798 0           my $loc_in_array;
799 0           $loc_in_array=index($seq_for_search,$string);
800 0 0         unless($loc_in_array==-1){
801 0           push @later_enzyme_locs, $loc_in_array;
802             }
803 0           while($loc_in_array!=-1){
804 0           $loc_in_array=index($seq_for_search, $string, $loc_in_array+1);
805 0 0         unless($loc_in_array==-1){
806 0           push @later_enzyme_locs, $loc_in_array;
807             }
808             }
809             }
810             }
811             }
812             }
813             elsif($string=~/(\w+)\[(\w+)\]\[(\w+)\](\w+)/){
814 0           my $breast_bases=$1;
815 0           my $g1_bases=$2;
816 0           my $g2_bases=$3;
817 0           my $back_bases=$4;
818 0           my @g1_bases=split //, $g1_bases;
819 0           my @g2_bases=split //, $g2_bases;
820 0           for my $one_base(@g1_bases){
821 0           for my $two_base(@g2_bases){
822 0           $string=$breast_bases.$one_base.$two_base.$back_bases;
823 0           my $loc_in_array;
824 0           $loc_in_array=index($seq_for_search,$string);
825 0 0         unless($loc_in_array==-1){
826 0           push @later_enzyme_locs, $loc_in_array;
827             }
828 0           while($loc_in_array!=-1){
829 0           $loc_in_array=index($seq_for_search, $string, $loc_in_array+1);
830 0 0         unless($loc_in_array==-1){
831 0           push @later_enzyme_locs, $loc_in_array;
832             }
833             }
834             }
835             }
836             }
837            
838            
839            
840            
841             # Integrate the front enzyme locs and later enzyme locs into a hash;
842 0           my %hash;
843 0           while(@front_enzyme_locs){
844 0           my $loc=shift @front_enzyme_locs;
845 0           $hash{$loc}=$front_enzyme;
846             }
847            
848 0           while(@later_enzyme_locs){
849 0           my $loc=shift @later_enzyme_locs;
850 0           $hash{$loc}=$later_enzyme;
851             }
852            
853             # Sort the locs by the locations of them and restore the fragments' locs with different ends.
854 0           my @locs_sorted=sort{$a<=>$b} keys %hash;
  0            
855 0           my @locs_wanted_before;
856             my @enzyme_wanted_before;
857 0           my @locs_wanted_after;
858 0           my @enzyme_wanted_after;
859 0           my ($after_loc,$enzyme_before, $enzyme_after);
860 0           my $before_loc=shift @locs_sorted;
861 0           while(@locs_sorted){
862 0           $after_loc=shift @locs_sorted;
863 0           $enzyme_before=$hash{$before_loc};
864 0           $enzyme_after=$hash{$after_loc};
865 0 0         unless($enzyme_before eq $enzyme_after){
866 0           push @locs_wanted_before, $before_loc;
867 0           push @enzyme_wanted_before, $enzyme_before;
868 0           push @locs_wanted_after,$after_loc;
869 0           push @enzyme_wanted_after, $enzyme_after;
870             }
871 0           $before_loc=$after_loc;
872             }
873            
874            
875             # Define the scalars which contains the length of fragments;
876 0           my $nums_of_fragments=@locs_wanted_before;
877 0           my @pairs_of_locs=(0..$nums_of_fragments-1);
878 0           my $length_of_fragments=0;
879 0           my $length_of_fragments_in_range=0;
880 0           my $count_all=0;
881 0           my $count_in_range=0;
882 0           my ($before_cut_coorr,$after_cut_coorr,$bf_ct_crr_hm_rd,$af_ct_crr_hm_rd);
883            
884            
885            
886            
887 0           while(@locs_wanted_before){
888 0           my $before_coorr=shift @locs_wanted_before;
889 0           my $before_enzyme=shift @enzyme_wanted_before;
890 0           my $after_coorr=shift @locs_wanted_after;
891 0           my $after_enzyme=shift @enzyme_wanted_after;
892 0           my $length_of_fragment;my $seq_selected;
893 0           $count_all++;
894 0 0         if($front_enzyme=~/$before_enzyme/i){
    0          
895 0           $before_cut_coorr=$before_coorr+$front_enzyme_cut_loc-1;
896 0           $after_cut_coorr=$after_coorr+$later_enzyme_cut_loc-2;
897 0           $length_of_fragment=$after_cut_coorr-$before_cut_coorr+1;
898 0           $seq_selected=substr($scaffold_seq, $before_cut_coorr, $length_of_fragment);
899             }
900             elsif($later_enzyme=~/$before_enzyme/i){
901 0           $before_cut_coorr=$before_coorr+$later_enzyme_cut_loc-1;
902 0           $after_cut_coorr=$after_coorr+$later_enzyme_cut_loc-2;
903 0           $length_of_fragment=$after_cut_coorr-$before_cut_coorr+1;
904 0           $seq_selected=substr($scaffold_seq, $before_cut_coorr, $length_of_fragment);
905             }
906 0           my $bf_ct_crr=$before_cut_coorr+1;
907 0           my $af_ct_crr=$after_cut_coorr+1;
908            
909             # Output the sequence and fragment positions of all-fragments to files;
910 0           $all_seq_file_fh->print("$scaffold_name-$count_all\n$seq_selected\n");
911 0           my $strand;
912 0 0         if($before_enzyme=~/$front_enzyme/i){$strand="+";}
  0 0          
  0            
913             elsif($before_enzyme=~/$later_enzyme/i){$strand="-";}
914 0           $all_loc_file_fh->print("$scaffold_name-$count_all\t$strand\t$before_enzyme\t$bf_ct_crr\t$after_enzyme\t$af_ct_crr\t$length_of_fragment\n");
915 0           push @lengths_of_fragments, $length_of_fragment;
916            
917             # We treat the fragments in range in the same way of all fragments as above;
918 0 0         if($length_of_fragment>=$range1){
919 0 0         if($length_of_fragment<=$range2){
920 0           $count_in_range++;
921 0           $seq_in_range_file_fh->print("$scaffold_name-$count_all\n$seq_selected\n");
922 0           $loc_in_range_file_fh->print("$scaffold_name-$count_all\t$strand\t$before_enzyme\t$bf_ct_crr\t$after_enzyme\t$af_ct_crr\t$length_of_fragment\n");
923 0           push @lengths_fragments_in_range,$length_of_fragment;
924 0           $length_of_fragments_in_range+=$length_of_fragment;
925             }
926             }
927            
928             # Count all fragments length no matter in range or not!
929 0           $length_of_fragments+=$length_of_fragment;
930            
931            
932             }
933 0           my $reduced_rate=$length_of_fragments/$length_of_scafd;
934 0           my $fragment_in_range_rate_scafd=$length_of_fragments_in_range/$length_of_scafd;
935 0           $overall_fragments_length+=$length_of_fragments;
936 0           $overall_fragments_in_range_length+=$length_of_fragments_in_range;
937 0           $overall_length_of_scfd+=$length_of_scafd;
938 0           $reduced_ratio_every_scaffold_file_fh->print("$reduced_rate\t$fragment_in_range_rate_scafd\n");
939 0           print"Done!\n";
940             }
941             }
942 0           my $length_ratio_all_frags=$overall_fragments_length/$overall_length_of_scfd;
943 0           my $length_ratio_frags_in_range=$overall_fragments_in_range_length/$overall_length_of_scfd;
944 0           my $num_all_frags=@lengths_of_fragments;
945 0           my $num_frags_in_range=@lengths_fragments_in_range;
946 0           my $lengths_fragments_in_range=\@lengths_fragments_in_range;
947            
948            
949             # Output the summary of digestion.
950 0           $summary_digestion_fh->print("Expected all fragments:\t$num_all_frags\n");
951 0           $summary_digestion_fh->print("Expected fragments in range:\t$num_frags_in_range\n");
952 0           $summary_digestion_fh->print("Expected reducing ratio of all fragments:\t$length_ratio_all_frags\n");
953 0           $summary_digestion_fh->print("Expected reducing ratio of fragments in range:\t$length_ratio_frags_in_range\n");
954            
955 0           my $lengths_distribution_start=$self->{lengths_distribution_start};
956 0           my $lengths_distribution_end=$self->{lengths_distribution_end};
957 0           my $lengths_distribution_step=$self->{lengths_distribution_step};
958 0           my $lengths_distribution_tmp;
959 0           my (@starts,@ends,@counts,%lengths_distribution);
960            
961 0           my $num_pair=0;
962            
963 0           for($lengths_distribution_tmp= $lengths_distribution_start;$lengths_distribution_tmp<$lengths_distribution_end+$lengths_distribution_step;$lengths_distribution_tmp=$lengths_distribution_tmp+$lengths_distribution_step){
964 0 0         if($lengths_distribution_tmp== $lengths_distribution_start){
    0          
965 0           $num_pair++;
966 0           push @starts, '0';
967 0           push @ends , $lengths_distribution_tmp;
968 0           $lengths_distribution{$num_pair}{"0bp-${lengths_distribution_tmp}bp"}=0;
969            
970 0           $num_pair++;
971 0           my $tmp_start=$lengths_distribution_tmp+1;
972 0           my $tmp_end=$lengths_distribution_tmp+$lengths_distribution_step;
973 0           push @starts, $tmp_start;
974 0           push @ends, $tmp_end;
975 0           $lengths_distribution{$num_pair}{"${tmp_start}bp-${tmp_end}bp"}=0;
976            
977             }
978             elsif($lengths_distribution_tmp < $lengths_distribution_end ){
979 0 0         if($lengths_distribution_tmp+$lengths_distribution_step <$lengths_distribution_end ){
    0          
980 0           $num_pair++;
981 0           my $tmp_start=$lengths_distribution_tmp+1;
982 0           my $tmp_end=$lengths_distribution_tmp+$lengths_distribution_step;
983 0           push @starts, $tmp_start;
984 0           push @ends, $tmp_end;
985 0           $lengths_distribution{$num_pair}{"${tmp_start}bp-${tmp_end}bp"}=0;
986            
987             }
988             elsif($lengths_distribution_tmp +$lengths_distribution_step >= $lengths_distribution_end){
989 0           $num_pair++;
990 0           my $tmp_start=$lengths_distribution_tmp+1;
991 0           my $tmp_end=$lengths_distribution_end;
992 0           push @starts, $tmp_start;
993 0           push @ends, $tmp_end;
994 0           $lengths_distribution{$num_pair}{"${tmp_start}bp-${tmp_end}bp"}=0;
995            
996 0           $num_pair++;
997 0           $tmp_start=$tmp_end+1;
998            
999 0           push @starts, $tmp_start;
1000 0           push @ends, 'bigger';
1001 0           $lengths_distribution{$num_pair}{"${tmp_start}bp-bigger bp"}=0;
1002            
1003             }
1004             }
1005             }
1006            
1007            
1008            
1009 0           for (@lengths_of_fragments){
1010 0           my $length=$_;
1011 0           my $num_pair=1;
1012 0           my $num_last_pair=@starts;
1013 0           for($num_pair=1;$num_pair<=$num_last_pair;$num_pair++){
1014            
1015 0           my $array_index=$num_pair-1;
1016 0           my $start=$starts[$array_index];
1017 0           my $end=$ends[$array_index];
1018 0 0         if($num_pair<$num_last_pair){
    0          
1019 0 0 0       if($length>=$start && $length<=$end){
1020 0           for my $description (keys %{$lengths_distribution{$num_pair}}){
  0            
1021 0           $lengths_distribution{$num_pair}{$description}++;
1022 0           last;
1023             }
1024 0           last;
1025             }
1026             }
1027             elsif($num_pair==$num_last_pair){
1028 0 0         if($length>=$start){
1029 0           for my $description (keys %{$lengths_distribution{$num_pair}}){
  0            
1030 0           $lengths_distribution{$num_pair}{$description}++;
1031 0           last;
1032             }
1033 0           last;
1034             }
1035             }
1036             }
1037             }
1038            
1039 0           $summary_digestion_fh->print("\nLengths\' scope\tNumber of fragments in this scope\tRatio of these fragments in all\n");
1040            
1041 0           for(1..@starts){
1042 0           my $num_pair=$_;
1043 0           for my $description(keys %{$lengths_distribution{$num_pair}}){
  0            
1044 0           my $num_fragments=$lengths_distribution{$num_pair}{$description};
1045 0           my $ratio;
1046 0 0         if($num_all_frags>0){
1047 0           $ratio=$num_fragments/$num_all_frags;
1048             }else{
1049 0           $ratio='N/A';
1050             }
1051 0           $summary_digestion_fh->print("$description\t$num_fragments\t$ratio\n");
1052             }
1053             }
1054 0           $summary_digestion_fh->print("\n");
1055             }
1056            
1057             =head2 add_SNPs
1058            
1059             $digest->add_SNPs(-SNPs=>'Full path of the SNPs file');
1060             Adds the absolute path of the SNPs file to RestrictionDigest variable.
1061             =cut
1062            
1063             sub add_SNPs {
1064 0     0     my $self=shift;
1065 0           my %parameters=@_;
1066 0           for (keys %parameters){
1067 0 0         if($_=~/^-SNPs$/){
1068 0           $self->{SNPs}=$parameters{$_};
1069             }
1070             else{
1071 0           die "Unacceptable parameters of the method , please perldoc RestrictionDigest for example or read README for more help\n";
1072             }
1073             }
1074             }
1075            
1076             =head2 count_SNPs_at_fragments
1077            
1078             $digest->count_SNPs_at_fragments(-sequence_type=>'100SE',-sequence_end=>'front_enzyme');
1079             Count the expected SNPs appeared in the framgents.
1080             =cut
1081            
1082             sub count_SNPs_at_fragments {
1083 0     0     my $self=shift;
1084 0           my %parameters=@_;
1085            
1086 0 0         if(keys %parameters){
1087 0           for(keys %parameters){
1088 0 0         if($_=~/^-sequence_type$/){
    0          
1089 0           $self->{sequence_type}=$parameters{$_};
1090             }
1091             elsif($_=~/^-sequence_end$/){
1092 0           $self->{sequence_end}=$parameters{$_};
1093             }
1094             else{
1095 0           die "Unacceptable parameters in the method , please perldoc RestrictionDigest for example or read README for more help\n";
1096             }
1097             }
1098             }
1099            
1100 0           my $ref=$self->{ref};
1101 0           my $front_enzyme=$self->{enzyme1};
1102 0           my $later_enzyme=$self->{enzyme2};
1103 0           my $range1=$self->{range_start};
1104 0           my $SNPs=$self->{SNPs};
1105 0 0         if($SNPs){
1106 0           my $SNPs_basename=basename($SNPs);
1107 0           print "The SNPs file provided is\t", $SNPs_basename, ".\n";
1108             }
1109             else{
1110 0           die"No SNPs file provided! Please perldoc RestrictionDigest for example.\n";
1111             }
1112            
1113 0           my $sequence_type=$self->{sequence_type};
1114            
1115             # Check the sequencing type and determine the domains of the fragments to calculate the SNPs number.
1116 0           my ($pe_sequence_length, $se_sequence_length);
1117 0           my $sequence_type_code;
1118 0 0         if($sequence_type=~/^(\d+)pe$/i){
    0          
1119 0           $sequence_type_code=2;
1120 0           my $sequence_length=$1;
1121 0 0         unless($sequence_length*2 <= $range1){
1122 0           die"Unacceptable sequence_type when range's front border is $range1 bp. Please perldoc RestrictionDigest for example or read README for more help!\n";
1123             }
1124             else{
1125 0           $pe_sequence_length=$sequence_length;
1126             }
1127             }
1128             elsif($sequence_type=~/^(\d+)se$/i){
1129 0           $sequence_type_code=1;
1130 0           my $sequence_length=$1;
1131 0 0         unless($sequence_length<=$range1){
1132 0           die "Unacceptable sequence_type when range's front border is $range1 bp. Please perldoc RestrictionDigest for example or read README for more help!\n";
1133             }
1134             else{
1135 0           $se_sequence_length=$sequence_length;
1136             }
1137             }
1138             else{
1139 0           die "Unacceptable sequence_type. Please perldoc RestrictionDigest for example or read README for more help!\n";
1140             }
1141            
1142 0           print "The sequencing type provided is\t", $sequence_type, ".\n";
1143            
1144             # Check the sequencing end of single-end sequencing.
1145 0           my $se_sequence_end;
1146             my $se_sequence_end_code;
1147 0 0         if($sequence_type_code==1){
1148 0           $se_sequence_end=$self->{sequence_end};
1149 0 0         if($se_sequence_end=~/^front_enzyme$/i){
    0          
1150 0           $se_sequence_end_code=1;
1151 0           print "The sequencing end provided is\t", $se_sequence_end, ".\n";
1152             }
1153             elsif($se_sequence_end=~/^behind_enzyme$/i){
1154 0           $se_sequence_end_code=2;
1155 0           print "The sequencing end provided is\t", $se_sequence_end, ".\n";
1156             }
1157             else{
1158 0           die "Unacceptable sequence_end. Please perldoc RestrictionDigest for example or read README for more help!\n";
1159             }
1160             }
1161            
1162            
1163            
1164 0           my $output_dir=$self->{output_dir};
1165 0           $output_dir=~s/^(.+)\/?$/$1/;
1166            
1167             # Get the basename of reference,not include the path.
1168 0           my $name_reference=basename($ref);
1169            
1170             # Make the file handle to the summary file;
1171 0           my $summary_digestion_fh=IO::File->new(">>$output_dir/digestion_summary_${name_reference}_by_${front_enzyme}_and_${later_enzyme}");
1172            
1173             # Make the file handle to the locs file of all fragments and fragments in range.
1174 0           my $all_loc_file_fh=IO::File->new("$output_dir/position_frags_${name_reference}_by_${front_enzyme}_and_${later_enzyme}",'r');
1175 0           my $loc_in_range_file_fh=IO::File->new("$output_dir/position_frags_in_range_${name_reference}_by_${front_enzyme}_and_${later_enzyme}",'r');
1176            
1177             # Hold SNPs into hash.
1178 0           my $SNPs_fh=IO::File->new("$SNPs",'r');
1179 0           my %SNPs;
1180 0           while(<$SNPs_fh>){
1181 0           chomp;
1182 0           my ($SNPs_scaffold_name,$SNPs_pos,$type)=split /\s+/, $_;
1183 0           push @{$SNPs{$SNPs_scaffold_name}}, $SNPs_pos;
  0            
1184             }
1185 0           print "Counting SNPs at the output fragments...\t";
1186            
1187             # Create the SNPs files to hold SNPs of all fragments and fragments in range.
1188 0           my $SNPs_at_all_frags_fh=IO::File->new(">>$output_dir/SNPs_at_all_frags_${name_reference}_by_${front_enzyme}_and_${later_enzyme}");
1189 0           my $SNPs_at_frags_in_range_fh=IO::File->new(">>$output_dir/SNPs_at_frags_in_range_${name_reference}_by_${front_enzyme}_and_${later_enzyme}");
1190            
1191            
1192            
1193             # Scan all locs file to count SNPs and output the result to the summary file.
1194 0           my $all_SNPs_number=0;
1195 0 0         if($sequence_type_code == 1){
    0          
1196 0 0         if($se_sequence_end_code==1){
    0          
1197 0           while(<$all_loc_file_fh>){
1198 0           chomp;
1199 0           my($fragment_name,$strand,$enzyme1,$pos1,$enzyme2,$pos2,$length)=split /\t/, $_;
1200 0           my $scaffold_name;
1201 0 0         if($fragment_name=~/^>(.+)-(\d+)/){
1202 0           $scaffold_name=$1;
1203             }
1204 0 0         if($SNPs{$scaffold_name}){
1205 0           for my $SNP(@{$SNPs{$scaffold_name}}){
  0            
1206 0 0         if($front_enzyme=~/$enzyme1/i){
    0          
1207 0 0 0       if($SNP >=$pos1 && $SNP <=($pos1+$se_sequence_length-1)){
1208 0           $all_SNPs_number++;
1209 0           $SNPs_at_all_frags_fh->print("$scaffold_name\t$SNP\n");
1210             }
1211             }
1212             elsif($front_enzyme=~/$enzyme2/i){
1213 0 0 0       if($SNP >= ($pos2-$se_sequence_length+1) && $SNP <=$pos2){
1214 0           $all_SNPs_number++;
1215 0           $SNPs_at_all_frags_fh->print("$scaffold_name\t$SNP\n");
1216             }
1217             }
1218             }
1219             }
1220             }
1221             }
1222             elsif($se_sequence_end_code==2){
1223 0           while(<$all_loc_file_fh>){
1224 0           chomp;
1225 0           my($fragment_name,$strand,$enzyme1,$pos1,$enzyme2,$pos2,$length)=split /\t/, $_;
1226 0           my $scaffold_name;
1227 0 0         if($fragment_name=~/^>(.+)-(\d+)/){
1228 0           $scaffold_name=$1;
1229             }
1230 0 0         if($SNPs{$scaffold_name}){
1231 0           for my $SNP(@{$SNPs{$scaffold_name}}){
  0            
1232 0 0         if($later_enzyme=~/$enzyme1/i){
    0          
1233 0 0 0       if($SNP>=$pos1 && $SNP<=($pos1+$se_sequence_length-1)){
1234 0           $all_SNPs_number++;
1235 0           $SNPs_at_all_frags_fh->print("$scaffold_name\t$SNP\n");
1236             }
1237             }
1238             elsif($later_enzyme=~/$enzyme2/i){
1239 0 0 0       if($SNP>= ($pos2-$se_sequence_length+1) && $SNP<=$pos2){
1240 0           $all_SNPs_number++;
1241 0           $SNPs_at_all_frags_fh->print("$scaffold_name\t$SNP\n");
1242             }
1243             }
1244             }
1245             }
1246             }
1247             }
1248 0           $summary_digestion_fh->print("Expected SNPs at all fragments via $sequence_type:\t$all_SNPs_number\n");
1249             }
1250             elsif($sequence_type_code ==2){
1251 0           while(<$all_loc_file_fh>){
1252 0           chomp;
1253 0           my($fragment_name,$strand,$enzyme1,$pos1,$enzyme2,$pos2,$length)=split /\t/, $_;
1254 0           my $scaffold_name;
1255 0 0         if($fragment_name=~/^>(.+)-(\d+)/){
1256 0           $scaffold_name=$1;
1257             }
1258 0 0         if($SNPs{$scaffold_name}){
1259 0           for my $SNP(@{$SNPs{$scaffold_name}}){
  0            
1260 0 0 0       if( ($SNP>=$pos1 && $SNP<=($pos1+$pe_sequence_length-1)) || ( $SNP>= ($pos2-$pe_sequence_length+1) && $SNP<=$pos2 ) ){
      0        
      0        
1261 0           $all_SNPs_number++;
1262 0           $SNPs_at_all_frags_fh->print("$scaffold_name\t$SNP\n");
1263             }
1264             }
1265             }
1266             }
1267 0           $summary_digestion_fh->print("Expected SNPs at all fragments via $sequence_type:\t$all_SNPs_number\n");
1268             }
1269            
1270            
1271             # Scan locs in range file to count SNPs and output the result to the summary file.
1272            
1273 0           my $SNPs_in_range_number=0;
1274            
1275 0 0         if($sequence_type_code == 1){
    0          
1276 0 0         if($se_sequence_end_code==1){
    0          
1277 0           while(<$loc_in_range_file_fh>){
1278 0           chomp;
1279 0           my($fragment_name,$strand,$enzyme1,$pos1,$enzyme2,$pos2,$length)=split /\t/, $_;
1280 0           my $scaffold_name;
1281 0 0         if($fragment_name=~/^>(.+)-(\d+)/){
1282 0           $scaffold_name=$1;
1283             }
1284 0 0         if($SNPs{$scaffold_name}){
1285 0           for my $SNP(@{$SNPs{$scaffold_name}}){
  0            
1286 0 0         if($front_enzyme=~/$enzyme1/i){
    0          
1287 0 0 0       if($SNP>=$pos1 && $SNP<=($pos1+$se_sequence_length-1)){
1288 0           $SNPs_in_range_number++;
1289 0           $SNPs_at_frags_in_range_fh->print("$scaffold_name\t$SNP\n");
1290             }
1291             }
1292             elsif($front_enzyme=~/$enzyme2/i){
1293 0 0 0       if($SNP>= ($pos2-$se_sequence_length+1) && $SNP<=$pos2){
1294 0           $SNPs_in_range_number++;
1295 0           $SNPs_at_frags_in_range_fh->print("$scaffold_name\t$SNP\n");
1296             }
1297             }
1298             }
1299             }
1300             }
1301             }
1302             elsif($se_sequence_end_code==2){
1303 0           while(<$loc_in_range_file_fh>){
1304 0           chomp;
1305 0           my($fragment_name,$strand,$enzyme1,$pos1,$enzyme2,$pos2,$length)=split /\t/, $_;
1306 0           my $scaffold_name;
1307 0 0         if($fragment_name=~/^>(.+)-(\d+)/){
1308 0           $scaffold_name=$1;
1309             }
1310 0 0         if($SNPs{$scaffold_name}){
1311 0           for my $SNP(@{$SNPs{$scaffold_name}}){
  0            
1312 0 0         if($later_enzyme=~/$enzyme1/i){
    0          
1313 0 0 0       if($SNP>=$pos1 && $SNP<=($pos1+$se_sequence_length-1)){
1314 0           $SNPs_in_range_number++;
1315 0           $SNPs_at_frags_in_range_fh->print("$scaffold_name\t$SNP\n");
1316             }
1317             }
1318             elsif($later_enzyme=~/$enzyme2/i){
1319 0 0 0       if($SNP>= ($pos2-$se_sequence_length+1) && $SNP<=$pos2){
1320 0           $SNPs_in_range_number++;
1321 0           $SNPs_at_frags_in_range_fh->print("$scaffold_name\t$SNP\n");
1322             }
1323             }
1324             }
1325             }
1326             }
1327             }
1328 0           $summary_digestion_fh->print("Expected SNPs at fragments in range via $sequence_type:\t$SNPs_in_range_number\n");
1329             }
1330             elsif($sequence_type_code ==2){
1331 0           while(<$loc_in_range_file_fh>){
1332 0           chomp;
1333 0           my($fragment_name,$strand,$enzyme1,$pos1,$enzyme2,$pos2,$length)=split /\t/, $_;
1334 0           my $scaffold_name;
1335 0 0         if($fragment_name=~/^>(.+)-(\d+)/){
1336 0           $scaffold_name=$1;
1337             }
1338 0 0         if($SNPs{$scaffold_name}){
1339 0           for my $SNP(@{$SNPs{$scaffold_name}}){
  0            
1340 0 0 0       if( ($SNP>=$pos1 && $SNP<=($pos1+$pe_sequence_length-1)) || ( $SNP>= ($pos2-$pe_sequence_length+1) && $SNP<=$pos2 ) ){
      0        
      0        
1341 0           $SNPs_in_range_number++;
1342 0           $SNPs_at_frags_in_range_fh->print("$scaffold_name\t$SNP\n");
1343             }
1344             }
1345             }
1346             }
1347 0           $summary_digestion_fh->print("Expected SNPs at fragments in range via $sequence_type:\t$SNPs_in_range_number\n");
1348             }
1349 0           print "Done!\n";
1350             }
1351            
1352            
1353             =head2 add_gff
1354            
1355             $digest->add_gff(-ref=>'Full path of the gff file');
1356             Adds the absolute path of gff file to RestrictionDigest variable.
1357             If User want to calculate the coverage ratio of different genomic strutures by the digested
1358             fragments, this method is obligatory.
1359             =cut
1360            
1361             sub add_gff {
1362 0     0     my $self=shift;
1363 0           my %parameters=@_;
1364 0           for (keys %parameters){
1365 0 0         if($_=~/^-gff$/){
1366 0           $self->{gff}=$parameters{$_};
1367             }
1368             else{
1369 0           die"Unacceptable parameters of the method , please perldoc RestrictionDigest for example or read README for more help\n";
1370             }
1371             }
1372             }
1373            
1374             =head2 all_frags_coverage_ratio
1375            
1376             $digest->all_frags_coverage_ratio();
1377             Calculate the coverage ratio of different parts of the genome reference,
1378             including mRNA, CDS, UTR and intergenic region, by all digested fragments.
1379             This process will take a very long time, please be patient.
1380             =cut
1381            
1382             sub all_frags_coverage_ratio {
1383 0     0     my $self=shift;
1384            
1385 0           my $ref=$self->{ref};
1386 0           my $basename_ref=basename($ref);
1387            
1388 0           my $e1=$self->{enzyme1};
1389 0           my $e2=$self->{enzyme2};
1390            
1391 0           my $output_dir=$self->{output_dir};
1392 0           $output_dir=~s/^(.+)\/?$/$1/;
1393            
1394 0           my $loc_file_all_frags="$output_dir/position_frags_${basename_ref}_by_${e1}_and_${e2}";
1395            
1396 0           $self->genome_structure_coverage_ratio($loc_file_all_frags);
1397             }
1398            
1399             =head2 frags_in_range_coverage_ratio
1400            
1401             $digest->frags_in_range_coverage_ratio();
1402             Calculate the coverage ratio of different parts of the genome reference,
1403             including mRNA, CDS, UTR and intergenic region, by the fragments in range.
1404             This process will take a very long time, please be patient.
1405             =cut
1406            
1407             sub frags_in_range_coverage_ratio {
1408 0     0     my $self=shift;
1409            
1410 0           my $ref=$self->{ref};
1411 0           my $basename_ref=basename($ref);
1412            
1413 0           my $e1=$self->{enzyme1};
1414 0           my $e2=$self->{enzyme2};
1415            
1416 0           my $output_dir=$self->{output_dir};
1417 0           $output_dir=~s/^(.+)\/?$/$1/;
1418            
1419 0           my $loc_file_all_frags="$output_dir/position_frags_in_range_${basename_ref}_by_${e1}_and_${e2}";
1420            
1421 0           $self->genome_structure_coverage_ratio($loc_file_all_frags);
1422             }
1423            
1424             =head2 genome_structure_coverage_ratio
1425            
1426             A subroutine invoked by the 'all_frags_coverage_ratio' and
1427             the 'frags_in_range_coverage_ratio' methods.
1428             User do not use this subroutine.
1429            
1430             =cut
1431            
1432            
1433             sub genome_structure_coverage_ratio {
1434 0     0     my $self=shift;
1435 0           my $loc_frags_file=shift;
1436            
1437             # Get the reference name, enzyme names from hash.
1438 0           my $ref=$self->{ref};
1439 0           my $ref_fh=IO::File->new($ref,'r');
1440            
1441 0           my $basename_ref=basename($ref);
1442 0           my $e1=$self->{enzyme1};
1443 0           my $e2=$self->{enzyme2};
1444 0           my $output_dir=$self->{output_dir};
1445 0           $output_dir=~s/^(.+)\/?$/$1/;
1446            
1447             # Get the full path and name of gff file and make a filehandle to it.
1448 0           my $gff=$self->{gff};
1449            
1450 0 0         if($gff){
1451 0           my $gff_basename=basename($gff);
1452 0           print "The GFF file provided is\t", $gff_basename, ".\n";
1453             }
1454             else{
1455 0           die"No GFF file provided! Please perldoc RestrictionDigest for example.\n";
1456             }
1457            
1458             # Make output filehandles to generic region and intergenic region coverage ratio files.
1459 0           my $coverage_ratio_fh;
1460            
1461 0           my $basename_loc_frags_file=basename($loc_frags_file);
1462            
1463 0 0         if($basename_loc_frags_file =~/in_range/){
1464 0           $coverage_ratio_fh=IO::File->new(">>$output_dir/genome_coverage_ratio_in_range_${basename_ref}_by_${e1}_and_${e2}");
1465             }
1466             else{
1467 0           $coverage_ratio_fh=IO::File->new(">>$output_dir/genome_coverage_ratio_${basename_ref}_by_${e1}_and_${e2}");
1468             }
1469            
1470 0           $coverage_ratio_fh->print("ScaffoldName\tIntergenicLength\tIntergenicMapLength\tIntergenicMapRatio\tGenesLength\tGenesMapLength\tGenesMapRatio\tExonsLength\tExonsMapLength\tExonsMapRatio\tIntronsLength\tIntronsMapLength\tIntronsMapRatio\n");
1471            
1472            
1473             # Set variables used in this program.
1474 0           my($all_intergenic_length,$all_intergenic_map_length,$all_gene_length,$all_gene_map_length,$all_exon_length,$all_exon_map_length,$all_intron_length,$all_intron_map_length)=qw(0 0 0 0 0 0 0 0);
1475            
1476            
1477            
1478             # Fetch the lengths of different parts of this scaffold from the GFF file.
1479 0           my $gff_fh=IO::File->new($gff,'r');
1480 0           my %gff;
1481 0           print "Scanning the GFF file...\t";
1482 0           while(<$gff_fh>){
1483 0           chomp;
1484            
1485 0 0         unless($_=~/^#/){
1486 0           my @gff=split /\t/, $_;
1487 0 0         if(@gff == 9){
1488 0           my ($gff_scfd_name,$gff_type,$gff_start,$gff_stop)=@gff[0,2,3,4];
1489 0 0         if($gff_type=~/CDS|exon/){
    0          
1490 0           $gff_type='exon';
1491 0 0         if($gff{$gff_scfd_name}{$gff_type}){
1492            
1493 0 0         unless(grep {$gff_start == $_} @{$gff{$gff_scfd_name}{$gff_type}}){
  0            
  0            
1494            
1495 0           push @{$gff{$gff_scfd_name}{$gff_type}}, ($gff_start,$gff_stop);
  0            
1496             }
1497             }
1498             else{
1499 0           push @{$gff{$gff_scfd_name}{$gff_type}},($gff_start,$gff_stop);
  0            
1500             }
1501             }
1502             elsif($gff_type=~/mRNA|gene/){
1503 0           $gff_type='gene';
1504 0 0         if($gff{$gff_scfd_name}{$gff_type}){
1505 0 0         unless(grep {$gff_start == $_} @{$gff{$gff_scfd_name}{$gff_type}}){
  0            
  0            
1506 0           push @{$gff{$gff_scfd_name}{$gff_type}},($gff_start,$gff_stop);
  0            
1507             }
1508             }
1509             else{
1510 0           push @{$gff{$gff_scfd_name}{$gff_type}},($gff_start,$gff_stop);
  0            
1511             }
1512             }
1513             }
1514             }
1515             }
1516 0           $gff_fh->close;
1517 0           print "Done!\n";
1518            
1519             # Scan the loc file and fetch the information.
1520            
1521 0           my $loc_frags_file_fh=IO::File->new($loc_frags_file,'r');
1522 0           my %all_locs;
1523 0           print "Scanning the positions file...\t";
1524 0           while(<$loc_frags_file_fh>){
1525 0           chomp;
1526 0           my($frgt_name,$strand,$front_enzyme,$start_coor,$later_enzyme,$stop_coor,$length)=split /\t/,$_;
1527 0           my $scfd_frgt;
1528 0 0         if($frgt_name=~/^>(.+)-\d+$/){
1529 0           $scfd_frgt=$1;
1530 0           push @{$all_locs{$scfd_frgt}}, ($start_coor, $stop_coor);
  0            
1531             }
1532             }
1533 0           $loc_frags_file_fh->close;
1534 0           print "Done!\n";
1535            
1536            
1537             # Handle every scaffold one by one.
1538 0           my($scfd_name);
1539 0           while(<$ref_fh>){
1540 0           chomp;
1541 0 0         if($_=~/^>(\S+)$/){
1542 0           $scfd_name=$1;
1543             }
1544             else{
1545 0           my $scaffold_length=length $_;
1546 0           print "CoverageRatio: Processing $scfd_name...\t";
1547            
1548            
1549             # Calculate the lengths of different parts of this scaffold.
1550 0           my ($exon_length,$intron_length,$gene_length,$intergenic_length)=qw(0 0 0 0);
1551            
1552 0 0         if($gff{$scfd_name}{gene}){
1553 0           my @gene_tmp=@{$gff{$scfd_name}{gene}};
  0            
1554            
1555 0 0         if(@gene_tmp % 2){
1556 0           print "gene tmp not correct\n";
1557             }
1558            
1559 0           while(@gene_tmp){
1560 0           my $front=shift @gene_tmp;
1561 0           my $behind=shift @gene_tmp;
1562 0           $gene_length+=$behind-$front+1;
1563             }
1564 0           $intergenic_length=$scaffold_length-$gene_length;
1565             }else{
1566 0           $intergenic_length=$scaffold_length;
1567             }
1568            
1569 0 0         if($gff{$scfd_name}{exon}){
1570 0           my @exon_tmp=@{$gff{$scfd_name}{exon}};
  0            
1571            
1572 0 0         if(@exon_tmp % 2){
1573 0           print "exon tmp not correct\n";
1574             }
1575            
1576 0           while(@exon_tmp){
1577 0           my $front=shift @exon_tmp;
1578 0           my $behind=shift @exon_tmp;
1579 0           $exon_length+=$behind-$front+1;
1580             }
1581 0           $intron_length=$gene_length-$exon_length;
1582             }
1583 0           $all_intergenic_length+=$intergenic_length;
1584 0           $all_gene_length+=$gene_length;
1585 0           $all_exon_length+=$exon_length;
1586 0           $all_intron_length+=$intron_length;
1587            
1588             # Scan the locs file and fetch the map lengths of different parts.
1589 0           my $gene_map_length=0;
1590 0           my $exon_map_length=0;
1591 0           my $intron_map_length=0;
1592 0           my $intergenic_map_length=0;
1593            
1594             # Extract fragments' locs for %all_locs;
1595 0 0         if($all_locs{$scfd_name}){
1596 0           my @tmp_locs=@{$all_locs{$scfd_name}};
  0            
1597 0           while(@tmp_locs){
1598 0           my $front=shift @tmp_locs;
1599 0           my $behind=shift @tmp_locs;
1600 0           for my $one($front..$behind){
1601 0           my $gene_map_count=0;
1602 0           my $exon_map_count=0;
1603 0           my $intron_map_count=0;
1604            
1605 0 0         if($gff{$scfd_name}{gene}){
1606 0           my @gene_tmp=@{$gff{$scfd_name}{gene}};
  0            
1607 0           while(@gene_tmp){
1608 0           my $front=shift @gene_tmp;
1609 0           my $behind=shift @gene_tmp;
1610 0 0 0       if($one>=$front && $one<=$behind){
1611 0           $gene_map_count++;
1612 0           $gene_map_length++;
1613 0           last;
1614             }
1615             }
1616             }
1617            
1618 0 0         if($gff{$scfd_name}{exon}){
1619 0           my @exon_tmp=@{$gff{$scfd_name}{exon}};
  0            
1620 0           while(@exon_tmp){
1621 0           my $front=shift @exon_tmp;
1622 0           my $behind=shift @exon_tmp;
1623 0 0 0       if($one>=$front && $one<=$behind){
1624 0           $exon_map_count++;
1625 0           $exon_map_length++;
1626 0           last;
1627             }
1628             }
1629             }
1630            
1631 0 0         if($gene_map_count==0){
    0          
1632 0           $intergenic_map_length++;
1633             }
1634             elsif($exon_map_count==0){
1635 0           $intron_map_length++;
1636             }
1637             }
1638             }
1639             }
1640            
1641            
1642            
1643             # Output part!!!
1644 0           my($intergenic_map_ratio,$gene_map_ratio,$exon_map_ratio,$intron_map_ratio)=qw(0 0 0 0);
1645 0 0         unless($intergenic_length==0){
  0            
1646 0           $intergenic_map_ratio=$intergenic_map_length/$intergenic_length;
1647             }
1648             else{$intergenic_map_ratio="N/A";
1649             }
1650            
1651 0 0         unless($gene_length==0){
  0            
1652 0           $gene_map_ratio=$gene_map_length/$gene_length;
1653             }
1654             else{$gene_map_ratio="N/A";
1655             }
1656            
1657 0 0         unless($exon_length==0){
  0            
1658 0           $exon_map_ratio=$exon_map_length/$exon_length;
1659             }
1660             else{$exon_map_ratio="N/A";
1661             }
1662            
1663 0 0         unless($intron_length==0){
  0            
1664 0           $intron_map_ratio=$intron_map_length/$intron_length;
1665             }
1666             else{$intron_map_ratio="N/A";
1667             }
1668 0           $all_intergenic_map_length+=$intergenic_map_length;
1669 0           $all_gene_map_length+=$gene_map_length;
1670 0           $all_exon_map_length+=$exon_map_length;
1671 0           $all_intron_map_length+=$intron_map_length;
1672            
1673            
1674 0           $coverage_ratio_fh->print("$scfd_name\t$intergenic_length\t$intergenic_map_length\t$intergenic_map_ratio\t$gene_length\t$gene_map_length\t$gene_map_ratio\t$exon_length\t$exon_map_length\t$exon_map_ratio\t$intron_length\t$intron_map_length\t$intron_map_ratio\n");
1675 0           print "Done!\n";
1676             }
1677             }
1678 0           my ($all_intergenic_map_ratio,$all_gene_map_ratio,$all_exon_map_ratio,$all_intron_map_ratio)=qw(0 0 0 0);
1679 0 0         if($all_gene_length==0){$all_gene_map_ratio='N/A';}
  0            
1680             else{
1681 0           $all_gene_map_ratio=$all_gene_map_length/$all_gene_length;}
1682 0 0         if($all_exon_length==0){$all_exon_map_ratio='N/A';}
  0            
1683             else{
1684 0           $all_exon_map_ratio=$all_exon_map_length/$all_exon_length;}
1685 0 0         if($all_intron_length==0){$all_intron_map_ratio='N/A';}
  0            
1686             else{
1687 0           $all_intron_map_ratio=$all_intron_map_length/$all_intron_length;}
1688 0 0         if($all_intergenic_length==0){$all_intergenic_map_ratio='N/A';}
  0            
1689             else{
1690 0           $all_intergenic_map_ratio=$all_intergenic_map_length/$all_intergenic_length;}
1691            
1692 0           $coverage_ratio_fh->print("Intergenic region map ratio is\t$all_intergenic_map_ratio\nGenes region map ratio is\t$all_gene_map_ratio\nExon region map ratio is\t$all_exon_map_ratio\nIntron region map ratio is\t$all_intron_map_ratio\n");
1693             }
1694            
1695            
1696            
1697            
1698             package RestrictionDigest::Single;
1699            
1700 1     1   20 use 5.8.0;
  1         2  
  1         34  
1701 1     1   6 use strict;
  1         2  
  1         37  
1702 1     1   3 use warnings FATAL => 'all';
  1         1  
  1         66  
1703            
1704             =head1 NAME
1705            
1706            
1707             RestrictionDigest::Single - the simulation tool for singel enzyme digestion!
1708            
1709             =head1 VERSION
1710            
1711             Version 0.01
1712            
1713             =cut
1714            
1715             our $VERSION = '0.01';
1716            
1717            
1718            
1719             =head1 SYNOPSIS
1720            
1721             RestrictionDigest::Single is used for the simulation of single-enzyme GBS/RAD approaches.
1722            
1723            
1724             use RestrictionDigest;
1725            
1726             $digest = RestrictionDigest::Single->new();
1727            
1728             $digest->add_ref($reference_file); # $reference_file refers to the full path of the reference file;
1729            
1730             $digest->add_single_enzyme($enzyme); # $enzyme refers to the enzyme User wants to use in the digestion simulation;
1731            
1732             $digest->add_output_dir($output_dir);# $output_dir="A directory in which you want to put the result files";
1733            
1734             $species->single_digest();
1735            
1736            
1737             =cut
1738            
1739 1     1   5 use IO::File;
  1         1  
  1         144  
1740 1     1   16 use File::Basename;
  1         2  
  1         52  
1741            
1742 1     1   5 use vars qw(%fields %enzyme_list);
  1         0  
  1         6617  
1743            
1744             # Define the fields to be used as identifiers.
1745             %fields = (
1746             ref => undef,
1747             gff => undef,
1748             SNPs => undef,
1749             enzyme => undef,
1750             output_dir=> undef,
1751             range_start => 201,
1752             range_end => 500,
1753             sequence_type => '100PE',
1754             lengths_distribution_start =>100,
1755             lengths_distribution_end => 1000,
1756             lengths_distribution_step => 50,
1757             );
1758            
1759             # Make the endonuclease enzyme-container--a hash.
1760             %enzyme_list=(
1761             EcoRI => 'G|AATTC',
1762             HinfI => 'G|ANTC',
1763             AluI => 'AG|CT',
1764             AvaII => 'G|GWCC',
1765             BamHI => 'G|GATCC',
1766             HindIII => 'A|AGCTT',
1767             MseI => 'T|TAA',
1768             MspI => 'C|CGG',
1769             NdeI => 'CA|TATG',
1770             PstI => 'CTGCA|G',
1771             SalI => 'G|TCGAC',
1772             XbaI => 'T|CTAGA',
1773             XhoI => 'C|TCGAG',
1774             NlaIII => 'NCATG|N',
1775             AatII => 'GACGT|C',
1776             AccI => 'GT|MKAC',
1777             Acc65I => 'G|GTACC',
1778             AciI => 'C|CGC',
1779             AclI => 'AA|CGTT',
1780             AfeI => 'AGC|GCT',
1781             AflII => 'C|TTAAGA',
1782             AflIII => 'A|CRYGT',
1783             AgeI => 'A|CCGGT',
1784             AlwNI => 'CAGNNN|CTG',
1785             ApaI => 'GGGCC|C',
1786             ApaLI => 'G|TGCAC',
1787             ApeKI => 'G|CWGC',
1788             ApoI => 'R|AATTY',
1789             AscI => 'GG|CGCGCC',
1790             AseI => 'AT|TAAT',
1791             AvaI => 'C|YCGRG',
1792             BclI => 'T|GATCA',
1793             AvrII => 'C|CTAGG',
1794             BaeGI => 'GKGCM|C',
1795             BanI => 'G|GYRCC',
1796             BanII => 'GRGCY|C',
1797             BbvCI => 'CC|TCAGC',
1798             BsaAI => 'YAC|GTR',
1799             BsaJI => 'C|CNNGG',
1800             BsaWI => 'W|CCGGW',
1801             );
1802            
1803             =head1 SUBROUTINES/METHODS
1804            
1805             =head2 new
1806            
1807             $digest=RestrictionDigest::Single->new();
1808             Create a new object.
1809            
1810             =cut
1811            
1812             sub new {
1813 0     0     my $class = shift;
1814 0           my $self = {
1815             %fields,
1816             };
1817 0           bless ($self, $class);
1818 0           return $self;
1819             }
1820            
1821             =head2 add_ref
1822            
1823             $digest->add_ref(-reference=>'Full path of the reference file');
1824             Add the reference file to the RestrictionDigest variable.
1825            
1826             =cut
1827            
1828             sub add_ref {
1829 0     0     my $self=shift;
1830 0           my %parameters=@_;
1831 0           for(keys %parameters){
1832 0 0         if($_=~/^-reference$/){
1833 0           $self->{ref}=$parameters{$_};
1834             }
1835             else{
1836 0           die "Unacceptable parameters in the method , please perldoc RestrictionDigest for example or read README for more help!\n";
1837             }
1838             }
1839            
1840             # Check the format of reference
1841 0           my $ref_fh=IO::File->new("$self->{ref}",'r');
1842 0           my $line_num=0;
1843 0           while(<$ref_fh>){
1844 0           chomp;
1845 0           $line_num++;
1846 0 0         if($line_num % 2 != 0){
1847 0 0         unless($_=~/^>\S+$/){
1848 0           die "The format of reference file is not allowed, please convert it.\nYou can find the example format in the perldoc RestrictionDigest!\n";
1849             }
1850             }
1851 0 0         if($line_num % 2 ==0 ){
1852 0 0         unless($_=~/^[AGTCNRYMKSWHBVD]+$/i){
1853 0           die"The sequences of reference contain some charactors other than AGTCNRYMKSWHBVD, please correct that!\n";
1854             }
1855             }
1856             }
1857 0           $ref_fh->close;
1858             }
1859            
1860             =head2 add_single_enzyme
1861            
1862             $digest->add_single_enzyme(-enzyme=>'EcoRI');
1863             Add the single enzyme's name used to digest the whole genome.
1864            
1865             =cut
1866            
1867            
1868             sub add_single_enzyme {
1869 0     0     my $self=shift;
1870 0           my %parameters=@_;
1871 0           for (keys %parameters){
1872 0 0         if($_=~/^-enzyme$/){
1873 0           $self->{enzyme}=$parameters{$_};
1874             }
1875             else{
1876 0           die"Unacceptable parameters in the method , please perldoc RestrictionDigest for example or read README for more help!\n";
1877             }
1878             }
1879            
1880             # Check the existence of enzymes in the enzyme-container.
1881 0           my $enzyme_exists=0;
1882 0           foreach(keys %enzyme_list){
1883 0 0         if($_=~/^$self->{enzyme}$/i){
1884 0           $enzyme_exists++;
1885             }
1886             }
1887 0 0         if($enzyme_exists >0 ){
1888             }else{
1889 0           die "The single restrict endonuclease User provides does not exist.\nHowever User can add this enzyme and its recognition sites to enzyme-container via the method 'new_enzyme'.\n";
1890             }
1891             }
1892            
1893             =head2 new_enzyme
1894            
1895             $digest->new_enzyme(-enzyme_name=>'EcoRI', -recognition_site=>'G|AATTC');
1896            
1897             If the enzyme User wants to use does not exists in the default enzyme-pool, the enzyme can be added to the enzyme-pool.
1898            
1899             For now, the enzymes in the default enzyme pool are as follows:
1900             EcoRI,HinfI,AluI,AvaII,BamHI,HindIII,MseI,MspI,NdeI,PstI,SalI,XbaI,XhoI,NlaIII.
1901            
1902             Adding new enzyme(s) to the enzyme-pool is easy but the enzyme added to the pool is temporary. The added enzyme(s) is active
1903             only in the class context to which the 'new_method' belongs.
1904            
1905             =cut
1906            
1907             sub new_enzyme {
1908 0     0     my $self=shift;
1909 0           my %parameters=@_;
1910 0           my $new_enzyme_name;
1911             my $new_enzyme_site;
1912 0           for (keys %parameters){
1913 0 0         if($_=~/^-enzyme_name$/){
    0          
1914 0           $new_enzyme_name=$parameters{$_};
1915             }
1916             elsif($_=~/^-recognition_site$/){
1917 0           $new_enzyme_site=$parameters{$_};
1918 0 0         unless($new_enzyme_site=~/|/){
1919 0           die "The 'recognition_site' must contain a cut flag '|'. Please perldoc RestrictionDigest for example or read README for moer help\n";
1920             }
1921             }
1922             else{
1923 0           die"Unacceptable parameters in the method , please perldoc RestrictionDigest for example or read README for more help\n";
1924             }
1925             }
1926 0           $enzyme_list{$new_enzyme_name}=$new_enzyme_site;
1927             }
1928            
1929             =head2 change_range
1930            
1931             $digest->change_range(-start=>201, -end=>500);
1932             Usually, not all fragments are needed by the scientists. The fragments in particular range are more attractive to them.
1933             RestrictionDigest provides the sequences and the positions on the reference of these fragments in range. The default range is
1934             [201bp, 500]. User can change this range via the method .
1935            
1936             =cut
1937            
1938             sub change_range {
1939 0     0     my $self=shift;
1940 0           my %parameters=@_;
1941 0           for(keys %parameters){
1942 0 0         if($_=~/^-start$/){
    0          
1943 0           $self->{range_start}=$parameters{$_};
1944             }
1945             elsif($_=~/^-end$/){
1946 0           $self->{range_end}=$parameters{$_};
1947             }
1948             else{
1949 0           die"Unacceptable parameters in the method ,please perldoc RestrictionDigest for example or read README for more help.\n";
1950             }
1951             }
1952 0 0         if($self->{range_start} < $self->{range_end} ) {
1953             }else{
1954 0           die "The first parameter of the method must be smaller than the second parameter\n";
1955             }
1956             }
1957            
1958             =head2 change_lengths_distribution_parameters
1959            
1960             $digest->change_lengths_distribution_parameters(-front=>100,-behind=>1000,-step=>50);
1961             This program will give a simple summary of the lengths of all fragments after digestion. By default, three scopes, which are
1962             <=100bp, 101bp-1000bp and >1000bp, are given. The number of the fragments whose length fallen in these scopes are given. More
1963             fine sorted, the 101bp-1000bp scope is split by the step of 50bp. These three parameters: the front edge and the behind edge
1964             of the scope and the step can be changed via the method 'change_lengths_distribution_parameters';
1965             =cut
1966            
1967             sub change_lengths_distribution_parameters {
1968 0     0     my $self=shift;
1969 0           my %parameters=@_;
1970 0           for (keys %parameters){
1971 0 0         if($_=~/^-front$/){
    0          
    0          
1972 0           $self->{lengths_distribution_start}=$parameters{$_};
1973             }
1974             elsif($_=~/^-behind$/){
1975 0           $self->{lengths_distribution_end}=$parameters{$_};
1976             }
1977             elsif($_=~/^-step$/){
1978 0           $self->{lengths_distribution_step}=$parameters{$_};
1979             }
1980             else{
1981 0           die"Unacceptable parameters in the method , please perldoc RestrictionDigest for example or read README for more help!\n";
1982             }
1983             }
1984             }
1985            
1986            
1987            
1988             =head2 add_output_dir
1989            
1990             $digest->add_output_dir(-output_dir=>'Full path of the output directory');
1991             Adds the directory into which the result files are put by the program.
1992             The directory must be in the absolute path format.
1993            
1994             =cut
1995            
1996             sub add_output_dir {
1997 0     0     my $self=shift;
1998 0           my %parameters=@_;
1999 0           for(keys %parameters){
2000 0 0         if($_=~/^-output_dir$/){
2001 0           $self->{output_dir}=$parameters{$_};
2002             }
2003             else{
2004 0           die "Unacceptable parameter in the method , please perldoc RestrictionDigest for example or read README for more help!\n";
2005             }
2006             }
2007 0 0         if(-d $self->{output_dir} ) {
2008             }else{
2009 0           die "The output directory User provides does not exist.\n";
2010             }
2011             }
2012            
2013             =head2 single_digest
2014            
2015             $digest->single_digest();
2016             Execute the digestion process.
2017             This process will produce several result files which will be located in the output directory
2018             added through the add_output_dir method.
2019             This process will take some time. It depends on the size of the reference file.
2020             During the process, Prompting messages will be output to the STDOUT
2021             like "Digestion: >ScaffoldName is done!".
2022            
2023             =cut
2024            
2025             sub single_digest {
2026            
2027 0     0     my $self=shift;
2028 0           my $ref=$self->{ref};
2029 0 0         if($ref){
2030 0           my $name_reference=basename($ref);
2031 0           print "The reference file User provides is:\t",$name_reference,"\n";
2032             }else{
2033 0           die "No reference file provied! Please perldoc RestrictionDigest for example.\n";
2034             }
2035 0           my $enzyme=$self->{enzyme};
2036 0 0         if($enzyme){
2037 0           print "The restrict endonuclease User provides is:\t", $enzyme, "\n";
2038             }
2039             else{
2040 0           die"No restrict endonuclease provided! Please perldoc RestrictionDigest for example.\n";
2041             }
2042 0           my $range1=$self->{range_start};
2043 0           my $range2=$self->{range_end};
2044 0           print "The seperate range of the fragments User wants to output is:\t$range1 to $range2 bp.\n";
2045 0           my $output_dir=$self->{output_dir};
2046 0 0         if($output_dir){
2047 0           print "The output directory User provides is:\t",$output_dir, "\n";
2048             }
2049             else{
2050 0           die "No output directory provided! Please perldoc RestrictionDigest for example.\n";
2051             }
2052 0           $output_dir=~s/^(.+)\/?$/$1/;
2053            
2054             # Make the file handle to the reference file.
2055 0           my $ref_fh=IO::File->new("$ref",'r');
2056            
2057             # Get the basename of reference,not include the path.
2058 0           my $name_reference=basename($ref);
2059            
2060             # Make the file handles of result files.
2061 0           my $all_seq_file_fh=IO::File->new(">>$output_dir/seq_frags_${name_reference}_by_${enzyme}");
2062 0           my $all_loc_file_fh=IO::File->new(">>$output_dir/position_frags_${name_reference}_by_${enzyme}");
2063 0           my $seq_in_range_file_fh=IO::File->new(">>$output_dir/seq_frags_in_range_${name_reference}_by_${enzyme}");
2064 0           my $loc_in_range_file_fh=IO::File->new(">>$output_dir/position_frags_in_range_${name_reference}_by_${enzyme}");
2065 0           my $reduced_ratio_every_scaffold_file_fh=IO::File->new(">>$output_dir/reduced_ratio_every_scaffold_${name_reference}_by_${enzyme}");
2066 0           my $summary_digestion_fh=IO::File->new(">>$output_dir/digestion_summary_${name_reference}_by_${enzyme}");
2067            
2068             # Get the base-compositions of the enzyme which User inputs.
2069 0           my $enzyme_seq;
2070 0           foreach(keys %enzyme_list){
2071 0 0         if($_=~/^$enzyme$/i){
2072 0           $enzyme_seq=$enzyme_list{$_};
2073             }
2074             }
2075            
2076             #get the relative location of cutting of the recognation sequences;
2077 0           my @enzyme_rec_seq=split //, $enzyme_seq;
2078 0           my @enzyme_cut_seq;
2079 0           my $enzyme_rec_start=0;
2080 0           my $enzyme_cut_loc;
2081            
2082 0           foreach(@enzyme_rec_seq){
2083 0           $enzyme_rec_start++;
2084 0 0         if($_=~/\|/){
2085 0           $enzyme_cut_loc=$enzyme_rec_start;
2086             }
2087             else{
2088 0           push @enzyme_cut_seq,$_;
2089             }
2090             }
2091            
2092             # Replace the degenerate bases of the enzyme;
2093 0           my $num_of_enzyme_cut_seq=@enzyme_cut_seq;
2094 0           for(@enzyme_cut_seq){
2095 0 0         if($_=~/N/){$_="[ATCG]";}
  0            
2096 0 0         if($_=~/R/){$_="[AG]";}
  0            
2097 0 0         if($_=~/Y/){$_="[CT]";}
  0            
2098 0 0         if($_=~/M/){$_="[AC]";}
  0            
2099 0 0         if($_=~/K/){$_="[GT]";}
  0            
2100 0 0         if($_=~/S/){$_="[GC]";}
  0            
2101 0 0         if($_=~/W/){$_="[AT]";}
  0            
2102 0 0         if($_=~/H/){$_="[ATC]";}
  0            
2103 0 0         if($_=~/B/){$_="[GTC]";}
  0            
2104 0 0         if($_=~/V/){$_="[GAC]";}
  0            
2105 0 0         if($_=~/D/){$_="[GAT]";}
  0            
2106             }
2107 0           my $seq_string_enzyme=join '', @enzyme_cut_seq;
2108            
2109            
2110            
2111             # Make the scalars and arrays used in the whole programme;
2112 0           my $rate_overall=0;
2113 0           my $rate_in_range=0;
2114 0           my @lengths_of_fragments=();
2115 0           my @lengths_fragments_in_range=();
2116 0           my $overall_fragments_length=0;
2117 0           my $overall_fragments_in_range_length=0;
2118 0           my $overall_length_of_scfd=0;
2119            
2120             ########## CIRCLES START HERE!###########
2121            
2122             # Process one scaffold one circle;
2123 0           my ($scaffold_name, $scaffold_seq);
2124 0           while(<$ref_fh>){
2125 0           chomp;
2126 0 0         if($_=~/^>/){
2127 0           $scaffold_name=$_;
2128             }
2129             else{
2130 0           print"Digesting $scaffold_name...\t";
2131 0           $scaffold_seq=$_;
2132 0           $reduced_ratio_every_scaffold_file_fh->print("$scaffold_name\t");
2133 0           my $length_of_scafd=length $scaffold_seq;
2134             # Set the array which contains cut locations of the enzyme.
2135 0           my @enzyme_locs=();
2136             # Find locs of recognition of the enzyme in this scaffold;the locs are meaningful in this array; not in the human context;
2137 0           my $seq_for_search=$scaffold_seq;
2138 0           my $string=$seq_string_enzyme;
2139             # Find the recognition sites of this enzyme in this scaffold.
2140 0 0         if($string=~/^(\w+)$/){
    0          
    0          
    0          
    0          
    0          
2141 0           my $loc_in_array;
2142 0           $loc_in_array=index($seq_for_search,$string);
2143 0 0         unless($loc_in_array==-1){
2144 0           push @enzyme_locs, $loc_in_array;
2145             }
2146 0           while($loc_in_array!=-1){
2147 0           $loc_in_array=index($seq_for_search, $string, $loc_in_array+1);
2148 0 0         unless($loc_in_array==-1){
2149 0           push @enzyme_locs, $loc_in_array;
2150             }
2151             }
2152             }
2153             elsif($string=~/(\w+)\[(\w+)\](\w+)/){
2154 0           my $breast_bases=$1;
2155 0           my $back_bases=$3;
2156 0           my $generate_bases=$2;
2157 0           my @generate_bases=split //, $generate_bases;
2158 0           my $seq_for_search=$scaffold_seq;
2159 0           for my $one_base(@generate_bases){
2160 0           $string=$breast_bases.$one_base.$back_bases;
2161 0           my $loc_in_array;
2162 0           $loc_in_array=index($seq_for_search,$string);
2163 0 0         unless($loc_in_array==-1){
2164 0           push @enzyme_locs, $loc_in_array;
2165             }
2166 0           while($loc_in_array!=-1){
2167 0           $loc_in_array=index($seq_for_search, $string, $loc_in_array+1);
2168 0 0         unless($loc_in_array==-1){
2169 0           push @enzyme_locs, $loc_in_array;
2170             }
2171             }
2172             }
2173             }
2174            
2175             elsif($string=~/^\[(\w+)\](\w+)\[(\w+)\]$/){
2176 0           my $g1_bases=$1;
2177 0           my $middle=$2;
2178 0           my $g2_bases=$3;
2179 0           my @g1_bases=split //, $g1_bases;
2180 0           my @g2_bases=split //, $g2_bases;
2181 0           for my $one_base(@g1_bases){
2182 0           for my $two_base(@g2_bases){
2183 0           $string=$one_base.$middle.$two_base;
2184 0           my $loc_in_array;
2185 0           $loc_in_array=index($seq_for_search,$string);
2186 0 0         unless($loc_in_array==-1){
2187 0           push @enzyme_locs, $loc_in_array;
2188             }
2189 0           while($loc_in_array!=-1){
2190 0           $loc_in_array=index($seq_for_search, $string, $loc_in_array+1);
2191 0 0         unless($loc_in_array==-1){
2192 0           push @enzyme_locs, $loc_in_array;
2193             }
2194             }
2195             }
2196             }
2197             }
2198            
2199             elsif($string=~/(\w+)\[(\w+)\](\w+)\[(\w+)\](\w+)/){
2200 0           my $breast_bases=$1;
2201 0           my $g1_bases=$2;
2202 0           my $middle=$3;
2203 0           my $back_bases=$5;
2204 0           my $g2_bases=$4;
2205 0           my @g1_bases=split //, $g1_bases;
2206 0           my @g2_bases=split //, $g2_bases;
2207 0           for my $one_base(@g1_bases){
2208 0           for my $two_base(@g2_bases){
2209 0           $string=$breast_bases.$one_base.$middle.$two_base.$back_bases;
2210 0           my $loc_in_array;
2211 0           $loc_in_array=index($seq_for_search,$string);
2212 0 0         unless($loc_in_array==-1){
2213 0           push @enzyme_locs, $loc_in_array;
2214             }
2215 0           while($loc_in_array!=-1){
2216 0           $loc_in_array=index($seq_for_search, $string, $loc_in_array+1);
2217 0 0         unless($loc_in_array==-1){
2218 0           push @enzyme_locs, $loc_in_array;
2219             }
2220             }
2221             }
2222             }
2223             }
2224             elsif($string=~/(\w+)\[(\w+)\]\[(\w+)\]\[(\w+)\](\w+)/){
2225 0           my $breast_bases=$1;
2226 0           my $g1_bases=$2;
2227 0           my $g2_bases=$3;
2228 0           my $back_bases=$5;
2229 0           my $g3_bases=$4;
2230 0           my @g1_bases=split //, $g1_bases;
2231 0           my @g2_bases=split //, $g2_bases;
2232 0           my @g3_bases=split //, $g3_bases;
2233 0           for my $one_base(@g1_bases){
2234 0           for my $two_base(@g2_bases){
2235 0           for my $three_base(@g3_bases){
2236 0           $string=$breast_bases.$one_base.$two_base.$three_base.$back_bases;
2237 0           my $loc_in_array;
2238 0           $loc_in_array=index($seq_for_search,$string);
2239 0 0         unless($loc_in_array==-1){
2240 0           push @enzyme_locs, $loc_in_array;
2241             }
2242 0           while($loc_in_array!=-1){
2243 0           $loc_in_array=index($seq_for_search, $string, $loc_in_array+1);
2244 0 0         unless($loc_in_array==-1){
2245 0           push @enzyme_locs, $loc_in_array;
2246             }
2247             }
2248             }
2249             }
2250             }
2251             }
2252             elsif($string=~/(\w+)\[(\w+)\]\[(\w+)\](\w+)/){
2253 0           my $breast_bases=$1;
2254 0           my $g1_bases=$2;
2255 0           my $g2_bases=$3;
2256 0           my $back_bases=$4;
2257 0           my @g1_bases=split //, $g1_bases;
2258 0           my @g2_bases=split //, $g2_bases;
2259 0           for my $one_base(@g1_bases){
2260 0           for my $two_base(@g2_bases){
2261 0           $string=$breast_bases.$one_base.$two_base.$back_bases;
2262 0           my $loc_in_array;
2263 0           $loc_in_array=index($seq_for_search,$string);
2264 0 0         unless($loc_in_array==-1){
2265 0           push @enzyme_locs, $loc_in_array;
2266             }
2267 0           while($loc_in_array!=-1){
2268 0           $loc_in_array=index($seq_for_search, $string, $loc_in_array+1);
2269 0 0         unless($loc_in_array==-1){
2270 0           push @enzyme_locs, $loc_in_array;
2271             }
2272             }
2273             }
2274             }
2275             }
2276            
2277             # Sort the array.
2278 0           @enzyme_locs=sort {$a<=>$b} @enzyme_locs;
  0            
2279            
2280            
2281             # Produce the fragments' loc-pair.
2282 0           my @fragments_loc_pair;
2283 0           my ($front_rec_loc, $behind_rec_loc)=qw(0 0);
2284 0 0         if(@enzyme_locs) {
2285 0           while(@enzyme_locs){
2286 0           $behind_rec_loc=shift @enzyme_locs;
2287             # Push the front end loc;
2288 0 0         if($front_rec_loc==0){
2289 0           push @fragments_loc_pair, $front_rec_loc;
2290             }
2291             else{
2292 0           push @fragments_loc_pair, $front_rec_loc+$enzyme_cut_loc-1;
2293             }
2294             # Push the behind end loc;
2295 0           push @fragments_loc_pair, $behind_rec_loc+$enzyme_cut_loc-2;
2296            
2297             # Check whether the @enzyme_locs is empty.
2298 0 0         if(@enzyme_locs){
2299 0           $front_rec_loc=$behind_rec_loc;
2300             }
2301             else{
2302 0           push @fragments_loc_pair, $behind_rec_loc+$enzyme_cut_loc-1;
2303 0           push @fragments_loc_pair, $length_of_scafd-1;
2304             }
2305             }
2306             }
2307            
2308             # Output the fragments' locs of this scaffold.
2309            
2310             # Define the scalars which contains the length of fragments;
2311 0           my $nums_of_fragments=@fragments_loc_pair/2;
2312 0           my $length_of_fragments=0;
2313 0           my $length_of_fragments_in_range=0;
2314 0           my $count_all=0;
2315 0           my $count_in_range=0;
2316 0           my ($before_cut_coorr,$after_cut_coorr,$bf_ct_crr_hm_rd,$af_ct_crr_hm_rd);
2317            
2318            
2319             # Attentions here!!! the output sequences of fragments are in accordance with the results of illumian sequencing.
2320             # For removing the sequencing primers, read1 and read2 can align with the fragments here directly!!!
2321             # That is to say, fragments here contain the restriction overhangs!!!
2322            
2323 0           while(@fragments_loc_pair){
2324 0           $count_all++;
2325 0           my $front_end_loc=shift @fragments_loc_pair;
2326 0           my $behind_end_loc=shift @fragments_loc_pair;
2327 0           my $length_of_fragment=$behind_end_loc-$front_end_loc+1;
2328 0           my $seq_selected=substr($scaffold_seq, $front_end_loc, $length_of_fragment);
2329            
2330             # Output the sequence and fragment positions of all-fragments to files;
2331 0           $all_seq_file_fh->print("$scaffold_name-$count_all\n$seq_selected\n");
2332 0           my $front_loc_on_scaffold=$front_end_loc+1;
2333 0           my $behind_loc_on_scaffold=$behind_end_loc+1;
2334            
2335 0           $all_loc_file_fh->print("$scaffold_name-$count_all\t$front_loc_on_scaffold\t$behind_loc_on_scaffold\t$length_of_fragment\n");
2336 0           push @lengths_of_fragments, $length_of_fragment;
2337            
2338             # We treat the fragments in range in the same way of all fragments as above;
2339 0 0         if($length_of_fragment>=$range1){
2340 0 0         if($length_of_fragment<=$range2){
2341 0           $count_in_range++;
2342 0           $seq_in_range_file_fh->print("$scaffold_name-$count_all\n$seq_selected\n");
2343 0           $loc_in_range_file_fh->print("$scaffold_name-$count_all\t$front_loc_on_scaffold\t$behind_loc_on_scaffold\t$length_of_fragment\n");
2344 0           push @lengths_fragments_in_range,$length_of_fragment;
2345 0           $length_of_fragments_in_range+=$length_of_fragment;
2346             }
2347             }
2348            
2349             # Count all fragments length no matter in range or not!
2350 0           $length_of_fragments+=$length_of_fragment;
2351             }
2352            
2353            
2354 0           my $reduced_rate=$length_of_fragments/$length_of_scafd;
2355 0           my $fragment_in_range_rate_scafd=$length_of_fragments_in_range/$length_of_scafd;
2356 0           $overall_fragments_length+=$length_of_fragments;
2357 0           $overall_fragments_in_range_length+=$length_of_fragments_in_range;
2358 0           $overall_length_of_scfd+=$length_of_scafd;
2359 0           $reduced_ratio_every_scaffold_file_fh->print("$reduced_rate\t$fragment_in_range_rate_scafd\n");
2360 0           print"Done!\n";
2361             }
2362             }
2363 0           my $length_ratio_all_frags=$overall_fragments_length/$overall_length_of_scfd;
2364 0           my $length_ratio_frags_in_range=$overall_fragments_in_range_length/$overall_length_of_scfd;
2365 0           my $num_all_frags=@lengths_of_fragments;
2366 0           my $num_frags_in_range=@lengths_fragments_in_range;
2367 0           my $lengths_fragments_in_range=\@lengths_fragments_in_range;
2368            
2369            
2370             # Output the summary of digestion.
2371 0           $summary_digestion_fh->print("Number of all fragments\t$num_all_frags\n");
2372 0           $summary_digestion_fh->print("Number of fragments in range\t$num_frags_in_range\n");
2373 0           $summary_digestion_fh->print("Ratio after reducing of all fragments\t$length_ratio_all_frags\n");
2374 0           $summary_digestion_fh->print("Ratio after reducing of fragments in range\t$length_ratio_frags_in_range\n");
2375            
2376 0           my $lengths_distribution_start=$self->{lengths_distribution_start};
2377 0           my $lengths_distribution_end=$self->{lengths_distribution_end};
2378 0           my $lengths_distribution_step=$self->{lengths_distribution_step};
2379 0           my $lengths_distribution_tmp;
2380 0           my (@starts,@ends,@counts,%lengths_distribution);
2381            
2382 0           my $num_pair=0;
2383            
2384 0           for($lengths_distribution_tmp= $lengths_distribution_start;$lengths_distribution_tmp<$lengths_distribution_end+$lengths_distribution_step;$lengths_distribution_tmp=$lengths_distribution_tmp+$lengths_distribution_step){
2385 0 0         if($lengths_distribution_tmp== $lengths_distribution_start){
    0          
2386 0           $num_pair++;
2387 0           push @starts, '0';
2388 0           push @ends , $lengths_distribution_tmp;
2389 0           $lengths_distribution{$num_pair}{"0bp-${lengths_distribution_tmp}bp"}=0;
2390            
2391 0           $num_pair++;
2392 0           my $tmp_start=$lengths_distribution_tmp+1;
2393 0           my $tmp_end=$lengths_distribution_tmp+$lengths_distribution_step;
2394 0           push @starts, $tmp_start;
2395 0           push @ends, $tmp_end;
2396 0           $lengths_distribution{$num_pair}{"${tmp_start}bp-${tmp_end}bp"}=0;
2397            
2398             }
2399             elsif($lengths_distribution_tmp < $lengths_distribution_end ){
2400 0 0         if($lengths_distribution_tmp+$lengths_distribution_step <$lengths_distribution_end ){
    0          
2401 0           $num_pair++;
2402 0           my $tmp_start=$lengths_distribution_tmp+1;
2403 0           my $tmp_end=$lengths_distribution_tmp+$lengths_distribution_step;
2404 0           push @starts, $tmp_start;
2405 0           push @ends, $tmp_end;
2406 0           $lengths_distribution{$num_pair}{"${tmp_start}bp-${tmp_end}bp"}=0;
2407            
2408             }
2409             elsif($lengths_distribution_tmp +$lengths_distribution_step >= $lengths_distribution_end){
2410 0           $num_pair++;
2411 0           my $tmp_start=$lengths_distribution_tmp+1;
2412 0           my $tmp_end=$lengths_distribution_end;
2413 0           push @starts, $tmp_start;
2414 0           push @ends, $tmp_end;
2415 0           $lengths_distribution{$num_pair}{"${tmp_start}bp-${tmp_end}bp"}=0;
2416            
2417 0           $num_pair++;
2418 0           $tmp_start=$tmp_end+1;
2419            
2420 0           push @starts, $tmp_start;
2421 0           push @ends, 'bigger';
2422 0           $lengths_distribution{$num_pair}{"${tmp_start}bp-bigger bp"}=0;
2423            
2424             }
2425             }
2426             }
2427            
2428            
2429            
2430 0           for (@lengths_of_fragments){
2431 0           my $length=$_;
2432 0           my $num_pair=1;
2433 0           my $num_last_pair=@starts;
2434 0           for($num_pair=1;$num_pair<=$num_last_pair;$num_pair++){
2435            
2436 0           my $array_index=$num_pair-1;
2437 0           my $start=$starts[$array_index];
2438 0           my $end=$ends[$array_index];
2439 0 0         if($num_pair<$num_last_pair){
    0          
2440 0 0 0       if($length>=$start && $length<=$end){
2441 0           for my $description (keys %{$lengths_distribution{$num_pair}}){
  0            
2442 0           $lengths_distribution{$num_pair}{$description}++;
2443 0           last;
2444             }
2445 0           last;
2446             }
2447             }
2448             elsif($num_pair==$num_last_pair){
2449 0 0         if($length>=$start){
2450 0           for my $description (keys %{$lengths_distribution{$num_pair}}){
  0            
2451 0           $lengths_distribution{$num_pair}{$description}++;
2452 0           last;
2453             }
2454 0           last;
2455             }
2456             }
2457             }
2458             }
2459            
2460 0           $summary_digestion_fh->print("\nLengths\' scope\tNumber of fragments in this scope\tRatio of these fragments in all\n");
2461            
2462 0           for(1..@starts){
2463 0           my $num_pair=$_;
2464 0           for my $description(keys %{$lengths_distribution{$num_pair}}){
  0            
2465 0           my $num_fragments=$lengths_distribution{$num_pair}{$description};
2466 0           my $ratio;
2467 0 0         if($num_all_frags>0){
2468 0           $ratio=$num_fragments/$num_all_frags;
2469             }
2470             else{
2471 0           $ratio='N/A';
2472             }
2473 0           $summary_digestion_fh->print("$description\t$num_fragments\t$ratio\n");
2474             }
2475             }
2476 0           $summary_digestion_fh->print("\n");
2477             }
2478            
2479             =head2 add_SNPs
2480            
2481             $digest->add_SNPs(-SNPs=>'Full path of the SNPs file');
2482             Adds the absolute path of the SNPs file to RestrictionDigest variable.
2483             =cut
2484            
2485             sub add_SNPs {
2486 0     0     my $self=shift;
2487 0           my %parameters=@_;
2488 0           for (keys %parameters){
2489 0 0         if($_=~/^-SNPs$/){
2490 0           $self->{SNPs}=$parameters{$_};
2491             }
2492             else{
2493 0           die "Unacceptable parameters of the method , please perldoc RestrictionDigest for example or read README for more help\n";
2494             }
2495             }
2496             }
2497            
2498             =head2 count_SNPs_at_fragments
2499            
2500             $digest->count_SNPs_at_fragments(-sequence_type=>'100PE');
2501             Count the expected SNPs appeared in the framgents.
2502             =cut
2503            
2504             sub count_SNPs_at_fragments {
2505 0     0     my $self=shift;
2506 0           my %parameters=@_;
2507            
2508 0 0         if(keys %parameters){
2509 0           for(keys %parameters){
2510 0 0         if($_=~/^-sequence_type$/){
2511 0           $self->{sequence_type}=$parameters{$_};
2512             }
2513             else{
2514 0           die "Unacceptable parameters in the method , please perldoc RestrictionDigest for example or read README for more help\n";
2515             }
2516             }
2517             }
2518            
2519 0           my $ref=$self->{ref};
2520 0           my $enzyme=$self->{enzyme};
2521 0           my $range1=$self->{range_start};
2522 0           my $SNPs=$self->{SNPs};
2523 0 0         if($SNPs){
2524 0           my $SNPs_basename=basename($SNPs);
2525 0           print "The SNPs file provided is\t", $SNPs_basename, ".\n";
2526             }
2527             else{
2528 0           die"No SNPs file provided! Please perldoc RestrictionDigest for example.\n";
2529             }
2530 0           my $sequence_type=$self->{sequence_type};
2531            
2532             # Check the sequencing type and determine the domains of the fragments to calculate the SNPs number.
2533 0           my ($pe_sequence_length, $se_sequence_length);
2534 0           my $sequence_type_code;
2535 0 0         if($sequence_type=~/^(\d+)pe$/i){
    0          
2536 0           $sequence_type_code=2;
2537 0           my $sequence_length=$1;
2538 0 0         unless($sequence_length*2 <= $range1){
2539 0           die"Unacceptable sequence_type when range's front border is $range1 bp. Please perldoc RestrictionDigest for example or read README for more help!\n";
2540             }
2541             else{
2542 0           $pe_sequence_length=$sequence_length;
2543             }
2544             }
2545             elsif($sequence_type=~/^(\d+)se$/i){
2546 0           $sequence_type_code=1;
2547 0           my $sequence_length=$1;
2548 0 0         unless($sequence_length<=$range1){
2549 0           die "Unacceptable sequence_type when range's front border is $range1 bp. Please perldoc RestrictionDigest for example or read README for more help!\n";
2550             }
2551             else{
2552 0           $se_sequence_length=$sequence_length;
2553             }
2554             }
2555             else{
2556 0           die "Unacceptable sequence_type. Please perldoc RestrictionDigest for example or read README for more help!\n";
2557             }
2558            
2559 0           print "The sequencing type provided is\t", $sequence_type, ".\n";
2560            
2561 0           my $output_dir=$self->{output_dir};
2562 0           $output_dir=~s/^(.+)\/?$/$1/;
2563            
2564             # Get the basename of reference,not include the path.
2565 0           my $name_reference=basename($ref);
2566            
2567             # Make the file handle to the summary file;
2568 0           my $summary_digestion_fh=IO::File->new(">>$output_dir/digestion_summary_${name_reference}_by_${enzyme}");
2569            
2570             # Make the file handle to the locs file of all fragments and fragments in range.
2571 0           my $all_loc_file_fh=IO::File->new("$output_dir/position_frags_${name_reference}_by_${enzyme}",'r');
2572 0           my $loc_in_range_file_fh=IO::File->new("$output_dir/position_frags_in_range_${name_reference}_by_${enzyme}",'r');
2573            
2574             # Make the file handle to the SNPs file;
2575 0           my $SNPs_fh=IO::File->new("$SNPs",'r');
2576 0           my %SNPs;
2577 0           while(<$SNPs_fh>){
2578 0           chomp;
2579 0           my ($SNPs_scaffold_name,$SNPs_pos,$type)=split /\s+/, $_;
2580 0           push @{$SNPs{$SNPs_scaffold_name}}, $SNPs_pos;
  0            
2581             }
2582            
2583             # Create the SNPs file to hold SNPs of all fragments and fragments in range when sequencing through pair-end.
2584 0           my($SNPs_at_all_frags_fh,$SNPs_at_frags_in_range_fh);
2585 0 0         if($sequence_type_code==2){
2586 0           $SNPs_at_all_frags_fh=IO::File->new(">>$output_dir/SNPs_at_all_frags_${name_reference}_by_${enzyme}");
2587 0           $SNPs_at_frags_in_range_fh=IO::File->new(">>$output_dir/SNPs_at_frags_in_range_${name_reference}_by_${enzyme}");
2588             }
2589            
2590 0           print "Counting SNPs at the output fragments...\t";
2591             # Scan all locs file to count SNPs and output the result to the summary file.
2592 0           my $all_SNPs_number=0;
2593 0 0         if($sequence_type_code == 1){
    0          
2594 0           while(<$all_loc_file_fh>){
2595 0           chomp;
2596 0           my($fragment_name,$pos1,$pos2,$length)=split /\t/, $_;
2597 0           my $scaffold_name;
2598 0 0         if($fragment_name=~/^>(.+)-(\d+)/){
2599 0           $scaffold_name=$1;
2600             }
2601 0 0         if($SNPs{$scaffold_name}){
2602 0           for my $SNP (@{$SNPs{$scaffold_name}}){
  0            
2603 0 0 0       if( ($SNP>=$pos1 && $SNP<=($pos1+$se_sequence_length-1)) || ( $SNP>= ($pos2-$se_sequence_length+1) && $SNP<=$pos2 ) ){
      0        
      0        
2604 0           $all_SNPs_number++;
2605             }
2606             }
2607             }
2608             }
2609 0           $all_SNPs_number=int($all_SNPs_number/2);
2610 0           $summary_digestion_fh->print("Expected SNPs at all fragments via $sequence_type:\t$all_SNPs_number\n");
2611             }
2612             elsif($sequence_type_code ==2){
2613 0           while(<$all_loc_file_fh>){
2614 0           chomp;
2615 0           my($fragment_name,$pos1,$pos2,$length)=split /\t/, $_;
2616 0           my $scaffold_name;
2617 0 0         if($fragment_name=~/^>(.+)-(\d+)/){
2618 0           $scaffold_name=$1;
2619             }
2620 0 0         if($SNPs{$scaffold_name}){
2621 0           for my $SNP (@{$SNPs{$scaffold_name}}){
  0            
2622 0 0 0       if( ($SNP>=$pos1 && $SNP<=($pos1+$pe_sequence_length-1)) || ( $SNP>= ($pos2-$pe_sequence_length+1) && $SNP<=$pos2 ) ){
      0        
      0        
2623 0           $all_SNPs_number++;
2624 0           $SNPs_at_all_frags_fh->print("$scaffold_name\t$SNP\n");
2625             }
2626             }
2627             }
2628             }
2629 0           $summary_digestion_fh->print("Expected SNPs at all fragments via $sequence_type:\t$all_SNPs_number\n");
2630             }
2631             # Scan locs in range file to count SNPs and output the result to the summary file.
2632            
2633 0           my $SNPs_in_range_number=0;
2634            
2635 0 0         if($sequence_type_code == 1){
    0          
2636 0           while(<$loc_in_range_file_fh>){
2637 0           chomp;
2638 0           my($fragment_name,$pos1,$pos2,$length)=split /\t/, $_;
2639 0           my $scaffold_name;
2640 0 0         if($fragment_name=~/^>(.+)-(\d+)/){
2641 0           $scaffold_name=$1;
2642             }
2643 0 0         if($SNPs{$scaffold_name}){
2644 0           for my $SNP (@{$SNPs{$scaffold_name}}){
  0            
2645 0 0 0       if( ($SNP>=$pos1 && $SNP<=($pos1+$se_sequence_length-1)) || ( $SNP>= ($pos2-$se_sequence_length+1) && $SNP<=$pos2 ) ){
      0        
      0        
2646 0           $SNPs_in_range_number++;
2647             }
2648             }
2649             }
2650             }
2651 0           $SNPs_in_range_number=int($SNPs_in_range_number/2);
2652 0           $summary_digestion_fh->print("Expected SNPs at fragments in range via $sequence_type:\t$SNPs_in_range_number\n");
2653             }
2654             elsif($sequence_type_code ==2){
2655 0           while(<$loc_in_range_file_fh>){
2656 0           chomp;
2657 0           my($fragment_name,$pos1,$pos2,$length)=split /\t/, $_;
2658 0           my $scaffold_name;
2659 0 0         if($fragment_name=~/^>(.+)-(\d+)/){
2660 0           $scaffold_name=$1;
2661             }
2662 0 0         if($SNPs{$scaffold_name}){
2663 0           for my $SNP (@{$SNPs{$scaffold_name}}){
  0            
2664 0 0 0       if( ($SNP>=$pos1 && $SNP<=($pos1+$pe_sequence_length-1)) || ( $SNP>= ($pos2-$pe_sequence_length+1) && $SNP<=$pos2 ) ){
      0        
      0        
2665 0           $SNPs_in_range_number++;
2666 0           $SNPs_at_frags_in_range_fh->print("$scaffold_name\t$SNP\n");
2667             }
2668             }
2669             }
2670             }
2671 0           $summary_digestion_fh->print("Expected SNPs at fragments in range via $sequence_type:\t$SNPs_in_range_number\n");
2672             }
2673 0           print "Done!\n";
2674             }
2675            
2676             =head2 add_gff
2677            
2678             $digest->add_gff(-ref=>'Full path of the gff file');
2679             Adds the absolute path of gff file to RestrictionDigest variable.
2680             If User want to calculate the coverage ratio of different genomic strutures by the digested
2681             fragments, this method is obligatory.
2682            
2683             =cut
2684            
2685             sub add_gff {
2686 0     0     my $self=shift;
2687 0           my %parameters=@_;
2688 0           for (keys %parameters){
2689 0 0         if($_=~/^-gff$/){
2690 0           $self->{gff}=$parameters{$_};
2691             }
2692             else{
2693 0           die"Unacceptable parameters of the method , please perldoc RestrictionDigest for example or read README for more help\n";
2694             }
2695             }
2696             }
2697            
2698             =head2 all_frags_coverage_ratio
2699            
2700             $digest->all_frags_coverage_ratio();
2701             Calculate the coverage ratio of different parts of the genome reference,
2702             including mRNA, CDS, UTR and intergenic region, by all digested fragments.
2703             This process will take a very long time, please be patient.
2704             =cut
2705            
2706             sub all_frags_coverage_ratio {
2707 0     0     my $self=shift;
2708            
2709 0           my $ref=$self->{ref};
2710 0           my $basename_ref=basename($ref);
2711            
2712 0           my $enzyme=$self->{enzyme};
2713            
2714 0           my $output_dir=$self->{output_dir};
2715 0           $output_dir=~s/^(.+)\/?$/$1/;
2716            
2717 0           my $loc_file_all_frags="$output_dir/position_frags_${basename_ref}_by_${enzyme}";
2718            
2719 0           $self->genome_structure_coverage_ratio($loc_file_all_frags);
2720             }
2721            
2722             =head2 frags_in_range_coverage_ratio
2723            
2724             $digest->frags_in_range_coverage_ratio();
2725             Calculate the coverage ratio of different parts of the genome reference,
2726             including mRNA, CDS, UTR and intergenic region, by the fragments in range.
2727             This process will take a very long time, please be patient.
2728             =cut
2729            
2730             sub frags_in_range_coverage_ratio {
2731 0     0     my $self=shift;
2732            
2733 0           my $ref=$self->{ref};
2734 0           my $basename_ref=basename($ref);
2735            
2736 0           my $enzyme=$self->{enzyme};
2737            
2738 0           my $output_dir=$self->{output_dir};
2739 0           $output_dir=~s/^(.+)\/?$/$1/;
2740            
2741 0           my $loc_file_all_frags="$output_dir/position_frags_in_range_${basename_ref}_by_${enzyme}";
2742            
2743 0           $self->genome_structure_coverage_ratio($loc_file_all_frags);
2744             }
2745            
2746             =head2 genome_structure_coverage_ratio
2747            
2748             A subroutine invoked by the 'all_frags_coverage_ratio' and
2749             the 'frags_in_range_coverage_ratio' methods.
2750             User do not use this subroutine.
2751            
2752             =cut
2753            
2754            
2755             sub genome_structure_coverage_ratio {
2756 0     0     my $self=shift;
2757 0           my $loc_frags_file=shift;
2758            
2759             # Get the reference name, enzyme names from hash.
2760 0           my $ref=$self->{ref};
2761 0           my $ref_fh=IO::File->new($ref,'r');
2762            
2763 0           my $basename_ref=basename($ref);
2764 0           my $enzyme=$self->{enzyme};
2765 0           my $output_dir=$self->{output_dir};
2766 0           $output_dir=~s/^(.+)\/?$/$1/;
2767            
2768             # Get the full path and name of gff file and make a filehandle to it.
2769 0           my $gff=$self->{gff};
2770            
2771 0 0         if($gff){
2772 0           my $gff_basename=basename($gff);
2773 0           print "The GFF file provided is\t", $gff_basename, ".\n";
2774             }
2775             else{
2776 0           die"No GFF file provided! Please perldoc RestrictionDigest for example.\n";
2777             }
2778            
2779            
2780             # Make output filehandles to generic region and intergenic region coverage ratio files.
2781 0           my $coverage_ratio_fh;
2782            
2783 0           my $basename_loc_frags_file=basename($loc_frags_file);
2784            
2785 0 0         if($basename_loc_frags_file =~/in_range/){
2786 0           $coverage_ratio_fh=IO::File->new(">>$output_dir/genome_coverage_ratio_in_range_${basename_ref}_by_${enzyme}");
2787             }
2788             else{
2789 0           $coverage_ratio_fh=IO::File->new(">>$output_dir/genome_coverage_ratio_${basename_ref}_by_${enzyme}");
2790             }
2791            
2792 0           $coverage_ratio_fh->print("ScaffoldName\tIntergenicLength\tIntergenicMapLength\tIntergenicMapRatio\tGenesLength\tGenesMapLength\tGenesMapRatio\tExonsLength\tExonsMapLength\tExonsMapRatio\tIntronsLength\tIntronsMapLength\tIntronsMapRatio\n");
2793            
2794            
2795             # Set variables used in this program.
2796 0           my($all_intergenic_length,$all_intergenic_map_length,$all_gene_length,$all_gene_map_length,$all_exon_length,$all_exon_map_length,$all_intron_length,$all_intron_map_length)=qw(0 0 0 0 0 0 0 0);
2797            
2798             # Fetch the lengths of different parts of this scaffold from the GFF file.
2799            
2800 0           my $gff_fh=IO::File->new($gff,'r');
2801 0           my %gff;
2802 0           print "Scanning the GFF file...\t";
2803 0           while(<$gff_fh>){
2804 0           chomp;
2805 0 0         unless($_=~/^#/){
2806 0           my @gff=split /\t/, $_;
2807 0 0         if(@gff == 9){
2808 0           my ($gff_scfd_name,$gff_type,$gff_start,$gff_stop)=@gff[0,2,3,4];
2809 0 0         if($gff_type=~/CDS|exon/){
    0          
2810 0           $gff_type='exon';
2811 0 0         if($gff{$gff_scfd_name}{$gff_type}){
2812 0 0         unless(grep {$gff_start == $_} @{$gff{$gff_scfd_name}{$gff_type}}){
  0            
  0            
2813 0           push @{$gff{$gff_scfd_name}{$gff_type}}, ($gff_start,$gff_stop);
  0            
2814             }
2815             }
2816             else{
2817 0           push @{$gff{$gff_scfd_name}{$gff_type}},($gff_start,$gff_stop);
  0            
2818             }
2819             }
2820             elsif($gff_type=~/mRNA|gene/){
2821 0           $gff_type='gene';
2822 0 0         if($gff{$gff_scfd_name}{$gff_type}){
2823 0 0         unless(grep {$gff_start == $_} @{$gff{$gff_scfd_name}{$gff_type}}){
  0            
  0            
2824 0           push @{$gff{$gff_scfd_name}{$gff_type}},($gff_start,$gff_stop);
  0            
2825             }
2826             }
2827             else{
2828 0           push @{$gff{$gff_scfd_name}{$gff_type}},($gff_start,$gff_stop);
  0            
2829             }
2830             }
2831             }
2832             }
2833             }
2834 0           $gff_fh->close;
2835 0           print"Done!\n";
2836            
2837             # Scan the loc file and fetch the information.
2838            
2839 0           my $loc_frags_file_fh=IO::File->new($loc_frags_file,'r');
2840 0           my %all_locs;
2841 0           print "Scanning the positions file...\t";
2842 0           while(<$loc_frags_file_fh>){
2843 0           chomp;
2844 0           my($frgt_name,$start_coor,$stop_coor,$length)=split /\t/,$_;
2845 0           my $scfd_frgt;
2846 0 0         if($frgt_name=~/^>(.+)-\d+$/){
2847 0           $scfd_frgt=$1;
2848 0           push @{$all_locs{$scfd_frgt}}, ($start_coor, $stop_coor);
  0            
2849             }
2850             }
2851 0           $loc_frags_file_fh->close;
2852 0           print"Done!\n";
2853            
2854            
2855            
2856             # Handle every scaffold one by one.
2857 0           my($scfd_name);
2858 0           while(<$ref_fh>){
2859 0           chomp;
2860 0 0         if($_=~/^>(\S+)$/){
2861 0           $scfd_name=$1;
2862             }
2863             else{
2864 0           my $scaffold_length=length $_;
2865 0           print "CoverageRatio: Processing $scfd_name...\t";
2866            
2867            
2868             # Calculate the lengths of different parts of this scaffold.
2869 0           my ($exon_length,$intron_length,$gene_length,$intergenic_length)=qw(0 0 0 0);
2870            
2871 0 0         if($gff{$scfd_name}{gene}){
2872 0           my @gene_tmp=@{$gff{$scfd_name}{gene}};
  0            
2873 0           while(@gene_tmp){
2874 0           my $front=shift @gene_tmp;
2875 0           my $behind=shift @gene_tmp;
2876 0           $gene_length+=$behind-$front+1;
2877             }
2878 0           $intergenic_length=$scaffold_length-$gene_length;
2879             }
2880             else{
2881 0           $intergenic_length=$scaffold_length;
2882             }
2883            
2884 0 0         if($gff{$scfd_name}{exon}){
2885 0           my @exon_tmp=@{$gff{$scfd_name}{exon}};
  0            
2886 0           while(@exon_tmp){
2887 0           my $front=shift @exon_tmp;
2888 0           my $behind=shift @exon_tmp;
2889 0           $exon_length+=$behind-$front+1;
2890             }
2891 0           $intron_length=$gene_length-$exon_length;
2892             }
2893            
2894 0           $all_intergenic_length+=$intergenic_length;
2895 0           $all_gene_length+=$gene_length;
2896 0           $all_exon_length+=$exon_length;
2897 0           $all_intron_length+=$intron_length;
2898            
2899            
2900            
2901             # Scan the locs file and store information about this scaffold.
2902            
2903 0           my $gene_map_length=0;
2904 0           my $exon_map_length=0;
2905 0           my $intron_map_length=0;
2906 0           my $intergenic_map_length=0;
2907            
2908             # Extract fragments' locs for %all_locs;
2909 0 0         if($all_locs{$scfd_name}){
2910 0           my @tmp_locs=@{$all_locs{$scfd_name}};
  0            
2911 0           while(@tmp_locs){
2912 0           my $front=shift @tmp_locs;
2913 0           my $behind=shift @tmp_locs;
2914 0           for my $one($front..$behind){
2915 0           my $gene_map_count=0;
2916 0           my $exon_map_count=0;
2917 0           my $intron_map_count=0;
2918            
2919 0 0         if($gff{$scfd_name}{gene}){
2920 0           my @gene_tmp=@{$gff{$scfd_name}{gene}};
  0            
2921 0           while(@gene_tmp){
2922 0           my $front=shift @gene_tmp;
2923 0           my $behind=shift @gene_tmp;
2924 0 0 0       if($one>=$front && $one<=$behind){
2925 0           $gene_map_count++;
2926 0           $gene_map_length++;
2927 0           last;
2928             }
2929             }
2930             }
2931            
2932 0 0         if($gff{$scfd_name}{exon}){
2933 0           my @exon_tmp=@{$gff{$scfd_name}{exon}};
  0            
2934 0           while(@exon_tmp){
2935 0           my $front=shift @exon_tmp;
2936 0           my $behind=shift @exon_tmp;
2937 0 0 0       if($one>=$front && $one<=$behind){
2938 0           $exon_map_count++;
2939 0           $exon_map_length++;
2940 0           last;
2941             }
2942             }
2943             }
2944            
2945 0 0         if($gene_map_count==0){
    0          
2946 0           $intergenic_map_length++;
2947             }
2948             elsif($exon_map_count==0){
2949 0           $intron_map_length++;
2950             }
2951             }
2952             }
2953             }
2954            
2955             # Output part!!!
2956 0           my($intergenic_map_ratio,$gene_map_ratio,$exon_map_ratio,$intron_map_ratio)=qw(0 0 0 0);
2957 0 0         unless($intergenic_length==0){
  0            
2958 0           $intergenic_map_ratio=$intergenic_map_length/$intergenic_length;
2959             }
2960             else{$intergenic_map_ratio="N/A";
2961             }
2962            
2963 0 0         unless($gene_length==0){
  0            
2964 0           $gene_map_ratio=$gene_map_length/$gene_length;
2965             }
2966             else{$gene_map_ratio="N/A";
2967             }
2968            
2969 0 0         unless($exon_length==0){
  0            
2970 0           $exon_map_ratio=$exon_map_length/$exon_length;
2971             }
2972             else{$exon_map_ratio="N/A";
2973             }
2974            
2975 0 0         unless($intron_length==0){
  0            
2976 0           $intron_map_ratio=$intron_map_length/$intron_length;
2977             }
2978             else{$intron_map_ratio="N/A";
2979             }
2980 0           $all_intergenic_map_length+=$intergenic_map_length;
2981 0           $all_gene_map_length+=$gene_map_length;
2982 0           $all_exon_map_length+=$exon_map_length;
2983 0           $all_intron_map_length+=$intron_map_length;
2984            
2985            
2986 0           $coverage_ratio_fh->print("$scfd_name\t$intergenic_length\t$intergenic_map_length\t$intergenic_map_ratio\t$gene_length\t$gene_map_length\t$gene_map_ratio\t$exon_length\t$exon_map_length\t$exon_map_ratio\t$intron_length\t$intron_map_length\t$intron_map_ratio\n");
2987 0           print "Done!\n";
2988             }
2989             }
2990 0           my ($all_intergenic_map_ratio,$all_gene_map_ratio,$all_exon_map_ratio,$all_intron_map_ratio)=qw(0 0 0 0);
2991 0 0         if($all_gene_length==0){$all_gene_map_ratio='N/A';}
  0            
2992             else{
2993 0           $all_gene_map_ratio=$all_gene_map_length/$all_gene_length;}
2994 0 0         if($all_exon_length==0){$all_exon_map_ratio='N/A';}
  0            
2995             else{
2996 0           $all_exon_map_ratio=$all_exon_map_length/$all_exon_length;}
2997 0 0         if($all_intron_length==0){$all_intron_map_ratio='N/A';}
  0            
2998             else{
2999 0           $all_intron_map_ratio=$all_intron_map_length/$all_intron_length;}
3000 0 0         if($all_intergenic_length==0){$all_intergenic_map_ratio='N/A';}
  0            
3001             else{
3002 0           $all_intergenic_map_ratio=$all_intergenic_map_length/$all_intergenic_length;}
3003            
3004 0           $coverage_ratio_fh->print("Intergenic region map ratio is\t$all_intergenic_map_ratio\nGenes region map ratio is\t$all_gene_map_ratio\nExon region map ratio is\t$all_exon_map_ratio\nIntron region map ratio is\t$all_intron_map_ratio\n");
3005             }
3006            
3007            
3008            
3009             =head1 AUTHOR
3010            
3011             Jinpeng Wang, Li Li, Guofan Zhang, C<< >>
3012            
3013             =head1 BUGS
3014            
3015             Please report any bugs or feature requests to C, or through
3016             the web interface at L. I will be notified, and then you'll
3017             automatically be notified of progress on your bug as I make changes.
3018            
3019            
3020            
3021            
3022             =head1 SUPPORT
3023            
3024             You can find documentation for this module with the perldoc command.
3025            
3026             perldoc RestrictionDigest
3027            
3028            
3029             You can also look for information at:
3030            
3031             =over 4
3032            
3033             =item * RT: CPAN's request tracker (report bugs here)
3034            
3035             L
3036            
3037             =item * AnnoCPAN: Annotated CPAN documentation
3038            
3039             L
3040            
3041             =item * CPAN Ratings
3042            
3043             L
3044            
3045             =item * Search CPAN
3046            
3047             L
3048            
3049             =back
3050            
3051            
3052             =head1 ACKNOWLEDGEMENTS
3053            
3054            
3055             =head1 LICENSE AND COPYRIGHT
3056            
3057             Copyright 2014 Jinpeng Wang, Li Li, Guofan Zhang.
3058            
3059             This program is free software; you can redistribute it and/or modify it
3060             under the terms of the the Artistic License (2.0). You may obtain a
3061             copy of the full license at:
3062            
3063             L
3064            
3065             Any use, modification, and distribution of the Standard or Modified
3066             Versions is governed by this Artistic License. By using, modifying or
3067             distributing the Package, you accept this license. Do not use, modify,
3068             or distribute the Package, if you do not accept this license.
3069            
3070             If your Modified Version has been derived from a Modified Version made
3071             by someone other than you, you are nevertheless required to ensure that
3072             your Modified Version complies with the requirements of this license.
3073            
3074             This license does not grant you the right to use any trademark, service
3075             mark, tradename, or logo of the Copyright Holder.
3076            
3077             This license includes the non-exclusive, worldwide, free-of-charge
3078             patent license to make, have made, use, offer to sell, sell, import and
3079             otherwise transfer the Package with respect to any patent claims
3080             licensable by the Copyright Holder that are necessarily infringed by the
3081             Package. If you institute patent litigation (including a cross-claim or
3082             counterclaim) against any party alleging that the Package constitutes
3083             direct or contributory patent infringement, then this Artistic License
3084             to you shall terminate on the date that such litigation is filed.
3085            
3086             Disclaimer of Warranty: THE PACKAGE IS PROVIDED BY THE COPYRIGHT HOLDER
3087             AND CONTRIBUTORS "AS IS' AND WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES.
3088             THE IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
3089             PURPOSE, OR NON-INFRINGEMENT ARE DISCLAIMED TO THE EXTENT PERMITTED BY
3090             YOUR LOCAL LAW. UNLESS REQUIRED BY LAW, NO COPYRIGHT HOLDER OR
3091             CONTRIBUTOR WILL BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, OR
3092             CONSEQUENTIAL DAMAGES ARISING IN ANY WAY OUT OF THE USE OF THE PACKAGE,
3093             EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
3094            
3095            
3096             =cut
3097            
3098             1; # End of RestrictionDigest