File Coverage

blib/lib/RestrictionDigest.pm

Criterion	Covered	Total	%
statement	45	1604	2.8
branch	0	620	0.0
condition	0	96	0.0
subroutine	15	43	34.8
pod			n/a
total	60	2363	2.5

line	stmt	bran	cond	sub	time	code
1
2						package RestrictionDigest;
3
4	1			1	16068	use 5.8.0;
	1				3
	1				49
5	1			1	5	use strict;
	1				1
	1				37
6	1			1	3	use warnings FATAL => 'all';
	1				5
	1				74
7
8
9
10						=head1 NAME
11
12						RestrictionDigest - A simulation tool for reducing the genome with one DNA endonuclease or a pair DNA endonucleases!
13
14						=head1 VERSION
15
16						Version 0.01
17
18						=cut
19
20						our $VERSION = '0.01';
21
22						=head1 DESCRIPTION
23
24
25						Next Generation Sequencing (NGS) is developing quickly and many cost-saving approaches are developed based on NGS. As sequencing
26						the whole genome is expensive and sometimes unnecessary, sequencing the representative part of the whole genome becomes practicable
27						and attractive. The most famous two approaches are Genotyping By Sequencing (GBS) and Restrict site Associated DNA sequencing (RAD).
28						These two approaches can reduce the ratio of sequenced part of the whole genome which will save a lot of money. Both of them
29						utilize tpye II DNA endonuclease to digest the whole genome. In order to select the correct enzyme or enzyme-pair, we developed
30						this perl module: RestrictionDigest.
31
32						RestrictionDigest is used for the simulation of single/double-enzyme GBS/RAD approach. The most important thing of representative sequencing
33						is choosing the right enzymes to digest the whole genome. The appropriate enzymes should satisfy several criteria including: the
34						recognition sites of these two enzymes are evenly dispersed on each single chromsome; the recognition sites of these two enzymes
35						are not or rarely located at the repetitive region of the genome; the fragments produced by two-enzyme digestion are neither too long
36						nor too short, they should be in the suitable range which is most effective for PCR amplification; the complexity reducing rate
37						shoule be in the suitable range(1%-10%). RestrictionDigest can produce several result files including the fragments seqence and their
38						locations on the reference. The ratio of these fragments on the reference seqences are also produced. If User can provide the gff
39						annotation file, RestrictionDigest can also provide the coverage ratio of these fragments on gene region, even CDS, INTRON, UTR region,
40						and intergenic region.
41
42						=cut
43
44
45
46
47
48						package RestrictionDigest::Double;
49
50	1			1	7	use 5.8.0;
	1				3
	1				27
51	1			1	4	use strict;
	1				1
	1				25
52	1			1	4	use warnings FATAL => 'all';
	1				2
	1				66
53						=head1 NAME
54
55						RestrictionDigest::Double - for double-enzyme simulation.
56
57						=head1 VERSION
58
59						Version 0.01
60
61						=cut
62
63						our $VERSION = '0.01';
64
65
66
67						=head1 SYNOPSIS
68
69						RestrictionDigest::Double is used for the simulation of double-enzyme GBS/RAD approach.
70
71
72						use RestrictionDigest;
73
74						$digest = RestrictionDigest::Double->new();
75
76						$digest->add_ref(-reference=>'Full path of the reference file');
77
78						$digest->new_enzyme(-enzyme_name=>'Ncol',-recognition_site=>'C\|CATGG');
79
80						$digest->add_enzyme_pair(-front_enzyme=>'EcoRI',-behind_enzyme=>'HinfI');
81
82						$digest->add_output_dir(-output_dir=>'Full path of the output directory');
83
84						$digest->double_digest();
85
86
87
88
89						=cut
90
91	1			1	484	use IO::File;
	1				7089
	1				109
92	1			1	9	use File::Basename;
	1				1
	1				55
93
94	1			1	4	use vars qw(%fields %enzyme_list);
	1				1
	1				9319
95
96						# Define the fields to be used as identifiers.
97						%fields = (
98						ref => undef,
99						gff => undef,
100						SNPs => undef,
101						enzyme1 => undef,
102						enzyme2 => undef,
103						output_dir=> undef,
104						range_start => 201,
105						range_end => 500,
106						lengths_distribution_start =>100,
107						lengths_distribution_end => 1000,
108						lengths_distribution_step => 50,
109						sequence_type => '100PE',
110						sequence_end => 'front_enzyme',
111						);
112
113						# Make the endonuclease enzyme-container--a hash.
114						%enzyme_list=(
115						EcoRI => 'G\|AATTC',
116						HinfI => 'G\|ANTC',
117						AluI => 'AG\|CT',
118						AvaII => 'G\|GWCC',
119						BamHI => 'G\|GATCC',
120						HindIII => 'A\|AGCTT',
121						MseI => 'T\|TAA',
122						MspI => 'C\|CGG',
123						NdeI => 'CA\|TATG',
124						PstI => 'CTGCA\|G',
125						SalI => 'G\|TCGAC',
126						XbaI => 'T\|CTAGA',
127						XhoI => 'C\|TCGAG',
128						NlaIII => 'NCATG\|N',
129						AatII => 'GACGT\|C',
130						AccI => 'GT\|MKAC',
131						Acc65I => 'G\|GTACC',
132						AciI => 'C\|CGC',
133						AclI => 'AA\|CGTT',
134						AfeI => 'AGC\|GCT',
135						AflII => 'C\|TTAAGA',
136						AflIII => 'A\|CRYGT',
137						AgeI => 'A\|CCGGT',
138						AlwNI => 'CAGNNN\|CTG',
139						ApaI => 'GGGCC\|C',
140						ApaLI => 'G\|TGCAC',
141						ApeKI => 'G\|CWGC',
142						ApoI => 'R\|AATTY',
143						AscI => 'GG\|CGCGCC',
144						AseI => 'AT\|TAAT',
145						AvaI => 'C\|YCGRG',
146						BclI => 'T\|GATCA',
147						AvrII => 'C\|CTAGG',
148						BaeGI => 'GKGCM\|C',
149						BanI => 'G\|GYRCC',
150						BanII => 'GRGCY\|C',
151						BbvCI => 'CC\|TCAGC',
152						BsaAI => 'YAC\|GTR',
153						BsaJI => 'C\|CNNGG',
154						BsaWI => 'W\|CCGGW',
155
156
157
158						);
159
160						=head1 SUBROUTINES/METHODS
161
162						=head2 new
163
164						$digest=RestrictionDigest::Double->new();
165						Create a new object.
166
167						=cut
168
169						sub new {
170	0			0		my $class = shift;
171	0					my $self = {
172						%fields,
173						};
174	0					bless ($self, $class);
175	0					return $self;
176						}
177
178						=head2 add_ref
179
180						$digest->add_ref(-reference=>'Full path of the reference file');
181						Add the reference file to the RestrictionDigest variable.
182
183						=cut
184
185						sub add_ref {
186	0			0		my $self=shift;
187	0					my %parameters=@_;
188	0					for(keys %parameters){
189	0	0				if($_=~/^-reference$/){
190	0					$self->{ref}=$parameters{$_};
191						}
192						else{
193	0					die "Unacceptable parameters in the method , please perldoc RestrictionDigest for example or read README for more help!\n";
194						}
195						}
196
197						# Check the format of reference
198	0					my $ref_fh=IO::File->new("$self->{ref}",'r');
199	0					my $line_num=0;
200	0					while(<$ref_fh>){
201	0					chomp;
202	0					$line_num++;
203	0	0				if($line_num % 2 != 0){
204	0	0				unless($_=~/^>\S+$/){
205	0					die "The format of reference file is not allowed, please convert it.\nYou can find the example format in the README!\n";
206						}
207						}
208	0	0				if($line_num % 2 ==0 ){
209	0	0				unless($_=~/^[AGTCNRYMKSWHBVD]+$/i){
210	0					die"The sequences of reference contain some charactors other than AGTCNRYMKSWHBVD, please correct that!\n";
211						}
212						}
213						}
214	0					$ref_fh->close;
215						}
216
217						=head2 add_enzyme_pair
218
219						$digest->add_enzyme_pair(-front_enzyme=>'EcoRI', -behind_enzyme=>'HinfI');
220						Add two enzymes to the RestrictionDigest variable.
221						The names of the two enzymes are case-insensitive.
222
223						=cut
224
225
226
227						sub add_enzyme_pair {
228
229	0			0		my $self=shift;
230	0					my %parameters=@_;
231	0					for (keys %parameters){
232	0	0				if($_=~/^-front_enzyme$/){
		0
233	0					$self->{enzyme1}=$parameters{$_};
234						}
235						elsif($_=~/^-behind_enzyme$/){
236	0					$self->{enzyme2}=$parameters{$_};
237						}
238						else{
239	0					die"Unacceptable parameters in the method , please perldoc RestrictionDigest for example or read README for more help!\n";
240						}
241						}
242
243						# Check the existence of enzymes in the enzyme-container.
244	0					my ($enzyme1_exists, $enzyme2_exists)=qw(0 0);
245	0					foreach(keys %enzyme_list){
246	0	0				if($_=~/^$self->{enzyme1}$/i){
247	0					$enzyme1_exists++;
248						}
249	0	0				if($_=~/^$self->{enzyme2}$/i){
250	0					$enzyme2_exists++;
251						}
252						}
253	0	0				if($enzyme1_exists >0 ){
254
255						}else{
256	0					die "The front restrict endonuclease User provides does not exist.\nHowever User can add this enzyme and its recognition sites to enzyme-container via the method 'new_enzyme' .\n";
257						}
258	0	0				if($enzyme2_exists >0 ){
259
260						}else{
261	0					die "The behind restrict endonuclease User provides does not exist.\nHowever User can add this enzyme and its recognition sites to enzyme-container via the method 'new_enzyme' .\n";
262						}
263						}
264
265						=head2 new_enzyme
266
267						$digest->new_enzyme(-enzyme_name=>'EcoRI', -recognition_site=>'G\|AATTC');
268
269						If the enzyme User wants to use does not exists in the default enzyme-pool, the enzyme can be added to the enzyme-pool.
270
271						For now, the enzymes in the default enzyme pool are as follows:
272						EcoRI,HinfI,AluI,AvaII,BamHI,HindIII,MseI,MspI,NdeI,PstI,SalI,XbaI,XhoI,NlaIII.
273
274						Adding new enzyme(s) to the enzyme-pool is easy but the enzyme added to the pool is temporary. The added enzyme(s) is active
275						only in the class context to which the 'new_method' belongs.
276
277						=cut
278
279						sub new_enzyme {
280	0			0		my $self=shift;
281	0					my %parameters=@_;
282	0					my $new_enzyme_name;
283						my $new_enzyme_site;
284	0					for (keys %parameters){
285	0	0				if($_=~/^-enzyme_name$/){
		0
286	0					$new_enzyme_name=$parameters{$_};
287						}
288						elsif($_=~/^-recognition_site$/){
289	0					$new_enzyme_site=$parameters{$_};
290	0	0				unless($new_enzyme_site=~/\|/){
291	0					die "The 'recognition_site' must contain a cut flag '\|'. Please perldoc RestrictionDigest for example or read README for moer help\n";
292						}
293						}
294						else{
295	0					die"Unacceptable parameters in the method , please perldoc RestrictionDigest for example or read README for more help\n";
296						}
297						}
298	0					$enzyme_list{$new_enzyme_name}=$new_enzyme_site;
299						}
300
301
302						=head2 change_range
303
304						$digest->change_range(-start=>201, -end=>500);
305						Usually, not all fragments are needed by the scientists. The fragments in particular range are more attractive to them.
306						RestrictionDigest provides the sequences and the positions on the reference of these fragments in range. The default range is
307						[201bp, 500]. User can change this range via the method .
308
309
310						=cut
311
312						sub change_range {
313	0			0		my $self=shift;
314	0					my %parameters=@_;
315	0					for(keys %parameters){
316	0	0				if($_=~/^-start$/){
		0
317	0					$self->{range_start}=$parameters{$_};
318						}
319						elsif($_=~/^-end$/){
320	0					$self->{range_end}=$parameters{$_};
321						}
322						else{
323	0					die"Unacceptable parameters in the method ,please perldoc RestrictionDigest for example or read README for more help.\n";
324						}
325						}
326	0	0				if($self->{range_start} < $self->{range_end} ) {
327
328						}else{
329	0					die "The first parameter of the method must be smaller than the second parameter\n";
330						}
331						}
332
333						=head2 change_lengths_distribution_parameters
334
335						$digest->change_lengths_distribution_parameters(-front=>100,-behind=>1000,-step=>50);
336						This program will give a simple summary of the lengths of all fragments after digestion. By default, three scopes, which are
337						<=100bp, 101bp-1000bp and >1000bp, are given. The number of the fragments whose length fallen in these scopes are given. More
338						fine sorted, the 101bp-1000bp scope is split by the step of 50bp. These three parameters: the front edge and the behind edge
339						of the scope and the step can be changed via the method 'change_lengths_distribution_parameters';
340						=cut
341
342						sub change_lengths_distribution_parameters {
343	0			0		my $self=shift;
344	0					my %parameters=@_;
345	0					for (keys %parameters){
346	0	0				if($_=~/^-front$/){
		0
		0
347	0					$self->{lengths_distribution_start}=$parameters{$_};
348						}
349						elsif($_=~/^-behind$/){
350	0					$self->{lengths_distribution_end}=$parameters{$_};
351						}
352						elsif($_=~/^-step$/){
353	0					$self->{lengths_distribution_step}=$parameters{$_};
354						}
355						else{
356	0					die"Unacceptable parameters in the method , please perldoc RestrictionDigest for example or read README for more help!\n";
357						}
358						}
359						}
360
361
362						=head2 add_output_dir
363
364						$digest->add_output_dir(-output_dir=>'Full path of the output directory');
365						Adds the directory into which the result files are put by the program.
366						The directory must be in the absolute path format.
367
368						=cut
369
370
371						sub add_output_dir {
372	0			0		my $self=shift;
373	0					my %parameters=@_;
374	0					for(keys %parameters){
375	0	0				if($_=~/^-output_dir$/){
376	0					$self->{output_dir}=$parameters{$_};
377						}
378						else{
379	0					die "Unacceptable parameter in the method , please perldoc RestrictionDigest for example or read README for more help!\n";
380						}
381						}
382	0	0				if(-d $self->{output_dir} ) {
383						}else{
384	0					die "The output directory User provides does not exist.\n";
385						}
386						}
387
388						=head2 double_digest
389
390						$digest->double_digest();
391						Execute the digestion process.
392						This process will produce several result files which will be located in the output directory
393						added through the add_output_dir method.
394						This process will take some time. It depends on the size of the reference file.
395						During the process, Prompting messages will be output to the STDOUT
396						like "Digestion: >ScaffoldName is done!".
397
398						=cut
399
400						sub double_digest {
401
402	0			0		my $self=shift;
403	0					my $ref=$self->{ref};
404	0	0				if($ref){
405	0					my $name_reference=basename($ref);
406	0					print "The reference file User provides is:\t",$name_reference,"\n";
407						}else{
408	0					die "No reference file provied! Please perldoc RestrictionDigest for example.\n";
409						}
410	0					my $front_enzyme=$self->{enzyme1};
411	0	0				if($front_enzyme){
412	0					print "The front restrict endonuclease User provides is:\t", $front_enzyme, "\n";
413						}
414						else{
415	0					die"No front restrict endonuclease provided! Please perldoc RestrictionDigest for example.\n";
416						}
417	0					my $later_enzyme=$self->{enzyme2};
418	0	0				if($later_enzyme){
419	0					print "The behind restrict endonuclease User provides is:\t", $later_enzyme, "\n";
420						}
421						else{
422	0					die "No behind restrict endonuclease provided! Please perldoc RestrictionDigest for example.\n";
423						}
424	0					my $range1=$self->{range_start};
425	0					my $range2=$self->{range_end};
426	0					print "The seperate range of the fragments User wants to output is:\t$range1 to $range2 bp.\n";
427	0					my $output_dir=$self->{output_dir};
428	0	0				if($output_dir){
429	0					print "The output directory User provides is:\t",$output_dir, "\n";
430						}
431						else{
432	0					die "No output directory provided! Please perldoc RestrictionDigest for example.\n";
433						}
434	0					$output_dir=~s/^(.+)\/?$/$1/;
435
436						# Make the file handle to the reference file.
437	0					my $ref_fh=IO::File->new("$ref",'r');
438
439						# Get the basename of reference,not include the path.
440	0					my $name_reference=basename($ref);
441
442						# Make the file handles of result files.
443	0					my $all_seq_file_fh=IO::File->new(">>$output_dir/seq_frags_${name_reference}_by_${front_enzyme}_and_${later_enzyme}");
444	0					my $all_loc_file_fh=IO::File->new(">>$output_dir/position_frags_${name_reference}_by_${front_enzyme}_and_${later_enzyme}");
445	0					my $seq_in_range_file_fh=IO::File->new(">>$output_dir/seq_frags_in_range_${name_reference}_by_${front_enzyme}_and_${later_enzyme}");
446	0					my $loc_in_range_file_fh=IO::File->new(">>$output_dir/position_frags_in_range_${name_reference}_by_${front_enzyme}_and_${later_enzyme}");
447	0					my $reduced_ratio_every_scaffold_file_fh=IO::File->new(">>$output_dir/reduced_ratio_every_scaffold_${name_reference}_by_${front_enzyme}_and_${later_enzyme}");
448	0					my $summary_digestion_fh=IO::File->new(">>$output_dir/digestion_summary_${name_reference}_by_${front_enzyme}_and_${later_enzyme}");
449
450						# Get the base-compositions of these two enzymes which User inputs.
451	0					my ($front_seq,$later_seq);
452	0					foreach(keys %enzyme_list){
453	0	0				if($_=~/^$front_enzyme$/i){
454	0					$front_seq=$enzyme_list{$_};
455						}
456	0	0				if($_=~/^$later_enzyme$/i){
457	0					$later_seq=$enzyme_list{$_};
458						}
459						}
460
461						#get the relative location of cutting of the recognation sequences; $front_enzyme_cut_loc and $later_enzyme_cut_loc are two relative locations we wanted;
462	0					my @front_enzyme_rec_seq=split //, $front_seq;
463	0					my @later_enzyme_rec_seq=split //, $later_seq;
464	0					my @front_enzyme_cut_seq;
465						my @later_enzyme_cut_seq;
466	0					my $front_enzyme_rec_start=0;
467	0					my $later_enzyme_rec_start=0;
468	0					my $front_enzyme_cut_loc;
469						my $later_enzyme_cut_loc;
470	0					foreach(@front_enzyme_rec_seq){
471	0					$front_enzyme_rec_start++;
472	0	0				if($_=~/\\|/){
473	0					$front_enzyme_cut_loc=$front_enzyme_rec_start;
474						}
475						else{
476	0					push @front_enzyme_cut_seq,$_;
477						}
478						}
479	0					foreach(@later_enzyme_rec_seq){
480	0					$later_enzyme_rec_start++;
481	0	0				if($_=~/\\|/){
482	0					$later_enzyme_cut_loc=$later_enzyme_rec_start;
483						}
484						else{
485	0					push @later_enzyme_cut_seq, $_;
486						}
487						}
488
489						# Replace the degenerate bases of front enzyme;
490	0					my $num_of_front_enzyme_cut_seq=@front_enzyme_cut_seq;
491	0					for(@front_enzyme_cut_seq){
492	0	0				if($_=~/N/){$_="[ATCG]";}
	0
493	0	0				if($_=~/R/){$_="[AG]";}
	0
494	0	0				if($_=~/Y/){$_="[CT]";}
	0
495	0	0				if($_=~/M/){$_="[AC]";}
	0
496	0	0				if($_=~/K/){$_="[GT]";}
	0
497	0	0				if($_=~/S/){$_="[GC]";}
	0
498	0	0				if($_=~/W/){$_="[AT]";}
	0
499	0	0				if($_=~/H/){$_="[ATC]";}
	0
500	0	0				if($_=~/B/){$_="[GTC]";}
	0
501	0	0				if($_=~/V/){$_="[GAC]";}
	0
502	0	0				if($_=~/D/){$_="[GAT]";}
	0
503						}
504	0					my $seq_string_front_enzyme=join '', @front_enzyme_cut_seq;
505
506
507
508						# Replace the degenerate bases of later enzyme;
509	0					my $num_of_later_enzyme_cut_seq=@later_enzyme_cut_seq;
510	0					foreach(@later_enzyme_cut_seq){
511	0	0				if($_=~/N/){$_="[ATCG]";}
	0
512	0	0				if($_=~/R/){$_="[AG]";}
	0
513	0	0				if($_=~/Y/){$_="[CT]";}
	0
514	0	0				if($_=~/M/){$_="[AC]";}
	0
515	0	0				if($_=~/K/){$_="[GT]";}
	0
516	0	0				if($_=~/S/){$_="[GC]";}
	0
517	0	0				if($_=~/W/){$_="[AT]";}
	0
518	0	0				if($_=~/H/){$_="[ATC]";}
	0
519	0	0				if($_=~/B/){$_="[GTC]";}
	0
520	0	0				if($_=~/V/){$_="[GAC]";}
	0
521	0	0				if($_=~/D/){$_="[GAT]";}
	0
522						}
523	0					my $seq_string_later_enzyme=join '', @later_enzyme_cut_seq;
524
525
526
527						# Make the scalars and arrays used in the whole programme;
528	0					my $rate_overall=0;
529	0					my $rate_in_range=0;
530	0					my @lengths_of_fragments=();
531	0					my @lengths_fragments_in_range=();
532	0					my $overall_fragments_length=0;
533	0					my $overall_fragments_in_range_length=0;
534	0					my $overall_length_of_scfd=0;
535
536						########## CIRCLES START HERE!###########
537
538						# Process one scaffold one circle;
539	0					my $scaffold_name; my $scaffold_seq;
540	0					while(<$ref_fh>){
541	0					chomp;
542	0	0				if($_=~/^>/){
543	0					$scaffold_name=$_;
544						}
545						else{
546	0					print"Digesting $scaffold_name...\t";
547	0					$scaffold_seq=$_;
548	0					$reduced_ratio_every_scaffold_file_fh->print("$scaffold_name\t");
549	0					my $length_of_scafd=length $scaffold_seq;
550
551						# Set the arrays which contain cut locations of front enzyme and later enzyme;
552	0					my @front_enzyme_locs=();
553
554	0					my @later_enzyme_locs=();
555
556	0					my $seq_for_search=$scaffold_seq;
557						# Find locs of recognition of front enzyme in this scaffold;the locs are meaningful in this array; not in the human context;
558	0					my $string=$seq_string_front_enzyme;
559	0	0				if($string=~/^(\w+)$/){
		0
		0
		0
		0
		0
560	0					my $loc_in_array;
561	0					$loc_in_array=index($seq_for_search,$string);
562	0	0				unless($loc_in_array==-1){
563	0					push @front_enzyme_locs, $loc_in_array;
564						}
565	0					while($loc_in_array!=-1){
566	0					$loc_in_array=index($seq_for_search, $string, $loc_in_array+1);
567	0	0				unless($loc_in_array==-1){
568	0					push @front_enzyme_locs, $loc_in_array;
569						}
570						}
571						}
572						elsif($string=~/(\w+)\[(\w+)\](\w+)/){
573	0					my $breast_bases=$1;
574	0					my $back_bases=$3;
575	0					my $generate_bases=$2;
576	0					my @generate_bases=split //, $generate_bases;
577	0					my $seq_for_search=$scaffold_seq;
578	0					for my $one_base(@generate_bases){
579	0					$string=$breast_bases.$one_base.$back_bases;
580	0					my $loc_in_array;
581	0					$loc_in_array=index($seq_for_search,$string);
582	0	0				unless($loc_in_array==-1){
583	0					push @front_enzyme_locs, $loc_in_array;
584						}
585	0					while($loc_in_array!=-1){
586	0					$loc_in_array=index($seq_for_search, $string, $loc_in_array+1);
587	0	0				unless($loc_in_array==-1){
588	0					push @front_enzyme_locs, $loc_in_array;
589						}
590						}
591						}
592						}
593
594						elsif($string=~/^\[(\w+)\](\w+)\[(\w+)\]$/){
595	0					my $g1_bases=$1;
596	0					my $middle=$2;
597	0					my $g2_bases=$3;
598	0					my @g1_bases=split //, $g1_bases;
599	0					my @g2_bases=split //, $g2_bases;
600	0					for my $one_base(@g1_bases){
601	0					for my $two_base(@g2_bases){
602	0					$string=$one_base.$middle.$two_base;
603	0					my $loc_in_array;
604	0					$loc_in_array=index($seq_for_search,$string);
605	0	0				unless($loc_in_array==-1){
606	0					push @front_enzyme_locs, $loc_in_array;
607						}
608	0					while($loc_in_array!=-1){
609	0					$loc_in_array=index($seq_for_search, $string, $loc_in_array+1);
610	0	0				unless($loc_in_array==-1){
611	0					push @front_enzyme_locs, $loc_in_array;
612						}
613						}
614						}
615						}
616						}
617
618						elsif($string=~/(\w+)\[(\w+)\](\w+)\[(\w+)\](\w+)/){
619	0					my $breast_bases=$1;
620	0					my $g1_bases=$2;
621	0					my $middle=$3;
622	0					my $back_bases=$5;
623	0					my $g2_bases=$4;
624	0					my @g1_bases=split //, $g1_bases;
625	0					my @g2_bases=split //, $g2_bases;
626	0					for my $one_base(@g1_bases){
627	0					for my $two_base(@g2_bases){
628	0					$string=$breast_bases.$one_base.$middle.$two_base.$back_bases;
629	0					my $loc_in_array;
630	0					$loc_in_array=index($seq_for_search,$string);
631	0	0				unless($loc_in_array==-1){
632	0					push @front_enzyme_locs, $loc_in_array;
633						}
634	0					while($loc_in_array!=-1){
635	0					$loc_in_array=index($seq_for_search, $string, $loc_in_array+1);
636	0	0				unless($loc_in_array==-1){
637	0					push @front_enzyme_locs, $loc_in_array;
638						}
639						}
640						}
641						}
642						}
643						elsif($string=~/(\w+)\[(\w+)\]\[(\w+)\]\[(\w+)\](\w+)/){
644	0					my $breast_bases=$1;
645	0					my $g1_bases=$2;
646	0					my $g2_bases=$3;
647	0					my $back_bases=$5;
648	0					my $g3_bases=$4;
649	0					my @g1_bases=split //, $g1_bases;
650	0					my @g2_bases=split //, $g2_bases;
651	0					my @g3_bases=split //, $g3_bases;
652	0					for my $one_base(@g1_bases){
653	0					for my $two_base(@g2_bases){
654	0					for my $three_base(@g3_bases){
655	0					$string=$breast_bases.$one_base.$two_base.$three_base.$back_bases;
656	0					my $loc_in_array;
657	0					$loc_in_array=index($seq_for_search,$string);
658	0	0				unless($loc_in_array==-1){
659	0					push @front_enzyme_locs, $loc_in_array;
660						}
661	0					while($loc_in_array!=-1){
662	0					$loc_in_array=index($seq_for_search, $string, $loc_in_array+1);
663	0	0				unless($loc_in_array==-1){
664	0					push @front_enzyme_locs, $loc_in_array;
665						}
666						}
667						}
668						}
669						}
670						}
671						elsif($string=~/(\w+)\[(\w+)\]\[(\w+)\](\w+)/){
672	0					my $breast_bases=$1;
673	0					my $g1_bases=$2;
674	0					my $g2_bases=$3;
675	0					my $back_bases=$4;
676	0					my @g1_bases=split //, $g1_bases;
677	0					my @g2_bases=split //, $g2_bases;
678	0					for my $one_base(@g1_bases){
679	0					for my $two_base(@g2_bases){
680	0					$string=$breast_bases.$one_base.$two_base.$back_bases;
681	0					my $loc_in_array;
682	0					$loc_in_array=index($seq_for_search,$string);
683	0	0				unless($loc_in_array==-1){
684	0					push @front_enzyme_locs, $loc_in_array;
685						}
686	0					while($loc_in_array!=-1){
687	0					$loc_in_array=index($seq_for_search, $string, $loc_in_array+1);
688	0	0				unless($loc_in_array==-1){
689	0					push @front_enzyme_locs, $loc_in_array;
690						}
691						}
692						}
693						}
694						}
695
696
697
698
699						# Find locs of recognition of later enzyme in this scaffold;the locs are meaningful in this array; not in the human context;
700
701	0					$string=$seq_string_later_enzyme;
702	0	0				if($string=~/^(\w+)$/){
		0
		0
		0
		0
		0
703	0					my $loc_in_array;
704	0					$loc_in_array=index($seq_for_search,$string);
705	0	0				unless($loc_in_array==-1){
706	0					push @later_enzyme_locs, $loc_in_array;
707						}
708	0					while($loc_in_array!=-1){
709	0					$loc_in_array=index($seq_for_search, $string, $loc_in_array+1);
710	0	0				unless($loc_in_array==-1){
711	0					push @later_enzyme_locs, $loc_in_array;
712						}
713						}
714						}
715						elsif($string=~/(\w+)\[(\w+)\](\w+)/){
716	0					my $breast_bases=$1;
717	0					my $back_bases=$3;
718	0					my $generate_bases=$2;
719	0					my @generate_bases=split //, $generate_bases;
720	0					for my $one_base(@generate_bases){
721	0					$string=$breast_bases.$one_base.$back_bases;
722	0					my $loc_in_array;
723	0					$loc_in_array=index($seq_for_search,$string);
724	0	0				unless($loc_in_array==-1){
725	0					push @later_enzyme_locs, $loc_in_array;
726						}
727	0					while($loc_in_array!=-1){
728	0					$loc_in_array=index($seq_for_search, $string, $loc_in_array+1);
729	0	0				unless($loc_in_array==-1){
730	0					push @later_enzyme_locs, $loc_in_array;
731						}
732						}
733						}
734						}
735
736						elsif($string=~/^\[(\w+)\](\w+)\[(\w+)\]$/){
737	0					my $g1_bases=$1;
738	0					my $middle=$2;
739	0					my $g2_bases=$3;
740	0					my @g1_bases=split //, $g1_bases;
741	0					my @g2_bases=split //, $g2_bases;
742	0					for my $one_base(@g1_bases){
743	0					for my $two_base(@g2_bases){
744	0					$string=$one_base.$middle.$two_base;
745	0					my $loc_in_array;
746	0					$loc_in_array=index($seq_for_search,$string);
747	0	0				unless($loc_in_array==-1){
748	0					push @later_enzyme_locs, $loc_in_array;
749						}
750	0					while($loc_in_array!=-1){
751	0					$loc_in_array=index($seq_for_search, $string, $loc_in_array+1);
752	0	0				unless($loc_in_array==-1){
753	0					push @later_enzyme_locs, $loc_in_array;
754						}
755						}
756						}
757						}
758						}
759
760						elsif($string=~/(\w+)\[(\w+)\](\w+)\[(\w+)\](\w+)/){
761	0					my $breast_bases=$1;
762	0					my $g1_bases=$2;
763	0					my $middle=$3;
764	0					my $back_bases=$5;
765	0					my $g2_bases=$4;
766	0					my @g1_bases=split //, $g1_bases;
767	0					my @g2_bases=split //, $g2_bases;
768	0					for my $one_base(@g1_bases){
769	0					for my $two_base(@g2_bases){
770	0					$string=$breast_bases.$one_base.$middle.$two_base.$back_bases;
771	0					my $loc_in_array;
772	0					$loc_in_array=index($seq_for_search,$string);
773	0	0				unless($loc_in_array==-1){
774	0					push @later_enzyme_locs, $loc_in_array;
775						}
776	0					while($loc_in_array!=-1){
777	0					$loc_in_array=index($seq_for_search, $string, $loc_in_array+1);
778	0	0				unless($loc_in_array==-1){
779	0					push @later_enzyme_locs, $loc_in_array;
780						}
781						}
782						}
783						}
784						}
785						elsif($string=~/(\w+)\[(\w+)\]\[(\w+)\]\[(\w+)\](\w+)/){
786	0					my $breast_bases=$1;
787	0					my $g1_bases=$2;
788	0					my $g2_bases=$3;
789	0					my $back_bases=$5;
790	0					my $g3_bases=$4;
791	0					my @g1_bases=split //, $g1_bases;
792	0					my @g2_bases=split //, $g2_bases;
793	0					my @g3_bases=split //, $g3_bases;
794	0					for my $one_base(@g1_bases){
795	0					for my $two_base(@g2_bases){
796	0					for my $three_base(@g3_bases){
797	0					$string=$breast_bases.$one_base.$two_base.$three_base.$back_bases;
798	0					my $loc_in_array;
799	0					$loc_in_array=index($seq_for_search,$string);
800	0	0				unless($loc_in_array==-1){
801	0					push @later_enzyme_locs, $loc_in_array;
802						}
803	0					while($loc_in_array!=-1){
804	0					$loc_in_array=index($seq_for_search, $string, $loc_in_array+1);
805	0	0				unless($loc_in_array==-1){
806	0					push @later_enzyme_locs, $loc_in_array;
807						}
808						}
809						}
810						}
811						}
812						}
813						elsif($string=~/(\w+)\[(\w+)\]\[(\w+)\](\w+)/){
814	0					my $breast_bases=$1;
815	0					my $g1_bases=$2;
816	0					my $g2_bases=$3;
817	0					my $back_bases=$4;
818	0					my @g1_bases=split //, $g1_bases;
819	0					my @g2_bases=split //, $g2_bases;
820	0					for my $one_base(@g1_bases){
821	0					for my $two_base(@g2_bases){
822	0					$string=$breast_bases.$one_base.$two_base.$back_bases;
823	0					my $loc_in_array;
824	0					$loc_in_array=index($seq_for_search,$string);
825	0	0				unless($loc_in_array==-1){
826	0					push @later_enzyme_locs, $loc_in_array;
827						}
828	0					while($loc_in_array!=-1){
829	0					$loc_in_array=index($seq_for_search, $string, $loc_in_array+1);
830	0	0				unless($loc_in_array==-1){
831	0					push @later_enzyme_locs, $loc_in_array;
832						}
833						}
834						}
835						}
836						}
837
838
839
840
841						# Integrate the front enzyme locs and later enzyme locs into a hash;
842	0					my %hash;
843	0					while(@front_enzyme_locs){
844	0					my $loc=shift @front_enzyme_locs;
845	0					$hash{$loc}=$front_enzyme;
846						}
847
848	0					while(@later_enzyme_locs){
849	0					my $loc=shift @later_enzyme_locs;
850	0					$hash{$loc}=$later_enzyme;
851						}
852
853						# Sort the locs by the locations of them and restore the fragments' locs with different ends.
854	0					my @locs_sorted=sort{$a<=>$b} keys %hash;
	0
855	0					my @locs_wanted_before;
856						my @enzyme_wanted_before;
857	0					my @locs_wanted_after;
858	0					my @enzyme_wanted_after;
859	0					my ($after_loc,$enzyme_before, $enzyme_after);
860	0					my $before_loc=shift @locs_sorted;
861	0					while(@locs_sorted){
862	0					$after_loc=shift @locs_sorted;
863	0					$enzyme_before=$hash{$before_loc};
864	0					$enzyme_after=$hash{$after_loc};
865	0	0				unless($enzyme_before eq $enzyme_after){
866	0					push @locs_wanted_before, $before_loc;
867	0					push @enzyme_wanted_before, $enzyme_before;
868	0					push @locs_wanted_after,$after_loc;
869	0					push @enzyme_wanted_after, $enzyme_after;
870						}
871	0					$before_loc=$after_loc;
872						}
873
874
875						# Define the scalars which contains the length of fragments;
876	0					my $nums_of_fragments=@locs_wanted_before;
877	0					my @pairs_of_locs=(0..$nums_of_fragments-1);
878	0					my $length_of_fragments=0;
879	0					my $length_of_fragments_in_range=0;
880	0					my $count_all=0;
881	0					my $count_in_range=0;
882	0					my ($before_cut_coorr,$after_cut_coorr,$bf_ct_crr_hm_rd,$af_ct_crr_hm_rd);
883
884
885
886
887	0					while(@locs_wanted_before){
888	0					my $before_coorr=shift @locs_wanted_before;
889	0					my $before_enzyme=shift @enzyme_wanted_before;
890	0					my $after_coorr=shift @locs_wanted_after;
891	0					my $after_enzyme=shift @enzyme_wanted_after;
892	0					my $length_of_fragment;my $seq_selected;
893	0					$count_all++;
894	0	0				if($front_enzyme=~/$before_enzyme/i){
		0
895	0					$before_cut_coorr=$before_coorr+$front_enzyme_cut_loc-1;
896	0					$after_cut_coorr=$after_coorr+$later_enzyme_cut_loc-2;
897	0					$length_of_fragment=$after_cut_coorr-$before_cut_coorr+1;
898	0					$seq_selected=substr($scaffold_seq, $before_cut_coorr, $length_of_fragment);
899						}
900						elsif($later_enzyme=~/$before_enzyme/i){
901	0					$before_cut_coorr=$before_coorr+$later_enzyme_cut_loc-1;
902	0					$after_cut_coorr=$after_coorr+$later_enzyme_cut_loc-2;
903	0					$length_of_fragment=$after_cut_coorr-$before_cut_coorr+1;
904	0					$seq_selected=substr($scaffold_seq, $before_cut_coorr, $length_of_fragment);
905						}
906	0					my $bf_ct_crr=$before_cut_coorr+1;
907	0					my $af_ct_crr=$after_cut_coorr+1;
908
909						# Output the sequence and fragment positions of all-fragments to files;
910	0					$all_seq_file_fh->print("$scaffold_name-$count_all\n$seq_selected\n");
911	0					my $strand;
912	0	0				if($before_enzyme=~/$front_enzyme/i){$strand="+";}
	0	0
	0
913						elsif($before_enzyme=~/$later_enzyme/i){$strand="-";}
914	0					$all_loc_file_fh->print("$scaffold_name-$count_all\t$strand\t$before_enzyme\t$bf_ct_crr\t$after_enzyme\t$af_ct_crr\t$length_of_fragment\n");
915	0					push @lengths_of_fragments, $length_of_fragment;
916
917						# We treat the fragments in range in the same way of all fragments as above;
918	0	0				if($length_of_fragment>=$range1){
919	0	0				if($length_of_fragment<=$range2){
920	0					$count_in_range++;
921	0					$seq_in_range_file_fh->print("$scaffold_name-$count_all\n$seq_selected\n");
922	0					$loc_in_range_file_fh->print("$scaffold_name-$count_all\t$strand\t$before_enzyme\t$bf_ct_crr\t$after_enzyme\t$af_ct_crr\t$length_of_fragment\n");
923	0					push @lengths_fragments_in_range,$length_of_fragment;
924	0					$length_of_fragments_in_range+=$length_of_fragment;
925						}
926						}
927
928						# Count all fragments length no matter in range or not!
929	0					$length_of_fragments+=$length_of_fragment;
930
931
932						}
933	0					my $reduced_rate=$length_of_fragments/$length_of_scafd;
934	0					my $fragment_in_range_rate_scafd=$length_of_fragments_in_range/$length_of_scafd;
935	0					$overall_fragments_length+=$length_of_fragments;
936	0					$overall_fragments_in_range_length+=$length_of_fragments_in_range;
937	0					$overall_length_of_scfd+=$length_of_scafd;
938	0					$reduced_ratio_every_scaffold_file_fh->print("$reduced_rate\t$fragment_in_range_rate_scafd\n");
939	0					print"Done!\n";
940						}
941						}
942	0					my $length_ratio_all_frags=$overall_fragments_length/$overall_length_of_scfd;
943	0					my $length_ratio_frags_in_range=$overall_fragments_in_range_length/$overall_length_of_scfd;
944	0					my $num_all_frags=@lengths_of_fragments;
945	0					my $num_frags_in_range=@lengths_fragments_in_range;
946	0					my $lengths_fragments_in_range=\@lengths_fragments_in_range;
947
948
949						# Output the summary of digestion.
950	0					$summary_digestion_fh->print("Expected all fragments:\t$num_all_frags\n");
951	0					$summary_digestion_fh->print("Expected fragments in range:\t$num_frags_in_range\n");
952	0					$summary_digestion_fh->print("Expected reducing ratio of all fragments:\t$length_ratio_all_frags\n");
953	0					$summary_digestion_fh->print("Expected reducing ratio of fragments in range:\t$length_ratio_frags_in_range\n");
954
955	0					my $lengths_distribution_start=$self->{lengths_distribution_start};
956	0					my $lengths_distribution_end=$self->{lengths_distribution_end};
957	0					my $lengths_distribution_step=$self->{lengths_distribution_step};
958	0					my $lengths_distribution_tmp;
959	0					my (@starts,@ends,@counts,%lengths_distribution);
960
961	0					my $num_pair=0;
962
963	0					for($lengths_distribution_tmp= $lengths_distribution_start;$lengths_distribution_tmp<$lengths_distribution_end+$lengths_distribution_step;$lengths_distribution_tmp=$lengths_distribution_tmp+$lengths_distribution_step){
964	0	0				if($lengths_distribution_tmp== $lengths_distribution_start){
		0
965	0					$num_pair++;
966	0					push @starts, '0';
967	0					push @ends , $lengths_distribution_tmp;
968	0					$lengths_distribution{$num_pair}{"0bp-${lengths_distribution_tmp}bp"}=0;
969
970	0					$num_pair++;
971	0					my $tmp_start=$lengths_distribution_tmp+1;
972	0					my $tmp_end=$lengths_distribution_tmp+$lengths_distribution_step;
973	0					push @starts, $tmp_start;
974	0					push @ends, $tmp_end;
975	0					$lengths_distribution{$num_pair}{"${tmp_start}bp-${tmp_end}bp"}=0;
976
977						}
978						elsif($lengths_distribution_tmp < $lengths_distribution_end ){
979	0	0				if($lengths_distribution_tmp+$lengths_distribution_step <$lengths_distribution_end ){
		0
980	0					$num_pair++;
981	0					my $tmp_start=$lengths_distribution_tmp+1;
982	0					my $tmp_end=$lengths_distribution_tmp+$lengths_distribution_step;
983	0					push @starts, $tmp_start;
984	0					push @ends, $tmp_end;
985	0					$lengths_distribution{$num_pair}{"${tmp_start}bp-${tmp_end}bp"}=0;
986
987						}
988						elsif($lengths_distribution_tmp +$lengths_distribution_step >= $lengths_distribution_end){
989	0					$num_pair++;
990	0					my $tmp_start=$lengths_distribution_tmp+1;
991	0					my $tmp_end=$lengths_distribution_end;
992	0					push @starts, $tmp_start;
993	0					push @ends, $tmp_end;
994	0					$lengths_distribution{$num_pair}{"${tmp_start}bp-${tmp_end}bp"}=0;
995
996	0					$num_pair++;
997	0					$tmp_start=$tmp_end+1;
998
999	0					push @starts, $tmp_start;
1000	0					push @ends, 'bigger';
1001	0					$lengths_distribution{$num_pair}{"${tmp_start}bp-bigger bp"}=0;
1002
1003						}
1004						}
1005						}
1006
1007
1008
1009	0					for (@lengths_of_fragments){
1010	0					my $length=$_;
1011	0					my $num_pair=1;
1012	0					my $num_last_pair=@starts;
1013	0					for($num_pair=1;$num_pair<=$num_last_pair;$num_pair++){
1014
1015	0					my $array_index=$num_pair-1;
1016	0					my $start=$starts[$array_index];
1017	0					my $end=$ends[$array_index];
1018	0	0				if($num_pair<$num_last_pair){
		0
1019	0	0	0			if($length>=$start && $length<=$end){
1020	0					for my $description (keys %{$lengths_distribution{$num_pair}}){
	0
1021	0					$lengths_distribution{$num_pair}{$description}++;
1022	0					last;
1023						}
1024	0					last;
1025						}
1026						}
1027						elsif($num_pair==$num_last_pair){
1028	0	0				if($length>=$start){
1029	0					for my $description (keys %{$lengths_distribution{$num_pair}}){
	0
1030	0					$lengths_distribution{$num_pair}{$description}++;
1031	0					last;
1032						}
1033	0					last;
1034						}
1035						}
1036						}
1037						}
1038
1039	0					$summary_digestion_fh->print("\nLengths\' scope\tNumber of fragments in this scope\tRatio of these fragments in all\n");
1040
1041	0					for(1..@starts){
1042	0					my $num_pair=$_;
1043	0					for my $description(keys %{$lengths_distribution{$num_pair}}){
	0
1044	0					my $num_fragments=$lengths_distribution{$num_pair}{$description};
1045	0					my $ratio;
1046	0	0				if($num_all_frags>0){
1047	0					$ratio=$num_fragments/$num_all_frags;
1048						}else{
1049	0					$ratio='N/A';
1050						}
1051	0					$summary_digestion_fh->print("$description\t$num_fragments\t$ratio\n");
1052						}
1053						}
1054	0					$summary_digestion_fh->print("\n");
1055						}
1056
1057						=head2 add_SNPs
1058
1059						$digest->add_SNPs(-SNPs=>'Full path of the SNPs file');
1060						Adds the absolute path of the SNPs file to RestrictionDigest variable.
1061						=cut
1062
1063						sub add_SNPs {
1064	0			0		my $self=shift;
1065	0					my %parameters=@_;
1066	0					for (keys %parameters){
1067	0	0				if($_=~/^-SNPs$/){
1068	0					$self->{SNPs}=$parameters{$_};
1069						}
1070						else{
1071	0					die "Unacceptable parameters of the method , please perldoc RestrictionDigest for example or read README for more help\n";
1072						}
1073						}
1074						}
1075
1076						=head2 count_SNPs_at_fragments
1077
1078						$digest->count_SNPs_at_fragments(-sequence_type=>'100SE',-sequence_end=>'front_enzyme');
1079						Count the expected SNPs appeared in the framgents.
1080						=cut
1081
1082						sub count_SNPs_at_fragments {
1083	0			0		my $self=shift;
1084	0					my %parameters=@_;
1085
1086	0	0				if(keys %parameters){
1087	0					for(keys %parameters){
1088	0	0				if($_=~/^-sequence_type$/){
		0
1089	0					$self->{sequence_type}=$parameters{$_};
1090						}
1091						elsif($_=~/^-sequence_end$/){
1092	0					$self->{sequence_end}=$parameters{$_};
1093						}
1094						else{
1095	0					die "Unacceptable parameters in the method , please perldoc RestrictionDigest for example or read README for more help\n";
1096						}
1097						}
1098						}
1099
1100	0					my $ref=$self->{ref};
1101	0					my $front_enzyme=$self->{enzyme1};
1102	0					my $later_enzyme=$self->{enzyme2};
1103	0					my $range1=$self->{range_start};
1104	0					my $SNPs=$self->{SNPs};
1105	0	0				if($SNPs){
1106	0					my $SNPs_basename=basename($SNPs);
1107	0					print "The SNPs file provided is\t", $SNPs_basename, ".\n";
1108						}
1109						else{
1110	0					die"No SNPs file provided! Please perldoc RestrictionDigest for example.\n";
1111						}
1112
1113	0					my $sequence_type=$self->{sequence_type};
1114
1115						# Check the sequencing type and determine the domains of the fragments to calculate the SNPs number.
1116	0					my ($pe_sequence_length, $se_sequence_length);
1117	0					my $sequence_type_code;
1118	0	0				if($sequence_type=~/^(\d+)pe$/i){
		0
1119	0					$sequence_type_code=2;
1120	0					my $sequence_length=$1;
1121	0	0				unless($sequence_length*2 <= $range1){
1122	0					die"Unacceptable sequence_type when range's front border is $range1 bp. Please perldoc RestrictionDigest for example or read README for more help!\n";
1123						}
1124						else{
1125	0					$pe_sequence_length=$sequence_length;
1126						}
1127						}
1128						elsif($sequence_type=~/^(\d+)se$/i){
1129	0					$sequence_type_code=1;
1130	0					my $sequence_length=$1;
1131	0	0				unless($sequence_length<=$range1){
1132	0					die "Unacceptable sequence_type when range's front border is $range1 bp. Please perldoc RestrictionDigest for example or read README for more help!\n";
1133						}
1134						else{
1135	0					$se_sequence_length=$sequence_length;
1136						}
1137						}
1138						else{
1139	0					die "Unacceptable sequence_type. Please perldoc RestrictionDigest for example or read README for more help!\n";
1140						}
1141
1142	0					print "The sequencing type provided is\t", $sequence_type, ".\n";
1143
1144						# Check the sequencing end of single-end sequencing.
1145	0					my $se_sequence_end;
1146						my $se_sequence_end_code;
1147	0	0				if($sequence_type_code==1){
1148	0					$se_sequence_end=$self->{sequence_end};
1149	0	0				if($se_sequence_end=~/^front_enzyme$/i){
		0
1150	0					$se_sequence_end_code=1;
1151	0					print "The sequencing end provided is\t", $se_sequence_end, ".\n";
1152						}
1153						elsif($se_sequence_end=~/^behind_enzyme$/i){
1154	0					$se_sequence_end_code=2;
1155	0					print "The sequencing end provided is\t", $se_sequence_end, ".\n";
1156						}
1157						else{
1158	0					die "Unacceptable sequence_end. Please perldoc RestrictionDigest for example or read README for more help!\n";
1159						}
1160						}
1161
1162
1163
1164	0					my $output_dir=$self->{output_dir};
1165	0					$output_dir=~s/^(.+)\/?$/$1/;
1166
1167						# Get the basename of reference,not include the path.
1168	0					my $name_reference=basename($ref);
1169
1170						# Make the file handle to the summary file;
1171	0					my $summary_digestion_fh=IO::File->new(">>$output_dir/digestion_summary_${name_reference}_by_${front_enzyme}_and_${later_enzyme}");
1172
1173						# Make the file handle to the locs file of all fragments and fragments in range.
1174	0					my $all_loc_file_fh=IO::File->new("$output_dir/position_frags_${name_reference}_by_${front_enzyme}_and_${later_enzyme}",'r');
1175	0					my $loc_in_range_file_fh=IO::File->new("$output_dir/position_frags_in_range_${name_reference}_by_${front_enzyme}_and_${later_enzyme}",'r');
1176
1177						# Hold SNPs into hash.
1178	0					my $SNPs_fh=IO::File->new("$SNPs",'r');
1179	0					my %SNPs;
1180	0					while(<$SNPs_fh>){
1181	0					chomp;
1182	0					my ($SNPs_scaffold_name,$SNPs_pos,$type)=split /\s+/, $_;
1183	0					push @{$SNPs{$SNPs_scaffold_name}}, $SNPs_pos;
	0
1184						}
1185	0					print "Counting SNPs at the output fragments...\t";
1186
1187						# Create the SNPs files to hold SNPs of all fragments and fragments in range.
1188	0					my $SNPs_at_all_frags_fh=IO::File->new(">>$output_dir/SNPs_at_all_frags_${name_reference}_by_${front_enzyme}_and_${later_enzyme}");
1189	0					my $SNPs_at_frags_in_range_fh=IO::File->new(">>$output_dir/SNPs_at_frags_in_range_${name_reference}_by_${front_enzyme}_and_${later_enzyme}");
1190
1191
1192
1193						# Scan all locs file to count SNPs and output the result to the summary file.
1194	0					my $all_SNPs_number=0;
1195	0	0				if($sequence_type_code == 1){
		0
1196	0	0				if($se_sequence_end_code==1){
		0
1197	0					while(<$all_loc_file_fh>){
1198	0					chomp;
1199	0					my($fragment_name,$strand,$enzyme1,$pos1,$enzyme2,$pos2,$length)=split /\t/, $_;
1200	0					my $scaffold_name;
1201	0	0				if($fragment_name=~/^>(.+)-(\d+)/){
1202	0					$scaffold_name=$1;
1203						}
1204	0	0				if($SNPs{$scaffold_name}){
1205	0					for my $SNP(@{$SNPs{$scaffold_name}}){
	0
1206	0	0				if($front_enzyme=~/$enzyme1/i){
		0
1207	0	0	0			if($SNP >=$pos1 && $SNP <=($pos1+$se_sequence_length-1)){
1208	0					$all_SNPs_number++;
1209	0					$SNPs_at_all_frags_fh->print("$scaffold_name\t$SNP\n");
1210						}
1211						}
1212						elsif($front_enzyme=~/$enzyme2/i){
1213	0	0	0			if($SNP >= ($pos2-$se_sequence_length+1) && $SNP <=$pos2){
1214	0					$all_SNPs_number++;
1215	0					$SNPs_at_all_frags_fh->print("$scaffold_name\t$SNP\n");
1216						}
1217						}
1218						}
1219						}
1220						}
1221						}
1222						elsif($se_sequence_end_code==2){
1223	0					while(<$all_loc_file_fh>){
1224	0					chomp;
1225	0					my($fragment_name,$strand,$enzyme1,$pos1,$enzyme2,$pos2,$length)=split /\t/, $_;
1226	0					my $scaffold_name;
1227	0	0				if($fragment_name=~/^>(.+)-(\d+)/){
1228	0					$scaffold_name=$1;
1229						}
1230	0	0				if($SNPs{$scaffold_name}){
1231	0					for my $SNP(@{$SNPs{$scaffold_name}}){
	0
1232	0	0				if($later_enzyme=~/$enzyme1/i){
		0
1233	0	0	0			if($SNP>=$pos1 && $SNP<=($pos1+$se_sequence_length-1)){
1234	0					$all_SNPs_number++;
1235	0					$SNPs_at_all_frags_fh->print("$scaffold_name\t$SNP\n");
1236						}
1237						}
1238						elsif($later_enzyme=~/$enzyme2/i){
1239	0	0	0			if($SNP>= ($pos2-$se_sequence_length+1) && $SNP<=$pos2){
1240	0					$all_SNPs_number++;
1241	0					$SNPs_at_all_frags_fh->print("$scaffold_name\t$SNP\n");
1242						}
1243						}
1244						}
1245						}
1246						}
1247						}
1248	0					$summary_digestion_fh->print("Expected SNPs at all fragments via $sequence_type:\t$all_SNPs_number\n");
1249						}
1250						elsif($sequence_type_code ==2){
1251	0					while(<$all_loc_file_fh>){
1252	0					chomp;
1253	0					my($fragment_name,$strand,$enzyme1,$pos1,$enzyme2,$pos2,$length)=split /\t/, $_;
1254	0					my $scaffold_name;
1255	0	0				if($fragment_name=~/^>(.+)-(\d+)/){
1256	0					$scaffold_name=$1;
1257						}
1258	0	0				if($SNPs{$scaffold_name}){
1259	0					for my $SNP(@{$SNPs{$scaffold_name}}){
	0
1260	0	0	0			if( ($SNP>=$pos1 && $SNP<=($pos1+$pe_sequence_length-1)) \|\| ( $SNP>= ($pos2-$pe_sequence_length+1) && $SNP<=$pos2 ) ){
			0
			0
1261	0					$all_SNPs_number++;
1262	0					$SNPs_at_all_frags_fh->print("$scaffold_name\t$SNP\n");
1263						}
1264						}
1265						}
1266						}
1267	0					$summary_digestion_fh->print("Expected SNPs at all fragments via $sequence_type:\t$all_SNPs_number\n");
1268						}
1269
1270
1271						# Scan locs in range file to count SNPs and output the result to the summary file.
1272
1273	0					my $SNPs_in_range_number=0;
1274
1275	0	0				if($sequence_type_code == 1){
		0
1276	0	0				if($se_sequence_end_code==1){
		0
1277	0					while(<$loc_in_range_file_fh>){
1278	0					chomp;
1279	0					my($fragment_name,$strand,$enzyme1,$pos1,$enzyme2,$pos2,$length)=split /\t/, $_;
1280	0					my $scaffold_name;
1281	0	0				if($fragment_name=~/^>(.+)-(\d+)/){
1282	0					$scaffold_name=$1;
1283						}
1284	0	0				if($SNPs{$scaffold_name}){
1285	0					for my $SNP(@{$SNPs{$scaffold_name}}){
	0
1286	0	0				if($front_enzyme=~/$enzyme1/i){
		0
1287	0	0	0			if($SNP>=$pos1 && $SNP<=($pos1+$se_sequence_length-1)){
1288	0					$SNPs_in_range_number++;
1289	0					$SNPs_at_frags_in_range_fh->print("$scaffold_name\t$SNP\n");
1290						}
1291						}
1292						elsif($front_enzyme=~/$enzyme2/i){
1293	0	0	0			if($SNP>= ($pos2-$se_sequence_length+1) && $SNP<=$pos2){
1294	0					$SNPs_in_range_number++;
1295	0					$SNPs_at_frags_in_range_fh->print("$scaffold_name\t$SNP\n");
1296						}
1297						}
1298						}
1299						}
1300						}
1301						}
1302						elsif($se_sequence_end_code==2){
1303	0					while(<$loc_in_range_file_fh>){
1304	0					chomp;
1305	0					my($fragment_name,$strand,$enzyme1,$pos1,$enzyme2,$pos2,$length)=split /\t/, $_;
1306	0					my $scaffold_name;
1307	0	0				if($fragment_name=~/^>(.+)-(\d+)/){
1308	0					$scaffold_name=$1;
1309						}
1310	0	0				if($SNPs{$scaffold_name}){
1311	0					for my $SNP(@{$SNPs{$scaffold_name}}){
	0
1312	0	0				if($later_enzyme=~/$enzyme1/i){
		0
1313	0	0	0			if($SNP>=$pos1 && $SNP<=($pos1+$se_sequence_length-1)){
1314	0					$SNPs_in_range_number++;
1315	0					$SNPs_at_frags_in_range_fh->print("$scaffold_name\t$SNP\n");
1316						}
1317						}
1318						elsif($later_enzyme=~/$enzyme2/i){
1319	0	0	0			if($SNP>= ($pos2-$se_sequence_length+1) && $SNP<=$pos2){
1320	0					$SNPs_in_range_number++;
1321	0					$SNPs_at_frags_in_range_fh->print("$scaffold_name\t$SNP\n");
1322						}
1323						}
1324						}
1325						}
1326						}
1327						}
1328	0					$summary_digestion_fh->print("Expected SNPs at fragments in range via $sequence_type:\t$SNPs_in_range_number\n");
1329						}
1330						elsif($sequence_type_code ==2){
1331	0					while(<$loc_in_range_file_fh>){
1332	0					chomp;
1333	0					my($fragment_name,$strand,$enzyme1,$pos1,$enzyme2,$pos2,$length)=split /\t/, $_;
1334	0					my $scaffold_name;
1335	0	0				if($fragment_name=~/^>(.+)-(\d+)/){
1336	0					$scaffold_name=$1;
1337						}
1338	0	0				if($SNPs{$scaffold_name}){
1339	0					for my $SNP(@{$SNPs{$scaffold_name}}){
	0
1340	0	0	0			if( ($SNP>=$pos1 && $SNP<=($pos1+$pe_sequence_length-1)) \|\| ( $SNP>= ($pos2-$pe_sequence_length+1) && $SNP<=$pos2 ) ){
			0
			0
1341	0					$SNPs_in_range_number++;
1342	0					$SNPs_at_frags_in_range_fh->print("$scaffold_name\t$SNP\n");
1343						}
1344						}
1345						}
1346						}
1347	0					$summary_digestion_fh->print("Expected SNPs at fragments in range via $sequence_type:\t$SNPs_in_range_number\n");
1348						}
1349	0					print "Done!\n";
1350						}
1351
1352
1353						=head2 add_gff
1354
1355						$digest->add_gff(-ref=>'Full path of the gff file');
1356						Adds the absolute path of gff file to RestrictionDigest variable.
1357						If User want to calculate the coverage ratio of different genomic strutures by the digested
1358						fragments, this method is obligatory.
1359						=cut
1360
1361						sub add_gff {
1362	0			0		my $self=shift;
1363	0					my %parameters=@_;
1364	0					for (keys %parameters){
1365	0	0				if($_=~/^-gff$/){
1366	0					$self->{gff}=$parameters{$_};
1367						}
1368						else{
1369	0					die"Unacceptable parameters of the method , please perldoc RestrictionDigest for example or read README for more help\n";
1370						}
1371						}
1372						}
1373
1374						=head2 all_frags_coverage_ratio
1375
1376						$digest->all_frags_coverage_ratio();
1377						Calculate the coverage ratio of different parts of the genome reference,
1378						including mRNA, CDS, UTR and intergenic region, by all digested fragments.
1379						This process will take a very long time, please be patient.
1380						=cut
1381
1382						sub all_frags_coverage_ratio {
1383	0			0		my $self=shift;
1384
1385	0					my $ref=$self->{ref};
1386	0					my $basename_ref=basename($ref);
1387
1388	0					my $e1=$self->{enzyme1};
1389	0					my $e2=$self->{enzyme2};
1390
1391	0					my $output_dir=$self->{output_dir};
1392	0					$output_dir=~s/^(.+)\/?$/$1/;
1393
1394	0					my $loc_file_all_frags="$output_dir/position_frags_${basename_ref}_by_${e1}_and_${e2}";
1395
1396	0					$self->genome_structure_coverage_ratio($loc_file_all_frags);
1397						}
1398
1399						=head2 frags_in_range_coverage_ratio
1400
1401						$digest->frags_in_range_coverage_ratio();
1402						Calculate the coverage ratio of different parts of the genome reference,
1403						including mRNA, CDS, UTR and intergenic region, by the fragments in range.
1404						This process will take a very long time, please be patient.
1405						=cut
1406
1407						sub frags_in_range_coverage_ratio {
1408	0			0		my $self=shift;
1409
1410	0					my $ref=$self->{ref};
1411	0					my $basename_ref=basename($ref);
1412
1413	0					my $e1=$self->{enzyme1};
1414	0					my $e2=$self->{enzyme2};
1415
1416	0					my $output_dir=$self->{output_dir};
1417	0					$output_dir=~s/^(.+)\/?$/$1/;
1418
1419	0					my $loc_file_all_frags="$output_dir/position_frags_in_range_${basename_ref}_by_${e1}_and_${e2}";
1420
1421	0					$self->genome_structure_coverage_ratio($loc_file_all_frags);
1422						}
1423
1424						=head2 genome_structure_coverage_ratio
1425
1426						A subroutine invoked by the 'all_frags_coverage_ratio' and
1427						the 'frags_in_range_coverage_ratio' methods.
1428						User do not use this subroutine.
1429
1430						=cut
1431
1432
1433						sub genome_structure_coverage_ratio {
1434	0			0		my $self=shift;
1435	0					my $loc_frags_file=shift;
1436
1437						# Get the reference name, enzyme names from hash.
1438	0					my $ref=$self->{ref};
1439	0					my $ref_fh=IO::File->new($ref,'r');
1440
1441	0					my $basename_ref=basename($ref);
1442	0					my $e1=$self->{enzyme1};
1443	0					my $e2=$self->{enzyme2};
1444	0					my $output_dir=$self->{output_dir};
1445	0					$output_dir=~s/^(.+)\/?$/$1/;
1446
1447						# Get the full path and name of gff file and make a filehandle to it.
1448	0					my $gff=$self->{gff};
1449
1450	0	0				if($gff){
1451	0					my $gff_basename=basename($gff);
1452	0					print "The GFF file provided is\t", $gff_basename, ".\n";
1453						}
1454						else{
1455	0					die"No GFF file provided! Please perldoc RestrictionDigest for example.\n";
1456						}
1457
1458						# Make output filehandles to generic region and intergenic region coverage ratio files.
1459	0					my $coverage_ratio_fh;
1460
1461	0					my $basename_loc_frags_file=basename($loc_frags_file);
1462
1463	0	0				if($basename_loc_frags_file =~/in_range/){
1464	0					$coverage_ratio_fh=IO::File->new(">>$output_dir/genome_coverage_ratio_in_range_${basename_ref}_by_${e1}_and_${e2}");
1465						}
1466						else{
1467	0					$coverage_ratio_fh=IO::File->new(">>$output_dir/genome_coverage_ratio_${basename_ref}_by_${e1}_and_${e2}");
1468						}
1469
1470	0					$coverage_ratio_fh->print("ScaffoldName\tIntergenicLength\tIntergenicMapLength\tIntergenicMapRatio\tGenesLength\tGenesMapLength\tGenesMapRatio\tExonsLength\tExonsMapLength\tExonsMapRatio\tIntronsLength\tIntronsMapLength\tIntronsMapRatio\n");
1471
1472
1473						# Set variables used in this program.
1474	0					my($all_intergenic_length,$all_intergenic_map_length,$all_gene_length,$all_gene_map_length,$all_exon_length,$all_exon_map_length,$all_intron_length,$all_intron_map_length)=qw(0 0 0 0 0 0 0 0);
1475
1476
1477
1478						# Fetch the lengths of different parts of this scaffold from the GFF file.
1479	0					my $gff_fh=IO::File->new($gff,'r');
1480	0					my %gff;
1481	0					print "Scanning the GFF file...\t";
1482	0					while(<$gff_fh>){
1483	0					chomp;
1484
1485	0	0				unless($_=~/^#/){
1486	0					my @gff=split /\t/, $_;
1487	0	0				if(@gff == 9){
1488	0					my ($gff_scfd_name,$gff_type,$gff_start,$gff_stop)=@gff[0,2,3,4];
1489	0	0				if($gff_type=~/CDS\|exon/){
		0
1490	0					$gff_type='exon';
1491	0	0				if($gff{$gff_scfd_name}{$gff_type}){
1492
1493	0	0				unless(grep {$gff_start == $_} @{$gff{$gff_scfd_name}{$gff_type}}){
	0
	0
1494
1495	0					push @{$gff{$gff_scfd_name}{$gff_type}}, ($gff_start,$gff_stop);
	0
1496						}
1497						}
1498						else{
1499	0					push @{$gff{$gff_scfd_name}{$gff_type}},($gff_start,$gff_stop);
	0
1500						}
1501						}
1502						elsif($gff_type=~/mRNA\|gene/){
1503	0					$gff_type='gene';
1504	0	0				if($gff{$gff_scfd_name}{$gff_type}){
1505	0	0				unless(grep {$gff_start == $_} @{$gff{$gff_scfd_name}{$gff_type}}){
	0
	0
1506	0					push @{$gff{$gff_scfd_name}{$gff_type}},($gff_start,$gff_stop);
	0
1507						}
1508						}
1509						else{
1510	0					push @{$gff{$gff_scfd_name}{$gff_type}},($gff_start,$gff_stop);
	0
1511						}
1512						}
1513						}
1514						}
1515						}
1516	0					$gff_fh->close;
1517	0					print "Done!\n";
1518
1519						# Scan the loc file and fetch the information.
1520
1521	0					my $loc_frags_file_fh=IO::File->new($loc_frags_file,'r');
1522	0					my %all_locs;
1523	0					print "Scanning the positions file...\t";
1524	0					while(<$loc_frags_file_fh>){
1525	0					chomp;
1526	0					my($frgt_name,$strand,$front_enzyme,$start_coor,$later_enzyme,$stop_coor,$length)=split /\t/,$_;
1527	0					my $scfd_frgt;
1528	0	0				if($frgt_name=~/^>(.+)-\d+$/){
1529	0					$scfd_frgt=$1;
1530	0					push @{$all_locs{$scfd_frgt}}, ($start_coor, $stop_coor);
	0
1531						}
1532						}
1533	0					$loc_frags_file_fh->close;
1534	0					print "Done!\n";
1535
1536
1537						# Handle every scaffold one by one.
1538	0					my($scfd_name);
1539	0					while(<$ref_fh>){
1540	0					chomp;
1541	0	0				if($_=~/^>(\S+)$/){
1542	0					$scfd_name=$1;
1543						}
1544						else{
1545	0					my $scaffold_length=length $_;
1546	0					print "CoverageRatio: Processing $scfd_name...\t";
1547
1548
1549						# Calculate the lengths of different parts of this scaffold.
1550	0					my ($exon_length,$intron_length,$gene_length,$intergenic_length)=qw(0 0 0 0);
1551
1552	0	0				if($gff{$scfd_name}{gene}){
1553	0					my @gene_tmp=@{$gff{$scfd_name}{gene}};
	0
1554
1555	0	0				if(@gene_tmp % 2){
1556	0					print "gene tmp not correct\n";
1557						}
1558
1559	0					while(@gene_tmp){
1560	0					my $front=shift @gene_tmp;
1561	0					my $behind=shift @gene_tmp;
1562	0					$gene_length+=$behind-$front+1;
1563						}
1564	0					$intergenic_length=$scaffold_length-$gene_length;
1565						}else{
1566	0					$intergenic_length=$scaffold_length;
1567						}
1568
1569	0	0				if($gff{$scfd_name}{exon}){
1570	0					my @exon_tmp=@{$gff{$scfd_name}{exon}};
	0
1571
1572	0	0				if(@exon_tmp % 2){
1573	0					print "exon tmp not correct\n";
1574						}
1575
1576	0					while(@exon_tmp){
1577	0					my $front=shift @exon_tmp;
1578	0					my $behind=shift @exon_tmp;
1579	0					$exon_length+=$behind-$front+1;
1580						}
1581	0					$intron_length=$gene_length-$exon_length;
1582						}
1583	0					$all_intergenic_length+=$intergenic_length;
1584	0					$all_gene_length+=$gene_length;
1585	0					$all_exon_length+=$exon_length;
1586	0					$all_intron_length+=$intron_length;
1587
1588						# Scan the locs file and fetch the map lengths of different parts.
1589	0					my $gene_map_length=0;
1590	0					my $exon_map_length=0;
1591	0					my $intron_map_length=0;
1592	0					my $intergenic_map_length=0;
1593
1594						# Extract fragments' locs for %all_locs;
1595	0	0				if($all_locs{$scfd_name}){
1596	0					my @tmp_locs=@{$all_locs{$scfd_name}};
	0
1597	0					while(@tmp_locs){
1598	0					my $front=shift @tmp_locs;
1599	0					my $behind=shift @tmp_locs;
1600	0					for my $one($front..$behind){
1601	0					my $gene_map_count=0;
1602	0					my $exon_map_count=0;
1603	0					my $intron_map_count=0;
1604
1605	0	0				if($gff{$scfd_name}{gene}){
1606	0					my @gene_tmp=@{$gff{$scfd_name}{gene}};
	0
1607	0					while(@gene_tmp){
1608	0					my $front=shift @gene_tmp;
1609	0					my $behind=shift @gene_tmp;
1610	0	0	0			if($one>=$front && $one<=$behind){
1611	0					$gene_map_count++;
1612	0					$gene_map_length++;
1613	0					last;
1614						}
1615						}
1616						}
1617
1618	0	0				if($gff{$scfd_name}{exon}){
1619	0					my @exon_tmp=@{$gff{$scfd_name}{exon}};
	0
1620	0					while(@exon_tmp){
1621	0					my $front=shift @exon_tmp;
1622	0					my $behind=shift @exon_tmp;
1623	0	0	0			if($one>=$front && $one<=$behind){
1624	0					$exon_map_count++;
1625	0					$exon_map_length++;
1626	0					last;
1627						}
1628						}
1629						}
1630
1631	0	0				if($gene_map_count==0){
		0
1632	0					$intergenic_map_length++;
1633						}
1634						elsif($exon_map_count==0){
1635	0					$intron_map_length++;
1636						}
1637						}
1638						}
1639						}
1640
1641
1642
1643						# Output part!!!
1644	0					my($intergenic_map_ratio,$gene_map_ratio,$exon_map_ratio,$intron_map_ratio)=qw(0 0 0 0);
1645	0	0				unless($intergenic_length==0){
	0
1646	0					$intergenic_map_ratio=$intergenic_map_length/$intergenic_length;
1647						}
1648						else{$intergenic_map_ratio="N/A";
1649						}
1650
1651	0	0				unless($gene_length==0){
	0
1652	0					$gene_map_ratio=$gene_map_length/$gene_length;
1653						}
1654						else{$gene_map_ratio="N/A";
1655						}
1656
1657	0	0				unless($exon_length==0){
	0
1658	0					$exon_map_ratio=$exon_map_length/$exon_length;
1659						}
1660						else{$exon_map_ratio="N/A";
1661						}
1662
1663	0	0				unless($intron_length==0){
	0
1664	0					$intron_map_ratio=$intron_map_length/$intron_length;
1665						}
1666						else{$intron_map_ratio="N/A";
1667						}
1668	0					$all_intergenic_map_length+=$intergenic_map_length;
1669	0					$all_gene_map_length+=$gene_map_length;
1670	0					$all_exon_map_length+=$exon_map_length;
1671	0					$all_intron_map_length+=$intron_map_length;
1672
1673
1674	0					$coverage_ratio_fh->print("$scfd_name\t$intergenic_length\t$intergenic_map_length\t$intergenic_map_ratio\t$gene_length\t$gene_map_length\t$gene_map_ratio\t$exon_length\t$exon_map_length\t$exon_map_ratio\t$intron_length\t$intron_map_length\t$intron_map_ratio\n");
1675	0					print "Done!\n";
1676						}
1677						}
1678	0					my ($all_intergenic_map_ratio,$all_gene_map_ratio,$all_exon_map_ratio,$all_intron_map_ratio)=qw(0 0 0 0);
1679	0	0				if($all_gene_length==0){$all_gene_map_ratio='N/A';}
	0
1680						else{
1681	0					$all_gene_map_ratio=$all_gene_map_length/$all_gene_length;}
1682	0	0				if($all_exon_length==0){$all_exon_map_ratio='N/A';}
	0
1683						else{
1684	0					$all_exon_map_ratio=$all_exon_map_length/$all_exon_length;}
1685	0	0				if($all_intron_length==0){$all_intron_map_ratio='N/A';}
	0
1686						else{
1687	0					$all_intron_map_ratio=$all_intron_map_length/$all_intron_length;}
1688	0	0				if($all_intergenic_length==0){$all_intergenic_map_ratio='N/A';}
	0
1689						else{
1690	0					$all_intergenic_map_ratio=$all_intergenic_map_length/$all_intergenic_length;}
1691
1692	0					$coverage_ratio_fh->print("Intergenic region map ratio is\t$all_intergenic_map_ratio\nGenes region map ratio is\t$all_gene_map_ratio\nExon region map ratio is\t$all_exon_map_ratio\nIntron region map ratio is\t$all_intron_map_ratio\n");
1693						}
1694
1695
1696
1697
1698						package RestrictionDigest::Single;
1699
1700	1			1	20	use 5.8.0;
	1				2
	1				34
1701	1			1	6	use strict;
	1				2
	1				37
1702	1			1	3	use warnings FATAL => 'all';
	1				1
	1				66
1703
1704						=head1 NAME
1705
1706
1707						RestrictionDigest::Single - the simulation tool for singel enzyme digestion!
1708
1709						=head1 VERSION
1710
1711						Version 0.01
1712
1713						=cut
1714
1715						our $VERSION = '0.01';
1716
1717
1718
1719						=head1 SYNOPSIS
1720
1721						RestrictionDigest::Single is used for the simulation of single-enzyme GBS/RAD approaches.
1722
1723
1724						use RestrictionDigest;
1725
1726						$digest = RestrictionDigest::Single->new();
1727
1728						$digest->add_ref($reference_file); # $reference_file refers to the full path of the reference file;
1729
1730						$digest->add_single_enzyme($enzyme); # $enzyme refers to the enzyme User wants to use in the digestion simulation;
1731
1732						$digest->add_output_dir($output_dir);# $output_dir="A directory in which you want to put the result files";
1733
1734						$species->single_digest();
1735
1736
1737						=cut
1738
1739	1			1	5	use IO::File;
	1				1
	1				144
1740	1			1	16	use File::Basename;
	1				2
	1				52
1741
1742	1			1	5	use vars qw(%fields %enzyme_list);
	1				0
	1				6617
1743
1744						# Define the fields to be used as identifiers.
1745						%fields = (
1746						ref => undef,
1747						gff => undef,
1748						SNPs => undef,
1749						enzyme => undef,
1750						output_dir=> undef,
1751						range_start => 201,
1752						range_end => 500,
1753						sequence_type => '100PE',
1754						lengths_distribution_start =>100,
1755						lengths_distribution_end => 1000,
1756						lengths_distribution_step => 50,
1757						);
1758
1759						# Make the endonuclease enzyme-container--a hash.
1760						%enzyme_list=(
1761						EcoRI => 'G\|AATTC',
1762						HinfI => 'G\|ANTC',
1763						AluI => 'AG\|CT',
1764						AvaII => 'G\|GWCC',
1765						BamHI => 'G\|GATCC',
1766						HindIII => 'A\|AGCTT',
1767						MseI => 'T\|TAA',
1768						MspI => 'C\|CGG',
1769						NdeI => 'CA\|TATG',
1770						PstI => 'CTGCA\|G',
1771						SalI => 'G\|TCGAC',
1772						XbaI => 'T\|CTAGA',
1773						XhoI => 'C\|TCGAG',
1774						NlaIII => 'NCATG\|N',
1775						AatII => 'GACGT\|C',
1776						AccI => 'GT\|MKAC',
1777						Acc65I => 'G\|GTACC',
1778						AciI => 'C\|CGC',
1779						AclI => 'AA\|CGTT',
1780						AfeI => 'AGC\|GCT',
1781						AflII => 'C\|TTAAGA',
1782						AflIII => 'A\|CRYGT',
1783						AgeI => 'A\|CCGGT',
1784						AlwNI => 'CAGNNN\|CTG',
1785						ApaI => 'GGGCC\|C',
1786						ApaLI => 'G\|TGCAC',
1787						ApeKI => 'G\|CWGC',
1788						ApoI => 'R\|AATTY',
1789						AscI => 'GG\|CGCGCC',
1790						AseI => 'AT\|TAAT',
1791						AvaI => 'C\|YCGRG',
1792						BclI => 'T\|GATCA',
1793						AvrII => 'C\|CTAGG',
1794						BaeGI => 'GKGCM\|C',
1795						BanI => 'G\|GYRCC',
1796						BanII => 'GRGCY\|C',
1797						BbvCI => 'CC\|TCAGC',
1798						BsaAI => 'YAC\|GTR',
1799						BsaJI => 'C\|CNNGG',
1800						BsaWI => 'W\|CCGGW',
1801						);
1802
1803						=head1 SUBROUTINES/METHODS
1804
1805						=head2 new
1806
1807						$digest=RestrictionDigest::Single->new();
1808						Create a new object.
1809
1810						=cut
1811
1812						sub new {
1813	0			0		my $class = shift;
1814	0					my $self = {
1815						%fields,
1816						};
1817	0					bless ($self, $class);
1818	0					return $self;
1819						}
1820
1821						=head2 add_ref
1822
1823						$digest->add_ref(-reference=>'Full path of the reference file');
1824						Add the reference file to the RestrictionDigest variable.
1825
1826						=cut
1827
1828						sub add_ref {
1829	0			0		my $self=shift;
1830	0					my %parameters=@_;
1831	0					for(keys %parameters){
1832	0	0				if($_=~/^-reference$/){
1833	0					$self->{ref}=$parameters{$_};
1834						}
1835						else{
1836	0					die "Unacceptable parameters in the method , please perldoc RestrictionDigest for example or read README for more help!\n";
1837						}
1838						}
1839
1840						# Check the format of reference
1841	0					my $ref_fh=IO::File->new("$self->{ref}",'r');
1842	0					my $line_num=0;
1843	0					while(<$ref_fh>){
1844	0					chomp;
1845	0					$line_num++;
1846	0	0				if($line_num % 2 != 0){
1847	0	0				unless($_=~/^>\S+$/){
1848	0					die "The format of reference file is not allowed, please convert it.\nYou can find the example format in the perldoc RestrictionDigest!\n";
1849						}
1850						}
1851	0	0				if($line_num % 2 ==0 ){
1852	0	0				unless($_=~/^[AGTCNRYMKSWHBVD]+$/i){
1853	0					die"The sequences of reference contain some charactors other than AGTCNRYMKSWHBVD, please correct that!\n";
1854						}
1855						}
1856						}
1857	0					$ref_fh->close;
1858						}
1859
1860						=head2 add_single_enzyme
1861
1862						$digest->add_single_enzyme(-enzyme=>'EcoRI');
1863						Add the single enzyme's name used to digest the whole genome.
1864
1865						=cut
1866
1867
1868						sub add_single_enzyme {
1869	0			0		my $self=shift;
1870	0					my %parameters=@_;
1871	0					for (keys %parameters){
1872	0	0				if($_=~/^-enzyme$/){
1873	0					$self->{enzyme}=$parameters{$_};
1874						}
1875						else{
1876	0					die"Unacceptable parameters in the method , please perldoc RestrictionDigest for example or read README for more help!\n";
1877						}
1878						}
1879
1880						# Check the existence of enzymes in the enzyme-container.
1881	0					my $enzyme_exists=0;
1882	0					foreach(keys %enzyme_list){
1883	0	0				if($_=~/^$self->{enzyme}$/i){
1884	0					$enzyme_exists++;
1885						}
1886						}
1887	0	0				if($enzyme_exists >0 ){
1888						}else{
1889	0					die "The single restrict endonuclease User provides does not exist.\nHowever User can add this enzyme and its recognition sites to enzyme-container via the method 'new_enzyme'.\n";
1890						}
1891						}
1892
1893						=head2 new_enzyme
1894
1895						$digest->new_enzyme(-enzyme_name=>'EcoRI', -recognition_site=>'G\|AATTC');
1896
1897						If the enzyme User wants to use does not exists in the default enzyme-pool, the enzyme can be added to the enzyme-pool.
1898
1899						For now, the enzymes in the default enzyme pool are as follows:
1900						EcoRI,HinfI,AluI,AvaII,BamHI,HindIII,MseI,MspI,NdeI,PstI,SalI,XbaI,XhoI,NlaIII.
1901
1902						Adding new enzyme(s) to the enzyme-pool is easy but the enzyme added to the pool is temporary. The added enzyme(s) is active
1903						only in the class context to which the 'new_method' belongs.
1904
1905						=cut
1906
1907						sub new_enzyme {
1908	0			0		my $self=shift;
1909	0					my %parameters=@_;
1910	0					my $new_enzyme_name;
1911						my $new_enzyme_site;
1912	0					for (keys %parameters){
1913	0	0				if($_=~/^-enzyme_name$/){
		0
1914	0					$new_enzyme_name=$parameters{$_};
1915						}
1916						elsif($_=~/^-recognition_site$/){
1917	0					$new_enzyme_site=$parameters{$_};
1918	0	0				unless($new_enzyme_site=~/\|/){
1919	0					die "The 'recognition_site' must contain a cut flag '\|'. Please perldoc RestrictionDigest for example or read README for moer help\n";
1920						}
1921						}
1922						else{
1923	0					die"Unacceptable parameters in the method , please perldoc RestrictionDigest for example or read README for more help\n";
1924						}
1925						}
1926	0					$enzyme_list{$new_enzyme_name}=$new_enzyme_site;
1927						}
1928
1929						=head2 change_range
1930
1931						$digest->change_range(-start=>201, -end=>500);
1932						Usually, not all fragments are needed by the scientists. The fragments in particular range are more attractive to them.
1933						RestrictionDigest provides the sequences and the positions on the reference of these fragments in range. The default range is
1934						[201bp, 500]. User can change this range via the method .
1935
1936						=cut
1937
1938						sub change_range {
1939	0			0		my $self=shift;
1940	0					my %parameters=@_;
1941	0					for(keys %parameters){
1942	0	0				if($_=~/^-start$/){
		0
1943	0					$self->{range_start}=$parameters{$_};
1944						}
1945						elsif($_=~/^-end$/){
1946	0					$self->{range_end}=$parameters{$_};
1947						}
1948						else{
1949	0					die"Unacceptable parameters in the method ,please perldoc RestrictionDigest for example or read README for more help.\n";
1950						}
1951						}
1952	0	0				if($self->{range_start} < $self->{range_end} ) {
1953						}else{
1954	0					die "The first parameter of the method must be smaller than the second parameter\n";
1955						}
1956						}
1957
1958						=head2 change_lengths_distribution_parameters
1959
1960						$digest->change_lengths_distribution_parameters(-front=>100,-behind=>1000,-step=>50);
1961						This program will give a simple summary of the lengths of all fragments after digestion. By default, three scopes, which are
1962						<=100bp, 101bp-1000bp and >1000bp, are given. The number of the fragments whose length fallen in these scopes are given. More
1963						fine sorted, the 101bp-1000bp scope is split by the step of 50bp. These three parameters: the front edge and the behind edge
1964						of the scope and the step can be changed via the method 'change_lengths_distribution_parameters';
1965						=cut
1966
1967						sub change_lengths_distribution_parameters {
1968	0			0		my $self=shift;
1969	0					my %parameters=@_;
1970	0					for (keys %parameters){
1971	0	0				if($_=~/^-front$/){
		0
		0
1972	0					$self->{lengths_distribution_start}=$parameters{$_};
1973						}
1974						elsif($_=~/^-behind$/){
1975	0					$self->{lengths_distribution_end}=$parameters{$_};
1976						}
1977						elsif($_=~/^-step$/){
1978	0					$self->{lengths_distribution_step}=$parameters{$_};
1979						}
1980						else{
1981	0					die"Unacceptable parameters in the method , please perldoc RestrictionDigest for example or read README for more help!\n";
1982						}
1983						}
1984						}
1985
1986
1987
1988						=head2 add_output_dir
1989
1990						$digest->add_output_dir(-output_dir=>'Full path of the output directory');
1991						Adds the directory into which the result files are put by the program.
1992						The directory must be in the absolute path format.
1993
1994						=cut
1995
1996						sub add_output_dir {
1997	0			0		my $self=shift;
1998	0					my %parameters=@_;
1999	0					for(keys %parameters){
2000	0	0				if($_=~/^-output_dir$/){
2001	0					$self->{output_dir}=$parameters{$_};
2002						}
2003						else{
2004	0					die "Unacceptable parameter in the method , please perldoc RestrictionDigest for example or read README for more help!\n";
2005						}
2006						}
2007	0	0				if(-d $self->{output_dir} ) {
2008						}else{
2009	0					die "The output directory User provides does not exist.\n";
2010						}
2011						}
2012
2013						=head2 single_digest
2014
2015						$digest->single_digest();
2016						Execute the digestion process.
2017						This process will produce several result files which will be located in the output directory
2018						added through the add_output_dir method.
2019						This process will take some time. It depends on the size of the reference file.
2020						During the process, Prompting messages will be output to the STDOUT
2021						like "Digestion: >ScaffoldName is done!".
2022
2023						=cut
2024
2025						sub single_digest {
2026
2027	0			0		my $self=shift;
2028	0					my $ref=$self->{ref};
2029	0	0				if($ref){
2030	0					my $name_reference=basename($ref);
2031	0					print "The reference file User provides is:\t",$name_reference,"\n";
2032						}else{
2033	0					die "No reference file provied! Please perldoc RestrictionDigest for example.\n";
2034						}
2035	0					my $enzyme=$self->{enzyme};
2036	0	0				if($enzyme){
2037	0					print "The restrict endonuclease User provides is:\t", $enzyme, "\n";
2038						}
2039						else{
2040	0					die"No restrict endonuclease provided! Please perldoc RestrictionDigest for example.\n";
2041						}
2042	0					my $range1=$self->{range_start};
2043	0					my $range2=$self->{range_end};
2044	0					print "The seperate range of the fragments User wants to output is:\t$range1 to $range2 bp.\n";
2045	0					my $output_dir=$self->{output_dir};
2046	0	0				if($output_dir){
2047	0					print "The output directory User provides is:\t",$output_dir, "\n";
2048						}
2049						else{
2050	0					die "No output directory provided! Please perldoc RestrictionDigest for example.\n";
2051						}
2052	0					$output_dir=~s/^(.+)\/?$/$1/;
2053
2054						# Make the file handle to the reference file.
2055	0					my $ref_fh=IO::File->new("$ref",'r');
2056
2057						# Get the basename of reference,not include the path.
2058	0					my $name_reference=basename($ref);
2059
2060						# Make the file handles of result files.
2061	0					my $all_seq_file_fh=IO::File->new(">>$output_dir/seq_frags_${name_reference}_by_${enzyme}");
2062	0					my $all_loc_file_fh=IO::File->new(">>$output_dir/position_frags_${name_reference}_by_${enzyme}");
2063	0					my $seq_in_range_file_fh=IO::File->new(">>$output_dir/seq_frags_in_range_${name_reference}_by_${enzyme}");
2064	0					my $loc_in_range_file_fh=IO::File->new(">>$output_dir/position_frags_in_range_${name_reference}_by_${enzyme}");
2065	0					my $reduced_ratio_every_scaffold_file_fh=IO::File->new(">>$output_dir/reduced_ratio_every_scaffold_${name_reference}_by_${enzyme}");
2066	0					my $summary_digestion_fh=IO::File->new(">>$output_dir/digestion_summary_${name_reference}_by_${enzyme}");
2067
2068						# Get the base-compositions of the enzyme which User inputs.
2069	0					my $enzyme_seq;
2070	0					foreach(keys %enzyme_list){
2071	0	0				if($_=~/^$enzyme$/i){
2072	0					$enzyme_seq=$enzyme_list{$_};
2073						}
2074						}
2075
2076						#get the relative location of cutting of the recognation sequences;
2077	0					my @enzyme_rec_seq=split //, $enzyme_seq;
2078	0					my @enzyme_cut_seq;
2079	0					my $enzyme_rec_start=0;
2080	0					my $enzyme_cut_loc;
2081
2082	0					foreach(@enzyme_rec_seq){
2083	0					$enzyme_rec_start++;
2084	0	0				if($_=~/\\|/){
2085	0					$enzyme_cut_loc=$enzyme_rec_start;
2086						}
2087						else{
2088	0					push @enzyme_cut_seq,$_;
2089						}
2090						}
2091
2092						# Replace the degenerate bases of the enzyme;
2093	0					my $num_of_enzyme_cut_seq=@enzyme_cut_seq;
2094	0					for(@enzyme_cut_seq){
2095	0	0				if($_=~/N/){$_="[ATCG]";}
	0
2096	0	0				if($_=~/R/){$_="[AG]";}
	0
2097	0	0				if($_=~/Y/){$_="[CT]";}
	0
2098	0	0				if($_=~/M/){$_="[AC]";}
	0
2099	0	0				if($_=~/K/){$_="[GT]";}
	0
2100	0	0				if($_=~/S/){$_="[GC]";}
	0
2101	0	0				if($_=~/W/){$_="[AT]";}
	0
2102	0	0				if($_=~/H/){$_="[ATC]";}
	0
2103	0	0				if($_=~/B/){$_="[GTC]";}
	0
2104	0	0				if($_=~/V/){$_="[GAC]";}
	0
2105	0	0				if($_=~/D/){$_="[GAT]";}
	0
2106						}
2107	0					my $seq_string_enzyme=join '', @enzyme_cut_seq;
2108
2109
2110
2111						# Make the scalars and arrays used in the whole programme;
2112	0					my $rate_overall=0;
2113	0					my $rate_in_range=0;
2114	0					my @lengths_of_fragments=();
2115	0					my @lengths_fragments_in_range=();
2116	0					my $overall_fragments_length=0;
2117	0					my $overall_fragments_in_range_length=0;
2118	0					my $overall_length_of_scfd=0;
2119
2120						########## CIRCLES START HERE!###########
2121
2122						# Process one scaffold one circle;
2123	0					my ($scaffold_name, $scaffold_seq);
2124	0					while(<$ref_fh>){
2125	0					chomp;
2126	0	0				if($_=~/^>/){
2127	0					$scaffold_name=$_;
2128						}
2129						else{
2130	0					print"Digesting $scaffold_name...\t";
2131	0					$scaffold_seq=$_;
2132	0					$reduced_ratio_every_scaffold_file_fh->print("$scaffold_name\t");
2133	0					my $length_of_scafd=length $scaffold_seq;
2134						# Set the array which contains cut locations of the enzyme.
2135	0					my @enzyme_locs=();
2136						# Find locs of recognition of the enzyme in this scaffold;the locs are meaningful in this array; not in the human context;
2137	0					my $seq_for_search=$scaffold_seq;
2138	0					my $string=$seq_string_enzyme;
2139						# Find the recognition sites of this enzyme in this scaffold.
2140	0	0				if($string=~/^(\w+)$/){
		0
		0
		0
		0
		0
2141	0					my $loc_in_array;
2142	0					$loc_in_array=index($seq_for_search,$string);
2143	0	0				unless($loc_in_array==-1){
2144	0					push @enzyme_locs, $loc_in_array;
2145						}
2146	0					while($loc_in_array!=-1){
2147	0					$loc_in_array=index($seq_for_search, $string, $loc_in_array+1);
2148	0	0				unless($loc_in_array==-1){
2149	0					push @enzyme_locs, $loc_in_array;
2150						}
2151						}
2152						}
2153						elsif($string=~/(\w+)\[(\w+)\](\w+)/){
2154	0					my $breast_bases=$1;
2155	0					my $back_bases=$3;
2156	0					my $generate_bases=$2;
2157	0					my @generate_bases=split //, $generate_bases;
2158	0					my $seq_for_search=$scaffold_seq;
2159	0					for my $one_base(@generate_bases){
2160	0					$string=$breast_bases.$one_base.$back_bases;
2161	0					my $loc_in_array;
2162	0					$loc_in_array=index($seq_for_search,$string);
2163	0	0				unless($loc_in_array==-1){
2164	0					push @enzyme_locs, $loc_in_array;
2165						}
2166	0					while($loc_in_array!=-1){
2167	0					$loc_in_array=index($seq_for_search, $string, $loc_in_array+1);
2168	0	0				unless($loc_in_array==-1){
2169	0					push @enzyme_locs, $loc_in_array;
2170						}
2171						}
2172						}
2173						}
2174
2175						elsif($string=~/^\[(\w+)\](\w+)\[(\w+)\]$/){
2176	0					my $g1_bases=$1;
2177	0					my $middle=$2;
2178	0					my $g2_bases=$3;
2179	0					my @g1_bases=split //, $g1_bases;
2180	0					my @g2_bases=split //, $g2_bases;
2181	0					for my $one_base(@g1_bases){
2182	0					for my $two_base(@g2_bases){
2183	0					$string=$one_base.$middle.$two_base;
2184	0					my $loc_in_array;
2185	0					$loc_in_array=index($seq_for_search,$string);
2186	0	0				unless($loc_in_array==-1){
2187	0					push @enzyme_locs, $loc_in_array;
2188						}
2189	0					while($loc_in_array!=-1){
2190	0					$loc_in_array=index($seq_for_search, $string, $loc_in_array+1);
2191	0	0				unless($loc_in_array==-1){
2192	0					push @enzyme_locs, $loc_in_array;
2193						}
2194						}
2195						}
2196						}
2197						}
2198
2199						elsif($string=~/(\w+)\[(\w+)\](\w+)\[(\w+)\](\w+)/){
2200	0					my $breast_bases=$1;
2201	0					my $g1_bases=$2;
2202	0					my $middle=$3;
2203	0					my $back_bases=$5;
2204	0					my $g2_bases=$4;
2205	0					my @g1_bases=split //, $g1_bases;
2206	0					my @g2_bases=split //, $g2_bases;
2207	0					for my $one_base(@g1_bases){
2208	0					for my $two_base(@g2_bases){
2209	0					$string=$breast_bases.$one_base.$middle.$two_base.$back_bases;
2210	0					my $loc_in_array;
2211	0					$loc_in_array=index($seq_for_search,$string);
2212	0	0				unless($loc_in_array==-1){
2213	0					push @enzyme_locs, $loc_in_array;
2214						}
2215	0					while($loc_in_array!=-1){
2216	0					$loc_in_array=index($seq_for_search, $string, $loc_in_array+1);
2217	0	0				unless($loc_in_array==-1){
2218	0					push @enzyme_locs, $loc_in_array;
2219						}
2220						}
2221						}
2222						}
2223						}
2224						elsif($string=~/(\w+)\[(\w+)\]\[(\w+)\]\[(\w+)\](\w+)/){
2225	0					my $breast_bases=$1;
2226	0					my $g1_bases=$2;
2227	0					my $g2_bases=$3;
2228	0					my $back_bases=$5;
2229	0					my $g3_bases=$4;
2230	0					my @g1_bases=split //, $g1_bases;
2231	0					my @g2_bases=split //, $g2_bases;
2232	0					my @g3_bases=split //, $g3_bases;
2233	0					for my $one_base(@g1_bases){
2234	0					for my $two_base(@g2_bases){
2235	0					for my $three_base(@g3_bases){
2236	0					$string=$breast_bases.$one_base.$two_base.$three_base.$back_bases;
2237	0					my $loc_in_array;
2238	0					$loc_in_array=index($seq_for_search,$string);
2239	0	0				unless($loc_in_array==-1){
2240	0					push @enzyme_locs, $loc_in_array;
2241						}
2242	0					while($loc_in_array!=-1){
2243	0					$loc_in_array=index($seq_for_search, $string, $loc_in_array+1);
2244	0	0				unless($loc_in_array==-1){
2245	0					push @enzyme_locs, $loc_in_array;
2246						}
2247						}
2248						}
2249						}
2250						}
2251						}
2252						elsif($string=~/(\w+)\[(\w+)\]\[(\w+)\](\w+)/){
2253	0					my $breast_bases=$1;
2254	0					my $g1_bases=$2;
2255	0					my $g2_bases=$3;
2256	0					my $back_bases=$4;
2257	0					my @g1_bases=split //, $g1_bases;
2258	0					my @g2_bases=split //, $g2_bases;
2259	0					for my $one_base(@g1_bases){
2260	0					for my $two_base(@g2_bases){
2261	0					$string=$breast_bases.$one_base.$two_base.$back_bases;
2262	0					my $loc_in_array;
2263	0					$loc_in_array=index($seq_for_search,$string);
2264	0	0				unless($loc_in_array==-1){
2265	0					push @enzyme_locs, $loc_in_array;
2266						}
2267	0					while($loc_in_array!=-1){
2268	0					$loc_in_array=index($seq_for_search, $string, $loc_in_array+1);
2269	0	0				unless($loc_in_array==-1){
2270	0					push @enzyme_locs, $loc_in_array;
2271						}
2272						}
2273						}
2274						}
2275						}
2276
2277						# Sort the array.
2278	0					@enzyme_locs=sort {$a<=>$b} @enzyme_locs;
	0
2279
2280
2281						# Produce the fragments' loc-pair.
2282	0					my @fragments_loc_pair;
2283	0					my ($front_rec_loc, $behind_rec_loc)=qw(0 0);
2284	0	0				if(@enzyme_locs) {
2285	0					while(@enzyme_locs){
2286	0					$behind_rec_loc=shift @enzyme_locs;
2287						# Push the front end loc;
2288	0	0				if($front_rec_loc==0){
2289	0					push @fragments_loc_pair, $front_rec_loc;
2290						}
2291						else{
2292	0					push @fragments_loc_pair, $front_rec_loc+$enzyme_cut_loc-1;
2293						}
2294						# Push the behind end loc;
2295	0					push @fragments_loc_pair, $behind_rec_loc+$enzyme_cut_loc-2;
2296
2297						# Check whether the @enzyme_locs is empty.
2298	0	0				if(@enzyme_locs){
2299	0					$front_rec_loc=$behind_rec_loc;
2300						}
2301						else{
2302	0					push @fragments_loc_pair, $behind_rec_loc+$enzyme_cut_loc-1;
2303	0					push @fragments_loc_pair, $length_of_scafd-1;
2304						}
2305						}
2306						}
2307
2308						# Output the fragments' locs of this scaffold.
2309
2310						# Define the scalars which contains the length of fragments;
2311	0					my $nums_of_fragments=@fragments_loc_pair/2;
2312	0					my $length_of_fragments=0;
2313	0					my $length_of_fragments_in_range=0;
2314	0					my $count_all=0;
2315	0					my $count_in_range=0;
2316	0					my ($before_cut_coorr,$after_cut_coorr,$bf_ct_crr_hm_rd,$af_ct_crr_hm_rd);
2317
2318
2319						# Attentions here!!! the output sequences of fragments are in accordance with the results of illumian sequencing.
2320						# For removing the sequencing primers, read1 and read2 can align with the fragments here directly!!!
2321						# That is to say, fragments here contain the restriction overhangs!!!
2322
2323	0					while(@fragments_loc_pair){
2324	0					$count_all++;
2325	0					my $front_end_loc=shift @fragments_loc_pair;
2326	0					my $behind_end_loc=shift @fragments_loc_pair;
2327	0					my $length_of_fragment=$behind_end_loc-$front_end_loc+1;
2328	0					my $seq_selected=substr($scaffold_seq, $front_end_loc, $length_of_fragment);
2329
2330						# Output the sequence and fragment positions of all-fragments to files;
2331	0					$all_seq_file_fh->print("$scaffold_name-$count_all\n$seq_selected\n");
2332	0					my $front_loc_on_scaffold=$front_end_loc+1;
2333	0					my $behind_loc_on_scaffold=$behind_end_loc+1;
2334
2335	0					$all_loc_file_fh->print("$scaffold_name-$count_all\t$front_loc_on_scaffold\t$behind_loc_on_scaffold\t$length_of_fragment\n");
2336	0					push @lengths_of_fragments, $length_of_fragment;
2337
2338						# We treat the fragments in range in the same way of all fragments as above;
2339	0	0				if($length_of_fragment>=$range1){
2340	0	0				if($length_of_fragment<=$range2){
2341	0					$count_in_range++;
2342	0					$seq_in_range_file_fh->print("$scaffold_name-$count_all\n$seq_selected\n");
2343	0					$loc_in_range_file_fh->print("$scaffold_name-$count_all\t$front_loc_on_scaffold\t$behind_loc_on_scaffold\t$length_of_fragment\n");
2344	0					push @lengths_fragments_in_range,$length_of_fragment;
2345	0					$length_of_fragments_in_range+=$length_of_fragment;
2346						}
2347						}
2348
2349						# Count all fragments length no matter in range or not!
2350	0					$length_of_fragments+=$length_of_fragment;
2351						}
2352
2353
2354	0					my $reduced_rate=$length_of_fragments/$length_of_scafd;
2355	0					my $fragment_in_range_rate_scafd=$length_of_fragments_in_range/$length_of_scafd;
2356	0					$overall_fragments_length+=$length_of_fragments;
2357	0					$overall_fragments_in_range_length+=$length_of_fragments_in_range;
2358	0					$overall_length_of_scfd+=$length_of_scafd;
2359	0					$reduced_ratio_every_scaffold_file_fh->print("$reduced_rate\t$fragment_in_range_rate_scafd\n");
2360	0					print"Done!\n";
2361						}
2362						}
2363	0					my $length_ratio_all_frags=$overall_fragments_length/$overall_length_of_scfd;
2364	0					my $length_ratio_frags_in_range=$overall_fragments_in_range_length/$overall_length_of_scfd;
2365	0					my $num_all_frags=@lengths_of_fragments;
2366	0					my $num_frags_in_range=@lengths_fragments_in_range;
2367	0					my $lengths_fragments_in_range=\@lengths_fragments_in_range;
2368
2369
2370						# Output the summary of digestion.
2371	0					$summary_digestion_fh->print("Number of all fragments\t$num_all_frags\n");
2372	0					$summary_digestion_fh->print("Number of fragments in range\t$num_frags_in_range\n");
2373	0					$summary_digestion_fh->print("Ratio after reducing of all fragments\t$length_ratio_all_frags\n");
2374	0					$summary_digestion_fh->print("Ratio after reducing of fragments in range\t$length_ratio_frags_in_range\n");
2375
2376	0					my $lengths_distribution_start=$self->{lengths_distribution_start};
2377	0					my $lengths_distribution_end=$self->{lengths_distribution_end};
2378	0					my $lengths_distribution_step=$self->{lengths_distribution_step};
2379	0					my $lengths_distribution_tmp;
2380	0					my (@starts,@ends,@counts,%lengths_distribution);
2381
2382	0					my $num_pair=0;
2383
2384	0					for($lengths_distribution_tmp= $lengths_distribution_start;$lengths_distribution_tmp<$lengths_distribution_end+$lengths_distribution_step;$lengths_distribution_tmp=$lengths_distribution_tmp+$lengths_distribution_step){
2385	0	0				if($lengths_distribution_tmp== $lengths_distribution_start){
		0
2386	0					$num_pair++;
2387	0					push @starts, '0';
2388	0					push @ends , $lengths_distribution_tmp;
2389	0					$lengths_distribution{$num_pair}{"0bp-${lengths_distribution_tmp}bp"}=0;
2390
2391	0					$num_pair++;
2392	0					my $tmp_start=$lengths_distribution_tmp+1;
2393	0					my $tmp_end=$lengths_distribution_tmp+$lengths_distribution_step;
2394	0					push @starts, $tmp_start;
2395	0					push @ends, $tmp_end;
2396	0					$lengths_distribution{$num_pair}{"${tmp_start}bp-${tmp_end}bp"}=0;
2397
2398						}
2399						elsif($lengths_distribution_tmp < $lengths_distribution_end ){
2400	0	0				if($lengths_distribution_tmp+$lengths_distribution_step <$lengths_distribution_end ){
		0
2401	0					$num_pair++;
2402	0					my $tmp_start=$lengths_distribution_tmp+1;
2403	0					my $tmp_end=$lengths_distribution_tmp+$lengths_distribution_step;
2404	0					push @starts, $tmp_start;
2405	0					push @ends, $tmp_end;
2406	0					$lengths_distribution{$num_pair}{"${tmp_start}bp-${tmp_end}bp"}=0;
2407
2408						}
2409						elsif($lengths_distribution_tmp +$lengths_distribution_step >= $lengths_distribution_end){
2410	0					$num_pair++;
2411	0					my $tmp_start=$lengths_distribution_tmp+1;
2412	0					my $tmp_end=$lengths_distribution_end;
2413	0					push @starts, $tmp_start;
2414	0					push @ends, $tmp_end;
2415	0					$lengths_distribution{$num_pair}{"${tmp_start}bp-${tmp_end}bp"}=0;
2416
2417	0					$num_pair++;
2418	0					$tmp_start=$tmp_end+1;
2419
2420	0					push @starts, $tmp_start;
2421	0					push @ends, 'bigger';
2422	0					$lengths_distribution{$num_pair}{"${tmp_start}bp-bigger bp"}=0;
2423
2424						}
2425						}
2426						}
2427
2428
2429
2430	0					for (@lengths_of_fragments){
2431	0					my $length=$_;
2432	0					my $num_pair=1;
2433	0					my $num_last_pair=@starts;
2434	0					for($num_pair=1;$num_pair<=$num_last_pair;$num_pair++){
2435
2436	0					my $array_index=$num_pair-1;
2437	0					my $start=$starts[$array_index];
2438	0					my $end=$ends[$array_index];
2439	0	0				if($num_pair<$num_last_pair){
		0
2440	0	0	0			if($length>=$start && $length<=$end){
2441	0					for my $description (keys %{$lengths_distribution{$num_pair}}){
	0
2442	0					$lengths_distribution{$num_pair}{$description}++;
2443	0					last;
2444						}
2445	0					last;
2446						}
2447						}
2448						elsif($num_pair==$num_last_pair){
2449	0	0				if($length>=$start){
2450	0					for my $description (keys %{$lengths_distribution{$num_pair}}){
	0
2451	0					$lengths_distribution{$num_pair}{$description}++;
2452	0					last;
2453						}
2454	0					last;
2455						}
2456						}
2457						}
2458						}
2459
2460	0					$summary_digestion_fh->print("\nLengths\' scope\tNumber of fragments in this scope\tRatio of these fragments in all\n");
2461
2462	0					for(1..@starts){
2463	0					my $num_pair=$_;
2464	0					for my $description(keys %{$lengths_distribution{$num_pair}}){
	0
2465	0					my $num_fragments=$lengths_distribution{$num_pair}{$description};
2466	0					my $ratio;
2467	0	0				if($num_all_frags>0){
2468	0					$ratio=$num_fragments/$num_all_frags;
2469						}
2470						else{
2471	0					$ratio='N/A';
2472						}
2473	0					$summary_digestion_fh->print("$description\t$num_fragments\t$ratio\n");
2474						}
2475						}
2476	0					$summary_digestion_fh->print("\n");
2477						}
2478
2479						=head2 add_SNPs
2480
2481						$digest->add_SNPs(-SNPs=>'Full path of the SNPs file');
2482						Adds the absolute path of the SNPs file to RestrictionDigest variable.
2483						=cut
2484
2485						sub add_SNPs {
2486	0			0		my $self=shift;
2487	0					my %parameters=@_;
2488	0					for (keys %parameters){
2489	0	0				if($_=~/^-SNPs$/){
2490	0					$self->{SNPs}=$parameters{$_};
2491						}
2492						else{
2493	0					die "Unacceptable parameters of the method , please perldoc RestrictionDigest for example or read README for more help\n";
2494						}
2495						}
2496						}
2497
2498						=head2 count_SNPs_at_fragments
2499
2500						$digest->count_SNPs_at_fragments(-sequence_type=>'100PE');
2501						Count the expected SNPs appeared in the framgents.
2502						=cut
2503
2504						sub count_SNPs_at_fragments {
2505	0			0		my $self=shift;
2506	0					my %parameters=@_;
2507
2508	0	0				if(keys %parameters){
2509	0					for(keys %parameters){
2510	0	0				if($_=~/^-sequence_type$/){
2511	0					$self->{sequence_type}=$parameters{$_};
2512						}
2513						else{
2514	0					die "Unacceptable parameters in the method , please perldoc RestrictionDigest for example or read README for more help\n";
2515						}
2516						}
2517						}
2518
2519	0					my $ref=$self->{ref};
2520	0					my $enzyme=$self->{enzyme};
2521	0					my $range1=$self->{range_start};
2522	0					my $SNPs=$self->{SNPs};
2523	0	0				if($SNPs){
2524	0					my $SNPs_basename=basename($SNPs);
2525	0					print "The SNPs file provided is\t", $SNPs_basename, ".\n";
2526						}
2527						else{
2528	0					die"No SNPs file provided! Please perldoc RestrictionDigest for example.\n";
2529						}
2530	0					my $sequence_type=$self->{sequence_type};
2531
2532						# Check the sequencing type and determine the domains of the fragments to calculate the SNPs number.
2533	0					my ($pe_sequence_length, $se_sequence_length);
2534	0					my $sequence_type_code;
2535	0	0				if($sequence_type=~/^(\d+)pe$/i){
		0
2536	0					$sequence_type_code=2;
2537	0					my $sequence_length=$1;
2538	0	0				unless($sequence_length*2 <= $range1){
2539	0					die"Unacceptable sequence_type when range's front border is $range1 bp. Please perldoc RestrictionDigest for example or read README for more help!\n";
2540						}
2541						else{
2542	0					$pe_sequence_length=$sequence_length;
2543						}
2544						}
2545						elsif($sequence_type=~/^(\d+)se$/i){
2546	0					$sequence_type_code=1;
2547	0					my $sequence_length=$1;
2548	0	0				unless($sequence_length<=$range1){
2549	0					die "Unacceptable sequence_type when range's front border is $range1 bp. Please perldoc RestrictionDigest for example or read README for more help!\n";
2550						}
2551						else{
2552	0					$se_sequence_length=$sequence_length;
2553						}
2554						}
2555						else{
2556	0					die "Unacceptable sequence_type. Please perldoc RestrictionDigest for example or read README for more help!\n";
2557						}
2558
2559	0					print "The sequencing type provided is\t", $sequence_type, ".\n";
2560
2561	0					my $output_dir=$self->{output_dir};
2562	0					$output_dir=~s/^(.+)\/?$/$1/;
2563
2564						# Get the basename of reference,not include the path.
2565	0					my $name_reference=basename($ref);
2566
2567						# Make the file handle to the summary file;
2568	0					my $summary_digestion_fh=IO::File->new(">>$output_dir/digestion_summary_${name_reference}_by_${enzyme}");
2569
2570						# Make the file handle to the locs file of all fragments and fragments in range.
2571	0					my $all_loc_file_fh=IO::File->new("$output_dir/position_frags_${name_reference}_by_${enzyme}",'r');
2572	0					my $loc_in_range_file_fh=IO::File->new("$output_dir/position_frags_in_range_${name_reference}_by_${enzyme}",'r');
2573
2574						# Make the file handle to the SNPs file;
2575	0					my $SNPs_fh=IO::File->new("$SNPs",'r');
2576	0					my %SNPs;
2577	0					while(<$SNPs_fh>){
2578	0					chomp;
2579	0					my ($SNPs_scaffold_name,$SNPs_pos,$type)=split /\s+/, $_;
2580	0					push @{$SNPs{$SNPs_scaffold_name}}, $SNPs_pos;
	0
2581						}
2582
2583						# Create the SNPs file to hold SNPs of all fragments and fragments in range when sequencing through pair-end.
2584	0					my($SNPs_at_all_frags_fh,$SNPs_at_frags_in_range_fh);
2585	0	0				if($sequence_type_code==2){
2586	0					$SNPs_at_all_frags_fh=IO::File->new(">>$output_dir/SNPs_at_all_frags_${name_reference}_by_${enzyme}");
2587	0					$SNPs_at_frags_in_range_fh=IO::File->new(">>$output_dir/SNPs_at_frags_in_range_${name_reference}_by_${enzyme}");
2588						}
2589
2590	0					print "Counting SNPs at the output fragments...\t";
2591						# Scan all locs file to count SNPs and output the result to the summary file.
2592	0					my $all_SNPs_number=0;
2593	0	0				if($sequence_type_code == 1){
		0
2594	0					while(<$all_loc_file_fh>){
2595	0					chomp;
2596	0					my($fragment_name,$pos1,$pos2,$length)=split /\t/, $_;
2597	0					my $scaffold_name;
2598	0	0				if($fragment_name=~/^>(.+)-(\d+)/){
2599	0					$scaffold_name=$1;
2600						}
2601	0	0				if($SNPs{$scaffold_name}){
2602	0					for my $SNP (@{$SNPs{$scaffold_name}}){
	0
2603	0	0	0			if( ($SNP>=$pos1 && $SNP<=($pos1+$se_sequence_length-1)) \|\| ( $SNP>= ($pos2-$se_sequence_length+1) && $SNP<=$pos2 ) ){
			0
			0
2604	0					$all_SNPs_number++;
2605						}
2606						}
2607						}
2608						}
2609	0					$all_SNPs_number=int($all_SNPs_number/2);
2610	0					$summary_digestion_fh->print("Expected SNPs at all fragments via $sequence_type:\t$all_SNPs_number\n");
2611						}
2612						elsif($sequence_type_code ==2){
2613	0					while(<$all_loc_file_fh>){
2614	0					chomp;
2615	0					my($fragment_name,$pos1,$pos2,$length)=split /\t/, $_;
2616	0					my $scaffold_name;
2617	0	0				if($fragment_name=~/^>(.+)-(\d+)/){
2618	0					$scaffold_name=$1;
2619						}
2620	0	0				if($SNPs{$scaffold_name}){
2621	0					for my $SNP (@{$SNPs{$scaffold_name}}){
	0
2622	0	0	0			if( ($SNP>=$pos1 && $SNP<=($pos1+$pe_sequence_length-1)) \|\| ( $SNP>= ($pos2-$pe_sequence_length+1) && $SNP<=$pos2 ) ){
			0
			0
2623	0					$all_SNPs_number++;
2624	0					$SNPs_at_all_frags_fh->print("$scaffold_name\t$SNP\n");
2625						}
2626						}
2627						}
2628						}
2629	0					$summary_digestion_fh->print("Expected SNPs at all fragments via $sequence_type:\t$all_SNPs_number\n");
2630						}
2631						# Scan locs in range file to count SNPs and output the result to the summary file.
2632
2633	0					my $SNPs_in_range_number=0;
2634
2635	0	0				if($sequence_type_code == 1){
		0
2636	0					while(<$loc_in_range_file_fh>){
2637	0					chomp;
2638	0					my($fragment_name,$pos1,$pos2,$length)=split /\t/, $_;
2639	0					my $scaffold_name;
2640	0	0				if($fragment_name=~/^>(.+)-(\d+)/){
2641	0					$scaffold_name=$1;
2642						}
2643	0	0				if($SNPs{$scaffold_name}){
2644	0					for my $SNP (@{$SNPs{$scaffold_name}}){
	0
2645	0	0	0			if( ($SNP>=$pos1 && $SNP<=($pos1+$se_sequence_length-1)) \|\| ( $SNP>= ($pos2-$se_sequence_length+1) && $SNP<=$pos2 ) ){
			0
			0
2646	0					$SNPs_in_range_number++;
2647						}
2648						}
2649						}
2650						}
2651	0					$SNPs_in_range_number=int($SNPs_in_range_number/2);
2652	0					$summary_digestion_fh->print("Expected SNPs at fragments in range via $sequence_type:\t$SNPs_in_range_number\n");
2653						}
2654						elsif($sequence_type_code ==2){
2655	0					while(<$loc_in_range_file_fh>){
2656	0					chomp;
2657	0					my($fragment_name,$pos1,$pos2,$length)=split /\t/, $_;
2658	0					my $scaffold_name;
2659	0	0				if($fragment_name=~/^>(.+)-(\d+)/){
2660	0					$scaffold_name=$1;
2661						}
2662	0	0				if($SNPs{$scaffold_name}){
2663	0					for my $SNP (@{$SNPs{$scaffold_name}}){
	0
2664	0	0	0			if( ($SNP>=$pos1 && $SNP<=($pos1+$pe_sequence_length-1)) \|\| ( $SNP>= ($pos2-$pe_sequence_length+1) && $SNP<=$pos2 ) ){
			0
			0
2665	0					$SNPs_in_range_number++;
2666	0					$SNPs_at_frags_in_range_fh->print("$scaffold_name\t$SNP\n");
2667						}
2668						}
2669						}
2670						}
2671	0					$summary_digestion_fh->print("Expected SNPs at fragments in range via $sequence_type:\t$SNPs_in_range_number\n");
2672						}
2673	0					print "Done!\n";
2674						}
2675
2676						=head2 add_gff
2677
2678						$digest->add_gff(-ref=>'Full path of the gff file');
2679						Adds the absolute path of gff file to RestrictionDigest variable.
2680						If User want to calculate the coverage ratio of different genomic strutures by the digested
2681						fragments, this method is obligatory.
2682
2683						=cut
2684
2685						sub add_gff {
2686	0			0		my $self=shift;
2687	0					my %parameters=@_;
2688	0					for (keys %parameters){
2689	0	0				if($_=~/^-gff$/){
2690	0					$self->{gff}=$parameters{$_};
2691						}
2692						else{
2693	0					die"Unacceptable parameters of the method , please perldoc RestrictionDigest for example or read README for more help\n";
2694						}
2695						}
2696						}
2697
2698						=head2 all_frags_coverage_ratio
2699
2700						$digest->all_frags_coverage_ratio();
2701						Calculate the coverage ratio of different parts of the genome reference,
2702						including mRNA, CDS, UTR and intergenic region, by all digested fragments.
2703						This process will take a very long time, please be patient.
2704						=cut
2705
2706						sub all_frags_coverage_ratio {
2707	0			0		my $self=shift;
2708
2709	0					my $ref=$self->{ref};
2710	0					my $basename_ref=basename($ref);
2711
2712	0					my $enzyme=$self->{enzyme};
2713
2714	0					my $output_dir=$self->{output_dir};
2715	0					$output_dir=~s/^(.+)\/?$/$1/;
2716
2717	0					my $loc_file_all_frags="$output_dir/position_frags_${basename_ref}_by_${enzyme}";
2718
2719	0					$self->genome_structure_coverage_ratio($loc_file_all_frags);
2720						}
2721
2722						=head2 frags_in_range_coverage_ratio
2723
2724						$digest->frags_in_range_coverage_ratio();
2725						Calculate the coverage ratio of different parts of the genome reference,
2726						including mRNA, CDS, UTR and intergenic region, by the fragments in range.
2727						This process will take a very long time, please be patient.
2728						=cut
2729
2730						sub frags_in_range_coverage_ratio {
2731	0			0		my $self=shift;
2732
2733	0					my $ref=$self->{ref};
2734	0					my $basename_ref=basename($ref);
2735
2736	0					my $enzyme=$self->{enzyme};
2737
2738	0					my $output_dir=$self->{output_dir};
2739	0					$output_dir=~s/^(.+)\/?$/$1/;
2740
2741	0					my $loc_file_all_frags="$output_dir/position_frags_in_range_${basename_ref}_by_${enzyme}";
2742
2743	0					$self->genome_structure_coverage_ratio($loc_file_all_frags);
2744						}
2745
2746						=head2 genome_structure_coverage_ratio
2747
2748						A subroutine invoked by the 'all_frags_coverage_ratio' and
2749						the 'frags_in_range_coverage_ratio' methods.
2750						User do not use this subroutine.
2751
2752						=cut
2753
2754
2755						sub genome_structure_coverage_ratio {
2756	0			0		my $self=shift;
2757	0					my $loc_frags_file=shift;
2758
2759						# Get the reference name, enzyme names from hash.
2760	0					my $ref=$self->{ref};
2761	0					my $ref_fh=IO::File->new($ref,'r');
2762
2763	0					my $basename_ref=basename($ref);
2764	0					my $enzyme=$self->{enzyme};
2765	0					my $output_dir=$self->{output_dir};
2766	0					$output_dir=~s/^(.+)\/?$/$1/;
2767
2768						# Get the full path and name of gff file and make a filehandle to it.
2769	0					my $gff=$self->{gff};
2770
2771	0	0				if($gff){
2772	0					my $gff_basename=basename($gff);
2773	0					print "The GFF file provided is\t", $gff_basename, ".\n";
2774						}
2775						else{
2776	0					die"No GFF file provided! Please perldoc RestrictionDigest for example.\n";
2777						}
2778
2779
2780						# Make output filehandles to generic region and intergenic region coverage ratio files.
2781	0					my $coverage_ratio_fh;
2782
2783	0					my $basename_loc_frags_file=basename($loc_frags_file);
2784
2785	0	0				if($basename_loc_frags_file =~/in_range/){
2786	0					$coverage_ratio_fh=IO::File->new(">>$output_dir/genome_coverage_ratio_in_range_${basename_ref}_by_${enzyme}");
2787						}
2788						else{
2789	0					$coverage_ratio_fh=IO::File->new(">>$output_dir/genome_coverage_ratio_${basename_ref}_by_${enzyme}");
2790						}
2791
2792	0					$coverage_ratio_fh->print("ScaffoldName\tIntergenicLength\tIntergenicMapLength\tIntergenicMapRatio\tGenesLength\tGenesMapLength\tGenesMapRatio\tExonsLength\tExonsMapLength\tExonsMapRatio\tIntronsLength\tIntronsMapLength\tIntronsMapRatio\n");
2793
2794
2795						# Set variables used in this program.
2796	0					my($all_intergenic_length,$all_intergenic_map_length,$all_gene_length,$all_gene_map_length,$all_exon_length,$all_exon_map_length,$all_intron_length,$all_intron_map_length)=qw(0 0 0 0 0 0 0 0);
2797
2798						# Fetch the lengths of different parts of this scaffold from the GFF file.
2799
2800	0					my $gff_fh=IO::File->new($gff,'r');
2801	0					my %gff;
2802	0					print "Scanning the GFF file...\t";
2803	0					while(<$gff_fh>){
2804	0					chomp;
2805	0	0				unless($_=~/^#/){
2806	0					my @gff=split /\t/, $_;
2807	0	0				if(@gff == 9){
2808	0					my ($gff_scfd_name,$gff_type,$gff_start,$gff_stop)=@gff[0,2,3,4];
2809	0	0				if($gff_type=~/CDS\|exon/){
		0
2810	0					$gff_type='exon';
2811	0	0				if($gff{$gff_scfd_name}{$gff_type}){
2812	0	0				unless(grep {$gff_start == $_} @{$gff{$gff_scfd_name}{$gff_type}}){
	0
	0
2813	0					push @{$gff{$gff_scfd_name}{$gff_type}}, ($gff_start,$gff_stop);
	0
2814						}
2815						}
2816						else{
2817	0					push @{$gff{$gff_scfd_name}{$gff_type}},($gff_start,$gff_stop);
	0
2818						}
2819						}
2820						elsif($gff_type=~/mRNA\|gene/){
2821	0					$gff_type='gene';
2822	0	0				if($gff{$gff_scfd_name}{$gff_type}){
2823	0	0				unless(grep {$gff_start == $_} @{$gff{$gff_scfd_name}{$gff_type}}){
	0
	0
2824	0					push @{$gff{$gff_scfd_name}{$gff_type}},($gff_start,$gff_stop);
	0
2825						}
2826						}
2827						else{
2828	0					push @{$gff{$gff_scfd_name}{$gff_type}},($gff_start,$gff_stop);
	0
2829						}
2830						}
2831						}
2832						}
2833						}
2834	0					$gff_fh->close;
2835	0					print"Done!\n";
2836
2837						# Scan the loc file and fetch the information.
2838
2839	0					my $loc_frags_file_fh=IO::File->new($loc_frags_file,'r');
2840	0					my %all_locs;
2841	0					print "Scanning the positions file...\t";
2842	0					while(<$loc_frags_file_fh>){
2843	0					chomp;
2844	0					my($frgt_name,$start_coor,$stop_coor,$length)=split /\t/,$_;
2845	0					my $scfd_frgt;
2846	0	0				if($frgt_name=~/^>(.+)-\d+$/){
2847	0					$scfd_frgt=$1;
2848	0					push @{$all_locs{$scfd_frgt}}, ($start_coor, $stop_coor);
	0
2849						}
2850						}
2851	0					$loc_frags_file_fh->close;
2852	0					print"Done!\n";
2853
2854
2855
2856						# Handle every scaffold one by one.
2857	0					my($scfd_name);
2858	0					while(<$ref_fh>){
2859	0					chomp;
2860	0	0				if($_=~/^>(\S+)$/){
2861	0					$scfd_name=$1;
2862						}
2863						else{
2864	0					my $scaffold_length=length $_;
2865	0					print "CoverageRatio: Processing $scfd_name...\t";
2866
2867
2868						# Calculate the lengths of different parts of this scaffold.
2869	0					my ($exon_length,$intron_length,$gene_length,$intergenic_length)=qw(0 0 0 0);
2870
2871	0	0				if($gff{$scfd_name}{gene}){
2872	0					my @gene_tmp=@{$gff{$scfd_name}{gene}};
	0
2873	0					while(@gene_tmp){
2874	0					my $front=shift @gene_tmp;
2875	0					my $behind=shift @gene_tmp;
2876	0					$gene_length+=$behind-$front+1;
2877						}
2878	0					$intergenic_length=$scaffold_length-$gene_length;
2879						}
2880						else{
2881	0					$intergenic_length=$scaffold_length;
2882						}
2883
2884	0	0				if($gff{$scfd_name}{exon}){
2885	0					my @exon_tmp=@{$gff{$scfd_name}{exon}};
	0
2886	0					while(@exon_tmp){
2887	0					my $front=shift @exon_tmp;
2888	0					my $behind=shift @exon_tmp;
2889	0					$exon_length+=$behind-$front+1;
2890						}
2891	0					$intron_length=$gene_length-$exon_length;
2892						}
2893
2894	0					$all_intergenic_length+=$intergenic_length;
2895	0					$all_gene_length+=$gene_length;
2896	0					$all_exon_length+=$exon_length;
2897	0					$all_intron_length+=$intron_length;
2898
2899
2900
2901						# Scan the locs file and store information about this scaffold.
2902
2903	0					my $gene_map_length=0;
2904	0					my $exon_map_length=0;
2905	0					my $intron_map_length=0;
2906	0					my $intergenic_map_length=0;
2907
2908						# Extract fragments' locs for %all_locs;
2909	0	0				if($all_locs{$scfd_name}){
2910	0					my @tmp_locs=@{$all_locs{$scfd_name}};
	0
2911	0					while(@tmp_locs){
2912	0					my $front=shift @tmp_locs;
2913	0					my $behind=shift @tmp_locs;
2914	0					for my $one($front..$behind){
2915	0					my $gene_map_count=0;
2916	0					my $exon_map_count=0;
2917	0					my $intron_map_count=0;
2918
2919	0	0				if($gff{$scfd_name}{gene}){
2920	0					my @gene_tmp=@{$gff{$scfd_name}{gene}};
	0
2921	0					while(@gene_tmp){
2922	0					my $front=shift @gene_tmp;
2923	0					my $behind=shift @gene_tmp;
2924	0	0	0			if($one>=$front && $one<=$behind){
2925	0					$gene_map_count++;
2926	0					$gene_map_length++;
2927	0					last;
2928						}
2929						}
2930						}
2931
2932	0	0				if($gff{$scfd_name}{exon}){
2933	0					my @exon_tmp=@{$gff{$scfd_name}{exon}};
	0
2934	0					while(@exon_tmp){
2935	0					my $front=shift @exon_tmp;
2936	0					my $behind=shift @exon_tmp;
2937	0	0	0			if($one>=$front && $one<=$behind){
2938	0					$exon_map_count++;
2939	0					$exon_map_length++;
2940	0					last;
2941						}
2942						}
2943						}
2944
2945	0	0				if($gene_map_count==0){
		0
2946	0					$intergenic_map_length++;
2947						}
2948						elsif($exon_map_count==0){
2949	0					$intron_map_length++;
2950						}
2951						}
2952						}
2953						}
2954
2955						# Output part!!!
2956	0					my($intergenic_map_ratio,$gene_map_ratio,$exon_map_ratio,$intron_map_ratio)=qw(0 0 0 0);
2957	0	0				unless($intergenic_length==0){
	0
2958	0					$intergenic_map_ratio=$intergenic_map_length/$intergenic_length;
2959						}
2960						else{$intergenic_map_ratio="N/A";
2961						}
2962
2963	0	0				unless($gene_length==0){
	0
2964	0					$gene_map_ratio=$gene_map_length/$gene_length;
2965						}
2966						else{$gene_map_ratio="N/A";
2967						}
2968
2969	0	0				unless($exon_length==0){
	0
2970	0					$exon_map_ratio=$exon_map_length/$exon_length;
2971						}
2972						else{$exon_map_ratio="N/A";
2973						}
2974
2975	0	0				unless($intron_length==0){
	0
2976	0					$intron_map_ratio=$intron_map_length/$intron_length;
2977						}
2978						else{$intron_map_ratio="N/A";
2979						}
2980	0					$all_intergenic_map_length+=$intergenic_map_length;
2981	0					$all_gene_map_length+=$gene_map_length;
2982	0					$all_exon_map_length+=$exon_map_length;
2983	0					$all_intron_map_length+=$intron_map_length;
2984
2985
2986	0					$coverage_ratio_fh->print("$scfd_name\t$intergenic_length\t$intergenic_map_length\t$intergenic_map_ratio\t$gene_length\t$gene_map_length\t$gene_map_ratio\t$exon_length\t$exon_map_length\t$exon_map_ratio\t$intron_length\t$intron_map_length\t$intron_map_ratio\n");
2987	0					print "Done!\n";
2988						}
2989						}
2990	0					my ($all_intergenic_map_ratio,$all_gene_map_ratio,$all_exon_map_ratio,$all_intron_map_ratio)=qw(0 0 0 0);
2991	0	0				if($all_gene_length==0){$all_gene_map_ratio='N/A';}
	0
2992						else{
2993	0					$all_gene_map_ratio=$all_gene_map_length/$all_gene_length;}
2994	0	0				if($all_exon_length==0){$all_exon_map_ratio='N/A';}
	0
2995						else{
2996	0					$all_exon_map_ratio=$all_exon_map_length/$all_exon_length;}
2997	0	0				if($all_intron_length==0){$all_intron_map_ratio='N/A';}
	0
2998						else{
2999	0					$all_intron_map_ratio=$all_intron_map_length/$all_intron_length;}
3000	0	0				if($all_intergenic_length==0){$all_intergenic_map_ratio='N/A';}
	0
3001						else{
3002	0					$all_intergenic_map_ratio=$all_intergenic_map_length/$all_intergenic_length;}
3003
3004	0					$coverage_ratio_fh->print("Intergenic region map ratio is\t$all_intergenic_map_ratio\nGenes region map ratio is\t$all_gene_map_ratio\nExon region map ratio is\t$all_exon_map_ratio\nIntron region map ratio is\t$all_intron_map_ratio\n");
3005						}
3006
3007
3008
3009						=head1 AUTHOR
3010
3011						Jinpeng Wang, Li Li, Guofan Zhang, C<< >>
3012
3013						=head1 BUGS
3014
3015						Please report any bugs or feature requests to C, or through
3016						the web interface at L. I will be notified, and then you'll
3017						automatically be notified of progress on your bug as I make changes.
3018
3019
3020
3021
3022						=head1 SUPPORT
3023
3024						You can find documentation for this module with the perldoc command.
3025
3026						perldoc RestrictionDigest
3027
3028
3029						You can also look for information at:
3030
3031						=over 4
3032
3033						=item * RT: CPAN's request tracker (report bugs here)
3034
3035						L
3036
3037						=item * AnnoCPAN: Annotated CPAN documentation
3038
3039						L
3040
3041						=item * CPAN Ratings
3042
3043						L
3044
3045						=item * Search CPAN
3046
3047						L
3048
3049						=back
3050
3051
3052						=head1 ACKNOWLEDGEMENTS
3053
3054
3055						=head1 LICENSE AND COPYRIGHT
3056
3057						Copyright 2014 Jinpeng Wang, Li Li, Guofan Zhang.
3058
3059						This program is free software; you can redistribute it and/or modify it
3060						under the terms of the the Artistic License (2.0). You may obtain a
3061						copy of the full license at:
3062
3063						L
3064
3065						Any use, modification, and distribution of the Standard or Modified
3066						Versions is governed by this Artistic License. By using, modifying or
3067						distributing the Package, you accept this license. Do not use, modify,
3068						or distribute the Package, if you do not accept this license.
3069
3070						If your Modified Version has been derived from a Modified Version made
3071						by someone other than you, you are nevertheless required to ensure that
3072						your Modified Version complies with the requirements of this license.
3073
3074						This license does not grant you the right to use any trademark, service
3075						mark, tradename, or logo of the Copyright Holder.
3076
3077						This license includes the non-exclusive, worldwide, free-of-charge
3078						patent license to make, have made, use, offer to sell, sell, import and
3079						otherwise transfer the Package with respect to any patent claims
3080						licensable by the Copyright Holder that are necessarily infringed by the
3081						Package. If you institute patent litigation (including a cross-claim or
3082						counterclaim) against any party alleging that the Package constitutes
3083						direct or contributory patent infringement, then this Artistic License
3084						to you shall terminate on the date that such litigation is filed.
3085
3086						Disclaimer of Warranty: THE PACKAGE IS PROVIDED BY THE COPYRIGHT HOLDER
3087						AND CONTRIBUTORS "AS IS' AND WITHOUT ANY EXPRESS OR IMPLIED WARRANTIES.
3088						THE IMPLIED WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR
3089						PURPOSE, OR NON-INFRINGEMENT ARE DISCLAIMED TO THE EXTENT PERMITTED BY
3090						YOUR LOCAL LAW. UNLESS REQUIRED BY LAW, NO COPYRIGHT HOLDER OR
3091						CONTRIBUTOR WILL BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, OR
3092						CONSEQUENTIAL DAMAGES ARISING IN ANY WAY OUT OF THE USE OF THE PACKAGE,
3093						EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
3094
3095
3096						=cut
3097
3098						1; # End of RestrictionDigest