File Coverage

blib/lib/Bio/MUST/Core/Constants.pm
Criterion Covered Total %
statement 12 12 100.0
branch n/a
condition n/a
subroutine 4 4 100.0
pod n/a
total 16 16 100.0


line stmt bran cond sub pod time code
1             package Bio::MUST::Core::Constants;
2             # ABSTRACT: Distribution-wide constants for Bio::MUST::Core
3             $Bio::MUST::Core::Constants::VERSION = '0.212530';
4 17     17   193906 use strict;
  17         45  
  17         623  
5 17     17   98 use warnings;
  17         40  
  17         460  
6              
7 17     17   472 use Const::Fast;
  17         2208  
  17         106  
8              
9             use Exporter::Easy (
10 17         239 OK => [ qw(:seqtypes :gaps :ncbi :seqids :files) ],
11             TAGS => [
12             seqtypes => [ qw($PROTLIKE $RNALIKE $NONPUREDNA) ],
13             gaps => [ qw($GAP $PROTMISS $DNAMISS
14             $GAPPROTMISS $GAPDNAMISS $FRAMESHIFT) ],
15             ncbi => [ qw($NCBIPART $NCBIACC $NCBIDBABBR
16             $NCBIPKEY $PKEYONLY $NCBIGCA $GCAONLY) ],
17             seqids => [ qw($NOID_CHARS $NEW_TAG $TAIL_42
18             $DEF_ID $GI_ID $LCL_ID $GNL_ID $JGI_ID $PAC_ID) ],
19             files => [ qw($EMPTY_LINE $COMMENT_LINE $DEF_LINE
20             $DIM_LINE $PHY_LINE
21             $STK_COMMENT $STK_SEQ $END_LINE
22             $COUNT_LINE $ALI_SUFFIX) ],
23             dirs => [ qw(%SUFFICES_FOR) ],
24             ],
25 17     17   11114 );
  17         26617  
26              
27              
28             # regexes for determining sequence type
29             const our $PROTLIKE => qr{[EFILPQefilpq]}xms;
30             const our $RNALIKE => qr{[Uu]}xms;
31             const our $NONPUREDNA => qr{[^ACGTacgt]}xms;
32              
33             # regexes for gap and missing symbols
34             # see also Bio::MUST::Core::Types for "more" gaps
35             # Note: char classes are defined in two steps (plain strings first, then interpolated into a single bracketed class) for maximal regex speed
36             const my $GAPCHCL => q{\*\-\ };
37             const my $PROTMISSCHCL => q{\?Xx};
38             const my $DNAMISSCHCL => q{\?XxNn};
39             const my $PROTAMBIGCHCL => q{BJOUZbjouz}; # O/U are not actually ambiguity codes, but are grouped with them here
40             const my $DNAAMBIGCHCL => q{BDHKMRSVWYbdhkmrsvwy};
41              
42             const our $GAP => qr{[$GAPCHCL]}xms;
43             const our $PROTMISS => qr{[$PROTMISSCHCL$PROTAMBIGCHCL]}xms;
44             const our $DNAMISS => qr{[$DNAMISSCHCL$DNAAMBIGCHCL]}xms;
45              
46             const our $GAPPROTMISS => qr{[$GAPCHCL$PROTMISSCHCL$PROTAMBIGCHCL]}xms;
47             const our $GAPDNAMISS => qr{[$GAPCHCL$DNAMISSCHCL$DNAAMBIGCHCL]}xms;
48              
49             const our $FRAMESHIFT => 'x';
50              
51             # regexes for NCBI id components
52             const our $NCBIPART => qr{[^\|\s]+}xms;
53             const our $NCBIACC => qr{[A-Z0-9\.\_]+}xms;
54             const our $NCBIDBABBR => qr{[a-z]{2,}}xms;
55             const our $NCBIPKEY => qr{[1-9]\d*}xms;
56              
57             const our $PKEYONLY => qr{\A $NCBIPKEY \z}xms;
58              
59             # http://www.ncbi.nlm.nih.gov/assembly/model/
60             # The assembly accession starts with a three-letter prefix: GCA for GenBank
61             # assemblies and GCF for RefSeq assemblies. This is followed by an underscore
62             # and 9 digits. A version is then added to the accession. For example, the
63             # assembly accession for the GenBank version of the current public human
64             # reference assembly (GRCh37.p2) is GCA_000001405.3.
65              
66             const our $NCBIGCA => qr{GC[AF]_\d{9} \. \d+}xms;
67             const our $GCAONLY => qr{\A $NCBIGCA \z}xms;
68              
69             # regexes for parsing seq_ids
70             const our $NOID_CHARS => qr{[,;:]}xms;
71             const our $NEW_TAG => qr{\#NEW\#}xms;
72             const our $TAIL_42 => qr{(?: \.H\d+\.\d+ | \.E\.bf | \.E\.lc) $NEW_TAG? \z}xms;
73             const our $DEF_ID => qr{\A (\S+) }xms;
74             const our $GI_ID => qr{\A gi \| ($NCBIPKEY) }xms;
75             const our $LCL_ID => qr{\A lcl \| (\S+) }xms;
76             const our $GNL_ID => qr{\A gnl \| $NCBIPART \| ($NCBIPART) }xms;
77             const our $JGI_ID => qr{\A jgi \| $NCBIPART \| (\d+) }xms;
78             const our $PAC_ID => qr{\| PACid: (\d+) }xms;
79              
80             # regexes for parsing files
81              
82             # common
83             const our $EMPTY_LINE => qr{\A \s* \z}xms;
84             const our $COMMENT_LINE => qr{\A (\#)\s*(.*)}xms;
85              
86             # FASTA-like
87             const our $DEF_LINE => qr{\A >(.*)}xms;
88              
89             # PHYLIP-related
90             const our $DIM_LINE => qr{\A \s*(\d+)\s+(\d+)\s* \z}xms;
91             const our $PHY_LINE => qr{\A (?:(\S+)\s)? \s* (.*) }xms;
92              
93             # STOCKHOLM-related
94             const our $STK_COMMENT => qr{\A (\#=GF)\s*(.*)}xms;
95             const our $STK_SEQ => qr{\A (\S+)\s+(.*)}xms;
96             const our $END_LINE => qr{\A //}xms;
97              
98             # MUST-related
99             const our $COUNT_LINE => qr{\A (\d+) \z}xms;
100             const our $ALI_SUFFIX => qr{\.ali \z}xmsi;
101              
102             # regexes for traversing directories
103             # Note: hash values correspond to -name arg in File::Find::Rule constructor
104              
105             const our %SUFFICES_FOR => (
106             Ali => qr{\. (?: ali|fasta|fas|fa|faa|fna ) \z}xmsi,
107             );
108              
109             1;
110              
111             __END__
112              
113             =pod
114              
115             =head1 NAME
116              
117             Bio::MUST::Core::Constants - Distribution-wide constants for Bio::MUST::Core
118              
119             =head1 VERSION
120              
121             version 0.212530
122              
123             =head1 AUTHOR
124              
125             Denis BAURAIN <denis.baurain@uliege.be>
126              
127             =head1 COPYRIGHT AND LICENSE
128              
129             This software is copyright (c) 2013 by University of Liege / Unit of Eukaryotic Phylogenomics / Denis BAURAIN.
130              
131             This is free software; you can redistribute it and/or modify it under
132             the same terms as the Perl 5 programming language system itself.
133              
134             =cut