line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Peptide::Pubmed; |
2
|
|
|
|
|
|
|
|
3
|
4
|
|
|
4
|
|
93925
|
use vars qw($VERSION); |
|
4
|
|
|
|
|
12
|
|
|
4
|
|
|
|
|
681
|
|
4
|
|
|
|
|
|
|
$VERSION = '1.02'; |
5
|
|
|
|
|
|
|
|
6
|
|
|
|
|
|
|
=head1 NAME |
7
|
|
|
|
|
|
|
|
8
|
|
|
|
|
|
|
Peptide::Pubmed - extract peptide sequences from MEDLINE article abstracts. |
9
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
=head1 SYNOPSIS |
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
use Peptide::Pubmed; |
13
|
|
|
|
|
|
|
$parser = Peptide::Pubmed->new; |
14
|
|
|
|
|
|
|
$in = { |
15
|
|
|
|
|
|
|
PMID => q[15527327], |
16
|
|
|
|
|
|
|
Author => q[Doe JJ, Smith Q], |
17
|
|
|
|
|
|
|
Journal => q[J Biological Foo. 2004;8(2):123-30.], |
18
|
|
|
|
|
|
|
Title => q[Foo, bar and its significance in phage display.], |
19
|
|
|
|
|
|
|
Abstract => |
20
|
|
|
|
|
|
|
q[Peptide sequences EYHHYNK and Arg-Gly-Asp, but not ACCCGTNA or VEGFRI.], |
21
|
|
|
|
|
|
|
Mesh => q[Genes, p53/genetics; Humans; Bar], |
22
|
|
|
|
|
|
|
Chemical => q[Multienzyme Complexes; Peptide Library; Foo], |
23
|
|
|
|
|
|
|
}; |
24
|
|
|
|
|
|
|
$parser->parse_abstract($in); |
25
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
# get the peptide sequences in 1 letter symbols (select all words where the |
27
|
|
|
|
|
|
|
# combined word/abstract score is above threshold: |
28
|
|
|
|
|
|
|
# WordAbstScore >= WordAbstScoreMin): |
29
|
|
|
|
|
|
|
@seqs = $parser->get_seqs; |
30
|
|
|
|
|
|
|
print "@seqs\n"; # prints: 'EYHHYNK RGD' |
31
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
=head1 EXAMPLES |
33
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
# same as above, set threshold explicitly: |
35
|
|
|
|
|
|
|
$parser->WordAbstScoreMin(0.4); |
36
|
|
|
|
|
|
|
@seqs = $parser->get_seqs; |
37
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
# set low threshold to get more peptide sequences (but at a cost of getting |
39
|
|
|
|
|
|
|
# more false positives) |
40
|
|
|
|
|
|
|
$parser->WordAbstScoreMin(-1); |
41
|
|
|
|
|
|
|
@seqs = $parser->get_seqs; |
42
|
|
|
|
|
|
|
print "@seqs\n"; # prints: 'EYHHYNK RGD ACCCGTNA VEGFRI' |
43
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
# reset threshold back: |
45
|
|
|
|
|
|
|
$parser->WordAbstScoreMin(0.4); |
46
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
# get more data for the abstract: |
48
|
|
|
|
|
|
|
$abst = $parser->get_abst; |
49
|
|
|
|
|
|
|
print "$abst->{AbstScore}\n"; # abstract score, in the [0,1] interval |
50
|
|
|
|
|
|
|
print "$abst->{AbstMtext}\n"; # abstract with sequences marked up: |
51
|
|
|
|
|
|
|
# 'Peptide sequences EYHHYNK and Arg-Gly-Asp, |
52
|
|
|
|
|
|
|
# but not ACCCGTNA or VEGFRI.' |
53
|
|
|
|
|
|
|
|
54
|
|
|
|
|
|
|
# get more data for the words, in addition to peptide sequences: |
55
|
|
|
|
|
|
|
@words = $parser->get_words; |
56
|
|
|
|
|
|
|
for my $word (@words) { |
57
|
|
|
|
|
|
|
# combined word/abstract score, in the [0,1] interval |
58
|
|
|
|
|
|
|
print "$word->{WordAbstScore}\n"; |
59
|
|
|
|
|
|
|
# word as found in the abstract, eg 'Arg-Gly-Asp,' |
60
|
|
|
|
|
|
|
print "$word->{WordOrig}\n"; |
61
|
|
|
|
|
|
|
# peptide sequence in 1 letter symbols, eg 'RGD' |
62
|
|
|
|
|
|
|
print "$word->{WordSequence}\n"; |
63
|
|
|
|
|
|
|
} |
64
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
# There are no mandatory input fields. This will work too, but may give lower score. |
66
|
|
|
|
|
|
|
$in = { |
67
|
|
|
|
|
|
|
Abstract => |
68
|
|
|
|
|
|
|
q[Peptide sequences EYHHYNK and Arg-Gly-Asp, but not ACCCGTNA or VEGFRI.], |
69
|
|
|
|
|
|
|
}; |
70
|
|
|
|
|
|
|
$parser->parse_abstract($in); |
71
|
|
|
|
|
|
|
@words = $parser->get_words; |
72
|
|
|
|
|
|
|
|
73
|
|
|
|
|
|
|
# No peptide sequences are found in empty input: |
74
|
|
|
|
|
|
|
$in = undef; |
75
|
|
|
|
|
|
|
$parser->parse_abstract($in); |
76
|
|
|
|
|
|
|
@words = $parser->get_words; |
77
|
|
|
|
|
|
|
|
78
|
|
|
|
|
|
|
=head1 DESCRIPTION |
79
|
|
|
|
|
|
|
|
80
|
|
|
|
|
|
|
Provides common methods to parse peptide sequences from Pubmed |
81
|
|
|
|
|
|
|
abstracts. The computed variables can be used for classification in |
82
|
|
|
|
|
|
|
external programs. |
83
|
|
|
|
|
|
|
|
84
|
|
|
|
|
|
|
For all variables below: |
85
|
|
|
|
|
|
|
|
86
|
|
|
|
|
|
|
Varables (Abst|Word)Is*: Allowed values: 1/0. |
87
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
Variables (Abst|Word)Num*: Allowed values: integer. |
89
|
|
|
|
|
|
|
|
90
|
|
|
|
|
|
|
Variables (Abst|Word)Prop*: Allowed values: real in [0,1]. |
91
|
|
|
|
|
|
|
|
92
|
|
|
|
|
|
|
Variables (Abst|Word)Score*: Allowed values: real in [0,1]. Score near |
93
|
|
|
|
|
|
|
1 corresponds to "more relevant" abstracts or words (that is, likely |
94
|
|
|
|
|
|
|
to contain peptide sequences), score near 0 - to "less relevant" |
95
|
|
|
|
|
|
|
abstracts or words. |
96
|
|
|
|
|
|
|
|
97
|
|
|
|
|
|
|
Variables (Abst|Word)* (all other): Allowed values: string, unless |
98
|
|
|
|
|
|
|
otherwise specified below. |
99
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
=head2 Input variables |
101
|
|
|
|
|
|
|
|
102
|
|
|
|
|
|
|
Input variables can be used optionally without the 'Abst' prefix, so |
103
|
|
|
|
|
|
|
'AbstPMID' and 'PMID' are treated identically. |
104
|
|
|
|
|
|
|
|
105
|
|
|
|
|
|
|
AbstPMID: PubMed ID. |
106
|
|
|
|
|
|
|
|
107
|
|
|
|
|
|
|
AbstAuthor: article authors. |
108
|
|
|
|
|
|
|
|
109
|
|
|
|
|
|
|
AbstJournal: journal citation, with year if available. For format, see |
110
|
|
|
|
|
|
|
examples. |
111
|
|
|
|
|
|
|
|
112
|
|
|
|
|
|
|
AbstTitle: article title. |
113
|
|
|
|
|
|
|
|
114
|
|
|
|
|
|
|
AbstAbstract: abstract. |
115
|
|
|
|
|
|
|
|
116
|
|
|
|
|
|
|
AbstMesh: Medical Subject Headings (MeSH) terms. |
117
|
|
|
|
|
|
|
|
118
|
|
|
|
|
|
|
AbstChemical: chemical list. |
119
|
|
|
|
|
|
|
|
120
|
|
|
|
|
|
|
=head2 Output variables, for the abstract text |
121
|
|
|
|
|
|
|
|
122
|
|
|
|
|
|
|
AbstNumAb: number of matches to words like antibody, epitope, etc. |
123
|
|
|
|
|
|
|
|
124
|
|
|
|
|
|
|
AbstNumAllCap: number of all capitalized words in the abstract. |
125
|
|
|
|
|
|
|
|
126
|
|
|
|
|
|
|
AbstNumBind: number of matches to words like binds, interacts, etc. |
127
|
|
|
|
|
|
|
|
128
|
|
|
|
|
|
|
AbstNumDigest: number of matches to words like Edman, MS/MS, trypsin, |
129
|
|
|
|
|
|
|
etc. |
130
|
|
|
|
|
|
|
|
131
|
|
|
|
|
|
|
AbstNumMHC: number of matches to words like MHC, TCR, etc. |
132
|
|
|
|
|
|
|
|
133
|
|
|
|
|
|
|
AbstNumPeptide: number of matches to words like peptide, oligopeptide, |
134
|
|
|
|
|
|
|
motif, etc. |
135
|
|
|
|
|
|
|
|
136
|
|
|
|
|
|
|
AbstNumPhage: number of matches to words like phage, display, etc. |
137
|
|
|
|
|
|
|
|
138
|
|
|
|
|
|
|
AbstNumProtease: number of matches to words like peptidase, cutting, |
139
|
|
|
|
|
|
|
etc. |
140
|
|
|
|
|
|
|
|
141
|
|
|
|
|
|
|
AbstNumWords: number of words. Allowed values: integer. |
142
|
|
|
|
|
|
|
|
143
|
|
|
|
|
|
|
AbstPropAllCap: proportion of all capitalized words in the abstract (= |
144
|
|
|
|
|
|
|
AbstNumAllCap / AbstNumWords). |
145
|
|
|
|
|
|
|
|
146
|
|
|
|
|
|
|
AbstScore: heuristic score to predict whether the abstract contains a |
147
|
|
|
|
|
|
|
peptide sequence, computed based on Abst* variables. |
148
|
|
|
|
|
|
|
|
149
|
|
|
|
|
|
|
Other variables: |
150
|
|
|
|
|
|
|
|
151
|
|
|
|
|
|
|
AbstComment: free text comment for debugging. |
152
|
|
|
|
|
|
|
|
153
|
|
|
|
|
|
|
AbstMtext: abstract with sequences marked using '...' |
154
|
|
|
|
|
|
|
tags. |
155
|
|
|
|
|
|
|
|
156
|
|
|
|
|
|
|
=head2 Output variables, for the word |
157
|
|
|
|
|
|
|
|
158
|
|
|
|
|
|
|
WordIsDNA: is an all DNA sequence? Example: ACCTTG. |
159
|
|
|
|
|
|
|
|
160
|
|
|
|
|
|
|
WordIsDict: is in the dictionary (currently, of english words and of |
161
|
|
|
|
|
|
|
scientific terms, software names and abbreviations)? Example: |
162
|
|
|
|
|
|
|
MATERIALS, RT-PCR. |
163
|
|
|
|
|
|
|
|
164
|
|
|
|
|
|
|
WordIsGene: is a gene name, protein name, gene symbol, protein symbol, |
165
|
|
|
|
|
|
|
etc? Example: TFIIA. |
166
|
|
|
|
|
|
|
|
167
|
|
|
|
|
|
|
WordSeqLen: peptide length, in amino acids. |
168
|
|
|
|
|
|
|
|
169
|
|
|
|
|
|
|
WordOrig: the word as found in the abstract text. |
170
|
|
|
|
|
|
|
|
171
|
|
|
|
|
|
|
WordPropDegen: proportion of degenerate amino acids, e.g, 0.6 for |
172
|
|
|
|
|
|
|
AXXXC. |
173
|
|
|
|
|
|
|
|
174
|
|
|
|
|
|
|
WordPropProtein: a measure which is positive if a given word |
175
|
|
|
|
|
|
|
composition looks more like a protein sequence than like an english |
176
|
|
|
|
|
|
|
word, and negative otherwise. Computed using frequencies of |
177
|
|
|
|
|
|
|
overlapping k-mers. It is defined as follows: WordPropProtein = sum |
178
|
|
|
|
|
|
|
(over all overlapping k-mers within the word) of (log10Pp - |
179
|
|
|
|
|
|
|
log10Pn). log10Pp is log base 10 of the proportion of the k-mer in the |
180
|
|
|
|
|
|
|
database of known protein sequences, and log10Pn - same, for |
181
|
|
|
|
|
|
|
non-sequences (here, english words from pubmed abstracts not related |
182
|
|
|
|
|
|
|
to peptides). For a word with all k-mers equally frequent among |
183
|
|
|
|
|
|
|
sequences and non-sequences, log10Pp = log10Pn, and WordPropProtein = |
184
|
|
|
|
|
|
|
0. Allowed values: real, [-Inf,+Inf] |
185
|
|
|
|
|
|
|
|
186
|
|
|
|
|
|
|
WordScore: heuristic score to predict whether the word contains a |
187
|
|
|
|
|
|
|
peptide sequence, computed based on Word* variables. |
188
|
|
|
|
|
|
|
|
189
|
|
|
|
|
|
|
WordAbstScore: heuristic score to predict whether the word contains a |
190
|
|
|
|
|
|
|
peptide sequence, computed based on both Abst* and Word* variables. |
191
|
|
|
|
|
|
|
|
192
|
|
|
|
|
|
|
WordSequence: word converted to peptide sequence in 1-letter amino |
193
|
|
|
|
|
|
|
acid symbols. |
194
|
|
|
|
|
|
|
|
195
|
|
|
|
|
|
|
Other variables: |
196
|
|
|
|
|
|
|
|
197
|
|
|
|
|
|
|
WordAaSymbols: amino acid code. Allowed values: 1 (1 letter), 3 (3 |
198
|
|
|
|
|
|
|
letter or full name). Note that separate handling of 3 letter symbols |
199
|
|
|
|
|
|
|
and full names is currently not implemented. |
200
|
|
|
|
|
|
|
|
201
|
|
|
|
|
|
|
WordIdx: word index in the abstract. Allowed values: nonnegative |
202
|
|
|
|
|
|
|
integer. |
203
|
|
|
|
|
|
|
|
204
|
|
|
|
|
|
|
=head1 NOTES |
205
|
|
|
|
|
|
|
|
206
|
|
|
|
|
|
|
=head2 WordPropProtein |
207
|
|
|
|
|
|
|
|
208
|
|
|
|
|
|
|
Note that to optimize classification using frequencies of k-mers, k |
209
|
|
|
|
|
|
|
should be chosen so that for a given text, there are 'not too many' |
210
|
|
|
|
|
|
|
empty cells, that is, 'not too many; k-mers that did not occur. For a |
211
|
|
|
|
|
|
|
typical english text, with the combined text length of 100,000, |
212
|
|
|
|
|
|
|
alphabet size of 26 (A-Z, case-insensitive), k=3 is a good choice, |
213
|
|
|
|
|
|
|
because there are 26 ** 3 = 17576 different k-mers, and the expected |
214
|
|
|
|
|
|
|
frequency of each k-mer is 5.7. Actually, the expected frequency is |
215
|
|
|
|
|
|
|
somewhat less because of the effect of the word boundaries. That is, |
216
|
|
|
|
|
|
|
text: 'foobletch' contains more 3-mers than 'foo bletch' because after |
217
|
|
|
|
|
|
|
splitting on whitespace, 'oob', 'obl' do not occur in 'foo bletch'. |
218
|
|
|
|
|
|
|
|
219
|
|
|
|
|
|
|
=head2 Variable and method names |
220
|
|
|
|
|
|
|
|
221
|
|
|
|
|
|
|
By convention, method, variable and keys names like these: |
222
|
|
|
|
|
|
|
VarNamesLikeThese are used for cases where the corresponding field |
223
|
|
|
|
|
|
|
names may be printed in the output table, such as the rdb table in |
224
|
|
|
|
|
|
|
parse_file(). For the rest of the names, var_names_like_these are |
225
|
|
|
|
|
|
|
used. |
226
|
|
|
|
|
|
|
|
227
|
|
|
|
|
|
|
=head1 KNOWN BUGS |
228
|
|
|
|
|
|
|
|
229
|
|
|
|
|
|
|
=head2 False negatives |
230
|
|
|
|
|
|
|
|
231
|
|
|
|
|
|
|
=head3 Peptide length cutoff |
232
|
|
|
|
|
|
|
|
233
|
|
|
|
|
|
|
Peptide length cutoff is 20 amino acids. This is a somewhat arbitary |
234
|
|
|
|
|
|
|
choice. In various sources, cutoffs between 15 and 50 amino acids are |
235
|
|
|
|
|
|
|
used to define oligopeptides. |
236
|
|
|
|
|
|
|
|
237
|
|
|
|
|
|
|
=head2 False positives |
238
|
|
|
|
|
|
|
|
239
|
|
|
|
|
|
|
=head3 Gene symbols, english words, scientific terms and abbreviations |
240
|
|
|
|
|
|
|
|
241
|
|
|
|
|
|
|
Some of these were misclassified as peptide sequence, even though this |
242
|
|
|
|
|
|
|
code uses several dictionaries to find such non-sequence words. |
243
|
|
|
|
|
|
|
|
244
|
|
|
|
|
|
|
=head2 Incorrectly parsed sequence |
245
|
|
|
|
|
|
|
|
246
|
|
|
|
|
|
|
The recommendations of IUPAC can be found in: Nomenclature and |
247
|
|
|
|
|
|
|
Symbolism for Amino Acids and Peptides, |
248
|
|
|
|
|
|
|
|
249
|
|
|
|
|
|
|
http://www.chem.qmul.ac.uk/iupac/AminoAcid/A2021.html |
250
|
|
|
|
|
|
|
|
251
|
|
|
|
|
|
|
http://www.chem.qmul.ac.uk/iupac/AminoAcid/A1416.html |
252
|
|
|
|
|
|
|
|
253
|
|
|
|
|
|
|
They are not always followed in the published abstracts. More flexible |
254
|
|
|
|
|
|
|
input rules are thus allowed for peptide sequences. However, the |
255
|
|
|
|
|
|
|
following bugs may occur in parsed sequences. |
256
|
|
|
|
|
|
|
|
257
|
|
|
|
|
|
|
=head3 Amino acid position and the number of repetitions are not |
258
|
|
|
|
|
|
|
resolved |
259
|
|
|
|
|
|
|
|
260
|
|
|
|
|
|
|
Y(n) usually means amino acid Y at position n, but sometimes also |
261
|
|
|
|
|
|
|
means Y repeated n times. It is always resolved as Y, and n is |
262
|
|
|
|
|
|
|
ignored. However, X(n), where X is 'X', 'Xaa', 'Xxx', etc, usually |
263
|
|
|
|
|
|
|
means 'any amino acid, repeated n times. It is always resolved as X |
264
|
|
|
|
|
|
|
repeated n times. |
265
|
|
|
|
|
|
|
|
266
|
|
|
|
|
|
|
=head1 REFERENCES |
267
|
|
|
|
|
|
|
|
268
|
|
|
|
|
|
|
=head2 ADAM |
269
|
|
|
|
|
|
|
|
270
|
|
|
|
|
|
|
This section refers to ADAM, on which Peptide::DictionaryAbbreviations |
271
|
|
|
|
|
|
|
is based. See http://arrowsmith.psych.uic.edu. I would like to thank |
272
|
|
|
|
|
|
|
ADAM authors (in particular, Neil Smalheiser) for graciously providing |
273
|
|
|
|
|
|
|
ADAM. |
274
|
|
|
|
|
|
|
|
275
|
|
|
|
|
|
|
ADAM citation: |
276
|
|
|
|
|
|
|
|
277
|
|
|
|
|
|
|
Zhou W, Torvik VI, Smalheiser NR. ADAM: Another Database of |
278
|
|
|
|
|
|
|
Abbreviations in MEDLINE. Bioinformatics 2006; 22(22): 2813-2818. |
279
|
|
|
|
|
|
|
|
280
|
|
|
|
|
|
|
ADAM OVERVIEW |
281
|
|
|
|
|
|
|
|
282
|
|
|
|
|
|
|
ADAM is an abbreviation database generated from the 2006 |
283
|
|
|
|
|
|
|
baseline of MEDLINE. It covers frequently used abbreviations |
284
|
|
|
|
|
|
|
and their definitions (or long-forms), including both |
285
|
|
|
|
|
|
|
acronyms and non-acronym abbreviations. |
286
|
|
|
|
|
|
|
Reference |
287
|
|
|
|
|
|
|
|
288
|
|
|
|
|
|
|
Zhou W, Torvik VI, Smalheiser NR. ADAM: Another Database |
289
|
|
|
|
|
|
|
of Abbreviations in MEDLINE. Bioinformatics 2006; 22(22): 2813-2818. |
290
|
|
|
|
|
|
|
|
291
|
|
|
|
|
|
|
ADAM Copyright |
292
|
|
|
|
|
|
|
|
293
|
|
|
|
|
|
|
University of Illinois at Chicago, 2006. |
294
|
|
|
|
|
|
|
|
295
|
|
|
|
|
|
|
ADAM License |
296
|
|
|
|
|
|
|
|
297
|
|
|
|
|
|
|
By using this software, you expressly agree that your use will be |
298
|
|
|
|
|
|
|
noncommercial, that you will not use this software to make money, and that |
299
|
|
|
|
|
|
|
you will not distribute the software to anyone else or let anyone else use |
300
|
|
|
|
|
|
|
it. Moreover, you will give credit to the University of Illinois and Dr. |
301
|
|
|
|
|
|
|
Smalheiser as the author of the software. The software is provided "as is" |
302
|
|
|
|
|
|
|
and without warranties of any kind, express or implied, including but not |
303
|
|
|
|
|
|
|
limited to the implied warranties of merchantability and fitness, and |
304
|
|
|
|
|
|
|
statutory warranties of noninfringement. |
305
|
|
|
|
|
|
|
|
306
|
|
|
|
|
|
|
=head1 AUTHOR |
307
|
|
|
|
|
|
|
|
308
|
|
|
|
|
|
|
Timur Shtatland, tshtatland at mgh dot harvard dot edu |
309
|
|
|
|
|
|
|
|
310
|
|
|
|
|
|
|
Copyright (C) 2007 by The General Hospital Corporation. |
311
|
|
|
|
|
|
|
|
312
|
|
|
|
|
|
|
This program is free software; you can redistribute it and/or modify |
313
|
|
|
|
|
|
|
it under the same terms as Perl itself. |
314
|
|
|
|
|
|
|
|
315
|
|
|
|
|
|
|
See http://www.perl.com/perl/misc/Artistic.html |
316
|
|
|
|
|
|
|
|
317
|
|
|
|
|
|
|
=head1 SEE ALSO |
318
|
|
|
|
|
|
|
|
319
|
|
|
|
|
|
|
RDB - a fast, portable, relational database management system without |
320
|
|
|
|
|
|
|
arbitrary limits, (other than memory and processor speed) that runs |
321
|
|
|
|
|
|
|
under, and interacts with, the UNIX Operating system, by Walter |
322
|
|
|
|
|
|
|
V. Hobbs. |
323
|
|
|
|
|
|
|
|
324
|
|
|
|
|
|
|
=head1 APPENDIX |
325
|
|
|
|
|
|
|
|
326
|
|
|
|
|
|
|
The rest of the documentation details the methods. |
327
|
|
|
|
|
|
|
|
328
|
|
|
|
|
|
|
=cut |
329
|
|
|
|
|
|
|
|
330
|
|
|
|
|
|
|
# for printing diagnostic msgs when the module is loaded. |
331
|
|
|
|
|
|
|
# use $obj->{verbose} for object related diagnostic msgs. |
332
|
|
|
|
|
|
|
our $VERBOSE; |
333
|
|
|
|
|
|
|
BEGIN { |
334
|
4
|
|
50
|
4
|
|
185
|
$VERBOSE ||= $ENV{VERBOSE} || $ENV{TEST_VERBOSE} || 0; |
|
|
|
33
|
|
|
|
|
335
|
|
|
|
|
|
|
} |
336
|
|
|
|
|
|
|
|
337
|
4
|
|
|
4
|
|
13368
|
use Peptide::DictionaryAbbreviations; |
|
4
|
|
|
|
|
9
|
|
|
4
|
|
|
|
|
179
|
|
338
|
4
|
|
|
4
|
|
65719
|
use Peptide::DictionaryEnglish; |
|
4
|
|
|
|
|
13
|
|
|
4
|
|
|
|
|
171
|
|
339
|
4
|
|
|
4
|
|
9878
|
use Peptide::DictionaryFrequent; |
|
4
|
|
|
|
|
11
|
|
|
4
|
|
|
|
|
133
|
|
340
|
4
|
|
|
4
|
|
2507
|
use Peptide::DictionaryScientific; |
|
4
|
|
|
|
|
11
|
|
|
4
|
|
|
|
|
109
|
|
341
|
4
|
|
|
4
|
|
21995
|
use Peptide::Gene; |
|
4
|
|
|
|
|
13
|
|
|
4
|
|
|
|
|
146
|
|
342
|
4
|
|
|
4
|
|
29337
|
use Peptide::Sequence; |
|
4
|
|
|
|
|
13
|
|
|
4
|
|
|
|
|
212
|
|
343
|
4
|
|
|
4
|
|
27601
|
use Peptide::NonSequence; |
|
4
|
|
|
|
|
14
|
|
|
4
|
|
|
|
|
164
|
|
344
|
4
|
|
|
4
|
|
7349
|
use Data::Dumper; |
|
4
|
|
|
|
|
42794
|
|
|
4
|
|
|
|
|
298
|
|
345
|
4
|
|
|
4
|
|
36
|
use Carp; |
|
4
|
|
|
|
|
8
|
|
|
4
|
|
|
|
|
316
|
|
346
|
4
|
|
|
4
|
|
21
|
use warnings; |
|
4
|
|
|
|
|
9
|
|
|
4
|
|
|
|
|
134
|
|
347
|
4
|
|
|
4
|
|
20
|
use strict; |
|
4
|
|
|
|
|
8
|
|
|
4
|
|
|
|
|
196
|
|
348
|
|
|
|
|
|
|
|
349
|
4
|
|
|
4
|
|
22
|
use constant LOG_10_2 => log(2) / log(10); # empirically derived weight for WordPropProtein |
|
4
|
|
|
|
|
7
|
|
|
4
|
|
|
|
|
43886
|
|
350
|
|
|
|
|
|
|
our $MAX_PEPTIDE_LENGTH = 20; # Somewhat arbitrary cutoff, consistent with IUPAC. Find the peptides sequences not longer than this cutoff. |
351
|
|
|
|
|
|
|
|
352
|
|
|
|
|
|
|
# word / score assignment |
353
|
|
|
|
|
|
|
# |
354
|
|
|
|
|
|
|
|
355
|
|
|
|
|
|
|
# for WordScore() |
356
|
|
|
|
|
|
|
|
357
|
|
|
|
|
|
|
my %Dict = map { $_ => 1 } |
358
|
|
|
|
|
|
|
splitln(Peptide::DictionaryAbbreviations->words), |
359
|
|
|
|
|
|
|
splitln(Peptide::DictionaryEnglish->words), |
360
|
|
|
|
|
|
|
splitln(Peptide::DictionaryFrequent->words), |
361
|
|
|
|
|
|
|
splitln(Peptide::DictionaryScientific->words); |
362
|
|
|
|
|
|
|
my %Gene = map { $_ => 1 } |
363
|
|
|
|
|
|
|
splitln(Peptide::Gene->words); |
364
|
|
|
|
|
|
|
|
365
|
|
|
|
|
|
|
# for WordPropProtein() |
366
|
|
|
|
|
|
|
|
367
|
|
|
|
|
|
|
my %log_prop_sequence = map /(\S+)\s+(\S+)\s*\n/g, Peptide::Sequence->log_prop_kmers; |
368
|
|
|
|
|
|
|
my %log_prop_nonsequence = map /(\S+)\s+(\S+)\s*\n/g, Peptide::NonSequence->log_prop_kmers; |
369
|
|
|
|
|
|
|
|
370
|
|
|
|
|
|
|
if ($VERBOSE > 3) { |
371
|
|
|
|
|
|
|
foreach (sort keys %log_prop_sequence) { |
372
|
|
|
|
|
|
|
print join("\t", $_, $log_prop_sequence{$_}, $log_prop_nonsequence{$_}), "\n"; |
373
|
|
|
|
|
|
|
} |
374
|
|
|
|
|
|
|
} |
375
|
|
|
|
|
|
|
|
376
|
|
|
|
|
|
|
{ |
377
|
|
|
|
|
|
|
my %args_default |
378
|
|
|
|
|
|
|
= ( |
379
|
|
|
|
|
|
|
AbstScoreMin => 0.2, |
380
|
|
|
|
|
|
|
WordScoreMin => 0.2, |
381
|
|
|
|
|
|
|
WordAbstScoreMin => 0.2, |
382
|
|
|
|
|
|
|
WordNumPrintMin => -1, |
383
|
|
|
|
|
|
|
WordNumPrintMax => -1, |
384
|
|
|
|
|
|
|
in_col => [ qw(PMID Author Journal Title Abstract Mesh Chemical) ], |
385
|
|
|
|
|
|
|
kmer_length => 3, |
386
|
|
|
|
|
|
|
print_col => [ qw( |
387
|
|
|
|
|
|
|
AbstPMID AbstAuthor AbstJournal AbstTitle AbstAbstract AbstMesh AbstChemical |
388
|
|
|
|
|
|
|
|
389
|
|
|
|
|
|
|
AbstNumAb AbstNumAllCap AbstNumBind AbstNumDigest AbstNumMHC AbstNumPeptide AbstNumPhage AbstNumProtease AbstNumWords AbstPropAllCap AbstScore AbstComment AbstMtext |
390
|
|
|
|
|
|
|
|
391
|
|
|
|
|
|
|
WordAbstScore WordIsDNA WordIsDict WordIsGene WordSeqLen WordOrig WordPropDegen WordPropProtein WordScore WordSequence WordAaSymbols WordIdx |
392
|
|
|
|
|
|
|
|
393
|
|
|
|
|
|
|
)], |
394
|
|
|
|
|
|
|
print_colf => undef, |
395
|
|
|
|
|
|
|
verbose => 1, |
396
|
|
|
|
|
|
|
); |
397
|
|
|
|
|
|
|
|
398
|
|
|
|
|
|
|
=head2 new |
399
|
|
|
|
|
|
|
|
400
|
|
|
|
|
|
|
Args : %named_parameters: |
401
|
|
|
|
|
|
|
|
402
|
|
|
|
|
|
|
AbstScoreMin : print abstract data if AbstScore >= this threshold. |
403
|
|
|
|
|
|
|
WordScoreMin : print word data if WordScore >= this threshold. |
404
|
|
|
|
|
|
|
WordAbstScoreMin : print word data if WordAbstScore >= this threshold. |
405
|
|
|
|
|
|
|
Allowed values: real in [0,1] interval, or -1. |
406
|
|
|
|
|
|
|
Set to -1 to undefine these, and print all that satisfy |
407
|
|
|
|
|
|
|
*PrintMin*, *PrintMax* thresholds. |
408
|
|
|
|
|
|
|
|
409
|
|
|
|
|
|
|
WordNumPrintMin : min number of words to print per abstract. |
410
|
|
|
|
|
|
|
Allowed values: 1 and -1 |
411
|
|
|
|
|
|
|
WordNumPrintMax : max number of words to print per abstract. |
412
|
|
|
|
|
|
|
Allowed values: non-negative integer, or -1. |
413
|
|
|
|
|
|
|
Set to -1 to undefine these, and print all that satisfy |
414
|
|
|
|
|
|
|
*ScoreMin* thresholds. |
415
|
|
|
|
|
|
|
|
416
|
|
|
|
|
|
|
If WordNumPrintMin = 1, at least one line is printed |
417
|
|
|
|
|
|
|
per each abstract for which AbstScore satisfies |
418
|
|
|
|
|
|
|
AbstScoreMin threshold. That is, 1 line is printed |
419
|
|
|
|
|
|
|
even if no sequences were found (many Word* vars will |
420
|
|
|
|
|
|
|
be empty or 0 when this happens) or if no sequences |
421
|
|
|
|
|
|
|
satisfy WordScore / WordScoreMin threshold. If there |
422
|
|
|
|
|
|
|
are sequences found, at least 1 line is printed (which |
423
|
|
|
|
|
|
|
may or may not satisfy WordScore / WordScoreMin |
424
|
|
|
|
|
|
|
threshold), and the rest are printed only if they |
425
|
|
|
|
|
|
|
satisfy it. If WordNumPrintMax = N, where N is a |
426
|
|
|
|
|
|
|
positive integer, only the first N words are printed |
427
|
|
|
|
|
|
|
(_not_ the best N words by score). A special case is |
428
|
|
|
|
|
|
|
(WordNumPrintMin => 1, WordNumPrintMax => 1), which |
429
|
|
|
|
|
|
|
prints exactly 1 line per abstract. Word* vars are not |
430
|
|
|
|
|
|
|
computed, which make the code much faster. An even |
431
|
|
|
|
|
|
|
more special case is (AbstScoreMin => -1, WordScoreMin |
432
|
|
|
|
|
|
|
=> -1, WordAbstScoreMin => -1, WordNumPrintMin => 1, |
433
|
|
|
|
|
|
|
WordNumPrintMax => 1), which prints all abstracts, 1 |
434
|
|
|
|
|
|
|
abstract per line, and computes Abst*, but not Word* |
435
|
|
|
|
|
|
|
vars, thus allowing to separate computation into 2 |
436
|
|
|
|
|
|
|
steps: compute and select abstracts with good |
437
|
|
|
|
|
|
|
AbstScore, then compute and select abstracts and words |
438
|
|
|
|
|
|
|
with good WordAbstScore, WordScore. |
439
|
|
|
|
|
|
|
|
440
|
|
|
|
|
|
|
in_col : columns for input |
441
|
|
|
|
|
|
|
kmer_length : for computing WordPropProtein: => 3, |
442
|
|
|
|
|
|
|
|
443
|
|
|
|
|
|
|
print_col : columns for printing |
444
|
|
|
|
|
|
|
print_colf : column formats for rdb table, eg [qw(50S 10N)]; |
445
|
|
|
|
|
|
|
default: undef, to be determined automatically from col names. |
446
|
|
|
|
|
|
|
verbose : verbosity level for diagnostic msgs. Allowed values: 0..4. |
447
|
|
|
|
|
|
|
Example : my $parser = Peptide::Pubmed->new(verbose => $verbose) or |
448
|
|
|
|
|
|
|
carp "not ok: new Peptide::Pubmed" and return; |
449
|
|
|
|
|
|
|
Description : bare constructor. All work done by init(). |
450
|
|
|
|
|
|
|
Returns : TRUE if successful, FALSE otherwise. |
451
|
|
|
|
|
|
|
|
452
|
|
|
|
|
|
|
=cut |
453
|
|
|
|
|
|
|
|
454
|
|
|
|
|
|
|
sub new { |
455
|
9
|
|
|
9
|
1
|
11553
|
my $class = shift; |
456
|
9
|
|
|
|
|
54
|
my %args = @_; |
457
|
9
|
|
|
|
|
55
|
foreach (sort keys %args) { |
458
|
39
|
50
|
0
|
|
|
101
|
exists $args_default{$_} or carp "not ok: arg=$_ is not allowed in new()" and |
459
|
|
|
|
|
|
|
return; |
460
|
|
|
|
|
|
|
} |
461
|
9
|
|
33
|
|
|
149
|
my $self = bless { %args_default, %args }, ref($class) || $class; |
462
|
9
|
50
|
|
|
|
41
|
$self->init or return; |
463
|
9
|
|
|
|
|
37
|
return $self; |
464
|
|
|
|
|
|
|
} |
465
|
|
|
|
|
|
|
} |
466
|
|
|
|
|
|
|
|
467
|
|
|
|
|
|
|
=head2 get_abst |
468
|
|
|
|
|
|
|
|
469
|
|
|
|
|
|
|
Args : none |
470
|
|
|
|
|
|
|
Example : $abst = $parser->get_abst; |
471
|
|
|
|
|
|
|
Description : get the output data for Abst* variables. |
472
|
|
|
|
|
|
|
Returns : a ref to hash with Abst* variables if successful, |
473
|
|
|
|
|
|
|
ref to empty hash otherwise. |
474
|
|
|
|
|
|
|
|
475
|
|
|
|
|
|
|
=cut |
476
|
|
|
|
|
|
|
|
477
|
|
|
|
|
|
|
sub get_abst { |
478
|
1
|
|
|
1
|
1
|
5
|
my ($self) = @_; |
479
|
1
|
|
|
|
|
3
|
my $abst = {}; |
480
|
1
|
|
|
|
|
3
|
foreach my $key (grep /^Abst/, @{ $self->{abst_col} } ) { |
|
1
|
|
|
|
|
37
|
|
481
|
20
|
50
|
|
|
|
88
|
$abst->{$key} = $self->{abst}{$key} if exists $self->{abst}{$key}; |
482
|
|
|
|
|
|
|
} |
483
|
1
|
|
|
|
|
6
|
return $abst; |
484
|
|
|
|
|
|
|
} |
485
|
|
|
|
|
|
|
|
486
|
|
|
|
|
|
|
=head2 get_words |
487
|
|
|
|
|
|
|
|
488
|
|
|
|
|
|
|
Args : none |
489
|
|
|
|
|
|
|
Example : # get data for words likely to contain peptide sequences: |
490
|
|
|
|
|
|
|
@words = $parser->get_words; |
491
|
|
|
|
|
|
|
Description : get the output data for Words* variables. |
492
|
|
|
|
|
|
|
Returns : an array where each element is a ref to hash with |
493
|
|
|
|
|
|
|
Word* variables output data for 1 word. Only words |
494
|
|
|
|
|
|
|
that satisfy the threshold: combined word/abstract score |
495
|
|
|
|
|
|
|
WordAbstScore greater than or equal to WordAbstScoreMin, |
496
|
|
|
|
|
|
|
are included, otherwise an empty array is returned. |
497
|
|
|
|
|
|
|
To return all words, set WordAbstScoreMin to -1. |
498
|
|
|
|
|
|
|
|
499
|
|
|
|
|
|
|
=cut |
500
|
|
|
|
|
|
|
|
501
|
|
|
|
|
|
|
sub get_words { |
502
|
2
|
|
|
2
|
1
|
1082
|
my ($self) = @_; |
503
|
2
|
|
|
|
|
4
|
my @words = (); |
504
|
2
|
|
|
|
|
3
|
foreach my $rh_word ( @{ $self->{words} } ) { |
|
2
|
|
|
|
|
6
|
|
505
|
19
|
100
|
|
|
|
51
|
next unless $rh_word->{WordAbstScore} >= $self->{WordAbstScoreMin}; |
506
|
2
|
|
|
|
|
5
|
my $rh_word_out = {}; |
507
|
2
|
|
|
|
|
3
|
foreach my $key (grep /^Word/, @{ $self->{words_col} } ) { |
|
2
|
|
|
|
|
22
|
|
508
|
24
|
50
|
|
|
|
91
|
$rh_word_out->{$key} = $rh_word->{$key} if exists $rh_word->{$key}; |
509
|
|
|
|
|
|
|
} |
510
|
2
|
|
|
|
|
9
|
push @words, $rh_word_out; |
511
|
|
|
|
|
|
|
} |
512
|
2
|
|
|
|
|
15
|
return @words; |
513
|
|
|
|
|
|
|
} |
514
|
|
|
|
|
|
|
|
515
|
|
|
|
|
|
|
|
516
|
|
|
|
|
|
|
=head2 get_seqs |
517
|
|
|
|
|
|
|
|
518
|
|
|
|
|
|
|
Args : none |
519
|
|
|
|
|
|
|
Example : @seqs = $parser->get_seqs; |
520
|
|
|
|
|
|
|
Description : |
521
|
|
|
|
|
|
|
Returns : a list with sequences for all words in the abstract, |
522
|
|
|
|
|
|
|
empty list if none found. |
523
|
|
|
|
|
|
|
|
524
|
|
|
|
|
|
|
=cut |
525
|
|
|
|
|
|
|
|
526
|
|
|
|
|
|
|
sub get_seqs { |
527
|
3
|
|
|
3
|
1
|
12
|
my ($self) = @_; |
528
|
3
|
|
|
|
|
6
|
my @seqs = (); |
529
|
3
|
|
|
|
|
4
|
foreach my $rh_word ( @{ $self->{words} } ) { |
|
3
|
|
|
|
|
7
|
|
530
|
57
|
100
|
|
|
|
131
|
next unless $rh_word->{WordAbstScore} >= $self->{WordAbstScoreMin}; |
531
|
23
|
100
|
|
|
|
52
|
push @seqs, $rh_word->{WordSequence} if $rh_word->{WordSequence}; |
532
|
|
|
|
|
|
|
} |
533
|
3
|
|
|
|
|
15
|
return @seqs; |
534
|
|
|
|
|
|
|
} |
535
|
|
|
|
|
|
|
|
536
|
|
|
|
|
|
|
|
537
|
|
|
|
|
|
|
=head2 WordAbstScoreMin |
538
|
|
|
|
|
|
|
|
539
|
|
|
|
|
|
|
Args : (optional) new value |
540
|
|
|
|
|
|
|
Example : $parser->WordAbstScoreMin(0.4); |
541
|
|
|
|
|
|
|
$parser->WordAbstScoreMin; |
542
|
|
|
|
|
|
|
Description : assign to WordAbstScoreMin a new value, if called with an argument. |
543
|
|
|
|
|
|
|
Returns : WordAbstScoreMin value (after assignment) |
544
|
|
|
|
|
|
|
|
545
|
|
|
|
|
|
|
=cut |
546
|
|
|
|
|
|
|
|
547
|
|
|
|
|
|
|
sub WordAbstScoreMin { |
548
|
3
|
|
|
3
|
1
|
1994
|
my ($self, $val) = @_; |
549
|
3
|
50
|
|
|
|
10
|
if (@_ == 2) { |
550
|
3
|
|
|
|
|
7
|
$self->{WordAbstScoreMin} = $val; |
551
|
|
|
|
|
|
|
} |
552
|
3
|
|
|
|
|
7
|
return $self->{WordAbstScoreMin}; |
553
|
|
|
|
|
|
|
} |
554
|
|
|
|
|
|
|
|
555
|
|
|
|
|
|
|
=head2 init |
556
|
|
|
|
|
|
|
|
557
|
|
|
|
|
|
|
Args : none |
558
|
|
|
|
|
|
|
Example : $parser->init or return; |
559
|
|
|
|
|
|
|
Description : check to see if the entry is ok. Initialize several fields used for |
560
|
|
|
|
|
|
|
printing, such as column definitions. |
561
|
|
|
|
|
|
|
Returns : always TRUE |
562
|
|
|
|
|
|
|
|
563
|
|
|
|
|
|
|
=cut |
564
|
|
|
|
|
|
|
|
565
|
|
|
|
|
|
|
sub init { |
566
|
9
|
|
|
9
|
1
|
20
|
my ($self) = @_; |
567
|
9
|
50
|
|
|
|
44
|
unless (defined $self->{print_colf}) { |
568
|
|
|
|
|
|
|
# assign to numeric or text |
569
|
288
|
100
|
|
|
|
859
|
$self->{print_colf} = [ |
570
|
|
|
|
|
|
|
map { |
571
|
9
|
|
|
|
|
24
|
/^(Abst|Word|AbstWord)(Idx|Is|Num|Prop|Score)([A-Z]|$)/ ? |
572
|
|
|
|
|
|
|
"10N" : "50S" |
573
|
|
|
|
|
|
|
} |
574
|
9
|
|
|
|
|
14
|
@{ $self->{print_col} } |
575
|
|
|
|
|
|
|
] |
576
|
|
|
|
|
|
|
} |
577
|
|
|
|
|
|
|
# cols to be printed are either words cols, or abstract cols |
578
|
9
|
|
|
|
|
30
|
$self->{abst_col} = [ grep { /^Abst/ } @{ $self->{print_col} } ]; |
|
288
|
|
|
|
|
555
|
|
|
9
|
|
|
|
|
23
|
|
579
|
9
|
|
|
|
|
16
|
$self->{words_col} = [ grep { /^Word/ } @{ $self->{print_col} } ]; |
|
288
|
|
|
|
|
489
|
|
|
9
|
|
|
|
|
20
|
|
580
|
9
|
50
|
|
|
|
35
|
if ($self->{verbose} > 2) { |
581
|
0
|
|
|
|
|
0
|
foreach (sort keys %{ $self }) { |
|
0
|
|
|
|
|
0
|
|
582
|
0
|
|
|
|
|
0
|
warn "$_ => $self->{$_}"; |
583
|
|
|
|
|
|
|
} |
584
|
|
|
|
|
|
|
} |
585
|
9
|
|
|
|
|
31
|
return 1; |
586
|
|
|
|
|
|
|
} |
587
|
|
|
|
|
|
|
|
588
|
|
|
|
|
|
|
=head2 splitln |
589
|
|
|
|
|
|
|
|
590
|
|
|
|
|
|
|
Arg[1] : string of items, one item per line. |
591
|
|
|
|
|
|
|
Arg[2] : (optional) %named_parameters: |
592
|
|
|
|
|
|
|
lc : convert to lowercase? 1 = yes, 0 = no. Default = 1. |
593
|
|
|
|
|
|
|
Example : my %words = map { $_ => 1 } splitln("foo\nbar\n", lc => 0); |
594
|
|
|
|
|
|
|
my %words = map { $_ => 1 } $this->splitln("foo\nbar\n"); |
595
|
|
|
|
|
|
|
Description : parse input string into list of items, remove leading and trailing |
596
|
|
|
|
|
|
|
whitespace, remove empty lines, convert to lc. Can be used as a |
597
|
|
|
|
|
|
|
method call or direct call. |
598
|
|
|
|
|
|
|
Returns : list of items |
599
|
|
|
|
|
|
|
|
600
|
|
|
|
|
|
|
=cut |
601
|
|
|
|
|
|
|
|
602
|
|
|
|
|
|
|
sub splitln { |
603
|
20
|
50
|
|
20
|
1
|
135
|
ref $_[0] and shift; |
604
|
20
|
|
|
|
|
57
|
my $str = shift; |
605
|
20
|
|
|
|
|
106
|
my %args = (lc => 1, @_); |
606
|
303664
|
|
|
|
|
529571
|
my @lines = |
607
|
303664
|
|
|
|
|
832295
|
grep { $_ ne '' } |
608
|
20
|
50
|
|
|
|
75369
|
map { s/^\s+|\s+$//g; $args{lc} ? lc $_ : $_ } |
|
303664
|
|
|
|
|
726980
|
|
609
|
|
|
|
|
|
|
split /\n/, $str; |
610
|
20
|
|
|
|
|
96838
|
return @lines; |
611
|
|
|
|
|
|
|
} |
612
|
|
|
|
|
|
|
|
613
|
|
|
|
|
|
|
=head2 parse_file |
614
|
|
|
|
|
|
|
|
615
|
|
|
|
|
|
|
Args : %named_parameters: |
616
|
|
|
|
|
|
|
mandatory: |
617
|
|
|
|
|
|
|
in_fname : input file name. |
618
|
|
|
|
|
|
|
out_fname : output file name. |
619
|
|
|
|
|
|
|
Example : $parser->parse_file(in_fname => $in_fname, out_fname => $out_fname); |
620
|
|
|
|
|
|
|
Description : reads input and write output, both in rdb |
621
|
|
|
|
|
|
|
format. Input data is 1 abstract per line. Prints data |
622
|
|
|
|
|
|
|
for all sequences found, 1 sequence per line. Skips |
623
|
|
|
|
|
|
|
abstracts and words based on rules described in |
624
|
|
|
|
|
|
|
new(). Because it prints data for 1 word per line, the |
625
|
|
|
|
|
|
|
same abstract data is printed for all sequences from |
626
|
|
|
|
|
|
|
the same abstract. |
627
|
|
|
|
|
|
|
Returns : TRUE if successful, FALSE otherwise. |
628
|
|
|
|
|
|
|
|
629
|
|
|
|
|
|
|
=cut |
630
|
|
|
|
|
|
|
|
631
|
|
|
|
|
|
|
sub parse_file { |
632
|
2
|
|
|
2
|
1
|
1224
|
my ($self, %args) = @_; |
633
|
2
|
|
|
|
|
7
|
foreach (qw(in_fname out_fname)) { |
634
|
4
|
|
|
|
|
13
|
$self->{$_} = $args{$_}; |
635
|
|
|
|
|
|
|
} |
636
|
2
|
|
|
|
|
6
|
my $in_fh; |
637
|
|
|
|
|
|
|
my $out_fh; |
638
|
2
|
50
|
0
|
|
|
81
|
open $in_fh, $self->{in_fname} or carp "not ok: open $self->{in_fname}: $!" and return; |
639
|
2
|
50
|
0
|
|
|
4659
|
open $out_fh, ">$self->{out_fname}" or |
640
|
|
|
|
|
|
|
carp "not ok: open $self->{out_fname}: $!" and return; |
641
|
2
|
|
|
|
|
13
|
my ($comment, $ra_col, $ra_colf) = read_rdb_header($in_fh); |
642
|
2
|
|
|
|
|
12
|
print_rdb_header($out_fh, $comment, $self->{print_col}, $self->{print_colf}); |
643
|
2
|
|
|
|
|
28
|
while (<$in_fh>) { # for 1 pubmed entry (=abstract) |
644
|
3
|
|
|
|
|
6
|
chomp; |
645
|
3
|
50
|
33
|
|
|
15
|
print STDERR "processing line $.\n" if ($self->{verbose} > 1 and not $. % 500); |
646
|
3
|
|
|
|
|
7
|
my $rh_pm_in = {}; # input data for 1 abstract |
647
|
3
|
|
|
|
|
39
|
@{$rh_pm_in}{ @$ra_col } = split "\t", $_; |
|
3
|
|
|
|
|
26
|
|
648
|
3
|
50
|
0
|
|
|
12
|
my $rh_pm_out = $self->parse_abstract($rh_pm_in) or |
649
|
|
|
|
|
|
|
carp "not ok: parse_abstract" and return; |
650
|
3
|
50
|
|
|
|
20
|
next unless $rh_pm_out->{abst}{AbstScore} >= $self->{AbstScoreMin}; |
651
|
3
|
|
|
|
|
5
|
my %seen; |
652
|
3
|
|
|
|
|
5
|
my $num_words = 0; |
653
|
3
|
|
|
|
|
7
|
foreach my $rh_word (@{ $rh_pm_out->{words} }) { |
|
3
|
|
|
|
|
7
|
|
654
|
593
|
|
|
|
|
808
|
$num_words++; |
655
|
593
|
50
|
66
|
|
|
2479
|
last if $self->{WordNumPrintMax} != -1 and $num_words > $self->{WordNumPrintMax}; |
656
|
593
|
100
|
33
|
|
|
2007
|
if ($self->{WordNumPrintMin} == -1 || |
|
|
|
66
|
|
|
|
|
657
|
|
|
|
|
|
|
($self->{WordNumPrintMin} == 1 && $num_words > $self->{WordNumPrintMin}) ) { |
658
|
591
|
50
|
33
|
|
|
6625
|
next unless $rh_word->{WordScore} >= $self->{WordScoreMin} && |
659
|
|
|
|
|
|
|
$rh_word->{WordAbstScore} >= $self->{WordAbstScoreMin}; |
660
|
591
|
100
|
100
|
|
|
2052
|
next if defined $rh_word->{WordSequence} and |
661
|
|
|
|
|
|
|
$seen{ $rh_word->{WordSequence} }++; |
662
|
|
|
|
|
|
|
} |
663
|
36992
|
100
|
|
|
|
77993
|
my %line = map { tr/\t/ /; $_ } # tab is used as the delimiter |
|
36992
|
|
|
|
|
76408
|
|
|
6936
|
|
|
|
|
22263
|
|
664
|
|
|
|
|
|
|
( |
665
|
578
|
|
|
|
|
1437
|
map( { $_ => defined $rh_word->{$_} ? $rh_word->{$_} : '' } |
666
|
11560
|
50
|
|
|
|
33892
|
@{ $self->{words_col} }), |
667
|
578
|
|
|
|
|
1240
|
map( { $_ => defined $rh_pm_out->{abst}{$_} ? $rh_pm_out->{abst}{$_} : '' } |
668
|
578
|
|
|
|
|
916
|
@{ $self->{abst_col} }), |
669
|
|
|
|
|
|
|
); |
670
|
578
|
|
|
|
|
7012
|
print $out_fh join("\t", @line{ @{ $self->{print_col} } }), "\n"; |
|
578
|
|
|
|
|
260511
|
|
671
|
|
|
|
|
|
|
} |
672
|
|
|
|
|
|
|
} |
673
|
2
|
50
|
0
|
|
|
157
|
close $out_fh or carp "not ok: close $self->{out_fname}: $!" and return; |
674
|
2
|
50
|
0
|
|
|
22
|
close $in_fh or carp "not ok: close $self->{in_fname}: $!" and return; |
675
|
2
|
|
|
|
|
35
|
return 1; |
676
|
|
|
|
|
|
|
} |
677
|
|
|
|
|
|
|
|
678
|
|
|
|
|
|
|
=head2 read_rdb_header |
679
|
|
|
|
|
|
|
|
680
|
|
|
|
|
|
|
Args : input rdb file handle |
681
|
|
|
|
|
|
|
Example : my ($comment, $ra_col, $ra_colf) = read_rdb_header($in_fh); |
682
|
|
|
|
|
|
|
Description : Reads header of an rdb table. |
683
|
|
|
|
|
|
|
Returns : Returns comment (or empty string), ref to array of column names, |
684
|
|
|
|
|
|
|
reference to array of column definitions. |
685
|
|
|
|
|
|
|
|
686
|
|
|
|
|
|
|
=cut |
687
|
|
|
|
|
|
|
|
688
|
|
|
|
|
|
|
sub read_rdb_header { |
689
|
2
|
|
|
2
|
1
|
5
|
my ($in_fh) = @_; |
690
|
2
|
|
|
|
|
5
|
my ($comment, $ra_col, $ra_colf); |
691
|
2
|
|
|
|
|
64
|
while (<$in_fh>) { |
692
|
2
|
50
|
0
|
|
|
10
|
$comment .= $_ and next if /^\#/; # table comments (optional) |
693
|
2
|
|
|
|
|
7
|
chomp; |
694
|
2
|
|
|
|
|
14
|
@$ra_col = split /\t/; # column names (eg, Hit, Expect) |
695
|
2
|
50
|
|
|
|
11
|
last unless defined ($_ = <$in_fh> ); |
696
|
2
|
|
|
|
|
3
|
chomp; |
697
|
2
|
|
|
|
|
11
|
@$ra_colf = split /\t/; # column formats (eg, 10N, 50S) |
698
|
2
|
|
|
|
|
5
|
last; |
699
|
|
|
|
|
|
|
} |
700
|
2
|
|
50
|
|
|
20
|
return $comment || '', $ra_col, $ra_colf; |
701
|
|
|
|
|
|
|
} |
702
|
|
|
|
|
|
|
|
703
|
|
|
|
|
|
|
=head2 print_rdb_header |
704
|
|
|
|
|
|
|
|
705
|
|
|
|
|
|
|
Args : output rdb file handle, comment (or undef), ref to array of |
706
|
|
|
|
|
|
|
column names, reference to array of column definitions. |
707
|
|
|
|
|
|
|
Example : print_rdb_header($out_fh, $comment, $self->{print_col}, |
708
|
|
|
|
|
|
|
$self->{print_colf}); |
709
|
|
|
|
|
|
|
Description : Prints header of an rdb table. If comment is undefined, |
710
|
|
|
|
|
|
|
it is not printed. |
711
|
|
|
|
|
|
|
Returns : always TRUE. |
712
|
|
|
|
|
|
|
|
713
|
|
|
|
|
|
|
=cut |
714
|
|
|
|
|
|
|
|
715
|
|
|
|
|
|
|
sub print_rdb_header { |
716
|
2
|
|
|
2
|
1
|
5
|
my ($out_fh, $comment, $ra_col, $ra_colf) = @_; |
717
|
2
|
50
|
|
|
|
11
|
print $out_fh $comment if defined $comment; |
718
|
2
|
|
|
|
|
31
|
print $out_fh join( "\t", @$ra_col ), "\n"; |
719
|
2
|
50
|
|
|
|
18
|
print $out_fh join( "\t", @$ra_colf ), "\n" if defined $ra_colf; |
720
|
2
|
|
|
|
|
6
|
return 1; |
721
|
|
|
|
|
|
|
} |
722
|
|
|
|
|
|
|
|
723
|
|
|
|
|
|
|
=head2 parse_abstract |
724
|
|
|
|
|
|
|
|
725
|
|
|
|
|
|
|
Args : ref to hash with input data for one pubmed abstract |
726
|
|
|
|
|
|
|
Example : $parser->parse_abstract($rh_pm_in); |
727
|
|
|
|
|
|
|
Description : does all parsing for 1 abstract by calling init_abstract, AbstVars. |
728
|
|
|
|
|
|
|
Returns : ref to hash with parsed data if successful, FALSE if error. |
729
|
|
|
|
|
|
|
If no sequences are found, the hash will have a single sequence |
730
|
|
|
|
|
|
|
consisting of an empty string (this is not considered an error). |
731
|
|
|
|
|
|
|
|
732
|
|
|
|
|
|
|
=cut |
733
|
|
|
|
|
|
|
|
734
|
|
|
|
|
|
|
sub parse_abstract { |
735
|
12
|
|
|
12
|
1
|
3571
|
my ($self, $rh_pm_in) = @_; |
736
|
12
|
|
|
|
|
21
|
my $rh_pm_out = {}; |
737
|
12
|
50
|
|
|
|
35
|
$self->init_abstract($rh_pm_in) or return; |
738
|
12
|
50
|
|
|
|
33
|
$self->AbstVars or return; |
739
|
12
|
|
|
|
|
24
|
%$rh_pm_out = map { $_ => $self->{$_} } qw(words abst); |
|
24
|
|
|
|
|
91
|
|
740
|
12
|
50
|
|
|
|
42
|
carp((caller(0))[3], "(@_)", ' ', Data::Dumper->Dump([$self], ['self'])) |
741
|
|
|
|
|
|
|
if $self->{verbose} > 2; |
742
|
12
|
|
|
|
|
45
|
return $rh_pm_out; |
743
|
|
|
|
|
|
|
} |
744
|
|
|
|
|
|
|
|
745
|
|
|
|
|
|
|
{ |
746
|
|
|
|
|
|
|
my %abst_default = |
747
|
|
|
|
|
|
|
( |
748
|
|
|
|
|
|
|
AbstComment => undef, |
749
|
|
|
|
|
|
|
AbstMtext => undef, |
750
|
|
|
|
|
|
|
AbstNumAb => 0, |
751
|
|
|
|
|
|
|
AbstNumAllCap => 0, |
752
|
|
|
|
|
|
|
AbstNumBind => 0, |
753
|
|
|
|
|
|
|
AbstNumDigest => 0, |
754
|
|
|
|
|
|
|
AbstNumMHC => 0, |
755
|
|
|
|
|
|
|
AbstNumPeptide => 0, |
756
|
|
|
|
|
|
|
AbstNumPhage => 0, |
757
|
|
|
|
|
|
|
AbstNumProtease => 0, |
758
|
|
|
|
|
|
|
AbstNumWords => 0, |
759
|
|
|
|
|
|
|
AbstPropAllCap => 0, |
760
|
|
|
|
|
|
|
AbstScore => 0, |
761
|
|
|
|
|
|
|
); |
762
|
|
|
|
|
|
|
|
763
|
|
|
|
|
|
|
=head2 init_abstract |
764
|
|
|
|
|
|
|
|
765
|
|
|
|
|
|
|
Args : ref to hash with input data with field names 'Title', 'Abstract', etc |
766
|
|
|
|
|
|
|
(optionally, with prefix 'Abst', eg 'AbstTitle', 'AbstAbstract'). |
767
|
|
|
|
|
|
|
See field names listed in in_col. |
768
|
|
|
|
|
|
|
Example : $rh_pm_in = { Title => q[Some title.], Abstract => q[Some abstract.] }; |
769
|
|
|
|
|
|
|
$self->init_abstract($rh_pm_in) or return; |
770
|
|
|
|
|
|
|
Description : Reads the input data and stores in 'Abst*' fields |
771
|
|
|
|
|
|
|
('Abst' prefix is added if needed). Adds prefix 'Abst' |
772
|
|
|
|
|
|
|
to field names if needed. Converts to lower case and |
773
|
|
|
|
|
|
|
stores text in AbstLc* fields. Undefined fields in the |
774
|
|
|
|
|
|
|
input are converted to empty strings and |
775
|
|
|
|
|
|
|
stored. Currently, there are no mandatory fields: |
776
|
|
|
|
|
|
|
undefined $rh_pm_in is not considered an error. In |
777
|
|
|
|
|
|
|
such case, all fields will be empty strings. No |
778
|
|
|
|
|
|
|
sequences will be found, and the default score (zero) |
779
|
|
|
|
|
|
|
will be assigned. |
780
|
|
|
|
|
|
|
Returns : TRUE if successful, FALSE otherwise. |
781
|
|
|
|
|
|
|
|
782
|
|
|
|
|
|
|
=cut |
783
|
|
|
|
|
|
|
|
784
|
|
|
|
|
|
|
sub init_abstract { |
785
|
12
|
|
|
12
|
1
|
16
|
my ($self, $rh_pm_in) = @_; |
786
|
|
|
|
|
|
|
# reset abst and words data from the prev parsed abstract, if any |
787
|
12
|
|
|
|
|
127
|
$self->{abst} = { %abst_default }; |
788
|
12
|
|
|
|
|
89
|
$self->{words} = []; |
789
|
12
|
|
|
|
|
93
|
my @in_col = @{ $self->{in_col} }; |
|
12
|
|
|
|
|
46
|
|
790
|
12
|
|
|
|
|
16
|
foreach (@{ $self->{in_col} }) { |
|
12
|
|
|
|
|
34
|
|
791
|
84
|
|
|
|
|
294
|
$self->{abst}{"Abst$_"} = do { |
792
|
84
|
100
|
|
|
|
213
|
if (defined $rh_pm_in->{$_}) { |
|
|
50
|
|
|
|
|
|
793
|
40
|
|
|
|
|
127
|
$rh_pm_in->{$_} |
794
|
|
|
|
|
|
|
} elsif (defined $rh_pm_in->{"Abst$_"}) { |
795
|
0
|
|
|
|
|
0
|
$rh_pm_in->{"Abst$_"} |
796
|
|
|
|
|
|
|
} else { |
797
|
44
|
|
|
|
|
151
|
'' |
798
|
|
|
|
|
|
|
} |
799
|
|
|
|
|
|
|
}; |
800
|
|
|
|
|
|
|
} |
801
|
|
|
|
|
|
|
# mandatory field: |
802
|
|
|
|
|
|
|
#my @missing = grep { $self->{abst}{$_} eq '' } qw(AbstAbstract); |
803
|
|
|
|
|
|
|
#@missing and carp "not ok: missing fields: @missing" and return; |
804
|
12
|
|
50
|
|
|
190
|
$self->{abst}{AbstAllText} = |
805
|
|
|
|
|
|
|
join(";; ", |
806
|
|
|
|
|
|
|
map "$_: $self->{abst}{$_}", |
807
|
|
|
|
|
|
|
qw(AbstTitle AbstAbstract AbstMesh AbstChemical) |
808
|
|
|
|
|
|
|
) || ''; |
809
|
12
|
|
|
|
|
84
|
$self->{abst}{AbstLcAllText} = lc $self->{abst}{AbstAllText}; |
810
|
12
|
|
|
|
|
55
|
$self->{abst}{AbstLcAbstract} = lc $self->{abst}{AbstAbstract}; |
811
|
12
|
|
|
|
|
47
|
return 1; |
812
|
|
|
|
|
|
|
} |
813
|
|
|
|
|
|
|
} |
814
|
|
|
|
|
|
|
|
815
|
|
|
|
|
|
|
=head2 AbstVars |
816
|
|
|
|
|
|
|
|
817
|
|
|
|
|
|
|
Args : none |
818
|
|
|
|
|
|
|
Example : $parser->AbstVars or return; |
819
|
|
|
|
|
|
|
Description : Calls all Abst* methods that compute the corresponding variables, |
820
|
|
|
|
|
|
|
eg AbstNumAb. Calls Words which in turn calls Word* methods. |
821
|
|
|
|
|
|
|
Returns : TRUE if successful, FALSE otherwise. |
822
|
|
|
|
|
|
|
|
823
|
|
|
|
|
|
|
=cut |
824
|
|
|
|
|
|
|
|
825
|
|
|
|
|
|
|
sub AbstVars { |
826
|
12
|
|
|
12
|
1
|
18
|
my ($self) = @_; |
827
|
12
|
50
|
|
|
|
66
|
$self->AbstNumAb or return; |
828
|
|
|
|
|
|
|
# AbstNumAllCap is computed in AbstScore() |
829
|
12
|
50
|
|
|
|
31
|
$self->AbstNumBind or return; |
830
|
12
|
50
|
|
|
|
46
|
$self->AbstNumDigest or return; |
831
|
12
|
50
|
|
|
|
36
|
$self->AbstNumMHC or return; |
832
|
12
|
50
|
|
|
|
32
|
$self->AbstNumPeptide or return; |
833
|
12
|
50
|
|
|
|
30
|
$self->AbstNumPhage or return; |
834
|
12
|
50
|
|
|
|
30
|
$self->AbstNumProtease or return; |
835
|
12
|
50
|
|
|
|
32
|
$self->AbstScore or return; |
836
|
12
|
50
|
|
|
|
31
|
$self->Words or return; |
837
|
12
|
50
|
|
|
|
45
|
$self->WordAbstScore or return; |
838
|
12
|
50
|
|
|
|
52
|
$self->AbstMtext or return; |
839
|
12
|
|
|
|
|
27
|
foreach (qw(AbstMaxWordScore AbstPropAllCap AbstScore)) { |
840
|
36
|
|
|
|
|
98
|
$self->{abst}{$_} = formatScoreProp( $self->{abst}{$_} ); |
841
|
|
|
|
|
|
|
} |
842
|
12
|
|
|
|
|
20
|
foreach my $rh_word ( @{ $self->{words} }) { |
|
12
|
|
|
|
|
26
|
|
843
|
1126
|
|
|
|
|
1430
|
foreach (qw(WordScore WordAbstScore WordPropDNA WordPropDegen)) { |
844
|
4504
|
|
|
|
|
8734
|
$rh_word->{$_} = formatScoreProp( $rh_word->{$_} ); |
845
|
|
|
|
|
|
|
} |
846
|
|
|
|
|
|
|
} |
847
|
12
|
50
|
|
|
|
48
|
if ($self->{verbose} > 0) { |
848
|
61
|
|
|
|
|
220
|
$self->{abst}{AbstComment} .= join '; ', map { |
849
|
12
|
|
|
|
|
42
|
defined $self->{abst}{"ra_$_"} && |
850
|
|
|
|
|
|
|
ref $self->{abst}{"ra_$_"} eq 'ARRAY' ? |
851
|
84
|
100
|
66
|
|
|
560
|
"$_='" . join(", ", @{ $self->{abst}{"ra_$_"} }) . "'" : "$_=''" |
852
|
|
|
|
|
|
|
} |
853
|
|
|
|
|
|
|
qw( |
854
|
|
|
|
|
|
|
AbstNumAb AbstNumBind AbstNumDigest AbstNumMHC AbstNumPeptide |
855
|
|
|
|
|
|
|
AbstNumPhage AbstNumProtease |
856
|
|
|
|
|
|
|
); |
857
|
|
|
|
|
|
|
} |
858
|
12
|
|
|
|
|
50
|
return 1; |
859
|
|
|
|
|
|
|
} |
860
|
|
|
|
|
|
|
|
861
|
|
|
|
|
|
|
=head2 AbstNum* |
862
|
|
|
|
|
|
|
|
863
|
|
|
|
|
|
|
All AbstNum* methods use the same general scheme, unless specified otherwise. Each method refers to a specific class of patterns, eg AbstNumAb() refers to antibody related patterns. Each method looks searches the AbstLcAllText field (concatenated abstract, mesh terms, chemical terms) for matches to the class of patterns. The matches are stored in an array ref (eg $self->{abst}{ra_AbstNumAb}). The number of matches is stored in a scalar ref (eg, $self->{abst}{AbstNumAb}). |
864
|
|
|
|
|
|
|
|
865
|
|
|
|
|
|
|
Pattern matching is done in several steps: patterns that can match anywhere in the word, and patterns that must match the entire word (typically shorter patterns, like 'mab' or 'fab'). Often, there are additional steps. Step 1 pattern matches are mandatory - if there are no matches at step 1, then step 2 is skipped. If step 1 succeeds, then step 2 (optional) pattern matches are done. For example, for AbstNumPeptide(), step 1 patterns include 'peptid', step 2 - 'synthe'. This is because 'synthe' by itself often refers to non-peptides, but together with 'peptid' matches often refers to peptides. |
866
|
|
|
|
|
|
|
|
867
|
|
|
|
|
|
|
=cut |
868
|
|
|
|
|
|
|
|
869
|
|
|
|
|
|
|
=head2 AbstNumAb |
870
|
|
|
|
|
|
|
|
871
|
|
|
|
|
|
|
Args : none |
872
|
|
|
|
|
|
|
Example : $parser->AbstNumAb or return; |
873
|
|
|
|
|
|
|
Description : See package DESCRIPTION above. |
874
|
|
|
|
|
|
|
Returns : TRUE if successful, FALSE otherwise. |
875
|
|
|
|
|
|
|
|
876
|
|
|
|
|
|
|
=cut |
877
|
|
|
|
|
|
|
|
878
|
|
|
|
|
|
|
sub AbstNumAb { |
879
|
12
|
|
|
12
|
1
|
20
|
my ($self) = @_; |
880
|
12
|
50
|
|
|
|
84
|
return unless defined $self->{abst}{AbstLcAllText}; |
881
|
12
|
|
|
|
|
219
|
(my $var = (caller(0))[3]) =~ s/.*:://; |
882
|
12
|
|
|
|
|
28
|
my @matches; |
883
|
12
|
|
|
|
|
32
|
for ($self->{abst}{AbstLcAllText}) { |
884
|
12
|
|
|
|
|
663
|
push @matches, |
885
|
|
|
|
|
|
|
m[ |
886
|
|
|
|
|
|
|
adjuvant| |
887
|
|
|
|
|
|
|
antibod| |
888
|
|
|
|
|
|
|
antigen| |
889
|
|
|
|
|
|
|
antiser| |
890
|
|
|
|
|
|
|
epitope| |
891
|
|
|
|
|
|
|
freund| |
892
|
|
|
|
|
|
|
heavy\ chain| |
893
|
|
|
|
|
|
|
immun| |
894
|
|
|
|
|
|
|
light\ chain| |
895
|
|
|
|
|
|
|
mimotope| |
896
|
|
|
|
|
|
|
monoclonal| |
897
|
|
|
|
|
|
|
phagotope| |
898
|
|
|
|
|
|
|
polyclonal| |
899
|
|
|
|
|
|
|
recombinant\ fab| |
900
|
|
|
|
|
|
|
vaccin| |
901
|
|
|
|
|
|
|
variable\ region| |
902
|
|
|
|
|
|
|
variable\ regions |
903
|
|
|
|
|
|
|
]gx, |
904
|
|
|
|
|
|
|
m[ |
905
|
|
|
|
|
|
|
\b( |
906
|
|
|
|
|
|
|
mab| |
907
|
|
|
|
|
|
|
mabs| |
908
|
|
|
|
|
|
|
rfab| |
909
|
|
|
|
|
|
|
sera| |
910
|
|
|
|
|
|
|
serum |
911
|
|
|
|
|
|
|
)\b |
912
|
|
|
|
|
|
|
]gx; |
913
|
12
|
|
|
|
|
31
|
$self->{abst}{$var} = @matches; |
914
|
12
|
|
|
|
|
53
|
$self->{abst}{"ra_$var"} = [ @matches ]; |
915
|
|
|
|
|
|
|
} |
916
|
12
|
|
|
|
|
42
|
return 1; |
917
|
|
|
|
|
|
|
} |
918
|
|
|
|
|
|
|
|
919
|
|
|
|
|
|
|
=head2 AbstNumBind |
920
|
|
|
|
|
|
|
|
921
|
|
|
|
|
|
|
Args : none |
922
|
|
|
|
|
|
|
Example : $parser->AbstNumBind or return; |
923
|
|
|
|
|
|
|
Description : See package DESCRIPTION above. |
924
|
|
|
|
|
|
|
Returns : TRUE if successful, FALSE otherwise. |
925
|
|
|
|
|
|
|
|
926
|
|
|
|
|
|
|
=cut |
927
|
|
|
|
|
|
|
|
928
|
|
|
|
|
|
|
sub AbstNumBind { |
929
|
12
|
|
|
12
|
1
|
15
|
my ($self) = @_; |
930
|
12
|
50
|
|
|
|
38
|
return unless defined $self->{abst}{AbstLcAllText}; |
931
|
12
|
|
|
|
|
90
|
(my $var = (caller(0))[3]) =~ s/.*:://; |
932
|
12
|
|
|
|
|
21
|
my @matches; |
933
|
12
|
|
|
|
|
27
|
for ($self->{abst}{AbstLcAllText}) { |
934
|
12
|
|
|
|
|
652
|
push @matches, |
935
|
|
|
|
|
|
|
m[ |
936
|
|
|
|
|
|
|
abolish| |
937
|
|
|
|
|
|
|
activate| |
938
|
|
|
|
|
|
|
adher| |
939
|
|
|
|
|
|
|
adhes| |
940
|
|
|
|
|
|
|
affinit| |
941
|
|
|
|
|
|
|
agonise| |
942
|
|
|
|
|
|
|
agonist| |
943
|
|
|
|
|
|
|
agonize| |
944
|
|
|
|
|
|
|
associat| |
945
|
|
|
|
|
|
|
bind| |
946
|
|
|
|
|
|
|
block| |
947
|
|
|
|
|
|
|
bound| |
948
|
|
|
|
|
|
|
compet| |
949
|
|
|
|
|
|
|
disrupt| |
950
|
|
|
|
|
|
|
dissociat| |
951
|
|
|
|
|
|
|
inhibit| |
952
|
|
|
|
|
|
|
interact| |
953
|
|
|
|
|
|
|
interfere| |
954
|
|
|
|
|
|
|
ligand| |
955
|
|
|
|
|
|
|
prevent| |
956
|
|
|
|
|
|
|
recogn| |
957
|
|
|
|
|
|
|
regulat| |
958
|
|
|
|
|
|
|
suppress |
959
|
|
|
|
|
|
|
]gx, |
960
|
|
|
|
|
|
|
m[ |
961
|
|
|
|
|
|
|
\b( |
962
|
|
|
|
|
|
|
ec50| |
963
|
|
|
|
|
|
|
ic50| |
964
|
|
|
|
|
|
|
kd| |
965
|
|
|
|
|
|
|
ki |
966
|
|
|
|
|
|
|
)\b |
967
|
|
|
|
|
|
|
]gx; |
968
|
12
|
|
|
|
|
24
|
$self->{abst}{$var} = @matches; |
969
|
12
|
|
|
|
|
51
|
$self->{abst}{"ra_$var"} = [ @matches ]; |
970
|
|
|
|
|
|
|
} |
971
|
12
|
|
|
|
|
39
|
return 1; |
972
|
|
|
|
|
|
|
} |
973
|
|
|
|
|
|
|
|
974
|
|
|
|
|
|
|
{ |
975
|
|
|
|
|
|
|
# older papers are more likely to have data on protein digestion |
976
|
|
|
|
|
|
|
# into peptides which are relevant only to the protein, |
977
|
|
|
|
|
|
|
# rather than to the peptides. |
978
|
|
|
|
|
|
|
# for these time intervals, multiply by this factor |
979
|
|
|
|
|
|
|
my %AbstNumDigestScoreForYear; |
980
|
|
|
|
|
|
|
for (1900..2100) { |
981
|
|
|
|
|
|
|
if ($_ > 1993) { |
982
|
|
|
|
|
|
|
$AbstNumDigestScoreForYear{$_} = 1; |
983
|
|
|
|
|
|
|
} elsif ($_ > 1991) { |
984
|
|
|
|
|
|
|
$AbstNumDigestScoreForYear{$_} = 1.5; |
985
|
|
|
|
|
|
|
} elsif ($_ > 1990) { |
986
|
|
|
|
|
|
|
$AbstNumDigestScoreForYear{$_} = 2; |
987
|
|
|
|
|
|
|
} elsif ($_ > 1989) { |
988
|
|
|
|
|
|
|
$AbstNumDigestScoreForYear{$_} = 3; |
989
|
|
|
|
|
|
|
} else { # <= 1989 |
990
|
|
|
|
|
|
|
$AbstNumDigestScoreForYear{$_} = 4; |
991
|
|
|
|
|
|
|
} ## stopped here |
992
|
|
|
|
|
|
|
} |
993
|
|
|
|
|
|
|
|
994
|
|
|
|
|
|
|
=head2 AbstNumDigest |
995
|
|
|
|
|
|
|
|
996
|
|
|
|
|
|
|
Args : none |
997
|
|
|
|
|
|
|
Example : $parser->AbstNumDigest or return; |
998
|
|
|
|
|
|
|
Description : See package DESCRIPTION above. |
999
|
|
|
|
|
|
|
Returns : TRUE if successful, FALSE otherwise. |
1000
|
|
|
|
|
|
|
|
1001
|
|
|
|
|
|
|
=cut |
1002
|
|
|
|
|
|
|
|
1003
|
|
|
|
|
|
|
sub AbstNumDigest { |
1004
|
12
|
|
|
12
|
1
|
17
|
my ($self) = @_; |
1005
|
12
|
50
|
|
|
|
43
|
return unless defined $self->{abst}{AbstLcAllText}; |
1006
|
12
|
|
|
|
|
92
|
(my $var = (caller(0))[3]) =~ s/.*:://; |
1007
|
12
|
|
|
|
|
23
|
my @matches; |
1008
|
12
|
|
|
|
|
28
|
for ($self->{abst}{AbstLcAllText}) { |
1009
|
|
|
|
|
|
|
# do not shorten to 'trypt' to avoid confusion with tryptophan |
1010
|
12
|
|
|
|
|
364
|
push @matches, |
1011
|
|
|
|
|
|
|
m[ |
1012
|
|
|
|
|
|
|
borohydride| |
1013
|
|
|
|
|
|
|
complete\ amino\ acid\ sequence| |
1014
|
|
|
|
|
|
|
chromatogra| |
1015
|
|
|
|
|
|
|
cnbr| |
1016
|
|
|
|
|
|
|
cyanogen\ bromide| |
1017
|
|
|
|
|
|
|
dansyl| |
1018
|
|
|
|
|
|
|
digest| |
1019
|
|
|
|
|
|
|
edman| |
1020
|
|
|
|
|
|
|
hplc| |
1021
|
|
|
|
|
|
|
lsims| |
1022
|
|
|
|
|
|
|
maldi-tof| |
1023
|
|
|
|
|
|
|
mass\ spec| |
1024
|
|
|
|
|
|
|
matrix-assisted\ laser\ desorption| |
1025
|
|
|
|
|
|
|
nabh4| |
1026
|
|
|
|
|
|
|
peptide\ map| |
1027
|
|
|
|
|
|
|
protease\ digestion| |
1028
|
|
|
|
|
|
|
proteolytic\ digestion| |
1029
|
|
|
|
|
|
|
rplc| |
1030
|
|
|
|
|
|
|
spectrometry,\ mass| |
1031
|
|
|
|
|
|
|
spectrum analysis,\ mass| |
1032
|
|
|
|
|
|
|
thermolysin| |
1033
|
|
|
|
|
|
|
thermolytic| |
1034
|
|
|
|
|
|
|
tryptic| |
1035
|
|
|
|
|
|
|
trypsin| |
1036
|
|
|
|
|
|
|
v-8\ prote| |
1037
|
|
|
|
|
|
|
v8\ prote |
1038
|
|
|
|
|
|
|
]gx, |
1039
|
|
|
|
|
|
|
m[ |
1040
|
|
|
|
|
|
|
\b( |
1041
|
|
|
|
|
|
|
ms |
1042
|
|
|
|
|
|
|
)\b |
1043
|
|
|
|
|
|
|
]gx; |
1044
|
12
|
|
|
|
|
28
|
$self->{abst}{$var} = @matches; |
1045
|
12
|
|
|
|
|
35
|
$self->{abst}{"ra_$var"} = [ @matches ]; |
1046
|
12
|
|
|
|
|
31
|
for ($self->{abst}{AbstJournal}) { |
1047
|
12
|
100
|
66
|
|
|
145
|
$self->{abst}{$var} *= $AbstNumDigestScoreForYear{$1} if |
|
|
|
66
|
|
|
|
|
1048
|
|
|
|
|
|
|
defined $_ and /\b((19|20)\d\d)\b/ and $AbstNumDigestScoreForYear{$1}; |
1049
|
|
|
|
|
|
|
} |
1050
|
12
|
|
|
|
|
62
|
$self->{abst}{$var} = sprintf "%.0f", $self->{abst}{$var}; |
1051
|
|
|
|
|
|
|
} |
1052
|
12
|
|
|
|
|
39
|
return 1; |
1053
|
|
|
|
|
|
|
} |
1054
|
|
|
|
|
|
|
} |
1055
|
|
|
|
|
|
|
|
1056
|
|
|
|
|
|
|
=head2 AbstNumMHC |
1057
|
|
|
|
|
|
|
|
1058
|
|
|
|
|
|
|
Args : none |
1059
|
|
|
|
|
|
|
Example : $parser->AbstNumMHC or return; |
1060
|
|
|
|
|
|
|
Description : See package DESCRIPTION above. |
1061
|
|
|
|
|
|
|
Returns : TRUE if successful, FALSE otherwise. |
1062
|
|
|
|
|
|
|
|
1063
|
|
|
|
|
|
|
=cut |
1064
|
|
|
|
|
|
|
|
1065
|
|
|
|
|
|
|
sub AbstNumMHC { |
1066
|
12
|
|
|
12
|
1
|
58
|
my ($self) = @_; |
1067
|
12
|
50
|
|
|
|
36
|
return unless defined $self->{abst}{AbstLcAllText}; |
1068
|
12
|
|
|
|
|
89
|
(my $var = (caller(0))[3]) =~ s/.*:://; |
1069
|
12
|
|
|
|
|
28
|
my @matches; |
1070
|
12
|
|
|
|
|
29
|
for ($self->{abst}{AbstLcAllText}) { |
1071
|
12
|
|
|
|
|
695
|
push @matches, |
1072
|
|
|
|
|
|
|
m[ |
1073
|
|
|
|
|
|
|
alloge| |
1074
|
|
|
|
|
|
|
allograft| |
1075
|
|
|
|
|
|
|
antibod| |
1076
|
|
|
|
|
|
|
antigen| |
1077
|
|
|
|
|
|
|
b\ cell| |
1078
|
|
|
|
|
|
|
dendri| |
1079
|
|
|
|
|
|
|
epitope| |
1080
|
|
|
|
|
|
|
histocompatibility| |
1081
|
|
|
|
|
|
|
immun| |
1082
|
|
|
|
|
|
|
present| |
1083
|
|
|
|
|
|
|
restrict| |
1084
|
|
|
|
|
|
|
tolero| |
1085
|
|
|
|
|
|
|
t\ cell |
1086
|
|
|
|
|
|
|
]gx, |
1087
|
|
|
|
|
|
|
m[ |
1088
|
|
|
|
|
|
|
\b( |
1089
|
|
|
|
|
|
|
cd\d{1,2}| |
1090
|
|
|
|
|
|
|
class\ i{1,2}| |
1091
|
|
|
|
|
|
|
hla| |
1092
|
|
|
|
|
|
|
mhc| |
1093
|
|
|
|
|
|
|
self| |
1094
|
|
|
|
|
|
|
tcr |
1095
|
|
|
|
|
|
|
)\b |
1096
|
|
|
|
|
|
|
]gx; |
1097
|
12
|
|
|
|
|
29
|
$self->{abst}{$var} = @matches; |
1098
|
12
|
|
|
|
|
50
|
$self->{abst}{"ra_$var"} = [ @matches ]; |
1099
|
|
|
|
|
|
|
} |
1100
|
12
|
|
|
|
|
40
|
return 1; |
1101
|
|
|
|
|
|
|
} |
1102
|
|
|
|
|
|
|
|
1103
|
|
|
|
|
|
|
=head2 AbstNumPeptide |
1104
|
|
|
|
|
|
|
|
1105
|
|
|
|
|
|
|
Args : none |
1106
|
|
|
|
|
|
|
Example : $parser->AbstNumPeptide or return; |
1107
|
|
|
|
|
|
|
Description : See package DESCRIPTION above. |
1108
|
|
|
|
|
|
|
Returns : TRUE if successful, FALSE otherwise. |
1109
|
|
|
|
|
|
|
|
1110
|
|
|
|
|
|
|
=cut |
1111
|
|
|
|
|
|
|
|
1112
|
|
|
|
|
|
|
sub AbstNumPeptide { |
1113
|
12
|
|
|
12
|
1
|
18
|
my ($self) = @_; |
1114
|
12
|
50
|
|
|
|
32
|
return unless defined $self->{abst}{AbstLcAllText}; |
1115
|
12
|
|
|
|
|
94
|
(my $var = (caller(0))[3]) =~ s/.*:://; |
1116
|
12
|
|
|
|
|
24
|
my @matches; |
1117
|
|
|
|
|
|
|
my @matches_neg; |
1118
|
|
|
|
|
|
|
##cc TODO: possibly add matches to /protein/ and /fragment/ in the same sentence. |
1119
|
12
|
|
|
|
|
29
|
for ($self->{abst}{AbstLcAllText}) { |
1120
|
|
|
|
|
|
|
# step 1, mandatory matches |
1121
|
12
|
|
|
|
|
162
|
push @matches, |
1122
|
|
|
|
|
|
|
m[ |
1123
|
|
|
|
|
|
|
benzyloxycarbonyl| |
1124
|
|
|
|
|
|
|
hormone| |
1125
|
|
|
|
|
|
|
peptid |
1126
|
|
|
|
|
|
|
]gx; |
1127
|
12
|
100
|
|
|
|
38
|
next unless @matches; |
1128
|
6
|
|
|
|
|
78
|
push @matches_neg, |
1129
|
|
|
|
|
|
|
m[ |
1130
|
|
|
|
|
|
|
peptidase| |
1131
|
|
|
|
|
|
|
polypeptide |
1132
|
|
|
|
|
|
|
]gx; |
1133
|
6
|
50
|
|
|
|
36
|
next unless (scalar(@matches) - scalar(@matches_neg)) > 0; |
1134
|
|
|
|
|
|
|
# step 2, optional matches |
1135
|
6
|
|
|
|
|
1411
|
push @matches, |
1136
|
|
|
|
|
|
|
m[ |
1137
|
|
|
|
|
|
|
consensus| |
1138
|
|
|
|
|
|
|
constrain| |
1139
|
|
|
|
|
|
|
\bcyclic| |
1140
|
|
|
|
|
|
|
librar| |
1141
|
|
|
|
|
|
|
linear| |
1142
|
|
|
|
|
|
|
motif| |
1143
|
|
|
|
|
|
|
sequenc| |
1144
|
|
|
|
|
|
|
synthe |
1145
|
|
|
|
|
|
|
]gx, |
1146
|
|
|
|
|
|
|
m[ |
1147
|
|
|
|
|
|
|
\b( |
1148
|
|
|
|
|
|
|
ac| |
1149
|
|
|
|
|
|
|
acetyl| |
1150
|
|
|
|
|
|
|
amide| |
1151
|
|
|
|
|
|
|
boc| |
1152
|
|
|
|
|
|
|
ch2nh| |
1153
|
|
|
|
|
|
|
conh| |
1154
|
|
|
|
|
|
|
cyclo| |
1155
|
|
|
|
|
|
|
dansyl| |
1156
|
|
|
|
|
|
|
h2n| |
1157
|
|
|
|
|
|
|
nh| |
1158
|
|
|
|
|
|
|
nh2| |
1159
|
|
|
|
|
|
|
ome| |
1160
|
|
|
|
|
|
|
tos |
1161
|
|
|
|
|
|
|
)\b |
1162
|
|
|
|
|
|
|
]gx; |
1163
|
6
|
|
|
|
|
20
|
$self->{abst}{$var} = scalar(@matches) - scalar(@matches_neg); |
1164
|
6
|
|
|
|
|
38
|
$self->{abst}{"ra_$var"} = [ @matches ]; |
1165
|
|
|
|
|
|
|
} |
1166
|
12
|
|
|
|
|
40
|
return 1; |
1167
|
|
|
|
|
|
|
} |
1168
|
|
|
|
|
|
|
|
1169
|
|
|
|
|
|
|
=head2 AbstNumPhage |
1170
|
|
|
|
|
|
|
|
1171
|
|
|
|
|
|
|
Args : none |
1172
|
|
|
|
|
|
|
Example : $parser->AbstNumPhage or return; |
1173
|
|
|
|
|
|
|
Description : See package DESCRIPTION above. |
1174
|
|
|
|
|
|
|
Returns : TRUE if successful, FALSE otherwise. |
1175
|
|
|
|
|
|
|
|
1176
|
|
|
|
|
|
|
=cut |
1177
|
|
|
|
|
|
|
|
1178
|
|
|
|
|
|
|
sub AbstNumPhage { |
1179
|
12
|
|
|
12
|
1
|
17
|
my ($self) = @_; |
1180
|
12
|
50
|
|
|
|
36
|
return unless defined $self->{abst}{AbstLcAllText}; |
1181
|
12
|
|
|
|
|
85
|
(my $var = (caller(0))[3]) =~ s/.*:://; |
1182
|
12
|
|
|
|
|
24
|
my @matches; |
1183
|
|
|
|
|
|
|
my @matches_neg; |
1184
|
12
|
|
|
|
|
31
|
for ($self->{abst}{AbstLcAllText}) { |
1185
|
|
|
|
|
|
|
# step 1, mandatory matches |
1186
|
12
|
|
|
|
|
123
|
push @matches, |
1187
|
|
|
|
|
|
|
m[ |
1188
|
|
|
|
|
|
|
bio-pan| |
1189
|
|
|
|
|
|
|
biopan| |
1190
|
|
|
|
|
|
|
phagemid| |
1191
|
|
|
|
|
|
|
phage\b| |
1192
|
|
|
|
|
|
|
phages\b |
1193
|
|
|
|
|
|
|
]gx, |
1194
|
|
|
|
|
|
|
m[ |
1195
|
|
|
|
|
|
|
\b( |
1196
|
|
|
|
|
|
|
panned| # match panning related, but not spanning, pan-caspase, etc |
1197
|
|
|
|
|
|
|
panning| |
1198
|
|
|
|
|
|
|
pannings| |
1199
|
|
|
|
|
|
|
pans |
1200
|
|
|
|
|
|
|
)\b |
1201
|
|
|
|
|
|
|
]gx; |
1202
|
12
|
100
|
|
|
|
36
|
next unless @matches; |
1203
|
5
|
|
|
|
|
25
|
push @matches_neg, |
1204
|
|
|
|
|
|
|
m[ |
1205
|
|
|
|
|
|
|
macrophage |
1206
|
|
|
|
|
|
|
]gx; |
1207
|
5
|
50
|
|
|
|
18
|
next unless (scalar(@matches) - scalar(@matches_neg)) > 0; |
1208
|
|
|
|
|
|
|
# step 2, optional matches |
1209
|
5
|
|
|
|
|
162
|
push @matches, |
1210
|
|
|
|
|
|
|
m[ |
1211
|
|
|
|
|
|
|
clone| |
1212
|
|
|
|
|
|
|
cloni| |
1213
|
|
|
|
|
|
|
display| |
1214
|
|
|
|
|
|
|
librar| |
1215
|
|
|
|
|
|
|
screen| |
1216
|
|
|
|
|
|
|
select |
1217
|
|
|
|
|
|
|
]gx, |
1218
|
|
|
|
|
|
|
m[ |
1219
|
|
|
|
|
|
|
\b( |
1220
|
|
|
|
|
|
|
round| |
1221
|
|
|
|
|
|
|
rounds |
1222
|
|
|
|
|
|
|
)\b |
1223
|
|
|
|
|
|
|
]gx; |
1224
|
5
|
|
|
|
|
14
|
$self->{abst}{$var} = scalar(@matches) - scalar(@matches_neg); |
1225
|
5
|
|
|
|
|
30
|
$self->{abst}{"ra_$var"} = [ @matches ]; |
1226
|
|
|
|
|
|
|
|
1227
|
|
|
|
|
|
|
} |
1228
|
12
|
|
|
|
|
38
|
return 1; |
1229
|
|
|
|
|
|
|
} |
1230
|
|
|
|
|
|
|
|
1231
|
|
|
|
|
|
|
=head2 AbstNumProtease |
1232
|
|
|
|
|
|
|
|
1233
|
|
|
|
|
|
|
Args : none |
1234
|
|
|
|
|
|
|
Example : $parser->AbstNumProtease or return; |
1235
|
|
|
|
|
|
|
Description : See package DESCRIPTION above. |
1236
|
|
|
|
|
|
|
Returns : TRUE if successful, FALSE otherwise. |
1237
|
|
|
|
|
|
|
|
1238
|
|
|
|
|
|
|
=cut |
1239
|
|
|
|
|
|
|
|
1240
|
|
|
|
|
|
|
sub AbstNumProtease { |
1241
|
12
|
|
|
12
|
1
|
19
|
my ($self) = @_; |
1242
|
12
|
50
|
|
|
|
39
|
return unless defined $self->{abst}{AbstLcAllText}; |
1243
|
12
|
|
|
|
|
90
|
(my $var = (caller(0))[3]) =~ s/.*:://; |
1244
|
12
|
|
|
|
|
23
|
my @matches; |
1245
|
12
|
|
|
|
|
33
|
for ($self->{abst}{AbstLcAllText}) { |
1246
|
12
|
|
|
|
|
199
|
push @matches, |
1247
|
|
|
|
|
|
|
m[ |
1248
|
|
|
|
|
|
|
aminobenzoyl| |
1249
|
|
|
|
|
|
|
caspase| |
1250
|
|
|
|
|
|
|
cathepsin| |
1251
|
|
|
|
|
|
|
convertase| |
1252
|
|
|
|
|
|
|
dinitrophenyl| |
1253
|
|
|
|
|
|
|
nitroanilide| |
1254
|
|
|
|
|
|
|
peptidase| |
1255
|
|
|
|
|
|
|
protease| |
1256
|
|
|
|
|
|
|
proteasom| |
1257
|
|
|
|
|
|
|
proteinase| |
1258
|
|
|
|
|
|
|
proteol |
1259
|
|
|
|
|
|
|
]gx; |
1260
|
12
|
100
|
|
|
|
41
|
next unless @matches; |
1261
|
|
|
|
|
|
|
# match: MMP, MMPs MMP1, MMP-1, but not MMPFOO |
1262
|
|
|
|
|
|
|
# match: Dnp-Met |
1263
|
2
|
|
|
|
|
847
|
push @matches, |
1264
|
|
|
|
|
|
|
m[ |
1265
|
|
|
|
|
|
|
(?:\b|[^a-z])( |
1266
|
|
|
|
|
|
|
abz| |
1267
|
|
|
|
|
|
|
amc |
1268
|
|
|
|
|
|
|
dnp| |
1269
|
|
|
|
|
|
|
dpa| |
1270
|
|
|
|
|
|
|
eddnp| |
1271
|
|
|
|
|
|
|
fmk| |
1272
|
|
|
|
|
|
|
mca| |
1273
|
|
|
|
|
|
|
mmp| |
1274
|
|
|
|
|
|
|
mmps |
1275
|
|
|
|
|
|
|
)(?:\b|[^a-z]) |
1276
|
|
|
|
|
|
|
]gx; |
1277
|
2
|
|
|
|
|
6
|
$self->{abst}{$var} = @matches; |
1278
|
2
|
|
|
|
|
13
|
$self->{abst}{"ra_$var"} = [ @matches ]; |
1279
|
|
|
|
|
|
|
} |
1280
|
12
|
|
|
|
|
36
|
return 1; |
1281
|
|
|
|
|
|
|
} |
1282
|
|
|
|
|
|
|
|
1283
|
|
|
|
|
|
|
=head2 AbstScore |
1284
|
|
|
|
|
|
|
|
1285
|
|
|
|
|
|
|
Args : none |
1286
|
|
|
|
|
|
|
Example : $parser->AbstScore or return; |
1287
|
|
|
|
|
|
|
Description : Computes AbstScore based on the results of Abst* vars in the abstract. |
1288
|
|
|
|
|
|
|
Returns : TRUE if successful, FALSE otherwise. |
1289
|
|
|
|
|
|
|
|
1290
|
|
|
|
|
|
|
=cut |
1291
|
|
|
|
|
|
|
|
1292
|
|
|
|
|
|
|
sub AbstScore { |
1293
|
12
|
|
|
12
|
1
|
17
|
my ($self) = @_; |
1294
|
12
|
|
|
|
|
19
|
my $rh_abst = $self->{abst}; |
1295
|
|
|
|
|
|
|
# Split with saving the original whitespace, which is then added back |
1296
|
|
|
|
|
|
|
# during join after marking up is done, in AbstMtext(). |
1297
|
|
|
|
|
|
|
# This preserves the original character positions. |
1298
|
12
|
|
|
|
|
596
|
@{ $rh_abst->{ra_WordOrig} } = split /(\s+)/, $rh_abst->{AbstAbstract}; |
|
12
|
|
|
|
|
361
|
|
1299
|
|
|
|
|
|
|
|
1300
|
12
|
|
|
|
|
69
|
$rh_abst->{AbstNumWords} = grep /\S/, @{ $rh_abst->{ra_WordOrig} }; |
|
12
|
|
|
|
|
572
|
|
1301
|
12
|
100
|
|
|
|
16
|
$rh_abst->{AbstNumAllCap} = @{[grep { /[A-Z]/ and not /[a-z]/ } |
|
2160
|
|
|
|
|
5015
|
|
|
12
|
|
|
|
|
28
|
|
1302
|
12
|
|
|
|
|
15
|
@{ $rh_abst->{ra_WordOrig} } ]}; |
1303
|
|
|
|
|
|
|
# initialize: |
1304
|
12
|
|
|
|
|
31
|
$rh_abst->{AbstScore} = 0; |
1305
|
|
|
|
|
|
|
# linear terms: |
1306
|
12
|
|
|
|
|
41
|
$rh_abst->{AbstScore} += |
1307
|
|
|
|
|
|
|
(-1) * $rh_abst->{AbstNumDigest} + $rh_abst->{AbstNumPeptide} + $rh_abst->{AbstNumPhage}; |
1308
|
12
|
50
|
|
|
|
40
|
warn "\$rh_abst->{AbstScore}=$rh_abst->{AbstScore}" if $self->{verbose} > 2; |
1309
|
|
|
|
|
|
|
|
1310
|
|
|
|
|
|
|
# interaction terms (score increases more if both vars increase simultaneously) |
1311
|
12
|
|
|
|
|
44
|
$rh_abst->{AbstScore} += |
1312
|
|
|
|
|
|
|
$rh_abst->{AbstNumBind} * |
1313
|
|
|
|
|
|
|
($rh_abst->{AbstNumPeptide} + $rh_abst->{AbstNumPhage}) + |
1314
|
|
|
|
|
|
|
$rh_abst->{AbstNumProtease} * |
1315
|
|
|
|
|
|
|
($rh_abst->{AbstNumPeptide} + $rh_abst->{AbstNumPhage}) + |
1316
|
|
|
|
|
|
|
$rh_abst->{AbstNumPeptide} * $rh_abst->{AbstNumPhage}; |
1317
|
12
|
50
|
|
|
|
35
|
warn "\$rh_abst->{AbstScore}=$rh_abst->{AbstScore}" if $self->{verbose} > 2; |
1318
|
|
|
|
|
|
|
|
1319
|
|
|
|
|
|
|
# abstracts with too many all cap words are more likely to have all cap |
1320
|
|
|
|
|
|
|
# non-sequence words rather than true sequences. |
1321
|
12
|
|
100
|
|
|
54
|
$rh_abst->{AbstPropAllCap} = $rh_abst->{AbstNumAllCap} / ($rh_abst->{AbstNumWords} || 1); |
1322
|
12
|
100
|
66
|
|
|
68
|
if ($rh_abst->{AbstPropAllCap} > 0.3 || $rh_abst->{AbstNumAllCap} > 30) { |
1323
|
2
|
|
|
|
|
6
|
$rh_abst->{AbstScore} *= 0.01; # strong penalty if above threshold |
1324
|
|
|
|
|
|
|
} else { |
1325
|
10
|
|
|
|
|
23
|
$rh_abst->{AbstScore} *= (1 - $rh_abst->{AbstPropAllCap}); |
1326
|
|
|
|
|
|
|
} |
1327
|
12
|
50
|
|
|
|
42
|
warn "\$rh_abst->{AbstScore}=$rh_abst->{AbstScore}" if $self->{verbose} > 2; |
1328
|
|
|
|
|
|
|
# AbstScore < 0 can happen for hi AbstNumDigest: |
1329
|
12
|
50
|
|
|
|
31
|
$rh_abst->{AbstScore} = 0 if $rh_abst->{AbstScore} < 0; |
1330
|
|
|
|
|
|
|
# transform (0, +inf) to (0,1): |
1331
|
12
|
|
|
|
|
30
|
$rh_abst->{AbstScore} = $rh_abst->{AbstScore} / (1 + $rh_abst->{AbstScore}); |
1332
|
12
|
50
|
|
|
|
29
|
warn "\$rh_abst->{AbstScore}=$rh_abst->{AbstScore}" if $self->{verbose} > 2; |
1333
|
12
|
50
|
|
|
|
29
|
carp((caller(0))[3], "(@_)", ' ', Data::Dumper->Dump([$self], ['self'])) |
1334
|
|
|
|
|
|
|
if $self->{verbose} > 2; |
1335
|
12
|
|
|
|
|
32
|
return 1; |
1336
|
|
|
|
|
|
|
} |
1337
|
|
|
|
|
|
|
|
1338
|
|
|
|
|
|
|
{ |
1339
|
|
|
|
|
|
|
|
1340
|
|
|
|
|
|
|
my %word_default = |
1341
|
|
|
|
|
|
|
( |
1342
|
|
|
|
|
|
|
WordAbstScore => 0, |
1343
|
|
|
|
|
|
|
WordAaSymbols => 0, |
1344
|
|
|
|
|
|
|
WordIsDNA => 0, |
1345
|
|
|
|
|
|
|
WordIsDNALen => 0, |
1346
|
|
|
|
|
|
|
WordIsDict => 0, |
1347
|
|
|
|
|
|
|
WordIsGene => 0, |
1348
|
|
|
|
|
|
|
WordIsRomanLen => 0, |
1349
|
|
|
|
|
|
|
WordNumDegen => 0, |
1350
|
|
|
|
|
|
|
WordOrig => undef, |
1351
|
|
|
|
|
|
|
WordPropDNA => 0, |
1352
|
|
|
|
|
|
|
WordPropDegen => 0, |
1353
|
|
|
|
|
|
|
WordPropProtein => 0, |
1354
|
|
|
|
|
|
|
WordSeqLen => 0, |
1355
|
|
|
|
|
|
|
WordScore => 0, |
1356
|
|
|
|
|
|
|
WordSequence => undef, |
1357
|
|
|
|
|
|
|
WordSubLenMax => 0, |
1358
|
|
|
|
|
|
|
##cc provide defaults for all , to be used for 3 letter and full names of aa. |
1359
|
|
|
|
|
|
|
); |
1360
|
|
|
|
|
|
|
|
1361
|
|
|
|
|
|
|
=head2 Words |
1362
|
|
|
|
|
|
|
|
1363
|
|
|
|
|
|
|
Args : none |
1364
|
|
|
|
|
|
|
Example : $parser->Words or return; |
1365
|
|
|
|
|
|
|
Description : Parses all peptide sequences from abstract text in AbstAbstract. |
1366
|
|
|
|
|
|
|
Calls the necessary methods, and stores results in array ref |
1367
|
|
|
|
|
|
|
$parser->{words}. |
1368
|
|
|
|
|
|
|
Returns : TRUE if successful, FALSE otherwise. |
1369
|
|
|
|
|
|
|
|
1370
|
|
|
|
|
|
|
=cut |
1371
|
|
|
|
|
|
|
|
1372
|
|
|
|
|
|
|
sub Words { |
1373
|
12
|
|
|
12
|
1
|
24
|
my ($self) = @_; |
1374
|
12
|
|
|
|
|
16
|
my @words; # each el is a ref to hash with data |
1375
|
12
|
100
|
66
|
|
|
48
|
if ($self->{WordNumPrintMin} == 1 and $self->{WordNumPrintMax} == 1) { |
1376
|
|
|
|
|
|
|
# need exactly 1 empty word |
1377
|
5
|
|
|
|
|
66
|
$self->{words} = [ { %word_default } ]; |
1378
|
5
|
|
|
|
|
24
|
return 1; |
1379
|
|
|
|
|
|
|
} |
1380
|
7
|
|
|
|
|
10
|
my $idx = 0; |
1381
|
7
|
|
|
|
|
8
|
my @WordOrig = @{ $self->{abst}{ra_WordOrig} }; |
|
7
|
|
|
|
|
154
|
|
1382
|
7
|
|
|
|
|
18
|
foreach (@WordOrig) { |
1383
|
1121
|
|
|
|
|
10325
|
my $rh_word = { %word_default }; |
1384
|
1121
|
|
|
|
|
3265
|
$rh_word->{WordOrig} = $_; |
1385
|
1121
|
|
|
|
|
2034
|
$rh_word->{WordLcOrig} = lc $_; |
1386
|
1121
|
|
|
|
|
1591
|
$rh_word->{WordIdx} = $idx++; |
1387
|
1121
|
100
|
|
|
|
3921
|
if (not /\S/) { |
|
|
100
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
1388
|
|
|
|
|
|
|
# do nothing |
1389
|
|
|
|
|
|
|
} |
1390
|
|
|
|
|
|
|
elsif ( $_ = $self->aa3_to_aa1( $rh_word->{WordOrig} ) ) { |
1391
|
6
|
|
|
|
|
13
|
$rh_word->{WordAaSymbols} = 3; # 3 letter aa symbols (and full names, too) |
1392
|
6
|
|
|
|
|
12
|
$rh_word->{WordSequence} = $_; |
1393
|
6
|
50
|
|
|
|
18
|
$self->WordVars($rh_word) or return; |
1394
|
|
|
|
|
|
|
} |
1395
|
|
|
|
|
|
|
elsif ( $_ = $self->to_aa1( $rh_word->{WordOrig} ) ) { |
1396
|
72
|
|
|
|
|
105
|
$rh_word->{WordAaSymbols} = 1; # 1 letter aa symbols: |
1397
|
72
|
|
|
|
|
111
|
$rh_word->{WordSequence} = $_; |
1398
|
72
|
50
|
|
|
|
157
|
$self->WordVars($rh_word) or return; |
1399
|
|
|
|
|
|
|
} |
1400
|
|
|
|
|
|
|
# else: has non-whitespace chars, but is not a sequence: do nothing |
1401
|
1121
|
50
|
|
|
|
2274
|
$self->WordScore($rh_word) or return; |
1402
|
1121
|
|
|
|
|
2051
|
push @words, $rh_word; |
1403
|
|
|
|
|
|
|
} |
1404
|
7
|
50
|
33
|
|
|
37
|
if ($self->{WordNumPrintMin} == 1 and not @words) { # need at least 1 word |
1405
|
0
|
|
|
|
|
0
|
push @words, { %word_default }; |
1406
|
|
|
|
|
|
|
} |
1407
|
7
|
|
|
|
|
239
|
$self->{words} = [ @words ]; |
1408
|
7
|
|
|
|
|
148
|
return 1; |
1409
|
|
|
|
|
|
|
} |
1410
|
|
|
|
|
|
|
} |
1411
|
|
|
|
|
|
|
|
1412
|
|
|
|
|
|
|
{ |
1413
|
|
|
|
|
|
|
# See more: Nomenclature and Symbolism for Amino Acids and Peptides |
1414
|
|
|
|
|
|
|
# http://www.chem.qmul.ac.uk/iupac/AminoAcid/A2021.html |
1415
|
|
|
|
|
|
|
# Amino acids below are converted to X for blasting, no need to worry here. |
1416
|
|
|
|
|
|
|
|
1417
|
|
|
|
|
|
|
# keep order for correct substitution below. Generally, longer patterns go first, so that less non-replaced chars remain, so that there is less to clean up later. |
1418
|
|
|
|
|
|
|
my @aa3_to_aa1 = |
1419
|
|
|
|
|
|
|
( |
1420
|
|
|
|
|
|
|
# full names: |
1421
|
|
|
|
|
|
|
# Omitted: B, U, Z |
1422
|
|
|
|
|
|
|
alanine => 'A', |
1423
|
|
|
|
|
|
|
cysteine => 'C', |
1424
|
|
|
|
|
|
|
cystein => 'C', # alternative spelling |
1425
|
|
|
|
|
|
|
aspartic => 'D', |
1426
|
|
|
|
|
|
|
aspartate => 'D', |
1427
|
|
|
|
|
|
|
glutamic => 'E', |
1428
|
|
|
|
|
|
|
glutamate => 'E', |
1429
|
|
|
|
|
|
|
phenylalanine => 'F', |
1430
|
|
|
|
|
|
|
glycine => 'G', |
1431
|
|
|
|
|
|
|
histidine => 'H', |
1432
|
|
|
|
|
|
|
isoleucine => 'I', |
1433
|
|
|
|
|
|
|
lysine => 'K', |
1434
|
|
|
|
|
|
|
leucine => 'L', |
1435
|
|
|
|
|
|
|
methionine => 'M', |
1436
|
|
|
|
|
|
|
asparagine => 'N', |
1437
|
|
|
|
|
|
|
proline => 'P', |
1438
|
|
|
|
|
|
|
glutamine => 'Q', |
1439
|
|
|
|
|
|
|
arginine => 'R', |
1440
|
|
|
|
|
|
|
serine => 'S', |
1441
|
|
|
|
|
|
|
threonine => 'T', |
1442
|
|
|
|
|
|
|
selenocysteine => 'U', |
1443
|
|
|
|
|
|
|
valine => 'V', |
1444
|
|
|
|
|
|
|
tryptophane => 'W', # alternative spelling |
1445
|
|
|
|
|
|
|
tryptophan => 'W', |
1446
|
|
|
|
|
|
|
tyrosine => 'Y', |
1447
|
|
|
|
|
|
|
|
1448
|
|
|
|
|
|
|
# full names, 'yl' endings: |
1449
|
|
|
|
|
|
|
# matches: |
1450
|
|
|
|
|
|
|
# glycyl-L-prolyl-L-glutamic acid; |
1451
|
|
|
|
|
|
|
# arginyl glycyl aspartyl serine |
1452
|
|
|
|
|
|
|
alanyl => 'A', |
1453
|
|
|
|
|
|
|
cysteinyl => 'C', |
1454
|
|
|
|
|
|
|
aspartyl => 'D', |
1455
|
|
|
|
|
|
|
glutamyl => 'E', |
1456
|
|
|
|
|
|
|
phenylalanyl => 'F', |
1457
|
|
|
|
|
|
|
glycyl => 'G', |
1458
|
|
|
|
|
|
|
histidyl => 'H', |
1459
|
|
|
|
|
|
|
isoleucyl => 'I', |
1460
|
|
|
|
|
|
|
lysyl => 'K', |
1461
|
|
|
|
|
|
|
leucyl => 'L', |
1462
|
|
|
|
|
|
|
methionyl => 'M', |
1463
|
|
|
|
|
|
|
asparaginyl => 'N', |
1464
|
|
|
|
|
|
|
prolyl => 'P', |
1465
|
|
|
|
|
|
|
glutaminyl => 'Q', |
1466
|
|
|
|
|
|
|
arginyl => 'R', |
1467
|
|
|
|
|
|
|
seryl => 'S', |
1468
|
|
|
|
|
|
|
threonyl => 'T', |
1469
|
|
|
|
|
|
|
selenocysteinyl => 'U', |
1470
|
|
|
|
|
|
|
valyl => 'V', |
1471
|
|
|
|
|
|
|
tryptophyl => 'W', |
1472
|
|
|
|
|
|
|
tyrosyl => 'Y', |
1473
|
|
|
|
|
|
|
|
1474
|
|
|
|
|
|
|
# 3 letter aa symbols to 1 letter. |
1475
|
|
|
|
|
|
|
Ala => 'A', |
1476
|
|
|
|
|
|
|
Cys => 'C', |
1477
|
|
|
|
|
|
|
Asp => 'D', |
1478
|
|
|
|
|
|
|
Glu => 'E', |
1479
|
|
|
|
|
|
|
Phe => 'F', |
1480
|
|
|
|
|
|
|
Gly => 'G', |
1481
|
|
|
|
|
|
|
His => 'H', |
1482
|
|
|
|
|
|
|
Ile => 'I', |
1483
|
|
|
|
|
|
|
Lys => 'K', |
1484
|
|
|
|
|
|
|
Leu => 'L', |
1485
|
|
|
|
|
|
|
Met => 'M', |
1486
|
|
|
|
|
|
|
Asn => 'N', |
1487
|
|
|
|
|
|
|
Pro => 'P', |
1488
|
|
|
|
|
|
|
Gln => 'Q', |
1489
|
|
|
|
|
|
|
Arg => 'R', |
1490
|
|
|
|
|
|
|
Ser => 'S', |
1491
|
|
|
|
|
|
|
Thr => 'T', |
1492
|
|
|
|
|
|
|
Sec => 'U', |
1493
|
|
|
|
|
|
|
Val => 'V', |
1494
|
|
|
|
|
|
|
Trp => 'W', |
1495
|
|
|
|
|
|
|
Tyr => 'Y', |
1496
|
|
|
|
|
|
|
|
1497
|
|
|
|
|
|
|
# non-standard amino acids are all converted to X |
1498
|
|
|
|
|
|
|
Aad => 'X', |
1499
|
|
|
|
|
|
|
Abu => 'X', |
1500
|
|
|
|
|
|
|
Aib => 'X', |
1501
|
|
|
|
|
|
|
Ahx => 'X', |
1502
|
|
|
|
|
|
|
Ape => 'X', |
1503
|
|
|
|
|
|
|
Apm => 'X', |
1504
|
|
|
|
|
|
|
Asx => 'X', # B in http://www.chem.qmul.ac.uk/iupac/AminoAcid/A2021.html |
1505
|
|
|
|
|
|
|
Cit => 'X', |
1506
|
|
|
|
|
|
|
Dab => 'X', |
1507
|
|
|
|
|
|
|
Dap => 'X', |
1508
|
|
|
|
|
|
|
Dpm => 'X', |
1509
|
|
|
|
|
|
|
Dpr => 'X', |
1510
|
|
|
|
|
|
|
Glx => 'X', # Z in http://www.chem.qmul.ac.uk/iupac/AminoAcid/A2021.html |
1511
|
|
|
|
|
|
|
Hyl => 'X', |
1512
|
|
|
|
|
|
|
Hyp => 'X', |
1513
|
|
|
|
|
|
|
Nle => 'X', |
1514
|
|
|
|
|
|
|
Nva => 'X', |
1515
|
|
|
|
|
|
|
Orn => 'X', |
1516
|
|
|
|
|
|
|
Pyr => 'X', |
1517
|
|
|
|
|
|
|
|
1518
|
|
|
|
|
|
|
# unknown amino acids |
1519
|
|
|
|
|
|
|
Xaa => 'X', # keep order as shown, so that these replacements are correct: 'Asn-x-Glu-x-x-(aromatic)-x-x-Gly', |
1520
|
|
|
|
|
|
|
# unknown amino acids, not in IUPAC, but frequently used: |
1521
|
|
|
|
|
|
|
Xxx => 'X', # such as Gly-Xxx-Gly, but some XXX or xxx may refer to X{3}, eg Gly-xxx-Gly |
1522
|
|
|
|
|
|
|
Yaa => 'X', |
1523
|
|
|
|
|
|
|
Zaa => 'X', |
1524
|
|
|
|
|
|
|
|
1525
|
|
|
|
|
|
|
); |
1526
|
|
|
|
|
|
|
|
1527
|
|
|
|
|
|
|
my %aa3_to_aa1 = @aa3_to_aa1; |
1528
|
|
|
|
|
|
|
my @aa3_to_aa1_keys = grep exists $aa3_to_aa1{$_}, @aa3_to_aa1; |
1529
|
|
|
|
|
|
|
# print to redo sort: |
1530
|
|
|
|
|
|
|
# for (sort { $aa3_to_aa1{$a} cmp $aa3_to_aa1{$b} } keys %aa3_to_aa1) { |
1531
|
|
|
|
|
|
|
# next unless length $_ == 3; |
1532
|
|
|
|
|
|
|
# print "$_ => '$aa3_to_aa1{$_}',\n"; |
1533
|
|
|
|
|
|
|
# } |
1534
|
|
|
|
|
|
|
my $aa3_to_aa1 = join "|", @aa3_to_aa1_keys; |
1535
|
|
|
|
|
|
|
my $aa3_to_aa1_re = qr!$aa3_to_aa1!; |
1536
|
|
|
|
|
|
|
|
1537
|
|
|
|
|
|
|
# enable parsing lowercase 3 letter symbols, eg 'NAc-lys-gly-gln-OH'. |
1538
|
|
|
|
|
|
|
my %aa3_to_aa1_lc = map { lc($_) => $aa3_to_aa1{$_} } keys %aa3_to_aa1; |
1539
|
|
|
|
|
|
|
my @aa3_to_aa1_keys_lc = map { lc $_ } @aa3_to_aa1_keys; |
1540
|
|
|
|
|
|
|
my $aa3_to_aa1_lc = join "|", @aa3_to_aa1_keys_lc; |
1541
|
|
|
|
|
|
|
my $aa3_to_aa1_lc_re = qr!$aa3_to_aa1_lc!; |
1542
|
|
|
|
|
|
|
|
1543
|
|
|
|
|
|
|
=head2 aa3_to_aa1 |
1544
|
|
|
|
|
|
|
|
1545
|
|
|
|
|
|
|
Args : string with a peptide sequence |
1546
|
|
|
|
|
|
|
Example : $str = $self->aa3_to_aa1($str) |
1547
|
|
|
|
|
|
|
Description : finds the first string that looks like protein |
1548
|
|
|
|
|
|
|
sequence in 3 letter amino acid symbols or full |
1549
|
|
|
|
|
|
|
names. Cleans (remove non-informative parens, -, /, |
1550
|
|
|
|
|
|
|
etc), and returns the cleaned sequence. Currently, |
1551
|
|
|
|
|
|
|
full names are also handled here, but this may change. |
1552
|
|
|
|
|
|
|
Returns : sequence (TRUE) if a sequence is found, FALSE otherwise. |
1553
|
|
|
|
|
|
|
|
1554
|
|
|
|
|
|
|
=cut |
1555
|
|
|
|
|
|
|
|
1556
|
|
|
|
|
|
|
sub aa3_to_aa1 { |
1557
|
605
|
|
|
605
|
1
|
789
|
my ($self, $str) = @_; |
1558
|
|
|
|
|
|
|
# first try more frequent, uppercase, then lowercase. |
1559
|
|
|
|
|
|
|
# tag with _BeginTag_ and _EndTag_ the replaced amino acids, |
1560
|
|
|
|
|
|
|
# then use the tag to remove the rest |
1561
|
|
|
|
|
|
|
# of the chars, except for a set of allowed chars that did not |
1562
|
|
|
|
|
|
|
# need 3 letter to 1 letter conversion and thus did not get tagged. |
1563
|
|
|
|
|
|
|
# proceed if > 1 replacements , eg > 1 aa. |
1564
|
|
|
|
|
|
|
return unless |
1565
|
|
|
|
|
|
|
( |
1566
|
605
|
100
|
66
|
|
|
13258
|
($_ = $str and s!($aa3_to_aa1_re)!_BeginTag_$aa3_to_aa1{$1}_EndTag_!g > 1) or |
|
|
|
66
|
|
|
|
|
|
|
|
66
|
|
|
|
|
1567
|
|
|
|
|
|
|
($_ = $str and s!\b($aa3_to_aa1_lc_re)\b!_BeginTag_$aa3_to_aa1_lc{$1}_EndTag_!g > 1) |
1568
|
|
|
|
|
|
|
); |
1569
|
|
|
|
|
|
|
# warn "tag aa3_to_aa1: \$str=$str;" if $self->{verbose} > 2; |
1570
|
|
|
|
|
|
|
# warn "tag aa3_to_aa1: \$_=$_;" if $self->{verbose} > 2; |
1571
|
|
|
|
|
|
|
# handle x and X here, so that the method returns TRUE only if 3 letter |
1572
|
|
|
|
|
|
|
# symbols occurred. x and X do not count. |
1573
|
49
|
|
|
|
|
116
|
s!\bx\b!_BeginTag_X_EndTag_!g; |
1574
|
|
|
|
|
|
|
# neg lookbehind: tag each X, except those already tagged |
1575
|
|
|
|
|
|
|
# (eg, converted from Xaa to X) (= preceded by _), or |
1576
|
|
|
|
|
|
|
# those where X means something other than amino acid |
1577
|
|
|
|
|
|
|
# (eg, Met(OX) means Met, oxidized) (= preceded by non-X upper case letter). |
1578
|
49
|
|
|
|
|
103
|
s!(?<\![_A-WYZ])X!_BeginTag_X_EndTag_!g; |
1579
|
|
|
|
|
|
|
# warn "tag X: \$_=$_;" if $self->{verbose} > 2; |
1580
|
|
|
|
|
|
|
# change 'X(3)' to 'XXX' |
1581
|
|
|
|
|
|
|
# repeat only Xs, because X(n) usually refers to X repeated n times, |
1582
|
|
|
|
|
|
|
# while for other aa, such as G, |
1583
|
|
|
|
|
|
|
# G(n) refers to the position within the protein. |
1584
|
|
|
|
|
|
|
# Some cases where X(n) refers to position n are eliminated later if n is too large, |
1585
|
|
|
|
|
|
|
# eg X(20) results in sequence length > 20 and is assigned lower score. |
1586
|
49
|
|
|
|
|
99
|
s!_BeginTag_X_EndTag_\W*(\d)\W*!'_BeginTag_X_EndTag_' x $1!eg; |
|
8
|
|
|
|
|
33
|
|
1587
|
49
|
|
|
|
|
530
|
$_ = join '', grep { defined $_ } m!([\-/()])|_BeginTag_(.)_EndTag_!g; |
|
1078
|
|
|
|
|
1470
|
|
1588
|
49
|
|
|
|
|
150
|
return clean_seq($_); |
1589
|
|
|
|
|
|
|
} |
1590
|
|
|
|
|
|
|
} |
1591
|
|
|
|
|
|
|
|
1592
|
|
|
|
|
|
|
|
1593
|
|
|
|
|
|
|
{ |
1594
|
|
|
|
|
|
|
|
1595
|
|
|
|
|
|
|
# p = phospho, eg pY = phosphotyrosine |
1596
|
|
|
|
|
|
|
# x and X are both frequently used in motifs as any aa. |
1597
|
|
|
|
|
|
|
# B, Z, U are not included because they do not occur very frequently in aa sequences in abstracts. |
1598
|
|
|
|
|
|
|
my $aa1_allowed = '\-/()ACDEFGHIKLMNPQRSTVWYXpx'; |
1599
|
|
|
|
|
|
|
my $aa1_re = qr! |
1600
|
|
|
|
|
|
|
# not immed preceded by other letters |
1601
|
|
|
|
|
|
|
# optional N-terminus |
1602
|
|
|
|
|
|
|
(?:N\-|HN\-|H2N\-|H\(2\)N\-|[\W\d]|^) |
1603
|
|
|
|
|
|
|
# sequence, >= 2 aa |
1604
|
|
|
|
|
|
|
([$aa1_allowed]{2,}) |
1605
|
|
|
|
|
|
|
# not immed followed by other letters |
1606
|
|
|
|
|
|
|
(?:[\W\d]|\-.*|$) |
1607
|
|
|
|
|
|
|
!x; |
1608
|
|
|
|
|
|
|
|
1609
|
|
|
|
|
|
|
=head2 to_aa1 |
1610
|
|
|
|
|
|
|
|
1611
|
|
|
|
|
|
|
Args : string with a peptide sequence |
1612
|
|
|
|
|
|
|
Example : $str = $self->to_aa1($str) |
1613
|
|
|
|
|
|
|
Description : finds the first string that looks like peptide sequence in uppercase |
1614
|
|
|
|
|
|
|
1 letter amino acid symbols. Cleans (remove non-informative parens, |
1615
|
|
|
|
|
|
|
-, /, etc), and returns the cleaned sequence. |
1616
|
|
|
|
|
|
|
Returns : sequence (TRUE) if a sequence is found, FALSE otherwise. |
1617
|
|
|
|
|
|
|
|
1618
|
|
|
|
|
|
|
=cut |
1619
|
|
|
|
|
|
|
|
1620
|
|
|
|
|
|
|
sub to_aa1 { |
1621
|
576
|
|
|
576
|
1
|
813
|
my ($self, $str) = @_; |
1622
|
576
|
|
|
|
|
853
|
for ($str) { |
1623
|
576
|
100
|
|
|
|
3732
|
if (m!$aa1_re!) { |
1624
|
91
|
|
|
|
|
188
|
return clean_seq($1); |
1625
|
|
|
|
|
|
|
} |
1626
|
|
|
|
|
|
|
} |
1627
|
485
|
|
|
|
|
1369
|
return; |
1628
|
|
|
|
|
|
|
} |
1629
|
|
|
|
|
|
|
} |
1630
|
|
|
|
|
|
|
|
1631
|
|
|
|
|
|
|
=head2 clean_seq |
1632
|
|
|
|
|
|
|
|
1633
|
|
|
|
|
|
|
Args : string with a peptide sequence in 1 letter aa symbols |
1634
|
|
|
|
|
|
|
Example : $str = clean_seq($str); |
1635
|
|
|
|
|
|
|
Description : for sequence in 1 letter aa symbols, remove extra parens, etc. |
1636
|
|
|
|
|
|
|
Simple motifs are ok: A(B/C/D)E... |
1637
|
|
|
|
|
|
|
Returns : cleaned string. |
1638
|
|
|
|
|
|
|
|
1639
|
|
|
|
|
|
|
=cut |
1640
|
|
|
|
|
|
|
|
1641
|
|
|
|
|
|
|
sub clean_seq { |
1642
|
140
|
|
|
140
|
1
|
281
|
my ($str) = @_; |
1643
|
140
|
|
|
|
|
206
|
for ($str) { |
1644
|
140
|
|
|
|
|
210
|
s!\-NH[()\d]*$!!; # remove C/N terminal mark |
1645
|
140
|
|
|
|
|
420
|
s!(^[CN]'|[CN]'$)!!g; |
1646
|
140
|
|
|
|
|
172
|
s!FMK\W*$!!; # remove aa modifications, eg fluorophores |
1647
|
140
|
|
|
|
|
193
|
tr!\-!!d; # delete connecting '-', eg change 'G-E-T' to 'GET' |
1648
|
140
|
|
|
|
|
160
|
tr!p!!d; # delete p=phospho, not P=proline. lowercase only. |
1649
|
140
|
|
|
|
|
204
|
$_ = uc $_; # do this after resolving: p=phospho, not P=proline; |
1650
|
|
|
|
|
|
|
# Xxx = X, not XXX = X{3} |
1651
|
140
|
|
|
|
|
192
|
s!X\W*(\d)\W*!'X' x $1!eg; |
|
0
|
|
|
|
|
0
|
|
1652
|
140
|
|
|
|
|
226
|
$_ = clean_orphan_parens($_); |
1653
|
140
|
|
|
|
|
177
|
s!^/+!!; # clean up beginning and end |
1654
|
140
|
|
|
|
|
168
|
s!/+$!!; # clean up beginning and end |
1655
|
140
|
|
|
|
|
188
|
s!/+!/!g; # clean up repeated separator chars |
1656
|
140
|
|
|
|
|
234
|
$_ = parse_slashes($_); |
1657
|
140
|
|
|
|
|
238
|
$_ = clean_orphan_parens($_); |
1658
|
140
|
|
|
|
|
677
|
return $_; |
1659
|
|
|
|
|
|
|
} |
1660
|
|
|
|
|
|
|
} |
1661
|
|
|
|
|
|
|
|
1662
|
|
|
|
|
|
|
=head2 clean_orphan_parens |
1663
|
|
|
|
|
|
|
|
1664
|
|
|
|
|
|
|
Args : string with a sequence to clean up. |
1665
|
|
|
|
|
|
|
Example : $_ = clean_orphan_parens($_); |
1666
|
|
|
|
|
|
|
Description : removes orphan parens from a string. |
1667
|
|
|
|
|
|
|
Returns : cleaned string. |
1668
|
|
|
|
|
|
|
|
1669
|
|
|
|
|
|
|
=cut |
1670
|
|
|
|
|
|
|
|
1671
|
|
|
|
|
|
|
sub clean_orphan_parens { |
1672
|
280
|
|
|
280
|
1
|
363
|
my ($str) = @_; |
1673
|
280
|
|
|
|
|
616
|
for (my $i = 0; $i < 100; $i++) { # magic number 100 to prevent inf loops |
1674
|
1903
|
|
|
|
|
14306
|
my $num_changes = 0; |
1675
|
1903
|
|
|
|
|
2051
|
$num_changes += s!^\)+!!g; # delete wrong sided parens |
1676
|
1903
|
|
|
|
|
2858
|
$num_changes += s!\(+$!!g; # delete wrong sided parens |
1677
|
1903
|
|
|
|
|
4853
|
$num_changes += s!\(+!\(!g; # change '((GET)' to '(GET)' |
1678
|
1903
|
|
|
|
|
4674
|
$num_changes += s!\)+!\)!g; # change '(GET))' to '(GET)' |
1679
|
1903
|
|
|
|
|
3710
|
$num_changes += s!\([^A-Z]*\)!!g; # remove '(/)', '()' |
1680
|
1903
|
|
|
|
|
2333
|
$num_changes += s!^\((.)\)!$1!; # change '(G)ETRAPL' to 'GETRAPL' |
1681
|
|
|
|
|
|
|
# change 'GETR(A)PL' to 'GETRAPL' |
1682
|
|
|
|
|
|
|
# change 'GET(RAP)L' to 'GETRAPL' |
1683
|
1903
|
|
|
|
|
3027
|
$num_changes += s!\(([^()/]*)\)!$1!g; |
1684
|
|
|
|
|
|
|
# change '(GETR(A)PL)' to 'GETR(A)PL' - delete outer parens in 2 steps: |
1685
|
1903
|
|
|
|
|
2741
|
$num_changes += s!\(([^()]+)\(!$1\(!g; # change '(GETR(A)PL)' to 'GETR(A)PL)' |
1686
|
1903
|
|
|
|
|
2603
|
$num_changes += s!\)([^()]+)\)!\)$1!g; # change 'GETR(A)PL)' to 'GETR(A)PL' |
1687
|
1903
|
|
|
|
|
2068
|
my $num_parens = tr!()!!; |
1688
|
1903
|
100
|
|
|
|
3433
|
if ($num_parens == 1) { |
1689
|
13
|
|
|
|
|
71
|
$num_changes += tr!()!!d; # change 'GE(TRAPL' to 'GETRAPL' |
1690
|
|
|
|
|
|
|
} |
1691
|
1903
|
100
|
|
|
|
4769
|
last unless $num_changes; # clean stubborn sequences, eg: |
1692
|
|
|
|
|
|
|
# 'G(ET(RA)PL' => 'G(ETRAPL' (after iteration 1), 'GETRAPL' (after iteration 2). |
1693
|
|
|
|
|
|
|
} |
1694
|
280
|
|
|
|
|
537
|
return $_; |
1695
|
|
|
|
|
|
|
} |
1696
|
|
|
|
|
|
|
|
1697
|
|
|
|
|
|
|
|
1698
|
|
|
|
|
|
|
=head2 parse_slashes |
1699
|
|
|
|
|
|
|
|
1700
|
|
|
|
|
|
|
Args : string with a sequence. |
1701
|
|
|
|
|
|
|
Example : $_ = parse_slashes($_); |
1702
|
|
|
|
|
|
|
Description : handles slashes. Changes A/B/C/etc to (A/B/C/etc) |
1703
|
|
|
|
|
|
|
which means any of A, B, C, etc. Returns the resulting |
1704
|
|
|
|
|
|
|
string. Exceptions are as follows, based on the the |
1705
|
|
|
|
|
|
|
fact that the result would make no sense. If X occurs |
1706
|
|
|
|
|
|
|
next to '/' or if the result would contain a repeated |
1707
|
|
|
|
|
|
|
character. For exceptions, splits the string on |
1708
|
|
|
|
|
|
|
slashes, and returns the first longest string. |
1709
|
|
|
|
|
|
|
Returns : see above |
1710
|
|
|
|
|
|
|
|
1711
|
|
|
|
|
|
|
=cut |
1712
|
|
|
|
|
|
|
|
1713
|
|
|
|
|
|
|
sub parse_slashes { |
1714
|
140
|
|
|
140
|
1
|
150
|
$_ = $_[0]; |
1715
|
140
|
100
|
|
|
|
601
|
return $_ unless m!/!; |
1716
|
15
|
100
|
100
|
|
|
100
|
if (m!/X|X/! or has_repeats_at_slashes($_) ) { |
1717
|
3
|
|
|
|
|
5
|
my $str = ''; |
1718
|
3
|
|
|
|
|
16
|
for (split m!/+!) { |
1719
|
6
|
50
|
|
|
|
22
|
$str = $_ if length($str) < length($_); # change PXXP/PXPXP to PXPXP |
1720
|
|
|
|
|
|
|
} |
1721
|
3
|
|
|
|
|
8
|
return $str; |
1722
|
|
|
|
|
|
|
} else { |
1723
|
12
|
|
|
|
|
100
|
s!(\w(/\w)+)!($1)!g; # change PXXP/GXPXP to PXX(P/G)XPXP |
1724
|
12
|
|
|
|
|
29
|
return $_; |
1725
|
|
|
|
|
|
|
} |
1726
|
|
|
|
|
|
|
} |
1727
|
|
|
|
|
|
|
|
1728
|
|
|
|
|
|
|
|
1729
|
|
|
|
|
|
|
=head2 has_repeats_at_slashes |
1730
|
|
|
|
|
|
|
|
1731
|
|
|
|
|
|
|
Args : string with a sequence. |
1732
|
|
|
|
|
|
|
Example : print 1 if has_repeats_at_slashes($_); |
1733
|
|
|
|
|
|
|
Description : see below |
1734
|
|
|
|
|
|
|
Returns : TRUE if the string has a repeated char next to a series of |
1735
|
|
|
|
|
|
|
slashes and chars, FALSE otherwise. |
1736
|
|
|
|
|
|
|
|
1737
|
|
|
|
|
|
|
=cut |
1738
|
|
|
|
|
|
|
|
1739
|
|
|
|
|
|
|
sub has_repeats_at_slashes { |
1740
|
14
|
|
|
14
|
1
|
32
|
for ($_[0]) { |
1741
|
14
|
|
|
|
|
77
|
for (m!(\w(?:/\w)+)!g) { |
1742
|
19
|
|
|
|
|
57
|
my %seen; |
1743
|
19
|
|
|
|
|
78
|
for (/\w/g) { |
1744
|
39
|
100
|
|
|
|
274
|
return 1 if $seen{$_}++; |
1745
|
|
|
|
|
|
|
} |
1746
|
|
|
|
|
|
|
} |
1747
|
|
|
|
|
|
|
} |
1748
|
12
|
|
|
|
|
54
|
return; |
1749
|
|
|
|
|
|
|
} |
1750
|
|
|
|
|
|
|
|
1751
|
|
|
|
|
|
|
=head2 WordVars |
1752
|
|
|
|
|
|
|
|
1753
|
|
|
|
|
|
|
Args : $rh_word - ref to hash with data for the word |
1754
|
|
|
|
|
|
|
Example : $parser->WordVars($rh_word) or return; |
1755
|
|
|
|
|
|
|
Description : computes variables for WordScore, eg WordPropDegen, WordIsDNA, etc. |
1756
|
|
|
|
|
|
|
Returns : TRUE if successful, FALSE otherwise. |
1757
|
|
|
|
|
|
|
|
1758
|
|
|
|
|
|
|
=cut |
1759
|
|
|
|
|
|
|
|
1760
|
|
|
|
|
|
|
sub WordVars { |
1761
|
78
|
|
|
78
|
1
|
110
|
my ($self, $rh_word) = @_; |
1762
|
78
|
50
|
33
|
|
|
388
|
return unless $rh_word and ref $rh_word eq 'HASH'; |
1763
|
78
|
|
|
|
|
154
|
$_ = $rh_word->{WordSequence}; |
1764
|
|
|
|
|
|
|
# keep allowed chars in kmers(): 'a'..'z' in sync with |
1765
|
|
|
|
|
|
|
# WordPropProtein() args : lc tr/A-Z//cd; |
1766
|
78
|
|
|
|
|
187
|
( $rh_word->{WordAlpha} = $_ ) =~ tr/A-Z//cd; |
1767
|
|
|
|
|
|
|
|
1768
|
78
|
50
|
|
|
|
179
|
if ( $rh_word->{WordSequence} ) { |
1769
|
78
|
|
|
|
|
119
|
$rh_word->{WordSeqLen} = length $rh_word->{WordAlpha}; |
1770
|
78
|
|
|
|
|
103
|
$rh_word->{WordNumDegen} = tr/X//; |
1771
|
78
|
|
|
|
|
156
|
$rh_word->{WordNumNotDegen} = $rh_word->{WordSeqLen} - $rh_word->{WordNumDegen}; |
1772
|
78
|
|
50
|
|
|
274
|
$rh_word->{WordPropDegen} = $rh_word->{WordNumDegen} / ($rh_word->{WordSeqLen} || 1); |
1773
|
78
|
|
|
|
|
174
|
$rh_word->{WordLcSequence} = lc $rh_word->{WordSequence}; |
1774
|
|
|
|
|
|
|
} |
1775
|
|
|
|
|
|
|
|
1776
|
78
|
100
|
|
|
|
181
|
if ($rh_word->{WordAaSymbols} == 1) { |
1777
|
|
|
|
|
|
|
# Compute WordPropProtein only for 1 letter symbols, because |
1778
|
|
|
|
|
|
|
# it is important for classification if 'GET' looks like an english word, |
1779
|
|
|
|
|
|
|
# but not important if 'Gly-Glu-Thr' does. |
1780
|
72
|
|
|
|
|
210
|
$rh_word->{WordPropProtein} = $self->WordPropProtein(lc $rh_word->{WordAlpha}); |
1781
|
72
|
|
|
|
|
141
|
$rh_word->{WordIsDNALen} = tr/ACTGN//; |
1782
|
72
|
50
|
|
|
|
248
|
$rh_word->{WordPropDNA} = $rh_word->{WordIsDNALen} / $rh_word->{WordSeqLen} |
1783
|
|
|
|
|
|
|
if $rh_word->{WordSeqLen}; |
1784
|
|
|
|
|
|
|
# if all or mostly dna, eg, '(A/C)TGG' or 'WCCGGGCGGCCCC' |
1785
|
72
|
100
|
|
|
|
167
|
$rh_word->{WordIsDNA} = 1 if $rh_word->{WordPropDNA} >= 0.9; |
1786
|
|
|
|
|
|
|
} |
1787
|
78
|
|
|
|
|
211
|
return 1; |
1788
|
|
|
|
|
|
|
} |
1789
|
|
|
|
|
|
|
|
1790
|
|
|
|
|
|
|
{ |
1791
|
|
|
|
|
|
|
my $re; |
1792
|
|
|
|
|
|
|
|
1793
|
|
|
|
|
|
|
=head2 WordPropProtein |
1794
|
|
|
|
|
|
|
|
1795
|
|
|
|
|
|
|
Args : $word : string |
1796
|
|
|
|
|
|
|
Example : $rh_word->{WordPropProtein} = $self->WordPropProtein($word); |
1797
|
|
|
|
|
|
|
Description : See package DESCRIPTION above. |
1798
|
|
|
|
|
|
|
Returns : sum of log10(proportion(k-mers)) |
1799
|
|
|
|
|
|
|
|
1800
|
|
|
|
|
|
|
=cut |
1801
|
|
|
|
|
|
|
|
1802
|
|
|
|
|
|
|
sub WordPropProtein { |
1803
|
72
|
|
|
72
|
1
|
106
|
my ($self, $str) = @_; |
1804
|
72
|
|
|
|
|
86
|
my $WordPropProtein = 0; |
1805
|
72
|
100
|
|
|
|
201
|
return $WordPropProtein if (length($str) < $self->{kmer_length}); |
1806
|
66
|
|
66
|
|
|
174
|
$re ||= qr/(?=(.{$self->{kmer_length}}))/; |
1807
|
66
|
|
|
|
|
662
|
foreach ($str =~ /$re/g) { |
1808
|
218
|
50
|
|
|
|
597
|
unless (defined $log_prop_sequence{$_}) { |
1809
|
0
|
|
|
|
|
0
|
carp "not ok: undefined log_prop_sequence for $_"; |
1810
|
0
|
|
|
|
|
0
|
next; |
1811
|
|
|
|
|
|
|
} |
1812
|
218
|
50
|
|
|
|
509
|
unless (defined $log_prop_nonsequence{$_}) { |
1813
|
0
|
|
|
|
|
0
|
carp "not ok: undefined log_prop_nonsequence for $_"; |
1814
|
0
|
|
|
|
|
0
|
next; |
1815
|
|
|
|
|
|
|
} |
1816
|
218
|
|
|
|
|
768
|
$WordPropProtein += ($log_prop_sequence{$_} - $log_prop_nonsequence{$_}); |
1817
|
|
|
|
|
|
|
} |
1818
|
66
|
|
|
|
|
419
|
$WordPropProtein = sprintf("%.1f", $WordPropProtein); |
1819
|
66
|
|
|
|
|
206
|
return $WordPropProtein; |
1820
|
|
|
|
|
|
|
} |
1821
|
|
|
|
|
|
|
} |
1822
|
|
|
|
|
|
|
|
1823
|
|
|
|
|
|
|
{ |
1824
|
|
|
|
|
|
|
# WordScore factors for different values of vars, eg |
1825
|
|
|
|
|
|
|
# for WordAaSymbols = 3, multiply WordScore by 1000. |
1826
|
|
|
|
|
|
|
|
1827
|
|
|
|
|
|
|
# WordScoreForAaSymbols: |
1828
|
|
|
|
|
|
|
# Keep the ratio of 3 letter or full name symbols relative to 1 letter symbols high, |
1829
|
|
|
|
|
|
|
# because even for very short words they are more likely to be a sequence, eg: |
1830
|
|
|
|
|
|
|
# 'Pro-Ala-Pro' relative to 'PAP'. |
1831
|
|
|
|
|
|
|
my %WordScoreForAaSymbols = |
1832
|
|
|
|
|
|
|
( |
1833
|
|
|
|
|
|
|
0 => 0, |
1834
|
|
|
|
|
|
|
1 => 1, |
1835
|
|
|
|
|
|
|
3 => 1000, |
1836
|
|
|
|
|
|
|
5 => 10000, # full names (not implemented) |
1837
|
|
|
|
|
|
|
); |
1838
|
|
|
|
|
|
|
# |
1839
|
|
|
|
|
|
|
my %WordScoreForSeqLen = |
1840
|
|
|
|
|
|
|
( |
1841
|
|
|
|
|
|
|
0 => 0, |
1842
|
|
|
|
|
|
|
1 => 0, |
1843
|
|
|
|
|
|
|
2 => 0.0001, |
1844
|
|
|
|
|
|
|
3 => 0.1, |
1845
|
|
|
|
|
|
|
4 => 0.5, |
1846
|
|
|
|
|
|
|
5 => 1, |
1847
|
|
|
|
|
|
|
); |
1848
|
|
|
|
|
|
|
|
1849
|
|
|
|
|
|
|
# as WordSeqLen increases between 5 and $MAX_PEPTIDE_LENGTH, |
1850
|
|
|
|
|
|
|
# WordScore factor increases linearly from 1 to 5 |
1851
|
|
|
|
|
|
|
foreach (5..$MAX_PEPTIDE_LENGTH) { |
1852
|
|
|
|
|
|
|
$WordScoreForSeqLen{$_} = 1 + 4 * ($_ - 5) / ($MAX_PEPTIDE_LENGTH - 5); |
1853
|
|
|
|
|
|
|
} |
1854
|
|
|
|
|
|
|
my $max_penalized_length = 6; # after this length, flat rate penalty applies: |
1855
|
|
|
|
|
|
|
my $WordScoreForMatchLenMin = 3**(-$max_penalized_length); |
1856
|
|
|
|
|
|
|
# up to $max_penalized_length, penalize by this factor (divide score by 3 for each char) |
1857
|
|
|
|
|
|
|
my %WordScoreForMatchLen = map { ($_ => 3**(-$_)) } 0..$max_penalized_length; |
1858
|
|
|
|
|
|
|
|
1859
|
|
|
|
|
|
|
=head2 WordScore |
1860
|
|
|
|
|
|
|
|
1861
|
|
|
|
|
|
|
Args : $rh_word - ref to hash with word data. |
1862
|
|
|
|
|
|
|
Example : $parser->WordScore($rh_word) or return; |
1863
|
|
|
|
|
|
|
Description : See package DESCRIPTION above. |
1864
|
|
|
|
|
|
|
Returns : TRUE if successful, FALSE otherwise. |
1865
|
|
|
|
|
|
|
|
1866
|
|
|
|
|
|
|
=cut |
1867
|
|
|
|
|
|
|
|
1868
|
|
|
|
|
|
|
sub WordScore { |
1869
|
1121
|
|
|
1121
|
1
|
1475
|
my ($self, $rh_word) = @_; |
1870
|
1121
|
50
|
0
|
|
|
2269
|
ref $rh_word eq 'HASH' or |
1871
|
|
|
|
|
|
|
carp "not ok: expected WordScore arg[1] = hash ref, got $rh_word" and return; |
1872
|
1121
|
|
|
|
|
1386
|
$rh_word->{WordScore} = 1; # initialize WordScore |
1873
|
1121
|
100
|
66
|
|
|
2778
|
unless ($rh_word->{WordSequence} and $rh_word->{WordOrig}) { |
1874
|
1043
|
|
|
|
|
1385
|
$rh_word->{WordScore} = 0; |
1875
|
|
|
|
|
|
|
} |
1876
|
1121
|
50
|
|
|
|
2140
|
warn "WordScore = $rh_word->{WordScore}; WordOrig = $rh_word->{WordOrig}" if $self->{verbose} > 2; |
1877
|
1121
|
100
|
|
|
|
3400
|
return 1 unless $rh_word->{WordScore}; |
1878
|
78
|
|
|
|
|
185
|
$rh_word->{WordScore} *= $WordScoreForAaSymbols{ $rh_word->{WordAaSymbols} }; |
1879
|
78
|
50
|
|
|
|
167
|
warn "WordScore = $rh_word->{WordScore}; WordOrig = $rh_word->{WordOrig}" if $self->{verbose} > 2; |
1880
|
78
|
50
|
|
|
|
207
|
if (exists $WordScoreForSeqLen{ $rh_word->{WordSeqLen} }) { |
1881
|
78
|
|
|
|
|
165
|
$rh_word->{WordScore} *= $WordScoreForSeqLen{ $rh_word->{WordSeqLen} }; |
1882
|
|
|
|
|
|
|
} else { # if length > $MAX_PEPTIDE_LENGTH, discard regardless of anything else |
1883
|
0
|
|
|
|
|
0
|
$rh_word->{WordScore} = 0; |
1884
|
|
|
|
|
|
|
} |
1885
|
78
|
50
|
|
|
|
160
|
warn "WordScore = $rh_word->{WordScore}; WordOrig = $rh_word->{WordOrig}" if $self->{verbose} > 2; |
1886
|
78
|
100
|
|
|
|
154
|
return 1 unless $rh_word->{WordScore}; |
1887
|
|
|
|
|
|
|
|
1888
|
77
|
100
|
33
|
|
|
874
|
if ( |
|
|
100
|
100
|
|
|
|
|
|
|
50
|
33
|
|
|
|
|
|
|
|
100
|
|
|
|
|
|
|
|
33
|
|
|
|
|
1889
|
|
|
|
|
|
|
$rh_word->{WordPropDegen} > 0.95 || |
1890
|
|
|
|
|
|
|
$rh_word->{WordNumNotDegen} <= 1 || |
1891
|
|
|
|
|
|
|
($rh_word->{WordNumNotDegen} <= 2 && $rh_word->{WordSeqLen} >= 6) # <= 2 of 6 aa |
1892
|
|
|
|
|
|
|
) { |
1893
|
2
|
|
|
|
|
4
|
$rh_word->{WordScore} = 0; |
1894
|
|
|
|
|
|
|
} |
1895
|
|
|
|
|
|
|
elsif ( |
1896
|
|
|
|
|
|
|
$rh_word->{WordPropDegen} > 0.7 || |
1897
|
|
|
|
|
|
|
($rh_word->{WordNumNotDegen} <= 2 && $rh_word->{WordSeqLen} >= 5) |
1898
|
|
|
|
|
|
|
) { |
1899
|
1
|
|
|
|
|
3
|
$rh_word->{WordScore} *= 0.001; |
1900
|
|
|
|
|
|
|
} |
1901
|
|
|
|
|
|
|
elsif ($rh_word->{WordPropDegen} > 0.6) { |
1902
|
0
|
|
|
|
|
0
|
$rh_word->{WordScore} *= 0.01; |
1903
|
|
|
|
|
|
|
} |
1904
|
77
|
50
|
|
|
|
170
|
warn "WordScore = $rh_word->{WordScore}; WordOrig = $rh_word->{WordOrig}" if $self->{verbose} > 2; |
1905
|
77
|
100
|
|
|
|
152
|
return 1 unless $rh_word->{WordScore}; |
1906
|
|
|
|
|
|
|
|
1907
|
75
|
50
|
|
|
|
153
|
$self->WordScoreForMatch($rh_word) or return; |
1908
|
75
|
50
|
|
|
|
185
|
warn "WordScore = $rh_word->{WordScore}; WordOrig = $rh_word->{WordOrig}" if $self->{verbose} > 2; |
1909
|
75
|
50
|
|
|
|
197
|
return 1 unless $rh_word->{WordScore}; |
1910
|
|
|
|
|
|
|
# normalize for the ease of use, so that (after all transformations), |
1911
|
|
|
|
|
|
|
# at WordScore = 0.5, approximately 50% of the words are |
1912
|
|
|
|
|
|
|
# true positives by manual inspection |
1913
|
75
|
|
|
|
|
125
|
$rh_word->{WordScore} = (1/6) * $rh_word->{WordScore}; |
1914
|
75
|
50
|
|
|
|
142
|
warn "WordScore = $rh_word->{WordScore}; WordOrig = $rh_word->{WordOrig}" if $self->{verbose} > 2; |
1915
|
|
|
|
|
|
|
# transform (0, +inf) to (0,1): |
1916
|
75
|
|
|
|
|
135
|
$rh_word->{WordScore} = $rh_word->{WordScore} / ( 1 + $rh_word->{WordScore} ); |
1917
|
75
|
50
|
|
|
|
144
|
warn "WordScore = $rh_word->{WordScore}; WordOrig = $rh_word->{WordOrig}" if $self->{verbose} > 2; |
1918
|
75
|
|
|
|
|
175
|
return 1; |
1919
|
|
|
|
|
|
|
} |
1920
|
|
|
|
|
|
|
|
1921
|
|
|
|
|
|
|
=head2 WordScoreForMatch |
1922
|
|
|
|
|
|
|
|
1923
|
|
|
|
|
|
|
Args : $rh_word - ref to hash with word data. |
1924
|
|
|
|
|
|
|
Example : $parser->WordScoreForMatch($rh_word) or return; |
1925
|
|
|
|
|
|
|
Description : Calls WordIsGeneOrDict to compute WordIsDict, WordIsGene. |
1926
|
|
|
|
|
|
|
Changes WordScore based on these vars, as well as based on WordIsDNA, |
1927
|
|
|
|
|
|
|
WordPropProtein, etc. |
1928
|
|
|
|
|
|
|
Returns : TRUE if successful, FALSE otherwise. |
1929
|
|
|
|
|
|
|
|
1930
|
|
|
|
|
|
|
=cut |
1931
|
|
|
|
|
|
|
|
1932
|
|
|
|
|
|
|
sub WordScoreForMatch { |
1933
|
75
|
|
|
75
|
1
|
92
|
my ($self, $rh_word) = @_; |
1934
|
75
|
50
|
0
|
|
|
183
|
ref $rh_word eq 'HASH' or |
1935
|
|
|
|
|
|
|
carp "not ok: expected WordScoreForMatch arg[1]: hash ref, got: $rh_word" and |
1936
|
|
|
|
|
|
|
return; |
1937
|
|
|
|
|
|
|
# no need to check this for 3 and full letter symbols: |
1938
|
75
|
100
|
|
|
|
200
|
return 1 unless $rh_word->{WordAaSymbols} == 1; |
1939
|
70
|
50
|
|
|
|
138
|
$self->WordIsGeneOrDict($rh_word) or return; |
1940
|
70
|
50
|
|
|
|
170
|
warn "WordScore = $rh_word->{WordScore}; WordOrig = $rh_word->{WordOrig}" if $self->{verbose} > 2; |
1941
|
|
|
|
|
|
|
# roman numeral < 50 that ends within 0 or 1 char of the end of word, eg |
1942
|
|
|
|
|
|
|
# TFIII or TFIIA. WordIsRomanLen = length of the roman |
1943
|
|
|
|
|
|
|
# numeral (minus 1 if there is 1 char between the numeral and the |
1944
|
|
|
|
|
|
|
# end of the word). |
1945
|
70
|
|
|
|
|
385
|
while ($rh_word->{WordOrig} =~ /(X{0,3})(I[XV]|V?I{0,3})([A-Z])?\b/g) { |
1946
|
261
|
100
|
|
|
|
912
|
my $next_roman_len = length($1) + length($2) - ($3 ? length($3) : 0 ); |
1947
|
261
|
100
|
|
|
|
1420
|
next unless $next_roman_len > 0; # skip 0 or -1 (not a roman numeral). |
1948
|
14
|
|
|
|
|
23
|
$rh_word->{WordIsRomanLen} += $next_roman_len; |
1949
|
14
|
100
|
|
|
|
83
|
$rh_word->{WordIsRoman} = 1 if $rh_word->{WordIsRomanLen} >= 2; # eg, HYNKIIIA |
1950
|
|
|
|
|
|
|
} |
1951
|
|
|
|
|
|
|
# for these vars with values: 0/1, penalize based on length: |
1952
|
70
|
|
|
|
|
117
|
foreach (qw(WordIsDNA WordIsDict WordIsGene WordIsRoman)) { |
1953
|
280
|
100
|
|
|
|
619
|
next unless $rh_word->{$_}; |
1954
|
|
|
|
|
|
|
# limit the penalty by the sequence length, eg |
1955
|
|
|
|
|
|
|
# if 'SDS-PAGE' is entered into dictionary twice, |
1956
|
|
|
|
|
|
|
# as both 'SDS-PAGE' and 'SDSPAGE' |
1957
|
95
|
100
|
|
|
|
261
|
$rh_word->{"${_}Len"} = $rh_word->{WordSeqLen} |
1958
|
|
|
|
|
|
|
if $rh_word->{"${_}Len"} > $rh_word->{WordSeqLen}; |
1959
|
95
|
|
66
|
|
|
336
|
$rh_word->{WordScore} *= |
1960
|
|
|
|
|
|
|
( $WordScoreForMatchLen{ $rh_word->{"${_}Len"} } || |
1961
|
|
|
|
|
|
|
$WordScoreForMatchLenMin ); |
1962
|
95
|
50
|
|
|
|
226
|
warn "WordScore = $rh_word->{WordScore}; WordOrig = $rh_word->{WordOrig}" if $self->{verbose} > 2; |
1963
|
|
|
|
|
|
|
} |
1964
|
|
|
|
|
|
|
# if the word was not flagged by WordIsGene WordIsDict: |
1965
|
70
|
100
|
100
|
|
|
296
|
unless ($rh_word->{WordIsGene} || $rh_word->{WordIsDict}) { |
1966
|
|
|
|
|
|
|
# penalize for similarity to a gene symbol, eg KLF4. |
1967
|
21
|
|
|
|
|
121
|
foreach ($rh_word->{WordOrig} =~ m!([^A-Z]|\b)[A-Z]{2,3}\d*\b!g) { |
1968
|
5
|
|
|
|
|
10
|
$rh_word->{WordScore} *= 0.1; |
1969
|
5
|
50
|
|
|
|
125
|
warn "WordScore = $rh_word->{WordScore}; WordOrig = $rh_word->{WordOrig}" if $self->{verbose} > 2; |
1970
|
|
|
|
|
|
|
# penalize for similarity to abbreviations, or to gene/protein interactions, eg |
1971
|
|
|
|
|
|
|
# 'GM-CSF', 'HPLC/FPLC', 'MAP/MAPKK'. |
1972
|
5
|
100
|
100
|
|
|
28
|
$rh_word->{WordScore} *= 0.01 |
1973
|
|
|
|
|
|
|
if $rh_word->{WordSubLenMax} && # if the word could be split into subwords and |
1974
|
|
|
|
|
|
|
$rh_word->{WordSubLenMax} <= 4; # all sub-words are short |
1975
|
5
|
50
|
|
|
|
21
|
warn "WordScore = $rh_word->{WordScore}; WordOrig = $rh_word->{WordOrig}" if $self->{verbose} > 2; |
1976
|
|
|
|
|
|
|
} |
1977
|
|
|
|
|
|
|
} |
1978
|
|
|
|
|
|
|
# WordPropProtein is a good predictor if abs(WordPropProtein) > 8. |
1979
|
|
|
|
|
|
|
# WordPropProtein is very high for WordIsDNA, eg 'CGGTTAAAA', or |
1980
|
|
|
|
|
|
|
# WordIsRoman, eg 'HYNKIIIA', thus not used. |
1981
|
70
|
100
|
100
|
|
|
401
|
unless( |
|
|
|
66
|
|
|
|
|
|
|
|
100
|
|
|
|
|
1982
|
|
|
|
|
|
|
$rh_word->{WordIsDNA} || $rh_word->{WordIsDict} || |
1983
|
|
|
|
|
|
|
$rh_word->{WordIsGene} || $rh_word->{WordIsRoman} |
1984
|
|
|
|
|
|
|
) { |
1985
|
13
|
|
|
|
|
88
|
$rh_word->{WordScore} *= |
1986
|
|
|
|
|
|
|
(10 ** (LOG_10_2 * $rh_word->{WordPropProtein}) ); |
1987
|
|
|
|
|
|
|
} |
1988
|
70
|
50
|
|
|
|
165
|
warn "WordScore = $rh_word->{WordScore}; WordOrig = $rh_word->{WordOrig}" if $self->{verbose} > 2; |
1989
|
70
|
50
|
|
|
|
143
|
carp((caller(0))[3], "(@_)", ' ', Data::Dumper->Dump([$rh_word], ['rh_word'])) |
1990
|
|
|
|
|
|
|
if $self->{verbose} > 2; |
1991
|
70
|
|
|
|
|
170
|
return 1; |
1992
|
|
|
|
|
|
|
} |
1993
|
|
|
|
|
|
|
} |
1994
|
|
|
|
|
|
|
|
1995
|
|
|
|
|
|
|
=head2 WordIsGeneOrDict |
1996
|
|
|
|
|
|
|
|
1997
|
|
|
|
|
|
|
Args : $rh_word - ref to hash with word data. |
1998
|
|
|
|
|
|
|
Example : $parser->WordIsGeneOrDict($rh_word) or return; |
1999
|
|
|
|
|
|
|
Description : Computes WordIsDict, WordIsGene and related vars. |
2000
|
|
|
|
|
|
|
Returns : TRUE if successful, FALSE otherwise. |
2001
|
|
|
|
|
|
|
|
2002
|
|
|
|
|
|
|
=cut |
2003
|
|
|
|
|
|
|
|
2004
|
|
|
|
|
|
|
sub WordIsGeneOrDict { |
2005
|
70
|
|
|
70
|
1
|
85
|
my ($self, $rh_word) = @_; |
2006
|
70
|
50
|
0
|
|
|
137
|
ref $rh_word eq 'HASH' or |
2007
|
|
|
|
|
|
|
carp "not ok: expected WordIsGeneOrDict arg[1]: hash ref, got: $rh_word" and |
2008
|
|
|
|
|
|
|
return; |
2009
|
|
|
|
|
|
|
# select words for matching. Do not penalize more than once for the |
2010
|
|
|
|
|
|
|
# same subword, unless it actually occurs more than once. |
2011
|
70
|
|
|
|
|
323
|
my @subwords = split /[^A-Za-z]+/, lc $rh_word->{WordLcOrig}; |
2012
|
70
|
|
|
|
|
92
|
my @subwords_long; |
2013
|
70
|
100
|
|
|
|
153
|
if (@subwords != 1) { # if could split; otherwise just handle $rh_word->{WordLcOrig} below. |
2014
|
|
|
|
|
|
|
# select sub-words that are long in absolute terms and also long relative |
2015
|
|
|
|
|
|
|
# to the original word |
2016
|
|
|
|
|
|
|
# (eg, select 'RAS' in 'RAS/EGFR' , but not in 'RAS/GETRAPLGETRAPL' |
2017
|
20
|
|
|
|
|
39
|
foreach (@subwords) { |
2018
|
52
|
|
|
|
|
59
|
my $length = length $_; |
2019
|
52
|
100
|
66
|
|
|
195
|
push @subwords_long, $_ |
2020
|
|
|
|
|
|
|
if $length > 2 and ( $length / $rh_word->{WordSeqLen} ) > 0.25; |
2021
|
|
|
|
|
|
|
# WordSubLenMax = max sub-word length, if could split; otherwise 0. |
2022
|
52
|
100
|
|
|
|
153
|
$rh_word->{WordSubLenMax} = $length if $rh_word->{WordSubLenMax} < $length; |
2023
|
|
|
|
|
|
|
} |
2024
|
|
|
|
|
|
|
} |
2025
|
70
|
100
|
|
|
|
232
|
foreach my $subword ( $rh_word->{WordLcSequence}, |
2026
|
|
|
|
|
|
|
# do not penalize more than once for 1 occurrence: |
2027
|
|
|
|
|
|
|
( $rh_word->{WordLcOrig} eq $rh_word->{WordLcSequence} ? |
2028
|
|
|
|
|
|
|
() : $rh_word->{WordLcOrig} ), |
2029
|
|
|
|
|
|
|
@subwords_long ) { |
2030
|
131
|
|
|
|
|
191
|
foreach my $bad (qw(Dict Gene)) { |
2031
|
|
|
|
|
|
|
# if ( eval { ${$bad}{$subword} } ) { # eval does not work - ?? |
2032
|
262
|
|
|
|
|
246
|
my $is_bad = do { |
2033
|
262
|
100
|
|
|
|
566
|
if ($bad eq 'Dict') { $Dict{$subword} } |
|
131
|
50
|
|
|
|
303
|
|
2034
|
131
|
|
|
|
|
413
|
elsif ($bad eq 'Gene') { $Gene{$subword} } |
2035
|
|
|
|
|
|
|
}; |
2036
|
262
|
100
|
|
|
|
681
|
if ($is_bad) { |
2037
|
109
|
|
|
|
|
176
|
$rh_word->{"WordIs$bad"} = 1; |
2038
|
109
|
|
|
|
|
341
|
$rh_word->{"WordIs${bad}Len"} += length($subword); |
2039
|
|
|
|
|
|
|
} |
2040
|
|
|
|
|
|
|
} |
2041
|
|
|
|
|
|
|
} |
2042
|
70
|
|
|
|
|
213
|
return 1; |
2043
|
|
|
|
|
|
|
} |
2044
|
|
|
|
|
|
|
|
2045
|
|
|
|
|
|
|
=head2 WordAbstScore |
2046
|
|
|
|
|
|
|
|
2047
|
|
|
|
|
|
|
Args : none |
2048
|
|
|
|
|
|
|
Example : $parser->WordAbstScore or return; |
2049
|
|
|
|
|
|
|
Description : Computes WordAbstScore based on the results of |
2050
|
|
|
|
|
|
|
AbstScore and WordScore vars for all words in the |
2051
|
|
|
|
|
|
|
abstract. AbstMaxWordScore is high if either AbstScore |
2052
|
|
|
|
|
|
|
is high or WordScore is high, and higher if both are |
2053
|
|
|
|
|
|
|
high. Instead of using simply AbstMaxWordScore = |
2054
|
|
|
|
|
|
|
AbstScore * WordScore, AbstMaxWordScore is also added |
2055
|
|
|
|
|
|
|
to WordScore with a smaller weight. This is because |
2056
|
|
|
|
|
|
|
probability that a given word is a sequence increases |
2057
|
|
|
|
|
|
|
if another word in the same abstract is a |
2058
|
|
|
|
|
|
|
sequence. The weight for AbstMaxWordScore is smaller |
2059
|
|
|
|
|
|
|
than that for WordScore in order not to make all words |
2060
|
|
|
|
|
|
|
have equal AbstMaxWordScore. WordAbstScore = 0 for |
2061
|
|
|
|
|
|
|
WordScore = 0, to reduce obvious noise (otherwise, all |
2062
|
|
|
|
|
|
|
words that are clearly not peptide sequences will get |
2063
|
|
|
|
|
|
|
positive WordAbstScore when they occur in an abstract |
2064
|
|
|
|
|
|
|
with at least one sequence). Skip transform to (0,1) |
2065
|
|
|
|
|
|
|
because the scores below are all in (0,1), and the |
2066
|
|
|
|
|
|
|
weights add to 1, so WordAbstScore is already in |
2067
|
|
|
|
|
|
|
(0,1). |
2068
|
|
|
|
|
|
|
Returns : TRUE if successful, FALSE otherwise. |
2069
|
|
|
|
|
|
|
|
2070
|
|
|
|
|
|
|
=cut |
2071
|
|
|
|
|
|
|
|
2072
|
|
|
|
|
|
|
sub WordAbstScore { |
2073
|
12
|
|
|
12
|
1
|
16
|
my ($self) = @_; |
2074
|
12
|
|
|
|
|
23
|
my $rh_abst = $self->{abst}; |
2075
|
|
|
|
|
|
|
# max WordScore for all words in this abstract: |
2076
|
12
|
|
|
|
|
34
|
$rh_abst->{AbstMaxWordScore} = 0; |
2077
|
12
|
|
|
|
|
18
|
foreach my $rh_word (@{ $self->{words} }) { |
|
12
|
|
|
|
|
29
|
|
2078
|
1126
|
100
|
|
|
|
2418
|
$rh_abst->{AbstMaxWordScore} = $rh_word->{WordScore} |
2079
|
|
|
|
|
|
|
if $rh_abst->{AbstMaxWordScore} < $rh_word->{WordScore}; |
2080
|
|
|
|
|
|
|
} |
2081
|
|
|
|
|
|
|
|
2082
|
12
|
|
|
|
|
21
|
foreach my $rh_word (@{ $self->{words} }) { |
|
12
|
|
|
|
|
27
|
|
2083
|
1126
|
100
|
|
|
|
2110
|
$rh_word->{WordAbstScore} = $rh_word->{WordScore} > 0 ? |
2084
|
|
|
|
|
|
|
$rh_abst->{AbstScore} * |
2085
|
|
|
|
|
|
|
(0.1 * $rh_abst->{AbstMaxWordScore} + 0.9 * $rh_word->{WordScore}) |
2086
|
|
|
|
|
|
|
: 0; |
2087
|
|
|
|
|
|
|
} |
2088
|
12
|
50
|
|
|
|
36
|
if ($self->{verbose} > 0) { |
2089
|
12
|
|
|
|
|
113
|
$rh_abst->{AbstComment} .= |
2090
|
|
|
|
|
|
|
sprintf("AbstMaxWordScore=%.4f; ", $rh_abst->{AbstMaxWordScore}); |
2091
|
|
|
|
|
|
|
} |
2092
|
12
|
50
|
|
|
|
38
|
carp((caller(0))[3], "(@_)", ' ', Data::Dumper->Dump([$self], ['self'])) |
2093
|
|
|
|
|
|
|
if $self->{verbose} > 2; |
2094
|
12
|
|
|
|
|
32
|
return 1; |
2095
|
|
|
|
|
|
|
} |
2096
|
|
|
|
|
|
|
|
2097
|
|
|
|
|
|
|
=head2 AbstMtext |
2098
|
|
|
|
|
|
|
|
2099
|
|
|
|
|
|
|
Args : none |
2100
|
|
|
|
|
|
|
Example : $parser->AbstMtext |
2101
|
|
|
|
|
|
|
Description : Marks all words that are putative sequences with |
2102
|
|
|
|
|
|
|
'...' tags. This is done only for words |
2103
|
|
|
|
|
|
|
for which WordScore is at least WordScoreMin. |
2104
|
|
|
|
|
|
|
Returns : always TRUE |
2105
|
|
|
|
|
|
|
|
2106
|
|
|
|
|
|
|
=cut |
2107
|
|
|
|
|
|
|
|
2108
|
|
|
|
|
|
|
sub AbstMtext { |
2109
|
12
|
|
|
12
|
1
|
17
|
my ($self) = @_; |
2110
|
12
|
50
|
33
|
|
|
80
|
if ( |
2111
|
|
|
|
|
|
|
$self->{words} and |
2112
|
|
|
|
|
|
|
ref $self->{words} eq 'ARRAY' |
2113
|
|
|
|
|
|
|
) { |
2114
|
|
|
|
|
|
|
$self->{abst}{AbstMtext} = |
2115
|
|
|
|
|
|
|
join '', map { |
2116
|
1126
|
100
|
66
|
|
|
11811
|
if (! defined $_->{WordOrig} ) { () } # happens if WordNumPrintMin = 1 |
|
5
|
100
|
|
|
|
16
|
|
|
12
|
|
|
|
|
35
|
|
2117
|
|
|
|
|
|
|
elsif ( |
2118
|
|
|
|
|
|
|
$_->{WordScore} >= $self->{WordScoreMin} && |
2119
|
|
|
|
|
|
|
$_->{WordAbstScore} >= $self->{WordAbstScoreMin} |
2120
|
|
|
|
|
|
|
) |
2121
|
597
|
|
|
|
|
1873
|
{ "$_->{WordOrig}"} |
2122
|
524
|
|
|
|
|
1109
|
else { $_->{WordOrig} } |
2123
|
|
|
|
|
|
|
} |
2124
|
12
|
|
|
|
|
19
|
@{ $self->{words} } ; |
2125
|
|
|
|
|
|
|
} |
2126
|
|
|
|
|
|
|
else { |
2127
|
0
|
|
0
|
|
|
0
|
$self->{abst}{AbstMtext} = $self->{abst}{AbstAbstract} || ''; |
2128
|
|
|
|
|
|
|
} |
2129
|
12
|
|
|
|
|
120
|
return 1; |
2130
|
|
|
|
|
|
|
} |
2131
|
|
|
|
|
|
|
|
2132
|
|
|
|
|
|
|
=head2 formatScoreProp |
2133
|
|
|
|
|
|
|
|
2134
|
|
|
|
|
|
|
Args[1] : $number - score or proportion. |
2135
|
|
|
|
|
|
|
Example : $rh_word->{WordScore} = formatScoreProp($rh_word->{WordScore}); |
2136
|
|
|
|
|
|
|
Description : changes $number to be in (0,1) interval, formats $number |
2137
|
|
|
|
|
|
|
for nice printing |
2138
|
|
|
|
|
|
|
Returns : formatted score |
2139
|
|
|
|
|
|
|
|
2140
|
|
|
|
|
|
|
=cut |
2141
|
|
|
|
|
|
|
|
2142
|
|
|
|
|
|
|
sub formatScoreProp { |
2143
|
4540
|
|
|
4540
|
1
|
4647
|
my ($number) = @_; |
2144
|
4540
|
50
|
|
|
|
7768
|
return unless defined $number; |
2145
|
|
|
|
|
|
|
# number should be between 0 and 1, |
2146
|
|
|
|
|
|
|
# In methods in this package, the input $number is supposed to be always in (0,1) - |
2147
|
|
|
|
|
|
|
# maybe issue warning otherwise? |
2148
|
4540
|
50
|
|
|
|
10136
|
$number = 0 if $number < 0; |
2149
|
4540
|
50
|
|
|
|
21907
|
$number = 1 if $number > 1; |
2150
|
4540
|
|
|
|
|
19396
|
$number = sprintf "%.4f", $number; |
2151
|
|
|
|
|
|
|
} |
2152
|
|
|
|
|
|
|
|
2153
|
|
|
|
|
|
|
=head2 find |
2154
|
|
|
|
|
|
|
|
2155
|
|
|
|
|
|
|
Args[1] : $key - field name |
2156
|
|
|
|
|
|
|
Args[2] : $val - value |
2157
|
|
|
|
|
|
|
Example : # print WordScore for the first GETRAPL sequence. |
2158
|
|
|
|
|
|
|
# print $parser->find(WordSequence => 'GETRAPL')->{WordScore}; |
2159
|
|
|
|
|
|
|
Description : finds the first word for which the value of field $key |
2160
|
|
|
|
|
|
|
is equal to $val (string eq, not numeric ==) |
2161
|
|
|
|
|
|
|
Returns : returns this word (hash ref) if successful, FALSE otherwise. |
2162
|
|
|
|
|
|
|
|
2163
|
|
|
|
|
|
|
=cut |
2164
|
|
|
|
|
|
|
|
2165
|
|
|
|
|
|
|
sub find { |
2166
|
40
|
|
|
40
|
1
|
104
|
my ($self, $key, $val) = @_; |
2167
|
40
|
|
|
|
|
53
|
foreach ( @{ $self->{words} } ) { |
|
40
|
|
|
|
|
107
|
|
2168
|
16962
|
100
|
66
|
|
|
72372
|
return $_ if defined $_->{$key} and $_->{$key} eq $val; |
2169
|
|
|
|
|
|
|
} |
2170
|
0
|
|
|
|
|
|
return; |
2171
|
|
|
|
|
|
|
} |
2172
|
|
|
|
|
|
|
|
2173
|
|
|
|
|
|
|
1; |