line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Lingua::DxExtractor; |
2
|
|
|
|
|
|
|
|
3
|
1
|
|
|
1
|
|
12647
|
use 5.008008; |
|
1
|
|
|
|
|
2
|
|
4
|
1
|
|
|
1
|
|
3
|
use strict; |
|
1
|
|
|
|
|
1
|
|
|
1
|
|
|
|
|
15
|
|
5
|
1
|
|
|
1
|
|
2
|
use warnings; |
|
1
|
|
|
|
|
9
|
|
|
1
|
|
|
|
|
32
|
|
6
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
our $VERSION = '2.31'; |
8
|
|
|
|
|
|
|
|
9
|
1
|
|
|
1
|
|
405
|
use Text::Sentence qw( split_sentences ); |
|
1
|
|
|
|
|
311
|
|
|
1
|
|
|
|
|
47
|
|
10
|
1
|
|
|
1
|
|
384
|
use Lingua::NegEx qw( negation_scope ); |
|
1
|
|
|
|
|
775
|
|
|
1
|
|
|
|
|
66
|
|
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
use Class::MakeMethods ( |
14
|
1
|
|
|
|
|
6
|
'Template::Hash:array' => [ |
15
|
|
|
|
|
|
|
'target_phrases', 'skip_phrases', |
16
|
|
|
|
|
|
|
'absolute_present_phrases', 'absolute_negative_phrases', |
17
|
|
|
|
|
|
|
], |
18
|
|
|
|
|
|
|
'Template::Hash:scalar' => [ |
19
|
|
|
|
|
|
|
'orig_text', 'final_answer', 'ambiguous', |
20
|
|
|
|
|
|
|
'start_phrase', |
21
|
|
|
|
|
|
|
], |
22
|
|
|
|
|
|
|
'Template::Hash:hash' => [ |
23
|
|
|
|
|
|
|
'target_sentence', 'negex_debug', |
24
|
|
|
|
|
|
|
], |
25
|
1
|
|
|
1
|
|
530
|
); |
|
1
|
|
|
|
|
1637
|
|
26
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
###################################################################### |
28
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
sub new { |
30
|
0
|
|
|
0
|
0
|
|
my $callee = shift; |
31
|
0
|
|
0
|
|
|
|
my $package = ref $callee || $callee; |
32
|
0
|
|
|
|
|
|
my $self = shift; |
33
|
0
|
|
|
|
|
|
bless $self, $package; |
34
|
0
|
0
|
|
|
|
|
die 'Need to define target phrases' unless $self->target_phrases; |
35
|
0
|
|
|
|
|
|
return $self; |
36
|
|
|
|
|
|
|
} |
37
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
sub process_text { |
39
|
0
|
|
|
0
|
0
|
|
my ($self,$text) = @_; |
40
|
0
|
|
|
|
|
|
$self->orig_text( $text ); |
41
|
0
|
|
|
|
|
|
$self->examine_text; |
42
|
0
|
|
|
|
|
|
return $self->final_answer; |
43
|
|
|
|
|
|
|
} |
44
|
|
|
|
|
|
|
|
45
|
|
|
|
|
|
|
sub examine_text { |
46
|
0
|
|
|
0
|
0
|
|
my $self = shift; |
47
|
0
|
|
|
|
|
|
my $text = $self->orig_text; |
48
|
0
|
0
|
|
|
|
|
return if ! $text; |
49
|
|
|
|
|
|
|
|
50
|
0
|
|
|
|
|
|
my $start_phrase = $self->start_phrase; |
51
|
0
|
0
|
0
|
|
|
|
if ( $start_phrase and $text =~ /$start_phrase(.*)\Z/ix ) { |
52
|
0
|
|
|
|
|
|
$text = $1; |
53
|
|
|
|
|
|
|
} |
54
|
0
|
|
|
|
|
|
$text =~ s/\s+/ /gxms; |
55
|
|
|
|
|
|
|
# treat colon ':' like a period '.' |
56
|
0
|
|
|
|
|
|
$text =~ s/:/./g; |
57
|
|
|
|
|
|
|
|
58
|
0
|
|
|
|
|
|
my @sentences = split_sentences( $text ); |
59
|
0
|
|
|
|
|
|
foreach my $line ( @sentences ) { |
60
|
|
|
|
|
|
|
|
61
|
0
|
0
|
|
|
|
|
next if scalar grep { $line =~ /\b$_\b/i } @{$self->skip_phrases}; |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
62
|
0
|
0
|
|
|
|
|
next unless grep { $line =~ /\b$_\b/i } @{$self->target_phrases}; |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
63
|
|
|
|
|
|
|
|
64
|
0
|
|
|
|
|
|
$self->target_sentence->{ $line } = 'present'; |
65
|
0
|
|
|
|
|
|
my $n_scope = negation_scope( $line ); |
66
|
|
|
|
|
|
|
|
67
|
0
|
0
|
|
|
|
|
if ( $n_scope ) { |
68
|
0
|
|
|
|
|
|
$self->negex_debug->{ $line } = @$n_scope[0] . ' - ' . @$n_scope[1]; |
69
|
0
|
|
|
|
|
|
my @words; |
70
|
0
|
|
|
|
|
|
foreach ( split /\s/xms, $line ) { |
71
|
0
|
|
|
|
|
|
s/\W//xms; |
72
|
0
|
|
|
|
|
|
push @words, $_; |
73
|
|
|
|
|
|
|
} |
74
|
0
|
|
|
|
|
|
foreach my $c ( @$n_scope[0] .. @$n_scope[1] ) { |
75
|
0
|
|
|
|
|
|
my @match = grep { $words[ $c ] =~ /$_/ixms } @{$self->target_phrases}; |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
76
|
|
|
|
|
|
|
|
77
|
0
|
0
|
|
|
|
|
if ( scalar @match ) { |
78
|
0
|
|
|
|
|
|
$self->target_sentence->{ $line } = 'absent'; |
79
|
0
|
|
|
|
|
|
last; |
80
|
|
|
|
|
|
|
} |
81
|
|
|
|
|
|
|
} |
82
|
|
|
|
|
|
|
} |
83
|
|
|
|
|
|
|
} |
84
|
|
|
|
|
|
|
|
85
|
0
|
0
|
|
|
|
|
if ( scalar keys %{$self->target_sentence} ) { |
|
0
|
0
|
|
|
|
|
|
86
|
0
|
|
|
|
|
|
my %final_answer; |
87
|
0
|
|
|
|
|
|
while ( my($sentence,$answer) = each %{$self->target_sentence} ) { |
|
0
|
|
|
|
|
|
|
88
|
0
|
|
|
|
|
|
$final_answer{ $answer }++; |
89
|
0
|
|
|
|
|
|
$self->final_answer( $answer ); |
90
|
|
|
|
|
|
|
} |
91
|
0
|
0
|
|
|
|
|
if ( scalar keys %final_answer > 1 ) { |
92
|
0
|
|
|
|
|
|
$self->ambiguous( 1 ); |
93
|
0
|
|
0
|
|
|
|
$final_answer{ 'absent' } ||= 0; |
94
|
0
|
|
0
|
|
|
|
$final_answer{ 'present' } ||= 0; |
95
|
|
|
|
|
|
|
|
96
|
0
|
0
|
|
|
|
|
if ( $final_answer{ 'absent' } > $final_answer{ 'present' } ) { |
|
|
0
|
|
|
|
|
|
97
|
0
|
|
|
|
|
|
$self->final_answer( 'absent' ); |
98
|
|
|
|
|
|
|
} elsif ( $final_answer{ 'present' } > $final_answer{ 'absent' } ) { |
99
|
0
|
|
|
|
|
|
$self->final_answer( 'present' ); |
100
|
|
|
|
|
|
|
} else { |
101
|
|
|
|
|
|
|
# There are an equal number of absent/present findings - defaulting to present |
102
|
0
|
|
|
|
|
|
$self->final_answer( 'present' ); |
103
|
|
|
|
|
|
|
} |
104
|
|
|
|
|
|
|
} |
105
|
|
|
|
|
|
|
|
106
|
0
|
|
|
|
|
|
} elsif ( ! scalar keys %{$self->target_sentence} ) { |
107
|
0
|
|
|
|
|
|
$self->final_answer( 'absent' ); |
108
|
|
|
|
|
|
|
} |
109
|
|
|
|
|
|
|
|
110
|
|
|
|
|
|
|
|
111
|
0
|
0
|
0
|
|
|
|
if ( grep { $text =~ /$_/i } @{$self->absolute_negative_phrases} and $self->final_answer eq 'present' ) { |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
112
|
0
|
|
|
|
|
|
$self->final_answer( 'absent' ); |
113
|
0
|
|
|
|
|
|
$self->ambiguous( 3 ); |
114
|
|
|
|
|
|
|
} |
115
|
0
|
0
|
0
|
|
|
|
if ( grep { $text =~ /$_/i } @{$self->absolute_present_phrases} and $self->final_answer eq 'absent' ) { |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
116
|
0
|
|
|
|
|
|
$self->final_answer( 'present' ); |
117
|
0
|
|
|
|
|
|
$self->ambiguous( 2 ); |
118
|
|
|
|
|
|
|
} |
119
|
|
|
|
|
|
|
} |
120
|
|
|
|
|
|
|
|
121
|
|
|
|
|
|
|
sub debug { |
122
|
0
|
|
|
0
|
0
|
|
my $self = shift; |
123
|
0
|
|
|
|
|
|
my $out = "Target Phrases(" . (join ', ', map { qq{'$_'} } @{$self->target_phrases}) . ")\r\n\r\n"; |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
124
|
0
|
0
|
|
|
|
|
$out .= "Skip Phrases(" . (join ', ', map { qq{'$_'} } @{$self->skip_phrases}) . ")\r\n\r\n" if $self->skip_phrases; |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
125
|
0
|
0
|
|
|
|
|
$out .= "Absolute Present Phrases(" . (join ', ', map { qq{'$_'} } @{$self->absolute_present_phrases}) . ")\r\n\r\n" if $self->absolute_present_phrases; |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
126
|
0
|
0
|
|
|
|
|
$out .= "Absolute Negative Phrases(" . (join ', ', map { qq{'$_'} } @{$self->absolute_negative_phrases}) . ")\r\n\r\n" if $self->absolute_negative_phrases; |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
127
|
0
|
0
|
|
|
|
|
$out .= "Start Phrase( '" . $self->start_phrase . "' )\r\n\r\n" if $self->start_phrase; |
128
|
|
|
|
|
|
|
|
129
|
0
|
|
|
|
|
|
$out .= "Sentences with a target phrase match:\r\n"; |
130
|
0
|
|
|
|
|
|
my $count = 1; |
131
|
0
|
|
|
|
|
|
while ( my($sentence,$answer) = each %{$self->target_sentence} ) { |
|
0
|
|
|
|
|
|
|
132
|
0
|
|
|
|
|
|
$out .= "$count) $sentence -- $answer. "; |
133
|
0
|
|
|
|
|
|
$count++; |
134
|
0
|
|
0
|
|
|
|
$out .= "NegEx: " . ($self->negex_debug->{ $sentence } || 'None') . "\r\n"; |
135
|
|
|
|
|
|
|
} |
136
|
0
|
0
|
|
|
|
|
$out .= "\r\nAmbiguous: " . ($self->ambiguous == 1 ? 'Yes' : ( $self->ambiguous == 2 ? 'Absolute Present Phrase was present but the answer was going to be absent.' : ( $self->ambiguous == 3 ? 'Absolute Negative Phrase was present but the answer was going to be present.' : 'No' ) ) ); |
|
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
137
|
0
|
|
|
|
|
|
$out .= "\r\nFinal Answer: " . $self->final_answer . "\r\n"; |
138
|
0
|
|
|
|
|
|
return $out; |
139
|
|
|
|
|
|
|
} |
140
|
|
|
|
|
|
|
|
141
|
|
|
|
|
|
|
sub reset { |
142
|
0
|
|
|
0
|
0
|
|
my $self = shift; |
143
|
0
|
|
|
|
|
|
$self->orig_text( '' ); |
144
|
0
|
|
|
|
|
|
$self->target_sentence( {} ); |
145
|
0
|
|
|
|
|
|
$self->final_answer( '' ); |
146
|
0
|
|
|
|
|
|
$self->ambiguous( '' ); |
147
|
|
|
|
|
|
|
} |
148
|
|
|
|
|
|
|
|
149
|
|
|
|
|
|
|
1; |
150
|
|
|
|
|
|
|
|
151
|
|
|
|
|
|
|
=head1 NAME |
152
|
|
|
|
|
|
|
|
153
|
|
|
|
|
|
|
Lingua::DxExtractor - Perl extension to extract the presence or absence of a clinical condition from medical reports. |
154
|
|
|
|
|
|
|
|
155
|
|
|
|
|
|
|
=head1 SYNOPSIS |
156
|
|
|
|
|
|
|
|
157
|
|
|
|
|
|
|
use Lingua::DxExtractor; |
158
|
|
|
|
|
|
|
|
159
|
|
|
|
|
|
|
$extractor = Lingua::DxExtractor->new( { |
160
|
|
|
|
|
|
|
target_phrases => [ qw( embolus embolism emboli defect pe clot clots ) ], |
161
|
|
|
|
|
|
|
skip_phrases => [ qw( history indication technique nondiagnostic ) ], |
162
|
|
|
|
|
|
|
absolute_present_phrases => [ ( 'This is definitely a PE', 'absolutely positive for pe' ) ], |
163
|
|
|
|
|
|
|
absolute_negative_phrases => [ ( 'there is no way this is a pe', 'no clots seen at all' ) ], |
164
|
|
|
|
|
|
|
start_phrase => 'Impression:', |
165
|
|
|
|
|
|
|
} ); |
166
|
|
|
|
|
|
|
|
167
|
|
|
|
|
|
|
$text = <
|
168
|
|
|
|
|
|
|
Indication: To rule out pulmonary embolism. Findings: There is no evidence of vascular filling defect to the subsegmental level... |
169
|
|
|
|
|
|
|
END |
170
|
|
|
|
|
|
|
|
171
|
|
|
|
|
|
|
$final_answer = $extractor->process_text( $text ); # 'absent' or 'present' |
172
|
|
|
|
|
|
|
$is_final_answer_ambiguous = $extractor->ambiguous; # 1 or 0 |
173
|
|
|
|
|
|
|
$debug = $extractor->debug; |
174
|
|
|
|
|
|
|
|
175
|
|
|
|
|
|
|
$original_text = $extractor->orig_text; |
176
|
|
|
|
|
|
|
$final_answer = $extractor->final_answer; |
177
|
|
|
|
|
|
|
$ambiguous = $extractor->ambiguous; |
178
|
|
|
|
|
|
|
|
179
|
|
|
|
|
|
|
$extractor->clear; # clears orig_text, final_answer, target_sentence and ambiguous |
180
|
|
|
|
|
|
|
|
181
|
|
|
|
|
|
|
=head1 DESCRIPTION |
182
|
|
|
|
|
|
|
|
183
|
|
|
|
|
|
|
A tool to be used to look for the presence or absence of a clinical condition as reported in medical reports. The extractor reports a 'final answer', 'absent' or 'present', as well as reports whether this answer is 'ambiguous' or not. |
184
|
|
|
|
|
|
|
|
185
|
|
|
|
|
|
|
The 'use case' for this is when performing a research project with a large number of records and you need to identify a subset based on a diagnostic entity, you can use this tool to reduce the number of charts that have to be manually examined. |
186
|
|
|
|
|
|
|
|
187
|
|
|
|
|
|
|
The medical reports don't require textual preprocessing however clearly the selection of target_phrases and skip_phrases requires reading through reports to get a sense of what vocabulary is being used in the particular dataset that is being evaluated. |
188
|
|
|
|
|
|
|
|
189
|
|
|
|
|
|
|
Negated terms are identified using Lingua::NegEx which is a perl implementation of Wendy Chapman's NegEx algorithm. |
190
|
|
|
|
|
|
|
|
191
|
|
|
|
|
|
|
=head2 GETTING STARTED |
192
|
|
|
|
|
|
|
|
193
|
|
|
|
|
|
|
Create a new extractor object with your extraction rules: |
194
|
|
|
|
|
|
|
|
195
|
|
|
|
|
|
|
target_phrases( \@words ); |
196
|
|
|
|
|
|
|
|
197
|
|
|
|
|
|
|
This is a list of phrases that describe the clinical entity in question. All forms of the entity in question need to explicitly stated since the package is currently not using lemmatization or stemming. This is the only required parameter for the extractor object. |
198
|
|
|
|
|
|
|
|
199
|
|
|
|
|
|
|
skip_phrases( \@skip ); |
200
|
|
|
|
|
|
|
|
201
|
|
|
|
|
|
|
This is a list of phrases that can be used to eliminate sentences in the text that might confuse the extractor. For example most radiographic reports start with a brief description of the indication for the test. This statement may state the clinical entity in question but does not mean it is present in the study (ie. Indication: to rule out pulmonary embolism). |
202
|
|
|
|
|
|
|
|
203
|
|
|
|
|
|
|
absolute_negative_phrases( \@absolute_negative_assertions ); |
204
|
|
|
|
|
|
|
|
205
|
|
|
|
|
|
|
This is a list of phrases which if present in the text mean the condition is certainly not there and all ambiguity checking can be skipped. |
206
|
|
|
|
|
|
|
|
207
|
|
|
|
|
|
|
absolute_present_phrases( \@absolute_positive_assertions ); |
208
|
|
|
|
|
|
|
|
209
|
|
|
|
|
|
|
This is a list of phrases which if present in the text mean the condition is certainly there and all ambiguity checking can be skipped. |
210
|
|
|
|
|
|
|
|
211
|
|
|
|
|
|
|
start_phrase( $start_phrase ); |
212
|
|
|
|
|
|
|
|
213
|
|
|
|
|
|
|
A phrase if present in the text which indicates where to focus the search. All text prior to the start_phrase is ignored. Often times in radiology reports there is a 'Conclusion: ' or 'Impression: ' section which can be reviewed rather than analyzing the full report. |
214
|
|
|
|
|
|
|
|
215
|
|
|
|
|
|
|
=head2 ANALYSIS |
216
|
|
|
|
|
|
|
|
217
|
|
|
|
|
|
|
Once defined, the extractor object you created can be used to analyze target text. The analysis consists of: |
218
|
|
|
|
|
|
|
|
219
|
|
|
|
|
|
|
1. If there is a start phrase defined, eliminate all text for analysis prior to the start phrase. |
220
|
|
|
|
|
|
|
|
221
|
|
|
|
|
|
|
2. Make text all lowercase, eliminate extra spaces, and change all colons ':' into periods '.' to treat them as sentence breaks. |
222
|
|
|
|
|
|
|
|
223
|
|
|
|
|
|
|
3. Split text into sentences using Text::Sentence. |
224
|
|
|
|
|
|
|
|
225
|
|
|
|
|
|
|
4. Examine each sentence for the presence of any skip phrases and if found, ignore the sentence. |
226
|
|
|
|
|
|
|
|
227
|
|
|
|
|
|
|
5. Examine each sentence for the presence of any target phrases and if found evaluate for negation using Lingua:Negex. |
228
|
|
|
|
|
|
|
|
229
|
|
|
|
|
|
|
-if no negation found, mark this sentence as 'present' |
230
|
|
|
|
|
|
|
|
231
|
|
|
|
|
|
|
-if the target phrase is negated then mark the sentence as 'absent' |
232
|
|
|
|
|
|
|
|
233
|
|
|
|
|
|
|
6. Go through all the flagged sentences and see if there is any discrepancy -- if so set the ambiguous flag. If there are more sentences that indicate absent than those that indicate present then mark the final answer as absent and vice versa. If there are an equal number of absent and present phrases mark the final answer as present. |
234
|
|
|
|
|
|
|
|
235
|
|
|
|
|
|
|
7. The possible values for ambiguous: 1 = there were some positive and some absent sentences; 2 = there was a match on an absolute positive phrase but the answer was going to be absent had this absolute phrase not been indicated; 3 = there was a match on an absolute negative phrase but the answer was going to be present had this absolute phrase not been indicated; If both an absolute positive and negation phrase was present, mark the final answer as present. |
236
|
|
|
|
|
|
|
|
237
|
|
|
|
|
|
|
=head2 EXPORT |
238
|
|
|
|
|
|
|
|
239
|
|
|
|
|
|
|
None by default. |
240
|
|
|
|
|
|
|
|
241
|
|
|
|
|
|
|
=head1 SEE ALSO |
242
|
|
|
|
|
|
|
|
243
|
|
|
|
|
|
|
This module depends on: |
244
|
|
|
|
|
|
|
|
245
|
|
|
|
|
|
|
Text::Sentence |
246
|
|
|
|
|
|
|
|
247
|
|
|
|
|
|
|
Class::MakeMethods |
248
|
|
|
|
|
|
|
|
249
|
|
|
|
|
|
|
Lingua::NegEx |
250
|
|
|
|
|
|
|
|
251
|
|
|
|
|
|
|
=head1 To Do |
252
|
|
|
|
|
|
|
|
253
|
|
|
|
|
|
|
Add lemmatization or stemming to target_phrases so you don't have to explicitly write out all forms of words |
254
|
|
|
|
|
|
|
|
255
|
|
|
|
|
|
|
=head1 AUTHOR |
256
|
|
|
|
|
|
|
|
257
|
|
|
|
|
|
|
Eduardo Iturrate, ed@iturrate.comE |
258
|
|
|
|
|
|
|
|
259
|
|
|
|
|
|
|
=head1 COPYRIGHT AND LICENSE |
260
|
|
|
|
|
|
|
|
261
|
|
|
|
|
|
|
Copyright (C) 2016 by Eduardo Iturrate |
262
|
|
|
|
|
|
|
|
263
|
|
|
|
|
|
|
This library is free software; you can redistribute it and/or modify |
264
|
|
|
|
|
|
|
it under the same terms as Perl itself, either Perl version 5.8.8 or, |
265
|
|
|
|
|
|
|
at your option, any later version of Perl 5 you may have available. |
266
|
|
|
|
|
|
|
|
267
|
|
|
|
|
|
|
=cut |