line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Text::ParseAHD; |
2
|
|
|
|
|
|
|
#use base qw(BASE); |
3
|
1
|
|
|
1
|
|
33020
|
use Text::ParseAHD::Word; |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
4
|
|
|
|
|
|
|
use Text::ParseAHD::Definition; |
5
|
|
|
|
|
|
|
use Class::Std; |
6
|
|
|
|
|
|
|
use Class::Std::Utils; |
7
|
|
|
|
|
|
|
|
8
|
|
|
|
|
|
|
use warnings; |
9
|
|
|
|
|
|
|
use strict; |
10
|
|
|
|
|
|
|
use Carp; |
11
|
|
|
|
|
|
|
|
12
|
|
|
|
|
|
|
use version; our $VERSION = qv('0.0.2'); |
13
|
|
|
|
|
|
|
|
14
|
|
|
|
|
|
|
{ |
15
|
|
|
|
|
|
|
my %html_of :ATTR( :get :set :default<''> :init_arg ); |
16
|
|
|
|
|
|
|
my %word_of :ATTR( :get :set :default<''> :init_arg ); |
17
|
|
|
|
|
|
|
my %pos_of :ATTR( :get :set :default<''> :init_arg ); |
18
|
|
|
|
|
|
|
my %syllables_of :ATTR( :get :set :default<''> :init_arg ); |
19
|
|
|
|
|
|
|
my %defs_of :ATTR( :get :set :default<''> :init_arg ); |
20
|
|
|
|
|
|
|
my %text_of :ATTR( :get :set :default<''> :init_arg ); |
21
|
|
|
|
|
|
|
my %word_obj_obj_of :ATTR( :get :set :default<''> :init_arg ); |
22
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
sub START { |
25
|
|
|
|
|
|
|
my ($self, $ident, $arg_ref) = @_; |
26
|
|
|
|
|
|
|
#$html_of{$ident} = $arg_ref->{html}; |
27
|
|
|
|
|
|
|
#$word_of{$ident} = $arg_ref->{word}; |
28
|
|
|
|
|
|
|
#$Word_of{$ident} = Text::ParseAHD::Word->new({'word',$word_of{$ident}}); |
29
|
|
|
|
|
|
|
$self->set_word_obj( Text::ParseAHD::Word->new({word => $self->get_word() }) ); |
30
|
|
|
|
|
|
|
return; |
31
|
|
|
|
|
|
|
} |
32
|
|
|
|
|
|
|
|
33
|
|
|
|
|
|
|
sub parse_html { |
34
|
|
|
|
|
|
|
my ( $self ) = @_; |
35
|
|
|
|
|
|
|
my $ident = ident $self; |
36
|
|
|
|
|
|
|
my $html = $html_of{$ident}; |
37
|
|
|
|
|
|
|
while ( $html =~ m/(.*?)/gsix ) { |
38
|
|
|
|
|
|
|
my $definition_text = $1; |
39
|
|
|
|
|
|
|
$definition_text =~ s/(.*?)<\/b>//i; |
40
|
|
|
|
|
|
|
my $word = $self->clean_word( $1 ); |
41
|
|
|
|
|
|
|
|
42
|
|
|
|
|
|
|
# print " STATUS: AHD4 definition " . ++$count . " found\n"; |
43
|
|
|
|
|
|
|
|
44
|
|
|
|
|
|
|
if ($word eq $word_of{$ident}) { |
45
|
|
|
|
|
|
|
$definition_text =~ s/\n//g; # Remove newlines |
46
|
|
|
|
|
|
|
$definition_text =~ s/\ //g; # Remove nbsp |
47
|
|
|
|
|
|
|
$definition_text =~ s/ //g; # Remove br |
48
|
|
|
|
|
|
|
$definition_text =~ s/^(.*?)//six; # Remove leading |
49
|
|
|
|
|
|
|
$definition_text =~ s/<\/td>.*$//six; # Remove trailing |
50
|
|
|
|
|
|
|
|
51
|
|
|
|
|
|
|
my @defs = $self->split_defs( $definition_text ); |
52
|
|
|
|
|
|
|
my $i=1; |
53
|
|
|
|
|
|
|
foreach my $def (@defs) { |
54
|
|
|
|
|
|
|
print $i . "\n\n" . $def . "\n\n\n\n"; |
55
|
|
|
|
|
|
|
$self->parse_definition( $def ); |
56
|
|
|
|
|
|
|
$i++; |
57
|
|
|
|
|
|
|
} |
58
|
|
|
|
|
|
|
} |
59
|
|
|
|
|
|
|
#print $definition_text."\n\n\n"; |
60
|
|
|
|
|
|
|
} |
61
|
|
|
|
|
|
|
$self->report_word(); |
62
|
|
|
|
|
|
|
return; |
63
|
|
|
|
|
|
|
} |
64
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
sub clean_word { |
66
|
|
|
|
|
|
|
my ($self, $word) = @_; |
67
|
|
|
|
|
|
|
my $A = chr(194); |
68
|
|
|
|
|
|
|
my $bullet = chr(183); |
69
|
|
|
|
|
|
|
$word =~ s/://g; |
70
|
|
|
|
|
|
|
$word =~ s/$A//g; |
71
|
|
|
|
|
|
|
$word =~ s/$bullet//g; |
72
|
|
|
|
|
|
|
$word =~ s/.*?<\/sup>//; # RH 080719 |
73
|
|
|
|
|
|
|
$word =~ s/^\s+//; # RH 080719 |
74
|
|
|
|
|
|
|
$word =~ s/\s+$//; # RH 080719 |
75
|
|
|
|
|
|
|
return $word; |
76
|
|
|
|
|
|
|
} |
77
|
|
|
|
|
|
|
|
78
|
|
|
|
|
|
|
sub split_defs { |
79
|
|
|
|
|
|
|
my ($self, $text) = @_; |
80
|
|
|
|
|
|
|
my @defs = split(//, $text ); shift @defs; |
81
|
|
|
|
|
|
|
return @defs; |
82
|
|
|
|
|
|
|
} |
83
|
|
|
|
|
|
|
|
84
|
|
|
|
|
|
|
sub parse_definition { |
85
|
|
|
|
|
|
|
my ($self, $text) = @_; |
86
|
|
|
|
|
|
|
my $ident = ident $self; |
87
|
|
|
|
|
|
|
my $pos = ''; |
88
|
|
|
|
|
|
|
my @word_forms = (); |
89
|
|
|
|
|
|
|
my @definitions = (); |
90
|
|
|
|
|
|
|
$text =~ s/(.*?)//six; |
91
|
|
|
|
|
|
|
$pos=$1; |
92
|
|
|
|
|
|
|
$text =~ s/(.*?)//six; |
93
|
|
|
|
|
|
|
my $subhead = $1; |
94
|
|
|
|
|
|
|
if (defined $subhead) { |
95
|
|
|
|
|
|
|
#while ($subhead =~ m/(.*?)<\/b>/gisx) { push @word_forms, $self->syllables( $1 ); } |
96
|
|
|
|
|
|
|
while ($subhead =~ m/(.*?)<\/b>/gisx) { push @word_forms, $1; } |
97
|
|
|
|
|
|
|
|
98
|
|
|
|
|
|
|
if ($subhead =~ m/(.*?)<\/i>/i) { $pos = $1; $pos =~ s/\.//g; } # Subtype |
99
|
|
|
|
|
|
|
} |
100
|
|
|
|
|
|
|
#print "WORD FORMS: " . join(' ', @word_forms) ."\n\n\n"; |
101
|
|
|
|
|
|
|
while ($text =~ m/(.*?)/gsix) { |
102
|
|
|
|
|
|
|
my $def_text = $1; |
103
|
|
|
|
|
|
|
my $new_defs; |
104
|
|
|
|
|
|
|
if ($def_text =~ m/parse_list( $def_text, $pos ); } |
105
|
|
|
|
|
|
|
else { $new_defs = [$self->parse_single( $def_text )]; } |
106
|
|
|
|
|
|
|
|
107
|
|
|
|
|
|
|
foreach my $definition (@{ $new_defs }) {push @definitions, $definition; } |
108
|
|
|
|
|
|
|
} |
109
|
|
|
|
|
|
|
foreach my $definition (@definitions){ |
110
|
|
|
|
|
|
|
$self->get_word_obj()->add_definition($definition->{text}, $definition->{example}, $pos); |
111
|
|
|
|
|
|
|
} |
112
|
|
|
|
|
|
|
|
113
|
|
|
|
|
|
|
} |
114
|
|
|
|
|
|
|
|
115
|
|
|
|
|
|
|
sub parse_list { |
116
|
|
|
|
|
|
|
my ($self, $text, $pos ) = @_; |
117
|
|
|
|
|
|
|
$text =~ s//\[/ig; |
118
|
|
|
|
|
|
|
$text =~ s/<\/ol>/]/ig; |
119
|
|
|
|
|
|
|
$text =~ s//{ text => '/ig; |
120
|
|
|
|
|
|
|
$text =~ s/<\/li>/'}, /ig; |
121
|
|
|
|
|
|
|
$text =~ s/'\[/\[/ig; |
122
|
|
|
|
|
|
|
$text =~ s/,\s]/]/ig; |
123
|
|
|
|
|
|
|
$text =~ s/]'/]/ig; |
124
|
|
|
|
|
|
|
$text =~ s/Informal<\/i>//ig;#added by Nathan |
125
|
|
|
|
|
|
|
$text =~ s/Slang<\/i>//ig;#added by Nathan |
126
|
|
|
|
|
|
|
$text =~ s/text => '? ?\[{ text => '/text => '/ig; |
127
|
|
|
|
|
|
|
$text =~ s/{ text => '.*<\/i> \[{ text/{ text/ig; |
128
|
|
|
|
|
|
|
$text =~ s/}\]}/}/ig; |
129
|
|
|
|
|
|
|
$text =~ s/({ text => '[-\.\w\s]*)'([-\.\w\s]*'},)/$1$2/ig; |
130
|
|
|
|
|
|
|
#$text =~ s///ig; |
131
|
|
|
|
|
|
|
#$text =~ s/<\/ol>/]/ig; |
132
|
|
|
|
|
|
|
#$text =~ s//{ text => '/ig; |
133
|
|
|
|
|
|
|
#$text =~ s/<\/li>/'}, /ig; |
134
|
|
|
|
|
|
|
#$text =~ s/'\[/\[/ig; |
135
|
|
|
|
|
|
|
#$text =~ s/,\s]/]/ig; |
136
|
|
|
|
|
|
|
#$text =~ s/]'/]/ig; |
137
|
|
|
|
|
|
|
|
138
|
|
|
|
|
|
|
print "LIST TEXT: $text\n"; |
139
|
|
|
|
|
|
|
my $definitions = eval "return $text;"; |
140
|
|
|
|
|
|
|
#my $definitions = [{ text => 'A domesticated carnivorous mammal (Canis familiaris) related to the foxes and wolves and raised in a wide variety of breeds.'}, { text => 'Any of various carnivorous mammals of the family Canidae, such as the dingo.'}, { text => 'A male animal of the family Canidae, especially of the fox or a domesticated breed.'}, { text => 'Any of various other animals, such as the prairie dog.'}, { text => 'Informal [{ text => 'A person: You won, you lucky dog.'}, { text => 'A person regarded as contemptible: You stole my watch, you dog.'}, { text => 'A person regarded as unattractive or uninteresting.'}, { text => 'Something of inferior or low quality: "The President had read the speech to some of his friends and they told him it was a dog" (John P. Roche).'}, { text => 'An investment that produces a low return or a loss.'}]}, { text => 'Slang [{ text => 'A person regarded as unattractive or uninteresting.'}, { text => 'Something of inferior or low quality: "The President had read the speech to some of his friends and they told him it was a dog" (John P. Roche).'}, { text => 'An investment that produces a low return or a loss.'}]}, { text => 'dogs Slang The feet.'}, { text => 'See andiron.'}, { text => 'Slang A hot dog; a wiener.'}, { text => 'Any of various hooked or U-shaped metallic devices used for gripping or holding heavy objects.'}, { text => 'Astronomy A sun dog.'}]; |
141
|
|
|
|
|
|
|
#while($text=~m/[ |
142
|
|
|
|
|
|
|
|
143
|
|
|
|
|
|
|
my @definitions; |
144
|
|
|
|
|
|
|
foreach my $definition ( @$definitions ) { |
145
|
|
|
|
|
|
|
#foreach my $definition ( @definitions ) { |
146
|
|
|
|
|
|
|
#print "HELLO: $definition->{text}\n\n"; |
147
|
|
|
|
|
|
|
if ( $definition->{text} =~ m/^ARRAY/ ) { |
148
|
|
|
|
|
|
|
foreach my $sub_list ( @{ $definition->{text} } ) { |
149
|
|
|
|
|
|
|
push @definitions, $self->parse_single( $sub_list->{text} ); |
150
|
|
|
|
|
|
|
} |
151
|
|
|
|
|
|
|
} else { |
152
|
|
|
|
|
|
|
push @definitions, $self->parse_single( $definition->{text} ); |
153
|
|
|
|
|
|
|
#$self->get_word_obj()->add_definition($definitions[-1]{text}, $definitions[-1]{example}, $pos); |
154
|
|
|
|
|
|
|
#my $list= $self->get_word_obj()->get_defs(); |
155
|
|
|
|
|
|
|
#my @list2 = @$list; |
156
|
|
|
|
|
|
|
#$self->report_word(-1); |
157
|
|
|
|
|
|
|
} |
158
|
|
|
|
|
|
|
} |
159
|
|
|
|
|
|
|
|
160
|
|
|
|
|
|
|
return \@definitions; |
161
|
|
|
|
|
|
|
} |
162
|
|
|
|
|
|
|
|
163
|
|
|
|
|
|
|
sub report_word{ |
164
|
|
|
|
|
|
|
my ($self, $i)=@_; |
165
|
|
|
|
|
|
|
my $word = $self->get_word_obj(); |
166
|
|
|
|
|
|
|
|
167
|
|
|
|
|
|
|
print "WORD: " . $word->get_word() . "\n"; |
168
|
|
|
|
|
|
|
my $list = $word->get_defs(); |
169
|
|
|
|
|
|
|
my @list2 = @$list; |
170
|
|
|
|
|
|
|
if($i eq ''){ |
171
|
|
|
|
|
|
|
$i=1; |
172
|
|
|
|
|
|
|
foreach my $def (@list2){ |
173
|
|
|
|
|
|
|
print "DEF#$i:\n text: " . $def->get_text() . "\n example: " . $def->get_example() . "\n pos: " . $def->get_pos() . "\n\n"; |
174
|
|
|
|
|
|
|
$i++; |
175
|
|
|
|
|
|
|
} |
176
|
|
|
|
|
|
|
}else{ |
177
|
|
|
|
|
|
|
my $def = $list2[$i]; |
178
|
|
|
|
|
|
|
print "DEF#$i:\n text: " . $def->get_text() . "\n example: " . $def->get_example() . "\n pos: " . $def->get_pos() . "\n\n"; |
179
|
|
|
|
|
|
|
} |
180
|
|
|
|
|
|
|
} |
181
|
|
|
|
|
|
|
|
182
|
|
|
|
|
|
|
sub parse_single { |
183
|
|
|
|
|
|
|
my ($self, $text ) = @_; |
184
|
|
|
|
|
|
|
my $ident = ident $self; |
185
|
|
|
|
|
|
|
while ( $text =~ m/(.*?)<\/b>(.*?)<\/i>/gi ) { |
186
|
|
|
|
|
|
|
my ($word_form, $pos) = ($self->syllables( $1 ), $2); $pos =~ s/\.//g; $pos =~ s/ //g; |
187
|
|
|
|
|
|
|
#$self->_insert_word_form( $word_form, $pos ); |
188
|
|
|
|
|
|
|
} |
189
|
|
|
|
|
|
|
my (@keys) = (); |
190
|
|
|
|
|
|
|
my (@roots) = (); |
191
|
|
|
|
|
|
|
my (%word_forms) = (); |
192
|
|
|
|
|
|
|
$text =~ s/(.*?)<\/tt>//i; |
193
|
|
|
|
|
|
|
my $root=''; |
194
|
|
|
|
|
|
|
#$root = $1; if ($root eq $text) { $root = ''; } |
195
|
|
|
|
|
|
|
|
196
|
|
|
|
|
|
|
# Check for additional word forms |
197
|
|
|
|
|
|
|
while ( $text =~ s/(.*?)<\/b>(.*?)<\/i>//gi ) { |
198
|
|
|
|
|
|
|
my ($word_form, $pos) = ($self->syllables($1), $2); $pos =~ s/\.//g; $pos =~ s/ //g; |
199
|
|
|
|
|
|
|
$word_forms{$word_form} = $pos; |
200
|
|
|
|
|
|
|
} |
201
|
|
|
|
|
|
|
if (keys %word_forms) { |
202
|
|
|
|
|
|
|
foreach my $key (sort keys %word_forms) { $self->_insert_word_form( $key, $word_forms{$key} ); } |
203
|
|
|
|
|
|
|
} else { |
204
|
|
|
|
|
|
|
# No word forms, parse definition, root, example |
205
|
|
|
|
|
|
|
#print "SINGLE TEXT: $text \n"; |
206
|
|
|
|
|
|
|
my $example = ''; |
207
|
|
|
|
|
|
|
$text =~ m/()/ig; #resetting $1 |
208
|
|
|
|
|
|
|
$text =~ s/(.*?)<\/i>//i; |
209
|
|
|
|
|
|
|
$example = $1; |
210
|
|
|
|
|
|
|
|
211
|
|
|
|
|
|
|
#print "EXAMPLE: $example\n"; |
212
|
|
|
|
|
|
|
|
213
|
|
|
|
|
|
|
if ($text eq $example) |
214
|
|
|
|
|
|
|
{ $example = ''; } |
215
|
|
|
|
|
|
|
$example =~ s/["']//i; |
216
|
|
|
|
|
|
|
$example =~ s/\.//i; |
217
|
|
|
|
|
|
|
$text =~ s/\[.*?]//i; |
218
|
|
|
|
|
|
|
|
219
|
|
|
|
|
|
|
if ($root) { |
220
|
|
|
|
|
|
|
if ($root =~ m/(.*?)<\/font>/g) { $root = $1; } |
221
|
|
|
|
|
|
|
while ($root =~ m/(.*?)<\/b>/g) { push @roots, $1; } |
222
|
|
|
|
|
|
|
if (@roots) { $root = join(',', @roots); } |
223
|
|
|
|
|
|
|
$root =~ s/'/:/g; } |
224
|
|
|
|
|
|
|
$text =~ s/<\/font>//i; |
225
|
|
|
|
|
|
|
$text =~ s/(.*?)<\/i>//i; |
226
|
|
|
|
|
|
|
$text =~ s/.*?<\/sup>//i; |
227
|
|
|
|
|
|
|
$text =~ s/://i; |
228
|
|
|
|
|
|
|
$text =~ s/["']//i; |
229
|
|
|
|
|
|
|
$text =~ s/See Synonyms at .*?<\/a>\.//i; |
230
|
|
|
|
|
|
|
$text =~ s/See .*?<\/a>//i; |
231
|
|
|
|
|
|
|
$text =~ s/.*?<\/b>//i; |
232
|
|
|
|
|
|
|
$text =~ s/.*?<\/font>//i; |
233
|
|
|
|
|
|
|
$text =~ s/.*?<\/i>//i; |
234
|
|
|
|
|
|
|
$text =~ s/\.//i; |
235
|
|
|
|
|
|
|
|
236
|
|
|
|
|
|
|
# push @keys, "word_id => '$word_id_of{$ident}'"; |
237
|
|
|
|
|
|
|
push @keys, "word => '$word_of{$ident}'"; |
238
|
|
|
|
|
|
|
if ($pos_of{$ident} && !$root) { push @keys, "pos => '$pos_of{$ident}'"; } |
239
|
|
|
|
|
|
|
if ( $root && $example) { push @keys, "root => '$root'"; push @keys, "root_def => '" . $self->_sql_escape($example) . "'"; $example = ''; } |
240
|
|
|
|
|
|
|
elsif ( $root ) { push @keys, "root => '$root'"; } |
241
|
|
|
|
|
|
|
#if ($example) { push @keys, "example => '" . $self->_sql_escape($example) . "'"; } |
242
|
|
|
|
|
|
|
if ($example) { push @keys, "example => '$example'"; } |
243
|
|
|
|
|
|
|
#if ($text) { push @keys, "text => '" . $self->_sql_escape($text) . "'"; } |
244
|
|
|
|
|
|
|
if ($text) { push @keys, "text => '$text'"; } |
245
|
|
|
|
|
|
|
} |
246
|
|
|
|
|
|
|
|
247
|
|
|
|
|
|
|
return eval "return { " . join(', ', @keys) . " };"; |
248
|
|
|
|
|
|
|
} |
249
|
|
|
|
|
|
|
} |
250
|
|
|
|
|
|
|
1; # Magic true value required at end of module |
251
|
|
|
|
|
|
|
__END__ |