line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Text::Context::Porter; |
2
|
1
|
|
|
1
|
|
723
|
use base 'Text::Context'; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
714
|
|
3
|
1
|
|
|
1
|
|
3430
|
use strict; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
31
|
|
4
|
1
|
|
|
1
|
|
15
|
use warnings; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
23
|
|
5
|
1
|
|
|
1
|
|
747
|
use Lingua::Stem::En; |
|
1
|
|
|
|
|
2313
|
|
|
1
|
|
|
|
|
416
|
|
6
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
our $VERSION = "1.1"; |
8
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
=head1 NAME |
10
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
Text::Context::Porter - Text::Context with inflection awareness |
12
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
=head1 SYNOPSIS |
14
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
use Text::Context::Porter; |
16
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
my $snippet = Text::Context::Porter->new($text, @keywords); |
18
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
$snippet->keywords("foo", "bar"); # In case you change your mind |
20
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
print $snippet->as_html; |
22
|
|
|
|
|
|
|
print $snippet->as_text; |
23
|
|
|
|
|
|
|
|
24
|
|
|
|
|
|
|
=head1 DESCRIPTION |
25
|
|
|
|
|
|
|
|
26
|
|
|
|
|
|
|
Given a piece of text and some search terms, produces an object |
27
|
|
|
|
|
|
|
which locates the search terms in the message, extracts a reasonable-length |
28
|
|
|
|
|
|
|
string containing all the search terms, and optionally dumps the string out |
29
|
|
|
|
|
|
|
as HTML text with the search terms highlighted in bold. |
30
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
However, unlike the ordinary C, this subclass is able to |
32
|
|
|
|
|
|
|
highlight terms in the document which are inflected variants of the search |
33
|
|
|
|
|
|
|
terms. For instance, searching for "testing" should highlight "test", |
34
|
|
|
|
|
|
|
"tested" and so on. |
35
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
=cut |
37
|
|
|
|
|
|
|
|
38
|
|
|
|
|
|
|
sub keywords { |
39
|
1
|
|
|
1
|
1
|
366
|
my ($self, @keywords) = @_; |
40
|
1
|
50
|
|
|
|
4
|
if (@keywords) { |
41
|
2
|
|
|
|
|
3
|
$self->{keywords} = Lingua::Stem::En::stem({ -words => |
42
|
1
|
|
|
|
|
4
|
[ map {s/\s+/ /g; lc $_} @keywords ] |
|
2
|
|
|
|
|
10
|
|
43
|
|
|
|
|
|
|
}); |
44
|
|
|
|
|
|
|
} |
45
|
1
|
|
|
|
|
199
|
return @{$self->{keywords}}; |
|
1
|
|
|
|
|
4
|
|
46
|
|
|
|
|
|
|
} |
47
|
|
|
|
|
|
|
|
48
|
12
|
|
|
12
|
1
|
142
|
sub para_class {"Text::Context::Para::Porter"} |
49
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
sub paras { |
51
|
2
|
|
|
2
|
1
|
646
|
my $self = shift; |
52
|
2
|
|
50
|
|
|
10
|
my $max_len = shift || 150; |
53
|
2
|
|
|
|
|
10
|
$self->prepare_text; |
54
|
2
|
|
|
|
|
26
|
$self->score_para($_) for @{$self->{text_a}}; |
|
2
|
|
|
|
|
10
|
|
55
|
2
|
|
|
|
|
13
|
my @paras = $self->get_appropriate_paras; |
56
|
2
|
|
|
|
|
133
|
return map { $_->slim($max_len / @paras) } @paras; |
|
2
|
|
|
|
|
8
|
|
57
|
|
|
|
|
|
|
|
58
|
|
|
|
|
|
|
} |
59
|
|
|
|
|
|
|
|
60
|
|
|
|
|
|
|
sub score_para { |
61
|
9
|
|
|
9
|
1
|
13
|
my ($self, $para) = @_; |
62
|
9
|
|
|
|
|
19
|
my $content = $para->{content}; |
63
|
9
|
|
|
|
|
12
|
$content = join " ", @{Lingua::Stem::En::stem({ -words => [ split /\W+/, $para->{content} ] })}; |
|
9
|
|
|
|
|
90
|
|
64
|
9
|
|
|
|
|
1692
|
my %matches; |
65
|
|
|
|
|
|
|
# Do all the matching of keywords in advance of the boring |
66
|
|
|
|
|
|
|
# permutation bit |
67
|
9
|
|
|
|
|
11
|
for my $word (@{$self->{keywords}}) { |
|
9
|
|
|
|
|
24
|
|
68
|
18
|
|
|
|
|
20
|
my $word_score = 0; |
69
|
18
|
100
|
|
|
|
212
|
$word_score += 1 + ($content =~ tr/ / /) if $content =~ /\b\Q$word\E\b/i; |
70
|
18
|
|
|
|
|
49
|
$matches{$word} = $word_score; |
71
|
|
|
|
|
|
|
} |
72
|
|
|
|
|
|
|
#XXX : Possible optimization: Give up if there are no matches |
73
|
|
|
|
|
|
|
|
74
|
9
|
|
|
|
|
33
|
for my $wordset ($self->permute_keywords) { |
75
|
27
|
|
|
|
|
314
|
my $this_score = 0; |
76
|
27
|
|
|
|
|
70
|
$this_score += $matches{$_} for @$wordset; |
77
|
27
|
100
|
|
|
|
79
|
$para->{scoretable}[$this_score] = $wordset if $this_score > @$wordset; |
78
|
|
|
|
|
|
|
} |
79
|
9
|
|
|
|
|
17
|
$para->{final_score} = $#{$para->{scoretable}}; |
|
9
|
|
|
|
|
39
|
|
80
|
|
|
|
|
|
|
} |
81
|
|
|
|
|
|
|
|
82
|
|
|
|
|
|
|
package Text::Context::Para::Porter; |
83
|
1
|
|
|
1
|
|
5
|
use constant DEFAULT_START_TAG => ''; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
65
|
|
84
|
1
|
|
|
1
|
|
4
|
use constant DEFAULT_END_TAG => ""; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
36
|
|
85
|
1
|
|
|
1
|
|
4
|
use base 'Text::Context::Para'; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
811
|
|
86
|
1
|
|
|
1
|
|
9356
|
use HTML::Entities; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
692
|
|
87
|
|
|
|
|
|
|
|
88
|
|
|
|
|
|
|
sub marked_up { |
89
|
1
|
|
|
1
|
|
6
|
my $self = shift; |
90
|
1
|
|
50
|
|
|
8
|
my $start_tag = shift || DEFAULT_START_TAG; |
91
|
1
|
|
50
|
|
|
7
|
my $end_tag = shift || DEFAULT_END_TAG; |
92
|
1
|
|
|
|
|
5
|
my $content = $self->as_text; |
93
|
1
|
|
|
|
|
5
|
my %words = map {$_ => 1} @{$self->{marked_words}}; |
|
2
|
|
|
|
|
28
|
|
|
1
|
|
|
|
|
3
|
|
94
|
1
|
|
|
|
|
4
|
my $output; |
95
|
1
|
|
|
|
|
11
|
for my $word (split /(\s+)/, $content) { |
96
|
17
|
100
|
|
|
|
57
|
if ($word =~ /\S/) { |
97
|
9
|
|
|
|
|
9
|
my ($stemmed) = @{Lingua::Stem::En::stem({ -words => [ $word ]})}; |
|
9
|
|
|
|
|
37
|
|
98
|
9
|
100
|
|
|
|
491
|
if ($words{$stemmed}) { |
99
|
2
|
|
|
|
|
7
|
$word = $start_tag . encode_entities($word) . $end_tag; |
100
|
|
|
|
|
|
|
} else { |
101
|
7
|
|
|
|
|
21
|
$word = encode_entities($word); |
102
|
|
|
|
|
|
|
} |
103
|
|
|
|
|
|
|
} |
104
|
17
|
|
|
|
|
109
|
$output .= $word; |
105
|
|
|
|
|
|
|
} |
106
|
1
|
|
|
|
|
12
|
return $output; |
107
|
|
|
|
|
|
|
} |
108
|
|
|
|
|
|
|
|
109
|
|
|
|
|
|
|
sub slim { |
110
|
2
|
|
|
2
|
|
4
|
my ($self, $max_weight) = @_; |
111
|
2
|
|
|
|
|
5
|
$self->{content} =~ s/^\s+//; |
112
|
2
|
|
|
|
|
9
|
$self->{content} =~ s/\s+$//; |
113
|
2
|
50
|
|
|
|
14
|
return $self if length $self->{content} <= $max_weight; |
114
|
0
|
|
|
|
|
|
my %words = map {$_ => 1 } @{$self->{marked_words}}; |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
115
|
0
|
|
|
|
|
|
my $old_length = -1; |
116
|
0
|
|
|
|
|
|
my $this_length = length $self->{content}; |
117
|
0
|
|
|
|
|
|
do {{ |
118
|
0
|
0
|
|
|
|
|
if ($old_length == $this_length) { return $self; } # Give up |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
119
|
0
|
|
|
|
|
|
$old_length = $this_length; |
120
|
|
|
|
|
|
|
|
121
|
0
|
|
|
|
|
|
$self->{content} =~ /^\W*(\w+)/; |
122
|
0
|
|
|
|
|
|
my $stemmed = Lingua::Stem::En::stem({ -words => [$1]}); |
123
|
0
|
|
|
|
|
|
$stemmed = $stemmed->[0]; |
124
|
0
|
0
|
|
|
|
|
if (!exists $words{$stemmed}) { |
125
|
0
|
|
|
|
|
|
$self->{content} =~ s/^\W*(\w+)\W*/.../ |
126
|
|
|
|
|
|
|
}; |
127
|
|
|
|
|
|
|
|
128
|
0
|
|
|
|
|
|
$self->{content} =~ /(\w+)\W*$/; |
129
|
0
|
|
|
|
|
|
$stemmed = Lingua::Stem::En::stem({ -words => [$1]}); |
130
|
0
|
0
|
|
|
|
|
if (!exists $words{$stemmed}) { |
131
|
0
|
|
|
|
|
|
$self->{content} =~ s/(\w+)\W*$/.../ ; |
132
|
|
|
|
|
|
|
}; |
133
|
0
|
|
|
|
|
|
$this_length = length $self->{content}; |
134
|
|
|
|
|
|
|
}} until ($this_length <= $max_weight); |
135
|
|
|
|
|
|
|
|
136
|
0
|
|
|
|
|
|
return $self; |
137
|
|
|
|
|
|
|
} |
138
|
|
|
|
|
|
|
=head1 COPYRIGHT |
139
|
|
|
|
|
|
|
|
140
|
|
|
|
|
|
|
Copyright (C) 2004,2006 Simon Cozens |
141
|
|
|
|
|
|
|
|
142
|
|
|
|
|
|
|
You may use and redistribute this module under the same terms as Perl |
143
|
|
|
|
|
|
|
itself. |
144
|
|
|
|
|
|
|
|
145
|
|
|
|
|
|
|
=cut |
146
|
|
|
|
|
|
|
|
147
|
|
|
|
|
|
|
1; |