line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Lingua::EN::Segmenter; |
2
|
|
|
|
|
|
|
|
3
|
|
|
|
|
|
|
=head1 NAME |
4
|
|
|
|
|
|
|
|
5
|
|
|
|
|
|
|
Lingua::EN::Segmenter - Subdivide texts into passages that represent subtopics |
6
|
|
|
|
|
|
|
|
7
|
|
|
|
|
|
|
=head1 SYNOPSIS |
8
|
|
|
|
|
|
|
|
9
|
|
|
|
|
|
|
Don't directly use this module. Use L instead. |
10
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
=head1 DESCRIPTION |
12
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
See synopsis. |
14
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
=head1 EXTENDING |
16
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
L inherits from this module. If you want to |
18
|
|
|
|
|
|
|
segment text using a method other than text tiling, create a different module |
19
|
|
|
|
|
|
|
under Lingua::EN::Segmenter::* and inherit from this module. |
20
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
=head1 AUTHORS |
22
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
David James |
24
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
=head1 SEE ALSO |
26
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
L, L, |
28
|
|
|
|
|
|
|
L, L |
29
|
|
|
|
|
|
|
|
30
|
|
|
|
|
|
|
=head1 LICENSE |
31
|
|
|
|
|
|
|
|
32
|
|
|
|
|
|
|
Copyright (c) 2002 David James |
33
|
|
|
|
|
|
|
All rights reserved. |
34
|
|
|
|
|
|
|
This program is free software; you can redistribute it and/or |
35
|
|
|
|
|
|
|
modify it under the same terms as Perl itself. |
36
|
|
|
|
|
|
|
|
37
|
|
|
|
|
|
|
=cut |
38
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
$VERSION = 0.10; |
40
|
|
|
|
|
|
|
@EXPORT_OK = qw( |
41
|
|
|
|
|
|
|
set_tokens_per_tile |
42
|
|
|
|
|
|
|
set_paragraph_regexp |
43
|
|
|
|
|
|
|
set_non_word_regexp |
44
|
|
|
|
|
|
|
set_locale |
45
|
|
|
|
|
|
|
set_stop_words |
46
|
|
|
|
|
|
|
segment |
47
|
|
|
|
|
|
|
segments |
48
|
|
|
|
|
|
|
); |
49
|
|
|
|
|
|
|
|
50
|
1
|
|
|
1
|
|
7
|
use strict; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
33
|
|
51
|
1
|
|
|
1
|
|
6
|
use base 'Class::Exporter'; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
987
|
|
52
|
1
|
|
|
1
|
|
2070
|
use Lingua::EN::Splitter; |
|
0
|
|
|
|
|
|
|
|
0
|
|
|
|
|
|
|
53
|
|
|
|
|
|
|
use Carp qw(croak); |
54
|
|
|
|
|
|
|
|
55
|
|
|
|
|
|
|
|
56
|
|
|
|
|
|
|
# Create a new instance of this object |
57
|
|
|
|
|
|
|
sub new { |
58
|
|
|
|
|
|
|
my $class = shift; |
59
|
|
|
|
|
|
|
bless { |
60
|
|
|
|
|
|
|
MIN_SEGMENT_SIZE=>2, |
61
|
|
|
|
|
|
|
splitter=>Lingua::EN::Splitter->new, |
62
|
|
|
|
|
|
|
@_ |
63
|
|
|
|
|
|
|
}, $class |
64
|
|
|
|
|
|
|
} |
65
|
|
|
|
|
|
|
|
66
|
|
|
|
|
|
|
sub segment { |
67
|
|
|
|
|
|
|
croak "Use Lingua::EN::Segmenter::TextTiling instead."; |
68
|
|
|
|
|
|
|
} |
69
|
|
|
|
|
|
|
|
70
|
|
|
|
|
|
|
sub segments { |
71
|
|
|
|
|
|
|
my ($self, $num_segments, $input) = @_; |
72
|
|
|
|
|
|
|
|
73
|
|
|
|
|
|
|
my $segment_breaks = $self->segment($num_segments,$input); |
74
|
|
|
|
|
|
|
my @segment_breaks = sort { $a <=> $b } keys %{$segment_breaks}; |
75
|
|
|
|
|
|
|
my @paragraphs = @{$self->{splitter}->paragraphs($input)}; |
76
|
|
|
|
|
|
|
my @segments; |
77
|
|
|
|
|
|
|
my $last_segment = -1; |
78
|
|
|
|
|
|
|
foreach (@segment_breaks,$#paragraphs) { |
79
|
|
|
|
|
|
|
next if $last_segment == $_; |
80
|
|
|
|
|
|
|
push @segments, join "\n\n", @paragraphs[$last_segment+1..$_]; |
81
|
|
|
|
|
|
|
$last_segment = $_; |
82
|
|
|
|
|
|
|
} |
83
|
|
|
|
|
|
|
return @segments; |
84
|
|
|
|
|
|
|
} |
85
|
|
|
|
|
|
|
|
86
|
|
|
|
|
|
|
|
87
|
|
|
|
|
|
|
######################################################### |
88
|
|
|
|
|
|
|
# Mutator methods |
89
|
|
|
|
|
|
|
######################################################### |
90
|
|
|
|
|
|
|
|
91
|
|
|
|
|
|
|
|
92
|
|
|
|
|
|
|
sub set_min_segment_size { |
93
|
|
|
|
|
|
|
my $self = shift; |
94
|
|
|
|
|
|
|
$self->{MIN_SEGMENT_SIZE} = shift; |
95
|
|
|
|
|
|
|
} |
96
|
|
|
|
|
|
|
|
97
|
|
|
|
|
|
|
sub set_tokens_per_tile { |
98
|
|
|
|
|
|
|
my $self = shift; |
99
|
|
|
|
|
|
|
$self->{splitter}->set_tokens_per_tile(@_); |
100
|
|
|
|
|
|
|
} |
101
|
|
|
|
|
|
|
|
102
|
|
|
|
|
|
|
sub set_paragraph_regexp { |
103
|
|
|
|
|
|
|
my $self = shift; |
104
|
|
|
|
|
|
|
$self->{splitter}->set_paragraph_regexp(@_); |
105
|
|
|
|
|
|
|
} |
106
|
|
|
|
|
|
|
|
107
|
|
|
|
|
|
|
sub set_non_word_regexp { |
108
|
|
|
|
|
|
|
my $self = shift; |
109
|
|
|
|
|
|
|
$self->{splitter}->set_non_word_regexp(@_); |
110
|
|
|
|
|
|
|
} |
111
|
|
|
|
|
|
|
|
112
|
|
|
|
|
|
|
sub set_locale { |
113
|
|
|
|
|
|
|
my $self = shift; |
114
|
|
|
|
|
|
|
$self->{splitter}->set_locale(@_); |
115
|
|
|
|
|
|
|
} |
116
|
|
|
|
|
|
|
|
117
|
|
|
|
|
|
|
|
118
|
|
|
|
|
|
|
1; |