File Coverage

blib/lib/Lingua/EN/Segmenter.pm

Criterion	Covered	Total	%
statement	7	9	77.7
branch			n/a
condition			n/a
subroutine	3	3	100.0
pod			n/a
total	10	12	83.3

line	stmt	sub	time	code
1				package Lingua::EN::Segmenter;
2
3				=head1 NAME
4
5				Lingua::EN::Segmenter - Subdivide texts into passages that represent subtopics
6
7				=head1 SYNOPSIS
8
9				Don't directly use this module. Use L instead.
10
11				=head1 DESCRIPTION
12
13				See synopsis.
14
15				=head1 EXTENDING
16
17				L inherits from this module. If you want to
18				segment text using a method other than text tiling, create a different module
19				under Lingua::EN::Segmenter::* and inherit from this module.
20
21				=head1 AUTHORS
22
23				David James
24
25				=head1 SEE ALSO
26
27				L, L,
28				L, L
29
30				=head1 LICENSE
31
32				Copyright (c) 2002 David James
33				All rights reserved.
34				This program is free software; you can redistribute it and/or
35				modify it under the same terms as Perl itself.
36
37				=cut
38
39				$VERSION = 0.10;
40				@EXPORT_OK = qw(
41				set_tokens_per_tile
42				set_paragraph_regexp
43				set_non_word_regexp
44				set_locale
45				set_stop_words
46				segment
47				segments
48				);
49
50	1	1	7	use strict;
	1		3
	1		33
51	1	1	6	use base 'Class::Exporter';
	1		2
	1		987
52	1	1	2070	use Lingua::EN::Splitter;
	0
	0
53				use Carp qw(croak);
54
55
56				# Create a new instance of this object
57				sub new {
58				my $class = shift;
59				bless {
60				MIN_SEGMENT_SIZE=>2,
61				splitter=>Lingua::EN::Splitter->new,
62				@_
63				}, $class
64				}
65
66				sub segment {
67				croak "Use Lingua::EN::Segmenter::TextTiling instead.";
68				}
69
70				sub segments {
71				my ($self, $num_segments, $input) = @_;
72
73				my $segment_breaks = $self->segment($num_segments,$input);
74				my @segment_breaks = sort { $a <=> $b } keys %{$segment_breaks};
75				my @paragraphs = @{$self->{splitter}->paragraphs($input)};
76				my @segments;
77				my $last_segment = -1;
78				foreach (@segment_breaks,$#paragraphs) {
79				next if $last_segment == $_;
80				push @segments, join "\n\n", @paragraphs[$last_segment+1..$_];
81				$last_segment = $_;
82				}
83				return @segments;
84				}
85
86
87				#########################################################
88				# Mutator methods
89				#########################################################
90
91
92				sub set_min_segment_size {
93				my $self = shift;
94				$self->{MIN_SEGMENT_SIZE} = shift;
95				}
96
97				sub set_tokens_per_tile {
98				my $self = shift;
99				$self->{splitter}->set_tokens_per_tile(@_);
100				}
101
102				sub set_paragraph_regexp {
103				my $self = shift;
104				$self->{splitter}->set_paragraph_regexp(@_);
105				}
106
107				sub set_non_word_regexp {
108				my $self = shift;
109				$self->{splitter}->set_non_word_regexp(@_);
110				}
111
112				sub set_locale {
113				my $self = shift;
114				$self->{splitter}->set_locale(@_);
115				}
116
117
118				1;