File Coverage

blib/lib/Lingua/EN/Segmenter/TextTiling.pm
Criterion Covered Total %
statement 12 12 100.0
branch n/a
condition n/a
subroutine 4 4 100.0
pod n/a
total 16 16 100.0


line stmt bran cond sub pod time code
1             package Lingua::EN::Segmenter::TextTiling;
2              
3             =head1 NAME
4              
5             Lingua::EN::Segmenter::TextTiling - Segment text using the TextTiling method
6              
7             =head1 SYNOPSIS
8              
9             use Lingua::EN::Segmenter::TextTiling qw(segments);
10             use lib '.';
11            
12             my $text = <
13             Lingua::EN::Segmenter is a useful module that allows text to be split up
14             into words, paragraphs, segments, and tiles.
15            
16             Paragraphs are by default indicated by blank lines. Known segment breaks are
17             indicated by a line with only the word "segment_break" in it.
18            
19             The module detects paragraphs that are unrelated to each other by comparing
20             the number of words per-paragraph that are related. The algorithm is designed
21             to work only on long segments.
22            
23             SOUTH OF BAGHDAD, Iraq (CNN) -- Seven U.S. troops freed Sunday after being
24             held by Iraqi forces arrived by helicopter at a base south of Baghdad and were
25             transferred to a C-130 transport plane headed for Kuwait, CNN's Bob Franken
26             reported from the scene.
27            
28             EOT
29            
30             my $num_segment_breaks = 1;
31             my @segments = segments($num_segment_breaks,$text);
32             print $segments[0]; # Prints the first three paragraphs of the above text
33             print "\n----------SEGMENT_BREAK----------\n";
34             print $segments[1]; # Prints the last paragraph of the above text
35            
36             # This module can also be used in an object-oriented fashion
37             my $splitter = new Lingua::EN::Splitter;
38             @words = $splitter->words($text);
39              
40             =head1 DESCRIPTION
41              
42             See synopsis.
43              
44             =head1 EXTENDING
45              
46             This module is designed to be easily extendable. Feel free to extend from this
47             module when designing alternate methods for text segmentation.
48              
49             =head1 AUTHORS
50              
51             David James
52              
53             =head1 SEE ALSO
54              
55             L, L,
56             L
57              
58             =head1 LICENSE
59              
60             Copyright (c) 2002 David James
61             All rights reserved.
62             This program is free software; you can redistribute it and/or
63             modify it under the same terms as Perl itself.
64              
65             =cut
66              
67             $VERSION = 0.10;
68             @EXPORT_OK = qw(
69             segment
70             segments
71             set_tiles_per_block
72             set_number_of_smoothing_rounds
73             set_tokens_per_tile
74             set_paragraph_regexp
75             set_non_word_regexp
76             set_locale
77             set_stop_words
78             );
79              
80 1     1   3109 use Math::Vector::SortIndexes qw(sort_indexes_descending);
  1         212  
  1         81  
81 1     1   849 use Math::VecStat qw(average min sum);
  1         1336  
  1         86  
82 1     1   886 use Math::HashSum qw(hashsum);
  1         178  
  1         61  
83              
84 1     1   7 use base 'Lingua::EN::Segmenter';
  1         1  
  1         1476  
85             use strict;
86              
87              
88             # Create a new instance of the object
89             sub new {
90             my $class = shift;
91             $class->SUPER::new(
92             TILES_PER_BLOCK=>7,
93             NUMBER_OF_SMOOTHING_ROUNDS=>2,
94             @_
95             );
96             }
97              
98             # Segment a piece of text
99             sub segment {
100             my ($self, $num_segments, $input) = @_;
101              
102             # Get smoothed depth scores
103             my $scores = $self->smoothed_depth_scores($input);
104              
105             # Get paragraph breaks
106             my $breaks = $self->{splitter}->paragraph_breaks($input);
107            
108             # Get predicted segment breaks
109             return $self->tile2segment($num_segments, $breaks, $scores);
110             }
111              
112             sub set_tiles_per_block {
113             my $self = shift;
114             $self->{TILES_PER_BLOCK} = shift;
115             }
116              
117             sub set_number_of_smoothing_rounds {
118             my $self = shift;
119             $self->{NUMBER_OF_SMOOTHING_ROUNDS} = shift;
120             }
121              
122              
123             ######################################################################
124             # PRIVATE METHODS
125             ######################################################################
126              
127             # Accept as input the scores of the tiles. Output segment scores.
128             sub tile2segment {
129             my ($self, $num_segments, $breaks, $scores) = @_;
130              
131             my @indexes = sort_indexes_descending @$scores;
132              
133             my @too_close = (
134             -$self->{MIN_SEGMENT_SIZE}..-1, 1..$self->{MIN_SEGMENT_SIZE}
135             );
136             my @direction = qw(L R);
137             my (%segments, %verbose, $cut_off_depth);
138            
139             # Calculate the most likely segment breaks
140             GAP: foreach my $i (@indexes) {
141             my $tile_no = $i + $self->{TILES_PER_BLOCK};
142             my $closest_break = (min( map { abs($_ - $tile_no) } @$breaks ))[1];
143             $segments{$closest_break+$_} and next GAP for (@too_close);
144             $segments{$closest_break} .=
145             $direction[$breaks->[$closest_break] > $tile_no];
146             $verbose{$tile_no} = [ $tile_no, $scores->[$i], $closest_break ];
147             $cut_off_depth = $scores->[$i];
148             last if keys %segments == $num_segments;
149             }
150            
151             # Verbose output
152             if ($self->{VERBOSE}) {
153             printf "Cut-off depth = %6.4f\n\n", $cut_off_depth;
154             print " Gap Depth Para\n";
155             foreach (sort { $a <=> $b } keys %verbose) {
156             printf "%4d %6.3f %4d\n", @{$verbose{$_}}
157             }
158             print "\n";
159             }
160             return \%segments;
161             }
162              
163             # Calculate depth scores based on a list of gap scores
164             sub depth {
165             no warnings;
166            
167             my $self = shift;
168             my @score = @{$_[0]};
169             my @depth;
170             for my $i (1..$#score) {
171             $depth[$i] = $score[$i-1] + $score[$i+1] - 2*$score[$i];
172             }
173             $depth[0] = $score[1] - $score[0];
174             $depth[$#score] = $score[-2] - $score[-1];
175             return \@depth;
176             }
177              
178             # Given some depth scores, smooth them.
179             sub smooth {
180             my $self = shift;
181             my @depth = @{$_[0]};
182             unshift @depth, $depth[0];
183             push @depth, $depth[-1];
184             for (1..$self->{NUMBER_OF_SMOOTHING_ROUNDS}) {
185             foreach my $j (1..$#depth-1) {
186             $depth[$j] = average $depth[$j-1], $depth[$j], $depth[$j+1];
187             }
188             }
189             return [ @depth[1..$#depth-1] ];
190             }
191              
192             # Take text as input and output a list of smoothed depth scores
193             sub smoothed_depth_scores {
194             my ($self, $input) = @_;
195             my $words = $self->{splitter}->words($input);
196             my $tiles = $self->{splitter}->tile($words);
197             my $depth_scores = $self->depth($self->gap_scores($tiles));
198             return $self->smooth($depth_scores);
199             }
200              
201             # Scores for the gap between two tiles
202             sub gap_scores {
203             my $self = shift;
204             my @tiles = @{$_[0]};
205             my $TILES_PER_BLOCK = $self->{TILES_PER_BLOCK};
206             my (@score, $i);
207             for $i ($TILES_PER_BLOCK .. @tiles-$TILES_PER_BLOCK) {
208             my $L = $i-$TILES_PER_BLOCK;
209             my $R = $i+$TILES_PER_BLOCK-1;
210             my %l = hashsum map { %$_ } @tiles[$L..$i-1];
211             my %r = hashsum map { %$_ } @tiles[$i..$R];
212             my %all = map { %$_ } @tiles[$L..$R];
213             my $numerator = sum map { $l{$_}*$r{$_} } keys %all;
214             my $denom1 = sum map { $l{$_}*$l{$_} } keys %all;
215             my $denom2 = sum map { $r{$_}*$r{$_} } keys %all;
216             push @score, $numerator/sqrt($denom1*$denom2);
217             }
218             return \@score;
219             }
220              
221              
222             1;
223              
224              
225              
226              
227