File Coverage

blib/lib/Lingua/ZH/Summarize.pm

Criterion	Covered	Total	%
statement	10	12	83.3
branch			n/a
condition			n/a
subroutine	4	4	100.0
pod			n/a
total	14	16	87.5

line	stmt	sub	time	code
1				# $File: //member/autrijus/Lingua-ZH-Summarize/Summarize.pm $ $Author: autrijus $
2				# $Revision: #2 $ $Change: 3687 $ $DateTime: 2003/01/20 07:22:40 $
3
4				package Lingua::ZH::Summarize;
5				$Lingua::ZH::Summarize::VERSION = '0.01';
6
7	1	1	649	use strict;
	1		2
	1		44
8	1	1	5	use vars qw($VERSION @ISA @EXPORT);
	1		3
	1		65
9
10	1	1	5	use Exporter;
	1		2
	1		42
11	1	1	744	use Lingua::ZH::Toke;
	0
	0
12
13				=head1 NAME
14
15				Lingua::ZH::Summarize - Summarizing bodies of Chinese text
16
17				=head1 SYNOPSIS
18
19				use Lingua::ZH::Summarize;
20
21				print summarize( $text ); # Easy, no? :-)
22				print summarize( $text, maxlength => 500 ); # 500-byte summary
23				print summarize( $text, wrap => 75 ); # Wrap output to 75 col.
24
25				=head1 DESCRIPTION
26
27				This is a simple module which makes an unscientific effort at
28				summarizing Chinese text. It recognizes simple patterns which look like
29				statements, abridges them, and concatenates them into something vaguely
30				resembling a summary. It needs more work on large bodies of text, but
31				it seems to have a decent effect on small inputs at the moment.
32
33				Lingua::ZH::Summarize exports one function, C, which takes
34				the text to summarize as its first argument, and any number of optional
35				directives in C value> form. The options it'll take are:
36
37				=over
38
39				=item maxlength
40
41				Specifies the maximum length, in bytes, of the generated summary.
42
43				=item wrap
44
45				Prettyprints the summary output by wrapping it to the number of columns
46				which you specify. This requires the L module.
47
48				=back
49
50				Needless to say, this is a very simple and not terribly universally
51				effective scheme, but it's good enough for a first draft, and I'll bang
52				on it more later. Like I said, it's not a scientific approach to the
53				problem, but it's better than nothing.
54
55				=cut
56
57				@ISA = qw(Exporter);
58				@EXPORT = qw(summarize);
59
60				my %punct = map { $_ => $_ } qw(�C �H �I �F ...);
61				$punct{$_} = '�C' for qw(�A ��);
62
63				my %key = map { $_ => 1 } qw(�O �\| �� N �� );
64				my %stop = (
65				%key, map { $_ => 1 } qw(
66				�� M �P �� ~ �� i �� ] �B �� s
67				)
68				);
69
70				my $is_utf8;
71				sub import {
72				my $class = shift;
73
74				if ($_[0] eq 'utf8' and !$is_utf8++) {
75				Lingua::ZH::Toke->import(@_);
76
77				require Encode;
78				%key = map { Encode::decode( big5 => $_ ) => 1 } keys %key;
79				%stop = map { Encode::decode( big5 => $_ ) => 1 } keys %stop;
80				%punct = map {
81				Encode::decode( big5 => $_ ) =>
82				Encode::decode( big5 => $punct{$_} )
83				} keys %punct;
84				}
85				}
86
87				sub summarize {
88				my ($text, %options) = @_;
89
90				# Run each filter over the text.
91				return unless $text;
92
93				# Strip whitespace and formatting out of the text.
94				$text =~ s/^\s+//;
95				$text =~ s/\s+/ /sg;
96				$text =~ s/\s+$//;
97
98				unless (exists $options{maxlength} and $options{maxlength} > 0) {
99				$options{maxlength} = log(length($text)) * 30;
100				}
101
102				### Here's where the interesting logic happens.
103
104				my $sentence = Lingua::ZH::Toke->new($text);
105
106				# First we break it into sentence pieces. Kind of. Sort of.
107
108				my $summary = '';
109				my $flag = 1;
110
111				<$sentence> unless $sentence->[0][0];
112
113				while (my ($chunk, $punct) = (scalar <$sentence>, scalar <$sentence>)) {
114				($flag = $punct{$punct}, next) unless $flag;
115				$flag = $punct{$punct} or next;
116
117				next unless length($chunk) > 10;
118				next unless $chunk =~ /.+(?:\Q�O\E\|\Q�\|\E\|\Q��\E\|\Q�N\E\|\Q��\E\|\Q��\E)/;
119				next if $stop{substr($chunk, 0, 2)} or $stop{substr($chunk, -2)};
120
121				$summary .= $chunk . $punct{$punct};
122
123				last if length($summary) >= $options{maxlength};
124				}
125
126				### Done! Do any necessary postprocessing before returning.
127
128				return $summary unless $options{wrap};
129
130				# Prettyprint the summary to make it look nice on a terminal, if requested.
131
132				require Lingua::ZH::Wrap;
133
134				$summary = Encode::encode(big5 => $summary) if $is_utf8;
135				$summary = Lingua::ZH::Wrap::wrap(
136				$summary, $options{wrap} \|\| 72, 1
137				);
138				$summary = Encode::decode(big5 => $summary) if $is_utf8;
139
140				return $summary;
141				}
142
143				1;
144
145				=head1 SEE ALSO
146
147				L, L, L
148
149				=head1 ACKNOWLEDGEMENTS
150
151				Algorithm adapted from the L module by
152				Dennis Taylor, Edennis@funkplanet.comE.
153
154				=head1 AUTHORS
155
156				Autrijus Tang Eautrijus@autrijus.orgE
157
158				=head1 COPYRIGHT
159
160				Copyright 2003 by Autrijus Tang Eautrijus@autrijus.orgE.
161
162				This program is free software; you can redistribute it and/or modify it
163				under the same terms as Perl itself.
164
165				See L
166
167				=cut