File Coverage

blib/lib/Lingua/Concordance.pm

Criterion	Covered	Total	%
statement	134	139	96.4
branch	21	28	75.0
condition	1	6	16.6
subroutine	24	24	100.0
pod	10	10	100.0
total	190	207	91.7

line	stmt	bran	cond	sub	pod	time	code
1							package Lingua::Concordance;
2
3							# Concordance.pm - keyword-in-context (KWIC) search interface
4
5							# Eric Lease Morgan
6							# June 7, 2009 - first investigations
7							# June 8, 2009 - tweaked _by_match; still doesn't work quite right
8							# August 29, 2010 - added scale, positions, and map methods
9							# August 31, 2010 - removed \r & \t from input, duh!
10							# October 31, 2010 - removed syntax error from synopsis
11
12
13							# configure defaults
14	1			1		892	use constant RADIUS => 20;
	1					2
	1					70
15	1			1		6	use constant SORT => 'none';
	1					2
	1					38
16	1			1		21	use constant ORDINAL => 1;
	1					1
	1					42
17	1			1		7	use constant SCALE => 10;
	1					1
	1					37
18
19							# include
20	1			1		5	use strict;
	1					1
	1					29
21	1			1		4	use warnings;
	1					2
	1					35
22	1			1		719	use Math::Round;
	1					11613
	1					871
23
24							# define
25							our $VERSION = '0.04';
26
27
28							sub new {
29
30							# get input
31	1			1	1	775	my ( $class ) = @_;
32
33							# initalize
34	1					2	my $self = {};
35
36							# set defaults
37	1					4	$self->{ radius } = RADIUS;
38	1					2	$self->{ sort } = SORT;
39	1					2	$self->{ ordinal } = ORDINAL;
40	1					2	$self->{ scale } = SCALE;
41
42							# return
43	1					3	return bless $self, $class;
44
45							}
46
47
48							sub text {
49
50							# get input
51	10			10	1	15	my ( $self, $text ) = @_;
52
53							# check...
54	10	100				21	if ( $text ) {
55
56							# clean...
57	1					9	$text =~ s/\t/ /g;
58	1					8	$text =~ s/\r/ /g;
59	1					21	$text =~ s/\n/ /g;
60	1					241	$text =~ s/ +/ /g;
61	1					4	$text =~ s/\b--\b/ -- /g;
62
63							# set
64	1					3	$self->{ text } = $text;
65
66							}
67
68							# return
69	10					25	return $self->{ text };
70
71							}
72
73
74							sub query {
75
76							# get input; check & set; return
77	10			10	1	13	my ( $self, $query ) = @_;
78	10	100				22	if ( $query ) { $self->{ query } = $query }
	2					3
79	10					23	return $self->{ query };
80
81							}
82
83
84							sub scale {
85
86							# get input; check & set; return
87	4			4	1	393	my ( $self, $scale ) = @_;
88	4	100				11	if ( $scale ) { $self->{ scale } = $scale }
	1					2
89	4					12	return $self->{ scale };
90
91							}
92
93
94							sub map {
95
96							# get input
97	1			1	1	503	my ( $self ) = shift;
98
99							# initialize
100	1					3	my %map = ();
101	1					3	my $scale = $self->scale;
102	1					3	my @positions = $self->positions;
103	1					3	my $length = length( $self->text );
104
105	1					6	for ( my $i = 1; $i <= $scale; $i++ ) { $map{ round(( $i * ( 100 / $scale ))) } = 0 }
	5					90
106	1					16	my @locations = sort { $a <=> $b } keys %map;
	9					13
107
108							# process each position
109	1					3	foreach ( @positions ) {
110
111							# calculate postion as a percentage of length
112	2					8	my $position = round(( $_ * 100 ) / $length );
113
114							# map the position to a location on the scale; this is the cool part
115	2					16	my $location = 0;
116	2					9	for ( my $i = 0; $i <= $#locations; $i++ ) {
117
118	2	50	33			14	if ( $position >= 0 and $position <= $locations[ $i ] ) {
		0	0
		0
119
120	2					4	$location = $locations[ $i ];
121	2					3	last;
122
123							}
124	0					0	elsif ( $position > $locations[ $i ] and $position <= $locations[ $i + 1 ] ) { $location = $locations[ $i + 1 ] }
125	0					0	elsif ( $position <= 100 ) { $location = 100 }
126
127							}
128
129							# increment
130	2					6	$map{ $location }++;
131
132							}
133
134							# done
135	1					3	return \%map;
136
137							}
138
139
140							sub positions {
141
142	2			2	1	3	my ( $self ) = @_;
143	2					4	my @p = ();
144	2					5	my $query = $self->{ query };
145	2					5	my $text = $self->{ text };
146	2					23	while ( $text =~ m/$query/gi ) { push @p, pos $text }
	4					45
147	2					10	return @p;
148
149							}
150
151
152							sub radius {
153
154							# get input; check & set; return
155	46			46	1	50	my ( $self, $radius ) = @_;
156	46	100				86	if ( $radius ) { $self->{ radius } = $radius }
	2					4
157	46					101	return $self->{ radius };
158
159							}
160
161
162							sub ordinal {
163
164							# get input; check & set; return
165	30			30	1	1009	my ( $self, $ordinal ) = @_;
166	30	100				59	if ( $ordinal ) { $self->{ ordinal } = $ordinal }
	9					15
167	30					103	return $self->{ ordinal };
168
169							}
170
171
172							sub sort {
173
174							# get input; check & set; return
175	28			28	1	1581	my ( $self, $sort ) = @_;
176	28	100				57	if ( $sort ) { $self->{ sort } = $sort }
	8					14
177	28					86	return $self->{ sort };
178
179							}
180
181
182							sub lines {
183
184							# get input
185	7			7	1	22	my ( $self ) = shift;
186
187							# declare
188	7					12	my @lines = ();
189	7					6	my @sorted_lines = ();
190
191							# define
192	7					15	my $text = $self->text;
193	7					12	my $query = $self->query;
194	7					12	my $radius = $self->radius;
195	7					14	my $width = 2 * $self->radius;
196	7					11	my $ordinal = $self->ordinal;
197
198							# cheat; because $1, below, is not defined at compile time?
199	1			1		10	no warnings;
	1					2
	1					868
200
201							# gete the matching lines
202	7					49	while ( $text =~ /$query/gi ) {
203
204	14					20	my $match = $1;
205	14					15	my $pos = pos( $text );
206	14					25	my $start = $pos - $self->radius - length( $match );
207	14					16	my $extract = '';
208
209	14	50				33	if ( $start < 0 ) {
210
211	0					0	$extract = substr( $text, 0, $width + $start + length( $match ));
212	0					0	$extract = ( " " x -$start ) . $extract;
213
214							}
215
216							else {
217
218	14					31	$extract = substr( $text, $start, $width + length( $match ));
219	14					18	my $deficit = $width + length( $match ) - length( $extract );
220	14	50				28	if ( $deficit > 0 ) { $extract .= ( " " x $deficit ) }
	0					0
221
222							}
223
224	14					223	push @lines, $extract;
225
226							}
227
228							# brach according to sorting preference
229	7	100				14	if ( $self->sort eq 'left' ) {
		100
		100
230
231	2					7	foreach ( sort { _by_left( $self, $a, $b ) } @lines ) { push @sorted_lines, $_ }
	2					5
	4					7
232
233							}
234
235							elsif ( $self->sort eq 'right' ) {
236
237	2					6	foreach ( sort { _by_right( $self, $a, $b ) } @lines ) { push @sorted_lines, $_ }
	2					4
	4					8
238
239							}
240
241							elsif ( $self->sort eq 'match' ) {
242
243	1					3	foreach ( sort { _by_match( $self, $a, $b ) } @lines ) { push @sorted_lines, $_ }
	1					3
	2					3
244
245							}
246
247	2					3	else { @sorted_lines = @lines }
248
249							# done
250	7					33	return @sorted_lines;
251
252							}
253
254
255							sub _by_left {
256
257							# get input; find left word, compare, return
258	2			2		3	my ( $self, $a, $b ) = @_;
259	2					5	return lc( _on_left( $self, $a )) cmp lc( _on_left( $self, $b ));
260
261							}
262
263
264							sub _on_left {
265
266							# get input; remove punctuation; get left string; split; return ordinal word
267	6			6		8	my ( $self, $s ) = @_;
268	6					11	my @words = split( /\s+/, &_remove_punctuation( $self, substr( $s, 0, $self->radius )));
269	6					14	return $words[ scalar( @words ) - $self->ordinal - 1 ];
270
271							}
272
273
274							sub _remove_punctuation {
275
276	13			13		25	my ( $self, $s ) = @_;
277	13					20	$s = lc( $s );
278	13					40	$s =~ s/[^-a-z ]//g;
279	13					18	$s =~ s/--+/ /g;
280	13					15	$s =~ s/-//g;
281	13					66	$s =~ s/\s+/ /g;
282	13					52	return $s;
283
284							}
285
286
287							sub _by_right {
288
289							# get input; find right word, compare, return
290	2			2		3	my ( $self, $a, $b ) = @_;
291	2					5	return lc( _on_right( $self, $a )) cmp lc( _on_right( $self, $b ));
292
293							}
294
295
296							sub _on_right {
297
298							# get input; remove punctuation; get right string; split; return ordinal word
299	6			6		7	my ( $self, $s ) = @_;
300	6					15	my @words = split( /\s+/, &_remove_punctuation( $self, substr( $s, -$self->radius )));
301	6					17	return $words[ $self->ordinal ];
302
303							}
304
305
306							sub _by_match {
307
308	1			1		2	my ( $self, $a, $b ) = @_;
309	1					3	return substr( $a, length( $a ) - $self->radius ) cmp substr( $b, length( $b ) - $self->radius );
310
311							}
312
313
314							=head1 NAME
315
316							Lingua::Concordance - Keyword-in-context (KWIC) search interface
317
318
319							=head1 SYNOPSIS
320
321							# require
322							use Lingua::Concordance;
323
324							# initialize
325							$concordance = Lingua::Concordance->new;
326							$concordance->text( 'A long time ago, in a galaxy far far away...' );
327							$concordance->query( 'far' );
328
329							# do the work
330							foreach ( $concordance->lines ) { print "$_\n" }
331
332							# modify the query and map (graph) it
333							$concordance->query( 'i' );
334							$map = $concordance->map;
335							foreach ( sort { $a <=> $b } keys %map ) { print "$_\t", $$map{ $_ }, "\n" }
336
337
338							=head1 DESCRIPTION
339
340							Given a scalar (such as the content of a plain text electronic book or journal article) and a regular expression, this module implements a simple keyword-in-context (KWIC) search interface -- a concordance. Its purpuse is two-fold. First, it is intended to return lists of lines from a text containing the given expression. Second, it is intended to map the general location in the text where the expression appears. For example, the first half of the text, the last third, the third quarter, etc. See the Discussion section, below, for more detail.
341
342
343							=head1 METHODS
344
345
346							=head2 new
347
348							Create a new, empty concordance object:
349
350							$concordance = Lingua::Concordance->new;
351
352
353							=head2 text
354
355							Set or get the value of the concordance's text attribute where the input is expected to be a scalar containing some large amount of content, like an electronic book or journal article:
356
357							# set text attribute
358							$concordance->text( 'Call me Ishmael. Some years ago- never mind how long...' );
359
360							# get the text attribute
361							$text = $concordance->text;
362
363							Note: The scalar passed to this method gets internally normalized, specifically, all carriage returns are changed to spaces, and multiple spaces are changed to single spaces.
364
365
366							=head2 query
367
368							Set or get the value of the concordance's query attribute. The input is expected to be a regular expression but a simple word or phrase will work just fine:
369
370							# set query attribute
371							$concordance->query( 'Ishmael' );
372
373							# get query attribute
374							$query = $concordance->query;
375
376							See the Discussion section, below, for ways to make the most of this method through the use of powerful regular expressions. This is where the fun it.
377
378
379							=head2 radius
380
381							Set or get the length of each line returned from the lines method, below. Each line will be padded on the left and the right of the query with the number of characters necessary to equal the value of radius. This makes it easier to sort the lines:
382
383							# set radius attribute
384							$concordance->radius( $integer );
385
386							# get radius attribute
387							$integer = $concordance->query;
388
389							For terminal-based applications it is usually not reasonable to set this value to greater than 30. Web-based applications can use arbitrarily large numbers. The internally set default value is 20.
390
391
392							=head2 sort
393
394							Set or get the type of line sorting:
395
396							# set sort attribute
397							$concordance->sort( 'left' );
398
399							# get sort attribute
400							$sort = $concordance->sort;
401
402							Valid values include:
403
404							=over
405
406							* none - the default value; sorts lines in the order they appear in the text -- no sorting
407
408							* left - sorts lines by the (ordinal) word to the left of the query, as defined the ordinal method, below
409
410							* right - sorts lines by the (ordinal) word to the right of the query, as defined the ordinal method, below
411
412							* match - sorts lines by the value of the query (mostly)
413
414							=back
415
416							This is good for looking for patterns in texts, such as collocations (phrases, bi-grams, and n-grams). Again, see the Discussion section for hints.
417
418
419							=head2 ordinal
420
421							Set or get the number of words to the left or right of the query to be used for sorting purposes. The internally set default value is 1:
422
423							# set ordinal attribute
424							$concordance->ordinal( 2 );
425
426							# get ordinal attribute
427							$integer = $concordance->ordinal;
428
429							Used in combination with the sort method, above, this is good for looking for textual patterns. See the Discussion section for more information.
430
431
432							=head2 lines
433
434							Return a list of lines from the text matching the query. Our reason de existance:
435
436							# get the line containing the query
437							@lines = $concordance->lines;
438
439
440							=head2 positions
441
442							Return an array of integers representing the locations (offsets) of the query in the text.
443
444							# get positions of queries in a text
445							@positions = $concordance->positions;
446
447
448							=head2 scale
449
450							Set or get the scale for mapping query locations to positions between 0 and 100.
451
452							# set the scale
453							$concordance->scale( 5 );
454
455							# get the scale
456							$scale = $concordance->scale;
457
458							This number is used to initialize a scale from 0 to 100 where scale is a percentage of the whole text. A value of 2 will divide the text into two halves. A value of 3 will divide the text into three parts, all equal to 33% of the text's length. A value of 5 will create five equal parts all equal to 20% of the text.
459
460							The default is 10.
461
462
463							=head2 map
464
465							Returns a reference to a hash where the keys are integers representing locations on a scale between 0 and 100, inclusive. The values are the number of matched queries located at that position.
466
467							# map the query to percentages of the text
468							$map = $concordance->map;
469
470							# list the sections of the text where the query appears
471							foreach ( sort { $a <=> $b } keys %map ) { print "$_\t", $$map{ $_ }, "\n" }
472
473							The output of this method is intended to facilitate the graphing of matched queries on a bar chart where the hash's keys represent ranges along the X-axis and the values represent points up and down the Y-axis. The script in this distribution named bin/concordance.pl illustrates how to do this with a Perl module caled Text::BarGraph.
474
475
476							=head1 DISCUSSION
477
478							[Elaborate upon a number of things here such as but not limited to: 1) the history of concordances and concordance systems, 2) the usefulness of concordances in the study of literature, 3) how to expoit regular expressions to get the most out of a text and finding interesting snipettes, and 4) how the module might be implemented in scripts and programs.]
479
480
481							=head1 BUGS
482
483							The internal _by_match subroutine, the one used to sort results by the matching regular expression, does not work exactly as expected. Instead of sorting by the matching regular expression, it sorts by the string exactly to the right of the matched regular expression. Consquently, for queries such as 'human', it correctly matches and sorts on human, humanity, and humans, but matches such as Humanity do not necessarily come before humanity.
484
485
486							=head1 TODO
487
488							=over
489
490							* Write Discussion section.
491
492							* Implement error checking.
493
494							* Fix the _by_match bug.
495
496							* Enable all of the configuration methods (text, query, radius, sort, and ordinal) to be specified in the constructor.
497
498							* Require the text and query attributes to be specified as a part of the constructor, maybe.
499
500							* Remove line-feed characters while normalizing text to accomdate Windows-based text streams, maybe.
501
502							* Write an example CGI script, to accompany the distribution's terminal-based script, demonstrating how the module can be implemented in a Web interface.
503
504							* Write a full-featured terminal-based script enhancing the one found in the distribution.
505
506							=back
507
508							=head1 CHANGES
509
510							=over
511
512							* June 9, 2009 - initial release
513
514							* August 29, 2010 - added the postions, scale, and map methods
515
516							* October 31, 2010 - removed syntax error from synopsis ("Thank you, Pankaj Mehra.")
517
518							=back
519
520							=head1 ACKNOWLEDGEMENTS
521
522							The module implementes, almost verbatim, the concordance programs and subroutines described in Bilisoly, R. (2008). Practical text mining with Perl. Wiley series on methods and applications in data mining. Hoboken, N.J.: Wiley. pgs: 169-185. "Thanks Roger. I couldn't have done it without your book!"
523
524
525							=head1 AUTHOR
526
527							Eric Lease Morgan
528
529							=cut
530
531							# return true or die
532							1;