File Coverage

blib/lib/Lingua/Concordance.pm
Criterion Covered Total %
statement 134 139 96.4
branch 21 28 75.0
condition 1 6 16.6
subroutine 24 24 100.0
pod 10 10 100.0
total 190 207 91.7


line stmt bran cond sub pod time code
1             package Lingua::Concordance;
2              
3             # Concordance.pm - keyword-in-context (KWIC) search interface
4              
5             # Eric Lease Morgan
6             # June 7, 2009 - first investigations
7             # June 8, 2009 - tweaked _by_match; still doesn't work quite right
8             # August 29, 2010 - added scale, positions, and map methods
9             # August 31, 2010 - removed \r & \t from input, duh!
10             # October 31, 2010 - removed syntax error from synopsis
11              
12              
13             # configure defaults
14 1     1   892 use constant RADIUS => 20;
  1         2  
  1         70  
15 1     1   6 use constant SORT => 'none';
  1         2  
  1         38  
16 1     1   21 use constant ORDINAL => 1;
  1         1  
  1         42  
17 1     1   7 use constant SCALE => 10;
  1         1  
  1         37  
18              
19             # include
20 1     1   5 use strict;
  1         1  
  1         29  
21 1     1   4 use warnings;
  1         2  
  1         35  
22 1     1   719 use Math::Round;
  1         11613  
  1         871  
23              
24             # define
25             our $VERSION = '0.04';
26              
27              
28             sub new {
29              
30             # get input
31 1     1 1 775 my ( $class ) = @_;
32            
33             # initalize
34 1         2 my $self = {};
35            
36             # set defaults
37 1         4 $self->{ radius } = RADIUS;
38 1         2 $self->{ sort } = SORT;
39 1         2 $self->{ ordinal } = ORDINAL;
40 1         2 $self->{ scale } = SCALE;
41            
42             # return
43 1         3 return bless $self, $class;
44            
45             }
46              
47              
48             sub text {
49              
50             # get input
51 10     10 1 15 my ( $self, $text ) = @_;
52            
53             # check...
54 10 100       21 if ( $text ) {
55            
56             # clean...
57 1         9 $text =~ s/\t/ /g;
58 1         8 $text =~ s/\r/ /g;
59 1         21 $text =~ s/\n/ /g;
60 1         241 $text =~ s/ +/ /g;
61 1         4 $text =~ s/\b--\b/ -- /g;
62            
63             # set
64 1         3 $self->{ text } = $text;
65            
66             }
67            
68             # return
69 10         25 return $self->{ text };
70            
71             }
72              
73              
74             sub query {
75              
76             # get input; check & set; return
77 10     10 1 13 my ( $self, $query ) = @_;
78 10 100       22 if ( $query ) { $self->{ query } = $query }
  2         3  
79 10         23 return $self->{ query };
80            
81             }
82              
83              
84             sub scale {
85              
86             # get input; check & set; return
87 4     4 1 393 my ( $self, $scale ) = @_;
88 4 100       11 if ( $scale ) { $self->{ scale } = $scale }
  1         2  
89 4         12 return $self->{ scale };
90            
91             }
92              
93              
94             sub map {
95              
96             # get input
97 1     1 1 503 my ( $self ) = shift;
98              
99             # initialize
100 1         3 my %map = ();
101 1         3 my $scale = $self->scale;
102 1         3 my @positions = $self->positions;
103 1         3 my $length = length( $self->text );
104            
105 1         6 for ( my $i = 1; $i <= $scale; $i++ ) { $map{ round(( $i * ( 100 / $scale ))) } = 0 }
  5         90  
106 1         16 my @locations = sort { $a <=> $b } keys %map;
  9         13  
107            
108             # process each position
109 1         3 foreach ( @positions ) {
110            
111             # calculate postion as a percentage of length
112 2         8 my $position = round(( $_ * 100 ) / $length );
113            
114             # map the position to a location on the scale; this is the cool part
115 2         16 my $location = 0;
116 2         9 for ( my $i = 0; $i <= $#locations; $i++ ) {
117            
118 2 50 33     14 if ( $position >= 0 and $position <= $locations[ $i ] ) {
    0 0        
    0          
119            
120 2         4 $location = $locations[ $i ];
121 2         3 last;
122            
123             }
124 0         0 elsif ( $position > $locations[ $i ] and $position <= $locations[ $i + 1 ] ) { $location = $locations[ $i + 1 ] }
125 0         0 elsif ( $position <= 100 ) { $location = 100 }
126            
127             }
128            
129             # increment
130 2         6 $map{ $location }++;
131            
132             }
133              
134             # done
135 1         3 return \%map;
136            
137             }
138              
139              
140             sub positions {
141              
142 2     2 1 3 my ( $self ) = @_;
143 2         4 my @p = ();
144 2         5 my $query = $self->{ query };
145 2         5 my $text = $self->{ text };
146 2         23 while ( $text =~ m/$query/gi ) { push @p, pos $text }
  4         45  
147 2         10 return @p;
148              
149             }
150              
151              
152             sub radius {
153              
154             # get input; check & set; return
155 46     46 1 50 my ( $self, $radius ) = @_;
156 46 100       86 if ( $radius ) { $self->{ radius } = $radius }
  2         4  
157 46         101 return $self->{ radius };
158            
159             }
160              
161              
162             sub ordinal {
163              
164             # get input; check & set; return
165 30     30 1 1009 my ( $self, $ordinal ) = @_;
166 30 100       59 if ( $ordinal ) { $self->{ ordinal } = $ordinal }
  9         15  
167 30         103 return $self->{ ordinal };
168            
169             }
170              
171              
172             sub sort {
173              
174             # get input; check & set; return
175 28     28 1 1581 my ( $self, $sort ) = @_;
176 28 100       57 if ( $sort ) { $self->{ sort } = $sort }
  8         14  
177 28         86 return $self->{ sort };
178            
179             }
180              
181              
182             sub lines {
183              
184             # get input
185 7     7 1 22 my ( $self ) = shift;
186            
187             # declare
188 7         12 my @lines = ();
189 7         6 my @sorted_lines = ();
190            
191             # define
192 7         15 my $text = $self->text;
193 7         12 my $query = $self->query;
194 7         12 my $radius = $self->radius;
195 7         14 my $width = 2 * $self->radius;
196 7         11 my $ordinal = $self->ordinal;
197            
198             # cheat; because $1, below, is not defined at compile time?
199 1     1   10 no warnings;
  1         2  
  1         868  
200              
201             # gete the matching lines
202 7         49 while ( $text =~ /$query/gi ) {
203            
204 14         20 my $match = $1;
205 14         15 my $pos = pos( $text );
206 14         25 my $start = $pos - $self->radius - length( $match );
207 14         16 my $extract = '';
208            
209 14 50       33 if ( $start < 0 ) {
210            
211 0         0 $extract = substr( $text, 0, $width + $start + length( $match ));
212 0         0 $extract = ( " " x -$start ) . $extract;
213            
214             }
215            
216             else {
217            
218 14         31 $extract = substr( $text, $start, $width + length( $match ));
219 14         18 my $deficit = $width + length( $match ) - length( $extract );
220 14 50       28 if ( $deficit > 0 ) { $extract .= ( " " x $deficit ) }
  0         0  
221            
222             }
223            
224 14         223 push @lines, $extract;
225            
226             }
227            
228             # brach according to sorting preference
229 7 100       14 if ( $self->sort eq 'left' ) {
    100          
    100          
230            
231 2         7 foreach ( sort { _by_left( $self, $a, $b ) } @lines ) { push @sorted_lines, $_ }
  2         5  
  4         7  
232              
233             }
234            
235             elsif ( $self->sort eq 'right' ) {
236            
237 2         6 foreach ( sort { _by_right( $self, $a, $b ) } @lines ) { push @sorted_lines, $_ }
  2         4  
  4         8  
238              
239             }
240            
241             elsif ( $self->sort eq 'match' ) {
242            
243 1         3 foreach ( sort { _by_match( $self, $a, $b ) } @lines ) { push @sorted_lines, $_ }
  1         3  
  2         3  
244              
245             }
246            
247 2         3 else { @sorted_lines = @lines }
248            
249             # done
250 7         33 return @sorted_lines;
251            
252             }
253              
254              
255             sub _by_left {
256              
257             # get input; find left word, compare, return
258 2     2   3 my ( $self, $a, $b ) = @_;
259 2         5 return lc( _on_left( $self, $a )) cmp lc( _on_left( $self, $b ));
260            
261             }
262              
263              
264             sub _on_left {
265              
266             # get input; remove punctuation; get left string; split; return ordinal word
267 6     6   8 my ( $self, $s ) = @_;
268 6         11 my @words = split( /\s+/, &_remove_punctuation( $self, substr( $s, 0, $self->radius )));
269 6         14 return $words[ scalar( @words ) - $self->ordinal - 1 ];
270              
271             }
272              
273              
274             sub _remove_punctuation {
275            
276 13     13   25 my ( $self, $s ) = @_;
277 13         20 $s = lc( $s );
278 13         40 $s =~ s/[^-a-z ]//g;
279 13         18 $s =~ s/--+/ /g;
280 13         15 $s =~ s/-//g;
281 13         66 $s =~ s/\s+/ /g;
282 13         52 return $s;
283              
284             }
285              
286              
287             sub _by_right {
288              
289             # get input; find right word, compare, return
290 2     2   3 my ( $self, $a, $b ) = @_;
291 2         5 return lc( _on_right( $self, $a )) cmp lc( _on_right( $self, $b ));
292            
293             }
294              
295              
296             sub _on_right {
297              
298             # get input; remove punctuation; get right string; split; return ordinal word
299 6     6   7 my ( $self, $s ) = @_;
300 6         15 my @words = split( /\s+/, &_remove_punctuation( $self, substr( $s, -$self->radius )));
301 6         17 return $words[ $self->ordinal ];
302              
303             }
304              
305              
306             sub _by_match {
307              
308 1     1   2 my ( $self, $a, $b ) = @_;
309 1         3 return substr( $a, length( $a ) - $self->radius ) cmp substr( $b, length( $b ) - $self->radius );
310            
311             }
312              
313              
314             =head1 NAME
315              
316             Lingua::Concordance - Keyword-in-context (KWIC) search interface
317              
318              
319             =head1 SYNOPSIS
320              
321             # require
322             use Lingua::Concordance;
323              
324             # initialize
325             $concordance = Lingua::Concordance->new;
326             $concordance->text( 'A long time ago, in a galaxy far far away...' );
327             $concordance->query( 'far' );
328              
329             # do the work
330             foreach ( $concordance->lines ) { print "$_\n" }
331              
332             # modify the query and map (graph) it
333             $concordance->query( 'i' );
334             $map = $concordance->map;
335             foreach ( sort { $a <=> $b } keys %map ) { print "$_\t", $$map{ $_ }, "\n" }
336              
337              
338             =head1 DESCRIPTION
339              
340             Given a scalar (such as the content of a plain text electronic book or journal article) and a regular expression, this module implements a simple keyword-in-context (KWIC) search interface -- a concordance. Its purpuse is two-fold. First, it is intended to return lists of lines from a text containing the given expression. Second, it is intended to map the general location in the text where the expression appears. For example, the first half of the text, the last third, the third quarter, etc. See the Discussion section, below, for more detail.
341              
342              
343             =head1 METHODS
344              
345              
346             =head2 new
347              
348             Create a new, empty concordance object:
349              
350             $concordance = Lingua::Concordance->new;
351              
352              
353             =head2 text
354              
355             Set or get the value of the concordance's text attribute where the input is expected to be a scalar containing some large amount of content, like an electronic book or journal article:
356              
357             # set text attribute
358             $concordance->text( 'Call me Ishmael. Some years ago- never mind how long...' );
359              
360             # get the text attribute
361             $text = $concordance->text;
362              
363             Note: The scalar passed to this method gets internally normalized, specifically, all carriage returns are changed to spaces, and multiple spaces are changed to single spaces.
364              
365              
366             =head2 query
367              
368             Set or get the value of the concordance's query attribute. The input is expected to be a regular expression but a simple word or phrase will work just fine:
369              
370             # set query attribute
371             $concordance->query( 'Ishmael' );
372              
373             # get query attribute
374             $query = $concordance->query;
375              
376             See the Discussion section, below, for ways to make the most of this method through the use of powerful regular expressions. This is where the fun it.
377              
378              
379             =head2 radius
380              
381             Set or get the length of each line returned from the lines method, below. Each line will be padded on the left and the right of the query with the number of characters necessary to equal the value of radius. This makes it easier to sort the lines:
382              
383             # set radius attribute
384             $concordance->radius( $integer );
385              
386             # get radius attribute
387             $integer = $concordance->query;
388            
389             For terminal-based applications it is usually not reasonable to set this value to greater than 30. Web-based applications can use arbitrarily large numbers. The internally set default value is 20.
390              
391              
392             =head2 sort
393              
394             Set or get the type of line sorting:
395              
396             # set sort attribute
397             $concordance->sort( 'left' );
398              
399             # get sort attribute
400             $sort = $concordance->sort;
401            
402             Valid values include:
403              
404             =over
405              
406             * none - the default value; sorts lines in the order they appear in the text -- no sorting
407              
408             * left - sorts lines by the (ordinal) word to the left of the query, as defined the ordinal method, below
409              
410             * right - sorts lines by the (ordinal) word to the right of the query, as defined the ordinal method, below
411              
412             * match - sorts lines by the value of the query (mostly)
413              
414             =back
415              
416             This is good for looking for patterns in texts, such as collocations (phrases, bi-grams, and n-grams). Again, see the Discussion section for hints.
417              
418              
419             =head2 ordinal
420              
421             Set or get the number of words to the left or right of the query to be used for sorting purposes. The internally set default value is 1:
422              
423             # set ordinal attribute
424             $concordance->ordinal( 2 );
425              
426             # get ordinal attribute
427             $integer = $concordance->ordinal;
428              
429             Used in combination with the sort method, above, this is good for looking for textual patterns. See the Discussion section for more information.
430              
431              
432             =head2 lines
433              
434             Return a list of lines from the text matching the query. Our reason de existance:
435              
436             # get the line containing the query
437             @lines = $concordance->lines;
438              
439              
440             =head2 positions
441              
442             Return an array of integers representing the locations (offsets) of the query in the text.
443              
444             # get positions of queries in a text
445             @positions = $concordance->positions;
446              
447              
448             =head2 scale
449              
450             Set or get the scale for mapping query locations to positions between 0 and 100.
451              
452             # set the scale
453             $concordance->scale( 5 );
454              
455             # get the scale
456             $scale = $concordance->scale;
457              
458             This number is used to initialize a scale from 0 to 100 where scale is a percentage of the whole text. A value of 2 will divide the text into two halves. A value of 3 will divide the text into three parts, all equal to 33% of the text's length. A value of 5 will create five equal parts all equal to 20% of the text.
459              
460             The default is 10.
461              
462              
463             =head2 map
464              
465             Returns a reference to a hash where the keys are integers representing locations on a scale between 0 and 100, inclusive. The values are the number of matched queries located at that position.
466              
467             # map the query to percentages of the text
468             $map = $concordance->map;
469              
470             # list the sections of the text where the query appears
471             foreach ( sort { $a <=> $b } keys %map ) { print "$_\t", $$map{ $_ }, "\n" }
472              
473             The output of this method is intended to facilitate the graphing of matched queries on a bar chart where the hash's keys represent ranges along the X-axis and the values represent points up and down the Y-axis. The script in this distribution named bin/concordance.pl illustrates how to do this with a Perl module caled Text::BarGraph.
474              
475              
476             =head1 DISCUSSION
477              
478             [Elaborate upon a number of things here such as but not limited to: 1) the history of concordances and concordance systems, 2) the usefulness of concordances in the study of literature, 3) how to expoit regular expressions to get the most out of a text and finding interesting snipettes, and 4) how the module might be implemented in scripts and programs.]
479              
480              
481             =head1 BUGS
482              
483             The internal _by_match subroutine, the one used to sort results by the matching regular expression, does not work exactly as expected. Instead of sorting by the matching regular expression, it sorts by the string exactly to the right of the matched regular expression. Consquently, for queries such as 'human', it correctly matches and sorts on human, humanity, and humans, but matches such as Humanity do not necessarily come before humanity.
484              
485              
486             =head1 TODO
487              
488             =over
489              
490             * Write Discussion section.
491              
492             * Implement error checking.
493              
494             * Fix the _by_match bug.
495              
496             * Enable all of the configuration methods (text, query, radius, sort, and ordinal) to be specified in the constructor.
497              
498             * Require the text and query attributes to be specified as a part of the constructor, maybe.
499              
500             * Remove line-feed characters while normalizing text to accomdate Windows-based text streams, maybe.
501              
502             * Write an example CGI script, to accompany the distribution's terminal-based script, demonstrating how the module can be implemented in a Web interface.
503              
504             * Write a full-featured terminal-based script enhancing the one found in the distribution.
505              
506             =back
507              
508             =head1 CHANGES
509              
510             =over
511              
512             * June 9, 2009 - initial release
513              
514             * August 29, 2010 - added the postions, scale, and map methods
515              
516             * October 31, 2010 - removed syntax error from synopsis ("Thank you, Pankaj Mehra.")
517              
518             =back
519              
520             =head1 ACKNOWLEDGEMENTS
521              
522             The module implementes, almost verbatim, the concordance programs and subroutines described in Bilisoly, R. (2008). Practical text mining with Perl. Wiley series on methods and applications in data mining. Hoboken, N.J.: Wiley. pgs: 169-185. "Thanks Roger. I couldn't have done it without your book!"
523              
524              
525             =head1 AUTHOR
526              
527             Eric Lease Morgan
528              
529             =cut
530              
531             # return true or die
532             1;