line |
stmt |
bran |
cond |
sub |
pod |
time |
code |
1
|
|
|
|
|
|
|
package Text::Perfide::WordBags; |
2
|
|
|
|
|
|
|
|
3
|
1
|
|
|
1
|
|
30757
|
use warnings; |
|
1
|
|
|
|
|
3
|
|
|
1
|
|
|
|
|
38
|
|
4
|
1
|
|
|
1
|
|
6
|
use strict; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
36
|
|
5
|
1
|
|
|
1
|
|
879
|
use utf8::all; |
|
1
|
|
|
|
|
68720
|
|
|
1
|
|
|
|
|
7
|
|
6
|
|
|
|
|
|
|
|
7
|
1
|
|
|
1
|
|
3503
|
use base 'Exporter'; |
|
1
|
|
|
|
|
2
|
|
|
1
|
|
|
|
|
660
|
|
8
|
|
|
|
|
|
|
our @EXPORT = (qw/pairability file2bag bagint baguni bagcard/); |
9
|
|
|
|
|
|
|
|
10
|
|
|
|
|
|
|
|
11
|
|
|
|
|
|
|
=head1 NAME |
12
|
|
|
|
|
|
|
|
13
|
|
|
|
|
|
|
Text::Perfide::WordBags - Create word bags from text, and operate over them. |
14
|
|
|
|
|
|
|
|
15
|
|
|
|
|
|
|
=head1 VERSION |
16
|
|
|
|
|
|
|
|
17
|
|
|
|
|
|
|
Version 0.01_02 |
18
|
|
|
|
|
|
|
|
19
|
|
|
|
|
|
|
=cut |
20
|
|
|
|
|
|
|
|
21
|
|
|
|
|
|
|
our $VERSION = '0.01_02'; |
22
|
|
|
|
|
|
|
|
23
|
|
|
|
|
|
|
=head1 SYNOPSIS |
24
|
|
|
|
|
|
|
|
25
|
|
|
|
|
|
|
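Builds word bags (hash references mapping each word to its number of occurrences) and provides intersection, union, cardinality and similarity operations over them. |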
26
|
|
|
|
|
|
|
|
27
|
|
|
|
|
|
|
Perhaps a little code snippet. |
28
|
|
|
|
|
|
|
|
29
|
|
|
|
|
|
|
use Text::Perfide::WordBags; |
30
|
|
|
|
|
|
|
|
31
|
|
|
|
|
|
|
my $score = pairability($bag1, $bag2);   # $bag1, $bag2: hash refs of word => count |
32
|
|
|
|
|
|
|
... |
33
|
|
|
|
|
|
|
|
34
|
|
|
|
|
|
|
=head1 EXPORT |
35
|
|
|
|
|
|
|
|
36
|
|
|
|
|
|
|
The following functions are exported by default: C<pairability>, C<file2bag>, |
C<bagint>, C<baguni> and C<bagcard>. |
38
|
|
|
|
|
|
|
|
39
|
|
|
|
|
|
|
=head1 FUNCTIONS |
40
|
|
|
|
|
|
|
|
41
|
|
|
|
|
|
|
=head2 pairability |
42
|
|
|
|
|
|
|
|
43
|
|
|
|
|
|
|
Calculates the pairability of two wordbags. The pairability value is given by: |
44
|
|
|
|
|
|
|
|
45
|
|
|
|
|
|
|
$int/($uni || 1) |
46
|
|
|
|
|
|
|
|
47
|
|
|
|
|
|
|
where $int and $uni are the values given by, respectively, the intersection |
48
|
|
|
|
|
|
|
and the union of the two bags. |
49
|
|
|
|
|
|
|
|
50
|
|
|
|
|
|
|
=cut |
51
|
|
|
|
|
|
|
|
52
|
|
|
|
|
|
|
# pairability($bag1, $bag2)
#
# Similarity score of two word bags: the cardinality of their intersection
# divided by the cardinality of their union.  When the union's cardinality
# is zero (or otherwise false) the divisor falls back to 1, so the division
# is always defined.
sub pairability {
    my ($bag1, $bag2) = @_;

    my $shared   = bagcard( bagint($bag1, $bag2) );
    my $combined = bagcard( baguni($bag1, $bag2) ) || 1;

    return $shared / $combined;
}
58
|
|
|
|
|
|
|
|
59
|
|
|
|
|
|
|
=head2 file2bag |
60
|
|
|
|
|
|
|
|
61
|
|
|
|
|
|
|
Create a word bag from a file. |
62
|
|
|
|
|
|
|
|
63
|
|
|
|
|
|
|
Receives as argument a function ref and a file path. |
64
|
|
|
|
|
|
|
|
65
|
|
|
|
|
|
|
Reads a file in slurp mode, passes the text to the function passed as argument, |
66
|
|
|
|
|
|
|
and returns the result. |
67
|
|
|
|
|
|
|
|
68
|
|
|
|
|
|
|
=cut |
69
|
|
|
|
|
|
|
|
70
|
|
|
|
|
|
|
# file2bag($txt2bag_fn, $file)
#
# Reads the whole of $file through the ':utf8' layer and passes the text to
# the code reference $txt2bag_fn, returning whatever that callback returns
# (the callback is invoked in the caller's context).
#
# Dies with the operating-system error ($!) appended to the message when
# the file cannot be opened.
sub file2bag {
    my ($txt2bag_fn, $file) = @_;

    open my $fp, '<:utf8', $file
        or die "Could not open file '$file': $!";

    # Slurp mode: undefine the input record separator for this read only.
    # The "// ''" keeps the text a defined (empty) string when nothing
    # could be read.
    my $text = do { local $/; <$fp> } // '';

    # Release the handle before handing control to the callback.
    close $fp;

    return $txt2bag_fn->($text);
}
76
|
|
|
|
|
|
|
|
77
|
|
|
|
|
|
|
# sub txt2bag_pn{ # proper nouns |
78
|
|
|
|
|
|
|
# my $text = shift; |
79
|
|
|
|
|
|
|
# |
80
|
|
|
|
|
|
|
# my $upper = {}; |
81
|
|
|
|
|
|
|
# my $uppat = qr{\b[A-Z]\w{3,}(?:['-]\w+)*\b}; |
82
|
|
|
|
|
|
|
# $upper->{$1}++ while($text =~ /($uppat)/g); |
83
|
|
|
|
|
|
|
# |
84
|
|
|
|
|
|
|
# my $lower = {}; |
85
|
|
|
|
|
|
|
# my $lwpat = qr{\b[a-z]+(?:['-][a-z]+)*\b}; |
86
|
|
|
|
|
|
|
# $lower->{$1}++ while($text =~ /($lwpat)/g); |
87
|
|
|
|
|
|
|
# |
88
|
|
|
|
|
|
|
# foreach my $k (keys %$upper){ |
89
|
|
|
|
|
|
|
# if($lower->{lc $k}){ |
90
|
|
|
|
|
|
|
# my $ratio = $upper->{$k}/$lower->{lc $k}; |
91
|
|
|
|
|
|
|
# delete $upper->{$k} if $ratio < 10; |
92
|
|
|
|
|
|
|
# } |
93
|
|
|
|
|
|
|
# } |
94
|
|
|
|
|
|
|
# return $upper; |
95
|
|
|
|
|
|
|
# } |
96
|
|
|
|
|
|
|
# |
97
|
|
|
|
|
|
|
# sub txt2bag_num{ |
98
|
|
|
|
|
|
|
# my $text = shift; |
99
|
|
|
|
|
|
|
# my $bag = {}; |
100
|
|
|
|
|
|
|
# my $pecul = qr{\d+}; |
101
|
|
|
|
|
|
|
# $bag->{$1}++ while($text =~ /($pecul)/g); |
102
|
|
|
|
|
|
|
# if(haspn($bag)){ |
103
|
|
|
|
|
|
|
# foreach(1..300){ |
104
|
|
|
|
|
|
|
# $bag->{$_}-- if $bag->{$_}; |
105
|
|
|
|
|
|
|
# delete $bag->{$_} unless $bag->{$_}; |
106
|
|
|
|
|
|
|
# } |
107
|
|
|
|
|
|
|
# } |
108
|
|
|
|
|
|
|
# return $bag; |
109
|
|
|
|
|
|
|
# } |
110
|
|
|
|
|
|
|
|
111
|
|
|
|
|
|
|
=head2 bagint |
112
|
|
|
|
|
|
|
|
113
|
|
|
|
|
|
|
Calculates the intersection between two wordbags. |
114
|
|
|
|
|
|
|
|
115
|
|
|
|
|
|
|
=cut |
116
|
|
|
|
|
|
|
|
117
|
|
|
|
|
|
|
# bagint($bag1, $bag2)
#
# Intersection of two word bags.  Returns a new hash reference holding, for
# every key of $bag1 whose count in $bag2 is true, the smaller of the two
# counts.  Neither input bag is modified.
sub bagint {
    my ($bag1, $bag2) = @_;

    my %common;
    for my $word (keys %$bag1) {
        my $count2 = $bag2->{$word};
        next if !$count2;    # absent from (or zero in) the second bag

        my $count1 = $bag1->{$word};
        $common{$word} = $count1 < $count2 ? $count1 : $count2;
    }

    return \%common;
}
126
|
|
|
|
|
|
|
|
127
|
|
|
|
|
|
|
=head2 baguni |
128
|
|
|
|
|
|
|
|
129
|
|
|
|
|
|
|
Calculates the union of two wordbags. |
130
|
|
|
|
|
|
|
|
131
|
|
|
|
|
|
|
=cut |
132
|
|
|
|
|
|
|
|
133
|
|
|
|
|
|
|
# baguni($bag1, $bag2)
#
# Union of two word bags.  Returns a new hash reference containing every
# key found in either bag; a key present in both gets the larger of the two
# counts.  Neither input bag is modified.
#
# A key that appears in only one bag keeps that bag's count as-is.  This
# avoids comparing against a missing (undef) count, which previously needed
# a blanket "no warnings" and turned a zero count into undef.
sub baguni {
    my ($bag1, $bag2) = @_;

    # Start from a shallow copy of the first bag (an undef bag is treated
    # as empty).
    my %union = %{ $bag1 || {} };

    for my $word (keys %{ $bag2 || {} }) {
        my $current   = $union{$word};
        my $candidate = $bag2->{$word};

        # Take the second bag's count when the first has none, or when the
        # second's is defined and strictly larger.
        $union{$word} = $candidate
            if !defined $current
            || (defined $candidate && $candidate > $current);
    }

    return \%union;
}
145
|
|
|
|
|
|
|
|
146
|
|
|
|
|
|
|
=head2 bagcard |
147
|
|
|
|
|
|
|
|
148
|
|
|
|
|
|
|
Calculates the cardinality of a wordbag, i.e. the sum of the counts of all its words. |
149
|
|
|
|
|
|
|
|
150
|
|
|
|
|
|
|
=cut |
151
|
|
|
|
|
|
|
|
152
|
|
|
|
|
|
|
# bagcard($bag)
#
# Cardinality of a word bag: the sum of all the counts stored in the hash
# referenced by $bag.  Returns 0 for an empty (or undef) bag.  Undefined
# counts are skipped, so they contribute nothing and raise no
# "uninitialized value" warning.
sub bagcard {
    my $bag = shift;

    my $total = 0;
    for my $count (values %{ $bag || {} }) {
        $total += $count if defined $count;
    }

    return $total;
}
158
|
|
|
|
|
|
|
|
159
|
|
|
|
|
|
|
|
160
|
|
|
|
|
|
|
|
161
|
|
|
|
|
|
|
=head1 AUTHOR |
162
|
|
|
|
|
|
|
|
163
|
|
|
|
|
|
|
Andre Santos, C<< >> |
164
|
|
|
|
|
|
|
|
165
|
|
|
|
|
|
|
=head1 BUGS |
166
|
|
|
|
|
|
|
|
167
|
|
|
|
|
|
|
Please report any bugs or feature requests to C<bug-text-perfide-wordbags at rt.cpan.org>, or through |
168
|
|
|
|
|
|
|
the web interface at L<http://rt.cpan.org/NoAuth/ReportBug.html?Queue=Text-Perfide-WordBags>. I will be notified, and then you'll |
169
|
|
|
|
|
|
|
automatically be notified of progress on your bug as I make changes. |
170
|
|
|
|
|
|
|
|
171
|
|
|
|
|
|
|
|
172
|
|
|
|
|
|
|
=head1 ACKNOWLEDGEMENTS |
173
|
|
|
|
|
|
|
|
174
|
|
|
|
|
|
|
|
175
|
|
|
|
|
|
|
=head1 LICENSE AND COPYRIGHT |
176
|
|
|
|
|
|
|
|
177
|
|
|
|
|
|
|
Copyright 2011 Project Natura. |
178
|
|
|
|
|
|
|
|
179
|
|
|
|
|
|
|
This program is free software; you can redistribute it and/or modify it |
180
|
|
|
|
|
|
|
under the terms of either: the GNU General Public License as published |
181
|
|
|
|
|
|
|
by the Free Software Foundation; or the Artistic License. |
182
|
|
|
|
|
|
|
|
183
|
|
|
|
|
|
|
See http://dev.perl.org/licenses/ for more information. |
184
|
|
|
|
|
|
|
|
185
|
|
|
|
|
|
|
|
186
|
|
|
|
|
|
|
=cut |
187
|
|
|
|
|
|
|
|
188
|
|
|
|
|
|
|
1; # End of Text::Perfide::WordBags |