File Coverage

blib/lib/String/Normal/Type/Business.pm

Criterion	Covered	Total	%
statement	65	68	95.5
branch	22	32	68.7
condition	7	11	63.6
subroutine	11	11	100.0
pod	2	2	100.0
total	107	124	86.2

line	stmt	bran	cond	sub	pod	time	code
1							package String::Normal::Type::Business;
2	9			9		29	use strict;
	9					11
	9					185
3	9			9		26	use warnings;
	9					11
	9					150
4	9			9		32	use String::Normal::Type;
	9					8
	9					169
5	9			9		2954	use String::Normal::Config;
	9					13
	9					163
6
7	9			9		3755	use Lingua::Stem;
	9					37256
	9					5501
8							our $STEM;
9							our $biz_stop;
10							our $biz_compress;
11
12							sub transform {
13	13			13	1	16	my ($self,$value) = @_;
14
15							# tokenize and stem
16	13					12	my (@digits,@words);
17	13					26	_tokenize_value( $value, \@digits, \@words );
18	13					44	$STEM->stem_in_place( @words );
19
20							# Remove "special" beginning and/or ending stopwords, if such words are present
21							# and enough tokens are in place to remove them safely.
22	13	50				6186	if (@words) {
23							# make a copy of @words and whittle it down
24	13					23	my @copy = @words;
25	13					10	my $count;
26	13	50				35	if ($count = $biz_stop->{first}{$copy[0]}) {
27	0	0				0	shift @copy if @copy >= $count;
28							}
29	13	50	33			67	if (@copy and $count = $biz_stop->{last}{$copy[-1]}) {
30	0	0				0	pop @copy if @copy >= $count;
31							}
32
33							# reverting back if overnormalization occurs
34	13	50				39	@words = @copy if @copy;
35							}
36
37							# Remove all middle stop words that are safe to remove, based on the number of
38							# tokens, of course.
39							my @filtered = map {
40	13		100			17	my $count = $biz_stop->{middle}{$_} \|\| '';
	26					82
41	26	100	66			101	(length $count and @words >= $count) ? () : $_;
42							} @words;
43
44							# If we filtered all words out, "revert" to the full array of stemmed tokens.
45	13	100				26	@filtered = @words unless @filtered;
46
47							# The canon name is the sorted filtered stemmed words plus the original digits.
48	13					95	return join ' ', sort @digits, @filtered;
49							}
50
51							sub new {
52	3			3	1	5	my $self = shift;
53	3					19	$STEM = Lingua::Stem->new;
54	3					124	$STEM->add_exceptions( String::Normal::Config::BusinessStem::_data( @_ ) );
55	3					2364	$biz_stop = String::Normal::Config::BusinessStop::_data( @_ );
56	3					14	$biz_compress = String::Normal::Config::BusinessCompress::_data( @_ );
57	3					26	return bless {@_}, $self;
58							}
59
60							sub _tokenize_value {
61	13			13		16	my ($value,$digits,$words) = @_;
62
63	13					33	$value = String::Normal::Type::_scrub_value( $value );
64
65							# split tokens on more than just whitespace:
66							# split digits from words but keep things like 3D and 1st combined,
67							# also split things like abcd#efgh but keep pound signs for #2 and # 1 and #
68							# prevent the empty string from finding its way into the token list as well
69	13	50				48	my @tokens = map { map length $_ ? $_ : (), split /##+\|\s+\|#+\b\|\b#+/, $_ } $value =~ /(?:\d+\w{1,2}\b\|\d+\|\D+)/g;
	13					122
70
71							# walk each token thru the tree and create markers
72	13					28	my @pairs = _mark_pairs( \@tokens );
73	13	100				48	_compress_list( \@tokens, \@pairs ) if @pairs;
74
75							# separate out tokens that contain digits (snowball stemmer will scrub all digits)
76	13					19	for (@tokens) {
77	26	50				44	if (/\d/) {
78	0					0	push @$digits, $_;
79							} else {
80	26					48	push @$words, $_;
81							}
82							}
83							}
84
85							sub _mark_pairs {
86	13			13		14	my $tokens = shift;
87	13					14	my @pairs = ();
88	13					34	for my $i (0 .. $#$tokens) {
89	30					34	my $token = $tokens->[$i];
90	30	100				62	next unless exists $biz_compress->{$token};
91	5	100				20	next if $i + 1 > $#$tokens;
92	4					15	my $end = _walk_tree( $i + 1, $tokens, $biz_compress->{$token} );
93	4	100				11	if ($end) {
94	3					6	push @pairs, [$i,$end];
95	3					6	$i = $end;
96							}
97							}
98	13					22	return @pairs;
99							}
100
101
102							sub _walk_tree {
103	5			5		7	my ($i, $list, $tree) = @_;
104
105	5	100				16	if (my $t = $tree->{$list->[$i]}) {
106	4	100	66			25	if (ref $t eq 'HASH' and !%$t) {
107	3					10	return $i;
108							} else {
109	1					5	_walk_tree( $i + 1, $list, $t );
110							}
111							}
112							}
113
114							sub _compress_list {
115	3			3		5	my ($list,$pairs) = @_;
116	3					7	for my $pair (reverse @$pairs) {
117	3					5	my ($s,$e) = @$pair;
118	3					18	splice @$list, $s, $e - $s + 1, join '', @$list[$s .. $e];
119							}
120							}
121
122							1;
123
124							__END__