File Coverage

blib/lib/WordLists/Tag/Tagger.pm

Criterion	Covered	Total	%
statement	93	105	88.5
branch	12	16	75.0
condition	8	20	40.0
subroutine	13	17	76.4
pod	1	5	20.0
total	127	163	77.9

line	stmt	bran	cond	sub	pod	time	code
1							package WordLists::Tag::Tagger;
2	2			2		127055	use strict;
	2					5
	2					69
3	2			2		11	use warnings;
	2					4
	2					55
4	2			2		1018	use utf8;
	2					13
	2					10
5	2			2		562	use WordLists::Common qw(/generic/);
	2					6
	2					494
6	2			2		2667	use Lingua::EN::Tagger;
	2					222434
	2					96
7	2			2		676	use WordLists::WordList;
	2					5
	2					60
8	2			2		11	use WordLists::Sense;
	2					5
	2					42
9	2			2		10	use WordLists::Base;
	2					3
	2					414
10							our $VERSION = $WordLists::Base::VERSION;
11
12							our $AUTOLOAD;
13							our @ignore_pos_codes = qw(cd to prp prps sym pp pps ppr lrb rrb ppc ppl );
14							sub _norm_word($)
15							{
16	11			11		21	my $s = shift;
17	11					21	$s=~s/\.//g;
18	11					17	$s=~s/\/.*//;
19	11					18	$s=~s/['’]s$//;
20	11					22	$s=~s/['‘’`"“”]$//;
21	11					19	$s=~s/^['‘’`"“”]//;
22	11					35	$s=~tr/…–\xA0\*//d;
23	11					30	return lc $s;
24							}
25							sub human_pos($)
26							{
27	29			29	0	47	my $sPos = shift;
28	29					58	$sPos =~ s<^nn.*$>
29							;
30	29					44	$sPos =~ s<^in$>
31							;
32	29					41	$sPos =~ s<^to$>
33							;
34	29					59	$sPos =~ s<^jj.*$>
35							;
36	29					48	$sPos =~ s<^md.*$>
37							;
38	29					35	$sPos =~ s<^vb.*$>
39							;
40	29					30	$sPos =~ s<^rb.*$>
41							;
42	29					56	$sPos =~ s<^det$>
43							;
44	29					36	$sPos =~ s<^cc$>
45							;
46	29					34	$sPos =~ s<^wrb$>
47							;
48	29					34	$sPos =~ s<^wdt$>
49							;
50	29					36	$sPos =~ s<^wp$>
51							;
52	29					36	$sPos =~ s<^prp.*$>
53							;
54	29					32	$sPos =~ s<^cd.*$>
55							;
56	29					37	$sPos =~ s<^uh$>
57							;
58	29					142	return $sPos;
59							}
60
61
62							sub new
63							{
64	1			1	0	14	my ($class, $args) = @_;
65	1		50			11	$args \|\|={};
66	1					12	my $self = {
67							tagger=>Lingua::EN::Tagger->new(%$args),
68							};
69	1					237810	bless ($self, $class);
70							}
71
72							1;
73
74							sub add_tags
75							{
76	0			0	0	0	my ($self, $sMS) = @_;
77	0					0	my $taggedMS = $self->{tagger}->add_tags( $sMS );
78	0					0	return $taggedMS;
79							}
80
81							sub add_human_tags
82							{
83	3			3	0	7	my ($self, $sMS) = @_;
84	3					10	my $sMSOUT = '';
85	3					5	foreach my $sSentence (@{$self->{tagger}->get_sentences($sMS)})
	3					19
86							{
87	3					15857	$sSentence =~ tr/<>&//d;
88	3					16	my $taggedSentence = $self->{tagger}->add_tags( $sSentence );
89	3	100				3577	$taggedSentence =~ s`<([a-z]+)>([^<]+)`human_pos($1) ne $1 ? qq($2) : $2;`ge;
	11					26
90	3					140	$sMSOUT .= " $taggedSentence ";
91							}
92	3					31	return $sMSOUT;
93							}
94
95							sub get_wordlist
96							{
97	1			1	1	4	my ($self, $sUntagged, $args) = @_;
98
99	1					2	my $wl;
100	1	50	33			14	if (defined $args->{'wl'} and ref $args->{'wl'} eq 'WordLists::WordList')
101							{
102	0					0	$wl=$args->{'wl'};
103							}
104							else
105							{
106	1					13	$wl = WordLists::WordList->new;
107							}
108	1					2	foreach my $sSentence (@{$self->{tagger}->get_sentences($sUntagged)})
	1					6
109							{
110	1					3039	my $taggedMS;
111	1					5	$taggedMS = $self->{tagger}->add_tags( $sSentence );
112	1					2614	while ($taggedMS =~ m`<([a-z]+)>([^<]+)`g)
113							{
114							#print "\n$1\t$2";
115	11					30	my $sHW = _norm_word($2);
116	11					18	my $sPosCode = $1;
117	11					21	my $bNext;
118	11					22	foreach (@ignore_pos_codes) #
119							{
120	125	100				239	if ($sPosCode eq $_)
121							{
122	2					3	$bNext++;
123	2					5	last;
124							}
125							}
126	11	100				32	next if $bNext;
127	9					21	my $sPos = human_pos($sPosCode);
128	9					43	my $sense = WordLists::Sense->new();
129	9					60	$sense->set_hw($sHW);
130	9					52	$sense->set_pos($sPos);
131	9					49	$sense->set_eg($sSentence);
132	9					46	$sense->set_poscode($sPosCode);
133	9	50	33	0		34	if (defined $args->{'callback_on_make_sense'} and ref $args->{'callback_on_make_sense'} eq ref sub{})
	0					0
134							{
135	0					0	&{$args->{'callback_on_make_sense'}}($sense);
	0					0
136							}
137	9					57	my @senses = $wl->get_senses_for($sense->get_hw, $sense->get_pos);
138	9	100	66			53	if (@senses and !$args->{'keep_repeats'})
139							{
140
141							}
142							else
143							{
144	8	50	33	0		38	if (defined $args->{'test_sense_before_add'} and ref $args->{'test_sense_before_add'} eq ref sub {} and !&{$args->{'test_sense_before_add'}}($sense))
	0		33			0
	0					0
145							{
146							}
147							else
148							{
149	8	50	33	0		29	if (defined $args->{'callback_on_add_sense'} and ref $args->{'callback_on_add_sense'} eq ref sub{})
	0					0
150							{
151	0					0	&{$args->{'callback_on_add_sense'}}($sense);
	0					0
152							}
153	8					26	$wl->add_sense($sense);
154							}
155							}
156							};
157							}
158	1					9	return $wl;
159							}
160							=pod
161
162							=head1 NAME
163
164							WordLists::Tag::Tagger
165
166							=head1 SYNOPSIS
167
168							my $tagger = WordLists::Tag::Tagger->new();
169							my $wl = $tagger->get_wordlist('The quick brown fox jumped over the lazy dog');
170
171							=head1 DESCRIPTION
172
173							Uses L to do various things with strings, chielfly to create a L out of a document, e.g. to use as a basis for a glossary.
174
175							=head1 METHODS
176
177							=head3 get_wordlist
178
179							Uses L to create a L out of a string (e.g. a manuscript).
180
181							L allows splitting into sentences, and these sentences become C fields in the Ls generated.
182
183							Only the first instance of each headword / part of speech combination is entered into the list, unless the third argument has a key C with a true value.
184
185							The fields populated are: C, C, C, and C, which is the original part of speech code outputted by the tagger.
186
187							The third argument is a hashref which allows you to configure several options.
188
189							C should be a coderef. It is passed the sense immediately before it is added to the wordlist, e.g. if you want to add a unit.
190
191							C should be a coderef. It is passed the sense before the wordlist is tested for inclusion. This is an opportunity to further normalise parts of speech.
192
193							C is a flag, which, if set, prevents the code from removing repetitions.
194
195							=head1 TODO
196
197							C is a wordlist or arrayref whose elements are L objects or plain hashrefs of the form C<< {hw=>'head', pos =>'n'} >>, or headwords as strings. If these words are found, they are not added to the list. (not yet implemented!)
198
199							=head1 BUGS
200
201							Please use the Github issues tracker.
202
203							=head1 LICENSE
204
205							Copyright 2011-2012 © Cambridge University Press. This module is free software; you can redistribute it and/or modify it under the same terms as Perl itself.
206
207							=cut