File Coverage

blib/lib/DateTime/Format/Natural/Extract.pm

Criterion	Covered	Total	%
statement	161	161	100.0
branch	54	56	96.4
condition	43	45	95.5
subroutine	13	13	100.0
pod			n/a
total	271	275	98.5

line	stmt	bran	cond	sub	time	code
1						package DateTime::Format::Natural::Extract;
2
3	26			26	239	use strict;
	26				57
	26				1121
4	26			26	140	use warnings;
	26				55
	26				1511
5	26				17247	use base qw(
6						DateTime::Format::Natural::Duration::Checks
7						DateTime::Format::Natural::Formatted
8	26			26	144	);
	26				54
9	26			26	390	use boolean qw(true false);
	26				164
	26				169
10
11	26			26	2089	use constant DATE_TYPE => 0x01;
	26				74
	26				1795
12	26			26	171	use constant GRAMMAR_TYPE => 0x02;
	26				53
	26				1430
13	26			26	142	use constant DURATION_TYPE => 0x04;
	26				62
	26				1393
14
15	26			26	160	use DateTime::Format::Natural::Utils qw(trim);
	26				49
	26				78528
16
17						our $VERSION = '0.14';
18
19						my %grammar_durations = map { $_ => true } qw(for_count_unit);
20
21						my $get_range = sub
22						{
23						my ($aref, $index) = @_;
24						return [ grep defined, @$aref[$index, $index + 1] ];
25						};
26
27						my $extract_duration = sub
28						{
29						my ($skip, $indexes, $index) = @_;
30
31						return false unless defined $indexes->[$index] && defined $indexes->[$index + 1];
32						my ($left_index, $right_index) = ($indexes->[$index][1], $indexes->[$index + 1][0]);
33
34						return ($skip->{$left_index} \|\| $skip->{$right_index}) ? false : true;
35						};
36
37						sub _extract_expressions
38						{
39	209			209	588	my $self = shift;
40	209				685	my ($extract_string) = @_;
41
42	209				755	$extract_string =~ s/^[,;.]//;
43	209				887	$extract_string =~ s/[,;.]$//;
44
45	209				1274	while ($extract_string =~ /([,;.])/g) {
46	18				93	my $mark = $1;
47	18				237	my %patterns = (
48						',' => qr/(?!\d{4})/,
49						';' => qr/(?=\w)/,
50						'.' => qr/(?=\w)/,
51						);
52	18				62	my $pattern = $patterns{$mark};
53	18				908	$extract_string =~ s/\Q$mark\E \s+? $pattern/ [token] /x; # pretend punctuation marks are tokens
54						}
55
56	209				1766	my $timespan_sep = $self->{data}->__timespan('literal');
57
58	209				2064	1 while $extract_string =~ s/^$timespan_sep\s+//i;
59	209				2288	1 while $extract_string =~ s/\s+$timespan_sep$//i;
60
61	209				1494	$self->_rewrite(\$extract_string);
62
63	209				1145	my @tokens = split /\s+/, $extract_string;
64	209				441	my %entries = %{$self->{data}->__grammar('')};
	209				1605
65
66	209				1418	my (@expressions, %skip);
67
68	209	100			2769	if ($extract_string =~ /\s+ $timespan_sep \s+/ix) {
69	97				742	$self->_extract_duration($extract_string, \@tokens, \@expressions, \%skip);
70						}
71
72	209				2840	my (%expand, %lengths);
73	209				3156	foreach my $keyword (keys %entries) {
74	14630				30635	$expand{$keyword} = $self->_expand_for($keyword);
75	14630				57313	$lengths{$keyword} = @{$entries{$keyword}->[0]};
	14630				40331
76						}
77
78	209				1741	my $seen_expression;
79	209				444	do {
80	439				3160	$seen_expression = false;
81	439				1894	my $date_index = undef;
82	439				2110	for (my $i = 0; $i < @tokens; $i++) {
83	1844	100			20744	next if $skip{$i};
84	1085	100			4911	if ($self->_check_for_date($tokens[$i], $i, \$date_index)) {
85	16				236	last;
86						}
87						}
88	439				12881	GRAMMAR: foreach my $keyword (sort { $lengths{$b} <=> $lengths{$a} } grep { $lengths{$_} <= @tokens } keys %entries) {
	102927				198900
	30730				68137
89	19956				826216	my @grammar = @{$entries{$keyword}};
	19956				112459
90	19956				50274	my $types_entry = shift @grammar;
91	19956				77233	my @grammars = [ [ @grammar ], false ];
92	19956	100	100		117158	if ($expand{$keyword} && @$types_entry + 1 <= @tokens) {
93	4436				82494	@grammar = $self->_expand($keyword, $types_entry, \@grammar);
94	4436				34873	unshift @grammars, [ [ @grammar ], true ];
95						}
96	19956				194207	foreach my $grammar (@grammars) {
97	24389				97560	my $expanded = $grammar->[1];
98	24389				58225	my $length = $lengths{$keyword};
99	24389	100			103083	$length++ if $expanded;
100	24389				203804	foreach my $entry (@{$grammar->[0]}) {
	24389				58854
101	151669	100			1389427	my ($types, $expression) = $expanded ? @$entry : ($types_entry, $entry);
102	151669				1404578	my $definition = $expression->[0];
103	151669				377509	my $matched = false;
104	151669				533300	my $pos = 0;
105	151669				255710	my @indexes;
106	151669				289899	my $date_index = undef;
107	151669				460542	for (my $i = 0; $i < @tokens; $i++) {
108	850627	100			7455005	next if $skip{$i};
109	496586	100			1171012	last unless defined $types->[$pos];
110	496494	100			1528648	if ($self->_check_for_date($tokens[$i], $i, \$date_index)) {
111	9918				153188	next;
112						}
113	486576	100	66		11655182	if ($types->[$pos] eq 'SCALAR' && defined $definition->{$pos} && $tokens[$i] =~ /^$definition->{$pos}$/i
		100	100
		100	100
			66
			100
114						or $types->[$pos] eq 'REGEXP' && $tokens[$i] =~ $definition->{$pos}
115						&& (@indexes ? ($i - $indexes[-1] == 1) : true)
116						) {
117	7961				136921	$matched = true;
118	7961				42149	push @indexes, $i;
119	7961				30021	$pos++;
120						}
121						elsif ($matched) {
122	5291				49643	last;
123						}
124						}
125	151669	100	100		1480783	if ($matched
		100	100
126						&& @indexes == $length
127						&& (defined $date_index ? ($indexes[0] - $date_index == 1) : true)
128						) {
129	223	100			6552	my $expression = join ' ', (defined $date_index ? $tokens[$date_index] : (), @tokens[@indexes]);
130	223	100			828	my $start_index = defined $date_index ? $indexes[0] - 1 : $indexes[0];
131	223	100			929	my $type = $grammar_durations{$keyword} ? DURATION_TYPE : GRAMMAR_TYPE;
132	223				1824	push @expressions, [ [ $start_index, $indexes[-1] ], $expression, { flags => $type } ];
133	223	100			1417	$skip{$_} = true foreach (defined $date_index ? $date_index : (), @indexes);
134	223				1831	$seen_expression = true;
135	223				2790	last GRAMMAR;
136						}
137						}
138						}
139						}
140	439	100	100		12904	if (defined $date_index && !$seen_expression) {
141	7				246	push @expressions, [ [ ($date_index) x 2 ], $tokens[$date_index], { flags => DATE_TYPE } ];
142	7				37	$skip{$date_index} = true;
143	7				49	$seen_expression = true;
144						}
145						} while ($seen_expression);
146
147	209				2973	return $self->_finalize_expressions(\@expressions, \@tokens);
148						}
149
150						sub _extract_duration
151						{
152	97			97	347	my $self = shift;
153	97				343	my ($extract_string, $tokens, $expressions, $skip) = @_;
154
155	97				734	my $timespan_sep = $self->{data}->__timespan('literal');
156
157	97				1452	my @strings = grep /\S/, map trim($_), split /\b $timespan_sep \b/ix, $extract_string;
158	97	50			563	if (@strings) {
159	97				254	my $index = 0;
160	97				226	my @indexes;
161	97				321	foreach my $string (@strings) {
162	208				666	my @string_tokens = split /\s+/, $string;
163	208				720	push @indexes, [ $index, $index + $#string_tokens ];
164	208				537	$index += $#string_tokens + 1;
165	208		100		2442	$index++ while defined $tokens->[$index] && $tokens->[$index] =~ /^$timespan_sep$/i;
166						}
167						DURATION: {
168	97				250	for (my $i = 0; $i <= $#strings - 1; $i++) {
	160				902
169	179	100			1081	next unless $extract_duration->($skip, \@indexes, $i);
170	111				1948	my $save_expression = false;
171	111				455	my @chunks;
172	111				325	foreach my $extract (qw(_first_to_last_extract _from_count_to_count_extract)) {
173	206	100			2680	if ($self->$extract($get_range->(\@strings, $i), $get_range->(\@indexes, $i), $tokens, \@chunks)) {
174	63				1900	$save_expression = true;
175	63				288	last;
176						}
177						}
178	111	100			2282	if ($save_expression) {
179	63				657	my $timespan_sep_index = $chunks[0]->[0][1] + 1;
180	63				354	my $expression = join ' ', ($chunks[0]->[1], $tokens->[$timespan_sep_index], $chunks[1]->[1]);
181	63				250	my @indexes = ($chunks[0]->[0][0], $chunks[1]->[0][1]);
182	63				406	push @$expressions, [ [ @indexes ], $expression, { flags => DURATION_TYPE } ];
183	63				424	$skip->{$_} = true foreach ($indexes[0] .. $indexes[1]);
184	63				1375	redo DURATION;
185						}
186						}
187						}
188						}
189						}
190
191						sub _finalize_expressions
192						{
193	209			209	575	my $self = shift;
194	209				719	my ($expressions, $tokens) = @_;
195
196	209				2173	my $timespan_sep = $self->{data}->__timespan('literal');
197	209				652	my (@duration_indexes, @final_expressions);
198
199	209				885	my $seen_duration = false;
200
201	209				1567	my @expressions = sort { $a->[0][0] <=> $b->[0][0] } @$expressions;
	97				649
202
203	209				896	for (my $i = 0; $i < @expressions; $i++) {
204	293				903	my $expression = $expressions[$i];
205
206	293				1003	my $prev = $expression->[0][0] - 1;
207	293				728	my $next = $expression->[0][1] + 1;
208
209	293	100			1497	if ($expression->[2]->{flags} & (DATE_TYPE\|GRAMMAR_TYPE)) {
		50
210	224	100	100		1092	if (!$seen_duration
		100	100
			100
			100
			100
211						&& defined $tokens->[$next]
212						&& $tokens->[$next] =~ /^$timespan_sep$/i
213						&& defined $expressions[$i + 1]
214						&& $expressions[$i + 1]->[2]->{flags} & (DATE_TYPE\|GRAMMAR_TYPE)
215						&& $expressions[$i + 1]->[0][0] - $next == 1
216						) {
217	31				1559	push @duration_indexes, ($expression->[0][0] .. $expression->[0][1]);
218	31				139	$seen_duration = true;
219						}
220						elsif ($seen_duration) {
221	31				874	push @duration_indexes, ($prev, $expression->[0][0] .. $expression->[0][1]);
222	31				200	push @final_expressions, join ' ', @$tokens[@duration_indexes];
223	31				107	@duration_indexes = ();
224	31				112	$seen_duration = false;
225						}
226						else {
227	162				5238	push @final_expressions, $expression->[1];
228						}
229						}
230						elsif ($expression->[2]->{flags} & DURATION_TYPE) {
231	69				368	push @final_expressions, $expression->[1];
232						}
233						}
234
235	209			262	1378	my $exclude = sub { $_[0] =~ /^\d{1,2}$/ };
	262				14416
236
237	209				994	return grep !$exclude->($_), @final_expressions;
238						}
239
240						sub _check_for_date
241						{
242	497579			497579	964561	my $self = shift;
243	497579				1200983	my ($token, $index, $date_index) = @_;
244
245	497579				3073076	my ($formatted) = $token =~ $self->{data}->__regexes('format');
246	497579				1752187	my %count = $self->_count_separators($formatted);
247	497579	100			1470290	if ($self->_check_formatted('ymd', \%count)) {
248	9934				23427	$$date_index = $index;
249	9934				40973	return true;
250						}
251						else {
252	487645				1388386	return false;
253						}
254						}
255
256						1;
257						__END__
258
259						=head1 NAME
260
261						DateTime::Format::Natural::Extract - Extract parsable expressions from strings
262
263						=head1 SYNOPSIS
264
265						Please see the DateTime::Format::Natural documentation.
266
267						=head1 DESCRIPTION
268
269						C<DateTime::Format::Natural::Extract> extracts expressions from strings to be
270						processed by the parse methods.
271
272						=head1 SEE ALSO
273
274						L<DateTime::Format::Natural>
275
276						=head1 AUTHOR
277
278						Steven Schubiger <schubiger@cpan.org>
279
280						=head1 LICENSE
281
282						This program is free software; you may redistribute it and/or
283						modify it under the same terms as Perl itself.
284
285						See L<http://dev.perl.org/licenses/>
286
287						=cut