File Coverage

blib/lib/Date/Parse/Modern.pm

Criterion	Covered	Total	%
statement	116	132	87.8
branch	45	56	80.3
condition	49	76	64.4
subroutine	8	8	100.0
pod	1	2	50.0
total	219	274	79.9

line	stmt	bran	cond	sub	pod	time	code
1							#!/usr/bin/env perl
2
3							package Date::Parse::Modern;
4
5	2			2		145737	use strict;
	2					20
	2					62
6	2			2		10	use warnings;
	2					8
	2					46
7	2			2		46	use v5.10;
	2					8
8
9	2			2		11	use Carp;
	2					4
	2					183
10	2			2		1003	use Time::Local 1.26;
	2					5191
	2					119
11	2			2		13	use Exporter 'import';
	2					4
	2					3776
12							our @EXPORT = ('strtotime');
13
14							###############################################################################
15
16							our $VERSION = 0.3;
17
18							# https://timezonedb.com/download
19							my $TZ_OFFSET = {
20							'ACDT' => 10, 'ACST' => 9, 'ACT' => -5, 'ACWST' => 8, 'ADT' => -3, 'AEDT' => 11, 'AEST' => 10, 'AFT' => 4,
21							'AKDT' => -8, 'AKST' => -9, 'ALMT' => 6, 'AMST' => 5, 'AMT' => 4, 'ANAST' => 12, 'ANAT' => 12, 'AQTT' => 5,
22							'ART' => -3, 'AST' => -4, 'AWDT' => 9, 'AWST' => 8, 'AZOST' => 0, 'AZOT' => -1, 'AZST' => 5, 'AZT' => 4,
23							'AoE' => -12, 'BNT' => 8, 'BOT' => -4, 'BRST' => -2, 'BRT' => -3, 'BST' => 1, 'BTT' => 6, 'CAST' => 8,
24							'CAT' => 2, 'CCT' => 6, 'CDT' => -5, 'CEST' => 2, 'CET' => 1, 'CHADT' => 13, 'CHAST' => 12, 'CHOST' => 9,
25							'CHOT' => 8, 'CHUT' => 10, 'CIDST' => -4, 'CIST' => -5, 'CKT' => -10, 'CLST' => -3, 'CLT' => -4, 'COT' => -5,
26							'CST' => -6, 'CVT' => -1, 'CXT' => 7, 'ChST' => 10, 'DAVT' => 7, 'DDUT' => 10, 'EASST' => -5, 'EAST' => -6,
27							'EAT' => 3, 'ECT' => -5, 'EDT' => -4, 'EEST' => 3, 'EET' => 2, 'EGST' => 0, 'EGT' => -1, 'EST' => -5,
28							'FET' => 3, 'FJST' => 13, 'FJT' => 12, 'FKST' => -3, 'FKT' => -4, 'FNT' => -2, 'GALT' => -6, 'GAMT' => -9,
29							'GET' => 4, 'GFT' => -3, 'GILT' => 12, 'GMT' => 0, 'GST' => -2, 'GYT' => -4, 'HDT' => -9, 'HKT' => 8,
30							'HOVST' => 8, 'HOVT' => 7, 'HST' => -10, 'ICT' => 7, 'IDT' => 3, 'IOT' => 6, 'IRDT' => 4, 'IRKST' => 9,
31							'IRKT' => 8, 'IRST' => 3, 'IST' => 2, 'JST' => 9, 'KGT' => 6, 'KOST' => 11, 'KRAST' => 8, 'KRAT' => 7,
32							'KST' => 9, 'KUYT' => 4, 'LHDT' => 11, 'LHST' => 10, 'LINT' => 14, 'MAGST' => 12, 'MAGT' => 11, 'MART' => -9,
33							'MAWT' => 5, 'MDT' => -6, 'MHT' => 12, 'MMT' => 6, 'MSD' => 4, 'MSK' => 3, 'MST' => -7, 'MUT' => 4,
34							'MVT' => 5, 'MYT' => 8, 'NCT' => 11, 'NDT' => -2, 'NFDT' => 12, 'NFT' => 11, 'NOVST' => 7, 'NOVT' => 7,
35							'NPT' => 5, 'NRT' => 12, 'NST' => -3, 'NUT' => -11, 'NZDT' => 13, 'NZST' => 12, 'OMSST' => 7, 'OMST' => 6,
36							'ORAT' => 5, 'PDT' => -7, 'PET' => -5, 'PETST' => 12, 'PETT' => 12, 'PGT' => 10, 'PHOT' => 13, 'PHT' => 8,
37							'PKT' => 5, 'PMDT' => -2, 'PMST' => -3, 'PONT' => 11, 'PST' => -8, 'PWT' => 9, 'PYST' => -3, 'PYT' => 8,
38							'QYZT' => 6, 'RET' => 4, 'ROTT' => -3, 'SAKT' => 11, 'SAMT' => 4, 'SAST' => 2, 'SBT' => 11, 'SCT' => 4,
39							'SGT' => 8, 'SRET' => 11, 'SRT' => -3, 'SST' => -11, 'SYOT' => 3, 'TAHT' => -10, 'TFT' => 5, 'TJT' => 5,
40							'TKT' => 13, 'TLT' => 9, 'TMT' => 5, 'TOST' => 14, 'TOT' => 13, 'TRT' => 3, 'TVT' => 12, 'ULAST' => 9,
41							'ULAT' => 8, 'UYST' => -2, 'UYT' => -3, 'UZT' => 5, 'VET' => -4, 'VLAST' => 11, 'VLAT' => 10, 'VOST' => 6,
42							'VUT' => 11, 'WAKT' => 12, 'WARST' => -3, 'WAST' => 2, 'WAT' => 1, 'WEST' => 1, 'WET' => 0, 'WFT' => 12,
43							'WGST' => -2, 'WGT' => -3, 'WIB' => 7, 'WIT' => 9, 'WITA' => 8, 'WST' => 1, 'YAKST' => 10, 'YAKT' => 9,
44							'YAPT' => 10, 'YEKST' => 6, 'YEKT' => 5, 'Z' => 0,
45							};
46
47							# Separator between dates pieces: '-' or '/' or '\'
48							my $sep = qr/[\/\\-]/;
49
50							# Force a local timezone offset (used for unit tests)
51							our $LOCAL_TZ_OFFSET = undef;
52
53							# Use caching for repeated lookups for the same TZ offset
54							our $USE_TZ_CACHE = 1;
55
56							# These are undocumented package variables. They could be changed to support
57							# alternate languages but there are caveats. These are cached and changing
58							# them after strtotime() is called won't affect anything. No one has requested
59							# alternate languages, so I'm leaving this undocumented for now.
60							our $MONTH_MAP = {
61							'jan' => 1, 'feb' => 2, 'mar' => 3, 'apr' => 4 , 'may' => 5 , 'jun' => 6 ,
62							'jul' => 7, 'aug' => 8, 'sep' => 9, 'oct' => 10, 'nov' => 11, 'dec' => 12,
63							};
64
65							# See above
66							our $MONTH_REGEXP = qr/
67							Jan\|January\|Feb\|February\|Mar\|March\|Apr\|April\|May\|Jun\|June\|
68							Jul\|July\|Aug\|August\|Sep\|September\|Oct\|October\|Nov\|November\|Dec\|December
69							/ix;
70
71							###############################################################################
72							###############################################################################
73							###############################################################################
74
75							=head1 NAME
76
77							C - Provide string to unixtime conversions
78
79							=head1 DESCRIPTION
80
81							C provides a single function C which takes a datetime string
82							and returns a unixtime. Care was given to support the most modern style strings that you would
83							commonly find in log files or on the internet.
84
85							=head1 USAGE
86
87							use Date::Parse::Modern;
88
89							C exports the C function automatically.
90
91							=head1 FUNCTIONS
92
93							=head2 strtotime($string)
94
95							my $unixtime = strtotime('1979-02-24'); # 288691200
96
97							Simply feed C a string with some type of date or time in it, and it will return an
98							integer unixtime. If the string is unparseable, or a weird error occurs, it will return C.
99
100							All the "magic" in C is done using regular expressions that look for common datetime
101							formats. Common formats like YYYY-MM-DD and HH:II:SS are easily detected and converted to the
102							appropriate formats. This allows the date or time to be found anywhere in the string, in (almost) any
103							order. In all cases, the day of the week is ignored in the input string.
104
105							B Strings without a year are assumed to be in the current year. Example: C
106
107							B Strings with only a date are assumed to occur at the midnight. Example: C<2023-01-15>
108
109							B Strings with only time are assumed to be the current day. Example: C<10:15am>
110
111							B In strings with numeric B textual time zone offsets, the numeric is used. Example:
112							C<14 Nov 1994 11:34:32 -0500 (EST)>
113
114							=head1 Bugs/Features
115
116							Please submit bugs and feature requests on Github:
117
118							https://github.com/scottchiefbaker/perl-Date-Parse-Modern
119
120							=head1 AUTHORS
121
122							Scott Baker - https://www.perturb.org/
123
124							=cut
125
126							###############################################################################
127							###############################################################################
128							###############################################################################
129
130							# The logic here is that we use regular expressions to pull out various patterns
131							# YYYY/MM/DD, H:I:S, DD MonthWord YYYY
132							sub strtotime {
133	38			38	1	195	my ($str, $debug) = @_;
134
135	38	100				93	if (!defined($str)) {
136	1					4	return undef;
137							}
138
139	37					74	my ($year, $month, $day) = (0, 0, 0);
140	37					69	my ($hour, $min , $sec) = (0, 0, 0);
141
142							###########################################################################
143							###########################################################################
144
145	37					91	state $rule_1 = qr/
146							\b
147							((\d{4})$sep(\d{2})$sep(\d{2}) # YYYY-MM-DD
148							\|
149							(\d{2})$sep(\d{2})$sep(\d{4})) # DD-MM-YYYY
150							/x;
151
152							# First we look to see if we have anything that mathches YYYY-MM-DD (numerically)
153	37	100				324	if ($str =~ $rule_1) {
154							# YYYY-MM-DD: 1999-12-24
155	14	100	66			56	if ($2 \|\| $3) {
156	13					25	$year = $2;
157	13					23	$month = $3;
158	13					27	$day = $4;
159							}
160
161							# DD-MM-YYYY: 12-24-1999
162	14	100	66			63	if ($5 \|\| $6) {
163	1					3	$day = $5;
164	1					2	$month = $6;
165	1					3	$year = $7;
166
167							# It might be American format (MM-DD-YYYY) so we do a quick flip/flop
168	1	50				5	if ($month > 12) {
169	1					5	($day, $month) = ($month, $day);
170							}
171							}
172							}
173
174							# The year may be on the end of the string: Sat May 8 21:24:31 2021
175	37	100				81	if (!$year) {
176	23					75	($year) = $str =~ m/\s(\d{4})\b/;
177							}
178
179							###########################################################################
180
181	37					202	state $rule_2 = qr/
182							(\d{1,2})? # Maybe some digits before month
183							\s*
184							($MONTH_REGEXP) # A textual month
185							\s+
186							(\d{1,4}) # Digits
187							[\s\$] # Whitespace OR end of line
188							((\d{4}) )? # If there are digits ater the space it's 'Jan 13 2000'
189							/x;
190
191							# Next we look for alpha months followed by a digit if we didn't find a numeric month above
192							# This will find: "April 13" and also "13 April 1995"
193	37	100	100			283	if (!$month && $str =~ $rule_2) {
194
195							# Get the numerical number for this month
196	13					47	my $month_name = lc(substr($2,0,3));
197	13					27	$month = $MONTH_MAP->{$month_name};
198
199							# 17 March 94
200	13	100				34	if ($1) {
201	4					13	$day = int($1);
202	4					20	$year = int($3);
203							# April 13 or April 13 94
204							} else {
205	9					21	$day = int($3);
206
207							# IF there is a $5 it's a year
208	9		50			31	$year \|\|= int($5 \|\| 0);
			66
209							}
210							}
211
212							###########################################################################
213
214							# Alternate date string like like: 21/dec/93 or dec/21/93 much less common
215	37	100	100			269	if (!$month && $str =~ /(.)($MONTH_REGEXP)(.)/) {
216	5					15	my $before = $1;
217	5					11	my $after = $3;
218
219							# Lookup the numeric month based on the string name
220	5		50			22	$month = $MONTH_MAP->{lc($2)} \|\| 0;
221
222							# Month starts string: dec/21/93 or feb/14/1999
223	5	100	33			22	if ($before eq "") {
		50
224	2					27	$after =~ m/(\d{2})$sep(\d{2,4})/;
225
226	2					6	$day = $1;
227	2					4	$year = $2;
228
229							# Month in the middle: 21/dec/93
230							} elsif ($before && $after) {
231	3					11	$before =~ m/(\d+)\D/; # Just the digits
232	3		50			8	$day = $1 \|\| 0;
233
234	3					10	$after =~ m/\D(\d{2,4})(.)/; # Get the digits AFTER the separator
235
236							# If it's not a time (has a colon) it's the year
237	3	100				9	if ($2 ne ":") {
238	1					3	$year = $1;
239							}
240							}
241							}
242
243							###########################################################################
244
245	37					57	state $rule_3 = qr/
246							(\b\|T) # Anchor point
247							(\d{1,2}): # Hours
248							(\d{1,2}):? # Minutes
249							(\d{2}(Z\|\.\d+)?)? # Seconds (optional)
250							\ ?(am\|pm\|AM\|PM)? # AMPM (optional)
251							/x;
252
253							# Now we look for times: 10:14, 10:14:17, 08:15pm
254	37	100				250	if ($str =~ $rule_3) {
255	27					73	$hour = int($2);
256	27					36	$min = int($3);
257	27		100			85	$sec = $4 \|\| 0; # Not int() cuz it might be float for milliseconds
258
259	27					58	$sec =~ s/Z$//;
260
261							# The string of AM or PM
262	27		100			89	my $ampm = lc($6 \|\| "");
263
264							# PM means add 12 hours
265	27	100				57	if ($ampm eq "pm") {
266	1					2	$hour += 12;
267							}
268
269							# 12:15am = 00:15 / 12:15pm = 12:15 so we have to compensate
270	27	50	66			58	if ($ampm && ($hour == 24 \|\| $hour == 12)) {
			100
271	2					5	$hour -= 12;
272							}
273							}
274
275							# Just some basic sanity checking
276	37		100			108	my $has_time = ($hour \|\| $min \|\| $sec);
277	37		66			111	my $has_date = ($year \|\| $month \|\| $day);
278
279	37	100	100			86	if (!$has_time && !$has_date) {
280							# One final check if NOTHING else has matched, we lookup a weird format: 20020722T100000Z
281	4	100				14	if ($str =~ m/(\d{4})(\d{2})(\d{2})T(\d\d)(\d\d)(\d\d)Z/) {
282	1					4	$year = $1;
283	1					2	$month = $2;
284	1					2	$day = $3;
285
286	1					2	$hour = $4;
287	1					3	$min = $5;
288	1					2	$sec = $6;
289							} else {
290	3					18	return undef;
291							}
292							}
293
294							###########################################################################
295							###########################################################################
296
297							# Sanity check some basic boundaries
298							# I don't think we need this any more since we eval() and timegm_modern() will barf and return undef
299							#if ($month > 12 \|\| $day > 31 \|\| $hour > 23 \|\| $min > 60 \|\| $sec > 61) {
300							# return undef;
301							#}
302
303	34		66			92	$month \|\|= (localtime())[4] + 1; # If there is no month, we assume the current month
304	34		66			74	$day \|\|= (localtime())[3]; # If there is no day, we assume the current day
305							# If we STILL don't have a year it may be a time only string so we assume it's the current year
306	34		66			210	$year \|\|= (localtime())[5] + 1900;
307
308							# Convert any two digit years to four digits
309	34	100				79	if ($year < 100) {
310	5					11	$year += 1900;
311							}
312
313							# If we have all the requisite pieces we build a unixtime
314	34					46	my $ret;
315	34	50	0			50	my $err = $@ \|\| 'Error' unless eval {
316	34					116	$ret = Time::Local::timegm_modern($sec, $min, $hour, $day, $month - 1, $year);
317
318	34					1318	return 1;
319							};
320
321	34	50	33			76	if ($err && $err =~ /Undefined subroutine/) {
322	0					0	print STDERR $err;
323	0					0	return undef;
324							};
325
326							# If we find a timezone offset we take that in to account now
327							# Either: +1000 or -0700
328							# or
329							# 11:53 PST (One to four chars after a time)
330	34					56	my $tz_offset_seconds = 0;
331	34					49	my $tz_str = '';
332	34					55	state $tz_rule = qr/
333							(
334							(\s\|:\d\d) # Start AFTER a space, or time (:12)
335							([+-])(\d{1,2})(\d{2}) # +1000 or -700 (three or four digits)
336							\|
337							\d{2}\ # Only match chars if they're AFTER a time
338							([A-Z]{1,4})\b # Capitalized TZ at end of string
339							\|
340							\d{2}(Z)$ # Just a simple Z at the end
341							)
342							/x;
343
344	34	100	100			439	if ($ret && $str =~ $tz_rule) {
		100
345	15					22	my $str_offset = 0;
346
347							# String timezone: 11:53 PST
348	15	100	100			56	if ($6 \|\| $7) {
349							# Whichever form matches, the TZ is that one
350	10		50			29	my $tz_code = $6 \|\| $7 \|\| '';
351
352							# Lookup the timezone offset in the table
353	10		100			45	$str_offset = $TZ_OFFSET->{$tz_code} \|\| 0;
354							# Timezone offsets are in hours, so we convert to seconds
355	10					15	$str_offset *= 3600;
356
357	10					16	$tz_str = $tz_code;
358							# Numeric format: +1000 or -0700
359							} else {
360							# Break the input string into parts so we can do math
361							# +1000 = 10 hours, -0700 = 7 hours, +0430 = 4.5 hours
362	5					18	$str_offset = ($4 + ($5 / 60)) * 3600;
363
364	5	100				13	if ($3 eq "-") {
365	4					7	$str_offset *= -1;
366							}
367
368	5					14	$tz_str = "$2$3$4";
369							}
370
371	15					26	$tz_offset_seconds = $str_offset;
372							# No timezone info found so we assume the local timezone
373							} elsif ($ret) {
374	17					38	my $local_offset = get_local_offset($ret);
375
376	17					24	$tz_offset_seconds = $local_offset;
377	17					29	$tz_str = 'UNSPECIFIED';
378							}
379
380							# Subtract the timezone offset from the unixtime
381	34					58	$ret -= $tz_offset_seconds;
382
383	34	50				60	if ($debug) {
384	0					0	my $color = "\e[38;5;45m";
385	0					0	my $reset = "\e[0m";
386	0					0	my $header = sprintf("%*s = YYYY-MM-DD HH:II:SS (timezone offset)", length($str) + 2, "Input string");
387	0		0			0	my $output = sprintf("'%s' = %02d-%02d-%02d %02d:%02d:%02d (%s = %d seconds)", $str, $year \|\| -1, $month \|\| -1, $day \|\| -1, $hour, $min, $sec, $tz_str, $tz_offset_seconds);
			0
			0
388
389	0					0	print STDERR $color . $header . $reset . "\n";
390	0					0	print STDERR $output . "\n";
391							}
392
393
394	34					176	return $ret;
395							}
396
397							# Return the timezone offset for the local machine
398							sub get_local_offset {
399	17			17	0	28	my $unixtime = $_[0];
400
401							# Since timezones only change on the half-hour (at most), we
402							# round down the nearest half hour "bucket" and then cache
403							# that result. We probably could get away with a full hour
404							# here but we don't gain much performance/memory by doing that
405	17					21	my $bucket_size = 1800;
406	17					33	my $cache_key = $unixtime - ($unixtime % $bucket_size);
407
408							# If we have a forced LOCAL_TZ_OFFSET we use that (unit tests)
409	17	50				32	if (defined($LOCAL_TZ_OFFSET)) {
410	17					45	return $LOCAL_TZ_OFFSET;
411							}
412
413							# Simple memoizing (improves repeated performance a LOT)
414							# Note: this is even faster than `use Memoize`
415	0						state $x = {};
416	0	0	0				if ($USE_TZ_CACHE && $x->{$cache_key}) {
417	0						return $x->{$cache_key};
418							}
419
420							# Get a time obj for this local timezone and UTC for the Unixtime
421							# Then compare the two to get the local TZ offset
422	0						my @t = localtime($unixtime);
423	0						my $ret = (Time::Local::timegm(@t) - Time::Local::timelocal(@t));
424
425							# Cache the result
426	0	0					if ($USE_TZ_CACHE) {
427	0						$x->{$cache_key} = $ret;
428							}
429
430	0						return $ret;
431							}
432
433							1;
434
435							__END__