File Coverage

blib/lib/Geo/UK/Postcode/Regex.pm

Criterion	Covered	Total	%
statement	90	90	100.0
branch	65	74	87.8
condition	11	13	84.6
subroutine	21	21	100.0
pod	16	16	100.0
total	203	214	94.8

line	stmt	bran	cond	sub	pod	time	code
1							package Geo::UK::Postcode::Regex;
2
3	8			8		450250	use strict;
	8					81
	8					257
4	8			8		55	use warnings;
	8					19
	8					241
5
6	8			8		3704	use Geo::UK::Postcode::Regex::Hash;
	8					24
	8					306
7
8	8			8		63	use base 'Exporter';
	8					19
	8					17527
9							our @EXPORT_OK = qw( is_valid_pc is_strict_pc is_lax_pc %REGEXES );
10
11							our $VERSION = '0.017';
12
13							=encoding utf-8
14
15							=head1 NAME
16
17							Geo::UK::Postcode::Regex - regular expressions for handling British postcodes
18
19							=head1 SYNOPSIS
20
21							See L for an alternative interface.
22
23							use Geo::UK::Postcode::Regex;
24
25							## REGULAR EXPRESSIONS
26
27							my $lax_re = Geo::UK::Postcode::Regex->regex;
28							my $strict_re = Geo::UK::Postcode::Regex->strict_regex;
29							my $valid_re = Geo::UK::Postcode::Regex->valid_regex;
30
31							# matching only
32							if ( $foo =~ $lax_re ) {...}
33							if ( $foo =~ $strict_re ) {...}
34							if ( $foo =~ $valid_re ) {...}
35
36							# matching and using components - see also parse()
37							if ( $foo =~ $lax_re ) {
38							my ( $area, $district, $sector, $unit ) = ( $1, $2, $3, $4 );
39							my $subdistrict = $district =~ s/([A-Z])$// ? $1 : undef;
40							...
41							}
42							if ( $foo =~ $strict_re ) {
43							my ( $area, $district, $sector, $unit ) = ( $1, $2, $3, $4 );
44							my $subdistrict = $district =~ s/([A-Z])$// ? $1 : undef;
45							...
46							}
47							if ( $foo =~ $valid_re ) {
48							my ( $outcode, $sector, $unit ) = ( $1, $2, $3 );
49							...
50							}
51
52
53							## VALIDATION METHODS
54
55							use Geo::UK::Postcode::Regex qw( is_valid_pc is_strict_pc is_lax_pc );
56
57							if (is_valid_pc("GE0 1UK")) {
58							...
59							}
60							if (is_strict_pc("GE0 1UK")) {
61							...
62							}
63							if (is_lax_pc("GE0 1UK")) {
64							...
65							}
66
67
68							## PARSING
69
70							my $parsed = Geo::UK::Postcode::Regex->parse("WC1H 9EB");
71
72							# returns:
73							# { area => 'WC',
74							# district => '1',
75							# subdistrict => 'H',
76							# sector => '9',
77							# unit => 'EB',
78							# outcode => 'WC1H',
79							# incode => '9EB',
80							# valid => 1,
81							# strict => 1,
82							# partial => 0,
83							# non_geographical => 0,
84							# bfpo => 0,
85							# }
86
87							# strict parsing (only valid characters):
88							...->parse( $pc, { strict => 1 } )
89
90							# valid outcodes only
91							...->parse( $pc, { valid => 1 } )
92
93							# match partial postcodes, e.g. 'WC1H', 'WC1H 9' - see below
94							...->parse( $pc, { partial => 1 } )
95
96
97							## PARSING PARTIAL POSTCODES
98
99							# outcode (district) only
100							my $parsed = Geo::UK::Postcode::Regex->parse( "AB10", { partial => 1 } );
101
102							# returns:
103							# { area => 'AB',
104							# district => '10',
105							# subdistrict => undef,
106							# sector => undef,
107							# unit => undef,
108							# outcode => 'AB10',
109							# incode => undef,
110							# valid => 1,
111							# strict => 1,
112							# partial => 1,
113							# non_geographical => 0,
114							# bfpo => 0,
115							# }
116
117							# sector only
118							my $parsed = Geo::UK::Postcode::Regex->parse( "AB10 1", { partial => 1 } );
119
120							# returns:
121							# { area => 'AB',
122							# district => '10',
123							# subdistrict => undef,
124							# sector => 1,
125							# unit => undef,
126							# outcode => 'AB10',
127							# incode => '1',
128							# valid => 1,
129							# strict => 1,
130							# partial => 1,
131							# non_geographical => 0,
132							# bfpo => 0,
133							# }
134
135
136							## EXTRACT OUTCODE FROM POSTCODE
137
138							my $outcode = Geo::UK::Postcode::Regex->outcode("AB101AA"); # returns 'AB10'
139
140							my $outcode = Geo::UK::Postcode::Regex->outcode( $postcode, { valid => 1 } )
141							or die "Invalid postcode";
142
143
144							## EXTRACT POSTCODES FROM TEXT
145
146							# \%options as per parse, excluding partial
147							my @extracted = Geo::UK::Postcode::Regex->extract( $text, \%options );
148
149
150							## POSTTOWNS
151							my @posttowns = Geo::UK::Postcode::Regex->outcode_to_posttowns($outcode);
152
153
154							## OUTCODES
155							my @outcodes = Geo::UK::Postcode::Regex->posttown_to_outcodes($posttown);
156
157
158							=head1 DESCRIPTION
159
160							Parsing UK postcodes with regular expressions (aka Regexp). This package has
161							been separated from L so it can be installed and used with
162							fewer dependencies.
163
164							Can handle partial postcodes (just the outcode or sector) and can test
165							against valid characters and currently valid outcodes.
166
167							Also can determine the posttown(s) from a postcode.
168
169							Districts and post town information taken from:
170							L
171
172							=head1 IMPORTANT CHANGES FOR VERSION 0.014
173
174							Please note that various bugfixes have changed the following:
175
176							=over
177
178							=item *
179
180							Unanchored regular expressions no longer match valid postcodes within invalid
181							ones.
182
183							=item *
184
185							Unanchored regular expressions in partial mode now can match a valid or strict
186							outcode with an invalid incode.
187
188							=back
189
190							Please get in touch if you have any questions.
191
192							See L for other changes affecting the Simple
193							interface.
194
195							=head1 NOTES AND LIMITATIONS
196
197							When parsing a partial postcode, whitespace may be required to separate the
198							outcode from the sector.
199
200							For example the sector 'B1 1' cannot be distinguished from the district 'B11'
201							without whitespace. This is not a problem when parsing full postcodes.
202
203							=cut
204
205							## REGULAR EXPRESSIONS
206
207							my $AREA1 = 'ABCDEFGHIJKLMNOPRSTUWYZ'; # [^QVX]
208							my $AREA2 = 'ABCDEFGHKLMNOPQRSTUVWXY'; # [^IJZ]
209
210							my $SUBDISTRICT1 = 'ABCDEFGHJKPSTUW'; # for single letter areas
211							my $SUBDISTRICT2 = 'ABEHMNPRVWXY'; # for two letter areas
212
213							my $UNIT1 = 'ABDEFGHJLNPQRSTUWXYZ'; # [^CIKMOV]
214							my $UNIT2 = 'ABDEFGHJLNPQRSTUWXYZ'; # [^CIKMOV]
215
216							our %COMPONENTS = (
217							strict => {
218							area => "[$AREA1][$AREA2]?",
219							district => qq% (?:
220							[0-9][0-9]?
221							\| (?
222							\| (?<=[A-Z]{2}) [0-9][$SUBDISTRICT2]
223							) %,
224							sector => '[0-9]',
225							unit => "[$UNIT1][$UNIT2]",
226							blank => '',
227							},
228							lax => {
229							area => '[A-Z]{1,2}',
230							district => '[0-9](?:[0-9]\|[A-Z])?',
231							sector => '[0-9]',
232							unit => '[A-Z]{2}',
233							},
234							);
235
236							my %BASE_REGEXES = (
237							full => ' %s %s \s* %s %s ',
238							partial => ' %s %s (?: \s* %s (?:%s)? ) ? ',
239							);
240
241							my ( %POSTTOWNS, %OUTCODES );
242
243							tie our %REGEXES, 'Geo::UK::Postcode::Regex::Hash', _fetch => sub {
244							my ($key) = @_;
245
246							_outcode_data() if $key =~ m/valid/ && !%OUTCODES;
247
248							my $type = $key =~ m/lax/ ? 'lax' : 'strict';
249
250							my $components = $Geo::UK::Postcode::Regex::COMPONENTS{$type};
251
252							my @comps
253							= $key =~ m/valid/
254							? @{$components}{qw( outcodes blank sector unit )}
255							: @{$components}{qw( area district sector unit )};
256
257							@comps = map { $_ ? "($_)" : $_ } @comps if $key =~ m/captures/;
258
259							my $size = $key =~ m/partial/ ? 'partial' : 'full';
260
261							my $re = sprintf( $BASE_REGEXES{$size}, @comps );
262
263							if ( $key =~ m/anchored/ ) {
264							$re = '^' . $re . '$';
265
266							} elsif ( $key =~ m/extract/ ) {
267							$re = '(?:[^0-9A-Z]\|\b) (' . $re . ') (?:[^0-9A-Z]\|\b)';
268
269							} else {
270							$re = '(?:[^0-9A-Z]\|\b) ' . $re . ' (?:[^0-9A-Z]\|\b)';
271							}
272
273							return $key =~ m/case-insensitive/ ? qr/$re/ix : qr/$re/x;
274							};
275
276							## OUTCODE AND POSTTOWN DATA
277
278							sub _outcode_data {
279	8			8		18	my %area_districts;
280
281							# get the original position in the DATA File Handle
282	8					32	my $orig_position = tell( DATA );
283							# Get outcodes from __DATA__
284	8					1051	while ( my $line = ) {
285	23944	100				81056	next unless $line =~ m/\w/;
286	23936					48321	chomp $line;
287	23936					84415	my ( $outcode, $non_geographical, @posttowns ) = split /,/, $line;
288
289	23936					54357	push @{ $POSTTOWNS{$_} }, $outcode foreach @posttowns;
	24936					83089
290	23936					148165	$OUTCODES{$outcode} = {
291							posttowns => \@posttowns,
292							non_geographical => $non_geographical,
293							};
294							}
295							# Reset position of DATA File Handle for re-reading
296	8					50	seek DATA, $orig_position, 0;
297
298							# Add in BX non-geographical outcodes
299	8					42	foreach ( 1 .. 99 ) {
300	792					2915	$OUTCODES{ 'BX' . $_ } = {
301							posttowns => [],
302							non_geographical => 1,
303							};
304							}
305
306	8					19970	foreach my $outcode ( sort keys %OUTCODES ) {
307							my ( $area, $district )
308							= $outcode =~ $REGEXES{strict_partial_anchored_captures}
309	24728	50				83253	or next;
310
311	24728	100				74052	$district = " $district" if length $district < 2;
312
313	24728					37347	push @{ $area_districts{$area}->{ substr( $district, 0, 1 ) } },
	24728					80903
314							substr( $district, 1, 1 );
315							}
316
317							$Geo::UK::Postcode::Regex::COMPONENTS{strict}->{outcodes} = '(?: ' . join(
318							"\|\n",
319							map {
320	8					3893	my $area = $_;
	1008					2594
321							sprintf(
322							"%s (?:%s)", #
323							$area,
324							join(
325							' \| ',
326							map {
327							sprintf( "%s[%s]",
328	3984					6383	$_, join( '', @{ $area_districts{$area}->{$_} } ) )
	3984					18964
329							} #
330	5641	100				12382	sort { $a eq ' ' ? 1 : $b eq ' ' ? -1 : $a <=> $b }
		100
331	1008					1488	keys %{ $area_districts{$area} }
	1008					3784
332							)
333							)
334							} sort keys %area_districts
335							) . ' )';
336
337							}
338
339							=head1 VALIDATION METHODS
340
341							The following methods are for validating postcodes to various degrees.
342
343							L may provide a more convenient way of using
344							and customising these.
345
346							=head2 regex, strict_regex, valid_regex
347
348							Return regular expressions to parse postcodes and capture the constituent
349							parts: area, district, sector and unit (or outcode, sector and unit in the
350							case of C).
351
352							C checks that the postcode only contains valid characters
353							according to the postcode specifications.
354
355							C checks that the outcode currently exists.
356
357							=head2 regex_partial, strict_regex_partial, valid_regex_partial
358
359							As above, but matches on partial postcodes of just the outcode
360							or sector
361
362							=cut
363
364	1			1	1	8	sub valid_regex_partial { $REGEXES{valid_partial_anchored_captures} }
365	1			1	1	8	sub strict_regex_partial { $REGEXES{strict_partial_anchored_captures} }
366	1			1	1	10	sub regex_partial { $REGEXES{lax_partial_anchored_captures} }
367	1			1	1	9	sub valid_regex { $REGEXES{valid_anchored_captures} }
368	1			1	1	7	sub strict_regex { $REGEXES{strict_anchored_captures} }
369	1			1	1	124	sub regex { $REGEXES{lax_anchored_captures} }
370
371
372							=head2 is_valid_pc, is_strict_pc, is_lax_pc
373
374							if (is_valid_pc( "AB1 2CD" ) ) { ... }
375
376							Alternative way to access the regexes.
377
378							=cut
379
380							sub is_valid_pc {
381	61	50		61	1	50972	my $pc = @_ > 1 ? $_[1] : $_[0]; # back-compat: can call as class method
382	61	100				488	return $pc =~ $REGEXES{valid_anchored} ? 1 : 0
383							}
384							sub is_strict_pc {
385	61	50		61	1	546803	my $pc = @_ > 1 ? $_[1] : $_[0]; # back-compat: can call as class method
386	61	100				781	return $pc =~ $REGEXES{strict_anchored} ? 1 : 0
387							}
388							sub is_lax_pc {
389	61	50		61	1	42931	my $pc = @_ > 1 ? $_[1] : $_[0]; # back-compat: can call as class method
390	61	100				497	return $pc =~ $REGEXES{lax_anchored} ? 1 : 0
391							}
392
393							=head1 PARSING METHODS
394
395							The following methods are for parsing postcodes or strings containing postcodes.
396
397							=head2 PARSING_OPTIONS
398
399							The parsing methods can take the following options, passed via a hashref:
400
401							=over
402
403							=item strict
404
405							Postcodes must not contain invalid characters according to the postcode
406							specification. For example a 'Q' may not appear as the first character.
407
408							=item valid
409
410							Postcodes must contain an outcode (area + district) that currently exists, in
411							addition to conforming to the C definition.
412
413							Returns false if string is not a currently existing outcode.
414
415							=item partial
416
417							Allows partial postcodes to be matched. In practice this means either an outcode
418							( area and district ) or an outcode together with the sector.
419
420							=back
421
422							=head2 extract
423
424							my @extracted = Geo::UK::Postcode::Regex->extract( $string, \%options );
425
426							Returns a list of full postcodes extracted from a string.
427
428							=cut
429
430							# TODO need to/can do partial?
431
432							sub extract {
433	3606			3606	1	25237	my ( $class, $string, $options ) = @_;
434
435	3606	100				9055	_outcode_data() unless %OUTCODES;
436
437							my $key
438							= $options->{valid} ? 'valid'
439	3606	100				11169	: $options->{strict} ? 'strict'
		100
440							: 'lax';
441
442	3606	100				8650	$key .= '_case-insensitive' if $options->{'case-insensitive'};
443	3606					6694	$key .= '_extract';
444
445	3606					18484	my @extracted = $string =~ m/$REGEXES{$key}/g;
446
447	3606					18595	return map {uc} @extracted;
	2016					14022
448							}
449
450							=head2 parse
451
452							my $parsed = Geo::UK::Postcode::Regex->parse( $pc, \%options );
453
454							Returns hashref of the constituent parts - see SYNOPSIS. Missing parts will be
455							set as undefined.
456
457							=cut
458
459							sub parse {
460	4040			4040	1	4655079	my ( $class, $string, $options ) = @_;
461
462	4040		100			11898	$options \|\|= {};
463
464	4040	100				12770	$string = uc $string if $options->{'case-insensitive'};
465
466							my $re
467							= $options->{partial}
468	4040	100				11587	? 'partial_anchored_captures'
469							: 'anchored_captures';
470
471	4040					24957	my ( $area, $district, $sector, $unit ) = $string =~ $REGEXES{"strict_$re"};
472
473	4040	100				15946	my $strict = $area ? 1 : 0; # matched strict?
474
475	4040	100				11123	unless ($strict) {
476	1635	100				8120	return if $options->{strict};
477
478							# try lax regex
479	565	100				3055	( $area, $district, $sector, $unit ) = $string =~ $REGEXES{"lax_$re"}
480							or return;
481							}
482
483	2563	50	66			8219	return unless $unit \|\| $options->{partial};
484
485	2563	50				6266	return unless defined $district;
486
487	2563					6079	my $outcode = $area . $district;
488	2563					8805	my $outcode_data = $class->outcodes_lookup->{$outcode};
489
490	2563	100	100			11950	return if $options->{valid} && !$outcode_data;
491
492	2340	100				11438	my $subdistrict = $district =~ s/([A-Z])$// ? $1 : undef;
493
494	2340	100				7593	my $incode = $unit ? "$sector$unit" : $sector ? $sector : undef;
		100
495
496							return {
497							area => $area,
498							district => $district,
499							subdistrict => $subdistrict,
500							sector => $sector,
501							unit => $unit,
502							outcode => $outcode,
503							incode => $incode,
504
505							strict => $strict,
506							partial => $unit ? 0 : 1,
507							valid => $outcode_data && $strict ? 1 : 0,
508
509	2340	100	100			40174	$outcode_data->{non_geographical} ? ( non_geographical => 1 ) : (),
		100
		100
		100
510							$outcode eq "BF1" ? ( bfpo => 1 ) : (),
511							};
512							}
513
514							=head2 outcode
515
516							my $outcode = Geo::UK::Postcode::Regex->outcode( $pc, \%options );
517
518							Extract the outcode (area and district) from a postcode string. Will work on
519							full or partial postcodes.
520
521							=cut
522
523							sub outcode {
524	18			18	1	772	my ( $class, $string, $options ) = @_;
525
526	18	100				27	my $parsed = $class->parse( $string, { partial => 1, %{ $options \|\| {} } } )
	18	100				88
527							or return;
528
529	14					101	return $parsed->{outcode};
530							}
531
532							=head1 LOOKUP METHODS
533
534							=head2 outcode_to_posttowns
535
536							my ( $posttown1, $posttown2, ... )
537							= Geo::UK::Postcode::Regex->outcode_to_posttowns($outcode);
538
539							Returns posttown(s) for supplied outcode.
540
541							Note - most outcodes will only have one posttown, but some are shared between
542							two posttowns.
543
544							=cut
545
546							sub outcode_to_posttowns {
547	2			2	1	1933	my ( $class, $outcode ) = @_;
548
549	2					13	my $data = $class->outcodes_lookup->{$outcode};
550
551	2	50				7	return @{ $data ? $data->{posttowns} : [] };
	2					42
552							}
553
554							=head2 posttown_to_outcodes
555
556							my @outcodes = Geo::UK::Postcode::Regex->posttown_to_outcodes($posttown);
557
558							Returns the outcodes covered by a posttown. Note some outcodes are shared
559							between posttowns.
560
561							=cut
562
563							sub posttown_to_outcodes {
564	1			1	1	1120	my ( $class, $posttown ) = @_;
565
566	1	50	50			5	return @{ $class->posttowns_lookup->{ $posttown \|\| '' } \|\| [] };
	1					7
567							}
568
569							=head2 outcodes_lookup
570
571							my %outcodes = %{ Geo::UK::Postcode::Regex->outcodes_lookup };
572							print "valid outcode" if $outcodes{$outcode};
573							my @posttowns = @{ $outcodes{$outcode} };
574
575							Hashref of outcodes to posttown(s);
576
577							=head2 posttowns_lookup
578
579							my %posttowns = %{ Geo::UK::Postcode::Regex->posttowns_lookup };
580							print "valid posttown" if $posttowns{$posttown};
581							my @outcodes = @{ $[posttowns{$posttown} };
582
583							Hashref of posttown to outcode(s);
584
585							=cut
586
587							sub outcodes_lookup {
588	5657			5657	1	14417	my $class = shift;
589
590	5657	100				12929	_outcode_data() unless %OUTCODES;
591
592	5657					21098	return \%OUTCODES;
593							}
594
595							sub posttowns_lookup {
596	1			1	1	4	my $class = shift;
597
598	1	50				8	_outcode_data() unless %POSTTOWNS;
599
600	1					24	return \%POSTTOWNS;
601							}
602
603							=head1 SEE ALSO
604
605							=over
606
607							=item *
608
609							L - companion package, provides Postcode objects
610
611							=item *
612
613							L
614
615							=item *
616
617							L
618
619							=item *
620
621							L
622
623							=item *
624
625							L
626
627							=item *
628
629							L
630
631							=back
632
633							=head1 SUPPORT
634
635							=head2 Bugs / Feature Requests
636
637							Please report any bugs or feature requests through the issue tracker
638							at L.
639							You will be notified automatically of any progress on your issue.
640
641							=head2 Source Code
642
643							This is open source software. The code repository is available for
644							public review and contribution under the terms of the license.
645
646							L
647
648							git clone git://github.com/mjemmeson/geo-uk-postcode-regex.git
649
650							=head1 AUTHOR
651
652							Michael Jemmeson Emjemmeson@cpan.orgE
653
654							=head1 CONTRIBUTORS
655
656							=over
657
658							=item *
659
660							Tom Bloor C
661
662							=back
663
664							=head1 COPYRIGHT
665
666							Copyright 2015-2017 Michael Jemmeson
667
668							=head1 LICENSE
669
670							This library is free software; you can redistribute it and/or modify
671							it under the same terms as Perl itself.
672
673							=cut
674
675							1;
676
677							__DATA__