File Coverage

blib/lib/URL/Normalize.pm

Criterion	Covered	Total	%
statement	113	118	95.7
branch	25	34	73.5
condition	9	13	69.2
subroutine	15	16	93.7
pod	12	12	100.0
total	174	193	90.1

line	stmt	bran	cond	sub	pod	time	code
1							package URL::Normalize;
2	11			11		1198103	use Moose;
	11					5376780
	11					74
3	11			11		91268	use namespace::autoclean;
	11					96948
	11					53
4
5	11			11		7511	use URI qw();
	11					66594
	11					275
6	11			11		5671	use URI::QueryParam qw();
	11					8583
	11					20521
7
8							=head1 NAME
9
10							URL::Normalize - Normalize/optimize URLs.
11
12							=head1 VERSION
13
14							Version 0.43
15
16							=cut
17
18							our $VERSION = '0.43';
19
20							=head1 SYNOPSIS
21
22							use URL::Normalize;
23
24							my $normalizer = URL::Normalize->new( 'http://www.example.com/display?lang=en&article=fred' );
25
26							# Normalize the URL
27							$normalizer->make_canonical;
28							$normalizer->remove_directory_index;
29							$normalizer->remove_empty_query;
30
31							# Get the normalized version back
32							my $url = $normalizer->url;
33
34							=cut
35
36							=head1 DESCRIPTION
37
38							When writing a web crawler, for example, it's always very costly to check if a
39							URL has been fetched/seen when you have millions or billions of URLs in a
40							database.
41
42							This module can help you create a unique "ID" of a URL, which you can use as a
43							key in a key/value-store; the key is the normalized URL, whereas all the URLs
44							that refers to the normalized URL are part of the value (normally an array or
45							hash);
46
47							'http://www.example.com/' = {
48							'http://www.example.com:80/' => 1,
49							'http://www.example.com/index.html' => 1,
50							'http://www.example.com/?' => 1,
51							}
52
53							Above, all the URLs inside the hash normalizes to the key if you run these
54							methods:
55
56							=over 4
57
58							=item * C<make_canonical>
59
60							=item * C<remove_directory_index>
61
62							=item * C<remove_empty_query>
63
64							=back
65
66							This is NOT a perfect solution.
67
68							If you normalize a URL using all the methods in this module, there is a high
69							probability that the URL will stop "working." This is merely a helper module
70							for those of you who wants to either normalize a URL using only a few of the
71							safer methods, and/or for those of you who wants to generate a possibly unique
72							"ID" from any given URL.
73
74							=head1 CONSTRUCTORS
75
76							=head2 new( $url )
77
78							Constructs a new URL::Normalize object:
79
80							my $normalizer = URL::Normalize->new( 'http://www.example.com/some/path' );
81
82							You can also send in just the path:
83
84							my $normalizer = URL::Normalize->new( '/some/path' );
85
86							The latter is NOT recommended, though, and hasn't been tested properly. You
87							should always give URL::Normalize an absolute URL by using L<URI>'s C<new_abs>.
88
89							=cut
90
91							=head1 METHODS
92
93							=cut
94
95							around BUILDARGS => sub {
96							my $orig = shift;
97							my $class = shift;
98
99							if ( @_ == 1 && !ref $_[0] ) {
100							return $class->$orig( url => $_[0] );
101							}
102							else {
103							return $class->$orig( @_ );
104							}
105							};
106
107							=head2 url
108
109							Get the current URL, preferably after you have run one or more of the
110							normalization methods.
111
112							=cut
113
114							has 'url' => (
115							isa => 'Str',
116							is => 'ro',
117							required => 1,
118							writer => '_set_url',
119							);
120
121							has 'dir_index_regexps' => (
122							traits => [ 'Array' ],
123							isa => 'ArrayRef[Str]',
124							is => 'rw',
125							handles => {
126							'add_dir_index_regexp' => 'push',
127							},
128							default => sub {
129							[
130							'/default\.aspx?',
131							'/default\.html\.aspx?',
132							'/default\.s?html?',
133							'/home\.s?html?',
134							'/index\.cgi',
135							'/index\.html\.aspx?',
136							'/index\.html\.php',
137							'/index\.jsp',
138							'/index\.php\d?',
139							'/index\.pl',
140							'/index\.s?html?',
141							'/welcome\.s?html?',
142							];
143							},
144							);
145
146							=head2 URI
147
148							Returns a L<URI> representation of the current URL.
149
150							=cut
151
152							has 'URI' => (
153							isa => 'URI',
154							is => 'ro',
155							lazy => 1,
156							default => sub {
157							my $self = shift;
158
159							my $URI = eval {
160							URI->new( $self->url )->canonical;
161							};
162
163							if ( $@ ) {
164							Carp::carp( "Failed to create a URI object from URL '" . $self->url . "'" );
165							}
166
167							return $URI;
168							},
169							);
170
171							=head2 make_canonical
172
173							Just a shortcut for URI::URL->new->canonical->as_string, and involves the
174							following steps (at least):
175
176							=over 4
177
178							=item * Converts the scheme and host to lower case.
179
180							=item * Capitalizes letters in escape sequences.
181
182							=item * Decodes percent-encoded octets of unreserved characters.
183
184							=item * Removes the default port (port 80 for http).
185
186							=back
187
188							Example:
189
190							my $normalizer = URL::Normalize->new(
191							url => 'HTTP://www.example.com:80/%7Eusername/',
192							);
193
194							$normalizer->make_canonical;
195
196							print $normalizer->url; # http://www.example.com/~username/
197
198							=cut
199
200							sub make_canonical {
201	17			17	1	61	my $self = shift;
202
203	17	50				426	if ( my $URI = $self->URI ) {
204	17					185	$self->_set_url( $URI->canonical->as_string );
205							}
206							else {
207	0					0	Carp::carp( "Can't make non-URI URLs canonical." );
208							}
209							}
210
211							=head2 remove_dot_segments
212
213							The C<.>, C<..> and C<...> segments will be removed and "folded" (or
214							"flattened", if you prefer) from the URL.
215
216							This method does NOT follow the algorithm described in L<RFC 3986: Uniform
217							Resource Indentifier\|http://tools.ietf.org/html/rfc3986>, but rather flattens
218							each path segment.
219
220							Also keep in mind that this method doesn't (because it can't) account for
221							symbolic links on the server side.
222
223							Example:
224
225							my $normalizer = URL::Normalize->new(
226							url => 'http://www.example.com/../a/b/../c/./d.html',
227							);
228
229							$normalizer->remove_dot_segments;
230
231							print $normalizer->url; # http://www.example.com/a/c/d.html
232
233							=cut
234
235							sub remove_dot_segments {
236	43			43	1	166	my $self = shift;
237
238	43	50				1092	if ( my $URI = $self->URI ) {
239	43					393	my $path = $URI->path;
240
241	43					496	my @new_segments = ();
242
243	43					162	foreach my $segment ( split('/', $path) ) {
244	169	100	100			507	if ( $segment eq '.' \|\| $segment eq '...' ) {
245	8					16	next;
246							}
247
248	161	100				283	if ( $segment eq '..' ) {
249	38					52	pop( @new_segments );
250	38					68	next;
251							}
252
253	123					224	push( @new_segments, $segment );
254							}
255
256	43					137	my $new_path = join( '/', @new_segments );
257	43	100	66			202	$new_path .= '/' if ( $new_path !~ m,/$, && $path =~ m,/$, );
258	43	100				126	$new_path = '/' . $new_path unless ( $new_path =~ m,^/, );
259
260	43					131	$URI->path( $new_path );
261
262	43					1383	$self->_set_url( $URI->as_string );
263							}
264							}
265
266							=head2 remove_directory_index
267
268							Removes well-known directory indexes, eg. C<index.html>, C<default.asp> etc.
269							This method is case-insensitive.
270
271							Example:
272
273							my $normalizer = URL::Normalize->new(
274							url => 'http://www.example.com/index.cgi?foo=/',
275							);
276
277							$normalizer->remove_directory_index;
278
279							print $normalizer->url; # http://www.example.com/?foo=/
280
281							The default regular expressions for matching a directory index are:
282
283							=over 4
284
285							=item * C<default\.aspx?>
286
287							=item * C<default\.html\.aspx?>
288
289							=item * C<default\.s?html?>
290
291							=item * C<home\.s?html?>
292
293							=item * C<index\.cgi>
294
295							=item * C<index\.html\.aspx?>
296
297							=item * C<index\.html\.php>
298
299							=item * C<index\.jsp>
300
301							=item * C<index\.php\d?>
302
303							=item * C<index\.pl>
304
305							=item * C<index\.s?html?>
306
307							=item * C<welcome\.s?html?>
308
309							=back
310
311							You can override these by sending in your own list of regular expressions
312							when creating the URL::Normalizer object:
313
314							my $normalizer = URL::Normalize->new(
315							url => 'http://www.example.com/index.cgi?foo=/',
316							dir_index_regexps => [ 'MyDirIndex\.html' ], # etc.
317							);
318
319							You can also choose to add regular expressions after the URL::Normalize
320							object has been created:
321
322							my $normalizer = URL::Normalize->new(
323							url => 'http://www.example.com/index.cgi?foo=/',
324							dir_index_regexps => [ 'MyDirIndex\.html' ], # etc.
325							);
326
327							# ...
328
329							$normalizer->add_directory_index_regexp( 'MyDirIndex\.html' );
330
331							=cut
332
333							sub remove_directory_index {
334	56			56	1	199	my $self = shift;
335
336	56	50				1497	if ( my $URI = $self->URI ) {
337	56	50				469	if ( my $path = $URI->path ) {
338	56					577	foreach my $regex ( @{$self->dir_index_regexps} ) {
	56					1579
339	672					3880	$path =~ s,$regex,/,i;
340							}
341
342	56					163	$URI->path( $path );
343							}
344
345	56					1820	$self->_set_url( $URI->as_string );
346							}
347							}
348
349							=head2 sort_query_parameters
350
351							Sorts the URL's query parameters alphabetically.
352
353							Uppercased parameters will be lowercased DURING sorting, but the parameters
354							will be in the original case AFTER sorting. If there are multiple values for
355							one parameter, the key/value-pairs will be sorted as well.
356
357							Example:
358
359							my $normalizer = URL::Normalize->new(
360							url => 'http://www.example.com/?b=2&c=3&a=0&A=1',
361							);
362
363							$normalizer->sort_query_parameters;
364
365							print $normalizer->url; # http://www.example.com/?a=0&A=1&b=2&c=3
366
367							=cut
368
369							sub sort_query_parameters {
370	15			15	1	60	my $self = shift;
371
372	15	50				389	if ( my $URI = $self->URI ) {
373	15	100				131	if ( $URI->as_string =~ m,\?, ) {
374	12		50			91	my $query_hash = $URI->query_form_hash \|\| {};
375	12					1157	my $query_string = '';
376	12					20	my %new_query_hash = ();
377
378	12					16	foreach my $key ( sort { lc($a) cmp lc($b) } keys %{$query_hash} ) {
	33					68
	12					52
379	34					57	my $values = $query_hash->{ $key };
380	34	100				61	unless ( ref $values ) {
381	32					57	$values = [ $values ];
382							}
383
384	34					41	foreach my $value ( @{$values} ) {
	34					56
385	36					41	push( @{ $new_query_hash{lc($key)}->{$value} }, $key );
	36					148
386							}
387							}
388
389	12					38	foreach my $sort_key ( sort keys %new_query_hash ) {
390	30					40	foreach my $value ( sort keys %{$new_query_hash{$sort_key}} ) {
	30					69
391	36					39	foreach my $key ( @{$new_query_hash{$sort_key}->{$value}} ) {
	36					59
392	36					76	$query_string .= $key . '=' . $value . '&';
393							}
394							}
395							}
396
397	12					46	$query_string =~ s,&$,,;
398
399	12					37	$URI->query( $query_string );
400							}
401
402	15					336	$self->_set_url( $URI->as_string );
403							}
404							}
405
406							=head2 remove_duplicate_query_parameters
407
408							Removes duplicate query parameters, i.e. where the key/value combination is
409							identical with another key/value combination.
410
411							Example:
412
413							my $normalizer = URL::Normalize->new(
414							url => 'http://www.example.com/?a=1&a=2&b=4&a=1&c=4',
415							);
416
417							$normalizer->remove_duplicate_query_parameters;
418
419							print $normalizer->url; # http://www.example.com/?a=1&a=2&b=3&c=4
420
421							=cut
422
423							sub remove_duplicate_query_parameters {
424	2			2	1	9	my $self = shift;
425
426	2	50				53	if ( my $URI = $self->URI ) {
427	2					20	my %seen = ();
428	2					5	my @new_query = ();
429
430	2					11	foreach my $key ( $URI->query_param ) {
431	6					285	my @values = $URI->query_param( $key );
432
433	6					749	foreach my $value ( @values ) {
434	10	100				29	unless ( $seen{$key}->{$value} ) {
435	8					23	push( @new_query, { key => $key, value => $value } );
436	8					24	$seen{$key}->{$value}++;
437							}
438							}
439							}
440
441	2					5	my $query_string = '';
442	2					3	foreach ( @new_query ) {
443	8					18	$query_string .= $_->{key} . '=' . $_->{value} . '&';
444							}
445
446	2					10	$query_string =~ s,&$,,;
447
448	2					7	$URI->query( $query_string );
449
450	2					69	$self->_set_url( $URI->as_string );
451							}
452							}
453
454							=head2 remove_empty_query_parameters
455
456							Removes empty query parameters, i.e. where there are keys with no value. This
457							only removes BLANK values, not values considered to be no value, like zero (0).
458
459							Example:
460
461							my $normalizer = URL::Normalize->new(
462							url => 'http://www.example.com/?a=1&b=&c=3',
463							);
464
465							$normalizer->remove_empty_query_parameters;
466
467							print $normalizer->url; # http://www.example.com/?a=1&c=3
468
469							=cut
470
471							sub remove_empty_query_parameters {
472	2			2	1	12	my $self = shift;
473
474	2	50				58	if ( my $URI = $self->URI ) {
475	2					53	foreach my $key ( $URI->query_param ) {
476	6					569	my @values = $URI->query_param( $key );
477
478	6					513	$URI->query_param_delete( $key );
479
480	6					982	foreach my $value ( @values ) {
481	7	100	66			36	if ( defined $value && length $value ) {
482	4					17	$URI->query_param_append( $key, $value );
483							}
484							}
485							}
486
487	2					267	$self->_set_url( $URI->as_string );
488							}
489							}
490
491							=head2 remove_empty_query
492
493							Removes empty query from the URL.
494
495							Example:
496
497							my $normalizer = URL::Normalize->new(
498							url => 'http://www.example.com/foo?',
499							);
500
501							$normalizer->remove_empty_query;
502
503							print $Normalize->url; # http://www.example.com/foo
504
505							=cut
506
507							sub remove_empty_query {
508	3			3	1	13	my $self = shift;
509
510	3					75	my $url = $self->url;
511	3					16	$url =~ s,\?$,,;
512
513	3					83	$self->_set_url( $url );
514							}
515
516							=head2 remove_fragment
517
518							Removes the fragment from the URL, but only if seems like they are at the end
519							of the URL.
520
521							For example C<http://www.example.com/#foo> will be translated to
522							C<http://www.example.com/>, but C<http://www.example.com/#foo/bar> will stay
523							the same.
524
525							Example:
526
527							my $normalizer = URL::Normalize->new(
528							url => 'http://www.example.com/bar.html#section1',
529							);
530
531							$normalizer->remove_fragment;
532
533							print $normalizer->url; # http://www.example.com/bar.html
534
535							=cut
536
537							sub remove_fragment {
538	26			26	1	91	my $self = shift;
539
540	26					637	my $url = $self->url;
541
542	26					114	$url =~ s{#(?:/\|[^?/]*)$}{};
543
544	26					710	$self->_set_url( $url );
545							}
546
547							=head2 remove_fragments
548
549							Like C<remove_fragment>, but removes EVERYTHING after a C<#>.
550
551							=cut
552
553							sub remove_fragments {
554	0			0	1	0	my $self = shift;
555
556	0					0	my $url = $self->url;
557
558	0					0	$url =~ s/#.*//;
559
560	0					0	$self->_set_url( $url );
561							}
562
563							=head2 remove_duplicate_slashes
564
565							Remove duplicate slashes from the URL.
566
567							Example:
568
569							my $normalizer = URL::Normalize->new(
570							url => 'http://www.example.com/foo//bar.html',
571							);
572
573							$normalizer->remove_duplicate_slashes;
574
575							print $normalizer->url; # http://www.example.com/foo/bar.html
576
577							=cut
578
579							sub remove_duplicate_slashes {
580	5			5	1	20	my $self = shift;
581
582	5	50				128	if ( my $URI = $self->URI ) {
583	5					43	my $path = $URI->path;
584
585	5					64	$path =~ s,/+,/,g;
586
587	5					16	$URI->path( $path );
588
589	5					170	$self->_set_url( $URI->as_string );
590							}
591							}
592
593							=head2 remove_query_parameter
594
595							Convenience method for removing a specific parameter from the URL. If
596							the parameter is mentioned multiple times (?a=1&a=2), all occurences
597							will be removed.
598
599							=cut
600
601							sub remove_query_parameter {
602	1			1	1	5	my $self = shift;
603	1					2	my $param = shift;
604
605	1					4	return $self->remove_query_parameters( [$param] );
606							}
607
608							=head2 remove_query_parameters
609
610							Convenience method for removing multiple parameters from the URL. If the
611							parameters are mentioned multiple times (?a=1&a=2), all occurences will be
612							removed.
613
614							=cut
615
616							sub remove_query_parameters {
617	1			1	1	2	my $self = shift;
618	1		50			4	my $params = shift \|\| [];
619
620	1	50				82	if ( my $URI = $self->URI ) {
621	1					13	foreach my $param ( @{$params} ) {
	1					4
622	1					6	$URI->query_param_delete( $param );
623							}
624
625	1					256	$self->_set_url( $URI->as_string );
626							}
627							}
628
629							=head1 SEE ALSO
630
631							=over 4
632
633							=item * L<URI::Normalize>
634
635							=item * L<URI>
636
637							=item * L<URI::URL>
638
639							=item * L<URI::QueryParam>
640
641							=item * L<RFC 3986: Uniform Resource Indentifier\|http://tools.ietf.org/html/rfc3986>
642
643							=item * L<Wikipedia: URL normalization\|http://en.wikipedia.org/wiki/URL_normalization>
644
645							=back
646
647							=head1 AUTHOR
648
649							Tore Aursand, C<< <toreau at gmail.com> >>
650
651							=head1 BUGS
652
653							Please report any bugs or feature requests to the web interface at L<https://github.com/toreau/URL-Normalize/issues>
654
655							=head1 SUPPORT
656
657							You can find documentation for this module with the perldoc command.
658
659							perldoc URL::Normalize
660
661							You can also look for information at:
662
663							=over 4
664
665							=item * AnnoCPAN: Annotated CPAN documentation
666
667							L<http://annocpan.org/dist/URL-Normalize>
668
669							=item * CPAN Ratings
670
671							L<http://cpanratings.perl.org/d/URL-Normalize>
672
673							=item * Search CPAN
674
675							L<http://search.cpan.org/dist/URL-Normalize/>
676
677							=back
678
679							=head1 LICENSE AND COPYRIGHT
680
681							The MIT License (MIT)
682
683							Copyright (c) 2012-2021 Tore Aursand
684
685							Permission is hereby granted, free of charge, to any person obtaining a copy
686							of this software and associated documentation files (the "Software"), to deal
687							in the Software without restriction, including without limitation the rights
688							to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
689							copies of the Software, and to permit persons to whom the Software is
690							furnished to do so, subject to the following conditions:
691
692							The above copyright notice and this permission notice shall be included in all
693							copies or substantial portions of the Software.
694
695							THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
696							IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
697							FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
698							AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
699							LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
700							OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
701							SOFTWARE.
702
703							=cut
704
705							__PACKAGE__->meta->make_immutable;
706
707							1; # End of URL::Normalize