File Coverage

blib/lib/String/Tagged.pm
Criterion Covered Total %
statement 534 570 93.6
branch 221 256 86.3
condition 80 92 86.9
subroutine 54 55 98.1
pod 34 34 100.0
total 923 1007 91.6


line stmt bran cond sub pod time code
1             # You may distribute under the terms of either the GNU General Public License
2             # or the Artistic License (the same terms as Perl itself)
3             #
4             # (C) Paul Evans, 2008-2023 -- leonerd@leonerd.org.uk
5              
6             package String::Tagged 0.24;
7              
8 21     21   4321875 use v5.14;
  21         90  
9 21     21   115 use warnings;
  21         171  
  21         1101  
10              
11 21     21   123 use Scalar::Util qw( blessed );
  21         36  
  21         1560  
12              
13             require String::Tagged::Extent;
14              
15             use constant {
16 21         1982 FLAG_ANCHOR_BEFORE => 0x01,
17             FLAG_ANCHOR_AFTER => 0x02,
18             FLAG_ITERATING => 0x04,
19             FLAG_DELETED => 0x08,
20 21     21   135 };
  21         42  
21              
22 21     21   108 use constant DEBUG => 0;
  21         35  
  21         1031  
23              
24             # Since we're providing overloading, we should set fallback by default
25 21     21   114 use overload fallback => 1;
  21         44  
  21         189  
26              
27             =head1 NAME
28              
29             C - string buffers with value tags on extents
30              
31             =head1 SYNOPSIS
32              
33             use String::Tagged;
34              
35             my $st = String::Tagged->new( "An important message" );
36              
37             $st->apply_tag( 3, 9, bold => 1 );
38              
39             $st->iter_substr_nooverlap(
40             sub {
41             my ( $substring, %tags ) = @_;
42              
43             print $tags{bold} ? "$substring"
44             : $substring;
45             }
46             );
47              
48             =head1 DESCRIPTION
49              
50             This module implements an object class, instances of which store a (mutable)
51             string buffer that supports tags. A tag is a name/value pair that applies to
52             some extent of the underlying string.
53              
54             The types of tag names ought to be strings, or at least values that are
55             well-behaved as strings, as the names will often be used as the keys in hashes
56             or applied to the C operator.
57              
58             The types of tag values are not restricted - any scalar will do. This could be
59             a simple integer or string, ARRAY or HASH reference, or even a CODE reference
60             containing an event handler of some kind.
61              
62             Tags may be arbitrarily overlapped. Any given offset within the string has in
63             effect, a set of uniquely named tags. Tags of different names are independent.
64             For tags of the same name, only the latest, shortest tag takes effect.
65              
66             For example, consider a string with three tags represented here:
67              
68             Here is my string with tags
69             [-------------------------] foo => 1
70             [-------] foo => 2
71             [---] bar => 3
72              
73             Every character in this string has a tag named C. The value of this tag
74             is 2 for the words C and C and the space inbetween, and 1
75             elsewhere. Additionally, the words C and C and the space between them
76             also have the tag C with a value 3.
77              
78             Since C does not understand the significance of the tag values
79             it therefore cannot detect if two neighbouring tags really contain the same
80             semantic idea. Consider the following string:
81              
82             A string with words
83             [-------] type => "message"
84             [--------] type => "message"
85              
86             This string contains two tags. C will treat this as two
87             different tag values as far as C is concerned, even
88             though C yields the same value for the C tag at any position
89             in the string. The C method may be used to merge tag extents of
90             tags that should be considered as equal.
91              
92             =head1 NAMING
93              
94             I spent a lot of time considering the name for this module. It seems that a
95             number of people across a number of languages all created similar
96             functionality, though named very differently. For the benefit of
97             keyword-based search tools and similar, here's a list of some other names this
98             sort of object might be known by:
99              
100             =over 4
101              
102             =item *
103              
104             Extents
105              
106             =item *
107              
108             Overlays
109              
110             =item *
111              
112             Attribute or attributed strings
113              
114             =item *
115              
116             Markup
117              
118             =item *
119              
120             Out-of-band data
121              
122             =back
123              
124             =cut
125              
126             *is_string_tagged =
127             # It would be nice if we could #ifdef HAVE_PERL_VERSION(...)
128             ( $] >= 5.034 ) ?
129 21     21   9951 do { eval 'use experimental "isa"; sub { $_[0] isa __PACKAGE__ }' // die $@ } :
  21     297   83882  
  21         125  
  297         1387  
130             # We can't call ->isa as a method on 5.32 because of a bug in the isa
131             # operator implementation that breaks the isa cache on the package.
132             do { sub { blessed $_[0] and UNIVERSAL::isa( $_[0], __PACKAGE__ ) } };
133              
134             =head1 CONSTRUCTOR
135              
136             =cut
137              
138             =head2 new
139              
140             $st = String::Tagged->new( $str );
141              
142             Returns a new instance of a C object. It will contain no tags.
143             If the optional C<$str> argument is supplied, the string buffer will be
144             initialised from this value.
145              
146             If C<$str> is a C object then it will be cloned, as if calling
147             the C method on it.
148              
149             =cut
150              
151             sub new
152             {
153 143     143 1 3521370 my $class = shift;
154 143         386 my ( $str ) = @_;
155              
156 143 100       3972 return $class->clone( $str ) if is_string_tagged( $str );
157              
158 122 100       398 $str = "" unless defined $str;
159              
160 122         715 return bless {
161             str => "$str",
162             tags => [],
163             }, $class;
164             }
165              
166             =head2 new_tagged
167              
168             $st = String::Tagged->new_tagged( $str, %tags );
169              
170             Shortcut for creating a new C object with the given tags
171             applied to the entire length. The tags will not be anchored at either end.
172              
173             =cut
174              
175             sub new_tagged
176             {
177 8     8 1 203098 my $class = shift;
178 8         32 my ( $str, %tags ) = @_;
179              
180 8         56 my $self = $class->new( $str );
181              
182 8         24 my $length = $self->length;
183 8         43 $self->apply_tag( 0, $length, $_ => $tags{$_} ) for keys %tags;
184              
185 8         41 return $self;
186             }
187              
188             =head2 clone (class)
189              
190             $new = String::Tagged->clone( $orig, %opts );
191              
192             Returns a new instance of C made by cloning the original,
193             subject to the options provided. The returned instance will be in the
194             requested class, which need not match the class of the original.
195              
196             The following options are recognised:
197              
198             =over 4
199              
200             =item only_tags => ARRAY
201              
202             If present, gives an ARRAY reference containing tag names. Only those tags
203             named here will be copied; others will be ignored.
204              
205             =item except_tags => ARRAY
206              
207             If present, gives an ARRAY reference containing tag names. All tags will be
208             copied except those named here.
209              
210             =item convert_tags => HASH
211              
212             If present, gives a HASH reference containing tag conversion functions. For
213             any tags in the original to be copied whose names appear in the hash, the
214             name and value are passed into the corresponding function, which should return
215             an even-sized key/value list giving a tag, or a list of tags, to apply to the
216             new clone.
217              
218             my @new_tags = $convert_tags->{$orig_name}->( $orig_name, $orig_value );
219             # Where @new_tags is ( $new_name, $new_value, $new_name_2, $new_value_2, ... );
220              
221             As a further convenience, if the value for a given tag name is a plain string
222             instead of a code reference, it gives the new name for the tag, and will be
223             applied with its existing value.
224              
225             If C is being used too, then the source names of any tags to be
226             converted must also be listed there, or they will not be copied.
227              
228             =item start => INT
229              
230             I
231              
232             Start at the given position; defaults to 0.
233              
234             =item end => INT
235              
236             I
237              
238             End after the given position; defaults to end of string. This option overrides
239             C.
240              
241             =item len => INT
242              
243             End after the given length beyond the start position; defaults to end of
244             string. This option only applies if C is not given.
245              
246             =back
247              
248             =head2 clone (instance)
249              
250             $new = $orig->clone( %args );
251              
252             Called as an instance (rather than a class) method, the newly-cloned instance
253             is returned in the same class as the original.
254              
255             =cut
256              
257             sub clone
258             {
259 57 100   57 1 2120 my ( $class, $orig ) = blessed $_[0] ?
260             ( ref $_[0], shift ) :
261             ( shift, shift );
262 57         140 my %opts = @_;
263              
264             my $only = exists $opts{only_tags} ?
265 57 100       161 { map { $_ => 1 } @{ $opts{only_tags} } } :
  1         2  
  1         2  
266             undef;
267              
268             my $except = exists $opts{except_tags} ?
269 57 50       164 { map { $_ => 1 } @{ $opts{except_tags} } } :
  0         0  
  0         0  
270             undef;
271              
272 57         107 my $convert = $opts{convert_tags};
273              
274 57         131 my $origstr = $orig->str;
275              
276 57   100     208 my $start = $opts{start} // 0;
277             my $end = $opts{end} //
278             ( defined $opts{len} ? $start + $opts{len}
279 57 100 100     255 : length $origstr );
280              
281 57         105 my $len = $end - $start;
282              
283 57         232 my $new = $class->new( substr $origstr, $start, $end - $start );
284              
285 57         138 my $tags = $orig->{tags};
286              
287             # We know we're only looking
288 57         149 foreach my $t ( @$tags ) {
289 64         151 my ( $ts, $te, $tn, $tv, $tf ) = @$t;
290              
291 64 100       142 next if $te < $start;
292 59 100       120 last if $ts >= $end;
293              
294 56 100 100     164 next if $only and not $only->{$tn};
295 55 0 33     143 next if $except and $except->{$tn};
296              
297 55         87 my @tags;
298 55 100 66     215 if( $convert and my $c = $convert->{$tn} ) {
299 2 100       4 if( ref $c eq "CODE" ) {
300 1         4 @tags = $c->( $tn, $tv );
301             }
302             else {
303 1         3 @tags = ( $c, $tv );
304             }
305             }
306             else {
307 53         125 @tags = ( $tn, $tv );
308             }
309              
310 55         147 $_ -= $start for $ts, $te;
311              
312 55 100       117 my $tl = $te - ( $ts < 0 ? 0 : $ts );
313              
314 55 100       127 next if $te <= 0;
315 54 100 100     255 $ts = -1 if $ts < 0 or $tf & FLAG_ANCHOR_BEFORE;
316 54 100 100     202 $tl = -1 if $te > $len or $tf & FLAG_ANCHOR_AFTER;
317              
318 54         124 while( @tags ) {
319 54         173 $new->apply_tag( $ts, $tl, shift @tags, shift @tags );
320             }
321             }
322              
323 57         259 return $new;
324             }
325              
326             sub _mkextent
327             {
328 196     196   291 my $self = shift;
329 196         392 my ( $start, $end, $flags ) = @_;
330              
331 196         302 $flags &= (FLAG_ANCHOR_BEFORE|FLAG_ANCHOR_AFTER);
332              
333 196         970 return bless [ $self, $start, $end, $flags ], 'String::Tagged::Extent';
334             }
335              
336             =head2 from_sprintf
337              
338             $str = String::Tagged->from_sprintf( $format, @args );
339              
340             I
341              
342             Returns a new instance of a C object, initialised by
343             formatting the supplied arguments using the supplied format.
344              
345             The C<$format> string is similar to that supported by the core C
346             operator, though a few features such as out-of-order argument indexing and
347             vector formatting are missing. This format string may be a plain perl string,
348             or an instance of C. In the latter case, any tags within it
349             are preserved in the result.
350              
351             In the case of a C<%s> conversion, the value of the argument consumed may
352             itself be a C instance. In this case it will be appended to
353             the returned object, preserving any tags within it.
354              
355             All other conversions are handled individually by the core C
356             operator and appended to the result.
357              
358             =cut
359              
360             sub from_sprintf
361             {
362 12     12 1 232234 my $class = shift;
363 12         47 my ( $format, @args ) = @_;
364              
365             # Clone the format string into the candidate return value, and then
366             # repeatedly replace %... expansions with their required value using
367             # ->set_substr, so that embedded tags in the format will behave sensibly.
368              
369 12 100       473 my $ret = ( is_string_tagged( $format ) ) ?
370             $class->clone( $format ) :
371             $class->new( $format );
372              
373 12         26 my $pos = 0;
374              
375 12         82 while( $pos < length $ret ) {
376 22         66 my $str = "$ret";
377 22         70 pos( $str ) = $pos;
378              
379 22         41 my $replacement;
380              
381 22 100       242 if( $str =~ m/\G[^%]+/gc ) {
    100          
    100          
    50          
    0          
382             # A literal span
383 9         26 $pos = $+[0];
384 9         28 next;
385             }
386             elsif( $str =~ m/\G%%/gc ) {
387             # A literal %% conversion
388 1         3 $replacement = "%";
389             }
390             elsif( $str =~ m/\G%([-]?)(\d+|\*)?(?:\.(\d+|\*))?s/gc ) {
391             # A string
392 10         50 my ( $flags, $width, $precision ) = ( $1, $2, $3 );
393 10 100 100     43 $width = shift @args if defined $width and $width eq "*";
394 10 100 100     66 $precision = shift @args if defined $precision and $precision eq "*";
395 10         22 my $arg = shift @args;
396              
397 10 50       26 defined $arg or do {
398 0         0 warnings::warnif( uninitialized => "Use of ininitialized value in String::Tagged->from_sprintf" );
399 0         0 $arg = "";
400             };
401              
402 10 100       23 if( defined $precision ) {
403 2 50       59 if( is_string_tagged( $arg ) ) {
404 0         0 $arg = $arg->substr( 0, $precision );
405             }
406             else {
407 2         6 $arg = substr $arg, 0, $precision;
408             }
409             }
410              
411 10         20 my $leftalign = $flags =~ m/-/;
412              
413 10 100       27 my $padding = defined $width ? $width - length $arg : 0;
414 10 100       21 $padding = 0 if $padding < 0;
415              
416 10         18 $replacement = "";
417              
418 10 100       51 $replacement .= " " x $padding if !$leftalign;
419              
420 10         19 $replacement .= $arg;
421              
422 10 100       30 $replacement .= " " x $padding if $leftalign;
423             }
424             elsif( $str =~ m/\G%(.*?)([cduoxefgXEGbBpaAiDUOF])/gc ) {
425             # Another conversion format
426 2         14 my ( $template, $flags ) = ( $2, $1 );
427 2         4 my $argc = 1;
428 2         7 $argc += ( () = $flags =~ m/\*/g );
429              
430 2         14 $replacement = sprintf "%$flags$template", @args[0..$argc-1];
431 2         7 splice @args, 0, $argc;
432             }
433             elsif( $str =~ m/\G%(.*?)([a-zA-Z])/gc ) {
434 0         0 warn "Unrecognised sprintf conversion %$2";
435             }
436             else {
437             # must be at EOF now
438 0         0 last;
439             }
440              
441 13         52 my $templatelen = $+[0] - $-[0];
442 13         64 $ret->set_substr( $-[0], $templatelen, $replacement );
443              
444 13         52 $pos += length( $replacement );
445             }
446              
447 12         74 return $ret;
448             }
449              
450             =head2 join
451              
452             $str = String::Tagged->join( $sep, @parts );
453              
454             I
455              
456             Returns a new instance of a C object, formed by concatenating
457             each of the component piece together, joined with the separator string.
458              
459             The result will be much like the core C function, except that it will
460             preserve tags in the resulting string.
461              
462             =cut
463              
464             sub join
465             {
466 1     1 1 3 my $class = shift;
467 1         4 my ( $sep, @parts ) = @_;
468              
469 1 50       23 is_string_tagged( $sep ) or
470             $sep = $class->new( $sep );
471              
472 1         3 my $ret = shift @parts;
473 1         6 $ret .= $sep . $_ for @parts;
474              
475 1         6 return $ret;
476             }
477              
478             =head1 METHODS
479              
480             =cut
481              
482             =head2 str
483              
484             $str = $st->str;
485              
486             $str = "$st";
487              
488             Returns the plain string contained within the object.
489              
490             This method is also called for stringification; so the C
491             object can be used in a plain string interpolation such as
492              
493             my $message = String::Tagged->new( "Hello world" );
494             print "My message is $message\n";
495              
496             =cut
497              
498 21     21   35343 use overload '""' => 'str';
  21         82  
  21         167  
499              
500             sub str
501             {
502 230     230 1 24174 my $self = shift;
503 230         845 return $self->{str};
504             }
505              
506             =head2 length
507              
508             $len = $st->length;
509              
510             $len = length( $st );
511              
512             Returns the length of the plain string. Because stringification works on this
513             object class, the normal core C function works correctly on it.
514              
515             =cut
516              
517             sub length
518             {
519 422     422 1 527 my $self = shift;
520 422         1205 return CORE::length $self->{str};
521             }
522              
523             =head2 substr
524              
525             $str = $st->substr( $start, $len );
526              
527             Returns a C instance representing a section from within the
528             given string, containing all the same tags at the same conceptual positions.
529              
530             =cut
531              
532             sub substr
533             {
534 27     27 1 50 my $self = shift;
535 27         53 my ( $start, $len ) = @_;
536              
537 27         66 return $self->clone( start => $start, len => $len );
538             }
539              
540             =head2 plain_substr
541              
542             $str = $st->plain_substr( $start, $len );
543              
544             Returns as a plain perl string, the substring at the given position. This will
545             be the same string data as returned by C, only as a plain string
546             without the tags
547              
548             =cut
549              
550             sub plain_substr
551             {
552 23     23 1 39 my $self = shift;
553 23         84 my ( $start, $len ) = @_;
554              
555 23         128 return CORE::substr( $self->{str}, $start, $len );
556             }
557              
558             sub _cmp_tags
559             {
560 154     154   257 my ( $as, $ae ) = @$a;
561 154         202 my ( $bs, $be ) = @$b;
562              
563             # Sort by start first; shortest first
564 154   100     513 return $as <=> $bs ||
565             $ae <=> $be;
566             }
567              
568             sub _assert_sorted
569             {
570 0     0   0 my $self = shift;
571              
572 0         0 my $tags = $self->{tags};
573             # If fewer than 2 tags, must be sorted
574 0 0       0 return if @$tags < 2;
575              
576 0         0 my $prev = $tags->[0];
577              
578 0         0 for( my $i = 1; $i < @$tags; $i++ ) {
579 0         0 my $here = $tags->[$i];
580 0         0 local ( $a, $b ) = ( $prev, $here );
581 0 0       0 if( _cmp_tags() <= 0 ) {
582 0         0 $prev = $here;
583 0         0 next;
584             }
585              
586 0         0 print STDERR "Tag order violation at i=$i\n";
587 0         0 print STDERR "[@{[ $i - 1 ]}] = [ $tags->[$i-1]->[0], $tags->[$i-1]->[1] ]\n";
  0         0  
588 0         0 print STDERR "[@{[ $i ]}] = [ $tags->[$i]->[0], $tags->[$i]->[1] ]\n";
  0         0  
589 0         0 die "Assert failure";
590             }
591             }
592              
593             sub _insert_tag
594             {
595 162     162   213 my $self = shift;
596 162         307 my ( $start, $end, $name, $value, $flags ) = @_;
597              
598 162         273 my $tags = $self->{tags};
599              
600 162         351 my $newtag = [ $start, $end, $name => $value, $flags ];
601              
602             # Specialcase - if there's no tags yet, just push it
603 162 100       377 if( @$tags == 0 ) {
604 84         173 push @$tags, $newtag;
605 84         192 return;
606             }
607              
608 78         160 local $a = $newtag;
609              
610             # Two more special cases - it's quite likely we're either inserting an
611             # 'everywhere' tag, or appending one to the end. Check the endpoints first
612 78         108 local $b;
613              
614 78         120 $b = $tags->[0];
615 78 100       148 if( _cmp_tags() <= 0 ) {
616 14         34 unshift @$tags, $newtag;
617 14         33 return;
618             }
619              
620 64         103 $b = $tags->[-1];
621 64 100       105 if( _cmp_tags() >= 0 ) {
622 62         114 push @$tags, $newtag;
623 62         143 return;
624             }
625              
626 2         6 my $range_start = 0;
627 2         4 my $range_end = $#$tags;
628              
629 2         4 my $inspos;
630              
631 2         7 while( $range_end > $range_start ) {
632 2         8 my $i = int( ( $range_start + $range_end ) / 2 );
633              
634 2         4 $b = $tags->[$i];
635 2         4 my $cmp = _cmp_tags;
636              
637 2 50       64 if( $cmp > 0 ) {
    0          
638 2         6 $range_start = $i + 1;
639             }
640             elsif( $cmp < 0 ) {
641 0         0 $range_end = $i; # open interval
642             }
643             else {
644 0         0 $inspos = $i;
645 0         0 last;
646             }
647              
648 2 50       7 if( $range_start == $range_end ) {
649 2         2 $inspos = $range_start;
650 2         5 last;
651             }
652             }
653              
654 2 50       6 $inspos = $range_end unless defined $inspos;
655              
656 2 50       14 $inspos = 0 if $inspos < 0;
657 2 50       8 $inspos = @$tags if $inspos > @$tags;
658              
659 2         8 splice @$tags, $inspos, 0, $newtag;
660              
661 2         6 $self->_assert_sorted if DEBUG;
662             }
663              
664             =head2 apply_tag
665              
666             $st->apply_tag( $start, $len, $name, $value );
667              
668             Apply the named tag value to the given extent. The tag will start on the
669             character at the C<$start> index, and continue for the next C<$len>
670             characters.
671              
672             If C<$start> is given as -1, the tag will be considered to start "before" the
673             actual string. If C<$len> is given as -1, the tag will be considered to
674             end "after" end of the actual string. These special limits are used by
675             C when deciding whether to move a tag boundary. The start of any
676             tag that starts "before" the string is never moved, even if more text is
677             inserted at the beginning. Similarly, a tag which ends "after" the end of the
678             string, will continue to the end even if more text is appended.
679              
680             This method returns the C<$st> object.
681              
682             $st->apply_tag( $e, $name, $value )
683              
684             Alternatively, an existing L object can be passed as
685             the first argument instead of two integers. The new tag will apply at the
686             given extent.
687              
688             =cut
689              
690             sub apply_tag
691             {
692 159     159 1 9789 my $self = shift;
693 159         260 my ( $start, $end );
694 159         207 my $flags = 0;
695              
696 159 50       338 if( blessed $_[0] ) {
697 0         0 my $e = shift;
698 0         0 $start = $e->start;
699 0         0 $end = $e->end;
700              
701 0 0       0 $flags |= FLAG_ANCHOR_BEFORE if $e->anchor_before;
702 0 0       0 $flags |= FLAG_ANCHOR_AFTER if $e->anchor_after;
703             }
704             else {
705 159         247 $start = shift;
706 159         204 my $len = shift;
707              
708 159         330 my $strlen = $self->length;
709              
710 159 100       373 if( $start < 0 ) {
711 36         57 $start = 0;
712 36         90 $flags |= FLAG_ANCHOR_BEFORE;
713             }
714              
715 159 100       295 if( $len == -1 ) {
716 37         53 $end = $strlen;
717 37         59 $flags |= FLAG_ANCHOR_AFTER;
718             }
719             else {
720 122         164 $end = $start + $len;
721 122 100       273 $end = $strlen if $end > $strlen;
722             }
723             }
724              
725 159         285 my ( $name, $value ) = @_;
726              
727 159         440 $self->_insert_tag( $start, $end, $name, $value, $flags );
728              
729 159         437 return $self;
730             }
731              
732             sub _remove_tag
733             {
734 9     9   9 my $self = shift;
735 9         10 my $keepends = shift;
736 9         9 my ( $start, $end );
737              
738 9 100       17 if( blessed $_[0] ) {
739 5         4 my $e = shift;
740 5         9 $start = $e->start;
741 5         6 $end = $e->end;
742             }
743             else {
744 4         4 $start = shift;
745 4         5 $end = $start + shift;
746             }
747              
748 9         11 my ( $name ) = @_;
749              
750 9 100       18 if( my $t = $self->{iterating} ) {
751 5         6 my ( $ts, $te, $tn ) = @$t;
752 5 50 33     18 if( $start == $ts and $end == $te and $name eq $tn ) {
      33        
753 5         6 $t->[4] |= FLAG_DELETED;
754 5         6 return;
755             }
756             }
757              
758 4         5 my $tags = $self->{tags};
759              
760 4         4 my $have_added = 0;
761              
762             # Can't foreach() because we modify $i
763 4         9 for( my $i = 0; $i < @$tags; $i++ ) {
764 8         8 my ( $ts, $te, $tn, $tv, $tf ) = @{ $tags->[$i] };
  8         16  
765              
766 8 100       14 next if $te <= $start;
767 7 100       26 last if $ts >= $end;
768              
769 4 50       6 next if $tn ne $name;
770              
771 4 100 100     13 if( $keepends and $end < $te ) {
772 2         6 $self->_insert_tag( $end, $te, $tn, $tv, $tf & ~(FLAG_ANCHOR_BEFORE|FLAG_ITERATING) );
773 2         2 $have_added = 1;
774             }
775              
776 4 50       9 if( $tf & FLAG_ITERATING ) {
777 0         0 die "ARGH encountered FLAG_ITERATING while walking the list of tags during ->_remove_tag";
778             }
779              
780 4         5 splice @$tags, $i, 1;
781              
782 4 100 100     12 if( $keepends and $ts < $start ) {
783 1         3 $self->_insert_tag( $ts, $start, $tn, $tv, $tf & ~(FLAG_ANCHOR_AFTER|FLAG_ITERATING) );
784 1         1 $have_added = 1;
785             }
786             else {
787 3         6 $i--;
788             }
789             }
790              
791 4         4 if( DEBUG && $have_added ) {
792             $self->_assert_sorted;
793             }
794              
795 4         15 return $self;
796             }
797              
798             =head2 unapply_tag
799              
800             $st->unapply_tag( $start, $len, $name );
801              
802             Unapply the named tag value from the given extent. If the tag extends beyond
803             this extent, then any partial fragment of the tag will be left in the string.
804              
805             This method returns the C<$st> object.
806              
807             $st->unapply_tag( $e, $name );
808              
809             Alternatively, an existing L object can be passed as
810             the first argument instead of two integers.
811              
812             =cut
813              
814             sub unapply_tag
815             {
816 3     3 1 5 my $self = shift;
817 3         7 return $self->_remove_tag( 1, @_ );
818             }
819              
820             =head2 delete_tag
821              
822             $st->delete_tag( $start, $len, $name );
823              
824             Delete the named tag within the given extent. Entire tags are removed, even if
825             they extend beyond this extent.
826              
827             This method returns the C<$st> object.
828              
829             $st->delete_tag( $e, $name );
830              
831             Alternatively, an existing L object can be passed as
832             the first argument instead of two integers.
833              
834             =cut
835              
836             sub delete_tag
837             {
838 6     6 1 7472 my $self = shift;
839 6         10 return $self->_remove_tag( 0, @_ );
840             }
841              
842             =head2 delete_all_tag
843              
844             $st->delete_all_tag( $name );
845              
846             I
847              
848             Deletes every tag with the given name. This is more efficient than calling
849             C to list the tags then C on each one individually
850             in the case of a simple name match.
851              
852             This method returns the C<$st> object.
853              
854             =cut
855              
856             sub delete_all_tag
857             {
858 1     1 1 4 my $self = shift;
859 1         3 my ( $name ) = @_;
860              
861 1         1 my $tags = $self->{tags};
862              
863 1         5 for( my $i = 0; $i < @$tags; $i++ ) {
864 3         3 my ( $ts, $te, $tn, $tv, $tf ) = @{ $tags->[$i] };
  3         5  
865              
866 3 100       6 next if $tn ne $name;
867              
868 2         2 splice @$tags, $i, 1, ();
869 2         5 $i--;
870             }
871              
872 1         2 return $self;
873             }
874              
875             =head2 merge_tags
876              
877             $st->merge_tags( $eqsub );
878              
879             Merge neighbouring or overlapping tags of the same name and equal values.
880              
881             For each pair of tags of the same name that apply on neighbouring or
882             overlapping extents, the C<$eqsub> callback is called, as
883              
884             $equal = $eqsub->( $name, $value_a, $value_b );
885              
886             If this function returns true then the tags are merged.
887              
888             The equallity test function is free to perform any comparison of the values
889             that may be relevant to the application; for example it may deeply compare
890             referred structures and check for equivalence in some application-defined
891             manner. In this case, the first tag of a pair is retained, the second is
892             deleted. This may be relevant if the tag value is a reference to some object.
893              
894             =cut
895              
896             sub merge_tags
897             {
898 8     8 1 8341 my $self = shift;
899 8         21 my ( $eqsub ) = @_;
900              
901 8         14 my $tags = $self->{tags};
902              
903             # Can't foreach() because we modify @$tags
904 8         29 OUTER: for( my $i = 0; $i < @$tags; $i++ ) {
905 13         49 my ( $ts, $te, $tn, $tv, $tf ) = @{ $tags->[$i] };
  13         35  
906              
907 13         51 for( my $j = $i+1; $j < @$tags; $j++ ) {
908 11         17 my ( $t2s, $t2e, $t2n, $t2v, $t2f ) = @{ $tags->[$j] };
  11         26  
909              
910 11 100       29 last if $t2s > $te;
911 10 50       26 next unless $t2s <= $te;
912 10 100       33 next unless $t2n eq $tn;
913              
914 7 100       20 last unless $eqsub->( $tn, $tv, $t2v );
915              
916             # Need to delete the tag at $j, extend the end of the tag at $i, and
917             # possibly move $i later
918 6         34 splice @$tags, $j, 1, ();
919 6         14 $j--;
920              
921 6         12 $te = $tags->[$i][1] = $t2e;
922              
923 6 100       19 $tags->[$i][4] |= FLAG_ANCHOR_AFTER if $t2f & FLAG_ANCHOR_AFTER;
924              
925 6         34 local $a = $tags->[$i];
926              
927 6 100 100     60 if( local $b = $tags->[$i+1] and _cmp_tags() > 0 ) {
928 1         3 my $newpos = $i+1;
929 1   33     6 while( local $b = $tags->[$newpos ] and _cmp_tags() <= 0 ) {
930 0         0 $newpos++;
931             }
932              
933 1         5 splice @$tags, $newpos, 0, splice @$tags, $i, 1, ();
934              
935 1         4 redo OUTER;
936             }
937             }
938             }
939             }
940              
941             =head2 iter_extents
942              
943             $st->iter_extents( $callback, %opts );
944              
945             Iterate the tags stored in the string. For each tag, the CODE reference in
946             C<$callback> is invoked once, being passed a L object
947             that represents the extent of the tag.
948              
949             $callback->( $extent, $tagname, $tagvalue );
950              
951             Options passed in C<%opts> may include:
952              
953             =over 4
954              
955             =item start => INT
956              
957             Start at the given position; defaults to 0.
958              
959             =item end => INT
960              
961             End after the given position; defaults to end of string. This option overrides
962             C.
963              
964             =item len => INT
965              
966             End after the given length beyond the start position; defaults to end of
967             string. This option only applies if C is not given.
968              
969             =item only => ARRAY
970              
971             Select only the tags named in the given ARRAY reference.
972              
973             =item except => ARRAY
974              
975             Select all the tags except those named in the given ARRAY reference.
976              
977             =back
978              
979             I it is safe to call C from within the
980             callback function to remove the tag currently being iterated on.
981              
982             $str->iter_extents( sub {
983             my ( $e, $n, $v ) = @_;
984             $str->delete_tag( $e, $n ) if $n =~ m/^tmp_/;
985             } );
986              
987             Apart from this scenario, the tags in the string should not otherwise be added
988             or removed while the iteration is occurring.
989              
990             =cut
991              
992             sub iter_extents
993             {
994 52     52 1 148 my $self = shift;
995 52         111 my ( $callback, %opts ) = @_;
996              
997             my $start = exists $opts{start} ? $opts{start} :
998 52 100       176 0;
999              
1000             my $end = exists $opts{end} ? $opts{end} :
1001             exists $opts{len} ? $start + $opts{len} :
1002 52 50       201 $self->length + 1; # so as to include zerolen at end
    100          
1003              
1004 52 100       125 my $only = exists $opts{only} ? { map { $_ => 1 } @{ $opts{only} } } :
  1         4  
  1         4  
1005             undef;
1006              
1007 52 100       133 my $except = exists $opts{except} ? { map { $_ => 1 } @{ $opts{except} } } :
  1         5  
  1         3  
1008             undef;
1009              
1010 52         147 my $tags = $self->{tags};
1011              
1012 52         177 for ( my $i = 0; $i < @$tags; $i++ ) {
1013 76         146 my $t = $tags->[$i];
1014 76         170 my ( $ts, $te, $tn, $tv, $tf ) = @$t;
1015              
1016 76 100       224 next if $te < $start;
1017 75 100       152 last if $ts >= $end;
1018              
1019 74 100 100     189 next if $only and !$only->{$tn};
1020 72 100 100     197 next if $except and $except->{$tn};
1021              
1022 71         141 $t->[4] |= FLAG_ITERATING;
1023 71         174 local $self->{iterating} = $t;
1024              
1025 71         190 $callback->( $self->_mkextent( $ts, $te, $tf ), $tn, $tv );
1026              
1027 71         3951 $t->[4] &= ~FLAG_ITERATING;
1028              
1029 71 100       380 if( $t->[4] & FLAG_DELETED ) {
1030 5         6 splice @$tags, $i, 1, ();
1031 5         11 $i--;
1032             }
1033             }
1034             }
1035              
1036             =head2 iter_tags
1037              
1038             $st->iter_tags( $callback, %opts );
1039              
1040             Iterate the tags stored in the string. For each tag, the CODE reference in
1041             C<$callback> is invoked once, being passed the start point and length of the
1042             tag.
1043              
1044             $callback->( $start, $length, $tagname, $tagvalue );
1045              
1046             Options passed in C<%opts> are the same as for C.
1047              
1048             =cut
1049              
1050             sub iter_tags
1051             {
1052 25     25 1 13614 my $self = shift;
1053 25         102 my ( $callback, %opts ) = @_;
1054              
1055             $self->iter_extents(
1056             sub {
1057 36     36   68 my ( $e, $tn, $tv ) = @_;
1058 36         127 $callback->( $e->start, $e->length, $tn, $tv );
1059             },
1060 25         175 %opts
1061             );
1062             }
1063              
1064             =head2 iter_extents_nooverlap
1065              
1066             $st->iter_extents_nooverlap( $callback, %opts );
1067              
1068             Iterate non-overlapping extents of tags stored in the string. The CODE
1069             reference in C<$callback> is invoked for each extent in the string where no
1070             tags change. The entire set of tags active in that extent is given to the
1071             callback. Because the extent covers possibly-multiple tags, it will not define
1072             the C and C flags.
1073              
1074             $callback->( $extent, %tags );
1075              
1076             The callback will be invoked over the entire length of the string, including
1077             any extents with no tags applied.
1078              
1079             Options may be passed in C<%opts> to control the range of the string iterated
1080             over, in the same way as the C method.
1081              
1082             If the C or C filters are applied, then only the tags that
1083             survive filtering will be present in the C<%tags> hash. Tags that are excluded
1084             by the filtering will not be present, nor will their bounds be used to split
1085             the string into extents.
1086              
1087             =cut
1088              
1089             sub iter_extents_nooverlap
1090             {
1091 36     36 1 77 my $self = shift;
1092 36         70 my ( $callback, %opts ) = @_;
1093              
1094             my $start = exists $opts{start} ? $opts{start} :
1095 36 100       110 0;
1096              
1097             my $end = exists $opts{end} ? $opts{end} :
1098             exists $opts{len} ? $start + $opts{len} :
1099 36 50       176 $self->length;
    100          
1100              
1101 36 100       109 my $only = exists $opts{only} ? { map { $_ => 1 } @{ $opts{only} } } :
  1         46  
  1         4  
1102             undef;
1103              
1104 36 100       86 my $except = exists $opts{except} ? { map { $_ => 1 } @{ $opts{except} } } :
  1         6  
  1         4  
1105             undef;
1106              
1107 36         99 my $tags = $self->{tags};
1108              
1109 36         91 my @active; # ARRAY of [ $ts, $te, $tn, $tv ]
1110 36         55 my $pos = $start;
1111              
1112 36         90 foreach my $t ( @$tags ) {
1113 78         14218 my ( $ts, $te, $tn, $tv ) = @$t;
1114              
1115 78 100       162 next if $te < $start;
1116 76 100       170 last if $ts > $end;
1117              
1118 74 100 100     200 next if $only and !$only->{$tn};
1119 72 100 100     147 next if $except and $except->{$tn};
1120              
1121 71         145 while( $pos < $ts ) {
1122 45         92 my %activetags;
1123             my %tagends;
1124 45         79 my $rangeend = $ts;
1125              
1126 45         82 foreach ( @active ) {
1127 42         91 my ( undef, $e, $n, $v ) = @$_;
1128              
1129 42 100       104 $e < $rangeend and $rangeend = $e;
1130 42 100 66     115 next if $tagends{$n} and $tagends{$n} < $e;
1131              
1132 41         86 $activetags{$n} = $v;
1133 41         114 $tagends{$n} = $e;
1134             }
1135              
1136 45         117 $callback->( $self->_mkextent( $pos, $rangeend, 0 ), %activetags );
1137              
1138 45         529 $pos = $rangeend;
1139 45         100 @active = grep { $_->[1] > $pos } @active;
  42         169  
1140             }
1141              
1142 71         234 push @active, [ $ts, $te, $tn, $tv ];
1143             }
1144              
1145 36         112 while( $pos < $end ) {
1146 56         127 my %activetags;
1147             my %tagends;
1148 56         325 my $rangeend = $end;
1149              
1150 56         95 foreach ( @active ) {
1151 74         140 my ( undef, $e, $n, $v ) = @$_;
1152              
1153 74 100       1306 $e < $rangeend and $rangeend = $e;
1154 74 100 100     228 next if $tagends{$n} and $tagends{$n} < $e;
1155              
1156 73         142 $activetags{$n} = $v;
1157 73         141 $tagends{$n} = $e;
1158             }
1159              
1160 56         129 $callback->( $self->_mkextent( $pos, $rangeend, 0 ), %activetags );
1161              
1162 56         1527 $pos = $rangeend;
1163 56         101 @active = grep { $_->[1] > $pos } @active;
  74         238  
1164             }
1165              
1166             # We might have zero-length tags active at the very end of the range
1167 36 100       155 if( my @zerolen = grep { $_->[0] == $pos and $_->[1] == $pos } @active ) {
  3 100       31  
1168 1         3 my %activetags;
1169 1         3 foreach ( @active ) {
1170 1         3 my ( undef, undef, $n, $v ) = @$_;
1171              
1172 1         5 $activetags{$n} = $v;
1173             }
1174              
1175 1         4 $callback->( $self->_mkextent( $pos, $pos, 0 ), %activetags );
1176             }
1177             }
1178              
1179             =head2 iter_tags_nooverlap
1180              
1181             $st->iter_tags_nooverlap( $callback, %opts );
1182              
1183             Iterate extents of the string using C, but passing
1184             the start and length of each extent to the callback instead of the extent
1185             object.
1186              
1187             $callback->( $start, $length, %tags );
1188              
1189             Options may be passed in C<%opts> to control the range of the string iterated
1190             over, in the same way as the C method.
1191              
1192             =cut
1193              
1194             sub iter_tags_nooverlap
1195             {
1196 30     30 1 12674 my $self = shift;
1197 30         79 my ( $callback, %opts ) = @_;
1198              
1199             $self->iter_extents_nooverlap(
1200             sub {
1201 85     85   188 my ( $e, %tags ) = @_;
1202 85         229 $callback->( $e->start, $e->length, %tags );
1203             },
1204 30         249 %opts
1205             );
1206             }
1207              
1208             =head2 iter_substr_nooverlap
1209              
1210             $st->iter_substr_nooverlap( $callback, %opts );
1211              
1212             Iterate extents of the string using C, but passing the
1213             substring of data instead of the extent object.
1214              
1215             $callback->( $substr, %tags );
1216              
1217             Options may be passed in C<%opts> to control the range of the string iterated
1218             over, in the same way as the C method.
1219              
1220             =cut
1221              
1222             sub iter_substr_nooverlap
1223             {
1224 6     6 1 8357 my $self = shift;
1225 6         39 my ( $callback, %opts ) = @_;
1226              
1227             $self->iter_extents_nooverlap(
1228             sub {
1229 17     17   11082 my ( $e, %tags ) = @_;
1230 17         63 $callback->( $e->plain_substr, %tags );
1231             },
1232 6         49 %opts,
1233             );
1234             }
1235              
1236             =head2 tagnames
1237              
1238             @names = $st->tagnames;
1239              
1240             Returns the set of tag names used in the string, in no particular order.
1241              
1242             =cut
1243              
1244             sub tagnames
1245             {
1246 14     14 1 85 my $self = shift;
1247              
1248 14         82 my $tags = $self->{tags};
1249              
1250 14         26 my %tags;
1251 14         30 foreach my $t ( @$tags ) {
1252 15         42 $tags{$t->[2]}++;
1253             }
1254              
1255 14         105 keys %tags;
1256             }
1257              
1258             =head2 get_tags_at
1259              
1260             $tags = $st->get_tags_at( $pos );
1261              
1262             Returns a HASH reference of all the tag values active at the given position.
1263              
1264             =cut
1265              
1266             sub get_tags_at
1267             {
1268 11     11 1 1956 my $self = shift;
1269 11         18 my ( $pos ) = @_;
1270              
1271 11         16 my $tags = $self->{tags};
1272              
1273 11         20 my %tags;
1274              
1275             # TODO: turn this into a binary search
1276 11         35 foreach my $t ( @$tags ) {
1277 14         34 my ( $ts, $te, $tn, $tv ) = @$t;
1278              
1279 14 100       35 last if $ts > $pos;
1280 11 100       22 next if $te <= $pos;
1281              
1282 10         29 $tags{$tn} = $tv;
1283             }
1284              
1285 11         51 return \%tags;
1286             }
1287              
1288             =head2 get_tag_at
1289              
1290             $value = $st->get_tag_at( $pos, $name );
1291              
1292             Returns the value of the named tag at the given position, or C if the
1293             tag is not applied there.
1294              
1295             =cut
1296              
1297             sub get_tag_at
1298             {
1299 6     6 1 16 my $self = shift;
1300 6         18 my ( $pos, $name ) = @_;
1301              
1302 6         15 my $tags = $self->{tags};
1303              
1304 6         10 my $value;
1305              
1306 6         30 foreach my $t ( @$tags ) {
1307 15         33 my ( $ts, $te, $tn, $tv ) = @$t;
1308              
1309 15 100       38 last if $ts > $pos;
1310 11 100       25 next if $te <= $pos;
1311              
1312 8 100       25 $value = $tv if $tn eq $name;
1313             }
1314              
1315 6         23 return $value;
1316             }
1317              
1318             =head2 get_tag_extent
1319              
1320             $extent = $st->get_tag_extent( $pos, $name );
1321              
1322             If the named tag applies to the given position, returns a
1323             L object to represent the extent of the tag at that
1324             position. If it does not, C is returned. If an extent is returned it
1325             will define the C and C flags if appropriate.
1326              
1327             =cut
1328              
1329             sub get_tag_extent
1330             {
1331 13     13 1 51 my $self = shift;
1332 13         29 my ( $pos, $name ) = @_;
1333              
1334 13         25 my $tags = $self->{tags};
1335              
1336 13         24 my ( $start, $end, $flags );
1337              
1338 13         29 foreach my $t ( @$tags ) {
1339 19         40 my ( $ts, $te, $tn, undef, $tf ) = @$t;
1340              
1341 19 100       59 last if $ts > $pos;
1342 16 100       36 next if $te <= $pos;
1343              
1344 15 100       42 next unless $tn eq $name;
1345              
1346 12         21 $start = $ts;
1347 12         17 $end = $te;
1348 12         25 $flags = $tf;
1349             }
1350              
1351 13 100       33 if( defined $start ) {
1352 12         42 return $self->_mkextent( $start, $end, $flags );
1353             }
1354             else {
1355 1         4 return undef;
1356             }
1357             }
1358              
1359             =head2 get_tag_missing_extent
1360              
1361             $extent = $st->get_tag_missing_extent( $pos, $name );
1362              
1363             If the named tag does not apply at the given position, returns the extent of
1364             the string around that position that does not have the tag. If it does exist,
1365             C is returned. If an extent is returned it will not define the
1366             C and C flags, as these do not make sense for the
1367             range in which a tag is absent.
1368              
1369             =cut
1370              
1371             sub get_tag_missing_extent
1372             {
1373 3     3 1 296 my $self = shift;
1374 3         5 my ( $pos, $name ) = @_;
1375              
1376 3         6 my $tags = $self->{tags};
1377              
1378 3         3 my $start = 0;
1379              
1380 3         4 foreach my $t ( @$tags ) {
1381 6         9 my ( $ts, $te, $tn ) = @$t;
1382              
1383 6 100       10 next unless $tn eq $name;
1384              
1385 3 100 100     13 if( $ts <= $pos and $te > $pos ) {
1386 1         3 return undef;
1387             }
1388              
1389 2 100       5 if( $ts > $pos ) {
1390 1         4 return $self->_mkextent( $start, $ts, 0 );
1391             }
1392              
1393 1         2 $start = $te;
1394             }
1395              
1396 1         5 return $self->_mkextent( $start, $self->length, 0 );
1397             }
1398              
1399             =head2 set_substr
1400              
1401             $st->set_substr( $start, $len, $newstr );
1402              
1403             Modifies a extent of the underlying plain string to that given. The extents of
1404             tags in the string are adjusted to cope with the modified region, and the
1405             adjustment in length.
1406              
1407             Tags entirely before the replaced extent remain unchanged.
1408              
1409             Tags entirely within the replaced extent are deleted.
1410              
1411             Tags entirely after the replaced extent are moved by appropriate amount to
1412             ensure they still apply to the same characters as before.
1413              
1414             Tags that start before and end after the extent remain, and have their lengths
1415             suitably adjusted.
1416              
1417             Tags that span just the start or end of the extent, but not both, are
1418             truncated, so as to remove the part of the tag applied on the modified extent
1419             but preserving that applied outside.
1420              
1421             If C<$newstr> is a C object, then its tags will be applied to
1422             C<$st> as appropriate. Edge-anchored tags in C<$newstr> will not be extended
1423             through C<$st>, though they will apply as edge-anchored if they now sit at the
1424             edge of the new string. If C<$newstr> is being appended to the end, then any
1425             existing edge-anchored tags at the end of C<$st> are I extended through
1426             the string; they will instead become bounded to their end position before the
1427             append happened.
1428              
1429             If C<$newstr> is otherwise treated as a plain string, then any existing
1430             edge-anchored tags at the end of C<$st> I extended through the newly
1431             added content and will continue to be edge-anchored in the result.
1432              
1433             =cut
1434              
1435             sub set_substr
1436             {
1437 57     57 1 17971 my $self = shift;
1438 57         130 my ( $start, $len, $new ) = @_;
1439              
1440 57         1375 my $new_is_st = is_string_tagged( $new );
1441              
1442 57         134 my $limit = $self->length;
1443              
1444 57 50       160 $start = $limit if $start > $limit;
1445 57 50       161 $len = ( $limit - $start ) if $len > ( $limit - $start );
1446              
1447 57         222 CORE::substr( $self->{str}, $start, $len ) = $new;
1448              
1449 57         133 my $oldend = $start + $len;
1450 57         140 my $newend = $start + CORE::length( $new );
1451              
1452 57         102 my $delta = $newend - $oldend;
1453             # Positions after $oldend have now moved up $delta places
1454              
1455 57         109 my $tags = $self->{tags};
1456              
1457 57         106 my $i = 0;
1458              
1459 57         170 for( ; $i < @$tags; $i++ ) {
1460             # In this loop we'll handle tags that start before the deleted section
1461              
1462 41         95 my $t = $tags->[$i];
1463 41         84 my ( $ts, $te, undef, undef, $tf ) = @$t;
1464              
1465 41 100 100     186 last if $ts >= $start and not( $tf & FLAG_ANCHOR_BEFORE );
1466              
1467             # Two cases:
1468             # A: Tag spans entirely outside deleted section - stretch/compress it
1469             # We may have to collapse it to nothing, so delete it
1470             # B: Tag starts before but ends within deleted section - truncate it
1471             # Plus a case we don't care about
1472             # Tag starts and ends entirely before the deleted section - ignore it
1473              
1474 29 100 100     228 if( $te > $oldend or
    100 100        
1475             ( $te == $oldend and $tf & FLAG_ANCHOR_AFTER ) ) {
1476             # Case A
1477 15 100 100     58 if( $tf & FLAG_ANCHOR_AFTER and $new_is_st ) {
1478             # Do not extend anchor-after tags if we are appending a String::Tagged
1479 4         7 $t->[4] &= ~FLAG_ANCHOR_AFTER;
1480 4         8 next;
1481             }
1482              
1483 11         17 $t->[1] += $delta;
1484              
1485 11 50       40 if( $t->[0] == $t->[1] ) {
1486 0         0 splice @$tags, $i, 1, ();
1487 0         0 $i--;
1488 0         0 next;
1489             }
1490             }
1491             elsif( $te > $start ) {
1492             # Case B
1493 1         5 $t->[1] = $start;
1494             }
1495             }
1496              
1497 57         161 for( ; $i < @$tags; $i++ ) {
1498 13         24 my $t = $tags->[$i];
1499 13         61 my ( $ts, $te ) = @$t;
1500              
1501             # In this loop we'll handle tags that start within the deleted section
1502 13 100       38 last if $ts >= $oldend;
1503              
1504             # Two cases
1505             # C: Tag contained entirely within deleted section - delete it
1506             # D: Tag starts within but ends after the deleted section - truncate it
1507              
1508 3 100       9 if( $te <= $oldend ) {
1509             # Case C
1510 2         6 splice @$tags, $i, 1;
1511 2         3 $i--;
1512 2         7 next;
1513             }
1514             else {
1515             # Case D
1516 1         2 $t->[0] = $newend;
1517 1         4 $t->[1] += $delta;
1518             }
1519             }
1520              
1521 57         112 for( ; $i < @$tags; $i++ ) {
1522 12         25 my $t = $tags->[$i];
1523 12         61 my ( $ts, $te, undef, undef, $tf ) = @$t;
1524              
1525             # In this loop we'll handle tags that start after the deleted section
1526              
1527             # One case
1528             # E: Tag starts and ends after the deleted section - move it
1529 12 100       670 $t->[0] += $delta unless $tf & FLAG_ANCHOR_BEFORE;
1530 12         26 $t->[1] += $delta;
1531              
1532             # If we've not moved the start (because it was FLAG_ANCHOR_BEFORE), we
1533             # might now have an ordering constraint violation. Better fix it.
1534 12         34 local $b = $t;
1535 12         46 foreach my $new_i ( reverse 0 .. $i-1 ) {
1536 7         13 local $a = $tags->[$new_i];
1537              
1538 7 100       18 last if _cmp_tags() <= 0;
1539              
1540 1         6 splice @$tags, $new_i, 0, splice @$tags, $i, 1, ();
1541              
1542 1         4 last;
1543             }
1544             }
1545              
1546 57 100       144 if( $new_is_st ) {
1547 22         44 my $atstart = $start == 0;
1548 22         50 my $atend = $newend == $self->length;
1549              
1550             $new->iter_extents( sub {
1551 21     21   45 my ( $e, $tn, $tv ) = @_;
1552 21 50 66     127 $self->apply_tag(
    100 100        
1553             ( $atstart && $e->anchor_before ) ? -1 : $start + $e->start,
1554             ( $atend && $e->anchor_after ) ? -1 : $e->length,
1555             $tn, $tv );
1556 22         165 } );
1557             }
1558              
1559 57         151 $self->_assert_sorted if DEBUG;
1560              
1561 57         263 return $self;
1562             }
1563              
1564             =head2 insert
1565              
1566             $st->insert( $start, $newstr );
1567              
1568             Insert the given string at the given position. A shortcut around
1569             C.
1570              
1571             If C<$newstr> is a C object, then its tags will be applied to
1572             C<$st> as appropriate. If C<$start> is 0, any before-anchored tags in will
1573             become before-anchored in C<$st>.
1574              
1575             =cut
1576              
1577             sub insert
1578             {
1579 15     15 1 6723 my $self = shift;
1580 15         38 my ( $at, $new ) = @_;
1581 15         67 $self->set_substr( $at, 0, $new );
1582             }
1583              
1584             =head2 append
1585              
1586             $st->append( $newstr );
1587              
1588             $st .= $newstr;
1589              
1590             Append to the underlying plain string. A shortcut around C.
1591              
1592             If C<$newstr> is a C object, then its tags will be applied to
1593             C<$st> as appropriate. Any after-anchored tags in will become after-anchored
1594             in C<$st>.
1595              
1596             As per C, whether any existing edge-anchored tags are extended
1597             through the newly-added content or become bounded to their current limit
1598             depends on whether C<$newstr> is a C instance or not.
1599              
1600             =cut
1601              
1602 21     21   87157 use overload '.=' => 'append';
  21         78  
  21         141  
1603              
1604             sub append
1605             {
1606 68     68 1 961 my $self = shift;
1607 68         122 my ( $new ) = @_;
1608              
1609 68 100       1446 return $self->set_substr( $self->length, 0, $new ) if is_string_tagged( $new );
1610              
1611             # Optimised version
1612 49         192 $self->{str} .= $new;
1613              
1614 49         137 my $newend = $self->length;
1615              
1616 49         72 my $tags = $self->{tags};
1617              
1618 49         121 my $i = 0;
1619              
1620             # Adjust boundaries of ANCHOR_AFTER tags
1621 49         115 for( ; $i < @$tags; $i++ ) {
1622 59         65 my $t = $tags->[$i];
1623 59 100       131 $t->[1] = $newend if $t->[4] & FLAG_ANCHOR_AFTER;
1624             }
1625              
1626 49         171 return $self;
1627             }
1628              
1629             =head2 append_tagged
1630              
1631             $st->append_tagged( $newstr, %tags );
1632              
1633             Append to the underlying plain string, and apply the given tags to the
1634             newly-inserted extent.
1635              
1636             Returns C<$st> itself so that the method may be easily chained.
1637              
1638             =cut
1639              
1640             sub append_tagged
1641             {
1642 19     19 1 36 my $self = shift;
1643 19         50 my ( $new, %tags ) = @_;
1644              
1645 19         37 my $start = $self->length;
1646 19         27 my $len = CORE::length( $new );
1647              
1648 19         55 $self->append( $new );
1649 19         79 $self->apply_tag( $start, $len, $_, $tags{$_} ) for keys %tags;
1650              
1651 19         60 return $self;
1652             }
1653              
1654             =head2 concat
1655              
1656             $ret = $st->concat( $other );
1657              
1658             $ret = $st . $other;
1659              
1660             Returns a new C containing the two strings concatenated
1661             together, preserving any tags present. This method overloads normal string
1662             concatenation operator, so expressions involving C values
1663             retain their tags.
1664              
1665             This method or operator tries to respect subclassing; preferring to return a
1666             new object of a subclass if either argument or operand is a subclass of
1667             C. If they are both subclasses, it will prefer the type of the
1668             invocant or first operand.
1669              
1670             =cut
1671              
1672 21     21   6830 use overload '.' => 'concat';
  21         40  
  21         121  
1673              
1674             sub concat
1675             {
1676 17     17 1 7615 my $self = shift;
1677 17         40 my ( $other, $swap ) = @_;
1678              
1679             # Try to find the "higher" subclass
1680 17 100 100     490 my $class = ( ref $self eq __PACKAGE__ and is_string_tagged( $other ) )
1681             ? ref $other : ref $self;
1682              
1683 17         73 my $ret = $class->new( $self );
1684 17 100       80 return $ret->insert( 0, $other ) if $swap;
1685 6         19 return $ret->append( $other );
1686             }
1687              
1688             =head2 matches
1689              
1690             @subs = $st->matches( $regexp );
1691              
1692             Returns a list of substrings (as C instances) for every
1693             non-overlapping match of the given C<$regexp>.
1694              
1695             This could be used, for example, to build a formatted string from a formatted
1696             template containing variable expansions:
1697              
1698             my $template = ...
1699             my %vars = ...
1700              
1701             my $ret = String::Tagged->new;
1702             foreach my $m ( $template->matches( qr/\$\w+|[^$]+/ ) ) {
1703             if( $m =~ m/^\$(\w+)$/ ) {
1704             $ret->append_tagged( $vars{$1}, %{ $m->get_tags_at( 0 ) } );
1705             }
1706             else {
1707             $ret->append( $m );
1708             }
1709             }
1710              
1711             This iterates segments of the template containing variables expansions
1712             starting with a C<$> symbol, and replaces them with values from the C<%vars>
1713             hash, careful to preserve all the formatting tags from the original template
1714             string.
1715              
1716             =cut
1717              
1718             sub matches
1719             {
1720 1     1 1 11 my $self = shift;
1721 1         3 my ( $re ) = @_;
1722              
1723 1         3 my $plain = $self->str;
1724              
1725 1         3 my @ret;
1726 1         31 while( $plain =~ m/$re/g ) {
1727 5         32 push @ret, $self->substr( $-[0], $+[0] - $-[0] );
1728             }
1729              
1730 1         5 return @ret;
1731             }
1732              
1733             =head2 match_extents
1734              
1735             @extents = $st->match_extents( $regexp );
1736              
1737             I
1738              
1739             Returns a list of extent objects for every non-overlapping match of the given
1740             C<$regexp>. This is similar to L, except that the results are
1741             returned as extent objects instead of substrings, allowing access to the
1742             position information as well.
1743              
1744             If using the result of this method to find regions of a string to modify,
1745             remember that any length alterations will not update positions in later extent
1746             objects. However, since the extents are non-overlapping and in position order,
1747             this can be handled by iterating them in reverse order so that the
1748             modifications done first are later in the string.
1749              
1750             foreach my $e ( reverse $st->match_extents( $pattern ) ) {
1751             $st->set_substr( $e->start, $e->length, $replacement );
1752             }
1753              
1754             =cut
1755              
1756             sub match_extents
1757             {
1758 2     2 1 11060 my $self = shift;
1759 2         4 my ( $re ) = @_;
1760              
1761 2         5 my $plain = $self->str;
1762              
1763 2         3 my @ret;
1764 2         16 while( $plain =~ m/$re/g ) {
1765 9         22 push @ret, $self->_mkextent( $-[0], $+[0], 0 );
1766             }
1767              
1768 2         6 return @ret;
1769             }
1770              
1771             =head2 split
1772              
1773             @parts = $st->split( $regexp, $limit );
1774              
1775             Returns a list of substrings by applying the regexp to the string content;
1776             similar to the core perl C function. If C<$limit> is supplied, the
1777             method will stop at that number of elements, returning the entire remainder of
1778             the input string as the final element. If the C<$regexp> contains a capture
1779             group then the content of the first one will be added to the return list as
1780             well.
1781              
1782             =cut
1783              
1784             sub split
1785             {
1786 4     4 1 23 my $self = shift;
1787 4         8 my ( $re, $limit ) = @_;
1788              
1789 4         8 my $plain = $self->str;
1790              
1791 4         6 my $prev = 0;
1792 4         5 my @ret;
1793 4         19 while( $plain =~ m/$re/g ) {
1794 5         17 push @ret, $self->substr( $prev, $-[0]-$prev );
1795 5 100       24 push @ret, $self->substr( $-[1], $+[1]-$-[1] ) if @- > 1;
1796              
1797 5         12 $prev = $+[0];
1798              
1799 5 100 66     23 last if defined $limit and @ret == $limit-1;
1800             }
1801              
1802 4 100       7 if( CORE::length $plain > $prev ) {
1803 3         7 push @ret, $self->substr( $prev, CORE::length( $plain ) - $prev );
1804             }
1805              
1806 4         13 return @ret;
1807             }
1808              
1809             =head2 sprintf
1810              
1811             $ret = $st->sprintf( @args );
1812              
1813             I
1814              
1815             Returns a new string by using the given instance as the format string for a
1816             L constructor call. The returned instance will be of the same
1817             class as the invocant.
1818              
1819             =cut
1820              
1821             sub sprintf
1822             {
1823 1     1 1 3 my $self = shift;
1824              
1825 1         5 return ( ref $self )->from_sprintf( $self, @_ );
1826             }
1827              
1828             =head2 debug_sprintf
1829              
1830             $ret = $st->debug_sprintf;
1831              
1832             Returns a representation of the string data and all the tags, suitable for
1833             debug printing or other similar use. This is a format such as is given in the
1834             DESCRIPTION section above.
1835              
1836             The output will consist of a number of lines, the first containing the plain
1837             underlying string, then one line per tag. The line shows the extent of the tag
1838             given by C<[---]> markers, or a C<|> in the special case of a tag covering
1839             only a single character. Special markings of C> and C> indicate
1840             tags which are "before" or "after" anchored.
1841              
1842             For example:
1843              
1844             Hello, world
1845             [---] word => 1
1846             <[----------]> everywhere => 1
1847             | space => 1
1848              
1849             =cut
1850              
1851             sub debug_sprintf
1852             {
1853 5     5 1 770 my $self = shift;
1854              
1855 5         18 my $str = $self->str;
1856 5         11 my $len = CORE::length( $str );
1857              
1858 5         7 my $maxnamelen = 0;
1859              
1860 5         15 my $ret = " " . ( $str =~ s/\n/./gr ) . "\n";
1861              
1862             $self->iter_tags( sub {
1863 7     7   12 my ( undef, undef, $name, undef ) = @_;
1864 7 100       17 CORE::length( $name ) > $maxnamelen and $maxnamelen = CORE::length( $name );
1865 5         33 } );
1866              
1867 5         28 foreach my $t ( @{ $self->{tags} } ) {
  5         25  
1868 7         13 my ( $ts, $te, $tn, $tv, $tf ) = @$t;
1869              
1870 7 100       21 $ret .= ( $tf & FLAG_ANCHOR_BEFORE ) ? " <" : " ";
1871              
1872 7         14 $ret .= " " x $ts;
1873              
1874 7         11 my $tl = $te - $ts;
1875              
1876 7 100       14 if( $tl == 0 ) {
    100          
1877 1         7 $ret =~ s/ $/>
1878 1         3 $te++; # account for extra printed width
1879             }
1880             elsif( $tl == 1 ) {
1881 2         2 $ret .= "|";
1882             }
1883             else {
1884 4         10 $ret .= "[" . ( "-" x ( $tl - 2 ) ) . "]";
1885             }
1886              
1887 7         12 $ret .= " " x ( $len - $te );
1888              
1889 7 100       13 $ret .= ( $tf & FLAG_ANCHOR_AFTER ) ? "> " : " ";
1890              
1891 7         27 $ret .= CORE::sprintf "%-*s => %s\n", $maxnamelen, $tn, $tv;
1892             }
1893              
1894 5         41 return $ret;
1895             }
1896              
1897             =head1 TODO
1898              
1899             =over 4
1900              
1901             =item *
1902              
1903             There are likely variations on the rules for C that could equally
1904             apply to some uses of tagged strings. Consider whether the behaviour of
1905             modification is chosen per-method, per-tag, or per-string.
1906              
1907             =item *
1908              
1909             Consider how to implement a clone from one tag format to another which wants
1910             to merge multiple different source tags together into a single new one.
1911              
1912             =back
1913              
1914             =head1 AUTHOR
1915              
1916             Paul Evans
1917              
1918             =cut
1919              
1920             0x55AA;