File Coverage

blib/lib/RTF/Tokenizer.pm
Criterion Covered Total %
statement 137 157 87.2
branch 83 100 83.0
condition 8 14 57.1
subroutine 18 19 94.7
pod 9 9 100.0
total 255 299 85.2


line stmt bran cond sub pod time code
1             #!perl
2             # RTF::Tokenizer - Peter Sergeant
3              
4             =head1 NAME
5              
6             RTF::Tokenizer - Tokenize RTF
7              
8             =head1 VERSION
9              
10             version 1.20
11              
12             =head1 DESCRIPTION
13              
14             Tokenizes RTF
15              
16             =head1 SYNOPSIS
17              
18             use RTF::Tokenizer;
19              
20             # Create a tokenizer object
21             my $tokenizer = RTF::Tokenizer->new();
22              
23             my $tokenizer = RTF::Tokenizer->new( string => '{\rtf1}' );
24             my $tokenizer = RTF::Tokenizer->new( string => '{\rtf1}', note_escapes => 1 );
25              
26             my $tokenizer = RTF::Tokenizer->new( file => \*STDIN );
27             my $tokenizer = RTF::Tokenizer->new( file => 'lala.rtf' );
28             my $tokenizer = RTF::Tokenizer->new( file => 'lala.rtf', sloppy => 1 );
29              
30             # Populate it from a file
31             $tokenizer->read_file('filename.txt');
32              
33             # Or a file handle
34             $tokenizer->read_file( \*STDIN );
35              
36             # Or a string
37             $tokenizer->read_string( '{\*\some rtf}' );
38              
39             # Get the first token
40             my ( $token_type, $argument, $parameter ) = $tokenizer->get_token();
41              
42             # Ooops, that was wrong...
43             $tokenizer->put_token( 'control', 'b', 1 );
44              
45             # Let's have the lot...
46             my @tokens = $tokenizer->get_all_tokens();
47              
48             =head1 INTRODUCTION
49              
50             This documentation assumes some basic knowledge of RTF.
51             If you lack that, go read The_RTF_Cookbook:
52              
53             L
54              
55             =cut
56              
57             require 5;
58              
59             package RTF::Tokenizer;
60             $RTF::Tokenizer::VERSION = '1.20';
61 15     15   133421 use vars qw($VERSION);
  15         23  
  15         688  
62              
63 15     15   56 use strict;
  15         14  
  15         234  
64 15     15   50 use warnings;
  15         15  
  15         276  
65 15     15   43 use Carp;
  15         14  
  15         924  
66 15     15   6243 use IO::File;
  15         100277  
  15         9896  
67              
68             =head1 METHODS
69              
70             =head2 new()
71              
72             Instantiates an RTF::Tokenizer object.
73              
74             B:
75              
76             C - calls the C method with the value provided after instantiation
77              
78             C - calls the C method with the value provided after instantiation
79              
80             C - boolean - whether to give RTF Escapes a token type of C (true) or
81             C (false, default)
82              
83             C - boolean - whether or not to allow some illegal but common RTF sequences found
84             'in the wild'. As of C<1.08>, this currently only allows control words with a numeric
85             argument to have a text field right after with no delimiter, like:
86              
87             \control1Plaintext
88              
89             but this may change in future releases. Defaults false.
90              
91             C - boolean - ... the RTF specification tells you to strip whitespace
92             which comes after control words, and newlines at the beginning and ending of text areas.
93             One result of that is that you can't actually round-trip the output of the tokenization
94             process. Turning this on is probably a bad idea, but someone cared enough to send me a
95             patch for it, so why not. Defaults false, and you should leave it that way.
96              
97             =cut
98              
99             sub new {
100             # Get the real class name in the highly unlikely event we've been
101             # called from an object itself.
102 51     51 1 347806 my $proto = shift;
103 51   66     236 my $class = ref($proto) || $proto;
104              
105             # Read in the named parameters
106 51         128 my %config = @_;
107              
108 51         216 my $self = {
109             _BUFFER => '', # Stores read but unparsed RTF
110             _BINARY_DATA => '', # Temporary data store if we're reading a \bin
111             _FILEHANDLE => '', # Stores the active filehandle
112             _INITIAL_READ => 512
113             , # How many characters to read by default. 512 recommended by RTF spec
114             _UC => 1, # Default number of characters to count for \uc
115             };
116              
117 51         74 bless $self, $class;
118              
119             # Call the data-reading convenience methods if required
120 51 100       174 if ( $config{'file'} ) {
    100          
121 5         24 $self->read_file( $config{'file'} );
122             } elsif ( $config{'string'} ) {
123 38         74 $self->read_string( $config{'string'} );
124             }
125              
126             # Set up final config stuff
127 51         103 $self->{_NOTE_ESCAPES} = $config{'note_escapes'};
128 51         61 $self->{_SLOPPY} = $config{'sloppy'};
129 51         95 $self->{_WHITESPACE} = $config{'preserve_whitespace'};
130              
131 51         119 return $self;
132              
133             }
134              
135             =head2 read_string( STRING )
136              
137             Appends the string to the tokenizer-object's buffer
138             (earlier versions would over-write the buffer -
139             this version does not).
140              
141             =cut
142              
143             sub read_string {
144 42     42 1 353 my $self = shift;
145 42         159 $self->{_BUFFER} .= shift;
146             }
147              
148             =head2 read_file( \*FILEHANDLE )
149              
150             =head2 read_file( $IO_File_object )
151              
152             =head2 read_file( 'filename' )
153              
154             Appends a chunk of data from the filehandle to the buffer,
155             and remembers the filehandle, so if you ask for a token,
156             and the buffer is empty, it'll try and read the next line
157             from the file (earlier versions would over-write the buffer -
158             this version does not).
159              
160             This chunk is 500 characters, and then whatever is left until
161             the next occurrence of the IRS (a newline character in this case).
162             If for whatever reason, you want to change that number to something
163             else, use C.
164              
165             =cut
166              
167             sub read_file {
168              
169 12     12 1 3931 my $self = shift;
170 12         15 my $file = shift;
171              
172             # Accept a filehandle referenced via a GLOB
173 12 100       33 if ( ref $file eq 'GLOB' ) {
    100          
    100          
    100          
174 2         13 $self->{_FILEHANDLE} = IO::File->new_from_fd( $file, '<' );
175             croak
176             "Couldn't create an IO::File object from the reference you specified"
177 2 100       253 unless $self->{_FILEHANDLE};
178              
179             # Accept IO::File and subclassed objects
180             } elsif (
181             eval {
182 10         106 $file->isa('IO::File');
183             } )
184             {
185 1         15 $self->{_FILEHANDLE} = $file;
186              
187             # This is undocumented, because you shouldn't use it. Don't rely on it.
188             } elsif ( ref $file eq 'IO::Scalar' ) {
189 4         10 $self->{_FILEHANDLE} = $file;
190              
191             # If it's not a reference, assume it's a filename
192             } elsif ( !ref $file ) {
193 4         16 $self->{_FILEHANDLE} = IO::File->new("< $file");
194 4 100       314 croak "Couldn't open '$file' for reading" unless $self->{_FILEHANDLE};
195              
196             # Complain if we get anything else
197             } else {
198 1         67 croak "You passed a reference to read_file of type " . ref($file) .
199             " which isn't an allowed type";
200             }
201              
202             # Check what our line-endings seem to be, then set $self->{_IRS} accordingly.
203             # This also reads in the first few lines as a side effect.
204 9         32 $self->_line_endings;
205             }
206              
207             # Reads a line from an IO:File'ish object
208             sub _get_line {
209 20     20   17 my $self = shift();
210              
211             # Localize the input record separator before changing it so
212             # we don't mess up any other part of the application running
213             # us that relies on it
214 20         55 local $/ = $self->{_IRS};
215              
216             # Read the line itself
217 20         140 my $line = $self->{_FILEHANDLE}->getline();
218 20 50       517 $self->{_BUFFER} .= $line if defined $line;
219             }
220              
221             # Determine what kind of line-endings the file uses
222              
223             sub _line_endings {
224 12     12   1169 my $self = shift();
225              
226 12         10 my $temp_buffer;
227 12         38 $self->{_FILEHANDLE}->read( $temp_buffer, $self->{_INITIAL_READ} );
228              
229             # This catches all allowed cases
230 12 50       186 if ( $temp_buffer =~ m/(\cM\cJ|\cM|\cJ)/ ) {
231 12         26 $self->{_IRS} = $1;
232              
233 12 100       41 $self->{_RS} = "Macintosh" if $self->{_IRS} eq "\cM";
234 12 100       30 $self->{_RS} = "Windows" if $self->{_IRS} eq "\cM\cJ";
235 12 100       27 $self->{_RS} = "UNIX" if $self->{_IRS} eq "\cJ";
236              
237             } else {
238 0         0 $self->{_RS} = "Unknown";
239             }
240              
241             # Add back to main buffer
242 12         32 $self->{_BUFFER} .= $temp_buffer;
243              
244             # Call C<_get_line> again so we're sure we're not only
245             # reading half a line
246 12         25 $self->_get_line;
247              
248             }
249              
250             =head2 get_token()
251              
252             Returns the next token as a three-item list: 'type', 'argument', 'parameter'.
253             Token is one of: C, C, C, C or C.
254              
255             If you turned on C, then you may get a forth item for
256             C tokens.
257              
258             =over
259              
260             =item C
261              
262             'type' is set to 'text'. 'argument' is set to the text itself. 'parameter'
263             is left blank. NOTE: C<\{>, C<\}>, and C<\\> are all returned as control words,
264             rather than rendered as text for you, as are C<\_>, C<\-> and friends.
265              
266             =item C
267              
268             'type' is 'control'. 'argument' is the control word or control symbol.
269             'parameter' is the control word's parameter if it has one - this will
270             be numeric, EXCEPT when 'argument' is a literal ', in which case it
271             will be a two-letter hex string.
272              
273             If you turned on C, you'll get a forth item,
274             which will be the whitespace or a defined empty string.
275              
276             =item C
277              
278             'type' is 'group'. If it's the beginning of an RTF group, then
279             'argument' is 1, else if it's the end, argument is 0. 'parameter'
280             is not set.
281              
282             =item C
283              
284             End of file reached. 'type' is 'eof'. 'argument' is 1. 'parameter' is
285             0.
286              
287             =item C
288              
289             If you specifically turn on this functionality, you'll get an
290             C type, which is identical to C, only, it's
291             only returned for escapes.
292              
293             =back
294              
295             =cut
296              
297             # Define a regular expression that matches characters which are 'text' -
298             # that is, they're not a backspace, a scoping brace, or discardable
299             # whitespace.
300             my $non_text_standard_re = qr/[^\\{}\r\n]/;
301             my $non_text_whitespace_re = qr/[^\\{}]/;
302              
303             sub get_token {
304 5902     5902 1 28661 my $self = shift;
305              
306             # If the last token we returned was \bin, we'll now have a
307             # big chunk of binary data waiting for the user, so send that
308             # back
309 5902 100       6628 if ( $self->{_BINARY_DATA} ) {
310 2         3 my $data = $self->{_BINARY_DATA};
311 2         2 $self->{_BINARY_DATA} = '';
312 2         8 return ( 'text', $data, '' );
313             }
314              
315             # We might have a cached token, and if we do, we'll want to
316             # return that first
317 5900 100       6291 if ( $self->{_PUT_TOKEN_CACHE_FLAG} ) {
318             # Take the value from the cache
319 3         2 my @return_values = @{ pop( @{ $self->{_PUT_TOKEN_CACHE} } ) };
  3         3  
  3         8  
320              
321             # Update the flag
322 3         3 $self->{_PUT_TOKEN_CACHE_FLAG} = @{ $self->{_PUT_TOKEN_CACHE} };
  3         5  
323              
324             # Give the user the token back
325 3         23 return @return_values;
326             }
327              
328             my $non_text_re =
329 5897 100       5685 $self->{_WHITESPACE} ? $non_text_whitespace_re : $non_text_standard_re;
330              
331             # Our main parsing loop
332 5897         3306 while (1) {
333              
334 5932         5648 my $start_character = substr( $self->{_BUFFER}, 0, 1, '' );
335              
336             # Most likely to be text, so we check for that first
337 5932 100       14249 if ( $start_character =~ $non_text_re ) {
    100          
    100          
    100          
    100          
338 15     15   99 no warnings 'uninitialized';
  15         18  
  15         16363  
339              
340             # We want to return text fields that have newlines in as one
341             # token, which requires a bit of work, as we read in one line
342             # at a time from out files...
343 224         162 my $temp_text = '';
344              
345             READTEXT:
346              
347             # Grab all the next 'text' characters
348 231         450 $self->{_BUFFER} =~ s/^([^\\{}]+)//s;
349 231 100       426 $temp_text .= $1 if defined $1;
350              
351             # If the buffer is empty, try reading in some more, and
352             # then go back to READTEXT to keep going. Now, the clever
353             # thing would be to assume that if the buffer *IS* empty
354             # then there MUST be more to read, which is true if we
355             # have well-formed input. We're going to assume that the
356             # input could well be a little broken.
357 231 100 66     370 if ( ( !$self->{_BUFFER} ) && ( $self->{_FILEHANDLE} ) ) {
358 7         33 $self->_get_line;
359 7 50       31 goto READTEXT if $self->{_BUFFER};
360             }
361              
362             # Make sure we're not including newlines in our output,
363             # as RTF spec says they're to be ignored...
364 224 100       264 unless ( $self->{_WHITESPACE} ) {
365 114         259 $temp_text =~ s/(\cM\cJ|\cM|\cJ)//g;
366             }
367              
368             # Give the user a shiny token back
369 224         565 return ( 'text', $start_character . $temp_text, '' );
370              
371             # Second most likely to be a control character
372             } elsif ( $start_character eq "\\" ) {
373 5192         4878 my @args = $self->_grab_control();
374              
375             # If the control word was an escape, and the user
376             # asked to be told separately about those, this
377             # will be set, so return an 'escape'. Otherwise,
378             # return the control word as a 'control'
379 5190 100       5700 if ( $self->{_TEMP_ESCAPE_FLAG} ) {
380 2         2 $self->{_TEMP_ESCAPE_FLAG} = 0;
381 2         9 return ( 'escape', @args );
382             } else {
383 5188         10505 return ( 'control', @args );
384             }
385              
386             # Probably a group then
387             } elsif ( $start_character eq '{' ) {
388 224         415 return ( 'group', 1, '' );
389             } elsif ( $start_character eq '}' ) {
390 220         390 return ( 'group', 0, '' );
391              
392             # No start character? Either we're at the end of our input,
393             # or we need some new input
394             } elsif ( !$start_character ) {
395             # If we were read from a string, we're all done
396 38 100       112 return ( 'eof', 1, 0 ) unless $self->{_FILEHANDLE};
397              
398             # If we were read from a file, try and get some more stuff
399             # in to the buffer, or return the 'eof' character
400 1 50       6 return ( 'eof', 1, 0 ) if $self->{_FILEHANDLE}->eof;
401 1         10 $self->_get_line;
402 1 50       3 return ( 'eof', 1, 0 ) unless $self->{_BUFFER};
403             }
404             }
405             }
406              
407             =head2 get_all_tokens
408              
409             As per C, but keeps calling C until it hits EOF. Returns
410             a list of arrayrefs.
411              
412             =cut
413              
414             sub get_all_tokens {
415 34     34 1 26 my $self = shift;
416 34         29 my @tokens;
417              
418 34         27 while (1) {
419 5822         5747 my $token = [ $self->get_token() ];
420 5822         4690 push( @tokens, $token );
421 5822 100       7693 last if $token->[0] eq 'eof';
422             }
423              
424 34         680 return @tokens;
425             }
426              
427             =head2 put_token( type, token, argument )
428              
429             Adds an item to the token cache, so that the next time you
430             call get_token, the arguments you passed here will be returned.
431             We don't check any of the values, so use this carefully. This
432             is on a first in last out basis.
433              
434             =cut
435              
436             sub put_token {
437 3     3 1 423 my $self = shift;
438              
439 3         2 push( @{ $self->{_PUT_TOKEN_CACHE} }, [@_] );
  3         6  
440              
441             # No need to set this to the real value of the token cache, as
442             # it'll get set properly when we try and read a cached token.
443 3         5 $self->{_PUT_TOKEN_CACHE_FLAG} = 1;
444             }
445              
446             =head2 sloppy( [bool] )
447              
448             Decides whether we allow some types of broken RTF. See C's docs
449             for a little more explanation about this. Pass it 1 to turn it on, 0 to
450             turn it off. This will always return undef.
451              
452             =cut
453              
454             sub sloppy {
455 0     0 1 0 my $self = shift;
456 0         0 my $bool = shift;
457              
458 0 0       0 if ($bool) {
459 0         0 $self->{_SLOPPY} = 1;
460             } else {
461 0         0 $self->{_SLOPPY} = 0;
462             }
463              
464 0         0 return;
465             }
466              
467             =head2 initial_read( [number] )
468              
469             Don't call this unless you actually have a good reason. When
470             the Tokenizer reads from a file, it first attempts to work out
471             what the correct input record-seperator should be, by reading
472             some characters from the file handle. This value starts off
473             as 512, which is twice the amount of characters that version 1.7
474             of the RTF specification says you should go before including a
475             line feed if you're writing RTF.
476              
477             Called with no argument, this returns the current value of the
478             number of characters we're going to read. Called with a numeric
479             argument, it sets the number of characters we'll read.
480              
481             You really don't need to use this method.
482              
483             =cut
484              
485             sub initial_read {
486 4     4 1 322 my $self = shift;
487 4 100       9 if (@_) { $self->{_INITIAL_READ} = shift }
  1         2  
488 4         11 return $self->{_INITIAL_READ};
489             }
490              
491             =head2 debug( [number] )
492              
493             Returns (non-destructively) the next 50 characters from the buffer,
494             OR, the number of characters you specify. Printing these to STDERR,
495             causing fatal errors, and the like, are left as an exercise to the
496             programmer.
497              
498             Note the part about 'from the buffer'. It really means that, which means
499             if there's nothing in the buffer, but still stuff we're reading from a
500             file it won't be shown. Chances are, if you're using this function, you're
501             debugging. There's an internal method called C<_get_line>, which is called
502             without arguments (C<$self->_get_line()>) that's how we get more stuff into
503             the buffer when we're reading from filehandles. There's no guarentee that'll
504             stay, or will always work that way, but, if you're debugging, that shouldn't
505             matter.
506              
507             =cut
508              
509             sub debug {
510 4     4 1 693 my $self = shift;
511 4   100     15 my $number = shift || 50;
512              
513 4         17 return substr( $self->{_BUFFER}, 0, $number );
514             }
515              
516             # Work with control characters
517              
518             # It's ugly to repeat myself here, but I believe having two literal re's
519             # here is going to offer a small performance benefit over a regex with
520             # a scalar in it.
521             my $control_word_standard_re = qr/
522             ^([a-z]{1,32}) # Lowercase word
523             (-?\d+)? # Optional signed number
524             (?:\s|(?=[^a-z0-9])) # Either whitespace, which we gobble or a
525             # non alpha-numeric, which we leave
526             /ix;
527             my $control_word_whitespace_re = qr/
528             ^([a-z]{1,32}) # Lowercase word
529             (-?\d+)? # Optional signed number
530             (\s*)? # Capture trailing whitespace
531             /ix;
532              
533             sub _grab_control {
534 5192     5192   3394 my $self = shift;
535              
536             my $whitespace_re =
537 5192 100       5020 $self->{_WHITESPACE} ? $control_word_whitespace_re :
538             $control_word_standard_re;
539              
540             # Check for a star here, as it simplifies our regex below,
541             # and it occurs pretty often
542 5192 100 0     19139 if ( $self->{_BUFFER} =~ s/^\*// ) {
    100          
    100          
    100          
    100          
    50          
    50          
    50          
    0          
543 59         92 return ( '*', '' );
544              
545             # A standard control word:
546             } elsif ( $self->{_BUFFER} =~ s/$whitespace_re// ) {
547             # Return the control word, unless it's a \bin
548 5117         3568 my $param = '';
549 5117 100       7615 $param = $2 if defined($2);
550              
551 5117         3014 my @whitespace;
552 5117 100       5917 if ( $self->{_WHITESPACE} ) {
553 2549 50       3530 push( @whitespace, defined $3 ? $3 : '' );
554             }
555              
556 5117 100       14122 return ( $1, $param, @whitespace ) unless $1 eq 'bin';
557              
558             # Pre-grab the binary data, and return the control word
559 4         4 my $byte_count = $2;
560 4         8 $self->_grab_bin($byte_count);
561 2         5 return ( 'bin', $byte_count, @whitespace );
562              
563             # hex-dec character (escape)
564             } elsif ( $self->{_BUFFER} =~ s/^'([0-9a-f]{2})//i ) {
565 9 100       21 $self->{_TEMP_ESCAPE_FLAG}++ if $self->{_NOTE_ESCAPES};
566 9         24 return ( "'", $1 );
567              
568             # Control symbol (escape)
569             } elsif ( $self->{_BUFFER} =~ s/^([-_~:|{}'\\])// ) {
570 1 50       4 $self->{_TEMP_ESCAPE_FLAG}++ if $self->{_NOTE_ESCAPES};
571 1         3 return ( $1, '' );
572              
573             # Escaped whitespace (ew, but allowed)
574             } elsif ( $self->{_BUFFER} =~ s/^[\r\n]// ) {
575 3         8 return ( 'par', '' );
576              
577             # Escaped tab (ew, but allowed)
578             } elsif ( $self->{_BUFFER} =~ s/^\t// ) {
579 0         0 return ( 'tab', '' );
580              
581             # Escaped semi-colon - this is WRONG
582             } elsif ( $self->{_BUFFER} =~ s/^\;// ) {
583 0         0 carp(
584             "Your RTF contains an escaped semi-colon. This isn't allowed, but we'll let you have it back as a literal for now. See the RTF spec."
585             );
586 0         0 return ( ';', '' );
587              
588             # Unicode characters
589             } elsif ( $self->{_BUFFER} =~ s/^u(\d+)// ) {
590 3         10 return ( 'u', $1 );
591              
592             # Allow incorrect control words
593             } elsif ( ( $self->{_SLOPPY} ) &&
594             ( $self->{_BUFFER} =~ s/^([a-z]{1,32})(-?\d+)//i ) )
595             {
596 0         0 my $param = '';
597 0 0       0 $param = $2 if defined($2);
598              
599 0         0 return ( $1, $param );
600             }
601              
602             # If we get here, something has gone wrong. First we'll create
603             # a human readable section of RTF to show the user.
604 0         0 my $die_string = substr( $self->{_BUFFER}, 0, 50 );
605 0         0 $die_string =~ s/\r/[R]/g;
606              
607             # Get angry with the user
608 0         0 carp
609             "Your RTF is broken, trying to recover to nearest group from '\\$die_string'\n";
610 0         0 carp
611             "Chances are you have some RTF like \\control1plaintext. Which is illegal. But you can allow that by passing the 'sloppy' attribute to new() or using the sloppy() method. Please also write to and abuse the developer of the software which wrote your RTF :-)\n";
612              
613             # Kill everything until the next group
614 0         0 $self->{_BUFFER} =~ s/^.+?([}{])/$1/;
615 0         0 return ( '', '' );
616             }
617              
618             # A first stab at grabbing binary data
619             sub _grab_bin {
620 4     4   3 my $self = shift;
621 4         4 my $bytes = shift;
622              
623             # If the buffer is too small, attempt to read in some more data...
624 4         10 while ( length( $self->{_BUFFER} ) < $bytes ) {
625              
626             # If there's no filehandle, or the one we have is eof, complain
627 2 50 66     6 if ( !$self->{_FILEHANDLE} || $self->{_FILEHANDLE}->eof ) {
628             croak "\\bin is asking for $bytes characters, but there are only " .
629 2         267 length( $self->{_BUFFER} ) . " left.";
630             }
631              
632             # Try and read in more data
633 0         0 $self->_get_line;
634             }
635              
636             # Return the right number of characters
637 2         5 $self->{_BINARY_DATA} = substr( $self->{_BUFFER}, 0, $bytes, '' );
638             }
639              
640             =head1 NOTES
641              
642             To avoid intrusively deep parsing, if an alternative ASCII
643             representation is available for a Unicode entity, and that
644             ASCII representation contains C<{>, or C<\>, by themselves, things
645             will go I. But I'm not convinced either of those is
646             allowed by the spec.
647              
648             =head1 AUTHOR
649              
650             Pete Sergeant -- C
651              
652             =head1 LICENSE
653              
654             Copyright B.
655              
656             This program is free software; you can redistribute it and/or modify it under
657             the same terms as Perl itself.
658              
659             =cut
660              
661             1;