File Coverage

blib/lib/RTF/Tokenizer.pm
Criterion Covered Total %
statement 145 150 96.6
branch 80 90 88.8
condition 11 14 78.5
subroutine 19 19 100.0
pod 9 9 100.0
total 264 282 93.6


line stmt bran cond sub pod time code
1             #!perl
2             # RTF::Tokenizer - Peter Sergeant
3              
4             =head1 NAME
5              
6             RTF::Tokenizer - Tokenize RTF
7              
8             =head1 VERSION
9              
10             version 1.18
11              
12             =head1 DESCRIPTION
13              
14             Tokenizes RTF
15              
16             =head1 SYNOPSIS
17              
18             use RTF::Tokenizer;
19              
20             # Create a tokenizer object
21             my $tokenizer = RTF::Tokenizer->new();
22              
23             my $tokenizer = RTF::Tokenizer->new( string => '{\rtf1}' );
24             my $tokenizer = RTF::Tokenizer->new( string => '{\rtf1}', note_escapes => 1 );
25              
26             my $tokenizer = RTF::Tokenizer->new( file => \*STDIN );
27             my $tokenizer = RTF::Tokenizer->new( file => 'lala.rtf' );
28             my $tokenizer = RTF::Tokenizer->new( file => 'lala.rtf', sloppy => 1 );
29              
30             # Populate it from a file
31             $tokenizer->read_file('filename.txt');
32              
33             # Or a file handle
34             $tokenizer->read_file( \*STDIN );
35              
36             # Or a string
37             $tokenizer->read_string( '{\*\some rtf}' );
38              
39             # Get the first token
40             my ( $token_type, $argument, $parameter ) = $tokenizer->get_token();
41              
42             # Ooops, that was wrong...
43             $tokenizer->put_token( 'control', 'b', 1 );
44              
45             # Let's have the lot...
46             my @tokens = $tokenizer->get_all_tokens();
47              
48             =head1 INTRODUCTION
49              
50             This documentation assumes some basic knowledge of RTF.
51             If you lack that, go read The_RTF_Cookbook:
52              
53             L
54              
55             =cut
56              
57             require 5;
58              
59             package RTF::Tokenizer;
60             $RTF::Tokenizer::VERSION = '1.18';
61 15     15   265791 use vars qw($VERSION);
  15         36  
  15         962  
62              
63 15     15   84 use strict;
  15         30  
  15         532  
64 15     15   83 use warnings;
  15         27  
  15         14841  
65 15     15   100 use Carp;
  15         25  
  15         1700  
66 15     15   15296 use IO::File;
  15         203910  
  15         14676  
67              
68             =head1 METHODS
69              
70             =head2 new()
71              
72             Instantiates an RTF::Tokenizer object.
73              
74             B:
75              
76             C - calls the C method with the value provided after instantiation
77              
78             C - calls the C method with the value provided after instantiation
79              
80             C - boolean - whether to give RTF Escapes a token type of C (true) or C (false)
81              
82             C - boolean - whether or not to allow some illegal but common RTF sequences found 'in the wild'. As of C<1.08>, this currently only allows
83             control words with a numeric argument to have a text field right after with
84             no delimiter, like:
85              
86             \control1Plaintext
87              
88             but this may change in future releases.
89              
90             =cut
91              
92             sub new {
93             # Get the real class name in the highly unlikely event we've been
94             # called from an object itself.
95 37     37 1 564655 my $proto = shift;
96 37   66     236 my $class = ref($proto) || $proto;
97              
98             # Read in the named parameters
99 37         126 my %config = @_;
100              
101 37         250 my $self = {
102             _BUFFER => '', # Stores read but unparsed RTF
103             _BINARY_DATA => '', # Temporary data store if we're reading a \bin
104             _FILEHANDLE => '', # Stores the active filehandle
105             _INITIAL_READ => 512
106             , # How many characters to read by default. 512 recommended by RTF spec
107             _UC => 1, # Default number of characters to count for \uc
108             };
109              
110 37         100 bless $self, $class;
111              
112             # Call the data-reading convenience methods if required
113 37 100       192 if ( $config{'file'} ) {
    100          
114 5         39 $self->read_file( $config{'file'} );
115             } elsif ( $config{'string'} ) {
116 21         71 $self->read_string( $config{'string'} );
117             }
118              
119             # Set up final config stuff
120 37         136 $self->{_NOTE_ESCAPES} = $config{'note_escapes'};
121 37         87 $self->{_SLOPPY} = $config{'sloppy'};
122              
123 37         134 return $self;
124              
125             }
126              
127             =head2 read_string( STRING )
128              
129             Appends the string to the tokenizer-object's buffer
130             (earlier versions would over-write the buffer -
131             this version does not).
132              
133             =cut
134              
135             sub read_string {
136 31     31 1 1615 my $self = shift;
137 31         1031 $self->{_BUFFER} .= shift;
138             }
139              
140             =head2 read_file( \*FILEHANDLE )
141              
142             =head2 read_file( $IO_File_object )
143              
144             =head2 read_file( 'filename' )
145              
146             Appends a chunk of data from the filehandle to the buffer,
147             and remembers the filehandle, so if you ask for a token,
148             and the buffer is empty, it'll try and read the next line
149             from the file (earlier versions would over-write the buffer -
150             this version does not).
151              
152             This chunk is 500 characters, and then whatever is left until
153             the next occurrence of the IRS (a newline character in this case).
154             If for whatever reason, you want to change that number to something
155             else, use C.
156              
157             =cut
158              
159             sub read_file {
160              
161 12     12 1 6352 my $self = shift;
162 12         18 my $file = shift;
163              
164             # Accept a filehandle referenced via a GLOB
165 12 100       39 if ( ref $file eq 'GLOB' ) {
    100          
    100          
    100          
166 2         18 $self->{_FILEHANDLE} = IO::File->new_from_fd( $file, '<' );
167 2 100       324 croak
168             "Couldn't create an IO::File object from the reference you specified"
169             unless $self->{_FILEHANDLE};
170              
171             # Accept IO::File and subclassed objects
172             } elsif (
173             eval {
174 10         129 $file->isa('IO::File');
175             } )
176             {
177 1         2 $self->{_FILEHANDLE} = $file;
178              
179             # This is undocumented, because you shouldn't use it. Don't rely on it.
180             } elsif ( ref $file eq 'IO::Scalar' ) {
181 4         17 $self->{_FILEHANDLE} = $file;
182              
183             # If it's not a reference, assume it's a filename
184             } elsif ( !ref $file ) {
185 4         24 $self->{_FILEHANDLE} = IO::File->new("< $file");
186 4 100       413 croak "Couldn't open '$file' for reading" unless $self->{_FILEHANDLE};
187              
188             # Complain if we get anything else
189             } else {
190 1         92 croak "You passed a reference to read_file of type " . ref($file) .
191             " which isn't an allowed type";
192             }
193              
194             # Check what our line-endings seem to be, then set $self->{_IRS} accordingly.
195             # This also reads in the first few lines as a side effect.
196 9         44 $self->_line_endings;
197             }
198              
199             # Reads a line from an IO:File'ish object
200             sub _get_line {
201 20     20   36 my $self = shift();
202              
203             # Localize the input record separator before changing it so
204             # we don't mess up any other part of the application running
205             # us that relies on it
206 20         73 local $/ = $self->{_IRS};
207              
208             # Read the line itself
209 20         198 my $line = $self->{_FILEHANDLE}->getline();
210 20 50       735 $self->{_BUFFER} .= $line if defined $line;
211             }
212              
213             # Determine what kind of line-endings the file uses
214              
215             sub _line_endings {
216 12     12   1552 my $self = shift();
217              
218 12         23 my $temp_buffer;
219 12         59 $self->{_FILEHANDLE}->read( $temp_buffer, $self->{_INITIAL_READ} );
220              
221             # This catches all allowed cases
222 12 50       264 if ( $temp_buffer =~ m/(\cM\cJ|\cM|\cJ)/ ) {
223 12         35 $self->{_IRS} = $1;
224              
225 12 100       47 $self->{_RS} = "Macintosh" if $self->{_IRS} eq "\cM";
226 12 100       51 $self->{_RS} = "Windows" if $self->{_IRS} eq "\cM\cJ";
227 12 100       43 $self->{_RS} = "UNIX" if $self->{_IRS} eq "\cJ";
228              
229             } else {
230 0         0 $self->{_RS} = "Unknown";
231             }
232              
233             # Add back to main buffer
234 12         35 $self->{_BUFFER} .= $temp_buffer;
235              
236             # Call C<_get_line> again so we're sure we're not only
237             # reading half a line
238 12         36 $self->_get_line;
239              
240             }
241              
242             =head2 get_token()
243              
244             Returns the next token as a three-item list: 'type', 'argument', 'parameter'.
245             Token is one of: C, C, C, C or C.
246              
247             =over
248              
249             =item C
250              
251             'type' is set to 'text'. 'argument' is set to the text itself. 'parameter'
252             is left blank. NOTE: C<\{>, C<\}>, and C<\\> are all returned as control words,
253             rather than rendered as text for you, as are C<\_>, C<\-> and friends.
254              
255             =item C
256              
257             'type' is 'control'. 'argument' is the control word or control symbol.
258             'parameter' is the control word's parameter if it has one - this will
259             be numeric, EXCEPT when 'argument' is a literal ', in which case it
260             will be a two-letter hex string.
261              
262             =item C
263              
264             'type' is 'group'. If it's the beginning of an RTF group, then
265             'argument' is 1, else if it's the end, argument is 0. 'parameter'
266             is not set.
267              
268             =item C
269              
270             End of file reached. 'type' is 'eof'. 'argument' is 1. 'parameter' is
271             0.
272              
273             =item C
274              
275             If you specifically turn on this functionality, you'll get an
276             C type, which is identical to C, only, it's
277             only returned for escapes.
278              
279             =back
280              
281             =cut
282              
283             sub get_token {
284 3006     3006 1 47524 my $self = shift;
285              
286             # If the last token we returned was \bin, we'll now have a
287             # big chunk of binary data waiting for the user, so send that
288             # back
289 3006 100       8044 if ( $self->{_BINARY_DATA} ) {
290 2         13 my $data = $self->{_BINARY_DATA};
291 2         5 $self->{_BINARY_DATA} = '';
292 2         15 return ( 'text', $data, '' );
293             }
294              
295             # We might have a cached token, and if we do, we'll want to
296             # return that first
297 3004 100       6837 if ( $self->{_PUT_TOKEN_CACHE_FLAG} ) {
298             # Take the value from the cache
299 3         4 my @return_values = @{ pop( @{ $self->{_PUT_TOKEN_CACHE} } ) };
  3         4  
  3         10  
300              
301             # Update the flag
302 3         5 $self->{_PUT_TOKEN_CACHE_FLAG} = @{ $self->{_PUT_TOKEN_CACHE} };
  3         5  
303              
304             # Give the user the token back
305 3         18 return @return_values;
306             }
307              
308             # Our main parsing loop
309 3001         3499 while (1) {
310              
311 3036         5392 my $start_character = substr( $self->{_BUFFER}, 0, 1, '' );
312              
313             # Most likely to be text, so we check for that first
314 3036 100       9564 if ( $start_character =~ /[^\\{}\r\n]/ ) {
    100          
    100          
    100          
    100          
315 15     15   150 no warnings 'uninitialized';
  15         35  
  15         28075  
316              
317             # We want to return text fields that have newlines in as one
318             # token, which requires a bit of work, as we read in one line
319             # at a time from out files...
320 124         166 my $temp_text = '';
321              
322 131         426 READTEXT:
323              
324             # Grab all the next 'text' characters
325             $self->{_BUFFER} =~ s/^([^\\{}]+)//s;
326 131 100       401 $temp_text .= $1 if defined $1;
327              
328             # If the buffer is empty, try reading in some more, and
329             # then go back to READTEXT to keep going. Now, the clever
330             # thing would be to assume that if the buffer *IS* empty
331             # then there MUST be more to read, which is true if we
332             # have well-formed input. We're going to assume that the
333             # input could well be a little broken.
334 131 100 100     431 if ( ( !$self->{_BUFFER} ) && ( $self->{_FILEHANDLE} ) ) {
335 7         73 $self->_get_line;
336 7 50       43 goto READTEXT if $self->{_BUFFER};
337             }
338              
339             # Make sure we're not including newlines in our output,
340             # as RTF spec says they're to be ignored...
341 124         581 $temp_text =~ s/(\cM\cJ|\cM|\cJ)//g;
342              
343             # Give the user a shiny token back
344 124         613 return ( 'text', $start_character . $temp_text, '' );
345              
346             # Second most likely to be a control character
347             } elsif ( $start_character eq "\\" ) {
348 2619         4572 my @args = $self->_grab_control();
349              
350             # If the control word was an escape, and the user
351             # asked to be told separately about those, this
352             # will be set, so return an 'escape'. Otherwise,
353             # return the control word as a 'control'
354 2617 100       5334 if ( $self->{_TEMP_ESCAPE_FLAG} ) {
355 2         3 $self->{_TEMP_ESCAPE_FLAG} = 0;
356 2         14 return ( 'escape', @args );
357             } else {
358 2615         12897 return ( 'control', @args );
359             }
360              
361             # Probably a group then
362             } elsif ( $start_character eq '{' ) {
363 118         446 return ( 'group', 1, '' );
364             } elsif ( $start_character eq '}' ) {
365 120         427 return ( 'group', 0, '' );
366              
367             # No start character? Either we're at the end of our input,
368             # or we need some new input
369             } elsif ( !$start_character ) {
370             # If we were read from a string, we're all done
371 21 100       119 return ( 'eof', 1, 0 ) unless $self->{_FILEHANDLE};
372              
373             # If we were read from a file, try and get some more stuff
374             # in to the buffer, or return the 'eof' character
375 1 50       7 return ( 'eof', 1, 0 ) if $self->{_FILEHANDLE}->eof;
376 1         9 $self->_get_line;
377 1 50       3 return ( 'eof', 1, 0 ) unless $self->{_BUFFER};
378             }
379             }
380             }
381              
382             =head2 get_all_tokens
383              
384             As per C, but keeps calling C until it hits EOF. Returns
385             a list of arrayrefs.
386              
387             =cut
388              
389             sub get_all_tokens {
390 17     17 1 39 my $self = shift;
391 17         22 my @tokens;
392              
393 17         90 while (1) {
394 2903         5560 my $token = [ $self->get_token() ];
395 2903         5577 push( @tokens, $token );
396 2903 100       6678 last if $token->[0] eq 'eof';
397             }
398              
399 17         681 return @tokens;
400             }
401              
402             =head2 put_token( type, token, argument )
403              
404             Adds an item to the token cache, so that the next time you
405             call get_token, the arguments you passed here will be returned.
406             We don't check any of the values, so use this carefully. This
407             is on a first in last out basis.
408              
409             =cut
410              
411             sub put_token {
412              
413 3     3 1 451 my $self = shift;
414 3         7 my ( $type, $token, $argument ) = ( shift, shift, shift );
415              
416 3         4 push( @{ $self->{_PUT_TOKEN_CACHE} }, [ $type, $token, $argument ] );
  3         10  
417              
418             # No need to set this to the real value of the token cache, as
419             # it'll get set properly when we try and read a cached token.
420 3         36 $self->{_PUT_TOKEN_CACHE_FLAG} = 1;
421             }
422              
423             =head2 sloppy( [bool] )
424              
425             Decides whether we allow some types of broken RTF. See C's docs
426             for a little more explanation about this. Pass it 1 to turn it on, 0 to
427             turn it off. This will always return undef.
428              
429             =cut
430              
431             sub sloppy {
432 2     2 1 1156 my $self = shift;
433 2         4 my $bool = shift;
434              
435 2 100       8 if ($bool) {
436 1         4 $self->{_SLOPPY} = 1;
437             } else {
438 1         2 $self->{_SLOPPY} = 0;
439             }
440              
441 2         39 return;
442             }
443              
444             =head2 initial_read( [number] )
445              
446             Don't call this unless you actually have a good reason. When
447             the Tokenizer reads from a file, it first attempts to work out
448             what the correct input record-seperator should be, by reading
449             some characters from the file handle. This value starts off
450             as 512, which is twice the amount of characters that version 1.7
451             of the RTF specification says you should go before including a
452             line feed if you're writing RTF.
453              
454             Called with no argument, this returns the current value of the
455             number of characters we're going to read. Called with a numeric
456             argument, it sets the number of characters we'll read.
457              
458             You really don't need to use this method.
459              
460             =cut
461              
462             sub initial_read {
463 4     4 1 601 my $self = shift;
464 4 100       13 if (@_) { $self->{_INITIAL_READ} = shift }
  1         3  
465 4         15 return $self->{_INITIAL_READ};
466             }
467              
468             =head2 debug( [number] )
469              
470             Returns (non-destructively) the next 50 characters from the buffer,
471             OR, the number of characters you specify. Printing these to STDERR,
472             causing fatal errors, and the like, are left as an exercise to the
473             programmer.
474              
475             Note the part about 'from the buffer'. It really means that, which means
476             if there's nothing in the buffer, but still stuff we're reading from a
477             file it won't be shown. Chances are, if you're using this function, you're
478             debugging. There's an internal method called C<_get_line>, which is called
479             without arguments (C<$self->_get_line()>) that's how we get more stuff into
480             the buffer when we're reading from filehandles. There's no guarentee that'll
481             stay, or will always work that way, but, if you're debugging, that shouldn't
482             matter.
483              
484             =cut
485              
486             sub debug {
487 4     4 1 794 my $self = shift;
488 4   100     19 my $number = shift || 50;
489              
490 4         32 return substr( $self->{_BUFFER}, 0, $number );
491             }
492              
493             # Work with control characters
494              
495             sub _grab_control {
496 2619     2619   11162 my $self = shift;
497              
498             # Check for a star here, as it simplifies our regex below,
499             # and it occurs pretty often
500 2619 100 66     14366 if ( $self->{_BUFFER} =~ s/^\*// ) {
    100          
    100          
    100          
    100          
    50          
    50          
    100          
    100          
501 30         78 return ( '*', '' );
502              
503             # A standard control word:
504             } elsif (
505             $self->{_BUFFER} =~ s/
506             ^([a-z]{1,32}) # Lowercase word
507             (-?\d+)? # Optional signed number
508             (?:\s|(?=[^a-z0-9])) # Either whitespace, which we gobble or a
509             # non alpha-numeric, which we leave
510             //ix
511             )
512             {
513             # Return the control word, unless it's a \bin
514 2569         3258 my $param = '';
515 2569 100       6862 $param = $2 if defined($2);
516 2569 100       13163 return ( $1, $param ) unless $1 eq 'bin';
517              
518             # Pre-grab the binary data, and return the control word
519 4         11 $self->_grab_bin($2);
520 2         7 return ( 'bin', $2 );
521              
522             # hex-dec character (escape)
523             } elsif ( $self->{_BUFFER} =~ s/^'([0-9a-f]{2})//i ) {
524 7 100       32 $self->{_TEMP_ESCAPE_FLAG}++ if $self->{_NOTE_ESCAPES};
525 7         33 return ( "'", $1 );
526              
527             # Control symbol (escape)
528             } elsif ( $self->{_BUFFER} =~ s/^([-_~:|{}'\\])// ) {
529 1 50       4 $self->{_TEMP_ESCAPE_FLAG}++ if $self->{_NOTE_ESCAPES};
530 1         5 return ( $1, '' );
531              
532             # Escaped whitespace (ew, but allowed)
533             } elsif ( $self->{_BUFFER} =~ s/^[\r\n]// ) {
534 3         9 return ( 'par', '' );
535              
536             # Escaped tab (ew, but allowed)
537             } elsif ( $self->{_BUFFER} =~ s/^\t// ) {
538 0         0 return ( 'tab', '' );
539              
540             # Escaped semi-colon - this is WRONG
541             } elsif ( $self->{_BUFFER} =~ s/^\;// ) {
542 0         0 carp(
543             "Your RTF contains an escaped semi-colon. This isn't allowed, but we'll let you have it back as a literal for now. See the RTF spec."
544             );
545 0         0 return ( ';', '' );
546              
547             # Unicode characters
548             } elsif ( $self->{_BUFFER} =~ s/^u(\d+)// ) {
549 4         17 return ( 'u', $1 );
550              
551             # Allow incorrect control words
552             } elsif ( ( $self->{_SLOPPY} ) &&
553             ( $self->{_BUFFER} =~ s/^([a-z]{1,32})(-?\d+)//i ) )
554             {
555 2         5 my $param = '';
556 2 50       10 $param = $2 if defined($2);
557              
558 2         35 return ( $1, $param );
559             }
560              
561             # If we get here, something has gone wrong. First we'll create
562             # a human readable section of RTF to show the user.
563 3         7 my $die_string = substr( $self->{_BUFFER}, 0, 50 );
564 3         8 $die_string =~ s/\r/[R]/g;
565              
566             # Get angry with the user
567 3         76 carp
568             "Your RTF is broken, trying to recover to nearest group from '\\$die_string'\n";
569 3         2004 carp
570             "Chances are you have some RTF like \\control1plaintext. Which is illegal. But you can allow that by passing the 'sloppy' attribute to new() or using the sloppy() method. Please also write to and abuse the developer of the software which wrote your RTF :-)\n";
571              
572             # Kill everything until the next group
573 3         2377 $self->{_BUFFER} =~ s/^.+?([}{])/$1/;
574 3         11 return ( '', '' );
575             }
576              
577             # A first stab at grabbing binary data
578             sub _grab_bin {
579 4     4   5 my $self = shift;
580 4         5 my $bytes = shift;
581              
582             # If the buffer is too small, attempt to read in some more data...
583 4         17 while ( length( $self->{_BUFFER} ) < $bytes ) {
584              
585             # If there's no filehandle, or the one we have is eof, complain
586 2 50 66     7 if ( !$self->{_FILEHANDLE} || $self->{_FILEHANDLE}->eof ) {
587 2         283 croak "\\bin is asking for $bytes characters, but there are only " .
588             length( $self->{_BUFFER} ) . " left.";
589             }
590              
591             # Try and read in more data
592 0         0 $self->_get_line;
593             }
594              
595             # Return the right number of characters
596 2         30 $self->{_BINARY_DATA} = substr( $self->{_BUFFER}, 0, $bytes, '' );
597             }
598              
599             =head1 NOTES
600              
601             To avoid intrusively deep parsing, if an alternative ASCII
602             representation is available for a Unicode entity, and that
603             ASCII representation contains C<{>, or C<\>, by themselves, things
604             will go I. But I'm not convinced either of those is
605             allowed by the spec.
606              
607             =head1 AUTHOR
608              
609             Pete Sergeant -- C
610              
611             =head1 LICENSE
612              
613             Copyright B.
614              
615             This program is free software; you can redistribute it and/or modify it under
616             the same terms as Perl itself.
617              
618             =cut
619              
620             1;