File Coverage

blib/lib/RTF/Tokenizer.pm

Criterion	Covered	Total	%
statement	137	157	87.2
branch	83	100	83.0
condition	8	14	57.1
subroutine	18	19	94.7
pod	9	9	100.0
total	255	299	85.2

line	stmt	bran	cond	sub	pod	time	code
1							#!perl
2							# RTF::Tokenizer - Peter Sergeant
3
4							=head1 NAME
5
6							RTF::Tokenizer - Tokenize RTF
7
8							=head1 VERSION
9
10							version 1.20
11
12							=head1 DESCRIPTION
13
14							Tokenizes RTF
15
16							=head1 SYNOPSIS
17
18							use RTF::Tokenizer;
19
20							# Create a tokenizer object
21							my $tokenizer = RTF::Tokenizer->new();
22
23							my $tokenizer = RTF::Tokenizer->new( string => '{\rtf1}' );
24							my $tokenizer = RTF::Tokenizer->new( string => '{\rtf1}', note_escapes => 1 );
25
26							my $tokenizer = RTF::Tokenizer->new( file => \*STDIN );
27							my $tokenizer = RTF::Tokenizer->new( file => 'lala.rtf' );
28							my $tokenizer = RTF::Tokenizer->new( file => 'lala.rtf', sloppy => 1 );
29
30							# Populate it from a file
31							$tokenizer->read_file('filename.txt');
32
33							# Or a file handle
34							$tokenizer->read_file( \*STDIN );
35
36							# Or a string
37							$tokenizer->read_string( '{\*\some rtf}' );
38
39							# Get the first token
40							my ( $token_type, $argument, $parameter ) = $tokenizer->get_token();
41
42							# Ooops, that was wrong...
43							$tokenizer->put_token( 'control', 'b', 1 );
44
45							# Let's have the lot...
46							my @tokens = $tokenizer->get_all_tokens();
47
48							=head1 INTRODUCTION
49
50							This documentation assumes some basic knowledge of RTF.
51							If you lack that, go read The_RTF_Cookbook:
52
53							L
54
55							=cut
56
57							require 5;
58
59							package RTF::Tokenizer;
60							$RTF::Tokenizer::VERSION = '1.20';
61	15			15		133421	use vars qw($VERSION);
	15					23
	15					688
62
63	15			15		56	use strict;
	15					14
	15					234
64	15			15		50	use warnings;
	15					15
	15					276
65	15			15		43	use Carp;
	15					14
	15					924
66	15			15		6243	use IO::File;
	15					100277
	15					9896
67
68							=head1 METHODS
69
70							=head2 new()
71
72							Instantiates an RTF::Tokenizer object.
73
74							B:
75
76							C - calls the C method with the value provided after instantiation
77
78							C - calls the C method with the value provided after instantiation
79
80							C - boolean - whether to give RTF Escapes a token type of C (true) or
81							C (false, default)
82
83							C - boolean - whether or not to allow some illegal but common RTF sequences found
84							'in the wild'. As of C<1.08>, this currently only allows control words with a numeric
85							argument to have a text field right after with no delimiter, like:
86
87							\control1Plaintext
88
89							but this may change in future releases. Defaults false.
90
91							C - boolean - ... the RTF specification tells you to strip whitespace
92							which comes after control words, and newlines at the beginning and ending of text areas.
93							One result of that is that you can't actually round-trip the output of the tokenization
94							process. Turning this on is probably a bad idea, but someone cared enough to send me a
95							patch for it, so why not. Defaults false, and you should leave it that way.
96
97							=cut
98
99							sub new {
100							# Get the real class name in the highly unlikely event we've been
101							# called from an object itself.
102	51			51	1	347806	my $proto = shift;
103	51		66			236	my $class = ref($proto) \|\| $proto;
104
105							# Read in the named parameters
106	51					128	my %config = @_;
107
108	51					216	my $self = {
109							_BUFFER => '', # Stores read but unparsed RTF
110							_BINARY_DATA => '', # Temporary data store if we're reading a \bin
111							_FILEHANDLE => '', # Stores the active filehandle
112							_INITIAL_READ => 512
113							, # How many characters to read by default. 512 recommended by RTF spec
114							_UC => 1, # Default number of characters to count for \uc
115							};
116
117	51					74	bless $self, $class;
118
119							# Call the data-reading convenience methods if required
120	51	100				174	if ( $config{'file'} ) {
		100
121	5					24	$self->read_file( $config{'file'} );
122							} elsif ( $config{'string'} ) {
123	38					74	$self->read_string( $config{'string'} );
124							}
125
126							# Set up final config stuff
127	51					103	$self->{_NOTE_ESCAPES} = $config{'note_escapes'};
128	51					61	$self->{_SLOPPY} = $config{'sloppy'};
129	51					95	$self->{_WHITESPACE} = $config{'preserve_whitespace'};
130
131	51					119	return $self;
132
133							}
134
135							=head2 read_string( STRING )
136
137							Appends the string to the tokenizer-object's buffer
138							(earlier versions would over-write the buffer -
139							this version does not).
140
141							=cut
142
143							sub read_string {
144	42			42	1	353	my $self = shift;
145	42					159	$self->{_BUFFER} .= shift;
146							}
147
148							=head2 read_file( \*FILEHANDLE )
149
150							=head2 read_file( $IO_File_object )
151
152							=head2 read_file( 'filename' )
153
154							Appends a chunk of data from the filehandle to the buffer,
155							and remembers the filehandle, so if you ask for a token,
156							and the buffer is empty, it'll try and read the next line
157							from the file (earlier versions would over-write the buffer -
158							this version does not).
159
160							This chunk is 500 characters, and then whatever is left until
161							the next occurrence of the IRS (a newline character in this case).
162							If for whatever reason, you want to change that number to something
163							else, use C.
164
165							=cut
166
167							sub read_file {
168
169	12			12	1	3931	my $self = shift;
170	12					15	my $file = shift;
171
172							# Accept a filehandle referenced via a GLOB
173	12	100				33	if ( ref $file eq 'GLOB' ) {
		100
		100
		100
174	2					13	$self->{_FILEHANDLE} = IO::File->new_from_fd( $file, '<' );
175							croak
176							"Couldn't create an IO::File object from the reference you specified"
177	2	100				253	unless $self->{_FILEHANDLE};
178
179							# Accept IO::File and subclassed objects
180							} elsif (
181							eval {
182	10					106	$file->isa('IO::File');
183							} )
184							{
185	1					15	$self->{_FILEHANDLE} = $file;
186
187							# This is undocumented, because you shouldn't use it. Don't rely on it.
188							} elsif ( ref $file eq 'IO::Scalar' ) {
189	4					10	$self->{_FILEHANDLE} = $file;
190
191							# If it's not a reference, assume it's a filename
192							} elsif ( !ref $file ) {
193	4					16	$self->{_FILEHANDLE} = IO::File->new("< $file");
194	4	100				314	croak "Couldn't open '$file' for reading" unless $self->{_FILEHANDLE};
195
196							# Complain if we get anything else
197							} else {
198	1					67	croak "You passed a reference to read_file of type " . ref($file) .
199							" which isn't an allowed type";
200							}
201
202							# Check what our line-endings seem to be, then set $self->{_IRS} accordingly.
203							# This also reads in the first few lines as a side effect.
204	9					32	$self->_line_endings;
205							}
206
207							# Reads a line from an IO:File'ish object
208							sub _get_line {
209	20			20		17	my $self = shift();
210
211							# Localize the input record separator before changing it so
212							# we don't mess up any other part of the application running
213							# us that relies on it
214	20					55	local $/ = $self->{_IRS};
215
216							# Read the line itself
217	20					140	my $line = $self->{_FILEHANDLE}->getline();
218	20	50				517	$self->{_BUFFER} .= $line if defined $line;
219							}
220
221							# Determine what kind of line-endings the file uses
222
223							sub _line_endings {
224	12			12		1169	my $self = shift();
225
226	12					10	my $temp_buffer;
227	12					38	$self->{_FILEHANDLE}->read( $temp_buffer, $self->{_INITIAL_READ} );
228
229							# This catches all allowed cases
230	12	50				186	if ( $temp_buffer =~ m/(\cM\cJ\|\cM\|\cJ)/ ) {
231	12					26	$self->{_IRS} = $1;
232
233	12	100				41	$self->{_RS} = "Macintosh" if $self->{_IRS} eq "\cM";
234	12	100				30	$self->{_RS} = "Windows" if $self->{_IRS} eq "\cM\cJ";
235	12	100				27	$self->{_RS} = "UNIX" if $self->{_IRS} eq "\cJ";
236
237							} else {
238	0					0	$self->{_RS} = "Unknown";
239							}
240
241							# Add back to main buffer
242	12					32	$self->{_BUFFER} .= $temp_buffer;
243
244							# Call C<_get_line> again so we're sure we're not only
245							# reading half a line
246	12					25	$self->_get_line;
247
248							}
249
250							=head2 get_token()
251
252							Returns the next token as a three-item list: 'type', 'argument', 'parameter'.
253							Token is one of: C, C, C, C or C.
254
255							If you turned on C, then you may get a forth item for
256							C tokens.
257
258							=over
259
260							=item C
261
262							'type' is set to 'text'. 'argument' is set to the text itself. 'parameter'
263							is left blank. NOTE: C<\{>, C<\}>, and C<\\> are all returned as control words,
264							rather than rendered as text for you, as are C<\_>, C<\-> and friends.
265
266							=item C
267
268							'type' is 'control'. 'argument' is the control word or control symbol.
269							'parameter' is the control word's parameter if it has one - this will
270							be numeric, EXCEPT when 'argument' is a literal ', in which case it
271							will be a two-letter hex string.
272
273							If you turned on C, you'll get a forth item,
274							which will be the whitespace or a defined empty string.
275
276							=item C
277
278							'type' is 'group'. If it's the beginning of an RTF group, then
279							'argument' is 1, else if it's the end, argument is 0. 'parameter'
280							is not set.
281
282							=item C
283
284							End of file reached. 'type' is 'eof'. 'argument' is 1. 'parameter' is
285							0.
286
287							=item C
288
289							If you specifically turn on this functionality, you'll get an
290							C type, which is identical to C, only, it's
291							only returned for escapes.
292
293							=back
294
295							=cut
296
297							# Define a regular expression that matches characters which are 'text' -
298							# that is, they're not a backspace, a scoping brace, or discardable
299							# whitespace.
300							my $non_text_standard_re = qr/[^\\{}\r\n]/;
301							my $non_text_whitespace_re = qr/[^\\{}]/;
302
303							sub get_token {
304	5902			5902	1	28661	my $self = shift;
305
306							# If the last token we returned was \bin, we'll now have a
307							# big chunk of binary data waiting for the user, so send that
308							# back
309	5902	100				6628	if ( $self->{_BINARY_DATA} ) {
310	2					3	my $data = $self->{_BINARY_DATA};
311	2					2	$self->{_BINARY_DATA} = '';
312	2					8	return ( 'text', $data, '' );
313							}
314
315							# We might have a cached token, and if we do, we'll want to
316							# return that first
317	5900	100				6291	if ( $self->{_PUT_TOKEN_CACHE_FLAG} ) {
318							# Take the value from the cache
319	3					2	my @return_values = @{ pop( @{ $self->{_PUT_TOKEN_CACHE} } ) };
	3					3
	3					8
320
321							# Update the flag
322	3					3	$self->{_PUT_TOKEN_CACHE_FLAG} = @{ $self->{_PUT_TOKEN_CACHE} };
	3					5
323
324							# Give the user the token back
325	3					23	return @return_values;
326							}
327
328							my $non_text_re =
329	5897	100				5685	$self->{_WHITESPACE} ? $non_text_whitespace_re : $non_text_standard_re;
330
331							# Our main parsing loop
332	5897					3306	while (1) {
333
334	5932					5648	my $start_character = substr( $self->{_BUFFER}, 0, 1, '' );
335
336							# Most likely to be text, so we check for that first
337	5932	100				14249	if ( $start_character =~ $non_text_re ) {
		100
		100
		100
		100
338	15			15		99	no warnings 'uninitialized';
	15					18
	15					16363
339
340							# We want to return text fields that have newlines in as one
341							# token, which requires a bit of work, as we read in one line
342							# at a time from out files...
343	224					162	my $temp_text = '';
344
345							READTEXT:
346
347							# Grab all the next 'text' characters
348	231					450	$self->{_BUFFER} =~ s/^([^\\{}]+)//s;
349	231	100				426	$temp_text .= $1 if defined $1;
350
351							# If the buffer is empty, try reading in some more, and
352							# then go back to READTEXT to keep going. Now, the clever
353							# thing would be to assume that if the buffer IS empty
354							# then there MUST be more to read, which is true if we
355							# have well-formed input. We're going to assume that the
356							# input could well be a little broken.
357	231	100	66			370	if ( ( !$self->{_BUFFER} ) && ( $self->{_FILEHANDLE} ) ) {
358	7					33	$self->_get_line;
359	7	50				31	goto READTEXT if $self->{_BUFFER};
360							}
361
362							# Make sure we're not including newlines in our output,
363							# as RTF spec says they're to be ignored...
364	224	100				264	unless ( $self->{_WHITESPACE} ) {
365	114					259	$temp_text =~ s/(\cM\cJ\|\cM\|\cJ)//g;
366							}
367
368							# Give the user a shiny token back
369	224					565	return ( 'text', $start_character . $temp_text, '' );
370
371							# Second most likely to be a control character
372							} elsif ( $start_character eq "\\" ) {
373	5192					4878	my @args = $self->_grab_control();
374
375							# If the control word was an escape, and the user
376							# asked to be told separately about those, this
377							# will be set, so return an 'escape'. Otherwise,
378							# return the control word as a 'control'
379	5190	100				5700	if ( $self->{_TEMP_ESCAPE_FLAG} ) {
380	2					2	$self->{_TEMP_ESCAPE_FLAG} = 0;
381	2					9	return ( 'escape', @args );
382							} else {
383	5188					10505	return ( 'control', @args );
384							}
385
386							# Probably a group then
387							} elsif ( $start_character eq '{' ) {
388	224					415	return ( 'group', 1, '' );
389							} elsif ( $start_character eq '}' ) {
390	220					390	return ( 'group', 0, '' );
391
392							# No start character? Either we're at the end of our input,
393							# or we need some new input
394							} elsif ( !$start_character ) {
395							# If we were read from a string, we're all done
396	38	100				112	return ( 'eof', 1, 0 ) unless $self->{_FILEHANDLE};
397
398							# If we were read from a file, try and get some more stuff
399							# in to the buffer, or return the 'eof' character
400	1	50				6	return ( 'eof', 1, 0 ) if $self->{_FILEHANDLE}->eof;
401	1					10	$self->_get_line;
402	1	50				3	return ( 'eof', 1, 0 ) unless $self->{_BUFFER};
403							}
404							}
405							}
406
407							=head2 get_all_tokens
408
409							As per C, but keeps calling C until it hits EOF. Returns
410							a list of arrayrefs.
411
412							=cut
413
414							sub get_all_tokens {
415	34			34	1	26	my $self = shift;
416	34					29	my @tokens;
417
418	34					27	while (1) {
419	5822					5747	my $token = [ $self->get_token() ];
420	5822					4690	push( @tokens, $token );
421	5822	100				7693	last if $token->[0] eq 'eof';
422							}
423
424	34					680	return @tokens;
425							}
426
427							=head2 put_token( type, token, argument )
428
429							Adds an item to the token cache, so that the next time you
430							call get_token, the arguments you passed here will be returned.
431							We don't check any of the values, so use this carefully. This
432							is on a first in last out basis.
433
434							=cut
435
436							sub put_token {
437	3			3	1	423	my $self = shift;
438
439	3					2	push( @{ $self->{_PUT_TOKEN_CACHE} }, [@_] );
	3					6
440
441							# No need to set this to the real value of the token cache, as
442							# it'll get set properly when we try and read a cached token.
443	3					5	$self->{_PUT_TOKEN_CACHE_FLAG} = 1;
444							}
445
446							=head2 sloppy( [bool] )
447
448							Decides whether we allow some types of broken RTF. See C's docs
449							for a little more explanation about this. Pass it 1 to turn it on, 0 to
450							turn it off. This will always return undef.
451
452							=cut
453
454							sub sloppy {
455	0			0	1	0	my $self = shift;
456	0					0	my $bool = shift;
457
458	0	0				0	if ($bool) {
459	0					0	$self->{_SLOPPY} = 1;
460							} else {
461	0					0	$self->{_SLOPPY} = 0;
462							}
463
464	0					0	return;
465							}
466
467							=head2 initial_read( [number] )
468
469							Don't call this unless you actually have a good reason. When
470							the Tokenizer reads from a file, it first attempts to work out
471							what the correct input record-seperator should be, by reading
472							some characters from the file handle. This value starts off
473							as 512, which is twice the amount of characters that version 1.7
474							of the RTF specification says you should go before including a
475							line feed if you're writing RTF.
476
477							Called with no argument, this returns the current value of the
478							number of characters we're going to read. Called with a numeric
479							argument, it sets the number of characters we'll read.
480
481							You really don't need to use this method.
482
483							=cut
484
485							sub initial_read {
486	4			4	1	322	my $self = shift;
487	4	100				9	if (@_) { $self->{_INITIAL_READ} = shift }
	1					2
488	4					11	return $self->{_INITIAL_READ};
489							}
490
491							=head2 debug( [number] )
492
493							Returns (non-destructively) the next 50 characters from the buffer,
494							OR, the number of characters you specify. Printing these to STDERR,
495							causing fatal errors, and the like, are left as an exercise to the
496							programmer.
497
498							Note the part about 'from the buffer'. It really means that, which means
499							if there's nothing in the buffer, but still stuff we're reading from a
500							file it won't be shown. Chances are, if you're using this function, you're
501							debugging. There's an internal method called C<_get_line>, which is called
502							without arguments (C<$self->_get_line()>) that's how we get more stuff into
503							the buffer when we're reading from filehandles. There's no guarentee that'll
504							stay, or will always work that way, but, if you're debugging, that shouldn't
505							matter.
506
507							=cut
508
509							sub debug {
510	4			4	1	693	my $self = shift;
511	4		100			15	my $number = shift \|\| 50;
512
513	4					17	return substr( $self->{_BUFFER}, 0, $number );
514							}
515
516							# Work with control characters
517
518							# It's ugly to repeat myself here, but I believe having two literal re's
519							# here is going to offer a small performance benefit over a regex with
520							# a scalar in it.
521							my $control_word_standard_re = qr/
522							^([a-z]{1,32}) # Lowercase word
523							(-?\d+)? # Optional signed number
524							(?:\s\|(?=[^a-z0-9])) # Either whitespace, which we gobble or a
525							# non alpha-numeric, which we leave
526							/ix;
527							my $control_word_whitespace_re = qr/
528							^([a-z]{1,32}) # Lowercase word
529							(-?\d+)? # Optional signed number
530							(\s*)? # Capture trailing whitespace
531							/ix;
532
533							sub _grab_control {
534	5192			5192		3394	my $self = shift;
535
536							my $whitespace_re =
537	5192	100				5020	$self->{_WHITESPACE} ? $control_word_whitespace_re :
538							$control_word_standard_re;
539
540							# Check for a star here, as it simplifies our regex below,
541							# and it occurs pretty often
542	5192	100	0			19139	if ( $self->{_BUFFER} =~ s/^\*// ) {
		100
		100
		100
		100
		50
		50
		50
		0
543	59					92	return ( '*', '' );
544
545							# A standard control word:
546							} elsif ( $self->{_BUFFER} =~ s/$whitespace_re// ) {
547							# Return the control word, unless it's a \bin
548	5117					3568	my $param = '';
549	5117	100				7615	$param = $2 if defined($2);
550
551	5117					3014	my @whitespace;
552	5117	100				5917	if ( $self->{_WHITESPACE} ) {
553	2549	50				3530	push( @whitespace, defined $3 ? $3 : '' );
554							}
555
556	5117	100				14122	return ( $1, $param, @whitespace ) unless $1 eq 'bin';
557
558							# Pre-grab the binary data, and return the control word
559	4					4	my $byte_count = $2;
560	4					8	$self->_grab_bin($byte_count);
561	2					5	return ( 'bin', $byte_count, @whitespace );
562
563							# hex-dec character (escape)
564							} elsif ( $self->{_BUFFER} =~ s/^'([0-9a-f]{2})//i ) {
565	9	100				21	$self->{_TEMP_ESCAPE_FLAG}++ if $self->{_NOTE_ESCAPES};
566	9					24	return ( "'", $1 );
567
568							# Control symbol (escape)
569							} elsif ( $self->{_BUFFER} =~ s/^([-_~:\|{}'\\])// ) {
570	1	50				4	$self->{_TEMP_ESCAPE_FLAG}++ if $self->{_NOTE_ESCAPES};
571	1					3	return ( $1, '' );
572
573							# Escaped whitespace (ew, but allowed)
574							} elsif ( $self->{_BUFFER} =~ s/^[\r\n]// ) {
575	3					8	return ( 'par', '' );
576
577							# Escaped tab (ew, but allowed)
578							} elsif ( $self->{_BUFFER} =~ s/^\t// ) {
579	0					0	return ( 'tab', '' );
580
581							# Escaped semi-colon - this is WRONG
582							} elsif ( $self->{_BUFFER} =~ s/^\;// ) {
583	0					0	carp(
584							"Your RTF contains an escaped semi-colon. This isn't allowed, but we'll let you have it back as a literal for now. See the RTF spec."
585							);
586	0					0	return ( ';', '' );
587
588							# Unicode characters
589							} elsif ( $self->{_BUFFER} =~ s/^u(\d+)// ) {
590	3					10	return ( 'u', $1 );
591
592							# Allow incorrect control words
593							} elsif ( ( $self->{_SLOPPY} ) &&
594							( $self->{_BUFFER} =~ s/^([a-z]{1,32})(-?\d+)//i ) )
595							{
596	0					0	my $param = '';
597	0	0				0	$param = $2 if defined($2);
598
599	0					0	return ( $1, $param );
600							}
601
602							# If we get here, something has gone wrong. First we'll create
603							# a human readable section of RTF to show the user.
604	0					0	my $die_string = substr( $self->{_BUFFER}, 0, 50 );
605	0					0	$die_string =~ s/\r/[R]/g;
606
607							# Get angry with the user
608	0					0	carp
609							"Your RTF is broken, trying to recover to nearest group from '\\$die_string'\n";
610	0					0	carp
611							"Chances are you have some RTF like \\control1plaintext. Which is illegal. But you can allow that by passing the 'sloppy' attribute to new() or using the sloppy() method. Please also write to and abuse the developer of the software which wrote your RTF :-)\n";
612
613							# Kill everything until the next group
614	0					0	$self->{_BUFFER} =~ s/^.+?([}{])/$1/;
615	0					0	return ( '', '' );
616							}
617
618							# A first stab at grabbing binary data
619							sub _grab_bin {
620	4			4		3	my $self = shift;
621	4					4	my $bytes = shift;
622
623							# If the buffer is too small, attempt to read in some more data...
624	4					10	while ( length( $self->{_BUFFER} ) < $bytes ) {
625
626							# If there's no filehandle, or the one we have is eof, complain
627	2	50	66			6	if ( !$self->{_FILEHANDLE} \|\| $self->{_FILEHANDLE}->eof ) {
628							croak "\\bin is asking for $bytes characters, but there are only " .
629	2					267	length( $self->{_BUFFER} ) . " left.";
630							}
631
632							# Try and read in more data
633	0					0	$self->_get_line;
634							}
635
636							# Return the right number of characters
637	2					5	$self->{_BINARY_DATA} = substr( $self->{_BUFFER}, 0, $bytes, '' );
638							}
639
640							=head1 NOTES
641
642							To avoid intrusively deep parsing, if an alternative ASCII
643							representation is available for a Unicode entity, and that
644							ASCII representation contains C<{>, or C<\>, by themselves, things
645							will go I. But I'm not convinced either of those is
646							allowed by the spec.
647
648							=head1 AUTHOR
649
650							Pete Sergeant -- C
651
652							=head1 LICENSE
653
654							Copyright B.
655
656							This program is free software; you can redistribute it and/or modify it under
657							the same terms as Perl itself.
658
659							=cut
660
661							1;