File Coverage

blib/lib/RTF/Tokenizer.pm

Criterion	Covered	Total	%
statement	145	150	96.6
branch	80	90	88.8
condition	11	14	78.5
subroutine	19	19	100.0
pod	9	9	100.0
total	264	282	93.6

line	stmt	bran	cond	sub	pod	time	code
1							#!perl
2							# RTF::Tokenizer - Peter Sergeant
3
4							=head1 NAME
5
6							RTF::Tokenizer - Tokenize RTF
7
8							=head1 VERSION
9
10							version 1.18
11
12							=head1 DESCRIPTION
13
14							Tokenizes RTF
15
16							=head1 SYNOPSIS
17
18							use RTF::Tokenizer;
19
20							# Create a tokenizer object
21							my $tokenizer = RTF::Tokenizer->new();
22
23							my $tokenizer = RTF::Tokenizer->new( string => '{\rtf1}' );
24							my $tokenizer = RTF::Tokenizer->new( string => '{\rtf1}', note_escapes => 1 );
25
26							my $tokenizer = RTF::Tokenizer->new( file => \*STDIN );
27							my $tokenizer = RTF::Tokenizer->new( file => 'lala.rtf' );
28							my $tokenizer = RTF::Tokenizer->new( file => 'lala.rtf', sloppy => 1 );
29
30							# Populate it from a file
31							$tokenizer->read_file('filename.txt');
32
33							# Or a file handle
34							$tokenizer->read_file( \*STDIN );
35
36							# Or a string
37							$tokenizer->read_string( '{\*\some rtf}' );
38
39							# Get the first token
40							my ( $token_type, $argument, $parameter ) = $tokenizer->get_token();
41
42							# Ooops, that was wrong...
43							$tokenizer->put_token( 'control', 'b', 1 );
44
45							# Let's have the lot...
46							my @tokens = $tokenizer->get_all_tokens();
47
48							=head1 INTRODUCTION
49
50							This documentation assumes some basic knowledge of RTF.
51							If you lack that, go read The_RTF_Cookbook:
52
53							L
54
55							=cut
56
57							require 5;
58
59							package RTF::Tokenizer;
60							$RTF::Tokenizer::VERSION = '1.18';
61	15			15		265791	use vars qw($VERSION);
	15					36
	15					962
62
63	15			15		84	use strict;
	15					30
	15					532
64	15			15		83	use warnings;
	15					27
	15					14841
65	15			15		100	use Carp;
	15					25
	15					1700
66	15			15		15296	use IO::File;
	15					203910
	15					14676
67
68							=head1 METHODS
69
70							=head2 new()
71
72							Instantiates an RTF::Tokenizer object.
73
74							B:
75
76							C - calls the C method with the value provided after instantiation
77
78							C - calls the C method with the value provided after instantiation
79
80							C - boolean - whether to give RTF Escapes a token type of C (true) or C (false)
81
82							C - boolean - whether or not to allow some illegal but common RTF sequences found 'in the wild'. As of C<1.08>, this currently only allows
83							control words with a numeric argument to have a text field right after with
84							no delimiter, like:
85
86							\control1Plaintext
87
88							but this may change in future releases.
89
90							=cut
91
92							sub new {
93							# Get the real class name in the highly unlikely event we've been
94							# called from an object itself.
95	37			37	1	564655	my $proto = shift;
96	37		66			236	my $class = ref($proto) \|\| $proto;
97
98							# Read in the named parameters
99	37					126	my %config = @_;
100
101	37					250	my $self = {
102							_BUFFER => '', # Stores read but unparsed RTF
103							_BINARY_DATA => '', # Temporary data store if we're reading a \bin
104							_FILEHANDLE => '', # Stores the active filehandle
105							_INITIAL_READ => 512
106							, # How many characters to read by default. 512 recommended by RTF spec
107							_UC => 1, # Default number of characters to count for \uc
108							};
109
110	37					100	bless $self, $class;
111
112							# Call the data-reading convenience methods if required
113	37	100				192	if ( $config{'file'} ) {
		100
114	5					39	$self->read_file( $config{'file'} );
115							} elsif ( $config{'string'} ) {
116	21					71	$self->read_string( $config{'string'} );
117							}
118
119							# Set up final config stuff
120	37					136	$self->{_NOTE_ESCAPES} = $config{'note_escapes'};
121	37					87	$self->{_SLOPPY} = $config{'sloppy'};
122
123	37					134	return $self;
124
125							}
126
127							=head2 read_string( STRING )
128
129							Appends the string to the tokenizer-object's buffer
130							(earlier versions would over-write the buffer -
131							this version does not).
132
133							=cut
134
135							sub read_string {
136	31			31	1	1615	my $self = shift;
137	31					1031	$self->{_BUFFER} .= shift;
138							}
139
140							=head2 read_file( \*FILEHANDLE )
141
142							=head2 read_file( $IO_File_object )
143
144							=head2 read_file( 'filename' )
145
146							Appends a chunk of data from the filehandle to the buffer,
147							and remembers the filehandle, so if you ask for a token,
148							and the buffer is empty, it'll try and read the next line
149							from the file (earlier versions would over-write the buffer -
150							this version does not).
151
152							This chunk is 500 characters, and then whatever is left until
153							the next occurrence of the IRS (a newline character in this case).
154							If for whatever reason, you want to change that number to something
155							else, use C.
156
157							=cut
158
159							sub read_file {
160
161	12			12	1	6352	my $self = shift;
162	12					18	my $file = shift;
163
164							# Accept a filehandle referenced via a GLOB
165	12	100				39	if ( ref $file eq 'GLOB' ) {
		100
		100
		100
166	2					18	$self->{_FILEHANDLE} = IO::File->new_from_fd( $file, '<' );
167	2	100				324	croak
168							"Couldn't create an IO::File object from the reference you specified"
169							unless $self->{_FILEHANDLE};
170
171							# Accept IO::File and subclassed objects
172							} elsif (
173							eval {
174	10					129	$file->isa('IO::File');
175							} )
176							{
177	1					2	$self->{_FILEHANDLE} = $file;
178
179							# This is undocumented, because you shouldn't use it. Don't rely on it.
180							} elsif ( ref $file eq 'IO::Scalar' ) {
181	4					17	$self->{_FILEHANDLE} = $file;
182
183							# If it's not a reference, assume it's a filename
184							} elsif ( !ref $file ) {
185	4					24	$self->{_FILEHANDLE} = IO::File->new("< $file");
186	4	100				413	croak "Couldn't open '$file' for reading" unless $self->{_FILEHANDLE};
187
188							# Complain if we get anything else
189							} else {
190	1					92	croak "You passed a reference to read_file of type " . ref($file) .
191							" which isn't an allowed type";
192							}
193
194							# Check what our line-endings seem to be, then set $self->{_IRS} accordingly.
195							# This also reads in the first few lines as a side effect.
196	9					44	$self->_line_endings;
197							}
198
199							# Reads a line from an IO:File'ish object
200							sub _get_line {
201	20			20		36	my $self = shift();
202
203							# Localize the input record separator before changing it so
204							# we don't mess up any other part of the application running
205							# us that relies on it
206	20					73	local $/ = $self->{_IRS};
207
208							# Read the line itself
209	20					198	my $line = $self->{_FILEHANDLE}->getline();
210	20	50				735	$self->{_BUFFER} .= $line if defined $line;
211							}
212
213							# Determine what kind of line-endings the file uses
214
215							sub _line_endings {
216	12			12		1552	my $self = shift();
217
218	12					23	my $temp_buffer;
219	12					59	$self->{_FILEHANDLE}->read( $temp_buffer, $self->{_INITIAL_READ} );
220
221							# This catches all allowed cases
222	12	50				264	if ( $temp_buffer =~ m/(\cM\cJ\|\cM\|\cJ)/ ) {
223	12					35	$self->{_IRS} = $1;
224
225	12	100				47	$self->{_RS} = "Macintosh" if $self->{_IRS} eq "\cM";
226	12	100				51	$self->{_RS} = "Windows" if $self->{_IRS} eq "\cM\cJ";
227	12	100				43	$self->{_RS} = "UNIX" if $self->{_IRS} eq "\cJ";
228
229							} else {
230	0					0	$self->{_RS} = "Unknown";
231							}
232
233							# Add back to main buffer
234	12					35	$self->{_BUFFER} .= $temp_buffer;
235
236							# Call C<_get_line> again so we're sure we're not only
237							# reading half a line
238	12					36	$self->_get_line;
239
240							}
241
242							=head2 get_token()
243
244							Returns the next token as a three-item list: 'type', 'argument', 'parameter'.
245							Token is one of: C, C, C, C or C.
246
247							=over
248
249							=item C
250
251							'type' is set to 'text'. 'argument' is set to the text itself. 'parameter'
252							is left blank. NOTE: C<\{>, C<\}>, and C<\\> are all returned as control words,
253							rather than rendered as text for you, as are C<\_>, C<\-> and friends.
254
255							=item C
256
257							'type' is 'control'. 'argument' is the control word or control symbol.
258							'parameter' is the control word's parameter if it has one - this will
259							be numeric, EXCEPT when 'argument' is a literal ', in which case it
260							will be a two-letter hex string.
261
262							=item C
263
264							'type' is 'group'. If it's the beginning of an RTF group, then
265							'argument' is 1, else if it's the end, argument is 0. 'parameter'
266							is not set.
267
268							=item C
269
270							End of file reached. 'type' is 'eof'. 'argument' is 1. 'parameter' is
271							0.
272
273							=item C
274
275							If you specifically turn on this functionality, you'll get an
276							C type, which is identical to C, only, it's
277							only returned for escapes.
278
279							=back
280
281							=cut
282
283							sub get_token {
284	3006			3006	1	47524	my $self = shift;
285
286							# If the last token we returned was \bin, we'll now have a
287							# big chunk of binary data waiting for the user, so send that
288							# back
289	3006	100				8044	if ( $self->{_BINARY_DATA} ) {
290	2					13	my $data = $self->{_BINARY_DATA};
291	2					5	$self->{_BINARY_DATA} = '';
292	2					15	return ( 'text', $data, '' );
293							}
294
295							# We might have a cached token, and if we do, we'll want to
296							# return that first
297	3004	100				6837	if ( $self->{_PUT_TOKEN_CACHE_FLAG} ) {
298							# Take the value from the cache
299	3					4	my @return_values = @{ pop( @{ $self->{_PUT_TOKEN_CACHE} } ) };
	3					4
	3					10
300
301							# Update the flag
302	3					5	$self->{_PUT_TOKEN_CACHE_FLAG} = @{ $self->{_PUT_TOKEN_CACHE} };
	3					5
303
304							# Give the user the token back
305	3					18	return @return_values;
306							}
307
308							# Our main parsing loop
309	3001					3499	while (1) {
310
311	3036					5392	my $start_character = substr( $self->{_BUFFER}, 0, 1, '' );
312
313							# Most likely to be text, so we check for that first
314	3036	100				9564	if ( $start_character =~ /[^\\{}\r\n]/ ) {
		100
		100
		100
		100
315	15			15		150	no warnings 'uninitialized';
	15					35
	15					28075
316
317							# We want to return text fields that have newlines in as one
318							# token, which requires a bit of work, as we read in one line
319							# at a time from out files...
320	124					166	my $temp_text = '';
321
322	131					426	READTEXT:
323
324							# Grab all the next 'text' characters
325							$self->{_BUFFER} =~ s/^([^\\{}]+)//s;
326	131	100				401	$temp_text .= $1 if defined $1;
327
328							# If the buffer is empty, try reading in some more, and
329							# then go back to READTEXT to keep going. Now, the clever
330							# thing would be to assume that if the buffer IS empty
331							# then there MUST be more to read, which is true if we
332							# have well-formed input. We're going to assume that the
333							# input could well be a little broken.
334	131	100	100			431	if ( ( !$self->{_BUFFER} ) && ( $self->{_FILEHANDLE} ) ) {
335	7					73	$self->_get_line;
336	7	50				43	goto READTEXT if $self->{_BUFFER};
337							}
338
339							# Make sure we're not including newlines in our output,
340							# as RTF spec says they're to be ignored...
341	124					581	$temp_text =~ s/(\cM\cJ\|\cM\|\cJ)//g;
342
343							# Give the user a shiny token back
344	124					613	return ( 'text', $start_character . $temp_text, '' );
345
346							# Second most likely to be a control character
347							} elsif ( $start_character eq "\\" ) {
348	2619					4572	my @args = $self->_grab_control();
349
350							# If the control word was an escape, and the user
351							# asked to be told separately about those, this
352							# will be set, so return an 'escape'. Otherwise,
353							# return the control word as a 'control'
354	2617	100				5334	if ( $self->{_TEMP_ESCAPE_FLAG} ) {
355	2					3	$self->{_TEMP_ESCAPE_FLAG} = 0;
356	2					14	return ( 'escape', @args );
357							} else {
358	2615					12897	return ( 'control', @args );
359							}
360
361							# Probably a group then
362							} elsif ( $start_character eq '{' ) {
363	118					446	return ( 'group', 1, '' );
364							} elsif ( $start_character eq '}' ) {
365	120					427	return ( 'group', 0, '' );
366
367							# No start character? Either we're at the end of our input,
368							# or we need some new input
369							} elsif ( !$start_character ) {
370							# If we were read from a string, we're all done
371	21	100				119	return ( 'eof', 1, 0 ) unless $self->{_FILEHANDLE};
372
373							# If we were read from a file, try and get some more stuff
374							# in to the buffer, or return the 'eof' character
375	1	50				7	return ( 'eof', 1, 0 ) if $self->{_FILEHANDLE}->eof;
376	1					9	$self->_get_line;
377	1	50				3	return ( 'eof', 1, 0 ) unless $self->{_BUFFER};
378							}
379							}
380							}
381
382							=head2 get_all_tokens
383
384							As per C, but keeps calling C until it hits EOF. Returns
385							a list of arrayrefs.
386
387							=cut
388
389							sub get_all_tokens {
390	17			17	1	39	my $self = shift;
391	17					22	my @tokens;
392
393	17					90	while (1) {
394	2903					5560	my $token = [ $self->get_token() ];
395	2903					5577	push( @tokens, $token );
396	2903	100				6678	last if $token->[0] eq 'eof';
397							}
398
399	17					681	return @tokens;
400							}
401
402							=head2 put_token( type, token, argument )
403
404							Adds an item to the token cache, so that the next time you
405							call get_token, the arguments you passed here will be returned.
406							We don't check any of the values, so use this carefully. This
407							is on a first in last out basis.
408
409							=cut
410
411							sub put_token {
412
413	3			3	1	451	my $self = shift;
414	3					7	my ( $type, $token, $argument ) = ( shift, shift, shift );
415
416	3					4	push( @{ $self->{_PUT_TOKEN_CACHE} }, [ $type, $token, $argument ] );
	3					10
417
418							# No need to set this to the real value of the token cache, as
419							# it'll get set properly when we try and read a cached token.
420	3					36	$self->{_PUT_TOKEN_CACHE_FLAG} = 1;
421							}
422
423							=head2 sloppy( [bool] )
424
425							Decides whether we allow some types of broken RTF. See C's docs
426							for a little more explanation about this. Pass it 1 to turn it on, 0 to
427							turn it off. This will always return undef.
428
429							=cut
430
431							sub sloppy {
432	2			2	1	1156	my $self = shift;
433	2					4	my $bool = shift;
434
435	2	100				8	if ($bool) {
436	1					4	$self->{_SLOPPY} = 1;
437							} else {
438	1					2	$self->{_SLOPPY} = 0;
439							}
440
441	2					39	return;
442							}
443
444							=head2 initial_read( [number] )
445
446							Don't call this unless you actually have a good reason. When
447							the Tokenizer reads from a file, it first attempts to work out
448							what the correct input record-seperator should be, by reading
449							some characters from the file handle. This value starts off
450							as 512, which is twice the amount of characters that version 1.7
451							of the RTF specification says you should go before including a
452							line feed if you're writing RTF.
453
454							Called with no argument, this returns the current value of the
455							number of characters we're going to read. Called with a numeric
456							argument, it sets the number of characters we'll read.
457
458							You really don't need to use this method.
459
460							=cut
461
462							sub initial_read {
463	4			4	1	601	my $self = shift;
464	4	100				13	if (@_) { $self->{_INITIAL_READ} = shift }
	1					3
465	4					15	return $self->{_INITIAL_READ};
466							}
467
468							=head2 debug( [number] )
469
470							Returns (non-destructively) the next 50 characters from the buffer,
471							OR, the number of characters you specify. Printing these to STDERR,
472							causing fatal errors, and the like, are left as an exercise to the
473							programmer.
474
475							Note the part about 'from the buffer'. It really means that, which means
476							if there's nothing in the buffer, but still stuff we're reading from a
477							file it won't be shown. Chances are, if you're using this function, you're
478							debugging. There's an internal method called C<_get_line>, which is called
479							without arguments (C<$self->_get_line()>) that's how we get more stuff into
480							the buffer when we're reading from filehandles. There's no guarentee that'll
481							stay, or will always work that way, but, if you're debugging, that shouldn't
482							matter.
483
484							=cut
485
486							sub debug {
487	4			4	1	794	my $self = shift;
488	4		100			19	my $number = shift \|\| 50;
489
490	4					32	return substr( $self->{_BUFFER}, 0, $number );
491							}
492
493							# Work with control characters
494
495							sub _grab_control {
496	2619			2619		11162	my $self = shift;
497
498							# Check for a star here, as it simplifies our regex below,
499							# and it occurs pretty often
500	2619	100	66			14366	if ( $self->{_BUFFER} =~ s/^\*// ) {
		100
		100
		100
		100
		50
		50
		100
		100
501	30					78	return ( '*', '' );
502
503							# A standard control word:
504							} elsif (
505							$self->{_BUFFER} =~ s/
506							^([a-z]{1,32}) # Lowercase word
507							(-?\d+)? # Optional signed number
508							(?:\s\|(?=[^a-z0-9])) # Either whitespace, which we gobble or a
509							# non alpha-numeric, which we leave
510							//ix
511							)
512							{
513							# Return the control word, unless it's a \bin
514	2569					3258	my $param = '';
515	2569	100				6862	$param = $2 if defined($2);
516	2569	100				13163	return ( $1, $param ) unless $1 eq 'bin';
517
518							# Pre-grab the binary data, and return the control word
519	4					11	$self->_grab_bin($2);
520	2					7	return ( 'bin', $2 );
521
522							# hex-dec character (escape)
523							} elsif ( $self->{_BUFFER} =~ s/^'([0-9a-f]{2})//i ) {
524	7	100				32	$self->{_TEMP_ESCAPE_FLAG}++ if $self->{_NOTE_ESCAPES};
525	7					33	return ( "'", $1 );
526
527							# Control symbol (escape)
528							} elsif ( $self->{_BUFFER} =~ s/^([-_~:\|{}'\\])// ) {
529	1	50				4	$self->{_TEMP_ESCAPE_FLAG}++ if $self->{_NOTE_ESCAPES};
530	1					5	return ( $1, '' );
531
532							# Escaped whitespace (ew, but allowed)
533							} elsif ( $self->{_BUFFER} =~ s/^[\r\n]// ) {
534	3					9	return ( 'par', '' );
535
536							# Escaped tab (ew, but allowed)
537							} elsif ( $self->{_BUFFER} =~ s/^\t// ) {
538	0					0	return ( 'tab', '' );
539
540							# Escaped semi-colon - this is WRONG
541							} elsif ( $self->{_BUFFER} =~ s/^\;// ) {
542	0					0	carp(
543							"Your RTF contains an escaped semi-colon. This isn't allowed, but we'll let you have it back as a literal for now. See the RTF spec."
544							);
545	0					0	return ( ';', '' );
546
547							# Unicode characters
548							} elsif ( $self->{_BUFFER} =~ s/^u(\d+)// ) {
549	4					17	return ( 'u', $1 );
550
551							# Allow incorrect control words
552							} elsif ( ( $self->{_SLOPPY} ) &&
553							( $self->{_BUFFER} =~ s/^([a-z]{1,32})(-?\d+)//i ) )
554							{
555	2					5	my $param = '';
556	2	50				10	$param = $2 if defined($2);
557
558	2					35	return ( $1, $param );
559							}
560
561							# If we get here, something has gone wrong. First we'll create
562							# a human readable section of RTF to show the user.
563	3					7	my $die_string = substr( $self->{_BUFFER}, 0, 50 );
564	3					8	$die_string =~ s/\r/[R]/g;
565
566							# Get angry with the user
567	3					76	carp
568							"Your RTF is broken, trying to recover to nearest group from '\\$die_string'\n";
569	3					2004	carp
570							"Chances are you have some RTF like \\control1plaintext. Which is illegal. But you can allow that by passing the 'sloppy' attribute to new() or using the sloppy() method. Please also write to and abuse the developer of the software which wrote your RTF :-)\n";
571
572							# Kill everything until the next group
573	3					2377	$self->{_BUFFER} =~ s/^.+?([}{])/$1/;
574	3					11	return ( '', '' );
575							}
576
577							# A first stab at grabbing binary data
578							sub _grab_bin {
579	4			4		5	my $self = shift;
580	4					5	my $bytes = shift;
581
582							# If the buffer is too small, attempt to read in some more data...
583	4					17	while ( length( $self->{_BUFFER} ) < $bytes ) {
584
585							# If there's no filehandle, or the one we have is eof, complain
586	2	50	66			7	if ( !$self->{_FILEHANDLE} \|\| $self->{_FILEHANDLE}->eof ) {
587	2					283	croak "\\bin is asking for $bytes characters, but there are only " .
588							length( $self->{_BUFFER} ) . " left.";
589							}
590
591							# Try and read in more data
592	0					0	$self->_get_line;
593							}
594
595							# Return the right number of characters
596	2					30	$self->{_BINARY_DATA} = substr( $self->{_BUFFER}, 0, $bytes, '' );
597							}
598
599							=head1 NOTES
600
601							To avoid intrusively deep parsing, if an alternative ASCII
602							representation is available for a Unicode entity, and that
603							ASCII representation contains C<{>, or C<\>, by themselves, things
604							will go I. But I'm not convinced either of those is
605							allowed by the spec.
606
607							=head1 AUTHOR
608
609							Pete Sergeant -- C
610
611							=head1 LICENSE
612
613							Copyright B.
614
615							This program is free software; you can redistribute it and/or modify it under
616							the same terms as Perl itself.
617
618							=cut
619
620							1;