File Coverage

encengine.c

Criterion	Covered	Total	%
statement	38	42	90.4
branch	26	34	76.4
condition			n/a
subroutine			n/a
pod			n/a
total	64	76	84.2

line	stmt	bran	code
1			/*
2			Data structures for encoding transformations.
3
4			Perl works internally in either a native 'byte' encoding or
5			in UTF-8 encoded Unicode. We have no immediate need for a "wchar_t"
6			representation. When we do we can use utf8_to_uv().
7
8			Most character encodings are either simple byte mappings or
9			variable length multi-byte encodings. UTF-8 can be viewed as a
10			rather extreme case of the latter.
11
12			So to solve an important part of perl's encode needs we need to solve the
13			"multi-byte -> multi-byte" case. The simple byte forms are then just degenerate
14			case. (Where one of multi-bytes will usually be UTF-8.)
15
16			The other type of encoding is a shift encoding where a prefix sequence
17			determines what subsequent bytes mean. Such encodings have state.
18
19			We also need to handle case where a character in one encoding has to be
20			represented as multiple characters in the other. e.g. letter+diacritic.
21
22			The process can be considered as pseudo perl:
23
24			my $dst = '';
25			while (length($src))
26			{
27			my $size = $count($src);
28			my $in_seq = substr($src,0,$size,'');
29			my $out_seq = $s2d_hash{$in_seq};
30			if (defined $out_seq)
31			{
32			$dst .= $out_seq;
33			}
34			else
35			{
36			# an error condition
37			}
38			}
39			return $dst;
40
41			That has the following components:
42			&src_count - a "rule" for how many bytes make up the next character in the
43			source.
44			%s2d_hash - a mapping from input sequences to output sequences
45
46			The problem with that scheme is that it does not allow the output
47			character repertoire to affect the characters considered from the
48			input.
49
50			So we use a "trie" representation which can also be considered
51			a state machine:
52
53			my $dst = '';
54			my $seq = \@s2d_seq;
55			my $next = \@s2d_next;
56			while (length($src))
57			{
58			my $byte = $substr($src,0,1,'');
59			my $out_seq = $seq->[$byte];
60			if (defined $out_seq)
61			{
62			$dst .= $out_seq;
63			}
64			else
65			{
66			# an error condition
67			}
68			($next,$seq) = @$next->[$byte] if $next;
69			}
70			return $dst;
71
72			There is now a pair of data structures to represent everything.
73			It is valid for output sequence at a particular point to
74			be defined but zero length, that just means "don't know yet".
75			For the single byte case there is no 'next' so new tables will be the same as
76			the original tables. For a multi-byte case a prefix byte will flip to the tables
77			for the next page (adding nothing to the output), then the tables for the page
78			will provide the actual output and set tables back to original base page.
79
80			This scheme can also handle shift encodings.
81
82			A slight enhancement to the scheme also allows for look-ahead - if
83			we add a flag to re-add the removed byte to the source we could handle
84			a" -> U+00E4 (LATIN SMALL LETTER A WITH DIAERESIS)
85			ab -> a (and take b back please)
86
87			*/
88
89			#define PERL_NO_GET_CONTEXT
90			#include <EXTERN.h>
91			#include <perl.h>
92			#include "encode.h"
93
94			int
95	62145		do_encode(const encpage_t * enc, const U8 * src, STRLEN * slen, U8 * dst,
96			STRLEN dlen, STRLEN * dout, int approx, const U8 *term, STRLEN tlen)
97			{
98	62145		const U8 *s = src;
99	62145		const U8 *send = s + *slen;
100	62145		const U8 *last = s;
101	62145		U8 *d = dst;
102	62145		U8 *dend = d + dlen, *dlast = d;
103	62145		int code = 0;
104	7289572	100	while (s < send) {
105	7230861		const encpage_t *e = enc;
106	7230861		U8 byte = *s;
107	46297679	100	while (byte > e->max)
108	39066818		e++;
109	14458288	100	if (byte >= e->min && e->slen && (approx || !(e->slen & 0x80))) {
		100
		100
		50
110	7228681		const U8 *cend = s + (e->slen & 0x7f);
111	7228681	100	if (cend <= send) {
112			STRLEN n;
113	7228570	100	if ((n = e->dlen)) {
114	5230260		const U8 *out = e->seq + n * (byte - e->min);
115	5230260		U8 *oend = d + n;
116	5230260	50	if (dst) {
117	5230260	100	if (oend <= dend) {
118	12340180	100	while (d < oend)
119	7111063		*d++ = *out++;
120			}
121			else {
122			/* Out of space */
123	1143		code = ENCODE_NOSPACE;
124	1143		break;
125			}
126			}
127			else
128	0		d = oend;
129			}
130	7227427		enc = e->next;
131	7227427		s++;
132	7227427	100	if (s == cend) {
133	5229117	100	if (approx && (e->slen & 0x80))
		50
134	0		code = ENCODE_FALLBACK;
135	5229117		last = s;
136	5229117	50	if (term && (STRLEN)(d-dlast) == tlen && memEQ(dlast, term, tlen)) {
		0
		0
137	0		code = ENCODE_FOUND_TERM;
138	0		break;
139			}
140	7227427		dlast = d;
141			}
142			}
143			else {
144			/* partial source character */
145	111		code = ENCODE_PARTIAL;
146	111		break;
147			}
148			}
149			else {
150			/* Cannot represent */
151	2180		code = ENCODE_NOREP;
152	2180		break;
153			}
154			}
155	62145		*slen = last - src;
156	62145		*dout = d - dst;
157	62145		return code;
158			}