File Coverage

cpan/Encode/encengine.c

Criterion	Covered	Total	%
statement	25	25	100.0
branch			n/a
condition			n/a
subroutine			n/a
total	25	25	100.0

line	stmt	code
1		/*
2		Data structures for encoding transformations.
3
4		Perl works internally in either a native 'byte' encoding or
5		in UTF-8 encoded Unicode. We have no immediate need for a "wchar_t"
6		representation. When we do we can use utf8_to_uv().
7
8		Most character encodings are either simple byte mappings or
9		variable length multi-byte encodings. UTF-8 can be viewed as a
10		rather extreme case of the latter.
11
12		So to solve an important part of perl's encode needs we need to solve the
13		"multi-byte -> multi-byte" case. The simple byte forms are then just degenerate
14		case. (Where one of multi-bytes will usually be UTF-8.)
15
16		The other type of encoding is a shift encoding where a prefix sequence
17		determines what subsequent bytes mean. Such encodings have state.
18
19		We also need to handle case where a character in one encoding has to be
20		represented as multiple characters in the other. e.g. letter+diacritic.
21
22		The process can be considered as pseudo perl:
23
24		my $dst = '';
25		while (length($src))
26		{
27		my $size = $count($src);
28		my $in_seq = substr($src,0,$size,'');
29		my $out_seq = $s2d_hash{$in_seq};
30		if (defined $out_seq)
31		{
32		$dst .= $out_seq;
33		}
34		else
35		{
36		# an error condition
37		}
38		}
39		return $dst;
40
41		That has the following components:
42		&src_count - a "rule" for how many bytes make up the next character in the
43		source.
44		%s2d_hash - a mapping from input sequences to output sequences
45
46		The problem with that scheme is that it does not allow the output
47		character repertoire to affect the characters considered from the
48		input.
49
50		So we use a "trie" representation which can also be considered
51		a state machine:
52
53		my $dst = '';
54		my $seq = \@s2d_seq;
55		my $next = \@s2d_next;
56		while (length($src))
57		{
58		my $byte = $substr($src,0,1,'');
59		my $out_seq = $seq->[$byte];
60		if (defined $out_seq)
61		{
62		$dst .= $out_seq;
63		}
64		else
65		{
66		# an error condition
67		}
68		($next,$seq) = @$next->[$byte] if $next;
69		}
70		return $dst;
71
72		There is now a pair of data structures to represent everything.
73		It is valid for output sequence at a particular point to
74		be defined but zero length, that just means "don't know yet".
75		For the single byte case there is no 'next' so new tables will be the same as
76		the original tables. For a multi-byte case a prefix byte will flip to the tables
77		for the next page (adding nothing to the output), then the tables for the page
78		will provide the actual output and set tables back to original base page.
79
80		This scheme can also handle shift encodings.
81
82		A slight enhancement to the scheme also allows for look-ahead - if
83		we add a flag to re-add the removed byte to the source we could handle
84		a" -> �
85		ab -> a (and take b back please)
86
87		*/
88
89		#include
90		#include
91		#define U8 U8
92		#include "encode.h"
93
94		int
95	219502	do_encode(const encpage_t * enc, const U8 * src, STRLEN * slen, U8 * dst,
96		STRLEN dlen, STRLEN * dout, int approx, const U8 *term, STRLEN tlen)
97		{
98		const U8 *s = src;
99	219502	const U8 send = s + slen;
100		const U8 *last = s;
101		U8 *d = dst;
102	219502	U8 dend = d + dlen, dlast = d;
103		int code = 0;
104	15847110	while (s < send) {
105		const encpage_t *e = enc;
106	15415616	U8 byte = *s;
107	109125140	while (byte > e->max)
108	78293908	e++;
109	15415616	if (byte >= e->min && e->slen && (approx \|\| !(e->slen & 0x80))) {
110	15412858	const U8 *cend = s + (e->slen & 0x7f);
111	15412858	if (cend <= send) {
112		STRLEN n;
113	15412564	if ((n = e->dlen)) {
114	11411062	const U8 out = e->seq + n (byte - e->min);
115	11411062	U8 *oend = d + n;
116	11411062	if (dst) {
117	11411062	if (oend <= dend) {
118	26603938	while (d < oend)
119	15195804	d++ = out++;
120		}
121		else {
122		/* Out of space */
123		code = ENCODE_NOSPACE;
124		break;
125		}
126		}
127		else
128		d = oend;
129		}
130	15409636	enc = e->next;
131	15409636	s++;
132	15409636	if (s == cend) {
133	11408134	if (approx && (e->slen & 0x80))
134		code = ENCODE_FALLBACK;
135		last = s;
136	11408134	if (term && (STRLEN)(d-dlast) == tlen && memEQ(dlast, term, tlen)) {
137		code = ENCODE_FOUND_TERM;
138		break;
139		}
140		dlast = d;
141		}
142		}
143		else {
144		/* partial source character */
145		code = ENCODE_PARTIAL;
146		break;
147		}
148		}
149		else {
150		/* Cannot represent */
151		code = ENCODE_NOREP;
152		break;
153		}
154		}
155	219502	*slen = last - src;
156	219502	*dout = d - dst;
157	219502	return code;
158		}