File Coverage

cpan/Encode/encengine.c
Criterion Covered Total %
statement 25 25 100.0
branch n/a
condition n/a
subroutine n/a
total 25 25 100.0


line stmt bran cond sub time code
1           /*
2           Data structures for encoding transformations.
3            
4           Perl works internally in either a native 'byte' encoding or
5           in UTF-8 encoded Unicode. We have no immediate need for a "wchar_t"
6           representation. When we do we can use utf8_to_uv().
7            
8           Most character encodings are either simple byte mappings or
9           variable length multi-byte encodings. UTF-8 can be viewed as a
10           rather extreme case of the latter.
11            
12           So to solve an important part of perl's encode needs we need to solve the
13           "multi-byte -> multi-byte" case. The simple byte forms are then just a degenerate
14           case. (Where one of the multi-byte encodings will usually be UTF-8.)
15            
16           The other type of encoding is a shift encoding where a prefix sequence
17           determines what subsequent bytes mean. Such encodings have state.
18            
19           We also need to handle the case where a character in one encoding has to be
20           represented as multiple characters in the other. e.g. letter+diacritic.
21            
22           The process can be considered as pseudo perl:
23            
24           my $dst = '';
25           while (length($src))
26           {
27           my $size = src_count($src);
28           my $in_seq = substr($src,0,$size,'');
29           my $out_seq = $s2d_hash{$in_seq};
30           if (defined $out_seq)
31           {
32           $dst .= $out_seq;
33           }
34           else
35           {
36           # an error condition
37           }
38           }
39           return $dst;
40            
41           That has the following components:
42           &src_count - a "rule" for how many bytes make up the next character in the
43           source.
44           %s2d_hash - a mapping from input sequences to output sequences
45            
46           The problem with that scheme is that it does not allow the output
47           character repertoire to affect the characters considered from the
48           input.
49            
50           So we use a "trie" representation which can also be considered
51           a state machine:
52            
53           my $dst = '';
54           my $seq = \@s2d_seq;
55           my $next = \@s2d_next;
56           while (length($src))
57           {
58           my $byte = substr($src,0,1,'');
59           my $out_seq = $seq->[$byte];
60           if (defined $out_seq)
61           {
62           $dst .= $out_seq;
63           }
64           else
65           {
66           # an error condition
67           }
68           ($next,$seq) = @$next->[$byte] if $next;
69           }
70           return $dst;
71            
72           There is now a pair of data structures to represent everything.
73           It is valid for output sequence at a particular point to
74           be defined but zero length, that just means "don't know yet".
75           For the single byte case there is no 'next' so new tables will be the same as
76           the original tables. For a multi-byte case a prefix byte will flip to the tables
77           for the next page (adding nothing to the output), then the tables for the page
78           will provide the actual output and set tables back to original base page.
79            
80           This scheme can also handle shift encodings.
81            
82           A slight enhancement to the scheme also allows for look-ahead - if
83           we add a flag to re-add the removed byte to the source we could handle
84           a" -> ä
85           ab -> a (and take b back please)
86            
87           */
88            
89           #include <EXTERN.h>
90           #include <perl.h>
91           #define U8 U8
92           #include "encode.h"
93            
/*
 * do_encode - translate bytes from src into dst by walking tables of
 * encpage_t entries, one input byte at a time.
 *
 * enc     first entry of the starting table. For every input byte the
 *         entries are scanned forward until one with max >= byte is
 *         found; after the byte is consumed the walk continues from
 *         that entry's `next` table.
 * src     input bytes.
 * slen    in:  number of bytes available at src.
 *         out: number of source bytes up to the end of the last
 *              completely processed source character (last - src).
 * dst     output buffer, or NULL to only advance the output cursor
 *         without writing anything.
 * dlen    capacity of dst in bytes.
 * dout    out: distance the output cursor advanced (d - dst).
 * approx  when nonzero, entries whose slen has the 0x80 bit set may be
 *         used; completing a character through such an entry sets the
 *         result code to ENCODE_FALLBACK (the loop keeps going).
 * term    optional terminator, may be NULL. If the output produced for
 *         one complete source character is exactly tlen bytes and
 *         compares equal to term, the loop stops.
 * tlen    length of term in bytes.
 *
 * Returns 0 if the loop reached the end of the input without raising
 * any condition; otherwise the last code assigned: ENCODE_FALLBACK,
 * ENCODE_FOUND_TERM, ENCODE_NOSPACE, ENCODE_PARTIAL or ENCODE_NOREP.
 *
 * NOTE(review): the `e++` scan has no explicit bound - presumably every
 * table ends with an entry whose max covers the highest byte value;
 * confirm against the table generator.
 * NOTE(review): with dst == NULL and term != NULL the terminator check
 * hands a pointer derived from NULL to memEQ - confirm no caller
 * combines the two.
 */
94           int
95 219502         do_encode(const encpage_t * enc, const U8 * src, STRLEN * slen, U8 * dst,
96           STRLEN dlen, STRLEN * dout, int approx, const U8 *term, STRLEN tlen)
97           {
98           const U8 *s = src;
99 219502         const U8 *send = s + *slen;
             /* end of the last source character that was fully consumed */
100           const U8 *last = s;
101           U8 *d = dst;
             /* dlast: output position at the previous character boundary */
102 219502         U8 *dend = d + dlen, *dlast = d;
103           int code = 0;
104 15847110         while (s < send) {
105           const encpage_t *e = enc;
106 15415616         U8 byte = *s;
             /* advance to the first entry whose max is not below this byte */
107 109125140         while (byte > e->max)
108 78293908         e++;
             /* usable only if the byte is also >= min, slen is nonzero, and the
                entry is not flagged 0x80 unless approx was requested */
109 15415616         if (byte >= e->min && e->slen && (approx || !(e->slen & 0x80))) {
             /* low 7 bits of slen: source bytes from here to the end of the
                current character */
110 15412858         const U8 *cend = s + (e->slen & 0x7f);
111 15412858         if (cend <= send) {
112           STRLEN n;
             /* dlen is the number of output bytes for each byte value of this
                entry; zero means this byte emits nothing */
113 15412564         if ((n = e->dlen)) {
             /* seq stores n bytes per byte value, indexed by (byte - min) */
114 11411062         const U8 *out = e->seq + n * (byte - e->min);
115 11411062         U8 *oend = d + n;
116 11411062         if (dst) {
117 11411062         if (oend <= dend) {
118 26603938         while (d < oend)
119 15195804         *d++ = *out++;
120           }
121           else {
122           /* Out of space */
123           code = ENCODE_NOSPACE;
124           break;
125           }
126           }
127           else
             /* no buffer supplied: only move the cursor so that *dout
                reports the size */
128           d = oend;
129           }
             /* the following byte is looked up in this entry's next table */
130 15409636         enc = e->next;
131 15409636         s++;
             /* reached the end of the current source character */
132 15409636         if (s == cend) {
133 11408134         if (approx && (e->slen & 0x80))
134           code = ENCODE_FALLBACK;
135           last = s;
             /* compare the bytes emitted since the previous character
                boundary against the terminator */
136 11408134         if (term && (STRLEN)(d-dlast) == tlen && memEQ(dlast, term, tlen)) {
137           code = ENCODE_FOUND_TERM;
138           break;
139           }
140           dlast = d;
141           }
142           }
143           else {
144           /* partial source character */
145           code = ENCODE_PARTIAL;
146           break;
147           }
148           }
149           else {
150           /* Cannot represent */
151           code = ENCODE_NOREP;
152           break;
153           }
154           }
             /* report consumption only up to the last complete character;
                the output count is wherever the cursor stopped */
155 219502         *slen = last - src;
156 219502         *dout = d - dst;
157 219502         return code;
158           }