File Coverage

encengine.c
Criterion Covered Total %
statement 38 42 90.4
branch 26 34 76.4
condition n/a
subroutine n/a
pod n/a
total 64 76 84.2


line stmt bran cond sub pod time code
1             /*
2             Data structures for encoding transformations.
3              
4             Perl works internally in either a native 'byte' encoding or
5             in UTF-8 encoded Unicode. We have no immediate need for a "wchar_t"
6             representation. When we do we can use utf8_to_uv().
7              
8             Most character encodings are either simple byte mappings or
9             variable length multi-byte encodings. UTF-8 can be viewed as a
10             rather extreme case of the latter.
11              
12             So to solve an important part of perl's encode needs we need to solve the
13             "multi-byte -> multi-byte" case. The simple byte forms are then just a degenerate
14             case (where one of the multi-byte encodings will usually be UTF-8).
15              
16             The other type of encoding is a shift encoding where a prefix sequence
17             determines what subsequent bytes mean. Such encodings have state.
18              
19             We also need to handle the case where a character in one encoding has to be
20             represented as multiple characters in the other, e.g. letter+diacritic.
21              
22             The process can be considered as pseudo perl:
23              
24             my $dst = '';
25             while (length($src))
26             {
27             my $size = &src_count($src);
28             my $in_seq = substr($src,0,$size,'');
29             my $out_seq = $s2d_hash{$in_seq};
30             if (defined $out_seq)
31             {
32             $dst .= $out_seq;
33             }
34             else
35             {
36             # an error condition
37             }
38             }
39             return $dst;
40              
41             That has the following components:
42             &src_count - a "rule" for how many bytes make up the next character in the
43             source.
44             %s2d_hash - a mapping from input sequences to output sequences
45              
46             The problem with that scheme is that it does not allow the output
47             character repertoire to affect the characters considered from the
48             input.
49              
50             So we use a "trie" representation which can also be considered
51             a state machine:
52              
53             my $dst = '';
54             my $seq = \@s2d_seq;
55             my $next = \@s2d_next;
56             while (length($src))
57             {
58             my $byte = substr($src,0,1,'');
59             my $out_seq = $seq->[$byte];
60             if (defined $out_seq)
61             {
62             $dst .= $out_seq;
63             }
64             else
65             {
66             # an error condition
67             }
68             ($next,$seq) = @{$next->[$byte]} if $next;
69             }
70             return $dst;
71              
72             There is now a pair of data structures to represent everything.
73             It is valid for the output sequence at a particular point to
74             be defined but zero length; that just means "don't know yet".
75             For the single byte case there is no 'next' so new tables will be the same as
76             the original tables. For a multi-byte case a prefix byte will flip to the tables
77             for the next page (adding nothing to the output), then the tables for the page
78             will provide the actual output and set tables back to original base page.
79              
80             This scheme can also handle shift encodings.
81              
82             A slight enhancement to the scheme also allows for look-ahead - if
83             we add a flag to re-add the removed byte to the source we could handle
84             a" -> U+00E4 (LATIN SMALL LETTER A WITH DIAERESIS)
85             ab -> a (and take b back please)
86              
87             */
88              
89             #define PERL_NO_GET_CONTEXT
90             #include <EXTERN.h>
91             #include <perl.h>
92             #include "encode.h"
93              
94             int
95 62145           do_encode(const encpage_t * enc, const U8 * src, STRLEN * slen, U8 * dst,
96             STRLEN dlen, STRLEN * dout, int approx, const U8 *term, STRLEN tlen)
97             {
98 62145           const U8 *s = src;
99 62145           const U8 *send = s + *slen;
100 62145           const U8 *last = s;
101 62145           U8 *d = dst;
102 62145           U8 *dend = d + dlen, *dlast = d;
103 62145           int code = 0;
104 7289572 100         while (s < send) {
105 7230861           const encpage_t *e = enc;
106 7230861           U8 byte = *s;
107 46297679 100         while (byte > e->max)
108 39066818           e++;
109 14458288 100         if (byte >= e->min && e->slen && (approx || !(e->slen & 0x80))) {
    100          
    100          
    50          
110 7228681           const U8 *cend = s + (e->slen & 0x7f);
111 7228681 100         if (cend <= send) {
112             STRLEN n;
113 7228570 100         if ((n = e->dlen)) {
114 5230260           const U8 *out = e->seq + n * (byte - e->min);
115 5230260           U8 *oend = d + n;
116 5230260 50         if (dst) {
117 5230260 100         if (oend <= dend) {
118 12340180 100         while (d < oend)
119 7111063           *d++ = *out++;
120             }
121             else {
122             /* Out of space */
123 1143           code = ENCODE_NOSPACE;
124 1143           break;
125             }
126             }
127             else
128 0           d = oend;
129             }
130 7227427           enc = e->next;
131 7227427           s++;
132 7227427 100         if (s == cend) {
133 5229117 100         if (approx && (e->slen & 0x80))
    50          
134 0           code = ENCODE_FALLBACK;
135 5229117           last = s;
136 5229117 50         if (term && (STRLEN)(d-dlast) == tlen && memEQ(dlast, term, tlen)) {
    0          
    0          
137 0           code = ENCODE_FOUND_TERM;
138 0           break;
139             }
140 7227427           dlast = d;
141             }
142             }
143             else {
144             /* partial source character */
145 111           code = ENCODE_PARTIAL;
146 111           break;
147             }
148             }
149             else {
150             /* Cannot represent */
151 2180           code = ENCODE_NOREP;
152 2180           break;
153             }
154             }
155 62145           *slen = last - src;
156 62145           *dout = d - dst;
157 62145           return code;
158             }