File Coverage

third_party/modest/source/myencoding/detect.c

Criterion	Covered	Total	%
statement	334	477	70.0
branch	226	428	52.8
condition			n/a
subroutine			n/a
pod			n/a
total	560	905	61.8

line	stmt	bran	code
1			/*
2			Copyright (C) 2015-2017 Alexander Borisov
3
4			This library is free software; you can redistribute it and/or
5			modify it under the terms of the GNU Lesser General Public
6			License as published by the Free Software Foundation; either
7			version 2.1 of the License, or (at your option) any later version.
8
9			This library is distributed in the hope that it will be useful,
10			but WITHOUT ANY WARRANTY; without even the implied warranty of
11			MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12			Lesser General Public License for more details.
13
14			You should have received a copy of the GNU Lesser General Public
15			License along with this library; if not, write to the Free Software
16			Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
17
18			Author: lex.borisov@gmail.com (Alexander Borisov)
19			*/
20
21			#include "myencoding/encoding.h"
22			#include "myencoding/detect_resource.h"
23			#include "mycore/utils/resources.h"
24
25	19		myencoding_trigram_result_t myencoding_detect_by_trigram(unsigned const char *u_text, size_t length,
26			const myencoding_trigram_t *list, size_t list_length,
27			size_t max_sum_for_break)
28			{
29	19		myencoding_trigram_result_t res = {0, 0};
30
31	2514	100	for (size_t i = 0; i < (length - 3); i++) {
32	2495	100	if(u_text[i] > 127)
33			{
34	1173196	100	for (size_t j = 0; j < list_length; j++)
35			{
36	1172076	100	if(memcmp(list[j].trigram, &u_text[i], 3) == 0) {
37	768		res.value += list[j].value;
38	768		res.count++;
39
40	768	100	if(res.value >= max_sum_for_break)
41	4		i = length;
42
43	768		break;
44			}
45			}
46			}
47			}
48
49	19		return res;
50			}
51
52	19		bool myencoding_detect_russian_has_end(myencoding_trigram_result_t *res, size_t min_count, size_t min_value)
53			{
54	19	100	if(res->value >= min_value || res->count >= min_count)
		50
55	4		return true;
56
57	15		return false;
58			}
59
60	135		bool myencoding_detect_unicode_has_end(myencoding_unicode_result_t *res, size_t max_bad_percent)
61			{
62	135	100	if(res->count_good == 0) {
63	122	100	if(res->count_bad)
64	4		return false;
65
66	118		return true;
67			}
68	13	50	else if(res->count_bad == 0)
69	13		return true;
70
71	0		size_t percent_bad = (res->count_bad * 100) / res->count_good;
72	0	0	if(percent_bad < max_bad_percent)
73	0		return true;
74
75	0		return false;
76			}
77
78	135		myencoding_unicode_result_t myencoding_detect_utf_8(unsigned const char *u_text, size_t length)
79			{
80	135		size_t i = 0;
81	135		myencoding_unicode_result_t res = {0, 0, 0};
82
83	13941	100	while(i < length)
84			{
85	13807	100	if((u_text[i] & 0x80) == 0x00) {
86	12922		i++;
87	12922		res.count_ascii++;
88			}
89	885	100	else if((u_text[i] & 0xE0) == 0xC0) {
90	84		i += 2;
91
92	84	50	if(i >= length)
93	0		break;
94
95	84	50	if(((u_text[i - 1] & 0x80) && (u_text[i - 1] & 0x40) == 0))
		100
96	80		res.count_good++;
97			else
98	84		res.count_bad++;
99			}
100	801	100	else if((u_text[i] & 0xF0) == 0xE0) {
101	500		i += 3;
102
103	500	100	if(i >= length)
104	1		break;
105
106	499	50	if(((u_text[i - 1] & 0x80) && (u_text[i - 1] & 0x40) == 0) &&
		50
		0
107	0	0	((u_text[i - 2] & 0x80) && (u_text[i - 2] & 0x40) == 0)) {
108	0		res.count_good++;
109			}
110			else
111	499		res.count_bad++;
112			}
113	301	50	else if((u_text[i] & 0xF8) == 0xF0) {
114	301		i += 4;
115
116	301	50	if(i >= length)
117	0		break;
118
119	301	100	if(((u_text[i - 1] & 0x80) && (u_text[i - 1] & 0x40) == 0) &&
		50
		0
120	0	0	((u_text[i - 2] & 0x80) && (u_text[i - 2] & 0x40) == 0) &&
		0
121	0	0	((u_text[i - 3] & 0x80) && (u_text[i - 3] & 0x40) == 0)) {
122	0		res.count_good++;
123			}
124			else
125	301		res.count_bad++;
126			}
127			else {
128	0		i++;
129	0		res.count_bad++;
130			}
131			}
132
133	135		return res;
134			}
135
136	138		myencoding_unicode_result_t myencoding_detect_utf_16(unsigned const char *u_text, size_t length)
137			{
138	138		size_t i = 0;
139	138		myencoding_unicode_result_t res = {0, 0, 0};
140
141	16184	100	while(i < length)
142			{
143	16046	100	if(u_text[i] == 0x00) {
144	30	50	if((i % 2) == 0) {
145	0		i++;
146
147	0	0	if(u_text[i] > 0x1F && u_text[i] < 0x7F)
		0
148	0		res.count_bad++;
149			}
150			else {
151	30	50	if(u_text[(i - 1)] > 0x1F && u_text[(i - 1)] < 0x7F)
		50
152	30		res.count_good++;
153
154	30		i++;
155			}
156			}
157			else
158	16016		i++;
159			}
160
161	138		return res;
162			}
163
164	136		bool myencoding_detect_bom(const char *text, size_t length, myencoding_t *encoding)
165			{
166	136		unsigned const char *u_text = (unsigned const char*)text;
167
168	136	50	if(length > 2) {
169	136	100	if(u_text[0] == 0xEF &&
		50
170	1	50	u_text[1] == 0xBB &&
171	1		u_text[2] == 0xBF)
172			{
173	1		*encoding = MyENCODING_UTF_8;
174	1		return true;
175			}
176			}
177
178	135	50	if(length > 1) {
179	135	50	if(u_text[0] == 0xFE && u_text[1] == 0xFF) {
		0
180	0		*encoding = MyENCODING_UTF_16BE;
181	0		return true;
182			}
183
184	135	50	if(u_text[0] == 0xFF && u_text[1] == 0xFE) {
		0
185	0		*encoding = MyENCODING_UTF_16LE;
186	0		return true;
187			}
188			}
189
190			// //for UTF-32
191			// if(length > 3) {
192			// if(u_text[0] == 0x00 &&
193			// u_text[1] == 0x00 &&
194			// u_text[2] == 0xFE &&
195			// u_text[3] == 0xFF)
196			// {
197			// *encoding = MyENCODING_UTF_32BE;
198			// return true;
199			// }
200			//
201			// if(u_text[0] == 0xFF &&
202			// u_text[1] == 0xFE &&
203			// u_text[2] == 0x00 &&
204			// u_text[3] == 0x00)
205			// {
206			// *encoding = MyENCODING_UTF_32LE;
207			// return true;
208			// }
209			// }
210
211	135		return false;
212			}
213
214	136		bool myencoding_detect_and_cut_bom(const char *text, size_t length, myencoding_t *encoding, const char **new_text, size_t *new_size)
215			{
216	136	100	if(myencoding_detect_bom(text, length, encoding))
217			{
218	1	50	if(*encoding == MyENCODING_UTF_8) {
219	1		*new_text = &text[3];
220	1		*new_size = length - 3;
221			}
222			else {
223	0		*new_text = &text[2];
224	0		*new_size = length - 2;
225			}
226
227	1		return true;
228			}
229
230	135		return false;
231			}
232
233	138		bool myencoding_detect_unicode(const char *text, size_t length, myencoding_t *encoding)
234			{
235	138		unsigned const char *u_text = (unsigned const char*)text;
236	138		*encoding = MyENCODING_DEFAULT;
237
238	138		myencoding_unicode_result_t res = myencoding_detect_utf_16(u_text, length);
239
240	138	50	if(res.count_bad == 0 && res.count_good >= 3) {
		100
241	3		*encoding = MyENCODING_UTF_16LE;
242	3		return true;
243			}
244	135	50	else if(res.count_bad >= 3 && res.count_good == 0) {
		0
245	0		*encoding = MyENCODING_UTF_16BE;
246	0		return true;
247			}
248
249	135		res = myencoding_detect_utf_8(u_text, length);
250	135	100	if(myencoding_detect_unicode_has_end(&res, 10)) {
251	131		*encoding = MyENCODING_UTF_8;
252	131		return true;
253			}
254
255	138		return false;
256			}
257
258	7		bool myencoding_detect_russian(const char *text, size_t length, myencoding_t *encoding)
259			{
260	7		unsigned const char *u_text = (unsigned const char*)text;
261
262	7		size_t min_count = 50;
263	7		size_t min_value = 100000;
264	7		size_t max_value = 0;
265
266	7		*encoding = MyENCODING_DEFAULT;
267
268			myencoding_trigram_result_t
269	7		res = myencoding_detect_by_trigram(u_text, length, myencoding_detect_trigrams_index_windows_1251, 1000, min_value);
270	7	100	if(myencoding_detect_russian_has_end(&res, min_count, min_value)) {
271	4		*encoding = MyENCODING_WINDOWS_1251;
272	4		return true;
273			}
274
275	3		max_value = res.value;
276	3	100	if(max_value) {
277	1		*encoding = MyENCODING_WINDOWS_1251;
278			}
279
280	3		res = myencoding_detect_by_trigram(u_text, length, myencoding_detect_trigrams_index_koi8_r, 1000, min_value);
281	3	50	if(myencoding_detect_russian_has_end(&res, min_count, min_value)) {
282	0		*encoding = MyENCODING_KOI8_R;
283	0		return true;
284			}
285
286	3	100	if(max_value < res.value) {
287	2		*encoding = MyENCODING_KOI8_R;
288	2		max_value = res.value;
289			}
290
291	3		res = myencoding_detect_by_trigram(u_text, length, myencoding_detect_trigrams_index_iso_8859_5, 1000, min_value);
292	3	50	if(myencoding_detect_russian_has_end(&res, min_count, min_value)) {
293	0		*encoding = MyENCODING_ISO_8859_5;
294	0		return true;
295			}
296
297	3	50	if(max_value < res.value) {
298	0		*encoding = MyENCODING_ISO_8859_5;
299	0		max_value = res.value;
300			}
301
302	3		res = myencoding_detect_by_trigram(u_text, length, myencoding_detect_trigrams_index_x_mac_cyrillic, 1000, min_value);
303	3	50	if(myencoding_detect_russian_has_end(&res, min_count, min_value)) {
304	0		*encoding = MyENCODING_X_MAC_CYRILLIC;
305	0		return true;
306			}
307
308	3	50	if(max_value < res.value) {
309	0		*encoding = MyENCODING_X_MAC_CYRILLIC;
310	0		max_value = res.value;
311			}
312
313	3		res = myencoding_detect_by_trigram(u_text, length, myencoding_detect_trigrams_index_ibm866, 1000, min_value);
314	3	50	if(myencoding_detect_russian_has_end(&res, min_count, min_value)) {
315	0		*encoding = MyENCODING_IBM866;
316	0		return true;
317			}
318
319	3	50	if(max_value < res.value) {
320	0		*encoding = MyENCODING_IBM866;
321			}
322
323	7		return false;
324			}
325
326	137		bool myencoding_detect(const char *text, size_t length, myencoding_t *encoding)
327			{
328	137		*encoding = MyENCODING_DEFAULT;
329
330	137	100	if(myencoding_detect_unicode(text, length, encoding))
331	133		return true;
332
333	4	100	if(myencoding_detect_russian(text, length, encoding))
334	1		return true;
335
336	3		return false;
337			}
338
339	19		const myencoding_detect_name_entry_t * myencoding_name_entry_by_name(const char* name, size_t length)
340			{
341	57		size_t idx = ((mycore_string_chars_lowercase_map[ (const unsigned char)name[0] ] *
342	38		mycore_string_chars_lowercase_map[ (const unsigned char)name[(length - 1)] ] *
343			length)
344	19		% MyENCODING_DETECT_NAME_STATIC_SIZE) + 1;
345
346	22	50	while (myencoding_detect_name_entry_static_list_index[idx].label)
347			{
348	22	100	if(myencoding_detect_name_entry_static_list_index[idx].label_length == length) {
349	21	100	if(mycore_strncasecmp(myencoding_detect_name_entry_static_list_index[idx].label, name, length) == 0)
350	18		return &myencoding_detect_name_entry_static_list_index[idx];
351
352	3	50	if(myencoding_detect_name_entry_static_list_index[idx].next)
353	3		idx = myencoding_detect_name_entry_static_list_index[idx].next;
354			else
355	0		return NULL;
356			}
357	1	50	else if(myencoding_detect_name_entry_static_list_index[idx].label_length > length) {
358	1		return NULL;
359			}
360			else {
361	0		idx = myencoding_detect_name_entry_static_list_index[idx].next;
362			}
363			}
364
365	0		return NULL;
366			}
367
368	19		bool myencoding_by_name(const char *name, size_t length, myencoding_t *encoding)
369			{
370	19		const myencoding_detect_name_entry_t *entry = myencoding_name_entry_by_name(name, length);
371
372	19	100	if(entry) {
373	18	50	if(encoding)
374	18		*encoding = entry->encoding;
375
376	18		return true;
377			}
378
379	1		return false;
380			}
381
382	3		const char * myencoding_name_by_id(myencoding_t encoding, size_t *length)
383			{
384	3	100	if(encoding >= MyENCODING_LAST_ENTRY) {
385	1	50	if(length) {
386	1		*length = 0;
387			}
388
389	1		return NULL;
390			}
391
392	2		const myencoding_entry_name_index_t *entry = &myencoding_entry_name_index_static_list_index[encoding];
393
394	2	50	if(length) {
395	2		*length = entry->length;
396			}
397
398	2		return entry->name;
399			}
400
401			/*
402			When an algorithm requires a user agent to prescan a byte stream to determine its encoding,
403			given some defined end condition, then it must run the following steps.
404			These steps either abort unsuccessfully or return a character encoding.
405			If at any point during these steps (including during instances of the get an attribute algorithm invoked by this one)
406			the user agent either runs out of bytes (meaning the position pointer created in the first step below goes beyond the end of the byte stream obtained so far)
407			or reaches its end condition, then abort the prescan a byte stream to determine its encoding algorithm unsuccessfully.
408			*/
409	1		bool myencoding_extracting_character_encoding_from_charset(const char *data, size_t data_size, myencoding_t *encoding)
410			{
411	1		return myencoding_extracting_character_encoding_from_charset_with_found(data, data_size, encoding, NULL, NULL);
412			}
413
414	2		bool myencoding_extracting_character_encoding_from_charset_with_found(const char *data, size_t data_size, myencoding_t *encoding, const char **found, size_t *found_length)
415			{
416	2		*encoding = MyENCODING_NOT_DETERMINED;
417
418	2	50	if(found)
419	0		*found = NULL;
420	2	50	if(found_length)
421	0		*found_length = 0;
422
423			/* 1 */
424	2		size_t length = 0;
425	2		size_t charset_length = strlen("charset");
426
427	2		bool is_get_pos = false;
428	2		const unsigned char *udata = (const unsigned char *)data;
429
430			/* 2 */
431	24	50	while((length + charset_length) < data_size) {
432	24	100	if(mycore_ustrcasecmp_without_checks_by_secondary((const unsigned char*)"charset", &udata[length]))
433			{
434	2		length += charset_length;
435
436			/* 2 */
437	2	50	while(length < data_size) {
438	2	50	if(udata[length] != 0x09 && udata[length] != 0x0A && udata[length] != 0x0C &&
		50
		50
		50
439	2	50	udata[length] != 0x0D && udata[length] != 0x20)
440			{
441	2		break;
442			}
443
444	0		length++;
445			}
446
447			/* 4 */
448	2	50	if(udata[length] == 0x3D) { /* EQUALS SIGN (=) */
449	2		is_get_pos = true;
450
451	2		length++;
452	2		break;
453			}
454			}
455
456	22		length++;
457			}
458
459	2	50	if(is_get_pos == false \|\| length >= data_size)
		50
460	0		return false;
461
462			/* 5 */
463	2	50	while(length < data_size) {
464	2	50	if(udata[length] != 0x09 && udata[length] != 0x0A && udata[length] != 0x0C &&
		50
		50
		50
465	2	50	udata[length] != 0x0D && udata[length] != 0x20)
466			{
467	2		break;
468			}
469
470	0		length++;
471			}
472
473	2	50	if(length >= data_size)
474	0		return false;
475
476			/* 6 */
477			/* " */
478	2	50	if(udata[length] == 0x22)
479			{
480	0		length++;
481	0		size_t begin = length;
482
483	0	0	while(length < data_size) {
484	0	0	if(udata[length] == 0x22)
485			{
486	0	0	if(found)
487	0		*found = &data[begin];
488	0	0	if(found_length)
489	0		*found_length = (length - begin);
490
491	0		return myencoding_by_name(&data[begin], (length - begin), encoding);
492			}
493
494	0		length++;
495			}
496
497	0		return false;
498			}
499
500			/* ' */
501	2	50	if(udata[length] == 0x27)
502			{
503	0		length++;
504	0		size_t begin = length;
505
506	0	0	while(length < data_size) {
507	0	0	if(udata[length] == 0x27)
508			{
509	0	0	if(found)
510	0		*found = &data[begin];
511	0	0	if(found_length)
512	0		*found_length = (length - begin);
513
514	0		return myencoding_by_name(&data[begin], (length - begin), encoding);
515			}
516
517	0		length++;
518			}
519
520	0		return false;
521			}
522
523			/* other */
524	2	50	while(length < data_size) {
525	2	50	if(udata[length] != 0x09 && udata[length] != 0x0A && udata[length] != 0x0C &&
		50
		50
		50
526	2	50	udata[length] != 0x0D && udata[length] != 0x20)
527			{
528	2		size_t begin = length;
529
530	26	100	while(length < data_size) {
531			/* SEMICOLON character (;) */
532	24	50	if(udata[length] == 0x3B)
533			{
534	0	0	if(found)
535	0		*found = &data[begin];
536	0	0	if(found_length)
537	0		*found_length = (length - begin);
538
539	0		return myencoding_by_name(&data[begin], (length - begin), encoding);
540			}
541
542	24		length++;
543			}
544
545	2	50	if(found)
546	0		*found = &data[begin];
547	2	50	if(found_length)
548	0		*found_length = (length - begin);
549
550	2		return myencoding_by_name(&data[begin], (length - begin), encoding);
551			}
552
553	0		length++;
554			}
555
556	0		return false;
557			}
558
559	120		bool myencoding_prescan_stream_to_determine_encoding_get_attr_spaces(const unsigned char *udata, size_t *data_length, size_t data_size, myencoding_detect_attr_t *attr)
560			{
561	120		size_t length = *data_length;
562
563			/* set position */
564	120		attr->key_length = length - attr->key_begin;
565
566			/* 6 */
567	120	50	while(length < data_size) {
568	120	50	if(udata[length] != 0x09 && udata[length] != 0x0A && udata[length] != 0x0C &&
		50
		50
		50
569	120	50	udata[length] != 0x0D && udata[length] != 0x20 && udata[length] != 0x2F)
		50
570			{
571	120		break;
572			}
573
574	0		length++;
575			}
576
577	120	50	if(length >= data_size) {
578	0		*data_length = length;
579	0		return false;
580			}
581
582			/* 7 */
583	120	50	if(udata[length] != 0x3D) {
584	120		*data_length = length;
585	120		return false;
586			}
587
588			/* 8 */
589	0		*data_length = (length + 1);
590	0		return true;
591			}
592
593	2		size_t myencoding_prescan_stream_to_determine_encoding_get_attr_value(const unsigned char *udata, size_t length, size_t data_size, myencoding_detect_attr_t *attr, bool *it_last)
594			{
595			/* 9 */
596	2	50	while(length < data_size) {
597	2	50	if(udata[length] != 0x09 && udata[length] != 0x0A && udata[length] != 0x0C &&
		50
		50
		50
598	2	50	udata[length] != 0x0D && udata[length] != 0x20)
599			{
600	2		break;
601			}
602
603	0		length++;
604			}
605
606	2	50	if(length >= data_size) {
607	0		*it_last = true;
608	0		return length;
609			}
610
611			/* 10 */
612	2		switch (udata[length]) {
613			case 0x22: /* (ASCII ") */
614	2		length++;
615	2		attr->value_begin = length;
616
617	45	50	while(length < data_size) {
618	45	100	if(udata[length] == 0x22)
619			{
620	2		attr->value_length = length - attr->value_begin;
621	2		return (length + 1);
622			}
623
624	43		length++;
625			}
626
627	0		break;
628
629			case 0x27: /* (ASCII ') */
630	0		length++;
631	0		attr->value_begin = length;
632
633	0	0	while(length < data_size) {
634	0	0	if(udata[length] == 0x27)
635			{
636	0		attr->value_length = length - attr->value_begin;
637	0		return (length + 1);
638			}
639
640	0		length++;
641			}
642
643	0		break;
644
645			case 0x3E: /* (ASCII >) */
646	0		*it_last = true;
647	0		return (length + 1);
648
649			default:
650	0		attr->value_begin = length;
651
652	0	0	while(length < data_size) {
653	0	0	if(udata[length] == 0x09 || udata[length] == 0x0A || udata[length] == 0x0C ||
		0
		0
		0
654	0	0	udata[length] == 0x0D || udata[length] == 0x20 || udata[length] == 0x3E)
		0
655			{
656	0		attr->value_length = length - attr->value_begin;
657	0		return (length + 1);
658			}
659
660	0		length++;
661			}
662
663	0		break;
664			}
665
666	0		attr->value_length = length - attr->value_begin;
667	0		return length;
668			}
669
670	738		size_t myencoding_prescan_stream_to_determine_encoding_get_attr(const unsigned char *udata, size_t length, size_t data_size, myencoding_detect_attr_t *attr, bool *it_last)
671			{
672	738		memset(attr, 0, sizeof(myencoding_detect_attr_t));
673
674			/*
675			If the byte at position is one of 0x09 (ASCII TAB), 0x0A (ASCII LF), 0x0C (ASCII FF), 0x0D (ASCII CR),
676			0x20 (ASCII space), or 0x2F (ASCII /) then advance position to the next byte and redo this step.
677			*/
678			/* 1 */
679	740	50	while(length < data_size) {
680	740	50	if(udata[length] != 0x09 && udata[length] != 0x0A && udata[length] != 0x0C &&
		50
		50
		50
681	740	100	udata[length] != 0x0D && udata[length] != 0x20 && udata[length] != 0x2F)
		100
682			{
683	738		break;
684			}
685
686	2		length++;
687			}
688
689	738	50	if(length >= data_size) {
690	0		*it_last = true;
691	0		return length;
692			}
693
694			/* 2 */
695	738	100	if(udata[length] == 0x3E) { /* (ASCII >) */
696	2		*it_last = true;
697	2		return (length + 1);
698			}
699
700	736		attr->key_begin = length;
701
702			/* 3, 4 */
703	2408	50	while(length < data_size) {
704	2408		switch (udata[length]) {
705			case 0x3D: /* (ASCII =) */
706	2	50	if(attr->key_begin != (length - 1)) {
707	2		attr->key_length = length - attr->key_begin;
708
709	2		length++;
710	2		return myencoding_prescan_stream_to_determine_encoding_get_attr_value(udata, length, data_size, attr, it_last);
711			}
712
713			/* fall through */
714
715			case 0x09: /* (ASCII TAB) */
716			case 0x0A: /* (ASCII LF) */
717			case 0x0C: /* (ASCII FF) */
718			case 0x0D: /* (ASCII CR) */
719			case 0x20: /* (ASCII space) */
720	120		length++;
721
722	120	50	if(myencoding_prescan_stream_to_determine_encoding_get_attr_spaces(udata, &length, data_size, attr) == false) {
723	120		*it_last = true;
724	120		return length;
725			}
726
727	0		return myencoding_prescan_stream_to_determine_encoding_get_attr_value(udata, length, data_size, attr, it_last);
728
729			case 0x2F: /* (ASCII /) */
730			case 0x3E: /* (ASCII >) */
731	614		*it_last = true;
732	614		attr->key_length = length - attr->key_begin;
733
734	614		return (length + 1);
735
736			default:
737	1672		break;
738			}
739
740	1672		length++;
741			}
742
743	0	0	if(myencoding_prescan_stream_to_determine_encoding_get_attr_spaces(udata, &length, data_size, attr) == false) {
744	0		*it_last = true;
745	0		return length;
746			}
747
748	0		return myencoding_prescan_stream_to_determine_encoding_get_attr_value(udata, length, data_size, attr, it_last);
749			}
750
751	2		bool myencoding_prescan_stream_to_determine_encoding_check_meta(const unsigned char *udata, size_t *length, size_t data_size, myencoding_t *encoding, const char **found, size_t *found_length)
752			{
753			myencoding_detect_attr_t attr;
754
755	2		bool got_pragma = false;
756	2		bool it_last = false;
757
758	2		unsigned int need_pragma = 0; /* 0 = NULL, 1 = false, 2 = true */
759
760			/*
761			http-equiv = 1
762			content = 2
763			charset = 4
764			*/
765			/* If the attribute's name is already in attribute list, then return to the step labeled attributes. */
766	2		size_t is_exists = 0;
767
768	4	50	while(*length < data_size) {
769	4		*length = myencoding_prescan_stream_to_determine_encoding_get_attr(udata, *length, data_size, &attr, &it_last);
770
771			/* 9 */
772	5		if(attr.key_length == strlen("http-equiv") &&
773	1		mycore_ustrcasecmp_without_checks_by_secondary((const unsigned char*)"http-equiv", &udata[ attr.key_begin ]))
774			{
775	2	50	if((is_exists & 1) == 0) {
776	1		is_exists |= 1;
777
778	2		if(attr.value_length == strlen("content-type") &&
779	1		mycore_ustrcasecmp_without_checks_by_secondary((const unsigned char*)"content-type", &udata[ attr.value_begin ]))
780			{
781	1		got_pragma = true;
782			}
783			}
784			}
785	4		else if(attr.key_length == strlen("content") &&
786	1		mycore_ustrcasecmp_without_checks_by_secondary((const unsigned char*)"content", &udata[ attr.key_begin ]))
787			{
788	2	50	if((is_exists & 2) == 0) {
789	1		is_exists |= 2;
790
791	1	50	if(myencoding_extracting_character_encoding_from_charset_with_found((const char*)(&udata[ attr.value_begin ]), attr.value_length, encoding, found, found_length)) {
792	1		need_pragma = 2;
793			}
794			}
795			}
796	2		else if(attr.key_length == strlen("charset") &&
797	0		mycore_ustrcasecmp_without_checks_by_secondary((const unsigned char*)"charset", &udata[ attr.key_begin ]))
798			{
799	0	0	if((is_exists & 4) == 0) {
800	0		is_exists |= 4;
801
802	0	0	if(found)
803	0		*found = (const char*)(&udata[ attr.value_begin ]);
804	0	0	if(found_length)
805	0		*found_length = attr.value_length;
806
807	0		myencoding_by_name((const char*)(&udata[ attr.value_begin ]), attr.value_length, encoding);
808	0		need_pragma = 1;
809			}
810			}
811
812	4	100	if(it_last)
813	2		break;
814			}
815
816			/* 11, 12, 13 */
817	2	100	if(need_pragma == 0 || (need_pragma == 2 && got_pragma == false)) {
		50
		50
818	1		*encoding = MyENCODING_NOT_DETERMINED;
819	1		return false;
820			}
821
822			/* 14 */
823	1	50	if(*encoding == MyENCODING_UTF_16BE || *encoding == MyENCODING_UTF_16LE) {
		50
824	0		*encoding = MyENCODING_UTF_8;
825			}
826
827			/* 15 */
828	1	50	if(*encoding == MyENCODING_X_USER_DEFINED) {
829	0		*encoding = MyENCODING_WINDOWS_1252;
830			}
831
832			/* 16 */
833	2		return true;
834			}
835
836	734		size_t myencoding_prescan_stream_to_determine_encoding_skip_name(const unsigned char *udata, size_t length, size_t data_size)
837			{
838	746	50	while(length < data_size) {
839	746	50	if(udata[length] != 0x09 && udata[length] != 0x0A && udata[length] != 0x0C &&
		100
		50
		50
840	740	100	udata[length] != 0x0D && udata[length] != 0x20)
841			{
842	734		break;
843			}
844
845	12		length++;
846			}
847
848	734	50	if(length >= data_size)
849	0		return length;
850
851	734	50	if(udata[length] == 0x3E) {
852	0		return (length + 1);
853			}
854
855			myencoding_detect_attr_t attr;
856	734		bool it_last = false;
857
858	734	50	while(length < data_size) {
859	734		length = myencoding_prescan_stream_to_determine_encoding_get_attr(udata, length, data_size, &attr, &it_last);
860
861	734	50	if(it_last) {
862	734		return length;
863			}
864			}
865
866	734		return length;
867			}
868
869	828		size_t myencoding_prescan_stream_to_determine_encoding_skip_other(const unsigned char *udata, size_t length, size_t data_size)
870			{
871	828	100	if(udata[length] == 0x2F) { /* / */
872	361		length++;
873
874	361	50	if(length >= data_size)
875	0		return length;
876
877	361	50	if(mycore_tokenizer_chars_map[ udata[length] ] == MyCORE_STRING_MAP_CHAR_A_Z_a_z) {
878	361		return myencoding_prescan_stream_to_determine_encoding_skip_name(udata, length, data_size);
879			}
880
881	0	0	while(length < data_size) {
882	0	0	if(udata[length] != 0x3E) {
883	0		return (length + 1);
884			}
885
886	0		length++;
887			}
888
889	0		return length;
890			}
891	467	100	else if(udata[length] == 0x21) { /* ! */
892	94		length++;
893
894	94	50	if((length + 2) < data_size && udata[length] == 0x2D && udata[(length+1)] == 0x2D) {
		100
		50
895	174	50	while(length < data_size) {
896	174	50	if(udata[length] != 0x3E) {
897	174	100	if(udata[(length - 1)] == 0x2D && udata[(length - 2)] == 0x2D)
		100
898	58		return (length + 1);
899			}
900
901	116		length++;
902			}
903
904	0		return length;
905			}
906
907	36	50	while(length < data_size) {
908	36	50	if(udata[length] != 0x3E) {
909	36		return (length + 1);
910			}
911
912	0		length++;
913			}
914
915	0		return length;
916			}
917	373	50	else if(udata[length] == 0x3F) { /* ? */
918	0		length++;
919
920	0	0	while(length < data_size) {
921	0	0	if(udata[length] != 0x3E) {
922	0		return (length + 1);
923			}
924
925	0		length++;
926			}
927
928	0		return length;
929			}
930
931
932	373		return myencoding_prescan_stream_to_determine_encoding_skip_name(udata, length, data_size);
933			}
934
935	136		myencoding_t myencoding_prescan_stream_to_determine_encoding(const char *data, size_t data_size)
936			{
937	136		return myencoding_prescan_stream_to_determine_encoding_with_found(data, data_size, NULL, NULL);
938			}
939
940	136		myencoding_t myencoding_prescan_stream_to_determine_encoding_with_found(const char *data, size_t data_size, const char **found, size_t *found_length)
941			{
942	136		const unsigned char* udata = (const unsigned char*)data;
943	136		myencoding_t encoding = MyENCODING_NOT_DETERMINED;
944
945	136	50	if(found)
946	0		*found = NULL;
947	136	50	if(found_length)
948	0		*found_length = 0;
949
950	136		size_t i = 0;
951	9741	100	while(i < data_size) {
952			/* 0x3C = '<' */
953	9615	100	if(data[i] == 0x3C)
954			{
955	840	100	if((i + 5) >= data_size)
956	9		return encoding;
957
958	831		i++;
959
960	831	100	switch (data[i]) {
961			/*
962			A sequence of bytes starting with:
963			0x3C, 0x4D or 0x6D, 0x45 or 0x65, 0x54 or 0x74, 0x41 or 0x61,
964			and one of 0x09, 0x0A, 0x0C, 0x0D, 0x20, 0x2F
965			(case-insensitive ASCII '<meta' followed by a space or slash)
966			*/
967			case 0x4D:
968			case 0x6D:
969	3	50	if(mycore_ustrcasecmp_without_checks_by_secondary((const unsigned char*)"meta", &udata[i])) {
970	3		i += 4;
971
972	3	50	if(udata[i] == 0x09 || udata[i] == 0x0A || udata[i] == 0x0C ||
		50
		50
		50
973	3	100	udata[i] == 0x0D || udata[i] == 0x20 || udata[i] == 0x2F)
		50
974			{
975	2		i++;
976
977	2	100	if(myencoding_prescan_stream_to_determine_encoding_check_meta(udata, &i, data_size, &encoding, found, found_length))
978	1		return encoding;
979			}
980			}
981
982	2		break;
983
984			default:
985	828		i = myencoding_prescan_stream_to_determine_encoding_skip_other(udata, i, data_size);
986	830		break;
987			}
988			}
989			else {
990	8775		i++;
991			}
992			}
993
994	136		return encoding;
995			}
996
997