Newer
Older
/*
* encoding.c : implements the encoding conversion functions needed for XML
*
* rfc2044 (UTF-8 and UTF-16) F. Yergeau Alis Technologies
* rfc2781 UTF-16, an encoding of ISO 10646, P. Hoffman, F. Yergeau
* [ISO-10646] UTF-8 and UTF-16 in Annexes
* [ISO-8859-1] ISO Latin-1 characters codes.
* [UNICODE] The Unicode Consortium, "The Unicode Standard --
* Worldwide Character Encoding -- Version 1.0", Addison-
* Wesley, Volume 1, 1991, Volume 2, 1992. UTF-8 is
* described in Unicode Technical Report #4.
* [US-ASCII] Coded Character Set--7-bit American Standard Code for
* Information Interchange, ANSI X3.4-1986.
*
* See Copyright for the status of this software.
*
*
* Original code for IsoLatin1 and UTF-16 by "Martin J. Duerst" <duerst@w3.org>
#define IN_LIBXML
#include <limits.h>
#include <ctype.h>
#include <stdlib.h>
#ifdef LIBXML_ICONV_ENABLED
#include <errno.h>
#endif
#include <libxml/encoding.h>
#include <libxml/xmlmemory.h>
#ifdef LIBXML_HTML_ENABLED
#include <libxml/HTMLparser.h>
#endif
#include <libxml/globals.h>
#include "buf.h"
#include "enc.h"
#ifdef LIBXML_ICU_ENABLED
#include <unicode/ucnv.h>
/* Size of pivot buffer, same as icu/source/common/ucnv.cpp CHUNK_SIZE */
#define ICU_PIVOT_BUF_SIZE 1024
typedef struct _uconv_t uconv_t;
struct _uconv_t {
UConverter *uconv; /* for conversion between an encoding and UTF-16 */
UConverter *utf8; /* for conversion between UTF-8 and UTF-16 */
UChar pivot_buf[ICU_PIVOT_BUF_SIZE];
UChar *pivot_source;
UChar *pivot_target;
};
#endif
static xmlCharEncodingHandlerPtr xmlUTF16LEHandler = NULL;
static xmlCharEncodingHandlerPtr xmlUTF16BEHandler = NULL;
typedef struct _xmlCharEncodingAlias xmlCharEncodingAlias;
typedef xmlCharEncodingAlias *xmlCharEncodingAliasPtr;
struct _xmlCharEncodingAlias {
const char *name;
const char *alias;
};
static xmlCharEncodingAliasPtr xmlCharEncodingAliases = NULL;
static int xmlCharEncodingAliasesNb = 0;
static int xmlCharEncodingAliasesMax = 0;
#if defined(LIBXML_ICONV_ENABLED) || defined(LIBXML_ICU_ENABLED)
#if 0
#define DEBUG_ENCODING /* Define this to get encoding traces */
#endif
#else
#ifdef LIBXML_ISO8859X_ENABLED
static void xmlRegisterCharEncodingHandlersISO8859x (void);
#endif
#endif
static int xmlLittleEndian = 1;
/**
* xmlEncodingErrMemory:
*
* Handle an out of memory condition
*/
static void
xmlEncodingErrMemory(const char *extra)
{
__xmlSimpleError(XML_FROM_I18N, XML_ERR_NO_MEMORY, NULL, NULL, extra);
}
/**
* xmlErrEncoding:
* @error: the error number
* @msg: the error message
*
* n encoding error
*/
static void LIBXML_ATTR_FORMAT(2,0)
xmlEncodingErr(xmlParserErrors error, const char *msg, const char *val)
{
__xmlRaiseError(NULL, NULL, NULL, NULL, NULL,
XML_FROM_I18N, error, XML_ERR_FATAL,
NULL, 0, val, NULL, NULL, 0, 0, msg, val);
}
#ifdef LIBXML_ICU_ENABLED
static uconv_t*
openIcuConverter(const char* name, int toUnicode)
{
UErrorCode status = U_ZERO_ERROR;
uconv_t *conv = (uconv_t *) xmlMalloc(sizeof(uconv_t));
if (conv == NULL)
return NULL;
conv->pivot_source = conv->pivot_buf;
conv->pivot_target = conv->pivot_buf;
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
conv->uconv = ucnv_open(name, &status);
if (U_FAILURE(status))
goto error;
status = U_ZERO_ERROR;
if (toUnicode) {
ucnv_setToUCallBack(conv->uconv, UCNV_TO_U_CALLBACK_STOP,
NULL, NULL, NULL, &status);
}
else {
ucnv_setFromUCallBack(conv->uconv, UCNV_FROM_U_CALLBACK_STOP,
NULL, NULL, NULL, &status);
}
if (U_FAILURE(status))
goto error;
status = U_ZERO_ERROR;
conv->utf8 = ucnv_open("UTF-8", &status);
if (U_SUCCESS(status))
return conv;
error:
if (conv->uconv)
ucnv_close(conv->uconv);
xmlFree(conv);
return NULL;
}
static void
closeIcuConverter(uconv_t *conv)
{
if (conv != NULL) {
ucnv_close(conv->uconv);
ucnv_close(conv->utf8);
xmlFree(conv);
}
}
#endif /* LIBXML_ICU_ENABLED */
/************************************************************************
* *
* Conversions To/From UTF8 encoding *
* *
************************************************************************/
/**
* asciiToUTF8:
* @out: a pointer to an array of bytes to store the result
* @outlen: the length of @out
* @in: a pointer to an array of ASCII chars
* @inlen: the length of @in
*
* Take a block of ASCII chars in and try to convert it to an UTF-8
* block of chars out.
* Returns 0 if success, or -1 otherwise
* The value of @inlen after return is the number of octets consumed
* if the return value is positive, else unpredictable.
* The value of @outlen after return is the number of octets produced.
asciiToUTF8(unsigned char* out, int *outlen,
const unsigned char* in, int *inlen) {
unsigned char* outstart = out;
const unsigned char* base = in;
const unsigned char* processed = in;
unsigned char* outend = out + *outlen;
const unsigned char* inend;
unsigned int c;
inend = in + (*inlen);
while ((in < inend) && (out - outstart + 5 < *outlen)) {
c= *in++;
if (out >= outend)
break;
if (c < 0x80) {
*out++ = c;
*outlen = out - outstart;
*inlen = processed - base;
return(-1);
}
processed = (const unsigned char*) in;
}
*outlen = out - outstart;
*inlen = processed - base;
return(*outlen);
#ifdef LIBXML_OUTPUT_ENABLED
/**
* UTF8Toascii:
* @out: a pointer to an array of bytes to store the result
* @outlen: the length of @out
* @in: a pointer to an array of UTF-8 chars
* @inlen: the length of @in
*
* Take a block of UTF-8 chars in and try to convert it to an ASCII
* block of chars out.
*
* Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
* The value of @inlen after return is the number of octets consumed
* if the return value is positive, else unpredictable.
* The value of @outlen after return is the number of octets produced.
UTF8Toascii(unsigned char* out, int *outlen,
const unsigned char* in, int *inlen) {
const unsigned char* processed = in;
const unsigned char* outend;
const unsigned char* outstart = out;
const unsigned char* instart = in;
const unsigned char* inend;
unsigned int c, d;
int trailing;
if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1);
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
if (in == NULL) {
/*
* initialization nothing to do
*/
*outlen = 0;
*inlen = 0;
return(0);
}
inend = in + (*inlen);
outend = out + (*outlen);
while (in < inend) {
d = *in++;
if (d < 0x80) { c= d; trailing= 0; }
else if (d < 0xC0) {
/* trailing byte in leading position */
*outlen = out - outstart;
*inlen = processed - instart;
return(-2);
} else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
else {
/* no chance for this in Ascii */
*outlen = out - outstart;
*inlen = processed - instart;
return(-2);
}
if (inend - in < trailing) {
break;
for ( ; trailing; trailing--) {
if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
break;
c <<= 6;
c |= d & 0x3F;
}
/* assertion: c is a single UTF-4 value */
if (c < 0x80) {
if (out >= outend)
break;
*out++ = c;
} else {
/* no chance for this in Ascii */
*outlen = out - outstart;
*inlen = processed - instart;
return(-2);
}
processed = in;
}
*outlen = out - outstart;
*inlen = processed - instart;
return(*outlen);
#endif /* LIBXML_OUTPUT_ENABLED */
/**
* isolat1ToUTF8:
* @out: a pointer to an array of bytes to store the result
* @outlen: the length of @out
* @in: a pointer to an array of ISO Latin 1 chars
* @inlen: the length of @in
*
* Take a block of ISO Latin 1 chars in and try to convert it to an UTF-8
* block of chars out.
* Returns the number of bytes written if success, or -1 otherwise
* The value of @inlen after return is the number of octets consumed
* if the return value is positive, else unpredictable.
* The value of @outlen after return is the number of octets produced.
*/
int
isolat1ToUTF8(unsigned char* out, int *outlen,
const unsigned char* in, int *inlen) {
unsigned char* outstart = out;
const unsigned char* base = in;
unsigned char* outend;
const unsigned char* instop;
if ((out == NULL) || (in == NULL) || (outlen == NULL) || (inlen == NULL))
return(-1);
outend = out + *outlen;
instop = inend;
while ((in < inend) && (out < outend - 1)) {
if (*in >= 0x80) {
*out++ = (((*in) >> 6) & 0x1F) | 0xC0;
++in;
}
if ((instop - in) > (outend - out)) instop = in + (outend - out);
while ((in < instop) && (*in < 0x80)) {
*out++ = *in++;
}
if ((in < inend) && (out < outend) && (*in < 0x80)) {
*out++ = *in++;
*inlen = in - base;
return(*outlen);
/**
* UTF8ToUTF8:
* @out: a pointer to an array of bytes to store the result
* @outlen: the length of @out
* @inb: a pointer to an array of UTF-8 chars
* @inlenb: the length of @in in UTF-8 chars
*
* No op copy operation for UTF8 handling.
*
* Returns the number of bytes written, or -1 if lack of space.
* The value of *inlen after return is the number of octets consumed
* if the return value is positive, else unpredictable.
*/
static int
UTF8ToUTF8(unsigned char* out, int *outlen,
const unsigned char* inb, int *inlenb)
{
int len;
if ((out == NULL) || (outlen == NULL) || (inlenb == NULL))
if (inb == NULL) {
/* inb == NULL means output is initialized. */
*outlen = 0;
*inlenb = 0;
return(0);
}
if (*outlen > *inlenb) {
len = *inlenb;
} else {
len = *outlen;
}
if (len < 0)
return(-1);
/*
* FIXME: Conversion functions must assure valid UTF-8, so we have
* to check for UTF-8 validity. Preferably, this converter shouldn't
* be used at all.
*/
memcpy(out, inb, len);
*outlen = len;
*inlenb = len;
return(*outlen);
#ifdef LIBXML_OUTPUT_ENABLED
/**
* UTF8Toisolat1:
* @out: a pointer to an array of bytes to store the result
* @outlen: the length of @out
* @in: a pointer to an array of UTF-8 chars
* @inlen: the length of @in
*
* Take a block of UTF-8 chars in and try to convert it to an ISO Latin 1
* block of chars out.
*
* Returns the number of bytes written if success, -2 if the transcoding fails,
or -1 otherwise
* The value of @inlen after return is the number of octets consumed
* if the return value is positive, else unpredictable.
* The value of @outlen after return is the number of octets produced.
*/
int
UTF8Toisolat1(unsigned char* out, int *outlen,
const unsigned char* in, int *inlen) {
const unsigned char* processed = in;
const unsigned char* outend;
const unsigned char* outstart = out;
const unsigned char* instart = in;
const unsigned char* inend;
unsigned int c, d;
int trailing;
if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1);
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
if (in == NULL) {
/*
* initialization nothing to do
*/
*outlen = 0;
*inlen = 0;
return(0);
}
inend = in + (*inlen);
outend = out + (*outlen);
while (in < inend) {
d = *in++;
if (d < 0x80) { c= d; trailing= 0; }
else if (d < 0xC0) {
/* trailing byte in leading position */
*outlen = out - outstart;
*inlen = processed - instart;
return(-2);
} else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
else {
/* no chance for this in IsoLat1 */
*outlen = out - outstart;
*inlen = processed - instart;
return(-2);
}
if (inend - in < trailing) {
break;
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
for ( ; trailing; trailing--) {
if (in >= inend)
break;
if (((d= *in++) & 0xC0) != 0x80) {
*outlen = out - outstart;
*inlen = processed - instart;
return(-2);
}
c <<= 6;
c |= d & 0x3F;
}
/* assertion: c is a single UTF-4 value */
if (c <= 0xFF) {
if (out >= outend)
break;
*out++ = c;
} else {
/* no chance for this in IsoLat1 */
*outlen = out - outstart;
*inlen = processed - instart;
return(-2);
}
processed = in;
}
*outlen = out - outstart;
*inlen = processed - instart;
return(*outlen);
#endif /* LIBXML_OUTPUT_ENABLED */
/**
* UTF16LEToUTF8:
* @out: a pointer to an array of bytes to store the result
* @outlen: the length of @out
* @inb: a pointer to an array of UTF-16LE passwd as a byte array
* @inlenb: the length of @in in UTF-16LE chars
*
* Take a block of UTF-16LE ushorts in and try to convert it to an UTF-8
* block of chars out. This function assumes the endian property
* is the same between the native type of this machine and the
* inputed one.
*
* Returns the number of bytes written, or -1 if lack of space, or -2
* if the transcoding fails (if *in is not a valid utf16 string)
* The value of *inlen after return is the number of octets consumed
* if the return value is positive, else unpredictable.
UTF16LEToUTF8(unsigned char* out, int *outlen,
const unsigned char* inb, int *inlenb)
{
unsigned char* outstart = out;
const unsigned char* processed = inb;
unsigned short* in = (unsigned short*) inb;
unsigned short* inend;
unsigned int c, d, inlen;
unsigned char *tmp;
int bits;
if (*outlen == 0) {
*inlenb = 0;
return(0);
}
outend = out + *outlen;
if ((*inlenb % 2) == 1)
(*inlenb)--;
inlen = *inlenb / 2;
inend = in + inlen;
while ((in < inend) && (out - outstart + 5 < *outlen)) {
if (xmlLittleEndian) {
c= *in++;
} else {
tmp = (unsigned char *) in;
c = *tmp++;
c = c | (((unsigned int)*tmp) << 8);
in++;
}
if ((c & 0xFC00) == 0xD800) { /* surrogates */
if (in >= inend) { /* handle split mutli-byte characters */
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
break;
}
if (xmlLittleEndian) {
d = *in++;
} else {
tmp = (unsigned char *) in;
d = *tmp++;
d = d | (((unsigned int)*tmp) << 8);
in++;
}
if ((d & 0xFC00) == 0xDC00) {
c &= 0x03FF;
c <<= 10;
c |= d & 0x03FF;
c += 0x10000;
}
else {
*outlen = out - outstart;
*inlenb = processed - inb;
return(-2);
}
}
/* assertion: c is a single UTF-4 value */
if (out >= outend)
break;
if (c < 0x80) { *out++= c; bits= -6; }
else if (c < 0x800) { *out++= ((c >> 6) & 0x1F) | 0xC0; bits= 0; }
else if (c < 0x10000) { *out++= ((c >> 12) & 0x0F) | 0xE0; bits= 6; }
else { *out++= ((c >> 18) & 0x07) | 0xF0; bits= 12; }
for ( ; bits >= 0; bits-= 6) {
if (out >= outend)
break;
*out++= ((c >> bits) & 0x3F) | 0x80;
}
processed = (const unsigned char*) in;
}
*outlen = out - outstart;
*inlenb = processed - inb;
return(*outlen);
#ifdef LIBXML_OUTPUT_ENABLED
/**
* UTF8ToUTF16LE:
* @outb: a pointer to an array of bytes to store the result
* @outlen: the length of @outb
* @in: a pointer to an array of UTF-8 chars
* @inlen: the length of @in
*
* Take a block of UTF-8 chars in and try to convert it to an UTF-16LE
* block of chars out.
*
* Returns the number of bytes written, or -1 if lack of space, or -2
* if the transcoding failed.
UTF8ToUTF16LE(unsigned char* outb, int *outlen,
const unsigned char* in, int *inlen)
{
unsigned short* out = (unsigned short*) outb;
const unsigned char* processed = in;
const unsigned char *const instart = in;
unsigned short* outstart= out;
unsigned short* outend;
const unsigned char* inend;
unsigned int c, d;
int trailing;
unsigned char *tmp;
unsigned short tmp1, tmp2;
/* UTF16LE encoding has no BOM */
if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1);
if (in == NULL) {
*outlen = 0;
*inlen = 0;
return(0);
}
outend = out + (*outlen / 2);
while (in < inend) {
d= *in++;
if (d < 0x80) { c= d; trailing= 0; }
else if (d < 0xC0) {
/* trailing byte in leading position */
*outlen = (out - outstart) * 2;
*inlen = processed - instart;
return(-2);
} else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
else {
/* no chance for this in UTF-16 */
*outlen = (out - outstart) * 2;
*inlen = processed - instart;
return(-2);
}
if (inend - in < trailing) {
break;
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
for ( ; trailing; trailing--) {
if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
break;
c <<= 6;
c |= d & 0x3F;
}
/* assertion: c is a single UTF-4 value */
if (c < 0x10000) {
if (out >= outend)
break;
if (xmlLittleEndian) {
*out++ = c;
} else {
tmp = (unsigned char *) out;
*tmp = c ;
*(tmp + 1) = c >> 8 ;
out++;
}
}
else if (c < 0x110000) {
if (out+1 >= outend)
break;
c -= 0x10000;
if (xmlLittleEndian) {
*out++ = 0xD800 | (c >> 10);
*out++ = 0xDC00 | (c & 0x03FF);
} else {
tmp1 = 0xD800 | (c >> 10);
tmp = (unsigned char *) out;
*tmp = (unsigned char) tmp1;
*(tmp + 1) = tmp1 >> 8;
out++;
tmp2 = 0xDC00 | (c & 0x03FF);
tmp = (unsigned char *) out;
*tmp = (unsigned char) tmp2;
*(tmp + 1) = tmp2 >> 8;
out++;
}
}
else
break;
processed = in;
}
*outlen = (out - outstart) * 2;
*inlen = processed - instart;
return(*outlen);
* UTF8ToUTF16:
* @outb: a pointer to an array of bytes to store the result
* @outlen: the length of @outb
* @in: a pointer to an array of UTF-8 chars
* @inlen: the length of @in
*
* Take a block of UTF-8 chars in and try to convert it to an UTF-16
* block of chars out.
*
* Returns the number of bytes written, or -1 if lack of space, or -2
* if the transcoding failed.
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
*/
static int
UTF8ToUTF16(unsigned char* outb, int *outlen,
const unsigned char* in, int *inlen)
{
if (in == NULL) {
/*
* initialization, add the Byte Order Mark for UTF-16LE
*/
if (*outlen >= 2) {
outb[0] = 0xFF;
outb[1] = 0xFE;
*outlen = 2;
*inlen = 0;
#ifdef DEBUG_ENCODING
xmlGenericError(xmlGenericErrorContext,
"Added FFFE Byte Order Mark\n");
#endif
return(2);
}
*outlen = 0;
*inlen = 0;
return(0);
}
return (UTF8ToUTF16LE(outb, outlen, in, inlen));
}
#endif /* LIBXML_OUTPUT_ENABLED */
/**
* UTF16BEToUTF8:
* @out: a pointer to an array of bytes to store the result
* @outlen: the length of @out
* @inb: a pointer to an array of UTF-16 passed as a byte array
* @inlenb: the length of @in in UTF-16 chars
*
* Take a block of UTF-16 ushorts in and try to convert it to an UTF-8
* block of chars out. This function assumes the endian property
* is the same between the native type of this machine and the
* inputed one.
*
* Returns the number of bytes written, or -1 if lack of space, or -2
* if the transcoding fails (if *in is not a valid utf16 string)
* The value of *inlen after return is the number of octets consumed
* if the return value is positive, else unpredictable.
UTF16BEToUTF8(unsigned char* out, int *outlen,
const unsigned char* inb, int *inlenb)
{
unsigned char* outstart = out;
const unsigned char* processed = inb;
unsigned char* outend;
unsigned short* in = (unsigned short*) inb;
unsigned short* inend;
unsigned int c, d, inlen;
unsigned char *tmp;
int bits;
if (*outlen == 0) {
*inlenb = 0;
return(0);
}
outend = out + *outlen;
if ((*inlenb % 2) == 1)
(*inlenb)--;
inlen = *inlenb / 2;
inend= in + inlen;
while ((in < inend) && (out - outstart + 5 < *outlen)) {
if (xmlLittleEndian) {
tmp = (unsigned char *) in;
c = *tmp++;
c = (c << 8) | (unsigned int) *tmp;
if ((c & 0xFC00) == 0xD800) { /* surrogates */
if (in >= inend) { /* handle split mutli-byte characters */
break;
}
if (xmlLittleEndian) {
tmp = (unsigned char *) in;
d = *tmp++;
d = (d << 8) | (unsigned int) *tmp;
in++;
} else {
d= *in++;
}
if ((d & 0xFC00) == 0xDC00) {
c &= 0x03FF;
c <<= 10;
c |= d & 0x03FF;
c += 0x10000;
}
else {
*outlen = out - outstart;
*inlenb = processed - inb;
return(-2);
}
}
/* assertion: c is a single UTF-4 value */
if (out >= outend)
break;
if (c < 0x80) { *out++= c; bits= -6; }
else if (c < 0x800) { *out++= ((c >> 6) & 0x1F) | 0xC0; bits= 0; }
else if (c < 0x10000) { *out++= ((c >> 12) & 0x0F) | 0xE0; bits= 6; }
else { *out++= ((c >> 18) & 0x07) | 0xF0; bits= 12; }
if (out >= outend)
break;
*out++= ((c >> bits) & 0x3F) | 0x80;
}
processed = (const unsigned char*) in;
}
*outlen = out - outstart;
*inlenb = processed - inb;
return(*outlen);
#ifdef LIBXML_OUTPUT_ENABLED
/**
* UTF8ToUTF16BE:
* @outb: a pointer to an array of bytes to store the result
* @outlen: the length of @outb
* @in: a pointer to an array of UTF-8 chars
* @inlen: the length of @in
*
* Take a block of UTF-8 chars in and try to convert it to an UTF-16BE
* block of chars out.
*
* Returns the number of byte written, or -1 by lack of space, or -2
* if the transcoding failed.
UTF8ToUTF16BE(unsigned char* outb, int *outlen,
const unsigned char* in, int *inlen)
{
unsigned short* out = (unsigned short*) outb;
const unsigned char* processed = in;
const unsigned char *const instart = in;
unsigned short* outstart= out;
unsigned short* outend;
const unsigned char* inend;
unsigned int c, d;
int trailing;
unsigned char *tmp;
unsigned short tmp1, tmp2;
/* UTF-16BE has no BOM */
if ((outb == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1);
if (in == NULL) {
*outlen = 0;
*inlen = 0;
return(0);
}
outend = out + (*outlen / 2);
while (in < inend) {
d= *in++;
if (d < 0x80) { c= d; trailing= 0; }
else if (d < 0xC0) {
/* trailing byte in leading position */
*outlen = out - outstart;
*inlen = processed - instart;
return(-2);
} else if (d < 0xE0) { c= d & 0x1F; trailing= 1; }
else if (d < 0xF0) { c= d & 0x0F; trailing= 2; }
else if (d < 0xF8) { c= d & 0x07; trailing= 3; }
else {
/* no chance for this in UTF-16 */
*outlen = out - outstart;
*inlen = processed - instart;
return(-2);
}
if (inend - in < trailing) {
break;
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
for ( ; trailing; trailing--) {
if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80)) break;
c <<= 6;
c |= d & 0x3F;
}
/* assertion: c is a single UTF-4 value */
if (c < 0x10000) {
if (out >= outend) break;
if (xmlLittleEndian) {
tmp = (unsigned char *) out;
*tmp = c >> 8;
*(tmp + 1) = c;
out++;
} else {
*out++ = c;
}
}
else if (c < 0x110000) {
if (out+1 >= outend) break;
c -= 0x10000;
if (xmlLittleEndian) {
tmp1 = 0xD800 | (c >> 10);
tmp = (unsigned char *) out;
*tmp = tmp1 >> 8;
*(tmp + 1) = (unsigned char) tmp1;
out++;
tmp2 = 0xDC00 | (c & 0x03FF);
tmp = (unsigned char *) out;
*tmp = tmp2 >> 8;
*(tmp + 1) = (unsigned char) tmp2;
out++;
} else {
*out++ = 0xD800 | (c >> 10);
*out++ = 0xDC00 | (c & 0x03FF);
}
}
else
break;
processed = in;
}
*outlen = (out - outstart) * 2;
*inlen = processed - instart;
return(*outlen);
#endif /* LIBXML_OUTPUT_ENABLED */
/************************************************************************
* *
* Generic encoding handling routines *
* *
************************************************************************/
/**
* xmlDetectCharEncoding:
* @in: a pointer to the first bytes of the XML entity, must be at least
* 2 bytes long (at least 4 if encoding is UTF4 variant).
* @len: pointer to the length of the buffer
*
* Guess the encoding of the entity using the first bytes of the entity content
* according to the non-normative appendix F of the XML-1.0 recommendation.
* Returns one of the XML_CHAR_ENCODING_... values.
*/
xmlCharEncoding
xmlDetectCharEncoding(const unsigned char* in, int len)
{
if (in == NULL)
return(XML_CHAR_ENCODING_NONE);
if (len >= 4) {
if ((in[0] == 0x00) && (in[1] == 0x00) &&
(in[2] == 0x00) && (in[3] == 0x3C))
return(XML_CHAR_ENCODING_UCS4BE);
if ((in[0] == 0x3C) && (in[1] == 0x00) &&
(in[2] == 0x00) && (in[3] == 0x00))
return(XML_CHAR_ENCODING_UCS4LE);
if ((in[0] == 0x00) && (in[1] == 0x00) &&
(in[2] == 0x3C) && (in[3] == 0x00))
return(XML_CHAR_ENCODING_UCS4_2143);
if ((in[0] == 0x00) && (in[1] == 0x3C) &&
(in[2] == 0x00) && (in[3] == 0x00))
return(XML_CHAR_ENCODING_UCS4_3412);
if ((in[0] == 0x4C) && (in[1] == 0x6F) &&
(in[2] == 0xA7) && (in[3] == 0x94))
return(XML_CHAR_ENCODING_EBCDIC);
if ((in[0] == 0x3C) && (in[1] == 0x3F) &&
(in[2] == 0x78) && (in[3] == 0x6D))
return(XML_CHAR_ENCODING_UTF8);
/*
* Although not part of the recommendation, we also
* attempt an "auto-recognition" of UTF-16LE and
* UTF-16BE encodings.
*/
if ((in[0] == 0x3C) && (in[1] == 0x00) &&
(in[2] == 0x3F) && (in[3] == 0x00))
return(XML_CHAR_ENCODING_UTF16LE);
if ((in[0] == 0x00) && (in[1] == 0x3C) &&
(in[2] == 0x00) && (in[3] == 0x3F))
return(XML_CHAR_ENCODING_UTF16BE);
if (len >= 3) {
/*
* Errata on XML-1.0 June 20 2001
* We now allow an UTF8 encoded BOM
*/
if ((in[0] == 0xEF) && (in[1] == 0xBB) &&
(in[2] == 0xBF))
return(XML_CHAR_ENCODING_UTF8);
}
/* For UTF-16 we can recognize by the BOM */
if (len >= 2) {
if ((in[0] == 0xFE) && (in[1] == 0xFF))
return(XML_CHAR_ENCODING_UTF16BE);
if ((in[0] == 0xFF) && (in[1] == 0xFE))