    /*
     * encoding.c : implements the encoding conversion functions needed for XML
     *
     * Related specs:
     * rfc2044        (UTF-8 and UTF-16) F. Yergeau Alis Technologies
     * rfc2781        UTF-16, an encoding of ISO 10646, P. Hoffman, F. Yergeau
     * [ISO-10646]    UTF-8 and UTF-16 in Annexes
     * [ISO-8859-1]   ISO Latin-1 characters codes.
     * [UNICODE]      The Unicode Consortium, "The Unicode Standard --
     *                Worldwide Character Encoding -- Version 1.0", Addison-
     *                Wesley, Volume 1, 1991, Volume 2, 1992.  UTF-8 is
     *                described in Unicode Technical Report #4.
     * [US-ASCII]     Coded Character Set--7-bit American Standard Code for
     *                Information Interchange, ANSI X3.4-1986.
     *
     * See Copyright for the status of this software.
     *
     * daniel@veillard.com
     *
     * Original code for IsoLatin1 and UTF-16 by "Martin J. Duerst" <duerst@w3.org>
     */

    #include "libxml.h"
    
    
    #include <string.h>
    
    #include <ctype.h>
    #include <stdlib.h>
    
    #ifdef LIBXML_ICONV_ENABLED
    #include <errno.h>
    #endif
    
    #include <libxml/encoding.h>
    #include <libxml/xmlmemory.h>
    #ifdef LIBXML_HTML_ENABLED
    #include <libxml/HTMLparser.h>
    #endif
    
    #include <libxml/xmlerror.h>
    
    
    #ifdef LIBXML_ICU_ENABLED
    #include <unicode/ucnv.h>
    /* Size of pivot buffer, same as icu/source/common/ucnv.cpp CHUNK_SIZE */
    #define ICU_PIVOT_BUF_SIZE 1024
    /*
     * State bundle for an ICU-backed conversion: two ICU converters plus a
     * UTF-16 scratch buffer of ICU_PIVOT_BUF_SIZE units and two cursors
     * into it. Instances are created by openIcuConverter() and released by
     * closeIcuConverter().
     */
    typedef struct _uconv_t uconv_t;
    struct _uconv_t {
      UConverter *uconv; /* for conversion between an encoding and UTF-16 */
      UConverter *utf8; /* for conversion between UTF-8 and UTF-16 */
      UChar      pivot_buf[ICU_PIVOT_BUF_SIZE]; /* UTF-16 scratch buffer */
      UChar      *pivot_source; /* cursor into pivot_buf; starts at pivot_buf */
      UChar      *pivot_target; /* cursor into pivot_buf; starts at pivot_buf */
    };
    #endif
    
    
    /*
     * File-local handler pointers, one per UTF-16 byte order; both start
     * out NULL.
     * NOTE(review): presumably assigned when the built-in handlers are
     * registered -- that code is not visible here, confirm.
     */
    static xmlCharEncodingHandlerPtr xmlUTF16LEHandler = NULL;
    static xmlCharEncodingHandlerPtr xmlUTF16BEHandler = NULL;
    
    
    /*
     * One entry of the encoding alias table: a pair of strings, @name and
     * @alias.
     * NOTE(review): presumably @alias is the alternate spelling that maps
     * to the encoding called @name -- confirm against the alias lookup code.
     */
    typedef struct _xmlCharEncodingAlias xmlCharEncodingAlias;
    typedef xmlCharEncodingAlias *xmlCharEncodingAliasPtr;
    struct _xmlCharEncodingAlias {
        const char *name;
        const char *alias;
    };
    
    /*
     * The alias table itself and its two counters; the table starts out
     * NULL with both counters at zero.
     * NOTE(review): presumably Nb is the number of entries in use and Max
     * the allocated capacity -- confirm against the code that grows it.
     */
    static xmlCharEncodingAliasPtr xmlCharEncodingAliases = NULL;
    static int xmlCharEncodingAliasesNb = 0;
    static int xmlCharEncodingAliasesMax = 0;
    
    
    #if defined(LIBXML_ICONV_ENABLED) || defined(LIBXML_ICU_ENABLED)
    
    #if 0
    #define DEBUG_ENCODING  /* Define this to get encoding traces */
    #endif
    
    #else
    #ifdef LIBXML_ISO8859X_ENABLED
    static void xmlRegisterCharEncodingHandlersISO8859x (void);
    #endif
    
    #endif
    
    /*
     * Host byte-order flag: nonzero means the host is treated as
     * little-endian. Defaults to 1.
     * NOTE(review): presumably corrected at initialization time elsewhere
     * in the file -- confirm.
     */
    static int xmlLittleEndian = 1;
    
    
    Nick Wellnhofer's avatar
    Nick Wellnhofer committed
     * @extra:  extra information
    
     *
     * Handle an out of memory condition
     */
    static void
    xmlEncodingErrMemory(const char *extra)
    {
        __xmlSimpleError(XML_FROM_I18N, XML_ERR_NO_MEMORY, NULL, NULL, extra);
    }
    
    /**
     * xmlEncodingErr:
     * @error:  the error number
     * @msg:  the error message, used as a format string
     * @val:  a string passed along with the error and also supplied as the
     *        argument for @msg
     *
     * Handle an encoding error: raises @error in the XML_FROM_I18N domain
     * with level XML_ERR_FATAL.
     */
    
    static void LIBXML_ATTR_FORMAT(2,0)
    
    xmlEncodingErr(xmlParserErrors error, const char *msg, const char *val)
    {
        __xmlRaiseError(NULL, NULL, NULL, NULL, NULL,
                        XML_FROM_I18N, error, XML_ERR_FATAL,
                        NULL, 0, val, NULL, NULL, 0, 0, msg, val);
    }
    
    #ifdef LIBXML_ICU_ENABLED
    static uconv_t*
    openIcuConverter(const char* name, int toUnicode)
    {
      UErrorCode status = U_ZERO_ERROR;
      uconv_t *conv = (uconv_t *) xmlMalloc(sizeof(uconv_t));
      if (conv == NULL)
        return NULL;
    
    
      conv->pivot_source = conv->pivot_buf;
      conv->pivot_target = conv->pivot_buf;
    
    
      conv->uconv = ucnv_open(name, &status);
      if (U_FAILURE(status))
        goto error;
    
      status = U_ZERO_ERROR;
      if (toUnicode) {
        ucnv_setToUCallBack(conv->uconv, UCNV_TO_U_CALLBACK_STOP,
                            NULL, NULL, NULL, &status);
      }
      else {
        ucnv_setFromUCallBack(conv->uconv, UCNV_FROM_U_CALLBACK_STOP,
                            NULL, NULL, NULL, &status);
      }
      if (U_FAILURE(status))
        goto error;
    
      status = U_ZERO_ERROR;
      conv->utf8 = ucnv_open("UTF-8", &status);
      if (U_SUCCESS(status))
        return conv;
    
    error:
      if (conv->uconv)
        ucnv_close(conv->uconv);
      xmlFree(conv);
      return NULL;
    }
    
    /*
     * closeIcuConverter:
     * @conv:  converter state from openIcuConverter(), may be NULL
     *
     * Closes both ICU converters held by @conv and frees the structure.
     * A NULL @conv is ignored.
     */
    static void
    closeIcuConverter(uconv_t *conv)
    {
        if (conv == NULL) {
            return;
        }

        ucnv_close(conv->uconv);
        ucnv_close(conv->utf8);
        xmlFree(conv);
    }
    #endif /* LIBXML_ICU_ENABLED */
    
    
    /************************************************************************
     *									*
     *		Conversions To/From UTF8 encoding			*
     *									*
     ************************************************************************/
    
    /**
     * asciiToUTF8:
     * @out:  a pointer to an array of bytes to store the result
     * @outlen:  the length of @out
     * @in:  a pointer to an array of ASCII chars
     * @inlen:  the length of @in
     *
     * Take a block of ASCII chars in and try to convert it to an UTF-8
     * block of chars out. ASCII is a strict subset of UTF-8, so each input
     * byte below 0x80 is copied unchanged and needs exactly one output
     * byte; conversion simply stops when either buffer is exhausted.
     *
     * Returns the number of bytes written if success, or -1 if a byte
     *     outside the ASCII range (>= 0x80) is met.
     * The value of @inlen after return is the number of octets consumed;
     *     on error it stops just before the offending byte.
     * The value of @outlen after return is the number of octets produced.
     */
    static int
    asciiToUTF8(unsigned char* out, int *outlen,
                  const unsigned char* in, int *inlen) {
        unsigned char* outstart = out;
        unsigned char* outend = out + *outlen;
        const unsigned char* base = in;
        const unsigned char* inend = in + (*inlen);
        int failed = 0;

        while ((in < inend) && (out < outend)) {
            if (*in >= 0x80) {
                /* not ASCII: report everything converted so far */
                failed = 1;
                break;
            }
            *out++ = *in++;
        }

        *outlen = out - outstart;
        *inlen = in - base;
        return(failed ? -1 : *outlen);
    }
    
    /**
     * UTF8Toascii:
     * @out:  a pointer to an array of bytes to store the result
     * @outlen:  the length of @out
     * @in:  a pointer to an array of UTF-8 chars
     * @inlen:  the length of @in
     *
     * Take a block of UTF-8 chars in and try to convert it to an ASCII
     * block of chars out. Calling it with @in == NULL is the
     * "initialization" call: nothing is produced or consumed.
     *
     * Returns the number of bytes written if success, -2 if the transcoding
     *     fails (malformed UTF-8 or a character that has no ASCII
     *     equivalent), or -1 if @out, @outlen or @inlen is NULL.
     * The value of @inlen after return is the number of octets consumed;
     *     a sequence cut off at the end of @in is left unconsumed.
     * The value of @outlen after return is the number of octets produced.
     */
    static int
    UTF8Toascii(unsigned char* out, int *outlen,
                  const unsigned char* in, int *inlen) {
        const unsigned char* processed = in;
        const unsigned char* outend;
        const unsigned char* outstart = out;
        const unsigned char* instart = in;
        const unsigned char* inend;
        unsigned int c, d;
        int trailing;

        if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1);
        if (in == NULL) {
            /*
             * initialization nothing to do
             */
            *outlen = 0;
            *inlen = 0;
            return(0);
        }
        inend = in + (*inlen);
        outend = out + (*outlen);

        while (in < inend) {
            d = *in++;
            if      (d < 0x80)  { c = d; trailing = 0; }
            else if (d < 0xC0)  goto invalid;   /* trailing byte in leading position */
            else if (d < 0xE0)  { c = d & 0x1F; trailing = 1; }
            else if (d < 0xF0)  { c = d & 0x0F; trailing = 2; }
            else if (d < 0xF8)  { c = d & 0x07; trailing = 3; }
            else                goto invalid;   /* no chance for this in Ascii */

            /* sequence split across input chunks: wait for the rest */
            if (inend - in < trailing)
                break;

            for ( ; trailing; trailing--) {
                d = *in++;
                /* a non-continuation byte here means malformed input */
                if ((d & 0xC0) != 0x80)
                    goto invalid;
                c = (c << 6) | (d & 0x3F);
            }

            /* assertion: c is a single UCS-4 value */
            if (c >= 0x80)
                goto invalid;                   /* no chance for this in Ascii */
            if (out >= outend)
                break;
            *out++ = c;
            processed = in;
        }
        *outlen = out - outstart;
        *inlen = processed - instart;
        return(*outlen);

    invalid:
        *outlen = out - outstart;
        *inlen = processed - instart;
        return(-2);
    }
    
    #endif /* LIBXML_OUTPUT_ENABLED */
    
    
    /**
     * isolat1ToUTF8:
     * @out:  a pointer to an array of bytes to store the result
     * @outlen:  the length of @out
     * @in:  a pointer to an array of ISO Latin 1 chars
     * @inlen:  the length of @in
     *
     * Take a block of ISO Latin 1 chars in and try to convert it to an UTF-8
     * block of chars out. Bytes below 0x80 are copied as is; bytes from
     * 0x80 to 0xFF become a two-byte UTF-8 sequence. Conversion stops at
     * the first character that no longer fits in @out.
     *
     * Returns the number of bytes written if success, or -1 if any of the
     *     pointer arguments is NULL.
     * The value of @inlen after return is the number of octets consumed.
     * The value of @outlen after return is the number of octets produced.
     */
    int
    isolat1ToUTF8(unsigned char* out, int *outlen,
                  const unsigned char* in, int *inlen) {
        unsigned char* outstart = out;
        const unsigned char* base = in;
        unsigned char* outend;
        const unsigned char* inend;

        if ((out == NULL) || (in == NULL) || (outlen == NULL) || (inlen == NULL))
            return(-1);

        outend = out + *outlen;
        inend = in + (*inlen);

        while (in < inend) {
            unsigned int c = *in;

            if (c < 0x80) {
                if (out >= outend)
                    break;
                *out++ = c;
            } else {
                /* U+0080..U+00FF: 110000xx 10xxxxxx */
                if (outend - out < 2)
                    break;
                *out++ = ((c >> 6) & 0x1F) | 0xC0;
                *out++ = (c & 0x3F) | 0x80;
            }
            in++;
        }

        *outlen = out - outstart;
        *inlen = in - base;
        return(*outlen);
    }
    
    /**
     * UTF8ToUTF8:
     * @out:  a pointer to an array of bytes to store the result
     * @outlen:  the length of @out
     * @inb:  a pointer to an array of UTF-8 chars
     * @inlenb:  the length of @inb in bytes
     *
     * No op copy operation for UTF8 handling: copies as many bytes as fit,
     * i.e. the smaller of *@outlen and *@inlenb. Calling it with
     * @inb == NULL is the "initialization" call and produces nothing.
     *
     * Returns the number of bytes written, or -1 if @out, @outlen or
     *     @inlenb is NULL or the computed length is negative.
     * The value of *@inlenb after return is the number of octets consumed
     *     and *@outlen the number of octets produced (both untouched when
     *     -1 is returned).
     */
    static int
    UTF8ToUTF8(unsigned char* out, int *outlen,
               const unsigned char* inb, int *inlenb)
    {
        int len;

        if ((out == NULL) || (outlen == NULL) || (inlenb == NULL))
            return(-1);
        if (inb == NULL) {
            /* inb == NULL means output is initialized. */
            *outlen = 0;
            *inlenb = 0;
            return(0);
        }

        len = (*outlen > *inlenb) ? *inlenb : *outlen;
        if (len < 0)
            return(-1);

        /*
         * FIXME: Conversion functions must assure valid UTF-8, so we have
         * to check for UTF-8 validity. Preferably, this converter shouldn't
         * be used at all.
         */
        memcpy(out, inb, len);

        *outlen = len;
        *inlenb = len;
        return(*outlen);
    }
    
    /**
     * UTF8Toisolat1:
     * @out:  a pointer to an array of bytes to store the result
     * @outlen:  the length of @out
     * @in:  a pointer to an array of UTF-8 chars
     * @inlen:  the length of @in
     *
     * Take a block of UTF-8 chars in and try to convert it to an ISO Latin 1
     * block of chars out. Calling it with @in == NULL is the
     * "initialization" call: nothing is produced or consumed.
     *
     * Returns the number of bytes written if success, -2 if the transcoding
     *     fails (malformed UTF-8 or a character above U+00FF), or -1 if
     *     @out, @outlen or @inlen is NULL.
     * The value of @inlen after return is the number of octets consumed;
     *     a sequence cut off at the end of @in is left unconsumed.
     * The value of @outlen after return is the number of octets produced.
     */
    int
    UTF8Toisolat1(unsigned char* out, int *outlen,
                  const unsigned char* in, int *inlen) {
        const unsigned char* processed = in;
        const unsigned char* outend;
        const unsigned char* outstart = out;
        const unsigned char* instart = in;
        const unsigned char* inend;
        unsigned int c, d;
        int trailing;

        if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1);
        if (in == NULL) {
            /*
             * initialization nothing to do
             */
            *outlen = 0;
            *inlen = 0;
            return(0);
        }
        inend = in + (*inlen);
        outend = out + (*outlen);

        while (in < inend) {
            d = *in++;
            if      (d < 0x80)  { c = d; trailing = 0; }
            else if (d < 0xC0)  goto invalid;   /* trailing byte in leading position */
            else if (d < 0xE0)  { c = d & 0x1F; trailing = 1; }
            else if (d < 0xF0)  { c = d & 0x0F; trailing = 2; }
            else if (d < 0xF8)  { c = d & 0x07; trailing = 3; }
            else                goto invalid;   /* no chance for this in IsoLat1 */

            /* sequence split across input chunks: wait for the rest */
            if (inend - in < trailing)
                break;

            for ( ; trailing; trailing--) {
                d = *in++;
                if ((d & 0xC0) != 0x80)
                    goto invalid;
                c = (c << 6) | (d & 0x3F);
            }

            /* assertion: c is a single UCS-4 value */
            if (c > 0xFF)
                goto invalid;                   /* no chance for this in IsoLat1 */
            if (out >= outend)
                break;
            *out++ = c;
            processed = in;
        }
        *outlen = out - outstart;
        *inlen = processed - instart;
        return(*outlen);

    invalid:
        *outlen = out - outstart;
        *inlen = processed - instart;
        return(-2);
    }
    
    #endif /* LIBXML_OUTPUT_ENABLED */
    
    
    /**
     * UTF16LEToUTF8:
     * @out:  a pointer to an array of bytes to store the result
     * @outlen:  the length of @out
     * @inb:  a pointer to an array of UTF-16LE passed as a byte array
     * @inlenb:  the length of @inb in bytes
     *
     * Take a block of UTF-16LE code units in and try to convert it to an
     * UTF-8 block of chars out. The input is decoded byte by byte (low byte
     * first), so the result does not depend on the byte order or alignment
     * requirements of the host. A character is only consumed when its whole
     * UTF-8 encoding fits in @out; a trailing odd byte or a high surrogate
     * whose partner has not arrived yet is left unconsumed.
     *
     * Returns the number of bytes written, or -2 if the transcoding fails
     *     (a high surrogate is not followed by a low surrogate).
     * The value of *@inlenb after return is the number of octets consumed
     *     and *@outlen the number of octets produced.
     */
    static int
    UTF16LEToUTF8(unsigned char* out, int *outlen,
                const unsigned char* inb, int *inlenb)
    {
        unsigned char* outstart = out;
        unsigned char* outend;
        const unsigned char* in = inb;
        const unsigned char* inend;
        unsigned int c, d;
        int need;

        if (*outlen == 0) {
            *inlenb = 0;
            return(0);
        }
        outend = out + *outlen;

        /* a trailing odd byte cannot form a code unit yet */
        if ((*inlenb % 2) == 1)
            (*inlenb)--;
        inend = in + *inlenb;

        while (inend - in >= 2) {
            const unsigned char* next = in + 2;

            c = in[0] | ((unsigned int) in[1] << 8);
            if ((c & 0xFC00) == 0xD800) {    /* surrogates */
                if (inend - next < 2)        /* handle split multi-unit characters */
                    break;
                d = next[0] | ((unsigned int) next[1] << 8);
                if ((d & 0xFC00) != 0xDC00) {
                    *outlen = out - outstart;
                    *inlenb = in - inb;
                    return(-2);
                }
                c = (((c & 0x03FF) << 10) | (d & 0x03FF)) + 0x10000;
                next += 2;
            }

            /* assertion: c is a single UCS-4 value */
            if      (c <    0x80)  need = 1;
            else if (c <   0x800)  need = 2;
            else if (c < 0x10000)  need = 3;
            else                   need = 4;
            if (outend - out < need)
                break;

            if (need == 1) {
                *out++ = c;
            } else if (need == 2) {
                *out++ = ((c >>  6) & 0x1F) | 0xC0;
                *out++ = (c & 0x3F) | 0x80;
            } else if (need == 3) {
                *out++ = ((c >> 12) & 0x0F) | 0xE0;
                *out++ = ((c >>  6) & 0x3F) | 0x80;
                *out++ = (c & 0x3F) | 0x80;
            } else {
                *out++ = ((c >> 18) & 0x07) | 0xF0;
                *out++ = ((c >> 12) & 0x3F) | 0x80;
                *out++ = ((c >>  6) & 0x3F) | 0x80;
                *out++ = (c & 0x3F) | 0x80;
            }
            in = next;
        }
        *outlen = out - outstart;
        *inlenb = in - inb;
        return(*outlen);
    }
    
    /**
     * UTF8ToUTF16LE:
     * @outb:  a pointer to an array of bytes to store the result
     * @outlen:  the length of @outb
     * @in:  a pointer to an array of UTF-8 chars
     * @inlen:  the length of @in
     *
     * Take a block of UTF-8 chars in and try to convert it to an UTF-16LE
     * block of chars out. Code units are stored byte by byte (low byte
     * first), so the result does not depend on the byte order or alignment
     * requirements of the host. Calling it with @in == NULL is the
     * "initialization" call: nothing is produced or consumed.
     *
     * Returns the number of bytes written, -2 if the transcoding failed
     *     (malformed UTF-8 or a code point above U+10FFFF), or -1 if
     *     @outb, @outlen or @inlen is NULL.
     * The value of *@inlen after return is the number of octets consumed
     *     and *@outlen the number of octets produced; a character that is
     *     cut off in @in or does not fit in @outb is left unconsumed.
     */
    static int
    UTF8ToUTF16LE(unsigned char* outb, int *outlen,
                const unsigned char* in, int *inlen)
    {
        unsigned char* out = outb;
        unsigned char* outend;
        const unsigned char* processed = in;
        const unsigned char *const instart = in;
        const unsigned char* inend;
        unsigned int c, d;
        int trailing;

        /* UTF16LE encoding has no BOM */
        if ((outb == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1);
        if (in == NULL) {
            *outlen = 0;
            *inlen = 0;
            return(0);
        }
        inend = in + *inlen;
        /* only whole 16-bit code units can be written */
        outend = outb + (*outlen / 2) * 2;

        while (in < inend) {
            d = *in++;
            if      (d < 0x80)  { c = d; trailing = 0; }
            else if (d < 0xC0)  goto invalid;   /* trailing byte in leading position */
            else if (d < 0xE0)  { c = d & 0x1F; trailing = 1; }
            else if (d < 0xF0)  { c = d & 0x0F; trailing = 2; }
            else if (d < 0xF8)  { c = d & 0x07; trailing = 3; }
            else                goto invalid;   /* no chance for this in UTF-16 */

            /* sequence split across input chunks: wait for the rest */
            if (inend - in < trailing)
                break;

            for ( ; trailing; trailing--) {
                d = *in++;
                if ((d & 0xC0) != 0x80)
                    goto invalid;
                c = (c << 6) | (d & 0x3F);
            }

            /* assertion: c is a single UCS-4 value */
            if (c < 0x10000) {
                if (outend - out < 2)
                    break;
                out[0] = c & 0xFF;
                out[1] = c >> 8;
                out += 2;
            } else if (c < 0x110000) {
                unsigned int hi, lo;

                if (outend - out < 4)
                    break;
                c -= 0x10000;
                hi = 0xD800 | (c >> 10);
                lo = 0xDC00 | (c & 0x03FF);
                out[0] = hi & 0xFF;
                out[1] = hi >> 8;
                out[2] = lo & 0xFF;
                out[3] = lo >> 8;
                out += 4;
            } else {
                /* beyond U+10FFFF: not representable in UTF-16 */
                goto invalid;
            }
            processed = in;
        }
        *outlen = out - outb;
        *inlen = processed - instart;
        return(*outlen);

    invalid:
        *outlen = out - outb;
        *inlen = processed - instart;
        return(-2);
    }
    
     * UTF8ToUTF16:
     * @outb:  a pointer to an array of bytes to store the result
     * @outlen:  the length of @outb
     * @in:  a pointer to an array of UTF-8 chars
     * @inlen:  the length of @in
     *
     * Take a block of UTF-8 chars in and try to convert it to an UTF-16
     * block of chars out.
     *
     * Returns the number of bytes written, or -1 if lack of space, or -2
    
     *     if the transcoding failed.
    
     */
    static int
    UTF8ToUTF16(unsigned char* outb, int *outlen,
                const unsigned char* in, int *inlen)
    {
        if (in == NULL) {
    	/*
    	 * initialization, add the Byte Order Mark for UTF-16LE
    	 */
            if (*outlen >= 2) {
    	    outb[0] = 0xFF;
    	    outb[1] = 0xFE;
    	    *outlen = 2;
    	    *inlen = 0;
    #ifdef DEBUG_ENCODING
                xmlGenericError(xmlGenericErrorContext,
    		    "Added FFFE Byte Order Mark\n");
    #endif
    	    return(2);
    	}
    	*outlen = 0;
    	*inlen = 0;
    	return(0);
        }
        return (UTF8ToUTF16LE(outb, outlen, in, inlen));
    }
    
    #endif /* LIBXML_OUTPUT_ENABLED */
    
     * UTF16BEToUTF8:
     * @out:  a pointer to an array of bytes to store the result
     * @outlen:  the length of @out
    
     * @inb:  a pointer to an array of UTF-16 passed as a byte array
    
     * @inlenb:  the length of @in in UTF-16 chars
     *
     * Take a block of UTF-16 ushorts in and try to convert it to an UTF-8
    
     * block of chars out. This function assumes the endian property
    
     * is the same between the native type of this machine and the
     * inputed one.
     *
    
     * Returns the number of bytes written, or -1 if lack of space, or -2
     *     if the transcoding fails (if *in is not a valid utf16 string)
    
     * The value of *inlen after return is the number of octets consumed
    
     *     if the return value is positive, else unpredictable.
    
    UTF16BEToUTF8(unsigned char* out, int *outlen,
                const unsigned char* inb, int *inlenb)
    {
        unsigned char* outstart = out;
        const unsigned char* processed = inb;
    
        unsigned short* in = (unsigned short*) inb;
        unsigned short* inend;
        unsigned int c, d, inlen;
        unsigned char *tmp;
        int bits;
    
    
        if (*outlen == 0) {
            *inlenb = 0;
            return(0);
        }
        outend = out + *outlen;
    
        if ((*inlenb % 2) == 1)
            (*inlenb)--;
        inlen = *inlenb / 2;
        inend= in + inlen;
    
        while ((in < inend) && (out - outstart + 5 < *outlen)) {
    
    	if (xmlLittleEndian) {
    	    tmp = (unsigned char *) in;
    	    c = *tmp++;
    
    	    in++;
    	} else {
    	    c= *in++;
    
            if ((c & 0xFC00) == 0xD800) {    /* surrogates */
    
    	    if (in >= inend) {           /* handle split mutli-byte characters */
                    break;
    
    	    }
    	    if (xmlLittleEndian) {
    		tmp = (unsigned char *) in;
    		d = *tmp++;
    
    		in++;
    	    } else {
    		d= *in++;
    	    }
                if ((d & 0xFC00) == 0xDC00) {
                    c &= 0x03FF;
                    c <<= 10;
                    c |= d & 0x03FF;
                    c += 0x10000;
                }
                else {
    		*outlen = out - outstart;
    		*inlenb = processed - inb;
    	        return(-2);
    	    }
            }
    
    	/* assertion: c is a single UTF-4 value */
    
    	    break;
            if      (c <    0x80) {  *out++=  c;                bits= -6; }
            else if (c <   0x800) {  *out++= ((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
            else if (c < 0x10000) {  *out++= ((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
            else                  {  *out++= ((c >> 18) & 0x07) | 0xF0;  bits= 12; }
    
            for ( ; bits >= 0; bits-= 6) {
    
    	        break;
                *out++= ((c >> bits) & 0x3F) | 0x80;
            }
    	processed = (const unsigned char*) in;
        }
        *outlen = out - outstart;
        *inlenb = processed - inb;
    
    /**
     * UTF8ToUTF16BE:
     * @outb:  a pointer to an array of bytes to store the result
     * @outlen:  the length of @outb
     * @in:  a pointer to an array of UTF-8 chars
     * @inlen:  the length of @in
     *
     * Take a block of UTF-8 chars in and try to convert it to an UTF-16BE
     * block of chars out. Code units are stored byte by byte (high byte
     * first), so the result does not depend on the byte order or alignment
     * requirements of the host. Calling it with @in == NULL is the
     * "initialization" call: nothing is produced or consumed.
     *
     * Returns the number of bytes written, -2 if the transcoding failed
     *     (malformed UTF-8 or a code point above U+10FFFF), or -1 if
     *     @outb, @outlen or @inlen is NULL.
     * The value of *@inlen after return is the number of octets consumed
     *     and *@outlen the number of octets produced (always counted in
     *     bytes, also on error); a character that is cut off in @in or does
     *     not fit in @outb is left unconsumed.
     */
    static int
    UTF8ToUTF16BE(unsigned char* outb, int *outlen,
                const unsigned char* in, int *inlen)
    {
        unsigned char* out = outb;
        unsigned char* outend;
        const unsigned char* processed = in;
        const unsigned char *const instart = in;
        const unsigned char* inend;
        unsigned int c, d;
        int trailing;

        /* UTF-16BE has no BOM */
        if ((outb == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1);
        if (in == NULL) {
            *outlen = 0;
            *inlen = 0;
            return(0);
        }
        inend = in + *inlen;
        /* only whole 16-bit code units can be written */
        outend = outb + (*outlen / 2) * 2;

        while (in < inend) {
            d = *in++;
            if      (d < 0x80)  { c = d; trailing = 0; }
            else if (d < 0xC0)  goto invalid;   /* trailing byte in leading position */
            else if (d < 0xE0)  { c = d & 0x1F; trailing = 1; }
            else if (d < 0xF0)  { c = d & 0x0F; trailing = 2; }
            else if (d < 0xF8)  { c = d & 0x07; trailing = 3; }
            else                goto invalid;   /* no chance for this in UTF-16 */

            /* sequence split across input chunks: wait for the rest */
            if (inend - in < trailing)
                break;

            for ( ; trailing; trailing--) {
                d = *in++;
                if ((d & 0xC0) != 0x80)
                    goto invalid;
                c = (c << 6) | (d & 0x3F);
            }

            /* assertion: c is a single UCS-4 value */
            if (c < 0x10000) {
                if (outend - out < 2)
                    break;
                out[0] = c >> 8;
                out[1] = c & 0xFF;
                out += 2;
            } else if (c < 0x110000) {
                unsigned int hi, lo;

                if (outend - out < 4)
                    break;
                c -= 0x10000;
                hi = 0xD800 | (c >> 10);
                lo = 0xDC00 | (c & 0x03FF);
                out[0] = hi >> 8;
                out[1] = hi & 0xFF;
                out[2] = lo >> 8;
                out[3] = lo & 0xFF;
                out += 4;
            } else {
                /* beyond U+10FFFF: not representable in UTF-16 */
                goto invalid;
            }
            processed = in;
        }
        *outlen = out - outb;
        *inlen = processed - instart;
        return(*outlen);

    invalid:
        *outlen = out - outb;
        *inlen = processed - instart;
        return(-2);
    }
    
    #endif /* LIBXML_OUTPUT_ENABLED */
    
    /************************************************************************
     *									*
     *		Generic encoding handling routines			*
     *									*
     ************************************************************************/
    
    
    /**
     * xmlDetectCharEncoding:
     * @in:  a pointer to the first bytes of the XML entity, must be at least
    
     *       2 bytes long (at least 4 if encoding is UTF4 variant).
    
     * @len:  pointer to the length of the buffer
     *
     * Guess the encoding of the entity using the first bytes of the entity content
    
     * according to the non-normative appendix F of the XML-1.0 recommendation.
    
     * Returns one of the XML_CHAR_ENCODING_... values.
     */
    xmlCharEncoding
    xmlDetectCharEncoding(const unsigned char* in, int len)
    {
    
        if (len >= 4) {
    	if ((in[0] == 0x00) && (in[1] == 0x00) &&
    	    (in[2] == 0x00) && (in[3] == 0x3C))
    	    return(XML_CHAR_ENCODING_UCS4BE);
    	if ((in[0] == 0x3C) && (in[1] == 0x00) &&
    	    (in[2] == 0x00) && (in[3] == 0x00))
    	    return(XML_CHAR_ENCODING_UCS4LE);
    	if ((in[0] == 0x00) && (in[1] == 0x00) &&
    	    (in[2] == 0x3C) && (in[3] == 0x00))
    	    return(XML_CHAR_ENCODING_UCS4_2143);
    	if ((in[0] == 0x00) && (in[1] == 0x3C) &&
    	    (in[2] == 0x00) && (in[3] == 0x00))
    	    return(XML_CHAR_ENCODING_UCS4_3412);
    	if ((in[0] == 0x4C) && (in[1] == 0x6F) &&
    	    (in[2] == 0xA7) && (in[3] == 0x94))
    	    return(XML_CHAR_ENCODING_EBCDIC);
    	if ((in[0] == 0x3C) && (in[1] == 0x3F) &&
    	    (in[2] == 0x78) && (in[3] == 0x6D))
    	    return(XML_CHAR_ENCODING_UTF8);
    
    	/*
    	 * Although not part of the recommendation, we also
    	 * attempt an "auto-recognition" of UTF-16LE and
    	 * UTF-16BE encodings.
    	 */
    	if ((in[0] == 0x3C) && (in[1] == 0x00) &&
    	    (in[2] == 0x3F) && (in[3] == 0x00))
    	    return(XML_CHAR_ENCODING_UTF16LE);
    	if ((in[0] == 0x00) && (in[1] == 0x3C) &&
    	    (in[2] == 0x00) && (in[3] == 0x3F))
    	    return(XML_CHAR_ENCODING_UTF16BE);
    
        if (len >= 3) {
    	/*
    	 * Errata on XML-1.0 June 20 2001
    	 * We now allow an UTF8 encoded BOM
    	 */
    	if ((in[0] == 0xEF) && (in[1] == 0xBB) &&
    	    (in[2] == 0xBF))
    	    return(XML_CHAR_ENCODING_UTF8);
        }
    
        /* For UTF-16 we can recognize by the BOM */
    
        if (len >= 2) {
    	if ((in[0] == 0xFE) && (in[1] == 0xFF))
    	    return(XML_CHAR_ENCODING_UTF16BE);
    	if ((in[0] == 0xFF) && (in[1] == 0xFE))