# HG changeset patch # User websnarf <github@azillionmonkeys.com> # Date 1438060578 25200 # Mon Jul 27 22:16:18 2015 -0700 # Node ID cb6655bb2d21138d7635c0975ea38be74d479f0d # Parent 4f98637335e4c35ff438b4da9a48a55c56548471 Add utf8util module. Other minor updates. diff --git a/bstraux.c b/bstraux.c --- a/bstraux.c +++ b/bstraux.c @@ -1,5 +1,3 @@ -#define _CRT_SECURE_NO_WARNINGS - /* * This source file is part of the bstring string library. This code was * written by Paul Hsieh in 2002-2015, and is covered by the BSD open source @@ -15,6 +13,10 @@ * functions. */ +#if defined (_MSC_VER) +# define _CRT_SECURE_NO_WARNINGS +#endif + #include <stdio.h> #include <stdlib.h> #include <string.h> diff --git a/bstrlib.c b/bstrlib.c --- a/bstrlib.c +++ b/bstrlib.c @@ -1,6 +1,6 @@ /* * This source file is part of the bstring string library. This code was - * written by Paul Hsieh in 2002-2008, and is covered by the BSD open source + * written by Paul Hsieh in 2002-2015, and is covered by the BSD open source * license and the GPL. Refer to the accompanying documentation for details * on usage and license. */ @@ -1640,11 +1640,11 @@ while ((pos = instr (b, pos, auxf)) >= 0) { if (slen >= mlen - 1) { int *t; - int vl; + int sl; mlen += mlen; - vl = sizeof (int *) * mlen; + sl = sizeof (int *) * mlen; if (static_d == d) d = NULL; /* static_d cannot be realloced */ - if (mlen <= 0 || vl < mlen || NULL == (t = (int *) bstr__realloc (d, vl))) { + if (mlen <= 0 || sl < mlen || NULL == (t = (int *) bstr__realloc (d, sl))) { ret = BSTR_ERR; goto done; } diff --git a/bstrlib.h b/bstrlib.h --- a/bstrlib.h +++ b/bstrlib.h @@ -1,14 +1,14 @@ /* * This source file is part of the bstring string library. This code was - * written by Paul Hsieh in 2002-2008, and is covered by the BSD open source - * license and the GPL. Refer to the accompanying documentation for details + * written by Paul Hsieh in 2002-2015, and is covered by the BSD open source + * license and the GPL. 
Refer to the accompanying documentation for details * on usage and license. */ /* - * bstrlib.c + * bstrlib.h * - * This file is the core module for implementing the bstring functions. + * This file is the interface for the core bstring functions. */ #ifndef BSTRLIB_INCLUDE diff --git a/bstrlib.txt b/bstrlib.txt --- a/bstrlib.txt +++ b/bstrlib.txt @@ -479,7 +479,6 @@ implement such things at this low a level is that it is typically more efficient to use locks in higher level primitives. There is also no platform independent way to implement locks or mutexes. -4. Unicode/widecharacter support. Note that except for spotty support of wide characters, the default C standard library does not address any of these problems either. @@ -1998,6 +1997,13 @@ appending and extracting code points to and from bstrings, and parsing UTF8 and UTF16 from raw data. + The types cpUcs4 and cpUcs2 are defined as 4 byte and 2 byte + encoding formats corresponding to UCS4 and UCS2 respectively. To test + if a raw code point is valid, the macro isLegalUnicodeCodePoint() has + been defined. The utf8 iterator is defined by struct utf8Iterator. To + test if the iterator has no more code points to walk through, the macro + utf8IteratorNoMore() has been defined. + To use these functions compile and link utf8util.c and buniutil.c .......................................................................... @@ -2014,7 +2020,8 @@ extern void utf8IteratorUninit (struct utf8Iterator* iter); - Invalidate utf8 iterator. + Invalidate utf8 iterator. After calling this, the iterator iter should + yield true when passed to the utf8IteratorNoMore() macro. .......................................................................... @@ -2049,7 +2056,7 @@ extern int buIsUTF8Content (const_bstring bu); - Scan a bstring and determine if it is made entirely of unicode code + Scan a bstring and determine if it is made entirely of unicode code valid points. If it is, 1 is returned, otherwise 0 is returned. 
.......................................................................... @@ -2067,7 +2074,7 @@ extern int buGetBlkUTF16 (cpUcs2* ucs2, int len, cpUcs4 errCh, const_bstring bu, int pos); - Convert a string of UTF8 codepoints (bu), skipping the first pos, into a + Convert a string of UTF8 codepoints (bu), skipping the first pos, into a sequence of UTF16 encoded code points. Returns the number of UCS2 16-bit words written to the output. No more than len words are written to the target array ucs2. If any code point in bu is unparsable, it will be diff --git a/buniutil.c b/buniutil.c --- a/buniutil.c +++ b/buniutil.c @@ -20,7 +20,7 @@ /* int buIsUTF8Content (const_bstring bu) * - * Scan string and return 1 if its entire contents is entirely UTF8 code + * Scan string and return 1 if its entire contents is entirely UTF8 code * points. Otherwise return 0. */ int buIsUTF8Content (const_bstring bu) { @@ -28,7 +28,7 @@ if (NULL == bdata (bu)) return 0; for (utf8IteratorInit (&iter, bu->data, bu->slen); - !utf8IteratorNoMore (&iter);) { + iter.next < iter.slen;) { if (0 >= utf8IteratorGetNextCodePoint (&iter, -1)) return 0; } return 1; @@ -37,10 +37,11 @@ /* int buGetBlkUTF16 (cpUcs2* ucs2, int len, cpUcs4 errCh, const_bstring bu, * int pos) * - * Convert a string of UTF8 codepoints (bu) into a sequence of UTF16 encoded - * code points. Returns the number of UCS2 16-bit words written to the - * output. No more than len words are written to the target array ucs2. If - * any code point in bu is unparsable, it will be translated to errCh. + * Convert a string of UTF8 codepoints (bu) skipping the first pos, into a + * sequence of UTF16 encoded code points. Returns the number of UCS2 16-bit + * words written to the output. No more than len words are written to the + * target array ucs2. If any code point in bu is unparsable, it will be + * translated to errCh. 
*/ int buGetBlkUTF16 (/* @out */ cpUcs2* ucs2, int len, cpUcs4 errCh, const_bstring bu, int pos) { struct tagbstring t; @@ -65,7 +66,8 @@ utf8IteratorInit (&iter, t.data, t.slen); ucs4 = BSTR_ERR; - for (i=0; 0 < len && !utf8IteratorNoMore (&iter) && 0 <= (ucs4 = utf8IteratorGetNextCodePoint (&iter, errCh)); i++) { + for (i=0; 0 < len && iter.next < iter.slen && + 0 <= (ucs4 = utf8IteratorGetNextCodePoint (&iter, errCh)); i++) { if (ucs4 < 0x10000) { *ucs2++ = (cpUcs2) ucs4; len--; @@ -97,13 +99,13 @@ Unicode UTF-8 ------- ----- -U-00000000 - U-0000007F: 0xxxxxxx -U-00000080 - U-000007FF: 110xxxxx 10xxxxxx -U-00000800 - U-0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx +U-00000000 - U-0000007F: 0xxxxxxx +U-00000080 - U-000007FF: 110xxxxx 10xxxxxx +U-00000800 - U-0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx U-00010000 - U-001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx -U-00200000 - U-03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx -U-04000000 - U-7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx +U-00200000 - U-03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx +U-04000000 - U-7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx UTF-32: U-000000 - U-10FFFF @@ -154,7 +156,7 @@ b->slen = oldSlen; return BSTR_ERR; } - } else + } else #if 0 if (v < 0x200000) #endif @@ -167,8 +169,8 @@ b->slen = oldSlen; return BSTR_ERR; } - } -#if 0 + } +#if 0 else if (v < 0x4000000) { c[0] = (unsigned char) ( (v >> 24) + 0xf8); c[1] = (unsigned char) (((v >> 18) & 0x3f) + 0x80); @@ -199,15 +201,15 @@ #define endSwap(cs,mode) ((mode) ? ((((cs) & 0xFF) << 8) | (((cs) >> 8) & 0xFF)) : (cs)) #define TEMP_UCS4_BUFFER_SIZE (64) -/* int buAppendBlkUTF16 (bstring bu, const cpUcs2* utf16, int len, +/* int buAppendBlkUTF16 (bstring bu, const cpUcs2* utf16, int len, * cpUcs2* bom, cpUcs4 errCh) * - * Append an array of UCS4 code points (utf16) to UTF8 codepoints (bu). Any + * Append an array of UCS2 code points (utf16) to UTF8 codepoints (bu). 
Any * invalid code point is replaced by errCh. If errCh is itself not a * valid code point, then this translation will halt upon the first error * and return BSTR_ERR. Otherwise BSTR_OK is returned. If a byte order mark * has been previously read, it may be passed in as bom, otherwise if *bom is - * set to 0, it will be filled in with the BOM as read from the first + * set to 0, it will be filled in with the BOM as read from the first * character if it is a BOM. */ int buAppendBlkUTF16 (bstring bu, const cpUcs2* utf16, int len, cpUcs2* bom, cpUcs4 errCh) { @@ -241,7 +243,9 @@ cc = 0; for (;i < len; i++) { - cpUcs4 c, v = endSwap (utf16[i], sm); + cpUcs4 c, v; + v = endSwap (utf16[i], sm); + if ((v | 0x7FF) == 0xDFFF) { /* Deal with surrogate pairs */ if (v >= 0xDC00 || i >= len) { ErrMode:; diff --git a/test.cpp b/test.cpp --- a/test.cpp +++ b/test.cpp @@ -1,6 +1,6 @@ // // This source file is part of the bstring string library. This code was -// written by Paul Hsieh in 2002-2006, and is covered by the BSD open source +// written by Paul Hsieh in 2002-2015, and is covered by the BSD open source // license. Refer to the accompanying documentation for details on usage and // license. // diff --git a/testaux.c b/testaux.c --- a/testaux.c +++ b/testaux.c @@ -1,7 +1,7 @@ /* * This source file is part of the bstring string library. This code was - * written by Paul Hsieh in 2002-2006, and is covered by the BSD open source - * license. Refer to the accompanying documentation for details on usage and + * written by Paul Hsieh in 2002-2015, and is covered by the BSD open source + * license. Refer to the accompanying documentation for details on usage and * license. 
*/ @@ -26,7 +26,7 @@ if (0 > bcatblk (b, buf, elsize)) break; buf = (const void *) (elsize + (const char *) buf); } - return (int) i; + return (int) i; } int test0 (void) { diff --git a/utf8util.c b/utf8util.c new file mode 100644 --- /dev/null +++ b/utf8util.c @@ -0,0 +1,249 @@ +/* + * This source file is part of the bstring string library. This code was + * written by Paul Hsieh in 2002-2015, and is covered by the BSD open source + * license and the GPL. Refer to the accompanying documentation for details + * on usage and license. + */ + +/* + * utf8util.c + * + * This file is not necessarily part of the core bstring library itself, but + * is just a generic module for implementing utf8 utility functions. + */ + +#include "utf8util.h" + +#ifndef NULL +#ifdef __cplusplus +#define NULL 0 +#else +#define NULL ((void *)0) +#endif +#endif + +/* A code point is legal unless it lies in the surrogate range 0xD800-0xDFFF, exceeds the maximum 0x10FFFF, or has its low 16 bits equal to 0xFFFE (the BOM alias) or 0xFFFF */ +#define isLegalUnicodeCodePoint(v) ((((v) < 0xD800L) || ((v) > 0xDFFFL)) && (((unsigned long)(v)) <= 0x0010FFFFL) && (((v)|0x1F0001) != 0x1FFFFFL)) +/* Set up iter to walk the slen bytes at data. A NULL data or a negative slen leaves the iterator unusable (slen = next = -1, error = 1). A NULL iter is ignored. */ +void utf8IteratorInit (struct utf8Iterator* iter, unsigned char* data, int slen) { + if (iter) { + iter->data = data; + iter->slen = (iter->data && slen >= 0) ? slen : -1; /* -1 marks an unusable iterator */ + iter->start = -1; /* nothing has been read yet */ + iter->next = (iter->slen >= 0) ? 0 : -1; + iter->error = (iter->slen >= 0) ? 
0 : 1; + } +} +/* Invalidate iter: data is cleared and slen, start and next are set to -1; error is left untouched. A NULL iter is ignored. */ +void utf8IteratorUninit (struct utf8Iterator* iter) { + if (iter) { + iter->data = NULL; + iter->slen = -1; + iter->start = iter->next = -1; + } +} +/* Decode the code point that contains byte msg[pos], stepping back up to 3 bytes to find its lead byte. On success the code point is stored in *out (when out is non-NULL) and the number of bytes stepped back (0-3) is returned; bad arguments or malformed UTF-8 yield a negative value. */ +int utf8ScanBackwardsForCodePoint (unsigned char* msg, int len, int pos, cpUcs4* out) { + cpUcs4 v1, v2, v3, v4, x; + int ret; + if (NULL == msg || len < 0 || (unsigned) pos >= (unsigned) len) { + return -__LINE__; + } + if (!out) out = &x; + ret = 0; + if (msg[pos] < 0x80) { + *out = msg[pos]; + return 0; + } else if (msg[pos] < 0xC0) { + if (0 == pos) return -__LINE__; + ret = -__LINE__; + if (msg[pos-1] >= 0xC1 && msg[pos-1] < 0xF8) { + pos--; + ret = 1; + } else { + if (1 == pos) return -__LINE__; + if ((msg[pos-1] | 0x3F) != 0xBF) return -__LINE__; + if (msg[pos-2] >= 0xE0 && msg[pos-2] < 0xF8) { + pos -= 2; + ret = 2; + } else { + if (2 == pos) return -__LINE__; + if ((msg[pos-2] | 0x3F) != 0xBF) return -__LINE__; + if ((msg[pos-3]|0x07) == 0xF7) { + pos -= 3; + ret = 3; + } else return -__LINE__; + } + } + } + if (msg[pos] < 0xE0) { + if (pos + 1 >= len || (msg[pos+1] & 0xC0) != 0x80) return -__LINE__; /* truncated, or trail byte is not 10xxxxxx */ + v1 = msg[pos] & ~0xE0; + v2 = msg[pos+1] & ~0xC0; + v1 = (v1 << 6) + v2; + if (v1 < 0x80) return -__LINE__; + *out = v1; + return ret; + } + if (msg[pos] < 0xF0) { + if (pos + 2 >= len || (msg[pos+1] & 0xC0) != 0x80 || (msg[pos+2] & 0xC0) != 0x80) return -__LINE__; /* truncated, or a trail byte is not 10xxxxxx */ + v1 = msg[pos] & ~0xF0; + v2 = msg[pos+1] & ~0xC0; + v3 = msg[pos+2] & ~0xC0; + v1 = (v1 << 12) + (v2 << 6) + v3; + if (v1 < 0x800) return -__LINE__; + if (!isLegalUnicodeCodePoint(v1)) return -__LINE__; + *out = v1; + return ret; + } + + if (msg[pos] >= 0xF8) return -__LINE__; + + if (pos + 3 >= len || (msg[pos+1] & 0xC0) != 0x80 || (msg[pos+2] & 0xC0) != 0x80 || (msg[pos+3] & 0xC0) != 0x80) return -__LINE__; /* truncated, or a trail byte is not 10xxxxxx */ + v1 = msg[pos] & ~0xF8; + v2 = msg[pos+1] & ~0xC0; + v3 = msg[pos+2] & ~0xC0; + v4 = msg[pos+3] & ~0xC0; + v1 = (v1 << 18) + (v2 << 12) + (v3 << 6) + v4; + if (v1 < 0x10000) return -__LINE__; + if (!isLegalUnicodeCodePoint(v1)) return -__LINE__; + *out = v1; + return ret; +} + +/* +Code point UTF-8 +---------- ----- +U-00000000 - U-0000007F: 0xxxxxxx +U-00000080 - U-000007FF: 110xxxxx 10xxxxxx +U-00000800 - 
U-0000FFFF: 1110xxxx 10xxxxxx 10xxxxxx +U-00010000 - U-001FFFFF: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + +U-00200000 - U-03FFFFFF: 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx +U-04000000 - U-7FFFFFFF: 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx +*/ + +/* + * Decodes the code point at iter->next, advances the iterator past it and returns it. errCh is returned when iter is NULL, invalid or exhausted, or when the bytes are not valid UTF-8 (the malformed bytes are then skipped). + * + * iter->data + iter->start points at the characters just read. + * + * iter->data + iter->next points at the characters that will be read next. + * + * iter->error is boolean indicating whether or not last read contained an error. + */ +cpUcs4 utf8IteratorGetNextCodePoint (struct utf8Iterator* iter, cpUcs4 errCh) { + unsigned char * chrs; + unsigned char c, d, e; + long v; + int i, ofs; + + if (NULL == iter || iter->next < 0) return errCh; + if (iter->next >= iter->slen) { + iter->start = iter->slen; + return errCh; + } + if (NULL == iter->data || iter->next < 0 || utf8IteratorNoMore(iter)) return errCh; + chrs = iter->data + iter->next; + + iter->error = 0; + c = chrs[0]; + ofs = 0; + + if (c < 0xC0 || c > 0xFD) { + if (c >= 0x80) goto ErrMode; + v = c; + ofs = 1; + } else if (c < 0xE0) { + if (iter->slen - iter->next < 2) goto ErrMode; /* truncated: chrs[1] would lie past the end of the buffer */ + v = (c << 6u) - (0x0C0 << 6u); + c = (unsigned char) ((unsigned) chrs[1] - 0x080); + v += c; + if (c >= 0x40 || v < 0x80) goto ErrMode; + ofs = 2; + } else if (c < 0xF0) { + if (iter->slen - iter->next < 3) goto ErrMode; /* truncated: chrs[2] would lie past the end of the buffer */ + v = (c << 12) - (0x0E0 << 12u); + c = (unsigned char) ((unsigned) chrs[1] - 0x080); + d = (unsigned char) ((unsigned) chrs[2] - 0x080); + v += (c << 6u) + d; + if ((c|d) >= 0x40 || v < 0x800 || !isLegalUnicodeCodePoint (v)) goto ErrMode; + ofs = 3; + } else if (c < 0xF8) { + if (iter->slen - iter->next < 4) goto ErrMode; /* truncated: chrs[3] would lie past the end of the buffer */ + v = (c << 18) - (0x0F0 << 18u); + c = (unsigned char) ((unsigned) chrs[1] - 0x080); + d = (unsigned char) ((unsigned) chrs[2] - 0x080); + e = (unsigned char) ((unsigned) chrs[3] - 0x080); + v += (c << 12u) + (d << 6u) + e; + if ((c|d|e) >= 0x40 || v < 0x10000 || 
!isLegalUnicodeCodePoint (v)) goto ErrMode; + ofs = 4; + } else { /* 5 and 6 byte encodings are invalid */ + ErrMode:; + iter->error = 1; + v = errCh; + for (i = iter->next+1; i < iter->slen; i++) if ((iter->data[i] & 0xC0) != 0x80) break; + ofs = i - iter->next; + } + + iter->start = iter->next; + iter->next += ofs; + return v; +} + +/* + * Returns the code point at iter->next without advancing the iterator. errCh is returned when iter is NULL, invalid or exhausted, or when the bytes are not valid UTF-8. + * + * iter->data + iter->next points at the characters that are decoded. + * + * iter->start and iter->next are not moved, except that iter->start is set to iter->slen once the iterator is exhausted. + * + * iter->error is boolean indicating whether or not last read contained an error. + */ +cpUcs4 utf8IteratorGetCurrCodePoint (struct utf8Iterator* iter, cpUcs4 errCh) { + unsigned char * chrs; + unsigned char c, d, e; + long v; + + if (NULL == iter || iter->next < 0) return errCh; + if (iter->next >= iter->slen) { + iter->start = iter->slen; + return errCh; + } + if (NULL == iter->data || iter->next < 0 || utf8IteratorNoMore(iter)) return errCh; + chrs = iter->data + iter->next; + + iter->error = 0; + c = chrs[0]; + + if (c < 0xC0 || c > 0xFD) { + if (c >= 0x80) goto ErrMode; + v = c; + } else if (c < 0xE0) { + if (iter->slen - iter->next < 2) goto ErrMode; /* truncated: chrs[1] would lie past the end of the buffer */ + v = (c << 6u) - (0x0C0 << 6u); + c = (unsigned char) ((unsigned) chrs[1] - 0x080); + v += c; + if (c >= 0x40 || v < 0x80) goto ErrMode; + } else if (c < 0xF0) { + if (iter->slen - iter->next < 3) goto ErrMode; /* truncated: chrs[2] would lie past the end of the buffer */ + v = (c << 12lu) - (0x0E0 << 12u); + c = (unsigned char) ((unsigned) chrs[1] - 0x080); + d = (unsigned char) ((unsigned) chrs[2] - 0x080); + v += (c << 6u) + d; + if ((c|d) >= 0x40 || v < 0x800 || !isLegalUnicodeCodePoint (v)) goto ErrMode; + } else if (c < 0xF8) { + if (iter->slen - iter->next < 4) goto ErrMode; /* truncated: chrs[3] would lie past the end of the buffer */ + v = (c << 18lu) - (0x0F0 << 18u); + c = (unsigned char) ((unsigned) chrs[1] - 0x080); + d = (unsigned char) ((unsigned) chrs[2] - 0x080); + e = (unsigned char) ((unsigned) chrs[3] - 0x080); + v += (c << 12lu) + (d << 6u) + e; + if ((c|d|e) >= 0x40 || v < 
0x10000 || !isLegalUnicodeCodePoint (v)) goto ErrMode; + } else { /* 5 and 6 byte encodings are invalid */ + ErrMode:; + iter->error = 1; + v = errCh; + } + return v; +} diff --git a/utf8util.h b/utf8util.h new file mode 100644 --- /dev/null +++ b/utf8util.h @@ -0,0 +1,62 @@ +/* + * This source file is part of the bstring string library. This code was + * written by Paul Hsieh in 2002-2015, and is covered by the BSD open source + * license and the GPL. Refer to the accompanying documentation for details + * on usage and license. + */ + +/* + * utf8util.h + * + * This file defines the interface to the utf8 utility functions. + */ + +#ifndef UTF8_UNICODE_UTILITIES +#define UTF8_UNICODE_UTILITIES + +#include <limits.h> + +#ifdef __cplusplus +extern "C" { +#endif +/* cpUcs4: a signed integer type able to hold at least 0x7fffffff */ +#if INT_MAX >= 0x7fffffffUL +typedef int cpUcs4; +#elif LONG_MAX >= 0x7fffffffUL +typedef long cpUcs4; +#else +#error This compiler is not supported +#endif +/* cpUcs2: an unsigned integer type whose maximum value is exactly 0xFFFF */ +#if UINT_MAX == 0xFFFF +typedef unsigned int cpUcs2; +#elif USHRT_MAX == 0xFFFF +typedef unsigned short cpUcs2; +#elif UCHAR_MAX == 0xFFFF +typedef unsigned char cpUcs2; +#else +#error This compiler is not supported +#endif +/* Nonzero when v is a legal code point: not a surrogate (0xD800-0xDFFF), not above 0x10FFFF, and low 16 bits not 0xFFFE or 0xFFFF. Note that v is evaluated more than once. */ +#define isLegalUnicodeCodePoint(v) ((((v) < 0xD800L) || ((v) > 0xDFFFL)) && (((unsigned long)(v)) <= 0x0010FFFFL) && (((v)|0x1F0001) != 0x1FFFFFL)) +/* Cursor over a buffer of UTF-8 encoded bytes */ +struct utf8Iterator { + unsigned char* data; /* bytes being iterated; NULL after utf8IteratorUninit */ + int slen; /* byte length of data, or -1 when the iterator is unusable */ + int start, next; /* byte offsets of the code point last read and of the one to read next; -1 when unset */ + int error; /* nonzero when the last read hit malformed input, or when init was given bad arguments */ +}; +/* True when it is NULL or its read position has reached the end of the data (next >= slen) */ +#define utf8IteratorNoMore(it) (!(it) || (it)->next >= (it)->slen) + +extern void utf8IteratorInit (struct utf8Iterator* iter, unsigned char* data, int slen); +extern void utf8IteratorUninit (struct utf8Iterator* iter); +extern cpUcs4 utf8IteratorGetNextCodePoint (struct utf8Iterator* iter, cpUcs4 errCh); +extern cpUcs4 utf8IteratorGetCurrCodePoint (struct utf8Iterator* iter, cpUcs4 errCh); +extern int utf8ScanBackwardsForCodePoint (unsigned char* msg, int len, int pos, cpUcs4* out); + +#ifdef __cplusplus +} +#endif + +#endif /* UTF8_UNICODE_UTILITIES */