# HG changeset patch
# User websnarf <github@azillionmonkeys.com>
# Date 1438060578 25200
#      Mon Jul 27 22:16:18 2015 -0700
# Node ID cb6655bb2d21138d7635c0975ea38be74d479f0d
# Parent  4f98637335e4c35ff438b4da9a48a55c56548471
Add utf8util module.  Other minor updates.

diff --git a/bstraux.c b/bstraux.c
--- a/bstraux.c
+++ b/bstraux.c
@@ -1,5 +1,3 @@
-#define _CRT_SECURE_NO_WARNINGS
-
 /*
  * This source file is part of the bstring string library.  This code was
  * written by Paul Hsieh in 2002-2015, and is covered by the BSD open source
@@ -15,6 +13,10 @@
  * functions.
  */
 
+#if defined (_MSC_VER)
+# define _CRT_SECURE_NO_WARNINGS
+#endif
+
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
diff --git a/bstrlib.c b/bstrlib.c
--- a/bstrlib.c
+++ b/bstrlib.c
@@ -1,6 +1,6 @@
 /*
  * This source file is part of the bstring string library.  This code was
- * written by Paul Hsieh in 2002-2008, and is covered by the BSD open source
+ * written by Paul Hsieh in 2002-2015, and is covered by the BSD open source
  * license and the GPL. Refer to the accompanying documentation for details
  * on usage and license.
  */
@@ -1640,11 +1640,11 @@
 	while ((pos = instr (b, pos, auxf)) >= 0) {
 		if (slen >= mlen - 1) {
 			int *t;
-			int vl;
+			int sl;
 			mlen += mlen;
-			vl = sizeof (int *) * mlen;
+			sl = sizeof (int *) * mlen;
 			if (static_d == d) d = NULL; /* static_d cannot be realloced */
-			if (mlen <= 0 || vl < mlen || NULL == (t = (int *) bstr__realloc (d, vl))) {
+			if (mlen <= 0 || sl < mlen || NULL == (t = (int *) bstr__realloc (d, sl))) {
 				ret = BSTR_ERR;
 				goto done;
 			}
diff --git a/bstrlib.h b/bstrlib.h
--- a/bstrlib.h
+++ b/bstrlib.h
@@ -1,14 +1,14 @@
 /*
  * This source file is part of the bstring string library.  This code was
- * written by Paul Hsieh in 2002-2008, and is covered by the BSD open source 
- * license and the GPL. Refer to the accompanying documentation for details 
+ * written by Paul Hsieh in 2002-2015, and is covered by the BSD open source
+ * license and the GPL. Refer to the accompanying documentation for details
  * on usage and license.
  */
 
 /*
- * bstrlib.c
+ * bstrlib.h
  *
- * This file is the core module for implementing the bstring functions.
+ * This file is the interface for the core bstring functions.
  */
 
 #ifndef BSTRLIB_INCLUDE
diff --git a/bstrlib.txt b/bstrlib.txt
--- a/bstrlib.txt
+++ b/bstrlib.txt
@@ -479,7 +479,6 @@
    implement such things at this low a level is that it is typically more
    efficient to use locks in higher level primitives. There is also no
    platform independent way to implement locks or mutexes.
-4. Unicode/widecharacter support.
 
 Note that except for spotty support of wide characters, the default C
 standard library does not address any of these problems either.
@@ -1998,6 +1997,13 @@
     appending and extracting code points to and from bstrings, and parsing
     UTF8 and UTF16 from raw data.
 
+    The types cpUcs4 and cpUcs2 are defined as 4 byte and 2 byte encoding
+    formats corresponding to UCS4 and UCS2 respectively.  To test if a raw
+    code point is valid, the macro isLegalUnicodeCodePoint() has been
+    defined.  The utf8 iterator is defined by struct utf8Iterator.  To test
+    whether the iterator has no more code points to walk through, the macro
+    utf8IteratorNoMore() has been defined.
+
     To use these functions compile and link utf8util.c and buniutil.c
 
     ..........................................................................
@@ -2014,7 +2020,8 @@
 
     extern void utf8IteratorUninit (struct utf8Iterator* iter);
 
-    Invalidate utf8 iterator.
+    Invalidate utf8 iterator.  After calling this, the iterator iter should
+    yield true when passed to the utf8IteratorNoMore() macro.
 
     ..........................................................................
 
@@ -2049,7 +2056,7 @@
 
     extern int buIsUTF8Content (const_bstring bu);
 
-    Scan a bstring and determine if it is made entirely of unicode code 
+    Scan a bstring and determine if it is made entirely of unicode code
     valid points.  If it is, 1 is returned, otherwise 0 is returned.
 
     ..........................................................................
@@ -2067,7 +2074,7 @@
     extern int buGetBlkUTF16 (cpUcs2* ucs2, int len, cpUcs4 errCh,
                               const_bstring bu, int pos);
 
-    Convert a string of UTF8 codepoints (bu), skipping the first pos, into a 
+    Convert a string of UTF8 codepoints (bu), skipping the first pos, into a
     sequence of UTF16 encoded code points.  Returns the number of UCS2 16-bit
     words written to the output.  No more than len words are written to the
     target array ucs2.  If any code point in bu is unparsable, it will be
diff --git a/buniutil.c b/buniutil.c
--- a/buniutil.c
+++ b/buniutil.c
@@ -20,7 +20,7 @@
 
 /*  int buIsUTF8Content (const_bstring bu)
  *
- *  Scan string and return 1 if its entire contents is entirely UTF8 code 
+ *  Scan string and return 1 if its entire contents is entirely UTF8 code
  *  points.  Otherwise return 0.
  */
 int buIsUTF8Content (const_bstring bu) {
@@ -28,7 +28,7 @@
 
 	if (NULL == bdata (bu)) return 0;
 	for (utf8IteratorInit (&iter, bu->data, bu->slen);
-		 !utf8IteratorNoMore (&iter);) {
+	     iter.next < iter.slen;) {
 		if (0 >= utf8IteratorGetNextCodePoint (&iter, -1)) return 0;
 	}
 	return 1;
@@ -37,10 +37,11 @@
 /*  int buGetBlkUTF16 (cpUcs2* ucs2, int len, cpUcs4 errCh, const_bstring bu,
  *                     int pos)
  *
- *  Convert a string of UTF8 codepoints (bu) into a sequence of UTF16 encoded
- *  code points.  Returns the number of UCS2 16-bit words written to the 
- *  output.  No more than len words are written to the target array ucs2.  If
- *  any code point in bu is unparsable, it will be translated to errCh.
+ *  Convert a string of UTF8 codepoints (bu) skipping the first pos, into a
+ *  sequence of UTF16 encoded code points.  Returns the number of UCS2 16-bit
+ *  words written to the output.  No more than len words are written to the
+ *  target array ucs2.  If any code point in bu is unparsable, it will be
+ *  translated to errCh.
  */
 int buGetBlkUTF16 (/* @out */ cpUcs2* ucs2, int len, cpUcs4 errCh, const_bstring bu, int pos) {
 struct tagbstring t;
@@ -65,7 +66,8 @@
 	utf8IteratorInit (&iter, t.data, t.slen);
 
 	ucs4 = BSTR_ERR;
-	for (i=0; 0 < len && !utf8IteratorNoMore (&iter) && 0 <= (ucs4 = utf8IteratorGetNextCodePoint (&iter, errCh)); i++) {
+	for (i=0; 0 < len && iter.next < iter.slen &&
+	          0 <= (ucs4 = utf8IteratorGetNextCodePoint (&iter, errCh)); i++) {
 		if (ucs4 < 0x10000) {
 			*ucs2++ = (cpUcs2) ucs4;
 			len--;
@@ -97,13 +99,13 @@
 
 Unicode                   UTF-8
 -------                   -----
-U-00000000 - U-0000007F:  0xxxxxxx  
-U-00000080 - U-000007FF:  110xxxxx 10xxxxxx  
-U-00000800 - U-0000FFFF:  1110xxxx 10xxxxxx 10xxxxxx  
+U-00000000 - U-0000007F:  0xxxxxxx
+U-00000080 - U-000007FF:  110xxxxx 10xxxxxx
+U-00000800 - U-0000FFFF:  1110xxxx 10xxxxxx 10xxxxxx
 U-00010000 - U-001FFFFF:  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 
-U-00200000 - U-03FFFFFF:  111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx  
-U-04000000 - U-7FFFFFFF:  1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 
+U-00200000 - U-03FFFFFF:  111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+U-04000000 - U-7FFFFFFF:  1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
 
 UTF-32: U-000000 - U-10FFFF
 
@@ -154,7 +156,7 @@
 				b->slen = oldSlen;
 				return BSTR_ERR;
 			}
-		} else 
+		} else
 #if 0
 			if (v < 0x200000)
 #endif
@@ -167,8 +169,8 @@
 				b->slen = oldSlen;
 				return BSTR_ERR;
 			}
-		} 
-#if 0		
+		}
+#if 0
 		else if (v < 0x4000000) {
 			c[0] = (unsigned char) ( (v >> 24)         + 0xf8);
 			c[1] = (unsigned char) (((v >> 18) & 0x3f) + 0x80);
@@ -199,15 +201,15 @@
 #define endSwap(cs,mode) ((mode) ? ((((cs) & 0xFF) << 8) | (((cs) >> 8) & 0xFF)) : (cs))
 #define TEMP_UCS4_BUFFER_SIZE (64)
 
-/*  int buAppendBlkUTF16 (bstring bu, const cpUcs2* utf16, int len, 
+/*  int buAppendBlkUTF16 (bstring bu, const cpUcs2* utf16, int len,
  *                        cpUcs2* bom, cpUcs4 errCh)
  *
- *  Append an array of UCS4 code points (utf16) to UTF8 codepoints (bu).  Any
+ *  Append an array of UCS2 code points (utf16) to UTF8 codepoints (bu).  Any
  *  invalid code point is replaced by errCh.  If errCh is itself not a
  *  valid code point, then this translation will halt upon the first error
  *  and return BSTR_ERR.  Otherwise BSTR_OK is returned.  If a byte order mark
  *  has been previously read, it may be passed in as bom, otherwise if *bom is
- *  set to 0, it will be filled in with the BOM as read from the first 
+ *  set to 0, it will be filled in with the BOM as read from the first
  *  character if it is a BOM.
  */
 int buAppendBlkUTF16 (bstring bu, const cpUcs2* utf16, int len, cpUcs2* bom, cpUcs4 errCh) {
@@ -241,7 +243,9 @@
 
 	cc = 0;
 	for (;i < len; i++) {
-		cpUcs4 c, v = endSwap (utf16[i], sm);
+		cpUcs4 c, v;
+		v = endSwap (utf16[i], sm);
+
 		if ((v | 0x7FF) == 0xDFFF) { /* Deal with surrogate pairs */
 			if (v >= 0xDC00 || i >= len) {
 				ErrMode:;
diff --git a/test.cpp b/test.cpp
--- a/test.cpp
+++ b/test.cpp
@@ -1,6 +1,6 @@
 //
 // This source file is part of the bstring string library.  This code was
-// written by Paul Hsieh in 2002-2006, and is covered by the BSD open source
+// written by Paul Hsieh in 2002-2015, and is covered by the BSD open source
 // license. Refer to the accompanying documentation for details on usage and
 // license.
 //
diff --git a/testaux.c b/testaux.c
--- a/testaux.c
+++ b/testaux.c
@@ -1,7 +1,7 @@
 /*
  * This source file is part of the bstring string library.  This code was
- * written by Paul Hsieh in 2002-2006, and is covered by the BSD open source 
- * license. Refer to the accompanying documentation for details on usage and 
+ * written by Paul Hsieh in 2002-2015, and is covered by the BSD open source
+ * license. Refer to the accompanying documentation for details on usage and
  * license.
  */
 
@@ -26,7 +26,7 @@
 		if (0 > bcatblk (b, buf, elsize)) break;
 		buf = (const void *) (elsize + (const char *) buf);
 	}
-	return (int) i;	
+	return (int) i;
 }
 
 int test0 (void) {
diff --git a/utf8util.c b/utf8util.c
new file mode 100644
--- /dev/null
+++ b/utf8util.c
@@ -0,0 +1,249 @@
+/*
+ * This source file is part of the bstring string library.  This code was
+ * written by Paul Hsieh in 2002-2015, and is covered by the BSD open source
+ * license and the GPL. Refer to the accompanying documentation for details
+ * on usage and license.
+ */
+
+/*
+ * utf8util.c
+ *
+ * This file is not necessarily part of the core bstring library itself, but
+ * is just a generic module for implementing utf8 utility functions.
+ */
+
+#include "utf8util.h"
+
+#ifndef NULL
+#ifdef __cplusplus
+#define NULL	0
+#else
+#define NULL	((void *)0)
+#endif
+#endif
+
+/* Surrogates are illegal, there is a maximum (0x10FFFF), and values whose low 16 bits are 0xFFFE (the BOM alias) or 0xFFFF are illegal */
+#define isLegalUnicodeCodePoint(v) ((((v) < 0xD800L) || ((v) > 0xDFFFL)) && (((unsigned long)(v)) <= 0x0010FFFFL) && (((v)|0x1F0001) != 0x1FFFFFL))
+
+void utf8IteratorInit (struct utf8Iterator* iter, unsigned char* data, int slen) { /* Point iter at the slen bytes of data; a NULL iter is ignored. */
+	if (iter) {
+		iter->data  = data;
+		iter->slen  = (iter->data && slen >= 0) ? slen : -1; /* NULL data or a negative slen marks the iterator invalid (-1) */
+		iter->start = -1; /* nothing has been read yet */
+		iter->next  = (iter->slen >= 0) ? 0 : -1; /* first read starts at offset 0, or -1 when invalid */
+		iter->error = (iter->slen >= 0) ? 0 : 1; /* invalid input is flagged as an error from the outset */
+	}
+}
+
+void utf8IteratorUninit (struct utf8Iterator* iter) { /* Invalidate iter; a NULL iter is ignored. */
+	if (iter) {
+		iter->data  = NULL;
+		iter->slen  = -1; /* iter->error is left as it was */
+		iter->start = iter->next = -1; /* with next == slen == -1, next >= slen holds, so the iterator reads as exhausted */
+	}
+}
+
+int utf8ScanBackwardsForCodePoint (unsigned char* msg, int len, int pos, cpUcs4* out) { /* Decode the code point that contains msg[pos]; returns how many bytes pos was moved back (0-3) to reach its lead byte, or a negative value on error. */
+	cpUcs4 v1, v2, v3, v4, x;
+	int ret;
+	if (NULL == msg || len < 0 || (unsigned) pos >= (unsigned) len) { /* the unsigned compare also rejects a negative pos */
+		return -__LINE__;
+	}
+	if (!out) out = &x; /* out is optional; the result is then discarded into a local */
+	ret = 0;
+	if (msg[pos] < 0x80) { /* ASCII byte: a complete code point by itself */
+		*out = msg[pos];
+		return 0;
+	} else if (msg[pos] < 0xC0) { /* continuation byte: look back up to 3 bytes for the lead byte */
+		if (0 == pos) return -__LINE__;
+		ret = -__LINE__;
+		if (msg[pos-1] >= 0xC1 && msg[pos-1] < 0xF8) { /* lead byte directly before pos */
+			pos--;
+			ret = 1;
+		} else {
+			if (1 == pos) return -__LINE__;
+			if ((msg[pos-1] | 0x3F) != 0xBF) return -__LINE__; /* msg[pos-1] must itself be a continuation byte */
+			if (msg[pos-2] >= 0xE0 && msg[pos-2] < 0xF8) { /* 3- or 4-byte lead two bytes back */
+				pos -= 2;
+				ret = 2;
+			} else {
+				if (2 == pos) return -__LINE__;
+				if ((msg[pos-2] | 0x3F) != 0xBF) return -__LINE__; /* msg[pos-2] must be a continuation byte */
+				if ((msg[pos-3]|0x07) == 0xF7) { /* 4-byte lead (0xF0-0xF7) three bytes back */
+					pos -= 3;
+					ret = 3;
+				} else return -__LINE__;
+			}
+		}
+	}
+	if (msg[pos] < 0xE0) { /* lead byte of a 2-byte sequence */
+		if (pos + 1 >= len) return -__LINE__; /* sequence would run past the end of msg */
+		v1 = msg[pos]   & ~0xE0;
+		v2 = msg[pos+1] & ~0xC0; /* NOTE(review): trailing bytes are masked but not checked to be 10xxxxxx here - confirm this is intended */
+		v1 = (v1 << 6) + v2;
+		if (v1 < 0x80) return -__LINE__; /* reject overlong encoding */
+		*out = v1;
+		return ret;
+	}
+	if (msg[pos] < 0xF0) { /* lead byte of a 3-byte sequence */
+		if (pos + 2 >= len) return -__LINE__;
+		v1 = msg[pos]   & ~0xF0;
+		v2 = msg[pos+1] & ~0xC0;
+		v3 = msg[pos+2] & ~0xC0;
+		v1 = (v1 << 12) + (v2 << 6) + v3;
+		if (v1 < 0x800) return -__LINE__; /* reject overlong encoding */
+		if (!isLegalUnicodeCodePoint(v1)) return -__LINE__;
+		*out = v1;
+		return ret;
+	}
+
+	if (msg[pos] >= 0xF8) return -__LINE__; /* lead bytes 0xF8 and above are rejected */
+
+	if (pos + 3 >= len) return -__LINE__; /* 4-byte sequence would run past the end of msg */
+	v1 = msg[pos]   & ~0xF8;
+	v2 = msg[pos+1] & ~0xC0;
+	v3 = msg[pos+2] & ~0xC0;
+	v4 = msg[pos+3] & ~0xC0;
+	v1 = (v1 << 18) + (v2 << 12) + (v3 << 6) + v4;
+	if (v1 < 0x10000) return -__LINE__; /* reject overlong encoding */
+	if (!isLegalUnicodeCodePoint(v1)) return -__LINE__;
+	*out = v1;
+	return ret;
+}
+
+/*
+Code point                UTF-8
+----------                -----
+U-00000000 - U-0000007F:  0xxxxxxx
+U-00000080 - U-000007FF:  110xxxxx 10xxxxxx
+U-00000800 - U-0000FFFF:  1110xxxx 10xxxxxx 10xxxxxx
+U-00010000 - U-001FFFFF:  11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+
+U-00200000 - U-03FFFFFF:  111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+U-04000000 - U-7FFFFFFF:  1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+*/
+
+/*
+ *  Decode the code point at iter->next and advance the iterator past it.
+ *  On return iter->data + iter->start points at the bytes just read and
+ *  iter->data + iter->next at the bytes that will be read next.  If the
+ *  bytes are malformed or truncated, iter->error is set to 1, errCh is
+ *  returned and any trailing continuation bytes are skipped.  errCh is also
+ *  returned, without touching iter->error, for a NULL/invalid/finished iter.
+ *  A lead byte is never decoded using bytes at or beyond iter->slen.
+ */
+cpUcs4 utf8IteratorGetNextCodePoint (struct utf8Iterator* iter, cpUcs4 errCh) {
+	unsigned char * chrs;
+	unsigned char c, d, e;
+	long v;
+	int i, ofs;
+
+	if (NULL == iter || iter->next < 0) return errCh;
+	if (iter->next >= iter->slen) {
+		iter->start = iter->slen;
+		return errCh;
+	}
+	if (NULL == iter->data) return errCh;
+	chrs = iter->data + iter->next;
+
+	iter->error = 0;
+	c = chrs[0];
+	ofs = 0;
+
+	if (c < 0xC0 || c > 0xFD) {
+		if (c >= 0x80) goto ErrMode; /* stray continuation byte, 0xFE or 0xFF */
+		v = c;
+		ofs = 1;
+	} else if (c < 0xE0) {
+		if (iter->slen - iter->next < 2) goto ErrMode; /* truncated sequence */
+		v = ((long) c << 6) - (0x0C0L << 6);
+		c = (unsigned char) ((unsigned) chrs[1] - 0x080);
+		v += c;
+		if (c >= 0x40 || v < 0x80) goto ErrMode; /* bad tail byte or overlong */
+		ofs = 2;
+	} else if (c < 0xF0) {
+		if (iter->slen - iter->next < 3) goto ErrMode; /* truncated sequence */
+		v = ((long) c << 12) - (0x0E0L << 12); /* shift as long: int may be 16 bits */
+		c = (unsigned char) ((unsigned) chrs[1] - 0x080);
+		d = (unsigned char) ((unsigned) chrs[2] - 0x080);
+		v += ((long) c << 6) + d;
+		if ((c|d) >= 0x40 || v < 0x800 || !isLegalUnicodeCodePoint (v)) goto ErrMode;
+		ofs = 3;
+	} else if (c < 0xF8) {
+		if (iter->slen - iter->next < 4) goto ErrMode; /* truncated sequence */
+		v = ((long) c << 18) - (0x0F0L << 18); /* shift as long: int may be 16 bits */
+		c = (unsigned char) ((unsigned) chrs[1] - 0x080);
+		d = (unsigned char) ((unsigned) chrs[2] - 0x080);
+		e = (unsigned char) ((unsigned) chrs[3] - 0x080);
+		v += ((long) c << 12) + ((long) d << 6) + e;
+		if ((c|d|e) >= 0x40 || v < 0x10000 || !isLegalUnicodeCodePoint (v)) goto ErrMode;
+		ofs = 4;
+	} else { /* 5 and 6 byte encodings are invalid */
+	ErrMode:;
+		iter->error = 1;
+		v = errCh;
+		for (i = iter->next+1; i < iter->slen; i++) if ((iter->data[i] & 0xC0) != 0x80) break; /* resync: skip trailing continuation bytes */
+		ofs = i - iter->next;
+	}
+
+	iter->start = iter->next;
+	iter->next += ofs;
+	return v;
+}
+
+/*
+ *  Decode the code point at iter->next without advancing the iterator.
+ *  iter->start and iter->next are left unchanged, except that iter->start
+ *  is set to iter->slen when the iterator is already exhausted.  If the
+ *  bytes are malformed or truncated, iter->error is set to 1 and errCh is
+ *  returned.  errCh is also returned, without touching iter->error, for a
+ *  NULL, invalid or finished iter.  Bytes at or beyond iter->slen are not
+ *  read.
+ */
+cpUcs4 utf8IteratorGetCurrCodePoint (struct utf8Iterator* iter, cpUcs4 errCh) {
+	unsigned char * chrs;
+	unsigned char c, d, e;
+	long v;
+
+	if (NULL == iter || iter->next < 0) return errCh;
+	if (iter->next >= iter->slen) {
+		iter->start = iter->slen;
+		return errCh;
+	}
+	if (NULL == iter->data) return errCh;
+	chrs = iter->data + iter->next;
+
+	iter->error = 0;
+	c = chrs[0];
+
+	if (c < 0xC0 || c > 0xFD) {
+		if (c >= 0x80) goto ErrMode; /* stray continuation byte, 0xFE or 0xFF */
+		v = c;
+	} else if (c < 0xE0) {
+		if (iter->slen - iter->next < 2) goto ErrMode; /* truncated sequence */
+		v = ((long) c << 6) - (0x0C0L << 6);
+		c = (unsigned char) ((unsigned) chrs[1] - 0x080);
+		v += c;
+		if (c >= 0x40 || v < 0x80) goto ErrMode; /* bad tail byte or overlong */
+	} else if (c < 0xF0) {
+		if (iter->slen - iter->next < 3) goto ErrMode; /* truncated sequence */
+		v = ((long) c << 12) - (0x0E0L << 12); /* shift as long: int may be 16 bits */
+		c = (unsigned char) ((unsigned) chrs[1] - 0x080);
+		d = (unsigned char) ((unsigned) chrs[2] - 0x080);
+		v += ((long) c << 6) + d;
+		if ((c|d) >= 0x40 || v < 0x800 || !isLegalUnicodeCodePoint (v)) goto ErrMode;
+	} else if (c < 0xF8) {
+		if (iter->slen - iter->next < 4) goto ErrMode; /* truncated sequence */
+		v = ((long) c << 18) - (0x0F0L << 18); /* shift as long: int may be 16 bits */
+		c = (unsigned char) ((unsigned) chrs[1] - 0x080);
+		d = (unsigned char) ((unsigned) chrs[2] - 0x080);
+		e = (unsigned char) ((unsigned) chrs[3] - 0x080);
+		v += ((long) c << 12) + ((long) d << 6) + e;
+		if ((c|d|e) >= 0x40 || v < 0x10000 || !isLegalUnicodeCodePoint (v)) goto ErrMode;
+	} else { /* 5 and 6 byte encodings are invalid */
+	ErrMode:;
+		iter->error = 1;
+		v = errCh;
+	}
+	return v;
+}
diff --git a/utf8util.h b/utf8util.h
new file mode 100644
--- /dev/null
+++ b/utf8util.h
@@ -0,0 +1,62 @@
+/*
+ * This source file is part of the bstring string library.  This code was
+ * written by Paul Hsieh in 2002-2015, and is covered by the BSD open source
+ * license and the GPL. Refer to the accompanying documentation for details
+ * on usage and license.
+ */
+
+/*
+ * utf8util.h
+ *
+ * This file defines the interface to the utf8 utility functions.
+ */
+
+#ifndef UTF8_UNICODE_UTILITIES
+#define UTF8_UNICODE_UTILITIES
+
+#include <limits.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if INT_MAX >= 0x7fffffffUL
+typedef int 			cpUcs4;
+#elif LONG_MAX >= 0x7fffffffUL
+typedef long			cpUcs4;
+#else
+#error This compiler is not supported
+#endif
+
+#if UINT_MAX == 0xFFFF
+typedef unsigned int	cpUcs2;
+#elif USHRT_MAX == 0xFFFF
+typedef unsigned short	cpUcs2;
+#elif UCHAR_MAX == 0xFFFF
+typedef unsigned char	cpUcs2;
+#else
+#error This compiler is not supported
+#endif
+
+#define isLegalUnicodeCodePoint(v) ((((v) < 0xD800L) || ((v) > 0xDFFFL)) && (((unsigned long)(v)) <= 0x0010FFFFL) && (((v)|0x1F0001) != 0x1FFFFFL))
+
+struct utf8Iterator {
+	unsigned char*	data;	/* bytes being walked; NULL once the iterator is uninitialized */
+	int           	slen;	/* number of bytes in data, or -1 when the iterator is invalid */
+	int           	start, next;	/* byte offsets: where the last read began (-1 before any read) and where the next read begins (-1 when invalid) */
+	int           	error;	/* nonzero when the last decode hit malformed input, or when initialization was given invalid arguments */
+};
+
+#define utf8IteratorNoMore(it) (!(it) || (it)->next >= (it)->slen)
+
+extern void utf8IteratorInit (struct utf8Iterator* iter, unsigned char* data, int slen);
+extern void utf8IteratorUninit (struct utf8Iterator* iter);
+extern cpUcs4 utf8IteratorGetNextCodePoint (struct utf8Iterator* iter, cpUcs4 errCh);
+extern cpUcs4 utf8IteratorGetCurrCodePoint (struct utf8Iterator* iter, cpUcs4 errCh);
+extern int utf8ScanBackwardsForCodePoint (unsigned char* msg, int len, int pos, cpUcs4* out);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* UTF8_UNICODE_UTILITIES */