Really only use PyUnicode_FromUnicode() when needed (GH-3697)

* Really only use PyUnicode_FromUnicode() for strings that contain lone surrogates, not for normal non-BMP strings and not for surrogate pairs on 16bit Unicode platforms.
  See https://github.com/cython/cython/issues/3678
* Extend buildenv test to debug a macOS problem.
* Add a test for surrogate pairs in Unicode strings.
* Limit PyUnicode_FromUnicode() usage to strings containing lone surrogates.
* Accept ambiguity of surrogate pairs in Unicode string literals when generated on 16bit Py2 systems.

Really only use PyUnicode_FromUnicode() when needed (GH-3697)
4a8197169791 · scoder · a274549d307e · 4a819716 · 4a819716 · 4a819716
Commit 4a8197169791 authored 5 years ago by scoder
--- a/Cython/Compiler/ExprNodes.py
+++ b/Cython/Compiler/ExprNodes.py
@@ -1677,8 +1677,13 @@
    def generate_evaluation_code(self, code):
        if self.type.is_pyobject:
-            if self.contains_surrogates():
+            # FIXME: this should go away entirely!
-                # surrogates are not really portable and cannot be
+            # Since string_contains_lone_surrogates() returns False for surrogate pairs in Py2/UCS2,
+            # Py2 can generate different code from Py3 here.  Let's hope we get away with claiming that
+            # the processing of surrogate pairs in code was always ambiguous and led to different results
+            # on 16/32bit Unicode platforms.
+            if StringEncoding.string_contains_lone_surrogates(self.value):
+                # lone (unpaired) surrogates are not really portable and cannot be
                # decoded by the UTF-8 codec in Py3.3
                self.result_code = code.get_py_const(py_object_type, 'ustring')
                data_cname = code.get_pyunicode_ptr_const(self.value)

--- a/Cython/Compiler/StringEncoding.py
+++ b/Cython/Compiler/StringEncoding.py
@@ -172,6 +172,34 @@
    return False
+def string_contains_lone_surrogates(ustring):
+    """
+    Check if the unicode string contains lone surrogate code points
+    on a CPython platform with wide (UCS-4) or narrow (UTF-16)
+    Unicode, i.e. code points in the range U+D800..U+DFFF that do
+    not form a (high, low) surrogate pair.
+    On wide builds (sys.maxunicode != 65535), every surrogate code
+    point counts as lone.  On narrow builds, a high surrogate
+    (U+D800..U+DBFF) that is immediately followed by a low surrogate
+    (U+DC00..U+DFFF) is accepted as a pair.
+    Returns True if a lone surrogate was found, False otherwise.
+    """
+    last_was_start = False  # True while a high surrogate still waits for its low partner
+    unicode_uses_surrogate_encoding = sys.maxunicode == 65535  # narrow (16bit) Unicode build
+    for c in map(ord, ustring):
+        # surrogates tend to be rare
+        if c < 0xD800 or c > 0xDFFF:
+            if last_was_start:
+                return True  # lone start followed by a non-surrogate
+        elif not unicode_uses_surrogate_encoding:
+            # on 32bit Unicode platforms, there is never a pair
+            return True
+        elif c <= 0xDBFF:  # high (start) surrogate
+            if last_was_start:
+                return True  # lone start
+            last_was_start = True
+        else:  # low (end) surrogate
+            if not last_was_start:
+                return True  # lone end
+            last_was_start = False
+    return last_was_start  # a trailing high surrogate is lone as well
 class BytesLiteral(_bytes):
    # bytes subclass that is compatible with EncodedString
    encoding = None

--- a/Cython/Compiler/Tests/TestStringEncoding.py
+++ b/Cython/Compiler/Tests/TestStringEncoding.py
+# -*- coding: utf-8 -*-
+import sys
+import unittest
+import Cython.Compiler.StringEncoding as StringEncoding
+class StringEncodingTest(unittest.TestCase):
+    """
+    Test the StringEncoding module.
+    """
+    def test_string_contains_lone_surrogates(self):
+        self.assertFalse(StringEncoding.string_contains_lone_surrogates(u"abc"))
+        self.assertFalse(StringEncoding.string_contains_lone_surrogates(u"\uABCD"))
+        self.assertFalse(StringEncoding.string_contains_lone_surrogates(u"\N{SNOWMAN}"))
+        # This behaves differently in Py2 when freshly parsed and read from a .pyc file,
+        # but it seems to be a marshalling bug in Py2, which doesn't hurt us in Cython.
+        if sys.version_info[0] != 2:
+            self.assertTrue(StringEncoding.string_contains_lone_surrogates(u"\uD800\uDFFF"))
+        # In Py2 with 16bit Unicode, the following is indistinguishable from the 32bit character.
+        obfuscated_surrogate_pair = (u"\uDFFF" + "\uD800")[::-1]
+        if sys.version_info[0] == 2 and sys.maxunicode == 65565:
+            self.assertFalse(StringEncoding.string_contains_lone_surrogates(obfuscated_surrogate_pair))
+        else:
+            self.assertTrue(StringEncoding.string_contains_lone_surrogates(obfuscated_surrogate_pair))
+        self.assertTrue(StringEncoding.string_contains_lone_surrogates(u"\uD800"))
+        self.assertTrue(StringEncoding.string_contains_lone_surrogates(u"\uDFFF"))
+        self.assertTrue(StringEncoding.string_contains_lone_surrogates(u"\uDFFF\uD800"))
+        self.assertTrue(StringEncoding.string_contains_lone_surrogates(u"\uD800x\uDFFF"))
+    def test_string_contains_surrogates(self):
+        self.assertFalse(StringEncoding.string_contains_surrogates(u"abc"))
+        self.assertFalse(StringEncoding.string_contains_surrogates(u"\uABCD"))
+        self.assertFalse(StringEncoding.string_contains_surrogates(u"\N{SNOWMAN}"))
+        self.assertTrue(StringEncoding.string_contains_surrogates(u"\uD800"))
+        self.assertTrue(StringEncoding.string_contains_surrogates(u"\uDFFF"))
+        self.assertTrue(StringEncoding.string_contains_surrogates(u"\uD800\uDFFF"))
+        self.assertTrue(StringEncoding.string_contains_surrogates(u"\uDFFF\uD800"))
+        self.assertTrue(StringEncoding.string_contains_surrogates(u"\uD800x\uDFFF"))
--- a/Cython/Utility/ModuleSetupCode.c
+++ b/Cython/Utility/ModuleSetupCode.c
@@ -741,5 +741,6 @@
  #define __Pyx_PyUnicode_DATA(u)         PyUnicode_DATA(u)
  #define __Pyx_PyUnicode_READ(k, d, i)   PyUnicode_READ(k, d, i)
  #define __Pyx_PyUnicode_WRITE(k, d, i, ch)  PyUnicode_WRITE(k, d, i, ch)
+  #if defined(PyUnicode_IS_READY) && defined(PyUnicode_GET_SIZE)
  #define __Pyx_PyUnicode_IS_TRUE(u)      (0 != (likely(PyUnicode_IS_READY(u)) ? PyUnicode_GET_LENGTH(u) : PyUnicode_GET_SIZE(u)))
  #else
@@ -744,5 +745,8 @@
  #define __Pyx_PyUnicode_IS_TRUE(u)      (0 != (likely(PyUnicode_IS_READY(u)) ? PyUnicode_GET_LENGTH(u) : PyUnicode_GET_SIZE(u)))
  #else
+  #define __Pyx_PyUnicode_IS_TRUE(u)      (0 != PyUnicode_GET_LENGTH(u))
+  #endif
+#else
  #define CYTHON_PEP393_ENABLED 0
  #define PyUnicode_1BYTE_KIND  1
  #define PyUnicode_2BYTE_KIND  2

--- a/Cython/Utility/StringTools.c
+++ b/Cython/Utility/StringTools.c
@@ -118,6 +118,7 @@
 //////////////////// PyUCS4InUnicode ////////////////////
+#if Py_UNICODE_SIZE == 2
 static int __Pyx_PyUnicodeBufferContainsUCS4_SP(Py_UNICODE* buffer, Py_ssize_t length, Py_UCS4 character) {
    /* handle surrogate pairs for Py_UNICODE buffers in 16bit Unicode builds */
    Py_UNICODE high_val, low_val;
@@ -129,6 +130,7 @@
    }
    return 0;
 }
+#endif
 static int __Pyx_PyUnicodeBufferContainsUCS4_BMP(Py_UNICODE* buffer, Py_ssize_t length, Py_UCS4 character) {
    Py_UNICODE uchar;
@@ -153,8 +155,9 @@
        return 0;
    }
 #endif
-    if (Py_UNICODE_SIZE == 2 && unlikely(character > 65535)) {
+#if Py_UNICODE_SIZE == 2
+    if (unlikely(character > 65535)) {
        return __Pyx_PyUnicodeBufferContainsUCS4_SP(
            PyUnicode_AS_UNICODE(unicode),
            PyUnicode_GET_SIZE(unicode),
            character);
@@ -157,8 +160,10 @@
        return __Pyx_PyUnicodeBufferContainsUCS4_SP(
            PyUnicode_AS_UNICODE(unicode),
            PyUnicode_GET_SIZE(unicode),
            character);
-    } else {
+    } else
+#endif
+    {
        return __Pyx_PyUnicodeBufferContainsUCS4_BMP(
            PyUnicode_AS_UNICODE(unicode),
            PyUnicode_GET_SIZE(unicode),

--- a/tests/compile/buildenv.pyx
+++ b/tests/compile/buildenv.pyx
@@ -34,6 +34,7 @@
    # Cython config
    cdef int CYTHON_COMPILING_IN_CPYTHON
+    cdef int CYTHON_COMPILING_IN_LIMITED_API
    cdef int CYTHON_COMPILING_IN_PYPY
    cdef int CYTHON_COMPILING_IN_PYSTON
    cdef int CYTHON_USE_PYLONG_INTERNALS
@@ -42,6 +43,7 @@
    cdef int CYTHON_USE_UNICODE_WRITER
    cdef int CYTHON_AVOID_BORROWED_REFS
    cdef int CYTHON_ASSUME_SAFE_MACROS
+    cdef int CYTHON_USE_TYPE_SLOTS
    cdef int CYTHON_UNPACK_METHODS
    cdef int CYTHON_FAST_THREAD_STATE
    cdef int CYTHON_FAST_PYCALL
@@ -76,6 +78,7 @@
 PY_VERSION_HEX  0x{PY_VERSION_HEX:X}
 CYTHON_COMPILING_IN_CPYTHON  {CYTHON_COMPILING_IN_CPYTHON}
+CYTHON_COMPILING_IN_LIMITED_API  {CYTHON_COMPILING_IN_LIMITED_API}
 CYTHON_COMPILING_IN_PYPY  {CYTHON_COMPILING_IN_PYPY}
 CYTHON_COMPILING_IN_PYSTON  {CYTHON_COMPILING_IN_PYSTON}
@@ -85,6 +88,7 @@
 CYTHON_USE_UNICODE_WRITER  {CYTHON_USE_UNICODE_WRITER}
 CYTHON_AVOID_BORROWED_REFS  {CYTHON_AVOID_BORROWED_REFS}
 CYTHON_ASSUME_SAFE_MACROS  {CYTHON_ASSUME_SAFE_MACROS}
+CYTHON_USE_TYPE_SLOTS  {CYTHON_USE_TYPE_SLOTS}
 CYTHON_UNPACK_METHODS  {CYTHON_UNPACK_METHODS}
 CYTHON_FAST_THREAD_STATE  {CYTHON_FAST_THREAD_STATE}
 CYTHON_FAST_PYCALL  {CYTHON_FAST_PYCALL}
@@ -132,6 +136,7 @@
 LINKCC (env) = {get_env('LINKCC', '')}
 Encodings:
+sys maxunicode = {sys.maxunicode}
 LANG (env) = {get_env('LANG', '')}
 PYTHONIOENCODING (env) = {get_env('PYTHONIOENCODING', '')}
 sys stdout encoding = {sys.stdout.encoding}

--- a/tests/run/unicodeliterals.pyx
+++ b/tests/run/unicodeliterals.pyx
@@ -21,6 +21,13 @@
    u'\udc00'
    >>> h
    u'\ud800'
+    >>> q
+    u'\udc00\ud800'
+    # The output of surrogate pairs differs between 16/32bit Unicode runtimes.
+    #>>> p
+    #u'\ud800\udc00'
    >>> add
    u'S\xf8k ik\xfc\xd6\xe4abc'
    >>> null
@@ -44,6 +51,10 @@
    1
    >>> len(h)
    1
+    >>> len(q)
+    2
+    >>> len(q)
+    2
    >>> len(add)
    12
    >>> len(null)
@@ -75,6 +86,10 @@
    True
    >>> h == u'\\ud800' # unescaped by Python (required by doctest)
    True
+    >>> p == u'\\ud800\\udc00' # unescaped by Python (required by doctest)
+    True
+    >>> q == u'\\udc00\\ud800' # unescaped by Python (required by doctest)
+    True
    >>> k == u'\\N{SNOWMAN}' == u'\\u2603'
    True
    >>> m == u'abc\\\\xf8\\\\t\\u00f8\\U000000f8'  # unescaped by Python (required by doctest)
@@ -111,6 +126,8 @@
 h = u'\ud800'   # lone lead surrogate
 k = u'\N{SNOWMAN}'
 m = ur'abc\xf8\t\u00f8\U000000f8'
+p = u'\ud800\udc00'  # surrogate pair
+q = u'\udc00\ud800'  # reversed surrogate pair
 add = u'Søk ik' + u'üÖä' + u'abc'
 null = u'\x00'