# HG changeset patch
# User scoder <stefan_ml@behnel.de>
# Date 1593517938 -7200
#      Tue Jun 30 13:52:18 2020 +0200
# Node ID b132101708b3f5fb65e93020a4b8c5f71f563e08
# Parent  35de58a361a08594c1199cbe066b28c0bb92e7aa
Really only use PyUnicode_FromUnicode() when needed (GH-3697)

* Really only use PyUnicode_FromUnicode() for strings that contain lone surrogate, not for normal non-BMP strings and not for surrogate pairs on 16bit Unicode platforms.

See https://github.com/cython/cython/issues/3678

* Extend buildenv test to debug a MacOS problem.
* Add a test for surrogate pairs in Unicode strings.
* Limit PyUnicode_FromUnicode() usage to strings containing lone surrogates.
* Accept ambiguity of surrogate pairs in Unicode string literals when generated on 16bit Py2 systems.

diff --git a/Cython/Compiler/ExprNodes.py b/Cython/Compiler/ExprNodes.py
--- a/Cython/Compiler/ExprNodes.py
+++ b/Cython/Compiler/ExprNodes.py
@@ -1623,8 +1623,13 @@
 
     def generate_evaluation_code(self, code):
         if self.type.is_pyobject:
-            if self.contains_surrogates():
-                # surrogates are not really portable and cannot be
+            # FIXME: this should go away entirely!
+            # Since string_contains_lone_surrogates() returns False for surrogate pairs in Py2/UCS2,
+            # Py2 can generate different code from Py3 here.  Let's hope we get away with claiming that
+            # the processing of surrogate pairs in code was always ambiguous and lead to different results
+            # on P16/32bit Unicode platforms.
+            if StringEncoding.string_contains_lone_surrogates(self.value):
+                # lone (unpaired) surrogates are not really portable and cannot be
                 # decoded by the UTF-8 codec in Py3.3
                 self.result_code = code.get_py_const(py_object_type, 'ustring')
                 data_cname = code.get_pyunicode_ptr_const(self.value)
diff --git a/Cython/Compiler/StringEncoding.py b/Cython/Compiler/StringEncoding.py
--- a/Cython/Compiler/StringEncoding.py
+++ b/Cython/Compiler/StringEncoding.py
@@ -154,6 +154,34 @@
     return False
 
 
+def string_contains_lone_surrogates(ustring):
+    """
+    Check if the unicode string contains lone surrogate code points
+    on a CPython platform with wide (UCS-4) or narrow (UTF-16)
+    Unicode, i.e. characters that would be spelled as two
+    separate code units on a narrow platform, but that do not form a pair.
+    """
+    last_was_start = False
+    unicode_uses_surrogate_encoding = sys.maxunicode == 65535
+    for c in map(ord, ustring):
+        # surrogates tend to be rare
+        if c < 0xD800 or c > 0xDFFF:
+            if last_was_start:
+                return True
+        elif not unicode_uses_surrogate_encoding:
+            # on 32bit Unicode platforms, there is never a pair
+            return True
+        elif c <= 0xDBFF:
+            if last_was_start:
+                return True  # lone start
+            last_was_start = True
+        else:
+            if not last_was_start:
+                return True  # lone end
+            last_was_start = False
+    return last_was_start
+
+
 class BytesLiteral(_bytes):
     # bytes subclass that is compatible with EncodedString
     encoding = None
diff --git a/Cython/Compiler/Tests/TestStringEncoding.py b/Cython/Compiler/Tests/TestStringEncoding.py
new file mode 100644
--- /dev/null
+++ b/Cython/Compiler/Tests/TestStringEncoding.py
@@ -0,0 +1,44 @@
+# -*- coding: utf-8 -*-
+
+import sys
+import unittest
+
+import Cython.Compiler.StringEncoding as StringEncoding
+
+
+class StringEncodingTest(unittest.TestCase):
+    """
+    Test the StringEncoding module.
+    """
+    def test_string_contains_lone_surrogates(self):
+        self.assertFalse(StringEncoding.string_contains_lone_surrogates(u"abc"))
+        self.assertFalse(StringEncoding.string_contains_lone_surrogates(u"\uABCD"))
+        self.assertFalse(StringEncoding.string_contains_lone_surrogates(u"\N{SNOWMAN}"))
+
+        # This behaves differently in Py2 when freshly parsed and read from a .pyc file,
+        # but it seems to be a marshalling bug in Py2, which doesn't hurt us in Cython.
+        if sys.version_info[0] != 2:
+            self.assertTrue(StringEncoding.string_contains_lone_surrogates(u"\uD800\uDFFF"))
+
+        # In Py2 with 16bit Unicode, the following is indistinguishable from the 32bit character.
+        obfuscated_surrogate_pair = (u"\uDFFF" + "\uD800")[::-1]
+        if sys.version_info[0] == 2 and sys.maxunicode == 65565:
+            self.assertFalse(StringEncoding.string_contains_lone_surrogates(obfuscated_surrogate_pair))
+        else:
+            self.assertTrue(StringEncoding.string_contains_lone_surrogates(obfuscated_surrogate_pair))
+
+        self.assertTrue(StringEncoding.string_contains_lone_surrogates(u"\uD800"))
+        self.assertTrue(StringEncoding.string_contains_lone_surrogates(u"\uDFFF"))
+        self.assertTrue(StringEncoding.string_contains_lone_surrogates(u"\uDFFF\uD800"))
+        self.assertTrue(StringEncoding.string_contains_lone_surrogates(u"\uD800x\uDFFF"))
+
+    def test_string_contains_surrogates(self):
+        self.assertFalse(StringEncoding.string_contains_surrogates(u"abc"))
+        self.assertFalse(StringEncoding.string_contains_surrogates(u"\uABCD"))
+        self.assertFalse(StringEncoding.string_contains_surrogates(u"\N{SNOWMAN}"))
+
+        self.assertTrue(StringEncoding.string_contains_surrogates(u"\uD800"))
+        self.assertTrue(StringEncoding.string_contains_surrogates(u"\uDFFF"))
+        self.assertTrue(StringEncoding.string_contains_surrogates(u"\uD800\uDFFF"))
+        self.assertTrue(StringEncoding.string_contains_surrogates(u"\uDFFF\uD800"))
+        self.assertTrue(StringEncoding.string_contains_surrogates(u"\uD800x\uDFFF"))
diff --git a/Cython/Utility/ModuleSetupCode.c b/Cython/Utility/ModuleSetupCode.c
--- a/Cython/Utility/ModuleSetupCode.c
+++ b/Cython/Utility/ModuleSetupCode.c
@@ -532,7 +532,11 @@
   #define __Pyx_PyUnicode_DATA(u)         PyUnicode_DATA(u)
   #define __Pyx_PyUnicode_READ(k, d, i)   PyUnicode_READ(k, d, i)
   #define __Pyx_PyUnicode_WRITE(k, d, i, ch)  PyUnicode_WRITE(k, d, i, ch)
+  #if defined(PyUnicode_IS_READY) && defined(PyUnicode_GET_SIZE)
   #define __Pyx_PyUnicode_IS_TRUE(u)      (0 != (likely(PyUnicode_IS_READY(u)) ? PyUnicode_GET_LENGTH(u) : PyUnicode_GET_SIZE(u)))
+  #else
+  #define __Pyx_PyUnicode_IS_TRUE(u)      (0 != PyUnicode_GET_LENGTH(u))
+  #endif
 #else
   #define CYTHON_PEP393_ENABLED 0
   #define PyUnicode_1BYTE_KIND  1
diff --git a/Cython/Utility/StringTools.c b/Cython/Utility/StringTools.c
--- a/Cython/Utility/StringTools.c
+++ b/Cython/Utility/StringTools.c
@@ -66,6 +66,7 @@
 
 //////////////////// PyUCS4InUnicode ////////////////////
 
+#if Py_UNICODE_SIZE == 2
 static int __Pyx_PyUnicodeBufferContainsUCS4_SP(Py_UNICODE* buffer, Py_ssize_t length, Py_UCS4 character) {
     /* handle surrogate pairs for Py_UNICODE buffers in 16bit Unicode builds */
     Py_UNICODE high_val, low_val;
@@ -77,6 +78,7 @@
     }
     return 0;
 }
+#endif
 
 static int __Pyx_PyUnicodeBufferContainsUCS4_BMP(Py_UNICODE* buffer, Py_ssize_t length, Py_UCS4 character) {
     Py_UNICODE uchar;
@@ -101,12 +103,15 @@
         return 0;
     }
 #endif
-    if (Py_UNICODE_SIZE == 2 && unlikely(character > 65535)) {
+#if Py_UNICODE_SIZE == 2
+    if (unlikely(character > 65535)) {
         return __Pyx_PyUnicodeBufferContainsUCS4_SP(
             PyUnicode_AS_UNICODE(unicode),
             PyUnicode_GET_SIZE(unicode),
             character);
-    } else {
+    } else
+#endif
+    {
         return __Pyx_PyUnicodeBufferContainsUCS4_BMP(
             PyUnicode_AS_UNICODE(unicode),
             PyUnicode_GET_SIZE(unicode),
diff --git a/tests/compile/buildenv.pyx b/tests/compile/buildenv.pyx
--- a/tests/compile/buildenv.pyx
+++ b/tests/compile/buildenv.pyx
@@ -121,4 +121,12 @@
 CFLAGS (env) = {get_env('CFLAGS', '')}
 LINKCC (distutils) = {config_var('LINKCC')}
 LINKCC (env) = {get_env('LINKCC', '')}
+
+Encodings:
+sys maxunicode = {sys.maxunicode}
+LANG (env) = {get_env('LANG', '')}
+PYTHONIOENCODING (env) = {get_env('PYTHONIOENCODING', '')}
+sys stdout encoding = {sys.stdout.encoding}
+sys default encoding = {sys.getdefaultencoding()}
+sys FS encoding = {sys.getfilesystemencoding()}
 """)
diff --git a/tests/run/unicodeliterals.pyx b/tests/run/unicodeliterals.pyx
--- a/tests/run/unicodeliterals.pyx
+++ b/tests/run/unicodeliterals.pyx
@@ -21,6 +21,13 @@
     u'\udc00'
     >>> h
     u'\ud800'
+    >>> q
+    u'\udc00\ud800'
+
+    # The output of surrogate pairs differs between 16/32bit Unicode runtimes.
+    #>>> p
+    #u'\ud800\udc00'
+
     >>> add
     u'S\xf8k ik\xfc\xd6\xe4abc'
     >>> null
@@ -44,6 +51,10 @@
     1
     >>> len(h)
     1
+    >>> len(q)
+    2
+    >>> len(q)
+    2
     >>> len(add)
     12
     >>> len(null)
@@ -75,6 +86,10 @@
     True
     >>> h == u'\\ud800' # unescaped by Python (required by doctest)
     True
+    >>> p == u'\\ud800\\udc00' # unescaped by Python (required by doctest)
+    True
+    >>> q == u'\\udc00\\ud800' # unescaped by Python (required by doctest)
+    True
     >>> k == u'\\N{SNOWMAN}' == u'\\u2603'
     True
     >>> m == u'abc\\\\xf8\\\\t\\u00f8\\U000000f8'  # unescaped by Python (required by doctest)
@@ -115,6 +130,8 @@
 h = u'\ud800'   # lone lead surrogate
 k = u'\N{SNOWMAN}'
 m = ur'abc\xf8\t\u00f8\U000000f8'
+p = u'\ud800\udc00'  # surrogate pair
+q = u'\udc00\ud800'  # reversed surrogate pair
 
 add = u'Søk ik' + u'üÖä' + u'abc'
 null = u'\x00'