# HG changeset patch
# User Stefan Behnel <stefan_ml@behnel.de>
# Date 1594193220 -7200
#      Wed Jul 08 09:27:00 2020 +0200
# Node ID 133ee7b17ff14c0196e7925ff74f8eccc3f20832
# Parent  044c0d743245322caa41974e3a8e2eb3b62689a6
Using Py_UNICODE to store lone surrogates makes Py3 join surrogate pairs on 16-bit Unicode platforms (Windows) when reading them back in, although we correctly processed them before.
Instead, we now use the "unicode_escape" codec to store byte strings, because it can return surrogate characters (which the other codecs cannot).

diff --git a/Cython/Compiler/ExprNodes.py b/Cython/Compiler/ExprNodes.py
--- a/Cython/Compiler/ExprNodes.py
+++ b/Cython/Compiler/ExprNodes.py
@@ -1632,13 +1632,14 @@
                 # lone (unpaired) surrogates are not really portable and cannot be
                 # decoded by the UTF-8 codec in Py3.3
                 self.result_code = code.get_py_const(py_object_type, 'ustring')
-                data_cname = code.get_pyunicode_ptr_const(self.value)
+                data_cname = code.get_string_const(
+                    StringEncoding.BytesLiteral(self.value.encode('unicode_escape')))
                 const_code = code.get_cached_constants_writer(self.result_code)
                 if const_code is None:
                     return  # already initialised
                 const_code.mark_pos(self.pos)
                 const_code.putln(
-                    "%s = PyUnicode_FromUnicode(%s, (sizeof(%s) / sizeof(Py_UNICODE))-1); %s" % (
+                    "%s = PyUnicode_DecodeUnicodeEscape(%s, sizeof(%s) - 1, NULL); %s" % (
                         self.result_code,
                         data_cname,
                         data_cname,