# HG changeset patch
# User Stefan Behnel <stefan_ml@behnel.de>
# Date 1594193220 -7200
#      Wed Jul 08 09:27:00 2020 +0200
# Node ID 133ee7b17ff14c0196e7925ff74f8eccc3f20832
# Parent  044c0d743245322caa41974e3a8e2eb3b62689a6
Using Py_UNICODE to store lone surrogates makes Py3 join surrogate pairs on 16-bit Unicode platforms (Windows) when reading them back in, although we correctly processed them before.
Instead, we now use the "unicode_escape" codec to store byte strings, because it can return surrogate characters (which the other codecs cannot).

diff --git a/Cython/Compiler/ExprNodes.py b/Cython/Compiler/ExprNodes.py
--- a/Cython/Compiler/ExprNodes.py
+++ b/Cython/Compiler/ExprNodes.py
@@ -1632,13 +1632,14 @@
                 # lone (unpaired) surrogates are not really portable and cannot be
                 # decoded by the UTF-8 codec in Py3.3
                 self.result_code = code.get_py_const(py_object_type, 'ustring')
-                data_cname = code.get_pyunicode_ptr_const(self.value)
+                data_cname = code.get_string_const(
+                    StringEncoding.BytesLiteral(self.value.encode('unicode_escape')))
                 const_code = code.get_cached_constants_writer(self.result_code)
                 if const_code is None:
                     return  # already initialised
                 const_code.mark_pos(self.pos)
                 const_code.putln(
-                    "%s = PyUnicode_FromUnicode(%s, (sizeof(%s) / sizeof(Py_UNICODE))-1); %s" % (
+                    "%s = PyUnicode_DecodeUnicodeEscape(%s, sizeof(%s) - 1, NULL); %s" % (
                         self.result_code,
                         data_cname,
                         data_cname,