Skip to content
Snippets Groups Projects
Commit 4a8197169791 authored by scoder's avatar scoder
Browse files

Really only use PyUnicode_FromUnicode() when needed (GH-3697)

* Really only use PyUnicode_FromUnicode() for strings that contain lone surrogates, not for normal non-BMP strings and not for surrogate pairs on 16bit Unicode platforms.

See https://github.com/cython/cython/issues/3678

* Extend buildenv test to debug a MacOS problem.
* Add a test for surrogate pairs in Unicode strings.
* Limit PyUnicode_FromUnicode() usage to strings containing lone surrogates.
* Accept ambiguity of surrogate pairs in Unicode string literals when generated on 16bit Py2 systems.
parent a274549d307e
No related branches found
No related tags found
No related merge requests found
...@@ -1677,8 +1677,13 @@ ...@@ -1677,8 +1677,13 @@
   
def generate_evaluation_code(self, code): def generate_evaluation_code(self, code):
if self.type.is_pyobject: if self.type.is_pyobject:
if self.contains_surrogates(): # FIXME: this should go away entirely!
# surrogates are not really portable and cannot be # Since string_contains_lone_surrogates() returns False for surrogate pairs in Py2/UCS2,
# Py2 can generate different code from Py3 here. Let's hope we get away with claiming that
# the processing of surrogate pairs in code was always ambiguous and led to different results
# on 16/32bit Unicode platforms.
if StringEncoding.string_contains_lone_surrogates(self.value):
# lone (unpaired) surrogates are not really portable and cannot be
# decoded by the UTF-8 codec in Py3.3 # decoded by the UTF-8 codec in Py3.3
self.result_code = code.get_py_const(py_object_type, 'ustring') self.result_code = code.get_py_const(py_object_type, 'ustring')
data_cname = code.get_pyunicode_ptr_const(self.value) data_cname = code.get_pyunicode_ptr_const(self.value)
......
...@@ -172,6 +172,34 @@ ...@@ -172,6 +172,34 @@
return False return False
def string_contains_lone_surrogates(ustring):
    """
    Report whether ``ustring`` contains a surrogate code point that is
    not part of a well-formed lead/trail surrogate pair.

    The answer depends on the running interpreter: when
    ``sys.maxunicode == 65535`` (16bit Unicode), a lead surrogate that is
    immediately followed by a trail surrogate counts as a pair and is
    accepted.  On any other build, every surrogate code point is
    considered lone.

    Returns True if a lone surrogate was found, False otherwise.
    """
    narrow_build = sys.maxunicode == 65535
    pending_lead = False
    for char in ustring:
        code = ord(char)
        if not (0xD800 <= code <= 0xDFFF):
            # Ordinary character (the common case): it is only a problem
            # if a lead surrogate is still waiting for its partner.
            if pending_lead:
                return True
            continue
        if not narrow_build:
            # Without surrogate encoding there is never a pair.
            return True
        is_lead = code <= 0xDBFF
        if is_lead == pending_lead:
            # Either a second lead in a row, or a trail without a lead.
            return True
        pending_lead = is_lead
    # A lead surrogate at the very end of the string has no partner.
    return pending_lead
class BytesLiteral(_bytes): class BytesLiteral(_bytes):
# bytes subclass that is compatible with EncodedString # bytes subclass that is compatible with EncodedString
encoding = None encoding = None
......
# -*- coding: utf-8 -*-
import sys
import unittest
import Cython.Compiler.StringEncoding as StringEncoding
class StringEncodingTest(unittest.TestCase):
    """
    Test the StringEncoding module.
    """
    def test_string_contains_lone_surrogates(self):
        """Check detection of unpaired surrogates, including platform-dependent pairs."""
        # Strings without any surrogate code points are never flagged.
        self.assertFalse(StringEncoding.string_contains_lone_surrogates(u"abc"))
        self.assertFalse(StringEncoding.string_contains_lone_surrogates(u"\uABCD"))
        self.assertFalse(StringEncoding.string_contains_lone_surrogates(u"\N{SNOWMAN}"))

        # This behaves differently in Py2 when freshly parsed and read from a .pyc file,
        # but it seems to be a marshalling bug in Py2, which doesn't hurt us in Cython.
        if sys.version_info[0] != 2:
            self.assertTrue(StringEncoding.string_contains_lone_surrogates(u"\uD800\uDFFF"))

        # In Py2 with 16bit Unicode, the following is indistinguishable from the 32bit character.
        # Both halves must be unicode literals: a plain "\uD800" is a six-character
        # byte string in Py2 and would not produce a surrogate at all.
        obfuscated_surrogate_pair = (u"\uDFFF" + u"\uD800")[::-1]
        # 65535 (0xFFFF) is the value of sys.maxunicode on 16bit Unicode builds.
        if sys.version_info[0] == 2 and sys.maxunicode == 65535:
            self.assertFalse(StringEncoding.string_contains_lone_surrogates(obfuscated_surrogate_pair))
        else:
            self.assertTrue(StringEncoding.string_contains_lone_surrogates(obfuscated_surrogate_pair))

        # Unpaired, reversed and separated surrogates are lone on every platform.
        self.assertTrue(StringEncoding.string_contains_lone_surrogates(u"\uD800"))
        self.assertTrue(StringEncoding.string_contains_lone_surrogates(u"\uDFFF"))
        self.assertTrue(StringEncoding.string_contains_lone_surrogates(u"\uDFFF\uD800"))
        self.assertTrue(StringEncoding.string_contains_lone_surrogates(u"\uD800x\uDFFF"))

    def test_string_contains_surrogates(self):
        """Check detection of any surrogate code point, paired or not."""
        self.assertFalse(StringEncoding.string_contains_surrogates(u"abc"))
        self.assertFalse(StringEncoding.string_contains_surrogates(u"\uABCD"))
        self.assertFalse(StringEncoding.string_contains_surrogates(u"\N{SNOWMAN}"))

        self.assertTrue(StringEncoding.string_contains_surrogates(u"\uD800"))
        self.assertTrue(StringEncoding.string_contains_surrogates(u"\uDFFF"))
        self.assertTrue(StringEncoding.string_contains_surrogates(u"\uD800\uDFFF"))
        self.assertTrue(StringEncoding.string_contains_surrogates(u"\uDFFF\uD800"))
        self.assertTrue(StringEncoding.string_contains_surrogates(u"\uD800x\uDFFF"))
...@@ -741,5 +741,6 @@ ...@@ -741,5 +741,6 @@
#define __Pyx_PyUnicode_DATA(u) PyUnicode_DATA(u) #define __Pyx_PyUnicode_DATA(u) PyUnicode_DATA(u)
#define __Pyx_PyUnicode_READ(k, d, i) PyUnicode_READ(k, d, i) #define __Pyx_PyUnicode_READ(k, d, i) PyUnicode_READ(k, d, i)
#define __Pyx_PyUnicode_WRITE(k, d, i, ch) PyUnicode_WRITE(k, d, i, ch) #define __Pyx_PyUnicode_WRITE(k, d, i, ch) PyUnicode_WRITE(k, d, i, ch)
#if defined(PyUnicode_IS_READY) && defined(PyUnicode_GET_SIZE)
#define __Pyx_PyUnicode_IS_TRUE(u) (0 != (likely(PyUnicode_IS_READY(u)) ? PyUnicode_GET_LENGTH(u) : PyUnicode_GET_SIZE(u))) #define __Pyx_PyUnicode_IS_TRUE(u) (0 != (likely(PyUnicode_IS_READY(u)) ? PyUnicode_GET_LENGTH(u) : PyUnicode_GET_SIZE(u)))
#else #else
...@@ -744,5 +745,8 @@ ...@@ -744,5 +745,8 @@
#define __Pyx_PyUnicode_IS_TRUE(u) (0 != (likely(PyUnicode_IS_READY(u)) ? PyUnicode_GET_LENGTH(u) : PyUnicode_GET_SIZE(u))) #define __Pyx_PyUnicode_IS_TRUE(u) (0 != (likely(PyUnicode_IS_READY(u)) ? PyUnicode_GET_LENGTH(u) : PyUnicode_GET_SIZE(u)))
#else #else
#define __Pyx_PyUnicode_IS_TRUE(u) (0 != PyUnicode_GET_LENGTH(u))
#endif
#else
#define CYTHON_PEP393_ENABLED 0 #define CYTHON_PEP393_ENABLED 0
#define PyUnicode_1BYTE_KIND 1 #define PyUnicode_1BYTE_KIND 1
#define PyUnicode_2BYTE_KIND 2 #define PyUnicode_2BYTE_KIND 2
......
...@@ -118,6 +118,7 @@ ...@@ -118,6 +118,7 @@
//////////////////// PyUCS4InUnicode //////////////////// //////////////////// PyUCS4InUnicode ////////////////////
#if Py_UNICODE_SIZE == 2
static int __Pyx_PyUnicodeBufferContainsUCS4_SP(Py_UNICODE* buffer, Py_ssize_t length, Py_UCS4 character) { static int __Pyx_PyUnicodeBufferContainsUCS4_SP(Py_UNICODE* buffer, Py_ssize_t length, Py_UCS4 character) {
/* handle surrogate pairs for Py_UNICODE buffers in 16bit Unicode builds */ /* handle surrogate pairs for Py_UNICODE buffers in 16bit Unicode builds */
Py_UNICODE high_val, low_val; Py_UNICODE high_val, low_val;
...@@ -129,6 +130,7 @@ ...@@ -129,6 +130,7 @@
} }
return 0; return 0;
} }
#endif
static int __Pyx_PyUnicodeBufferContainsUCS4_BMP(Py_UNICODE* buffer, Py_ssize_t length, Py_UCS4 character) { static int __Pyx_PyUnicodeBufferContainsUCS4_BMP(Py_UNICODE* buffer, Py_ssize_t length, Py_UCS4 character) {
Py_UNICODE uchar; Py_UNICODE uchar;
...@@ -153,8 +155,9 @@ ...@@ -153,8 +155,9 @@
return 0; return 0;
} }
#endif #endif
if (Py_UNICODE_SIZE == 2 && unlikely(character > 65535)) { #if Py_UNICODE_SIZE == 2
if (unlikely(character > 65535)) {
return __Pyx_PyUnicodeBufferContainsUCS4_SP( return __Pyx_PyUnicodeBufferContainsUCS4_SP(
PyUnicode_AS_UNICODE(unicode), PyUnicode_AS_UNICODE(unicode),
PyUnicode_GET_SIZE(unicode), PyUnicode_GET_SIZE(unicode),
character); character);
...@@ -157,8 +160,10 @@ ...@@ -157,8 +160,10 @@
return __Pyx_PyUnicodeBufferContainsUCS4_SP( return __Pyx_PyUnicodeBufferContainsUCS4_SP(
PyUnicode_AS_UNICODE(unicode), PyUnicode_AS_UNICODE(unicode),
PyUnicode_GET_SIZE(unicode), PyUnicode_GET_SIZE(unicode),
character); character);
} else { } else
#endif
{
return __Pyx_PyUnicodeBufferContainsUCS4_BMP( return __Pyx_PyUnicodeBufferContainsUCS4_BMP(
PyUnicode_AS_UNICODE(unicode), PyUnicode_AS_UNICODE(unicode),
PyUnicode_GET_SIZE(unicode), PyUnicode_GET_SIZE(unicode),
......
...@@ -34,6 +34,7 @@ ...@@ -34,6 +34,7 @@
# Cython config # Cython config
cdef int CYTHON_COMPILING_IN_CPYTHON cdef int CYTHON_COMPILING_IN_CPYTHON
cdef int CYTHON_COMPILING_IN_LIMITED_API
cdef int CYTHON_COMPILING_IN_PYPY cdef int CYTHON_COMPILING_IN_PYPY
cdef int CYTHON_COMPILING_IN_PYSTON cdef int CYTHON_COMPILING_IN_PYSTON
cdef int CYTHON_USE_PYLONG_INTERNALS cdef int CYTHON_USE_PYLONG_INTERNALS
...@@ -42,6 +43,7 @@ ...@@ -42,6 +43,7 @@
cdef int CYTHON_USE_UNICODE_WRITER cdef int CYTHON_USE_UNICODE_WRITER
cdef int CYTHON_AVOID_BORROWED_REFS cdef int CYTHON_AVOID_BORROWED_REFS
cdef int CYTHON_ASSUME_SAFE_MACROS cdef int CYTHON_ASSUME_SAFE_MACROS
cdef int CYTHON_USE_TYPE_SLOTS
cdef int CYTHON_UNPACK_METHODS cdef int CYTHON_UNPACK_METHODS
cdef int CYTHON_FAST_THREAD_STATE cdef int CYTHON_FAST_THREAD_STATE
cdef int CYTHON_FAST_PYCALL cdef int CYTHON_FAST_PYCALL
...@@ -76,6 +78,7 @@ ...@@ -76,6 +78,7 @@
PY_VERSION_HEX 0x{PY_VERSION_HEX:X} PY_VERSION_HEX 0x{PY_VERSION_HEX:X}
CYTHON_COMPILING_IN_CPYTHON {CYTHON_COMPILING_IN_CPYTHON} CYTHON_COMPILING_IN_CPYTHON {CYTHON_COMPILING_IN_CPYTHON}
CYTHON_COMPILING_IN_LIMITED_API {CYTHON_COMPILING_IN_LIMITED_API}
CYTHON_COMPILING_IN_PYPY {CYTHON_COMPILING_IN_PYPY} CYTHON_COMPILING_IN_PYPY {CYTHON_COMPILING_IN_PYPY}
CYTHON_COMPILING_IN_PYSTON {CYTHON_COMPILING_IN_PYSTON} CYTHON_COMPILING_IN_PYSTON {CYTHON_COMPILING_IN_PYSTON}
...@@ -85,6 +88,7 @@ ...@@ -85,6 +88,7 @@
CYTHON_USE_UNICODE_WRITER {CYTHON_USE_UNICODE_WRITER} CYTHON_USE_UNICODE_WRITER {CYTHON_USE_UNICODE_WRITER}
CYTHON_AVOID_BORROWED_REFS {CYTHON_AVOID_BORROWED_REFS} CYTHON_AVOID_BORROWED_REFS {CYTHON_AVOID_BORROWED_REFS}
CYTHON_ASSUME_SAFE_MACROS {CYTHON_ASSUME_SAFE_MACROS} CYTHON_ASSUME_SAFE_MACROS {CYTHON_ASSUME_SAFE_MACROS}
CYTHON_USE_TYPE_SLOTS {CYTHON_USE_TYPE_SLOTS}
CYTHON_UNPACK_METHODS {CYTHON_UNPACK_METHODS} CYTHON_UNPACK_METHODS {CYTHON_UNPACK_METHODS}
CYTHON_FAST_THREAD_STATE {CYTHON_FAST_THREAD_STATE} CYTHON_FAST_THREAD_STATE {CYTHON_FAST_THREAD_STATE}
CYTHON_FAST_PYCALL {CYTHON_FAST_PYCALL} CYTHON_FAST_PYCALL {CYTHON_FAST_PYCALL}
...@@ -132,6 +136,7 @@ ...@@ -132,6 +136,7 @@
LINKCC (env) = {get_env('LINKCC', '')} LINKCC (env) = {get_env('LINKCC', '')}
Encodings: Encodings:
sys maxunicode = {sys.maxunicode}
LANG (env) = {get_env('LANG', '')} LANG (env) = {get_env('LANG', '')}
PYTHONIOENCODING (env) = {get_env('PYTHONIOENCODING', '')} PYTHONIOENCODING (env) = {get_env('PYTHONIOENCODING', '')}
sys stdout encoding = {sys.stdout.encoding} sys stdout encoding = {sys.stdout.encoding}
......
...@@ -21,6 +21,13 @@ ...@@ -21,6 +21,13 @@
u'\udc00' u'\udc00'
>>> h >>> h
u'\ud800' u'\ud800'
>>> q
u'\udc00\ud800'
# The output of surrogate pairs differs between 16/32bit Unicode runtimes.
#>>> p
#u'\ud800\udc00'
>>> add >>> add
u'S\xf8k ik\xfc\xd6\xe4abc' u'S\xf8k ik\xfc\xd6\xe4abc'
>>> null >>> null
...@@ -44,6 +51,10 @@ ...@@ -44,6 +51,10 @@
1 1
>>> len(h) >>> len(h)
1 1
>>> len(q)
2
>>> len(q)
2
>>> len(add) >>> len(add)
12 12
>>> len(null) >>> len(null)
...@@ -75,6 +86,10 @@ ...@@ -75,6 +86,10 @@
True True
>>> h == u'\\ud800' # unescaped by Python (required by doctest) >>> h == u'\\ud800' # unescaped by Python (required by doctest)
True True
>>> p == u'\\ud800\\udc00' # unescaped by Python (required by doctest)
True
>>> q == u'\\udc00\\ud800' # unescaped by Python (required by doctest)
True
>>> k == u'\\N{SNOWMAN}' == u'\\u2603' >>> k == u'\\N{SNOWMAN}' == u'\\u2603'
True True
>>> m == u'abc\\\\xf8\\\\t\\u00f8\\U000000f8' # unescaped by Python (required by doctest) >>> m == u'abc\\\\xf8\\\\t\\u00f8\\U000000f8' # unescaped by Python (required by doctest)
...@@ -111,6 +126,8 @@ ...@@ -111,6 +126,8 @@
h = u'\ud800' # lone lead surrogate h = u'\ud800' # lone lead surrogate
k = u'\N{SNOWMAN}' k = u'\N{SNOWMAN}'
m = ur'abc\xf8\t\u00f8\U000000f8' m = ur'abc\xf8\t\u00f8\U000000f8'
p = u'\ud800\udc00' # surrogate pair
q = u'\udc00\ud800' # reversed surrogate pair
add = u'Søk ik' + u'üÖä' + u'abc' add = u'Søk ik' + u'üÖä' + u'abc'
null = u'\x00' null = u'\x00'
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment