# HG changeset patch # User ijl <ijl@mailbox.org> # Date 1597616541 0 # Sun Aug 16 22:22:21 2020 +0000 # Node ID eeefbc0e7566defe1fa0d10f18dec36295bcf881 # Parent 6d3eda83f2375e379913934aba01057a464b6831 Use PyUnicode_DecodeUTF8() for latin1 This was reported in #121 as an issue with some Common Crawl pages. For latin1, the marginal performance difference is not worth worrying about, and using the libpython API fixes the issue. diff --git a/src/unicode.rs b/src/unicode.rs --- a/src/unicode.rs +++ b/src/unicode.rs @@ -77,15 +77,11 @@ ptr }, PyUnicodeKind::OneByte => unsafe { - let ptr = ffi!(PyUnicode_New(num_chars, 255)); - (*ptr.cast::<PyCompactUnicodeObject>()).length = num_chars; - let mut data_ptr = ptr.cast::<PyCompactUnicodeObject>().offset(1) as *mut u8; - for each in buf.chars() { - core::ptr::write(data_ptr, each as u8); - data_ptr = data_ptr.offset(1); - } - core::ptr::write(data_ptr, 0); - ptr + PyUnicode_DecodeUTF8( + buf.as_bytes().as_ptr() as *const c_char, + buf.as_bytes().len() as isize, + "ignore\0".as_ptr() as *const c_char, + ) }, PyUnicodeKind::TwoByte => unsafe { let ptr = ffi!(PyUnicode_New(num_chars, 65535));