# HG changeset patch
# User ijl <ijl@mailbox.org>
# Date 1597616541 0
#      Sun Aug 16 22:22:21 2020 +0000
# Node ID eeefbc0e7566defe1fa0d10f18dec36295bcf881
# Parent  6d3eda83f2375e379913934aba01057a464b6831
Use PyUnicode_DecodeUTF8() for latin1

This was reported in #121 as an issue with some Common Crawl
pages. For latin1 strings, the marginal performance of the
hand-written copy is not worth worrying about, and using the
libpython API fixes the issue.

diff --git a/src/unicode.rs b/src/unicode.rs
--- a/src/unicode.rs
+++ b/src/unicode.rs
@@ -77,15 +77,11 @@
                 ptr
             },
             PyUnicodeKind::OneByte => unsafe {
-                let ptr = ffi!(PyUnicode_New(num_chars, 255));
-                (*ptr.cast::<PyCompactUnicodeObject>()).length = num_chars;
-                let mut data_ptr = ptr.cast::<PyCompactUnicodeObject>().offset(1) as *mut u8;
-                for each in buf.chars() {
-                    core::ptr::write(data_ptr, each as u8);
-                    data_ptr = data_ptr.offset(1);
-                }
-                core::ptr::write(data_ptr, 0);
-                ptr
+                PyUnicode_DecodeUTF8(
+                    buf.as_bytes().as_ptr() as *const c_char,
+                    buf.as_bytes().len() as isize,
+                    "ignore\0".as_ptr() as *const c_char,
+                )
             },
             PyUnicodeKind::TwoByte => unsafe {
                 let ptr = ffi!(PyUnicode_New(num_chars, 65535));