# HG changeset patch
# User ijl <ijl@mailbox.org>
# Date 1594677559 0
#      Mon Jul 13 21:59:19 2020 +0000
# Node ID bfde36265adde4fee511cd82e2ffe6e1420db72a
# Parent  a9eff9f72d1e62ae90f6510c38eff135fdd64790
Reduce work in unicode_from_str()

diff --git a/Cargo.lock b/Cargo.lock
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -60,9 +60,9 @@
 
 [[package]]
 name = "instant"
-version = "0.1.5"
+version = "0.1.6"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "69da7ce1490173c2bf4d26bc8be429aaeeaf4cce6c4b970b7949651fa17655fe"
+checksum = "5b141fdc7836c525d4d594027d318c84161ca17aaf8113ab1f81ab93ae897485"
 
 [[package]]
 name = "itoa"
@@ -85,15 +85,15 @@
 
 [[package]]
 name = "libc"
-version = "0.2.71"
+version = "0.2.72"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9457b06509d27052635f90d6466700c65095fdf75409b3fbdd903e988b886f49"
+checksum = "a9f8082297d534141b30c8d39e9b1773713ab50fdbe4ff30f750d063b3bfd701"
 
 [[package]]
 name = "lock_api"
-version = "0.4.0"
+version = "0.4.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "de302ce1fe7482db13738fbaf2e21cfb06a986b89c0bf38d88abf16681aada4e"
+checksum = "28247cc5a5be2f05fbcd76dd0cf2c7d3b5400cb978a28042abcd4fa0b3f8261c"
 dependencies = [
  "scopeguard",
 ]
@@ -169,15 +169,15 @@
 
 [[package]]
 name = "rand_core"
-version = "0.4.2"
+version = "0.5.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "9c33a3c44ca05fa6f1807d8e6743f3824e8509beca625669633be0acbdf509dc"
+checksum = "90bde5296fc891b0cef12a6d03ddccc162ce7b2aff54160af9338f8d40df6d19"
 
 [[package]]
 name = "redox_syscall"
-version = "0.1.56"
+version = "0.1.57"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2439c63f3f6139d1b57529d16bc3b8bb855230c8efcc5d3a896c8bea7c3b1e84"
+checksum = "41cc0f7e4d5d4544e8861606a285bb08d3e70712ccc7d2b84d7c0ccfaf4b05ce"
 
 [[package]]
 name = "ryu"
@@ -209,9 +209,9 @@
 
 [[package]]
 name = "smallvec"
-version = "1.4.0"
+version = "1.4.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "c7cb5678e1615754284ec264d9bb5b4c27d2018577fd90ac0ceb578591ed5ee4"
+checksum = "3757cb9d89161a2f24e1cf78efa0c1fcff485d18e3f55e0aa3480824ddaa0f3f"
 
 [[package]]
 name = "static_assertions"
@@ -243,9 +243,9 @@
 
 [[package]]
 name = "wyhash"
-version = "0.3.0"
+version = "0.4.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "782a50f48ac4336916227cd199c61c7b42f38d0ad705421b49eb12c74c53ae00"
+checksum = "0fe26121db27575e4fb30ceded9806fbfe0edb489f170a17506d9ad0b1aca41c"
 dependencies = [
  "rand_core",
 ]
diff --git a/Cargo.toml b/Cargo.toml
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -60,7 +60,7 @@
 serde = { version = "1", default_features = false }
 serde_json = { path = "./json", default_features = false, features = ["std"] }
 smallvec = { version = "1", default_features = false, features = ["const_generics", "union", "specialization", "write"] }
-wyhash = { version = "0.3" }
+wyhash = { version = "0.4" }
 
 [profile.release]
 codegen-units = 1
diff --git a/bench/benchmark_dumps.py b/bench/benchmark_dumps.py
--- a/bench/benchmark_dumps.py
+++ b/bench/benchmark_dumps.py
@@ -4,11 +4,12 @@
 from json import dumps as _json_dumps
 from json import loads as json_loads
 
-from orjson import dumps as _orjson_dumps
 from rapidjson import dumps as _rapidjson_dumps
 from simplejson import dumps as _simplejson_dumps
 from ujson import dumps as _ujson_dumps
 
+from orjson import dumps as _orjson_dumps
+
 from .util import read_fixture_obj
 
 
diff --git a/bench/benchmark_loads.py b/bench/benchmark_loads.py
--- a/bench/benchmark_loads.py
+++ b/bench/benchmark_loads.py
@@ -4,8 +4,6 @@
 from json import dumps as json_dumps
 from json import loads as json_loads
 
-from orjson import dumps as orjson_dumps
-from orjson import loads as orjson_loads
 from rapidjson import dumps as rapidjson_dumps
 from rapidjson import loads as rapidjson_loads
 from simplejson import dumps as simplejson_dumps
@@ -13,6 +11,9 @@
 from ujson import dumps as ujson_dumps
 from ujson import loads as ujson_loads
 
+from orjson import dumps as orjson_dumps
+from orjson import loads as orjson_loads
+
 from .util import read_fixture_str
 
 
diff --git a/pydataclass b/pydataclass
--- a/pydataclass
+++ b/pydataclass
@@ -8,12 +8,13 @@
 from timeit import timeit
 from typing import List
 
-import orjson
 import rapidjson
 import simplejson
 import ujson
 from tabulate import tabulate
 
+import orjson
+
 os.sched_setaffinity(os.getpid(), {0, 1})
 
 
diff --git a/pynonstr b/pynonstr
--- a/pynonstr
+++ b/pynonstr
@@ -9,12 +9,13 @@
 from time import mktime
 from timeit import timeit
 
-import orjson
 import rapidjson
 import simplejson
 import ujson
 from tabulate import tabulate
 
+import orjson
+
 os.sched_setaffinity(os.getpid(), {0, 1})
 
 data_as_obj = []
diff --git a/pynumpy b/pynumpy
--- a/pynumpy
+++ b/pynumpy
@@ -10,13 +10,14 @@
 from timeit import timeit
 
 import numpy
-import orjson
 import psutil
 import rapidjson
 import simplejson
 from memory_profiler import memory_usage
 from tabulate import tabulate
 
+import orjson
+
 os.sched_setaffinity(os.getpid(), {0, 1})
 
 
diff --git a/pysort b/pysort
--- a/pysort
+++ b/pysort
@@ -8,12 +8,13 @@
 from pathlib import Path
 from timeit import timeit
 
-import orjson
 import rapidjson
 import simplejson
 import ujson
 from tabulate import tabulate
 
+import orjson
+
 os.sched_setaffinity(os.getpid(), {0, 1})
 
 
diff --git a/src/unicode.rs b/src/unicode.rs
--- a/src/unicode.rs
+++ b/src/unicode.rs
@@ -33,6 +33,15 @@
 const STATE_COMPACT_ASCII: u32 = 0b00000000000000000000000001100000;
 const STATE_COMPACT: u32 = 0b00000000000000000000000000100000;
 
+fn is_four_byte(buf: &str) -> bool { // true if any byte of `buf` is >= 240 (0xF0)
+    for &each in buf.as_bytes() { // scan raw UTF-8 bytes; no char decoding needed
+        if unlikely!(each >= 240) { // in valid UTF-8, bytes >= 0xF0 only lead 4-byte sequences
+            return true; // stop at the first such byte
+        }
+    }
+    false // every byte was below 240
+}
+
 enum PyUnicodeKind {
     Ascii,
     OneByte,
@@ -40,14 +49,13 @@
     FourByte,
 }
 
-fn find_str_kind(buf: &str) -> PyUnicodeKind {
-    if encoding_rs::mem::is_ascii(buf.as_bytes()) {
-        // needed to optimize ASCII case
+fn find_str_kind(buf: &str, num_chars: usize) -> PyUnicodeKind {
+    if buf.len() == num_chars {
         PyUnicodeKind::Ascii
     } else if unlikely!(encoding_rs::mem::is_str_latin1(buf)) {
         // fails fast, no obvious effect on CJK
         PyUnicodeKind::OneByte
-    } else if *buf.as_bytes().iter().max().unwrap() >= 240 {
+    } else if is_four_byte(buf) {
         PyUnicodeKind::FourByte
     } else {
         PyUnicodeKind::TwoByte
@@ -60,7 +68,8 @@
         ffi!(Py_INCREF(EMPTY_UNICODE));
         unsafe { EMPTY_UNICODE }
     } else {
-        match find_str_kind(buf) {
+        let num_chars = bytecount::num_chars(buf.as_bytes()) as isize;
+        match find_str_kind(buf, num_chars as usize) {
             PyUnicodeKind::Ascii => unsafe {
                 let ptr = ffi!(PyUnicode_New(len as isize, 127));
                 let data_ptr = ptr.cast::<PyASCIIObject>().offset(1) as *mut u8;
@@ -69,39 +78,36 @@
                 ptr
             },
             PyUnicodeKind::OneByte => unsafe {
-                let num_chars = bytecount::num_chars(buf.as_bytes()) as isize;
-                let ptr = ffi!(PyUnicode_New(num_chars as isize, 255));
+                let ptr = ffi!(PyUnicode_New(num_chars, 255));
+                (*ptr.cast::<PyCompactUnicodeObject>()).length = num_chars;
                 let mut data_ptr = ptr.cast::<PyCompactUnicodeObject>().offset(1) as *mut u8;
                 for each in buf.chars() {
                     core::ptr::write(data_ptr, each as u8);
                     data_ptr = data_ptr.offset(1);
                 }
                 core::ptr::write(data_ptr, 0);
-                (*ptr.cast::<PyCompactUnicodeObject>()).length = num_chars;
                 ptr
             },
             PyUnicodeKind::TwoByte => unsafe {
-                let num_chars = bytecount::num_chars(buf.as_bytes()) as isize;
                 let ptr = ffi!(PyUnicode_New(num_chars, 65535));
+                (*ptr.cast::<PyCompactUnicodeObject>()).length = num_chars;
                 let mut data_ptr = ptr.cast::<PyCompactUnicodeObject>().offset(1) as *mut u16;
                 for each in buf.chars() {
                     core::ptr::write(data_ptr, each as u16);
                     data_ptr = data_ptr.offset(1);
                 }
                 core::ptr::write(data_ptr, 0);
-                (*ptr.cast::<PyCompactUnicodeObject>()).length = num_chars;
                 ptr
             },
             PyUnicodeKind::FourByte => unsafe {
-                let num_chars = bytecount::num_chars(buf.as_bytes()) as isize;
                 let ptr = ffi!(PyUnicode_New(num_chars, 1114111));
+                (*ptr.cast::<PyCompactUnicodeObject>()).length = num_chars;
                 let mut data_ptr = ptr.cast::<PyCompactUnicodeObject>().offset(1) as *mut u32;
                 for each in buf.chars() {
                     core::ptr::write(data_ptr, each as u32);
                     data_ptr = data_ptr.offset(1);
                 }
                 core::ptr::write(data_ptr, 0);
-                (*ptr.cast::<PyCompactUnicodeObject>()).length = num_chars;
                 ptr
             },
         }
diff --git a/test/test_datetime.py b/test/test_datetime.py
--- a/test/test_datetime.py
+++ b/test/test_datetime.py
@@ -3,11 +3,12 @@
 import datetime
 import unittest
 
-import orjson
 import pytest
 import pytz
 from dateutil import tz
 
+import orjson
+
 try:
     import pendulum
 except ImportError:
diff --git a/test/test_memory.py b/test/test_memory.py
--- a/test/test_memory.py
+++ b/test/test_memory.py
@@ -7,10 +7,11 @@
 import unittest
 from typing import List
 
-import orjson
 import psutil
 import pytest
 
+import orjson
+
 try:
     import numpy
 except ImportError:
diff --git a/test/test_non_str_keys.py b/test/test_non_str_keys.py
--- a/test/test_non_str_keys.py
+++ b/test/test_non_str_keys.py
@@ -5,10 +5,11 @@
 import unittest
 import uuid
 
-import orjson
 import pytest
 import pytz
 
+import orjson
+
 try:
     import numpy
 except ImportError:
diff --git a/test/test_numpy.py b/test/test_numpy.py
--- a/test/test_numpy.py
+++ b/test/test_numpy.py
@@ -2,8 +2,9 @@
 
 import unittest
 
+import pytest
+
 import orjson
-import pytest
 
 try:
     import numpy
diff --git a/test/test_type.py b/test/test_type.py
--- a/test/test_type.py
+++ b/test/test_type.py
@@ -2,8 +2,9 @@
 
 import unittest
 
+import xxhash
+
 import orjson
-import xxhash
 
 
 class TypeTests(unittest.TestCase):