# HG changeset patch # User ijl <ijl@mailbox.org> # Date 1594677559 0 # Mon Jul 13 21:59:19 2020 +0000 # Node ID bfde36265adde4fee511cd82e2ffe6e1420db72a # Parent a9eff9f72d1e62ae90f6510c38eff135fdd64790 Reduce work in unicode_from_str() diff --git a/Cargo.lock b/Cargo.lock --- a/Cargo.lock +++ b/Cargo.lock @@ -60,9 +60,9 @@ [[package]] name = "instant" -version = "0.1.5" +version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "69da7ce1490173c2bf4d26bc8be429aaeeaf4cce6c4b970b7949651fa17655fe" +checksum = "5b141fdc7836c525d4d594027d318c84161ca17aaf8113ab1f81ab93ae897485" [[package]] name = "itoa" @@ -85,15 +85,15 @@ [[package]] name = "libc" -version = "0.2.71" +version = "0.2.72" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9457b06509d27052635f90d6466700c65095fdf75409b3fbdd903e988b886f49" +checksum = "a9f8082297d534141b30c8d39e9b1773713ab50fdbe4ff30f750d063b3bfd701" [[package]] name = "lock_api" -version = "0.4.0" +version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "de302ce1fe7482db13738fbaf2e21cfb06a986b89c0bf38d88abf16681aada4e" +checksum = "28247cc5a5be2f05fbcd76dd0cf2c7d3b5400cb978a28042abcd4fa0b3f8261c" dependencies = [ "scopeguard", ] @@ -169,15 +169,15 @@ [[package]] name = "rand_core" -version = "0.4.2" +version = "0.5.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c33a3c44ca05fa6f1807d8e6743f3824e8509beca625669633be0acbdf509dc" +checksum = "90bde5296fc891b0cef12a6d03ddccc162ce7b2aff54160af9338f8d40df6d19" [[package]] name = "redox_syscall" -version = "0.1.56" +version = "0.1.57" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2439c63f3f6139d1b57529d16bc3b8bb855230c8efcc5d3a896c8bea7c3b1e84" +checksum = "41cc0f7e4d5d4544e8861606a285bb08d3e70712ccc7d2b84d7c0ccfaf4b05ce" [[package]] name = "ryu" @@ -209,9 +209,9 @@ [[package]] name = "smallvec" -version = "1.4.0" +version = "1.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c7cb5678e1615754284ec264d9bb5b4c27d2018577fd90ac0ceb578591ed5ee4" +checksum = "3757cb9d89161a2f24e1cf78efa0c1fcff485d18e3f55e0aa3480824ddaa0f3f" [[package]] name = "static_assertions" @@ -243,9 +243,9 @@ [[package]] name = "wyhash" -version = "0.3.0" +version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "782a50f48ac4336916227cd199c61c7b42f38d0ad705421b49eb12c74c53ae00" +checksum = "0fe26121db27575e4fb30ceded9806fbfe0edb489f170a17506d9ad0b1aca41c" dependencies = [ "rand_core", ] diff --git a/Cargo.toml b/Cargo.toml --- a/Cargo.toml +++ b/Cargo.toml @@ -60,7 +60,7 @@ serde = { version = "1", default_features = false } serde_json = { path = "./json", default_features = false, features = ["std"] } smallvec = { version = "1", default_features = false, features = ["const_generics", "union", "specialization", "write"] } -wyhash = { version = "0.3" } +wyhash = { version = "0.4" } [profile.release] codegen-units = 1 diff --git a/bench/benchmark_dumps.py b/bench/benchmark_dumps.py --- a/bench/benchmark_dumps.py +++ b/bench/benchmark_dumps.py @@ -4,11 +4,12 @@ from json import dumps as _json_dumps from json import loads as json_loads -from orjson import dumps as _orjson_dumps from rapidjson import dumps as _rapidjson_dumps from simplejson import dumps as _simplejson_dumps from ujson import dumps as _ujson_dumps +from orjson import dumps as _orjson_dumps + from .util import read_fixture_obj diff --git a/bench/benchmark_loads.py b/bench/benchmark_loads.py --- a/bench/benchmark_loads.py +++ b/bench/benchmark_loads.py @@ -4,8 +4,6 @@ from json import dumps as json_dumps from json import loads as json_loads -from orjson import dumps as orjson_dumps -from orjson import loads as orjson_loads from rapidjson import dumps as rapidjson_dumps from rapidjson import loads as rapidjson_loads from simplejson import dumps as simplejson_dumps @@ -13,6 +11,9 @@ from ujson import dumps as ujson_dumps from ujson import loads as ujson_loads +from orjson import dumps as orjson_dumps +from orjson import loads as orjson_loads + from .util import read_fixture_str diff --git a/pydataclass b/pydataclass --- a/pydataclass +++ b/pydataclass @@ -8,12 +8,13 @@ from timeit import timeit from typing import List -import orjson import rapidjson import simplejson import ujson from tabulate import tabulate +import orjson + os.sched_setaffinity(os.getpid(), {0, 1}) diff --git a/pynonstr b/pynonstr --- a/pynonstr +++ b/pynonstr @@ -9,12 +9,13 @@ from time import mktime from timeit import timeit -import orjson import rapidjson import simplejson import ujson from tabulate import tabulate +import orjson + os.sched_setaffinity(os.getpid(), {0, 1}) data_as_obj = [] diff --git a/pynumpy b/pynumpy --- a/pynumpy +++ b/pynumpy @@ -10,13 +10,14 @@ from timeit import timeit import numpy -import orjson import psutil import rapidjson import simplejson from memory_profiler import memory_usage from tabulate import tabulate +import orjson + os.sched_setaffinity(os.getpid(), {0, 1}) diff --git a/pysort b/pysort --- a/pysort +++ b/pysort @@ -8,12 +8,13 @@ from pathlib import Path from timeit import timeit -import orjson import rapidjson import simplejson import ujson from tabulate import tabulate +import orjson + os.sched_setaffinity(os.getpid(), {0, 1}) diff --git a/src/unicode.rs b/src/unicode.rs --- a/src/unicode.rs +++ b/src/unicode.rs @@ -33,6 +33,15 @@ const STATE_COMPACT_ASCII: u32 = 0b00000000000000000000000001100000; const STATE_COMPACT: u32 = 0b00000000000000000000000000100000; +fn is_four_byte(buf: &str) -> bool { + for &each in buf.as_bytes() { + if unlikely!(each >= 240) { + return true; + } + } + false +} + enum PyUnicodeKind { Ascii, OneByte, @@ -40,14 +49,13 @@ FourByte, } -fn find_str_kind(buf: &str) -> PyUnicodeKind { - if encoding_rs::mem::is_ascii(buf.as_bytes()) { - // needed to optimize ASCII case +fn find_str_kind(buf: &str, num_chars: usize) -> PyUnicodeKind { + if buf.len() == num_chars { PyUnicodeKind::Ascii } else if unlikely!(encoding_rs::mem::is_str_latin1(buf)) { // fails fast, no obvious effect on CJK PyUnicodeKind::OneByte - } else if *buf.as_bytes().iter().max().unwrap() >= 240 { + } else if is_four_byte(buf) { PyUnicodeKind::FourByte } else { PyUnicodeKind::TwoByte @@ -60,7 +68,8 @@ ffi!(Py_INCREF(EMPTY_UNICODE)); unsafe { EMPTY_UNICODE } } else { - match find_str_kind(buf) { + let num_chars = bytecount::num_chars(buf.as_bytes()) as isize; + match find_str_kind(buf, num_chars as usize) { PyUnicodeKind::Ascii => unsafe { let ptr = ffi!(PyUnicode_New(len as isize, 127)); let data_ptr = ptr.cast::<PyASCIIObject>().offset(1) as *mut u8; @@ -69,39 +78,36 @@ ptr }, PyUnicodeKind::OneByte => unsafe { - let num_chars = bytecount::num_chars(buf.as_bytes()) as isize; - let ptr = ffi!(PyUnicode_New(num_chars as isize, 255)); + let ptr = ffi!(PyUnicode_New(num_chars, 255)); + (*ptr.cast::<PyCompactUnicodeObject>()).length = num_chars; let mut data_ptr = ptr.cast::<PyCompactUnicodeObject>().offset(1) as *mut u8; for each in buf.chars() { core::ptr::write(data_ptr, each as u8); data_ptr = data_ptr.offset(1); } core::ptr::write(data_ptr, 0); - (*ptr.cast::<PyCompactUnicodeObject>()).length = num_chars; ptr }, PyUnicodeKind::TwoByte => unsafe { - let num_chars = bytecount::num_chars(buf.as_bytes()) as isize; let ptr = ffi!(PyUnicode_New(num_chars, 65535)); + (*ptr.cast::<PyCompactUnicodeObject>()).length = num_chars; let mut data_ptr = ptr.cast::<PyCompactUnicodeObject>().offset(1) as *mut u16; for each in buf.chars() { core::ptr::write(data_ptr, each as u16); data_ptr = data_ptr.offset(1); } core::ptr::write(data_ptr, 0); - (*ptr.cast::<PyCompactUnicodeObject>()).length = num_chars; ptr }, PyUnicodeKind::FourByte => unsafe { - let num_chars = bytecount::num_chars(buf.as_bytes()) as isize; let ptr = ffi!(PyUnicode_New(num_chars, 1114111)); + (*ptr.cast::<PyCompactUnicodeObject>()).length = num_chars; let mut data_ptr = ptr.cast::<PyCompactUnicodeObject>().offset(1) as *mut u32; for each in buf.chars() { core::ptr::write(data_ptr, each as u32); data_ptr = data_ptr.offset(1); } core::ptr::write(data_ptr, 0); - (*ptr.cast::<PyCompactUnicodeObject>()).length = num_chars; ptr }, } diff --git a/test/test_datetime.py b/test/test_datetime.py --- a/test/test_datetime.py +++ b/test/test_datetime.py @@ -3,11 +3,12 @@ import datetime import unittest -import orjson import pytest import pytz from dateutil import tz +import orjson + try: import pendulum except ImportError: diff --git a/test/test_memory.py b/test/test_memory.py --- a/test/test_memory.py +++ b/test/test_memory.py @@ -7,10 +7,11 @@ import unittest from typing import List -import orjson import psutil import pytest +import orjson + try: import numpy except ImportError: diff --git a/test/test_non_str_keys.py b/test/test_non_str_keys.py --- a/test/test_non_str_keys.py +++ b/test/test_non_str_keys.py @@ -5,10 +5,11 @@ import unittest import uuid -import orjson import pytest import pytz +import orjson + try: import numpy except ImportError: diff --git a/test/test_numpy.py b/test/test_numpy.py --- a/test/test_numpy.py +++ b/test/test_numpy.py @@ -2,8 +2,9 @@ import unittest +import pytest + import orjson -import pytest try: import numpy diff --git a/test/test_type.py b/test/test_type.py --- a/test/test_type.py +++ b/test/test_type.py @@ -2,8 +2,9 @@ import unittest +import xxhash + import orjson -import xxhash class TypeTests(unittest.TestCase):