diff --git a/README.md b/README.md
index 3376f44bf758ba01710f18c40227c9e23f9756cd_UkVBRE1FLm1k..83db8324a4cb41c42fc1d0096bca1ffe6078f677_UkVBRE1FLm1k 100644
--- a/README.md
+++ b/README.md
@@ -888,6 +888,25 @@
 workers) and when multithreaded.
 It also uses some tests from the ultrajson library.
 
+orjson is the most correct of the compared libraries. This table shows how each
+library handles a combined 342 JSON fixtures from the
+[JSONTestSuite](https://github.com/nst/JSONTestSuite) and
+[nativejson-benchmark](https://github.com/miloyip/nativejson-benchmark) tests:
+
+| Library | Invalid JSON fixtures not rejected | Valid JSON fixtures not deserialized |
+|------------|--------------------------------------|----------------------------------------|
+| orjson | 0 | 0 |
+| ujson | 38 | 0 |
+| rapidjson | 6 | 0 |
+| simplejson | 13 | 0 |
+| json | 17 | 0 |
+
+This shows that all libraries deserialize valid JSON but only orjson
+correctly rejects the given invalid JSON fixtures. Errors are largely due to
+accepting invalid strings and numbers.
+
+The table above can be reproduced using the `pycorrectness` script.
+
 ## Performance
 
 Serialization and deserialization performance of orjson is better than
diff --git a/lint b/lint
index 3376f44bf758ba01710f18c40227c9e23f9756cd_bGludA==..83db8324a4cb41c42fc1d0096bca1ffe6078f677_bGludA== 100755
--- a/lint
+++ b/lint
@@ -3,6 +3,6 @@
 set -eou pipefail
 
 autoflake --in-place --recursive --remove-all-unused-imports --ignore-init-module-imports .
-isort ./bench/*.py ./orjson.pyi ./test/*.py pydataclass pymem pysort pynumpy pynonstr
-black ./bench/*.py ./orjson.pyi ./test/*.py pydataclass pymem pysort pynumpy pynonstr
+isort ./bench/*.py ./orjson.pyi ./test/*.py pydataclass pymem pysort pynumpy pynonstr pycorrectness
+black ./bench/*.py ./orjson.pyi ./test/*.py pydataclass pymem pysort pynumpy pynonstr pycorrectness
 mypy --ignore-missing-imports ./bench/*.py ./orjson.pyi ./test/*.py
diff --git a/pycorrectness b/pycorrectness
new file mode 100755
index 0000000000000000000000000000000000000000..83db8324a4cb41c42fc1d0096bca1ffe6078f677_cHljb3JyZWN0bmVzcw==
--- /dev/null
+++ b/pycorrectness
@@ -0,0 +1,190 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: (Apache-2.0 OR MIT)
+
+import collections
+import io
+import lzma
+import os
+from pathlib import Path
+
+from tabulate import tabulate
+
+import orjson
+
+# Fixture directories are resolved relative to this script, not the CWD.
+dirname = os.path.join(os.path.dirname(__file__), "data")
+
+LIBRARIES = ["orjson", "ujson", "rapidjson", "simplejson", "json"]
+
+
+def read_fixture_bytes(filename, subdir=None):
+    """Return the raw bytes of a fixture, decompressing `.xz` files."""
+    if subdir is None:
+        parts = (dirname, filename)
+    else:
+        parts = (dirname, subdir, filename)
+    path = Path(*parts)
+    if path.suffix == ".xz":
+        contents = lzma.decompress(path.read_bytes())
+    else:
+        contents = path.read_bytes()
+    return contents
+
+
+PARSING = {
+    filename: read_fixture_bytes(filename, "parsing")
+    for filename in os.listdir(os.path.join(dirname, "parsing"))
+}
+
+JSONCHECKER = {
+    filename: read_fixture_bytes(filename, "jsonchecker")
+    for filename in os.listdir(os.path.join(dirname, "jsonchecker"))
+}
+
+
+# Maps fixture filename -> {library name: True if it behaved correctly}.
+RESULTS = collections.defaultdict(dict)
+
+# Maps (subdir, filename) -> fixture bytes already read by read_fixture().
+BYTES_CACHE = {}
+
+
+def read_fixture(filename, subdir=None):
+    """Return a fixture's bytes, reading each file from disk at most once."""
+    key = (subdir, filename)
+    if key not in BYTES_CACHE:
+        BYTES_CACHE[key] = read_fixture_bytes(filename, subdir)
+    return BYTES_CACHE[key]
+
+
+def test_passed(library, fixture):
+    """Return True if `library` loads a valid fixture the same as orjson.
+
+    The fixture is tried both as bytes and as a decoded str; any exception
+    counts as a failure.
+    """
+    passed = []
+    try:
+        passed.append(library.loads(fixture) == orjson.loads(fixture))
+        passed.append(
+            library.loads(fixture.decode("utf-8"))
+            == orjson.loads(fixture.decode("utf-8"))
+        )
+    except Exception:
+        passed.append(False)
+    return all(passed)
+
+
+def test_failed(library, fixture):
+    """Return True if `library` rejects an invalid fixture as bytes and str.
+
+    A failure to decode the fixture as UTF-8 also counts as a rejection of
+    the str form.
+    """
+    rejected_as_bytes = False
+    try:
+        library.loads(fixture)
+    except Exception:
+        rejected_as_bytes = True
+
+    rejected_as_str = False
+    try:
+        library.loads(fixture.decode("utf-8"))
+    except Exception:
+        rejected_as_str = True
+    return rejected_as_bytes and rejected_as_str
+
+
+MISTAKEN_PASSES = {key: 0 for key in LIBRARIES}
+
+MISTAKEN_FAILS = {key: 0 for key in LIBRARIES}
+
+# Fixtures named "fail*" that are nonetheless expected to deserialize.
+PASS_WHITELIST = ("fail01.json", "fail18.json")
+
+
+def should_pass(filename):
+    """Return True if the fixture is expected to deserialize."""
+    return (
+        filename.startswith("y_")
+        or filename.startswith("pass")
+        or filename in PASS_WHITELIST
+    )
+
+
+def should_fail(filename):
+    """Return True if the fixture is expected to be rejected."""
+    return (
+        filename.startswith("n_")
+        or filename.startswith("i_string")
+        or filename.startswith("i_object")
+        or filename.startswith("fail")
+    ) and filename not in PASS_WHITELIST
+
+
+for libname in LIBRARIES:
+    library = __import__(libname)
+    for fixture_set in (PARSING, JSONCHECKER):
+        for filename, fixture in fixture_set.items():
+            if should_pass(filename):
+                res = test_passed(library, fixture)
+                RESULTS[filename][libname] = res
+                if not res:
+                    MISTAKEN_PASSES[libname] += 1
+
+            elif should_fail(filename):
+                res = test_failed(library, fixture)
+                RESULTS[filename][libname] = res
+                if not res:
+                    MISTAKEN_FAILS[libname] += 1
+            elif filename.startswith("i_"):
+                # Remaining "i_" fixtures are not scored either way.
+                continue
+            else:
+                raise NotImplementedError
+
+FILENAMES = sorted(list(PARSING.keys()) + list(JSONCHECKER.keys()))
+
+
+tab_results = []
+for filename in FILENAMES:
+    entry = [
+        filename,
+    ]
+    for libname in LIBRARIES:
+        try:
+            entry.append("ok" if RESULTS[filename][libname] else "fail")
+        except KeyError:
+            # Fixture was skipped above; leave the row with only its name.
+            continue
+    tab_results.append(entry)
+
+buf = io.StringIO()
+buf.write(tabulate(tab_results, ["Fixture"] + LIBRARIES, tablefmt="github"))
+buf.write("\n")
+print(buf.getvalue())
+
+failure_results = [
+    [libname, MISTAKEN_FAILS[libname], MISTAKEN_PASSES[libname]]
+    for libname in LIBRARIES
+]
+
+buf = io.StringIO()
+buf.write(
+    tabulate(
+        failure_results,
+        [
+            "Library",
+            "Invalid JSON fixtures not rejected",
+            "Valid JSON fixtures not deserialized",
+        ],
+        tablefmt="github",
+    )
+)
+buf.write("\n")
+print(buf.getvalue())
+
+# Rows with more than just the filename are fixtures that were scored.
+num_results = len([each for each in tab_results if len(each) > 1])
+
+print(f"{num_results} fixtures tested")
diff --git a/test/test_dataclass.py b/test/test_dataclass.py index 3376f44bf758ba01710f18c40227c9e23f9756cd_dGVzdC90ZXN0X2RhdGFjbGFzcy5weQ==..83db8324a4cb41c42fc1d0096bca1ffe6078f677_dGVzdC90ZXN0X2RhdGFjbGFzcy5weQ== 100644 --- a/test/test_dataclass.py +++ b/test/test_dataclass.py @@ -22,8 +22,6 @@ @dataclass class EmptyDataclassSlots: __slots__ = () - pass - @dataclass