Compare correctness

83db8324a4cb · ijl · 3376f44bf758 · 83db8324 · 83db8324 · 83db8324
Commit 83db8324a4cb authored 5 years ago by ijl
--- a/README.md
+++ b/README.md
@@ -888,6 +888,25 @@
 workers) and when
 multithreaded. It also uses some tests from the ultrajson library.
+orjson is the most correct of the compared libraries. This table shows how each
+library handles a combined 342 JSON fixtures from the
+[JSONTestSuite](https://github.com/nst/JSONTestSuite) and
+[nativejson-benchmark](https://github.com/miloyip/nativejson-benchmark) tests:
+| Library    |   Invalid JSON fixtures not rejected |   Valid JSON fixtures not deserialized |
+|------------|--------------------------------------|----------------------------------------|
+| orjson     |                                    0 |                                      0 |
+| ujson      |                                   38 |                                      0 |
+| rapidjson  |                                    6 |                                      0 |
+| simplejson |                                   13 |                                      0 |
+| json       |                                   17 |                                      0 |
+This shows that all libraries deserialize valid JSON but only orjson
+correctly rejects the given invalid JSON fixtures. Errors are largely due to
+accepting invalid strings and numbers.
+The table above can be reproduced using the `pycorrectness` script.
 ## Performance
 Serialization and deserialization performance of orjson is better than

--- a/lint
+++ b/lint
@@ -3,6 +3,6 @@
 set -eou pipefail
 autoflake --in-place --recursive --remove-all-unused-imports --ignore-init-module-imports .
-isort ./bench/*.py ./orjson.pyi ./test/*.py pydataclass pymem pysort pynumpy pynonstr
+isort ./bench/*.py ./orjson.pyi ./test/*.py pydataclass pymem pysort pynumpy pynonstr pycorrectness
-black ./bench/*.py ./orjson.pyi ./test/*.py pydataclass pymem pysort pynumpy pynonstr
+black ./bench/*.py ./orjson.pyi ./test/*.py pydataclass pymem pysort pynumpy pynonstr pycorrectness
 mypy --ignore-missing-imports ./bench/*.py ./orjson.pyi ./test/*.py
--- a/pycorrectness
+++ b/pycorrectness
+#!/usr/bin/env python3
+# SPDX-License-Identifier: (Apache-2.0 OR MIT)
+import collections
+import io
+import lzma
+import os
+from pathlib import Path
+from tabulate import tabulate
+import orjson
+# Root directory of the JSON fixtures: the "data" folder next to this script.
+dirname = os.path.join(os.path.dirname(__file__), "data")
+# Module names of the libraries under comparison. Each one is imported by name
+# and the order here is the column order of the printed tables.
+LIBRARIES = [
+    "orjson",
+    "ujson",
+    "rapidjson",
+    "simplejson",
+    "json",
+]
+def read_fixture_bytes(filename, subdir=None):
+    """Read a fixture from the data directory and return its raw bytes.
+
+    The file is looked up as ``dirname/filename`` or, when ``subdir`` is
+    given, as ``dirname/subdir/filename``. A file whose suffix is ``.xz`` is
+    LZMA-decompressed before being returned.
+    """
+    segments = [dirname, filename] if subdir is None else [dirname, subdir, filename]
+    path = Path(*segments)
+    raw = path.read_bytes()
+    if path.suffix == ".xz":
+        return lzma.decompress(raw)
+    return raw
+# Fixture contents keyed by file name, one mapping per fixture directory. The
+# directories are listed under ``dirname`` (the same root read_fixture_bytes()
+# reads from) so the script works regardless of the current working directory.
+PARSING = {
+    filename: read_fixture_bytes(filename, "parsing")
+    for filename in os.listdir(os.path.join(dirname, "parsing"))
+}
+JSONCHECKER = {
+    filename: read_fixture_bytes(filename, "jsonchecker")
+    for filename in os.listdir(os.path.join(dirname, "jsonchecker"))
+}
+# RESULTS[filename][libname] holds the boolean outcome of the check run for
+# that fixture/library pair.
+RESULTS = collections.defaultdict(dict)
+# Contents of fixtures already read, keyed by (subdir, filename) so equally
+# named files from different subdirectories cannot overwrite each other.
+BYTES_CACHE = {}
+def read_fixture(filename, subdir=None):
+    """Return the bytes of a fixture, reading it from disk at most once.
+
+    Arguments are forwarded to read_fixture_bytes() on a cache miss; later
+    calls with the same ``filename`` and ``subdir`` return the cached bytes.
+    """
+    key = (subdir, filename)
+    if key not in BYTES_CACHE:
+        BYTES_CACHE[key] = read_fixture_bytes(filename, subdir)
+    return BYTES_CACHE[key]
+def test_passed(library, fixture):
+    """Return True if ``library`` deserializes a valid fixture like orjson does.
+
+    The fixture is loaded both as bytes and as UTF-8 decoded text, and each
+    result is compared with what orjson produces for the same input. Any
+    exception raised along the way counts as a failure.
+    """
+    try:
+        bytes_match = library.loads(fixture) == orjson.loads(fixture)
+        text = fixture.decode("utf-8")
+        text_match = library.loads(text) == orjson.loads(text)
+    except Exception:
+        return False
+    return bytes_match and text_match
+def test_failed(library, fixture):
+    """Return True if ``library`` rejects an invalid fixture in both forms.
+
+    The fixture is tried as bytes and as UTF-8 decoded text. Both attempts are
+    always made, and any exception (including a failure to decode) counts as a
+    rejection of that form.
+    """
+    rejections = []
+    for render in (lambda raw: raw, lambda raw: raw.decode("utf-8")):
+        try:
+            library.loads(render(fixture))
+        except Exception:
+            rejections.append(True)
+        else:
+            rejections.append(False)
+    return all(rejections)
+# Per-library counters filled in by the comparison loop. MISTAKEN_PASSES counts
+# fixtures that should pass but did not; MISTAKEN_FAILS counts fixtures that
+# should fail but were not rejected.
+MISTAKEN_PASSES = dict.fromkeys(LIBRARIES, 0)
+MISTAKEN_FAILS = dict.fromkeys(LIBRARIES, 0)
+# Files named "fail*" that are nevertheless expected to be accepted.
+PASS_WHITELIST = ("fail01.json", "fail18.json")
+def should_pass(filename):
+    """Return True if the fixture named ``filename`` is expected to be accepted.
+
+    That is the case for names starting with ``y_`` or ``pass`` and for the
+    names listed in PASS_WHITELIST.
+    """
+    if filename.startswith(("y_", "pass")):
+        return True
+    return filename in PASS_WHITELIST
+def should_fail(filename):
+    """Return True if the fixture named ``filename`` is expected to be rejected.
+
+    That is the case for names starting with ``n_``, ``i_string``,
+    ``i_object`` or ``fail``, unless the name is listed in PASS_WHITELIST.
+    """
+    if filename in PASS_WHITELIST:
+        return False
+    return filename.startswith(("n_", "i_string", "i_object", "fail"))
+# Run every fixture through every library, recording the outcome in RESULTS
+# and counting the wrong outcomes per library.
+for libname in LIBRARIES:
+    library = __import__(libname)
+    for fixture_set in (PARSING, JSONCHECKER):
+        for filename, fixture in fixture_set.items():
+            # Pick the check and the counter that match the expected outcome.
+            if should_pass(filename):
+                check, tally = test_passed, MISTAKEN_PASSES
+            elif should_fail(filename):
+                check, tally = test_failed, MISTAKEN_FAILS
+            elif filename.startswith("i_"):
+                # Remaining "i_" fixtures are not scored either way.
+                continue
+            else:
+                raise NotImplementedError
+            res = check(library, fixture)
+            RESULTS[filename][libname] = res
+            if not res:
+                tally[libname] += 1
+# One row per fixture: the file name followed by "ok"/"fail" for every library
+# that recorded a result. Fixtures that were skipped keep only their name.
+FILENAMES = sorted([*PARSING, *JSONCHECKER])
+tab_results = []
+for filename in FILENAMES:
+    outcomes = RESULTS[filename]
+    row = [filename]
+    row.extend(
+        "ok" if outcomes[libname] else "fail"
+        for libname in LIBRARIES
+        if libname in outcomes
+    )
+    tab_results.append(row)
+print(tabulate(tab_results, ["Fixture"] + LIBRARIES, tablefmt="github") + "\n")
+# Summary table: one row per library with its two error counters.
+failure_results = [
+    [name, MISTAKEN_FAILS[name], MISTAKEN_PASSES[name]] for name in LIBRARIES
+]
+summary_headers = [
+    "Library",
+    "Invalid JSON fixtures not rejected",
+    "Valid JSON fixtures not deserialized",
+]
+print(tabulate(failure_results, summary_headers, tablefmt="github") + "\n")
+# Only rows that carry at least one library result count as tested fixtures.
+num_results = sum(1 for row in tab_results if len(row) > 1)
+print(f"{num_results} fixtures tested")
--- a/test/test_dataclass.py
+++ b/test/test_dataclass.py
@@ -22,8 +22,6 @@
 @dataclass
 class EmptyDataclassSlots:
    __slots__ = ()
-    pass
 @dataclass