diff --git a/CHANGES.txt b/CHANGES.txt index ed9859bb61691e7c9fcec65d568d87bbaa1a391b_Q0hBTkdFUy50eHQ=..1f749686d39ef9a7917500b201848dc8873d4172_Q0hBTkdFUy50eHQ= 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,5 +1,7 @@ Version 2.1.0 released XXXX-XX-XX +* Decoding performance and memory utilization enhancements + http://bugs.python.org/issue7451 * JSONEncoderForHTML class for escaping &, <, > http://code.google.com/p/simplejson/issues/detail?id=66 * Memoization of object keys during encoding (when using speedups) diff --git a/simplejson/_speedups.c b/simplejson/_speedups.c index ed9859bb61691e7c9fcec65d568d87bbaa1a391b_c2ltcGxlanNvbi9fc3BlZWR1cHMuYw==..1f749686d39ef9a7917500b201848dc8873d4172_c2ltcGxlanNvbi9fc3BlZWR1cHMuYw== 100644 --- a/simplejson/_speedups.c +++ b/simplejson/_speedups.c @@ -54,6 +54,7 @@ PyObject *parse_float; PyObject *parse_int; PyObject *parse_constant; + PyObject *memo; } PyScannerObject; static PyMemberDef scanner_members[] = { @@ -441,6 +442,21 @@ return tpl; } +#define APPEND_OLD_CHUNK \ + if (chunk != NULL) { \ + if (chunks == NULL) { \ + chunks = PyList_New(0); \ + if (chunks == NULL) { \ + goto bail; \ + } \ + } \ + if (PyList_Append(chunks, chunk)) { \ + Py_DECREF(chunk); \ + goto bail; \ + } \ + Py_CLEAR(chunk); \ + } + static PyObject * scanstring_str(PyObject *pystr, Py_ssize_t end, char *encoding, int strict, Py_ssize_t *next_end_ptr) { @@ -459,10 +475,8 @@ Py_ssize_t next = begin; int has_unicode = 0; char *buf = PyString_AS_STRING(pystr); - PyObject *chunks = PyList_New(0); - if (chunks == NULL) { - goto bail; - } + PyObject *chunks = NULL; + PyObject *chunk = NULL; if (end < 0 || len <= end) { PyErr_SetString(PyExc_ValueError, "end is out of bounds"); goto bail; @@ -470,7 +484,6 @@ while (1) { /* Find the end of the string or the next escape */ Py_UNICODE c = 0; - PyObject *chunk = NULL; for (next = end; next < len; next++) { c = (unsigned char)buf[next]; if (c == '"' || c == '\\') { @@ -490,6 +503,7 @@ } /* Pick up this chunk if it's not zero length */ if (next != end) { + APPEND_OLD_CHUNK PyObject *strchunk = PyString_FromStringAndSize(&buf[end], next - end); if (strchunk == NULL) { goto bail; @@ -504,11 +518,6 @@ else { chunk = strchunk; } - if (PyList_Append(chunks, chunk)) { - Py_DECREF(chunk); - goto bail; - } - Py_DECREF(chunk); } next++; if (c == '"') { @@ -613,6 +622,7 @@ if (c > 0x7f) { has_unicode = 1; } + APPEND_OLD_CHUNK if (has_unicode) { chunk = PyUnicode_FromUnicode(&c, 1); if (chunk == NULL) { @@ -626,7 +636,17 @@ goto bail; } } - if (PyList_Append(chunks, chunk)) { - Py_DECREF(chunk); + } + + if (chunks == NULL) { + if (chunk != NULL) + rval = chunk; + else + rval = PyString_FromStringAndSize("", 0); + } + else { + APPEND_OLD_CHUNK + rval = join_list_string(chunks); + if (rval == NULL) { goto bail; } @@ -631,5 +651,5 @@ goto bail; } - Py_DECREF(chunk); + Py_CLEAR(chunks); } @@ -634,11 +654,6 @@ } - rval = join_list_string(chunks); - if (rval == NULL) { - goto bail; - } - Py_CLEAR(chunks); *next_end_ptr = end; return rval; bail: *next_end_ptr = -1; @@ -641,7 +656,8 @@ *next_end_ptr = end; return rval; bail: *next_end_ptr = -1; + Py_XDECREF(chunk); Py_XDECREF(chunks); return NULL; } @@ -663,10 +679,9 @@ Py_ssize_t begin = end - 1; Py_ssize_t next = begin; const Py_UNICODE *buf = PyUnicode_AS_UNICODE(pystr); - PyObject *chunks = PyList_New(0); - if (chunks == NULL) { - goto bail; - } + PyObject *chunks = NULL; + PyObject *chunk = NULL; + if (end < 0 || len <= end) { PyErr_SetString(PyExc_ValueError, "end is out of bounds"); goto bail; @@ -674,7 +689,6 @@ while (1) { /* Find the end of the string or the next escape */ Py_UNICODE c = 0; - PyObject *chunk = NULL; for (next = end; next < len; next++) { c = buf[next]; if (c == '"' || c == '\\') { @@ -691,7 +705,8 @@ } /* Pick up this chunk if it's not zero length */ if (next != end) { + APPEND_OLD_CHUNK chunk = PyUnicode_FromUnicode(&buf[end], next - end); if (chunk == NULL) { goto bail; } @@ -694,12 +709,7 @@ chunk = PyUnicode_FromUnicode(&buf[end], next - end); if (chunk == NULL) { goto bail; } - if (PyList_Append(chunks, chunk)) { - Py_DECREF(chunk); - goto bail; - } - Py_DECREF(chunk); } next++; if (c == '"') { @@ -801,7 +811,8 @@ } #endif } + APPEND_OLD_CHUNK chunk = PyUnicode_FromUnicode(&c, 1); if (chunk == NULL) { goto bail; } @@ -804,8 +815,18 @@ chunk = PyUnicode_FromUnicode(&c, 1); if (chunk == NULL) { goto bail; } - if (PyList_Append(chunks, chunk)) { - Py_DECREF(chunk); + } + + if (chunks == NULL) { + if (chunk != NULL) + rval = chunk; + else + rval = PyUnicode_FromStringAndSize("", 0); + } + else { + APPEND_OLD_CHUNK + rval = join_list_unicode(chunks); + if (rval == NULL) { goto bail; } @@ -810,4 +831,4 @@ goto bail; } - Py_DECREF(chunk); + Py_CLEAR(chunks); } @@ -813,11 +834,5 @@ } - - rval = join_list_unicode(chunks); - if (rval == NULL) { - goto bail; - } - Py_DECREF(chunks); *next_end_ptr = end; return rval; bail: *next_end_ptr = -1; @@ -820,7 +835,8 @@ *next_end_ptr = end; return rval; bail: *next_end_ptr = -1; + Py_XDECREF(chunk); Py_XDECREF(chunks); return NULL; } @@ -914,6 +930,7 @@ Py_VISIT(s->parse_float); Py_VISIT(s->parse_int); Py_VISIT(s->parse_constant); + Py_VISIT(s->memo); return 0; } @@ -930,6 +947,7 @@ Py_CLEAR(s->parse_float); Py_CLEAR(s->parse_int); Py_CLEAR(s->parse_constant); + Py_CLEAR(s->memo); return 0; } @@ -945,10 +963,10 @@ */ char *str = PyString_AS_STRING(pystr); Py_ssize_t end_idx = PyString_GET_SIZE(pystr) - 1; - PyObject *rval; - PyObject *pairs; + PyObject *rval = NULL; + PyObject *pairs = NULL; PyObject *item; PyObject *key = NULL; PyObject *val = NULL; char *encoding = PyString_AS_STRING(s->encoding); int strict = PyObject_IsTrue(s->strict); @@ -950,6 +968,7 @@ PyObject *item; PyObject *key = NULL; PyObject *val = NULL; char *encoding = PyString_AS_STRING(s->encoding); int strict = PyObject_IsTrue(s->strict); + int has_pairs_hook = (s->pairs_hook != Py_None); Py_ssize_t next_idx; @@ -955,7 +974,14 @@ Py_ssize_t next_idx; - pairs = PyList_New(0); - if (pairs == NULL) - return NULL; + if (has_pairs_hook) { + pairs = PyList_New(0); + if (pairs == NULL) + return NULL; + } + else { + rval = PyDict_New(); + if (rval == NULL) + return NULL; + } /* skip whitespace after { */ while (idx <= end_idx && IS_WHITESPACE(str[idx])) idx++; @@ -963,6 +989,8 @@ /* only loop if the object is non-empty */ if (idx <= end_idx && str[idx] != '}') { while (idx <= end_idx) { + PyObject *memokey; + /* read key */ if (str[idx] != '"') { raise_errmsg("Expecting property name", pystr, idx); @@ -971,6 +999,16 @@ key = scanstring_str(pystr, idx + 1, encoding, strict, &next_idx); if (key == NULL) goto bail; + memokey = PyDict_GetItem(s->memo, key); + if (memokey != NULL) { + Py_INCREF(memokey); + Py_DECREF(key); + key = memokey; + } + else { + if (PyDict_SetItem(s->memo, key, key) < 0) + goto bail; + } idx = next_idx; /* skip whitespace between key and : delimiter, read :, skip whitespace */ @@ -987,10 +1025,14 @@ if (val == NULL) goto bail; - item = PyTuple_Pack(2, key, val); - if (item == NULL) - goto bail; - Py_CLEAR(key); - Py_CLEAR(val); - if (PyList_Append(pairs, item) == -1) { + if (has_pairs_hook) { + item = PyTuple_Pack(2, key, val); + if (item == NULL) + goto bail; + Py_CLEAR(key); + Py_CLEAR(val); + if (PyList_Append(pairs, item) == -1) { + Py_DECREF(item); + goto bail; + } Py_DECREF(item); @@ -996,3 +1038,2 @@ Py_DECREF(item); - goto bail; } @@ -998,5 +1039,10 @@ } - Py_DECREF(item); + else { + if (PyDict_SetItem(rval, key, val) < 0) + goto bail; + Py_CLEAR(key); + Py_CLEAR(val); + } idx = next_idx; /* skip whitespace before } or , */ @@ -1033,12 +1079,6 @@ return val; } - rval = PyObject_CallFunctionObjArgs((PyObject *)(&PyDict_Type), - pairs, NULL); - if (rval == NULL) - goto bail; - Py_CLEAR(pairs); - /* if object_hook is not None: rval = object_hook(rval) */ if (s->object_hook != Py_None) { val = PyObject_CallFunctionObjArgs(s->object_hook, rval, NULL); @@ -1051,6 +1091,7 @@ *next_idx_ptr = idx + 1; return rval; bail: + Py_XDECREF(rval); Py_XDECREF(key); Py_XDECREF(val); Py_XDECREF(pairs); @@ -1068,9 +1109,9 @@ */ Py_UNICODE *str = PyUnicode_AS_UNICODE(pystr); Py_ssize_t end_idx = PyUnicode_GET_SIZE(pystr) - 1; - PyObject *rval; - PyObject *pairs; + PyObject *rval = NULL; + PyObject *pairs = NULL; PyObject *item; PyObject *key = NULL; PyObject *val = NULL; int strict = PyObject_IsTrue(s->strict); @@ -1073,6 +1114,7 @@ PyObject *item; PyObject *key = NULL; PyObject *val = NULL; int strict = PyObject_IsTrue(s->strict); + int has_pairs_hook = (s->pairs_hook != Py_None); Py_ssize_t next_idx; @@ -1077,8 +1119,15 @@ Py_ssize_t next_idx; - pairs = PyList_New(0); - if (pairs == NULL) - return NULL; + if (has_pairs_hook) { + pairs = PyList_New(0); + if (pairs == NULL) + return NULL; + } + else { + rval = PyDict_New(); + if (rval == NULL) + return NULL; + } /* skip whitespace after { */ while (idx <= end_idx && IS_WHITESPACE(str[idx])) idx++; @@ -1086,6 +1135,8 @@ /* only loop if the object is non-empty */ if (idx <= end_idx && str[idx] != '}') { while (idx <= end_idx) { + PyObject *memokey; + /* read key */ if (str[idx] != '"') { raise_errmsg("Expecting property name", pystr, idx); @@ -1094,6 +1145,16 @@ key = scanstring_unicode(pystr, idx + 1, strict, &next_idx); if (key == NULL) goto bail; + memokey = PyDict_GetItem(s->memo, key); + if (memokey != NULL) { + Py_INCREF(memokey); + Py_DECREF(key); + key = memokey; + } + else { + if (PyDict_SetItem(s->memo, key, key) < 0) + goto bail; + } idx = next_idx; /* skip whitespace between key and : delimiter, read :, skip whitespace */ @@ -1110,10 +1171,14 @@ if (val == NULL) goto bail; - item = PyTuple_Pack(2, key, val); - if (item == NULL) - goto bail; - Py_CLEAR(key); - Py_CLEAR(val); - if (PyList_Append(pairs, item) == -1) { + if (has_pairs_hook) { + item = PyTuple_Pack(2, key, val); + if (item == NULL) + goto bail; + Py_CLEAR(key); + Py_CLEAR(val); + if (PyList_Append(pairs, item) == -1) { + Py_DECREF(item); + goto bail; + } Py_DECREF(item); @@ -1119,3 +1184,2 @@ Py_DECREF(item); - goto bail; } @@ -1121,5 +1185,10 @@ } - Py_DECREF(item); + else { + if (PyDict_SetItem(rval, key, val) < 0) + goto bail; + Py_CLEAR(key); + Py_CLEAR(val); + } idx = next_idx; /* skip whitespace before } or , */ @@ -1157,12 +1226,6 @@ return val; } - rval = PyObject_CallFunctionObjArgs((PyObject *)(&PyDict_Type), - pairs, NULL); - if (rval == NULL) - goto bail; - Py_CLEAR(pairs); - /* if object_hook is not None: rval = object_hook(rval) */ if (s->object_hook != Py_None) { val = PyObject_CallFunctionObjArgs(s->object_hook, rval, NULL); @@ -1175,6 +1238,7 @@ *next_idx_ptr = idx + 1; return rval; bail: + Py_XDECREF(rval); Py_XDECREF(key); Py_XDECREF(val); Py_XDECREF(pairs); @@ -1723,6 +1787,7 @@ Py_TYPE(pystr)->tp_name); return NULL; } + PyDict_Clear(s->memo); return _build_rval_index_tuple(rval, next_idx); } @@ -1756,6 +1821,12 @@ if (!PyArg_ParseTupleAndKeywords(args, kwds, "O:make_scanner", kwlist, &ctx)) return -1; + + if (s->memo == NULL) { + s->memo = PyDict_New(); + if (s->memo == NULL) + goto bail; + } /* PyString_AS_STRING is used on encoding */ s->encoding = PyObject_GetAttrString(ctx, "encoding"); diff --git a/simplejson/decoder.py b/simplejson/decoder.py index ed9859bb61691e7c9fcec65d568d87bbaa1a391b_c2ltcGxlanNvbi9kZWNvZGVyLnB5..1f749686d39ef9a7917500b201848dc8873d4172_c2ltcGxlanNvbi9kZWNvZGVyLnB5 100644 --- a/simplejson/decoder.py +++ b/simplejson/decoder.py @@ -175,7 +175,12 @@ WHITESPACE_STR = ' \t\n\r' def JSONObject((s, end), encoding, strict, scan_once, object_hook, - object_pairs_hook, _w=WHITESPACE.match, _ws=WHITESPACE_STR): + object_pairs_hook, memo=None, + _w=WHITESPACE.match, _ws=WHITESPACE_STR): + # Backwards compatibility + if memo is None: + memo = {} + memo_get = memo.setdefault pairs = [] # Use a slice to prevent IndexError from being raised, the following # check will raise a more specific ValueError if the string is empty @@ -199,6 +204,7 @@ end += 1 while True: key, end = scanstring(s, end, encoding, strict) + key = memo_get(key, key) # To skip some function call overhead we optimize the fast paths where # the JSON key separator is ": " or just ":". @@ -382,6 +388,7 @@ self.parse_object = JSONObject self.parse_array = JSONArray self.parse_string = scanstring + self.memo = {} self.scan_once = make_scanner(self) def decode(self, s, _w=WHITESPACE.match): diff --git a/simplejson/scanner.py b/simplejson/scanner.py index ed9859bb61691e7c9fcec65d568d87bbaa1a391b_c2ltcGxlanNvbi9zY2FubmVyLnB5..1f749686d39ef9a7917500b201848dc8873d4172_c2ltcGxlanNvbi9zY2FubmVyLnB5 100644 --- a/simplejson/scanner.py +++ b/simplejson/scanner.py @@ -24,6 +24,7 @@ parse_constant = context.parse_constant object_hook = context.object_hook object_pairs_hook = context.object_pairs_hook + memo = context.memo def _scan_once(string, idx): try: @@ -35,7 +36,7 @@ return parse_string(string, idx + 1, encoding, strict) elif nextchar == '{': return parse_object((string, idx + 1), encoding, strict, - _scan_once, object_hook, object_pairs_hook) + _scan_once, object_hook, object_pairs_hook, memo) elif nextchar == '[': return parse_array((string, idx + 1), _scan_once) elif nextchar == 'n' and string[idx:idx + 4] == 'null': @@ -62,6 +63,12 @@ else: raise StopIteration - return _scan_once + def scan_once(string, idx): + try: + return _scan_once(string, idx) + finally: + memo.clear() + + return scan_once make_scanner = c_make_scanner or py_make_scanner diff --git a/simplejson/tests/test_decode.py b/simplejson/tests/test_decode.py index ed9859bb61691e7c9fcec65d568d87bbaa1a391b_c2ltcGxlanNvbi90ZXN0cy90ZXN0X2RlY29kZS5weQ==..1f749686d39ef9a7917500b201848dc8873d4172_c2ltcGxlanNvbi90ZXN0cy90ZXN0X2RlY29kZS5weQ== 100644 --- a/simplejson/tests/test_decode.py +++ b/simplejson/tests/test_decode.py @@ -6,6 +6,10 @@ from simplejson import OrderedDict class TestDecode(TestCase): + if not hasattr(TestCase, 'assertIs'): + def assertIs(self, a, b): + self.assert_(a is b, '%r is %r' % (a, b)) + def test_decimal(self): rval = json.loads('1.1', parse_float=decimal.Decimal) self.assert_(isinstance(rval, decimal.Decimal)) @@ -47,3 +51,18 @@ object_pairs_hook=OrderedDict, object_hook=lambda x: None), OrderedDict(p)) + + def check_keys_reuse(self, source, loads): + rval = loads(source) + (a, b), (c, d) = sorted(rval[0]), sorted(rval[1]) + self.assertIs(a, c) + self.assertIs(b, d) + + def test_keys_reuse_str(self): + s = u'[{"a_key": 1, "b_\xe9": 2}, {"a_key": 3, "b_\xe9": 4}]'.encode('utf8') + self.check_keys_reuse(s, json.loads) + + def test_keys_reuse_unicode(self): + s = u'[{"a_key": 1, "b_\xe9": 2}, {"a_key": 3, "b_\xe9": 4}]' + self.check_keys_reuse(s, json.loads) +