# HG changeset patch
# User Bob Ippolito <bob@redivi.com>
# Date 1222137286 0
#      Tue Sep 23 02:34:46 2008 +0000
# Node ID 376baf4b841e9b3909ffe036a5482ac769dacd99
# Parent 78543dd76290bfd36ae18700f314682b715b4ea1
ugly hacks to decoder to avoid dispatch for roughly 2x speedup

git-svn-id: http://simplejson.googlecode.com/svn/trunk@100 a4795897-2c25-0410-b006-0d3caba88fa1

diff --git a/simplejson/decoder.py b/simplejson/decoder.py
--- a/simplejson/decoder.py
+++ b/simplejson/decoder.py
@@ -4,7 +4,7 @@
 import re
 import sys
 
-from simplejson.scanner import Scanner, pattern
+from simplejson.scanner import make_scanner, pattern
 try:
     from simplejson._speedups import scanstring as c_scanstring
 except ImportError:
@@ -58,20 +58,21 @@
         rval = c[s]
     else:
         rval = fn(s)
-    return rval, None
-pattern('(-?Infinity|NaN|true|false|null)')(JSONConstant)
+    return rval, match.end()
+pattern(r'(-?Infinity|NaN|true|false|null)')(JSONConstant)
 
 
 def JSONNumber(match, context):
-    match = JSONNumber.regex.match(match.string, *match.span())
-    integer, frac, exp = match.groups()
+    # m1 = JSONNumber.regex.match(match.string, *match.span())
+    # assert m1.groups()[:3] == match.groups()[:3]
+    integer, frac, exp = match.groups()[:3]
     if frac or exp:
         fn = getattr(context, 'parse_float', None) or float
         res = fn(integer + (frac or '') + (exp or ''))
     else:
         fn = getattr(context, 'parse_int', None) or int
         res = fn(integer)
-    return res, None
+    return res, match.end()
 pattern(r'(-?(?:0|[1-9]\d*))(\.\d+)?([eE][-+]?\d+)?')(JSONNumber)
 
 
@@ -149,52 +150,82 @@
 # Use speedup if available
 scanstring = c_scanstring or py_scanstring
 
-def JSONString(match, context):
+def JSONString((string, end), context):
     encoding = getattr(context, 'encoding', None)
     strict = getattr(context, 'strict', True)
-    return scanstring(match.string, match.end(), encoding, strict)
+    return scanstring(string, end, encoding, strict)
 pattern(r'"')(JSONString)
 
-WHITESPACE = re.compile(r'\s*', FLAGS)
+WHITESPACE = re.compile(r'[ \t\n\r]*', FLAGS)
+WHITESPACE_STR = ' \t\n\r'
 
 
-def JSONObject(match, context, _w=WHITESPACE.match):
+def JSONObject((s, end), context, _w=WHITESPACE.match, _ws=WHITESPACE_STR):
     pairs = {}
-    s = match.string
-    end = _w(s, match.end()).end()
     nextchar = s[end:end + 1]
-    # Trivial empty object
-    if nextchar == '}':
-        return pairs, end + 1
+    # Normally we expect nextchar == '"'
     if nextchar != '"':
-        raise ValueError(errmsg("Expecting property name", s, end))
+        if nextchar in _ws:
+            end = _w(s, end).end()
+            nextchar = s[end:end + 1]
+        # Trivial empty object
+        if nextchar == '}':
+            return pairs, end + 1
+        elif nextchar != '"':
+            raise ValueError(errmsg("Expecting property name", s, end))
     end += 1
     encoding = getattr(context, 'encoding', None)
     strict = getattr(context, 'strict', True)
-    iterscan = JSONScanner.iterscan
+    scan_once = JSONScanner
     while True:
         key, end = scanstring(s, end, encoding, strict)
-        end = _w(s, end).end()
+
+        # To skip some function call overhead we optimize the fast paths where
+        # the JSON key separator is ": " or just ":".
         if s[end:end + 1] != ':':
-            raise ValueError(errmsg("Expecting : delimiter", s, end))
-        end = _w(s, end + 1).end()
+            end = _w(s, end).end()
+            if s[end:end + 1] != ':':
+                raise ValueError(errmsg("Expecting : delimiter", s, end))
+
+        end += 1
+
         try:
-            value, end = iterscan(s, idx=end, context=context).next()
+            if s[end] in _ws:
+                end += 1
+                if s[end] in _ws:
+                    end = _w(s, end).end()
+        except IndexError:
+            pass
+
+        try:
+            value, end = scan_once(s, end, context)
         except StopIteration:
             raise ValueError(errmsg("Expecting object", s, end))
         pairs[key] = value
-        end = _w(s, end).end()
         nextchar = s[end:end + 1]
+        if nextchar in _ws:
+            end = _w(s, end).end()
+            nextchar = s[end:end + 1]
         end += 1
+
         if nextchar == '}':
             break
-        if nextchar != ',':
+        elif nextchar != ',':
             raise ValueError(errmsg("Expecting , delimiter", s, end - 1))
-        end = _w(s, end).end()
+
+        try:
+            if s[end] in _ws:
+                end += 1
+                if s[end] in _ws:
+                    end = _w(s, end).end()
+        except IndexError:
+            pass
+
         nextchar = s[end:end + 1]
         end += 1
         if nextchar != '"':
             raise ValueError(errmsg("Expecting property name", s, end - 1))
+
     object_hook = getattr(context, 'object_hook', None)
     if object_hook is not None:
         pairs = object_hook(pairs)
@@ -202,29 +233,40 @@
 pattern(r'{')(JSONObject)
 
 
-def JSONArray(match, context, _w=WHITESPACE.match):
+def JSONArray((s, end), context, _w=WHITESPACE.match, _ws=WHITESPACE_STR):
     values = []
-    s = match.string
-    end = _w(s, match.end()).end()
+    nextchar = s[end:end + 1]
+    if nextchar in _ws:
+        end = _w(s, end).end()
+        nextchar = s[end:end + 1]
     # Look-ahead for trivial empty array
-    nextchar = s[end:end + 1]
     if nextchar == ']':
         return values, end + 1
-    iterscan = JSONScanner.iterscan
+    scan_once = JSONScanner
     while True:
         try:
-            value, end = iterscan(s, idx=end, context=context).next()
+            value, end = scan_once(s, end, context)
         except StopIteration:
             raise ValueError(errmsg("Expecting object", s, end))
         values.append(value)
-        end = _w(s, end).end()
         nextchar = s[end:end + 1]
+        if nextchar in _ws:
+            end = _w(s, end).end()
+            nextchar = s[end:end + 1]
         end += 1
         if nextchar == ']':
             break
         if nextchar != ',':
             raise ValueError(errmsg("Expecting , delimiter", s, end))
-        end = _w(s, end).end()
+
+        try:
+            if s[end] in _ws:
+                end += 1
+                if s[end] in _ws:
+                    end = _w(s, end).end()
+        except IndexError:
+            pass
+
     return values, end
 pattern(r'\[')(JSONArray)
 
@@ -237,7 +279,7 @@
     JSONNumber,
 ]
 
-JSONScanner = Scanner(ANYTHING)
+JSONScanner = make_scanner(ANYTHING)
 
 
 class JSONDecoder(object):
@@ -270,7 +312,6 @@
     their corresponding ``float`` values, which is outside the JSON spec.
 
     """
-    _scanner = Scanner(ANYTHING)
     __all__ = ['__init__', 'decode', 'raw_decode']
 
     def __init__(self, encoding=None, object_hook=None, parse_float=None,
@@ -330,9 +371,10 @@
         This can be used to decode a JSON document from a string that may
         have extraneous data at the end.
""" - kw.setdefault('context', self) + idx = kw.get('idx', 0) + context = kw.get('context', self) try: - obj, end = self._scanner.iterscan(s, **kw).next() + obj, end = JSONScanner(s, idx, context) except StopIteration: raise ValueError("No JSON object could be decoded") return obj, end diff --git a/simplejson/scanner.py b/simplejson/scanner.py --- a/simplejson/scanner.py +++ b/simplejson/scanner.py @@ -8,56 +8,51 @@ import sre_constants from sre_constants import BRANCH, SUBPATTERN -__all__ = ['Scanner', 'pattern'] +__all__ = ['make_scanner', 'pattern'] FLAGS = (VERBOSE | MULTILINE | DOTALL) -class Scanner(object): - def __init__(self, lexicon, flags=FLAGS): - self.actions = [None] - # Combine phrases into a compound pattern - s = sre_parse.Pattern() - s.flags = flags - p = [] - for idx, token in enumerate(lexicon): - phrase = token.pattern - try: - subpattern = sre_parse.SubPattern(s, - [(SUBPATTERN, (idx + 1, sre_parse.parse(phrase, flags)))]) - except sre_constants.error: - raise - p.append(subpattern) - self.actions.append(token) - - s.groups = len(p) + 1 # NOTE(guido): Added to make SRE validation work - p = sre_parse.SubPattern(s, [(BRANCH, (None, p))]) - self.scanner = sre_compile.compile(p) +def make_scanner(lexicon, flags=FLAGS): + actions = [None] + # Combine phrases into a compound pattern + s = sre_parse.Pattern() + s.flags = flags + charpatterns = {} + p = [] + idx = 0 + for token in lexicon: + if token.pattern in (r'\[', r'{', r'"'): + charpatterns[token.pattern[-1]] = token + idx += 1 + phrase = token.pattern + try: + subpattern = sre_parse.SubPattern(s, + [(SUBPATTERN, (idx, sre_parse.parse(phrase, flags)))]) + except sre_constants.error: + raise + p.append(subpattern) + actions.append(token) - def iterscan(self, string, idx=0, context=None): - """ - Yield match, end_idx for each match - """ - match = self.scanner.scanner(string, idx).match - actions = self.actions - lastend = idx - end = len(string) - while True: - m = match() - if m is None: - break - matchbegin, matchend = m.span() - if lastend == matchend: - break - action = actions[m.lastindex] - if action is not None: - rval, next_pos = action(m, context) - if next_pos is not None and next_pos != matchend: - # "fast forward" the scanner - matchend = next_pos - match = self.scanner.scanner(string, matchend).match - yield rval, matchend - lastend = matchend + s.groups = len(p) + 1 # NOTE(guido): Added to make SRE validation work + p = sre_parse.SubPattern(s, [(BRANCH, (None, p))]) + scanner = sre_compile.compile(p).scanner + def _scan_once(string, idx=0, context=None): + try: + action = charpatterns[string[idx]] + except KeyError: + pass + except IndexError: + raise StopIteration + else: + return action((string, idx + 1), context) + + m = scanner(string, idx).match() + if m is None or m.end() == idx: + raise StopIteration + return actions[m.lastindex](m, context) + + return _scan_once def pattern(pattern, flags=FLAGS): def decorator(fn): diff --git a/simplejson/tests/test_decode.py b/simplejson/tests/test_decode.py --- a/simplejson/tests/test_decode.py +++ b/simplejson/tests/test_decode.py @@ -13,3 +13,10 @@ rval = S.loads('1', parse_int=float) self.assert_(isinstance(rval, float)) self.assertEquals(rval, 1.0) + + def test_decoder_optimizations(self): + # Several optimizations were made that skip over calls to + # the whitespace regex, so this test is designed to try and + # exercise the uncommon cases. The array cases are already covered. 
+        rval = S.loads('{ "key" : "value" , "k":"v" }')
+        self.assertEquals(rval, {"key":"value", "k":"v"})
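
Note (not part of the patch): the core of the speedup is the scanner.py rewrite above. The old Scanner class combined every token pattern into one branching regex, and its iterscan() generator looked up the action through m.lastindex for every token; the new make_scanner() closure peeks at the first character and, for '{', '[' and '"', calls the matching parser directly, falling back to the compiled regex only for numbers and constants. The following self-contained sketch illustrates that dispatch idea only; it is not code from this changeset, the names (DISPATCH, parse_array, parse_string, scan_once) are invented for the example, and the string handling is deliberately toy.

    import re

    NUMBER_RE = re.compile(r'(-?(?:0|[1-9]\d*))(\.\d+)?([eE][-+]?\d+)?')
    STRING_RE = re.compile(r'"([^"\\]*)"')   # toy strings: no escape handling
    WHITESPACE = re.compile(r'[ \t\n\r]*')

    def parse_string(s, idx):
        # idx points at the opening quote
        m = STRING_RE.match(s, idx)
        if m is None:
            raise ValueError('invalid string at %d' % idx)
        return m.group(1), m.end()

    def parse_array(s, idx):
        # idx points at the opening bracket
        values = []
        idx = WHITESPACE.match(s, idx + 1).end()
        if s[idx:idx + 1] == ']':
            return values, idx + 1
        while True:
            value, idx = scan_once(s, idx)
            values.append(value)
            idx = WHITESPACE.match(s, idx).end()
            nextchar = s[idx:idx + 1]
            idx += 1
            if nextchar == ']':
                return values, idx
            if nextchar != ',':
                raise ValueError('expected , or ] at %d' % (idx - 1))

    # One dict lookup on the first character replaces the branch-regex dispatch.
    DISPATCH = {'"': parse_string, '[': parse_array}

    def scan_once(s, idx):
        idx = WHITESPACE.match(s, idx).end()
        try:
            action = DISPATCH[s[idx]]
        except KeyError:
            pass          # not a string or array: fall through to the number regex
        except IndexError:
            raise ValueError('nothing to decode at %d' % idx)
        else:
            return action(s, idx)
        m = NUMBER_RE.match(s, idx)
        if m is None:
            raise ValueError('unexpected character at %d' % idx)
        integer, frac, exp = m.groups()
        return (float(m.group()) if frac or exp else int(integer)), m.end()

    print(scan_once(' [1, "two", 3.5]', 0))   # -> ([1, 'two', 3.5], 16)

A dict lookup plus a direct function call per token is what removes the per-token generator resumption and group bookkeeping that iterscan() paid for.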
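
The decoder.py hunks apply a second, smaller trick in JSONObject and JSONArray: each end = _w(s, end).end() between tokens is replaced by an inline s[end] in ' \t\n\r' test, so documents with no whitespace or a single space between tokens never reach the regex engine. A hypothetical helper showing the same logic (the patch inlines it rather than calling a function, to avoid the very call overhead it is trying to remove):

    import re

    WHITESPACE = re.compile(r'[ \t\n\r]*')
    WHITESPACE_STR = ' \t\n\r'

    def skip_whitespace(s, end, _w=WHITESPACE.match, _ws=WHITESPACE_STR):
        # Most JSON has zero or one whitespace character between tokens, so
        # test a single character with `in` before paying for a regex match.
        try:
            if s[end] in _ws:
                end += 1
                if s[end] in _ws:
                    end = _w(s, end).end()
        except IndexError:
            pass
        return end

    print(skip_whitespace('{"a": 1}', 5))      # single space, no regex call -> 6
    print(skip_whitespace('{"a":   1}', 5))    # run of spaces, falls back to the regex -> 8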
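
Finally, the raw_decode() docstring kept as context in the last decoder.py hunk describes the contract the rewrite preserves: parse one document starting at the given index and report where it stopped. A short usage sketch against the public simplejson API; the asserted values are what that contract implies, not output captured from this revision:

    import simplejson

    decoder = simplejson.JSONDecoder()
    # raw_decode() parses a single document and reports where it stopped,
    # leaving any trailing data to the caller.
    obj, end = decoder.raw_decode('{"key": "value"} trailing text')
    assert obj == {'key': 'value'}
    assert end == 16   # index just past the closing brace

    # The new test case exercises the same ":" and "," whitespace fast paths via loads().
    assert simplejson.loads('{ "key" : "value" , "k":"v" }') == {'key': 'value', 'k': 'v'}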