generalize BOM stripping to any use of raw_decode

86d8873cb842 · Bob Ippolito · 457493073c75
Commit 86d8873cb842 authored 11 years ago by Bob Ippolito
--- a/CHANGES.txt
+++ b/CHANGES.txt
+Version 3.6.0 released 2014-07-21
+* Automatically strip any UTF-8 BOM from input to more closely
+  follow the latest specs
+  https://github.com/simplejson/simplejson/pull/101
 Version 3.5.3 released 2014-06-24
 * Fix lower bound checking in scan_once / raw_decode API

--- a/conf.py
+++ b/conf.py
@@ -42,5 +42,5 @@
 # other places throughout the built documents.
 #
 # The short X.Y version.
-version = '3.5'
+version = '3.6'
 # The full version, including alpha/beta/rc tags.
@@ -46,5 +46,5 @@
 # The full version, including alpha/beta/rc tags.
-release = '3.5.3'
+release = '3.6.0'
 # There are two options for replacing |today|: either, you set today to some
 # non-false value, then it is used:

--- a/simplejson/__init__.py
+++ b/simplejson/__init__.py
@@ -98,7 +98,7 @@
    Expecting property name: line 1 column 3 (char 2)
 """
 from __future__ import absolute_import
-__version__ = '3.5.3'
+__version__ = '3.6.0'
 __all__ = [
    'dump', 'dumps', 'load', 'loads',
    'JSONDecoder', 'JSONDecodeError', 'JSONEncoder',
@@ -437,16 +437,7 @@
    of subclassing whenever possible.
    """
-    # Strip the UTF-8 BOM
+    return loads(fp.read(),
-    contents = fp.read()
-    ord0 = ord(contents[0])
-    if ord0 in (0xef, 0xfeff):
-        if ord0 == 0xfeff:
-            contents = contents[1:]
-        elif contents[:3] == '\xef\xbb\xbf':
-            contents = contents[3:]
-    return loads(contents,
        encoding=encoding, cls=cls, object_hook=object_hook,
        parse_float=parse_float, parse_int=parse_int,
        parse_constant=parse_constant, object_pairs_hook=object_pairs_hook,

--- a/simplejson/decoder.py
+++ b/simplejson/decoder.py
@@ -390,4 +390,11 @@
            raise JSONDecodeError('Expecting value', s, idx)
        if _PY3 and not isinstance(s, text_type):
            raise TypeError("Input string must be text, not bytes")
+        # strip UTF-8 bom
+        if len(s) > idx:
+            ord0 = ord(s[idx])
+            if ord0 == 0xfeff:
+                idx += 1
+            elif ord0 == 0xef and s[idx:idx + 3] == '\xef\xbb\xbf':
+                idx += 3
        return self.scan_once(s, idx=_w(s, idx).end())
--- a/simplejson/tests/test_unicode.py
+++ b/simplejson/tests/test_unicode.py
 import sys
-import os.path
+import codecs
 from unittest import TestCase
 import simplejson as json
@@ -3,7 +3,7 @@
 from unittest import TestCase
 import simplejson as json
-from simplejson.compat import unichr, text_type, b, u
+from simplejson.compat import unichr, text_type, b, u, BytesIO
 class TestUnicode(TestCase):
    def test_encoding1(self):
@@ -146,9 +146,8 @@
            '"' + c + '"')
    def test_strip_bom(self):
-        thisdir = os.path.dirname(__file__)
+        content = u"\u3053\u3093\u306b\u3061\u308f"
-        json_file = os.path.join(thisdir, "utf-8-bom.json")
+        json_doc = codecs.BOM_UTF8 + b(json.dumps(content))
-        doc_ascii = {
+        self.assertEqual(json.load(BytesIO(json_doc)), content)
-            u"content": u"\u3053\u3093\u306b\u3061\u308f"
+        for doc in json_doc, json_doc.decode('utf8'):
-        }
+            self.assertEqual(json.loads(doc), content)
-        self.assertEqual(json.load(open(json_file)), doc_ascii)
--- a/simplejson/tests/utf-8-bom.json
+++ b/simplejson/tests/utf-8-bom.json
-{
-    "content": "こんにちわ"
-}