Merge pull request #8 from pjdelport/hack-around-python-2-ascii

PiDelport · web-flow · commit 9aec8f80274e · 2017-09-20T20:32:14.000+02:00
Hack around Python 2 ASCII encoding bug / incompatibility
diff --git a/.coveragerc b/.coveragerc
@@ -2,6 +2,9 @@
 [run]
 branch = True
 
+# We seem to need timid mode to get correct results.
+timid = True
+
 source =
     src
     tests
diff --git a/src/backports/os.py b/src/backports/os.py
@@ -103,6 +103,9 @@ def _fscodec():
     import codecs  # Use codecs.lookup() for name normalisation.
     _HACK_AROUND_PY2_UTF8 = (sys.version_info < (3,) and
                              codecs.lookup(encoding) == codecs.lookup('utf-8'))
+    # Do we need to hack around Python 2's ASCII codec error handler behaviour?
+    _HACK_AROUND_PY2_ASCII = (sys.version_info < (3,) and
+                              codecs.lookup(encoding) == codecs.lookup('ascii'))
 
     # XXX backport: chr(octet) became bytes([octet])
     _byte = chr if sys.version_info < (3,) else lambda i: bytes([i])
@@ -116,7 +119,7 @@ def fsencode(filename):
         if isinstance(filename, bytes):
             return filename
         elif isinstance(filename, _str):
-            if _HACK_AROUND_PY2_UTF8:
+            if _HACK_AROUND_PY2_UTF8 or _HACK_AROUND_PY2_ASCII:
                 # XXX backport: Unlike Python 3, Python 2's UTF-8 codec does not
                 # consider surrogate codepoints invalid, so the surrogateescape
                 # error handler never gets invoked to encode them back into high
@@ -125,6 +128,16 @@ def fsencode(filename):
                 # This code hacks around that by manually encoding the surrogate
                 # codepoints to high bytes, without relying on surrogateescape.
                 #
+                # As a *separate* issue to the above, Python 2's ASCII codec
+                # has a different problem: it correctly invokes the
+                # surrogateescape error handler, but then seems to do
+                # additional strict validation (?) on the interim
+                # surrogate-decoded Unicode buffer returned by surrogateescape,
+                # and then fails with a UnicodeEncodeError anyway.
+                #
+                # The fix for that happens to be the same (manual encoding),
+                # even though the two causes are quite different.
+                #
                 return b''.join(
                     (_byte(ord(c) - 0xDC00) if 0xDC00 <= ord(c) <= 0xDCFF else
                      c.encode(encoding))
diff --git a/tests/test_extra.py b/tests/test_extra.py
@@ -4,8 +4,10 @@
 """
 from __future__ import unicode_literals
 
+import codecs
 import os as real_os
 import sys
+from functools import partial
 
 from backports import os
 
@@ -46,6 +48,14 @@
 UTF8_ENCODED_SURROGATE = b'\xed\xb0\x80'
 
 
+# Helper strategy: If the filesystem encoding is ASCII,
+# limit the set of valid text to encode to ASCII too.
+FILESYSTEM_IS_ASCII = codecs.lookup(sys.getfilesystemencoding()) == codecs.lookup('ascii')
+ASCII = ''.join(chr(i) for i in range(128))
+encodable_text = (partial(text, alphabet=ASCII) if FILESYSTEM_IS_ASCII else
+                  text)
+
+
 class ExtraFSEncodingTests(unittest.TestCase):
 
     def test_encode_surrogates(self):
@@ -60,7 +70,7 @@ def test_decode_surrogates(self):
         """
         self.assertEqual(os.fsdecode(HIGH_BYTES), HIGH_SURROGATES)
 
-    @given(text())
+    @given(encodable_text())
     @example(HIGH_SURROGATES)
     def test_text_roundtrip(self, s):
         self.assertEqual(os.fsdecode(os.fsencode(s)), s)
@@ -92,7 +102,7 @@ class TestAgainstPython3(unittest.TestCase):
     On Python 3, the backported implementations should match the standard library.
     """
 
-    @given(text())
+    @given(encodable_text())
     @example(HIGH_SURROGATES)
     def test_encode_text(self, s):
         self.assertEqual(os.fsencode(s), real_os.fsencode(s))
diff --git a/tests/test_os.py b/tests/test_os.py
@@ -20,4 +20,16 @@ def test_identity(self):
                 bytesfn = os.fsencode(fn)
             except UnicodeEncodeError:
                 continue
+
+            # XXX backport: Ignore bug in future.utils.surrogateescape.replace_surrogate_encode()
+            # by treating the below NameError like the above UnicodeEncodeError.
+            #
+            # Bug: https://github.com/PythonCharmers/python-future/issues/256
+            # (This workaround can be removed once that is fixed.)
+            except NameError as e:  # pragma: no cover
+                if e.message == "global name 'exc' is not defined":
+                    continue
+                else:
+                    raise
+
             self.assertEqual(os.fsdecode(bytesfn), fn)
diff --git a/tox.ini b/tox.ini
@@ -24,13 +24,20 @@ deps =
 usedevelop =
     codecov: true
 
+whitelist_externals =
+    env
+
+# Note: This runs the test suite twice: once with the current locale's
+# encoding, and once with LANG empty, to test against ASCII.
 commands =
     # XXX: This will currently run the tests twice under codecov, but oh well.
     # TODO: Use a factor-based override or negation for this sometime?
     # See:
     # https://github.com/tox-dev/tox/issues/189
     # https://github.com/tox-dev/tox/issues/292
     python -m unittest discover tests
+    env LANG= python -m unittest discover tests
 
     codecov: coverage run -m unittest discover tests
+    codecov: env LANG= coverage run --append -m unittest discover tests
     codecov: codecov -e TOXENV