Skip to content

Commit 9aec8f8

Browse files
authored
Merge pull request #8 from pjdelport/hack-around-python-2-ascii
Hack around Python 2 ASCII encoding bug / incompatibility
2 parents 848c092 + f69d58f commit 9aec8f8

File tree

5 files changed

+48
-3
lines changed

5 files changed

+48
-3
lines changed

.coveragerc

+3
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,9 @@
22
[run]
33
branch = True
44

5+
# We seem to need timid mode to get correct results.
6+
timid = True
7+
58
source =
69
src
710
tests

src/backports/os.py

+14-1
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,9 @@ def _fscodec():
103103
import codecs # Use codecs.lookup() for name normalisation.
104104
_HACK_AROUND_PY2_UTF8 = (sys.version_info < (3,) and
105105
codecs.lookup(encoding) == codecs.lookup('utf-8'))
106+
# Do we need to hack around Python 2's ASCII codec error handler behaviour?
107+
_HACK_AROUND_PY2_ASCII = (sys.version_info < (3,) and
108+
codecs.lookup(encoding) == codecs.lookup('ascii'))
106109

107110
# XXX backport: chr(octet) became bytes([octet])
108111
_byte = chr if sys.version_info < (3,) else lambda i: bytes([i])
@@ -116,7 +119,7 @@ def fsencode(filename):
116119
if isinstance(filename, bytes):
117120
return filename
118121
elif isinstance(filename, _str):
119-
if _HACK_AROUND_PY2_UTF8:
122+
if _HACK_AROUND_PY2_UTF8 or _HACK_AROUND_PY2_ASCII:
120123
# XXX backport: Unlike Python 3, Python 2's UTF-8 codec does not
121124
# consider surrogate codepoints invalid, so the surrogateescape
122125
# error handler never gets invoked to encode them back into high
@@ -125,6 +128,16 @@ def fsencode(filename):
125128
# This code hacks around that by manually encoding the surrogate
126129
# codepoints to high bytes, without relying on surrogateescape.
127130
#
131+
# As a *separate* issue to the above, Python 2's ASCII codec has
132+
# a different problem: it correctly invokes the surrogateescape
133+
# error handler, but then seems to do additional strict
134+
# validation (?) on the interim surrogate-decoded Unicode buffer
135+
# returned by surrogateescape, and then fails with a
136+
# UnicodeEncodeError anyway.
137+
#
138+
# The fix for that happens to be the same (manual encoding),
139+
# even though the two causes are quite different.
140+
#
128141
return b''.join(
129142
(_byte(ord(c) - 0xDC00) if 0xDC00 <= ord(c) <= 0xDCFF else
130143
c.encode(encoding))

tests/test_extra.py

+12-2
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,10 @@
44
"""
55
from __future__ import unicode_literals
66

7+
import codecs
78
import os as real_os
89
import sys
10+
from functools import partial
911

1012
from backports import os
1113

@@ -46,6 +48,14 @@
4648
UTF8_ENCODED_SURROGATE = b'\xed\xb0\x80'
4749

4850

51+
# Helper strategy: If the filesystem encoding is ASCII,
52+
# restrict the generated text to ASCII characters too.
53+
FILESYSTEM_IS_ASCII = codecs.lookup(sys.getfilesystemencoding()) == codecs.lookup('ascii')
54+
ASCII = ''.join(chr(i) for i in range(128))
55+
encodable_text = (partial(text, alphabet=ASCII) if FILESYSTEM_IS_ASCII else
56+
text)
57+
58+
4959
class ExtraFSEncodingTests(unittest.TestCase):
5060

5161
def test_encode_surrogates(self):
@@ -60,7 +70,7 @@ def test_decode_surrogates(self):
6070
"""
6171
self.assertEqual(os.fsdecode(HIGH_BYTES), HIGH_SURROGATES)
6272

63-
@given(text())
73+
@given(encodable_text())
6474
@example(HIGH_SURROGATES)
6575
def test_text_roundtrip(self, s):
6676
self.assertEqual(os.fsdecode(os.fsencode(s)), s)
@@ -92,7 +102,7 @@ class TestAgainstPython3(unittest.TestCase):
92102
On Python 3, the backported implementations should match the standard library.
93103
"""
94104

95-
@given(text())
105+
@given(encodable_text())
96106
@example(HIGH_SURROGATES)
97107
def test_encode_text(self, s):
98108
self.assertEqual(os.fsencode(s), real_os.fsencode(s))

tests/test_os.py

+12
Original file line numberDiff line numberDiff line change
@@ -20,4 +20,16 @@ def test_identity(self):
2020
bytesfn = os.fsencode(fn)
2121
except UnicodeEncodeError:
2222
continue
23+
24+
# XXX backport: Ignore bug in future.utils.surrogateescape.replace_surrogate_encode()
25+
# by treating the below NameError like the above UnicodeEncodeError.
26+
#
27+
# Bug: https://github.com/PythonCharmers/python-future/issues/256
28+
# (This workaround can be removed once that is fixed.)
29+
except NameError as e: # pragma: no cover
30+
if e.message == "global name 'exc' is not defined":
31+
continue
32+
else:
33+
raise
34+
2335
self.assertEqual(os.fsdecode(bytesfn), fn)

tox.ini

+7
Original file line numberDiff line numberDiff line change
@@ -24,13 +24,20 @@ deps =
2424
usedevelop =
2525
codecov: true
2626

27+
whitelist_externals =
28+
env
29+
30+
# Note: This runs the test suite with both the current locale's encoding,
31+
# and with LANG empty, to test against ASCII.
2732
commands =
2833
# XXX: This will currently run the tests twice under codecov, but oh well.
2934
# TODO: Use a factor-based override or negation for this sometime?
3035
# See:
3136
# https://github.com/tox-dev/tox/issues/189
3237
# https://github.com/tox-dev/tox/issues/292
3338
python -m unittest discover tests
39+
env LANG= python -m unittest discover tests
3440

3541
codecov: coverage run -m unittest discover tests
42+
codecov: env LANG= coverage run --append -m unittest discover tests
3643
codecov: codecov -e TOXENV

0 commit comments

Comments
 (0)