Skip to content

Commit 8baff4c

Browse files
gh-136736: Fix handling of alphanumeric non-ASCII characters in encodings.normalize_encoding()
1 parent 180b3eb commit 8baff4c

File tree

3 files changed

+16
-3
lines changed

3 files changed

+16
-3
lines changed

Lib/encodings/__init__.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -58,11 +58,10 @@ def normalize_encoding(encoding):
5858
chars = []
5959
punct = False
6060
for c in encoding:
61-
if c.isalnum() or c == '.':
61+
if c.isascii() and (c.isalnum() or c == '.'):
6262
if punct and chars:
6363
chars.append('_')
64-
if c.isascii():
65-
chars.append(c)
64+
chars.append(c)
6665
punct = False
6766
else:
6867
punct = True

Lib/test/test_codecs.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3888,6 +3888,12 @@ def search_function(encoding):
38883888
self.assertEqual(FOUND, codecs.lookup('AAA---8'))
38893889
self.assertEqual(FOUND, codecs.lookup('AAA 8'))
38903890
self.assertEqual(FOUND, codecs.lookup('aaa\xe9\u20ac-8'))
3891+
self.assertEqual(FOUND, codecs.lookup('aaa\xe98'))
3892+
self.assertEqual(FOUND, codecs.lookup('aaa\u20ac8'))
3893+
self.assertEqual(FOUND, codecs.lookup('aaa-\xe9-8'))
3894+
self.assertEqual(FOUND, codecs.lookup('aaa-\u20ac-8'))
3895+
self.assertEqual(FOUND, codecs.lookup('aaa-8-\xe9'))
3896+
self.assertEqual(FOUND, codecs.lookup('aaa-8-\u20ac'))
38913897
self.assertEqual(NOT_FOUND, codecs.lookup('AAA.8'))
38923898
self.assertEqual(NOT_FOUND, codecs.lookup('AAA...8'))
38933899
self.assertEqual(NOT_FOUND, codecs.lookup('BBB-8'))
@@ -3899,6 +3905,12 @@ def test_encodings_normalize_encoding(self):
38993905
normalize = encodings.normalize_encoding
39003906
self.assertEqual(normalize('utf_8'), 'utf_8')
39013907
self.assertEqual(normalize('utf\xE9\u20AC\U0010ffff-8'), 'utf_8')
3908+
self.assertEqual(normalize('utf\xe98'), 'utf_8')
3909+
self.assertEqual(normalize('utf\u20ac8'), 'utf_8')
3910+
self.assertEqual(normalize('utf-\xe9-8'), 'utf_8')
3911+
self.assertEqual(normalize('utf-\u20ac-8'), 'utf_8')
3912+
self.assertEqual(normalize('utf-8-\xe9'), 'utf_8')
3913+
self.assertEqual(normalize('utf-8-\u20ac'), 'utf_8')
39023914
self.assertEqual(normalize('utf 8'), 'utf_8')
39033915
# encodings.normalize_encoding() doesn't convert
39043916
# characters to lower case.
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
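Fix handling of alphanumeric non-ASCII characters in
2+
:func:`encodings.normalize_encoding`: they are now treated as separators, like other non-ASCII characters, instead of being dropped without a separator (e.g. ``'utf\xe98'`` now normalizes to ``'utf_8'``).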

0 commit comments

Comments
 (0)