Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 30c2ae4

Browse files
miss-islington authored and serhiy-storchaka committed
[3.7] bpo-24214: Fixed the UTF-8 and UTF-16 incremental decoders. (GH-14304) (GH-14369)
* bpo-24214: Fixed the UTF-8 and UTF-16 incremental decoders. (GH-14304)
* The UTF-8 incremental decoders now fail fast if they encounter a sequence that can't be handled by the error handler.
* The UTF-16 incremental decoders with the surrogatepass error handler now decode a lone low surrogate with final=False.
(cherry picked from commit 894263b)
Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>
1 parent c58fc3a commit 30c2ae4

File tree

4 files changed

+37
-6
lines changed

4 files changed

+37
-6
lines changed

‎Lib/test/test_codecs.py‎

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -404,11 +404,19 @@ def test_lone_surrogates(self):
404404
deftest_incremental_surrogatepass(self):
405405
# Test incremental decoder for surrogatepass handler:
406406
# see issue #24214
407+
# High surrogate
407408
data='\uD901'.encode(self.encoding,'surrogatepass')
408409
foriinrange(1,len(data)):
409410
dec=codecs.getincrementaldecoder(self.encoding)('surrogatepass')
410411
self.assertEqual(dec.decode(data[:i]),'')
411412
self.assertEqual(dec.decode(data[i:],True),'\uD901')
413+
# Low surrogate
414+
data='\uDC02'.encode(self.encoding,'surrogatepass')
415+
foriinrange(1,len(data)):
416+
dec=codecs.getincrementaldecoder(self.encoding)('surrogatepass')
417+
self.assertEqual(dec.decode(data[:i]),'')
418+
final=self.encoding=="cp65001"
419+
self.assertEqual(dec.decode(data[i:],final),'\uDC02')
412420

413421

414422
classUTF32Test(ReadTest,unittest.TestCase):
@@ -849,6 +857,23 @@ def test_surrogatepass_handler(self):
849857
withself.assertRaises(UnicodeDecodeError):
850858
b"abc\xed\xa0z".decode(self.encoding,"surrogatepass")
851859

860+
deftest_incremental_errors(self):
861+
# Test that the incremental decoder can fail with final=False.
862+
# See issue #24214
863+
cases= [b'\x80',b'\xBF',b'\xC0',b'\xC1',b'\xF5',b'\xF6',b'\xFF']
864+
forprefixin (b'\xC2',b'\xDF',b'\xE0',b'\xE0\xA0',b'\xEF',
865+
b'\xEF\xBF',b'\xF0',b'\xF0\x90',b'\xF0\x90\x80',
866+
b'\xF4',b'\xF4\x8F',b'\xF4\x8F\xBF'):
867+
forsuffixinb'\x7F',b'\xC0':
868+
cases.append(prefix+suffix)
869+
cases.extend((b'\xE0\x80',b'\xE0\x9F',b'\xED\xA0\x80',
870+
b'\xED\xBF\xBF',b'\xF0\x80',b'\xF0\x8F',b'\xF4\x90'))
871+
872+
fordataincases:
873+
withself.subTest(data=data):
874+
dec=codecs.getincrementaldecoder(self.encoding)()
875+
self.assertRaises(UnicodeDecodeError,dec.decode,data)
876+
852877

853878
@unittest.skipUnless(sys.platform=='win32',
854879
'cp65001 is a Windows-only codec')
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
Improved support of the surrogatepass error handler in the UTF-8 and UTF-16
2+
incremental decoders.

‎Objects/stringlib/codecs.h‎

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -207,7 +207,7 @@ STRINGLIB(utf8_decode)(const char **inptr, const char *end,
207207
gotoInvalidContinuation1;
208208
}elseif (ch==0xF4&&ch2 >=0x90) {
209209
/* invalid sequence
210-
\xF4\x90\x80\80- -- 110000- overflow */
210+
\xF4\x90\x80\x80- -- 110000- overflow */
211211
gotoInvalidContinuation1;
212212
}
213213
if (!IS_CONTINUATION_BYTE(ch3)) {
@@ -573,10 +573,10 @@ STRINGLIB(utf16_decode)(const unsigned char **inptr, const unsigned char *e,
573573
}
574574

575575
/* UTF-16 code pair: */
576-
if (q >=e)
577-
gotoUnexpectedEnd;
578576
if (!Py_UNICODE_IS_HIGH_SURROGATE(ch))
579577
gotoIllegalEncoding;
578+
if (q >=e)
579+
gotoUnexpectedEnd;
580580
ch2= (q[ihi] <<8) |q[ilo];
581581
q+=2;
582582
if (!Py_UNICODE_IS_LOW_SURROGATE(ch2))

‎Objects/unicodeobject.c‎

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4888,11 +4888,15 @@ PyUnicode_DecodeUTF8Stateful(const char *s,
48884888
endinpos=startinpos+1;
48894889
break;
48904890
case2:
4891-
case3:
4892-
case4:
4893-
if (s==end||consumed) {
4891+
if (consumed&& (unsignedchar)s[0]==0xED&&end-s==2
4892+
&& (unsignedchar)s[1] >=0xA0&& (unsignedchar)s[1] <=0xBF)
4893+
{
4894+
/* Truncated surrogate code in range D800-DFFF */
48944895
gotoEnd;
48954896
}
4897+
/* fall through */
4898+
case3:
4899+
case4:
48964900
errmsg="invalid continuation byte";
48974901
startinpos=s-starts;
48984902
endinpos=startinpos+ch-1;

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2026 Movatter.jp