NotificationsYou must be signed in to change notification settings
Fork34.1k
Star71.6k

Commit 30c2ae4

authored and

committed

[3.7] bpo-24214: Fixed the UTF-8 and UTF-16 incremental decoders. (GH-14304) (GH-14369)

* bpo-24214: Fixed the UTF-8 and UTF-16 incremental decoders. (GH-14304)

* The UTF-8 incremental decoders now fail fast if they encounter a sequence that can't be handled by the error handler.
* The UTF-16 incremental decoders with the surrogatepass error handler now decode a lone low surrogate with final=False.

(cherry picked from commit 894263b)

Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>

1 parent c58fc3a, commit 30c2ae4 (Copy full SHA for 30c2ae4)

File tree

4 files changed

+37

-6

lines changed

Lib/test
- test_codecs.py
Misc/NEWS.d/next/Core and Builtins
- 2019-06-22-12-45-20.bpo-24214.hIiHeD.rst
Objects
- stringlib
  - codecs.h
- unicodeobject.c

4 files changed

+37

-6

lines changed

`‎Lib/test/test_codecs.py‎`

Lines changed: 25 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -404,11 +404,19 @@ def test_lone_surrogates(self):`
`404`	`404`	`deftest_incremental_surrogatepass(self):`
`405`	`405`	`# Test incremental decoder for surrogatepass handler:`
`406`	`406`	`# see issue #24214`
	`407`	`+# High surrogate`
`407`	`408`	`data='\uD901'.encode(self.encoding,'surrogatepass')`
`408`	`409`	`foriinrange(1,len(data)):`
`409`	`410`	`dec=codecs.getincrementaldecoder(self.encoding)('surrogatepass')`
`410`	`411`	`self.assertEqual(dec.decode(data[:i]),'')`
`411`	`412`	`self.assertEqual(dec.decode(data[i:],True),'\uD901')`
	`413`	`+# Low surrogate`
	`414`	`+data='\uDC02'.encode(self.encoding,'surrogatepass')`
	`415`	`+foriinrange(1,len(data)):`
	`416`	`+dec=codecs.getincrementaldecoder(self.encoding)('surrogatepass')`
	`417`	`+self.assertEqual(dec.decode(data[:i]),'')`
	`418`	`+final=self.encoding=="cp65001"`
	`419`	`+self.assertEqual(dec.decode(data[i:],final),'\uDC02')`
`412`	`420`
`413`	`421`
`414`	`422`	`classUTF32Test(ReadTest,unittest.TestCase):`
`@@ -849,6 +857,23 @@ def test_surrogatepass_handler(self):`
`849`	`857`	`withself.assertRaises(UnicodeDecodeError):`
`850`	`858`	`b"abc\xed\xa0z".decode(self.encoding,"surrogatepass")`
`851`	`859`
	`860`	`+deftest_incremental_errors(self):`
	`861`	`+# Test that the incremental decoder can fail with final=False.`
	`862`	`+# See issue #24214`
	`863`	`+cases= [b'\x80',b'\xBF',b'\xC0',b'\xC1',b'\xF5',b'\xF6',b'\xFF']`
	`864`	`+forprefixin (b'\xC2',b'\xDF',b'\xE0',b'\xE0\xA0',b'\xEF',`
	`865`	`+b'\xEF\xBF',b'\xF0',b'\xF0\x90',b'\xF0\x90\x80',`
	`866`	`+b'\xF4',b'\xF4\x8F',b'\xF4\x8F\xBF'):`
	`867`	`+forsuffixinb'\x7F',b'\xC0':`
	`868`	`+cases.append(prefix+suffix)`
	`869`	`+cases.extend((b'\xE0\x80',b'\xE0\x9F',b'\xED\xA0\x80',`
	`870`	`+b'\xED\xBF\xBF',b'\xF0\x80',b'\xF0\x8F',b'\xF4\x90'))`
	`871`	`+`
	`872`	`+fordataincases:`
	`873`	`+withself.subTest(data=data):`
	`874`	`+dec=codecs.getincrementaldecoder(self.encoding)()`
	`875`	`+self.assertRaises(UnicodeDecodeError,dec.decode,data)`
	`876`	`+`
`852`	`877`
`853`	`878`	`@unittest.skipUnless(sys.platform=='win32',`
`854`	`879`	`'cp65001 is a Windows-only codec')`

`‎Misc/NEWS.d/next/Core and Builtins/2019-06-22-12-45-20.bpo-24214.hIiHeD.rst‎`

Lines changed: 2 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,2 @@`
	`1`	`+Improved support of the surrogatepass error handler in the UTF-8 and UTF-16`
	`2`	`+incremental decoders.`

`‎Objects/stringlib/codecs.h‎`

Lines changed: 3 additions & 3 deletions

Original file line number	Diff line number	Diff line change
`@@ -207,7 +207,7 @@ STRINGLIB(utf8_decode)(const char *inptr, const char end,`
`207`	`207`	`gotoInvalidContinuation1;`
`208`	`208`	`}elseif (ch==0xF4&&ch2 >=0x90) {`
`209`	`209`	`/* invalid sequence`
`210`		`- \xF4\x90\x80\80- -- 110000- overflow */`
	`210`	`+ \xF4\x90\x80\x80- -- 110000- overflow */`
`211`	`211`	`gotoInvalidContinuation1;`
`212`	`212`	`}`
`213`	`213`	`if (!IS_CONTINUATION_BYTE(ch3)) {`
`@@ -573,10 +573,10 @@ STRINGLIB(utf16_decode)(const unsigned char *inptr, const unsigned char e,`
`573`	`573`	`}`
`574`	`574`
`575`	`575`	`/* UTF-16 code pair: */`
`576`		`-if (q >=e)`
`577`		`- gotoUnexpectedEnd;`
`578`	`576`	`if (!Py_UNICODE_IS_HIGH_SURROGATE(ch))`
`579`	`577`	`gotoIllegalEncoding;`
	`578`	`+if (q >=e)`
	`579`	`+ gotoUnexpectedEnd;`
`580`	`580`	`ch2= (q[ihi] <<8) \|q[ilo];`
`581`	`581`	`q+=2;`
`582`	`582`	`if (!Py_UNICODE_IS_LOW_SURROGATE(ch2))`

`‎Objects/unicodeobject.c‎`

Lines changed: 7 additions & 3 deletions

Original file line number	Diff line number	Diff line change
`@@ -4888,11 +4888,15 @@ PyUnicode_DecodeUTF8Stateful(const char *s,`
`4888`	`4888`	`endinpos=startinpos+1;`
`4889`	`4889`	`break;`
`4890`	`4890`	`case2:`
`4891`		`-case3:`
`4892`		`-case4:`
`4893`		`-if (s==end\|\|consumed) {`
	`4891`	`+if (consumed&& (unsignedchar)s[0]==0xED&&end-s==2`
	`4892`	`+&& (unsignedchar)s[1] >=0xA0&& (unsignedchar)s[1] <=0xBF)`
	`4893`	`+ {`
	`4894`	`+/* Truncated surrogate code in range D800-DFFF */`
`4894`	`4895`	`gotoEnd;`
`4895`	`4896`	`}`
	`4897`	`+/* fall through */`
	`4898`	`+case3:`
	`4899`	`+case4:`
`4896`	`4900`	`errmsg="invalid continuation byte";`
`4897`	`4901`	`startinpos=s-starts;`
`4898`	`4902`	`endinpos=startinpos+ch-1;`

0 commit comments

Comments

(0)

Movatterモバイル変換

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Commit 30c2ae4

File tree

4 files changed

4 files changed

`‎Lib/test/test_codecs.py‎`

`‎Misc/NEWS.d/next/Core and Builtins/2019-06-22-12-45-20.bpo-24214.hIiHeD.rst‎`

`‎Objects/stringlib/codecs.h‎`

`‎Objects/unicodeobject.c‎`

0 commit comments