NotificationsYou must be signed in to change notification settings
Fork33.3k
Star69.7k

Commit 19fb53f

committed

gh-63161: Fix PEP 263 support (GH-139481)

* Support non-UTF-8 shebang and comments if non-UTF-8 encoding is specified.
* Detect decoding error in comments for UTF-8 encoding.
* Include the decoding error position for default encoding in SyntaxError.

(cherry picked from commit 5c942f1)

Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>

1 parent 08738ce, commit 19fb53f (Copy full SHA for 19fb53f)

File tree

9 files changed

+211

-46

lines changed

Lib/test
- test_exceptions.py
- test_source_encoding.py
Misc/NEWS.d/next/Core_and_Builtins
- 2025-10-01-18-21-19.gh-issue-63161.ef1S6N.rst
Parser
- pegen_errors.c
- tokenizer

9 files changed

+211

-46

lines changed

`‎Lib/test/test_exceptions.py‎`

Lines changed: 6 additions & 2 deletions

Original file line number	Diff line number	Diff line change
`@@ -224,6 +224,8 @@ def check(self, src, lineno, offset, end_lineno=None, end_offset=None, encoding=`
`224`	`224`	`ifnotisinstance(src,str):`
`225`	`225`	`src=src.decode(encoding,'replace')`
`226`	`226`	`line=src.split('\n')[lineno-1]`
	`227`	`+iflineno==1:`
	`228`	`+line=line.removeprefix('\ufeff')`
`227`	`229`	`self.assertIn(line,cm.exception.text)`
`228`	`230`
`229`	`231`	`deftest_error_offset_continuation_characters(self):`
`@@ -239,7 +241,9 @@ def testSyntaxErrorOffset(self):`
`239`	`241`	`check('Python = "\u1e54\xfd\u0163\u0125\xf2\xf1" +',1,20)`
`240`	`242`	`check(b'# -- coding: cp1251 --\nPython = "\xcf\xb3\xf2\xee\xed" +',`
`241`	`243`	`2,19,encoding='cp1251')`
`242`		`-check(b'Python = "\xcf\xb3\xf2\xee\xed" +',1,10)`
	`244`	`+check(b'Python = "\xcf\xb3\xf2\xee\xed" +',1,12)`
	`245`	`+check(b'\n\n\nPython = "\xcf\xb3\xf2\xee\xed" +',4,12)`
	`246`	`+check(b'\xef\xbb\xbfPython = "\xcf\xb3\xf2\xee\xed" +',1,12)`
`243`	`247`	`check('x = "a',1,5)`
`244`	`248`	`check('lambda x: x = 2',1,1)`
`245`	`249`	`check('f{a + b + c}',1,2)`
`@@ -287,7 +291,7 @@ def baz():`
`287`	`291`	`check("pass\npass\npass\n(1+)\npass\npass\npass",4,4)`
`288`	`292`	`check("(1+)",1,4)`
`289`	`293`	`check("[interesting\nfoo()\n",1,1)`
`290`		`-check(b"\xef\xbb\xbf#coding: utf8\nprint('\xe6\x88\x91')\n",0,-1)`
	`294`	`+check(b"\xef\xbb\xbf#coding: utf8\nprint('\xe6\x88\x91')\n",1,0)`
`291`	`295`	`check("""f'''`
`292`	`296`	`{`
`293`	`297`	`(123_a)`

`‎Lib/test/test_source_encoding.py‎`

Lines changed: 113 additions & 11 deletions

Original file line number	Diff line number	Diff line change
`@@ -1,7 +1,8 @@`
`1`	`1`	`# -- coding: utf-8 --`
`2`	`2`
`3`	`3`	`importunittest`
`4`		`-fromtest.supportimportscript_helper,captured_stdout,requires_subprocess,requires_resource`
	`4`	`+fromtestimportsupport`
	`5`	`+fromtest.supportimportscript_helper`
`5`	`6`	`fromtest.support.os_helperimportTESTFN,unlink,rmtree`
`6`	`7`	`fromtest.support.import_helperimportunload`
`7`	`8`	`importimportlib`
`@@ -64,7 +65,7 @@ def test_issue7820(self):`
`64`	`65`	`# two bytes in common with the UTF-8 BOM`
`65`	`66`	`self.assertRaises(SyntaxError,eval,b'\xef\xbb\x20')`
`66`	`67`
`67`		`-@requires_subprocess()`
	`68`	`+@support.requires_subprocess()`
`68`	`69`	`deftest_20731(self):`
`69`	`70`	`sub=subprocess.Popen([sys.executable,`
`70`	`71`	`os.path.join(os.path.dirname(__file__),`
`@@ -267,6 +268,17 @@ def test_second_non_utf8_coding_line(self):`
`267`	`268`	`b'print(ascii("\xc3\xa4"))\n')`
`268`	`269`	`self.check_script_output(src,br"'\xc3\u20ac'")`
`269`	`270`
	`271`	`+deftest_first_utf8_coding_line_error(self):`
	`272`	`+src= (b'#coding:ascii\xc3\xa4\n'`
	`273`	`+b'raise RuntimeError\n')`
	`274`	`+self.check_script_error(src,br"(\(unicode error\) )?'ascii' codec can't decode byte")`
	`275`	`+`
	`276`	`+deftest_second_utf8_coding_line_error(self):`
	`277`	`+src= (b'#!/usr/bin/python\n'`
	`278`	`+b'#coding:ascii\xc3\xa4\n'`
	`279`	`+b'raise RuntimeError\n')`
	`280`	`+self.check_script_error(src,br"(\(unicode error\) )?'ascii' codec can't decode byte")`
	`281`	`+`
`270`	`282`	`deftest_utf8_bom(self):`
`271`	`283`	`src= (b'\xef\xbb\xbfprint(ascii("\xc3\xa4"))\n')`
`272`	`284`	`self.check_script_output(src,br"'\xe4'")`
`@@ -282,10 +294,80 @@ def test_utf8_bom_and_utf8_coding_line(self):`
`282`	`294`	`b'print(ascii("\xc3\xa4"))\n')`
`283`	`295`	`self.check_script_output(src,br"'\xe4'")`
`284`	`296`
`285`		`-deftest_utf8_non_utf8_comment_line_error(self):`
	`297`	`+deftest_utf8_bom_and_non_utf8_first_coding_line(self):`
	`298`	`+src= (b'\xef\xbb\xbf#coding:iso-8859-15\n'`
	`299`	`+b'raise RuntimeError\n')`
	`300`	`+self.check_script_error(src,`
	`301`	`+br"encoding problem: iso-8859-15 with BOM",`
	`302`	`+lineno=1)`
	`303`	`+`
	`304`	`+deftest_utf8_bom_and_non_utf8_second_coding_line(self):`
	`305`	`+src= (b'\xef\xbb\xbf#first\n'`
	`306`	`+b'#coding:iso-8859-15\n'`
	`307`	`+b'raise RuntimeError\n')`
	`308`	`+self.check_script_error(src,`
	`309`	`+br"encoding problem: iso-8859-15 with BOM",`
	`310`	`+lineno=2)`
	`311`	`+`
	`312`	`+deftest_non_utf8_shebang(self):`
	`313`	`+src= (b'#!/home/\xa4/bin/python\n'`
	`314`	`+b'#coding:iso-8859-15\n'`
	`315`	`+b'print(ascii("\xc3\xa4"))\n')`
	`316`	`+self.check_script_output(src,br"'\xc3\u20ac'")`
	`317`	`+`
	`318`	`+deftest_utf8_shebang_error(self):`
	`319`	`+src= (b'#!/home/\xc3\xa4/bin/python\n'`
	`320`	`+b'#coding:ascii\n'`
	`321`	`+b'raise RuntimeError\n')`
	`322`	`+self.check_script_error(src,br"(\(unicode error\) )?'ascii' codec can't decode byte")`
	`323`	`+`
	`324`	`+deftest_non_utf8_shebang_error(self):`
	`325`	`+src= (b'#!/home/\xa4/bin/python\n'`
	`326`	`+b'raise RuntimeError\n')`
	`327`	`+self.check_script_error(src,br"Non-UTF-8 code starting with .* on line 1",`
	`328`	`+lineno=1)`
	`329`	`+`
	`330`	`+deftest_non_utf8_second_line_error(self):`
	`331`	`+src= (b'#first\n'`
	`332`	`+b'#second\xa4\n'`
	`333`	`+b'raise RuntimeError\n')`
	`334`	`+self.check_script_error(src,`
	`335`	`+br"Non-UTF-8 code starting with .* on line 2",`
	`336`	`+lineno=2)`
	`337`	`+`
	`338`	`+deftest_non_utf8_third_line_error(self):`
	`339`	`+src= (b'#first\n'`
	`340`	`+b'#second\n'`
	`341`	`+b'#third\xa4\n'`
	`342`	`+b'raise RuntimeError\n')`
	`343`	`+self.check_script_error(src,`
	`344`	`+br"Non-UTF-8 code starting with .* on line 3",`
	`345`	`+lineno=3)`
	`346`	`+`
	`347`	`+deftest_utf8_bom_non_utf8_third_line_error(self):`
	`348`	`+src= (b'\xef\xbb\xbf#first\n'`
	`349`	`+b'#second\n'`
	`350`	`+b'#third\xa4\n'`
	`351`	`+b'raise RuntimeError\n')`
	`352`	`+self.check_script_error(src,`
	`353`	`+br"Non-UTF-8 code starting with .* on line 3\|"`
	`354`	`+br"'utf-8' codec can't decode byte",`
	`355`	`+lineno=3)`
	`356`	`+`
	`357`	`+deftest_utf_8_non_utf8_third_line_error(self):`
	`358`	`+src= (b'#coding: utf-8\n'`
	`359`	`+b'#second\n'`
	`360`	`+b'#third\xa4\n'`
	`361`	`+b'raise RuntimeError\n')`
	`362`	`+self.check_script_error(src,`
	`363`	`+br"Non-UTF-8 code starting with .* on line 3\|"`
	`364`	`+br"'utf-8' codec can't decode byte",`
	`365`	`+lineno=3)`
	`366`	`+`
	`367`	`+deftest_utf8_non_utf8_third_line_error(self):`
`286`	`368`	`src= (b'#coding: utf8\n'`
`287`		`-b'#\n'`
`288`		`-b'#\xa4\n'`
	`369`	`+b'#second\n'`
	`370`	`+b'#third\xa4\n'`
`289`	`371`	`b'raise RuntimeError\n')`
`290`	`372`	`self.check_script_error(src,`
`291`	`373`	`br"'utf-8' codec can't decode byte\|"`
`@@ -326,7 +408,7 @@ def test_nul_in_second_coding_line(self):`
`326`	`408`	`classUTF8ValidatorTest(unittest.TestCase):`
`327`	`409`	`@unittest.skipIf(notsys.platform.startswith("linux"),`
`328`	`410`	`"Too slow to run on non-Linux platforms")`
`329`		`-@requires_resource('cpu')`
	`411`	`+@support.requires_resource('cpu')`
`330`	`412`	`deftest_invalid_utf8(self):`
`331`	`413`	`# This is a port of test_utf8_decode_invalid_sequences in`
`332`	`414`	`# test_unicode.py to exercise the separate utf8 validator in`
`@@ -392,19 +474,29 @@ def check(content):`
`392`	`474`	`check(b'\xF4'+cb+b'\xBF\xBF')`
`393`	`475`
`394`	`476`
	`477`	`+@support.force_not_colorized_test_class`
`395`	`478`	`classBytesSourceEncodingTest(AbstractSourceEncodingTest,unittest.TestCase):`
`396`	`479`
`397`	`480`	`defcheck_script_output(self,src,expected):`
`398`		`-withcaptured_stdout()asstdout:`
	`481`	`+withsupport.captured_stdout()asstdout:`
`399`	`482`	`exec(src)`
`400`	`483`	`out=stdout.getvalue().encode('latin1')`
`401`	`484`	`self.assertEqual(out.rstrip(),expected)`
`402`	`485`
`403`		`-defcheck_script_error(self,src,expected):`
`404`		`-withself.assertRaisesRegex(SyntaxError,expected.decode())ascm:`
	`486`	`+defcheck_script_error(self,src,expected,lineno=...):`
	`487`	`+withself.assertRaises(SyntaxError)ascm:`
`405`	`488`	`exec(src)`
	`489`	`+exc=cm.exception`
	`490`	`+self.assertRegex(str(exc),expected.decode())`
	`491`	`+iflinenoisnot ...:`
	`492`	`+self.assertEqual(exc.lineno,lineno)`
	`493`	`+line=src.splitlines()[lineno-1].decode(errors='replace')`
	`494`	`+iflineno==1:`
	`495`	`+line=line.removeprefix('\ufeff')`
	`496`	`+self.assertEqual(line,exc.text)`
`406`	`497`
`407`	`498`
	`499`	`+@support.force_not_colorized_test_class`
`408`	`500`	`classFileSourceEncodingTest(AbstractSourceEncodingTest,unittest.TestCase):`
`409`	`501`
`410`	`502`	`defcheck_script_output(self,src,expected):`
`@@ -415,13 +507,23 @@ def check_script_output(self, src, expected):`
`415`	`507`	`res=script_helper.assert_python_ok(fn)`
`416`	`508`	`self.assertEqual(res.out.rstrip(),expected)`
`417`	`509`
`418`		`-defcheck_script_error(self,src,expected):`
	`510`	`+defcheck_script_error(self,src,expected,lineno=...):`
`419`	`511`	`withtempfile.TemporaryDirectory()astmpd:`
`420`	`512`	`fn=os.path.join(tmpd,'test.py')`
`421`	`513`	`withopen(fn,'wb')asfp:`
`422`	`514`	`fp.write(src)`
`423`	`515`	`res=script_helper.assert_python_failure(fn)`
`424`		`-self.assertRegex(res.err.rstrip().splitlines()[-1],b'SyntaxError.*?'+expected)`
	`516`	`+err=res.err.rstrip()`
	`517`	`+self.assertRegex(err.splitlines()[-1],b'SyntaxError: '+expected)`
	`518`	`+iflinenoisnot ...:`
	`519`	`+self.assertIn(f', line{lineno}\n'.encode(),`
	`520`	`+err.replace(os.linesep.encode(),b'\n'))`
	`521`	`+line=src.splitlines()[lineno-1].decode(errors='replace')`
	`522`	`+iflineno==1:`
	`523`	`+line=line.removeprefix('\ufeff')`
	`524`	`+line=line.encode(sys.stderr.encoding,sys.stderr.errors)`
	`525`	`+self.assertIn(line,err)`
	`526`	`+`
`425`	`527`
`426`	`528`
`427`	`529`	`if__name__=="__main__":`

`‎Misc/NEWS.d/next/Core_and_Builtins/2025-10-01-18-21-19.gh-issue-63161.ef1S6N.rst‎`

Lines changed: 5 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,5 @@`
	`1`	`+Support non-UTF-8 shebang and comments in Python source files if non-UTF-8`
	`2`	`+encoding is specified. Detect decoding error in comments for default (UTF-8)`
	`3`	`+encoding. Show the line and position of decoding error for default encoding`
	`4`	`+in a traceback. Show the line containing the coding cookie when it conflicts`
	`5`	`+with the BOM in a traceback.`

`‎Parser/pegen_errors.c‎`

Lines changed: 8 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -2,6 +2,7 @@`
`2`	`2`	`#include<errcode.h>`
`3`	`3`
`4`	`4`	`#include"pycore_pyerrors.h"// _PyErr_ProgramDecodedTextObject()`
	`5`	`+#include"pycore_runtime.h"// _Py_ID()`
`5`	`6`	`#include"lexer/state.h"`
`6`	`7`	`#include"lexer/lexer.h"`
`7`	`8`	`#include"pegen.h"`
`@@ -23,6 +24,13 @@ _PyPegen_raise_tokenizer_init_error(PyObject *filename)`
`23`	`24`	`PyObject*value;`
`24`	`25`	`PyObject*tback;`
`25`	`26`	`PyErr_Fetch(&type,&value,&tback);`
	`27`	`+if (PyErr_GivenExceptionMatches(value,PyExc_SyntaxError)) {`
	`28`	`+if (PyObject_SetAttr(value,&_Py_ID(filename),filename)) {`
	`29`	`+ gotoerror;`
	`30`	`+ }`
	`31`	`+PyErr_Restore(type,value,tback);`
	`32`	`+return;`
	`33`	`+ }`
`26`	`34`	`errstr=PyObject_Str(value);`
`27`	`35`	`if (!errstr) {`
`28`	`36`	`gotoerror;`

`‎Parser/tokenizer/file_tokenizer.c‎`

Lines changed: 35 additions & 14 deletions

Original file line number	Diff line number	Diff line change
`@@ -282,10 +282,8 @@ tok_underflow_interactive(struct tok_state *tok) {`
`282`	`282`	`}`
`283`	`283`
`284`	`284`	`staticint`
`285`		`-tok_underflow_file(structtok_state*tok) {`
`286`		`-if (tok->start==NULL&& !INSIDE_FSTRING(tok)) {`
`287`		`-tok->cur=tok->inp=tok->buf;`
`288`		`- }`
	`285`	`+tok_underflow_file(structtok_state*tok)`
	`286`	`+{`
`289`	`287`	`if (tok->decoding_state==STATE_INIT) {`
`290`	`288`	`/* We have not yet determined the encoding.`
`291`	`289`	`If an encoding is found, use the file-pointer`
`@@ -296,8 +294,16 @@ tok_underflow_file(struct tok_state *tok) {`
`296`	`294`	`}`
`297`	`295`	`assert(tok->decoding_state!=STATE_INIT);`
`298`	`296`	`}`
	`297`	`+intraw=tok->decoding_readline==NULL;`
	`298`	`+if (raw&&tok->decoding_state!=STATE_NORMAL) {`
	`299`	`+/* Keep the first line in the buffer to validate it later if`
	`300`	`+ * the encoding has not yet been determined. */`
	`301`	`+ }`
	`302`	`+elseif (tok->start==NULL&& !INSIDE_FSTRING(tok)) {`
	`303`	`+tok->cur=tok->inp=tok->buf;`
	`304`	`+ }`
`299`	`305`	`/* Read until '\n' or EOF */`
`300`		`-if (tok->decoding_readline!=NULL) {`
	`306`	`+if (!raw) {`
`301`	`307`	`/* We already have a codec associated with this input. */`
`302`	`308`	`if (!tok_readline_recode(tok)) {`
`303`	`309`	`return0;`
`@@ -328,20 +334,35 @@ tok_underflow_file(struct tok_state *tok) {`
`328`	`334`
`329`	`335`	`ADVANCE_LINENO();`
`330`	`336`	`if (tok->decoding_state!=STATE_NORMAL) {`
`331`		`-if (tok->lineno>2) {`
`332`		`-tok->decoding_state=STATE_NORMAL;`
`333`		`- }`
`334`		`-elseif (!_PyTokenizer_check_coding_spec(tok->cur,strlen(tok->cur),`
	`337`	`+if (!_PyTokenizer_check_coding_spec(tok->cur,strlen(tok->cur),`
`335`	`338`	`tok,fp_setreadl))`
`336`	`339`	`{`
`337`	`340`	`return0;`
`338`	`341`	`}`
	`342`	`+if (tok->lineno >=2) {`
	`343`	`+tok->decoding_state=STATE_NORMAL;`
	`344`	`+ }`
`339`	`345`	`}`
`340`		`-/* The default encoding is UTF-8, so make sure we don't have any`
`341`		`- non-UTF-8 sequences in it. */`
`342`		`-if (!tok->encoding&& !_PyTokenizer_ensure_utf8(tok->cur,tok)) {`
`343`		`-_PyTokenizer_error_ret(tok);`
`344`		`-return0;`
	`346`	`+if (raw&&tok->decoding_state==STATE_NORMAL) {`
	`347`	`+constchar*line=tok->lineno <=2 ?tok->buf :tok->cur;`
	`348`	`+intlineno=tok->lineno <=2 ?1 :tok->lineno;`
	`349`	`+if (!tok->encoding) {`
	`350`	`+/* The default encoding is UTF-8, so make sure we don't have any`
	`351`	`+ non-UTF-8 sequences in it. */`
	`352`	`+if (!_PyTokenizer_ensure_utf8(line,tok,lineno)) {`
	`353`	`+_PyTokenizer_error_ret(tok);`
	`354`	`+return0;`
	`355`	`+ }`
	`356`	`+ }`
	`357`	`+else {`
	`358`	`+PyObject*tmp=PyUnicode_Decode(line,strlen(line),`
	`359`	`+tok->encoding,NULL);`
	`360`	`+if (tmp==NULL) {`
	`361`	`+_PyTokenizer_error_ret(tok);`
	`362`	`+return0;`
	`363`	`+ }`
	`364`	`+Py_DECREF(tmp);`
	`365`	`+ }`
`345`	`366`	`}`
`346`	`367`	`assert(tok->done==E_OK);`
`347`	`368`	`returntok->done==E_OK;`

0 commit comments

Comments

(0)

Movatterモバイル変換

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Uh oh!

Commit19fb53f

File tree

9 files changed

9 files changed

`‎Lib/test/test_exceptions.py‎`

`‎Lib/test/test_source_encoding.py‎`

`‎Misc/NEWS.d/next/Core_and_Builtins/2025-10-01-18-21-19.gh-issue-63161.ef1S6N.rst‎`

`‎Parser/pegen_errors.c‎`

`‎Parser/tokenizer/file_tokenizer.c‎`

0 commit comments