Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 19fb53f

Browse files
gh-63161: Fix PEP 263 support (GH-139481)
* Support non-UTF-8 shebang and comments if non-UTF-8 encoding is specified.
* Detect decoding error in comments for UTF-8 encoding.
* Include the decoding error position for default encoding in SyntaxError.

(cherry picked from commit 5c942f1)

Co-authored-by: Serhiy Storchaka <storchaka@gmail.com>
1 parent 08738ce, commit 19fb53f

File tree

9 files changed

+211
-46
lines changed

9 files changed

+211
-46
lines changed

‎Lib/test/test_exceptions.py‎

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -224,6 +224,8 @@ def check(self, src, lineno, offset, end_lineno=None, end_offset=None, encoding=
224224
ifnotisinstance(src,str):
225225
src=src.decode(encoding,'replace')
226226
line=src.split('\n')[lineno-1]
227+
iflineno==1:
228+
line=line.removeprefix('\ufeff')
227229
self.assertIn(line,cm.exception.text)
228230

229231
deftest_error_offset_continuation_characters(self):
@@ -239,7 +241,9 @@ def testSyntaxErrorOffset(self):
239241
check('Python = "\u1e54\xfd\u0163\u0125\xf2\xf1" +',1,20)
240242
check(b'# -*- coding: cp1251 -*-\nPython = "\xcf\xb3\xf2\xee\xed" +',
241243
2,19,encoding='cp1251')
242-
check(b'Python = "\xcf\xb3\xf2\xee\xed" +',1,10)
244+
check(b'Python = "\xcf\xb3\xf2\xee\xed" +',1,12)
245+
check(b'\n\n\nPython = "\xcf\xb3\xf2\xee\xed" +',4,12)
246+
check(b'\xef\xbb\xbfPython = "\xcf\xb3\xf2\xee\xed" +',1,12)
243247
check('x = "a',1,5)
244248
check('lambda x: x = 2',1,1)
245249
check('f{a + b + c}',1,2)
@@ -287,7 +291,7 @@ def baz():
287291
check("pass\npass\npass\n(1+)\npass\npass\npass",4,4)
288292
check("(1+)",1,4)
289293
check("[interesting\nfoo()\n",1,1)
290-
check(b"\xef\xbb\xbf#coding: utf8\nprint('\xe6\x88\x91')\n",0,-1)
294+
check(b"\xef\xbb\xbf#coding: utf8\nprint('\xe6\x88\x91')\n",1,0)
291295
check("""f'''
292296
{
293297
(123_a)

‎Lib/test/test_source_encoding.py‎

Lines changed: 113 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
# -*- coding: utf-8 -*-
22

33
importunittest
4-
fromtest.supportimportscript_helper,captured_stdout,requires_subprocess,requires_resource
4+
fromtestimportsupport
5+
fromtest.supportimportscript_helper
56
fromtest.support.os_helperimportTESTFN,unlink,rmtree
67
fromtest.support.import_helperimportunload
78
importimportlib
@@ -64,7 +65,7 @@ def test_issue7820(self):
6465
# two bytes in common with the UTF-8 BOM
6566
self.assertRaises(SyntaxError,eval,b'\xef\xbb\x20')
6667

67-
@requires_subprocess()
68+
@support.requires_subprocess()
6869
deftest_20731(self):
6970
sub=subprocess.Popen([sys.executable,
7071
os.path.join(os.path.dirname(__file__),
@@ -267,6 +268,17 @@ def test_second_non_utf8_coding_line(self):
267268
b'print(ascii("\xc3\xa4"))\n')
268269
self.check_script_output(src,br"'\xc3\u20ac'")
269270

271+
deftest_first_utf8_coding_line_error(self):
272+
src= (b'#coding:ascii\xc3\xa4\n'
273+
b'raise RuntimeError\n')
274+
self.check_script_error(src,br"(\(unicode error\) )?'ascii' codec can't decode byte")
275+
276+
deftest_second_utf8_coding_line_error(self):
277+
src= (b'#!/usr/bin/python\n'
278+
b'#coding:ascii\xc3\xa4\n'
279+
b'raise RuntimeError\n')
280+
self.check_script_error(src,br"(\(unicode error\) )?'ascii' codec can't decode byte")
281+
270282
deftest_utf8_bom(self):
271283
src= (b'\xef\xbb\xbfprint(ascii("\xc3\xa4"))\n')
272284
self.check_script_output(src,br"'\xe4'")
@@ -282,10 +294,80 @@ def test_utf8_bom_and_utf8_coding_line(self):
282294
b'print(ascii("\xc3\xa4"))\n')
283295
self.check_script_output(src,br"'\xe4'")
284296

285-
deftest_utf8_non_utf8_comment_line_error(self):
297+
deftest_utf8_bom_and_non_utf8_first_coding_line(self):
298+
src= (b'\xef\xbb\xbf#coding:iso-8859-15\n'
299+
b'raise RuntimeError\n')
300+
self.check_script_error(src,
301+
br"encoding problem: iso-8859-15 with BOM",
302+
lineno=1)
303+
304+
deftest_utf8_bom_and_non_utf8_second_coding_line(self):
305+
src= (b'\xef\xbb\xbf#first\n'
306+
b'#coding:iso-8859-15\n'
307+
b'raise RuntimeError\n')
308+
self.check_script_error(src,
309+
br"encoding problem: iso-8859-15 with BOM",
310+
lineno=2)
311+
312+
deftest_non_utf8_shebang(self):
313+
src= (b'#!/home/\xa4/bin/python\n'
314+
b'#coding:iso-8859-15\n'
315+
b'print(ascii("\xc3\xa4"))\n')
316+
self.check_script_output(src,br"'\xc3\u20ac'")
317+
318+
deftest_utf8_shebang_error(self):
319+
src= (b'#!/home/\xc3\xa4/bin/python\n'
320+
b'#coding:ascii\n'
321+
b'raise RuntimeError\n')
322+
self.check_script_error(src,br"(\(unicode error\) )?'ascii' codec can't decode byte")
323+
324+
deftest_non_utf8_shebang_error(self):
325+
src= (b'#!/home/\xa4/bin/python\n'
326+
b'raise RuntimeError\n')
327+
self.check_script_error(src,br"Non-UTF-8 code starting with .* on line 1",
328+
lineno=1)
329+
330+
deftest_non_utf8_second_line_error(self):
331+
src= (b'#first\n'
332+
b'#second\xa4\n'
333+
b'raise RuntimeError\n')
334+
self.check_script_error(src,
335+
br"Non-UTF-8 code starting with .* on line 2",
336+
lineno=2)
337+
338+
deftest_non_utf8_third_line_error(self):
339+
src= (b'#first\n'
340+
b'#second\n'
341+
b'#third\xa4\n'
342+
b'raise RuntimeError\n')
343+
self.check_script_error(src,
344+
br"Non-UTF-8 code starting with .* on line 3",
345+
lineno=3)
346+
347+
deftest_utf8_bom_non_utf8_third_line_error(self):
348+
src= (b'\xef\xbb\xbf#first\n'
349+
b'#second\n'
350+
b'#third\xa4\n'
351+
b'raise RuntimeError\n')
352+
self.check_script_error(src,
353+
br"Non-UTF-8 code starting with .* on line 3|"
354+
br"'utf-8' codec can't decode byte",
355+
lineno=3)
356+
357+
deftest_utf_8_non_utf8_third_line_error(self):
358+
src= (b'#coding: utf-8\n'
359+
b'#second\n'
360+
b'#third\xa4\n'
361+
b'raise RuntimeError\n')
362+
self.check_script_error(src,
363+
br"Non-UTF-8 code starting with .* on line 3|"
364+
br"'utf-8' codec can't decode byte",
365+
lineno=3)
366+
367+
deftest_utf8_non_utf8_third_line_error(self):
286368
src= (b'#coding: utf8\n'
287-
b'#\n'
288-
b'#\xa4\n'
369+
b'#second\n'
370+
b'#third\xa4\n'
289371
b'raise RuntimeError\n')
290372
self.check_script_error(src,
291373
br"'utf-8' codec can't decode byte|"
@@ -326,7 +408,7 @@ def test_nul_in_second_coding_line(self):
326408
classUTF8ValidatorTest(unittest.TestCase):
327409
@unittest.skipIf(notsys.platform.startswith("linux"),
328410
"Too slow to run on non-Linux platforms")
329-
@requires_resource('cpu')
411+
@support.requires_resource('cpu')
330412
deftest_invalid_utf8(self):
331413
# This is a port of test_utf8_decode_invalid_sequences in
332414
# test_unicode.py to exercise the separate utf8 validator in
@@ -392,19 +474,29 @@ def check(content):
392474
check(b'\xF4'+cb+b'\xBF\xBF')
393475

394476

477+
@support.force_not_colorized_test_class
395478
classBytesSourceEncodingTest(AbstractSourceEncodingTest,unittest.TestCase):
396479

397480
defcheck_script_output(self,src,expected):
398-
withcaptured_stdout()asstdout:
481+
withsupport.captured_stdout()asstdout:
399482
exec(src)
400483
out=stdout.getvalue().encode('latin1')
401484
self.assertEqual(out.rstrip(),expected)
402485

403-
defcheck_script_error(self,src,expected):
404-
withself.assertRaisesRegex(SyntaxError,expected.decode())ascm:
486+
defcheck_script_error(self,src,expected,lineno=...):
487+
withself.assertRaises(SyntaxError)ascm:
405488
exec(src)
489+
exc=cm.exception
490+
self.assertRegex(str(exc),expected.decode())
491+
iflinenoisnot ...:
492+
self.assertEqual(exc.lineno,lineno)
493+
line=src.splitlines()[lineno-1].decode(errors='replace')
494+
iflineno==1:
495+
line=line.removeprefix('\ufeff')
496+
self.assertEqual(line,exc.text)
406497

407498

499+
@support.force_not_colorized_test_class
408500
classFileSourceEncodingTest(AbstractSourceEncodingTest,unittest.TestCase):
409501

410502
defcheck_script_output(self,src,expected):
@@ -415,13 +507,23 @@ def check_script_output(self, src, expected):
415507
res=script_helper.assert_python_ok(fn)
416508
self.assertEqual(res.out.rstrip(),expected)
417509

418-
defcheck_script_error(self,src,expected):
510+
defcheck_script_error(self,src,expected,lineno=...):
419511
withtempfile.TemporaryDirectory()astmpd:
420512
fn=os.path.join(tmpd,'test.py')
421513
withopen(fn,'wb')asfp:
422514
fp.write(src)
423515
res=script_helper.assert_python_failure(fn)
424-
self.assertRegex(res.err.rstrip().splitlines()[-1],b'SyntaxError.*?'+expected)
516+
err=res.err.rstrip()
517+
self.assertRegex(err.splitlines()[-1],b'SyntaxError: '+expected)
518+
iflinenoisnot ...:
519+
self.assertIn(f', line{lineno}\n'.encode(),
520+
err.replace(os.linesep.encode(),b'\n'))
521+
line=src.splitlines()[lineno-1].decode(errors='replace')
522+
iflineno==1:
523+
line=line.removeprefix('\ufeff')
524+
line=line.encode(sys.stderr.encoding,sys.stderr.errors)
525+
self.assertIn(line,err)
526+
425527

426528

427529
if__name__=="__main__":
Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
Support non-UTF-8 shebang and comments in Python source files if non-UTF-8
2+
encoding is specified. Detect decoding error in comments for default (UTF-8)
3+
encoding. Show the line and position of decoding error for default encoding
4+
in a traceback. Show the line containing the coding cookie when it conflicts
5+
with the BOM in a traceback.

‎Parser/pegen_errors.c‎

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
#include<errcode.h>
33

44
#include"pycore_pyerrors.h"// _PyErr_ProgramDecodedTextObject()
5+
#include"pycore_runtime.h"// _Py_ID()
56
#include"lexer/state.h"
67
#include"lexer/lexer.h"
78
#include"pegen.h"
@@ -23,6 +24,13 @@ _PyPegen_raise_tokenizer_init_error(PyObject *filename)
2324
PyObject*value;
2425
PyObject*tback;
2526
PyErr_Fetch(&type,&value,&tback);
27+
if (PyErr_GivenExceptionMatches(value,PyExc_SyntaxError)) {
28+
if (PyObject_SetAttr(value,&_Py_ID(filename),filename)) {
29+
gotoerror;
30+
}
31+
PyErr_Restore(type,value,tback);
32+
return;
33+
}
2634
errstr=PyObject_Str(value);
2735
if (!errstr) {
2836
gotoerror;

‎Parser/tokenizer/file_tokenizer.c‎

Lines changed: 35 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -282,10 +282,8 @@ tok_underflow_interactive(struct tok_state *tok) {
282282
}
283283

284284
staticint
285-
tok_underflow_file(structtok_state*tok) {
286-
if (tok->start==NULL&& !INSIDE_FSTRING(tok)) {
287-
tok->cur=tok->inp=tok->buf;
288-
}
285+
tok_underflow_file(structtok_state*tok)
286+
{
289287
if (tok->decoding_state==STATE_INIT) {
290288
/* We have not yet determined the encoding.
291289
If an encoding is found, use the file-pointer
@@ -296,8 +294,16 @@ tok_underflow_file(struct tok_state *tok) {
296294
}
297295
assert(tok->decoding_state!=STATE_INIT);
298296
}
297+
intraw=tok->decoding_readline==NULL;
298+
if (raw&&tok->decoding_state!=STATE_NORMAL) {
299+
/* Keep the first line in the buffer to validate it later if
300+
* the encoding has not yet been determined. */
301+
}
302+
elseif (tok->start==NULL&& !INSIDE_FSTRING(tok)) {
303+
tok->cur=tok->inp=tok->buf;
304+
}
299305
/* Read until '\n' or EOF */
300-
if (tok->decoding_readline!=NULL) {
306+
if (!raw) {
301307
/* We already have a codec associated with this input. */
302308
if (!tok_readline_recode(tok)) {
303309
return0;
@@ -328,20 +334,35 @@ tok_underflow_file(struct tok_state *tok) {
328334

329335
ADVANCE_LINENO();
330336
if (tok->decoding_state!=STATE_NORMAL) {
331-
if (tok->lineno>2) {
332-
tok->decoding_state=STATE_NORMAL;
333-
}
334-
elseif (!_PyTokenizer_check_coding_spec(tok->cur,strlen(tok->cur),
337+
if (!_PyTokenizer_check_coding_spec(tok->cur,strlen(tok->cur),
335338
tok,fp_setreadl))
336339
{
337340
return0;
338341
}
342+
if (tok->lineno >=2) {
343+
tok->decoding_state=STATE_NORMAL;
344+
}
339345
}
340-
/* The default encoding is UTF-8, so make sure we don't have any
341-
non-UTF-8 sequences in it. */
342-
if (!tok->encoding&& !_PyTokenizer_ensure_utf8(tok->cur,tok)) {
343-
_PyTokenizer_error_ret(tok);
344-
return0;
346+
if (raw&&tok->decoding_state==STATE_NORMAL) {
347+
constchar*line=tok->lineno <=2 ?tok->buf :tok->cur;
348+
intlineno=tok->lineno <=2 ?1 :tok->lineno;
349+
if (!tok->encoding) {
350+
/* The default encoding is UTF-8, so make sure we don't have any
351+
non-UTF-8 sequences in it. */
352+
if (!_PyTokenizer_ensure_utf8(line,tok,lineno)) {
353+
_PyTokenizer_error_ret(tok);
354+
return0;
355+
}
356+
}
357+
else {
358+
PyObject*tmp=PyUnicode_Decode(line,strlen(line),
359+
tok->encoding,NULL);
360+
if (tmp==NULL) {
361+
_PyTokenizer_error_ret(tok);
362+
return0;
363+
}
364+
Py_DECREF(tmp);
365+
}
345366
}
346367
assert(tok->done==E_OK);
347368
returntok->done==E_OK;

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp