Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 52e6e03

Browse files
committed
Fix lexing for unicode escape sequences
fixes dotnet#338
closes dotnet#348

commit 9b68fc5
Author: latkin <latkin@microsoft.com>
Date: Wed Apr 15 10:40:46 2015 -0700

    Use func, not lazy

commit 55b811d
Author: latkin <latkin@microsoft.com>
Date: Tue Apr 14 13:11:29 2015 -0700

    Use dedicated type for lex result

commit 49424b5
Author: latkin <latkin@microsoft.com>
Date: Tue Apr 7 18:28:21 2015 -0700

    Fix lexing for unicode escape sequences

    fixes dotnet#338

    Changes lexing of unicode escape sequences to match the F# spec (which says things should work the same as C#).
    - For short escape sequences, directly encode the hex value into a char
    - For long escape sequences, validate that the total codepoint is <= 0x0010FFFF
      - If it is, follow same logic as before (which was correct)
      - If it isn't, issue an error (same as C#)
1 parent fecd35f commit 52e6e03

File tree

10 files changed

+123
-35
lines changed

10 files changed

+123
-35
lines changed

‎src/fsharp/FSComp.txt‎

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1139,6 +1139,7 @@ lexIndentOffForML,"Consider using a file with extension '.ml' or '.mli' instead"
11391139
1242,parsMissingGreaterThan,"Unmatched '<'. Expected closing '>'"
11401140
1243,parsUnexpectedQuotationOperatorInTypeAliasDidYouMeanVerbatimString,"Unexpected quotation operator '<@' in type definition. If you intend to pass a verbatim string as a static argument to a type provider, put a space between the '<' and '@' characters."
11411141
1244,parsErrorParsingAsOperatorName,"Attempted to parse this as an operator name, but failed"
1142+
1245,lexInvalidUnicodeLiteral,"\U%s is not a valid Unicode character escape sequence"
11421143
# Fsc.exe resource strings
11431144
fscTooManyErrors,"Exiting - too many errors"
11441145
2001,docfileNoXmlSuffix,"The documentation file has no .xml suffix"

‎src/fsharp/lex.fsl‎

Lines changed: 17 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -124,7 +124,7 @@ let startString args (lexbuf: UnicodeLexing.Lexbuf) =
124124
BYTEARRAY (Lexhelp.stringBufferAsBytes buf)
125125
)
126126
else
127-
STRING (System.Text.Encoding.Unicode.GetString(s,0,s.Length)))
127+
STRING (Lexhelp.stringBufferAsString s))
128128
buf,fin,m
129129

130130
// Utility functions for processing XML documentation
@@ -410,10 +410,9 @@ rule token args skip = parse
410410
| '\'' hexGraphShort '\'' { CHAR (char (int32 (hexGraphShort (lexemeTrimBoth lexbuf 3 1)))) }
411411
| '\'' unicodeGraphShort '\'' { CHAR (char (int32 (unicodeGraphShort (lexemeTrimBoth lexbuf 3 1)))) }
412412
| '\'' unicodeGraphLong '\''
413-
{ let hi,lo = unicodeGraphLong (lexemeTrimBoth lexbuf 3 1)
414-
match hi with
415-
| None -> CHAR (char lo)
416-
| Some _ -> fail args lexbuf (FSComp.SR.lexThisUnicodeOnlyInStringLiterals()) (CHAR (char lo)) }
413+
{ match unicodeGraphLong (lexemeTrimBoth lexbuf 3 1) with
414+
| SingleChar(c) -> CHAR (char c)
415+
| _ -> fail args lexbuf (FSComp.SR.lexThisUnicodeOnlyInStringLiterals()) (CHAR (char 0)) }
417416
| "(*IF-FSHARP"
418417
{ if not skip then (COMMENT (LexCont.Token !args.ifdefStack)) else token args skip lexbuf }
419418
| "(*F#"
@@ -756,11 +755,19 @@ and string sargs skip = parse
756755
if not skip then (STRING_TEXT (LexCont.String(!args.ifdefStack,m))) else string sargs skip lexbuf }
757756

758757
| unicodeGraphLong
759-
{ let (buf,_fin,m,args) = sargs
760-
let hi,lo = unicodeGraphLong (lexemeTrimLeft lexbuf 2)
761-
(match hi with | None -> () | Some c -> addUnicodeChar buf (int c));
762-
addUnicodeChar buf (int lo);
763-
if not skip then (STRING_TEXT (LexCont.String(!args.ifdefStack,m))) else string sargs skip lexbuf }
758+
{ let (buf,_fin,m,args) = sargs
759+
let hexChars = lexemeTrimLeft lexbuf 2
760+
let result () = if not skip then (STRING_TEXT (LexCont.String(!args.ifdefStack,m))) else string sargs skip lexbuf
761+
match unicodeGraphLong hexChars with
762+
| Invalid ->
763+
fail args lexbuf (FSComp.SR.lexInvalidUnicodeLiteral hexChars) (result ())
764+
| SingleChar(c) ->
765+
addUnicodeChar buf (int c)
766+
result ()
767+
| SurrogatePair(hi, lo) ->
768+
addUnicodeChar buf (int hi)
769+
addUnicodeChar buf (int lo)
770+
result () }
764771

765772
| '"'
766773
{ let (buf,fin,_m,_args) = sargs

‎src/fsharp/lexhelp.fs‎

Lines changed: 26 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,12 @@ type lexargs =
5656
lightSyntaxStatus:LightSyntaxStatus;
5757
errorLogger:ErrorLogger}
5858

59+
/// possible results of lexing a long unicode escape sequence in a string literal, e.g. "\UDEADBEEF"
60+
typeLongUnicodeLexResult=
61+
| SurrogatePairofuint16*uint16
62+
| SingleCharofuint16
63+
| Invalid
64+
5965
letmkLexargs(_filename,defines,lightSyntaxStatus,resourceManager,ifdefStack,errorLogger)=
6066
{ defines= defines;
6167
ifdefStack= ifdefStack;
@@ -97,6 +103,16 @@ let addIntChar (buf: ByteBuffer) c =
97103
letaddUnicodeChar buf c= addIntChar buf(int c)
98104
letaddByteChar buf(c:char)= addIntChar buf(int32 c%256)
99105

106+
letstringBufferAsString(buf:byte[])=
107+
if buf.Length%2<>0then failwith"Expected even number of bytes";
108+
letchars:char[]= Array.zeroCreate(buf.Length/2)
109+
for i=0to(buf.Length/2)-1do
110+
lethi= buf.[i*2+1]
111+
letlo= buf.[i*2]
112+
letc= char(((int hi)*256)+(int lo))
113+
chars.[i]<- c
114+
System.String(chars)
115+
100116
/// When lexing bytearrays we don't expect to see any unicode stuff.
101117
/// Likewise when lexing string constants we shouldn't see any trigraphs > 127
102118
/// So to turn the bytes collected in the string buffer back into a bytearray
@@ -143,11 +159,16 @@ let unicodeGraphLong (s:string) =
143159
if s.Length<>8then failwith"unicodeGraphLong";
144160
lethigh= hexdigit s.[0]*4096+ hexdigit s.[1]*256+ hexdigit s.[2]*16+ hexdigit s.[3]in
145161
letlow= hexdigit s.[4]*4096+ hexdigit s.[5]*256+ hexdigit s.[6]*16+ hexdigit s.[7]in
146-
if high=0then None, uint16 low
147-
else
148-
(* A surrogate pair - see http://www.unicode.org/unicode/uni2book/ch03.pdf, section 3.7*)
149-
Some(uint16(0xD800+((high*0x10000+ low-0x10000)/0x400))),
150-
uint16(0xDC00+((high*0x10000+ low-0x10000)%0x400))
162+
// not a surrogate pair
163+
if high=0then SingleChar(uint16 low)
164+
// invalid encoding
165+
elif high>0x10then Invalid
166+
// valid surrogate pair - see http://www.unicode.org/unicode/uni2book/ch03.pdf, section 3.7 *)
167+
else
168+
letcodepoint= high*0x10000+ low
169+
lethiSurr= uint16(0xD800+((codepoint-0x10000)/0x400))
170+
letloSurr= uint16(0xDC00+((codepoint-0x10000)%0x400))
171+
SurrogatePair(hiSurr, loSurr)
151172

152173
letescape c=
153174
match cwith

‎src/fsharp/lexhelp.fsi‎

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,11 @@ type lexargs =
3333
lightSyntaxStatus:LightSyntaxStatus;
3434
errorLogger:ErrorLogger}
3535

36+
typeLongUnicodeLexResult=
37+
| SurrogatePairofuint16*uint16
38+
| SingleCharofuint16
39+
| Invalid
40+
3641
valresetLexbufPos:string->UnicodeLexing.Lexbuf->unit
3742
valmkLexargs:'a* string list* LightSyntaxStatus* LexResourceManager* LexerIfdefStack* ErrorLogger-> lexargs
3843
val reusingLexbufForParsing: UnicodeLexing.Lexbuf->(unit-> 'a)-> 'a
@@ -43,6 +48,7 @@ val internal callStringFinisher : ('a -> 'b -> byte[] -> 'c) -> AbstractIL.Inter
4348
val internal addUnicodeString: AbstractIL.Internal.ByteBuffer-> string-> unit
4449
val internal addUnicodeChar: AbstractIL.Internal.ByteBuffer-> int-> unit
4550
val internal addByteChar: AbstractIL.Internal.ByteBuffer-> char-> unit
51+
val internal stringBufferAsString: byte[]-> string
4652
val internal stringBufferAsBytes: AbstractIL.Internal.ByteBuffer-> byte[]
4753
val internal stringBufferIsBytes: AbstractIL.Internal.ByteBuffer-> bool
4854
val internal newline: Lexing.LexBuffer<'a>-> unit
@@ -51,7 +57,7 @@ val internal digit : char -> int32
5157
val internal hexdigit: char-> int32
5258
val internal unicodeGraphShort: string-> uint16
5359
val internal hexGraphShort: string-> uint16
54-
val internal unicodeGraphLong: string->uint16 option* uint16
60+
val internal unicodeGraphLong: string->LongUnicodeLexResult
5561
val internal escape: char-> char
5662

5763
exception internal ReservedKeyword of string* Range.range

‎src/fsharp/vs/service.fs‎

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -952,7 +952,7 @@ type TypeCheckInfo
952952
|_-> None
953953

954954
match UntypedParseInfoImpl.TryGetCompletionContext(line, colAtEndOfNamesAndResidue, untypedParseInfoOpt)with
955-
| Some Invalid-> None
955+
| SomeCompletionContext.Invalid-> None
956956
| Some(CompletionContext.Inherit(InheritanceContext.Class,(plid,_)))->
957957
FindInEnv(plid,false)
958958
|> FilterRelevantItemsBy None GetBaseClassCandidates
Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
// #Regression #Conformance #LexicalAnalysis
2+
#light
3+
4+
// Verify error with malformed long unicode escape sequences
5+
6+
lettooBigForChar= '\U00024B62'
7+
letbogusString01="\U00110000"
8+
letbogusString02="\UFFFF0000"
9+
10+
exit1
11+
12+
//<Expects id="FS1159" span="(6,22-6,34)" status="error">This Unicode encoding is only valid in string literals</Expects>
13+
//<Expects id="FS1245" span="(7,21-7,33)" status="error">\\U00110000 is not a valid Unicode character escape sequence</Expects>
14+
//<Expects id="FS1245" span="(8,21-8,33)" status="error">\\UFFFF0000 is not a valid Unicode character escape sequence</Expects>
Lines changed: 25 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,29 @@
1-
// #Conformance #LexicalAnalysis
2-
#light
3-
41
// Test string literals with short Unicode Literals
52

6-
letunicodeString="\u2660\u2663\u2665\u2666"
7-
letexpectedResult="♠ ♣ ♥ ♦"
3+
let mutablefailure=false
4+
5+
letcheckStr(inputStr:string)expectedChars expectedStr=
6+
letcharCodes= inputStr.ToCharArray()|> Array.map int
7+
if charCodes<> expectedCharsthen
8+
printfn"Character encodings don't match"
9+
printfn" Expected%A" expectedChars
10+
printfn" Actual%A" charCodes
11+
false
12+
else
13+
match expectedStrwith
14+
| Some(exp)when exp<> inputStr->
15+
printfn"String representation doesn't match"
16+
printfn" Expected%s" exp
17+
printfn" Actual%s" inputStr
18+
false
19+
|_->true
820

9-
if unicodeString<> expectedResultthen exit1
21+
lettest(inputStr:string)expectedChars expectedStr=
22+
failure<-(checkStr inputStr expectedChars expectedStr)&& failure
23+
24+
test"\u2660\u2663\u2665\u2666"[|0x2660;0x2663;0x2665;0x2666|](Some("♠♣♥♦"))
25+
test"\uD800\uDBFF\uDC00\uDFFF"[|0xD800;32;0xDBFF;32;0xDC00;32;0xDFFF|] None
26+
test"\u0000\u0000\uFFFE\uFFFD\uFFFC"[|0;0;0xFFFE;0xFFFD;0xFFFC|] None
27+
test"\uD900\uD901\uD902"[|0xD900;0xD901;0xD902|] None
1028

11-
exit0
29+
exit(if failurethen1else0)
Lines changed: 27 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,31 @@
1-
// #Conformance #LexicalAnalysis
2-
#light
3-
41
// Test string literals with long Unicode Literals
52

6-
letunicodeString="\U00002660\U00002663\U00002665\U00002666"
7-
letexpectedResult="♠ ♣ ♥ ♦"
3+
let mutablefailure=false
4+
5+
letcheckStr(inputStr:string)expectedChars expectedStr=
6+
letcharCodes= inputStr.ToCharArray()|> Array.map int
7+
if charCodes<> expectedCharsthen
8+
printfn"Character encodings don't match"
9+
printfn" Expected%A" expectedChars
10+
printfn" Actual%A" charCodes
11+
false
12+
else
13+
match expectedStrwith
14+
| Some(exp)when exp<> inputStr->
15+
printfn"String representation doesn't match"
16+
printfn" Expected%s" exp
17+
printfn" Actual%s" inputStr
18+
false
19+
|_->true
820

9-
if unicodeString<> expectedResultthen exit1
21+
lettest(inputStr:string)expectedChars expectedStr=
22+
failure<-(checkStr inputStr expectedChars expectedStr)&& failure
23+
24+
test"\U00002660\U00002663\U00002665\U00002666"[|0x2660;0x2663;0x2665;0x2666|](Some("♠♣♥♦"))
25+
test"\U0000D800\U0000DBFF\U0000DC00\U0000DFFF"[|0xD800;32;0xDBFF;32;0xDC00;32;0xDFFF|] None
26+
test"\U00000000\U00000000\U0000FFFE\U0000FFFD\U0000FFFC"[|0;0;0xFFFE;0xFFFD;0xFFFC|] None
27+
test"\U0000D900\U0000D901\U0000D902"[|0xD900;0xD901;0xD902|] None
28+
test"\U00010437"[|0xD801;0xDC37;|](Some("𐐷"))
29+
test"\U00024B62"[|0xD852;0xDF62|](Some("𤭢"))
1030

11-
exit0
31+
exit(if failurethen1else0)

‎tests/fsharpqa/Source/Conformance/LexicalAnalysis/StringsAndCharacters/env.lst‎

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020

2121
SOURCE=UnicodeString01.fs# UnicodeString01.fs
2222
SOURCE=UnicodeString02.fs# UnicodeString02.fs
23+
SOURCE=E_BogusLongUnicodeEscape.fs SCFLAGS="--codepage:1252 --test:ErrorRanges"# E_BogusLongUnicodeEscape.fs
2324

2425
SOURCE=E_ByteStrUnicodeChar01.fs# E_ByteStrUnicodeChar01.fs
2526
SOURCE=E_ByteCharUnicodeChar01.fs# E_ByteCharUnicodeChar01.fs

‎tests/fsharpqa/Source/Conformance/PatternMatching/Simple/simplePatterns19.fs‎

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,10 +4,10 @@
44
// Pattern match long unicode literals
55

66
[<Literal>]
7-
letUnicodeString1="\U00000000\UFFFFFFFF"
7+
letUnicodeString1="\U00000000\U0002FFFF"
88

99
[<Literal>]
10-
letUnicodeString2="\U11111111\U22222222"
10+
letUnicodeString2="\U00101111\U000F2222"
1111

1212
lettestStr x=
1313
match xwith
@@ -18,9 +18,9 @@ let testStr x =
1818
if testStr"foo"<>0then exit1
1919

2020
if testStr UnicodeString1<>1then exit1
21-
if testStr"\U00000000\UFFFFFFFF"<>1then exit1
21+
if testStr"\U00000000\U0002FFFF"<>1then exit1
2222

2323
if testStr UnicodeString2<>2then exit1
24-
if testStr"\U11111111\U22222222"<>2then exit1
24+
if testStr"\U00101111\U000F2222"<>2then exit1
2525

2626
exit0

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp