Movatterモバイル変換
[0]ホーム
{-# LANGUAGE Trustworthy #-}{-# LANGUAGE NoImplicitPrelude , BangPatterns , NondecreasingIndentation , MagicHash #-}{-# OPTIONS_GHC -funbox-strict-fields #-}------------------------------------------------------------------------------- |-- Module : GHC.IO.Encoding.UTF8-- Copyright : (c) The University of Glasgow, 2009-- License : see libraries/base/LICENSE---- Maintainer : libraries@haskell.org-- Stability : internal-- Portability : non-portable---- UTF-8 Codec for the IO library---- Portions Copyright : (c) Tom Harper 2008-2009,-- (c) Bryan O'Sullivan 2009,-- (c) Duncan Coutts 2009-------------------------------------------------------------------------------moduleGHC.IO.Encoding.UTF8(utf8,mkUTF8,utf8_bom,mkUTF8_bom)whereimportGHC.BaseimportGHC.RealimportGHC.NumimportGHC.IORef-- import GHC.IOimportGHC.IO.BufferimportGHC.IO.Encoding.FailureimportGHC.IO.Encoding.TypesimportGHC.WordimportData.Bitsutf8::TextEncodingutf8=mkUTF8ErrorOnCodingFailure-- | @since 4.4.0.0mkUTF8::CodingFailureMode->TextEncodingmkUTF8cfm=TextEncoding{textEncodingName="UTF-8",mkTextDecoder=utf8_DFcfm,mkTextEncoder=utf8_EFcfm}utf8_DF::CodingFailureMode->IO(TextDecoder())utf8_DFcfm=return(BufferCodec{encode=utf8_decode,recover=recoverDecodecfm,close=return(),getState=return(),setState=const$return()})utf8_EF::CodingFailureMode->IO(TextEncoder())utf8_EFcfm=return(BufferCodec{encode=utf8_encode,recover=recoverEncodecfm,close=return(),getState=return(),setState=const$return()})utf8_bom::TextEncodingutf8_bom=mkUTF8_bomErrorOnCodingFailuremkUTF8_bom::CodingFailureMode->TextEncodingmkUTF8_bomcfm=TextEncoding{textEncodingName="UTF-8BOM",mkTextDecoder=utf8_bom_DFcfm,mkTextEncoder=utf8_bom_EFcfm}utf8_bom_DF::CodingFailureMode->IO(TextDecoderBool)utf8_bom_DFcfm=doref<-newIORefTruereturn(BufferCodec{encode=utf8_bom_decoderef,recover=recoverDecodecfm,close=return(),getState=readIORefref,setState=writeIORefref})utf8_bom_EF::CodingFailureMode->IO(TextEncoderBool)utf8_bom_EFcfm=doref<-newIORefTruereturn(BufferCodec{encode=utf8_bom_encoderef,recover=recoverEncodecfm,close=return(),getState=readIORefref,setState=writeIORefref})utf8_bom_decode::IORefBool->DecodeBufferutf8_bom_decoderefinput@Buffer{bufRaw=iraw,bufL=ir,bufR=iw,bufSize=_}output=dofirst<-readIORefrefifnotfirstthenutf8_decodeinputoutputelsedoletno_bom=dowriteIORefrefFalse;utf8_decodeinputoutputifiw-ir<1thenreturn(InputUnderflow,input,output)elsedoc0<-readWord8Bufirawirif(c0/=bom0)thenno_bomelsedoifiw-ir<2thenreturn(InputUnderflow,input,output)elsedoc1<-readWord8Bufiraw(ir+1)if(c1/=bom1)thenno_bomelsedoifiw-ir<3thenreturn(InputUnderflow,input,output)elsedoc2<-readWord8Bufiraw(ir+2)if(c2/=bom2)thenno_bomelsedo-- found a BOM, ignore it and carry onwriteIORefrefFalseutf8_decodeinput{bufL=ir+3}outpututf8_bom_encode::IORefBool->EncodeBufferutf8_bom_encoderefinputoutput@Buffer{bufRaw=oraw,bufL=_,bufR=ow,bufSize=os}=dob<-readIORefrefifnotbthenutf8_encodeinputoutputelseifos-ow<3thenreturn(OutputUnderflow,input,output)elsedowriteIORefrefFalsewriteWord8Buforawowbom0writeWord8Buforaw(ow+1)bom1writeWord8Buforaw(ow+2)bom2utf8_encodeinputoutput{bufR=ow+3}bom0,bom1,bom2::Word8bom0=0xefbom1=0xbbbom2=0xbfutf8_decode::DecodeBufferutf8_decodeinput@Buffer{bufRaw=iraw,bufL=ir0,bufR=iw,bufSize=_}output@Buffer{bufRaw=oraw,bufL=_,bufR=ow0,bufSize=os}=letloop!ir!ow|ow>=os=doneOutputUnderflowirow|ir>=iw=doneInputUnderflowirow|otherwise=doc0<-readWord8Bufirawircasec0of_|c0<=0x7f->doow'<-writeCharBuforawow(unsafeChr(fromIntegralc0))loop(ir+1)ow'|c0>=0xc0&&c0<=0xc1->invalid-- Overlong forms|c0>=0xc2&&c0<=0xdf->ifiw-ir<2thendoneInputUnderflowirowelsedoc1<-readWord8Bufiraw(ir+1)if(c1<0x80||c1>=0xc0)theninvalidelsedoow'<-writeCharBuforawow(chr2c0c1)loop(ir+2)ow'|c0>=0xe0&&c0<=0xef->caseiw-irof1->doneInputUnderflowirow2->do-- check for an error even when we don't have-- the full sequence yet (#3341)c1<-readWord8Bufiraw(ir+1)ifnot(validate3c0c10x80)theninvalidelsedoneInputUnderflowirow_->doc1<-readWord8Bufiraw(ir+1)c2<-readWord8Bufiraw(ir+2)ifnot(validate3c0c1c2)theninvalidelsedoow'<-writeCharBuforawow(chr3c0c1c2)loop(ir+3)ow'|c0>=0xf0->caseiw-irof1->doneInputUnderflowirow2->do-- check for an error even when we don't have-- the full sequence yet (#3341)c1<-readWord8Bufiraw(ir+1)ifnot(validate4c0c10x800x80)theninvalidelsedoneInputUnderflowirow3->doc1<-readWord8Bufiraw(ir+1)c2<-readWord8Bufiraw(ir+2)ifnot(validate4c0c1c20x80)theninvalidelsedoneInputUnderflowirow_->doc1<-readWord8Bufiraw(ir+1)c2<-readWord8Bufiraw(ir+2)c3<-readWord8Bufiraw(ir+3)ifnot(validate4c0c1c2c3)theninvalidelsedoow'<-writeCharBuforawow(chr4c0c1c2c3)loop(ir+4)ow'|otherwise->invalidwhereinvalid=doneInvalidSequenceirow-- lambda-lifted, to avoid thunks being built in the inner-loop:donewhy!ir!ow=return(why,ifir==iwtheninput{bufL=0,bufR=0}elseinput{bufL=ir},output{bufR=ow})inloopir0ow0utf8_encode::EncodeBufferutf8_encodeinput@Buffer{bufRaw=iraw,bufL=ir0,bufR=iw,bufSize=_}output@Buffer{bufRaw=oraw,bufL=_,bufR=ow0,bufSize=os}=letdonewhy!ir!ow=return(why,ifir==iwtheninput{bufL=0,bufR=0}elseinput{bufL=ir},output{bufR=ow})loop!ir!ow|ow>=os=doneOutputUnderflowirow|ir>=iw=doneInputUnderflowirow|otherwise=do(c,ir')<-readCharBufirawircaseordcofx|x<=0x7F->dowriteWord8Buforawow(fromIntegralx)loopir'(ow+1)|x<=0x07FF->ifos-ow<2thendoneOutputUnderflowirowelsedolet(c1,c2)=ord2cwriteWord8Buforawowc1writeWord8Buforaw(ow+1)c2loopir'(ow+2)|x<=0xFFFF->ifisSurrogatecthendoneInvalidSequenceirowelsedoifos-ow<3thendoneOutputUnderflowirowelsedolet(c1,c2,c3)=ord3cwriteWord8Buforawowc1writeWord8Buforaw(ow+1)c2writeWord8Buforaw(ow+2)c3loopir'(ow+3)|otherwise->doifos-ow<4thendoneOutputUnderflowirowelsedolet(c1,c2,c3,c4)=ord4cwriteWord8Buforawowc1writeWord8Buforaw(ow+1)c2writeWord8Buforaw(ow+2)c3writeWord8Buforaw(ow+3)c4loopir'(ow+4)inloopir0ow0-- ------------------------------------------------------------------------------- UTF-8 primitives, lifted from Data.Text.Fusion.Utf8ord2::Char->(Word8,Word8)ord2c=assert(n>=0x80&&n<=0x07ff)(x1,x2)wheren=ordcx1=fromIntegral$(n`shiftR`6)+0xC0x2=fromIntegral$(n.&.0x3F)+0x80ord3::Char->(Word8,Word8,Word8)ord3c=assert(n>=0x0800&&n<=0xffff)(x1,x2,x3)wheren=ordcx1=fromIntegral$(n`shiftR`12)+0xE0x2=fromIntegral$((n`shiftR`6).&.0x3F)+0x80x3=fromIntegral$(n.&.0x3F)+0x80ord4::Char->(Word8,Word8,Word8,Word8)ord4c=assert(n>=0x10000)(x1,x2,x3,x4)wheren=ordcx1=fromIntegral$(n`shiftR`18)+0xF0x2=fromIntegral$((n`shiftR`12).&.0x3F)+0x80x3=fromIntegral$((n`shiftR`6).&.0x3F)+0x80x4=fromIntegral$(n.&.0x3F)+0x80chr2::Word8->Word8->Charchr2(W8#x1#)(W8#x2#)=C#(chr#(z1#+#z2#))where!y1#=word2Int#x1#!y2#=word2Int#x2#!z1#=uncheckedIShiftL#(y1#-#0xC0#)6#!z2#=y2#-#0x80#{-# INLINEchr2#-}chr3::Word8->Word8->Word8->Charchr3(W8#x1#)(W8#x2#)(W8#x3#)=C#(chr#(z1#+#z2#+#z3#))where!y1#=word2Int#x1#!y2#=word2Int#x2#!y3#=word2Int#x3#!z1#=uncheckedIShiftL#(y1#-#0xE0#)12#!z2#=uncheckedIShiftL#(y2#-#0x80#)6#!z3#=y3#-#0x80#{-# INLINEchr3#-}chr4::Word8->Word8->Word8->Word8->Charchr4(W8#x1#)(W8#x2#)(W8#x3#)(W8#x4#)=C#(chr#(z1#+#z2#+#z3#+#z4#))where!y1#=word2Int#x1#!y2#=word2Int#x2#!y3#=word2Int#x3#!y4#=word2Int#x4#!z1#=uncheckedIShiftL#(y1#-#0xF0#)18#!z2#=uncheckedIShiftL#(y2#-#0x80#)12#!z3#=uncheckedIShiftL#(y3#-#0x80#)6#!z4#=y4#-#0x80#{-# INLINEchr4#-}between::Word8-- ^ byte to check->Word8-- ^ lower bound->Word8-- ^ upper bound->Boolbetweenxyz=x>=y&&x<=z{-# INLINEbetween#-}validate3::Word8->Word8->Word8->Bool{-# INLINEvalidate3#-}validate3x1x2x3=validate3_1||validate3_2||validate3_3||validate3_4wherevalidate3_1=(x1==0xE0)&&betweenx20xA00xBF&&betweenx30x800xBFvalidate3_2=betweenx10xE10xEC&&betweenx20x800xBF&&betweenx30x800xBFvalidate3_3=x1==0xED&&betweenx20x800x9F&&betweenx30x800xBFvalidate3_4=betweenx10xEE0xEF&&betweenx20x800xBF&&betweenx30x800xBFvalidate4::Word8->Word8->Word8->Word8->Bool{-# INLINEvalidate4#-}validate4x1x2x3x4=validate4_1||validate4_2||validate4_3wherevalidate4_1=x1==0xF0&&betweenx20x900xBF&&betweenx30x800xBF&&betweenx40x800xBFvalidate4_2=betweenx10xF10xF3&&betweenx20x800xBF&&betweenx30x800xBF&&betweenx40x800xBFvalidate4_3=x1==0xF4&&betweenx20x800x8F&&betweenx30x800xBF&&betweenx40x800xBF
[8]ページ先頭