Movatterモバイル変換
[0]ホーム
{-# LANGUAGE Trustworthy #-}
{-# LANGUAGE CPP, NoImplicitPrelude #-}
{-# OPTIONS_GHC -funbox-strict-fields #-}

-----------------------------------------------------------------------------
-- |
-- Module      :  GHC.IO.Encoding
-- Copyright   :  (c) The University of Glasgow, 2008-2009
-- License     :  see libraries/base/LICENSE
--
-- Maintainer  :  libraries@haskell.org
-- Stability   :  internal
-- Portability :  non-portable
--
-- Text codecs for I/O
--
-----------------------------------------------------------------------------

module GHC.IO.Encoding (
        BufferCodec(..), TextEncoding(..), TextEncoder, TextDecoder, CodingProgress(..),
        latin1, latin1_encode, latin1_decode,
        utf8, utf8_bom,
        utf16, utf16le, utf16be,
        utf32, utf32le, utf32be,
        initLocaleEncoding,
        getLocaleEncoding, getFileSystemEncoding, getForeignEncoding,
        setLocaleEncoding, setFileSystemEncoding, setForeignEncoding,
        char8,
        mkTextEncoding,
        argvEncoding
    ) where

import GHC.Base
import GHC.IO.Exception
import GHC.IO.Buffer
import GHC.IO.Encoding.Failure
import GHC.IO.Encoding.Types
#if !defined(mingw32_HOST_OS)
import qualified GHC.IO.Encoding.Iconv  as Iconv
#else
import qualified GHC.IO.Encoding.CodePage as CodePage
import Text.Read (reads)
#endif
import qualified GHC.IO.Encoding.Latin1 as Latin1
import qualified GHC.IO.Encoding.UTF8   as UTF8
import qualified GHC.IO.Encoding.UTF16  as UTF16
import qualified GHC.IO.Encoding.UTF32  as UTF32
import GHC.List
import GHC.Word

import Data.IORef
import Data.Char (toUpper)
import System.IO.Unsafe (unsafePerformIO)

-- -----------------------------------------------------------------------------

-- | The Latin1 (ISO8859-1) encoding.  This encoding maps bytes
-- directly to the first 256 Unicode code points, and is thus not a
-- complete Unicode encoding.  An attempt to write a character greater than
-- '\255' to a 'Handle' using the 'latin1' encoding will result in an error.
latin1 :: TextEncoding
latin1 = Latin1.latin1_checked

-- | The UTF-8 Unicode encoding
utf8 :: TextEncoding
utf8 = UTF8.utf8

-- | The UTF-8 Unicode encoding, with a byte-order-mark (BOM; the byte
-- sequence 0xEF 0xBB 0xBF).  This encoding behaves like 'utf8',
-- except that on input, the BOM sequence is ignored at the beginning
-- of the stream, and on output, the BOM sequence is prepended.
--
-- The byte-order-mark is strictly unnecessary in UTF-8, but is
-- sometimes used to identify the encoding of a file.
--
utf8_bom :: TextEncoding
utf8_bom = UTF8.utf8_bom

-- | The UTF-16 Unicode encoding (a byte-order-mark should be used to
-- indicate endianness).
utf16 :: TextEncoding
utf16 = UTF16.utf16

-- | The UTF-16 Unicode encoding (little-endian)
utf16le :: TextEncoding
utf16le = UTF16.utf16le

-- | The UTF-16 Unicode encoding (big-endian)
utf16be :: TextEncoding
utf16be = UTF16.utf16be

-- | The UTF-32 Unicode encoding (a byte-order-mark should be used to
-- indicate endianness).
utf32 :: TextEncoding
utf32 = UTF32.utf32

-- | The UTF-32 Unicode encoding (little-endian)
utf32le :: TextEncoding
utf32le = UTF32.utf32le

-- | The UTF-32 Unicode encoding (big-endian)
utf32be :: TextEncoding
utf32be = UTF32.utf32be

-- | The Unicode encoding of the current locale
--
-- @since 4.5.0.0
getLocaleEncoding :: IO TextEncoding

-- | The Unicode encoding of the current locale, but allowing arbitrary
-- undecodable bytes to be round-tripped through it.
--
-- This 'TextEncoding' is used to decode and encode command line arguments
-- and environment variables on non-Windows platforms.
--
-- On Windows, this encoding *should not* be used if possible because
-- the use of code pages is deprecated: Strings should be retrieved
-- via the "wide" W-family of UTF-16 APIs instead
--
-- @since 4.5.0.0
getFileSystemEncoding :: IO TextEncoding

-- | The Unicode encoding of the current locale, but where undecodable
-- bytes are replaced with their closest visual match. Used for
-- the 'CString' marshalling functions in "Foreign.C.String"
--
-- @since 4.5.0.0
getForeignEncoding :: IO TextEncoding

-- | @since 4.5.0.0
setLocaleEncoding, setFileSystemEncoding, setForeignEncoding :: TextEncoding -> IO ()

-- Each getter/setter pair shares one mutable cell created by 'mkGlobal',
-- seeded with the corresponding @init*Encoding@ value.
(getLocaleEncoding, setLocaleEncoding)         = mkGlobal initLocaleEncoding
(getFileSystemEncoding, setFileSystemEncoding) = mkGlobal initFileSystemEncoding
(getForeignEncoding, setForeignEncoding)       = mkGlobal initForeignEncoding

-- | Allocate an 'IORef' holding the given initial value and return a
-- reader action and a writer function that both operate on that same
-- reference.
--
-- The allocation is performed with 'unsafePerformIO', so this function is
-- marked @NOINLINE@ as recommended by the "System.IO.Unsafe" documentation
-- for any definition that calls 'unsafePerformIO'.
mkGlobal :: a -> (IO a, a -> IO ())
mkGlobal x = unsafePerformIO $ do
    x_ref <- newIORef x
    return (readIORef x_ref, writeIORef x_ref)
{-# NOINLINE mkGlobal #-}

-- | @since 4.5.0.0
initLocaleEncoding, initFileSystemEncoding, initForeignEncoding :: TextEncoding
-- N.B. On non-Windows platforms these are built with 'unsafePerformIO';
-- NOINLINE keeps each one a single shared top-level value instead of letting
-- the I/O be duplicated at use sites ('initLocaleEncoding' is exported).
{-# NOINLINE initLocaleEncoding #-}
{-# NOINLINE initFileSystemEncoding #-}
{-# NOINLINE initForeignEncoding #-}

#if !defined(mingw32_HOST_OS)
-- It is rather important that we don't just call Iconv.mkIconvEncoding here
-- because some iconvs (in particular GNU iconv) will brokenly UTF-8 encode
-- lone surrogates without complaint.
--
-- By going through our Haskell implementations of those encodings, we are
-- guaranteed to catch such errors.
--
-- FIXME: this is not a complete solution because if the locale encoding is one
-- which we don't have a Haskell-side decoder for, iconv might still ignore the
-- lone surrogate in the input.
initLocaleEncoding     = unsafePerformIO $ mkTextEncoding' ErrorOnCodingFailure Iconv.localeEncodingName
initFileSystemEncoding = unsafePerformIO $ mkTextEncoding' RoundtripFailure     Iconv.localeEncodingName
initForeignEncoding    = unsafePerformIO $ mkTextEncoding' IgnoreCodingFailure  Iconv.localeEncodingName
#else
initLocaleEncoding     = CodePage.localeEncoding
initFileSystemEncoding = CodePage.mkLocaleEncoding RoundtripFailure
initForeignEncoding    = CodePage.mkLocaleEncoding IgnoreCodingFailure
#endif

-- See Note [Windows Unicode Arguments] in rts/RtsFlags.c
-- On Windows we assume hs_init argv is in utf8 encoding.

-- | Internal encoding of argv
argvEncoding :: IO TextEncoding
#if defined(mingw32_HOST_OS)
argvEncoding = return utf8
#else
argvEncoding = getFileSystemEncoding
#endif

-- | An encoding in which Unicode code points are translated to bytes
-- by taking the code point modulo 256.  When decoding, bytes are
-- translated directly into the equivalent code point.
--
-- This encoding never fails in either direction.  However, encoding
-- discards information, so encode followed by decode is not the
-- identity.
--
-- @since 4.4.0.0
char8 :: TextEncoding
char8 = Latin1.latin1

-- | Look up the named Unicode encoding.  May fail with
--
--  * 'isDoesNotExistError' if the encoding is unknown
--
-- The set of known encodings is system-dependent, but includes at least:
--
--  * @UTF-8@
--
--  * @UTF-16@, @UTF-16BE@, @UTF-16LE@
--
--  * @UTF-32@, @UTF-32BE@, @UTF-32LE@
--
-- There is additional notation (borrowed from GNU iconv) for specifying
-- how illegal characters are handled:
--
--  * a suffix of @\/\/IGNORE@, e.g. @UTF-8\/\/IGNORE@, will cause
--    all illegal sequences on input to be ignored, and on output
--    will drop all code points that have no representation in the
--    target encoding.
--
--  * a suffix of @\/\/TRANSLIT@ will choose a replacement character
--    for illegal sequences or code points.
--
--  * a suffix of @\/\/ROUNDTRIP@ will use a PEP383-style escape mechanism
--    to represent any invalid bytes in the input as Unicode codepoints (specifically,
--    as lone surrogates, which are normally invalid in UTF-32).
--    Upon output, these special codepoints are detected and turned back into the
--    corresponding original byte.
--
--    In theory, this mechanism allows arbitrary data to be roundtripped via
--    a 'String' with no loss of data. In practice, there are two limitations
--    to be aware of:
--
--      1. This only stands a chance of working for an encoding which is an ASCII
--         superset, as for security reasons we refuse to escape any bytes smaller
--         than 128. Many encodings of interest are ASCII supersets (in particular,
--         you can assume that the locale encoding is an ASCII superset) but many
--         (such as UTF-16) are not.
--
--      2. If the underlying encoding is not itself roundtrippable, this mechanism
--         can fail. Roundtrippable encodings are those which have an injective mapping
--         into Unicode. Almost all encodings meet this criterion, but some do not. Notably,
--         Shift-JIS (CP932) and Big5 contain several different encodings of the same
--         Unicode codepoint.
--
-- On Windows, you can access supported code pages with the prefix
-- @CP@; for example, @\"CP1250\"@.
--
mkTextEncoding :: String -> IO TextEncoding
mkTextEncoding e = case mb_coding_failure_mode of
    Nothing  -> unknownEncodingErr e
    Just cfm -> mkTextEncoding' cfm enc
  where
    -- Everything before the first '/' is the encoding name; the remainder
    -- (possibly empty) is the failure-mode suffix.
    (enc, suffix) = span (/= '/') e
    mb_coding_failure_mode = case suffix of
        ""            -> Just ErrorOnCodingFailure
        "//IGNORE"    -> Just IgnoreCodingFailure
        "//TRANSLIT"  -> Just TransliterateCodingFailure
        "//ROUNDTRIP" -> Just RoundtripFailure
        _             -> Nothing

-- Build the encoding called @enc@ with the given failure mode.
--
-- The UTF family is matched after upper-casing the name and dropping every
-- @-@, so e.g. @utf-8@ and @UTF8@ both select the Haskell UTF-8 codec.  The
-- ASCII and Latin-1 aliases, by contrast, are compared against the name
-- exactly as supplied (case-sensitively).  Any other name is handed to the
-- platform back end (code pages on Windows, iconv elsewhere); if that fails
-- the result is 'unknownEncodingErr'.
mkTextEncoding' :: CodingFailureMode -> String -> IO TextEncoding
mkTextEncoding' cfm enc =
  case [toUpper c | c <- enc, c /= '-'] of
    -- UTF-8 and friends we can handle ourselves
    "UTF8"    -> return $ UTF8.mkUTF8 cfm
    "UTF16"   -> return $ UTF16.mkUTF16 cfm
    "UTF16LE" -> return $ UTF16.mkUTF16le cfm
    "UTF16BE" -> return $ UTF16.mkUTF16be cfm
    "UTF32"   -> return $ UTF32.mkUTF32 cfm
    "UTF32LE" -> return $ UTF32.mkUTF32le cfm
    "UTF32BE" -> return $ UTF32.mkUTF32be cfm
    -- On AIX, we want to avoid iconv, because it is either
    -- a) totally broken, or b) non-reentrant, or c) actually works.
    -- Detecting b) is difficult as you'd have to trigger the reentrancy
    -- corruption.
    -- Therefore, on AIX, we handle the popular ASCII and latin1 encodings
    -- ourselves. For consistency, we do the same on other platforms.
    -- We use `mkLatin1_checked` instead of `mkLatin1`, since the latter
    -- completely ignores the CodingFailureMode (TEST=encoding005).
    _ | isAscii  -> return (Latin1.mkAscii cfm)
    _ | isLatin1 -> return (Latin1.mkLatin1_checked cfm)
#if defined(mingw32_HOST_OS)
    'C':'P':n | [(cp,"")] <- reads n -> return $ CodePage.mkCodePageEncoding cfm cp
    _ -> unknownEncodingErr (enc ++ codingFailureModeSuffix cfm)
#else
    -- Otherwise, handle other encoding needs via iconv.

    -- Unfortunately there is no good way to determine whether iconv is actually
    -- functional without telling it to do something.
    _ -> do res <- Iconv.mkIconvEncoding cfm enc
            case res of
              Just e  -> return e
              Nothing -> unknownEncodingErr (enc ++ codingFailureModeSuffix cfm)
#endif
  where
    isAscii = enc `elem` asciiEncNames
    isLatin1 = enc `elem` latin1EncNames
    asciiEncNames = -- ASCII aliases specified by RFC 1345 and RFC 3808.
      [ "ANSI_X3.4-1968", "iso-ir-6", "ANSI_X3.4-1986", "ISO_646.irv:1991"
      , "US-ASCII", "us", "IBM367", "cp367", "csASCII", "ASCII", "ISO646-US" ]
    latin1EncNames = -- latin1 aliases specified by RFC 1345 and RFC 3808.
      [ "ISO_8859-1:1987", "iso-ir-100", "ISO_8859-1", "ISO-8859-1", "latin1"
      , "l1", "IBM819", "CP819", "csISOLatin1" ]

-- | Run 'Latin1.latin1_encode' (the unchecked encoder, used for 'char8') on
-- the given buffers and return only the resulting input and output buffers,
-- dropping the first component of its result triple.
latin1_encode :: CharBuffer -> Buffer Word8 -> IO (CharBuffer, Buffer Word8)
latin1_encode input output =
    fmap (\(_why,input',output') -> (input',output')) $
      Latin1.latin1_encode input output

-- | Run 'Latin1.latin1_decode' on the given buffers and return only the
-- resulting input and output buffers, dropping the first component of its
-- result triple.
latin1_decode :: Buffer Word8 -> CharBuffer -> IO (Buffer Word8, CharBuffer)
latin1_decode input output =
    fmap (\(_why,input',output') -> (input',output')) $
      Latin1.latin1_decode input output

-- Raise the 'IOError' used to report an unrecognised encoding name: a
-- 'NoSuchThing' error located at @mkTextEncoding@ whose description is
-- @"unknown encoding:"@ followed directly by the supplied name.
unknownEncodingErr :: String -> IO a
unknownEncodingErr e =
    ioException (IOError Nothing NoSuchThing "mkTextEncoding"
                         ("unknown encoding:" ++ e) Nothing Nothing)
[8]ページ先頭