Movatterモバイル変換

[0]ホーム
{-# LANGUAGE Trustworthy #-}
{-# LANGUAGE CPP, NoImplicitPrelude #-}
{-# LANGUAGE UnboxedTuples #-}
{-# OPTIONS_GHC -funbox-strict-fields #-}

-----------------------------------------------------------------------------
-- |
-- Module      :  GHC.IO.Encoding
-- Copyright   :  (c) The University of Glasgow, 2008-2009
-- License     :  see libraries/base/LICENSE
--
-- Maintainer  :  libraries@haskell.org
-- Stability   :  internal
-- Portability :  non-portable
--
-- Text codecs for I/O
--
-----------------------------------------------------------------------------

module GHC.IO.Encoding (
        BufferCodec(..), TextEncoding(..), TextEncoder, TextDecoder, CodingProgress(..),
        latin1, latin1_encode, latin1_decode,
        utf8, utf8_bom,
        utf16, utf16le, utf16be,
        utf32, utf32le, utf32be,
        initLocaleEncoding,
        getLocaleEncoding, getFileSystemEncoding, getForeignEncoding,
        setLocaleEncoding, setFileSystemEncoding, setForeignEncoding,
        char8,
        mkTextEncoding,
        argvEncoding
    ) where

import GHC.Base
import GHC.IO.Exception
import GHC.IO.Buffer
import GHC.IO.Encoding.Failure
import GHC.IO.Encoding.Types
#if !defined(mingw32_HOST_OS)
import qualified GHC.IO.Encoding.Iconv  as Iconv
#else
import qualified GHC.IO.Encoding.CodePage as CodePage
import Text.Read (reads)
#endif
import qualified GHC.IO.Encoding.Latin1 as Latin1
import qualified GHC.IO.Encoding.UTF8   as UTF8
import qualified GHC.IO.Encoding.UTF16  as UTF16
import qualified GHC.IO.Encoding.UTF32  as UTF32
import GHC.List
import GHC.Word

import Data.IORef
import Data.Char (toUpper)
import System.IO.Unsafe (unsafePerformIO)

-- -----------------------------------------------------------------------------

-- | The Latin1 (ISO8859-1) encoding.  This encoding maps bytes
-- directly to the first 256 Unicode code points, and is thus not a
-- complete Unicode encoding.  An attempt to write a character greater than
-- @\'\\255\'@ to a 'System.IO.Handle' using the 'latin1' encoding will result in an
-- error.
latin1 :: TextEncoding
latin1 = Latin1.latin1_checked

-- | The UTF-8 Unicode encoding
utf8 :: TextEncoding
utf8 = UTF8.utf8

-- | The UTF-8 Unicode encoding, with a byte-order-mark (BOM; the byte
-- sequence 0xEF 0xBB 0xBF).  This encoding behaves like 'utf8',
-- except that on input, the BOM sequence is ignored at the beginning
-- of the stream, and on output, the BOM sequence is prepended.
--
-- The byte-order-mark is strictly unnecessary in UTF-8, but is
-- sometimes used to identify the encoding of a file.
--
utf8_bom :: TextEncoding
utf8_bom = UTF8.utf8_bom

-- | The UTF-16 Unicode encoding (a byte-order-mark should be used to
-- indicate endianness).
utf16 :: TextEncoding
utf16 = UTF16.utf16

-- | The UTF-16 Unicode encoding (little-endian)
utf16le :: TextEncoding
utf16le = UTF16.utf16le

-- | The UTF-16 Unicode encoding (big-endian)
utf16be :: TextEncoding
utf16be = UTF16.utf16be

-- | The UTF-32 Unicode encoding (a byte-order-mark should be used to
-- indicate endianness).
utf32 :: TextEncoding
utf32 = UTF32.utf32

-- | The UTF-32 Unicode encoding (little-endian)
utf32le :: TextEncoding
utf32le = UTF32.utf32le

-- | The UTF-32 Unicode encoding (big-endian)
utf32be :: TextEncoding
utf32be = UTF32.utf32be

-- | The Unicode encoding of the current locale
--
-- @since 4.5.0.0
getLocaleEncoding :: IO TextEncoding
{-# NOINLINE getLocaleEncoding #-}

-- | The encoding of the current locale, but allowing arbitrary
-- undecodable bytes to be round-tripped through it.
--
-- Do not expect the encoding to be Unicode-compatible: it could appear to be ASCII or anything else.
--
-- This 'TextEncoding' is used to decode and encode command line arguments
-- and environment variables on non-Windows platforms.
--
-- On Windows, this encoding *should not* be used if possible because
-- the use of code pages is deprecated: Strings should be retrieved
-- via the "wide" W-family of UTF-16 APIs instead
--
-- @since 4.5.0.0
getFileSystemEncoding :: IO TextEncoding
{-# NOINLINE getFileSystemEncoding #-}

-- | The Unicode encoding of the current locale, but where undecodable
-- bytes are replaced with their closest visual match. Used for
-- the 'Foreign.C.String.CString' marshalling functions in "Foreign.C.String"
--
-- @since 4.5.0.0
getForeignEncoding :: IO TextEncoding
{-# NOINLINE getForeignEncoding #-}

-- | Set locale encoding for your program. The locale affects
-- how 'Char's are encoded and decoded when serialized to bytes: e. g.,
-- when you read or write files ('System.IO.readFile'', 'System.IO.writeFile')
-- or use standard input/output ('System.IO.getLine', 'System.IO.putStrLn').
-- For instance, if your program prints non-ASCII characters, it is prudent to execute
--
-- > setLocaleEncoding utf8
--
-- This is necessary, but not enough on Windows, where console is
-- a stateful device, which needs to be configured using
-- @System.Win32.Console.setConsoleOutputCP@ and restored back afterwards.
-- These intricacies are covered by
-- <https://hackage.haskell.org/package/code-page code-page> package,
-- which offers a crossplatform @System.IO.CodePage.withCodePage@ bracket.
--
-- Wrong locale encoding typically causes error messages like
-- "invalid argument (cannot decode byte sequence starting from ...)"
-- or "invalid argument (cannot encode character ...)".
--
-- @since 4.5.0.0
setLocaleEncoding :: TextEncoding -> IO ()
{-# NOINLINE setLocaleEncoding #-}

-- | @since 4.5.0.0
setFileSystemEncoding :: TextEncoding -> IO ()
{-# NOINLINE setFileSystemEncoding #-}

-- | @since 4.5.0.0
setForeignEncoding :: TextEncoding -> IO ()
{-# NOINLINE setForeignEncoding #-}

-- Each getter/setter pair is bound from a single 'mkGlobal' call, so both
-- halves of a pair operate on the same underlying 'IORef'.
(getLocaleEncoding, setLocaleEncoding)         = mkGlobal initLocaleEncoding
(getFileSystemEncoding, setFileSystemEncoding) = mkGlobal initFileSystemEncoding
(getForeignEncoding, setForeignEncoding)       = mkGlobal initForeignEncoding

-- | Allocate an 'IORef' holding the given initial value and return a pair
-- of actions: one that reads the current value and one that overwrites it.
--
-- The 'IORef' is allocated under 'unsafePerformIO'; the function is marked
-- NOINLINE (as are the bindings produced from it above).
mkGlobal :: a -> (IO a, a -> IO ())
mkGlobal x = unsafePerformIO $ do
    x_ref <- newIORef x
    return (readIORef x_ref, writeIORef x_ref)
{-# NOINLINE mkGlobal #-}

-- | @since 4.5.0.0
initLocaleEncoding, initFileSystemEncoding, initForeignEncoding :: TextEncoding
{-# NOINLINE initLocaleEncoding #-}
-- N.B. initLocaleEncoding is exported for use in System.IO.localeEncoding.
-- NOINLINE ensures that this result is shared.

#if defined(javascript_HOST_ARCH)
initLocaleEncoding     = utf8
initFileSystemEncoding = utf8
initForeignEncoding    = utf8
#elif !defined(mingw32_HOST_OS)
-- It is rather important that we don't just call Iconv.mkIconvEncoding here
-- because some iconvs (in particular GNU iconv) will brokenly UTF-8 encode
-- lone surrogates without complaint.
--
-- By going through our Haskell implementations of those encodings, we are
-- guaranteed to catch such errors.
--
-- FIXME: this is not a complete solution because if the locale encoding is one
-- which we don't have a Haskell-side decoder for, iconv might still ignore the
-- lone surrogate in the input.
initLocaleEncoding     = unsafePerformIO $ mkTextEncoding' ErrorOnCodingFailure Iconv.localeEncodingName
initFileSystemEncoding = unsafePerformIO $ mkTextEncoding' RoundtripFailure     Iconv.localeEncodingName
initForeignEncoding    = unsafePerformIO $ mkTextEncoding' IgnoreCodingFailure  Iconv.localeEncodingName
#else
initLocaleEncoding     = CodePage.localeEncoding
initFileSystemEncoding = CodePage.mkLocaleEncoding RoundtripFailure
initForeignEncoding    = CodePage.mkLocaleEncoding IgnoreCodingFailure
#endif

-- See Note [Windows Unicode Arguments] in rts/RtsFlags.c
-- On Windows we assume hs_init argv is in utf8 encoding.

-- | Internal encoding of argv
argvEncoding :: IO TextEncoding
#if defined(mingw32_HOST_OS)
argvEncoding = return utf8
#else
argvEncoding = getFileSystemEncoding
#endif

-- | An encoding in which Unicode code points are translated to bytes
-- by taking the code point modulo 256.  When decoding, bytes are
-- translated directly into the equivalent code point.
--
-- This encoding never fails in either direction.  However, encoding
-- discards information, so encode followed by decode is not the
-- identity.
--
-- @since 4.4.0.0
char8 :: TextEncoding
char8 = Latin1.latin1

-- | Look up the named Unicode encoding.  May fail with
--
--  * 'System.IO.Error.isDoesNotExistError' if the encoding is unknown
--
-- The set of known encodings is system-dependent, but includes at least:
--
--  * @UTF-8@
--
--  * @UTF-16@, @UTF-16BE@, @UTF-16LE@
--
--  * @UTF-32@, @UTF-32BE@, @UTF-32LE@
--
-- There is additional notation (borrowed from GNU iconv) for specifying
-- how illegal characters are handled:
--
--  * a suffix of @\/\/IGNORE@, e.g. @UTF-8\/\/IGNORE@, will cause
--    all illegal sequences on input to be ignored, and on output
--    will drop all code points that have no representation in the
--    target encoding.
--
--  * a suffix of @\/\/TRANSLIT@ will choose a replacement character
--    for illegal sequences or code points.
--
--  * a suffix of @\/\/ROUNDTRIP@ will use a PEP383-style escape mechanism
--    to represent any invalid bytes in the input as Unicode codepoints (specifically,
--    as lone surrogates, which are normally invalid in UTF-32).
--    Upon output, these special codepoints are detected and turned back into the
--    corresponding original byte.
--
--    In theory, this mechanism allows arbitrary data to be roundtripped via
--    a 'String' with no loss of data. In practice, there are two limitations
--    to be aware of:
--
--      1. This only stands a chance of working for an encoding which is an ASCII
--         superset, as for security reasons we refuse to escape any bytes smaller
--         than 128. Many encodings of interest are ASCII supersets (in particular,
--         you can assume that the locale encoding is an ASCII superset) but many
--         (such as UTF-16) are not.
--
--      2. If the underlying encoding is not itself roundtrippable, this mechanism
--         can fail. Roundtrippable encodings are those which have an injective mapping
--         into Unicode. Almost all encodings meet this criterion, but some do not. Notably,
--         Shift-JIS (CP932) and Big5 contain several different encodings of the same
--         Unicode codepoint.
--
-- On Windows, you can access supported code pages with the prefix
-- @CP@; for example, @\"CP1250\"@.
--
mkTextEncoding :: String -> IO TextEncoding
mkTextEncoding e = case mb_coding_failure_mode of
    Nothing  -> unknownEncodingErr e
    Just cfm -> mkTextEncoding' cfm enc
  where
    -- Split the request at the first '/': @enc@ is the bare encoding name,
    -- @suffix@ is either empty or the failure-mode suffix including its
    -- leading slashes.
    (enc, suffix) = span (/= '/') e
    mb_coding_failure_mode = case suffix of
        ""            -> Just ErrorOnCodingFailure
        "//IGNORE"    -> Just IgnoreCodingFailure
        "//TRANSLIT"  -> Just TransliterateCodingFailure
        "//ROUNDTRIP" -> Just RoundtripFailure
        _             -> Nothing

-- | Build the 'TextEncoding' for an already-split encoding name and
-- 'CodingFailureMode'.
--
-- The UTF family is matched on the name upper-cased with every @\'-\'@
-- removed; the ASCII and Latin1 aliases are matched against the name exactly
-- as given.  Anything else is delegated to the platform backend (code pages
-- on Windows, iconv elsewhere), and reported via 'unknownEncodingErr' if the
-- backend does not recognise it.
mkTextEncoding' :: CodingFailureMode -> String -> IO TextEncoding
mkTextEncoding' cfm enc =
  case [toUpper c | c <- enc, c /= '-'] of
  -- UTF-8 and friends we can handle ourselves
    "UTF8"    -> return $ UTF8.mkUTF8 cfm
    "UTF16"   -> return $ UTF16.mkUTF16 cfm
    "UTF16LE" -> return $ UTF16.mkUTF16le cfm
    "UTF16BE" -> return $ UTF16.mkUTF16be cfm
    "UTF32"   -> return $ UTF32.mkUTF32 cfm
    "UTF32LE" -> return $ UTF32.mkUTF32le cfm
    "UTF32BE" -> return $ UTF32.mkUTF32be cfm
    -- On AIX, we want to avoid iconv, because it is either
    -- a) totally broken, or b) non-reentrant, or c) actually works.
    -- Detecting b) is difficult as you'd have to trigger the reentrancy
    -- corruption.
    -- Therefore, on AIX, we handle the popular ASCII and latin1 encodings
    -- ourselves. For consistency, we do the same on other platforms.
    -- We use `mkLatin1_checked` instead of `mkLatin1`, since the latter
    -- completely ignores the CodingFailureMode (TEST=encoding005).
    _ | isAscii  -> return (Latin1.mkAscii cfm)
    _ | isLatin1 -> return (Latin1.mkLatin1_checked cfm)
#if defined(mingw32_HOST_OS)
    'C':'P':n | [(cp,"")] <- reads n -> return $ CodePage.mkCodePageEncoding cfm cp
    _ -> unknownEncodingErr (enc ++ codingFailureModeSuffix cfm)
#else
    -- Otherwise, handle other encoding needs via iconv.

    -- Unfortunately there is no good way to determine whether iconv is actually
    -- functional without telling it to do something.
    _ -> do res <- Iconv.mkIconvEncoding cfm enc
            case res of
              Just e  -> return e
              Nothing -> unknownEncodingErr (enc ++ codingFailureModeSuffix cfm)
#endif
  where
    isAscii  = enc `elem` asciiEncNames
    isLatin1 = enc `elem` latin1EncNames
    asciiEncNames = -- ASCII aliases specified by RFC 1345 and RFC 3808.
      [ "ANSI_X3.4-1968", "iso-ir-6", "ANSI_X3.4-1986", "ISO_646.irv:1991"
      , "US-ASCII", "us", "IBM367", "cp367", "csASCII", "ASCII", "ISO646-US" ]
    latin1EncNames = -- latin1 aliases specified by RFC 1345 and RFC 3808.
      [ "ISO_8859-1:1987", "iso-ir-100", "ISO_8859-1", "ISO-8859-1", "latin1"
      , "l1", "IBM819", "CP819", "csISOLatin1" ]

-- | Run 'Latin1.latin1_encode' on the given buffers as an 'IO' action,
-- returning the resulting input and output buffers and discarding the
-- reported 'CodingProgress'.
latin1_encode :: CharBuffer -> Buffer Word8 -> IO (CharBuffer, Buffer Word8)
latin1_encode input output = IO $ \st ->
  case Latin1.latin1_encode input output st of
    (# st', _why, input', output' #) -> (# st', (input', output') #) -- unchecked, used for char8
--latin1_encode = unsafePerformIO $ do mkTextEncoder Iconv.latin1 >>= return.encode

-- | Run 'Latin1.latin1_decode' on the given buffers as an 'IO' action,
-- returning the resulting input and output buffers and discarding the
-- reported 'CodingProgress'.
latin1_decode :: Buffer Word8 -> CharBuffer -> IO (Buffer Word8, CharBuffer)
latin1_decode input output = IO $ \st ->
  case Latin1.latin1_decode input output st of
    (# st', _why, input', output' #) -> (# st', (input', output') #)
--latin1_decode = unsafePerformIO $ do mkTextDecoder Iconv.latin1 >>= return.encode

-- | Throw an 'IOError' of type 'NoSuchThing', attributed to
-- @mkTextEncoding@, whose description names the unrecognised encoding.
unknownEncodingErr :: String -> IO a
unknownEncodingErr e =
    ioException (IOError Nothing NoSuchThing "mkTextEncoding"
                         ("unknown encoding:" ++ e) Nothing Nothing)
[8]ページ先頭