diff options
author | John MacFarlane <fiddlosopher@gmail.com> | 2013-02-08 09:45:15 -0800 |
---|---|---|
committer | John MacFarlane <fiddlosopher@gmail.com> | 2013-02-08 09:45:25 -0800 |
commit | 3465ba2f61872566e221235fba58001bbc2a3b67 (patch) | |
tree | fa62e78da0ad97ddc1c6030a5d31748cce3e2683 | |
parent | ec5dc431def8fd7aa43dab1f59fb20dcd2933d16 (diff) | |
download | pandoc-3465ba2f61872566e221235fba58001bbc2a3b67.tar.gz |
UTF8: Strip off BOM if present.
Closes #743.
-rw-r--r-- | src/Text/Pandoc/UTF8.hs | 11 |
1 file changed, 9 insertions, 2 deletions
```diff
diff --git a/src/Text/Pandoc/UTF8.hs b/src/Text/Pandoc/UTF8.hs
index 582afb6dc..a6921fc8f 100644
--- a/src/Text/Pandoc/UTF8.hs
+++ b/src/Text/Pandoc/UTF8.hs
@@ -86,10 +86,17 @@ hGetContents = fmap toStringLazy . BL.hGetContents
 -- >> hSetNewlineMode h universalNewlineMode
 -- >> IO.hGetContents h
 
+-- | Drop BOM (byte order marker) if present at beginning of string.
+-- Note that Data.Text converts the BOM to code point FEFF, zero-width
+-- no-break space, so if the string begins with this we strip it off.
+dropBOM :: String -> String
+dropBOM ('\xFEFF':xs) = xs
+dropBOM xs = xs
+
 -- | Convert UTF8-encoded ByteString to String, also
 -- removing '\r' characters.
 toString :: B.ByteString -> String
-toString = filter (/='\r') . T.unpack . T.decodeUtf8
+toString = filter (/='\r') . dropBOM . T.unpack . T.decodeUtf8
 
 fromString :: String -> B.ByteString
 fromString = T.encodeUtf8 . T.pack
@@ -97,7 +104,7 @@ fromString = T.encodeUtf8 . T.pack
 -- | Convert UTF8-encoded ByteString to String, also
 -- removing '\r' characters.
 toStringLazy :: BL.ByteString -> String
-toStringLazy = filter (/='\r') . TL.unpack . TL.decodeUtf8
+toStringLazy = filter (/='\r') . dropBOM . TL.unpack . TL.decodeUtf8
 
 fromStringLazy :: String -> BL.ByteString
 fromStringLazy = TL.encodeUtf8 . TL.pack
```