UTF8: Strip off BOM if present.

Closes #743.
author: John MacFarlane <fiddlosopher@gmail.com> 2013-02-08 09:45:15 -0800
committer: John MacFarlane <fiddlosopher@gmail.com> 2013-02-08 09:45:25 -0800
commit: 3465ba2f61872566e221235fba58001bbc2a3b67 (patch)
tree: fa62e78da0ad97ddc1c6030a5d31748cce3e2683 /src/Text
parent: ec5dc431def8fd7aa43dab1f59fb20dcd2933d16 (diff)
download: pandoc-3465ba2f61872566e221235fba58001bbc2a3b67.tar.gz
1 files changed, 9 insertions, 2 deletions
diff --git a/src/Text/Pandoc/UTF8.hs b/src/Text/Pandoc/UTF8.hs
index 582afb6dc..a6921fc8f 100644
--- a/src/Text/Pandoc/UTF8.hs
+++ b/src/Text/Pandoc/UTF8.hs
@@ -86,10 +86,17 @@ hGetContents = fmap toStringLazy . BL.hGetContents
 --                   >> hSetNewlineMode h universalNewlineMode
 --                   >> IO.hGetContents h
 
+-- | Drop BOM (byte order marker) if present at beginning of string.
+-- Note that Data.Text decodes the BOM to code point U+FEFF (zero-width
+-- no-break space), so if the string begins with it, we strip it off.
+dropBOM :: String -> String
+dropBOM ('\xFEFF':xs) = xs
+dropBOM xs = xs
+
 -- | Convert UTF8-encoded ByteString to String, also
 -- removing '\r' characters.
 toString :: B.ByteString -> String
-toString = filter (/='\r') . T.unpack . T.decodeUtf8
+toString = filter (/='\r') . dropBOM . T.unpack . T.decodeUtf8
 
 fromString :: String -> B.ByteString
 fromString = T.encodeUtf8 . T.pack
@@ -97,7 +104,7 @@ fromString = T.encodeUtf8 . T.pack
 -- | Convert UTF8-encoded ByteString to String, also
 -- removing '\r' characters.
 toStringLazy :: BL.ByteString -> String
-toStringLazy = filter (/='\r') . TL.unpack . TL.decodeUtf8
+toStringLazy = filter (/='\r') . dropBOM . TL.unpack . TL.decodeUtf8
 
 fromStringLazy :: String -> BL.ByteString
 fromStringLazy = TL.encodeUtf8 . TL.pack
author	John MacFarlane <fiddlosopher@gmail.com>	2013-02-08 09:45:15 -0800
committer	John MacFarlane <fiddlosopher@gmail.com>	2013-02-08 09:45:25 -0800
commit	3465ba2f61872566e221235fba58001bbc2a3b67 (patch)
tree	fa62e78da0ad97ddc1c6030a5d31748cce3e2683 /src/Text
parent	ec5dc431def8fd7aa43dab1f59fb20dcd2933d16 (diff)
download	pandoc-3465ba2f61872566e221235fba58001bbc2a3b67.tar.gz