UTF8: export toText, toTextLazy.

Define toString, toStringLazy in terms of them.
author: John MacFarlane <jgm@berkeley.edu> 2017-06-10 15:54:35 +0200
committer: John MacFarlane <jgm@berkeley.edu> 2017-06-10 15:54:35 +0200
commit: c691b975061e3674a80474968fab604cafe776af (patch)
tree: 4d515cfcc45effe229d6f7fb57cbfc95519481da /src/Text/Pandoc
parent: 72b45f05ed361d9fd21c0b8625263cf69494fe7a (diff)
download: pandoc-c691b975061e3674a80474968fab604cafe776af.tar.gz
1 files changed, 27 insertions, 17 deletions
diff --git a/src/Text/Pandoc/UTF8.hs b/src/Text/Pandoc/UTF8.hs
index 84043d4cb..3f1b28e54 100644
--- a/src/Text/Pandoc/UTF8.hs
+++ b/src/Text/Pandoc/UTF8.hs
@@ -1,3 +1,4 @@
+{-# LANGUAGE OverloadedStrings #-}
 {-
 Copyright (C) 2010-2017 John MacFarlane <jgm@berkeley.edu>
 
@@ -41,8 +42,10 @@ module Text.Pandoc.UTF8 ( readFile
                         , hPutStrLn
                         , hGetContents
                         , toString
+                        , toText
                         , fromString
                         , toStringLazy
+                        , toTextLazy
                         , fromStringLazy
                         , encodePath
                         , decodeArg
@@ -51,7 +54,7 @@ module Text.Pandoc.UTF8 ( readFile
 where
 
 import qualified Data.ByteString.Char8 as B
-import qualified Data.ByteString.Lazy as BL
+import qualified Data.ByteString.Lazy.Char8 as BL
 import qualified Data.Text as T
 import qualified Data.Text.Encoding as T
 import qualified Data.Text.Lazy as TL
@@ -110,31 +113,38 @@ hGetContents = fmap toString . B.hGetContents
 --                   >> hSetNewlineMode h universalNewlineMode
 --                   >> IO.hGetContents h
 
--- | Drop BOM (byte order marker) if present at beginning of string.
--- Note that Data.Text converts the BOM to code point FEFF, zero-width
--- no-break space, so if the string begins with this  we strip it off.
-dropBOM :: String -> String
-dropBOM ('\xFEFF':xs) = xs
-dropBOM xs            = xs
-
-filterCRs :: String -> String
-filterCRs ('\r':'\n':xs) = '\n': filterCRs xs
-filterCRs ('\r':xs)      = '\n' : filterCRs xs
-filterCRs (x:xs)         = x : filterCRs xs
-filterCRs []             = []
+-- | Convert UTF8-encoded ByteString to Text, also
+-- removing '\r' characters.
+toText :: B.ByteString -> T.Text
+toText = T.decodeUtf8 . filterCRs . dropBOM
+  where dropBOM bs =
+         if "\xEF\xBB\xBF" `B.isPrefixOf` bs
+            then B.drop 3 bs
+            else bs
+        filterCRs = B.filter (/='\r')
 
 -- | Convert UTF8-encoded ByteString to String, also
 -- removing '\r' characters.
 toString :: B.ByteString -> String
-toString = filterCRs . dropBOM . T.unpack . T.decodeUtf8
+toString = T.unpack . toText
 
-fromString :: String -> B.ByteString
-fromString = T.encodeUtf8 . T.pack
+-- | Convert UTF8-encoded ByteString to Text, also
+-- removing '\r' characters.
+toTextLazy :: BL.ByteString -> TL.Text
+toTextLazy = TL.decodeUtf8 . filterCRs . dropBOM
+  where dropBOM bs =
+         if "\xEF\xBB\xBF" `BL.isPrefixOf` bs
+            then BL.drop 3 bs
+            else bs
+        filterCRs = BL.filter (/='\r')
 
 -- | Convert UTF8-encoded ByteString to String, also
 -- removing '\r' characters.
 toStringLazy :: BL.ByteString -> String
-toStringLazy = filterCRs . dropBOM . TL.unpack . TL.decodeUtf8
+toStringLazy = TL.unpack . toTextLazy
+
+fromString :: String -> B.ByteString
+fromString = T.encodeUtf8 . T.pack
 
 fromStringLazy :: String -> BL.ByteString
 fromStringLazy = TL.encodeUtf8 . TL.pack
author	John MacFarlane <jgm@berkeley.edu>	2017-06-10 15:54:35 +0200
committer	John MacFarlane <jgm@berkeley.edu>	2017-06-10 15:54:35 +0200
commit	c691b975061e3674a80474968fab604cafe776af (patch)
tree	4d515cfcc45effe229d6f7fb57cbfc95519481da /src/Text/Pandoc
parent	72b45f05ed361d9fd21c0b8625263cf69494fe7a (diff)
download	pandoc-c691b975061e3674a80474968fab604cafe776af.tar.gz