aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJohn MacFarlane <jgm@berkeley.edu>2018-11-01 15:59:40 -0700
committerJohn MacFarlane <jgm@berkeley.edu>2018-11-01 15:59:59 -0700
commit5350ce26b463ccb8edc7b0c7b1be72558e87992c (patch)
tree5335f1aec0eab047e6f8ac57af2fda3c656e4bf1
parent614b1c0c6188589128c4bfebeab65fa6306a916c (diff)
downloadpandoc-5350ce26b463ccb8edc7b0c7b1be72558e87992c.tar.gz
Text.Pandoc.XML: add toHtml5Entities.
[API change]
-rw-r--r--src/Text/Pandoc/XML.hs18
1 files changed, 18 insertions, 0 deletions
diff --git a/src/Text/Pandoc/XML.hs b/src/Text/Pandoc/XML.hs
index add46bd6c..169951591 100644
--- a/src/Text/Pandoc/XML.hs
+++ b/src/Text/Pandoc/XML.hs
@@ -35,6 +35,7 @@ module Text.Pandoc.XML ( escapeCharForXML,
inTagsSimple,
inTagsIndented,
toEntities,
+ toHtml5Entities,
fromEntities ) where
import Prelude
@@ -43,6 +44,8 @@ import Data.Text (Text)
import qualified Data.Text as T
import Text.HTML.TagSoup.Entity (lookupEntity)
import Text.Pandoc.Pretty
+import qualified Data.Map as M
+import Text.HTML.TagSoup.Entity (htmlEntities)
-- | Escape one character as needed for XML.
escapeCharForXML :: Char -> String
@@ -100,6 +103,21 @@ toEntities = T.concatMap go
where go c | isAscii c = T.singleton c
| otherwise = T.pack ("&#" ++ show (ord c) ++ ";")
+-- | Replace every non-ASCII character by a character reference.
+-- A named HTML5 entity is used when 'html5EntityMap' has one for the
+-- character; otherwise a decimal numeric reference (@&#NNN;@) is
+-- emitted.  ASCII characters are passed through untouched.
+toHtml5Entities :: Text -> Text
+toHtml5Entities = T.concatMap escapeChar
+  where
+    escapeChar ch
+      | isAscii ch = T.singleton ch
+      | otherwise  = maybe (numericRef ch) namedRef
+                       (M.lookup ch html5EntityMap)
+    -- Wrap a bare entity name as @&name;@.
+    namedRef name = T.singleton '&' <> name <> T.singleton ';'
+    -- Decimal code point reference, e.g. @&#8212;@.
+    numericRef ch = T.pack ("&#" ++ show (ord ch) ++ ";")
+
+-- | Map from a character to the name of an HTML5 entity that denotes
+-- it (without the leading @&@ or the trailing @;@).  Only entries of
+-- 'htmlEntities' whose name ends in @;@ and whose expansion is exactly
+-- one character are considered.  When several names denote the same
+-- character the shortest one is kept, so the result does not depend on
+-- the order of 'htmlEntities' except to break ties between names of
+-- equal length (the later entry wins).
+html5EntityMap :: M.Map Char Text
+html5EntityMap = M.fromListWith preferShorter
+                   [(c, T.takeWhile (/=';') (T.pack ent))
+                   | (ent@(_:_), [c]) <- htmlEntities
+                   , last ent == ';']  -- safe: ent is non-empty here
+  where
+    -- 'M.fromListWith' passes the newly encountered value first and
+    -- the value already stored second.
+    preferShorter new old
+      | T.length new <= T.length old = new
+      | otherwise                    = old
+
-- Unescapes XML entities
fromEntities :: String -> String
fromEntities ('&':xs) =