From 5350ce26b463ccb8edc7b0c7b1be72558e87992c Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Thu, 1 Nov 2018 15:59:40 -0700 Subject: Text.Pandoc.XML: add toHtml5Entities. [API change] --- src/Text/Pandoc/XML.hs | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'src/Text') diff --git a/src/Text/Pandoc/XML.hs b/src/Text/Pandoc/XML.hs index add46bd6c..169951591 100644 --- a/src/Text/Pandoc/XML.hs +++ b/src/Text/Pandoc/XML.hs @@ -35,6 +35,7 @@ module Text.Pandoc.XML ( escapeCharForXML, inTagsSimple, inTagsIndented, toEntities, + toHtml5Entities, fromEntities ) where import Prelude @@ -43,6 +44,8 @@ import Data.Text (Text) import qualified Data.Text as T import Text.HTML.TagSoup.Entity (lookupEntity) import Text.Pandoc.Pretty +import qualified Data.Map as M +import Text.HTML.TagSoup.Entity (htmlEntities) -- | Escape one character as needed for XML. escapeCharForXML :: Char -> String @@ -100,6 +103,21 @@ toEntities = T.concatMap go where go c | isAscii c = T.singleton c | otherwise = T.pack ("&#" ++ show (ord c) ++ ";") +-- | Escape all non-ascii characters using HTML5 entities, falling +-- back to numerical entities. +toHtml5Entities :: Text -> Text +toHtml5Entities = T.concatMap go + where go c | isAscii c = T.singleton c + | otherwise = + case M.lookup c html5EntityMap of + Just t -> T.singleton '&' <> t <> T.singleton ';' + Nothing -> T.pack ("&#" ++ show (ord c) ++ ";") + +html5EntityMap :: M.Map Char Text +html5EntityMap = M.fromList [(c, T.takeWhile (/=';') (T.pack ent)) + | (ent@(_:_), [c]) <- htmlEntities + , last ent == ';'] + -- Unescapes XML entities fromEntities :: String -> String fromEntities ('&':xs) = -- cgit v1.2.3