diff options
Diffstat (limited to 'src/Text')
-rw-r--r-- | src/Text/Pandoc/XML.hs | 18 |
1 files changed, 18 insertions, 0 deletions
diff --git a/src/Text/Pandoc/XML.hs b/src/Text/Pandoc/XML.hs index add46bd6c..169951591 100644 --- a/src/Text/Pandoc/XML.hs +++ b/src/Text/Pandoc/XML.hs @@ -35,6 +35,7 @@ module Text.Pandoc.XML ( escapeCharForXML, inTagsSimple, inTagsIndented, toEntities, + toHtml5Entities, fromEntities ) where import Prelude @@ -43,6 +44,8 @@ import Data.Text (Text) import qualified Data.Text as T import Text.HTML.TagSoup.Entity (lookupEntity) import Text.Pandoc.Pretty +import qualified Data.Map as M +import Text.HTML.TagSoup.Entity (htmlEntities) -- | Escape one character as needed for XML. escapeCharForXML :: Char -> String @@ -100,6 +103,21 @@ toEntities = T.concatMap go where go c | isAscii c = T.singleton c | otherwise = T.pack ("&#" ++ show (ord c) ++ ";") +-- | Escape all non-ascii characters using HTML5 entities, falling +-- back to numerical entities. +toHtml5Entities :: Text -> Text +toHtml5Entities = T.concatMap go + where go c | isAscii c = T.singleton c + | otherwise = + case M.lookup c html5EntityMap of + Just t -> T.singleton '&' <> t <> T.singleton ';' + Nothing -> T.pack ("&#" ++ show (ord c) ++ ";") + +html5EntityMap :: M.Map Char Text +html5EntityMap = M.fromList [(c, T.takeWhile (/=';') (T.pack ent)) + | (ent@(_:_), [c]) <- htmlEntities + , last ent == ';'] + -- Unescapes XML entities fromEntities :: String -> String fromEntities ('&':xs) = |