From 12a5bd3c8d34eddbabee0dc54fd7ce6d9539c9d4 Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Fri, 8 Jan 2016 17:08:01 -0800 Subject: Entity handling fixes: - Text.Pandoc.XML.fromEntities: handle entities without a semicolon. Always lookup character references with the trailing ';', even if it wasn't present. And never add it when looking up numerical entities. (This is what tagsoup seems to require.) - Text.Pandoc.Parsing.characterReference: Always lookup character references with the trailing ';', and leave off the ';' when looking up numerical entities. This fixes a regression for e.g. `&lang;`. --- src/Text/Pandoc/Parsing.hs | 5 ++++- src/Text/Pandoc/XML.hs | 8 ++++++-- 2 files changed, 10 insertions(+), 3 deletions(-) (limited to 'src/Text') diff --git a/src/Text/Pandoc/Parsing.hs b/src/Text/Pandoc/Parsing.hs index 85786eb3e..db891d5d4 100644 --- a/src/Text/Pandoc/Parsing.hs +++ b/src/Text/Pandoc/Parsing.hs @@ -573,7 +573,10 @@ characterReference :: Stream s m Char => ParserT s st m Char characterReference = try $ do char '&' ent <- many1Till nonspaceChar (char ';') - case lookupEntity ent of + let ent' = case ent of + '#':_ -> ent + _ -> ent ++ ";" + case lookupEntity ent' of Just c -> return c Nothing -> fail "entity not found" diff --git a/src/Text/Pandoc/XML.hs b/src/Text/Pandoc/XML.hs index caa13f177..1e01b62f2 100644 --- a/src/Text/Pandoc/XML.hs +++ b/src/Text/Pandoc/XML.hs @@ -100,11 +100,15 @@ toEntities (c:cs) -- Unescapes XML entities fromEntities :: String -> String fromEntities ('&':xs) = - case lookupEntity ent of + case lookupEntity ent' of Just c -> c : fromEntities rest Nothing -> '&' : fromEntities xs where (ent, rest) = case break (\c -> isSpace c || c == ';') xs of (zs,';':ys) -> (zs,ys) - _ -> ("",xs) + (zs, ys) -> (zs,ys) + ent' = case ent of + '#':_ -> ent + _ -> ent ++ ";" + fromEntities (x:xs) = x : fromEntities xs fromEntities [] = [] -- cgit v1.2.3