From 1433aaa4c35af84fbe00ecf971acd1414da6dea8 Mon Sep 17 00:00:00 2001
From: John MacFarlane
Date: Wed, 12 Feb 2020 08:46:54 -0800
Subject: HTML reader: don't parse `data-id` as `id` attribute.

And similarly don't parse any `data-X` as `X` when `X` is a valid
HTML attribute.

Reported in comment on #5415.
---
 src/Text/Pandoc/Readers/HTML.hs | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

(limited to 'src/Text')

diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs
index aa73cd9a1..057ff1d31 100644
--- a/src/Text/Pandoc/Readers/HTML.hs
+++ b/src/Text/Pandoc/Readers/HTML.hs
@@ -52,6 +52,7 @@ import Text.Pandoc.CSS (foldOrElse, pickStyleAttrProps)
 import Text.Pandoc.Definition
 import Text.Pandoc.Readers.LaTeX (rawLaTeXInline)
 import Text.Pandoc.Readers.LaTeX.Types (Macro)
+import Text.Pandoc.XML (html5Attributes, html4Attributes, rdfaAttributes)
 import Text.Pandoc.Error
 import Text.Pandoc.Logging
 import Text.Pandoc.Options (
@@ -835,7 +836,14 @@ mathMLToTeXMath s = writeTeX <$> readMathML s
 
 toStringAttr :: [(Text, Text)] -> [(Text, Text)]
 toStringAttr = map go
-  where go (x,y) = (fromMaybe x $ T.stripPrefix "data-" x, y)
+  where
+   go (x,y) =
+     case T.stripPrefix "data-" x of
+       Nothing -> (x,y)
+       Just x' -> if x' `Set.member` (html5Attributes <>
+                                      html4Attributes <> rdfaAttributes)
+                     then (x,y)
+                     else (x',y)
 
 pScriptMath :: PandocMonad m => TagParser m Inlines
 pScriptMath = try $ do
-- 
cgit v1.2.3