From 0a502e5ff52b251bbf3da69fd1f9a88d5e0fe92c Mon Sep 17 00:00:00 2001 From: John MacFarlane <jgm@berkeley.edu> Date: Thu, 10 Dec 2020 15:44:10 -0800 Subject: HTML reader: retain attribute prefixes and avoid duplicates. Previously we stripped attribute prefixes, reading `xml:lang` as `lang` for example. This resulted in two duplicate `lang` attributes when `xml:lang` and `lang` were both used. This commit causes the prefixes to be retained, and also avoids invald duplicate attributes. Closes #6938. --- src/Text/Pandoc/Readers/HTML/Parsing.hs | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) (limited to 'src/Text/Pandoc/Readers/HTML') diff --git a/src/Text/Pandoc/Readers/HTML/Parsing.hs b/src/Text/Pandoc/Readers/HTML/Parsing.hs index 2d58319da..e28ebe77b 100644 --- a/src/Text/Pandoc/Readers/HTML/Parsing.hs +++ b/src/Text/Pandoc/Readers/HTML/Parsing.hs @@ -193,14 +193,20 @@ t1 `closes` t2 | _ `closes` _ = False toStringAttr :: [(Text, Text)] -> [(Text, Text)] -toStringAttr = map go +toStringAttr = foldr go [] where - go (x,y) = - case T.stripPrefix "data-" x of - Just x' | x' `Set.notMember` (html5Attributes <> - html4Attributes <> rdfaAttributes) - -> (x',y) - _ -> (x,y) + go :: (Text, Text) -> [(Text, Text)] -> [(Text, Text)] + -- treat xml:lang as lang + go ("xml:lang",y) ats = go ("lang",y) ats + -- prevent duplicate attributes + go (x,y) ats + | any (\(x',_) -> x == x') ats = ats + | otherwise = + case T.stripPrefix "data-" x of + Just x' | x' `Set.notMember` (html5Attributes <> + html4Attributes <> rdfaAttributes) + -> go (x',y) ats + _ -> (x,y):ats -- Unlike fromAttrib from tagsoup, this distinguishes -- between a missing attribute and an attribute with empty content. -- cgit v1.2.3