aboutsummaryrefslogtreecommitdiff
path: root/src/Text/Pandoc/Readers/HTML
diff options
context:
space:
mode:
author John MacFarlane <jgm@berkeley.edu> 2020-12-10 15:44:10 -0800
committer John MacFarlane <jgm@berkeley.edu> 2020-12-10 15:44:10 -0800
commit 0a502e5ff52b251bbf3da69fd1f9a88d5e0fe92c (patch)
tree 7c78b78cadf50dd68033ccc08fd1a9c43c2b325d /src/Text/Pandoc/Readers/HTML
parent 248a2a1db5cb567499a9272a2a2f2390c13d9275 (diff)
download pandoc-0a502e5ff52b251bbf3da69fd1f9a88d5e0fe92c.tar.gz
HTML reader: retain attribute prefixes and avoid duplicates.
Previously we stripped attribute prefixes, reading `xml:lang` as `lang` for example. This resulted in two duplicate `lang` attributes when `xml:lang` and `lang` were both used. This commit causes the prefixes to be retained, and also avoids invalid duplicate attributes. Closes #6938.
Diffstat (limited to 'src/Text/Pandoc/Readers/HTML')
-rw-r--r--src/Text/Pandoc/Readers/HTML/Parsing.hs20
1 files changed, 13 insertions, 7 deletions
diff --git a/src/Text/Pandoc/Readers/HTML/Parsing.hs b/src/Text/Pandoc/Readers/HTML/Parsing.hs
index 2d58319da..e28ebe77b 100644
--- a/src/Text/Pandoc/Readers/HTML/Parsing.hs
+++ b/src/Text/Pandoc/Readers/HTML/Parsing.hs
@@ -193,14 +193,20 @@ t1 `closes` t2 |
_ `closes` _ = False
toStringAttr :: [(Text, Text)] -> [(Text, Text)]
-toStringAttr = map go
+toStringAttr = foldr go []
where
- go (x,y) =
- case T.stripPrefix "data-" x of
- Just x' | x' `Set.notMember` (html5Attributes <>
- html4Attributes <> rdfaAttributes)
- -> (x',y)
- _ -> (x,y)
+ go :: (Text, Text) -> [(Text, Text)] -> [(Text, Text)]
+ -- treat xml:lang as lang
+ go ("xml:lang",y) ats = go ("lang",y) ats
+ -- prevent duplicate attributes
+ go (x,y) ats
+ | any (\(x',_) -> x == x') ats = ats
+ | otherwise =
+ case T.stripPrefix "data-" x of
+ Just x' | x' `Set.notMember` (html5Attributes <>
+ html4Attributes <> rdfaAttributes)
+ -> go (x',y) ats
+ _ -> (x,y):ats
-- Unlike fromAttrib from tagsoup, this distinguishes
-- between a missing attribute and an attribute with empty content.