aboutsummaryrefslogtreecommitdiff
path: root/src/Text/Pandoc/Readers
diff options
context:
space:
mode:
authorJohn MacFarlane <jgm@berkeley.edu>2020-12-10 15:44:10 -0800
committerJohn MacFarlane <jgm@berkeley.edu>2020-12-10 15:44:10 -0800
commit0a502e5ff52b251bbf3da69fd1f9a88d5e0fe92c (patch)
tree7c78b78cadf50dd68033ccc08fd1a9c43c2b325d /src/Text/Pandoc/Readers
parent248a2a1db5cb567499a9272a2a2f2390c13d9275 (diff)
downloadpandoc-0a502e5ff52b251bbf3da69fd1f9a88d5e0fe92c.tar.gz
HTML reader: retain attribute prefixes and avoid duplicates.
Previously we stripped attribute prefixes, reading `xml:lang` as `lang` for example. This resulted in two duplicate `lang` attributes when `xml:lang` and `lang` were both used. This commit causes the prefixes to be retained, and also avoids invald duplicate attributes. Closes #6938.
Diffstat (limited to 'src/Text/Pandoc/Readers')
-rw-r--r--src/Text/Pandoc/Readers/HTML.hs28
-rw-r--r--src/Text/Pandoc/Readers/HTML/Parsing.hs20
2 files changed, 24 insertions, 24 deletions
diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs
index eb78979a3..f870a241d 100644
--- a/src/Text/Pandoc/Readers/HTML.hs
+++ b/src/Text/Pandoc/Readers/HTML.hs
@@ -74,7 +74,7 @@ readHtml :: PandocMonad m
-> Text -- ^ String to parse (assumes @'\n'@ line endings)
-> m Pandoc
readHtml opts inp = do
- let tags = stripPrefixes . canonicalizeTags $
+ let tags = stripPrefixes $ canonicalizeTags $
parseTagsOptions parseOptions{ optTagPosition = True }
(crFilter inp)
parseDoc = do
@@ -95,6 +95,15 @@ readHtml opts inp = do
Right doc -> return doc
Left err -> throwError $ PandocParseError $ T.pack $ getError err
+-- Strip namespace prefixes on tags (not attributes)
+stripPrefixes :: [Tag Text] -> [Tag Text]
+stripPrefixes = map stripPrefix
+
+stripPrefix :: Tag Text -> Tag Text
+stripPrefix (TagOpen s as) = TagOpen (T.takeWhileEnd (/=':') s) as
+stripPrefix (TagClose s) = TagClose (T.takeWhileEnd (/=':') s)
+stripPrefix x = x
+
replaceNotes :: PandocMonad m => [Block] -> TagParser m [Block]
replaceNotes bs = do
st <- getState
@@ -114,7 +123,7 @@ setInPlain = local (\s -> s {inPlain = True})
pHtml :: PandocMonad m => TagParser m Blocks
pHtml = try $ do
(TagOpen "html" attr) <- lookAhead pAny
- for_ (lookup "lang" attr) $
+ for_ (lookup "lang" attr <|> lookup "xml:lang" attr) $
updateState . B.setMeta "lang" . B.text
pInTags "html" block
@@ -1024,21 +1033,6 @@ htmlTag f = try $ do
handleTag tagname
_ -> mzero
--- Strip namespace prefixes
-stripPrefixes :: [Tag Text] -> [Tag Text]
-stripPrefixes = map stripPrefix
-
-stripPrefix :: Tag Text -> Tag Text
-stripPrefix (TagOpen s as) =
- TagOpen (stripPrefix' s) (map (first stripPrefix') as)
-stripPrefix (TagClose s) = TagClose (stripPrefix' s)
-stripPrefix x = x
-
-stripPrefix' :: Text -> Text
-stripPrefix' s =
- if T.null t then s else T.drop 1 t
- where (_, t) = T.span (/= ':') s
-
-- Utilities
-- | Adjusts a url according to the document's base URL.
diff --git a/src/Text/Pandoc/Readers/HTML/Parsing.hs b/src/Text/Pandoc/Readers/HTML/Parsing.hs
index 2d58319da..e28ebe77b 100644
--- a/src/Text/Pandoc/Readers/HTML/Parsing.hs
+++ b/src/Text/Pandoc/Readers/HTML/Parsing.hs
@@ -193,14 +193,20 @@ t1 `closes` t2 |
_ `closes` _ = False
toStringAttr :: [(Text, Text)] -> [(Text, Text)]
-toStringAttr = map go
+toStringAttr = foldr go []
where
- go (x,y) =
- case T.stripPrefix "data-" x of
- Just x' | x' `Set.notMember` (html5Attributes <>
- html4Attributes <> rdfaAttributes)
- -> (x',y)
- _ -> (x,y)
+ go :: (Text, Text) -> [(Text, Text)] -> [(Text, Text)]
+ -- treat xml:lang as lang
+ go ("xml:lang",y) ats = go ("lang",y) ats
+ -- prevent duplicate attributes
+ go (x,y) ats
+ | any (\(x',_) -> x == x') ats = ats
+ | otherwise =
+ case T.stripPrefix "data-" x of
+ Just x' | x' `Set.notMember` (html5Attributes <>
+ html4Attributes <> rdfaAttributes)
+ -> go (x',y) ats
+ _ -> (x,y):ats
-- Unlike fromAttrib from tagsoup, this distinguishes
-- between a missing attribute and an attribute with empty content.