aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJohn MacFarlane <jgm@berkeley.edu>2020-12-10 15:44:10 -0800
committerJohn MacFarlane <jgm@berkeley.edu>2020-12-10 15:44:10 -0800
commit0a502e5ff52b251bbf3da69fd1f9a88d5e0fe92c (patch)
tree7c78b78cadf50dd68033ccc08fd1a9c43c2b325d
parent248a2a1db5cb567499a9272a2a2f2390c13d9275 (diff)
downloadpandoc-0a502e5ff52b251bbf3da69fd1f9a88d5e0fe92c.tar.gz
HTML reader: retain attribute prefixes and avoid duplicates.
Previously we stripped attribute prefixes, reading `xml:lang` as `lang`, for example. This resulted in duplicate `lang` attributes when `xml:lang` and `lang` were both used. This commit causes the prefixes to be retained, and also avoids invalid duplicate attributes. Closes #6938.
-rw-r--r--src/Text/Pandoc/Readers/HTML.hs28
-rw-r--r--src/Text/Pandoc/Readers/HTML/Parsing.hs20
-rw-r--r--test/command/5986.md2
-rw-r--r--test/epub/wasteland.native8
4 files changed, 29 insertions, 29 deletions
diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs
index eb78979a3..f870a241d 100644
--- a/src/Text/Pandoc/Readers/HTML.hs
+++ b/src/Text/Pandoc/Readers/HTML.hs
@@ -74,7 +74,7 @@ readHtml :: PandocMonad m
-> Text -- ^ String to parse (assumes @'\n'@ line endings)
-> m Pandoc
readHtml opts inp = do
- let tags = stripPrefixes . canonicalizeTags $
+ let tags = stripPrefixes $ canonicalizeTags $
parseTagsOptions parseOptions{ optTagPosition = True }
(crFilter inp)
parseDoc = do
@@ -95,6 +95,15 @@ readHtml opts inp = do
Right doc -> return doc
Left err -> throwError $ PandocParseError $ T.pack $ getError err
+-- Strip namespace prefixes on tags (not attributes)
+stripPrefixes :: [Tag Text] -> [Tag Text]
+stripPrefixes = map stripPrefix
+
+stripPrefix :: Tag Text -> Tag Text
+stripPrefix (TagOpen s as) = TagOpen (T.takeWhileEnd (/=':') s) as
+stripPrefix (TagClose s) = TagClose (T.takeWhileEnd (/=':') s)
+stripPrefix x = x
+
replaceNotes :: PandocMonad m => [Block] -> TagParser m [Block]
replaceNotes bs = do
st <- getState
@@ -114,7 +123,7 @@ setInPlain = local (\s -> s {inPlain = True})
pHtml :: PandocMonad m => TagParser m Blocks
pHtml = try $ do
(TagOpen "html" attr) <- lookAhead pAny
- for_ (lookup "lang" attr) $
+ for_ (lookup "lang" attr <|> lookup "xml:lang" attr) $
updateState . B.setMeta "lang" . B.text
pInTags "html" block
@@ -1024,21 +1033,6 @@ htmlTag f = try $ do
handleTag tagname
_ -> mzero
--- Strip namespace prefixes
-stripPrefixes :: [Tag Text] -> [Tag Text]
-stripPrefixes = map stripPrefix
-
-stripPrefix :: Tag Text -> Tag Text
-stripPrefix (TagOpen s as) =
- TagOpen (stripPrefix' s) (map (first stripPrefix') as)
-stripPrefix (TagClose s) = TagClose (stripPrefix' s)
-stripPrefix x = x
-
-stripPrefix' :: Text -> Text
-stripPrefix' s =
- if T.null t then s else T.drop 1 t
- where (_, t) = T.span (/= ':') s
-
-- Utilities
-- | Adjusts a url according to the document's base URL.
diff --git a/src/Text/Pandoc/Readers/HTML/Parsing.hs b/src/Text/Pandoc/Readers/HTML/Parsing.hs
index 2d58319da..e28ebe77b 100644
--- a/src/Text/Pandoc/Readers/HTML/Parsing.hs
+++ b/src/Text/Pandoc/Readers/HTML/Parsing.hs
@@ -193,14 +193,20 @@ t1 `closes` t2 |
_ `closes` _ = False
toStringAttr :: [(Text, Text)] -> [(Text, Text)]
-toStringAttr = map go
+toStringAttr = foldr go []
where
- go (x,y) =
- case T.stripPrefix "data-" x of
- Just x' | x' `Set.notMember` (html5Attributes <>
- html4Attributes <> rdfaAttributes)
- -> (x',y)
- _ -> (x,y)
+ go :: (Text, Text) -> [(Text, Text)] -> [(Text, Text)]
+ -- treat xml:lang as lang
+ go ("xml:lang",y) ats = go ("lang",y) ats
+ -- prevent duplicate attributes
+ go (x,y) ats
+ | any (\(x',_) -> x == x') ats = ats
+ | otherwise =
+ case T.stripPrefix "data-" x of
+ Just x' | x' `Set.notMember` (html5Attributes <>
+ html4Attributes <> rdfaAttributes)
+ -> go (x',y) ats
+ _ -> (x,y):ats
-- Unlike fromAttrib from tagsoup, this distinguishes
-- between a missing attribute and an attribute with empty content.
diff --git a/test/command/5986.md b/test/command/5986.md
index ea0ca70c1..ed8dd30c9 100644
--- a/test/command/5986.md
+++ b/test/command/5986.md
@@ -4,7 +4,7 @@
^D
<p><span id="title_page.xhtml"></span></p>
<p><span id="nav.xhtml"></span></p>
-<nav type="landmarks" id="landmarks" hidden="hidden">
+<nav epub:type="landmarks" id="landmarks" hidden="hidden">
<ol>
<li><a href="text/title_page.xhtml">Title Page</a></li>
<li><a href="#nav.xhtml#toc">Table of Contents</a></li>
diff --git a/test/epub/wasteland.native b/test/epub/wasteland.native
index 0ed23eba2..ff59182a6 100644
--- a/test/epub/wasteland.native
+++ b/test/epub/wasteland.native
@@ -1,8 +1,8 @@
[Para [Image ("",[],[]) [] ("wasteland-cover.jpg","")]
,Para [Span ("wasteland-content.xhtml",[],[]) []]
-,Div ("wasteland-content.xhtml#frontmatter",["section"],[("type","frontmatter")])
+,Div ("wasteland-content.xhtml#frontmatter",["section","frontmatter"],[])
[]
-,Div ("wasteland-content.xhtml#bodymatter",["section"],[("type","bodymatter")])
+,Div ("wasteland-content.xhtml#bodymatter",["section","bodymatter"],[])
[Div ("wasteland-content.xhtml#ch1",["section"],[])
[Header 2 ("",[],[]) [Str "I.",Space,Str "THE",Space,Str "BURIAL",Space,Str "OF",Space,Str "THE",Space,Str "DEAD"]
,Div ("",["linegroup"],[])
@@ -922,8 +922,8 @@
[Plain [Str "Datta.",Space,Str "Dayadhvam.",Space,Str "Damyata."]]
,Div ("wasteland-content.xhtml#ln434",["linegroup","indent"],[])
[Plain [Span ("",[],[("lang","sa")]) [Str "Shantih",Space,Str "shantih",Space,Str "shantih",Note [Para [Link ("",[],[]) [Str "434."] ("#wasteland-content.xhtml#ln434",""),Space,Str "Shantih.",Space,Str "Repeated",Space,Str "as",Space,Str "here,",Space,Str "a",Space,Str "formal",Space,Str "ending",Space,Str "to",Space,Str "an",Space,Str "Upanishad.",Space,Str "'The",SoftBreak,Str "Peace",Space,Str "which",Space,Str "passeth",Space,Str "understanding'",Space,Str "is",Space,Str "a",Space,Str "feeble",Space,Str "translation",Space,Str "of",Space,Str "the",SoftBreak,Str "content",Space,Str "of",Space,Str "this",Space,Str "word."]],SoftBreak]]]]]]
-,Div ("wasteland-content.xhtml#backmatter",["section"],[("type","backmatter")])
- [Div ("wasteland-content.xhtml#rearnotes",["section"],[("type","rearnotes")])
+,Div ("wasteland-content.xhtml#backmatter",["section","backmatter"],[])
+ [Div ("wasteland-content.xhtml#rearnotes",["section","rearnotes"],[])
[Header 2 ("",[],[]) [Str "NOTES",Space,Str "ON",Space,Str "\"THE",Space,Str "WASTE",Space,Str "LAND\""]
,Para [Str "Not",Space,Str "only",Space,Str "the",Space,Str "title,",Space,Str "but",Space,Str "the",Space,Str "plan",Space,Str "and",Space,Str "a",Space,Str "good",Space,Str "deal",Space,Str "of",Space,Str "the",Space,Str "incidental",Space,Str "symbolism",Space,Str "of",SoftBreak,Str "the",Space,Str "poem",Space,Str "were",Space,Str "suggested",Space,Str "by",Space,Str "Miss",Space,Str "Jessie",Space,Str "L.",Space,Str "Weston's",Space,Str "book",Space,Str "on",Space,Str "the",Space,Str "Grail",Space,Str "legend:",SoftBreak,Str "From",Space,Str "Ritual",Space,Str "to",Space,Str "Romance"]
,Para [Str "Indeed,",Space,Str "so",Space,Str "deeply",Space,Str "am",Space,Str "I",Space,Str "indebted,",Space,Str "Miss",Space,Str "Weston's",Space,Str "book",Space,Str "will",Space,Str "elucidate",Space,Str "the",SoftBreak,Str "difficulties",Space,Str "of",Space,Str "the",Space,Str "poem",Space,Str "much",Space,Str "better",Space,Str "than",Space,Str "my",Space,Str "notes",Space,Str "can",Space,Str "do;",Space,Str "and",Space,Str "I",Space,Str "recommend",Space,Str "it",SoftBreak,Str "(apart",Space,Str "from",Space,Str "the",Space,Str "great",Space,Str "interest",Space,Str "of",Space,Str "the",Space,Str "book",Space,Str "itself)",Space,Str "to",Space,Str "any",Space,Str "who",Space,Str "think",Space,Str "such",SoftBreak,Str "elucidation",Space,Str "of",Space,Str "the",Space,Str "poem",Space,Str "worth",Space,Str "the",Space,Str "trouble.",Space,Str "To",Space,Str "another",Space,Str "work",Space,Str "of",Space,Str "anthropology",Space,Str "I",Space,Str "am",SoftBreak,Str "indebted",Space,Str "in",Space,Str "general,",Space,Str "one",Space,Str "which",Space,Str "has",Space,Str "influenced",Space,Str "our",Space,Str "generation",Space,Str "profoundly;",Space,Str "I",Space,Str "mean",SoftBreak,Str "The",Space,Str "Golden",Space,Str "Bough;",Space,Str "I",Space,Str "have",Space,Str "used",Space,Str "especially",Space,Str "the",Space,Str "two",Space,Str "volumes",Space,Str "Adonis,",Space,Str "Attis,",Space,Str "Osiris.",SoftBreak,Str "Anyone",Space,Str "who",Space,Str "is",Space,Str "acquainted",Space,Str "with",Space,Str "these",Space,Str "works",Space,Str "will",Space,Str "immediately",Space,Str "recognise",Space,Str "in",Space,Str "the",Space,Str "poem",SoftBreak,Str "certain",Space,Str "references",Space,Str "to",Space,Str "vegetation",Space,Str "ceremonies."]