diff options
author | John MacFarlane <jgm@berkeley.edu> | 2020-12-10 15:44:10 -0800 |
---|---|---|
committer | John MacFarlane <jgm@berkeley.edu> | 2020-12-10 15:44:10 -0800 |
commit | 0a502e5ff52b251bbf3da69fd1f9a88d5e0fe92c (patch) | |
tree | 7c78b78cadf50dd68033ccc08fd1a9c43c2b325d | |
parent | 248a2a1db5cb567499a9272a2a2f2390c13d9275 (diff) | |
download | pandoc-0a502e5ff52b251bbf3da69fd1f9a88d5e0fe92c.tar.gz |
HTML reader: retain attribute prefixes and avoid duplicates.
Previously we stripped attribute prefixes, reading
`xml:lang` as `lang` for example. This resulted in
two duplicate `lang` attributes when `xml:lang` and
`lang` were both used. This commit causes the prefixes
to be retained, and also avoids invald duplicate
attributes.
Closes #6938.
-rw-r--r-- | src/Text/Pandoc/Readers/HTML.hs | 28 | ||||
-rw-r--r-- | src/Text/Pandoc/Readers/HTML/Parsing.hs | 20 | ||||
-rw-r--r-- | test/command/5986.md | 2 | ||||
-rw-r--r-- | test/epub/wasteland.native | 8 |
4 files changed, 29 insertions, 29 deletions
diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs index eb78979a3..f870a241d 100644 --- a/src/Text/Pandoc/Readers/HTML.hs +++ b/src/Text/Pandoc/Readers/HTML.hs @@ -74,7 +74,7 @@ readHtml :: PandocMonad m -> Text -- ^ String to parse (assumes @'\n'@ line endings) -> m Pandoc readHtml opts inp = do - let tags = stripPrefixes . canonicalizeTags $ + let tags = stripPrefixes $ canonicalizeTags $ parseTagsOptions parseOptions{ optTagPosition = True } (crFilter inp) parseDoc = do @@ -95,6 +95,15 @@ readHtml opts inp = do Right doc -> return doc Left err -> throwError $ PandocParseError $ T.pack $ getError err +-- Strip namespace prefixes on tags (not attributes) +stripPrefixes :: [Tag Text] -> [Tag Text] +stripPrefixes = map stripPrefix + +stripPrefix :: Tag Text -> Tag Text +stripPrefix (TagOpen s as) = TagOpen (T.takeWhileEnd (/=':') s) as +stripPrefix (TagClose s) = TagClose (T.takeWhileEnd (/=':') s) +stripPrefix x = x + replaceNotes :: PandocMonad m => [Block] -> TagParser m [Block] replaceNotes bs = do st <- getState @@ -114,7 +123,7 @@ setInPlain = local (\s -> s {inPlain = True}) pHtml :: PandocMonad m => TagParser m Blocks pHtml = try $ do (TagOpen "html" attr) <- lookAhead pAny - for_ (lookup "lang" attr) $ + for_ (lookup "lang" attr <|> lookup "xml:lang" attr) $ updateState . B.setMeta "lang" . B.text pInTags "html" block @@ -1024,21 +1033,6 @@ htmlTag f = try $ do handleTag tagname _ -> mzero --- Strip namespace prefixes -stripPrefixes :: [Tag Text] -> [Tag Text] -stripPrefixes = map stripPrefix - -stripPrefix :: Tag Text -> Tag Text -stripPrefix (TagOpen s as) = - TagOpen (stripPrefix' s) (map (first stripPrefix') as) -stripPrefix (TagClose s) = TagClose (stripPrefix' s) -stripPrefix x = x - -stripPrefix' :: Text -> Text -stripPrefix' s = - if T.null t then s else T.drop 1 t - where (_, t) = T.span (/= ':') s - -- Utilities -- | Adjusts a url according to the document's base URL. diff --git a/src/Text/Pandoc/Readers/HTML/Parsing.hs b/src/Text/Pandoc/Readers/HTML/Parsing.hs index 2d58319da..e28ebe77b 100644 --- a/src/Text/Pandoc/Readers/HTML/Parsing.hs +++ b/src/Text/Pandoc/Readers/HTML/Parsing.hs @@ -193,14 +193,20 @@ t1 `closes` t2 | _ `closes` _ = False toStringAttr :: [(Text, Text)] -> [(Text, Text)] -toStringAttr = map go +toStringAttr = foldr go [] where - go (x,y) = - case T.stripPrefix "data-" x of - Just x' | x' `Set.notMember` (html5Attributes <> - html4Attributes <> rdfaAttributes) - -> (x',y) - _ -> (x,y) + go :: (Text, Text) -> [(Text, Text)] -> [(Text, Text)] + -- treat xml:lang as lang + go ("xml:lang",y) ats = go ("lang",y) ats + -- prevent duplicate attributes + go (x,y) ats + | any (\(x',_) -> x == x') ats = ats + | otherwise = + case T.stripPrefix "data-" x of + Just x' | x' `Set.notMember` (html5Attributes <> + html4Attributes <> rdfaAttributes) + -> go (x',y) ats + _ -> (x,y):ats -- Unlike fromAttrib from tagsoup, this distinguishes -- between a missing attribute and an attribute with empty content. diff --git a/test/command/5986.md b/test/command/5986.md index ea0ca70c1..ed8dd30c9 100644 --- a/test/command/5986.md +++ b/test/command/5986.md @@ -4,7 +4,7 @@ ^D <p><span id="title_page.xhtml"></span></p> <p><span id="nav.xhtml"></span></p> -<nav type="landmarks" id="landmarks" hidden="hidden"> +<nav epub:type="landmarks" id="landmarks" hidden="hidden"> <ol> <li><a href="text/title_page.xhtml">Title Page</a></li> <li><a href="#nav.xhtml#toc">Table of Contents</a></li> diff --git a/test/epub/wasteland.native b/test/epub/wasteland.native index 0ed23eba2..ff59182a6 100644 --- a/test/epub/wasteland.native +++ b/test/epub/wasteland.native @@ -1,8 +1,8 @@ [Para [Image ("",[],[]) [] ("wasteland-cover.jpg","")] ,Para [Span ("wasteland-content.xhtml",[],[]) []] -,Div ("wasteland-content.xhtml#frontmatter",["section"],[("type","frontmatter")]) +,Div ("wasteland-content.xhtml#frontmatter",["section","frontmatter"],[]) [] -,Div ("wasteland-content.xhtml#bodymatter",["section"],[("type","bodymatter")]) +,Div ("wasteland-content.xhtml#bodymatter",["section","bodymatter"],[]) [Div ("wasteland-content.xhtml#ch1",["section"],[]) [Header 2 ("",[],[]) [Str "I.",Space,Str "THE",Space,Str "BURIAL",Space,Str "OF",Space,Str "THE",Space,Str "DEAD"] ,Div ("",["linegroup"],[]) @@ -922,8 +922,8 @@ [Plain [Str "Datta.",Space,Str "Dayadhvam.",Space,Str "Damyata."]] ,Div ("wasteland-content.xhtml#ln434",["linegroup","indent"],[]) [Plain [Span ("",[],[("lang","sa")]) [Str "Shantih",Space,Str "shantih",Space,Str "shantih",Note [Para [Link ("",[],[]) [Str "434."] ("#wasteland-content.xhtml#ln434",""),Space,Str "Shantih.",Space,Str "Repeated",Space,Str "as",Space,Str "here,",Space,Str "a",Space,Str "formal",Space,Str "ending",Space,Str "to",Space,Str "an",Space,Str "Upanishad.",Space,Str "'The",SoftBreak,Str "Peace",Space,Str "which",Space,Str "passeth",Space,Str "understanding'",Space,Str "is",Space,Str "a",Space,Str "feeble",Space,Str "translation",Space,Str "of",Space,Str "the",SoftBreak,Str "content",Space,Str "of",Space,Str "this",Space,Str "word."]],SoftBreak]]]]]] -,Div ("wasteland-content.xhtml#backmatter",["section"],[("type","backmatter")]) - [Div ("wasteland-content.xhtml#rearnotes",["section"],[("type","rearnotes")]) +,Div ("wasteland-content.xhtml#backmatter",["section","backmatter"],[]) + [Div ("wasteland-content.xhtml#rearnotes",["section","rearnotes"],[]) [Header 2 ("",[],[]) [Str "NOTES",Space,Str "ON",Space,Str "\"THE",Space,Str "WASTE",Space,Str "LAND\""] ,Para [Str "Not",Space,Str "only",Space,Str "the",Space,Str "title,",Space,Str "but",Space,Str "the",Space,Str "plan",Space,Str "and",Space,Str "a",Space,Str "good",Space,Str "deal",Space,Str "of",Space,Str "the",Space,Str "incidental",Space,Str "symbolism",Space,Str "of",SoftBreak,Str "the",Space,Str "poem",Space,Str "were",Space,Str "suggested",Space,Str "by",Space,Str "Miss",Space,Str "Jessie",Space,Str "L.",Space,Str "Weston's",Space,Str "book",Space,Str "on",Space,Str "the",Space,Str "Grail",Space,Str "legend:",SoftBreak,Str "From",Space,Str "Ritual",Space,Str "to",Space,Str "Romance"] ,Para [Str "Indeed,",Space,Str "so",Space,Str "deeply",Space,Str "am",Space,Str "I",Space,Str "indebted,",Space,Str "Miss",Space,Str "Weston's",Space,Str "book",Space,Str "will",Space,Str "elucidate",Space,Str "the",SoftBreak,Str "difficulties",Space,Str "of",Space,Str "the",Space,Str "poem",Space,Str "much",Space,Str "better",Space,Str "than",Space,Str "my",Space,Str "notes",Space,Str "can",Space,Str "do;",Space,Str "and",Space,Str "I",Space,Str "recommend",Space,Str "it",SoftBreak,Str "(apart",Space,Str "from",Space,Str "the",Space,Str "great",Space,Str "interest",Space,Str "of",Space,Str "the",Space,Str "book",Space,Str "itself)",Space,Str "to",Space,Str "any",Space,Str "who",Space,Str "think",Space,Str "such",SoftBreak,Str "elucidation",Space,Str "of",Space,Str "the",Space,Str "poem",Space,Str "worth",Space,Str "the",Space,Str "trouble.",Space,Str "To",Space,Str "another",Space,Str "work",Space,Str "of",Space,Str "anthropology",Space,Str "I",Space,Str "am",SoftBreak,Str "indebted",Space,Str "in",Space,Str "general,",Space,Str "one",Space,Str "which",Space,Str "has",Space,Str "influenced",Space,Str "our",Space,Str "generation",Space,Str "profoundly;",Space,Str "I",Space,Str "mean",SoftBreak,Str "The",Space,Str "Golden",Space,Str "Bough;",Space,Str "I",Space,Str "have",Space,Str "used",Space,Str "especially",Space,Str "the",Space,Str "two",Space,Str "volumes",Space,Str "Adonis,",Space,Str "Attis,",Space,Str "Osiris.",SoftBreak,Str "Anyone",Space,Str "who",Space,Str "is",Space,Str "acquainted",Space,Str "with",Space,Str "these",Space,Str "works",Space,Str "will",Space,Str "immediately",Space,Str "recognise",Space,Str "in",Space,Str "the",Space,Str "poem",SoftBreak,Str "certain",Space,Str "references",Space,Str "to",Space,Str "vegetation",Space,Str "ceremonies."] |