From 94466c006005892dc177a8a0518ccf3c55b4e51b Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Thu, 7 Aug 2014 22:12:44 -0700 Subject: HTML reader: Really ignore DOCTYPE and xml declarations. This actually does what d71b013841f3c9c8c595591e312a31df16a728cb said it did. Revised epub tests to remove the repeated DOCTYPE and xml tags. --- src/Text/Pandoc/Readers/HTML.hs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'src/Text/Pandoc/Readers') diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs index a8df1394c..42ef11065 100644 --- a/src/Text/Pandoc/Readers/HTML.hs +++ b/src/Text/Pandoc/Readers/HTML.hs @@ -289,7 +289,7 @@ fixPlains inList bs = if any isParaish bs' pRawTag :: TagParser String pRawTag = do tag <- pAnyTag - let ignorable x = x `elem` ["html","head","body","DOCTYPE","?xml"] + let ignorable x = x `elem` ["html","head","body","!DOCTYPE","?xml"] if tagOpen ignorable (const True) tag || tagClose ignorable tag then return [] else return $ renderTags' [tag] @@ -728,7 +728,7 @@ inlineHtmlTags = ["a", "abbr", "acronym", "b", "basefont", "bdo", "big", -} blockHtmlTags :: [String] -blockHtmlTags = ["address", "article", "aside", +blockHtmlTags = ["?xml", "!DOCTYPE", "address", "article", "aside", "blockquote", "body", "button", "canvas", "caption", "center", "col", "colgroup", "dd", "dir", "div", "dl", "dt", "embed", "fieldset", "figcaption", "figure", -- cgit v1.2.3