From 94466c006005892dc177a8a0518ccf3c55b4e51b Mon Sep 17 00:00:00 2001
From: John MacFarlane <jgm@berkeley.edu>
Date: Thu, 7 Aug 2014 22:12:44 -0700
Subject: HTML reader: Really ignore DOCTYPE and xml declarations.

This actually does what commit d71b013841f3c9c8c595591e312a31df16a728cb
said it did.

Revised epub tests to remove the repeated DOCTYPE and xml declarations.
---
 src/Text/Pandoc/Readers/HTML.hs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'src/Text/Pandoc/Readers')

diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs
index a8df1394c..42ef11065 100644
--- a/src/Text/Pandoc/Readers/HTML.hs
+++ b/src/Text/Pandoc/Readers/HTML.hs
@@ -289,7 +289,7 @@ fixPlains inList bs = if any isParaish bs'
 pRawTag :: TagParser String
 pRawTag = do
   tag <- pAnyTag
-  let ignorable x = x `elem` ["html","head","body","DOCTYPE","?xml"]
+  let ignorable x = x `elem` ["html","head","body","!DOCTYPE","?xml"]
   if tagOpen ignorable (const True) tag || tagClose ignorable tag
      then return []
      else return $ renderTags' [tag]
@@ -728,7 +728,7 @@ inlineHtmlTags = ["a", "abbr", "acronym", "b", "basefont", "bdo", "big",
 -}
 
 blockHtmlTags :: [String]
-blockHtmlTags = ["address", "article", "aside",
+blockHtmlTags = ["?xml", "!DOCTYPE", "address", "article", "aside",
                  "blockquote", "body", "button", "canvas",
                  "caption", "center", "col", "colgroup", "dd", "dir", "div",
                  "dl", "dt", "embed", "fieldset", "figcaption", "figure",
-- 
cgit v1.2.3