From 4399db4fd2d3f1f936e21cd0fe732142a7fccab2 Mon Sep 17 00:00:00 2001
From: fiddlosopher <fiddlosopher@788f1e2b-df1e-0410-8736-df70ead52e1b>
Date: Sat, 18 Aug 2007 23:44:26 +0000
Subject: Bug fixes in readers: + LaTeX reader: skip anything after
 \end{document} + HTML reader: fixed bug skipping material after </html> --
 previously, stuff at the end was skipped even if no </html> was present,
 which meant only part of the file would be parsed and no error issued +
 HTML reader: added new constant eitherBlockOrInline with elements that may
 count either as block-level or inline + Modified isInline and isBlock to take
 this into account + modified rawHtmlBlock to accept any tag (even an inline
 tag); this is innocuous, because rawHtmlBlock is tried only if a regular
 inline element can't be parsed.

git-svn-id: https://pandoc.googlecode.com/svn/trunk@862 788f1e2b-df1e-0410-8736-df70ead52e1b
---
 src/Text/Pandoc/Readers/HTML.hs  | 24 ++++++++++++++++++------
 src/Text/Pandoc/Readers/LaTeX.hs |  3 ++-
 2 files changed, 20 insertions(+), 7 deletions(-)
diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs
index 85aa1e4a3..ac3947ad1 100644
--- a/src/Text/Pandoc/Readers/HTML.hs
+++ b/src/Text/Pandoc/Readers/HTML.hs
@@ -59,11 +59,21 @@ readHtml = readWith parseHtml
 -- Constants
 --
 
+eitherBlockOrInline = ["applet", "button", "del", "iframe", "ins",
+                  "map", "area", "object", "script"]
+
 inlineHtmlTags = ["a", "abbr", "acronym", "b", "basefont", "bdo", "big",
                   "br", "cite", "code", "dfn", "em", "font", "i", "img",
                   "input", "kbd", "label", "q", "s", "samp", "select",
                   "small", "span", "strike", "strong", "sub", "sup",
-                  "textarea", "tt", "u", "var"]
+                  "textarea", "tt", "u", "var"] ++ eitherBlockOrInline
+
+blockHtmlTags = ["address", "blockquote", "center", "dir", "div",
+                 "dl", "fieldset", "form", "h1", "h2", "h3", "h4",
+                 "h5", "h6", "hr", "isindex", "menu", "noframes",
+                 "noscript", "ol", "p", "pre", "table", "ul", "dd",
+                 "dt", "frameset", "li", "tbody", "td", "tfoot",
+                 "th", "thead", "tr"] ++ eitherBlockOrInline
 
 --
 -- HTML utility functions
@@ -171,12 +181,15 @@ htmlEndTag tag = try $ do
   char '>'
   return $ "</" ++ tag ++ ">"
 
--- | Returns @True@ if the tag is an inline tag.
+-- | Returns @True@ if the tag is (or can be) an inline tag.
 isInline tag = (extractTagType tag) `elem` inlineHtmlTags
 
+-- | Returns @True@ if the tag is (or can be) a block tag.
+isBlock tag = (extractTagType tag) `elem` blockHtmlTags 
+
 anyHtmlBlockTag = try $ do
   tag <- anyHtmlTag <|> anyHtmlEndTag
-  if isInline tag then fail "inline tag" else return tag
+  if isBlock tag then return tag else fail "inline tag"
 
 anyHtmlInlineTag = try $ do
   tag <- anyHtmlTag <|> anyHtmlEndTag
@@ -193,7 +206,7 @@ htmlBlockElement = choice [ htmlScript, htmlComment, xmlDec, definition ]
 
 rawHtmlBlock = try $ do
   notFollowedBy' (htmlTag "/body" <|> htmlTag "/html")
-  body <- htmlBlockElement <|> anyHtmlBlockTag
+  body <- htmlBlockElement <|> anyHtmlTag <|> anyHtmlEndTag
   sp <- many space
   state <- getState
   if stateParseRaw state then return (RawHtml (body ++ sp)) else return Null
@@ -260,8 +273,7 @@ parseHtml = do
   spaces
   optional (htmlEndTag "body")
   spaces
-  optional (htmlEndTag "html")
-  many anyChar -- ignore anything after </html>
+  optional (htmlEndTag "html" >> many anyChar) -- ignore anything after </html>
   eof
   return $ Pandoc (Meta title authors date) blocks
 
diff --git a/src/Text/Pandoc/Readers/LaTeX.hs b/src/Text/Pandoc/Readers/LaTeX.hs
index 84bda1942..9440b446a 100644
--- a/src/Text/Pandoc/Readers/LaTeX.hs
+++ b/src/Text/Pandoc/Readers/LaTeX.hs
@@ -117,7 +117,8 @@ parseLaTeX = do
   spaces
   blocks <- parseBlocks
   spaces
-  optional $ try (string "\\end{document}") -- might not be present (fragment)
+  optional $ try (string "\\end{document}" >> many anyChar) 
+  -- \end{document} might not be present (fragment); anything after it is skipped
   spaces
   eof
   state <- getState
-- 
cgit v1.2.3