From 4399db4fd2d3f1f936e21cd0fe732142a7fccab2 Mon Sep 17 00:00:00 2001 From: fiddlosopher Date: Sat, 18 Aug 2007 23:44:26 +0000 Subject: Bug fixes in readers: + LaTeX reader: skip anything after \end{document} + HTML reader: fixed bug skipping material after </html> -- previously, stuff at the end was skipped even if no </html> was present, which meant only part of the file would be parsed and no error issued + HTML reader: added new constant eitherBlockOrInline with elements that may count either as block-level or inline + Modified isInline and isBlock to take this into account + modified rawHtmlBlock to accept any tag (even an inline tag); this is innocuous, because rawHtmlBlock is tried only if a regular inline element can't be parsed. git-svn-id: https://pandoc.googlecode.com/svn/trunk@862 788f1e2b-df1e-0410-8736-df70ead52e1b --- src/Text/Pandoc/Readers/HTML.hs | 24 ++++++++++++++++++------ src/Text/Pandoc/Readers/LaTeX.hs | 3 ++- 2 files changed, 20 insertions(+), 7 deletions(-) (limited to 'src/Text/Pandoc') diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs index 85aa1e4a3..ac3947ad1 100644 --- a/src/Text/Pandoc/Readers/HTML.hs +++ b/src/Text/Pandoc/Readers/HTML.hs @@ -59,11 +59,21 @@ readHtml = readWith parseHtml -- Constants -- +eitherBlockOrInline = ["applet", "button", "del", "iframe", "ins", + "map", "area", "object", "script"] + inlineHtmlTags = ["a", "abbr", "acronym", "b", "basefont", "bdo", "big", "br", "cite", "code", "dfn", "em", "font", "i", "img", "input", "kbd", "label", "q", "s", "samp", "select", "small", "span", "strike", "strong", "sub", "sup", - "textarea", "tt", "u", "var"] + "textarea", "tt", "u", "var"] ++ eitherBlockOrInline + +blockHtmlTags = ["address", "blockquote", "center", "dir", "div", + "dl", "fieldset", "form", "h1", "h2", "h3", "h4", + "h5", "h6", "hr", "isindex", "menu", "noframes", + "noscript", "ol", "p", "pre", "table", "ul", "dd", + "dt", "frameset", "li", "tbody", "td", "tfoot", + "th", "thead", "tr"] ++ 
eitherBlockOrInline -- -- HTML utility functions @@ -171,12 +181,15 @@ htmlEndTag tag = try $ do char '>' return $ "" --- | Returns @True@ if the tag is an inline tag. +-- | Returns @True@ if the tag is (or can be) an inline tag. isInline tag = (extractTagType tag) `elem` inlineHtmlTags +-- | Returns @True@ if the tag is (or can be) a block tag. +isBlock tag = (extractTagType tag) `elem` blockHtmlTags + anyHtmlBlockTag = try $ do tag <- anyHtmlTag <|> anyHtmlEndTag - if isInline tag then fail "inline tag" else return tag + if isBlock tag then return tag else fail "inline tag" anyHtmlInlineTag = try $ do tag <- anyHtmlTag <|> anyHtmlEndTag @@ -193,7 +206,7 @@ htmlBlockElement = choice [ htmlScript, htmlComment, xmlDec, definition ] rawHtmlBlock = try $ do notFollowedBy' (htmlTag "/body" <|> htmlTag "/html") - body <- htmlBlockElement <|> anyHtmlBlockTag + body <- htmlBlockElement <|> anyHtmlTag <|> anyHtmlEndTag sp <- many space state <- getState if stateParseRaw state then return (RawHtml (body ++ sp)) else return Null @@ -260,8 +273,7 @@ parseHtml = do spaces optional (htmlEndTag "body") spaces - optional (htmlEndTag "html") - many anyChar -- ignore anything after </html> + optional (htmlEndTag "html" >> many anyChar) -- ignore anything after </html> eof return $ Pandoc (Meta title authors date) blocks diff --git a/src/Text/Pandoc/Readers/LaTeX.hs b/src/Text/Pandoc/Readers/LaTeX.hs index 84bda1942..9440b446a 100644 --- a/src/Text/Pandoc/Readers/LaTeX.hs +++ b/src/Text/Pandoc/Readers/LaTeX.hs @@ -117,7 +117,8 @@ parseLaTeX = do spaces blocks <- parseBlocks spaces - optional $ try (string "\\end{document}") -- might not be present (fragment) + optional $ try (string "\\end{document}" >> many anyChar) + -- might not be present (fragment) spaces eof state <- getState -- cgit v1.2.3