diff options
author | fiddlosopher <fiddlosopher@788f1e2b-df1e-0410-8736-df70ead52e1b> | 2010-02-02 07:36:55 +0000 |
---|---|---|
committer | fiddlosopher <fiddlosopher@788f1e2b-df1e-0410-8736-df70ead52e1b> | 2010-02-02 07:36:55 +0000 |
commit | 19b0c72dd18050a00dd77bb3dfddd0d0702d157f (patch) | |
tree | c120e270496a1ecba0c2badd4b8bc2a59c7d0a93 | |
parent | 70a7d7b2141472fe558bfc191f259667ed09f911 (diff) | |
download | pandoc-19b0c72dd18050a00dd77bb3dfddd0d0702d157f.tar.gz |
Made HTML reader much more forgiving.
+ Incorporated idea (from HXT) that an element can be closed
by an open tag for another element.
+ Javascript is partially parsed to make sure that a <script>
section is not closed by a </script> in a comment or string.
+ More lenient non-quoted attribute values.
Now we accept anything but a space character, quote, or <>.
This helps in parsing e.g. www.google.com!
+ Bare & signs are now parsed as a string. This is a common
HTML mistake.
+ Skip a bare < in malformed HTML.
git-svn-id: https://pandoc.googlecode.com/svn/trunk@1825 788f1e2b-df1e-0410-8736-df70ead52e1b
-rw-r--r-- | src/Text/Pandoc/Readers/HTML.hs | 135 |
1 files changed, 106 insertions, 29 deletions
diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs index c0df3aa65..58762c35f 100644 --- a/src/Text/Pandoc/Readers/HTML.hs +++ b/src/Text/Pandoc/Readers/HTML.hs @@ -50,6 +50,7 @@ import Data.Maybe ( fromMaybe ) import Data.List ( isPrefixOf, isSuffixOf, intercalate ) import Data.Char ( toLower, isAlphaNum ) import Network.URI ( parseURIReference, URI (..) ) +import Control.Monad ( liftM ) -- | Convert HTML-formatted string to 'Pandoc' document. readHtml :: ParserState -- ^ Parser state @@ -71,7 +72,7 @@ inlineHtmlTags = ["a", "abbr", "acronym", "b", "basefont", "bdo", "big", "br", "cite", "code", "dfn", "em", "font", "i", "img", "input", "kbd", "label", "q", "s", "samp", "select", "small", "span", "strike", "strong", "sub", "sup", - "textarea", "tt", "u", "var"] ++ eitherBlockOrInline + "textarea", "tt", "u", "var"] -} blockHtmlTags :: [[Char]] @@ -80,7 +81,7 @@ blockHtmlTags = ["address", "blockquote", "body", "center", "dir", "div", "h5", "h6", "head", "hr", "html", "isindex", "menu", "noframes", "noscript", "ol", "p", "pre", "table", "ul", "dd", "dt", "frameset", "li", "tbody", "td", "tfoot", - "th", "thead", "tr", "script"] ++ eitherBlockOrInline + "th", "thead", "tr", "script", "style"] sanitaryTags :: [[Char]] sanitaryTags = ["a", "abbr", "acronym", "address", "area", "b", "big", @@ -112,6 +113,40 @@ sanitaryAttributes = ["abbr", "accept", "accept-charset", "summary", "tabindex", "target", "title", "type", "usemap", "valign", "value", "vspace", "width"] +-- taken from HXT and extended + +closes :: String -> String -> Bool +"EOF" `closes` _ = True +_ `closes` "body" = False +_ `closes` "html" = False +"a" `closes` "a" = True +"li" `closes` "li" = True +"th" `closes` t | t `elem` ["th","td"] = True +"td" `closes` t | t `elem` ["th","td"] = True +"tr" `closes` t | t `elem` ["th","td","tr"] = True +"dt" `closes` t | t `elem` ["dt","dd"] = True +"dd" `closes` t | t `elem` ["dt","dd"] = True +"hr" `closes` "p" = True +"p" `closes` "p" = True +"meta" `closes` "meta" = True +"colgroup" `closes` "colgroup" = True +"form" `closes` "form" = True +"label" `closes` "label" = True +"map" `closes` "map" = True +"object" `closes` "object" = True +_ `closes` t | t `elem` ["option","style","script","textarea","title"] = True +t `closes` "select" | t /= "option" = True +"thead" `closes` t | t `elem` ["colgroup"] = True +"tfoot" `closes` t | t `elem` ["thead","colgroup"] = True +"tbody" `closes` t | t `elem` ["tbody","tfoot","thead","colgroup"] = True +t `closes` t2 | + t `elem` ["h1","h2","h3","h4","h5","h6","dl","ol","ul","table","div","p"] && + t2 `elem` ["h1","h2","h3","h4","h5","h6","p" ] = True -- not "div" +t1 `closes` t2 | + t1 `elem` blockHtmlTags && + t2 `notElem` (blockHtmlTags ++ eitherBlockOrInline) = True +_ `closes` _ = False + -- -- HTML utility functions -- @@ -176,6 +211,19 @@ extractTagType ('<':rest) = map toLower $ takeWhile isAlphaNum $ dropWhile isSpaceOrSlash rest extractTagType _ = "" +-- Parse any HTML tag (opening or self-closing) and return tag type +anyOpener :: GenParser Char ParserState [Char] +anyOpener = try $ do + char '<' + spaces + tag <- many1 alphaNum + skipMany htmlAttribute + spaces + option "" (string "/") + spaces + char '>' + return $ map toLower tag + -- | Parse any HTML tag (opening or self-closing) and return text of tag anyHtmlTag :: GenParser Char ParserState [Char] anyHtmlTag = try $ do @@ -257,32 +305,30 @@ htmlRegularAttribute = try $ do (content, quoteStr) <- choice [ (quoted '\''), (quoted '"'), (do - a <- many (alphaNum <|> (oneOf "-._:")) + a <- many (noneOf " \t\n\r\"'<>") return (a,"")) ] return (name, content, (name ++ "=" ++ quoteStr ++ content ++ quoteStr)) -- | Parse an end tag of type 'tag' -htmlEndTag :: [Char] -> GenParser Char st [Char] +htmlEndTag :: [Char] -> GenParser Char ParserState [Char] htmlEndTag tag = try $ do - char '<' - spaces - char '/' - spaces - stringAnyCase tag - spaces - char '>' - return $ "</" ++ tag ++ ">" - -{- --- | Returns @True@ if the tag is (or can be) an inline tag. -isInline :: String -> Bool -isInline tag = (extractTagType tag) `elem` inlineHtmlTags --} + closedByNext <- lookAhead $ option False $ liftM (`closes` tag) $ + anyOpener <|> (eof >> return "EOF") + if closedByNext + then return "" + else do char '<' + spaces + char '/' + spaces + stringAnyCase tag + spaces + char '>' + return $ "</" ++ tag ++ ">" -- | Returns @True@ if the tag is (or can be) a block tag. isBlock :: String -> Bool -isBlock tag = (extractTagType tag) `elem` blockHtmlTags +isBlock tag = (extractTagType tag) `elem` (blockHtmlTags ++ eitherBlockOrInline) anyHtmlBlockTag :: GenParser Char ParserState [Char] anyHtmlBlockTag = try $ do @@ -298,18 +344,43 @@ anyHtmlInlineTag = try $ do -- Scripts must be treated differently, because they can contain '<>' etc. htmlScript :: GenParser Char ParserState [Char] htmlScript = try $ do - open <- string "<script" - rest <- manyTill anyChar (htmlEndTag "script") + lookAhead $ htmlTag "script" + open <- anyHtmlTag + rest <- liftM concat $ manyTill scriptChunk (htmlEndTag "script") st <- getState if stateSanitizeHTML st && not ("script" `elem` sanitaryTags) then return "<!-- unsafe HTML removed -->" else return $ open ++ rest ++ "</script>" +scriptChunk :: GenParser Char ParserState [Char] +scriptChunk = jsComment <|> jsString <|> jsChars + where jsComment = jsEndlineComment <|> jsMultilineComment + jsString = jsSingleQuoteString <|> jsDoubleQuoteString + jsChars = many1 (noneOf "<\"'*/") <|> count 1 anyChar + jsEndlineComment = try $ do + string "//" + res <- manyTill anyChar newline + return ("//" ++ res) + jsMultilineComment = try $ do + string "/*" + res <- manyTill anyChar (try $ string "*/") + return ("/*" ++ res ++ "*/") + jsSingleQuoteString = stringwith '\'' + jsDoubleQuoteString = stringwith '"' + charWithEsc escapable = try $ + (try $ char '\\' >> oneOf ('\\':escapable) >>= \x -> return ['\\',x]) + <|> count 1 anyChar + stringwith c = try $ do + char c + res <- liftM concat $ manyTill (charWithEsc [c]) (char c) + return (c : (res ++ [c])) + -- | Parses material between style tags. -- Style tags must be treated differently, because they can contain CSS htmlStyle :: GenParser Char ParserState [Char] htmlStyle = try $ do - open <- string "<style" + lookAhead $ htmlTag "style" + open <- anyHtmlTag rest <- manyTill anyChar (htmlEndTag "style") st <- getState if stateSanitizeHTML st && not ("style" `elem` sanitaryTags) @@ -404,6 +475,14 @@ bodyTitle = try $ do _ -> fail "not title" inlinesTilEnd "h1" +endOfDoc :: GenParser Char ParserState () +endOfDoc = try $ do + spaces + optional (htmlEndTag "body") + spaces + optional (htmlEndTag "html" >> many anyChar) -- ignore stuff after </html> + eof + parseHtml :: GenParser Char ParserState Pandoc parseHtml = do sepEndBy (choice [xmlDec, definition, htmlComment]) spaces @@ -415,11 +494,7 @@ parseHtml = do spaces optional bodyTitle -- skip title in body, because it's represented in meta blocks <- parseBlocks - spaces - optional (htmlEndTag "body") - spaces - optional (htmlEndTag "html" >> many anyChar) -- ignore anything after </html> - eof + endOfDoc return $ Pandoc meta blocks -- @@ -438,6 +513,7 @@ block = choice [ codeBlock , para , plain , rawHtmlBlock' + , notFollowedBy' endOfDoc >> char '<' >> return Null ] <?> "block" -- @@ -586,6 +662,7 @@ inline = choice [ charRef , link , image , rawHtmlInline + , char '&' >> return (Str "&") -- common HTML error ] <?> "inline" code :: GenParser Char ParserState Inline @@ -599,7 +676,7 @@ code = try $ do rawHtmlInline :: GenParser Char ParserState Inline rawHtmlInline = do - result <- htmlScript <|> htmlStyle <|> htmlComment <|> anyHtmlInlineTag + result <- anyHtmlInlineTag <|> htmlComment state <- getState if stateParseRaw state then return (HtmlInline result) else return (Str "") @@ -640,7 +717,7 @@ linebreak :: GenParser Char ParserState Inline linebreak = htmlTag "br" >> optional newline >> return LineBreak str :: GenParser Char st Inline -str = many1 (noneOf "<& \t\n") >>= return . Str +str = many1 (noneOf "< \t\n&") >>= return . Str -- -- links and images |