{- Copyright (C) 2006-8 John MacFarlane This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -} {- | Module : Text.Pandoc.Readers.HTML Copyright : Copyright (C) 2006-8 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane Stability : alpha Portability : portable Conversion of HTML to 'Pandoc' document. -} module Text.Pandoc.Readers.HTML ( readHtml, rawHtmlInline, rawHtmlBlock, anyHtmlBlockTag, anyHtmlInlineTag, anyHtmlTag, anyHtmlEndTag, htmlEndTag, extractTagType, htmlBlockElement, htmlComment, unsanitaryURI ) where import Text.ParserCombinators.Parsec import Text.Pandoc.Definition import Text.Pandoc.Shared import Text.Pandoc.CharacterReferences ( decodeCharacterReferences ) import Data.Maybe ( fromMaybe ) import Data.List ( isPrefixOf, isSuffixOf, intercalate ) import Data.Char ( toLower, isAlphaNum ) import Network.URI ( parseURIReference, URI (..) ) import Control.Monad ( liftM ) -- | Convert HTML-formatted string to 'Pandoc' document. readHtml :: ParserState -- ^ Parser state -> String -- ^ String to parse (assumes @'\n'@ line endings) -> Pandoc readHtml = readWith parseHtml -- -- Constants -- eitherBlockOrInline :: [[Char]] eitherBlockOrInline = ["applet", "button", "del", "iframe", "ins", "map", "area", "object"] {- inlineHtmlTags :: [[Char]] inlineHtmlTags = ["a", "abbr", "acronym", "b", "basefont", "bdo", "big", "br", "cite", "code", "dfn", "em", "font", "i", "img", "input", "kbd", "label", "q", "s", "samp", "select", "small", "span", "strike", "strong", "sub", "sup", "textarea", "tt", "u", "var"] -} blockHtmlTags :: [[Char]] blockHtmlTags = ["address", "blockquote", "body", "center", "dir", "div", "dl", "fieldset", "form", "h1", "h2", "h3", "h4", "h5", "h6", "head", "hr", "html", "isindex", "menu", "noframes", "noscript", "ol", "p", "pre", "table", "ul", "dd", "dt", "frameset", "li", "tbody", "td", "tfoot", "th", "thead", "tr", "script", "style"] sanitaryTags :: [[Char]] sanitaryTags = ["a", "abbr", "acronym", "address", "area", "b", "big", "blockquote", "br", "button", "caption", "center", "cite", "code", "col", "colgroup", "dd", "del", "dfn", "dir", "div", "dl", "dt", "em", "fieldset", "font", "form", "h1", "h2", "h3", "h4", "h5", "h6", "hr", "i", "img", "input", "ins", "kbd", "label", "legend", "li", "map", "menu", "ol", "optgroup", "option", "p", "pre", "q", "s", "samp", "select", "small", "span", "strike", "strong", "sub", "sup", "table", "tbody", "td", "textarea", "tfoot", "th", "thead", "tr", "tt", "u", "ul", "var"] sanitaryAttributes :: [[Char]] sanitaryAttributes = ["abbr", "accept", "accept-charset", "accesskey", "action", "align", "alt", "axis", "border", "cellpadding", "cellspacing", "char", "charoff", "charset", "checked", "cite", "class", "clear", "cols", "colspan", "color", "compact", "coords", "datetime", "dir", "disabled", "enctype", "for", "frame", "headers", "height", "href", "hreflang", "hspace", "id", "ismap", "label", "lang", "longdesc", "maxlength", "media", "method", "multiple", "name", "nohref", "noshade", "nowrap", "prompt", "readonly", "rel", "rev", "rows", "rowspan", "rules", "scope", "selected", "shape", "size", "span", "src", "start", "summary", "tabindex", "target", "title", "type", "usemap", "valign", "value", "vspace", "width"] -- taken from HXT and extended closes :: String -> String -> Bool "EOF" `closes` _ = True _ `closes` "body" = False _ `closes` "html" = False "a" `closes` "a" = True "li" `closes` "li" = True "th" `closes` t | t `elem` ["th","td"] = True "td" `closes` t | t `elem` ["th","td"] = True "tr" `closes` t | t `elem` ["th","td","tr"] = True "dt" `closes` t | t `elem` ["dt","dd"] = True "dd" `closes` t | t `elem` ["dt","dd"] = True "hr" `closes` "p" = True "p" `closes` "p" = True "meta" `closes` "meta" = True "colgroup" `closes` "colgroup" = True "form" `closes` "form" = True "label" `closes` "label" = True "map" `closes` "map" = True "object" `closes` "object" = True _ `closes` t | t `elem` ["option","style","script","textarea","title"] = True t `closes` "select" | t /= "option" = True "thead" `closes` t | t `elem` ["colgroup"] = True "tfoot" `closes` t | t `elem` ["thead","colgroup"] = True "tbody" `closes` t | t `elem` ["tbody","tfoot","thead","colgroup"] = True t `closes` t2 | t `elem` ["h1","h2","h3","h4","h5","h6","dl","ol","ul","table","div","p"] && t2 `elem` ["h1","h2","h3","h4","h5","h6","p" ] = True -- not "div" t1 `closes` t2 | t1 `elem` blockHtmlTags && t2 `notElem` (blockHtmlTags ++ eitherBlockOrInline) = True _ `closes` _ = False -- -- HTML utility functions -- -- | Returns @True@ if sanitization is specified and the specified tag is -- not on the sanitized tag list. unsanitaryTag :: [Char] -> GenParser tok ParserState Bool unsanitaryTag tag = do st <- getState return $ stateSanitizeHTML st && tag `notElem` sanitaryTags -- | returns @True@ if sanitization is specified and the specified attribute -- is not on the sanitized attribute list. unsanitaryAttribute :: ([Char], String, t) -> GenParser tok ParserState Bool unsanitaryAttribute (attr, val, _) = do st <- getState return $ stateSanitizeHTML st && (attr `notElem` sanitaryAttributes || (attr `elem` ["href","src"] && unsanitaryURI val)) -- | Returns @True@ if the specified URI is potentially a security risk. unsanitaryURI :: String -> Bool unsanitaryURI u = let safeURISchemes = [ "", "http:", "https:", "ftp:", "mailto:", "file:", "telnet:", "gopher:", "aaa:", "aaas:", "acap:", "cap:", "cid:", "crid:", "dav:", "dict:", "dns:", "fax:", "go:", "h323:", "im:", "imap:", "ldap:", "mid:", "news:", "nfs:", "nntp:", "pop:", "pres:", "sip:", "sips:", "snmp:", "tel:", "urn:", "wais:", "xmpp:", "z39.50r:", "z39.50s:", "aim:", "callto:", "cvs:", "ed2k:", "feed:", "fish:", "gg:", "irc:", "ircs:", "lastfm:", "ldaps:", "magnet:", "mms:", "msnim:", "notes:", "rsync:", "secondlife:", "skype:", "ssh:", "sftp:", "smb:", "sms:", "snews:", "webcal:", "ymsgr:"] in case parseURIReference u of Just p -> (map toLower $ uriScheme p) `notElem` safeURISchemes Nothing -> True -- | Read blocks until end tag. blocksTilEnd :: String -> GenParser Char ParserState [Block] blocksTilEnd tag = do blocks <- manyTill (block >>~ spaces) (htmlEndTag tag) return $ filter (/= Null) blocks -- | Read inlines until end tag. inlinesTilEnd :: String -> GenParser Char ParserState [Inline] inlinesTilEnd tag = manyTill inline (htmlEndTag tag) -- | Parse blocks between open and close tag. blocksIn :: String -> GenParser Char ParserState [Block] blocksIn tag = try $ htmlTag tag >> spaces >> blocksTilEnd tag -- | Parse inlines between open and close tag. inlinesIn :: String -> GenParser Char ParserState [Inline] inlinesIn tag = try $ htmlTag tag >> spaces >> inlinesTilEnd tag -- | Extract type from a tag: e.g. @br@ from @\@ extractTagType :: String -> String extractTagType ('<':rest) = let isSpaceOrSlash c = c `elem` "/ \n\t" in map toLower $ takeWhile isAlphaNum $ dropWhile isSpaceOrSlash rest extractTagType _ = "" -- Parse any HTML tag (opening or self-closing) and return tag type anyOpener :: GenParser Char ParserState [Char] anyOpener = try $ do char '<' spaces tag <- many1 alphaNum skipMany htmlAttribute spaces option "" (string "/") spaces char '>' return $ map toLower tag -- | Parse any HTML tag (opening or self-closing) and return text of tag anyHtmlTag :: GenParser Char ParserState [Char] anyHtmlTag = try $ do char '<' spaces tag <- many1 alphaNum attribs <- many htmlAttribute spaces ender <- option "" (string "/") let ender' = if null ender then "" else " /" spaces char '>' let result = "<" ++ tag ++ concatMap (\(_, _, raw) -> (' ':raw)) attribs ++ ender' ++ ">" unsanitary <- unsanitaryTag tag if unsanitary then return $ "" else return result anyHtmlEndTag :: GenParser Char ParserState [Char] anyHtmlEndTag = try $ do char '<' spaces char '/' spaces tag <- many1 alphaNum spaces char '>' let result = "" unsanitary <- unsanitaryTag tag if unsanitary then return $ "" else return result htmlTag :: String -> GenParser Char ParserState (String, [(String, String)]) htmlTag tag = try $ do char '<' spaces stringAnyCase tag attribs <- many htmlAttribute spaces optional (string "/") spaces char '>' return (tag, (map (\(name, content, _) -> (name, content)) attribs)) -- parses a quoted html attribute value quoted :: Char -> GenParser Char st (String, String) quoted quoteChar = do result <- between (char quoteChar) (char quoteChar) (many (noneOf [quoteChar])) return (result, [quoteChar]) nullAttribute :: ([Char], [Char], [Char]) nullAttribute = ("", "", "") htmlAttribute :: GenParser Char ParserState ([Char], [Char], [Char]) htmlAttribute = do attr <- htmlRegularAttribute <|> htmlMinimizedAttribute unsanitary <- unsanitaryAttribute attr if unsanitary then return nullAttribute else return attr -- minimized boolean attribute htmlMinimizedAttribute :: GenParser Char st ([Char], [Char], [Char]) htmlMinimizedAttribute = try $ do many1 space name <- many1 (choice [letter, oneOf ".-_:"]) return (name, name, name) htmlRegularAttribute :: GenParser Char st ([Char], [Char], [Char]) htmlRegularAttribute = try $ do many1 space name <- many1 (choice [letter, oneOf ".-_:"]) spaces char '=' spaces (content, quoteStr) <- choice [ (quoted '\''), (quoted '"'), (do a <- many (noneOf " \t\n\r\"'<>") return (a,"")) ] return (name, content, (name ++ "=" ++ quoteStr ++ content ++ quoteStr)) -- | Parse an end tag of type 'tag' htmlEndTag :: [Char] -> GenParser Char ParserState [Char] htmlEndTag tag = try $ do closedByNext <- lookAhead $ option False $ liftM (`closes` tag) $ anyOpener <|> (eof >> return "EOF") if closedByNext then return "" else do char '<' spaces char '/' spaces stringAnyCase tag spaces char '>' return $ "" -- | Returns @True@ if the tag is (or can be) a block tag. isBlock :: String -> Bool isBlock tag = (extractTagType tag) `elem` (blockHtmlTags ++ eitherBlockOrInline) anyHtmlBlockTag :: GenParser Char ParserState [Char] anyHtmlBlockTag = try $ do tag <- anyHtmlTag <|> anyHtmlEndTag if isBlock tag then return tag else fail "not a block tag" anyHtmlInlineTag :: GenParser Char ParserState [Char] anyHtmlInlineTag = try $ do tag <- anyHtmlTag <|> anyHtmlEndTag if not (isBlock tag) then return tag else fail "not an inline tag" -- | Parses material between script tags. -- Scripts must be treated differently, because they can contain '<>' etc. htmlScript :: GenParser Char ParserState [Char] htmlScript = try $ do lookAhead $ htmlTag "script" open <- anyHtmlTag rest <- liftM concat $ manyTill scriptChunk (htmlEndTag "script") st <- getState if stateSanitizeHTML st && not ("script" `elem` sanitaryTags) then return "" else return $ open ++ rest ++ "" scriptChunk :: GenParser Char ParserState [Char] scriptChunk = jsComment <|> jsString <|> jsChars where jsComment = jsEndlineComment <|> jsMultilineComment jsString = jsSingleQuoteString <|> jsDoubleQuoteString jsChars = many1 (noneOf "<\"'*/") <|> count 1 anyChar jsEndlineComment = try $ do string "//" res <- manyTill anyChar newline return ("//" ++ res) jsMultilineComment = try $ do string "/*" res <- manyTill anyChar (try $ string "*/") return ("/*" ++ res ++ "*/") jsSingleQuoteString = stringwith '\'' jsDoubleQuoteString = stringwith '"' charWithEsc escapable = try $ (try $ char '\\' >> oneOf ('\\':escapable) >>= \x -> return ['\\',x]) <|> count 1 anyChar stringwith c = try $ do char c res <- liftM concat $ manyTill (charWithEsc [c]) (char c) return (c : (res ++ [c])) -- | Parses material between style tags. -- Style tags must be treated differently, because they can contain CSS htmlStyle :: GenParser Char ParserState [Char] htmlStyle = try $ do lookAhead $ htmlTag "style" open <- anyHtmlTag rest <- manyTill anyChar (htmlEndTag "style") st <- getState if stateSanitizeHTML st && not ("style" `elem` sanitaryTags) then return "" else return $ open ++ rest ++ "" htmlBlockElement :: GenParser Char ParserState [Char] htmlBlockElement = choice [ htmlScript, htmlStyle, htmlComment, xmlDec, definition ] rawHtmlBlock :: GenParser Char ParserState Block rawHtmlBlock = try $ do body <- htmlBlockElement <|> rawVerbatimBlock <|> anyHtmlBlockTag state <- getState if stateParseRaw state then return (RawHtml body) else return Null -- This is a block whose contents should be passed through verbatim, not interpreted. rawVerbatimBlock :: GenParser Char ParserState [Char] rawVerbatimBlock = try $ do start <- anyHtmlBlockTag let tagtype = extractTagType start if tagtype `elem` ["pre"] then do contents <- many (notFollowedBy' (htmlEndTag tagtype) >> anyChar) end <- htmlEndTag tagtype return $ start ++ contents ++ end else fail "Not a verbatim block" -- We don't want to parse or as raw HTML, since these -- are handled in parseHtml. rawHtmlBlock' :: GenParser Char ParserState Block rawHtmlBlock' = do notFollowedBy' (htmlTag "/body" <|> htmlTag "/html") rawHtmlBlock -- | Parses an HTML comment. htmlComment :: GenParser Char st [Char] htmlComment = try $ do string "" return $ "" -- -- parsing documents -- xmlDec :: GenParser Char st [Char] xmlDec = try $ do string "') return $ "" definition :: GenParser Char st [Char] definition = try $ do string "') return $ "" nonTitleNonHead :: GenParser Char ParserState Char nonTitleNonHead = try $ do notFollowedBy $ (htmlTag "title" >> return ' ') <|> (htmlEndTag "head" >> return ' ') (rawHtmlBlock >> return ' ') <|> anyChar parseTitle :: GenParser Char ParserState [Inline] parseTitle = try $ do (tag, _) <- htmlTag "title" contents <- inlinesTilEnd tag spaces return contents -- parse header and return meta-information (for now, just title) parseHead :: GenParser Char ParserState Meta parseHead = try $ do htmlTag "head" spaces skipMany nonTitleNonHead contents <- option [] parseTitle skipMany nonTitleNonHead htmlEndTag "head" return $ Meta contents [] [] skipHtmlTag :: String -> GenParser Char ParserState () skipHtmlTag tag = optional (htmlTag tag) -- h1 class="title" representation of title in body bodyTitle :: GenParser Char ParserState [Inline] bodyTitle = try $ do (_, attribs) <- htmlTag "h1" case (extractAttribute "class" attribs) of Just "title" -> return "" _ -> fail "not title" inlinesTilEnd "h1" endOfDoc :: GenParser Char ParserState () endOfDoc = try $ do spaces optional (htmlEndTag "body") spaces optional (htmlEndTag "html" >> many anyChar) -- ignore stuff after eof parseHtml :: GenParser Char ParserState Pandoc parseHtml = do sepEndBy (choice [xmlDec, definition, htmlComment]) spaces skipHtmlTag "html" spaces meta <- option (Meta [] [] []) parseHead spaces skipHtmlTag "body" spaces optional bodyTitle -- skip title in body, because it's represented in meta blocks <- parseBlocks endOfDoc return $ Pandoc meta blocks -- -- parsing blocks -- parseBlocks :: GenParser Char ParserState [Block] parseBlocks = spaces >> sepEndBy block spaces >>= (return . filter (/= Null)) block :: GenParser Char ParserState Block block = choice [ codeBlock , header , hrule , list , blockQuote , para , plain , rawHtmlBlock' , notFollowedBy' endOfDoc >> char '<' >> return Null ] "block" -- -- header blocks -- header :: GenParser Char ParserState Block header = choice (map headerLevel (enumFromTo 1 5)) "header" headerLevel :: Int -> GenParser Char ParserState Block headerLevel n = try $ do let level = "h" ++ show n htmlTag level contents <- inlinesTilEnd level return $ Header n (normalizeSpaces contents) -- -- hrule block -- hrule :: GenParser Char ParserState Block hrule = try $ do (_, attribs) <- htmlTag "hr" state <- getState if not (null attribs) && stateParseRaw state then unexpected "attributes in hr" -- parse as raw in this case else return HorizontalRule -- -- code blocks -- -- Note: HTML tags in code blocks (e.g. for syntax highlighting) are -- skipped, because they are not portable to output formats other than HTML. codeBlock :: GenParser Char ParserState Block codeBlock = try $ do htmlTag "pre" result <- manyTill (many1 (satisfy (/= '<')) <|> ((anyHtmlTag <|> anyHtmlEndTag) >> return "")) (htmlEndTag "pre") let result' = concat result -- drop leading newline if any let result'' = if "\n" `isPrefixOf` result' then drop 1 result' else result' -- drop trailing newline if any let result''' = if "\n" `isSuffixOf` result'' then init result'' else result'' return $ CodeBlock ("",[],[]) $ decodeCharacterReferences result''' -- -- block quotes -- blockQuote :: GenParser Char ParserState Block blockQuote = try $ htmlTag "blockquote" >> spaces >> blocksTilEnd "blockquote" >>= (return . BlockQuote) -- -- list blocks -- list :: GenParser Char ParserState Block list = choice [ bulletList, orderedList, definitionList ] "list" orderedList :: GenParser Char ParserState Block orderedList = try $ do (_, attribs) <- htmlTag "ol" (start, style) <- option (1, DefaultStyle) $ do failIfStrict let sta = fromMaybe "1" $ lookup "start" attribs let sty = fromMaybe (fromMaybe "" $ lookup "style" attribs) $ lookup "class" attribs let sty' = case sty of "lower-roman" -> LowerRoman "upper-roman" -> UpperRoman "lower-alpha" -> LowerAlpha "upper-alpha" -> UpperAlpha "decimal" -> Decimal _ -> DefaultStyle return (read sta, sty') spaces items <- sepEndBy1 (blocksIn "li") spaces htmlEndTag "ol" return $ OrderedList (start, style, DefaultDelim) items bulletList :: GenParser Char ParserState Block bulletList = try $ do htmlTag "ul" spaces items <- sepEndBy1 (blocksIn "li") spaces htmlEndTag "ul" return $ BulletList items definitionList :: GenParser Char ParserState Block definitionList = try $ do failIfStrict -- def lists not part of standard markdown htmlTag "dl" spaces items <- sepEndBy1 definitionListItem spaces htmlEndTag "dl" return $ DefinitionList items definitionListItem :: GenParser Char ParserState ([Inline], [[Block]]) definitionListItem = try $ do terms <- sepEndBy1 (inlinesIn "dt") spaces defs <- sepEndBy1 (blocksIn "dd") spaces let term = intercalate [LineBreak] terms return (term, defs) -- -- paragraph block -- para :: GenParser Char ParserState Block para = try $ htmlTag "p" >> inlinesTilEnd "p" >>= return . Para . normalizeSpaces -- -- plain block -- plain :: GenParser Char ParserState Block plain = many1 inline >>= return . Plain . normalizeSpaces -- -- inline -- inline :: GenParser Char ParserState Inline inline = choice [ charRef , strong , emph , superscript , subscript , strikeout , spanStrikeout , code , str , linebreak , whitespace , link , image , rawHtmlInline , char '&' >> return (Str "&") -- common HTML error ] "inline" code :: GenParser Char ParserState Inline code = try $ do htmlTag "code" result <- manyTill anyChar (htmlEndTag "code") -- remove internal line breaks, leading and trailing space, -- and decode character references return $ Code $ decodeCharacterReferences $ removeLeadingTrailingSpace $ intercalate " " $ lines result rawHtmlInline :: GenParser Char ParserState Inline rawHtmlInline = do result <- anyHtmlInlineTag <|> htmlComment state <- getState if stateParseRaw state then return (HtmlInline result) else return (Str "") betweenTags :: [Char] -> GenParser Char ParserState [Inline] betweenTags tag = try $ htmlTag tag >> inlinesTilEnd tag >>= return . normalizeSpaces emph :: GenParser Char ParserState Inline emph = (betweenTags "em" <|> betweenTags "i") >>= return . Emph strong :: GenParser Char ParserState Inline strong = (betweenTags "b" <|> betweenTags "strong") >>= return . Strong superscript :: GenParser Char ParserState Inline superscript = failIfStrict >> betweenTags "sup" >>= return . Superscript subscript :: GenParser Char ParserState Inline subscript = failIfStrict >> betweenTags "sub" >>= return . Subscript strikeout :: GenParser Char ParserState Inline strikeout = failIfStrict >> (betweenTags "s" <|> betweenTags "strike") >>= return . Strikeout spanStrikeout :: GenParser Char ParserState Inline spanStrikeout = try $ do failIfStrict -- strict markdown has no strikeout, so treat as raw HTML (_, attributes) <- htmlTag "span" result <- case (extractAttribute "class" attributes) of Just "strikeout" -> inlinesTilEnd "span" _ -> fail "not a strikeout" return $ Strikeout result whitespace :: GenParser Char st Inline whitespace = many1 space >> return Space -- hard line break linebreak :: GenParser Char ParserState Inline linebreak = htmlTag "br" >> optional newline >> return LineBreak str :: GenParser Char st Inline str = many1 (noneOf "< \t\n&") >>= return . Str -- -- links and images -- -- extract contents of attribute (attribute names are case-insensitive) extractAttribute :: [Char] -> [([Char], String)] -> Maybe String extractAttribute _ [] = Nothing extractAttribute name ((attrName, contents):rest) = let name' = map toLower name attrName' = map toLower attrName in if attrName' == name' then Just (decodeCharacterReferences contents) else extractAttribute name rest link :: GenParser Char ParserState Inline link = try $ do (_, attributes) <- htmlTag "a" url <- case (extractAttribute "href" attributes) of Just url -> return url Nothing -> fail "no href" let title = fromMaybe "" $ extractAttribute "title" attributes lab <- inlinesTilEnd "a" return $ Link (normalizeSpaces lab) (url, title) image :: GenParser Char ParserState Inline image = try $ do (_, attributes) <- htmlTag "img" url <- case (extractAttribute "src" attributes) of Just url -> return url Nothing -> fail "no src" let title = fromMaybe "" $ extractAttribute "title" attributes let alt = fromMaybe "" (extractAttribute "alt" attributes) return $ Image [Str alt] (url, title)