diff options
Diffstat (limited to 'Text/Pandoc/Readers/HTML.hs')
-rw-r--r-- | Text/Pandoc/Readers/HTML.hs | 675 |
1 files changed, 0 insertions, 675 deletions
diff --git a/Text/Pandoc/Readers/HTML.hs b/Text/Pandoc/Readers/HTML.hs deleted file mode 100644 index 65e512b5e..000000000 --- a/Text/Pandoc/Readers/HTML.hs +++ /dev/null @@ -1,675 +0,0 @@ -{- -Copyright (C) 2006-8 John MacFarlane <jgm@berkeley.edu> - -This program is free software; you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation; either version 2 of the License, or -(at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA --} - -{- | - Module : Text.Pandoc.Readers.HTML - Copyright : Copyright (C) 2006-8 John MacFarlane - License : GNU GPL, version 2 or above - - Maintainer : John MacFarlane <jgm@berkeley.edu> - Stability : alpha - Portability : portable - -Conversion of HTML to 'Pandoc' document. --} -module Text.Pandoc.Readers.HTML ( - readHtml, - rawHtmlInline, - rawHtmlBlock, - anyHtmlBlockTag, - anyHtmlInlineTag, - anyHtmlTag, - anyHtmlEndTag, - htmlEndTag, - extractTagType, - htmlBlockElement, - unsanitaryURI - ) where - -import Text.ParserCombinators.Parsec -import Text.Pandoc.Definition -import Text.Pandoc.Shared -import Text.Pandoc.CharacterReferences ( decodeCharacterReferences ) -import Data.Maybe ( fromMaybe ) -import Data.List ( takeWhile, dropWhile, isPrefixOf, isSuffixOf, intercalate ) -import Data.Char ( toLower, isAlphaNum ) -import Network.URI ( parseURIReference, URI (..) ) - --- | Convert HTML-formatted string to 'Pandoc' document. -readHtml :: ParserState -- ^ Parser state - -> String -- ^ String to parse - -> Pandoc -readHtml = readWith parseHtml - --- --- Constants --- - -eitherBlockOrInline :: [[Char]] -eitherBlockOrInline = ["applet", "button", "del", "iframe", "ins", - "map", "area", "object"] - -{- -inlineHtmlTags :: [[Char]] -inlineHtmlTags = ["a", "abbr", "acronym", "b", "basefont", "bdo", "big", - "br", "cite", "code", "dfn", "em", "font", "i", "img", - "input", "kbd", "label", "q", "s", "samp", "select", - "small", "span", "strike", "strong", "sub", "sup", - "textarea", "tt", "u", "var"] ++ eitherBlockOrInline --} - -blockHtmlTags :: [[Char]] -blockHtmlTags = ["address", "blockquote", "body", "center", "dir", "div", - "dl", "fieldset", "form", "h1", "h2", "h3", "h4", - "h5", "h6", "hr", "html", "isindex", "menu", "noframes", - "noscript", "ol", "p", "pre", "table", "ul", "dd", - "dt", "frameset", "li", "tbody", "td", "tfoot", - "th", "thead", "tr", "script"] ++ eitherBlockOrInline - -sanitaryTags :: [[Char]] -sanitaryTags = ["a", "abbr", "acronym", "address", "area", "b", "big", - "blockquote", "br", "button", "caption", "center", - "cite", "code", "col", "colgroup", "dd", "del", "dfn", - "dir", "div", "dl", "dt", "em", "fieldset", "font", - "form", "h1", "h2", "h3", "h4", "h5", "h6", "hr", - "i", "img", "input", "ins", "kbd", "label", "legend", - "li", "map", "menu", "ol", "optgroup", "option", "p", - "pre", "q", "s", "samp", "select", "small", "span", - "strike", "strong", "sub", "sup", "table", "tbody", - "td", "textarea", "tfoot", "th", "thead", "tr", "tt", - "u", "ul", "var"] - -sanitaryAttributes :: [[Char]] -sanitaryAttributes = ["abbr", "accept", "accept-charset", - "accesskey", "action", "align", "alt", "axis", - "border", "cellpadding", "cellspacing", "char", - "charoff", "charset", "checked", "cite", "class", - "clear", "cols", "colspan", "color", "compact", - "coords", "datetime", "dir", "disabled", - "enctype", "for", "frame", "headers", "height", - "href", "hreflang", "hspace", "id", "ismap", - "label", "lang", "longdesc", "maxlength", "media", - "method", "multiple", "name", "nohref", "noshade", - "nowrap", "prompt", "readonly", "rel", "rev", - "rows", "rowspan", "rules", "scope", "selected", - "shape", "size", "span", "src", "start", - "summary", "tabindex", "target", "title", "type", - "usemap", "valign", "value", "vspace", "width"] - --- --- HTML utility functions --- - --- | Returns @True@ if sanitization is specified and the specified tag is --- not on the sanitized tag list. -unsanitaryTag :: [Char] - -> GenParser tok ParserState Bool -unsanitaryTag tag = do - st <- getState - return $ stateSanitizeHTML st && tag `notElem` sanitaryTags - --- | returns @True@ if sanitization is specified and the specified attribute --- is not on the sanitized attribute list. -unsanitaryAttribute :: ([Char], String, t) - -> GenParser tok ParserState Bool -unsanitaryAttribute (attr, val, _) = do - st <- getState - return $ stateSanitizeHTML st && - (attr `notElem` sanitaryAttributes || - (attr `elem` ["href","src"] && unsanitaryURI val)) - --- | Returns @True@ if the specified URI is potentially a security risk. -unsanitaryURI :: String -> Bool -unsanitaryURI u = - let safeURISchemes = [ "", "http:", "https:", "ftp:", "mailto:", "file:", - "telnet:", "gopher:", "aaa:", "aaas:", "acap:", "cap:", "cid:", - "crid:", "dav:", "dict:", "dns:", "fax:", "go:", "h323:", "im:", - "imap:", "ldap:", "mid:", "news:", "nfs:", "nntp:", "pop:", - "pres:", "sip:", "sips:", "snmp:", "tel:", "urn:", "wais:", - "xmpp:", "z39.50r:", "z39.50s:", "aim:", "callto:", "cvs:", - "ed2k:", "feed:", "fish:", "gg:", "irc:", "ircs:", "lastfm:", - "ldaps:", "magnet:", "mms:", "msnim:", "notes:", "rsync:", - "secondlife:", "skype:", "ssh:", "sftp:", "smb:", "sms:", - "snews:", "webcal:", "ymsgr:"] - in case parseURIReference u of - Just p -> (map toLower $ uriScheme p) `notElem` safeURISchemes - Nothing -> True - --- | Read blocks until end tag. -blocksTilEnd :: String -> GenParser Char ParserState [Block] -blocksTilEnd tag = do - blocks <- manyTill (block >>~ spaces) (htmlEndTag tag) - return $ filter (/= Null) blocks - --- | Read inlines until end tag. -inlinesTilEnd :: String -> GenParser Char ParserState [Inline] -inlinesTilEnd tag = manyTill inline (htmlEndTag tag) - --- | Parse blocks between open and close tag. -blocksIn :: String -> GenParser Char ParserState [Block] -blocksIn tag = try $ htmlTag tag >> spaces >> blocksTilEnd tag - --- | Parse inlines between open and close tag. -inlinesIn :: String -> GenParser Char ParserState [Inline] -inlinesIn tag = try $ htmlTag tag >> spaces >> inlinesTilEnd tag - --- | Extract type from a tag: e.g. @br@ from @\<br\>@ -extractTagType :: String -> String -extractTagType ('<':rest) = - let isSpaceOrSlash c = c `elem` "/ \n\t" in - map toLower $ takeWhile isAlphaNum $ dropWhile isSpaceOrSlash rest -extractTagType _ = "" - --- | Parse any HTML tag (opening or self-closing) and return text of tag -anyHtmlTag :: GenParser Char ParserState [Char] -anyHtmlTag = try $ do - char '<' - spaces - tag <- many1 alphaNum - attribs <- many htmlAttribute - spaces - ender <- option "" (string "/") - let ender' = if null ender then "" else " /" - spaces - char '>' - let result = "<" ++ tag ++ - concatMap (\(_, _, raw) -> (' ':raw)) attribs ++ ender' ++ ">" - unsanitary <- unsanitaryTag tag - if unsanitary - then return $ "<!-- unsafe HTML removed -->" - else return result - -anyHtmlEndTag :: GenParser Char ParserState [Char] -anyHtmlEndTag = try $ do - char '<' - spaces - char '/' - spaces - tag <- many1 alphaNum - spaces - char '>' - let result = "</" ++ tag ++ ">" - unsanitary <- unsanitaryTag tag - if unsanitary - then return $ "<!-- unsafe HTML removed -->" - else return result - -htmlTag :: String -> GenParser Char ParserState (String, [(String, String)]) -htmlTag tag = try $ do - char '<' - spaces - stringAnyCase tag - attribs <- many htmlAttribute - spaces - optional (string "/") - spaces - char '>' - return (tag, (map (\(name, content, _) -> (name, content)) attribs)) - --- parses a quoted html attribute value -quoted :: Char -> GenParser Char st (String, String) -quoted quoteChar = do - result <- between (char quoteChar) (char quoteChar) - (many (noneOf [quoteChar])) - return (result, [quoteChar]) - -nullAttribute :: ([Char], [Char], [Char]) -nullAttribute = ("", "", "") - -htmlAttribute :: GenParser Char ParserState ([Char], [Char], [Char]) -htmlAttribute = do - attr <- htmlRegularAttribute <|> htmlMinimizedAttribute - unsanitary <- unsanitaryAttribute attr - if unsanitary - then return nullAttribute - else return attr - --- minimized boolean attribute -htmlMinimizedAttribute :: GenParser Char st ([Char], [Char], [Char]) -htmlMinimizedAttribute = try $ do - many1 space - name <- many1 (choice [letter, oneOf ".-_:"]) - return (name, name, name) - -htmlRegularAttribute :: GenParser Char st ([Char], [Char], [Char]) -htmlRegularAttribute = try $ do - many1 space - name <- many1 (choice [letter, oneOf ".-_:"]) - spaces - char '=' - spaces - (content, quoteStr) <- choice [ (quoted '\''), - (quoted '"'), - (do - a <- many (alphaNum <|> (oneOf "-._:")) - return (a,"")) ] - return (name, content, - (name ++ "=" ++ quoteStr ++ content ++ quoteStr)) - --- | Parse an end tag of type 'tag' -htmlEndTag :: [Char] -> GenParser Char st [Char] -htmlEndTag tag = try $ do - char '<' - spaces - char '/' - spaces - stringAnyCase tag - spaces - char '>' - return $ "</" ++ tag ++ ">" - -{- --- | Returns @True@ if the tag is (or can be) an inline tag. -isInline :: String -> Bool -isInline tag = (extractTagType tag) `elem` inlineHtmlTags --} - --- | Returns @True@ if the tag is (or can be) a block tag. -isBlock :: String -> Bool -isBlock tag = (extractTagType tag) `elem` blockHtmlTags - -anyHtmlBlockTag :: GenParser Char ParserState [Char] -anyHtmlBlockTag = try $ do - tag <- anyHtmlTag <|> anyHtmlEndTag - if isBlock tag then return tag else fail "not a block tag" - -anyHtmlInlineTag :: GenParser Char ParserState [Char] -anyHtmlInlineTag = try $ do - tag <- anyHtmlTag <|> anyHtmlEndTag - if not (isBlock tag) then return tag else fail "not an inline tag" - --- | Parses material between script tags. --- Scripts must be treated differently, because they can contain '<>' etc. -htmlScript :: GenParser Char ParserState [Char] -htmlScript = try $ do - open <- string "<script" - rest <- manyTill anyChar (htmlEndTag "script") - st <- getState - if stateSanitizeHTML st && not ("script" `elem` sanitaryTags) - then return "<!-- unsafe HTML removed -->" - else return $ open ++ rest ++ "</script>" - --- | Parses material between style tags. --- Style tags must be treated differently, because they can contain CSS -htmlStyle :: GenParser Char ParserState [Char] -htmlStyle = try $ do - open <- string "<style" - rest <- manyTill anyChar (htmlEndTag "style") - st <- getState - if stateSanitizeHTML st && not ("style" `elem` sanitaryTags) - then return "<!-- unsafe HTML removed -->" - else return $ open ++ rest ++ "</style>" - -htmlBlockElement :: GenParser Char ParserState [Char] -htmlBlockElement = choice [ htmlScript, htmlStyle, htmlComment, xmlDec, definition ] - -rawHtmlBlock :: GenParser Char ParserState Block -rawHtmlBlock = try $ do - body <- htmlBlockElement <|> rawVerbatimBlock <|> anyHtmlBlockTag - state <- getState - if stateParseRaw state then return (RawHtml body) else return Null - --- This is a block whose contents should be passed through verbatim, not interpreted. -rawVerbatimBlock :: GenParser Char ParserState [Char] -rawVerbatimBlock = try $ do - start <- anyHtmlBlockTag - let tagtype = extractTagType start - if tagtype `elem` ["pre"] - then do - contents <- many (notFollowedBy' (htmlEndTag tagtype) >> anyChar) - end <- htmlEndTag tagtype - return $ start ++ contents ++ end - else fail "Not a verbatim block" - --- We don't want to parse </body> or </html> as raw HTML, since these --- are handled in parseHtml. -rawHtmlBlock' :: GenParser Char ParserState Block -rawHtmlBlock' = do notFollowedBy' (htmlTag "/body" <|> htmlTag "/html") - rawHtmlBlock - --- | Parses an HTML comment. -htmlComment :: GenParser Char st [Char] -htmlComment = try $ do - string "<!--" - comment <- manyTill anyChar (try (string "-->")) - return $ "<!--" ++ comment ++ "-->" - --- --- parsing documents --- - -xmlDec :: GenParser Char st [Char] -xmlDec = try $ do - string "<?" - rest <- manyTill anyChar (char '>') - return $ "<?" ++ rest ++ ">" - -definition :: GenParser Char st [Char] -definition = try $ do - string "<!" - rest <- manyTill anyChar (char '>') - return $ "<!" ++ rest ++ ">" - -nonTitleNonHead :: GenParser Char ParserState Char -nonTitleNonHead = try $ do - notFollowedBy $ (htmlTag "title" >> return ' ') <|> - (htmlEndTag "head" >> return ' ') - (rawHtmlBlock >> return ' ') <|> anyChar - -parseTitle :: GenParser Char ParserState [Inline] -parseTitle = try $ do - (tag, _) <- htmlTag "title" - contents <- inlinesTilEnd tag - spaces - return contents - --- parse header and return meta-information (for now, just title) -parseHead :: GenParser Char ParserState ([Inline], [a], [Char]) -parseHead = try $ do - htmlTag "head" - spaces - skipMany nonTitleNonHead - contents <- option [] parseTitle - skipMany nonTitleNonHead - htmlEndTag "head" - return (contents, [], "") - -skipHtmlTag :: String -> GenParser Char ParserState () -skipHtmlTag tag = optional (htmlTag tag) - --- h1 class="title" representation of title in body -bodyTitle :: GenParser Char ParserState [Inline] -bodyTitle = try $ do - (_, attribs) <- htmlTag "h1" - case (extractAttribute "class" attribs) of - Just "title" -> return "" - _ -> fail "not title" - inlinesTilEnd "h1" - -parseHtml :: GenParser Char ParserState Pandoc -parseHtml = do - sepEndBy (choice [xmlDec, definition, htmlComment]) spaces - skipHtmlTag "html" - spaces - (title, authors, date) <- option ([], [], "") parseHead - spaces - skipHtmlTag "body" - spaces - optional bodyTitle -- skip title in body, because it's represented in meta - blocks <- parseBlocks - spaces - optional (htmlEndTag "body") - spaces - optional (htmlEndTag "html" >> many anyChar) -- ignore anything after </html> - eof - return $ Pandoc (Meta title authors date) blocks - --- --- parsing blocks --- - -parseBlocks :: GenParser Char ParserState [Block] -parseBlocks = spaces >> sepEndBy block spaces >>= (return . filter (/= Null)) - -block :: GenParser Char ParserState Block -block = choice [ codeBlock - , header - , hrule - , list - , blockQuote - , para - , plain - , rawHtmlBlock' - ] <?> "block" - --- --- header blocks --- - -header :: GenParser Char ParserState Block -header = choice (map headerLevel (enumFromTo 1 5)) <?> "header" - -headerLevel :: Int -> GenParser Char ParserState Block -headerLevel n = try $ do - let level = "h" ++ show n - htmlTag level - contents <- inlinesTilEnd level - return $ Header n (normalizeSpaces contents) - --- --- hrule block --- - -hrule :: GenParser Char ParserState Block -hrule = try $ do - (_, attribs) <- htmlTag "hr" - state <- getState - if not (null attribs) && stateParseRaw state - then unexpected "attributes in hr" -- parse as raw in this case - else return HorizontalRule - --- --- code blocks --- - --- Note: HTML tags in code blocks (e.g. for syntax highlighting) are --- skipped, because they are not portable to output formats other than HTML. -codeBlock :: GenParser Char ParserState Block -codeBlock = try $ do - htmlTag "pre" - result <- manyTill - (many1 (satisfy (/= '<')) <|> - ((anyHtmlTag <|> anyHtmlEndTag) >> return "")) - (htmlEndTag "pre") - let result' = concat result - -- drop leading newline if any - let result'' = if "\n" `isPrefixOf` result' - then drop 1 result' - else result' - -- drop trailing newline if any - let result''' = if "\n" `isSuffixOf` result'' - then init result'' - else result'' - return $ CodeBlock ("",[],[]) $ decodeCharacterReferences result''' - --- --- block quotes --- - -blockQuote :: GenParser Char ParserState Block -blockQuote = try $ htmlTag "blockquote" >> spaces >> - blocksTilEnd "blockquote" >>= (return . BlockQuote) - --- --- list blocks --- - -list :: GenParser Char ParserState Block -list = choice [ bulletList, orderedList, definitionList ] <?> "list" - -orderedList :: GenParser Char ParserState Block -orderedList = try $ do - (_, attribs) <- htmlTag "ol" - (start, style) <- option (1, DefaultStyle) $ - do failIfStrict - let sta = fromMaybe "1" $ - lookup "start" attribs - let sty = fromMaybe (fromMaybe "" $ - lookup "style" attribs) $ - lookup "class" attribs - let sty' = case sty of - "lower-roman" -> LowerRoman - "upper-roman" -> UpperRoman - "lower-alpha" -> LowerAlpha - "upper-alpha" -> UpperAlpha - "decimal" -> Decimal - _ -> DefaultStyle - return (read sta, sty') - spaces - items <- sepEndBy1 (blocksIn "li") spaces - htmlEndTag "ol" - return $ OrderedList (start, style, DefaultDelim) items - -bulletList :: GenParser Char ParserState Block -bulletList = try $ do - htmlTag "ul" - spaces - items <- sepEndBy1 (blocksIn "li") spaces - htmlEndTag "ul" - return $ BulletList items - -definitionList :: GenParser Char ParserState Block -definitionList = try $ do - failIfStrict -- def lists not part of standard markdown - htmlTag "dl" - spaces - items <- sepEndBy1 definitionListItem spaces - htmlEndTag "dl" - return $ DefinitionList items - -definitionListItem :: GenParser Char ParserState ([Inline], [Block]) -definitionListItem = try $ do - terms <- sepEndBy1 (inlinesIn "dt") spaces - defs <- sepEndBy1 (blocksIn "dd") spaces - let term = intercalate [LineBreak] terms - return (term, concat defs) - --- --- paragraph block --- - -para :: GenParser Char ParserState Block -para = try $ htmlTag "p" >> inlinesTilEnd "p" >>= - return . Para . normalizeSpaces - --- --- plain block --- - -plain :: GenParser Char ParserState Block -plain = many1 inline >>= return . Plain . normalizeSpaces - --- --- inline --- - -inline :: GenParser Char ParserState Inline -inline = choice [ charRef - , strong - , emph - , superscript - , subscript - , strikeout - , spanStrikeout - , code - , str - , linebreak - , whitespace - , link - , image - , rawHtmlInline - ] <?> "inline" - -code :: GenParser Char ParserState Inline -code = try $ do - htmlTag "code" - result <- manyTill anyChar (htmlEndTag "code") - -- remove internal line breaks, leading and trailing space, - -- and decode character references - return $ Code $ decodeCharacterReferences $ removeLeadingTrailingSpace $ - intercalate " " $ lines result - -rawHtmlInline :: GenParser Char ParserState Inline -rawHtmlInline = do - result <- htmlScript <|> htmlStyle <|> htmlComment <|> anyHtmlInlineTag - state <- getState - if stateParseRaw state then return (HtmlInline result) else return (Str "") - -betweenTags :: [Char] -> GenParser Char ParserState [Inline] -betweenTags tag = try $ htmlTag tag >> inlinesTilEnd tag >>= - return . normalizeSpaces - -emph :: GenParser Char ParserState Inline -emph = (betweenTags "em" <|> betweenTags "i") >>= return . Emph - -strong :: GenParser Char ParserState Inline -strong = (betweenTags "b" <|> betweenTags "strong") >>= return . Strong - -superscript :: GenParser Char ParserState Inline -superscript = failIfStrict >> betweenTags "sup" >>= return . Superscript - -subscript :: GenParser Char ParserState Inline -subscript = failIfStrict >> betweenTags "sub" >>= return . Subscript - -strikeout :: GenParser Char ParserState Inline -strikeout = failIfStrict >> (betweenTags "s" <|> betweenTags "strike") >>= - return . Strikeout - -spanStrikeout :: GenParser Char ParserState Inline -spanStrikeout = try $ do - failIfStrict -- strict markdown has no strikeout, so treat as raw HTML - (_, attributes) <- htmlTag "span" - result <- case (extractAttribute "class" attributes) of - Just "strikeout" -> inlinesTilEnd "span" - _ -> fail "not a strikeout" - return $ Strikeout result - -whitespace :: GenParser Char st Inline -whitespace = many1 space >> return Space - --- hard line break -linebreak :: GenParser Char ParserState Inline -linebreak = htmlTag "br" >> optional newline >> return LineBreak - -str :: GenParser Char st Inline -str = many1 (noneOf "<& \t\n") >>= return . Str - --- --- links and images --- - --- extract contents of attribute (attribute names are case-insensitive) -extractAttribute :: [Char] -> [([Char], String)] -> Maybe String -extractAttribute _ [] = Nothing -extractAttribute name ((attrName, contents):rest) = - let name' = map toLower name - attrName' = map toLower attrName - in if attrName' == name' - then Just (decodeCharacterReferences contents) - else extractAttribute name rest - -link :: GenParser Char ParserState Inline -link = try $ do - (_, attributes) <- htmlTag "a" - url <- case (extractAttribute "href" attributes) of - Just url -> return url - Nothing -> fail "no href" - let title = fromMaybe "" $ extractAttribute "title" attributes - lab <- inlinesTilEnd "a" - return $ Link (normalizeSpaces lab) (url, title) - -image :: GenParser Char ParserState Inline -image = try $ do - (_, attributes) <- htmlTag "img" - url <- case (extractAttribute "src" attributes) of - Just url -> return url - Nothing -> fail "no src" - let title = fromMaybe "" $ extractAttribute "title" attributes - let alt = fromMaybe "" (extractAttribute "alt" attributes) - return $ Image [Str alt] (url, title) - |