diff options
-rw-r--r-- | Main.hs | 23 | ||||
-rw-r--r-- | README | 5 | ||||
-rw-r--r-- | Text/Pandoc/Readers/HTML.hs | 93 | ||||
-rw-r--r-- | Text/Pandoc/Readers/Markdown.hs | 17 | ||||
-rw-r--r-- | Text/Pandoc/Shared.hs | 2 | ||||
-rw-r--r-- | man/man1/html2markdown.1.md | 4 | ||||
-rw-r--r-- | man/man1/pandoc.1.md | 5 |
7 files changed, 119 insertions, 30 deletions
@@ -104,6 +104,7 @@ data Opt = Opt , optStrict :: Bool -- ^ Use strict markdown syntax , optReferenceLinks :: Bool -- ^ Use reference links in writing markdown, rst , optWrapText :: Bool -- ^ Wrap text + , optSanitizeHTML :: Bool -- ^ Sanitize HTML } -- | Defaults for command-line options. @@ -132,6 +133,7 @@ defaultOpts = Opt , optStrict = False , optReferenceLinks = False , optWrapText = True + , optSanitizeHTML = False } -- | A list of functions, each transforming the options data structure @@ -226,6 +228,11 @@ options = (\opt -> return opt { optWrapText = False })) "" -- "Do not wrap text in output" + , Option "" ["sanitize-html"] + (NoArg + (\opt -> return opt { optSanitizeHTML = True })) + "" -- "Sanitize HTML" + , Option "" ["toc", "table-of-contents"] (NoArg (\opt -> return opt { optTableOfContents = True })) @@ -424,6 +431,7 @@ main = do , optStrict = strict , optReferenceLinks = referenceLinks , optWrapText = wrap + , optSanitizeHTML = sanitize } = opts if dumpArgs @@ -476,13 +484,14 @@ main = do x:(tabFilter (spsToNextStop - 1) xs) let startParserState = - defaultParserState { stateParseRaw = parseRaw, - stateTabStop = tabStop, - stateStandalone = standalone && (not strict), - stateSmart = smart || writerName' `elem` - ["latex", "context"], - stateColumns = columns, - stateStrict = strict } + defaultParserState { stateParseRaw = parseRaw, + stateTabStop = tabStop, + stateSanitizeHTML = sanitize, + stateStandalone = standalone && (not strict), + stateSmart = smart || writerName' `elem` + ["latex", "context"], + stateColumns = columns, + stateStrict = strict } let csslink = if (css == "") then "" else "<link rel=\"stylesheet\" href=\"" ++ css ++ @@ -345,6 +345,11 @@ For further documentation, see the `pandoc(1)` man page. : disables text-wrapping in output. By default, text is wrapped appropriately for the output format. +`--sanitize-html` +: sanitizes HTML (in markdown or HTML input) using a whitelist. + Unsafe tags are replaced by HTML comments; unsafe attributes + are omitted. + `--dump-args` : is intended to make it easier to create wrapper scripts that use Pandoc. It causes Pandoc to dump information about the arguments diff --git a/Text/Pandoc/Readers/HTML.hs b/Text/Pandoc/Readers/HTML.hs index 1fff4705f..42a085f63 100644 --- a/Text/Pandoc/Readers/HTML.hs +++ b/Text/Pandoc/Readers/HTML.hs @@ -75,10 +75,54 @@ blockHtmlTags = ["address", "blockquote", "center", "dir", "div", "dt", "frameset", "li", "tbody", "td", "tfoot", "th", "thead", "tr", "script"] ++ eitherBlockOrInline +sanitaryTags = ["a", "abbr", "acronym", "address", "area", "b", "big", + "blockquote", "br", "button", "caption", "center", + "cite", "code", "col", "colgroup", "dd", "del", "dfn", + "dir", "div", "dl", "dt", "em", "fieldset", "font", + "form", "h1", "h2", "h3", "h4", "h5", "h6", "hr", + "i", "img", "input", "ins", "kbd", "label", "legend", + "li", "map", "menu", "ol", "optgroup", "option", "p", + "pre", "q", "s", "samp", "select", "small", "span", + "strike", "strong", "sub", "sup", "table", "tbody", + "td", "textarea", "tfoot", "th", "thead", "tr", "tt", + "u", "ul", "var"] + +sanitaryAttributes = ["abbr", "accept", "accept-charset", + "accesskey", "action", "align", "alt", "axis", + "border", "cellpadding", "cellspacing", "char", + "charoff", "charset", "checked", "cite", "class", + "clear", "cols", "colspan", "color", "compact", + "coords", "datetime", "dir", "disabled", + "enctype", "for", "frame", "headers", "height", + "href", "hreflang", "hspace", "id", "ismap", + "label", "lang", "longdesc", "maxlength", "media", + "method", "multiple", "name", "nohref", "noshade", + "nowrap", "prompt", "readonly", "rel", "rev", + "rows", "rowspan", "rules", "scope", "selected", + "shape", "size", "span", "src", "start", + "summary", "tabindex", "target", "title", "type", + "usemap", "valign", "value", "vspace", "width"] + -- -- HTML utility functions -- +-- | Returns @True@ if sanitization is specified and the specified tag is +-- not on the sanitized tag list. +unsanitaryTag tag = do + st <- getState + if stateSanitizeHTML st && not (tag `elem` sanitaryTags) + then return True + else return False + +-- | returns @True@ if sanitization is specified and the specified attribute +-- is not on the sanitized attribute list. +unsanitaryAttribute (attr, _, _) = do + st <- getState + if stateSanitizeHTML st && not (attr `elem` sanitaryAttributes) + then return True + else return False + -- | Read blocks until end tag. blocksTilEnd tag = do blocks <- manyTill (block >>~ spaces) (htmlEndTag tag) @@ -111,20 +155,28 @@ anyHtmlTag = try $ do let ender' = if null ender then "" else " /" spaces char '>' - return $ "<" ++ tag ++ - concatMap (\(_, _, raw) -> (' ':raw)) attribs ++ ender' ++ ">" + let result = "<" ++ tag ++ + concatMap (\(_, _, raw) -> (' ':raw)) attribs ++ ender' ++ ">" + unsanitary <- unsanitaryTag tag + if unsanitary + then return $ "<!-- unsafe tag " ++ result ++ " omitted -->" + else return result anyHtmlEndTag = try $ do char '<' spaces char '/' spaces - tagType <- many1 alphaNum + tag <- many1 alphaNum spaces char '>' - return $ "</" ++ tagType ++ ">" + let result = "</" ++ tag ++ ">" + unsanitary <- unsanitaryTag tag + if unsanitary + then return $ "<!-- unsafe tag " ++ result ++ " omitted -->" + else return result -htmlTag :: String -> GenParser Char st (String, [(String, String)]) +htmlTag :: String -> GenParser Char ParserState (String, [(String, String)]) htmlTag tag = try $ do char '<' spaces @@ -142,7 +194,14 @@ quoted quoteChar = do (many (noneOf [quoteChar])) return (result, [quoteChar]) -htmlAttribute = htmlRegularAttribute <|> htmlMinimizedAttribute +nullAttribute = ("", "", "") + +htmlAttribute = do + attr <- htmlRegularAttribute <|> htmlMinimizedAttribute + unsanitary <- unsanitaryAttribute attr + if unsanitary + then return nullAttribute + else return attr -- minimized boolean attribute htmlMinimizedAttribute = try $ do @@ -183,7 +242,7 @@ isBlock tag = (extractTagType tag) `elem` blockHtmlTags anyHtmlBlockTag = try $ do tag <- anyHtmlTag <|> anyHtmlEndTag - if isBlock tag then return tag else fail "not a block tag" + if not (isInline tag) then return tag else fail "not a block tag" anyHtmlInlineTag = try $ do tag <- anyHtmlTag <|> anyHtmlEndTag @@ -194,19 +253,25 @@ anyHtmlInlineTag = try $ do htmlScript = try $ do open <- string "<script" rest <- manyTill anyChar (htmlEndTag "script") - return $ open ++ rest ++ "</script>" + st <- getState + if stateSanitizeHTML st && not ("script" `elem` sanitaryTags) + then return "<!-- unsafe script omitted -->" + else return $ open ++ rest ++ "</script>" -- | Parses material between style tags. -- Style tags must be treated differently, because they can contain CSS htmlStyle = try $ do open <- string "<style" rest <- manyTill anyChar (htmlEndTag "style") - return $ open ++ rest ++ "</style>" + st <- getState + if stateSanitizeHTML st && not ("style" `elem` sanitaryTags) + then return "<!-- unsafe style omitted -->" + else return $ open ++ rest ++ "</style>" htmlBlockElement = choice [ htmlScript, htmlStyle, htmlComment, xmlDec, definition ] rawHtmlBlock = try $ do - body <- htmlBlockElement <|> anyHtmlTag <|> anyHtmlEndTag + body <- htmlBlockElement <|> anyHtmlBlockTag state <- getState if stateParseRaw state then return (RawHtml body) else return Null @@ -235,8 +300,10 @@ definition = try $ do rest <- manyTill anyChar (char '>') return $ "<!" ++ rest ++ ">" -nonTitleNonHead = try $ notFollowedBy' (htmlTag "title" <|> htmlTag "/head") >> - ((rawHtmlBlock >> return ' ') <|> anyChar) +nonTitleNonHead = try $ do + notFollowedBy $ (htmlTag "title" >> return ' ') <|> + (htmlEndTag "head" >> return ' ') + (rawHtmlBlock >> return ' ') <|> anyChar parseTitle = try $ do (tag, _) <- htmlTag "title" @@ -251,7 +318,7 @@ parseHead = try $ do skipMany nonTitleNonHead contents <- option [] parseTitle skipMany nonTitleNonHead - htmlTag "/head" + htmlEndTag "head" return (contents, [], "") skipHtmlTag tag = optional (htmlTag tag) diff --git a/Text/Pandoc/Readers/Markdown.hs b/Text/Pandoc/Readers/Markdown.hs index 6455dcd9d..2d1fa7583 100644 --- a/Text/Pandoc/Readers/Markdown.hs +++ b/Text/Pandoc/Readers/Markdown.hs @@ -249,7 +249,7 @@ block = do , blockQuote , rawLaTeXEnvironment , para - , htmlBlock + , rawHtmlBlocks , plain , nullBlock ]) <?> "block" @@ -482,15 +482,12 @@ plain = many1 inline >>= return . Plain . normalizeSpaces htmlElement = strictHtmlBlock <|> htmlBlockElement <?> "html element" -htmlBlock = do - st <- getState - if stateStrict st - then try $ do failUnlessBeginningOfLine - first <- htmlElement - finalSpace <- many (oneOf spaceChars) - finalNewlines <- many newline - return $ RawHtml $ first ++ finalSpace ++ finalNewlines - else rawHtmlBlocks +htmlBlock = try $ do + failUnlessBeginningOfLine + first <- htmlElement + finalSpace <- many (oneOf spaceChars) + finalNewlines <- many newline + return $ RawHtml $ first ++ finalSpace ++ finalNewlines -- True if tag is self-closing isSelfClosing tag = diff --git a/Text/Pandoc/Shared.hs b/Text/Pandoc/Shared.hs index 7086ca452..477d86464 100644 --- a/Text/Pandoc/Shared.hs +++ b/Text/Pandoc/Shared.hs @@ -595,6 +595,7 @@ data ParserState = ParserState { stateParseRaw :: Bool, -- ^ Parse raw HTML and LaTeX? stateParserContext :: ParserContext, -- ^ Inside list? stateQuoteContext :: QuoteContext, -- ^ Inside quoted environment? + stateSanitizeHTML :: Bool, -- ^ Sanitize HTML? stateKeys :: KeyTable, -- ^ List of reference keys stateNotes :: NoteTable, -- ^ List of notes stateTabStop :: Int, -- ^ Tab stop @@ -614,6 +615,7 @@ defaultParserState = ParserState { stateParseRaw = False, stateParserContext = NullState, stateQuoteContext = NoQuote, + stateSanitizeHTML = False, stateKeys = [], stateNotes = [], stateTabStop = 4, diff --git a/man/man1/html2markdown.1.md b/man/man1/html2markdown.1.md index 6c5d2dcc8..19d5104af 100644 --- a/man/man1/html2markdown.1.md +++ b/man/man1/html2markdown.1.md @@ -51,6 +51,10 @@ a complete list. The following options are most relevant: \--no-wrap : Disable text wrapping in output. (Default is to wrap text.) +\--sanitize-html +: Sanitizes HTML using a whitelist. Unsafe tags are replaced by HTML + comments; unsafe attributes are omitted. + -H *FILE*, \--include-in-header=*FILE* : Include contents of *FILE* at the end of the header. Implies `-s`. diff --git a/man/man1/pandoc.1.md b/man/man1/pandoc.1.md index 37d3dc262..427004419 100644 --- a/man/man1/pandoc.1.md +++ b/man/man1/pandoc.1.md @@ -126,6 +126,11 @@ to Pandoc. Or use `html2markdown`(1), a wrapper around `pandoc`. \--no-wrap : Disable text wrapping in output. (Default is to wrap text.) +\--sanitize-html +: Sanitizes HTML (in markdown or HTML input) using a whitelist. + Unsafe tags are replaced by HTML comments; unsafe attributes + are omitted. + \--toc, \--table-of-contents : Include an automatically generated table of contents (HTML, markdown, RTF) or an instruction to create one (LaTeX, reStructuredText). |