diff options
author | fiddlosopher <fiddlosopher@788f1e2b-df1e-0410-8736-df70ead52e1b> | 2008-01-03 21:32:32 +0000 |
---|---|---|
committer | fiddlosopher <fiddlosopher@788f1e2b-df1e-0410-8736-df70ead52e1b> | 2008-01-03 21:32:32 +0000 |
commit | 5df912b162575cb9daf6702bb7f2c2a5858c0b00 (patch) | |
tree | b7e165f47e19839fe30ddcd8250f4fb2e89e4ebb /Text/Pandoc/Readers | |
parent | a505f70f0b33b7fa52ad4d8df77ebff090b36d01 (diff) | |
download | pandoc-5df912b162575cb9daf6702bb7f2c2a5858c0b00.tar.gz |
Added optional HTML sanitization using a whitelist.
When this option is specified (--sanitize-html on the command line),
unsafe HTML tags will be replaced by HTML comments, and unsafe HTML
attributes will be removed. This option should be especially useful
for those who want to use pandoc libraries in web applications, where
users will provide the input.
+ Main.hs: Added --sanitize-html option.
+ Text.Pandoc.Shared: Added stateSanitizeHTML to ParserState.
+ Text.Pandoc.Readers.HTML:
- Added whitelists of sanitaryTags and sanitaryAttributes.
- Added parsers to check these lists (and state) to see if a given
tag or attribute should be counted unsafe.
- Modified anyHtmlTag and anyHtmlEndTag to replace unsafe tags
with comments.
- Modified htmlAttribute to remove unsafe attributes.
- Modified htmlScript and htmlStyle to remove these elements if
unsafe.
- Modified rawHtmlBlock to use anyHtmlBlockTag instead of anyHtmlTag
and anyHtmlEndTag. This fixes a bug in markdown parsing, where
inline tags would be included in raw HTML blocks.
- Modified anyHtmlBlockTag to test for (not inline) rather than
directly for block. This allows us to handle e.g. docbook in
the markdown reader.
- Minor tweaks in nonTitleNonHead and parseTitle.
+ Text.Pandoc.Readers.Markdown:
- In non-strict mode use rawHtmlBlocks instead of htmlBlock.
Simplified htmlBlock, since we know it's only called in strict
mode.
+ Modified README and man pages to document new option.
git-svn-id: https://pandoc.googlecode.com/svn/trunk@1166 788f1e2b-df1e-0410-8736-df70ead52e1b
Diffstat (limited to 'Text/Pandoc/Readers')
-rw-r--r-- | Text/Pandoc/Readers/HTML.hs | 93 | ||||
-rw-r--r-- | Text/Pandoc/Readers/Markdown.hs | 17 |
2 files changed, 87 insertions, 23 deletions
diff --git a/Text/Pandoc/Readers/HTML.hs b/Text/Pandoc/Readers/HTML.hs index 1fff4705f..42a085f63 100644 --- a/Text/Pandoc/Readers/HTML.hs +++ b/Text/Pandoc/Readers/HTML.hs @@ -75,10 +75,54 @@ blockHtmlTags = ["address", "blockquote", "center", "dir", "div", "dt", "frameset", "li", "tbody", "td", "tfoot", "th", "thead", "tr", "script"] ++ eitherBlockOrInline +sanitaryTags = ["a", "abbr", "acronym", "address", "area", "b", "big", + "blockquote", "br", "button", "caption", "center", + "cite", "code", "col", "colgroup", "dd", "del", "dfn", + "dir", "div", "dl", "dt", "em", "fieldset", "font", + "form", "h1", "h2", "h3", "h4", "h5", "h6", "hr", + "i", "img", "input", "ins", "kbd", "label", "legend", + "li", "map", "menu", "ol", "optgroup", "option", "p", + "pre", "q", "s", "samp", "select", "small", "span", + "strike", "strong", "sub", "sup", "table", "tbody", + "td", "textarea", "tfoot", "th", "thead", "tr", "tt", + "u", "ul", "var"] + +sanitaryAttributes = ["abbr", "accept", "accept-charset", + "accesskey", "action", "align", "alt", "axis", + "border", "cellpadding", "cellspacing", "char", + "charoff", "charset", "checked", "cite", "class", + "clear", "cols", "colspan", "color", "compact", + "coords", "datetime", "dir", "disabled", + "enctype", "for", "frame", "headers", "height", + "href", "hreflang", "hspace", "id", "ismap", + "label", "lang", "longdesc", "maxlength", "media", + "method", "multiple", "name", "nohref", "noshade", + "nowrap", "prompt", "readonly", "rel", "rev", + "rows", "rowspan", "rules", "scope", "selected", + "shape", "size", "span", "src", "start", + "summary", "tabindex", "target", "title", "type", + "usemap", "valign", "value", "vspace", "width"] + -- -- HTML utility functions -- +-- | Returns @True@ if sanitization is specified and the specified tag is +-- not on the sanitized tag list. +unsanitaryTag tag = do + st <- getState + if stateSanitizeHTML st && not (tag `elem` sanitaryTags) + then return True + else return False + +-- | returns @True@ if sanitization is specified and the specified attribute +-- is not on the sanitized attribute list. +unsanitaryAttribute (attr, _, _) = do + st <- getState + if stateSanitizeHTML st && not (attr `elem` sanitaryAttributes) + then return True + else return False + -- | Read blocks until end tag. blocksTilEnd tag = do blocks <- manyTill (block >>~ spaces) (htmlEndTag tag) @@ -111,20 +155,28 @@ anyHtmlTag = try $ do let ender' = if null ender then "" else " /" spaces char '>' - return $ "<" ++ tag ++ - concatMap (\(_, _, raw) -> (' ':raw)) attribs ++ ender' ++ ">" + let result = "<" ++ tag ++ + concatMap (\(_, _, raw) -> (' ':raw)) attribs ++ ender' ++ ">" + unsanitary <- unsanitaryTag tag + if unsanitary + then return $ "<!-- unsafe tag " ++ result ++ " omitted -->" + else return result anyHtmlEndTag = try $ do char '<' spaces char '/' spaces - tagType <- many1 alphaNum + tag <- many1 alphaNum spaces char '>' - return $ "</" ++ tagType ++ ">" + let result = "</" ++ tag ++ ">" + unsanitary <- unsanitaryTag tag + if unsanitary + then return $ "<!-- unsafe tag " ++ result ++ " omitted -->" + else return result -htmlTag :: String -> GenParser Char st (String, [(String, String)]) +htmlTag :: String -> GenParser Char ParserState (String, [(String, String)]) htmlTag tag = try $ do char '<' spaces @@ -142,7 +194,14 @@ quoted quoteChar = do (many (noneOf [quoteChar])) return (result, [quoteChar]) -htmlAttribute = htmlRegularAttribute <|> htmlMinimizedAttribute +nullAttribute = ("", "", "") + +htmlAttribute = do + attr <- htmlRegularAttribute <|> htmlMinimizedAttribute + unsanitary <- unsanitaryAttribute attr + if unsanitary + then return nullAttribute + else return attr -- minimized boolean attribute htmlMinimizedAttribute = try $ do @@ -183,7 +242,7 @@ isBlock tag = (extractTagType tag) `elem` blockHtmlTags anyHtmlBlockTag = try $ do tag <- anyHtmlTag <|> anyHtmlEndTag - if isBlock tag then return tag else fail "not a block tag" + if not (isInline tag) then return tag else fail "not a block tag" anyHtmlInlineTag = try $ do tag <- anyHtmlTag <|> anyHtmlEndTag @@ -194,19 +253,25 @@ anyHtmlInlineTag = try $ do htmlScript = try $ do open <- string "<script" rest <- manyTill anyChar (htmlEndTag "script") - return $ open ++ rest ++ "</script>" + st <- getState + if stateSanitizeHTML st && not ("script" `elem` sanitaryTags) + then return "<!-- unsafe script omitted -->" + else return $ open ++ rest ++ "</script>" -- | Parses material between style tags. -- Style tags must be treated differently, because they can contain CSS htmlStyle = try $ do open <- string "<style" rest <- manyTill anyChar (htmlEndTag "style") - return $ open ++ rest ++ "</style>" + st <- getState + if stateSanitizeHTML st && not ("style" `elem` sanitaryTags) + then return "<!-- unsafe style omitted -->" + else return $ open ++ rest ++ "</style>" htmlBlockElement = choice [ htmlScript, htmlStyle, htmlComment, xmlDec, definition ] rawHtmlBlock = try $ do - body <- htmlBlockElement <|> anyHtmlTag <|> anyHtmlEndTag + body <- htmlBlockElement <|> anyHtmlBlockTag state <- getState if stateParseRaw state then return (RawHtml body) else return Null @@ -235,8 +300,10 @@ definition = try $ do rest <- manyTill anyChar (char '>') return $ "<!" ++ rest ++ ">" -nonTitleNonHead = try $ notFollowedBy' (htmlTag "title" <|> htmlTag "/head") >> - ((rawHtmlBlock >> return ' ') <|> anyChar) +nonTitleNonHead = try $ do + notFollowedBy $ (htmlTag "title" >> return ' ') <|> + (htmlEndTag "head" >> return ' ') + (rawHtmlBlock >> return ' ') <|> anyChar parseTitle = try $ do (tag, _) <- htmlTag "title" @@ -251,7 +318,7 @@ parseHead = try $ do skipMany nonTitleNonHead contents <- option [] parseTitle skipMany nonTitleNonHead - htmlTag "/head" + htmlEndTag "head" return (contents, [], "") skipHtmlTag tag = optional (htmlTag tag) diff --git a/Text/Pandoc/Readers/Markdown.hs b/Text/Pandoc/Readers/Markdown.hs index 6455dcd9d..2d1fa7583 100644 --- a/Text/Pandoc/Readers/Markdown.hs +++ b/Text/Pandoc/Readers/Markdown.hs @@ -249,7 +249,7 @@ block = do , blockQuote , rawLaTeXEnvironment , para - , htmlBlock + , rawHtmlBlocks , plain , nullBlock ]) <?> "block" @@ -482,15 +482,12 @@ plain = many1 inline >>= return . Plain . normalizeSpaces htmlElement = strictHtmlBlock <|> htmlBlockElement <?> "html element" -htmlBlock = do - st <- getState - if stateStrict st - then try $ do failUnlessBeginningOfLine - first <- htmlElement - finalSpace <- many (oneOf spaceChars) - finalNewlines <- many newline - return $ RawHtml $ first ++ finalSpace ++ finalNewlines - else rawHtmlBlocks +htmlBlock = try $ do + failUnlessBeginningOfLine + first <- htmlElement + finalSpace <- many (oneOf spaceChars) + finalNewlines <- many newline + return $ RawHtml $ first ++ finalSpace ++ finalNewlines -- True if tag is self-closing isSelfClosing tag = |