From 5770ceca36a7ab15ad43b3d949c749f6425ecbcb Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Fri, 10 Dec 2010 12:26:03 -0800 Subject: Removed HTML sanitization. This is better done on the resulting HTML; use the xss-sanitize library for this. xss-sanitize is based on pandoc's sanitization, but improves it. - Removed stateSanitize from ParserState. - Removed --sanitize-html option. --- src/Text/Pandoc/Parsing.hs | 2 - src/Text/Pandoc/Readers/HTML.hs | 95 ++----------------------------------- src/Text/Pandoc/Readers/Markdown.hs | 16 ++----- 3 files changed, 10 insertions(+), 103 deletions(-) (limited to 'src/Text') diff --git a/src/Text/Pandoc/Parsing.hs b/src/Text/Pandoc/Parsing.hs index 1b206e4c7..48c6aa70d 100644 --- a/src/Text/Pandoc/Parsing.hs +++ b/src/Text/Pandoc/Parsing.hs @@ -586,7 +586,6 @@ data ParserState = ParserState { stateParseRaw :: Bool, -- ^ Parse raw HTML and LaTeX? stateParserContext :: ParserContext, -- ^ Inside list? stateQuoteContext :: QuoteContext, -- ^ Inside quoted environment? - stateSanitizeHTML :: Bool, -- ^ Sanitize HTML? stateKeys :: KeyTable, -- ^ List of reference keys stateCitations :: [String], -- ^ List of available citations stateNotes :: NoteTable, -- ^ List of notes @@ -614,7 +613,6 @@ defaultParserState = ParserState { stateParseRaw = False, stateParserContext = NullState, stateQuoteContext = NoQuote, - stateSanitizeHTML = False, stateKeys = M.empty, stateCitations = [], stateNotes = [], diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs index f05fdd57b..ed026eb49 100644 --- a/src/Text/Pandoc/Readers/HTML.hs +++ b/src/Text/Pandoc/Readers/HTML.hs @@ -40,7 +40,6 @@ module Text.Pandoc.Readers.HTML ( extractTagType, htmlBlockElement, htmlComment, - unsanitaryURI ) where import Text.ParserCombinators.Parsec @@ -51,7 +50,6 @@ import Text.Pandoc.CharacterReferences ( decodeCharacterReferences ) import Data.Maybe ( fromMaybe ) import Data.List ( isPrefixOf, isSuffixOf, intercalate ) import Data.Char ( toLower, isAlphaNum ) -import Network.URI ( parseURIReference, URI (..) ) import Control.Monad ( liftM, when ) -- | Convert HTML-formatted string to 'Pandoc' document. @@ -85,36 +83,6 @@ blockHtmlTags = ["address", "blockquote", "body", "center", "dir", "div", "dt", "frameset", "li", "tbody", "td", "tfoot", "th", "thead", "tr", "script", "style"] -sanitaryTags :: [[Char]] -sanitaryTags = ["a", "abbr", "acronym", "address", "area", "b", "big", - "blockquote", "br", "button", "caption", "center", - "cite", "code", "col", "colgroup", "dd", "del", "dfn", - "dir", "div", "dl", "dt", "em", "fieldset", "font", - "form", "h1", "h2", "h3", "h4", "h5", "h6", "hr", - "i", "img", "input", "ins", "kbd", "label", "legend", - "li", "map", "menu", "ol", "optgroup", "option", "p", - "pre", "q", "s", "samp", "select", "small", "span", - "strike", "strong", "sub", "sup", "table", "tbody", - "td", "textarea", "tfoot", "th", "thead", "tr", "tt", - "u", "ul", "var"] - -sanitaryAttributes :: [[Char]] -sanitaryAttributes = ["abbr", "accept", "accept-charset", - "accesskey", "action", "align", "alt", "axis", - "border", "cellpadding", "cellspacing", "char", - "charoff", "charset", "checked", "cite", "class", - "clear", "cols", "colspan", "color", "compact", - "coords", "datetime", "dir", "disabled", - "enctype", "for", "frame", "headers", "height", - "href", "hreflang", "hspace", "id", "ismap", - "label", "lang", "longdesc", "maxlength", "media", - "method", "multiple", "name", "nohref", "noshade", - "nowrap", "prompt", "readonly", "rel", "rev", - "rows", "rowspan", "rules", "scope", "selected", - "shape", "size", "span", "src", "start", - "summary", "tabindex", "target", "title", "type", - "usemap", "valign", "value", "vspace", "width"] - -- taken from HXT and extended closes :: String -> String -> Bool @@ -153,41 +121,6 @@ _ `closes` _ = False -- HTML utility functions -- --- | Returns @True@ if sanitization is specified and the specified tag is --- not on the sanitized tag list. -unsanitaryTag :: [Char] - -> GenParser tok ParserState Bool -unsanitaryTag tag = do - st <- getState - return $ stateSanitizeHTML st && tag `notElem` sanitaryTags - --- | returns @True@ if sanitization is specified and the specified attribute --- is not on the sanitized attribute list. -unsanitaryAttribute :: ([Char], String, t) - -> GenParser tok ParserState Bool -unsanitaryAttribute (attr, val, _) = do - st <- getState - return $ stateSanitizeHTML st && - (attr `notElem` sanitaryAttributes || - (attr `elem` ["href","src"] && unsanitaryURI val)) - --- | Returns @True@ if the specified URI is potentially a security risk. -unsanitaryURI :: String -> Bool -unsanitaryURI u = - let safeURISchemes = [ "", "http:", "https:", "ftp:", "mailto:", "file:", - "telnet:", "gopher:", "aaa:", "aaas:", "acap:", "cap:", "cid:", - "crid:", "dav:", "dict:", "dns:", "fax:", "go:", "h323:", "im:", - "imap:", "ldap:", "mid:", "news:", "nfs:", "nntp:", "pop:", - "pres:", "sip:", "sips:", "snmp:", "tel:", "urn:", "wais:", - "xmpp:", "z39.50r:", "z39.50s:", "aim:", "callto:", "cvs:", - "ed2k:", "feed:", "fish:", "gg:", "irc:", "ircs:", "lastfm:", - "ldaps:", "magnet:", "mms:", "msnim:", "notes:", "rsync:", - "secondlife:", "skype:", "ssh:", "sftp:", "smb:", "sms:", - "snews:", "webcal:", "ymsgr:"] - in case parseURIReference (escapeURI u) of - Just p -> (map toLower $ uriScheme p) `notElem` safeURISchemes - Nothing -> True - -- | Read blocks until end tag. blocksTilEnd :: String -> GenParser Char ParserState [Block] blocksTilEnd tag = do @@ -240,10 +173,7 @@ anyHtmlTag = try $ do char '>' let result = "<" ++ tag ++ concatMap (\(_, _, raw) -> (' ':raw)) attribs ++ ender' ++ ">" - unsanitary <- unsanitaryTag tag - if unsanitary - then return $ "" - else return result + return result anyHtmlEndTag :: GenParser Char ParserState [Char] anyHtmlEndTag = try $ do @@ -255,10 +185,7 @@ anyHtmlEndTag = try $ do spaces char '>' let result = "" - unsanitary <- unsanitaryTag tag - if unsanitary - then return $ "" - else return result + return result htmlTag :: Bool -> String @@ -294,16 +221,10 @@ quoted quoteChar = do (many (noneOf [quoteChar])) return (result, [quoteChar]) -nullAttribute :: ([Char], [Char], [Char]) -nullAttribute = ("", "", "") - htmlAttribute :: GenParser Char ParserState ([Char], [Char], [Char]) htmlAttribute = do attr <- htmlRegularAttribute <|> htmlMinimizedAttribute - unsanitary <- unsanitaryAttribute attr - if unsanitary - then return nullAttribute - else return attr + return attr -- minimized boolean attribute htmlMinimizedAttribute :: GenParser Char st ([Char], [Char], [Char]) @@ -364,10 +285,7 @@ htmlScript = try $ do lookAhead $ htmlOpenTag "script" open <- anyHtmlTag rest <- liftM concat $ manyTill scriptChunk (htmlEndTag "script") - st <- getState - if stateSanitizeHTML st && not ("script" `elem` sanitaryTags) - then return "" - else return $ open ++ rest ++ "" + return $ open ++ rest ++ "" scriptChunk :: GenParser Char ParserState [Char] scriptChunk = jsComment <|> jsString <|> jsChars @@ -399,10 +317,7 @@ htmlStyle = try $ do lookAhead $ htmlOpenTag "style" open <- anyHtmlTag rest <- manyTill anyChar (htmlEndTag "style") - st <- getState - if stateSanitizeHTML st && not ("style" `elem` sanitaryTags) - then return "" - else return $ open ++ rest ++ "" + return $ open ++ rest ++ "" htmlBlockElement :: GenParser Char ParserState [Char] htmlBlockElement = choice [ htmlScript, htmlStyle, htmlComment, xmlDec, definition ] diff --git a/src/Text/Pandoc/Readers/Markdown.hs b/src/Text/Pandoc/Readers/Markdown.hs index 6e66c862d..ad107ecd7 100644 --- a/src/Text/Pandoc/Readers/Markdown.hs +++ b/src/Text/Pandoc/Readers/Markdown.hs @@ -41,7 +41,7 @@ import Text.Pandoc.Readers.LaTeX ( rawLaTeXInline, rawLaTeXEnvironment' ) import Text.Pandoc.Readers.HTML ( rawHtmlBlock, anyHtmlBlockTag, anyHtmlInlineTag, anyHtmlTag, anyHtmlEndTag, htmlEndTag, extractTagType, - htmlBlockElement, htmlComment, unsanitaryURI ) + htmlBlockElement, htmlComment ) import Text.Pandoc.CharacterReferences ( decodeCharacterReferences ) import Text.ParserCombinators.Parsec import Control.Monad (when, liftM, guard) @@ -1152,10 +1152,7 @@ link :: GenParser Char ParserState Inline link = try $ do lab <- reference (src, tit) <- source <|> referenceLink lab - sanitize <- getState >>= return . stateSanitizeHTML - if sanitize && unsanitaryURI src - then fail "Unsanitary URI" - else return $ Link lab (src, tit) + return $ Link lab (src, tit) -- a link like [this][ref] or [this][] or [this] referenceLink :: [Inline] @@ -1175,12 +1172,9 @@ autoLink = try $ do (orig, src) <- uri <|> emailAddress char '>' st <- getState - let sanitize = stateSanitizeHTML st - if sanitize && unsanitaryURI src - then fail "Unsanitary URI" - else return $ if stateStrict st - then Link [Str orig] (src, "") - else Link [Code orig] (src, "") + return $ if stateStrict st + then Link [Str orig] (src, "") + else Link [Code orig] (src, "") image :: GenParser Char ParserState Inline image = try $ do -- cgit v1.2.3