aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Main.hs23
-rw-r--r--README5
-rw-r--r--Text/Pandoc/Readers/HTML.hs93
-rw-r--r--Text/Pandoc/Readers/Markdown.hs17
-rw-r--r--Text/Pandoc/Shared.hs2
-rw-r--r--man/man1/html2markdown.1.md4
-rw-r--r--man/man1/pandoc.1.md5
7 files changed, 119 insertions, 30 deletions
diff --git a/Main.hs b/Main.hs
index 5938fefc4..98bc0897f 100644
--- a/Main.hs
+++ b/Main.hs
@@ -104,6 +104,7 @@ data Opt = Opt
, optStrict :: Bool -- ^ Use strict markdown syntax
, optReferenceLinks :: Bool -- ^ Use reference links in writing markdown, rst
, optWrapText :: Bool -- ^ Wrap text
+ , optSanitizeHTML :: Bool -- ^ Sanitize HTML
}
-- | Defaults for command-line options.
@@ -132,6 +133,7 @@ defaultOpts = Opt
, optStrict = False
, optReferenceLinks = False
, optWrapText = True
+ , optSanitizeHTML = False
}
-- | A list of functions, each transforming the options data structure
@@ -226,6 +228,11 @@ options =
(\opt -> return opt { optWrapText = False }))
"" -- "Do not wrap text in output"
+ , Option "" ["sanitize-html"]
+ (NoArg
+ (\opt -> return opt { optSanitizeHTML = True }))
+ "" -- "Sanitize HTML"
+
, Option "" ["toc", "table-of-contents"]
(NoArg
(\opt -> return opt { optTableOfContents = True }))
@@ -424,6 +431,7 @@ main = do
, optStrict = strict
, optReferenceLinks = referenceLinks
, optWrapText = wrap
+ , optSanitizeHTML = sanitize
} = opts
if dumpArgs
@@ -476,13 +484,14 @@ main = do
x:(tabFilter (spsToNextStop - 1) xs)
let startParserState =
- defaultParserState { stateParseRaw = parseRaw,
- stateTabStop = tabStop,
- stateStandalone = standalone && (not strict),
- stateSmart = smart || writerName' `elem`
- ["latex", "context"],
- stateColumns = columns,
- stateStrict = strict }
+ defaultParserState { stateParseRaw = parseRaw,
+ stateTabStop = tabStop,
+ stateSanitizeHTML = sanitize,
+ stateStandalone = standalone && (not strict),
+ stateSmart = smart || writerName' `elem`
+ ["latex", "context"],
+ stateColumns = columns,
+ stateStrict = strict }
let csslink = if (css == "")
then ""
else "<link rel=\"stylesheet\" href=\"" ++ css ++
diff --git a/README b/README
index cd6b30cda..0f17b3b5e 100644
--- a/README
+++ b/README
@@ -345,6 +345,11 @@ For further documentation, see the `pandoc(1)` man page.
: disables text-wrapping in output. By default, text is wrapped
appropriately for the output format.
+`--sanitize-html`
+: sanitizes HTML (in markdown or HTML input) using a whitelist.
+ Unsafe tags are replaced by HTML comments; unsafe attributes
+ are omitted.
+
`--dump-args`
: is intended to make it easier to create wrapper scripts that use
Pandoc. It causes Pandoc to dump information about the arguments
diff --git a/Text/Pandoc/Readers/HTML.hs b/Text/Pandoc/Readers/HTML.hs
index 1fff4705f..42a085f63 100644
--- a/Text/Pandoc/Readers/HTML.hs
+++ b/Text/Pandoc/Readers/HTML.hs
@@ -75,10 +75,54 @@ blockHtmlTags = ["address", "blockquote", "center", "dir", "div",
"dt", "frameset", "li", "tbody", "td", "tfoot",
"th", "thead", "tr", "script"] ++ eitherBlockOrInline
+sanitaryTags = ["a", "abbr", "acronym", "address", "area", "b", "big",
+ "blockquote", "br", "button", "caption", "center",
+ "cite", "code", "col", "colgroup", "dd", "del", "dfn",
+ "dir", "div", "dl", "dt", "em", "fieldset", "font",
+ "form", "h1", "h2", "h3", "h4", "h5", "h6", "hr",
+ "i", "img", "input", "ins", "kbd", "label", "legend",
+ "li", "map", "menu", "ol", "optgroup", "option", "p",
+ "pre", "q", "s", "samp", "select", "small", "span",
+ "strike", "strong", "sub", "sup", "table", "tbody",
+ "td", "textarea", "tfoot", "th", "thead", "tr", "tt",
+ "u", "ul", "var"]
+
+sanitaryAttributes = ["abbr", "accept", "accept-charset",
+ "accesskey", "action", "align", "alt", "axis",
+ "border", "cellpadding", "cellspacing", "char",
+ "charoff", "charset", "checked", "cite", "class",
+ "clear", "cols", "colspan", "color", "compact",
+ "coords", "datetime", "dir", "disabled",
+ "enctype", "for", "frame", "headers", "height",
+ "href", "hreflang", "hspace", "id", "ismap",
+ "label", "lang", "longdesc", "maxlength", "media",
+ "method", "multiple", "name", "nohref", "noshade",
+ "nowrap", "prompt", "readonly", "rel", "rev",
+ "rows", "rowspan", "rules", "scope", "selected",
+ "shape", "size", "span", "src", "start",
+ "summary", "tabindex", "target", "title", "type",
+ "usemap", "valign", "value", "vspace", "width"]
+
--
-- HTML utility functions
--
+-- | Returns @True@ if sanitization is specified and the specified tag is
+-- not on the sanitized tag list.
+unsanitaryTag tag = do
+ st <- getState
+ if stateSanitizeHTML st && not (tag `elem` sanitaryTags)
+ then return True
+ else return False
+
+-- | returns @True@ if sanitization is specified and the specified attribute
+-- is not on the sanitized attribute list.
+unsanitaryAttribute (attr, _, _) = do
+ st <- getState
+ if stateSanitizeHTML st && not (attr `elem` sanitaryAttributes)
+ then return True
+ else return False
+
-- | Read blocks until end tag.
blocksTilEnd tag = do
blocks <- manyTill (block >>~ spaces) (htmlEndTag tag)
@@ -111,20 +155,28 @@ anyHtmlTag = try $ do
let ender' = if null ender then "" else " /"
spaces
char '>'
- return $ "<" ++ tag ++
- concatMap (\(_, _, raw) -> (' ':raw)) attribs ++ ender' ++ ">"
+ let result = "<" ++ tag ++
+ concatMap (\(_, _, raw) -> (' ':raw)) attribs ++ ender' ++ ">"
+ unsanitary <- unsanitaryTag tag
+ if unsanitary
+ then return $ "<!-- unsafe tag " ++ result ++ " omitted -->"
+ else return result
anyHtmlEndTag = try $ do
char '<'
spaces
char '/'
spaces
- tagType <- many1 alphaNum
+ tag <- many1 alphaNum
spaces
char '>'
- return $ "</" ++ tagType ++ ">"
+ let result = "</" ++ tag ++ ">"
+ unsanitary <- unsanitaryTag tag
+ if unsanitary
+ then return $ "<!-- unsafe tag " ++ result ++ " omitted -->"
+ else return result
-htmlTag :: String -> GenParser Char st (String, [(String, String)])
+htmlTag :: String -> GenParser Char ParserState (String, [(String, String)])
htmlTag tag = try $ do
char '<'
spaces
@@ -142,7 +194,14 @@ quoted quoteChar = do
(many (noneOf [quoteChar]))
return (result, [quoteChar])
-htmlAttribute = htmlRegularAttribute <|> htmlMinimizedAttribute
+nullAttribute = ("", "", "")
+
+htmlAttribute = do
+ attr <- htmlRegularAttribute <|> htmlMinimizedAttribute
+ unsanitary <- unsanitaryAttribute attr
+ if unsanitary
+ then return nullAttribute
+ else return attr
-- minimized boolean attribute
htmlMinimizedAttribute = try $ do
@@ -183,7 +242,7 @@ isBlock tag = (extractTagType tag) `elem` blockHtmlTags
anyHtmlBlockTag = try $ do
tag <- anyHtmlTag <|> anyHtmlEndTag
- if isBlock tag then return tag else fail "not a block tag"
+ if not (isInline tag) then return tag else fail "not a block tag"
anyHtmlInlineTag = try $ do
tag <- anyHtmlTag <|> anyHtmlEndTag
@@ -194,19 +253,25 @@ anyHtmlInlineTag = try $ do
htmlScript = try $ do
open <- string "<script"
rest <- manyTill anyChar (htmlEndTag "script")
- return $ open ++ rest ++ "</script>"
+ st <- getState
+ if stateSanitizeHTML st && not ("script" `elem` sanitaryTags)
+ then return "<!-- unsafe script omitted -->"
+ else return $ open ++ rest ++ "</script>"
-- | Parses material between style tags.
-- Style tags must be treated differently, because they can contain CSS
htmlStyle = try $ do
open <- string "<style"
rest <- manyTill anyChar (htmlEndTag "style")
- return $ open ++ rest ++ "</style>"
+ st <- getState
+ if stateSanitizeHTML st && not ("style" `elem` sanitaryTags)
+ then return "<!-- unsafe style omitted -->"
+ else return $ open ++ rest ++ "</style>"
htmlBlockElement = choice [ htmlScript, htmlStyle, htmlComment, xmlDec, definition ]
rawHtmlBlock = try $ do
- body <- htmlBlockElement <|> anyHtmlTag <|> anyHtmlEndTag
+ body <- htmlBlockElement <|> anyHtmlBlockTag
state <- getState
if stateParseRaw state then return (RawHtml body) else return Null
@@ -235,8 +300,10 @@ definition = try $ do
rest <- manyTill anyChar (char '>')
return $ "<!" ++ rest ++ ">"
-nonTitleNonHead = try $ notFollowedBy' (htmlTag "title" <|> htmlTag "/head") >>
- ((rawHtmlBlock >> return ' ') <|> anyChar)
+nonTitleNonHead = try $ do
+ notFollowedBy $ (htmlTag "title" >> return ' ') <|>
+ (htmlEndTag "head" >> return ' ')
+ (rawHtmlBlock >> return ' ') <|> anyChar
parseTitle = try $ do
(tag, _) <- htmlTag "title"
@@ -251,7 +318,7 @@ parseHead = try $ do
skipMany nonTitleNonHead
contents <- option [] parseTitle
skipMany nonTitleNonHead
- htmlTag "/head"
+ htmlEndTag "head"
return (contents, [], "")
skipHtmlTag tag = optional (htmlTag tag)
diff --git a/Text/Pandoc/Readers/Markdown.hs b/Text/Pandoc/Readers/Markdown.hs
index 6455dcd9d..2d1fa7583 100644
--- a/Text/Pandoc/Readers/Markdown.hs
+++ b/Text/Pandoc/Readers/Markdown.hs
@@ -249,7 +249,7 @@ block = do
, blockQuote
, rawLaTeXEnvironment
, para
- , htmlBlock
+ , rawHtmlBlocks
, plain
, nullBlock ]) <?> "block"
@@ -482,15 +482,12 @@ plain = many1 inline >>= return . Plain . normalizeSpaces
htmlElement = strictHtmlBlock <|> htmlBlockElement <?> "html element"
-htmlBlock = do
- st <- getState
- if stateStrict st
- then try $ do failUnlessBeginningOfLine
- first <- htmlElement
- finalSpace <- many (oneOf spaceChars)
- finalNewlines <- many newline
- return $ RawHtml $ first ++ finalSpace ++ finalNewlines
- else rawHtmlBlocks
+htmlBlock = try $ do
+ failUnlessBeginningOfLine
+ first <- htmlElement
+ finalSpace <- many (oneOf spaceChars)
+ finalNewlines <- many newline
+ return $ RawHtml $ first ++ finalSpace ++ finalNewlines
-- True if tag is self-closing
isSelfClosing tag =
diff --git a/Text/Pandoc/Shared.hs b/Text/Pandoc/Shared.hs
index 7086ca452..477d86464 100644
--- a/Text/Pandoc/Shared.hs
+++ b/Text/Pandoc/Shared.hs
@@ -595,6 +595,7 @@ data ParserState = ParserState
{ stateParseRaw :: Bool, -- ^ Parse raw HTML and LaTeX?
stateParserContext :: ParserContext, -- ^ Inside list?
stateQuoteContext :: QuoteContext, -- ^ Inside quoted environment?
+ stateSanitizeHTML :: Bool, -- ^ Sanitize HTML?
stateKeys :: KeyTable, -- ^ List of reference keys
stateNotes :: NoteTable, -- ^ List of notes
stateTabStop :: Int, -- ^ Tab stop
@@ -614,6 +615,7 @@ defaultParserState =
ParserState { stateParseRaw = False,
stateParserContext = NullState,
stateQuoteContext = NoQuote,
+ stateSanitizeHTML = False,
stateKeys = [],
stateNotes = [],
stateTabStop = 4,
diff --git a/man/man1/html2markdown.1.md b/man/man1/html2markdown.1.md
index 6c5d2dcc8..19d5104af 100644
--- a/man/man1/html2markdown.1.md
+++ b/man/man1/html2markdown.1.md
@@ -51,6 +51,10 @@ a complete list. The following options are most relevant:
\--no-wrap
: Disable text wrapping in output. (Default is to wrap text.)
+\--sanitize-html
+: Sanitizes HTML using a whitelist. Unsafe tags are replaced by HTML
+ comments; unsafe attributes are omitted.
+
-H *FILE*, \--include-in-header=*FILE*
: Include contents of *FILE* at the end of the header. Implies
`-s`.
diff --git a/man/man1/pandoc.1.md b/man/man1/pandoc.1.md
index 37d3dc262..427004419 100644
--- a/man/man1/pandoc.1.md
+++ b/man/man1/pandoc.1.md
@@ -126,6 +126,11 @@ to Pandoc. Or use `html2markdown`(1), a wrapper around `pandoc`.
\--no-wrap
: Disable text wrapping in output. (Default is to wrap text.)
+\--sanitize-html
+: Sanitizes HTML (in markdown or HTML input) using a whitelist.
+ Unsafe tags are replaced by HTML comments; unsafe attributes
+ are omitted.
+
\--toc, \--table-of-contents
: Include an automatically generated table of contents (HTML, markdown,
RTF) or an instruction to create one (LaTeX, reStructuredText).