7 files changed, 119 insertions, 30 deletions
diff --git a/Main.hs b/Main.hs
index 5938fefc4..98bc0897f 100644
--- a/Main.hs
+++ b/Main.hs
@@ -104,6 +104,7 @@ data Opt = Opt
     , optStrict            :: Bool    -- ^ Use strict markdown syntax
     , optReferenceLinks    :: Bool    -- ^ Use reference links in writing markdown, rst
     , optWrapText          :: Bool    -- ^ Wrap text
+    , optSanitizeHTML      :: Bool    -- ^ Sanitize HTML
     }
 
 -- | Defaults for command-line options.
@@ -132,6 +133,7 @@ defaultOpts = Opt
     , optStrict            = False
     , optReferenceLinks    = False
     , optWrapText          = True
+    , optSanitizeHTML      = False
     }
 
 -- | A list of functions, each transforming the options data structure
@@ -226,6 +228,11 @@ options =
                   (\opt -> return opt { optWrapText = False }))
                  "" -- "Do not wrap text in output"
 
+    , Option "" ["sanitize-html"]
+                 (NoArg
+                  (\opt -> return opt { optSanitizeHTML = True }))
+                 "" -- "Sanitize HTML"
+
     , Option "" ["toc", "table-of-contents"]
                 (NoArg
                  (\opt -> return opt { optTableOfContents = True }))
@@ -424,6 +431,7 @@ main = do
               , optStrict            = strict
               , optReferenceLinks    = referenceLinks
               , optWrapText          = wrap
+              , optSanitizeHTML      = sanitize
              } = opts
 
   if dumpArgs
@@ -476,13 +484,14 @@ main = do
         x:(tabFilter (spsToNextStop - 1) xs)
 
   let startParserState = 
-         defaultParserState { stateParseRaw    = parseRaw,
-                              stateTabStop     = tabStop, 
-                              stateStandalone  = standalone && (not strict),
-                              stateSmart       = smart || writerName' `elem` 
-                                                          ["latex", "context"],
-                              stateColumns     = columns,
-                              stateStrict      = strict }
+         defaultParserState { stateParseRaw     = parseRaw,
+                              stateTabStop      = tabStop, 
+                              stateSanitizeHTML = sanitize,
+                              stateStandalone   = standalone && (not strict),
+                              stateSmart        = smart || writerName' `elem` 
+                                                           ["latex", "context"],
+                              stateColumns      = columns,
+                              stateStrict       = strict }
   let csslink = if (css == "")
                    then "" 
                    else "<link rel=\"stylesheet\" href=\"" ++ css ++ 
diff --git a/README b/README
index cd6b30cda..0f17b3b5e 100644
--- a/README
+++ b/README
@@ -345,6 +345,11 @@ For further documentation, see the `pandoc(1)` man page.
 :   disables text-wrapping in output.  By default, text is wrapped
     appropriately for the output format.
 
+`--sanitize-html`
+:   sanitizes HTML (in markdown or HTML input) using a whitelist.
+    Unsafe tags are replaced by HTML comments; unsafe attributes
+    are omitted.
+
 `--dump-args`
 :   is intended to make it easier to create wrapper scripts that use
     Pandoc. It causes Pandoc to dump information about the arguments
diff --git a/Text/Pandoc/Readers/HTML.hs b/Text/Pandoc/Readers/HTML.hs
index 1fff4705f..42a085f63 100644
--- a/Text/Pandoc/Readers/HTML.hs
+++ b/Text/Pandoc/Readers/HTML.hs
@@ -75,10 +75,54 @@ blockHtmlTags = ["address", "blockquote", "center", "dir", "div",
                  "dt", "frameset", "li", "tbody", "td", "tfoot",
                  "th", "thead", "tr", "script"] ++ eitherBlockOrInline
 
+sanitaryTags = ["a", "abbr", "acronym", "address", "area", "b", "big",
+                "blockquote", "br", "button", "caption", "center",
+                "cite", "code", "col", "colgroup", "dd", "del", "dfn",
+                "dir", "div", "dl", "dt", "em", "fieldset", "font",
+                "form", "h1", "h2", "h3", "h4", "h5", "h6", "hr",
+                "i", "img", "input", "ins", "kbd", "label", "legend",
+                "li", "map", "menu", "ol", "optgroup", "option", "p",
+                "pre", "q", "s", "samp", "select", "small", "span",
+                "strike", "strong", "sub", "sup", "table", "tbody",
+                "td", "textarea", "tfoot", "th", "thead", "tr", "tt",
+                "u", "ul", "var"]
+
+sanitaryAttributes = ["abbr", "accept", "accept-charset",
+                      "accesskey", "action", "align", "alt", "axis",
+                      "border", "cellpadding", "cellspacing", "char",
+                      "charoff", "charset", "checked", "cite", "class",
+                      "clear", "cols", "colspan", "color", "compact",
+                      "coords", "datetime", "dir", "disabled",
+                      "enctype", "for", "frame", "headers", "height",
+                      "href", "hreflang", "hspace", "id", "ismap",
+                      "label", "lang", "longdesc", "maxlength", "media",
+                      "method", "multiple", "name", "nohref", "noshade",
+                      "nowrap", "prompt", "readonly", "rel", "rev",
+                      "rows", "rowspan", "rules", "scope", "selected",
+                      "shape", "size", "span", "src", "start",
+                      "summary", "tabindex", "target", "title", "type",
+                      "usemap", "valign", "value", "vspace", "width"]
+
 --
 -- HTML utility functions
 --
 
+-- | Returns @True@ if sanitization is specified and the specified tag is 
+--  not on the sanitized tag list.
+unsanitaryTag tag = do
+  st <- getState
+  if stateSanitizeHTML st && not (tag `elem` sanitaryTags)
+     then return True
+     else return False
+
+-- | returns @True@ if sanitization is specified and the specified attribute
+--  is not on the sanitized attribute list.
+unsanitaryAttribute (attr, _, _) = do
+  st <- getState
+  if stateSanitizeHTML st && not (attr `elem` sanitaryAttributes)
+    then return True
+    else return False
+
 -- | Read blocks until end tag.
 blocksTilEnd tag = do
   blocks <- manyTill (block >>~ spaces) (htmlEndTag tag)
@@ -111,20 +155,28 @@ anyHtmlTag = try $ do
   let ender' = if null ender then "" else " /"
   spaces
   char '>'
-  return $ "<" ++ tag ++ 
-           concatMap (\(_, _, raw) -> (' ':raw)) attribs ++ ender' ++ ">"
+  let result = "<" ++ tag ++ 
+               concatMap (\(_, _, raw) -> (' ':raw)) attribs ++ ender' ++ ">"
+  unsanitary <- unsanitaryTag tag
+  if unsanitary
+     then return $ "<!-- unsafe tag " ++ result ++ " omitted -->"
+     else return result
 
 anyHtmlEndTag = try $ do
   char '<'   
   spaces
   char '/'
   spaces
-  tagType <- many1 alphaNum
+  tag <- many1 alphaNum
   spaces
   char '>'
-  return $ "</" ++ tagType ++ ">"
+  let result = "</" ++ tag ++ ">"
+  unsanitary <- unsanitaryTag tag
+  if unsanitary
+     then return $ "<!-- unsafe tag " ++ result ++ " omitted -->"
+     else return result 
 
-htmlTag :: String -> GenParser Char st (String, [(String, String)])
+htmlTag :: String -> GenParser Char ParserState (String, [(String, String)])
 htmlTag tag = try $ do
   char '<'
   spaces
@@ -142,7 +194,14 @@ quoted quoteChar = do
                     (many (noneOf [quoteChar]))
   return (result, [quoteChar])
 
-htmlAttribute = htmlRegularAttribute <|> htmlMinimizedAttribute
+nullAttribute = ("", "", "") 
+
+htmlAttribute = do
+  attr <- htmlRegularAttribute <|> htmlMinimizedAttribute
+  unsanitary <- unsanitaryAttribute attr
+  if unsanitary
+     then return nullAttribute
+     else return attr
 
 -- minimized boolean attribute
 htmlMinimizedAttribute = try $ do
@@ -183,7 +242,7 @@ isBlock tag = (extractTagType tag) `elem` blockHtmlTags
 
 anyHtmlBlockTag = try $ do
   tag <- anyHtmlTag <|> anyHtmlEndTag
-  if isBlock tag then return tag else fail "not a block tag"
+  if not (isInline tag) then return tag else fail "not a block tag"
 
 anyHtmlInlineTag = try $ do
   tag <- anyHtmlTag <|> anyHtmlEndTag
@@ -194,19 +253,25 @@ anyHtmlInlineTag = try $ do
 htmlScript = try $ do
   open <- string "<script"
   rest <- manyTill anyChar (htmlEndTag "script")
-  return $ open ++ rest ++ "</script>"
+  st <- getState
+  if stateSanitizeHTML st && not ("script" `elem` sanitaryTags)
+     then return "<!-- unsafe script omitted -->"
+     else return $ open ++ rest ++ "</script>"
 
 -- | Parses material between style tags.
 -- Style tags must be treated differently, because they can contain CSS
 htmlStyle = try $ do
   open <- string "<style"
   rest <- manyTill anyChar (htmlEndTag "style")
-  return $ open ++ rest ++ "</style>"
+  st <- getState
+  if stateSanitizeHTML st && not ("style" `elem` sanitaryTags)
+     then return "<!-- unsafe style omitted -->"
+     else return $ open ++ rest ++ "</style>"
 
 htmlBlockElement = choice [ htmlScript, htmlStyle, htmlComment, xmlDec, definition ]
 
 rawHtmlBlock = try $ do
-  body <- htmlBlockElement <|> anyHtmlTag <|> anyHtmlEndTag
+  body <- htmlBlockElement <|> anyHtmlBlockTag
   state <- getState
   if stateParseRaw state then return (RawHtml body) else return Null
 
@@ -235,8 +300,10 @@ definition = try $ do
   rest <- manyTill anyChar (char '>')
   return $ "<!" ++ rest ++ ">"
 
-nonTitleNonHead = try $ notFollowedBy' (htmlTag "title" <|> htmlTag "/head") >>
-  ((rawHtmlBlock >> return ' ') <|> anyChar)
+nonTitleNonHead = try $ do
+  notFollowedBy $ (htmlTag "title" >> return ' ') <|> 
+                  (htmlEndTag "head" >> return ' ')
+  (rawHtmlBlock >> return ' ') <|> anyChar
 
 parseTitle = try $ do
   (tag, _) <- htmlTag "title"
@@ -251,7 +318,7 @@ parseHead = try $ do
   skipMany nonTitleNonHead
   contents <- option [] parseTitle
   skipMany nonTitleNonHead
-  htmlTag "/head"
+  htmlEndTag "head"
   return (contents, [], "")
 
 skipHtmlTag tag = optional (htmlTag tag)
diff --git a/Text/Pandoc/Readers/Markdown.hs b/Text/Pandoc/Readers/Markdown.hs
index 6455dcd9d..2d1fa7583 100644
--- a/Text/Pandoc/Readers/Markdown.hs
+++ b/Text/Pandoc/Readers/Markdown.hs
@@ -249,7 +249,7 @@ block = do
                    , blockQuote
                    , rawLaTeXEnvironment
                    , para
-                   , htmlBlock
+                   , rawHtmlBlocks
                    , plain
                    , nullBlock ]) <?> "block"
 
@@ -482,15 +482,12 @@ plain = many1 inline >>= return . Plain . normalizeSpaces
 
 htmlElement = strictHtmlBlock <|> htmlBlockElement <?> "html element"
 
-htmlBlock = do
-  st <- getState
-  if stateStrict st
-    then try $ do failUnlessBeginningOfLine
-                  first <- htmlElement
-                  finalSpace <- many (oneOf spaceChars)
-                  finalNewlines <- many newline
-                  return $ RawHtml $ first ++ finalSpace ++ finalNewlines
-    else rawHtmlBlocks
+htmlBlock = try $ do
+    failUnlessBeginningOfLine
+    first <- htmlElement
+    finalSpace <- many (oneOf spaceChars)
+    finalNewlines <- many newline
+    return $ RawHtml $ first ++ finalSpace ++ finalNewlines
 
 -- True if tag is self-closing
 isSelfClosing tag = 
diff --git a/Text/Pandoc/Shared.hs b/Text/Pandoc/Shared.hs
index 7086ca452..477d86464 100644
--- a/Text/Pandoc/Shared.hs
+++ b/Text/Pandoc/Shared.hs
@@ -595,6 +595,7 @@ data ParserState = ParserState
     { stateParseRaw        :: Bool,          -- ^ Parse raw HTML and LaTeX?
       stateParserContext   :: ParserContext, -- ^ Inside list?
       stateQuoteContext    :: QuoteContext,  -- ^ Inside quoted environment?
+      stateSanitizeHTML    :: Bool,          -- ^ Sanitize HTML?
       stateKeys            :: KeyTable,      -- ^ List of reference keys
       stateNotes           :: NoteTable,     -- ^ List of notes
       stateTabStop         :: Int,           -- ^ Tab stop
@@ -614,6 +615,7 @@ defaultParserState =
     ParserState { stateParseRaw        = False,
                   stateParserContext   = NullState,
                   stateQuoteContext    = NoQuote,
+                  stateSanitizeHTML    = False,
                   stateKeys            = [],
                   stateNotes           = [],
                   stateTabStop         = 4,
diff --git a/man/man1/html2markdown.1.md b/man/man1/html2markdown.1.md
index 6c5d2dcc8..19d5104af 100644
--- a/man/man1/html2markdown.1.md
+++ b/man/man1/html2markdown.1.md
@@ -51,6 +51,10 @@ a complete list.  The following options are most relevant:
 \--no-wrap
 :   Disable text wrapping in output.  (Default is to wrap text.)
 
+\--sanitize-html
+:   Sanitizes HTML using a whitelist. Unsafe tags are replaced by HTML
+    comments; unsafe attributes are omitted.
+
 -H *FILE*, \--include-in-header=*FILE*
 :   Include contents of *FILE* at the end of the header.  Implies
     `-s`.
diff --git a/man/man1/pandoc.1.md b/man/man1/pandoc.1.md
index 37d3dc262..427004419 100644
--- a/man/man1/pandoc.1.md
+++ b/man/man1/pandoc.1.md
@@ -126,6 +126,11 @@ to Pandoc.  Or use `html2markdown`(1), a wrapper around `pandoc`.
 \--no-wrap
 :   Disable text wrapping in output. (Default is to wrap text.)
 
+\--sanitize-html
+:   Sanitizes HTML (in markdown or HTML input) using a whitelist.
+    Unsafe tags are replaced by HTML comments; unsafe attributes
+    are omitted.
+
 \--toc, \--table-of-contents
 :   Include an automatically generated table of contents (HTML, markdown,
     RTF) or an instruction to create one (LaTeX, reStructuredText).