aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r-- README 3
-rw-r--r-- Text/Pandoc/Readers/HTML.hs 32
-rw-r--r-- Text/Pandoc/Readers/Markdown.hs 16
-rw-r--r-- man/man1/html2markdown.1.md 4
-rw-r--r-- man/man1/pandoc.1.md 3
5 files changed, 39 insertions, 19 deletions
diff --git a/README b/README
index e54821ba3..75d482333 100644
--- a/README
+++ b/README
@@ -356,7 +356,8 @@ For further documentation, see the `pandoc(1)` man page.
`--sanitize-html`
: sanitizes HTML (in markdown or HTML input) using a whitelist.
Unsafe tags are replaced by HTML comments; unsafe attributes
- are omitted.
+ are omitted. URIs in links and images are also checked against a
+ whitelist of URI schemes.
`--dump-args`
: is intended to make it easier to create wrapper scripts that use
diff --git a/Text/Pandoc/Readers/HTML.hs b/Text/Pandoc/Readers/HTML.hs
index 359ff3021..7bd76d983 100644
--- a/Text/Pandoc/Readers/HTML.hs
+++ b/Text/Pandoc/Readers/HTML.hs
@@ -37,7 +37,8 @@ module Text.Pandoc.Readers.HTML (
anyHtmlEndTag,
htmlEndTag,
extractTagType,
- htmlBlockElement
+ htmlBlockElement,
+ unsanitaryURI
) where
import Text.ParserCombinators.Parsec
@@ -47,6 +48,7 @@ import Text.Pandoc.CharacterReferences ( decodeCharacterReferences )
import Data.Maybe ( fromMaybe )
import Data.List ( takeWhile, dropWhile, isPrefixOf, isSuffixOf )
import Data.Char ( toLower, isAlphaNum )
+import Network.URI ( parseURIReference, URI (..) )
-- | Convert HTML-formatted string to 'Pandoc' document.
readHtml :: ParserState -- ^ Parser state
@@ -110,17 +112,31 @@ sanitaryAttributes = ["abbr", "accept", "accept-charset",
-- not on the sanitized tag list.
unsanitaryTag tag = do
st <- getState
- if stateSanitizeHTML st && not (tag `elem` sanitaryTags)
- then return True
- else return False
+ return $ stateSanitizeHTML st && tag `notElem` sanitaryTags
-- | returns @True@ if sanitization is specified and the specified attribute
-- is not on the sanitized attribute list.
-unsanitaryAttribute (attr, _, _) = do
+unsanitaryAttribute (attr, val, _) = do
st <- getState
- if stateSanitizeHTML st && not (attr `elem` sanitaryAttributes)
- then return True
- else return False
+ return $ stateSanitizeHTML st &&
+ (attr `notElem` sanitaryAttributes ||
+ (attr `elem` ["href","src"] && unsanitaryURI val))
+
+-- | Returns @True@ if the specified URI is potentially a security risk:
+-- either it cannot be parsed as a URI reference at all, or its scheme
+-- (compared case-insensitively) is not on the whitelist below.  The empty
+-- scheme is whitelisted so that references without a scheme are accepted.
+unsanitaryURI :: String -> Bool
+unsanitaryURI uri =
+  let safeURISchemes = [ "", "http", "https", "ftp", "mailto", "file",
+        "telnet", "gopher", "aaa", "aaas", "acap", "cap", "cid",
+        "crid", "dav", "dict", "dns", "fax", "go", "h323", "im",
+        "imap", "ldap", "mid", "news", "nfs", "nntp", "pop",
+        "pres", "sip", "sips", "snmp", "tel", "urn", "wais",
+        "xmpp", "z39.50r", "z39.50s", "aim", "callto", "cvs",
+        "ed2k", "feed", "fish", "gg", "irc", "ircs", "lastfm",
+        "ldaps", "magnet", "mms", "msnim", "notes", "rsync",
+        "secondlife", "skype", "ssh", "sftp", "smb", "sms",
+        "snews", "webcal", "ymsgr"]
+      -- 'uriScheme' yields the scheme together with its trailing colon
+      -- (e.g. @"http:"@), while the whitelist entries carry no colon.
+      -- Drop the colon before the lookup; otherwise no absolute URI could
+      -- ever match and every one of them would be reported as unsafe.
+      scheme = map toLower . takeWhile (/= ':') . uriScheme
+  in  case parseURIReference uri of
+           Just p  -> scheme p `notElem` safeURISchemes
+           Nothing -> True  -- unparseable input is treated as unsafe
-- | Read blocks until end tag.
blocksTilEnd tag = do
diff --git a/Text/Pandoc/Readers/Markdown.hs b/Text/Pandoc/Readers/Markdown.hs
index e6f09f97a..2dbf9e189 100644
--- a/Text/Pandoc/Readers/Markdown.hs
+++ b/Text/Pandoc/Readers/Markdown.hs
@@ -41,7 +41,7 @@ import Text.Pandoc.Readers.LaTeX ( rawLaTeXInline, rawLaTeXEnvironment )
import Text.Pandoc.Readers.HTML ( rawHtmlBlock, anyHtmlBlockTag,
anyHtmlInlineTag, anyHtmlTag,
anyHtmlEndTag, htmlEndTag, extractTagType,
- htmlBlockElement )
+ htmlBlockElement, unsanitaryURI )
import Text.Pandoc.CharacterReferences ( decodeCharacterReferences )
import Text.ParserCombinators.Parsec
@@ -921,7 +921,10 @@ linkTitle = try $ do
link = try $ do
label <- reference
src <- source <|> referenceLink label
- return $ Link label src
+ sanitize <- getState >>= return . stateSanitizeHTML
+ if sanitize && unsanitaryURI (fst src)
+ then fail "Unsanitary URI"
+ else return $ Link label src
-- a link like [this][ref] or [this][] or [this]
referenceLink label = do
@@ -941,9 +944,12 @@ autoLink = try $ do
then drop 7 src
else src
st <- getState
- return $ if stateStrict st
- then Link [Str src'] (src, "")
- else Link [Code src'] (src, "")
+ let sanitize = stateSanitizeHTML st
+ if sanitize && unsanitaryURI src
+ then fail "Unsanitary URI"
+ else return $ if stateStrict st
+ then Link [Str src'] (src, "")
+ else Link [Code src'] (src, "")
image = try $ do
char '!'
diff --git a/man/man1/html2markdown.1.md b/man/man1/html2markdown.1.md
index 1db37cf47..905bdd0d0 100644
--- a/man/man1/html2markdown.1.md
+++ b/man/man1/html2markdown.1.md
@@ -51,10 +51,6 @@ a complete list. The following options are most relevant:
\--no-wrap
: Disable text wrapping in output. (Default is to wrap text.)
-\--sanitize-html
-: Sanitizes HTML using a whitelist. Unsafe tags are replaced by HTML
- comments; unsafe attributes are omitted.
-
-H *FILE*, \--include-in-header=*FILE*
: Include contents of *FILE* at the end of the header. Implies
`-s`.
diff --git a/man/man1/pandoc.1.md b/man/man1/pandoc.1.md
index 5bf734d5a..e3ca8e591 100644
--- a/man/man1/pandoc.1.md
+++ b/man/man1/pandoc.1.md
@@ -128,7 +128,8 @@ to Pandoc. Or use `html2markdown`(1), a wrapper around `pandoc`.
\--sanitize-html
: Sanitizes HTML (in markdown or HTML input) using a whitelist.
Unsafe tags are replaced by HTML comments; unsafe attributes
- are omitted.
+ are omitted. URIs in links and images are also checked against a
+ whitelist of URI schemes.
\--toc, \--table-of-contents
: Include an automatically generated table of contents (HTML, markdown,