Diffstat (limited to 'src/Text')
 src/Text/Pandoc.hs                  | 45
 src/Text/Pandoc/Parsing.hs          |  2
 src/Text/Pandoc/Readers/HTML.hs     | 95
 src/Text/Pandoc/Readers/LaTeX.hs    | 17
 src/Text/Pandoc/Readers/Markdown.hs | 75
 5 files changed, 96 insertions(+), 138 deletions(-)
diff --git a/src/Text/Pandoc.hs b/src/Text/Pandoc.hs
index ab1e3cd03..7d3468461 100644
--- a/src/Text/Pandoc.hs
+++ b/src/Text/Pandoc.hs
@@ -57,6 +57,9 @@ module Text.Pandoc
(
-- * Definitions
module Text.Pandoc.Definition
+ -- * Lists of readers and writers
+ , readers
+ , writers
-- * Readers: converting /to/ Pandoc format
, readMarkdown
, readRST
@@ -127,8 +130,50 @@ import Text.Pandoc.Templates
import Text.Pandoc.Parsing
import Text.Pandoc.Shared
import Data.Version (showVersion)
+import Text.JSON.Generic
import Paths_pandoc (version)
-- | Version number of pandoc library.
pandocVersion :: String
pandocVersion = showVersion version
+
+-- | Association list of formats and readers.
+readers :: [(String, ParserState -> String -> Pandoc)]
+readers = [("native" , \_ -> read)
+ ,("json" , \_ -> decodeJSON)
+ ,("markdown" , readMarkdown)
+ ,("markdown+lhs" , readMarkdown)
+ ,("rst" , readRST)
+ ,("textile" , readTextile) -- TODO : textile+lhs
+ ,("rst+lhs" , readRST)
+ ,("html" , readHtml)
+ ,("latex" , readLaTeX)
+ ,("latex+lhs" , readLaTeX)
+ ]
+
+-- | Association list of formats and writers (omitting the
+-- binary writers, odt and epub).
+writers :: [ ( String, WriterOptions -> Pandoc -> String ) ]
+writers = [("native" , writeNative)
+ ,("json" , \_ -> encodeJSON)
+ ,("html" , writeHtmlString)
+ ,("html+lhs" , writeHtmlString)
+ ,("s5" , writeHtmlString)
+ ,("slidy" , writeHtmlString)
+ ,("docbook" , writeDocbook)
+ ,("opendocument" , writeOpenDocument)
+ ,("latex" , writeLaTeX)
+ ,("latex+lhs" , writeLaTeX)
+ ,("context" , writeConTeXt)
+ ,("texinfo" , writeTexinfo)
+ ,("man" , writeMan)
+ ,("markdown" , writeMarkdown)
+ ,("markdown+lhs" , writeMarkdown)
+ ,("plain" , writePlain)
+ ,("rst" , writeRST)
+ ,("rst+lhs" , writeRST)
+ ,("mediawiki" , writeMediaWiki)
+ ,("textile" , writeTextile)
+ ,("rtf" , writeRTF)
+ ,("org" , writeOrg)
+ ]
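
The two association lists added above make it possible to select a reader and a writer by format name at runtime. A minimal sketch (not part of this patch) of how they might be used, assuming defaultParserState and defaultWriterOptions are in scope from the pandoc API of this era:

    import Text.Pandoc

    -- Look up a reader and a writer by format name and run the
    -- conversion; Nothing means an unknown format name.
    convert :: String -> String -> String -> Maybe String
    convert from to input = do
      reader <- lookup from readers
      writer <- lookup to writers
      return $ writer defaultWriterOptions (reader defaultParserState input)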
diff --git a/src/Text/Pandoc/Parsing.hs b/src/Text/Pandoc/Parsing.hs
index 1b206e4c7..48c6aa70d 100644
--- a/src/Text/Pandoc/Parsing.hs
+++ b/src/Text/Pandoc/Parsing.hs
@@ -586,7 +586,6 @@ data ParserState = ParserState
{ stateParseRaw :: Bool, -- ^ Parse raw HTML and LaTeX?
stateParserContext :: ParserContext, -- ^ Inside list?
stateQuoteContext :: QuoteContext, -- ^ Inside quoted environment?
- stateSanitizeHTML :: Bool, -- ^ Sanitize HTML?
stateKeys :: KeyTable, -- ^ List of reference keys
stateCitations :: [String], -- ^ List of available citations
stateNotes :: NoteTable, -- ^ List of notes
@@ -614,7 +613,6 @@ defaultParserState =
ParserState { stateParseRaw = False,
stateParserContext = NullState,
stateQuoteContext = NoQuote,
- stateSanitizeHTML = False,
stateKeys = M.empty,
stateCitations = [],
stateNotes = [],
diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs
index f05fdd57b..ed026eb49 100644
--- a/src/Text/Pandoc/Readers/HTML.hs
+++ b/src/Text/Pandoc/Readers/HTML.hs
@@ -40,7 +40,6 @@ module Text.Pandoc.Readers.HTML (
extractTagType,
htmlBlockElement,
htmlComment,
- unsanitaryURI
) where
import Text.ParserCombinators.Parsec
@@ -51,7 +50,6 @@ import Text.Pandoc.CharacterReferences ( decodeCharacterReferences )
import Data.Maybe ( fromMaybe )
import Data.List ( isPrefixOf, isSuffixOf, intercalate )
import Data.Char ( toLower, isAlphaNum )
-import Network.URI ( parseURIReference, URI (..) )
import Control.Monad ( liftM, when )
-- | Convert HTML-formatted string to 'Pandoc' document.
@@ -85,36 +83,6 @@ blockHtmlTags = ["address", "blockquote", "body", "center", "dir", "div",
"dt", "frameset", "li", "tbody", "td", "tfoot",
"th", "thead", "tr", "script", "style"]
-sanitaryTags :: [[Char]]
-sanitaryTags = ["a", "abbr", "acronym", "address", "area", "b", "big",
- "blockquote", "br", "button", "caption", "center",
- "cite", "code", "col", "colgroup", "dd", "del", "dfn",
- "dir", "div", "dl", "dt", "em", "fieldset", "font",
- "form", "h1", "h2", "h3", "h4", "h5", "h6", "hr",
- "i", "img", "input", "ins", "kbd", "label", "legend",
- "li", "map", "menu", "ol", "optgroup", "option", "p",
- "pre", "q", "s", "samp", "select", "small", "span",
- "strike", "strong", "sub", "sup", "table", "tbody",
- "td", "textarea", "tfoot", "th", "thead", "tr", "tt",
- "u", "ul", "var"]
-
-sanitaryAttributes :: [[Char]]
-sanitaryAttributes = ["abbr", "accept", "accept-charset",
- "accesskey", "action", "align", "alt", "axis",
- "border", "cellpadding", "cellspacing", "char",
- "charoff", "charset", "checked", "cite", "class",
- "clear", "cols", "colspan", "color", "compact",
- "coords", "datetime", "dir", "disabled",
- "enctype", "for", "frame", "headers", "height",
- "href", "hreflang", "hspace", "id", "ismap",
- "label", "lang", "longdesc", "maxlength", "media",
- "method", "multiple", "name", "nohref", "noshade",
- "nowrap", "prompt", "readonly", "rel", "rev",
- "rows", "rowspan", "rules", "scope", "selected",
- "shape", "size", "span", "src", "start",
- "summary", "tabindex", "target", "title", "type",
- "usemap", "valign", "value", "vspace", "width"]
-
-- taken from HXT and extended
closes :: String -> String -> Bool
@@ -153,41 +121,6 @@ _ `closes` _ = False
-- HTML utility functions
--
--- | Returns @True@ if sanitization is specified and the specified tag is
--- not on the sanitized tag list.
-unsanitaryTag :: [Char]
- -> GenParser tok ParserState Bool
-unsanitaryTag tag = do
- st <- getState
- return $ stateSanitizeHTML st && tag `notElem` sanitaryTags
-
--- | returns @True@ if sanitization is specified and the specified attribute
--- is not on the sanitized attribute list.
-unsanitaryAttribute :: ([Char], String, t)
- -> GenParser tok ParserState Bool
-unsanitaryAttribute (attr, val, _) = do
- st <- getState
- return $ stateSanitizeHTML st &&
- (attr `notElem` sanitaryAttributes ||
- (attr `elem` ["href","src"] && unsanitaryURI val))
-
--- | Returns @True@ if the specified URI is potentially a security risk.
-unsanitaryURI :: String -> Bool
-unsanitaryURI u =
- let safeURISchemes = [ "", "http:", "https:", "ftp:", "mailto:", "file:",
- "telnet:", "gopher:", "aaa:", "aaas:", "acap:", "cap:", "cid:",
- "crid:", "dav:", "dict:", "dns:", "fax:", "go:", "h323:", "im:",
- "imap:", "ldap:", "mid:", "news:", "nfs:", "nntp:", "pop:",
- "pres:", "sip:", "sips:", "snmp:", "tel:", "urn:", "wais:",
- "xmpp:", "z39.50r:", "z39.50s:", "aim:", "callto:", "cvs:",
- "ed2k:", "feed:", "fish:", "gg:", "irc:", "ircs:", "lastfm:",
- "ldaps:", "magnet:", "mms:", "msnim:", "notes:", "rsync:",
- "secondlife:", "skype:", "ssh:", "sftp:", "smb:", "sms:",
- "snews:", "webcal:", "ymsgr:"]
- in case parseURIReference (escapeURI u) of
- Just p -> (map toLower $ uriScheme p) `notElem` safeURISchemes
- Nothing -> True
-
-- | Read blocks until end tag.
blocksTilEnd :: String -> GenParser Char ParserState [Block]
blocksTilEnd tag = do
@@ -240,10 +173,7 @@ anyHtmlTag = try $ do
char '>'
let result = "<" ++ tag ++
concatMap (\(_, _, raw) -> (' ':raw)) attribs ++ ender' ++ ">"
- unsanitary <- unsanitaryTag tag
- if unsanitary
- then return $ "<!-- unsafe HTML removed -->"
- else return result
+ return result
anyHtmlEndTag :: GenParser Char ParserState [Char]
anyHtmlEndTag = try $ do
@@ -255,10 +185,7 @@ anyHtmlEndTag = try $ do
spaces
char '>'
let result = "</" ++ tag ++ ">"
- unsanitary <- unsanitaryTag tag
- if unsanitary
- then return $ "<!-- unsafe HTML removed -->"
- else return result
+ return result
htmlTag :: Bool
-> String
@@ -294,16 +221,10 @@ quoted quoteChar = do
(many (noneOf [quoteChar]))
return (result, [quoteChar])
-nullAttribute :: ([Char], [Char], [Char])
-nullAttribute = ("", "", "")
-
htmlAttribute :: GenParser Char ParserState ([Char], [Char], [Char])
htmlAttribute = do
attr <- htmlRegularAttribute <|> htmlMinimizedAttribute
- unsanitary <- unsanitaryAttribute attr
- if unsanitary
- then return nullAttribute
- else return attr
+ return attr
-- minimized boolean attribute
htmlMinimizedAttribute :: GenParser Char st ([Char], [Char], [Char])
@@ -364,10 +285,7 @@ htmlScript = try $ do
lookAhead $ htmlOpenTag "script"
open <- anyHtmlTag
rest <- liftM concat $ manyTill scriptChunk (htmlEndTag "script")
- st <- getState
- if stateSanitizeHTML st && not ("script" `elem` sanitaryTags)
- then return "<!-- unsafe HTML removed -->"
- else return $ open ++ rest ++ "</script>"
+ return $ open ++ rest ++ "</script>"
scriptChunk :: GenParser Char ParserState [Char]
scriptChunk = jsComment <|> jsString <|> jsChars
@@ -399,10 +317,7 @@ htmlStyle = try $ do
lookAhead $ htmlOpenTag "style"
open <- anyHtmlTag
rest <- manyTill anyChar (htmlEndTag "style")
- st <- getState
- if stateSanitizeHTML st && not ("style" `elem` sanitaryTags)
- then return "<!-- unsafe HTML removed -->"
- else return $ open ++ rest ++ "</style>"
+ return $ open ++ rest ++ "</style>"
htmlBlockElement :: GenParser Char ParserState [Char]
htmlBlockElement = choice [ htmlScript, htmlStyle, htmlComment, xmlDec, definition ]
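
With the sanitization hooks removed from the HTML reader, tags, attributes, script elements, and style elements are passed through unchanged instead of being replaced by "<!-- unsafe HTML removed -->". Any filtering now has to happen outside the parse; a minimal sketch under that assumption, using a caller-supplied sanitize function (hypothetical, not provided by this patch) applied to the rendered output:

    import Text.Pandoc

    -- Hypothetical post-processing step: sanitize the rendered HTML
    -- string after conversion rather than during parsing.
    writeSanitizedHtml :: (String -> String) -> WriterOptions -> Pandoc -> String
    writeSanitizedHtml sanitize opts doc = sanitize (writeHtmlString opts doc)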
diff --git a/src/Text/Pandoc/Readers/LaTeX.hs b/src/Text/Pandoc/Readers/LaTeX.hs
index 406809dfc..0bc13d2dd 100644
--- a/src/Text/Pandoc/Readers/LaTeX.hs
+++ b/src/Text/Pandoc/Readers/LaTeX.hs
@@ -119,20 +119,15 @@ anyEnvironment = try $ do
-- | Process LaTeX preamble, extracting metadata.
processLaTeXPreamble :: GenParser Char ParserState ()
-processLaTeXPreamble = try $ manyTill
- (choice [bibliographic, comment, unknownCommand, nullBlock])
- (try (string "\\begin{document}")) >>
- spaces
+processLaTeXPreamble =
+ skipMany $ notFollowedBy' anyEnvironment >> block
-- | Parse LaTeX and return 'Pandoc'.
parseLaTeX :: GenParser Char ParserState Pandoc
parseLaTeX = do
- optional processLaTeXPreamble -- preamble might not be present (fragment)
- spaces
- blocks <- parseBlocks
spaces
- optional $ try (string "\\end{document}" >> many anyChar)
- -- might not be present (fragment)
+ blocks <- try (processLaTeXPreamble >> spaces >> environment "document")
+ <|> many block
spaces
eof
state <- getState
@@ -420,8 +415,8 @@ ignore = try $ do
unknownCommand :: GenParser Char ParserState Block
unknownCommand = try $ do
- notFollowedBy' $ choice $ map end ["itemize", "enumerate", "description",
- "document"]
+ spaces
+ notFollowedBy' $ oneOfStrings ["\\begin","\\end","\\item"]
state <- getState
when (stateParserContext state == ListItemState) $
notFollowedBy' (string "\\item")
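
The restructured parseLaTeX accepts either a full document (a preamble followed by a document environment) or a bare fragment, which falls through to many block. A quick sketch of both call patterns, assuming readLaTeX and defaultParserState are in scope:

    import Text.Pandoc

    -- Both inputs should parse: the first goes through
    -- processLaTeXPreamble and the "document" environment, the
    -- second takes the fragment fallback.
    fullDoc, fragment :: Pandoc
    fullDoc  = readLaTeX defaultParserState
                 "\\documentclass{article}\n\\begin{document}\nHello.\n\\end{document}"
    fragment = readLaTeX defaultParserState "Hello \\emph{world}."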
diff --git a/src/Text/Pandoc/Readers/Markdown.hs b/src/Text/Pandoc/Readers/Markdown.hs
index 59f825808..a68741bda 100644
--- a/src/Text/Pandoc/Readers/Markdown.hs
+++ b/src/Text/Pandoc/Readers/Markdown.hs
@@ -41,7 +41,7 @@ import Text.Pandoc.Readers.LaTeX ( rawLaTeXInline, rawLaTeXEnvironment' )
import Text.Pandoc.Readers.HTML ( rawHtmlBlock, anyHtmlBlockTag,
anyHtmlInlineTag, anyHtmlTag,
anyHtmlEndTag, htmlEndTag, extractTagType,
- htmlBlockElement, htmlComment, unsanitaryURI )
+ htmlBlockElement, htmlComment )
import Text.Pandoc.CharacterReferences ( decodeCharacterReferences )
import Text.ParserCombinators.Parsec
import Control.Monad (when, liftM, guard)
@@ -202,9 +202,17 @@ referenceKey = try $ do
lab <- reference
char ':'
skipSpaces >> optional newline >> skipSpaces >> notFollowedBy (char '[')
- let sourceURL excludes = many $
- optional (char '\\') >> (noneOf (' ':excludes) <|> (notFollowedBy' referenceTitle >> char ' '))
- src <- try (char '<' >> sourceURL ">\t\n" >>~ char '>') <|> sourceURL "\t\n"
+ let nl = char '\n' >> notFollowedBy blankline >> return ' '
+ let sourceURL = liftM unwords $ many $ try $ do
+ notFollowedBy' referenceTitle
+ skipMany (oneOf " \t")
+ optional nl
+ notFollowedBy' reference
+ skipMany (oneOf " \t")
+ many1 (noneOf " \t\n")
+ let betweenAngles = try $ char '<' >>
+ manyTill (noneOf ">\n" <|> nl) (char '>')
+ src <- try betweenAngles <|> sourceURL
tit <- option "" referenceTitle
blanklines
endPos <- getPosition
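
The rewritten sourceURL collects the reference URL word by word: a newline not followed by a blank line counts as a space, the chunks are rejoined with unwords, and collection stops before a following reference title or another reference definition. A hedged example of input this parser is meant to handle, assuming readMarkdown and defaultParserState are in scope:

    import Text.Pandoc

    -- The title may sit on its own line after the URL; the
    -- notFollowedBy' referenceTitle check ends the URL before it.
    example :: Pandoc
    example = readMarkdown defaultParserState $ unlines
      [ "See [the docs][ref]."
      , ""
      , "[ref]: http://example.com/docs"
      , "    \"Documentation\""
      ]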
@@ -650,16 +658,14 @@ isHtmlOrBlank _ = False
para :: GenParser Char ParserState Block
para = try $ do
- result <- many1 inline
- if all isHtmlOrBlank result
- then fail "treat as raw HTML"
- else return ()
- newline
- blanklines <|> do st <- getState
- if stateStrict st
- then lookAhead (blockQuote <|> header) >> return ""
- else pzero
- return $ Para $ normalizeSpaces result
+ result <- liftM normalizeSpaces $ many1 inline
+ guard $ not . all isHtmlOrBlank $ result
+ option (Plain result) $ try $ do
+ newline
+ blanklines <|>
+ (getState >>= guard . stateStrict >>
+ lookAhead (blockQuote <|> header) >> return "")
+ return $ Para result
plain :: GenParser Char ParserState Block
plain = many1 inline >>~ spaces >>= return . Plain . normalizeSpaces
@@ -1085,15 +1091,13 @@ endline = try $ do
newline
notFollowedBy blankline
st <- getState
- if stateStrict st
- then do notFollowedBy emailBlockQuoteStart
- notFollowedBy (char '#') -- atx header
- else return ()
+ when (stateStrict st) $ do
+ notFollowedBy emailBlockQuoteStart
+ notFollowedBy (char '#') -- atx header
-- parse potential list-starts differently if in a list:
- if stateParserContext st == ListItemState
- then notFollowedBy' (bulletListStart <|>
- (anyOrderedListStart >> return ()))
- else return ()
+ when (stateParserContext st == ListItemState) $ do
+ notFollowedBy' bulletListStart
+ notFollowedBy' anyOrderedListStart
return Space
--
@@ -1118,9 +1122,16 @@ source =
source' :: GenParser Char st (String, [Char])
source' = do
skipSpaces
- let sourceURL excludes = many $
- optional (char '\\') >> (noneOf (' ':excludes) <|> (notFollowedBy' linkTitle >> char ' '))
- src <- try (char '<' >> sourceURL ">\t\n" >>~ char '>') <|> sourceURL "\t\n"
+ let nl = char '\n' >>~ notFollowedBy blankline
+ let sourceURL = liftM unwords $ many $ try $ do
+ notFollowedBy' linkTitle
+ skipMany (oneOf " \t")
+ optional nl
+ skipMany (oneOf " \t")
+ many1 (noneOf " \t\n")
+ let betweenAngles = try $ char '<' >>
+ manyTill (noneOf ">\n" <|> nl) (char '>')
+ src <- try betweenAngles <|> sourceURL
tit <- option "" linkTitle
skipSpaces
eof
@@ -1139,10 +1150,7 @@ link :: GenParser Char ParserState Inline
link = try $ do
lab <- reference
(src, tit) <- source <|> referenceLink lab
- sanitize <- getState >>= return . stateSanitizeHTML
- if sanitize && unsanitaryURI src
- then fail "Unsanitary URI"
- else return $ Link lab (src, tit)
+ return $ Link lab (src, tit)
-- a link like [this][ref] or [this][] or [this]
referenceLink :: [Inline]
@@ -1162,12 +1170,9 @@ autoLink = try $ do
(orig, src) <- uri <|> emailAddress
char '>'
st <- getState
- let sanitize = stateSanitizeHTML st
- if sanitize && unsanitaryURI src
- then fail "Unsanitary URI"
- else return $ if stateStrict st
- then Link [Str orig] (src, "")
- else Link [Code orig] (src, "")
+ return $ if stateStrict st
+ then Link [Str orig] (src, "")
+ else Link [Code orig] (src, "")
image :: GenParser Char ParserState Inline
image = try $ do