Diffstat (limited to 'src/Text')
-rw-r--r--  src/Text/Pandoc.hs                  | 45
-rw-r--r--  src/Text/Pandoc/Parsing.hs          |  2
-rw-r--r--  src/Text/Pandoc/Readers/HTML.hs     | 95
-rw-r--r--  src/Text/Pandoc/Readers/LaTeX.hs    | 17
-rw-r--r--  src/Text/Pandoc/Readers/Markdown.hs | 75
5 files changed, 96 insertions, 138 deletions
diff --git a/src/Text/Pandoc.hs b/src/Text/Pandoc.hs
index ab1e3cd03..7d3468461 100644
--- a/src/Text/Pandoc.hs
+++ b/src/Text/Pandoc.hs
@@ -57,6 +57,9 @@ module Text.Pandoc
                (
                -- * Definitions
                  module Text.Pandoc.Definition
+               -- * Lists of readers and writers
+               , readers
+               , writers
                -- * Readers: converting /to/ Pandoc format
                , readMarkdown
                , readRST
@@ -127,8 +130,50 @@ import Text.Pandoc.Templates
 import Text.Pandoc.Parsing
 import Text.Pandoc.Shared
 import Data.Version (showVersion)
+import Text.JSON.Generic
 import Paths_pandoc (version)
 
 -- | Version number of pandoc library.
 pandocVersion :: String
 pandocVersion = showVersion version
+
+-- | Association list of formats and readers.
+readers :: [(String, ParserState -> String -> Pandoc)]
+readers = [("native"       , \_ -> read)
+          ,("json"         , \_ -> decodeJSON)
+          ,("markdown"     , readMarkdown)
+          ,("markdown+lhs" , readMarkdown)
+          ,("rst"          , readRST)
+          ,("textile"      , readTextile) -- TODO : textile+lhs
+          ,("rst+lhs"      , readRST)
+          ,("html"         , readHtml)
+          ,("latex"        , readLaTeX)
+          ,("latex+lhs"    , readLaTeX)
+          ]
+
+-- | Association list of formats and writers (omitting the
+-- binary writers, odt and epub).
+writers :: [ ( String, WriterOptions -> Pandoc -> String ) ]
+writers = [("native"       , writeNative)
+          ,("json"         , \_ -> encodeJSON)
+          ,("html"         , writeHtmlString)
+          ,("html+lhs"     , writeHtmlString)
+          ,("s5"           , writeHtmlString)
+          ,("slidy"        , writeHtmlString)
+          ,("docbook"      , writeDocbook)
+          ,("opendocument" , writeOpenDocument)
+          ,("latex"        , writeLaTeX)
+          ,("latex+lhs"    , writeLaTeX)
+          ,("context"      , writeConTeXt)
+          ,("texinfo"      , writeTexinfo)
+          ,("man"          , writeMan)
+          ,("markdown"     , writeMarkdown)
+          ,("markdown+lhs" , writeMarkdown)
+          ,("plain"        , writePlain)
+          ,("rst"          , writeRST)
+          ,("rst+lhs"      , writeRST)
+          ,("mediawiki"    , writeMediaWiki)
+          ,("textile"      , writeTextile)
+          ,("rtf"          , writeRTF)
+          ,("org"          , writeOrg)
+          ]
diff --git a/src/Text/Pandoc/Parsing.hs b/src/Text/Pandoc/Parsing.hs
index 1b206e4c7..48c6aa70d 100644
--- a/src/Text/Pandoc/Parsing.hs
+++ b/src/Text/Pandoc/Parsing.hs
@@ -586,7 +586,6 @@ data ParserState = ParserState
     { stateParseRaw        :: Bool,          -- ^ Parse raw HTML and LaTeX?
       stateParserContext   :: ParserContext, -- ^ Inside list?
       stateQuoteContext    :: QuoteContext,  -- ^ Inside quoted environment?
-      stateSanitizeHTML    :: Bool,          -- ^ Sanitize HTML?
       stateKeys            :: KeyTable,      -- ^ List of reference keys
       stateCitations       :: [String],      -- ^ List of available citations
       stateNotes           :: NoteTable,     -- ^ List of notes
@@ -614,7 +613,6 @@ defaultParserState =
     ParserState { stateParseRaw        = False,
                   stateParserContext   = NullState,
                   stateQuoteContext    = NoQuote,
-                  stateSanitizeHTML    = False,
                   stateKeys            = M.empty,
                   stateCitations       = [],
                   stateNotes           = [],
diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs
index f05fdd57b..ed026eb49 100644
--- a/src/Text/Pandoc/Readers/HTML.hs
+++ b/src/Text/Pandoc/Readers/HTML.hs
@@ -40,7 +40,6 @@ module Text.Pandoc.Readers.HTML (
                                   extractTagType,
                                   htmlBlockElement,
                                   htmlComment,
-                                  unsanitaryURI
                                 ) where
 
 import Text.ParserCombinators.Parsec
@@ -51,7 +50,6 @@ import Text.Pandoc.CharacterReferences ( decodeCharacterReferences )
 import Data.Maybe ( fromMaybe )
 import Data.List ( isPrefixOf, isSuffixOf, intercalate )
 import Data.Char ( toLower, isAlphaNum )
-import Network.URI ( parseURIReference, URI (..) )
 import Control.Monad ( liftM, when )
 
 -- | Convert HTML-formatted string to 'Pandoc' document.
@@ -85,36 +83,6 @@ blockHtmlTags = ["address", "blockquote", "body", "center", "dir", "div",
                 "dt", "frameset", "li", "tbody", "td", "tfoot", "th",
                 "thead", "tr", "script", "style"]
 
-sanitaryTags :: [[Char]]
-sanitaryTags = ["a", "abbr", "acronym", "address", "area", "b", "big",
-                "blockquote", "br", "button", "caption", "center",
-                "cite", "code", "col", "colgroup", "dd", "del", "dfn",
-                "dir", "div", "dl", "dt", "em", "fieldset", "font",
-                "form", "h1", "h2", "h3", "h4", "h5", "h6", "hr",
-                "i", "img", "input", "ins", "kbd", "label", "legend",
-                "li", "map", "menu", "ol", "optgroup", "option", "p",
-                "pre", "q", "s", "samp", "select", "small", "span",
-                "strike", "strong", "sub", "sup", "table", "tbody",
-                "td", "textarea", "tfoot", "th", "thead", "tr", "tt",
-                "u", "ul", "var"]
-
-sanitaryAttributes :: [[Char]]
-sanitaryAttributes = ["abbr", "accept", "accept-charset",
-                      "accesskey", "action", "align", "alt", "axis",
-                      "border", "cellpadding", "cellspacing", "char",
-                      "charoff", "charset", "checked", "cite", "class",
-                      "clear", "cols", "colspan", "color", "compact",
-                      "coords", "datetime", "dir", "disabled",
-                      "enctype", "for", "frame", "headers", "height",
-                      "href", "hreflang", "hspace", "id", "ismap",
-                      "label", "lang", "longdesc", "maxlength", "media",
-                      "method", "multiple", "name", "nohref", "noshade",
-                      "nowrap", "prompt", "readonly", "rel", "rev",
-                      "rows", "rowspan", "rules", "scope", "selected",
-                      "shape", "size", "span", "src", "start",
-                      "summary", "tabindex", "target", "title", "type",
-                      "usemap", "valign", "value", "vspace", "width"]
-
 -- taken from HXT and extended
 
 closes :: String -> String -> Bool
@@ -153,41 +121,6 @@ _ `closes` _ = False
 -- HTML utility functions
 --
 
--- | Returns @True@ if sanitization is specified and the specified tag is
--- not on the sanitized tag list.
-unsanitaryTag :: [Char]
-              -> GenParser tok ParserState Bool
-unsanitaryTag tag = do
-  st <- getState
-  return $ stateSanitizeHTML st && tag `notElem` sanitaryTags
-
--- | returns @True@ if sanitization is specified and the specified attribute
--- is not on the sanitized attribute list.
-unsanitaryAttribute :: ([Char], String, t)
-                    -> GenParser tok ParserState Bool
-unsanitaryAttribute (attr, val, _) = do
-  st <- getState
-  return $ stateSanitizeHTML st &&
-           (attr `notElem` sanitaryAttributes ||
-            (attr `elem` ["href","src"] && unsanitaryURI val))
-
--- | Returns @True@ if the specified URI is potentially a security risk.
-unsanitaryURI :: String -> Bool
-unsanitaryURI u =
-  let safeURISchemes = [ "", "http:", "https:", "ftp:", "mailto:", "file:",
-         "telnet:", "gopher:", "aaa:", "aaas:", "acap:", "cap:", "cid:",
-         "crid:", "dav:", "dict:", "dns:", "fax:", "go:", "h323:", "im:",
-         "imap:", "ldap:", "mid:", "news:", "nfs:", "nntp:", "pop:",
-         "pres:", "sip:", "sips:", "snmp:", "tel:", "urn:", "wais:",
-         "xmpp:", "z39.50r:", "z39.50s:", "aim:", "callto:", "cvs:",
-         "ed2k:", "feed:", "fish:", "gg:", "irc:", "ircs:", "lastfm:",
-         "ldaps:", "magnet:", "mms:", "msnim:", "notes:", "rsync:",
-         "secondlife:", "skype:", "ssh:", "sftp:", "smb:", "sms:",
-         "snews:", "webcal:", "ymsgr:"]
-  in case parseURIReference (escapeURI u) of
-        Just p  -> (map toLower $ uriScheme p) `notElem` safeURISchemes
-        Nothing -> True
-
 -- | Read blocks until end tag.
 blocksTilEnd :: String -> GenParser Char ParserState [Block]
 blocksTilEnd tag = do
@@ -240,10 +173,7 @@ anyHtmlTag = try $ do
   char '>'
   let result = "<" ++ tag ++
                concatMap (\(_, _, raw) -> (' ':raw)) attribs ++ ender' ++ ">"
-  unsanitary <- unsanitaryTag tag
-  if unsanitary
-     then return $ "<!-- unsafe HTML removed -->"
-     else return result
+  return result
 
 anyHtmlEndTag :: GenParser Char ParserState [Char]
 anyHtmlEndTag = try $ do
@@ -255,10 +185,7 @@ anyHtmlEndTag = try $ do
   spaces
   char '>'
   let result = "</" ++ tag ++ ">"
-  unsanitary <- unsanitaryTag tag
-  if unsanitary
-     then return $ "<!-- unsafe HTML removed -->"
-     else return result
+  return result
 
 htmlTag :: Bool
         -> String
@@ -294,16 +221,10 @@ quoted quoteChar = do
             (many (noneOf [quoteChar]))
   return (result, [quoteChar])
 
-nullAttribute :: ([Char], [Char], [Char])
-nullAttribute = ("", "", "")
-
 htmlAttribute :: GenParser Char ParserState ([Char], [Char], [Char])
 htmlAttribute = do
   attr <- htmlRegularAttribute <|> htmlMinimizedAttribute
-  unsanitary <- unsanitaryAttribute attr
-  if unsanitary
-     then return nullAttribute
-     else return attr
+  return attr
 
 -- minimized boolean attribute
 htmlMinimizedAttribute :: GenParser Char st ([Char], [Char], [Char])
@@ -364,10 +285,7 @@ htmlScript = try $ do
   lookAhead $ htmlOpenTag "script"
   open <- anyHtmlTag
   rest <- liftM concat $ manyTill scriptChunk (htmlEndTag "script")
-  st <- getState
-  if stateSanitizeHTML st && not ("script" `elem` sanitaryTags)
-     then return "<!-- unsafe HTML removed -->"
-     else return $ open ++ rest ++ "</script>"
+  return $ open ++ rest ++ "</script>"
 
 scriptChunk :: GenParser Char ParserState [Char]
 scriptChunk = jsComment <|> jsString <|> jsChars
@@ -399,10 +317,7 @@ htmlStyle = try $ do
   lookAhead $ htmlOpenTag "style"
   open <- anyHtmlTag
   rest <- manyTill anyChar (htmlEndTag "style")
-  st <- getState
-  if stateSanitizeHTML st && not ("style" `elem` sanitaryTags)
-     then return "<!-- unsafe HTML removed -->"
-     else return $ open ++ rest ++ "</style>"
+  return $ open ++ rest ++ "</style>"
 
 htmlBlockElement :: GenParser Char ParserState [Char]
 htmlBlockElement = choice [ htmlScript, htmlStyle, htmlComment, xmlDec, definition ]
diff --git a/src/Text/Pandoc/Readers/LaTeX.hs b/src/Text/Pandoc/Readers/LaTeX.hs
index 406809dfc..0bc13d2dd 100644
--- a/src/Text/Pandoc/Readers/LaTeX.hs
+++ b/src/Text/Pandoc/Readers/LaTeX.hs
@@ -119,20 +119,15 @@ anyEnvironment = try $ do
 
 -- | Process LaTeX preamble, extracting metadata.
 processLaTeXPreamble :: GenParser Char ParserState ()
-processLaTeXPreamble = try $ manyTill
-                       (choice [bibliographic, comment, unknownCommand, nullBlock])
-                       (try (string "\\begin{document}")) >>
-                       spaces
+processLaTeXPreamble =
+  skipMany $ notFollowedBy' anyEnvironment >> block
 
 -- | Parse LaTeX and return 'Pandoc'.
 parseLaTeX :: GenParser Char ParserState Pandoc
 parseLaTeX = do
-  optional processLaTeXPreamble -- preamble might not be present (fragment)
-  spaces
-  blocks <- parseBlocks
   spaces
-  optional $ try (string "\\end{document}" >> many anyChar)
-          -- might not be present (fragment)
+  blocks <- try (processLaTeXPreamble >> spaces >> environment "document")
+              <|> many block
   spaces
   eof
   state <- getState
@@ -420,8 +415,8 @@ ignore = try $ do
 
 unknownCommand :: GenParser Char ParserState Block
 unknownCommand = try $ do
-  notFollowedBy' $ choice $ map end ["itemize", "enumerate", "description",
-                                     "document"]
+  spaces
+  notFollowedBy' $ oneOfStrings ["\\begin","\\end","\\item"]
   state <- getState
   when (stateParserContext state == ListItemState) $
      notFollowedBy' (string "\\item")
diff --git a/src/Text/Pandoc/Readers/Markdown.hs b/src/Text/Pandoc/Readers/Markdown.hs
index 59f825808..a68741bda 100644
--- a/src/Text/Pandoc/Readers/Markdown.hs
+++ b/src/Text/Pandoc/Readers/Markdown.hs
@@ -41,7 +41,7 @@ import Text.Pandoc.Readers.LaTeX ( rawLaTeXInline, rawLaTeXEnvironment' )
 import Text.Pandoc.Readers.HTML ( rawHtmlBlock, anyHtmlBlockTag,
                                   anyHtmlInlineTag, anyHtmlTag,
                                   anyHtmlEndTag, htmlEndTag, extractTagType,
-                                  htmlBlockElement, htmlComment, unsanitaryURI )
+                                  htmlBlockElement, htmlComment )
 import Text.Pandoc.CharacterReferences ( decodeCharacterReferences )
 import Text.ParserCombinators.Parsec
 import Control.Monad (when, liftM, guard)
@@ -202,9 +202,17 @@ referenceKey = try $ do
   lab <- reference
   char ':'
   skipSpaces >> optional newline >> skipSpaces >> notFollowedBy (char '[')
-  let sourceURL excludes = many $
-        optional (char '\\') >> (noneOf (' ':excludes) <|> (notFollowedBy' referenceTitle >> char ' '))
-  src <- try (char '<' >> sourceURL ">\t\n" >>~ char '>') <|> sourceURL "\t\n"
+  let nl = char '\n' >> notFollowedBy blankline >> return ' '
+  let sourceURL = liftM unwords $ many $ try $ do
+                    notFollowedBy' referenceTitle
+                    skipMany (oneOf " \t")
+                    optional nl
+                    notFollowedBy' reference
+                    skipMany (oneOf " \t")
+                    many1 (noneOf " \t\n")
+  let betweenAngles = try $ char '<' >>
+                      manyTill (noneOf ">\n" <|> nl) (char '>')
+  src <- try betweenAngles <|> sourceURL
   tit <- option "" referenceTitle
   blanklines
   endPos <- getPosition
@@ -650,16 +658,14 @@ isHtmlOrBlank _ = False
 
 para :: GenParser Char ParserState Block
 para = try $ do
-  result <- many1 inline
-  if all isHtmlOrBlank result
-     then fail "treat as raw HTML"
-     else return ()
-  newline
-  blanklines <|> do st <- getState
-                    if stateStrict st
-                       then lookAhead (blockQuote <|> header) >> return ""
-                       else pzero
-  return $ Para $ normalizeSpaces result
+  result <- liftM normalizeSpaces $ many1 inline
+  guard $ not . all isHtmlOrBlank $ result
+  option (Plain result) $ try $ do
+            newline
+            blanklines <|>
+              (getState >>= guard . stateStrict >>
+               lookAhead (blockQuote <|> header) >> return "")
+            return $ Para result
 
 plain :: GenParser Char ParserState Block
 plain = many1 inline >>~ spaces >>= return . Plain . normalizeSpaces
@@ -1085,15 +1091,13 @@ endline = try $ do
   newline
   notFollowedBy blankline
   st <- getState
-  if stateStrict st
-     then do notFollowedBy emailBlockQuoteStart
-             notFollowedBy (char '#')  -- atx header
-     else return ()
+  when (stateStrict st) $ do
+    notFollowedBy emailBlockQuoteStart
+    notFollowedBy (char '#')  -- atx header
   -- parse potential list-starts differently if in a list:
-  if stateParserContext st == ListItemState
-     then notFollowedBy' (bulletListStart <|>
-                          (anyOrderedListStart >> return ()))
-     else return ()
+  when (stateParserContext st == ListItemState) $ do
+    notFollowedBy' bulletListStart
+    notFollowedBy' anyOrderedListStart
   return Space
 
 --
@@ -1118,9 +1122,16 @@ source =
 source' :: GenParser Char st (String, [Char])
 source' = do
   skipSpaces
-  let sourceURL excludes = many $
-        optional (char '\\') >> (noneOf (' ':excludes) <|> (notFollowedBy' linkTitle >> char ' '))
-  src <- try (char '<' >> sourceURL ">\t\n" >>~ char '>') <|> sourceURL "\t\n"
+  let nl = char '\n' >>~ notFollowedBy blankline
+  let sourceURL = liftM unwords $ many $ try $ do
+                    notFollowedBy' linkTitle
+                    skipMany (oneOf " \t")
+                    optional nl
+                    skipMany (oneOf " \t")
+                    many1 (noneOf " \t\n")
+  let betweenAngles = try $ char '<' >>
+                      manyTill (noneOf ">\n" <|> nl) (char '>')
+  src <- try betweenAngles <|> sourceURL
   tit <- option "" linkTitle
   skipSpaces
   eof
@@ -1139,10 +1150,7 @@ link :: GenParser Char ParserState Inline
 link = try $ do
   lab <- reference
   (src, tit) <- source <|> referenceLink lab
-  sanitize <- getState >>= return . stateSanitizeHTML
-  if sanitize && unsanitaryURI src
-     then fail "Unsanitary URI"
-     else return $ Link lab (src, tit)
+  return $ Link lab (src, tit)
 
 -- a link like [this][ref] or [this][] or [this]
 referenceLink :: [Inline]
@@ -1162,12 +1170,9 @@ autoLink = try $ do
   (orig, src) <- uri <|> emailAddress
   char '>'
   st <- getState
-  let sanitize = stateSanitizeHTML st
-  if sanitize && unsanitaryURI src
-     then fail "Unsanitary URI"
-     else return $ if stateStrict st
-                      then Link [Str orig] (src, "")
-                      else Link [Code orig] (src, "")
+  return $ if stateStrict st
+              then Link [Str orig] (src, "")
+              else Link [Code orig] (src, "")
 
 image :: GenParser Char ParserState Inline
 image = try $ do
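
Usage note (not part of the patch): the readers and writers association lists added to Text.Pandoc above let a caller dispatch on a format name at run time instead of hard-coding a reader or writer. A minimal sketch, assuming Text.Pandoc also exports defaultParserState and defaultWriterOptions; the helper name convertDoc is hypothetical:

import Text.Pandoc

-- Hypothetical helper: look up a reader and a writer by format name
-- in the new association lists and apply them with default options.
convertDoc :: String -> String -> String -> Maybe String
convertDoc from to input = do
  reader <- lookup from readers   -- e.g. ("markdown", readMarkdown)
  writer <- lookup to writers     -- e.g. ("rst", writeRST)
  return $ writer defaultWriterOptions (reader defaultParserState input)

main :: IO ()
main = interact $ \src ->
  case convertDoc "markdown" "rst" src of
    Just out -> out
    Nothing  -> error "unknown format name"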