initial import

git-svn-id: https://pandoc.googlecode.com/svn/trunk@2 788f1e2b-df1e-0410-8736-df70ead52e1b
author: fiddlosopher <fiddlosopher@788f1e2b-df1e-0410-8736-df70ead52e1b> 2006-10-17 14:22:29 +0000
committer: fiddlosopher <fiddlosopher@788f1e2b-df1e-0410-8736-df70ead52e1b> 2006-10-17 14:22:29 +0000
commit: df7b68225101966051f8b592a27127bf789eb81e (patch)
tree: a063e97ed58d0bdb2cbb5a95c3e8c1bcce54aa00 /src/Text/Pandoc/Readers
parent: e7dbfef4d8aa528d9245424e9c372e900a774c90 (diff)
download: pandoc-df7b68225101966051f8b592a27127bf789eb81e.tar.gz
4 files changed, 2245 insertions, 0 deletions
diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs
new file mode 100644
index 000000000..054d9eb72
--- /dev/null
+++ b/src/Text/Pandoc/Readers/HTML.hs
@@ -0,0 +1,434 @@
+-- | Converts HTML to 'Pandoc' document.
+module Text.Pandoc.Readers.HTML ( 
+                                 readHtml, 
+                                 rawHtmlInline, 
+                                 rawHtmlBlock, 
+                                 anyHtmlBlockTag, 
+                                 anyHtmlInlineTag  
+                                ) where
+
+import Text.Regex ( matchRegex, mkRegex )
+import Text.ParserCombinators.Parsec
+import Text.ParserCombinators.Pandoc
+import Text.Pandoc.Definition
+import Text.Pandoc.Shared 
+import Text.Pandoc.HtmlEntities ( decodeEntities, htmlEntityToChar )
+import Maybe ( fromMaybe )
+import Char ( toUpper, toLower )
+
+-- | Convert HTML-formatted string to 'Pandoc' document.
+readHtml :: ParserState   -- ^ Parser state
+         -> String        -- ^ String to parse
+         -> Pandoc
+readHtml = readWith parseHtml
+
+-- for testing 
+testString :: String -> IO ()
+testString = testStringWith parseHtml
+
+--
+-- Constants
+--
+
+inlineHtmlTags = ["a", "abbr", "acronym", "b", "basefont", "bdo", "big", "br", "cite", 
+                  "code", "dfn", "em", "font", "i", "img", "input", "kbd", "label", "q", 
+                  "s", "samp", "select", "small", "span", "strike", "strong", "sub", 
+                  "sup", "textarea", "tt", "u", "var"]
+
+--
+-- HTML utility functions
+--
+
+-- | Read blocks until end tag.
+blocksTilEnd tag = try (do
+  blocks <- manyTill (do {b <- block; spaces; return b}) (htmlEndTag tag)
+  return blocks)
+
+-- | Read inlines until end tag.
+inlinesTilEnd tag = try (do
+  inlines <- manyTill inline (htmlEndTag tag)
+  return inlines)
+
+-- extract type from a tag:  e.g. br from <br>, < br >, </br>, etc.
+extractTagType tag = case (matchRegex (mkRegex  "<[[:space:]]*/?([A-Za-z0-9]+)") tag) of
+                       Just [match]   -> (map toLower match)
+                       Nothing        -> ""
+
+anyHtmlTag = try (do
+  char '<'
+  spaces
+  tag <- many1 alphaNum
+  attribs <- htmlAttributes  
+  spaces
+  ender <- option "" (string "/")
+  let ender' = if (null ender) then "" else " /"
+  spaces
+  char '>'
+  return ("<" ++ tag ++ attribs ++ ender' ++ ">"))
+
+anyHtmlEndTag = try (do
+  char '<'   
+  spaces
+  char '/'
+  spaces
+  tagType <- many1 alphaNum
+  spaces
+  char '>'
+  return ("</" ++ tagType ++ ">"))
+
+htmlTag :: String -> GenParser Char st (String, [(String, String)])
+htmlTag tag = try (do
+  char '<'
+  spaces
+  stringAnyCase tag
+  attribs <- many htmlAttribute
+  spaces
+  option "" (string "/")
+  spaces
+  char '>'
+  return (tag, (map (\(name, content, raw) -> (name, content)) attribs)))
+
+-- parses a quoted html attribute value
+quoted quoteChar = do
+  result <- between (char quoteChar) (char quoteChar) (many (noneOf [quoteChar]))
+  return (result, [quoteChar])
+
+htmlAttributes = do
+  attrList <- many htmlAttribute
+  return (concatMap (\(name, content, raw) -> raw) attrList)
+
+htmlAttribute = htmlRegularAttribute <|> htmlMinimizedAttribute
+
+-- minimized boolean attribute (no = and value)
+htmlMinimizedAttribute = try (do
+  spaces
+  name <- many1 (choice [letter, oneOf ".-_:"])
+  spaces
+  notFollowedBy (char '=')
+  let content = name
+  return (name, content, (" " ++ name)))
+
+htmlRegularAttribute = try (do
+  spaces
+  name <- many1 (choice [letter, oneOf ".-_:"])
+  spaces
+  char '='
+  spaces
+  (content, quoteStr) <- choice [ (quoted '\''), 
+                                  (quoted '"'), 
+                                  (do{ a <- (many (alphaNum <|> (oneOf "-._:"))); 
+                                       return (a,"")} ) ]
+  return (name, content, (" " ++ name ++ "=" ++ quoteStr ++ content ++ quoteStr)))
+
+htmlEndTag tag = try (do
+  char '<'   
+  spaces
+  char '/'
+  spaces
+  stringAnyCase tag
+  spaces
+  char '>'
+  return ("</" ++ tag ++ ">"))
+
+-- | Returns @True@ if the tag is an inline tag.
+isInline tag = (extractTagType tag) `elem` inlineHtmlTags
+
+anyHtmlBlockTag = try (do
+  tag <- choice [anyHtmlTag, anyHtmlEndTag]
+  if isInline tag then
+      fail "inline tag"
+    else
+      return tag)
+
+anyHtmlInlineTag = try (do
+  tag <- choice [ anyHtmlTag, anyHtmlEndTag ]
+  if isInline tag then
+      return tag
+    else
+      fail "not an inline tag")
+
+-- scripts must be treated differently, because they can contain <> etc.
+htmlScript = try (do
+  open <- string "<script"
+  rest <- manyTill anyChar (htmlEndTag "script")
+  return (open ++ rest ++ "</script>"))
+
+rawHtmlBlock = do
+  notFollowedBy (do {choice [htmlTag "/body", htmlTag "/html"]; return ' '})
+  body <- choice [htmlScript, anyHtmlBlockTag, htmlComment, xmlDec, definition]
+  sp <- (many space)
+  state <- getState
+  if stateParseRaw state then
+      return (RawHtml (body ++ sp))
+    else
+      return Null
+
+htmlComment = try (do
+  string "<!--"
+  comment <- manyTill anyChar (try (string "-->"))
+  return ("<!--" ++ comment ++ "-->"))
+
+--
+-- parsing documents
+--
+
+xmlDec = try (do
+  string "<?"
+  rest <- manyTill anyChar (char '>')
+  return ("<?" ++ rest ++ ">"))
+
+definition = try (do
+  string "<!"
+  rest <- manyTill anyChar (char '>')
+  return ("<!" ++ rest ++ ">"))
+
+nonTitleNonHead = try (do
+  notFollowedBy' (htmlTag "title")
+  notFollowedBy' (htmlTag "/head")
+  result <- choice [do {rawHtmlBlock; return ' '}, anyChar]
+  return result)
+
+parseTitle = try (do
+  (tag, attribs) <- htmlTag "title"
+  contents <- inlinesTilEnd tag
+  spaces
+  return contents)
+
+-- parse header and return meta-information (for now, just title)
+parseHead = try (do
+  htmlTag "head"
+  spaces
+  skipMany nonTitleNonHead
+  contents <- option [] parseTitle
+  skipMany nonTitleNonHead
+  htmlTag "/head"
+  return (contents, [], ""))
+
+skipHtmlTag tag = option ("",[]) (htmlTag tag)
+
+-- h1 class="title" representation of title in body
+bodyTitle = try (do
+  (tag, attribs) <- htmlTag "h1"  
+  cl <- case (extractAttribute "class" attribs) of
+          Just "title" -> do {return ""}
+          otherwise    -> fail "not title"
+  inlinesTilEnd "h1"
+  return "")
+
+parseHtml = do
+  sepEndBy (choice [xmlDec, definition, htmlComment]) spaces
+  skipHtmlTag "html"
+  spaces
+  (title, authors, date) <- option ([], [], "") parseHead 
+  spaces
+  skipHtmlTag "body"
+  spaces
+  option "" bodyTitle  -- skip title in body, because it's represented in meta
+  blocks <- parseBlocks
+  spaces
+  option "" (htmlEndTag "body")
+  spaces
+  option "" (htmlEndTag "html")
+  many anyChar -- ignore anything after </html>
+  eof
+  state <- getState
+  let keyBlocks = stateKeyBlocks state 
+  return (Pandoc (Meta title authors date) (blocks ++ (reverse keyBlocks)))
+
+--
+-- parsing blocks
+--
+
+parseBlocks = do
+  spaces
+  result <- sepEndBy block spaces
+  return result
+
+block = choice [ codeBlock, header, hrule, list, blockQuote, para, plain, 
+                 rawHtmlBlock ] <?> "block"
+
+--
+-- header blocks
+--
+
+header = choice (map headerLevel (enumFromTo 1 5)) <?> "header"
+
+headerLevel n = try (do
+    let level = "h" ++ show n
+    (tag, attribs) <- htmlTag level
+    contents <- inlinesTilEnd level
+    return (Header n (normalizeSpaces contents)))
+
+--
+-- hrule block
+--
+
+hrule = try (do
+  (tag, attribs) <- htmlTag "hr"
+  state <- getState
+  if (not (null attribs)) && (stateParseRaw state) then
+       unexpected "attributes in hr"  -- in this case we want to parse it as raw html
+     else
+       return HorizontalRule)
+
+--
+-- code blocks
+--
+
+codeBlock = choice [ preCodeBlock, bareCodeBlock ] <?> "code block"
+
+preCodeBlock = try (do
+    htmlTag "pre" 
+    spaces
+    htmlTag "code"
+    result <- manyTill anyChar (htmlEndTag "code")
+    spaces
+    htmlEndTag "pre"
+    return (CodeBlock (decodeEntities result)))
+
+bareCodeBlock = try (do
+    htmlTag "code"
+    result <- manyTill anyChar (htmlEndTag "code")
+    return (CodeBlock (decodeEntities result)))
+
+--
+-- block quotes
+--
+
+blockQuote = try (do
+    tag <- htmlTag "blockquote"
+    spaces
+    blocks <- blocksTilEnd "blockquote"
+    return (BlockQuote blocks))
+
+--
+-- list blocks
+--
+
+list = choice [ bulletList, orderedList ] <?> "list"
+
+orderedList = try (do
+    tag <- htmlTag "ol"
+    spaces
+    items <- sepEndBy1 listItem spaces
+    htmlEndTag "ol"
+    return (OrderedList items))
+
+bulletList = try (do
+    tag <- htmlTag "ul"
+    spaces
+    items <- sepEndBy1 listItem spaces
+    htmlEndTag "ul"
+    return (BulletList items))
+
+listItem = try (do
+    tag <- htmlTag "li"
+    spaces
+    blocks <- blocksTilEnd "li"
+    return blocks)
+
+--
+-- paragraph block
+--
+
+para = try (do 
+  tag <- htmlTag "p"
+  result <- inlinesTilEnd "p"
+  return (Para (normalizeSpaces result)))
+
+-- 
+-- plain block
+--
+
+plain = do
+    result <- many1 inline
+    return (Plain (normalizeSpaces result))
+
+-- 
+-- inline
+--
+
+inline = choice [ text, special ] <?> "inline"
+
+text =  choice [ entity, strong, emph, code, str, linebreak, whitespace ] <?> "text"
+
+special = choice [ link, image, rawHtmlInline ] <?> "link, inline html, or image"
+
+entity = try (do
+  char '&'
+  body <- choice [(many1 letter), 
+                  (try (do{ char '#'; num <- many1 digit; return ("#" ++ num)}))]
+  char ';'
+  return (Str [fromMaybe '?' (htmlEntityToChar ("&" ++ body ++ ";"))]))
+
+code = try (do 
+  htmlTag "code"
+  result <- manyTill anyChar (htmlEndTag "code")
+  -- remove internal line breaks, leading and trailing space, and decode entities
+  let result' = decodeEntities $ removeLeadingTrailingSpace $ joinWithSep " " $ lines result 
+  return (Code result'))
+
+rawHtmlInline = do
+  result <- choice [htmlScript, anyHtmlInlineTag]
+  state <- getState
+  if stateParseRaw state then
+      return (HtmlInline result)
+    else
+      return (Str "")
+
+betweenTags tag = try (do
+  htmlTag tag
+  result <- inlinesTilEnd tag
+  return (normalizeSpaces result))
+
+emph = try (do 
+  result <- choice [betweenTags "em", betweenTags "it"]
+  return (Emph result))
+
+strong = try (do 
+  result <- choice [betweenTags "b", betweenTags "strong"]
+  return (Strong result))
+
+whitespace = do
+  many1 space
+  return Space
+
+-- hard line break
+linebreak = do
+  htmlTag "br"
+  return LineBreak 
+
+str = do 
+  result <- many1 (noneOf "<& \t\n")
+  return (Str (decodeEntities result))
+
+--
+-- links and images
+--
+
+-- extract contents of attribute (attribute names are case-insensitive)
+extractAttribute name [] = Nothing
+extractAttribute name ((attrName, contents):rest) = 
+  let name' = map toLower name 
+      attrName' = map toLower attrName in
+  if (attrName' == name') then Just contents else extractAttribute name rest
+
+link = try (do
+  (tag, attributes) <- htmlTag "a"  
+  url <- case (extractAttribute "href" attributes) of
+           Just url -> do {return url}
+           Nothing  -> fail "no href"
+  let title = fromMaybe "" (extractAttribute "title" attributes)
+  label <- inlinesTilEnd "a"
+  ref <- generateReference url title
+  return (Link (normalizeSpaces label) ref))
+
+image = try (do
+  (tag, attributes) <- htmlTag "img" 
+  url <- case (extractAttribute "src" attributes) of
+           Just url -> do {return url}
+           Nothing  -> fail "no src"
+  let title = fromMaybe "" (extractAttribute "title" attributes)
+  let alt = fromMaybe "" (extractAttribute "alt" attributes)
+  ref <- generateReference url title
+  return (Image [Str alt] ref)) 
diff --git a/src/Text/Pandoc/Readers/LaTeX.hs b/src/Text/Pandoc/Readers/LaTeX.hs
new file mode 100644
index 000000000..3bf3dfd23
--- /dev/null
+++ b/src/Text/Pandoc/Readers/LaTeX.hs
@@ -0,0 +1,585 @@
+-- | Converts LaTeX to 'Pandoc' document.
+module Text.Pandoc.Readers.LaTeX ( 
+                                  readLaTeX,
+                                  rawLaTeXInline,
+                                  rawLaTeXEnvironment
+                                 ) where
+
+import Text.ParserCombinators.Parsec
+import Text.ParserCombinators.Pandoc
+import Text.Pandoc.Definition
+import Text.Pandoc.Shared 
+import Maybe ( fromMaybe )
+import Char ( chr )
+
+-- | Parse LaTeX from string and return 'Pandoc' document.
+readLaTeX :: ParserState   -- ^ Parser state, including options for parser
+          -> String        -- ^ String to parse
+          -> Pandoc
+readLaTeX = readWith parseLaTeX
+
+-- for testing
+testString = testStringWith parseLaTeX
+
+-- characters with special meaning
+specialChars = "\\$%&^&_~#{}\n \t|<>"
+
+--
+-- utility functions
+--
+
+-- | Change quotation marks in a string back to "basic" quotes.
+normalizeQuotes :: String -> String
+normalizeQuotes = gsub "''" "\"" . gsub "`" "'"
+
+-- | Change LaTeX En dashes between digits to hyphens.
+normalizeDashes :: String -> String
+normalizeDashes = gsub "([0-9])--([0-9])" "\\1-\\2"
+
+normalizePunctuation :: String -> String
+normalizePunctuation = normalizeDashes . normalizeQuotes
+
+-- | Returns command option (between []) if any, or empty string.
+commandOpt = option "" (between (char '[') (char ']') (many1 (noneOf "]")))
+
+-- | Returns text between brackets and its matching pair.
+bracketedText = try (do
+  char '{'
+  result <- many (choice [ try (do{ char '\\'; 
+                                    b <- oneOf "{}"; 
+                                    return (['\\', b])}), -- escaped bracket
+                           count 1 (noneOf "{}"),
+                           do {text <- bracketedText; return ("{" ++ text ++ "}")} ])
+  char '}'
+  return (concat result))
+
+-- | Parses list of arguments of LaTeX command.
+commandArgs = many bracketedText
+
+-- | Parses LaTeX command, returns (name, star, option, list of arguments).
+command = try (do
+  char '\\'
+  name <- many1 alphaNum
+  star <- option "" (string "*")  -- some commands have starred versions
+  opt <- commandOpt
+  args <- commandArgs
+  return (name, star, opt, args))
+
+begin name = try (do
+  string "\\begin{"
+  string name
+  char '}'
+  option "" commandOpt 
+  option [] commandArgs
+  spaces
+  return name)
+
+end name = try (do
+  string "\\end{"
+  string name
+  char '}'
+  spaces
+  return name)
+
+-- | Returns a list of block elements containing the contents of an environment.
+environment name = try (do
+  begin name
+  spaces
+  contents <- manyTill block (end name)
+  return contents)
+
+anyEnvironment =  try (do
+  string "\\begin{"
+  name <- many alphaNum 
+  star <- option "" (string "*") -- some environments have starred variants
+  char '}'
+  option "" commandOpt 
+  option [] commandArgs
+  spaces
+  contents <- manyTill block (end (name ++ star))
+  return (BlockQuote contents))
+
+--
+-- parsing documents
+--
+
+-- | Skip everything up through \begin{document}
+skipLaTeXHeader = try (do
+  manyTill anyChar (begin "document")
+  spaces
+  return "")
+
+-- | Parse LaTeX and return 'Pandoc'.
+parseLaTeX = do
+  option "" skipLaTeXHeader            -- if parsing a fragment, this might not be present
+  blocks <- parseBlocks
+  spaces
+  option "" (string "\\end{document}") -- if parsing a fragment, this might not be present
+  spaces
+  eof
+  state <- getState
+  let keyBlocks = stateKeyBlocks state 
+  let noteBlocks = stateNoteBlocks state
+  let blocks' = filter (/= Null) blocks
+  return (Pandoc (Meta [] [] "") (blocks' ++ (reverse noteBlocks) ++ (reverse keyBlocks)))
+
+--
+-- parsing blocks
+--
+
+parseBlocks = do
+  spaces
+  result <- many block
+  return result
+
+block = choice [ hrule, codeBlock, header, list, blockQuote, mathBlock, comment, 
+                 bibliographic, para, specialEnvironment, itemBlock, unknownEnvironment, 
+                 unknownCommand ] <?> "block"
+
+--
+-- header blocks
+--
+
+header = choice (map headerLevel (enumFromTo 1 5)) <?> "header"
+
+headerLevel n = try (do
+  let subs = concat $ replicate (n - 1) "sub"
+  string ("\\" ++ subs ++ "section")
+  option ' ' (char '*')
+  char '{'
+  title <- manyTill inline (char '}')
+  spaces
+  return (Header n (normalizeSpaces title)))
+
+--
+-- hrule block
+--
+
+hrule = try (do
+  oneOfStrings [ "\\begin{center}\\rule{3in}{0.4pt}\\end{center}\n\n", "\\newpage" ]
+  spaces
+  return HorizontalRule)
+
+--
+-- code blocks
+--
+
+codeBlock = try (do
+  string "\\begin{verbatim}"  -- don't use begin function because it gobbles whitespace
+  option "" blanklines        -- we want to gobble blank lines, but not leading space
+  contents <- manyTill anyChar (try (string "\\end{verbatim}"))
+  spaces
+  return (CodeBlock (stripTrailingNewlines contents)))
+
+--
+-- block quotes
+--
+
+blockQuote = choice [ blockQuote1, blockQuote2 ] <?> "blockquote"
+
+blockQuote1 = try (do
+  blocks <- environment "quote"
+  spaces
+  return (BlockQuote blocks))
+
+blockQuote2 = try (do
+  blocks <- environment "quotation"
+  spaces
+  return (BlockQuote blocks))
+
+--
+-- math block
+--
+
+mathBlock = mathBlockWith (begin "equation") (end "equation") <|> 
+            mathBlockWith (begin "displaymath") (end "displaymath") <|>
+            mathBlockWith (string "\\[") (string "\\]") <?> "math block"
+
+mathBlockWith start end = try (do
+  start
+  spaces
+  result <- manyTill anyChar end
+  spaces
+  return (BlockQuote [Para [TeX ("$" ++ result ++ "$")]]))
+
+--
+-- list blocks
+--
+
+list = bulletList <|> orderedList <?> "list"
+
+listItem = try (do
+    ("item", _, _, _) <- command
+    spaces
+    state <- getState
+    let oldParserContext = stateParserContext state
+    updateState (\state -> state {stateParserContext = ListItemState})
+    blocks <- many block
+    updateState (\state -> state {stateParserContext = oldParserContext})
+    return blocks)
+
+orderedList = try (do
+    begin "enumerate"
+    spaces
+    items <- many listItem
+    end "enumerate"
+    spaces
+    return (OrderedList items))
+
+bulletList = try (do
+    begin "itemize"
+    spaces
+    items <- many listItem
+    end "itemize"
+    spaces
+    return (BulletList items))
+
+--
+-- paragraph block
+--
+
+para = try (do 
+  result <- many1 inline
+  spaces
+  return (Para (normalizeSpaces result)))
+
+--
+-- title authors date
+--
+
+bibliographic = choice [ maketitle, title, authors, date ]
+
+maketitle = try (do
+  string "\\maketitle"
+  spaces
+  return Null)
+
+title = try (do
+  string "\\title{"
+  tit <- manyTill inline (char '}')
+  spaces
+  updateState (\state -> state { stateTitle = tit })
+  return Null)
+
+authors = try (do
+  string "\\author{"
+  authors <- manyTill anyChar (char '}')
+  spaces
+  let authors' = map removeLeadingTrailingSpace $ lines $ gsub "\\\\" "\n" authors
+  updateState (\state -> state { stateAuthors = authors' })
+  return Null)
+
+date = try (do
+  string "\\date{"
+  date' <- manyTill anyChar (char '}')
+  spaces
+  updateState (\state -> state { stateDate = date' })
+  return Null)
+
+--
+-- item block
+-- for use in unknown environments that aren't being parsed as raw latex
+--
+
+-- this forces items to be parsed in different blocks
+itemBlock = try (do
+  ("item", _, opt, _) <- command
+  state <- getState
+  if (stateParserContext state == ListItemState) then 
+      fail "item should be handled by list block"
+    else
+      if null opt then
+          return Null
+        else
+          return (Plain [Str opt]))
+
+--
+-- raw LaTeX 
+--
+
+specialEnvironment = do  -- these are always parsed as raw
+  followedBy' (choice (map (\name -> begin name)  ["tabular", "figure", "tabbing", "eqnarry", 
+                                                   "picture", "table", "verse", "theorem"]))
+  rawLaTeXEnvironment
+
+-- | Parse any LaTeX environment and return a Para block containing 
+-- the whole literal environment as raw TeX.
+rawLaTeXEnvironment :: GenParser Char st Block
+rawLaTeXEnvironment = try (do
+  string "\\begin"
+  char '{'
+  name <- many1 alphaNum
+  star <- option "" (string "*") -- for starred variants
+  let name' = name ++ star
+  char '}'
+  opt <- option "" commandOpt 
+  args <- option [] commandArgs
+  let optStr = if (null opt) then "" else "[" ++ opt ++ "]"
+  let argStr = concatMap (\arg -> ("{" ++ arg ++ "}")) args
+  contents <- manyTill (choice [(many1 (noneOf "\\")), 
+                                (do{ (Para [TeX str]) <- rawLaTeXEnvironment; return str }),
+                                string "\\"]) (end name')
+  spaces
+  return (Para [TeX ("\\begin{" ++ name' ++ "}" ++ optStr ++ argStr ++ 
+                                (concat contents) ++ "\\end{" ++ name' ++ "}")]))
+
+unknownEnvironment = try (do
+  state <- getState
+  result <- if stateParseRaw state then  -- check to see whether we should include raw TeX
+                rawLaTeXEnvironment      -- if so, get the whole raw environment
+            else
+                anyEnvironment           -- otherwise just the contents
+  return result)
+
+unknownCommand = try (do
+  notFollowedBy' (string "\\end{itemize}")
+  notFollowedBy' (string "\\end{enumerate}")
+  notFollowedBy' (string "\\end{document}")
+  (name, star, opt, args) <- command
+  spaces
+  let optStr = if null opt then "" else "[" ++ opt ++ "]"
+  let argStr = concatMap (\arg -> ("{" ++ arg ++ "}")) args
+  state <- getState
+  if (name == "item") && ((stateParserContext state) == ListItemState) then
+      fail "should not be parsed as raw"
+    else
+      string ""
+  if stateParseRaw state then
+      return (Plain [TeX ("\\" ++ name ++ star ++ optStr ++ argStr)])
+    else
+      return (Plain [Str (joinWithSep " " args)]))
+
+-- latex comment
+comment = try (do
+  char '%'
+  result <- manyTill anyChar newline
+  spaces
+  return Null)
+
+-- 
+-- inline
+--
+
+inline =  choice [ strong, emph, ref, lab, code, linebreak, math, ldots, accentedChar, 
+                 specialChar, specialInline, escapedChar, unescapedChar, str, 
+                 endline, whitespace ] <?> "inline"
+
+specialInline = choice [ link, image, footnote, rawLaTeXInline ] <?> 
+                "link, raw TeX, note, or image"
+
+ldots = try (do
+  string "\\ldots"
+  return (Str "..."))
+
+accentedChar = normalAccentedChar <|> specialAccentedChar
+
+normalAccentedChar = try (do
+  char '\\'
+  accent <- oneOf "'`^\"~"
+  character <- choice [ between (char '{') (char '}') anyChar, anyChar ]
+  let table = fromMaybe [] $ lookup character accentTable 
+  let result = case lookup accent table of
+                 Just num  -> chr num
+                 Nothing   -> '?'
+  return (Str [result]))
+
+-- an association list of letters and association list of accents
+-- and decimal character numbers.
+accentTable = 
+  [ ('A', [('`', 192), ('\'', 193), ('^', 194), ('~', 195), ('"', 196)]),
+    ('E', [('`', 200), ('\'', 201), ('^', 202), ('"', 203)]),
+    ('I', [('`', 204), ('\'', 205), ('^', 206), ('"', 207)]),
+    ('N', [('~', 209)]),
+    ('O', [('`', 210), ('\'', 211), ('^', 212), ('~', 213), ('"', 214)]),
+    ('U', [('`', 217), ('\'', 218), ('^', 219), ('"', 220)]),
+    ('a', [('`', 224), ('\'', 225), ('^', 227), ('"', 228)]),
+    ('e', [('`', 232), ('\'', 233), ('^', 234), ('"', 235)]),
+    ('i', [('`', 236), ('\'', 237), ('^', 238), ('"', 239)]),
+    ('n', [('~', 241)]),
+    ('o', [('`', 242), ('\'', 243), ('^', 244), ('~', 245), ('"', 246)]),
+    ('u', [('`', 249), ('\'', 250), ('^', 251), ('"', 252)]) ]
+
+specialAccentedChar = choice [ ccedil, aring, iuml, szlig, aelig, oslash, pound, 
+                               euro, copyright, sect ]
+
+ccedil = try (do
+  char '\\'
+  letter <- choice [try (string "cc"), try (string "cC")]
+  let num = if letter == "cc" then 231 else 199
+  return (Str [chr num]))
+
+aring = try (do
+  char '\\'
+  letter <- choice [try (string "aa"), try (string "AA")]
+  let num = if letter == "aa" then 229 else 197
+  return (Str [chr num]))
+
+iuml = try (do
+  string "\\\""
+  choice [try (string "\\i"), try (string "{\\i}")]
+  return (Str [chr 239]))
+
+icirc = try (do
+  string "\\^"
+  choice [try (string "\\i"), try (string "{\\i}")]
+  return (Str [chr 238]))
+
+szlig = try (do
+  string "\\ss"
+  return (Str [chr 223]))
+
+oslash = try (do
+  char '\\'
+  letter <- choice [char 'o', char 'O']
+  let num = if letter == 'o' then 248 else 216
+  return (Str [chr num]))
+
+aelig = try (do
+  char '\\'
+  letter <- choice [try (string "ae"), try (string "AE")]
+  let num = if letter == "ae" then 230 else 198
+  return (Str [chr num]))
+
+pound = try (do 
+  string "\\pounds"
+  return (Str [chr 163]))
+
+euro = try (do
+  string "\\euro"
+  return (Str [chr 8364]))
+
+copyright = try (do
+  string "\\copyright"
+  return (Str [chr 169]))
+
+sect = try (do
+  string "\\S"
+  return (Str [chr 167]))
+
+escapedChar = escaped (oneOf " $%^&_#{}")
+
+unescapedChar = do  -- ignore standalone, nonescaped special characters
+  oneOf "$^&_#{}|<>"
+  return (Str "")
+
+specialChar = choice [ backslash, bar, lt, gt ]
+
+backslash = try (do 
+  string "\\textbackslash"
+  return (Str "\\"))
+
+bar = try (do
+  string "\\textbar"
+  return (Str "\\"))
+
+lt = try (do
+  string "\\textless"
+  return (Str "<"))
+
+gt = try (do
+  string "\\textgreater"
+  return (Str ">"))
+
+code = try (do 
+  string "\\verb"
+  marker <- anyChar
+  result <- manyTill anyChar (char marker)
+  let result' = removeLeadingTrailingSpace result
+  return (Code result'))
+
+emph = try (do 
+  oneOfStrings [ "\\emph{", "\\textit{" ]
+  result <- manyTill inline (char '}')
+  return (Emph result))
+
+lab = try (do
+  string "\\label{"
+  result <- manyTill anyChar (char '}')
+  return (Str ("(" ++ result ++ ")")))
+
+ref = try (do
+  string "\\ref{"
+  result <- manyTill anyChar (char '}')
+  return (Str (result)))
+
+strong = try (do 
+  string "\\textbf{"
+  result <- manyTill inline (char '}')
+  return (Strong result))
+
+whitespace = do
+  many1 (oneOf "~ \t")
+  return Space
+
+-- hard line break
+linebreak = try (do
+  string "\\\\"
+  return LineBreak)
+
+str = do 
+  result <- many1 (noneOf specialChars)
+  return (Str (normalizePunctuation result))
+
+-- endline internal to paragraph
+endline = try (do
+  newline
+  notFollowedBy blankline
+  return Space)
+
+-- math
+math = math1 <|> math2 <?> "math"
+
+math1 = try (do
+  char '$'
+  result <- many (noneOf "$")
+  char '$'
+  return (TeX ("$" ++ result ++ "$")))
+
+math2 = try (do
+  string "\\("
+  result <- many (noneOf "$")
+  string "\\)"
+  return (TeX ("$" ++ result ++ "$")))
+
+--
+-- links and images
+--
+
+link = try (do
+  string "\\href{"
+  url <- manyTill anyChar (char '}')
+  char '{'
+  label <- manyTill inline (char '}') 
+  ref <- generateReference url ""
+  return (Link (normalizeSpaces label) ref))
+
+image = try (do
+  ("includegraphics", _, _, (src:lst)) <- command
+  return (Image [Str "image"] (Src src "")))
+
+footnote = try (do
+  ("footnote", _, _, (contents:[])) <- command
+  let blocks = case runParser parseBlocks defaultParserState "footnote" contents of
+                 Left err -> error $ "Input:\n" ++ show contents ++
+                             "\nError:\n" ++ show err
+                 Right result -> result
+  state <- getState
+  let notes = stateNoteBlocks state
+  let nextRef  = case notes of
+                   []                   -> "1"
+                   (Note ref body):rest -> (show ((read ref) + 1))
+  setState (state { stateNoteBlocks = (Note nextRef blocks):notes })
+  return (NoteRef nextRef))
+
+-- | Parse any LaTeX command and return it in a raw TeX inline element.
+rawLaTeXInline :: GenParser Char ParserState Inline
+rawLaTeXInline = try (do
+  (name, star, opt, args) <- command
+  let optStr = if (null opt) then "" else "[" ++ opt ++ "]"
+  let argStr = concatMap (\arg -> "{" ++ arg ++ "}") args
+  state <- getState
+  if ((name == "begin") || (name == "end") || (name == "item")) then
+      fail "not an inline command" 
+    else
+      string ""
+  return (TeX ("\\" ++ name ++ star ++ optStr ++ argStr)))
diff --git a/src/Text/Pandoc/Readers/Markdown.hs b/src/Text/Pandoc/Readers/Markdown.hs
new file mode 100644
index 000000000..60ac40fd7
--- /dev/null
+++ b/src/Text/Pandoc/Readers/Markdown.hs
@@ -0,0 +1,582 @@
+-- | Convert markdown to Pandoc document.
+module Text.Pandoc.Readers.Markdown ( 
+                                     readMarkdown 
+                                    ) where
+
+import Text.ParserCombinators.Pandoc
+import Text.Pandoc.Definition
+import Text.Pandoc.Readers.LaTeX ( rawLaTeXInline, rawLaTeXEnvironment )
+import Text.Pandoc.Shared 
+import Text.Pandoc.Readers.HTML ( rawHtmlInline, rawHtmlBlock, anyHtmlBlockTag, 
+                                               anyHtmlInlineTag )
+import Text.Pandoc.HtmlEntities ( decodeEntities )
+import Text.Regex ( matchRegex, mkRegex )
+import Text.ParserCombinators.Parsec
+
+-- | Read markdown from an input string and return a Pandoc document.
+readMarkdown :: ParserState -> String -> Pandoc
+readMarkdown = readWith parseMarkdown
+
+-- | Parse markdown string with default options and print result (for testing).
+testString :: String -> IO ()
+testString = testStringWith parseMarkdown 
+
+--
+-- Constants and data structure definitions
+--
+
+spaceChars = " \t"
+endLineChars = "\n"
+labelStart = '['
+labelEnd = ']'
+labelSep = ':'
+srcStart = '('
+srcEnd = ')'
+imageStart = '!'
+noteStart = '^'
+codeStart = '`'
+codeEnd = '`'
+emphStart = '*'
+emphEnd = '*'
+emphStartAlt = '_'
+emphEndAlt = '_'
+autoLinkStart = '<'
+autoLinkEnd = '>'
+mathStart = '$'
+mathEnd = '$'
+bulletListMarkers = "*+-"
+orderedListDelimiters = "."
+escapeChar = '\\'
+hruleChars = "*-_"
+quoteChars = "'\""
+atxHChar = '#'
+titleOpeners = "\"'("
+setextHChars = ['=','-']
+blockQuoteChar = '>'
+hyphenChar = '-'
+
+-- treat these as potentially non-text when parsing inline:
+specialChars = [escapeChar, labelStart, labelEnd, emphStart, emphEnd, emphStartAlt, 
+                emphEndAlt, codeStart, codeEnd, autoLinkEnd, autoLinkStart, mathStart, 
+                mathEnd, imageStart, noteStart, hyphenChar]
+
+--
+-- auxiliary functions
+--
+
+-- | Skip a single endline if there is one.
+skipEndline = option Space endline
+
+indentSpaces = do
+  state <- getState
+  let tabStop = stateTabStop state
+  oneOfStrings [ "\t", (replicate tabStop ' ') ] <?> "indentation"
+
+skipNonindentSpaces = do
+  state <- getState
+  let tabStop = stateTabStop state
+  choice (map (\n -> (try (count n (char ' ')))) (reverse [0..(tabStop - 1)]))
+
+--
+-- document structure
+--
+
+titleLine = try (do
+  char '%'
+  skipSpaces
+  line <- manyTill inline newline
+  return line)
+
+authorsLine = try (do
+  char '%'
+  skipSpaces
+  authors <- sepEndBy (many1 (noneOf ",;\n")) (oneOf ",;")
+  newline
+  return (map removeLeadingTrailingSpace authors))
+
+dateLine = try (do
+  char '%'
+  skipSpaces
+  date <- many (noneOf "\n")
+  newline
+  return (removeTrailingSpace date))
+
+titleBlock = try (do
+  title <- option [] titleLine
+  author <- option [] authorsLine
+  date <- option "" dateLine
+  option "" blanklines
+  return (title, author, date))
+
+parseMarkdown = do
+  updateState (\state -> state { stateParseRaw = True }) -- need to parse raw HTML
+  (title, author, date) <- option ([],[],"") titleBlock
+  blocks <- parseBlocks
+  state <- getState
+  let keys = reverse $ stateKeyBlocks state
+  return (Pandoc (Meta title author date) (blocks ++ keys))
+
+--
+-- parsing blocks
+--
+
+parseBlocks = do
+  result <- manyTill block eof
+  return result
+
+block = choice [ codeBlock, referenceKey, note, header, hrule, list, blockQuote, rawHtmlBlocks, 
+                 rawLaTeXEnvironment, para, plain, blankBlock, nullBlock ] <?> "block"
+
+--
+-- header blocks
+--
+
+header = choice [ setextHeader, atxHeader ] <?> "header"
+
+atxHeader = try (do
+  lead <- many1 (char atxHChar)
+  skipSpaces
+  txt <- many1 (do {notFollowedBy' atxClosing; inline})
+  atxClosing
+  return (Header (length lead) (normalizeSpaces txt)))
+
+atxClosing = try (do
+  skipMany (char atxHChar)
+  skipSpaces
+  newline
+  option "" blanklines)
+
+setextHeader = choice (map (\x -> setextH x) (enumFromTo 1 (length setextHChars)))
+
+setextH n = try (do
+    txt <- many1 (do {notFollowedBy newline; inline})
+    endline
+    many1 (char (setextHChars !! (n-1)))
+    skipSpaces
+    newline
+    option "" blanklines
+    return (Header n (normalizeSpaces txt)))
+
+--
+-- hrule block
+--
+
+hruleWith chr = 
+    try (do
+           skipSpaces
+           char chr
+           skipSpaces
+           char chr
+           skipSpaces
+           char chr
+           skipMany (oneOf (chr:spaceChars))
+           newline
+           option "" blanklines
+           return HorizontalRule)
+
+hrule = choice (map hruleWith hruleChars) <?> "hrule"
+
+--
+-- code blocks
+--
+
+indentedLine = try (do
+    indentSpaces
+    result <- manyTill anyChar newline
+    return (result ++ "\n"))
+
+-- two or more indented lines, possibly separated by blank lines
+indentedBlock = try (do 
+  res1 <- indentedLine
+  blanks <- many blankline 
+  res2 <- choice [indentedBlock, indentedLine]
+  return (res1 ++ blanks ++ res2))
+
+codeBlock = do
+    result <- choice [indentedBlock, indentedLine]
+    option "" blanklines
+    return (CodeBlock result)
+
+--
+-- note block
+--
+
+note = try (do
+    (NoteRef ref) <- noteRef 
+    skipSpaces
+    raw <- sepBy (many (choice [nonEndline, 
+                                (try (do {endline; notFollowedBy (char noteStart); return '\n'}))
+                               ])) (try (do {newline; char noteStart; option ' ' (char ' ')}))
+    newline
+    blanklines
+    -- parse the extracted block, which may contain various block elements:
+    state <- getState
+    let parsed = case runParser parseBlocks (state {stateParserContext = BlockQuoteState}) "block" ((joinWithSep "\n" raw) ++ "\n\n") of
+                   Left err -> error $ "Raw block:\n" ++ show raw ++ "\nError:\n" ++ show err
+                   Right result -> result
+    return (Note ref parsed))
+
+--
+-- block quotes
+--
+
+emacsBoxQuote = try (do
+    string ",----"
+    manyTill anyChar newline
+    raw <- manyTill (try (do{ char '|'; 
+                              option ' ' (char ' '); 
+                              result <- manyTill anyChar newline; 
+                              return result})) 
+                     (string "`----")
+    manyTill anyChar newline
+    option "" blanklines
+    return raw)
+
+emailBlockQuoteStart = try (do
+  skipNonindentSpaces
+  char blockQuoteChar
+  option ' ' (char ' ')
+  return "> ")
+
+emailBlockQuote = try (do
+    emailBlockQuoteStart
+    raw <- sepBy (many (choice [nonEndline, 
+                                (try (do{ endline; 
+                                          notFollowedBy' emailBlockQuoteStart;
+                                          return '\n'}))])) 
+           (try (do {newline; emailBlockQuoteStart}))
+    newline <|> (do{ eof; return '\n'})
+    option "" blanklines
+    return raw)
+
+blockQuote = do 
+    raw <- choice [ emailBlockQuote, emacsBoxQuote ]
+    -- parse the extracted block, which may contain various block elements:
+    state <- getState
+    let parsed = case runParser parseBlocks (state {stateParserContext = BlockQuoteState}) "block" ((joinWithSep "\n" raw) ++ "\n\n") of
+                   Left err -> error $ "Raw block:\n" ++ show raw ++ "\nError:\n" ++ show err
+                   Right result -> result
+    return (BlockQuote parsed)
+
+--
+-- list blocks
+--
+
+list = choice [ bulletList, orderedList ] <?> "list"
+
+bulletListStart = 
+    try (do
+           option ' ' newline -- if preceded by a Plain block in a list context
+           skipNonindentSpaces
+           notFollowedBy' hrule  -- because hrules start out just like lists
+           oneOf bulletListMarkers
+           spaceChar
+           skipSpaces)
+
+orderedListStart = 
+    try (do
+           option ' ' newline -- if preceded by a Plain block in a list context
+           skipNonindentSpaces
+           many1 digit
+           oneOf orderedListDelimiters
+           oneOf spaceChars
+           skipSpaces)
+
+-- parse a line of a list item (start = parser for beginning of list item)
+listLine start = try (do
+  notFollowedBy' start
+  notFollowedBy blankline
+  notFollowedBy' (try (do{ indentSpaces; 
+                           many (spaceChar);
+                           choice [bulletListStart, orderedListStart]}))
+  line <- manyTill anyChar newline
+  return (line ++ "\n"))
+
+-- parse raw text for one list item, excluding start marker and continuations
+rawListItem start = 
+    try (do
+           start
+           result <- many1 (listLine start)
+           blanks <- many blankline
+           return ((concat result) ++ blanks))
+
+-- continuation of a list item - indented and separated by blankline 
+-- or (in compact lists) endline.
+-- note: nested lists are parsed as continuations
+listContinuation start = 
+    try (do
+           followedBy' indentSpaces
+           result <- many1 (listContinuationLine start)
+           blanks <- many blankline
+           return ((concat result) ++ blanks))
+
+listContinuationLine start = try (do
+    notFollowedBy blankline
+    notFollowedBy' start
+    option "" indentSpaces
+    result <- manyTill anyChar newline
+    return (result ++ "\n"))
+
+listItem start = 
+    try (do 
+           first <- rawListItem start
+           rest <- many (listContinuation start)
+           -- parsing with ListItemState forces markers at beginning of lines to
+           -- count as list item markers, even if not separated by blank space.
+           -- see definition of "endline"
+           state <- getState
+           let parsed = case runParser parseBlocks (state {stateParserContext = ListItemState}) 
+                        "block" raw of
+                          Left err -> error $ "Raw block:\n" ++ raw ++ "\nError:\n" ++ show err
+                          Right result -> result
+                   where raw = concat (first:rest) 
+           return parsed)
+
+orderedList = 
+    try (do
+           items <- many1 (listItem orderedListStart)
+           let items' = compactify items
+           return (OrderedList items'))
+
+bulletList = 
+    try (do
+           items <- many1 (listItem bulletListStart)
+           let items' = compactify items
+           return (BulletList items'))
+
+--
+-- paragraph block
+--
+
+para = try (do 
+  result <- many1 inline
+  newline
+  choice [ (do{ followedBy' (oneOfStrings [">", ",----"]); return "" }), blanklines ]  
+  let result' = normalizeSpaces result
+  return (Para result'))
+
+plain = do
+  result <- many1 inline
+  let result' = normalizeSpaces result
+  return (Plain result')
+
+-- 
+-- raw html
+--
+
+rawHtmlBlocks = try (do
+   htmlBlocks <- many1 rawHtmlBlock    
+   let combined = concatMap (\(RawHtml str) -> str) htmlBlocks
+   let combined' = if (last combined == '\n') then 
+                       init combined  -- strip extra newline 
+                   else 
+                       combined 
+   return (RawHtml combined'))
+
+-- 
+-- reference key
+--
+
+referenceKey = 
+    try (do
+           skipSpaces
+           label <- reference
+           char labelSep
+           skipSpaces
+           option ' ' (char autoLinkStart)
+           src <- many (noneOf (titleOpeners ++ [autoLinkEnd] ++ endLineChars))
+           option ' ' (char autoLinkEnd)
+           tit <- option "" title 
+           blanklines 
+           return (Key label (Src (removeTrailingSpace src) tit))) 
+
+-- 
+-- inline
+--
+
+text = choice [ math, strong, emph, code2, code1, str, linebreak, tabchar, 
+                whitespace, endline ] <?> "text"
+
+inline = choice [ rawLaTeXInline, escapedChar, special, hyphens, text, ltSign, symbol ] <?> "inline"
+
+special = choice [ link, referenceLink, rawHtmlInline, autoLink, 
+                   image, noteRef ] <?> "link, inline html, note, or image"
+
+escapedChar = escaped anyChar
+
+ltSign = do
+  notFollowedBy' rawHtmlBlocks -- don't return < if it starts html
+  char '<'
+  return (Str ['<'])
+
+specialCharsMinusLt = filter (/= '<') specialChars
+
+symbol = do 
+  result <- oneOf specialCharsMinusLt
+  return (Str [result])
+
+hyphens = try (do
+  result <- many1 (char '-')
+  if (length result) == 1 then
+      skipEndline   -- don't want to treat endline after hyphen as a space
+    else
+      do{ string ""; return Space }
+  return (Str result))
+
+-- parses inline code, between codeStart and codeEnd
+code1 = 
+    try (do 
+           char codeStart
+           result <- many (noneOf [codeEnd])
+           char codeEnd
+           let result' = removeLeadingTrailingSpace $ joinWithSep " " $ lines result -- get rid of any internal newlines
+           return (Code result'))
+
+-- parses inline code, between 2 codeStarts and 2 codeEnds
+code2 = 
+    try (do
+           string [codeStart, codeStart]
+           result <- manyTill anyChar (try (string [codeEnd, codeEnd]))
+           let result' = removeLeadingTrailingSpace $ joinWithSep " " $ lines result -- get rid of any internal newlines
+           return (Code result'))
+
+mathWord = many1 (choice [(noneOf (" \t\n\\" ++ [mathEnd])), (try (do {c <- char '\\'; notFollowedBy (char mathEnd); return c}))])
+
+math = try (do
+  char mathStart
+  notFollowedBy space
+  words <- sepBy1 mathWord (many1 space)
+  char mathEnd
+  return (TeX ("$" ++ (joinWithSep " " words) ++ "$")))
+
+emph = do
+  result <- choice [ (enclosed (char emphStart) (char emphEnd) inline), 
+                      (enclosed (char emphStartAlt) (char emphEndAlt) inline) ]
+  return (Emph (normalizeSpaces result))
+
+strong = do
+  result <- choice [ (enclosed (count 2 (char emphStart)) (count 2 (char emphEnd)) inline), 
+                     (enclosed (count 2 (char emphStartAlt)) (count 2 (char emphEndAlt)) inline)]
+  return (Strong (normalizeSpaces result))
+
+whitespace = do
+  many1 (oneOf spaceChars) <?> "whitespace"
+  return Space
+
+tabchar = do
+  tab
+  return (Str "\t")
+
+-- hard line break
+linebreak = try (do
+  oneOf spaceChars
+  many1 (oneOf spaceChars) 
+  endline
+  return LineBreak )
+
+nonEndline = noneOf endLineChars
+
+str = do 
+  result <- many1 ((noneOf (specialChars ++ spaceChars ++ endLineChars))) 
+  return (Str (decodeEntities result))
+
+-- an endline character that can be treated as a space, not a structural break
+endline =
+    try (do
+           newline
+           -- next line would allow block quotes without preceding blank line
+           -- Markdown.pl does allow this, but there's a chance of a wrapped
+           -- greater-than sign triggering a block quote by accident...
+--         notFollowedBy (try (do { choice [emailBlockQuoteStart, string ",----"]; return ' ' }))  
+           notFollowedBy blankline
+           -- parse potential list starts at beginning of line differently if in a list:
+           st <- getState
+           if (stateParserContext st) == ListItemState then 
+               do
+                 notFollowedBy' orderedListStart
+                 notFollowedBy' bulletListStart
+             else
+               option () pzero
+           return Space)
+
+--
+-- links
+--
+
+-- a reference label for a link
+reference = do
+  char labelStart
+  label <- manyTill inline (char labelEnd)
+  return (normalizeSpaces label)
+
+-- source for a link, with optional title
+source = 
+    try (do 
+           char srcStart
+           option ' ' (char autoLinkStart)
+           src <- many (noneOf ([srcEnd, autoLinkEnd] ++ titleOpeners))
+           option ' ' (char autoLinkEnd)
+           tit <- option "" title
+           skipSpaces
+           char srcEnd
+           return (Src (removeTrailingSpace src) tit))
+
+titleWith startChar endChar =
+    try (do
+           skipSpaces
+           skipEndline  -- a title can be on the next line from the source
+           skipSpaces
+           char startChar
+           tit <- manyTill (choice [ try (do {char '\\'; char endChar}), 
+                                     (noneOf (endChar:endLineChars)) ]) (char endChar) 
+           let tit' = gsub "\"" "&quot;" tit
+           return tit')
+
+title = choice [titleWith '(' ')', titleWith '"' '"', titleWith '\'' '\''] <?> "title"
+
+link = choice [explicitLink, referenceLink] <?> "link"
+
+explicitLink = 
+    try (do
+           label <- reference
+           src <- source 
+           return (Link label src)) 
+
+referenceLink = choice [referenceLinkDouble, referenceLinkSingle]
+
+referenceLinkDouble =     -- a link like [this][/url/]
+    try (do
+           label <- reference
+           skipSpaces
+           skipEndline
+           skipSpaces
+           ref <- reference 
+           return (Link label (Ref ref))) 
+
+referenceLinkSingle =     -- a link like [this]
+    try (do
+           label <- reference
+           return (Link label (Ref []))) 
+
+autoLink =                -- a link <like.this.com>
+    try (do
+           notFollowedBy (do {anyHtmlBlockTag; return ' '})
+           src <- between (char autoLinkStart) (char autoLinkEnd) 
+                  (many (noneOf (spaceChars ++ endLineChars ++ [autoLinkEnd])))
+           case (matchRegex emailAddress src) of
+             Just _  -> return (Link [Str src] (Src ("mailto:" ++ src) ""))
+             Nothing -> return (Link [Str src] (Src src ""))) 
+
+emailAddress = mkRegex "([^@:/]+)@(([^.]+[.]?)*([^.]+))"  -- presupposes no whitespace
+
+image = 
+    try (do
+           char imageStart
+           (Link label src) <- link
+           return (Image label src)) 
+
+noteRef = try (do
+    char noteStart
+    ref <- between (char '(') (char ')') (many1 (noneOf " \t\n)"))
+    return (NoteRef ref))
+
diff --git a/src/Text/Pandoc/Readers/RST.hs b/src/Text/Pandoc/Readers/RST.hs
new file mode 100644
index 000000000..82e5ea303
--- /dev/null
+++ b/src/Text/Pandoc/Readers/RST.hs
@@ -0,0 +1,644 @@
+-- | Parse reStructuredText and return Pandoc document.
+module Text.Pandoc.Readers.RST ( 
+                                readRST
+                               ) where
+import Text.Pandoc.Definition
+import Text.ParserCombinators.Pandoc
+import Text.Pandoc.Shared 
+import Text.Pandoc.Readers.HTML ( anyHtmlBlockTag, anyHtmlInlineTag )
+import Text.Regex ( matchRegex, mkRegex )
+import Text.ParserCombinators.Parsec
+import Data.Maybe ( fromMaybe )
+import List ( findIndex )
+import Char ( toUpper )
+
+-- | Parse reStructuredText string and return Pandoc document.
+readRST :: ParserState -> String -> Pandoc
+readRST = readWith parseRST
+
+-- | Parse a string and print result (for testing).
+testString :: String -> IO ()
+testString = testStringWith parseRST
+
+--
+-- Constants and data structure definitions
+---
+
+bulletListMarkers = "*+-"
+underlineChars = "!\"#$&'()*+,-./:;<=>?@[\\]^_`{|}~"
+
+-- treat these as potentially non-text when parsing inline:
+specialChars = "\\`|*_<>$:[-"
+
+--
+-- parsing documents
+--
+
+isAnonKeyBlock block = case block of
+                         (Key [Str "_"] str) -> True
+                         otherwise -> False
+
+isNotAnonKeyBlock block = not (isAnonKeyBlock block)
+
+isHeader1 :: Block -> Bool
+isHeader1 (Header 1 _) = True
+isHeader1 _ = False
+
+isHeader2 :: Block -> Bool
+isHeader2 (Header 2 _) = True
+isHeader2 _ = False
+
+-- | Promote all headers in a list of blocks.  (Part of
+-- title transformation for RST.)
+promoteHeaders :: Int -> [Block] -> [Block]
+promoteHeaders num ((Header level text):rest) = 
+    (Header (level - num) text):(promoteHeaders num rest)
+promoteHeaders num (other:rest) = other:(promoteHeaders num rest)
+promoteHeaders num [] = []
+
+-- | If list of blocks starts with a header (or a header and subheader)
+-- of level that are not found elsewhere, return it as a title and
+-- promote all the other headers. 
+titleTransform :: [Block]              -- ^ list of blocks
+               -> ([Block], [Inline])  -- ^ modified list of blocks, title
+titleTransform ((Header 1 head1):(Header 2 head2):rest) =  -- title and subtitle
+    if (any isHeader1 rest) || (any isHeader2 rest) then
+        ((Header 1 head1):(Header 2 head2):rest, [])
+      else
+        ((promoteHeaders 2 rest), head1 ++ [Str ":", Space] ++ head2)
+titleTransform ((Header 1 head1):rest) =       -- title, no subtitle
+    if (any isHeader1 rest) then
+        ((Header 1 head1):rest, [])
+      else
+        ((promoteHeaders 1 rest), head1)
+titleTransform blocks = (blocks, [])
+
+parseRST = do
+  state <- getState
+  input <- getInput
+  blocks <- parseBlocks   -- first pass
+  let anonymousKeys = filter isAnonKeyBlock blocks
+  let blocks' = if (null anonymousKeys) then
+                    blocks
+                  else -- run parser again to fill in anonymous links...
+                    case runParser parseBlocks (state { stateKeyBlocks = anonymousKeys }) 
+                             "RST source, second pass" input of
+                      Left err -> error $ "\nError:\n" ++ show err
+                      Right result -> (filter isNotAnonKeyBlock result)
+  let (blocks'', title) = if stateStandalone state then
+                              titleTransform blocks'
+                            else
+                              (blocks', [])
+  state <- getState
+  let authors = stateAuthors state
+  let date = stateDate state
+  let title' = if (null title) then (stateTitle state) else title
+  return (Pandoc (Meta title' authors date) blocks'')
+
+--
+-- parsing blocks
+--
+
+parseBlocks = do
+  result <- manyTill block eof
+  return result
+
+block = choice [ codeBlock, rawHtmlBlock, rawLaTeXBlock, blockQuote, referenceKey, 
+                 imageBlock, unknownDirective, header, hrule, list, fieldList, lineBlock,
+                 para, plain, blankBlock, nullBlock ] <?> "block"
+
+--
+-- field list
+--
+
+fieldListItem = try (do
+  char ':'
+  name <- many1 alphaNum
+  string ": "
+  skipSpaces
+  first <- manyTill anyChar newline
+  rest <- many (do{ notFollowedBy (char ':'); 
+                    notFollowedBy blankline; 
+                    skipSpaces;
+                    manyTill anyChar newline })
+  return (name, (joinWithSep " " (first:rest)))) 
+
+fieldList = try (do
+  items <- many1 fieldListItem
+  blanklines
+  let authors = case (lookup "Authors" items) of
+                   Just auth -> [auth]
+                   Nothing   -> map snd (filter (\(x,y) -> x == "Author") items)
+  let date = case (lookup "Date" items) of
+                   Just dat  -> dat
+                   Nothing   -> ""
+  let title = case (lookup "Title" items) of
+                    Just tit  -> [Str tit]
+                    Nothing   -> []
+  let remaining = filter (\(x,y) -> (x /= "Authors") && (x /= "Author") && (x /= "Date") && 
+                                    (x /= "Title")) items
+  let result = map (\(x,y) -> Para [Strong [Str x], Str ":", Space, Str y]) remaining
+  updateState (\st -> st { stateAuthors = authors, stateDate = date, stateTitle = title })
+  return (BlockQuote result))
+
+--
+-- line block
+--
+
+lineBlockLine = try (do
+  string "| "
+  white <- many (oneOf " \t")
+  line <- manyTill inline newline
+  let line' = (if null white then [] else [Str white]) ++ line ++ [LineBreak]
+  return line')
+
+lineBlock = try (do
+  lines <- many1 lineBlockLine
+  blanklines
+  return $ Para (concat lines))
+
+--
+-- paragraph block
+--
+
+para = choice [ paraBeforeCodeBlock, paraNormal ] <?> "paragraph"
+
+codeBlockStart = try (do
+                        string "::"
+                        blankline
+                        blankline)
+
+-- paragraph that ends in a :: starting a code block
+paraBeforeCodeBlock = try (do
+  result <- many1 (do {notFollowedBy' codeBlockStart; inline})
+  followedBy' (string "::")
+  return (Para (if (last result == Space) then 
+                    normalizeSpaces result
+                else 
+                    (normalizeSpaces result) ++ [Str ":"])))
+
+-- regular paragraph
+paraNormal = try (do 
+  result <- many1 inline
+  newline
+  blanklines
+  let result' = normalizeSpaces result
+  return (Para result'))
+
+plain = do
+  result <- many1 inline
+  let result' = normalizeSpaces result
+  return (Plain result')
+
+--
+-- image block
+--
+
+imageBlock = try (do
+    string ".. image:: "
+    src <- manyTill anyChar newline
+    return (Plain [Image [Str "image"] (Src src "")]))
+
+--
+-- header blocks
+--
+
+header = choice [ doubleHeader, singleHeader ] <?> "header"
+
+-- a header with lines on top and bottom
+doubleHeader = try (do
+    c <- oneOf underlineChars
+    rest <- many (char c)  -- the top line
+    let lenTop = length (c:rest)
+    skipSpaces
+    newline
+    txt <- many1 (do {notFollowedBy blankline; inline})
+    pos <- getPosition  
+    let len = (sourceColumn pos) - 1
+    if (len > lenTop) then fail "title longer than border" else (do {return ()})
+    blankline              -- spaces and newline
+    count lenTop (char c)  -- the bottom line
+    blanklines
+    -- check to see if we've had this kind of header before.  
+    -- if so, get appropriate level.  if not, add to list.
+    state <- getState
+    let headerTable = stateHeaderTable state
+    let (headerTable', level) =  case findIndex (== DoubleHeader c) headerTable of
+         Just ind    -> (headerTable, ind + 1)
+         Nothing     -> (headerTable ++ [DoubleHeader c], (length headerTable) + 1)
+    setState (state { stateHeaderTable = headerTable' })
+    return (Header level (normalizeSpaces txt)))
+
+-- a header with line on the bottom only
+singleHeader = try (do 
+    notFollowedBy' whitespace
+    txt <- many1 (do {notFollowedBy blankline; inline})
+    pos <- getPosition
+    let len = (sourceColumn pos) - 1
+    blankline
+    c <- oneOf underlineChars
+    rest <- count (len - 1) (char c)
+    many (char c)
+    blanklines
+    state <- getState
+    let headerTable = stateHeaderTable state
+    let (headerTable', level) =  case findIndex (== SingleHeader c) headerTable of
+         Just ind    -> (headerTable, ind + 1)
+         Nothing     -> (headerTable ++ [SingleHeader c], (length headerTable) + 1)
+    setState (state { stateHeaderTable = headerTable' })
+    return (Header level (normalizeSpaces txt)))
+
+--
+-- hrule block
+--
+
+hruleWith chr = 
+    try (do
+           count 4 (char chr)
+           skipMany (char chr)
+           skipSpaces
+           newline
+           blanklines
+           return HorizontalRule)
+
+hrule = choice (map hruleWith underlineChars) <?> "hrule"
+
+--
+-- code blocks
+--
+
+-- read a line indented by a given string
+indentedLine indents = try (do
+    string indents
+    result <- manyTill anyChar newline
+    return (result ++ "\n"))
+
+-- two or more indented lines, possibly separated by blank lines
+-- if variable = True, then any indent will work, but it must be consistent through the block
+-- if variable = False, indent should be one tab or equivalent in spaces
+indentedBlock variable = try (do 
+  state <- getState
+  let tabStop = stateTabStop state
+  indents <- if variable then
+                 many1 (oneOf " \t")
+             else
+                 oneOfStrings ["\t", (replicate tabStop ' ')]
+  firstline <- manyTill anyChar newline
+  rest <- many (choice [ indentedLine indents, 
+                         try (do {b <- blanklines; l <- indentedLine indents; return (b ++ l)})])
+  option "" blanklines
+  return (firstline ++ "\n" ++ (concat rest)))
+
+codeBlock = try (do
+    codeBlockStart
+    result <- indentedBlock False  -- the False means we want one tab stop indent on each line
+    return (CodeBlock result))
+
+--
+-- raw html
+--
+
+rawHtmlBlock = try (do
+    string ".. raw:: html"
+    blanklines
+    result <- indentedBlock True
+    return (RawHtml result))
+
+--
+-- raw latex
+--
+
+rawLaTeXBlock = try (do
+    string ".. raw:: latex"
+    blanklines
+    result <- indentedBlock True
+    return (Para [(TeX result)]))
+
+--
+-- block quotes
+--
+
+blockQuote = try (do
+    block <- indentedBlock True
+    -- parse the extracted block, which may contain various block elements:
+    state <- getState
+    let parsed = case runParser parseBlocks (state {stateParserContext = BlockQuoteState}) 
+                  "block" (block ++ "\n\n") of
+                   Left err -> error $ "Raw block:\n" ++ show block ++ "\nError:\n" ++ show err
+                   Right result -> result
+    return (BlockQuote parsed))
+
+--
+-- list blocks
+--
+
+list = choice [ bulletList, orderedList ] <?> "list"
+
+-- parses bullet list start and returns its length (inc. following whitespace)
+bulletListStart = 
+    try (do
+           notFollowedBy' hrule  -- because hrules start out just like lists
+           marker <- oneOf bulletListMarkers
+           white <- many1 spaceChar
+           let len = length (marker:white)
+           return len) 
+
+withPeriodSuffix parser = try (do
+                                 a <- parser
+                                 b <- char '.'
+                                 return (a ++ [b]))
+
+withParentheses parser = try (do
+                                a <- char '('
+                                b <- parser
+                                c <- char ')'
+                                return ([a] ++ b ++ [c]))
+
+withRightParen parser = try (do
+                               a <- parser
+                               b <- char ')'
+                               return (a ++ [b]))
+
+upcaseWord = map toUpper
+
+romanNumeral = do
+  let lowerNumerals = ["i", "ii", "iii", "iiii", "iv", "v", "vi", "vii", "viii", "ix", "x", "xi", "xii", "xiii", "xiv", "xv", "xvi", "xvii", "xviii", "xix", "xx", "xxi", "xxii", "xxiii", "xxiv" ]
+  let upperNumerals = map upcaseWord lowerNumerals
+  result <- choice $ map string (lowerNumerals ++ upperNumerals)
+  return result
+
+orderedListEnumerator = choice [ many1 digit, 
+                                 string "#", 
+                                 count 1 letter, 
+                                 romanNumeral ]
+
+-- parses ordered list start and returns its length (inc. following whitespace)
+orderedListStart = 
+    try (do
+           marker <- choice [ withPeriodSuffix orderedListEnumerator, 
+                              withParentheses orderedListEnumerator, 
+                              withRightParen orderedListEnumerator ]
+           white <- many1 spaceChar
+           let len = length (marker ++ white)
+           return len)
+
+-- parse a line of a list item
+listLine markerLength = try (do
+  notFollowedBy blankline
+  indentWith markerLength
+  line <- manyTill anyChar newline
+  return (line ++ "\n"))
+
+-- indent by specified number of spaces (or equiv. tabs)
+indentWith num = do
+  state <- getState
+  let tabStop = stateTabStop state
+  if (num < tabStop) then
+      count num  (char ' ')
+    else
+      choice [ try (count num (char ' ')), 
+               (try (do {char '\t'; count (num - tabStop) (char ' ')})) ] 
+
+-- parse raw text for one list item, excluding start marker and continuations
+rawListItem start = 
+    try (do
+           markerLength <- start
+           firstLine <- manyTill anyChar newline
+           restLines <- many (listLine markerLength)
+           return (markerLength, (firstLine ++ "\n" ++ (concat restLines))))
+
+-- continuation of a list item - indented and separated by blankline or (in compact lists) 
+-- endline.  Note: nested lists are parsed as continuations.
+listContinuation markerLength = 
+    try (do
+           blanks <- many1 blankline
+           result <- many1 (listLine markerLength)
+           return (blanks ++ (concat result)))
+
+listItem start = 
+    try (do 
+           (markerLength, first) <- rawListItem start
+           rest <- many (listContinuation markerLength)
+           blanks <- choice [ try (do {b <- many blankline; followedBy' start; return b}), 
+                              many1 blankline ]  -- whole list must end with blank
+           -- parsing with ListItemState forces markers at beginning of lines to
+           -- count as list item markers, even if not separated by blank space.
+           -- see definition of "endline"
+           state <- getState
+           let parsed = case runParser parseBlocks (state {stateParserContext = ListItemState}) 
+                        "list item" raw of
+                          Left err -> error $ "Raw:\n" ++ raw ++ "\nError:\n" ++ show err
+                          Right result -> result
+                   where raw = concat (first:rest) ++ blanks 
+           return parsed)
+
+orderedList = 
+    try (do
+           items <- many1 (listItem orderedListStart)
+           let items' = compactify items
+           return (OrderedList items'))
+
+bulletList = 
+    try (do
+           items <- many1 (listItem bulletListStart)
+           let items' = compactify items
+           return (BulletList items'))
+
+--
+-- unknown directive (e.g. comment)
+--
+
+unknownDirective = try (do
+    string ".. "
+    manyTill anyChar newline
+    many (do {string "   "; 
+              char ':'; 
+              many1 (noneOf "\n:"); 
+              char ':'; 
+              many1 (noneOf "\n"); 
+              newline})
+    option "" blanklines
+    return Null)
+
+-- 
+-- reference key
+--
+
+referenceKey = choice [imageKey, anonymousKey, regularKeyQuoted, regularKey]
+
+imageKey = try (do
+     string ".. |"
+     ref <- manyTill inline (char '|')
+     skipSpaces
+     string "image::"
+     src <- manyTill anyChar newline
+     return (Key (normalizeSpaces ref) (Src (removeLeadingTrailingSpace src) "")))
+
+anonymousKey = try (do
+     choice [string ".. __:", string "__"]
+     skipSpaces
+     src <- manyTill anyChar newline
+     state <- getState
+     return (Key [Str "_"] (Src (removeLeadingTrailingSpace src) "")))
+
+regularKeyQuoted = try (do
+     string ".. _`"
+     ref <- manyTill inline (string "`:")
+     skipSpaces
+     src <- manyTill anyChar newline
+     return (Key (normalizeSpaces ref) (Src (removeLeadingTrailingSpace src) "")))
+
+regularKey = try (do
+     string ".. _"
+     ref <- manyTill inline (char ':')
+     skipSpaces
+     src <- manyTill anyChar newline
+     return (Key (normalizeSpaces ref) (Src (removeLeadingTrailingSpace src) "")))
+
+ -- 
+ -- inline
+ --
+
+text = choice [ strong, emph, code, str, tabchar, whitespace, endline ] <?> "text"
+
+inline = choice [ escapedChar, special, hyphens, text, symbol ] <?> "inline"
+
+special = choice [ link, image ] <?> "link, inline html, or image"
+
+hyphens = try (do
+  result <- many1 (char '-')
+  option Space endline   -- don't want to treat endline after hyphen or dash as a space
+  return (Str result))
+
+escapedChar = escaped anyChar
+
+symbol = do 
+  result <- oneOf specialChars
+  return (Str [result])
+
+-- parses inline code, between codeStart and codeEnd
+code = 
+    try (do 
+           string "``"
+           result <- manyTill anyChar (string "``")
+           let result' = removeLeadingTrailingSpace $ joinWithSep " " $ lines result
+           return (Code result'))
+
+emph = do
+  result <- enclosed (char '*') (char '*') inline
+  return (Emph (normalizeSpaces result))
+
+strong = do
+  result <- enclosed (string "**") (string "**") inline
+  return (Strong (normalizeSpaces result))
+
+whitespace = do
+  many1 spaceChar <?> "whitespace"
+  return Space
+
+tabchar = do
+  tab
+  return (Str "\t")
+
+str = do 
+  notFollowedBy' oneWordReferenceLink
+  result <- many1 (noneOf (specialChars ++ "\t\n "))
+  return (Str result)
+
+-- an endline character that can be treated as a space, not a structural break
+endline =
+    try (do
+           newline
+           notFollowedBy blankline
+           -- parse potential list starts at beginning of line differently if in a list:
+           st <- getState
+           if ((stateParserContext st) == ListItemState) then 
+               notFollowedBy' (choice [orderedListStart, bulletListStart])
+             else 
+               option () pzero
+           return Space)
+
+--
+-- links
+--
+
+link = choice [explicitLink, referenceLink, autoLink, oneWordReferenceLink] <?> "link"
+
+explicitLink = 
+    try (do
+           char '`'
+           label <- manyTill inline (try (do {spaces; char '<'}))
+           src <- manyTill (noneOf ">\n ") (char '>')
+           skipSpaces
+           string "`_"
+           return (Link (normalizeSpaces label) (Src (removeLeadingTrailingSpace src) ""))) 
+
+anonymousLinkEnding = 
+    try (do
+           char '_'
+           state <- getState
+           let anonKeys = stateKeyBlocks state
+           -- if there's a list of anon key refs (from previous pass), pop one off.
+           -- otherwise return an anon key ref for the next pass to take care of...
+           case anonKeys of
+             (Key [Str "_"] src):rest -> 
+                 do{ setState (state { stateKeyBlocks = rest }); 
+                     return src }
+             otherwise -> return (Ref [Str "_"]))
+
+referenceLink = 
+    try (do
+           char '`'
+           label <- manyTill inline (string "`_")
+           src <- option (Ref []) anonymousLinkEnding
+           return (Link (normalizeSpaces label) src)) 
+
+oneWordReferenceLink =
+    try (do
+           label <- many1 alphaNum
+           char '_'
+           src <- option (Ref []) anonymousLinkEnding
+           notFollowedBy alphaNum    -- because this_is_not a link
+           return (Link [Str label] src)) 
+
+uriScheme = oneOfStrings [ "http://", "https://", "ftp://", "file://", "mailto:", 
+                           "news:", "telnet:" ]
+
+uri = try (do
+    scheme <- uriScheme
+    identifier <- many1 (noneOf " \t\n")
+    return (scheme ++ identifier))
+
+autoURI = try (do
+    src <- uri
+    return (Link [Str src] (Src src "")))
+
+emailChar = alphaNum <|> oneOf "-+_."
+
+emailAddress = try (do
+    firstLetter <- alphaNum
+    restAddr <- many emailChar
+    let addr = firstLetter:restAddr
+    char '@'
+    dom <- domain
+    return (addr ++ '@':dom))
+
+domainChar = alphaNum <|> char '-'
+
+domain = try (do
+    first <- many1 domainChar
+    dom <- many1 (try (do{ char '.'; many1 domainChar }))
+    return (joinWithSep "." (first:dom)))
+
+autoEmail = try (do
+    src <- emailAddress
+    return (Link [Str src] (Src ("mailto:" ++ src) "")))
+
+autoLink = autoURI <|> autoEmail
+
+-- For now, we assume that all substitution references are for images.
+image = 
+    try (do
+           char '|'
+           ref <- manyTill inline (char '|')
+           return (Image (normalizeSpaces ref) (Ref ref)))
author	fiddlosopher <fiddlosopher@788f1e2b-df1e-0410-8736-df70ead52e1b>	2006-10-17 14:22:29 +0000
committer	fiddlosopher <fiddlosopher@788f1e2b-df1e-0410-8736-df70ead52e1b>	2006-10-17 14:22:29 +0000
commit	df7b68225101966051f8b592a27127bf789eb81e (patch)
tree	a063e97ed58d0bdb2cbb5a95c3e8c1bcce54aa00 /src/Text/Pandoc/Readers
parent	e7dbfef4d8aa528d9245424e9c372e900a774c90 (diff)
download	pandoc-df7b68225101966051f8b592a27127bf789eb81e.tar.gz