1 files changed, 582 insertions, 0 deletions
diff --git a/src/Text/Pandoc/Readers/Markdown.hs b/src/Text/Pandoc/Readers/Markdown.hs
new file mode 100644
index 000000000..60ac40fd7
--- /dev/null
+++ b/src/Text/Pandoc/Readers/Markdown.hs
@@ -0,0 +1,582 @@
+-- | Convert markdown to Pandoc document.
+module Text.Pandoc.Readers.Markdown ( 
+                                     readMarkdown 
+                                    ) where
+
+import Text.ParserCombinators.Pandoc
+import Text.Pandoc.Definition
+import Text.Pandoc.Readers.LaTeX ( rawLaTeXInline, rawLaTeXEnvironment )
+import Text.Pandoc.Shared 
+import Text.Pandoc.Readers.HTML ( rawHtmlInline, rawHtmlBlock, anyHtmlBlockTag, 
+                                               anyHtmlInlineTag )
+import Text.Pandoc.HtmlEntities ( decodeEntities )
+import Text.Regex ( matchRegex, mkRegex )
+import Text.ParserCombinators.Parsec
+
+-- | Read markdown from an input string and return a Pandoc document.
+readMarkdown :: ParserState -> String -> Pandoc
+readMarkdown = readWith parseMarkdown
+
+-- | Parse markdown string with default options and print result (for testing).
+testString :: String -> IO ()
+testString = testStringWith parseMarkdown 
+
+--
+-- Constants and data structure definitions
+--
+
+spaceChars = " \t"
+endLineChars = "\n"
+labelStart = '['
+labelEnd = ']'
+labelSep = ':'
+srcStart = '('
+srcEnd = ')'
+imageStart = '!'
+noteStart = '^'
+codeStart = '`'
+codeEnd = '`'
+emphStart = '*'
+emphEnd = '*'
+emphStartAlt = '_'
+emphEndAlt = '_'
+autoLinkStart = '<'
+autoLinkEnd = '>'
+mathStart = '$'
+mathEnd = '$'
+bulletListMarkers = "*+-"
+orderedListDelimiters = "."
+escapeChar = '\\'
+hruleChars = "*-_"
+quoteChars = "'\""
+atxHChar = '#'
+titleOpeners = "\"'("
+setextHChars = ['=','-']
+blockQuoteChar = '>'
+hyphenChar = '-'
+
+-- treat these as potentially non-text when parsing inline:
+specialChars = [escapeChar, labelStart, labelEnd, emphStart, emphEnd, emphStartAlt, 
+                emphEndAlt, codeStart, codeEnd, autoLinkEnd, autoLinkStart, mathStart, 
+                mathEnd, imageStart, noteStart, hyphenChar]
+
+--
+-- auxiliary functions
+--
+
+-- | Skip a single endline if there is one.
+skipEndline = option Space endline
+
+indentSpaces = do
+  state <- getState
+  let tabStop = stateTabStop state
+  oneOfStrings [ "\t", (replicate tabStop ' ') ] <?> "indentation"
+
+skipNonindentSpaces = do
+  state <- getState
+  let tabStop = stateTabStop state
+  choice (map (\n -> (try (count n (char ' ')))) (reverse [0..(tabStop - 1)]))
+
+--
+-- document structure
+--
+
+titleLine = try (do
+  char '%'
+  skipSpaces
+  line <- manyTill inline newline
+  return line)
+
+authorsLine = try (do
+  char '%'
+  skipSpaces
+  authors <- sepEndBy (many1 (noneOf ",;\n")) (oneOf ",;")
+  newline
+  return (map removeLeadingTrailingSpace authors))
+
+dateLine = try (do
+  char '%'
+  skipSpaces
+  date <- many (noneOf "\n")
+  newline
+  return (removeTrailingSpace date))
+
+titleBlock = try (do
+  title <- option [] titleLine
+  author <- option [] authorsLine
+  date <- option "" dateLine
+  option "" blanklines
+  return (title, author, date))
+
+parseMarkdown = do
+  updateState (\state -> state { stateParseRaw = True }) -- need to parse raw HTML
+  (title, author, date) <- option ([],[],"") titleBlock
+  blocks <- parseBlocks
+  state <- getState
+  let keys = reverse $ stateKeyBlocks state
+  return (Pandoc (Meta title author date) (blocks ++ keys))
+
+--
+-- parsing blocks
+--
+
+parseBlocks = do
+  result <- manyTill block eof
+  return result
+
+block = choice [ codeBlock, referenceKey, note, header, hrule, list, blockQuote, rawHtmlBlocks, 
+                 rawLaTeXEnvironment, para, plain, blankBlock, nullBlock ] <?> "block"
+
+--
+-- header blocks
+--
+
+header = choice [ setextHeader, atxHeader ] <?> "header"
+
+atxHeader = try (do
+  lead <- many1 (char atxHChar)
+  skipSpaces
+  txt <- many1 (do {notFollowedBy' atxClosing; inline})
+  atxClosing
+  return (Header (length lead) (normalizeSpaces txt)))
+
+atxClosing = try (do
+  skipMany (char atxHChar)
+  skipSpaces
+  newline
+  option "" blanklines)
+
+setextHeader = choice (map (\x -> setextH x) (enumFromTo 1 (length setextHChars)))
+
+setextH n = try (do
+    txt <- many1 (do {notFollowedBy newline; inline})
+    endline
+    many1 (char (setextHChars !! (n-1)))
+    skipSpaces
+    newline
+    option "" blanklines
+    return (Header n (normalizeSpaces txt)))
+
+--
+-- hrule block
+--
+
+hruleWith chr = 
+    try (do
+           skipSpaces
+           char chr
+           skipSpaces
+           char chr
+           skipSpaces
+           char chr
+           skipMany (oneOf (chr:spaceChars))
+           newline
+           option "" blanklines
+           return HorizontalRule)
+
+hrule = choice (map hruleWith hruleChars) <?> "hrule"
+
+--
+-- code blocks
+--
+
+indentedLine = try (do
+    indentSpaces
+    result <- manyTill anyChar newline
+    return (result ++ "\n"))
+
+-- two or more indented lines, possibly separated by blank lines
+indentedBlock = try (do 
+  res1 <- indentedLine
+  blanks <- many blankline 
+  res2 <- choice [indentedBlock, indentedLine]
+  return (res1 ++ blanks ++ res2))
+
+codeBlock = do
+    result <- choice [indentedBlock, indentedLine]
+    option "" blanklines
+    return (CodeBlock result)
+
+--
+-- note block
+--
+
+note = try (do
+    (NoteRef ref) <- noteRef 
+    skipSpaces
+    raw <- sepBy (many (choice [nonEndline, 
+                                (try (do {endline; notFollowedBy (char noteStart); return '\n'}))
+                               ])) (try (do {newline; char noteStart; option ' ' (char ' ')}))
+    newline
+    blanklines
+    -- parse the extracted block, which may contain various block elements:
+    state <- getState
+    let parsed = case runParser parseBlocks (state {stateParserContext = BlockQuoteState}) "block" ((joinWithSep "\n" raw) ++ "\n\n") of
+                   Left err -> error $ "Raw block:\n" ++ show raw ++ "\nError:\n" ++ show err
+                   Right result -> result
+    return (Note ref parsed))
+
+--
+-- block quotes
+--
+
+emacsBoxQuote = try (do
+    string ",----"
+    manyTill anyChar newline
+    raw <- manyTill (try (do{ char '|'; 
+                              option ' ' (char ' '); 
+                              result <- manyTill anyChar newline; 
+                              return result})) 
+                     (string "`----")
+    manyTill anyChar newline
+    option "" blanklines
+    return raw)
+
+emailBlockQuoteStart = try (do
+  skipNonindentSpaces
+  char blockQuoteChar
+  option ' ' (char ' ')
+  return "> ")
+
+emailBlockQuote = try (do
+    emailBlockQuoteStart
+    raw <- sepBy (many (choice [nonEndline, 
+                                (try (do{ endline; 
+                                          notFollowedBy' emailBlockQuoteStart;
+                                          return '\n'}))])) 
+           (try (do {newline; emailBlockQuoteStart}))
+    newline <|> (do{ eof; return '\n'})
+    option "" blanklines
+    return raw)
+
+blockQuote = do 
+    raw <- choice [ emailBlockQuote, emacsBoxQuote ]
+    -- parse the extracted block, which may contain various block elements:
+    state <- getState
+    let parsed = case runParser parseBlocks (state {stateParserContext = BlockQuoteState}) "block" ((joinWithSep "\n" raw) ++ "\n\n") of
+                   Left err -> error $ "Raw block:\n" ++ show raw ++ "\nError:\n" ++ show err
+                   Right result -> result
+    return (BlockQuote parsed)
+
+--
+-- list blocks
+--
+
+list = choice [ bulletList, orderedList ] <?> "list"
+
+bulletListStart = 
+    try (do
+           option ' ' newline -- if preceded by a Plain block in a list context
+           skipNonindentSpaces
+           notFollowedBy' hrule  -- because hrules start out just like lists
+           oneOf bulletListMarkers
+           spaceChar
+           skipSpaces)
+
+orderedListStart = 
+    try (do
+           option ' ' newline -- if preceded by a Plain block in a list context
+           skipNonindentSpaces
+           many1 digit
+           oneOf orderedListDelimiters
+           oneOf spaceChars
+           skipSpaces)
+
+-- parse a line of a list item (start = parser for beginning of list item)
+listLine start = try (do
+  notFollowedBy' start
+  notFollowedBy blankline
+  notFollowedBy' (try (do{ indentSpaces; 
+                           many (spaceChar);
+                           choice [bulletListStart, orderedListStart]}))
+  line <- manyTill anyChar newline
+  return (line ++ "\n"))
+
+-- parse raw text for one list item, excluding start marker and continuations
+rawListItem start = 
+    try (do
+           start
+           result <- many1 (listLine start)
+           blanks <- many blankline
+           return ((concat result) ++ blanks))
+
+-- continuation of a list item - indented and separated by blankline 
+-- or (in compact lists) endline.
+-- note: nested lists are parsed as continuations
+listContinuation start = 
+    try (do
+           followedBy' indentSpaces
+           result <- many1 (listContinuationLine start)
+           blanks <- many blankline
+           return ((concat result) ++ blanks))
+
+listContinuationLine start = try (do
+    notFollowedBy blankline
+    notFollowedBy' start
+    option "" indentSpaces
+    result <- manyTill anyChar newline
+    return (result ++ "\n"))
+
+listItem start = 
+    try (do 
+           first <- rawListItem start
+           rest <- many (listContinuation start)
+           -- parsing with ListItemState forces markers at beginning of lines to
+           -- count as list item markers, even if not separated by blank space.
+           -- see definition of "endline"
+           state <- getState
+           let parsed = case runParser parseBlocks (state {stateParserContext = ListItemState}) 
+                        "block" raw of
+                          Left err -> error $ "Raw block:\n" ++ raw ++ "\nError:\n" ++ show err
+                          Right result -> result
+                   where raw = concat (first:rest) 
+           return parsed)
+
+orderedList = 
+    try (do
+           items <- many1 (listItem orderedListStart)
+           let items' = compactify items
+           return (OrderedList items'))
+
+bulletList = 
+    try (do
+           items <- many1 (listItem bulletListStart)
+           let items' = compactify items
+           return (BulletList items'))
+
+--
+-- paragraph block
+--
+
+para = try (do 
+  result <- many1 inline
+  newline
+  choice [ (do{ followedBy' (oneOfStrings [">", ",----"]); return "" }), blanklines ]  
+  let result' = normalizeSpaces result
+  return (Para result'))
+
+plain = do
+  result <- many1 inline
+  let result' = normalizeSpaces result
+  return (Plain result')
+
+-- 
+-- raw html
+--
+
+rawHtmlBlocks = try (do
+   htmlBlocks <- many1 rawHtmlBlock    
+   let combined = concatMap (\(RawHtml str) -> str) htmlBlocks
+   let combined' = if (last combined == '\n') then 
+                       init combined  -- strip extra newline 
+                   else 
+                       combined 
+   return (RawHtml combined'))
+
+-- 
+-- reference key
+--
+
+referenceKey = 
+    try (do
+           skipSpaces
+           label <- reference
+           char labelSep
+           skipSpaces
+           option ' ' (char autoLinkStart)
+           src <- many (noneOf (titleOpeners ++ [autoLinkEnd] ++ endLineChars))
+           option ' ' (char autoLinkEnd)
+           tit <- option "" title 
+           blanklines 
+           return (Key label (Src (removeTrailingSpace src) tit))) 
+
+-- 
+-- inline
+--
+
+text = choice [ math, strong, emph, code2, code1, str, linebreak, tabchar, 
+                whitespace, endline ] <?> "text"
+
+inline = choice [ rawLaTeXInline, escapedChar, special, hyphens, text, ltSign, symbol ] <?> "inline"
+
+special = choice [ link, referenceLink, rawHtmlInline, autoLink, 
+                   image, noteRef ] <?> "link, inline html, note, or image"
+
+escapedChar = escaped anyChar
+
+ltSign = do
+  notFollowedBy' rawHtmlBlocks -- don't return < if it starts html
+  char '<'
+  return (Str ['<'])
+
+specialCharsMinusLt = filter (/= '<') specialChars
+
+symbol = do 
+  result <- oneOf specialCharsMinusLt
+  return (Str [result])
+
+hyphens = try (do
+  result <- many1 (char '-')
+  if (length result) == 1 then
+      skipEndline   -- don't want to treat endline after hyphen as a space
+    else
+      do{ string ""; return Space }
+  return (Str result))
+
+-- parses inline code, between codeStart and codeEnd
+code1 = 
+    try (do 
+           char codeStart
+           result <- many (noneOf [codeEnd])
+           char codeEnd
+           let result' = removeLeadingTrailingSpace $ joinWithSep " " $ lines result -- get rid of any internal newlines
+           return (Code result'))
+
+-- parses inline code, between 2 codeStarts and 2 codeEnds
+code2 = 
+    try (do
+           string [codeStart, codeStart]
+           result <- manyTill anyChar (try (string [codeEnd, codeEnd]))
+           let result' = removeLeadingTrailingSpace $ joinWithSep " " $ lines result -- get rid of any internal newlines
+           return (Code result'))
+
+mathWord = many1 (choice [(noneOf (" \t\n\\" ++ [mathEnd])), (try (do {c <- char '\\'; notFollowedBy (char mathEnd); return c}))])
+
+math = try (do
+  char mathStart
+  notFollowedBy space
+  words <- sepBy1 mathWord (many1 space)
+  char mathEnd
+  return (TeX ("$" ++ (joinWithSep " " words) ++ "$")))
+
+emph = do
+  result <- choice [ (enclosed (char emphStart) (char emphEnd) inline), 
+                      (enclosed (char emphStartAlt) (char emphEndAlt) inline) ]
+  return (Emph (normalizeSpaces result))
+
+strong = do
+  result <- choice [ (enclosed (count 2 (char emphStart)) (count 2 (char emphEnd)) inline), 
+                     (enclosed (count 2 (char emphStartAlt)) (count 2 (char emphEndAlt)) inline)]
+  return (Strong (normalizeSpaces result))
+
+whitespace = do
+  many1 (oneOf spaceChars) <?> "whitespace"
+  return Space
+
+tabchar = do
+  tab
+  return (Str "\t")
+
+-- hard line break
+linebreak = try (do
+  oneOf spaceChars
+  many1 (oneOf spaceChars) 
+  endline
+  return LineBreak )
+
+nonEndline = noneOf endLineChars
+
+str = do 
+  result <- many1 ((noneOf (specialChars ++ spaceChars ++ endLineChars))) 
+  return (Str (decodeEntities result))
+
+-- an endline character that can be treated as a space, not a structural break
+endline =
+    try (do
+           newline
+           -- next line would allow block quotes without preceding blank line
+           -- Markdown.pl does allow this, but there's a chance of a wrapped
+           -- greater-than sign triggering a block quote by accident...
+--         notFollowedBy (try (do { choice [emailBlockQuoteStart, string ",----"]; return ' ' }))  
+           notFollowedBy blankline
+           -- parse potential list starts at beginning of line differently if in a list:
+           st <- getState
+           if (stateParserContext st) == ListItemState then 
+               do
+                 notFollowedBy' orderedListStart
+                 notFollowedBy' bulletListStart
+             else
+               option () pzero
+           return Space)
+
+--
+-- links
+--
+
+-- a reference label for a link
+reference = do
+  char labelStart
+  label <- manyTill inline (char labelEnd)
+  return (normalizeSpaces label)
+
+-- source for a link, with optional title
+source = 
+    try (do 
+           char srcStart
+           option ' ' (char autoLinkStart)
+           src <- many (noneOf ([srcEnd, autoLinkEnd] ++ titleOpeners))
+           option ' ' (char autoLinkEnd)
+           tit <- option "" title
+           skipSpaces
+           char srcEnd
+           return (Src (removeTrailingSpace src) tit))
+
+titleWith startChar endChar =
+    try (do
+           skipSpaces
+           skipEndline  -- a title can be on the next line from the source
+           skipSpaces
+           char startChar
+           tit <- manyTill (choice [ try (do {char '\\'; char endChar}), 
+                                     (noneOf (endChar:endLineChars)) ]) (char endChar) 
+           let tit' = gsub "\"" "&quot;" tit
+           return tit')
+
+title = choice [titleWith '(' ')', titleWith '"' '"', titleWith '\'' '\''] <?> "title"
+
+link = choice [explicitLink, referenceLink] <?> "link"
+
+explicitLink = 
+    try (do
+           label <- reference
+           src <- source 
+           return (Link label src)) 
+
+referenceLink = choice [referenceLinkDouble, referenceLinkSingle]
+
+referenceLinkDouble =     -- a link like [this][/url/]
+    try (do
+           label <- reference
+           skipSpaces
+           skipEndline
+           skipSpaces
+           ref <- reference 
+           return (Link label (Ref ref))) 
+
+referenceLinkSingle =     -- a link like [this]
+    try (do
+           label <- reference
+           return (Link label (Ref []))) 
+
+autoLink =                -- a link <like.this.com>
+    try (do
+           notFollowedBy (do {anyHtmlBlockTag; return ' '})
+           src <- between (char autoLinkStart) (char autoLinkEnd) 
+                  (many (noneOf (spaceChars ++ endLineChars ++ [autoLinkEnd])))
+           case (matchRegex emailAddress src) of
+             Just _  -> return (Link [Str src] (Src ("mailto:" ++ src) ""))
+             Nothing -> return (Link [Str src] (Src src ""))) 
+
+emailAddress = mkRegex "([^@:/]+)@(([^.]+[.]?)*([^.]+))"  -- presupposes no whitespace
+
+image = 
+    try (do
+           char imageStart
+           (Link label src) <- link
+           return (Image label src)) 
+
+noteRef = try (do
+    char noteStart
+    ref <- between (char '(') (char ')') (many1 (noneOf " \t\n)"))
+    return (NoteRef ref))
+