aboutsummaryrefslogtreecommitdiff
path: root/src/Text/Pandoc/Readers
diff options
context:
space:
mode:
authorfiddlosopher <fiddlosopher@788f1e2b-df1e-0410-8736-df70ead52e1b>2006-10-17 14:22:29 +0000
committerfiddlosopher <fiddlosopher@788f1e2b-df1e-0410-8736-df70ead52e1b>2006-10-17 14:22:29 +0000
commitdf7b68225101966051f8b592a27127bf789eb81e (patch)
treea063e97ed58d0bdb2cbb5a95c3e8c1bcce54aa00 /src/Text/Pandoc/Readers
parente7dbfef4d8aa528d9245424e9c372e900a774c90 (diff)
downloadpandoc-df7b68225101966051f8b592a27127bf789eb81e.tar.gz
initial import
git-svn-id: https://pandoc.googlecode.com/svn/trunk@2 788f1e2b-df1e-0410-8736-df70ead52e1b
Diffstat (limited to 'src/Text/Pandoc/Readers')
-rw-r--r--src/Text/Pandoc/Readers/HTML.hs434
-rw-r--r--src/Text/Pandoc/Readers/LaTeX.hs585
-rw-r--r--src/Text/Pandoc/Readers/Markdown.hs582
-rw-r--r--src/Text/Pandoc/Readers/RST.hs644
4 files changed, 2245 insertions, 0 deletions
diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs
new file mode 100644
index 000000000..054d9eb72
--- /dev/null
+++ b/src/Text/Pandoc/Readers/HTML.hs
@@ -0,0 +1,434 @@
+-- | Converts HTML to 'Pandoc' document.
+module Text.Pandoc.Readers.HTML (
+ readHtml,
+ rawHtmlInline,
+ rawHtmlBlock,
+ anyHtmlBlockTag,
+ anyHtmlInlineTag
+ ) where
+
+import Text.Regex ( matchRegex, mkRegex )
+import Text.ParserCombinators.Parsec
+import Text.ParserCombinators.Pandoc
+import Text.Pandoc.Definition
+import Text.Pandoc.Shared
+import Text.Pandoc.HtmlEntities ( decodeEntities, htmlEntityToChar )
+import Maybe ( fromMaybe )
+import Char ( toUpper, toLower )
+
+-- | Convert HTML-formatted string to 'Pandoc' document.
+readHtml :: ParserState -- ^ Parser state
+ -> String -- ^ String to parse
+ -> Pandoc
+readHtml = readWith parseHtml
+
+-- for testing
+testString :: String -> IO ()
+testString = testStringWith parseHtml
+
+--
+-- Constants
+--
+
+inlineHtmlTags = ["a", "abbr", "acronym", "b", "basefont", "bdo", "big", "br", "cite",
+ "code", "dfn", "em", "font", "i", "img", "input", "kbd", "label", "q",
+ "s", "samp", "select", "small", "span", "strike", "strong", "sub",
+ "sup", "textarea", "tt", "u", "var"]
+
+--
+-- HTML utility functions
+--
+
+-- | Read blocks until end tag.
+blocksTilEnd tag = try (do
+ blocks <- manyTill (do {b <- block; spaces; return b}) (htmlEndTag tag)
+ return blocks)
+
+-- | Read inlines until end tag.
+inlinesTilEnd tag = try (do
+ inlines <- manyTill inline (htmlEndTag tag)
+ return inlines)
+
+-- extract type from a tag: e.g. br from <br>, < br >, </br>, etc.
+extractTagType tag = case (matchRegex (mkRegex "<[[:space:]]*/?([A-Za-z0-9]+)") tag) of
+ Just [match] -> (map toLower match)
+ Nothing -> ""
+
+anyHtmlTag = try (do
+ char '<'
+ spaces
+ tag <- many1 alphaNum
+ attribs <- htmlAttributes
+ spaces
+ ender <- option "" (string "/")
+ let ender' = if (null ender) then "" else " /"
+ spaces
+ char '>'
+ return ("<" ++ tag ++ attribs ++ ender' ++ ">"))
+
+anyHtmlEndTag = try (do
+ char '<'
+ spaces
+ char '/'
+ spaces
+ tagType <- many1 alphaNum
+ spaces
+ char '>'
+ return ("</" ++ tagType ++ ">"))
+
+htmlTag :: String -> GenParser Char st (String, [(String, String)])
+htmlTag tag = try (do
+ char '<'
+ spaces
+ stringAnyCase tag
+ attribs <- many htmlAttribute
+ spaces
+ option "" (string "/")
+ spaces
+ char '>'
+ return (tag, (map (\(name, content, raw) -> (name, content)) attribs)))
+
+-- parses a quoted html attribute value
+quoted quoteChar = do
+ result <- between (char quoteChar) (char quoteChar) (many (noneOf [quoteChar]))
+ return (result, [quoteChar])
+
+htmlAttributes = do
+ attrList <- many htmlAttribute
+ return (concatMap (\(name, content, raw) -> raw) attrList)
+
+htmlAttribute = htmlRegularAttribute <|> htmlMinimizedAttribute
+
+-- minimized boolean attribute (no = and value)
+htmlMinimizedAttribute = try (do
+ spaces
+ name <- many1 (choice [letter, oneOf ".-_:"])
+ spaces
+ notFollowedBy (char '=')
+ let content = name
+ return (name, content, (" " ++ name)))
+
+htmlRegularAttribute = try (do
+ spaces
+ name <- many1 (choice [letter, oneOf ".-_:"])
+ spaces
+ char '='
+ spaces
+ (content, quoteStr) <- choice [ (quoted '\''),
+ (quoted '"'),
+ (do{ a <- (many (alphaNum <|> (oneOf "-._:")));
+ return (a,"")} ) ]
+ return (name, content, (" " ++ name ++ "=" ++ quoteStr ++ content ++ quoteStr)))
+
+htmlEndTag tag = try (do
+ char '<'
+ spaces
+ char '/'
+ spaces
+ stringAnyCase tag
+ spaces
+ char '>'
+ return ("</" ++ tag ++ ">"))
+
+-- | Returns @True@ if the tag is an inline tag.
+isInline tag = (extractTagType tag) `elem` inlineHtmlTags
+
+anyHtmlBlockTag = try (do
+ tag <- choice [anyHtmlTag, anyHtmlEndTag]
+ if isInline tag then
+ fail "inline tag"
+ else
+ return tag)
+
+anyHtmlInlineTag = try (do
+ tag <- choice [ anyHtmlTag, anyHtmlEndTag ]
+ if isInline tag then
+ return tag
+ else
+ fail "not an inline tag")
+
+-- scripts must be treated differently, because they can contain <> etc.
+htmlScript = try (do
+ open <- string "<script"
+ rest <- manyTill anyChar (htmlEndTag "script")
+ return (open ++ rest ++ "</script>"))
+
+rawHtmlBlock = do
+ notFollowedBy (do {choice [htmlTag "/body", htmlTag "/html"]; return ' '})
+ body <- choice [htmlScript, anyHtmlBlockTag, htmlComment, xmlDec, definition]
+ sp <- (many space)
+ state <- getState
+ if stateParseRaw state then
+ return (RawHtml (body ++ sp))
+ else
+ return Null
+
+htmlComment = try (do
+ string "<!--"
+ comment <- manyTill anyChar (try (string "-->"))
+ return ("<!--" ++ comment ++ "-->"))
+
+--
+-- parsing documents
+--
+
+xmlDec = try (do
+ string "<?"
+ rest <- manyTill anyChar (char '>')
+ return ("<?" ++ rest ++ ">"))
+
+definition = try (do
+ string "<!"
+ rest <- manyTill anyChar (char '>')
+ return ("<!" ++ rest ++ ">"))
+
+nonTitleNonHead = try (do
+ notFollowedBy' (htmlTag "title")
+ notFollowedBy' (htmlTag "/head")
+ result <- choice [do {rawHtmlBlock; return ' '}, anyChar]
+ return result)
+
+parseTitle = try (do
+ (tag, attribs) <- htmlTag "title"
+ contents <- inlinesTilEnd tag
+ spaces
+ return contents)
+
+-- parse header and return meta-information (for now, just title)
+parseHead = try (do
+ htmlTag "head"
+ spaces
+ skipMany nonTitleNonHead
+ contents <- option [] parseTitle
+ skipMany nonTitleNonHead
+ htmlTag "/head"
+ return (contents, [], ""))
+
+skipHtmlTag tag = option ("",[]) (htmlTag tag)
+
+-- h1 class="title" representation of title in body
+bodyTitle = try (do
+ (tag, attribs) <- htmlTag "h1"
+ cl <- case (extractAttribute "class" attribs) of
+ Just "title" -> do {return ""}
+ otherwise -> fail "not title"
+ inlinesTilEnd "h1"
+ return "")
+
+parseHtml = do
+ sepEndBy (choice [xmlDec, definition, htmlComment]) spaces
+ skipHtmlTag "html"
+ spaces
+ (title, authors, date) <- option ([], [], "") parseHead
+ spaces
+ skipHtmlTag "body"
+ spaces
+ option "" bodyTitle -- skip title in body, because it's represented in meta
+ blocks <- parseBlocks
+ spaces
+ option "" (htmlEndTag "body")
+ spaces
+ option "" (htmlEndTag "html")
+ many anyChar -- ignore anything after </html>
+ eof
+ state <- getState
+ let keyBlocks = stateKeyBlocks state
+ return (Pandoc (Meta title authors date) (blocks ++ (reverse keyBlocks)))
+
+--
+-- parsing blocks
+--
+
+parseBlocks = do
+ spaces
+ result <- sepEndBy block spaces
+ return result
+
+block = choice [ codeBlock, header, hrule, list, blockQuote, para, plain,
+ rawHtmlBlock ] <?> "block"
+
+--
+-- header blocks
+--
+
+header = choice (map headerLevel (enumFromTo 1 5)) <?> "header"
+
+headerLevel n = try (do
+ let level = "h" ++ show n
+ (tag, attribs) <- htmlTag level
+ contents <- inlinesTilEnd level
+ return (Header n (normalizeSpaces contents)))
+
+--
+-- hrule block
+--
+
+hrule = try (do
+ (tag, attribs) <- htmlTag "hr"
+ state <- getState
+ if (not (null attribs)) && (stateParseRaw state) then
+ unexpected "attributes in hr" -- in this case we want to parse it as raw html
+ else
+ return HorizontalRule)
+
+--
+-- code blocks
+--
+
+codeBlock = choice [ preCodeBlock, bareCodeBlock ] <?> "code block"
+
+preCodeBlock = try (do
+ htmlTag "pre"
+ spaces
+ htmlTag "code"
+ result <- manyTill anyChar (htmlEndTag "code")
+ spaces
+ htmlEndTag "pre"
+ return (CodeBlock (decodeEntities result)))
+
+bareCodeBlock = try (do
+ htmlTag "code"
+ result <- manyTill anyChar (htmlEndTag "code")
+ return (CodeBlock (decodeEntities result)))
+
+--
+-- block quotes
+--
+
+blockQuote = try (do
+ tag <- htmlTag "blockquote"
+ spaces
+ blocks <- blocksTilEnd "blockquote"
+ return (BlockQuote blocks))
+
+--
+-- list blocks
+--
+
+list = choice [ bulletList, orderedList ] <?> "list"
+
+orderedList = try (do
+ tag <- htmlTag "ol"
+ spaces
+ items <- sepEndBy1 listItem spaces
+ htmlEndTag "ol"
+ return (OrderedList items))
+
+bulletList = try (do
+ tag <- htmlTag "ul"
+ spaces
+ items <- sepEndBy1 listItem spaces
+ htmlEndTag "ul"
+ return (BulletList items))
+
+listItem = try (do
+ tag <- htmlTag "li"
+ spaces
+ blocks <- blocksTilEnd "li"
+ return blocks)
+
+--
+-- paragraph block
+--
+
+para = try (do
+ tag <- htmlTag "p"
+ result <- inlinesTilEnd "p"
+ return (Para (normalizeSpaces result)))
+
+--
+-- plain block
+--
+
+plain = do
+ result <- many1 inline
+ return (Plain (normalizeSpaces result))
+
+--
+-- inline
+--
+
+inline = choice [ text, special ] <?> "inline"
+
+text = choice [ entity, strong, emph, code, str, linebreak, whitespace ] <?> "text"
+
+special = choice [ link, image, rawHtmlInline ] <?> "link, inline html, or image"
+
+entity = try (do
+ char '&'
+ body <- choice [(many1 letter),
+ (try (do{ char '#'; num <- many1 digit; return ("#" ++ num)}))]
+ char ';'
+ return (Str [fromMaybe '?' (htmlEntityToChar ("&" ++ body ++ ";"))]))
+
+code = try (do
+ htmlTag "code"
+ result <- manyTill anyChar (htmlEndTag "code")
+ -- remove internal line breaks, leading and trailing space, and decode entities
+ let result' = decodeEntities $ removeLeadingTrailingSpace $ joinWithSep " " $ lines result
+ return (Code result'))
+
+rawHtmlInline = do
+ result <- choice [htmlScript, anyHtmlInlineTag]
+ state <- getState
+ if stateParseRaw state then
+ return (HtmlInline result)
+ else
+ return (Str "")
+
+betweenTags tag = try (do
+ htmlTag tag
+ result <- inlinesTilEnd tag
+ return (normalizeSpaces result))
+
+emph = try (do
+ result <- choice [betweenTags "em", betweenTags "it"]
+ return (Emph result))
+
+strong = try (do
+ result <- choice [betweenTags "b", betweenTags "strong"]
+ return (Strong result))
+
+whitespace = do
+ many1 space
+ return Space
+
+-- hard line break
+linebreak = do
+ htmlTag "br"
+ return LineBreak
+
+str = do
+ result <- many1 (noneOf "<& \t\n")
+ return (Str (decodeEntities result))
+
+--
+-- links and images
+--
+
+-- extract contents of attribute (attribute names are case-insensitive)
+extractAttribute name [] = Nothing
+extractAttribute name ((attrName, contents):rest) =
+ let name' = map toLower name
+ attrName' = map toLower attrName in
+ if (attrName' == name') then Just contents else extractAttribute name rest
+
+link = try (do
+ (tag, attributes) <- htmlTag "a"
+ url <- case (extractAttribute "href" attributes) of
+ Just url -> do {return url}
+ Nothing -> fail "no href"
+ let title = fromMaybe "" (extractAttribute "title" attributes)
+ label <- inlinesTilEnd "a"
+ ref <- generateReference url title
+ return (Link (normalizeSpaces label) ref))
+
+image = try (do
+ (tag, attributes) <- htmlTag "img"
+ url <- case (extractAttribute "src" attributes) of
+ Just url -> do {return url}
+ Nothing -> fail "no src"
+ let title = fromMaybe "" (extractAttribute "title" attributes)
+ let alt = fromMaybe "" (extractAttribute "alt" attributes)
+ ref <- generateReference url title
+ return (Image [Str alt] ref))
diff --git a/src/Text/Pandoc/Readers/LaTeX.hs b/src/Text/Pandoc/Readers/LaTeX.hs
new file mode 100644
index 000000000..3bf3dfd23
--- /dev/null
+++ b/src/Text/Pandoc/Readers/LaTeX.hs
@@ -0,0 +1,585 @@
+-- | Converts LaTeX to 'Pandoc' document.
+module Text.Pandoc.Readers.LaTeX (
+ readLaTeX,
+ rawLaTeXInline,
+ rawLaTeXEnvironment
+ ) where
+
+import Text.ParserCombinators.Parsec
+import Text.ParserCombinators.Pandoc
+import Text.Pandoc.Definition
+import Text.Pandoc.Shared
+import Maybe ( fromMaybe )
+import Char ( chr )
+
+-- | Parse LaTeX from string and return 'Pandoc' document.
+readLaTeX :: ParserState -- ^ Parser state, including options for parser
+ -> String -- ^ String to parse
+ -> Pandoc
+readLaTeX = readWith parseLaTeX
+
+-- for testing
+testString = testStringWith parseLaTeX
+
+-- characters with special meaning
+specialChars = "\\$%&^&_~#{}\n \t|<>"
+
+--
+-- utility functions
+--
+
+-- | Change quotation marks in a string back to "basic" quotes.
+normalizeQuotes :: String -> String
+normalizeQuotes = gsub "''" "\"" . gsub "`" "'"
+
+-- | Change LaTeX En dashes between digits to hyphens.
+normalizeDashes :: String -> String
+normalizeDashes = gsub "([0-9])--([0-9])" "\\1-\\2"
+
+normalizePunctuation :: String -> String
+normalizePunctuation = normalizeDashes . normalizeQuotes
+
+-- | Returns command option (between []) if any, or empty string.
+commandOpt = option "" (between (char '[') (char ']') (many1 (noneOf "]")))
+
+-- | Returns text between brackets and its matching pair.
+bracketedText = try (do
+ char '{'
+ result <- many (choice [ try (do{ char '\\';
+ b <- oneOf "{}";
+ return (['\\', b])}), -- escaped bracket
+ count 1 (noneOf "{}"),
+ do {text <- bracketedText; return ("{" ++ text ++ "}")} ])
+ char '}'
+ return (concat result))
+
+-- | Parses list of arguments of LaTeX command.
+commandArgs = many bracketedText
+
+-- | Parses LaTeX command, returns (name, star, option, list of arguments).
+command = try (do
+ char '\\'
+ name <- many1 alphaNum
+ star <- option "" (string "*") -- some commands have starred versions
+ opt <- commandOpt
+ args <- commandArgs
+ return (name, star, opt, args))
+
+begin name = try (do
+ string "\\begin{"
+ string name
+ char '}'
+ option "" commandOpt
+ option [] commandArgs
+ spaces
+ return name)
+
+end name = try (do
+ string "\\end{"
+ string name
+ char '}'
+ spaces
+ return name)
+
+-- | Returns a list of block elements containing the contents of an environment.
+environment name = try (do
+ begin name
+ spaces
+ contents <- manyTill block (end name)
+ return contents)
+
+anyEnvironment = try (do
+ string "\\begin{"
+ name <- many alphaNum
+ star <- option "" (string "*") -- some environments have starred variants
+ char '}'
+ option "" commandOpt
+ option [] commandArgs
+ spaces
+ contents <- manyTill block (end (name ++ star))
+ return (BlockQuote contents))
+
+--
+-- parsing documents
+--
+
+-- | Skip everything up through \begin{document}
+skipLaTeXHeader = try (do
+ manyTill anyChar (begin "document")
+ spaces
+ return "")
+
+-- | Parse LaTeX and return 'Pandoc'.
+parseLaTeX = do
+ option "" skipLaTeXHeader -- if parsing a fragment, this might not be present
+ blocks <- parseBlocks
+ spaces
+ option "" (string "\\end{document}") -- if parsing a fragment, this might not be present
+ spaces
+ eof
+ state <- getState
+ let keyBlocks = stateKeyBlocks state
+ let noteBlocks = stateNoteBlocks state
+ let blocks' = filter (/= Null) blocks
+ return (Pandoc (Meta [] [] "") (blocks' ++ (reverse noteBlocks) ++ (reverse keyBlocks)))
+
+--
+-- parsing blocks
+--
+
+parseBlocks = do
+ spaces
+ result <- many block
+ return result
+
+block = choice [ hrule, codeBlock, header, list, blockQuote, mathBlock, comment,
+ bibliographic, para, specialEnvironment, itemBlock, unknownEnvironment,
+ unknownCommand ] <?> "block"
+
+--
+-- header blocks
+--
+
+header = choice (map headerLevel (enumFromTo 1 5)) <?> "header"
+
+headerLevel n = try (do
+ let subs = concat $ replicate (n - 1) "sub"
+ string ("\\" ++ subs ++ "section")
+ option ' ' (char '*')
+ char '{'
+ title <- manyTill inline (char '}')
+ spaces
+ return (Header n (normalizeSpaces title)))
+
+--
+-- hrule block
+--
+
+hrule = try (do
+ oneOfStrings [ "\\begin{center}\\rule{3in}{0.4pt}\\end{center}\n\n", "\\newpage" ]
+ spaces
+ return HorizontalRule)
+
+--
+-- code blocks
+--
+
+codeBlock = try (do
+ string "\\begin{verbatim}" -- don't use begin function because it gobbles whitespace
+ option "" blanklines -- we want to gobble blank lines, but not leading space
+ contents <- manyTill anyChar (try (string "\\end{verbatim}"))
+ spaces
+ return (CodeBlock (stripTrailingNewlines contents)))
+
+--
+-- block quotes
+--
+
+blockQuote = choice [ blockQuote1, blockQuote2 ] <?> "blockquote"
+
+blockQuote1 = try (do
+ blocks <- environment "quote"
+ spaces
+ return (BlockQuote blocks))
+
+blockQuote2 = try (do
+ blocks <- environment "quotation"
+ spaces
+ return (BlockQuote blocks))
+
+--
+-- math block
+--
+
+mathBlock = mathBlockWith (begin "equation") (end "equation") <|>
+ mathBlockWith (begin "displaymath") (end "displaymath") <|>
+ mathBlockWith (string "\\[") (string "\\]") <?> "math block"
+
+mathBlockWith start end = try (do
+ start
+ spaces
+ result <- manyTill anyChar end
+ spaces
+ return (BlockQuote [Para [TeX ("$" ++ result ++ "$")]]))
+
+--
+-- list blocks
+--
+
+list = bulletList <|> orderedList <?> "list"
+
+listItem = try (do
+ ("item", _, _, _) <- command
+ spaces
+ state <- getState
+ let oldParserContext = stateParserContext state
+ updateState (\state -> state {stateParserContext = ListItemState})
+ blocks <- many block
+ updateState (\state -> state {stateParserContext = oldParserContext})
+ return blocks)
+
+orderedList = try (do
+ begin "enumerate"
+ spaces
+ items <- many listItem
+ end "enumerate"
+ spaces
+ return (OrderedList items))
+
+bulletList = try (do
+ begin "itemize"
+ spaces
+ items <- many listItem
+ end "itemize"
+ spaces
+ return (BulletList items))
+
+--
+-- paragraph block
+--
+
+para = try (do
+ result <- many1 inline
+ spaces
+ return (Para (normalizeSpaces result)))
+
+--
+-- title authors date
+--
+
+bibliographic = choice [ maketitle, title, authors, date ]
+
+maketitle = try (do
+ string "\\maketitle"
+ spaces
+ return Null)
+
+title = try (do
+ string "\\title{"
+ tit <- manyTill inline (char '}')
+ spaces
+ updateState (\state -> state { stateTitle = tit })
+ return Null)
+
+authors = try (do
+ string "\\author{"
+ authors <- manyTill anyChar (char '}')
+ spaces
+ let authors' = map removeLeadingTrailingSpace $ lines $ gsub "\\\\" "\n" authors
+ updateState (\state -> state { stateAuthors = authors' })
+ return Null)
+
+date = try (do
+ string "\\date{"
+ date' <- manyTill anyChar (char '}')
+ spaces
+ updateState (\state -> state { stateDate = date' })
+ return Null)
+
+--
+-- item block
+-- for use in unknown environments that aren't being parsed as raw latex
+--
+
+-- this forces items to be parsed in different blocks
+itemBlock = try (do
+ ("item", _, opt, _) <- command
+ state <- getState
+ if (stateParserContext state == ListItemState) then
+ fail "item should be handled by list block"
+ else
+ if null opt then
+ return Null
+ else
+ return (Plain [Str opt]))
+
+--
+-- raw LaTeX
+--
+
+specialEnvironment = do -- these are always parsed as raw
+ followedBy' (choice (map (\name -> begin name) ["tabular", "figure", "tabbing", "eqnarry",
+ "picture", "table", "verse", "theorem"]))
+ rawLaTeXEnvironment
+
+-- | Parse any LaTeX environment and return a Para block containing
+-- the whole literal environment as raw TeX.
+rawLaTeXEnvironment :: GenParser Char st Block
+rawLaTeXEnvironment = try (do
+ string "\\begin"
+ char '{'
+ name <- many1 alphaNum
+ star <- option "" (string "*") -- for starred variants
+ let name' = name ++ star
+ char '}'
+ opt <- option "" commandOpt
+ args <- option [] commandArgs
+ let optStr = if (null opt) then "" else "[" ++ opt ++ "]"
+ let argStr = concatMap (\arg -> ("{" ++ arg ++ "}")) args
+ contents <- manyTill (choice [(many1 (noneOf "\\")),
+ (do{ (Para [TeX str]) <- rawLaTeXEnvironment; return str }),
+ string "\\"]) (end name')
+ spaces
+ return (Para [TeX ("\\begin{" ++ name' ++ "}" ++ optStr ++ argStr ++
+ (concat contents) ++ "\\end{" ++ name' ++ "}")]))
+
+unknownEnvironment = try (do
+ state <- getState
+ result <- if stateParseRaw state then -- check to see whether we should include raw TeX
+ rawLaTeXEnvironment -- if so, get the whole raw environment
+ else
+ anyEnvironment -- otherwise just the contents
+ return result)
+
+unknownCommand = try (do
+ notFollowedBy' (string "\\end{itemize}")
+ notFollowedBy' (string "\\end{enumerate}")
+ notFollowedBy' (string "\\end{document}")
+ (name, star, opt, args) <- command
+ spaces
+ let optStr = if null opt then "" else "[" ++ opt ++ "]"
+ let argStr = concatMap (\arg -> ("{" ++ arg ++ "}")) args
+ state <- getState
+ if (name == "item") && ((stateParserContext state) == ListItemState) then
+ fail "should not be parsed as raw"
+ else
+ string ""
+ if stateParseRaw state then
+ return (Plain [TeX ("\\" ++ name ++ star ++ optStr ++ argStr)])
+ else
+ return (Plain [Str (joinWithSep " " args)]))
+
+-- latex comment
+comment = try (do
+ char '%'
+ result <- manyTill anyChar newline
+ spaces
+ return Null)
+
+--
+-- inline
+--
+
+inline = choice [ strong, emph, ref, lab, code, linebreak, math, ldots, accentedChar,
+ specialChar, specialInline, escapedChar, unescapedChar, str,
+ endline, whitespace ] <?> "inline"
+
+specialInline = choice [ link, image, footnote, rawLaTeXInline ] <?>
+ "link, raw TeX, note, or image"
+
+ldots = try (do
+ string "\\ldots"
+ return (Str "..."))
+
+accentedChar = normalAccentedChar <|> specialAccentedChar
+
+normalAccentedChar = try (do
+ char '\\'
+ accent <- oneOf "'`^\"~"
+ character <- choice [ between (char '{') (char '}') anyChar, anyChar ]
+ let table = fromMaybe [] $ lookup character accentTable
+ let result = case lookup accent table of
+ Just num -> chr num
+ Nothing -> '?'
+ return (Str [result]))
+
+-- an association list of letters and association list of accents
+-- and decimal character numbers.
+accentTable =
+ [ ('A', [('`', 192), ('\'', 193), ('^', 194), ('~', 195), ('"', 196)]),
+ ('E', [('`', 200), ('\'', 201), ('^', 202), ('"', 203)]),
+ ('I', [('`', 204), ('\'', 205), ('^', 206), ('"', 207)]),
+ ('N', [('~', 209)]),
+ ('O', [('`', 210), ('\'', 211), ('^', 212), ('~', 213), ('"', 214)]),
+ ('U', [('`', 217), ('\'', 218), ('^', 219), ('"', 220)]),
+ ('a', [('`', 224), ('\'', 225), ('^', 227), ('"', 228)]),
+ ('e', [('`', 232), ('\'', 233), ('^', 234), ('"', 235)]),
+ ('i', [('`', 236), ('\'', 237), ('^', 238), ('"', 239)]),
+ ('n', [('~', 241)]),
+ ('o', [('`', 242), ('\'', 243), ('^', 244), ('~', 245), ('"', 246)]),
+ ('u', [('`', 249), ('\'', 250), ('^', 251), ('"', 252)]) ]
+
+specialAccentedChar = choice [ ccedil, aring, iuml, szlig, aelig, oslash, pound,
+ euro, copyright, sect ]
+
+ccedil = try (do
+ char '\\'
+ letter <- choice [try (string "cc"), try (string "cC")]
+ let num = if letter == "cc" then 231 else 199
+ return (Str [chr num]))
+
+aring = try (do
+ char '\\'
+ letter <- choice [try (string "aa"), try (string "AA")]
+ let num = if letter == "aa" then 229 else 197
+ return (Str [chr num]))
+
+iuml = try (do
+ string "\\\""
+ choice [try (string "\\i"), try (string "{\\i}")]
+ return (Str [chr 239]))
+
+icirc = try (do
+ string "\\^"
+ choice [try (string "\\i"), try (string "{\\i}")]
+ return (Str [chr 238]))
+
+szlig = try (do
+ string "\\ss"
+ return (Str [chr 223]))
+
+oslash = try (do
+ char '\\'
+ letter <- choice [char 'o', char 'O']
+ let num = if letter == 'o' then 248 else 216
+ return (Str [chr num]))
+
+aelig = try (do
+ char '\\'
+ letter <- choice [try (string "ae"), try (string "AE")]
+ let num = if letter == "ae" then 230 else 198
+ return (Str [chr num]))
+
+pound = try (do
+ string "\\pounds"
+ return (Str [chr 163]))
+
+euro = try (do
+ string "\\euro"
+ return (Str [chr 8364]))
+
+copyright = try (do
+ string "\\copyright"
+ return (Str [chr 169]))
+
+sect = try (do
+ string "\\S"
+ return (Str [chr 167]))
+
+escapedChar = escaped (oneOf " $%^&_#{}")
+
+unescapedChar = do -- ignore standalone, nonescaped special characters
+ oneOf "$^&_#{}|<>"
+ return (Str "")
+
+specialChar = choice [ backslash, bar, lt, gt ]
+
+backslash = try (do
+ string "\\textbackslash"
+ return (Str "\\"))
+
+bar = try (do
+ string "\\textbar"
+ return (Str "\\"))
+
+lt = try (do
+ string "\\textless"
+ return (Str "<"))
+
+gt = try (do
+ string "\\textgreater"
+ return (Str ">"))
+
+code = try (do
+ string "\\verb"
+ marker <- anyChar
+ result <- manyTill anyChar (char marker)
+ let result' = removeLeadingTrailingSpace result
+ return (Code result'))
+
+emph = try (do
+ oneOfStrings [ "\\emph{", "\\textit{" ]
+ result <- manyTill inline (char '}')
+ return (Emph result))
+
+lab = try (do
+ string "\\label{"
+ result <- manyTill anyChar (char '}')
+ return (Str ("(" ++ result ++ ")")))
+
+ref = try (do
+ string "\\ref{"
+ result <- manyTill anyChar (char '}')
+ return (Str (result)))
+
+strong = try (do
+ string "\\textbf{"
+ result <- manyTill inline (char '}')
+ return (Strong result))
+
+whitespace = do
+ many1 (oneOf "~ \t")
+ return Space
+
+-- hard line break
+linebreak = try (do
+ string "\\\\"
+ return LineBreak)
+
+str = do
+ result <- many1 (noneOf specialChars)
+ return (Str (normalizePunctuation result))
+
+-- endline internal to paragraph
+endline = try (do
+ newline
+ notFollowedBy blankline
+ return Space)
+
+-- math
+math = math1 <|> math2 <?> "math"
+
+math1 = try (do
+ char '$'
+ result <- many (noneOf "$")
+ char '$'
+ return (TeX ("$" ++ result ++ "$")))
+
+math2 = try (do
+ string "\\("
+ result <- many (noneOf "$")
+ string "\\)"
+ return (TeX ("$" ++ result ++ "$")))
+
+--
+-- links and images
+--
+
+link = try (do
+ string "\\href{"
+ url <- manyTill anyChar (char '}')
+ char '{'
+ label <- manyTill inline (char '}')
+ ref <- generateReference url ""
+ return (Link (normalizeSpaces label) ref))
+
+image = try (do
+ ("includegraphics", _, _, (src:lst)) <- command
+ return (Image [Str "image"] (Src src "")))
+
+footnote = try (do
+ ("footnote", _, _, (contents:[])) <- command
+ let blocks = case runParser parseBlocks defaultParserState "footnote" contents of
+ Left err -> error $ "Input:\n" ++ show contents ++
+ "\nError:\n" ++ show err
+ Right result -> result
+ state <- getState
+ let notes = stateNoteBlocks state
+ let nextRef = case notes of
+ [] -> "1"
+ (Note ref body):rest -> (show ((read ref) + 1))
+ setState (state { stateNoteBlocks = (Note nextRef blocks):notes })
+ return (NoteRef nextRef))
+
+-- | Parse any LaTeX command and return it in a raw TeX inline element.
+rawLaTeXInline :: GenParser Char ParserState Inline
+rawLaTeXInline = try (do
+ (name, star, opt, args) <- command
+ let optStr = if (null opt) then "" else "[" ++ opt ++ "]"
+ let argStr = concatMap (\arg -> "{" ++ arg ++ "}") args
+ state <- getState
+ if ((name == "begin") || (name == "end") || (name == "item")) then
+ fail "not an inline command"
+ else
+ string ""
+ return (TeX ("\\" ++ name ++ star ++ optStr ++ argStr)))
diff --git a/src/Text/Pandoc/Readers/Markdown.hs b/src/Text/Pandoc/Readers/Markdown.hs
new file mode 100644
index 000000000..60ac40fd7
--- /dev/null
+++ b/src/Text/Pandoc/Readers/Markdown.hs
@@ -0,0 +1,582 @@
+-- | Convert markdown to Pandoc document.
+module Text.Pandoc.Readers.Markdown (
+ readMarkdown
+ ) where
+
+import Text.ParserCombinators.Pandoc
+import Text.Pandoc.Definition
+import Text.Pandoc.Readers.LaTeX ( rawLaTeXInline, rawLaTeXEnvironment )
+import Text.Pandoc.Shared
+import Text.Pandoc.Readers.HTML ( rawHtmlInline, rawHtmlBlock, anyHtmlBlockTag,
+ anyHtmlInlineTag )
+import Text.Pandoc.HtmlEntities ( decodeEntities )
+import Text.Regex ( matchRegex, mkRegex )
+import Text.ParserCombinators.Parsec
+
+-- | Read markdown from an input string and return a Pandoc document.
+readMarkdown :: ParserState -> String -> Pandoc
+readMarkdown = readWith parseMarkdown
+
+-- | Parse markdown string with default options and print result (for testing).
+testString :: String -> IO ()
+testString = testStringWith parseMarkdown
+
+--
+-- Constants and data structure definitions
+--
+
+spaceChars = " \t"
+endLineChars = "\n"
+labelStart = '['
+labelEnd = ']'
+labelSep = ':'
+srcStart = '('
+srcEnd = ')'
+imageStart = '!'
+noteStart = '^'
+codeStart = '`'
+codeEnd = '`'
+emphStart = '*'
+emphEnd = '*'
+emphStartAlt = '_'
+emphEndAlt = '_'
+autoLinkStart = '<'
+autoLinkEnd = '>'
+mathStart = '$'
+mathEnd = '$'
+bulletListMarkers = "*+-"
+orderedListDelimiters = "."
+escapeChar = '\\'
+hruleChars = "*-_"
+quoteChars = "'\""
+atxHChar = '#'
+titleOpeners = "\"'("
+setextHChars = ['=','-']
+blockQuoteChar = '>'
+hyphenChar = '-'
+
+-- treat these as potentially non-text when parsing inline:
+specialChars = [escapeChar, labelStart, labelEnd, emphStart, emphEnd, emphStartAlt,
+ emphEndAlt, codeStart, codeEnd, autoLinkEnd, autoLinkStart, mathStart,
+ mathEnd, imageStart, noteStart, hyphenChar]
+
+--
+-- auxiliary functions
+--
+
+-- | Skip a single endline if there is one.
+skipEndline = option Space endline
+
+indentSpaces = do
+ state <- getState
+ let tabStop = stateTabStop state
+ oneOfStrings [ "\t", (replicate tabStop ' ') ] <?> "indentation"
+
+skipNonindentSpaces = do
+ state <- getState
+ let tabStop = stateTabStop state
+ choice (map (\n -> (try (count n (char ' ')))) (reverse [0..(tabStop - 1)]))
+
+--
+-- document structure
+--
+
+titleLine = try (do
+ char '%'
+ skipSpaces
+ line <- manyTill inline newline
+ return line)
+
+authorsLine = try (do
+ char '%'
+ skipSpaces
+ authors <- sepEndBy (many1 (noneOf ",;\n")) (oneOf ",;")
+ newline
+ return (map removeLeadingTrailingSpace authors))
+
+dateLine = try (do
+ char '%'
+ skipSpaces
+ date <- many (noneOf "\n")
+ newline
+ return (removeTrailingSpace date))
+
+titleBlock = try (do
+ title <- option [] titleLine
+ author <- option [] authorsLine
+ date <- option "" dateLine
+ option "" blanklines
+ return (title, author, date))
+
+parseMarkdown = do
+ updateState (\state -> state { stateParseRaw = True }) -- need to parse raw HTML
+ (title, author, date) <- option ([],[],"") titleBlock
+ blocks <- parseBlocks
+ state <- getState
+ let keys = reverse $ stateKeyBlocks state
+ return (Pandoc (Meta title author date) (blocks ++ keys))
+
+--
+-- parsing blocks
+--
+
+parseBlocks = do
+ result <- manyTill block eof
+ return result
+
+block = choice [ codeBlock, referenceKey, note, header, hrule, list, blockQuote, rawHtmlBlocks,
+ rawLaTeXEnvironment, para, plain, blankBlock, nullBlock ] <?> "block"
+
+--
+-- header blocks
+--
+
+header = choice [ setextHeader, atxHeader ] <?> "header"
+
+atxHeader = try (do
+ lead <- many1 (char atxHChar)
+ skipSpaces
+ txt <- many1 (do {notFollowedBy' atxClosing; inline})
+ atxClosing
+ return (Header (length lead) (normalizeSpaces txt)))
+
+atxClosing = try (do
+ skipMany (char atxHChar)
+ skipSpaces
+ newline
+ option "" blanklines)
+
+setextHeader = choice (map (\x -> setextH x) (enumFromTo 1 (length setextHChars)))
+
+setextH n = try (do
+ txt <- many1 (do {notFollowedBy newline; inline})
+ endline
+ many1 (char (setextHChars !! (n-1)))
+ skipSpaces
+ newline
+ option "" blanklines
+ return (Header n (normalizeSpaces txt)))
+
+--
+-- hrule block
+--
+
+hruleWith chr =
+ try (do
+ skipSpaces
+ char chr
+ skipSpaces
+ char chr
+ skipSpaces
+ char chr
+ skipMany (oneOf (chr:spaceChars))
+ newline
+ option "" blanklines
+ return HorizontalRule)
+
+hrule = choice (map hruleWith hruleChars) <?> "hrule"
+
+--
+-- code blocks
+--
+
+indentedLine = try (do
+ indentSpaces
+ result <- manyTill anyChar newline
+ return (result ++ "\n"))
+
+-- two or more indented lines, possibly separated by blank lines
+indentedBlock = try (do
+ res1 <- indentedLine
+ blanks <- many blankline
+ res2 <- choice [indentedBlock, indentedLine]
+ return (res1 ++ blanks ++ res2))
+
+codeBlock = do
+ result <- choice [indentedBlock, indentedLine]
+ option "" blanklines
+ return (CodeBlock result)
+
+--
+-- note block
+--
+
+note = try (do
+ (NoteRef ref) <- noteRef
+ skipSpaces
+ raw <- sepBy (many (choice [nonEndline,
+ (try (do {endline; notFollowedBy (char noteStart); return '\n'}))
+ ])) (try (do {newline; char noteStart; option ' ' (char ' ')}))
+ newline
+ blanklines
+ -- parse the extracted block, which may contain various block elements:
+ state <- getState
+ let parsed = case runParser parseBlocks (state {stateParserContext = BlockQuoteState}) "block" ((joinWithSep "\n" raw) ++ "\n\n") of
+ Left err -> error $ "Raw block:\n" ++ show raw ++ "\nError:\n" ++ show err
+ Right result -> result
+ return (Note ref parsed))
+
+--
+-- block quotes
+--
+
+emacsBoxQuote = try (do
+ string ",----"
+ manyTill anyChar newline
+ raw <- manyTill (try (do{ char '|';
+ option ' ' (char ' ');
+ result <- manyTill anyChar newline;
+ return result}))
+ (string "`----")
+ manyTill anyChar newline
+ option "" blanklines
+ return raw)
+
+emailBlockQuoteStart = try (do
+ skipNonindentSpaces
+ char blockQuoteChar
+ option ' ' (char ' ')
+ return "> ")
+
+emailBlockQuote = try (do
+ emailBlockQuoteStart
+ raw <- sepBy (many (choice [nonEndline,
+ (try (do{ endline;
+ notFollowedBy' emailBlockQuoteStart;
+ return '\n'}))]))
+ (try (do {newline; emailBlockQuoteStart}))
+ newline <|> (do{ eof; return '\n'})
+ option "" blanklines
+ return raw)
+
+blockQuote = do
+ raw <- choice [ emailBlockQuote, emacsBoxQuote ]
+ -- parse the extracted block, which may contain various block elements:
+ state <- getState
+ let parsed = case runParser parseBlocks (state {stateParserContext = BlockQuoteState}) "block" ((joinWithSep "\n" raw) ++ "\n\n") of
+ Left err -> error $ "Raw block:\n" ++ show raw ++ "\nError:\n" ++ show err
+ Right result -> result
+ return (BlockQuote parsed)
+
+--
+-- list blocks
+--
+
+list = choice [ bulletList, orderedList ] <?> "list"
+
+bulletListStart =
+ try (do
+ option ' ' newline -- if preceded by a Plain block in a list context
+ skipNonindentSpaces
+ notFollowedBy' hrule -- because hrules start out just like lists
+ oneOf bulletListMarkers
+ spaceChar
+ skipSpaces)
+
+orderedListStart =
+ try (do
+ option ' ' newline -- if preceded by a Plain block in a list context
+ skipNonindentSpaces
+ many1 digit
+ oneOf orderedListDelimiters
+ oneOf spaceChars
+ skipSpaces)
+
+-- parse a line of a list item (start = parser for beginning of list item)
+listLine start = try (do
+ notFollowedBy' start
+ notFollowedBy blankline
+ notFollowedBy' (try (do{ indentSpaces;
+ many (spaceChar);
+ choice [bulletListStart, orderedListStart]}))
+ line <- manyTill anyChar newline
+ return (line ++ "\n"))
+
+-- parse raw text for one list item, excluding start marker and continuations
+rawListItem start =
+ try (do
+ start
+ result <- many1 (listLine start)
+ blanks <- many blankline
+ return ((concat result) ++ blanks))
+
+-- continuation of a list item - indented and separated by blankline
+-- or (in compact lists) endline.
+-- note: nested lists are parsed as continuations
+listContinuation start =
+ try (do
+ followedBy' indentSpaces
+ result <- many1 (listContinuationLine start)
+ blanks <- many blankline
+ return ((concat result) ++ blanks))
+
+listContinuationLine start = try (do
+ notFollowedBy blankline
+ notFollowedBy' start
+ option "" indentSpaces
+ result <- manyTill anyChar newline
+ return (result ++ "\n"))
+
+listItem start =
+ try (do
+ first <- rawListItem start
+ rest <- many (listContinuation start)
+ -- parsing with ListItemState forces markers at beginning of lines to
+ -- count as list item markers, even if not separated by blank space.
+ -- see definition of "endline"
+ state <- getState
+ let parsed = case runParser parseBlocks (state {stateParserContext = ListItemState})
+ "block" raw of
+ Left err -> error $ "Raw block:\n" ++ raw ++ "\nError:\n" ++ show err
+ Right result -> result
+ where raw = concat (first:rest)
+ return parsed)
+
+orderedList =
+ try (do
+ items <- many1 (listItem orderedListStart)
+ let items' = compactify items
+ return (OrderedList items'))
+
+bulletList =
+ try (do
+ items <- many1 (listItem bulletListStart)
+ let items' = compactify items
+ return (BulletList items'))
+
+--
+-- paragraph block
+--
+
+para = try (do
+ result <- many1 inline
+ newline
+ choice [ (do{ followedBy' (oneOfStrings [">", ",----"]); return "" }), blanklines ]
+ let result' = normalizeSpaces result
+ return (Para result'))
+
+plain = do
+ result <- many1 inline
+ let result' = normalizeSpaces result
+ return (Plain result')
+
+--
+-- raw html
+--
+
+rawHtmlBlocks = try (do
+ htmlBlocks <- many1 rawHtmlBlock
+ let combined = concatMap (\(RawHtml str) -> str) htmlBlocks
+ let combined' = if (last combined == '\n') then
+ init combined -- strip extra newline
+ else
+ combined
+ return (RawHtml combined'))
+
+--
+-- reference key
+--
+
+referenceKey =
+ try (do
+ skipSpaces
+ label <- reference
+ char labelSep
+ skipSpaces
+ option ' ' (char autoLinkStart)
+ src <- many (noneOf (titleOpeners ++ [autoLinkEnd] ++ endLineChars))
+ option ' ' (char autoLinkEnd)
+ tit <- option "" title
+ blanklines
+ return (Key label (Src (removeTrailingSpace src) tit)))
+
+--
+-- inline
+--
+
+text = choice [ math, strong, emph, code2, code1, str, linebreak, tabchar,
+ whitespace, endline ] <?> "text"
+
+inline = choice [ rawLaTeXInline, escapedChar, special, hyphens, text, ltSign, symbol ] <?> "inline"
+
+special = choice [ link, referenceLink, rawHtmlInline, autoLink,
+ image, noteRef ] <?> "link, inline html, note, or image"
+
+escapedChar = escaped anyChar
+
+ltSign = do
+ notFollowedBy' rawHtmlBlocks -- don't return < if it starts html
+ char '<'
+ return (Str ['<'])
+
+specialCharsMinusLt = filter (/= '<') specialChars
+
+symbol = do
+ result <- oneOf specialCharsMinusLt
+ return (Str [result])
+
+hyphens = try (do
+ result <- many1 (char '-')
+ if (length result) == 1 then
+ skipEndline -- don't want to treat endline after hyphen as a space
+ else
+ do{ string ""; return Space }
+ return (Str result))
+
+-- parses inline code, between codeStart and codeEnd
+code1 =
+ try (do
+ char codeStart
+ result <- many (noneOf [codeEnd])
+ char codeEnd
+ let result' = removeLeadingTrailingSpace $ joinWithSep " " $ lines result -- get rid of any internal newlines
+ return (Code result'))
+
+-- parses inline code, between 2 codeStarts and 2 codeEnds
+code2 =
+ try (do
+ string [codeStart, codeStart]
+ result <- manyTill anyChar (try (string [codeEnd, codeEnd]))
+ let result' = removeLeadingTrailingSpace $ joinWithSep " " $ lines result -- get rid of any internal newlines
+ return (Code result'))
+
+mathWord = many1 (choice [(noneOf (" \t\n\\" ++ [mathEnd])), (try (do {c <- char '\\'; notFollowedBy (char mathEnd); return c}))])
+
+math = try (do
+ char mathStart
+ notFollowedBy space
+ words <- sepBy1 mathWord (many1 space)
+ char mathEnd
+ return (TeX ("$" ++ (joinWithSep " " words) ++ "$")))
+
+emph = do
+ result <- choice [ (enclosed (char emphStart) (char emphEnd) inline),
+ (enclosed (char emphStartAlt) (char emphEndAlt) inline) ]
+ return (Emph (normalizeSpaces result))
+
+strong = do
+ result <- choice [ (enclosed (count 2 (char emphStart)) (count 2 (char emphEnd)) inline),
+ (enclosed (count 2 (char emphStartAlt)) (count 2 (char emphEndAlt)) inline)]
+ return (Strong (normalizeSpaces result))
+
+whitespace = do
+ many1 (oneOf spaceChars) <?> "whitespace"
+ return Space
+
+tabchar = do
+ tab
+ return (Str "\t")
+
+-- hard line break
+linebreak = try (do
+ oneOf spaceChars
+ many1 (oneOf spaceChars)
+ endline
+ return LineBreak )
+
+nonEndline = noneOf endLineChars
+
+str = do
+ result <- many1 ((noneOf (specialChars ++ spaceChars ++ endLineChars)))
+ return (Str (decodeEntities result))
+
+-- an endline character that can be treated as a space, not a structural break
+endline =
+ try (do
+ newline
+ -- next line would allow block quotes without preceding blank line
+ -- Markdown.pl does allow this, but there's a chance of a wrapped
+ -- greater-than sign triggering a block quote by accident...
+-- notFollowedBy (try (do { choice [emailBlockQuoteStart, string ",----"]; return ' ' }))
+ notFollowedBy blankline
+ -- parse potential list starts at beginning of line differently if in a list:
+ st <- getState
+ if (stateParserContext st) == ListItemState then
+ do
+ notFollowedBy' orderedListStart
+ notFollowedBy' bulletListStart
+ else
+ option () pzero
+ return Space)
+
+--
+-- links
+--
+
+-- a reference label for a link
+reference = do
+ char labelStart
+ label <- manyTill inline (char labelEnd)
+ return (normalizeSpaces label)
+
+-- source for a link, with optional title
+source =
+ try (do
+ char srcStart
+ option ' ' (char autoLinkStart)
+ src <- many (noneOf ([srcEnd, autoLinkEnd] ++ titleOpeners))
+ option ' ' (char autoLinkEnd)
+ tit <- option "" title
+ skipSpaces
+ char srcEnd
+ return (Src (removeTrailingSpace src) tit))
+
+titleWith startChar endChar =
+ try (do
+ skipSpaces
+ skipEndline -- a title can be on the next line from the source
+ skipSpaces
+ char startChar
+ tit <- manyTill (choice [ try (do {char '\\'; char endChar}),
+ (noneOf (endChar:endLineChars)) ]) (char endChar)
+ let tit' = gsub "\"" "&quot;" tit
+ return tit')
+
+title = choice [titleWith '(' ')', titleWith '"' '"', titleWith '\'' '\''] <?> "title"
+
+link = choice [explicitLink, referenceLink] <?> "link"
+
+explicitLink =
+ try (do
+ label <- reference
+ src <- source
+ return (Link label src))
+
+referenceLink = choice [referenceLinkDouble, referenceLinkSingle]
+
+referenceLinkDouble = -- a link like [this][/url/]
+ try (do
+ label <- reference
+ skipSpaces
+ skipEndline
+ skipSpaces
+ ref <- reference
+ return (Link label (Ref ref)))
+
+referenceLinkSingle = -- a link like [this]
+ try (do
+ label <- reference
+ return (Link label (Ref [])))
+
+autoLink = -- a link <like.this.com>
+ try (do
+ notFollowedBy (do {anyHtmlBlockTag; return ' '})
+ src <- between (char autoLinkStart) (char autoLinkEnd)
+ (many (noneOf (spaceChars ++ endLineChars ++ [autoLinkEnd])))
+ case (matchRegex emailAddress src) of
+ Just _ -> return (Link [Str src] (Src ("mailto:" ++ src) ""))
+ Nothing -> return (Link [Str src] (Src src "")))
+
+emailAddress = mkRegex "([^@:/]+)@(([^.]+[.]?)*([^.]+))" -- presupposes no whitespace
+
+image =
+ try (do
+ char imageStart
+ (Link label src) <- link
+ return (Image label src))
+
+noteRef = try (do
+ char noteStart
+ ref <- between (char '(') (char ')') (many1 (noneOf " \t\n)"))
+ return (NoteRef ref))
+
diff --git a/src/Text/Pandoc/Readers/RST.hs b/src/Text/Pandoc/Readers/RST.hs
new file mode 100644
index 000000000..82e5ea303
--- /dev/null
+++ b/src/Text/Pandoc/Readers/RST.hs
@@ -0,0 +1,644 @@
+-- | Parse reStructuredText and return Pandoc document.
+module Text.Pandoc.Readers.RST (
+ readRST
+ ) where
+import Text.Pandoc.Definition
+import Text.ParserCombinators.Pandoc
+import Text.Pandoc.Shared
+import Text.Pandoc.Readers.HTML ( anyHtmlBlockTag, anyHtmlInlineTag )
+import Text.Regex ( matchRegex, mkRegex )
+import Text.ParserCombinators.Parsec
+import Data.Maybe ( fromMaybe )
+import List ( findIndex )
+import Char ( toUpper )
+
+-- | Parse reStructuredText string and return Pandoc document.
+readRST :: ParserState -> String -> Pandoc
+readRST = readWith parseRST
+
+-- | Parse a string and print result (for testing).
+testString :: String -> IO ()
+testString = testStringWith parseRST
+
+--
+-- Constants and data structure definitions
+---
+
+bulletListMarkers = "*+-"
+underlineChars = "!\"#$&'()*+,-./:;<=>?@[\\]^_`{|}~"
+
+-- treat these as potentially non-text when parsing inline:
+specialChars = "\\`|*_<>$:[-"
+
+--
+-- parsing documents
+--
+
+isAnonKeyBlock block = case block of
+ (Key [Str "_"] str) -> True
+ otherwise -> False
+
+isNotAnonKeyBlock block = not (isAnonKeyBlock block)
+
+isHeader1 :: Block -> Bool
+isHeader1 (Header 1 _) = True
+isHeader1 _ = False
+
+isHeader2 :: Block -> Bool
+isHeader2 (Header 2 _) = True
+isHeader2 _ = False
+
+-- | Promote all headers in a list of blocks. (Part of
+-- title transformation for RST.)
+promoteHeaders :: Int -> [Block] -> [Block]
+promoteHeaders num ((Header level text):rest) =
+ (Header (level - num) text):(promoteHeaders num rest)
+promoteHeaders num (other:rest) = other:(promoteHeaders num rest)
+promoteHeaders num [] = []
+
+-- | If list of blocks starts with a header (or a header and subheader)
+-- of level that are not found elsewhere, return it as a title and
+-- promote all the other headers.
+titleTransform :: [Block] -- ^ list of blocks
+ -> ([Block], [Inline]) -- ^ modified list of blocks, title
+titleTransform ((Header 1 head1):(Header 2 head2):rest) = -- title and subtitle
+ if (any isHeader1 rest) || (any isHeader2 rest) then
+ ((Header 1 head1):(Header 2 head2):rest, [])
+ else
+ ((promoteHeaders 2 rest), head1 ++ [Str ":", Space] ++ head2)
+titleTransform ((Header 1 head1):rest) = -- title, no subtitle
+ if (any isHeader1 rest) then
+ ((Header 1 head1):rest, [])
+ else
+ ((promoteHeaders 1 rest), head1)
+titleTransform blocks = (blocks, [])
+
+parseRST = do
+ state <- getState
+ input <- getInput
+ blocks <- parseBlocks -- first pass
+ let anonymousKeys = filter isAnonKeyBlock blocks
+ let blocks' = if (null anonymousKeys) then
+ blocks
+ else -- run parser again to fill in anonymous links...
+ case runParser parseBlocks (state { stateKeyBlocks = anonymousKeys })
+ "RST source, second pass" input of
+ Left err -> error $ "\nError:\n" ++ show err
+ Right result -> (filter isNotAnonKeyBlock result)
+ let (blocks'', title) = if stateStandalone state then
+ titleTransform blocks'
+ else
+ (blocks', [])
+ state <- getState
+ let authors = stateAuthors state
+ let date = stateDate state
+ let title' = if (null title) then (stateTitle state) else title
+ return (Pandoc (Meta title' authors date) blocks'')
+
+--
+-- parsing blocks
+--
+
+parseBlocks = do
+ result <- manyTill block eof
+ return result
+
+block = choice [ codeBlock, rawHtmlBlock, rawLaTeXBlock, blockQuote, referenceKey,
+ imageBlock, unknownDirective, header, hrule, list, fieldList, lineBlock,
+ para, plain, blankBlock, nullBlock ] <?> "block"
+
+--
+-- field list
+--
+
+fieldListItem = try (do
+ char ':'
+ name <- many1 alphaNum
+ string ": "
+ skipSpaces
+ first <- manyTill anyChar newline
+ rest <- many (do{ notFollowedBy (char ':');
+ notFollowedBy blankline;
+ skipSpaces;
+ manyTill anyChar newline })
+ return (name, (joinWithSep " " (first:rest))))
+
+fieldList = try (do
+ items <- many1 fieldListItem
+ blanklines
+ let authors = case (lookup "Authors" items) of
+ Just auth -> [auth]
+ Nothing -> map snd (filter (\(x,y) -> x == "Author") items)
+ let date = case (lookup "Date" items) of
+ Just dat -> dat
+ Nothing -> ""
+ let title = case (lookup "Title" items) of
+ Just tit -> [Str tit]
+ Nothing -> []
+ let remaining = filter (\(x,y) -> (x /= "Authors") && (x /= "Author") && (x /= "Date") &&
+ (x /= "Title")) items
+ let result = map (\(x,y) -> Para [Strong [Str x], Str ":", Space, Str y]) remaining
+ updateState (\st -> st { stateAuthors = authors, stateDate = date, stateTitle = title })
+ return (BlockQuote result))
+
+--
+-- line block
+--
+
+lineBlockLine = try (do
+ string "| "
+ white <- many (oneOf " \t")
+ line <- manyTill inline newline
+ let line' = (if null white then [] else [Str white]) ++ line ++ [LineBreak]
+ return line')
+
+lineBlock = try (do
+ lines <- many1 lineBlockLine
+ blanklines
+ return $ Para (concat lines))
+
+--
+-- paragraph block
+--
+
+para = choice [ paraBeforeCodeBlock, paraNormal ] <?> "paragraph"
+
+codeBlockStart = try (do
+ string "::"
+ blankline
+ blankline)
+
+-- paragraph that ends in a :: starting a code block
+paraBeforeCodeBlock = try (do
+ result <- many1 (do {notFollowedBy' codeBlockStart; inline})
+ followedBy' (string "::")
+ return (Para (if (last result == Space) then
+ normalizeSpaces result
+ else
+ (normalizeSpaces result) ++ [Str ":"])))
+
+-- regular paragraph
+paraNormal = try (do
+ result <- many1 inline
+ newline
+ blanklines
+ let result' = normalizeSpaces result
+ return (Para result'))
+
+plain = do
+ result <- many1 inline
+ let result' = normalizeSpaces result
+ return (Plain result')
+
+--
+-- image block
+--
+
+imageBlock = try (do
+ string ".. image:: "
+ src <- manyTill anyChar newline
+ return (Plain [Image [Str "image"] (Src src "")]))
+
+--
+-- header blocks
+--
+
+header = choice [ doubleHeader, singleHeader ] <?> "header"
+
+-- a header with lines on top and bottom
+doubleHeader = try (do
+ c <- oneOf underlineChars
+ rest <- many (char c) -- the top line
+ let lenTop = length (c:rest)
+ skipSpaces
+ newline
+ txt <- many1 (do {notFollowedBy blankline; inline})
+ pos <- getPosition
+ let len = (sourceColumn pos) - 1
+ if (len > lenTop) then fail "title longer than border" else (do {return ()})
+ blankline -- spaces and newline
+ count lenTop (char c) -- the bottom line
+ blanklines
+ -- check to see if we've had this kind of header before.
+ -- if so, get appropriate level. if not, add to list.
+ state <- getState
+ let headerTable = stateHeaderTable state
+ let (headerTable', level) = case findIndex (== DoubleHeader c) headerTable of
+ Just ind -> (headerTable, ind + 1)
+ Nothing -> (headerTable ++ [DoubleHeader c], (length headerTable) + 1)
+ setState (state { stateHeaderTable = headerTable' })
+ return (Header level (normalizeSpaces txt)))
+
+-- a header with line on the bottom only
+singleHeader = try (do
+ notFollowedBy' whitespace
+ txt <- many1 (do {notFollowedBy blankline; inline})
+ pos <- getPosition
+ let len = (sourceColumn pos) - 1
+ blankline
+ c <- oneOf underlineChars
+ rest <- count (len - 1) (char c)
+ many (char c)
+ blanklines
+ state <- getState
+ let headerTable = stateHeaderTable state
+ let (headerTable', level) = case findIndex (== SingleHeader c) headerTable of
+ Just ind -> (headerTable, ind + 1)
+ Nothing -> (headerTable ++ [SingleHeader c], (length headerTable) + 1)
+ setState (state { stateHeaderTable = headerTable' })
+ return (Header level (normalizeSpaces txt)))
+
+--
+-- hrule block
+--
+
+hruleWith chr =
+ try (do
+ count 4 (char chr)
+ skipMany (char chr)
+ skipSpaces
+ newline
+ blanklines
+ return HorizontalRule)
+
+hrule = choice (map hruleWith underlineChars) <?> "hrule"
+
+--
+-- code blocks
+--
+
+-- read a line indented by a given string
+indentedLine indents = try (do
+ string indents
+ result <- manyTill anyChar newline
+ return (result ++ "\n"))
+
+-- two or more indented lines, possibly separated by blank lines
+-- if variable = True, then any indent will work, but it must be consistent through the block
+-- if variable = False, indent should be one tab or equivalent in spaces
+indentedBlock variable = try (do
+ state <- getState
+ let tabStop = stateTabStop state
+ indents <- if variable then
+ many1 (oneOf " \t")
+ else
+ oneOfStrings ["\t", (replicate tabStop ' ')]
+ firstline <- manyTill anyChar newline
+ rest <- many (choice [ indentedLine indents,
+ try (do {b <- blanklines; l <- indentedLine indents; return (b ++ l)})])
+ option "" blanklines
+ return (firstline ++ "\n" ++ (concat rest)))
+
+codeBlock = try (do
+ codeBlockStart
+ result <- indentedBlock False -- the False means we want one tab stop indent on each line
+ return (CodeBlock result))
+
+--
+-- raw html
+--
+
+rawHtmlBlock = try (do
+ string ".. raw:: html"
+ blanklines
+ result <- indentedBlock True
+ return (RawHtml result))
+
+--
+-- raw latex
+--
+
+rawLaTeXBlock = try (do
+ string ".. raw:: latex"
+ blanklines
+ result <- indentedBlock True
+ return (Para [(TeX result)]))
+
+--
+-- block quotes
+--
+
+blockQuote = try (do
+ block <- indentedBlock True
+ -- parse the extracted block, which may contain various block elements:
+ state <- getState
+ let parsed = case runParser parseBlocks (state {stateParserContext = BlockQuoteState})
+ "block" (block ++ "\n\n") of
+ Left err -> error $ "Raw block:\n" ++ show block ++ "\nError:\n" ++ show err
+ Right result -> result
+ return (BlockQuote parsed))
+
+--
+-- list blocks
+--
+
+list = choice [ bulletList, orderedList ] <?> "list"
+
+-- parses bullet list start and returns its length (inc. following whitespace)
+bulletListStart =
+ try (do
+ notFollowedBy' hrule -- because hrules start out just like lists
+ marker <- oneOf bulletListMarkers
+ white <- many1 spaceChar
+ let len = length (marker:white)
+ return len)
+
+withPeriodSuffix parser = try (do
+ a <- parser
+ b <- char '.'
+ return (a ++ [b]))
+
+withParentheses parser = try (do
+ a <- char '('
+ b <- parser
+ c <- char ')'
+ return ([a] ++ b ++ [c]))
+
+withRightParen parser = try (do
+ a <- parser
+ b <- char ')'
+ return (a ++ [b]))
+
+upcaseWord = map toUpper
+
+romanNumeral = do
+ let lowerNumerals = ["i", "ii", "iii", "iiii", "iv", "v", "vi", "vii", "viii", "ix", "x", "xi", "xii", "xiii", "xiv", "xv", "xvi", "xvii", "xviii", "xix", "xx", "xxi", "xxii", "xxiii", "xxiv" ]
+ let upperNumerals = map upcaseWord lowerNumerals
+ result <- choice $ map string (lowerNumerals ++ upperNumerals)
+ return result
+
+orderedListEnumerator = choice [ many1 digit,
+ string "#",
+ count 1 letter,
+ romanNumeral ]
+
+-- parses ordered list start and returns its length (inc. following whitespace)
+orderedListStart =
+ try (do
+ marker <- choice [ withPeriodSuffix orderedListEnumerator,
+ withParentheses orderedListEnumerator,
+ withRightParen orderedListEnumerator ]
+ white <- many1 spaceChar
+ let len = length (marker ++ white)
+ return len)
+
+-- parse a line of a list item
+listLine markerLength = try (do
+ notFollowedBy blankline
+ indentWith markerLength
+ line <- manyTill anyChar newline
+ return (line ++ "\n"))
+
+-- indent by specified number of spaces (or equiv. tabs)
+indentWith num = do
+ state <- getState
+ let tabStop = stateTabStop state
+ if (num < tabStop) then
+ count num (char ' ')
+ else
+ choice [ try (count num (char ' ')),
+ (try (do {char '\t'; count (num - tabStop) (char ' ')})) ]
+
+-- parse raw text for one list item, excluding start marker and continuations
+rawListItem start =
+ try (do
+ markerLength <- start
+ firstLine <- manyTill anyChar newline
+ restLines <- many (listLine markerLength)
+ return (markerLength, (firstLine ++ "\n" ++ (concat restLines))))
+
+-- continuation of a list item - indented and separated by blankline or (in compact lists)
+-- endline. Note: nested lists are parsed as continuations.
+listContinuation markerLength =
+ try (do
+ blanks <- many1 blankline
+ result <- many1 (listLine markerLength)
+ return (blanks ++ (concat result)))
+
+listItem start =
+ try (do
+ (markerLength, first) <- rawListItem start
+ rest <- many (listContinuation markerLength)
+ blanks <- choice [ try (do {b <- many blankline; followedBy' start; return b}),
+ many1 blankline ] -- whole list must end with blank
+ -- parsing with ListItemState forces markers at beginning of lines to
+ -- count as list item markers, even if not separated by blank space.
+ -- see definition of "endline"
+ state <- getState
+ let parsed = case runParser parseBlocks (state {stateParserContext = ListItemState})
+ "list item" raw of
+ Left err -> error $ "Raw:\n" ++ raw ++ "\nError:\n" ++ show err
+ Right result -> result
+ where raw = concat (first:rest) ++ blanks
+ return parsed)
+
+orderedList =
+ try (do
+ items <- many1 (listItem orderedListStart)
+ let items' = compactify items
+ return (OrderedList items'))
+
+bulletList =
+ try (do
+ items <- many1 (listItem bulletListStart)
+ let items' = compactify items
+ return (BulletList items'))
+
+--
+-- unknown directive (e.g. comment)
+--
+
+unknownDirective = try (do
+ string ".. "
+ manyTill anyChar newline
+ many (do {string " ";
+ char ':';
+ many1 (noneOf "\n:");
+ char ':';
+ many1 (noneOf "\n");
+ newline})
+ option "" blanklines
+ return Null)
+
+--
+-- reference key
+--
+
+referenceKey = choice [imageKey, anonymousKey, regularKeyQuoted, regularKey]
+
+imageKey = try (do
+ string ".. |"
+ ref <- manyTill inline (char '|')
+ skipSpaces
+ string "image::"
+ src <- manyTill anyChar newline
+ return (Key (normalizeSpaces ref) (Src (removeLeadingTrailingSpace src) "")))
+
+anonymousKey = try (do
+ choice [string ".. __:", string "__"]
+ skipSpaces
+ src <- manyTill anyChar newline
+ state <- getState
+ return (Key [Str "_"] (Src (removeLeadingTrailingSpace src) "")))
+
+regularKeyQuoted = try (do
+ string ".. _`"
+ ref <- manyTill inline (string "`:")
+ skipSpaces
+ src <- manyTill anyChar newline
+ return (Key (normalizeSpaces ref) (Src (removeLeadingTrailingSpace src) "")))
+
+regularKey = try (do
+ string ".. _"
+ ref <- manyTill inline (char ':')
+ skipSpaces
+ src <- manyTill anyChar newline
+ return (Key (normalizeSpaces ref) (Src (removeLeadingTrailingSpace src) "")))
+
+ --
+ -- inline
+ --
+
+text = choice [ strong, emph, code, str, tabchar, whitespace, endline ] <?> "text"
+
+inline = choice [ escapedChar, special, hyphens, text, symbol ] <?> "inline"
+
+special = choice [ link, image ] <?> "link, inline html, or image"
+
+hyphens = try (do
+ result <- many1 (char '-')
+ option Space endline -- don't want to treat endline after hyphen or dash as a space
+ return (Str result))
+
+escapedChar = escaped anyChar
+
+symbol = do
+ result <- oneOf specialChars
+ return (Str [result])
+
+-- parses inline code, between codeStart and codeEnd
+code =
+ try (do
+ string "``"
+ result <- manyTill anyChar (string "``")
+ let result' = removeLeadingTrailingSpace $ joinWithSep " " $ lines result
+ return (Code result'))
+
+emph = do
+ result <- enclosed (char '*') (char '*') inline
+ return (Emph (normalizeSpaces result))
+
+strong = do
+ result <- enclosed (string "**") (string "**") inline
+ return (Strong (normalizeSpaces result))
+
+whitespace = do
+ many1 spaceChar <?> "whitespace"
+ return Space
+
+tabchar = do
+ tab
+ return (Str "\t")
+
+str = do
+ notFollowedBy' oneWordReferenceLink
+ result <- many1 (noneOf (specialChars ++ "\t\n "))
+ return (Str result)
+
+-- an endline character that can be treated as a space, not a structural break
+endline =
+ try (do
+ newline
+ notFollowedBy blankline
+ -- parse potential list starts at beginning of line differently if in a list:
+ st <- getState
+ if ((stateParserContext st) == ListItemState) then
+ notFollowedBy' (choice [orderedListStart, bulletListStart])
+ else
+ option () pzero
+ return Space)
+
+--
+-- links
+--
+
+link = choice [explicitLink, referenceLink, autoLink, oneWordReferenceLink] <?> "link"
+
+explicitLink =
+ try (do
+ char '`'
+ label <- manyTill inline (try (do {spaces; char '<'}))
+ src <- manyTill (noneOf ">\n ") (char '>')
+ skipSpaces
+ string "`_"
+ return (Link (normalizeSpaces label) (Src (removeLeadingTrailingSpace src) "")))
+
+anonymousLinkEnding =
+ try (do
+ char '_'
+ state <- getState
+ let anonKeys = stateKeyBlocks state
+ -- if there's a list of anon key refs (from previous pass), pop one off.
+ -- otherwise return an anon key ref for the next pass to take care of...
+ case anonKeys of
+ (Key [Str "_"] src):rest ->
+ do{ setState (state { stateKeyBlocks = rest });
+ return src }
+ otherwise -> return (Ref [Str "_"]))
+
+referenceLink =
+ try (do
+ char '`'
+ label <- manyTill inline (string "`_")
+ src <- option (Ref []) anonymousLinkEnding
+ return (Link (normalizeSpaces label) src))
+
+oneWordReferenceLink =
+ try (do
+ label <- many1 alphaNum
+ char '_'
+ src <- option (Ref []) anonymousLinkEnding
+ notFollowedBy alphaNum -- because this_is_not a link
+ return (Link [Str label] src))
+
+uriScheme = oneOfStrings [ "http://", "https://", "ftp://", "file://", "mailto:",
+ "news:", "telnet:" ]
+
+uri = try (do
+ scheme <- uriScheme
+ identifier <- many1 (noneOf " \t\n")
+ return (scheme ++ identifier))
+
+autoURI = try (do
+ src <- uri
+ return (Link [Str src] (Src src "")))
+
+emailChar = alphaNum <|> oneOf "-+_."
+
+emailAddress = try (do
+ firstLetter <- alphaNum
+ restAddr <- many emailChar
+ let addr = firstLetter:restAddr
+ char '@'
+ dom <- domain
+ return (addr ++ '@':dom))
+
+domainChar = alphaNum <|> char '-'
+
+domain = try (do
+ first <- many1 domainChar
+ dom <- many1 (try (do{ char '.'; many1 domainChar }))
+ return (joinWithSep "." (first:dom)))
+
+autoEmail = try (do
+ src <- emailAddress
+ return (Link [Str src] (Src ("mailto:" ++ src) "")))
+
+autoLink = autoURI <|> autoEmail
+
+-- For now, we assume that all substitution references are for images.
+image =
+ try (do
+ char '|'
+ ref <- manyTill inline (char '|')
+ return (Image (normalizeSpaces ref) (Ref ref)))