aboutsummaryrefslogtreecommitdiff
path: root/src/Text/Pandoc/Readers/RST.hs
diff options
context:
space:
mode:
Diffstat (limited to 'src/Text/Pandoc/Readers/RST.hs')
-rw-r--r--src/Text/Pandoc/Readers/RST.hs644
1 files changed, 644 insertions, 0 deletions
diff --git a/src/Text/Pandoc/Readers/RST.hs b/src/Text/Pandoc/Readers/RST.hs
new file mode 100644
index 000000000..82e5ea303
--- /dev/null
+++ b/src/Text/Pandoc/Readers/RST.hs
@@ -0,0 +1,644 @@
+-- | Parse reStructuredText and return Pandoc document.
+module Text.Pandoc.Readers.RST (
+ readRST
+ ) where
+import Text.Pandoc.Definition
+import Text.ParserCombinators.Pandoc
+import Text.Pandoc.Shared
+import Text.Pandoc.Readers.HTML ( anyHtmlBlockTag, anyHtmlInlineTag )
+import Text.Regex ( matchRegex, mkRegex )
+import Text.ParserCombinators.Parsec
+import Data.Maybe ( fromMaybe )
+import List ( findIndex )
+import Char ( toUpper )
+
+-- | Parse reStructuredText string and return Pandoc document.
+readRST :: ParserState -> String -> Pandoc
+readRST = readWith parseRST
+
+-- | Parse a string and print result (for testing).
+testString :: String -> IO ()
+testString = testStringWith parseRST
+
+--
+-- Constants and data structure definitions
+---
+
+bulletListMarkers = "*+-"
+underlineChars = "!\"#$&'()*+,-./:;<=>?@[\\]^_`{|}~"
+
+-- treat these as potentially non-text when parsing inline:
+specialChars = "\\`|*_<>$:[-"
+
+--
+-- parsing documents
+--
+
+isAnonKeyBlock block = case block of
+ (Key [Str "_"] str) -> True
+ otherwise -> False
+
+isNotAnonKeyBlock block = not (isAnonKeyBlock block)
+
+isHeader1 :: Block -> Bool
+isHeader1 (Header 1 _) = True
+isHeader1 _ = False
+
+isHeader2 :: Block -> Bool
+isHeader2 (Header 2 _) = True
+isHeader2 _ = False
+
+-- | Promote all headers in a list of blocks. (Part of
+-- title transformation for RST.)
+promoteHeaders :: Int -> [Block] -> [Block]
+promoteHeaders num ((Header level text):rest) =
+ (Header (level - num) text):(promoteHeaders num rest)
+promoteHeaders num (other:rest) = other:(promoteHeaders num rest)
+promoteHeaders num [] = []
+
+-- | If list of blocks starts with a header (or a header and subheader)
+-- of level that are not found elsewhere, return it as a title and
+-- promote all the other headers.
+titleTransform :: [Block] -- ^ list of blocks
+ -> ([Block], [Inline]) -- ^ modified list of blocks, title
+titleTransform ((Header 1 head1):(Header 2 head2):rest) = -- title and subtitle
+ if (any isHeader1 rest) || (any isHeader2 rest) then
+ ((Header 1 head1):(Header 2 head2):rest, [])
+ else
+ ((promoteHeaders 2 rest), head1 ++ [Str ":", Space] ++ head2)
+titleTransform ((Header 1 head1):rest) = -- title, no subtitle
+ if (any isHeader1 rest) then
+ ((Header 1 head1):rest, [])
+ else
+ ((promoteHeaders 1 rest), head1)
+titleTransform blocks = (blocks, [])
+
+parseRST = do
+ state <- getState
+ input <- getInput
+ blocks <- parseBlocks -- first pass
+ let anonymousKeys = filter isAnonKeyBlock blocks
+ let blocks' = if (null anonymousKeys) then
+ blocks
+ else -- run parser again to fill in anonymous links...
+ case runParser parseBlocks (state { stateKeyBlocks = anonymousKeys })
+ "RST source, second pass" input of
+ Left err -> error $ "\nError:\n" ++ show err
+ Right result -> (filter isNotAnonKeyBlock result)
+ let (blocks'', title) = if stateStandalone state then
+ titleTransform blocks'
+ else
+ (blocks', [])
+ state <- getState
+ let authors = stateAuthors state
+ let date = stateDate state
+ let title' = if (null title) then (stateTitle state) else title
+ return (Pandoc (Meta title' authors date) blocks'')
+
+--
+-- parsing blocks
+--
+
+parseBlocks = do
+ result <- manyTill block eof
+ return result
+
+block = choice [ codeBlock, rawHtmlBlock, rawLaTeXBlock, blockQuote, referenceKey,
+ imageBlock, unknownDirective, header, hrule, list, fieldList, lineBlock,
+ para, plain, blankBlock, nullBlock ] <?> "block"
+
+--
+-- field list
+--
+
+fieldListItem = try (do
+ char ':'
+ name <- many1 alphaNum
+ string ": "
+ skipSpaces
+ first <- manyTill anyChar newline
+ rest <- many (do{ notFollowedBy (char ':');
+ notFollowedBy blankline;
+ skipSpaces;
+ manyTill anyChar newline })
+ return (name, (joinWithSep " " (first:rest))))
+
+fieldList = try (do
+ items <- many1 fieldListItem
+ blanklines
+ let authors = case (lookup "Authors" items) of
+ Just auth -> [auth]
+ Nothing -> map snd (filter (\(x,y) -> x == "Author") items)
+ let date = case (lookup "Date" items) of
+ Just dat -> dat
+ Nothing -> ""
+ let title = case (lookup "Title" items) of
+ Just tit -> [Str tit]
+ Nothing -> []
+ let remaining = filter (\(x,y) -> (x /= "Authors") && (x /= "Author") && (x /= "Date") &&
+ (x /= "Title")) items
+ let result = map (\(x,y) -> Para [Strong [Str x], Str ":", Space, Str y]) remaining
+ updateState (\st -> st { stateAuthors = authors, stateDate = date, stateTitle = title })
+ return (BlockQuote result))
+
+--
+-- line block
+--
+
+lineBlockLine = try (do
+ string "| "
+ white <- many (oneOf " \t")
+ line <- manyTill inline newline
+ let line' = (if null white then [] else [Str white]) ++ line ++ [LineBreak]
+ return line')
+
+lineBlock = try (do
+ lines <- many1 lineBlockLine
+ blanklines
+ return $ Para (concat lines))
+
+--
+-- paragraph block
+--
+
+para = choice [ paraBeforeCodeBlock, paraNormal ] <?> "paragraph"
+
+codeBlockStart = try (do
+ string "::"
+ blankline
+ blankline)
+
+-- paragraph that ends in a :: starting a code block
+paraBeforeCodeBlock = try (do
+ result <- many1 (do {notFollowedBy' codeBlockStart; inline})
+ followedBy' (string "::")
+ return (Para (if (last result == Space) then
+ normalizeSpaces result
+ else
+ (normalizeSpaces result) ++ [Str ":"])))
+
+-- regular paragraph
+paraNormal = try (do
+ result <- many1 inline
+ newline
+ blanklines
+ let result' = normalizeSpaces result
+ return (Para result'))
+
+plain = do
+ result <- many1 inline
+ let result' = normalizeSpaces result
+ return (Plain result')
+
+--
+-- image block
+--
+
+imageBlock = try (do
+ string ".. image:: "
+ src <- manyTill anyChar newline
+ return (Plain [Image [Str "image"] (Src src "")]))
+
+--
+-- header blocks
+--
+
+header = choice [ doubleHeader, singleHeader ] <?> "header"
+
+-- a header with lines on top and bottom
+doubleHeader = try (do
+ c <- oneOf underlineChars
+ rest <- many (char c) -- the top line
+ let lenTop = length (c:rest)
+ skipSpaces
+ newline
+ txt <- many1 (do {notFollowedBy blankline; inline})
+ pos <- getPosition
+ let len = (sourceColumn pos) - 1
+ if (len > lenTop) then fail "title longer than border" else (do {return ()})
+ blankline -- spaces and newline
+ count lenTop (char c) -- the bottom line
+ blanklines
+ -- check to see if we've had this kind of header before.
+ -- if so, get appropriate level. if not, add to list.
+ state <- getState
+ let headerTable = stateHeaderTable state
+ let (headerTable', level) = case findIndex (== DoubleHeader c) headerTable of
+ Just ind -> (headerTable, ind + 1)
+ Nothing -> (headerTable ++ [DoubleHeader c], (length headerTable) + 1)
+ setState (state { stateHeaderTable = headerTable' })
+ return (Header level (normalizeSpaces txt)))
+
+-- a header with line on the bottom only
+singleHeader = try (do
+ notFollowedBy' whitespace
+ txt <- many1 (do {notFollowedBy blankline; inline})
+ pos <- getPosition
+ let len = (sourceColumn pos) - 1
+ blankline
+ c <- oneOf underlineChars
+ rest <- count (len - 1) (char c)
+ many (char c)
+ blanklines
+ state <- getState
+ let headerTable = stateHeaderTable state
+ let (headerTable', level) = case findIndex (== SingleHeader c) headerTable of
+ Just ind -> (headerTable, ind + 1)
+ Nothing -> (headerTable ++ [SingleHeader c], (length headerTable) + 1)
+ setState (state { stateHeaderTable = headerTable' })
+ return (Header level (normalizeSpaces txt)))
+
+--
+-- hrule block
+--
+
+hruleWith chr =
+ try (do
+ count 4 (char chr)
+ skipMany (char chr)
+ skipSpaces
+ newline
+ blanklines
+ return HorizontalRule)
+
+hrule = choice (map hruleWith underlineChars) <?> "hrule"
+
+--
+-- code blocks
+--
+
+-- read a line indented by a given string
+indentedLine indents = try (do
+ string indents
+ result <- manyTill anyChar newline
+ return (result ++ "\n"))
+
+-- two or more indented lines, possibly separated by blank lines
+-- if variable = True, then any indent will work, but it must be consistent through the block
+-- if variable = False, indent should be one tab or equivalent in spaces
+indentedBlock variable = try (do
+ state <- getState
+ let tabStop = stateTabStop state
+ indents <- if variable then
+ many1 (oneOf " \t")
+ else
+ oneOfStrings ["\t", (replicate tabStop ' ')]
+ firstline <- manyTill anyChar newline
+ rest <- many (choice [ indentedLine indents,
+ try (do {b <- blanklines; l <- indentedLine indents; return (b ++ l)})])
+ option "" blanklines
+ return (firstline ++ "\n" ++ (concat rest)))
+
+codeBlock = try (do
+ codeBlockStart
+ result <- indentedBlock False -- the False means we want one tab stop indent on each line
+ return (CodeBlock result))
+
+--
+-- raw html
+--
+
+rawHtmlBlock = try (do
+ string ".. raw:: html"
+ blanklines
+ result <- indentedBlock True
+ return (RawHtml result))
+
+--
+-- raw latex
+--
+
+rawLaTeXBlock = try (do
+ string ".. raw:: latex"
+ blanklines
+ result <- indentedBlock True
+ return (Para [(TeX result)]))
+
+--
+-- block quotes
+--
+
+blockQuote = try (do
+ block <- indentedBlock True
+ -- parse the extracted block, which may contain various block elements:
+ state <- getState
+ let parsed = case runParser parseBlocks (state {stateParserContext = BlockQuoteState})
+ "block" (block ++ "\n\n") of
+ Left err -> error $ "Raw block:\n" ++ show block ++ "\nError:\n" ++ show err
+ Right result -> result
+ return (BlockQuote parsed))
+
+--
+-- list blocks
+--
+
+list = choice [ bulletList, orderedList ] <?> "list"
+
+-- parses bullet list start and returns its length (inc. following whitespace)
+bulletListStart =
+ try (do
+ notFollowedBy' hrule -- because hrules start out just like lists
+ marker <- oneOf bulletListMarkers
+ white <- many1 spaceChar
+ let len = length (marker:white)
+ return len)
+
+withPeriodSuffix parser = try (do
+ a <- parser
+ b <- char '.'
+ return (a ++ [b]))
+
+withParentheses parser = try (do
+ a <- char '('
+ b <- parser
+ c <- char ')'
+ return ([a] ++ b ++ [c]))
+
+withRightParen parser = try (do
+ a <- parser
+ b <- char ')'
+ return (a ++ [b]))
+
+upcaseWord = map toUpper
+
+romanNumeral = do
+ let lowerNumerals = ["i", "ii", "iii", "iiii", "iv", "v", "vi", "vii", "viii", "ix", "x", "xi", "xii", "xiii", "xiv", "xv", "xvi", "xvii", "xviii", "xix", "xx", "xxi", "xxii", "xxiii", "xxiv" ]
+ let upperNumerals = map upcaseWord lowerNumerals
+ result <- choice $ map string (lowerNumerals ++ upperNumerals)
+ return result
+
+orderedListEnumerator = choice [ many1 digit,
+ string "#",
+ count 1 letter,
+ romanNumeral ]
+
+-- parses ordered list start and returns its length (inc. following whitespace)
+orderedListStart =
+ try (do
+ marker <- choice [ withPeriodSuffix orderedListEnumerator,
+ withParentheses orderedListEnumerator,
+ withRightParen orderedListEnumerator ]
+ white <- many1 spaceChar
+ let len = length (marker ++ white)
+ return len)
+
+-- parse a line of a list item
+listLine markerLength = try (do
+ notFollowedBy blankline
+ indentWith markerLength
+ line <- manyTill anyChar newline
+ return (line ++ "\n"))
+
+-- indent by specified number of spaces (or equiv. tabs)
+indentWith num = do
+ state <- getState
+ let tabStop = stateTabStop state
+ if (num < tabStop) then
+ count num (char ' ')
+ else
+ choice [ try (count num (char ' ')),
+ (try (do {char '\t'; count (num - tabStop) (char ' ')})) ]
+
+-- parse raw text for one list item, excluding start marker and continuations
+rawListItem start =
+ try (do
+ markerLength <- start
+ firstLine <- manyTill anyChar newline
+ restLines <- many (listLine markerLength)
+ return (markerLength, (firstLine ++ "\n" ++ (concat restLines))))
+
+-- continuation of a list item - indented and separated by blankline or (in compact lists)
+-- endline. Note: nested lists are parsed as continuations.
+listContinuation markerLength =
+ try (do
+ blanks <- many1 blankline
+ result <- many1 (listLine markerLength)
+ return (blanks ++ (concat result)))
+
+listItem start =
+ try (do
+ (markerLength, first) <- rawListItem start
+ rest <- many (listContinuation markerLength)
+ blanks <- choice [ try (do {b <- many blankline; followedBy' start; return b}),
+ many1 blankline ] -- whole list must end with blank
+ -- parsing with ListItemState forces markers at beginning of lines to
+ -- count as list item markers, even if not separated by blank space.
+ -- see definition of "endline"
+ state <- getState
+ let parsed = case runParser parseBlocks (state {stateParserContext = ListItemState})
+ "list item" raw of
+ Left err -> error $ "Raw:\n" ++ raw ++ "\nError:\n" ++ show err
+ Right result -> result
+ where raw = concat (first:rest) ++ blanks
+ return parsed)
+
+orderedList =
+ try (do
+ items <- many1 (listItem orderedListStart)
+ let items' = compactify items
+ return (OrderedList items'))
+
+bulletList =
+ try (do
+ items <- many1 (listItem bulletListStart)
+ let items' = compactify items
+ return (BulletList items'))
+
+--
+-- unknown directive (e.g. comment)
+--
+
+unknownDirective = try (do
+ string ".. "
+ manyTill anyChar newline
+ many (do {string " ";
+ char ':';
+ many1 (noneOf "\n:");
+ char ':';
+ many1 (noneOf "\n");
+ newline})
+ option "" blanklines
+ return Null)
+
+--
+-- reference key
+--
+
+referenceKey = choice [imageKey, anonymousKey, regularKeyQuoted, regularKey]
+
+imageKey = try (do
+ string ".. |"
+ ref <- manyTill inline (char '|')
+ skipSpaces
+ string "image::"
+ src <- manyTill anyChar newline
+ return (Key (normalizeSpaces ref) (Src (removeLeadingTrailingSpace src) "")))
+
+anonymousKey = try (do
+ choice [string ".. __:", string "__"]
+ skipSpaces
+ src <- manyTill anyChar newline
+ state <- getState
+ return (Key [Str "_"] (Src (removeLeadingTrailingSpace src) "")))
+
+regularKeyQuoted = try (do
+ string ".. _`"
+ ref <- manyTill inline (string "`:")
+ skipSpaces
+ src <- manyTill anyChar newline
+ return (Key (normalizeSpaces ref) (Src (removeLeadingTrailingSpace src) "")))
+
+regularKey = try (do
+ string ".. _"
+ ref <- manyTill inline (char ':')
+ skipSpaces
+ src <- manyTill anyChar newline
+ return (Key (normalizeSpaces ref) (Src (removeLeadingTrailingSpace src) "")))
+
+ --
+ -- inline
+ --
+
+text = choice [ strong, emph, code, str, tabchar, whitespace, endline ] <?> "text"
+
+inline = choice [ escapedChar, special, hyphens, text, symbol ] <?> "inline"
+
+special = choice [ link, image ] <?> "link, inline html, or image"
+
+hyphens = try (do
+ result <- many1 (char '-')
+ option Space endline -- don't want to treat endline after hyphen or dash as a space
+ return (Str result))
+
+escapedChar = escaped anyChar
+
+symbol = do
+ result <- oneOf specialChars
+ return (Str [result])
+
+-- parses inline code, between codeStart and codeEnd
+code =
+ try (do
+ string "``"
+ result <- manyTill anyChar (string "``")
+ let result' = removeLeadingTrailingSpace $ joinWithSep " " $ lines result
+ return (Code result'))
+
+emph = do
+ result <- enclosed (char '*') (char '*') inline
+ return (Emph (normalizeSpaces result))
+
+strong = do
+ result <- enclosed (string "**") (string "**") inline
+ return (Strong (normalizeSpaces result))
+
+whitespace = do
+ many1 spaceChar <?> "whitespace"
+ return Space
+
+tabchar = do
+ tab
+ return (Str "\t")
+
+str = do
+ notFollowedBy' oneWordReferenceLink
+ result <- many1 (noneOf (specialChars ++ "\t\n "))
+ return (Str result)
+
+-- an endline character that can be treated as a space, not a structural break
+endline =
+ try (do
+ newline
+ notFollowedBy blankline
+ -- parse potential list starts at beginning of line differently if in a list:
+ st <- getState
+ if ((stateParserContext st) == ListItemState) then
+ notFollowedBy' (choice [orderedListStart, bulletListStart])
+ else
+ option () pzero
+ return Space)
+
+--
+-- links
+--
+
+link = choice [explicitLink, referenceLink, autoLink, oneWordReferenceLink] <?> "link"
+
+explicitLink =
+ try (do
+ char '`'
+ label <- manyTill inline (try (do {spaces; char '<'}))
+ src <- manyTill (noneOf ">\n ") (char '>')
+ skipSpaces
+ string "`_"
+ return (Link (normalizeSpaces label) (Src (removeLeadingTrailingSpace src) "")))
+
+anonymousLinkEnding =
+ try (do
+ char '_'
+ state <- getState
+ let anonKeys = stateKeyBlocks state
+ -- if there's a list of anon key refs (from previous pass), pop one off.
+ -- otherwise return an anon key ref for the next pass to take care of...
+ case anonKeys of
+ (Key [Str "_"] src):rest ->
+ do{ setState (state { stateKeyBlocks = rest });
+ return src }
+ otherwise -> return (Ref [Str "_"]))
+
+referenceLink =
+ try (do
+ char '`'
+ label <- manyTill inline (string "`_")
+ src <- option (Ref []) anonymousLinkEnding
+ return (Link (normalizeSpaces label) src))
+
+oneWordReferenceLink =
+ try (do
+ label <- many1 alphaNum
+ char '_'
+ src <- option (Ref []) anonymousLinkEnding
+ notFollowedBy alphaNum -- because this_is_not a link
+ return (Link [Str label] src))
+
+uriScheme = oneOfStrings [ "http://", "https://", "ftp://", "file://", "mailto:",
+ "news:", "telnet:" ]
+
+uri = try (do
+ scheme <- uriScheme
+ identifier <- many1 (noneOf " \t\n")
+ return (scheme ++ identifier))
+
+autoURI = try (do
+ src <- uri
+ return (Link [Str src] (Src src "")))
+
+emailChar = alphaNum <|> oneOf "-+_."
+
+emailAddress = try (do
+ firstLetter <- alphaNum
+ restAddr <- many emailChar
+ let addr = firstLetter:restAddr
+ char '@'
+ dom <- domain
+ return (addr ++ '@':dom))
+
+domainChar = alphaNum <|> char '-'
+
+domain = try (do
+ first <- many1 domainChar
+ dom <- many1 (try (do{ char '.'; many1 domainChar }))
+ return (joinWithSep "." (first:dom)))
+
+autoEmail = try (do
+ src <- emailAddress
+ return (Link [Str src] (Src ("mailto:" ++ src) "")))
+
+autoLink = autoURI <|> autoEmail
+
+-- For now, we assume that all substitution references are for images.
+image =
+ try (do
+ char '|'
+ ref <- manyTill inline (char '|')
+ return (Image (normalizeSpaces ref) (Ref ref)))