diff options
author | fiddlosopher <fiddlosopher@788f1e2b-df1e-0410-8736-df70ead52e1b> | 2006-10-17 14:22:29 +0000 |
---|---|---|
committer | fiddlosopher <fiddlosopher@788f1e2b-df1e-0410-8736-df70ead52e1b> | 2006-10-17 14:22:29 +0000 |
commit | df7b68225101966051f8b592a27127bf789eb81e (patch) | |
tree | a063e97ed58d0bdb2cbb5a95c3e8c1bcce54aa00 /src/Text/Pandoc/Readers/RST.hs | |
parent | e7dbfef4d8aa528d9245424e9c372e900a774c90 (diff) | |
download | pandoc-df7b68225101966051f8b592a27127bf789eb81e.tar.gz |
initial import
git-svn-id: https://pandoc.googlecode.com/svn/trunk@2 788f1e2b-df1e-0410-8736-df70ead52e1b
Diffstat (limited to 'src/Text/Pandoc/Readers/RST.hs')
-rw-r--r-- | src/Text/Pandoc/Readers/RST.hs | 644 |
1 files changed, 644 insertions, 0 deletions
diff --git a/src/Text/Pandoc/Readers/RST.hs b/src/Text/Pandoc/Readers/RST.hs new file mode 100644 index 000000000..82e5ea303 --- /dev/null +++ b/src/Text/Pandoc/Readers/RST.hs @@ -0,0 +1,644 @@ +-- | Parse reStructuredText and return Pandoc document. +module Text.Pandoc.Readers.RST ( + readRST + ) where +import Text.Pandoc.Definition +import Text.ParserCombinators.Pandoc +import Text.Pandoc.Shared +import Text.Pandoc.Readers.HTML ( anyHtmlBlockTag, anyHtmlInlineTag ) +import Text.Regex ( matchRegex, mkRegex ) +import Text.ParserCombinators.Parsec +import Data.Maybe ( fromMaybe ) +import List ( findIndex ) +import Char ( toUpper ) + +-- | Parse reStructuredText string and return Pandoc document. +readRST :: ParserState -> String -> Pandoc +readRST = readWith parseRST + +-- | Parse a string and print result (for testing). +testString :: String -> IO () +testString = testStringWith parseRST + +-- +-- Constants and data structure definitions +--- + +bulletListMarkers = "*+-" +underlineChars = "!\"#$&'()*+,-./:;<=>?@[\\]^_`{|}~" + +-- treat these as potentially non-text when parsing inline: +specialChars = "\\`|*_<>$:[-" + +-- +-- parsing documents +-- + +isAnonKeyBlock block = case block of + (Key [Str "_"] str) -> True + otherwise -> False + +isNotAnonKeyBlock block = not (isAnonKeyBlock block) + +isHeader1 :: Block -> Bool +isHeader1 (Header 1 _) = True +isHeader1 _ = False + +isHeader2 :: Block -> Bool +isHeader2 (Header 2 _) = True +isHeader2 _ = False + +-- | Promote all headers in a list of blocks. (Part of +-- title transformation for RST.) +promoteHeaders :: Int -> [Block] -> [Block] +promoteHeaders num ((Header level text):rest) = + (Header (level - num) text):(promoteHeaders num rest) +promoteHeaders num (other:rest) = other:(promoteHeaders num rest) +promoteHeaders num [] = [] + +-- | If list of blocks starts with a header (or a header and subheader) +-- of level that are not found elsewhere, return it as a title and +-- promote all the other headers. +titleTransform :: [Block] -- ^ list of blocks + -> ([Block], [Inline]) -- ^ modified list of blocks, title +titleTransform ((Header 1 head1):(Header 2 head2):rest) = -- title and subtitle + if (any isHeader1 rest) || (any isHeader2 rest) then + ((Header 1 head1):(Header 2 head2):rest, []) + else + ((promoteHeaders 2 rest), head1 ++ [Str ":", Space] ++ head2) +titleTransform ((Header 1 head1):rest) = -- title, no subtitle + if (any isHeader1 rest) then + ((Header 1 head1):rest, []) + else + ((promoteHeaders 1 rest), head1) +titleTransform blocks = (blocks, []) + +parseRST = do + state <- getState + input <- getInput + blocks <- parseBlocks -- first pass + let anonymousKeys = filter isAnonKeyBlock blocks + let blocks' = if (null anonymousKeys) then + blocks + else -- run parser again to fill in anonymous links... + case runParser parseBlocks (state { stateKeyBlocks = anonymousKeys }) + "RST source, second pass" input of + Left err -> error $ "\nError:\n" ++ show err + Right result -> (filter isNotAnonKeyBlock result) + let (blocks'', title) = if stateStandalone state then + titleTransform blocks' + else + (blocks', []) + state <- getState + let authors = stateAuthors state + let date = stateDate state + let title' = if (null title) then (stateTitle state) else title + return (Pandoc (Meta title' authors date) blocks'') + +-- +-- parsing blocks +-- + +parseBlocks = do + result <- manyTill block eof + return result + +block = choice [ codeBlock, rawHtmlBlock, rawLaTeXBlock, blockQuote, referenceKey, + imageBlock, unknownDirective, header, hrule, list, fieldList, lineBlock, + para, plain, blankBlock, nullBlock ] <?> "block" + +-- +-- field list +-- + +fieldListItem = try (do + char ':' + name <- many1 alphaNum + string ": " + skipSpaces + first <- manyTill anyChar newline + rest <- many (do{ notFollowedBy (char ':'); + notFollowedBy blankline; + skipSpaces; + manyTill anyChar newline }) + return (name, (joinWithSep " " (first:rest)))) + +fieldList = try (do + items <- many1 fieldListItem + blanklines + let authors = case (lookup "Authors" items) of + Just auth -> [auth] + Nothing -> map snd (filter (\(x,y) -> x == "Author") items) + let date = case (lookup "Date" items) of + Just dat -> dat + Nothing -> "" + let title = case (lookup "Title" items) of + Just tit -> [Str tit] + Nothing -> [] + let remaining = filter (\(x,y) -> (x /= "Authors") && (x /= "Author") && (x /= "Date") && + (x /= "Title")) items + let result = map (\(x,y) -> Para [Strong [Str x], Str ":", Space, Str y]) remaining + updateState (\st -> st { stateAuthors = authors, stateDate = date, stateTitle = title }) + return (BlockQuote result)) + +-- +-- line block +-- + +lineBlockLine = try (do + string "| " + white <- many (oneOf " \t") + line <- manyTill inline newline + let line' = (if null white then [] else [Str white]) ++ line ++ [LineBreak] + return line') + +lineBlock = try (do + lines <- many1 lineBlockLine + blanklines + return $ Para (concat lines)) + +-- +-- paragraph block +-- + +para = choice [ paraBeforeCodeBlock, paraNormal ] <?> "paragraph" + +codeBlockStart = try (do + string "::" + blankline + blankline) + +-- paragraph that ends in a :: starting a code block +paraBeforeCodeBlock = try (do + result <- many1 (do {notFollowedBy' codeBlockStart; inline}) + followedBy' (string "::") + return (Para (if (last result == Space) then + normalizeSpaces result + else + (normalizeSpaces result) ++ [Str ":"]))) + +-- regular paragraph +paraNormal = try (do + result <- many1 inline + newline + blanklines + let result' = normalizeSpaces result + return (Para result')) + +plain = do + result <- many1 inline + let result' = normalizeSpaces result + return (Plain result') + +-- +-- image block +-- + +imageBlock = try (do + string ".. image:: " + src <- manyTill anyChar newline + return (Plain [Image [Str "image"] (Src src "")])) + +-- +-- header blocks +-- + +header = choice [ doubleHeader, singleHeader ] <?> "header" + +-- a header with lines on top and bottom +doubleHeader = try (do + c <- oneOf underlineChars + rest <- many (char c) -- the top line + let lenTop = length (c:rest) + skipSpaces + newline + txt <- many1 (do {notFollowedBy blankline; inline}) + pos <- getPosition + let len = (sourceColumn pos) - 1 + if (len > lenTop) then fail "title longer than border" else (do {return ()}) + blankline -- spaces and newline + count lenTop (char c) -- the bottom line + blanklines + -- check to see if we've had this kind of header before. + -- if so, get appropriate level. if not, add to list. + state <- getState + let headerTable = stateHeaderTable state + let (headerTable', level) = case findIndex (== DoubleHeader c) headerTable of + Just ind -> (headerTable, ind + 1) + Nothing -> (headerTable ++ [DoubleHeader c], (length headerTable) + 1) + setState (state { stateHeaderTable = headerTable' }) + return (Header level (normalizeSpaces txt))) + +-- a header with line on the bottom only +singleHeader = try (do + notFollowedBy' whitespace + txt <- many1 (do {notFollowedBy blankline; inline}) + pos <- getPosition + let len = (sourceColumn pos) - 1 + blankline + c <- oneOf underlineChars + rest <- count (len - 1) (char c) + many (char c) + blanklines + state <- getState + let headerTable = stateHeaderTable state + let (headerTable', level) = case findIndex (== SingleHeader c) headerTable of + Just ind -> (headerTable, ind + 1) + Nothing -> (headerTable ++ [SingleHeader c], (length headerTable) + 1) + setState (state { stateHeaderTable = headerTable' }) + return (Header level (normalizeSpaces txt))) + +-- +-- hrule block +-- + +hruleWith chr = + try (do + count 4 (char chr) + skipMany (char chr) + skipSpaces + newline + blanklines + return HorizontalRule) + +hrule = choice (map hruleWith underlineChars) <?> "hrule" + +-- +-- code blocks +-- + +-- read a line indented by a given string +indentedLine indents = try (do + string indents + result <- manyTill anyChar newline + return (result ++ "\n")) + +-- two or more indented lines, possibly separated by blank lines +-- if variable = True, then any indent will work, but it must be consistent through the block +-- if variable = False, indent should be one tab or equivalent in spaces +indentedBlock variable = try (do + state <- getState + let tabStop = stateTabStop state + indents <- if variable then + many1 (oneOf " \t") + else + oneOfStrings ["\t", (replicate tabStop ' ')] + firstline <- manyTill anyChar newline + rest <- many (choice [ indentedLine indents, + try (do {b <- blanklines; l <- indentedLine indents; return (b ++ l)})]) + option "" blanklines + return (firstline ++ "\n" ++ (concat rest))) + +codeBlock = try (do + codeBlockStart + result <- indentedBlock False -- the False means we want one tab stop indent on each line + return (CodeBlock result)) + +-- +-- raw html +-- + +rawHtmlBlock = try (do + string ".. raw:: html" + blanklines + result <- indentedBlock True + return (RawHtml result)) + +-- +-- raw latex +-- + +rawLaTeXBlock = try (do + string ".. raw:: latex" + blanklines + result <- indentedBlock True + return (Para [(TeX result)])) + +-- +-- block quotes +-- + +blockQuote = try (do + block <- indentedBlock True + -- parse the extracted block, which may contain various block elements: + state <- getState + let parsed = case runParser parseBlocks (state {stateParserContext = BlockQuoteState}) + "block" (block ++ "\n\n") of + Left err -> error $ "Raw block:\n" ++ show block ++ "\nError:\n" ++ show err + Right result -> result + return (BlockQuote parsed)) + +-- +-- list blocks +-- + +list = choice [ bulletList, orderedList ] <?> "list" + +-- parses bullet list start and returns its length (inc. following whitespace) +bulletListStart = + try (do + notFollowedBy' hrule -- because hrules start out just like lists + marker <- oneOf bulletListMarkers + white <- many1 spaceChar + let len = length (marker:white) + return len) + +withPeriodSuffix parser = try (do + a <- parser + b <- char '.' + return (a ++ [b])) + +withParentheses parser = try (do + a <- char '(' + b <- parser + c <- char ')' + return ([a] ++ b ++ [c])) + +withRightParen parser = try (do + a <- parser + b <- char ')' + return (a ++ [b])) + +upcaseWord = map toUpper + +romanNumeral = do + let lowerNumerals = ["i", "ii", "iii", "iiii", "iv", "v", "vi", "vii", "viii", "ix", "x", "xi", "xii", "xiii", "xiv", "xv", "xvi", "xvii", "xviii", "xix", "xx", "xxi", "xxii", "xxiii", "xxiv" ] + let upperNumerals = map upcaseWord lowerNumerals + result <- choice $ map string (lowerNumerals ++ upperNumerals) + return result + +orderedListEnumerator = choice [ many1 digit, + string "#", + count 1 letter, + romanNumeral ] + +-- parses ordered list start and returns its length (inc. following whitespace) +orderedListStart = + try (do + marker <- choice [ withPeriodSuffix orderedListEnumerator, + withParentheses orderedListEnumerator, + withRightParen orderedListEnumerator ] + white <- many1 spaceChar + let len = length (marker ++ white) + return len) + +-- parse a line of a list item +listLine markerLength = try (do + notFollowedBy blankline + indentWith markerLength + line <- manyTill anyChar newline + return (line ++ "\n")) + +-- indent by specified number of spaces (or equiv. tabs) +indentWith num = do + state <- getState + let tabStop = stateTabStop state + if (num < tabStop) then + count num (char ' ') + else + choice [ try (count num (char ' ')), + (try (do {char '\t'; count (num - tabStop) (char ' ')})) ] + +-- parse raw text for one list item, excluding start marker and continuations +rawListItem start = + try (do + markerLength <- start + firstLine <- manyTill anyChar newline + restLines <- many (listLine markerLength) + return (markerLength, (firstLine ++ "\n" ++ (concat restLines)))) + +-- continuation of a list item - indented and separated by blankline or (in compact lists) +-- endline. Note: nested lists are parsed as continuations. +listContinuation markerLength = + try (do + blanks <- many1 blankline + result <- many1 (listLine markerLength) + return (blanks ++ (concat result))) + +listItem start = + try (do + (markerLength, first) <- rawListItem start + rest <- many (listContinuation markerLength) + blanks <- choice [ try (do {b <- many blankline; followedBy' start; return b}), + many1 blankline ] -- whole list must end with blank + -- parsing with ListItemState forces markers at beginning of lines to + -- count as list item markers, even if not separated by blank space. + -- see definition of "endline" + state <- getState + let parsed = case runParser parseBlocks (state {stateParserContext = ListItemState}) + "list item" raw of + Left err -> error $ "Raw:\n" ++ raw ++ "\nError:\n" ++ show err + Right result -> result + where raw = concat (first:rest) ++ blanks + return parsed) + +orderedList = + try (do + items <- many1 (listItem orderedListStart) + let items' = compactify items + return (OrderedList items')) + +bulletList = + try (do + items <- many1 (listItem bulletListStart) + let items' = compactify items + return (BulletList items')) + +-- +-- unknown directive (e.g. comment) +-- + +unknownDirective = try (do + string ".. " + manyTill anyChar newline + many (do {string " "; + char ':'; + many1 (noneOf "\n:"); + char ':'; + many1 (noneOf "\n"); + newline}) + option "" blanklines + return Null) + +-- +-- reference key +-- + +referenceKey = choice [imageKey, anonymousKey, regularKeyQuoted, regularKey] + +imageKey = try (do + string ".. |" + ref <- manyTill inline (char '|') + skipSpaces + string "image::" + src <- manyTill anyChar newline + return (Key (normalizeSpaces ref) (Src (removeLeadingTrailingSpace src) ""))) + +anonymousKey = try (do + choice [string ".. __:", string "__"] + skipSpaces + src <- manyTill anyChar newline + state <- getState + return (Key [Str "_"] (Src (removeLeadingTrailingSpace src) ""))) + +regularKeyQuoted = try (do + string ".. _`" + ref <- manyTill inline (string "`:") + skipSpaces + src <- manyTill anyChar newline + return (Key (normalizeSpaces ref) (Src (removeLeadingTrailingSpace src) ""))) + +regularKey = try (do + string ".. _" + ref <- manyTill inline (char ':') + skipSpaces + src <- manyTill anyChar newline + return (Key (normalizeSpaces ref) (Src (removeLeadingTrailingSpace src) ""))) + + -- + -- inline + -- + +text = choice [ strong, emph, code, str, tabchar, whitespace, endline ] <?> "text" + +inline = choice [ escapedChar, special, hyphens, text, symbol ] <?> "inline" + +special = choice [ link, image ] <?> "link, inline html, or image" + +hyphens = try (do + result <- many1 (char '-') + option Space endline -- don't want to treat endline after hyphen or dash as a space + return (Str result)) + +escapedChar = escaped anyChar + +symbol = do + result <- oneOf specialChars + return (Str [result]) + +-- parses inline code, between codeStart and codeEnd +code = + try (do + string "``" + result <- manyTill anyChar (string "``") + let result' = removeLeadingTrailingSpace $ joinWithSep " " $ lines result + return (Code result')) + +emph = do + result <- enclosed (char '*') (char '*') inline + return (Emph (normalizeSpaces result)) + +strong = do + result <- enclosed (string "**") (string "**") inline + return (Strong (normalizeSpaces result)) + +whitespace = do + many1 spaceChar <?> "whitespace" + return Space + +tabchar = do + tab + return (Str "\t") + +str = do + notFollowedBy' oneWordReferenceLink + result <- many1 (noneOf (specialChars ++ "\t\n ")) + return (Str result) + +-- an endline character that can be treated as a space, not a structural break +endline = + try (do + newline + notFollowedBy blankline + -- parse potential list starts at beginning of line differently if in a list: + st <- getState + if ((stateParserContext st) == ListItemState) then + notFollowedBy' (choice [orderedListStart, bulletListStart]) + else + option () pzero + return Space) + +-- +-- links +-- + +link = choice [explicitLink, referenceLink, autoLink, oneWordReferenceLink] <?> "link" + +explicitLink = + try (do + char '`' + label <- manyTill inline (try (do {spaces; char '<'})) + src <- manyTill (noneOf ">\n ") (char '>') + skipSpaces + string "`_" + return (Link (normalizeSpaces label) (Src (removeLeadingTrailingSpace src) ""))) + +anonymousLinkEnding = + try (do + char '_' + state <- getState + let anonKeys = stateKeyBlocks state + -- if there's a list of anon key refs (from previous pass), pop one off. + -- otherwise return an anon key ref for the next pass to take care of... + case anonKeys of + (Key [Str "_"] src):rest -> + do{ setState (state { stateKeyBlocks = rest }); + return src } + otherwise -> return (Ref [Str "_"])) + +referenceLink = + try (do + char '`' + label <- manyTill inline (string "`_") + src <- option (Ref []) anonymousLinkEnding + return (Link (normalizeSpaces label) src)) + +oneWordReferenceLink = + try (do + label <- many1 alphaNum + char '_' + src <- option (Ref []) anonymousLinkEnding + notFollowedBy alphaNum -- because this_is_not a link + return (Link [Str label] src)) + +uriScheme = oneOfStrings [ "http://", "https://", "ftp://", "file://", "mailto:", + "news:", "telnet:" ] + +uri = try (do + scheme <- uriScheme + identifier <- many1 (noneOf " \t\n") + return (scheme ++ identifier)) + +autoURI = try (do + src <- uri + return (Link [Str src] (Src src ""))) + +emailChar = alphaNum <|> oneOf "-+_." + +emailAddress = try (do + firstLetter <- alphaNum + restAddr <- many emailChar + let addr = firstLetter:restAddr + char '@' + dom <- domain + return (addr ++ '@':dom)) + +domainChar = alphaNum <|> char '-' + +domain = try (do + first <- many1 domainChar + dom <- many1 (try (do{ char '.'; many1 domainChar })) + return (joinWithSep "." (first:dom))) + +autoEmail = try (do + src <- emailAddress + return (Link [Str src] (Src ("mailto:" ++ src) ""))) + +autoLink = autoURI <|> autoEmail + +-- For now, we assume that all substitution references are for images. +image = + try (do + char '|' + ref <- manyTill inline (char '|') + return (Image (normalizeSpaces ref) (Ref ref))) |