initial import

git-svn-id: https://pandoc.googlecode.com/svn/trunk@2 788f1e2b-df1e-0410-8736-df70ead52e1b
author: fiddlosopher <fiddlosopher@788f1e2b-df1e-0410-8736-df70ead52e1b> 2006-10-17 14:22:29 +0000
committer: fiddlosopher <fiddlosopher@788f1e2b-df1e-0410-8736-df70ead52e1b> 2006-10-17 14:22:29 +0000
commit: df7b68225101966051f8b592a27127bf789eb81e (patch)
tree: a063e97ed58d0bdb2cbb5a95c3e8c1bcce54aa00 /src/Text/Pandoc/Readers/RST.hs
parent: e7dbfef4d8aa528d9245424e9c372e900a774c90 (diff)
download: pandoc-df7b68225101966051f8b592a27127bf789eb81e.tar.gz
1 files changed, 644 insertions, 0 deletions
diff --git a/src/Text/Pandoc/Readers/RST.hs b/src/Text/Pandoc/Readers/RST.hs
new file mode 100644
index 000000000..82e5ea303
--- /dev/null
+++ b/src/Text/Pandoc/Readers/RST.hs
@@ -0,0 +1,644 @@
+-- | Parse reStructuredText and return Pandoc document.
+module Text.Pandoc.Readers.RST ( 
+                                readRST
+                               ) where
+import Text.Pandoc.Definition
+import Text.ParserCombinators.Pandoc
+import Text.Pandoc.Shared 
+import Text.Pandoc.Readers.HTML ( anyHtmlBlockTag, anyHtmlInlineTag )
+import Text.Regex ( matchRegex, mkRegex )
+import Text.ParserCombinators.Parsec
+import Data.Maybe ( fromMaybe )
+import List ( findIndex )
+import Char ( toUpper )
+
+-- | Parse reStructuredText string and return Pandoc document.
+readRST :: ParserState -> String -> Pandoc
+readRST = readWith parseRST
+
+-- | Parse a string and print result (for testing).
+testString :: String -> IO ()
+testString = testStringWith parseRST
+
+--
+-- Constants and data structure definitions
+---
+
+bulletListMarkers = "*+-"
+underlineChars = "!\"#$&'()*+,-./:;<=>?@[\\]^_`{|}~"
+
+-- treat these as potentially non-text when parsing inline:
+specialChars = "\\`|*_<>$:[-"
+
+--
+-- parsing documents
+--
+
+isAnonKeyBlock block = case block of
+                         (Key [Str "_"] str) -> True
+                         otherwise -> False
+
+isNotAnonKeyBlock block = not (isAnonKeyBlock block)
+
+isHeader1 :: Block -> Bool
+isHeader1 (Header 1 _) = True
+isHeader1 _ = False
+
+isHeader2 :: Block -> Bool
+isHeader2 (Header 2 _) = True
+isHeader2 _ = False
+
+-- | Promote all headers in a list of blocks.  (Part of
+-- title transformation for RST.)
+promoteHeaders :: Int -> [Block] -> [Block]
+promoteHeaders num ((Header level text):rest) = 
+    (Header (level - num) text):(promoteHeaders num rest)
+promoteHeaders num (other:rest) = other:(promoteHeaders num rest)
+promoteHeaders num [] = []
+
+-- | If list of blocks starts with a header (or a header and subheader)
+-- of level that are not found elsewhere, return it as a title and
+-- promote all the other headers. 
+titleTransform :: [Block]              -- ^ list of blocks
+               -> ([Block], [Inline])  -- ^ modified list of blocks, title
+titleTransform ((Header 1 head1):(Header 2 head2):rest) =  -- title and subtitle
+    if (any isHeader1 rest) || (any isHeader2 rest) then
+        ((Header 1 head1):(Header 2 head2):rest, [])
+      else
+        ((promoteHeaders 2 rest), head1 ++ [Str ":", Space] ++ head2)
+titleTransform ((Header 1 head1):rest) =       -- title, no subtitle
+    if (any isHeader1 rest) then
+        ((Header 1 head1):rest, [])
+      else
+        ((promoteHeaders 1 rest), head1)
+titleTransform blocks = (blocks, [])
+
+parseRST = do
+  state <- getState
+  input <- getInput
+  blocks <- parseBlocks   -- first pass
+  let anonymousKeys = filter isAnonKeyBlock blocks
+  let blocks' = if (null anonymousKeys) then
+                    blocks
+                  else -- run parser again to fill in anonymous links...
+                    case runParser parseBlocks (state { stateKeyBlocks = anonymousKeys }) 
+                             "RST source, second pass" input of
+                      Left err -> error $ "\nError:\n" ++ show err
+                      Right result -> (filter isNotAnonKeyBlock result)
+  let (blocks'', title) = if stateStandalone state then
+                              titleTransform blocks'
+                            else
+                              (blocks', [])
+  state <- getState
+  let authors = stateAuthors state
+  let date = stateDate state
+  let title' = if (null title) then (stateTitle state) else title
+  return (Pandoc (Meta title' authors date) blocks'')
+
+--
+-- parsing blocks
+--
+
+parseBlocks = do
+  result <- manyTill block eof
+  return result
+
+block = choice [ codeBlock, rawHtmlBlock, rawLaTeXBlock, blockQuote, referenceKey, 
+                 imageBlock, unknownDirective, header, hrule, list, fieldList, lineBlock,
+                 para, plain, blankBlock, nullBlock ] <?> "block"
+
+--
+-- field list
+--
+
+fieldListItem = try (do
+  char ':'
+  name <- many1 alphaNum
+  string ": "
+  skipSpaces
+  first <- manyTill anyChar newline
+  rest <- many (do{ notFollowedBy (char ':'); 
+                    notFollowedBy blankline; 
+                    skipSpaces;
+                    manyTill anyChar newline })
+  return (name, (joinWithSep " " (first:rest)))) 
+
+fieldList = try (do
+  items <- many1 fieldListItem
+  blanklines
+  let authors = case (lookup "Authors" items) of
+                   Just auth -> [auth]
+                   Nothing   -> map snd (filter (\(x,y) -> x == "Author") items)
+  let date = case (lookup "Date" items) of
+                   Just dat  -> dat
+                   Nothing   -> ""
+  let title = case (lookup "Title" items) of
+                    Just tit  -> [Str tit]
+                    Nothing   -> []
+  let remaining = filter (\(x,y) -> (x /= "Authors") && (x /= "Author") && (x /= "Date") && 
+                                    (x /= "Title")) items
+  let result = map (\(x,y) -> Para [Strong [Str x], Str ":", Space, Str y]) remaining
+  updateState (\st -> st { stateAuthors = authors, stateDate = date, stateTitle = title })
+  return (BlockQuote result))
+
+--
+-- line block
+--
+
+lineBlockLine = try (do
+  string "| "
+  white <- many (oneOf " \t")
+  line <- manyTill inline newline
+  let line' = (if null white then [] else [Str white]) ++ line ++ [LineBreak]
+  return line')
+
+lineBlock = try (do
+  lines <- many1 lineBlockLine
+  blanklines
+  return $ Para (concat lines))
+
+--
+-- paragraph block
+--
+
+para = choice [ paraBeforeCodeBlock, paraNormal ] <?> "paragraph"
+
+codeBlockStart = try (do
+                        string "::"
+                        blankline
+                        blankline)
+
+-- paragraph that ends in a :: starting a code block
+paraBeforeCodeBlock = try (do
+  result <- many1 (do {notFollowedBy' codeBlockStart; inline})
+  followedBy' (string "::")
+  return (Para (if (last result == Space) then 
+                    normalizeSpaces result
+                else 
+                    (normalizeSpaces result) ++ [Str ":"])))
+
+-- regular paragraph
+paraNormal = try (do 
+  result <- many1 inline
+  newline
+  blanklines
+  let result' = normalizeSpaces result
+  return (Para result'))
+
+plain = do
+  result <- many1 inline
+  let result' = normalizeSpaces result
+  return (Plain result')
+
+--
+-- image block
+--
+
+imageBlock = try (do
+    string ".. image:: "
+    src <- manyTill anyChar newline
+    return (Plain [Image [Str "image"] (Src src "")]))
+
+--
+-- header blocks
+--
+
+header = choice [ doubleHeader, singleHeader ] <?> "header"
+
+-- a header with lines on top and bottom
+doubleHeader = try (do
+    c <- oneOf underlineChars
+    rest <- many (char c)  -- the top line
+    let lenTop = length (c:rest)
+    skipSpaces
+    newline
+    txt <- many1 (do {notFollowedBy blankline; inline})
+    pos <- getPosition  
+    let len = (sourceColumn pos) - 1
+    if (len > lenTop) then fail "title longer than border" else (do {return ()})
+    blankline              -- spaces and newline
+    count lenTop (char c)  -- the bottom line
+    blanklines
+    -- check to see if we've had this kind of header before.  
+    -- if so, get appropriate level.  if not, add to list.
+    state <- getState
+    let headerTable = stateHeaderTable state
+    let (headerTable', level) =  case findIndex (== DoubleHeader c) headerTable of
+         Just ind    -> (headerTable, ind + 1)
+         Nothing     -> (headerTable ++ [DoubleHeader c], (length headerTable) + 1)
+    setState (state { stateHeaderTable = headerTable' })
+    return (Header level (normalizeSpaces txt)))
+
+-- a header with line on the bottom only
+singleHeader = try (do 
+    notFollowedBy' whitespace
+    txt <- many1 (do {notFollowedBy blankline; inline})
+    pos <- getPosition
+    let len = (sourceColumn pos) - 1
+    blankline
+    c <- oneOf underlineChars
+    rest <- count (len - 1) (char c)
+    many (char c)
+    blanklines
+    state <- getState
+    let headerTable = stateHeaderTable state
+    let (headerTable', level) =  case findIndex (== SingleHeader c) headerTable of
+         Just ind    -> (headerTable, ind + 1)
+         Nothing     -> (headerTable ++ [SingleHeader c], (length headerTable) + 1)
+    setState (state { stateHeaderTable = headerTable' })
+    return (Header level (normalizeSpaces txt)))
+
+--
+-- hrule block
+--
+
+hruleWith chr = 
+    try (do
+           count 4 (char chr)
+           skipMany (char chr)
+           skipSpaces
+           newline
+           blanklines
+           return HorizontalRule)
+
+hrule = choice (map hruleWith underlineChars) <?> "hrule"
+
+--
+-- code blocks
+--
+
+-- read a line indented by a given string
+indentedLine indents = try (do
+    string indents
+    result <- manyTill anyChar newline
+    return (result ++ "\n"))
+
+-- two or more indented lines, possibly separated by blank lines
+-- if variable = True, then any indent will work, but it must be consistent through the block
+-- if variable = False, indent should be one tab or equivalent in spaces
+indentedBlock variable = try (do 
+  state <- getState
+  let tabStop = stateTabStop state
+  indents <- if variable then
+                 many1 (oneOf " \t")
+             else
+                 oneOfStrings ["\t", (replicate tabStop ' ')]
+  firstline <- manyTill anyChar newline
+  rest <- many (choice [ indentedLine indents, 
+                         try (do {b <- blanklines; l <- indentedLine indents; return (b ++ l)})])
+  option "" blanklines
+  return (firstline ++ "\n" ++ (concat rest)))
+
+codeBlock = try (do
+    codeBlockStart
+    result <- indentedBlock False  -- the False means we want one tab stop indent on each line
+    return (CodeBlock result))
+
+--
+-- raw html
+--
+
+rawHtmlBlock = try (do
+    string ".. raw:: html"
+    blanklines
+    result <- indentedBlock True
+    return (RawHtml result))
+
+--
+-- raw latex
+--
+
+rawLaTeXBlock = try (do
+    string ".. raw:: latex"
+    blanklines
+    result <- indentedBlock True
+    return (Para [(TeX result)]))
+
+--
+-- block quotes
+--
+
+blockQuote = try (do
+    block <- indentedBlock True
+    -- parse the extracted block, which may contain various block elements:
+    state <- getState
+    let parsed = case runParser parseBlocks (state {stateParserContext = BlockQuoteState}) 
+                  "block" (block ++ "\n\n") of
+                   Left err -> error $ "Raw block:\n" ++ show block ++ "\nError:\n" ++ show err
+                   Right result -> result
+    return (BlockQuote parsed))
+
+--
+-- list blocks
+--
+
+list = choice [ bulletList, orderedList ] <?> "list"
+
+-- parses bullet list start and returns its length (inc. following whitespace)
+bulletListStart = 
+    try (do
+           notFollowedBy' hrule  -- because hrules start out just like lists
+           marker <- oneOf bulletListMarkers
+           white <- many1 spaceChar
+           let len = length (marker:white)
+           return len) 
+
+withPeriodSuffix parser = try (do
+                                 a <- parser
+                                 b <- char '.'
+                                 return (a ++ [b]))
+
+withParentheses parser = try (do
+                                a <- char '('
+                                b <- parser
+                                c <- char ')'
+                                return ([a] ++ b ++ [c]))
+
+withRightParen parser = try (do
+                               a <- parser
+                               b <- char ')'
+                               return (a ++ [b]))
+
+upcaseWord = map toUpper
+
+romanNumeral = do
+  let lowerNumerals = ["i", "ii", "iii", "iiii", "iv", "v", "vi", "vii", "viii", "ix", "x", "xi", "xii", "xiii", "xiv", "xv", "xvi", "xvii", "xviii", "xix", "xx", "xxi", "xxii", "xxiii", "xxiv" ]
+  let upperNumerals = map upcaseWord lowerNumerals
+  result <- choice $ map string (lowerNumerals ++ upperNumerals)
+  return result
+
+orderedListEnumerator = choice [ many1 digit, 
+                                 string "#", 
+                                 count 1 letter, 
+                                 romanNumeral ]
+
+-- parses ordered list start and returns its length (inc. following whitespace)
+orderedListStart = 
+    try (do
+           marker <- choice [ withPeriodSuffix orderedListEnumerator, 
+                              withParentheses orderedListEnumerator, 
+                              withRightParen orderedListEnumerator ]
+           white <- many1 spaceChar
+           let len = length (marker ++ white)
+           return len)
+
+-- parse a line of a list item
+listLine markerLength = try (do
+  notFollowedBy blankline
+  indentWith markerLength
+  line <- manyTill anyChar newline
+  return (line ++ "\n"))
+
+-- indent by specified number of spaces (or equiv. tabs)
+indentWith num = do
+  state <- getState
+  let tabStop = stateTabStop state
+  if (num < tabStop) then
+      count num  (char ' ')
+    else
+      choice [ try (count num (char ' ')), 
+               (try (do {char '\t'; count (num - tabStop) (char ' ')})) ] 
+
+-- parse raw text for one list item, excluding start marker and continuations
+rawListItem start = 
+    try (do
+           markerLength <- start
+           firstLine <- manyTill anyChar newline
+           restLines <- many (listLine markerLength)
+           return (markerLength, (firstLine ++ "\n" ++ (concat restLines))))
+
+-- continuation of a list item - indented and separated by blankline or (in compact lists) 
+-- endline.  Note: nested lists are parsed as continuations.
+listContinuation markerLength = 
+    try (do
+           blanks <- many1 blankline
+           result <- many1 (listLine markerLength)
+           return (blanks ++ (concat result)))
+
+listItem start = 
+    try (do 
+           (markerLength, first) <- rawListItem start
+           rest <- many (listContinuation markerLength)
+           blanks <- choice [ try (do {b <- many blankline; followedBy' start; return b}), 
+                              many1 blankline ]  -- whole list must end with blank
+           -- parsing with ListItemState forces markers at beginning of lines to
+           -- count as list item markers, even if not separated by blank space.
+           -- see definition of "endline"
+           state <- getState
+           let parsed = case runParser parseBlocks (state {stateParserContext = ListItemState}) 
+                        "list item" raw of
+                          Left err -> error $ "Raw:\n" ++ raw ++ "\nError:\n" ++ show err
+                          Right result -> result
+                   where raw = concat (first:rest) ++ blanks 
+           return parsed)
+
+orderedList = 
+    try (do
+           items <- many1 (listItem orderedListStart)
+           let items' = compactify items
+           return (OrderedList items'))
+
+bulletList = 
+    try (do
+           items <- many1 (listItem bulletListStart)
+           let items' = compactify items
+           return (BulletList items'))
+
+--
+-- unknown directive (e.g. comment)
+--
+
+unknownDirective = try (do
+    string ".. "
+    manyTill anyChar newline
+    many (do {string "   "; 
+              char ':'; 
+              many1 (noneOf "\n:"); 
+              char ':'; 
+              many1 (noneOf "\n"); 
+              newline})
+    option "" blanklines
+    return Null)
+
+-- 
+-- reference key
+--
+
+referenceKey = choice [imageKey, anonymousKey, regularKeyQuoted, regularKey]
+
+imageKey = try (do
+     string ".. |"
+     ref <- manyTill inline (char '|')
+     skipSpaces
+     string "image::"
+     src <- manyTill anyChar newline
+     return (Key (normalizeSpaces ref) (Src (removeLeadingTrailingSpace src) "")))
+
+anonymousKey = try (do
+     choice [string ".. __:", string "__"]
+     skipSpaces
+     src <- manyTill anyChar newline
+     state <- getState
+     return (Key [Str "_"] (Src (removeLeadingTrailingSpace src) "")))
+
+regularKeyQuoted = try (do
+     string ".. _`"
+     ref <- manyTill inline (string "`:")
+     skipSpaces
+     src <- manyTill anyChar newline
+     return (Key (normalizeSpaces ref) (Src (removeLeadingTrailingSpace src) "")))
+
+regularKey = try (do
+     string ".. _"
+     ref <- manyTill inline (char ':')
+     skipSpaces
+     src <- manyTill anyChar newline
+     return (Key (normalizeSpaces ref) (Src (removeLeadingTrailingSpace src) "")))
+
+ -- 
+ -- inline
+ --
+
+text = choice [ strong, emph, code, str, tabchar, whitespace, endline ] <?> "text"
+
+inline = choice [ escapedChar, special, hyphens, text, symbol ] <?> "inline"
+
+special = choice [ link, image ] <?> "link, inline html, or image"
+
+hyphens = try (do
+  result <- many1 (char '-')
+  option Space endline   -- don't want to treat endline after hyphen or dash as a space
+  return (Str result))
+
+escapedChar = escaped anyChar
+
+symbol = do 
+  result <- oneOf specialChars
+  return (Str [result])
+
+-- parses inline code, between codeStart and codeEnd
+code = 
+    try (do 
+           string "``"
+           result <- manyTill anyChar (string "``")
+           let result' = removeLeadingTrailingSpace $ joinWithSep " " $ lines result
+           return (Code result'))
+
+emph = do
+  result <- enclosed (char '*') (char '*') inline
+  return (Emph (normalizeSpaces result))
+
+strong = do
+  result <- enclosed (string "**") (string "**") inline
+  return (Strong (normalizeSpaces result))
+
+whitespace = do
+  many1 spaceChar <?> "whitespace"
+  return Space
+
+tabchar = do
+  tab
+  return (Str "\t")
+
+str = do 
+  notFollowedBy' oneWordReferenceLink
+  result <- many1 (noneOf (specialChars ++ "\t\n "))
+  return (Str result)
+
+-- an endline character that can be treated as a space, not a structural break
+endline =
+    try (do
+           newline
+           notFollowedBy blankline
+           -- parse potential list starts at beginning of line differently if in a list:
+           st <- getState
+           if ((stateParserContext st) == ListItemState) then 
+               notFollowedBy' (choice [orderedListStart, bulletListStart])
+             else 
+               option () pzero
+           return Space)
+
+--
+-- links
+--
+
+link = choice [explicitLink, referenceLink, autoLink, oneWordReferenceLink] <?> "link"
+
+explicitLink = 
+    try (do
+           char '`'
+           label <- manyTill inline (try (do {spaces; char '<'}))
+           src <- manyTill (noneOf ">\n ") (char '>')
+           skipSpaces
+           string "`_"
+           return (Link (normalizeSpaces label) (Src (removeLeadingTrailingSpace src) ""))) 
+
+anonymousLinkEnding = 
+    try (do
+           char '_'
+           state <- getState
+           let anonKeys = stateKeyBlocks state
+           -- if there's a list of anon key refs (from previous pass), pop one off.
+           -- otherwise return an anon key ref for the next pass to take care of...
+           case anonKeys of
+             (Key [Str "_"] src):rest -> 
+                 do{ setState (state { stateKeyBlocks = rest }); 
+                     return src }
+             otherwise -> return (Ref [Str "_"]))
+
+referenceLink = 
+    try (do
+           char '`'
+           label <- manyTill inline (string "`_")
+           src <- option (Ref []) anonymousLinkEnding
+           return (Link (normalizeSpaces label) src)) 
+
+oneWordReferenceLink =
+    try (do
+           label <- many1 alphaNum
+           char '_'
+           src <- option (Ref []) anonymousLinkEnding
+           notFollowedBy alphaNum    -- because this_is_not a link
+           return (Link [Str label] src)) 
+
+uriScheme = oneOfStrings [ "http://", "https://", "ftp://", "file://", "mailto:", 
+                           "news:", "telnet:" ]
+
+uri = try (do
+    scheme <- uriScheme
+    identifier <- many1 (noneOf " \t\n")
+    return (scheme ++ identifier))
+
+autoURI = try (do
+    src <- uri
+    return (Link [Str src] (Src src "")))
+
+emailChar = alphaNum <|> oneOf "-+_."
+
+emailAddress = try (do
+    firstLetter <- alphaNum
+    restAddr <- many emailChar
+    let addr = firstLetter:restAddr
+    char '@'
+    dom <- domain
+    return (addr ++ '@':dom))
+
+domainChar = alphaNum <|> char '-'
+
+domain = try (do
+    first <- many1 domainChar
+    dom <- many1 (try (do{ char '.'; many1 domainChar }))
+    return (joinWithSep "." (first:dom)))
+
+autoEmail = try (do
+    src <- emailAddress
+    return (Link [Str src] (Src ("mailto:" ++ src) "")))
+
+autoLink = autoURI <|> autoEmail
+
+-- For now, we assume that all substitution references are for images.
+image = 
+    try (do
+           char '|'
+           ref <- manyTill inline (char '|')
+           return (Image (normalizeSpaces ref) (Ref ref)))
author	fiddlosopher <fiddlosopher@788f1e2b-df1e-0410-8736-df70ead52e1b>	2006-10-17 14:22:29 +0000
committer	fiddlosopher <fiddlosopher@788f1e2b-df1e-0410-8736-df70ead52e1b>	2006-10-17 14:22:29 +0000
commit	df7b68225101966051f8b592a27127bf789eb81e (patch)
tree	a063e97ed58d0bdb2cbb5a95c3e8c1bcce54aa00 /src/Text/Pandoc/Readers/RST.hs
parent	e7dbfef4d8aa528d9245424e9c372e900a774c90 (diff)
download	pandoc-df7b68225101966051f8b592a27127bf789eb81e.tar.gz