{-
Copyright (C) 2006-2010 John MacFarlane <jgm@berkeley.edu>

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
-}

{- |
   Module      : Text.Pandoc.Readers.RST
   Copyright   : Copyright (C) 2006-2010 John MacFarlane
   License     : GNU GPL, version 2 or above

   Maintainer  : John MacFarlane <jgm@berkeley.edu>
   Stability   : alpha
   Portability : portable

Conversion from reStructuredText to 'Pandoc' document.
-}
module Text.Pandoc.Readers.RST (
                                readRST
                               ) where
import Text.Pandoc.Definition
import Text.Pandoc.Shared
import Text.Pandoc.Parsing
import Text.Pandoc.Options
import Control.Monad ( when, liftM, guard, mzero )
import Data.List ( findIndex, intersperse, intercalate,
                   transpose, sort, deleteFirstsBy, isSuffixOf )
import qualified Data.Map as M
import Text.Printf ( printf )
import Data.Maybe ( catMaybes )
import Control.Applicative ((<$>), (<$), (<*), (*>))
import Text.Pandoc.Builder (Inlines, Blocks, trimInlines, (<>))
import qualified Text.Pandoc.Builder as B
import Data.Monoid (mconcat, mempty)
import Data.Sequence (viewr, ViewR(..))
import Data.Char (toLower)

-- | Parse a reStructuredText string and return the resulting 'Pandoc'
-- document.
readRST :: ReaderOptions -- ^ Reader options
        -> String        -- ^ String to parse (assuming @'\n'@ line endings)
        -> Pandoc
readRST opts s = readWith parseRST initialState paddedInput
  where -- default parser state carrying the caller's reader options
        initialState = def{ stateOptions = opts }
        -- two newlines are appended to the input before it is parsed
        paddedInput  = s ++ "\n\n"

-- | Parser type used throughout this module: 'Parser' specialised to
-- @[Char]@ input with 'ParserState' as the user state.
type RSTParser = Parser [Char] ParserState

--
-- Constants and data structure definitions
---

-- Characters accepted as a bullet list marker (see 'bulletListStart').
bulletListMarkers :: [Char]
bulletListMarkers = ['*', '+', '-']

-- Characters that may form the border line of a header ('doubleHeader',
-- 'singleHeader') or a horizontal rule ('hrule').
underlineChars :: [Char]
underlineChars = "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"

-- Treat these as potentially non-text when parsing inline content:
-- 'str' never consumes them, and 'symbol' emits one of them as a
-- single-character string when no other inline parser applies.
specialChars :: [Char]
specialChars = "\\`|*_<>$:/[]{}()-.\"'\8216\8217\8220\8221"

--
-- parsing documents
--

-- True when the block is a 'Header' whose level equals the given number.
isHeader :: Int -> Block -> Bool
isHeader n blk =
  case blk of
       Header lvl _ -> lvl == n
       _            -> False

-- | Promote all headers in a list of blocks by subtracting the given
-- number from each header level; other blocks are left unchanged.
-- (Part of title transformation for RST.)
promoteHeaders :: Int -> [Block] -> [Block]
promoteHeaders num = map promote
  where promote (Header level text) = Header (level - num) text
        promote other               = other

-- | If the list of blocks starts with a level-1 header (optionally
-- followed by a level-2 header) and those levels do not occur anywhere
-- else, return it as the title and promote all the other headers.
-- A title and subtitle are joined with a colon and a space.
titleTransform :: [Block]              -- ^ list of blocks
               -> ([Block], [Inline])  -- ^ modified list of blocks, title
titleTransform blocks =
  case blocks of
       (Header 1 head1 : Header 2 head2 : rest)
         | noLevel 1 rest && noLevel 2 rest ->   -- both title & subtitle
             (promoteHeaders 2 rest, head1 ++ [Str ":", Space] ++ head2)
       (Header 1 head1 : rest)
         | noLevel 1 rest ->                     -- title, no subtitle
             (promoteHeaders 1 rest, head1)
       _ -> (blocks, [])
  where noLevel n = not . any (isHeader n)

-- Parse a whole document in two passes: the first collects reference
-- keys and notes into the parser state, the second parses the blocks.
parseRST :: RSTParser Pandoc
parseRST = do
  optional blanklines -- skip blank lines at beginning of file
  startPos <- getPosition
  -- first pass: gather reference keys and notes.  docMinusKeys is the
  -- raw document with blank lines where the keys and notes were.
  docMinusKeys <- concat <$>
                  manyTill (choice [referenceKey, noteBlock, lineClump]) eof
  setInput docMinusKeys
  setPosition startPos
  -- notes were accumulated by consing, so reverse them once here
  updateState $ \s -> s { stateNotes = reverse (stateNotes s) }
  -- second pass: parse it for real
  blocks <- B.toList <$> parseBlocks
  standalone <- getOption readerStandalone
  let (blocks', title) = if standalone
                            then titleTransform blocks
                            else (blocks, [])
  st <- getState
  -- a title derived from the headers wins over a :Title: field
  let title' = if null title then stateTitle st else title
  return $ Pandoc (Meta title' (stateAuthors st) (stateDate st)) blocks'

--
-- parsing blocks
--

-- Parse blocks until end of input and concatenate them.
parseBlocks :: RSTParser Blocks
parseBlocks = fmap mconcat (manyTill block eof)

-- Parse a single block; the alternatives are tried in the order listed.
block :: RSTParser Blocks
block = choice blockParsers <?> "block"
  where blockParsers =
          [ codeBlock
          , rawBlock
          , blockQuote
          , fieldList
          , directive
          , comment
          , header
          , hrule
          , lineBlock     -- must go before definitionList
          , table
          , list
          , lhsCodeBlock
          , para
          , plain
          ]

--
-- field list
--

-- Parse one field list item that starts with the given indentation and
-- return the field name together with the raw text of its body (the
-- rest of the first line plus any following indented block).
rawFieldListItem :: String -> RSTParser (String, String)
rawFieldListItem indent = try $ do
  _ <- string indent
  _ <- char ':'
  name <- many1Till (noneOf "\n") (char ':')
  skipSpaces
  firstLine <- manyTill anyChar newline
  -- continuation lines must be indented further than the field marker
  continuation <- option "" $ try $
                    lookAhead (string indent >> spaceChar) >> indentedBlock
  let firstPart | null firstLine = ""
                | otherwise      = firstLine ++ "\n"
  return (name, firstPart ++ continuation ++ "\n")

-- Parse a field list item.  The fields Author, Authors (when its body is
-- a single bullet list), Date and Title are stored in the parser state
-- and yield Nothing; any other field yields a (term, definitions) pair.
fieldListItem :: String
              -> RSTParser (Maybe (Inlines, [Blocks]))
fieldListItem indent = try $ do
  (name, raw) <- rawFieldListItem indent
  contents <- parseFromString parseBlocks raw
  optional blanklines
  let blocks = B.toList contents
      -- record a metadata value in the state; the field yields no item
      metadata f = updateState f >> return Nothing
  case (name, blocks) of
       ("Author", _) -> metadata $ \st ->
           st{ stateAuthors = stateAuthors st ++ [extractContents blocks] }
       ("Authors", [BulletList auths]) -> metadata $ \st ->
           st{ stateAuthors = map extractContents auths }
       ("Date", _) -> metadata $ \st ->
           st{ stateDate = extractContents blocks }
       ("Title", _) -> metadata $ \st ->
           st{ stateTitle = extractContents blocks }
       _ -> return $ Just (B.str name, [contents])

-- Return the inlines of a block list consisting of exactly one 'Plain'
-- or 'Para'; any other block list gives the empty list.
extractContents :: [Block] -> [Inline]
extractContents blocks =
  case blocks of
       [Plain ils] -> ils
       [Para ils]  -> ils
       _           -> []

-- Parse a field list.  All items must share the indentation of the
-- first one.  Metadata fields (see 'fieldListItem') are recorded in the
-- parser state and produce no output, so a list made up only of such
-- fields yields no block at all rather than an empty definition list.
fieldList :: RSTParser Blocks
fieldList = try $ do
  indent <- lookAhead $ many spaceChar
  items <- many1 $ fieldListItem indent
  -- many1 never returns an empty list, so test what is left after the
  -- metadata fields (Nothing) have been dropped
  case catMaybes items of
       []     -> return mempty
       items' -> return $ B.definitionList items'

--
-- line block
--

-- Parse one line of a line block: a '|' followed by a space or a
-- newline.  Leading whitespace after the marker is kept as a string.
lineBlockLine :: RSTParser Inlines
lineBlockLine = try $ do
  _ <- char '|'
  _ <- char ' ' <|> lookAhead (char '\n')
  white <- many spaceChar
  line <- many $ (notFollowedBy newline >> inline) <|> (try $ endline >>~ char ' ')
  optional endline
  let leading = if null white then mempty else B.str white
  return $ leading <> mconcat line

-- Parse a line block: one or more line-block lines followed by blank
-- lines, rendered as a paragraph with line breaks between the lines.
lineBlock :: RSTParser Blocks
lineBlock = try $
  (B.para . mconcat . intersperse B.linebreak <$> many1 lineBlockLine)
    <* blanklines

--
-- paragraph block
--

-- Parse a paragraph.  Text not followed by a newline and blank lines is
-- returned as a plain block instead.
-- note: a paragraph can end in a :: that starts a code block
para :: RSTParser Blocks
para = try $ do
  result <- trimInlines . mconcat <$> many1 inline
  let endOfPara = try $ do
        _ <- newline
        _ <- blanklines
        case viewr (B.unMany result) of
             ys :> Str xs
               | "::" `isSuffixOf` xs -> do
                   raw <- option mempty codeBlockBody
                   -- keep only one of the two trailing colons
                   return $ B.para (B.Many ys <> B.str (init xs)) <> raw
             _ -> return (B.para result)
  option (B.plain result) endOfPara

-- Parse one or more inlines as a plain (non-paragraph) block.
plain :: RSTParser Blocks
plain = do
  ils <- many1 inline
  return $ B.plain $ trimInlines $ mconcat ils

--
-- header blocks
--

-- Parse a header, trying the over- and underlined form first.
header :: RSTParser Blocks
header = choice [doubleHeader, singleHeader] <?> "header"

-- a header with lines on top and bottom; fails if the title is longer
-- than the top border
doubleHeader :: RSTParser Blocks
doubleHeader = try $ do
  c <- oneOf underlineChars
  lenTop <- (+ 1) . length <$> many (char c)   -- length of the top line
  skipSpaces
  _ <- newline
  txt <- trimInlines . mconcat <$> many1 (notFollowedBy blankline >> inline)
  titleEnd <- getPosition
  when (sourceColumn titleEnd - 1 > lenTop) $ fail "title longer than border"
  _ <- blankline              -- spaces and newline
  _ <- count lenTop (char c)  -- the bottom line
  _ <- blanklines
  -- check to see if we've had this kind of header before.
  -- if so, reuse its level.  if not, append it to the table.
  st <- getState
  let known = stateHeaderTable st
      style = DoubleHeader c
      (known', level) =
        maybe (known ++ [style], length known + 1)
              (\ind -> (known, ind + 1))
              (findIndex (== style) known)
  setState st{ stateHeaderTable = known' }
  return $ B.header level txt

-- a header with line on the bottom only; the underline must be at least
-- as long as the title
singleHeader :: RSTParser Blocks
singleHeader = try $ do
  notFollowedBy' whitespace
  txt <- trimInlines . mconcat <$> many1 (notFollowedBy blankline >> inline)
  titleEnd <- getPosition
  let len = sourceColumn titleEnd - 1
  _ <- blankline
  c <- oneOf underlineChars
  _ <- count (len - 1) (char c)
  _ <- many (char c)
  _ <- blanklines
  -- reuse the level of a previously seen underline style, or append
  -- this style to the table
  st <- getState
  let known = stateHeaderTable st
      style = SingleHeader c
      (known', level) =
        maybe (known ++ [style], length known + 1)
              (\ind -> (known, ind + 1))
              (findIndex (== style) known)
  setState st{ stateHeaderTable = known' }
  return $ B.header level txt

--
-- hrule block
--

-- Parse a horizontal rule: four or more copies of one underline
-- character on a line, followed by blank lines.
hrule :: Parser [Char] st Blocks
hrule = try $ do
  c <- oneOf underlineChars
  _ <- count 3 (char c)
  skipMany (char c)
  _ <- blankline
  _ <- blanklines
  return B.horizontalRule

--
-- code blocks
--

-- read a line that starts with the given indentation string and return
-- the rest of the line without its newline
indentedLine :: String -> Parser [Char] st [Char]
indentedLine indents = try (string indents >> manyTill anyChar newline)

-- one or more indented lines, possibly separated by blank lines.
-- any amount of indentation will work; the indentation of the first
-- line is required of (and stripped from) every line.
indentedBlock :: Parser [Char] st [Char]
indentedBlock = try $ do
  indents <- lookAhead $ many1 spaceChar
  let indented = try $ do
        leadingBlanks <- option "" blanklines
        ln <- indentedLine indents
        return (leadingBlanks ++ ln)
  lns <- many1 indented
  optional blanklines
  return $ unlines lns

-- Parse the "::" marker that introduces a literal block, followed by
-- two blank lines.
codeBlockStart :: Parser [Char] st Char
codeBlockStart = string "::" *> blankline *> blankline

-- Parse a code block: the "::" marker followed by an indented body.
codeBlock :: Parser [Char] st Blocks
codeBlock = try (codeBlockStart *> codeBlockBody)

-- Parse an indented block as a code block, stripping trailing newlines.
codeBlockBody :: Parser [Char] st Blocks
codeBlockBody = try $ do
  raw <- indentedBlock
  return $ B.codeBlock (stripTrailingNewlines raw)

-- Parse a bird-track literate Haskell block.  Only applies at the start
-- of a line and when the literate_haskell extension is enabled.
lhsCodeBlock :: RSTParser Blocks
lhsCodeBlock = try $ do
  pos <- getPosition
  guard $ sourceColumn pos == 1
  guardEnabled Ext_literate_haskell
  optional codeBlockStart
  lns <- many1 birdTrackLine
  -- if (as is normal) there is always a space after >, drop it
  let spaceOrEmpty ln = null ln || take 1 ln == " "
      lns' | all spaceOrEmpty lns = map (drop 1) lns
           | otherwise            = lns
  _ <- blanklines
  return $ B.codeBlockWith ("", ["sourceCode", "literate", "haskell"], [])
         $ intercalate "\n" lns'

-- Parse a line starting with '>' and return the text after the '>'.
birdTrackLine :: Parser [Char] st [Char]
birdTrackLine = char '>' *> manyTill anyChar newline

--
-- raw html/latex/etc
--

-- Parse a ".. raw:: FORMAT" directive followed by an indented body and
-- return the body as a raw block in that format.
rawBlock :: Parser [Char] st Blocks
rawBlock = try $ do
  _ <- string ".. raw:: "
  lang <- many1 (letter <|> digit)
  _ <- blanklines
  B.rawBlock lang <$> indentedBlock

--
-- block quotes
--

-- Parse an indented block as a block quote.
blockQuote :: RSTParser Blocks
blockQuote = do
  raw <- indentedBlock
  -- the extracted text may itself contain various block elements
  B.blockQuote <$> parseFromString parseBlocks (raw ++ "\n\n")

--
-- list blocks
--

-- Parse a bullet, ordered or definition list (tried in that order).
list :: RSTParser Blocks
list = (bulletList <|> orderedList <|> definitionList) <?> "list"

-- Parse a definition list item: a term on one line followed by an
-- indented block holding its definition.
definitionListItem :: RSTParser (Inlines, [Blocks])
definitionListItem = try $ do
  -- avoid capturing a directive or comment
  notFollowedBy (try $ char '.' >> char '.')
  termInlines <- many1Till inline endline
  raw <- indentedBlock
  -- the extracted block may contain various block elements
  definition <- parseFromString parseBlocks (raw ++ "\n")
  return (trimInlines (mconcat termInlines), [definition])

-- Parse one or more definition list items as a definition list.
definitionList :: RSTParser Blocks
definitionList = fmap B.definitionList (many1 definitionListItem)

-- parses bullet list start and returns its length (including the
-- whitespace that follows the marker)
bulletListStart :: Parser [Char] st Int
bulletListStart = try $ do
  notFollowedBy' hrule  -- because hrules start out just like lists
  _ <- oneOf bulletListMarkers
  white <- many1 spaceChar
  return $ 1 + length white

-- parses ordered list start in the given style and returns its length
-- (including the whitespace that follows the marker)
orderedListStart :: ListNumberStyle
                 -> ListNumberDelim
                 -> RSTParser Int
orderedListStart style delim = try $ do
  (_, markerLen) <- withHorizDisplacement (orderedListMarker style delim)
  padding <- length <$> many1 spaceChar
  return (markerLen + padding)

-- parse a non-blank line of a list item, indented by the marker length;
-- the indentation is dropped and the newline kept
listLine :: Int -> RSTParser [Char]
listLine markerLength = try $ do
  notFollowedBy blankline
  _ <- indentWith markerLength
  (++ "\n") <$> manyTill anyChar newline

-- indent by specified number of spaces (or equiv. tabs): when the
-- amount is at least one tab stop, a tab followed by the remaining
-- spaces is accepted as well
indentWith :: Int -> RSTParser [Char]
indentWith num = do
  tabStop <- getOption readerTabStop
  let viaSpaces = count num (char ' ')
      viaTab    = char '\t' >> count (num - tabStop) (char ' ')
  if num < tabStop
     then viaSpaces
     else choice [ try viaSpaces, try viaTab ]

-- parse raw text for one list item, excluding start marker and
-- continuations; returns the marker length along with the text
rawListItem :: RSTParser Int
            -> RSTParser (Int, [Char])
rawListItem start = try $ do
  markerLength <- start
  firstLine <- manyTill anyChar newline
  restLines <- many (listLine markerLength)
  return (markerLength, concat ((firstLine ++ "\n") : restLines))

-- continuation of a list item - indented and separated by blankline or
-- (in compact lists) endline.
-- Note: nested lists are parsed as continuations.
listContinuation :: Int -> RSTParser [Char]
listContinuation markerLength = try $ do
  blanks <- many1 blankline
  body <- concat <$> many1 (listLine markerLength)
  return (blanks ++ body)

-- Parse one list item whose marker is recognised by the given start
-- parser, and parse its raw text as blocks.
listItem :: RSTParser Int
         -> RSTParser Blocks
listItem start = try $ do
  (markerLength, first) <- rawListItem start
  rest <- many (listContinuation markerLength)
  blanks <- choice [ try (many blankline >>~ lookAhead start),
                     many1 blankline ]  -- whole list must end with blank.
  -- parsing with ListItemState forces markers at beginning of lines to
  -- count as list item markers, even if not separated by blank space.
  -- see definition of "endline"
  oldContext <- stateParserContext <$> getState
  updateState $ \st -> st {stateParserContext = ListItemState}
  -- parse the extracted block, which may itself contain block elements
  parsed <- parseFromString parseBlocks $ concat (first:rest) ++ blanks
  -- restore the context that was active before this item
  updateState $ \st -> st {stateParserContext = oldContext}
  return parsed

-- Parse an ordered list; the first marker determines the start number,
-- style and delimiter required of every item.
orderedList :: RSTParser Blocks
orderedList = try $ do
  attribs@(_, style, delim) <- lookAhead (anyOrderedListMarker >>~ spaceChar)
  items <- many1 (listItem (orderedListStart style delim))
  return $ B.orderedListWith attribs (compactify' items)

-- Parse one or more bullet list items as a bullet list.
bulletList :: RSTParser Blocks
bulletList = do
  items <- many1 (listItem bulletListStart)
  return $ B.bulletList (compactify' items)

--
-- directive (e.g. comment, container, compound-paragraph)
--

-- Parse a comment: ".." at the start of a line that is not followed by
-- a directive label.  The comment text is discarded.
comment :: RSTParser Blocks
comment = try $ do
  pos <- getPosition
  guard $ sourceColumn pos == 1
  _ <- string ".."
  skipMany1 spaceChar <|> (() <$ lookAhead newline)
  notFollowedBy' directiveLabel
  _ <- manyTill anyChar blanklines
  optional indentedBlock
  return mempty

-- Parse a directive name (letters and hyphens) terminated by "::" and
-- return it in lower case.
directiveLabel :: RSTParser String
directiveLabel =
  fmap (map toLower) $ many1Till (letter <|> char '-') (try $ string "::")

-- Parse a directive: ".." at the start of a line followed by the
-- directive proper.
directive :: RSTParser Blocks
directive = try $ do
  pos <- getPosition
  guard $ sourceColumn pos == 1
  _ <- string ".."
  directive'

-- Parse the part of a directive that follows the leading "..": the
-- label, the rest of the first line together with its continuation
-- lines ("top"), the option fields, and the indented body.  The result
-- depends on the label; unknown directives produce nothing.
--
-- TODO: line-block, parsed-literal, table, csv-table, list-table
-- replace, unicode
-- date
-- include
-- raw (consolidate)
-- class
-- title
directive' :: RSTParser Blocks
directive' = do
  skipMany1 spaceChar
  label <- directiveLabel
  skipMany spaceChar
  -- a newline belongs to "top" only when the next line is indented by
  -- three spaces, is not blank and is not an option field
  top <- many $ satisfy (/='\n')
             <|> try (char '\n' <*
                      notFollowedBy' (rawFieldListItem "   ") <*
                      count 3 (char ' ') <*
                      notFollowedBy blankline)
  _ <- newline
  fields <- many $ rawFieldListItem "   "
  body <- option "" $ try $ blanklines >> indentedBlock
  optional blanklines
  let body'       = body ++ "\n\n"
      -- parse a string as inlines / as blocks
      inlinesFrom = parseFromString (trimInlines . mconcat <$> many inline)
      blocksFrom  = parseFromString parseBlocks
      -- a paragraph holding the string's inlines in strong emphasis
      strongPara  = fmap (B.para . B.strong) . inlinesFrom
      quoted      = fmap B.blockQuote . blocksFrom
      codeDirective = codeblock (lookup "number-lines" fields) (trim top) body
      admonitions = ["attention","caution","danger","error","hint",
                     "important","note","tip","warning"]
  case label of
        "container"  -> blocksFrom body'
        "replace"    -> B.para <$> inlinesFrom (trim top)
                        -- consumed by substKey
        "unicode"    -> B.para <$> inlinesFrom (trim $ unicodeTransform top)
                        -- consumed by substKey
        "compound"   -> blocksFrom body'
        "pull-quote" -> quoted body'
        "epigraph"   -> quoted body'
        "highlights" -> quoted body'
        "rubric"     -> strongPara top
        _ | label `elem` admonitions -> do
              bod <- blocksFrom $ top ++ "\n\n" ++ body'
              return $ B.blockQuote $ B.para (B.strong $ B.str label) <> bod
        "admonition" -> do
              tit <- strongPara top
              bod <- blocksFrom body'
              return $ B.blockQuote $ tit <> bod
        "sidebar" -> do
              let subtit = maybe "" trim $ lookup "subtitle" fields
                  heading | null subtit = trim top
                          | otherwise   = trim top ++ ":  " ++ subtit
              tit <- strongPara heading
              bod <- blocksFrom body'
              return $ B.blockQuote $ tit <> bod
        "topic" -> do
              tit <- strongPara top
              bod <- blocksFrom body'
              return $ tit <> bod
        "default-role" -> do
              -- an empty argument restores the default role
              let newRole = case trim top of
                                 ""   -> stateRstDefaultRole def
                                 role -> role
              updateState $ \s -> s { stateRstDefaultRole = newRole }
              return mempty
        "code"       -> codeDirective
        "code-block" -> codeDirective
        "math" -> return $ B.para $ mconcat $ map B.displayMath
                         $ toChunks $ top ++ "\n\n" ++ body
        "figure" -> do
              (caption, legend) <- parseFromString extractCaption body'
              let src = escapeURI $ trim top
              return $ B.para (B.image src "" caption) <> legend
        "image" -> do
              let src = escapeURI $ trim top
                  alt = B.str $ maybe "image" trim $ lookup "alt" fields
                  img = B.image src "" alt
              return $ B.para
                     $ case lookup "target" fields of
                            Just t  -> B.link (escapeURI $ trim t) "" img
                            Nothing -> img
        _ -> return mempty

-- Transform the argument of a unicode directive.
-- It can contain character codes as decimal numbers or
-- hexadecimal numbers, prefixed by 0x, x, \x, U+, u, or \u
-- or as XML-style hexadecimal character entities, e.g. &#x1a2b;
-- or text, which is used as-is.  Comments start with ..
-- A prefix that is not followed by a readable character code is kept
-- as ordinary text.
unicodeTransform :: String -> String
unicodeTransform t =
  case t of
       ('.':'.':xs)  -> unicodeTransform $ dropWhile (/='\n') xs -- comment
       ('0':'x':xs)  -> go "0x" xs
       ('x':xs)      -> go "x" xs
       ('\\':'x':xs) -> go "\\x" xs
       ('U':'+':xs)  -> go "U+" xs
       ('u':xs)      -> go "u" xs
       ('\\':'u':xs) -> go "\\u" xs
       ('&':'#':'x':xs) -> maybe ("&#x" ++ unicodeTransform xs)
                           (\(c,s) -> c : unicodeTransform (dropSemicolon s))
                           $ extractUnicodeChar xs
       (x:xs)        -> x : unicodeTransform xs
       []            -> []
    where -- replace a prefixed code by its character, or keep the prefix
          go pref zs = maybe (pref ++ unicodeTransform zs)
                         (\(c,s) -> c : unicodeTransform s)
                         $ extractUnicodeChar zs
          -- drop the semicolon that closes an entity; any other
          -- character following the digits is ordinary text and is kept
          dropSemicolon (';':s) = s
          dropSemicolon s       = s

-- Read the leading hexadecimal digits of a string as a character code.
-- Returns the character and the rest of the string, or Nothing when
-- the digits cannot be read as a character.
extractUnicodeChar :: String -> Maybe (Char, String)
extractUnicodeChar s = fmap (\c -> (c, rest)) mbc
  where (ds, rest) = span isHexDigit s
        -- read the digits as a Haskell character literal
        mbc = safeRead ("'\\x" ++ ds ++ "'")

-- True for the ASCII hexadecimal digits 0-9, A-F and a-f.
isHexDigit :: Char -> Bool
isHexDigit c = inRange '0' '9' || inRange 'A' 'F' || inRange 'a' 'f'
  where inRange lo hi = lo <= c && c <= hi

-- Split a figure body into its caption (the leading inlines) and its
-- legend (the blocks that follow).
extractCaption :: RSTParser (Inlines, Blocks)
extractCaption = do
  captionInlines <- many inline
  optional blanklines
  legendBlocks <- many block
  return (trimInlines (mconcat captionInlines), mconcat legendBlocks)

-- divide string by blanklines (lines holding only spaces and tabs);
-- each chunk is trimmed and leading empty chunks are dropped
toChunks :: String -> [String]
toChunks s = dropWhile null chunks
  where isBlank = all (`elem` " \t")
        chunks  = map (trim . unlines) $ splitBy isBlank (lines s)

-- Build a code block for a code directive.  The classes are
-- "sourceCode" and the language, plus "numberLines" when the
-- number-lines option was given; a non-empty option value becomes the
-- startFrom attribute.
codeblock :: Maybe String -> String -> String -> RSTParser Blocks
codeblock numberLines lang body =
  return $ B.codeBlockWith ("", classes, kvs) (stripTrailingNewlines body)
    where classes = ["sourceCode", lang]
                    ++ [ "numberLines" | Just _ <- [numberLines] ]
          kvs     = case numberLines of
                          Just n | not (null n) -> [("startFrom",n)]
                          _                     -> []

---
--- note block
---

-- Parse a note definition, store its marker and raw text in the parser
-- state, and return one newline for each source line consumed.
noteBlock :: RSTParser [Char]
noteBlock = try $ do
  startPos <- getPosition
  _ <- string ".."
  skipMany1 spaceChar
  ref <- noteMarker
  first <- (skipMany1 spaceChar >> anyLine)
        <|> (newline >> return "")
  blanks <- option "" blanklines
  rest <- option "" indentedBlock
  endPos <- getPosition
  let raw = first ++ "\n" ++ blanks ++ rest ++ "\n"
  -- notes are consed on here; parseRST reverses the list afterwards
  updateState $ \s -> s { stateNotes = (ref, raw) : stateNotes s }
  -- return blanks so line count isn't affected
  return $ replicate (sourceLine endPos - sourceLine startPos) '\n'

-- Parse a note marker in square brackets: digits, '#' followed by a
-- simple reference name, or a single '#' or '*'.  The brackets are not
-- part of the result.
noteMarker :: RSTParser [Char]
noteMarker = do
  _ <- char '['
  res <- choice [ many1 digit
                , try (('#':) <$> (char '#' >> simpleReferenceName'))
                , count 1 (oneOf "#*")
                ]
  _ <- char ']'
  return res

--
-- reference key
--

-- Parse a reference name enclosed in single backticks.
quotedReferenceName :: RSTParser Inlines
quotedReferenceName = try $ do
  _ <- char '`'
  notFollowedBy (char '`') -- `` means inline code!
  fmap (trimInlines . mconcat) $ many1Till inline (char '`')

-- Parse a reference name made of the inlines up to (not including) the
-- next colon.
unquotedReferenceName :: RSTParser Inlines
unquotedReferenceName = try $
  fmap (trimInlines . mconcat) $ many1Till inline (lookAhead (char ':'))

-- Simple reference names are single words consisting of alphanumerics
-- plus isolated (no two adjacent) internal hyphens, underscores,
-- periods, colons and plus signs; no whitespace or other characters
-- are allowed.
-- Returns the name exactly as written, punctuation included.
simpleReferenceName' :: Parser [Char] st String
simpleReferenceName' = do
  x <- alphaNum
  -- a punctuation character counts only when an alphanumeric follows.
  -- <* keeps the punctuation character itself; sequencing with >> would
  -- return the looked-ahead character instead, which is then parsed
  -- again, turning "a-b" into "abb".
  xs <- many $  alphaNum
            <|> (try $ oneOf "-_:+." <* lookAhead alphaNum)
  return (x:xs)

-- Parse a simple reference name and return it as a string inline.
simpleReferenceName :: Parser [Char] st Inlines
simpleReferenceName = B.str <$> simpleReferenceName'

-- Parse the name of a reference key: backtick-quoted, a simple name
-- directly followed by a colon, or anything up to the next colon.
referenceName :: RSTParser Inlines
referenceName = choice
  [ quotedReferenceName
  , try (simpleReferenceName >>~ lookAhead (char ':'))
  , unquotedReferenceName
  ]

-- Parse a substitution, anonymous or regular key (each records itself
-- in the parser state) and return one newline per source line consumed.
referenceKey :: RSTParser [Char]
referenceKey = do
  startLine <- sourceLine <$> getPosition
  choice [substKey, anonymousKey, regularKey]
  optional blanklines
  endLine <- sourceLine <$> getPosition
  -- return enough blanks to replace key
  return $ replicate (endLine - startLine) '\n'

-- Parse the target of a reference key, which may start on the next
-- line and continue over indented lines, and return it trimmed and
-- URI-escaped.
targetURI :: Parser [Char] st [Char]
targetURI = do
  skipSpaces
  optional newline
  -- a line break followed by indentation joins two parts of the target
  let continued = try (many spaceChar >> newline >>
                       many1 spaceChar >> noneOf " \t\n")
  contents <- many1 (continued <|> noneOf "\n")
  _ <- blanklines
  return $ escapeURI (trim contents)

-- Parse a substitution definition (".. |name| directive:: ...") at the
-- start of a line and record the resulting inlines under the name.
-- Fails unless the directive yields a single paragraph.
substKey :: RSTParser ()
substKey = try $ do
  pos <- getPosition
  guard $ sourceColumn pos == 1
  _ <- string ".."
  skipMany1 spaceChar
  (alt,ref) <- withRaw $ trimInlines . mconcat
                      <$> enclosed (char '|') (char '|') inline
  res <- B.toList <$> directive'
  il <- case res of
             -- use alt unless :alt: attribute on image:
             [Para [Image [Str "image"] (src,tit)]] ->
                return $ B.image src tit alt
             [Para [Link [Image [Str "image"] (src,tit)] (src',tit')]] ->
                return $ B.link src' tit' (B.image src tit alt)
             [Para ils] -> return $ B.fromList ils
             _          -> mzero
  -- the raw name still carries its first and last character
  let key = toKey $ stripFirstAndLast ref
  updateState $ \s ->
    s{ stateSubstitutions = M.insert key il (stateSubstitutions s) }

-- Parse an anonymous hyperlink target and record it under a key built
-- from an underscore and the zero-padded line number reached after the
-- target.
anonymousKey :: RSTParser ()
anonymousKey = try $ do
  _ <- oneOfStrings [".. __:", "__"]
  src <- targetURI
  line <- sourceLine <$> getPosition
  let key = toKey $ "_" ++ printf "%09d" line
  updateState $ \s -> s { stateKeys = M.insert key (src,"") (stateKeys s) }

-- Remove one leading and one trailing backtick, where present.
stripTicks :: String -> String
stripTicks = dropTrailingTick . dropLeadingTick
  where dropLeadingTick ('`':rest) = rest
        dropLeadingTick s          = s
        dropTrailingTick = reverse . dropLeadingTick . reverse

-- Parse a named hyperlink target (".. _name: target") and record the
-- target under the name, with surrounding backticks removed.
regularKey :: RSTParser ()
regularKey = try $ do
  _ <- string ".. _"
  (_,ref) <- withRaw referenceName
  _ <- char ':'
  src <- targetURI
  updateState $ \s ->
    s { stateKeys = M.insert (toKey $ stripTicks ref) (src,"") (stateKeys s) }

--
-- tables
--

-- General tables TODO:
--  - figure out if leading spaces are acceptable and if so, add
--    support for them
--
-- Simple tables TODO:
--  - column spans
--  - multiline support
--  - ensure that rightmost column span does not need to reach end
--  - require at least 2 columns
--
-- Grid tables TODO:
--  - column spans

-- Parse a run of the given character followed by optional spaces.
-- Returns the length of the run and the length including the spaces.
dashedLine :: Char -> Parser [Char] st (Int, Int)
dashedLine ch = do
  dashCount  <- length <$> many1 (char ch)
  spaceCount <- length <$> many (char ' ')
  return (dashCount, dashCount + spaceCount)

-- Parse one or more dashed lines made of the given character.
simpleDashedLines :: Char -> Parser [Char] st [(Int,Int)]
simpleDashedLines = try . many1 . dashedLine

-- Parse a table row separator: dashed lines of the given character
-- followed by a newline.
simpleTableSep :: Char -> RSTParser Char
simpleTableSep ch = try (simpleDashedLines ch *> newline)

-- Parse a table footer: a separator line of '=' followed by blank lines.
simpleTableFooter :: RSTParser [Char]
simpleTableFooter = try (simpleTableSep '=' *> blanklines)

-- Parse a raw (non-empty) line and split it into chunks by indices.
simpleTableRawLine :: [Int] -> RSTParser [String]
simpleTableRawLine indices =
  simpleTableSplitLine indices <$> many1Till anyChar newline

-- Parse a table row and return a list of blocks (columns).  Fails at
-- the table footer.
simpleTableRow :: [Int] -> RSTParser [[Block]]
simpleTableRow indices = do
  notFollowedBy' simpleTableFooter
  firstLine <- simpleTableRawLine indices
  let colLines   = [] -- TODO
      cols       = map unlines $ transpose (firstLine : colLines)
      cellParser = B.toList . mconcat <$> many plain
  mapM (parseFromString cellParser) cols

-- Split a line at the given indices (all but the last), drop the first
-- piece and trim the remaining ones.
simpleTableSplitLine :: [Int] -> String -> [String]
simpleTableSplitLine indices line = map trim (tail pieces)
  where pieces = splitByIndices (init indices) line

-- Parse the header of a simple table.  Returns the header cells (empty
-- for a headerless table), default alignments, and the column indices
-- computed from the widths of the dashed lines.
simpleTableHeader :: Bool  -- ^ Headerless table
                  -> RSTParser ([[Block]], [Alignment], [Int])
simpleTableHeader headless = try $ do
  optional blanklines
  rawContent <- if headless
                   then return ""
                   else simpleTableSep '=' >> anyLine
  dashes <- simpleDashedLines '=' <|> simpleDashedLines '-'
  _ <- newline
  let widths  = map snd dashes
      indices = scanl (+) 0 widths
      aligns  = map (const AlignDefault) dashes
      rawHeads | headless  = map (const "") dashes
               | otherwise = simpleTableSplitLine indices rawContent
      cell    = B.toList . mconcat <$> many plain
  heads <- mapM (parseFromString cell . trim) rawHeads
  return (heads, aligns, indices)

-- Parse a simple table.
simpleTable :: Bool  -- ^ Headerless table
            -> RSTParser Blocks
simpleTable headless = do
  Table capt aligns _widths heads rows <-
    tableWith (simpleTableHeader headless) simpleTableRow sep simpleTableFooter
  -- Simple tables get 0s for relative column widths (i.e., use default)
  let widths = map (const 0) aligns
  return $ B.singleton $ Table capt aligns widths heads rows
 where
  sep = return () -- optional (simpleTableSep '-')

-- Parse a grid table whose cells are parsed as blocks.
gridTable :: Bool -- ^ Headerless table
          -> RSTParser Blocks
gridTable headerless =
  fmap B.singleton $ gridTableWith (B.toList <$> parseBlocks) headerless

-- Parse a table, trying the forms with a header before the headerless
-- ones.
table :: RSTParser Blocks
table = choice [ gridTable False
               , simpleTable False
               , gridTable True
               , simpleTable True
               ] <?> "table"

--
-- inline
--

-- Parse a single inline; the alternatives are tried in the order listed.
inline :: RSTParser Inlines
inline = choice inlineParsers <?> "inline"
  where inlineParsers =
          [ whitespace
          , link
          , str
          , endline
          , strong
          , emph
          , code
          , image
          , superscript
          , subscript
          , math
          , note
          , smart
          , hyphens
          , escapedChar
          , symbol
          ]

-- Parse a run of hyphens as a string.
hyphens :: RSTParser Inlines
hyphens =
  -- an endline right after the hyphens is consumed and dropped: we
  -- don't want to treat endline after hyphen or dash as a space
  B.str <$> (many1 (char '-') <* optional endline)

-- Parse an escaped character.
escapedChar :: Parser [Char] st Inlines
escapedChar = toInline <$> escaped anyChar
  where toInline ' ' = mempty   -- '\ ' is null in RST
        toInline c   = B.str [c]

-- Parse one of the special characters as a single-character string.
symbol :: RSTParser Inlines
symbol = fmap (\c -> B.str [c]) (oneOf specialChars)

-- parses inline code, between a pair of double backticks; line breaks
-- inside are replaced by spaces and the result is trimmed
code :: RSTParser Inlines
code = try $ do
  _ <- string "``"
  raw <- manyTill anyChar (try (string "``"))
  return . B.code . trim . unwords . lines $ raw

-- succeeds only if we're not right after a str (ie. in middle of word)
atStart :: RSTParser a -> RSTParser a
atStart p = do
  pos <- getPosition
  lastStrPos <- stateLastStrPos <$> getState
  -- single quote start can't be right after str
  guard $ lastStrPos /= Just pos
  p

-- Parse emphasis: inlines between single asterisks.
emph :: RSTParser Inlines
emph = fmap (B.emph . trimInlines . mconcat) $
         enclosed (atStart $ char '*') (char '*') inline

-- Parse strong emphasis: inlines between double asterisks.
strong :: RSTParser Inlines
strong = fmap (B.strong . trimInlines . mconcat) $
           enclosed (atStart $ string "**") (try $ string "**") inline

-- Parses inline interpreted text which is required to have the given role.
-- This decision is based on the role marker (if present),
-- and the current default interpreted text role: text without a marker
-- is accepted only when the given role is the default one.
interpreted :: [Char] -> RSTParser [Char]
interpreted role = try $ do
  defaultRole <- stateRstDefaultRole <$> getState
  if role == defaultRole
     then try marked <|> unmarked
     else marked
 where
  marker   = string (':' : role ++ ":")
  -- the marker may come before or after the backquoted text
  marked   = try (marker *> unmarked) <|> (unmarked <* marker)
  -- Note, this doesn't precisely implement the complex rule in
  -- http://docutils.sourceforge.net/docs/ref/rst/restructuredtext.html#inline-markup-recognition-rules
  -- but it should be good enough for most purposes
  unmarked = enclosed (atStart $ char '`') (char '`') anyChar

-- | Parse interpreted text with role @sup@ and wrap its raw contents
-- as a superscripted string.
superscript :: RSTParser Inlines
superscript = fmap (B.superscript . B.str) (interpreted "sup")

-- | Parse interpreted text with role @sub@ and wrap its raw contents
-- as a subscripted string.
subscript :: RSTParser Inlines
subscript = fmap (B.subscript . B.str) (interpreted "sub")

-- | Parse interpreted text with role @math@ and pass its raw contents
-- to 'B.math'.
math :: RSTParser Inlines
math = fmap B.math (interpreted "math")

-- | Collapse one or more 'spaceChar's into a single space inline.
whitespace :: RSTParser Inlines
whitespace = (skipMany1 spaceChar >> return B.space) <?> "whitespace"

-- | Parse a run of ordinary characters (anything except tab, newline,
-- space and the 'specialChars') as a string, then record the position
-- via 'updateLastStrPos' so that 'atStart' can detect mid-word positions.
str :: RSTParser Inlines
str = B.str <$> many1 wordChar <* updateLastStrPos
 where
  wordChar = noneOf ("\t\n " ++ specialChars)

-- | Parse a newline that acts as a soft break (rendered as a space)
-- rather than a structural break: it must not be followed by a blank
-- line, and inside a list item it must not be followed by something
-- that looks like the start of a new list item.
endline :: RSTParser Inlines
endline = try $ do
  _ <- newline
  notFollowedBy blankline
  context <- stateParserContext <$> getState
  -- parse potential list-starts at beginning of line differently in a list:
  when (context == ListItemState) $ do
    notFollowedBy (anyOrderedListMarker >> spaceChar)
    notFollowedBy' bulletListStart
  return B.space

--
-- links
--

-- | Parse any kind of link, trying explicit links first, then reference
-- links, then bare URIs and e-mail addresses.
link :: RSTParser Inlines
link = choice alternatives <?> "link"
 where
  alternatives = [explicitLink, referenceLink, autoLink]

-- | Parse a link with an embedded target, of the form
-- @`label <target>`_@.  The label is parsed as inlines up to the opening
-- angle bracket; the target is trimmed and URI-escaped.
explicitLink :: RSTParser Inlines
explicitLink = try $ do
  _ <- char '`'
  notFollowedBy (char '`') -- `` marks start of inline code
  labelParts <- manyTill (notFollowedBy (char '`') >> inline) (char '<')
  target <- manyTill (noneOf ">\n") (char '>')
  skipSpaces
  _ <- string "`_"
  let linkText = trimInlines (mconcat labelParts)
      url      = escapeURI (trim target)
  return $ B.link url "" linkText

-- | Parse a reference link: a quoted or simple reference name followed by
-- @_@ (named reference) or @__@ (anonymous reference), and resolve it
-- against the key table stored in the parser state.
--
-- For a named reference the key is derived from the raw reference text
-- (with ticks stripped).  For an anonymous reference the smallest key
-- beginning with @_@ is used; that key is removed from the state once the
-- link has been resolved so it is not reused.  The parser fails if the
-- key has no entry in the table, or if an anonymous reference is found
-- while no anonymous keys remain.
referenceLink :: RSTParser Inlines
referenceLink = try $ do
  (label',ref) <- withRaw (quotedReferenceName <|> simpleReferenceName) >>~
                   char '_'
  state <- getState
  let keyTable = stateKeys state
  -- anonymous keys are those whose name begins with an underscore
  let isAnonKey (Key ('_':_)) = True
      isAnonKey _             = False
  key <- option (toKey $ stripTicks ref) $ do
           _ <- char '_'
           -- M.keys already returns the keys in ascending order, so the
           -- first anonymous key found is the smallest one; matching on
           -- the list avoids a partial 'head'.
           case filter isAnonKey (M.keys keyTable) of
                []          -> mzero
                (anonKey:_) -> return anonKey
  (src,tit) <- case M.lookup key keyTable of
                    Nothing     -> fail "no corresponding key"
                    Just target -> return target
  -- if anonymous link, remove key so it won't be used again; delete from
  -- the table currently in the state rather than from a captured copy
  when (isAnonKey key) $
    updateState $ \s -> s{ stateKeys = M.delete key (stateKeys s) }
  return $ B.link src tit label'

-- | Parse a bare URI and turn it into a link whose text is the original
-- source string and whose target is the second component returned by
-- 'uri'.
autoURI :: RSTParser Inlines
autoURI = toLink <$> uri
 where
  toLink (shown, target) = B.link target "" (B.str shown)

-- | Parse a bare e-mail address and turn it into a link whose text is the
-- original source string and whose target is the second component
-- returned by 'emailAddress'.
autoEmail :: RSTParser Inlines
autoEmail = toLink <$> emailAddress
 where
  toLink (shown, target) = B.link target "" (B.str shown)

-- | Parse an automatically recognized link: a bare URI ('autoURI') or,
-- failing that, a bare e-mail address ('autoEmail').
autoLink :: RSTParser Inlines
autoLink = autoURI <|> autoEmail

-- | Parse a substitution reference of the form @|name|@ and replace it
-- with the inlines registered for it in the substitution table of the
-- parser state.  Fails if no such substitution exists.
-- For now, we assume that all substitution references are for images.
image :: RSTParser Inlines
image = try $ do
  (_, rawRef) <- withRaw (enclosed (char '|') (char '|') inline)
  substitutions <- stateSubstitutions <$> getState
  -- the raw text still includes the surrounding bars; strip them
  let key = toKey (stripFirstAndLast rawRef)
  maybe (fail "no corresponding key") return (M.lookup key substitutions)

-- | Parse a footnote reference (a note marker followed by @_@), look up
-- the raw text of the note in the parser state and parse it as blocks.
-- Fails if the referenced note is not found.
note :: RSTParser Inlines
note = try $ do
  ref <- noteMarker
  _ <- char '_'
  knownNotes <- stateNotes <$> getState
  raw <- maybe (fail "note not found") return (lookup ref knownNotes)
  -- We temporarily empty the note list while parsing the note,
  -- so that we don't get infinite loops with notes inside notes...
  -- Note references inside other notes are allowed in reST, but
  -- not yet in this implementation.
  updateState $ \st -> st{ stateNotes = [] }
  contents <- parseFromString parseBlocks raw
  let autoNumbered = ref `elem` ["*", "#"]
      remainingNotes
        -- delete the note so the next auto-numbered note
        -- doesn't get the same contents:
        | autoNumbered = deleteFirstsBy (==) knownNotes [(ref,raw)]
        | otherwise    = knownNotes
  updateState $ \st -> st{ stateNotes = remainingNotes }
  return $ B.note contents

-- | Parse "smart" punctuation: quoted passages, apostrophes, dashes and
-- ellipses.  Fails immediately unless the 'readerSmart' option is set.
smart :: RSTParser Inlines
smart = do
  enabled <- getOption readerSmart
  guard enabled
  doubleQuoted <|> singleQuoted <|> choice punctuation
 where
  punctuation = map (fmap B.singleton) [apostrophe, dash, ellipses]

-- | Parse a single-quoted passage: a single-quote start, then one or
-- more inlines up to the single-quote end, parsed within the
-- 'InSingleQuote' quote context.
singleQuoted :: RSTParser Inlines
singleQuoted = try (singleQuoteStart >> withQuoteContext InSingleQuote quoted)
 where
  quoted = do
    parts <- many1Till inline singleQuoteEnd
    return $ B.singleQuoted $ trimInlines $ mconcat parts

-- | Parse a double-quoted passage: a double-quote start, then one or
-- more inlines up to the double-quote end, parsed within the
-- 'InDoubleQuote' quote context.
doubleQuoted :: RSTParser Inlines
doubleQuoted = try (doubleQuoteStart >> withQuoteContext InDoubleQuote quoted)
 where
  quoted = do
    parts <- many1Till inline doubleQuoteEnd
    return $ B.doubleQuoted $ trimInlines $ mconcat parts