{-# LANGUAGE RelaxedPolyRec #-} -- needed for inlinesBetween on GHC < 7 {- Copyright (C) 2006-2010 John MacFarlane This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA -} {- | Module : Text.Pandoc.Readers.Markdown Copyright : Copyright (C) 2006-2010 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane Stability : alpha Portability : portable Conversion of markdown-formatted plain text to 'Pandoc' document. -} module Text.Pandoc.Readers.Markdown ( readMarkdown ) where import Data.List ( transpose, sortBy, findIndex, intercalate ) import qualified Data.Map as M import Data.Ord ( comparing ) import Data.Char ( isAlphaNum ) import Data.Maybe import Text.Pandoc.Definition import Text.Pandoc.Generic import Text.Pandoc.Options import Text.Pandoc.Shared import Text.Pandoc.Parsing import Text.Pandoc.Readers.LaTeX ( rawLaTeXInline, rawLaTeXBlock ) import Text.Pandoc.Readers.HTML ( htmlTag, htmlInBalanced, isInlineTag, isBlockTag, isTextTag, isCommentTag ) import Text.Pandoc.XML ( fromEntities ) import Control.Monad (when, liftM, guard, mzero, unless ) import Text.HTML.TagSoup import Text.HTML.TagSoup.Match (tagOpen) -- | Read markdown from an input string and return a Pandoc document. readMarkdown :: ReaderOptions -- ^ Reader options -> String -- ^ String to parse (assuming @'\n'@ line endings) -> Pandoc readMarkdown opts s = (readWith parseMarkdown) def{ stateOptions = opts } (s ++ "\n\n") -- -- Constants and data structure definitions -- isBulletListMarker :: Char -> Bool isBulletListMarker '*' = True isBulletListMarker '+' = True isBulletListMarker '-' = True isBulletListMarker _ = False isHruleChar :: Char -> Bool isHruleChar '*' = True isHruleChar '-' = True isHruleChar '_' = True isHruleChar _ = False setextHChars :: [Char] setextHChars = "=-" isBlank :: Char -> Bool isBlank ' ' = True isBlank '\t' = True isBlank '\n' = True isBlank _ = False -- -- auxiliary functions -- indentSpaces :: Parser [Char] ParserState [Char] indentSpaces = try $ do tabStop <- getOption readerTabStop count tabStop (char ' ') <|> string "\t" "indentation" nonindentSpaces :: Parser [Char] ParserState [Char] nonindentSpaces = do tabStop <- getOption readerTabStop sps <- many (char ' ') if length sps < tabStop then return sps else unexpected "indented line" skipNonindentSpaces :: Parser [Char] ParserState () skipNonindentSpaces = do tabStop <- getOption readerTabStop atMostSpaces (tabStop - 1) atMostSpaces :: Int -> Parser [Char] ParserState () atMostSpaces 0 = notFollowedBy (char ' ') atMostSpaces n = (char ' ' >> atMostSpaces (n-1)) <|> return () litChar :: Parser [Char] ParserState Char litChar = escapedChar' <|> noneOf "\n" <|> (newline >> notFollowedBy blankline >> return ' ') -- | Parse a sequence of inline elements between square brackets, -- including inlines between balanced pairs of square brackets. inlinesInBalancedBrackets :: Parser [Char] ParserState Inline -> Parser [Char] ParserState [Inline] inlinesInBalancedBrackets parser = try $ do char '[' result <- manyTill ( (do lookAhead $ try $ do (Str res) <- parser guard (res == "[") bal <- inlinesInBalancedBrackets parser return $ [Str "["] ++ bal ++ [Str "]"]) <|> (count 1 parser)) (char ']') return $ concat result -- -- document structure -- titleLine :: Parser [Char] ParserState [Inline] titleLine = try $ do char '%' skipSpaces res <- many $ (notFollowedBy newline >> inline) <|> try (endline >> whitespace) newline return $ normalizeSpaces res authorsLine :: Parser [Char] ParserState [[Inline]] authorsLine = try $ do char '%' skipSpaces authors <- sepEndBy (many (notFollowedBy (satisfy $ \c -> c == ';' || c == '\n') >> inline)) (char ';' <|> try (newline >> notFollowedBy blankline >> spaceChar)) newline return $ filter (not . null) $ map normalizeSpaces authors dateLine :: Parser [Char] ParserState [Inline] dateLine = try $ do char '%' skipSpaces date <- manyTill inline newline return $ normalizeSpaces date titleBlock :: Parser [Char] ParserState ([Inline], [[Inline]], [Inline]) titleBlock = try $ do guardEnabled Ext_pandoc_title_blocks title <- option [] titleLine author <- option [] authorsLine date <- option [] dateLine optional blanklines return (title, author, date) parseMarkdown :: Parser [Char] ParserState Pandoc parseMarkdown = do -- markdown allows raw HTML updateState $ \state -> state { stateOptions = let oldOpts = stateOptions state in oldOpts{ readerParseRaw = True } } startPos <- getPosition -- go through once just to get list of reference keys and notes -- docMinusKeys is the raw document with blanks where the keys/notes were... let firstPassParser = referenceKey <|> (guardEnabled Ext_footnotes >> noteBlock) <|> (guardEnabled Ext_delimited_code_blocks >> liftM snd (withRaw codeBlockDelimited)) <|> lineClump docMinusKeys <- liftM concat $ manyTill firstPassParser eof setInput docMinusKeys setPosition startPos st' <- getState let reversedNotes = stateNotes st' updateState $ \s -> s { stateNotes = reverse reversedNotes } -- now parse it for real... (title, author, date) <- option ([],[],[]) titleBlock blocks <- parseBlocks let doc = Pandoc (Meta title author date) $ filter (/= Null) blocks -- if there are labeled examples, change references into numbers examples <- liftM stateExamples getState let handleExampleRef :: Inline -> Inline handleExampleRef z@(Str ('@':xs)) = case M.lookup xs examples of Just n -> Str (show n) Nothing -> z handleExampleRef z = z if M.null examples then return doc else return $ bottomUp handleExampleRef doc -- -- initial pass for references and notes -- referenceKey :: Parser [Char] ParserState [Char] referenceKey = try $ do startPos <- getPosition skipNonindentSpaces lab <- reference char ':' skipSpaces >> optional newline >> skipSpaces >> notFollowedBy (char '[') let sourceURL = liftM unwords $ many $ try $ do notFollowedBy' referenceTitle skipMany spaceChar optional $ newline >> notFollowedBy blankline skipMany spaceChar notFollowedBy' reference many1 $ escapedChar' <|> satisfy (not . isBlank) let betweenAngles = try $ char '<' >> manyTill (escapedChar' <|> litChar) (char '>') src <- try betweenAngles <|> sourceURL tit <- option "" referenceTitle blanklines endPos <- getPosition let target = (escapeURI $ removeTrailingSpace src, tit) st <- getState let oldkeys = stateKeys st updateState $ \s -> s { stateKeys = M.insert (toKey lab) target oldkeys } -- return blanks so line count isn't affected return $ replicate (sourceLine endPos - sourceLine startPos) '\n' referenceTitle :: Parser [Char] ParserState String referenceTitle = try $ do skipSpaces >> optional newline >> skipSpaces tit <- (charsInBalanced '(' ')' litChar >>= return . unwords . words) <|> do delim <- char '\'' <|> char '"' manyTill litChar (try (char delim >> skipSpaces >> notFollowedBy (noneOf ")\n"))) return $ fromEntities tit noteMarker :: Parser [Char] ParserState [Char] noteMarker = string "[^" >> many1Till (satisfy $ not . isBlank) (char ']') rawLine :: Parser [Char] ParserState [Char] rawLine = try $ do notFollowedBy blankline notFollowedBy' $ try $ skipNonindentSpaces >> noteMarker optional indentSpaces anyLine rawLines :: Parser [Char] ParserState [Char] rawLines = do first <- anyLine rest <- many rawLine return $ unlines (first:rest) noteBlock :: Parser [Char] ParserState [Char] noteBlock = try $ do startPos <- getPosition skipNonindentSpaces ref <- noteMarker char ':' optional blankline optional indentSpaces raw <- sepBy rawLines (try (blankline >> indentSpaces >> notFollowedBy blankline)) optional blanklines endPos <- getPosition let newnote = (ref, (intercalate "\n" raw) ++ "\n\n") st <- getState let oldnotes = stateNotes st updateState $ \s -> s { stateNotes = newnote : oldnotes } -- return blanks so line count isn't affected return $ replicate (sourceLine endPos - sourceLine startPos) '\n' -- -- parsing blocks -- parseBlocks :: Parser [Char] ParserState [Block] parseBlocks = manyTill block eof block :: Parser [Char] ParserState Block block = choice [ codeBlockDelimited , guardEnabled Ext_latex_macros >> macro , header , table , codeBlockIndented , lhsCodeBlock , blockQuote , hrule , bulletList , orderedList , definitionList , rawTeXBlock , para , htmlBlock , plain , nullBlock ] "block" -- -- header blocks -- header :: Parser [Char] ParserState Block header = setextHeader <|> atxHeader "header" atxHeader :: Parser [Char] ParserState Block atxHeader = try $ do level <- many1 (char '#') >>= return . length notFollowedBy (char '.' <|> char ')') -- this would be a list skipSpaces text <- manyTill inline atxClosing >>= return . normalizeSpaces return $ Header level text atxClosing :: Parser [Char] st [Char] atxClosing = try $ skipMany (char '#') >> blanklines setextHeader :: Parser [Char] ParserState Block setextHeader = try $ do -- This lookahead prevents us from wasting time parsing Inlines -- unless necessary -- it gives a significant performance boost. lookAhead $ anyLine >> many1 (oneOf setextHChars) >> blankline text <- many1Till inline newline underlineChar <- oneOf setextHChars many (char underlineChar) blanklines let level = (fromMaybe 0 $ findIndex (== underlineChar) setextHChars) + 1 return $ Header level (normalizeSpaces text) -- -- hrule block -- hrule :: Parser [Char] st Block hrule = try $ do skipSpaces start <- satisfy isHruleChar count 2 (skipSpaces >> char start) skipMany (spaceChar <|> char start) newline optional blanklines return HorizontalRule -- -- code blocks -- indentedLine :: Parser [Char] ParserState [Char] indentedLine = indentSpaces >> manyTill anyChar newline >>= return . (++ "\n") blockDelimiter :: (Char -> Bool) -> Maybe Int -> Parser [Char] st (Int, (String, [String], [(String, String)]), Char) blockDelimiter f len = try $ do c <- lookAhead (satisfy f) size <- case len of Just l -> count l (char c) >> many (char c) >> return l Nothing -> count 3 (char c) >> many (char c) >>= return . (+ 3) . length many spaceChar attr <- option ([],[],[]) $ attributes -- ~~~ {.ruby} <|> (many1 alphaNum >>= \x -> return ([],[x],[])) -- github variant ```ruby blankline return (size, attr, c) attributes :: Parser [Char] st ([Char], [[Char]], [([Char], [Char])]) attributes = try $ do char '{' spnl attrs <- many (attribute >>~ spnl) char '}' let (ids, classes, keyvals) = unzip3 attrs let firstNonNull [] = "" firstNonNull (x:xs) | not (null x) = x | otherwise = firstNonNull xs return (firstNonNull $ reverse ids, concat classes, concat keyvals) attribute :: Parser [Char] st ([Char], [[Char]], [([Char], [Char])]) attribute = identifierAttr <|> classAttr <|> keyValAttr identifier :: Parser [Char] st [Char] identifier = do first <- letter rest <- many $ alphaNum <|> oneOf "-_:." return (first:rest) identifierAttr :: Parser [Char] st ([Char], [a], [a1]) identifierAttr = try $ do char '#' result <- identifier return (result,[],[]) classAttr :: Parser [Char] st ([Char], [[Char]], [a]) classAttr = try $ do char '.' result <- identifier return ("",[result],[]) keyValAttr :: Parser [Char] st ([Char], [a], [([Char], [Char])]) keyValAttr = try $ do key <- identifier char '=' val <- enclosed (char '"') (char '"') anyChar <|> enclosed (char '\'') (char '\'') anyChar <|> many nonspaceChar return ("",[],[(key,val)]) codeBlockDelimited :: Parser [Char] ParserState Block codeBlockDelimited = try $ do guardEnabled Ext_delimited_code_blocks (size, attr, c) <- blockDelimiter (\c -> c == '~' || c == '`') Nothing contents <- manyTill anyLine (blockDelimiter (== c) (Just size)) blanklines return $ CodeBlock attr $ intercalate "\n" contents codeBlockIndented :: Parser [Char] ParserState Block codeBlockIndented = do contents <- many1 (indentedLine <|> try (do b <- blanklines l <- indentedLine return $ b ++ l)) optional blanklines classes <- getOption readerIndentedCodeClasses return $ CodeBlock ("", classes, []) $ stripTrailingNewlines $ concat contents lhsCodeBlock :: Parser [Char] ParserState Block lhsCodeBlock = do failUnlessLHS liftM (CodeBlock ("",["sourceCode","literate","haskell"],[])) (lhsCodeBlockBird <|> lhsCodeBlockLaTeX) <|> liftM (CodeBlock ("",["sourceCode","haskell"],[])) lhsCodeBlockInverseBird lhsCodeBlockLaTeX :: Parser [Char] ParserState String lhsCodeBlockLaTeX = try $ do string "\\begin{code}" manyTill spaceChar newline contents <- many1Till anyChar (try $ string "\\end{code}") blanklines return $ stripTrailingNewlines contents lhsCodeBlockBird :: Parser [Char] ParserState String lhsCodeBlockBird = lhsCodeBlockBirdWith '>' lhsCodeBlockInverseBird :: Parser [Char] ParserState String lhsCodeBlockInverseBird = lhsCodeBlockBirdWith '<' lhsCodeBlockBirdWith :: Char -> Parser [Char] ParserState String lhsCodeBlockBirdWith c = try $ do pos <- getPosition when (sourceColumn pos /= 1) $ fail "Not in first column" lns <- many1 $ birdTrackLine c -- if (as is normal) there is always a space after >, drop it let lns' = if all (\ln -> null ln || take 1 ln == " ") lns then map (drop 1) lns else lns blanklines return $ intercalate "\n" lns' birdTrackLine :: Char -> Parser [Char] st [Char] birdTrackLine c = try $ do char c -- allow html tags on left margin: when (c == '<') $ notFollowedBy letter manyTill anyChar newline -- -- block quotes -- emailBlockQuoteStart :: Parser [Char] ParserState Char emailBlockQuoteStart = try $ skipNonindentSpaces >> char '>' >>~ optional (char ' ') emailBlockQuote :: Parser [Char] ParserState [[Char]] emailBlockQuote = try $ do emailBlockQuoteStart raw <- sepBy (many (nonEndline <|> (try (endline >> notFollowedBy emailBlockQuoteStart >> return '\n')))) (try (newline >> emailBlockQuoteStart)) newline <|> (eof >> return '\n') optional blanklines return raw blockQuote :: Parser [Char] ParserState Block blockQuote = do raw <- emailBlockQuote -- parse the extracted block, which may contain various block elements: contents <- parseFromString parseBlocks $ (intercalate "\n" raw) ++ "\n\n" return $ BlockQuote contents -- -- list blocks -- bulletListStart :: Parser [Char] ParserState () bulletListStart = try $ do optional newline -- if preceded by a Plain block in a list context skipNonindentSpaces notFollowedBy' hrule -- because hrules start out just like lists satisfy isBulletListMarker spaceChar skipSpaces anyOrderedListStart :: Parser [Char] ParserState (Int, ListNumberStyle, ListNumberDelim) anyOrderedListStart = try $ do optional newline -- if preceded by a Plain block in a list context skipNonindentSpaces notFollowedBy $ string "p." >> spaceChar >> digit -- page number state <- getState if readerStrict (stateOptions state) then do many1 digit char '.' spaceChar return (1, DefaultStyle, DefaultDelim) else do (num, style, delim) <- anyOrderedListMarker -- if it could be an abbreviated first name, insist on more than one space if delim == Period && (style == UpperAlpha || (style == UpperRoman && num `elem` [1, 5, 10, 50, 100, 500, 1000])) then char '\t' <|> (try $ char ' ' >> spaceChar) else spaceChar skipSpaces return (num, style, delim) listStart :: Parser [Char] ParserState () listStart = bulletListStart <|> (anyOrderedListStart >> return ()) -- parse a line of a list item (start = parser for beginning of list item) listLine :: Parser [Char] ParserState [Char] listLine = try $ do notFollowedBy blankline notFollowedBy' (do indentSpaces many (spaceChar) listStart) chunks <- manyTill (liftM snd (htmlTag isCommentTag) <|> count 1 anyChar) newline return $ concat chunks ++ "\n" -- parse raw text for one list item, excluding start marker and continuations rawListItem :: Parser [Char] ParserState a -> Parser [Char] ParserState [Char] rawListItem start = try $ do start first <- listLine rest <- many (notFollowedBy listStart >> listLine) blanks <- many blankline return $ concat (first:rest) ++ blanks -- continuation of a list item - indented and separated by blankline -- or (in compact lists) endline. -- note: nested lists are parsed as continuations listContinuation :: Parser [Char] ParserState [Char] listContinuation = try $ do lookAhead indentSpaces result <- many1 listContinuationLine blanks <- many blankline return $ concat result ++ blanks listContinuationLine :: Parser [Char] ParserState [Char] listContinuationLine = try $ do notFollowedBy blankline notFollowedBy' listStart optional indentSpaces result <- manyTill anyChar newline return $ result ++ "\n" listItem :: Parser [Char] ParserState a -> Parser [Char] ParserState [Block] listItem start = try $ do first <- rawListItem start continuations <- many listContinuation -- parsing with ListItemState forces markers at beginning of lines to -- count as list item markers, even if not separated by blank space. -- see definition of "endline" state <- getState let oldContext = stateParserContext state setState $ state {stateParserContext = ListItemState} -- parse the extracted block, which may contain various block elements: let raw = concat (first:continuations) contents <- parseFromString parseBlocks raw updateState (\st -> st {stateParserContext = oldContext}) return contents orderedList :: Parser [Char] ParserState Block orderedList = try $ do (start, style, delim) <- lookAhead anyOrderedListStart unless ((style == DefaultStyle || style == Decimal || style == Example) && (delim == DefaultDelim || delim == Period)) $ guardEnabled Ext_fancy_lists when (style == Example) $ guardEnabled Ext_example_lists items <- many1 $ listItem $ try $ do optional newline -- if preceded by a Plain block in a list context skipNonindentSpaces orderedListMarker style delim start' <- option 1 $ guardEnabled Ext_startnum >> return start return $ OrderedList (start', style, delim) $ compactify items bulletList :: Parser [Char] ParserState Block bulletList = many1 (listItem bulletListStart) >>= return . BulletList . compactify -- definition lists defListMarker :: Parser [Char] ParserState () defListMarker = do sps <- nonindentSpaces char ':' <|> char '~' tabStop <- getOption readerTabStop let remaining = tabStop - (length sps + 1) if remaining > 0 then count remaining (char ' ') <|> string "\t" else mzero return () definitionListItem :: Parser [Char] ParserState ([Inline], [[Block]]) definitionListItem = try $ do guardEnabled Ext_definition_lists -- first, see if this has any chance of being a definition list: lookAhead (anyLine >> optional blankline >> defListMarker) term <- manyTill inline newline optional blankline raw <- many1 defRawBlock state <- getState let oldContext = stateParserContext state -- parse the extracted block, which may contain various block elements: contents <- mapM (parseFromString parseBlocks) raw updateState (\st -> st {stateParserContext = oldContext}) return ((normalizeSpaces term), contents) defRawBlock :: Parser [Char] ParserState [Char] defRawBlock = try $ do defListMarker firstline <- anyLine rawlines <- many (notFollowedBy blankline >> indentSpaces >> anyLine) trailing <- option "" blanklines cont <- liftM concat $ many $ do lns <- many1 $ notFollowedBy blankline >> indentSpaces >> anyLine trl <- option "" blanklines return $ unlines lns ++ trl return $ firstline ++ "\n" ++ unlines rawlines ++ trailing ++ cont definitionList :: Parser [Char] ParserState Block definitionList = do items <- many1 definitionListItem -- "compactify" the definition list: let defs = map snd items let defBlocks = reverse $ concat $ concat defs let isPara (Para _) = True isPara _ = False let items' = case take 1 defBlocks of [Para x] -> if not $ any isPara (drop 1 defBlocks) then let (t,ds) = last items lastDef = last ds ds' = init ds ++ [init lastDef ++ [Plain x]] in init items ++ [(t, ds')] else items _ -> items return $ DefinitionList items' -- -- paragraph block -- isHtmlOrBlank :: Inline -> Bool isHtmlOrBlank (RawInline "html" _) = True isHtmlOrBlank (Space) = True isHtmlOrBlank (LineBreak) = True isHtmlOrBlank _ = False para :: Parser [Char] ParserState Block para = try $ do result <- liftM normalizeSpaces $ many1 inline guard $ not . all isHtmlOrBlank $ result option (Plain result) $ try $ do newline (blanklines >> return Null) <|> (guardDisabled Ext_blank_before_blockquote >> lookAhead blockQuote) <|> (guardDisabled Ext_blank_before_header >> lookAhead header) return $ Para result plain :: Parser [Char] ParserState Block plain = many1 inline >>~ spaces >>= return . Plain . normalizeSpaces -- -- raw html -- htmlElement :: Parser [Char] ParserState [Char] htmlElement = strictHtmlBlock <|> liftM snd (htmlTag isBlockTag) htmlBlock :: Parser [Char] ParserState Block htmlBlock = RawBlock "html" `fmap` ((guardEnabled Ext_markdown_in_html_blocks >> rawHtmlBlocks) <|> htmlBlock') htmlBlock' :: Parser [Char] ParserState String htmlBlock' = try $ do first <- htmlElement finalSpace <- many spaceChar finalNewlines <- many newline return $ first ++ finalSpace ++ finalNewlines strictHtmlBlock :: Parser [Char] ParserState [Char] strictHtmlBlock = htmlInBalanced (not . isInlineTag) rawVerbatimBlock :: Parser [Char] ParserState String rawVerbatimBlock = try $ do (TagOpen tag _, open) <- htmlTag (tagOpen (\t -> t == "pre" || t == "style" || t == "script") (const True)) contents <- manyTill anyChar (htmlTag (~== TagClose tag)) return $ open ++ contents ++ renderTags [TagClose tag] rawTeXBlock :: Parser [Char] ParserState Block rawTeXBlock = do guardEnabled Ext_raw_tex result <- liftM (RawBlock "latex") rawLaTeXBlock <|> liftM (RawBlock "context") rawConTeXtEnvironment spaces return result rawHtmlBlocks :: Parser [Char] ParserState String rawHtmlBlocks = do htmlBlocks <- many1 $ do blk <- rawVerbatimBlock <|> liftM snd (htmlTag isBlockTag) sps <- do sp1 <- many spaceChar sp2 <- option "" (blankline >> return "\n") sp3 <- many spaceChar sp4 <- option "" blanklines return $ sp1 ++ sp2 ++ sp3 ++ sp4 -- note: we want raw html to be able to -- precede a code block, when separated -- by a blank line return $ blk ++ sps let combined = concat htmlBlocks return $ if last combined == '\n' then init combined else combined -- -- Tables -- -- Parse a dashed line with optional trailing spaces; return its length -- and the length including trailing space. dashedLine :: Char -> Parser [Char] st (Int, Int) dashedLine ch = do dashes <- many1 (char ch) sp <- many spaceChar return $ (length dashes, length $ dashes ++ sp) -- Parse a table header with dashed lines of '-' preceded by -- one (or zero) line of text. simpleTableHeader :: Bool -- ^ Headerless table -> Parser [Char] ParserState ([[Block]], [Alignment], [Int]) simpleTableHeader headless = try $ do rawContent <- if headless then return "" else anyLine initSp <- nonindentSpaces dashes <- many1 (dashedLine '-') newline let (lengths, lines') = unzip dashes let indices = scanl (+) (length initSp) lines' -- If no header, calculate alignment on basis of first row of text rawHeads <- liftM (tail . splitStringByIndices (init indices)) $ if headless then lookAhead anyLine else return rawContent let aligns = zipWith alignType (map (\a -> [a]) rawHeads) lengths let rawHeads' = if headless then replicate (length dashes) "" else rawHeads heads <- mapM (parseFromString (many plain)) $ map removeLeadingTrailingSpace rawHeads' return (heads, aligns, indices) -- Parse a table footer - dashed lines followed by blank line. tableFooter :: Parser [Char] ParserState [Char] tableFooter = try $ skipNonindentSpaces >> many1 (dashedLine '-') >> blanklines -- Parse a table separator - dashed line. tableSep :: Parser [Char] ParserState Char tableSep = try $ skipNonindentSpaces >> many1 (dashedLine '-') >> char '\n' -- Parse a raw line and split it into chunks by indices. rawTableLine :: [Int] -> Parser [Char] ParserState [String] rawTableLine indices = do notFollowedBy' (blanklines <|> tableFooter) line <- many1Till anyChar newline return $ map removeLeadingTrailingSpace $ tail $ splitStringByIndices (init indices) line -- Parse a table line and return a list of lists of blocks (columns). tableLine :: [Int] -> Parser [Char] ParserState [[Block]] tableLine indices = rawTableLine indices >>= mapM (parseFromString (many plain)) -- Parse a multiline table row and return a list of blocks (columns). multilineRow :: [Int] -> Parser [Char] ParserState [[Block]] multilineRow indices = do colLines <- many1 (rawTableLine indices) let cols = map unlines $ transpose colLines mapM (parseFromString (many plain)) cols -- Parses a table caption: inlines beginning with 'Table:' -- and followed by blank lines. tableCaption :: Parser [Char] ParserState [Inline] tableCaption = try $ do guardEnabled Ext_table_captions skipNonindentSpaces string ":" <|> string "Table:" result <- many1 inline blanklines return $ normalizeSpaces result -- Parse a simple table with '---' header and one line per row. simpleTable :: Bool -- ^ Headerless table -> Parser [Char] ParserState Block simpleTable headless = do Table c a _w h l <- tableWith (simpleTableHeader headless) tableLine (return ()) (if headless then tableFooter else tableFooter <|> blanklines) -- Simple tables get 0s for relative column widths (i.e., use default) return $ Table c a (replicate (length a) 0) h l -- Parse a multiline table: starts with row of '-' on top, then header -- (which may be multiline), then the rows, -- which may be multiline, separated by blank lines, and -- ending with a footer (dashed line followed by blank line). multilineTable :: Bool -- ^ Headerless table -> Parser [Char] ParserState Block multilineTable headless = tableWith (multilineTableHeader headless) multilineRow blanklines tableFooter multilineTableHeader :: Bool -- ^ Headerless table -> Parser [Char] ParserState ([[Block]], [Alignment], [Int]) multilineTableHeader headless = try $ do if headless then return '\n' else tableSep >>~ notFollowedBy blankline rawContent <- if headless then return $ repeat "" else many1 (notFollowedBy tableSep >> many1Till anyChar newline) initSp <- nonindentSpaces dashes <- many1 (dashedLine '-') newline let (lengths, lines') = unzip dashes let indices = scanl (+) (length initSp) lines' rawHeadsList <- if headless then liftM (map (:[]) . tail . splitStringByIndices (init indices)) $ lookAhead anyLine else return $ transpose $ map (\ln -> tail $ splitStringByIndices (init indices) ln) rawContent let aligns = zipWith alignType rawHeadsList lengths let rawHeads = if headless then replicate (length dashes) "" else map (intercalate " ") rawHeadsList heads <- mapM (parseFromString (many plain)) $ map removeLeadingTrailingSpace rawHeads return (heads, aligns, indices) -- Returns an alignment type for a table, based on a list of strings -- (the rows of the column header) and a number (the length of the -- dashed line under the rows. alignType :: [String] -> Int -> Alignment alignType [] _ = AlignDefault alignType strLst len = let nonempties = filter (not . null) $ map removeTrailingSpace strLst (leftSpace, rightSpace) = case sortBy (comparing length) nonempties of (x:_) -> (head x `elem` " \t", length x < len) [] -> (False, False) in case (leftSpace, rightSpace) of (True, False) -> AlignRight (False, True) -> AlignLeft (True, True) -> AlignCenter (False, False) -> AlignDefault gridTable :: Bool -- ^ Headerless table -> Parser [Char] ParserState Block gridTable = gridTableWith block pipeTable :: Bool -- ^ Headerless table -> Parser [Char] ParserState Block pipeTable headless = tableWith (pipeTableHeader headless) (\_ -> pipeTableRow) (return ()) blanklines -- | Parse header for an pipe table. pipeTableHeader :: Bool -- ^ Headerless table -> Parser [Char] ParserState ([[Block]], [Alignment], [Int]) pipeTableHeader headless = do try $ do heads <- if headless then return $ repeat [] else pipeTableRow aligns <- nonindentSpaces >> optional (char '|') >> pipeTableHeaderPart `sepBy1` sepPipe optional (char '|') newline let cols = length aligns return (take cols heads, aligns, []) sepPipe :: Parser [Char] ParserState () sepPipe = try $ char '|' >> notFollowedBy blankline pipeTableRow :: Parser [Char] ParserState [[Block]] pipeTableRow = do nonindentSpaces optional (char '|') let cell = many (notFollowedBy (blankline <|> char '|') >> inline) first <- cell sepPipe rest <- cell `sepBy1` sepPipe optional (char '|') blankline return $ map (\ils -> if null ils then [] else [Plain $ normalizeSpaces ils]) (first:rest) pipeTableHeaderPart :: Parser [Char] st Alignment pipeTableHeaderPart = do left <- optionMaybe (char ':') many1 (char '-') right <- optionMaybe (char ':') return $ case (left,right) of (Nothing,Nothing) -> AlignDefault (Just _,Nothing) -> AlignLeft (Nothing,Just _) -> AlignRight (Just _,Just _) -> AlignCenter -- Succeed only if current line contains a pipe. scanForPipe :: Parser [Char] st () scanForPipe = lookAhead (manyTill (satisfy (/='\n')) (char '|')) >> return () table :: Parser [Char] ParserState Block table = try $ do frontCaption <- option [] tableCaption Table _ aligns widths heads lines' <- try (guardEnabled Ext_pipe_tables >> scanForPipe >> (pipeTable True <|> pipeTable False)) <|> try (guardEnabled Ext_multiline_tables >> (multilineTable False <|> simpleTable True)) <|> try (guardEnabled Ext_simple_tables >> (simpleTable False <|> multilineTable True)) <|> try (guardEnabled Ext_grid_tables >> (gridTable False <|> gridTable True)) "table" optional blanklines caption <- if null frontCaption then option [] tableCaption else return frontCaption return $ Table caption aligns widths heads lines' -- -- inline -- inline :: Parser [Char] ParserState Inline inline = choice inlineParsers "inline" inlineParsers :: [Parser [Char] ParserState Inline] inlineParsers = [ whitespace , str , endline , code , fours , strong , emph , note , link , cite , image , math , strikeout , superscript , subscript , inlineNote -- after superscript because of ^[link](/foo)^ , autoLink , rawHtmlInline , escapedChar , rawLaTeXInline' , exampleRef , smartPunctuation inline , charRef , symbol , ltSign ] escapedChar' :: Parser [Char] ParserState Char escapedChar' = try $ do char '\\' (guardEnabled Ext_all_symbols_escapable >> satisfy (not . isAlphaNum)) <|> oneOf "\\`*_{}[]()>#+-.!~" escapedChar :: Parser [Char] ParserState Inline escapedChar = do result <- escapedChar' case result of ' ' -> return $ Str "\160" -- "\ " is a nonbreaking space '\n' -> guardEnabled Ext_escaped_line_breaks >> return LineBreak -- "\[newline]" is a linebreak _ -> return $ Str [result] ltSign :: Parser [Char] ParserState Inline ltSign = do guardDisabled Ext_markdown_in_html_blocks <|> (notFollowedBy' rawHtmlBlocks >> return ()) char '<' return $ Str ['<'] exampleRef :: Parser [Char] ParserState Inline exampleRef = try $ do guardEnabled Ext_example_lists char '@' lab <- many1 (alphaNum <|> oneOf "-_") -- We just return a Str. These are replaced with numbers -- later. See the end of parseMarkdown. return $ Str $ '@' : lab symbol :: Parser [Char] ParserState Inline symbol = do result <- noneOf "<\\\n\t " <|> try (do lookAhead $ char '\\' notFollowedBy' rawTeXBlock char '\\') return $ Str [result] -- parses inline code, between n `s and n `s code :: Parser [Char] ParserState Inline code = try $ do starts <- many1 (char '`') skipSpaces result <- many1Till (many1 (noneOf "`\n") <|> many1 (char '`') <|> (char '\n' >> notFollowedBy' blankline >> return " ")) (try (skipSpaces >> count (length starts) (char '`') >> notFollowedBy (char '`'))) attr <- option ([],[],[]) (try $ guardEnabled Ext_inline_code_attributes >> optional whitespace >> attributes) return $ Code attr $ removeLeadingTrailingSpace $ concat result mathWord :: Parser [Char] st [Char] mathWord = liftM concat $ many1 mathChunk mathChunk :: Parser [Char] st [Char] mathChunk = do char '\\' c <- anyChar return ['\\',c] <|> many1 (satisfy $ \c -> not (isBlank c || c == '\\' || c == '$')) math :: Parser [Char] ParserState Inline math = (mathDisplay >>= applyMacros' >>= return . Math DisplayMath) <|> (mathInline >>= applyMacros' >>= return . Math InlineMath) mathDisplay :: Parser [Char] ParserState String mathDisplay = try $ do guardEnabled Ext_tex_math string "$$" many1Till (noneOf "\n" <|> (newline >>~ notFollowedBy' blankline)) (try $ string "$$") mathInline :: Parser [Char] ParserState String mathInline = try $ do guardEnabled Ext_tex_math char '$' notFollowedBy space words' <- sepBy1 mathWord (many1 (spaceChar <|> (newline >>~ notFollowedBy' blankline))) char '$' notFollowedBy digit return $ intercalate " " words' -- to avoid performance problems, treat 4 or more _ or * or ~ or ^ in a row -- as a literal rather than attempting to parse for emph/strong/strikeout/super/sub fours :: Parser [Char] st Inline fours = try $ do x <- char '*' <|> char '_' <|> char '~' <|> char '^' count 2 $ satisfy (==x) rest <- many1 (satisfy (==x)) return $ Str (x:x:x:rest) -- | Parses a list of inlines between start and end delimiters. inlinesBetween :: (Show b) => Parser [Char] ParserState a -> Parser [Char] ParserState b -> Parser [Char] ParserState [Inline] inlinesBetween start end = normalizeSpaces `liftM` try (start >> many1Till inner end) where inner = innerSpace <|> (notFollowedBy' whitespace >> inline) innerSpace = try $ whitespace >>~ notFollowedBy' end -- This is used to prevent exponential blowups for things like: -- a**a*a**a*a**a*a**a*a**a*a**a*a** nested :: Parser [Char] ParserState a -> Parser [Char] ParserState a nested p = do nestlevel <- stateMaxNestingLevel `fmap` getState guard $ nestlevel > 0 updateState $ \st -> st{ stateMaxNestingLevel = stateMaxNestingLevel st - 1 } res <- p updateState $ \st -> st{ stateMaxNestingLevel = nestlevel } return res emph :: Parser [Char] ParserState Inline emph = Emph `fmap` nested (inlinesBetween starStart starEnd <|> inlinesBetween ulStart ulEnd) where starStart = char '*' >> lookAhead nonspaceChar starEnd = notFollowedBy' strong >> char '*' ulStart = char '_' >> lookAhead nonspaceChar ulEnd = notFollowedBy' strong >> char '_' strong :: Parser [Char] ParserState Inline strong = Strong `liftM` nested (inlinesBetween starStart starEnd <|> inlinesBetween ulStart ulEnd) where starStart = string "**" >> lookAhead nonspaceChar starEnd = try $ string "**" ulStart = string "__" >> lookAhead nonspaceChar ulEnd = try $ string "__" strikeout :: Parser [Char] ParserState Inline strikeout = Strikeout `liftM` (guardEnabled Ext_strikeout >> inlinesBetween strikeStart strikeEnd) where strikeStart = string "~~" >> lookAhead nonspaceChar >> notFollowedBy (char '~') strikeEnd = try $ string "~~" superscript :: Parser [Char] ParserState Inline superscript = guardEnabled Ext_superscript >> enclosed (char '^') (char '^') (notFollowedBy spaceChar >> inline) >>= -- may not contain Space return . Superscript subscript :: Parser [Char] ParserState Inline subscript = guardEnabled Ext_subscript >> enclosed (char '~') (char '~') (notFollowedBy spaceChar >> inline) >>= -- may not contain Space return . Subscript whitespace :: Parser [Char] ParserState Inline whitespace = spaceChar >> ( (spaceChar >> skipMany spaceChar >> option Space (endline >> return LineBreak)) <|> (skipMany spaceChar >> return Space) ) "whitespace" nonEndline :: Parser [Char] st Char nonEndline = satisfy (/='\n') str :: Parser [Char] ParserState Inline str = do smart <- (readerSmart . stateOptions) `fmap` getState a <- alphaNum as <- many $ alphaNum <|> (guardEnabled Ext_intraword_underscores >> try (char '_' >>~ lookAhead alphaNum)) <|> if smart then (try $ satisfy (\c -> c == '\'' || c == '\x2019') >> lookAhead alphaNum >> return '\x2019') -- for things like l'aide else mzero pos <- getPosition updateState $ \s -> s{ stateLastStrPos = Just pos } let result = a:as let spacesToNbr = map (\c -> if c == ' ' then '\160' else c) if smart then case likelyAbbrev result of [] -> return $ Str result xs -> choice (map (\x -> try (string x >> oneOf " \n" >> lookAhead alphaNum >> return (Str $ result ++ spacesToNbr x ++ "\160"))) xs) <|> (return $ Str result) else return $ Str result -- | if the string matches the beginning of an abbreviation (before -- the first period, return strings that would finish the abbreviation. likelyAbbrev :: String -> [String] likelyAbbrev x = let abbrevs = [ "Mr.", "Mrs.", "Ms.", "Capt.", "Dr.", "Prof.", "Gen.", "Gov.", "e.g.", "i.e.", "Sgt.", "St.", "vol.", "vs.", "Sen.", "Rep.", "Pres.", "Hon.", "Rev.", "Ph.D.", "M.D.", "M.A.", "p.", "pp.", "ch.", "sec.", "cf.", "cp."] abbrPairs = map (break (=='.')) abbrevs in map snd $ filter (\(y,_) -> y == x) abbrPairs -- an endline character that can be treated as a space, not a structural break endline :: Parser [Char] ParserState Inline endline = try $ do newline notFollowedBy blankline guardEnabled Ext_blank_before_blockquote <|> notFollowedBy emailBlockQuoteStart guardEnabled Ext_blank_before_header <|> notFollowedBy (char '#') -- atx header -- parse potential list-starts differently if in a list: st <- getState when (stateParserContext st == ListItemState) $ do notFollowedBy' bulletListStart notFollowedBy' anyOrderedListStart return Space -- -- links -- -- a reference label for a link reference :: Parser [Char] ParserState [Inline] reference = do notFollowedBy' (string "[^") -- footnote reference result <- inlinesInBalancedBrackets inline return $ normalizeSpaces result -- source for a link, with optional title source :: Parser [Char] ParserState (String, [Char]) source = (try $ charsInBalanced '(' ')' litChar >>= parseFromString source') <|> -- the following is needed for cases like: [ref](/url(a). (enclosed (char '(') (char ')') litChar >>= parseFromString source') -- auxiliary function for source source' :: Parser [Char] ParserState (String, [Char]) source' = do skipSpaces let nl = char '\n' >>~ notFollowedBy blankline let sourceURL = liftM unwords $ many $ try $ do notFollowedBy' linkTitle skipMany spaceChar optional nl skipMany spaceChar many1 $ escapedChar' <|> satisfy (not . isBlank) let betweenAngles = try $ char '<' >> manyTill (escapedChar' <|> noneOf ">\n" <|> nl) (char '>') src <- try betweenAngles <|> sourceURL tit <- option "" linkTitle skipSpaces eof return (escapeURI $ removeTrailingSpace src, tit) linkTitle :: Parser [Char] ParserState String linkTitle = try $ do (many1 spaceChar >> option '\n' newline) <|> newline skipSpaces delim <- oneOf "'\"" tit <- manyTill litChar (try (char delim >> skipSpaces >> eof)) return $ fromEntities tit link :: Parser [Char] ParserState Inline link = try $ do lab <- reference (src, tit) <- source <|> referenceLink lab return $ Link (delinkify lab) (src, tit) delinkify :: [Inline] -> [Inline] delinkify = bottomUp $ concatMap go where go (Link lab _) = lab go x = [x] -- a link like [this][ref] or [this][] or [this] referenceLink :: [Inline] -> Parser [Char] ParserState (String, [Char]) referenceLink lab = do ref <- option [] (try (optional (char ' ') >> optional (newline >> skipSpaces) >> reference)) let ref' = if null ref then lab else ref state <- getState case lookupKeySrc (stateKeys state) (toKey ref') of Nothing -> fail "no corresponding key" Just target -> return target autoLink :: Parser [Char] ParserState Inline autoLink = try $ do char '<' (orig, src) <- uri <|> emailAddress char '>' (guardEnabled Ext_autolink_code_spans >> return (Link [Code ("",["url"],[]) orig] (src, ""))) <|> return (Link [Str orig] (src, "")) image :: Parser [Char] ParserState Inline image = try $ do char '!' lab <- reference (src, tit) <- source <|> referenceLink lab return $ Image lab (src,tit) note :: Parser [Char] ParserState Inline note = try $ do guardEnabled Ext_footnotes ref <- noteMarker state <- getState let notes = stateNotes state case lookup ref notes of Nothing -> fail "note not found" Just raw -> do -- We temporarily empty the note list while parsing the note, -- so that we don't get infinite loops with notes inside notes... -- Note references inside other notes do not work. updateState $ \st -> st{ stateNotes = [] } contents <- parseFromString parseBlocks raw updateState $ \st -> st{ stateNotes = notes } return $ Note contents inlineNote :: Parser [Char] ParserState Inline inlineNote = try $ do guardEnabled Ext_inline_notes char '^' contents <- inlinesInBalancedBrackets inline return $ Note [Para contents] rawLaTeXInline' :: Parser [Char] ParserState Inline rawLaTeXInline' = try $ do guardEnabled Ext_raw_tex lookAhead $ char '\\' >> notFollowedBy' (string "start") -- context env RawInline _ s <- rawLaTeXInline return $ RawInline "tex" s -- "tex" because it might be context or latex rawConTeXtEnvironment :: Parser [Char] st String rawConTeXtEnvironment = try $ do string "\\start" completion <- inBrackets (letter <|> digit <|> spaceChar) <|> (many1 letter) contents <- manyTill (rawConTeXtEnvironment <|> (count 1 anyChar)) (try $ string "\\stop" >> string completion) return $ "\\start" ++ completion ++ concat contents ++ "\\stop" ++ completion inBrackets :: (Parser [Char] st Char) -> Parser [Char] st String inBrackets parser = do char '[' contents <- many parser char ']' return $ "[" ++ contents ++ "]" rawHtmlInline :: Parser [Char] ParserState Inline rawHtmlInline = do mdInHtml <- option False $ guardEnabled Ext_markdown_in_html_blocks >> return True (_,result) <- if mdInHtml then htmlTag isInlineTag else htmlTag (not . isTextTag) return $ RawInline "html" result -- Citations cite :: Parser [Char] ParserState Inline cite = do guardEnabled Ext_citations citations <- textualCite <|> normalCite return $ Cite citations [] spnl :: Parser [Char] st () spnl = try $ do skipSpaces optional newline skipSpaces notFollowedBy (char '\n') textualCite :: Parser [Char] ParserState [Citation] textualCite = try $ do (_, key) <- citeKey let first = Citation{ citationId = key , citationPrefix = [] , citationSuffix = [] , citationMode = AuthorInText , citationNoteNum = 0 , citationHash = 0 } rest <- option [] $ try $ spnl >> normalCite if null rest then option [first] $ bareloc first else return $ first : rest bareloc :: Citation -> Parser [Char] ParserState [Citation] bareloc c = try $ do spnl char '[' suff <- suffix rest <- option [] $ try $ char ';' >> citeList spnl char ']' return $ c{ citationSuffix = suff } : rest normalCite :: Parser [Char] ParserState [Citation] normalCite = try $ do char '[' spnl citations <- citeList spnl char ']' return citations citeKey :: Parser [Char] ParserState (Bool, String) citeKey = try $ do suppress_author <- option False (char '-' >> return True) char '@' first <- letter let internal p = try $ p >>~ lookAhead (letter <|> digit) rest <- many $ letter <|> digit <|> internal (oneOf ":.#$%&-_?<>~") let key = first:rest citations' <- getOption readerCitations guard $ key `elem` citations' return (suppress_author, key) suffix :: Parser [Char] ParserState [Inline] suffix = try $ do hasSpace <- option False (notFollowedBy nonspaceChar >> return True) spnl rest <- liftM normalizeSpaces $ many $ notFollowedBy (oneOf ";]") >> inline return $ if hasSpace then Space : rest else rest prefix :: Parser [Char] ParserState [Inline] prefix = liftM normalizeSpaces $ manyTill inline (char ']' <|> liftM (const ']') (lookAhead citeKey)) citeList :: Parser [Char] ParserState [Citation] citeList = sepBy1 citation (try $ char ';' >> spnl) citation :: Parser [Char] ParserState Citation citation = try $ do pref <- prefix (suppress_author, key) <- citeKey suff <- suffix return $ Citation{ citationId = key , citationPrefix = pref , citationSuffix = suff , citationMode = if suppress_author then SuppressAuthor else NormalCitation , citationNoteNum = 0 , citationHash = 0 }