From 42aca57dee8d88afa5fac512aeb1198102908865 Mon Sep 17 00:00:00 2001 From: fiddlosopher Date: Sat, 24 Jan 2009 20:00:26 +0000 Subject: Moved all haskell source to src subdirectory. git-svn-id: https://pandoc.googlecode.com/svn/trunk@1528 788f1e2b-df1e-0410-8736-df70ead52e1b --- src/Text/Pandoc/Readers/Markdown.hs | 1243 +++++++++++++++++++++++++++++++++++ 1 file changed, 1243 insertions(+) create mode 100644 src/Text/Pandoc/Readers/Markdown.hs (limited to 'src/Text/Pandoc/Readers/Markdown.hs') diff --git a/src/Text/Pandoc/Readers/Markdown.hs b/src/Text/Pandoc/Readers/Markdown.hs new file mode 100644 index 000000000..896f5832e --- /dev/null +++ b/src/Text/Pandoc/Readers/Markdown.hs @@ -0,0 +1,1243 @@ +{-# LANGUAGE CPP #-} +{- +Copyright (C) 2006-8 John MacFarlane + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Readers.Markdown + Copyright : Copyright (C) 2006-8 John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : John MacFarlane + Stability : alpha + Portability : portable + +Conversion of markdown-formatted plain text to 'Pandoc' document. +-} +module Text.Pandoc.Readers.Markdown ( + readMarkdown + ) where + +import Data.List ( transpose, isPrefixOf, isSuffixOf, lookup, sortBy, findIndex, intercalate ) +import Data.Ord ( comparing ) +import Data.Char ( isAlphaNum, isAlpha, isLower, isDigit, isUpper ) +import Data.Maybe +import Text.Pandoc.Definition +import Text.Pandoc.Shared +import Text.Pandoc.Readers.LaTeX ( rawLaTeXInline, rawLaTeXEnvironment' ) +import Text.Pandoc.Readers.HTML ( rawHtmlBlock, anyHtmlBlockTag, + anyHtmlInlineTag, anyHtmlTag, + anyHtmlEndTag, htmlEndTag, extractTagType, + htmlBlockElement, unsanitaryURI ) +import Text.Pandoc.CharacterReferences ( decodeCharacterReferences ) +import Text.ParserCombinators.Parsec +import Control.Monad (when) + +-- | Read markdown from an input string and return a Pandoc document. +readMarkdown :: ParserState -> String -> Pandoc +readMarkdown state s = (readWith parseMarkdown) state (s ++ "\n\n") + +-- +-- Constants and data structure definitions +-- + +spaceChars :: [Char] +spaceChars = " \t" + +bulletListMarkers :: [Char] +bulletListMarkers = "*+-" + +hruleChars :: [Char] +hruleChars = "*-_" + +setextHChars :: [Char] +setextHChars = "=-" + +-- treat these as potentially non-text when parsing inline: +specialChars :: [Char] +specialChars = "\\[]*_~`<>$!^-.&'\"\8216\8217\8220\8221" + +-- +-- auxiliary functions +-- + +indentSpaces :: GenParser Char ParserState [Char] +indentSpaces = try $ do + state <- getState + let tabStop = stateTabStop state + try (count tabStop (char ' ')) <|> + (many (char ' ') >> string "\t") "indentation" + +nonindentSpaces :: GenParser Char ParserState [Char] +nonindentSpaces = do + state <- getState + let tabStop = stateTabStop state + sps <- many (char ' ') + if length sps < tabStop + then return sps + else unexpected "indented line" + +-- | Fail unless we're at beginning of a line. +failUnlessBeginningOfLine :: GenParser tok st () +failUnlessBeginningOfLine = do + pos <- getPosition + if sourceColumn pos == 1 then return () else fail "not beginning of line" + +-- | Fail unless we're in "smart typography" mode. +failUnlessSmart :: GenParser tok ParserState () +failUnlessSmart = do + state <- getState + if stateSmart state then return () else fail "Smart typography feature" + +-- | Parse a sequence of inline elements between square brackets, +-- including inlines between balanced pairs of square brackets. +inlinesInBalancedBrackets :: GenParser Char ParserState Inline + -> GenParser Char ParserState [Inline] +inlinesInBalancedBrackets parser = try $ do + char '[' + result <- manyTill ( (do lookAhead $ try $ do (Str res) <- parser + if res == "[" + then return () + else pzero + bal <- inlinesInBalancedBrackets parser + return $ [Str "["] ++ bal ++ [Str "]"]) + <|> (count 1 parser)) + (char ']') + return $ concat result + +-- +-- document structure +-- + +titleLine :: GenParser Char ParserState [Inline] +titleLine = try $ char '%' >> skipSpaces >> manyTill inline newline + +authorsLine :: GenParser Char st [String] +authorsLine = try $ do + char '%' + skipSpaces + authors <- sepEndBy (many1 (noneOf ",;\n")) (oneOf ",;") + newline + return $ map (decodeCharacterReferences . removeLeadingTrailingSpace) authors + +dateLine :: GenParser Char st String +dateLine = try $ do + char '%' + skipSpaces + date <- many (noneOf "\n") + newline + return $ decodeCharacterReferences $ removeTrailingSpace date + +titleBlock :: GenParser Char ParserState ([Inline], [String], [Char]) +titleBlock = try $ do + failIfStrict + title <- option [] titleLine + author <- option [] authorsLine + date <- option "" dateLine + optional blanklines + return (title, author, date) + +parseMarkdown :: GenParser Char ParserState Pandoc +parseMarkdown = do + -- markdown allows raw HTML + updateState (\state -> state { stateParseRaw = True }) + startPos <- getPosition + -- go through once just to get list of reference keys + -- docMinusKeys is the raw document with blanks where the keys were... + docMinusKeys <- manyTill (referenceKey <|> lineClump) eof >>= + return . concat + setInput docMinusKeys + setPosition startPos + st <- getState + -- go through again for notes unless strict... + if stateStrict st + then return () + else do docMinusNotes <- manyTill (noteBlock <|> lineClump) eof >>= + return . concat + st' <- getState + let reversedNotes = stateNotes st' + updateState $ \s -> s { stateNotes = reverse reversedNotes } + setInput docMinusNotes + setPosition startPos + -- now parse it for real... + (title, author, date) <- option ([],[],"") titleBlock + blocks <- parseBlocks + return $ Pandoc (Meta title author date) $ filter (/= Null) blocks + +-- +-- initial pass for references and notes +-- + +referenceKey :: GenParser Char ParserState [Char] +referenceKey = try $ do + startPos <- getPosition + nonindentSpaces + lab <- reference + char ':' + skipSpaces >> optional newline >> skipSpaces >> notFollowedBy (char '[') + let sourceURL excludes = many $ + optional (char '\\') >> (noneOf (' ':excludes) <|> (notFollowedBy' referenceTitle >> char ' ')) + src <- try (char '<' >> sourceURL ">\t\n" >>~ char '>') <|> sourceURL "\t\n" + tit <- option "" referenceTitle + blanklines + endPos <- getPosition + let newkey = (lab, (intercalate "%20" $ words $ removeTrailingSpace src, tit)) + st <- getState + let oldkeys = stateKeys st + updateState $ \s -> s { stateKeys = newkey : oldkeys } + -- return blanks so line count isn't affected + return $ replicate (sourceLine endPos - sourceLine startPos) '\n' + +referenceTitle :: GenParser Char st String +referenceTitle = try $ do + skipSpaces >> optional newline >> skipSpaces + tit <- (charsInBalanced '(' ')' >>= return . unwords . words) + <|> do delim <- char '\'' <|> char '"' + manyTill anyChar (try (char delim >> skipSpaces >> + notFollowedBy (noneOf ")\n"))) + return $ decodeCharacterReferences tit + +noteMarker :: GenParser Char st [Char] +noteMarker = string "[^" >> manyTill (noneOf " \t\n") (char ']') + +rawLine :: GenParser Char ParserState [Char] +rawLine = do + notFollowedBy blankline + notFollowedBy' noteMarker + contents <- many1 nonEndline + end <- option "" (newline >> optional indentSpaces >> return "\n") + return $ contents ++ end + +rawLines :: GenParser Char ParserState [Char] +rawLines = many1 rawLine >>= return . concat + +noteBlock :: GenParser Char ParserState [Char] +noteBlock = try $ do + startPos <- getPosition + ref <- noteMarker + char ':' + optional blankline + optional indentSpaces + raw <- sepBy rawLines (try (blankline >> indentSpaces)) + optional blanklines + endPos <- getPosition + -- parse the extracted text, which may contain various block elements: + contents <- parseFromString parseBlocks $ (intercalate "\n" raw) ++ "\n\n" + let newnote = (ref, contents) + st <- getState + let oldnotes = stateNotes st + updateState $ \s -> s { stateNotes = newnote : oldnotes } + -- return blanks so line count isn't affected + return $ replicate (sourceLine endPos - sourceLine startPos) '\n' + +-- +-- parsing blocks +-- + +parseBlocks :: GenParser Char ParserState [Block] +parseBlocks = manyTill block eof + +block :: GenParser Char ParserState Block +block = do + st <- getState + choice (if stateStrict st + then [ header + , codeBlockIndented + , blockQuote + , hrule + , bulletList + , orderedList + , htmlBlock + , para + , plain + , nullBlock ] + else [ codeBlockDelimited + , header + , table + , codeBlockIndented + , lhsCodeBlock + , blockQuote + , hrule + , bulletList + , orderedList + , definitionList + , para + , rawHtmlBlocks + , plain + , nullBlock ]) "block" + +-- +-- header blocks +-- + +header :: GenParser Char ParserState Block +header = setextHeader <|> atxHeader "header" + +atxHeader :: GenParser Char ParserState Block +atxHeader = try $ do + level <- many1 (char '#') >>= return . length + notFollowedBy (char '.' <|> char ')') -- this would be a list + skipSpaces + text <- manyTill inline atxClosing >>= return . normalizeSpaces + return $ Header level text + +atxClosing :: GenParser Char st [Char] +atxClosing = try $ skipMany (char '#') >> blanklines + +setextHeader :: GenParser Char ParserState Block +setextHeader = try $ do + text <- many1Till inline newline + underlineChar <- oneOf setextHChars + many (char underlineChar) + blanklines + let level = (fromMaybe 0 $ findIndex (== underlineChar) setextHChars) + 1 + return $ Header level (normalizeSpaces text) + +-- +-- hrule block +-- + +hrule :: GenParser Char st Block +hrule = try $ do + skipSpaces + start <- oneOf hruleChars + count 2 (skipSpaces >> char start) + skipMany (oneOf spaceChars <|> char start) + newline + optional blanklines + return HorizontalRule + +-- +-- code blocks +-- + +indentedLine :: GenParser Char ParserState [Char] +indentedLine = indentSpaces >> manyTill anyChar newline >>= return . (++ "\n") + +codeBlockDelimiter :: Maybe Int + -> GenParser Char st (Int, ([Char], [[Char]], [([Char], [Char])])) +codeBlockDelimiter len = try $ do + size <- case len of + Just l -> count l (char '~') >> many (char '~') >> return l + Nothing -> count 3 (char '~') >> many (char '~') >>= + return . (+ 3) . length + many spaceChar + attr <- option ([],[],[]) attributes + blankline + return (size, attr) + +attributes :: GenParser Char st ([Char], [[Char]], [([Char], [Char])]) +attributes = try $ do + char '{' + many spaceChar + attrs <- many (attribute >>~ many spaceChar) + char '}' + let (ids, classes, keyvals) = unzip3 attrs + let id' = if null ids then "" else head ids + return (id', concat classes, concat keyvals) + +attribute :: GenParser Char st ([Char], [[Char]], [([Char], [Char])]) +attribute = identifierAttr <|> classAttr <|> keyValAttr + +identifier :: GenParser Char st [Char] +identifier = do + first <- letter + rest <- many alphaNum + return (first:rest) + +identifierAttr :: GenParser Char st ([Char], [a], [a1]) +identifierAttr = try $ do + char '#' + result <- identifier + return (result,[],[]) + +classAttr :: GenParser Char st ([Char], [[Char]], [a]) +classAttr = try $ do + char '.' + result <- identifier + return ("",[result],[]) + +keyValAttr :: GenParser Char st ([Char], [a], [([Char], [Char])]) +keyValAttr = try $ do + key <- identifier + char '=' + char '"' + val <- manyTill (noneOf "\n") (char '"') + return ("",[],[(key,val)]) + +codeBlockDelimited :: GenParser Char st Block +codeBlockDelimited = try $ do + (size, attr) <- codeBlockDelimiter Nothing + contents <- manyTill anyLine (codeBlockDelimiter (Just size)) + blanklines + return $ CodeBlock attr $ intercalate "\n" contents + +codeBlockIndented :: GenParser Char ParserState Block +codeBlockIndented = do + contents <- many1 (indentedLine <|> + try (do b <- blanklines + l <- indentedLine + return $ b ++ l)) + optional blanklines + return $ CodeBlock ("",[],[]) $ stripTrailingNewlines $ concat contents + +lhsCodeBlock :: GenParser Char ParserState Block +lhsCodeBlock = do + failUnlessLHS + contents <- lhsCodeBlockBird <|> lhsCodeBlockLaTeX + return $ CodeBlock ("",["sourceCode","haskell"],[]) contents + +lhsCodeBlockLaTeX :: GenParser Char ParserState String +lhsCodeBlockLaTeX = try $ do + string "\\begin{code}" + manyTill spaceChar newline + contents <- many1Till anyChar (try $ string "\\end{code}") + blanklines + return $ stripTrailingNewlines contents + +lhsCodeBlockBird :: GenParser Char ParserState String +lhsCodeBlockBird = try $ do + pos <- getPosition + when (sourceColumn pos /= 1) $ fail "Not in first column" + lns <- many1 birdTrackLine + -- if (as is normal) there is always a space after >, drop it + let lns' = if all (\ln -> null ln || take 1 ln == " ") lns + then map (drop 1) lns + else lns + blanklines + return $ intercalate "\n" lns' + +birdTrackLine :: GenParser Char st [Char] +birdTrackLine = do + char '>' + manyTill anyChar newline + + +-- +-- block quotes +-- + +emailBlockQuoteStart :: GenParser Char ParserState Char +emailBlockQuoteStart = try $ nonindentSpaces >> char '>' >>~ optional (char ' ') + +emailBlockQuote :: GenParser Char ParserState [[Char]] +emailBlockQuote = try $ do + emailBlockQuoteStart + raw <- sepBy (many (nonEndline <|> + (try (endline >> notFollowedBy emailBlockQuoteStart >> + return '\n')))) + (try (newline >> emailBlockQuoteStart)) + newline <|> (eof >> return '\n') + optional blanklines + return raw + +blockQuote :: GenParser Char ParserState Block +blockQuote = do + raw <- emailBlockQuote + -- parse the extracted block, which may contain various block elements: + contents <- parseFromString parseBlocks $ (intercalate "\n" raw) ++ "\n\n" + return $ BlockQuote contents + +-- +-- list blocks +-- + +bulletListStart :: GenParser Char ParserState () +bulletListStart = try $ do + optional newline -- if preceded by a Plain block in a list context + nonindentSpaces + notFollowedBy' hrule -- because hrules start out just like lists + oneOf bulletListMarkers + spaceChar + skipSpaces + +anyOrderedListStart :: GenParser Char ParserState (Int, ListNumberStyle, ListNumberDelim) +anyOrderedListStart = try $ do + optional newline -- if preceded by a Plain block in a list context + nonindentSpaces + notFollowedBy $ string "p." >> spaceChar >> digit -- page number + state <- getState + if stateStrict state + then do many1 digit + char '.' + spaceChar + return (1, DefaultStyle, DefaultDelim) + else do (num, style, delim) <- anyOrderedListMarker + -- if it could be an abbreviated first name, insist on more than one space + if delim == Period && (style == UpperAlpha || (style == UpperRoman && + num `elem` [1, 5, 10, 50, 100, 500, 1000])) + then char '\t' <|> (char ' ' >>~ notFollowedBy (satisfy isUpper)) + else spaceChar + skipSpaces + return (num, style, delim) + +listStart :: GenParser Char ParserState () +listStart = bulletListStart <|> (anyOrderedListStart >> return ()) + +-- parse a line of a list item (start = parser for beginning of list item) +listLine :: GenParser Char ParserState [Char] +listLine = try $ do + notFollowedBy' listStart + notFollowedBy blankline + notFollowedBy' (do indentSpaces + many (spaceChar) + listStart) + line <- manyTill anyChar newline + return $ line ++ "\n" + +-- parse raw text for one list item, excluding start marker and continuations +rawListItem :: GenParser Char ParserState [Char] +rawListItem = try $ do + listStart + result <- many1 listLine + blanks <- many blankline + return $ concat result ++ blanks + +-- continuation of a list item - indented and separated by blankline +-- or (in compact lists) endline. +-- note: nested lists are parsed as continuations +listContinuation :: GenParser Char ParserState [Char] +listContinuation = try $ do + lookAhead indentSpaces + result <- many1 listContinuationLine + blanks <- many blankline + return $ concat result ++ blanks + +listContinuationLine :: GenParser Char ParserState [Char] +listContinuationLine = try $ do + notFollowedBy blankline + notFollowedBy' listStart + optional indentSpaces + result <- manyTill anyChar newline + return $ result ++ "\n" + +listItem :: GenParser Char ParserState [Block] +listItem = try $ do + first <- rawListItem + continuations <- many listContinuation + -- parsing with ListItemState forces markers at beginning of lines to + -- count as list item markers, even if not separated by blank space. + -- see definition of "endline" + state <- getState + let oldContext = stateParserContext state + setState $ state {stateParserContext = ListItemState} + -- parse the extracted block, which may contain various block elements: + let raw = concat (first:continuations) + contents <- parseFromString parseBlocks raw + updateState (\st -> st {stateParserContext = oldContext}) + return contents + +orderedList :: GenParser Char ParserState Block +orderedList = try $ do + (start, style, delim) <- lookAhead anyOrderedListStart + items <- many1 listItem + return $ OrderedList (start, style, delim) $ compactify items + +bulletList :: GenParser Char ParserState Block +bulletList = try $ do + lookAhead bulletListStart + many1 listItem >>= return . BulletList . compactify + +-- definition lists + +definitionListItem :: GenParser Char ParserState ([Inline], [Block]) +definitionListItem = try $ do + notFollowedBy blankline + notFollowedBy' indentSpaces + -- first, see if this has any chance of being a definition list: + lookAhead (anyLine >> char ':') + term <- manyTill inline newline + raw <- many1 defRawBlock + state <- getState + let oldContext = stateParserContext state + -- parse the extracted block, which may contain various block elements: + contents <- parseFromString parseBlocks $ concat raw + updateState (\st -> st {stateParserContext = oldContext}) + return ((normalizeSpaces term), contents) + +defRawBlock :: GenParser Char ParserState [Char] +defRawBlock = try $ do + char ':' + state <- getState + let tabStop = stateTabStop state + try (count (tabStop - 1) (char ' ')) <|> (many (char ' ') >> string "\t") + firstline <- anyLine + rawlines <- many (notFollowedBy blankline >> indentSpaces >> anyLine) + trailing <- option "" blanklines + return $ firstline ++ "\n" ++ unlines rawlines ++ trailing + +definitionList :: GenParser Char ParserState Block +definitionList = do + items <- many1 definitionListItem + let (terms, defs) = unzip items + let defs' = compactify defs + let items' = zip terms defs' + return $ DefinitionList items' + +-- +-- paragraph block +-- + +isHtmlOrBlank :: Inline -> Bool +isHtmlOrBlank (HtmlInline _) = True +isHtmlOrBlank (Space) = True +isHtmlOrBlank (LineBreak) = True +isHtmlOrBlank _ = False + +para :: GenParser Char ParserState Block +para = try $ do + result <- many1 inline + if all isHtmlOrBlank result + then fail "treat as raw HTML" + else return () + newline + blanklines <|> do st <- getState + if stateStrict st + then lookAhead (blockQuote <|> header) >> return "" + else pzero + return $ Para $ normalizeSpaces result + +plain :: GenParser Char ParserState Block +plain = many1 inline >>= return . Plain . normalizeSpaces + +-- +-- raw html +-- + +htmlElement :: GenParser Char ParserState [Char] +htmlElement = strictHtmlBlock <|> htmlBlockElement "html element" + +htmlBlock :: GenParser Char ParserState Block +htmlBlock = try $ do + failUnlessBeginningOfLine + first <- htmlElement + finalSpace <- many (oneOf spaceChars) + finalNewlines <- many newline + return $ RawHtml $ first ++ finalSpace ++ finalNewlines + +-- True if tag is self-closing +isSelfClosing :: [Char] -> Bool +isSelfClosing tag = + isSuffixOf "/>" $ filter (not . (`elem` " \n\t")) tag + +strictHtmlBlock :: GenParser Char ParserState [Char] +strictHtmlBlock = try $ do + tag <- anyHtmlBlockTag + let tag' = extractTagType tag + if isSelfClosing tag || tag' == "hr" + then return tag + else do contents <- many (notFollowedBy' (htmlEndTag tag') >> + (htmlElement <|> (count 1 anyChar))) + end <- htmlEndTag tag' + return $ tag ++ concat contents ++ end + +rawHtmlBlocks :: GenParser Char ParserState Block +rawHtmlBlocks = do + htmlBlocks <- many1 $ do (RawHtml blk) <- rawHtmlBlock + sps <- do sp1 <- many spaceChar + sp2 <- option "" (blankline >> return "\n") + sp3 <- many spaceChar + sp4 <- option "" blanklines + return $ sp1 ++ sp2 ++ sp3 ++ sp4 + -- note: we want raw html to be able to + -- precede a code block, when separated + -- by a blank line + return $ blk ++ sps + let combined = concat htmlBlocks + let combined' = if last combined == '\n' then init combined else combined + return $ RawHtml combined' + +-- +-- Tables +-- + +-- Parse a dashed line with optional trailing spaces; return its length +-- and the length including trailing space. +dashedLine :: Char + -> GenParser Char st (Int, Int) +dashedLine ch = do + dashes <- many1 (char ch) + sp <- many spaceChar + return $ (length dashes, length $ dashes ++ sp) + +-- Parse a table header with dashed lines of '-' preceded by +-- one line of text. +simpleTableHeader :: GenParser Char ParserState ([[Char]], [Alignment], [Int]) +simpleTableHeader = try $ do + rawContent <- anyLine + initSp <- nonindentSpaces + dashes <- many1 (dashedLine '-') + newline + let (lengths, lines') = unzip dashes + let indices = scanl (+) (length initSp) lines' + let rawHeads = tail $ splitByIndices (init indices) rawContent + let aligns = zipWith alignType (map (\a -> [a]) rawHeads) lengths + return (rawHeads, aligns, indices) + +-- Parse a table footer - dashed lines followed by blank line. +tableFooter :: GenParser Char ParserState [Char] +tableFooter = try $ nonindentSpaces >> many1 (dashedLine '-') >> blanklines + +-- Parse a table separator - dashed line. +tableSep :: GenParser Char ParserState String +tableSep = try $ nonindentSpaces >> many1 (dashedLine '-') >> string "\n" + +-- Parse a raw line and split it into chunks by indices. +rawTableLine :: [Int] + -> GenParser Char ParserState [String] +rawTableLine indices = do + notFollowedBy' (blanklines <|> tableFooter) + line <- many1Till anyChar newline + return $ map removeLeadingTrailingSpace $ tail $ + splitByIndices (init indices) line + +-- Parse a table line and return a list of lists of blocks (columns). +tableLine :: [Int] + -> GenParser Char ParserState [[Block]] +tableLine indices = rawTableLine indices >>= mapM (parseFromString (many plain)) + +-- Parse a multiline table row and return a list of blocks (columns). +multilineRow :: [Int] + -> GenParser Char ParserState [[Block]] +multilineRow indices = do + colLines <- many1 (rawTableLine indices) + optional blanklines + let cols = map unlines $ transpose colLines + mapM (parseFromString (many plain)) cols + +-- Calculate relative widths of table columns, based on indices +widthsFromIndices :: Int -- Number of columns on terminal + -> [Int] -- Indices + -> [Double] -- Fractional relative sizes of columns +widthsFromIndices _ [] = [] +widthsFromIndices numColumns indices = + let lengths = zipWith (-) indices (0:indices) + totLength = sum lengths + quotient = if totLength > numColumns + then fromIntegral totLength + else fromIntegral numColumns + fracs = map (\l -> (fromIntegral l) / quotient) lengths in + tail fracs + +-- Parses a table caption: inlines beginning with 'Table:' +-- and followed by blank lines. +tableCaption :: GenParser Char ParserState [Inline] +tableCaption = try $ do + nonindentSpaces + string "Table:" + result <- many1 inline + blanklines + return $ normalizeSpaces result + +-- Parse a table using 'headerParser', 'lineParser', and 'footerParser'. +tableWith :: GenParser Char ParserState ([[Char]], [Alignment], [Int]) + -> ([Int] -> GenParser Char ParserState [[Block]]) + -> GenParser Char ParserState end + -> GenParser Char ParserState Block +tableWith headerParser lineParser footerParser = try $ do + (rawHeads, aligns, indices) <- headerParser + lines' <- many1Till (lineParser indices) footerParser + caption <- option [] tableCaption + heads <- mapM (parseFromString (many plain)) rawHeads + state <- getState + let numColumns = stateColumns state + let widths = widthsFromIndices numColumns indices + return $ Table caption aligns widths heads lines' + +-- Parse a simple table with '---' header and one line per row. +simpleTable :: GenParser Char ParserState Block +simpleTable = tableWith simpleTableHeader tableLine blanklines + +-- Parse a multiline table: starts with row of '-' on top, then header +-- (which may be multiline), then the rows, +-- which may be multiline, separated by blank lines, and +-- ending with a footer (dashed line followed by blank line). +multilineTable :: GenParser Char ParserState Block +multilineTable = tableWith multilineTableHeader multilineRow tableFooter + +multilineTableHeader :: GenParser Char ParserState ([String], [Alignment], [Int]) +multilineTableHeader = try $ do + tableSep + rawContent <- many1 (notFollowedBy' tableSep >> many1Till anyChar newline) + initSp <- nonindentSpaces + dashes <- many1 (dashedLine '-') + newline + let (lengths, lines') = unzip dashes + let indices = scanl (+) (length initSp) lines' + let rawHeadsList = transpose $ map + (\ln -> tail $ splitByIndices (init indices) ln) + rawContent + let rawHeads = map (intercalate " ") rawHeadsList + let aligns = zipWith alignType rawHeadsList lengths + return ((map removeLeadingTrailingSpace rawHeads), aligns, indices) + +-- Returns an alignment type for a table, based on a list of strings +-- (the rows of the column header) and a number (the length of the +-- dashed line under the rows. +alignType :: [String] + -> Int + -> Alignment +alignType [] _ = AlignDefault +alignType strLst len = + let s = head $ sortBy (comparing length) $ + map removeTrailingSpace strLst + leftSpace = if null s then False else (s !! 0) `elem` " \t" + rightSpace = length s < len || (s !! (len - 1)) `elem` " \t" + in case (leftSpace, rightSpace) of + (True, False) -> AlignRight + (False, True) -> AlignLeft + (True, True) -> AlignCenter + (False, False) -> AlignDefault + +table :: GenParser Char ParserState Block +table = simpleTable <|> multilineTable "table" + +-- +-- inline +-- + +inline :: GenParser Char ParserState Inline +inline = choice inlineParsers "inline" + +inlineParsers :: [GenParser Char ParserState Inline] +inlineParsers = [ abbrev + , str + , smartPunctuation + , whitespace + , endline + , code + , charRef + , strong + , emph + , note + , inlineNote + , link +#ifdef _CITEPROC + , inlineCitation +#endif + , image + , math + , strikeout + , superscript + , subscript + , autoLink + , rawHtmlInline' + , rawLaTeXInline' + , escapedChar + , symbol + , ltSign ] + +inlineNonLink :: GenParser Char ParserState Inline +inlineNonLink = (choice $ + map (\parser -> try (parser >>= failIfLink)) inlineParsers) + "inline (non-link)" + +failIfLink :: Inline -> GenParser tok st Inline +failIfLink (Link _ _) = pzero +failIfLink elt = return elt + +escapedChar :: GenParser Char ParserState Inline +escapedChar = do + char '\\' + state <- getState + result <- option '\\' $ if stateStrict state + then oneOf "\\`*_{}[]()>#+-.!~" + else satisfy (not . isAlphaNum) + let result' = if result == ' ' + then '\160' -- '\ ' is a nonbreaking space + else result + return $ Str [result'] + +ltSign :: GenParser Char ParserState Inline +ltSign = do + st <- getState + if stateStrict st + then char '<' + else notFollowedBy' rawHtmlBlocks >> char '<' -- unless it starts html + return $ Str ['<'] + +specialCharsMinusLt :: [Char] +specialCharsMinusLt = filter (/= '<') specialChars + +symbol :: GenParser Char ParserState Inline +symbol = do + result <- oneOf specialCharsMinusLt + return $ Str [result] + +-- parses inline code, between n `s and n `s +code :: GenParser Char ParserState Inline +code = try $ do + starts <- many1 (char '`') + skipSpaces + result <- many1Till (many1 (noneOf "`\n") <|> many1 (char '`') <|> + (char '\n' >> return " ")) + (try (skipSpaces >> count (length starts) (char '`') >> + notFollowedBy (char '`'))) + return $ Code $ removeLeadingTrailingSpace $ concat result + +mathWord :: GenParser Char st [Char] +mathWord = many1 ((noneOf " \t\n\\$") <|> + (try (char '\\') >>~ notFollowedBy (char '$'))) + +math :: GenParser Char ParserState Inline +math = (mathDisplay >>= return . Math DisplayMath) + <|> (mathInline >>= return . Math InlineMath) + +mathDisplay :: GenParser Char ParserState String +mathDisplay = try $ do + failIfStrict + string "$$" + many1Till (noneOf "\n" <|> (newline >>~ notFollowedBy' blankline)) (try $ string "$$") + +mathInline :: GenParser Char ParserState String +mathInline = try $ do + failIfStrict + char '$' + notFollowedBy space + words' <- sepBy1 mathWord (many1 (spaceChar <|> (newline >>~ notFollowedBy' blankline))) + char '$' + notFollowedBy digit + return $ intercalate " " words' + +emph :: GenParser Char ParserState Inline +emph = ((enclosed (char '*') (notFollowedBy' strong >> char '*') inline) <|> + (enclosed (char '_') (notFollowedBy' strong >> char '_' >> + notFollowedBy alphaNum) inline)) >>= + return . Emph . normalizeSpaces + +strong :: GenParser Char ParserState Inline +strong = ((enclosed (string "**") (try $ string "**") inline) <|> + (enclosed (string "__") (try $ string "__") inline)) >>= + return . Strong . normalizeSpaces + +strikeout :: GenParser Char ParserState Inline +strikeout = failIfStrict >> enclosed (string "~~") (try $ string "~~") inline >>= + return . Strikeout . normalizeSpaces + +superscript :: GenParser Char ParserState Inline +superscript = failIfStrict >> enclosed (char '^') (char '^') + (notFollowedBy' whitespace >> inline) >>= -- may not contain Space + return . Superscript + +subscript :: GenParser Char ParserState Inline +subscript = failIfStrict >> enclosed (char '~') (char '~') + (notFollowedBy' whitespace >> inline) >>= -- may not contain Space + return . Subscript + +abbrev :: GenParser Char ParserState Inline +abbrev = failUnlessSmart >> + (assumedAbbrev <|> knownAbbrev) >>= return . Str . (++ ".\160") + +-- an string of letters followed by a period that does not end a sentence +-- is assumed to be an abbreviation. It is assumed that sentences don't +-- start with lowercase letters or numerals. +assumedAbbrev :: GenParser Char ParserState [Char] +assumedAbbrev = try $ do + result <- many1 $ satisfy isAlpha + string ". " + lookAhead $ satisfy (\x -> isLower x || isDigit x) + return result + +-- these strings are treated as abbreviations even if they are followed +-- by a capital letter (such as a name). +knownAbbrev :: GenParser Char ParserState [Char] +knownAbbrev = try $ do + result <- oneOfStrings [ "Mr", "Mrs", "Ms", "Capt", "Dr", "Prof", "Gen", + "Gov", "e.g", "i.e", "Sgt", "St", "vol", "vs", + "Sen", "Rep", "Pres", "Hon", "Rev" ] + string ". " + return result + +smartPunctuation :: GenParser Char ParserState Inline +smartPunctuation = failUnlessSmart >> + choice [ quoted, apostrophe, dash, ellipses ] + +apostrophe :: GenParser Char ParserState Inline +apostrophe = (char '\'' <|> char '\8217') >> return Apostrophe + +quoted :: GenParser Char ParserState Inline +quoted = doubleQuoted <|> singleQuoted + +withQuoteContext :: QuoteContext + -> (GenParser Char ParserState Inline) + -> GenParser Char ParserState Inline +withQuoteContext context parser = do + oldState <- getState + let oldQuoteContext = stateQuoteContext oldState + setState oldState { stateQuoteContext = context } + result <- parser + newState <- getState + setState newState { stateQuoteContext = oldQuoteContext } + return result + +singleQuoted :: GenParser Char ParserState Inline +singleQuoted = try $ do + singleQuoteStart + withQuoteContext InSingleQuote $ many1Till inline singleQuoteEnd >>= + return . Quoted SingleQuote . normalizeSpaces + +doubleQuoted :: GenParser Char ParserState Inline +doubleQuoted = try $ do + doubleQuoteStart + withQuoteContext InDoubleQuote $ many1Till inline doubleQuoteEnd >>= + return . Quoted DoubleQuote . normalizeSpaces + +failIfInQuoteContext :: QuoteContext -> GenParser tok ParserState () +failIfInQuoteContext context = do + st <- getState + if stateQuoteContext st == context + then fail "already inside quotes" + else return () + +singleQuoteStart :: GenParser Char ParserState Char +singleQuoteStart = do + failIfInQuoteContext InSingleQuote + char '\8216' <|> + (try $ do char '\'' + notFollowedBy (oneOf ")!],.;:-? \t\n") + notFollowedBy (try (oneOfStrings ["s","t","m","ve","ll","re"] >> + satisfy (not . isAlphaNum))) + -- possess/contraction + return '\'') + +singleQuoteEnd :: GenParser Char st Char +singleQuoteEnd = try $ do + char '\8217' <|> char '\'' + notFollowedBy alphaNum + return '\'' + +doubleQuoteStart :: GenParser Char ParserState Char +doubleQuoteStart = do + failIfInQuoteContext InDoubleQuote + char '\8220' <|> + (try $ do char '"' + notFollowedBy (oneOf " \t\n") + return '"') + +doubleQuoteEnd :: GenParser Char st Char +doubleQuoteEnd = char '\8221' <|> char '"' + +ellipses :: GenParser Char st Inline +ellipses = oneOfStrings ["...", " . . . ", ". . .", " . . ."] >> return Ellipses + +dash :: GenParser Char st Inline +dash = enDash <|> emDash + +enDash :: GenParser Char st Inline +enDash = try $ char '-' >> notFollowedBy (noneOf "0123456789") >> return EnDash + +emDash :: GenParser Char st Inline +emDash = oneOfStrings ["---", "--"] >> return EmDash + +whitespace :: GenParser Char ParserState Inline +whitespace = do + sps <- many1 (oneOf spaceChars) + if length sps >= 2 + then option Space (endline >> return LineBreak) + else return Space "whitespace" + +nonEndline :: GenParser Char st Char +nonEndline = satisfy (/='\n') + +strChar :: GenParser Char st Char +strChar = noneOf (specialChars ++ spaceChars ++ "\n") + +str :: GenParser Char st Inline +str = many1 strChar >>= return . Str + +-- an endline character that can be treated as a space, not a structural break +endline :: GenParser Char ParserState Inline +endline = try $ do + newline + notFollowedBy blankline + st <- getState + if stateStrict st + then do notFollowedBy emailBlockQuoteStart + notFollowedBy (char '#') -- atx header + else return () + -- parse potential list-starts differently if in a list: + if stateParserContext st == ListItemState + then notFollowedBy' (bulletListStart <|> + (anyOrderedListStart >> return ())) + else return () + return Space + +-- +-- links +-- + +-- a reference label for a link +reference :: GenParser Char ParserState [Inline] +reference = do notFollowedBy' (string "[^") -- footnote reference + result <- inlinesInBalancedBrackets inlineNonLink + return $ normalizeSpaces result + +-- source for a link, with optional title +source :: GenParser Char st (String, [Char]) +source = + (try $ charsInBalanced '(' ')' >>= parseFromString source') <|> + -- the following is needed for cases like: [ref](/url(a). + (enclosed (char '(') (char ')') anyChar >>= + parseFromString source') + +-- auxiliary function for source +source' :: GenParser Char st (String, [Char]) +source' = do + skipSpaces + let sourceURL excludes = many $ + optional (char '\\') >> (noneOf (' ':excludes) <|> (notFollowedBy' linkTitle >> char ' ')) + src <- try (char '<' >> sourceURL ">\t\n" >>~ char '>') <|> sourceURL "\t\n" + tit <- option "" linkTitle + skipSpaces + eof + return (intercalate "%20" $ words $ removeTrailingSpace src, tit) + +linkTitle :: GenParser Char st String +linkTitle = try $ do + (many1 spaceChar >> option '\n' newline) <|> newline + skipSpaces + delim <- oneOf "'\"" + tit <- manyTill (optional (char '\\') >> anyChar) + (try (char delim >> skipSpaces >> eof)) + return $ decodeCharacterReferences tit + +link :: GenParser Char ParserState Inline +link = try $ do + lab <- reference + src <- source <|> referenceLink lab + sanitize <- getState >>= return . stateSanitizeHTML + if sanitize && unsanitaryURI (fst src) + then fail "Unsanitary URI" + else return $ Link lab src + +-- a link like [this][ref] or [this][] or [this] +referenceLink :: [Inline] + -> GenParser Char ParserState (String, [Char]) +referenceLink lab = do + ref <- option [] (try (optional (char ' ') >> + optional (newline >> skipSpaces) >> reference)) + let ref' = if null ref then lab else ref + state <- getState + case lookupKeySrc (stateKeys state) ref' of + Nothing -> fail "no corresponding key" + Just target -> return target + +autoLink :: GenParser Char ParserState Inline +autoLink = try $ do + char '<' + src <- uri <|> (emailAddress >>= (return . ("mailto:" ++))) + char '>' + let src' = if "mailto:" `isPrefixOf` src + then drop 7 src + else src + st <- getState + let sanitize = stateSanitizeHTML st + if sanitize && unsanitaryURI src + then fail "Unsanitary URI" + else return $ if stateStrict st + then Link [Str src'] (src, "") + else Link [Code src'] (src, "") + +image :: GenParser Char ParserState Inline +image = try $ do + char '!' + (Link lab src) <- link + return $ Image lab src + +note :: GenParser Char ParserState Inline +note = try $ do + failIfStrict + ref <- noteMarker + state <- getState + let notes = stateNotes state + case lookup ref notes of + Nothing -> fail "note not found" + Just contents -> return $ Note contents + +inlineNote :: GenParser Char ParserState Inline +inlineNote = try $ do + failIfStrict + char '^' + contents <- inlinesInBalancedBrackets inline + return $ Note [Para contents] + +rawLaTeXInline' :: GenParser Char ParserState Inline +rawLaTeXInline' = do + failIfStrict + (rawConTeXtEnvironment' >>= return . TeX) + <|> (rawLaTeXEnvironment' >>= return . TeX) + <|> rawLaTeXInline + +rawConTeXtEnvironment' :: GenParser Char st String +rawConTeXtEnvironment' = try $ do + string "\\start" + completion <- inBrackets (letter <|> digit <|> spaceChar) + <|> (many1 letter) + contents <- manyTill (rawConTeXtEnvironment' <|> (count 1 anyChar)) + (try $ string "\\stop" >> string completion) + return $ "\\start" ++ completion ++ concat contents ++ "\\stop" ++ completion + +inBrackets :: (GenParser Char st Char) -> GenParser Char st String +inBrackets parser = do + char '[' + contents <- many parser + char ']' + return $ "[" ++ contents ++ "]" + +rawHtmlInline' :: GenParser Char ParserState Inline +rawHtmlInline' = do + st <- getState + result <- if stateStrict st + then choice [htmlBlockElement, anyHtmlTag, anyHtmlEndTag] + else anyHtmlInlineTag + return $ HtmlInline result + +#ifdef _CITEPROC +inlineCitation :: GenParser Char ParserState Inline +inlineCitation = try $ do + failIfStrict + cit <- citeMarker + let citations = readWith parseCitation defaultParserState cit + mr <- mapM chkCit citations + if catMaybes mr /= [] + then return $ Cite citations [] + else fail "no citation found" + +chkCit :: Target -> GenParser Char ParserState (Maybe Target) +chkCit t = do + st <- getState + case lookupKeySrc (stateKeys st) [Str $ fst t] of + Just _ -> fail "This is a link" + Nothing -> if elem (fst t) $ stateCitations st + then return $ Just t + else return $ Nothing + +citeMarker :: GenParser Char ParserState String +citeMarker = char '[' >> manyTill ( noneOf "\n" <|> (newline >>~ notFollowedBy blankline) ) (char ']') + +parseCitation :: GenParser Char ParserState [(String,String)] +parseCitation = try $ sepBy (parseLabel) (oneOf ";") + +parseLabel :: GenParser Char ParserState (String,String) +parseLabel = try $ do + res <- sepBy (skipSpaces >> optional newline >> skipSpaces >> many1 (noneOf "@;")) (oneOf "@") + case res of + [lab,loc] -> return (lab, loc) + [lab] -> return (lab, "" ) + _ -> return ("" , "" ) + +#endif -- cgit v1.2.3