Diffstat (limited to 'src/Text/Pandoc/Parsing.hs')
| -rw-r--r-- | src/Text/Pandoc/Parsing.hs | 1329 |
1 file changed, 1329 insertions, 0 deletions
diff --git a/src/Text/Pandoc/Parsing.hs b/src/Text/Pandoc/Parsing.hs new file mode 100644 index 000000000..400d07f2a --- /dev/null +++ b/src/Text/Pandoc/Parsing.hs @@ -0,0 +1,1329 @@ +{-# LANGUAGE + FlexibleContexts +, GeneralizedNewtypeDeriving +, TypeSynonymInstances +, MultiParamTypeClasses +, FlexibleInstances +, IncoherentInstances #-} + +{- +Copyright (C) 2006-2016 John MacFarlane <jgm@berkeley.edu> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Parsing + Copyright : Copyright (C) 2006-2016 John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : John MacFarlane <jgm@berkeley.edu> + Stability : alpha + Portability : portable + +A utility library with parsers used in pandoc readers. +-} +module Text.Pandoc.Parsing ( anyLine, + many1Till, + notFollowedBy', + oneOfStrings, + oneOfStringsCI, + spaceChar, + nonspaceChar, + skipSpaces, + blankline, + blanklines, + enclosed, + stringAnyCase, + parseFromString, + lineClump, + charsInBalanced, + romanNumeral, + emailAddress, + uri, + mathInline, + mathDisplay, + withHorizDisplacement, + withRaw, + escaped, + characterReference, + anyOrderedListMarker, + orderedListMarker, + charRef, + lineBlockLines, + tableWith, + widthsFromIndices, + gridTableWith, + readWith, + readWithM, + testStringWith, + guardEnabled, + guardDisabled, + updateLastStrPos, + notAfterString, + logMessage, + reportLogMessages, + ParserState (..), + HasReaderOptions (..), + HasHeaderMap (..), + HasIdentifierList (..), + HasMacros (..), + HasLogMessages (..), + HasLastStrPosition (..), + defaultParserState, + HeaderType (..), + ParserContext (..), + QuoteContext (..), + HasQuoteContext (..), + NoteTable, + NoteTable', + KeyTable, + SubstTable, + Key (..), + toKey, + registerHeader, + smartPunctuation, + singleQuoteStart, + singleQuoteEnd, + doubleQuoteStart, + doubleQuoteEnd, + ellipses, + apostrophe, + dash, + nested, + citeKey, + macro, + applyMacros', + Parser, + ParserT, + F(..), + runF, + askF, + asksF, + token, + (<+?>), + extractIdClass, + insertIncludedFile, + -- * Re-exports from Text.Pandoc.Parsec + Stream, + runParser, + runParserT, + parse, + anyToken, + getInput, + setInput, + unexpected, + char, + letter, + digit, + alphaNum, + skipMany, + skipMany1, + spaces, + space, + anyChar, + satisfy, + newline, + string, + count, + eof, + noneOf, + oneOf, + lookAhead, + notFollowedBy, + many, + many1, + manyTill, + (<|>), + (<?>), + choice, + try, + sepBy, + sepBy1, + sepEndBy, + sepEndBy1, + endBy, + endBy1, + option, + optional, + optionMaybe, + getState, + setState, + updateState, + SourcePos, + getPosition, + setPosition, + sourceColumn, + sourceLine, + setSourceColumn, + setSourceLine, + newPos + ) +where + +import Text.Pandoc.Definition +import Text.Pandoc.Options +import Text.Pandoc.Builder (Blocks, Inlines, rawBlock, HasMeta(..)) +import qualified Text.Pandoc.Builder as B +import 
Text.Pandoc.XML (fromEntities) +import qualified Text.Pandoc.UTF8 as UTF8 (putStrLn) +import Text.Parsec hiding (token) +import Text.Parsec.Pos (newPos) +import Data.Char ( toLower, toUpper, ord, chr, isAscii, isAlphaNum, + isHexDigit, isSpace, isPunctuation ) +import Data.List ( intercalate, transpose, isSuffixOf ) +import Text.Pandoc.Shared +import qualified Data.Map as M +import Text.TeXMath.Readers.TeX.Macros (applyMacros, Macro, + parseMacroDefinitions) +import Text.HTML.TagSoup.Entity ( lookupEntity ) +import Text.Pandoc.Asciify (toAsciiChar) +import Data.Monoid ((<>)) +import Text.Pandoc.Class (PandocMonad, readFileFromDirs, report) +import Text.Pandoc.Logging +import Data.Default +import qualified Data.Set as Set +import Control.Monad.Reader +import Control.Monad.Identity +import Data.Maybe (catMaybes) + +import Text.Pandoc.Error +import Control.Monad.Except + +type Parser t s = Parsec t s + +type ParserT = ParsecT + +newtype F a = F { unF :: Reader ParserState a } deriving (Monad, Applicative, Functor) + +runF :: F a -> ParserState -> a +runF = runReader . unF + +askF :: F ParserState +askF = F ask + +asksF :: (ParserState -> a) -> F a +asksF f = F $ asks f + +instance Monoid a => Monoid (F a) where + mempty = return mempty + mappend = liftM2 mappend + mconcat = liftM mconcat . sequence + +-- | Parse any line of text +anyLine :: Stream [Char] m Char => ParserT [Char] st m [Char] +anyLine = do + -- This is much faster than: + -- manyTill anyChar newline + inp <- getInput + pos <- getPosition + case break (=='\n') inp of + (this, '\n':rest) -> do + -- needed to persuade parsec that this won't match an empty string: + anyChar + setInput rest + setPosition $ incSourceLine (setSourceColumn pos 1) 1 + return this + _ -> mzero + +-- | Like @manyTill@, but reads at least one item. +many1Till :: Stream s m t + => ParserT s st m a + -> ParserT s st m end + -> ParserT s st m [a] +many1Till p end = do + first <- p + rest <- manyTill p end + return (first:rest) + +-- | A more general form of @notFollowedBy@. This one allows any +-- type of parser to be specified, and succeeds only if that parser fails. +-- It does not consume any input. +notFollowedBy' :: (Show b, Stream s m a) => ParserT s st m b -> ParserT s st m () +notFollowedBy' p = try $ join $ do a <- try p + return (unexpected (show a)) + <|> + return (return ()) +-- (This version due to Andrew Pimlott on the Haskell mailing list.) + +oneOfStrings' :: Stream s m Char => (Char -> Char -> Bool) -> [String] -> ParserT s st m String +oneOfStrings' _ [] = fail "no strings" +oneOfStrings' matches strs = try $ do + c <- anyChar + let strs' = [xs | (x:xs) <- strs, x `matches` c] + case strs' of + [] -> fail "not found" + _ -> (c:) <$> oneOfStrings' matches strs' + <|> if "" `elem` strs' + then return [c] + else fail "not found" + +-- | Parses one of a list of strings. If the list contains +-- two strings one of which is a prefix of the other, the longer +-- string will be matched if possible. +oneOfStrings :: Stream s m Char => [String] -> ParserT s st m String +oneOfStrings = oneOfStrings' (==) + +-- | Parses one of a list of strings (tried in order), case insensitive. 
+oneOfStringsCI :: Stream s m Char => [String] -> ParserT s st m String +oneOfStringsCI = oneOfStrings' ciMatch + where ciMatch x y = toLower' x == toLower' y + -- this optimizes toLower by checking common ASCII case + -- first, before calling the expensive unicode-aware + -- function: + toLower' c | c >= 'A' && c <= 'Z' = chr (ord c + 32) + | isAscii c = c + | otherwise = toLower c + +-- | Parses a space or tab. +spaceChar :: Stream s m Char => ParserT s st m Char +spaceChar = satisfy $ \c -> c == ' ' || c == '\t' + +-- | Parses a nonspace, nonnewline character. +nonspaceChar :: Stream s m Char => ParserT s st m Char +nonspaceChar = satisfy $ flip notElem ['\t', '\n', ' ', '\r'] + +-- | Skips zero or more spaces or tabs. +skipSpaces :: Stream s m Char => ParserT s st m () +skipSpaces = skipMany spaceChar + +-- | Skips zero or more spaces or tabs, then reads a newline. +blankline :: Stream s m Char => ParserT s st m Char +blankline = try $ skipSpaces >> newline + +-- | Parses one or more blank lines and returns a string of newlines. +blanklines :: Stream s m Char => ParserT s st m [Char] +blanklines = many1 blankline + +-- | Parses material enclosed between start and end parsers. +enclosed :: Stream s m Char => ParserT s st m t -- ^ start parser + -> ParserT s st m end -- ^ end parser + -> ParserT s st m a -- ^ content parser (to be used repeatedly) + -> ParserT s st m [a] +enclosed start end parser = try $ + start >> notFollowedBy space >> many1Till parser end + +-- | Parse string, case insensitive. +stringAnyCase :: Stream s m Char => [Char] -> ParserT s st m String +stringAnyCase [] = string "" +stringAnyCase (x:xs) = do + firstChar <- char (toUpper x) <|> char (toLower x) + rest <- stringAnyCase xs + return (firstChar:rest) + +-- | Parse contents of 'str' using 'parser' and return result. +parseFromString :: Monad m => ParserT String st m a -> String -> ParserT String st m a +parseFromString parser str = do + oldPos <- getPosition + oldInput <- getInput + setInput str + result <- parser + spaces + eof + setInput oldInput + setPosition oldPos + return result + +-- | Parse raw line block up to and including blank lines. +lineClump :: Stream [Char] m Char => ParserT [Char] st m String +lineClump = blanklines + <|> (many1 (notFollowedBy blankline >> anyLine) >>= return . unlines) + +-- | Parse a string of characters between an open character +-- and a close character, including text between balanced +-- pairs of open and close, which must be different. For example, +-- @charsInBalanced '(' ')' anyChar@ will parse "(hello (there))" +-- and return "hello (there)". +charsInBalanced :: Stream s m Char => Char -> Char -> ParserT s st m Char + -> ParserT s st m String +charsInBalanced open close parser = try $ do + char open + let isDelim c = c == open || c == close + raw <- many $ many1 (notFollowedBy (satisfy isDelim) >> parser) + <|> (do res <- charsInBalanced open close parser + return $ [open] ++ res ++ [close]) + char close + return $ concat raw + +-- old charsInBalanced would be: +-- charsInBalanced open close (noneOf "\n" <|> char '\n' >> notFollowedBy blankline) +-- old charsInBalanced' would be: +-- charsInBalanced open close anyChar + +-- Auxiliary functions for romanNumeral: + +lowercaseRomanDigits :: [Char] +lowercaseRomanDigits = ['i','v','x','l','c','d','m'] + +uppercaseRomanDigits :: [Char] +uppercaseRomanDigits = map toUpper lowercaseRomanDigits + +-- | Parses a roman numeral (uppercase or lowercase), returns number. 
+romanNumeral :: Stream s m Char => Bool -- ^ Uppercase if true + -> ParserT s st m Int +romanNumeral upperCase = do + let romanDigits = if upperCase + then uppercaseRomanDigits + else lowercaseRomanDigits + lookAhead $ oneOf romanDigits + let [one, five, ten, fifty, hundred, fivehundred, thousand] = + map char romanDigits + thousands <- many thousand >>= (return . (1000 *) . length) + ninehundreds <- option 0 $ try $ hundred >> thousand >> return 900 + fivehundreds <- many fivehundred >>= (return . (500 *) . length) + fourhundreds <- option 0 $ try $ hundred >> fivehundred >> return 400 + hundreds <- many hundred >>= (return . (100 *) . length) + nineties <- option 0 $ try $ ten >> hundred >> return 90 + fifties <- many fifty >>= (return . (50 *) . length) + forties <- option 0 $ try $ ten >> fifty >> return 40 + tens <- many ten >>= (return . (10 *) . length) + nines <- option 0 $ try $ one >> ten >> return 9 + fives <- many five >>= (return . (5 *) . length) + fours <- option 0 $ try $ one >> five >> return 4 + ones <- many one >>= (return . length) + let total = thousands + ninehundreds + fivehundreds + fourhundreds + + hundreds + nineties + fifties + forties + tens + nines + + fives + fours + ones + if total == 0 + then fail "not a roman numeral" + else return total + +-- Parsers for email addresses and URIs + +-- | Parses an email address; returns original and corresponding +-- escaped mailto: URI. +emailAddress :: Stream s m Char => ParserT s st m (String, String) +emailAddress = try $ toResult <$> mailbox <*> (char '@' *> domain) + where toResult mbox dom = let full = fromEntities $ mbox ++ '@':dom + in (full, escapeURI $ "mailto:" ++ full) + mailbox = intercalate "." <$> (emailWord `sepby1` dot) + domain = intercalate "." <$> (subdomain `sepby1` dot) + dot = char '.' + subdomain = many1 $ alphaNum <|> innerPunct + -- this excludes some valid email addresses, since an + -- email could contain e.g. '__', but gives better results + -- for our purposes, when combined with markdown parsing: + innerPunct = try (satisfy (\c -> isEmailPunct c || c == '@') + <* notFollowedBy space + <* notFollowedBy (satisfy isPunctuation)) + -- technically an email address could begin with a symbol, + -- but allowing this creates too many problems. + -- See e.g. https://github.com/jgm/pandoc/issues/2940 + emailWord = do x <- satisfy isAlphaNum + xs <- many (satisfy isEmailChar) + return (x:xs) + isEmailChar c = isAlphaNum c || isEmailPunct c + isEmailPunct c = c `elem` "!\"#$%&'*+-/=?^_{|}~;" + -- note: sepBy1 from parsec consumes input when sep + -- succeeds and p fails, so we use this variant here. 
+ sepby1 p sep = (:) <$> p <*> (many (try $ sep >> p)) + + +-- Schemes from http://www.iana.org/assignments/uri-schemes.html plus +-- the unofficial schemes coap, doi, javascript, isbn, pmid +schemes :: [String] +schemes = ["coap","doi","javascript","aaa","aaas","about","acap","cap","cid", + "crid","data","dav","dict","dns","file","ftp","geo","go","gopher", + "h323","http","https","iax","icap","im","imap","info","ipp","iris", + "iris.beep","iris.xpc","iris.xpcs","iris.lwz","ldap","mailto","mid", + "msrp","msrps","mtqp","mupdate","news","nfs","ni","nih","nntp", + "opaquelocktoken","pop","pres","rtsp","service","session","shttp","sieve", + "sip","sips","sms","snmp","soap.beep","soap.beeps","tag","tel","telnet", + "tftp","thismessage","tn3270","tip","tv","urn","vemmi","ws","wss","xcon", + "xcon-userid","xmlrpc.beep","xmlrpc.beeps","xmpp","z39.50r","z39.50s", + "adiumxtra","afp","afs","aim","apt","attachment","aw","beshare","bitcoin", + "bolo","callto","chrome","chrome-extension","com-eventbrite-attendee", + "content", "cvs","dlna-playsingle","dlna-playcontainer","dtn","dvb", + "ed2k","facetime","feed","finger","fish","gg","git","gizmoproject", + "gtalk","hcp","icon","ipn","irc","irc6","ircs","itms","jar","jms", + "keyparc","lastfm","ldaps","magnet","maps","market","message","mms", + "ms-help","msnim","mumble","mvn","notes","oid","palm","paparazzi", + "platform","proxy","psyc","query","res","resource","rmi","rsync", + "rtmp","secondlife","sftp","sgn","skype","smb","soldat","spotify", + "ssh","steam","svn","teamspeak","things","udp","unreal","ut2004", + "ventrilo","view-source","webcal","wtai","wyciwyg","xfire","xri", + "ymsgr", "isbn", "pmid"] + +uriScheme :: Stream s m Char => ParserT s st m String +uriScheme = oneOfStringsCI schemes + +-- | Parses a URI. Returns pair of original and URI-escaped version. +uri :: Stream [Char] m Char => ParserT [Char] st m (String, String) +uri = try $ do + scheme <- uriScheme + char ':' + -- We allow sentence punctuation except at the end, since + -- we don't want the trailing '.' in 'http://google.com.' We want to allow + -- http://en.wikipedia.org/wiki/State_of_emergency_(disambiguation) + -- as a URL, while NOT picking up the closing paren in + -- (http://wikipedia.org). So we include balanced parens in the URL. + let isWordChar c = isAlphaNum c || c `elem` "#$%*+/@\\_-" + let wordChar = satisfy isWordChar + let percentEscaped = try $ char '%' >> skipMany1 (satisfy isHexDigit) + let entity = () <$ characterReference + let punct = skipMany1 (char ',') + <|> () <$ (satisfy (\c -> not (isSpace c) && c /= '<' && c /= '>')) + let uriChunk = skipMany1 wordChar + <|> percentEscaped + <|> entity + <|> (try $ punct >> + lookAhead (void (satisfy isWordChar) <|> percentEscaped)) + str <- snd <$> withRaw (skipMany1 ( () <$ + (enclosed (char '(') (char ')') uriChunk + <|> enclosed (char '{') (char '}') uriChunk + <|> enclosed (char '[') (char ']') uriChunk) + <|> uriChunk)) + str' <- option str $ char '/' >> return (str ++ "/") + let uri' = scheme ++ ":" ++ fromEntities str' + return (uri', escapeURI uri') + +mathInlineWith :: Stream s m Char => String -> String -> ParserT s st m String +mathInlineWith op cl = try $ do + string op + notFollowedBy space + words' <- many1Till (count 1 (noneOf " \t\n\\") + <|> (char '\\' >> + -- This next clause is needed because \text{..} can + -- contain $, \(\), etc. 
+ (try (string "text" >> + (("\\text" ++) <$> inBalancedBraces 0 "")) + <|> (\c -> ['\\',c]) <$> anyChar)) + <|> do (blankline <* notFollowedBy' blankline) <|> + (oneOf " \t" <* skipMany (oneOf " \t")) + notFollowedBy (char '$') + return " " + ) (try $ string cl) + notFollowedBy digit -- to prevent capture of $5 + return $ concat words' + where + inBalancedBraces :: Stream s m Char => Int -> String -> ParserT s st m String + inBalancedBraces 0 "" = do + c <- anyChar + if c == '{' + then inBalancedBraces 1 "{" + else mzero + inBalancedBraces 0 s = return $ reverse s + inBalancedBraces numOpen ('\\':xs) = do + c <- anyChar + inBalancedBraces numOpen (c:'\\':xs) + inBalancedBraces numOpen xs = do + c <- anyChar + case c of + '}' -> inBalancedBraces (numOpen - 1) (c:xs) + '{' -> inBalancedBraces (numOpen + 1) (c:xs) + _ -> inBalancedBraces numOpen (c:xs) + +mathDisplayWith :: Stream s m Char => String -> String -> ParserT s st m String +mathDisplayWith op cl = try $ do + string op + many1Till (noneOf "\n" <|> (newline <* notFollowedBy' blankline)) (try $ string cl) + +mathDisplay :: (HasReaderOptions st, Stream s m Char) + => ParserT s st m String +mathDisplay = + (guardEnabled Ext_tex_math_dollars >> mathDisplayWith "$$" "$$") + <|> (guardEnabled Ext_tex_math_single_backslash >> + mathDisplayWith "\\[" "\\]") + <|> (guardEnabled Ext_tex_math_double_backslash >> + mathDisplayWith "\\\\[" "\\\\]") + +mathInline :: (HasReaderOptions st , Stream s m Char) + => ParserT s st m String +mathInline = + (guardEnabled Ext_tex_math_dollars >> mathInlineWith "$" "$") + <|> (guardEnabled Ext_tex_math_single_backslash >> + mathInlineWith "\\(" "\\)") + <|> (guardEnabled Ext_tex_math_double_backslash >> + mathInlineWith "\\\\(" "\\\\)") + +-- | Applies a parser, returns tuple of its results and its horizontal +-- displacement (the difference between the source column at the end +-- and the source column at the beginning). Vertical displacement +-- (source row) is ignored. +withHorizDisplacement :: Stream s m Char + => ParserT s st m a -- ^ Parser to apply + -> ParserT s st m (a, Int) -- ^ (result, displacement) +withHorizDisplacement parser = do + pos1 <- getPosition + result <- parser + pos2 <- getPosition + return (result, sourceColumn pos2 - sourceColumn pos1) + +-- | Applies a parser and returns the raw string that was parsed, +-- along with the value produced by the parser. +withRaw :: Stream [Char] m Char => ParsecT [Char] st m a -> ParsecT [Char] st m (a, [Char]) +withRaw parser = do + pos1 <- getPosition + inp <- getInput + result <- parser + pos2 <- getPosition + let (l1,c1) = (sourceLine pos1, sourceColumn pos1) + let (l2,c2) = (sourceLine pos2, sourceColumn pos2) + let inplines = take ((l2 - l1) + 1) $ lines inp + let raw = case inplines of + [] -> "" + [l] -> take (c2 - c1) l + ls -> unlines (init ls) ++ take (c2 - 1) (last ls) + return (result, raw) + +-- | Parses backslash, then applies character parser. +escaped :: Stream s m Char + => ParserT s st m Char -- ^ Parser for character to escape + -> ParserT s st m Char +escaped parser = try $ char '\\' >> parser + +-- | Parse character entity. 
+characterReference :: Stream s m Char => ParserT s st m Char +characterReference = try $ do + char '&' + ent <- many1Till nonspaceChar (char ';') + let ent' = case ent of + '#':'X':xs -> '#':'x':xs -- workaround tagsoup bug + '#':_ -> ent + _ -> ent ++ ";" + case lookupEntity ent' of + Just (c : _) -> return c + _ -> fail "entity not found" + +-- | Parses an uppercase roman numeral and returns (UpperRoman, number). +upperRoman :: Stream s m Char => ParserT s st m (ListNumberStyle, Int) +upperRoman = do + num <- romanNumeral True + return (UpperRoman, num) + +-- | Parses a lowercase roman numeral and returns (LowerRoman, number). +lowerRoman :: Stream s m Char => ParserT s st m (ListNumberStyle, Int) +lowerRoman = do + num <- romanNumeral False + return (LowerRoman, num) + +-- | Parses a decimal numeral and returns (Decimal, number). +decimal :: Stream s m Char => ParserT s st m (ListNumberStyle, Int) +decimal = do + num <- many1 digit + return (Decimal, read num) + +-- | Parses a '@' and optional label and +-- returns (DefaultStyle, [next example number]). The next +-- example number is incremented in parser state, and the label +-- (if present) is added to the label table. +exampleNum :: Stream s m Char + => ParserT s ParserState m (ListNumberStyle, Int) +exampleNum = do + char '@' + lab <- many (alphaNum <|> satisfy (\c -> c == '_' || c == '-')) + st <- getState + let num = stateNextExample st + let newlabels = if null lab + then stateExamples st + else M.insert lab num $ stateExamples st + updateState $ \s -> s{ stateNextExample = num + 1 + , stateExamples = newlabels } + return (Example, num) + +-- | Parses a '#' returns (DefaultStyle, 1). +defaultNum :: Stream s m Char => ParserT s st m (ListNumberStyle, Int) +defaultNum = do + char '#' + return (DefaultStyle, 1) + +-- | Parses a lowercase letter and returns (LowerAlpha, number). +lowerAlpha :: Stream s m Char => ParserT s st m (ListNumberStyle, Int) +lowerAlpha = do + ch <- oneOf ['a'..'z'] + return (LowerAlpha, ord ch - ord 'a' + 1) + +-- | Parses an uppercase letter and returns (UpperAlpha, number). +upperAlpha :: Stream s m Char => ParserT s st m (ListNumberStyle, Int) +upperAlpha = do + ch <- oneOf ['A'..'Z'] + return (UpperAlpha, ord ch - ord 'A' + 1) + +-- | Parses a roman numeral i or I +romanOne :: Stream s m Char => ParserT s st m (ListNumberStyle, Int) +romanOne = (char 'i' >> return (LowerRoman, 1)) <|> + (char 'I' >> return (UpperRoman, 1)) + +-- | Parses an ordered list marker and returns list attributes. +anyOrderedListMarker :: Stream s m Char => ParserT s ParserState m ListAttributes +anyOrderedListMarker = choice $ + [delimParser numParser | delimParser <- [inPeriod, inOneParen, inTwoParens], + numParser <- [decimal, exampleNum, defaultNum, romanOne, + lowerAlpha, lowerRoman, upperAlpha, upperRoman]] + +-- | Parses a list number (num) followed by a period, returns list attributes. +inPeriod :: Stream s m Char + => ParserT s st m (ListNumberStyle, Int) + -> ParserT s st m ListAttributes +inPeriod num = try $ do + (style, start) <- num + char '.' + let delim = if style == DefaultStyle + then DefaultDelim + else Period + return (start, style, delim) + +-- | Parses a list number (num) followed by a paren, returns list attributes. +inOneParen :: Stream s m Char + => ParserT s st m (ListNumberStyle, Int) + -> ParserT s st m ListAttributes +inOneParen num = try $ do + (style, start) <- num + char ')' + return (start, style, OneParen) + +-- | Parses a list number (num) enclosed in parens, returns list attributes. 
+inTwoParens :: Stream s m Char + => ParserT s st m (ListNumberStyle, Int) + -> ParserT s st m ListAttributes +inTwoParens num = try $ do + char '(' + (style, start) <- num + char ')' + return (start, style, TwoParens) + +-- | Parses an ordered list marker with a given style and delimiter, +-- returns number. +orderedListMarker :: Stream s m Char + => ListNumberStyle + -> ListNumberDelim + -> ParserT s ParserState m Int +orderedListMarker style delim = do + let num = defaultNum <|> -- # can continue any kind of list + case style of + DefaultStyle -> decimal + Example -> exampleNum + Decimal -> decimal + UpperRoman -> upperRoman + LowerRoman -> lowerRoman + UpperAlpha -> upperAlpha + LowerAlpha -> lowerAlpha + let context = case delim of + DefaultDelim -> inPeriod + Period -> inPeriod + OneParen -> inOneParen + TwoParens -> inTwoParens + (start, _, _) <- context num + return start + +-- | Parses a character reference and returns a Str element. +charRef :: Stream s m Char => ParserT s st m Inline +charRef = do + c <- characterReference + return $ Str [c] + +lineBlockLine :: Stream [Char] m Char => ParserT [Char] st m String +lineBlockLine = try $ do + char '|' + char ' ' + white <- many (spaceChar >> return '\160') + notFollowedBy newline + line <- anyLine + continuations <- many (try $ char ' ' >> anyLine) + return $ white ++ unwords (line : continuations) + +blankLineBlockLine :: Stream [Char] m Char => ParserT [Char] st m Char +blankLineBlockLine = try (char '|' >> blankline) + +-- | Parses an RST-style line block and returns a list of strings. +lineBlockLines :: Stream [Char] m Char => ParserT [Char] st m [String] +lineBlockLines = try $ do + lines' <- many1 (lineBlockLine <|> ((:[]) <$> blankLineBlockLine)) + skipMany1 $ blankline <|> blankLineBlockLine + return lines' + +-- | Parse a table using 'headerParser', 'rowParser', +-- 'lineParser', and 'footerParser'. +tableWith :: Stream s m Char + => ParserT s ParserState m ([Blocks], [Alignment], [Int]) + -> ([Int] -> ParserT s ParserState m [Blocks]) + -> ParserT s ParserState m sep + -> ParserT s ParserState m end + -> ParserT s ParserState m Blocks +tableWith headerParser rowParser lineParser footerParser = try $ do + (heads, aligns, indices) <- headerParser + lines' <- rowParser indices `sepEndBy1` lineParser + footerParser + numColumns <- getOption readerColumns + let widths = if (indices == []) + then replicate (length aligns) 0.0 + else widthsFromIndices numColumns indices + return $ B.table mempty (zip aligns widths) heads lines' + +-- Calculate relative widths of table columns, based on indices +widthsFromIndices :: Int -- Number of columns on terminal + -> [Int] -- Indices + -> [Double] -- Fractional relative sizes of columns +widthsFromIndices _ [] = [] +widthsFromIndices numColumns' indices = + let numColumns = max numColumns' (if null indices then 0 else last indices) + lengths' = zipWith (-) indices (0:indices) + lengths = reverse $ + case reverse lengths' of + [] -> [] + [x] -> [x] + -- compensate for the fact that intercolumn + -- spaces are counted in widths of all columns + -- but the last... 
+ (x:y:zs) -> if x < y && y - x <= 2 + then y:y:zs + else x:y:zs + totLength = sum lengths + quotient = if totLength > numColumns + then fromIntegral totLength + else fromIntegral numColumns + fracs = map (\l -> (fromIntegral l) / quotient) lengths in + tail fracs + +--- + +-- Parse a grid table: starts with row of '-' on top, then header +-- (which may be grid), then the rows, +-- which may be grid, separated by blank lines, and +-- ending with a footer (dashed line followed by blank line). +gridTableWith :: Stream [Char] m Char + => ParserT [Char] ParserState m Blocks -- ^ Block list parser + -> Bool -- ^ Headerless table + -> ParserT [Char] ParserState m Blocks +gridTableWith blocks headless = + tableWith (gridTableHeader headless blocks) (gridTableRow blocks) + (gridTableSep '-') gridTableFooter + +gridTableSplitLine :: [Int] -> String -> [String] +gridTableSplitLine indices line = map removeFinalBar $ tail $ + splitStringByIndices (init indices) $ trimr line + +gridPart :: Stream s m Char => Char -> ParserT s st m (Int, Int) +gridPart ch = do + dashes <- many1 (char ch) + char '+' + return (length dashes, length dashes + 1) + +gridDashedLines :: Stream s m Char => Char -> ParserT s st m [(Int,Int)] +gridDashedLines ch = try $ char '+' >> many1 (gridPart ch) <* blankline + +removeFinalBar :: String -> String +removeFinalBar = + reverse . dropWhile (`elem` " \t") . dropWhile (=='|') . reverse + +-- | Separator between rows of grid table. +gridTableSep :: Stream s m Char => Char -> ParserT s ParserState m Char +gridTableSep ch = try $ gridDashedLines ch >> return '\n' + +-- | Parse header for a grid table. +gridTableHeader :: Stream [Char] m Char + => Bool -- ^ Headerless table + -> ParserT [Char] ParserState m Blocks + -> ParserT [Char] ParserState m ([Blocks], [Alignment], [Int]) +gridTableHeader headless blocks = try $ do + optional blanklines + dashes <- gridDashedLines '-' + rawContent <- if headless + then return $ repeat "" + else many1 + (notFollowedBy (gridTableSep '=') >> char '|' >> + many1Till anyChar newline) + if headless + then return () + else gridTableSep '=' >> return () + let lines' = map snd dashes + let indices = scanl (+) 0 lines' + let aligns = replicate (length lines') AlignDefault + -- RST does not have a notion of alignments + let rawHeads = if headless + then replicate (length dashes) "" + else map (intercalate " ") $ transpose + $ map (gridTableSplitLine indices) rawContent + heads <- mapM (parseFromString blocks) $ map trim rawHeads + return (heads, aligns, indices) + +gridTableRawLine :: Stream s m Char => [Int] -> ParserT s ParserState m [String] +gridTableRawLine indices = do + char '|' + line <- many1Till anyChar newline + return (gridTableSplitLine indices line) + +-- | Parse row of grid table. +gridTableRow :: Stream [Char] m Char + => ParserT [Char] ParserState m Blocks + -> [Int] + -> ParserT [Char] ParserState m [Blocks] +gridTableRow blocks indices = do + colLines <- many1 (gridTableRawLine indices) + let cols = map ((++ "\n") . unlines . removeOneLeadingSpace) $ + transpose colLines + mapM (liftM compactifyCell . parseFromString blocks) cols + +removeOneLeadingSpace :: [String] -> [String] +removeOneLeadingSpace xs = + if all startsWithSpace xs + then map (drop 1) xs + else xs + where startsWithSpace "" = True + startsWithSpace (y:_) = y == ' ' + +compactifyCell :: Blocks -> Blocks +compactifyCell bs = head $ compactify [bs] + +-- | Parse footer for a grid table. 
+gridTableFooter :: Stream s m Char => ParserT s ParserState m [Char] +gridTableFooter = blanklines + +--- + +-- | Removes the ParsecT layer from the monad transformer stack +readWithM :: (Monad m) + => ParserT [Char] st m a -- ^ parser + -> st -- ^ initial state + -> String -- ^ input + -> m (Either PandocError a) +readWithM parser state input = + mapLeft (PandocParsecError input) `liftM` runParserT parser state "source" input + + +-- | Parse a string with a given parser and state +readWith :: Parser [Char] st a + -> st + -> String + -> Either PandocError a +readWith p t inp = runIdentity $ readWithM p t inp + +-- | Parse a string with @parser@ (for testing). +testStringWith :: (Show a) + => ParserT [Char] ParserState Identity a + -> [Char] + -> IO () +testStringWith parser str = UTF8.putStrLn $ show $ + readWith parser defaultParserState str + +-- | Parsing options. +data ParserState = ParserState + { stateOptions :: ReaderOptions, -- ^ User options + stateParserContext :: ParserContext, -- ^ Inside list? + stateQuoteContext :: QuoteContext, -- ^ Inside quoted environment? + stateAllowLinks :: Bool, -- ^ Allow parsing of links + stateMaxNestingLevel :: Int, -- ^ Max # of nested Strong/Emph + stateLastStrPos :: Maybe SourcePos, -- ^ Position after last str parsed + stateKeys :: KeyTable, -- ^ List of reference keys + stateHeaderKeys :: KeyTable, -- ^ List of implicit header ref keys + stateSubstitutions :: SubstTable, -- ^ List of substitution references + stateNotes :: NoteTable, -- ^ List of notes (raw bodies) + stateNotes' :: NoteTable', -- ^ List of notes (parsed bodies) + stateMeta :: Meta, -- ^ Document metadata + stateMeta' :: F Meta, -- ^ Document metadata + stateHeaderTable :: [HeaderType], -- ^ Ordered list of header types used + stateHeaders :: M.Map Inlines String, -- ^ List of headers and ids (used for implicit ref links) + stateIdentifiers :: Set.Set String, -- ^ Header identifiers used + stateNextExample :: Int, -- ^ Number of next example + stateExamples :: M.Map String Int, -- ^ Map from example labels to numbers + stateHasChapters :: Bool, -- ^ True if \chapter encountered + stateMacros :: [Macro], -- ^ List of macros defined so far + stateRstDefaultRole :: String, -- ^ Current rST default interpreted text role + stateRstCustomRoles :: M.Map String (String, Maybe String, Attr), -- ^ Current rST custom text roles + -- Triple represents: 1) Base role, 2) Optional format (only for :raw: + -- roles), 3) Additional classes (rest of Attr is unused)). + stateCaption :: Maybe Inlines, -- ^ Caption in current environment + stateInHtmlBlock :: Maybe String, -- ^ Tag type of HTML block being parsed + stateContainers :: [String], -- ^ parent include files + stateLogMessages :: [LogMessage], -- ^ log messages + stateMarkdownAttribute :: Bool -- ^ True if in markdown=1 context + } + +instance Default ParserState where + def = defaultParserState + +instance HasMeta ParserState where + setMeta field val st = + st{ stateMeta = setMeta field val $ stateMeta st } + deleteMeta field st = + st{ stateMeta = deleteMeta field $ stateMeta st } + +class HasReaderOptions st where + extractReaderOptions :: st -> ReaderOptions + getOption :: (Stream s m t) => (ReaderOptions -> b) -> ParserT s st m b + -- default + getOption f = (f . 
extractReaderOptions) <$> getState + +class HasQuoteContext st m where + getQuoteContext :: (Stream s m t) => ParsecT s st m QuoteContext + withQuoteContext :: QuoteContext -> ParsecT s st m a -> ParsecT s st m a + +instance Monad m => HasQuoteContext ParserState m where + getQuoteContext = stateQuoteContext <$> getState + withQuoteContext context parser = do + oldState <- getState + let oldQuoteContext = stateQuoteContext oldState + setState oldState { stateQuoteContext = context } + result <- parser + newState <- getState + setState newState { stateQuoteContext = oldQuoteContext } + return result + +instance HasReaderOptions ParserState where + extractReaderOptions = stateOptions + +class HasHeaderMap st where + extractHeaderMap :: st -> M.Map Inlines String + updateHeaderMap :: (M.Map Inlines String -> M.Map Inlines String) -> + st -> st + +instance HasHeaderMap ParserState where + extractHeaderMap = stateHeaders + updateHeaderMap f st = st{ stateHeaders = f $ stateHeaders st } + +class HasIdentifierList st where + extractIdentifierList :: st -> Set.Set String + updateIdentifierList :: (Set.Set String -> Set.Set String) -> st -> st + +instance HasIdentifierList ParserState where + extractIdentifierList = stateIdentifiers + updateIdentifierList f st = st{ stateIdentifiers = f $ stateIdentifiers st } + +class HasMacros st where + extractMacros :: st -> [Macro] + updateMacros :: ([Macro] -> [Macro]) -> st -> st + +instance HasMacros ParserState where + extractMacros = stateMacros + updateMacros f st = st{ stateMacros = f $ stateMacros st } + +class HasLastStrPosition st where + setLastStrPos :: SourcePos -> st -> st + getLastStrPos :: st -> Maybe SourcePos + +instance HasLastStrPosition ParserState where + setLastStrPos pos st = st{ stateLastStrPos = Just pos } + getLastStrPos st = stateLastStrPos st + +class HasLogMessages st where + addLogMessage :: LogMessage -> st -> st + getLogMessages :: st -> [LogMessage] + +instance HasLogMessages ParserState where + addLogMessage msg st = st{ stateLogMessages = msg : stateLogMessages st } + getLogMessages st = reverse $ stateLogMessages st + +defaultParserState :: ParserState +defaultParserState = + ParserState { stateOptions = def, + stateParserContext = NullState, + stateQuoteContext = NoQuote, + stateAllowLinks = True, + stateMaxNestingLevel = 6, + stateLastStrPos = Nothing, + stateKeys = M.empty, + stateHeaderKeys = M.empty, + stateSubstitutions = M.empty, + stateNotes = [], + stateNotes' = [], + stateMeta = nullMeta, + stateMeta' = return nullMeta, + stateHeaderTable = [], + stateHeaders = M.empty, + stateIdentifiers = Set.empty, + stateNextExample = 1, + stateExamples = M.empty, + stateHasChapters = False, + stateMacros = [], + stateRstDefaultRole = "title-reference", + stateRstCustomRoles = M.empty, + stateCaption = Nothing, + stateInHtmlBlock = Nothing, + stateContainers = [], + stateLogMessages = [], + stateMarkdownAttribute = False + } + +-- | Add a log message. +logMessage :: (Stream s m a, HasLogMessages st) + => LogMessage -> ParserT s st m () +logMessage msg = updateState (addLogMessage msg) + +-- | Report all the accumulated log messages, according to verbosity level. +reportLogMessages :: (PandocMonad m, HasLogMessages st) => ParserT s st m () +reportLogMessages = do + msgs <- getLogMessages <$> getState + mapM_ report msgs + +-- | Succeed only if the extension is enabled. +guardEnabled :: (Stream s m a, HasReaderOptions st) => Extension -> ParserT s st m () +guardEnabled ext = getOption readerExtensions >>= guard . 
extensionEnabled ext + +-- | Succeed only if the extension is disabled. +guardDisabled :: (Stream s m a, HasReaderOptions st) => Extension -> ParserT s st m () +guardDisabled ext = getOption readerExtensions >>= guard . not . extensionEnabled ext + +-- | Update the position on which the last string ended. +updateLastStrPos :: (Stream s m a, HasLastStrPosition st) => ParserT s st m () +updateLastStrPos = getPosition >>= updateState . setLastStrPos + +-- | Whether we are right after the end of a string. +notAfterString :: (Stream s m a, HasLastStrPosition st) => ParserT s st m Bool +notAfterString = do + pos <- getPosition + st <- getState + return $ getLastStrPos st /= Just pos + +data HeaderType + = SingleHeader Char -- ^ Single line of characters underneath + | DoubleHeader Char -- ^ Lines of characters above and below + deriving (Eq, Show) + +data ParserContext + = ListItemState -- ^ Used when running parser on list item contents + | NullState -- ^ Default state + deriving (Eq, Show) + +data QuoteContext + = InSingleQuote -- ^ Used when parsing inside single quotes + | InDoubleQuote -- ^ Used when parsing inside double quotes + | NoQuote -- ^ Used when not parsing inside quotes + deriving (Eq, Show) + +type NoteTable = [(String, String)] + +type NoteTable' = [(String, F Blocks)] -- used in markdown reader + +newtype Key = Key String deriving (Show, Read, Eq, Ord) + +toKey :: String -> Key +toKey = Key . map toLower . unwords . words . unbracket + where unbracket ('[':xs) | "]" `isSuffixOf` xs = take (length xs - 1) xs + unbracket xs = xs + +type KeyTable = M.Map Key (Target, Attr) + +type SubstTable = M.Map Key Inlines + +-- | Add header to the list of headers in state, together +-- with its associated identifier. If the identifier is null +-- and the auto_identifers extension is set, generate a new +-- unique identifier, and update the list of identifiers +-- in state. 
+registerHeader :: (Stream s m a, HasReaderOptions st, HasHeaderMap st, HasIdentifierList st) + => Attr -> Inlines -> ParserT s st m Attr +registerHeader (ident,classes,kvs) header' = do + ids <- extractIdentifierList <$> getState + exts <- getOption readerExtensions + let insert' = M.insertWith (\_new old -> old) + if null ident && Ext_auto_identifiers `extensionEnabled` exts + then do + let id' = uniqueIdent (B.toList header') ids + let id'' = if Ext_ascii_identifiers `extensionEnabled` exts + then catMaybes $ map toAsciiChar id' + else id' + updateState $ updateIdentifierList $ Set.insert id' + updateState $ updateIdentifierList $ Set.insert id'' + updateState $ updateHeaderMap $ insert' header' id' + return (id'',classes,kvs) + else do + unless (null ident) $ + updateState $ updateHeaderMap $ insert' header' ident + return (ident,classes,kvs) + +smartPunctuation :: (HasReaderOptions st, HasLastStrPosition st, HasQuoteContext st m, Stream s m Char) + => ParserT s st m Inlines + -> ParserT s st m Inlines +smartPunctuation inlineParser = do + guardEnabled Ext_smart + choice [ quoted inlineParser, apostrophe, dash, ellipses ] + +apostrophe :: Stream s m Char => ParserT s st m Inlines +apostrophe = (char '\'' <|> char '\8217') >> return (B.str "\x2019") + +quoted :: (HasLastStrPosition st, HasQuoteContext st m, Stream s m Char) + => ParserT s st m Inlines + -> ParserT s st m Inlines +quoted inlineParser = doubleQuoted inlineParser <|> singleQuoted inlineParser + +singleQuoted :: (HasLastStrPosition st, HasQuoteContext st m, Stream s m Char) + => ParserT s st m Inlines + -> ParserT s st m Inlines +singleQuoted inlineParser = try $ do + singleQuoteStart + withQuoteContext InSingleQuote $ many1Till inlineParser singleQuoteEnd >>= + return . B.singleQuoted . mconcat + +doubleQuoted :: (HasQuoteContext st m, Stream s m Char) + => ParserT s st m Inlines + -> ParserT s st m Inlines +doubleQuoted inlineParser = try $ do + doubleQuoteStart + withQuoteContext InDoubleQuote $ manyTill inlineParser doubleQuoteEnd >>= + return . B.doubleQuoted . mconcat + +failIfInQuoteContext :: (HasQuoteContext st m, Stream s m t) + => QuoteContext + -> ParserT s st m () +failIfInQuoteContext context = do + context' <- getQuoteContext + if context' == context + then fail "already inside quotes" + else return () + +charOrRef :: Stream s m Char => String -> ParserT s st m Char +charOrRef cs = + oneOf cs <|> try (do c <- characterReference + guard (c `elem` cs) + return c) + +singleQuoteStart :: (HasLastStrPosition st, HasQuoteContext st m, Stream s m Char) + => ParserT s st m () +singleQuoteStart = do + failIfInQuoteContext InSingleQuote + -- single quote start can't be right after str + guard =<< notAfterString + () <$ charOrRef "'\8216\145" + +singleQuoteEnd :: Stream s m Char + => ParserT s st m () +singleQuoteEnd = try $ do + charOrRef "'\8217\146" + notFollowedBy alphaNum + +doubleQuoteStart :: (HasQuoteContext st m, Stream s m Char) + => ParserT s st m () +doubleQuoteStart = do + failIfInQuoteContext InDoubleQuote + try $ do charOrRef "\"\8220\147" + notFollowedBy . satisfy $ flip elem [' ', '\t', '\n'] + +doubleQuoteEnd :: Stream s m Char + => ParserT s st m () +doubleQuoteEnd = void (charOrRef "\"\8221\148") + +ellipses :: Stream s m Char + => ParserT s st m Inlines +ellipses = try (string "..." 
>> return (B.str "\8230")) + +dash :: (HasReaderOptions st, Stream s m Char) + => ParserT s st m Inlines +dash = try $ do + oldDashes <- extensionEnabled Ext_old_dashes <$> getOption readerExtensions + if oldDashes + then do + char '-' + (char '-' >> return (B.str "\8212")) + <|> (lookAhead digit >> return (B.str "\8211")) + else do + string "--" + (char '-' >> return (B.str "\8212")) + <|> return (B.str "\8211") + +-- This is used to prevent exponential blowups for things like: +-- a**a*a**a*a**a*a**a*a**a*a**a*a** +nested :: Stream s m a + => ParserT s ParserState m a + -> ParserT s ParserState m a +nested p = do + nestlevel <- stateMaxNestingLevel <$> getState + guard $ nestlevel > 0 + updateState $ \st -> st{ stateMaxNestingLevel = stateMaxNestingLevel st - 1 } + res <- p + updateState $ \st -> st{ stateMaxNestingLevel = nestlevel } + return res + +citeKey :: (Stream s m Char, HasLastStrPosition st) + => ParserT s st m (Bool, String) +citeKey = try $ do + guard =<< notAfterString + suppress_author <- option False (char '-' *> return True) + char '@' + firstChar <- alphaNum <|> char '_' <|> char '*' -- @* for wildcard in nocite + let regchar = satisfy (\c -> isAlphaNum c || c == '_') + let internal p = try $ p <* lookAhead regchar + rest <- many $ regchar <|> internal (oneOf ":.#$%&-+?<>~/") <|> + try (oneOf ":/" <* lookAhead (char '/')) + let key = firstChar:rest + return (suppress_author, key) + + +token :: (Stream s m t) + => (t -> String) + -> (t -> SourcePos) + -> (t -> Maybe a) + -> ParsecT s st m a +token pp pos match = tokenPrim pp (\_ t _ -> pos t) match + +-- +-- Macros +-- + +-- | Parse a \newcommand or \renewcommand macro definition. +macro :: (Stream [Char] m Char, HasMacros st, HasReaderOptions st) + => ParserT [Char] st m Blocks +macro = do + apply <- getOption readerApplyMacros + inp <- getInput + case parseMacroDefinitions inp of + ([], _) -> mzero + (ms, rest) -> do def' <- count (length inp - length rest) anyChar + if apply + then do + updateState $ \st -> + updateMacros (ms ++) st + return mempty + else return $ rawBlock "latex" def' + +-- | Apply current macros to string. +applyMacros' :: (HasReaderOptions st, HasMacros st, Stream [Char] m Char) + => String + -> ParserT [Char] st m String +applyMacros' target = do + apply <- getOption readerApplyMacros + if apply + then do macros <- extractMacros <$> getState + return $ applyMacros macros target + else return target + +infixr 5 <+?> +(<+?>) :: (Monoid a) => ParserT s st m a -> ParserT s st m a -> ParserT s st m a +a <+?> b = a >>= flip fmap (try b <|> return mempty) . 
(<>)
+
+extractIdClass :: Attr -> Attr
+extractIdClass (ident, cls, kvs) = (ident', cls', kvs')
+  where
+    ident' = case (lookup "id" kvs) of
+                  Just v  -> v
+                  Nothing -> ident
+    cls'   = case (lookup "class" kvs) of
+                  Just cl -> words cl
+                  Nothing -> cls
+    kvs'   = filter (\(k,_) -> k /= "id" || k /= "class") kvs
+
+insertIncludedFile :: PandocMonad m
+                   => ParserT String ParserState m Blocks
+                   -> [FilePath] -> FilePath
+                   -> ParserT String ParserState m Blocks
+insertIncludedFile blocks dirs f = do
+  oldPos <- getPosition
+  oldInput <- getInput
+  containers <- stateContainers <$> getState
+  when (f `elem` containers) $
+    throwError $ PandocParseError $ "Include file loop at " ++ show oldPos
+  updateState $ \s -> s{ stateContainers = f : stateContainers s }
+  mbcontents <- readFileFromDirs dirs f
+  contents <- case mbcontents of
+                   Just s -> return s
+                   Nothing -> do
+                     report $ CouldNotLoadIncludeFile f oldPos
+                     return ""
+  setPosition $ newPos f 1 1
+  setInput contents
+  bs <- blocks
+  setInput oldInput
+  setPosition oldPos
+  updateState $ \s -> s{ stateContainers = tail $ stateContainers s }
+  return bs
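
The combinators above are normally driven from inside a pandoc reader, but they can also be exercised on their own through readWith, which supplies an initial state and wraps parse failures in PandocError. A minimal sketch, assuming only this module plus Text.Pandoc.Definition and Text.Pandoc.Error; the choice of parser and the test inputs are illustrative, not part of the patch:

import Text.Pandoc.Parsing
import Text.Pandoc.Definition (ListAttributes)
import Text.Pandoc.Error (PandocError)

-- Run anyOrderedListMarker over a plain String, starting from the
-- default parser state.
listMarker :: String -> Either PandocError ListAttributes
listMarker = readWith anyOrderedListMarker defaultParserState

-- Expected results, following the definitions above:
--   listMarker "iv."  ~> Right (4, LowerRoman, Period)
--   listMarker "(a)"  ~> Right (1, LowerAlpha, TwoParens)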
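oneOfStrings is documented above as matching the longer of two overlapping alternatives where possible. A small sketch of that behaviour; the string list and input are made up for illustration:

import Text.Pandoc.Parsing

dashes :: Parser [Char] ParserState String
dashes = oneOfStrings ["--", "---"]

-- readWith dashes defaultParserState "---x" should yield Right "---",
-- not Right "--": the longest alternative wins.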
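charsInBalanced keeps text between balanced delimiters but drops the outermost pair, as its Haddock comment states. Restating that documented example as a runnable sketch:

import Text.Pandoc.Parsing

parenContents :: Parser [Char] ParserState String
parenContents = charsInBalanced '(' ')' anyChar

-- readWith parenContents defaultParserState "(hello (there)) etc."
--   should yield Right "hello (there)", per the comment on charsInBalanced.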
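withRaw pairs a parser's result with the exact source text it consumed, which is how uri above recovers the raw URL it matched. A sketch combining it with romanNumeral; the input is illustrative:

import Text.Pandoc.Parsing

rawRoman :: Parser [Char] ParserState (Int, String)
rawRoman = withRaw (romanNumeral False)   -- False selects lowercase numerals

-- readWith rawRoman defaultParserState "xiv and more"
--   should yield Right (14, "xiv").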
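emailAddress (like uri) returns a pair of the original text and an escaped form suitable for a link target. A sketch with a made-up address:

import Text.Pandoc.Parsing
import Text.Pandoc.Error (PandocError)

-- emailAddress yields (original, corresponding mailto: URI).
mailtoFor :: String -> Either PandocError (String, String)
mailtoFor = readWith emailAddress defaultParserState

-- mailtoFor "info@example.org"
--   should yield Right ("info@example.org", "mailto:info@example.org").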
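The F newtype above delays a computation against the final ParserState; the state itself stores such delayed values in stateMeta' and stateNotes'. A minimal sketch of building and forcing one, using only exported names:

import Text.Pandoc.Parsing

-- A delayed value that reads a field of whatever state it is run against.
nextExampleLabel :: F String
nextExampleLabel = do
  n <- asksF stateNextExample
  return ("example " ++ show n)

-- Forcing it against the default state:
--   runF nextExampleLabel defaultParserState  ~>  "example 1"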
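guardEnabled and guardDisabled turn reader extensions into parser guards, which is how mathInline, mathDisplay, and smartPunctuation above switch behaviour. A sketch of the idiom, assuming Ext_smart is in scope from Text.Pandoc.Options; the parser itself is made up:

import Text.Pandoc.Parsing
import Text.Pandoc.Options (Extension(..))

-- Recognizes "..." only when the smart extension is enabled in the
-- reader options carried by the parser state.
smartDots :: Parser [Char] ParserState String
smartDots = guardEnabled Ext_smart >> string "..."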
