diff options
Diffstat (limited to 'src/Text/Pandoc/Readers/LaTeX.hs')
-rw-r--r-- | src/Text/Pandoc/Readers/LaTeX.hs | 2774 |
1 files changed, 1671 insertions, 1103 deletions
diff --git a/src/Text/Pandoc/Readers/LaTeX.hs b/src/Text/Pandoc/Readers/LaTeX.hs index 17fb48548..a9bafb03b 100644 --- a/src/Text/Pandoc/Readers/LaTeX.hs +++ b/src/Text/Pandoc/Readers/LaTeX.hs @@ -1,4 +1,7 @@ {-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE PatternGuards #-} +{-# LANGUAGE FlexibleInstances #-} +{-# LANGUAGE MultiParamTypeClasses #-} {-# LANGUAGE ScopedTypeVariables #-} {- Copyright (C) 2006-2017 John MacFarlane <jgm@berkeley.edu> @@ -28,20 +31,26 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA Portability : portable Conversion of LaTeX to 'Pandoc' document. + -} module Text.Pandoc.Readers.LaTeX ( readLaTeX, + applyMacros, rawLaTeXInline, rawLaTeXBlock, - inlineCommand, + macro, + inlineCommand ) where import Control.Applicative (many, optional, (<|>)) import Control.Monad import Control.Monad.Except (throwError) -import Data.Char (chr, isAlphaNum, isLetter, ord) -import Data.Text (Text, unpack) +import Data.Char (chr, isAlphaNum, isLetter, ord, isDigit) +import Data.Default +import Data.Text (Text) +import qualified Data.Text as T import Data.List (intercalate, isPrefixOf) import qualified Data.Map as M +import qualified Data.Set as Set import Data.Maybe (fromMaybe, maybeToList) import Safe (minimumDef) import System.FilePath (addExtension, replaceExtension, takeExtension) @@ -52,10 +61,19 @@ import Text.Pandoc.Highlighting (fromListingsLanguage, languagesByExtension) import Text.Pandoc.ImageSize (numUnit, showFl) import Text.Pandoc.Logging import Text.Pandoc.Options -import Text.Pandoc.Parsing hiding (many, mathDisplay, mathInline, optional, - space, (<|>)) +import Text.Pandoc.Parsing hiding (many, optional, withRaw, + mathInline, mathDisplay, + space, (<|>), spaces, blankline) import Text.Pandoc.Shared +import Text.Pandoc.Readers.LaTeX.Types (Macro(..), Tok(..), + TokType(..)) import Text.Pandoc.Walk +import Text.Pandoc.Error (PandocError(PandocParsecError, PandocMacroLoop)) + +-- for debugging: +-- import Text.Pandoc.Extensions (getDefaultExtensions) +-- import Text.Pandoc.Class (runIOorExplode, PandocIO) +-- import Debug.Trace (traceShowId) -- | Parse LaTeX from string and return 'Pandoc' document. readLaTeX :: PandocMonad m @@ -63,17 +81,18 @@ readLaTeX :: PandocMonad m -> Text -- ^ String to parse (assumes @'\n'@ line endings) -> m Pandoc readLaTeX opts ltx = do - parsed <- readWithM parseLaTeX def{ stateOptions = opts } (unpack ltx) + parsed <- runParserT parseLaTeX def{ sOptions = opts } "source" + (tokenize (crFilter ltx)) case parsed of Right result -> return result - Left e -> throwError e + Left e -> throwError $ PandocParsecError (T.unpack ltx) e parseLaTeX :: PandocMonad m => LP m Pandoc parseLaTeX = do bs <- blocks eof st <- getState - let meta = stateMeta st + let meta = sMeta st let doc' = doc bs let headerLevel (Header n _ _) = [n] headerLevel _ = [] @@ -87,177 +106,476 @@ parseLaTeX = do else id) doc' return $ Pandoc meta bs' -type LP m = ParserT String ParserState m - -anyControlSeq :: PandocMonad m => LP m String -anyControlSeq = do - char '\\' - next <- option '\n' anyChar - case next of - '\n' -> return "" - c | isLetter c -> (c:) <$> (many letter <* optional sp) - | otherwise -> return [c] - -controlSeq :: PandocMonad m => String -> LP m String -controlSeq name = try $ do - char '\\' - case name of - "" -> mzero - [c] | not (isLetter c) -> string [c] - cs -> string cs <* notFollowedBy letter <* optional sp - return name - -dimenarg :: PandocMonad m => LP m String -dimenarg = try $ do - ch <- option "" $ string "=" - num <- many1 digit - dim <- oneOfStrings ["pt","pc","in","bp","cm","mm","dd","cc","sp"] - return $ ch ++ num ++ dim +-- testParser :: LP PandocIO a -> Text -> IO a +-- testParser p t = do +-- res <- runIOorExplode (runParserT p defaultLaTeXState{ +-- sOptions = def{ readerExtensions = +-- enableExtension Ext_raw_tex $ +-- getDefaultExtensions "latex" }} "source" (tokenize t)) +-- case res of +-- Left e -> error (show e) +-- Right r -> return r + +data LaTeXState = LaTeXState{ sOptions :: ReaderOptions + , sMeta :: Meta + , sQuoteContext :: QuoteContext + , sMacros :: M.Map Text Macro + , sContainers :: [String] + , sHeaders :: M.Map Inlines String + , sLogMessages :: [LogMessage] + , sIdentifiers :: Set.Set String + , sVerbatimMode :: Bool + , sCaption :: Maybe Inlines + , sInListItem :: Bool + , sInTableCell :: Bool + } + deriving Show + +defaultLaTeXState :: LaTeXState +defaultLaTeXState = LaTeXState{ sOptions = def + , sMeta = nullMeta + , sQuoteContext = NoQuote + , sMacros = M.empty + , sContainers = [] + , sHeaders = M.empty + , sLogMessages = [] + , sIdentifiers = Set.empty + , sVerbatimMode = False + , sCaption = Nothing + , sInListItem = False + , sInTableCell = False + } + +instance PandocMonad m => HasQuoteContext LaTeXState m where + getQuoteContext = sQuoteContext <$> getState + withQuoteContext context parser = do + oldState <- getState + let oldQuoteContext = sQuoteContext oldState + setState oldState { sQuoteContext = context } + result <- parser + newState <- getState + setState newState { sQuoteContext = oldQuoteContext } + return result + +instance HasLogMessages LaTeXState where + addLogMessage msg st = st{ sLogMessages = msg : sLogMessages st } + getLogMessages st = reverse $ sLogMessages st + +instance HasIdentifierList LaTeXState where + extractIdentifierList = sIdentifiers + updateIdentifierList f st = st{ sIdentifiers = f $ sIdentifiers st } + +instance HasIncludeFiles LaTeXState where + getIncludeFiles = sContainers + addIncludeFile f s = s{ sContainers = f : sContainers s } + dropLatestIncludeFile s = s { sContainers = drop 1 $ sContainers s } + +instance HasHeaderMap LaTeXState where + extractHeaderMap = sHeaders + updateHeaderMap f st = st{ sHeaders = f $ sHeaders st } + +instance HasMacros LaTeXState where + extractMacros st = sMacros st + updateMacros f st = st{ sMacros = f (sMacros st) } + +instance HasReaderOptions LaTeXState where + extractReaderOptions = sOptions + +instance HasMeta LaTeXState where + setMeta field val st = + st{ sMeta = setMeta field val $ sMeta st } + deleteMeta field st = + st{ sMeta = deleteMeta field $ sMeta st } + +instance Default LaTeXState where + def = defaultLaTeXState + +type LP m = ParserT [Tok] LaTeXState m + +withVerbatimMode :: PandocMonad m => LP m a -> LP m a +withVerbatimMode parser = do + updateState $ \st -> st{ sVerbatimMode = True } + result <- parser + updateState $ \st -> st{ sVerbatimMode = False } + return result + +rawLaTeXBlock :: (PandocMonad m, HasMacros s, HasReaderOptions s) + => ParserT String s m String +rawLaTeXBlock = do + lookAhead (try (char '\\' >> letter)) + inp <- getInput + let toks = tokenize $ T.pack inp + let rawblock = do + (_, raw) <- try $ + withRaw (environment <|> macroDef <|> blockCommand) + return raw + pstate <- getState + let lstate = def{ sOptions = extractReaderOptions pstate } + res <- runParserT rawblock lstate "source" toks + case res of + Left _ -> mzero + Right raw -> takeP (T.length (untokenize raw)) + +macro :: (PandocMonad m, HasMacros s, HasReaderOptions s) + => ParserT String s m Blocks +macro = do + guardEnabled Ext_latex_macros + lookAhead (char '\\' *> oneOfStrings ["new", "renew", "provide"] *> + oneOfStrings ["command", "environment"]) + inp <- getInput + let toks = tokenize $ T.pack inp + let rawblock = do + (_, raw) <- withRaw $ try macroDef + st <- getState + return (raw, st) + pstate <- getState + let lstate = def{ sOptions = extractReaderOptions pstate + , sMacros = extractMacros pstate } + res <- runParserT rawblock lstate "source" toks + case res of + Left _ -> mzero + Right (raw, st) -> do + updateState (updateMacros (const $ sMacros st)) + mempty <$ takeP (T.length (untokenize raw)) + +applyMacros :: (PandocMonad m, HasMacros s, HasReaderOptions s) + => String -> ParserT String s m String +applyMacros s = do + (guardEnabled Ext_latex_macros >> + do let retokenize = doMacros 0 *> (toksToString <$> getInput) + pstate <- getState + let lstate = def{ sOptions = extractReaderOptions pstate + , sMacros = extractMacros pstate } + res <- runParserT retokenize lstate "math" (tokenize (T.pack s)) + case res of + Left e -> fail (show e) + Right s' -> return s') <|> return s + +rawLaTeXInline :: (PandocMonad m, HasMacros s, HasReaderOptions s) + => ParserT String s m String +rawLaTeXInline = do + lookAhead (try (char '\\' >> letter) <|> char '$') + inp <- getInput + let toks = tokenize $ T.pack inp + let rawinline = do + (_, raw) <- try $ withRaw (inlineEnvironment <|> inlineCommand') + st <- getState + return (raw, st) + pstate <- getState + let lstate = def{ sOptions = extractReaderOptions pstate + , sMacros = extractMacros pstate } + res <- runParserT rawinline lstate "source" toks + case res of + Left _ -> mzero + Right (raw, s) -> do + updateState $ updateMacros (const $ sMacros s) + takeP (T.length (untokenize raw)) + +inlineCommand :: PandocMonad m => ParserT String ParserState m Inlines +inlineCommand = do + lookAhead (try (char '\\' >> letter) <|> char '$') + inp <- getInput + let toks = tokenize $ T.pack inp + let rawinline = do + (il, raw) <- try $ withRaw (inlineEnvironment <|> inlineCommand') + st <- getState + return (il, raw, st) + pstate <- getState + let lstate = def{ sOptions = extractReaderOptions pstate + , sMacros = extractMacros pstate } + res <- runParserT rawinline lstate "source" toks + case res of + Left _ -> mzero + Right (il, raw, s) -> do + updateState $ updateMacros (const $ sMacros s) + takeP (T.length (untokenize raw)) + return il + +tokenize :: Text -> [Tok] +tokenize = totoks (1, 1) + +totoks :: (Line, Column) -> Text -> [Tok] +totoks (lin,col) t = + case T.uncons t of + Nothing -> [] + Just (c, rest) + | c == '\n' -> + Tok (lin, col) Newline "\n" + : totoks (lin + 1,1) rest + | isSpaceOrTab c -> + let (sps, rest') = T.span isSpaceOrTab t + in Tok (lin, col) Spaces sps + : totoks (lin, col + T.length sps) rest' + | isAlphaNum c -> + let (ws, rest') = T.span isAlphaNum t + in Tok (lin, col) Word ws + : totoks (lin, col + T.length ws) rest' + | c == '%' -> + let (cs, rest') = T.break (== '\n') rest + in Tok (lin, col) Comment ("%" <> cs) + : totoks (lin, col + 1 + T.length cs) rest' + | c == '\\' -> + case T.uncons rest of + Nothing -> [Tok (lin, col) Symbol (T.singleton c)] + Just (d, rest') + | isLetter d -> + let (ws, rest'') = T.span isLetter rest + (ss, rest''') = T.span isSpaceOrTab rest'' + in Tok (lin, col) (CtrlSeq ws) ("\\" <> ws <> ss) + : totoks (lin, + col + 1 + T.length ws + T.length ss) rest''' + | d == '\t' || d == '\n' -> + Tok (lin, col) Symbol ("\\") + : totoks (lin, col + 1) rest + | otherwise -> + Tok (lin, col) (CtrlSeq (T.singleton d)) (T.pack [c,d]) + : totoks (lin, col + 2) rest' + | c == '#' -> + let (t1, t2) = T.span (\d -> d >= '0' && d <= '9') rest + in case safeRead (T.unpack t1) of + Just i -> + Tok (lin, col) (Arg i) ("#" <> t1) + : totoks (lin, col + 1 + T.length t1) t2 + Nothing -> + Tok (lin, col) Symbol ("#") + : totoks (lin, col + 1) t2 + | c == '^' -> + case T.uncons rest of + Just ('^', rest') -> + case T.uncons rest' of + Just (d, rest'') + | isLowerHex d -> + case T.uncons rest'' of + Just (e, rest''') | isLowerHex e -> + Tok (lin, col) Esc2 (T.pack ['^','^',d,e]) + : totoks (lin, col + 4) rest''' + _ -> + Tok (lin, col) Esc1 (T.pack ['^','^',d]) + : totoks (lin, col + 3) rest'' + | d < '\128' -> + Tok (lin, col) Esc1 (T.pack ['^','^',d]) + : totoks (lin, col + 3) rest'' + _ -> [Tok (lin, col) Symbol ("^"), + Tok (lin, col + 1) Symbol ("^")] + _ -> Tok (lin, col) Symbol ("^") + : totoks (lin, col + 1) rest + | otherwise -> + Tok (lin, col) Symbol (T.singleton c) : totoks (lin, col + 1) rest + + where isSpaceOrTab ' ' = True + isSpaceOrTab '\t' = True + isSpaceOrTab _ = False + +isLowerHex :: Char -> Bool +isLowerHex x = x >= '0' && x <= '9' || x >= 'a' && x <= 'f' + +untokenize :: [Tok] -> Text +untokenize = mconcat . map untoken + +untoken :: Tok -> Text +untoken (Tok _ _ t) = t + +satisfyTok :: PandocMonad m => (Tok -> Bool) -> LP m Tok +satisfyTok f = + try $ do + res <- tokenPrim (T.unpack . untoken) updatePos matcher + doMacros 0 -- apply macros on remaining input stream + return res + where matcher t | f t = Just t + | otherwise = Nothing + updatePos :: SourcePos -> Tok -> [Tok] -> SourcePos + updatePos spos _ (Tok (lin,col) _ _ : _) = + setSourceColumn (setSourceLine spos lin) col + updatePos spos _ [] = spos + +doMacros :: PandocMonad m => Int -> LP m () +doMacros n = do + verbatimMode <- sVerbatimMode <$> getState + when (not verbatimMode) $ do + inp <- getInput + case inp of + Tok spos (CtrlSeq "begin") _ : Tok _ Symbol "{" : + Tok _ Word name : Tok _ Symbol "}" : ts + -> handleMacros spos name ts + Tok spos (CtrlSeq "end") _ : Tok _ Symbol "{" : + Tok _ Word name : Tok _ Symbol "}" : ts + -> handleMacros spos ("end" <> name) ts + Tok spos (CtrlSeq name) _ : ts + -> handleMacros spos name ts + _ -> return () + where handleMacros spos name ts = do + macros <- sMacros <$> getState + case M.lookup name macros of + Nothing -> return () + Just (Macro numargs optarg newtoks) -> do + setInput ts + let getarg = spaces >> braced + args <- case optarg of + Nothing -> count numargs getarg + Just o -> + (:) <$> option o bracketedToks + <*> count (numargs - 1) getarg + let addTok (Tok _ (Arg i) _) acc | i > 0 + , i <= numargs = + map (setpos spos) (args !! (i - 1)) ++ acc + addTok t acc = setpos spos t : acc + ts' <- getInput + setInput $ foldr addTok ts' newtoks + if n > 20 -- detect macro expansion loops + then throwError $ PandocMacroLoop (T.unpack name) + else doMacros (n + 1) + +setpos :: (Line, Column) -> Tok -> Tok +setpos spos (Tok _ tt txt) = Tok spos tt txt + +anyControlSeq :: PandocMonad m => LP m Tok +anyControlSeq = satisfyTok isCtrlSeq + where isCtrlSeq (Tok _ (CtrlSeq _) _) = True + isCtrlSeq _ = False + +anySymbol :: PandocMonad m => LP m Tok +anySymbol = satisfyTok isSym + where isSym (Tok _ Symbol _) = True + isSym _ = False + +spaces :: PandocMonad m => LP m () +spaces = skipMany (satisfyTok (tokTypeIn [Comment, Spaces, Newline])) + +spaces1 :: PandocMonad m => LP m () +spaces1 = skipMany1 (satisfyTok (tokTypeIn [Comment, Spaces, Newline])) + +tokTypeIn :: [TokType] -> Tok -> Bool +tokTypeIn toktypes (Tok _ tt _) = tt `elem` toktypes + +controlSeq :: PandocMonad m => Text -> LP m Tok +controlSeq name = satisfyTok isNamed + where isNamed (Tok _ (CtrlSeq n) _) = n == name + isNamed _ = False + +symbol :: PandocMonad m => Char -> LP m Tok +symbol c = satisfyTok isc + where isc (Tok _ Symbol d) = case T.uncons d of + Just (c',_) -> c == c' + _ -> False + isc _ = False + +symbolIn :: PandocMonad m => [Char] -> LP m Tok +symbolIn cs = satisfyTok isInCs + where isInCs (Tok _ Symbol d) = case T.uncons d of + Just (c,_) -> c `elem` cs + _ -> False + isInCs _ = False sp :: PandocMonad m => LP m () sp = whitespace <|> endline whitespace :: PandocMonad m => LP m () -whitespace = skipMany1 $ satisfy (\c -> c == ' ' || c == '\t') - -endline :: PandocMonad m => LP m () -endline = try (newline >> lookAhead anyChar >> notFollowedBy blankline) +whitespace = () <$ satisfyTok isSpaceTok + where isSpaceTok (Tok _ Spaces _) = True + isSpaceTok _ = False -isLowerHex :: Char -> Bool -isLowerHex x = x >= '0' && x <= '9' || x >= 'a' && x <= 'f' +newlineTok :: PandocMonad m => LP m () +newlineTok = () <$ satisfyTok isNewlineTok -tildeEscape :: PandocMonad m => LP m Char -tildeEscape = try $ do - string "^^" - c <- satisfy (\x -> x >= '\0' && x <= '\128') - d <- if isLowerHex c - then option "" $ count 1 (satisfy isLowerHex) - else return "" - if null d - then case ord c of - x | x >= 64 && x <= 127 -> return $ chr (x - 64) - | otherwise -> return $ chr (x + 64) - else return $ chr $ read ('0':'x':c:d) +isNewlineTok :: Tok -> Bool +isNewlineTok (Tok _ Newline _) = True +isNewlineTok _ = False comment :: PandocMonad m => LP m () -comment = do - char '%' - skipMany (satisfy (/='\n')) - optional newline - return () +comment = () <$ satisfyTok isCommentTok + where isCommentTok (Tok _ Comment _) = True + isCommentTok _ = False + +anyTok :: PandocMonad m => LP m Tok +anyTok = satisfyTok (const True) -bgroup :: PandocMonad m => LP m () +endline :: PandocMonad m => LP m () +endline = try $ do + newlineTok + lookAhead anyTok + notFollowedBy blankline + +blankline :: PandocMonad m => LP m () +blankline = try $ skipMany whitespace *> newlineTok + +primEscape :: PandocMonad m => LP m Char +primEscape = do + Tok _ toktype t <- satisfyTok (tokTypeIn [Esc1, Esc2]) + case toktype of + Esc1 -> case T.uncons (T.drop 2 t) of + Just (c, _) + | c >= '\64' && c <= '\127' -> return (chr (ord c - 64)) + | otherwise -> return (chr (ord c + 64)) + Nothing -> fail "Empty content of Esc1" + Esc2 -> case safeRead ('0':'x':T.unpack (T.drop 2 t)) of + Just x -> return (chr x) + Nothing -> fail $ "Could not read: " ++ T.unpack t + _ -> fail "Expected an Esc1 or Esc2 token" -- should not happen + +bgroup :: PandocMonad m => LP m Tok bgroup = try $ do - skipMany (spaceChar <|> try (newline <* notFollowedBy blankline)) - () <$ char '{' - <|> () <$ controlSeq "bgroup" - <|> () <$ controlSeq "begingroup" + skipMany sp + symbol '{' <|> controlSeq "bgroup" <|> controlSeq "begingroup" -egroup :: PandocMonad m => LP m () -egroup = () <$ char '}' - <|> () <$ controlSeq "egroup" - <|> () <$ controlSeq "endgroup" +egroup :: PandocMonad m => LP m Tok +egroup = (symbol '}' <|> controlSeq "egroup" <|> controlSeq "endgroup") -grouped :: PandocMonad m => Monoid a => LP m a -> LP m a +grouped :: (PandocMonad m, Monoid a) => LP m a -> LP m a grouped parser = try $ do bgroup -- first we check for an inner 'grouped', because -- {{a,b}} should be parsed the same as {a,b} - try (grouped parser <* egroup) - <|> (mconcat <$> manyTill parser egroup) - -braced :: PandocMonad m => LP m String -braced = grouped chunk - where chunk = - many1 (satisfy (\c -> c /= '\\' && c /= '}' && c /= '{')) - <|> try (string "\\}") - <|> try (string "\\{") - <|> try (string "\\\\") - <|> ((\x -> "{" ++ x ++ "}") <$> braced) - <|> count 1 anyChar + try (grouped parser <* egroup) <|> (mconcat <$> manyTill parser egroup) + +braced :: PandocMonad m => LP m [Tok] +braced = bgroup *> braced' 1 + where braced' (n :: Int) = + handleEgroup n <|> handleBgroup n <|> handleOther n + handleEgroup n = do + t <- egroup + if n == 1 + then return [] + else (t:) <$> braced' (n - 1) + handleBgroup n = do + t <- bgroup + (t:) <$> braced' (n + 1) + handleOther n = do + t <- anyTok + (t:) <$> braced' n bracketed :: PandocMonad m => Monoid a => LP m a -> LP m a -bracketed parser = try $ char '[' *> (mconcat <$> manyTill parser (char ']')) - -mathDisplay :: PandocMonad m => LP m String -> LP m Inlines -mathDisplay p = displayMath <$> (try p >>= applyMacros' . trim) - -mathInline :: PandocMonad m => LP m String -> LP m Inlines -mathInline p = math <$> (try p >>= applyMacros') - -mathChars :: PandocMonad m => LP m String -mathChars = - concat <$> many (escapedChar - <|> (snd <$> withRaw braced) - <|> many1 (satisfy isOrdChar)) - where escapedChar = try $ do char '\\' - c <- anyChar - return ['\\',c] - isOrdChar '$' = False - isOrdChar '{' = False - isOrdChar '}' = False - isOrdChar '\\' = False - isOrdChar _ = True - -quoted' :: PandocMonad m => (Inlines -> Inlines) -> LP m String -> LP m () -> LP m Inlines -quoted' f starter ender = do - startchs <- starter - smart <- extensionEnabled Ext_smart <$> getOption readerExtensions - if smart - then do - ils <- many (notFollowedBy ender >> inline) - (ender >> return (f (mconcat ils))) <|> - (<> mconcat ils) <$> - lit (case startchs of - "``" -> "“" - "`" -> "‘" - _ -> startchs) - else lit startchs +bracketed parser = try $ do + symbol '[' + mconcat <$> manyTill parser (symbol ']') -doubleQuote :: PandocMonad m => LP m Inlines -doubleQuote = do - quoted' doubleQuoted (try $ string "``") (void $ try $ string "''") - <|> quoted' doubleQuoted (string "“") (void $ char '”') - -- the following is used by babel for localized quotes: - <|> quoted' doubleQuoted (try $ string "\"`") (void $ try $ string "\"'") - <|> quoted' doubleQuoted (string "\"") (void $ char '"') +dimenarg :: PandocMonad m => LP m Text +dimenarg = try $ do + ch <- option False $ True <$ symbol '=' + Tok _ _ s <- satisfyTok isWordTok + guard $ (T.take 2 (T.reverse s)) `elem` + ["pt","pc","in","bp","cm","mm","dd","cc","sp"] + let num = T.take (T.length s - 2) s + guard $ T.length num > 0 + guard $ T.all isDigit num + return $ T.pack ['=' | ch] <> s -singleQuote :: PandocMonad m => LP m Inlines -singleQuote = do - smart <- extensionEnabled Ext_smart <$> getOption readerExtensions - if smart - then quoted' singleQuoted (string "`") (try $ char '\'' >> notFollowedBy letter) - <|> quoted' singleQuoted (string "‘") (try $ char '’' >> notFollowedBy letter) - else str <$> many1 (oneOf "`\'‘’") +-- inline elements: -inline :: PandocMonad m => LP m Inlines -inline = (mempty <$ comment) - <|> (space <$ whitespace) - <|> (softbreak <$ endline) - <|> inlineText - <|> inlineCommand - <|> inlineEnvironment - <|> inlineGroup - <|> (char '-' *> option (str "-") - (char '-' *> option (str "–") (str "—" <$ char '-'))) - <|> doubleQuote - <|> singleQuote - <|> (str "”" <$ try (string "''")) - <|> (str "”" <$ char '”') - <|> (str "’" <$ char '\'') - <|> (str "’" <$ char '’') - <|> (str "\160" <$ char '~') - <|> mathDisplay (string "$$" *> mathChars <* string "$$") - <|> mathInline (char '$' *> mathChars <* char '$') - <|> (guardEnabled Ext_literate_haskell *> char '|' *> doLHSverb) - <|> (str . (:[]) <$> tildeEscape) - <|> (do res <- oneOf "#&~^'`\"[]" - pos <- getPosition - report $ ParsingUnescaped [res] pos - return $ str [res]) +word :: PandocMonad m => LP m Inlines +word = (str . T.unpack . untoken) <$> satisfyTok isWordTok -inlines :: PandocMonad m => LP m Inlines -inlines = mconcat <$> many (notFollowedBy (char '}') *> inline) +regularSymbol :: PandocMonad m => LP m Inlines +regularSymbol = (str . T.unpack . untoken) <$> satisfyTok isRegularSymbol + where isRegularSymbol (Tok _ Symbol t) = not $ T.any isSpecial t + isRegularSymbol _ = False + isSpecial c = c `Set.member` specialChars + +specialChars :: Set.Set Char +specialChars = Set.fromList "#$%&~_^\\{}" + +isWordTok :: Tok -> Bool +isWordTok (Tok _ Word _) = True +isWordTok _ = False inlineGroup :: PandocMonad m => LP m Inlines inlineGroup = do @@ -268,467 +586,19 @@ inlineGroup = do -- we need the span so we can detitlecase bibtex entries; -- we need to know when something is {C}apitalized -block :: PandocMonad m => LP m Blocks -block = (mempty <$ comment) - <|> (mempty <$ ((spaceChar <|> newline) *> spaces)) - <|> environment - <|> include - <|> macro - <|> blockCommand - <|> paragraph - <|> grouped block - -blocks :: PandocMonad m => LP m Blocks -blocks = mconcat <$> many block - -getRawCommand :: PandocMonad m => String -> LP m String -getRawCommand name' = do - rawargs <- withRaw (many (try (optional sp *> opt)) *> - option "" (try (optional sp *> dimenarg)) *> - many braced) - return $ '\\' : name' ++ snd rawargs - -lookupListDefault :: (Ord k) => v -> [k] -> M.Map k v -> v -lookupListDefault d = (fromMaybe d .) . lookupList - where - lookupList l m = msum $ map (`M.lookup` m) l - -blockCommand :: PandocMonad m => LP m Blocks -blockCommand = try $ do - name <- anyControlSeq - guard $ name /= "begin" && name /= "end" - star <- option "" (string "*" <* optional sp) - let name' = name ++ star - let raw = do - rawcommand <- getRawCommand name' - transformed <- applyMacros' rawcommand - guard $ transformed /= rawcommand - notFollowedBy $ parseFromString' inlines transformed - parseFromString' blocks transformed - lookupListDefault raw [name',name] blockCommands - -inBrackets :: Inlines -> Inlines -inBrackets x = str "[" <> x <> str "]" - --- eat an optional argument and one or more arguments in braces -ignoreInlines :: PandocMonad m => String -> (String, LP m Inlines) -ignoreInlines name = (name, p) - where - p = do oa <- optargs - let rawCommand = '\\':name ++ oa - let doraw = guardRaw >> return (rawInline "latex" rawCommand) - doraw <|> ignore rawCommand - -guardRaw :: PandocMonad m => LP m () -guardRaw = getOption readerExtensions >>= guard . extensionEnabled Ext_raw_tex - -optargs :: PandocMonad m => LP m String -optargs = snd <$> withRaw (skipopts *> skipMany (try $ optional sp *> braced)) - -ignore :: (Monoid a, PandocMonad m) => String -> ParserT s u m a -ignore raw = do - pos <- getPosition - report $ SkippedContent raw pos - return mempty - -ignoreBlocks :: PandocMonad m => String -> (String, LP m Blocks) -ignoreBlocks name = (name, p) - where - p = do oa <- optargs - let rawCommand = '\\':name ++ oa - let doraw = guardRaw >> return (rawBlock "latex" rawCommand) - doraw <|> ignore rawCommand - -blockCommands :: PandocMonad m => M.Map String (LP m Blocks) -blockCommands = M.fromList $ - [ ("par", mempty <$ skipopts) - , ("parbox", braced >> grouped blocks) - , ("title", mempty <$ (skipopts *> - (grouped inline >>= addMeta "title") - <|> (grouped block >>= addMeta "title"))) - , ("subtitle", mempty <$ (skipopts *> tok >>= addMeta "subtitle")) - , ("author", mempty <$ (skipopts *> authors)) - -- -- in letter class, temp. store address & sig as title, author - , ("address", mempty <$ (skipopts *> tok >>= addMeta "address")) - , ("signature", mempty <$ (skipopts *> authors)) - , ("date", mempty <$ (skipopts *> tok >>= addMeta "date")) - -- Koma-script metadata commands - , ("dedication", mempty <$ (skipopts *> tok >>= addMeta "dedication")) - -- sectioning - , ("part", section nullAttr (-1)) - , ("part*", section nullAttr (-1)) - , ("chapter", section nullAttr 0) - , ("chapter*", section ("",["unnumbered"],[]) 0) - , ("section", section nullAttr 1) - , ("section*", section ("",["unnumbered"],[]) 1) - , ("subsection", section nullAttr 2) - , ("subsection*", section ("",["unnumbered"],[]) 2) - , ("subsubsection", section nullAttr 3) - , ("subsubsection*", section ("",["unnumbered"],[]) 3) - , ("paragraph", section nullAttr 4) - , ("paragraph*", section ("",["unnumbered"],[]) 4) - , ("subparagraph", section nullAttr 5) - , ("subparagraph*", section ("",["unnumbered"],[]) 5) - -- beamer slides - , ("frametitle", section nullAttr 3) - , ("framesubtitle", section nullAttr 4) - -- letters - , ("opening", (para . trimInlines) <$> (skipopts *> tok)) - , ("closing", skipopts *> closing) - -- - , ("hrule", pure horizontalRule) - , ("strut", pure mempty) - , ("rule", skipopts *> tok *> tok *> pure horizontalRule) - , ("item", skipopts *> looseItem) - , ("documentclass", skipopts *> braced *> preamble) - , ("centerline", (para . trimInlines) <$> (skipopts *> tok)) - , ("caption", skipopts *> setCaption) - , ("bibliography", mempty <$ (skipopts *> braced >>= - addMeta "bibliography" . splitBibs)) - , ("addbibresource", mempty <$ (skipopts *> braced >>= - addMeta "bibliography" . splitBibs)) - -- includes - , ("lstinputlisting", inputListing) - , ("graphicspath", graphicsPath) - -- hyperlink - , ("hypertarget", braced >> grouped block) - -- LaTeX colors - , ("textcolor", coloredBlock "color") - , ("colorbox", coloredBlock "background-color") - ] ++ map ignoreBlocks - -- these commands will be ignored unless --parse-raw is specified, - -- in which case they will appear as raw latex blocks - [ "newcommand", "renewcommand", "newenvironment", "renewenvironment" - -- newcommand, etc. should be parsed by macro, but we need this - -- here so these aren't parsed as inline commands to ignore - , "special", "pdfannot", "pdfstringdef" - , "bibliographystyle" - , "maketitle", "makeindex", "makeglossary" - , "addcontentsline", "addtocontents", "addtocounter" - -- \ignore{} is used conventionally in literate haskell for definitions - -- that are to be processed by the compiler but not printed. - , "ignore" - , "hyperdef" - , "markboth", "markright", "markleft" - , "hspace", "vspace" - , "newpage" - , "clearpage" - , "pagebreak" - ] - -coloredBlock :: PandocMonad m => String -> LP m Blocks -coloredBlock stylename = do - skipopts - color <- braced - let constructor = divWith ("",[],[("style",stylename ++ ": " ++ color)]) - inlineContents <|> constructor <$> blockContents - where inlineContents = do - ils <- grouped inline - rest <- inlines - return (para (ils <> rest)) - blockContents = grouped block - -graphicsPath :: PandocMonad m => LP m Blocks -graphicsPath = do - ps <- bgroup *> (manyTill braced egroup) - getResourcePath >>= setResourcePath . (++ ps) - return mempty - -addMeta :: PandocMonad m => ToMetaValue a => String -> a -> LP m () -addMeta field val = updateState $ \st -> - st{ stateMeta = addMetaField field val $ stateMeta st } - -splitBibs :: String -> [Inlines] -splitBibs = map (str . flip replaceExtension "bib" . trim) . splitBy (==',') - -setCaption :: PandocMonad m => LP m Blocks -setCaption = do - ils <- tok - mblabel <- option Nothing $ - try $ spaces' >> controlSeq "label" >> (Just <$> tok) - let ils' = case mblabel of - Just lab -> ils <> spanWith - ("",[],[("data-label", stringify lab)]) mempty - Nothing -> ils - updateState $ \st -> st{ stateCaption = Just ils' } - return mempty - -resetCaption :: PandocMonad m => LP m () -resetCaption = updateState $ \st -> st{ stateCaption = Nothing } - -authors :: PandocMonad m => LP m () -authors = try $ do - bgroup - let oneAuthor = mconcat <$> - many1 (notFollowedBy' (controlSeq "and") >> - (inline <|> mempty <$ blockCommand)) - -- skip e.g. \vspace{10pt} - auths <- sepBy oneAuthor (controlSeq "and") - egroup - addMeta "author" (map trimInlines auths) - -section :: PandocMonad m => Attr -> Int -> LP m Blocks -section (ident, classes, kvs) lvl = do - skipopts - contents <- grouped inline - lab <- option ident $ try (spaces' >> controlSeq "label" >> spaces' >> braced) - attr' <- registerHeader (lab, classes, kvs) contents - return $ headerWith attr' lvl contents - -inlineCommand :: PandocMonad m => LP m Inlines -inlineCommand = try $ do - (name, raw') <- withRaw anyControlSeq - guard $ name /= "begin" && name /= "end" - star <- option "" (string "*") - let name' = name ++ star - let raw = do - guard $ not (isBlockCommand name) - rawargs <- withRaw - (skipangles *> skipopts *> option "" dimenarg *> many braced) - let rawcommand = raw' ++ star ++ snd rawargs - transformed <- applyMacros' rawcommand - exts <- getOption readerExtensions - if transformed /= rawcommand - then parseFromString' inlines transformed - else if extensionEnabled Ext_raw_tex exts - then return $ rawInline "latex" rawcommand - else ignore rawcommand - (lookupListDefault raw [name',name] inlineCommands <* - optional (try (string "{}"))) - -rawInlineOr :: PandocMonad m => String -> LP m Inlines -> LP m Inlines -rawInlineOr name' fallback = do - parseRaw <- extensionEnabled Ext_raw_tex <$> getOption readerExtensions - if parseRaw - then rawInline "latex" <$> getRawCommand name' - else fallback - -isBlockCommand :: String -> Bool -isBlockCommand s = s `M.member` (blockCommands :: M.Map String (LP PandocPure Blocks)) - - -inlineEnvironments :: PandocMonad m => M.Map String (LP m Inlines) -inlineEnvironments = M.fromList - [ ("displaymath", mathEnvWith id Nothing "displaymath") - , ("math", math <$> mathEnv "math") - , ("equation", mathEnvWith id Nothing "equation") - , ("equation*", mathEnvWith id Nothing "equation*") - , ("gather", mathEnvWith id (Just "gathered") "gather") - , ("gather*", mathEnvWith id (Just "gathered") "gather*") - , ("multline", mathEnvWith id (Just "gathered") "multline") - , ("multline*", mathEnvWith id (Just "gathered") "multline*") - , ("eqnarray", mathEnvWith id (Just "aligned") "eqnarray") - , ("eqnarray*", mathEnvWith id (Just "aligned") "eqnarray*") - , ("align", mathEnvWith id (Just "aligned") "align") - , ("align*", mathEnvWith id (Just "aligned") "align*") - , ("alignat", mathEnvWith id (Just "aligned") "alignat") - , ("alignat*", mathEnvWith id (Just "aligned") "alignat*") - ] - -inlineCommands :: PandocMonad m => M.Map String (LP m Inlines) -inlineCommands = M.fromList $ - [ ("emph", extractSpaces emph <$> tok) - , ("textit", extractSpaces emph <$> tok) - , ("textsl", extractSpaces emph <$> tok) - , ("textsc", extractSpaces smallcaps <$> tok) - , ("textsf", extractSpaces (spanWith ("",["sans-serif"],[])) <$> tok) - , ("textmd", extractSpaces (spanWith ("",["medium"],[])) <$> tok) - , ("textrm", extractSpaces (spanWith ("",["roman"],[])) <$> tok) - , ("textup", extractSpaces (spanWith ("",["upright"],[])) <$> tok) - , ("texttt", ttfamily) - , ("sout", extractSpaces strikeout <$> tok) - , ("textsuperscript", extractSpaces superscript <$> tok) - , ("textsubscript", extractSpaces subscript <$> tok) - , ("textbackslash", lit "\\") - , ("backslash", lit "\\") - , ("slash", lit "/") - , ("textbf", extractSpaces strong <$> tok) - , ("textnormal", extractSpaces (spanWith ("",["nodecor"],[])) <$> tok) - , ("ldots", lit "…") - , ("vdots", lit "\8942") - , ("dots", lit "…") - , ("mdots", lit "…") - , ("sim", lit "~") - , ("label", rawInlineOr "label" (inBrackets <$> tok)) - , ("ref", rawInlineOr "ref" (inBrackets <$> tok)) - , ("textgreek", tok) - , ("sep", lit ",") - , ("cref", rawInlineOr "cref" (inBrackets <$> tok)) -- from cleveref.sty - , ("(", mathInline $ manyTill anyChar (try $ string "\\)")) - , ("[", mathDisplay $ manyTill anyChar (try $ string "\\]")) - , ("ensuremath", mathInline braced) - , ("texorpdfstring", (\_ x -> x) <$> tok <*> tok) - , ("P", lit "¶") - , ("S", lit "§") - , ("$", lit "$") - , ("%", lit "%") - , ("&", lit "&") - , ("#", lit "#") - , ("_", lit "_") - , ("{", lit "{") - , ("}", lit "}") - -- old TeX commands - , ("em", extractSpaces emph <$> inlines) - , ("it", extractSpaces emph <$> inlines) - , ("sl", extractSpaces emph <$> inlines) - , ("bf", extractSpaces strong <$> inlines) - , ("rm", inlines) - , ("itshape", extractSpaces emph <$> inlines) - , ("slshape", extractSpaces emph <$> inlines) - , ("scshape", extractSpaces smallcaps <$> inlines) - , ("bfseries", extractSpaces strong <$> inlines) - , ("/", pure mempty) -- italic correction - , ("aa", lit "å") - , ("AA", lit "Å") - , ("ss", lit "ß") - , ("o", lit "ø") - , ("O", lit "Ø") - , ("L", lit "Ł") - , ("l", lit "ł") - , ("ae", lit "æ") - , ("AE", lit "Æ") - , ("oe", lit "œ") - , ("OE", lit "Œ") - , ("pounds", lit "£") - , ("euro", lit "€") - , ("copyright", lit "©") - , ("textasciicircum", lit "^") - , ("textasciitilde", lit "~") - , ("H", try $ tok >>= accent hungarumlaut) - , ("`", option (str "`") $ try $ tok >>= accent grave) - , ("'", option (str "'") $ try $ tok >>= accent acute) - , ("^", option (str "^") $ try $ tok >>= accent circ) - , ("~", option (str "~") $ try $ tok >>= accent tilde) - , ("\"", option (str "\"") $ try $ tok >>= accent umlaut) - , (".", option (str ".") $ try $ tok >>= accent dot) - , ("=", option (str "=") $ try $ tok >>= accent macron) - , ("c", option (str "c") $ try $ tok >>= accent cedilla) - , ("v", option (str "v") $ try $ tok >>= accent hacek) - , ("u", option (str "u") $ try $ tok >>= accent breve) - , ("i", lit "i") - , ("\\", linebreak <$ (optional (bracketed inline) *> spaces')) - , (",", lit "\8198") - , ("@", pure mempty) - , (" ", lit "\160") - , ("ps", pure $ str "PS." <> space) - , ("TeX", lit "TeX") - , ("LaTeX", lit "LaTeX") - , ("bar", lit "|") - , ("textless", lit "<") - , ("textgreater", lit ">") - , ("thanks", note <$> grouped block) - , ("footnote", note <$> grouped block) - , ("verb", doverb) - , ("lstinline", dolstinline) - , ("Verb", doverb) - , ("url", (unescapeURL <$> braced) >>= \url -> - pure (link url "" (str url))) - , ("href", (unescapeURL <$> braced <* optional sp) >>= \url -> - tok >>= \lab -> - pure (link url "" lab)) - , ("includegraphics", do options <- option [] keyvals - src <- unescapeURL . removeDoubleQuotes <$> braced - mkImage options src) - , ("enquote", enquote) - , ("cite", citation "cite" NormalCitation False) - , ("Cite", citation "Cite" NormalCitation False) - , ("citep", citation "citep" NormalCitation False) - , ("citep*", citation "citep*" NormalCitation False) - , ("citeal", citation "citeal" NormalCitation False) - , ("citealp", citation "citealp" NormalCitation False) - , ("citealp*", citation "citealp*" NormalCitation False) - , ("autocite", citation "autocite" NormalCitation False) - , ("smartcite", citation "smartcite" NormalCitation False) - , ("footcite", inNote <$> citation "footcite" NormalCitation False) - , ("parencite", citation "parencite" NormalCitation False) - , ("supercite", citation "supercite" NormalCitation False) - , ("footcitetext", inNote <$> citation "footcitetext" NormalCitation False) - , ("citeyearpar", citation "citeyearpar" SuppressAuthor False) - , ("citeyear", citation "citeyear" SuppressAuthor False) - , ("autocite*", citation "autocite*" SuppressAuthor False) - , ("cite*", citation "cite*" SuppressAuthor False) - , ("parencite*", citation "parencite*" SuppressAuthor False) - , ("textcite", citation "textcite" AuthorInText False) - , ("citet", citation "citet" AuthorInText False) - , ("citet*", citation "citet*" AuthorInText False) - , ("citealt", citation "citealt" AuthorInText False) - , ("citealt*", citation "citealt*" AuthorInText False) - , ("textcites", citation "textcites" AuthorInText True) - , ("cites", citation "cites" NormalCitation True) - , ("autocites", citation "autocites" NormalCitation True) - , ("footcites", inNote <$> citation "footcites" NormalCitation True) - , ("parencites", citation "parencites" NormalCitation True) - , ("supercites", citation "supercites" NormalCitation True) - , ("footcitetexts", inNote <$> citation "footcitetexts" NormalCitation True) - , ("Autocite", citation "Autocite" NormalCitation False) - , ("Smartcite", citation "Smartcite" NormalCitation False) - , ("Footcite", citation "Footcite" NormalCitation False) - , ("Parencite", citation "Parencite" NormalCitation False) - , ("Supercite", citation "Supercite" NormalCitation False) - , ("Footcitetext", inNote <$> citation "Footcitetext" NormalCitation False) - , ("Citeyearpar", citation "Citeyearpar" SuppressAuthor False) - , ("Citeyear", citation "Citeyear" SuppressAuthor False) - , ("Autocite*", citation "Autocite*" SuppressAuthor False) - , ("Cite*", citation "Cite*" SuppressAuthor False) - , ("Parencite*", citation "Parencite*" SuppressAuthor False) - , ("Textcite", citation "Textcite" AuthorInText False) - , ("Textcites", citation "Textcites" AuthorInText True) - , ("Cites", citation "Cites" NormalCitation True) - , ("Autocites", citation "Autocites" NormalCitation True) - , ("Footcites", citation "Footcites" NormalCitation True) - , ("Parencites", citation "Parencites" NormalCitation True) - , ("Supercites", citation "Supercites" NormalCitation True) - , ("Footcitetexts", inNote <$> citation "Footcitetexts" NormalCitation True) - , ("citetext", complexNatbibCitation NormalCitation) - , ("citeauthor", (try (tok *> optional sp *> controlSeq "citetext") *> - complexNatbibCitation AuthorInText) - <|> citation "citeauthor" AuthorInText False) - , ("nocite", mempty <$ (citation "nocite" NormalCitation False >>= - addMeta "nocite")) - , ("hypertarget", braced >> tok) - -- siuntix - , ("SI", dosiunitx) - -- hyphenat - , ("bshyp", lit "\\\173") - , ("fshyp", lit "/\173") - , ("dothyp", lit ".\173") - , ("colonhyp", lit ":\173") - , ("hyp", lit "-") - , ("nohyphens", tok) - , ("textnhtt", ttfamily) - , ("nhttfamily", ttfamily) - -- LaTeX colors - , ("textcolor", coloredInline "color") - , ("colorbox", coloredInline "background-color") - -- fontawesome - , ("faCheck", lit "\10003") - , ("faClose", lit "\10007") - ] ++ map ignoreInlines - -- these commands will be ignored unless --parse-raw is specified, - -- in which case they will appear as raw latex blocks: - [ "index" - , "hspace" - , "vspace" - , "newpage" - , "clearpage" - , "pagebreak" - ] - -coloredInline :: PandocMonad m => String -> LP m Inlines -coloredInline stylename = do - skipopts - color <- braced - spanWith ("",[],[("style",stylename ++ ": " ++ color)]) <$> tok - -ttfamily :: PandocMonad m => LP m Inlines -ttfamily = (code . stringify . toList) <$> tok +doLHSverb :: PandocMonad m => LP m Inlines +doLHSverb = + (codeWith ("",["haskell"],[]) . T.unpack . untokenize) + <$> manyTill (satisfyTok (not . isNewlineTok)) (symbol '|') mkImage :: PandocMonad m => [(String, String)] -> String -> LP m Inlines mkImage options src = do - let replaceTextwidth (k,v) = case numUnit v of - Just (num, "\\textwidth") -> (k, showFl (num * 100) ++ "%") - _ -> (k, v) - let kvs = map replaceTextwidth $ filter (\(k,_) -> k `elem` ["width", "height"]) options + let replaceTextwidth (k,v) = + case numUnit v of + Just (num, "\\textwidth") -> (k, showFl (num * 100) ++ "%") + _ -> (k, v) + let kvs = map replaceTextwidth + $ filter (\(k,_) -> k `elem` ["width", "height"]) options let attr = ("",[], kvs) let alt = str "image" case takeExtension src of @@ -737,56 +607,131 @@ mkImage options src = do return $ imageWith attr (addExtension src defaultExt) "" alt _ -> return $ imageWith attr src "" alt -inNote :: Inlines -> Inlines -inNote ils = - note $ para $ ils <> str "." +-- converts e.g. \SI{1}[\$]{} to "$ 1" or \SI{1}{\euro} to "1 €" +dosiunitx :: PandocMonad m => LP m Inlines +dosiunitx = do + skipopts + value <- tok + valueprefix <- option "" $ bracketed tok + unit <- tok + let emptyOr160 "" = "" + emptyOr160 _ = "\160" + return . mconcat $ [valueprefix, + emptyOr160 valueprefix, + value, + emptyOr160 unit, + unit] -unescapeURL :: String -> String -unescapeURL ('\\':x:xs) | isEscapable x = x:unescapeURL xs - where isEscapable c = c `elem` ("#$%&~_^\\{}" :: String) -unescapeURL (x:xs) = x:unescapeURL xs -unescapeURL [] = "" +lit :: String -> LP m Inlines +lit = pure . str + +removeDoubleQuotes :: Text -> Text +removeDoubleQuotes t = + maybe t id $ T.stripPrefix "\"" t >>= T.stripSuffix "\"" + +doubleQuote :: PandocMonad m => LP m Inlines +doubleQuote = do + quoted' doubleQuoted (try $ count 2 $ symbol '`') + (void $ try $ count 2 $ symbol '\'') + <|> quoted' doubleQuoted ((:[]) <$> symbol '“') (void $ symbol '”') + -- the following is used by babel for localized quotes: + <|> quoted' doubleQuoted (try $ sequence [symbol '"', symbol '`']) + (void $ try $ sequence [symbol '"', symbol '\'']) + <|> quoted' doubleQuoted ((:[]) <$> symbol '"') + (void $ symbol '"') + +singleQuote :: PandocMonad m => LP m Inlines +singleQuote = do + quoted' singleQuoted ((:[]) <$> symbol '`') + (try $ symbol '\'' >> + notFollowedBy (satisfyTok startsWithLetter)) + <|> quoted' singleQuoted ((:[]) <$> symbol '‘') + (try $ symbol '’' >> + notFollowedBy (satisfyTok startsWithLetter)) + where startsWithLetter (Tok _ Word t) = + case T.uncons t of + Just (c, _) | isLetter c -> True + _ -> False + startsWithLetter _ = False + +quoted' :: PandocMonad m + => (Inlines -> Inlines) + -> LP m [Tok] + -> LP m () + -> LP m Inlines +quoted' f starter ender = do + startchs <- (T.unpack . untokenize) <$> starter + smart <- extensionEnabled Ext_smart <$> getOption readerExtensions + if smart + then do + ils <- many (notFollowedBy ender >> inline) + (ender >> return (f (mconcat ils))) <|> + (<> mconcat ils) <$> + lit (case startchs of + "``" -> "“" + "`" -> "‘" + cs -> cs) + else lit startchs enquote :: PandocMonad m => LP m Inlines enquote = do skipopts - context <- stateQuoteContext <$> getState - if context == InDoubleQuote + quoteContext <- sQuoteContext <$> getState + if quoteContext == InDoubleQuote then singleQuoted <$> withQuoteContext InSingleQuote tok else doubleQuoted <$> withQuoteContext InDoubleQuote tok doverb :: PandocMonad m => LP m Inlines doverb = do - marker <- anyChar - code <$> manyTill (satisfy (/='\n')) (char marker) + Tok _ Symbol t <- anySymbol + marker <- case T.uncons t of + Just (c, ts) | T.null ts -> return c + _ -> mzero + withVerbatimMode $ + (code . T.unpack . untokenize) <$> + manyTill (verbTok marker) (symbol marker) + +verbTok :: PandocMonad m => Char -> LP m Tok +verbTok stopchar = do + t@(Tok (lin, col) toktype txt) <- satisfyTok (not . isNewlineTok) + case T.findIndex (== stopchar) txt of + Nothing -> return t + Just i -> do + let (t1, t2) = T.splitAt i txt + inp <- getInput + setInput $ Tok (lin, col + i) Symbol (T.singleton stopchar) + : (totoks (lin, col + i + 1) (T.drop 1 t2)) ++ inp + return $ Tok (lin, col) toktype t1 dolstinline :: PandocMonad m => LP m Inlines dolstinline = do options <- option [] keyvals let classes = maybeToList $ lookup "language" options >>= fromListingsLanguage - marker <- char '{' <|> anyChar - codeWith ("",classes,[]) <$> manyTill (satisfy (/='\n')) (char '}' <|> char marker) + Tok _ Symbol t <- anySymbol + marker <- case T.uncons t of + Just (c, ts) | T.null ts -> return c + _ -> mzero + let stopchar = if marker == '{' then '}' else marker + withVerbatimMode $ + (codeWith ("",classes,[]) . T.unpack . untokenize) <$> + manyTill (verbTok stopchar) (symbol stopchar) -doLHSverb :: PandocMonad m => LP m Inlines -doLHSverb = codeWith ("",["haskell"],[]) <$> manyTill (satisfy (/='\n')) (char '|') - --- converts e.g. \SI{1}[\$]{} to "$ 1" or \SI{1}{\euro} to "1 €" -dosiunitx :: PandocMonad m => LP m Inlines -dosiunitx = do - skipopts - value <- tok - valueprefix <- option "" $ char '[' >> (mconcat <$> manyTill tok (char ']')) - unit <- tok - let emptyOr160 "" = "" - emptyOr160 _ = "\160" - return . mconcat $ [valueprefix, - emptyOr160 valueprefix, - value, - emptyOr160 unit, - unit] +keyval :: PandocMonad m => LP m (String, String) +keyval = try $ do + Tok _ Word key <- satisfyTok isWordTok + let isSpecSym (Tok _ Symbol t) = t `elem` [".",":","-","|","\\"] + isSpecSym _ = False + val <- option [] $ do + symbol '=' + braced <|> (many1 (satisfyTok isWordTok <|> satisfyTok isSpecSym + <|> anyControlSeq)) + optional sp + optional (symbol ',') + optional sp + return (T.unpack key, T.unpack . untokenize $ val) -lit :: String -> LP m Inlines -lit = pure . str +keyvals :: PandocMonad m => LP m [(String, String)] +keyvals = try $ symbol '[' >> manyTill keyval (symbol ']') accent :: (Char -> String) -> Inlines -> LP m Inlines accent f ils = @@ -994,18 +939,149 @@ breve 'U' = "Ŭ" breve 'u' = "ŭ" breve c = [c] +toksToString :: [Tok] -> String +toksToString = T.unpack . untokenize + +mathDisplay :: String -> Inlines +mathDisplay = displayMath . trim + +mathInline :: String -> Inlines +mathInline = math . trim + +dollarsMath :: PandocMonad m => LP m Inlines +dollarsMath = do + symbol '$' + display <- option False (True <$ symbol '$') + contents <- trim . toksToString <$> + many (notFollowedBy (symbol '$') >> anyTok) + if display + then do + mathDisplay contents <$ try (symbol '$' >> symbol '$') + <|> (guard (null contents) >> return (mathInline "")) + else mathInline contents <$ (symbol '$') + +-- citations + +addPrefix :: [Inline] -> [Citation] -> [Citation] +addPrefix p (k:ks) = k {citationPrefix = p ++ citationPrefix k} : ks +addPrefix _ _ = [] + +addSuffix :: [Inline] -> [Citation] -> [Citation] +addSuffix s ks@(_:_) = + let k = last ks + in init ks ++ [k {citationSuffix = citationSuffix k ++ s}] +addSuffix _ _ = [] + +simpleCiteArgs :: PandocMonad m => LP m [Citation] +simpleCiteArgs = try $ do + first <- optionMaybe $ toList <$> opt + second <- optionMaybe $ toList <$> opt + keys <- try $ bgroup *> (manyTill citationLabel egroup) + let (pre, suf) = case (first , second ) of + (Just s , Nothing) -> (mempty, s ) + (Just s , Just t ) -> (s , t ) + _ -> (mempty, mempty) + conv k = Citation { citationId = k + , citationPrefix = [] + , citationSuffix = [] + , citationMode = NormalCitation + , citationHash = 0 + , citationNoteNum = 0 + } + return $ addPrefix pre $ addSuffix suf $ map conv keys + +citationLabel :: PandocMonad m => LP m String +citationLabel = do + optional sp + toksToString <$> + (many1 (satisfyTok isWordTok <|> symbolIn bibtexKeyChar) + <* optional sp + <* optional (symbol ',') + <* optional sp) + where bibtexKeyChar = ".:;?!`'()/*@_+=-[]" :: [Char] + +cites :: PandocMonad m => CitationMode -> Bool -> LP m [Citation] +cites mode multi = try $ do + cits <- if multi + then many1 simpleCiteArgs + else count 1 simpleCiteArgs + let cs = concat cits + return $ case mode of + AuthorInText -> case cs of + (c:rest) -> c {citationMode = mode} : rest + [] -> [] + _ -> map (\a -> a {citationMode = mode}) cs + +citation :: PandocMonad m => String -> CitationMode -> Bool -> LP m Inlines +citation name mode multi = do + (c,raw) <- withRaw $ cites mode multi + return $ cite c (rawInline "latex" $ "\\" ++ name ++ (toksToString raw)) + +handleCitationPart :: Inlines -> [Citation] +handleCitationPart ils = + let isCite Cite{} = True + isCite _ = False + (pref, rest) = break isCite (toList ils) + in case rest of + (Cite cs _:suff) -> addPrefix pref $ addSuffix suff cs + _ -> [] + +complexNatbibCitation :: PandocMonad m => CitationMode -> LP m Inlines +complexNatbibCitation mode = try $ do + (cs, raw) <- + withRaw $ concat <$> do + bgroup + items <- mconcat <$> + many1 (notFollowedBy (symbol ';') >> inline) + `sepBy1` (symbol ';') + egroup + return $ map handleCitationPart items + case cs of + [] -> mzero + (c:cits) -> return $ cite (c{ citationMode = mode }:cits) + (rawInline "latex" $ "\\citetext" ++ toksToString raw) + +inNote :: Inlines -> Inlines +inNote ils = + note $ para $ ils <> str "." + +inlineCommand' :: PandocMonad m => LP m Inlines +inlineCommand' = try $ do + Tok _ (CtrlSeq name) cmd <- anyControlSeq + guard $ name /= "begin" && name /= "end" + star <- option "" ("*" <$ symbol '*' <* optional sp) + let name' = name <> star + let names = ordNub [name', name] -- check non-starred as fallback + let raw = do + guard $ isInlineCommand name || not (isBlockCommand name) + rawcommand <- getRawCommand (cmd <> star) + (guardEnabled Ext_raw_tex >> return (rawInline "latex" rawcommand)) + <|> ignore rawcommand + lookupListDefault raw names inlineCommands + tok :: PandocMonad m => LP m Inlines -tok = try $ grouped inline <|> inlineCommand <|> str <$> count 1 inlineChar +tok = grouped inline <|> inlineCommand' <|> singleChar + where singleChar = try $ do + Tok (lin,col) toktype t <- satisfyTok (tokTypeIn [Word, Symbol]) + guard $ not $ toktype == Symbol && + T.any (`Set.member` specialChars) t + if T.length t > 1 + then do + let (t1, t2) = (T.take 1 t, T.drop 1 t) + inp <- getInput + setInput $ (Tok (lin, col + 1) toktype t2) : inp + return $ str (T.unpack t1) + else return $ str (T.unpack t) opt :: PandocMonad m => LP m Inlines opt = bracketed inline -rawopt :: PandocMonad m => LP m String +rawopt :: PandocMonad m => LP m Text rawopt = do - contents <- bracketed (many1 (noneOf "[]") <|> try (string "\\]") <|> - try (string "\\[") <|> rawopt) + symbol '[' + inner <- untokenize <$> manyTill anyTok (symbol ']') optional sp - return $ "[" ++ contents ++ "]" + return $ "[" <> inner <> "]" skipopts :: PandocMonad m => LP m () skipopts = skipMany rawopt @@ -1013,58 +1089,719 @@ skipopts = skipMany rawopt -- opts in angle brackets are used in beamer rawangle :: PandocMonad m => LP m () rawangle = try $ do - char '<' - skipMany (noneOf ">") - char '>' - return () + symbol '<' + () <$ manyTill anyTok (symbol '>') skipangles :: PandocMonad m => LP m () skipangles = skipMany rawangle -inlineText :: PandocMonad m => LP m Inlines -inlineText = str <$> many1 inlineChar +ignore :: (Monoid a, PandocMonad m) => String -> ParserT s u m a +ignore raw = do + pos <- getPosition + report $ SkippedContent raw pos + return mempty + +withRaw :: PandocMonad m => LP m a -> LP m (a, [Tok]) +withRaw parser = do + inp <- getInput + result <- parser + nxt <- option (Tok (0,0) Word "") (lookAhead anyTok) + let raw = takeWhile (/= nxt) inp + return (result, raw) + +inBrackets :: Inlines -> Inlines +inBrackets x = str "[" <> x <> str "]" + +unescapeURL :: String -> String +unescapeURL ('\\':x:xs) | isEscapable x = x:unescapeURL xs + where isEscapable c = c `elem` ("#$%&~_^\\{}" :: String) +unescapeURL (x:xs) = x:unescapeURL xs +unescapeURL [] = "" + +mathEnvWith :: PandocMonad m + => (Inlines -> a) -> Maybe Text -> Text -> LP m a +mathEnvWith f innerEnv name = f . mathDisplay . inner <$> mathEnv name + where inner x = case innerEnv of + Nothing -> x + Just y -> "\\begin{" ++ T.unpack y ++ "}\n" ++ x ++ + "\\end{" ++ T.unpack y ++ "}" + +mathEnv :: PandocMonad m => Text -> LP m String +mathEnv name = do + skipopts + optional blankline + res <- manyTill anyTok (end_ name) + return $ stripTrailingNewlines $ T.unpack $ untokenize res + +inlineEnvironment :: PandocMonad m => LP m Inlines +inlineEnvironment = try $ do + controlSeq "begin" + name <- untokenize <$> braced + M.findWithDefault mzero name inlineEnvironments + +inlineEnvironments :: PandocMonad m => M.Map Text (LP m Inlines) +inlineEnvironments = M.fromList [ + ("displaymath", mathEnvWith id Nothing "displaymath") + , ("math", math <$> mathEnv "math") + , ("equation", mathEnvWith id Nothing "equation") + , ("equation*", mathEnvWith id Nothing "equation*") + , ("gather", mathEnvWith id (Just "gathered") "gather") + , ("gather*", mathEnvWith id (Just "gathered") "gather*") + , ("multline", mathEnvWith id (Just "gathered") "multline") + , ("multline*", mathEnvWith id (Just "gathered") "multline*") + , ("eqnarray", mathEnvWith id (Just "aligned") "eqnarray") + , ("eqnarray*", mathEnvWith id (Just "aligned") "eqnarray*") + , ("align", mathEnvWith id (Just "aligned") "align") + , ("align*", mathEnvWith id (Just "aligned") "align*") + , ("alignat", mathEnvWith id (Just "aligned") "alignat") + , ("alignat*", mathEnvWith id (Just "aligned") "alignat*") + ] + +inlineCommands :: PandocMonad m => M.Map Text (LP m Inlines) +inlineCommands = M.fromList $ + [ ("emph", extractSpaces emph <$> tok) + , ("textit", extractSpaces emph <$> tok) + , ("textsl", extractSpaces emph <$> tok) + , ("textsc", extractSpaces smallcaps <$> tok) + , ("textsf", extractSpaces (spanWith ("",["sans-serif"],[])) <$> tok) + , ("textmd", extractSpaces (spanWith ("",["medium"],[])) <$> tok) + , ("textrm", extractSpaces (spanWith ("",["roman"],[])) <$> tok) + , ("textup", extractSpaces (spanWith ("",["upright"],[])) <$> tok) + , ("texttt", ttfamily) + , ("sout", extractSpaces strikeout <$> tok) + , ("textsuperscript", extractSpaces superscript <$> tok) + , ("textsubscript", extractSpaces subscript <$> tok) + , ("textbackslash", lit "\\") + , ("backslash", lit "\\") + , ("slash", lit "/") + , ("textbf", extractSpaces strong <$> tok) + , ("textnormal", extractSpaces (spanWith ("",["nodecor"],[])) <$> tok) + , ("ldots", lit "…") + , ("vdots", lit "\8942") + , ("dots", lit "…") + , ("mdots", lit "…") + , ("sim", lit "~") + , ("label", rawInlineOr "label" (inBrackets <$> tok)) + , ("ref", rawInlineOr "ref" (inBrackets <$> tok)) + , ("textgreek", tok) + , ("sep", lit ",") + , ("cref", rawInlineOr "cref" (inBrackets <$> tok)) -- from cleveref.sty + , ("(", mathInline . toksToString <$> manyTill anyTok (controlSeq ")")) + , ("[", mathDisplay . toksToString <$> manyTill anyTok (controlSeq "]")) + , ("ensuremath", mathInline . toksToString <$> braced) + , ("texorpdfstring", (\_ x -> x) <$> tok <*> tok) + , ("P", lit "¶") + , ("S", lit "§") + , ("$", lit "$") + , ("%", lit "%") + , ("&", lit "&") + , ("#", lit "#") + , ("_", lit "_") + , ("{", lit "{") + , ("}", lit "}") + -- old TeX commands + , ("em", extractSpaces emph <$> inlines) + , ("it", extractSpaces emph <$> inlines) + , ("sl", extractSpaces emph <$> inlines) + , ("bf", extractSpaces strong <$> inlines) + , ("rm", inlines) + , ("itshape", extractSpaces emph <$> inlines) + , ("slshape", extractSpaces emph <$> inlines) + , ("scshape", extractSpaces smallcaps <$> inlines) + , ("bfseries", extractSpaces strong <$> inlines) + , ("/", pure mempty) -- italic correction + , ("aa", lit "å") + , ("AA", lit "Å") + , ("ss", lit "ß") + , ("o", lit "ø") + , ("O", lit "Ø") + , ("L", lit "Ł") + , ("l", lit "ł") + , ("ae", lit "æ") + , ("AE", lit "Æ") + , ("oe", lit "œ") + , ("OE", lit "Œ") + , ("pounds", lit "£") + , ("euro", lit "€") + , ("copyright", lit "©") + , ("textasciicircum", lit "^") + , ("textasciitilde", lit "~") + , ("H", try $ tok >>= accent hungarumlaut) + , ("`", option (str "`") $ try $ tok >>= accent grave) + , ("'", option (str "'") $ try $ tok >>= accent acute) + , ("^", option (str "^") $ try $ tok >>= accent circ) + , ("~", option (str "~") $ try $ tok >>= accent tilde) + , ("\"", option (str "\"") $ try $ tok >>= accent umlaut) + , (".", option (str ".") $ try $ tok >>= accent dot) + , ("=", option (str "=") $ try $ tok >>= accent macron) + , ("c", option (str "c") $ try $ tok >>= accent cedilla) + , ("v", option (str "v") $ try $ tok >>= accent hacek) + , ("u", option (str "u") $ try $ tok >>= accent breve) + , ("i", lit "i") + , ("\\", linebreak <$ (do inTableCell <- sInTableCell <$> getState + guard $ not inTableCell + optional (bracketed inline) + spaces)) + , (",", lit "\8198") + , ("@", pure mempty) + , (" ", lit "\160") + , ("ps", pure $ str "PS." <> space) + , ("TeX", lit "TeX") + , ("LaTeX", lit "LaTeX") + , ("bar", lit "|") + , ("textless", lit "<") + , ("textgreater", lit ">") + , ("thanks", note <$> grouped block) + , ("footnote", note <$> grouped block) + , ("verb", doverb) + , ("lstinline", dolstinline) + , ("Verb", doverb) + , ("url", ((unescapeURL . T.unpack . untokenize) <$> braced) >>= \url -> + pure (link url "" (str url))) + , ("href", (unescapeURL . toksToString <$> + braced <* optional sp) >>= \url -> + tok >>= \lab -> pure (link url "" lab)) + , ("includegraphics", do options <- option [] keyvals + src <- unescapeURL . T.unpack . + removeDoubleQuotes . untokenize <$> braced + mkImage options src) + , ("enquote", enquote) + , ("cite", citation "cite" NormalCitation False) + , ("Cite", citation "Cite" NormalCitation False) + , ("citep", citation "citep" NormalCitation False) + , ("citep*", citation "citep*" NormalCitation False) + , ("citeal", citation "citeal" NormalCitation False) + , ("citealp", citation "citealp" NormalCitation False) + , ("citealp*", citation "citealp*" NormalCitation False) + , ("autocite", citation "autocite" NormalCitation False) + , ("smartcite", citation "smartcite" NormalCitation False) + , ("footcite", inNote <$> citation "footcite" NormalCitation False) + , ("parencite", citation "parencite" NormalCitation False) + , ("supercite", citation "supercite" NormalCitation False) + , ("footcitetext", inNote <$> citation "footcitetext" NormalCitation False) + , ("citeyearpar", citation "citeyearpar" SuppressAuthor False) + , ("citeyear", citation "citeyear" SuppressAuthor False) + , ("autocite*", citation "autocite*" SuppressAuthor False) + , ("cite*", citation "cite*" SuppressAuthor False) + , ("parencite*", citation "parencite*" SuppressAuthor False) + , ("textcite", citation "textcite" AuthorInText False) + , ("citet", citation "citet" AuthorInText False) + , ("citet*", citation "citet*" AuthorInText False) + , ("citealt", citation "citealt" AuthorInText False) + , ("citealt*", citation "citealt*" AuthorInText False) + , ("textcites", citation "textcites" AuthorInText True) + , ("cites", citation "cites" NormalCitation True) + , ("autocites", citation "autocites" NormalCitation True) + , ("footcites", inNote <$> citation "footcites" NormalCitation True) + , ("parencites", citation "parencites" NormalCitation True) + , ("supercites", citation "supercites" NormalCitation True) + , ("footcitetexts", inNote <$> citation "footcitetexts" NormalCitation True) + , ("Autocite", citation "Autocite" NormalCitation False) + , ("Smartcite", citation "Smartcite" NormalCitation False) + , ("Footcite", citation "Footcite" NormalCitation False) + , ("Parencite", citation "Parencite" NormalCitation False) + , ("Supercite", citation "Supercite" NormalCitation False) + , ("Footcitetext", inNote <$> citation "Footcitetext" NormalCitation False) + , ("Citeyearpar", citation "Citeyearpar" SuppressAuthor False) + , ("Citeyear", citation "Citeyear" SuppressAuthor False) + , ("Autocite*", citation "Autocite*" SuppressAuthor False) + , ("Cite*", citation "Cite*" SuppressAuthor False) + , ("Parencite*", citation "Parencite*" SuppressAuthor False) + , ("Textcite", citation "Textcite" AuthorInText False) + , ("Textcites", citation "Textcites" AuthorInText True) + , ("Cites", citation "Cites" NormalCitation True) + , ("Autocites", citation "Autocites" NormalCitation True) + , ("Footcites", citation "Footcites" NormalCitation True) + , ("Parencites", citation "Parencites" NormalCitation True) + , ("Supercites", citation "Supercites" NormalCitation True) + , ("Footcitetexts", inNote <$> citation "Footcitetexts" NormalCitation True) + , ("citetext", complexNatbibCitation NormalCitation) + , ("citeauthor", (try (tok *> optional sp *> controlSeq "citetext") *> + complexNatbibCitation AuthorInText) + <|> citation "citeauthor" AuthorInText False) + , ("nocite", mempty <$ (citation "nocite" NormalCitation False >>= + addMeta "nocite")) + , ("hypertarget", braced >> tok) + -- siuntix + , ("SI", dosiunitx) + -- hyphenat + , ("bshyp", lit "\\\173") + , ("fshyp", lit "/\173") + , ("dothyp", lit ".\173") + , ("colonhyp", lit ":\173") + , ("hyp", lit "-") + , ("nohyphens", tok) + , ("textnhtt", ttfamily) + , ("nhttfamily", ttfamily) + -- LaTeX colors + , ("textcolor", coloredInline "color") + , ("colorbox", coloredInline "background-color") + -- fontawesome + , ("faCheck", lit "\10003") + , ("faClose", lit "\10007") + ] + +coloredInline :: PandocMonad m => String -> LP m Inlines +coloredInline stylename = do + skipopts + color <- braced + spanWith ("",[],[("style",stylename ++ ": " ++ toksToString color)]) <$> tok + +ttfamily :: PandocMonad m => LP m Inlines +ttfamily = (code . stringify . toList) <$> tok + +rawInlineOr :: PandocMonad m => Text -> LP m Inlines -> LP m Inlines +rawInlineOr name' fallback = do + parseRaw <- extensionEnabled Ext_raw_tex <$> getOption readerExtensions + if parseRaw + then rawInline "latex" <$> getRawCommand name' + else fallback + +getRawCommand :: PandocMonad m => Text -> LP m String +getRawCommand txt = do + (_, rawargs) <- withRaw + ((if txt == "\\write" + then () <$ satisfyTok isWordTok -- digits + else return ()) *> + skipangles *> + skipopts *> + option "" (try (optional sp *> dimenarg)) *> + many braced) + return $ T.unpack (txt <> untokenize rawargs) + +isBlockCommand :: Text -> Bool +isBlockCommand s = + s `M.member` (blockCommands :: M.Map Text (LP PandocPure Blocks)) + || s `Set.member` treatAsBlock + +treatAsBlock :: Set.Set Text +treatAsBlock = Set.fromList + [ "newcommand", "renewcommand" + , "newenvironment", "renewenvironment" + , "providecommand", "provideenvironment" + -- newcommand, etc. should be parsed by macroDef, but we need this + -- here so these aren't parsed as inline commands to ignore + , "special", "pdfannot", "pdfstringdef" + , "bibliographystyle" + , "maketitle", "makeindex", "makeglossary" + , "addcontentsline", "addtocontents", "addtocounter" + -- \ignore{} is used conventionally in literate haskell for definitions + -- that are to be processed by the compiler but not printed. + , "ignore" + , "hyperdef" + , "markboth", "markright", "markleft" + , "hspace", "vspace" + , "newpage" + , "clearpage" + , "pagebreak" + ] + +isInlineCommand :: Text -> Bool +isInlineCommand s = + s `M.member` (inlineCommands :: M.Map Text (LP PandocPure Inlines)) + || s `Set.member` treatAsInline + +treatAsInline :: Set.Set Text +treatAsInline = Set.fromList + [ "index" + , "hspace" + , "vspace" + , "noindent" + , "newpage" + , "clearpage" + , "pagebreak" + ] + +lookupListDefault :: (Show k, Ord k) => v -> [k] -> M.Map k v -> v +lookupListDefault d = (fromMaybe d .) . lookupList + where lookupList l m = msum $ map (`M.lookup` m) l + +inline :: PandocMonad m => LP m Inlines +inline = (mempty <$ comment) + <|> (space <$ whitespace) + <|> (softbreak <$ endline) + <|> word + <|> inlineCommand' + <|> inlineEnvironment + <|> inlineGroup + <|> (symbol '-' *> + option (str "-") (symbol '-' *> + option (str "–") (str "—" <$ symbol '-'))) + <|> doubleQuote + <|> singleQuote + <|> (str "”" <$ try (symbol '\'' >> symbol '\'')) + <|> (str "”" <$ symbol '”') + <|> (str "’" <$ symbol '\'') + <|> (str "’" <$ symbol '’') + <|> (str "\160" <$ symbol '~') + <|> dollarsMath + <|> (guardEnabled Ext_literate_haskell *> symbol '|' *> doLHSverb) + <|> (str . (:[]) <$> primEscape) + <|> regularSymbol + <|> (do res <- symbolIn "#^'`\"[]" + pos <- getPosition + let s = T.unpack (untoken res) + report $ ParsingUnescaped s pos + return $ str s) + +inlines :: PandocMonad m => LP m Inlines +inlines = mconcat <$> many inline + +-- block elements: + +begin_ :: PandocMonad m => Text -> LP m () +begin_ t = (try $ do + controlSeq "begin" + spaces + symbol '{' + spaces + Tok _ Word txt <- satisfyTok isWordTok + spaces + symbol '}' + guard (t == txt)) <?> ("\\begin{" ++ T.unpack t ++ "}") + +end_ :: PandocMonad m => Text -> LP m () +end_ t = (try $ do + controlSeq "end" + spaces + symbol '{' + spaces + Tok _ Word txt <- satisfyTok isWordTok + spaces + symbol '}' + guard $ t == txt) <?> ("\\end{" ++ T.unpack t ++ "}") + +preamble :: PandocMonad m => LP m Blocks +preamble = mempty <$ many preambleBlock + where preambleBlock = spaces1 + <|> void include + <|> void macroDef + <|> void blockCommand + <|> void braced + <|> (notFollowedBy (begin_ "document") >> void anyTok) -inlineChar :: PandocMonad m => LP m Char -inlineChar = noneOf "\\$%&~#{}^'`\"‘’“”-[] \t\n" +paragraph :: PandocMonad m => LP m Blocks +paragraph = do + x <- trimInlines . mconcat <$> many1 inline + if x == mempty + then return mempty + else return $ para x + +include :: PandocMonad m => LP m Blocks +include = do + (Tok _ (CtrlSeq name) _) <- + controlSeq "include" <|> controlSeq "input" <|> + controlSeq "subfile" <|> controlSeq "usepackage" + skipMany $ bracketed inline -- skip options + fs <- (map trim . splitBy (==',') . T.unpack . untokenize) <$> braced + let fs' = if name == "usepackage" + then map (maybeAddExtension ".sty") fs + else map (maybeAddExtension ".tex") fs + dirs <- (splitBy (==':') . fromMaybe ".") <$> lookupEnv "TEXINPUTS" + mconcat <$> mapM (insertIncludedFile blocks (tokenize . T.pack) dirs) fs' + +maybeAddExtension :: String -> FilePath -> FilePath +maybeAddExtension ext fp = + if null (takeExtension fp) + then addExtension fp ext + else fp + +addMeta :: PandocMonad m => ToMetaValue a => String -> a -> LP m () +addMeta field val = updateState $ \st -> + st{ sMeta = addMetaField field val $ sMeta st } + +authors :: PandocMonad m => LP m () +authors = try $ do + bgroup + let oneAuthor = mconcat <$> + many1 (notFollowedBy' (controlSeq "and") >> + (inline <|> mempty <$ blockCommand)) + -- skip e.g. \vspace{10pt} + auths <- sepBy oneAuthor (controlSeq "and") + egroup + addMeta "author" (map trimInlines auths) + +macroDef :: PandocMonad m => LP m Blocks +macroDef = do + guardEnabled Ext_latex_macros + mempty <$ ((commandDef <|> environmentDef) <* doMacros 0) + where commandDef = do + (name, macro') <- newcommand + updateState $ \s -> s{ sMacros = M.insert name macro' (sMacros s) } + environmentDef = do + (name, macro1, macro2) <- newenvironment + updateState $ \s -> s{ sMacros = + M.insert name macro1 (sMacros s) } + updateState $ \s -> s{ sMacros = + M.insert ("end" <> name) macro2 (sMacros s) } + -- @\newenvironment{envname}[n-args][default]{begin}{end}@ + -- is equivalent to + -- @\newcommand{\envname}[n-args][default]{begin}@ + -- @\newcommand{\endenvname}@ + +newcommand :: PandocMonad m => LP m (Text, Macro) +newcommand = do + pos <- getPosition + Tok _ (CtrlSeq mtype) _ <- controlSeq "newcommand" <|> + controlSeq "renewcommand" <|> + controlSeq "providecommand" + optional $ symbol '*' + Tok _ (CtrlSeq name) txt <- withVerbatimMode $ anyControlSeq <|> + (symbol '{' *> spaces *> anyControlSeq <* spaces <* symbol '}') + spaces + numargs <- option 0 $ try bracketedNum + spaces + optarg <- option Nothing $ Just <$> try bracketedToks + spaces + contents <- braced + when (mtype == "newcommand") $ do + macros <- sMacros <$> getState + case M.lookup name macros of + Just _ -> report $ MacroAlreadyDefined (T.unpack txt) pos + Nothing -> return () + return (name, Macro numargs optarg contents) + +newenvironment :: PandocMonad m => LP m (Text, Macro, Macro) +newenvironment = do + pos <- getPosition + Tok _ (CtrlSeq mtype) _ <- controlSeq "newenvironment" <|> + controlSeq "renewenvironment" <|> + controlSeq "provideenvironment" + optional $ symbol '*' + symbol '{' + spaces + Tok _ Word name <- satisfyTok isWordTok + spaces + symbol '}' + spaces + numargs <- option 0 $ try bracketedNum + spaces + optarg <- option Nothing $ Just <$> try bracketedToks + spaces + startcontents <- braced + spaces + endcontents <- braced + when (mtype == "newenvironment") $ do + macros <- sMacros <$> getState + case M.lookup name macros of + Just _ -> report $ MacroAlreadyDefined (T.unpack name) pos + Nothing -> return () + return (name, Macro numargs optarg startcontents, + Macro 0 Nothing endcontents) + +bracketedToks :: PandocMonad m => LP m [Tok] +bracketedToks = do + symbol '[' + manyTill anyTok (symbol ']') + +bracketedNum :: PandocMonad m => LP m Int +bracketedNum = do + ds <- untokenize <$> bracketedToks + case safeRead (T.unpack ds) of + Just i -> return i + _ -> return 0 + +setCaption :: PandocMonad m => LP m Blocks +setCaption = do + ils <- tok + mblabel <- option Nothing $ + try $ spaces >> controlSeq "label" >> (Just <$> tok) + let ils' = case mblabel of + Just lab -> ils <> spanWith + ("",[],[("data-label", stringify lab)]) mempty + Nothing -> ils + updateState $ \st -> st{ sCaption = Just ils' } + return mempty + +looseItem :: PandocMonad m => LP m Blocks +looseItem = do + inListItem <- sInListItem <$> getState + guard $ not inListItem + skipopts + return mempty + +resetCaption :: PandocMonad m => LP m () +resetCaption = updateState $ \st -> st{ sCaption = Nothing } + +section :: PandocMonad m => Attr -> Int -> LP m Blocks +section (ident, classes, kvs) lvl = do + skipopts + contents <- grouped inline + lab <- option ident $ + try (spaces >> controlSeq "label" + >> spaces >> toksToString <$> braced) + attr' <- registerHeader (lab, classes, kvs) contents + return $ headerWith attr' lvl contents + +blockCommand :: PandocMonad m => LP m Blocks +blockCommand = try $ do + Tok _ (CtrlSeq name) txt <- anyControlSeq + guard $ name /= "begin" && name /= "end" + star <- option "" ("*" <$ symbol '*' <* optional sp) + let name' = name <> star + let names = ordNub [name', name] + let raw = do + guard $ isBlockCommand name || not (isInlineCommand name) + rawBlock "latex" <$> getRawCommand (txt <> star) + lookupListDefault raw names blockCommands + +closing :: PandocMonad m => LP m Blocks +closing = do + contents <- tok + st <- getState + let extractInlines (MetaBlocks [Plain ys]) = ys + extractInlines (MetaBlocks [Para ys ]) = ys + extractInlines _ = [] + let sigs = case lookupMeta "author" (sMeta st) of + Just (MetaList xs) -> + para $ trimInlines $ fromList $ + intercalate [LineBreak] $ map extractInlines xs + _ -> mempty + return $ para (trimInlines contents) <> sigs + +blockCommands :: PandocMonad m => M.Map Text (LP m Blocks) +blockCommands = M.fromList $ + [ ("par", mempty <$ skipopts) + , ("parbox", braced >> grouped blocks) + , ("title", mempty <$ (skipopts *> + (grouped inline >>= addMeta "title") + <|> (grouped block >>= addMeta "title"))) + , ("subtitle", mempty <$ (skipopts *> tok >>= addMeta "subtitle")) + , ("author", mempty <$ (skipopts *> authors)) + -- -- in letter class, temp. store address & sig as title, author + , ("address", mempty <$ (skipopts *> tok >>= addMeta "address")) + , ("signature", mempty <$ (skipopts *> authors)) + , ("date", mempty <$ (skipopts *> tok >>= addMeta "date")) + -- Koma-script metadata commands + , ("dedication", mempty <$ (skipopts *> tok >>= addMeta "dedication")) + -- sectioning + , ("part", section nullAttr (-1)) + , ("part*", section nullAttr (-1)) + , ("chapter", section nullAttr 0) + , ("chapter*", section ("",["unnumbered"],[]) 0) + , ("section", section nullAttr 1) + , ("section*", section ("",["unnumbered"],[]) 1) + , ("subsection", section nullAttr 2) + , ("subsection*", section ("",["unnumbered"],[]) 2) + , ("subsubsection", section nullAttr 3) + , ("subsubsection*", section ("",["unnumbered"],[]) 3) + , ("paragraph", section nullAttr 4) + , ("paragraph*", section ("",["unnumbered"],[]) 4) + , ("subparagraph", section nullAttr 5) + , ("subparagraph*", section ("",["unnumbered"],[]) 5) + -- beamer slides + , ("frametitle", section nullAttr 3) + , ("framesubtitle", section nullAttr 4) + -- letters + , ("opening", (para . trimInlines) <$> (skipopts *> tok)) + , ("closing", skipopts *> closing) + -- + , ("hrule", pure horizontalRule) + , ("strut", pure mempty) + , ("rule", skipopts *> tok *> tok *> pure horizontalRule) + , ("item", looseItem) + , ("documentclass", skipopts *> braced *> preamble) + , ("centerline", (para . trimInlines) <$> (skipopts *> tok)) + , ("caption", skipopts *> setCaption) + , ("bibliography", mempty <$ (skipopts *> braced >>= + addMeta "bibliography" . splitBibs . toksToString)) + , ("addbibresource", mempty <$ (skipopts *> braced >>= + addMeta "bibliography" . splitBibs . toksToString)) + -- includes + , ("lstinputlisting", inputListing) + , ("graphicspath", graphicsPath) + -- hyperlink + , ("hypertarget", try $ braced >> grouped block) + -- LaTeX colors + , ("textcolor", coloredBlock "color") + , ("colorbox", coloredBlock "background-color") + ] + + +environments :: PandocMonad m => M.Map Text (LP m Blocks) +environments = M.fromList + [ ("document", env "document" blocks) + , ("abstract", mempty <$ (env "abstract" blocks >>= addMeta "abstract")) + , ("letter", env "letter" letterContents) + , ("minipage", env "minipage" $ + skipopts *> spaces *> optional braced *> spaces *> blocks) + , ("figure", env "figure" $ skipopts *> figure) + , ("subfigure", env "subfigure" $ skipopts *> tok *> figure) + , ("center", env "center" blocks) + , ("longtable", env "longtable" $ + resetCaption *> simpTable "longtable" False >>= addTableCaption) + , ("table", env "table" $ + resetCaption *> skipopts *> blocks >>= addTableCaption) + , ("tabular*", env "tabular" $ simpTable "tabular*" True) + , ("tabularx", env "tabularx" $ simpTable "tabularx" True) + , ("tabular", env "tabular" $ simpTable "tabular" False) + , ("quote", blockQuote <$> env "quote" blocks) + , ("quotation", blockQuote <$> env "quotation" blocks) + , ("verse", blockQuote <$> env "verse" blocks) + , ("itemize", bulletList <$> listenv "itemize" (many item)) + , ("description", definitionList <$> listenv "description" (many descItem)) + , ("enumerate", orderedList') + , ("alltt", alltt <$> env "alltt" blocks) + , ("code", guardEnabled Ext_literate_haskell *> + (codeBlockWith ("",["sourceCode","literate","haskell"],[]) <$> + verbEnv "code")) + , ("comment", mempty <$ verbEnv "comment") + , ("verbatim", codeBlock <$> verbEnv "verbatim") + , ("Verbatim", fancyverbEnv "Verbatim") + , ("BVerbatim", fancyverbEnv "BVerbatim") + , ("lstlisting", do attr <- parseListingsOptions <$> option [] keyvals + codeBlockWith attr <$> verbEnv "lstlisting") + , ("minted", minted) + , ("obeylines", obeylines) + , ("displaymath", mathEnvWith para Nothing "displaymath") + , ("equation", mathEnvWith para Nothing "equation") + , ("equation*", mathEnvWith para Nothing "equation*") + , ("gather", mathEnvWith para (Just "gathered") "gather") + , ("gather*", mathEnvWith para (Just "gathered") "gather*") + , ("multline", mathEnvWith para (Just "gathered") "multline") + , ("multline*", mathEnvWith para (Just "gathered") "multline*") + , ("eqnarray", mathEnvWith para (Just "aligned") "eqnarray") + , ("eqnarray*", mathEnvWith para (Just "aligned") "eqnarray*") + , ("align", mathEnvWith para (Just "aligned") "align") + , ("align*", mathEnvWith para (Just "aligned") "align*") + , ("alignat", mathEnvWith para (Just "aligned") "alignat") + , ("alignat*", mathEnvWith para (Just "aligned") "alignat*") + , ("tikzpicture", rawVerbEnv "tikzpicture") + ] environment :: PandocMonad m => LP m Blocks environment = do controlSeq "begin" - name <- braced + name <- untokenize <$> braced M.findWithDefault mzero name environments <|> rawEnv name -inlineEnvironment :: PandocMonad m => LP m Inlines -inlineEnvironment = try $ do - controlSeq "begin" - name <- braced - M.findWithDefault mzero name inlineEnvironments +env :: PandocMonad m => Text -> LP m a -> LP m a +env name p = p <* end_ name -rawEnv :: PandocMonad m => String -> LP m Blocks +rawEnv :: PandocMonad m => Text -> LP m Blocks rawEnv name = do exts <- getOption readerExtensions let parseRaw = extensionEnabled Ext_raw_tex exts rawOptions <- mconcat <$> many rawopt - let beginCommand = "\\begin{" ++ name ++ "}" ++ rawOptions + let beginCommand = "\\begin{" <> name <> "}" <> rawOptions pos1 <- getPosition (bs, raw) <- withRaw $ env name blocks - raw' <- applyMacros' $ beginCommand ++ raw - if raw' /= beginCommand ++ raw - then parseFromString' blocks raw' - else if parseRaw - then return $ rawBlock "latex" $ beginCommand ++ raw' - else do - unless parseRaw $ do - report $ SkippedContent beginCommand pos1 - pos2 <- getPosition - report $ SkippedContent ("\\end{" ++ name ++ "}") pos2 - return bs - -rawVerbEnv :: PandocMonad m => String -> LP m Blocks + if parseRaw + then return $ rawBlock "latex" + $ T.unpack $ beginCommand <> untokenize raw + else do + unless parseRaw $ do + report $ SkippedContent (T.unpack beginCommand) pos1 + pos2 <- getPosition + report $ SkippedContent ("\\end{" ++ T.unpack name ++ "}") pos2 + return bs + +rawVerbEnv :: PandocMonad m => Text -> LP m Blocks rawVerbEnv name = do pos <- getPosition (_, raw) <- withRaw $ verbEnv name - let raw' = "\\begin{tikzpicture}" ++ raw + let raw' = "\\begin{tikzpicture}" ++ toksToString raw exts <- getOption readerExtensions let parseRaw = extensionEnabled Ext_raw_tex exts if parseRaw @@ -1073,36 +1810,118 @@ rawVerbEnv name = do report $ SkippedContent raw' pos return mempty ----- +verbEnv :: PandocMonad m => Text -> LP m String +verbEnv name = withVerbatimMode $ do + skipopts + optional blankline + res <- manyTill anyTok (end_ name) + return $ stripTrailingNewlines $ toksToString res -maybeAddExtension :: String -> FilePath -> FilePath -maybeAddExtension ext fp = - if null (takeExtension fp) - then addExtension fp ext - else fp +fancyverbEnv :: PandocMonad m => Text -> LP m Blocks +fancyverbEnv name = do + options <- option [] keyvals + let kvs = [ (if k == "firstnumber" + then "startFrom" + else k, v) | (k,v) <- options ] + let classes = [ "numberLines" | + lookup "numbers" options == Just "left" ] + let attr = ("",classes,kvs) + codeBlockWith attr <$> verbEnv name -include :: PandocMonad m => LP m Blocks -include = do - fs' <- try $ do - char '\\' - name <- try (string "include") - <|> try (string "input") - <|> try (string "subfile") - <|> string "usepackage" - -- skip options - skipMany $ try $ char '[' *> manyTill anyChar (char ']') - fs <- (map trim . splitBy (==',')) <$> braced - return $ if name == "usepackage" - then map (maybeAddExtension ".sty") fs - else map (maybeAddExtension ".tex") fs - dirs <- (splitBy (==':') . fromMaybe ".") <$> lookupEnv "TEXINPUTS" - mconcat <$> mapM (insertIncludedFile blocks dirs) fs' +obeylines :: PandocMonad m => LP m Blocks +obeylines = do + para . fromList . removeLeadingTrailingBreaks . + walk softBreakToHard . toList <$> env "obeylines" inlines + where softBreakToHard SoftBreak = LineBreak + softBreakToHard x = x + removeLeadingTrailingBreaks = reverse . dropWhile isLineBreak . + reverse . dropWhile isLineBreak + isLineBreak LineBreak = True + isLineBreak _ = False + +minted :: PandocMonad m => LP m Blocks +minted = do + options <- option [] keyvals + lang <- toksToString <$> braced + let kvs = [ (if k == "firstnumber" + then "startFrom" + else k, v) | (k,v) <- options ] + let classes = [ lang | not (null lang) ] ++ + [ "numberLines" | + lookup "linenos" options == Just "true" ] + let attr = ("",classes,kvs) + codeBlockWith attr <$> verbEnv "minted" + +letterContents :: PandocMonad m => LP m Blocks +letterContents = do + bs <- blocks + st <- getState + -- add signature (author) and address (title) + let addr = case lookupMeta "address" (sMeta st) of + Just (MetaBlocks [Plain xs]) -> + para $ trimInlines $ fromList xs + _ -> mempty + return $ addr <> bs -- sig added by \closing + +figure :: PandocMonad m => LP m Blocks +figure = try $ do + resetCaption + blocks >>= addImageCaption + +addImageCaption :: PandocMonad m => Blocks -> LP m Blocks +addImageCaption = walkM go + where go (Image attr alt (src,tit)) + | not ("fig:" `isPrefixOf` tit) = do + mbcapt <- sCaption <$> getState + return $ case mbcapt of + Just ils -> Image attr (toList ils) (src, "fig:" ++ tit) + Nothing -> Image attr alt (src,tit) + go x = return x + +coloredBlock :: PandocMonad m => String -> LP m Blocks +coloredBlock stylename = do + skipopts + color <- braced + let constructor = divWith ("",[],[("style",stylename ++ ": " ++ toksToString color)]) + inlineContents <|> constructor <$> blockContents + where inlineContents = do + ils <- grouped inline + rest <- inlines + return (para (ils <> rest)) + blockContents = grouped block + +graphicsPath :: PandocMonad m => LP m Blocks +graphicsPath = do + ps <- map toksToString <$> (bgroup *> manyTill braced egroup) + getResourcePath >>= setResourcePath . (++ ps) + return mempty + +splitBibs :: String -> [Inlines] +splitBibs = map (str . flip replaceExtension "bib" . trim) . splitBy (==',') + +alltt :: Blocks -> Blocks +alltt = walk strToCode + where strToCode (Str s) = Code nullAttr s + strToCode Space = RawInline (Format "latex") "\\ " + strToCode SoftBreak = LineBreak + strToCode x = x + +parseListingsOptions :: [(String, String)] -> Attr +parseListingsOptions options = + let kvs = [ (if k == "firstnumber" + then "startFrom" + else k, v) | (k,v) <- options ] + classes = [ "numberLines" | + lookup "numbers" options == Just "left" ] + ++ maybeToList (lookup "language" options + >>= fromListingsLanguage) + in (fromMaybe "" (lookup "label" options), classes, kvs) inputListing :: PandocMonad m => LP m Blocks inputListing = do pos <- getPosition options <- option [] keyvals - f <- filter (/='"') <$> braced + f <- filter (/='"') . toksToString <$> braced dirs <- (splitBy (==':') . fromMaybe ".") <$> lookupEnv "TEXINPUTS" mbCode <- readFileFromDirs dirs f codeLines <- case mbCode of @@ -1121,169 +1940,10 @@ inputListing = do drop (firstline - 1) codeLines return $ codeBlockWith (ident,ordNub (classes ++ language),kvs) codeContents -parseListingsOptions :: [(String, String)] -> Attr -parseListingsOptions options = - let kvs = [ (if k == "firstnumber" - then "startFrom" - else k, v) | (k,v) <- options ] - classes = [ "numberLines" | - lookup "numbers" options == Just "left" ] - ++ maybeToList (lookup "language" options - >>= fromListingsLanguage) - in (fromMaybe "" (lookup "label" options), classes, kvs) - ----- - -keyval :: PandocMonad m => LP m (String, String) -keyval = try $ do - key <- many1 alphaNum - val <- option "" $ char '=' >> braced <|> (many1 (alphaNum <|> oneOf ".:-|\\")) - skipMany spaceChar - optional (char ',') - skipMany spaceChar - return (key, val) - - -keyvals :: PandocMonad m => LP m [(String, String)] -keyvals = try $ char '[' *> manyTill keyval (char ']') - -alltt :: PandocMonad m => String -> LP m Blocks -alltt t = walk strToCode <$> parseFromString' blocks - (substitute " " "\\ " $ substitute "%" "\\%" $ - intercalate "\\\\\n" $ lines t) - where strToCode (Str s) = Code nullAttr s - strToCode x = x - -rawLaTeXBlock :: PandocMonad m => LP m String -rawLaTeXBlock = snd <$> try (withRaw (environment <|> blockCommand)) - -rawLaTeXInline :: PandocMonad m => LP m Inline -rawLaTeXInline = do - raw <- (snd <$> withRaw inlineCommand) - <|> (snd <$> withRaw inlineEnvironment) - <|> (snd <$> withRaw blockCommand) - RawInline "latex" <$> applyMacros' raw - -addImageCaption :: PandocMonad m => Blocks -> LP m Blocks -addImageCaption = walkM go - where go (Image attr alt (src,tit)) - | not ("fig:" `isPrefixOf` tit) = do - mbcapt <- stateCaption <$> getState - return $ case mbcapt of - Just ils -> Image attr (toList ils) (src, "fig:" ++ tit) - Nothing -> Image attr alt (src,tit) - go x = return x - -addTableCaption :: PandocMonad m => Blocks -> LP m Blocks -addTableCaption = walkM go - where go (Table c als ws hs rs) = do - mbcapt <- stateCaption <$> getState - return $ case mbcapt of - Just ils -> Table (toList ils) als ws hs rs - Nothing -> Table c als ws hs rs - go x = return x - -environments :: PandocMonad m => M.Map String (LP m Blocks) -environments = M.fromList - [ ("document", env "document" blocks <* skipMany anyChar) - , ("abstract", mempty <$ (env "abstract" blocks >>= addMeta "abstract")) - , ("letter", env "letter" letterContents) - , ("minipage", env "minipage" $ - skipopts *> spaces' *> optional braced *> spaces' *> blocks) - , ("figure", env "figure" $ skipopts *> figure) - , ("subfigure", env "subfigure" $ skipopts *> tok *> figure) - , ("center", env "center" blocks) - , ("longtable", env "longtable" $ - resetCaption *> simpTable "longtable" False >>= addTableCaption) - , ("table", env "table" $ - resetCaption *> skipopts *> blocks >>= addTableCaption) - , ("tabular*", env "tabular" $ simpTable "tabular*" True) - , ("tabularx", env "tabularx" $ simpTable "tabularx" True) - , ("tabular", env "tabular" $ simpTable "tabular" False) - , ("quote", blockQuote <$> env "quote" blocks) - , ("quotation", blockQuote <$> env "quotation" blocks) - , ("verse", blockQuote <$> env "verse" blocks) - , ("itemize", bulletList <$> listenv "itemize" (many item)) - , ("description", definitionList <$> listenv "description" (many descItem)) - , ("enumerate", orderedList') - , ("alltt", alltt =<< verbEnv "alltt") - , ("code", guardEnabled Ext_literate_haskell *> - (codeBlockWith ("",["sourceCode","literate","haskell"],[]) <$> - verbEnv "code")) - , ("comment", mempty <$ verbEnv "comment") - , ("verbatim", codeBlock <$> verbEnv "verbatim") - , ("Verbatim", fancyverbEnv "Verbatim") - , ("BVerbatim", fancyverbEnv "BVerbatim") - , ("lstlisting", do attr <- parseListingsOptions <$> option [] keyvals - codeBlockWith attr <$> verbEnv "lstlisting") - , ("minted", do options <- option [] keyvals - lang <- grouped (many1 $ satisfy (/='}')) - let kvs = [ (if k == "firstnumber" - then "startFrom" - else k, v) | (k,v) <- options ] - let classes = [ lang | not (null lang) ] ++ - [ "numberLines" | - lookup "linenos" options == Just "true" ] - let attr = ("",classes,kvs) - codeBlockWith attr <$> verbEnv "minted") - , ("obeylines", parseFromString - (para . trimInlines . mconcat <$> many inline) =<< - intercalate "\\\\\n" . lines <$> verbEnv "obeylines") - , ("displaymath", mathEnvWith para Nothing "displaymath") - , ("equation", mathEnvWith para Nothing "equation") - , ("equation*", mathEnvWith para Nothing "equation*") - , ("gather", mathEnvWith para (Just "gathered") "gather") - , ("gather*", mathEnvWith para (Just "gathered") "gather*") - , ("multline", mathEnvWith para (Just "gathered") "multline") - , ("multline*", mathEnvWith para (Just "gathered") "multline*") - , ("eqnarray", mathEnvWith para (Just "aligned") "eqnarray") - , ("eqnarray*", mathEnvWith para (Just "aligned") "eqnarray*") - , ("align", mathEnvWith para (Just "aligned") "align") - , ("align*", mathEnvWith para (Just "aligned") "align*") - , ("alignat", mathEnvWith para (Just "aligned") "alignat") - , ("alignat*", mathEnvWith para (Just "aligned") "alignat*") - , ("tikzpicture", rawVerbEnv "tikzpicture") - ] - -figure :: PandocMonad m => LP m Blocks -figure = try $ do - resetCaption - blocks >>= addImageCaption - -letterContents :: PandocMonad m => LP m Blocks -letterContents = do - bs <- blocks - st <- getState - -- add signature (author) and address (title) - let addr = case lookupMeta "address" (stateMeta st) of - Just (MetaBlocks [Plain xs]) -> - para $ trimInlines $ fromList xs - _ -> mempty - return $ addr <> bs -- sig added by \closing - -closing :: PandocMonad m => LP m Blocks -closing = do - contents <- tok - st <- getState - let extractInlines (MetaBlocks [Plain ys]) = ys - extractInlines (MetaBlocks [Para ys ]) = ys - extractInlines _ = [] - let sigs = case lookupMeta "author" (stateMeta st) of - Just (MetaList xs) -> - para $ trimInlines $ fromList $ - intercalate [LineBreak] $ map extractInlines xs - _ -> mempty - return $ para (trimInlines contents) <> sigs +-- lists item :: PandocMonad m => LP m Blocks -item = blocks *> controlSeq "item" *> skipopts *> blocks - -looseItem :: PandocMonad m => LP m Blocks -looseItem = do - ctx <- stateParserContext `fmap` getState - if ctx == ListItemState - then mzero - else return mempty +item = void blocks *> controlSeq "item" *> skipopts *> blocks descItem :: PandocMonad m => LP m (Inlines, [Blocks]) descItem = do @@ -1294,302 +1954,210 @@ descItem = do bs <- blocks return (ils, [bs]) -env :: PandocMonad m => String -> LP m a -> LP m a -env name p = p <* - (try (controlSeq "end" *> braced >>= guard . (== name)) - <?> ("\\end{" ++ name ++ "}")) - -listenv :: PandocMonad m => String -> LP m a -> LP m a +listenv :: PandocMonad m => Text -> LP m a -> LP m a listenv name p = try $ do - oldCtx <- stateParserContext `fmap` getState - updateState $ \st -> st{ stateParserContext = ListItemState } + oldInListItem <- sInListItem `fmap` getState + updateState $ \st -> st{ sInListItem = True } res <- env name p - updateState $ \st -> st{ stateParserContext = oldCtx } + updateState $ \st -> st{ sInListItem = oldInListItem } return res -mathEnvWith :: PandocMonad m - => (Inlines -> a) -> Maybe String -> String -> LP m a -mathEnvWith f innerEnv name = f <$> mathDisplay (inner <$> mathEnv name) - where inner x = case innerEnv of - Nothing -> x - Just y -> "\\begin{" ++ y ++ "}\n" ++ x ++ - "\\end{" ++ y ++ "}" - -mathEnv :: PandocMonad m => String -> LP m String -mathEnv name = do - skipopts - optional blankline - let endEnv = try $ controlSeq "end" *> braced >>= guard . (== name) - charMuncher = skipMany comment *> - (many1 (noneOf "\\%") <|> try (string "\\%") - <|> try (string "\\\\") <|> count 1 anyChar) - res <- concat <$> manyTill charMuncher endEnv - return $ stripTrailingNewlines res - -verbEnv :: PandocMonad m => String -> LP m String -verbEnv name = do - skipopts - optional blankline - let endEnv = try $ controlSeq "end" *> braced >>= guard . (== name) - charMuncher = anyChar - res <- manyTill charMuncher endEnv - return $ stripTrailingNewlines res - -fancyverbEnv :: PandocMonad m => String -> LP m Blocks -fancyverbEnv name = do - options <- option [] keyvals - let kvs = [ (if k == "firstnumber" - then "startFrom" - else k, v) | (k,v) <- options ] - let classes = [ "numberLines" | - lookup "numbers" options == Just "left" ] - let attr = ("",classes,kvs) - codeBlockWith attr <$> verbEnv name - orderedList' :: PandocMonad m => LP m Blocks orderedList' = try $ do - optional sp - (_, style, delim) <- option (1, DefaultStyle, DefaultDelim) $ - try $ char '[' *> anyOrderedListMarker <* char ']' spaces - optional $ try $ controlSeq "setlength" *> grouped (controlSeq "itemindent") *> braced + let markerSpec = do + symbol '[' + ts <- toksToString <$> manyTill anyTok (symbol ']') + case runParser anyOrderedListMarker def "option" ts of + Right r -> return r + Left _ -> do + pos <- getPosition + report $ SkippedContent ("[" ++ ts ++ "]") pos + return (1, DefaultStyle, DefaultDelim) + (_, style, delim) <- option (1, DefaultStyle, DefaultDelim) markerSpec spaces - start <- option 1 $ try $ do controlSeq "setcounter" - grouped (string "enum" *> many1 (oneOf "iv")) + optional $ try $ controlSeq "setlength" + *> grouped (count 1 $ controlSeq "itemindent") + *> braced + spaces + start <- option 1 $ try $ do pos <- getPosition + controlSeq "setcounter" + ctr <- toksToString <$> braced + guard $ "enum" `isPrefixOf` ctr + guard $ all (`elem` ['i','v']) (drop 4 ctr) optional sp - num <- grouped (many1 digit) - spaces - return (read num + 1 :: Int) + num <- toksToString <$> braced + case safeRead num of + Just i -> return (i + 1 :: Int) + Nothing -> do + report $ SkippedContent + ("\\setcounter{" ++ ctr ++ + "}{" ++ num ++ "}") pos + return 1 bs <- listenv "enumerate" (many item) return $ orderedListWith (start, style, delim) bs -paragraph :: PandocMonad m => LP m Blocks -paragraph = do - x <- trimInlines . mconcat <$> many1 inline - if x == mempty - then return mempty - else return $ para x - -preamble :: PandocMonad m => LP m Blocks -preamble = mempty <$> manyTill preambleBlock beginDoc - where beginDoc = lookAhead $ try $ controlSeq "begin" *> string "{document}" - preambleBlock = void comment - <|> void sp - <|> void blanklines - <|> void include - <|> void macro - <|> void blockCommand - <|> void anyControlSeq - <|> void braced - <|> void anyChar - -------- - --- citations - -addPrefix :: [Inline] -> [Citation] -> [Citation] -addPrefix p (k:ks) = k {citationPrefix = p ++ citationPrefix k} : ks -addPrefix _ _ = [] - -addSuffix :: [Inline] -> [Citation] -> [Citation] -addSuffix s ks@(_:_) = - let k = last ks - in init ks ++ [k {citationSuffix = citationSuffix k ++ s}] -addSuffix _ _ = [] - -simpleCiteArgs :: PandocMonad m => LP m [Citation] -simpleCiteArgs = try $ do - first <- optionMaybe $ toList <$> opt - second <- optionMaybe $ toList <$> opt - keys <- try $ bgroup *> (manyTill citationLabel egroup) - let (pre, suf) = case (first , second ) of - (Just s , Nothing) -> (mempty, s ) - (Just s , Just t ) -> (s , t ) - _ -> (mempty, mempty) - conv k = Citation { citationId = k - , citationPrefix = [] - , citationSuffix = [] - , citationMode = NormalCitation - , citationHash = 0 - , citationNoteNum = 0 - } - return $ addPrefix pre $ addSuffix suf $ map conv keys - -citationLabel :: PandocMonad m => LP m String -citationLabel = optional sp *> - (many1 (satisfy isBibtexKeyChar) - <* optional sp - <* optional (char ',') - <* optional sp) - where isBibtexKeyChar c = isAlphaNum c || c `elem` (".:;?!`'()/*@_+=-[]" :: String) +-- tables -cites :: PandocMonad m => CitationMode -> Bool -> LP m [Citation] -cites mode multi = try $ do - cits <- if multi - then many1 simpleCiteArgs - else count 1 simpleCiteArgs - let cs = concat cits - return $ case mode of - AuthorInText -> case cs of - (c:rest) -> c {citationMode = mode} : rest - [] -> [] - _ -> map (\a -> a {citationMode = mode}) cs +hline :: PandocMonad m => LP m () +hline = try $ do + spaces + controlSeq "hline" <|> + -- booktabs rules: + controlSeq "toprule" <|> + controlSeq "bottomrule" <|> + controlSeq "midrule" <|> + controlSeq "endhead" <|> + controlSeq "endfirsthead" + spaces + optional $ bracketed inline + return () -citation :: PandocMonad m => String -> CitationMode -> Bool -> LP m Inlines -citation name mode multi = do - (c,raw) <- withRaw $ cites mode multi - return $ cite c (rawInline "latex" $ "\\" ++ name ++ raw) +lbreak :: PandocMonad m => LP m Tok +lbreak = (controlSeq "\\" <|> controlSeq "tabularnewline") <* spaces -complexNatbibCitation :: PandocMonad m => CitationMode -> LP m Inlines -complexNatbibCitation mode = try $ do - let ils = (toList . trimInlines . mconcat) <$> - many (notFollowedBy (oneOf "\\};") >> inline) - let parseOne = try $ do - skipSpaces - pref <- ils - cit' <- inline -- expect a citation - let citlist = toList cit' - cits' <- case citlist of - [Cite cs _] -> return cs - _ -> mzero - suff <- ils - skipSpaces - optional $ char ';' - return $ addPrefix pref $ addSuffix suff cits' - (c:cits, raw) <- withRaw $ grouped parseOne - return $ cite (c{ citationMode = mode }:cits) - (rawInline "latex" $ "\\citetext" ++ raw) +amp :: PandocMonad m => LP m Tok +amp = symbol '&' --- tables +-- Split a Word into individual Symbols (for parseAligns) +splitWordTok :: PandocMonad m => LP m () +splitWordTok = do + inp <- getInput + case inp of + (Tok spos Word t : rest) -> do + setInput $ map (Tok spos Symbol . T.singleton) (T.unpack t) ++ rest + _ -> return () -parseAligns :: PandocMonad m => LP m [(Alignment, Double, (String, String))] +parseAligns :: PandocMonad m => LP m [(Alignment, Double, ([Tok], [Tok]))] parseAligns = try $ do - bgroup - let maybeBar = skipMany $ sp <|> () <$ char '|' <|> () <$ (char '@' >> braced) - maybeBar - let cAlign = AlignCenter <$ char 'c' - let lAlign = AlignLeft <$ char 'l' - let rAlign = AlignRight <$ char 'r' - let parAlign = AlignLeft <$ char 'p' - -- algins from tabularx - let xAlign = AlignLeft <$ char 'X' - let mAlign = AlignLeft <$ char 'm' - let bAlign = AlignLeft <$ char 'b' - let alignChar = cAlign <|> lAlign <|> rAlign <|> parAlign - <|> xAlign <|> mAlign <|> bAlign - let alignPrefix = char '>' >> braced - let alignSuffix = char '<' >> braced + let maybeBar = skipMany $ + sp <|> () <$ symbol '|' <|> () <$ (symbol '@' >> braced) + let cAlign = AlignCenter <$ symbol 'c' + let lAlign = AlignLeft <$ symbol 'l' + let rAlign = AlignRight <$ symbol 'r' + let parAlign = AlignLeft <$ symbol 'p' + -- aligns from tabularx + let xAlign = AlignLeft <$ symbol 'X' + let mAlign = AlignLeft <$ symbol 'm' + let bAlign = AlignLeft <$ symbol 'b' + let alignChar = splitWordTok *> ( cAlign <|> lAlign <|> rAlign <|> parAlign + <|> xAlign <|> mAlign <|> bAlign ) + let alignPrefix = symbol '>' >> braced + let alignSuffix = symbol '<' >> braced let colWidth = try $ do - char '{' - ds <- many1 (oneOf "0123456789.") + symbol '{' + ds <- trim . toksToString <$> manyTill anyTok (controlSeq "linewidth") spaces - string "\\linewidth" - char '}' + symbol '}' case safeRead ds of Just w -> return w Nothing -> return 0.0 - let alignSpec = do + let alignSpec = try $ do spaces - pref <- option "" alignPrefix + pref <- option [] alignPrefix spaces al <- alignChar - width <- colWidth <|> option 0.0 (do s <- braced + width <- colWidth <|> option 0.0 (do s <- toksToString <$> braced pos <- getPosition report $ SkippedContent s pos return 0.0) spaces - suff <- option "" alignSuffix + suff <- option [] alignSuffix return (al, width, (pref, suff)) - aligns' <- sepEndBy alignSpec maybeBar + bgroup + spaces + maybeBar + aligns' <- many (alignSpec <* maybeBar) spaces egroup spaces - return $ aligns' - -hline :: PandocMonad m => LP m () -hline = try $ do - spaces' - controlSeq "hline" <|> - -- booktabs rules: - controlSeq "toprule" <|> - controlSeq "bottomrule" <|> - controlSeq "midrule" <|> - controlSeq "endhead" <|> - controlSeq "endfirsthead" - spaces' - optional $ bracketed (many1 (satisfy (/=']'))) - return () - -lbreak :: PandocMonad m => LP m () -lbreak = () <$ try (spaces' *> - (controlSeq "\\" <|> controlSeq "tabularnewline") <* - spaces') - -amp :: PandocMonad m => LP m () -amp = () <$ try (spaces' *> char '&' <* spaces') + return aligns' parseTableRow :: PandocMonad m - => String -- ^ table environment name - -> [(String, String)] -- ^ pref/suffixes + => Text -- ^ table environment name + -> [([Tok], [Tok])] -- ^ pref/suffixes -> LP m [Blocks] -parseTableRow envname prefsufs = try $ do +parseTableRow envname prefsufs = do + notFollowedBy (spaces *> end_ envname) let cols = length prefsufs - let tableCellRaw = concat <$> many - (do notFollowedBy amp - notFollowedBy lbreak - notFollowedBy $ () <$ try (string ("\\end{" ++ envname ++ "}")) - many1 (noneOf "&%\n\r\\") - <|> try (string "\\&") - <|> count 1 anyChar) - let plainify bs = case toList bs of - [Para ils] -> plain (fromList ils) - _ -> bs - rawcells <- sepBy1 tableCellRaw amp - guard $ length rawcells == cols - let rawcells' = zipWith (\c (p, s) -> p ++ trim c ++ s) rawcells prefsufs - let tableCell = plainify <$> blocks - cells' <- mapM (parseFromString' tableCell) rawcells' - let numcells = length cells' + -- add prefixes and suffixes in token stream: + let celltoks (pref, suff) = do + prefpos <- getPosition + contents <- many (notFollowedBy + (() <$ amp <|> () <$ lbreak <|> end_ envname) + >> anyTok) + suffpos <- getPosition + option [] (count 1 amp) + return $ map (setpos (sourceLine prefpos, sourceColumn prefpos)) pref + ++ contents ++ + map (setpos (sourceLine suffpos, sourceColumn suffpos)) suff + rawcells <- sequence (map celltoks prefsufs) + oldInput <- getInput + cells <- sequence $ map (\ts -> setInput ts >> parseTableCell) rawcells + setInput oldInput + spaces + let numcells = length cells guard $ numcells <= cols && numcells >= 1 - guard $ cells' /= [mempty] + guard $ cells /= [mempty] -- note: a & b in a three-column table leaves an empty 3rd cell: - let cells'' = cells' ++ replicate (cols - numcells) mempty - spaces' - return cells'' + return $ cells ++ replicate (cols - numcells) mempty -spaces' :: PandocMonad m => LP m () -spaces' = spaces *> skipMany (comment *> spaces) +parseTableCell :: PandocMonad m => LP m Blocks +parseTableCell = do + let plainify bs = case toList bs of + [Para ils] -> plain (fromList ils) + _ -> bs + updateState $ \st -> st{ sInTableCell = True } + cells <- plainify <$> blocks + updateState $ \st -> st{ sInTableCell = False } + return cells -simpTable :: PandocMonad m => String -> Bool -> LP m Blocks +simpTable :: PandocMonad m => Text -> Bool -> LP m Blocks simpTable envname hasWidthParameter = try $ do - when hasWidthParameter $ () <$ (spaces' >> tok) + when hasWidthParameter $ () <$ (spaces >> tok) skipopts colspecs <- parseAligns let (aligns, widths, prefsufs) = unzip3 colspecs let cols = length colspecs optional $ controlSeq "caption" *> skipopts *> setCaption optional lbreak - spaces' + spaces skipMany hline - spaces' + spaces header' <- option [] $ try (parseTableRow envname prefsufs <* lbreak <* many1 hline) - spaces' + spaces rows <- sepEndBy (parseTableRow envname prefsufs) (lbreak <* optional (skipMany hline)) - spaces' + spaces optional $ controlSeq "caption" *> skipopts *> setCaption optional lbreak - spaces' + spaces let header'' = if null header' then replicate cols mempty else header' lookAhead $ controlSeq "end" -- make sure we're at end return $ table mempty (zip aligns widths) header'' rows -removeDoubleQuotes :: String -> String -removeDoubleQuotes ('"':xs) = - case reverse xs of - '"':ys -> reverse ys - _ -> '"':xs -removeDoubleQuotes xs = xs +addTableCaption :: PandocMonad m => Blocks -> LP m Blocks +addTableCaption = walkM go + where go (Table c als ws hs rs) = do + mbcapt <- sCaption <$> getState + return $ case mbcapt of + Just ils -> Table (toList ils) als ws hs rs + Nothing -> Table c als ws hs rs + go x = return x + + +block :: PandocMonad m => LP m Blocks +block = (mempty <$ spaces1) + <|> environment + <|> include + <|> macroDef + <|> blockCommand + <|> paragraph + <|> grouped block + +blocks :: PandocMonad m => LP m Blocks +blocks = mconcat <$> many block + |