diff options
| author | John MacFarlane <jgm@berkeley.edu> | 2018-09-28 10:33:32 -0700 | 
|---|---|---|
| committer | John MacFarlane <jgm@berkeley.edu> | 2018-09-28 10:33:32 -0700 | 
| commit | 9dac993835603a1a92136f07e80a6561b1598754 (patch) | |
| tree | d345721f53c39d84adf76c87cef4ab531aff5282 /src/Text/Pandoc/Readers | |
| parent | 4f9ab7e03268e576d86e697f7110869434d08557 (diff) | |
| download | pandoc-9dac993835603a1a92136f07e80a6561b1598754.tar.gz | |
Added Text.Pandoc.Readers.LaTeX.Parsing (unexported).
This collects some of the general-purpose code from the LaTeX
reader, with the aim of making the module smaller. (We've been
having out-of-memory issues compiling this module on CI.)
Diffstat (limited to 'src/Text/Pandoc/Readers')
| -rw-r--r-- | src/Text/Pandoc/Readers/LaTeX.hs | 560 | ||||
| -rw-r--r-- | src/Text/Pandoc/Readers/LaTeX/Parsing.hs | 663 | 
2 files changed, 666 insertions, 557 deletions
| diff --git a/src/Text/Pandoc/Readers/LaTeX.hs b/src/Text/Pandoc/Readers/LaTeX.hs index 4a7f2f978..5065cc81c 100644 --- a/src/Text/Pandoc/Readers/LaTeX.hs +++ b/src/Text/Pandoc/Readers/LaTeX.hs @@ -47,8 +47,7 @@ import Prelude  import Control.Applicative (many, optional, (<|>))  import Control.Monad  import Control.Monad.Except (throwError) -import Control.Monad.Trans (lift) -import Data.Char (chr, isAlphaNum, isDigit, isLetter, ord, toLower, toUpper) +import Data.Char (isDigit, isLetter, toLower, toUpper)  import Data.Default  import Data.List (intercalate, isPrefixOf)  import qualified Data.Map as M @@ -63,7 +62,7 @@ import Text.Pandoc.Builder  import Text.Pandoc.Class (PandocMonad, PandocPure, getResourcePath, lookupEnv,                            readFileFromDirs, report, setResourcePath,                            setTranslations, translateTerm, trace) -import Text.Pandoc.Error (PandocError (PandocMacroLoop, PandocParseError, PandocParsecError)) +import Text.Pandoc.Error (PandocError ( PandocParseError, PandocParsecError))  import Text.Pandoc.Highlighting (fromListingsLanguage, languagesByExtension)  import Text.Pandoc.ImageSize (numUnit, showFl)  import Text.Pandoc.Logging @@ -72,10 +71,10 @@ import Text.Pandoc.Parsing hiding (blankline, many, mathDisplay, mathInline,                              optional, space, spaces, withRaw, (<|>))  import Text.Pandoc.Readers.LaTeX.Types (ExpansionPoint (..), Macro (..),                                          ArgSpec (..), Tok (..), TokType (..)) +import Text.Pandoc.Readers.LaTeX.Parsing  import Text.Pandoc.Shared  import qualified Text.Pandoc.Translations as Translations  import Text.Pandoc.Walk -import Text.Parsec.Pos  import qualified Text.Pandoc.Builder as B  -- for debugging: @@ -137,151 +136,6 @@ resolveRefs _ x = x  --        Left e  -> error (show e)  --        Right r -> return r -newtype DottedNum = DottedNum [Int] -  deriving (Show) - -renderDottedNum :: DottedNum -> String -renderDottedNum (DottedNum xs) = -  intercalate "." (map show xs) - -incrementDottedNum :: Int -> DottedNum -> DottedNum -incrementDottedNum level (DottedNum ns) = DottedNum $ -  case reverse (take level (ns ++ repeat 0)) of -       (x:xs) -> reverse (x+1 : xs) -       []     -> []  -- shouldn't happen - -data LaTeXState = LaTeXState{ sOptions       :: ReaderOptions -                            , sMeta          :: Meta -                            , sQuoteContext  :: QuoteContext -                            , sMacros        :: M.Map Text Macro -                            , sContainers    :: [String] -                            , sHeaders       :: M.Map Inlines String -                            , sLogMessages   :: [LogMessage] -                            , sIdentifiers   :: Set.Set String -                            , sVerbatimMode  :: Bool -                            , sCaption       :: (Maybe Inlines, Maybe String) -                            , sInListItem    :: Bool -                            , sInTableCell   :: Bool -                            , sLastHeaderNum :: DottedNum -                            , sLastFigureNum :: DottedNum -                            , sLabels        :: M.Map String [Inline] -                            , sHasChapters   :: Bool -                            , sToggles       :: M.Map String Bool -                            } -     deriving Show - -defaultLaTeXState :: LaTeXState -defaultLaTeXState = LaTeXState{ sOptions       = def -                              , sMeta          = nullMeta -                              , sQuoteContext  = NoQuote -                              , sMacros        = M.empty -                              , sContainers    = [] -                              , sHeaders       = M.empty -                              , sLogMessages   = [] -                              , sIdentifiers   = Set.empty -                              , sVerbatimMode  = False -                              , sCaption       = (Nothing, Nothing) -                              , sInListItem    = False -                              , sInTableCell   = False -                              , sLastHeaderNum = DottedNum [] -                              , sLastFigureNum = DottedNum [] -                              , sLabels        = M.empty -                              , sHasChapters   = False -                              , sToggles       = M.empty -                              } - -instance PandocMonad m => HasQuoteContext LaTeXState m where -  getQuoteContext = sQuoteContext <$> getState -  withQuoteContext context parser = do -    oldState <- getState -    let oldQuoteContext = sQuoteContext oldState -    setState oldState { sQuoteContext = context } -    result <- parser -    newState <- getState -    setState newState { sQuoteContext = oldQuoteContext } -    return result - -instance HasLogMessages LaTeXState where -  addLogMessage msg st = st{ sLogMessages = msg : sLogMessages st } -  getLogMessages st = reverse $ sLogMessages st - -instance HasIdentifierList LaTeXState where -  extractIdentifierList     = sIdentifiers -  updateIdentifierList f st = st{ sIdentifiers = f $ sIdentifiers st } - -instance HasIncludeFiles LaTeXState where -  getIncludeFiles = sContainers -  addIncludeFile f s = s{ sContainers = f : sContainers s } -  dropLatestIncludeFile s = s { sContainers = drop 1 $ sContainers s } - -instance HasHeaderMap LaTeXState where -  extractHeaderMap     = sHeaders -  updateHeaderMap f st = st{ sHeaders = f $ sHeaders st } - -instance HasMacros LaTeXState where -  extractMacros  st  = sMacros st -  updateMacros f st  = st{ sMacros = f (sMacros st) } - -instance HasReaderOptions LaTeXState where -  extractReaderOptions = sOptions - -instance HasMeta LaTeXState where -  setMeta field val st = -    st{ sMeta = setMeta field val $ sMeta st } -  deleteMeta field st = -    st{ sMeta = deleteMeta field $ sMeta st } - -instance Default LaTeXState where -  def = defaultLaTeXState - -type LP m = ParserT [Tok] LaTeXState m - -withVerbatimMode :: PandocMonad m => LP m a -> LP m a -withVerbatimMode parser = do -  updateState $ \st -> st{ sVerbatimMode = True } -  result <- parser -  updateState $ \st -> st{ sVerbatimMode = False } -  return result - -rawLaTeXParser :: (PandocMonad m, HasMacros s, HasReaderOptions s) -               => Bool -> LP m a -> LP m a -> ParserT String s m (a, String) -rawLaTeXParser retokenize parser valParser = do -  inp <- getInput -  let toks = tokenize "source" $ T.pack inp -  pstate <- getState -  let lstate = def{ sOptions = extractReaderOptions pstate } -  let lstate' = lstate { sMacros = extractMacros pstate } -  let rawparser = (,) <$> withRaw valParser <*> getState -  res' <- lift $ runParserT (snd <$> withRaw parser) lstate "chunk" toks -  case res' of -       Left _    -> mzero -       Right toks' -> do -         res <- lift $ runParserT (do when retokenize $ do -                                        -- retokenize, applying macros -                                        doMacros 0 -                                        ts <- many (satisfyTok (const True)) -                                        setInput ts -                                      rawparser) -                        lstate' "chunk" toks' -         case res of -              Left _    -> mzero -              Right ((val, raw), st) -> do -                updateState (updateMacros (sMacros st <>)) -                _ <- takeP (T.length (untokenize toks')) -                return (val, T.unpack (untokenize raw)) - -applyMacros :: (PandocMonad m, HasMacros s, HasReaderOptions s) -            => String -> ParserT String s m String -applyMacros s = (guardDisabled Ext_latex_macros >> return s) <|> -   do let retokenize = doMacros 0 *> -             (toksToString <$> many (satisfyTok (const True))) -      pstate <- getState -      let lstate = def{ sOptions = extractReaderOptions pstate -                      , sMacros  = extractMacros pstate } -      res <- runParserT retokenize lstate "math" (tokenize "math" (T.pack s)) -      case res of -           Left e   -> fail (show e) -           Right s' -> return s'  rawLaTeXBlock :: (PandocMonad m, HasMacros s, HasReaderOptions s)                => ParserT String s m String @@ -326,358 +180,6 @@ inlineCommand = do    lookAhead (try (char '\\' >> letter))    fst <$> rawLaTeXParser True (inlineEnvironment <|> inlineCommand') inlines -tokenize :: SourceName -> Text -> [Tok] -tokenize sourcename = totoks (initialPos sourcename) - -totoks :: SourcePos -> Text -> [Tok] -totoks pos t = -  case T.uncons t of -       Nothing        -> [] -       Just (c, rest) -         | c == '\n' -> -           Tok pos Newline "\n" -           : totoks (setSourceColumn (incSourceLine pos 1) 1) rest -         | isSpaceOrTab c -> -           let (sps, rest') = T.span isSpaceOrTab t -           in  Tok pos Spaces sps -               : totoks (incSourceColumn pos (T.length sps)) -                 rest' -         | isAlphaNum c -> -           let (ws, rest') = T.span isAlphaNum t -           in  Tok pos Word ws -               : totoks (incSourceColumn pos (T.length ws)) rest' -         | c == '%' -> -           let (cs, rest') = T.break (== '\n') rest -           in  Tok pos Comment ("%" <> cs) -               : totoks (incSourceColumn pos (1 + T.length cs)) rest' -         | c == '\\' -> -           case T.uncons rest of -                Nothing -> [Tok pos (CtrlSeq " ") "\\"] -                Just (d, rest') -                  | isLetterOrAt d -> -                      -- \makeatletter is common in macro defs; -                      -- ideally we should make tokenization sensitive -                      -- to \makeatletter and \makeatother, but this is -                      -- probably best for now -                      let (ws, rest'') = T.span isLetterOrAt rest -                          (ss, rest''') = T.span isSpaceOrTab rest'' -                      in  Tok pos (CtrlSeq ws) ("\\" <> ws <> ss) -                          : totoks (incSourceColumn pos -                               (1 + T.length ws + T.length ss)) rest''' -                  | isSpaceOrTab d || d == '\n' -> -                      let (w1, r1) = T.span isSpaceOrTab rest -                          (w2, (w3, r3)) = case T.uncons r1 of -                                          Just ('\n', r2) -                                                  -> (T.pack "\n", -                                                        T.span isSpaceOrTab r2) -                                          _ -> (mempty, (mempty, r1)) -                          ws = "\\" <> w1 <> w2 <> w3 -                      in  case T.uncons r3 of -                               Just ('\n', _) -> -                                 Tok pos (CtrlSeq " ") ("\\" <> w1) -                                 : totoks (incSourceColumn pos (T.length ws)) -                                   r1 -                               _ -> -                                 Tok pos (CtrlSeq " ") ws -                                 : totoks (incSourceColumn pos (T.length ws)) -                                   r3 -                  | otherwise  -> -                      Tok pos (CtrlSeq (T.singleton d)) (T.pack [c,d]) -                      : totoks (incSourceColumn pos 2) rest' -         | c == '#' -> -           let (t1, t2) = T.span (\d -> d >= '0' && d <= '9') rest -           in  case safeRead (T.unpack t1) of -                    Just i -> -                       Tok pos (Arg i) ("#" <> t1) -                       : totoks (incSourceColumn pos (1 + T.length t1)) t2 -                    Nothing -> -                       Tok pos Symbol "#" -                       : totoks (incSourceColumn pos 1) t2 -         | c == '^' -> -           case T.uncons rest of -                Just ('^', rest') -> -                  case T.uncons rest' of -                       Just (d, rest'') -                         | isLowerHex d -> -                           case T.uncons rest'' of -                                Just (e, rest''') | isLowerHex e -> -                                  Tok pos Esc2 (T.pack ['^','^',d,e]) -                                  : totoks (incSourceColumn pos 4) rest''' -                                _ -> -                                  Tok pos Esc1 (T.pack ['^','^',d]) -                                  : totoks (incSourceColumn pos 3) rest'' -                         | d < '\128' -> -                                  Tok pos Esc1 (T.pack ['^','^',d]) -                                  : totoks (incSourceColumn pos 3) rest'' -                       _ -> Tok pos Symbol "^" : -                            Tok (incSourceColumn pos 1) Symbol "^" : -                            totoks (incSourceColumn pos 2) rest' -                _ -> Tok pos Symbol "^" -                     : totoks (incSourceColumn pos 1) rest -         | otherwise -> -           Tok pos Symbol (T.singleton c) : totoks (incSourceColumn pos 1) rest - -isSpaceOrTab :: Char -> Bool -isSpaceOrTab ' '  = True -isSpaceOrTab '\t' = True -isSpaceOrTab _    = False - -isLetterOrAt :: Char -> Bool -isLetterOrAt '@' = True -isLetterOrAt c   = isLetter c - -isLowerHex :: Char -> Bool -isLowerHex x = x >= '0' && x <= '9' || x >= 'a' && x <= 'f' - -untokenize :: [Tok] -> Text -untokenize = mconcat . map untoken - -untoken :: Tok -> Text -untoken (Tok _ _ t) = t - -satisfyTok :: PandocMonad m => (Tok -> Bool) -> LP m Tok -satisfyTok f = -  try $ do -    res <- tokenPrim (T.unpack . untoken) updatePos matcher -    doMacros 0 -- apply macros on remaining input stream -    return res -  where matcher t | f t       = Just t -                  | otherwise = Nothing -        updatePos :: SourcePos -> Tok -> [Tok] -> SourcePos -        updatePos _spos _ (Tok pos _ _ : _) = pos -        updatePos spos _ []                 = incSourceColumn spos 1 - -doMacros :: PandocMonad m => Int -> LP m () -doMacros n = do -  verbatimMode <- sVerbatimMode <$> getState -  unless verbatimMode $ do -    inp <- getInput -    case inp of -         Tok spos (CtrlSeq "begin") _ : Tok _ Symbol "{" : -          Tok _ Word name : Tok _ Symbol "}" : ts -            -> handleMacros spos name ts -         Tok spos (CtrlSeq "end") _ : Tok _ Symbol "{" : -          Tok _ Word name : Tok _ Symbol "}" : ts -            -> handleMacros spos ("end" <> name) ts -         Tok _ (CtrlSeq "expandafter") _ : t : ts -            -> do setInput ts -                  doMacros n -                  getInput >>= setInput . combineTok t -         Tok spos (CtrlSeq name) _ : ts -            -> handleMacros spos name ts -         _ -> return () -  where combineTok (Tok spos (CtrlSeq name) x) (Tok _ Word w : ts) -          | T.all isLetterOrAt w = -            Tok spos (CtrlSeq (name <> w)) (x1 <> w <> x2) : ts -              where (x1, x2) = T.break isSpaceOrTab x -        combineTok t ts = t:ts -        handleMacros spos name ts = do -                macros <- sMacros <$> getState -                case M.lookup name macros of -                     Nothing -> return () -                     Just (Macro expansionPoint argspecs optarg newtoks) -> do -                       setInput ts -                       let matchTok (Tok _ toktype txt) = -                             satisfyTok (\(Tok _ toktype' txt') -> -                                           toktype == toktype' && -                                           txt == txt') -                       let matchPattern toks = try $ mapM_ matchTok toks -                       let getargs argmap [] = return argmap -                           getargs argmap (Pattern toks : rest) = try $ do -                              matchPattern toks -                              getargs argmap rest -                           getargs argmap (ArgNum i : Pattern toks : rest) = -                             try $ do -                               x <- mconcat <$> manyTill -                                     (braced <|> ((:[]) <$> anyTok)) -                                     (matchPattern toks) -                               getargs (M.insert i x argmap) rest -                           getargs argmap (ArgNum i : rest) = do -                             x <- try $ spaces >> bracedOrToken -                             getargs (M.insert i x argmap) rest -                       args <- case optarg of -                                    Nothing -> getargs M.empty argspecs -                                    Just o  -> do -                                       x <- option o bracketedToks -                                       getargs (M.singleton 1 x) argspecs -                       -- first boolean param is true if we're tokenizing -                       -- an argument (in which case we don't want to -                       -- expand #1 etc.) -                       let addTok False (Tok _ (Arg i) _) acc = -                              case M.lookup i args of -                                   Nothing -> mzero -                                   Just xs -> foldr (addTok True) acc xs -                           -- see #4007 -                           addTok _ (Tok _ (CtrlSeq x) txt) -                                  acc@(Tok _ Word _ : _) -                             | not (T.null txt) && -                               isLetter (T.last txt) = -                               Tok spos (CtrlSeq x) (txt <> " ") : acc -                           addTok _ t acc = setpos spos t : acc -                       ts' <- getInput -                       setInput $ foldr (addTok False) ts' newtoks -                       case expansionPoint of -                            ExpandWhenUsed -> -                              if n > 20  -- detect macro expansion loops -                                 then throwError $ PandocMacroLoop (T.unpack name) -                                 else doMacros (n + 1) -                            ExpandWhenDefined -> return () - - -setpos :: SourcePos -> Tok -> Tok -setpos spos (Tok _ tt txt) = Tok spos tt txt - -anyControlSeq :: PandocMonad m => LP m Tok -anyControlSeq = satisfyTok isCtrlSeq - -isCtrlSeq :: Tok -> Bool -isCtrlSeq (Tok _ (CtrlSeq _) _) = True -isCtrlSeq _                     = False - -anySymbol :: PandocMonad m => LP m Tok -anySymbol = satisfyTok isSymbolTok - -isSymbolTok :: Tok -> Bool -isSymbolTok (Tok _ Symbol _) = True -isSymbolTok _                = False - -spaces :: PandocMonad m => LP m () -spaces = skipMany (satisfyTok (tokTypeIn [Comment, Spaces, Newline])) - -spaces1 :: PandocMonad m => LP m () -spaces1 = skipMany1 (satisfyTok (tokTypeIn [Comment, Spaces, Newline])) - -tokTypeIn :: [TokType] -> Tok -> Bool -tokTypeIn toktypes (Tok _ tt _) = tt `elem` toktypes - -controlSeq :: PandocMonad m => Text -> LP m Tok -controlSeq name = satisfyTok isNamed -  where isNamed (Tok _ (CtrlSeq n) _) = n == name -        isNamed _                     = False - -symbol :: PandocMonad m => Char -> LP m Tok -symbol c = satisfyTok isc -  where isc (Tok _ Symbol d) = case T.uncons d of -                                    Just (c',_) -> c == c' -                                    _           -> False -        isc _ = False - -symbolIn :: PandocMonad m => [Char] -> LP m Tok -symbolIn cs = satisfyTok isInCs -  where isInCs (Tok _ Symbol d) = case T.uncons d of -                                       Just (c,_) -> c `elem` cs -                                       _          -> False -        isInCs _ = False - -sp :: PandocMonad m => LP m () -sp = whitespace <|> endline - -whitespace :: PandocMonad m => LP m () -whitespace = () <$ satisfyTok isSpaceTok - -isSpaceTok :: Tok -> Bool -isSpaceTok (Tok _ Spaces _) = True -isSpaceTok _                = False - -newlineTok :: PandocMonad m => LP m () -newlineTok = () <$ satisfyTok isNewlineTok - -isNewlineTok :: Tok -> Bool -isNewlineTok (Tok _ Newline _) = True -isNewlineTok _                 = False - -comment :: PandocMonad m => LP m () -comment = () <$ satisfyTok isCommentTok - -isCommentTok :: Tok -> Bool -isCommentTok (Tok _ Comment _) = True -isCommentTok _                 = False - -anyTok :: PandocMonad m => LP m Tok -anyTok = satisfyTok (const True) - -endline :: PandocMonad m => LP m () -endline = try $ do -  newlineTok -  lookAhead anyTok -  notFollowedBy blankline - -blankline :: PandocMonad m => LP m () -blankline = try $ skipMany whitespace *> newlineTok - -primEscape :: PandocMonad m => LP m Char -primEscape = do -  Tok _ toktype t <- satisfyTok (tokTypeIn [Esc1, Esc2]) -  case toktype of -       Esc1 -> case T.uncons (T.drop 2 t) of -                    Just (c, _) -                      | c >= '\64' && c <= '\127' -> return (chr (ord c - 64)) -                      | otherwise                 -> return (chr (ord c + 64)) -                    Nothing -> fail "Empty content of Esc1" -       Esc2 -> case safeRead ('0':'x':T.unpack (T.drop 2 t)) of -                    Just x  -> return (chr x) -                    Nothing -> fail $ "Could not read: " ++ T.unpack t -       _    -> fail "Expected an Esc1 or Esc2 token" -- should not happen - -bgroup :: PandocMonad m => LP m Tok -bgroup = try $ do -  skipMany sp -  symbol '{' <|> controlSeq "bgroup" <|> controlSeq "begingroup" - -egroup :: PandocMonad m => LP m Tok -egroup = symbol '}' <|> controlSeq "egroup" <|> controlSeq "endgroup" - -grouped :: (PandocMonad m,  Monoid a) => LP m a -> LP m a -grouped parser = try $ do -  bgroup -  -- first we check for an inner 'grouped', because -  -- {{a,b}} should be parsed the same as {a,b} -  try (grouped parser <* egroup) <|> (mconcat <$> manyTill parser egroup) - -braced' :: PandocMonad m => LP m Tok -> Int -> LP m [Tok] -braced' getTok n = -  handleEgroup <|> handleBgroup <|> handleOther -  where handleEgroup = do -          t <- egroup -          if n == 1 -             then return [] -             else (t:) <$> braced' getTok (n - 1) -        handleBgroup = do -          t <- bgroup -          (t:) <$> braced' getTok (n + 1) -        handleOther = do -          t <- getTok -          (t:) <$> braced' getTok n - -braced :: PandocMonad m => LP m [Tok] -braced = bgroup *> braced' anyTok 1 - --- URLs require special handling, because they can contain % --- characters.  So we retonenize comments as we go... -bracedUrl :: PandocMonad m => LP m [Tok] -bracedUrl = bgroup *> braced' (retokenizeComment >> anyTok) 1 - -bracketed :: PandocMonad m => Monoid a => LP m a -> LP m a -bracketed parser = try $ do -  symbol '[' -  mconcat <$> manyTill parser (symbol ']') - -parenWrapped :: PandocMonad m => Monoid a => LP m a -> LP m a -parenWrapped parser = try $ do -  symbol '(' -  mconcat <$> manyTill parser (symbol ')') - -dimenarg :: PandocMonad m => LP m Text -dimenarg = try $ do -  ch  <- option False $ True <$ symbol '=' -  Tok _ _ s <- satisfyTok isWordTok -  guard $ T.take 2 (T.reverse s) `elem` -           ["pt","pc","in","bp","cm","mm","dd","cc","sp"] -  let num = T.take (T.length s - 2) s -  guard $ T.length num > 0 -  guard $ T.all isDigit num -  return $ T.pack ['=' | ch] <> s -  -- inline elements:  word :: PandocMonad m => LP m Inlines @@ -689,13 +191,6 @@ regularSymbol = (str . T.unpack . untoken) <$> satisfyTok isRegularSymbol          isRegularSymbol _                = False          isSpecial c = c `Set.member` specialChars -specialChars :: Set.Set Char -specialChars = Set.fromList "#$%&~_^\\{}" - -isWordTok :: Tok -> Bool -isWordTok (Tok _ Word _) = True -isWordTok _              = False -  inlineGroup :: PandocMonad m => LP m Inlines  inlineGroup = do    ils <- grouped inline @@ -1396,9 +891,6 @@ breve 'U' = "Ŭ"  breve 'u' = "ŭ"  breve c   = [c] -toksToString :: [Tok] -> String -toksToString = T.unpack . untokenize -  mathDisplay :: String -> Inlines  mathDisplay = displayMath . trim @@ -1562,19 +1054,6 @@ tok = try $ spaces >> grouped inline <|> inlineCommand' <|> singleChar'            Tok _ _ t <- singleChar            return (str (T.unpack t)) -singleChar :: PandocMonad m => LP m Tok -singleChar = try $ do -  Tok pos toktype t <- satisfyTok (tokTypeIn [Word, Symbol]) -  guard $ not $ toktype == Symbol && -                T.any (`Set.member` specialChars) t -  if T.length t > 1 -     then do -       let (t1, t2) = (T.take 1 t, T.drop 1 t) -       inp <- getInput -       setInput $ Tok (incSourceColumn pos 1) toktype t2 : inp -       return $ Tok pos toktype t1 -     else return $ Tok pos toktype t -  opt :: PandocMonad m => LP m Inlines  opt = bracketed inline <|> (str . T.unpack <$> rawopt) @@ -1611,20 +1090,6 @@ overlayTok =                      Tok _ Symbol c     -> c `elem` ["-","+","@","|",":",","]                      _                  -> False) -ignore :: (Monoid a, PandocMonad m) => String -> ParserT s u m a -ignore raw = do -  pos <- getPosition -  report $ SkippedContent raw pos -  return mempty - -withRaw :: PandocMonad m => LP m a -> LP m (a, [Tok]) -withRaw parser = do -  inp <- getInput -  result <- parser -  nxt <- option (Tok (initialPos "source") Word "") (lookAhead anyTok) -  let raw = takeWhile (/= nxt) inp -  return (result, raw) -  inBrackets :: Inlines -> Inlines  inBrackets x = str "[" <> x <> str "]" @@ -1634,17 +1099,6 @@ unescapeURL ('\\':x:xs) | isEscapable x = x:unescapeURL xs  unescapeURL (x:xs) = x:unescapeURL xs  unescapeURL [] = "" --- For handling URLs, which allow literal % characters... -retokenizeComment :: PandocMonad m => LP m () -retokenizeComment = (do -  Tok pos Comment txt <- satisfyTok isCommentTok -  let updPos (Tok pos' toktype' txt') = -        Tok (incSourceColumn (incSourceLine pos' (sourceLine pos - 1)) -             (sourceColumn pos)) toktype' txt' -  let newtoks = map updPos $ tokenize (sourceName pos) $ T.tail txt -  getInput >>= setInput . ((Tok pos Symbol "%" : newtoks) ++)) -    <|> return () -  mathEnvWith :: PandocMonad m              => (Inlines -> a) -> Maybe Text -> Text -> LP m a  mathEnvWith f innerEnv name = f . mathDisplay . inner <$> mathEnv name @@ -2364,9 +1818,6 @@ isArgTok :: Tok -> Bool  isArgTok (Tok _ (Arg _) _) = True  isArgTok _                 = False -bracedOrToken :: PandocMonad m => LP m [Tok] -bracedOrToken = braced <|> ((:[]) <$> (anyControlSeq <|> singleChar)) -  newcommand :: PandocMonad m => LP m (Text, Macro)  newcommand = do    pos <- getPosition @@ -2417,11 +1868,6 @@ newenvironment = do    return (name, Macro ExpandWhenUsed argspecs optarg startcontents,               Macro ExpandWhenUsed [] Nothing endcontents) -bracketedToks :: PandocMonad m => LP m [Tok] -bracketedToks = do -  symbol '[' -  mconcat <$> manyTill (braced <|> (:[]) <$> anyTok) (symbol ']') -  bracketedNum :: PandocMonad m => LP m Int  bracketedNum = do    ds <- untokenize <$> bracketedToks diff --git a/src/Text/Pandoc/Readers/LaTeX/Parsing.hs b/src/Text/Pandoc/Readers/LaTeX/Parsing.hs new file mode 100644 index 000000000..81d83dab2 --- /dev/null +++ b/src/Text/Pandoc/Readers/LaTeX/Parsing.hs @@ -0,0 +1,663 @@ +{-# LANGUAGE NoImplicitPrelude #-} +{-# LANGUAGE FlexibleInstances     #-} +{-# LANGUAGE MultiParamTypeClasses #-} +{-# LANGUAGE OverloadedStrings     #-} +{-# LANGUAGE ScopedTypeVariables   #-} +{- +Copyright (C) 2006-2018 John MacFarlane <jgm@berkeley.edu> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA +-} + +{- | +   Module      : Text.Pandoc.Readers.LaTeX.Parsing +   Copyright   : Copyright (C) 2006-2018 John MacFarlane +   License     : GNU GPL, version 2 or above + +   Maintainer  : John MacFarlane <jgm@berkeley.edu> +   Stability   : alpha +   Portability : portable + +General parsing types and functions for LaTeX. +-} +module Text.Pandoc.Readers.LaTeX.Parsing +  ( DottedNum(..) +  , renderDottedNum +  , incrementDottedNum +  , LaTeXState(..) +  , defaultLaTeXState +  , LP +  , withVerbatimMode +  , rawLaTeXParser +  , applyMacros +  , tokenize +  , untokenize +  , untoken +  , totoks +  , toksToString +  , satisfyTok +  , doMacros +  , setpos +  , anyControlSeq +  , anySymbol +  , isWordTok +  , isNewlineTok +  , spaces +  , spaces1 +  , tokTypeIn +  , controlSeq +  , symbol +  , symbolIn +  , sp +  , whitespace +  , newlineTok +  , comment +  , anyTok +  , singleChar +  , specialChars +  , endline +  , blankline +  , primEscape +  , bgroup +  , egroup +  , grouped +  , braced +  , braced' +  , bracedUrl +  , bracedOrToken +  , bracketed +  , bracketedToks +  , parenWrapped +  , dimenarg +  , ignore +  , withRaw +  ) where + +import Prelude +import Control.Applicative (many, (<|>)) +import Control.Monad +import Control.Monad.Except (throwError) +import Control.Monad.Trans (lift) +import Data.Char (chr, isAlphaNum, isDigit, isLetter, ord) +import Data.Default +import Data.List (intercalate) +import qualified Data.Map as M +import qualified Data.Set as Set +import Data.Text (Text) +import qualified Data.Text as T +import Text.Pandoc.Builder +import Text.Pandoc.Class (PandocMonad, report) +import Text.Pandoc.Error (PandocError (PandocMacroLoop)) +import Text.Pandoc.Logging +import Text.Pandoc.Options +import Text.Pandoc.Parsing hiding (blankline, many, mathDisplay, mathInline, +                            optional, space, spaces, withRaw, (<|>)) +import Text.Pandoc.Readers.LaTeX.Types (ExpansionPoint (..), Macro (..), +                                        ArgSpec (..), Tok (..), TokType (..)) +import Text.Pandoc.Shared +import Text.Parsec.Pos + +newtype DottedNum = DottedNum [Int] +  deriving (Show) + +renderDottedNum :: DottedNum -> String +renderDottedNum (DottedNum xs) = +  intercalate "." (map show xs) + +incrementDottedNum :: Int -> DottedNum -> DottedNum +incrementDottedNum level (DottedNum ns) = DottedNum $ +  case reverse (take level (ns ++ repeat 0)) of +       (x:xs) -> reverse (x+1 : xs) +       []     -> []  -- shouldn't happen + +data LaTeXState = LaTeXState{ sOptions       :: ReaderOptions +                            , sMeta          :: Meta +                            , sQuoteContext  :: QuoteContext +                            , sMacros        :: M.Map Text Macro +                            , sContainers    :: [String] +                            , sHeaders       :: M.Map Inlines String +                            , sLogMessages   :: [LogMessage] +                            , sIdentifiers   :: Set.Set String +                            , sVerbatimMode  :: Bool +                            , sCaption       :: (Maybe Inlines, Maybe String) +                            , sInListItem    :: Bool +                            , sInTableCell   :: Bool +                            , sLastHeaderNum :: DottedNum +                            , sLastFigureNum :: DottedNum +                            , sLabels        :: M.Map String [Inline] +                            , sHasChapters   :: Bool +                            , sToggles       :: M.Map String Bool +                            } +     deriving Show + +defaultLaTeXState :: LaTeXState +defaultLaTeXState = LaTeXState{ sOptions       = def +                              , sMeta          = nullMeta +                              , sQuoteContext  = NoQuote +                              , sMacros        = M.empty +                              , sContainers    = [] +                              , sHeaders       = M.empty +                              , sLogMessages   = [] +                              , sIdentifiers   = Set.empty +                              , sVerbatimMode  = False +                              , sCaption       = (Nothing, Nothing) +                              , sInListItem    = False +                              , sInTableCell   = False +                              , sLastHeaderNum = DottedNum [] +                              , sLastFigureNum = DottedNum [] +                              , sLabels        = M.empty +                              , sHasChapters   = False +                              , sToggles       = M.empty +                              } + +instance PandocMonad m => HasQuoteContext LaTeXState m where +  getQuoteContext = sQuoteContext <$> getState +  withQuoteContext context parser = do +    oldState <- getState +    let oldQuoteContext = sQuoteContext oldState +    setState oldState { sQuoteContext = context } +    result <- parser +    newState <- getState +    setState newState { sQuoteContext = oldQuoteContext } +    return result + +instance HasLogMessages LaTeXState where +  addLogMessage msg st = st{ sLogMessages = msg : sLogMessages st } +  getLogMessages st = reverse $ sLogMessages st + +instance HasIdentifierList LaTeXState where +  extractIdentifierList     = sIdentifiers +  updateIdentifierList f st = st{ sIdentifiers = f $ sIdentifiers st } + +instance HasIncludeFiles LaTeXState where +  getIncludeFiles = sContainers +  addIncludeFile f s = s{ sContainers = f : sContainers s } +  dropLatestIncludeFile s = s { sContainers = drop 1 $ sContainers s } + +instance HasHeaderMap LaTeXState where +  extractHeaderMap     = sHeaders +  updateHeaderMap f st = st{ sHeaders = f $ sHeaders st } + +instance HasMacros LaTeXState where +  extractMacros  st  = sMacros st +  updateMacros f st  = st{ sMacros = f (sMacros st) } + +instance HasReaderOptions LaTeXState where +  extractReaderOptions = sOptions + +instance HasMeta LaTeXState where +  setMeta field val st = +    st{ sMeta = setMeta field val $ sMeta st } +  deleteMeta field st = +    st{ sMeta = deleteMeta field $ sMeta st } + +instance Default LaTeXState where +  def = defaultLaTeXState + +type LP m = ParserT [Tok] LaTeXState m + +withVerbatimMode :: PandocMonad m => LP m a -> LP m a +withVerbatimMode parser = do +  updateState $ \st -> st{ sVerbatimMode = True } +  result <- parser +  updateState $ \st -> st{ sVerbatimMode = False } +  return result + +rawLaTeXParser :: (PandocMonad m, HasMacros s, HasReaderOptions s) +               => Bool -> LP m a -> LP m a -> ParserT String s m (a, String) +rawLaTeXParser retokenize parser valParser = do +  inp <- getInput +  let toks = tokenize "source" $ T.pack inp +  pstate <- getState +  let lstate = def{ sOptions = extractReaderOptions pstate } +  let lstate' = lstate { sMacros = extractMacros pstate } +  let rawparser = (,) <$> withRaw valParser <*> getState +  res' <- lift $ runParserT (snd <$> withRaw parser) lstate "chunk" toks +  case res' of +       Left _    -> mzero +       Right toks' -> do +         res <- lift $ runParserT (do when retokenize $ do +                                        -- retokenize, applying macros +                                        doMacros 0 +                                        ts <- many (satisfyTok (const True)) +                                        setInput ts +                                      rawparser) +                        lstate' "chunk" toks' +         case res of +              Left _    -> mzero +              Right ((val, raw), st) -> do +                updateState (updateMacros (sMacros st <>)) +                _ <- takeP (T.length (untokenize toks')) +                return (val, T.unpack (untokenize raw)) + +applyMacros :: (PandocMonad m, HasMacros s, HasReaderOptions s) +            => String -> ParserT String s m String +applyMacros s = (guardDisabled Ext_latex_macros >> return s) <|> +   do let retokenize = doMacros 0 *> +             (toksToString <$> many (satisfyTok (const True))) +      pstate <- getState +      let lstate = def{ sOptions = extractReaderOptions pstate +                      , sMacros  = extractMacros pstate } +      res <- runParserT retokenize lstate "math" (tokenize "math" (T.pack s)) +      case res of +           Left e   -> fail (show e) +           Right s' -> return s' +tokenize :: SourceName -> Text -> [Tok] +tokenize sourcename = totoks (initialPos sourcename) + +totoks :: SourcePos -> Text -> [Tok] +totoks pos t = +  case T.uncons t of +       Nothing        -> [] +       Just (c, rest) +         | c == '\n' -> +           Tok pos Newline "\n" +           : totoks (setSourceColumn (incSourceLine pos 1) 1) rest +         | isSpaceOrTab c -> +           let (sps, rest') = T.span isSpaceOrTab t +           in  Tok pos Spaces sps +               : totoks (incSourceColumn pos (T.length sps)) +                 rest' +         | isAlphaNum c -> +           let (ws, rest') = T.span isAlphaNum t +           in  Tok pos Word ws +               : totoks (incSourceColumn pos (T.length ws)) rest' +         | c == '%' -> +           let (cs, rest') = T.break (== '\n') rest +           in  Tok pos Comment ("%" <> cs) +               : totoks (incSourceColumn pos (1 + T.length cs)) rest' +         | c == '\\' -> +           case T.uncons rest of +                Nothing -> [Tok pos (CtrlSeq " ") "\\"] +                Just (d, rest') +                  | isLetterOrAt d -> +                      -- \makeatletter is common in macro defs; +                      -- ideally we should make tokenization sensitive +                      -- to \makeatletter and \makeatother, but this is +                      -- probably best for now +                      let (ws, rest'') = T.span isLetterOrAt rest +                          (ss, rest''') = T.span isSpaceOrTab rest'' +                      in  Tok pos (CtrlSeq ws) ("\\" <> ws <> ss) +                          : totoks (incSourceColumn pos +                               (1 + T.length ws + T.length ss)) rest''' +                  | isSpaceOrTab d || d == '\n' -> +                      let (w1, r1) = T.span isSpaceOrTab rest +                          (w2, (w3, r3)) = case T.uncons r1 of +                                          Just ('\n', r2) +                                                  -> (T.pack "\n", +                                                        T.span isSpaceOrTab r2) +                                          _ -> (mempty, (mempty, r1)) +                          ws = "\\" <> w1 <> w2 <> w3 +                      in  case T.uncons r3 of +                               Just ('\n', _) -> +                                 Tok pos (CtrlSeq " ") ("\\" <> w1) +                                 : totoks (incSourceColumn pos (T.length ws)) +                                   r1 +                               _ -> +                                 Tok pos (CtrlSeq " ") ws +                                 : totoks (incSourceColumn pos (T.length ws)) +                                   r3 +                  | otherwise  -> +                      Tok pos (CtrlSeq (T.singleton d)) (T.pack [c,d]) +                      : totoks (incSourceColumn pos 2) rest' +         | c == '#' -> +           let (t1, t2) = T.span (\d -> d >= '0' && d <= '9') rest +           in  case safeRead (T.unpack t1) of +                    Just i -> +                       Tok pos (Arg i) ("#" <> t1) +                       : totoks (incSourceColumn pos (1 + T.length t1)) t2 +                    Nothing -> +                       Tok pos Symbol "#" +                       : totoks (incSourceColumn pos 1) t2 +         | c == '^' -> +           case T.uncons rest of +                Just ('^', rest') -> +                  case T.uncons rest' of +                       Just (d, rest'') +                         | isLowerHex d -> +                           case T.uncons rest'' of +                                Just (e, rest''') | isLowerHex e -> +                                  Tok pos Esc2 (T.pack ['^','^',d,e]) +                                  : totoks (incSourceColumn pos 4) rest''' +                                _ -> +                                  Tok pos Esc1 (T.pack ['^','^',d]) +                                  : totoks (incSourceColumn pos 3) rest'' +                         | d < '\128' -> +                                  Tok pos Esc1 (T.pack ['^','^',d]) +                                  : totoks (incSourceColumn pos 3) rest'' +                       _ -> Tok pos Symbol "^" : +                            Tok (incSourceColumn pos 1) Symbol "^" : +                            totoks (incSourceColumn pos 2) rest' +                _ -> Tok pos Symbol "^" +                     : totoks (incSourceColumn pos 1) rest +         | otherwise -> +           Tok pos Symbol (T.singleton c) : totoks (incSourceColumn pos 1) rest + +isSpaceOrTab :: Char -> Bool +isSpaceOrTab ' '  = True +isSpaceOrTab '\t' = True +isSpaceOrTab _    = False + +isLetterOrAt :: Char -> Bool +isLetterOrAt '@' = True +isLetterOrAt c   = isLetter c + +isLowerHex :: Char -> Bool +isLowerHex x = x >= '0' && x <= '9' || x >= 'a' && x <= 'f' + +untokenize :: [Tok] -> Text +untokenize = mconcat . map untoken + +untoken :: Tok -> Text +untoken (Tok _ _ t) = t + +toksToString :: [Tok] -> String +toksToString = T.unpack . untokenize + +satisfyTok :: PandocMonad m => (Tok -> Bool) -> LP m Tok +satisfyTok f = +  try $ do +    res <- tokenPrim (T.unpack . untoken) updatePos matcher +    doMacros 0 -- apply macros on remaining input stream +    return res +  where matcher t | f t       = Just t +                  | otherwise = Nothing +        updatePos :: SourcePos -> Tok -> [Tok] -> SourcePos +        updatePos _spos _ (Tok pos _ _ : _) = pos +        updatePos spos _ []                 = incSourceColumn spos 1 + +doMacros :: PandocMonad m => Int -> LP m () +doMacros n = do +  verbatimMode <- sVerbatimMode <$> getState +  unless verbatimMode $ do +    inp <- getInput +    case inp of +         Tok spos (CtrlSeq "begin") _ : Tok _ Symbol "{" : +          Tok _ Word name : Tok _ Symbol "}" : ts +            -> handleMacros spos name ts +         Tok spos (CtrlSeq "end") _ : Tok _ Symbol "{" : +          Tok _ Word name : Tok _ Symbol "}" : ts +            -> handleMacros spos ("end" <> name) ts +         Tok _ (CtrlSeq "expandafter") _ : t : ts +            -> do setInput ts +                  doMacros n +                  getInput >>= setInput . combineTok t +         Tok spos (CtrlSeq name) _ : ts +            -> handleMacros spos name ts +         _ -> return () +  where combineTok (Tok spos (CtrlSeq name) x) (Tok _ Word w : ts) +          | T.all isLetterOrAt w = +            Tok spos (CtrlSeq (name <> w)) (x1 <> w <> x2) : ts +              where (x1, x2) = T.break isSpaceOrTab x +        combineTok t ts = t:ts +        handleMacros spos name ts = do +                macros <- sMacros <$> getState +                case M.lookup name macros of +                     Nothing -> return () +                     Just (Macro expansionPoint argspecs optarg newtoks) -> do +                       setInput ts +                       let matchTok (Tok _ toktype txt) = +                             satisfyTok (\(Tok _ toktype' txt') -> +                                           toktype == toktype' && +                                           txt == txt') +                       let matchPattern toks = try $ mapM_ matchTok toks +                       let getargs argmap [] = return argmap +                           getargs argmap (Pattern toks : rest) = try $ do +                              matchPattern toks +                              getargs argmap rest +                           getargs argmap (ArgNum i : Pattern toks : rest) = +                             try $ do +                               x <- mconcat <$> manyTill +                                     (braced <|> ((:[]) <$> anyTok)) +                                     (matchPattern toks) +                               getargs (M.insert i x argmap) rest +                           getargs argmap (ArgNum i : rest) = do +                             x <- try $ spaces >> bracedOrToken +                             getargs (M.insert i x argmap) rest +                       args <- case optarg of +                                    Nothing -> getargs M.empty argspecs +                                    Just o  -> do +                                       x <- option o bracketedToks +                                       getargs (M.singleton 1 x) argspecs +                       -- first boolean param is true if we're tokenizing +                       -- an argument (in which case we don't want to +                       -- expand #1 etc.) +                       let addTok False (Tok _ (Arg i) _) acc = +                              case M.lookup i args of +                                   Nothing -> mzero +                                   Just xs -> foldr (addTok True) acc xs +                           -- see #4007 +                           addTok _ (Tok _ (CtrlSeq x) txt) +                                  acc@(Tok _ Word _ : _) +                             | not (T.null txt) && +                               isLetter (T.last txt) = +                               Tok spos (CtrlSeq x) (txt <> " ") : acc +                           addTok _ t acc = setpos spos t : acc +                       ts' <- getInput +                       setInput $ foldr (addTok False) ts' newtoks +                       case expansionPoint of +                            ExpandWhenUsed -> +                              if n > 20  -- detect macro expansion loops +                                 then throwError $ PandocMacroLoop (T.unpack name) +                                 else doMacros (n + 1) +                            ExpandWhenDefined -> return () + + +setpos :: SourcePos -> Tok -> Tok +setpos spos (Tok _ tt txt) = Tok spos tt txt + +anyControlSeq :: PandocMonad m => LP m Tok +anyControlSeq = satisfyTok isCtrlSeq + +isCtrlSeq :: Tok -> Bool +isCtrlSeq (Tok _ (CtrlSeq _) _) = True +isCtrlSeq _                     = False + +anySymbol :: PandocMonad m => LP m Tok +anySymbol = satisfyTok isSymbolTok + +isSymbolTok :: Tok -> Bool +isSymbolTok (Tok _ Symbol _) = True +isSymbolTok _                = False + +isWordTok :: Tok -> Bool +isWordTok (Tok _ Word _) = True +isWordTok _              = False + +spaces :: PandocMonad m => LP m () +spaces = skipMany (satisfyTok (tokTypeIn [Comment, Spaces, Newline])) + +spaces1 :: PandocMonad m => LP m () +spaces1 = skipMany1 (satisfyTok (tokTypeIn [Comment, Spaces, Newline])) + +tokTypeIn :: [TokType] -> Tok -> Bool +tokTypeIn toktypes (Tok _ tt _) = tt `elem` toktypes + +controlSeq :: PandocMonad m => Text -> LP m Tok +controlSeq name = satisfyTok isNamed +  where isNamed (Tok _ (CtrlSeq n) _) = n == name +        isNamed _                     = False + +symbol :: PandocMonad m => Char -> LP m Tok +symbol c = satisfyTok isc +  where isc (Tok _ Symbol d) = case T.uncons d of +                                    Just (c',_) -> c == c' +                                    _           -> False +        isc _ = False + +symbolIn :: PandocMonad m => [Char] -> LP m Tok +symbolIn cs = satisfyTok isInCs +  where isInCs (Tok _ Symbol d) = case T.uncons d of +                                       Just (c,_) -> c `elem` cs +                                       _          -> False +        isInCs _ = False + +sp :: PandocMonad m => LP m () +sp = whitespace <|> endline + +whitespace :: PandocMonad m => LP m () +whitespace = () <$ satisfyTok isSpaceTok + +isSpaceTok :: Tok -> Bool +isSpaceTok (Tok _ Spaces _) = True +isSpaceTok _                = False + +newlineTok :: PandocMonad m => LP m () +newlineTok = () <$ satisfyTok isNewlineTok + +isNewlineTok :: Tok -> Bool +isNewlineTok (Tok _ Newline _) = True +isNewlineTok _                 = False + +comment :: PandocMonad m => LP m () +comment = () <$ satisfyTok isCommentTok + +isCommentTok :: Tok -> Bool +isCommentTok (Tok _ Comment _) = True +isCommentTok _                 = False + +anyTok :: PandocMonad m => LP m Tok +anyTok = satisfyTok (const True) + +singleChar :: PandocMonad m => LP m Tok +singleChar = try $ do +  Tok pos toktype t <- satisfyTok (tokTypeIn [Word, Symbol]) +  guard $ not $ toktype == Symbol && +                T.any (`Set.member` specialChars) t +  if T.length t > 1 +     then do +       let (t1, t2) = (T.take 1 t, T.drop 1 t) +       inp <- getInput +       setInput $ Tok (incSourceColumn pos 1) toktype t2 : inp +       return $ Tok pos toktype t1 +     else return $ Tok pos toktype t + +specialChars :: Set.Set Char +specialChars = Set.fromList "#$%&~_^\\{}" + +endline :: PandocMonad m => LP m () +endline = try $ do +  newlineTok +  lookAhead anyTok +  notFollowedBy blankline + +blankline :: PandocMonad m => LP m () +blankline = try $ skipMany whitespace *> newlineTok + +primEscape :: PandocMonad m => LP m Char +primEscape = do +  Tok _ toktype t <- satisfyTok (tokTypeIn [Esc1, Esc2]) +  case toktype of +       Esc1 -> case T.uncons (T.drop 2 t) of +                    Just (c, _) +                      | c >= '\64' && c <= '\127' -> return (chr (ord c - 64)) +                      | otherwise                 -> return (chr (ord c + 64)) +                    Nothing -> fail "Empty content of Esc1" +       Esc2 -> case safeRead ('0':'x':T.unpack (T.drop 2 t)) of +                    Just x  -> return (chr x) +                    Nothing -> fail $ "Could not read: " ++ T.unpack t +       _    -> fail "Expected an Esc1 or Esc2 token" -- should not happen + +bgroup :: PandocMonad m => LP m Tok +bgroup = try $ do +  skipMany sp +  symbol '{' <|> controlSeq "bgroup" <|> controlSeq "begingroup" + +egroup :: PandocMonad m => LP m Tok +egroup = symbol '}' <|> controlSeq "egroup" <|> controlSeq "endgroup" + +grouped :: (PandocMonad m,  Monoid a) => LP m a -> LP m a +grouped parser = try $ do +  bgroup +  -- first we check for an inner 'grouped', because +  -- {{a,b}} should be parsed the same as {a,b} +  try (grouped parser <* egroup) <|> (mconcat <$> manyTill parser egroup) + +braced' :: PandocMonad m => LP m Tok -> Int -> LP m [Tok] +braced' getTok n = +  handleEgroup <|> handleBgroup <|> handleOther +  where handleEgroup = do +          t <- egroup +          if n == 1 +             then return [] +             else (t:) <$> braced' getTok (n - 1) +        handleBgroup = do +          t <- bgroup +          (t:) <$> braced' getTok (n + 1) +        handleOther = do +          t <- getTok +          (t:) <$> braced' getTok n + +braced :: PandocMonad m => LP m [Tok] +braced = bgroup *> braced' anyTok 1 + +-- URLs require special handling, because they can contain % +-- characters.  So we retonenize comments as we go... +bracedUrl :: PandocMonad m => LP m [Tok] +bracedUrl = bgroup *> braced' (retokenizeComment >> anyTok) 1 + +-- For handling URLs, which allow literal % characters... +retokenizeComment :: PandocMonad m => LP m () +retokenizeComment = (do +  Tok pos Comment txt <- satisfyTok isCommentTok +  let updPos (Tok pos' toktype' txt') = +        Tok (incSourceColumn (incSourceLine pos' (sourceLine pos - 1)) +             (sourceColumn pos)) toktype' txt' +  let newtoks = map updPos $ tokenize (sourceName pos) $ T.tail txt +  getInput >>= setInput . ((Tok pos Symbol "%" : newtoks) ++)) +    <|> return () + +bracedOrToken :: PandocMonad m => LP m [Tok] +bracedOrToken = braced <|> ((:[]) <$> (anyControlSeq <|> singleChar)) + +bracketed :: PandocMonad m => Monoid a => LP m a -> LP m a +bracketed parser = try $ do +  symbol '[' +  mconcat <$> manyTill parser (symbol ']') + +bracketedToks :: PandocMonad m => LP m [Tok] +bracketedToks = do +  symbol '[' +  mconcat <$> manyTill (braced <|> (:[]) <$> anyTok) (symbol ']') + +parenWrapped :: PandocMonad m => Monoid a => LP m a -> LP m a +parenWrapped parser = try $ do +  symbol '(' +  mconcat <$> manyTill parser (symbol ')') + +dimenarg :: PandocMonad m => LP m Text +dimenarg = try $ do +  ch  <- option False $ True <$ symbol '=' +  Tok _ _ s <- satisfyTok isWordTok +  guard $ T.take 2 (T.reverse s) `elem` +           ["pt","pc","in","bp","cm","mm","dd","cc","sp"] +  let num = T.take (T.length s - 2) s +  guard $ T.length num > 0 +  guard $ T.all isDigit num +  return $ T.pack ['=' | ch] <> s + +ignore :: (Monoid a, PandocMonad m) => String -> ParserT s u m a +ignore raw = do +  pos <- getPosition +  report $ SkippedContent raw pos +  return mempty + +withRaw :: PandocMonad m => LP m a -> LP m (a, [Tok]) +withRaw parser = do +  inp <- getInput +  result <- parser +  nxt <- option (Tok (initialPos "source") Word "") (lookAhead anyTok) +  let raw = takeWhile (/= nxt) inp +  return (result, raw) | 
