Added Text.Pandoc.Readers.LaTeX.Parsing (unexported).

This collects some of the general-purpose code from the LaTeX reader, with the aim of making the module smaller. (We've been having out-of-memory issues compiling this module on CI.)
author: John MacFarlane <jgm@berkeley.edu> 2018-09-28 10:33:32 -0700
committer: John MacFarlane <jgm@berkeley.edu> 2018-09-28 10:33:32 -0700
commit: 9dac993835603a1a92136f07e80a6561b1598754 (patch)
tree: d345721f53c39d84adf76c87cef4ab531aff5282 /src
parent: 4f9ab7e03268e576d86e697f7110869434d08557 (diff)
download: pandoc-9dac993835603a1a92136f07e80a6561b1598754.tar.gz
2 files changed, 666 insertions, 557 deletions
diff --git a/src/Text/Pandoc/Readers/LaTeX.hs b/src/Text/Pandoc/Readers/LaTeX.hs
index 4a7f2f978..5065cc81c 100644
--- a/src/Text/Pandoc/Readers/LaTeX.hs
+++ b/src/Text/Pandoc/Readers/LaTeX.hs
@@ -47,8 +47,7 @@ import Prelude
 import Control.Applicative (many, optional, (<|>))
 import Control.Monad
 import Control.Monad.Except (throwError)
-import Control.Monad.Trans (lift)
-import Data.Char (chr, isAlphaNum, isDigit, isLetter, ord, toLower, toUpper)
+import Data.Char (isDigit, isLetter, toLower, toUpper)
 import Data.Default
 import Data.List (intercalate, isPrefixOf)
 import qualified Data.Map as M
@@ -63,7 +62,7 @@ import Text.Pandoc.Builder
 import Text.Pandoc.Class (PandocMonad, PandocPure, getResourcePath, lookupEnv,
                           readFileFromDirs, report, setResourcePath,
                           setTranslations, translateTerm, trace)
-import Text.Pandoc.Error (PandocError (PandocMacroLoop, PandocParseError, PandocParsecError))
+import Text.Pandoc.Error (PandocError ( PandocParseError, PandocParsecError))
 import Text.Pandoc.Highlighting (fromListingsLanguage, languagesByExtension)
 import Text.Pandoc.ImageSize (numUnit, showFl)
 import Text.Pandoc.Logging
@@ -72,10 +71,10 @@ import Text.Pandoc.Parsing hiding (blankline, many, mathDisplay, mathInline,
                             optional, space, spaces, withRaw, (<|>))
 import Text.Pandoc.Readers.LaTeX.Types (ExpansionPoint (..), Macro (..),
                                         ArgSpec (..), Tok (..), TokType (..))
+import Text.Pandoc.Readers.LaTeX.Parsing
 import Text.Pandoc.Shared
 import qualified Text.Pandoc.Translations as Translations
 import Text.Pandoc.Walk
-import Text.Parsec.Pos
 import qualified Text.Pandoc.Builder as B
 
 -- for debugging:
@@ -137,151 +136,6 @@ resolveRefs _ x = x
 --        Left e  -> error (show e)
 --        Right r -> return r
 
-newtype DottedNum = DottedNum [Int]
-  deriving (Show)
-
-renderDottedNum :: DottedNum -> String
-renderDottedNum (DottedNum xs) =
-  intercalate "." (map show xs)
-
-incrementDottedNum :: Int -> DottedNum -> DottedNum
-incrementDottedNum level (DottedNum ns) = DottedNum $
-  case reverse (take level (ns ++ repeat 0)) of
-       (x:xs) -> reverse (x+1 : xs)
-       []     -> []  -- shouldn't happen
-
-data LaTeXState = LaTeXState{ sOptions       :: ReaderOptions
-                            , sMeta          :: Meta
-                            , sQuoteContext  :: QuoteContext
-                            , sMacros        :: M.Map Text Macro
-                            , sContainers    :: [String]
-                            , sHeaders       :: M.Map Inlines String
-                            , sLogMessages   :: [LogMessage]
-                            , sIdentifiers   :: Set.Set String
-                            , sVerbatimMode  :: Bool
-                            , sCaption       :: (Maybe Inlines, Maybe String)
-                            , sInListItem    :: Bool
-                            , sInTableCell   :: Bool
-                            , sLastHeaderNum :: DottedNum
-                            , sLastFigureNum :: DottedNum
-                            , sLabels        :: M.Map String [Inline]
-                            , sHasChapters   :: Bool
-                            , sToggles       :: M.Map String Bool
-                            }
-     deriving Show
-
-defaultLaTeXState :: LaTeXState
-defaultLaTeXState = LaTeXState{ sOptions       = def
-                              , sMeta          = nullMeta
-                              , sQuoteContext  = NoQuote
-                              , sMacros        = M.empty
-                              , sContainers    = []
-                              , sHeaders       = M.empty
-                              , sLogMessages   = []
-                              , sIdentifiers   = Set.empty
-                              , sVerbatimMode  = False
-                              , sCaption       = (Nothing, Nothing)
-                              , sInListItem    = False
-                              , sInTableCell   = False
-                              , sLastHeaderNum = DottedNum []
-                              , sLastFigureNum = DottedNum []
-                              , sLabels        = M.empty
-                              , sHasChapters   = False
-                              , sToggles       = M.empty
-                              }
-
-instance PandocMonad m => HasQuoteContext LaTeXState m where
-  getQuoteContext = sQuoteContext <$> getState
-  withQuoteContext context parser = do
-    oldState <- getState
-    let oldQuoteContext = sQuoteContext oldState
-    setState oldState { sQuoteContext = context }
-    result <- parser
-    newState <- getState
-    setState newState { sQuoteContext = oldQuoteContext }
-    return result
-
-instance HasLogMessages LaTeXState where
-  addLogMessage msg st = st{ sLogMessages = msg : sLogMessages st }
-  getLogMessages st = reverse $ sLogMessages st
-
-instance HasIdentifierList LaTeXState where
-  extractIdentifierList     = sIdentifiers
-  updateIdentifierList f st = st{ sIdentifiers = f $ sIdentifiers st }
-
-instance HasIncludeFiles LaTeXState where
-  getIncludeFiles = sContainers
-  addIncludeFile f s = s{ sContainers = f : sContainers s }
-  dropLatestIncludeFile s = s { sContainers = drop 1 $ sContainers s }
-
-instance HasHeaderMap LaTeXState where
-  extractHeaderMap     = sHeaders
-  updateHeaderMap f st = st{ sHeaders = f $ sHeaders st }
-
-instance HasMacros LaTeXState where
-  extractMacros  st  = sMacros st
-  updateMacros f st  = st{ sMacros = f (sMacros st) }
-
-instance HasReaderOptions LaTeXState where
-  extractReaderOptions = sOptions
-
-instance HasMeta LaTeXState where
-  setMeta field val st =
-    st{ sMeta = setMeta field val $ sMeta st }
-  deleteMeta field st =
-    st{ sMeta = deleteMeta field $ sMeta st }
-
-instance Default LaTeXState where
-  def = defaultLaTeXState
-
-type LP m = ParserT [Tok] LaTeXState m
-
-withVerbatimMode :: PandocMonad m => LP m a -> LP m a
-withVerbatimMode parser = do
-  updateState $ \st -> st{ sVerbatimMode = True }
-  result <- parser
-  updateState $ \st -> st{ sVerbatimMode = False }
-  return result
-
-rawLaTeXParser :: (PandocMonad m, HasMacros s, HasReaderOptions s)
-               => Bool -> LP m a -> LP m a -> ParserT String s m (a, String)
-rawLaTeXParser retokenize parser valParser = do
-  inp <- getInput
-  let toks = tokenize "source" $ T.pack inp
-  pstate <- getState
-  let lstate = def{ sOptions = extractReaderOptions pstate }
-  let lstate' = lstate { sMacros = extractMacros pstate }
-  let rawparser = (,) <$> withRaw valParser <*> getState
-  res' <- lift $ runParserT (snd <$> withRaw parser) lstate "chunk" toks
-  case res' of
-       Left _    -> mzero
-       Right toks' -> do
-         res <- lift $ runParserT (do when retokenize $ do
-                                        -- retokenize, applying macros
-                                        doMacros 0
-                                        ts <- many (satisfyTok (const True))
-                                        setInput ts
-                                      rawparser)
-                        lstate' "chunk" toks'
-         case res of
-              Left _    -> mzero
-              Right ((val, raw), st) -> do
-                updateState (updateMacros (sMacros st <>))
-                _ <- takeP (T.length (untokenize toks'))
-                return (val, T.unpack (untokenize raw))
-
-applyMacros :: (PandocMonad m, HasMacros s, HasReaderOptions s)
-            => String -> ParserT String s m String
-applyMacros s = (guardDisabled Ext_latex_macros >> return s) <|>
-   do let retokenize = doMacros 0 *>
-             (toksToString <$> many (satisfyTok (const True)))
-      pstate <- getState
-      let lstate = def{ sOptions = extractReaderOptions pstate
-                      , sMacros  = extractMacros pstate }
-      res <- runParserT retokenize lstate "math" (tokenize "math" (T.pack s))
-      case res of
-           Left e   -> fail (show e)
-           Right s' -> return s'
 
 rawLaTeXBlock :: (PandocMonad m, HasMacros s, HasReaderOptions s)
               => ParserT String s m String
@@ -326,358 +180,6 @@ inlineCommand = do
   lookAhead (try (char '\\' >> letter))
   fst <$> rawLaTeXParser True (inlineEnvironment <|> inlineCommand') inlines
 
-tokenize :: SourceName -> Text -> [Tok]
-tokenize sourcename = totoks (initialPos sourcename)
-
-totoks :: SourcePos -> Text -> [Tok]
-totoks pos t =
-  case T.uncons t of
-       Nothing        -> []
-       Just (c, rest)
-         | c == '\n' ->
-           Tok pos Newline "\n"
-           : totoks (setSourceColumn (incSourceLine pos 1) 1) rest
-         | isSpaceOrTab c ->
-           let (sps, rest') = T.span isSpaceOrTab t
-           in  Tok pos Spaces sps
-               : totoks (incSourceColumn pos (T.length sps))
-                 rest'
-         | isAlphaNum c ->
-           let (ws, rest') = T.span isAlphaNum t
-           in  Tok pos Word ws
-               : totoks (incSourceColumn pos (T.length ws)) rest'
-         | c == '%' ->
-           let (cs, rest') = T.break (== '\n') rest
-           in  Tok pos Comment ("%" <> cs)
-               : totoks (incSourceColumn pos (1 + T.length cs)) rest'
-         | c == '\\' ->
-           case T.uncons rest of
-                Nothing -> [Tok pos (CtrlSeq " ") "\\"]
-                Just (d, rest')
-                  | isLetterOrAt d ->
-                      -- \makeatletter is common in macro defs;
-                      -- ideally we should make tokenization sensitive
-                      -- to \makeatletter and \makeatother, but this is
-                      -- probably best for now
-                      let (ws, rest'') = T.span isLetterOrAt rest
-                          (ss, rest''') = T.span isSpaceOrTab rest''
-                      in  Tok pos (CtrlSeq ws) ("\\" <> ws <> ss)
-                          : totoks (incSourceColumn pos
-                               (1 + T.length ws + T.length ss)) rest'''
-                  | isSpaceOrTab d || d == '\n' ->
-                      let (w1, r1) = T.span isSpaceOrTab rest
-                          (w2, (w3, r3)) = case T.uncons r1 of
-                                          Just ('\n', r2)
-                                                  -> (T.pack "\n",
-                                                        T.span isSpaceOrTab r2)
-                                          _ -> (mempty, (mempty, r1))
-                          ws = "\\" <> w1 <> w2 <> w3
-                      in  case T.uncons r3 of
-                               Just ('\n', _) ->
-                                 Tok pos (CtrlSeq " ") ("\\" <> w1)
-                                 : totoks (incSourceColumn pos (T.length ws))
-                                   r1
-                               _ ->
-                                 Tok pos (CtrlSeq " ") ws
-                                 : totoks (incSourceColumn pos (T.length ws))
-                                   r3
-                  | otherwise  ->
-                      Tok pos (CtrlSeq (T.singleton d)) (T.pack [c,d])
-                      : totoks (incSourceColumn pos 2) rest'
-         | c == '#' ->
-           let (t1, t2) = T.span (\d -> d >= '0' && d <= '9') rest
-           in  case safeRead (T.unpack t1) of
-                    Just i ->
-                       Tok pos (Arg i) ("#" <> t1)
-                       : totoks (incSourceColumn pos (1 + T.length t1)) t2
-                    Nothing ->
-                       Tok pos Symbol "#"
-                       : totoks (incSourceColumn pos 1) t2
-         | c == '^' ->
-           case T.uncons rest of
-                Just ('^', rest') ->
-                  case T.uncons rest' of
-                       Just (d, rest'')
-                         | isLowerHex d ->
-                           case T.uncons rest'' of
-                                Just (e, rest''') | isLowerHex e ->
-                                  Tok pos Esc2 (T.pack ['^','^',d,e])
-                                  : totoks (incSourceColumn pos 4) rest'''
-                                _ ->
-                                  Tok pos Esc1 (T.pack ['^','^',d])
-                                  : totoks (incSourceColumn pos 3) rest''
-                         | d < '\128' ->
-                                  Tok pos Esc1 (T.pack ['^','^',d])
-                                  : totoks (incSourceColumn pos 3) rest''
-                       _ -> Tok pos Symbol "^" :
-                            Tok (incSourceColumn pos 1) Symbol "^" :
-                            totoks (incSourceColumn pos 2) rest'
-                _ -> Tok pos Symbol "^"
-                     : totoks (incSourceColumn pos 1) rest
-         | otherwise ->
-           Tok pos Symbol (T.singleton c) : totoks (incSourceColumn pos 1) rest
-
-isSpaceOrTab :: Char -> Bool
-isSpaceOrTab ' '  = True
-isSpaceOrTab '\t' = True
-isSpaceOrTab _    = False
-
-isLetterOrAt :: Char -> Bool
-isLetterOrAt '@' = True
-isLetterOrAt c   = isLetter c
-
-isLowerHex :: Char -> Bool
-isLowerHex x = x >= '0' && x <= '9' || x >= 'a' && x <= 'f'
-
-untokenize :: [Tok] -> Text
-untokenize = mconcat . map untoken
-
-untoken :: Tok -> Text
-untoken (Tok _ _ t) = t
-
-satisfyTok :: PandocMonad m => (Tok -> Bool) -> LP m Tok
-satisfyTok f =
-  try $ do
-    res <- tokenPrim (T.unpack . untoken) updatePos matcher
-    doMacros 0 -- apply macros on remaining input stream
-    return res
-  where matcher t | f t       = Just t
-                  | otherwise = Nothing
-        updatePos :: SourcePos -> Tok -> [Tok] -> SourcePos
-        updatePos _spos _ (Tok pos _ _ : _) = pos
-        updatePos spos _ []                 = incSourceColumn spos 1
-
-doMacros :: PandocMonad m => Int -> LP m ()
-doMacros n = do
-  verbatimMode <- sVerbatimMode <$> getState
-  unless verbatimMode $ do
-    inp <- getInput
-    case inp of
-         Tok spos (CtrlSeq "begin") _ : Tok _ Symbol "{" :
-          Tok _ Word name : Tok _ Symbol "}" : ts
-            -> handleMacros spos name ts
-         Tok spos (CtrlSeq "end") _ : Tok _ Symbol "{" :
-          Tok _ Word name : Tok _ Symbol "}" : ts
-            -> handleMacros spos ("end" <> name) ts
-         Tok _ (CtrlSeq "expandafter") _ : t : ts
-            -> do setInput ts
-                  doMacros n
-                  getInput >>= setInput . combineTok t
-         Tok spos (CtrlSeq name) _ : ts
-            -> handleMacros spos name ts
-         _ -> return ()
-  where combineTok (Tok spos (CtrlSeq name) x) (Tok _ Word w : ts)
-          | T.all isLetterOrAt w =
-            Tok spos (CtrlSeq (name <> w)) (x1 <> w <> x2) : ts
-              where (x1, x2) = T.break isSpaceOrTab x
-        combineTok t ts = t:ts
-        handleMacros spos name ts = do
-                macros <- sMacros <$> getState
-                case M.lookup name macros of
-                     Nothing -> return ()
-                     Just (Macro expansionPoint argspecs optarg newtoks) -> do
-                       setInput ts
-                       let matchTok (Tok _ toktype txt) =
-                             satisfyTok (\(Tok _ toktype' txt') ->
-                                           toktype == toktype' &&
-                                           txt == txt')
-                       let matchPattern toks = try $ mapM_ matchTok toks
-                       let getargs argmap [] = return argmap
-                           getargs argmap (Pattern toks : rest) = try $ do
-                              matchPattern toks
-                              getargs argmap rest
-                           getargs argmap (ArgNum i : Pattern toks : rest) =
-                             try $ do
-                               x <- mconcat <$> manyTill
-                                     (braced <|> ((:[]) <$> anyTok))
-                                     (matchPattern toks)
-                               getargs (M.insert i x argmap) rest
-                           getargs argmap (ArgNum i : rest) = do
-                             x <- try $ spaces >> bracedOrToken
-                             getargs (M.insert i x argmap) rest
-                       args <- case optarg of
-                                    Nothing -> getargs M.empty argspecs
-                                    Just o  -> do
-                                       x <- option o bracketedToks
-                                       getargs (M.singleton 1 x) argspecs
-                       -- first boolean param is true if we're tokenizing
-                       -- an argument (in which case we don't want to
-                       -- expand #1 etc.)
-                       let addTok False (Tok _ (Arg i) _) acc =
-                              case M.lookup i args of
-                                   Nothing -> mzero
-                                   Just xs -> foldr (addTok True) acc xs
-                           -- see #4007
-                           addTok _ (Tok _ (CtrlSeq x) txt)
-                                  acc@(Tok _ Word _ : _)
-                             | not (T.null txt) &&
-                               isLetter (T.last txt) =
-                               Tok spos (CtrlSeq x) (txt <> " ") : acc
-                           addTok _ t acc = setpos spos t : acc
-                       ts' <- getInput
-                       setInput $ foldr (addTok False) ts' newtoks
-                       case expansionPoint of
-                            ExpandWhenUsed ->
-                              if n > 20  -- detect macro expansion loops
-                                 then throwError $ PandocMacroLoop (T.unpack name)
-                                 else doMacros (n + 1)
-                            ExpandWhenDefined -> return ()
-
-
-setpos :: SourcePos -> Tok -> Tok
-setpos spos (Tok _ tt txt) = Tok spos tt txt
-
-anyControlSeq :: PandocMonad m => LP m Tok
-anyControlSeq = satisfyTok isCtrlSeq
-
-isCtrlSeq :: Tok -> Bool
-isCtrlSeq (Tok _ (CtrlSeq _) _) = True
-isCtrlSeq _                     = False
-
-anySymbol :: PandocMonad m => LP m Tok
-anySymbol = satisfyTok isSymbolTok
-
-isSymbolTok :: Tok -> Bool
-isSymbolTok (Tok _ Symbol _) = True
-isSymbolTok _                = False
-
-spaces :: PandocMonad m => LP m ()
-spaces = skipMany (satisfyTok (tokTypeIn [Comment, Spaces, Newline]))
-
-spaces1 :: PandocMonad m => LP m ()
-spaces1 = skipMany1 (satisfyTok (tokTypeIn [Comment, Spaces, Newline]))
-
-tokTypeIn :: [TokType] -> Tok -> Bool
-tokTypeIn toktypes (Tok _ tt _) = tt `elem` toktypes
-
-controlSeq :: PandocMonad m => Text -> LP m Tok
-controlSeq name = satisfyTok isNamed
-  where isNamed (Tok _ (CtrlSeq n) _) = n == name
-        isNamed _                     = False
-
-symbol :: PandocMonad m => Char -> LP m Tok
-symbol c = satisfyTok isc
-  where isc (Tok _ Symbol d) = case T.uncons d of
-                                    Just (c',_) -> c == c'
-                                    _           -> False
-        isc _ = False
-
-symbolIn :: PandocMonad m => [Char] -> LP m Tok
-symbolIn cs = satisfyTok isInCs
-  where isInCs (Tok _ Symbol d) = case T.uncons d of
-                                       Just (c,_) -> c `elem` cs
-                                       _          -> False
-        isInCs _ = False
-
-sp :: PandocMonad m => LP m ()
-sp = whitespace <|> endline
-
-whitespace :: PandocMonad m => LP m ()
-whitespace = () <$ satisfyTok isSpaceTok
-
-isSpaceTok :: Tok -> Bool
-isSpaceTok (Tok _ Spaces _) = True
-isSpaceTok _                = False
-
-newlineTok :: PandocMonad m => LP m ()
-newlineTok = () <$ satisfyTok isNewlineTok
-
-isNewlineTok :: Tok -> Bool
-isNewlineTok (Tok _ Newline _) = True
-isNewlineTok _                 = False
-
-comment :: PandocMonad m => LP m ()
-comment = () <$ satisfyTok isCommentTok
-
-isCommentTok :: Tok -> Bool
-isCommentTok (Tok _ Comment _) = True
-isCommentTok _                 = False
-
-anyTok :: PandocMonad m => LP m Tok
-anyTok = satisfyTok (const True)
-
-endline :: PandocMonad m => LP m ()
-endline = try $ do
-  newlineTok
-  lookAhead anyTok
-  notFollowedBy blankline
-
-blankline :: PandocMonad m => LP m ()
-blankline = try $ skipMany whitespace *> newlineTok
-
-primEscape :: PandocMonad m => LP m Char
-primEscape = do
-  Tok _ toktype t <- satisfyTok (tokTypeIn [Esc1, Esc2])
-  case toktype of
-       Esc1 -> case T.uncons (T.drop 2 t) of
-                    Just (c, _)
-                      | c >= '\64' && c <= '\127' -> return (chr (ord c - 64))
-                      | otherwise                 -> return (chr (ord c + 64))
-                    Nothing -> fail "Empty content of Esc1"
-       Esc2 -> case safeRead ('0':'x':T.unpack (T.drop 2 t)) of
-                    Just x  -> return (chr x)
-                    Nothing -> fail $ "Could not read: " ++ T.unpack t
-       _    -> fail "Expected an Esc1 or Esc2 token" -- should not happen
-
-bgroup :: PandocMonad m => LP m Tok
-bgroup = try $ do
-  skipMany sp
-  symbol '{' <|> controlSeq "bgroup" <|> controlSeq "begingroup"
-
-egroup :: PandocMonad m => LP m Tok
-egroup = symbol '}' <|> controlSeq "egroup" <|> controlSeq "endgroup"
-
-grouped :: (PandocMonad m,  Monoid a) => LP m a -> LP m a
-grouped parser = try $ do
-  bgroup
-  -- first we check for an inner 'grouped', because
-  -- {{a,b}} should be parsed the same as {a,b}
-  try (grouped parser <* egroup) <|> (mconcat <$> manyTill parser egroup)
-
-braced' :: PandocMonad m => LP m Tok -> Int -> LP m [Tok]
-braced' getTok n =
-  handleEgroup <|> handleBgroup <|> handleOther
-  where handleEgroup = do
-          t <- egroup
-          if n == 1
-             then return []
-             else (t:) <$> braced' getTok (n - 1)
-        handleBgroup = do
-          t <- bgroup
-          (t:) <$> braced' getTok (n + 1)
-        handleOther = do
-          t <- getTok
-          (t:) <$> braced' getTok n
-
-braced :: PandocMonad m => LP m [Tok]
-braced = bgroup *> braced' anyTok 1
-
--- URLs require special handling, because they can contain %
--- characters.  So we retonenize comments as we go...
-bracedUrl :: PandocMonad m => LP m [Tok]
-bracedUrl = bgroup *> braced' (retokenizeComment >> anyTok) 1
-
-bracketed :: PandocMonad m => Monoid a => LP m a -> LP m a
-bracketed parser = try $ do
-  symbol '['
-  mconcat <$> manyTill parser (symbol ']')
-
-parenWrapped :: PandocMonad m => Monoid a => LP m a -> LP m a
-parenWrapped parser = try $ do
-  symbol '('
-  mconcat <$> manyTill parser (symbol ')')
-
-dimenarg :: PandocMonad m => LP m Text
-dimenarg = try $ do
-  ch  <- option False $ True <$ symbol '='
-  Tok _ _ s <- satisfyTok isWordTok
-  guard $ T.take 2 (T.reverse s) `elem`
-           ["pt","pc","in","bp","cm","mm","dd","cc","sp"]
-  let num = T.take (T.length s - 2) s
-  guard $ T.length num > 0
-  guard $ T.all isDigit num
-  return $ T.pack ['=' | ch] <> s
-
 -- inline elements:
 
 word :: PandocMonad m => LP m Inlines
@@ -689,13 +191,6 @@ regularSymbol = (str . T.unpack . untoken) <$> satisfyTok isRegularSymbol
         isRegularSymbol _                = False
         isSpecial c = c `Set.member` specialChars
 
-specialChars :: Set.Set Char
-specialChars = Set.fromList "#$%&~_^\\{}"
-
-isWordTok :: Tok -> Bool
-isWordTok (Tok _ Word _) = True
-isWordTok _              = False
-
 inlineGroup :: PandocMonad m => LP m Inlines
 inlineGroup = do
   ils <- grouped inline
@@ -1396,9 +891,6 @@ breve 'U' = "Ŭ"
 breve 'u' = "ŭ"
 breve c   = [c]
 
-toksToString :: [Tok] -> String
-toksToString = T.unpack . untokenize
-
 mathDisplay :: String -> Inlines
 mathDisplay = displayMath . trim
 
@@ -1562,19 +1054,6 @@ tok = try $ spaces >> grouped inline <|> inlineCommand' <|> singleChar'
           Tok _ _ t <- singleChar
           return (str (T.unpack t))
 
-singleChar :: PandocMonad m => LP m Tok
-singleChar = try $ do
-  Tok pos toktype t <- satisfyTok (tokTypeIn [Word, Symbol])
-  guard $ not $ toktype == Symbol &&
-                T.any (`Set.member` specialChars) t
-  if T.length t > 1
-     then do
-       let (t1, t2) = (T.take 1 t, T.drop 1 t)
-       inp <- getInput
-       setInput $ Tok (incSourceColumn pos 1) toktype t2 : inp
-       return $ Tok pos toktype t1
-     else return $ Tok pos toktype t
-
 opt :: PandocMonad m => LP m Inlines
 opt = bracketed inline <|> (str . T.unpack <$> rawopt)
 
@@ -1611,20 +1090,6 @@ overlayTok =
                     Tok _ Symbol c     -> c `elem` ["-","+","@","|",":",","]
                     _                  -> False)
 
-ignore :: (Monoid a, PandocMonad m) => String -> ParserT s u m a
-ignore raw = do
-  pos <- getPosition
-  report $ SkippedContent raw pos
-  return mempty
-
-withRaw :: PandocMonad m => LP m a -> LP m (a, [Tok])
-withRaw parser = do
-  inp <- getInput
-  result <- parser
-  nxt <- option (Tok (initialPos "source") Word "") (lookAhead anyTok)
-  let raw = takeWhile (/= nxt) inp
-  return (result, raw)
-
 inBrackets :: Inlines -> Inlines
 inBrackets x = str "[" <> x <> str "]"
 
@@ -1634,17 +1099,6 @@ unescapeURL ('\\':x:xs) | isEscapable x = x:unescapeURL xs
 unescapeURL (x:xs) = x:unescapeURL xs
 unescapeURL [] = ""
 
--- For handling URLs, which allow literal % characters...
-retokenizeComment :: PandocMonad m => LP m ()
-retokenizeComment = (do
-  Tok pos Comment txt <- satisfyTok isCommentTok
-  let updPos (Tok pos' toktype' txt') =
-        Tok (incSourceColumn (incSourceLine pos' (sourceLine pos - 1))
-             (sourceColumn pos)) toktype' txt'
-  let newtoks = map updPos $ tokenize (sourceName pos) $ T.tail txt
-  getInput >>= setInput . ((Tok pos Symbol "%" : newtoks) ++))
-    <|> return ()
-
 mathEnvWith :: PandocMonad m
             => (Inlines -> a) -> Maybe Text -> Text -> LP m a
 mathEnvWith f innerEnv name = f . mathDisplay . inner <$> mathEnv name
@@ -2364,9 +1818,6 @@ isArgTok :: Tok -> Bool
 isArgTok (Tok _ (Arg _) _) = True
 isArgTok _                 = False
 
-bracedOrToken :: PandocMonad m => LP m [Tok]
-bracedOrToken = braced <|> ((:[]) <$> (anyControlSeq <|> singleChar))
-
 newcommand :: PandocMonad m => LP m (Text, Macro)
 newcommand = do
   pos <- getPosition
@@ -2417,11 +1868,6 @@ newenvironment = do
   return (name, Macro ExpandWhenUsed argspecs optarg startcontents,
              Macro ExpandWhenUsed [] Nothing endcontents)
 
-bracketedToks :: PandocMonad m => LP m [Tok]
-bracketedToks = do
-  symbol '['
-  mconcat <$> manyTill (braced <|> (:[]) <$> anyTok) (symbol ']')
-
 bracketedNum :: PandocMonad m => LP m Int
 bracketedNum = do
   ds <- untokenize <$> bracketedToks
diff --git a/src/Text/Pandoc/Readers/LaTeX/Parsing.hs b/src/Text/Pandoc/Readers/LaTeX/Parsing.hs
new file mode 100644
index 000000000..81d83dab2
--- /dev/null
+++ b/src/Text/Pandoc/Readers/LaTeX/Parsing.hs
@@ -0,0 +1,663 @@
+{-# LANGUAGE NoImplicitPrelude #-}
+{-# LANGUAGE FlexibleInstances     #-}
+{-# LANGUAGE MultiParamTypeClasses #-}
+{-# LANGUAGE OverloadedStrings     #-}
+{-# LANGUAGE ScopedTypeVariables   #-}
+{-
+Copyright (C) 2006-2018 John MacFarlane <jgm@berkeley.edu>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+-}
+
+{- |
+   Module      : Text.Pandoc.Readers.LaTeX.Parsing
+   Copyright   : Copyright (C) 2006-2018 John MacFarlane
+   License     : GNU GPL, version 2 or above
+
+   Maintainer  : John MacFarlane <jgm@berkeley.edu>
+   Stability   : alpha
+   Portability : portable
+
+General parsing types and functions for LaTeX.
+-}
+module Text.Pandoc.Readers.LaTeX.Parsing
+  ( DottedNum(..)
+  , renderDottedNum
+  , incrementDottedNum
+  , LaTeXState(..)
+  , defaultLaTeXState
+  , LP
+  , withVerbatimMode
+  , rawLaTeXParser
+  , applyMacros
+  , tokenize
+  , untokenize
+  , untoken
+  , totoks
+  , toksToString
+  , satisfyTok
+  , doMacros
+  , setpos
+  , anyControlSeq
+  , anySymbol
+  , isWordTok
+  , isNewlineTok
+  , spaces
+  , spaces1
+  , tokTypeIn
+  , controlSeq
+  , symbol
+  , symbolIn
+  , sp
+  , whitespace
+  , newlineTok
+  , comment
+  , anyTok
+  , singleChar
+  , specialChars
+  , endline
+  , blankline
+  , primEscape
+  , bgroup
+  , egroup
+  , grouped
+  , braced
+  , braced'
+  , bracedUrl
+  , bracedOrToken
+  , bracketed
+  , bracketedToks
+  , parenWrapped
+  , dimenarg
+  , ignore
+  , withRaw
+  ) where
+
+import Prelude
+import Control.Applicative (many, (<|>))
+import Control.Monad
+import Control.Monad.Except (throwError)
+import Control.Monad.Trans (lift)
+import Data.Char (chr, isAlphaNum, isDigit, isLetter, ord)
+import Data.Default
+import Data.List (intercalate)
+import qualified Data.Map as M
+import qualified Data.Set as Set
+import Data.Text (Text)
+import qualified Data.Text as T
+import Text.Pandoc.Builder
+import Text.Pandoc.Class (PandocMonad, report)
+import Text.Pandoc.Error (PandocError (PandocMacroLoop))
+import Text.Pandoc.Logging
+import Text.Pandoc.Options
+import Text.Pandoc.Parsing hiding (blankline, many, mathDisplay, mathInline,
+                            optional, space, spaces, withRaw, (<|>))
+import Text.Pandoc.Readers.LaTeX.Types (ExpansionPoint (..), Macro (..),
+                                        ArgSpec (..), Tok (..), TokType (..))
+import Text.Pandoc.Shared
+import Text.Parsec.Pos
+
+newtype DottedNum = DottedNum [Int]
+  deriving (Show)
+
+renderDottedNum :: DottedNum -> String
+renderDottedNum (DottedNum xs) =
+  intercalate "." (map show xs)
+
+incrementDottedNum :: Int -> DottedNum -> DottedNum
+incrementDottedNum level (DottedNum ns) = DottedNum $
+  case reverse (take level (ns ++ repeat 0)) of
+       (x:xs) -> reverse (x+1 : xs)
+       []     -> []  -- shouldn't happen
+
+data LaTeXState = LaTeXState{ sOptions       :: ReaderOptions
+                            , sMeta          :: Meta
+                            , sQuoteContext  :: QuoteContext
+                            , sMacros        :: M.Map Text Macro
+                            , sContainers    :: [String]
+                            , sHeaders       :: M.Map Inlines String
+                            , sLogMessages   :: [LogMessage]
+                            , sIdentifiers   :: Set.Set String
+                            , sVerbatimMode  :: Bool
+                            , sCaption       :: (Maybe Inlines, Maybe String)
+                            , sInListItem    :: Bool
+                            , sInTableCell   :: Bool
+                            , sLastHeaderNum :: DottedNum
+                            , sLastFigureNum :: DottedNum
+                            , sLabels        :: M.Map String [Inline]
+                            , sHasChapters   :: Bool
+                            , sToggles       :: M.Map String Bool
+                            }
+     deriving Show
+
+defaultLaTeXState :: LaTeXState
+defaultLaTeXState = LaTeXState{ sOptions       = def
+                              , sMeta          = nullMeta
+                              , sQuoteContext  = NoQuote
+                              , sMacros        = M.empty
+                              , sContainers    = []
+                              , sHeaders       = M.empty
+                              , sLogMessages   = []
+                              , sIdentifiers   = Set.empty
+                              , sVerbatimMode  = False
+                              , sCaption       = (Nothing, Nothing)
+                              , sInListItem    = False
+                              , sInTableCell   = False
+                              , sLastHeaderNum = DottedNum []
+                              , sLastFigureNum = DottedNum []
+                              , sLabels        = M.empty
+                              , sHasChapters   = False
+                              , sToggles       = M.empty
+                              }
+
+instance PandocMonad m => HasQuoteContext LaTeXState m where
+  getQuoteContext = sQuoteContext <$> getState
+  withQuoteContext context parser = do
+    oldState <- getState
+    let oldQuoteContext = sQuoteContext oldState
+    setState oldState { sQuoteContext = context }
+    result <- parser
+    newState <- getState
+    setState newState { sQuoteContext = oldQuoteContext }
+    return result
+
+instance HasLogMessages LaTeXState where
+  addLogMessage msg st = st{ sLogMessages = msg : sLogMessages st }
+  getLogMessages st = reverse $ sLogMessages st
+
+instance HasIdentifierList LaTeXState where
+  extractIdentifierList     = sIdentifiers
+  updateIdentifierList f st = st{ sIdentifiers = f $ sIdentifiers st }
+
+instance HasIncludeFiles LaTeXState where
+  getIncludeFiles = sContainers
+  addIncludeFile f s = s{ sContainers = f : sContainers s }
+  dropLatestIncludeFile s = s { sContainers = drop 1 $ sContainers s }
+
+instance HasHeaderMap LaTeXState where
+  extractHeaderMap     = sHeaders
+  updateHeaderMap f st = st{ sHeaders = f $ sHeaders st }
+
+instance HasMacros LaTeXState where
+  extractMacros  st  = sMacros st
+  updateMacros f st  = st{ sMacros = f (sMacros st) }
+
+instance HasReaderOptions LaTeXState where
+  extractReaderOptions = sOptions
+
+instance HasMeta LaTeXState where
+  setMeta field val st =
+    st{ sMeta = setMeta field val $ sMeta st }
+  deleteMeta field st =
+    st{ sMeta = deleteMeta field $ sMeta st }
+
+instance Default LaTeXState where
+  def = defaultLaTeXState
+
+type LP m = ParserT [Tok] LaTeXState m
+
+withVerbatimMode :: PandocMonad m => LP m a -> LP m a
+withVerbatimMode parser = do
+  updateState $ \st -> st{ sVerbatimMode = True }
+  result <- parser
+  updateState $ \st -> st{ sVerbatimMode = False }
+  return result
+
+rawLaTeXParser :: (PandocMonad m, HasMacros s, HasReaderOptions s)
+               => Bool -> LP m a -> LP m a -> ParserT String s m (a, String)
+rawLaTeXParser retokenize parser valParser = do
+  inp <- getInput
+  let toks = tokenize "source" $ T.pack inp
+  pstate <- getState
+  let lstate = def{ sOptions = extractReaderOptions pstate }
+  let lstate' = lstate { sMacros = extractMacros pstate }
+  let rawparser = (,) <$> withRaw valParser <*> getState
+  res' <- lift $ runParserT (snd <$> withRaw parser) lstate "chunk" toks
+  case res' of
+       Left _    -> mzero
+       Right toks' -> do
+         res <- lift $ runParserT (do when retokenize $ do
+                                        -- retokenize, applying macros
+                                        doMacros 0
+                                        ts <- many (satisfyTok (const True))
+                                        setInput ts
+                                      rawparser)
+                        lstate' "chunk" toks'
+         case res of
+              Left _    -> mzero
+              Right ((val, raw), st) -> do
+                updateState (updateMacros (sMacros st <>))
+                _ <- takeP (T.length (untokenize toks'))
+                return (val, T.unpack (untokenize raw))
+
+applyMacros :: (PandocMonad m, HasMacros s, HasReaderOptions s)
+            => String -> ParserT String s m String
+applyMacros s = (guardDisabled Ext_latex_macros >> return s) <|>
+   do let retokenize = doMacros 0 *>
+             (toksToString <$> many (satisfyTok (const True)))
+      pstate <- getState
+      let lstate = def{ sOptions = extractReaderOptions pstate
+                      , sMacros  = extractMacros pstate }
+      res <- runParserT retokenize lstate "math" (tokenize "math" (T.pack s))
+      case res of
+           Left e   -> fail (show e)
+           Right s' -> return s'
+tokenize :: SourceName -> Text -> [Tok]
+tokenize sourcename = totoks (initialPos sourcename)
+
+totoks :: SourcePos -> Text -> [Tok]
+totoks pos t =
+  case T.uncons t of
+       Nothing        -> []
+       Just (c, rest)
+         | c == '\n' ->
+           Tok pos Newline "\n"
+           : totoks (setSourceColumn (incSourceLine pos 1) 1) rest
+         | isSpaceOrTab c ->
+           let (sps, rest') = T.span isSpaceOrTab t
+           in  Tok pos Spaces sps
+               : totoks (incSourceColumn pos (T.length sps))
+                 rest'
+         | isAlphaNum c ->
+           let (ws, rest') = T.span isAlphaNum t
+           in  Tok pos Word ws
+               : totoks (incSourceColumn pos (T.length ws)) rest'
+         | c == '%' ->
+           let (cs, rest') = T.break (== '\n') rest
+           in  Tok pos Comment ("%" <> cs)
+               : totoks (incSourceColumn pos (1 + T.length cs)) rest'
+         | c == '\\' ->
+           case T.uncons rest of
+                Nothing -> [Tok pos (CtrlSeq " ") "\\"]
+                Just (d, rest')
+                  | isLetterOrAt d ->
+                      -- \makeatletter is common in macro defs;
+                      -- ideally we should make tokenization sensitive
+                      -- to \makeatletter and \makeatother, but this is
+                      -- probably best for now
+                      let (ws, rest'') = T.span isLetterOrAt rest
+                          (ss, rest''') = T.span isSpaceOrTab rest''
+                      in  Tok pos (CtrlSeq ws) ("\\" <> ws <> ss)
+                          : totoks (incSourceColumn pos
+                               (1 + T.length ws + T.length ss)) rest'''
+                  | isSpaceOrTab d || d == '\n' ->
+                      let (w1, r1) = T.span isSpaceOrTab rest
+                          (w2, (w3, r3)) = case T.uncons r1 of
+                                          Just ('\n', r2)
+                                                  -> (T.pack "\n",
+                                                        T.span isSpaceOrTab r2)
+                                          _ -> (mempty, (mempty, r1))
+                          ws = "\\" <> w1 <> w2 <> w3
+                      in  case T.uncons r3 of
+                               Just ('\n', _) ->
+                                 Tok pos (CtrlSeq " ") ("\\" <> w1)
+                                 : totoks (incSourceColumn pos (T.length ws))
+                                   r1
+                               _ ->
+                                 Tok pos (CtrlSeq " ") ws
+                                 : totoks (incSourceColumn pos (T.length ws))
+                                   r3
+                  | otherwise  ->
+                      Tok pos (CtrlSeq (T.singleton d)) (T.pack [c,d])
+                      : totoks (incSourceColumn pos 2) rest'
+         | c == '#' ->
+           let (t1, t2) = T.span (\d -> d >= '0' && d <= '9') rest
+           in  case safeRead (T.unpack t1) of
+                    Just i ->
+                       Tok pos (Arg i) ("#" <> t1)
+                       : totoks (incSourceColumn pos (1 + T.length t1)) t2
+                    Nothing ->
+                       Tok pos Symbol "#"
+                       : totoks (incSourceColumn pos 1) t2
+         | c == '^' ->
+           case T.uncons rest of
+                Just ('^', rest') ->
+                  case T.uncons rest' of
+                       Just (d, rest'')
+                         | isLowerHex d ->
+                           case T.uncons rest'' of
+                                Just (e, rest''') | isLowerHex e ->
+                                  Tok pos Esc2 (T.pack ['^','^',d,e])
+                                  : totoks (incSourceColumn pos 4) rest'''
+                                _ ->
+                                  Tok pos Esc1 (T.pack ['^','^',d])
+                                  : totoks (incSourceColumn pos 3) rest''
+                         | d < '\128' ->
+                                  Tok pos Esc1 (T.pack ['^','^',d])
+                                  : totoks (incSourceColumn pos 3) rest''
+                       _ -> Tok pos Symbol "^" :
+                            Tok (incSourceColumn pos 1) Symbol "^" :
+                            totoks (incSourceColumn pos 2) rest'
+                _ -> Tok pos Symbol "^"
+                     : totoks (incSourceColumn pos 1) rest
+         | otherwise ->
+           Tok pos Symbol (T.singleton c) : totoks (incSourceColumn pos 1) rest
+
+isSpaceOrTab :: Char -> Bool
+isSpaceOrTab ' '  = True
+isSpaceOrTab '\t' = True
+isSpaceOrTab _    = False
+
+isLetterOrAt :: Char -> Bool
+isLetterOrAt '@' = True
+isLetterOrAt c   = isLetter c
+
+isLowerHex :: Char -> Bool
+isLowerHex x = x >= '0' && x <= '9' || x >= 'a' && x <= 'f'
+
+untokenize :: [Tok] -> Text
+untokenize = mconcat . map untoken
+
+untoken :: Tok -> Text
+untoken (Tok _ _ t) = t
+
+toksToString :: [Tok] -> String
+toksToString = T.unpack . untokenize
+
+satisfyTok :: PandocMonad m => (Tok -> Bool) -> LP m Tok
+satisfyTok f =
+  try $ do
+    res <- tokenPrim (T.unpack . untoken) updatePos matcher
+    doMacros 0 -- apply macros on remaining input stream
+    return res
+  where matcher t | f t       = Just t
+                  | otherwise = Nothing
+        updatePos :: SourcePos -> Tok -> [Tok] -> SourcePos
+        updatePos _spos _ (Tok pos _ _ : _) = pos
+        updatePos spos _ []                 = incSourceColumn spos 1
+
+doMacros :: PandocMonad m => Int -> LP m ()
+doMacros n = do
+  verbatimMode <- sVerbatimMode <$> getState
+  unless verbatimMode $ do
+    inp <- getInput
+    case inp of
+         Tok spos (CtrlSeq "begin") _ : Tok _ Symbol "{" :
+          Tok _ Word name : Tok _ Symbol "}" : ts
+            -> handleMacros spos name ts
+         Tok spos (CtrlSeq "end") _ : Tok _ Symbol "{" :
+          Tok _ Word name : Tok _ Symbol "}" : ts
+            -> handleMacros spos ("end" <> name) ts
+         Tok _ (CtrlSeq "expandafter") _ : t : ts
+            -> do setInput ts
+                  doMacros n
+                  getInput >>= setInput . combineTok t
+         Tok spos (CtrlSeq name) _ : ts
+            -> handleMacros spos name ts
+         _ -> return ()
+  where combineTok (Tok spos (CtrlSeq name) x) (Tok _ Word w : ts)
+          | T.all isLetterOrAt w =
+            Tok spos (CtrlSeq (name <> w)) (x1 <> w <> x2) : ts
+              where (x1, x2) = T.break isSpaceOrTab x
+        combineTok t ts = t:ts
+        handleMacros spos name ts = do
+                macros <- sMacros <$> getState
+                case M.lookup name macros of
+                     Nothing -> return ()
+                     Just (Macro expansionPoint argspecs optarg newtoks) -> do
+                       setInput ts
+                       let matchTok (Tok _ toktype txt) =
+                             satisfyTok (\(Tok _ toktype' txt') ->
+                                           toktype == toktype' &&
+                                           txt == txt')
+                       let matchPattern toks = try $ mapM_ matchTok toks
+                       let getargs argmap [] = return argmap
+                           getargs argmap (Pattern toks : rest) = try $ do
+                              matchPattern toks
+                              getargs argmap rest
+                           getargs argmap (ArgNum i : Pattern toks : rest) =
+                             try $ do
+                               x <- mconcat <$> manyTill
+                                     (braced <|> ((:[]) <$> anyTok))
+                                     (matchPattern toks)
+                               getargs (M.insert i x argmap) rest
+                           getargs argmap (ArgNum i : rest) = do
+                             x <- try $ spaces >> bracedOrToken
+                             getargs (M.insert i x argmap) rest
+                       args <- case optarg of
+                                    Nothing -> getargs M.empty argspecs
+                                    Just o  -> do
+                                       x <- option o bracketedToks
+                                       getargs (M.singleton 1 x) argspecs
+                       -- first boolean param is true if we're tokenizing
+                       -- an argument (in which case we don't want to
+                       -- expand #1 etc.)
+                       let addTok False (Tok _ (Arg i) _) acc =
+                              case M.lookup i args of
+                                   Nothing -> mzero
+                                   Just xs -> foldr (addTok True) acc xs
+                           -- see #4007
+                           addTok _ (Tok _ (CtrlSeq x) txt)
+                                  acc@(Tok _ Word _ : _)
+                             | not (T.null txt) &&
+                               isLetter (T.last txt) =
+                               Tok spos (CtrlSeq x) (txt <> " ") : acc
+                           addTok _ t acc = setpos spos t : acc
+                       ts' <- getInput
+                       setInput $ foldr (addTok False) ts' newtoks
+                       case expansionPoint of
+                            ExpandWhenUsed ->
+                              if n > 20  -- detect macro expansion loops
+                                 then throwError $ PandocMacroLoop (T.unpack name)
+                                 else doMacros (n + 1)
+                            ExpandWhenDefined -> return ()
+
+
+setpos :: SourcePos -> Tok -> Tok
+setpos spos (Tok _ tt txt) = Tok spos tt txt
+
+anyControlSeq :: PandocMonad m => LP m Tok
+anyControlSeq = satisfyTok isCtrlSeq
+
+isCtrlSeq :: Tok -> Bool
+isCtrlSeq (Tok _ (CtrlSeq _) _) = True
+isCtrlSeq _                     = False
+
+anySymbol :: PandocMonad m => LP m Tok
+anySymbol = satisfyTok isSymbolTok
+
+isSymbolTok :: Tok -> Bool
+isSymbolTok (Tok _ Symbol _) = True
+isSymbolTok _                = False
+
+isWordTok :: Tok -> Bool
+isWordTok (Tok _ Word _) = True
+isWordTok _              = False
+
+spaces :: PandocMonad m => LP m ()
+spaces = skipMany (satisfyTok (tokTypeIn [Comment, Spaces, Newline]))
+
+spaces1 :: PandocMonad m => LP m ()
+spaces1 = skipMany1 (satisfyTok (tokTypeIn [Comment, Spaces, Newline]))
+
+tokTypeIn :: [TokType] -> Tok -> Bool
+tokTypeIn toktypes (Tok _ tt _) = tt `elem` toktypes
+
+controlSeq :: PandocMonad m => Text -> LP m Tok
+controlSeq name = satisfyTok isNamed
+  where isNamed (Tok _ (CtrlSeq n) _) = n == name
+        isNamed _                     = False
+
+symbol :: PandocMonad m => Char -> LP m Tok
+symbol c = satisfyTok isc
+  where isc (Tok _ Symbol d) = case T.uncons d of
+                                    Just (c',_) -> c == c'
+                                    _           -> False
+        isc _ = False
+
+symbolIn :: PandocMonad m => [Char] -> LP m Tok
+symbolIn cs = satisfyTok isInCs
+  where isInCs (Tok _ Symbol d) = case T.uncons d of
+                                       Just (c,_) -> c `elem` cs
+                                       _          -> False
+        isInCs _ = False
+
+sp :: PandocMonad m => LP m ()
+sp = whitespace <|> endline
+
+whitespace :: PandocMonad m => LP m ()
+whitespace = () <$ satisfyTok isSpaceTok
+
+isSpaceTok :: Tok -> Bool
+isSpaceTok (Tok _ Spaces _) = True
+isSpaceTok _                = False
+
+newlineTok :: PandocMonad m => LP m ()
+newlineTok = () <$ satisfyTok isNewlineTok
+
+isNewlineTok :: Tok -> Bool
+isNewlineTok (Tok _ Newline _) = True
+isNewlineTok _                 = False
+
+comment :: PandocMonad m => LP m ()
+comment = () <$ satisfyTok isCommentTok
+
+isCommentTok :: Tok -> Bool
+isCommentTok (Tok _ Comment _) = True
+isCommentTok _                 = False
+
+anyTok :: PandocMonad m => LP m Tok
+anyTok = satisfyTok (const True)
+
+singleChar :: PandocMonad m => LP m Tok
+singleChar = try $ do
+  Tok pos toktype t <- satisfyTok (tokTypeIn [Word, Symbol])
+  guard $ not $ toktype == Symbol &&
+                T.any (`Set.member` specialChars) t
+  if T.length t > 1
+     then do
+       let (t1, t2) = (T.take 1 t, T.drop 1 t)
+       inp <- getInput
+       setInput $ Tok (incSourceColumn pos 1) toktype t2 : inp
+       return $ Tok pos toktype t1
+     else return $ Tok pos toktype t
+
+specialChars :: Set.Set Char
+specialChars = Set.fromList "#$%&~_^\\{}"
+
+endline :: PandocMonad m => LP m ()
+endline = try $ do
+  newlineTok
+  lookAhead anyTok
+  notFollowedBy blankline
+
+blankline :: PandocMonad m => LP m ()
+blankline = try $ skipMany whitespace *> newlineTok
+
+primEscape :: PandocMonad m => LP m Char
+primEscape = do
+  Tok _ toktype t <- satisfyTok (tokTypeIn [Esc1, Esc2])
+  case toktype of
+       Esc1 -> case T.uncons (T.drop 2 t) of
+                    Just (c, _)
+                      | c >= '\64' && c <= '\127' -> return (chr (ord c - 64))
+                      | otherwise                 -> return (chr (ord c + 64))
+                    Nothing -> fail "Empty content of Esc1"
+       Esc2 -> case safeRead ('0':'x':T.unpack (T.drop 2 t)) of
+                    Just x  -> return (chr x)
+                    Nothing -> fail $ "Could not read: " ++ T.unpack t
+       _    -> fail "Expected an Esc1 or Esc2 token" -- should not happen
+
+bgroup :: PandocMonad m => LP m Tok
+bgroup = try $ do
+  skipMany sp
+  symbol '{' <|> controlSeq "bgroup" <|> controlSeq "begingroup"
+
+egroup :: PandocMonad m => LP m Tok
+egroup = symbol '}' <|> controlSeq "egroup" <|> controlSeq "endgroup"
+
+grouped :: (PandocMonad m,  Monoid a) => LP m a -> LP m a
+grouped parser = try $ do
+  bgroup
+  -- first we check for an inner 'grouped', because
+  -- {{a,b}} should be parsed the same as {a,b}
+  try (grouped parser <* egroup) <|> (mconcat <$> manyTill parser egroup)
+
+braced' :: PandocMonad m => LP m Tok -> Int -> LP m [Tok]
+braced' getTok n =
+  handleEgroup <|> handleBgroup <|> handleOther
+  where handleEgroup = do
+          t <- egroup
+          if n == 1
+             then return []
+             else (t:) <$> braced' getTok (n - 1)
+        handleBgroup = do
+          t <- bgroup
+          (t:) <$> braced' getTok (n + 1)
+        handleOther = do
+          t <- getTok
+          (t:) <$> braced' getTok n
+
+braced :: PandocMonad m => LP m [Tok]
+braced = bgroup *> braced' anyTok 1
+
+-- URLs require special handling, because they can contain %
+-- characters.  So we retonenize comments as we go...
+bracedUrl :: PandocMonad m => LP m [Tok]
+bracedUrl = bgroup *> braced' (retokenizeComment >> anyTok) 1
+
+-- For handling URLs, which allow literal % characters...
+retokenizeComment :: PandocMonad m => LP m ()
+retokenizeComment = (do
+  Tok pos Comment txt <- satisfyTok isCommentTok
+  let updPos (Tok pos' toktype' txt') =
+        Tok (incSourceColumn (incSourceLine pos' (sourceLine pos - 1))
+             (sourceColumn pos)) toktype' txt'
+  let newtoks = map updPos $ tokenize (sourceName pos) $ T.tail txt
+  getInput >>= setInput . ((Tok pos Symbol "%" : newtoks) ++))
+    <|> return ()
+
+bracedOrToken :: PandocMonad m => LP m [Tok]
+bracedOrToken = braced <|> ((:[]) <$> (anyControlSeq <|> singleChar))
+
+bracketed :: PandocMonad m => Monoid a => LP m a -> LP m a
+bracketed parser = try $ do
+  symbol '['
+  mconcat <$> manyTill parser (symbol ']')
+
+bracketedToks :: PandocMonad m => LP m [Tok]
+bracketedToks = do
+  symbol '['
+  mconcat <$> manyTill (braced <|> (:[]) <$> anyTok) (symbol ']')
+
+parenWrapped :: PandocMonad m => Monoid a => LP m a -> LP m a
+parenWrapped parser = try $ do
+  symbol '('
+  mconcat <$> manyTill parser (symbol ')')
+
+dimenarg :: PandocMonad m => LP m Text
+dimenarg = try $ do
+  ch  <- option False $ True <$ symbol '='
+  Tok _ _ s <- satisfyTok isWordTok
+  guard $ T.take 2 (T.reverse s) `elem`
+           ["pt","pc","in","bp","cm","mm","dd","cc","sp"]
+  let num = T.take (T.length s - 2) s
+  guard $ T.length num > 0
+  guard $ T.all isDigit num
+  return $ T.pack ['=' | ch] <> s
+
+ignore :: (Monoid a, PandocMonad m) => String -> ParserT s u m a
+ignore raw = do
+  pos <- getPosition
+  report $ SkippedContent raw pos
+  return mempty
+
+withRaw :: PandocMonad m => LP m a -> LP m (a, [Tok])
+withRaw parser = do
+  inp <- getInput
+  result <- parser
+  nxt <- option (Tok (initialPos "source") Word "") (lookAhead anyTok)
+  let raw = takeWhile (/= nxt) inp
+  return (result, raw)
author	John MacFarlane <jgm@berkeley.edu>	2018-09-28 10:33:32 -0700
committer	John MacFarlane <jgm@berkeley.edu>	2018-09-28 10:33:32 -0700
commit	9dac993835603a1a92136f07e80a6561b1598754 (patch)
tree	d345721f53c39d84adf76c87cef4ab531aff5282 /src
parent	4f9ab7e03268e576d86e697f7110869434d08557 (diff)
download	pandoc-9dac993835603a1a92136f07e80a6561b1598754.tar.gz