Merge branch 'master' of https://github.com/jgm/pandoc

author: Igor Pashev <pashev.igor@gmail.com> 2021-07-17 18:10:34 +0200
committer: Igor Pashev <pashev.igor@gmail.com> 2021-07-17 18:46:16 +0200
commit: 48459559a13a20083fc9b31eb523b8ea2bf0a63f (patch)
tree: 1c04e75709457403110a6f8c5c90099f22369de3 /src/Text/Pandoc/Readers/LaTeX
parent: 0c39509d9b6a58958228cebf5d643598e5c98950 (diff)
parent: 46099e79defe662e541b12548200caf29063c1c6 (diff)
download: pandoc-48459559a13a20083fc9b31eb523b8ea2bf0a63f.tar.gz
9 files changed, 1973 insertions, 252 deletions
diff --git a/src/Text/Pandoc/Readers/LaTeX/Citation.hs b/src/Text/Pandoc/Readers/LaTeX/Citation.hs
new file mode 100644
index 000000000..af97125c6
--- /dev/null
+++ b/src/Text/Pandoc/Readers/LaTeX/Citation.hs
@@ -0,0 +1,210 @@
+{-# LANGUAGE ScopedTypeVariables #-}
+{-# LANGUAGE OverloadedStrings #-}
+module Text.Pandoc.Readers.LaTeX.Citation
+  ( citationCommands
+  , cites
+  )
+where
+
+import Text.Pandoc.Class
+import Text.Pandoc.Readers.LaTeX.Parsing
+import Text.Pandoc.Builder as B
+import qualified Data.Map as M
+import Data.Text (Text)
+import Control.Applicative ((<|>), optional, many)
+import Control.Monad (mzero)
+import Control.Monad.Trans (lift)
+import Control.Monad.Except (throwError)
+import Text.Pandoc.Error (PandocError(PandocParsecError))
+import Text.Pandoc.Parsing hiding (blankline, many, mathDisplay, mathInline,
+                            optional, space, spaces, withRaw, (<|>))
+
+citationCommands :: PandocMonad m => LP m Inlines -> M.Map Text (LP m Inlines)
+citationCommands inline =
+  let citation = citationWith inline
+      tok = spaces *> grouped inline
+   in M.fromList
+  [ ("cite", citation "cite" NormalCitation False)
+  , ("Cite", citation "Cite" NormalCitation False)
+  , ("citep", citation "citep" NormalCitation False)
+  , ("citep*", citation "citep*" NormalCitation False)
+  , ("citeal", citation "citeal" NormalCitation False)
+  , ("citealp", citation "citealp" NormalCitation False)
+  , ("citealp*", citation "citealp*" NormalCitation False)
+  , ("autocite", citation "autocite" NormalCitation False)
+  , ("smartcite", citation "smartcite" NormalCitation False)
+  , ("footcite", inNote <$> citation "footcite" NormalCitation False)
+  , ("parencite", citation "parencite" NormalCitation False)
+  , ("supercite", citation "supercite" NormalCitation False)
+  , ("footcitetext", inNote <$> citation "footcitetext" NormalCitation False)
+  , ("citeyearpar", citation "citeyearpar" SuppressAuthor False)
+  , ("citeyear", citation "citeyear" SuppressAuthor False)
+  , ("autocite*", citation "autocite*" SuppressAuthor False)
+  , ("cite*", citation "cite*" SuppressAuthor False)
+  , ("parencite*", citation "parencite*" SuppressAuthor False)
+  , ("textcite", citation "textcite" AuthorInText False)
+  , ("citet", citation "citet" AuthorInText False)
+  , ("citet*", citation "citet*" AuthorInText False)
+  , ("citealt", citation "citealt" AuthorInText False)
+  , ("citealt*", citation "citealt*" AuthorInText False)
+  , ("textcites", citation "textcites" AuthorInText True)
+  , ("cites", citation "cites" NormalCitation True)
+  , ("autocites", citation "autocites" NormalCitation True)
+  , ("footcites", inNote <$> citation "footcites" NormalCitation True)
+  , ("parencites", citation "parencites" NormalCitation True)
+  , ("supercites", citation "supercites" NormalCitation True)
+  , ("footcitetexts", inNote <$> citation "footcitetexts" NormalCitation True)
+  , ("Autocite", citation "Autocite" NormalCitation False)
+  , ("Smartcite", citation "Smartcite" NormalCitation False)
+  , ("Footcite", inNote <$> citation "Footcite" NormalCitation False)
+  , ("Parencite", citation "Parencite" NormalCitation False)
+  , ("Supercite", citation "Supercite" NormalCitation False)
+  , ("Footcitetext", inNote <$> citation "Footcitetext" NormalCitation False)
+  , ("Citeyearpar", citation "Citeyearpar" SuppressAuthor False)
+  , ("Citeyear", citation "Citeyear" SuppressAuthor False)
+  , ("Autocite*", citation "Autocite*" SuppressAuthor False)
+  , ("Cite*", citation "Cite*" SuppressAuthor False)
+  , ("Parencite*", citation "Parencite*" SuppressAuthor False)
+  , ("Textcite", citation "Textcite" AuthorInText False)
+  , ("Textcites", citation "Textcites" AuthorInText True)
+  , ("Cites", citation "Cites" NormalCitation True)
+  , ("Autocites", citation "Autocites" NormalCitation True)
+  , ("Footcites", inNote <$> citation "Footcites" NormalCitation True)
+  , ("Parencites", citation "Parencites" NormalCitation True)
+  , ("Supercites", citation "Supercites" NormalCitation True)
+  , ("Footcitetexts", inNote <$> citation "Footcitetexts" NormalCitation True)
+  , ("citetext", complexNatbibCitation inline NormalCitation)
+  , ("citeauthor", (try (tok *> sp *> controlSeq "citetext") *>
+                        complexNatbibCitation inline AuthorInText)
+                   <|> citation "citeauthor" AuthorInText False)
+  , ("nocite", mempty <$ (citation "nocite" NormalCitation False >>=
+                          addMeta "nocite"))
+  ]
+
+-- citations
+
+addPrefix :: [Inline] -> [Citation] -> [Citation]
+addPrefix p (k:ks) = k {citationPrefix = p ++ citationPrefix k} : ks
+addPrefix _ _      = []
+
+addSuffix :: [Inline] -> [Citation] -> [Citation]
+addSuffix s ks@(_:_) =
+  let k = last ks
+  in  init ks ++ [k {citationSuffix = citationSuffix k ++ s}]
+addSuffix _ _ = []
+
+simpleCiteArgs :: forall m . PandocMonad m => LP m Inlines -> LP m [Citation]
+simpleCiteArgs inline = try $ do
+  first  <- optionMaybe $ toList <$> opt
+  second <- optionMaybe $ toList <$> opt
+  keys <- try $ bgroup *> manyTill citationLabel egroup
+  let (pre, suf) = case (first  , second ) of
+        (Just s , Nothing) -> (mempty, s )
+        (Just s , Just t ) -> (s , t )
+        _                  -> (mempty, mempty)
+      conv k = Citation { citationId      = k
+                        , citationPrefix  = []
+                        , citationSuffix  = []
+                        , citationMode    = NormalCitation
+                        , citationHash    = 0
+                        , citationNoteNum = 0
+                        }
+  return $ addPrefix pre $ addSuffix suf $ map conv keys
+ where
+  opt :: PandocMonad m => LP m Inlines
+  opt = do
+    toks <- try (sp *> bracketedToks <* sp)
+    -- now parse the toks as inlines
+    st <- getState
+    parsed <- lift $
+      runParserT (mconcat <$> many inline) st "bracketed option" toks
+    case parsed of
+      Right result -> return result
+      Left e       -> throwError $ PandocParsecError (toSources toks) e
+
+
+
+citationLabel :: PandocMonad m => LP m Text
+citationLabel  = do
+  sp
+  untokenize <$>
+    (many1 (satisfyTok isWordTok <|> symbolIn bibtexKeyChar)
+          <* sp
+          <* optional (symbol ',')
+          <* sp)
+  where bibtexKeyChar = ".:;?!`'()/*@_+=-&[]" :: [Char]
+
+cites :: PandocMonad m
+      => LP m Inlines -> CitationMode -> Bool -> LP m [Citation]
+cites inline mode multi = try $ do
+  let paropt = parenWrapped inline
+  cits <- if multi
+             then do
+               multiprenote <- optionMaybe $ toList <$> paropt
+               multipostnote <- optionMaybe $ toList <$> paropt
+               let (pre, suf) = case (multiprenote, multipostnote) of
+                     (Just s , Nothing) -> (mempty, s)
+                     (Nothing , Just t) -> (mempty, t)
+                     (Just s , Just t ) -> (s, t)
+                     _                  -> (mempty, mempty)
+               tempCits <- many1 $ simpleCiteArgs inline
+               case tempCits of
+                 (k:ks) -> case ks of
+                             (_:_) -> return $ (addMprenote pre k : init ks) ++
+                                                 [addMpostnote suf (last ks)]
+                             _ -> return [addMprenote pre (addMpostnote suf k)]
+                 _ -> return [[]]
+             else count 1 $ simpleCiteArgs inline
+  let cs = concat cits
+  return $ case mode of
+        AuthorInText -> case cs of
+                             (c:rest) -> c {citationMode = mode} : rest
+                             []       -> []
+        _            -> map (\a -> a {citationMode = mode}) cs
+  where mprenote (k:ks) = (k:ks) ++ [Space]
+        mprenote _ = mempty
+        mpostnote (k:ks) = [Str ",", Space] ++ (k:ks)
+        mpostnote _ = mempty
+        addMprenote mpn (k:ks) =
+          let mpnfinal = case citationPrefix k of
+                           (_:_) -> mprenote mpn
+                           _ -> mpn
+          in addPrefix mpnfinal (k:ks)
+        addMprenote _ _ = []
+        addMpostnote = addSuffix . mpostnote
+
+citationWith :: PandocMonad m
+             => LP m Inlines -> Text -> CitationMode -> Bool -> LP m Inlines
+citationWith inline name mode multi = do
+  (c,raw) <- withRaw $ cites inline mode multi
+  return $ cite c (rawInline "latex" $ "\\" <> name <> untokenize raw)
+
+handleCitationPart :: Inlines -> [Citation]
+handleCitationPart ils =
+  let isCite Cite{} = True
+      isCite _      = False
+      (pref, rest) = break isCite (toList ils)
+  in case rest of
+          (Cite cs _:suff) -> addPrefix pref $ addSuffix suff cs
+          _                -> []
+
+complexNatbibCitation :: PandocMonad m
+                      => LP m Inlines -> CitationMode -> LP m Inlines
+complexNatbibCitation inline mode = try $ do
+  (cs, raw) <-
+    withRaw $ concat <$> do
+      bgroup
+      items <- mconcat <$>
+                many1 (notFollowedBy (symbol ';') >> inline)
+                  `sepBy1` symbol ';'
+      egroup
+      return $ map handleCitationPart items
+  case cs of
+       []       -> mzero
+       (c:cits) -> return $ cite (c{ citationMode = mode }:cits)
+                      (rawInline "latex" $ "\\citetext" <> untokenize raw)
+
+inNote :: Inlines -> Inlines
+inNote ils =
+  note $ para $ ils <> str "."
+
diff --git a/src/Text/Pandoc/Readers/LaTeX/Inline.hs b/src/Text/Pandoc/Readers/LaTeX/Inline.hs
new file mode 100644
index 000000000..7b8bca4af
--- /dev/null
+++ b/src/Text/Pandoc/Readers/LaTeX/Inline.hs
@@ -0,0 +1,397 @@
+{-# LANGUAGE OverloadedStrings     #-}
+{-# LANGUAGE ViewPatterns          #-}
+{- |
+   Module      : Text.Pandoc.Readers.LaTeX.Inline
+   Copyright   : Copyright (C) 2006-2021 John MacFarlane
+   License     : GNU GPL, version 2 or above
+
+   Maintainer  : John MacFarlane <jgm@berkeley.edu>
+   Stability   : alpha
+   Portability : portable
+-}
+module Text.Pandoc.Readers.LaTeX.Inline
+  ( acronymCommands
+  , verbCommands
+  , charCommands
+  , accentCommands
+  , nameCommands
+  , biblatexInlineCommands
+  , refCommands
+  , rawInlineOr
+  , listingsLanguage
+  )
+where
+
+import qualified Data.Map as M
+import Data.Text (Text)
+import qualified Data.Text as T
+import Text.Pandoc.Builder
+import Text.Pandoc.Shared (toRomanNumeral, safeRead)
+import Text.Pandoc.Readers.LaTeX.Types (Tok (..), TokType (..))
+import Control.Applicative (optional, (<|>))
+import Control.Monad (guard, mzero, mplus, unless)
+import Text.Pandoc.Class.PandocMonad (PandocMonad (..), translateTerm)
+import Text.Pandoc.Readers.LaTeX.Parsing
+import Text.Pandoc.Extensions (extensionEnabled, Extension(..))
+import Text.Pandoc.Parsing (getOption, updateState, getState, notFollowedBy,
+                            manyTill, getInput, setInput, incSourceColumn,
+                            option, many1, try)
+import Data.Char (isDigit)
+import Text.Pandoc.Highlighting (fromListingsLanguage,)
+import Data.Maybe (maybeToList, fromMaybe)
+import Text.Pandoc.Options (ReaderOptions(..))
+import qualified Data.Text.Normalize as Normalize
+import qualified Text.Pandoc.Translations as Translations
+
+rawInlineOr :: PandocMonad m => Text -> LP m Inlines -> LP m Inlines
+rawInlineOr name' fallback = do
+  parseRaw <- extensionEnabled Ext_raw_tex <$> getOption readerExtensions
+  if parseRaw
+     then rawInline "latex" <$> getRawCommand name' ("\\" <> name')
+     else fallback
+
+dolabel :: PandocMonad m => LP m Inlines
+dolabel = do
+  v <- braced
+  let refstr = untokenize v
+  updateState $ \st ->
+    st{ sLastLabel = Just refstr }
+  return $ spanWith (refstr,[],[("label", refstr)])
+    $ inBrackets $ str $ untokenize v
+
+doref :: PandocMonad m => Text -> LP m Inlines
+doref cls = do
+  v <- braced
+  let refstr = untokenize v
+  return $ linkWith ("",[],[ ("reference-type", cls)
+                           , ("reference", refstr)])
+                    ("#" <> refstr)
+                    ""
+                    (inBrackets $ str refstr)
+
+inBrackets :: Inlines -> Inlines
+inBrackets x = str "[" <> x <> str "]"
+
+doTerm :: PandocMonad m => Translations.Term -> LP m Inlines
+doTerm term = str <$> translateTerm term
+
+lit :: Text -> LP m Inlines
+lit = pure . str
+
+doverb :: PandocMonad m => LP m Inlines
+doverb = do
+  Tok _ Symbol t <- anySymbol
+  marker <- case T.uncons t of
+              Just (c, ts) | T.null ts -> return c
+              _            -> mzero
+  withVerbatimMode $
+    code . untokenize <$>
+      manyTill (notFollowedBy newlineTok >> verbTok marker) (symbol marker)
+
+verbTok :: PandocMonad m => Char -> LP m Tok
+verbTok stopchar = do
+  t@(Tok pos toktype txt) <- anyTok
+  case T.findIndex (== stopchar) txt of
+       Nothing -> return t
+       Just i  -> do
+         let (t1, t2) = T.splitAt i txt
+         inp <- getInput
+         setInput $ Tok (incSourceColumn pos i) Symbol (T.singleton stopchar)
+                  : totoks (incSourceColumn pos (i + 1)) (T.drop 1 t2) ++ inp
+         return $ Tok pos toktype t1
+
+listingsLanguage :: [(Text, Text)] -> Maybe Text
+listingsLanguage opts =
+  case lookup "language" opts of
+    Nothing  -> Nothing
+    Just l   -> fromListingsLanguage l `mplus` Just l
+
+dolstinline :: PandocMonad m => LP m Inlines
+dolstinline = do
+  options <- option [] keyvals
+  let classes = maybeToList $ listingsLanguage options
+  doinlinecode classes
+
+domintinline :: PandocMonad m => LP m Inlines
+domintinline = do
+  skipopts
+  cls <- untokenize <$> braced
+  doinlinecode [cls]
+
+doinlinecode :: PandocMonad m => [Text] -> LP m Inlines
+doinlinecode classes = do
+  Tok _ Symbol t <- anySymbol
+  marker <- case T.uncons t of
+              Just (c, ts) | T.null ts -> return c
+              _            -> mzero
+  let stopchar = if marker == '{' then '}' else marker
+  withVerbatimMode $
+    codeWith ("",classes,[]) . T.map nlToSpace . untokenize <$>
+      manyTill (verbTok stopchar) (symbol stopchar)
+
+nlToSpace :: Char -> Char
+nlToSpace '\n' = ' '
+nlToSpace x    = x
+
+romanNumeralUpper :: (PandocMonad m) => LP m Inlines
+romanNumeralUpper =
+  str . toRomanNumeral <$> romanNumeralArg
+
+romanNumeralLower :: (PandocMonad m) => LP m Inlines
+romanNumeralLower =
+  str . T.toLower . toRomanNumeral <$> romanNumeralArg
+
+romanNumeralArg :: (PandocMonad m) => LP m Int
+romanNumeralArg = spaces *> (parser <|> inBraces)
+  where
+    inBraces = do
+      symbol '{'
+      spaces
+      res <- parser
+      spaces
+      symbol '}'
+      return res
+    parser = do
+      s <- untokenize <$> many1 (satisfyTok isWordTok)
+      let (digits, rest) = T.span isDigit s
+      unless (T.null rest) $
+        Prelude.fail "Non-digits in argument to \\Rn or \\RN"
+      safeRead digits
+
+accentWith :: PandocMonad m
+           => LP m Inlines -> Char -> Maybe Char -> LP m Inlines
+accentWith tok combiningAccent fallBack = try $ do
+  ils <- tok
+  case toList ils of
+       (Str (T.uncons -> Just (x, xs)) : ys) -> return $ fromList $
+         -- try to normalize to the combined character:
+         Str (Normalize.normalize Normalize.NFC
+               (T.pack [x, combiningAccent]) <> xs) : ys
+       [Space] -> return $ str $ T.singleton
+                         $ fromMaybe combiningAccent fallBack
+       []      -> return $ str $ T.singleton
+                         $ fromMaybe combiningAccent fallBack
+       _       -> return ils
+
+
+verbCommands :: PandocMonad m => M.Map Text (LP m Inlines)
+verbCommands = M.fromList
+  [ ("verb", doverb)
+  , ("lstinline", dolstinline)
+  , ("mintinline", domintinline)
+  , ("Verb", doverb)
+  ]
+
+accentCommands :: PandocMonad m => LP m Inlines -> M.Map Text (LP m Inlines)
+accentCommands tok =
+  let accent = accentWith tok
+  in  M.fromList
+  [ ("aa", lit "å")
+  , ("AA", lit "Å")
+  , ("ss", lit "ß")
+  , ("o", lit "ø")
+  , ("O", lit "Ø")
+  , ("L", lit "Ł")
+  , ("l", lit "ł")
+  , ("ae", lit "æ")
+  , ("AE", lit "Æ")
+  , ("oe", lit "œ")
+  , ("OE", lit "Œ")
+  , ("pounds", lit "£")
+  , ("euro", lit "€")
+  , ("copyright", lit "©")
+  , ("textasciicircum", lit "^")
+  , ("textasciitilde", lit "~")
+  , ("H", accent '\779' Nothing) -- hungarumlaut
+  , ("`", accent '\768' (Just '`')) -- grave
+  , ("'", accent '\769' (Just '\'')) -- acute
+  , ("^", accent '\770' (Just '^')) -- circ
+  , ("~", accent '\771' (Just '~')) -- tilde
+  , ("\"", accent '\776' Nothing) -- umlaut
+  , (".", accent '\775' Nothing) -- dot
+  , ("=", accent '\772' Nothing) -- macron
+  , ("|", accent '\781' Nothing) -- vertical line above
+  , ("b", accent '\817' Nothing) -- macron below
+  , ("c", accent '\807' Nothing) -- cedilla
+  , ("G", accent '\783' Nothing) -- doublegrave
+  , ("h", accent '\777' Nothing) -- hookabove
+  , ("d", accent '\803' Nothing) -- dotbelow
+  , ("f", accent '\785' Nothing)  -- inverted breve
+  , ("r", accent '\778' Nothing)  -- ringabove
+  , ("t", accent '\865' Nothing)  -- double inverted breve
+  , ("U", accent '\782' Nothing)  -- double vertical line above
+  , ("v", accent '\780' Nothing) -- hacek
+  , ("u", accent '\774' Nothing) -- breve
+  , ("k", accent '\808' Nothing) -- ogonek
+  , ("textogonekcentered", accent '\808' Nothing) -- ogonek
+  , ("i", lit "ı")  -- dotless i
+  , ("j", lit "ȷ")  -- dotless j
+  , ("newtie", accent '\785' Nothing) -- inverted breve
+  , ("textcircled", accent '\8413' Nothing) -- combining circle
+  ]
+
+charCommands :: PandocMonad m => M.Map Text (LP m Inlines)
+charCommands = M.fromList
+  [ ("ldots", lit "…")
+  , ("vdots", lit "\8942")
+  , ("dots", lit "…")
+  , ("mdots", lit "…")
+  , ("sim", lit "~")
+  , ("sep", lit ",")
+  , ("P", lit "¶")
+  , ("S", lit "§")
+  , ("$", lit "$")
+  , ("%", lit "%")
+  , ("&", lit "&")
+  , ("#", lit "#")
+  , ("_", lit "_")
+  , ("{", lit "{")
+  , ("}", lit "}")
+  , ("qed", lit "\a0\x25FB")
+  , ("lq", return (str "‘"))
+  , ("rq", return (str "’"))
+  , ("textquoteleft", return (str "‘"))
+  , ("textquoteright", return (str "’"))
+  , ("textquotedblleft", return (str "“"))
+  , ("textquotedblright", return (str "”"))
+  , ("/", pure mempty) -- italic correction
+  , ("\\", linebreak <$ (do inTableCell <- sInTableCell <$> getState
+                            guard $ not inTableCell
+                            optional rawopt
+                            spaces))
+  , (",", lit "\8198")
+  , ("@", pure mempty)
+  , (" ", lit "\160")
+  , ("ps", pure $ str "PS." <> space)
+  , ("TeX", lit "TeX")
+  , ("LaTeX", lit "LaTeX")
+  , ("bar", lit "|")
+  , ("textless", lit "<")
+  , ("textgreater", lit ">")
+  , ("textbackslash", lit "\\")
+  , ("backslash", lit "\\")
+  , ("slash", lit "/")
+  -- fontawesome
+  , ("faCheck", lit "\10003")
+  , ("faClose", lit "\10007")
+  -- hyphenat
+  , ("bshyp", lit "\\\173")
+  , ("fshyp", lit "/\173")
+  , ("dothyp", lit ".\173")
+  , ("colonhyp", lit ":\173")
+  , ("hyp", lit "-")
+  ]
+
+biblatexInlineCommands :: PandocMonad m
+                       => LP m Inlines -> M.Map Text (LP m Inlines)
+biblatexInlineCommands tok = M.fromList
+  -- biblatex misc
+  [ ("RN", romanNumeralUpper)
+  , ("Rn", romanNumeralLower)
+  , ("mkbibquote", spanWith nullAttr . doubleQuoted <$> tok)
+  , ("mkbibemph", spanWith nullAttr . emph <$> tok)
+  , ("mkbibitalic", spanWith nullAttr . emph <$> tok)
+  , ("mkbibbold", spanWith nullAttr . strong <$> tok)
+  , ("mkbibparens",
+       spanWith nullAttr . (\x -> str "(" <> x <> str ")") <$> tok)
+  , ("mkbibbrackets",
+       spanWith nullAttr . (\x -> str "[" <> x <> str "]") <$> tok)
+  , ("autocap", spanWith nullAttr <$> tok)
+  , ("textnormal", spanWith ("",["nodecor"],[]) <$> tok)
+  , ("bibstring",
+       (\x -> spanWith ("",[],[("bibstring",x)]) (str x)) . untokenize
+         <$> braced)
+  , ("adddot", pure (str "."))
+  , ("adddotspace", pure (spanWith nullAttr (str "." <> space)))
+  , ("addabbrvspace", pure space)
+  , ("hyphen", pure (str "-"))
+  ]
+
+nameCommands :: PandocMonad m => M.Map Text (LP m Inlines)
+nameCommands = M.fromList
+  [ ("figurename", doTerm Translations.Figure)
+  , ("prefacename", doTerm Translations.Preface)
+  , ("refname", doTerm Translations.References)
+  , ("bibname", doTerm Translations.Bibliography)
+  , ("chaptername", doTerm Translations.Chapter)
+  , ("partname", doTerm Translations.Part)
+  , ("contentsname", doTerm Translations.Contents)
+  , ("listfigurename", doTerm Translations.ListOfFigures)
+  , ("listtablename", doTerm Translations.ListOfTables)
+  , ("indexname", doTerm Translations.Index)
+  , ("abstractname", doTerm Translations.Abstract)
+  , ("tablename", doTerm Translations.Table)
+  , ("enclname", doTerm Translations.Encl)
+  , ("ccname", doTerm Translations.Cc)
+  , ("headtoname", doTerm Translations.To)
+  , ("pagename", doTerm Translations.Page)
+  , ("seename", doTerm Translations.See)
+  , ("seealsoname", doTerm Translations.SeeAlso)
+  , ("proofname", doTerm Translations.Proof)
+  , ("glossaryname", doTerm Translations.Glossary)
+  , ("lstlistingname", doTerm Translations.Listing)
+  ]
+
+refCommands :: PandocMonad m => M.Map Text (LP m Inlines)
+refCommands = M.fromList
+  [ ("label", rawInlineOr "label" dolabel)
+  , ("ref", rawInlineOr "ref" $ doref "ref")
+  , ("cref", rawInlineOr "cref" $ doref "ref")       -- from cleveref.sty
+  , ("vref", rawInlineOr "vref" $ doref "ref+page")  -- from varioref.sty
+  , ("eqref", rawInlineOr "eqref" $ doref "eqref")   -- from amsmath.sty
+  ]
+
+acronymCommands :: PandocMonad m => M.Map Text (LP m Inlines)
+acronymCommands = M.fromList
+  -- glossaries package
+  [ ("gls", doAcronym "short")
+  , ("Gls", doAcronym "short")
+  , ("glsdesc", doAcronym "long")
+  , ("Glsdesc", doAcronym "long")
+  , ("GLSdesc", doAcronym "long")
+  , ("acrlong", doAcronym "long")
+  , ("Acrlong", doAcronym "long")
+  , ("acrfull", doAcronym "full")
+  , ("Acrfull", doAcronym "full")
+  , ("acrshort", doAcronym "abbrv")
+  , ("Acrshort", doAcronym "abbrv")
+  , ("glspl", doAcronymPlural "short")
+  , ("Glspl", doAcronymPlural "short")
+  , ("glsdescplural", doAcronymPlural "long")
+  , ("Glsdescplural", doAcronymPlural "long")
+  , ("GLSdescplural", doAcronymPlural "long")
+  -- acronyms package
+  , ("ac", doAcronym "short")
+  , ("acf", doAcronym "full")
+  , ("acs", doAcronym "abbrv")
+  , ("acl", doAcronym "long")
+  , ("acp", doAcronymPlural "short")
+  , ("acfp", doAcronymPlural "full")
+  , ("acsp", doAcronymPlural "abbrv")
+  , ("aclp", doAcronymPlural "long")
+  , ("Ac", doAcronym "short")
+  , ("Acf", doAcronym "full")
+  , ("Acs", doAcronym "abbrv")
+  , ("Acl", doAcronym "long")
+  , ("Acp", doAcronymPlural "short")
+  , ("Acfp", doAcronymPlural "full")
+  , ("Acsp", doAcronymPlural "abbrv")
+  , ("Aclp", doAcronymPlural "long")
+  ]
+
+doAcronym :: PandocMonad m => Text -> LP m Inlines
+doAcronym form = do
+  acro <- braced
+  return . mconcat $ [spanWith ("",[],[("acronym-label", untokenize acro),
+    ("acronym-form", "singular+" <> form)])
+    $ str $ untokenize acro]
+
+doAcronymPlural :: PandocMonad m => Text -> LP m Inlines
+doAcronymPlural form = do
+  acro <- braced
+  let plural = str "s"
+  return . mconcat $ [spanWith ("",[],[("acronym-label", untokenize acro),
+    ("acronym-form", "plural+" <> form)]) $
+   mconcat [str $ untokenize acro, plural]]
+
+
diff --git a/src/Text/Pandoc/Readers/LaTeX/Lang.hs b/src/Text/Pandoc/Readers/LaTeX/Lang.hs
index 814b2fe79..6a8327904 100644
--- a/src/Text/Pandoc/Readers/LaTeX/Lang.hs
+++ b/src/Text/Pandoc/Readers/LaTeX/Lang.hs
@@ -1,7 +1,7 @@
 {-# LANGUAGE OverloadedStrings #-}
 {- |
    Module      : Text.Pandoc.Readers.LaTeX.Lang
-   Copyright   : Copyright (C) 2018-2020 John MacFarlane
+   Copyright   : Copyright (C) 2018-2021 John MacFarlane
    License     : GNU GPL, version 2 or above
 
    Maintainer  : John MacFarlane <jgm@berkeley.edu>
@@ -12,144 +12,223 @@ Functions for parsing polyglossia and babel language specifiers to
 BCP47 'Lang'.
 -}
 module Text.Pandoc.Readers.LaTeX.Lang
-  ( polyglossiaLangToBCP47
+  ( setDefaultLanguage
+  , polyglossiaLangToBCP47
   , babelLangToBCP47
+  , enquoteCommands
+  , inlineLanguageCommands
   )
 where
 import qualified Data.Map as M
+import Data.Text (Text)
 import qualified Data.Text as T
-import Text.Pandoc.BCP47 (Lang(..))
+import Text.Pandoc.Shared (extractSpaces)
+import Text.Collate.Lang (Lang(..), renderLang)
+import Text.Pandoc.Class (PandocMonad(..), setTranslations)
+import Text.Pandoc.Readers.LaTeX.Parsing
+import Text.Pandoc.Parsing (updateState, option, getState, QuoteContext(..),
+                            withQuoteContext)
+import Text.Pandoc.Builder (Blocks, Inlines, setMeta, str, spanWith,
+                            singleQuoted, doubleQuoted)
+
+enquote :: PandocMonad m
+        => LP m Inlines
+        -> Bool -> Maybe Text -> LP m Inlines
+enquote tok starred mblang = do
+  skipopts
+  let lang = mblang >>= babelLangToBCP47
+  let langspan = case lang of
+                      Nothing -> id
+                      Just l  -> spanWith ("",[],[("lang", renderLang l)])
+  quoteContext <- sQuoteContext <$> getState
+  if starred || quoteContext == InDoubleQuote
+     then singleQuoted . langspan <$> withQuoteContext InSingleQuote tok
+     else doubleQuoted . langspan <$> withQuoteContext InDoubleQuote tok
+
+enquoteCommands :: PandocMonad m
+                => LP m Inlines -> M.Map Text (LP m Inlines)
+enquoteCommands tok = M.fromList
+  [ ("enquote*", enquote tok True Nothing)
+  , ("enquote", enquote tok False Nothing)
+  -- foreignquote is supposed to use native quote marks
+  , ("foreignquote*", braced >>= enquote tok True . Just . untokenize)
+  , ("foreignquote", braced >>= enquote tok False . Just . untokenize)
+  -- hypehnquote uses regular quotes
+  , ("hyphenquote*", braced >>= enquote tok True . Just . untokenize)
+  , ("hyphenquote", braced >>= enquote tok False . Just . untokenize)
+  ]
+
+foreignlanguage :: PandocMonad m => LP m Inlines -> LP m Inlines
+foreignlanguage tok = do
+  babelLang <- untokenize <$> braced
+  case babelLangToBCP47 babelLang of
+       Just lang -> spanWith ("", [], [("lang",  renderLang lang)]) <$> tok
+       _ -> tok
+
+inlineLanguageCommands :: PandocMonad m
+                       => LP m Inlines -> M.Map Text (LP m Inlines)
+inlineLanguageCommands tok =
+  M.fromList $
+    ("foreignlanguage", foreignlanguage tok) :
+    (mk <$> M.toList polyglossiaLangToBCP47)
+  where
+    mk (polyglossia, bcp47Func) =
+      ("text" <> polyglossia, inlineLanguage tok bcp47Func)
+
+inlineLanguage :: PandocMonad m
+               => LP m Inlines -> (Text -> Lang) -> LP m Inlines
+inlineLanguage tok bcp47Func = do
+  o <- option "" $ T.filter (\c -> c /= '[' && c /= ']')
+                <$> rawopt
+  let lang = renderLang $ bcp47Func o
+  extractSpaces (spanWith ("", [], [("lang", lang)])) <$> tok
+
+setDefaultLanguage :: PandocMonad m => LP m Blocks
+setDefaultLanguage = do
+  o <- option "" $ T.filter (\c -> c /= '[' && c /= ']')
+                <$> rawopt
+  polylang <- untokenize <$> braced
+  case M.lookup polylang polyglossiaLangToBCP47 of
+       Nothing -> return mempty -- TODO mzero? warning?
+       Just langFunc -> do
+         let l = langFunc o
+         setTranslations l
+         updateState $ setMeta "lang" $ str (renderLang l)
+         return mempty
 
 polyglossiaLangToBCP47 :: M.Map T.Text (T.Text -> Lang)
 polyglossiaLangToBCP47 = M.fromList
   [ ("arabic", \o -> case T.filter (/=' ') o of
-       "locale=algeria"    -> Lang "ar" "" "DZ" []
-       "locale=mashriq"    -> Lang "ar" "" "SY" []
-       "locale=libya"      -> Lang "ar" "" "LY" []
-       "locale=morocco"    -> Lang "ar" "" "MA" []
-       "locale=mauritania" -> Lang "ar" "" "MR" []
-       "locale=tunisia"    -> Lang "ar" "" "TN" []
-       _                   -> Lang "ar" "" "" [])
+       "locale=algeria"    -> Lang "ar" Nothing (Just "DZ") [] [] []
+       "locale=mashriq"    -> Lang "ar" Nothing (Just "SY") [] [] []
+       "locale=libya"      -> Lang "ar" Nothing (Just "LY") [] [] []
+       "locale=morocco"    -> Lang "ar" Nothing (Just "MA") [] [] []
+       "locale=mauritania" -> Lang "ar" Nothing (Just "MR") [] [] []
+       "locale=tunisia"    -> Lang "ar" Nothing (Just "TN") [] [] []
+       _                   -> Lang "ar" Nothing (Just "")   [] [] [])
   , ("german", \o -> case T.filter (/=' ') o of
-       "spelling=old" -> Lang "de" "" "DE" ["1901"]
+       "spelling=old" -> Lang "de" Nothing (Just "DE") ["1901"] [] []
        "variant=austrian,spelling=old"
-                       -> Lang "de" "" "AT" ["1901"]
-       "variant=austrian" -> Lang "de" "" "AT" []
+                       -> Lang "de" Nothing (Just "AT") ["1901"] [] []
+       "variant=austrian" -> Lang "de" Nothing (Just "AT") [] [] []
        "variant=swiss,spelling=old"
-                       -> Lang "de" "" "CH" ["1901"]
-       "variant=swiss" -> Lang "de" "" "CH" []
-       _ -> Lang "de" "" "" [])
-  , ("lsorbian", \_ -> Lang "dsb" "" "" [])
+                       -> Lang "de" Nothing (Just "CH") ["1901"] [] []
+       "variant=swiss" -> Lang "de" Nothing (Just "CH") [] [] []
+       _ -> Lang "de" Nothing Nothing [] [] [])
+  , ("lsorbian", \_ -> Lang "dsb" Nothing Nothing [] [] [])
   , ("greek", \o -> case T.filter (/=' ') o of
-       "variant=poly"    -> Lang "el" "" "polyton" []
-       "variant=ancient" -> Lang "grc" "" "" []
-       _                 -> Lang "el" "" "" [])
+       "variant=poly"    -> Lang "el" Nothing (Just "polyton") [] [] []
+       "variant=ancient" -> Lang "grc" Nothing Nothing [] [] []
+       _                 -> Lang "el" Nothing Nothing [] [] [])
   , ("english", \o -> case T.filter (/=' ') o of
-       "variant=australian" -> Lang "en" "" "AU" []
-       "variant=canadian"   -> Lang "en" "" "CA" []
-       "variant=british"    -> Lang "en" "" "GB" []
-       "variant=newzealand" -> Lang "en" "" "NZ" []
-       "variant=american"   -> Lang "en" "" "US" []
-       _                    -> Lang "en" "" "" [])
-  , ("usorbian", \_ -> Lang "hsb" "" "" [])
+       "variant=australian" -> Lang "en" Nothing (Just "AU") [] [] []
+       "variant=canadian"   -> Lang "en" Nothing (Just "CA") [] [] []
+       "variant=british"    -> Lang "en" Nothing (Just "GB") [] [] []
+       "variant=newzealand" -> Lang "en" Nothing (Just "NZ") [] [] []
+       "variant=american"   -> Lang "en" Nothing (Just "US") [] [] []
+       _                    -> Lang "en" Nothing (Just "")   [] [] [])
+  , ("usorbian", \_ -> Lang "hsb" Nothing Nothing [] [] [])
   , ("latin", \o -> case T.filter (/=' ') o of
-       "variant=classic" -> Lang "la" "" "" ["x-classic"]
-       _                 -> Lang "la" "" "" [])
-  , ("slovenian", \_ -> Lang "sl" "" "" [])
-  , ("serbianc", \_ -> Lang "sr" "cyrl" "" [])
-  , ("pinyin", \_ -> Lang "zh" "Latn" "" ["pinyin"])
-  , ("afrikaans", \_ -> Lang "af" "" "" [])
-  , ("amharic", \_ -> Lang "am" "" "" [])
-  , ("assamese", \_ -> Lang "as" "" "" [])
-  , ("asturian", \_ -> Lang "ast" "" "" [])
-  , ("bulgarian", \_ -> Lang "bg" "" "" [])
-  , ("bengali", \_ -> Lang "bn" "" "" [])
-  , ("tibetan", \_ -> Lang "bo" "" "" [])
-  , ("breton", \_ -> Lang "br" "" "" [])
-  , ("catalan", \_ -> Lang "ca" "" "" [])
-  , ("welsh", \_ -> Lang "cy" "" "" [])
-  , ("czech", \_ -> Lang "cs" "" "" [])
-  , ("coptic", \_ -> Lang "cop" "" "" [])
-  , ("danish", \_ -> Lang "da" "" "" [])
-  , ("divehi", \_ -> Lang "dv" "" "" [])
-  , ("esperanto", \_ -> Lang "eo" "" "" [])
-  , ("spanish", \_ -> Lang "es" "" "" [])
-  , ("estonian", \_ -> Lang "et" "" "" [])
-  , ("basque", \_ -> Lang "eu" "" "" [])
-  , ("farsi", \_ -> Lang "fa" "" "" [])
-  , ("finnish", \_ -> Lang "fi" "" "" [])
-  , ("french", \_ -> Lang "fr" "" "" [])
-  , ("friulan", \_ -> Lang "fur" "" "" [])
-  , ("irish", \_ -> Lang "ga" "" "" [])
-  , ("scottish", \_ -> Lang "gd" "" "" [])
-  , ("ethiopic", \_ -> Lang "gez" "" "" [])
-  , ("galician", \_ -> Lang "gl" "" "" [])
-  , ("hebrew", \_ -> Lang "he" "" "" [])
-  , ("hindi", \_ -> Lang "hi" "" "" [])
-  , ("croatian", \_ -> Lang "hr" "" "" [])
-  , ("magyar", \_ -> Lang "hu" "" "" [])
-  , ("armenian", \_ -> Lang "hy" "" "" [])
-  , ("interlingua", \_ -> Lang "ia" "" "" [])
-  , ("indonesian", \_ -> Lang "id" "" "" [])
-  , ("icelandic", \_ -> Lang "is" "" "" [])
-  , ("italian", \_ -> Lang "it" "" "" [])
-  , ("japanese", \_ -> Lang "jp" "" "" [])
-  , ("khmer", \_ -> Lang "km" "" "" [])
-  , ("kurmanji", \_ -> Lang "kmr" "" "" [])
-  , ("kannada", \_ -> Lang "kn" "" "" [])
-  , ("korean", \_ -> Lang "ko" "" "" [])
-  , ("lao", \_ -> Lang "lo" "" "" [])
-  , ("lithuanian", \_ -> Lang "lt" "" "" [])
-  , ("latvian", \_ -> Lang "lv" "" "" [])
-  , ("malayalam", \_ -> Lang "ml" "" "" [])
-  , ("mongolian", \_ -> Lang "mn" "" "" [])
-  , ("marathi", \_ -> Lang "mr" "" "" [])
-  , ("dutch", \_ -> Lang "nl" "" "" [])
-  , ("nynorsk", \_ -> Lang "nn" "" "" [])
-  , ("norsk", \_ -> Lang "no" "" "" [])
-  , ("nko", \_ -> Lang "nqo" "" "" [])
-  , ("occitan", \_ -> Lang "oc" "" "" [])
-  , ("panjabi", \_ -> Lang "pa" "" "" [])
-  , ("polish", \_ -> Lang "pl" "" "" [])
-  , ("piedmontese", \_ -> Lang "pms" "" "" [])
-  , ("portuguese", \_ -> Lang "pt" "" "" [])
-  , ("romansh", \_ -> Lang "rm" "" "" [])
-  , ("romanian", \_ -> Lang "ro" "" "" [])
-  , ("russian", \_ -> Lang "ru" "" "" [])
-  , ("sanskrit", \_ -> Lang "sa" "" "" [])
-  , ("samin", \_ -> Lang "se" "" "" [])
-  , ("slovak", \_ -> Lang "sk" "" "" [])
-  , ("albanian", \_ -> Lang "sq" "" "" [])
-  , ("serbian", \_ -> Lang "sr" "" "" [])
-  , ("swedish", \_ -> Lang "sv" "" "" [])
-  , ("syriac", \_ -> Lang "syr" "" "" [])
-  , ("tamil", \_ -> Lang "ta" "" "" [])
-  , ("telugu", \_ -> Lang "te" "" "" [])
-  , ("thai", \_ -> Lang "th" "" "" [])
-  , ("turkmen", \_ -> Lang "tk" "" "" [])
-  , ("turkish", \_ -> Lang "tr" "" "" [])
-  , ("ukrainian", \_ -> Lang "uk" "" "" [])
-  , ("urdu", \_ -> Lang "ur" "" "" [])
-  , ("vietnamese", \_ -> Lang "vi" "" "" [])
+       "variant=classic" -> Lang "la" Nothing Nothing ["x-classic"] [] []
+       _                 -> Lang "la" Nothing Nothing [] [] [])
+  , ("slovenian", \_ -> Lang "sl" Nothing Nothing [] [] [])
+  , ("serbianc", \_ -> Lang "sr" (Just "Cyrl") Nothing [] [] [])
+  , ("pinyin", \_ -> Lang "zh" (Just "Latn") Nothing ["pinyin"] [] [])
+  , ("afrikaans", \_ -> simpleLang "af")
+  , ("amharic", \_ -> simpleLang "am")
+  , ("assamese", \_ -> simpleLang "as")
+  , ("asturian", \_ -> simpleLang "ast")
+  , ("bulgarian", \_ -> simpleLang "bg")
+  , ("bengali", \_ -> simpleLang "bn")
+  , ("tibetan", \_ -> simpleLang "bo")
+  , ("breton", \_ -> simpleLang "br")
+  , ("catalan", \_ -> simpleLang "ca")
+  , ("welsh", \_ -> simpleLang "cy")
+  , ("czech", \_ -> simpleLang "cs")
+  , ("coptic", \_ -> simpleLang "cop")
+  , ("danish", \_ -> simpleLang "da")
+  , ("divehi", \_ -> simpleLang "dv")
+  , ("esperanto", \_ -> simpleLang "eo")
+  , ("spanish", \_ -> simpleLang "es")
+  , ("estonian", \_ -> simpleLang "et")
+  , ("basque", \_ -> simpleLang "eu")
+  , ("farsi", \_ -> simpleLang "fa")
+  , ("finnish", \_ -> simpleLang "fi")
+  , ("french", \_ -> simpleLang "fr")
+  , ("friulan", \_ -> simpleLang "fur")
+  , ("irish", \_ -> simpleLang "ga")
+  , ("scottish", \_ -> simpleLang "gd")
+  , ("ethiopic", \_ -> simpleLang "gez")
+  , ("galician", \_ -> simpleLang "gl")
+  , ("hebrew", \_ -> simpleLang "he")
+  , ("hindi", \_ -> simpleLang "hi")
+  , ("croatian", \_ -> simpleLang "hr")
+  , ("magyar", \_ -> simpleLang "hu")
+  , ("armenian", \_ -> simpleLang "hy")
+  , ("interlingua", \_ -> simpleLang "ia")
+  , ("indonesian", \_ -> simpleLang "id")
+  , ("icelandic", \_ -> simpleLang "is")
+  , ("italian", \_ -> simpleLang "it")
+  , ("japanese", \_ -> simpleLang "jp")
+  , ("khmer", \_ -> simpleLang "km")
+  , ("kurmanji", \_ -> simpleLang "kmr")
+  , ("kannada", \_ -> simpleLang "kn")
+  , ("korean", \_ -> simpleLang "ko")
+  , ("lao", \_ -> simpleLang "lo")
+  , ("lithuanian", \_ -> simpleLang "lt")
+  , ("latvian", \_ -> simpleLang "lv")
+  , ("malayalam", \_ -> simpleLang "ml")
+  , ("mongolian", \_ -> simpleLang "mn")
+  , ("marathi", \_ -> simpleLang "mr")
+  , ("dutch", \_ -> simpleLang "nl")
+  , ("nynorsk", \_ -> simpleLang "nn")
+  , ("norsk", \_ -> simpleLang "no")
+  , ("nko", \_ -> simpleLang "nqo")
+  , ("occitan", \_ -> simpleLang "oc")
+  , ("panjabi", \_ -> simpleLang "pa")
+  , ("polish", \_ -> simpleLang "pl")
+  , ("piedmontese", \_ -> simpleLang "pms")
+  , ("portuguese", \_ -> simpleLang "pt")
+  , ("romansh", \_ -> simpleLang "rm")
+  , ("romanian", \_ -> simpleLang "ro")
+  , ("russian", \_ -> simpleLang "ru")
+  , ("sanskrit", \_ -> simpleLang "sa")
+  , ("samin", \_ -> simpleLang "se")
+  , ("slovak", \_ -> simpleLang "sk")
+  , ("albanian", \_ -> simpleLang "sq")
+  , ("serbian", \_ -> simpleLang "sr")
+  , ("swedish", \_ -> simpleLang "sv")
+  , ("syriac", \_ -> simpleLang "syr")
+  , ("tamil", \_ -> simpleLang "ta")
+  , ("telugu", \_ -> simpleLang "te")
+  , ("thai", \_ -> simpleLang "th")
+  , ("turkmen", \_ -> simpleLang "tk")
+  , ("turkish", \_ -> simpleLang "tr")
+  , ("ukrainian", \_ -> simpleLang "uk")
+  , ("urdu", \_ -> simpleLang "ur")
+  , ("vietnamese", \_ -> simpleLang "vi")
   ]
 
+simpleLang :: Text -> Lang
+simpleLang l = Lang l Nothing Nothing [] [] []
+
 babelLangToBCP47 :: T.Text -> Maybe Lang
 babelLangToBCP47 s =
   case s of
-       "austrian" -> Just $ Lang "de" "" "AT" ["1901"]
-       "naustrian" -> Just $ Lang "de" "" "AT" []
-       "swissgerman" -> Just $ Lang "de" "" "CH" ["1901"]
-       "nswissgerman" -> Just $ Lang "de" "" "CH" []
-       "german" -> Just $ Lang "de" "" "DE" ["1901"]
-       "ngerman" -> Just $ Lang "de" "" "DE" []
-       "lowersorbian" -> Just $ Lang "dsb" "" "" []
-       "uppersorbian" -> Just $ Lang "hsb" "" "" []
-       "polutonikogreek" -> Just $ Lang "el" "" "" ["polyton"]
-       "slovene" -> Just $ Lang "sl" "" "" []
-       "australian" -> Just $ Lang "en" "" "AU" []
-       "canadian" -> Just $ Lang "en" "" "CA" []
-       "british" -> Just $ Lang "en" "" "GB" []
-       "newzealand" -> Just $ Lang "en" "" "NZ" []
-       "american" -> Just $ Lang "en" "" "US" []
-       "classiclatin" -> Just $ Lang "la" "" "" ["x-classic"]
+       "austrian" -> Just $ Lang "de" Nothing (Just "AT") ["1901"] [] []
+       "naustrian" -> Just $ Lang "de" Nothing (Just "AT") [] [] []
+       "swissgerman" -> Just $ Lang "de" Nothing (Just "CH") ["1901"] [] []
+       "nswissgerman" -> Just $ Lang "de" Nothing (Just "CH") [] [] []
+       "german" -> Just $ Lang "de" Nothing (Just "DE") ["1901"] [] []
+       "ngerman" -> Just $ Lang "de" Nothing (Just "DE") [] [] []
+       "lowersorbian" -> Just $ Lang "dsb" Nothing Nothing [] [] []
+       "uppersorbian" -> Just $ Lang "hsb" Nothing Nothing [] [] []
+       "polutonikogreek" -> Just $ Lang "el" Nothing Nothing ["polyton"] [] []
+       "slovene" -> Just $ simpleLang "sl"
+       "australian" -> Just $ Lang "en" Nothing (Just "AU") [] [] []
+       "canadian" -> Just $ Lang "en" Nothing (Just "CA") [] [] []
+       "british" -> Just $ Lang "en" Nothing (Just "GB") [] [] []
+       "newzealand" -> Just $ Lang "en" Nothing (Just "NZ") [] [] []
+       "american" -> Just $ Lang "en" Nothing (Just "US") [] [] []
+       "classiclatin" -> Just $ Lang "la" Nothing Nothing ["x-classic"] [] []
        _ -> ($ "") <$> M.lookup s polyglossiaLangToBCP47
diff --git a/src/Text/Pandoc/Readers/LaTeX/Macro.hs b/src/Text/Pandoc/Readers/LaTeX/Macro.hs
new file mode 100644
index 000000000..5495a8e74
--- /dev/null
+++ b/src/Text/Pandoc/Readers/LaTeX/Macro.hs
@@ -0,0 +1,184 @@
+{-# LANGUAGE OverloadedStrings #-}
+module Text.Pandoc.Readers.LaTeX.Macro
+  ( macroDef
+  )
+where
+import Text.Pandoc.Extensions (Extension(..))
+import Text.Pandoc.Logging (LogMessage(MacroAlreadyDefined))
+import Text.Pandoc.Readers.LaTeX.Parsing
+import Text.Pandoc.Readers.LaTeX.Types
+import Text.Pandoc.Class
+import Text.Pandoc.Shared (safeRead)
+import Text.Pandoc.Parsing hiding (blankline, mathDisplay, mathInline,
+                            optional, space, spaces, withRaw, (<|>))
+import Control.Applicative ((<|>), optional)
+import qualified Data.Map as M
+import Data.Text (Text)
+import qualified Data.Text as T
+
+macroDef :: (PandocMonad m, Monoid a) => (Text -> a) -> LP m a
+macroDef constructor = do
+    (_, s) <- withRaw (commandDef <|> environmentDef)
+    (constructor (untokenize s) <$
+      guardDisabled Ext_latex_macros)
+     <|> return mempty
+  where commandDef = do
+          nameMacroPairs <- newcommand <|> letmacro <|> defmacro <|> newif
+          guardDisabled Ext_latex_macros <|>
+           mapM_ (\(name, macro') ->
+                   updateState (\s -> s{ sMacros = M.insert name macro'
+                                          (sMacros s) })) nameMacroPairs
+        environmentDef = do
+          mbenv <- newenvironment
+          case mbenv of
+            Nothing -> return ()
+            Just (name, macro1, macro2) ->
+              guardDisabled Ext_latex_macros <|>
+                do updateState $ \s -> s{ sMacros =
+                    M.insert name macro1 (sMacros s) }
+                   updateState $ \s -> s{ sMacros =
+                    M.insert ("end" <> name) macro2 (sMacros s) }
+        -- @\newenvironment{envname}[n-args][default]{begin}{end}@
+        -- is equivalent to
+        -- @\newcommand{\envname}[n-args][default]{begin}@
+        -- @\newcommand{\endenvname}@
+
+letmacro :: PandocMonad m => LP m [(Text, Macro)]
+letmacro = do
+  controlSeq "let"
+  (name, contents) <- withVerbatimMode $ do
+    Tok _ (CtrlSeq name) _ <- anyControlSeq
+    optional $ symbol '='
+    spaces
+    -- we first parse in verbatim mode, and then expand macros,
+    -- because we don't want \let\foo\bar to turn into
+    -- \let\foo hello if we have previously \def\bar{hello}
+    contents <- bracedOrToken
+    return (name, contents)
+  contents' <- doMacros' 0 contents
+  return [(name, Macro ExpandWhenDefined [] Nothing contents')]
+
+defmacro :: PandocMonad m => LP m [(Text, Macro)]
+defmacro = do
+  -- we use withVerbatimMode, because macros are to be expanded
+  -- at point of use, not point of definition
+  controlSeq "def"
+  withVerbatimMode $ do
+    Tok _ (CtrlSeq name) _ <- anyControlSeq
+    argspecs <- many (argspecArg <|> argspecPattern)
+    contents <- bracedOrToken
+    return [(name, Macro ExpandWhenUsed argspecs Nothing contents)]
+
+-- \newif\iffoo' defines:
+-- \iffoo to be \iffalse
+-- \footrue to be a command that defines \iffoo to be \iftrue
+-- \foofalse to be a command that defines \iffoo to be \iffalse
+newif :: PandocMonad m => LP m [(Text, Macro)]
+newif = do
+  controlSeq "newif"
+  withVerbatimMode $ do
+    Tok pos (CtrlSeq name) _ <- anyControlSeq
+    -- \def\iffoo\iffalse
+    -- \def\footrue{\def\iffoo\iftrue}
+    -- \def\foofalse{\def\iffoo\iffalse}
+    let base = T.drop 2 name
+    return [ (name, Macro ExpandWhenUsed [] Nothing
+                    [Tok pos (CtrlSeq "iffalse") "\\iffalse"])
+           , (base <> "true",
+                   Macro ExpandWhenUsed [] Nothing
+                   [ Tok pos (CtrlSeq "def") "\\def"
+                   , Tok pos (CtrlSeq name) ("\\" <> name)
+                   , Tok pos (CtrlSeq "iftrue") "\\iftrue"
+                   ])
+           , (base <> "false",
+                   Macro ExpandWhenUsed [] Nothing
+                   [ Tok pos (CtrlSeq "def") "\\def"
+                   , Tok pos (CtrlSeq name) ("\\" <> name)
+                   , Tok pos (CtrlSeq "iffalse") "\\iffalse"
+                   ])
+           ]
+
+argspecArg :: PandocMonad m => LP m ArgSpec
+argspecArg = do
+  Tok _ (Arg i) _ <- satisfyTok isArgTok
+  return $ ArgNum i
+
+argspecPattern :: PandocMonad m => LP m ArgSpec
+argspecPattern =
+  Pattern <$> many1 (satisfyTok (\(Tok _ toktype' txt) ->
+                              (toktype' == Symbol || toktype' == Word) &&
+                              (txt /= "{" && txt /= "\\" && txt /= "}")))
+
+newcommand :: PandocMonad m => LP m [(Text, Macro)]
+newcommand = do
+  Tok pos (CtrlSeq mtype) _ <- controlSeq "newcommand" <|>
+                             controlSeq "renewcommand" <|>
+                             controlSeq "providecommand" <|>
+                             controlSeq "DeclareMathOperator" <|>
+                             controlSeq "DeclareRobustCommand"
+  withVerbatimMode $ do
+    Tok _ (CtrlSeq name) txt <- do
+      optional (symbol '*')
+      anyControlSeq <|>
+        (symbol '{' *> spaces *> anyControlSeq <* spaces <* symbol '}')
+    spaces
+    numargs <- option 0 $ try bracketedNum
+    let argspecs = map ArgNum [1..numargs]
+    spaces
+    optarg <- option Nothing $ Just <$> try bracketedToks
+    spaces
+    contents' <- bracedOrToken
+    let contents =
+         case mtype of
+              "DeclareMathOperator" ->
+                 Tok pos (CtrlSeq "mathop") "\\mathop"
+                 : Tok pos Symbol "{"
+                 : Tok pos (CtrlSeq "mathrm") "\\mathrm"
+                 : Tok pos Symbol "{"
+                 : (contents' ++
+                   [ Tok pos Symbol "}", Tok pos Symbol "}" ])
+              _                     -> contents'
+    macros <- sMacros <$> getState
+    case M.lookup name macros of
+        Just macro
+          | mtype == "newcommand" -> do
+              report $ MacroAlreadyDefined txt pos
+              return [(name, macro)]
+          | mtype == "providecommand" -> return [(name, macro)]
+        _ -> return [(name, Macro ExpandWhenUsed argspecs optarg contents)]
+
+newenvironment :: PandocMonad m => LP m (Maybe (Text, Macro, Macro))
+newenvironment = do
+  pos <- getPosition
+  Tok _ (CtrlSeq mtype) _ <- controlSeq "newenvironment" <|>
+                             controlSeq "renewenvironment" <|>
+                             controlSeq "provideenvironment"
+  withVerbatimMode $ do
+    optional $ symbol '*'
+    spaces
+    name <- untokenize <$> braced
+    spaces
+    numargs <- option 0 $ try bracketedNum
+    spaces
+    optarg <- option Nothing $ Just <$> try bracketedToks
+    let argspecs = map (\i -> ArgNum i) [1..numargs]
+    startcontents <- spaces >> bracedOrToken
+    endcontents <- spaces >> bracedOrToken
+    macros <- sMacros <$> getState
+    case M.lookup name macros of
+         Just _
+           | mtype == "newenvironment" -> do
+               report $ MacroAlreadyDefined name pos
+               return Nothing
+           | mtype == "provideenvironment" ->
+               return Nothing
+         _ -> return $ Just (name,
+                      Macro ExpandWhenUsed argspecs optarg startcontents,
+                      Macro ExpandWhenUsed [] Nothing endcontents)
+
+bracketedNum :: PandocMonad m => LP m Int
+bracketedNum = do
+  ds <- untokenize <$> bracketedToks
+  case safeRead ds of
+       Just i -> return i
+       _      -> return 0
diff --git a/src/Text/Pandoc/Readers/LaTeX/Math.hs b/src/Text/Pandoc/Readers/LaTeX/Math.hs
new file mode 100644
index 000000000..5b49a0376
--- /dev/null
+++ b/src/Text/Pandoc/Readers/LaTeX/Math.hs
@@ -0,0 +1,221 @@
+{-# LANGUAGE OverloadedStrings #-}
+module Text.Pandoc.Readers.LaTeX.Math
+  ( dollarsMath
+  , inlineEnvironments
+  , inlineEnvironment
+  , mathInline
+  , mathDisplay
+  , theoremstyle
+  , theoremEnvironment
+  , newtheorem
+  , proof
+  )
+where
+import Data.Maybe (fromMaybe)
+import Text.Pandoc.Walk (walk)
+import Text.Pandoc.Builder as B
+import qualified Data.Sequence as Seq
+import Text.Pandoc.Readers.LaTeX.Parsing
+import Text.Pandoc.Readers.LaTeX.Types
+import Text.Pandoc.Class
+import Text.Pandoc.Shared (trimMath, stripTrailingNewlines)
+import Text.Pandoc.Parsing hiding (blankline, mathDisplay, mathInline,
+                            optional, space, spaces, withRaw, (<|>))
+import Control.Applicative ((<|>), optional)
+import Control.Monad (guard, mzero)
+import qualified Data.Map as M
+import Data.Text (Text)
+
+dollarsMath :: PandocMonad m => LP m Inlines
+dollarsMath = do
+  symbol '$'
+  display <- option False (True <$ symbol '$')
+  (do contents <- try $ untokenize <$> pDollarsMath 0
+      if display
+         then mathDisplay contents <$ symbol '$'
+         else return $ mathInline contents)
+   <|> (guard display >> return (mathInline ""))
+
+-- Int is number of embedded groupings
+pDollarsMath :: PandocMonad m => Int -> LP m [Tok]
+pDollarsMath n = do
+  tk@(Tok _ toktype t) <- anyTok
+  case toktype of
+       Symbol | t == "$"
+              , n == 0 -> return []
+              | t == "\\" -> do
+                  tk' <- anyTok
+                  (tk :) . (tk' :) <$> pDollarsMath n
+              | t == "{" -> (tk :) <$> pDollarsMath (n+1)
+              | t == "}" ->
+                if n > 0
+                then (tk :) <$> pDollarsMath (n-1)
+                else mzero
+       _ -> (tk :) <$> pDollarsMath n
+
+mathDisplay :: Text -> Inlines
+mathDisplay = displayMath . trimMath
+
+mathInline :: Text -> Inlines
+mathInline = math . trimMath
+
+mathEnvWith :: PandocMonad m
+            => (Inlines -> a) -> Maybe Text -> Text -> LP m a
+mathEnvWith f innerEnv name = f . mathDisplay . inner <$> mathEnv name
+   where inner x = case innerEnv of
+                        Nothing -> x
+                        Just y  -> "\\begin{" <> y <> "}\n" <> x <>
+                                   "\\end{" <> y <> "}"
+
+mathEnv :: PandocMonad m => Text -> LP m Text
+mathEnv name = do
+  skipopts
+  optional blankline
+  res <- manyTill anyTok (end_ name)
+  return $ stripTrailingNewlines $ untokenize res
+
+inlineEnvironment :: PandocMonad m => LP m Inlines
+inlineEnvironment = try $ do
+  controlSeq "begin"
+  name <- untokenize <$> braced
+  M.findWithDefault mzero name inlineEnvironments
+
+inlineEnvironments :: PandocMonad m => M.Map Text (LP m Inlines)
+inlineEnvironments = M.fromList [
+    ("displaymath", mathEnvWith id Nothing "displaymath")
+  , ("math", math <$> mathEnv "math")
+  , ("equation", mathEnvWith id Nothing "equation")
+  , ("equation*", mathEnvWith id Nothing "equation*")
+  , ("gather", mathEnvWith id (Just "gathered") "gather")
+  , ("gather*", mathEnvWith id (Just "gathered") "gather*")
+  , ("multline", mathEnvWith id (Just "gathered") "multline")
+  , ("multline*", mathEnvWith id (Just "gathered") "multline*")
+  , ("eqnarray", mathEnvWith id (Just "aligned") "eqnarray")
+  , ("eqnarray*", mathEnvWith id (Just "aligned") "eqnarray*")
+  , ("align", mathEnvWith id (Just "aligned") "align")
+  , ("align*", mathEnvWith id (Just "aligned") "align*")
+  , ("alignat", mathEnvWith id (Just "aligned") "alignat")
+  , ("alignat*", mathEnvWith id (Just "aligned") "alignat*")
+  , ("dmath", mathEnvWith id Nothing "dmath")
+  , ("dmath*", mathEnvWith id Nothing "dmath*")
+  , ("dgroup", mathEnvWith id (Just "aligned") "dgroup")
+  , ("dgroup*", mathEnvWith id (Just "aligned") "dgroup*")
+  , ("darray", mathEnvWith id (Just "aligned") "darray")
+  , ("darray*", mathEnvWith id (Just "aligned") "darray*")
+  ]
+
+theoremstyle :: PandocMonad m => LP m Blocks
+theoremstyle = do
+  stylename <- untokenize <$> braced
+  let mbstyle = case stylename of
+                  "plain"      -> Just PlainStyle
+                  "definition" -> Just DefinitionStyle
+                  "remark"     -> Just RemarkStyle
+                  _            -> Nothing
+  case mbstyle of
+    Nothing  -> return ()
+    Just sty -> updateState $ \s -> s{ sLastTheoremStyle = sty }
+  return mempty
+
+newtheorem :: PandocMonad m => LP m Inlines -> LP m Blocks
+newtheorem inline = do
+  number <- option True (False <$ symbol '*' <* sp)
+  name <- untokenize <$> braced
+  sp
+  series <- option Nothing $ Just . untokenize <$> bracketedToks
+  sp
+  showName <- tokWith inline
+  sp
+  syncTo <- option Nothing $ Just . untokenize <$> bracketedToks
+  sty <- sLastTheoremStyle <$> getState
+  let spec = TheoremSpec { theoremName = showName
+                         , theoremStyle = sty
+                         , theoremSeries = series
+                         , theoremSyncTo = syncTo
+                         , theoremNumber = number
+                         , theoremLastNum = DottedNum [0] }
+  tmap <- sTheoremMap <$> getState
+  updateState $ \s -> s{ sTheoremMap =
+                            M.insert name spec tmap }
+  return mempty
+
+theoremEnvironment :: PandocMonad m
+                   => LP m Blocks -> LP m Inlines -> Text -> LP m Blocks
+theoremEnvironment blocks opt name = do
+  tmap <- sTheoremMap <$> getState
+  case M.lookup name tmap of
+    Nothing -> mzero
+    Just tspec -> do
+       optTitle <- option mempty $ (\x -> space <> "(" <> x <> ")") <$> opt
+       mblabel <- option Nothing $ Just . untokenize <$>
+                   try (spaces >> controlSeq "label" >> spaces >> braced)
+       bs <- env name blocks
+       number <-
+         if theoremNumber tspec
+            then do
+               let name' = fromMaybe name $ theoremSeries tspec
+               num <- getNextNumber
+                   (maybe (DottedNum [0]) theoremLastNum .
+                    M.lookup name' . sTheoremMap)
+               updateState $ \s ->
+                 s{ sTheoremMap =
+                       M.adjust
+                       (\spec -> spec{ theoremLastNum = num })
+                       name'
+                       (sTheoremMap s)
+                  }
+
+               case mblabel of
+                 Just ident ->
+                   updateState $ \s ->
+                     s{ sLabels = M.insert ident
+                         (B.toList $
+                           theoremName tspec <> "\160" <>
+                           str (renderDottedNum num)) (sLabels s) }
+                 Nothing -> return ()
+               return $ space <> B.text (renderDottedNum num)
+            else return mempty
+       let titleEmph = case theoremStyle tspec of
+                         PlainStyle      -> B.strong
+                         DefinitionStyle -> B.strong
+                         RemarkStyle     -> B.emph
+       let title = titleEmph (theoremName tspec <> number)
+                      <> optTitle <> "." <> space
+       return $ divWith (fromMaybe "" mblabel, [name], []) $ addTitle title
+              $ case theoremStyle tspec of
+                  PlainStyle -> walk italicize bs
+                  _          -> bs
+
+
+
+proof :: PandocMonad m => LP m Blocks -> LP m Inlines -> LP m Blocks
+proof blocks opt = do
+  title <- option (B.text "Proof") opt
+  bs <- env "proof" blocks
+  return $
+    B.divWith ("", ["proof"], []) $
+      addQed $ addTitle (B.emph (title <> ".")) bs
+
+addTitle :: Inlines -> Blocks -> Blocks
+addTitle ils bs =
+  case B.toList bs of
+    (Para xs : rest)
+      -> B.fromList (Para (B.toList ils ++ (Space : xs)) : rest)
+    _ -> B.para ils <> bs
+
+addQed :: Blocks -> Blocks
+addQed bs =
+  case Seq.viewr (B.unMany bs) of
+    s Seq.:> Para ils
+      -> B.Many (s Seq.|> Para (ils ++ B.toList qedSign))
+    _ -> bs <> B.para qedSign
+ where
+  qedSign = B.str "\xa0\x25FB"
+
+italicize :: Block -> Block
+italicize x@(Para [Image{}]) = x -- see #6925
+italicize (Para ils) = Para [Emph ils]
+italicize (Plain ils) = Plain [Emph ils]
+italicize x = x
+
+
diff --git a/src/Text/Pandoc/Readers/LaTeX/Parsing.hs b/src/Text/Pandoc/Readers/LaTeX/Parsing.hs
index 563d32883..9dac4d6ef 100644
--- a/src/Text/Pandoc/Readers/LaTeX/Parsing.hs
+++ b/src/Text/Pandoc/Readers/LaTeX/Parsing.hs
@@ -1,11 +1,12 @@
 {-# LANGUAGE FlexibleInstances     #-}
+{-# LANGUAGE FlexibleContexts      #-}
 {-# LANGUAGE LambdaCase            #-}
 {-# LANGUAGE MultiParamTypeClasses #-}
 {-# LANGUAGE OverloadedStrings     #-}
 {-# LANGUAGE ScopedTypeVariables   #-}
 {- |
    Module      : Text.Pandoc.Readers.LaTeX.Parsing
-   Copyright   : Copyright (C) 2006-2020 John MacFarlane
+   Copyright   : Copyright (C) 2006-2021 John MacFarlane
    License     : GNU GPL, version 2 or above
 
    Maintainer  : John MacFarlane <jgm@berkeley.edu>
@@ -27,11 +28,15 @@ module Text.Pandoc.Readers.LaTeX.Parsing
   , rawLaTeXParser
   , applyMacros
   , tokenize
+  , tokenizeSources
+  , getInputTokens
   , untokenize
   , untoken
   , totoks
   , toksToString
   , satisfyTok
+  , parseFromToks
+  , disablingWithRaw
   , doMacros
   , doMacros'
   , setpos
@@ -52,6 +57,7 @@ module Text.Pandoc.Readers.LaTeX.Parsing
   , comment
   , anyTok
   , singleChar
+  , tokWith
   , specialChars
   , endline
   , blankline
@@ -78,6 +84,11 @@ module Text.Pandoc.Readers.LaTeX.Parsing
   , rawopt
   , overlaySpecification
   , getNextNumber
+  , label
+  , setCaption
+  , resetCaption
+  , env
+  , addMeta
   ) where
 
 import Control.Applicative (many, (<|>))
@@ -87,13 +98,15 @@ import Control.Monad.Trans (lift)
 import Data.Char (chr, isAlphaNum, isDigit, isLetter, ord)
 import Data.Default
 import Data.List (intercalate)
+import qualified Data.IntMap as IntMap
 import qualified Data.Map as M
 import qualified Data.Set as Set
 import Data.Text (Text)
 import qualified Data.Text as T
 import Text.Pandoc.Builder
 import Text.Pandoc.Class.PandocMonad (PandocMonad, report)
-import Text.Pandoc.Error (PandocError (PandocMacroLoop))
+import Text.Pandoc.Error
+         (PandocError (PandocMacroLoop,PandocShouldNeverHappenError))
 import Text.Pandoc.Logging
 import Text.Pandoc.Options
 import Text.Pandoc.Parsing hiding (blankline, many, mathDisplay, mathInline,
@@ -102,7 +115,6 @@ import Text.Pandoc.Readers.LaTeX.Types (ExpansionPoint (..), Macro (..),
                                         ArgSpec (..), Tok (..), TokType (..))
 import Text.Pandoc.Shared
 import Text.Parsec.Pos
--- import Debug.Trace
 
 newtype DottedNum = DottedNum [Int]
   deriving (Show, Eq)
@@ -151,7 +163,9 @@ data LaTeXState = LaTeXState{ sOptions       :: ReaderOptions
                             , sLabels        :: M.Map Text [Inline]
                             , sHasChapters   :: Bool
                             , sToggles       :: M.Map Text Bool
-                            , sExpanded      :: Bool
+                            , sFileContents  :: M.Map Text Text
+                            , sEnableWithRaw :: Bool
+                            , sRawTokens     :: IntMap.IntMap [Tok]
                             }
      deriving Show
 
@@ -176,7 +190,9 @@ defaultLaTeXState = LaTeXState{ sOptions       = def
                               , sLabels        = M.empty
                               , sHasChapters   = False
                               , sToggles       = M.empty
-                              , sExpanded      = False
+                              , sFileContents  = M.empty
+                              , sEnableWithRaw = True
+                              , sRawTokens     = IntMap.empty
                               }
 
 instance PandocMonad m => HasQuoteContext LaTeXState m where
@@ -232,21 +248,25 @@ withVerbatimMode parser = do
        updateState $ \st -> st{ sVerbatimMode = False }
        return result
 
-rawLaTeXParser :: (PandocMonad m, HasMacros s, HasReaderOptions s)
+rawLaTeXParser :: (PandocMonad m, HasMacros s, HasReaderOptions s, Show a)
                => [Tok] -> Bool -> LP m a -> LP m a
-               -> ParserT Text s m (a, Text)
+               -> ParserT Sources s m (a, Text)
 rawLaTeXParser toks retokenize parser valParser = do
   pstate <- getState
   let lstate = def{ sOptions = extractReaderOptions pstate }
   let lstate' = lstate { sMacros = extractMacros pstate }
+  let setStartPos = case toks of
+                      Tok pos _ _ : _ -> setPosition pos
+                      _ -> return ()
+  let preparser = setStartPos >> parser
   let rawparser = (,) <$> withRaw valParser <*> getState
-  res' <- lift $ runParserT (snd <$> withRaw parser) lstate "chunk" toks
+  res' <- lift $ runParserT (withRaw (preparser >> getPosition))
+                            lstate "chunk" toks
   case res' of
        Left _    -> mzero
-       Right toks' -> do
+       Right (endpos, toks') -> do
          res <- lift $ runParserT (do when retokenize $ do
                                         -- retokenize, applying macros
-                                        doMacros
                                         ts <- many (satisfyTok (const True))
                                         setInput ts
                                       rawparser)
@@ -255,7 +275,13 @@ rawLaTeXParser toks retokenize parser valParser = do
               Left _    -> mzero
               Right ((val, raw), st) -> do
                 updateState (updateMacros (sMacros st <>))
-                _ <- takeP (T.length (untokenize toks'))
+                let skipTilPos stopPos = do
+                      anyChar
+                      pos <- getPosition
+                      if pos >= stopPos
+                         then return ()
+                         else skipTilPos stopPos
+                skipTilPos endpos
                 let result = untokenize raw
                 -- ensure we end with space if input did, see #4442
                 let result' =
@@ -268,7 +294,7 @@ rawLaTeXParser toks retokenize parser valParser = do
                 return (val, result')
 
 applyMacros :: (PandocMonad m, HasMacros s, HasReaderOptions s)
-            => Text -> ParserT Text s m Text
+            => Text -> ParserT Sources s m Text
 applyMacros s = (guardDisabled Ext_latex_macros >> return s) <|>
    do let retokenize = untokenize <$> many (satisfyTok (const True))
       pstate <- getState
@@ -279,6 +305,31 @@ applyMacros s = (guardDisabled Ext_latex_macros >> return s) <|>
            Left e   -> Prelude.fail (show e)
            Right s' -> return s'
 
+{-
+When tokenize or untokenize change, test with this
+QuickCheck property:
+
+> tokUntokRoundtrip :: String -> Bool
+> tokUntokRoundtrip s =
+>   let t = T.pack s in untokenize (tokenize "random" t) == t
+-}
+
+tokenizeSources :: Sources -> [Tok]
+tokenizeSources = concatMap tokenizeSource . unSources
+ where
+   tokenizeSource (pos, t) = totoks pos t
+
+-- Return tokens from input sources. Ensure that starting position is
+-- correct.
+getInputTokens :: PandocMonad m => ParserT Sources s m [Tok]
+getInputTokens = do
+  pos <- getPosition
+  ss <- getInput
+  return $
+    case ss of
+      Sources [] -> []
+      Sources ((_,t):rest) -> tokenizeSources $ Sources ((pos,t):rest)
+
 tokenize :: SourceName -> Text -> [Tok]
 tokenize sourcename = totoks (initialPos sourcename)
 
@@ -402,41 +453,62 @@ untoken t = untokenAccum t mempty
 toksToString :: [Tok] -> String
 toksToString = T.unpack . untokenize
 
+parseFromToks :: PandocMonad m => LP m a -> [Tok] -> LP m a
+parseFromToks parser toks = do
+  oldInput <- getInput
+  setInput toks
+  oldpos <- getPosition
+  case toks of
+     Tok pos _ _ : _ -> setPosition pos
+     _ -> return ()
+  result <- disablingWithRaw parser
+  setInput oldInput
+  setPosition oldpos
+  return result
+
+disablingWithRaw :: PandocMonad m => LP m a -> LP m a
+disablingWithRaw parser = do
+  oldEnableWithRaw <- sEnableWithRaw <$> getState
+  updateState $ \st -> st{ sEnableWithRaw = False }
+  result <- parser
+  updateState $ \st -> st{ sEnableWithRaw = oldEnableWithRaw }
+  return result
+
 satisfyTok :: PandocMonad m => (Tok -> Bool) -> LP m Tok
 satisfyTok f = do
     doMacros -- apply macros on remaining input stream
     res <- tokenPrim (T.unpack . untoken) updatePos matcher
-    updateState $ \st -> st{ sExpanded = False }
-    return res
+    updateState $ \st ->
+      if sEnableWithRaw st
+         then st{ sRawTokens = IntMap.map (res:) $ sRawTokens st }
+         else st
+    return $! res
   where matcher t | f t       = Just t
                   | otherwise = Nothing
         updatePos :: SourcePos -> Tok -> [Tok] -> SourcePos
         updatePos _spos _ (Tok pos _ _ : _) = pos
-        updatePos spos _ []                 = incSourceColumn spos 1
+        updatePos spos (Tok _ _ t)  []      = incSourceColumn spos (T.length t)
 
 doMacros :: PandocMonad m => LP m ()
 doMacros = do
-  expanded <- sExpanded <$> getState
-  verbatimMode <- sVerbatimMode <$> getState
-  unless (expanded || verbatimMode) $ do
-      getInput >>= doMacros' 1 >>= setInput
-      updateState $ \st -> st{ sExpanded = True }
+  st <- getState
+  unless (sVerbatimMode st) $
+    getInput >>= doMacros' 1 >>= setInput
 
 doMacros' :: PandocMonad m => Int -> [Tok] -> LP m [Tok]
 doMacros' n inp =
   case inp of
      Tok spos (CtrlSeq "begin") _ : Tok _ Symbol "{" :
       Tok _ Word name : Tok _ Symbol "}" : ts
-        -> handleMacros n spos name ts
+        -> handleMacros n spos name ts <|> return inp
      Tok spos (CtrlSeq "end") _ : Tok _ Symbol "{" :
       Tok _ Word name : Tok _ Symbol "}" : ts
-        -> handleMacros n spos ("end" <> name) ts
+        -> handleMacros n spos ("end" <> name) ts <|> return inp
      Tok _ (CtrlSeq "expandafter") _ : t : ts
         -> combineTok t <$> doMacros' n ts
      Tok spos (CtrlSeq name) _ : ts
-        -> handleMacros n spos name ts
+        -> handleMacros n spos name ts <|> return inp
      _ -> return inp
-   <|> return inp
 
   where
     combineTok (Tok spos (CtrlSeq name) x) (Tok _ Word w : ts)
@@ -482,7 +554,7 @@ doMacros' n inp =
         $ throwError $ PandocMacroLoop name
       macros <- sMacros <$> getState
       case M.lookup name macros of
-           Nothing -> mzero
+           Nothing -> trySpecialMacro name ts
            Just (Macro expansionPoint argspecs optarg newtoks) -> do
              let getargs' = do
                    args <-
@@ -510,6 +582,41 @@ doMacros' n inp =
                    ExpandWhenUsed    -> doMacros' (n' + 1) result
                    ExpandWhenDefined -> return result
 
+-- | Certain macros do low-level tex manipulations that can't
+-- be represented in our Macro type, so we handle them here.
+trySpecialMacro :: PandocMonad m => Text -> [Tok] -> LP m [Tok]
+trySpecialMacro "xspace" ts = do
+  ts' <- doMacros' 1 ts
+  case ts' of
+    Tok pos Word t : _
+      | startsWithAlphaNum t -> return $ Tok pos Spaces " " : ts'
+    _ -> return ts'
+trySpecialMacro "iftrue" ts = handleIf True ts
+trySpecialMacro "iffalse" ts = handleIf False ts
+trySpecialMacro _ _ = mzero
+
+handleIf :: PandocMonad m => Bool -> [Tok] -> LP m [Tok]
+handleIf b ts = do
+  res' <- lift $ runParserT (ifParser b) defaultLaTeXState "tokens" ts
+  case res' of
+    Left _ -> Prelude.fail "Could not parse conditional"
+    Right ts' -> return ts'
+
+ifParser :: PandocMonad m => Bool -> LP m [Tok]
+ifParser b = do
+  ifToks <- many (notFollowedBy (controlSeq "else" <|> controlSeq "fi")
+                    *> anyTok)
+  elseToks <- (controlSeq "else" >> manyTill anyTok (controlSeq "fi"))
+                 <|> ([] <$ controlSeq "fi")
+  rest <- getInput
+  return $ (if b then ifToks else elseToks) ++ rest
+
+startsWithAlphaNum :: Text -> Bool
+startsWithAlphaNum t =
+  case T.uncons t of
+       Just (c, _) | isAlphaNum c -> True
+       _           -> False
+
 setpos :: SourcePos -> Tok -> Tok
 setpos spos (Tok _ tt txt) = Tok spos tt txt
 
@@ -592,18 +699,22 @@ isCommentTok _                 = False
 anyTok :: PandocMonad m => LP m Tok
 anyTok = satisfyTok (const True)
 
+singleCharTok :: PandocMonad m => LP m Tok
+singleCharTok =
+  satisfyTok $ \case
+     Tok _ Word  t   -> T.length t == 1
+     Tok _ Symbol t  -> not (T.any (`Set.member` specialChars) t)
+     _               -> False
+
 singleChar :: PandocMonad m => LP m Tok
-singleChar = try $ do
-  Tok pos toktype t <- satisfyTok (tokTypeIn [Word, Symbol])
-  guard $ not $ toktype == Symbol &&
-                T.any (`Set.member` specialChars) t
-  if T.length t > 1
-     then do
-       let (t1, t2) = (T.take 1 t, T.drop 1 t)
-       inp <- getInput
-       setInput $ Tok (incSourceColumn pos 1) toktype t2 : inp
-       return $ Tok pos toktype t1
-     else return $ Tok pos toktype t
+singleChar = singleCharTok <|> singleCharFromWord
+ where
+  singleCharFromWord = do
+    Tok pos toktype t <- disablingWithRaw $ satisfyTok isWordTok
+    let (t1, t2) = (T.take 1 t, T.drop 1 t)
+    inp <- getInput
+    setInput $ Tok pos toktype t1 : Tok (incSourceColumn pos 1) toktype t2 : inp
+    anyTok
 
 specialChars :: Set.Set Char
 specialChars = Set.fromList "#$%&~_^\\{}"
@@ -646,28 +757,25 @@ grouped parser = try $ do
   -- {{a,b}} should be parsed the same as {a,b}
   try (grouped parser <* egroup) <|> (mconcat <$> manyTill parser egroup)
 
-braced' :: PandocMonad m => LP m Tok -> Int -> LP m [Tok]
-braced' getTok n =
-  handleEgroup <|> handleBgroup <|> handleOther
-  where handleEgroup = do
-          t <- symbol '}'
-          if n == 1
-             then return []
-             else (t:) <$> braced' getTok (n - 1)
-        handleBgroup = do
-          t <- symbol '{'
-          (t:) <$> braced' getTok (n + 1)
-        handleOther = do
-          t <- getTok
-          (t:) <$> braced' getTok n
+braced' :: PandocMonad m => LP m Tok -> LP m [Tok]
+braced' getTok = symbol '{' *> go (1 :: Int)
+ where
+  go n = do
+    t <- getTok
+    case t of
+      Tok _ Symbol "}"
+        | n > 1     -> (t:) <$> go (n - 1)
+        | otherwise -> return []
+      Tok _ Symbol "{" -> (t:) <$> go (n + 1)
+      _ -> (t:) <$> go n
 
 braced :: PandocMonad m => LP m [Tok]
-braced = symbol '{' *> braced' anyTok 1
+braced = braced' anyTok
 
 -- URLs require special handling, because they can contain %
 -- characters.  So we retonenize comments as we go...
 bracedUrl :: PandocMonad m => LP m [Tok]
-bracedUrl = bgroup *> braced' (retokenizeComment >> anyTok) 1
+bracedUrl = braced' (retokenizeComment >> anyTok)
 
 -- For handling URLs, which allow literal % characters...
 retokenizeComment :: PandocMonad m => LP m ()
@@ -723,16 +831,29 @@ ignore raw = do
 
 withRaw :: PandocMonad m => LP m a -> LP m (a, [Tok])
 withRaw parser = do
-  inp <- getInput
+  rawTokensMap <- sRawTokens <$> getState
+  let key = case IntMap.lookupMax rawTokensMap of
+               Nothing     -> 0
+               Just (n,_)  -> n + 1
+  -- insert empty list at key
+  updateState $ \st -> st{ sRawTokens =
+                             IntMap.insert key [] $ sRawTokens st }
   result <- parser
-  nxtpos <- option Nothing ((\(Tok pos' _ _) -> Just pos') <$> lookAhead anyTok)
-  let raw = takeWhile (\(Tok pos _ _) -> maybe True
-                  (\p -> sourceName p /= sourceName pos || pos < p) nxtpos) inp
+  mbRevToks <- IntMap.lookup key . sRawTokens <$> getState
+  raw <- case mbRevToks of
+           Just revtoks -> do
+             updateState $ \st -> st{ sRawTokens =
+                                        IntMap.delete key $ sRawTokens st}
+             return $ reverse revtoks
+           Nothing      ->
+             throwError $ PandocShouldNeverHappenError $
+                "sRawTokens has nothing at key " <> T.pack (show key)
   return (result, raw)
 
 keyval :: PandocMonad m => LP m (Text, Text)
 keyval = try $ do
-  Tok _ Word key <- satisfyTok isWordTok
+  key <- untokenize <$> many1 (notFollowedBy (symbol '=') >>
+                         (symbol '-' <|> symbol '_' <|> satisfyTok isWordTok))
   sp
   val <- option mempty $ do
            symbol '='
@@ -792,7 +913,7 @@ getRawCommand name txt = do
   (_, rawargs) <- withRaw $
       case name of
            "write" -> do
-             void $ satisfyTok isWordTok -- digits
+             void $ many $ satisfyTok isDigitTok -- digits
              void braced
            "titleformat" -> do
              void braced
@@ -807,6 +928,10 @@ getRawCommand name txt = do
                void $ many braced
   return $ txt <> untokenize rawargs
 
+isDigitTok :: Tok -> Bool
+isDigitTok (Tok _ Word t) = T.all isDigit t
+isDigitTok _              = False
+
 skipopts :: PandocMonad m => LP m ()
 skipopts = skipMany (void overlaySpecification <|> void rawopt)
 
@@ -874,3 +999,35 @@ getNextNumber getCurrentNum = do
                Just n  -> [n, 1]
                Nothing -> [1]
 
+label :: PandocMonad m => LP m ()
+label = do
+  controlSeq "label"
+  t <- braced
+  updateState $ \st -> st{ sLastLabel = Just $ untokenize t }
+
+setCaption :: PandocMonad m => LP m Inlines -> LP m ()
+setCaption inline = try $ do
+  skipopts
+  ils <- tokWith inline
+  optional $ try $ spaces *> label
+  updateState $ \st -> st{ sCaption = Just ils }
+
+resetCaption :: PandocMonad m => LP m ()
+resetCaption = updateState $ \st -> st{ sCaption   = Nothing
+                                      , sLastLabel = Nothing }
+
+env :: PandocMonad m => Text -> LP m a -> LP m a
+env name p = p <* end_ name
+
+tokWith :: PandocMonad m => LP m Inlines -> LP m Inlines
+tokWith inlineParser = try $ spaces >>
+                                 grouped inlineParser
+                            <|> (lookAhead anyControlSeq >> inlineParser)
+                            <|> singleChar'
+  where singleChar' = do
+          Tok _ _ t <- singleChar
+          return $ str t
+
+addMeta :: PandocMonad m => ToMetaValue a => Text -> a -> LP m ()
+addMeta field val = updateState $ \st ->
+   st{ sMeta = addMetaField field val $ sMeta st }
diff --git a/src/Text/Pandoc/Readers/LaTeX/SIunitx.hs b/src/Text/Pandoc/Readers/LaTeX/SIunitx.hs
index db9c276e7..b8bf0ce7f 100644
--- a/src/Text/Pandoc/Readers/LaTeX/SIunitx.hs
+++ b/src/Text/Pandoc/Readers/LaTeX/SIunitx.hs
@@ -1,12 +1,7 @@
+{-# LANGUAGE ScopedTypeVariables #-}
 {-# LANGUAGE OverloadedStrings #-}
 module Text.Pandoc.Readers.LaTeX.SIunitx
-  ( dosi
-  , doSI
-  , doSIrange
-  , doSInum
-  , doSInumlist
-  , doSIang
-  )
+  ( siunitxCommands )
 where
 import Text.Pandoc.Builder
 import Text.Pandoc.Readers.LaTeX.Parsing
@@ -15,14 +10,32 @@ import Text.Pandoc.Class
 import Text.Pandoc.Parsing hiding (blankline, mathDisplay, mathInline,
                             optional, space, spaces, withRaw, (<|>))
 import Control.Applicative ((<|>))
+import Control.Monad (void)
 import qualified Data.Map as M
 import Data.Char (isDigit)
 import Data.Text (Text)
 import qualified Data.Text as T
 import Data.List (intersperse)
+import qualified Data.Sequence as Seq
+import Text.Pandoc.Walk (walk)
+
+siunitxCommands :: PandocMonad m
+                 => LP m Inlines -> M.Map Text (LP m Inlines)
+siunitxCommands tok = M.fromList
+  [ ("si", dosi tok)
+  , ("SI", doSI tok)
+  , ("SIrange", doSIrange True tok)
+  , ("numrange", doSIrange False tok)
+  , ("numlist", doSInumlist)
+  , ("SIlist", doSIlist tok)
+  , ("num", doSInum)
+  , ("ang", doSIang)
+  ]
 
 dosi :: PandocMonad m => LP m Inlines -> LP m Inlines
-dosi tok = grouped (siUnit tok) <|> siUnit tok
+dosi tok = do
+  options <- option [] keyvals
+  grouped (siUnit options tok) <|> siUnit options tok
 
 -- converts e.g. \SI{1}[\$]{} to "$ 1" or \SI{1}{\euro} to "1 €"
 doSI :: PandocMonad m => LP m Inlines -> LP m Inlines
@@ -57,23 +70,50 @@ doSInumlist = do
              mconcat (intersperse (str "," <> space) (init xs)) <>
              text ", & " <> last xs
 
+doSIlist :: PandocMonad m => LP m Inlines -> LP m Inlines
+doSIlist tok = do
+  options <- option [] keyvals
+  nums <- map tonum . T.splitOn ";" . untokenize <$> braced
+  unit <- grouped (siUnit options tok) <|> siUnit options tok
+  let xs = map (<> (str "\xa0" <> unit)) nums
+  case xs of
+    []  -> return mempty
+    [x] -> return x
+    _   -> return $
+             mconcat (intersperse (str "," <> space) (init xs)) <>
+             text ", & " <> last xs
+
 parseNum :: Parser Text () Inlines
 parseNum = (mconcat <$> many parseNumPart) <* eof
 
+minus :: Text
+minus = "\x2212"
+
+hyphenToMinus :: Inline -> Inline
+hyphenToMinus (Str t) = Str (T.replace "-" minus t)
+hyphenToMinus x = x
+
 parseNumPart :: Parser Text () Inlines
 parseNumPart =
   parseDecimalNum <|>
   parseComma <|>
   parsePlusMinus <|>
+  parsePM <|>
   parseI <|>
   parseExp <|>
   parseX <|>
   parseSpace
  where
-  parseDecimalNum = do
-    pref <- option mempty $ (mempty <$ char '+') <|> ("\x2212" <$ char '-')
-    basenum <- (pref <>) . T.pack
-                <$> many1 (satisfy (\c -> isDigit c || c == '.'))
+  parseDecimalNum, parsePlusMinus, parsePM,
+    parseComma, parseI, parseX,
+    parseExp, parseSpace :: Parser Text () Inlines
+  parseDecimalNum = try $ do
+    pref <- option mempty $ (mempty <$ char '+') <|> (minus <$ char '-')
+    basenum' <- many1 (satisfy (\c -> isDigit c || c == '.'))
+    let basenum = pref <> T.pack
+                    (case basenum' of
+                      '.':_ -> '0':basenum'
+                      _ -> basenum')
     uncertainty <- option mempty $ T.pack <$> parseParens
     if T.null uncertainty
        then return $ str basenum
@@ -91,6 +131,7 @@ parseNumPart =
                                                | otherwise -> "." <> t
   parseComma = str "." <$ char ','
   parsePlusMinus = str "\xa0\xb1\xa0" <$ try (string "+-")
+  parsePM = str "\xa0\xb1\xa0" <$ try (string "\\pm")
   parseParens =
     char '(' *> many1 (satisfy (\c -> isDigit c || c == '.')) <* char ')'
   parseI = str "i" <$ char 'i'
@@ -103,11 +144,14 @@ doSIang :: PandocMonad m => LP m Inlines
 doSIang = do
   skipopts
   ps <- T.splitOn ";" . untokenize <$> braced
+  let dropPlus t = case T.uncons t of
+                     Just ('+',t') -> t'
+                     _ -> t
   case ps ++ repeat "" of
     (d:m:s:_) -> return $
-      (if T.null d then mempty else str d <> str "\xb0") <>
-      (if T.null m then mempty else str m <> str "\x2032") <>
-      (if T.null s then mempty else str s <> str "\x2033")
+      (if T.null d then mempty else str (dropPlus d) <> str "\xb0") <>
+      (if T.null m then mempty else str (dropPlus m) <> str "\x2032") <>
+      (if T.null s then mempty else str (dropPlus s) <> str "\x2033")
     _ -> return mempty
 
 -- converts e.g. \SIrange{100}{200}{\ms} to "100 ms--200 ms"
@@ -136,40 +180,99 @@ doSIrange includeUnits tok = do
 emptyOr160 :: Inlines -> Inlines
 emptyOr160 x = if x == mempty then x else str "\160"
 
-siUnit :: PandocMonad m => LP m Inlines -> LP m Inlines
-siUnit tok = try (do
-  Tok _ (CtrlSeq name) _ <- anyControlSeq
-  case name of
-    "square" -> do
-       unit <- siUnit tok
-       return $ unit <> superscript "2"
-    "cubic" -> do
-       unit <- siUnit tok
-       return $ unit <> superscript "3"
-    "raisetothe" -> do
-       n <- tok
-       unit <- siUnit tok
-       return $ unit <> superscript n
-    _ ->
-       case M.lookup name siUnitMap of
-            Just il ->
-              option il $
-                choice
-                 [ (il <> superscript "2") <$ controlSeq "squared"
-                 , (il <> superscript "3") <$ controlSeq "cubed"
-                 , (\n -> il <> superscript n) <$> (controlSeq "tothe" *> tok)
-                 ]
-            Nothing -> fail "not an siunit unit command")
- <|> (lookAhead anyControlSeq >> tok)
- <|> (do Tok _ Word t <- satisfyTok isWordTok
-         return $ str t)
- <|> (symbol '^' *> (superscript <$> tok))
- <|> (symbol '_' *> (subscript <$> tok))
- <|> ("\xa0" <$ symbol '.')
- <|> ("\xa0" <$ symbol '~')
- <|> tok
- <|> (do Tok _ _ t <- anyTok
-         return (str t))
+siUnit :: forall m. PandocMonad m => [(Text,Text)] -> LP m Inlines -> LP m Inlines
+siUnit options tok = mconcat . intersperse (str "\xa0") <$> many1 siUnitPart
+ where
+  siUnitPart :: LP m Inlines
+  siUnitPart = try $ do
+    skipMany (void (symbol '.') <|> void (symbol '~') <|> spaces1)
+    x <- ((siPrefix <*> siBase)
+            <|> (do u <- siBase <|> tok
+                    option u $ siSuffix <*> pure u))
+    option x (siInfix x)
+  siInfix :: Inlines -> LP m Inlines
+  siInfix u1 = try $
+       (do _ <- controlSeq "per"
+           u2 <- siUnitPart
+           let useSlash = lookup "per-mode" options == Just "symbol"
+           if useSlash
+              then return (u1 <> str "/" <> u2)
+              else return (u1 <> str "\xa0" <> negateExponent u2))
+   <|> (do _ <- symbol '/'
+           u2 <- siUnitPart
+           return (u1 <> str "/" <> u2))
+  siPrefix :: LP m (Inlines -> Inlines)
+  siPrefix =
+       (do _ <- controlSeq "square"
+           skipopts
+           return (<> superscript "2"))
+   <|> (do _ <- controlSeq "cubic"
+           skipopts
+           return (<> superscript "3"))
+   <|> (do _ <- controlSeq "raisetothe"
+           skipopts
+           n <- walk hyphenToMinus <$> tok
+           return (<> superscript n))
+  siSuffix :: LP m (Inlines -> Inlines)
+  siSuffix =
+       (do _ <- controlSeq "squared"
+           skipopts
+           return (<> superscript "2"))
+   <|> (do _ <- controlSeq "cubed"
+           skipopts
+           return (<> superscript "3"))
+   <|> (do _ <- controlSeq "tothe"
+           skipopts
+           n <- walk hyphenToMinus <$> tok
+           return (<> superscript n))
+   <|> (symbol '^' *> (do n <- walk hyphenToMinus <$> tok
+                          return (<> superscript n)))
+   <|> (symbol '_' *> (do n <- walk hyphenToMinus <$> tok
+                          return (<> subscript n)))
+  negateExponent :: Inlines -> Inlines
+  negateExponent ils =
+    case Seq.viewr (unMany ils) of
+      xs Seq.:> Superscript ss -> (Many xs) <>
+                                     superscript (str minus <> fromList ss)
+      _ -> ils <> superscript (str (minus <> "1"))
+  siBase :: LP m Inlines
+  siBase =
+    ((try
+       (do Tok _ (CtrlSeq name) _ <- anyControlSeq
+           case M.lookup name siUnitModifierMap of
+              Just il -> (il <>) <$> siBase
+              Nothing ->
+                case M.lookup name siUnitMap of
+                   Just il -> pure il
+                   Nothing -> fail "not a unit command"))
+    <|> (do Tok _ Word t <- satisfyTok isWordTok
+            return $ str t)
+     )
+
+siUnitModifierMap :: M.Map Text Inlines
+siUnitModifierMap = M.fromList
+  [ ("atto", str "a")
+  , ("centi", str "c")
+  , ("deca", str "d")
+  , ("deci", str "d")
+  , ("deka", str "d")
+  , ("exa", str "E")
+  , ("femto", str "f")
+  , ("giga", str "G")
+  , ("hecto", str "h")
+  , ("kilo", str "k")
+  , ("mega", str "M")
+  , ("micro", str "μ")
+  , ("milli", str "m")
+  , ("nano", str "n")
+  , ("peta", str "P")
+  , ("pico", str "p")
+  , ("tera", str "T")
+  , ("yocto", str "y")
+  , ("yotta", str "Y")
+  , ("zepto", str "z")
+  , ("zetta", str "Z")
+  ]
 
 siUnitMap :: M.Map Text Inlines
 siUnitMap = M.fromList
@@ -269,7 +372,6 @@ siUnitMap = M.fromList
   , ("arcsecond", str "″")
   , ("astronomicalunit", str "ua")
   , ("atomicmassunit", str "u")
-  , ("atto", str "a")
   , ("bar", str "bar")
   , ("barn", str "b")
   , ("becquerel", str "Bq")
@@ -277,51 +379,38 @@ siUnitMap = M.fromList
   , ("bohr", emph (str "a") <> subscript (str "0"))
   , ("candela", str "cd")
   , ("celsius", str "°C")
-  , ("centi", str "c")
   , ("clight", emph (str "c") <> subscript (str "0"))
   , ("coulomb", str "C")
   , ("dalton", str "Da")
   , ("day", str "d")
-  , ("deca", str "d")
-  , ("deci", str "d")
   , ("decibel", str "db")
   , ("degreeCelsius",str "°C")
   , ("degree", str "°")
-  , ("deka", str "d")
   , ("electronmass", emph (str "m") <> subscript (str "e"))
   , ("electronvolt", str "eV")
   , ("elementarycharge", emph (str "e"))
-  , ("exa", str "E")
   , ("farad", str "F")
-  , ("femto", str "f")
-  , ("giga", str "G")
   , ("gram", str "g")
   , ("gray", str "Gy")
   , ("hartree", emph (str "E") <> subscript (str "h"))
   , ("hectare", str "ha")
-  , ("hecto", str "h")
   , ("henry", str "H")
   , ("hertz", str "Hz")
   , ("hour", str "h")
   , ("joule", str "J")
   , ("katal", str "kat")
   , ("kelvin", str "K")
-  , ("kilo", str "k")
   , ("kilogram", str "kg")
   , ("knot", str "kn")
   , ("liter", str "L")
   , ("litre", str "l")
   , ("lumen", str "lm")
   , ("lux", str "lx")
-  , ("mega", str "M")
   , ("meter", str "m")
   , ("metre", str "m")
-  , ("micro", str "μ")
-  , ("milli", str "m")
   , ("minute", str "min")
   , ("mmHg", str "mmHg")
   , ("mole", str "mol")
-  , ("nano", str "n")
   , ("nauticalmile", str "M")
   , ("neper", str "Np")
   , ("newton", str "N")
@@ -329,25 +418,17 @@ siUnitMap = M.fromList
   , ("Pa", str "Pa")
   , ("pascal", str "Pa")
   , ("percent", str "%")
-  , ("per", str "/")
-  , ("peta", str "P")
-  , ("pico", str "p")
   , ("planckbar", emph (str "\x210f"))
   , ("radian", str "rad")
   , ("second", str "s")
   , ("siemens", str "S")
   , ("sievert", str "Sv")
   , ("steradian", str "sr")
-  , ("tera", str "T")
   , ("tesla", str "T")
   , ("tonne", str "t")
   , ("volt", str "V")
   , ("watt", str "W")
   , ("weber", str "Wb")
-  , ("yocto", str "y")
-  , ("yotta", str "Y")
-  , ("zepto", str "z")
-  , ("zetta", str "Z")
   ]
 
 
diff --git a/src/Text/Pandoc/Readers/LaTeX/Table.hs b/src/Text/Pandoc/Readers/LaTeX/Table.hs
new file mode 100644
index 000000000..f56728fe1
--- /dev/null
+++ b/src/Text/Pandoc/Readers/LaTeX/Table.hs
@@ -0,0 +1,379 @@
+{-# LANGUAGE BangPatterns #-}
+{-# LANGUAGE OverloadedStrings #-}
+module Text.Pandoc.Readers.LaTeX.Table
+  ( tableEnvironments )
+where
+
+import Data.Functor (($>))
+import Text.Pandoc.Class
+import Text.Pandoc.Readers.LaTeX.Parsing
+import Text.Pandoc.Readers.LaTeX.Types
+import Text.Pandoc.Builder as B
+import qualified Data.Map as M
+import Data.Text (Text)
+import Data.Maybe (fromMaybe)
+import qualified Data.Text as T
+import Control.Applicative ((<|>), optional, many)
+import Control.Monad (when, void)
+import Text.Pandoc.Shared (safeRead, trim)
+import Text.Pandoc.Logging (LogMessage(SkippedContent))
+import Text.Pandoc.Walk (walkM)
+import Text.Pandoc.Parsing hiding (blankline, many, mathDisplay, mathInline,
+                            optional, space, spaces, withRaw, (<|>))
+
+tableEnvironments :: PandocMonad m
+                  => LP m Blocks
+                  -> LP m Inlines
+                  -> M.Map Text (LP m Blocks)
+tableEnvironments blocks inline =
+  M.fromList
+  [ ("longtable",  env "longtable" $
+          resetCaption *>
+            simpTable blocks inline "longtable" False >>= addTableCaption)
+  , ("table",  env "table" $
+          skipopts *> resetCaption *> blocks >>= addTableCaption)
+  , ("tabular*", env "tabular*" $ simpTable blocks inline "tabular*" True)
+  , ("tabularx", env "tabularx" $ simpTable blocks inline "tabularx" True)
+  , ("tabular", env "tabular"  $ simpTable blocks inline "tabular" False)
+  ]
+
+hline :: PandocMonad m => LP m ()
+hline = try $ do
+  spaces
+  controlSeq "hline" <|>
+    (controlSeq "cline" <* braced) <|>
+    -- booktabs rules:
+    controlSeq "toprule" <|>
+    controlSeq "bottomrule" <|>
+    controlSeq "midrule" <|>
+    controlSeq "endhead" <|>
+    controlSeq "endfirsthead"
+  spaces
+  optional rawopt
+  return ()
+
+lbreak :: PandocMonad m => LP m Tok
+lbreak = (controlSeq "\\" <|> controlSeq "tabularnewline")
+         <* skipopts <* spaces
+
+amp :: PandocMonad m => LP m Tok
+amp = symbol '&'
+
+-- Split a Word into individual Symbols (for parseAligns)
+splitWordTok :: PandocMonad m => LP m ()
+splitWordTok = do
+  inp <- getInput
+  case inp of
+       (Tok spos Word t : rest) ->
+         setInput $ map (Tok spos Symbol . T.singleton) (T.unpack t) <> rest
+       _ -> return ()
+
+parseAligns :: PandocMonad m => LP m [(Alignment, ColWidth, ([Tok], [Tok]))]
+parseAligns = try $ do
+  let maybeBar = skipMany
+        (try $ sp *> (() <$ symbol '|' <|> () <$ (symbol '@' >> braced)))
+  let cAlign = AlignCenter <$ symbol 'c'
+  let lAlign = AlignLeft <$ symbol 'l'
+  let rAlign = AlignRight <$ symbol 'r'
+  let parAlign = AlignLeft <$ symbol 'p'
+  -- aligns from tabularx
+  let xAlign = AlignLeft <$ symbol 'X'
+  let mAlign = AlignLeft <$ symbol 'm'
+  let bAlign = AlignLeft <$ symbol 'b'
+  let alignChar = splitWordTok *> (  cAlign <|> lAlign <|> rAlign <|> parAlign
+                                 <|> xAlign <|> mAlign <|> bAlign )
+  let alignPrefix = symbol '>' >> braced
+  let alignSuffix = symbol '<' >> braced
+  let colWidth = try $ do
+        symbol '{'
+        ds <- trim . untokenize <$> manyTill anyTok (controlSeq "linewidth")
+        spaces
+        symbol '}'
+        return $ safeRead ds
+  let alignSpec = do
+        pref <- option [] alignPrefix
+        spaces
+        al <- alignChar
+        width <- colWidth <|> option Nothing (do s <- untokenize <$> braced
+                                                 pos <- getPosition
+                                                 report $ SkippedContent s pos
+                                                 return Nothing)
+        spaces
+        suff <- option [] alignSuffix
+        return (al, width, (pref, suff))
+  let starAlign = do -- '*{2}{r}' == 'rr', we just expand like a macro
+        symbol '*'
+        spaces
+        ds <- trim . untokenize <$> braced
+        spaces
+        spec <- braced
+        case safeRead ds of
+             Just n  ->
+               getInput >>= setInput . (mconcat (replicate n spec) ++)
+             Nothing -> Prelude.fail $ "Could not parse " <> T.unpack ds <> " as number"
+  bgroup
+  spaces
+  maybeBar
+  aligns' <- many $ try $ spaces >> optional starAlign >>
+                            (alignSpec <* maybeBar)
+  spaces
+  egroup
+  spaces
+  return $ map toSpec aligns'
+  where
+    toColWidth (Just w) | w > 0 = ColWidth w
+    toColWidth _                = ColWidthDefault
+    toSpec (x, y, z) = (x, toColWidth y, z)
+
+-- N.B. this parser returns a Row that may have erroneous empty cells
+-- in it. See the note above fixTableHead for details.
+parseTableRow :: PandocMonad m
+              => LP m Blocks -- ^ block parser
+              -> LP m Inlines -- ^ inline parser
+              -> Text   -- ^ table environment name
+              -> [([Tok], [Tok])] -- ^ pref/suffixes
+              -> LP m Row
+parseTableRow blocks inline envname prefsufs = do
+  notFollowedBy (spaces *> end_ envname)
+  -- contexts that can contain & that is not colsep:
+  let canContainAmp (Tok _ (CtrlSeq "begin") _) = True
+      canContainAmp (Tok _ (CtrlSeq "verb") _)  = True
+      canContainAmp (Tok _ (CtrlSeq "Verb") _)  = True
+      canContainAmp _       = False
+  -- add prefixes and suffixes in token stream:
+  let celltoks (pref, suff) = do
+        prefpos <- getPosition
+        contents <- mconcat <$>
+            many ( snd <$> withRaw
+                     ((lookAhead (controlSeq "parbox") >>
+                       void blocks) -- #5711
+                      <|>
+                      (lookAhead (satisfyTok canContainAmp) >> void inline)
+                      <|>
+                      (lookAhead (symbol '$') >> void inline))
+                  <|>
+                   (do notFollowedBy
+                         (() <$ amp <|> () <$ lbreak <|> end_ envname)
+                       count 1 anyTok) )
+
+        suffpos <- getPosition
+        option [] (count 1 amp)
+        return $ map (setpos prefpos) pref ++ contents ++ map (setpos suffpos) suff
+  rawcells <- mapM celltoks prefsufs
+  cells <- mapM (parseFromToks (parseTableCell blocks)) rawcells
+  spaces
+  return $ Row nullAttr cells
+
+parseTableCell :: PandocMonad m => LP m Blocks -> LP m Cell
+parseTableCell blocks = do
+  spaces
+  updateState $ \st -> st{ sInTableCell = True }
+  cell' <-   multicolumnCell blocks
+         <|> multirowCell blocks
+         <|> parseSimpleCell
+         <|> parseEmptyCell
+  updateState $ \st -> st{ sInTableCell = False }
+  spaces
+  return cell'
+  where
+    -- The parsing of empty cells is important in LaTeX, especially when dealing
+    -- with multirow/multicolumn. See #6603.
+    parseEmptyCell = spaces $> emptyCell
+    parseSimpleCell = simpleCell <$> (plainify <$> blocks)
+
+
+cellAlignment :: PandocMonad m => LP m Alignment
+cellAlignment = skipMany (symbol '|') *> alignment <* skipMany (symbol '|')
+  where
+    alignment = do
+      c <- untoken <$> singleChar
+      return $ case c of
+        "l" -> AlignLeft
+        "r" -> AlignRight
+        "c" -> AlignCenter
+        "*" -> AlignDefault
+        _   -> AlignDefault
+
+plainify :: Blocks -> Blocks
+plainify bs = case toList bs of
+                [Para ils] -> plain (fromList ils)
+                _          -> bs
+
+multirowCell :: PandocMonad m => LP m Blocks -> LP m Cell
+multirowCell blocks = controlSeq "multirow" >> do
+  -- Full prototype for \multirow macro is:
+  --     \multirow[vpos]{nrows}[bigstruts]{width}[vmove]{text}
+  -- However, everything except `nrows` and `text` make
+  -- sense in the context of the Pandoc AST
+  _ <- optional $ symbol '[' *> cellAlignment <* symbol ']'   -- vertical position
+  nrows <- fmap (fromMaybe 1 . safeRead . untokenize) braced
+  _ <- optional $ symbol '[' *> manyTill anyTok (symbol ']')  -- bigstrut-related
+  _ <- symbol '{' *> manyTill anyTok (symbol '}')             -- Cell width
+  _ <- optional $ symbol '[' *> manyTill anyTok (symbol ']')  -- Length used for fine-tuning
+  content <- symbol '{' *> (plainify <$> blocks) <* symbol '}'
+  return $ cell AlignDefault (RowSpan nrows) (ColSpan 1) content
+
+multicolumnCell :: PandocMonad m => LP m Blocks -> LP m Cell
+multicolumnCell blocks = controlSeq "multicolumn" >> do
+  span' <- fmap (fromMaybe 1 . safeRead . untokenize) braced
+  alignment <- symbol '{' *> cellAlignment <* symbol '}'
+
+  let singleCell = do
+        content <- plainify <$> blocks
+        return $ cell alignment (RowSpan 1) (ColSpan span') content
+
+  -- Two possible contents: either a \multirow cell, or content.
+  -- E.g. \multicol{1}{c}{\multirow{2}{1em}{content}}
+  -- Note that a \multirow cell can be nested in a \multicolumn,
+  -- but not the other way around. See #6603
+  let nestedCell = do
+        (Cell _ _ (RowSpan rs) _ bs) <- multirowCell blocks
+        return $ cell
+                  alignment
+                  (RowSpan rs)
+                  (ColSpan span')
+                  (fromList bs)
+
+  symbol '{' *> (nestedCell <|> singleCell) <* symbol '}'
+
+-- LaTeX tables are stored with empty cells underneath multirow cells
+-- denoting the grid spaces taken up by them. More specifically, if a
+-- cell spans m rows, then it will overwrite all the cells in the
+-- columns it spans for (m-1) rows underneath it, requiring padding
+-- cells in these places. These padding cells need to be removed for
+-- proper table reading. See #6603.
+--
+-- These fixTable functions do not otherwise fix up malformed
+-- input tables: that is left to the table builder.
+fixTableHead :: TableHead -> TableHead
+fixTableHead (TableHead attr rows) = TableHead attr rows'
+  where
+    rows' = fixTableRows rows
+
+fixTableBody :: TableBody -> TableBody
+fixTableBody (TableBody attr rhc th tb)
+  = TableBody attr rhc th' tb'
+  where
+    th' = fixTableRows th
+    tb' = fixTableRows tb
+
+fixTableRows :: [Row] -> [Row]
+fixTableRows = fixTableRows' $ repeat Nothing
+  where
+    fixTableRows' oldHang (Row attr cells : rs)
+      = let (newHang, cells') = fixTableRow oldHang cells
+            rs'               = fixTableRows' newHang rs
+        in Row attr cells' : rs'
+    fixTableRows' _ [] = []
+
+-- The overhang is represented as Just (relative cell dimensions) or
+-- Nothing for an empty grid space.
+fixTableRow :: [Maybe (ColSpan, RowSpan)] -> [Cell] -> ([Maybe (ColSpan, RowSpan)], [Cell])
+fixTableRow oldHang cells
+  -- If there's overhang, drop cells until their total width meets the
+  -- width of the occupied grid spaces (or we run out)
+  | (n, prefHang, restHang) <- splitHang oldHang
+  , n > 0
+  = let cells' = dropToWidth getCellW n cells
+        (restHang', cells'') = fixTableRow restHang cells'
+    in (prefHang restHang', cells'')
+  -- Otherwise record the overhang of a pending cell and fix the rest
+  -- of the row
+  | c@(Cell _ _ h w _):cells' <- cells
+  = let h' = max 1 h
+        w' = max 1 w
+        oldHang' = dropToWidth getHangW w' oldHang
+        (newHang, cells'') = fixTableRow oldHang' cells'
+    in (toHang w' h' <> newHang, c : cells'')
+  | otherwise
+  = (oldHang, [])
+  where
+    getCellW (Cell _ _ _ w _) = w
+    getHangW = maybe 1 fst
+    getCS (ColSpan n) = n
+
+    toHang c r
+      | r > 1     = [Just (c, r)]
+      | otherwise = replicate (getCS c) Nothing
+
+    -- Take the prefix of the overhang list representing filled grid
+    -- spaces. Also return the remainder and the length of this prefix.
+    splitHang = splitHang' 0 id
+
+    splitHang' !n l (Just (c, r):xs)
+      = splitHang' (n + c) (l . (toHang c (r-1) ++)) xs
+    splitHang' n l xs = (n, l, xs)
+
+    -- Drop list items until the total width of the dropped items
+    -- exceeds the passed width.
+    dropToWidth _     n l | n < 1 = l
+    dropToWidth wproj n (c:cs)    = dropToWidth wproj (n - wproj c) cs
+    dropToWidth _     _ []        = []
+
+simpTable :: PandocMonad m
+          => LP m Blocks
+          -> LP m Inlines
+          -> Text
+          -> Bool
+          -> LP m Blocks
+simpTable blocks inline envname hasWidthParameter = try $ do
+  when hasWidthParameter $ () <$ tokWith inline
+  skipopts
+  colspecs <- parseAligns
+  let (aligns, widths, prefsufs) = unzip3 colspecs
+  optional $ controlSeq "caption" *> setCaption inline
+  spaces
+  optional label
+  spaces
+  optional lbreak
+  spaces
+  skipMany hline
+  spaces
+  header' <- option [] . try . fmap (:[]) $
+             parseTableRow blocks inline envname prefsufs <*
+               lbreak <* many1 hline
+  spaces
+  rows <- sepEndBy (parseTableRow blocks inline envname prefsufs)
+                    (lbreak <* optional (skipMany hline))
+  spaces
+  optional $ controlSeq "caption" *> setCaption inline
+  spaces
+  optional label
+  spaces
+  optional lbreak
+  spaces
+  lookAhead $ controlSeq "end" -- make sure we're at end
+  let th  = fixTableHead $ TableHead nullAttr header'
+  let tbs = [fixTableBody $ TableBody nullAttr 0 [] rows]
+  let tf  = TableFoot nullAttr []
+  return $ table emptyCaption (zip aligns widths) th tbs tf
+
+addTableCaption :: PandocMonad m => Blocks -> LP m Blocks
+addTableCaption = walkM go
+  where go (Table attr c spec th tb tf) = do
+          st <- getState
+          let mblabel = sLastLabel st
+          capt <- case (sCaption st, mblabel) of
+                   (Just ils, Nothing)  -> return $ caption Nothing (plain ils)
+                   (Just ils, Just lab) -> do
+                     num <- getNextNumber sLastTableNum
+                     setState
+                       st{ sLastTableNum = num
+                         , sLabels = M.insert lab
+                                    [Str (renderDottedNum num)]
+                                    (sLabels st) }
+                     return $ caption Nothing (plain ils) -- add number??
+                   (Nothing, _)  -> return c
+          let attr' = case (attr, mblabel) of
+                        ((_,classes,kvs), Just ident) ->
+                           (ident,classes,kvs)
+                        _ -> attr
+          return $ addAttrDiv attr' $ Table nullAttr capt spec th tb tf
+        go x = return x
+
+-- TODO: For now we add a Div to contain table attributes, since
+-- most writers don't do anything yet with attributes on Table.
+-- This can be removed when that changes.
+addAttrDiv :: Attr -> Block -> Block
+addAttrDiv ("",[],[]) b = b
+addAttrDiv attr b       = Div attr [b]
diff --git a/src/Text/Pandoc/Readers/LaTeX/Types.hs b/src/Text/Pandoc/Readers/LaTeX/Types.hs
index a017a2afb..c20b72bc5 100644
--- a/src/Text/Pandoc/Readers/LaTeX/Types.hs
+++ b/src/Text/Pandoc/Readers/LaTeX/Types.hs
@@ -1,6 +1,7 @@
+{-# LANGUAGE FlexibleInstances #-}
 {- |
    Module      : Text.Pandoc.Readers.LaTeX.Types
-   Copyright   : Copyright (C) 2017-2020 John MacFarlane
+   Copyright   : Copyright (C) 2017-2021 John MacFarlane
    License     : GNU GPL, version 2 or above
 
    Maintainer  : John MacFarlane <jgm@berkeley.edu>
@@ -18,7 +19,9 @@ module Text.Pandoc.Readers.LaTeX.Types ( Tok(..)
                                        )
 where
 import Data.Text (Text)
-import Text.Parsec.Pos (SourcePos)
+import Text.Parsec.Pos (SourcePos, sourceName)
+import Text.Pandoc.Sources
+import Data.List (groupBy)
 
 data TokType = CtrlSeq Text | Spaces | Newline | Symbol | Word | Comment |
                Esc1    | Esc2   | Arg Int
@@ -27,6 +30,16 @@ data TokType = CtrlSeq Text | Spaces | Newline | Symbol | Word | Comment |
 data Tok = Tok SourcePos TokType Text
      deriving (Eq, Ord, Show)
 
+instance ToSources [Tok] where
+  toSources = Sources
+    . map (\ts -> case ts of
+                    Tok p _ _ : _ -> (p, mconcat $ map tokToText ts)
+                    _ -> error "toSources [Tok] encountered empty group")
+    . groupBy (\(Tok p1 _ _) (Tok p2 _ _) -> sourceName p1 == sourceName p2)
+
+tokToText :: Tok -> Text
+tokToText (Tok _ _ t) = t
+
 data ExpansionPoint = ExpandWhenDefined | ExpandWhenUsed
      deriving (Eq, Ord, Show)
author	Igor Pashev <pashev.igor@gmail.com>	2021-07-17 18:10:34 +0200
committer	Igor Pashev <pashev.igor@gmail.com>	2021-07-17 18:46:16 +0200
commit	48459559a13a20083fc9b31eb523b8ea2bf0a63f (patch)
tree	1c04e75709457403110a6f8c5c90099f22369de3 /src/Text/Pandoc/Readers/LaTeX
parent	0c39509d9b6a58958228cebf5d643598e5c98950 (diff)
parent	46099e79defe662e541b12548200caf29063c1c6 (diff)
download	pandoc-48459559a13a20083fc9b31eb523b8ea2bf0a63f.tar.gz