diff options
author | John MacFarlane <jgm@berkeley.edu> | 2021-05-01 13:17:45 -0700 |
---|---|---|
committer | John MacFarlane <jgm@berkeley.edu> | 2021-05-09 19:11:34 -0600 |
commit | 6e45607f9948f45b2e94f54b4825b667ca0d5441 (patch) | |
tree | c4cbc0f6d398c4c4cf28fda23665fe19d26602b3 /src/Text/Pandoc/Readers/LaTeX | |
parent | 295d93e96b1853c2ff4658aa7206ea1329024fab (diff) | |
download | pandoc-6e45607f9948f45b2e94f54b4825b667ca0d5441.tar.gz |
Change reader types, allowing better tracking of source positions.
Previously, when multiple file arguments were provided, pandoc
simply concatenated them and passed the contents to the readers,
which took a Text argument.
As a result, the readers had no way of knowing which file
was the source of any particular bit of text. This meant that
we couldn't report accurate source positions on errors or
include accurate source positions as attributes in the AST.
More seriously, it meant that we couldn't resolve resource
paths relative to the files containing them
(see e.g. #5501, #6632, #6384, #3752).
Add Text.Pandoc.Sources (exported module), with a `Sources` type
and a `ToSources` class. A `Sources` wraps a list of `(SourcePos,
Text)` pairs. [API change] A parsec `Stream` instance is provided for
`Sources`. The module also exports versions of parsec's `satisfy` and
other Char parsers that track source positions accurately from a
`Sources` stream (or any instance of the new `UpdateSourcePos` class).
Text.Pandoc.Parsing now exports these modified Char parsers instead of
the ones parsec provides. Modified parsers to use a `Sources` as stream
[API change].
The readers that previously took a `Text` argument have been
modified to take any instance of `ToSources`. So, they may still
be used with a `Text`, but they can also be used with a `Sources`
object.
In Text.Pandoc.Error, modified the constructor PandocParsecError
to take a `Sources` rather than a `Text` as first argument,
so parse error locations can be accurately reported.
T.P.Error: showPos, do not print "-" as source name.
Diffstat (limited to 'src/Text/Pandoc/Readers/LaTeX')
-rw-r--r-- | src/Text/Pandoc/Readers/LaTeX/Citation.hs | 2 | ||||
-rw-r--r-- | src/Text/Pandoc/Readers/LaTeX/Parsing.hs | 12 | ||||
-rw-r--r-- | src/Text/Pandoc/Readers/LaTeX/Types.hs | 15 |
3 files changed, 24 insertions, 5 deletions
diff --git a/src/Text/Pandoc/Readers/LaTeX/Citation.hs b/src/Text/Pandoc/Readers/LaTeX/Citation.hs index 655823dab..af97125c6 100644 --- a/src/Text/Pandoc/Readers/LaTeX/Citation.hs +++ b/src/Text/Pandoc/Readers/LaTeX/Citation.hs @@ -120,7 +120,7 @@ simpleCiteArgs inline = try $ do runParserT (mconcat <$> many inline) st "bracketed option" toks case parsed of Right result -> return result - Left e -> throwError $ PandocParsecError (untokenize toks) e + Left e -> throwError $ PandocParsecError (toSources toks) e diff --git a/src/Text/Pandoc/Readers/LaTeX/Parsing.hs b/src/Text/Pandoc/Readers/LaTeX/Parsing.hs index db58b333d..35ce3509d 100644 --- a/src/Text/Pandoc/Readers/LaTeX/Parsing.hs +++ b/src/Text/Pandoc/Readers/LaTeX/Parsing.hs @@ -27,6 +27,7 @@ module Text.Pandoc.Readers.LaTeX.Parsing , rawLaTeXParser , applyMacros , tokenize + , tokenizeSources , untokenize , untoken , totoks @@ -248,7 +249,7 @@ withVerbatimMode parser = do rawLaTeXParser :: (PandocMonad m, HasMacros s, HasReaderOptions s) => [Tok] -> Bool -> LP m a -> LP m a - -> ParserT Text s m (a, Text) + -> ParserT Sources s m (a, Text) rawLaTeXParser toks retokenize parser valParser = do pstate <- getState let lstate = def{ sOptions = extractReaderOptions pstate } @@ -268,7 +269,7 @@ rawLaTeXParser toks retokenize parser valParser = do Left _ -> mzero Right ((val, raw), st) -> do updateState (updateMacros (sMacros st <>)) - _ <- takeP (T.length (untokenize toks')) + void $ count (T.length (untokenize toks')) anyChar let result = untokenize raw -- ensure we end with space if input did, see #4442 let result' = @@ -281,7 +282,7 @@ rawLaTeXParser toks retokenize parser valParser = do return (val, result') applyMacros :: (PandocMonad m, HasMacros s, HasReaderOptions s) - => Text -> ParserT Text s m Text + => Text -> ParserT Sources s m Text applyMacros s = (guardDisabled Ext_latex_macros >> return s) <|> do let retokenize = untokenize <$> many (satisfyTok (const True)) pstate <- getState @@ -301,6 +302,11 @@ QuickCheck property: > let t = T.pack s in untokenize (tokenize "random" t) == t -} +tokenizeSources :: Sources -> [Tok] +tokenizeSources = concatMap tokenizeSource . unSources + where + tokenizeSource (pos, t) = totoks pos t + tokenize :: SourceName -> Text -> [Tok] tokenize sourcename = totoks (initialPos sourcename) diff --git a/src/Text/Pandoc/Readers/LaTeX/Types.hs b/src/Text/Pandoc/Readers/LaTeX/Types.hs index f8c214318..c20b72bc5 100644 --- a/src/Text/Pandoc/Readers/LaTeX/Types.hs +++ b/src/Text/Pandoc/Readers/LaTeX/Types.hs @@ -1,3 +1,4 @@ +{-# LANGUAGE FlexibleInstances #-} {- | Module : Text.Pandoc.Readers.LaTeX.Types Copyright : Copyright (C) 2017-2021 John MacFarlane @@ -18,7 +19,9 @@ module Text.Pandoc.Readers.LaTeX.Types ( Tok(..) ) where import Data.Text (Text) -import Text.Parsec.Pos (SourcePos) +import Text.Parsec.Pos (SourcePos, sourceName) +import Text.Pandoc.Sources +import Data.List (groupBy) data TokType = CtrlSeq Text | Spaces | Newline | Symbol | Word | Comment | Esc1 | Esc2 | Arg Int @@ -27,6 +30,16 @@ data TokType = CtrlSeq Text | Spaces | Newline | Symbol | Word | Comment | data Tok = Tok SourcePos TokType Text deriving (Eq, Ord, Show) +instance ToSources [Tok] where + toSources = Sources + . map (\ts -> case ts of + Tok p _ _ : _ -> (p, mconcat $ map tokToText ts) + _ -> error "toSources [Tok] encountered empty group") + . groupBy (\(Tok p1 _ _) (Tok p2 _ _) -> sourceName p1 == sourceName p2) + +tokToText :: Tok -> Text +tokToText (Tok _ _ t) = t + data ExpansionPoint = ExpandWhenDefined | ExpandWhenUsed deriving (Eq, Ord, Show) |