Change reader types, allowing better tracking of source positions.

Previously, when multiple file arguments were provided, pandoc simply concatenated them and passed the contents to the readers, which took a Text argument. As a result, the readers had no way of knowing which file was the source of any particular bit of text. This meant that we couldn't report accurate source positions on errors or include accurate source positions as attributes in the AST. More seriously, it meant that we couldn't resolve resource paths relative to the files containing them (see e.g. #5501, #6632, #6384, #3752). Add Text.Pandoc.Sources (exported module), with a `Sources` type and a `ToSources` class. A `Sources` wraps a list of `(SourcePos, Text)` pairs. [API change] A parsec `Stream` instance is provided for `Sources`. The module also exports versions of parsec's `satisfy` and other Char parsers that track source positions accurately from a `Sources` stream (or any instance of the new `UpdateSourcePos` class). Text.Pandoc.Parsing now exports these modified Char parsers instead of the ones parsec provides. Modified parsers to use a `Sources` as stream [API change]. The readers that previously took a `Text` argument have been modified to take any instance of `ToSources`. So, they may still be used with a `Text`, but they can also be used with a `Sources` object. In Text.Pandoc.Error, modified the constructor PandocParsecError to take a `Sources` rather than a `Text` as first argument, so parse error locations can be accurately reported. T.P.Error: showPos, do not print "-" as source name.
author: John MacFarlane <jgm@berkeley.edu> 2021-05-01 13:17:45 -0700
committer: John MacFarlane <jgm@berkeley.edu> 2021-05-09 19:11:34 -0600
commit: 6e45607f9948f45b2e94f54b4825b667ca0d5441 (patch)
tree: c4cbc0f6d398c4c4cf28fda23665fe19d26602b3 /src/Text/Pandoc/Readers/LaTeX
parent: 295d93e96b1853c2ff4658aa7206ea1329024fab (diff)
download: pandoc-6e45607f9948f45b2e94f54b4825b667ca0d5441.tar.gz
3 files changed, 24 insertions, 5 deletions
diff --git a/src/Text/Pandoc/Readers/LaTeX/Citation.hs b/src/Text/Pandoc/Readers/LaTeX/Citation.hs
index 655823dab..af97125c6 100644
--- a/src/Text/Pandoc/Readers/LaTeX/Citation.hs
+++ b/src/Text/Pandoc/Readers/LaTeX/Citation.hs
@@ -120,7 +120,7 @@ simpleCiteArgs inline = try $ do
       runParserT (mconcat <$> many inline) st "bracketed option" toks
     case parsed of
       Right result -> return result
-      Left e       -> throwError $ PandocParsecError (untokenize toks) e
+      Left e       -> throwError $ PandocParsecError (toSources toks) e
 
 
 
diff --git a/src/Text/Pandoc/Readers/LaTeX/Parsing.hs b/src/Text/Pandoc/Readers/LaTeX/Parsing.hs
index db58b333d..35ce3509d 100644
--- a/src/Text/Pandoc/Readers/LaTeX/Parsing.hs
+++ b/src/Text/Pandoc/Readers/LaTeX/Parsing.hs
@@ -27,6 +27,7 @@ module Text.Pandoc.Readers.LaTeX.Parsing
   , rawLaTeXParser
   , applyMacros
   , tokenize
+  , tokenizeSources
   , untokenize
   , untoken
   , totoks
@@ -248,7 +249,7 @@ withVerbatimMode parser = do
 
 rawLaTeXParser :: (PandocMonad m, HasMacros s, HasReaderOptions s)
                => [Tok] -> Bool -> LP m a -> LP m a
-               -> ParserT Text s m (a, Text)
+               -> ParserT Sources s m (a, Text)
 rawLaTeXParser toks retokenize parser valParser = do
   pstate <- getState
   let lstate = def{ sOptions = extractReaderOptions pstate }
@@ -268,7 +269,7 @@ rawLaTeXParser toks retokenize parser valParser = do
               Left _    -> mzero
               Right ((val, raw), st) -> do
                 updateState (updateMacros (sMacros st <>))
-                _ <- takeP (T.length (untokenize toks'))
+                void $ count (T.length (untokenize toks')) anyChar
                 let result = untokenize raw
                 -- ensure we end with space if input did, see #4442
                 let result' =
@@ -281,7 +282,7 @@ rawLaTeXParser toks retokenize parser valParser = do
                 return (val, result')
 
 applyMacros :: (PandocMonad m, HasMacros s, HasReaderOptions s)
-            => Text -> ParserT Text s m Text
+            => Text -> ParserT Sources s m Text
 applyMacros s = (guardDisabled Ext_latex_macros >> return s) <|>
    do let retokenize = untokenize <$> many (satisfyTok (const True))
       pstate <- getState
@@ -301,6 +302,11 @@ QuickCheck property:
 >   let t = T.pack s in untokenize (tokenize "random" t) == t
 -}
 
+tokenizeSources :: Sources -> [Tok]
+tokenizeSources = concatMap tokenizeSource . unSources
+ where
+   tokenizeSource (pos, t) = totoks pos t
+
 tokenize :: SourceName -> Text -> [Tok]
 tokenize sourcename = totoks (initialPos sourcename)
 
diff --git a/src/Text/Pandoc/Readers/LaTeX/Types.hs b/src/Text/Pandoc/Readers/LaTeX/Types.hs
index f8c214318..c20b72bc5 100644
--- a/src/Text/Pandoc/Readers/LaTeX/Types.hs
+++ b/src/Text/Pandoc/Readers/LaTeX/Types.hs
@@ -1,3 +1,4 @@
+{-# LANGUAGE FlexibleInstances #-}
 {- |
    Module      : Text.Pandoc.Readers.LaTeX.Types
    Copyright   : Copyright (C) 2017-2021 John MacFarlane
@@ -18,7 +19,9 @@ module Text.Pandoc.Readers.LaTeX.Types ( Tok(..)
                                        )
 where
 import Data.Text (Text)
-import Text.Parsec.Pos (SourcePos)
+import Text.Parsec.Pos (SourcePos, sourceName)
+import Text.Pandoc.Sources
+import Data.List (groupBy)
 
 data TokType = CtrlSeq Text | Spaces | Newline | Symbol | Word | Comment |
                Esc1    | Esc2   | Arg Int
@@ -27,6 +30,16 @@ data TokType = CtrlSeq Text | Spaces | Newline | Symbol | Word | Comment |
 data Tok = Tok SourcePos TokType Text
      deriving (Eq, Ord, Show)
 
+instance ToSources [Tok] where
+  toSources = Sources
+    . map (\ts -> case ts of
+                    Tok p _ _ : _ -> (p, mconcat $ map tokToText ts)
+                    _ -> error "toSources [Tok] encountered empty group")
+    . groupBy (\(Tok p1 _ _) (Tok p2 _ _) -> sourceName p1 == sourceName p2)
+
+tokToText :: Tok -> Text
+tokToText (Tok _ _ t) = t
+
 data ExpansionPoint = ExpandWhenDefined | ExpandWhenUsed
      deriving (Eq, Ord, Show)
author	John MacFarlane <jgm@berkeley.edu>	2021-05-01 13:17:45 -0700
committer	John MacFarlane <jgm@berkeley.edu>	2021-05-09 19:11:34 -0600
commit	6e45607f9948f45b2e94f54b4825b667ca0d5441 (patch)
tree	c4cbc0f6d398c4c4cf28fda23665fe19d26602b3 /src/Text/Pandoc/Readers/LaTeX
parent	295d93e96b1853c2ff4658aa7206ea1329024fab (diff)
download	pandoc-6e45607f9948f45b2e94f54b4825b667ca0d5441.tar.gz