diff options
author | John MacFarlane <jgm@berkeley.edu> | 2017-10-16 22:03:57 -0700 |
---|---|---|
committer | John MacFarlane <jgm@berkeley.edu> | 2017-10-16 22:05:34 -0700 |
commit | c40857b38905b9ea298a777354eb4cb0da2213c6 (patch) | |
tree | cb5c453d2fcb5cfada4fa7c8aeb94205080afc74 /src/Text/Pandoc/Readers | |
parent | 9cf9a64923c672fafd1458bda6f643ada83e2b1e (diff) | |
download | pandoc-c40857b38905b9ea298a777354eb4cb0da2213c6.tar.gz |
Improved handling of include files in LaTeX reader.
Previously `\include` wouldn't work if the included file
contained, e.g., a begin without a matching end.
We've changed the Tok type so that it stores a full SourcePos,
rather than just a line and column. So tokens keeep track
of the file they came from. This allows us to use a simpler
method for includes, which doesn't require parsing the included
document as a whole.
Closes #3971.
Diffstat (limited to 'src/Text/Pandoc/Readers')
-rw-r--r-- | src/Text/Pandoc/Readers/LaTeX.hs | 134 | ||||
-rw-r--r-- | src/Text/Pandoc/Readers/LaTeX/Types.hs | 8 |
2 files changed, 81 insertions, 61 deletions
diff --git a/src/Text/Pandoc/Readers/LaTeX.hs b/src/Text/Pandoc/Readers/LaTeX.hs index 57db33871..8c6c7d0ff 100644 --- a/src/Text/Pandoc/Readers/LaTeX.hs +++ b/src/Text/Pandoc/Readers/LaTeX.hs @@ -71,7 +71,9 @@ import Text.Pandoc.Shared import Text.Pandoc.Readers.LaTeX.Types (Macro(..), ExpansionPoint(..), Tok(..), TokType(..)) import Text.Pandoc.Walk -import Text.Pandoc.Error (PandocError(PandocParsecError, PandocMacroLoop)) +import Text.Pandoc.Error + (PandocError(PandocParsecError, PandocParseError, PandocMacroLoop)) +import Text.Parsec.Pos -- for debugging: -- import Text.Pandoc.Extensions (getDefaultExtensions) @@ -85,7 +87,7 @@ readLaTeX :: PandocMonad m -> m Pandoc readLaTeX opts ltx = do parsed <- runParserT parseLaTeX def{ sOptions = opts } "source" - (tokenize (crFilter ltx)) + (tokenize "source" (crFilter ltx)) case parsed of Right result -> return result Left e -> throwError $ PandocParsecError (T.unpack ltx) e @@ -127,7 +129,7 @@ resolveRefs _ x = x -- res <- runIOorExplode (runParserT p defaultLaTeXState{ -- sOptions = def{ readerExtensions = -- enableExtension Ext_raw_tex $ --- getDefaultExtensions "latex" }} "source" (tokenize t)) +-- getDefaultExtensions "latex" }} "source" (tokenize "source" t)) -- case res of -- Left e -> error (show e) -- Right r -> return r @@ -238,7 +240,7 @@ rawLaTeXParser :: (PandocMonad m, HasMacros s, HasReaderOptions s) => LP m a -> ParserT String s m String rawLaTeXParser parser = do inp <- getInput - let toks = tokenize $ T.pack inp + let toks = tokenize "source" $ T.pack inp pstate <- getState let lstate = def{ sOptions = extractReaderOptions pstate } res <- lift $ runParserT ((,) <$> try (snd <$> withRaw parser) <*> getState) @@ -257,7 +259,7 @@ applyMacros s = (guardDisabled Ext_latex_macros >> return s) <|> pstate <- getState let lstate = def{ sOptions = extractReaderOptions pstate , sMacros = extractMacros pstate } - res <- runParserT retokenize lstate "math" (tokenize (T.pack s)) + res <- runParserT retokenize lstate "math" (tokenize "math" (T.pack s)) case res of Left e -> fail (show e) Right s' -> return s' @@ -278,7 +280,7 @@ inlineCommand :: PandocMonad m => ParserT String ParserState m Inlines inlineCommand = do lookAhead (try (char '\\' >> letter) <|> char '$') inp <- getInput - let toks = tokenize $ T.pack inp + let toks = tokenize "chunk" $ T.pack inp let rawinline = do (il, raw) <- try $ withRaw (inlineEnvironment <|> inlineCommand') st <- getState @@ -294,32 +296,33 @@ inlineCommand = do takeP (T.length (untokenize raw)) return il -tokenize :: Text -> [Tok] -tokenize = totoks (1, 1) +tokenize :: SourceName -> Text -> [Tok] +tokenize sourcename = totoks (initialPos sourcename) -totoks :: (Line, Column) -> Text -> [Tok] -totoks (lin,col) t = +totoks :: SourcePos -> Text -> [Tok] +totoks pos t = case T.uncons t of Nothing -> [] Just (c, rest) | c == '\n' -> - Tok (lin, col) Newline "\n" - : totoks (lin + 1,1) rest + Tok pos Newline "\n" + : totoks (setSourceColumn (incSourceLine pos 1) 1) rest | isSpaceOrTab c -> let (sps, rest') = T.span isSpaceOrTab t - in Tok (lin, col) Spaces sps - : totoks (lin, col + T.length sps) rest' + in Tok pos Spaces sps + : totoks (incSourceColumn pos (T.length sps)) + rest' | isAlphaNum c -> let (ws, rest') = T.span isAlphaNum t - in Tok (lin, col) Word ws - : totoks (lin, col + T.length ws) rest' + in Tok pos Word ws + : totoks (incSourceColumn pos (T.length ws)) rest' | c == '%' -> let (cs, rest') = T.break (== '\n') rest - in Tok (lin, col) Comment ("%" <> cs) - : totoks (lin, col + 1 + T.length cs) rest' + in Tok pos Comment ("%" <> cs) + : totoks (incSourceColumn pos (1 + T.length cs)) rest' | c == '\\' -> case T.uncons rest of - Nothing -> [Tok (lin, col) Symbol (T.singleton c)] + Nothing -> [Tok pos Symbol (T.singleton c)] Just (d, rest') | isLetterOrAt d -> -- \makeatletter is common in macro defs; @@ -328,24 +331,24 @@ totoks (lin,col) t = -- probably best for now let (ws, rest'') = T.span isLetterOrAt rest (ss, rest''') = T.span isSpaceOrTab rest'' - in Tok (lin, col) (CtrlSeq ws) ("\\" <> ws <> ss) - : totoks (lin, - col + 1 + T.length ws + T.length ss) rest''' + in Tok pos (CtrlSeq ws) ("\\" <> ws <> ss) + : totoks (incSourceColumn pos + (1 + T.length ws + T.length ss)) rest''' | d == '\t' || d == '\n' -> - Tok (lin, col) Symbol ("\\") - : totoks (lin, col + 1) rest + Tok pos Symbol ("\\") + : totoks (incSourceColumn pos 1) rest | otherwise -> - Tok (lin, col) (CtrlSeq (T.singleton d)) (T.pack [c,d]) - : totoks (lin, col + 2) rest' + Tok pos (CtrlSeq (T.singleton d)) (T.pack [c,d]) + : totoks (incSourceColumn pos 2) rest' | c == '#' -> let (t1, t2) = T.span (\d -> d >= '0' && d <= '9') rest in case safeRead (T.unpack t1) of Just i -> - Tok (lin, col) (Arg i) ("#" <> t1) - : totoks (lin, col + 1 + T.length t1) t2 + Tok pos (Arg i) ("#" <> t1) + : totoks (incSourceColumn pos (1 + T.length t1)) t2 Nothing -> - Tok (lin, col) Symbol ("#") - : totoks (lin, col + 1) t2 + Tok pos Symbol ("#") + : totoks (incSourceColumn pos 1) t2 | c == '^' -> case T.uncons rest of Just ('^', rest') -> @@ -354,20 +357,20 @@ totoks (lin,col) t = | isLowerHex d -> case T.uncons rest'' of Just (e, rest''') | isLowerHex e -> - Tok (lin, col) Esc2 (T.pack ['^','^',d,e]) - : totoks (lin, col + 4) rest''' + Tok pos Esc2 (T.pack ['^','^',d,e]) + : totoks (incSourceColumn pos 4) rest''' _ -> - Tok (lin, col) Esc1 (T.pack ['^','^',d]) - : totoks (lin, col + 3) rest'' + Tok pos Esc1 (T.pack ['^','^',d]) + : totoks (incSourceColumn pos 3) rest'' | d < '\128' -> - Tok (lin, col) Esc1 (T.pack ['^','^',d]) - : totoks (lin, col + 3) rest'' - _ -> [Tok (lin, col) Symbol ("^"), - Tok (lin, col + 1) Symbol ("^")] - _ -> Tok (lin, col) Symbol ("^") - : totoks (lin, col + 1) rest + Tok pos Esc1 (T.pack ['^','^',d]) + : totoks (incSourceColumn pos 3) rest'' + _ -> [Tok pos Symbol ("^"), + Tok (incSourceColumn pos 1) Symbol ("^")] + _ -> Tok pos Symbol ("^") + : totoks (incSourceColumn pos 1) rest | otherwise -> - Tok (lin, col) Symbol (T.singleton c) : totoks (lin, col + 1) rest + Tok pos Symbol (T.singleton c) : totoks (incSourceColumn pos 1) rest where isSpaceOrTab ' ' = True isSpaceOrTab '\t' = True @@ -393,8 +396,7 @@ satisfyTok f = where matcher t | f t = Just t | otherwise = Nothing updatePos :: SourcePos -> Tok -> [Tok] -> SourcePos - updatePos spos _ (Tok (lin,col) _ _ : _) = - setSourceColumn (setSourceLine spos lin) col + updatePos _spos _ (Tok pos _ _ : _) = pos updatePos spos _ [] = spos doMacros :: PandocMonad m => Int -> LP m () @@ -437,7 +439,7 @@ doMacros n = do else doMacros (n + 1) ExpandWhenDefined -> return () -setpos :: (Line, Column) -> Tok -> Tok +setpos :: SourcePos -> Tok -> Tok setpos spos (Tok _ tt txt) = Tok spos tt txt anyControlSeq :: PandocMonad m => LP m Tok @@ -728,15 +730,15 @@ doverb = do verbTok :: PandocMonad m => Char -> LP m Tok verbTok stopchar = do - t@(Tok (lin, col) toktype txt) <- satisfyTok (not . isNewlineTok) + t@(Tok pos toktype txt) <- satisfyTok (not . isNewlineTok) case T.findIndex (== stopchar) txt of Nothing -> return t Just i -> do let (t1, t2) = T.splitAt i txt inp <- getInput - setInput $ Tok (lin, col + i) Symbol (T.singleton stopchar) - : (totoks (lin, col + i + 1) (T.drop 1 t2)) ++ inp - return $ Tok (lin, col) toktype t1 + setInput $ Tok (incSourceColumn pos i) Symbol (T.singleton stopchar) + : (totoks (incSourceColumn pos (i + 1)) (T.drop 1 t2)) ++ inp + return $ Tok pos toktype t1 dolstinline :: PandocMonad m => LP m Inlines dolstinline = do @@ -1117,16 +1119,16 @@ tok = grouped inline <|> inlineCommand' <|> singleChar' singleChar :: PandocMonad m => LP m Tok singleChar = try $ do - Tok (lin,col) toktype t <- satisfyTok (tokTypeIn [Word, Symbol]) + Tok pos toktype t <- satisfyTok (tokTypeIn [Word, Symbol]) guard $ not $ toktype == Symbol && T.any (`Set.member` specialChars) t if T.length t > 1 then do let (t1, t2) = (T.take 1 t, T.drop 1 t) inp <- getInput - setInput $ (Tok (lin, col + 1) toktype t2) : inp - return $ Tok (lin,col) toktype t1 - else return $ Tok (lin,col) toktype t + setInput $ (Tok (incSourceColumn pos 1) toktype t2) : inp + return $ Tok pos toktype t1 + else return $ Tok pos toktype t opt :: PandocMonad m => LP m Inlines opt = bracketed inline @@ -1159,7 +1161,7 @@ withRaw :: PandocMonad m => LP m a -> LP m (a, [Tok]) withRaw parser = do inp <- getInput result <- parser - nxt <- option (Tok (0,0) Word "") (lookAhead anyTok) + nxt <- option (Tok (initialPos "source") Word "") (lookAhead anyTok) let raw = takeWhile (/= nxt) inp return (result, raw) @@ -1739,7 +1741,27 @@ include = do then map (maybeAddExtension ".sty") fs else map (maybeAddExtension ".tex") fs dirs <- (splitBy (==':') . fromMaybe ".") <$> lookupEnv "TEXINPUTS" - mconcat <$> mapM (insertIncludedFile blocks (tokenize . T.pack) dirs) fs' + mapM_ (insertIncluded dirs) fs' + return mempty + +insertIncluded :: PandocMonad m + => [FilePath] + -> FilePath + -> LP m () +insertIncluded dirs f = do + pos <- getPosition + containers <- getIncludeFiles <$> getState + when (f `elem` containers) $ do + throwError $ PandocParseError $ "Include file loop at " ++ show pos + updateState $ addIncludeFile f + mbcontents <- readFileFromDirs dirs f + contents <- case mbcontents of + Just s -> return s + Nothing -> do + report $ CouldNotLoadIncludeFile f pos + return "" + getInput >>= setInput . (tokenize f (T.pack contents) ++) + updateState dropLatestIncludeFile maybeAddExtension :: String -> FilePath -> FilePath maybeAddExtension ext fp = @@ -2394,9 +2416,7 @@ parseTableRow envname prefsufs = do >> anyTok) suffpos <- getPosition option [] (count 1 amp) - return $ map (setpos (sourceLine prefpos, sourceColumn prefpos)) pref - ++ contents ++ - map (setpos (sourceLine suffpos, sourceColumn suffpos)) suff + return $ map (setpos prefpos) pref ++ contents ++ map (setpos suffpos) suff rawcells <- sequence (map celltoks prefsufs) oldInput <- getInput cells <- sequence $ map (\ts -> setInput ts >> parseTableCell) rawcells diff --git a/src/Text/Pandoc/Readers/LaTeX/Types.hs b/src/Text/Pandoc/Readers/LaTeX/Types.hs index 2bef3cb1a..9e441714d 100644 --- a/src/Text/Pandoc/Readers/LaTeX/Types.hs +++ b/src/Text/Pandoc/Readers/LaTeX/Types.hs @@ -31,17 +31,17 @@ module Text.Pandoc.Readers.LaTeX.Types ( Tok(..) , TokType(..) , Macro(..) , ExpansionPoint(..) - , Line - , Column ) + , SourcePos + ) where import Data.Text (Text) -import Text.Parsec.Pos (Line, Column) +import Text.Parsec.Pos (SourcePos) data TokType = CtrlSeq Text | Spaces | Newline | Symbol | Word | Comment | Esc1 | Esc2 | Arg Int deriving (Eq, Ord, Show) -data Tok = Tok (Line, Column) TokType Text +data Tok = Tok SourcePos TokType Text deriving (Eq, Ord, Show) data ExpansionPoint = ExpandWhenDefined | ExpandWhenUsed |