aboutsummaryrefslogtreecommitdiff
path: root/src/Text/Pandoc
diff options
context:
space:
mode:
authorJohn MacFarlane <jgm@berkeley.edu>2021-08-22 17:47:47 -0700
committerJohn MacFarlane <jgm@berkeley.edu>2021-08-24 22:19:20 -0700
commitfd7c140cde7544c0236586e2aa22691c33c3265a (patch)
tree487bec54e50dd583a07ee1d85f80a12369f0998d /src/Text/Pandoc
parentc39ddeb8f86e9dec5bd6096685812452e3f2c65e (diff)
downloadpandoc-fd7c140cde7544c0236586e2aa22691c33c3265a.tar.gz
Reorganize App to make it easier to limit IO in main loop.
Previously we used liftIO fairly liberally. The code has been restructured to avoid this. A small behavior change is that pandoc will now fall back to latin1 encoding for inputs that can't be read as UTF-8. This is what it did previously for content fetched from the web and not marked as to content type. It makes sense to do the same for local files.
Diffstat (limited to 'src/Text/Pandoc')
-rw-r--r--src/Text/Pandoc/App.hs185
1 files changed, 100 insertions, 85 deletions
diff --git a/src/Text/Pandoc/App.hs b/src/Text/Pandoc/App.hs
index dd58b1b39..66d659ba1 100644
--- a/src/Text/Pandoc/App.hs
+++ b/src/Text/Pandoc/App.hs
@@ -1,3 +1,4 @@
+{-# LANGUAGE TupleSections #-}
{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE CPP #-}
@@ -27,7 +28,7 @@ module Text.Pandoc.App (
import qualified Control.Exception as E
import Control.Monad ( (>=>), when )
import Control.Monad.Trans ( MonadIO(..) )
-import Control.Monad.Except (throwError)
+import Control.Monad.Except (throwError, catchError)
import qualified Data.ByteString as BS
import qualified Data.ByteString.Char8 as B8
import qualified Data.ByteString.Lazy as BL
@@ -38,6 +39,7 @@ import Data.Text (Text)
import qualified Data.Text as T
import qualified Data.Text.Lazy as TL
import qualified Data.Text.Lazy.Encoding as TE
+import qualified Data.Text.Encoding as TSE
import qualified Data.Text.Encoding.Error as TE
import qualified Data.Text.Encoding.Error as TSE
import Network.URI (URI (..), parseURI)
@@ -48,7 +50,7 @@ import System.IO (nativeNewline, stdout)
import qualified System.IO as IO (Newline (..))
import Text.Pandoc
import Text.Pandoc.Builder (setMeta)
-import Text.Pandoc.MIME (getCharset)
+import Text.Pandoc.MIME (getCharset, MimeType)
import Text.Pandoc.App.FormatHeuristics (formatFromFilePaths)
import Text.Pandoc.App.Opt (Opt (..), LineEnding (..), defaultOpts,
IpynbOutput (..))
@@ -69,6 +71,7 @@ import qualified Text.Pandoc.UTF8 as UTF8
import System.Posix.IO (stdOutput)
import System.Posix.Terminal (queryTerminal)
#endif
+import Debug.Trace
convertWithOpts :: Opt -> IO ()
convertWithOpts opts = do
@@ -94,14 +97,14 @@ convertWithOpts opts = do
let sources = case optInputFiles opts of
Just xs | not (optIgnoreArgs opts) -> xs
_ -> ["-"]
-
#ifdef _WINDOWS
let istty = True
#else
istty <- liftIO $ queryTerminal stdOutput
#endif
- (output, reports) <- runIOorExplode $ do
+ res <- runIO $ do
+
setTrace (optTrace opts)
setVerbosity verbosity
setUserDataDir datadir
@@ -110,6 +113,8 @@ convertWithOpts opts = do
setInputFiles (fromMaybe ["-"] (optInputFiles opts))
setOutputFile (optOutputFile opts)
+ inputs <- readSources sources
+
-- assign reader and writer based on options and filenames
readerName <- case optFrom opts of
Just f -> return f
@@ -135,20 +140,6 @@ convertWithOpts opts = do
(reader, readerExts) <- getReader readerName
- let convertTabs = tabFilter (if optPreserveTabs opts ||
- readerNameBase == "t2t" ||
- readerNameBase == "man"
- then 0
- else optTabStop opts)
-
-
- let readSources :: [FilePath] -> PandocIO [(FilePath, Text)]
- readSources srcs =
- mapM (\fp -> do
- t <- readSource fp
- return (if fp == "-" then "" else fp, convertTabs t)) srcs
-
-
outputSettings <- optToOutputSettings opts
let format = outputFormat outputSettings
let writer = outputWriter outputSettings
@@ -236,20 +227,11 @@ convertWithOpts opts = do
_ -> Format format) :))
$ []
- let sourceToDoc :: [FilePath] -> PandocIO Pandoc
- sourceToDoc sources' =
- case reader of
- TextReader r
- | readerNameBase == "json" ->
- mconcat <$> mapM (readSource >=> r readerOpts) sources'
- | optFileScope opts ->
- -- Read source and convert tabs (see #6709)
- let readSource' = fmap convertTabs . readSource
- in mconcat <$> mapM (readSource' >=> r readerOpts) sources'
- | otherwise ->
- readSources sources' >>= r readerOpts
- ByteStringReader r ->
- mconcat <$> mapM (readFile' >=> r readerOpts) sources'
+ let convertTabs = tabFilter (if optPreserveTabs opts ||
+ readerNameBase == "t2t" ||
+ readerNameBase == "man"
+ then 0
+ else optTabStop opts)
when (readerNameBase == "markdown_github" ||
@@ -275,7 +257,22 @@ convertWithOpts opts = do
maybe id (setMeta "citation-abbreviations")
(optCitationAbbreviations opts) $ mempty
- doc <- sourceToDoc sources >>=
+ doc <- (case reader of
+ TextReader r
+ | readerNameBase == "json" ->
+ mconcat <$>
+ mapM (inputToText convertTabs
+ >=> r readerOpts . (:[])) inputs
+ | optFileScope opts ->
+ mconcat <$> mapM
+ (inputToText convertTabs
+ >=> r readerOpts . (:[]))
+ inputs
+ | otherwise -> mapM (inputToText convertTabs) inputs
+ >>= r readerOpts
+ ByteStringReader r ->
+ mconcat <$> mapM (r readerOpts . inputToLazyByteString) inputs)
+ >>=
( (if isJust (optExtractMedia opts)
then fillMediaBag
else return)
@@ -310,19 +307,22 @@ convertWithOpts opts = do
reports <- getLog
return (output, reports)
- case optLogFile opts of
- Nothing -> return ()
- Just logfile -> BL.writeFile logfile (encodeLogMessages reports)
- let isWarning msg = messageVerbosity msg == WARNING
- when (optFailIfWarnings opts && any isWarning reports) $
- E.throwIO PandocFailOnWarningError
- let eol = case optEol opts of
- CRLF -> IO.CRLF
- LF -> IO.LF
- Native -> nativeNewline
- case output of
- TextOutput t -> writerFn eol outputFile t
- BinaryOutput bs -> writeFnBinary outputFile bs
+ case res of
+ Left e -> E.throwIO e
+ Right (output, reports) -> do
+ case optLogFile opts of
+ Nothing -> return ()
+ Just logfile -> BL.writeFile logfile (encodeLogMessages reports)
+ let isWarning msg = messageVerbosity msg == WARNING
+ when (optFailIfWarnings opts && any isWarning reports) $
+ E.throwIO PandocFailOnWarningError
+ let eol = case optEol opts of
+ CRLF -> IO.CRLF
+ LF -> IO.LF
+ Native -> nativeNewline
+ case output of
+ TextOutput t -> writerFn eol outputFile t
+ BinaryOutput bs -> writeFnBinary outputFile bs
data PandocOutput = TextOutput Text | BinaryOutput BL.ByteString
deriving (Show)
@@ -344,49 +344,64 @@ adjustMetadata f (Pandoc meta bs) = Pandoc (f meta) bs
applyTransforms :: Monad m => [Transform] -> Pandoc -> m Pandoc
applyTransforms transforms d = return $ foldr ($) d transforms
-readSource :: FilePath -> PandocIO Text
-readSource src = case parseURI src of
- Just u | uriScheme u `elem` ["http:","https:"] ->
- readURI src
- | uriScheme u == "file:" -> liftIO $
- readTextFile (uriPathToPath $ T.pack $ uriPath u)
- _ -> liftIO $ readTextFile src
- where readTextFile :: FilePath -> IO Text
- readTextFile fp = do
- bs <- if src == "-"
- then BS.getContents
- else BS.readFile fp
- E.catch (return $! UTF8.toText bs)
- (\e -> E.throwIO $ case e of
- TSE.DecodeError _ (Just w) ->
- case BS.elemIndex w bs of
- Just offset ->
- PandocUTF8DecodingError (T.pack fp) offset w
- _ -> PandocUTF8DecodingError (T.pack fp) 0 w
- _ -> PandocAppError (tshow e))
-
-readURI :: FilePath -> PandocIO Text
-readURI src = do
- (bs, mt) <- openURL (T.pack src)
+readSources :: (PandocMonad m, MonadIO m)
+ => [FilePath] -> m [(FilePath, (BS.ByteString, Maybe MimeType))]
+readSources srcs =
+ mapM (\fp -> do t <- readSource fp
+ return (if fp == "-" then "" else fp, t)) srcs
+
+readSource :: (PandocMonad m, MonadIO m)
+ => FilePath -> m (BS.ByteString, Maybe MimeType)
+readSource "-" = (,Nothing) <$> readStdinStrict
+readSource src =
+ case parseURI src of
+ Just u | uriScheme u `elem` ["http:","https:"] -> openURL (T.pack src)
+ | uriScheme u == "file:" ->
+ (,Nothing) <$>
+ readFileStrict (uriPathToPath $ T.pack $ uriPath u)
+ _ -> (,Nothing) <$> readFileStrict src
+
+utf8ToText :: PandocMonad m => FilePath -> BS.ByteString -> m Text
+utf8ToText fp bs =
+ case TSE.decodeUtf8' . dropBOM $ bs of
+ Left (TSE.DecodeError _ (Just w)) ->
+ case BS.elemIndex w bs of
+ Just offset -> throwError $ PandocUTF8DecodingError (T.pack fp) offset w
+ Nothing -> throwError $ PandocUTF8DecodingError (T.pack fp) 0 w
+ Left e -> throwError $ PandocAppError (tshow e)
+ Right t -> return t
+ where
+ dropBOM bs' =
+ if "\xEF\xBB\xBF" `BS.isPrefixOf` bs'
+ then BS.drop 3 bs'
+ else bs'
+
+
+inputToText :: PandocMonad m
+ => (Text -> Text)
+ -> (FilePath, (BS.ByteString, Maybe MimeType))
+ -> m (FilePath, Text)
+inputToText convTabs (fp, (bs,mt)) =
+ (fp,) . convTabs . T.filter (/='\r') <$>
case mt >>= getCharset of
- Just "UTF-8" -> return $ UTF8.toText bs
+ Just "UTF-8" -> utf8ToText fp bs
Just "ISO-8859-1" -> return $ T.pack $ B8.unpack bs
Just charset -> throwError $ PandocUnsupportedCharsetError charset
- Nothing -> liftIO $ -- try first as UTF-8, then as latin1
- E.catch (return $! UTF8.toText bs)
- (\case
- TSE.DecodeError{} ->
+ Nothing -> catchError
+ (utf8ToText fp bs)
+ (\case
+ PandocUTF8DecodingError{} ->
return $ T.pack $ B8.unpack bs
- e -> E.throwIO e)
+ e -> throwError e)
-readFile' :: MonadIO m => FilePath -> m BL.ByteString
-readFile' "-" = liftIO BL.getContents
-readFile' f = liftIO $ BL.readFile f
+inputToLazyByteString :: (FilePath, (BS.ByteString, Maybe MimeType))
+ -> BL.ByteString
+inputToLazyByteString (_, (bs,_)) = BL.fromStrict bs
-writeFnBinary :: MonadIO m => FilePath -> BL.ByteString -> m ()
-writeFnBinary "-" = liftIO . BL.putStr
-writeFnBinary f = liftIO . BL.writeFile (UTF8.encodePath f)
+writeFnBinary :: FilePath -> BL.ByteString -> IO ()
+writeFnBinary "-" = BL.putStr
+writeFnBinary f = BL.writeFile (UTF8.encodePath f)
-writerFn :: MonadIO m => IO.Newline -> FilePath -> Text -> m ()
-writerFn eol "-" = liftIO . UTF8.putStrWith eol
-writerFn eol f = liftIO . UTF8.writeFileWith eol f
+writerFn :: IO.Newline -> FilePath -> Text -> IO ()
+writerFn eol "-" = UTF8.putStrWith eol
+writerFn eol f = UTF8.writeFileWith eol f