From fd7c140cde7544c0236586e2aa22691c33c3265a Mon Sep 17 00:00:00 2001
From: John MacFarlane <jgm@berkeley.edu>
Date: Sun, 22 Aug 2021 17:47:47 -0700
Subject: Reorganize App to make it easier to limit IO in main loop.

Previously we used liftIO fairly liberally.  The code has
been restructured to avoid this.

A small behavior change is that pandoc will now fall back
to latin1 encoding for inputs that can't be read as UTF-8.
This is what it did previously for content fetched from
the web and not marked as to content type. It makes sense
to do the same for local files.
---
 src/Text/Pandoc/App.hs | 185 ++++++++++++++++++++++++++-----------------------
 1 file changed, 100 insertions(+), 85 deletions(-)

(limited to 'src')

diff --git a/src/Text/Pandoc/App.hs b/src/Text/Pandoc/App.hs
index dd58b1b39..66d659ba1 100644
--- a/src/Text/Pandoc/App.hs
+++ b/src/Text/Pandoc/App.hs
@@ -1,3 +1,4 @@
+{-# LANGUAGE TupleSections       #-}
 {-# LANGUAGE LambdaCase          #-}
 {-# LANGUAGE OverloadedStrings   #-}
 {-# LANGUAGE CPP                 #-}
@@ -27,7 +28,7 @@ module Text.Pandoc.App (
 import qualified Control.Exception as E
 import Control.Monad ( (>=>), when )
 import Control.Monad.Trans ( MonadIO(..) )
-import Control.Monad.Except (throwError)
+import Control.Monad.Except (throwError, catchError)
 import qualified Data.ByteString as BS
 import qualified Data.ByteString.Char8 as B8
 import qualified Data.ByteString.Lazy as BL
@@ -38,6 +39,7 @@ import Data.Text (Text)
 import qualified Data.Text as T
 import qualified Data.Text.Lazy as TL
 import qualified Data.Text.Lazy.Encoding as TE
+import qualified Data.Text.Encoding as TSE
 import qualified Data.Text.Encoding.Error as TE
 import qualified Data.Text.Encoding.Error as TSE
 import Network.URI (URI (..), parseURI)
@@ -48,7 +50,7 @@ import System.IO (nativeNewline, stdout)
 import qualified System.IO as IO (Newline (..))
 import Text.Pandoc
 import Text.Pandoc.Builder (setMeta)
-import Text.Pandoc.MIME (getCharset)
+import Text.Pandoc.MIME (getCharset, MimeType)
 import Text.Pandoc.App.FormatHeuristics (formatFromFilePaths)
 import Text.Pandoc.App.Opt (Opt (..), LineEnding (..), defaultOpts,
                             IpynbOutput (..))
@@ -69,6 +71,7 @@ import qualified Text.Pandoc.UTF8 as UTF8
 import System.Posix.IO (stdOutput)
 import System.Posix.Terminal (queryTerminal)
 #endif
+import Debug.Trace
 
 convertWithOpts :: Opt -> IO ()
 convertWithOpts opts = do
@@ -94,14 +97,14 @@ convertWithOpts opts = do
   let sources = case optInputFiles opts of
                      Just xs | not (optIgnoreArgs opts) -> xs
                      _ -> ["-"]
-
 #ifdef _WINDOWS
   let istty = True
 #else
   istty <- liftIO $ queryTerminal stdOutput
 #endif
 
-  (output, reports) <- runIOorExplode $ do
+  res <- runIO $ do
+
     setTrace (optTrace opts)
     setVerbosity verbosity
     setUserDataDir datadir
@@ -110,6 +113,8 @@ convertWithOpts opts = do
     setInputFiles (fromMaybe ["-"] (optInputFiles opts))
     setOutputFile (optOutputFile opts)
 
+    inputs <- readSources sources
+
     -- assign reader and writer based on options and filenames
     readerName <- case optFrom opts of
                        Just f  -> return f
@@ -135,20 +140,6 @@ convertWithOpts opts = do
 
     (reader, readerExts) <- getReader readerName
 
-    let convertTabs = tabFilter (if optPreserveTabs opts ||
-                                      readerNameBase == "t2t" ||
-                                      readerNameBase == "man"
-                                    then 0
-                                    else optTabStop opts)
-
-
-    let readSources :: [FilePath] -> PandocIO [(FilePath, Text)]
-        readSources srcs =
-          mapM (\fp -> do
-                   t <- readSource fp
-                   return (if fp == "-" then "" else fp, convertTabs t)) srcs
-
-
     outputSettings <- optToOutputSettings opts
     let format = outputFormat outputSettings
     let writer = outputWriter outputSettings
@@ -236,20 +227,11 @@ convertWithOpts opts = do
                                             _        -> Format format) :))
                      $ []
 
-    let sourceToDoc :: [FilePath] -> PandocIO Pandoc
-        sourceToDoc sources' =
-           case reader of
-             TextReader r
-               | readerNameBase == "json" ->
-                   mconcat <$> mapM (readSource >=> r readerOpts) sources'
-               | optFileScope opts  ->
-                   -- Read source and convert tabs (see #6709)
-                   let readSource' = fmap convertTabs . readSource
-                   in mconcat <$> mapM (readSource' >=> r readerOpts) sources'
-               | otherwise ->
-                   readSources sources' >>= r readerOpts
-             ByteStringReader r ->
-               mconcat <$> mapM (readFile' >=> r readerOpts) sources'
+    let convertTabs = tabFilter (if optPreserveTabs opts ||
+                                      readerNameBase == "t2t" ||
+                                      readerNameBase == "man"
+                                    then 0
+                                    else optTabStop opts)
 
 
     when (readerNameBase == "markdown_github" ||
@@ -275,7 +257,22 @@ convertWithOpts opts = do
           maybe id (setMeta "citation-abbreviations")
                          (optCitationAbbreviations opts) $ mempty
 
-    doc <- sourceToDoc sources >>=
+    doc <- (case reader of
+             TextReader r
+               | readerNameBase == "json" ->
+                   mconcat <$>
+                      mapM (inputToText convertTabs
+                             >=> r readerOpts . (:[])) inputs
+               | optFileScope opts  ->
+                   mconcat <$> mapM
+                      (inputToText convertTabs
+                             >=> r readerOpts . (:[]))
+                      inputs
+               | otherwise -> mapM (inputToText convertTabs) inputs
+                                >>= r readerOpts
+             ByteStringReader r ->
+               mconcat <$> mapM (r readerOpts . inputToLazyByteString) inputs)
+            >>=
               (   (if isJust (optExtractMedia opts)
                       then fillMediaBag
                       else return)
@@ -310,19 +307,22 @@ convertWithOpts opts = do
     reports <- getLog
     return (output, reports)
 
-  case optLogFile opts of
-       Nothing      -> return ()
-       Just logfile -> BL.writeFile logfile (encodeLogMessages reports)
-  let isWarning msg = messageVerbosity msg == WARNING
-  when (optFailIfWarnings opts && any isWarning reports) $
-      E.throwIO PandocFailOnWarningError
-  let eol = case optEol opts of
-                 CRLF   -> IO.CRLF
-                 LF     -> IO.LF
-                 Native -> nativeNewline
-  case output of
-    TextOutput t    -> writerFn eol outputFile t
-    BinaryOutput bs -> writeFnBinary outputFile bs
+  case res of
+    Left e -> E.throwIO e
+    Right (output, reports) -> do
+      case optLogFile opts of
+           Nothing      -> return ()
+           Just logfile -> BL.writeFile logfile (encodeLogMessages reports)
+      let isWarning msg = messageVerbosity msg == WARNING
+      when (optFailIfWarnings opts && any isWarning reports) $
+          E.throwIO PandocFailOnWarningError
+      let eol = case optEol opts of
+                     CRLF   -> IO.CRLF
+                     LF     -> IO.LF
+                     Native -> nativeNewline
+      case output of
+        TextOutput t    -> writerFn eol outputFile t
+        BinaryOutput bs -> writeFnBinary outputFile bs
 
 data PandocOutput = TextOutput Text | BinaryOutput BL.ByteString
   deriving (Show)
@@ -344,49 +344,64 @@ adjustMetadata f (Pandoc meta bs) = Pandoc (f meta) bs
 applyTransforms :: Monad m => [Transform] -> Pandoc -> m Pandoc
 applyTransforms transforms d = return $ foldr ($) d transforms
 
-readSource :: FilePath -> PandocIO Text
-readSource src = case parseURI src of
-                      Just u | uriScheme u `elem` ["http:","https:"] ->
-                                 readURI src
-                             | uriScheme u == "file:" -> liftIO $
-                                 readTextFile (uriPathToPath $ T.pack $ uriPath u)
-                      _       -> liftIO $ readTextFile src
-  where readTextFile :: FilePath -> IO Text
-        readTextFile fp = do
-          bs <- if src == "-"
-                   then BS.getContents
-                   else BS.readFile fp
-          E.catch (return $! UTF8.toText bs)
-             (\e -> E.throwIO $ case e of
-                         TSE.DecodeError _ (Just w) ->
-                           case BS.elemIndex w bs of
-                             Just offset ->
-                                  PandocUTF8DecodingError (T.pack fp) offset w
-                             _ -> PandocUTF8DecodingError (T.pack fp) 0 w
-                         _ -> PandocAppError (tshow e))
-
-readURI :: FilePath -> PandocIO Text
-readURI src = do
-  (bs, mt) <- openURL (T.pack src)
+readSources :: (PandocMonad m, MonadIO m)
+            => [FilePath] -> m [(FilePath, (BS.ByteString, Maybe MimeType))]
+readSources srcs =
+  mapM (\fp -> do t <- readSource fp
+                  return (if fp == "-" then "" else fp, t)) srcs
+
+readSource :: (PandocMonad m, MonadIO m)
+           => FilePath -> m (BS.ByteString, Maybe MimeType)
+readSource "-" = (,Nothing) <$> readStdinStrict
+readSource src =
+  case parseURI src of
+    Just u | uriScheme u `elem` ["http:","https:"] -> openURL (T.pack src)
+           | uriScheme u == "file:" ->
+               (,Nothing) <$>
+                 readFileStrict (uriPathToPath $ T.pack $ uriPath u)
+    _       -> (,Nothing) <$> readFileStrict src
+
+utf8ToText :: PandocMonad m => FilePath -> BS.ByteString -> m Text
+utf8ToText fp bs =
+  case TSE.decodeUtf8' . dropBOM $ bs of
+    Left (TSE.DecodeError _ (Just w)) ->
+      case BS.elemIndex w bs of
+        Just offset -> throwError $ PandocUTF8DecodingError (T.pack fp) offset w
+        Nothing -> throwError $ PandocUTF8DecodingError (T.pack fp) 0 w
+    Left e -> throwError $ PandocAppError (tshow e)
+    Right t -> return t
+ where
+   dropBOM bs' =
+     if "\xEF\xBB\xBF" `BS.isPrefixOf` bs'
+        then BS.drop 3 bs'
+        else bs'
+
+
+inputToText :: PandocMonad m
+            => (Text -> Text)
+            -> (FilePath, (BS.ByteString, Maybe MimeType))
+            -> m (FilePath, Text)
+inputToText convTabs (fp, (bs,mt)) =
+  (fp,) . convTabs . T.filter (/='\r') <$>
   case mt >>= getCharset of
-    Just "UTF-8"      -> return $ UTF8.toText bs
+    Just "UTF-8"      -> utf8ToText fp bs
     Just "ISO-8859-1" -> return $ T.pack $ B8.unpack bs
     Just charset      -> throwError $ PandocUnsupportedCharsetError charset
-    Nothing           -> liftIO $ -- try first as UTF-8, then as latin1
-                          E.catch (return $! UTF8.toText bs)
-                                  (\case
-                                      TSE.DecodeError{} ->
+    Nothing           -> catchError
+                           (utf8ToText fp bs)
+                           (\case
+                              PandocUTF8DecodingError{} ->
                                         return $ T.pack $ B8.unpack bs
-                                      e -> E.throwIO e)
+                              e -> throwError e)
 
-readFile' :: MonadIO m => FilePath -> m BL.ByteString
-readFile' "-" = liftIO BL.getContents
-readFile' f   = liftIO $ BL.readFile f
+inputToLazyByteString :: (FilePath, (BS.ByteString, Maybe MimeType))
+                      -> BL.ByteString
+inputToLazyByteString (_, (bs,_)) = BL.fromStrict bs
 
-writeFnBinary :: MonadIO m => FilePath -> BL.ByteString -> m ()
-writeFnBinary "-" = liftIO . BL.putStr
-writeFnBinary f   = liftIO . BL.writeFile (UTF8.encodePath f)
+writeFnBinary :: FilePath -> BL.ByteString -> IO ()
+writeFnBinary "-" = BL.putStr
+writeFnBinary f   = BL.writeFile (UTF8.encodePath f)
 
-writerFn :: MonadIO m => IO.Newline -> FilePath -> Text -> m ()
-writerFn eol "-" = liftIO . UTF8.putStrWith eol
-writerFn eol f   = liftIO . UTF8.writeFileWith eol f
+writerFn :: IO.Newline -> FilePath -> Text -> IO ()
+writerFn eol "-" = UTF8.putStrWith eol
+writerFn eol f   = UTF8.writeFileWith eol f
-- 
cgit v1.2.3