Introduce file-scope parsing (parse-before-combine)

Traditionally pandoc operates on multiple files by first concatenating them (inserting extra line breaks between them) and then processing the joined file. So it only parses a multi-file document at the document scope. This has the benefit that footnotes and links can be in different files, but it also introduces a couple of difficulties: - it is difficult to join files with footnotes without some sort of preprocessing, which makes it difficult to write academic documents in small pieces. - it makes it impossible to process multiple binary input files, which can't be catted. - it makes it impossible to process files from different input formats. This commit introduces an alternative method. Instead of catting the files first, it parses the files first, and then combines the parsed output. This makes it impossible to have links across multiple files, and auto-identified headers won't work correctly if headers in multiple files have the same name. On the other hand, footnotes across multiple files will work correctly, and this approach allows more freedom for input formats. Since ByteStringReaders can currently only read one binary file, and will ignore subsequent files, we also change the behavior to automatically parse before combining when using a ByteStringReader. If we use one file, it will work as normal. If there is more than one file, it will combine them after parsing (assuming that the format is the same). Note that this is intended to be an optional method, defaulting to off. Turn it on with `--file-scope`.
author: Jesse Rosenthal <jrosenthal@jhu.edu> 2016-02-20 21:27:08 -0500
committer: Jesse Rosenthal <jrosenthal@jhu.edu> 2016-03-15 12:52:51 -0400
commit: 5c055b4cf3cdfac534a74c5c5775aa2d58889150 (patch)
tree: 3b470812f963b018eb7a3660e3a76352bc41fb4d /pandoc.hs
parent: 68fd333ec4475c5a524004bcf2e76a7959dc3afa (diff)
download: pandoc-5c055b4cf3cdfac534a74c5c5775aa2d58889150.tar.gz
1 files changed, 23 insertions, 4 deletions
diff --git a/pandoc.hs b/pandoc.hs
index 72a7592d8..59277690a 100644
--- a/pandoc.hs
+++ b/pandoc.hs
@@ -215,6 +215,7 @@ data Opt = Opt
     , optExtractMedia      :: Maybe FilePath -- ^ Path to extract embedded media
     , optTrace             :: Bool       -- ^ Print debug information
     , optTrackChanges      :: TrackChanges -- ^ Accept or reject MS Word track-changes.
+    , optFileScope        :: Bool         -- ^ Parse input files before combining
     , optKaTeXStylesheet   :: Maybe String     -- ^ Path to stylesheet for KaTeX
     , optKaTeXJS           :: Maybe String     -- ^ Path to js file for KaTeX
     }
@@ -278,6 +279,7 @@ defaultOpts = Opt
     , optExtractMedia          = Nothing
     , optTrace                 = False
     , optTrackChanges          = AcceptChanges
+    , optFileScope            = False
     , optKaTeXStylesheet       = Nothing
     , optKaTeXJS               = Nothing
     }
@@ -387,6 +389,11 @@ options =
                   "accept|reject|all")
                  "" -- "Accepting or reject MS Word track-changes.""
 
+    , Option "" ["file-scope"]
+                 (NoArg
+                  (\opt -> return opt { optFileScope = True }))
+                 "" -- "Parse input files before combining"
+
     , Option "" ["extract-media"]
                  (ReqArg
                   (\arg opt ->
@@ -1117,6 +1124,7 @@ convertWithOpts opts args = do
               , optExtractMedia          = mbExtractMedia
               , optTrace                 = trace
               , optTrackChanges          = trackChanges
+              , optFileScope            = fileScope
               , optKaTeXStylesheet       = katexStylesheet
               , optKaTeXJS               = katexJS
              } = opts
@@ -1269,6 +1277,7 @@ convertWithOpts opts args = do
                       , readerDefaultImageExtension = defaultImageExtension
                       , readerTrace = trace
                       , readerTrackChanges = trackChanges
+                      , readerFileScope   = fileScope
                       }
 
   when (not (isTextFormat format) && outputFile == "-") $
@@ -1301,13 +1310,23 @@ convertWithOpts opts args = do
                                then handleIncludes
                                else return . Right
 
-  (doc, media) <- fmap handleError $
-      case reader of
+  let sourceToDoc :: [FilePath] -> IO (Pandoc, MediaBag)
+      sourceToDoc sources' = fmap handleError $
+        case reader of
           StringReader r-> do
-            srcs <- convertTabs . intercalate "\n" <$> readSources sources
+            srcs <- convertTabs . intercalate "\n" <$> readSources sources'
             doc <- handleIncludes' srcs
             either (return . Left) (\s -> fmap (,mempty) <$> r readerOpts s) doc
-          ByteStringReader r -> readFiles sources >>= r readerOpts
+          ByteStringReader r -> readFiles sources' >>= r readerOpts
+
+  -- We parse first if fileScope is set OR if the reader is a
+  -- BSReader. So, if it's a StringReader AND not fileScope, we
+  -- don't.
+  (doc, media) <- case reader of
+    (StringReader _) | not fileScope -> sourceToDoc sources
+    _                                   -> do
+      pairs <- mapM (\s -> sourceToDoc [s]) sources
+      return (mconcat $ map fst pairs, mconcat $ map snd pairs)
 
   let writerOptions = def { writerStandalone       = standalone',
                             writerTemplate         = templ,
author	Jesse Rosenthal <jrosenthal@jhu.edu>	2016-02-20 21:27:08 -0500
committer	Jesse Rosenthal <jrosenthal@jhu.edu>	2016-03-15 12:52:51 -0400
commit	5c055b4cf3cdfac534a74c5c5775aa2d58889150 (patch)
tree	3b470812f963b018eb7a3660e3a76352bc41fb4d /pandoc.hs
parent	68fd333ec4475c5a524004bcf2e76a7959dc3afa (diff)
download	pandoc-5c055b4cf3cdfac534a74c5c5775aa2d58889150.tar.gz