Allow `--extract-media` to work with non-binary input formats.

If `--extract-media` is supplied with a non-binary input format, pandoc will attempt to extract the contents of all linked images, whether in local files, data: uris, or external uris. They will be named based on the sha1 hash of the contents. Closes #1583, #2289. Notes: - One thing that is slightly subideal with this commit is that identical resources will be downloaded multiple times. To improve this we could have mediabag store an original filename/url + a new name. - We might think about reusing some of this code, since more or less the same thing is done in the Docx, EPUB, PDF writers (with slight variations).
author: John MacFarlane <jgm@berkeley.edu> 2017-05-07 11:45:33 +0200
committer: John MacFarlane <jgm@berkeley.edu> 2017-05-07 11:45:33 +0200
commit: 400fe3188e3f5a3e48700ae114a0da05ae6e599a (patch)
tree: ddd3b30cbb3c397f70cfa6fdf53b04a2edb54ecf
parent: de0fd90051d1fa03cc19d771fed974fc8886c04e (diff)
download: pandoc-400fe3188e3f5a3e48700ae114a0da05ae6e599a.tar.gz
1 files changed, 26 insertions, 4 deletions
diff --git a/src/Text/Pandoc/App.hs b/src/Text/Pandoc/App.hs
index b8a3c6613..212ae7fe2 100644
--- a/src/Text/Pandoc/App.hs
+++ b/src/Text/Pandoc/App.hs
@@ -45,6 +45,7 @@ import Data.Aeson (eitherDecode', encode)
 import qualified Data.ByteString as BS
 import qualified Data.ByteString.Lazy as B
 import Data.Char (toLower, toUpper)
+import Data.Digest.Pure.SHA (sha1, showDigest)
 import qualified Data.Set as Set
 import Data.Foldable (foldrM)
 import Data.List (intercalate, isPrefixOf, isSuffixOf, sort)
@@ -68,17 +69,19 @@ import System.IO (stdout)
 import System.IO.Error (isDoesNotExistError)
 import Text.Pandoc
 import Text.Pandoc.Builder (setMeta)
-import Text.Pandoc.Class (PandocIO, getLog, withMediaBag, getMediaBag)
+import Text.Pandoc.Class (PandocIO, getLog, withMediaBag, getMediaBag,
+                          fetchItem, insertMedia)
 import Text.Pandoc.Highlighting (highlightingStyles)
 import Text.Pandoc.Lua ( runLuaFilter )
 import Text.Pandoc.MediaBag (extractMediaBag, mediaDirectory)
+import Text.Pandoc.MIME (extensionFromMimeType)
 import Text.Pandoc.PDF (makePDF)
 import Text.Pandoc.Process (pipeProcess)
 import Text.Pandoc.SelfContained (makeSelfContained, makeDataURI)
 import Text.Pandoc.Shared (headerShift, openURL, readDataFile,
                            readDataFileUTF8, safeRead, tabFilter)
 import qualified Text.Pandoc.UTF8 as UTF8
-import Text.Pandoc.Walk (walk)
+import Text.Pandoc.Walk (walkM, walk)
 import Text.Pandoc.XML (toEntities)
 import Text.Printf
 #ifndef _WINDOWS
@@ -413,11 +416,15 @@ convertWithOpts opts = do
 
   runIO' $ do
     (doc, media) <- withMediaBag $ sourceToDoc sources >>=
-                 (maybe return extractMedia (optExtractMedia opts)
+              (   (if isJust (optExtractMedia opts)
+                      then fillMedia (writerSourceURL writerOptions)
+                      else return)
+              >=> maybe return extractMedia (optExtractMedia opts)
               >=> return . flip (foldr addMetadata) metadata
               >=> applyTransforms transforms
               >=> applyLuaFilters datadir (optLuaFilters opts) [format]
-              >=> applyFilters datadir filters' [format])
+              >=> applyFilters datadir filters' [format]
+              )
 
     case writer of
       ByteStringWriter f -> f writerOptions doc >>= writeFnBinary outputFile
@@ -723,6 +730,21 @@ defaultWriterName x =
 
 -- Transformations of a Pandoc document post-parsing:
 
+-- | Traverse tree, filling media bag.
+fillMedia :: Maybe String -> Pandoc -> PandocIO Pandoc
+fillMedia sourceURL d = walkM handleImage d
+  where handleImage :: Inline -> PandocIO Inline
+        handleImage (Image attr lab (src, tit)) = do
+          (bs, mt) <- fetchItem sourceURL src
+          let ext = fromMaybe (takeExtension src)
+                      (mt >>= extensionFromMimeType)
+          let bs' = B.fromChunks [bs]
+          let basename = showDigest $ sha1 bs'
+          let fname = basename <.> ext
+          insertMedia fname mt bs'
+          return $ Image attr lab (fname, tit)
+        handleImage x = return x
+
 extractMedia :: FilePath -> Pandoc -> PandocIO Pandoc
 extractMedia dir d = do
   media <- getMediaBag
author	John MacFarlane <jgm@berkeley.edu>	2017-05-07 11:45:33 +0200
committer	John MacFarlane <jgm@berkeley.edu>	2017-05-07 11:45:33 +0200
commit	400fe3188e3f5a3e48700ae114a0da05ae6e599a (patch)
tree	ddd3b30cbb3c397f70cfa6fdf53b04a2edb54ecf
parent	de0fd90051d1fa03cc19d771fed974fc8886c04e (diff)
download	pandoc-400fe3188e3f5a3e48700ae114a0da05ae6e599a.tar.gz