aboutsummaryrefslogtreecommitdiff
path: root/src/Text
diff options
context:
space:
mode:
authorJohn MacFarlane <jgm@berkeley.edu>2017-05-07 11:45:33 +0200
committerJohn MacFarlane <jgm@berkeley.edu>2017-05-07 11:45:33 +0200
commit400fe3188e3f5a3e48700ae114a0da05ae6e599a (patch)
treeddd3b30cbb3c397f70cfa6fdf53b04a2edb54ecf /src/Text
parentde0fd90051d1fa03cc19d771fed974fc8886c04e (diff)
downloadpandoc-400fe3188e3f5a3e48700ae114a0da05ae6e599a.tar.gz
Allow `--extract-media` to work with non-binary input formats.
If `--extract-media` is supplied with a non-binary input format, pandoc will attempt to extract the contents of all linked images, whether in local files, data: uris, or external uris. They will be named based on the sha1 hash of the contents. Closes #1583, #2289. Notes: - One thing that is slightly subideal with this commit is that identical resources will be downloaded multiple times. To improve this we could have mediabag store an original filename/url + a new name. - We might think about reusing some of this code, since more or less the same thing is done in the Docx, EPUB, PDF writers (with slight variations).
Diffstat (limited to 'src/Text')
-rw-r--r--src/Text/Pandoc/App.hs30
1 files changed, 26 insertions, 4 deletions
diff --git a/src/Text/Pandoc/App.hs b/src/Text/Pandoc/App.hs
index b8a3c6613..212ae7fe2 100644
--- a/src/Text/Pandoc/App.hs
+++ b/src/Text/Pandoc/App.hs
@@ -45,6 +45,7 @@ import Data.Aeson (eitherDecode', encode)
import qualified Data.ByteString as BS
import qualified Data.ByteString.Lazy as B
import Data.Char (toLower, toUpper)
+import Data.Digest.Pure.SHA (sha1, showDigest)
import qualified Data.Set as Set
import Data.Foldable (foldrM)
import Data.List (intercalate, isPrefixOf, isSuffixOf, sort)
@@ -68,17 +69,19 @@ import System.IO (stdout)
import System.IO.Error (isDoesNotExistError)
import Text.Pandoc
import Text.Pandoc.Builder (setMeta)
-import Text.Pandoc.Class (PandocIO, getLog, withMediaBag, getMediaBag)
+import Text.Pandoc.Class (PandocIO, getLog, withMediaBag, getMediaBag,
+ fetchItem, insertMedia)
import Text.Pandoc.Highlighting (highlightingStyles)
import Text.Pandoc.Lua ( runLuaFilter )
import Text.Pandoc.MediaBag (extractMediaBag, mediaDirectory)
+import Text.Pandoc.MIME (extensionFromMimeType)
import Text.Pandoc.PDF (makePDF)
import Text.Pandoc.Process (pipeProcess)
import Text.Pandoc.SelfContained (makeSelfContained, makeDataURI)
import Text.Pandoc.Shared (headerShift, openURL, readDataFile,
readDataFileUTF8, safeRead, tabFilter)
import qualified Text.Pandoc.UTF8 as UTF8
-import Text.Pandoc.Walk (walk)
+import Text.Pandoc.Walk (walkM, walk)
import Text.Pandoc.XML (toEntities)
import Text.Printf
#ifndef _WINDOWS
@@ -413,11 +416,15 @@ convertWithOpts opts = do
runIO' $ do
(doc, media) <- withMediaBag $ sourceToDoc sources >>=
- (maybe return extractMedia (optExtractMedia opts)
+ ( (if isJust (optExtractMedia opts)
+ then fillMedia (writerSourceURL writerOptions)
+ else return)
+ >=> maybe return extractMedia (optExtractMedia opts)
>=> return . flip (foldr addMetadata) metadata
>=> applyTransforms transforms
>=> applyLuaFilters datadir (optLuaFilters opts) [format]
- >=> applyFilters datadir filters' [format])
+ >=> applyFilters datadir filters' [format]
+ )
case writer of
ByteStringWriter f -> f writerOptions doc >>= writeFnBinary outputFile
@@ -723,6 +730,21 @@ defaultWriterName x =
-- Transformations of a Pandoc document post-parsing:
+-- | Traverse tree, filling media bag.
+fillMedia :: Maybe String -> Pandoc -> PandocIO Pandoc
+fillMedia sourceURL d = walkM handleImage d
+ where handleImage :: Inline -> PandocIO Inline
+ handleImage (Image attr lab (src, tit)) = do
+ (bs, mt) <- fetchItem sourceURL src
+ let ext = fromMaybe (takeExtension src)
+ (mt >>= extensionFromMimeType)
+ let bs' = B.fromChunks [bs]
+ let basename = showDigest $ sha1 bs'
+ let fname = basename <.> ext
+ insertMedia fname mt bs'
+ return $ Image attr lab (fname, tit)
+ handleImage x = return x
+
extractMedia :: FilePath -> Pandoc -> PandocIO Pandoc
extractMedia dir d = do
media <- getMediaBag