diff options
author | John MacFarlane <jgm@berkeley.edu> | 2017-05-07 11:45:33 +0200 |
---|---|---|
committer | John MacFarlane <jgm@berkeley.edu> | 2017-05-07 11:45:33 +0200 |
commit | 400fe3188e3f5a3e48700ae114a0da05ae6e599a (patch) | |
tree | ddd3b30cbb3c397f70cfa6fdf53b04a2edb54ecf | |
parent | de0fd90051d1fa03cc19d771fed974fc8886c04e (diff) | |
download | pandoc-400fe3188e3f5a3e48700ae114a0da05ae6e599a.tar.gz |
Allow `--extract-media` to work with non-binary input formats.
If `--extract-media` is supplied with a non-binary input format,
pandoc will attempt to extract the contents of all linked images,
whether in local files, data: uris, or external uris.
They will be named based on the sha1 hash of the contents.
Closes #1583, #2289.
Notes:
- One thing that is slightly subideal with this commit is that
identical resources will be downloaded multiple times. To improve
this we could have mediabag store an original filename/url +
a new name.
- We might think about reusing some of this code, since more or less the
same thing is done in the Docx, EPUB, PDF writers (with slight
variations).
-rw-r--r-- | src/Text/Pandoc/App.hs | 30 |
1 files changed, 26 insertions, 4 deletions
diff --git a/src/Text/Pandoc/App.hs b/src/Text/Pandoc/App.hs index b8a3c6613..212ae7fe2 100644 --- a/src/Text/Pandoc/App.hs +++ b/src/Text/Pandoc/App.hs @@ -45,6 +45,7 @@ import Data.Aeson (eitherDecode', encode) import qualified Data.ByteString as BS import qualified Data.ByteString.Lazy as B import Data.Char (toLower, toUpper) +import Data.Digest.Pure.SHA (sha1, showDigest) import qualified Data.Set as Set import Data.Foldable (foldrM) import Data.List (intercalate, isPrefixOf, isSuffixOf, sort) @@ -68,17 +69,19 @@ import System.IO (stdout) import System.IO.Error (isDoesNotExistError) import Text.Pandoc import Text.Pandoc.Builder (setMeta) -import Text.Pandoc.Class (PandocIO, getLog, withMediaBag, getMediaBag) +import Text.Pandoc.Class (PandocIO, getLog, withMediaBag, getMediaBag, + fetchItem, insertMedia) import Text.Pandoc.Highlighting (highlightingStyles) import Text.Pandoc.Lua ( runLuaFilter ) import Text.Pandoc.MediaBag (extractMediaBag, mediaDirectory) +import Text.Pandoc.MIME (extensionFromMimeType) import Text.Pandoc.PDF (makePDF) import Text.Pandoc.Process (pipeProcess) import Text.Pandoc.SelfContained (makeSelfContained, makeDataURI) import Text.Pandoc.Shared (headerShift, openURL, readDataFile, readDataFileUTF8, safeRead, tabFilter) import qualified Text.Pandoc.UTF8 as UTF8 -import Text.Pandoc.Walk (walk) +import Text.Pandoc.Walk (walkM, walk) import Text.Pandoc.XML (toEntities) import Text.Printf #ifndef _WINDOWS @@ -413,11 +416,15 @@ convertWithOpts opts = do runIO' $ do (doc, media) <- withMediaBag $ sourceToDoc sources >>= - (maybe return extractMedia (optExtractMedia opts) + ( (if isJust (optExtractMedia opts) + then fillMedia (writerSourceURL writerOptions) + else return) + >=> maybe return extractMedia (optExtractMedia opts) >=> return . flip (foldr addMetadata) metadata >=> applyTransforms transforms >=> applyLuaFilters datadir (optLuaFilters opts) [format] - >=> applyFilters datadir filters' [format]) + >=> applyFilters datadir filters' [format] + ) case writer of ByteStringWriter f -> f writerOptions doc >>= writeFnBinary outputFile @@ -723,6 +730,21 @@ defaultWriterName x = -- Transformations of a Pandoc document post-parsing: +-- | Traverse tree, filling media bag. +fillMedia :: Maybe String -> Pandoc -> PandocIO Pandoc +fillMedia sourceURL d = walkM handleImage d + where handleImage :: Inline -> PandocIO Inline + handleImage (Image attr lab (src, tit)) = do + (bs, mt) <- fetchItem sourceURL src + let ext = fromMaybe (takeExtension src) + (mt >>= extensionFromMimeType) + let bs' = B.fromChunks [bs] + let basename = showDigest $ sha1 bs' + let fname = basename <.> ext + insertMedia fname mt bs' + return $ Image attr lab (fname, tit) + handleImage x = return x + extractMedia :: FilePath -> Pandoc -> PandocIO Pandoc extractMedia dir d = do media <- getMediaBag |