diff options
-rw-r--r-- | pandoc.hs | 3 | ||||
-rw-r--r-- | src/Text/Pandoc.hs | 6 | ||||
-rw-r--r-- | src/Text/Pandoc/Readers/Docx.hs | 57 | ||||
-rw-r--r-- | src/Text/Pandoc/Readers/Docx/Parse.hs | 10 | ||||
-rw-r--r-- | src/Text/Pandoc/Shared.hs | 11 | ||||
-rw-r--r-- | tests/Tests/Readers/Docx.hs | 3 | ||||
-rw-r--r-- | tests/docx.image_no_embed.native | 2 |
7 files changed, 49 insertions, 43 deletions
@@ -1201,7 +1201,8 @@ main = do readSources sources >>= handleIncludes' . convertTabs . intercalate "\n" >>= r readerOpts - ByteStringReader r -> readFiles sources >>= r readerOpts + ByteStringReader r -> readFiles sources >>= r readerOpts >>= + (return . fst) let doc0 = M.foldWithKey setMeta doc metadata diff --git a/src/Text/Pandoc.hs b/src/Text/Pandoc.hs index be34641a9..11553383c 100644 --- a/src/Text/Pandoc.hs +++ b/src/Text/Pandoc.hs @@ -160,7 +160,7 @@ import Text.Pandoc.Writers.Haddock import Text.Pandoc.Writers.Custom import Text.Pandoc.Templates import Text.Pandoc.Options -import Text.Pandoc.Shared (safeRead, warn) +import Text.Pandoc.Shared (safeRead, warn, MediaBag) import Data.Aeson import qualified Data.ByteString.Lazy as BL import Data.List (intercalate) @@ -204,12 +204,12 @@ markdown o s = do return doc data Reader = StringReader (ReaderOptions -> String -> IO Pandoc) - | ByteStringReader (ReaderOptions -> BL.ByteString -> IO Pandoc) + | ByteStringReader (ReaderOptions -> BL.ByteString -> IO (Pandoc, MediaBag)) mkStringReader :: (ReaderOptions -> String -> Pandoc) -> Reader mkStringReader r = StringReader (\o s -> return $ r o s) -mkBSReader :: (ReaderOptions -> BL.ByteString -> Pandoc) -> Reader +mkBSReader :: (ReaderOptions -> BL.ByteString -> (Pandoc, MediaBag)) -> Reader mkBSReader r = ByteStringReader (\o s -> return $ r o s) -- | Association list of formats and readers. diff --git a/src/Text/Pandoc/Readers/Docx.hs b/src/Text/Pandoc/Readers/Docx.hs index 48a23dd3c..f5fb6565a 100644 --- a/src/Text/Pandoc/Readers/Docx.hs +++ b/src/Text/Pandoc/Readers/Docx.hs @@ -78,8 +78,6 @@ import Codec.Archive.Zip import Text.Pandoc.Definition import Text.Pandoc.Options import Text.Pandoc.Builder (text, toList) -import Text.Pandoc.MIME (getMimeType) -import Text.Pandoc.UTF8 (toString) import Text.Pandoc.Walk import Text.Pandoc.Readers.Docx.Parse import Text.Pandoc.Readers.Docx.Lists @@ -88,9 +86,7 @@ import Text.Pandoc.Readers.Docx.TexChar import Text.Pandoc.Shared import Data.Maybe (mapMaybe, fromMaybe) import Data.List (delete, isPrefixOf, (\\), intercalate, intersect) -import qualified Data.ByteString as BS import qualified Data.ByteString.Lazy as B -import Data.ByteString.Base64 (encode) import qualified Data.Map as M import Control.Monad.Reader import Control.Monad.State @@ -98,17 +94,24 @@ import Text.Printf (printf) readDocx :: ReaderOptions -> B.ByteString - -> Pandoc + -> (Pandoc, MediaBag) readDocx opts bytes = case archiveToDocx (toArchive bytes) of - Right docx -> Pandoc meta blks where - (meta, blks) = (docxToMetaAndBlocks opts docx) + Right docx -> (Pandoc meta blks, mediaBag) where + (meta, blks, mediaBag) = (docxToOutput opts docx) Left _ -> error $ "couldn't parse docx file" data DState = DState { docxAnchorMap :: M.Map String String + , docxMediaBag :: MediaBag , docxInHeaderBlock :: Bool , docxInTexSubscript :: Bool } +defaultDState :: DState +defaultDState = DState { docxAnchorMap = M.empty + , docxMediaBag = M.empty + , docxInHeaderBlock = False + , docxInTexSubscript = False} + data DEnv = DEnv { docxOptions :: ReaderOptions , docxDocument :: Docx} @@ -319,13 +322,6 @@ runToInlines (Footnote bps) = runToInlines (Endnote bps) = concatMapM bodyPartToBlocks bps >>= (\blks -> return [Note blks]) -makeDataUrl :: String -> B.ByteString -> Maybe String -makeDataUrl fp bs = - case getMimeType fp of - Just mime -> Just $ "data:" ++ mime ++ ";base64," ++ - toString (encode $ BS.concat $ B.toChunks bs) - Nothing -> Nothing - parPartToInlines :: ParPart -> DocxContext [Inline] parPartToInlines (PlainRun r) = runToInlines r parPartToInlines (Insertion _ author date runs) = do @@ -372,11 +368,9 @@ parPartToInlines (BookMark _ anchor) = modify $ \s -> s { docxAnchorMap = M.insert anchor newAnchor anchorMap} return [Span (newAnchor, ["anchor"], []) []] parPartToInlines (Drawing fp bs) = do - return $ case True of -- TODO: add self-contained images - True -> [Image [] (fp, "")] - False -> case makeDataUrl fp bs of - Just d -> [Image [] (d, "")] - Nothing -> [Image [] ("", "")] + mediaBag <- gets docxMediaBag + modify $ \s -> s { docxMediaBag = M.insert fp bs mediaBag} + return [Image [] (fp, "")] parPartToInlines (InternalHyperLink anchor runs) = do ils <- concatMapM runToInlines runs return [Link ils ('#' : anchor, "")] @@ -675,26 +669,25 @@ rewriteLink l@(Link ils ('#':target, title)) = do Nothing -> l rewriteLink il = return il -bodyToMetaAndBlocks :: Body -> DocxContext (Meta, [Block]) -bodyToMetaAndBlocks (Body bps) = do +bodyToOutput :: Body -> DocxContext (Meta, [Block], MediaBag) +bodyToOutput (Body bps) = do let (metabps, blkbps) = sepBodyParts bps meta <- bodyPartsToMeta metabps blks <- concatMapM bodyPartToBlocks blkbps >>= walkM rewriteLink - return $ - (meta, - blocksToDefinitions $ - blocksToBullets $ blks) - -docxToMetaAndBlocks :: ReaderOptions -> Docx -> (Meta, [Block]) -docxToMetaAndBlocks opts d@(Docx (Document _ body)) = - let dState = DState { docxAnchorMap = M.empty - , docxInHeaderBlock = False - , docxInTexSubscript = False} + mediaBag <- gets docxMediaBag + return $ (meta, + blocksToDefinitions $ blocksToBullets $ blks, + mediaBag) + +docxToOutput :: ReaderOptions -> Docx -> (Meta, [Block], MediaBag) +docxToOutput opts d@(Docx (Document _ body)) = + let dState = defaultDState dEnv = DEnv { docxOptions = opts , docxDocument = d} in - evalDocxContext (bodyToMetaAndBlocks body) dEnv dState + evalDocxContext (bodyToOutput body) dEnv dState + ilToCode :: Inline -> String ilToCode (Str s) = s diff --git a/src/Text/Pandoc/Readers/Docx/Parse.hs b/src/Text/Pandoc/Readers/Docx/Parse.hs index 8541a1a3a..71938afe0 100644 --- a/src/Text/Pandoc/Readers/Docx/Parse.hs +++ b/src/Text/Pandoc/Readers/Docx/Parse.hs @@ -779,11 +779,11 @@ expandDrawingId :: String -> D ParPart expandDrawingId s = do target <- asks (lookupRelationship s . envRelationships) case target of - Just t -> do let filepath = combine "word" t - bytes <- asks (lookup filepath . envMedia) - case bytes of - Just bs -> return $ Drawing filepath bs - Nothing -> throwError DocxError + Just filepath -> do + bytes <- asks (lookup (combine "word" filepath) . envMedia) + case bytes of + Just bs -> return $ Drawing filepath bs + Nothing -> throwError DocxError Nothing -> throwError DocxError elemToParPart :: NameSpaces -> Element -> D ParPart diff --git a/src/Text/Pandoc/Shared.hs b/src/Text/Pandoc/Shared.hs index 77180bd4b..0282eadf6 100644 --- a/src/Text/Pandoc/Shared.hs +++ b/src/Text/Pandoc/Shared.hs @@ -48,6 +48,8 @@ module Text.Pandoc.Shared ( toRomanNumeral, escapeURI, tabFilter, + -- * Media Handling + MediaBag, -- * Date/time normalizeDate, -- * Pandoc block and inline list processing @@ -114,6 +116,7 @@ import Text.HTML.TagSoup (renderTagsOptions, RenderOptions(..), Tag(..), renderOptions) import qualified Data.ByteString as BS import qualified Data.ByteString.Char8 as B8 +import qualified Data.ByteString.Lazy as BL import Text.Pandoc.Compat.Monoid import Data.ByteString.Base64 (decodeLenient) import Data.Sequence (ViewR(..), ViewL(..), viewl, viewr) @@ -280,6 +283,14 @@ tabFilter tabStop = x : go (spsToNextStop - 1) xs in go tabStop +--- +--- Media handling +--- + +-- | A map of media paths to their binary representations. + +type MediaBag = M.Map String BL.ByteString + -- -- Date/time -- diff --git a/tests/Tests/Readers/Docx.hs b/tests/Tests/Readers/Docx.hs index b1a966969..e8dd6b72e 100644 --- a/tests/Tests/Readers/Docx.hs +++ b/tests/Tests/Readers/Docx.hs @@ -37,7 +37,8 @@ compareOutput :: ReaderOptions compareOutput opts docxFile nativeFile = do df <- B.readFile docxFile nf <- Prelude.readFile nativeFile - return $ (noNorm (readDocx opts df), noNorm (readNative nf)) + let (p, _) = readDocx opts df + return $ (noNorm p, noNorm (readNative nf)) testCompareWithOptsIO :: ReaderOptions -> String -> FilePath -> FilePath -> IO Test testCompareWithOptsIO opts name docxFile nativeFile = do diff --git a/tests/docx.image_no_embed.native b/tests/docx.image_no_embed.native index 063958bc7..aa0f65d27 100644 --- a/tests/docx.image_no_embed.native +++ b/tests/docx.image_no_embed.native @@ -1,2 +1,2 @@ [Header 2 ("an-image",[],[]) [Str "An",Space,Str "image"] -,Para [Image [] ("word/media/image1.jpeg","")]] +,Para [Image [] ("media/image1.jpeg","")]] |