From 9ce229570056ddb67f95d6b5c7b58ccf6d0413fd Mon Sep 17 00:00:00 2001 From: Jesse Rosenthal Date: Wed, 30 Jul 2014 12:46:03 -0400 Subject: Docx reader: Make docx reader put image data in MediaBag. Image data will not be put in a media bag map, which will be output along with the pandoc output. --- src/Text/Pandoc/Readers/Docx.hs | 57 +++++++++++++++-------------------- src/Text/Pandoc/Readers/Docx/Parse.hs | 10 +++--- 2 files changed, 30 insertions(+), 37 deletions(-) (limited to 'src/Text') diff --git a/src/Text/Pandoc/Readers/Docx.hs b/src/Text/Pandoc/Readers/Docx.hs index 48a23dd3c..f5fb6565a 100644 --- a/src/Text/Pandoc/Readers/Docx.hs +++ b/src/Text/Pandoc/Readers/Docx.hs @@ -78,8 +78,6 @@ import Codec.Archive.Zip import Text.Pandoc.Definition import Text.Pandoc.Options import Text.Pandoc.Builder (text, toList) -import Text.Pandoc.MIME (getMimeType) -import Text.Pandoc.UTF8 (toString) import Text.Pandoc.Walk import Text.Pandoc.Readers.Docx.Parse import Text.Pandoc.Readers.Docx.Lists @@ -88,9 +86,7 @@ import Text.Pandoc.Readers.Docx.TexChar import Text.Pandoc.Shared import Data.Maybe (mapMaybe, fromMaybe) import Data.List (delete, isPrefixOf, (\\), intercalate, intersect) -import qualified Data.ByteString as BS import qualified Data.ByteString.Lazy as B -import Data.ByteString.Base64 (encode) import qualified Data.Map as M import Control.Monad.Reader import Control.Monad.State @@ -98,17 +94,24 @@ import Text.Printf (printf) readDocx :: ReaderOptions -> B.ByteString - -> Pandoc + -> (Pandoc, MediaBag) readDocx opts bytes = case archiveToDocx (toArchive bytes) of - Right docx -> Pandoc meta blks where - (meta, blks) = (docxToMetaAndBlocks opts docx) + Right docx -> (Pandoc meta blks, mediaBag) where + (meta, blks, mediaBag) = (docxToOutput opts docx) Left _ -> error $ "couldn't parse docx file" data DState = DState { docxAnchorMap :: M.Map String String + , docxMediaBag :: MediaBag , docxInHeaderBlock :: Bool , docxInTexSubscript :: Bool } +defaultDState :: DState +defaultDState = DState { docxAnchorMap = M.empty + , docxMediaBag = M.empty + , docxInHeaderBlock = False + , docxInTexSubscript = False} + data DEnv = DEnv { docxOptions :: ReaderOptions , docxDocument :: Docx} @@ -319,13 +322,6 @@ runToInlines (Footnote bps) = runToInlines (Endnote bps) = concatMapM bodyPartToBlocks bps >>= (\blks -> return [Note blks]) -makeDataUrl :: String -> B.ByteString -> Maybe String -makeDataUrl fp bs = - case getMimeType fp of - Just mime -> Just $ "data:" ++ mime ++ ";base64," ++ - toString (encode $ BS.concat $ B.toChunks bs) - Nothing -> Nothing - parPartToInlines :: ParPart -> DocxContext [Inline] parPartToInlines (PlainRun r) = runToInlines r parPartToInlines (Insertion _ author date runs) = do @@ -372,11 +368,9 @@ parPartToInlines (BookMark _ anchor) = modify $ \s -> s { docxAnchorMap = M.insert anchor newAnchor anchorMap} return [Span (newAnchor, ["anchor"], []) []] parPartToInlines (Drawing fp bs) = do - return $ case True of -- TODO: add self-contained images - True -> [Image [] (fp, "")] - False -> case makeDataUrl fp bs of - Just d -> [Image [] (d, "")] - Nothing -> [Image [] ("", "")] + mediaBag <- gets docxMediaBag + modify $ \s -> s { docxMediaBag = M.insert fp bs mediaBag} + return [Image [] (fp, "")] parPartToInlines (InternalHyperLink anchor runs) = do ils <- concatMapM runToInlines runs return [Link ils ('#' : anchor, "")] @@ -675,26 +669,25 @@ rewriteLink l@(Link ils ('#':target, title)) = do Nothing -> l rewriteLink il = return il -bodyToMetaAndBlocks :: Body -> DocxContext (Meta, [Block]) -bodyToMetaAndBlocks (Body bps) = do +bodyToOutput :: Body -> DocxContext (Meta, [Block], MediaBag) +bodyToOutput (Body bps) = do let (metabps, blkbps) = sepBodyParts bps meta <- bodyPartsToMeta metabps blks <- concatMapM bodyPartToBlocks blkbps >>= walkM rewriteLink - return $ - (meta, - blocksToDefinitions $ - blocksToBullets $ blks) - -docxToMetaAndBlocks :: ReaderOptions -> Docx -> (Meta, [Block]) -docxToMetaAndBlocks opts d@(Docx (Document _ body)) = - let dState = DState { docxAnchorMap = M.empty - , docxInHeaderBlock = False - , docxInTexSubscript = False} + mediaBag <- gets docxMediaBag + return $ (meta, + blocksToDefinitions $ blocksToBullets $ blks, + mediaBag) + +docxToOutput :: ReaderOptions -> Docx -> (Meta, [Block], MediaBag) +docxToOutput opts d@(Docx (Document _ body)) = + let dState = defaultDState dEnv = DEnv { docxOptions = opts , docxDocument = d} in - evalDocxContext (bodyToMetaAndBlocks body) dEnv dState + evalDocxContext (bodyToOutput body) dEnv dState + ilToCode :: Inline -> String ilToCode (Str s) = s diff --git a/src/Text/Pandoc/Readers/Docx/Parse.hs b/src/Text/Pandoc/Readers/Docx/Parse.hs index 8541a1a3a..71938afe0 100644 --- a/src/Text/Pandoc/Readers/Docx/Parse.hs +++ b/src/Text/Pandoc/Readers/Docx/Parse.hs @@ -779,11 +779,11 @@ expandDrawingId :: String -> D ParPart expandDrawingId s = do target <- asks (lookupRelationship s . envRelationships) case target of - Just t -> do let filepath = combine "word" t - bytes <- asks (lookup filepath . envMedia) - case bytes of - Just bs -> return $ Drawing filepath bs - Nothing -> throwError DocxError + Just filepath -> do + bytes <- asks (lookup (combine "word" filepath) . envMedia) + case bytes of + Just bs -> return $ Drawing filepath bs + Nothing -> throwError DocxError Nothing -> throwError DocxError elemToParPart :: NameSpaces -> Element -> D ParPart -- cgit v1.2.3