From 9ce229570056ddb67f95d6b5c7b58ccf6d0413fd Mon Sep 17 00:00:00 2001 From: Jesse Rosenthal Date: Wed, 30 Jul 2014 12:46:03 -0400 Subject: Docx reader: Make docx reader put image data in MediaBag. Image data will not be put in a media bag map, which will be output along with the pandoc output. --- src/Text/Pandoc/Readers/Docx.hs | 57 +++++++++++++++-------------------- src/Text/Pandoc/Readers/Docx/Parse.hs | 10 +++--- 2 files changed, 30 insertions(+), 37 deletions(-) (limited to 'src/Text') diff --git a/src/Text/Pandoc/Readers/Docx.hs b/src/Text/Pandoc/Readers/Docx.hs index 48a23dd3c..f5fb6565a 100644 --- a/src/Text/Pandoc/Readers/Docx.hs +++ b/src/Text/Pandoc/Readers/Docx.hs @@ -78,8 +78,6 @@ import Codec.Archive.Zip import Text.Pandoc.Definition import Text.Pandoc.Options import Text.Pandoc.Builder (text, toList) -import Text.Pandoc.MIME (getMimeType) -import Text.Pandoc.UTF8 (toString) import Text.Pandoc.Walk import Text.Pandoc.Readers.Docx.Parse import Text.Pandoc.Readers.Docx.Lists @@ -88,9 +86,7 @@ import Text.Pandoc.Readers.Docx.TexChar import Text.Pandoc.Shared import Data.Maybe (mapMaybe, fromMaybe) import Data.List (delete, isPrefixOf, (\\), intercalate, intersect) -import qualified Data.ByteString as BS import qualified Data.ByteString.Lazy as B -import Data.ByteString.Base64 (encode) import qualified Data.Map as M import Control.Monad.Reader import Control.Monad.State @@ -98,17 +94,24 @@ import Text.Printf (printf) readDocx :: ReaderOptions -> B.ByteString - -> Pandoc + -> (Pandoc, MediaBag) readDocx opts bytes = case archiveToDocx (toArchive bytes) of - Right docx -> Pandoc meta blks where - (meta, blks) = (docxToMetaAndBlocks opts docx) + Right docx -> (Pandoc meta blks, mediaBag) where + (meta, blks, mediaBag) = (docxToOutput opts docx) Left _ -> error $ "couldn't parse docx file" data DState = DState { docxAnchorMap :: M.Map String String + , docxMediaBag :: MediaBag , docxInHeaderBlock :: Bool , docxInTexSubscript :: Bool } +defaultDState :: DState +defaultDState = DState { docxAnchorMap = M.empty + , docxMediaBag = M.empty + , docxInHeaderBlock = False + , docxInTexSubscript = False} + data DEnv = DEnv { docxOptions :: ReaderOptions , docxDocument :: Docx} @@ -319,13 +322,6 @@ runToInlines (Footnote bps) = runToInlines (Endnote bps) = concatMapM bodyPartToBlocks bps >>= (\blks -> return [Note blks]) -makeDataUrl :: String -> B.ByteString -> Maybe String -makeDataUrl fp bs = - case getMimeType fp of - Just mime -> Just $ "data:" ++ mime ++ ";base64," ++ - toString (encode $ BS.concat $ B.toChunks bs) - Nothing -> Nothing - parPartToInlines :: ParPart -> DocxContext [Inline] parPartToInlines (PlainRun r) = runToInlines r parPartToInlines (Insertion _ author date runs) = do @@ -372,11 +368,9 @@ parPartToInlines (BookMark _ anchor) = modify $ \s -> s { docxAnchorMap = M.insert anchor newAnchor anchorMap} return [Span (newAnchor, ["anchor"], []) []] parPartToInlines (Drawing fp bs) = do - return $ case True of -- TODO: add self-contained images - True -> [Image [] (fp, "")] - False -> case makeDataUrl fp bs of - Just d -> [Image [] (d, "")] - Nothing -> [Image [] ("", "")] + mediaBag <- gets docxMediaBag + modify $ \s -> s { docxMediaBag = M.insert fp bs mediaBag} + return [Image [] (fp, "")] parPartToInlines (InternalHyperLink anchor runs) = do ils <- concatMapM runToInlines runs return [Link ils ('#' : anchor, "")] @@ -675,26 +669,25 @@ rewriteLink l@(Link ils ('#':target, title)) = do Nothing -> l rewriteLink il = return il -bodyToMetaAndBlocks :: Body -> DocxContext (Meta, [Block]) -bodyToMetaAndBlocks (Body bps) = do +bodyToOutput :: Body -> DocxContext (Meta, [Block], MediaBag) +bodyToOutput (Body bps) = do let (metabps, blkbps) = sepBodyParts bps meta <- bodyPartsToMeta metabps blks <- concatMapM bodyPartToBlocks blkbps >>= walkM rewriteLink - return $ - (meta, - blocksToDefinitions $ - blocksToBullets $ blks) - -docxToMetaAndBlocks :: ReaderOptions -> Docx -> (Meta, [Block]) -docxToMetaAndBlocks opts d@(Docx (Document _ body)) = - let dState = DState { docxAnchorMap = M.empty - , docxInHeaderBlock = False - , docxInTexSubscript = False} + mediaBag <- gets docxMediaBag + return $ (meta, + blocksToDefinitions $ blocksToBullets $ blks, + mediaBag) + +docxToOutput :: ReaderOptions -> Docx -> (Meta, [Block], MediaBag) +docxToOutput opts d@(Docx (Document _ body)) = + let dState = defaultDState dEnv = DEnv { docxOptions = opts , docxDocument = d} in - evalDocxContext (bodyToMetaAndBlocks body) dEnv dState + evalDocxContext (bodyToOutput body) dEnv dState + ilToCode :: Inline -> String ilToCode (Str s) = s diff --git a/src/Text/Pandoc/Readers/Docx/Parse.hs b/src/Text/Pandoc/Readers/Docx/Parse.hs index 8541a1a3a..71938afe0 100644 --- a/src/Text/Pandoc/Readers/Docx/Parse.hs +++ b/src/Text/Pandoc/Readers/Docx/Parse.hs @@ -779,11 +779,11 @@ expandDrawingId :: String -> D ParPart expandDrawingId s = do target <- asks (lookupRelationship s . envRelationships) case target of - Just t -> do let filepath = combine "word" t - bytes <- asks (lookup filepath . envMedia) - case bytes of - Just bs -> return $ Drawing filepath bs - Nothing -> throwError DocxError + Just filepath -> do + bytes <- asks (lookup (combine "word" filepath) . envMedia) + case bytes of + Just bs -> return $ Drawing filepath bs + Nothing -> throwError DocxError Nothing -> throwError DocxError elemToParPart :: NameSpaces -> Element -> D ParPart -- cgit v1.2.3 From f78d2f62199a7bab0ae1fccafda6d97dd9681f12 Mon Sep 17 00:00:00 2001 From: Jesse Rosenthal Date: Wed, 30 Jul 2014 12:47:26 -0400 Subject: Shared: Make MediaBag available through Shared. --- src/Text/Pandoc/Shared.hs | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'src/Text') diff --git a/src/Text/Pandoc/Shared.hs b/src/Text/Pandoc/Shared.hs index 77180bd4b..0282eadf6 100644 --- a/src/Text/Pandoc/Shared.hs +++ b/src/Text/Pandoc/Shared.hs @@ -48,6 +48,8 @@ module Text.Pandoc.Shared ( toRomanNumeral, escapeURI, tabFilter, + -- * Media Handling + MediaBag, -- * Date/time normalizeDate, -- * Pandoc block and inline list processing @@ -114,6 +116,7 @@ import Text.HTML.TagSoup (renderTagsOptions, RenderOptions(..), Tag(..), renderOptions) import qualified Data.ByteString as BS import qualified Data.ByteString.Char8 as B8 +import qualified Data.ByteString.Lazy as BL import Text.Pandoc.Compat.Monoid import Data.ByteString.Base64 (decodeLenient) import Data.Sequence (ViewR(..), ViewL(..), viewl, viewr) @@ -280,6 +283,14 @@ tabFilter tabStop = x : go (spsToNextStop - 1) xs in go tabStop +--- +--- Media handling +--- + +-- | A map of media paths to their binary representations. + +type MediaBag = M.Map String BL.ByteString + -- -- Date/time -- -- cgit v1.2.3 From a79ea1850389b09ddc1f3046750b3481d0e10b60 Mon Sep 17 00:00:00 2001 From: Jesse Rosenthal Date: Wed, 30 Jul 2014 12:47:49 -0400 Subject: Pandoc.hs: change BSReader to output MediaBag as well as pandoc. --- src/Text/Pandoc.hs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'src/Text') diff --git a/src/Text/Pandoc.hs b/src/Text/Pandoc.hs index be34641a9..11553383c 100644 --- a/src/Text/Pandoc.hs +++ b/src/Text/Pandoc.hs @@ -160,7 +160,7 @@ import Text.Pandoc.Writers.Haddock import Text.Pandoc.Writers.Custom import Text.Pandoc.Templates import Text.Pandoc.Options -import Text.Pandoc.Shared (safeRead, warn) +import Text.Pandoc.Shared (safeRead, warn, MediaBag) import Data.Aeson import qualified Data.ByteString.Lazy as BL import Data.List (intercalate) @@ -204,12 +204,12 @@ markdown o s = do return doc data Reader = StringReader (ReaderOptions -> String -> IO Pandoc) - | ByteStringReader (ReaderOptions -> BL.ByteString -> IO Pandoc) + | ByteStringReader (ReaderOptions -> BL.ByteString -> IO (Pandoc, MediaBag)) mkStringReader :: (ReaderOptions -> String -> Pandoc) -> Reader mkStringReader r = StringReader (\o s -> return $ r o s) -mkBSReader :: (ReaderOptions -> BL.ByteString -> Pandoc) -> Reader +mkBSReader :: (ReaderOptions -> BL.ByteString -> (Pandoc, MediaBag)) -> Reader mkBSReader r = ByteStringReader (\o s -> return $ r o s) -- | Association list of formats and readers. -- cgit v1.2.3