Merge pull request #3165 from hubertp-lshift/feature/odt-image

[odt] images parser
author: John MacFarlane <jgm@berkeley.edu> 2016-10-18 22:00:58 +0200
committer: GitHub <noreply@github.com> 2016-10-18 22:00:58 +0200
commit: 0cd11b3e5404fa6fca7538098bcf315343d1a237 (patch)
tree: 39057cfcb282e2e7b901dc12510ecffc3baad25b /src
parent: 8264ae2abe976184086a5a40c3d082f5e3e99ca5 (diff)
parent: 4417e33ea9d49a2001091adb4d2b19ebdefe5795 (diff)
download: pandoc-0cd11b3e5404fa6fca7538098bcf315343d1a237.tar.gz
4 files changed, 151 insertions, 38 deletions
diff --git a/src/Text/Pandoc/Readers/Docx/Parse.hs b/src/Text/Pandoc/Readers/Docx/Parse.hs
index b9021ec08..7b9779105 100644
--- a/src/Text/Pandoc/Readers/Docx/Parse.hs
+++ b/src/Text/Pandoc/Readers/Docx/Parse.hs
@@ -65,7 +65,7 @@ import Control.Monad.State
 import Control.Applicative ((<|>))
 import qualified Data.Map as M
 import Control.Monad.Except
-import Text.Pandoc.Shared (safeRead)
+import Text.Pandoc.Shared (safeRead, filteredFilesFromArchive)
 import Text.TeXMath.Readers.OMML (readOMML)
 import Text.Pandoc.Readers.Docx.Fonts (getUnicode, Font(..))
 import Text.TeXMath (Exp)
@@ -86,7 +86,6 @@ data ReaderEnv = ReaderEnv { envNotes         :: Notes
 
 data ReaderState = ReaderState { stateWarnings :: [String] }
                  deriving Show
-                                                  
 
 data DocxError = DocxError | WrongElem
                deriving Show
@@ -276,7 +275,7 @@ archiveToDocxWithWarnings archive = do
       comments  = archiveToComments archive
       numbering = archiveToNumbering archive
       rels      = archiveToRelationships archive
-      media     = archiveToMedia archive
+      media     = filteredFilesFromArchive archive filePathIsMedia
       (styles, parstyles) = archiveToStyles archive
       rEnv =
         ReaderEnv notes comments numbering rels media Nothing styles parstyles InDocument
@@ -402,7 +401,6 @@ archiveToComments zf =
     case cmts of
       Just c -> Comments cmts_namespaces c
       Nothing -> Comments cmts_namespaces M.empty
-               
 
 filePathToRelType :: FilePath -> Maybe DocumentLocation
 filePathToRelType "word/_rels/document.xml.rels" = Just InDocument
@@ -424,7 +422,7 @@ filePathToRelationships ar fp | Just relType <- filePathToRelType fp
                               , Just relElems <- (parseXMLDoc . UTF8.toStringLazy . fromEntry) entry =
   mapMaybe (relElemToRelationship relType) $ elChildren relElems
 filePathToRelationships _ _ = []
-                               
+
 archiveToRelationships :: Archive -> [Relationship]
 archiveToRelationships archive =
   concatMap (filePathToRelationships archive) $ filesInArchive archive
@@ -435,16 +433,6 @@ filePathIsMedia fp =
   in
    (dir == "word/media/")
 
-getMediaPair :: Archive -> FilePath -> Maybe (FilePath, B.ByteString)
-getMediaPair zf fp =
-  case findEntryByPath fp zf of
-    Just e -> Just (fp, fromEntry e)
-    Nothing -> Nothing
-
-archiveToMedia :: Archive -> Media
-archiveToMedia zf =
-  mapMaybe (getMediaPair zf) (filter filePathIsMedia (filesInArchive zf))
-
 lookupLevel :: String -> String -> Numbering -> Maybe Level
 lookupLevel numId ilvl (Numbering _ numbs absNumbs) = do
   absNumId <- lookup numId $ map (\(Numb nid absnumid) -> (nid, absnumid)) numbs
@@ -741,7 +729,7 @@ elemToCommentStart ns element
   , Just cmtDate <- findAttr (elemName ns "w" "date") element = do
       bps <- mapD (elemToBodyPart ns) (elChildren element)
       return $ CommentStart cmtId cmtAuthor cmtDate bps
-elemToCommentStart _ _ = throwError WrongElem      
+elemToCommentStart _ _ = throwError WrongElem
 
 lookupFootnote :: String -> Notes -> Maybe Element
 lookupFootnote s (Notes _ fns _) = fns >>= (M.lookup s)
diff --git a/src/Text/Pandoc/Readers/Odt.hs b/src/Text/Pandoc/Readers/Odt.hs
index 68e89263c..046fb4d6d 100644
--- a/src/Text/Pandoc/Readers/Odt.hs
+++ b/src/Text/Pandoc/Readers/Odt.hs
@@ -37,6 +37,8 @@ import qualified Text.XML.Light                        as XML
 
 import qualified Data.ByteString.Lazy                  as B
 
+import           System.FilePath
+
 import           Text.Pandoc.Definition
 import           Text.Pandoc.Error
 import           Text.Pandoc.Options
@@ -48,39 +50,49 @@ import           Text.Pandoc.Readers.Odt.StyleReader
 
 import           Text.Pandoc.Readers.Odt.Generic.XMLConverter
 import           Text.Pandoc.Readers.Odt.Generic.Fallible
+import           Text.Pandoc.Shared (filteredFilesFromArchive)
 
 --
 readOdt :: ReaderOptions
         -> B.ByteString
         -> Either PandocError (Pandoc, MediaBag)
-readOdt _ bytes = case bytesToOdt bytes of
-                    Right pandoc -> Right (pandoc , mempty)
-                    Left  err    -> Left err
+readOdt _ bytes = bytesToOdt bytes-- of
+--                    Right (pandoc, mediaBag) -> Right (pandoc , mediaBag)
+--                    Left  err                -> Left err
 
 --
-bytesToOdt :: B.ByteString -> Either PandocError Pandoc
+bytesToOdt :: B.ByteString -> Either PandocError (Pandoc, MediaBag)
 bytesToOdt bytes = case toArchiveOrFail bytes of
   Right archive -> archiveToOdt archive
   Left _        -> Left $ ParseFailure "Couldn't parse odt file."
 
 --
-archiveToOdt :: Archive -> Either PandocError Pandoc
+archiveToOdt :: Archive -> Either PandocError (Pandoc, MediaBag)
 archiveToOdt archive
-  | Just contentEntry <- findEntryByPath "content.xml" archive
-  , Just stylesEntry  <- findEntryByPath "styles.xml"  archive
-  , Just contentElem  <- entryToXmlElem contentEntry
-  , Just stylesElem   <- entryToXmlElem stylesEntry
-  , Right styles      <- chooseMax (readStylesAt stylesElem )
-                                   (readStylesAt contentElem)
-  , startState        <- readerState styles
-  , Right pandoc      <- runConverter' read_body
-                                       startState
-                                       contentElem
-  = Right pandoc
+  | Just contentEntry      <- findEntryByPath "content.xml" archive
+  , Just stylesEntry       <- findEntryByPath "styles.xml"  archive
+  , Just contentElem       <- entryToXmlElem contentEntry
+  , Just stylesElem        <- entryToXmlElem stylesEntry
+  , Right styles           <- chooseMax (readStylesAt stylesElem )
+                                        (readStylesAt contentElem)
+  , media                  <- filteredFilesFromArchive archive filePathIsOdtMedia
+  , startState             <- readerState styles media
+  , Right pandocWithMedia  <- runConverter' read_body
+                                            startState
+                                            contentElem
+
+  = Right pandocWithMedia
 
   | otherwise
     -- Not very detailed, but I don't think more information would be helpful
   = Left $ ParseFailure "Couldn't parse odt file."
+    where
+      filePathIsOdtMedia :: FilePath -> Bool
+      filePathIsOdtMedia fp =
+        let (dir, _) = splitFileName fp
+        in
+         (dir == "Pictures/")
+
 
 --
 entryToXmlElem :: Entry -> Maybe XML.Element
diff --git a/src/Text/Pandoc/Readers/Odt/ContentReader.hs b/src/Text/Pandoc/Readers/Odt/ContentReader.hs
index 35fbc3731..11d39498c 100644
--- a/src/Text/Pandoc/Readers/Odt/ContentReader.hs
+++ b/src/Text/Pandoc/Readers/Odt/ContentReader.hs
@@ -42,6 +42,7 @@ module Text.Pandoc.Readers.Odt.ContentReader
 import           Control.Arrow
 import           Control.Applicative    hiding ( liftA, liftA2, liftA3 )
 
+import qualified Data.ByteString.Lazy   as B
 import qualified Data.Map               as M
 import           Data.List                     ( find )
 import           Data.Maybe
@@ -50,6 +51,7 @@ import qualified Text.XML.Light         as XML
 
 import           Text.Pandoc.Definition
 import           Text.Pandoc.Builder
+import           Text.Pandoc.MediaBag (insertMedia, MediaBag)
 import           Text.Pandoc.Shared
 
 import           Text.Pandoc.Readers.Odt.Base
@@ -68,6 +70,7 @@ import qualified Data.Set as Set
 --------------------------------------------------------------------------------
 
 type Anchor = String
+type Media = [(FilePath, B.ByteString)]
 
 data ReaderState
    = ReaderState { -- | A collection of styles read somewhere else.
@@ -87,14 +90,17 @@ data ReaderState
                    -- | A map from internal anchor names to "pretty" ones.
                    -- The mapping is a purely cosmetic one.
                  , bookmarkAnchors  :: M.Map Anchor Anchor
-
+                   -- | A map of files / binary data from the archive
+                 , envMedia         :: Media
+                   -- | Hold binary resources used in the document
+                 , odtMediaBag      :: MediaBag
 --               , sequences
 --               , trackedChangeIDs
                  }
   deriving ( Show )
 
-readerState :: Styles -> ReaderState
-readerState styles = ReaderState styles [] 0 Nothing M.empty
+readerState :: Styles -> Media -> ReaderState
+readerState styles media = ReaderState styles [] 0 Nothing M.empty media mempty
 
 --
 pushStyle'  :: Style -> ReaderState -> ReaderState
@@ -134,6 +140,16 @@ putPrettyAnchor ugly pretty state@ReaderState{..}
 usedAnchors :: ReaderState -> [Anchor]
 usedAnchors ReaderState{..} = M.elems bookmarkAnchors
 
+getMediaBag :: ReaderState -> MediaBag
+getMediaBag ReaderState{..} = odtMediaBag
+
+getMediaEnv :: ReaderState -> Media
+getMediaEnv ReaderState{..} = envMedia
+
+insertMedia' :: (FilePath, B.ByteString) -> ReaderState ->  ReaderState
+insertMedia' (fp, bs) state@ReaderState{..}
+  = state { odtMediaBag = insertMedia fp Nothing bs odtMediaBag }
+
 --------------------------------------------------------------------------------
 -- Reader type and associated tools
 --------------------------------------------------------------------------------
@@ -190,6 +206,22 @@ popStyle =     keepingTheValue (
 getCurrentListLevel :: OdtReaderSafe _x ListLevel
 getCurrentListLevel = getExtraState >>^ currentListLevel
 
+--
+updateMediaWithResource :: OdtReaderSafe (FilePath, B.ByteString) (FilePath, B.ByteString)
+updateMediaWithResource = keepingTheValue (
+                 (keepingTheValue getExtraState
+                  >>% insertMedia'
+                  )
+                 >>> setExtraState
+               )
+           >>^ fst
+
+lookupResource :: OdtReaderSafe String (FilePath, B.ByteString)
+lookupResource = proc target -> do
+    state <- getExtraState -< ()
+    case lookup target (getMediaEnv state) of
+      Just bs -> returnV (target, bs) -<< ()
+      Nothing -> returnV ("", B.empty) -< ()
 
 type AnchorPrefix = String
 
@@ -511,6 +543,10 @@ read_plain_text =  fst ^&&& read_plain_text' >>% recover
     extractText (XML.Text cData) = succeedWith (XML.cdData cData)
     extractText         _        = failEmpty
 
+read_text_seq :: InlineMatcher
+read_text_seq  = matchingElement NsText "sequence"
+                 $ matchChildContent [] read_plain_text
+
 
 -- specifically. I honor that, although the current implementation of '(<>)'
 -- for 'Inlines' in "Text.Pandoc.Builder" will collaps them agein.
@@ -559,6 +595,8 @@ read_paragraph    = matchingElement NsText "p"
                                         , read_reference_start
                                         , read_bookmark_ref
                                         , read_reference_ref
+                                        , read_maybe_nested_img_frame
+                                        , read_text_seq
                                         ] read_plain_text
 
 
@@ -583,6 +621,7 @@ read_header       = matchingElement NsText "h"
                                   , read_reference_start
                                   , read_bookmark_ref
                                   , read_reference_ref
+                                  , read_maybe_nested_img_frame
                                   ] read_plain_text
               ) -< blocks
   anchor   <- getHeaderAnchor -< children
@@ -688,6 +727,64 @@ read_table_cell    = matchingElement NsTable "table-cell"
                                           ]
 
 ----------------------
+-- Images
+----------------------
+
+--
+read_maybe_nested_img_frame  :: InlineMatcher
+read_maybe_nested_img_frame   = matchingElement NsDraw "frame"
+                                $ proc blocks -> do
+                                   img <- (findChild' NsDraw "image") -< ()
+                                   case img of
+                                     Just _  ->  read_frame                                 -< blocks
+                                     Nothing ->  matchChildContent' [ read_frame_text_box ] -< blocks
+
+read_frame :: OdtReaderSafe Inlines Inlines
+read_frame =
+  proc blocks -> do
+   w          <- ( findAttr' NsSVG "width" )                 -< ()
+   h          <- ( findAttr' NsSVG "height" )                -< ()
+   titleNodes <- ( matchChildContent' [ read_frame_title ] ) -< blocks
+   src        <-  matchChildContent' [ read_image_src ]      -< blocks
+   resource   <- lookupResource                              -< src
+   _          <- updateMediaWithResource                     -< resource
+   alt        <- (matchChildContent [] read_plain_text)      -< blocks
+   arr (uncurry4 imageWith ) -<
+                (image_attributes w h, src, inlineListToIdentifier (toList titleNodes), alt)
+
+image_attributes :: Maybe String -> Maybe String -> Attr
+image_attributes x y =
+  ( "", [], (dim "width" x) ++ (dim "height" y))
+  where
+    dim _ (Just "")   = []
+    dim name (Just v) = [(name, v)]
+    dim _ Nothing     = []
+
+read_image_src :: (Namespace, ElementName, OdtReader Anchor Anchor)
+read_image_src = matchingElement NsDraw "image"
+                 $ proc _ -> do
+                    imgSrc <- findAttr NsXLink "href" -< ()
+                    case imgSrc of
+                      Right src -> returnV src -<< ()
+                      Left _    -> returnV ""  -< ()
+
+read_frame_title :: InlineMatcher
+read_frame_title = matchingElement NsSVG "title"
+                   $ (matchChildContent [] read_plain_text)
+
+read_frame_text_box :: InlineMatcher
+read_frame_text_box = matchingElement NsDraw "text-box"
+                      $ proc blocks -> do
+                         paragraphs <- (matchChildContent' [ read_paragraph ]) -< blocks
+                         arr read_img_with_caption                             -< toList paragraphs
+
+read_img_with_caption :: [Block] -> Inlines
+read_img_with_caption ((Para ((Image attr _ target) : txt)) : _) =
+  singleton (Image attr txt target)             -- override caption with the text that follows
+read_img_with_caption _ =
+  mempty
+
+----------------------
 -- Internal links
 ----------------------
 
@@ -783,8 +880,11 @@ read_text = matchChildContent' [ read_header
                                ]
             >>^ doc
 
-read_body :: OdtReader _x Pandoc
+read_body :: OdtReader _x (Pandoc, MediaBag)
 read_body = executeIn NsOffice "body"
           $ executeIn NsOffice "text"
-          $ liftAsSuccess read_text
-
+          $ liftAsSuccess
+          $ proc inlines -> do
+             txt   <- read_text     -< inlines
+             state <- getExtraState -< ()
+             returnA                -< (txt, getMediaBag state)
diff --git a/src/Text/Pandoc/Shared.hs b/src/Text/Pandoc/Shared.hs
index 407ff97db..4c10a5572 100644
--- a/src/Text/Pandoc/Shared.hs
+++ b/src/Text/Pandoc/Shared.hs
@@ -68,6 +68,7 @@ module Text.Pandoc.Shared (
                      Element (..),
                      hierarchicalize,
                      uniqueIdent,
+                     inlineListToIdentifier,
                      isHeaderBlock,
                      headerShift,
                      isTightList,
@@ -85,6 +86,7 @@ module Text.Pandoc.Shared (
                      fetchItem',
                      openURL,
                      collapseFilePath,
+                     filteredFilesFromArchive,
                      -- * Error handling
                      err,
                      warn,
@@ -111,6 +113,7 @@ import System.Exit (exitWith, ExitCode(..))
 import Data.Char ( toLower, isLower, isUpper, isAlpha,
                    isLetter, isDigit, isSpace )
 import Data.List ( find, stripPrefix, intercalate )
+import Data.Maybe (mapMaybe)
 import Data.Version ( showVersion )
 import qualified Data.Map as M
 import Network.URI ( escapeURIString, nonStrictRelativeTo,
@@ -1031,6 +1034,16 @@ collapseFilePath = Posix.joinPath . reverse . foldl go [] . splitDirectories
     isSingleton _ = Nothing
     checkPathSeperator = fmap isPathSeparator . isSingleton
 
+--
+-- File selection from the archive
+--
+filteredFilesFromArchive :: Archive -> (FilePath -> Bool) -> [(FilePath, BL.ByteString)]
+filteredFilesFromArchive zf f =
+  mapMaybe (fileAndBinary zf) (filter f (filesInArchive zf))
+  where
+    fileAndBinary :: Archive -> FilePath -> Maybe (FilePath, BL.ByteString)
+    fileAndBinary a fp = findEntryByPath fp a >>= \e -> Just (fp, fromEntry e)
+
 ---
 --- Squash blocks into inlines
 ---
author	John MacFarlane <jgm@berkeley.edu>	2016-10-18 22:00:58 +0200
committer	GitHub <noreply@github.com>	2016-10-18 22:00:58 +0200
commit	0cd11b3e5404fa6fca7538098bcf315343d1a237 (patch)
tree	39057cfcb282e2e7b901dc12510ecffc3baad25b /src
parent	8264ae2abe976184086a5a40c3d082f5e3e99ca5 (diff)
parent	4417e33ea9d49a2001091adb4d2b19ebdefe5795 (diff)
download	pandoc-0cd11b3e5404fa6fca7538098bcf315343d1a237.tar.gz