diff options
| author | John MacFarlane <jgm@berkeley.edu> | 2016-10-18 22:00:58 +0200 | 
|---|---|---|
| committer | GitHub <noreply@github.com> | 2016-10-18 22:00:58 +0200 | 
| commit | 0cd11b3e5404fa6fca7538098bcf315343d1a237 (patch) | |
| tree | 39057cfcb282e2e7b901dc12510ecffc3baad25b /src | |
| parent | 8264ae2abe976184086a5a40c3d082f5e3e99ca5 (diff) | |
| parent | 4417e33ea9d49a2001091adb4d2b19ebdefe5795 (diff) | |
| download | pandoc-0cd11b3e5404fa6fca7538098bcf315343d1a237.tar.gz | |
Merge pull request #3165 from hubertp-lshift/feature/odt-image
[odt] images parser
Diffstat (limited to 'src')
| -rw-r--r-- | src/Text/Pandoc/Readers/Docx/Parse.hs | 20 | ||||
| -rw-r--r-- | src/Text/Pandoc/Readers/Odt.hs | 44 | ||||
| -rw-r--r-- | src/Text/Pandoc/Readers/Odt/ContentReader.hs | 112 | ||||
| -rw-r--r-- | src/Text/Pandoc/Shared.hs | 13 | 
4 files changed, 151 insertions, 38 deletions
| diff --git a/src/Text/Pandoc/Readers/Docx/Parse.hs b/src/Text/Pandoc/Readers/Docx/Parse.hs index b9021ec08..7b9779105 100644 --- a/src/Text/Pandoc/Readers/Docx/Parse.hs +++ b/src/Text/Pandoc/Readers/Docx/Parse.hs @@ -65,7 +65,7 @@ import Control.Monad.State  import Control.Applicative ((<|>))  import qualified Data.Map as M  import Control.Monad.Except -import Text.Pandoc.Shared (safeRead) +import Text.Pandoc.Shared (safeRead, filteredFilesFromArchive)  import Text.TeXMath.Readers.OMML (readOMML)  import Text.Pandoc.Readers.Docx.Fonts (getUnicode, Font(..))  import Text.TeXMath (Exp) @@ -86,7 +86,6 @@ data ReaderEnv = ReaderEnv { envNotes         :: Notes  data ReaderState = ReaderState { stateWarnings :: [String] }                   deriving Show -                                                    data DocxError = DocxError | WrongElem                 deriving Show @@ -276,7 +275,7 @@ archiveToDocxWithWarnings archive = do        comments  = archiveToComments archive        numbering = archiveToNumbering archive        rels      = archiveToRelationships archive -      media     = archiveToMedia archive +      media     = filteredFilesFromArchive archive filePathIsMedia        (styles, parstyles) = archiveToStyles archive        rEnv =          ReaderEnv notes comments numbering rels media Nothing styles parstyles InDocument @@ -402,7 +401,6 @@ archiveToComments zf =      case cmts of        Just c -> Comments cmts_namespaces c        Nothing -> Comments cmts_namespaces M.empty -                 filePathToRelType :: FilePath -> Maybe DocumentLocation  filePathToRelType "word/_rels/document.xml.rels" = Just InDocument @@ -424,7 +422,7 @@ filePathToRelationships ar fp | Just relType <- filePathToRelType fp                                , Just relElems <- (parseXMLDoc . UTF8.toStringLazy . fromEntry) entry =    mapMaybe (relElemToRelationship relType) $ elChildren relElems  filePathToRelationships _ _ = [] -                                +  archiveToRelationships :: Archive -> [Relationship]  archiveToRelationships archive =    concatMap (filePathToRelationships archive) $ filesInArchive archive @@ -435,16 +433,6 @@ filePathIsMedia fp =    in     (dir == "word/media/") -getMediaPair :: Archive -> FilePath -> Maybe (FilePath, B.ByteString) -getMediaPair zf fp = -  case findEntryByPath fp zf of -    Just e -> Just (fp, fromEntry e) -    Nothing -> Nothing - -archiveToMedia :: Archive -> Media -archiveToMedia zf = -  mapMaybe (getMediaPair zf) (filter filePathIsMedia (filesInArchive zf)) -  lookupLevel :: String -> String -> Numbering -> Maybe Level  lookupLevel numId ilvl (Numbering _ numbs absNumbs) = do    absNumId <- lookup numId $ map (\(Numb nid absnumid) -> (nid, absnumid)) numbs @@ -741,7 +729,7 @@ elemToCommentStart ns element    , Just cmtDate <- findAttr (elemName ns "w" "date") element = do        bps <- mapD (elemToBodyPart ns) (elChildren element)        return $ CommentStart cmtId cmtAuthor cmtDate bps -elemToCommentStart _ _ = throwError WrongElem       +elemToCommentStart _ _ = throwError WrongElem  lookupFootnote :: String -> Notes -> Maybe Element  lookupFootnote s (Notes _ fns _) = fns >>= (M.lookup s) diff --git a/src/Text/Pandoc/Readers/Odt.hs b/src/Text/Pandoc/Readers/Odt.hs index 68e89263c..046fb4d6d 100644 --- a/src/Text/Pandoc/Readers/Odt.hs +++ b/src/Text/Pandoc/Readers/Odt.hs @@ -37,6 +37,8 @@ import qualified Text.XML.Light                        as XML  import qualified Data.ByteString.Lazy                  as B +import           System.FilePath +  import           Text.Pandoc.Definition  import           Text.Pandoc.Error  import           Text.Pandoc.Options @@ -48,39 +50,49 @@ import           Text.Pandoc.Readers.Odt.StyleReader  import           Text.Pandoc.Readers.Odt.Generic.XMLConverter  import           Text.Pandoc.Readers.Odt.Generic.Fallible +import           Text.Pandoc.Shared (filteredFilesFromArchive)  --  readOdt :: ReaderOptions          -> B.ByteString          -> Either PandocError (Pandoc, MediaBag) -readOdt _ bytes = case bytesToOdt bytes of -                    Right pandoc -> Right (pandoc , mempty) -                    Left  err    -> Left err +readOdt _ bytes = bytesToOdt bytes-- of +--                    Right (pandoc, mediaBag) -> Right (pandoc , mediaBag) +--                    Left  err                -> Left err  -- -bytesToOdt :: B.ByteString -> Either PandocError Pandoc +bytesToOdt :: B.ByteString -> Either PandocError (Pandoc, MediaBag)  bytesToOdt bytes = case toArchiveOrFail bytes of    Right archive -> archiveToOdt archive    Left _        -> Left $ ParseFailure "Couldn't parse odt file."  -- -archiveToOdt :: Archive -> Either PandocError Pandoc +archiveToOdt :: Archive -> Either PandocError (Pandoc, MediaBag)  archiveToOdt archive -  | Just contentEntry <- findEntryByPath "content.xml" archive -  , Just stylesEntry  <- findEntryByPath "styles.xml"  archive -  , Just contentElem  <- entryToXmlElem contentEntry -  , Just stylesElem   <- entryToXmlElem stylesEntry -  , Right styles      <- chooseMax (readStylesAt stylesElem ) -                                   (readStylesAt contentElem) -  , startState        <- readerState styles -  , Right pandoc      <- runConverter' read_body -                                       startState -                                       contentElem -  = Right pandoc +  | Just contentEntry      <- findEntryByPath "content.xml" archive +  , Just stylesEntry       <- findEntryByPath "styles.xml"  archive +  , Just contentElem       <- entryToXmlElem contentEntry +  , Just stylesElem        <- entryToXmlElem stylesEntry +  , Right styles           <- chooseMax (readStylesAt stylesElem ) +                                        (readStylesAt contentElem) +  , media                  <- filteredFilesFromArchive archive filePathIsOdtMedia +  , startState             <- readerState styles media +  , Right pandocWithMedia  <- runConverter' read_body +                                            startState +                                            contentElem + +  = Right pandocWithMedia    | otherwise      -- Not very detailed, but I don't think more information would be helpful    = Left $ ParseFailure "Couldn't parse odt file." +    where +      filePathIsOdtMedia :: FilePath -> Bool +      filePathIsOdtMedia fp = +        let (dir, _) = splitFileName fp +        in +         (dir == "Pictures/") +  --  entryToXmlElem :: Entry -> Maybe XML.Element diff --git a/src/Text/Pandoc/Readers/Odt/ContentReader.hs b/src/Text/Pandoc/Readers/Odt/ContentReader.hs index 35fbc3731..11d39498c 100644 --- a/src/Text/Pandoc/Readers/Odt/ContentReader.hs +++ b/src/Text/Pandoc/Readers/Odt/ContentReader.hs @@ -42,6 +42,7 @@ module Text.Pandoc.Readers.Odt.ContentReader  import           Control.Arrow  import           Control.Applicative    hiding ( liftA, liftA2, liftA3 ) +import qualified Data.ByteString.Lazy   as B  import qualified Data.Map               as M  import           Data.List                     ( find )  import           Data.Maybe @@ -50,6 +51,7 @@ import qualified Text.XML.Light         as XML  import           Text.Pandoc.Definition  import           Text.Pandoc.Builder +import           Text.Pandoc.MediaBag (insertMedia, MediaBag)  import           Text.Pandoc.Shared  import           Text.Pandoc.Readers.Odt.Base @@ -68,6 +70,7 @@ import qualified Data.Set as Set  --------------------------------------------------------------------------------  type Anchor = String +type Media = [(FilePath, B.ByteString)]  data ReaderState     = ReaderState { -- | A collection of styles read somewhere else. @@ -87,14 +90,17 @@ data ReaderState                     -- | A map from internal anchor names to "pretty" ones.                     -- The mapping is a purely cosmetic one.                   , bookmarkAnchors  :: M.Map Anchor Anchor - +                   -- | A map of files / binary data from the archive +                 , envMedia         :: Media +                   -- | Hold binary resources used in the document +                 , odtMediaBag      :: MediaBag  --               , sequences  --               , trackedChangeIDs                   }    deriving ( Show ) -readerState :: Styles -> ReaderState -readerState styles = ReaderState styles [] 0 Nothing M.empty +readerState :: Styles -> Media -> ReaderState +readerState styles media = ReaderState styles [] 0 Nothing M.empty media mempty  --  pushStyle'  :: Style -> ReaderState -> ReaderState @@ -134,6 +140,16 @@ putPrettyAnchor ugly pretty state@ReaderState{..}  usedAnchors :: ReaderState -> [Anchor]  usedAnchors ReaderState{..} = M.elems bookmarkAnchors +getMediaBag :: ReaderState -> MediaBag +getMediaBag ReaderState{..} = odtMediaBag + +getMediaEnv :: ReaderState -> Media +getMediaEnv ReaderState{..} = envMedia + +insertMedia' :: (FilePath, B.ByteString) -> ReaderState ->  ReaderState +insertMedia' (fp, bs) state@ReaderState{..} +  = state { odtMediaBag = insertMedia fp Nothing bs odtMediaBag } +  --------------------------------------------------------------------------------  -- Reader type and associated tools  -------------------------------------------------------------------------------- @@ -190,6 +206,22 @@ popStyle =     keepingTheValue (  getCurrentListLevel :: OdtReaderSafe _x ListLevel  getCurrentListLevel = getExtraState >>^ currentListLevel +-- +updateMediaWithResource :: OdtReaderSafe (FilePath, B.ByteString) (FilePath, B.ByteString) +updateMediaWithResource = keepingTheValue ( +                 (keepingTheValue getExtraState +                  >>% insertMedia' +                  ) +                 >>> setExtraState +               ) +           >>^ fst + +lookupResource :: OdtReaderSafe String (FilePath, B.ByteString) +lookupResource = proc target -> do +    state <- getExtraState -< () +    case lookup target (getMediaEnv state) of +      Just bs -> returnV (target, bs) -<< () +      Nothing -> returnV ("", B.empty) -< ()  type AnchorPrefix = String @@ -511,6 +543,10 @@ read_plain_text =  fst ^&&& read_plain_text' >>% recover      extractText (XML.Text cData) = succeedWith (XML.cdData cData)      extractText         _        = failEmpty +read_text_seq :: InlineMatcher +read_text_seq  = matchingElement NsText "sequence" +                 $ matchChildContent [] read_plain_text +  -- specifically. I honor that, although the current implementation of '(<>)'  -- for 'Inlines' in "Text.Pandoc.Builder" will collaps them agein. @@ -559,6 +595,8 @@ read_paragraph    = matchingElement NsText "p"                                          , read_reference_start                                          , read_bookmark_ref                                          , read_reference_ref +                                        , read_maybe_nested_img_frame +                                        , read_text_seq                                          ] read_plain_text @@ -583,6 +621,7 @@ read_header       = matchingElement NsText "h"                                    , read_reference_start                                    , read_bookmark_ref                                    , read_reference_ref +                                  , read_maybe_nested_img_frame                                    ] read_plain_text                ) -< blocks    anchor   <- getHeaderAnchor -< children @@ -688,6 +727,64 @@ read_table_cell    = matchingElement NsTable "table-cell"                                            ]  ---------------------- +-- Images +---------------------- + +-- +read_maybe_nested_img_frame  :: InlineMatcher +read_maybe_nested_img_frame   = matchingElement NsDraw "frame" +                                $ proc blocks -> do +                                   img <- (findChild' NsDraw "image") -< () +                                   case img of +                                     Just _  ->  read_frame                                 -< blocks +                                     Nothing ->  matchChildContent' [ read_frame_text_box ] -< blocks + +read_frame :: OdtReaderSafe Inlines Inlines +read_frame = +  proc blocks -> do +   w          <- ( findAttr' NsSVG "width" )                 -< () +   h          <- ( findAttr' NsSVG "height" )                -< () +   titleNodes <- ( matchChildContent' [ read_frame_title ] ) -< blocks +   src        <-  matchChildContent' [ read_image_src ]      -< blocks +   resource   <- lookupResource                              -< src +   _          <- updateMediaWithResource                     -< resource +   alt        <- (matchChildContent [] read_plain_text)      -< blocks +   arr (uncurry4 imageWith ) -< +                (image_attributes w h, src, inlineListToIdentifier (toList titleNodes), alt) + +image_attributes :: Maybe String -> Maybe String -> Attr +image_attributes x y = +  ( "", [], (dim "width" x) ++ (dim "height" y)) +  where +    dim _ (Just "")   = [] +    dim name (Just v) = [(name, v)] +    dim _ Nothing     = [] + +read_image_src :: (Namespace, ElementName, OdtReader Anchor Anchor) +read_image_src = matchingElement NsDraw "image" +                 $ proc _ -> do +                    imgSrc <- findAttr NsXLink "href" -< () +                    case imgSrc of +                      Right src -> returnV src -<< () +                      Left _    -> returnV ""  -< () + +read_frame_title :: InlineMatcher +read_frame_title = matchingElement NsSVG "title" +                   $ (matchChildContent [] read_plain_text) + +read_frame_text_box :: InlineMatcher +read_frame_text_box = matchingElement NsDraw "text-box" +                      $ proc blocks -> do +                         paragraphs <- (matchChildContent' [ read_paragraph ]) -< blocks +                         arr read_img_with_caption                             -< toList paragraphs + +read_img_with_caption :: [Block] -> Inlines +read_img_with_caption ((Para ((Image attr _ target) : txt)) : _) = +  singleton (Image attr txt target)             -- override caption with the text that follows +read_img_with_caption _ = +  mempty + +----------------------  -- Internal links  ---------------------- @@ -783,8 +880,11 @@ read_text = matchChildContent' [ read_header                                 ]              >>^ doc -read_body :: OdtReader _x Pandoc +read_body :: OdtReader _x (Pandoc, MediaBag)  read_body = executeIn NsOffice "body"            $ executeIn NsOffice "text" -          $ liftAsSuccess read_text - +          $ liftAsSuccess +          $ proc inlines -> do +             txt   <- read_text     -< inlines +             state <- getExtraState -< () +             returnA                -< (txt, getMediaBag state) diff --git a/src/Text/Pandoc/Shared.hs b/src/Text/Pandoc/Shared.hs index 407ff97db..4c10a5572 100644 --- a/src/Text/Pandoc/Shared.hs +++ b/src/Text/Pandoc/Shared.hs @@ -68,6 +68,7 @@ module Text.Pandoc.Shared (                       Element (..),                       hierarchicalize,                       uniqueIdent, +                     inlineListToIdentifier,                       isHeaderBlock,                       headerShift,                       isTightList, @@ -85,6 +86,7 @@ module Text.Pandoc.Shared (                       fetchItem',                       openURL,                       collapseFilePath, +                     filteredFilesFromArchive,                       -- * Error handling                       err,                       warn, @@ -111,6 +113,7 @@ import System.Exit (exitWith, ExitCode(..))  import Data.Char ( toLower, isLower, isUpper, isAlpha,                     isLetter, isDigit, isSpace )  import Data.List ( find, stripPrefix, intercalate ) +import Data.Maybe (mapMaybe)  import Data.Version ( showVersion )  import qualified Data.Map as M  import Network.URI ( escapeURIString, nonStrictRelativeTo, @@ -1031,6 +1034,16 @@ collapseFilePath = Posix.joinPath . reverse . foldl go [] . splitDirectories      isSingleton _ = Nothing      checkPathSeperator = fmap isPathSeparator . isSingleton +-- +-- File selection from the archive +-- +filteredFilesFromArchive :: Archive -> (FilePath -> Bool) -> [(FilePath, BL.ByteString)] +filteredFilesFromArchive zf f = +  mapMaybe (fileAndBinary zf) (filter f (filesInArchive zf)) +  where +    fileAndBinary :: Archive -> FilePath -> Maybe (FilePath, BL.ByteString) +    fileAndBinary a fp = findEntryByPath fp a >>= \e -> Just (fp, fromEntry e) +  ---  --- Squash blocks into inlines  --- | 
