aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/Text/Pandoc/Readers/Docx/Parse.hs20
-rw-r--r--src/Text/Pandoc/Readers/Odt.hs44
-rw-r--r--src/Text/Pandoc/Readers/Odt/ContentReader.hs112
-rw-r--r--src/Text/Pandoc/Shared.hs13
4 files changed, 151 insertions, 38 deletions
diff --git a/src/Text/Pandoc/Readers/Docx/Parse.hs b/src/Text/Pandoc/Readers/Docx/Parse.hs
index b9021ec08..7b9779105 100644
--- a/src/Text/Pandoc/Readers/Docx/Parse.hs
+++ b/src/Text/Pandoc/Readers/Docx/Parse.hs
@@ -65,7 +65,7 @@ import Control.Monad.State
import Control.Applicative ((<|>))
import qualified Data.Map as M
import Control.Monad.Except
-import Text.Pandoc.Shared (safeRead)
+import Text.Pandoc.Shared (safeRead, filteredFilesFromArchive)
import Text.TeXMath.Readers.OMML (readOMML)
import Text.Pandoc.Readers.Docx.Fonts (getUnicode, Font(..))
import Text.TeXMath (Exp)
@@ -86,7 +86,6 @@ data ReaderEnv = ReaderEnv { envNotes :: Notes
data ReaderState = ReaderState { stateWarnings :: [String] }
deriving Show
-
data DocxError = DocxError | WrongElem
deriving Show
@@ -276,7 +275,7 @@ archiveToDocxWithWarnings archive = do
comments = archiveToComments archive
numbering = archiveToNumbering archive
rels = archiveToRelationships archive
- media = archiveToMedia archive
+ media = filteredFilesFromArchive archive filePathIsMedia
(styles, parstyles) = archiveToStyles archive
rEnv =
ReaderEnv notes comments numbering rels media Nothing styles parstyles InDocument
@@ -402,7 +401,6 @@ archiveToComments zf =
case cmts of
Just c -> Comments cmts_namespaces c
Nothing -> Comments cmts_namespaces M.empty
-
filePathToRelType :: FilePath -> Maybe DocumentLocation
filePathToRelType "word/_rels/document.xml.rels" = Just InDocument
@@ -424,7 +422,7 @@ filePathToRelationships ar fp | Just relType <- filePathToRelType fp
, Just relElems <- (parseXMLDoc . UTF8.toStringLazy . fromEntry) entry =
mapMaybe (relElemToRelationship relType) $ elChildren relElems
filePathToRelationships _ _ = []
-
+
archiveToRelationships :: Archive -> [Relationship]
archiveToRelationships archive =
concatMap (filePathToRelationships archive) $ filesInArchive archive
@@ -435,16 +433,6 @@ filePathIsMedia fp =
in
(dir == "word/media/")
-getMediaPair :: Archive -> FilePath -> Maybe (FilePath, B.ByteString)
-getMediaPair zf fp =
- case findEntryByPath fp zf of
- Just e -> Just (fp, fromEntry e)
- Nothing -> Nothing
-
-archiveToMedia :: Archive -> Media
-archiveToMedia zf =
- mapMaybe (getMediaPair zf) (filter filePathIsMedia (filesInArchive zf))
-
lookupLevel :: String -> String -> Numbering -> Maybe Level
lookupLevel numId ilvl (Numbering _ numbs absNumbs) = do
absNumId <- lookup numId $ map (\(Numb nid absnumid) -> (nid, absnumid)) numbs
@@ -741,7 +729,7 @@ elemToCommentStart ns element
, Just cmtDate <- findAttr (elemName ns "w" "date") element = do
bps <- mapD (elemToBodyPart ns) (elChildren element)
return $ CommentStart cmtId cmtAuthor cmtDate bps
-elemToCommentStart _ _ = throwError WrongElem
+elemToCommentStart _ _ = throwError WrongElem
lookupFootnote :: String -> Notes -> Maybe Element
lookupFootnote s (Notes _ fns _) = fns >>= (M.lookup s)
diff --git a/src/Text/Pandoc/Readers/Odt.hs b/src/Text/Pandoc/Readers/Odt.hs
index 68e89263c..046fb4d6d 100644
--- a/src/Text/Pandoc/Readers/Odt.hs
+++ b/src/Text/Pandoc/Readers/Odt.hs
@@ -37,6 +37,8 @@ import qualified Text.XML.Light as XML
import qualified Data.ByteString.Lazy as B
+import System.FilePath
+
import Text.Pandoc.Definition
import Text.Pandoc.Error
import Text.Pandoc.Options
@@ -48,39 +50,49 @@ import Text.Pandoc.Readers.Odt.StyleReader
import Text.Pandoc.Readers.Odt.Generic.XMLConverter
import Text.Pandoc.Readers.Odt.Generic.Fallible
+import Text.Pandoc.Shared (filteredFilesFromArchive)
--
readOdt :: ReaderOptions
-> B.ByteString
-> Either PandocError (Pandoc, MediaBag)
-readOdt _ bytes = case bytesToOdt bytes of
- Right pandoc -> Right (pandoc , mempty)
- Left err -> Left err
+readOdt _ bytes = bytesToOdt bytes-- of
+-- Right (pandoc, mediaBag) -> Right (pandoc , mediaBag)
+-- Left err -> Left err
--
-bytesToOdt :: B.ByteString -> Either PandocError Pandoc
+bytesToOdt :: B.ByteString -> Either PandocError (Pandoc, MediaBag)
bytesToOdt bytes = case toArchiveOrFail bytes of
Right archive -> archiveToOdt archive
Left _ -> Left $ ParseFailure "Couldn't parse odt file."
--
-archiveToOdt :: Archive -> Either PandocError Pandoc
+archiveToOdt :: Archive -> Either PandocError (Pandoc, MediaBag)
archiveToOdt archive
- | Just contentEntry <- findEntryByPath "content.xml" archive
- , Just stylesEntry <- findEntryByPath "styles.xml" archive
- , Just contentElem <- entryToXmlElem contentEntry
- , Just stylesElem <- entryToXmlElem stylesEntry
- , Right styles <- chooseMax (readStylesAt stylesElem )
- (readStylesAt contentElem)
- , startState <- readerState styles
- , Right pandoc <- runConverter' read_body
- startState
- contentElem
- = Right pandoc
+ | Just contentEntry <- findEntryByPath "content.xml" archive
+ , Just stylesEntry <- findEntryByPath "styles.xml" archive
+ , Just contentElem <- entryToXmlElem contentEntry
+ , Just stylesElem <- entryToXmlElem stylesEntry
+ , Right styles <- chooseMax (readStylesAt stylesElem )
+ (readStylesAt contentElem)
+ , media <- filteredFilesFromArchive archive filePathIsOdtMedia
+ , startState <- readerState styles media
+ , Right pandocWithMedia <- runConverter' read_body
+ startState
+ contentElem
+
+ = Right pandocWithMedia
| otherwise
-- Not very detailed, but I don't think more information would be helpful
= Left $ ParseFailure "Couldn't parse odt file."
+ where
+ filePathIsOdtMedia :: FilePath -> Bool
+ filePathIsOdtMedia fp =
+ let (dir, _) = splitFileName fp
+ in
+ (dir == "Pictures/")
+
--
entryToXmlElem :: Entry -> Maybe XML.Element
diff --git a/src/Text/Pandoc/Readers/Odt/ContentReader.hs b/src/Text/Pandoc/Readers/Odt/ContentReader.hs
index 35fbc3731..11d39498c 100644
--- a/src/Text/Pandoc/Readers/Odt/ContentReader.hs
+++ b/src/Text/Pandoc/Readers/Odt/ContentReader.hs
@@ -42,6 +42,7 @@ module Text.Pandoc.Readers.Odt.ContentReader
import Control.Arrow
import Control.Applicative hiding ( liftA, liftA2, liftA3 )
+import qualified Data.ByteString.Lazy as B
import qualified Data.Map as M
import Data.List ( find )
import Data.Maybe
@@ -50,6 +51,7 @@ import qualified Text.XML.Light as XML
import Text.Pandoc.Definition
import Text.Pandoc.Builder
+import Text.Pandoc.MediaBag (insertMedia, MediaBag)
import Text.Pandoc.Shared
import Text.Pandoc.Readers.Odt.Base
@@ -68,6 +70,7 @@ import qualified Data.Set as Set
--------------------------------------------------------------------------------
type Anchor = String
+type Media = [(FilePath, B.ByteString)]
data ReaderState
= ReaderState { -- | A collection of styles read somewhere else.
@@ -87,14 +90,17 @@ data ReaderState
-- | A map from internal anchor names to "pretty" ones.
-- The mapping is a purely cosmetic one.
, bookmarkAnchors :: M.Map Anchor Anchor
-
+ -- | A map of files / binary data from the archive
+ , envMedia :: Media
+ -- | Hold binary resources used in the document
+ , odtMediaBag :: MediaBag
-- , sequences
-- , trackedChangeIDs
}
deriving ( Show )
-readerState :: Styles -> ReaderState
-readerState styles = ReaderState styles [] 0 Nothing M.empty
+readerState :: Styles -> Media -> ReaderState
+readerState styles media = ReaderState styles [] 0 Nothing M.empty media mempty
--
pushStyle' :: Style -> ReaderState -> ReaderState
@@ -134,6 +140,16 @@ putPrettyAnchor ugly pretty state@ReaderState{..}
usedAnchors :: ReaderState -> [Anchor]
usedAnchors ReaderState{..} = M.elems bookmarkAnchors
+getMediaBag :: ReaderState -> MediaBag
+getMediaBag ReaderState{..} = odtMediaBag
+
+getMediaEnv :: ReaderState -> Media
+getMediaEnv ReaderState{..} = envMedia
+
+insertMedia' :: (FilePath, B.ByteString) -> ReaderState -> ReaderState
+insertMedia' (fp, bs) state@ReaderState{..}
+ = state { odtMediaBag = insertMedia fp Nothing bs odtMediaBag }
+
--------------------------------------------------------------------------------
-- Reader type and associated tools
--------------------------------------------------------------------------------
@@ -190,6 +206,22 @@ popStyle = keepingTheValue (
getCurrentListLevel :: OdtReaderSafe _x ListLevel
getCurrentListLevel = getExtraState >>^ currentListLevel
+--
+updateMediaWithResource :: OdtReaderSafe (FilePath, B.ByteString) (FilePath, B.ByteString)
+updateMediaWithResource = keepingTheValue (
+ (keepingTheValue getExtraState
+ >>% insertMedia'
+ )
+ >>> setExtraState
+ )
+ >>^ fst
+
+lookupResource :: OdtReaderSafe String (FilePath, B.ByteString)
+lookupResource = proc target -> do
+ state <- getExtraState -< ()
+ case lookup target (getMediaEnv state) of
+ Just bs -> returnV (target, bs) -<< ()
+ Nothing -> returnV ("", B.empty) -< ()
type AnchorPrefix = String
@@ -511,6 +543,10 @@ read_plain_text = fst ^&&& read_plain_text' >>% recover
extractText (XML.Text cData) = succeedWith (XML.cdData cData)
extractText _ = failEmpty
+read_text_seq :: InlineMatcher
+read_text_seq = matchingElement NsText "sequence"
+ $ matchChildContent [] read_plain_text
+
-- specifically. I honor that, although the current implementation of '(<>)'
-- for 'Inlines' in "Text.Pandoc.Builder" will collaps them agein.
@@ -559,6 +595,8 @@ read_paragraph = matchingElement NsText "p"
, read_reference_start
, read_bookmark_ref
, read_reference_ref
+ , read_maybe_nested_img_frame
+ , read_text_seq
] read_plain_text
@@ -583,6 +621,7 @@ read_header = matchingElement NsText "h"
, read_reference_start
, read_bookmark_ref
, read_reference_ref
+ , read_maybe_nested_img_frame
] read_plain_text
) -< blocks
anchor <- getHeaderAnchor -< children
@@ -688,6 +727,64 @@ read_table_cell = matchingElement NsTable "table-cell"
]
----------------------
+-- Images
+----------------------
+
+--
+read_maybe_nested_img_frame :: InlineMatcher
+read_maybe_nested_img_frame = matchingElement NsDraw "frame"
+ $ proc blocks -> do
+ img <- (findChild' NsDraw "image") -< ()
+ case img of
+ Just _ -> read_frame -< blocks
+ Nothing -> matchChildContent' [ read_frame_text_box ] -< blocks
+
+read_frame :: OdtReaderSafe Inlines Inlines
+read_frame =
+ proc blocks -> do
+ w <- ( findAttr' NsSVG "width" ) -< ()
+ h <- ( findAttr' NsSVG "height" ) -< ()
+ titleNodes <- ( matchChildContent' [ read_frame_title ] ) -< blocks
+ src <- matchChildContent' [ read_image_src ] -< blocks
+ resource <- lookupResource -< src
+ _ <- updateMediaWithResource -< resource
+ alt <- (matchChildContent [] read_plain_text) -< blocks
+ arr (uncurry4 imageWith ) -<
+ (image_attributes w h, src, inlineListToIdentifier (toList titleNodes), alt)
+
+image_attributes :: Maybe String -> Maybe String -> Attr
+image_attributes x y =
+ ( "", [], (dim "width" x) ++ (dim "height" y))
+ where
+ dim _ (Just "") = []
+ dim name (Just v) = [(name, v)]
+ dim _ Nothing = []
+
+read_image_src :: (Namespace, ElementName, OdtReader Anchor Anchor)
+read_image_src = matchingElement NsDraw "image"
+ $ proc _ -> do
+ imgSrc <- findAttr NsXLink "href" -< ()
+ case imgSrc of
+ Right src -> returnV src -<< ()
+ Left _ -> returnV "" -< ()
+
+read_frame_title :: InlineMatcher
+read_frame_title = matchingElement NsSVG "title"
+ $ (matchChildContent [] read_plain_text)
+
+read_frame_text_box :: InlineMatcher
+read_frame_text_box = matchingElement NsDraw "text-box"
+ $ proc blocks -> do
+ paragraphs <- (matchChildContent' [ read_paragraph ]) -< blocks
+ arr read_img_with_caption -< toList paragraphs
+
+read_img_with_caption :: [Block] -> Inlines
+read_img_with_caption ((Para ((Image attr _ target) : txt)) : _) =
+ singleton (Image attr txt target) -- override caption with the text that follows
+read_img_with_caption _ =
+ mempty
+
+----------------------
-- Internal links
----------------------
@@ -783,8 +880,11 @@ read_text = matchChildContent' [ read_header
]
>>^ doc
-read_body :: OdtReader _x Pandoc
+read_body :: OdtReader _x (Pandoc, MediaBag)
read_body = executeIn NsOffice "body"
$ executeIn NsOffice "text"
- $ liftAsSuccess read_text
-
+ $ liftAsSuccess
+ $ proc inlines -> do
+ txt <- read_text -< inlines
+ state <- getExtraState -< ()
+ returnA -< (txt, getMediaBag state)
diff --git a/src/Text/Pandoc/Shared.hs b/src/Text/Pandoc/Shared.hs
index 407ff97db..4c10a5572 100644
--- a/src/Text/Pandoc/Shared.hs
+++ b/src/Text/Pandoc/Shared.hs
@@ -68,6 +68,7 @@ module Text.Pandoc.Shared (
Element (..),
hierarchicalize,
uniqueIdent,
+ inlineListToIdentifier,
isHeaderBlock,
headerShift,
isTightList,
@@ -85,6 +86,7 @@ module Text.Pandoc.Shared (
fetchItem',
openURL,
collapseFilePath,
+ filteredFilesFromArchive,
-- * Error handling
err,
warn,
@@ -111,6 +113,7 @@ import System.Exit (exitWith, ExitCode(..))
import Data.Char ( toLower, isLower, isUpper, isAlpha,
isLetter, isDigit, isSpace )
import Data.List ( find, stripPrefix, intercalate )
+import Data.Maybe (mapMaybe)
import Data.Version ( showVersion )
import qualified Data.Map as M
import Network.URI ( escapeURIString, nonStrictRelativeTo,
@@ -1031,6 +1034,16 @@ collapseFilePath = Posix.joinPath . reverse . foldl go [] . splitDirectories
isSingleton _ = Nothing
checkPathSeperator = fmap isPathSeparator . isSingleton
+--
+-- File selection from the archive
+--
+filteredFilesFromArchive :: Archive -> (FilePath -> Bool) -> [(FilePath, BL.ByteString)]
+filteredFilesFromArchive zf f =
+ mapMaybe (fileAndBinary zf) (filter f (filesInArchive zf))
+ where
+ fileAndBinary :: Archive -> FilePath -> Maybe (FilePath, BL.ByteString)
+ fileAndBinary a fp = findEntryByPath fp a >>= \e -> Just (fp, fromEntry e)
+
---
--- Squash blocks into inlines
---