aboutsummaryrefslogtreecommitdiff
path: root/src/Text/Pandoc
diff options
context:
space:
mode:
authorJohn MacFarlane <jgm@berkeley.edu>2019-07-13 15:53:58 -0700
committerGitHub <noreply@github.com>2019-07-13 15:53:58 -0700
commita16311c225e9b9788ee6499e474f9a9170510b11 (patch)
treede02402064fc3acc433815854bc494778da7af1c /src/Text/Pandoc
parent178416194617b330b280a5e4a9e894258c0f8b1f (diff)
parent449c133406dec231e61b8a3ecbbdfac2cbc00dbc (diff)
downloadpandoc-a16311c225e9b9788ee6499e474f9a9170510b11.tar.gz
Merge pull request #5606 from blmage/odt-frames
Improve the parsing of frames in ODT documents
Diffstat (limited to 'src/Text/Pandoc')
-rw-r--r--src/Text/Pandoc/Readers/Odt.hs5
-rw-r--r--src/Text/Pandoc/Readers/Odt/ContentReader.hs122
-rw-r--r--src/Text/Pandoc/Readers/Odt/Generic/XMLConverter.hs65
-rw-r--r--src/Text/Pandoc/Readers/Odt/StyleReader.hs10
4 files changed, 127 insertions, 75 deletions
diff --git a/src/Text/Pandoc/Readers/Odt.hs b/src/Text/Pandoc/Readers/Odt.hs
index 3a3d1e992..dfa019932 100644
--- a/src/Text/Pandoc/Readers/Odt.hs
+++ b/src/Text/Pandoc/Readers/Odt.hs
@@ -86,9 +86,8 @@ archiveToOdt archive
where
filePathIsOdtMedia :: FilePath -> Bool
filePathIsOdtMedia fp =
- let (dir, _) = splitFileName fp
- in
- (dir == "Pictures/")
+ let (dir, name) = splitFileName fp
+ in (dir == "Pictures/") || (dir /= "./" && name == "content.xml")
--
diff --git a/src/Text/Pandoc/Readers/Odt/ContentReader.hs b/src/Text/Pandoc/Readers/Odt/ContentReader.hs
index 1d9a0cb8c..d8e5ba272 100644
--- a/src/Text/Pandoc/Readers/Odt/ContentReader.hs
+++ b/src/Text/Pandoc/Readers/Odt/ContentReader.hs
@@ -1,5 +1,7 @@
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE Arrows #-}
+{-# LANGUAGE DeriveFoldable #-}
+{-# LANGUAGE GeneralizedNewtypeDeriving #-}
{-# LANGUAGE PatternGuards #-}
{-# LANGUAGE RecordWildCards #-}
{-# LANGUAGE TupleSections #-}
@@ -26,21 +28,26 @@ import Control.Applicative hiding (liftA, liftA2, liftA3)
import Control.Arrow
import qualified Data.ByteString.Lazy as B
-import Data.List (find, intercalate)
+import Data.Foldable (fold)
+import Data.List (find, intercalate, stripPrefix)
import qualified Data.Map as M
import Data.Maybe
+import Data.Semigroup (First(..), Option(..))
+import Text.TeXMath (readMathML, writeTeX)
import qualified Text.XML.Light as XML
import Text.Pandoc.Builder
import Text.Pandoc.MediaBag (MediaBag, insertMedia)
import Text.Pandoc.Shared
import Text.Pandoc.Extensions (extensionsFromList, Extension(..))
+import qualified Text.Pandoc.UTF8 as UTF8
import Text.Pandoc.Readers.Odt.Base
import Text.Pandoc.Readers.Odt.Namespaces
import Text.Pandoc.Readers.Odt.StyleReader
+import Text.Pandoc.Readers.Odt.Arrows.State (foldS)
import Text.Pandoc.Readers.Odt.Arrows.Utils
import Text.Pandoc.Readers.Odt.Generic.Fallible
import Text.Pandoc.Readers.Odt.Generic.Utils
@@ -498,6 +505,13 @@ type InlineMatcher = ElementMatcher Inlines
type BlockMatcher = ElementMatcher Blocks
+newtype FirstMatch a = FirstMatch (Option (First a))
+ deriving (Foldable, Monoid, Semigroup)
+
+firstMatch :: a -> FirstMatch a
+firstMatch = FirstMatch . Option . Just . First
+
+
--
matchingElement :: (Monoid e)
=> Namespace -> ElementName
@@ -598,7 +612,7 @@ read_paragraph = matchingElement NsText "p"
, read_reference_start
, read_bookmark_ref
, read_reference_ref
- , read_maybe_nested_img_frame
+ , read_frame
, read_text_seq
] read_plain_text
@@ -624,7 +638,7 @@ read_header = matchingElement NsText "h"
, read_reference_start
, read_bookmark_ref
, read_reference_ref
- , read_maybe_nested_img_frame
+ , read_frame
] read_plain_text
) -< blocks
anchor <- getHeaderAnchor -< children
@@ -737,32 +751,43 @@ read_table_cell = matchingElement NsTable "table-cell"
]
----------------------
--- Images
+-- Frames
----------------------
--
-read_maybe_nested_img_frame :: InlineMatcher
-read_maybe_nested_img_frame = matchingElement NsDraw "frame"
- $ proc blocks -> do
- img <- (findChild' NsDraw "image") -< ()
- case img of
- Just _ -> read_frame -< blocks
- Nothing -> matchChildContent' [ read_frame_text_box ] -< blocks
-
-read_frame :: OdtReaderSafe Inlines Inlines
-read_frame =
- proc blocks -> do
- let exts = extensionsFromList [Ext_auto_identifiers]
- w <- ( findAttr' NsSVG "width" ) -< ()
- h <- ( findAttr' NsSVG "height" ) -< ()
- titleNodes <- ( matchChildContent' [ read_frame_title ] ) -< blocks
- src <- matchChildContent' [ read_image_src ] -< blocks
- resource <- lookupResource -< src
- _ <- updateMediaWithResource -< resource
- alt <- (matchChildContent [] read_plain_text) -< blocks
- arr (uncurry4 imageWith ) -<
- (image_attributes w h, src,
- inlineListToIdentifier exts (toList titleNodes), alt)
+read_frame :: InlineMatcher
+read_frame = matchingElement NsDraw "frame"
+ $ filterChildrenName' NsDraw (`elem` ["image", "object", "text-box"])
+ >>> foldS read_frame_child
+ >>> arr fold
+
+read_frame_child :: OdtReaderSafe XML.Element (FirstMatch Inlines)
+read_frame_child =
+ proc child -> case elName child of
+ "image" -> read_frame_img -< child
+ "object" -> read_frame_mathml -< child
+ "text-box" -> read_frame_text_box -< child
+ _ -> returnV mempty -< ()
+
+read_frame_img :: OdtReaderSafe XML.Element (FirstMatch Inlines)
+read_frame_img =
+ proc img -> do
+ src <- executeIn (findAttr' NsXLink "href") -< img
+ case fold src of
+ "" -> returnV mempty -< ()
+ src' -> do
+ let exts = extensionsFromList [Ext_auto_identifiers]
+ resource <- lookupResource -< src'
+ _ <- updateMediaWithResource -< resource
+ w <- findAttr' NsSVG "width" -< ()
+ h <- findAttr' NsSVG "height" -< ()
+ titleNodes <- matchChildContent' [ read_frame_title ] -< ()
+ alt <- matchChildContent [] read_plain_text -< ()
+ arr (firstMatch . uncurry4 imageWith) -<
+ (image_attributes w h, src', inlineListToIdentifier exts (toList titleNodes), alt)
+
+read_frame_title :: InlineMatcher
+read_frame_title = matchingElement NsSVG "title" (matchChildContent [] read_plain_text)
image_attributes :: Maybe String -> Maybe String -> Attr
image_attributes x y =
@@ -772,28 +797,29 @@ image_attributes x y =
dim name (Just v) = [(name, v)]
dim _ Nothing = []
-read_image_src :: (Namespace, ElementName, OdtReader Anchor Anchor)
-read_image_src = matchingElement NsDraw "image"
- $ proc _ -> do
- imgSrc <- findAttr NsXLink "href" -< ()
- case imgSrc of
- Right src -> returnV src -<< ()
- Left _ -> returnV "" -< ()
-
-read_frame_title :: InlineMatcher
-read_frame_title = matchingElement NsSVG "title" (matchChildContent [] read_plain_text)
-
-read_frame_text_box :: InlineMatcher
-read_frame_text_box = matchingElement NsDraw "text-box"
- $ proc blocks -> do
- paragraphs <- (matchChildContent' [ read_paragraph ]) -< blocks
- arr read_img_with_caption -< toList paragraphs
-
-read_img_with_caption :: [Block] -> Inlines
+read_frame_mathml :: OdtReaderSafe XML.Element (FirstMatch Inlines)
+read_frame_mathml =
+ proc obj -> do
+ src <- executeIn (findAttr' NsXLink "href") -< obj
+ case fold src of
+ "" -> returnV mempty -< ()
+ src' -> do
+ let path = fromMaybe src' (stripPrefix "./" src') ++ "/content.xml"
+ (_, mathml) <- lookupResource -< path
+ case readMathML (UTF8.toString $ B.toStrict mathml) of
+ Left _ -> returnV mempty -< ()
+ Right exps -> arr (firstMatch . displayMath . writeTeX) -< exps
+
+read_frame_text_box :: OdtReaderSafe XML.Element (FirstMatch Inlines)
+read_frame_text_box = proc box -> do
+ paragraphs <- executeIn (matchChildContent' [ read_paragraph ]) -< box
+ arr read_img_with_caption -< toList paragraphs
+
+read_img_with_caption :: [Block] -> FirstMatch Inlines
read_img_with_caption (Para [Image attr alt (src,title)] : _) =
- singleton (Image attr alt (src, 'f':'i':'g':':':title)) -- no text, default caption
+ firstMatch $ singleton (Image attr alt (src, 'f':'i':'g':':':title)) -- no text, default caption
read_img_with_caption (Para (Image attr _ (src,title) : txt) : _) =
- singleton (Image attr txt (src, 'f':'i':'g':':':title) ) -- override caption with the text that follows
+ firstMatch $ singleton (Image attr txt (src, 'f':'i':'g':':':title) ) -- override caption with the text that follows
read_img_with_caption ( Para (_ : xs) : ys) =
read_img_with_caption (Para xs : ys)
read_img_with_caption _ =
@@ -901,8 +927,8 @@ post_process' (Table _ a w h r : Div ("", ["caption"], _) [Para inlines] : xs) =
post_process' bs = bs
read_body :: OdtReader _x (Pandoc, MediaBag)
-read_body = executeIn NsOffice "body"
- $ executeIn NsOffice "text"
+read_body = executeInSub NsOffice "body"
+ $ executeInSub NsOffice "text"
$ liftAsSuccess
$ proc inlines -> do
txt <- read_text -< inlines
diff --git a/src/Text/Pandoc/Readers/Odt/Generic/XMLConverter.hs b/src/Text/Pandoc/Readers/Odt/Generic/XMLConverter.hs
index c45916c03..ccbaf6fc4 100644
--- a/src/Text/Pandoc/Readers/Odt/Generic/XMLConverter.hs
+++ b/src/Text/Pandoc/Readers/Odt/Generic/XMLConverter.hs
@@ -29,8 +29,10 @@ module Text.Pandoc.Readers.Odt.Generic.XMLConverter
, modifyExtraState
, producingExtraState
, findChild'
+, filterChildrenName'
, isSet'
, isSetWithDefault
+, elName
, searchAttr
, lookupAttr
, lookupAttr'
@@ -43,6 +45,7 @@ module Text.Pandoc.Readers.Odt.Generic.XMLConverter
, readAttrWithDefault
, getAttr
, executeIn
+, executeInSub
, withEveryL
, tryAll
, matchContent'
@@ -309,34 +312,44 @@ readNSattributes = fromState $ \state -> maybe (state, failEmpty )
-- | Given a namespace id and an element name, creates a 'XML.QName' for
-- internal use
-elemName :: (NameSpaceID nsID)
+qualifyName :: (NameSpaceID nsID)
=> nsID -> ElementName
-> XMLConverter nsID extraState x XML.QName
-elemName nsID name = lookupNSiri nsID
+qualifyName nsID name = lookupNSiri nsID
&&& lookupNSprefix nsID
>>% XML.QName name
-- | Checks if a given element matches both a specified namespace id
+-- and a predicate
+elemNameMatches :: (NameSpaceID nsID)
+ => nsID -> (ElementName -> Bool)
+ -> XMLConverter nsID extraState XML.Element Bool
+elemNameMatches nsID f = keepingTheValue (lookupNSiri nsID) >>% hasMatchingName
+ where hasMatchingName e iri = let name = XML.elName e
+ in f (XML.qName name)
+ && XML.qURI name == iri
+
+-- | Checks if a given element matches both a specified namespace id
-- and a specified element name
elemNameIs :: (NameSpaceID nsID)
=> nsID -> ElementName
-> XMLConverter nsID extraState XML.Element Bool
-elemNameIs nsID name = keepingTheValue (lookupNSiri nsID) >>% hasThatName
- where hasThatName e iri = let elName = XML.elName e
- in XML.qName elName == name
- && XML.qURI elName == iri
+elemNameIs nsID name = elemNameMatches nsID (== name)
--------------------------------------------------------------------------------
-- General content
--------------------------------------------------------------------------------
+elName :: XML.Element -> ElementName
+elName = XML.qName . XML.elName
+
--
elContent :: XMLConverter nsID extraState x [XML.Content]
elContent = getCurrentElement
>>^ XML.elContent
--------------------------------------------------------------------------------
--- Chilren
+-- Children
--------------------------------------------------------------------------------
--
@@ -344,7 +357,7 @@ elContent = getCurrentElement
findChildren :: (NameSpaceID nsID)
=> nsID -> ElementName
-> XMLConverter nsID extraState x [XML.Element]
-findChildren nsID name = elemName nsID name
+findChildren nsID name = qualifyName nsID name
&&& getCurrentElement
>>% XML.findChildren
@@ -353,7 +366,7 @@ findChild' :: (NameSpaceID nsID)
=> nsID
-> ElementName
-> XMLConverter nsID extraState x (Maybe XML.Element)
-findChild' nsID name = elemName nsID name
+findChild' nsID name = qualifyName nsID name
&&& getCurrentElement
>>% XML.findChild
@@ -364,6 +377,14 @@ findChild :: (NameSpaceID nsID)
findChild nsID name = findChild' nsID name
>>> maybeToChoice
+filterChildrenName' :: (NameSpaceID nsID)
+ => nsID
+ -> (ElementName -> Bool)
+ -> XMLConverter nsID extraState x [XML.Element]
+filterChildrenName' nsID f = getCurrentElement
+ >>> arr XML.elChildren
+ >>> iterateS (keepingTheValue (elemNameMatches nsID f))
+ >>> arr (catMaybes . fmap (uncurry $ bool Nothing . Just))
--------------------------------------------------------------------------------
-- Attributes
@@ -441,7 +462,7 @@ lookupDefaultingAttr nsID attrName
findAttr' :: (NameSpaceID nsID)
=> nsID -> AttributeName
-> XMLConverter nsID extraState x (Maybe AttributeValue)
-findAttr' nsID attrName = elemName nsID attrName
+findAttr' nsID attrName = qualifyName nsID attrName
&&& getCurrentElement
>>% XML.findAttr
@@ -537,15 +558,21 @@ executeThere a = second jumpThere
>>> jumpBack -- >>? jumpBack would not ensure the jump.
>>^ collapseEither
--- | Do something in a sub-element, tnen come back
-executeIn :: (NameSpaceID nsID)
- => nsID -> ElementName
- -> FallibleXMLConverter nsID extraState f s
- -> FallibleXMLConverter nsID extraState f s
-executeIn nsID name a = keepingTheValue
- (findChild nsID name)
- >>> ignoringState liftFailure
- >>? switchingTheStack a
+
+-- | Do something in a specific element, then come back
+executeIn :: XMLConverter nsID extraState XML.Element s
+ -> XMLConverter nsID extraState XML.Element s
+executeIn a = duplicate >>> switchingTheStack a
+
+-- | Do something in a sub-element, then come back
+executeInSub :: (NameSpaceID nsID)
+ => nsID -> ElementName
+ -> FallibleXMLConverter nsID extraState f s
+ -> FallibleXMLConverter nsID extraState f s
+executeInSub nsID name a = keepingTheValue
+ (findChild nsID name)
+ >>> ignoringState liftFailure
+ >>? switchingTheStack a
where liftFailure (_, (Left f)) = Left f
liftFailure (x, (Right e)) = Right (x, e)
diff --git a/src/Text/Pandoc/Readers/Odt/StyleReader.hs b/src/Text/Pandoc/Readers/Odt/StyleReader.hs
index 23ca57786..79e8d7aea 100644
--- a/src/Text/Pandoc/Readers/Odt/StyleReader.hs
+++ b/src/Text/Pandoc/Readers/Odt/StyleReader.hs
@@ -113,7 +113,7 @@ type StyleReaderSafe a b = XMLReaderSafe FontPitches a b
-- | A reader for font pitches
fontPitchReader :: XMLReader _s _x FontPitches
-fontPitchReader = executeIn NsOffice "font-face-decls" (
+fontPitchReader = executeInSub NsOffice "font-face-decls" (
withEveryL NsStyle "font-face" (liftAsSuccess (
findAttr' NsStyle "name"
&&&
@@ -423,7 +423,7 @@ readAllStyles = ( readFontPitches
--
readStyles :: StyleReader _x Styles
-readStyles = executeIn NsOffice "styles" $ liftAsSuccess
+readStyles = executeInSub NsOffice "styles" $ liftAsSuccess
$ liftA3 Styles
( tryAll NsStyle "style" readStyle >>^ M.fromList )
( tryAll NsText "list-style" readListStyle >>^ M.fromList )
@@ -431,7 +431,7 @@ readStyles = executeIn NsOffice "styles" $ liftAsSuccess
--
readAutomaticStyles :: StyleReader _x Styles
-readAutomaticStyles = executeIn NsOffice "automatic-styles" $ liftAsSuccess
+readAutomaticStyles = executeInSub NsOffice "automatic-styles" $ liftAsSuccess
$ liftA3 Styles
( tryAll NsStyle "style" readStyle >>^ M.fromList )
( tryAll NsText "list-style" readListStyle >>^ M.fromList )
@@ -462,7 +462,7 @@ readStyleProperties = liftA2 SProps
--
readTextProperties :: StyleReader _x TextProperties
readTextProperties =
- executeIn NsStyle "text-properties" $ liftAsSuccess
+ executeInSub NsStyle "text-properties" $ liftAsSuccess
( liftA6 PropT
( searchAttr NsXSL_FO "font-style" False isFontEmphasised )
( searchAttr NsXSL_FO "font-weight" False isFontBold )
@@ -501,7 +501,7 @@ readLineMode modeAttr styleAttr = proc x -> do
--
readParaProperties :: StyleReader _x ParaProperties
readParaProperties =
- executeIn NsStyle "paragraph-properties" $ liftAsSuccess
+ executeInSub NsStyle "paragraph-properties" $ liftAsSuccess
( liftA3 PropP
( liftA2 readNumbering
( isSet' NsText "number-lines" )