diff options
author | John MacFarlane <jgm@berkeley.edu> | 2019-07-13 15:53:58 -0700 |
---|---|---|
committer | GitHub <noreply@github.com> | 2019-07-13 15:53:58 -0700 |
commit | a16311c225e9b9788ee6499e474f9a9170510b11 (patch) | |
tree | de02402064fc3acc433815854bc494778da7af1c /src/Text/Pandoc | |
parent | 178416194617b330b280a5e4a9e894258c0f8b1f (diff) | |
parent | 449c133406dec231e61b8a3ecbbdfac2cbc00dbc (diff) | |
download | pandoc-a16311c225e9b9788ee6499e474f9a9170510b11.tar.gz |
Merge pull request #5606 from blmage/odt-frames
Improve the parsing of frames in ODT documents
Diffstat (limited to 'src/Text/Pandoc')
-rw-r--r-- | src/Text/Pandoc/Readers/Odt.hs | 5 | ||||
-rw-r--r-- | src/Text/Pandoc/Readers/Odt/ContentReader.hs | 122 | ||||
-rw-r--r-- | src/Text/Pandoc/Readers/Odt/Generic/XMLConverter.hs | 65 | ||||
-rw-r--r-- | src/Text/Pandoc/Readers/Odt/StyleReader.hs | 10 |
4 files changed, 127 insertions, 75 deletions
diff --git a/src/Text/Pandoc/Readers/Odt.hs b/src/Text/Pandoc/Readers/Odt.hs index 3a3d1e992..dfa019932 100644 --- a/src/Text/Pandoc/Readers/Odt.hs +++ b/src/Text/Pandoc/Readers/Odt.hs @@ -86,9 +86,8 @@ archiveToOdt archive where filePathIsOdtMedia :: FilePath -> Bool filePathIsOdtMedia fp = - let (dir, _) = splitFileName fp - in - (dir == "Pictures/") + let (dir, name) = splitFileName fp + in (dir == "Pictures/") || (dir /= "./" && name == "content.xml") -- diff --git a/src/Text/Pandoc/Readers/Odt/ContentReader.hs b/src/Text/Pandoc/Readers/Odt/ContentReader.hs index 1d9a0cb8c..d8e5ba272 100644 --- a/src/Text/Pandoc/Readers/Odt/ContentReader.hs +++ b/src/Text/Pandoc/Readers/Odt/ContentReader.hs @@ -1,5 +1,7 @@ {-# LANGUAGE NoImplicitPrelude #-} {-# LANGUAGE Arrows #-} +{-# LANGUAGE DeriveFoldable #-} +{-# LANGUAGE GeneralizedNewtypeDeriving #-} {-# LANGUAGE PatternGuards #-} {-# LANGUAGE RecordWildCards #-} {-# LANGUAGE TupleSections #-} @@ -26,21 +28,26 @@ import Control.Applicative hiding (liftA, liftA2, liftA3) import Control.Arrow import qualified Data.ByteString.Lazy as B -import Data.List (find, intercalate) +import Data.Foldable (fold) +import Data.List (find, intercalate, stripPrefix) import qualified Data.Map as M import Data.Maybe +import Data.Semigroup (First(..), Option(..)) +import Text.TeXMath (readMathML, writeTeX) import qualified Text.XML.Light as XML import Text.Pandoc.Builder import Text.Pandoc.MediaBag (MediaBag, insertMedia) import Text.Pandoc.Shared import Text.Pandoc.Extensions (extensionsFromList, Extension(..)) +import qualified Text.Pandoc.UTF8 as UTF8 import Text.Pandoc.Readers.Odt.Base import Text.Pandoc.Readers.Odt.Namespaces import Text.Pandoc.Readers.Odt.StyleReader +import Text.Pandoc.Readers.Odt.Arrows.State (foldS) import Text.Pandoc.Readers.Odt.Arrows.Utils import Text.Pandoc.Readers.Odt.Generic.Fallible import Text.Pandoc.Readers.Odt.Generic.Utils @@ -498,6 +505,13 @@ type InlineMatcher = ElementMatcher Inlines type BlockMatcher = ElementMatcher Blocks +newtype FirstMatch a = FirstMatch (Option (First a)) + deriving (Foldable, Monoid, Semigroup) + +firstMatch :: a -> FirstMatch a +firstMatch = FirstMatch . Option . Just . First + + -- matchingElement :: (Monoid e) => Namespace -> ElementName @@ -598,7 +612,7 @@ read_paragraph = matchingElement NsText "p" , read_reference_start , read_bookmark_ref , read_reference_ref - , read_maybe_nested_img_frame + , read_frame , read_text_seq ] read_plain_text @@ -624,7 +638,7 @@ read_header = matchingElement NsText "h" , read_reference_start , read_bookmark_ref , read_reference_ref - , read_maybe_nested_img_frame + , read_frame ] read_plain_text ) -< blocks anchor <- getHeaderAnchor -< children @@ -737,32 +751,43 @@ read_table_cell = matchingElement NsTable "table-cell" ] ---------------------- --- Images +-- Frames ---------------------- -- -read_maybe_nested_img_frame :: InlineMatcher -read_maybe_nested_img_frame = matchingElement NsDraw "frame" - $ proc blocks -> do - img <- (findChild' NsDraw "image") -< () - case img of - Just _ -> read_frame -< blocks - Nothing -> matchChildContent' [ read_frame_text_box ] -< blocks - -read_frame :: OdtReaderSafe Inlines Inlines -read_frame = - proc blocks -> do - let exts = extensionsFromList [Ext_auto_identifiers] - w <- ( findAttr' NsSVG "width" ) -< () - h <- ( findAttr' NsSVG "height" ) -< () - titleNodes <- ( matchChildContent' [ read_frame_title ] ) -< blocks - src <- matchChildContent' [ read_image_src ] -< blocks - resource <- lookupResource -< src - _ <- updateMediaWithResource -< resource - alt <- (matchChildContent [] read_plain_text) -< blocks - arr (uncurry4 imageWith ) -< - (image_attributes w h, src, - inlineListToIdentifier exts (toList titleNodes), alt) +read_frame :: InlineMatcher +read_frame = matchingElement NsDraw "frame" + $ filterChildrenName' NsDraw (`elem` ["image", "object", "text-box"]) + >>> foldS read_frame_child + >>> arr fold + +read_frame_child :: OdtReaderSafe XML.Element (FirstMatch Inlines) +read_frame_child = + proc child -> case elName child of + "image" -> read_frame_img -< child + "object" -> read_frame_mathml -< child + "text-box" -> read_frame_text_box -< child + _ -> returnV mempty -< () + +read_frame_img :: OdtReaderSafe XML.Element (FirstMatch Inlines) +read_frame_img = + proc img -> do + src <- executeIn (findAttr' NsXLink "href") -< img + case fold src of + "" -> returnV mempty -< () + src' -> do + let exts = extensionsFromList [Ext_auto_identifiers] + resource <- lookupResource -< src' + _ <- updateMediaWithResource -< resource + w <- findAttr' NsSVG "width" -< () + h <- findAttr' NsSVG "height" -< () + titleNodes <- matchChildContent' [ read_frame_title ] -< () + alt <- matchChildContent [] read_plain_text -< () + arr (firstMatch . uncurry4 imageWith) -< + (image_attributes w h, src', inlineListToIdentifier exts (toList titleNodes), alt) + +read_frame_title :: InlineMatcher +read_frame_title = matchingElement NsSVG "title" (matchChildContent [] read_plain_text) image_attributes :: Maybe String -> Maybe String -> Attr image_attributes x y = @@ -772,28 +797,29 @@ image_attributes x y = dim name (Just v) = [(name, v)] dim _ Nothing = [] -read_image_src :: (Namespace, ElementName, OdtReader Anchor Anchor) -read_image_src = matchingElement NsDraw "image" - $ proc _ -> do - imgSrc <- findAttr NsXLink "href" -< () - case imgSrc of - Right src -> returnV src -<< () - Left _ -> returnV "" -< () - -read_frame_title :: InlineMatcher -read_frame_title = matchingElement NsSVG "title" (matchChildContent [] read_plain_text) - -read_frame_text_box :: InlineMatcher -read_frame_text_box = matchingElement NsDraw "text-box" - $ proc blocks -> do - paragraphs <- (matchChildContent' [ read_paragraph ]) -< blocks - arr read_img_with_caption -< toList paragraphs - -read_img_with_caption :: [Block] -> Inlines +read_frame_mathml :: OdtReaderSafe XML.Element (FirstMatch Inlines) +read_frame_mathml = + proc obj -> do + src <- executeIn (findAttr' NsXLink "href") -< obj + case fold src of + "" -> returnV mempty -< () + src' -> do + let path = fromMaybe src' (stripPrefix "./" src') ++ "/content.xml" + (_, mathml) <- lookupResource -< path + case readMathML (UTF8.toString $ B.toStrict mathml) of + Left _ -> returnV mempty -< () + Right exps -> arr (firstMatch . displayMath . writeTeX) -< exps + +read_frame_text_box :: OdtReaderSafe XML.Element (FirstMatch Inlines) +read_frame_text_box = proc box -> do + paragraphs <- executeIn (matchChildContent' [ read_paragraph ]) -< box + arr read_img_with_caption -< toList paragraphs + +read_img_with_caption :: [Block] -> FirstMatch Inlines read_img_with_caption (Para [Image attr alt (src,title)] : _) = - singleton (Image attr alt (src, 'f':'i':'g':':':title)) -- no text, default caption + firstMatch $ singleton (Image attr alt (src, 'f':'i':'g':':':title)) -- no text, default caption read_img_with_caption (Para (Image attr _ (src,title) : txt) : _) = - singleton (Image attr txt (src, 'f':'i':'g':':':title) ) -- override caption with the text that follows + firstMatch $ singleton (Image attr txt (src, 'f':'i':'g':':':title) ) -- override caption with the text that follows read_img_with_caption ( Para (_ : xs) : ys) = read_img_with_caption (Para xs : ys) read_img_with_caption _ = @@ -901,8 +927,8 @@ post_process' (Table _ a w h r : Div ("", ["caption"], _) [Para inlines] : xs) = post_process' bs = bs read_body :: OdtReader _x (Pandoc, MediaBag) -read_body = executeIn NsOffice "body" - $ executeIn NsOffice "text" +read_body = executeInSub NsOffice "body" + $ executeInSub NsOffice "text" $ liftAsSuccess $ proc inlines -> do txt <- read_text -< inlines diff --git a/src/Text/Pandoc/Readers/Odt/Generic/XMLConverter.hs b/src/Text/Pandoc/Readers/Odt/Generic/XMLConverter.hs index c45916c03..ccbaf6fc4 100644 --- a/src/Text/Pandoc/Readers/Odt/Generic/XMLConverter.hs +++ b/src/Text/Pandoc/Readers/Odt/Generic/XMLConverter.hs @@ -29,8 +29,10 @@ module Text.Pandoc.Readers.Odt.Generic.XMLConverter , modifyExtraState , producingExtraState , findChild' +, filterChildrenName' , isSet' , isSetWithDefault +, elName , searchAttr , lookupAttr , lookupAttr' @@ -43,6 +45,7 @@ module Text.Pandoc.Readers.Odt.Generic.XMLConverter , readAttrWithDefault , getAttr , executeIn +, executeInSub , withEveryL , tryAll , matchContent' @@ -309,34 +312,44 @@ readNSattributes = fromState $ \state -> maybe (state, failEmpty ) -- | Given a namespace id and an element name, creates a 'XML.QName' for -- internal use -elemName :: (NameSpaceID nsID) +qualifyName :: (NameSpaceID nsID) => nsID -> ElementName -> XMLConverter nsID extraState x XML.QName -elemName nsID name = lookupNSiri nsID +qualifyName nsID name = lookupNSiri nsID &&& lookupNSprefix nsID >>% XML.QName name -- | Checks if a given element matches both a specified namespace id +-- and a predicate +elemNameMatches :: (NameSpaceID nsID) + => nsID -> (ElementName -> Bool) + -> XMLConverter nsID extraState XML.Element Bool +elemNameMatches nsID f = keepingTheValue (lookupNSiri nsID) >>% hasMatchingName + where hasMatchingName e iri = let name = XML.elName e + in f (XML.qName name) + && XML.qURI name == iri + +-- | Checks if a given element matches both a specified namespace id -- and a specified element name elemNameIs :: (NameSpaceID nsID) => nsID -> ElementName -> XMLConverter nsID extraState XML.Element Bool -elemNameIs nsID name = keepingTheValue (lookupNSiri nsID) >>% hasThatName - where hasThatName e iri = let elName = XML.elName e - in XML.qName elName == name - && XML.qURI elName == iri +elemNameIs nsID name = elemNameMatches nsID (== name) -------------------------------------------------------------------------------- -- General content -------------------------------------------------------------------------------- +elName :: XML.Element -> ElementName +elName = XML.qName . XML.elName + -- elContent :: XMLConverter nsID extraState x [XML.Content] elContent = getCurrentElement >>^ XML.elContent -------------------------------------------------------------------------------- --- Chilren +-- Children -------------------------------------------------------------------------------- -- @@ -344,7 +357,7 @@ elContent = getCurrentElement findChildren :: (NameSpaceID nsID) => nsID -> ElementName -> XMLConverter nsID extraState x [XML.Element] -findChildren nsID name = elemName nsID name +findChildren nsID name = qualifyName nsID name &&& getCurrentElement >>% XML.findChildren @@ -353,7 +366,7 @@ findChild' :: (NameSpaceID nsID) => nsID -> ElementName -> XMLConverter nsID extraState x (Maybe XML.Element) -findChild' nsID name = elemName nsID name +findChild' nsID name = qualifyName nsID name &&& getCurrentElement >>% XML.findChild @@ -364,6 +377,14 @@ findChild :: (NameSpaceID nsID) findChild nsID name = findChild' nsID name >>> maybeToChoice +filterChildrenName' :: (NameSpaceID nsID) + => nsID + -> (ElementName -> Bool) + -> XMLConverter nsID extraState x [XML.Element] +filterChildrenName' nsID f = getCurrentElement + >>> arr XML.elChildren + >>> iterateS (keepingTheValue (elemNameMatches nsID f)) + >>> arr (catMaybes . fmap (uncurry $ bool Nothing . Just)) -------------------------------------------------------------------------------- -- Attributes @@ -441,7 +462,7 @@ lookupDefaultingAttr nsID attrName findAttr' :: (NameSpaceID nsID) => nsID -> AttributeName -> XMLConverter nsID extraState x (Maybe AttributeValue) -findAttr' nsID attrName = elemName nsID attrName +findAttr' nsID attrName = qualifyName nsID attrName &&& getCurrentElement >>% XML.findAttr @@ -537,15 +558,21 @@ executeThere a = second jumpThere >>> jumpBack -- >>? jumpBack would not ensure the jump. >>^ collapseEither --- | Do something in a sub-element, tnen come back -executeIn :: (NameSpaceID nsID) - => nsID -> ElementName - -> FallibleXMLConverter nsID extraState f s - -> FallibleXMLConverter nsID extraState f s -executeIn nsID name a = keepingTheValue - (findChild nsID name) - >>> ignoringState liftFailure - >>? switchingTheStack a + +-- | Do something in a specific element, then come back +executeIn :: XMLConverter nsID extraState XML.Element s + -> XMLConverter nsID extraState XML.Element s +executeIn a = duplicate >>> switchingTheStack a + +-- | Do something in a sub-element, then come back +executeInSub :: (NameSpaceID nsID) + => nsID -> ElementName + -> FallibleXMLConverter nsID extraState f s + -> FallibleXMLConverter nsID extraState f s +executeInSub nsID name a = keepingTheValue + (findChild nsID name) + >>> ignoringState liftFailure + >>? switchingTheStack a where liftFailure (_, (Left f)) = Left f liftFailure (x, (Right e)) = Right (x, e) diff --git a/src/Text/Pandoc/Readers/Odt/StyleReader.hs b/src/Text/Pandoc/Readers/Odt/StyleReader.hs index 23ca57786..79e8d7aea 100644 --- a/src/Text/Pandoc/Readers/Odt/StyleReader.hs +++ b/src/Text/Pandoc/Readers/Odt/StyleReader.hs @@ -113,7 +113,7 @@ type StyleReaderSafe a b = XMLReaderSafe FontPitches a b -- | A reader for font pitches fontPitchReader :: XMLReader _s _x FontPitches -fontPitchReader = executeIn NsOffice "font-face-decls" ( +fontPitchReader = executeInSub NsOffice "font-face-decls" ( withEveryL NsStyle "font-face" (liftAsSuccess ( findAttr' NsStyle "name" &&& @@ -423,7 +423,7 @@ readAllStyles = ( readFontPitches -- readStyles :: StyleReader _x Styles -readStyles = executeIn NsOffice "styles" $ liftAsSuccess +readStyles = executeInSub NsOffice "styles" $ liftAsSuccess $ liftA3 Styles ( tryAll NsStyle "style" readStyle >>^ M.fromList ) ( tryAll NsText "list-style" readListStyle >>^ M.fromList ) @@ -431,7 +431,7 @@ readStyles = executeIn NsOffice "styles" $ liftAsSuccess -- readAutomaticStyles :: StyleReader _x Styles -readAutomaticStyles = executeIn NsOffice "automatic-styles" $ liftAsSuccess +readAutomaticStyles = executeInSub NsOffice "automatic-styles" $ liftAsSuccess $ liftA3 Styles ( tryAll NsStyle "style" readStyle >>^ M.fromList ) ( tryAll NsText "list-style" readListStyle >>^ M.fromList ) @@ -462,7 +462,7 @@ readStyleProperties = liftA2 SProps -- readTextProperties :: StyleReader _x TextProperties readTextProperties = - executeIn NsStyle "text-properties" $ liftAsSuccess + executeInSub NsStyle "text-properties" $ liftAsSuccess ( liftA6 PropT ( searchAttr NsXSL_FO "font-style" False isFontEmphasised ) ( searchAttr NsXSL_FO "font-weight" False isFontBold ) @@ -501,7 +501,7 @@ readLineMode modeAttr styleAttr = proc x -> do -- readParaProperties :: StyleReader _x ParaProperties readParaProperties = - executeIn NsStyle "paragraph-properties" $ liftAsSuccess + executeInSub NsStyle "paragraph-properties" $ liftAsSuccess ( liftA3 PropP ( liftA2 readNumbering ( isSet' NsText "number-lines" ) |