Docx Reader: preprocess Document body to unwrap "w:sdt" elements

We walk through the document (using the zipper in Text.XML.Light.Cursor) to unwrap the sdt tags before doing the rest of the parsing of the document. Note that the function is generically named `walkDocument` in case we need to do any further preprocessing in the future. Closes #4190
author: Jesse Rosenthal <jrosenthal@jhu.edu> 2017-12-27 09:49:28 -0500
committer: Jesse Rosenthal <jrosenthal@jhu.edu> 2017-12-27 10:03:00 -0500
commit: dc3ee500a0447dc258ae5b49cf5907cba0d407aa (patch)
tree: a56379ab9c4fd70447da2f3cc28f318ff46c6533 /src/Text/Pandoc/Readers/Docx
parent: 7e8cfc099017b554dc4105a8c56544867784a9f8 (diff)
download: pandoc-dc3ee500a0447dc258ae5b49cf5907cba0d407aa.tar.gz
1 files changed, 31 insertions, 1 deletions
diff --git a/src/Text/Pandoc/Readers/Docx/Parse.hs b/src/Text/Pandoc/Readers/Docx/Parse.hs
index 99e6f99e6..48a512be2 100644
--- a/src/Text/Pandoc/Readers/Docx/Parse.hs
+++ b/src/Text/Pandoc/Readers/Docx/Parse.hs
@@ -73,6 +73,7 @@ import Text.TeXMath (Exp)
 import Text.TeXMath.Readers.OMML (readOMML)
 import Text.TeXMath.Unicode.Fonts (Font (..), getUnicode, stringToFont)
 import Text.XML.Light
+import qualified Text.XML.Light.Cursor as XMLC
 
 data ReaderEnv = ReaderEnv { envNotes         :: Notes
                            , envComments      :: Comments
@@ -117,6 +118,32 @@ mapD f xs =
   in
    concatMapM handler xs
 
+unwrapSDT :: NameSpaces -> Content -> Content
+unwrapSDT ns (Elem element)
+  | isElem ns "w" "sdt" element
+  , Just sdtContent <- findChildByName ns "w" "sdtContent" element
+  , child : _    <- elChildren sdtContent
+  = Elem child
+unwrapSDT _ content = content
+
+walkDocument' :: NameSpaces -> XMLC.Cursor -> XMLC.Cursor
+walkDocument' ns cur =
+  let modifiedCur = XMLC.modifyContent (unwrapSDT ns) cur
+  in
+    case XMLC.nextDF modifiedCur of
+      Just cur' -> walkDocument' ns cur'
+      Nothing   -> XMLC.root modifiedCur
+
+walkDocument :: NameSpaces -> Element -> Maybe Element
+walkDocument ns element =
+  let cur = XMLC.fromContent (Elem element)
+      cur' = walkDocument' ns cur
+  in
+    case XMLC.toTree cur' of
+      Elem element' -> Just element'
+      _             -> Nothing
+
+
 data Docx = Docx Document
           deriving Show
 
@@ -298,7 +325,10 @@ archiveToDocument zf = do
   docElem <- maybeToD $ (parseXMLDoc . UTF8.toStringLazy . fromEntry) entry
   let namespaces = elemToNameSpaces docElem
   bodyElem <- maybeToD $ findChildByName namespaces "w" "body" docElem
-  body <- elemToBody namespaces bodyElem
+  let bodyElem' = case walkDocument namespaces bodyElem of
+        Just e -> e
+        Nothing -> bodyElem
+  body <- elemToBody namespaces bodyElem'
   return $ Document namespaces body
 
 elemToBody :: NameSpaces -> Element -> D Body
author	Jesse Rosenthal <jrosenthal@jhu.edu>	2017-12-27 09:49:28 -0500
committer	Jesse Rosenthal <jrosenthal@jhu.edu>	2017-12-27 10:03:00 -0500
commit	dc3ee500a0447dc258ae5b49cf5907cba0d407aa (patch)
tree	a56379ab9c4fd70447da2f3cc28f318ff46c6533 /src/Text/Pandoc/Readers/Docx
parent	7e8cfc099017b554dc4105a8c56544867784a9f8 (diff)
download	pandoc-dc3ee500a0447dc258ae5b49cf5907cba0d407aa.tar.gz