From 4cce0efa4824b4081a2d971c488c5026c314bbb5 Mon Sep 17 00:00:00 2001
From: Jesse Rosenthal
Date: Wed, 6 Feb 2019 21:06:14 -0500
Subject: Docx reader: Dynamically determine document.xml path.

The desktop Word program places the main document file in
"word/document.xml", but the online word places it in
"word/document2.xml". This file path is actually stated in the root
"_rels/.rels" file, in the "Relationship" element with an
"http://../officedocument" type.

Closes #5277
---
Note (review): this copy of the patch was amended after the original commit.
getDocumentPath now strips a leading "/" from the relationship Target and
carries a Haddock comment; the hunk header and diffstat below were updated
to match. The blob hashes on the "index" line were not recomputed.

 src/Text/Pandoc/Readers/Docx/Parse.hs | 25 ++++++++++++++++++++++---
 1 file changed, 22 insertions(+), 3 deletions(-)

diff --git a/src/Text/Pandoc/Readers/Docx/Parse.hs b/src/Text/Pandoc/Readers/Docx/Parse.hs
index e551ca7aa..fa4dc90a6 100644
--- a/src/Text/Pandoc/Readers/Docx/Parse.hs
+++ b/src/Text/Pandoc/Readers/Docx/Parse.hs
@@ -359,12 +359,31 @@ archiveToDocxWithWarnings archive = do
     Right doc -> Right (Docx doc, stateWarnings st)
     Left e    -> Left e
 
-
+-- | Find the archive path of the main document part.
+--
+-- Reads the package-level @_rels/.rels@ entry and returns the @Target@ of
+-- the first @Relationship@ whose @Type@ is the officeDocument relationship
+-- URI.  Yields 'Nothing' when the entry is missing, is not parseable XML,
+-- or contains no such relationship with a @Target@.
+getDocumentPath :: Archive -> Maybe String
+getDocumentPath zf = do
+  entry <- findEntryByPath "_rels/.rels" zf
+  relsElem <- (parseXMLDoc . UTF8.toStringLazy . fromEntry) entry
+  let rels = filterChildrenName (\n -> qName n == "Relationship") relsElem
+  rel <- listToMaybe $
+         filter (\e -> findAttr (QName "Type" Nothing Nothing) e ==
+                       Just "http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument")
+         rels
+  target <- findAttr (QName "Target" Nothing Nothing) rel
+  -- The Target may be written as an absolute part name
+  -- ("/word/document.xml"), while archive entry paths are relative;
+  -- drop any leading slash so the lookup in the archive can succeed.
+  return $ dropWhile (== '/') target
 
 archiveToDocument :: Archive -> D Document
 archiveToDocument zf = do
-  entry <- maybeToD $ findEntryByPath "word/document.xml" zf
-             `mplus` findEntryByPath "word/document2.xml" zf -- see #5277
+  docPath <- maybeToD $ getDocumentPath zf
+  entry <- maybeToD $ findEntryByPath docPath zf
   docElem <- maybeToD $ (parseXMLDoc . UTF8.toStringLazy . fromEntry) entry
   let namespaces = elemToNameSpaces docElem
   bodyElem <- maybeToD $ findChildByName namespaces "w" "body" docElem
-- 
cgit v1.2.3