diff options
-rw-r--r-- | src/Text/Pandoc/Readers/Docx.hs | 18 | ||||
-rw-r--r-- | src/Text/Pandoc/Readers/Docx/Parse.hs | 247 | ||||
-rw-r--r-- | test/Tests/Readers/Docx.hs | 4 | ||||
-rw-r--r-- | test/docx/nested_instrText.docx | bin | 0 -> 14112 bytes | |||
-rw-r--r-- | test/docx/nested_instrText.native | 5 |
5 files changed, 159 insertions, 115 deletions
diff --git a/src/Text/Pandoc/Readers/Docx.hs b/src/Text/Pandoc/Readers/Docx.hs index 66cd84291..462e3c679 100644 --- a/src/Text/Pandoc/Readers/Docx.hs +++ b/src/Text/Pandoc/Readers/Docx.hs @@ -246,8 +246,8 @@ runToText _ = "" parPartToText :: ParPart -> T.Text parPartToText (PlainRun run) = runToText run -parPartToText (InternalHyperLink _ runs) = T.concat $ map runToText runs -parPartToText (ExternalHyperLink _ runs) = T.concat $ map runToText runs +parPartToText (InternalHyperLink _ children) = T.concat $ map parPartToText children +parPartToText (ExternalHyperLink _ children) = T.concat $ map parPartToText children parPartToText _ = "" blacklistedCharStyles :: [CharStyleName] @@ -437,18 +437,18 @@ parPartToInlines' Chart = return $ spanWith ("", ["chart"], []) $ text "[CHART]" parPartToInlines' Diagram = return $ spanWith ("", ["diagram"], []) $ text "[DIAGRAM]" -parPartToInlines' (InternalHyperLink anchor runs) = do - ils <- smushInlines <$> mapM runToInlines runs +parPartToInlines' (InternalHyperLink anchor children) = do + ils <- smushInlines <$> mapM parPartToInlines' children return $ link ("#" <> anchor) "" ils -parPartToInlines' (ExternalHyperLink target runs) = do - ils <- smushInlines <$> mapM runToInlines runs +parPartToInlines' (ExternalHyperLink target children) = do + ils <- smushInlines <$> mapM parPartToInlines' children return $ link target "" ils parPartToInlines' (PlainOMath exps) = return $ math $ writeTeX exps -parPartToInlines' (Field info runs) = +parPartToInlines' (Field info children) = case info of - HyperlinkField url -> parPartToInlines' $ ExternalHyperLink url runs - UnknownField -> smushInlines <$> mapM runToInlines runs + HyperlinkField url -> parPartToInlines' $ ExternalHyperLink url children + _ -> smushInlines <$> mapM parPartToInlines' children parPartToInlines' NullParPart = return mempty isAnchorSpan :: Inline -> Bool diff --git a/src/Text/Pandoc/Readers/Docx/Parse.hs b/src/Text/Pandoc/Readers/Docx/Parse.hs index e4d3ea6f8..a97d4b3d1 100644 --- a/src/Text/Pandoc/Readers/Docx/Parse.hs +++ b/src/Text/Pandoc/Readers/Docx/Parse.hs @@ -93,14 +93,13 @@ data ReaderEnv = ReaderEnv { envNotes :: Notes deriving Show data ReaderState = ReaderState { stateWarnings :: [T.Text] - , stateFldCharState :: FldCharState + , stateFldCharState :: [FldCharState] } deriving Show data FldCharState = FldCharOpen | FldCharFieldInfo FieldInfo - | FldCharContent FieldInfo [Run] - | FldCharClosed + | FldCharContent FieldInfo [ParPart] deriving (Show) data DocxError = DocxError @@ -314,13 +313,13 @@ data ParPart = PlainRun Run | CommentStart CommentId Author (Maybe CommentDate) [BodyPart] | CommentEnd CommentId | BookMark BookMarkId Anchor - | InternalHyperLink Anchor [Run] - | ExternalHyperLink URL [Run] + | InternalHyperLink Anchor [ParPart] + | ExternalHyperLink URL [ParPart] | Drawing FilePath T.Text T.Text B.ByteString Extent -- title, alt | Chart -- placeholder for now | Diagram -- placeholder for now | PlainOMath [Exp] - | Field FieldInfo [Run] + | Field FieldInfo [ParPart] | NullParPart -- when we need to return nothing, but -- not because of an error. deriving Show @@ -373,7 +372,7 @@ archiveToDocxWithWarnings archive = do , envDocXmlPath = docXmlPath } rState = ReaderState { stateWarnings = [] - , stateFldCharState = FldCharClosed + , stateFldCharState = [] } (eitherDoc, st) = runD (archiveToDocument archive) rEnv rState case eitherDoc of @@ -701,28 +700,31 @@ elemToBodyPart ns element elemToBodyPart ns element | isElem ns "w" "p" element = do parstyle <- elemToParagraphStyle ns element <$> asks envParStyles - parparts <- mapD (elemToParPart ns) (elChildren element) + parparts' <- mapD (elemToParPart ns) (elChildren element) + fldCharState <- gets stateFldCharState + modify $ \st -> st {stateFldCharState = emptyFldCharContents fldCharState} -- Word uses list enumeration for numbered headings, so we only -- want to infer a list from the styles if it is NOT a heading. - case pHeading parstyle of - Nothing | Just (numId, lvl) <- pNumInfo parstyle -> do - levelInfo <- lookupLevel numId lvl <$> asks envNumbering - return $ ListItem parstyle numId lvl levelInfo parparts - _ -> let - hasCaptionStyle = elem "Caption" (pStyleId <$> pStyle parstyle) - - hasSimpleTableField = fromMaybe False $ do - fldSimple <- findChildByName ns "w" "fldSimple" element - instr <- findAttrByName ns "w" "instr" fldSimple - pure ("Table" `elem` T.words instr) - - hasComplexTableField = fromMaybe False $ do - instrText <- findElementByName ns "w" "instrText" element - pure ("Table" `elem` T.words (strContent instrText)) - - in if hasCaptionStyle && (hasSimpleTableField || hasComplexTableField) - then return $ TblCaption parstyle parparts - else return $ Paragraph parstyle parparts + let parparts = parparts' ++ (openFldCharsToParParts fldCharState) in + case pHeading parstyle of + Nothing | Just (numId, lvl) <- pNumInfo parstyle -> do + levelInfo <- lookupLevel numId lvl <$> asks envNumbering + return $ ListItem parstyle numId lvl levelInfo parparts + _ -> let + hasCaptionStyle = elem "Caption" (pStyleId <$> pStyle parstyle) + + hasSimpleTableField = fromMaybe False $ do + fldSimple <- findChildByName ns "w" "fldSimple" element + instr <- findAttrByName ns "w" "instr" fldSimple + pure ("Table" `elem` T.words instr) + + hasComplexTableField = fromMaybe False $ do + instrText <- findElementByName ns "w" "instrText" element + pure ("Table" `elem` T.words (strContent instrText)) + + in if hasCaptionStyle && (hasSimpleTableField || hasComplexTableField) + then return $ TblCaption parstyle parparts + else return $ Paragraph parstyle parparts elemToBodyPart ns element | isElem ns "w" "tbl" element = do @@ -754,6 +756,19 @@ lookupRelationship docLocation relid rels = where pairs = map (\(Relationship loc relid' target) -> ((loc, relid'), target)) rels +openFldCharsToParParts :: [FldCharState] -> [ParPart] +openFldCharsToParParts [] = [] +openFldCharsToParParts (FldCharContent info children : ancestors) = case openFldCharsToParParts ancestors of + Field parentInfo siblings : _ -> [Field parentInfo $ siblings ++ [Field info $ reverse children]] + _ -> [Field info $ reverse children] +openFldCharsToParParts (_ : ancestors) = openFldCharsToParParts ancestors + +emptyFldCharContents :: [FldCharState] -> [FldCharState] +emptyFldCharContents = map + (\x -> case x of + FldCharContent info _ -> FldCharContent info [] + _ -> x) + expandDrawingId :: T.Text -> D (FilePath, B.ByteString) expandDrawingId s = do location <- asks envLocation @@ -778,51 +793,6 @@ getTitleAndAlt ns element = in (title, alt) elemToParPart :: NameSpaces -> Element -> D ParPart -elemToParPart ns element - | isElem ns "w" "r" element - , Just drawingElem <- findChildByName ns "w" "drawing" element - , pic_ns <- "http://schemas.openxmlformats.org/drawingml/2006/picture" - , Just picElem <- findElement (QName "pic" (Just pic_ns) (Just "pic")) drawingElem - = let (title, alt) = getTitleAndAlt ns drawingElem - a_ns = "http://schemas.openxmlformats.org/drawingml/2006/main" - drawing = findElement (QName "blip" (Just a_ns) (Just "a")) picElem - >>= findAttrByName ns "r" "embed" - in - case drawing of - Just s -> expandDrawingId s >>= (\(fp, bs) -> return $ Drawing fp title alt bs $ elemToExtent drawingElem) - Nothing -> throwError WrongElem --- The two cases below are an attempt to deal with images in deprecated vml format. --- Todo: check out title and attr for deprecated format. -elemToParPart ns element - | isElem ns "w" "r" element - , Just _ <- findChildByName ns "w" "pict" element = - let drawing = findElement (elemName ns "v" "imagedata") element - >>= findAttrByName ns "r" "id" - in - case drawing of - Just s -> expandDrawingId s >>= (\(fp, bs) -> return $ Drawing fp "" "" bs Nothing) - Nothing -> throwError WrongElem -elemToParPart ns element - | isElem ns "w" "r" element - , Just objectElem <- findChildByName ns "w" "object" element - , Just shapeElem <- findChildByName ns "v" "shape" objectElem - , Just imagedataElem <- findChildByName ns "v" "imagedata" shapeElem - , Just drawingId <- findAttrByName ns "r" "id" imagedataElem - = expandDrawingId drawingId >>= (\(fp, bs) -> return $ Drawing fp "" "" bs Nothing) --- Diagram -elemToParPart ns element - | isElem ns "w" "r" element - , Just drawingElem <- findChildByName ns "w" "drawing" element - , d_ns <- "http://schemas.openxmlformats.org/drawingml/2006/diagram" - , Just _ <- findElement (QName "relIds" (Just d_ns) (Just "dgm")) drawingElem - = return Diagram --- Chart -elemToParPart ns element - | isElem ns "w" "r" element - , Just drawingElem <- findChildByName ns "w" "drawing" element - , c_ns <- "http://schemas.openxmlformats.org/drawingml/2006/chart" - , Just _ <- findElement (QName "chart" (Just c_ns) (Just "c")) drawingElem - = return Chart {- The next one is a bit complicated. fldChar fields work by first having a <w:fldChar fldCharType="begin"> in a run, then a run with @@ -854,8 +824,13 @@ example (omissions and my comments in brackets): So we do this in a number of steps. If we encounter the fldchar begin tag, we start open a fldchar state variable (see state above). We add the instrtext to it as FieldInfo. Then we close that and start adding -the runs when we get to separate. Then when we get to end, we produce -the Field type with appropriate FieldInfo and Runs. +the children when we get to separate. Then when we get to end, we produce +the Field type with appropriate FieldInfo and ParParts. + +Since there can be nested fields, the fldchar state needs to be a stack, +so we can have multiple fldchars open at the same time. When a fldchar is +closed, we either add the resulting field to its parent or we return it if +there is no parent. -} elemToParPart ns element | isElem ns "w" "r" element @@ -863,78 +838,138 @@ elemToParPart ns element , Just fldCharType <- findAttrByName ns "w" "fldCharType" fldChar = do fldCharState <- gets stateFldCharState case fldCharState of - FldCharClosed | fldCharType == "begin" -> do - modify $ \st -> st {stateFldCharState = FldCharOpen} + _ | fldCharType == "begin" -> do + modify $ \st -> st {stateFldCharState = FldCharOpen : fldCharState} return NullParPart - FldCharFieldInfo info | fldCharType == "separate" -> do - modify $ \st -> st {stateFldCharState = FldCharContent info []} + FldCharFieldInfo info : ancestors | fldCharType == "separate" -> do + modify $ \st -> st {stateFldCharState = FldCharContent info [] : ancestors} return NullParPart - FldCharContent info runs | fldCharType == "end" -> do - modify $ \st -> st {stateFldCharState = FldCharClosed} - return $ Field info $ reverse runs + [FldCharContent info children] | fldCharType == "end" -> do + modify $ \st -> st {stateFldCharState = []} + return $ Field info $ reverse children + FldCharContent info children : FldCharContent parentInfo siblings : ancestors | fldCharType == "end" -> + let parent = FldCharContent parentInfo $ (Field info (reverse children)) : siblings in do + modify $ \st -> st {stateFldCharState = parent : ancestors} + return NullParPart _ -> throwError WrongElem elemToParPart ns element | isElem ns "w" "r" element , Just instrText <- findChildByName ns "w" "instrText" element = do fldCharState <- gets stateFldCharState case fldCharState of - FldCharOpen -> do + FldCharOpen : ancestors -> do info <- eitherToD $ parseFieldInfo $ strContent instrText - modify $ \st -> st{stateFldCharState = FldCharFieldInfo info} + modify $ \st -> st {stateFldCharState = FldCharFieldInfo info : ancestors} return NullParPart _ -> return NullParPart -elemToParPart ns element +{- +There is an open fldchar, so we calculate the element and add it to the +children. For this we need to first change the fldchar state to an empty +stack to avoid descendants of children simply being added to the state instead +of to their direct parent element. This would happen in the case of a +w:hyperlink element for example. +-} +elemToParPart ns element = do + fldCharState <- gets stateFldCharState + case fldCharState of + FldCharContent info children : ancestors -> do + modify $ \st -> st {stateFldCharState = []} + parPart <- elemToParPart' ns element `catchError` \_ -> return NullParPart + modify $ \st -> st{stateFldCharState = FldCharContent info (parPart : children) : ancestors} + return NullParPart + _ -> elemToParPart' ns element + +elemToParPart' :: NameSpaces -> Element -> D ParPart +elemToParPart' ns element + | isElem ns "w" "r" element + , Just drawingElem <- findChildByName ns "w" "drawing" element + , pic_ns <- "http://schemas.openxmlformats.org/drawingml/2006/picture" + , Just picElem <- findElement (QName "pic" (Just pic_ns) (Just "pic")) drawingElem + = let (title, alt) = getTitleAndAlt ns drawingElem + a_ns = "http://schemas.openxmlformats.org/drawingml/2006/main" + drawing = findElement (QName "blip" (Just a_ns) (Just "a")) picElem + >>= findAttrByName ns "r" "embed" + in + case drawing of + Just s -> expandDrawingId s >>= (\(fp, bs) -> return $ Drawing fp title alt bs $ elemToExtent drawingElem) + Nothing -> throwError WrongElem +-- The two cases below are an attempt to deal with images in deprecated vml format. +-- Todo: check out title and attr for deprecated format. +elemToParPart' ns element + | isElem ns "w" "r" element + , Just _ <- findChildByName ns "w" "pict" element = + let drawing = findElement (elemName ns "v" "imagedata") element + >>= findAttrByName ns "r" "id" + in + case drawing of + Just s -> expandDrawingId s >>= (\(fp, bs) -> return $ Drawing fp "" "" bs Nothing) + Nothing -> throwError WrongElem +elemToParPart' ns element + | isElem ns "w" "r" element + , Just objectElem <- findChildByName ns "w" "object" element + , Just shapeElem <- findChildByName ns "v" "shape" objectElem + , Just imagedataElem <- findChildByName ns "v" "imagedata" shapeElem + , Just drawingId <- findAttrByName ns "r" "id" imagedataElem + = expandDrawingId drawingId >>= (\(fp, bs) -> return $ Drawing fp "" "" bs Nothing) +-- Diagram +elemToParPart' ns element + | isElem ns "w" "r" element + , Just drawingElem <- findChildByName ns "w" "drawing" element + , d_ns <- "http://schemas.openxmlformats.org/drawingml/2006/diagram" + , Just _ <- findElement (QName "relIds" (Just d_ns) (Just "dgm")) drawingElem + = return Diagram +-- Chart +elemToParPart' ns element + | isElem ns "w" "r" element + , Just drawingElem <- findChildByName ns "w" "drawing" element + , c_ns <- "http://schemas.openxmlformats.org/drawingml/2006/chart" + , Just _ <- findElement (QName "chart" (Just c_ns) (Just "c")) drawingElem + = return Chart +elemToParPart' ns element | isElem ns "w" "r" element = do run <- elemToRun ns element - -- we check to see if we have an open FldChar in state that we're - -- recording. - fldCharState <- gets stateFldCharState - case fldCharState of - FldCharContent info runs -> do - modify $ \st -> st{stateFldCharState = FldCharContent info (run : runs)} - return NullParPart - _ -> return $ PlainRun run -elemToParPart ns element + return $ PlainRun run +elemToParPart' ns element | Just change <- getTrackedChange ns element = do runs <- mapD (elemToRun ns) (elChildren element) return $ ChangedRuns change runs -elemToParPart ns element +elemToParPart' ns element | isElem ns "w" "bookmarkStart" element , Just bmId <- findAttrByName ns "w" "id" element , Just bmName <- findAttrByName ns "w" "name" element = return $ BookMark bmId bmName -elemToParPart ns element +elemToParPart' ns element | isElem ns "w" "hyperlink" element , Just relId <- findAttrByName ns "r" "id" element = do location <- asks envLocation - runs <- mapD (elemToRun ns) (elChildren element) + children <- mapD (elemToParPart ns) (elChildren element) rels <- asks envRelationships case lookupRelationship location relId rels of Just target -> case findAttrByName ns "w" "anchor" element of - Just anchor -> return $ ExternalHyperLink (target <> "#" <> anchor) runs - Nothing -> return $ ExternalHyperLink target runs - Nothing -> return $ ExternalHyperLink "" runs -elemToParPart ns element + Just anchor -> return $ ExternalHyperLink (target <> "#" <> anchor) children + Nothing -> return $ ExternalHyperLink target children + Nothing -> return $ ExternalHyperLink "" children +elemToParPart' ns element | isElem ns "w" "hyperlink" element , Just anchor <- findAttrByName ns "w" "anchor" element = do - runs <- mapD (elemToRun ns) (elChildren element) - return $ InternalHyperLink anchor runs -elemToParPart ns element + children <- mapD (elemToParPart ns) (elChildren element) + return $ InternalHyperLink anchor children +elemToParPart' ns element | isElem ns "w" "commentRangeStart" element , Just cmtId <- findAttrByName ns "w" "id" element = do (Comments _ commentMap) <- asks envComments case M.lookup cmtId commentMap of Just cmtElem -> elemToCommentStart ns cmtElem Nothing -> throwError WrongElem -elemToParPart ns element +elemToParPart' ns element | isElem ns "w" "commentRangeEnd" element , Just cmtId <- findAttrByName ns "w" "id" element = return $ CommentEnd cmtId -elemToParPart ns element +elemToParPart' ns element | isElem ns "m" "oMath" element = fmap PlainOMath (eitherToD $ readOMML $ showElement element) -elemToParPart _ _ = throwError WrongElem +elemToParPart' _ _ = throwError WrongElem elemToCommentStart :: NameSpaces -> Element -> D ParPart elemToCommentStart ns element diff --git a/test/Tests/Readers/Docx.hs b/test/Tests/Readers/Docx.hs index 2f28af317..af6023836 100644 --- a/test/Tests/Readers/Docx.hs +++ b/test/Tests/Readers/Docx.hs @@ -148,6 +148,10 @@ tests = [ testGroup "document" "docx/instrText_hyperlink.docx" "docx/instrText_hyperlink.native" , testCompare + "nested fields with <w:instrText> tag" + "docx/nested_instrText.docx" + "docx/nested_instrText.native" + , testCompare "inline image" "docx/image.docx" "docx/image_no_embed.native" diff --git a/test/docx/nested_instrText.docx b/test/docx/nested_instrText.docx Binary files differnew file mode 100644 index 000000000..532584193 --- /dev/null +++ b/test/docx/nested_instrText.docx diff --git a/test/docx/nested_instrText.native b/test/docx/nested_instrText.native new file mode 100644 index 000000000..730b041f5 --- /dev/null +++ b/test/docx/nested_instrText.native @@ -0,0 +1,5 @@ +[Para [Str "\24076\26395\28145\20837\20102\35299\30340\35835\32773\21487\20197\21435\30475David",Space,Str "French",Space,Str "Belding\21644Kevin",Space,Str "J.",Space,Str "Mitchell\30340" + ,Link ("",[],[]) [Str "Foundations",Space,Str "of",Space,Str "Analysis,",Space,Str "1/16/18",Space,Str "8:40:00",Space,Str "AM,",Space,Str "2nd",Space,Str "Edition"] ("https://books.google.com/books?id=sp_Zcb9ot90C&lpg=PR4&hl=zh-CN&pg=PA19#v=onepage&q&f=true","") + ,Str ",\21487\20174\&19\39029\30475\36215\65292\25110D.C.",Space,Str "Goldrei\30340",Space + ,Link ("",[],[]) [Str "Classic",Space,Str "Set",Space,Str "Theory:",Space,Str "For",Space,Str "Guided",Space,Str "Independent",Space,Str "Study"] ("https://books.google.ae/books?id=dlc0DwAAQBAJ&lpg=PT29&hl=zh-CN&pg=PT26#v=onepage&q&f=true","") + ,Str "\65292\20174\31532\20108\31456\30475\36215\65292\38405\35835\26102\35201\27880\24847\26412\25991\19982\36825\20123\20070\25152\19981\21516\30340\26159\24182\27809\26377\25226\23454\25968\30475\20316\26159\26377\29702\25968\38598\30340\20998\21106\12290"]] |