aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMilan Bracke <mbracke@antidot.net>2021-06-14 15:00:36 +0200
committerJohn MacFarlane <jgm@berkeley.edu>2021-10-18 19:15:40 -0700
commit193f6bfebaa43d0d6749d10a4e7ca78a0d31361d (patch)
treed7d84ad824082520b1bb7878864d67cd904a7a57
parent8de261ba4e1e03f9dd5f78297d7299e9d78bbcfd (diff)
downloadpandoc-193f6bfebaa43d0d6749d10a4e7ca78a0d31361d.tar.gz
Docx reader: fix handling of nested fields
Fields delimited by fldChar elements can contain other fields. Before, the nested fields would be ignored, except for the end, which would be considered the end of the parent field. To fix this issue, fields needed to be considered containing ParParts instead of Runs, since a Run can't represent complex enough structures. This also impacted Hyperlinks since they can originate from a field.
-rw-r--r--src/Text/Pandoc/Readers/Docx.hs18
-rw-r--r--src/Text/Pandoc/Readers/Docx/Parse.hs247
-rw-r--r--test/Tests/Readers/Docx.hs4
-rw-r--r--test/docx/nested_instrText.docxbin0 -> 14112 bytes
-rw-r--r--test/docx/nested_instrText.native5
5 files changed, 159 insertions, 115 deletions
diff --git a/src/Text/Pandoc/Readers/Docx.hs b/src/Text/Pandoc/Readers/Docx.hs
index 66cd84291..462e3c679 100644
--- a/src/Text/Pandoc/Readers/Docx.hs
+++ b/src/Text/Pandoc/Readers/Docx.hs
@@ -246,8 +246,8 @@ runToText _ = ""
parPartToText :: ParPart -> T.Text
parPartToText (PlainRun run) = runToText run
-parPartToText (InternalHyperLink _ runs) = T.concat $ map runToText runs
-parPartToText (ExternalHyperLink _ runs) = T.concat $ map runToText runs
+parPartToText (InternalHyperLink _ children) = T.concat $ map parPartToText children
+parPartToText (ExternalHyperLink _ children) = T.concat $ map parPartToText children
parPartToText _ = ""
blacklistedCharStyles :: [CharStyleName]
@@ -437,18 +437,18 @@ parPartToInlines' Chart =
return $ spanWith ("", ["chart"], []) $ text "[CHART]"
parPartToInlines' Diagram =
return $ spanWith ("", ["diagram"], []) $ text "[DIAGRAM]"
-parPartToInlines' (InternalHyperLink anchor runs) = do
- ils <- smushInlines <$> mapM runToInlines runs
+parPartToInlines' (InternalHyperLink anchor children) = do
+ ils <- smushInlines <$> mapM parPartToInlines' children
return $ link ("#" <> anchor) "" ils
-parPartToInlines' (ExternalHyperLink target runs) = do
- ils <- smushInlines <$> mapM runToInlines runs
+parPartToInlines' (ExternalHyperLink target children) = do
+ ils <- smushInlines <$> mapM parPartToInlines' children
return $ link target "" ils
parPartToInlines' (PlainOMath exps) =
return $ math $ writeTeX exps
-parPartToInlines' (Field info runs) =
+parPartToInlines' (Field info children) =
case info of
- HyperlinkField url -> parPartToInlines' $ ExternalHyperLink url runs
- UnknownField -> smushInlines <$> mapM runToInlines runs
+ HyperlinkField url -> parPartToInlines' $ ExternalHyperLink url children
+ _ -> smushInlines <$> mapM parPartToInlines' children
parPartToInlines' NullParPart = return mempty
isAnchorSpan :: Inline -> Bool
diff --git a/src/Text/Pandoc/Readers/Docx/Parse.hs b/src/Text/Pandoc/Readers/Docx/Parse.hs
index e4d3ea6f8..a97d4b3d1 100644
--- a/src/Text/Pandoc/Readers/Docx/Parse.hs
+++ b/src/Text/Pandoc/Readers/Docx/Parse.hs
@@ -93,14 +93,13 @@ data ReaderEnv = ReaderEnv { envNotes :: Notes
deriving Show
data ReaderState = ReaderState { stateWarnings :: [T.Text]
- , stateFldCharState :: FldCharState
+ , stateFldCharState :: [FldCharState]
}
deriving Show
data FldCharState = FldCharOpen
| FldCharFieldInfo FieldInfo
- | FldCharContent FieldInfo [Run]
- | FldCharClosed
+ | FldCharContent FieldInfo [ParPart]
deriving (Show)
data DocxError = DocxError
@@ -314,13 +313,13 @@ data ParPart = PlainRun Run
| CommentStart CommentId Author (Maybe CommentDate) [BodyPart]
| CommentEnd CommentId
| BookMark BookMarkId Anchor
- | InternalHyperLink Anchor [Run]
- | ExternalHyperLink URL [Run]
+ | InternalHyperLink Anchor [ParPart]
+ | ExternalHyperLink URL [ParPart]
| Drawing FilePath T.Text T.Text B.ByteString Extent -- title, alt
| Chart -- placeholder for now
| Diagram -- placeholder for now
| PlainOMath [Exp]
- | Field FieldInfo [Run]
+ | Field FieldInfo [ParPart]
| NullParPart -- when we need to return nothing, but
-- not because of an error.
deriving Show
@@ -373,7 +372,7 @@ archiveToDocxWithWarnings archive = do
, envDocXmlPath = docXmlPath
}
rState = ReaderState { stateWarnings = []
- , stateFldCharState = FldCharClosed
+ , stateFldCharState = []
}
(eitherDoc, st) = runD (archiveToDocument archive) rEnv rState
case eitherDoc of
@@ -701,28 +700,31 @@ elemToBodyPart ns element
elemToBodyPart ns element
| isElem ns "w" "p" element = do
parstyle <- elemToParagraphStyle ns element <$> asks envParStyles
- parparts <- mapD (elemToParPart ns) (elChildren element)
+ parparts' <- mapD (elemToParPart ns) (elChildren element)
+ fldCharState <- gets stateFldCharState
+ modify $ \st -> st {stateFldCharState = emptyFldCharContents fldCharState}
-- Word uses list enumeration for numbered headings, so we only
-- want to infer a list from the styles if it is NOT a heading.
- case pHeading parstyle of
- Nothing | Just (numId, lvl) <- pNumInfo parstyle -> do
- levelInfo <- lookupLevel numId lvl <$> asks envNumbering
- return $ ListItem parstyle numId lvl levelInfo parparts
- _ -> let
- hasCaptionStyle = elem "Caption" (pStyleId <$> pStyle parstyle)
-
- hasSimpleTableField = fromMaybe False $ do
- fldSimple <- findChildByName ns "w" "fldSimple" element
- instr <- findAttrByName ns "w" "instr" fldSimple
- pure ("Table" `elem` T.words instr)
-
- hasComplexTableField = fromMaybe False $ do
- instrText <- findElementByName ns "w" "instrText" element
- pure ("Table" `elem` T.words (strContent instrText))
-
- in if hasCaptionStyle && (hasSimpleTableField || hasComplexTableField)
- then return $ TblCaption parstyle parparts
- else return $ Paragraph parstyle parparts
+ let parparts = parparts' ++ (openFldCharsToParParts fldCharState) in
+ case pHeading parstyle of
+ Nothing | Just (numId, lvl) <- pNumInfo parstyle -> do
+ levelInfo <- lookupLevel numId lvl <$> asks envNumbering
+ return $ ListItem parstyle numId lvl levelInfo parparts
+ _ -> let
+ hasCaptionStyle = elem "Caption" (pStyleId <$> pStyle parstyle)
+
+ hasSimpleTableField = fromMaybe False $ do
+ fldSimple <- findChildByName ns "w" "fldSimple" element
+ instr <- findAttrByName ns "w" "instr" fldSimple
+ pure ("Table" `elem` T.words instr)
+
+ hasComplexTableField = fromMaybe False $ do
+ instrText <- findElementByName ns "w" "instrText" element
+ pure ("Table" `elem` T.words (strContent instrText))
+
+ in if hasCaptionStyle && (hasSimpleTableField || hasComplexTableField)
+ then return $ TblCaption parstyle parparts
+ else return $ Paragraph parstyle parparts
elemToBodyPart ns element
| isElem ns "w" "tbl" element = do
@@ -754,6 +756,19 @@ lookupRelationship docLocation relid rels =
where
pairs = map (\(Relationship loc relid' target) -> ((loc, relid'), target)) rels
+openFldCharsToParParts :: [FldCharState] -> [ParPart]
+openFldCharsToParParts [] = []
+openFldCharsToParParts (FldCharContent info children : ancestors) = case openFldCharsToParParts ancestors of
+ Field parentInfo siblings : _ -> [Field parentInfo $ siblings ++ [Field info $ reverse children]]
+ _ -> [Field info $ reverse children]
+openFldCharsToParParts (_ : ancestors) = openFldCharsToParParts ancestors
+
+emptyFldCharContents :: [FldCharState] -> [FldCharState]
+emptyFldCharContents = map
+ (\x -> case x of
+ FldCharContent info _ -> FldCharContent info []
+ _ -> x)
+
expandDrawingId :: T.Text -> D (FilePath, B.ByteString)
expandDrawingId s = do
location <- asks envLocation
@@ -778,51 +793,6 @@ getTitleAndAlt ns element =
in (title, alt)
elemToParPart :: NameSpaces -> Element -> D ParPart
-elemToParPart ns element
- | isElem ns "w" "r" element
- , Just drawingElem <- findChildByName ns "w" "drawing" element
- , pic_ns <- "http://schemas.openxmlformats.org/drawingml/2006/picture"
- , Just picElem <- findElement (QName "pic" (Just pic_ns) (Just "pic")) drawingElem
- = let (title, alt) = getTitleAndAlt ns drawingElem
- a_ns = "http://schemas.openxmlformats.org/drawingml/2006/main"
- drawing = findElement (QName "blip" (Just a_ns) (Just "a")) picElem
- >>= findAttrByName ns "r" "embed"
- in
- case drawing of
- Just s -> expandDrawingId s >>= (\(fp, bs) -> return $ Drawing fp title alt bs $ elemToExtent drawingElem)
- Nothing -> throwError WrongElem
--- The two cases below are an attempt to deal with images in deprecated vml format.
--- Todo: check out title and attr for deprecated format.
-elemToParPart ns element
- | isElem ns "w" "r" element
- , Just _ <- findChildByName ns "w" "pict" element =
- let drawing = findElement (elemName ns "v" "imagedata") element
- >>= findAttrByName ns "r" "id"
- in
- case drawing of
- Just s -> expandDrawingId s >>= (\(fp, bs) -> return $ Drawing fp "" "" bs Nothing)
- Nothing -> throwError WrongElem
-elemToParPart ns element
- | isElem ns "w" "r" element
- , Just objectElem <- findChildByName ns "w" "object" element
- , Just shapeElem <- findChildByName ns "v" "shape" objectElem
- , Just imagedataElem <- findChildByName ns "v" "imagedata" shapeElem
- , Just drawingId <- findAttrByName ns "r" "id" imagedataElem
- = expandDrawingId drawingId >>= (\(fp, bs) -> return $ Drawing fp "" "" bs Nothing)
--- Diagram
-elemToParPart ns element
- | isElem ns "w" "r" element
- , Just drawingElem <- findChildByName ns "w" "drawing" element
- , d_ns <- "http://schemas.openxmlformats.org/drawingml/2006/diagram"
- , Just _ <- findElement (QName "relIds" (Just d_ns) (Just "dgm")) drawingElem
- = return Diagram
--- Chart
-elemToParPart ns element
- | isElem ns "w" "r" element
- , Just drawingElem <- findChildByName ns "w" "drawing" element
- , c_ns <- "http://schemas.openxmlformats.org/drawingml/2006/chart"
- , Just _ <- findElement (QName "chart" (Just c_ns) (Just "c")) drawingElem
- = return Chart
{-
The next one is a bit complicated. fldChar fields work by first
having a <w:fldChar fldCharType="begin"> in a run, then a run with
@@ -854,8 +824,13 @@ example (omissions and my comments in brackets):
So we do this in a number of steps. If we encounter the fldchar begin
tag, we start open a fldchar state variable (see state above). We add
the instrtext to it as FieldInfo. Then we close that and start adding
-the runs when we get to separate. Then when we get to end, we produce
-the Field type with appropriate FieldInfo and Runs.
+the children when we get to separate. Then when we get to end, we produce
+the Field type with appropriate FieldInfo and ParParts.
+
+Since there can be nested fields, the fldchar state needs to be a stack,
+so we can have multiple fldchars open at the same time. When a fldchar is
+closed, we either add the resulting field to its parent or we return it if
+there is no parent.
-}
elemToParPart ns element
| isElem ns "w" "r" element
@@ -863,78 +838,138 @@ elemToParPart ns element
, Just fldCharType <- findAttrByName ns "w" "fldCharType" fldChar = do
fldCharState <- gets stateFldCharState
case fldCharState of
- FldCharClosed | fldCharType == "begin" -> do
- modify $ \st -> st {stateFldCharState = FldCharOpen}
+ _ | fldCharType == "begin" -> do
+ modify $ \st -> st {stateFldCharState = FldCharOpen : fldCharState}
return NullParPart
- FldCharFieldInfo info | fldCharType == "separate" -> do
- modify $ \st -> st {stateFldCharState = FldCharContent info []}
+ FldCharFieldInfo info : ancestors | fldCharType == "separate" -> do
+ modify $ \st -> st {stateFldCharState = FldCharContent info [] : ancestors}
return NullParPart
- FldCharContent info runs | fldCharType == "end" -> do
- modify $ \st -> st {stateFldCharState = FldCharClosed}
- return $ Field info $ reverse runs
+ [FldCharContent info children] | fldCharType == "end" -> do
+ modify $ \st -> st {stateFldCharState = []}
+ return $ Field info $ reverse children
+ FldCharContent info children : FldCharContent parentInfo siblings : ancestors | fldCharType == "end" ->
+ let parent = FldCharContent parentInfo $ (Field info (reverse children)) : siblings in do
+ modify $ \st -> st {stateFldCharState = parent : ancestors}
+ return NullParPart
_ -> throwError WrongElem
elemToParPart ns element
| isElem ns "w" "r" element
, Just instrText <- findChildByName ns "w" "instrText" element = do
fldCharState <- gets stateFldCharState
case fldCharState of
- FldCharOpen -> do
+ FldCharOpen : ancestors -> do
info <- eitherToD $ parseFieldInfo $ strContent instrText
- modify $ \st -> st{stateFldCharState = FldCharFieldInfo info}
+ modify $ \st -> st {stateFldCharState = FldCharFieldInfo info : ancestors}
return NullParPart
_ -> return NullParPart
-elemToParPart ns element
+{-
+There is an open fldchar, so we calculate the element and add it to the
+children. For this we need to first change the fldchar state to an empty
+stack to avoid descendants of children simply being added to the state instead
+of to their direct parent element. This would happen in the case of a
+w:hyperlink element for example.
+-}
+elemToParPart ns element = do
+ fldCharState <- gets stateFldCharState
+ case fldCharState of
+ FldCharContent info children : ancestors -> do
+ modify $ \st -> st {stateFldCharState = []}
+ parPart <- elemToParPart' ns element `catchError` \_ -> return NullParPart
+ modify $ \st -> st{stateFldCharState = FldCharContent info (parPart : children) : ancestors}
+ return NullParPart
+ _ -> elemToParPart' ns element
+
+elemToParPart' :: NameSpaces -> Element -> D ParPart
+elemToParPart' ns element
+ | isElem ns "w" "r" element
+ , Just drawingElem <- findChildByName ns "w" "drawing" element
+ , pic_ns <- "http://schemas.openxmlformats.org/drawingml/2006/picture"
+ , Just picElem <- findElement (QName "pic" (Just pic_ns) (Just "pic")) drawingElem
+ = let (title, alt) = getTitleAndAlt ns drawingElem
+ a_ns = "http://schemas.openxmlformats.org/drawingml/2006/main"
+ drawing = findElement (QName "blip" (Just a_ns) (Just "a")) picElem
+ >>= findAttrByName ns "r" "embed"
+ in
+ case drawing of
+ Just s -> expandDrawingId s >>= (\(fp, bs) -> return $ Drawing fp title alt bs $ elemToExtent drawingElem)
+ Nothing -> throwError WrongElem
+-- The two cases below are an attempt to deal with images in deprecated vml format.
+-- Todo: check out title and attr for deprecated format.
+elemToParPart' ns element
+ | isElem ns "w" "r" element
+ , Just _ <- findChildByName ns "w" "pict" element =
+ let drawing = findElement (elemName ns "v" "imagedata") element
+ >>= findAttrByName ns "r" "id"
+ in
+ case drawing of
+ Just s -> expandDrawingId s >>= (\(fp, bs) -> return $ Drawing fp "" "" bs Nothing)
+ Nothing -> throwError WrongElem
+elemToParPart' ns element
+ | isElem ns "w" "r" element
+ , Just objectElem <- findChildByName ns "w" "object" element
+ , Just shapeElem <- findChildByName ns "v" "shape" objectElem
+ , Just imagedataElem <- findChildByName ns "v" "imagedata" shapeElem
+ , Just drawingId <- findAttrByName ns "r" "id" imagedataElem
+ = expandDrawingId drawingId >>= (\(fp, bs) -> return $ Drawing fp "" "" bs Nothing)
+-- Diagram
+elemToParPart' ns element
+ | isElem ns "w" "r" element
+ , Just drawingElem <- findChildByName ns "w" "drawing" element
+ , d_ns <- "http://schemas.openxmlformats.org/drawingml/2006/diagram"
+ , Just _ <- findElement (QName "relIds" (Just d_ns) (Just "dgm")) drawingElem
+ = return Diagram
+-- Chart
+elemToParPart' ns element
+ | isElem ns "w" "r" element
+ , Just drawingElem <- findChildByName ns "w" "drawing" element
+ , c_ns <- "http://schemas.openxmlformats.org/drawingml/2006/chart"
+ , Just _ <- findElement (QName "chart" (Just c_ns) (Just "c")) drawingElem
+ = return Chart
+elemToParPart' ns element
| isElem ns "w" "r" element = do
run <- elemToRun ns element
- -- we check to see if we have an open FldChar in state that we're
- -- recording.
- fldCharState <- gets stateFldCharState
- case fldCharState of
- FldCharContent info runs -> do
- modify $ \st -> st{stateFldCharState = FldCharContent info (run : runs)}
- return NullParPart
- _ -> return $ PlainRun run
-elemToParPart ns element
+ return $ PlainRun run
+elemToParPart' ns element
| Just change <- getTrackedChange ns element = do
runs <- mapD (elemToRun ns) (elChildren element)
return $ ChangedRuns change runs
-elemToParPart ns element
+elemToParPart' ns element
| isElem ns "w" "bookmarkStart" element
, Just bmId <- findAttrByName ns "w" "id" element
, Just bmName <- findAttrByName ns "w" "name" element =
return $ BookMark bmId bmName
-elemToParPart ns element
+elemToParPart' ns element
| isElem ns "w" "hyperlink" element
, Just relId <- findAttrByName ns "r" "id" element = do
location <- asks envLocation
- runs <- mapD (elemToRun ns) (elChildren element)
+ children <- mapD (elemToParPart ns) (elChildren element)
rels <- asks envRelationships
case lookupRelationship location relId rels of
Just target ->
case findAttrByName ns "w" "anchor" element of
- Just anchor -> return $ ExternalHyperLink (target <> "#" <> anchor) runs
- Nothing -> return $ ExternalHyperLink target runs
- Nothing -> return $ ExternalHyperLink "" runs
-elemToParPart ns element
+ Just anchor -> return $ ExternalHyperLink (target <> "#" <> anchor) children
+ Nothing -> return $ ExternalHyperLink target children
+ Nothing -> return $ ExternalHyperLink "" children
+elemToParPart' ns element
| isElem ns "w" "hyperlink" element
, Just anchor <- findAttrByName ns "w" "anchor" element = do
- runs <- mapD (elemToRun ns) (elChildren element)
- return $ InternalHyperLink anchor runs
-elemToParPart ns element
+ children <- mapD (elemToParPart ns) (elChildren element)
+ return $ InternalHyperLink anchor children
+elemToParPart' ns element
| isElem ns "w" "commentRangeStart" element
, Just cmtId <- findAttrByName ns "w" "id" element = do
(Comments _ commentMap) <- asks envComments
case M.lookup cmtId commentMap of
Just cmtElem -> elemToCommentStart ns cmtElem
Nothing -> throwError WrongElem
-elemToParPart ns element
+elemToParPart' ns element
| isElem ns "w" "commentRangeEnd" element
, Just cmtId <- findAttrByName ns "w" "id" element =
return $ CommentEnd cmtId
-elemToParPart ns element
+elemToParPart' ns element
| isElem ns "m" "oMath" element =
fmap PlainOMath (eitherToD $ readOMML $ showElement element)
-elemToParPart _ _ = throwError WrongElem
+elemToParPart' _ _ = throwError WrongElem
elemToCommentStart :: NameSpaces -> Element -> D ParPart
elemToCommentStart ns element
diff --git a/test/Tests/Readers/Docx.hs b/test/Tests/Readers/Docx.hs
index 2f28af317..af6023836 100644
--- a/test/Tests/Readers/Docx.hs
+++ b/test/Tests/Readers/Docx.hs
@@ -148,6 +148,10 @@ tests = [ testGroup "document"
"docx/instrText_hyperlink.docx"
"docx/instrText_hyperlink.native"
, testCompare
+ "nested fields with <w:instrText> tag"
+ "docx/nested_instrText.docx"
+ "docx/nested_instrText.native"
+ , testCompare
"inline image"
"docx/image.docx"
"docx/image_no_embed.native"
diff --git a/test/docx/nested_instrText.docx b/test/docx/nested_instrText.docx
new file mode 100644
index 000000000..532584193
--- /dev/null
+++ b/test/docx/nested_instrText.docx
Binary files differ
diff --git a/test/docx/nested_instrText.native b/test/docx/nested_instrText.native
new file mode 100644
index 000000000..730b041f5
--- /dev/null
+++ b/test/docx/nested_instrText.native
@@ -0,0 +1,5 @@
+[Para [Str "\24076\26395\28145\20837\20102\35299\30340\35835\32773\21487\20197\21435\30475David",Space,Str "French",Space,Str "Belding\21644Kevin",Space,Str "J.",Space,Str "Mitchell\30340"
+ ,Link ("",[],[]) [Str "Foundations",Space,Str "of",Space,Str "Analysis,",Space,Str "1/16/18",Space,Str "8:40:00",Space,Str "AM,",Space,Str "2nd",Space,Str "Edition"] ("https://books.google.com/books?id=sp_Zcb9ot90C&lpg=PR4&hl=zh-CN&pg=PA19#v=onepage&q&f=true","")
+ ,Str ",\21487\20174\&19\39029\30475\36215\65292\25110D.C.",Space,Str "Goldrei\30340",Space
+ ,Link ("",[],[]) [Str "Classic",Space,Str "Set",Space,Str "Theory:",Space,Str "For",Space,Str "Guided",Space,Str "Independent",Space,Str "Study"] ("https://books.google.ae/books?id=dlc0DwAAQBAJ&lpg=PT29&hl=zh-CN&pg=PT26#v=onepage&q&f=true","")
+ ,Str "\65292\20174\31532\20108\31456\30475\36215\65292\38405\35835\26102\35201\27880\24847\26412\25991\19982\36825\20123\20070\25152\19981\21516\30340\26159\24182\27809\26377\25226\23454\25968\30475\20316\26159\26377\29702\25968\38598\30340\20998\21106\12290"]]