aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--src/Text/Pandoc/Readers/Docx.hs18
-rw-r--r--src/Text/Pandoc/Readers/Docx/Parse.hs247
-rw-r--r--test/Tests/Readers/Docx.hs4
-rw-r--r--test/docx/nested_instrText.docxbin0 -> 14112 bytes
-rw-r--r--test/docx/nested_instrText.native5
5 files changed, 159 insertions, 115 deletions
diff --git a/src/Text/Pandoc/Readers/Docx.hs b/src/Text/Pandoc/Readers/Docx.hs
index 66cd84291..462e3c679 100644
--- a/src/Text/Pandoc/Readers/Docx.hs
+++ b/src/Text/Pandoc/Readers/Docx.hs
@@ -246,8 +246,8 @@ runToText _ = ""
parPartToText :: ParPart -> T.Text
parPartToText (PlainRun run) = runToText run
-parPartToText (InternalHyperLink _ runs) = T.concat $ map runToText runs
-parPartToText (ExternalHyperLink _ runs) = T.concat $ map runToText runs
+parPartToText (InternalHyperLink _ children) = T.concat $ map parPartToText children
+parPartToText (ExternalHyperLink _ children) = T.concat $ map parPartToText children
parPartToText _ = ""
blacklistedCharStyles :: [CharStyleName]
@@ -437,18 +437,18 @@ parPartToInlines' Chart =
return $ spanWith ("", ["chart"], []) $ text "[CHART]"
parPartToInlines' Diagram =
return $ spanWith ("", ["diagram"], []) $ text "[DIAGRAM]"
-parPartToInlines' (InternalHyperLink anchor runs) = do
- ils <- smushInlines <$> mapM runToInlines runs
+parPartToInlines' (InternalHyperLink anchor children) = do
+ ils <- smushInlines <$> mapM parPartToInlines' children
return $ link ("#" <> anchor) "" ils
-parPartToInlines' (ExternalHyperLink target runs) = do
- ils <- smushInlines <$> mapM runToInlines runs
+parPartToInlines' (ExternalHyperLink target children) = do
+ ils <- smushInlines <$> mapM parPartToInlines' children
return $ link target "" ils
parPartToInlines' (PlainOMath exps) =
return $ math $ writeTeX exps
-parPartToInlines' (Field info runs) =
+parPartToInlines' (Field info children) =
case info of
- HyperlinkField url -> parPartToInlines' $ ExternalHyperLink url runs
- UnknownField -> smushInlines <$> mapM runToInlines runs
+ HyperlinkField url -> parPartToInlines' $ ExternalHyperLink url children
+ _ -> smushInlines <$> mapM parPartToInlines' children
parPartToInlines' NullParPart = return mempty
isAnchorSpan :: Inline -> Bool
diff --git a/src/Text/Pandoc/Readers/Docx/Parse.hs b/src/Text/Pandoc/Readers/Docx/Parse.hs
index e4d3ea6f8..a97d4b3d1 100644
--- a/src/Text/Pandoc/Readers/Docx/Parse.hs
+++ b/src/Text/Pandoc/Readers/Docx/Parse.hs
@@ -93,14 +93,13 @@ data ReaderEnv = ReaderEnv { envNotes :: Notes
deriving Show
data ReaderState = ReaderState { stateWarnings :: [T.Text]
- , stateFldCharState :: FldCharState
+ , stateFldCharState :: [FldCharState]
}
deriving Show
data FldCharState = FldCharOpen
| FldCharFieldInfo FieldInfo
- | FldCharContent FieldInfo [Run]
- | FldCharClosed
+ | FldCharContent FieldInfo [ParPart]
deriving (Show)
data DocxError = DocxError
@@ -314,13 +313,13 @@ data ParPart = PlainRun Run
| CommentStart CommentId Author (Maybe CommentDate) [BodyPart]
| CommentEnd CommentId
| BookMark BookMarkId Anchor
- | InternalHyperLink Anchor [Run]
- | ExternalHyperLink URL [Run]
+ | InternalHyperLink Anchor [ParPart]
+ | ExternalHyperLink URL [ParPart]
| Drawing FilePath T.Text T.Text B.ByteString Extent -- title, alt
| Chart -- placeholder for now
| Diagram -- placeholder for now
| PlainOMath [Exp]
- | Field FieldInfo [Run]
+ | Field FieldInfo [ParPart]
| NullParPart -- when we need to return nothing, but
-- not because of an error.
deriving Show
@@ -373,7 +372,7 @@ archiveToDocxWithWarnings archive = do
, envDocXmlPath = docXmlPath
}
rState = ReaderState { stateWarnings = []
- , stateFldCharState = FldCharClosed
+ , stateFldCharState = []
}
(eitherDoc, st) = runD (archiveToDocument archive) rEnv rState
case eitherDoc of
@@ -701,28 +700,31 @@ elemToBodyPart ns element
elemToBodyPart ns element
| isElem ns "w" "p" element = do
parstyle <- elemToParagraphStyle ns element <$> asks envParStyles
- parparts <- mapD (elemToParPart ns) (elChildren element)
+ parparts' <- mapD (elemToParPart ns) (elChildren element)
+ fldCharState <- gets stateFldCharState
+ modify $ \st -> st {stateFldCharState = emptyFldCharContents fldCharState}
-- Word uses list enumeration for numbered headings, so we only
-- want to infer a list from the styles if it is NOT a heading.
- case pHeading parstyle of
- Nothing | Just (numId, lvl) <- pNumInfo parstyle -> do
- levelInfo <- lookupLevel numId lvl <$> asks envNumbering
- return $ ListItem parstyle numId lvl levelInfo parparts
- _ -> let
- hasCaptionStyle = elem "Caption" (pStyleId <$> pStyle parstyle)
-
- hasSimpleTableField = fromMaybe False $ do
- fldSimple <- findChildByName ns "w" "fldSimple" element
- instr <- findAttrByName ns "w" "instr" fldSimple
- pure ("Table" `elem` T.words instr)
-
- hasComplexTableField = fromMaybe False $ do
- instrText <- findElementByName ns "w" "instrText" element
- pure ("Table" `elem` T.words (strContent instrText))
-
- in if hasCaptionStyle && (hasSimpleTableField || hasComplexTableField)
- then return $ TblCaption parstyle parparts
- else return $ Paragraph parstyle parparts
+ let parparts = parparts' ++ (openFldCharsToParParts fldCharState) in
+ case pHeading parstyle of
+ Nothing | Just (numId, lvl) <- pNumInfo parstyle -> do
+ levelInfo <- lookupLevel numId lvl <$> asks envNumbering
+ return $ ListItem parstyle numId lvl levelInfo parparts
+ _ -> let
+ hasCaptionStyle = elem "Caption" (pStyleId <$> pStyle parstyle)
+
+ hasSimpleTableField = fromMaybe False $ do
+ fldSimple <- findChildByName ns "w" "fldSimple" element
+ instr <- findAttrByName ns "w" "instr" fldSimple
+ pure ("Table" `elem` T.words instr)
+
+ hasComplexTableField = fromMaybe False $ do
+ instrText <- findElementByName ns "w" "instrText" element
+ pure ("Table" `elem` T.words (strContent instrText))
+
+ in if hasCaptionStyle && (hasSimpleTableField || hasComplexTableField)
+ then return $ TblCaption parstyle parparts
+ else return $ Paragraph parstyle parparts
elemToBodyPart ns element
| isElem ns "w" "tbl" element = do
@@ -754,6 +756,19 @@ lookupRelationship docLocation relid rels =
where
pairs = map (\(Relationship loc relid' target) -> ((loc, relid'), target)) rels
+openFldCharsToParParts :: [FldCharState] -> [ParPart]
+openFldCharsToParParts [] = []
+openFldCharsToParParts (FldCharContent info children : ancestors) = case openFldCharsToParParts ancestors of
+ Field parentInfo siblings : _ -> [Field parentInfo $ siblings ++ [Field info $ reverse children]]
+ _ -> [Field info $ reverse children]
+openFldCharsToParParts (_ : ancestors) = openFldCharsToParParts ancestors
+
+emptyFldCharContents :: [FldCharState] -> [FldCharState]
+emptyFldCharContents = map
+ (\x -> case x of
+ FldCharContent info _ -> FldCharContent info []
+ _ -> x)
+
expandDrawingId :: T.Text -> D (FilePath, B.ByteString)
expandDrawingId s = do
location <- asks envLocation
@@ -778,51 +793,6 @@ getTitleAndAlt ns element =
in (title, alt)
elemToParPart :: NameSpaces -> Element -> D ParPart
-elemToParPart ns element
- | isElem ns "w" "r" element
- , Just drawingElem <- findChildByName ns "w" "drawing" element
- , pic_ns <- "http://schemas.openxmlformats.org/drawingml/2006/picture"
- , Just picElem <- findElement (QName "pic" (Just pic_ns) (Just "pic")) drawingElem
- = let (title, alt) = getTitleAndAlt ns drawingElem
- a_ns = "http://schemas.openxmlformats.org/drawingml/2006/main"
- drawing = findElement (QName "blip" (Just a_ns) (Just "a")) picElem
- >>= findAttrByName ns "r" "embed"
- in
- case drawing of
- Just s -> expandDrawingId s >>= (\(fp, bs) -> return $ Drawing fp title alt bs $ elemToExtent drawingElem)
- Nothing -> throwError WrongElem
--- The two cases below are an attempt to deal with images in deprecated vml format.
--- Todo: check out title and attr for deprecated format.
-elemToParPart ns element
- | isElem ns "w" "r" element
- , Just _ <- findChildByName ns "w" "pict" element =
- let drawing = findElement (elemName ns "v" "imagedata") element
- >>= findAttrByName ns "r" "id"
- in
- case drawing of
- Just s -> expandDrawingId s >>= (\(fp, bs) -> return $ Drawing fp "" "" bs Nothing)
- Nothing -> throwError WrongElem
-elemToParPart ns element
- | isElem ns "w" "r" element
- , Just objectElem <- findChildByName ns "w" "object" element
- , Just shapeElem <- findChildByName ns "v" "shape" objectElem
- , Just imagedataElem <- findChildByName ns "v" "imagedata" shapeElem
- , Just drawingId <- findAttrByName ns "r" "id" imagedataElem
- = expandDrawingId drawingId >>= (\(fp, bs) -> return $ Drawing fp "" "" bs Nothing)
--- Diagram
-elemToParPart ns element
- | isElem ns "w" "r" element
- , Just drawingElem <- findChildByName ns "w" "drawing" element
- , d_ns <- "http://schemas.openxmlformats.org/drawingml/2006/diagram"
- , Just _ <- findElement (QName "relIds" (Just d_ns) (Just "dgm")) drawingElem
- = return Diagram
--- Chart
-elemToParPart ns element
- | isElem ns "w" "r" element
- , Just drawingElem <- findChildByName ns "w" "drawing" element
- , c_ns <- "http://schemas.openxmlformats.org/drawingml/2006/chart"
- , Just _ <- findElement (QName "chart" (Just c_ns) (Just "c")) drawingElem
- = return Chart
{-
The next one is a bit complicated. fldChar fields work by first
having a <w:fldChar fldCharType="begin"> in a run, then a run with
@@ -854,8 +824,13 @@ example (omissions and my comments in brackets):
So we do this in a number of steps. If we encounter the fldchar begin
tag, we start open a fldchar state variable (see state above). We add
the instrtext to it as FieldInfo. Then we close that and start adding
-the runs when we get to separate. Then when we get to end, we produce
-the Field type with appropriate FieldInfo and Runs.
+the children when we get to separate. Then when we get to end, we produce
+the Field type with appropriate FieldInfo and ParParts.
+
+Since there can be nested fields, the fldchar state needs to be a stack,
+so we can have multiple fldchars open at the same time. When a fldchar is
+closed, we either add the resulting field to its parent or we return it if
+there is no parent.
-}
elemToParPart ns element
| isElem ns "w" "r" element
@@ -863,78 +838,138 @@ elemToParPart ns element
, Just fldCharType <- findAttrByName ns "w" "fldCharType" fldChar = do
fldCharState <- gets stateFldCharState
case fldCharState of
- FldCharClosed | fldCharType == "begin" -> do
- modify $ \st -> st {stateFldCharState = FldCharOpen}
+ _ | fldCharType == "begin" -> do
+ modify $ \st -> st {stateFldCharState = FldCharOpen : fldCharState}
return NullParPart
- FldCharFieldInfo info | fldCharType == "separate" -> do
- modify $ \st -> st {stateFldCharState = FldCharContent info []}
+ FldCharFieldInfo info : ancestors | fldCharType == "separate" -> do
+ modify $ \st -> st {stateFldCharState = FldCharContent info [] : ancestors}
return NullParPart
- FldCharContent info runs | fldCharType == "end" -> do
- modify $ \st -> st {stateFldCharState = FldCharClosed}
- return $ Field info $ reverse runs
+ [FldCharContent info children] | fldCharType == "end" -> do
+ modify $ \st -> st {stateFldCharState = []}
+ return $ Field info $ reverse children
+ FldCharContent info children : FldCharContent parentInfo siblings : ancestors | fldCharType == "end" ->
+ let parent = FldCharContent parentInfo $ (Field info (reverse children)) : siblings in do
+ modify $ \st -> st {stateFldCharState = parent : ancestors}
+ return NullParPart
_ -> throwError WrongElem
elemToParPart ns element
| isElem ns "w" "r" element
, Just instrText <- findChildByName ns "w" "instrText" element = do
fldCharState <- gets stateFldCharState
case fldCharState of
- FldCharOpen -> do
+ FldCharOpen : ancestors -> do
info <- eitherToD $ parseFieldInfo $ strContent instrText
- modify $ \st -> st{stateFldCharState = FldCharFieldInfo info}
+ modify $ \st -> st {stateFldCharState = FldCharFieldInfo info : ancestors}
return NullParPart
_ -> return NullParPart
-elemToParPart ns element
+{-
+There is an open fldchar, so we calculate the element and add it to the
+children. For this we need to first change the fldchar state to an empty
+stack to avoid descendants of children simply being added to the state instead
+of to their direct parent element. This would happen in the case of a
+w:hyperlink element for example.
+-}
+elemToParPart ns element = do
+ fldCharState <- gets stateFldCharState
+ case fldCharState of
+ FldCharContent info children : ancestors -> do
+ modify $ \st -> st {stateFldCharState = []}
+ parPart <- elemToParPart' ns element `catchError` \_ -> return NullParPart
+ modify $ \st -> st{stateFldCharState = FldCharContent info (parPart : children) : ancestors}
+ return NullParPart
+ _ -> elemToParPart' ns element
+
+elemToParPart' :: NameSpaces -> Element -> D ParPart
+elemToParPart' ns element
+ | isElem ns "w" "r" element
+ , Just drawingElem <- findChildByName ns "w" "drawing" element
+ , pic_ns <- "http://schemas.openxmlformats.org/drawingml/2006/picture"
+ , Just picElem <- findElement (QName "pic" (Just pic_ns) (Just "pic")) drawingElem
+ = let (title, alt) = getTitleAndAlt ns drawingElem
+ a_ns = "http://schemas.openxmlformats.org/drawingml/2006/main"
+ drawing = findElement (QName "blip" (Just a_ns) (Just "a")) picElem
+ >>= findAttrByName ns "r" "embed"
+ in
+ case drawing of
+ Just s -> expandDrawingId s >>= (\(fp, bs) -> return $ Drawing fp title alt bs $ elemToExtent drawingElem)
+ Nothing -> throwError WrongElem
+-- The two cases below are an attempt to deal with images in deprecated vml format.
+-- Todo: check out title and attr for deprecated format.
+elemToParPart' ns element
+ | isElem ns "w" "r" element
+ , Just _ <- findChildByName ns "w" "pict" element =
+ let drawing = findElement (elemName ns "v" "imagedata") element
+ >>= findAttrByName ns "r" "id"
+ in
+ case drawing of
+ Just s -> expandDrawingId s >>= (\(fp, bs) -> return $ Drawing fp "" "" bs Nothing)
+ Nothing -> throwError WrongElem
+elemToParPart' ns element
+ | isElem ns "w" "r" element
+ , Just objectElem <- findChildByName ns "w" "object" element
+ , Just shapeElem <- findChildByName ns "v" "shape" objectElem
+ , Just imagedataElem <- findChildByName ns "v" "imagedata" shapeElem
+ , Just drawingId <- findAttrByName ns "r" "id" imagedataElem
+ = expandDrawingId drawingId >>= (\(fp, bs) -> return $ Drawing fp "" "" bs Nothing)
+-- Diagram
+elemToParPart' ns element
+ | isElem ns "w" "r" element
+ , Just drawingElem <- findChildByName ns "w" "drawing" element
+ , d_ns <- "http://schemas.openxmlformats.org/drawingml/2006/diagram"
+ , Just _ <- findElement (QName "relIds" (Just d_ns) (Just "dgm")) drawingElem
+ = return Diagram
+-- Chart
+elemToParPart' ns element
+ | isElem ns "w" "r" element
+ , Just drawingElem <- findChildByName ns "w" "drawing" element
+ , c_ns <- "http://schemas.openxmlformats.org/drawingml/2006/chart"
+ , Just _ <- findElement (QName "chart" (Just c_ns) (Just "c")) drawingElem
+ = return Chart
+elemToParPart' ns element
| isElem ns "w" "r" element = do
run <- elemToRun ns element
- -- we check to see if we have an open FldChar in state that we're
- -- recording.
- fldCharState <- gets stateFldCharState
- case fldCharState of
- FldCharContent info runs -> do
- modify $ \st -> st{stateFldCharState = FldCharContent info (run : runs)}
- return NullParPart
- _ -> return $ PlainRun run
-elemToParPart ns element
+ return $ PlainRun run
+elemToParPart' ns element
| Just change <- getTrackedChange ns element = do
runs <- mapD (elemToRun ns) (elChildren element)
return $ ChangedRuns change runs
-elemToParPart ns element
+elemToParPart' ns element
| isElem ns "w" "bookmarkStart" element
, Just bmId <- findAttrByName ns "w" "id" element
, Just bmName <- findAttrByName ns "w" "name" element =
return $ BookMark bmId bmName
-elemToParPart ns element
+elemToParPart' ns element
| isElem ns "w" "hyperlink" element
, Just relId <- findAttrByName ns "r" "id" element = do
location <- asks envLocation
- runs <- mapD (elemToRun ns) (elChildren element)
+ children <- mapD (elemToParPart ns) (elChildren element)
rels <- asks envRelationships
case lookupRelationship location relId rels of
Just target ->
case findAttrByName ns "w" "anchor" element of
- Just anchor -> return $ ExternalHyperLink (target <> "#" <> anchor) runs
- Nothing -> return $ ExternalHyperLink target runs
- Nothing -> return $ ExternalHyperLink "" runs
-elemToParPart ns element
+ Just anchor -> return $ ExternalHyperLink (target <> "#" <> anchor) children
+ Nothing -> return $ ExternalHyperLink target children
+ Nothing -> return $ ExternalHyperLink "" children
+elemToParPart' ns element
| isElem ns "w" "hyperlink" element
, Just anchor <- findAttrByName ns "w" "anchor" element = do
- runs <- mapD (elemToRun ns) (elChildren element)
- return $ InternalHyperLink anchor runs
-elemToParPart ns element
+ children <- mapD (elemToParPart ns) (elChildren element)
+ return $ InternalHyperLink anchor children
+elemToParPart' ns element
| isElem ns "w" "commentRangeStart" element
, Just cmtId <- findAttrByName ns "w" "id" element = do
(Comments _ commentMap) <- asks envComments
case M.lookup cmtId commentMap of
Just cmtElem -> elemToCommentStart ns cmtElem
Nothing -> throwError WrongElem
-elemToParPart ns element
+elemToParPart' ns element
| isElem ns "w" "commentRangeEnd" element
, Just cmtId <- findAttrByName ns "w" "id" element =
return $ CommentEnd cmtId
-elemToParPart ns element
+elemToParPart' ns element
| isElem ns "m" "oMath" element =
fmap PlainOMath (eitherToD $ readOMML $ showElement element)
-elemToParPart _ _ = throwError WrongElem
+elemToParPart' _ _ = throwError WrongElem
elemToCommentStart :: NameSpaces -> Element -> D ParPart
elemToCommentStart ns element
diff --git a/test/Tests/Readers/Docx.hs b/test/Tests/Readers/Docx.hs
index 2f28af317..af6023836 100644
--- a/test/Tests/Readers/Docx.hs
+++ b/test/Tests/Readers/Docx.hs
@@ -148,6 +148,10 @@ tests = [ testGroup "document"
"docx/instrText_hyperlink.docx"
"docx/instrText_hyperlink.native"
, testCompare
+ "nested fields with <w:instrText> tag"
+ "docx/nested_instrText.docx"
+ "docx/nested_instrText.native"
+ , testCompare
"inline image"
"docx/image.docx"
"docx/image_no_embed.native"
diff --git a/test/docx/nested_instrText.docx b/test/docx/nested_instrText.docx
new file mode 100644
index 000000000..532584193
--- /dev/null
+++ b/test/docx/nested_instrText.docx
Binary files differ
diff --git a/test/docx/nested_instrText.native b/test/docx/nested_instrText.native
new file mode 100644
index 000000000..730b041f5
--- /dev/null
+++ b/test/docx/nested_instrText.native
@@ -0,0 +1,5 @@
+[Para [Str "\24076\26395\28145\20837\20102\35299\30340\35835\32773\21487\20197\21435\30475David",Space,Str "French",Space,Str "Belding\21644Kevin",Space,Str "J.",Space,Str "Mitchell\30340"
+ ,Link ("",[],[]) [Str "Foundations",Space,Str "of",Space,Str "Analysis,",Space,Str "1/16/18",Space,Str "8:40:00",Space,Str "AM,",Space,Str "2nd",Space,Str "Edition"] ("https://books.google.com/books?id=sp_Zcb9ot90C&lpg=PR4&hl=zh-CN&pg=PA19#v=onepage&q&f=true","")
+ ,Str ",\21487\20174\&19\39029\30475\36215\65292\25110D.C.",Space,Str "Goldrei\30340",Space
+ ,Link ("",[],[]) [Str "Classic",Space,Str "Set",Space,Str "Theory:",Space,Str "For",Space,Str "Guided",Space,Str "Independent",Space,Str "Study"] ("https://books.google.ae/books?id=dlc0DwAAQBAJ&lpg=PT29&hl=zh-CN&pg=PT26#v=onepage&q&f=true","")
+ ,Str "\65292\20174\31532\20108\31456\30475\36215\65292\38405\35835\26102\35201\27880\24847\26412\25991\19982\36825\20123\20070\25152\19981\21516\30340\26159\24182\27809\26377\25226\23454\25968\30475\20316\26159\26377\29702\25968\38598\30340\20998\21106\12290"]]