diff options
author | Emily Bourke <undergroundquizscene@protonmail.com> | 2020-06-18 09:53:32 +0100 |
---|---|---|
committer | Albert Krewinkel <albert@zeitkraut.de> | 2021-05-28 20:15:23 +0200 |
commit | 56b211120c62a01f8aba1c4512acfe4677d8c7d0 (patch) | |
tree | 9cc7f09243f80abbdc67957f66381bbf03d0d79f /src/Text/Pandoc/Readers/Docx | |
parent | 44484d0dee1bd095240b9faf26f8d1dad8e560ea (diff) | |
download | pandoc-56b211120c62a01f8aba1c4512acfe4677d8c7d0.tar.gz |
Docx reader: Support new table features.
* Column spans
* Row spans
- The spec says that if the `val` attribute is ommitted, its value
should be assumed to be `continue`, and that its values are
restricted to {`restart`, `continue`}. If the value has any other
value, I think it seems reasonable to default it to `continue`. It
might cause problems if the spec is extended in the future by adding
a third possible value, in which case this would probably give
incorrect behaviour, and wouldn't error.
* Allow multiple header rows
* Include table description in simple caption
- The table description element is like alt text for a table (along
with the table caption element). It seems like we should include
this somewhere, but I’m not 100% sure how – I’m pairing it with the
simple caption for the moment. (Should it maybe go in the block
caption instead?)
* Detect table captions
- Check for caption paragraph style /and/ either the simple or
complex table field. This means the caption detection fails for
captions which don’t contain a field, as in an example doc I added
as a test. However, I think it’s better to be too conservative: a
missed table caption will still show up as a paragraph next to the
table, whereas if I incorrectly classify something else as a table
caption it could cause havoc by pairing it up with a table it’s
not at all related to, or dropping it entirely.
* Update tests and add new ones
Partially fixes: #6316
Diffstat (limited to 'src/Text/Pandoc/Readers/Docx')
-rw-r--r-- | src/Text/Pandoc/Readers/Docx/Parse.hs | 113 | ||||
-rw-r--r-- | src/Text/Pandoc/Readers/Docx/Util.hs | 7 |
2 files changed, 107 insertions, 13 deletions
diff --git a/src/Text/Pandoc/Readers/Docx/Parse.hs b/src/Text/Pandoc/Readers/Docx/Parse.hs index 978d6ff3a..aaa8f4ad0 100644 --- a/src/Text/Pandoc/Readers/Docx/Parse.hs +++ b/src/Text/Pandoc/Readers/Docx/Parse.hs @@ -33,7 +33,9 @@ module Text.Pandoc.Readers.Docx.Parse ( Docx(..) , ParStyle , CharStyle(cStyleData) , Row(..) + , TblHeader(..) , Cell(..) + , VMerge(..) , TrackedChange(..) , ChangeType(..) , ChangeInfo(..) @@ -50,6 +52,7 @@ module Text.Pandoc.Readers.Docx.Parse ( Docx(..) , pHeading , constructBogusParStyleData , leftBiasedMergeRunStyle + , rowsToRowspans ) where import Text.Pandoc.Readers.Docx.Parse.Styles import Codec.Archive.Zip @@ -225,6 +228,7 @@ defaultParagraphStyle = ParagraphStyle { pStyle = [] data BodyPart = Paragraph ParagraphStyle [ParPart] | ListItem ParagraphStyle T.Text T.Text (Maybe Level) [ParPart] | Tbl T.Text TblGrid TblLook [Row] + | TblCaption ParagraphStyle [ParPart] | OMathPara [Exp] deriving Show @@ -236,12 +240,61 @@ newtype TblLook = TblLook {firstRowFormatting::Bool} defaultTblLook :: TblLook defaultTblLook = TblLook{firstRowFormatting = False} -newtype Row = Row [Cell] - deriving Show +data Row = Row TblHeader [Cell] deriving Show + +data TblHeader = HasTblHeader | NoTblHeader deriving (Show, Eq) -newtype Cell = Cell [BodyPart] +data Cell = Cell GridSpan VMerge [BodyPart] deriving Show +type GridSpan = Integer + +data VMerge = Continue + -- ^ This cell should be merged with the one above it + | Restart + -- ^ This cell should not be merged with the one above it + deriving (Show, Eq) + +rowsToRowspans :: [Row] -> [[(Int, Cell)]] +rowsToRowspans rows = let + removeMergedCells = fmap (filter (\(_, Cell _ vmerge _) -> vmerge == Restart)) + in removeMergedCells (foldr f [] rows) + where + f :: Row -> [[(Int, Cell)]] -> [[(Int, Cell)]] + f (Row _ cells) acc = let + spans = g cells Nothing (listToMaybe acc) + in spans : acc + + g :: + -- | The current row + [Cell] -> + -- | Number of columns left below + Maybe Integer -> + -- | (rowspan so far, cell) for the row below this one + Maybe [(Int, Cell)] -> + -- | (rowspan so far, cell) for this row + [(Int, Cell)] + g cells _ Nothing = zip (repeat 1) cells + g cells columnsLeftBelow (Just rowBelow) = + case cells of + [] -> [] + thisCell@(Cell thisGridSpan _ _) : restOfRow -> case rowBelow of + [] -> zip (repeat 1) cells + (spanSoFarBelow, Cell gridSpanBelow vmerge _) : _ -> + let spanSoFar = case vmerge of + Restart -> 1 + Continue -> 1 + spanSoFarBelow + columnsToDrop = thisGridSpan + (gridSpanBelow - fromMaybe gridSpanBelow columnsLeftBelow) + (newColumnsLeftBelow, restOfRowBelow) = dropColumns columnsToDrop rowBelow + in (spanSoFar, thisCell) : g restOfRow (Just newColumnsLeftBelow) (Just restOfRowBelow) + + dropColumns :: Integer -> [(a, Cell)] -> (Integer, [(a, Cell)]) + dropColumns n [] = (n, []) + dropColumns n cells@((_, Cell gridSpan _ _) : otherCells) = + if n < gridSpan + then (gridSpan - n, cells) + else dropColumns (n - gridSpan) otherCells + leftBiasedMergeRunStyle :: RunStyle -> RunStyle -> RunStyle leftBiasedMergeRunStyle a b = RunStyle { isBold = isBold a <|> isBold b @@ -587,14 +640,31 @@ elemToRow ns element | isElem ns "w" "tr" element = do let cellElems = findChildrenByName ns "w" "tc" element cells <- mapD (elemToCell ns) cellElems - return $ Row cells + let hasTblHeader = maybe NoTblHeader (const HasTblHeader) + (findChildByName ns "w" "trPr" element + >>= findChildByName ns "w" "tblHeader") + return $ Row hasTblHeader cells elemToRow _ _ = throwError WrongElem elemToCell :: NameSpaces -> Element -> D Cell elemToCell ns element | isElem ns "w" "tc" element = do + let properties = findChildByName ns "w" "tcPr" element + let gridSpan = properties + >>= findChildByName ns "w" "gridSpan" + >>= findAttrByName ns "w" "val" + >>= stringToInteger + let vMerge = case properties >>= findChildByName ns "w" "vMerge" of + Nothing -> Restart + Just e -> + fromMaybe Continue $ do + s <- findAttrByName ns "w" "val" e + case s of + "continue" -> Just Continue + "restart" -> Just Restart + _ -> Nothing cellContents <- mapD (elemToBodyPart ns) (elChildren element) - return $ Cell cellContents + return $ Cell (fromMaybe 1 gridSpan) vMerge cellContents elemToCell _ _ = throwError WrongElem elemToParIndentation :: NameSpaces -> Element -> Maybe ParIndentation @@ -626,10 +696,9 @@ pNumInfo = getParStyleField numInfo . pStyle elemToBodyPart :: NameSpaces -> Element -> D BodyPart elemToBodyPart ns element | isElem ns "w" "p" element - , (c:_) <- findChildrenByName ns "m" "oMathPara" element = - do - expsLst <- eitherToD $ readOMML $ showElement c - return $ OMathPara expsLst + , (c:_) <- findChildrenByName ns "m" "oMathPara" element = do + expsLst <- eitherToD $ readOMML $ showElement c + return $ OMathPara expsLst elemToBodyPart ns element | isElem ns "w" "p" element , Just (numId, lvl) <- getNumInfo ns element = do @@ -647,13 +716,31 @@ elemToBodyPart ns element Nothing | Just (numId, lvl) <- pNumInfo parstyle -> do levelInfo <- lookupLevel numId lvl <$> asks envNumbering return $ ListItem parstyle numId lvl levelInfo parparts - _ -> return $ Paragraph parstyle parparts + _ -> let + hasCaptionStyle = elem "Caption" (pStyleId <$> pStyle parstyle) + + hasSimpleTableField = fromMaybe False $ do + fldSimple <- findChildByName ns "w" "fldSimple" element + instr <- findAttrByName ns "w" "instr" fldSimple + pure ("Table" `elem` T.words instr) + + hasComplexTableField = fromMaybe False $ do + instrText <- findElementByName ns "w" "instrText" element + pure ("Table" `elem` T.words (strContent instrText)) + + in if hasCaptionStyle && (hasSimpleTableField || hasComplexTableField) + then return $ TblCaption parstyle parparts + else return $ Paragraph parstyle parparts + elemToBodyPart ns element | isElem ns "w" "tbl" element = do - let caption' = findChildByName ns "w" "tblPr" element + let tblProperties = findChildByName ns "w" "tblPr" element + caption = fromMaybe "" $ tblProperties >>= findChildByName ns "w" "tblCaption" >>= findAttrByName ns "w" "val" - caption = fromMaybe "" caption' + description = fromMaybe "" $ tblProperties + >>= findChildByName ns "w" "tblDescription" + >>= findAttrByName ns "w" "val" grid' = case findChildByName ns "w" "tblGrid" element of Just g -> elemToTblGrid ns g Nothing -> return [] @@ -666,7 +753,7 @@ elemToBodyPart ns element grid <- grid' tblLook <- tblLook' rows <- mapD (elemToRow ns) (elChildren element) - return $ Tbl caption grid tblLook rows + return $ Tbl (caption <> description) grid tblLook rows elemToBodyPart _ _ = throwError WrongElem lookupRelationship :: DocumentLocation -> RelId -> [Relationship] -> Maybe Target diff --git a/src/Text/Pandoc/Readers/Docx/Util.hs b/src/Text/Pandoc/Readers/Docx/Util.hs index ac331cba6..970697a2d 100644 --- a/src/Text/Pandoc/Readers/Docx/Util.hs +++ b/src/Text/Pandoc/Readers/Docx/Util.hs @@ -19,6 +19,7 @@ module Text.Pandoc.Readers.Docx.Util ( , elemToNameSpaces , findChildByName , findChildrenByName + , findElementByName , findAttrByName ) where @@ -56,6 +57,12 @@ findChildrenByName ns pref name el = let ns' = ns <> elemToNameSpaces el in findChildren (elemName ns' pref name) el +-- | Like 'findChildrenByName', but searches descendants. +findElementByName :: NameSpaces -> Text -> Text -> Element -> Maybe Element +findElementByName ns pref name el = + let ns' = ns <> elemToNameSpaces el + in findElement (elemName ns' pref name) el + findAttrByName :: NameSpaces -> Text -> Text -> Element -> Maybe Text findAttrByName ns pref name el = let ns' = ns <> elemToNameSpaces el |