From 56b211120c62a01f8aba1c4512acfe4677d8c7d0 Mon Sep 17 00:00:00 2001 From: Emily Bourke Date: Thu, 18 Jun 2020 09:53:32 +0100 Subject: Docx reader: Support new table features. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Column spans * Row spans - The spec says that if the `val` attribute is ommitted, its value should be assumed to be `continue`, and that its values are restricted to {`restart`, `continue`}. If the value has any other value, I think it seems reasonable to default it to `continue`. It might cause problems if the spec is extended in the future by adding a third possible value, in which case this would probably give incorrect behaviour, and wouldn't error. * Allow multiple header rows * Include table description in simple caption - The table description element is like alt text for a table (along with the table caption element). It seems like we should include this somewhere, but I’m not 100% sure how – I’m pairing it with the simple caption for the moment. (Should it maybe go in the block caption instead?) * Detect table captions - Check for caption paragraph style /and/ either the simple or complex table field. This means the caption detection fails for captions which don’t contain a field, as in an example doc I added as a test. However, I think it’s better to be too conservative: a missed table caption will still show up as a paragraph next to the table, whereas if I incorrectly classify something else as a table caption it could cause havoc by pairing it up with a table it’s not at all related to, or dropping it entirely. * Update tests and add new ones Partially fixes: #6316 --- src/Text/Pandoc/Readers/Docx.hs | 92 ++++++++++++++++----------- src/Text/Pandoc/Readers/Docx/Parse.hs | 113 ++++++++++++++++++++++++++++++---- src/Text/Pandoc/Readers/Docx/Util.hs | 7 +++ 3 files changed, 163 insertions(+), 49 deletions(-) (limited to 'src/Text') diff --git a/src/Text/Pandoc/Readers/Docx.hs b/src/Text/Pandoc/Readers/Docx.hs index 375bb7338..c06adf7e3 100644 --- a/src/Text/Pandoc/Readers/Docx.hs +++ b/src/Text/Pandoc/Readers/Docx.hs @@ -61,13 +61,14 @@ module Text.Pandoc.Readers.Docx import Codec.Archive.Zip import Control.Monad.Reader import Control.Monad.State.Strict +import Data.Bifunctor (bimap, first) import qualified Data.ByteString.Lazy as B import Data.Default (Default) -import Data.List (delete, intersect) +import Data.List (delete, intersect, foldl') import Data.Char (isSpace) import qualified Data.Map as M import qualified Data.Text as T -import Data.Maybe (isJust, fromMaybe) +import Data.Maybe (catMaybes, isJust, fromMaybe) import Data.Sequence (ViewL (..), viewl) import qualified Data.Sequence as Seq import qualified Data.Set as Set @@ -113,6 +114,7 @@ data DState = DState { docxAnchorMap :: M.Map T.Text T.Text -- restarting , docxListState :: M.Map (T.Text, T.Text) Integer , docxPrevPara :: Inlines + , docxTableCaptions :: [Blocks] } instance Default DState where @@ -123,6 +125,7 @@ instance Default DState where , docxDropCap = mempty , docxListState = M.empty , docxPrevPara = mempty + , docxTableCaptions = [] } data DEnv = DEnv { docxOptions :: ReaderOptions @@ -491,15 +494,32 @@ singleParaToPlain blks singleton $ Plain ils singleParaToPlain blks = blks -cellToBlocks :: PandocMonad m => Docx.Cell -> DocxContext m Blocks -cellToBlocks (Docx.Cell bps) = do +cellToCell :: PandocMonad m => RowSpan -> Docx.Cell -> DocxContext m Pandoc.Cell +cellToCell rowSpan (Docx.Cell gridSpan _ bps) = do blks <- smushBlocks <$> mapM bodyPartToBlocks bps - return $ fromList $ blocksToDefinitions $ blocksToBullets $ toList blks + let blks' = singleParaToPlain $ fromList $ blocksToDefinitions $ blocksToBullets $ toList blks + return (cell AlignDefault rowSpan (ColSpan (fromIntegral gridSpan)) blks') + +rowsToRows :: PandocMonad m => [Docx.Row] -> DocxContext m [Pandoc.Row] +rowsToRows rows = do + let rowspans = (fmap . fmap) (first RowSpan) (Docx.rowsToRowspans rows) + cells <- traverse (traverse (uncurry cellToCell)) rowspans + return (fmap (Pandoc.Row nullAttr) cells) + +splitHeaderRows :: Bool -> [Docx.Row] -> ([Docx.Row], [Docx.Row]) +splitHeaderRows hasFirstRowFormatting rs = bimap reverse reverse $ fst + $ if hasFirstRowFormatting + then foldl' f ((take 1 rs, []), True) (drop 1 rs) + else foldl' f (([], []), False) rs + where + f ((headerRows, bodyRows), previousRowWasHeader) r@(Docx.Row h cs) + | h == HasTblHeader || (previousRowWasHeader && any isContinuationCell cs) + = ((r : headerRows, bodyRows), True) + | otherwise + = ((headerRows, r : bodyRows), False) + + isContinuationCell (Docx.Cell _ vm _) = vm == Docx.Continue -rowToBlocksList :: PandocMonad m => Docx.Row -> DocxContext m [Blocks] -rowToBlocksList (Docx.Row cells) = do - blksList <- mapM cellToBlocks cells - return $ map singleParaToPlain blksList -- like trimInlines, but also take out linebreaks trimSps :: Inlines -> Inlines @@ -546,6 +566,11 @@ normalizeToClassName = T.map go . fromStyleName where go c | isSpace c = '-' | otherwise = c +bodyPartToTableCaption :: PandocMonad m => BodyPart -> DocxContext m (Maybe Blocks) +bodyPartToTableCaption (TblCaption pPr parparts) = + Just <$> bodyPartToBlocks (Paragraph pPr parparts) +bodyPartToTableCaption _ = pure Nothing + bodyPartToBlocks :: PandocMonad m => BodyPart -> DocxContext m Blocks bodyPartToBlocks (Paragraph pPr parparts) | Just True <- pBidi pPr = do @@ -637,50 +662,43 @@ bodyPartToBlocks (ListItem pPr _ _ _ parparts) = let pPr' = pPr {pStyle = constructBogusParStyleData "list-paragraph": pStyle pPr} in bodyPartToBlocks $ Paragraph pPr' parparts +bodyPartToBlocks (TblCaption _ _) = + return $ para mempty -- collected separately bodyPartToBlocks (Tbl _ _ _ []) = return $ para mempty -bodyPartToBlocks (Tbl cap grid look parts@(r:rs)) = do - let cap' = simpleCaption $ plain $ text cap - (hdr, rows) = case firstRowFormatting look of - True | null rs -> (Nothing, [r]) - | otherwise -> (Just r, rs) - False -> (Nothing, r:rs) - - cells <- mapM rowToBlocksList rows +bodyPartToBlocks (Tbl cap grid look parts) = do + captions <- gets docxTableCaptions + fullCaption <- case captions of + c : cs -> do + modify (\s -> s { docxTableCaptions = cs }) + return c + [] -> return $ if T.null cap then mempty else plain (text cap) + let shortCaption = if T.null cap then Nothing else Just (toList (text cap)) + cap' = caption shortCaption fullCaption + (hdr, rows) = splitHeaderRows (firstRowFormatting look) parts let width = maybe 0 maximum $ nonEmpty $ map rowLength parts rowLength :: Docx.Row -> Int - rowLength (Docx.Row c) = length c + rowLength (Docx.Row _ c) = sum (fmap (\(Docx.Cell gridSpan _ _) -> fromIntegral gridSpan) c) - let toRow = Pandoc.Row nullAttr . map simpleCell - toHeaderRow l = [toRow l | not (null l)] + headerCells <- rowsToRows hdr + bodyCells <- rowsToRows rows - -- pad cells. New Text.Pandoc.Builder will do that for us, - -- so this is for compatibility while we switch over. - let cells' = map (\row -> toRow $ take width (row ++ repeat mempty)) cells - - hdrCells <- case hdr of - Just r' -> toHeaderRow <$> rowToBlocksList r' - Nothing -> return [] - - -- The two following variables (horizontal column alignment and - -- relative column widths) go to the default at the - -- moment. Width information is in the TblGrid field of the Tbl, - -- so should be possible. Alignment might be more difficult, - -- since there doesn't seem to be a column entity in docx. + -- Horizontal column alignment goes to the default at the moment. Getting + -- it might be difficult, since there doesn't seem to be a column entity + -- in docx. let alignments = replicate width AlignDefault totalWidth = sum grid widths = (\w -> ColWidth (fromInteger w / fromInteger totalWidth)) <$> grid return $ table cap' (zip alignments widths) - (TableHead nullAttr hdrCells) - [TableBody nullAttr 0 [] cells'] + (TableHead nullAttr headerCells) + [TableBody nullAttr 0 [] bodyCells] (TableFoot nullAttr []) bodyPartToBlocks (OMathPara e) = return $ para $ displayMath (writeTeX e) - -- replace targets with generated anchors. rewriteLink' :: PandocMonad m => Inline -> DocxContext m Inline rewriteLink' l@(Link attr ils (T.uncons -> Just ('#',target), title)) = do @@ -716,6 +734,8 @@ bodyToOutput :: PandocMonad m => Body -> DocxContext m (Meta, [Block]) bodyToOutput (Body bps) = do let (metabps, blkbps) = sepBodyParts bps meta <- bodyPartsToMeta metabps + captions <- catMaybes <$> mapM bodyPartToTableCaption blkbps + modify (\s -> s { docxTableCaptions = captions }) blks <- smushBlocks <$> mapM bodyPartToBlocks blkbps blks' <- rewriteLinks $ blocksToDefinitions $ blocksToBullets $ toList blks blks'' <- removeOrphanAnchors blks' diff --git a/src/Text/Pandoc/Readers/Docx/Parse.hs b/src/Text/Pandoc/Readers/Docx/Parse.hs index 978d6ff3a..aaa8f4ad0 100644 --- a/src/Text/Pandoc/Readers/Docx/Parse.hs +++ b/src/Text/Pandoc/Readers/Docx/Parse.hs @@ -33,7 +33,9 @@ module Text.Pandoc.Readers.Docx.Parse ( Docx(..) , ParStyle , CharStyle(cStyleData) , Row(..) + , TblHeader(..) , Cell(..) + , VMerge(..) , TrackedChange(..) , ChangeType(..) , ChangeInfo(..) @@ -50,6 +52,7 @@ module Text.Pandoc.Readers.Docx.Parse ( Docx(..) , pHeading , constructBogusParStyleData , leftBiasedMergeRunStyle + , rowsToRowspans ) where import Text.Pandoc.Readers.Docx.Parse.Styles import Codec.Archive.Zip @@ -225,6 +228,7 @@ defaultParagraphStyle = ParagraphStyle { pStyle = [] data BodyPart = Paragraph ParagraphStyle [ParPart] | ListItem ParagraphStyle T.Text T.Text (Maybe Level) [ParPart] | Tbl T.Text TblGrid TblLook [Row] + | TblCaption ParagraphStyle [ParPart] | OMathPara [Exp] deriving Show @@ -236,12 +240,61 @@ newtype TblLook = TblLook {firstRowFormatting::Bool} defaultTblLook :: TblLook defaultTblLook = TblLook{firstRowFormatting = False} -newtype Row = Row [Cell] - deriving Show +data Row = Row TblHeader [Cell] deriving Show + +data TblHeader = HasTblHeader | NoTblHeader deriving (Show, Eq) -newtype Cell = Cell [BodyPart] +data Cell = Cell GridSpan VMerge [BodyPart] deriving Show +type GridSpan = Integer + +data VMerge = Continue + -- ^ This cell should be merged with the one above it + | Restart + -- ^ This cell should not be merged with the one above it + deriving (Show, Eq) + +rowsToRowspans :: [Row] -> [[(Int, Cell)]] +rowsToRowspans rows = let + removeMergedCells = fmap (filter (\(_, Cell _ vmerge _) -> vmerge == Restart)) + in removeMergedCells (foldr f [] rows) + where + f :: Row -> [[(Int, Cell)]] -> [[(Int, Cell)]] + f (Row _ cells) acc = let + spans = g cells Nothing (listToMaybe acc) + in spans : acc + + g :: + -- | The current row + [Cell] -> + -- | Number of columns left below + Maybe Integer -> + -- | (rowspan so far, cell) for the row below this one + Maybe [(Int, Cell)] -> + -- | (rowspan so far, cell) for this row + [(Int, Cell)] + g cells _ Nothing = zip (repeat 1) cells + g cells columnsLeftBelow (Just rowBelow) = + case cells of + [] -> [] + thisCell@(Cell thisGridSpan _ _) : restOfRow -> case rowBelow of + [] -> zip (repeat 1) cells + (spanSoFarBelow, Cell gridSpanBelow vmerge _) : _ -> + let spanSoFar = case vmerge of + Restart -> 1 + Continue -> 1 + spanSoFarBelow + columnsToDrop = thisGridSpan + (gridSpanBelow - fromMaybe gridSpanBelow columnsLeftBelow) + (newColumnsLeftBelow, restOfRowBelow) = dropColumns columnsToDrop rowBelow + in (spanSoFar, thisCell) : g restOfRow (Just newColumnsLeftBelow) (Just restOfRowBelow) + + dropColumns :: Integer -> [(a, Cell)] -> (Integer, [(a, Cell)]) + dropColumns n [] = (n, []) + dropColumns n cells@((_, Cell gridSpan _ _) : otherCells) = + if n < gridSpan + then (gridSpan - n, cells) + else dropColumns (n - gridSpan) otherCells + leftBiasedMergeRunStyle :: RunStyle -> RunStyle -> RunStyle leftBiasedMergeRunStyle a b = RunStyle { isBold = isBold a <|> isBold b @@ -587,14 +640,31 @@ elemToRow ns element | isElem ns "w" "tr" element = do let cellElems = findChildrenByName ns "w" "tc" element cells <- mapD (elemToCell ns) cellElems - return $ Row cells + let hasTblHeader = maybe NoTblHeader (const HasTblHeader) + (findChildByName ns "w" "trPr" element + >>= findChildByName ns "w" "tblHeader") + return $ Row hasTblHeader cells elemToRow _ _ = throwError WrongElem elemToCell :: NameSpaces -> Element -> D Cell elemToCell ns element | isElem ns "w" "tc" element = do + let properties = findChildByName ns "w" "tcPr" element + let gridSpan = properties + >>= findChildByName ns "w" "gridSpan" + >>= findAttrByName ns "w" "val" + >>= stringToInteger + let vMerge = case properties >>= findChildByName ns "w" "vMerge" of + Nothing -> Restart + Just e -> + fromMaybe Continue $ do + s <- findAttrByName ns "w" "val" e + case s of + "continue" -> Just Continue + "restart" -> Just Restart + _ -> Nothing cellContents <- mapD (elemToBodyPart ns) (elChildren element) - return $ Cell cellContents + return $ Cell (fromMaybe 1 gridSpan) vMerge cellContents elemToCell _ _ = throwError WrongElem elemToParIndentation :: NameSpaces -> Element -> Maybe ParIndentation @@ -626,10 +696,9 @@ pNumInfo = getParStyleField numInfo . pStyle elemToBodyPart :: NameSpaces -> Element -> D BodyPart elemToBodyPart ns element | isElem ns "w" "p" element - , (c:_) <- findChildrenByName ns "m" "oMathPara" element = - do - expsLst <- eitherToD $ readOMML $ showElement c - return $ OMathPara expsLst + , (c:_) <- findChildrenByName ns "m" "oMathPara" element = do + expsLst <- eitherToD $ readOMML $ showElement c + return $ OMathPara expsLst elemToBodyPart ns element | isElem ns "w" "p" element , Just (numId, lvl) <- getNumInfo ns element = do @@ -647,13 +716,31 @@ elemToBodyPart ns element Nothing | Just (numId, lvl) <- pNumInfo parstyle -> do levelInfo <- lookupLevel numId lvl <$> asks envNumbering return $ ListItem parstyle numId lvl levelInfo parparts - _ -> return $ Paragraph parstyle parparts + _ -> let + hasCaptionStyle = elem "Caption" (pStyleId <$> pStyle parstyle) + + hasSimpleTableField = fromMaybe False $ do + fldSimple <- findChildByName ns "w" "fldSimple" element + instr <- findAttrByName ns "w" "instr" fldSimple + pure ("Table" `elem` T.words instr) + + hasComplexTableField = fromMaybe False $ do + instrText <- findElementByName ns "w" "instrText" element + pure ("Table" `elem` T.words (strContent instrText)) + + in if hasCaptionStyle && (hasSimpleTableField || hasComplexTableField) + then return $ TblCaption parstyle parparts + else return $ Paragraph parstyle parparts + elemToBodyPart ns element | isElem ns "w" "tbl" element = do - let caption' = findChildByName ns "w" "tblPr" element + let tblProperties = findChildByName ns "w" "tblPr" element + caption = fromMaybe "" $ tblProperties >>= findChildByName ns "w" "tblCaption" >>= findAttrByName ns "w" "val" - caption = fromMaybe "" caption' + description = fromMaybe "" $ tblProperties + >>= findChildByName ns "w" "tblDescription" + >>= findAttrByName ns "w" "val" grid' = case findChildByName ns "w" "tblGrid" element of Just g -> elemToTblGrid ns g Nothing -> return [] @@ -666,7 +753,7 @@ elemToBodyPart ns element grid <- grid' tblLook <- tblLook' rows <- mapD (elemToRow ns) (elChildren element) - return $ Tbl caption grid tblLook rows + return $ Tbl (caption <> description) grid tblLook rows elemToBodyPart _ _ = throwError WrongElem lookupRelationship :: DocumentLocation -> RelId -> [Relationship] -> Maybe Target diff --git a/src/Text/Pandoc/Readers/Docx/Util.hs b/src/Text/Pandoc/Readers/Docx/Util.hs index ac331cba6..970697a2d 100644 --- a/src/Text/Pandoc/Readers/Docx/Util.hs +++ b/src/Text/Pandoc/Readers/Docx/Util.hs @@ -19,6 +19,7 @@ module Text.Pandoc.Readers.Docx.Util ( , elemToNameSpaces , findChildByName , findChildrenByName + , findElementByName , findAttrByName ) where @@ -56,6 +57,12 @@ findChildrenByName ns pref name el = let ns' = ns <> elemToNameSpaces el in findChildren (elemName ns' pref name) el +-- | Like 'findChildrenByName', but searches descendants. +findElementByName :: NameSpaces -> Text -> Text -> Element -> Maybe Element +findElementByName ns pref name el = + let ns' = ns <> elemToNameSpaces el + in findElement (elemName ns' pref name) el + findAttrByName :: NameSpaces -> Text -> Text -> Element -> Maybe Text findAttrByName ns pref name el = let ns' = ns <> elemToNameSpaces el -- cgit v1.2.3