aboutsummaryrefslogtreecommitdiff
path: root/src/Text/Pandoc/Readers
diff options
context:
space:
mode:
authorEmily Bourke <undergroundquizscene@protonmail.com>2020-06-18 09:53:32 +0100
committerAlbert Krewinkel <albert@zeitkraut.de>2021-05-28 20:15:23 +0200
commit56b211120c62a01f8aba1c4512acfe4677d8c7d0 (patch)
tree9cc7f09243f80abbdc67957f66381bbf03d0d79f /src/Text/Pandoc/Readers
parent44484d0dee1bd095240b9faf26f8d1dad8e560ea (diff)
downloadpandoc-56b211120c62a01f8aba1c4512acfe4677d8c7d0.tar.gz
Docx reader: Support new table features.
* Column spans * Row spans - The spec says that if the `val` attribute is ommitted, its value should be assumed to be `continue`, and that its values are restricted to {`restart`, `continue`}. If the value has any other value, I think it seems reasonable to default it to `continue`. It might cause problems if the spec is extended in the future by adding a third possible value, in which case this would probably give incorrect behaviour, and wouldn't error. * Allow multiple header rows * Include table description in simple caption - The table description element is like alt text for a table (along with the table caption element). It seems like we should include this somewhere, but I’m not 100% sure how – I’m pairing it with the simple caption for the moment. (Should it maybe go in the block caption instead?) * Detect table captions - Check for caption paragraph style /and/ either the simple or complex table field. This means the caption detection fails for captions which don’t contain a field, as in an example doc I added as a test. However, I think it’s better to be too conservative: a missed table caption will still show up as a paragraph next to the table, whereas if I incorrectly classify something else as a table caption it could cause havoc by pairing it up with a table it’s not at all related to, or dropping it entirely. * Update tests and add new ones Partially fixes: #6316
Diffstat (limited to 'src/Text/Pandoc/Readers')
-rw-r--r--src/Text/Pandoc/Readers/Docx.hs92
-rw-r--r--src/Text/Pandoc/Readers/Docx/Parse.hs113
-rw-r--r--src/Text/Pandoc/Readers/Docx/Util.hs7
3 files changed, 163 insertions, 49 deletions
diff --git a/src/Text/Pandoc/Readers/Docx.hs b/src/Text/Pandoc/Readers/Docx.hs
index 375bb7338..c06adf7e3 100644
--- a/src/Text/Pandoc/Readers/Docx.hs
+++ b/src/Text/Pandoc/Readers/Docx.hs
@@ -61,13 +61,14 @@ module Text.Pandoc.Readers.Docx
import Codec.Archive.Zip
import Control.Monad.Reader
import Control.Monad.State.Strict
+import Data.Bifunctor (bimap, first)
import qualified Data.ByteString.Lazy as B
import Data.Default (Default)
-import Data.List (delete, intersect)
+import Data.List (delete, intersect, foldl')
import Data.Char (isSpace)
import qualified Data.Map as M
import qualified Data.Text as T
-import Data.Maybe (isJust, fromMaybe)
+import Data.Maybe (catMaybes, isJust, fromMaybe)
import Data.Sequence (ViewL (..), viewl)
import qualified Data.Sequence as Seq
import qualified Data.Set as Set
@@ -113,6 +114,7 @@ data DState = DState { docxAnchorMap :: M.Map T.Text T.Text
-- restarting
, docxListState :: M.Map (T.Text, T.Text) Integer
, docxPrevPara :: Inlines
+ , docxTableCaptions :: [Blocks]
}
instance Default DState where
@@ -123,6 +125,7 @@ instance Default DState where
, docxDropCap = mempty
, docxListState = M.empty
, docxPrevPara = mempty
+ , docxTableCaptions = []
}
data DEnv = DEnv { docxOptions :: ReaderOptions
@@ -491,15 +494,32 @@ singleParaToPlain blks
singleton $ Plain ils
singleParaToPlain blks = blks
-cellToBlocks :: PandocMonad m => Docx.Cell -> DocxContext m Blocks
-cellToBlocks (Docx.Cell bps) = do
+cellToCell :: PandocMonad m => RowSpan -> Docx.Cell -> DocxContext m Pandoc.Cell
+cellToCell rowSpan (Docx.Cell gridSpan _ bps) = do
blks <- smushBlocks <$> mapM bodyPartToBlocks bps
- return $ fromList $ blocksToDefinitions $ blocksToBullets $ toList blks
+ let blks' = singleParaToPlain $ fromList $ blocksToDefinitions $ blocksToBullets $ toList blks
+ return (cell AlignDefault rowSpan (ColSpan (fromIntegral gridSpan)) blks')
+
+rowsToRows :: PandocMonad m => [Docx.Row] -> DocxContext m [Pandoc.Row]
+rowsToRows rows = do
+ let rowspans = (fmap . fmap) (first RowSpan) (Docx.rowsToRowspans rows)
+ cells <- traverse (traverse (uncurry cellToCell)) rowspans
+ return (fmap (Pandoc.Row nullAttr) cells)
+
+splitHeaderRows :: Bool -> [Docx.Row] -> ([Docx.Row], [Docx.Row])
+splitHeaderRows hasFirstRowFormatting rs = bimap reverse reverse $ fst
+ $ if hasFirstRowFormatting
+ then foldl' f ((take 1 rs, []), True) (drop 1 rs)
+ else foldl' f (([], []), False) rs
+ where
+ f ((headerRows, bodyRows), previousRowWasHeader) r@(Docx.Row h cs)
+ | h == HasTblHeader || (previousRowWasHeader && any isContinuationCell cs)
+ = ((r : headerRows, bodyRows), True)
+ | otherwise
+ = ((headerRows, r : bodyRows), False)
+
+ isContinuationCell (Docx.Cell _ vm _) = vm == Docx.Continue
-rowToBlocksList :: PandocMonad m => Docx.Row -> DocxContext m [Blocks]
-rowToBlocksList (Docx.Row cells) = do
- blksList <- mapM cellToBlocks cells
- return $ map singleParaToPlain blksList
-- like trimInlines, but also take out linebreaks
trimSps :: Inlines -> Inlines
@@ -546,6 +566,11 @@ normalizeToClassName = T.map go . fromStyleName
where go c | isSpace c = '-'
| otherwise = c
+bodyPartToTableCaption :: PandocMonad m => BodyPart -> DocxContext m (Maybe Blocks)
+bodyPartToTableCaption (TblCaption pPr parparts) =
+ Just <$> bodyPartToBlocks (Paragraph pPr parparts)
+bodyPartToTableCaption _ = pure Nothing
+
bodyPartToBlocks :: PandocMonad m => BodyPart -> DocxContext m Blocks
bodyPartToBlocks (Paragraph pPr parparts)
| Just True <- pBidi pPr = do
@@ -637,50 +662,43 @@ bodyPartToBlocks (ListItem pPr _ _ _ parparts) =
let pPr' = pPr {pStyle = constructBogusParStyleData "list-paragraph": pStyle pPr}
in
bodyPartToBlocks $ Paragraph pPr' parparts
+bodyPartToBlocks (TblCaption _ _) =
+ return $ para mempty -- collected separately
bodyPartToBlocks (Tbl _ _ _ []) =
return $ para mempty
-bodyPartToBlocks (Tbl cap grid look parts@(r:rs)) = do
- let cap' = simpleCaption $ plain $ text cap
- (hdr, rows) = case firstRowFormatting look of
- True | null rs -> (Nothing, [r])
- | otherwise -> (Just r, rs)
- False -> (Nothing, r:rs)
-
- cells <- mapM rowToBlocksList rows
+bodyPartToBlocks (Tbl cap grid look parts) = do
+ captions <- gets docxTableCaptions
+ fullCaption <- case captions of
+ c : cs -> do
+ modify (\s -> s { docxTableCaptions = cs })
+ return c
+ [] -> return $ if T.null cap then mempty else plain (text cap)
+ let shortCaption = if T.null cap then Nothing else Just (toList (text cap))
+ cap' = caption shortCaption fullCaption
+ (hdr, rows) = splitHeaderRows (firstRowFormatting look) parts
let width = maybe 0 maximum $ nonEmpty $ map rowLength parts
rowLength :: Docx.Row -> Int
- rowLength (Docx.Row c) = length c
+ rowLength (Docx.Row _ c) = sum (fmap (\(Docx.Cell gridSpan _ _) -> fromIntegral gridSpan) c)
- let toRow = Pandoc.Row nullAttr . map simpleCell
- toHeaderRow l = [toRow l | not (null l)]
+ headerCells <- rowsToRows hdr
+ bodyCells <- rowsToRows rows
- -- pad cells. New Text.Pandoc.Builder will do that for us,
- -- so this is for compatibility while we switch over.
- let cells' = map (\row -> toRow $ take width (row ++ repeat mempty)) cells
-
- hdrCells <- case hdr of
- Just r' -> toHeaderRow <$> rowToBlocksList r'
- Nothing -> return []
-
- -- The two following variables (horizontal column alignment and
- -- relative column widths) go to the default at the
- -- moment. Width information is in the TblGrid field of the Tbl,
- -- so should be possible. Alignment might be more difficult,
- -- since there doesn't seem to be a column entity in docx.
+ -- Horizontal column alignment goes to the default at the moment. Getting
+ -- it might be difficult, since there doesn't seem to be a column entity
+ -- in docx.
let alignments = replicate width AlignDefault
totalWidth = sum grid
widths = (\w -> ColWidth (fromInteger w / fromInteger totalWidth)) <$> grid
return $ table cap'
(zip alignments widths)
- (TableHead nullAttr hdrCells)
- [TableBody nullAttr 0 [] cells']
+ (TableHead nullAttr headerCells)
+ [TableBody nullAttr 0 [] bodyCells]
(TableFoot nullAttr [])
bodyPartToBlocks (OMathPara e) =
return $ para $ displayMath (writeTeX e)
-
-- replace targets with generated anchors.
rewriteLink' :: PandocMonad m => Inline -> DocxContext m Inline
rewriteLink' l@(Link attr ils (T.uncons -> Just ('#',target), title)) = do
@@ -716,6 +734,8 @@ bodyToOutput :: PandocMonad m => Body -> DocxContext m (Meta, [Block])
bodyToOutput (Body bps) = do
let (metabps, blkbps) = sepBodyParts bps
meta <- bodyPartsToMeta metabps
+ captions <- catMaybes <$> mapM bodyPartToTableCaption blkbps
+ modify (\s -> s { docxTableCaptions = captions })
blks <- smushBlocks <$> mapM bodyPartToBlocks blkbps
blks' <- rewriteLinks $ blocksToDefinitions $ blocksToBullets $ toList blks
blks'' <- removeOrphanAnchors blks'
diff --git a/src/Text/Pandoc/Readers/Docx/Parse.hs b/src/Text/Pandoc/Readers/Docx/Parse.hs
index 978d6ff3a..aaa8f4ad0 100644
--- a/src/Text/Pandoc/Readers/Docx/Parse.hs
+++ b/src/Text/Pandoc/Readers/Docx/Parse.hs
@@ -33,7 +33,9 @@ module Text.Pandoc.Readers.Docx.Parse ( Docx(..)
, ParStyle
, CharStyle(cStyleData)
, Row(..)
+ , TblHeader(..)
, Cell(..)
+ , VMerge(..)
, TrackedChange(..)
, ChangeType(..)
, ChangeInfo(..)
@@ -50,6 +52,7 @@ module Text.Pandoc.Readers.Docx.Parse ( Docx(..)
, pHeading
, constructBogusParStyleData
, leftBiasedMergeRunStyle
+ , rowsToRowspans
) where
import Text.Pandoc.Readers.Docx.Parse.Styles
import Codec.Archive.Zip
@@ -225,6 +228,7 @@ defaultParagraphStyle = ParagraphStyle { pStyle = []
data BodyPart = Paragraph ParagraphStyle [ParPart]
| ListItem ParagraphStyle T.Text T.Text (Maybe Level) [ParPart]
| Tbl T.Text TblGrid TblLook [Row]
+ | TblCaption ParagraphStyle [ParPart]
| OMathPara [Exp]
deriving Show
@@ -236,12 +240,61 @@ newtype TblLook = TblLook {firstRowFormatting::Bool}
defaultTblLook :: TblLook
defaultTblLook = TblLook{firstRowFormatting = False}
-newtype Row = Row [Cell]
- deriving Show
+data Row = Row TblHeader [Cell] deriving Show
+
+data TblHeader = HasTblHeader | NoTblHeader deriving (Show, Eq)
-newtype Cell = Cell [BodyPart]
+data Cell = Cell GridSpan VMerge [BodyPart]
deriving Show
+type GridSpan = Integer
+
+data VMerge = Continue
+ -- ^ This cell should be merged with the one above it
+ | Restart
+ -- ^ This cell should not be merged with the one above it
+ deriving (Show, Eq)
+
+rowsToRowspans :: [Row] -> [[(Int, Cell)]]
+rowsToRowspans rows = let
+ removeMergedCells = fmap (filter (\(_, Cell _ vmerge _) -> vmerge == Restart))
+ in removeMergedCells (foldr f [] rows)
+ where
+ f :: Row -> [[(Int, Cell)]] -> [[(Int, Cell)]]
+ f (Row _ cells) acc = let
+ spans = g cells Nothing (listToMaybe acc)
+ in spans : acc
+
+ g ::
+ -- | The current row
+ [Cell] ->
+ -- | Number of columns left below
+ Maybe Integer ->
+ -- | (rowspan so far, cell) for the row below this one
+ Maybe [(Int, Cell)] ->
+ -- | (rowspan so far, cell) for this row
+ [(Int, Cell)]
+ g cells _ Nothing = zip (repeat 1) cells
+ g cells columnsLeftBelow (Just rowBelow) =
+ case cells of
+ [] -> []
+ thisCell@(Cell thisGridSpan _ _) : restOfRow -> case rowBelow of
+ [] -> zip (repeat 1) cells
+ (spanSoFarBelow, Cell gridSpanBelow vmerge _) : _ ->
+ let spanSoFar = case vmerge of
+ Restart -> 1
+ Continue -> 1 + spanSoFarBelow
+ columnsToDrop = thisGridSpan + (gridSpanBelow - fromMaybe gridSpanBelow columnsLeftBelow)
+ (newColumnsLeftBelow, restOfRowBelow) = dropColumns columnsToDrop rowBelow
+ in (spanSoFar, thisCell) : g restOfRow (Just newColumnsLeftBelow) (Just restOfRowBelow)
+
+ dropColumns :: Integer -> [(a, Cell)] -> (Integer, [(a, Cell)])
+ dropColumns n [] = (n, [])
+ dropColumns n cells@((_, Cell gridSpan _ _) : otherCells) =
+ if n < gridSpan
+ then (gridSpan - n, cells)
+ else dropColumns (n - gridSpan) otherCells
+
leftBiasedMergeRunStyle :: RunStyle -> RunStyle -> RunStyle
leftBiasedMergeRunStyle a b = RunStyle
{ isBold = isBold a <|> isBold b
@@ -587,14 +640,31 @@ elemToRow ns element | isElem ns "w" "tr" element =
do
let cellElems = findChildrenByName ns "w" "tc" element
cells <- mapD (elemToCell ns) cellElems
- return $ Row cells
+ let hasTblHeader = maybe NoTblHeader (const HasTblHeader)
+ (findChildByName ns "w" "trPr" element
+ >>= findChildByName ns "w" "tblHeader")
+ return $ Row hasTblHeader cells
elemToRow _ _ = throwError WrongElem
elemToCell :: NameSpaces -> Element -> D Cell
elemToCell ns element | isElem ns "w" "tc" element =
do
+ let properties = findChildByName ns "w" "tcPr" element
+ let gridSpan = properties
+ >>= findChildByName ns "w" "gridSpan"
+ >>= findAttrByName ns "w" "val"
+ >>= stringToInteger
+ let vMerge = case properties >>= findChildByName ns "w" "vMerge" of
+ Nothing -> Restart
+ Just e ->
+ fromMaybe Continue $ do
+ s <- findAttrByName ns "w" "val" e
+ case s of
+ "continue" -> Just Continue
+ "restart" -> Just Restart
+ _ -> Nothing
cellContents <- mapD (elemToBodyPart ns) (elChildren element)
- return $ Cell cellContents
+ return $ Cell (fromMaybe 1 gridSpan) vMerge cellContents
elemToCell _ _ = throwError WrongElem
elemToParIndentation :: NameSpaces -> Element -> Maybe ParIndentation
@@ -626,10 +696,9 @@ pNumInfo = getParStyleField numInfo . pStyle
elemToBodyPart :: NameSpaces -> Element -> D BodyPart
elemToBodyPart ns element
| isElem ns "w" "p" element
- , (c:_) <- findChildrenByName ns "m" "oMathPara" element =
- do
- expsLst <- eitherToD $ readOMML $ showElement c
- return $ OMathPara expsLst
+ , (c:_) <- findChildrenByName ns "m" "oMathPara" element = do
+ expsLst <- eitherToD $ readOMML $ showElement c
+ return $ OMathPara expsLst
elemToBodyPart ns element
| isElem ns "w" "p" element
, Just (numId, lvl) <- getNumInfo ns element = do
@@ -647,13 +716,31 @@ elemToBodyPart ns element
Nothing | Just (numId, lvl) <- pNumInfo parstyle -> do
levelInfo <- lookupLevel numId lvl <$> asks envNumbering
return $ ListItem parstyle numId lvl levelInfo parparts
- _ -> return $ Paragraph parstyle parparts
+ _ -> let
+ hasCaptionStyle = elem "Caption" (pStyleId <$> pStyle parstyle)
+
+ hasSimpleTableField = fromMaybe False $ do
+ fldSimple <- findChildByName ns "w" "fldSimple" element
+ instr <- findAttrByName ns "w" "instr" fldSimple
+ pure ("Table" `elem` T.words instr)
+
+ hasComplexTableField = fromMaybe False $ do
+ instrText <- findElementByName ns "w" "instrText" element
+ pure ("Table" `elem` T.words (strContent instrText))
+
+ in if hasCaptionStyle && (hasSimpleTableField || hasComplexTableField)
+ then return $ TblCaption parstyle parparts
+ else return $ Paragraph parstyle parparts
+
elemToBodyPart ns element
| isElem ns "w" "tbl" element = do
- let caption' = findChildByName ns "w" "tblPr" element
+ let tblProperties = findChildByName ns "w" "tblPr" element
+ caption = fromMaybe "" $ tblProperties
>>= findChildByName ns "w" "tblCaption"
>>= findAttrByName ns "w" "val"
- caption = fromMaybe "" caption'
+ description = fromMaybe "" $ tblProperties
+ >>= findChildByName ns "w" "tblDescription"
+ >>= findAttrByName ns "w" "val"
grid' = case findChildByName ns "w" "tblGrid" element of
Just g -> elemToTblGrid ns g
Nothing -> return []
@@ -666,7 +753,7 @@ elemToBodyPart ns element
grid <- grid'
tblLook <- tblLook'
rows <- mapD (elemToRow ns) (elChildren element)
- return $ Tbl caption grid tblLook rows
+ return $ Tbl (caption <> description) grid tblLook rows
elemToBodyPart _ _ = throwError WrongElem
lookupRelationship :: DocumentLocation -> RelId -> [Relationship] -> Maybe Target
diff --git a/src/Text/Pandoc/Readers/Docx/Util.hs b/src/Text/Pandoc/Readers/Docx/Util.hs
index ac331cba6..970697a2d 100644
--- a/src/Text/Pandoc/Readers/Docx/Util.hs
+++ b/src/Text/Pandoc/Readers/Docx/Util.hs
@@ -19,6 +19,7 @@ module Text.Pandoc.Readers.Docx.Util (
, elemToNameSpaces
, findChildByName
, findChildrenByName
+ , findElementByName
, findAttrByName
) where
@@ -56,6 +57,12 @@ findChildrenByName ns pref name el =
let ns' = ns <> elemToNameSpaces el
in findChildren (elemName ns' pref name) el
+-- | Like 'findChildrenByName', but searches descendants.
+findElementByName :: NameSpaces -> Text -> Text -> Element -> Maybe Element
+findElementByName ns pref name el =
+ let ns' = ns <> elemToNameSpaces el
+ in findElement (elemName ns' pref name) el
+
findAttrByName :: NameSpaces -> Text -> Text -> Element -> Maybe Text
findAttrByName ns pref name el =
let ns' = ns <> elemToNameSpaces el