diff options
Diffstat (limited to 'src/Text/Pandoc/Readers/Docx.hs')
-rw-r--r-- | src/Text/Pandoc/Readers/Docx.hs | 101 |
1 files changed, 59 insertions, 42 deletions
diff --git a/src/Text/Pandoc/Readers/Docx.hs b/src/Text/Pandoc/Readers/Docx.hs index 00de6a0cd..c06adf7e3 100644 --- a/src/Text/Pandoc/Readers/Docx.hs +++ b/src/Text/Pandoc/Readers/Docx.hs @@ -61,13 +61,14 @@ module Text.Pandoc.Readers.Docx import Codec.Archive.Zip import Control.Monad.Reader import Control.Monad.State.Strict +import Data.Bifunctor (bimap, first) import qualified Data.ByteString.Lazy as B import Data.Default (Default) -import Data.List (delete, intersect) +import Data.List (delete, intersect, foldl') import Data.Char (isSpace) import qualified Data.Map as M import qualified Data.Text as T -import Data.Maybe (isJust, fromMaybe) +import Data.Maybe (catMaybes, isJust, fromMaybe) import Data.Sequence (ViewL (..), viewl) import qualified Data.Sequence as Seq import qualified Data.Set as Set @@ -85,6 +86,7 @@ import Text.Pandoc.Class.PandocMonad (PandocMonad) import qualified Text.Pandoc.Class.PandocMonad as P import Text.Pandoc.Error import Text.Pandoc.Logging +import Data.List.NonEmpty (nonEmpty) readDocx :: PandocMonad m => ReaderOptions @@ -112,6 +114,7 @@ data DState = DState { docxAnchorMap :: M.Map T.Text T.Text -- restarting , docxListState :: M.Map (T.Text, T.Text) Integer , docxPrevPara :: Inlines + , docxTableCaptions :: [Blocks] } instance Default DState where @@ -122,6 +125,7 @@ instance Default DState where , docxDropCap = mempty , docxListState = M.empty , docxPrevPara = mempty + , docxTableCaptions = [] } data DEnv = DEnv { docxOptions :: ReaderOptions @@ -490,15 +494,32 @@ singleParaToPlain blks singleton $ Plain ils singleParaToPlain blks = blks -cellToBlocks :: PandocMonad m => Docx.Cell -> DocxContext m Blocks -cellToBlocks (Docx.Cell bps) = do +cellToCell :: PandocMonad m => RowSpan -> Docx.Cell -> DocxContext m Pandoc.Cell +cellToCell rowSpan (Docx.Cell gridSpan _ bps) = do blks <- smushBlocks <$> mapM bodyPartToBlocks bps - return $ fromList $ blocksToDefinitions $ blocksToBullets $ toList blks + let blks' = singleParaToPlain $ fromList $ blocksToDefinitions $ blocksToBullets $ toList blks + return (cell AlignDefault rowSpan (ColSpan (fromIntegral gridSpan)) blks') + +rowsToRows :: PandocMonad m => [Docx.Row] -> DocxContext m [Pandoc.Row] +rowsToRows rows = do + let rowspans = (fmap . fmap) (first RowSpan) (Docx.rowsToRowspans rows) + cells <- traverse (traverse (uncurry cellToCell)) rowspans + return (fmap (Pandoc.Row nullAttr) cells) + +splitHeaderRows :: Bool -> [Docx.Row] -> ([Docx.Row], [Docx.Row]) +splitHeaderRows hasFirstRowFormatting rs = bimap reverse reverse $ fst + $ if hasFirstRowFormatting + then foldl' f ((take 1 rs, []), True) (drop 1 rs) + else foldl' f (([], []), False) rs + where + f ((headerRows, bodyRows), previousRowWasHeader) r@(Docx.Row h cs) + | h == HasTblHeader || (previousRowWasHeader && any isContinuationCell cs) + = ((r : headerRows, bodyRows), True) + | otherwise + = ((headerRows, r : bodyRows), False) + + isContinuationCell (Docx.Cell _ vm _) = vm == Docx.Continue -rowToBlocksList :: PandocMonad m => Docx.Row -> DocxContext m [Blocks] -rowToBlocksList (Docx.Row cells) = do - blksList <- mapM cellToBlocks cells - return $ map singleParaToPlain blksList -- like trimInlines, but also take out linebreaks trimSps :: Inlines -> Inlines @@ -545,6 +566,11 @@ normalizeToClassName = T.map go . fromStyleName where go c | isSpace c = '-' | otherwise = c +bodyPartToTableCaption :: PandocMonad m => BodyPart -> DocxContext m (Maybe Blocks) +bodyPartToTableCaption (TblCaption pPr parparts) = + Just <$> bodyPartToBlocks (Paragraph pPr parparts) +bodyPartToTableCaption _ = pure Nothing + bodyPartToBlocks :: PandocMonad m => BodyPart -> DocxContext m Blocks bodyPartToBlocks (Paragraph pPr parparts) | Just True <- pBidi pPr = do @@ -636,54 +662,43 @@ bodyPartToBlocks (ListItem pPr _ _ _ parparts) = let pPr' = pPr {pStyle = constructBogusParStyleData "list-paragraph": pStyle pPr} in bodyPartToBlocks $ Paragraph pPr' parparts +bodyPartToBlocks (TblCaption _ _) = + return $ para mempty -- collected separately bodyPartToBlocks (Tbl _ _ _ []) = return $ para mempty -bodyPartToBlocks (Tbl cap _ look parts@(r:rs)) = do - let cap' = simpleCaption $ plain $ text cap - (hdr, rows) = case firstRowFormatting look of - True | null rs -> (Nothing, [r]) - | otherwise -> (Just r, rs) - False -> (Nothing, r:rs) - - cells <- mapM rowToBlocksList rows +bodyPartToBlocks (Tbl cap grid look parts) = do + captions <- gets docxTableCaptions + fullCaption <- case captions of + c : cs -> do + modify (\s -> s { docxTableCaptions = cs }) + return c + [] -> return $ if T.null cap then mempty else plain (text cap) + let shortCaption = if T.null cap then Nothing else Just (toList (text cap)) + cap' = caption shortCaption fullCaption + (hdr, rows) = splitHeaderRows (firstRowFormatting look) parts let width = maybe 0 maximum $ nonEmpty $ map rowLength parts - -- Data.List.NonEmpty is not available with ghc 7.10 so we roll out - -- our own, see - -- https://github.com/jgm/pandoc/pull/4361#issuecomment-365416155 - nonEmpty [] = Nothing - nonEmpty l = Just l rowLength :: Docx.Row -> Int - rowLength (Docx.Row c) = length c + rowLength (Docx.Row _ c) = sum (fmap (\(Docx.Cell gridSpan _ _) -> fromIntegral gridSpan) c) - let toRow = Pandoc.Row nullAttr . map simpleCell - toHeaderRow l = [toRow l | not (null l)] + headerCells <- rowsToRows hdr + bodyCells <- rowsToRows rows - -- pad cells. New Text.Pandoc.Builder will do that for us, - -- so this is for compatibility while we switch over. - let cells' = map (\row -> toRow $ take width (row ++ repeat mempty)) cells - - hdrCells <- case hdr of - Just r' -> toHeaderRow <$> rowToBlocksList r' - Nothing -> return [] - - -- The two following variables (horizontal column alignment and - -- relative column widths) go to the default at the - -- moment. Width information is in the TblGrid field of the Tbl, - -- so should be possible. Alignment might be more difficult, - -- since there doesn't seem to be a column entity in docx. + -- Horizontal column alignment goes to the default at the moment. Getting + -- it might be difficult, since there doesn't seem to be a column entity + -- in docx. let alignments = replicate width AlignDefault - widths = replicate width ColWidthDefault + totalWidth = sum grid + widths = (\w -> ColWidth (fromInteger w / fromInteger totalWidth)) <$> grid return $ table cap' (zip alignments widths) - (TableHead nullAttr hdrCells) - [TableBody nullAttr 0 [] cells'] + (TableHead nullAttr headerCells) + [TableBody nullAttr 0 [] bodyCells] (TableFoot nullAttr []) bodyPartToBlocks (OMathPara e) = return $ para $ displayMath (writeTeX e) - -- replace targets with generated anchors. rewriteLink' :: PandocMonad m => Inline -> DocxContext m Inline rewriteLink' l@(Link attr ils (T.uncons -> Just ('#',target), title)) = do @@ -719,6 +734,8 @@ bodyToOutput :: PandocMonad m => Body -> DocxContext m (Meta, [Block]) bodyToOutput (Body bps) = do let (metabps, blkbps) = sepBodyParts bps meta <- bodyPartsToMeta metabps + captions <- catMaybes <$> mapM bodyPartToTableCaption blkbps + modify (\s -> s { docxTableCaptions = captions }) blks <- smushBlocks <$> mapM bodyPartToBlocks blkbps blks' <- rewriteLinks $ blocksToDefinitions $ blocksToBullets $ toList blks blks'' <- removeOrphanAnchors blks' |