From fd14ad52618c98928ab83aa43689158cc788c4a8 Mon Sep 17 00:00:00 2001 From: Nikolay Yakimov Date: Sun, 8 Sep 2019 16:47:44 +0300 Subject: [Docx Reader] Code clean-up Reduce code duplication, remove redundant brackets, use newtype instead of data where appropriate --- src/Text/Pandoc/Readers/Docx.hs | 70 +++++++++++++---------------------- src/Text/Pandoc/Readers/Docx/Parse.hs | 32 +++++++--------- 2 files changed, 39 insertions(+), 63 deletions(-) (limited to 'src') diff --git a/src/Text/Pandoc/Readers/Docx.hs b/src/Text/Pandoc/Readers/Docx.hs index 4f44d18e7..a26986af2 100644 --- a/src/Text/Pandoc/Readers/Docx.hs +++ b/src/Text/Pandoc/Readers/Docx.hs @@ -2,6 +2,7 @@ {-# LANGUAGE CPP #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE PatternGuards #-} +{-# LANGUAGE MultiWayIf #-} {- | Module : Text.Pandoc.Readers.Docx Copyright : Copyright (C) 2014-2019 Jesse Rosenthal @@ -129,7 +130,7 @@ instance Default DEnv where type DocxContext m = ReaderT DEnv (StateT DState m) evalDocxContext :: PandocMonad m => DocxContext m a -> DEnv -> DState -> m a -evalDocxContext ctx env st = flip evalStateT st $flip runReaderT env ctx +evalDocxContext ctx env st = flip evalStateT st $ runReaderT ctx env -- This is empty, but we put it in for future-proofing. spansToKeep :: [String] @@ -537,15 +538,6 @@ parStyleToTransform pPr let pPr' = pPr { pStyle = cs, indentation = Nothing} transform <- parStyleToTransform pPr' return $ divWith ("", [c], []) . transform - | (c:cs) <- pStyle pPr - , Just True <- pBlockQuote pPr = do - opts <- asks docxOptions - let pPr' = pPr { pStyle = cs } - transform <- parStyleToTransform pPr' - let extraInfo = if isEnabled Ext_styles opts - then divWith ("", [], [("custom-style", c)]) - else id - return $ extraInfo . blockQuote . transform | (c:cs) <- pStyle pPr = do opts <- asks docxOptions let pPr' = pPr { pStyle = cs} @@ -553,22 +545,15 @@ parStyleToTransform pPr let extraInfo = if isEnabled Ext_styles opts then divWith ("", [], [("custom-style", c)]) else id - return $ extraInfo . transform + return $ extraInfo . (if fromMaybe False (pBlockQuote pPr) then blockQuote else id) . transform | null (pStyle pPr) - , Just left <- indentation pPr >>= leftParIndent - , Just hang <- indentation pPr >>= hangingParIndent = do + , Just left <- indentation pPr >>= leftParIndent = do let pPr' = pPr { indentation = Nothing } + hang = fromMaybe 0 $ indentation pPr >>= hangingParIndent transform <- parStyleToTransform pPr' - return $ case (left - hang) > 0 of - True -> blockQuote . transform - False -> transform - | null (pStyle pPr), - Just left <- indentation pPr >>= leftParIndent = do - let pPr' = pPr { indentation = Nothing } - transform <- parStyleToTransform pPr' - return $ case left > 0 of - True -> blockQuote . transform - False -> transform + return $ if (left - hang) > 0 + then blockQuote . transform + else transform parStyleToTransform _ = return id bodyPartToBlocks :: PandocMonad m => BodyPart -> DocxContext m Blocks @@ -585,7 +570,7 @@ bodyPartToBlocks (Paragraph pPr parparts) makeHeaderAnchor $ headerWith ("", delete style (pStyle pPr), []) n ils | otherwise = do - ils <- (trimSps . smushInlines) <$> mapM parPartToInlines parparts + ils <- trimSps . smushInlines <$> mapM parPartToInlines parparts prevParaIls <- gets docxPrevPara dropIls <- gets docxDropCap let ils' = dropIls <> ils @@ -596,21 +581,21 @@ bodyPartToBlocks (Paragraph pPr parparts) let ils'' = prevParaIls <> (if isNull prevParaIls then mempty else space) <> ils' + handleInsertion = do + modify $ \s -> s {docxPrevPara = mempty} + transform <- parStyleToTransform pPr + return $ transform $ para ils'' opts <- asks docxOptions - case () of - - _ | isNull ils'' && not (isEnabled Ext_empty_paragraphs opts) -> + if | isNull ils'' && not (isEnabled Ext_empty_paragraphs opts) -> return mempty - _ | Just (TrackedChange Insertion _) <- pChange pPr - , AcceptChanges <- readerTrackChanges opts -> do - modify $ \s -> s {docxPrevPara = mempty} - transform <- parStyleToTransform pPr - return $ transform $ para ils'' - _ | Just (TrackedChange Insertion _) <- pChange pPr + | Just (TrackedChange Insertion _) <- pChange pPr + , AcceptChanges <- readerTrackChanges opts -> + handleInsertion + | Just (TrackedChange Insertion _) <- pChange pPr , RejectChanges <- readerTrackChanges opts -> do modify $ \s -> s {docxPrevPara = ils''} return mempty - _ | Just (TrackedChange Insertion cInfo) <- pChange pPr + | Just (TrackedChange Insertion cInfo) <- pChange pPr , AllChanges <- readerTrackChanges opts , ChangeInfo _ cAuthor cDate <- cInfo -> do let attr = ("", ["paragraph-insertion"], [("author", cAuthor), ("date", cDate)]) @@ -618,16 +603,14 @@ bodyPartToBlocks (Paragraph pPr parparts) transform <- parStyleToTransform pPr return $ transform $ para $ ils'' <> insertMark - _ | Just (TrackedChange Deletion _) <- pChange pPr + | Just (TrackedChange Deletion _) <- pChange pPr , AcceptChanges <- readerTrackChanges opts -> do modify $ \s -> s {docxPrevPara = ils''} return mempty - _ | Just (TrackedChange Deletion _) <- pChange pPr - , RejectChanges <- readerTrackChanges opts -> do - modify $ \s -> s {docxPrevPara = mempty} - transform <- parStyleToTransform pPr - return $ transform $ para ils'' - _ | Just (TrackedChange Deletion cInfo) <- pChange pPr + | Just (TrackedChange Deletion _) <- pChange pPr + , RejectChanges <- readerTrackChanges opts -> + handleInsertion + | Just (TrackedChange Deletion cInfo) <- pChange pPr , AllChanges <- readerTrackChanges opts , ChangeInfo _ cAuthor cDate <- cInfo -> do let attr = ("", ["paragraph-deletion"], [("author", cAuthor), ("date", cDate)]) @@ -635,10 +618,7 @@ bodyPartToBlocks (Paragraph pPr parparts) transform <- parStyleToTransform pPr return $ transform $ para $ ils'' <> insertMark - _ | otherwise -> do - modify $ \s -> s {docxPrevPara = mempty} - transform <- parStyleToTransform pPr - return $ transform $ para ils'' + | otherwise -> handleInsertion bodyPartToBlocks (ListItem pPr numId lvl (Just levelInfo) parparts) = do -- We check whether this current numId has previously been used, -- since Docx expects us to pick up where we left off. diff --git a/src/Text/Pandoc/Readers/Docx/Parse.hs b/src/Text/Pandoc/Readers/Docx/Parse.hs index f725660b9..330c9208f 100644 --- a/src/Text/Pandoc/Readers/Docx/Parse.hs +++ b/src/Text/Pandoc/Readers/Docx/Parse.hs @@ -121,9 +121,9 @@ unwrap :: NameSpaces -> Content -> [Content] unwrap ns (Elem element) | isElem ns "w" "sdt" element , Just sdtContent <- findChildByName ns "w" "sdtContent" element - = concatMap ((unwrap ns) . Elem) (elChildren sdtContent) + = concatMap (unwrap ns . Elem) (elChildren sdtContent) | isElem ns "w" "smartTag" element - = concatMap ((unwrap ns) . Elem) (elChildren element) + = concatMap (unwrap ns . Elem) (elChildren element) unwrap _ content = [content] unwrapChild :: NameSpaces -> Content -> Content @@ -149,13 +149,13 @@ walkDocument ns element = _ -> Nothing -data Docx = Docx Document +newtype Docx = Docx Document deriving Show data Document = Document NameSpaces Body deriving Show -data Body = Body [BodyPart] +newtype Body = Body [BodyPart] deriving Show type Media = [(FilePath, B.ByteString)] @@ -242,16 +242,16 @@ data BodyPart = Paragraph ParagraphStyle [ParPart] type TblGrid = [Integer] -data TblLook = TblLook {firstRowFormatting::Bool} +newtype TblLook = TblLook {firstRowFormatting::Bool} deriving Show defaultTblLook :: TblLook defaultTblLook = TblLook{firstRowFormatting = False} -data Row = Row [Cell] +newtype Row = Row [Cell] deriving Show -data Cell = Cell [BodyPart] +newtype Cell = Cell [BodyPart] deriving Show -- (width, height) in EMUs @@ -495,7 +495,7 @@ filePathToRelType "word/_rels/endnotes.xml.rels" _ = Just InEndnote -- -- to see if it's a documentPath, we have to check against the dynamic -- -- docPath specified in "_rels/.rels" filePathToRelType path docXmlPath = - if path == "word/_rels/" ++ (takeFileName docXmlPath) ++ ".rels" + if path == "word/_rels/" ++ takeFileName docXmlPath ++ ".rels" then Just InDocument else Nothing @@ -537,7 +537,7 @@ lookupLevel numId ilvl (Numbering _ numbs absNumbs) = do case lvlOverride of Just (LevelOverride _ _ (Just lvl')) -> Just lvl' Just (LevelOverride _ (Just strt) _) -> - lookup ilvl $ map (\(Level i fmt s _) -> (i, (Level i fmt s (Just strt)))) lvls + lookup ilvl $ map (\(Level i fmt s _) -> (i, Level i fmt s (Just strt))) lvls _ -> lookup ilvl $ map (\l@(Level i _ _ _) -> (i, l)) lvls @@ -703,23 +703,19 @@ elemToBodyPart ns element elemToBodyPart ns element | isElem ns "w" "p" element , Just (numId, lvl) <- getNumInfo ns element = do - sty <- asks envParStyles - let parstyle = elemToParagraphStyle ns element sty + parstyle <- elemToParagraphStyle ns element <$> asks envParStyles parparts <- mapD (elemToParPart ns) (elChildren element) - num <- asks envNumbering - let levelInfo = lookupLevel numId lvl num + levelInfo <- lookupLevel numId lvl <$> asks envNumbering return $ ListItem parstyle numId lvl levelInfo parparts elemToBodyPart ns element | isElem ns "w" "p" element = do - sty <- asks envParStyles - let parstyle = elemToParagraphStyle ns element sty + parstyle <- elemToParagraphStyle ns element <$> asks envParStyles parparts <- mapD (elemToParPart ns) (elChildren element) -- Word uses list enumeration for numbered headings, so we only -- want to infer a list from the styles if it is NOT a heading. case pHeading parstyle of Nothing | Just (numId, lvl) <- pNumInfo parstyle -> do - num <- asks envNumbering - let levelInfo = lookupLevel numId lvl num + levelInfo <- lookupLevel numId lvl <$> asks envNumbering return $ ListItem parstyle numId lvl levelInfo parparts _ -> return $ Paragraph parstyle parparts elemToBodyPart ns element @@ -727,7 +723,7 @@ elemToBodyPart ns element let caption' = findChildByName ns "w" "tblPr" element >>= findChildByName ns "w" "tblCaption" >>= findAttrByName ns "w" "val" - caption = (fromMaybe "" caption') + caption = fromMaybe "" caption' grid' = case findChildByName ns "w" "tblGrid" element of Just g -> elemToTblGrid ns g Nothing -> return [] -- cgit v1.2.3 From c113ca6717d00870ec10716897d76a6fa62b1d41 Mon Sep 17 00:00:00 2001 From: Nikolay Yakimov Date: Sun, 15 Sep 2019 01:40:23 +0300 Subject: [Docx Reader] Use style names, not ids, for assigning semantic meaning Motivating issues: #5523, #5052, #5074 Style name comparisons are case-insensitive, since those are case-insensitive in Word. w:styleId will be used as style name if w:name is missing (this should only happen for malformed docx and is kept as a fallback to avoid failing altogether on malformed documents) Block quote detection code moved from Docx.Parser to Readers.Docx Code styles, i.e. "Source Code" and "Verbatim Char" now honor style inheritance Docx Reader now honours "Compact" style (used in Pandoc-generated docx). The side-effect is that "Compact" style no longer shows up in docx+styles output. Styles inherited from "Compact" will still show up. Removed obsolete list-item style from divsToKeep. That didn't really do anything for a while now. Add newtypes to differentiate between style names, ids, and different style types (that is, paragraph and character styles) Since docx style names can have spaces in them, and pandoc-markdown classes can't, anywhere when style name is used as a class name, spaces are replaced with ASCII dashes `-`. Get rid of extraneous intermediate types, carrying styleId information. Instead, styleId is saved with other style data. Use RunStyle for inline style definitions only (lacking styleId and styleName); for Character Styles use CharStyle type (which is basicaly RunStyle with styleId and StyleName bolted onto it). --- src/Text/Pandoc/Readers/Docx.hs | 162 ++++++++++--------- src/Text/Pandoc/Readers/Docx/Lists.hs | 25 +-- src/Text/Pandoc/Readers/Docx/Parse.hs | 283 ++++++++++++++++++++++----------- test/Tests/Readers/Docx.hs | 9 ++ test/docx/compact-style-removal.docx | Bin 0 -> 9951 bytes test/docx/compact-style-removal.native | 5 + test/docx/lists-compact.docx | Bin 0 -> 9952 bytes test/docx/lists-compact.native | 5 + 8 files changed, 306 insertions(+), 183 deletions(-) create mode 100644 test/docx/compact-style-removal.docx create mode 100644 test/docx/compact-style-removal.native create mode 100644 test/docx/lists-compact.docx create mode 100644 test/docx/lists-compact.native (limited to 'src') diff --git a/src/Text/Pandoc/Readers/Docx.hs b/src/Text/Pandoc/Readers/Docx.hs index a26986af2..9d17ab118 100644 --- a/src/Text/Pandoc/Readers/Docx.hs +++ b/src/Text/Pandoc/Readers/Docx.hs @@ -3,6 +3,7 @@ {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE PatternGuards #-} {-# LANGUAGE MultiWayIf #-} +{-# LANGUAGE FlexibleContexts #-} {- | Module : Text.Pandoc.Readers.Docx Copyright : Copyright (C) 2014-2019 Jesse Rosenthal @@ -65,6 +66,7 @@ import Control.Monad.State.Strict import qualified Data.ByteString.Lazy as B import Data.Default (Default) import Data.List (delete, intersect) +import Data.Char (isSpace) import qualified Data.Map as M import Data.Maybe (isJust, fromMaybe) import Data.Sequence (ViewL (..), viewl) @@ -133,13 +135,13 @@ evalDocxContext :: PandocMonad m => DocxContext m a -> DEnv -> DState -> m a evalDocxContext ctx env st = flip evalStateT st $ runReaderT ctx env -- This is empty, but we put it in for future-proofing. -spansToKeep :: [String] +spansToKeep :: [CharStyleName] spansToKeep = [] -divsToKeep :: [String] -divsToKeep = ["list-item", "Definition", "DefinitionTerm"] +divsToKeep :: [ParaStyleName] +divsToKeep = ["Definition", "Definition Term"] -metaStyles :: M.Map String String +metaStyles :: M.Map ParaStyleName String metaStyles = M.fromList [ ("Title", "title") , ("Subtitle", "subtitle") , ("Author", "author") @@ -151,7 +153,7 @@ sepBodyParts = span (\bp -> isMetaPar bp || isEmptyPar bp) isMetaPar :: BodyPart -> Bool isMetaPar (Paragraph pPr _) = - not $ null $ intersect (pStyle pPr) (M.keys metaStyles) + not $ null $ intersect (getStyleNames $ pStyle pPr) (M.keys metaStyles) isMetaPar _ = False isEmptyPar :: BodyPart -> Bool @@ -168,7 +170,7 @@ bodyPartsToMeta' :: PandocMonad m => [BodyPart] -> DocxContext m (M.Map String M bodyPartsToMeta' [] = return M.empty bodyPartsToMeta' (bp : bps) | (Paragraph pPr parParts) <- bp - , (c : _)<- (pStyle pPr) `intersect` (M.keys metaStyles) + , (c : _)<- getStyleNames (pStyle pPr) `intersect` M.keys metaStyles , (Just metaField) <- M.lookup c metaStyles = do inlines <- smushInlines <$> mapM parPartToInlines parParts remaining <- bodyPartsToMeta' bps @@ -198,11 +200,29 @@ fixAuthors (MetaBlocks blks) = g _ = MetaInlines [] fixAuthors mv = mv -codeStyles :: [String] -codeStyles = ["VerbatimChar"] +isInheritedFromStyles :: (Eq (StyleName s), HasStyleName s, HasParentStyle s) => [StyleName s] -> s -> Bool +isInheritedFromStyles names sty + | getStyleName sty `elem` names = True + | Just psty <- getParentStyle sty = isInheritedFromStyles names psty + | otherwise = False -codeDivs :: [String] -codeDivs = ["SourceCode"] +hasStylesInheritedFrom :: [ParaStyleName] -> ParagraphStyle -> Bool +hasStylesInheritedFrom ns s = any (isInheritedFromStyles ns) $ pStyle s + +removeStyleNamed :: ParaStyleName -> ParagraphStyle -> ParagraphStyle +removeStyleNamed sn ps = ps{pStyle = filter (\psd -> getStyleName psd /= sn) $ pStyle ps} + +isCodeCharStyle :: CharStyle -> Bool +isCodeCharStyle = isInheritedFromStyles ["Verbatim Char"] + +isCodeDiv :: ParagraphStyle -> Bool +isCodeDiv = hasStylesInheritedFrom ["Source Code"] + +isBlockQuote :: ParStyle -> Bool +isBlockQuote = + isInheritedFromStyles [ + "Quote", "Block Text", "Block Quote", "Block Quotation" + ] runElemToInlines :: RunElem -> Inlines runElemToInlines (TextRun s) = text s @@ -228,57 +248,31 @@ parPartToString (InternalHyperLink _ runs) = concatMap runToString runs parPartToString (ExternalHyperLink _ runs) = concatMap runToString runs parPartToString _ = "" -blacklistedCharStyles :: [String] +blacklistedCharStyles :: [CharStyleName] blacklistedCharStyles = ["Hyperlink"] resolveDependentRunStyle :: PandocMonad m => RunStyle -> DocxContext m RunStyle resolveDependentRunStyle rPr - | Just (s, _) <- rStyle rPr, s `elem` blacklistedCharStyles = + | Just s <- rParentStyle rPr + , getStyleName s `elem` blacklistedCharStyles = return rPr - | Just (_, cs) <- rStyle rPr = do + | Just s <- rParentStyle rPr = do opts <- asks docxOptions if isEnabled Ext_styles opts then return rPr - else do rPr' <- resolveDependentRunStyle cs - return $ - RunStyle { isBold = case isBold rPr of - Just bool -> Just bool - Nothing -> isBold rPr' - , isItalic = case isItalic rPr of - Just bool -> Just bool - Nothing -> isItalic rPr' - , isSmallCaps = case isSmallCaps rPr of - Just bool -> Just bool - Nothing -> isSmallCaps rPr' - , isStrike = case isStrike rPr of - Just bool -> Just bool - Nothing -> isStrike rPr' - , isRTL = case isRTL rPr of - Just bool -> Just bool - Nothing -> isRTL rPr' - , rVertAlign = case rVertAlign rPr of - Just valign -> Just valign - Nothing -> rVertAlign rPr' - , rUnderline = case rUnderline rPr of - Just ulstyle -> Just ulstyle - Nothing -> rUnderline rPr' - , rStyle = rStyle rPr - } + else leftBiasedMergeRunStyle rPr <$> resolveDependentRunStyle (cStyleData s) | otherwise = return rPr runStyleToTransform :: PandocMonad m => RunStyle -> DocxContext m (Inlines -> Inlines) runStyleToTransform rPr - | Just (s, _) <- rStyle rPr - , s `elem` spansToKeep = do - transform <- runStyleToTransform rPr{rStyle = Nothing} - return $ spanWith ("", [s], []) . transform - | Just (s, _) <- rStyle rPr = do - opts <- asks docxOptions - let extraInfo = if isEnabled Ext_styles opts - then spanWith ("", [], [("custom-style", s)]) - else id - transform <- runStyleToTransform rPr{rStyle = Nothing} - return $ extraInfo . transform + | Just sn <- getStyleName <$> rParentStyle rPr + , sn `elem` spansToKeep = do + transform <- runStyleToTransform rPr{rParentStyle = Nothing} + return $ spanWith ("", [normalizeToClassName sn], []) . transform + | Just s <- rParentStyle rPr = do + ei <- extraInfo spanWith s + transform <- runStyleToTransform rPr{rParentStyle = Nothing} + return $ ei . transform | Just True <- isItalic rPr = do transform <- runStyleToTransform rPr{isItalic = Nothing} return $ emph . transform @@ -310,8 +304,7 @@ runStyleToTransform rPr runToInlines :: PandocMonad m => Run -> DocxContext m Inlines runToInlines (Run rs runElems) - | Just (s, _) <- rStyle rs - , s `elem` codeStyles = do + | maybe False isCodeCharStyle $ rParentStyle rs = do rPr <- resolveDependentRunStyle rs let codeString = code $ concatMap runElemToString runElems return $ case rVertAlign rPr of @@ -526,39 +519,49 @@ trimSps (Many ils) = Many $ Seq.dropWhileL isSp $Seq.dropWhileR isSp ils isSp LineBreak = True isSp _ = False +extraInfo :: (Eq (StyleName a), PandocMonad m, HasStyleName a) + => (Attr -> i -> i) -> a -> DocxContext m (i -> i) +extraInfo f s = do + opts <- asks docxOptions + return $ if | isEnabled Ext_styles opts + -> f ("", [], [("custom-style", fromStyleName $ getStyleName s)]) + | otherwise -> id + parStyleToTransform :: PandocMonad m => ParagraphStyle -> DocxContext m (Blocks -> Blocks) parStyleToTransform pPr | (c:cs) <- pStyle pPr - , c `elem` divsToKeep = do + , getStyleName c `elem` divsToKeep = do let pPr' = pPr { pStyle = cs } transform <- parStyleToTransform pPr' - return $ divWith ("", [c], []) . transform + return $ divWith ("", [normalizeToClassName $ getStyleName c], []) . transform | (c:cs) <- pStyle pPr, - c `elem` listParagraphDivs = do + getStyleName c `elem` listParagraphStyles = do let pPr' = pPr { pStyle = cs, indentation = Nothing} transform <- parStyleToTransform pPr' - return $ divWith ("", [c], []) . transform + return $ divWith ("", [normalizeToClassName $ getStyleName c], []) . transform | (c:cs) <- pStyle pPr = do - opts <- asks docxOptions - let pPr' = pPr { pStyle = cs} + let pPr' = pPr { pStyle = cs } transform <- parStyleToTransform pPr' - let extraInfo = if isEnabled Ext_styles opts - then divWith ("", [], [("custom-style", c)]) - else id - return $ extraInfo . (if fromMaybe False (pBlockQuote pPr) then blockQuote else id) . transform + ei <- extraInfo divWith c + return $ ei . (if isBlockQuote c then blockQuote else id) . transform | null (pStyle pPr) , Just left <- indentation pPr >>= leftParIndent = do let pPr' = pPr { indentation = Nothing } hang = fromMaybe 0 $ indentation pPr >>= hangingParIndent transform <- parStyleToTransform pPr' - return $ if (left - hang) > 0 + return $ if (left - hang) > 0 then blockQuote . transform else transform parStyleToTransform _ = return id +normalizeToClassName :: (FromStyleName a) => a -> String +normalizeToClassName = map go . fromStyleName + where go c | isSpace c = '-' + | otherwise = c + bodyPartToBlocks :: PandocMonad m => BodyPart -> DocxContext m Blocks bodyPartToBlocks (Paragraph pPr parparts) - | not $ null $ codeDivs `intersect` (pStyle pPr) = do + | isCodeDiv pPr = do transform <- parStyleToTransform pPr return $ transform $ @@ -568,13 +571,16 @@ bodyPartToBlocks (Paragraph pPr parparts) ils <-local (\s-> s{docxInHeaderBlock=True}) (smushInlines <$> mapM parPartToInlines parparts) makeHeaderAnchor $ - headerWith ("", delete style (pStyle pPr), []) n ils + headerWith ("", map normalizeToClassName . delete style $ getStyleNames (pStyle pPr), []) n ils | otherwise = do ils <- trimSps . smushInlines <$> mapM parPartToInlines parparts prevParaIls <- gets docxPrevPara dropIls <- gets docxDropCap let ils' = dropIls <> ils - if dropCap pPr + let (paraOrPlain, pPr') + | hasStylesInheritedFrom ["Compact"] pPr = (plain, removeStyleNamed "Compact" pPr) + | otherwise = (para, pPr) + if dropCap pPr' then do modify $ \s -> s { docxDropCap = ils' } return mempty else do modify $ \s -> s { docxDropCap = mempty } @@ -583,41 +589,41 @@ bodyPartToBlocks (Paragraph pPr parparts) ils' handleInsertion = do modify $ \s -> s {docxPrevPara = mempty} - transform <- parStyleToTransform pPr - return $ transform $ para ils'' + transform <- parStyleToTransform pPr' + return $ transform $ paraOrPlain ils'' opts <- asks docxOptions if | isNull ils'' && not (isEnabled Ext_empty_paragraphs opts) -> return mempty - | Just (TrackedChange Insertion _) <- pChange pPr + | Just (TrackedChange Insertion _) <- pChange pPr' , AcceptChanges <- readerTrackChanges opts -> handleInsertion - | Just (TrackedChange Insertion _) <- pChange pPr + | Just (TrackedChange Insertion _) <- pChange pPr' , RejectChanges <- readerTrackChanges opts -> do modify $ \s -> s {docxPrevPara = ils''} return mempty - | Just (TrackedChange Insertion cInfo) <- pChange pPr + | Just (TrackedChange Insertion cInfo) <- pChange pPr' , AllChanges <- readerTrackChanges opts , ChangeInfo _ cAuthor cDate <- cInfo -> do let attr = ("", ["paragraph-insertion"], [("author", cAuthor), ("date", cDate)]) insertMark = spanWith attr mempty - transform <- parStyleToTransform pPr + transform <- parStyleToTransform pPr' return $ transform $ - para $ ils'' <> insertMark - | Just (TrackedChange Deletion _) <- pChange pPr + paraOrPlain $ ils'' <> insertMark + | Just (TrackedChange Deletion _) <- pChange pPr' , AcceptChanges <- readerTrackChanges opts -> do modify $ \s -> s {docxPrevPara = ils''} return mempty - | Just (TrackedChange Deletion _) <- pChange pPr + | Just (TrackedChange Deletion _) <- pChange pPr' , RejectChanges <- readerTrackChanges opts -> handleInsertion - | Just (TrackedChange Deletion cInfo) <- pChange pPr + | Just (TrackedChange Deletion cInfo) <- pChange pPr' , AllChanges <- readerTrackChanges opts , ChangeInfo _ cAuthor cDate <- cInfo -> do let attr = ("", ["paragraph-deletion"], [("author", cAuthor), ("date", cDate)]) insertMark = spanWith attr mempty - transform <- parStyleToTransform pPr + transform <- parStyleToTransform pPr' return $ transform $ - para $ ils'' <> insertMark + paraOrPlain $ ils'' <> insertMark | otherwise -> handleInsertion bodyPartToBlocks (ListItem pPr numId lvl (Just levelInfo) parparts) = do -- We check whether this current numId has previously been used, @@ -638,7 +644,7 @@ bodyPartToBlocks (ListItem pPr numId lvl (Just levelInfo) parparts) = do blks <- bodyPartToBlocks (Paragraph pPr parparts) return $ divWith ("", ["list-item"], kvs) blks bodyPartToBlocks (ListItem pPr _ _ _ parparts) = - let pPr' = pPr {pStyle = "ListParagraph": pStyle pPr} + let pPr' = pPr {pStyle = constructBogusParStyleData "list-paragraph": pStyle pPr} in bodyPartToBlocks $ Paragraph pPr' parparts bodyPartToBlocks (Tbl _ _ _ []) = diff --git a/src/Text/Pandoc/Readers/Docx/Lists.hs b/src/Text/Pandoc/Readers/Docx/Lists.hs index cc390f122..eb24640c5 100644 --- a/src/Text/Pandoc/Readers/Docx/Lists.hs +++ b/src/Text/Pandoc/Readers/Docx/Lists.hs @@ -1,4 +1,5 @@ {-# LANGUAGE NoImplicitPrelude #-} +{-# LANGUAGE OverloadedStrings #-} {- | Module : Text.Pandoc.Readers.Docx.Lists Copyright : Copyright (C) 2014-2019 Jesse Rosenthal @@ -14,13 +15,16 @@ Functions for converting flat docx paragraphs into nested lists. module Text.Pandoc.Readers.Docx.Lists ( blocksToBullets , blocksToDefinitions , listParagraphDivs + , listParagraphStyles ) where import Prelude import Data.List import Data.Maybe +import Data.String (fromString) import Text.Pandoc.Generic (bottomUp) import Text.Pandoc.JSON +import Text.Pandoc.Readers.Docx.Parse (ParaStyleName) import Text.Pandoc.Shared (trim, safeRead) isListItem :: Block -> Bool @@ -79,7 +83,10 @@ getListType b@(Div (_, _, kvs) _) | isListItem b = getListType _ = Nothing listParagraphDivs :: [String] -listParagraphDivs = ["ListParagraph"] +listParagraphDivs = ["list-paragraph"] + +listParagraphStyles :: [ParaStyleName] +listParagraphStyles = map fromString listParagraphDivs -- This is a first stab at going through and attaching meaning to list -- paragraphs, without an item marker, following a list item. We @@ -160,7 +167,7 @@ blocksToDefinitions' defAcc acc [] = reverse $ DefinitionList (reverse defAcc) : acc blocksToDefinitions' defAcc acc (Div (_, classes1, _) blks1 : Div (ident2, classes2, kvs2) blks2 : blks) - | "DefinitionTerm" `elem` classes1 && "Definition" `elem` classes2 = + | "Definition-Term" `elem` classes1 && "Definition" `elem` classes2 = let remainingAttr2 = (ident2, delete "Definition" classes2, kvs2) pair = if remainingAttr2 == ("", [], []) then (concatMap plainParaInlines blks1, [blks2]) else (concatMap plainParaInlines blks1, [[Div remainingAttr2 blks2]]) in @@ -169,12 +176,12 @@ blocksToDefinitions' ((defTerm, defItems):defs) acc (Div (ident2, classes2, kvs2) blks2 : blks) | "Definition" `elem` classes2 = let remainingAttr2 = (ident2, delete "Definition" classes2, kvs2) - defItems2 = case remainingAttr2 == ("", [], []) of - True -> blks2 - False -> [Div remainingAttr2 blks2] - defAcc' = case null defItems of - True -> (defTerm, [defItems2]) : defs - False -> (defTerm, init defItems ++ [last defItems ++ defItems2]) : defs + defItems2 = if remainingAttr2 == ("", [], []) + then blks2 + else [Div remainingAttr2 blks2] + defAcc' = if null defItems + then (defTerm, [defItems2]) : defs + else (defTerm, init defItems ++ [last defItems ++ defItems2]) : defs in blocksToDefinitions' defAcc' acc blks blocksToDefinitions' [] acc (b:blks) = @@ -198,7 +205,5 @@ removeListDivs' blk = [blk] removeListDivs :: [Block] -> [Block] removeListDivs = concatMap removeListDivs' - - blocksToDefinitions :: [Block] -> [Block] blocksToDefinitions = blocksToDefinitions' [] [] diff --git a/src/Text/Pandoc/Readers/Docx/Parse.hs b/src/Text/Pandoc/Readers/Docx/Parse.hs index 330c9208f..00c5fb0be 100644 --- a/src/Text/Pandoc/Readers/Docx/Parse.hs +++ b/src/Text/Pandoc/Readers/Docx/Parse.hs @@ -1,7 +1,11 @@ {-# LANGUAGE NoImplicitPrelude #-} {-# LANGUAGE FlexibleInstances #-} +{-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE PatternGuards #-} {-# LANGUAGE ViewPatterns #-} +{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE TypeFamilies #-} +{-# LANGUAGE GeneralizedNewtypeDeriving #-} {- | Module : Text.Pandoc.Readers.Docx.Parse Copyright : Copyright (C) 2014-2019 Jesse Rosenthal @@ -31,6 +35,8 @@ module Text.Pandoc.Readers.Docx.Parse ( Docx(..) , VertAlign(..) , ParIndentation(..) , ParagraphStyle(..) + , ParStyle + , CharStyle(cStyleData) , Row(..) , Cell(..) , TrackedChange(..) @@ -38,8 +44,17 @@ module Text.Pandoc.Readers.Docx.Parse ( Docx(..) , ChangeInfo(..) , FieldInfo(..) , Level(..) + , ParaStyleName + , CharStyleName + , FromStyleName(..) + , HasStyleName(..) + , HasParentStyle(..) , archiveToDocx , archiveToDocxWithWarnings + , getStyleNames + , pHeading + , constructBogusParStyleData + , leftBiasedMergeRunStyle ) where import Prelude import Codec.Archive.Zip @@ -49,10 +64,13 @@ import Control.Monad.Reader import Control.Monad.State.Strict import Data.Bits ((.|.)) import qualified Data.ByteString.Lazy as B -import Data.Char (chr, ord, readLitChar) +import Data.Char (chr, ord, readLitChar, toLower) import Data.List +import Data.Function (on) +import Data.String (IsString(..)) import qualified Data.Map as M import Data.Maybe +import Data.Coerce import System.FilePath import Text.Pandoc.Readers.Docx.Util import Text.Pandoc.Readers.Docx.Fields @@ -160,13 +178,9 @@ newtype Body = Body [BodyPart] type Media = [(FilePath, B.ByteString)] -type CharStyle = (String, RunStyle) +type CharStyleMap = M.Map CharStyleId CharStyle -type ParStyle = (String, ParStyleData) - -type CharStyleMap = M.Map String RunStyle - -type ParStyleMap = M.Map String ParStyleData +type ParStyleMap = M.Map ParaStyleId ParStyle data Numbering = Numbering NameSpaces [Numb] [AbstractNumb] deriving Show @@ -213,12 +227,9 @@ data ChangeInfo = ChangeInfo ChangeId Author ChangeDate data TrackedChange = TrackedChange ChangeType ChangeInfo deriving Show -data ParagraphStyle = ParagraphStyle { pStyle :: [String] +data ParagraphStyle = ParagraphStyle { pStyle :: [ParStyle] , indentation :: Maybe ParIndentation , dropCap :: Bool - , pHeading :: Maybe (String, Int) - , pNumInfo :: Maybe (String, String) - , pBlockQuote :: Maybe Bool , pChange :: Maybe TrackedChange } deriving Show @@ -227,9 +238,6 @@ defaultParagraphStyle :: ParagraphStyle defaultParagraphStyle = ParagraphStyle { pStyle = [] , indentation = Nothing , dropCap = False - , pHeading = Nothing - , pNumInfo = Nothing - , pBlockQuote = Nothing , pChange = Nothing } @@ -254,6 +262,49 @@ newtype Row = Row [Cell] newtype Cell = Cell [BodyPart] deriving Show +newtype CharStyleId = CharStyleId { fromCharStyleId :: String } + deriving (Show, Eq, Ord, FromStyleId) +newtype ParaStyleId = ParaStyleId { fromParaStyleId :: String } + deriving (Show, Eq, Ord, FromStyleId) + +newtype CharStyleName = CharStyleName { fromCharStyleName :: CIString } + deriving (Show, Eq, Ord, IsString, FromStyleName) +newtype ParaStyleName = ParaStyleName { fromParaStyleName :: CIString } + deriving (Show, Eq, Ord, IsString, FromStyleName) + +-- Case-insensitive comparisons +newtype CIString = CIString String deriving (Show, IsString, FromStyleName) + +class FromStyleName a where + fromStyleName :: a -> String + +instance FromStyleName String where + fromStyleName = id + +class FromStyleId a where + fromStyleId :: a -> String + +instance FromStyleId String where + fromStyleId = id + +instance Eq CIString where + (==) = (==) `on` map toLower . coerce + +instance Ord CIString where + compare = compare `on` map toLower . coerce + +leftBiasedMergeRunStyle :: RunStyle -> RunStyle -> RunStyle +leftBiasedMergeRunStyle a b = RunStyle + { isBold = isBold a <|> isBold b + , isItalic = isItalic a <|> isItalic b + , isSmallCaps = isSmallCaps a <|> isSmallCaps b + , isStrike = isStrike a <|> isStrike b + , isRTL = isRTL a <|> isRTL b + , rVertAlign = rVertAlign a <|> rVertAlign b + , rUnderline = rUnderline a <|> rUnderline b + , rParentStyle = rParentStyle a + } + -- (width, height) in EMUs type Extent = Maybe (Double, Double) @@ -285,21 +336,28 @@ data RunElem = TextRun String | LnBrk | Tab | SoftHyphen | NoBreakHyphen data VertAlign = BaseLn | SupScrpt | SubScrpt deriving Show -data RunStyle = RunStyle { isBold :: Maybe Bool - , isItalic :: Maybe Bool - , isSmallCaps :: Maybe Bool - , isStrike :: Maybe Bool - , isRTL :: Maybe Bool - , rVertAlign :: Maybe VertAlign - , rUnderline :: Maybe String - , rStyle :: Maybe CharStyle +data CharStyle = CharStyle { cStyleId :: CharStyleId + , cStyleName :: CharStyleName + , cStyleData :: RunStyle + } deriving (Show) + +data RunStyle = RunStyle { isBold :: Maybe Bool + , isItalic :: Maybe Bool + , isSmallCaps :: Maybe Bool + , isStrike :: Maybe Bool + , isRTL :: Maybe Bool + , rVertAlign :: Maybe VertAlign + , rUnderline :: Maybe String + , rParentStyle :: Maybe CharStyle } deriving Show -data ParStyleData = ParStyleData { headingLev :: Maybe (String, Int) - , isBlockQuote :: Maybe Bool - , numInfo :: Maybe (String, String) - , psStyle :: Maybe ParStyle} +data ParStyle = ParStyle { headingLev :: Maybe (ParaStyleName, Int) + , numInfo :: Maybe (String, String) + , psParentStyle :: Maybe ParStyle + , pStyleName :: ParaStyleName + , pStyleId :: ParaStyleId + } deriving Show defaultRunStyle :: RunStyle @@ -310,7 +368,7 @@ defaultRunStyle = RunStyle { isBold = Nothing , isRTL = Nothing , rVertAlign = Nothing , rUnderline = Nothing - , rStyle = Nothing + , rParentStyle = Nothing } type Target = String @@ -390,7 +448,10 @@ elemToBody ns element | isElem ns "w" "body" element = elemToBody _ _ = throwError WrongElem archiveToStyles :: Archive -> (CharStyleMap, ParStyleMap) -archiveToStyles zf = +archiveToStyles = archiveToStyles' getStyleId getStyleId +archiveToStyles' :: (Ord k1, Ord k2, ElemToStyle a1, ElemToStyle a2) => + (a1 -> k1) -> (a2 -> k2) -> Archive -> (M.Map k1 a1, M.Map k2 a2) +archiveToStyles' conv1 conv2 zf = let stylesElem = findEntryByPath "word/styles.xml" zf >>= (parseXMLDoc . UTF8.toStringLazy . fromEntry) in @@ -399,19 +460,17 @@ archiveToStyles zf = Just styElem -> let namespaces = elemToNameSpaces styElem in - ( M.fromList $ buildBasedOnList namespaces styElem - (Nothing :: Maybe CharStyle), - M.fromList $ buildBasedOnList namespaces styElem - (Nothing :: Maybe ParStyle) ) + ( M.fromList $ map (\r -> (conv1 r, r)) $ buildBasedOnList namespaces styElem Nothing, + M.fromList $ map (\p -> (conv2 p, p)) $ buildBasedOnList namespaces styElem Nothing) -isBasedOnStyle :: (ElemToStyle a) => NameSpaces -> Element -> Maybe a -> Bool +isBasedOnStyle :: (ElemToStyle a, FromStyleId (StyleId a)) => NameSpaces -> Element -> Maybe a -> Bool isBasedOnStyle ns element parentStyle | isElem ns "w" "style" element , Just styleType <- findAttrByName ns "w" "type" element , styleType == cStyleType parentStyle , Just basedOnVal <- findChildByName ns "w" "basedOn" element >>= findAttrByName ns "w" "val" - , Just ps <- parentStyle = basedOnVal == getStyleId ps + , Just ps <- parentStyle = basedOnVal == fromStyleId (getStyleId ps) | isElem ns "w" "style" element , Just styleType <- findAttrByName ns "w" "type" element , styleType == cStyleType parentStyle @@ -419,30 +478,70 @@ isBasedOnStyle ns element parentStyle , Nothing <- parentStyle = True | otherwise = False -class ElemToStyle a where +class HasStyleId a => ElemToStyle a where cStyleType :: Maybe a -> String elemToStyle :: NameSpaces -> Element -> Maybe a -> Maybe a - getStyleId :: a -> String + +class FromStyleId (StyleId a) => HasStyleId a where + type StyleId a + getStyleId :: a -> StyleId a + +class FromStyleName (StyleName a) => HasStyleName a where + type StyleName a + getStyleName :: a -> StyleName a + +class HasParentStyle a where + getParentStyle :: a -> Maybe a + +instance HasParentStyle CharStyle where + getParentStyle = rParentStyle . cStyleData + +instance HasParentStyle ParStyle where + getParentStyle = psParentStyle + +getStyleNames :: (Functor t, HasStyleName a) => t a -> t (StyleName a) +getStyleNames = fmap getStyleName + +constructBogusParStyleData :: ParaStyleName -> ParStyle +constructBogusParStyleData stName = ParStyle + { headingLev = Nothing + , numInfo = Nothing + , psParentStyle = Nothing + , pStyleName = stName + , pStyleId = ParaStyleId . filter (/=' ') . fromStyleName $ stName + } instance ElemToStyle CharStyle where cStyleType _ = "character" elemToStyle ns element parentStyle | isElem ns "w" "style" element - , Just "character" <- findAttrByName ns "w" "type" element - , Just styleId <- findAttrByName ns "w" "styleId" element = - Just (styleId, elemToRunStyle ns element parentStyle) + , Just "character" <- findAttrByName ns "w" "type" element = + elemToCharStyle ns element parentStyle | otherwise = Nothing - getStyleId s = fst s + +instance HasStyleId CharStyle where + type StyleId CharStyle = CharStyleId + getStyleId = cStyleId + +instance HasStyleName CharStyle where + type StyleName CharStyle = CharStyleName + getStyleName = cStyleName instance ElemToStyle ParStyle where cStyleType _ = "paragraph" elemToStyle ns element parentStyle | isElem ns "w" "style" element , Just "paragraph" <- findAttrByName ns "w" "type" element - , Just styleId <- findAttrByName ns "w" "styleId" element = - Just (styleId, elemToParStyleData ns element parentStyle) + = elemToParStyleData ns element parentStyle | otherwise = Nothing - getStyleId s = fst s + +instance HasStyleId ParStyle where + type StyleId ParStyle = ParaStyleId + getStyleId = pStyleId + +instance HasStyleName ParStyle where + type StyleName ParStyle = ParaStyleName + getStyleName = pStyleName getStyleChildren :: (ElemToStyle a) => NameSpaces -> Element -> Maybe a -> [a] getStyleChildren ns element parentStyle @@ -693,6 +792,12 @@ testBitMask bitMaskS n = stringToInteger :: String -> Maybe Integer stringToInteger s = listToMaybe $ map fst (reads s :: [(Integer, String)]) +pHeading :: ParagraphStyle -> Maybe (ParaStyleName, Int) +pHeading = getParStyleField headingLev . pStyle + +pNumInfo :: ParagraphStyle -> Maybe (String, String) +pNumInfo = getParStyleField numInfo . pStyle + elemToBodyPart :: NameSpaces -> Element -> D BodyPart elemToBodyPart ns element | isElem ns "w" "p" element @@ -1003,20 +1108,18 @@ elemToRun ns element return $ Run runStyle runElems elemToRun _ _ = throwError WrongElem -getParentStyleValue :: (ParStyleData -> Maybe a) -> ParStyleData -> Maybe a +getParentStyleValue :: (ParStyle -> Maybe a) -> ParStyle -> Maybe a getParentStyleValue field style | Just value <- field style = Just value - | Just parentStyle <- psStyle style - = getParentStyleValue field (snd parentStyle) + | Just parentStyle <- psParentStyle style + = getParentStyleValue field parentStyle getParentStyleValue _ _ = Nothing -getParStyleField :: (ParStyleData -> Maybe a) -> ParStyleMap -> [String] -> - Maybe a -getParStyleField field stylemap styles - | x <- mapMaybe (\x -> M.lookup x stylemap) styles - , (y:_) <- mapMaybe (getParentStyleValue field) x +getParStyleField :: (ParStyle -> Maybe a) -> [ParStyle] -> Maybe a +getParStyleField field styles + | (y:_) <- mapMaybe (getParentStyleValue field) styles = Just y -getParStyleField _ _ _ = Nothing +getParStyleField _ _ = Nothing getTrackedChange :: NameSpaces -> Element -> Maybe TrackedChange getTrackedChange ns element @@ -1038,10 +1141,10 @@ elemToParagraphStyle ns element sty | Just pPr <- findChildByName ns "w" "pPr" element = let style = mapMaybe - (findAttrByName ns "w" "val") + (fmap ParaStyleId . findAttrByName ns "w" "val") (findChildrenByName ns "w" "pStyle" pPr) in ParagraphStyle - {pStyle = style + {pStyle = mapMaybe (`M.lookup` sty) style , indentation = findChildByName ns "w" "ind" pPr >>= elemToParIndentation ns @@ -1053,9 +1156,6 @@ elemToParagraphStyle ns element sty Just "none" -> False Just _ -> True Nothing -> False - , pHeading = getParStyleField headingLev sty style - , pNumInfo = getParStyleField numInfo sty style - , pBlockQuote = getParStyleField isBlockQuote sty style , pChange = findChildByName ns "w" "rPr" pPr >>= filterChild (\e -> isElem ns "w" "ins" e || isElem ns "w" "moveTo" e || @@ -1085,16 +1185,20 @@ elemToRunStyleD :: NameSpaces -> Element -> D RunStyle elemToRunStyleD ns element | Just rPr <- findChildByName ns "w" "rPr" element = do charStyles <- asks envCharStyles - let parentSty = case + let parentSty = findChildByName ns "w" "rStyle" rPr >>= - findAttrByName ns "w" "val" - of - Just styName | Just style <- M.lookup styName charStyles -> - Just (styName, style) - _ -> Nothing + findAttrByName ns "w" "val" >>= + flip M.lookup charStyles . CharStyleId return $ elemToRunStyle ns element parentSty elemToRunStyleD _ _ = return defaultRunStyle +elemToCharStyle :: NameSpaces + -> Element -> Maybe CharStyle -> Maybe CharStyle +elemToCharStyle ns element parentStyle + = CharStyle <$> (CharStyleId <$> findAttrByName ns "w" "styleId" element) + <*> getElementStyleName ns element + <*> (Just $ elemToRunStyle ns element parentStyle) + elemToRunStyle :: NameSpaces -> Element -> Maybe CharStyle -> RunStyle elemToRunStyle ns element parentStyle | Just rPr <- findChildByName ns "w" "rPr" element = @@ -1117,38 +1221,23 @@ elemToRunStyle ns element parentStyle , rUnderline = findChildByName ns "w" "u" rPr >>= findAttrByName ns "w" "val" - , rStyle = parentStyle + , rParentStyle = parentStyle } elemToRunStyle _ _ _ = defaultRunStyle -getHeaderLevel :: NameSpaces -> Element -> Maybe (String,Int) +getHeaderLevel :: NameSpaces -> Element -> Maybe (ParaStyleName, Int) getHeaderLevel ns element - | Just styleId <- findAttrByName ns "w" "styleId" element - , Just index <- stripPrefix "Heading" styleId - , Just n <- stringToInteger index - , n > 0 = Just (styleId, fromInteger n) - | Just styleId <- findAttrByName ns "w" "styleId" element - , Just index <- findChildByName ns "w" "name" element >>= - findAttrByName ns "w" "val" >>= - stripPrefix "heading " - , Just n <- stringToInteger index - , n > 0 = Just (styleId, fromInteger n) + | Just styleName <- getElementStyleName ns element + , Just n <- stringToInteger =<< + (stripPrefix "heading " . map toLower $ + fromStyleName styleName) + , n > 0 = Just (styleName, fromInteger n) getHeaderLevel _ _ = Nothing -blockQuoteStyleIds :: [String] -blockQuoteStyleIds = ["Quote", "BlockQuote", "BlockQuotation"] - -blockQuoteStyleNames :: [String] -blockQuoteStyleNames = ["Quote", "Block Text"] - -getBlockQuote :: NameSpaces -> Element -> Maybe Bool -getBlockQuote ns element - | Just styleId <- findAttrByName ns "w" "styleId" element - , styleId `elem` blockQuoteStyleIds = Just True - | Just styleName <- findChildByName ns "w" "name" element >>= - findAttrByName ns "w" "val" - , styleName `elem` blockQuoteStyleNames = Just True -getBlockQuote _ _ = Nothing +getElementStyleName :: Coercible String a => NameSpaces -> Element -> Maybe a +getElementStyleName ns el = coerce <$> + ((findChildByName ns "w" "name" el >>= findAttrByName ns "w" "val") + <|> findAttrByName ns "w" "styleId" el) getNumInfo :: NameSpaces -> Element -> Maybe (String, String) getNumInfo ns element = do @@ -1163,15 +1252,19 @@ getNumInfo ns element = do return (numId, lvl) -elemToParStyleData :: NameSpaces -> Element -> Maybe ParStyle -> ParStyleData -elemToParStyleData ns element parentStyle = - ParStyleData +elemToParStyleData :: NameSpaces -> Element -> Maybe ParStyle -> Maybe ParStyle +elemToParStyleData ns element parentStyle + | Just styleId <- findAttrByName ns "w" "styleId" element + , Just styleName <- getElementStyleName ns element + = Just $ ParStyle { headingLev = getHeaderLevel ns element - , isBlockQuote = getBlockQuote ns element , numInfo = getNumInfo ns element - , psStyle = parentStyle - } + , psParentStyle = parentStyle + , pStyleName = styleName + , pStyleId = ParaStyleId styleId + } +elemToParStyleData _ _ _ = Nothing elemToRunElem :: NameSpaces -> Element -> D RunElem elemToRunElem ns element diff --git a/test/Tests/Readers/Docx.hs b/test/Tests/Readers/Docx.hs index 9d0913e55..583a6ec18 100644 --- a/test/Tests/Readers/Docx.hs +++ b/test/Tests/Readers/Docx.hs @@ -255,6 +255,10 @@ tests = [ testGroup "document" "lists" "docx/lists.docx" "docx/lists.native" + , testCompare + "compact lists" + "docx/lists-compact.docx" + "docx/lists-compact.native" , testCompare "lists with level overrides" "docx/lists_level_override.docx" @@ -425,6 +429,11 @@ tests = [ testGroup "document" "custom styles (`+styles`) enabled" "docx/custom-style-reference.docx" "docx/custom-style-with-styles.native" + , testCompareWithOpts + def{readerExtensions=extensionsFromList [Ext_styles]} + "custom styles (`+styles`): Compact style is removed from output" + "docx/compact-style-removal.docx" + "docx/compact-style-removal.native" ] , testGroup "metadata" [ testCompareWithOpts def{readerStandalone=True} diff --git a/test/docx/compact-style-removal.docx b/test/docx/compact-style-removal.docx new file mode 100644 index 000000000..fde0064db Binary files /dev/null and b/test/docx/compact-style-removal.docx differ diff --git a/test/docx/compact-style-removal.native b/test/docx/compact-style-removal.native new file mode 100644 index 000000000..340878ba0 --- /dev/null +++ b/test/docx/compact-style-removal.native @@ -0,0 +1,5 @@ +[OrderedList (1,Decimal,Period) + [[Plain [Str "One"]] + ,[Plain [Str "Two"]] + ,[Plain [Str "Three"]] + ,[Plain [Str "Four"]]]] diff --git a/test/docx/lists-compact.docx b/test/docx/lists-compact.docx new file mode 100644 index 000000000..d7f9e4a06 Binary files /dev/null and b/test/docx/lists-compact.docx differ diff --git a/test/docx/lists-compact.native b/test/docx/lists-compact.native new file mode 100644 index 000000000..340878ba0 --- /dev/null +++ b/test/docx/lists-compact.native @@ -0,0 +1,5 @@ +[OrderedList (1,Decimal,Period) + [[Plain [Str "One"]] + ,[Plain [Str "Two"]] + ,[Plain [Str "Three"]] + ,[Plain [Str "Four"]]]] -- cgit v1.2.3 From 4a5e389f218dc2679b8c3ab9b79ccca946731d22 Mon Sep 17 00:00:00 2001 From: Nikolay Yakimov Date: Sun, 8 Sep 2019 19:32:09 +0300 Subject: [Docx Writer] Code clean-up Reduce code duplication, remove redundant brackets --- src/Text/Pandoc/Writers/Docx.hs | 77 ++++++++++++++++++++--------------------- 1 file changed, 37 insertions(+), 40 deletions(-) (limited to 'src') diff --git a/src/Text/Pandoc/Writers/Docx.hs b/src/Text/Pandoc/Writers/Docx.hs index 02db23db5..62c7499e4 100644 --- a/src/Text/Pandoc/Writers/Docx.hs +++ b/src/Text/Pandoc/Writers/Docx.hs @@ -208,7 +208,7 @@ writeDocx opts doc@(Pandoc meta _) = do let doc' = walk fixDisplayMath doc username <- P.lookupEnv "USERNAME" utctime <- P.getCurrentTime - distArchive <- (toArchive . BL.fromStrict) <$> do + distArchive <- toArchive . BL.fromStrict <$> do oldUserDataDir <- P.getUserDataDir P.setUserDataDir Nothing res <- P.readDefaultDataFile "reference.docx" @@ -216,7 +216,7 @@ writeDocx opts doc@(Pandoc meta _) = do return res refArchive <- case writerReferenceDoc opts of Just f -> toArchive <$> P.readFileLazy f - Nothing -> (toArchive . BL.fromStrict) <$> + Nothing -> toArchive . BL.fromStrict <$> P.readDataFile "reference.docx" parsedDoc <- parseXml refArchive distArchive "word/document.xml" @@ -237,7 +237,7 @@ writeDocx opts doc@(Pandoc meta _) = do >>= subtrct mbAttrMarRight >>= subtrct mbAttrMarLeft where - subtrct mbStr = \x -> mbStr >>= safeRead >>= (\y -> Just $ x - y) + subtrct mbStr x = mbStr >>= safeRead >>= (\y -> Just $ x - y) -- styles mblang <- toLang $ getLang opts meta @@ -285,7 +285,7 @@ writeDocx opts doc@(Pandoc meta _) = do envRTL = isRTLmeta , envChangesAuthor = fromMaybe "unknown" username , envChangesDate = formatTime defaultTimeLocale "%FT%XZ" utctime - , envPrintWidth = maybe 420 (\x -> quot x 20) pgContentWidth + , envPrintWidth = maybe 420 (`quot` 20) pgContentWidth } @@ -366,7 +366,7 @@ writeDocx opts doc@(Pandoc meta _) = do map (\x -> (maybe "" ("/word/" ++) $ extractTarget x, "application/vnd.openxmlformats-officedocument.wordprocessingml.footer+xml")) footers) ++ map mkImageOverride imgs ++ - map mkMediaOverride [ eRelativePath e | e <- zEntries refArchive + [ mkMediaOverride (eRelativePath e) | e <- zEntries refArchive , "word/media/" `isPrefixOf` eRelativePath e ] let defaultnodes = [mknode "Default" @@ -589,8 +589,8 @@ writeDocx opts doc@(Pandoc meta _) = do mapMaybe (fmap ("word/" ++) . extractTarget) (headers ++ footers) let miscRelEntries = [ e | e <- zEntries refArchive - , "word/_rels/" `isPrefixOf` (eRelativePath e) - , ".xml.rels" `isSuffixOf` (eRelativePath e) + , "word/_rels/" `isPrefixOf` eRelativePath e + , ".xml.rels" `isSuffixOf` eRelativePath e , eRelativePath e /= "word/_rels/document.xml.rels" , eRelativePath e /= "word/_rels/footnotes.xml.rels" ] let otherMediaEntries = [ e | e <- zEntries refArchive @@ -778,24 +778,24 @@ makeTOC opts = do tocTitle <- gets stTocTitle title <- withParaPropM (pStyleM "TOC Heading") (blocksToOpenXML opts [Para tocTitle]) return - [mknode "w:sdt" [] ([ + [mknode "w:sdt" [] [ mknode "w:sdtPr" [] ( - mknode "w:docPartObj" [] ( + mknode "w:docPartObj" [] [mknode "w:docPartGallery" [("w:val","Table of Contents")] (), mknode "w:docPartUnique" [] ()] - ) -- w:docPartObj + -- w:docPartObj ), -- w:sdtPr mknode "w:sdtContent" [] (title++[ mknode "w:p" [] ( - mknode "w:r" [] ([ + mknode "w:r" [] [ mknode "w:fldChar" [("w:fldCharType","begin"),("w:dirty","true")] (), mknode "w:instrText" [("xml:space","preserve")] tocCmd, mknode "w:fldChar" [("w:fldCharType","separate")] (), mknode "w:fldChar" [("w:fldCharType","end")] () - ]) -- w:r + ] -- w:r ) -- w:p ]) - ])] -- w:sdt + ]] -- w:sdt -- | Convert Pandoc document to two lists of -- OpenXML elements (the main document and footnotes). @@ -1030,20 +1030,17 @@ blockToOpenXML' opts (Table caption aligns widths headers rows) = do : [ mkrow True headers' | hasHeader ] ++ map (mkrow False) rows' )] -blockToOpenXML' opts (BulletList lst) = do - let marker = BulletMarker - addList marker - numid <- getNumId - l <- asList $ concat `fmap` mapM (listItemToOpenXML opts numid) lst - setFirstPara - return l -blockToOpenXML' opts (OrderedList (start, numstyle, numdelim) lst) = do - let marker = NumberMarker numstyle numdelim start - addList marker - numid <- getNumId - l <- asList $ concat `fmap` mapM (listItemToOpenXML opts numid) lst - setFirstPara - return l +blockToOpenXML' opts el + | BulletList lst <- el = addOpenXMLList BulletMarker lst + | OrderedList (start, numstyle, numdelim) lst <- el + = addOpenXMLList (NumberMarker numstyle numdelim start) lst + where + addOpenXMLList marker lst = do + addList marker + numid <- getNumId + l <- asList $ concat `fmap` mapM (listItemToOpenXML opts numid) lst + setFirstPara + return l blockToOpenXML' opts (DefinitionList items) = do l <- concat `fmap` mapM (definitionListItemToOpenXML opts) items setFirstPara @@ -1159,7 +1156,7 @@ inlineToOpenXML' _ (Str str) = formattedString str inlineToOpenXML' opts Space = inlineToOpenXML opts (Str " ") inlineToOpenXML' opts SoftBreak = inlineToOpenXML opts (Str " ") -inlineToOpenXML' opts (Span (_,["underline"],_) ils) = do +inlineToOpenXML' opts (Span (_,["underline"],_) ils) = withTextProp (mknode "w:u" [("w:val","single")] ()) $ inlinesToOpenXML opts ils inlineToOpenXML' _ (Span (ident,["comment-start"],kvs) ils) = do @@ -1192,18 +1189,21 @@ inlineToOpenXML' opts (Span (ident,classes,kvs) ils) = do Just "rtl" -> local (\env -> env { envRTL = True }) Just "ltr" -> local (\env -> env { envRTL = False }) _ -> id - let off x = withTextProp (mknode x [("w:val","0")] ()) - let pmod = (if "csl-no-emph" `elem` classes then off "w:i" else id) . + off x = withTextProp (mknode x [("w:val","0")] ()) + pmod = (if "csl-no-emph" `elem` classes then off "w:i" else id) . (if "csl-no-strong" `elem` classes then off "w:b" else id) . (if "csl-no-smallcaps" `elem` classes then off "w:smallCaps" else id) + getChangeAuthorDate = do + defaultAuthor <- asks envChangesAuthor + defaultDate <- asks envChangesDate + let author = fromMaybe defaultAuthor (lookup "author" kvs) + date = fromMaybe defaultDate (lookup "date" kvs) + return (author, date) insmod <- if "insertion" `elem` classes then do - defaultAuthor <- asks envChangesAuthor - defaultDate <- asks envChangesDate - let author = fromMaybe defaultAuthor (lookup "author" kvs) - date = fromMaybe defaultDate (lookup "date" kvs) + (author, date) <- getChangeAuthorDate insId <- gets stInsId modify $ \s -> s{stInsId = insId + 1} return $ \f -> do @@ -1215,10 +1215,7 @@ inlineToOpenXML' opts (Span (ident,classes,kvs) ils) = do else return id delmod <- if "deletion" `elem` classes then do - defaultAuthor <- asks envChangesAuthor - defaultDate <- asks envChangesDate - let author = fromMaybe defaultAuthor (lookup "author" kvs) - date = fromMaybe defaultDate (lookup "date" kvs) + (author, date) <- getChangeAuthorDate delId <- gets stDelId modify $ \s -> s{stDelId = delId + 1} return $ \f -> local (\env->env{envInDel=True}) $ do @@ -1431,12 +1428,12 @@ defaultFootnotes :: [Element] defaultFootnotes = [ mknode "w:footnote" [("w:type", "separator"), ("w:id", "-1")] [ mknode "w:p" [] - [mknode "w:r" [] $ + [mknode "w:r" [] [ mknode "w:separator" [] ()]]] , mknode "w:footnote" [("w:type", "continuationSeparator"), ("w:id", "0")] [ mknode "w:p" [] - [ mknode "w:r" [] $ + [ mknode "w:r" [] [ mknode "w:continuationSeparator" [] ()]]]] -- cgit v1.2.3 From 14b00517ae6df7d4bb24b418530c8b57182c787c Mon Sep 17 00:00:00 2001 From: Nikolay Yakimov Date: Sat, 14 Sep 2019 17:23:26 +0300 Subject: [Docx Writer] Consistently use style names, not style ids Styles that this change affects: paragraph styles: Author, Abstract, Compact, Figure, Captioned Figure, Image Caption, First Paragraph, Source Code, Table Caption, Definition, Definition Term; character styles: Verbatim Char, token styles (those with names ending in Tok) --- src/Text/Pandoc/Writers/Docx.hs | 52 ++++++++++++++++++++--------------------- 1 file changed, 25 insertions(+), 27 deletions(-) (limited to 'src') diff --git a/src/Text/Pandoc/Writers/Docx.hs b/src/Text/Pandoc/Writers/Docx.hs index 62c7499e4..d62dbeedb 100644 --- a/src/Text/Pandoc/Writers/Docx.hs +++ b/src/Text/Pandoc/Writers/Docx.hs @@ -780,7 +780,7 @@ makeTOC opts = do return [mknode "w:sdt" [] [ mknode "w:sdtPr" [] ( - mknode "w:docPartObj" [] + mknode "w:docPartObj" [] [mknode "w:docPartGallery" [("w:val","Table of Contents")] (), mknode "w:docPartUnique" [] ()] -- w:docPartObj @@ -809,12 +809,12 @@ writeOpenXML opts (Pandoc meta blocks) = do let includeTOC = writerTableOfContents opts || lookupMetaBool "toc" meta title <- withParaPropM (pStyleM "Title") $ blocksToOpenXML opts [Para tit | not (null tit)] subtitle <- withParaPropM (pStyleM "Subtitle") $ blocksToOpenXML opts [Para subtitle' | not (null subtitle')] - authors <- withParaProp (pCustomStyle "Author") $ blocksToOpenXML opts $ + authors <- withParaPropM (pStyleM "Author") $ blocksToOpenXML opts $ map Para auths date <- withParaPropM (pStyleM "Date") $ blocksToOpenXML opts [Para dat | not (null dat)] abstract <- if null abstract' then return [] - else withParaProp (pCustomStyle "Abstract") $ blocksToOpenXML opts abstract' + else withParaPropM (pStyleM "Abstract") $ blocksToOpenXML opts abstract' let convertSpace (Str x : Space : Str y : xs) = Str (x ++ " " ++ y) : xs convertSpace (Str x : Str y : xs) = Str (x ++ y) : xs convertSpace xs = xs @@ -848,18 +848,12 @@ writeOpenXML opts (Pandoc meta blocks) = do blocksToOpenXML :: (PandocMonad m) => WriterOptions -> [Block] -> WS m [Element] blocksToOpenXML opts bls = concat `fmap` mapM (blockToOpenXML opts) bls -pCustomStyle :: String -> Element -pCustomStyle sty = mknode "w:pStyle" [("w:val",sty)] () - pStyleM :: (PandocMonad m) => String -> WS m XML.Element pStyleM styleName = do styleMaps <- gets stStyleMaps let sty' = getStyleId styleName $ sParaStyleMap styleMaps return $ mknode "w:pStyle" [("w:val",sty')] () -rCustomStyle :: String -> Element -rCustomStyle sty = mknode "w:rStyle" [("w:val",sty)] () - rStyleM :: (PandocMonad m) => String -> WS m XML.Element rStyleM styleName = do styleMaps <- gets stStyleMaps @@ -921,19 +915,19 @@ blockToOpenXML' opts (Header lev (ident,_,_) lst) = do blockToOpenXML' opts (Plain lst) = do isInTable <- gets stInTable let block = blockToOpenXML opts (Para lst) - para <- if isInTable then withParaProp (pCustomStyle "Compact") block else block + prop <- pStyleM "Compact" + para <- if isInTable then withParaProp prop block else block return $ para - -- title beginning with fig: indicates that the image is a figure blockToOpenXML' opts (Para [Image attr alt (src,'f':'i':'g':':':tit)]) = do setFirstPara - let prop = pCustomStyle $ + prop <- pStyleM $ if null alt then "Figure" - else "CaptionedFigure" + else "Captioned Figure" paraProps <- local (\env -> env { envParaProperties = EnvProps (Just prop) [] <> envParaProperties env }) (getParaProps False) contents <- inlinesToOpenXML opts [Image attr alt (src,tit)] - captionNode <- withParaProp (pCustomStyle "ImageCaption") + captionNode <- withParaPropM (pStyleM "Image Caption") $ blockToOpenXML opts (Para alt) return $ mknode "w:p" [] (paraProps ++ contents) : captionNode blockToOpenXML' opts (Para lst) @@ -944,10 +938,10 @@ blockToOpenXML' opts (Para lst) [x] -> isDisplayMath x _ -> False paraProps <- getParaProps displayMathPara - bodyTextStyle <- pStyleM "Body Text" + bodyTextStyle <- if isFirstPara + then pStyleM "First Paragraph" + else pStyleM "Body Text" let paraProps' = case paraProps of - [] | isFirstPara -> [mknode "w:pPr" [] - [pCustomStyle "FirstParagraph"]] [] -> [mknode "w:pPr" [] [bodyTextStyle]] ps -> ps modify $ \s -> s { stFirstPara = False } @@ -965,7 +959,7 @@ blockToOpenXML' opts (BlockQuote blocks) = do setFirstPara return p blockToOpenXML' opts (CodeBlock attrs@(ident, _, _) str) = do - p <- withParaProp (pCustomStyle "SourceCode") (blockToOpenXML opts $ Para [Code attrs str]) + p <- withParaPropM (pStyleM "Source Code") (blockToOpenXML opts $ Para [Code attrs str]) setFirstPara wrapBookmark ident p blockToOpenXML' _ HorizontalRule = do @@ -981,7 +975,7 @@ blockToOpenXML' opts (Table caption aligns widths headers rows) = do let captionStr = stringify caption caption' <- if null caption then return [] - else withParaProp (pCustomStyle "TableCaption") + else withParaPropM (pStyleM "Table Caption") $ blockToOpenXML opts (Para caption) let alignmentFor al = mknode "w:jc" [("w:val",alignmentToString al)] () -- Table cells require a element, even an empty one! @@ -997,7 +991,8 @@ blockToOpenXML' opts (Table caption aligns widths headers rows) = do [ mknode "w:tcBorders" [] $ mknode "w:bottom" [("w:val","single")] () , mknode "w:vAlign" [("w:val","bottom")] () ] - let emptyCell = [mknode "w:p" [] [mknode "w:pPr" [] [pCustomStyle "Compact"]]] + compactStyle <- pStyleM "Compact" + let emptyCell = [mknode "w:p" [] [mknode "w:pPr" [] [compactStyle]]] let mkcell border contents = mknode "w:tc" [] $ [ borderProps | border ] ++ if null contents @@ -1048,9 +1043,9 @@ blockToOpenXML' opts (DefinitionList items) = do definitionListItemToOpenXML :: (PandocMonad m) => WriterOptions -> ([Inline],[[Block]]) -> WS m [Element] definitionListItemToOpenXML opts (term,defs) = do - term' <- withParaProp (pCustomStyle "DefinitionTerm") + term' <- withParaPropM (pStyleM "Definition Term") $ blockToOpenXML opts (Para term) - defs' <- withParaProp (pCustomStyle "Definition") + defs' <- withParaPropM (pStyleM "Definition") $ concat `fmap` mapM (blocksToOpenXML opts) defs return $ term' ++ defs' @@ -1263,14 +1258,17 @@ inlineToOpenXML' opts (Math mathType str) = do Left il -> inlineToOpenXML' opts il inlineToOpenXML' opts (Cite _ lst) = inlinesToOpenXML opts lst inlineToOpenXML' opts (Code attrs str) = do + let alltoktypes = [KeywordTok ..] + tokTypesMap <- mapM (\tt -> (,) tt <$> rStyleM (show tt)) alltoktypes let unhighlighted = intercalate [br] `fmap` mapM formattedString (lines str) formatOpenXML _fmtOpts = intercalate [br] . map (map toHlTok) - toHlTok (toktype,tok) = mknode "w:r" [] - [ mknode "w:rPr" [] - [ rCustomStyle (show toktype) ] - , mknode "w:t" [("xml:space","preserve")] (T.unpack tok) ] - withTextProp (rCustomStyle "VerbatimChar") + toHlTok (toktype,tok) = + mknode "w:r" [] + [ mknode "w:rPr" [] $ + maybeToList (lookup toktype tokTypesMap) + , mknode "w:t" [("xml:space","preserve")] (T.unpack tok) ] + withTextPropM (rStyleM "Verbatim Char") $ if isNothing (writerHighlightStyle opts) then unhighlighted else case highlight (writerSyntaxMap opts) -- cgit v1.2.3