diff options
author | Nikolay Yakimov <root@livid.pp.ru> | 2019-09-15 01:40:23 +0300 |
---|---|---|
committer | John MacFarlane <jgm@berkeley.edu> | 2019-09-21 11:18:15 -0700 |
commit | c113ca6717d00870ec10716897d76a6fa62b1d41 (patch) | |
tree | 3855697f666c24b7a11b49a9bc5429984e86b270 /src/Text/Pandoc | |
parent | fd14ad52618c98928ab83aa43689158cc788c4a8 (diff) | |
download | pandoc-c113ca6717d00870ec10716897d76a6fa62b1d41.tar.gz |
[Docx Reader] Use style names, not ids, for assigning semantic meaning
Motivating issues: #5523, #5052, #5074
Style name comparisons are case-insensitive, since those are
case-insensitive in Word.
w:styleId will be used as style name if w:name is missing (this should
only happen for malformed docx and is kept as a fallback to avoid
failing altogether on malformed documents)
Block quote detection code moved from Docx.Parser to Readers.Docx
Code styles, i.e. "Source Code" and "Verbatim Char" now honor style
inheritance
Docx Reader now honours "Compact" style (used in Pandoc-generated docx).
The side-effect is that "Compact" style no longer shows up in
docx+styles output. Styles inherited from "Compact" will still
show up.
Removed obsolete list-item style from divsToKeep. That didn't
really do anything for a while now.
Add newtypes to differentiate between style names, ids, and
different style types (that is, paragraph and character styles)
Since docx style names can have spaces in them, and pandoc-markdown
classes can't, anywhere when style name is used as a class name,
spaces are replaced with ASCII dashes `-`.
Get rid of extraneous intermediate types, carrying styleId information.
Instead, styleId is saved with other style data.
Use RunStyle for inline style definitions only (lacking styleId and styleName);
for Character Styles use CharStyle type (which is basicaly RunStyle with styleId
and StyleName bolted onto it).
Diffstat (limited to 'src/Text/Pandoc')
-rw-r--r-- | src/Text/Pandoc/Readers/Docx.hs | 162 | ||||
-rw-r--r-- | src/Text/Pandoc/Readers/Docx/Lists.hs | 25 | ||||
-rw-r--r-- | src/Text/Pandoc/Readers/Docx/Parse.hs | 283 |
3 files changed, 287 insertions, 183 deletions
diff --git a/src/Text/Pandoc/Readers/Docx.hs b/src/Text/Pandoc/Readers/Docx.hs index a26986af2..9d17ab118 100644 --- a/src/Text/Pandoc/Readers/Docx.hs +++ b/src/Text/Pandoc/Readers/Docx.hs @@ -3,6 +3,7 @@ {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE PatternGuards #-} {-# LANGUAGE MultiWayIf #-} +{-# LANGUAGE FlexibleContexts #-} {- | Module : Text.Pandoc.Readers.Docx Copyright : Copyright (C) 2014-2019 Jesse Rosenthal @@ -65,6 +66,7 @@ import Control.Monad.State.Strict import qualified Data.ByteString.Lazy as B import Data.Default (Default) import Data.List (delete, intersect) +import Data.Char (isSpace) import qualified Data.Map as M import Data.Maybe (isJust, fromMaybe) import Data.Sequence (ViewL (..), viewl) @@ -133,13 +135,13 @@ evalDocxContext :: PandocMonad m => DocxContext m a -> DEnv -> DState -> m a evalDocxContext ctx env st = flip evalStateT st $ runReaderT ctx env -- This is empty, but we put it in for future-proofing. -spansToKeep :: [String] +spansToKeep :: [CharStyleName] spansToKeep = [] -divsToKeep :: [String] -divsToKeep = ["list-item", "Definition", "DefinitionTerm"] +divsToKeep :: [ParaStyleName] +divsToKeep = ["Definition", "Definition Term"] -metaStyles :: M.Map String String +metaStyles :: M.Map ParaStyleName String metaStyles = M.fromList [ ("Title", "title") , ("Subtitle", "subtitle") , ("Author", "author") @@ -151,7 +153,7 @@ sepBodyParts = span (\bp -> isMetaPar bp || isEmptyPar bp) isMetaPar :: BodyPart -> Bool isMetaPar (Paragraph pPr _) = - not $ null $ intersect (pStyle pPr) (M.keys metaStyles) + not $ null $ intersect (getStyleNames $ pStyle pPr) (M.keys metaStyles) isMetaPar _ = False isEmptyPar :: BodyPart -> Bool @@ -168,7 +170,7 @@ bodyPartsToMeta' :: PandocMonad m => [BodyPart] -> DocxContext m (M.Map String M bodyPartsToMeta' [] = return M.empty bodyPartsToMeta' (bp : bps) | (Paragraph pPr parParts) <- bp - , (c : _)<- (pStyle pPr) `intersect` (M.keys metaStyles) + , (c : _)<- getStyleNames (pStyle pPr) `intersect` M.keys metaStyles , (Just metaField) <- M.lookup c metaStyles = do inlines <- smushInlines <$> mapM parPartToInlines parParts remaining <- bodyPartsToMeta' bps @@ -198,11 +200,29 @@ fixAuthors (MetaBlocks blks) = g _ = MetaInlines [] fixAuthors mv = mv -codeStyles :: [String] -codeStyles = ["VerbatimChar"] +isInheritedFromStyles :: (Eq (StyleName s), HasStyleName s, HasParentStyle s) => [StyleName s] -> s -> Bool +isInheritedFromStyles names sty + | getStyleName sty `elem` names = True + | Just psty <- getParentStyle sty = isInheritedFromStyles names psty + | otherwise = False -codeDivs :: [String] -codeDivs = ["SourceCode"] +hasStylesInheritedFrom :: [ParaStyleName] -> ParagraphStyle -> Bool +hasStylesInheritedFrom ns s = any (isInheritedFromStyles ns) $ pStyle s + +removeStyleNamed :: ParaStyleName -> ParagraphStyle -> ParagraphStyle +removeStyleNamed sn ps = ps{pStyle = filter (\psd -> getStyleName psd /= sn) $ pStyle ps} + +isCodeCharStyle :: CharStyle -> Bool +isCodeCharStyle = isInheritedFromStyles ["Verbatim Char"] + +isCodeDiv :: ParagraphStyle -> Bool +isCodeDiv = hasStylesInheritedFrom ["Source Code"] + +isBlockQuote :: ParStyle -> Bool +isBlockQuote = + isInheritedFromStyles [ + "Quote", "Block Text", "Block Quote", "Block Quotation" + ] runElemToInlines :: RunElem -> Inlines runElemToInlines (TextRun s) = text s @@ -228,57 +248,31 @@ parPartToString (InternalHyperLink _ runs) = concatMap runToString runs parPartToString (ExternalHyperLink _ runs) = concatMap runToString runs parPartToString _ = "" -blacklistedCharStyles :: [String] +blacklistedCharStyles :: [CharStyleName] blacklistedCharStyles = ["Hyperlink"] resolveDependentRunStyle :: PandocMonad m => RunStyle -> DocxContext m RunStyle resolveDependentRunStyle rPr - | Just (s, _) <- rStyle rPr, s `elem` blacklistedCharStyles = + | Just s <- rParentStyle rPr + , getStyleName s `elem` blacklistedCharStyles = return rPr - | Just (_, cs) <- rStyle rPr = do + | Just s <- rParentStyle rPr = do opts <- asks docxOptions if isEnabled Ext_styles opts then return rPr - else do rPr' <- resolveDependentRunStyle cs - return $ - RunStyle { isBold = case isBold rPr of - Just bool -> Just bool - Nothing -> isBold rPr' - , isItalic = case isItalic rPr of - Just bool -> Just bool - Nothing -> isItalic rPr' - , isSmallCaps = case isSmallCaps rPr of - Just bool -> Just bool - Nothing -> isSmallCaps rPr' - , isStrike = case isStrike rPr of - Just bool -> Just bool - Nothing -> isStrike rPr' - , isRTL = case isRTL rPr of - Just bool -> Just bool - Nothing -> isRTL rPr' - , rVertAlign = case rVertAlign rPr of - Just valign -> Just valign - Nothing -> rVertAlign rPr' - , rUnderline = case rUnderline rPr of - Just ulstyle -> Just ulstyle - Nothing -> rUnderline rPr' - , rStyle = rStyle rPr - } + else leftBiasedMergeRunStyle rPr <$> resolveDependentRunStyle (cStyleData s) | otherwise = return rPr runStyleToTransform :: PandocMonad m => RunStyle -> DocxContext m (Inlines -> Inlines) runStyleToTransform rPr - | Just (s, _) <- rStyle rPr - , s `elem` spansToKeep = do - transform <- runStyleToTransform rPr{rStyle = Nothing} - return $ spanWith ("", [s], []) . transform - | Just (s, _) <- rStyle rPr = do - opts <- asks docxOptions - let extraInfo = if isEnabled Ext_styles opts - then spanWith ("", [], [("custom-style", s)]) - else id - transform <- runStyleToTransform rPr{rStyle = Nothing} - return $ extraInfo . transform + | Just sn <- getStyleName <$> rParentStyle rPr + , sn `elem` spansToKeep = do + transform <- runStyleToTransform rPr{rParentStyle = Nothing} + return $ spanWith ("", [normalizeToClassName sn], []) . transform + | Just s <- rParentStyle rPr = do + ei <- extraInfo spanWith s + transform <- runStyleToTransform rPr{rParentStyle = Nothing} + return $ ei . transform | Just True <- isItalic rPr = do transform <- runStyleToTransform rPr{isItalic = Nothing} return $ emph . transform @@ -310,8 +304,7 @@ runStyleToTransform rPr runToInlines :: PandocMonad m => Run -> DocxContext m Inlines runToInlines (Run rs runElems) - | Just (s, _) <- rStyle rs - , s `elem` codeStyles = do + | maybe False isCodeCharStyle $ rParentStyle rs = do rPr <- resolveDependentRunStyle rs let codeString = code $ concatMap runElemToString runElems return $ case rVertAlign rPr of @@ -526,39 +519,49 @@ trimSps (Many ils) = Many $ Seq.dropWhileL isSp $Seq.dropWhileR isSp ils isSp LineBreak = True isSp _ = False +extraInfo :: (Eq (StyleName a), PandocMonad m, HasStyleName a) + => (Attr -> i -> i) -> a -> DocxContext m (i -> i) +extraInfo f s = do + opts <- asks docxOptions + return $ if | isEnabled Ext_styles opts + -> f ("", [], [("custom-style", fromStyleName $ getStyleName s)]) + | otherwise -> id + parStyleToTransform :: PandocMonad m => ParagraphStyle -> DocxContext m (Blocks -> Blocks) parStyleToTransform pPr | (c:cs) <- pStyle pPr - , c `elem` divsToKeep = do + , getStyleName c `elem` divsToKeep = do let pPr' = pPr { pStyle = cs } transform <- parStyleToTransform pPr' - return $ divWith ("", [c], []) . transform + return $ divWith ("", [normalizeToClassName $ getStyleName c], []) . transform | (c:cs) <- pStyle pPr, - c `elem` listParagraphDivs = do + getStyleName c `elem` listParagraphStyles = do let pPr' = pPr { pStyle = cs, indentation = Nothing} transform <- parStyleToTransform pPr' - return $ divWith ("", [c], []) . transform + return $ divWith ("", [normalizeToClassName $ getStyleName c], []) . transform | (c:cs) <- pStyle pPr = do - opts <- asks docxOptions - let pPr' = pPr { pStyle = cs} + let pPr' = pPr { pStyle = cs } transform <- parStyleToTransform pPr' - let extraInfo = if isEnabled Ext_styles opts - then divWith ("", [], [("custom-style", c)]) - else id - return $ extraInfo . (if fromMaybe False (pBlockQuote pPr) then blockQuote else id) . transform + ei <- extraInfo divWith c + return $ ei . (if isBlockQuote c then blockQuote else id) . transform | null (pStyle pPr) , Just left <- indentation pPr >>= leftParIndent = do let pPr' = pPr { indentation = Nothing } hang = fromMaybe 0 $ indentation pPr >>= hangingParIndent transform <- parStyleToTransform pPr' - return $ if (left - hang) > 0 + return $ if (left - hang) > 0 then blockQuote . transform else transform parStyleToTransform _ = return id +normalizeToClassName :: (FromStyleName a) => a -> String +normalizeToClassName = map go . fromStyleName + where go c | isSpace c = '-' + | otherwise = c + bodyPartToBlocks :: PandocMonad m => BodyPart -> DocxContext m Blocks bodyPartToBlocks (Paragraph pPr parparts) - | not $ null $ codeDivs `intersect` (pStyle pPr) = do + | isCodeDiv pPr = do transform <- parStyleToTransform pPr return $ transform $ @@ -568,13 +571,16 @@ bodyPartToBlocks (Paragraph pPr parparts) ils <-local (\s-> s{docxInHeaderBlock=True}) (smushInlines <$> mapM parPartToInlines parparts) makeHeaderAnchor $ - headerWith ("", delete style (pStyle pPr), []) n ils + headerWith ("", map normalizeToClassName . delete style $ getStyleNames (pStyle pPr), []) n ils | otherwise = do ils <- trimSps . smushInlines <$> mapM parPartToInlines parparts prevParaIls <- gets docxPrevPara dropIls <- gets docxDropCap let ils' = dropIls <> ils - if dropCap pPr + let (paraOrPlain, pPr') + | hasStylesInheritedFrom ["Compact"] pPr = (plain, removeStyleNamed "Compact" pPr) + | otherwise = (para, pPr) + if dropCap pPr' then do modify $ \s -> s { docxDropCap = ils' } return mempty else do modify $ \s -> s { docxDropCap = mempty } @@ -583,41 +589,41 @@ bodyPartToBlocks (Paragraph pPr parparts) ils' handleInsertion = do modify $ \s -> s {docxPrevPara = mempty} - transform <- parStyleToTransform pPr - return $ transform $ para ils'' + transform <- parStyleToTransform pPr' + return $ transform $ paraOrPlain ils'' opts <- asks docxOptions if | isNull ils'' && not (isEnabled Ext_empty_paragraphs opts) -> return mempty - | Just (TrackedChange Insertion _) <- pChange pPr + | Just (TrackedChange Insertion _) <- pChange pPr' , AcceptChanges <- readerTrackChanges opts -> handleInsertion - | Just (TrackedChange Insertion _) <- pChange pPr + | Just (TrackedChange Insertion _) <- pChange pPr' , RejectChanges <- readerTrackChanges opts -> do modify $ \s -> s {docxPrevPara = ils''} return mempty - | Just (TrackedChange Insertion cInfo) <- pChange pPr + | Just (TrackedChange Insertion cInfo) <- pChange pPr' , AllChanges <- readerTrackChanges opts , ChangeInfo _ cAuthor cDate <- cInfo -> do let attr = ("", ["paragraph-insertion"], [("author", cAuthor), ("date", cDate)]) insertMark = spanWith attr mempty - transform <- parStyleToTransform pPr + transform <- parStyleToTransform pPr' return $ transform $ - para $ ils'' <> insertMark - | Just (TrackedChange Deletion _) <- pChange pPr + paraOrPlain $ ils'' <> insertMark + | Just (TrackedChange Deletion _) <- pChange pPr' , AcceptChanges <- readerTrackChanges opts -> do modify $ \s -> s {docxPrevPara = ils''} return mempty - | Just (TrackedChange Deletion _) <- pChange pPr + | Just (TrackedChange Deletion _) <- pChange pPr' , RejectChanges <- readerTrackChanges opts -> handleInsertion - | Just (TrackedChange Deletion cInfo) <- pChange pPr + | Just (TrackedChange Deletion cInfo) <- pChange pPr' , AllChanges <- readerTrackChanges opts , ChangeInfo _ cAuthor cDate <- cInfo -> do let attr = ("", ["paragraph-deletion"], [("author", cAuthor), ("date", cDate)]) insertMark = spanWith attr mempty - transform <- parStyleToTransform pPr + transform <- parStyleToTransform pPr' return $ transform $ - para $ ils'' <> insertMark + paraOrPlain $ ils'' <> insertMark | otherwise -> handleInsertion bodyPartToBlocks (ListItem pPr numId lvl (Just levelInfo) parparts) = do -- We check whether this current numId has previously been used, @@ -638,7 +644,7 @@ bodyPartToBlocks (ListItem pPr numId lvl (Just levelInfo) parparts) = do blks <- bodyPartToBlocks (Paragraph pPr parparts) return $ divWith ("", ["list-item"], kvs) blks bodyPartToBlocks (ListItem pPr _ _ _ parparts) = - let pPr' = pPr {pStyle = "ListParagraph": pStyle pPr} + let pPr' = pPr {pStyle = constructBogusParStyleData "list-paragraph": pStyle pPr} in bodyPartToBlocks $ Paragraph pPr' parparts bodyPartToBlocks (Tbl _ _ _ []) = diff --git a/src/Text/Pandoc/Readers/Docx/Lists.hs b/src/Text/Pandoc/Readers/Docx/Lists.hs index cc390f122..eb24640c5 100644 --- a/src/Text/Pandoc/Readers/Docx/Lists.hs +++ b/src/Text/Pandoc/Readers/Docx/Lists.hs @@ -1,4 +1,5 @@ {-# LANGUAGE NoImplicitPrelude #-} +{-# LANGUAGE OverloadedStrings #-} {- | Module : Text.Pandoc.Readers.Docx.Lists Copyright : Copyright (C) 2014-2019 Jesse Rosenthal @@ -14,13 +15,16 @@ Functions for converting flat docx paragraphs into nested lists. module Text.Pandoc.Readers.Docx.Lists ( blocksToBullets , blocksToDefinitions , listParagraphDivs + , listParagraphStyles ) where import Prelude import Data.List import Data.Maybe +import Data.String (fromString) import Text.Pandoc.Generic (bottomUp) import Text.Pandoc.JSON +import Text.Pandoc.Readers.Docx.Parse (ParaStyleName) import Text.Pandoc.Shared (trim, safeRead) isListItem :: Block -> Bool @@ -79,7 +83,10 @@ getListType b@(Div (_, _, kvs) _) | isListItem b = getListType _ = Nothing listParagraphDivs :: [String] -listParagraphDivs = ["ListParagraph"] +listParagraphDivs = ["list-paragraph"] + +listParagraphStyles :: [ParaStyleName] +listParagraphStyles = map fromString listParagraphDivs -- This is a first stab at going through and attaching meaning to list -- paragraphs, without an item marker, following a list item. We @@ -160,7 +167,7 @@ blocksToDefinitions' defAcc acc [] = reverse $ DefinitionList (reverse defAcc) : acc blocksToDefinitions' defAcc acc (Div (_, classes1, _) blks1 : Div (ident2, classes2, kvs2) blks2 : blks) - | "DefinitionTerm" `elem` classes1 && "Definition" `elem` classes2 = + | "Definition-Term" `elem` classes1 && "Definition" `elem` classes2 = let remainingAttr2 = (ident2, delete "Definition" classes2, kvs2) pair = if remainingAttr2 == ("", [], []) then (concatMap plainParaInlines blks1, [blks2]) else (concatMap plainParaInlines blks1, [[Div remainingAttr2 blks2]]) in @@ -169,12 +176,12 @@ blocksToDefinitions' ((defTerm, defItems):defs) acc (Div (ident2, classes2, kvs2) blks2 : blks) | "Definition" `elem` classes2 = let remainingAttr2 = (ident2, delete "Definition" classes2, kvs2) - defItems2 = case remainingAttr2 == ("", [], []) of - True -> blks2 - False -> [Div remainingAttr2 blks2] - defAcc' = case null defItems of - True -> (defTerm, [defItems2]) : defs - False -> (defTerm, init defItems ++ [last defItems ++ defItems2]) : defs + defItems2 = if remainingAttr2 == ("", [], []) + then blks2 + else [Div remainingAttr2 blks2] + defAcc' = if null defItems + then (defTerm, [defItems2]) : defs + else (defTerm, init defItems ++ [last defItems ++ defItems2]) : defs in blocksToDefinitions' defAcc' acc blks blocksToDefinitions' [] acc (b:blks) = @@ -198,7 +205,5 @@ removeListDivs' blk = [blk] removeListDivs :: [Block] -> [Block] removeListDivs = concatMap removeListDivs' - - blocksToDefinitions :: [Block] -> [Block] blocksToDefinitions = blocksToDefinitions' [] [] diff --git a/src/Text/Pandoc/Readers/Docx/Parse.hs b/src/Text/Pandoc/Readers/Docx/Parse.hs index 330c9208f..00c5fb0be 100644 --- a/src/Text/Pandoc/Readers/Docx/Parse.hs +++ b/src/Text/Pandoc/Readers/Docx/Parse.hs @@ -1,7 +1,11 @@ {-# LANGUAGE NoImplicitPrelude #-} {-# LANGUAGE FlexibleInstances #-} +{-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE PatternGuards #-} {-# LANGUAGE ViewPatterns #-} +{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE TypeFamilies #-} +{-# LANGUAGE GeneralizedNewtypeDeriving #-} {- | Module : Text.Pandoc.Readers.Docx.Parse Copyright : Copyright (C) 2014-2019 Jesse Rosenthal @@ -31,6 +35,8 @@ module Text.Pandoc.Readers.Docx.Parse ( Docx(..) , VertAlign(..) , ParIndentation(..) , ParagraphStyle(..) + , ParStyle + , CharStyle(cStyleData) , Row(..) , Cell(..) , TrackedChange(..) @@ -38,8 +44,17 @@ module Text.Pandoc.Readers.Docx.Parse ( Docx(..) , ChangeInfo(..) , FieldInfo(..) , Level(..) + , ParaStyleName + , CharStyleName + , FromStyleName(..) + , HasStyleName(..) + , HasParentStyle(..) , archiveToDocx , archiveToDocxWithWarnings + , getStyleNames + , pHeading + , constructBogusParStyleData + , leftBiasedMergeRunStyle ) where import Prelude import Codec.Archive.Zip @@ -49,10 +64,13 @@ import Control.Monad.Reader import Control.Monad.State.Strict import Data.Bits ((.|.)) import qualified Data.ByteString.Lazy as B -import Data.Char (chr, ord, readLitChar) +import Data.Char (chr, ord, readLitChar, toLower) import Data.List +import Data.Function (on) +import Data.String (IsString(..)) import qualified Data.Map as M import Data.Maybe +import Data.Coerce import System.FilePath import Text.Pandoc.Readers.Docx.Util import Text.Pandoc.Readers.Docx.Fields @@ -160,13 +178,9 @@ newtype Body = Body [BodyPart] type Media = [(FilePath, B.ByteString)] -type CharStyle = (String, RunStyle) +type CharStyleMap = M.Map CharStyleId CharStyle -type ParStyle = (String, ParStyleData) - -type CharStyleMap = M.Map String RunStyle - -type ParStyleMap = M.Map String ParStyleData +type ParStyleMap = M.Map ParaStyleId ParStyle data Numbering = Numbering NameSpaces [Numb] [AbstractNumb] deriving Show @@ -213,12 +227,9 @@ data ChangeInfo = ChangeInfo ChangeId Author ChangeDate data TrackedChange = TrackedChange ChangeType ChangeInfo deriving Show -data ParagraphStyle = ParagraphStyle { pStyle :: [String] +data ParagraphStyle = ParagraphStyle { pStyle :: [ParStyle] , indentation :: Maybe ParIndentation , dropCap :: Bool - , pHeading :: Maybe (String, Int) - , pNumInfo :: Maybe (String, String) - , pBlockQuote :: Maybe Bool , pChange :: Maybe TrackedChange } deriving Show @@ -227,9 +238,6 @@ defaultParagraphStyle :: ParagraphStyle defaultParagraphStyle = ParagraphStyle { pStyle = [] , indentation = Nothing , dropCap = False - , pHeading = Nothing - , pNumInfo = Nothing - , pBlockQuote = Nothing , pChange = Nothing } @@ -254,6 +262,49 @@ newtype Row = Row [Cell] newtype Cell = Cell [BodyPart] deriving Show +newtype CharStyleId = CharStyleId { fromCharStyleId :: String } + deriving (Show, Eq, Ord, FromStyleId) +newtype ParaStyleId = ParaStyleId { fromParaStyleId :: String } + deriving (Show, Eq, Ord, FromStyleId) + +newtype CharStyleName = CharStyleName { fromCharStyleName :: CIString } + deriving (Show, Eq, Ord, IsString, FromStyleName) +newtype ParaStyleName = ParaStyleName { fromParaStyleName :: CIString } + deriving (Show, Eq, Ord, IsString, FromStyleName) + +-- Case-insensitive comparisons +newtype CIString = CIString String deriving (Show, IsString, FromStyleName) + +class FromStyleName a where + fromStyleName :: a -> String + +instance FromStyleName String where + fromStyleName = id + +class FromStyleId a where + fromStyleId :: a -> String + +instance FromStyleId String where + fromStyleId = id + +instance Eq CIString where + (==) = (==) `on` map toLower . coerce + +instance Ord CIString where + compare = compare `on` map toLower . coerce + +leftBiasedMergeRunStyle :: RunStyle -> RunStyle -> RunStyle +leftBiasedMergeRunStyle a b = RunStyle + { isBold = isBold a <|> isBold b + , isItalic = isItalic a <|> isItalic b + , isSmallCaps = isSmallCaps a <|> isSmallCaps b + , isStrike = isStrike a <|> isStrike b + , isRTL = isRTL a <|> isRTL b + , rVertAlign = rVertAlign a <|> rVertAlign b + , rUnderline = rUnderline a <|> rUnderline b + , rParentStyle = rParentStyle a + } + -- (width, height) in EMUs type Extent = Maybe (Double, Double) @@ -285,21 +336,28 @@ data RunElem = TextRun String | LnBrk | Tab | SoftHyphen | NoBreakHyphen data VertAlign = BaseLn | SupScrpt | SubScrpt deriving Show -data RunStyle = RunStyle { isBold :: Maybe Bool - , isItalic :: Maybe Bool - , isSmallCaps :: Maybe Bool - , isStrike :: Maybe Bool - , isRTL :: Maybe Bool - , rVertAlign :: Maybe VertAlign - , rUnderline :: Maybe String - , rStyle :: Maybe CharStyle +data CharStyle = CharStyle { cStyleId :: CharStyleId + , cStyleName :: CharStyleName + , cStyleData :: RunStyle + } deriving (Show) + +data RunStyle = RunStyle { isBold :: Maybe Bool + , isItalic :: Maybe Bool + , isSmallCaps :: Maybe Bool + , isStrike :: Maybe Bool + , isRTL :: Maybe Bool + , rVertAlign :: Maybe VertAlign + , rUnderline :: Maybe String + , rParentStyle :: Maybe CharStyle } deriving Show -data ParStyleData = ParStyleData { headingLev :: Maybe (String, Int) - , isBlockQuote :: Maybe Bool - , numInfo :: Maybe (String, String) - , psStyle :: Maybe ParStyle} +data ParStyle = ParStyle { headingLev :: Maybe (ParaStyleName, Int) + , numInfo :: Maybe (String, String) + , psParentStyle :: Maybe ParStyle + , pStyleName :: ParaStyleName + , pStyleId :: ParaStyleId + } deriving Show defaultRunStyle :: RunStyle @@ -310,7 +368,7 @@ defaultRunStyle = RunStyle { isBold = Nothing , isRTL = Nothing , rVertAlign = Nothing , rUnderline = Nothing - , rStyle = Nothing + , rParentStyle = Nothing } type Target = String @@ -390,7 +448,10 @@ elemToBody ns element | isElem ns "w" "body" element = elemToBody _ _ = throwError WrongElem archiveToStyles :: Archive -> (CharStyleMap, ParStyleMap) -archiveToStyles zf = +archiveToStyles = archiveToStyles' getStyleId getStyleId +archiveToStyles' :: (Ord k1, Ord k2, ElemToStyle a1, ElemToStyle a2) => + (a1 -> k1) -> (a2 -> k2) -> Archive -> (M.Map k1 a1, M.Map k2 a2) +archiveToStyles' conv1 conv2 zf = let stylesElem = findEntryByPath "word/styles.xml" zf >>= (parseXMLDoc . UTF8.toStringLazy . fromEntry) in @@ -399,19 +460,17 @@ archiveToStyles zf = Just styElem -> let namespaces = elemToNameSpaces styElem in - ( M.fromList $ buildBasedOnList namespaces styElem - (Nothing :: Maybe CharStyle), - M.fromList $ buildBasedOnList namespaces styElem - (Nothing :: Maybe ParStyle) ) + ( M.fromList $ map (\r -> (conv1 r, r)) $ buildBasedOnList namespaces styElem Nothing, + M.fromList $ map (\p -> (conv2 p, p)) $ buildBasedOnList namespaces styElem Nothing) -isBasedOnStyle :: (ElemToStyle a) => NameSpaces -> Element -> Maybe a -> Bool +isBasedOnStyle :: (ElemToStyle a, FromStyleId (StyleId a)) => NameSpaces -> Element -> Maybe a -> Bool isBasedOnStyle ns element parentStyle | isElem ns "w" "style" element , Just styleType <- findAttrByName ns "w" "type" element , styleType == cStyleType parentStyle , Just basedOnVal <- findChildByName ns "w" "basedOn" element >>= findAttrByName ns "w" "val" - , Just ps <- parentStyle = basedOnVal == getStyleId ps + , Just ps <- parentStyle = basedOnVal == fromStyleId (getStyleId ps) | isElem ns "w" "style" element , Just styleType <- findAttrByName ns "w" "type" element , styleType == cStyleType parentStyle @@ -419,30 +478,70 @@ isBasedOnStyle ns element parentStyle , Nothing <- parentStyle = True | otherwise = False -class ElemToStyle a where +class HasStyleId a => ElemToStyle a where cStyleType :: Maybe a -> String elemToStyle :: NameSpaces -> Element -> Maybe a -> Maybe a - getStyleId :: a -> String + +class FromStyleId (StyleId a) => HasStyleId a where + type StyleId a + getStyleId :: a -> StyleId a + +class FromStyleName (StyleName a) => HasStyleName a where + type StyleName a + getStyleName :: a -> StyleName a + +class HasParentStyle a where + getParentStyle :: a -> Maybe a + +instance HasParentStyle CharStyle where + getParentStyle = rParentStyle . cStyleData + +instance HasParentStyle ParStyle where + getParentStyle = psParentStyle + +getStyleNames :: (Functor t, HasStyleName a) => t a -> t (StyleName a) +getStyleNames = fmap getStyleName + +constructBogusParStyleData :: ParaStyleName -> ParStyle +constructBogusParStyleData stName = ParStyle + { headingLev = Nothing + , numInfo = Nothing + , psParentStyle = Nothing + , pStyleName = stName + , pStyleId = ParaStyleId . filter (/=' ') . fromStyleName $ stName + } instance ElemToStyle CharStyle where cStyleType _ = "character" elemToStyle ns element parentStyle | isElem ns "w" "style" element - , Just "character" <- findAttrByName ns "w" "type" element - , Just styleId <- findAttrByName ns "w" "styleId" element = - Just (styleId, elemToRunStyle ns element parentStyle) + , Just "character" <- findAttrByName ns "w" "type" element = + elemToCharStyle ns element parentStyle | otherwise = Nothing - getStyleId s = fst s + +instance HasStyleId CharStyle where + type StyleId CharStyle = CharStyleId + getStyleId = cStyleId + +instance HasStyleName CharStyle where + type StyleName CharStyle = CharStyleName + getStyleName = cStyleName instance ElemToStyle ParStyle where cStyleType _ = "paragraph" elemToStyle ns element parentStyle | isElem ns "w" "style" element , Just "paragraph" <- findAttrByName ns "w" "type" element - , Just styleId <- findAttrByName ns "w" "styleId" element = - Just (styleId, elemToParStyleData ns element parentStyle) + = elemToParStyleData ns element parentStyle | otherwise = Nothing - getStyleId s = fst s + +instance HasStyleId ParStyle where + type StyleId ParStyle = ParaStyleId + getStyleId = pStyleId + +instance HasStyleName ParStyle where + type StyleName ParStyle = ParaStyleName + getStyleName = pStyleName getStyleChildren :: (ElemToStyle a) => NameSpaces -> Element -> Maybe a -> [a] getStyleChildren ns element parentStyle @@ -693,6 +792,12 @@ testBitMask bitMaskS n = stringToInteger :: String -> Maybe Integer stringToInteger s = listToMaybe $ map fst (reads s :: [(Integer, String)]) +pHeading :: ParagraphStyle -> Maybe (ParaStyleName, Int) +pHeading = getParStyleField headingLev . pStyle + +pNumInfo :: ParagraphStyle -> Maybe (String, String) +pNumInfo = getParStyleField numInfo . pStyle + elemToBodyPart :: NameSpaces -> Element -> D BodyPart elemToBodyPart ns element | isElem ns "w" "p" element @@ -1003,20 +1108,18 @@ elemToRun ns element return $ Run runStyle runElems elemToRun _ _ = throwError WrongElem -getParentStyleValue :: (ParStyleData -> Maybe a) -> ParStyleData -> Maybe a +getParentStyleValue :: (ParStyle -> Maybe a) -> ParStyle -> Maybe a getParentStyleValue field style | Just value <- field style = Just value - | Just parentStyle <- psStyle style - = getParentStyleValue field (snd parentStyle) + | Just parentStyle <- psParentStyle style + = getParentStyleValue field parentStyle getParentStyleValue _ _ = Nothing -getParStyleField :: (ParStyleData -> Maybe a) -> ParStyleMap -> [String] -> - Maybe a -getParStyleField field stylemap styles - | x <- mapMaybe (\x -> M.lookup x stylemap) styles - , (y:_) <- mapMaybe (getParentStyleValue field) x +getParStyleField :: (ParStyle -> Maybe a) -> [ParStyle] -> Maybe a +getParStyleField field styles + | (y:_) <- mapMaybe (getParentStyleValue field) styles = Just y -getParStyleField _ _ _ = Nothing +getParStyleField _ _ = Nothing getTrackedChange :: NameSpaces -> Element -> Maybe TrackedChange getTrackedChange ns element @@ -1038,10 +1141,10 @@ elemToParagraphStyle ns element sty | Just pPr <- findChildByName ns "w" "pPr" element = let style = mapMaybe - (findAttrByName ns "w" "val") + (fmap ParaStyleId . findAttrByName ns "w" "val") (findChildrenByName ns "w" "pStyle" pPr) in ParagraphStyle - {pStyle = style + {pStyle = mapMaybe (`M.lookup` sty) style , indentation = findChildByName ns "w" "ind" pPr >>= elemToParIndentation ns @@ -1053,9 +1156,6 @@ elemToParagraphStyle ns element sty Just "none" -> False Just _ -> True Nothing -> False - , pHeading = getParStyleField headingLev sty style - , pNumInfo = getParStyleField numInfo sty style - , pBlockQuote = getParStyleField isBlockQuote sty style , pChange = findChildByName ns "w" "rPr" pPr >>= filterChild (\e -> isElem ns "w" "ins" e || isElem ns "w" "moveTo" e || @@ -1085,16 +1185,20 @@ elemToRunStyleD :: NameSpaces -> Element -> D RunStyle elemToRunStyleD ns element | Just rPr <- findChildByName ns "w" "rPr" element = do charStyles <- asks envCharStyles - let parentSty = case + let parentSty = findChildByName ns "w" "rStyle" rPr >>= - findAttrByName ns "w" "val" - of - Just styName | Just style <- M.lookup styName charStyles -> - Just (styName, style) - _ -> Nothing + findAttrByName ns "w" "val" >>= + flip M.lookup charStyles . CharStyleId return $ elemToRunStyle ns element parentSty elemToRunStyleD _ _ = return defaultRunStyle +elemToCharStyle :: NameSpaces + -> Element -> Maybe CharStyle -> Maybe CharStyle +elemToCharStyle ns element parentStyle + = CharStyle <$> (CharStyleId <$> findAttrByName ns "w" "styleId" element) + <*> getElementStyleName ns element + <*> (Just $ elemToRunStyle ns element parentStyle) + elemToRunStyle :: NameSpaces -> Element -> Maybe CharStyle -> RunStyle elemToRunStyle ns element parentStyle | Just rPr <- findChildByName ns "w" "rPr" element = @@ -1117,38 +1221,23 @@ elemToRunStyle ns element parentStyle , rUnderline = findChildByName ns "w" "u" rPr >>= findAttrByName ns "w" "val" - , rStyle = parentStyle + , rParentStyle = parentStyle } elemToRunStyle _ _ _ = defaultRunStyle -getHeaderLevel :: NameSpaces -> Element -> Maybe (String,Int) +getHeaderLevel :: NameSpaces -> Element -> Maybe (ParaStyleName, Int) getHeaderLevel ns element - | Just styleId <- findAttrByName ns "w" "styleId" element - , Just index <- stripPrefix "Heading" styleId - , Just n <- stringToInteger index - , n > 0 = Just (styleId, fromInteger n) - | Just styleId <- findAttrByName ns "w" "styleId" element - , Just index <- findChildByName ns "w" "name" element >>= - findAttrByName ns "w" "val" >>= - stripPrefix "heading " - , Just n <- stringToInteger index - , n > 0 = Just (styleId, fromInteger n) + | Just styleName <- getElementStyleName ns element + , Just n <- stringToInteger =<< + (stripPrefix "heading " . map toLower $ + fromStyleName styleName) + , n > 0 = Just (styleName, fromInteger n) getHeaderLevel _ _ = Nothing -blockQuoteStyleIds :: [String] -blockQuoteStyleIds = ["Quote", "BlockQuote", "BlockQuotation"] - -blockQuoteStyleNames :: [String] -blockQuoteStyleNames = ["Quote", "Block Text"] - -getBlockQuote :: NameSpaces -> Element -> Maybe Bool -getBlockQuote ns element - | Just styleId <- findAttrByName ns "w" "styleId" element - , styleId `elem` blockQuoteStyleIds = Just True - | Just styleName <- findChildByName ns "w" "name" element >>= - findAttrByName ns "w" "val" - , styleName `elem` blockQuoteStyleNames = Just True -getBlockQuote _ _ = Nothing +getElementStyleName :: Coercible String a => NameSpaces -> Element -> Maybe a +getElementStyleName ns el = coerce <$> + ((findChildByName ns "w" "name" el >>= findAttrByName ns "w" "val") + <|> findAttrByName ns "w" "styleId" el) getNumInfo :: NameSpaces -> Element -> Maybe (String, String) getNumInfo ns element = do @@ -1163,15 +1252,19 @@ getNumInfo ns element = do return (numId, lvl) -elemToParStyleData :: NameSpaces -> Element -> Maybe ParStyle -> ParStyleData -elemToParStyleData ns element parentStyle = - ParStyleData +elemToParStyleData :: NameSpaces -> Element -> Maybe ParStyle -> Maybe ParStyle +elemToParStyleData ns element parentStyle + | Just styleId <- findAttrByName ns "w" "styleId" element + , Just styleName <- getElementStyleName ns element + = Just $ ParStyle { headingLev = getHeaderLevel ns element - , isBlockQuote = getBlockQuote ns element , numInfo = getNumInfo ns element - , psStyle = parentStyle - } + , psParentStyle = parentStyle + , pStyleName = styleName + , pStyleId = ParaStyleId styleId + } +elemToParStyleData _ _ _ = Nothing elemToRunElem :: NameSpaces -> Element -> D RunElem elemToRunElem ns element |