diff options
Diffstat (limited to 'src/Text/Pandoc/Readers/Docx')
-rw-r--r-- | src/Text/Pandoc/Readers/Docx/Lists.hs | 25 | ||||
-rw-r--r-- | src/Text/Pandoc/Readers/Docx/Parse.hs | 283 |
2 files changed, 203 insertions, 105 deletions
diff --git a/src/Text/Pandoc/Readers/Docx/Lists.hs b/src/Text/Pandoc/Readers/Docx/Lists.hs index cc390f122..eb24640c5 100644 --- a/src/Text/Pandoc/Readers/Docx/Lists.hs +++ b/src/Text/Pandoc/Readers/Docx/Lists.hs @@ -1,4 +1,5 @@ {-# LANGUAGE NoImplicitPrelude #-} +{-# LANGUAGE OverloadedStrings #-} {- | Module : Text.Pandoc.Readers.Docx.Lists Copyright : Copyright (C) 2014-2019 Jesse Rosenthal @@ -14,13 +15,16 @@ Functions for converting flat docx paragraphs into nested lists. module Text.Pandoc.Readers.Docx.Lists ( blocksToBullets , blocksToDefinitions , listParagraphDivs + , listParagraphStyles ) where import Prelude import Data.List import Data.Maybe +import Data.String (fromString) import Text.Pandoc.Generic (bottomUp) import Text.Pandoc.JSON +import Text.Pandoc.Readers.Docx.Parse (ParaStyleName) import Text.Pandoc.Shared (trim, safeRead) isListItem :: Block -> Bool @@ -79,7 +83,10 @@ getListType b@(Div (_, _, kvs) _) | isListItem b = getListType _ = Nothing listParagraphDivs :: [String] -listParagraphDivs = ["ListParagraph"] +listParagraphDivs = ["list-paragraph"] + +listParagraphStyles :: [ParaStyleName] +listParagraphStyles = map fromString listParagraphDivs -- This is a first stab at going through and attaching meaning to list -- paragraphs, without an item marker, following a list item. We @@ -160,7 +167,7 @@ blocksToDefinitions' defAcc acc [] = reverse $ DefinitionList (reverse defAcc) : acc blocksToDefinitions' defAcc acc (Div (_, classes1, _) blks1 : Div (ident2, classes2, kvs2) blks2 : blks) - | "DefinitionTerm" `elem` classes1 && "Definition" `elem` classes2 = + | "Definition-Term" `elem` classes1 && "Definition" `elem` classes2 = let remainingAttr2 = (ident2, delete "Definition" classes2, kvs2) pair = if remainingAttr2 == ("", [], []) then (concatMap plainParaInlines blks1, [blks2]) else (concatMap plainParaInlines blks1, [[Div remainingAttr2 blks2]]) in @@ -169,12 +176,12 @@ blocksToDefinitions' ((defTerm, defItems):defs) acc (Div (ident2, classes2, kvs2) blks2 : blks) | "Definition" `elem` classes2 = let remainingAttr2 = (ident2, delete "Definition" classes2, kvs2) - defItems2 = case remainingAttr2 == ("", [], []) of - True -> blks2 - False -> [Div remainingAttr2 blks2] - defAcc' = case null defItems of - True -> (defTerm, [defItems2]) : defs - False -> (defTerm, init defItems ++ [last defItems ++ defItems2]) : defs + defItems2 = if remainingAttr2 == ("", [], []) + then blks2 + else [Div remainingAttr2 blks2] + defAcc' = if null defItems + then (defTerm, [defItems2]) : defs + else (defTerm, init defItems ++ [last defItems ++ defItems2]) : defs in blocksToDefinitions' defAcc' acc blks blocksToDefinitions' [] acc (b:blks) = @@ -198,7 +205,5 @@ removeListDivs' blk = [blk] removeListDivs :: [Block] -> [Block] removeListDivs = concatMap removeListDivs' - - blocksToDefinitions :: [Block] -> [Block] blocksToDefinitions = blocksToDefinitions' [] [] diff --git a/src/Text/Pandoc/Readers/Docx/Parse.hs b/src/Text/Pandoc/Readers/Docx/Parse.hs index 330c9208f..00c5fb0be 100644 --- a/src/Text/Pandoc/Readers/Docx/Parse.hs +++ b/src/Text/Pandoc/Readers/Docx/Parse.hs @@ -1,7 +1,11 @@ {-# LANGUAGE NoImplicitPrelude #-} {-# LANGUAGE FlexibleInstances #-} +{-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE PatternGuards #-} {-# LANGUAGE ViewPatterns #-} +{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE TypeFamilies #-} +{-# LANGUAGE GeneralizedNewtypeDeriving #-} {- | Module : Text.Pandoc.Readers.Docx.Parse Copyright : Copyright (C) 2014-2019 Jesse Rosenthal @@ -31,6 +35,8 @@ module Text.Pandoc.Readers.Docx.Parse ( Docx(..) , VertAlign(..) , ParIndentation(..) , ParagraphStyle(..) + , ParStyle + , CharStyle(cStyleData) , Row(..) , Cell(..) , TrackedChange(..) @@ -38,8 +44,17 @@ module Text.Pandoc.Readers.Docx.Parse ( Docx(..) , ChangeInfo(..) , FieldInfo(..) , Level(..) + , ParaStyleName + , CharStyleName + , FromStyleName(..) + , HasStyleName(..) + , HasParentStyle(..) , archiveToDocx , archiveToDocxWithWarnings + , getStyleNames + , pHeading + , constructBogusParStyleData + , leftBiasedMergeRunStyle ) where import Prelude import Codec.Archive.Zip @@ -49,10 +64,13 @@ import Control.Monad.Reader import Control.Monad.State.Strict import Data.Bits ((.|.)) import qualified Data.ByteString.Lazy as B -import Data.Char (chr, ord, readLitChar) +import Data.Char (chr, ord, readLitChar, toLower) import Data.List +import Data.Function (on) +import Data.String (IsString(..)) import qualified Data.Map as M import Data.Maybe +import Data.Coerce import System.FilePath import Text.Pandoc.Readers.Docx.Util import Text.Pandoc.Readers.Docx.Fields @@ -160,13 +178,9 @@ newtype Body = Body [BodyPart] type Media = [(FilePath, B.ByteString)] -type CharStyle = (String, RunStyle) +type CharStyleMap = M.Map CharStyleId CharStyle -type ParStyle = (String, ParStyleData) - -type CharStyleMap = M.Map String RunStyle - -type ParStyleMap = M.Map String ParStyleData +type ParStyleMap = M.Map ParaStyleId ParStyle data Numbering = Numbering NameSpaces [Numb] [AbstractNumb] deriving Show @@ -213,12 +227,9 @@ data ChangeInfo = ChangeInfo ChangeId Author ChangeDate data TrackedChange = TrackedChange ChangeType ChangeInfo deriving Show -data ParagraphStyle = ParagraphStyle { pStyle :: [String] +data ParagraphStyle = ParagraphStyle { pStyle :: [ParStyle] , indentation :: Maybe ParIndentation , dropCap :: Bool - , pHeading :: Maybe (String, Int) - , pNumInfo :: Maybe (String, String) - , pBlockQuote :: Maybe Bool , pChange :: Maybe TrackedChange } deriving Show @@ -227,9 +238,6 @@ defaultParagraphStyle :: ParagraphStyle defaultParagraphStyle = ParagraphStyle { pStyle = [] , indentation = Nothing , dropCap = False - , pHeading = Nothing - , pNumInfo = Nothing - , pBlockQuote = Nothing , pChange = Nothing } @@ -254,6 +262,49 @@ newtype Row = Row [Cell] newtype Cell = Cell [BodyPart] deriving Show +newtype CharStyleId = CharStyleId { fromCharStyleId :: String } + deriving (Show, Eq, Ord, FromStyleId) +newtype ParaStyleId = ParaStyleId { fromParaStyleId :: String } + deriving (Show, Eq, Ord, FromStyleId) + +newtype CharStyleName = CharStyleName { fromCharStyleName :: CIString } + deriving (Show, Eq, Ord, IsString, FromStyleName) +newtype ParaStyleName = ParaStyleName { fromParaStyleName :: CIString } + deriving (Show, Eq, Ord, IsString, FromStyleName) + +-- Case-insensitive comparisons +newtype CIString = CIString String deriving (Show, IsString, FromStyleName) + +class FromStyleName a where + fromStyleName :: a -> String + +instance FromStyleName String where + fromStyleName = id + +class FromStyleId a where + fromStyleId :: a -> String + +instance FromStyleId String where + fromStyleId = id + +instance Eq CIString where + (==) = (==) `on` map toLower . coerce + +instance Ord CIString where + compare = compare `on` map toLower . coerce + +leftBiasedMergeRunStyle :: RunStyle -> RunStyle -> RunStyle +leftBiasedMergeRunStyle a b = RunStyle + { isBold = isBold a <|> isBold b + , isItalic = isItalic a <|> isItalic b + , isSmallCaps = isSmallCaps a <|> isSmallCaps b + , isStrike = isStrike a <|> isStrike b + , isRTL = isRTL a <|> isRTL b + , rVertAlign = rVertAlign a <|> rVertAlign b + , rUnderline = rUnderline a <|> rUnderline b + , rParentStyle = rParentStyle a + } + -- (width, height) in EMUs type Extent = Maybe (Double, Double) @@ -285,21 +336,28 @@ data RunElem = TextRun String | LnBrk | Tab | SoftHyphen | NoBreakHyphen data VertAlign = BaseLn | SupScrpt | SubScrpt deriving Show -data RunStyle = RunStyle { isBold :: Maybe Bool - , isItalic :: Maybe Bool - , isSmallCaps :: Maybe Bool - , isStrike :: Maybe Bool - , isRTL :: Maybe Bool - , rVertAlign :: Maybe VertAlign - , rUnderline :: Maybe String - , rStyle :: Maybe CharStyle +data CharStyle = CharStyle { cStyleId :: CharStyleId + , cStyleName :: CharStyleName + , cStyleData :: RunStyle + } deriving (Show) + +data RunStyle = RunStyle { isBold :: Maybe Bool + , isItalic :: Maybe Bool + , isSmallCaps :: Maybe Bool + , isStrike :: Maybe Bool + , isRTL :: Maybe Bool + , rVertAlign :: Maybe VertAlign + , rUnderline :: Maybe String + , rParentStyle :: Maybe CharStyle } deriving Show -data ParStyleData = ParStyleData { headingLev :: Maybe (String, Int) - , isBlockQuote :: Maybe Bool - , numInfo :: Maybe (String, String) - , psStyle :: Maybe ParStyle} +data ParStyle = ParStyle { headingLev :: Maybe (ParaStyleName, Int) + , numInfo :: Maybe (String, String) + , psParentStyle :: Maybe ParStyle + , pStyleName :: ParaStyleName + , pStyleId :: ParaStyleId + } deriving Show defaultRunStyle :: RunStyle @@ -310,7 +368,7 @@ defaultRunStyle = RunStyle { isBold = Nothing , isRTL = Nothing , rVertAlign = Nothing , rUnderline = Nothing - , rStyle = Nothing + , rParentStyle = Nothing } type Target = String @@ -390,7 +448,10 @@ elemToBody ns element | isElem ns "w" "body" element = elemToBody _ _ = throwError WrongElem archiveToStyles :: Archive -> (CharStyleMap, ParStyleMap) -archiveToStyles zf = +archiveToStyles = archiveToStyles' getStyleId getStyleId +archiveToStyles' :: (Ord k1, Ord k2, ElemToStyle a1, ElemToStyle a2) => + (a1 -> k1) -> (a2 -> k2) -> Archive -> (M.Map k1 a1, M.Map k2 a2) +archiveToStyles' conv1 conv2 zf = let stylesElem = findEntryByPath "word/styles.xml" zf >>= (parseXMLDoc . UTF8.toStringLazy . fromEntry) in @@ -399,19 +460,17 @@ archiveToStyles zf = Just styElem -> let namespaces = elemToNameSpaces styElem in - ( M.fromList $ buildBasedOnList namespaces styElem - (Nothing :: Maybe CharStyle), - M.fromList $ buildBasedOnList namespaces styElem - (Nothing :: Maybe ParStyle) ) + ( M.fromList $ map (\r -> (conv1 r, r)) $ buildBasedOnList namespaces styElem Nothing, + M.fromList $ map (\p -> (conv2 p, p)) $ buildBasedOnList namespaces styElem Nothing) -isBasedOnStyle :: (ElemToStyle a) => NameSpaces -> Element -> Maybe a -> Bool +isBasedOnStyle :: (ElemToStyle a, FromStyleId (StyleId a)) => NameSpaces -> Element -> Maybe a -> Bool isBasedOnStyle ns element parentStyle | isElem ns "w" "style" element , Just styleType <- findAttrByName ns "w" "type" element , styleType == cStyleType parentStyle , Just basedOnVal <- findChildByName ns "w" "basedOn" element >>= findAttrByName ns "w" "val" - , Just ps <- parentStyle = basedOnVal == getStyleId ps + , Just ps <- parentStyle = basedOnVal == fromStyleId (getStyleId ps) | isElem ns "w" "style" element , Just styleType <- findAttrByName ns "w" "type" element , styleType == cStyleType parentStyle @@ -419,30 +478,70 @@ isBasedOnStyle ns element parentStyle , Nothing <- parentStyle = True | otherwise = False -class ElemToStyle a where +class HasStyleId a => ElemToStyle a where cStyleType :: Maybe a -> String elemToStyle :: NameSpaces -> Element -> Maybe a -> Maybe a - getStyleId :: a -> String + +class FromStyleId (StyleId a) => HasStyleId a where + type StyleId a + getStyleId :: a -> StyleId a + +class FromStyleName (StyleName a) => HasStyleName a where + type StyleName a + getStyleName :: a -> StyleName a + +class HasParentStyle a where + getParentStyle :: a -> Maybe a + +instance HasParentStyle CharStyle where + getParentStyle = rParentStyle . cStyleData + +instance HasParentStyle ParStyle where + getParentStyle = psParentStyle + +getStyleNames :: (Functor t, HasStyleName a) => t a -> t (StyleName a) +getStyleNames = fmap getStyleName + +constructBogusParStyleData :: ParaStyleName -> ParStyle +constructBogusParStyleData stName = ParStyle + { headingLev = Nothing + , numInfo = Nothing + , psParentStyle = Nothing + , pStyleName = stName + , pStyleId = ParaStyleId . filter (/=' ') . fromStyleName $ stName + } instance ElemToStyle CharStyle where cStyleType _ = "character" elemToStyle ns element parentStyle | isElem ns "w" "style" element - , Just "character" <- findAttrByName ns "w" "type" element - , Just styleId <- findAttrByName ns "w" "styleId" element = - Just (styleId, elemToRunStyle ns element parentStyle) + , Just "character" <- findAttrByName ns "w" "type" element = + elemToCharStyle ns element parentStyle | otherwise = Nothing - getStyleId s = fst s + +instance HasStyleId CharStyle where + type StyleId CharStyle = CharStyleId + getStyleId = cStyleId + +instance HasStyleName CharStyle where + type StyleName CharStyle = CharStyleName + getStyleName = cStyleName instance ElemToStyle ParStyle where cStyleType _ = "paragraph" elemToStyle ns element parentStyle | isElem ns "w" "style" element , Just "paragraph" <- findAttrByName ns "w" "type" element - , Just styleId <- findAttrByName ns "w" "styleId" element = - Just (styleId, elemToParStyleData ns element parentStyle) + = elemToParStyleData ns element parentStyle | otherwise = Nothing - getStyleId s = fst s + +instance HasStyleId ParStyle where + type StyleId ParStyle = ParaStyleId + getStyleId = pStyleId + +instance HasStyleName ParStyle where + type StyleName ParStyle = ParaStyleName + getStyleName = pStyleName getStyleChildren :: (ElemToStyle a) => NameSpaces -> Element -> Maybe a -> [a] getStyleChildren ns element parentStyle @@ -693,6 +792,12 @@ testBitMask bitMaskS n = stringToInteger :: String -> Maybe Integer stringToInteger s = listToMaybe $ map fst (reads s :: [(Integer, String)]) +pHeading :: ParagraphStyle -> Maybe (ParaStyleName, Int) +pHeading = getParStyleField headingLev . pStyle + +pNumInfo :: ParagraphStyle -> Maybe (String, String) +pNumInfo = getParStyleField numInfo . pStyle + elemToBodyPart :: NameSpaces -> Element -> D BodyPart elemToBodyPart ns element | isElem ns "w" "p" element @@ -1003,20 +1108,18 @@ elemToRun ns element return $ Run runStyle runElems elemToRun _ _ = throwError WrongElem -getParentStyleValue :: (ParStyleData -> Maybe a) -> ParStyleData -> Maybe a +getParentStyleValue :: (ParStyle -> Maybe a) -> ParStyle -> Maybe a getParentStyleValue field style | Just value <- field style = Just value - | Just parentStyle <- psStyle style - = getParentStyleValue field (snd parentStyle) + | Just parentStyle <- psParentStyle style + = getParentStyleValue field parentStyle getParentStyleValue _ _ = Nothing -getParStyleField :: (ParStyleData -> Maybe a) -> ParStyleMap -> [String] -> - Maybe a -getParStyleField field stylemap styles - | x <- mapMaybe (\x -> M.lookup x stylemap) styles - , (y:_) <- mapMaybe (getParentStyleValue field) x +getParStyleField :: (ParStyle -> Maybe a) -> [ParStyle] -> Maybe a +getParStyleField field styles + | (y:_) <- mapMaybe (getParentStyleValue field) styles = Just y -getParStyleField _ _ _ = Nothing +getParStyleField _ _ = Nothing getTrackedChange :: NameSpaces -> Element -> Maybe TrackedChange getTrackedChange ns element @@ -1038,10 +1141,10 @@ elemToParagraphStyle ns element sty | Just pPr <- findChildByName ns "w" "pPr" element = let style = mapMaybe - (findAttrByName ns "w" "val") + (fmap ParaStyleId . findAttrByName ns "w" "val") (findChildrenByName ns "w" "pStyle" pPr) in ParagraphStyle - {pStyle = style + {pStyle = mapMaybe (`M.lookup` sty) style , indentation = findChildByName ns "w" "ind" pPr >>= elemToParIndentation ns @@ -1053,9 +1156,6 @@ elemToParagraphStyle ns element sty Just "none" -> False Just _ -> True Nothing -> False - , pHeading = getParStyleField headingLev sty style - , pNumInfo = getParStyleField numInfo sty style - , pBlockQuote = getParStyleField isBlockQuote sty style , pChange = findChildByName ns "w" "rPr" pPr >>= filterChild (\e -> isElem ns "w" "ins" e || isElem ns "w" "moveTo" e || @@ -1085,16 +1185,20 @@ elemToRunStyleD :: NameSpaces -> Element -> D RunStyle elemToRunStyleD ns element | Just rPr <- findChildByName ns "w" "rPr" element = do charStyles <- asks envCharStyles - let parentSty = case + let parentSty = findChildByName ns "w" "rStyle" rPr >>= - findAttrByName ns "w" "val" - of - Just styName | Just style <- M.lookup styName charStyles -> - Just (styName, style) - _ -> Nothing + findAttrByName ns "w" "val" >>= + flip M.lookup charStyles . CharStyleId return $ elemToRunStyle ns element parentSty elemToRunStyleD _ _ = return defaultRunStyle +elemToCharStyle :: NameSpaces + -> Element -> Maybe CharStyle -> Maybe CharStyle +elemToCharStyle ns element parentStyle + = CharStyle <$> (CharStyleId <$> findAttrByName ns "w" "styleId" element) + <*> getElementStyleName ns element + <*> (Just $ elemToRunStyle ns element parentStyle) + elemToRunStyle :: NameSpaces -> Element -> Maybe CharStyle -> RunStyle elemToRunStyle ns element parentStyle | Just rPr <- findChildByName ns "w" "rPr" element = @@ -1117,38 +1221,23 @@ elemToRunStyle ns element parentStyle , rUnderline = findChildByName ns "w" "u" rPr >>= findAttrByName ns "w" "val" - , rStyle = parentStyle + , rParentStyle = parentStyle } elemToRunStyle _ _ _ = defaultRunStyle -getHeaderLevel :: NameSpaces -> Element -> Maybe (String,Int) +getHeaderLevel :: NameSpaces -> Element -> Maybe (ParaStyleName, Int) getHeaderLevel ns element - | Just styleId <- findAttrByName ns "w" "styleId" element - , Just index <- stripPrefix "Heading" styleId - , Just n <- stringToInteger index - , n > 0 = Just (styleId, fromInteger n) - | Just styleId <- findAttrByName ns "w" "styleId" element - , Just index <- findChildByName ns "w" "name" element >>= - findAttrByName ns "w" "val" >>= - stripPrefix "heading " - , Just n <- stringToInteger index - , n > 0 = Just (styleId, fromInteger n) + | Just styleName <- getElementStyleName ns element + , Just n <- stringToInteger =<< + (stripPrefix "heading " . map toLower $ + fromStyleName styleName) + , n > 0 = Just (styleName, fromInteger n) getHeaderLevel _ _ = Nothing -blockQuoteStyleIds :: [String] -blockQuoteStyleIds = ["Quote", "BlockQuote", "BlockQuotation"] - -blockQuoteStyleNames :: [String] -blockQuoteStyleNames = ["Quote", "Block Text"] - -getBlockQuote :: NameSpaces -> Element -> Maybe Bool -getBlockQuote ns element - | Just styleId <- findAttrByName ns "w" "styleId" element - , styleId `elem` blockQuoteStyleIds = Just True - | Just styleName <- findChildByName ns "w" "name" element >>= - findAttrByName ns "w" "val" - , styleName `elem` blockQuoteStyleNames = Just True -getBlockQuote _ _ = Nothing +getElementStyleName :: Coercible String a => NameSpaces -> Element -> Maybe a +getElementStyleName ns el = coerce <$> + ((findChildByName ns "w" "name" el >>= findAttrByName ns "w" "val") + <|> findAttrByName ns "w" "styleId" el) getNumInfo :: NameSpaces -> Element -> Maybe (String, String) getNumInfo ns element = do @@ -1163,15 +1252,19 @@ getNumInfo ns element = do return (numId, lvl) -elemToParStyleData :: NameSpaces -> Element -> Maybe ParStyle -> ParStyleData -elemToParStyleData ns element parentStyle = - ParStyleData +elemToParStyleData :: NameSpaces -> Element -> Maybe ParStyle -> Maybe ParStyle +elemToParStyleData ns element parentStyle + | Just styleId <- findAttrByName ns "w" "styleId" element + , Just styleName <- getElementStyleName ns element + = Just $ ParStyle { headingLev = getHeaderLevel ns element - , isBlockQuote = getBlockQuote ns element , numInfo = getNumInfo ns element - , psStyle = parentStyle - } + , psParentStyle = parentStyle + , pStyleName = styleName + , pStyleId = ParaStyleId styleId + } +elemToParStyleData _ _ _ = Nothing elemToRunElem :: NameSpaces -> Element -> D RunElem elemToRunElem ns element |