aboutsummaryrefslogtreecommitdiff
path: root/src/Text/Pandoc/Readers/Docx.hs
diff options
context:
space:
mode:
Diffstat (limited to 'src/Text/Pandoc/Readers/Docx.hs')
-rw-r--r--src/Text/Pandoc/Readers/Docx.hs805
1 files changed, 404 insertions, 401 deletions
diff --git a/src/Text/Pandoc/Readers/Docx.hs b/src/Text/Pandoc/Readers/Docx.hs
index 09c2330fb..085ee01fc 100644
--- a/src/Text/Pandoc/Readers/Docx.hs
+++ b/src/Text/Pandoc/Readers/Docx.hs
@@ -1,3 +1,5 @@
+{-# LANGUAGE PatternGuards #-}
+
{-
Copyright (C) 2014 Jesse Rosenthal <jrosenthal@jhu.edu>
@@ -76,48 +78,188 @@ import Codec.Archive.Zip
import Text.Pandoc.Definition
import Text.Pandoc.Options
import Text.Pandoc.Builder (text, toList)
-import Text.Pandoc.Generic (bottomUp)
-import Text.Pandoc.MIME (getMimeType)
-import Text.Pandoc.UTF8 (toString)
+import Text.Pandoc.Walk
import Text.Pandoc.Readers.Docx.Parse
import Text.Pandoc.Readers.Docx.Lists
-import Data.Maybe (mapMaybe, isJust, fromJust)
-import Data.List (delete, isPrefixOf, (\\), intersect)
-import qualified Data.ByteString as BS
+import Text.Pandoc.Readers.Docx.Reducible
+import Text.Pandoc.Shared
+import Text.Pandoc.MediaBag (insertMedia, MediaBag)
+import Data.Maybe (mapMaybe)
+import Data.List (delete, stripPrefix, (\\), intersect)
+import Data.Monoid
+import Text.TeXMath (writeTeX)
+import Data.Default (Default)
import qualified Data.ByteString.Lazy as B
-import Data.ByteString.Base64 (encode)
-import System.FilePath (combine)
+import qualified Data.Map as M
+import Control.Monad.Reader
+import Control.Monad.State
+import Control.Applicative ((<$>))
readDocx :: ReaderOptions
-> B.ByteString
- -> Pandoc
+ -> (Pandoc, MediaBag)
readDocx opts bytes =
case archiveToDocx (toArchive bytes) of
- Just docx -> Pandoc nullMeta (docxToBlocks opts docx)
- Nothing -> error $ "couldn't parse docx file"
-
-runStyleToSpanAttr :: RunStyle -> (String, [String], [(String, String)])
-runStyleToSpanAttr rPr = ("",
- mapMaybe id [
- if isBold rPr then (Just "strong") else Nothing,
- if isItalic rPr then (Just "emph") else Nothing,
- if isSmallCaps rPr then (Just "smallcaps") else Nothing,
- if isStrike rPr then (Just "strike") else Nothing,
- if isSuperScript rPr then (Just "superscript") else Nothing,
- if isSubScript rPr then (Just "subscript") else Nothing,
- rStyle rPr],
- case underline rPr of
- Just fmt -> [("underline", fmt)]
- _ -> []
- )
-
-parStyleToDivAttr :: ParagraphStyle -> (String, [String], [(String, String)])
-parStyleToDivAttr pPr = ("",
- pStyle pPr,
- case indent pPr of
- Just n -> [("indent", (show n))]
- Nothing -> []
- )
+ Right docx -> (Pandoc meta blks, mediaBag) where
+ (meta, blks, mediaBag) = (docxToOutput opts docx)
+ Left _ -> error $ "couldn't parse docx file"
+
+data DState = DState { docxAnchorMap :: M.Map String String
+ , docxMediaBag :: MediaBag }
+
+instance Default DState where
+ def = DState { docxAnchorMap = M.empty
+ , docxMediaBag = mempty }
+
+data DEnv = DEnv { docxOptions :: ReaderOptions
+ , docxInHeaderBlock :: Bool }
+
+instance Default DEnv where
+ def = DEnv def False
+
+type DocxContext = ReaderT DEnv (State DState)
+
+evalDocxContext :: DocxContext a -> DEnv -> DState -> a
+evalDocxContext ctx env st = evalState (runReaderT ctx env) st
+
+concatMapM :: (Monad m) => (a -> m [b]) -> [a] -> m [b]
+concatMapM f xs = liftM concat (mapM f xs)
+
+-- This is empty, but we put it in for future-proofing.
+spansToKeep :: [String]
+spansToKeep = []
+
+divsToKeep :: [String]
+divsToKeep = ["list-item", "Definition", "DefinitionTerm"]
+
+metaStyles :: M.Map String String
+metaStyles = M.fromList [ ("Title", "title")
+ , ("Subtitle", "subtitle")
+ , ("Author", "author")
+ , ("Date", "date")
+ , ("Abstract", "abstract")]
+
+sepBodyParts :: [BodyPart] -> ([BodyPart], [BodyPart])
+sepBodyParts = span (\bp -> (isMetaPar bp || isEmptyPar bp))
+
+isMetaPar :: BodyPart -> Bool
+isMetaPar (Paragraph pPr _) =
+ not $ null $ intersect (pStyle pPr) (M.keys metaStyles)
+isMetaPar _ = False
+
+isEmptyPar :: BodyPart -> Bool
+isEmptyPar (Paragraph _ parParts) =
+ all isEmptyParPart parParts
+ where
+ isEmptyParPart (PlainRun (Run _ runElems)) = all isEmptyElem runElems
+ isEmptyParPart _ = False
+ isEmptyElem (TextRun s) = trim s == ""
+ isEmptyElem _ = True
+isEmptyPar _ = False
+
+bodyPartsToMeta' :: [BodyPart] -> DocxContext (M.Map String MetaValue)
+bodyPartsToMeta' [] = return M.empty
+bodyPartsToMeta' (bp : bps)
+ | (Paragraph pPr parParts) <- bp
+ , (c : _)<- intersect (pStyle pPr) (M.keys metaStyles)
+ , (Just metaField) <- M.lookup c metaStyles = do
+ inlines <- parPartsToInlines parParts
+ remaining <- bodyPartsToMeta' bps
+ let
+ f (MetaInlines ils) (MetaInlines ils') = MetaBlocks [Para ils, Para ils']
+ f (MetaInlines ils) (MetaBlocks blks) = MetaBlocks ((Para ils) : blks)
+ f m (MetaList mv) = MetaList (m : mv)
+ f m n = MetaList [m, n]
+ return $ M.insertWith f metaField (MetaInlines inlines) remaining
+bodyPartsToMeta' (_ : bps) = bodyPartsToMeta' bps
+
+bodyPartsToMeta :: [BodyPart] -> DocxContext Meta
+bodyPartsToMeta bps = do
+ mp <- bodyPartsToMeta' bps
+ let mp' =
+ case M.lookup "author" mp of
+ Just mv -> M.insert "author" (fixAuthors mv) mp
+ Nothing -> mp
+ return $ Meta mp'
+
+fixAuthors :: MetaValue -> MetaValue
+fixAuthors (MetaBlocks blks) =
+ MetaList $ map g $ filter f blks
+ where f (Para _) = True
+ f _ = False
+ g (Para ils) = MetaInlines ils
+ g _ = MetaInlines []
+fixAuthors mv = mv
+
+runStyleToContainers :: RunStyle -> [Container Inline]
+runStyleToContainers rPr =
+ let spanClassToContainers :: String -> [Container Inline]
+ spanClassToContainers s | s `elem` codeSpans =
+ [Container $ (\ils -> Code ("", [], []) (concatMap ilToCode ils))]
+ spanClassToContainers s | s `elem` spansToKeep =
+ [Container $ Span ("", [s], [])]
+ spanClassToContainers _ = []
+
+ classContainers = case rStyle rPr of
+ Nothing -> []
+ Just s -> spanClassToContainers s
+
+ formatters = map Container $ mapMaybe id
+ [ if isBold rPr then (Just Strong) else Nothing
+ , if isItalic rPr then (Just Emph) else Nothing
+ , if isSmallCaps rPr then (Just SmallCaps) else Nothing
+ , if isStrike rPr then (Just Strikeout) else Nothing
+ , if isSuperScript rPr then (Just Superscript) else Nothing
+ , if isSubScript rPr then (Just Subscript) else Nothing
+ , rUnderline rPr >>=
+ (\f -> if f == "single" then (Just Emph) else Nothing)
+ ]
+ in
+ classContainers ++ formatters
+
+parStyleToContainers :: ParagraphStyle -> [Container Block]
+parStyleToContainers pPr | (c:cs) <- pStyle pPr, Just n <- isHeaderClass c =
+ [Container $ \_ -> Header n ("", delete ("Heading" ++ show n) cs, []) []]
+parStyleToContainers pPr | (c:cs) <- pStyle pPr, c `elem` divsToKeep =
+ let pPr' = pPr { pStyle = cs }
+ in
+ (Container $ Div ("", [c], [])) : (parStyleToContainers pPr')
+parStyleToContainers pPr | (c:cs) <- pStyle pPr, c `elem` codeDivs =
+ -- This is a bit of a cludge. We make the codeblock from the raw
+ -- parparts in bodyPartToBlocks. But we need something to match against.
+ let pPr' = pPr { pStyle = cs }
+ in
+ (Container $ \_ -> CodeBlock ("", [], []) "") : (parStyleToContainers pPr')
+parStyleToContainers pPr | (c:cs) <- pStyle pPr, c `elem` listParagraphDivs =
+ let pPr' = pPr { pStyle = cs, indentation = Nothing}
+ in
+ (Container $ Div ("", [c], [])) : (parStyleToContainers pPr')
+
+parStyleToContainers pPr | (c:cs) <- pStyle pPr, c `elem` blockQuoteDivs =
+ let pPr' = pPr { pStyle = cs \\ blockQuoteDivs }
+ in
+ (Container BlockQuote) : (parStyleToContainers pPr')
+parStyleToContainers pPr | (_:cs) <- pStyle pPr =
+ let pPr' = pPr { pStyle = cs}
+ in
+ parStyleToContainers pPr'
+parStyleToContainers pPr | null (pStyle pPr),
+ Just left <- indentation pPr >>= leftParIndent,
+ Just hang <- indentation pPr >>= hangingParIndent =
+ let pPr' = pPr { indentation = Nothing }
+ in
+ case (left - hang) > 0 of
+ True -> (Container BlockQuote) : (parStyleToContainers pPr')
+ False -> parStyleToContainers pPr'
+parStyleToContainers pPr | null (pStyle pPr),
+ Just left <- indentation pPr >>= leftParIndent =
+ let pPr' = pPr { indentation = Nothing }
+ in
+ case left > 0 of
+ True -> (Container BlockQuote) : (parStyleToContainers pPr')
+ False -> parStyleToContainers pPr'
+parStyleToContainers _ = []
+
strToInlines :: String -> [Inline]
strToInlines = toList . text
@@ -144,115 +286,103 @@ runElemToString (Tab) = ['\t']
runElemsToString :: [RunElem] -> String
runElemsToString = concatMap runElemToString
---- We use this instead of the more general
---- Text.Pandoc.Shared.normalize for reasons of efficiency. For
---- whatever reason, `normalize` makes a run take almost twice as
---- long. (It does more, but this does what we need)
-inlineNormalize :: [Inline] -> [Inline]
-inlineNormalize [] = []
-inlineNormalize (Str "" : ils) = inlineNormalize ils
-inlineNormalize ((Str s) : (Str s') : l) =
- inlineNormalize (Str (s++s') : l)
-inlineNormalize ((Emph ils) : (Emph ils') : l) =
- inlineNormalize $ (Emph $ inlineNormalize (ils ++ ils')) : l
-inlineNormalize ((Emph ils) : l) =
- Emph (inlineNormalize ils) : (inlineNormalize l)
-inlineNormalize ((Strong ils) : (Strong ils') : l) =
- inlineNormalize $ (Strong $ inlineNormalize (ils ++ ils')) : l
-inlineNormalize ((Strong ils) : l) =
- Strong (inlineNormalize ils) : (inlineNormalize l)
-inlineNormalize ((Strikeout ils) : (Strikeout ils') : l) =
- inlineNormalize $ (Strikeout $ inlineNormalize (ils ++ ils')) : l
-inlineNormalize ((Strikeout ils) : l) =
- Strikeout (inlineNormalize ils) : (inlineNormalize l)
-inlineNormalize ((Superscript ils) : (Superscript ils') : l) =
- inlineNormalize $ (Superscript $ inlineNormalize (ils ++ ils')) : l
-inlineNormalize ((Superscript ils) : l) =
- Superscript (inlineNormalize ils) : (inlineNormalize l)
-inlineNormalize ((Subscript ils) : (Subscript ils') : l) =
- inlineNormalize $ (Subscript $ inlineNormalize (ils ++ ils')) : l
-inlineNormalize ((Subscript ils) : l) =
- Subscript (inlineNormalize ils) : (inlineNormalize l)
-inlineNormalize ((Space : Space : l)) =
- inlineNormalize $ (Space : l)
-inlineNormalize ((Quoted qt ils) : l) =
- Quoted qt (inlineNormalize ils) : inlineNormalize l
-inlineNormalize ((Cite cits ils) : l) =
- let
- f :: Citation -> Citation
- f (Citation s pref suff mode num hash) =
- Citation s (inlineNormalize pref) (inlineNormalize suff) mode num hash
- in
- Cite (map f cits) (inlineNormalize ils) : (inlineNormalize l)
-inlineNormalize ((Link ils s) : l) =
- Link (inlineNormalize ils) s : (inlineNormalize l)
-inlineNormalize ((Image ils s) : l) =
- Image (inlineNormalize ils) s : (inlineNormalize l)
-inlineNormalize ((Note blks) : l) =
- Note (map blockNormalize blks) : (inlineNormalize l)
-inlineNormalize ((Span attr ils) : l) =
- Span attr (inlineNormalize ils) : (inlineNormalize l)
-inlineNormalize (il : l) = il : (inlineNormalize l)
-
-stripSpaces :: [Inline] -> [Inline]
-stripSpaces ils =
- reverse $ dropWhile (Space ==) $ reverse $ dropWhile (Space ==) ils
-
-blockNormalize :: Block -> Block
-blockNormalize (Plain ils) = Plain $ stripSpaces $ inlineNormalize ils
-blockNormalize (Para ils) = Para $ stripSpaces $ inlineNormalize ils
-blockNormalize (Header n attr ils) =
- Header n attr $ stripSpaces $ inlineNormalize ils
-blockNormalize (Table ils align width hdr cells) =
- Table (stripSpaces $ inlineNormalize ils) align width hdr cells
-blockNormalize (DefinitionList pairs) =
- DefinitionList $ map (\(ils, blklsts) -> (stripSpaces (inlineNormalize ils), (map (map blockNormalize) blklsts))) pairs
-blockNormalize (BlockQuote blks) = BlockQuote (map blockNormalize blks)
-blockNormalize (OrderedList attr blkslst) =
- OrderedList attr $ map (\blks -> map blockNormalize blks) blkslst
-blockNormalize (BulletList blkslst) =
- BulletList $ map (\blks -> map blockNormalize blks) blkslst
-blockNormalize (Div attr blks) = Div attr (map blockNormalize blks)
-blockNormalize blk = blk
-
-runToInlines :: ReaderOptions -> Docx -> Run -> [Inline]
-runToInlines _ _ (Run rs runElems)
- | isJust (rStyle rs) && (fromJust (rStyle rs)) `elem` codeSpans =
- case runStyleToSpanAttr rs == ("", [], []) of
- True -> [Str (runElemsToString runElems)]
- False -> [Span (runStyleToSpanAttr rs) [Str (runElemsToString runElems)]]
- | otherwise = case runStyleToSpanAttr rs == ("", [], []) of
- True -> concatMap runElemToInlines runElems
- False -> [Span (runStyleToSpanAttr rs) (concatMap runElemToInlines runElems)]
-runToInlines opts docx@(Docx _ notes _ _ _ ) (Footnote fnId) =
- case (getFootNote fnId notes) of
- Just bodyParts ->
- [Note [Div ("", ["footnote"], []) (map (bodyPartToBlock opts docx) bodyParts)]]
- Nothing ->
- [Note [Div ("", ["footnote"], []) []]]
-runToInlines opts docx@(Docx _ notes _ _ _) (Endnote fnId) =
- case (getEndNote fnId notes) of
- Just bodyParts ->
- [Note [Div ("", ["endnote"], []) (map (bodyPartToBlock opts docx) bodyParts)]]
- Nothing ->
- [Note [Div ("", ["endnote"], []) []]]
-
-parPartToInlines :: ReaderOptions -> Docx -> ParPart -> [Inline]
-parPartToInlines opts docx (PlainRun r) = runToInlines opts docx r
-parPartToInlines _ _ (BookMark _ anchor) =
- [Span (anchor, ["anchor"], []) []]
-parPartToInlines _ (Docx _ _ _ rels _) (Drawing relid) =
- case lookupRelationship relid rels of
- Just target -> [Image [] (combine "word" target, "")]
- Nothing -> [Image [] ("", "")]
-parPartToInlines opts docx (InternalHyperLink anchor runs) =
- [Link (concatMap (runToInlines opts docx) runs) ('#' : anchor, "")]
-parPartToInlines opts docx@(Docx _ _ _ rels _) (ExternalHyperLink relid runs) =
- case lookupRelationship relid rels of
- Just target ->
- [Link (concatMap (runToInlines opts docx) runs) (target, "")]
- Nothing ->
- [Link (concatMap (runToInlines opts docx) runs) ("", "")]
+runToString :: Run -> String
+runToString (Run _ runElems) = runElemsToString runElems
+runToString _ = ""
+
+parPartToString :: ParPart -> String
+parPartToString (PlainRun run) = runToString run
+parPartToString (InternalHyperLink _ runs) = concatMap runToString runs
+parPartToString (ExternalHyperLink _ runs) = concatMap runToString runs
+parPartToString _ = ""
+
+
+inlineCodeContainer :: Container Inline -> Bool
+inlineCodeContainer (Container f) = case f [] of
+ Code _ "" -> True
+ _ -> False
+inlineCodeContainer _ = False
+
+
+runToInlines :: Run -> DocxContext [Inline]
+runToInlines (Run rs runElems)
+ | any inlineCodeContainer (runStyleToContainers rs) =
+ return $
+ rebuild (runStyleToContainers rs) $ [Str $ runElemsToString runElems]
+ | otherwise =
+ return $
+ rebuild (runStyleToContainers rs) (concatMap runElemToInlines runElems)
+runToInlines (Footnote bps) =
+ concatMapM bodyPartToBlocks bps >>= (\blks -> return [Note blks])
+runToInlines (Endnote bps) =
+ concatMapM bodyPartToBlocks bps >>= (\blks -> return [Note blks])
+runToInlines (InlineDrawing fp bs) = do
+ mediaBag <- gets docxMediaBag
+ modify $ \s -> s { docxMediaBag = insertMedia fp Nothing bs mediaBag }
+ return [Image [] (fp, "")]
+
+
+
+
+parPartToInlines :: ParPart -> DocxContext [Inline]
+parPartToInlines (PlainRun r) = runToInlines r
+parPartToInlines (Insertion _ author date runs) = do
+ opts <- asks docxOptions
+ case readerTrackChanges opts of
+ AcceptChanges -> concatMapM runToInlines runs >>= return
+ RejectChanges -> return []
+ AllChanges -> do
+ ils <- (concatMapM runToInlines runs)
+ return [Span
+ ("", ["insertion"], [("author", author), ("date", date)])
+ ils]
+parPartToInlines (Deletion _ author date runs) = do
+ opts <- asks docxOptions
+ case readerTrackChanges opts of
+ AcceptChanges -> return []
+ RejectChanges -> concatMapM runToInlines runs >>= return
+ AllChanges -> do
+ ils <- concatMapM runToInlines runs
+ return [Span
+ ("", ["deletion"], [("author", author), ("date", date)])
+ ils]
+parPartToInlines (BookMark _ anchor) | anchor `elem` dummyAnchors = return []
+parPartToInlines (BookMark _ anchor) =
+ -- We record these, so we can make sure not to overwrite
+ -- user-defined anchor links with header auto ids.
+ do
+ -- get whether we're in a header.
+ inHdrBool <- asks docxInHeaderBlock
+ -- Get the anchor map.
+ anchorMap <- gets docxAnchorMap
+ -- We don't want to rewrite if we're in a header, since we'll take
+ -- care of that later, when we make the header anchor. If the
+ -- bookmark were already in uniqueIdent form, this would lead to a
+ -- duplication. Otherwise, we check to see if the id is already in
+ -- there. Rewrite if necessary. This will have the possible effect
+ -- of rewriting user-defined anchor links. However, since these
+ -- are not defined in pandoc, it seems like a necessary evil to
+ -- avoid an extra pass.
+ let newAnchor =
+ if not inHdrBool && anchor `elem` (M.elems anchorMap)
+ then uniqueIdent [Str anchor] (M.elems anchorMap)
+ else anchor
+ unless inHdrBool
+ (modify $ \s -> s { docxAnchorMap = M.insert anchor newAnchor anchorMap})
+ return [Span (newAnchor, ["anchor"], []) []]
+parPartToInlines (Drawing fp bs) = do
+ mediaBag <- gets docxMediaBag
+ modify $ \s -> s { docxMediaBag = insertMedia fp Nothing bs mediaBag }
+ return [Image [] (fp, "")]
+parPartToInlines (InternalHyperLink anchor runs) = do
+ ils <- concatMapM runToInlines runs
+ return [Link ils ('#' : anchor, "")]
+parPartToInlines (ExternalHyperLink target runs) = do
+ ils <- concatMapM runToInlines runs
+ return [Link ils (target, "")]
+parPartToInlines (PlainOMath exps) = do
+ return [Math InlineMath (writeTeX exps)]
+
isAnchorSpan :: Inline -> Bool
isAnchorSpan (Span (ident, classes, kvs) ils) =
@@ -265,74 +395,106 @@ isAnchorSpan _ = False
dummyAnchors :: [String]
dummyAnchors = ["_GoBack"]
-makeHeaderAnchors :: Block -> Block
-makeHeaderAnchors h@(Header n (_, classes, kvs) ils) =
- case filter isAnchorSpan ils of
- [] -> h
- (x@(Span (ident, _, _) _) : xs) ->
- case ident `elem` dummyAnchors of
- True -> h
- False -> Header n (ident, classes, kvs) (ils \\ (x:xs))
- _ -> h
-makeHeaderAnchors blk = blk
-
-
-parPartsToInlines :: ReaderOptions -> Docx -> [ParPart] -> [Inline]
-parPartsToInlines opts docx parparts =
- --
- -- We're going to skip data-uri's for now. It should be an option,
- -- not mandatory.
- --
- (if False -- TODO depend on option
- then bottomUp (makeImagesSelfContained docx)
- else id) $
- bottomUp spanTrim $
- bottomUp spanCorrect $
- bottomUp spanReduce $
- concatMap (parPartToInlines opts docx) parparts
-
-cellToBlocks :: ReaderOptions -> Docx -> Cell -> [Block]
-cellToBlocks opts docx (Cell bps) = map (bodyPartToBlock opts docx) bps
-
-rowToBlocksList :: ReaderOptions -> Docx -> Row -> [[Block]]
-rowToBlocksList opts docx (Row cells) = map (cellToBlocks opts docx) cells
-
-bodyPartToBlock :: ReaderOptions -> Docx -> BodyPart -> Block
-bodyPartToBlock opts docx (Paragraph pPr parparts) =
- Div (parStyleToDivAttr pPr) [Para (parPartsToInlines opts docx parparts)]
-bodyPartToBlock opts docx@(Docx _ _ numbering _ _) (ListItem pPr numId lvl parparts) =
+makeHeaderAnchor :: Block -> DocxContext Block
+-- If there is an anchor already there (an anchor span in the header,
+-- to be exact), we rename and associate the new id with the old one.
+makeHeaderAnchor (Header n (_, classes, kvs) ils)
+ | (x : xs) <- filter isAnchorSpan ils
+ , (Span (ident, _, _) _) <- x
+ , notElem ident dummyAnchors =
+ do
+ hdrIDMap <- gets docxAnchorMap
+ let newIdent = uniqueIdent ils (M.elems hdrIDMap)
+ modify $ \s -> s {docxAnchorMap = M.insert ident newIdent hdrIDMap}
+ return $ Header n (newIdent, classes, kvs) (ils \\ (x:xs))
+-- Otherwise we just give it a name, and register that name (associate
+-- it with itself.)
+makeHeaderAnchor (Header n (_, classes, kvs) ils) =
+ do
+ hdrIDMap <- gets docxAnchorMap
+ let newIdent = uniqueIdent ils (M.elems hdrIDMap)
+ modify $ \s -> s {docxAnchorMap = M.insert newIdent newIdent hdrIDMap}
+ return $ Header n (newIdent, classes, kvs) ils
+makeHeaderAnchor blk = return blk
+
+
+parPartsToInlines :: [ParPart] -> DocxContext [Inline]
+parPartsToInlines parparts = do
+ ils <- concatMapM parPartToInlines parparts
+ return $ reduceList $ ils
+
+cellToBlocks :: Cell -> DocxContext [Block]
+cellToBlocks (Cell bps) = concatMapM bodyPartToBlocks bps
+
+rowToBlocksList :: Row -> DocxContext [[Block]]
+rowToBlocksList (Row cells) = mapM cellToBlocks cells
+
+isBlockCodeContainer :: Container Block -> Bool
+isBlockCodeContainer (Container f) | CodeBlock _ _ <- f [] = True
+isBlockCodeContainer _ = False
+
+isHeaderContainer :: Container Block -> Bool
+isHeaderContainer (Container f) | Header _ _ _ <- f [] = True
+isHeaderContainer _ = False
+
+bodyPartToBlocks :: BodyPart -> DocxContext [Block]
+bodyPartToBlocks (Paragraph pPr parparts)
+ | any isBlockCodeContainer (parStyleToContainers pPr) =
+ let
+ otherConts = filter (not . isBlockCodeContainer) (parStyleToContainers pPr)
+ in
+ return $
+ rebuild
+ otherConts
+ [CodeBlock ("", [], []) (concatMap parPartToString parparts)]
+bodyPartToBlocks (Paragraph pPr parparts)
+ | any isHeaderContainer (parStyleToContainers pPr) = do
+ ils <- normalizeSpaces <$> local (\s -> s{docxInHeaderBlock = True})
+ (parPartsToInlines parparts)
+ let (Container hdrFun) = head $ filter isHeaderContainer (parStyleToContainers pPr)
+ Header n attr _ = hdrFun []
+ hdr <- makeHeaderAnchor $ Header n attr ils
+ return [hdr]
+bodyPartToBlocks (Paragraph pPr parparts) = do
+ ils <- parPartsToInlines parparts >>= (return . normalizeSpaces)
+ case ils of
+ [] -> return []
+ _ -> do
+ return $
+ rebuild
+ (parStyleToContainers pPr)
+ [Para ils]
+bodyPartToBlocks (ListItem pPr numId lvl levelInfo parparts) = do
let
- kvs = case lookupLevel numId lvl numbering of
- Just (_, fmt, txt, Just start) -> [ ("level", lvl)
- , ("num-id", numId)
- , ("format", fmt)
- , ("text", txt)
- , ("start", (show start))
- ]
-
- Just (_, fmt, txt, Nothing) -> [ ("level", lvl)
- , ("num-id", numId)
- , ("format", fmt)
- , ("text", txt)
- ]
- Nothing -> []
- in
- Div
- ("", ["list-item"], kvs)
- [bodyPartToBlock opts docx (Paragraph pPr parparts)]
-bodyPartToBlock _ _ (Tbl _ _ _ []) =
- Para []
-bodyPartToBlock opts docx (Tbl cap _ look (r:rs)) =
+ kvs = case levelInfo of
+ (_, fmt, txt, Just start) -> [ ("level", lvl)
+ , ("num-id", numId)
+ , ("format", fmt)
+ , ("text", txt)
+ , ("start", (show start))
+ ]
+
+ (_, fmt, txt, Nothing) -> [ ("level", lvl)
+ , ("num-id", numId)
+ , ("format", fmt)
+ , ("text", txt)
+ ]
+ blks <- bodyPartToBlocks (Paragraph pPr parparts)
+ return $ [Div ("", ["list-item"], kvs) blks]
+bodyPartToBlocks (Tbl _ _ _ []) =
+ return [Para []]
+bodyPartToBlocks (Tbl cap _ look (r:rs)) = do
let caption = strToInlines cap
(hdr, rows) = case firstRowFormatting look of
True -> (Just r, rs)
False -> (Nothing, r:rs)
- hdrCells = case hdr of
- Just r' -> rowToBlocksList opts docx r'
- Nothing -> []
- cells = map (rowToBlocksList opts docx) rows
+ hdrCells <- case hdr of
+ Just r' -> rowToBlocksList r'
+ Nothing -> return []
- size = case null hdrCells of
+ cells <- mapM rowToBlocksList rows
+
+ let size = case null hdrCells of
True -> length $ head cells
False -> length $ hdrCells
--
@@ -341,208 +503,49 @@ bodyPartToBlock opts docx (Tbl cap _ look (r:rs)) =
-- moment. Width information is in the TblGrid field of the Tbl,
-- so should be possible. Alignment might be more difficult,
-- since there doesn't seem to be a column entity in docx.
- alignments = take size (repeat AlignDefault)
- widths = take size (repeat 0) :: [Double]
- in
- Table caption alignments widths hdrCells cells
-
-makeImagesSelfContained :: Docx -> Inline -> Inline
-makeImagesSelfContained (Docx _ _ _ _ media) i@(Image alt (uri, title)) =
- case lookup uri media of
- Just bs -> case getMimeType uri of
- Just mime -> let data_uri =
- "data:" ++ mime ++ ";base64," ++ toString (encode $ BS.concat $ B.toChunks bs)
- in
- Image alt (data_uri, title)
- Nothing -> i
- Nothing -> i
-makeImagesSelfContained _ inline = inline
-
-bodyToBlocks :: ReaderOptions -> Docx -> Body -> [Block]
-bodyToBlocks opts docx (Body bps) =
- bottomUp removeEmptyPars $
- map blockNormalize $
- bottomUp spanRemove $
- bottomUp divRemove $
- map (makeHeaderAnchors) $
- bottomUp divCorrect $
- bottomUp divReduce $
- bottomUp divCorrectPreReduce $
- bottomUp blocksToDefinitions $
- blocksToBullets $
- map (bodyPartToBlock opts docx) bps
-
-docxToBlocks :: ReaderOptions -> Docx -> [Block]
-docxToBlocks opts d@(Docx (Document _ body) _ _ _ _) = bodyToBlocks opts d body
-
-spanReduce :: [Inline] -> [Inline]
-spanReduce [] = []
-spanReduce ((Span (id1, classes1, kvs1) ils1) : ils)
- | (id1, classes1, kvs1) == ("", [], []) = ils1 ++ (spanReduce ils)
-spanReduce (s1@(Span (id1, classes1, kvs1) ils1) :
- s2@(Span (id2, classes2, kvs2) ils2) :
- ils) =
- let classes' = classes1 `intersect` classes2
- kvs' = kvs1 `intersect` kvs2
- classes1' = classes1 \\ classes'
- kvs1' = kvs1 \\ kvs'
- classes2' = classes2 \\ classes'
- kvs2' = kvs2 \\ kvs'
- in
- case null classes' && null kvs' of
- True -> s1 : (spanReduce (s2 : ils))
- False -> let attr' = ("", classes', kvs')
- attr1' = (id1, classes1', kvs1')
- attr2' = (id2, classes2', kvs2')
- in
- spanReduce (Span attr' [(Span attr1' ils1), (Span attr2' ils2)] :
- ils)
-spanReduce (il:ils) = il : (spanReduce ils)
+ alignments = replicate size AlignDefault
+ widths = replicate size 0 :: [Double]
+
+ return [Table caption alignments widths hdrCells cells]
+bodyPartToBlocks (OMathPara e) = do
+ return [Para [Math DisplayMath (writeTeX e)]]
+
+
+-- replace targets with generated anchors.
+rewriteLink :: Inline -> DocxContext Inline
+rewriteLink l@(Link ils ('#':target, title)) = do
+ anchorMap <- gets docxAnchorMap
+ return $ case M.lookup target anchorMap of
+ Just newTarget -> (Link ils ('#':newTarget, title))
+ Nothing -> l
+rewriteLink il = return il
+
+bodyToOutput :: Body -> DocxContext (Meta, [Block], MediaBag)
+bodyToOutput (Body bps) = do
+ let (metabps, blkbps) = sepBodyParts bps
+ meta <- bodyPartsToMeta metabps
+ blks <- concatMapM bodyPartToBlocks blkbps >>=
+ walkM rewriteLink
+ mediaBag <- gets docxMediaBag
+ return $ (meta,
+ blocksToDefinitions $ blocksToBullets $ blks,
+ mediaBag)
+
+docxToOutput :: ReaderOptions -> Docx -> (Meta, [Block], MediaBag)
+docxToOutput opts (Docx (Document _ body)) =
+ let dEnv = def { docxOptions = opts} in
+ evalDocxContext (bodyToOutput body) dEnv def
+
ilToCode :: Inline -> String
ilToCode (Str s) = s
-ilToCode _ = ""
-
-spanRemove' :: Inline -> [Inline]
-spanRemove' s@(Span (ident, classes, _) [])
- -- "_GoBack" is automatically inserted. We don't want to keep it.
- | classes == ["anchor"] && not (ident `elem` dummyAnchors) = [s]
-spanRemove' (Span (_, _, kvs) ils) =
- case lookup "underline" kvs of
- Just val -> [Span ("", [], [("underline", val)]) ils]
- Nothing -> ils
-spanRemove' il = [il]
-
-spanRemove :: [Inline] -> [Inline]
-spanRemove = concatMap spanRemove'
-
-spanTrim' :: Inline -> [Inline]
-spanTrim' il@(Span _ []) = [il]
-spanTrim' il@(Span attr (il':[]))
- | il' == Space = [Span attr [], Space]
- | otherwise = [il]
-spanTrim' (Span attr ils)
- | head ils == Space && last ils == Space =
- [Space, Span attr (init $ tail ils), Space]
- | head ils == Space = [Space, Span attr (tail ils)]
- | last ils == Space = [Span attr (init ils), Space]
-spanTrim' il = [il]
-
-spanTrim :: [Inline] -> [Inline]
-spanTrim = concatMap spanTrim'
-
-spanCorrect' :: Inline -> [Inline]
-spanCorrect' (Span ("", [], []) ils) = ils
-spanCorrect' (Span (ident, classes, kvs) ils)
- | "emph" `elem` classes =
- [Emph $ spanCorrect' $ Span (ident, (delete "emph" classes), kvs) ils]
- | "strong" `elem` classes =
- [Strong $ spanCorrect' $ Span (ident, (delete "strong" classes), kvs) ils]
- | "smallcaps" `elem` classes =
- [SmallCaps $ spanCorrect' $ Span (ident, (delete "smallcaps" classes), kvs) ils]
- | "strike" `elem` classes =
- [Strikeout $ spanCorrect' $ Span (ident, (delete "strike" classes), kvs) ils]
- | "superscript" `elem` classes =
- [Superscript $ spanCorrect' $ Span (ident, (delete "superscript" classes), kvs) ils]
- | "subscript" `elem` classes =
- [Subscript $ spanCorrect' $ Span (ident, (delete "subscript" classes), kvs) ils]
- | (not . null) (codeSpans `intersect` classes) =
- [Code (ident, (classes \\ codeSpans), kvs) (init $ unlines $ map ilToCode ils)]
- | otherwise =
- [Span (ident, classes, kvs) ils]
-spanCorrect' il = [il]
-
-spanCorrect :: [Inline] -> [Inline]
-spanCorrect = concatMap spanCorrect'
-
-removeEmptyPars :: [Block] -> [Block]
-removeEmptyPars blks = filter (\b -> b /= (Para [])) blks
-
-divReduce :: [Block] -> [Block]
-divReduce [] = []
-divReduce ((Div (id1, classes1, kvs1) blks1) : blks)
- | (id1, classes1, kvs1) == ("", [], []) = blks1 ++ (divReduce blks)
-divReduce (d1@(Div (id1, classes1, kvs1) blks1) :
- d2@(Div (id2, classes2, kvs2) blks2) :
- blks) =
- let classes' = classes1 `intersect` classes2
- kvs' = kvs1 `intersect` kvs2
- classes1' = classes1 \\ classes'
- kvs1' = kvs1 \\ kvs'
- classes2' = classes2 \\ classes'
- kvs2' = kvs2 \\ kvs'
- in
- case null classes' && null kvs' of
- True -> d1 : (divReduce (d2 : blks))
- False -> let attr' = ("", classes', kvs')
- attr1' = (id1, classes1', kvs1')
- attr2' = (id2, classes2', kvs2')
- in
- divReduce (Div attr' [(Div attr1' blks1), (Div attr2' blks2)] :
- blks)
-divReduce (blk:blks) = blk : (divReduce blks)
+ilToCode Space = " "
+ilToCode _ = ""
isHeaderClass :: String -> Maybe Int
-isHeaderClass s | "Heading" `isPrefixOf` s =
- case reads (drop (length "Heading") s) :: [(Int, String)] of
+isHeaderClass s | Just s' <- stripPrefix "Heading" s =
+ case reads s' :: [(Int, String)] of
[] -> Nothing
((n, "") : []) -> Just n
_ -> Nothing
isHeaderClass _ = Nothing
-
-findHeaderClass :: [String] -> Maybe Int
-findHeaderClass ss = case mapMaybe id $ map isHeaderClass ss of
- [] -> Nothing
- n : _ -> Just n
-
-blksToInlines :: [Block] -> [Inline]
-blksToInlines (Para ils : _) = ils
-blksToInlines (Plain ils : _) = ils
-blksToInlines _ = []
-
-divCorrectPreReduce' :: Block -> [Block]
-divCorrectPreReduce' (Div (ident, classes, kvs) blks)
- | isJust $ findHeaderClass classes =
- let n = fromJust $ findHeaderClass classes
- in
- [Header n (ident, delete ("Heading" ++ (show n)) classes, kvs) (blksToInlines blks)]
- | otherwise = [Div (ident, classes, kvs) blks]
-divCorrectPreReduce' blk = [blk]
-
-divCorrectPreReduce :: [Block] -> [Block]
-divCorrectPreReduce = concatMap divCorrectPreReduce'
-
-blkToCode :: Block -> String
-blkToCode (Para []) = ""
-blkToCode (Para ((Code _ s):ils)) = s ++ (blkToCode (Para ils))
-blkToCode (Para ((Span (_, classes, _) ils'): ils))
- | (not . null) (codeSpans `intersect` classes) =
- (init $ unlines $ map ilToCode ils') ++ (blkToCode (Para ils))
-blkToCode _ = ""
-
-divRemove' :: Block -> [Block]
-divRemove' (Div (_, _, kvs) blks) =
- case lookup "indent" kvs of
- Just val -> [Div ("", [], [("indent", val)]) blks]
- Nothing -> blks
-divRemove' blk = [blk]
-
-divRemove :: [Block] -> [Block]
-divRemove = concatMap divRemove'
-
-divCorrect' :: Block -> [Block]
-divCorrect' b@(Div (ident, classes, kvs) blks)
- | (not . null) (blockQuoteDivs `intersect` classes) =
- [BlockQuote [Div (ident, classes \\ blockQuoteDivs, kvs) blks]]
- | (not . null) (codeDivs `intersect` classes) =
- [CodeBlock (ident, (classes \\ codeDivs), kvs) (init $ unlines $ map blkToCode blks)]
- | otherwise =
- case lookup "indent" kvs of
- Just "0" -> [Div (ident, classes, filter (\kv -> fst kv /= "indent") kvs) blks]
- Just _ ->
- [BlockQuote [Div (ident, classes, filter (\kv -> fst kv /= "indent") kvs) blks]]
- Nothing -> [b]
-divCorrect' blk = [blk]
-
-divCorrect :: [Block] -> [Block]
-divCorrect = concatMap divCorrect'