diff options
Diffstat (limited to 'src/Text/Pandoc')
-rw-r--r-- | src/Text/Pandoc/Readers/Docx.hs | 341 |
1 files changed, 199 insertions, 142 deletions
diff --git a/src/Text/Pandoc/Readers/Docx.hs b/src/Text/Pandoc/Readers/Docx.hs index cb0735e31..0607aac7f 100644 --- a/src/Text/Pandoc/Readers/Docx.hs +++ b/src/Text/Pandoc/Readers/Docx.hs @@ -1,3 +1,5 @@ +{-# LANGUAGE PatternGuards #-} + {- Copyright (C) 2014 Jesse Rosenthal <jrosenthal@jhu.edu> @@ -82,12 +84,16 @@ import Text.Pandoc.Walk import Text.Pandoc.Readers.Docx.Parse import Text.Pandoc.Readers.Docx.Lists import Text.Pandoc.Readers.Docx.Reducible -import Data.Maybe (mapMaybe, isJust, fromJust) +import Text.Pandoc.Shared +import Data.Maybe (mapMaybe) import Data.List (delete, isPrefixOf, (\\)) import qualified Data.ByteString as BS import qualified Data.ByteString.Lazy as B import Data.ByteString.Base64 (encode) import System.FilePath (combine) +import qualified Data.Map as M +import Control.Monad.Reader +import Control.Monad.State readDocx :: ReaderOptions -> B.ByteString @@ -97,11 +103,24 @@ readDocx opts bytes = Just docx -> Pandoc nullMeta (docxToBlocks opts docx) Nothing -> error $ "couldn't parse docx file" -spansToKeep :: [String] -spansToKeep = [] +data DState = DState { docxAnchorMap :: M.Map String String } + +data DEnv = DEnv { docxOptions :: ReaderOptions + , docxDocument :: Docx} + +type DocxContext = ReaderT DEnv (State DState) + +evalDocxContext :: DocxContext a -> DEnv -> DState -> a +evalDocxContext ctx env st = evalState (runReaderT ctx env) st + +concatMapM :: (Monad m) => (a -> m [b]) -> [a] -> m [b] +concatMapM f xs = liftM concat (mapM f xs) -- This is empty, but we put it in for future-proofing. +spansToKeep :: [String] +spansToKeep = [] + divsToKeep :: [String] divsToKeep = ["list-item", "Definition", "DefinitionTerm"] @@ -132,11 +151,9 @@ runStyleToContainers rPr = divAttrToContainers :: [String] -> [(String, String)] -> [Container Block] -divAttrToContainers (c:cs) _ | isJust (isHeaderClass c) = - let n = fromJust (isHeaderClass c) - in - [(Container $ \blks -> - Header n ("", delete ("Heading" ++ show n) cs, []) (blksToInlines blks))] +divAttrToContainers (c:cs) _ | Just n <- isHeaderClass c = + [Container $ \_ -> + Header n ("", delete ("Heading" ++ show n) cs, []) []] divAttrToContainers (c:cs) kvs | c `elem` divsToKeep = (Container $ Div ("", [c], [])) : (divAttrToContainers cs kvs) divAttrToContainers (c:cs) kvs | c `elem` codeDivs = @@ -150,10 +167,10 @@ divAttrToContainers (c:cs) kvs | c `elem` listParagraphDivs = divAttrToContainers (c:cs) kvs | c `elem` blockQuoteDivs = (Container BlockQuote) : (divAttrToContainers (cs \\ blockQuoteDivs) kvs) divAttrToContainers (_:cs) kvs = divAttrToContainers cs kvs -divAttrToContainers [] kvs | isJust (lookup "indent" kvs) = +divAttrToContainers [] kvs | Just numString <- lookup "indent" kvs = let kvs' = filter (\(k,_) -> k /= "indent") kvs in - case fromJust (lookup "indent" kvs) of + case numString of "0" -> divAttrToContainers [] kvs' ('-' : _) -> divAttrToContainers [] kvs' _ -> (Container BlockQuote) : divAttrToContainers [] kvs' @@ -213,57 +230,84 @@ inlineCodeContainer (Container f) = case f [] of inlineCodeContainer _ = False -runToInlines :: ReaderOptions -> Docx -> Run -> [Inline] -runToInlines _ _ (Run rs runElems) +runToInlines :: Run -> DocxContext [Inline] +runToInlines (Run rs runElems) | any inlineCodeContainer (runStyleToContainers rs) = + return $ rebuild (runStyleToContainers rs) $ [Str $ runElemsToString runElems] | otherwise = + return $ rebuild (runStyleToContainers rs) (concatMap runElemToInlines runElems) -runToInlines opts docx@(Docx _ notes _ _ _ ) (Footnote fnId) = +runToInlines (Footnote fnId) = do + (Docx _ notes _ _ _ ) <- asks docxDocument case (getFootNote fnId notes) of - Just bodyParts -> - [Note (concatMap (bodyPartToBlocks opts docx) bodyParts)] - Nothing -> - [Note []] -runToInlines opts docx@(Docx _ notes _ _ _) (Endnote fnId) = + Just bodyParts -> do + blks <- concatMapM bodyPartToBlocks bodyParts + return $ [Note blks] + Nothing -> return [Note []] +runToInlines (Endnote fnId) = do + (Docx _ notes _ _ _ ) <- asks docxDocument case (getEndNote fnId notes) of - Just bodyParts -> - [Note (concatMap (bodyPartToBlocks opts docx) bodyParts)] - Nothing -> - [Note []] - -parPartToInlines :: ReaderOptions -> Docx -> ParPart -> [Inline] -parPartToInlines opts docx (PlainRun r) = runToInlines opts docx r -parPartToInlines opts docx (Insertion _ author date runs) = + Just bodyParts -> do + blks <- concatMapM bodyPartToBlocks bodyParts + return $ [Note blks] + Nothing -> return [Note []] + +parPartToInlines :: ParPart -> DocxContext [Inline] +parPartToInlines (PlainRun r) = runToInlines r +parPartToInlines (Insertion _ author date runs) = do + opts <- asks docxOptions case readerTrackChanges opts of - AcceptChanges -> concatMap (runToInlines opts docx) runs - RejectChanges -> [] - AllChanges -> - [Span - ("", ["insertion"], [("author", author), ("date", date)]) - (concatMap (runToInlines opts docx) runs)] -parPartToInlines opts docx (Deletion _ author date runs) = + AcceptChanges -> concatMapM runToInlines runs >>= return + RejectChanges -> return [] + AllChanges -> do + ils <- (concatMapM runToInlines runs) + return [Span + ("", ["insertion"], [("author", author), ("date", date)]) + ils] +parPartToInlines (Deletion _ author date runs) = do + opts <- asks docxOptions case readerTrackChanges opts of - AcceptChanges -> [] - RejectChanges -> concatMap (runToInlines opts docx) runs - AllChanges -> - [Span - ("", ["deletion"], [("author", author), ("date", date)]) - (concatMap (runToInlines opts docx) runs)] -parPartToInlines _ _ (BookMark _ anchor) | anchor `elem` dummyAnchors = [] -parPartToInlines _ _ (BookMark _ anchor) = [Span (anchor, ["anchor"], []) []] -parPartToInlines _ (Docx _ _ _ rels _) (Drawing relid) = - case lookupRelationship relid rels of + AcceptChanges -> return [] + RejectChanges -> concatMapM runToInlines runs >>= return + AllChanges -> do + ils <- concatMapM runToInlines runs + return [Span + ("", ["deletion"], [("author", author), ("date", date)]) + ils] +parPartToInlines (BookMark _ anchor) | anchor `elem` dummyAnchors = return [] +parPartToInlines (BookMark _ anchor) = + -- We record these, so we can make sure not to overwrite + -- user-defined anchor links with header auto ids. + do + -- Get the anchor map. + anchorMap <- gets docxAnchorMap + -- Check to see if the id is already in there. Rewrite if + -- necessary. This will have the possible effect of rewriting + -- user-defined anchor links. However, since these are not defined + -- in pandoc, it seems like a necessary evil to avoid an extra + -- pass. + let newAnchor = case anchor `elem` (M.elems anchorMap) of + True -> uniqueIdent [Str anchor] (M.elems anchorMap) + False -> anchor + put DState{ docxAnchorMap = M.insert anchor newAnchor anchorMap} + return [Span (anchor, ["anchor"], []) []] +parPartToInlines (Drawing relid) = do + (Docx _ _ _ rels _) <- asks docxDocument + return $ case lookupRelationship relid rels of Just target -> [Image [] (combine "word" target, "")] Nothing -> [Image [] ("", "")] -parPartToInlines opts docx (InternalHyperLink anchor runs) = - [Link (concatMap (runToInlines opts docx) runs) ('#' : anchor, "")] -parPartToInlines opts docx@(Docx _ _ _ rels _) (ExternalHyperLink relid runs) = - case lookupRelationship relid rels of +parPartToInlines (InternalHyperLink anchor runs) = do + ils <- concatMapM runToInlines runs + return [Link ils ('#' : anchor, "")] +parPartToInlines (ExternalHyperLink relid runs) = do + (Docx _ _ _ rels _) <- asks docxDocument + rs <- concatMapM runToInlines runs + return $ case lookupRelationship relid rels of Just target -> - [Link (concatMap (runToInlines opts docx) runs) (target, "")] + [Link rs (target, "")] Nothing -> - [Link (concatMap (runToInlines opts docx) runs) ("", "")] + [Link rs ("", "")] isAnchorSpan :: Inline -> Bool isAnchorSpan (Span (ident, classes, kvs) ils) = @@ -276,64 +320,68 @@ isAnchorSpan _ = False dummyAnchors :: [String] dummyAnchors = ["_GoBack"] -makeHeaderAnchors :: Block -> Block -makeHeaderAnchors h@(Header n (_, classes, kvs) ils) = - case filter isAnchorSpan ils of - [] -> h - (x@(Span (ident, _, _) _) : xs) -> - case ident `elem` dummyAnchors of - True -> h - False -> Header n (ident, classes, kvs) (ils \\ (x:xs)) - _ -> h -makeHeaderAnchors blk = blk - -parPartsToInlines :: ReaderOptions -> Docx -> [ParPart] -> [Inline] -parPartsToInlines opts docx parparts = - -- - -- We're going to skip data-uri's for now. It should be an option, - -- not mandatory. - -- - (if False -- TODO depend on option - then walk (makeImagesSelfContained docx) - else id) $ - -- bottomUp spanTrim $ - -- bottomUp spanCorrect $ - -- bottomUp spanReduce $ - reduceList $ concatMap (parPartToInlines opts docx) parparts - -cellToBlocks :: ReaderOptions -> Docx -> Cell -> [Block] -cellToBlocks opts docx (Cell bps) = concatMap (bodyPartToBlocks opts docx) bps - -rowToBlocksList :: ReaderOptions -> Docx -> Row -> [[Block]] -rowToBlocksList opts docx (Row cells) = map (cellToBlocks opts docx) cells - -blockCodeContainer :: Container Block -> Bool -blockCodeContainer (Container f) = case f [] of - CodeBlock _ _ -> True - _ -> False -blockCodeContainer _ = False - -bodyPartToBlocks :: ReaderOptions -> Docx -> BodyPart -> [Block] -bodyPartToBlocks _ _ (Paragraph pPr parparts) - | any blockCodeContainer (parStyleToContainers pPr) = +makeHeaderAnchor :: Block -> DocxContext Block +makeHeaderAnchor (Header n (_, classes, kvs) ils) + | (x : xs) <- filter isAnchorSpan ils + , (Span (ident, _, _) _) <- x + , notElem ident dummyAnchors = + do + hdrIDMap <- gets docxAnchorMap + let newIdent = uniqueIdent ils (M.elems hdrIDMap) + put DState{docxAnchorMap = M.insert ident newIdent hdrIDMap} + return $ Header n (newIdent, classes, kvs) (ils \\ (x:xs)) +makeHeaderAnchor blk = return blk + + +parPartsToInlines :: [ParPart] -> DocxContext [Inline] +parPartsToInlines parparts = do + ils <- concatMapM parPartToInlines parparts >>= + -- TODO: Option for self-containted images + (if False then (walkM makeImagesSelfContained) else return) + return $ reduceList $ ils + +cellToBlocks :: Cell -> DocxContext [Block] +cellToBlocks (Cell bps) = concatMapM bodyPartToBlocks bps + +rowToBlocksList :: Row -> DocxContext [[Block]] +rowToBlocksList (Row cells) = mapM cellToBlocks cells + +isBlockCodeContainer :: Container Block -> Bool +isBlockCodeContainer (Container f) | CodeBlock _ _ <- f [] = True +isBlockCodeContainer _ = False + +isHeaderContainer :: Container Block -> Bool +isHeaderContainer (Container f) | Header _ _ _ <- f [] = True +isHeaderContainer _ = False + +bodyPartToBlocks :: BodyPart -> DocxContext [Block] +bodyPartToBlocks (Paragraph pPr parparts) + | any isBlockCodeContainer (parStyleToContainers pPr) = let - otherConts = filter (not . blockCodeContainer) (parStyleToContainers pPr) + otherConts = filter (not . isBlockCodeContainer) (parStyleToContainers pPr) in + return $ rebuild otherConts [CodeBlock ("", [], []) (concatMap parPartToString parparts)] -bodyPartToBlocks opts docx (Paragraph pPr parparts) = - case parPartsToInlines opts docx parparts of - [] -> - [] - _ -> - let parContents = parPartsToInlines opts docx parparts - trimmedContents = reverse $ dropWhile (Space ==) $ reverse $ dropWhile (Space ==) parContents - in +bodyPartToBlocks (Paragraph pPr parparts) + | any isHeaderContainer (parStyleToContainers pPr) = do + ils <- parPartsToInlines parparts >>= (return . normalizeSpaces) + let (Container hdrFun) = head $ filter isHeaderContainer (parStyleToContainers pPr) + Header n attr _ = hdrFun [] + hdr <- makeHeaderAnchor $ Header n attr ils + return [hdr] +bodyPartToBlocks (Paragraph pPr parparts) = do + ils <- parPartsToInlines parparts >>= (return . normalizeSpaces) + case ils of + [] -> return [] + _ -> do + return $ rebuild (parStyleToContainers pPr) - [Para trimmedContents] -bodyPartToBlocks opts docx@(Docx _ _ numbering _ _) (ListItem pPr numId lvl parparts) = + [Para ils] +bodyPartToBlocks (ListItem pPr numId lvl parparts) = do + (Docx _ _ numbering _ _) <- asks docxDocument let kvs = case lookupLevel numId lvl numbering of Just (_, fmt, txt, Just start) -> [ ("level", lvl) @@ -349,23 +397,22 @@ bodyPartToBlocks opts docx@(Docx _ _ numbering _ _) (ListItem pPr numId lvl parp , ("text", txt) ] Nothing -> [] - in - [Div - ("", ["list-item"], kvs) - (bodyPartToBlocks opts docx (Paragraph pPr parparts))] -bodyPartToBlocks _ _ (Tbl _ _ _ []) = - [Para []] -bodyPartToBlocks opts docx (Tbl cap _ look (r:rs)) = + blks <- bodyPartToBlocks (Paragraph pPr parparts) + return $ [Div ("", ["list-item"], kvs) blks] +bodyPartToBlocks (Tbl _ _ _ []) = + return [Para []] +bodyPartToBlocks (Tbl cap _ look (r:rs)) = do let caption = strToInlines cap (hdr, rows) = case firstRowFormatting look of True -> (Just r, rs) False -> (Nothing, r:rs) - hdrCells = case hdr of - Just r' -> rowToBlocksList opts docx r' - Nothing -> [] - cells = map (rowToBlocksList opts docx) rows + hdrCells <- case hdr of + Just r' -> rowToBlocksList r' + Nothing -> return [] + + cells <- mapM rowToBlocksList rows - size = case null hdrCells of + let size = case null hdrCells of True -> length $ head cells False -> length $ hdrCells -- @@ -374,41 +421,56 @@ bodyPartToBlocks opts docx (Tbl cap _ look (r:rs)) = -- moment. Width information is in the TblGrid field of the Tbl, -- so should be possible. Alignment might be more difficult, -- since there doesn't seem to be a column entity in docx. - alignments = take size (repeat AlignDefault) - widths = take size (repeat 0) :: [Double] - in - [Table caption alignments widths hdrCells cells] - - -makeImagesSelfContained :: Docx -> Inline -> Inline -makeImagesSelfContained (Docx _ _ _ _ media) i@(Image alt (uri, title)) = - case lookup uri media of - Just bs -> case getMimeType uri of - Just mime -> let data_uri = - "data:" ++ mime ++ ";base64," ++ toString (encode $ BS.concat $ B.toChunks bs) - in - Image alt (data_uri, title) - Nothing -> i + alignments = replicate size AlignDefault + widths = replicate size 0 :: [Double] + + return [Table caption alignments widths hdrCells cells] + +-- replace targets with generated anchors. +rewriteLink :: Inline -> DocxContext Inline +rewriteLink l@(Link ils ('#':target, title)) = do + anchorMap <- gets docxAnchorMap + return $ case M.lookup target anchorMap of + Just newTarget -> (Link ils ('#':newTarget, title)) + Nothing -> l +rewriteLink il = return il + +makeImagesSelfContained :: Inline -> DocxContext Inline +makeImagesSelfContained i@(Image alt (uri, title)) = do + (Docx _ _ _ _ media) <- asks docxDocument + return $ case lookup uri media of + Just bs -> + case getMimeType uri of + Just mime -> + let data_uri = "data:" ++ mime ++ ";base64," ++ + toString (encode $ BS.concat $ B.toChunks bs) + in + Image alt (data_uri, title) + Nothing -> i Nothing -> i -makeImagesSelfContained _ inline = inline +makeImagesSelfContained inline = return inline -bodyToBlocks :: ReaderOptions -> Docx -> Body -> [Block] -bodyToBlocks opts docx (Body bps) = - map (makeHeaderAnchors) $ - blocksToDefinitions $ - blocksToBullets $ - concatMap (bodyPartToBlocks opts docx) bps +bodyToBlocks :: Body -> DocxContext [Block] +bodyToBlocks (Body bps) = do + blks <- concatMapM bodyPartToBlocks bps >>= + walkM rewriteLink + return $ + blocksToDefinitions $ + blocksToBullets $ blks docxToBlocks :: ReaderOptions -> Docx -> [Block] -docxToBlocks opts d@(Docx (Document _ body) _ _ _ _) = bodyToBlocks opts d body - +docxToBlocks opts d@(Docx (Document _ body) _ _ _ _) = + let dState = DState { docxAnchorMap = M.empty } + dEnv = DEnv { docxOptions = opts + , docxDocument = d} + in + evalDocxContext (bodyToBlocks body) dEnv dState ilToCode :: Inline -> String ilToCode (Str s) = s ilToCode Space = " " ilToCode _ = "" - isHeaderClass :: String -> Maybe Int isHeaderClass s | "Heading" `isPrefixOf` s = case reads (drop (length "Heading") s) :: [(Int, String)] of @@ -416,8 +478,3 @@ isHeaderClass s | "Heading" `isPrefixOf` s = ((n, "") : []) -> Just n _ -> Nothing isHeaderClass _ = Nothing - -blksToInlines :: [Block] -> [Inline] -blksToInlines (Para ils : _) = ils -blksToInlines (Plain ils : _) = ils -blksToInlines _ = [] |