diff options
| author | Jesse Rosenthal <jrosenthal@jhu.edu> | 2014-06-26 16:48:41 -0400 | 
|---|---|---|
| committer | Jesse Rosenthal <jrosenthal@jhu.edu> | 2014-06-28 04:00:16 -0400 | 
| commit | 4248f25152d5715ad99f9d8dda8bf83f33f650ff (patch) | |
| tree | 3628de4d30cce6e4ee027ebbc29519a2c38a0d44 | |
| parent | b1a8f1fa1ad8a6083f0e00cf786eaeff5f10c3be (diff) | |
| download | pandoc-4248f25152d5715ad99f9d8dda8bf83f33f650ff.tar.gz | |
Move Docx reader to DocxContext monad
This is a ReaderT State stack, which keeps track of some environment info, such
as the options and the docx doc. The state will come in handy in the future,
for a couple of planned features (rewriting the section anchors as auto_idents,
and hopefully smart-quoting).
| -rw-r--r-- | src/Text/Pandoc/Readers/Docx.hs | 247 | 
1 files changed, 140 insertions, 107 deletions
| diff --git a/src/Text/Pandoc/Readers/Docx.hs b/src/Text/Pandoc/Readers/Docx.hs index cb0735e31..5773027f2 100644 --- a/src/Text/Pandoc/Readers/Docx.hs +++ b/src/Text/Pandoc/Readers/Docx.hs @@ -88,6 +88,9 @@ import qualified Data.ByteString as BS  import qualified Data.ByteString.Lazy as B  import Data.ByteString.Base64 (encode)  import System.FilePath (combine) +import qualified Data.Map as M +import Control.Monad.Reader +import Control.Monad.State  readDocx :: ReaderOptions           -> B.ByteString @@ -97,11 +100,24 @@ readDocx opts bytes =      Just docx -> Pandoc nullMeta (docxToBlocks opts docx)      Nothing   -> error $ "couldn't parse docx file" -spansToKeep :: [String] -spansToKeep = [] +data DState = DState { docxHdrLinks :: M.Map String String } + +data DEnv = DEnv { docxOptions  :: ReaderOptions +                 , docxDocument :: Docx} + +type DocxContext = ReaderT DEnv (State DState) + +evalDocxContext :: DocxContext a -> DEnv -> DState -> a +evalDocxContext ctx env st = evalState (runReaderT ctx env) st + +concatMapM        :: (Monad m) => (a -> m [b]) -> [a] -> m [b] +concatMapM f xs   =  liftM concat (mapM f xs)  -- This is empty, but we put it in for future-proofing. +spansToKeep :: [String] +spansToKeep = [] +  divsToKeep :: [String]  divsToKeep = ["list-item", "Definition", "DefinitionTerm"] @@ -213,57 +229,69 @@ inlineCodeContainer (Container f) = case f [] of  inlineCodeContainer _ = False -runToInlines :: ReaderOptions -> Docx -> Run -> [Inline] -runToInlines _ _ (Run rs runElems) +runToInlines :: Run -> DocxContext [Inline] +runToInlines (Run rs runElems)    | any inlineCodeContainer (runStyleToContainers rs) = +      return $        rebuild (runStyleToContainers rs) $ [Str $ runElemsToString runElems]    | otherwise = +      return $        rebuild (runStyleToContainers rs) (concatMap runElemToInlines runElems) -runToInlines opts docx@(Docx _ notes _ _ _ ) (Footnote fnId) = +runToInlines (Footnote fnId) = do +  (Docx _ notes _ _ _ ) <- asks docxDocument    case (getFootNote fnId notes) of -    Just bodyParts -> -      [Note (concatMap (bodyPartToBlocks opts docx) bodyParts)] -    Nothing        -> -      [Note []] -runToInlines opts docx@(Docx _ notes _ _ _) (Endnote fnId) = +    Just bodyParts -> do +      blks <- concatMapM bodyPartToBlocks bodyParts +      return $ [Note blks] +    Nothing        -> return [Note []] +runToInlines (Endnote fnId) = do +  (Docx _ notes _ _ _ ) <- asks docxDocument    case (getEndNote fnId notes) of -    Just bodyParts -> -      [Note (concatMap (bodyPartToBlocks opts docx) bodyParts)] -    Nothing        -> -      [Note []] - -parPartToInlines :: ReaderOptions -> Docx -> ParPart -> [Inline] -parPartToInlines opts docx (PlainRun r) = runToInlines opts docx r -parPartToInlines opts docx (Insertion _ author date runs) = +    Just bodyParts -> do +      blks <- concatMapM bodyPartToBlocks bodyParts +      return $ [Note blks] +    Nothing        -> return [Note []] + +parPartToInlines :: ParPart -> DocxContext [Inline] +parPartToInlines (PlainRun r) = runToInlines r +parPartToInlines (Insertion _ author date runs) = do +  opts <- asks docxOptions    case readerTrackChanges opts of -    AcceptChanges -> concatMap (runToInlines opts docx) runs -    RejectChanges -> [] -    AllChanges    -> -      [Span -       ("", ["insertion"], [("author", author), ("date", date)]) -       (concatMap (runToInlines opts docx) runs)] -parPartToInlines opts docx (Deletion _ author date runs) = +    AcceptChanges -> concatMapM runToInlines runs >>= return +    RejectChanges -> return [] +    AllChanges    -> do +      ils <- (concatMapM runToInlines runs) +      return [Span +              ("", ["insertion"], [("author", author), ("date", date)]) +              ils] +parPartToInlines (Deletion _ author date runs) = do +  opts <- asks docxOptions    case readerTrackChanges opts of -    AcceptChanges -> [] -    RejectChanges -> concatMap (runToInlines opts docx) runs -    AllChanges    -> -      [Span -       ("", ["deletion"], [("author", author), ("date", date)]) -       (concatMap (runToInlines opts docx) runs)] -parPartToInlines _ _ (BookMark _ anchor) | anchor `elem` dummyAnchors = [] -parPartToInlines _ _ (BookMark _ anchor) = [Span (anchor, ["anchor"], []) []] -parPartToInlines _ (Docx _ _ _ rels _) (Drawing relid) = -  case lookupRelationship relid rels of +    AcceptChanges -> return [] +    RejectChanges -> concatMapM runToInlines runs >>= return +    AllChanges    -> do +      ils <- concatMapM runToInlines runs +      return [Span +              ("", ["deletion"], [("author", author), ("date", date)]) +              ils] +parPartToInlines (BookMark _ anchor) | anchor `elem` dummyAnchors = return [] +parPartToInlines (BookMark _ anchor) = return [Span (anchor, ["anchor"], []) []] +parPartToInlines (Drawing relid) = do +  (Docx _ _ _ rels _) <- asks docxDocument +  return $ case lookupRelationship relid rels of      Just target -> [Image [] (combine "word" target, "")]      Nothing     -> [Image [] ("", "")] -parPartToInlines opts docx (InternalHyperLink anchor runs) = -  [Link (concatMap (runToInlines opts docx) runs) ('#' : anchor, "")] -parPartToInlines opts docx@(Docx _ _ _ rels _) (ExternalHyperLink relid runs) = -  case lookupRelationship relid rels of +parPartToInlines (InternalHyperLink anchor runs) = do +  ils <- concatMapM runToInlines runs +  return [Link ils ('#' : anchor, "")] +parPartToInlines (ExternalHyperLink relid runs) = do +  (Docx _ _ _ rels _) <- asks docxDocument +  rs <- concatMapM runToInlines runs +  return $ case lookupRelationship relid rels of      Just target -> -      [Link (concatMap (runToInlines opts docx) runs) (target, "")] +      [Link rs (target, "")]      Nothing -> -      [Link (concatMap (runToInlines opts docx) runs) ("", "")] +      [Link rs ("", "")]  isAnchorSpan :: Inline -> Bool  isAnchorSpan (Span (ident, classes, kvs) ils) = @@ -287,25 +315,18 @@ makeHeaderAnchors h@(Header n (_, classes, kvs) ils) =      _ -> h  makeHeaderAnchors blk = blk -parPartsToInlines :: ReaderOptions -> Docx -> [ParPart] -> [Inline] -parPartsToInlines opts docx parparts = -  -- -  -- We're going to skip data-uri's for now. It should be an option, -  -- not mandatory. -  -- -  (if False -- TODO depend on option -      then walk (makeImagesSelfContained docx) -      else id) $ -  -- bottomUp spanTrim $ -  -- bottomUp spanCorrect $ -  -- bottomUp spanReduce $ -  reduceList $ concatMap (parPartToInlines opts docx) parparts - -cellToBlocks :: ReaderOptions -> Docx -> Cell -> [Block] -cellToBlocks opts docx (Cell bps) = concatMap (bodyPartToBlocks opts docx) bps - -rowToBlocksList :: ReaderOptions -> Docx -> Row -> [[Block]] -rowToBlocksList opts docx (Row cells) = map (cellToBlocks opts docx) cells +parPartsToInlines :: [ParPart] -> DocxContext [Inline] +parPartsToInlines parparts = do +  ils <- concatMapM parPartToInlines parparts >>= +         --  TODO: Option for self-containted images +         (if False then (walkM makeImagesSelfContained) else return) +  return $ reduceList $ ils + +cellToBlocks :: Cell -> DocxContext [Block] +cellToBlocks (Cell bps) = concatMapM bodyPartToBlocks bps + +rowToBlocksList :: Row -> DocxContext [[Block]] +rowToBlocksList (Row cells) = mapM cellToBlocks cells  blockCodeContainer :: Container Block -> Bool  blockCodeContainer (Container f) = case f [] of @@ -313,27 +334,32 @@ blockCodeContainer (Container f) = case f [] of    _             -> False  blockCodeContainer _ = False -bodyPartToBlocks :: ReaderOptions -> Docx -> BodyPart -> [Block] -bodyPartToBlocks _ _ (Paragraph pPr parparts) +bodyPartToBlocks :: BodyPart -> DocxContext [Block] +bodyPartToBlocks (Paragraph pPr parparts)    | any blockCodeContainer (parStyleToContainers pPr) =      let        otherConts = filter (not . blockCodeContainer) (parStyleToContainers pPr)      in +     return $        rebuild       otherConts       [CodeBlock ("", [], []) (concatMap parPartToString parparts)] -bodyPartToBlocks opts docx (Paragraph pPr parparts) = -  case parPartsToInlines opts docx parparts of -    [] -> -      [] -    _ -> -      let parContents = parPartsToInlines opts docx parparts -          trimmedContents = reverse $ dropWhile (Space ==) $ reverse $ dropWhile (Space ==) parContents -      in +bodyPartToBlocks (Paragraph pPr parparts) = do +  ils <- parPartsToInlines parparts +  case ils of +    [] -> return [] +    _ -> do +      parContents <- parPartsToInlines parparts +      let trimmedContents = reverse $ +                            dropWhile (Space ==) $ +                            reverse $ +                            dropWhile (Space ==) parContents +      return $         rebuild         (parStyleToContainers pPr)         [Para trimmedContents] -bodyPartToBlocks opts docx@(Docx _ _ numbering _ _) (ListItem pPr numId lvl parparts) = +bodyPartToBlocks (ListItem pPr numId lvl parparts) = do +  (Docx _ _ numbering _ _) <- asks docxDocument    let      kvs = case lookupLevel numId lvl numbering of        Just (_, fmt, txt, Just start) -> [ ("level", lvl) @@ -349,23 +375,22 @@ bodyPartToBlocks opts docx@(Docx _ _ numbering _ _) (ListItem pPr numId lvl parp                                          , ("text", txt)                                          ]        Nothing                        -> [] -  in -   [Div -    ("", ["list-item"], kvs) -    (bodyPartToBlocks opts docx (Paragraph pPr parparts))] -bodyPartToBlocks _ _ (Tbl _ _ _ []) = -  [Para []] -bodyPartToBlocks opts docx (Tbl cap _ look (r:rs)) = +  blks <- bodyPartToBlocks (Paragraph pPr parparts) +  return $ [Div ("", ["list-item"], kvs) blks] +bodyPartToBlocks (Tbl _ _ _ []) = +  return [Para []] +bodyPartToBlocks (Tbl cap _ look (r:rs)) = do    let caption = strToInlines cap        (hdr, rows) = case firstRowFormatting look of          True -> (Just r, rs)          False -> (Nothing, r:rs) -      hdrCells = case hdr of -        Just r' -> rowToBlocksList opts docx r' -        Nothing -> [] -      cells = map (rowToBlocksList opts docx) rows +  hdrCells <- case hdr of +    Just r' -> rowToBlocksList r' +    Nothing -> return [] +         +  cells <- mapM rowToBlocksList rows -      size = case null hdrCells of +  let size = case null hdrCells of          True -> length $ head cells          False -> length $ hdrCells        -- @@ -374,34 +399,42 @@ bodyPartToBlocks opts docx (Tbl cap _ look (r:rs)) =        -- moment. Width information is in the TblGrid field of the Tbl,        -- so should be possible. Alignment might be more difficult,        -- since there doesn't seem to be a column entity in docx. -      alignments = take size (repeat AlignDefault) -      widths = take size (repeat 0) :: [Double] -  in -   [Table caption alignments widths hdrCells cells] - - -makeImagesSelfContained :: Docx -> Inline -> Inline -makeImagesSelfContained (Docx _ _ _ _ media) i@(Image alt (uri, title)) = -  case lookup uri media of -    Just bs -> case getMimeType uri of -      Just mime ->  let data_uri = -                          "data:" ++ mime ++ ";base64," ++ toString (encode $ BS.concat $ B.toChunks bs) -                    in -                     Image alt (data_uri, title) -      Nothing  -> i +      alignments = replicate size AlignDefault +      widths = replicate size 0 :: [Double] + +  return [Table caption alignments widths hdrCells cells] + + +makeImagesSelfContained :: Inline -> DocxContext Inline +makeImagesSelfContained i@(Image alt (uri, title)) = do +  (Docx _ _ _ _ media) <- asks docxDocument +  return $ case lookup uri media of +    Just bs -> +      case getMimeType uri of +        Just mime -> +          let data_uri = "data:" ++ mime ++ ";base64," ++ +                         toString (encode $ BS.concat $ B.toChunks bs) +          in +           Image alt (data_uri, title) +        Nothing  -> i      Nothing -> i -makeImagesSelfContained _ inline = inline +makeImagesSelfContained inline = return inline -bodyToBlocks :: ReaderOptions -> Docx -> Body -> [Block] -bodyToBlocks opts docx (Body bps) = -  map (makeHeaderAnchors) $ -  blocksToDefinitions $ -  blocksToBullets $ -  concatMap (bodyPartToBlocks opts docx) bps +bodyToBlocks :: Body -> DocxContext [Block] +bodyToBlocks (Body bps) = do +  blks <- concatMapM bodyPartToBlocks bps +  return $ +    map (makeHeaderAnchors) $ +    blocksToDefinitions $ +    blocksToBullets $ blks  docxToBlocks :: ReaderOptions -> Docx -> [Block] -docxToBlocks opts d@(Docx (Document _ body) _ _ _ _) = bodyToBlocks opts d body - +docxToBlocks opts d@(Docx (Document _ body) _ _ _ _) = +  let dState = DState { docxHdrLinks = M.empty } +      dEnv   = DEnv { docxOptions  = opts +                    , docxDocument = d} +  in +   evalDocxContext (bodyToBlocks body) dEnv dState  ilToCode :: Inline -> String  ilToCode (Str s) = s | 
