diff options
Diffstat (limited to 'src/Text/Pandoc/Readers/Docx/Parse.hs')
-rw-r--r-- | src/Text/Pandoc/Readers/Docx/Parse.hs | 107 |
1 files changed, 64 insertions, 43 deletions
diff --git a/src/Text/Pandoc/Readers/Docx/Parse.hs b/src/Text/Pandoc/Readers/Docx/Parse.hs index 432965d49..eec8b12c9 100644 --- a/src/Text/Pandoc/Readers/Docx/Parse.hs +++ b/src/Text/Pandoc/Readers/Docx/Parse.hs @@ -35,6 +35,7 @@ module Text.Pandoc.Readers.Docx.Parse ( Docx(..) , Body(..) , BodyPart(..) , TblLook(..) + , Extent , ParPart(..) , Run(..) , RunElem(..) @@ -62,6 +63,7 @@ import Control.Monad.Reader import Control.Applicative ((<|>)) import qualified Data.Map as M import Text.Pandoc.Compat.Except +import Text.Pandoc.Shared (safeRead) import Text.TeXMath.Readers.OMML (readOMML) import Text.Pandoc.Readers.Docx.Fonts (getUnicode, Font(..)) import Text.TeXMath (Exp) @@ -75,6 +77,7 @@ data ReaderEnv = ReaderEnv { envNotes :: Notes , envFont :: Maybe Font , envCharStyles :: CharStyleMap , envParStyles :: ParStyleMap + , envLocation :: DocumentLocation } deriving Show @@ -87,7 +90,7 @@ instance Error DocxError where type D = ExceptT DocxError (Reader ReaderEnv) runD :: D a -> ReaderEnv -> Either DocxError a -runD dx re = runReader (runExceptT dx ) re +runD dx re = runReader (runExceptT dx) re maybeToD :: Maybe a -> D a maybeToD (Just a) = return a @@ -140,7 +143,10 @@ data AbstractNumb = AbstractNumb String [Level] -- (ilvl, format, string, start) type Level = (String, String, String, Maybe Integer) -data Relationship = Relationship (RelId, Target) +data DocumentLocation = InDocument | InFootnote | InEndnote + deriving (Eq,Show) + +data Relationship = Relationship DocumentLocation RelId Target deriving Show data Notes = Notes NameSpaces @@ -173,7 +179,7 @@ defaultParagraphStyle = ParagraphStyle { pStyle = [] data BodyPart = Paragraph ParagraphStyle [ParPart] - | ListItem ParagraphStyle String String Level [ParPart] + | ListItem ParagraphStyle String String (Maybe Level) [ParPart] | Tbl String TblGrid TblLook [Row] | OMathPara [Exp] deriving Show @@ -192,20 +198,23 @@ data Row = Row [Cell] data Cell = Cell [BodyPart] deriving Show +-- (width, height) in EMUs +type Extent = Maybe (Double, Double) + data ParPart = PlainRun Run | Insertion ChangeId Author ChangeDate [Run] | Deletion ChangeId Author ChangeDate [Run] | BookMark BookMarkId Anchor | InternalHyperLink Anchor [Run] | ExternalHyperLink URL [Run] - | Drawing FilePath B.ByteString + | Drawing FilePath B.ByteString Extent | PlainOMath [Exp] deriving Show data Run = Run RunStyle [RunElem] | Footnote [BodyPart] | Endnote [BodyPart] - | InlineDrawing FilePath B.ByteString + | InlineDrawing FilePath B.ByteString Extent deriving Show data RunElem = TextRun String | LnBrk | Tab | SoftHyphen | NoBreakHyphen @@ -238,7 +247,6 @@ defaultRunStyle = RunStyle { isBold = Nothing , rUnderline = Nothing , rStyle = Nothing} - type Target = String type Anchor = String type URL = String @@ -255,7 +263,8 @@ archiveToDocx archive = do rels = archiveToRelationships archive media = archiveToMedia archive (styles, parstyles) = archiveToStyles archive - rEnv = ReaderEnv notes numbering rels media Nothing styles parstyles + rEnv = + ReaderEnv notes numbering rels media Nothing styles parstyles InDocument doc <- runD (archiveToDocument archive) rEnv return $ Docx doc @@ -362,29 +371,30 @@ archiveToNotes zf = in Notes ns fn en -filePathIsRel :: FilePath -> Bool -filePathIsRel fp = - let (dir, name) = splitFileName fp - in - (dir == "word/_rels/") && ((takeExtension name) == ".rels") +filePathToRelType :: FilePath -> Maybe DocumentLocation +filePathToRelType "word/_rels/document.xml.rels" = Just InDocument +filePathToRelType "word/_rels/footnotes.xml.rels" = Just InFootnote +filePathToRelType "word/_rels/endnotes.xml.rels" = Just InEndnote +filePathToRelType _ = Nothing -relElemToRelationship :: Element -> Maybe Relationship -relElemToRelationship element | qName (elName element) == "Relationship" = +relElemToRelationship :: DocumentLocation -> Element -> Maybe Relationship +relElemToRelationship relType element | qName (elName element) == "Relationship" = do relId <- findAttr (QName "Id" Nothing Nothing) element target <- findAttr (QName "Target" Nothing Nothing) element - return $ Relationship (relId, target) -relElemToRelationship _ = Nothing - - + return $ Relationship relType relId target +relElemToRelationship _ _ = Nothing + +filePathToRelationships :: Archive -> FilePath -> [Relationship] +filePathToRelationships ar fp | Just relType <- filePathToRelType fp + , Just entry <- findEntryByPath fp ar + , Just relElems <- (parseXMLDoc . UTF8.toStringLazy . fromEntry) entry = + mapMaybe (relElemToRelationship relType) $ elChildren relElems +filePathToRelationships _ _ = [] + archiveToRelationships :: Archive -> [Relationship] archiveToRelationships archive = - let relPaths = filter filePathIsRel (filesInArchive archive) - entries = mapMaybe (\f -> findEntryByPath f archive) relPaths - relElems = mapMaybe (parseXMLDoc . UTF8.toStringLazy . fromEntry) entries - rels = mapMaybe relElemToRelationship $ concatMap elChildren relElems - in - rels + concatMap (filePathToRelationships archive) $ filesInArchive archive filePathIsMedia :: FilePath -> Bool filePathIsMedia fp = @@ -409,6 +419,7 @@ lookupLevel numId ilvl (Numbering _ numbs absNumbs) = do lvl <- lookup ilvl $ map (\l@(i, _, _, _) -> (i, l)) lvls return lvl + numElemToNum :: NameSpaces -> Element -> Maybe Numb numElemToNum ns element | qName (elName element) == "num" && @@ -558,9 +569,8 @@ elemToBodyPart ns element let parstyle = elemToParagraphStyle ns element sty parparts <- mapD (elemToParPart ns) (elChildren element) num <- asks envNumbering - case lookupLevel numId lvl num of - Just levelInfo -> return $ ListItem parstyle numId lvl levelInfo parparts - Nothing -> throwError WrongElem + let levelInfo = lookupLevel numId lvl num + return $ ListItem parstyle numId lvl levelInfo parparts elemToBodyPart ns element | isElem ns "w" "p" element = do sty <- asks envParStyles @@ -569,11 +579,8 @@ elemToBodyPart ns element case pNumInfo parstyle of Just (numId, lvl) -> do num <- asks envNumbering - case lookupLevel numId lvl num of - Just levelInfo -> - return $ ListItem parstyle numId lvl levelInfo parparts - Nothing -> - throwError WrongElem + let levelInfo = lookupLevel numId lvl num + return $ ListItem parstyle numId lvl levelInfo parparts Nothing -> return $ Paragraph parstyle parparts elemToBodyPart ns element | isElem ns "w" "tbl" element = do @@ -596,13 +603,16 @@ elemToBodyPart ns element return $ Tbl caption grid tblLook rows elemToBodyPart _ _ = throwError WrongElem -lookupRelationship :: RelId -> [Relationship] -> Maybe Target -lookupRelationship relid rels = - lookup relid (map (\(Relationship pair) -> pair) rels) +lookupRelationship :: DocumentLocation -> RelId -> [Relationship] -> Maybe Target +lookupRelationship docLocation relid rels = + lookup (docLocation, relid) pairs + where + pairs = map (\(Relationship loc relid' target) -> ((loc, relid'), target)) rels expandDrawingId :: String -> D (FilePath, B.ByteString) expandDrawingId s = do - target <- asks (lookupRelationship s . envRelationships) + location <- asks envLocation + target <- asks (lookupRelationship location s . envRelationships) case target of Just filepath -> do bytes <- asks (lookup ("word/" ++ filepath) . envMedia) @@ -614,13 +624,13 @@ expandDrawingId s = do elemToParPart :: NameSpaces -> Element -> D ParPart elemToParPart ns element | isElem ns "w" "r" element - , Just _ <- findChild (elemName ns "w" "drawing") element = + , Just drawingElem <- findChild (elemName ns "w" "drawing") element = let a_ns = "http://schemas.openxmlformats.org/drawingml/2006/main" drawing = findElement (QName "blip" (Just a_ns) (Just "a")) element >>= findAttr (QName "embed" (lookup "r" ns) (Just "r")) in case drawing of - Just s -> expandDrawingId s >>= (\(fp, bs) -> return $ Drawing fp bs) + Just s -> expandDrawingId s >>= (\(fp, bs) -> return $ Drawing fp bs $ elemToExtent drawingElem) Nothing -> throwError WrongElem -- The below is an attempt to deal with images in deprecated vml format. elemToParPart ns element @@ -630,7 +640,7 @@ elemToParPart ns element >>= findAttr (elemName ns "r" "id") in case drawing of - Just s -> expandDrawingId s >>= (\(fp, bs) -> return $ Drawing fp bs) + Just s -> expandDrawingId s >>= (\(fp, bs) -> return $ Drawing fp bs Nothing) Nothing -> throwError WrongElem elemToParPart ns element | isElem ns "w" "r" element = @@ -657,9 +667,10 @@ elemToParPart ns element elemToParPart ns element | isElem ns "w" "hyperlink" element , Just relId <- findAttr (elemName ns "r" "id") element = do + location <- asks envLocation runs <- mapD (elemToRun ns) (elChildren element) rels <- asks envRelationships - case lookupRelationship relId rels of + case lookupRelationship location relId rels of Just target -> do case findAttr (elemName ns "w" "anchor") element of Just anchor -> return $ ExternalHyperLink (target ++ '#':anchor) runs @@ -681,6 +692,16 @@ lookupFootnote s (Notes _ fns _) = fns >>= (M.lookup s) lookupEndnote :: String -> Notes -> Maybe Element lookupEndnote s (Notes _ _ ens) = ens >>= (M.lookup s) +elemToExtent :: Element -> Extent +elemToExtent drawingElem = + case (getDim "cx", getDim "cy") of + (Just w, Just h) -> Just (w, h) + _ -> Nothing + where + wp_ns = "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing" + getDim at = findElement (QName "extent" (Just wp_ns) (Just "wp")) drawingElem + >>= findAttr (QName at Nothing Nothing) >>= safeRead + elemToRun :: NameSpaces -> Element -> D Run elemToRun ns element | isElem ns "w" "r" element @@ -691,7 +712,7 @@ elemToRun ns element in case drawing of Just s -> expandDrawingId s >>= - (\(fp, bs) -> return $ InlineDrawing fp bs) + (\(fp, bs) -> return $ InlineDrawing fp bs $ elemToExtent drawingElem) Nothing -> throwError WrongElem elemToRun ns element | isElem ns "w" "r" element @@ -699,7 +720,7 @@ elemToRun ns element , Just fnId <- findAttr (elemName ns "w" "id") ref = do notes <- asks envNotes case lookupFootnote fnId notes of - Just e -> do bps <- mapD (elemToBodyPart ns) (elChildren e) + Just e -> do bps <- local (\r -> r {envLocation=InFootnote}) $ mapD (elemToBodyPart ns) (elChildren e) return $ Footnote bps Nothing -> return $ Footnote [] elemToRun ns element @@ -708,7 +729,7 @@ elemToRun ns element , Just enId <- findAttr (elemName ns "w" "id") ref = do notes <- asks envNotes case lookupEndnote enId notes of - Just e -> do bps <- mapD (elemToBodyPart ns) (elChildren e) + Just e -> do bps <- local (\r -> r {envLocation=InEndnote}) $ mapD (elemToBodyPart ns) (elChildren e) return $ Endnote bps Nothing -> return $ Endnote [] elemToRun ns element |