aboutsummaryrefslogtreecommitdiff
path: root/src/Text/Pandoc/Readers/Docx/Parse.hs
diff options
context:
space:
mode:
Diffstat (limited to 'src/Text/Pandoc/Readers/Docx/Parse.hs')
-rw-r--r--src/Text/Pandoc/Readers/Docx/Parse.hs981
1 files changed, 549 insertions, 432 deletions
diff --git a/src/Text/Pandoc/Readers/Docx/Parse.hs b/src/Text/Pandoc/Readers/Docx/Parse.hs
index 18200bcf9..1abd4bc6b 100644
--- a/src/Text/Pandoc/Readers/Docx/Parse.hs
+++ b/src/Text/Pandoc/Readers/Docx/Parse.hs
@@ -1,3 +1,5 @@
+{-# LANGUAGE PatternGuards, ViewPatterns #-}
+
{-
Copyright (C) 2014 Jesse Rosenthal <jrosenthal@jhu.edu>
@@ -8,49 +10,46 @@ the Free Software Foundation; either version 2 of the License, or
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
-Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-}
{- |
- Module : Text.Pandoc.Readers.Docx.Parse
- Copyright : Copyright (C) 2014 Jesse Rosenthal
- License : GNU GPL, version 2 or above
+ Module : Text.Pandoc.Readers.Docx.Parse
+ Copyright : Copyright (C) 2014 Jesse Rosenthal
+ License : GNU GPL, version 2 or above
- Maintainer : Jesse Rosenthal <jrosenthal@jhu.edu>
- Stability : alpha
- Portability : portable
+ Maintainer : Jesse Rosenthal <jrosenthal@jhu.edu>
+ Stability : alpha
+ Portability : portable
Conversion of docx archive into Docx haskell type
-}
+module Text.Pandoc.Readers.Docx.Parse ( Docx(..)
+ , Document(..)
+ , Body(..)
+ , BodyPart(..)
+ , TblLook(..)
+ , ParPart(..)
+ , Run(..)
+ , RunElem(..)
+ , Notes
+ , Numbering
+ , Relationship
+ , Media
+ , RunStyle(..)
+ , ParIndentation(..)
+ , ParagraphStyle(..)
+ , Row(..)
+ , Cell(..)
+ , archiveToDocx
+ ) where
-module Text.Pandoc.Readers.Docx.Parse ( Docx(..)
- , Document(..)
- , Body(..)
- , BodyPart(..)
- , TblLook(..)
- , ParPart(..)
- , Run(..)
- , RunElem(..)
- , Notes
- , Numbering
- , Relationship
- , Media
- , RunStyle(..)
- , ParagraphStyle(..)
- , Row(..)
- , Cell(..)
- , getFootNote
- , getEndNote
- , lookupLevel
- , lookupRelationship
- , archiveToDocx
- ) where
import Codec.Archive.Zip
import Text.XML.Light
import Data.Maybe
@@ -59,39 +58,244 @@ import System.FilePath
import Data.Bits ((.|.))
import qualified Data.ByteString.Lazy as B
import qualified Text.Pandoc.UTF8 as UTF8
+import Control.Monad.Reader
+import Control.Applicative ((<$>), (<|>))
+import qualified Data.Map as M
+import Text.Pandoc.Compat.Except
+import Text.TeXMath.Readers.OMML (readOMML)
+import Text.Pandoc.Readers.Docx.Fonts (getUnicode, Font(..))
+import Text.TeXMath (Exp)
+import Data.Char (readLitChar, ord, chr)
+
+data ReaderEnv = ReaderEnv { envNotes :: Notes
+ , envNumbering :: Numbering
+ , envRelationships :: [Relationship]
+ , envMedia :: Media
+ , envFont :: Maybe Font
+ }
+ deriving Show
+
+data DocxError = DocxError | WrongElem
+ deriving Show
+
+instance Error DocxError where
+ noMsg = WrongElem
+
+type D = ExceptT DocxError (Reader ReaderEnv)
+
+runD :: D a -> ReaderEnv -> Either DocxError a
+runD dx re = runReader (runExceptT dx ) re
+
+maybeToD :: Maybe a -> D a
+maybeToD (Just a) = return a
+maybeToD Nothing = throwError DocxError
+
+eitherToD :: Either a b -> D b
+eitherToD (Right b) = return b
+eitherToD (Left _) = throwError DocxError
+
+concatMapM :: (Monad m) => (a -> m [b]) -> [a] -> m [b]
+concatMapM f xs = liftM concat (mapM f xs)
+
+
+-- This is similar to `mapMaybe`: it maps a function returning the D
+-- monad over a list, and only keeps the non-erroring return values.
+mapD :: (a -> D b) -> [a] -> D [b]
+mapD f xs =
+ let handler x = (f x >>= (\y-> return [y])) `catchError` (\_ -> return [])
+ in
+ concatMapM handler xs
-attrToNSPair :: Attr -> Maybe (String, String)
-attrToNSPair (Attr (QName s _ (Just "xmlns")) val) = Just (s, val)
-attrToNSPair _ = Nothing
+type NameSpaces = [(String, String)]
+data Docx = Docx Document
+ deriving Show
-type NameSpaces = [(String, String)]
+data Document = Document NameSpaces Body
+ deriving Show
-data Docx = Docx Document Notes Numbering [Relationship] Media
+data Body = Body [BodyPart]
deriving Show
-archiveToDocx :: Archive -> Maybe Docx
+type Media = [(FilePath, B.ByteString)]
+
+data Numbering = Numbering NameSpaces [Numb] [AbstractNumb]
+ deriving Show
+
+data Numb = Numb String String -- right now, only a key to an abstract num
+ deriving Show
+
+data AbstractNumb = AbstractNumb String [Level]
+ deriving Show
+
+-- (ilvl, format, string, start)
+type Level = (String, String, String, Maybe Integer)
+
+data Relationship = Relationship (RelId, Target)
+ deriving Show
+
+data Notes = Notes NameSpaces
+ (Maybe (M.Map String Element))
+ (Maybe (M.Map String Element))
+ deriving Show
+
+data ParIndentation = ParIndentation { leftParIndent :: Maybe Integer
+ , rightParIndent :: Maybe Integer
+ , hangingParIndent :: Maybe Integer}
+ deriving Show
+
+data ParagraphStyle = ParagraphStyle { pStyle :: [String]
+ , indentation :: Maybe ParIndentation
+ }
+ deriving Show
+
+defaultParagraphStyle :: ParagraphStyle
+defaultParagraphStyle = ParagraphStyle { pStyle = []
+ , indentation = Nothing
+ }
+
+
+data BodyPart = Paragraph ParagraphStyle [ParPart]
+ | ListItem ParagraphStyle String String Level [ParPart]
+ | Tbl String TblGrid TblLook [Row]
+ | OMathPara [Exp]
+ deriving Show
+
+type TblGrid = [Integer]
+
+data TblLook = TblLook {firstRowFormatting::Bool}
+ deriving Show
+
+defaultTblLook :: TblLook
+defaultTblLook = TblLook{firstRowFormatting = False}
+
+data Row = Row [Cell]
+ deriving Show
+
+data Cell = Cell [BodyPart]
+ deriving Show
+
+data ParPart = PlainRun Run
+ | Insertion ChangeId Author ChangeDate [Run]
+ | Deletion ChangeId Author ChangeDate [Run]
+ | BookMark BookMarkId Anchor
+ | InternalHyperLink Anchor [Run]
+ | ExternalHyperLink URL [Run]
+ | Drawing FilePath B.ByteString
+ | PlainOMath [Exp]
+ deriving Show
+
+data Run = Run RunStyle [RunElem]
+ | Footnote [BodyPart]
+ | Endnote [BodyPart]
+ | InlineDrawing FilePath B.ByteString
+ deriving Show
+
+data RunElem = TextRun String | LnBrk | Tab
+ deriving Show
+
+data RunStyle = RunStyle { isBold :: Bool
+ , isItalic :: Bool
+ , isSmallCaps :: Bool
+ , isStrike :: Bool
+ , isSuperScript :: Bool
+ , isSubScript :: Bool
+ , rUnderline :: Maybe String
+ , rStyle :: Maybe String }
+ deriving Show
+
+defaultRunStyle :: RunStyle
+defaultRunStyle = RunStyle { isBold = False
+ , isItalic = False
+ , isSmallCaps = False
+ , isStrike = False
+ , isSuperScript = False
+ , isSubScript = False
+ , rUnderline = Nothing
+ , rStyle = Nothing
+ }
+
+
+type Target = String
+type Anchor = String
+type URL = String
+type BookMarkId = String
+type RelId = String
+type ChangeId = String
+type Author = String
+type ChangeDate = String
+
+attrToNSPair :: Attr -> Maybe (String, String)
+attrToNSPair (Attr (QName s _ (Just "xmlns")) val) = Just (s, val)
+attrToNSPair _ = Nothing
+
+archiveToDocx :: Archive -> Either DocxError Docx
archiveToDocx archive = do
- let notes = archiveToNotes archive
- rels = archiveToRelationships archive
- media = archiveToMedia archive
- doc <- archiveToDocument archive
- numbering <- archiveToNumbering archive
- return $ Docx doc notes numbering rels media
-
-data Document = Document NameSpaces Body
- deriving Show
+ let notes = archiveToNotes archive
+ numbering = archiveToNumbering archive
+ rels = archiveToRelationships archive
+ media = archiveToMedia archive
+ rEnv = ReaderEnv notes numbering rels media Nothing
+ doc <- runD (archiveToDocument archive) rEnv
+ return $ Docx doc
+
-archiveToDocument :: Archive -> Maybe Document
+archiveToDocument :: Archive -> D Document
archiveToDocument zf = do
- entry <- findEntryByPath "word/document.xml" zf
- docElem <- (parseXMLDoc . UTF8.toStringLazy . fromEntry) entry
- let namespaces = mapMaybe attrToNSPair (elAttribs docElem)
- bodyElem <- findChild (QName "body" (lookup "w" namespaces) Nothing) docElem
+ entry <- maybeToD $ findEntryByPath "word/document.xml" zf
+ docElem <- maybeToD $ (parseXMLDoc . UTF8.toStringLazy . fromEntry) entry
+ let namespaces = mapMaybe attrToNSPair (elAttribs docElem)
+ bodyElem <- maybeToD $ findChild (elemName namespaces "w" "body") docElem
body <- elemToBody namespaces bodyElem
return $ Document namespaces body
-type Media = [(FilePath, B.ByteString)]
+elemToBody :: NameSpaces -> Element -> D Body
+elemToBody ns element | isElem ns "w" "body" element =
+ mapD (elemToBodyPart ns) (elChildren element) >>=
+ (\bps -> return $ Body bps)
+elemToBody _ _ = throwError WrongElem
+
+archiveToNotes :: Archive -> Notes
+archiveToNotes zf =
+ let fnElem = findEntryByPath "word/footnotes.xml" zf
+ >>= (parseXMLDoc . UTF8.toStringLazy . fromEntry)
+ enElem = findEntryByPath "word/endnotes.xml" zf
+ >>= (parseXMLDoc . UTF8.toStringLazy . fromEntry)
+ fn_namespaces = case fnElem of
+ Just e -> mapMaybe attrToNSPair (elAttribs e)
+ Nothing -> []
+ en_namespaces = case enElem of
+ Just e -> mapMaybe attrToNSPair (elAttribs e)
+ Nothing -> []
+ ns = unionBy (\x y -> fst x == fst y) fn_namespaces en_namespaces
+ fn = fnElem >>= (elemToNotes ns "footnote")
+ en = enElem >>= (elemToNotes ns "endnote")
+ in
+ Notes ns fn en
+
+filePathIsRel :: FilePath -> Bool
+filePathIsRel fp =
+ let (dir, name) = splitFileName fp
+ in
+ (dir == "word/_rels/") && ((takeExtension name) == ".rels")
+
+relElemToRelationship :: Element -> Maybe Relationship
+relElemToRelationship element | qName (elName element) == "Relationship" =
+ do
+ relId <- findAttr (QName "Id" Nothing Nothing) element
+ target <- findAttr (QName "Target" Nothing Nothing) element
+ return $ Relationship (relId, target)
+relElemToRelationship _ = Nothing
+
+
+archiveToRelationships :: Archive -> [Relationship]
+archiveToRelationships archive =
+ let relPaths = filter filePathIsRel (filesInArchive archive)
+ entries = mapMaybe (\f -> findEntryByPath f archive) relPaths
+ relElems = mapMaybe (parseXMLDoc . UTF8.toStringLazy . fromEntry) entries
+ rels = mapMaybe relElemToRelationship $ concatMap elChildren relElems
+ in
+ rels
filePathIsMedia :: FilePath -> Bool
filePathIsMedia fp =
@@ -109,18 +313,6 @@ archiveToMedia :: Archive -> Media
archiveToMedia zf =
mapMaybe (getMediaPair zf) (filter filePathIsMedia (filesInArchive zf))
-data Numbering = Numbering NameSpaces [Numb] [AbstractNumb]
- deriving Show
-
-data Numb = Numb String String -- right now, only a key to an abstract num
- deriving Show
-
-data AbstractNumb = AbstractNumb String [Level]
- deriving Show
-
--- (ilvl, format, string, start)
-type Level = (String, String, String, Maybe Integer)
-
lookupLevel :: String -> String -> Numbering -> Maybe Level
lookupLevel numId ilvl (Numbering _ numbs absNumbs) = do
absNumId <- lookup numId $ map (\(Numb nid absnumid) -> (nid, absnumid)) numbs
@@ -148,7 +340,7 @@ absNumElemToAbsNum ns element |
let levelElems = findChildren
(QName "lvl" (lookup "w" ns) (Just "w"))
element
- levels = mapMaybe id $ map (levelElemToLevel ns) levelElems
+ levels = mapMaybe (levelElemToLevel ns) levelElems
return $ AbstractNumb absNumId levels
absNumElemToAbsNum _ _ = Nothing
@@ -167,8 +359,8 @@ levelElemToLevel ns element |
return (ilvl, fmt, txt, start)
levelElemToLevel _ _ = Nothing
-archiveToNumbering :: Archive -> Maybe Numbering
-archiveToNumbering zf =
+archiveToNumbering' :: Archive -> Maybe Numbering
+archiveToNumbering' zf = do
case findEntryByPath "word/numbering.xml" zf of
Nothing -> Just $ Numbering [] [] []
Just entry -> do
@@ -180,321 +372,281 @@ archiveToNumbering zf =
absNumElems = findChildren
(QName "abstractNum" (lookup "w" namespaces) (Just "w"))
numberingElem
- nums = mapMaybe id $ map (numElemToNum namespaces) numElems
- absNums = mapMaybe id $ map (absNumElemToAbsNum namespaces) absNumElems
+ nums = mapMaybe (numElemToNum namespaces) numElems
+ absNums = mapMaybe (absNumElemToAbsNum namespaces) absNumElems
return $ Numbering namespaces nums absNums
-data Notes = Notes NameSpaces (Maybe [(String, [BodyPart])]) (Maybe [(String, [BodyPart])])
- deriving Show
-
-noteElemToNote :: NameSpaces -> Element -> Maybe (String, [BodyPart])
-noteElemToNote ns element
- | qName (elName element) `elem` ["endnote", "footnote"] &&
- qURI (elName element) == (lookup "w" ns) =
- do
- noteId <- findAttr (QName "id" (lookup "w" ns) (Just "w")) element
- let bps = map fromJust
- $ filter isJust
- $ map (elemToBodyPart ns)
- $ filterChildrenName (isParOrTbl ns) element
- return $ (noteId, bps)
-noteElemToNote _ _ = Nothing
-
-getFootNote :: String -> Notes -> Maybe [BodyPart]
-getFootNote s (Notes _ fns _) = fns >>= (lookup s)
-
-getEndNote :: String -> Notes -> Maybe [BodyPart]
-getEndNote s (Notes _ _ ens) = ens >>= (lookup s)
+archiveToNumbering :: Archive -> Numbering
+archiveToNumbering archive =
+ fromMaybe (Numbering [] [] []) (archiveToNumbering' archive)
-elemToNotes :: NameSpaces -> String -> Element -> Maybe [(String, [BodyPart])]
+elemToNotes :: NameSpaces -> String -> Element -> Maybe (M.Map String Element)
elemToNotes ns notetype element
- | qName (elName element) == (notetype ++ "s") &&
- qURI (elName element) == (lookup "w" ns) =
- Just $ map fromJust
- $ filter isJust
- $ map (noteElemToNote ns)
- $ findChildren (QName notetype (lookup "w" ns) (Just "w")) element
+ | isElem ns "w" (notetype ++ "s") element =
+ let pairs = mapMaybe
+ (\e -> findAttr (elemName ns "w" "id") e >>=
+ (\a -> Just (a, e)))
+ (findChildren (elemName ns "w" notetype) element)
+ in
+ Just $ M.fromList $ pairs
elemToNotes _ _ _ = Nothing
-archiveToNotes :: Archive -> Notes
-archiveToNotes zf =
- let fnElem = findEntryByPath "word/footnotes.xml" zf
- >>= (parseXMLDoc . UTF8.toStringLazy . fromEntry)
- enElem = findEntryByPath "word/endnotes.xml" zf
- >>= (parseXMLDoc . UTF8.toStringLazy . fromEntry)
- fn_namespaces = case fnElem of
- Just e -> mapMaybe attrToNSPair (elAttribs e)
- Nothing -> []
- en_namespaces = case enElem of
- Just e -> mapMaybe attrToNSPair (elAttribs e)
- Nothing -> []
- ns = unionBy (\x y -> fst x == fst y) fn_namespaces en_namespaces
- fn = fnElem >>= (elemToNotes ns "footnote")
- en = enElem >>= (elemToNotes ns "endnote")
- in
- Notes ns fn en
+---------------------------------------------
+---------------------------------------------
+elemName :: NameSpaces -> String -> String -> QName
+elemName ns prefix name = (QName name (lookup prefix ns) (Just prefix))
-data Relationship = Relationship (RelId, Target)
- deriving Show
+isElem :: NameSpaces -> String -> String -> Element -> Bool
+isElem ns prefix name element =
+ qName (elName element) == name &&
+ qURI (elName element) == (lookup prefix ns)
-lookupRelationship :: RelId -> [Relationship] -> Maybe Target
-lookupRelationship relid rels =
- lookup relid (map (\(Relationship pair) -> pair) rels)
-filePathIsRel :: FilePath -> Bool
-filePathIsRel fp =
- let (dir, name) = splitFileName fp
+elemToTblGrid :: NameSpaces -> Element -> D TblGrid
+elemToTblGrid ns element | isElem ns "w" "tblGrid" element =
+ let cols = findChildren (elemName ns "w" "gridCol") element
in
- (dir == "word/_rels/") && ((takeExtension name) == ".rels")
-
-relElemToRelationship :: Element -> Maybe Relationship
-relElemToRelationship element | qName (elName element) == "Relationship" =
- do
- relId <- findAttr (QName "Id" Nothing Nothing) element
- target <- findAttr (QName "Target" Nothing Nothing) element
- return $ Relationship (relId, target)
-relElemToRelationship _ = Nothing
-
-
-archiveToRelationships :: Archive -> [Relationship]
-archiveToRelationships archive =
- let relPaths = filter filePathIsRel (filesInArchive archive)
- entries = map fromJust $ filter isJust $ map (\f -> findEntryByPath f archive) relPaths
- relElems = map fromJust $ filter isJust $ map (parseXMLDoc . UTF8.toStringLazy . fromEntry) entries
- rels = map fromJust $ filter isJust $ map relElemToRelationship $ concatMap elChildren relElems
+ mapD (\e -> maybeToD (findAttr (elemName ns "w" "val") e >>= stringToInteger))
+ cols
+elemToTblGrid _ _ = throwError WrongElem
+
+elemToTblLook :: NameSpaces -> Element -> D TblLook
+elemToTblLook ns element | isElem ns "w" "tblLook" element =
+ let firstRow = findAttr (elemName ns "w" "firstRow") element
+ val = findAttr (elemName ns "w" "val") element
+ firstRowFmt =
+ case firstRow of
+ Just "1" -> True
+ Just _ -> False
+ Nothing -> case val of
+ Just bitMask -> testBitMask bitMask 0x020
+ Nothing -> False
in
- rels
-
-data Body = Body [BodyPart]
- deriving Show
+ return $ TblLook{firstRowFormatting = firstRowFmt}
+elemToTblLook _ _ = throwError WrongElem
-isParOrTbl :: NameSpaces -> QName -> Bool
-isParOrTbl ns q = qName q `elem` ["p", "tbl"] &&
- qURI q == (lookup "w" ns)
+elemToRow :: NameSpaces -> Element -> D Row
+elemToRow ns element | isElem ns "w" "tr" element =
+ do
+ let cellElems = findChildren (elemName ns "w" "tc") element
+ cells <- mapD (elemToCell ns) cellElems
+ return $ Row cells
+elemToRow _ _ = throwError WrongElem
-elemToBody :: NameSpaces -> Element -> Maybe Body
-elemToBody ns element | qName (elName element) == "body" && qURI (elName element) == (lookup "w" ns) =
- Just $ Body
- $ map fromJust
- $ filter isJust
- $ map (elemToBodyPart ns) $ filterChildrenName (isParOrTbl ns) element
-elemToBody _ _ = Nothing
+elemToCell :: NameSpaces -> Element -> D Cell
+elemToCell ns element | isElem ns "w" "tc" element =
+ do
+ cellContents <- mapD (elemToBodyPart ns) (elChildren element)
+ return $ Cell cellContents
+elemToCell _ _ = throwError WrongElem
+
+elemToParIndentation :: NameSpaces -> Element -> Maybe ParIndentation
+elemToParIndentation ns element | isElem ns "w" "ind" element =
+ Just $ ParIndentation {
+ leftParIndent =
+ findAttr (QName "left" (lookup "w" ns) (Just "w")) element >>=
+ stringToInteger
+ , rightParIndent =
+ findAttr (QName "right" (lookup "w" ns) (Just "w")) element >>=
+ stringToInteger
+ , hangingParIndent =
+ findAttr (QName "hanging" (lookup "w" ns) (Just "w")) element >>=
+ stringToInteger}
+elemToParIndentation _ _ = Nothing
-isRunOrLinkOrBookmark :: NameSpaces -> QName -> Bool
-isRunOrLinkOrBookmark ns q = qName q `elem` ["r", "hyperlink", "bookmarkStart"] &&
- qURI q == (lookup "w" ns)
elemToNumInfo :: NameSpaces -> Element -> Maybe (String, String)
-elemToNumInfo ns element
- | qName (elName element) == "p" &&
- qURI (elName element) == (lookup "w" ns) =
- do
- pPr <- findChild (QName "pPr" (lookup "w" ns) (Just "w")) element
- numPr <- findChild (QName "numPr" (lookup "w" ns) (Just "w")) pPr
- lvl <- findChild (QName "ilvl" (lookup "w" ns) (Just "w")) numPr >>=
- findAttr (QName "val" (lookup "w" ns) (Just "w"))
- numId <- findChild (QName "numId" (lookup "w" ns) (Just "w")) numPr >>=
- findAttr (QName "val" (lookup "w" ns) (Just "w"))
- return (numId, lvl)
+elemToNumInfo ns element | isElem ns "w" "p" element = do
+ let pPr = findChild (elemName ns "w" "pPr") element
+ numPr = pPr >>= findChild (elemName ns "w" "numPr")
+ lvl <- numPr >>=
+ findChild (elemName ns "w" "ilvl") >>=
+ findAttr (elemName ns "w" "val")
+ numId <- numPr >>=
+ findChild (elemName ns "w" "numId") >>=
+ findAttr (elemName ns "w" "val")
+ return (numId, lvl)
elemToNumInfo _ _ = Nothing
--- isBookMarkTag :: NameSpaces -> QName -> Bool
--- isBookMarkTag ns q = qName q `elem` ["bookmarkStart", "bookmarkEnd"] &&
--- qURI q == (lookup "w" ns)
-
--- parChildrenToBookmark :: NameSpaces -> [Element] -> BookMark
--- parChildrenToBookmark ns (bms : bme : _)
--- | qName (elName bms) == "bookmarkStart" &&
--- qURI (elName bms) == (lookup "w" ns) &&
--- qName (elName bme) == "bookmarkEnd" &&
--- qURI (elName bme) == (lookup "w" ns) = do
--- bmId <- findAttr (QName "id" (lookup "w" ns) (Just "w")) bms
--- bmName <- findAttr (QName "name" (lookup "w" ns) (Just "w")) bms
--- return $ (bmId, bmName)
--- parChildrenToBookmark _ _ = Nothing
-
-elemToBodyPart :: NameSpaces -> Element -> Maybe BodyPart
-elemToBodyPart ns element
- | qName (elName element) == "p" &&
- qURI (elName element) == (lookup "w" ns) =
- let parstyle = elemToParagraphStyle ns element
- parparts = mapMaybe id
- $ map (elemToParPart ns)
- $ filterChildrenName (isRunOrLinkOrBookmark ns) element
- in
- case elemToNumInfo ns element of
- Just (numId, lvl) -> Just $ ListItem parstyle numId lvl parparts
- Nothing -> Just $ Paragraph parstyle parparts
- | qName (elName element) == "tbl" &&
- qURI (elName element) == (lookup "w" ns) =
- let
- caption = findChild (QName "tblPr" (lookup "w" ns) (Just "w")) element
- >>= findChild (QName "tblCaption" (lookup "w" ns) (Just "w"))
- >>= findAttr (QName "val" (lookup "w" ns) (Just "w"))
- grid = case
- findChild (QName "tblGrid" (lookup "w" ns) (Just "w")) element
- of
- Just g -> elemToTblGrid ns g
- Nothing -> []
- tblLook = findChild (QName "tblPr" (lookup "w" ns) (Just "w")) element
- >>= findChild (QName "tblLook" (lookup "w" ns) (Just "w"))
- >>= elemToTblLook ns
- in
- Just $ Tbl
- (fromMaybe "" caption)
- grid
- (fromMaybe defaultTblLook tblLook)
- (mapMaybe (elemToRow ns) (elChildren element))
- | otherwise = Nothing
-
-elemToTblLook :: NameSpaces -> Element -> Maybe TblLook
-elemToTblLook ns element
- | qName (elName element) == "tblLook" &&
- qURI (elName element) == (lookup "w" ns) =
- let firstRow = findAttr (QName "firstRow" (lookup "w" ns) (Just "w")) element
- val = findAttr (QName "val" (lookup "w" ns) (Just "w")) element
- firstRowFmt =
- case firstRow of
- Just "1" -> True
- Just _ -> False
- Nothing -> case val of
- Just bitMask -> testBitMask bitMask 0x020
- Nothing -> False
- in
- Just $ TblLook{firstRowFormatting = firstRowFmt}
-elemToTblLook _ _ = Nothing
-
testBitMask :: String -> Int -> Bool
testBitMask bitMaskS n =
case (reads ("0x" ++ bitMaskS) :: [(Int, String)]) of
[] -> False
((n', _) : _) -> ((n' .|. n) /= 0)
-data ParagraphStyle = ParagraphStyle { pStyle :: [String]
- , indent :: Maybe Integer
- }
- deriving Show
-
-defaultParagraphStyle :: ParagraphStyle
-defaultParagraphStyle = ParagraphStyle { pStyle = []
- , indent = Nothing
- }
-
-elemToParagraphStyle :: NameSpaces -> Element -> ParagraphStyle
-elemToParagraphStyle ns element =
- case findChild (QName "pPr" (lookup "w" ns) (Just "w")) element of
- Just pPr ->
- ParagraphStyle
- {pStyle =
- mapMaybe id $
- map
- (findAttr (QName "val" (lookup "w" ns) (Just "w")))
- (findChildren (QName "pStyle" (lookup "w" ns) (Just "w")) pPr)
- , indent =
- findChild (QName "ind" (lookup "w" ns) (Just "w")) pPr >>=
- findAttr (QName "left" (lookup "w" ns) (Just "w")) >>=
- stringToInteger
- }
- Nothing -> defaultParagraphStyle
-
-
-data BodyPart = Paragraph ParagraphStyle [ParPart]
- | ListItem ParagraphStyle String String [ParPart]
- | Tbl String TblGrid TblLook [Row]
-
- deriving Show
-
-type TblGrid = [Integer]
-
-data TblLook = TblLook {firstRowFormatting::Bool}
- deriving Show
-
-defaultTblLook :: TblLook
-defaultTblLook = TblLook{firstRowFormatting = False}
-
stringToInteger :: String -> Maybe Integer
stringToInteger s = listToMaybe $ map fst (reads s :: [(Integer, String)])
-elemToTblGrid :: NameSpaces -> Element -> TblGrid
-elemToTblGrid ns element
- | qName (elName element) == "tblGrid" &&
- qURI (elName element) == (lookup "w" ns) =
- let
- cols = findChildren (QName "gridCol" (lookup "w" ns) (Just "w")) element
- in
- mapMaybe (\e ->
- findAttr (QName "val" (lookup "w" ns) (Just ("w"))) e
- >>= stringToInteger
- )
- cols
-elemToTblGrid _ _ = []
-
-data Row = Row [Cell]
- deriving Show
-
-
-elemToRow :: NameSpaces -> Element -> Maybe Row
-elemToRow ns element
- | qName (elName element) == "tr" &&
- qURI (elName element) == (lookup "w" ns) =
- let
- cells = findChildren (QName "tc" (lookup "w" ns) (Just "w")) element
- in
- Just $ Row (mapMaybe (elemToCell ns) cells)
-elemToRow _ _ = Nothing
+elemToBodyPart :: NameSpaces -> Element -> D BodyPart
+elemToBodyPart ns element
+ | isElem ns "w" "p" element
+ , (c:_) <- findChildren (elemName ns "m" "oMathPara") element =
+ do
+ expsLst <- eitherToD $ readOMML $ showElement c
+ return $ OMathPara expsLst
+elemToBodyPart ns element
+ | isElem ns "w" "p" element
+ , Just (numId, lvl) <- elemToNumInfo ns element = do
+ let parstyle = elemToParagraphStyle ns element
+ parparts <- mapD (elemToParPart ns) (elChildren element)
+ num <- asks envNumbering
+ case lookupLevel numId lvl num of
+ Just levelInfo -> return $ ListItem parstyle numId lvl levelInfo parparts
+ Nothing -> throwError WrongElem
+elemToBodyPart ns element
+ | isElem ns "w" "p" element = do
+ let parstyle = elemToParagraphStyle ns element
+ parparts <- mapD (elemToParPart ns) (elChildren element)
+ return $ Paragraph parstyle parparts
+elemToBodyPart ns element
+ | isElem ns "w" "tbl" element = do
+ let caption' = findChild (elemName ns "w" "tblPr") element
+ >>= findChild (elemName ns "w" "tblCaption")
+ >>= findAttr (elemName ns "w" "val")
+ caption = (fromMaybe "" caption')
+ grid' = case findChild (elemName ns "w" "tblGrid") element of
+ Just g -> elemToTblGrid ns g
+ Nothing -> return []
+ tblLook' = case findChild (elemName ns "w" "tblPr") element >>=
+ findChild (elemName ns "w" "tblLook")
+ of
+ Just l -> elemToTblLook ns l
+ Nothing -> return defaultTblLook
+
+ grid <- grid'
+ tblLook <- tblLook'
+ rows <- mapD (elemToRow ns) (elChildren element)
+ return $ Tbl caption grid tblLook rows
+elemToBodyPart _ _ = throwError WrongElem
-data Cell = Cell [BodyPart]
- deriving Show
+lookupRelationship :: RelId -> [Relationship] -> Maybe Target
+lookupRelationship relid rels =
+ lookup relid (map (\(Relationship pair) -> pair) rels)
-elemToCell :: NameSpaces -> Element -> Maybe Cell
-elemToCell ns element
- | qName (elName element) == "tc" &&
- qURI (elName element) == (lookup "w" ns) =
- Just $ Cell (mapMaybe (elemToBodyPart ns) (elChildren element))
-elemToCell _ _ = Nothing
+expandDrawingId :: String -> D (FilePath, B.ByteString)
+expandDrawingId s = do
+ target <- asks (lookupRelationship s . envRelationships)
+ case target of
+ Just filepath -> do
+ bytes <- asks (lookup (combine "word" filepath) . envMedia)
+ case bytes of
+ Just bs -> return (filepath, bs)
+ Nothing -> throwError DocxError
+ Nothing -> throwError DocxError
+
+elemToParPart :: NameSpaces -> Element -> D ParPart
+elemToParPart ns element
+ | isElem ns "w" "r" element
+ , Just _ <- findChild (elemName ns "w" "drawing") element =
+ let a_ns = "http://schemas.openxmlformats.org/drawingml/2006/main"
+ drawing = findElement (QName "blip" (Just a_ns) (Just "a")) element
+ >>= findAttr (QName "embed" (lookup "r" ns) (Just "r"))
+ in
+ case drawing of
+ Just s -> expandDrawingId s >>= (\(fp, bs) -> return $ Drawing fp bs)
+ Nothing -> throwError WrongElem
+elemToParPart ns element
+ | isElem ns "w" "r" element =
+ elemToRun ns element >>= (\r -> return $ PlainRun r)
+elemToParPart ns element
+ | isElem ns "w" "ins" element
+ , Just cId <- findAttr (elemName ns "w" "id") element
+ , Just cAuthor <- findAttr (elemName ns "w" "author") element
+ , Just cDate <- findAttr (elemName ns "w" "date") element = do
+ runs <- mapD (elemToRun ns) (elChildren element)
+ return $ Insertion cId cAuthor cDate runs
+elemToParPart ns element
+ | isElem ns "w" "del" element
+ , Just cId <- findAttr (elemName ns "w" "id") element
+ , Just cAuthor <- findAttr (elemName ns "w" "author") element
+ , Just cDate <- findAttr (elemName ns "w" "date") element = do
+ runs <- mapD (elemToRun ns) (elChildren element)
+ return $ Deletion cId cAuthor cDate runs
+elemToParPart ns element
+ | isElem ns "w" "bookmarkStart" element
+ , Just bmId <- findAttr (elemName ns "w" "id") element
+ , Just bmName <- findAttr (elemName ns "w" "name") element =
+ return $ BookMark bmId bmName
+elemToParPart ns element
+ | isElem ns "w" "hyperlink" element
+ , Just anchor <- findAttr (elemName ns "w" "anchor") element = do
+ runs <- mapD (elemToRun ns) (elChildren element)
+ return $ InternalHyperLink anchor runs
+elemToParPart ns element
+ | isElem ns "w" "hyperlink" element
+ , Just relId <- findAttr (elemName ns "r" "id") element = do
+ runs <- mapD (elemToRun ns) (elChildren element)
+ rels <- asks envRelationships
+ return $ case lookupRelationship relId rels of
+ Just target -> ExternalHyperLink target runs
+ Nothing -> ExternalHyperLink "" runs
+elemToParPart ns element
+ | isElem ns "m" "oMath" element =
+ (eitherToD $ readOMML $ showElement element) >>= (return . PlainOMath)
+elemToParPart _ _ = throwError WrongElem
-data ParPart = PlainRun Run
- | BookMark BookMarkId Anchor
- | InternalHyperLink Anchor [Run]
- | ExternalHyperLink RelId [Run]
- | Drawing String
- deriving Show
+lookupFootnote :: String -> Notes -> Maybe Element
+lookupFootnote s (Notes _ fns _) = fns >>= (M.lookup s)
-data Run = Run RunStyle [RunElem]
- | Footnote String
- | Endnote String
- deriving Show
+lookupEndnote :: String -> Notes -> Maybe Element
+lookupEndnote s (Notes _ _ ens) = ens >>= (M.lookup s)
-data RunElem = TextRun String | LnBrk | Tab
- deriving Show
+elemToRun :: NameSpaces -> Element -> D Run
+elemToRun ns element
+ | isElem ns "w" "r" element
+ , Just drawingElem <- findChild (elemName ns "w" "drawing") element =
+ let a_ns = "http://schemas.openxmlformats.org/drawingml/2006/main"
+ drawing = findElement (QName "blip" (Just a_ns) (Just "a")) drawingElem
+ >>= findAttr (QName "embed" (lookup "r" ns) (Just "r"))
+ in
+ case drawing of
+ Just s -> expandDrawingId s >>=
+ (\(fp, bs) -> return $ InlineDrawing fp bs)
+ Nothing -> throwError WrongElem
+elemToRun ns element
+ | isElem ns "w" "r" element
+ , Just ref <- findChild (elemName ns "w" "footnoteReference") element
+ , Just fnId <- findAttr (elemName ns "w" "id") ref = do
+ notes <- asks envNotes
+ case lookupFootnote fnId notes of
+ Just e -> do bps <- mapD (elemToBodyPart ns) (elChildren e)
+ return $ Footnote bps
+ Nothing -> return $ Footnote []
+elemToRun ns element
+ | isElem ns "w" "r" element
+ , Just ref <- findChild (elemName ns "w" "endnoteReference") element
+ , Just enId <- findAttr (elemName ns "w" "id") ref = do
+ notes <- asks envNotes
+ case lookupEndnote enId notes of
+ Just e -> do bps <- mapD (elemToBodyPart ns) (elChildren e)
+ return $ Endnote bps
+ Nothing -> return $ Endnote []
+elemToRun ns element
+ | isElem ns "w" "r" element = do
+ runElems <- elemToRunElems ns element
+ return $ Run (elemToRunStyle ns element) runElems
+elemToRun _ _ = throwError WrongElem
-data RunStyle = RunStyle { isBold :: Bool
- , isItalic :: Bool
- , isSmallCaps :: Bool
- , isStrike :: Bool
- , isSuperScript :: Bool
- , isSubScript :: Bool
- , underline :: Maybe String
- , rStyle :: Maybe String }
- deriving Show
+elemToParagraphStyle :: NameSpaces -> Element -> ParagraphStyle
+elemToParagraphStyle ns element
+ | Just pPr <- findChild (elemName ns "w" "pPr") element =
+ ParagraphStyle
+ {pStyle =
+ mapMaybe
+ (findAttr (elemName ns "w" "val"))
+ (findChildren (elemName ns "w" "pStyle") pPr)
+ , indentation =
+ findChild (elemName ns "w" "ind") pPr >>=
+ elemToParIndentation ns
+ }
+elemToParagraphStyle _ _ = defaultParagraphStyle
-defaultRunStyle :: RunStyle
-defaultRunStyle = RunStyle { isBold = False
- , isItalic = False
- , isSmallCaps = False
- , isStrike = False
- , isSuperScript = False
- , isSubScript = False
- , underline = Nothing
- , rStyle = Nothing
- }
elemToRunStyle :: NameSpaces -> Element -> RunStyle
-elemToRunStyle ns element =
- case findChild (QName "rPr" (lookup "w" ns) (Just "w")) element of
- Just rPr ->
- RunStyle
+elemToRunStyle ns element
+ | Just rPr <- findChild (elemName ns "w" "rPr") element =
+ RunStyle
{
isBold = isJust $ findChild (QName "b" (lookup "w" ns) (Just "w")) rPr
, isItalic = isJust $ findChild (QName "i" (lookup "w" ns) (Just "w")) rPr
@@ -508,100 +660,65 @@ elemToRunStyle ns element =
(Just "subscript" ==
(findChild (QName "vertAlign" (lookup "w" ns) (Just "w")) rPr >>=
findAttr (QName "val" (lookup "w" ns) (Just "w"))))
- , underline =
+ , rUnderline =
findChild (QName "u" (lookup "w" ns) (Just "w")) rPr >>=
findAttr (QName "val" (lookup "w" ns) (Just "w"))
, rStyle =
findChild (QName "rStyle" (lookup "w" ns) (Just "w")) rPr >>=
findAttr (QName "val" (lookup "w" ns) (Just "w"))
}
- Nothing -> defaultRunStyle
+elemToRunStyle _ _ = defaultRunStyle
-elemToRun :: NameSpaces -> Element -> Maybe Run
-elemToRun ns element
- | qName (elName element) == "r" &&
- qURI (elName element) == (lookup "w" ns) =
- case
- findChild (QName "footnoteReference" (lookup "w" ns) (Just "w")) element >>=
- findAttr (QName "id" (lookup "w" ns) (Just "w"))
- of
- Just s -> Just $ Footnote s
- Nothing ->
- case
- findChild (QName "endnoteReference" (lookup "w" ns) (Just "w")) element >>=
- findAttr (QName "id" (lookup "w" ns) (Just "w"))
- of
- Just s -> Just $ Endnote s
- Nothing -> Just $
- Run (elemToRunStyle ns element)
- (elemToRunElems ns element)
-elemToRun _ _ = Nothing
-
-elemToRunElem :: NameSpaces -> Element -> Maybe RunElem
+elemToRunElem :: NameSpaces -> Element -> D RunElem
elemToRunElem ns element
- | qName (elName element) == "t" &&
- qURI (elName element) == (lookup "w" ns) =
- Just $ TextRun (strContent element)
- | qName (elName element) == "br" &&
- qURI (elName element) == (lookup "w" ns) =
- Just $ LnBrk
- | qName (elName element) == "tab" &&
- qURI (elName element) == (lookup "w" ns) =
- Just $ Tab
- | otherwise = Nothing
-
-
-elemToRunElems :: NameSpaces -> Element -> [RunElem]
+ | isElem ns "w" "t" element
+ || isElem ns "w" "delText" element
+ || isElem ns "m" "t" element = do
+ let str = strContent element
+ font <- asks envFont
+ case font of
+ Nothing -> return $ TextRun str
+ Just f -> return . TextRun $
+ map (\x -> fromMaybe x . getUnicode f . lowerFromPrivate $ x) str
+ | isElem ns "w" "br" element = return LnBrk
+ | isElem ns "w" "tab" element = return Tab
+ | isElem ns "w" "sym" element = return (getSymChar ns element)
+ | otherwise = throwError WrongElem
+ where
+ lowerFromPrivate (ord -> c)
+ | c >= ord '\xF000' = chr $ c - ord '\xF000'
+ | otherwise = chr c
+
+-- The char attribute is a hex string
+getSymChar :: NameSpaces -> Element -> RunElem
+getSymChar ns element
+ | Just s <- lowerFromPrivate <$> getCodepoint
+ , Just font <- getFont =
+ let [(char, _)] = readLitChar ("\\x" ++ s) in
+ TextRun . maybe "" (:[]) $ getUnicode font char
+ where
+ getCodepoint = findAttr (elemName ns "w" "char") element
+ getFont = stringToFont =<< findAttr (elemName ns "w" "font") element
+ lowerFromPrivate ('F':xs) = '0':xs
+ lowerFromPrivate xs = xs
+getSymChar _ _ = TextRun ""
+
+stringToFont :: String -> Maybe Font
+stringToFont "Symbol" = Just Symbol
+stringToFont _ = Nothing
+
+elemToRunElems :: NameSpaces -> Element -> D [RunElem]
elemToRunElems ns element
- | qName (elName element) == "r" &&
- qURI (elName element) == (lookup "w" ns) =
- mapMaybe (elemToRunElem ns) (elChildren element)
- | otherwise = []
-
-elemToDrawing :: NameSpaces -> Element -> Maybe ParPart
-elemToDrawing ns element
- | qName (elName element) == "drawing" &&
- qURI (elName element) == (lookup "w" ns) =
- let a_ns = "http://schemas.openxmlformats.org/drawingml/2006/main"
- in
- findElement (QName "blip" (Just a_ns) (Just "a")) element
- >>= findAttr (QName "embed" (lookup "r" ns) (Just "r"))
- >>= (\s -> Just $ Drawing s)
-elemToDrawing _ _ = Nothing
-
-
-elemToParPart :: NameSpaces -> Element -> Maybe ParPart
-elemToParPart ns element
- | qName (elName element) == "r" &&
- qURI (elName element) == (lookup "w" ns) =
- case findChild (QName "drawing" (lookup "w" ns) (Just "w")) element of
- Just drawingElem -> elemToDrawing ns drawingElem
- Nothing -> do
- r <- elemToRun ns element
- return $ PlainRun r
-elemToParPart ns element
- | qName (elName element) == "bookmarkStart" &&
- qURI (elName element) == (lookup "w" ns) = do
- bmId <- findAttr (QName "id" (lookup "w" ns) (Just "w")) element
- bmName <- findAttr (QName "name" (lookup "w" ns) (Just "w")) element
- return $ BookMark bmId bmName
-elemToParPart ns element
- | qName (elName element) == "hyperlink" &&
- qURI (elName element) == (lookup "w" ns) =
- let runs = map fromJust $ filter isJust $ map (elemToRun ns)
- $ findChildren (QName "r" (lookup "w" ns) (Just "w")) element
- in
- case findAttr (QName "anchor" (lookup "w" ns) (Just "w")) element of
- Just anchor ->
- Just $ InternalHyperLink anchor runs
- Nothing ->
- case findAttr (QName "id" (lookup "r" ns) (Just "r")) element of
- Just relId -> Just $ ExternalHyperLink relId runs
- Nothing -> Nothing
-elemToParPart _ _ = Nothing
-
-type Target = String
-type Anchor = String
-type BookMarkId = String
-type RelId = String
-
+ | isElem ns "w" "r" element
+ || isElem ns "m" "r" element = do
+ let qualName = elemName ns "w"
+ let font = do
+ fontElem <- findElement (qualName "rFonts") element
+ stringToFont =<<
+ (foldr (<|>) Nothing $
+ map (flip findAttr fontElem . qualName) ["ascii", "hAnsi"])
+ local (setFont font) (mapD (elemToRunElem ns) (elChildren element))
+elemToRunElems _ _ = throwError WrongElem
+
+setFont :: Maybe Font -> ReaderEnv -> ReaderEnv
+setFont f s = s{envFont = f}