aboutsummaryrefslogtreecommitdiff
path: root/src/Text/Pandoc/Readers/Docx
diff options
context:
space:
mode:
authorClare Macrae <github@cfmacrae.fastmail.co.uk>2014-06-29 19:22:31 +0100
committerClare Macrae <github@cfmacrae.fastmail.co.uk>2014-06-29 19:22:31 +0100
commit717e16660d1ee83f690b35d0aa9b60c8ac9d6b61 (patch)
treeaa850d4ee99fa0b14da9ba0396ba6aa67e2037e3 /src/Text/Pandoc/Readers/Docx
parentfccfc8429cf4d002df37977f03508c9aae457416 (diff)
parentce69021e42d7bf50deccba2a52ed4717f6ddac10 (diff)
downloadpandoc-717e16660d1ee83f690b35d0aa9b60c8ac9d6b61.tar.gz
Merge remote-tracking branch 'jgm/master' into dokuwiki
Diffstat (limited to 'src/Text/Pandoc/Readers/Docx')
-rw-r--r--src/Text/Pandoc/Readers/Docx/Lists.hs227
-rw-r--r--src/Text/Pandoc/Readers/Docx/Parse.hs596
-rw-r--r--src/Text/Pandoc/Readers/Docx/Reducible.hs181
3 files changed, 1004 insertions, 0 deletions
diff --git a/src/Text/Pandoc/Readers/Docx/Lists.hs b/src/Text/Pandoc/Readers/Docx/Lists.hs
new file mode 100644
index 000000000..1e37d0076
--- /dev/null
+++ b/src/Text/Pandoc/Readers/Docx/Lists.hs
@@ -0,0 +1,227 @@
+{-
+Copyright (C) 2014 Jesse Rosenthal <jrosenthal@jhu.edu>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+-}
+
+{- |
+ Module : Text.Pandoc.Readers.Docx.Lists
+ Copyright : Copyright (C) 2014 Jesse Rosenthal
+ License : GNU GPL, version 2 or above
+
+ Maintainer : Jesse Rosenthal <jrosenthal@jhu.edu>
+ Stability : alpha
+ Portability : portable
+
+Functions for converting flat docx paragraphs into nested lists.
+-}
+
+module Text.Pandoc.Readers.Docx.Lists ( blocksToBullets
+ , blocksToDefinitions
+ , listParagraphDivs
+ ) where
+
+import Text.Pandoc.JSON
+import Text.Pandoc.Generic (bottomUp)
+import Text.Pandoc.Shared (trim)
+import Control.Monad
+import Data.List
+import Data.Maybe
+
+isListItem :: Block -> Bool
+isListItem (Div (_, classes, _) _) | "list-item" `elem` classes = True
+isListItem _ = False
+
+getLevel :: Block -> Maybe Integer
+getLevel (Div (_, _, kvs) _) = liftM read $ lookup "level" kvs
+getLevel _ = Nothing
+
+getLevelN :: Block -> Integer
+getLevelN b = case getLevel b of
+ Just n -> n
+ Nothing -> -1
+
+getNumId :: Block -> Maybe Integer
+getNumId (Div (_, _, kvs) _) = liftM read $ lookup "num-id" kvs
+getNumId _ = Nothing
+
+getNumIdN :: Block -> Integer
+getNumIdN b = case getNumId b of
+ Just n -> n
+ Nothing -> -1
+
+getText :: Block -> Maybe String
+getText (Div (_, _, kvs) _) = lookup "text" kvs
+getText _ = Nothing
+
+data ListType = Itemized | Enumerated ListAttributes
+
+listStyleMap :: [(String, ListNumberStyle)]
+listStyleMap = [("upperLetter", UpperAlpha),
+ ("lowerLetter", LowerAlpha),
+ ("upperRoman", UpperRoman),
+ ("lowerRoman", LowerRoman),
+ ("decimal", Decimal)]
+
+listDelimMap :: [(String, ListNumberDelim)]
+listDelimMap = [("%1)", OneParen),
+ ("(%1)", TwoParens),
+ ("%1.", Period)]
+
+getListType :: Block -> Maybe ListType
+getListType b@(Div (_, _, kvs) _) | isListItem b =
+ let
+ start = lookup "start" kvs
+ frmt = lookup "format" kvs
+ txt = lookup "text" kvs
+ in
+ case frmt of
+ Just "bullet" -> Just Itemized
+ Just f ->
+ case txt of
+ Just t -> Just $ Enumerated (
+ read (fromMaybe "1" start) :: Int,
+ fromMaybe DefaultStyle (lookup f listStyleMap),
+ fromMaybe DefaultDelim (lookup t listDelimMap))
+ Nothing -> Nothing
+ _ -> Nothing
+getListType _ = Nothing
+
+listParagraphDivs :: [String]
+listParagraphDivs = ["ListParagraph"]
+
+-- This is a first stab at going through and attaching meaning to list
+-- paragraphs, without an item marker, following a list item. We
+-- assume that these are paragraphs in the same item.
+
+handleListParagraphs :: [Block] -> [Block]
+handleListParagraphs [] = []
+handleListParagraphs (
+ (Div attr1@(_, classes1, _) blks1) :
+ (Div (ident2, classes2, kvs2) blks2) :
+ blks
+ ) | "list-item" `elem` classes1 &&
+ not ("list-item" `elem` classes2) &&
+ (not . null) (listParagraphDivs `intersect` classes2) =
+ -- We don't want to keep this indent.
+ let newDiv2 =
+ (Div (ident2, classes2, filter (\kv -> fst kv /= "indent") kvs2) blks2)
+ in
+ handleListParagraphs ((Div attr1 (blks1 ++ [newDiv2])) : blks)
+handleListParagraphs (blk:blks) = blk : (handleListParagraphs blks)
+
+separateBlocks' :: Block -> [[Block]] -> [[Block]]
+separateBlocks' blk ([] : []) = [[blk]]
+separateBlocks' b@(BulletList _) acc = (init acc) ++ [(last acc) ++ [b]]
+separateBlocks' b@(OrderedList _ _) acc = (init acc) ++ [(last acc) ++ [b]]
+-- The following is for the invisible bullet lists. This is how
+-- pandoc-generated ooxml does multiparagraph item lists.
+separateBlocks' b acc | liftM trim (getText b) == Just "" =
+ (init acc) ++ [(last acc) ++ [b]]
+separateBlocks' b acc = acc ++ [[b]]
+
+separateBlocks :: [Block] -> [[Block]]
+separateBlocks blks = foldr separateBlocks' [[]] (reverse blks)
+
+flatToBullets' :: Integer -> [Block] -> [Block]
+flatToBullets' _ [] = []
+flatToBullets' num xs@(b : elems)
+ | getLevelN b == num = b : (flatToBullets' num elems)
+ | otherwise =
+ let bNumId = getNumIdN b
+ bLevel = getLevelN b
+ (children, remaining) =
+ span
+ (\b' ->
+ ((getLevelN b') > bLevel ||
+ ((getLevelN b') == bLevel && (getNumIdN b') == bNumId)))
+ xs
+ in
+ case getListType b of
+ Just (Enumerated attr) ->
+ (OrderedList attr (separateBlocks $ flatToBullets' bLevel children)) :
+ (flatToBullets' num remaining)
+ _ ->
+ (BulletList (separateBlocks $ flatToBullets' bLevel children)) :
+ (flatToBullets' num remaining)
+
+flatToBullets :: [Block] -> [Block]
+flatToBullets elems = flatToBullets' (-1) elems
+
+blocksToBullets :: [Block] -> [Block]
+blocksToBullets blks =
+ bottomUp removeListDivs $
+ flatToBullets $ (handleListParagraphs blks)
+
+plainParaInlines :: Block -> [Inline]
+plainParaInlines (Plain ils) = ils
+plainParaInlines (Para ils) = ils
+plainParaInlines _ = []
+
+blocksToDefinitions' :: [([Inline], [[Block]])] -> [Block] -> [Block] -> [Block]
+blocksToDefinitions' [] acc [] = reverse acc
+blocksToDefinitions' defAcc acc [] =
+ reverse $ (DefinitionList (reverse defAcc)) : acc
+blocksToDefinitions' defAcc acc
+ ((Div (_, classes1, _) blks1) : (Div (ident2, classes2, kvs2) blks2) : blks)
+ | "DefinitionTerm" `elem` classes1 && "Definition" `elem` classes2 =
+ let remainingAttr2 = (ident2, delete "Definition" classes2, kvs2)
+ pair = case remainingAttr2 == ("", [], []) of
+ True -> (concatMap plainParaInlines blks1, [blks2])
+ False -> (concatMap plainParaInlines blks1, [[Div remainingAttr2 blks2]])
+ in
+ blocksToDefinitions' (pair : defAcc) acc blks
+blocksToDefinitions' defAcc acc
+ ((Div (ident2, classes2, kvs2) blks2) : blks)
+ | (not . null) defAcc && "Definition" `elem` classes2 =
+ let remainingAttr2 = (ident2, delete "Definition" classes2, kvs2)
+ defItems2 = case remainingAttr2 == ("", [], []) of
+ True -> blks2
+ False -> [Div remainingAttr2 blks2]
+ ((defTerm, defItems):defs) = defAcc
+ defAcc' = case null defItems of
+ True -> (defTerm, [defItems2]) : defs
+ False -> (defTerm, init defItems ++ [last defItems ++ defItems2]) : defs
+ in
+ blocksToDefinitions' defAcc' acc blks
+blocksToDefinitions' [] acc (b:blks) =
+ blocksToDefinitions' [] (b:acc) blks
+blocksToDefinitions' defAcc acc (b:blks) =
+ blocksToDefinitions' [] (b : (DefinitionList (reverse defAcc)) : acc) blks
+
+removeListDivs' :: Block -> [Block]
+removeListDivs' (Div (ident, classes, kvs) blks)
+ | "list-item" `elem` classes =
+ case delete "list-item" classes of
+ [] -> blks
+ classes' -> [Div (ident, classes', kvs) $ blks]
+removeListDivs' (Div (ident, classes, kvs) blks)
+ | not $ null $ listParagraphDivs `intersect` classes =
+ case classes \\ listParagraphDivs of
+ [] -> blks
+ classes' -> [Div (ident, classes', kvs) blks]
+removeListDivs' blk = [blk]
+
+removeListDivs :: [Block] -> [Block]
+removeListDivs = concatMap removeListDivs'
+
+
+
+blocksToDefinitions :: [Block] -> [Block]
+blocksToDefinitions = blocksToDefinitions' [] []
+
+
+
+
diff --git a/src/Text/Pandoc/Readers/Docx/Parse.hs b/src/Text/Pandoc/Readers/Docx/Parse.hs
new file mode 100644
index 000000000..07f34450d
--- /dev/null
+++ b/src/Text/Pandoc/Readers/Docx/Parse.hs
@@ -0,0 +1,596 @@
+{-
+Copyright (C) 2014 Jesse Rosenthal <jrosenthal@jhu.edu>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+-}
+
+{- |
+ Module : Text.Pandoc.Readers.Docx.Parse
+ Copyright : Copyright (C) 2014 Jesse Rosenthal
+ License : GNU GPL, version 2 or above
+
+ Maintainer : Jesse Rosenthal <jrosenthal@jhu.edu>
+ Stability : alpha
+ Portability : portable
+
+Conversion of docx archive into Docx haskell type
+-}
+
+
+module Text.Pandoc.Readers.Docx.Parse ( Docx(..)
+ , Document(..)
+ , Body(..)
+ , BodyPart(..)
+ , TblLook(..)
+ , ParPart(..)
+ , Run(..)
+ , RunElem(..)
+ , Notes
+ , Numbering
+ , Relationship
+ , Media
+ , RunStyle(..)
+ , ParagraphStyle(..)
+ , Row(..)
+ , Cell(..)
+ , getFootNote
+ , getEndNote
+ , lookupLevel
+ , lookupRelationship
+ , archiveToDocx
+ ) where
+import Codec.Archive.Zip
+import Text.XML.Light
+import Data.Maybe
+import Data.List
+import System.FilePath
+import Data.Bits ((.|.))
+import qualified Data.ByteString.Lazy as B
+import qualified Text.Pandoc.UTF8 as UTF8
+
+attrToNSPair :: Attr -> Maybe (String, String)
+attrToNSPair (Attr (QName s _ (Just "xmlns")) val) = Just (s, val)
+attrToNSPair _ = Nothing
+
+
+type NameSpaces = [(String, String)]
+
+data Docx = Docx Document Notes Numbering [Relationship] Media
+ deriving Show
+
+archiveToDocx :: Archive -> Maybe Docx
+archiveToDocx archive = do
+ let notes = archiveToNotes archive
+ rels = archiveToRelationships archive
+ media = archiveToMedia archive
+ doc <- archiveToDocument archive
+ numbering <- archiveToNumbering archive
+ return $ Docx doc notes numbering rels media
+
+data Document = Document NameSpaces Body
+ deriving Show
+
+archiveToDocument :: Archive -> Maybe Document
+archiveToDocument zf = do
+ entry <- findEntryByPath "word/document.xml" zf
+ docElem <- (parseXMLDoc . UTF8.toStringLazy . fromEntry) entry
+ let namespaces = mapMaybe attrToNSPair (elAttribs docElem)
+ bodyElem <- findChild (QName "body" (lookup "w" namespaces) Nothing) docElem
+ body <- elemToBody namespaces bodyElem
+ return $ Document namespaces body
+
+type Media = [(FilePath, B.ByteString)]
+
+filePathIsMedia :: FilePath -> Bool
+filePathIsMedia fp =
+ let (dir, _) = splitFileName fp
+ in
+ (dir == "word/media/")
+
+getMediaPair :: Archive -> FilePath -> Maybe (FilePath, B.ByteString)
+getMediaPair zf fp =
+ case findEntryByPath fp zf of
+ Just e -> Just (fp, fromEntry e)
+ Nothing -> Nothing
+
+archiveToMedia :: Archive -> Media
+archiveToMedia zf =
+ mapMaybe (getMediaPair zf) (filter filePathIsMedia (filesInArchive zf))
+
+data Numbering = Numbering NameSpaces [Numb] [AbstractNumb]
+ deriving Show
+
+data Numb = Numb String String -- right now, only a key to an abstract num
+ deriving Show
+
+data AbstractNumb = AbstractNumb String [Level]
+ deriving Show
+
+-- (ilvl, format, string, start)
+type Level = (String, String, String, Maybe Integer)
+
+lookupLevel :: String -> String -> Numbering -> Maybe Level
+lookupLevel numId ilvl (Numbering _ numbs absNumbs) = do
+ absNumId <- lookup numId $ map (\(Numb nid absnumid) -> (nid, absnumid)) numbs
+ lvls <- lookup absNumId $ map (\(AbstractNumb aid ls) -> (aid, ls)) absNumbs
+ lvl <- lookup ilvl $ map (\l@(i, _, _, _) -> (i, l)) lvls
+ return lvl
+
+numElemToNum :: NameSpaces -> Element -> Maybe Numb
+numElemToNum ns element |
+ qName (elName element) == "num" &&
+ qURI (elName element) == (lookup "w" ns) = do
+ numId <- findAttr (QName "numId" (lookup "w" ns) (Just "w")) element
+ absNumId <- findChild (QName "abstractNumId" (lookup "w" ns) (Just "w")) element
+ >>= findAttr (QName "val" (lookup "w" ns) (Just "w"))
+ return $ Numb numId absNumId
+numElemToNum _ _ = Nothing
+
+absNumElemToAbsNum :: NameSpaces -> Element -> Maybe AbstractNumb
+absNumElemToAbsNum ns element |
+ qName (elName element) == "abstractNum" &&
+ qURI (elName element) == (lookup "w" ns) = do
+ absNumId <- findAttr
+ (QName "abstractNumId" (lookup "w" ns) (Just "w"))
+ element
+ let levelElems = findChildren
+ (QName "lvl" (lookup "w" ns) (Just "w"))
+ element
+ levels = mapMaybe (levelElemToLevel ns) levelElems
+ return $ AbstractNumb absNumId levels
+absNumElemToAbsNum _ _ = Nothing
+
+levelElemToLevel :: NameSpaces -> Element -> Maybe Level
+levelElemToLevel ns element |
+ qName (elName element) == "lvl" &&
+ qURI (elName element) == (lookup "w" ns) = do
+ ilvl <- findAttr (QName "ilvl" (lookup "w" ns) (Just "w")) element
+ fmt <- findChild (QName "numFmt" (lookup "w" ns) (Just "w")) element
+ >>= findAttr (QName "val" (lookup "w" ns) (Just "w"))
+ txt <- findChild (QName "lvlText" (lookup "w" ns) (Just "w")) element
+ >>= findAttr (QName "val" (lookup "w" ns) (Just "w"))
+ let start = findChild (QName "start" (lookup "w" ns) (Just "w")) element
+ >>= findAttr (QName "val" (lookup "w" ns) (Just "w"))
+ >>= (\s -> listToMaybe (map fst (reads s :: [(Integer, String)])))
+ return (ilvl, fmt, txt, start)
+levelElemToLevel _ _ = Nothing
+
+archiveToNumbering :: Archive -> Maybe Numbering
+archiveToNumbering zf =
+ case findEntryByPath "word/numbering.xml" zf of
+ Nothing -> Just $ Numbering [] [] []
+ Just entry -> do
+ numberingElem <- (parseXMLDoc . UTF8.toStringLazy . fromEntry) entry
+ let namespaces = mapMaybe attrToNSPair (elAttribs numberingElem)
+ numElems = findChildren
+ (QName "num" (lookup "w" namespaces) (Just "w"))
+ numberingElem
+ absNumElems = findChildren
+ (QName "abstractNum" (lookup "w" namespaces) (Just "w"))
+ numberingElem
+ nums = mapMaybe (numElemToNum namespaces) numElems
+ absNums = mapMaybe (absNumElemToAbsNum namespaces) absNumElems
+ return $ Numbering namespaces nums absNums
+
+data Notes = Notes NameSpaces (Maybe [(String, [BodyPart])]) (Maybe [(String, [BodyPart])])
+ deriving Show
+
+noteElemToNote :: NameSpaces -> Element -> Maybe (String, [BodyPart])
+noteElemToNote ns element
+ | qName (elName element) `elem` ["endnote", "footnote"] &&
+ qURI (elName element) == (lookup "w" ns) =
+ do
+ noteId <- findAttr (QName "id" (lookup "w" ns) (Just "w")) element
+ let bps = mapMaybe (elemToBodyPart ns)
+ $ elChildren element
+ return $ (noteId, bps)
+noteElemToNote _ _ = Nothing
+
+getFootNote :: String -> Notes -> Maybe [BodyPart]
+getFootNote s (Notes _ fns _) = fns >>= (lookup s)
+
+getEndNote :: String -> Notes -> Maybe [BodyPart]
+getEndNote s (Notes _ _ ens) = ens >>= (lookup s)
+
+elemToNotes :: NameSpaces -> String -> Element -> Maybe [(String, [BodyPart])]
+elemToNotes ns notetype element
+ | qName (elName element) == (notetype ++ "s") &&
+ qURI (elName element) == (lookup "w" ns) =
+ Just $ mapMaybe (noteElemToNote ns)
+ $ findChildren (QName notetype (lookup "w" ns) (Just "w")) element
+elemToNotes _ _ _ = Nothing
+
+archiveToNotes :: Archive -> Notes
+archiveToNotes zf =
+ let fnElem = findEntryByPath "word/footnotes.xml" zf
+ >>= (parseXMLDoc . UTF8.toStringLazy . fromEntry)
+ enElem = findEntryByPath "word/endnotes.xml" zf
+ >>= (parseXMLDoc . UTF8.toStringLazy . fromEntry)
+ fn_namespaces = case fnElem of
+ Just e -> mapMaybe attrToNSPair (elAttribs e)
+ Nothing -> []
+ en_namespaces = case enElem of
+ Just e -> mapMaybe attrToNSPair (elAttribs e)
+ Nothing -> []
+ ns = unionBy (\x y -> fst x == fst y) fn_namespaces en_namespaces
+ fn = fnElem >>= (elemToNotes ns "footnote")
+ en = enElem >>= (elemToNotes ns "endnote")
+ in
+ Notes ns fn en
+
+
+data Relationship = Relationship (RelId, Target)
+ deriving Show
+
+lookupRelationship :: RelId -> [Relationship] -> Maybe Target
+lookupRelationship relid rels =
+ lookup relid (map (\(Relationship pair) -> pair) rels)
+
+filePathIsRel :: FilePath -> Bool
+filePathIsRel fp =
+ let (dir, name) = splitFileName fp
+ in
+ (dir == "word/_rels/") && ((takeExtension name) == ".rels")
+
+relElemToRelationship :: Element -> Maybe Relationship
+relElemToRelationship element | qName (elName element) == "Relationship" =
+ do
+ relId <- findAttr (QName "Id" Nothing Nothing) element
+ target <- findAttr (QName "Target" Nothing Nothing) element
+ return $ Relationship (relId, target)
+relElemToRelationship _ = Nothing
+
+
+archiveToRelationships :: Archive -> [Relationship]
+archiveToRelationships archive =
+ let relPaths = filter filePathIsRel (filesInArchive archive)
+ entries = mapMaybe (\f -> findEntryByPath f archive) relPaths
+ relElems = mapMaybe (parseXMLDoc . UTF8.toStringLazy . fromEntry) entries
+ rels = mapMaybe relElemToRelationship $ concatMap elChildren relElems
+ in
+ rels
+
+data Body = Body [BodyPart]
+ deriving Show
+
+elemToBody :: NameSpaces -> Element -> Maybe Body
+elemToBody ns element | qName (elName element) == "body" && qURI (elName element) == (lookup "w" ns) =
+ Just $ Body
+ $ mapMaybe (elemToBodyPart ns) $ elChildren element
+elemToBody _ _ = Nothing
+
+elemToNumInfo :: NameSpaces -> Element -> Maybe (String, String)
+elemToNumInfo ns element
+ | qName (elName element) == "p" &&
+ qURI (elName element) == (lookup "w" ns) =
+ do
+ pPr <- findChild (QName "pPr" (lookup "w" ns) (Just "w")) element
+ numPr <- findChild (QName "numPr" (lookup "w" ns) (Just "w")) pPr
+ lvl <- findChild (QName "ilvl" (lookup "w" ns) (Just "w")) numPr >>=
+ findAttr (QName "val" (lookup "w" ns) (Just "w"))
+ numId <- findChild (QName "numId" (lookup "w" ns) (Just "w")) numPr >>=
+ findAttr (QName "val" (lookup "w" ns) (Just "w"))
+ return (numId, lvl)
+elemToNumInfo _ _ = Nothing
+
+elemToBodyPart :: NameSpaces -> Element -> Maybe BodyPart
+elemToBodyPart ns element
+ | qName (elName element) == "p" &&
+ qURI (elName element) == (lookup "w" ns) =
+ let parstyle = elemToParagraphStyle ns element
+ parparts = mapMaybe (elemToParPart ns)
+ $ elChildren element
+ in
+ case elemToNumInfo ns element of
+ Just (numId, lvl) -> Just $ ListItem parstyle numId lvl parparts
+ Nothing -> Just $ Paragraph parstyle parparts
+ | qName (elName element) == "tbl" &&
+ qURI (elName element) == (lookup "w" ns) =
+ let
+ caption = findChild (QName "tblPr" (lookup "w" ns) (Just "w")) element
+ >>= findChild (QName "tblCaption" (lookup "w" ns) (Just "w"))
+ >>= findAttr (QName "val" (lookup "w" ns) (Just "w"))
+ grid = case
+ findChild (QName "tblGrid" (lookup "w" ns) (Just "w")) element
+ of
+ Just g -> elemToTblGrid ns g
+ Nothing -> []
+ tblLook = findChild (QName "tblPr" (lookup "w" ns) (Just "w")) element
+ >>= findChild (QName "tblLook" (lookup "w" ns) (Just "w"))
+ >>= elemToTblLook ns
+ in
+ Just $ Tbl
+ (fromMaybe "" caption)
+ grid
+ (fromMaybe defaultTblLook tblLook)
+ (mapMaybe (elemToRow ns) (elChildren element))
+ | otherwise = Nothing
+
+elemToTblLook :: NameSpaces -> Element -> Maybe TblLook
+elemToTblLook ns element
+ | qName (elName element) == "tblLook" &&
+ qURI (elName element) == (lookup "w" ns) =
+ let firstRow = findAttr (QName "firstRow" (lookup "w" ns) (Just "w")) element
+ val = findAttr (QName "val" (lookup "w" ns) (Just "w")) element
+ firstRowFmt =
+ case firstRow of
+ Just "1" -> True
+ Just _ -> False
+ Nothing -> case val of
+ Just bitMask -> testBitMask bitMask 0x020
+ Nothing -> False
+ in
+ Just $ TblLook{firstRowFormatting = firstRowFmt}
+elemToTblLook _ _ = Nothing
+
+testBitMask :: String -> Int -> Bool
+testBitMask bitMaskS n =
+ case (reads ("0x" ++ bitMaskS) :: [(Int, String)]) of
+ [] -> False
+ ((n', _) : _) -> ((n' .|. n) /= 0)
+
+data ParagraphStyle = ParagraphStyle { pStyle :: [String]
+ , indent :: Maybe Integer
+ }
+ deriving Show
+
+defaultParagraphStyle :: ParagraphStyle
+defaultParagraphStyle = ParagraphStyle { pStyle = []
+ , indent = Nothing
+ }
+
+elemToParagraphStyle :: NameSpaces -> Element -> ParagraphStyle
+elemToParagraphStyle ns element =
+ case findChild (QName "pPr" (lookup "w" ns) (Just "w")) element of
+ Just pPr ->
+ ParagraphStyle
+ {pStyle =
+ mapMaybe
+ (findAttr (QName "val" (lookup "w" ns) (Just "w")))
+ (findChildren (QName "pStyle" (lookup "w" ns) (Just "w")) pPr)
+ , indent =
+ findChild (QName "ind" (lookup "w" ns) (Just "w")) pPr >>=
+ findAttr (QName "left" (lookup "w" ns) (Just "w")) >>=
+ stringToInteger
+ }
+ Nothing -> defaultParagraphStyle
+
+
+data BodyPart = Paragraph ParagraphStyle [ParPart]
+ | ListItem ParagraphStyle String String [ParPart]
+ | Tbl String TblGrid TblLook [Row]
+
+ deriving Show
+
+type TblGrid = [Integer]
+
+data TblLook = TblLook {firstRowFormatting::Bool}
+ deriving Show
+
+defaultTblLook :: TblLook
+defaultTblLook = TblLook{firstRowFormatting = False}
+
+stringToInteger :: String -> Maybe Integer
+stringToInteger s = listToMaybe $ map fst (reads s :: [(Integer, String)])
+
+elemToTblGrid :: NameSpaces -> Element -> TblGrid
+elemToTblGrid ns element
+ | qName (elName element) == "tblGrid" &&
+ qURI (elName element) == (lookup "w" ns) =
+ let
+ cols = findChildren (QName "gridCol" (lookup "w" ns) (Just "w")) element
+ in
+ mapMaybe (\e ->
+ findAttr (QName "val" (lookup "w" ns) (Just ("w"))) e
+ >>= stringToInteger
+ )
+ cols
+elemToTblGrid _ _ = []
+
+data Row = Row [Cell]
+ deriving Show
+
+
+elemToRow :: NameSpaces -> Element -> Maybe Row
+elemToRow ns element
+ | qName (elName element) == "tr" &&
+ qURI (elName element) == (lookup "w" ns) =
+ let
+ cells = findChildren (QName "tc" (lookup "w" ns) (Just "w")) element
+ in
+ Just $ Row (mapMaybe (elemToCell ns) cells)
+elemToRow _ _ = Nothing
+
+data Cell = Cell [BodyPart]
+ deriving Show
+
+elemToCell :: NameSpaces -> Element -> Maybe Cell
+elemToCell ns element
+ | qName (elName element) == "tc" &&
+ qURI (elName element) == (lookup "w" ns) =
+ Just $ Cell (mapMaybe (elemToBodyPart ns) (elChildren element))
+elemToCell _ _ = Nothing
+
+data ParPart = PlainRun Run
+ | Insertion ChangeId Author ChangeDate [Run]
+ | Deletion ChangeId Author ChangeDate [Run]
+ | BookMark BookMarkId Anchor
+ | InternalHyperLink Anchor [Run]
+ | ExternalHyperLink RelId [Run]
+ | Drawing String
+ deriving Show
+
+data Run = Run RunStyle [RunElem]
+ | Footnote String
+ | Endnote String
+ deriving Show
+
+data RunElem = TextRun String | LnBrk | Tab
+ deriving Show
+
+data RunStyle = RunStyle { isBold :: Bool
+ , isItalic :: Bool
+ , isSmallCaps :: Bool
+ , isStrike :: Bool
+ , isSuperScript :: Bool
+ , isSubScript :: Bool
+ , underline :: Maybe String
+ , rStyle :: Maybe String }
+ deriving Show
+
+defaultRunStyle :: RunStyle
+defaultRunStyle = RunStyle { isBold = False
+ , isItalic = False
+ , isSmallCaps = False
+ , isStrike = False
+ , isSuperScript = False
+ , isSubScript = False
+ , underline = Nothing
+ , rStyle = Nothing
+ }
+
+elemToRunStyle :: NameSpaces -> Element -> RunStyle
+elemToRunStyle ns element =
+ case findChild (QName "rPr" (lookup "w" ns) (Just "w")) element of
+ Just rPr ->
+ RunStyle
+ {
+ isBold = isJust $ findChild (QName "b" (lookup "w" ns) (Just "w")) rPr
+ , isItalic = isJust $ findChild (QName "i" (lookup "w" ns) (Just "w")) rPr
+ , isSmallCaps = isJust $ findChild (QName "smallCaps" (lookup "w" ns) (Just "w")) rPr
+ , isStrike = isJust $ findChild (QName "strike" (lookup "w" ns) (Just "w")) rPr
+ , isSuperScript =
+ (Just "superscript" ==
+ (findChild (QName "vertAlign" (lookup "w" ns) (Just "w")) rPr >>=
+ findAttr (QName "val" (lookup "w" ns) (Just "w"))))
+ , isSubScript =
+ (Just "subscript" ==
+ (findChild (QName "vertAlign" (lookup "w" ns) (Just "w")) rPr >>=
+ findAttr (QName "val" (lookup "w" ns) (Just "w"))))
+ , underline =
+ findChild (QName "u" (lookup "w" ns) (Just "w")) rPr >>=
+ findAttr (QName "val" (lookup "w" ns) (Just "w"))
+ , rStyle =
+ findChild (QName "rStyle" (lookup "w" ns) (Just "w")) rPr >>=
+ findAttr (QName "val" (lookup "w" ns) (Just "w"))
+ }
+ Nothing -> defaultRunStyle
+
+elemToRun :: NameSpaces -> Element -> Maybe Run
+elemToRun ns element
+ | qName (elName element) == "r" &&
+ qURI (elName element) == (lookup "w" ns) =
+ case
+ findChild (QName "footnoteReference" (lookup "w" ns) (Just "w")) element >>=
+ findAttr (QName "id" (lookup "w" ns) (Just "w"))
+ of
+ Just s -> Just $ Footnote s
+ Nothing ->
+ case
+ findChild (QName "endnoteReference" (lookup "w" ns) (Just "w")) element >>=
+ findAttr (QName "id" (lookup "w" ns) (Just "w"))
+ of
+ Just s -> Just $ Endnote s
+ Nothing -> Just $
+ Run (elemToRunStyle ns element)
+ (elemToRunElems ns element)
+elemToRun _ _ = Nothing
+
+elemToRunElem :: NameSpaces -> Element -> Maybe RunElem
+elemToRunElem ns element
+ | (qName (elName element) == "t" || qName (elName element) == "delText") &&
+ qURI (elName element) == (lookup "w" ns) =
+ Just $ TextRun (strContent element)
+ | qName (elName element) == "br" &&
+ qURI (elName element) == (lookup "w" ns) =
+ Just $ LnBrk
+ | qName (elName element) == "tab" &&
+ qURI (elName element) == (lookup "w" ns) =
+ Just $ Tab
+ | otherwise = Nothing
+
+
+elemToRunElems :: NameSpaces -> Element -> [RunElem]
+elemToRunElems ns element
+ | qName (elName element) == "r" &&
+ qURI (elName element) == (lookup "w" ns) =
+ mapMaybe (elemToRunElem ns) (elChildren element)
+ | otherwise = []
+
+elemToDrawing :: NameSpaces -> Element -> Maybe ParPart
+elemToDrawing ns element
+ | qName (elName element) == "drawing" &&
+ qURI (elName element) == (lookup "w" ns) =
+ let a_ns = "http://schemas.openxmlformats.org/drawingml/2006/main"
+ in
+ findElement (QName "blip" (Just a_ns) (Just "a")) element
+ >>= findAttr (QName "embed" (lookup "r" ns) (Just "r"))
+ >>= (\s -> Just $ Drawing s)
+elemToDrawing _ _ = Nothing
+
+
+elemToParPart :: NameSpaces -> Element -> Maybe ParPart
+elemToParPart ns element
+ | qName (elName element) == "r" &&
+ qURI (elName element) == (lookup "w" ns) =
+ case findChild (QName "drawing" (lookup "w" ns) (Just "w")) element of
+ Just drawingElem -> elemToDrawing ns drawingElem
+ Nothing -> do
+ r <- elemToRun ns element
+ return $ PlainRun r
+elemToParPart ns element
+ | qName (elName element) == "ins" &&
+ qURI (elName element) == (lookup "w" ns) = do
+ cId <- findAttr (QName "id" (lookup "w" ns) (Just "w")) element
+ cAuthor <- findAttr (QName "author" (lookup "w" ns) (Just "w")) element
+ cDate <- findAttr (QName "date" (lookup "w" ns) (Just "w")) element
+ let runs = mapMaybe (elemToRun ns) (elChildren element)
+ return $ Insertion cId cAuthor cDate runs
+elemToParPart ns element
+ | qName (elName element) == "del" &&
+ qURI (elName element) == (lookup "w" ns) = do
+ cId <- findAttr (QName "id" (lookup "w" ns) (Just "w")) element
+ cAuthor <- findAttr (QName "author" (lookup "w" ns) (Just "w")) element
+ cDate <- findAttr (QName "date" (lookup "w" ns) (Just "w")) element
+ let runs = mapMaybe (elemToRun ns) (elChildren element)
+ return $ Deletion cId cAuthor cDate runs
+elemToParPart ns element
+ | qName (elName element) == "bookmarkStart" &&
+ qURI (elName element) == (lookup "w" ns) = do
+ bmId <- findAttr (QName "id" (lookup "w" ns) (Just "w")) element
+ bmName <- findAttr (QName "name" (lookup "w" ns) (Just "w")) element
+ return $ BookMark bmId bmName
+elemToParPart ns element
+ | qName (elName element) == "hyperlink" &&
+ qURI (elName element) == (lookup "w" ns) =
+ let runs = mapMaybe (elemToRun ns)
+ $ findChildren (QName "r" (lookup "w" ns) (Just "w")) element
+ in
+ case findAttr (QName "anchor" (lookup "w" ns) (Just "w")) element of
+ Just anchor ->
+ Just $ InternalHyperLink anchor runs
+ Nothing ->
+ case findAttr (QName "id" (lookup "r" ns) (Just "r")) element of
+ Just relId -> Just $ ExternalHyperLink relId runs
+ Nothing -> Nothing
+elemToParPart _ _ = Nothing
+
+type Target = String
+type Anchor = String
+type BookMarkId = String
+type RelId = String
+type ChangeId = String
+type Author = String
+type ChangeDate = String
diff --git a/src/Text/Pandoc/Readers/Docx/Reducible.hs b/src/Text/Pandoc/Readers/Docx/Reducible.hs
new file mode 100644
index 000000000..8c105d1f1
--- /dev/null
+++ b/src/Text/Pandoc/Readers/Docx/Reducible.hs
@@ -0,0 +1,181 @@
+{-# LANGUAGE OverloadedStrings #-}
+
+{-
+Copyright (C) 2014 Jesse Rosenthal <jrosenthal@jhu.edu>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+-}
+
+{- |
+ Module : Text.Pandoc.Readers.Docx.Reducible
+ Copyright : Copyright (C) 2014 Jesse Rosenthal
+ License : GNU GPL, version 2 or above
+
+ Maintainer : Jesse Rosenthal <jrosenthal@jhu.edu>
+ Stability : alpha
+ Portability : portable
+
+Typeclass for combining adjacent blocks and inlines correctly.
+-}
+
+
+module Text.Pandoc.Readers.Docx.Reducible ((<++>),
+ (<+++>),
+ Reducible,
+ Container(..),
+ container,
+ innards,
+ reduceList,
+ reduceListB,
+ rebuild)
+ where
+
+import Text.Pandoc.Builder
+import Data.List ((\\), intersect)
+
+data Container a = Container ([a] -> a) | NullContainer
+
+instance (Eq a) => Eq (Container a) where
+ (Container x) == (Container y) = ((x []) == (y []))
+ NullContainer == NullContainer = True
+ _ == _ = False
+
+instance (Show a) => Show (Container a) where
+ show (Container x) = "Container {" ++
+ (reverse $ drop 3 $ reverse $ show $ x []) ++
+ "}"
+ show (NullContainer) = "NullContainer"
+
+class Reducible a where
+ (<++>) :: a -> a -> [a]
+ container :: a -> Container a
+ innards :: a -> [a]
+ isSpace :: a -> Bool
+
+(<+++>) :: (Reducible a) => Many a -> Many a -> Many a
+mr <+++> ms = fromList $ reduceList $ toList mr ++ toList ms
+
+reduceListB :: (Reducible a) => Many a -> Many a
+reduceListB = fromList . reduceList . toList
+
+reduceList' :: (Reducible a) => [a] -> [a] -> [a]
+reduceList' acc [] = acc
+reduceList' [] (x:xs) = reduceList' [x] xs
+reduceList' as (x:xs) = reduceList' (init as ++ (last as <++> x) ) xs
+
+reduceList :: (Reducible a) => [a] -> [a]
+reduceList = reduceList' []
+
+combineReducibles :: (Reducible a, Eq a) => a -> a -> [a]
+combineReducibles r s =
+ let (conts, rs) = topLevelContainers r
+ (conts', ss) = topLevelContainers s
+ shared = conts `intersect` conts'
+ remaining = conts \\ shared
+ remaining' = conts' \\ shared
+ in
+ case null shared of
+ True -> case (not . null) rs && isSpace (last rs) of
+ True -> rebuild conts (init rs) ++ [last rs, s]
+ False -> [r,s]
+ False -> rebuild
+ shared $
+ reduceList $
+ (rebuild remaining rs) ++ (rebuild remaining' ss)
+
+instance Reducible Inline where
+ s1@(Span (id1, classes1, kvs1) ils1) <++> s2@(Span (id2, classes2, kvs2) ils2) =
+ let classes' = classes1 `intersect` classes2
+ kvs' = kvs1 `intersect` kvs2
+ classes1' = classes1 \\ classes'
+ kvs1' = kvs1 \\ kvs'
+ classes2' = classes2 \\ classes'
+ kvs2' = kvs2 \\ kvs'
+ in
+ case null classes' && null kvs' of
+ True -> [s1,s2]
+ False -> let attr' = ("", classes', kvs')
+ attr1' = (id1, classes1', kvs1')
+ attr2' = (id2, classes2', kvs2')
+ s1' = case null classes1' && null kvs1' of
+ True -> ils1
+ False -> [Span attr1' ils1]
+ s2' = case null classes2' && null kvs2' of
+ True -> ils2
+ False -> [Span attr2' ils2]
+ in
+ [Span attr' $ reduceList $ s1' ++ s2']
+
+ (Str x) <++> (Str y) = [Str (x++y)]
+ il <++> il' = combineReducibles il il'
+
+ container (Emph _) = Container Emph
+ container (Strong _) = Container Strong
+ container (Strikeout _) = Container Strikeout
+ container (Subscript _) = Container Subscript
+ container (Superscript _) = Container Superscript
+ container (Quoted qt _) = Container $ Quoted qt
+ container (Cite cs _) = Container $ Cite cs
+ container (Span attr _) = Container $ Span attr
+ container _ = NullContainer
+
+ innards (Emph ils) = ils
+ innards (Strong ils) = ils
+ innards (Strikeout ils) = ils
+ innards (Subscript ils) = ils
+ innards (Superscript ils) = ils
+ innards (Quoted _ ils) = ils
+ innards (Cite _ ils) = ils
+ innards (Span _ ils) = ils
+ innards _ = []
+
+ isSpace Space = True
+ isSpace _ = False
+
+instance Reducible Block where
+ (Div (ident, classes, kvs) blks) <++> blk | "list-item" `elem` classes =
+ [Div (ident, classes, kvs) (reduceList blks), blk]
+
+ blk <++> blk' = combineReducibles blk blk'
+
+ container (BlockQuote _) = Container BlockQuote
+ container (Div attr _) = Container $ Div attr
+ container _ = NullContainer
+
+ innards (BlockQuote bs) = bs
+ innards (Div _ bs) = bs
+ innards _ = []
+
+ isSpace _ = False
+
+
+topLevelContainers' :: (Reducible a) => [a] -> ([Container a], [a])
+topLevelContainers' (r : []) = case container r of
+ NullContainer -> ([], [r])
+ _ ->
+ let (conts, inns) = topLevelContainers' (innards r)
+ in
+ ((container r) : conts, inns)
+topLevelContainers' rs = ([], rs)
+
+topLevelContainers :: (Reducible a) => a -> ([Container a], [a])
+topLevelContainers il = topLevelContainers' [il]
+
+rebuild :: [Container a] -> [a] -> [a]
+rebuild [] xs = xs
+rebuild ((Container f) : cs) xs = rebuild cs $ [f xs]
+rebuild (NullContainer : cs) xs = rebuild cs $ xs
+
+