From bbe99003f8d25dc65ab12851907ecd5d9aad746c Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Mon, 16 Jun 2014 22:44:40 -0700 Subject: Naming: Use Docx instead of DocX. For consistency with the existing writer. --- src/Text/Pandoc/Readers/DocX.hs | 479 --------------------------- src/Text/Pandoc/Readers/DocX/Lists.hs | 208 ------------ src/Text/Pandoc/Readers/DocX/Parse.hs | 604 ---------------------------------- src/Text/Pandoc/Readers/Docx.hs | 479 +++++++++++++++++++++++++++ src/Text/Pandoc/Readers/Docx/Lists.hs | 208 ++++++++++++ src/Text/Pandoc/Readers/Docx/Parse.hs | 604 ++++++++++++++++++++++++++++++++++ 6 files changed, 1291 insertions(+), 1291 deletions(-) delete mode 100644 src/Text/Pandoc/Readers/DocX.hs delete mode 100644 src/Text/Pandoc/Readers/DocX/Lists.hs delete mode 100644 src/Text/Pandoc/Readers/DocX/Parse.hs create mode 100644 src/Text/Pandoc/Readers/Docx.hs create mode 100644 src/Text/Pandoc/Readers/Docx/Lists.hs create mode 100644 src/Text/Pandoc/Readers/Docx/Parse.hs (limited to 'src/Text/Pandoc') diff --git a/src/Text/Pandoc/Readers/DocX.hs b/src/Text/Pandoc/Readers/DocX.hs deleted file mode 100644 index 976e2e271..000000000 --- a/src/Text/Pandoc/Readers/DocX.hs +++ /dev/null @@ -1,479 +0,0 @@ -{- -Copyright (C) 2014 Jesse Rosenthal - -This program is free software; you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation; either version 2 of the License, or -(at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA --} - -{- | - Module : Text.Pandoc.Readers.DocX - Copyright : Copyright (C) 2014 Jesse Rosenthal - License : GNU GPL, version 2 or above - - Maintainer : Jesse Rosenthal - Stability : alpha - Portability : portable - -Conversion of DocX type (defined in Text.Pandoc.Readers.DocX.Parse) -to 'Pandoc' document. -} - -{- -Current state of implementation of DocX entities ([x] means -implemented, [-] means partially implemented): - -* Blocks - - - [X] Para - - [X] CodeBlock (styled with `SourceCode`) - - [X] BlockQuote (styled with `Quote`, `BlockQuote`, or, optionally, - indented) - - [X] OrderedList - - [X] BulletList - - [X] DefinitionList (styled with adjacent `DefinitionTerm` and `Definition`) - - [X] Header (styled with `Heading#`) - - [ ] HorizontalRule - - [-] Table (column widths and alignments not yet implemented) - -* Inlines - - - [X] Str - - [X] Emph (From italics. `underline` currently read as span. In - future, it might optionally be emph as well) - - [X] Strong - - [X] Strikeout - - [X] Superscript - - [X] Subscript - - [X] SmallCaps - - [ ] Quoted - - [ ] Cite - - [X] Code (styled with `VerbatimChar`) - - [X] Space - - [X] LineBreak (these are invisible in Word: entered with Shift-Return) - - [ ] Math - - [X] Link (links to an arbitrary bookmark create a span with the target as - id and "anchor" class) - - [-] Image (Links to path in archive. Future option for - data-encoded URI likely.) - - [X] Note (Footnotes and Endnotes are silently combined.) --} - -module Text.Pandoc.Readers.DocX - ( readDocX - ) where - -import Codec.Archive.Zip -import Text.Pandoc.Definition -import Text.Pandoc.Options -import Text.Pandoc.Builder (text, toList) -import Text.Pandoc.Generic (bottomUp) -import Text.Pandoc.MIME (getMimeType) -import Text.Pandoc.UTF8 (toString) -import Text.Pandoc.Readers.DocX.Parse -import Text.Pandoc.Readers.DocX.Lists -import Data.Maybe (mapMaybe, isJust, fromJust) -import Data.List (delete, isPrefixOf, (\\), intersect) -import qualified Data.ByteString as BS -import qualified Data.ByteString.Lazy as B -import Data.ByteString.Base64 (encode) -import System.FilePath (combine) - -readDocX :: ReaderOptions - -> B.ByteString - -> Pandoc -readDocX opts bytes = - case archiveToDocX (toArchive bytes) of - Just docx -> Pandoc nullMeta (docxToBlocks opts docx) - Nothing -> error $ "couldn't parse docx file" - -runStyleToSpanAttr :: RunStyle -> (String, [String], [(String, String)]) -runStyleToSpanAttr rPr = ("", - mapMaybe id [ - if isBold rPr then (Just "strong") else Nothing, - if isItalic rPr then (Just "emph") else Nothing, - if isSmallCaps rPr then (Just "smallcaps") else Nothing, - if isStrike rPr then (Just "strike") else Nothing, - if isSuperScript rPr then (Just "superscript") else Nothing, - if isSubScript rPr then (Just "subscript") else Nothing, - rStyle rPr], - case underline rPr of - Just fmt -> [("underline", fmt)] - _ -> [] - ) - -parStyleToDivAttr :: ParagraphStyle -> (String, [String], [(String, String)]) -parStyleToDivAttr pPr = ("", - pStyle pPr, - case indent pPr of - Just n -> [("indent", (show n))] - Nothing -> [] - ) - -strToInlines :: String -> [Inline] -strToInlines = toList . text - -codeSpans :: [String] -codeSpans = ["VerbatimChar"] - -blockQuoteDivs :: [String] -blockQuoteDivs = ["Quote", "BlockQuote"] - -codeDivs :: [String] -codeDivs = ["SourceCode"] - -runElemToInlines :: RunElem -> [Inline] -runElemToInlines (TextRun s) = strToInlines s -runElemToInlines (LnBrk) = [LineBreak] - -runElemToString :: RunElem -> String -runElemToString (TextRun s) = s -runElemToString (LnBrk) = ['\n'] - -runElemsToString :: [RunElem] -> String -runElemsToString = concatMap runElemToString - -strNormalize :: [Inline] -> [Inline] -strNormalize [] = [] -strNormalize (Str "" : ils) = strNormalize ils -strNormalize ((Str s) : (Str s') : l) = strNormalize ((Str (s++s')) : l) -strNormalize (il:ils) = il : (strNormalize ils) - -runToInlines :: ReaderOptions -> DocX -> Run -> [Inline] -runToInlines _ _ (Run rs runElems) - | isJust (rStyle rs) && (fromJust (rStyle rs)) `elem` codeSpans = - case runStyleToSpanAttr rs == ("", [], []) of - True -> [Str (runElemsToString runElems)] - False -> [Span (runStyleToSpanAttr rs) [Str (runElemsToString runElems)]] - | otherwise = case runStyleToSpanAttr rs == ("", [], []) of - True -> concatMap runElemToInlines runElems - False -> [Span (runStyleToSpanAttr rs) (concatMap runElemToInlines runElems)] -runToInlines opts docx@(DocX _ notes _ _ _ ) (Footnote fnId) = - case (getFootNote fnId notes) of - Just bodyParts -> - [Note [Div ("", ["footnote"], []) (map (bodyPartToBlock opts docx) bodyParts)]] - Nothing -> - [Note [Div ("", ["footnote"], []) []]] -runToInlines opts docx@(DocX _ notes _ _ _) (Endnote fnId) = - case (getEndNote fnId notes) of - Just bodyParts -> - [Note [Div ("", ["endnote"], []) (map (bodyPartToBlock opts docx) bodyParts)]] - Nothing -> - [Note [Div ("", ["endnote"], []) []]] - -parPartToInlines :: ReaderOptions -> DocX -> ParPart -> [Inline] -parPartToInlines opts docx (PlainRun r) = runToInlines opts docx r -parPartToInlines _ _ (BookMark _ anchor) = - [Span (anchor, ["anchor"], []) []] -parPartToInlines _ (DocX _ _ _ rels _) (Drawing relid) = - case lookupRelationship relid rels of - Just target -> [Image [] (combine "word" target, "")] - Nothing -> [Image [] ("", "")] -parPartToInlines opts docx (InternalHyperLink anchor runs) = - [Link (concatMap (runToInlines opts docx) runs) ('#' : anchor, "")] -parPartToInlines opts docx@(DocX _ _ _ rels _) (ExternalHyperLink relid runs) = - case lookupRelationship relid rels of - Just target -> - [Link (concatMap (runToInlines opts docx) runs) (target, "")] - Nothing -> - [Link (concatMap (runToInlines opts docx) runs) ("", "")] - -isAnchorSpan :: Inline -> Bool -isAnchorSpan (Span (ident, classes, kvs) ils) = - (not . null) ident && - classes == ["anchor"] && - null kvs && - null ils -isAnchorSpan _ = False - -dummyAnchors :: [String] -dummyAnchors = ["_GoBack"] - -makeHeaderAnchors :: Block -> Block -makeHeaderAnchors h@(Header n (_, classes, kvs) ils) = - case filter isAnchorSpan ils of - [] -> h - (x@(Span (ident, _, _) _) : xs) -> - case ident `elem` dummyAnchors of - True -> h - False -> Header n (ident, classes, kvs) (ils \\ (x:xs)) - _ -> h -makeHeaderAnchors blk = blk - - -parPartsToInlines :: ReaderOptions -> DocX -> [ParPart] -> [Inline] -parPartsToInlines opts docx parparts = - -- - -- We're going to skip data-uri's for now. It should be an option, - -- not mandatory. - -- - --bottomUp (makeImagesSelfContained docx) $ - bottomUp spanCorrect $ - bottomUp spanTrim $ - bottomUp spanReduce $ - concatMap (parPartToInlines opts docx) parparts - -cellToBlocks :: ReaderOptions -> DocX -> Cell -> [Block] -cellToBlocks opts docx (Cell bps) = map (bodyPartToBlock opts docx) bps - -rowToBlocksList :: ReaderOptions -> DocX -> Row -> [[Block]] -rowToBlocksList opts docx (Row cells) = map (cellToBlocks opts docx) cells - -bodyPartToBlock :: ReaderOptions -> DocX -> BodyPart -> Block -bodyPartToBlock opts docx (Paragraph pPr parparts) = - Div (parStyleToDivAttr pPr) [Para (parPartsToInlines opts docx parparts)] -bodyPartToBlock opts docx@(DocX _ _ numbering _ _) (ListItem pPr numId lvl parparts) = - let - kvs = case lookupLevel numId lvl numbering of - Just (_, fmt, txt, Just start) -> [ ("level", lvl) - , ("num-id", numId) - , ("format", fmt) - , ("text", txt) - , ("start", (show start)) - ] - - Just (_, fmt, txt, Nothing) -> [ ("level", lvl) - , ("num-id", numId) - , ("format", fmt) - , ("text", txt) - ] - Nothing -> [] - in - Div - ("", ["list-item"], kvs) - [bodyPartToBlock opts docx (Paragraph pPr parparts)] -bodyPartToBlock _ _ (Tbl _ _ _ []) = - Para [] -bodyPartToBlock opts docx (Tbl cap _ look (r:rs)) = - let caption = strToInlines cap - (hdr, rows) = case firstRowFormatting look of - True -> (Just r, rs) - False -> (Nothing, r:rs) - hdrCells = case hdr of - Just r' -> rowToBlocksList opts docx r' - Nothing -> [] - cells = map (rowToBlocksList opts docx) rows - - size = case null hdrCells of - True -> length $ head cells - False -> length $ hdrCells - -- - -- The two following variables (horizontal column alignment and - -- relative column widths) go to the default at the - -- moment. Width information is in the TblGrid field of the Tbl, - -- so should be possible. Alignment might be more difficult, - -- since there doesn't seem to be a column entity in docx. - alignments = take size (repeat AlignDefault) - widths = take size (repeat 0) :: [Double] - in - Table caption alignments widths hdrCells cells - -makeImagesSelfContained :: DocX -> Inline -> Inline -makeImagesSelfContained (DocX _ _ _ _ media) i@(Image alt (uri, title)) = - case lookup uri media of - Just bs -> case getMimeType uri of - Just mime -> let data_uri = - "data:" ++ mime ++ ";base64," ++ toString (encode $ BS.concat $ B.toChunks bs) - in - Image alt (data_uri, title) - Nothing -> i - Nothing -> i -makeImagesSelfContained _ inline = inline - -bodyToBlocks :: ReaderOptions -> DocX -> Body -> [Block] -bodyToBlocks opts docx (Body bps) = - bottomUp removeEmptyPars $ - bottomUp strNormalize $ - bottomUp spanRemove $ - bottomUp divRemove $ - map (makeHeaderAnchors) $ - bottomUp divCorrect $ - bottomUp divReduce $ - bottomUp divCorrectPreReduce $ - bottomUp blocksToDefinitions $ - blocksToBullets $ - map (bodyPartToBlock opts docx) bps - -docxToBlocks :: ReaderOptions -> DocX -> [Block] -docxToBlocks opts d@(DocX (Document _ body) _ _ _ _) = bodyToBlocks opts d body - -spanReduce :: [Inline] -> [Inline] -spanReduce [] = [] -spanReduce ((Span (id1, classes1, kvs1) ils1) : ils) - | (id1, classes1, kvs1) == ("", [], []) = ils1 ++ (spanReduce ils) -spanReduce (s1@(Span (id1, classes1, kvs1) ils1) : - s2@(Span (id2, classes2, kvs2) ils2) : - ils) = - let classes' = classes1 `intersect` classes2 - kvs' = kvs1 `intersect` kvs2 - classes1' = classes1 \\ classes' - kvs1' = kvs1 \\ kvs' - classes2' = classes2 \\ classes' - kvs2' = kvs2 \\ kvs' - in - case null classes' && null kvs' of - True -> s1 : (spanReduce (s2 : ils)) - False -> let attr' = ("", classes', kvs') - attr1' = (id1, classes1', kvs1') - attr2' = (id2, classes2', kvs2') - in - spanReduce (Span attr' [(Span attr1' ils1), (Span attr2' ils2)] : - ils) -spanReduce (il:ils) = il : (spanReduce ils) - -ilToCode :: Inline -> String -ilToCode (Str s) = s -ilToCode _ = "" - -spanRemove' :: Inline -> [Inline] -spanRemove' s@(Span (ident, classes, _) []) - -- "_GoBack" is automatically inserted. We don't want to keep it. - | classes == ["anchor"] && not (ident `elem` dummyAnchors) = [s] -spanRemove' (Span (_, _, kvs) ils) = - case lookup "underline" kvs of - Just val -> [Span ("", [], [("underline", val)]) ils] - Nothing -> ils -spanRemove' il = [il] - -spanRemove :: [Inline] -> [Inline] -spanRemove = concatMap spanRemove' - -spanTrim' :: Inline -> [Inline] -spanTrim' il@(Span _ []) = [il] -spanTrim' il@(Span attr (il':[])) - | il' == Space = [Span attr [], Space] - | otherwise = [il] -spanTrim' (Span attr ils) - | head ils == Space && last ils == Space = - [Space, Span attr (init $ tail ils), Space] - | head ils == Space = [Space, Span attr (tail ils)] - | last ils == Space = [Span attr (init ils), Space] -spanTrim' il = [il] - -spanTrim :: [Inline] -> [Inline] -spanTrim = concatMap spanTrim' - -spanCorrect' :: Inline -> [Inline] -spanCorrect' (Span ("", [], []) ils) = ils -spanCorrect' (Span (ident, classes, kvs) ils) - | "emph" `elem` classes = - [Emph $ spanCorrect' $ Span (ident, (delete "emph" classes), kvs) ils] - | "strong" `elem` classes = - [Strong $ spanCorrect' $ Span (ident, (delete "strong" classes), kvs) ils] - | "smallcaps" `elem` classes = - [SmallCaps $ spanCorrect' $ Span (ident, (delete "smallcaps" classes), kvs) ils] - | "strike" `elem` classes = - [Strikeout $ spanCorrect' $ Span (ident, (delete "strike" classes), kvs) ils] - | "superscript" `elem` classes = - [Superscript $ spanCorrect' $ Span (ident, (delete "superscript" classes), kvs) ils] - | "subscript" `elem` classes = - [Subscript $ spanCorrect' $ Span (ident, (delete "subscript" classes), kvs) ils] - | (not . null) (codeSpans `intersect` classes) = - [Code (ident, (classes \\ codeSpans), kvs) (init $ unlines $ map ilToCode ils)] - | otherwise = - [Span (ident, classes, kvs) ils] -spanCorrect' il = [il] - -spanCorrect :: [Inline] -> [Inline] -spanCorrect = concatMap spanCorrect' - -removeEmptyPars :: [Block] -> [Block] -removeEmptyPars blks = filter (\b -> b /= (Para [])) blks - -divReduce :: [Block] -> [Block] -divReduce [] = [] -divReduce ((Div (id1, classes1, kvs1) blks1) : blks) - | (id1, classes1, kvs1) == ("", [], []) = blks1 ++ (divReduce blks) -divReduce (d1@(Div (id1, classes1, kvs1) blks1) : - d2@(Div (id2, classes2, kvs2) blks2) : - blks) = - let classes' = classes1 `intersect` classes2 - kvs' = kvs1 `intersect` kvs2 - classes1' = classes1 \\ classes' - kvs1' = kvs1 \\ kvs' - classes2' = classes2 \\ classes' - kvs2' = kvs2 \\ kvs' - in - case null classes' && null kvs' of - True -> d1 : (divReduce (d2 : blks)) - False -> let attr' = ("", classes', kvs') - attr1' = (id1, classes1', kvs1') - attr2' = (id2, classes2', kvs2') - in - divReduce (Div attr' [(Div attr1' blks1), (Div attr2' blks2)] : - blks) -divReduce (blk:blks) = blk : (divReduce blks) - -isHeaderClass :: String -> Maybe Int -isHeaderClass s | "Heading" `isPrefixOf` s = - case reads (drop (length "Heading") s) :: [(Int, String)] of - [] -> Nothing - ((n, "") : []) -> Just n - _ -> Nothing -isHeaderClass _ = Nothing - -findHeaderClass :: [String] -> Maybe Int -findHeaderClass ss = case mapMaybe id $ map isHeaderClass ss of - [] -> Nothing - n : _ -> Just n - -blksToInlines :: [Block] -> [Inline] -blksToInlines (Para ils : _) = ils -blksToInlines (Plain ils : _) = ils -blksToInlines _ = [] - -divCorrectPreReduce' :: Block -> [Block] -divCorrectPreReduce' (Div (ident, classes, kvs) blks) - | isJust $ findHeaderClass classes = - let n = fromJust $ findHeaderClass classes - in - [Header n (ident, delete ("Heading" ++ (show n)) classes, kvs) (blksToInlines blks)] - | otherwise = [Div (ident, classes, kvs) blks] -divCorrectPreReduce' blk = [blk] - -divCorrectPreReduce :: [Block] -> [Block] -divCorrectPreReduce = concatMap divCorrectPreReduce' - -blkToCode :: Block -> String -blkToCode (Para []) = "" -blkToCode (Para ((Code _ s):ils)) = s ++ (blkToCode (Para ils)) -blkToCode (Para ((Span (_, classes, _) ils'): ils)) - | (not . null) (codeSpans `intersect` classes) = - (init $ unlines $ map ilToCode ils') ++ (blkToCode (Para ils)) -blkToCode _ = "" - -divRemove' :: Block -> [Block] -divRemove' (Div (_, _, kvs) blks) = - case lookup "indent" kvs of - Just val -> [Div ("", [], [("indent", val)]) blks] - Nothing -> blks -divRemove' blk = [blk] - -divRemove :: [Block] -> [Block] -divRemove = concatMap divRemove' - -divCorrect' :: Block -> [Block] -divCorrect' b@(Div (ident, classes, kvs) blks) - | (not . null) (blockQuoteDivs `intersect` classes) = - [BlockQuote [Div (ident, classes \\ blockQuoteDivs, kvs) blks]] - | (not . null) (codeDivs `intersect` classes) = - [CodeBlock (ident, (classes \\ codeDivs), kvs) (init $ unlines $ map blkToCode blks)] - | otherwise = - case lookup "indent" kvs of - Just "0" -> [Div (ident, classes, filter (\kv -> fst kv /= "indent") kvs) blks] - Just _ -> - [BlockQuote [Div (ident, classes, filter (\kv -> fst kv /= "indent") kvs) blks]] - Nothing -> [b] -divCorrect' blk = [blk] - -divCorrect :: [Block] -> [Block] -divCorrect = concatMap divCorrect' diff --git a/src/Text/Pandoc/Readers/DocX/Lists.hs b/src/Text/Pandoc/Readers/DocX/Lists.hs deleted file mode 100644 index b20679261..000000000 --- a/src/Text/Pandoc/Readers/DocX/Lists.hs +++ /dev/null @@ -1,208 +0,0 @@ -{- -Copyright (C) 2014 Jesse Rosenthal - -This program is free software; you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation; either version 2 of the License, or -(at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA --} - -{- | - Module : Text.Pandoc.Readers.DocX.Lists - Copyright : Copyright (C) 2014 Jesse Rosenthal - License : GNU GPL, version 2 or above - - Maintainer : Jesse Rosenthal - Stability : alpha - Portability : portable - -Functions for converting flat DocX paragraphs into nested lists. --} - -module Text.Pandoc.Readers.DocX.Lists ( blocksToBullets - , blocksToDefinitions) where - -import Text.Pandoc.JSON -import Text.Pandoc.Shared (trim) -import Control.Monad -import Data.List -import Data.Maybe - -isListItem :: Block -> Bool -isListItem (Div (_, classes, _) _) | "list-item" `elem` classes = True -isListItem _ = False - -getLevel :: Block -> Maybe Integer -getLevel (Div (_, _, kvs) _) = liftM read $ lookup "level" kvs -getLevel _ = Nothing - -getLevelN :: Block -> Integer -getLevelN b = case getLevel b of - Just n -> n - Nothing -> -1 - -getNumId :: Block -> Maybe Integer -getNumId (Div (_, _, kvs) _) = liftM read $ lookup "num-id" kvs -getNumId _ = Nothing - -getNumIdN :: Block -> Integer -getNumIdN b = case getNumId b of - Just n -> n - Nothing -> -1 - -getText :: Block -> Maybe String -getText (Div (_, _, kvs) _) = lookup "text" kvs -getText _ = Nothing - -data ListType = Itemized | Enumerated ListAttributes - -listStyleMap :: [(String, ListNumberStyle)] -listStyleMap = [("upperLetter", UpperAlpha), - ("lowerLetter", LowerAlpha), - ("upperRoman", UpperRoman), - ("lowerRoman", LowerRoman), - ("decimal", Decimal)] - -listDelimMap :: [(String, ListNumberDelim)] -listDelimMap = [("%1)", OneParen), - ("(%1)", TwoParens), - ("%1.", Period)] - -getListType :: Block -> Maybe ListType -getListType b@(Div (_, _, kvs) _) | isListItem b = - let - start = lookup "start" kvs - frmt = lookup "format" kvs - txt = lookup "text" kvs - in - case frmt of - Just "bullet" -> Just Itemized - Just f -> - case txt of - Just t -> Just $ Enumerated ( - read (fromMaybe "1" start) :: Int, - fromMaybe DefaultStyle (lookup f listStyleMap), - fromMaybe DefaultDelim (lookup t listDelimMap)) - Nothing -> Nothing - _ -> Nothing -getListType _ = Nothing - -listParagraphDivs :: [String] -listParagraphDivs = ["ListParagraph"] - --- This is a first stab at going through and attaching meaning to list --- paragraphs, without an item marker, following a list item. We --- assume that these are paragraphs in the same item. - -handleListParagraphs :: [Block] -> [Block] -handleListParagraphs [] = [] -handleListParagraphs ( - (Div attr1@(_, classes1, _) blks1) : - (Div (ident2, classes2, kvs2) blks2) : - blks - ) | "list-item" `elem` classes1 && - not ("list-item" `elem` classes2) && - (not . null) (listParagraphDivs `intersect` classes2) = - -- We don't want to keep this indent. - let newDiv2 = - (Div (ident2, classes2, filter (\kv -> fst kv /= "indent") kvs2) blks2) - in - handleListParagraphs ((Div attr1 (blks1 ++ [newDiv2])) : blks) -handleListParagraphs (blk:blks) = blk : (handleListParagraphs blks) - -separateBlocks' :: Block -> [[Block]] -> [[Block]] -separateBlocks' blk ([] : []) = [[blk]] -separateBlocks' b@(BulletList _) acc = (init acc) ++ [(last acc) ++ [b]] -separateBlocks' b@(OrderedList _ _) acc = (init acc) ++ [(last acc) ++ [b]] --- The following is for the invisible bullet lists. This is how --- pandoc-generated ooxml does multiparagraph item lists. -separateBlocks' b acc | liftM trim (getText b) == Just "" = - (init acc) ++ [(last acc) ++ [b]] -separateBlocks' b acc = acc ++ [[b]] - -separateBlocks :: [Block] -> [[Block]] -separateBlocks blks = foldr separateBlocks' [[]] (reverse blks) - -flatToBullets' :: Integer -> [Block] -> [Block] -flatToBullets' _ [] = [] -flatToBullets' num xs@(b : elems) - | getLevelN b == num = b : (flatToBullets' num elems) - | otherwise = - let bNumId = getNumIdN b - bLevel = getLevelN b - (children, remaining) = - span - (\b' -> - ((getLevelN b') > bLevel || - ((getLevelN b') == bLevel && (getNumIdN b') == bNumId))) - xs - in - case getListType b of - Just (Enumerated attr) -> - (OrderedList attr (separateBlocks $ flatToBullets' bLevel children)) : - (flatToBullets' num remaining) - _ -> - (BulletList (separateBlocks $ flatToBullets' bLevel children)) : - (flatToBullets' num remaining) - -flatToBullets :: [Block] -> [Block] -flatToBullets elems = flatToBullets' (-1) elems - -blocksToBullets :: [Block] -> [Block] -blocksToBullets blks = - -- bottomUp removeListItemDivs $ - flatToBullets $ (handleListParagraphs blks) - - -plainParaInlines :: Block -> [Inline] -plainParaInlines (Plain ils) = ils -plainParaInlines (Para ils) = ils -plainParaInlines _ = [] - -blocksToDefinitions' :: [([Inline], [[Block]])] -> [Block] -> [Block] -> [Block] -blocksToDefinitions' [] acc [] = reverse acc -blocksToDefinitions' defAcc acc [] = - reverse $ (DefinitionList (reverse defAcc)) : acc -blocksToDefinitions' defAcc acc - ((Div (_, classes1, _) blks1) : (Div (ident2, classes2, kvs2) blks2) : blks) - | "DefinitionTerm" `elem` classes1 && "Definition" `elem` classes2 = - let remainingAttr2 = (ident2, delete "Definition" classes2, kvs2) - pair = case remainingAttr2 == ("", [], []) of - True -> (concatMap plainParaInlines blks1, [blks2]) - False -> (concatMap plainParaInlines blks1, [[Div remainingAttr2 blks2]]) - in - blocksToDefinitions' (pair : defAcc) acc blks -blocksToDefinitions' defAcc acc - ((Div (ident2, classes2, kvs2) blks2) : blks) - | (not . null) defAcc && "Definition" `elem` classes2 = - let remainingAttr2 = (ident2, delete "Definition" classes2, kvs2) - defItems2 = case remainingAttr2 == ("", [], []) of - True -> blks2 - False -> [Div remainingAttr2 blks2] - ((defTerm, defItems):defs) = defAcc - defAcc' = case null defItems of - True -> (defTerm, [defItems2]) : defs - False -> (defTerm, init defItems ++ [last defItems ++ defItems2]) : defs - in - blocksToDefinitions' defAcc' acc blks -blocksToDefinitions' [] acc (b:blks) = - blocksToDefinitions' [] (b:acc) blks -blocksToDefinitions' defAcc acc (b:blks) = - blocksToDefinitions' [] (b : (DefinitionList (reverse defAcc)) : acc) blks - - -blocksToDefinitions :: [Block] -> [Block] -blocksToDefinitions = blocksToDefinitions' [] [] - - - - diff --git a/src/Text/Pandoc/Readers/DocX/Parse.hs b/src/Text/Pandoc/Readers/DocX/Parse.hs deleted file mode 100644 index d7033d9e8..000000000 --- a/src/Text/Pandoc/Readers/DocX/Parse.hs +++ /dev/null @@ -1,604 +0,0 @@ -{- -Copyright (C) 2014 Jesse Rosenthal - -This program is free software; you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation; either version 2 of the License, or -(at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA --} - -{- | - Module : Text.Pandoc.Readers.DocX.Parse - Copyright : Copyright (C) 2014 Jesse Rosenthal - License : GNU GPL, version 2 or above - - Maintainer : Jesse Rosenthal - Stability : alpha - Portability : portable - -Conversion of DocX archive into DocX haskell type --} - - -module Text.Pandoc.Readers.DocX.Parse ( DocX(..) - , Document(..) - , Body(..) - , BodyPart(..) - , TblLook(..) - , ParPart(..) - , Run(..) - , RunElem(..) - , Notes - , Numbering - , Relationship - , Media - , RunStyle(..) - , ParagraphStyle(..) - , Row(..) - , Cell(..) - , getFootNote - , getEndNote - , lookupLevel - , lookupRelationship - , archiveToDocX - ) where -import Codec.Archive.Zip -import Text.XML.Light -import Data.Maybe -import Data.List -import System.FilePath -import Data.Bits ((.|.)) -import qualified Data.ByteString.Lazy as B -import qualified Text.Pandoc.UTF8 as UTF8 - -attrToNSPair :: Attr -> Maybe (String, String) -attrToNSPair (Attr (QName s _ (Just "xmlns")) val) = Just (s, val) -attrToNSPair _ = Nothing - - -type NameSpaces = [(String, String)] - -data DocX = DocX Document Notes Numbering [Relationship] Media - deriving Show - -archiveToDocX :: Archive -> Maybe DocX -archiveToDocX archive = do - let notes = archiveToNotes archive - rels = archiveToRelationships archive - media = archiveToMedia archive - doc <- archiveToDocument archive - numbering <- archiveToNumbering archive - return $ DocX doc notes numbering rels media - -data Document = Document NameSpaces Body - deriving Show - -archiveToDocument :: Archive -> Maybe Document -archiveToDocument zf = do - entry <- findEntryByPath "word/document.xml" zf - docElem <- (parseXMLDoc . UTF8.toStringLazy . fromEntry) entry - let namespaces = mapMaybe attrToNSPair (elAttribs docElem) - bodyElem <- findChild (QName "body" (lookup "w" namespaces) Nothing) docElem - body <- elemToBody namespaces bodyElem - return $ Document namespaces body - -type Media = [(FilePath, B.ByteString)] - -filePathIsMedia :: FilePath -> Bool -filePathIsMedia fp = - let (dir, _) = splitFileName fp - in - (dir == "word/media/") - -getMediaPair :: Archive -> FilePath -> Maybe (FilePath, B.ByteString) -getMediaPair zf fp = - case findEntryByPath fp zf of - Just e -> Just (fp, fromEntry e) - Nothing -> Nothing - -archiveToMedia :: Archive -> Media -archiveToMedia zf = - mapMaybe (getMediaPair zf) (filter filePathIsMedia (filesInArchive zf)) - -data Numbering = Numbering NameSpaces [Numb] [AbstractNumb] - deriving Show - -data Numb = Numb String String -- right now, only a key to an abstract num - deriving Show - -data AbstractNumb = AbstractNumb String [Level] - deriving Show - --- (ilvl, format, string, start) -type Level = (String, String, String, Maybe Integer) - -lookupLevel :: String -> String -> Numbering -> Maybe Level -lookupLevel numId ilvl (Numbering _ numbs absNumbs) = do - absNumId <- lookup numId $ map (\(Numb nid absnumid) -> (nid, absnumid)) numbs - lvls <- lookup absNumId $ map (\(AbstractNumb aid ls) -> (aid, ls)) absNumbs - lvl <- lookup ilvl $ map (\l@(i, _, _, _) -> (i, l)) lvls - return lvl - -numElemToNum :: NameSpaces -> Element -> Maybe Numb -numElemToNum ns element | - qName (elName element) == "num" && - qURI (elName element) == (lookup "w" ns) = do - numId <- findAttr (QName "numId" (lookup "w" ns) (Just "w")) element - absNumId <- findChild (QName "abstractNumId" (lookup "w" ns) (Just "w")) element - >>= findAttr (QName "val" (lookup "w" ns) (Just "w")) - return $ Numb numId absNumId -numElemToNum _ _ = Nothing - -absNumElemToAbsNum :: NameSpaces -> Element -> Maybe AbstractNumb -absNumElemToAbsNum ns element | - qName (elName element) == "abstractNum" && - qURI (elName element) == (lookup "w" ns) = do - absNumId <- findAttr - (QName "abstractNumId" (lookup "w" ns) (Just "w")) - element - let levelElems = findChildren - (QName "lvl" (lookup "w" ns) (Just "w")) - element - levels = mapMaybe id $ map (levelElemToLevel ns) levelElems - return $ AbstractNumb absNumId levels -absNumElemToAbsNum _ _ = Nothing - -levelElemToLevel :: NameSpaces -> Element -> Maybe Level -levelElemToLevel ns element | - qName (elName element) == "lvl" && - qURI (elName element) == (lookup "w" ns) = do - ilvl <- findAttr (QName "ilvl" (lookup "w" ns) (Just "w")) element - fmt <- findChild (QName "numFmt" (lookup "w" ns) (Just "w")) element - >>= findAttr (QName "val" (lookup "w" ns) (Just "w")) - txt <- findChild (QName "lvlText" (lookup "w" ns) (Just "w")) element - >>= findAttr (QName "val" (lookup "w" ns) (Just "w")) - let start = findChild (QName "start" (lookup "w" ns) (Just "w")) element - >>= findAttr (QName "val" (lookup "w" ns) (Just "w")) - >>= (\s -> listToMaybe (map fst (reads s :: [(Integer, String)]))) - return (ilvl, fmt, txt, start) -levelElemToLevel _ _ = Nothing - -archiveToNumbering :: Archive -> Maybe Numbering -archiveToNumbering zf = - case findEntryByPath "word/numbering.xml" zf of - Nothing -> Just $ Numbering [] [] [] - Just entry -> do - numberingElem <- (parseXMLDoc . UTF8.toStringLazy . fromEntry) entry - let namespaces = mapMaybe attrToNSPair (elAttribs numberingElem) - numElems = findChildren - (QName "num" (lookup "w" namespaces) (Just "w")) - numberingElem - absNumElems = findChildren - (QName "abstractNum" (lookup "w" namespaces) (Just "w")) - numberingElem - nums = mapMaybe id $ map (numElemToNum namespaces) numElems - absNums = mapMaybe id $ map (absNumElemToAbsNum namespaces) absNumElems - return $ Numbering namespaces nums absNums - -data Notes = Notes NameSpaces (Maybe [(String, [BodyPart])]) (Maybe [(String, [BodyPart])]) - deriving Show - -noteElemToNote :: NameSpaces -> Element -> Maybe (String, [BodyPart]) -noteElemToNote ns element - | qName (elName element) `elem` ["endnote", "footnote"] && - qURI (elName element) == (lookup "w" ns) = - do - noteId <- findAttr (QName "id" (lookup "w" ns) (Just "w")) element - let bps = map fromJust - $ filter isJust - $ map (elemToBodyPart ns) - $ filterChildrenName (isParOrTbl ns) element - return $ (noteId, bps) -noteElemToNote _ _ = Nothing - -getFootNote :: String -> Notes -> Maybe [BodyPart] -getFootNote s (Notes _ fns _) = fns >>= (lookup s) - -getEndNote :: String -> Notes -> Maybe [BodyPart] -getEndNote s (Notes _ _ ens) = ens >>= (lookup s) - -elemToNotes :: NameSpaces -> String -> Element -> Maybe [(String, [BodyPart])] -elemToNotes ns notetype element - | qName (elName element) == (notetype ++ "s") && - qURI (elName element) == (lookup "w" ns) = - Just $ map fromJust - $ filter isJust - $ map (noteElemToNote ns) - $ findChildren (QName notetype (lookup "w" ns) (Just "w")) element -elemToNotes _ _ _ = Nothing - -archiveToNotes :: Archive -> Notes -archiveToNotes zf = - let fnElem = findEntryByPath "word/footnotes.xml" zf - >>= (parseXMLDoc . UTF8.toStringLazy . fromEntry) - enElem = findEntryByPath "word/endnotes.xml" zf - >>= (parseXMLDoc . UTF8.toStringLazy . fromEntry) - fn_namespaces = case fnElem of - Just e -> mapMaybe attrToNSPair (elAttribs e) - Nothing -> [] - en_namespaces = case enElem of - Just e -> mapMaybe attrToNSPair (elAttribs e) - Nothing -> [] - ns = unionBy (\x y -> fst x == fst y) fn_namespaces en_namespaces - fn = fnElem >>= (elemToNotes ns "footnote") - en = enElem >>= (elemToNotes ns "endnote") - in - Notes ns fn en - - -data Relationship = Relationship (RelId, Target) - deriving Show - -lookupRelationship :: RelId -> [Relationship] -> Maybe Target -lookupRelationship relid rels = - lookup relid (map (\(Relationship pair) -> pair) rels) - -filePathIsRel :: FilePath -> Bool -filePathIsRel fp = - let (dir, name) = splitFileName fp - in - (dir == "word/_rels/") && ((takeExtension name) == ".rels") - -relElemToRelationship :: Element -> Maybe Relationship -relElemToRelationship element | qName (elName element) == "Relationship" = - do - relId <- findAttr (QName "Id" Nothing Nothing) element - target <- findAttr (QName "Target" Nothing Nothing) element - return $ Relationship (relId, target) -relElemToRelationship _ = Nothing - - -archiveToRelationships :: Archive -> [Relationship] -archiveToRelationships archive = - let relPaths = filter filePathIsRel (filesInArchive archive) - entries = map fromJust $ filter isJust $ map (\f -> findEntryByPath f archive) relPaths - relElems = map fromJust $ filter isJust $ map (parseXMLDoc . UTF8.toStringLazy . fromEntry) entries - rels = map fromJust $ filter isJust $ map relElemToRelationship $ concatMap elChildren relElems - in - rels - -data Body = Body [BodyPart] - deriving Show - -isParOrTbl :: NameSpaces -> QName -> Bool -isParOrTbl ns q = qName q `elem` ["p", "tbl"] && - qURI q == (lookup "w" ns) - -elemToBody :: NameSpaces -> Element -> Maybe Body -elemToBody ns element | qName (elName element) == "body" && qURI (elName element) == (lookup "w" ns) = - Just $ Body - $ map fromJust - $ filter isJust - $ map (elemToBodyPart ns) $ filterChildrenName (isParOrTbl ns) element -elemToBody _ _ = Nothing - -isRunOrLinkOrBookmark :: NameSpaces -> QName -> Bool -isRunOrLinkOrBookmark ns q = qName q `elem` ["r", "hyperlink", "bookmarkStart"] && - qURI q == (lookup "w" ns) - -elemToNumInfo :: NameSpaces -> Element -> Maybe (String, String) -elemToNumInfo ns element - | qName (elName element) == "p" && - qURI (elName element) == (lookup "w" ns) = - do - pPr <- findChild (QName "pPr" (lookup "w" ns) (Just "w")) element - numPr <- findChild (QName "numPr" (lookup "w" ns) (Just "w")) pPr - lvl <- findChild (QName "ilvl" (lookup "w" ns) (Just "w")) numPr >>= - findAttr (QName "val" (lookup "w" ns) (Just "w")) - numId <- findChild (QName "numId" (lookup "w" ns) (Just "w")) numPr >>= - findAttr (QName "val" (lookup "w" ns) (Just "w")) - return (numId, lvl) -elemToNumInfo _ _ = Nothing - --- isBookMarkTag :: NameSpaces -> QName -> Bool --- isBookMarkTag ns q = qName q `elem` ["bookmarkStart", "bookmarkEnd"] && --- qURI q == (lookup "w" ns) - --- parChildrenToBookmark :: NameSpaces -> [Element] -> BookMark --- parChildrenToBookmark ns (bms : bme : _) --- | qName (elName bms) == "bookmarkStart" && --- qURI (elName bms) == (lookup "w" ns) && --- qName (elName bme) == "bookmarkEnd" && --- qURI (elName bme) == (lookup "w" ns) = do --- bmId <- findAttr (QName "id" (lookup "w" ns) (Just "w")) bms --- bmName <- findAttr (QName "name" (lookup "w" ns) (Just "w")) bms --- return $ (bmId, bmName) --- parChildrenToBookmark _ _ = Nothing - -elemToBodyPart :: NameSpaces -> Element -> Maybe BodyPart -elemToBodyPart ns element - | qName (elName element) == "p" && - qURI (elName element) == (lookup "w" ns) = - let parstyle = elemToParagraphStyle ns element - parparts = mapMaybe id - $ map (elemToParPart ns) - $ filterChildrenName (isRunOrLinkOrBookmark ns) element - in - case elemToNumInfo ns element of - Just (numId, lvl) -> Just $ ListItem parstyle numId lvl parparts - Nothing -> Just $ Paragraph parstyle parparts - | qName (elName element) == "tbl" && - qURI (elName element) == (lookup "w" ns) = - let - caption = findChild (QName "tblPr" (lookup "w" ns) (Just "w")) element - >>= findChild (QName "tblCaption" (lookup "w" ns) (Just "w")) - >>= findAttr (QName "val" (lookup "w" ns) (Just "w")) - grid = case - findChild (QName "tblGrid" (lookup "w" ns) (Just "w")) element - of - Just g -> elemToTblGrid ns g - Nothing -> [] - tblLook = findChild (QName "tblPr" (lookup "w" ns) (Just "w")) element - >>= findChild (QName "tblLook" (lookup "w" ns) (Just "w")) - >>= elemToTblLook ns - in - Just $ Tbl - (fromMaybe "" caption) - grid - (fromMaybe defaultTblLook tblLook) - (mapMaybe (elemToRow ns) (elChildren element)) - | otherwise = Nothing - -elemToTblLook :: NameSpaces -> Element -> Maybe TblLook -elemToTblLook ns element - | qName (elName element) == "tblLook" && - qURI (elName element) == (lookup "w" ns) = - let firstRow = findAttr (QName "firstRow" (lookup "w" ns) (Just "w")) element - val = findAttr (QName "val" (lookup "w" ns) (Just "w")) element - firstRowFmt = - case firstRow of - Just "1" -> True - Just _ -> False - Nothing -> case val of - Just bitMask -> testBitMask bitMask 0x020 - Nothing -> False - in - Just $ TblLook{firstRowFormatting = firstRowFmt} -elemToTblLook _ _ = Nothing - -testBitMask :: String -> Int -> Bool -testBitMask bitMaskS n = - case (reads ("0x" ++ bitMaskS) :: [(Int, String)]) of - [] -> False - ((n', _) : _) -> ((n' .|. n) /= 0) - -data ParagraphStyle = ParagraphStyle { pStyle :: [String] - , indent :: Maybe Integer - } - deriving Show - -defaultParagraphStyle :: ParagraphStyle -defaultParagraphStyle = ParagraphStyle { pStyle = [] - , indent = Nothing - } - -elemToParagraphStyle :: NameSpaces -> Element -> ParagraphStyle -elemToParagraphStyle ns element = - case findChild (QName "pPr" (lookup "w" ns) (Just "w")) element of - Just pPr -> - ParagraphStyle - {pStyle = - mapMaybe id $ - map - (findAttr (QName "val" (lookup "w" ns) (Just "w"))) - (findChildren (QName "pStyle" (lookup "w" ns) (Just "w")) pPr) - , indent = - findChild (QName "ind" (lookup "w" ns) (Just "w")) pPr >>= - findAttr (QName "left" (lookup "w" ns) (Just "w")) >>= - stringToInteger - } - Nothing -> defaultParagraphStyle - - -data BodyPart = Paragraph ParagraphStyle [ParPart] - | ListItem ParagraphStyle String String [ParPart] - | Tbl String TblGrid TblLook [Row] - - deriving Show - -type TblGrid = [Integer] - -data TblLook = TblLook {firstRowFormatting::Bool} - deriving Show - -defaultTblLook :: TblLook -defaultTblLook = TblLook{firstRowFormatting = False} - -stringToInteger :: String -> Maybe Integer -stringToInteger s = listToMaybe $ map fst (reads s :: [(Integer, String)]) - -elemToTblGrid :: NameSpaces -> Element -> TblGrid -elemToTblGrid ns element - | qName (elName element) == "tblGrid" && - qURI (elName element) == (lookup "w" ns) = - let - cols = findChildren (QName "gridCol" (lookup "w" ns) (Just "w")) element - in - mapMaybe (\e -> - findAttr (QName "val" (lookup "w" ns) (Just ("w"))) e - >>= stringToInteger - ) - cols -elemToTblGrid _ _ = [] - -data Row = Row [Cell] - deriving Show - - -elemToRow :: NameSpaces -> Element -> Maybe Row -elemToRow ns element - | qName (elName element) == "tr" && - qURI (elName element) == (lookup "w" ns) = - let - cells = findChildren (QName "tc" (lookup "w" ns) (Just "w")) element - in - Just $ Row (mapMaybe (elemToCell ns) cells) -elemToRow _ _ = Nothing - -data Cell = Cell [BodyPart] - deriving Show - -elemToCell :: NameSpaces -> Element -> Maybe Cell -elemToCell ns element - | qName (elName element) == "tc" && - qURI (elName element) == (lookup "w" ns) = - Just $ Cell (mapMaybe (elemToBodyPart ns) (elChildren element)) -elemToCell _ _ = Nothing - -data ParPart = PlainRun Run - | BookMark BookMarkId Anchor - | InternalHyperLink Anchor [Run] - | ExternalHyperLink RelId [Run] - | Drawing String - deriving Show - -data Run = Run RunStyle [RunElem] - | Footnote String - | Endnote String - deriving Show - -data RunElem = TextRun String | LnBrk - deriving Show - -data RunStyle = RunStyle { isBold :: Bool - , isItalic :: Bool - , isSmallCaps :: Bool - , isStrike :: Bool - , isSuperScript :: Bool - , isSubScript :: Bool - , underline :: Maybe String - , rStyle :: Maybe String } - deriving Show - -defaultRunStyle :: RunStyle -defaultRunStyle = RunStyle { isBold = False - , isItalic = False - , isSmallCaps = False - , isStrike = False - , isSuperScript = False - , isSubScript = False - , underline = Nothing - , rStyle = Nothing - } - -elemToRunStyle :: NameSpaces -> Element -> RunStyle -elemToRunStyle ns element = - case findChild (QName "rPr" (lookup "w" ns) (Just "w")) element of - Just rPr -> - RunStyle - { - isBold = isJust $ findChild (QName "b" (lookup "w" ns) (Just "w")) rPr - , isItalic = isJust $ findChild (QName "i" (lookup "w" ns) (Just "w")) rPr - , isSmallCaps = isJust $ findChild (QName "smallCaps" (lookup "w" ns) (Just "w")) rPr - , isStrike = isJust $ findChild (QName "strike" (lookup "w" ns) (Just "w")) rPr - , isSuperScript = - (Just "superscript" == - (findChild (QName "vertAlign" (lookup "w" ns) (Just "w")) rPr >>= - findAttr (QName "val" (lookup "w" ns) (Just "w")))) - , isSubScript = - (Just "subscript" == - (findChild (QName "vertAlign" (lookup "w" ns) (Just "w")) rPr >>= - findAttr (QName "val" (lookup "w" ns) (Just "w")))) - , underline = - findChild (QName "u" (lookup "w" ns) (Just "w")) rPr >>= - findAttr (QName "val" (lookup "w" ns) (Just "w")) - , rStyle = - findChild (QName "rStyle" (lookup "w" ns) (Just "w")) rPr >>= - findAttr (QName "val" (lookup "w" ns) (Just "w")) - } - Nothing -> defaultRunStyle - -elemToRun :: NameSpaces -> Element -> Maybe Run -elemToRun ns element - | qName (elName element) == "r" && - qURI (elName element) == (lookup "w" ns) = - case - findChild (QName "footnoteReference" (lookup "w" ns) (Just "w")) element >>= - findAttr (QName "id" (lookup "w" ns) (Just "w")) - of - Just s -> Just $ Footnote s - Nothing -> - case - findChild (QName "endnoteReference" (lookup "w" ns) (Just "w")) element >>= - findAttr (QName "id" (lookup "w" ns) (Just "w")) - of - Just s -> Just $ Endnote s - Nothing -> Just $ - Run (elemToRunStyle ns element) - (elemToRunElems ns element) -elemToRun _ _ = Nothing - -elemToRunElem :: NameSpaces -> Element -> Maybe RunElem -elemToRunElem ns element - | qName (elName element) == "t" && - qURI (elName element) == (lookup "w" ns) = - Just $ TextRun (strContent element) - | qName (elName element) == "br" && - qURI (elName element) == (lookup "w" ns) = - Just $ LnBrk - | otherwise = Nothing - - -elemToRunElems :: NameSpaces -> Element -> [RunElem] -elemToRunElems ns element - | qName (elName element) == "r" && - qURI (elName element) == (lookup "w" ns) = - mapMaybe (elemToRunElem ns) (elChildren element) - | otherwise = [] - -elemToDrawing :: NameSpaces -> Element -> Maybe ParPart -elemToDrawing ns element - | qName (elName element) == "drawing" && - qURI (elName element) == (lookup "w" ns) = - let a_ns = "http://schemas.openxmlformats.org/drawingml/2006/main" - in - findElement (QName "blip" (Just a_ns) (Just "a")) element - >>= findAttr (QName "embed" (lookup "r" ns) (Just "r")) - >>= (\s -> Just $ Drawing s) -elemToDrawing _ _ = Nothing - - -elemToParPart :: NameSpaces -> Element -> Maybe ParPart -elemToParPart ns element - | qName (elName element) == "r" && - qURI (elName element) == (lookup "w" ns) = - case findChild (QName "drawing" (lookup "w" ns) (Just "w")) element of - Just drawingElem -> elemToDrawing ns drawingElem - Nothing -> do - r <- elemToRun ns element - return $ PlainRun r -elemToParPart ns element - | qName (elName element) == "bookmarkStart" && - qURI (elName element) == (lookup "w" ns) = do - bmId <- findAttr (QName "id" (lookup "w" ns) (Just "w")) element - bmName <- findAttr (QName "name" (lookup "w" ns) (Just "w")) element - return $ BookMark bmId bmName -elemToParPart ns element - | qName (elName element) == "hyperlink" && - qURI (elName element) == (lookup "w" ns) = - let runs = map fromJust $ filter isJust $ map (elemToRun ns) - $ findChildren (QName "r" (lookup "w" ns) (Just "w")) element - in - case findAttr (QName "anchor" (lookup "w" ns) (Just "w")) element of - Just anchor -> - Just $ InternalHyperLink anchor runs - Nothing -> - case findAttr (QName "id" (lookup "r" ns) (Just "r")) element of - Just relId -> Just $ ExternalHyperLink relId runs - Nothing -> Nothing -elemToParPart _ _ = Nothing - -type Target = String -type Anchor = String -type BookMarkId = String -type RelId = String - diff --git a/src/Text/Pandoc/Readers/Docx.hs b/src/Text/Pandoc/Readers/Docx.hs new file mode 100644 index 000000000..df4be41ff --- /dev/null +++ b/src/Text/Pandoc/Readers/Docx.hs @@ -0,0 +1,479 @@ +{- +Copyright (C) 2014 Jesse Rosenthal + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Readers.Docx + Copyright : Copyright (C) 2014 Jesse Rosenthal + License : GNU GPL, version 2 or above + + Maintainer : Jesse Rosenthal + Stability : alpha + Portability : portable + +Conversion of Docx type (defined in Text.Pandoc.Readers.Docx.Parse) +to 'Pandoc' document. -} + +{- +Current state of implementation of Docx entities ([x] means +implemented, [-] means partially implemented): + +* Blocks + + - [X] Para + - [X] CodeBlock (styled with `SourceCode`) + - [X] BlockQuote (styled with `Quote`, `BlockQuote`, or, optionally, + indented) + - [X] OrderedList + - [X] BulletList + - [X] DefinitionList (styled with adjacent `DefinitionTerm` and `Definition`) + - [X] Header (styled with `Heading#`) + - [ ] HorizontalRule + - [-] Table (column widths and alignments not yet implemented) + +* Inlines + + - [X] Str + - [X] Emph (From italics. `underline` currently read as span. In + future, it might optionally be emph as well) + - [X] Strong + - [X] Strikeout + - [X] Superscript + - [X] Subscript + - [X] SmallCaps + - [ ] Quoted + - [ ] Cite + - [X] Code (styled with `VerbatimChar`) + - [X] Space + - [X] LineBreak (these are invisible in Word: entered with Shift-Return) + - [ ] Math + - [X] Link (links to an arbitrary bookmark create a span with the target as + id and "anchor" class) + - [-] Image (Links to path in archive. Future option for + data-encoded URI likely.) + - [X] Note (Footnotes and Endnotes are silently combined.) +-} + +module Text.Pandoc.Readers.Docx + ( readDocx + ) where + +import Codec.Archive.Zip +import Text.Pandoc.Definition +import Text.Pandoc.Options +import Text.Pandoc.Builder (text, toList) +import Text.Pandoc.Generic (bottomUp) +import Text.Pandoc.MIME (getMimeType) +import Text.Pandoc.UTF8 (toString) +import Text.Pandoc.Readers.Docx.Parse +import Text.Pandoc.Readers.Docx.Lists +import Data.Maybe (mapMaybe, isJust, fromJust) +import Data.List (delete, isPrefixOf, (\\), intersect) +import qualified Data.ByteString as BS +import qualified Data.ByteString.Lazy as B +import Data.ByteString.Base64 (encode) +import System.FilePath (combine) + +readDocx :: ReaderOptions + -> B.ByteString + -> Pandoc +readDocx opts bytes = + case archiveToDocx (toArchive bytes) of + Just docx -> Pandoc nullMeta (docxToBlocks opts docx) + Nothing -> error $ "couldn't parse docx file" + +runStyleToSpanAttr :: RunStyle -> (String, [String], [(String, String)]) +runStyleToSpanAttr rPr = ("", + mapMaybe id [ + if isBold rPr then (Just "strong") else Nothing, + if isItalic rPr then (Just "emph") else Nothing, + if isSmallCaps rPr then (Just "smallcaps") else Nothing, + if isStrike rPr then (Just "strike") else Nothing, + if isSuperScript rPr then (Just "superscript") else Nothing, + if isSubScript rPr then (Just "subscript") else Nothing, + rStyle rPr], + case underline rPr of + Just fmt -> [("underline", fmt)] + _ -> [] + ) + +parStyleToDivAttr :: ParagraphStyle -> (String, [String], [(String, String)]) +parStyleToDivAttr pPr = ("", + pStyle pPr, + case indent pPr of + Just n -> [("indent", (show n))] + Nothing -> [] + ) + +strToInlines :: String -> [Inline] +strToInlines = toList . text + +codeSpans :: [String] +codeSpans = ["VerbatimChar"] + +blockQuoteDivs :: [String] +blockQuoteDivs = ["Quote", "BlockQuote"] + +codeDivs :: [String] +codeDivs = ["SourceCode"] + +runElemToInlines :: RunElem -> [Inline] +runElemToInlines (TextRun s) = strToInlines s +runElemToInlines (LnBrk) = [LineBreak] + +runElemToString :: RunElem -> String +runElemToString (TextRun s) = s +runElemToString (LnBrk) = ['\n'] + +runElemsToString :: [RunElem] -> String +runElemsToString = concatMap runElemToString + +strNormalize :: [Inline] -> [Inline] +strNormalize [] = [] +strNormalize (Str "" : ils) = strNormalize ils +strNormalize ((Str s) : (Str s') : l) = strNormalize ((Str (s++s')) : l) +strNormalize (il:ils) = il : (strNormalize ils) + +runToInlines :: ReaderOptions -> Docx -> Run -> [Inline] +runToInlines _ _ (Run rs runElems) + | isJust (rStyle rs) && (fromJust (rStyle rs)) `elem` codeSpans = + case runStyleToSpanAttr rs == ("", [], []) of + True -> [Str (runElemsToString runElems)] + False -> [Span (runStyleToSpanAttr rs) [Str (runElemsToString runElems)]] + | otherwise = case runStyleToSpanAttr rs == ("", [], []) of + True -> concatMap runElemToInlines runElems + False -> [Span (runStyleToSpanAttr rs) (concatMap runElemToInlines runElems)] +runToInlines opts docx@(Docx _ notes _ _ _ ) (Footnote fnId) = + case (getFootNote fnId notes) of + Just bodyParts -> + [Note [Div ("", ["footnote"], []) (map (bodyPartToBlock opts docx) bodyParts)]] + Nothing -> + [Note [Div ("", ["footnote"], []) []]] +runToInlines opts docx@(Docx _ notes _ _ _) (Endnote fnId) = + case (getEndNote fnId notes) of + Just bodyParts -> + [Note [Div ("", ["endnote"], []) (map (bodyPartToBlock opts docx) bodyParts)]] + Nothing -> + [Note [Div ("", ["endnote"], []) []]] + +parPartToInlines :: ReaderOptions -> Docx -> ParPart -> [Inline] +parPartToInlines opts docx (PlainRun r) = runToInlines opts docx r +parPartToInlines _ _ (BookMark _ anchor) = + [Span (anchor, ["anchor"], []) []] +parPartToInlines _ (Docx _ _ _ rels _) (Drawing relid) = + case lookupRelationship relid rels of + Just target -> [Image [] (combine "word" target, "")] + Nothing -> [Image [] ("", "")] +parPartToInlines opts docx (InternalHyperLink anchor runs) = + [Link (concatMap (runToInlines opts docx) runs) ('#' : anchor, "")] +parPartToInlines opts docx@(Docx _ _ _ rels _) (ExternalHyperLink relid runs) = + case lookupRelationship relid rels of + Just target -> + [Link (concatMap (runToInlines opts docx) runs) (target, "")] + Nothing -> + [Link (concatMap (runToInlines opts docx) runs) ("", "")] + +isAnchorSpan :: Inline -> Bool +isAnchorSpan (Span (ident, classes, kvs) ils) = + (not . null) ident && + classes == ["anchor"] && + null kvs && + null ils +isAnchorSpan _ = False + +dummyAnchors :: [String] +dummyAnchors = ["_GoBack"] + +makeHeaderAnchors :: Block -> Block +makeHeaderAnchors h@(Header n (_, classes, kvs) ils) = + case filter isAnchorSpan ils of + [] -> h + (x@(Span (ident, _, _) _) : xs) -> + case ident `elem` dummyAnchors of + True -> h + False -> Header n (ident, classes, kvs) (ils \\ (x:xs)) + _ -> h +makeHeaderAnchors blk = blk + + +parPartsToInlines :: ReaderOptions -> Docx -> [ParPart] -> [Inline] +parPartsToInlines opts docx parparts = + -- + -- We're going to skip data-uri's for now. It should be an option, + -- not mandatory. + -- + --bottomUp (makeImagesSelfContained docx) $ + bottomUp spanCorrect $ + bottomUp spanTrim $ + bottomUp spanReduce $ + concatMap (parPartToInlines opts docx) parparts + +cellToBlocks :: ReaderOptions -> Docx -> Cell -> [Block] +cellToBlocks opts docx (Cell bps) = map (bodyPartToBlock opts docx) bps + +rowToBlocksList :: ReaderOptions -> Docx -> Row -> [[Block]] +rowToBlocksList opts docx (Row cells) = map (cellToBlocks opts docx) cells + +bodyPartToBlock :: ReaderOptions -> Docx -> BodyPart -> Block +bodyPartToBlock opts docx (Paragraph pPr parparts) = + Div (parStyleToDivAttr pPr) [Para (parPartsToInlines opts docx parparts)] +bodyPartToBlock opts docx@(Docx _ _ numbering _ _) (ListItem pPr numId lvl parparts) = + let + kvs = case lookupLevel numId lvl numbering of + Just (_, fmt, txt, Just start) -> [ ("level", lvl) + , ("num-id", numId) + , ("format", fmt) + , ("text", txt) + , ("start", (show start)) + ] + + Just (_, fmt, txt, Nothing) -> [ ("level", lvl) + , ("num-id", numId) + , ("format", fmt) + , ("text", txt) + ] + Nothing -> [] + in + Div + ("", ["list-item"], kvs) + [bodyPartToBlock opts docx (Paragraph pPr parparts)] +bodyPartToBlock _ _ (Tbl _ _ _ []) = + Para [] +bodyPartToBlock opts docx (Tbl cap _ look (r:rs)) = + let caption = strToInlines cap + (hdr, rows) = case firstRowFormatting look of + True -> (Just r, rs) + False -> (Nothing, r:rs) + hdrCells = case hdr of + Just r' -> rowToBlocksList opts docx r' + Nothing -> [] + cells = map (rowToBlocksList opts docx) rows + + size = case null hdrCells of + True -> length $ head cells + False -> length $ hdrCells + -- + -- The two following variables (horizontal column alignment and + -- relative column widths) go to the default at the + -- moment. Width information is in the TblGrid field of the Tbl, + -- so should be possible. Alignment might be more difficult, + -- since there doesn't seem to be a column entity in docx. + alignments = take size (repeat AlignDefault) + widths = take size (repeat 0) :: [Double] + in + Table caption alignments widths hdrCells cells + +makeImagesSelfContained :: Docx -> Inline -> Inline +makeImagesSelfContained (Docx _ _ _ _ media) i@(Image alt (uri, title)) = + case lookup uri media of + Just bs -> case getMimeType uri of + Just mime -> let data_uri = + "data:" ++ mime ++ ";base64," ++ toString (encode $ BS.concat $ B.toChunks bs) + in + Image alt (data_uri, title) + Nothing -> i + Nothing -> i +makeImagesSelfContained _ inline = inline + +bodyToBlocks :: ReaderOptions -> Docx -> Body -> [Block] +bodyToBlocks opts docx (Body bps) = + bottomUp removeEmptyPars $ + bottomUp strNormalize $ + bottomUp spanRemove $ + bottomUp divRemove $ + map (makeHeaderAnchors) $ + bottomUp divCorrect $ + bottomUp divReduce $ + bottomUp divCorrectPreReduce $ + bottomUp blocksToDefinitions $ + blocksToBullets $ + map (bodyPartToBlock opts docx) bps + +docxToBlocks :: ReaderOptions -> Docx -> [Block] +docxToBlocks opts d@(Docx (Document _ body) _ _ _ _) = bodyToBlocks opts d body + +spanReduce :: [Inline] -> [Inline] +spanReduce [] = [] +spanReduce ((Span (id1, classes1, kvs1) ils1) : ils) + | (id1, classes1, kvs1) == ("", [], []) = ils1 ++ (spanReduce ils) +spanReduce (s1@(Span (id1, classes1, kvs1) ils1) : + s2@(Span (id2, classes2, kvs2) ils2) : + ils) = + let classes' = classes1 `intersect` classes2 + kvs' = kvs1 `intersect` kvs2 + classes1' = classes1 \\ classes' + kvs1' = kvs1 \\ kvs' + classes2' = classes2 \\ classes' + kvs2' = kvs2 \\ kvs' + in + case null classes' && null kvs' of + True -> s1 : (spanReduce (s2 : ils)) + False -> let attr' = ("", classes', kvs') + attr1' = (id1, classes1', kvs1') + attr2' = (id2, classes2', kvs2') + in + spanReduce (Span attr' [(Span attr1' ils1), (Span attr2' ils2)] : + ils) +spanReduce (il:ils) = il : (spanReduce ils) + +ilToCode :: Inline -> String +ilToCode (Str s) = s +ilToCode _ = "" + +spanRemove' :: Inline -> [Inline] +spanRemove' s@(Span (ident, classes, _) []) + -- "_GoBack" is automatically inserted. We don't want to keep it. + | classes == ["anchor"] && not (ident `elem` dummyAnchors) = [s] +spanRemove' (Span (_, _, kvs) ils) = + case lookup "underline" kvs of + Just val -> [Span ("", [], [("underline", val)]) ils] + Nothing -> ils +spanRemove' il = [il] + +spanRemove :: [Inline] -> [Inline] +spanRemove = concatMap spanRemove' + +spanTrim' :: Inline -> [Inline] +spanTrim' il@(Span _ []) = [il] +spanTrim' il@(Span attr (il':[])) + | il' == Space = [Span attr [], Space] + | otherwise = [il] +spanTrim' (Span attr ils) + | head ils == Space && last ils == Space = + [Space, Span attr (init $ tail ils), Space] + | head ils == Space = [Space, Span attr (tail ils)] + | last ils == Space = [Span attr (init ils), Space] +spanTrim' il = [il] + +spanTrim :: [Inline] -> [Inline] +spanTrim = concatMap spanTrim' + +spanCorrect' :: Inline -> [Inline] +spanCorrect' (Span ("", [], []) ils) = ils +spanCorrect' (Span (ident, classes, kvs) ils) + | "emph" `elem` classes = + [Emph $ spanCorrect' $ Span (ident, (delete "emph" classes), kvs) ils] + | "strong" `elem` classes = + [Strong $ spanCorrect' $ Span (ident, (delete "strong" classes), kvs) ils] + | "smallcaps" `elem` classes = + [SmallCaps $ spanCorrect' $ Span (ident, (delete "smallcaps" classes), kvs) ils] + | "strike" `elem` classes = + [Strikeout $ spanCorrect' $ Span (ident, (delete "strike" classes), kvs) ils] + | "superscript" `elem` classes = + [Superscript $ spanCorrect' $ Span (ident, (delete "superscript" classes), kvs) ils] + | "subscript" `elem` classes = + [Subscript $ spanCorrect' $ Span (ident, (delete "subscript" classes), kvs) ils] + | (not . null) (codeSpans `intersect` classes) = + [Code (ident, (classes \\ codeSpans), kvs) (init $ unlines $ map ilToCode ils)] + | otherwise = + [Span (ident, classes, kvs) ils] +spanCorrect' il = [il] + +spanCorrect :: [Inline] -> [Inline] +spanCorrect = concatMap spanCorrect' + +removeEmptyPars :: [Block] -> [Block] +removeEmptyPars blks = filter (\b -> b /= (Para [])) blks + +divReduce :: [Block] -> [Block] +divReduce [] = [] +divReduce ((Div (id1, classes1, kvs1) blks1) : blks) + | (id1, classes1, kvs1) == ("", [], []) = blks1 ++ (divReduce blks) +divReduce (d1@(Div (id1, classes1, kvs1) blks1) : + d2@(Div (id2, classes2, kvs2) blks2) : + blks) = + let classes' = classes1 `intersect` classes2 + kvs' = kvs1 `intersect` kvs2 + classes1' = classes1 \\ classes' + kvs1' = kvs1 \\ kvs' + classes2' = classes2 \\ classes' + kvs2' = kvs2 \\ kvs' + in + case null classes' && null kvs' of + True -> d1 : (divReduce (d2 : blks)) + False -> let attr' = ("", classes', kvs') + attr1' = (id1, classes1', kvs1') + attr2' = (id2, classes2', kvs2') + in + divReduce (Div attr' [(Div attr1' blks1), (Div attr2' blks2)] : + blks) +divReduce (blk:blks) = blk : (divReduce blks) + +isHeaderClass :: String -> Maybe Int +isHeaderClass s | "Heading" `isPrefixOf` s = + case reads (drop (length "Heading") s) :: [(Int, String)] of + [] -> Nothing + ((n, "") : []) -> Just n + _ -> Nothing +isHeaderClass _ = Nothing + +findHeaderClass :: [String] -> Maybe Int +findHeaderClass ss = case mapMaybe id $ map isHeaderClass ss of + [] -> Nothing + n : _ -> Just n + +blksToInlines :: [Block] -> [Inline] +blksToInlines (Para ils : _) = ils +blksToInlines (Plain ils : _) = ils +blksToInlines _ = [] + +divCorrectPreReduce' :: Block -> [Block] +divCorrectPreReduce' (Div (ident, classes, kvs) blks) + | isJust $ findHeaderClass classes = + let n = fromJust $ findHeaderClass classes + in + [Header n (ident, delete ("Heading" ++ (show n)) classes, kvs) (blksToInlines blks)] + | otherwise = [Div (ident, classes, kvs) blks] +divCorrectPreReduce' blk = [blk] + +divCorrectPreReduce :: [Block] -> [Block] +divCorrectPreReduce = concatMap divCorrectPreReduce' + +blkToCode :: Block -> String +blkToCode (Para []) = "" +blkToCode (Para ((Code _ s):ils)) = s ++ (blkToCode (Para ils)) +blkToCode (Para ((Span (_, classes, _) ils'): ils)) + | (not . null) (codeSpans `intersect` classes) = + (init $ unlines $ map ilToCode ils') ++ (blkToCode (Para ils)) +blkToCode _ = "" + +divRemove' :: Block -> [Block] +divRemove' (Div (_, _, kvs) blks) = + case lookup "indent" kvs of + Just val -> [Div ("", [], [("indent", val)]) blks] + Nothing -> blks +divRemove' blk = [blk] + +divRemove :: [Block] -> [Block] +divRemove = concatMap divRemove' + +divCorrect' :: Block -> [Block] +divCorrect' b@(Div (ident, classes, kvs) blks) + | (not . null) (blockQuoteDivs `intersect` classes) = + [BlockQuote [Div (ident, classes \\ blockQuoteDivs, kvs) blks]] + | (not . null) (codeDivs `intersect` classes) = + [CodeBlock (ident, (classes \\ codeDivs), kvs) (init $ unlines $ map blkToCode blks)] + | otherwise = + case lookup "indent" kvs of + Just "0" -> [Div (ident, classes, filter (\kv -> fst kv /= "indent") kvs) blks] + Just _ -> + [BlockQuote [Div (ident, classes, filter (\kv -> fst kv /= "indent") kvs) blks]] + Nothing -> [b] +divCorrect' blk = [blk] + +divCorrect :: [Block] -> [Block] +divCorrect = concatMap divCorrect' diff --git a/src/Text/Pandoc/Readers/Docx/Lists.hs b/src/Text/Pandoc/Readers/Docx/Lists.hs new file mode 100644 index 000000000..68559d98b --- /dev/null +++ b/src/Text/Pandoc/Readers/Docx/Lists.hs @@ -0,0 +1,208 @@ +{- +Copyright (C) 2014 Jesse Rosenthal + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Readers.Docx.Lists + Copyright : Copyright (C) 2014 Jesse Rosenthal + License : GNU GPL, version 2 or above + + Maintainer : Jesse Rosenthal + Stability : alpha + Portability : portable + +Functions for converting flat docx paragraphs into nested lists. +-} + +module Text.Pandoc.Readers.Docx.Lists ( blocksToBullets + , blocksToDefinitions) where + +import Text.Pandoc.JSON +import Text.Pandoc.Shared (trim) +import Control.Monad +import Data.List +import Data.Maybe + +isListItem :: Block -> Bool +isListItem (Div (_, classes, _) _) | "list-item" `elem` classes = True +isListItem _ = False + +getLevel :: Block -> Maybe Integer +getLevel (Div (_, _, kvs) _) = liftM read $ lookup "level" kvs +getLevel _ = Nothing + +getLevelN :: Block -> Integer +getLevelN b = case getLevel b of + Just n -> n + Nothing -> -1 + +getNumId :: Block -> Maybe Integer +getNumId (Div (_, _, kvs) _) = liftM read $ lookup "num-id" kvs +getNumId _ = Nothing + +getNumIdN :: Block -> Integer +getNumIdN b = case getNumId b of + Just n -> n + Nothing -> -1 + +getText :: Block -> Maybe String +getText (Div (_, _, kvs) _) = lookup "text" kvs +getText _ = Nothing + +data ListType = Itemized | Enumerated ListAttributes + +listStyleMap :: [(String, ListNumberStyle)] +listStyleMap = [("upperLetter", UpperAlpha), + ("lowerLetter", LowerAlpha), + ("upperRoman", UpperRoman), + ("lowerRoman", LowerRoman), + ("decimal", Decimal)] + +listDelimMap :: [(String, ListNumberDelim)] +listDelimMap = [("%1)", OneParen), + ("(%1)", TwoParens), + ("%1.", Period)] + +getListType :: Block -> Maybe ListType +getListType b@(Div (_, _, kvs) _) | isListItem b = + let + start = lookup "start" kvs + frmt = lookup "format" kvs + txt = lookup "text" kvs + in + case frmt of + Just "bullet" -> Just Itemized + Just f -> + case txt of + Just t -> Just $ Enumerated ( + read (fromMaybe "1" start) :: Int, + fromMaybe DefaultStyle (lookup f listStyleMap), + fromMaybe DefaultDelim (lookup t listDelimMap)) + Nothing -> Nothing + _ -> Nothing +getListType _ = Nothing + +listParagraphDivs :: [String] +listParagraphDivs = ["ListParagraph"] + +-- This is a first stab at going through and attaching meaning to list +-- paragraphs, without an item marker, following a list item. We +-- assume that these are paragraphs in the same item. + +handleListParagraphs :: [Block] -> [Block] +handleListParagraphs [] = [] +handleListParagraphs ( + (Div attr1@(_, classes1, _) blks1) : + (Div (ident2, classes2, kvs2) blks2) : + blks + ) | "list-item" `elem` classes1 && + not ("list-item" `elem` classes2) && + (not . null) (listParagraphDivs `intersect` classes2) = + -- We don't want to keep this indent. + let newDiv2 = + (Div (ident2, classes2, filter (\kv -> fst kv /= "indent") kvs2) blks2) + in + handleListParagraphs ((Div attr1 (blks1 ++ [newDiv2])) : blks) +handleListParagraphs (blk:blks) = blk : (handleListParagraphs blks) + +separateBlocks' :: Block -> [[Block]] -> [[Block]] +separateBlocks' blk ([] : []) = [[blk]] +separateBlocks' b@(BulletList _) acc = (init acc) ++ [(last acc) ++ [b]] +separateBlocks' b@(OrderedList _ _) acc = (init acc) ++ [(last acc) ++ [b]] +-- The following is for the invisible bullet lists. This is how +-- pandoc-generated ooxml does multiparagraph item lists. +separateBlocks' b acc | liftM trim (getText b) == Just "" = + (init acc) ++ [(last acc) ++ [b]] +separateBlocks' b acc = acc ++ [[b]] + +separateBlocks :: [Block] -> [[Block]] +separateBlocks blks = foldr separateBlocks' [[]] (reverse blks) + +flatToBullets' :: Integer -> [Block] -> [Block] +flatToBullets' _ [] = [] +flatToBullets' num xs@(b : elems) + | getLevelN b == num = b : (flatToBullets' num elems) + | otherwise = + let bNumId = getNumIdN b + bLevel = getLevelN b + (children, remaining) = + span + (\b' -> + ((getLevelN b') > bLevel || + ((getLevelN b') == bLevel && (getNumIdN b') == bNumId))) + xs + in + case getListType b of + Just (Enumerated attr) -> + (OrderedList attr (separateBlocks $ flatToBullets' bLevel children)) : + (flatToBullets' num remaining) + _ -> + (BulletList (separateBlocks $ flatToBullets' bLevel children)) : + (flatToBullets' num remaining) + +flatToBullets :: [Block] -> [Block] +flatToBullets elems = flatToBullets' (-1) elems + +blocksToBullets :: [Block] -> [Block] +blocksToBullets blks = + -- bottomUp removeListItemDivs $ + flatToBullets $ (handleListParagraphs blks) + + +plainParaInlines :: Block -> [Inline] +plainParaInlines (Plain ils) = ils +plainParaInlines (Para ils) = ils +plainParaInlines _ = [] + +blocksToDefinitions' :: [([Inline], [[Block]])] -> [Block] -> [Block] -> [Block] +blocksToDefinitions' [] acc [] = reverse acc +blocksToDefinitions' defAcc acc [] = + reverse $ (DefinitionList (reverse defAcc)) : acc +blocksToDefinitions' defAcc acc + ((Div (_, classes1, _) blks1) : (Div (ident2, classes2, kvs2) blks2) : blks) + | "DefinitionTerm" `elem` classes1 && "Definition" `elem` classes2 = + let remainingAttr2 = (ident2, delete "Definition" classes2, kvs2) + pair = case remainingAttr2 == ("", [], []) of + True -> (concatMap plainParaInlines blks1, [blks2]) + False -> (concatMap plainParaInlines blks1, [[Div remainingAttr2 blks2]]) + in + blocksToDefinitions' (pair : defAcc) acc blks +blocksToDefinitions' defAcc acc + ((Div (ident2, classes2, kvs2) blks2) : blks) + | (not . null) defAcc && "Definition" `elem` classes2 = + let remainingAttr2 = (ident2, delete "Definition" classes2, kvs2) + defItems2 = case remainingAttr2 == ("", [], []) of + True -> blks2 + False -> [Div remainingAttr2 blks2] + ((defTerm, defItems):defs) = defAcc + defAcc' = case null defItems of + True -> (defTerm, [defItems2]) : defs + False -> (defTerm, init defItems ++ [last defItems ++ defItems2]) : defs + in + blocksToDefinitions' defAcc' acc blks +blocksToDefinitions' [] acc (b:blks) = + blocksToDefinitions' [] (b:acc) blks +blocksToDefinitions' defAcc acc (b:blks) = + blocksToDefinitions' [] (b : (DefinitionList (reverse defAcc)) : acc) blks + + +blocksToDefinitions :: [Block] -> [Block] +blocksToDefinitions = blocksToDefinitions' [] [] + + + + diff --git a/src/Text/Pandoc/Readers/Docx/Parse.hs b/src/Text/Pandoc/Readers/Docx/Parse.hs new file mode 100644 index 000000000..22e9dd909 --- /dev/null +++ b/src/Text/Pandoc/Readers/Docx/Parse.hs @@ -0,0 +1,604 @@ +{- +Copyright (C) 2014 Jesse Rosenthal + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Readers.Docx.Parse + Copyright : Copyright (C) 2014 Jesse Rosenthal + License : GNU GPL, version 2 or above + + Maintainer : Jesse Rosenthal + Stability : alpha + Portability : portable + +Conversion of docx archive into Docx haskell type +-} + + +module Text.Pandoc.Readers.Docx.Parse ( Docx(..) + , Document(..) + , Body(..) + , BodyPart(..) + , TblLook(..) + , ParPart(..) + , Run(..) + , RunElem(..) + , Notes + , Numbering + , Relationship + , Media + , RunStyle(..) + , ParagraphStyle(..) + , Row(..) + , Cell(..) + , getFootNote + , getEndNote + , lookupLevel + , lookupRelationship + , archiveToDocx + ) where +import Codec.Archive.Zip +import Text.XML.Light +import Data.Maybe +import Data.List +import System.FilePath +import Data.Bits ((.|.)) +import qualified Data.ByteString.Lazy as B +import qualified Text.Pandoc.UTF8 as UTF8 + +attrToNSPair :: Attr -> Maybe (String, String) +attrToNSPair (Attr (QName s _ (Just "xmlns")) val) = Just (s, val) +attrToNSPair _ = Nothing + + +type NameSpaces = [(String, String)] + +data Docx = Docx Document Notes Numbering [Relationship] Media + deriving Show + +archiveToDocx :: Archive -> Maybe Docx +archiveToDocx archive = do + let notes = archiveToNotes archive + rels = archiveToRelationships archive + media = archiveToMedia archive + doc <- archiveToDocument archive + numbering <- archiveToNumbering archive + return $ Docx doc notes numbering rels media + +data Document = Document NameSpaces Body + deriving Show + +archiveToDocument :: Archive -> Maybe Document +archiveToDocument zf = do + entry <- findEntryByPath "word/document.xml" zf + docElem <- (parseXMLDoc . UTF8.toStringLazy . fromEntry) entry + let namespaces = mapMaybe attrToNSPair (elAttribs docElem) + bodyElem <- findChild (QName "body" (lookup "w" namespaces) Nothing) docElem + body <- elemToBody namespaces bodyElem + return $ Document namespaces body + +type Media = [(FilePath, B.ByteString)] + +filePathIsMedia :: FilePath -> Bool +filePathIsMedia fp = + let (dir, _) = splitFileName fp + in + (dir == "word/media/") + +getMediaPair :: Archive -> FilePath -> Maybe (FilePath, B.ByteString) +getMediaPair zf fp = + case findEntryByPath fp zf of + Just e -> Just (fp, fromEntry e) + Nothing -> Nothing + +archiveToMedia :: Archive -> Media +archiveToMedia zf = + mapMaybe (getMediaPair zf) (filter filePathIsMedia (filesInArchive zf)) + +data Numbering = Numbering NameSpaces [Numb] [AbstractNumb] + deriving Show + +data Numb = Numb String String -- right now, only a key to an abstract num + deriving Show + +data AbstractNumb = AbstractNumb String [Level] + deriving Show + +-- (ilvl, format, string, start) +type Level = (String, String, String, Maybe Integer) + +lookupLevel :: String -> String -> Numbering -> Maybe Level +lookupLevel numId ilvl (Numbering _ numbs absNumbs) = do + absNumId <- lookup numId $ map (\(Numb nid absnumid) -> (nid, absnumid)) numbs + lvls <- lookup absNumId $ map (\(AbstractNumb aid ls) -> (aid, ls)) absNumbs + lvl <- lookup ilvl $ map (\l@(i, _, _, _) -> (i, l)) lvls + return lvl + +numElemToNum :: NameSpaces -> Element -> Maybe Numb +numElemToNum ns element | + qName (elName element) == "num" && + qURI (elName element) == (lookup "w" ns) = do + numId <- findAttr (QName "numId" (lookup "w" ns) (Just "w")) element + absNumId <- findChild (QName "abstractNumId" (lookup "w" ns) (Just "w")) element + >>= findAttr (QName "val" (lookup "w" ns) (Just "w")) + return $ Numb numId absNumId +numElemToNum _ _ = Nothing + +absNumElemToAbsNum :: NameSpaces -> Element -> Maybe AbstractNumb +absNumElemToAbsNum ns element | + qName (elName element) == "abstractNum" && + qURI (elName element) == (lookup "w" ns) = do + absNumId <- findAttr + (QName "abstractNumId" (lookup "w" ns) (Just "w")) + element + let levelElems = findChildren + (QName "lvl" (lookup "w" ns) (Just "w")) + element + levels = mapMaybe id $ map (levelElemToLevel ns) levelElems + return $ AbstractNumb absNumId levels +absNumElemToAbsNum _ _ = Nothing + +levelElemToLevel :: NameSpaces -> Element -> Maybe Level +levelElemToLevel ns element | + qName (elName element) == "lvl" && + qURI (elName element) == (lookup "w" ns) = do + ilvl <- findAttr (QName "ilvl" (lookup "w" ns) (Just "w")) element + fmt <- findChild (QName "numFmt" (lookup "w" ns) (Just "w")) element + >>= findAttr (QName "val" (lookup "w" ns) (Just "w")) + txt <- findChild (QName "lvlText" (lookup "w" ns) (Just "w")) element + >>= findAttr (QName "val" (lookup "w" ns) (Just "w")) + let start = findChild (QName "start" (lookup "w" ns) (Just "w")) element + >>= findAttr (QName "val" (lookup "w" ns) (Just "w")) + >>= (\s -> listToMaybe (map fst (reads s :: [(Integer, String)]))) + return (ilvl, fmt, txt, start) +levelElemToLevel _ _ = Nothing + +archiveToNumbering :: Archive -> Maybe Numbering +archiveToNumbering zf = + case findEntryByPath "word/numbering.xml" zf of + Nothing -> Just $ Numbering [] [] [] + Just entry -> do + numberingElem <- (parseXMLDoc . UTF8.toStringLazy . fromEntry) entry + let namespaces = mapMaybe attrToNSPair (elAttribs numberingElem) + numElems = findChildren + (QName "num" (lookup "w" namespaces) (Just "w")) + numberingElem + absNumElems = findChildren + (QName "abstractNum" (lookup "w" namespaces) (Just "w")) + numberingElem + nums = mapMaybe id $ map (numElemToNum namespaces) numElems + absNums = mapMaybe id $ map (absNumElemToAbsNum namespaces) absNumElems + return $ Numbering namespaces nums absNums + +data Notes = Notes NameSpaces (Maybe [(String, [BodyPart])]) (Maybe [(String, [BodyPart])]) + deriving Show + +noteElemToNote :: NameSpaces -> Element -> Maybe (String, [BodyPart]) +noteElemToNote ns element + | qName (elName element) `elem` ["endnote", "footnote"] && + qURI (elName element) == (lookup "w" ns) = + do + noteId <- findAttr (QName "id" (lookup "w" ns) (Just "w")) element + let bps = map fromJust + $ filter isJust + $ map (elemToBodyPart ns) + $ filterChildrenName (isParOrTbl ns) element + return $ (noteId, bps) +noteElemToNote _ _ = Nothing + +getFootNote :: String -> Notes -> Maybe [BodyPart] +getFootNote s (Notes _ fns _) = fns >>= (lookup s) + +getEndNote :: String -> Notes -> Maybe [BodyPart] +getEndNote s (Notes _ _ ens) = ens >>= (lookup s) + +elemToNotes :: NameSpaces -> String -> Element -> Maybe [(String, [BodyPart])] +elemToNotes ns notetype element + | qName (elName element) == (notetype ++ "s") && + qURI (elName element) == (lookup "w" ns) = + Just $ map fromJust + $ filter isJust + $ map (noteElemToNote ns) + $ findChildren (QName notetype (lookup "w" ns) (Just "w")) element +elemToNotes _ _ _ = Nothing + +archiveToNotes :: Archive -> Notes +archiveToNotes zf = + let fnElem = findEntryByPath "word/footnotes.xml" zf + >>= (parseXMLDoc . UTF8.toStringLazy . fromEntry) + enElem = findEntryByPath "word/endnotes.xml" zf + >>= (parseXMLDoc . UTF8.toStringLazy . fromEntry) + fn_namespaces = case fnElem of + Just e -> mapMaybe attrToNSPair (elAttribs e) + Nothing -> [] + en_namespaces = case enElem of + Just e -> mapMaybe attrToNSPair (elAttribs e) + Nothing -> [] + ns = unionBy (\x y -> fst x == fst y) fn_namespaces en_namespaces + fn = fnElem >>= (elemToNotes ns "footnote") + en = enElem >>= (elemToNotes ns "endnote") + in + Notes ns fn en + + +data Relationship = Relationship (RelId, Target) + deriving Show + +lookupRelationship :: RelId -> [Relationship] -> Maybe Target +lookupRelationship relid rels = + lookup relid (map (\(Relationship pair) -> pair) rels) + +filePathIsRel :: FilePath -> Bool +filePathIsRel fp = + let (dir, name) = splitFileName fp + in + (dir == "word/_rels/") && ((takeExtension name) == ".rels") + +relElemToRelationship :: Element -> Maybe Relationship +relElemToRelationship element | qName (elName element) == "Relationship" = + do + relId <- findAttr (QName "Id" Nothing Nothing) element + target <- findAttr (QName "Target" Nothing Nothing) element + return $ Relationship (relId, target) +relElemToRelationship _ = Nothing + + +archiveToRelationships :: Archive -> [Relationship] +archiveToRelationships archive = + let relPaths = filter filePathIsRel (filesInArchive archive) + entries = map fromJust $ filter isJust $ map (\f -> findEntryByPath f archive) relPaths + relElems = map fromJust $ filter isJust $ map (parseXMLDoc . UTF8.toStringLazy . fromEntry) entries + rels = map fromJust $ filter isJust $ map relElemToRelationship $ concatMap elChildren relElems + in + rels + +data Body = Body [BodyPart] + deriving Show + +isParOrTbl :: NameSpaces -> QName -> Bool +isParOrTbl ns q = qName q `elem` ["p", "tbl"] && + qURI q == (lookup "w" ns) + +elemToBody :: NameSpaces -> Element -> Maybe Body +elemToBody ns element | qName (elName element) == "body" && qURI (elName element) == (lookup "w" ns) = + Just $ Body + $ map fromJust + $ filter isJust + $ map (elemToBodyPart ns) $ filterChildrenName (isParOrTbl ns) element +elemToBody _ _ = Nothing + +isRunOrLinkOrBookmark :: NameSpaces -> QName -> Bool +isRunOrLinkOrBookmark ns q = qName q `elem` ["r", "hyperlink", "bookmarkStart"] && + qURI q == (lookup "w" ns) + +elemToNumInfo :: NameSpaces -> Element -> Maybe (String, String) +elemToNumInfo ns element + | qName (elName element) == "p" && + qURI (elName element) == (lookup "w" ns) = + do + pPr <- findChild (QName "pPr" (lookup "w" ns) (Just "w")) element + numPr <- findChild (QName "numPr" (lookup "w" ns) (Just "w")) pPr + lvl <- findChild (QName "ilvl" (lookup "w" ns) (Just "w")) numPr >>= + findAttr (QName "val" (lookup "w" ns) (Just "w")) + numId <- findChild (QName "numId" (lookup "w" ns) (Just "w")) numPr >>= + findAttr (QName "val" (lookup "w" ns) (Just "w")) + return (numId, lvl) +elemToNumInfo _ _ = Nothing + +-- isBookMarkTag :: NameSpaces -> QName -> Bool +-- isBookMarkTag ns q = qName q `elem` ["bookmarkStart", "bookmarkEnd"] && +-- qURI q == (lookup "w" ns) + +-- parChildrenToBookmark :: NameSpaces -> [Element] -> BookMark +-- parChildrenToBookmark ns (bms : bme : _) +-- | qName (elName bms) == "bookmarkStart" && +-- qURI (elName bms) == (lookup "w" ns) && +-- qName (elName bme) == "bookmarkEnd" && +-- qURI (elName bme) == (lookup "w" ns) = do +-- bmId <- findAttr (QName "id" (lookup "w" ns) (Just "w")) bms +-- bmName <- findAttr (QName "name" (lookup "w" ns) (Just "w")) bms +-- return $ (bmId, bmName) +-- parChildrenToBookmark _ _ = Nothing + +elemToBodyPart :: NameSpaces -> Element -> Maybe BodyPart +elemToBodyPart ns element + | qName (elName element) == "p" && + qURI (elName element) == (lookup "w" ns) = + let parstyle = elemToParagraphStyle ns element + parparts = mapMaybe id + $ map (elemToParPart ns) + $ filterChildrenName (isRunOrLinkOrBookmark ns) element + in + case elemToNumInfo ns element of + Just (numId, lvl) -> Just $ ListItem parstyle numId lvl parparts + Nothing -> Just $ Paragraph parstyle parparts + | qName (elName element) == "tbl" && + qURI (elName element) == (lookup "w" ns) = + let + caption = findChild (QName "tblPr" (lookup "w" ns) (Just "w")) element + >>= findChild (QName "tblCaption" (lookup "w" ns) (Just "w")) + >>= findAttr (QName "val" (lookup "w" ns) (Just "w")) + grid = case + findChild (QName "tblGrid" (lookup "w" ns) (Just "w")) element + of + Just g -> elemToTblGrid ns g + Nothing -> [] + tblLook = findChild (QName "tblPr" (lookup "w" ns) (Just "w")) element + >>= findChild (QName "tblLook" (lookup "w" ns) (Just "w")) + >>= elemToTblLook ns + in + Just $ Tbl + (fromMaybe "" caption) + grid + (fromMaybe defaultTblLook tblLook) + (mapMaybe (elemToRow ns) (elChildren element)) + | otherwise = Nothing + +elemToTblLook :: NameSpaces -> Element -> Maybe TblLook +elemToTblLook ns element + | qName (elName element) == "tblLook" && + qURI (elName element) == (lookup "w" ns) = + let firstRow = findAttr (QName "firstRow" (lookup "w" ns) (Just "w")) element + val = findAttr (QName "val" (lookup "w" ns) (Just "w")) element + firstRowFmt = + case firstRow of + Just "1" -> True + Just _ -> False + Nothing -> case val of + Just bitMask -> testBitMask bitMask 0x020 + Nothing -> False + in + Just $ TblLook{firstRowFormatting = firstRowFmt} +elemToTblLook _ _ = Nothing + +testBitMask :: String -> Int -> Bool +testBitMask bitMaskS n = + case (reads ("0x" ++ bitMaskS) :: [(Int, String)]) of + [] -> False + ((n', _) : _) -> ((n' .|. n) /= 0) + +data ParagraphStyle = ParagraphStyle { pStyle :: [String] + , indent :: Maybe Integer + } + deriving Show + +defaultParagraphStyle :: ParagraphStyle +defaultParagraphStyle = ParagraphStyle { pStyle = [] + , indent = Nothing + } + +elemToParagraphStyle :: NameSpaces -> Element -> ParagraphStyle +elemToParagraphStyle ns element = + case findChild (QName "pPr" (lookup "w" ns) (Just "w")) element of + Just pPr -> + ParagraphStyle + {pStyle = + mapMaybe id $ + map + (findAttr (QName "val" (lookup "w" ns) (Just "w"))) + (findChildren (QName "pStyle" (lookup "w" ns) (Just "w")) pPr) + , indent = + findChild (QName "ind" (lookup "w" ns) (Just "w")) pPr >>= + findAttr (QName "left" (lookup "w" ns) (Just "w")) >>= + stringToInteger + } + Nothing -> defaultParagraphStyle + + +data BodyPart = Paragraph ParagraphStyle [ParPart] + | ListItem ParagraphStyle String String [ParPart] + | Tbl String TblGrid TblLook [Row] + + deriving Show + +type TblGrid = [Integer] + +data TblLook = TblLook {firstRowFormatting::Bool} + deriving Show + +defaultTblLook :: TblLook +defaultTblLook = TblLook{firstRowFormatting = False} + +stringToInteger :: String -> Maybe Integer +stringToInteger s = listToMaybe $ map fst (reads s :: [(Integer, String)]) + +elemToTblGrid :: NameSpaces -> Element -> TblGrid +elemToTblGrid ns element + | qName (elName element) == "tblGrid" && + qURI (elName element) == (lookup "w" ns) = + let + cols = findChildren (QName "gridCol" (lookup "w" ns) (Just "w")) element + in + mapMaybe (\e -> + findAttr (QName "val" (lookup "w" ns) (Just ("w"))) e + >>= stringToInteger + ) + cols +elemToTblGrid _ _ = [] + +data Row = Row [Cell] + deriving Show + + +elemToRow :: NameSpaces -> Element -> Maybe Row +elemToRow ns element + | qName (elName element) == "tr" && + qURI (elName element) == (lookup "w" ns) = + let + cells = findChildren (QName "tc" (lookup "w" ns) (Just "w")) element + in + Just $ Row (mapMaybe (elemToCell ns) cells) +elemToRow _ _ = Nothing + +data Cell = Cell [BodyPart] + deriving Show + +elemToCell :: NameSpaces -> Element -> Maybe Cell +elemToCell ns element + | qName (elName element) == "tc" && + qURI (elName element) == (lookup "w" ns) = + Just $ Cell (mapMaybe (elemToBodyPart ns) (elChildren element)) +elemToCell _ _ = Nothing + +data ParPart = PlainRun Run + | BookMark BookMarkId Anchor + | InternalHyperLink Anchor [Run] + | ExternalHyperLink RelId [Run] + | Drawing String + deriving Show + +data Run = Run RunStyle [RunElem] + | Footnote String + | Endnote String + deriving Show + +data RunElem = TextRun String | LnBrk + deriving Show + +data RunStyle = RunStyle { isBold :: Bool + , isItalic :: Bool + , isSmallCaps :: Bool + , isStrike :: Bool + , isSuperScript :: Bool + , isSubScript :: Bool + , underline :: Maybe String + , rStyle :: Maybe String } + deriving Show + +defaultRunStyle :: RunStyle +defaultRunStyle = RunStyle { isBold = False + , isItalic = False + , isSmallCaps = False + , isStrike = False + , isSuperScript = False + , isSubScript = False + , underline = Nothing + , rStyle = Nothing + } + +elemToRunStyle :: NameSpaces -> Element -> RunStyle +elemToRunStyle ns element = + case findChild (QName "rPr" (lookup "w" ns) (Just "w")) element of + Just rPr -> + RunStyle + { + isBold = isJust $ findChild (QName "b" (lookup "w" ns) (Just "w")) rPr + , isItalic = isJust $ findChild (QName "i" (lookup "w" ns) (Just "w")) rPr + , isSmallCaps = isJust $ findChild (QName "smallCaps" (lookup "w" ns) (Just "w")) rPr + , isStrike = isJust $ findChild (QName "strike" (lookup "w" ns) (Just "w")) rPr + , isSuperScript = + (Just "superscript" == + (findChild (QName "vertAlign" (lookup "w" ns) (Just "w")) rPr >>= + findAttr (QName "val" (lookup "w" ns) (Just "w")))) + , isSubScript = + (Just "subscript" == + (findChild (QName "vertAlign" (lookup "w" ns) (Just "w")) rPr >>= + findAttr (QName "val" (lookup "w" ns) (Just "w")))) + , underline = + findChild (QName "u" (lookup "w" ns) (Just "w")) rPr >>= + findAttr (QName "val" (lookup "w" ns) (Just "w")) + , rStyle = + findChild (QName "rStyle" (lookup "w" ns) (Just "w")) rPr >>= + findAttr (QName "val" (lookup "w" ns) (Just "w")) + } + Nothing -> defaultRunStyle + +elemToRun :: NameSpaces -> Element -> Maybe Run +elemToRun ns element + | qName (elName element) == "r" && + qURI (elName element) == (lookup "w" ns) = + case + findChild (QName "footnoteReference" (lookup "w" ns) (Just "w")) element >>= + findAttr (QName "id" (lookup "w" ns) (Just "w")) + of + Just s -> Just $ Footnote s + Nothing -> + case + findChild (QName "endnoteReference" (lookup "w" ns) (Just "w")) element >>= + findAttr (QName "id" (lookup "w" ns) (Just "w")) + of + Just s -> Just $ Endnote s + Nothing -> Just $ + Run (elemToRunStyle ns element) + (elemToRunElems ns element) +elemToRun _ _ = Nothing + +elemToRunElem :: NameSpaces -> Element -> Maybe RunElem +elemToRunElem ns element + | qName (elName element) == "t" && + qURI (elName element) == (lookup "w" ns) = + Just $ TextRun (strContent element) + | qName (elName element) == "br" && + qURI (elName element) == (lookup "w" ns) = + Just $ LnBrk + | otherwise = Nothing + + +elemToRunElems :: NameSpaces -> Element -> [RunElem] +elemToRunElems ns element + | qName (elName element) == "r" && + qURI (elName element) == (lookup "w" ns) = + mapMaybe (elemToRunElem ns) (elChildren element) + | otherwise = [] + +elemToDrawing :: NameSpaces -> Element -> Maybe ParPart +elemToDrawing ns element + | qName (elName element) == "drawing" && + qURI (elName element) == (lookup "w" ns) = + let a_ns = "http://schemas.openxmlformats.org/drawingml/2006/main" + in + findElement (QName "blip" (Just a_ns) (Just "a")) element + >>= findAttr (QName "embed" (lookup "r" ns) (Just "r")) + >>= (\s -> Just $ Drawing s) +elemToDrawing _ _ = Nothing + + +elemToParPart :: NameSpaces -> Element -> Maybe ParPart +elemToParPart ns element + | qName (elName element) == "r" && + qURI (elName element) == (lookup "w" ns) = + case findChild (QName "drawing" (lookup "w" ns) (Just "w")) element of + Just drawingElem -> elemToDrawing ns drawingElem + Nothing -> do + r <- elemToRun ns element + return $ PlainRun r +elemToParPart ns element + | qName (elName element) == "bookmarkStart" && + qURI (elName element) == (lookup "w" ns) = do + bmId <- findAttr (QName "id" (lookup "w" ns) (Just "w")) element + bmName <- findAttr (QName "name" (lookup "w" ns) (Just "w")) element + return $ BookMark bmId bmName +elemToParPart ns element + | qName (elName element) == "hyperlink" && + qURI (elName element) == (lookup "w" ns) = + let runs = map fromJust $ filter isJust $ map (elemToRun ns) + $ findChildren (QName "r" (lookup "w" ns) (Just "w")) element + in + case findAttr (QName "anchor" (lookup "w" ns) (Just "w")) element of + Just anchor -> + Just $ InternalHyperLink anchor runs + Nothing -> + case findAttr (QName "id" (lookup "r" ns) (Just "r")) element of + Just relId -> Just $ ExternalHyperLink relId runs + Nothing -> Nothing +elemToParPart _ _ = Nothing + +type Target = String +type Anchor = String +type BookMarkId = String +type RelId = String + -- cgit v1.2.3