From 293e4cfdc3028218089115acd6f091b9ea3aa7d6 Mon Sep 17 00:00:00 2001 From: Jesse Rosenthal Date: Sat, 14 Jun 2014 10:02:35 -0400 Subject: Add DocX files to tree. This introduces Text.Pandoc.DocX, and its exported `readDocX` function. --- src/Text/Pandoc/Readers/DocX.hs | 479 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 479 insertions(+) create mode 100644 src/Text/Pandoc/Readers/DocX.hs (limited to 'src/Text/Pandoc/Readers/DocX.hs') diff --git a/src/Text/Pandoc/Readers/DocX.hs b/src/Text/Pandoc/Readers/DocX.hs new file mode 100644 index 000000000..976e2e271 --- /dev/null +++ b/src/Text/Pandoc/Readers/DocX.hs @@ -0,0 +1,479 @@ +{- +Copyright (C) 2014 Jesse Rosenthal + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Readers.DocX + Copyright : Copyright (C) 2014 Jesse Rosenthal + License : GNU GPL, version 2 or above + + Maintainer : Jesse Rosenthal + Stability : alpha + Portability : portable + +Conversion of DocX type (defined in Text.Pandoc.Readers.DocX.Parse) +to 'Pandoc' document. -} + +{- +Current state of implementation of DocX entities ([x] means +implemented, [-] means partially implemented): + +* Blocks + + - [X] Para + - [X] CodeBlock (styled with `SourceCode`) + - [X] BlockQuote (styled with `Quote`, `BlockQuote`, or, optionally, + indented) + - [X] OrderedList + - [X] BulletList + - [X] DefinitionList (styled with adjacent `DefinitionTerm` and `Definition`) + - [X] Header (styled with `Heading#`) + - [ ] HorizontalRule + - [-] Table (column widths and alignments not yet implemented) + +* Inlines + + - [X] Str + - [X] Emph (From italics. `underline` currently read as span. In + future, it might optionally be emph as well) + - [X] Strong + - [X] Strikeout + - [X] Superscript + - [X] Subscript + - [X] SmallCaps + - [ ] Quoted + - [ ] Cite + - [X] Code (styled with `VerbatimChar`) + - [X] Space + - [X] LineBreak (these are invisible in Word: entered with Shift-Return) + - [ ] Math + - [X] Link (links to an arbitrary bookmark create a span with the target as + id and "anchor" class) + - [-] Image (Links to path in archive. Future option for + data-encoded URI likely.) + - [X] Note (Footnotes and Endnotes are silently combined.) +-} + +module Text.Pandoc.Readers.DocX + ( readDocX + ) where + +import Codec.Archive.Zip +import Text.Pandoc.Definition +import Text.Pandoc.Options +import Text.Pandoc.Builder (text, toList) +import Text.Pandoc.Generic (bottomUp) +import Text.Pandoc.MIME (getMimeType) +import Text.Pandoc.UTF8 (toString) +import Text.Pandoc.Readers.DocX.Parse +import Text.Pandoc.Readers.DocX.Lists +import Data.Maybe (mapMaybe, isJust, fromJust) +import Data.List (delete, isPrefixOf, (\\), intersect) +import qualified Data.ByteString as BS +import qualified Data.ByteString.Lazy as B +import Data.ByteString.Base64 (encode) +import System.FilePath (combine) + +readDocX :: ReaderOptions + -> B.ByteString + -> Pandoc +readDocX opts bytes = + case archiveToDocX (toArchive bytes) of + Just docx -> Pandoc nullMeta (docxToBlocks opts docx) + Nothing -> error $ "couldn't parse docx file" + +runStyleToSpanAttr :: RunStyle -> (String, [String], [(String, String)]) +runStyleToSpanAttr rPr = ("", + mapMaybe id [ + if isBold rPr then (Just "strong") else Nothing, + if isItalic rPr then (Just "emph") else Nothing, + if isSmallCaps rPr then (Just "smallcaps") else Nothing, + if isStrike rPr then (Just "strike") else Nothing, + if isSuperScript rPr then (Just "superscript") else Nothing, + if isSubScript rPr then (Just "subscript") else Nothing, + rStyle rPr], + case underline rPr of + Just fmt -> [("underline", fmt)] + _ -> [] + ) + +parStyleToDivAttr :: ParagraphStyle -> (String, [String], [(String, String)]) +parStyleToDivAttr pPr = ("", + pStyle pPr, + case indent pPr of + Just n -> [("indent", (show n))] + Nothing -> [] + ) + +strToInlines :: String -> [Inline] +strToInlines = toList . text + +codeSpans :: [String] +codeSpans = ["VerbatimChar"] + +blockQuoteDivs :: [String] +blockQuoteDivs = ["Quote", "BlockQuote"] + +codeDivs :: [String] +codeDivs = ["SourceCode"] + +runElemToInlines :: RunElem -> [Inline] +runElemToInlines (TextRun s) = strToInlines s +runElemToInlines (LnBrk) = [LineBreak] + +runElemToString :: RunElem -> String +runElemToString (TextRun s) = s +runElemToString (LnBrk) = ['\n'] + +runElemsToString :: [RunElem] -> String +runElemsToString = concatMap runElemToString + +strNormalize :: [Inline] -> [Inline] +strNormalize [] = [] +strNormalize (Str "" : ils) = strNormalize ils +strNormalize ((Str s) : (Str s') : l) = strNormalize ((Str (s++s')) : l) +strNormalize (il:ils) = il : (strNormalize ils) + +runToInlines :: ReaderOptions -> DocX -> Run -> [Inline] +runToInlines _ _ (Run rs runElems) + | isJust (rStyle rs) && (fromJust (rStyle rs)) `elem` codeSpans = + case runStyleToSpanAttr rs == ("", [], []) of + True -> [Str (runElemsToString runElems)] + False -> [Span (runStyleToSpanAttr rs) [Str (runElemsToString runElems)]] + | otherwise = case runStyleToSpanAttr rs == ("", [], []) of + True -> concatMap runElemToInlines runElems + False -> [Span (runStyleToSpanAttr rs) (concatMap runElemToInlines runElems)] +runToInlines opts docx@(DocX _ notes _ _ _ ) (Footnote fnId) = + case (getFootNote fnId notes) of + Just bodyParts -> + [Note [Div ("", ["footnote"], []) (map (bodyPartToBlock opts docx) bodyParts)]] + Nothing -> + [Note [Div ("", ["footnote"], []) []]] +runToInlines opts docx@(DocX _ notes _ _ _) (Endnote fnId) = + case (getEndNote fnId notes) of + Just bodyParts -> + [Note [Div ("", ["endnote"], []) (map (bodyPartToBlock opts docx) bodyParts)]] + Nothing -> + [Note [Div ("", ["endnote"], []) []]] + +parPartToInlines :: ReaderOptions -> DocX -> ParPart -> [Inline] +parPartToInlines opts docx (PlainRun r) = runToInlines opts docx r +parPartToInlines _ _ (BookMark _ anchor) = + [Span (anchor, ["anchor"], []) []] +parPartToInlines _ (DocX _ _ _ rels _) (Drawing relid) = + case lookupRelationship relid rels of + Just target -> [Image [] (combine "word" target, "")] + Nothing -> [Image [] ("", "")] +parPartToInlines opts docx (InternalHyperLink anchor runs) = + [Link (concatMap (runToInlines opts docx) runs) ('#' : anchor, "")] +parPartToInlines opts docx@(DocX _ _ _ rels _) (ExternalHyperLink relid runs) = + case lookupRelationship relid rels of + Just target -> + [Link (concatMap (runToInlines opts docx) runs) (target, "")] + Nothing -> + [Link (concatMap (runToInlines opts docx) runs) ("", "")] + +isAnchorSpan :: Inline -> Bool +isAnchorSpan (Span (ident, classes, kvs) ils) = + (not . null) ident && + classes == ["anchor"] && + null kvs && + null ils +isAnchorSpan _ = False + +dummyAnchors :: [String] +dummyAnchors = ["_GoBack"] + +makeHeaderAnchors :: Block -> Block +makeHeaderAnchors h@(Header n (_, classes, kvs) ils) = + case filter isAnchorSpan ils of + [] -> h + (x@(Span (ident, _, _) _) : xs) -> + case ident `elem` dummyAnchors of + True -> h + False -> Header n (ident, classes, kvs) (ils \\ (x:xs)) + _ -> h +makeHeaderAnchors blk = blk + + +parPartsToInlines :: ReaderOptions -> DocX -> [ParPart] -> [Inline] +parPartsToInlines opts docx parparts = + -- + -- We're going to skip data-uri's for now. It should be an option, + -- not mandatory. + -- + --bottomUp (makeImagesSelfContained docx) $ + bottomUp spanCorrect $ + bottomUp spanTrim $ + bottomUp spanReduce $ + concatMap (parPartToInlines opts docx) parparts + +cellToBlocks :: ReaderOptions -> DocX -> Cell -> [Block] +cellToBlocks opts docx (Cell bps) = map (bodyPartToBlock opts docx) bps + +rowToBlocksList :: ReaderOptions -> DocX -> Row -> [[Block]] +rowToBlocksList opts docx (Row cells) = map (cellToBlocks opts docx) cells + +bodyPartToBlock :: ReaderOptions -> DocX -> BodyPart -> Block +bodyPartToBlock opts docx (Paragraph pPr parparts) = + Div (parStyleToDivAttr pPr) [Para (parPartsToInlines opts docx parparts)] +bodyPartToBlock opts docx@(DocX _ _ numbering _ _) (ListItem pPr numId lvl parparts) = + let + kvs = case lookupLevel numId lvl numbering of + Just (_, fmt, txt, Just start) -> [ ("level", lvl) + , ("num-id", numId) + , ("format", fmt) + , ("text", txt) + , ("start", (show start)) + ] + + Just (_, fmt, txt, Nothing) -> [ ("level", lvl) + , ("num-id", numId) + , ("format", fmt) + , ("text", txt) + ] + Nothing -> [] + in + Div + ("", ["list-item"], kvs) + [bodyPartToBlock opts docx (Paragraph pPr parparts)] +bodyPartToBlock _ _ (Tbl _ _ _ []) = + Para [] +bodyPartToBlock opts docx (Tbl cap _ look (r:rs)) = + let caption = strToInlines cap + (hdr, rows) = case firstRowFormatting look of + True -> (Just r, rs) + False -> (Nothing, r:rs) + hdrCells = case hdr of + Just r' -> rowToBlocksList opts docx r' + Nothing -> [] + cells = map (rowToBlocksList opts docx) rows + + size = case null hdrCells of + True -> length $ head cells + False -> length $ hdrCells + -- + -- The two following variables (horizontal column alignment and + -- relative column widths) go to the default at the + -- moment. Width information is in the TblGrid field of the Tbl, + -- so should be possible. Alignment might be more difficult, + -- since there doesn't seem to be a column entity in docx. + alignments = take size (repeat AlignDefault) + widths = take size (repeat 0) :: [Double] + in + Table caption alignments widths hdrCells cells + +makeImagesSelfContained :: DocX -> Inline -> Inline +makeImagesSelfContained (DocX _ _ _ _ media) i@(Image alt (uri, title)) = + case lookup uri media of + Just bs -> case getMimeType uri of + Just mime -> let data_uri = + "data:" ++ mime ++ ";base64," ++ toString (encode $ BS.concat $ B.toChunks bs) + in + Image alt (data_uri, title) + Nothing -> i + Nothing -> i +makeImagesSelfContained _ inline = inline + +bodyToBlocks :: ReaderOptions -> DocX -> Body -> [Block] +bodyToBlocks opts docx (Body bps) = + bottomUp removeEmptyPars $ + bottomUp strNormalize $ + bottomUp spanRemove $ + bottomUp divRemove $ + map (makeHeaderAnchors) $ + bottomUp divCorrect $ + bottomUp divReduce $ + bottomUp divCorrectPreReduce $ + bottomUp blocksToDefinitions $ + blocksToBullets $ + map (bodyPartToBlock opts docx) bps + +docxToBlocks :: ReaderOptions -> DocX -> [Block] +docxToBlocks opts d@(DocX (Document _ body) _ _ _ _) = bodyToBlocks opts d body + +spanReduce :: [Inline] -> [Inline] +spanReduce [] = [] +spanReduce ((Span (id1, classes1, kvs1) ils1) : ils) + | (id1, classes1, kvs1) == ("", [], []) = ils1 ++ (spanReduce ils) +spanReduce (s1@(Span (id1, classes1, kvs1) ils1) : + s2@(Span (id2, classes2, kvs2) ils2) : + ils) = + let classes' = classes1 `intersect` classes2 + kvs' = kvs1 `intersect` kvs2 + classes1' = classes1 \\ classes' + kvs1' = kvs1 \\ kvs' + classes2' = classes2 \\ classes' + kvs2' = kvs2 \\ kvs' + in + case null classes' && null kvs' of + True -> s1 : (spanReduce (s2 : ils)) + False -> let attr' = ("", classes', kvs') + attr1' = (id1, classes1', kvs1') + attr2' = (id2, classes2', kvs2') + in + spanReduce (Span attr' [(Span attr1' ils1), (Span attr2' ils2)] : + ils) +spanReduce (il:ils) = il : (spanReduce ils) + +ilToCode :: Inline -> String +ilToCode (Str s) = s +ilToCode _ = "" + +spanRemove' :: Inline -> [Inline] +spanRemove' s@(Span (ident, classes, _) []) + -- "_GoBack" is automatically inserted. We don't want to keep it. + | classes == ["anchor"] && not (ident `elem` dummyAnchors) = [s] +spanRemove' (Span (_, _, kvs) ils) = + case lookup "underline" kvs of + Just val -> [Span ("", [], [("underline", val)]) ils] + Nothing -> ils +spanRemove' il = [il] + +spanRemove :: [Inline] -> [Inline] +spanRemove = concatMap spanRemove' + +spanTrim' :: Inline -> [Inline] +spanTrim' il@(Span _ []) = [il] +spanTrim' il@(Span attr (il':[])) + | il' == Space = [Span attr [], Space] + | otherwise = [il] +spanTrim' (Span attr ils) + | head ils == Space && last ils == Space = + [Space, Span attr (init $ tail ils), Space] + | head ils == Space = [Space, Span attr (tail ils)] + | last ils == Space = [Span attr (init ils), Space] +spanTrim' il = [il] + +spanTrim :: [Inline] -> [Inline] +spanTrim = concatMap spanTrim' + +spanCorrect' :: Inline -> [Inline] +spanCorrect' (Span ("", [], []) ils) = ils +spanCorrect' (Span (ident, classes, kvs) ils) + | "emph" `elem` classes = + [Emph $ spanCorrect' $ Span (ident, (delete "emph" classes), kvs) ils] + | "strong" `elem` classes = + [Strong $ spanCorrect' $ Span (ident, (delete "strong" classes), kvs) ils] + | "smallcaps" `elem` classes = + [SmallCaps $ spanCorrect' $ Span (ident, (delete "smallcaps" classes), kvs) ils] + | "strike" `elem` classes = + [Strikeout $ spanCorrect' $ Span (ident, (delete "strike" classes), kvs) ils] + | "superscript" `elem` classes = + [Superscript $ spanCorrect' $ Span (ident, (delete "superscript" classes), kvs) ils] + | "subscript" `elem` classes = + [Subscript $ spanCorrect' $ Span (ident, (delete "subscript" classes), kvs) ils] + | (not . null) (codeSpans `intersect` classes) = + [Code (ident, (classes \\ codeSpans), kvs) (init $ unlines $ map ilToCode ils)] + | otherwise = + [Span (ident, classes, kvs) ils] +spanCorrect' il = [il] + +spanCorrect :: [Inline] -> [Inline] +spanCorrect = concatMap spanCorrect' + +removeEmptyPars :: [Block] -> [Block] +removeEmptyPars blks = filter (\b -> b /= (Para [])) blks + +divReduce :: [Block] -> [Block] +divReduce [] = [] +divReduce ((Div (id1, classes1, kvs1) blks1) : blks) + | (id1, classes1, kvs1) == ("", [], []) = blks1 ++ (divReduce blks) +divReduce (d1@(Div (id1, classes1, kvs1) blks1) : + d2@(Div (id2, classes2, kvs2) blks2) : + blks) = + let classes' = classes1 `intersect` classes2 + kvs' = kvs1 `intersect` kvs2 + classes1' = classes1 \\ classes' + kvs1' = kvs1 \\ kvs' + classes2' = classes2 \\ classes' + kvs2' = kvs2 \\ kvs' + in + case null classes' && null kvs' of + True -> d1 : (divReduce (d2 : blks)) + False -> let attr' = ("", classes', kvs') + attr1' = (id1, classes1', kvs1') + attr2' = (id2, classes2', kvs2') + in + divReduce (Div attr' [(Div attr1' blks1), (Div attr2' blks2)] : + blks) +divReduce (blk:blks) = blk : (divReduce blks) + +isHeaderClass :: String -> Maybe Int +isHeaderClass s | "Heading" `isPrefixOf` s = + case reads (drop (length "Heading") s) :: [(Int, String)] of + [] -> Nothing + ((n, "") : []) -> Just n + _ -> Nothing +isHeaderClass _ = Nothing + +findHeaderClass :: [String] -> Maybe Int +findHeaderClass ss = case mapMaybe id $ map isHeaderClass ss of + [] -> Nothing + n : _ -> Just n + +blksToInlines :: [Block] -> [Inline] +blksToInlines (Para ils : _) = ils +blksToInlines (Plain ils : _) = ils +blksToInlines _ = [] + +divCorrectPreReduce' :: Block -> [Block] +divCorrectPreReduce' (Div (ident, classes, kvs) blks) + | isJust $ findHeaderClass classes = + let n = fromJust $ findHeaderClass classes + in + [Header n (ident, delete ("Heading" ++ (show n)) classes, kvs) (blksToInlines blks)] + | otherwise = [Div (ident, classes, kvs) blks] +divCorrectPreReduce' blk = [blk] + +divCorrectPreReduce :: [Block] -> [Block] +divCorrectPreReduce = concatMap divCorrectPreReduce' + +blkToCode :: Block -> String +blkToCode (Para []) = "" +blkToCode (Para ((Code _ s):ils)) = s ++ (blkToCode (Para ils)) +blkToCode (Para ((Span (_, classes, _) ils'): ils)) + | (not . null) (codeSpans `intersect` classes) = + (init $ unlines $ map ilToCode ils') ++ (blkToCode (Para ils)) +blkToCode _ = "" + +divRemove' :: Block -> [Block] +divRemove' (Div (_, _, kvs) blks) = + case lookup "indent" kvs of + Just val -> [Div ("", [], [("indent", val)]) blks] + Nothing -> blks +divRemove' blk = [blk] + +divRemove :: [Block] -> [Block] +divRemove = concatMap divRemove' + +divCorrect' :: Block -> [Block] +divCorrect' b@(Div (ident, classes, kvs) blks) + | (not . null) (blockQuoteDivs `intersect` classes) = + [BlockQuote [Div (ident, classes \\ blockQuoteDivs, kvs) blks]] + | (not . null) (codeDivs `intersect` classes) = + [CodeBlock (ident, (classes \\ codeDivs), kvs) (init $ unlines $ map blkToCode blks)] + | otherwise = + case lookup "indent" kvs of + Just "0" -> [Div (ident, classes, filter (\kv -> fst kv /= "indent") kvs) blks] + Just _ -> + [BlockQuote [Div (ident, classes, filter (\kv -> fst kv /= "indent") kvs) blks]] + Nothing -> [b] +divCorrect' blk = [blk] + +divCorrect :: [Block] -> [Block] +divCorrect = concatMap divCorrect' -- cgit v1.2.3