diff options
| author | John MacFarlane <jgm@berkeley.edu> | 2014-06-16 22:16:45 -0700 | 
|---|---|---|
| committer | John MacFarlane <jgm@berkeley.edu> | 2014-06-16 22:16:45 -0700 | 
| commit | bec9f3c641e1376c0ef27d3df5a7b0a3fc7eee4f (patch) | |
| tree | 53c00a7038058f04f82aab2d149f0851a538156d /src/Text/Pandoc | |
| parent | 78ee2416d105bd25337819a49835623a8a296224 (diff) | |
| parent | c709cec0bdf7a3029a43f0c46d071a7ca1ab6a13 (diff) | |
| download | pandoc-bec9f3c641e1376c0ef27d3df5a7b0a3fc7eee4f.tar.gz | |
Merge branch 'docx' of https://github.com/jkr/pandoc into jkr-docx
Diffstat (limited to 'src/Text/Pandoc')
| -rw-r--r-- | src/Text/Pandoc/Readers/DocX.hs | 479 | ||||
| -rw-r--r-- | src/Text/Pandoc/Readers/DocX/Lists.hs | 208 | ||||
| -rw-r--r-- | src/Text/Pandoc/Readers/DocX/Parse.hs | 604 | 
3 files changed, 1291 insertions, 0 deletions
diff --git a/src/Text/Pandoc/Readers/DocX.hs b/src/Text/Pandoc/Readers/DocX.hs new file mode 100644 index 000000000..976e2e271 --- /dev/null +++ b/src/Text/Pandoc/Readers/DocX.hs @@ -0,0 +1,479 @@ +{- +Copyright (C) 2014 Jesse Rosenthal <jrosenthal@jhu.edu> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA +-} + +{- | +   Module      : Text.Pandoc.Readers.DocX +   Copyright   : Copyright (C) 2014 Jesse Rosenthal +   License     : GNU GPL, version 2 or above + +   Maintainer  : Jesse Rosenthal <jrosenthal@jhu.edu> +   Stability   : alpha +   Portability : portable + +Conversion of DocX type (defined in Text.Pandoc.Readers.DocX.Parse) +to 'Pandoc' document.  -} + +{- +Current state of implementation of DocX entities ([x] means +implemented, [-] means partially implemented): + +* Blocks + +  - [X] Para +  - [X] CodeBlock (styled with `SourceCode`) +  - [X] BlockQuote (styled with `Quote`, `BlockQuote`, or, optionally, +        indented) +  - [X] OrderedList +  - [X] BulletList +  - [X] DefinitionList (styled with adjacent `DefinitionTerm` and `Definition`) +  - [X] Header (styled with `Heading#`) +  - [ ] HorizontalRule +  - [-] Table (column widths and alignments not yet implemented) + +* Inlines + +  - [X] Str +  - [X] Emph (From italics. `underline` currently read as span. In +        future, it might optionally be emph as well) +  - [X] Strong +  - [X] Strikeout +  - [X] Superscript +  - [X] Subscript +  - [X] SmallCaps +  - [ ] Quoted +  - [ ] Cite +  - [X] Code (styled with `VerbatimChar`) +  - [X] Space +  - [X] LineBreak (these are invisible in Word: entered with Shift-Return) +  - [ ] Math +  - [X] Link (links to an arbitrary bookmark create a span with the target as +        id and "anchor" class) +  - [-] Image (Links to path in archive. Future option for +        data-encoded URI likely.) +  - [X] Note (Footnotes and Endnotes are silently combined.) +-} + +module Text.Pandoc.Readers.DocX +       ( readDocX +       ) where + +import Codec.Archive.Zip +import Text.Pandoc.Definition +import Text.Pandoc.Options +import Text.Pandoc.Builder (text, toList) +import Text.Pandoc.Generic (bottomUp) +import Text.Pandoc.MIME (getMimeType) +import Text.Pandoc.UTF8 (toString) +import Text.Pandoc.Readers.DocX.Parse +import Text.Pandoc.Readers.DocX.Lists +import Data.Maybe (mapMaybe, isJust, fromJust) +import Data.List (delete, isPrefixOf, (\\), intersect) +import qualified Data.ByteString as BS +import qualified Data.ByteString.Lazy as B +import Data.ByteString.Base64 (encode) +import System.FilePath (combine) + +readDocX :: ReaderOptions +         -> B.ByteString +         -> Pandoc +readDocX opts bytes = +  case archiveToDocX (toArchive bytes) of +    Just docx -> Pandoc nullMeta (docxToBlocks opts docx) +    Nothing   -> error $ "couldn't parse docx file" + +runStyleToSpanAttr :: RunStyle -> (String, [String], [(String, String)]) +runStyleToSpanAttr rPr = ("", +                          mapMaybe id [ +                            if isBold rPr then (Just "strong") else Nothing, +                            if isItalic rPr then (Just "emph") else Nothing, +                            if isSmallCaps rPr then (Just "smallcaps") else Nothing, +                            if isStrike rPr then (Just "strike") else Nothing, +                            if isSuperScript rPr then (Just "superscript") else Nothing, +                            if isSubScript rPr then (Just "subscript") else Nothing, +                            rStyle rPr], +                          case underline rPr of +                            Just fmt -> [("underline", fmt)] +                            _        -> [] +                         ) + +parStyleToDivAttr :: ParagraphStyle -> (String, [String], [(String, String)]) +parStyleToDivAttr pPr = ("", +                          pStyle pPr, +                          case indent pPr of +                            Just n  -> [("indent", (show n))] +                            Nothing -> [] +                         ) + +strToInlines :: String -> [Inline] +strToInlines = toList . text + +codeSpans :: [String] +codeSpans = ["VerbatimChar"] + +blockQuoteDivs :: [String] +blockQuoteDivs = ["Quote", "BlockQuote"] + +codeDivs :: [String] +codeDivs = ["SourceCode"] + +runElemToInlines :: RunElem -> [Inline] +runElemToInlines (TextRun s) = strToInlines s +runElemToInlines (LnBrk) = [LineBreak] + +runElemToString :: RunElem -> String +runElemToString (TextRun s) = s +runElemToString (LnBrk) = ['\n'] + +runElemsToString :: [RunElem] -> String +runElemsToString = concatMap runElemToString + +strNormalize :: [Inline] -> [Inline] +strNormalize [] = [] +strNormalize (Str "" : ils) = strNormalize ils +strNormalize ((Str s) : (Str s') : l) = strNormalize ((Str (s++s')) : l) +strNormalize (il:ils) = il : (strNormalize ils) + +runToInlines :: ReaderOptions -> DocX -> Run -> [Inline] +runToInlines _ _ (Run rs runElems)  +  | isJust (rStyle rs) && (fromJust (rStyle rs)) `elem` codeSpans = +    case runStyleToSpanAttr rs == ("", [], []) of +      True -> [Str (runElemsToString runElems)] +      False -> [Span (runStyleToSpanAttr rs) [Str (runElemsToString runElems)]] +  | otherwise = case runStyleToSpanAttr rs == ("", [], []) of +      True -> concatMap runElemToInlines runElems +      False -> [Span (runStyleToSpanAttr rs) (concatMap runElemToInlines runElems)] +runToInlines opts docx@(DocX _ notes _ _ _ ) (Footnote fnId) = +  case (getFootNote fnId notes) of +    Just bodyParts -> +      [Note [Div ("", ["footnote"], []) (map (bodyPartToBlock opts docx) bodyParts)]] +    Nothing        -> +      [Note [Div ("", ["footnote"], []) []]] +runToInlines opts docx@(DocX _ notes _ _ _) (Endnote fnId) = +  case (getEndNote fnId notes) of +    Just bodyParts -> +      [Note [Div ("", ["endnote"], []) (map (bodyPartToBlock opts docx) bodyParts)]] +    Nothing        -> +      [Note [Div ("", ["endnote"], []) []]] + +parPartToInlines :: ReaderOptions -> DocX -> ParPart -> [Inline] +parPartToInlines opts docx (PlainRun r) = runToInlines opts docx r +parPartToInlines _ _ (BookMark _ anchor) = +  [Span (anchor, ["anchor"], []) []] +parPartToInlines _ (DocX _ _ _ rels _) (Drawing relid) = +  case lookupRelationship relid rels of +    Just target -> [Image [] (combine "word" target, "")] +    Nothing     -> [Image [] ("", "")] +parPartToInlines opts docx (InternalHyperLink anchor runs) = +  [Link (concatMap (runToInlines opts docx) runs) ('#' : anchor, "")] +parPartToInlines opts docx@(DocX _ _ _ rels _) (ExternalHyperLink relid runs) = +  case lookupRelationship relid rels of +    Just target -> +      [Link (concatMap (runToInlines opts docx) runs) (target, "")] +    Nothing -> +      [Link (concatMap (runToInlines opts docx) runs) ("", "")] + +isAnchorSpan :: Inline -> Bool +isAnchorSpan (Span (ident, classes, kvs) ils) = +  (not . null) ident && +  classes == ["anchor"] && +  null kvs && +  null ils +isAnchorSpan _ = False + +dummyAnchors :: [String] +dummyAnchors = ["_GoBack"] + +makeHeaderAnchors :: Block -> Block +makeHeaderAnchors h@(Header n (_, classes, kvs) ils) = +  case filter isAnchorSpan ils of +    []   -> h +    (x@(Span (ident, _, _) _) : xs) -> +      case ident `elem` dummyAnchors of +        True -> h +        False -> Header n (ident, classes, kvs) (ils \\ (x:xs)) +    _ -> h +makeHeaderAnchors blk = blk +   + +parPartsToInlines :: ReaderOptions -> DocX -> [ParPart] -> [Inline] +parPartsToInlines opts docx parparts = +  -- +  -- We're going to skip data-uri's for now. It should be an option, +  -- not mandatory. +  -- +  --bottomUp (makeImagesSelfContained docx) $ +  bottomUp spanCorrect $ +  bottomUp spanTrim $ +  bottomUp spanReduce $ +  concatMap (parPartToInlines opts docx) parparts + +cellToBlocks :: ReaderOptions -> DocX -> Cell -> [Block] +cellToBlocks opts docx (Cell bps) = map (bodyPartToBlock opts docx) bps + +rowToBlocksList :: ReaderOptions -> DocX -> Row -> [[Block]] +rowToBlocksList opts docx (Row cells) = map (cellToBlocks opts docx) cells + +bodyPartToBlock :: ReaderOptions -> DocX -> BodyPart -> Block +bodyPartToBlock opts docx (Paragraph pPr parparts) = +  Div (parStyleToDivAttr pPr) [Para (parPartsToInlines opts docx parparts)] +bodyPartToBlock opts docx@(DocX _ _ numbering _ _) (ListItem pPr numId lvl parparts) = +  let +    kvs = case lookupLevel numId lvl numbering of +      Just (_, fmt, txt, Just start) -> [ ("level", lvl) +                                        , ("num-id", numId) +                                        , ("format", fmt) +                                        , ("text", txt) +                                        , ("start", (show start)) +                                        ] +       +      Just (_, fmt, txt, Nothing)    -> [ ("level", lvl) +                                        , ("num-id", numId) +                                        , ("format", fmt) +                                        , ("text", txt) +                                        ] +      Nothing                        -> [] +  in +   Div +   ("", ["list-item"], kvs) +   [bodyPartToBlock opts docx (Paragraph pPr parparts)] +bodyPartToBlock _ _ (Tbl _ _ _ []) = +  Para [] +bodyPartToBlock opts docx (Tbl cap _ look (r:rs)) = +  let caption = strToInlines cap +      (hdr, rows) = case firstRowFormatting look of +        True -> (Just r, rs) +        False -> (Nothing, r:rs) +      hdrCells = case hdr of +        Just r' -> rowToBlocksList opts docx r' +        Nothing -> [] +      cells = map (rowToBlocksList opts docx) rows +       +      size = case null hdrCells of +        True -> length $ head cells +        False -> length $ hdrCells +      -- +      -- The two following variables (horizontal column alignment and +      -- relative column widths) go to the default at the +      -- moment. Width information is in the TblGrid field of the Tbl, +      -- so should be possible. Alignment might be more difficult, +      -- since there doesn't seem to be a column entity in docx. +      alignments = take size (repeat AlignDefault) +      widths = take size (repeat 0) :: [Double] +  in +   Table caption alignments widths hdrCells cells + +makeImagesSelfContained :: DocX -> Inline -> Inline +makeImagesSelfContained (DocX _ _ _ _ media) i@(Image alt (uri, title)) = +  case lookup uri media of +    Just bs -> case getMimeType uri of +      Just mime ->  let data_uri = +                          "data:" ++ mime ++ ";base64," ++ toString (encode $ BS.concat $ B.toChunks bs) +                    in +                     Image alt (data_uri, title) +      Nothing  -> i +    Nothing -> i +makeImagesSelfContained _ inline = inline + +bodyToBlocks :: ReaderOptions -> DocX -> Body -> [Block] +bodyToBlocks opts docx (Body bps) = +  bottomUp removeEmptyPars $ +  bottomUp strNormalize $  +  bottomUp spanRemove $  +  bottomUp divRemove $ +  map (makeHeaderAnchors) $ +  bottomUp divCorrect $ +  bottomUp divReduce $ +  bottomUp divCorrectPreReduce $ +  bottomUp blocksToDefinitions $ +  blocksToBullets $ +  map (bodyPartToBlock opts docx) bps + +docxToBlocks :: ReaderOptions -> DocX -> [Block] +docxToBlocks opts d@(DocX (Document _ body) _ _ _ _) = bodyToBlocks opts d body + +spanReduce :: [Inline] -> [Inline] +spanReduce [] = [] +spanReduce ((Span (id1, classes1, kvs1) ils1) : ils) +  | (id1, classes1, kvs1) == ("", [], []) = ils1 ++ (spanReduce ils) +spanReduce (s1@(Span (id1, classes1, kvs1) ils1) : +            s2@(Span (id2, classes2, kvs2) ils2) : +            ils) = +  let classes'  = classes1 `intersect` classes2 +      kvs'      = kvs1 `intersect` kvs2 +      classes1' = classes1 \\ classes' +      kvs1'     = kvs1 \\ kvs' +      classes2' = classes2 \\ classes' +      kvs2'     = kvs2 \\ kvs' +  in +   case null classes' && null kvs' of +     True -> s1 : (spanReduce (s2 : ils)) +     False -> let attr'  = ("", classes', kvs') +                  attr1' = (id1, classes1', kvs1') +                  attr2' = (id2, classes2', kvs2') +              in +               spanReduce (Span attr' [(Span attr1' ils1), (Span attr2' ils2)] : +                           ils) +spanReduce (il:ils) = il : (spanReduce ils) + +ilToCode :: Inline -> String +ilToCode (Str s) = s +ilToCode _ = "" + +spanRemove' :: Inline -> [Inline] +spanRemove' s@(Span (ident, classes, _) []) +  -- "_GoBack" is automatically inserted. We don't want to keep it. +  | classes == ["anchor"] && not (ident `elem` dummyAnchors) = [s] +spanRemove' (Span (_, _, kvs) ils) = +  case lookup "underline" kvs of +    Just val -> [Span ("", [], [("underline", val)]) ils] +    Nothing  -> ils +spanRemove' il = [il] + +spanRemove :: [Inline] -> [Inline] +spanRemove = concatMap spanRemove' + +spanTrim' :: Inline -> [Inline] +spanTrim' il@(Span _ []) = [il] +spanTrim' il@(Span attr (il':[])) +  | il' == Space = [Span attr [], Space] +  | otherwise = [il] +spanTrim' (Span attr ils) +  | head ils == Space && last ils == Space = +    [Space, Span attr (init $ tail ils), Space] +  | head ils == Space = [Space, Span attr (tail ils)] +  | last ils == Space = [Span attr (init ils), Space] +spanTrim' il = [il] + +spanTrim :: [Inline] -> [Inline] +spanTrim = concatMap spanTrim' + +spanCorrect' :: Inline -> [Inline] +spanCorrect' (Span ("", [], []) ils) = ils +spanCorrect' (Span (ident, classes, kvs) ils) +  | "emph" `elem` classes = +    [Emph $ spanCorrect' $ Span (ident, (delete "emph" classes), kvs) ils] +  | "strong" `elem` classes = +      [Strong $ spanCorrect' $ Span (ident, (delete "strong" classes), kvs) ils] +  | "smallcaps" `elem` classes = +      [SmallCaps $ spanCorrect' $ Span (ident, (delete "smallcaps" classes), kvs) ils] +  | "strike" `elem` classes = +      [Strikeout $ spanCorrect' $ Span (ident, (delete "strike" classes), kvs) ils] +  | "superscript" `elem` classes = +      [Superscript $ spanCorrect' $ Span (ident, (delete "superscript" classes), kvs) ils] +  | "subscript" `elem` classes = +      [Subscript $ spanCorrect' $ Span (ident, (delete "subscript" classes), kvs) ils] +  | (not . null) (codeSpans `intersect` classes) = +         [Code (ident, (classes \\ codeSpans), kvs) (init $ unlines $ map ilToCode ils)] +  | otherwise = +      [Span (ident, classes, kvs) ils] +spanCorrect' il = [il] + +spanCorrect :: [Inline] -> [Inline] +spanCorrect = concatMap spanCorrect' + +removeEmptyPars :: [Block] -> [Block] +removeEmptyPars blks = filter (\b -> b /= (Para [])) blks + +divReduce :: [Block] -> [Block] +divReduce [] = [] +divReduce ((Div (id1, classes1, kvs1) blks1) : blks) +  | (id1, classes1, kvs1) == ("", [], []) = blks1 ++ (divReduce blks) +divReduce (d1@(Div (id1, classes1, kvs1) blks1) : +           d2@(Div (id2, classes2, kvs2) blks2) : +            blks) = +  let classes'  = classes1 `intersect` classes2 +      kvs'      = kvs1 `intersect` kvs2 +      classes1' = classes1 \\ classes' +      kvs1'     = kvs1 \\ kvs' +      classes2' = classes2 \\ classes' +      kvs2'     = kvs2 \\ kvs' +  in +   case null classes' && null kvs' of +     True -> d1 : (divReduce (d2 : blks)) +     False -> let attr'  = ("", classes', kvs') +                  attr1' = (id1, classes1', kvs1') +                  attr2' = (id2, classes2', kvs2') +              in +               divReduce (Div attr' [(Div attr1' blks1), (Div attr2' blks2)] : +                           blks) +divReduce (blk:blks) = blk : (divReduce blks) + +isHeaderClass :: String -> Maybe Int +isHeaderClass s | "Heading" `isPrefixOf` s = +  case reads (drop (length "Heading") s) :: [(Int, String)] of +    [] -> Nothing +    ((n, "") : []) -> Just n +    _       -> Nothing +isHeaderClass _ = Nothing + +findHeaderClass :: [String] -> Maybe Int +findHeaderClass ss = case mapMaybe id $ map isHeaderClass ss of +  [] -> Nothing +  n : _ -> Just n + +blksToInlines :: [Block] -> [Inline] +blksToInlines (Para ils : _) = ils +blksToInlines (Plain ils : _) = ils +blksToInlines _ = [] + +divCorrectPreReduce' :: Block -> [Block] +divCorrectPreReduce' (Div (ident, classes, kvs) blks) +  | isJust $ findHeaderClass classes = +    let n = fromJust $ findHeaderClass classes +    in +    [Header n (ident, delete ("Heading" ++ (show n)) classes, kvs) (blksToInlines blks)] +  | otherwise = [Div (ident, classes, kvs) blks] +divCorrectPreReduce' blk = [blk] + +divCorrectPreReduce :: [Block] -> [Block] +divCorrectPreReduce = concatMap divCorrectPreReduce' + +blkToCode :: Block -> String +blkToCode (Para []) = "" +blkToCode (Para ((Code _ s):ils)) = s ++ (blkToCode (Para ils)) +blkToCode (Para ((Span (_, classes, _) ils'): ils)) +  | (not . null) (codeSpans `intersect` classes) = +    (init $ unlines $ map ilToCode ils') ++ (blkToCode (Para ils)) +blkToCode _ = "" + +divRemove' :: Block -> [Block] +divRemove' (Div (_, _, kvs) blks) = +  case lookup "indent" kvs of +    Just val -> [Div ("", [], [("indent", val)]) blks] +    Nothing  -> blks +divRemove' blk = [blk] + +divRemove :: [Block] -> [Block] +divRemove = concatMap divRemove' +                                                  +divCorrect' :: Block -> [Block] +divCorrect' b@(Div (ident, classes, kvs) blks) +  | (not . null) (blockQuoteDivs `intersect` classes) = +    [BlockQuote [Div (ident, classes \\ blockQuoteDivs, kvs) blks]] +  | (not . null) (codeDivs `intersect` classes) = +    [CodeBlock (ident, (classes \\ codeDivs), kvs) (init $ unlines $ map blkToCode blks)] +  | otherwise = +      case lookup "indent" kvs of +        Just "0" -> [Div (ident, classes, filter (\kv -> fst kv /= "indent") kvs) blks] +        Just _   -> +          [BlockQuote [Div (ident, classes, filter (\kv -> fst kv /= "indent") kvs) blks]] +        Nothing  -> [b] +divCorrect' blk = [blk] + +divCorrect :: [Block] -> [Block] +divCorrect = concatMap divCorrect' diff --git a/src/Text/Pandoc/Readers/DocX/Lists.hs b/src/Text/Pandoc/Readers/DocX/Lists.hs new file mode 100644 index 000000000..b20679261 --- /dev/null +++ b/src/Text/Pandoc/Readers/DocX/Lists.hs @@ -0,0 +1,208 @@ +{- +Copyright (C) 2014 Jesse Rosenthal <jrosenthal@jhu.edu> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA +-} + +{- | +   Module      : Text.Pandoc.Readers.DocX.Lists +   Copyright   : Copyright (C) 2014 Jesse Rosenthal +   License     : GNU GPL, version 2 or above + +   Maintainer  : Jesse Rosenthal <jrosenthal@jhu.edu> +   Stability   : alpha +   Portability : portable + +Functions for converting flat DocX paragraphs into nested lists. +-} + +module Text.Pandoc.Readers.DocX.Lists ( blocksToBullets +                                      , blocksToDefinitions) where + +import Text.Pandoc.JSON +import Text.Pandoc.Shared (trim) +import Control.Monad +import Data.List +import Data.Maybe + +isListItem :: Block -> Bool +isListItem (Div (_, classes, _) _) | "list-item" `elem` classes = True +isListItem _ = False + +getLevel :: Block -> Maybe Integer +getLevel (Div (_, _, kvs) _) =  liftM read $ lookup "level" kvs +getLevel _ = Nothing + +getLevelN :: Block -> Integer +getLevelN b = case getLevel b of +  Just n -> n +  Nothing -> -1 + +getNumId :: Block -> Maybe Integer +getNumId (Div (_, _, kvs) _) =  liftM read $ lookup "num-id" kvs +getNumId _ = Nothing + +getNumIdN :: Block -> Integer +getNumIdN b = case getNumId b of +  Just n -> n +  Nothing -> -1 + +getText :: Block -> Maybe String +getText (Div (_, _, kvs) _) = lookup "text" kvs +getText _ = Nothing + +data ListType = Itemized | Enumerated ListAttributes + +listStyleMap :: [(String, ListNumberStyle)] +listStyleMap = [("upperLetter", UpperAlpha), +                ("lowerLetter", LowerAlpha), +                ("upperRoman", UpperRoman), +                ("lowerRoman", LowerRoman), +                ("decimal", Decimal)] + +listDelimMap :: [(String, ListNumberDelim)] +listDelimMap = [("%1)", OneParen), +                ("(%1)", TwoParens), +                ("%1.", Period)] + +getListType :: Block -> Maybe ListType +getListType b@(Div (_, _, kvs) _) | isListItem b = +  let +    start = lookup "start" kvs +    frmt = lookup "format" kvs +    txt  = lookup "text" kvs +  in +   case frmt of +     Just "bullet" -> Just Itemized +     Just f        -> +       case txt of +         Just t -> Just $ Enumerated ( +                  read (fromMaybe "1" start) :: Int, +                  fromMaybe DefaultStyle (lookup f listStyleMap), +                  fromMaybe DefaultDelim (lookup t listDelimMap)) +         Nothing -> Nothing +     _ -> Nothing +getListType _ = Nothing + +listParagraphDivs :: [String] +listParagraphDivs = ["ListParagraph"] + +-- This is a first stab at going through and attaching meaning to list +-- paragraphs, without an item marker, following a list item. We +-- assume that these are paragraphs in the same item. + +handleListParagraphs :: [Block] -> [Block] +handleListParagraphs [] = [] +handleListParagraphs ( +  (Div attr1@(_, classes1, _) blks1) : +  (Div (ident2, classes2, kvs2) blks2) : +  blks +  ) | "list-item" `elem` classes1 && +    not ("list-item" `elem` classes2) && +    (not . null) (listParagraphDivs `intersect` classes2) = +      -- We don't want to keep this indent. +      let newDiv2 = +            (Div (ident2, classes2, filter (\kv -> fst kv /= "indent") kvs2) blks2) +      in +       handleListParagraphs ((Div attr1 (blks1 ++ [newDiv2])) : blks) +handleListParagraphs (blk:blks) = blk : (handleListParagraphs blks) +                   +separateBlocks' :: Block -> [[Block]] -> [[Block]] +separateBlocks' blk ([] : []) = [[blk]] +separateBlocks' b@(BulletList _) acc = (init acc) ++ [(last acc) ++ [b]] +separateBlocks' b@(OrderedList _ _) acc = (init acc) ++ [(last acc) ++ [b]] +-- The following is for the invisible bullet lists. This is how +-- pandoc-generated ooxml does multiparagraph item lists. +separateBlocks' b acc | liftM trim (getText b) == Just "" = +  (init acc) ++ [(last acc) ++ [b]] +separateBlocks' b acc = acc ++ [[b]] + +separateBlocks :: [Block] -> [[Block]] +separateBlocks blks = foldr separateBlocks' [[]] (reverse blks) + +flatToBullets' :: Integer -> [Block] -> [Block] +flatToBullets' _ [] = [] +flatToBullets' num xs@(b : elems) +  | getLevelN b == num = b : (flatToBullets' num elems) +  | otherwise =  +    let bNumId = getNumIdN b +        bLevel = getLevelN b +        (children, remaining) = +          span +          (\b' -> +            ((getLevelN b') > bLevel || +             ((getLevelN b') == bLevel && (getNumIdN b') == bNumId))) +          xs +    in +     case getListType b of +       Just (Enumerated attr) -> +         (OrderedList attr (separateBlocks $ flatToBullets' bLevel children)) : +         (flatToBullets' num remaining) +       _ -> +         (BulletList (separateBlocks $ flatToBullets' bLevel children)) : +         (flatToBullets' num remaining) + +flatToBullets :: [Block] -> [Block] +flatToBullets elems = flatToBullets' (-1) elems + +blocksToBullets :: [Block] -> [Block] +blocksToBullets blks = +  -- bottomUp removeListItemDivs $  +  flatToBullets $ (handleListParagraphs blks) + + +plainParaInlines :: Block -> [Inline] +plainParaInlines (Plain ils) = ils +plainParaInlines (Para ils) = ils +plainParaInlines _ = [] + +blocksToDefinitions' :: [([Inline], [[Block]])] -> [Block] -> [Block] -> [Block] +blocksToDefinitions' []     acc [] = reverse acc +blocksToDefinitions' defAcc acc [] = +  reverse $ (DefinitionList (reverse defAcc)) : acc +blocksToDefinitions' defAcc acc +  ((Div (_, classes1, _) blks1) : (Div (ident2, classes2, kvs2) blks2) : blks) +  | "DefinitionTerm" `elem` classes1 && "Definition"  `elem` classes2 = +    let remainingAttr2 = (ident2, delete "Definition" classes2, kvs2) +        pair = case remainingAttr2 == ("", [], []) of +          True -> (concatMap plainParaInlines blks1, [blks2]) +          False -> (concatMap plainParaInlines blks1, [[Div remainingAttr2 blks2]]) +    in +     blocksToDefinitions' (pair : defAcc) acc blks +blocksToDefinitions' defAcc acc +  ((Div (ident2, classes2, kvs2) blks2) : blks) +  | (not . null) defAcc && "Definition"  `elem` classes2 = +    let remainingAttr2 = (ident2, delete "Definition" classes2, kvs2) +        defItems2 = case remainingAttr2 == ("", [], []) of +          True -> blks2 +          False -> [Div remainingAttr2 blks2] +        ((defTerm, defItems):defs) = defAcc +        defAcc' = case null defItems of +          True -> (defTerm, [defItems2]) : defs +          False -> (defTerm, init defItems ++ [last defItems ++ defItems2]) : defs +    in +     blocksToDefinitions' defAcc' acc blks +blocksToDefinitions' [] acc (b:blks) = +  blocksToDefinitions' [] (b:acc) blks +blocksToDefinitions' defAcc acc (b:blks) = +  blocksToDefinitions' [] (b : (DefinitionList (reverse defAcc)) : acc) blks + + +blocksToDefinitions :: [Block] -> [Block] +blocksToDefinitions = blocksToDefinitions' [] [] + +     +     +     diff --git a/src/Text/Pandoc/Readers/DocX/Parse.hs b/src/Text/Pandoc/Readers/DocX/Parse.hs new file mode 100644 index 000000000..d7033d9e8 --- /dev/null +++ b/src/Text/Pandoc/Readers/DocX/Parse.hs @@ -0,0 +1,604 @@ +{- +Copyright (C) 2014 Jesse Rosenthal <jrosenthal@jhu.edu> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA +-} + +{- | +   Module      : Text.Pandoc.Readers.DocX.Parse +   Copyright   : Copyright (C) 2014 Jesse Rosenthal +   License     : GNU GPL, version 2 or above + +   Maintainer  : Jesse Rosenthal <jrosenthal@jhu.edu> +   Stability   : alpha +   Portability : portable + +Conversion of DocX archive into DocX haskell type +-} + + +module Text.Pandoc.Readers.DocX.Parse (  DocX(..) +                                       , Document(..) +                                       , Body(..) +                                       , BodyPart(..) +                                       , TblLook(..) +                                       , ParPart(..) +                                       , Run(..) +                                       , RunElem(..) +                                       , Notes +                                       , Numbering +                                       , Relationship +                                       , Media +                                       , RunStyle(..) +                                       , ParagraphStyle(..) +                                       , Row(..) +                                       , Cell(..) +                                       , getFootNote +                                       , getEndNote +                                       , lookupLevel +                                       , lookupRelationship +                                       , archiveToDocX +                                       ) where +import Codec.Archive.Zip +import Text.XML.Light +import Data.Maybe +import Data.List +import System.FilePath +import Data.Bits ((.|.)) +import qualified Data.ByteString.Lazy as B +import qualified Text.Pandoc.UTF8 as UTF8 + +attrToNSPair :: Attr -> Maybe (String, String) +attrToNSPair (Attr (QName s _ (Just "xmlns")) val) = Just (s, val) +attrToNSPair _ = Nothing + + +type NameSpaces = [(String, String)] + +data DocX = DocX Document Notes Numbering [Relationship] Media +          deriving Show + +archiveToDocX :: Archive -> Maybe DocX +archiveToDocX archive = do +  let notes = archiveToNotes archive +      rels = archiveToRelationships archive +      media = archiveToMedia archive +  doc <- archiveToDocument archive +  numbering <- archiveToNumbering archive +  return $ DocX doc notes numbering rels media + +data Document = Document NameSpaces Body  +          deriving Show + +archiveToDocument :: Archive -> Maybe Document +archiveToDocument zf = do +  entry <- findEntryByPath "word/document.xml" zf +  docElem <- (parseXMLDoc . UTF8.toStringLazy . fromEntry) entry +  let namespaces = mapMaybe attrToNSPair (elAttribs docElem)  +  bodyElem <- findChild (QName "body" (lookup "w" namespaces) Nothing) docElem +  body <- elemToBody namespaces bodyElem +  return $ Document namespaces body + +type Media = [(FilePath, B.ByteString)] + +filePathIsMedia :: FilePath -> Bool +filePathIsMedia fp = +  let (dir, _) = splitFileName fp +  in +   (dir == "word/media/") + +getMediaPair :: Archive -> FilePath -> Maybe (FilePath, B.ByteString) +getMediaPair zf fp = +  case findEntryByPath fp zf of +    Just e -> Just (fp, fromEntry e) +    Nothing -> Nothing + +archiveToMedia :: Archive -> Media +archiveToMedia zf = +  mapMaybe (getMediaPair zf) (filter filePathIsMedia (filesInArchive zf)) + +data Numbering = Numbering NameSpaces [Numb] [AbstractNumb] +                 deriving Show + +data Numb = Numb String String           -- right now, only a key to an abstract num +            deriving Show + +data AbstractNumb = AbstractNumb String [Level] +                    deriving Show + +-- (ilvl, format, string, start) +type Level = (String, String, String, Maybe Integer) + +lookupLevel :: String -> String -> Numbering -> Maybe Level +lookupLevel numId ilvl (Numbering _ numbs absNumbs) = do +  absNumId <- lookup numId $ map (\(Numb nid absnumid) -> (nid, absnumid)) numbs +  lvls <- lookup absNumId $ map (\(AbstractNumb aid ls) -> (aid, ls)) absNumbs +  lvl  <- lookup ilvl $ map (\l@(i, _, _, _) -> (i, l)) lvls +  return lvl + +numElemToNum :: NameSpaces -> Element -> Maybe Numb +numElemToNum ns element | +  qName (elName element) == "num" && +  qURI (elName element) == (lookup "w" ns) = do +    numId <- findAttr (QName "numId" (lookup "w" ns) (Just "w")) element +    absNumId <- findChild (QName "abstractNumId" (lookup "w" ns) (Just "w")) element +                >>= findAttr (QName "val" (lookup "w" ns) (Just "w")) +    return $ Numb numId absNumId +numElemToNum _ _ = Nothing + +absNumElemToAbsNum :: NameSpaces -> Element -> Maybe AbstractNumb +absNumElemToAbsNum ns element | +  qName (elName element) == "abstractNum" && +  qURI (elName element) == (lookup "w" ns) = do +    absNumId <- findAttr +                (QName "abstractNumId" (lookup "w" ns) (Just "w")) +                element +    let levelElems = findChildren +                 (QName "lvl" (lookup "w" ns) (Just "w")) +                 element +        levels = mapMaybe id $ map (levelElemToLevel ns) levelElems +    return $ AbstractNumb absNumId levels +absNumElemToAbsNum _ _ = Nothing + +levelElemToLevel :: NameSpaces -> Element -> Maybe Level +levelElemToLevel ns element | +    qName (elName element) == "lvl" && +    qURI (elName element) == (lookup "w" ns) = do +      ilvl <- findAttr (QName "ilvl" (lookup "w" ns) (Just "w")) element +      fmt <- findChild (QName "numFmt" (lookup "w" ns) (Just "w")) element +             >>= findAttr (QName "val" (lookup "w" ns) (Just "w")) +      txt <- findChild (QName "lvlText" (lookup "w" ns) (Just "w")) element +             >>= findAttr (QName "val" (lookup "w" ns) (Just "w")) +      let start = findChild (QName "start" (lookup "w" ns) (Just "w")) element +                  >>= findAttr (QName "val" (lookup "w" ns) (Just "w")) +                  >>= (\s -> listToMaybe (map fst (reads s :: [(Integer, String)]))) +      return (ilvl, fmt, txt, start) +levelElemToLevel _ _ = Nothing + +archiveToNumbering :: Archive -> Maybe Numbering +archiveToNumbering zf = +  case findEntryByPath "word/numbering.xml" zf of +    Nothing -> Just $ Numbering [] [] [] +    Just entry -> do +      numberingElem <- (parseXMLDoc . UTF8.toStringLazy . fromEntry) entry +      let namespaces = mapMaybe attrToNSPair (elAttribs numberingElem) +          numElems = findChildren +                     (QName "num" (lookup "w" namespaces) (Just "w")) +                     numberingElem +          absNumElems = findChildren +                        (QName "abstractNum" (lookup "w" namespaces) (Just "w")) +                        numberingElem +          nums = mapMaybe id $ map (numElemToNum namespaces) numElems +          absNums = mapMaybe id $ map (absNumElemToAbsNum namespaces) absNumElems +      return $ Numbering namespaces nums absNums + +data Notes = Notes NameSpaces (Maybe [(String, [BodyPart])]) (Maybe [(String, [BodyPart])]) +           deriving Show + +noteElemToNote :: NameSpaces -> Element -> Maybe (String, [BodyPart]) +noteElemToNote ns element +  | qName (elName element) `elem` ["endnote", "footnote"] && +    qURI (elName element) == (lookup "w" ns) = +      do +        noteId <- findAttr (QName "id" (lookup "w" ns) (Just "w")) element +        let bps = map fromJust +                  $ filter isJust +                  $ map (elemToBodyPart ns) +                  $ filterChildrenName (isParOrTbl ns) element +        return $ (noteId, bps) +noteElemToNote _ _ = Nothing + +getFootNote :: String -> Notes -> Maybe [BodyPart] +getFootNote s (Notes _ fns _) = fns >>= (lookup s) + +getEndNote :: String -> Notes -> Maybe [BodyPart] +getEndNote s (Notes _ _ ens) = ens >>= (lookup s) + +elemToNotes :: NameSpaces -> String -> Element -> Maybe [(String, [BodyPart])] +elemToNotes ns notetype element +  | qName (elName element) == (notetype ++ "s") && +    qURI (elName element) == (lookup "w" ns) = +      Just $ map fromJust +      $ filter isJust +      $ map (noteElemToNote ns) +      $ findChildren (QName notetype (lookup "w" ns) (Just "w")) element +elemToNotes _ _ _ = Nothing + +archiveToNotes :: Archive -> Notes +archiveToNotes zf = +  let fnElem = findEntryByPath "word/footnotes.xml" zf +               >>= (parseXMLDoc . UTF8.toStringLazy . fromEntry) +      enElem = findEntryByPath "word/endnotes.xml" zf +               >>= (parseXMLDoc . UTF8.toStringLazy . fromEntry) +      fn_namespaces = case fnElem of +        Just e -> mapMaybe attrToNSPair (elAttribs e) +        Nothing -> [] +      en_namespaces = case enElem of +        Just e -> mapMaybe attrToNSPair (elAttribs e) +        Nothing -> [] +      ns = unionBy (\x y -> fst x == fst y) fn_namespaces en_namespaces +      fn = fnElem >>= (elemToNotes ns "footnote") +      en = enElem >>= (elemToNotes ns "endnote") +  in +   Notes ns fn en + + +data Relationship = Relationship (RelId, Target) +                  deriving Show + +lookupRelationship :: RelId -> [Relationship] -> Maybe Target +lookupRelationship relid rels = +  lookup relid (map (\(Relationship pair) -> pair) rels) + +filePathIsRel :: FilePath -> Bool +filePathIsRel fp = +  let (dir, name) = splitFileName fp +  in +   (dir == "word/_rels/") && ((takeExtension name) == ".rels") + +relElemToRelationship :: Element -> Maybe Relationship +relElemToRelationship element | qName (elName element) == "Relationship" = +  do +    relId <- findAttr (QName "Id" Nothing Nothing) element +    target <- findAttr (QName "Target" Nothing Nothing) element +    return $ Relationship (relId, target) +relElemToRelationship _ = Nothing +   + +archiveToRelationships :: Archive -> [Relationship] +archiveToRelationships archive =  +  let relPaths = filter filePathIsRel (filesInArchive archive) +      entries  = map fromJust $ filter isJust $ map (\f -> findEntryByPath f archive) relPaths +      relElems = map fromJust $ filter isJust $ map (parseXMLDoc . UTF8.toStringLazy . fromEntry) entries +      rels = map fromJust $ filter isJust $ map relElemToRelationship $ concatMap elChildren relElems +  in +   rels +    +data Body = Body [BodyPart] +          deriving Show + +isParOrTbl :: NameSpaces -> QName -> Bool +isParOrTbl ns q = qName q `elem` ["p", "tbl"] && +                  qURI q == (lookup "w" ns) + +elemToBody :: NameSpaces -> Element ->  Maybe Body +elemToBody ns element | qName (elName element) == "body" && qURI (elName element) == (lookup "w" ns) = +  Just $ Body +  $ map fromJust +  $ filter isJust +  $ map (elemToBodyPart ns) $ filterChildrenName (isParOrTbl ns) element +elemToBody _ _ = Nothing + +isRunOrLinkOrBookmark :: NameSpaces -> QName ->  Bool +isRunOrLinkOrBookmark ns q = qName q `elem` ["r", "hyperlink", "bookmarkStart"] && +                   qURI q == (lookup "w" ns) + +elemToNumInfo :: NameSpaces -> Element -> Maybe (String, String) +elemToNumInfo ns element +  | qName (elName element) == "p" && +    qURI (elName element) == (lookup "w" ns) = +      do +        pPr <- findChild (QName "pPr" (lookup "w" ns) (Just "w")) element +        numPr <- findChild (QName "numPr" (lookup "w" ns) (Just "w")) pPr +        lvl <- findChild (QName "ilvl" (lookup "w" ns) (Just "w")) numPr >>= +                findAttr (QName "val" (lookup "w" ns) (Just "w")) +        numId <- findChild (QName "numId" (lookup "w" ns) (Just "w")) numPr >>= +                 findAttr (QName "val" (lookup "w" ns) (Just "w")) +        return (numId, lvl) +elemToNumInfo _ _ = Nothing + +-- isBookMarkTag :: NameSpaces -> QName -> Bool +-- isBookMarkTag ns q = qName q `elem` ["bookmarkStart", "bookmarkEnd"] && +--                      qURI q == (lookup "w" ns) + +-- parChildrenToBookmark :: NameSpaces -> [Element] -> BookMark +-- parChildrenToBookmark ns (bms : bme : _) +--   | qName (elName bms) == "bookmarkStart" && +--     qURI (elName bms) == (lookup "w" ns) && +--     qName (elName bme) == "bookmarkEnd" && +--     qURI (elName bme) == (lookup "w" ns) = do +--       bmId <- findAttr (QName "id" (lookup "w" ns) (Just "w")) bms +--       bmName <- findAttr (QName "name" (lookup "w" ns) (Just "w")) bms +--       return $ (bmId, bmName) +-- parChildrenToBookmark _ _ = Nothing + +elemToBodyPart :: NameSpaces -> Element ->  Maybe BodyPart +elemToBodyPart ns element +  | qName (elName element) == "p" && +    qURI (elName element) == (lookup "w" ns) = +      let parstyle = elemToParagraphStyle ns element +          parparts = mapMaybe id +                     $ map (elemToParPart ns) +                     $ filterChildrenName (isRunOrLinkOrBookmark ns) element +      in +       case elemToNumInfo ns element of +         Just (numId, lvl) -> Just $ ListItem parstyle numId lvl parparts +         Nothing -> Just $ Paragraph parstyle parparts +  | qName (elName element) == "tbl" && +    qURI (elName element) == (lookup "w" ns) = +      let +        caption = findChild (QName "tblPr" (lookup "w" ns) (Just "w")) element +                  >>= findChild (QName "tblCaption" (lookup "w" ns) (Just "w")) +                  >>= findAttr (QName "val" (lookup "w" ns) (Just "w")) +        grid = case +          findChild (QName "tblGrid" (lookup "w" ns) (Just "w")) element +          of +            Just g -> elemToTblGrid ns g +            Nothing -> [] +        tblLook = findChild (QName "tblPr" (lookup "w" ns) (Just "w")) element +                  >>= findChild (QName "tblLook" (lookup "w" ns) (Just "w")) +                  >>= elemToTblLook ns +      in +       Just $ Tbl +       (fromMaybe "" caption) +       grid +       (fromMaybe defaultTblLook tblLook) +       (mapMaybe (elemToRow ns) (elChildren element)) +  | otherwise = Nothing + +elemToTblLook :: NameSpaces -> Element -> Maybe TblLook +elemToTblLook ns element +  | qName (elName element) == "tblLook" && +    qURI (elName element) == (lookup "w" ns) = +      let firstRow = findAttr (QName "firstRow" (lookup "w" ns) (Just "w")) element +          val = findAttr (QName "val" (lookup "w" ns) (Just "w")) element +          firstRowFmt =  +            case firstRow of +              Just "1" -> True +              Just  _  -> False +              Nothing -> case val of +                Just bitMask -> testBitMask bitMask 0x020 +                Nothing      -> False +      in +       Just $ TblLook{firstRowFormatting = firstRowFmt} +elemToTblLook _ _ = Nothing + +testBitMask :: String -> Int -> Bool +testBitMask bitMaskS n = +  case (reads ("0x" ++ bitMaskS) :: [(Int, String)]) of +    []            -> False +    ((n', _) : _) -> ((n' .|. n) /= 0) + +data ParagraphStyle = ParagraphStyle { pStyle :: [String] +                                     , indent :: Maybe Integer +                                     } +                      deriving Show + +defaultParagraphStyle :: ParagraphStyle +defaultParagraphStyle = ParagraphStyle { pStyle = [] +                                       , indent = Nothing +                                       } + +elemToParagraphStyle :: NameSpaces -> Element -> ParagraphStyle +elemToParagraphStyle ns element = +  case findChild (QName "pPr" (lookup "w" ns) (Just "w")) element of +    Just pPr -> +      ParagraphStyle +      {pStyle = +          mapMaybe id $ +          map +          (findAttr (QName "val" (lookup "w" ns) (Just "w"))) +          (findChildren (QName "pStyle" (lookup "w" ns) (Just "w")) pPr) +      , indent = +        findChild (QName "ind" (lookup "w" ns) (Just "w")) pPr >>= +        findAttr (QName "left" (lookup "w" ns) (Just "w")) >>= +        stringToInteger +        } +    Nothing -> defaultParagraphStyle + + +data BodyPart = Paragraph ParagraphStyle [ParPart] +              | ListItem ParagraphStyle String String [ParPart] +              | Tbl String TblGrid TblLook [Row] + +              deriving Show + +type TblGrid = [Integer] + +data TblLook = TblLook {firstRowFormatting::Bool} +              deriving Show + +defaultTblLook :: TblLook +defaultTblLook = TblLook{firstRowFormatting = False} + +stringToInteger :: String -> Maybe Integer +stringToInteger s = listToMaybe $ map fst (reads s :: [(Integer, String)]) + +elemToTblGrid :: NameSpaces -> Element -> TblGrid +elemToTblGrid ns element +  | qName (elName element) == "tblGrid" && +    qURI (elName element) == (lookup "w" ns) = +      let +        cols = findChildren (QName "gridCol" (lookup "w" ns) (Just "w")) element +      in +       mapMaybe (\e -> +                  findAttr (QName "val" (lookup "w" ns) (Just ("w"))) e +                  >>= stringToInteger +                ) +       cols +elemToTblGrid _ _ = [] + +data Row = Row [Cell] +           deriving Show + + +elemToRow :: NameSpaces -> Element -> Maybe Row +elemToRow ns element +  | qName (elName element) == "tr" && +    qURI (elName element) == (lookup "w" ns) = +      let  +        cells = findChildren (QName "tc" (lookup "w" ns) (Just "w")) element +      in +       Just $ Row (mapMaybe (elemToCell ns) cells) +elemToRow _ _ = Nothing + +data Cell = Cell [BodyPart] +            deriving Show + +elemToCell :: NameSpaces -> Element -> Maybe Cell +elemToCell ns element +  | qName (elName element) == "tc" && +    qURI (elName element) == (lookup "w" ns) = +      Just $ Cell (mapMaybe (elemToBodyPart ns) (elChildren element)) +elemToCell _ _ = Nothing + +data ParPart = PlainRun Run +             | BookMark BookMarkId Anchor +             | InternalHyperLink Anchor [Run] +             | ExternalHyperLink RelId [Run] +             | Drawing String +             deriving Show + +data Run = Run RunStyle [RunElem] +         | Footnote String  +         | Endnote String +           deriving Show + +data RunElem = TextRun String | LnBrk +             deriving Show + +data RunStyle = RunStyle { isBold :: Bool +                         , isItalic :: Bool +                         , isSmallCaps :: Bool +                         , isStrike :: Bool +                         , isSuperScript :: Bool +                         , isSubScript :: Bool +                         , underline :: Maybe String +                         , rStyle :: Maybe String } +                deriving Show + +defaultRunStyle :: RunStyle +defaultRunStyle = RunStyle { isBold = False +                           , isItalic = False +                           , isSmallCaps = False +                           , isStrike = False +                           , isSuperScript = False +                           , isSubScript = False +                           , underline = Nothing +                           , rStyle = Nothing +                           }  + +elemToRunStyle :: NameSpaces -> Element -> RunStyle +elemToRunStyle ns element = +  case findChild (QName "rPr" (lookup "w" ns) (Just "w")) element of +    Just rPr -> +      RunStyle +      { +        isBold = isJust $ findChild (QName "b" (lookup "w" ns) (Just "w")) rPr +      , isItalic = isJust $ findChild (QName "i" (lookup "w" ns) (Just "w")) rPr +      , isSmallCaps = isJust $ findChild (QName "smallCaps" (lookup "w" ns) (Just "w")) rPr +      , isStrike = isJust $ findChild (QName "strike" (lookup "w" ns) (Just "w")) rPr +      , isSuperScript = +        (Just "superscript" == +        (findChild (QName "vertAlign" (lookup "w" ns) (Just "w")) rPr >>= +         findAttr (QName "val" (lookup "w" ns) (Just "w")))) +      , isSubScript = +        (Just "subscript" == +        (findChild (QName "vertAlign" (lookup "w" ns) (Just "w")) rPr >>= +         findAttr (QName "val" (lookup "w" ns) (Just "w")))) +      , underline = +        findChild (QName "u" (lookup "w" ns) (Just "w")) rPr >>= +        findAttr (QName "val" (lookup "w" ns) (Just "w")) +      , rStyle = +        findChild (QName "rStyle" (lookup "w" ns) (Just "w")) rPr >>= +        findAttr (QName "val" (lookup "w" ns) (Just "w")) +        } +    Nothing -> defaultRunStyle + +elemToRun :: NameSpaces -> Element -> Maybe Run +elemToRun ns element +  | qName (elName element) == "r" && +    qURI (elName element) == (lookup "w" ns) = +      case +        findChild (QName "footnoteReference" (lookup "w" ns) (Just "w")) element >>= +        findAttr (QName "id" (lookup "w" ns) (Just "w")) +      of +        Just s -> Just $ Footnote s +        Nothing -> +          case +            findChild (QName "endnoteReference" (lookup "w" ns) (Just "w")) element >>= +            findAttr (QName "id" (lookup "w" ns) (Just "w")) +          of +            Just s -> Just $ Endnote s  +            Nothing ->  Just $ +                        Run (elemToRunStyle ns element) +                        (elemToRunElems ns element) +elemToRun _ _ = Nothing + +elemToRunElem :: NameSpaces -> Element -> Maybe RunElem +elemToRunElem ns element +  | qName (elName element) == "t" && +    qURI (elName element) == (lookup "w" ns) = +      Just $ TextRun (strContent element) +  | qName (elName element) == "br" && +    qURI (elName element) == (lookup "w" ns) = +      Just $ LnBrk +  | otherwise = Nothing + + +elemToRunElems :: NameSpaces -> Element -> [RunElem] +elemToRunElems ns element +  | qName (elName element) == "r" && +    qURI (elName element) == (lookup "w" ns) = +      mapMaybe (elemToRunElem ns) (elChildren element) +  | otherwise = [] + +elemToDrawing :: NameSpaces -> Element -> Maybe ParPart +elemToDrawing ns element +  | qName (elName element) == "drawing" && +    qURI (elName element) == (lookup "w" ns) = +      let a_ns = "http://schemas.openxmlformats.org/drawingml/2006/main" +      in +       findElement (QName "blip" (Just a_ns) (Just "a")) element +       >>= findAttr (QName "embed" (lookup "r" ns) (Just "r")) +       >>= (\s -> Just $ Drawing s) +elemToDrawing _ _ = Nothing + + +elemToParPart :: NameSpaces -> Element -> Maybe ParPart +elemToParPart ns element +  | qName (elName element) == "r" && +    qURI (elName element) == (lookup "w" ns) = +      case findChild (QName "drawing" (lookup "w" ns) (Just "w")) element of +        Just drawingElem -> elemToDrawing ns drawingElem +        Nothing -> do +          r <- elemToRun ns element +          return $ PlainRun r +elemToParPart ns element +  | qName (elName element) == "bookmarkStart" && +    qURI (elName element) == (lookup "w" ns) = do +      bmId <- findAttr (QName "id" (lookup "w" ns) (Just "w")) element +      bmName <- findAttr (QName "name" (lookup "w" ns) (Just "w")) element +      return $ BookMark bmId bmName +elemToParPart ns element +  | qName (elName element) == "hyperlink" && +    qURI (elName element) == (lookup "w" ns) = +      let runs = map fromJust $ filter isJust $ map (elemToRun ns) +                 $ findChildren (QName "r" (lookup "w" ns) (Just "w")) element +      in +       case findAttr (QName "anchor" (lookup "w" ns) (Just "w")) element of +         Just anchor -> +          Just $ InternalHyperLink anchor runs +         Nothing -> +           case findAttr (QName "id" (lookup "r" ns) (Just "r")) element of +             Just relId -> Just $ ExternalHyperLink relId runs +             Nothing    -> Nothing +elemToParPart _ _ = Nothing + +type Target = String +type Anchor = String +type BookMarkId = String +type RelId = String +                 | 
