aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--pandoc.cabal1
-rw-r--r--src/Text/Pandoc/Readers/Docx.hs384
-rw-r--r--src/Text/Pandoc/Readers/Docx/Lists.hs25
-rw-r--r--src/Text/Pandoc/Readers/Docx/Reducible.hs150
-rw-r--r--tests/Tests/Readers/Docx.hs4
-rw-r--r--tests/docx.trailing_spaces_in_formatting.docxbin0 -> 12916 bytes
-rw-r--r--tests/docx.trailing_spaces_in_formatting.native1
7 files changed, 289 insertions, 276 deletions
diff --git a/pandoc.cabal b/pandoc.cabal
index e35a76caf..634d249fe 100644
--- a/pandoc.cabal
+++ b/pandoc.cabal
@@ -327,6 +327,7 @@ Library
Text.Pandoc.SelfContained,
Text.Pandoc.Process
Other-Modules: Text.Pandoc.Readers.Docx.Lists,
+ Text.Pandoc.Readers.Docx.Reducible,
Text.Pandoc.Readers.Docx.Parse,
Text.Pandoc.Writers.Shared,
Text.Pandoc.Asciify,
diff --git a/src/Text/Pandoc/Readers/Docx.hs b/src/Text/Pandoc/Readers/Docx.hs
index 09c2330fb..ffe7f5a92 100644
--- a/src/Text/Pandoc/Readers/Docx.hs
+++ b/src/Text/Pandoc/Readers/Docx.hs
@@ -79,8 +79,10 @@ import Text.Pandoc.Builder (text, toList)
import Text.Pandoc.Generic (bottomUp)
import Text.Pandoc.MIME (getMimeType)
import Text.Pandoc.UTF8 (toString)
+import Text.Pandoc.Walk
import Text.Pandoc.Readers.Docx.Parse
import Text.Pandoc.Readers.Docx.Lists
+import Text.Pandoc.Readers.Docx.Reducible
import Data.Maybe (mapMaybe, isJust, fromJust)
import Data.List (delete, isPrefixOf, (\\), intersect)
import qualified Data.ByteString as BS
@@ -96,28 +98,65 @@ readDocx opts bytes =
Just docx -> Pandoc nullMeta (docxToBlocks opts docx)
Nothing -> error $ "couldn't parse docx file"
-runStyleToSpanAttr :: RunStyle -> (String, [String], [(String, String)])
-runStyleToSpanAttr rPr = ("",
- mapMaybe id [
- if isBold rPr then (Just "strong") else Nothing,
- if isItalic rPr then (Just "emph") else Nothing,
- if isSmallCaps rPr then (Just "smallcaps") else Nothing,
- if isStrike rPr then (Just "strike") else Nothing,
- if isSuperScript rPr then (Just "superscript") else Nothing,
- if isSubScript rPr then (Just "subscript") else Nothing,
- rStyle rPr],
- case underline rPr of
- Just fmt -> [("underline", fmt)]
- _ -> []
- )
-
-parStyleToDivAttr :: ParagraphStyle -> (String, [String], [(String, String)])
-parStyleToDivAttr pPr = ("",
- pStyle pPr,
- case indent pPr of
- Just n -> [("indent", (show n))]
- Nothing -> []
- )
+spansToKeep :: [String]
+spansToKeep = ["list-item", "Definition", "DefinitionTerm"] ++ codeSpans
+
+
+-- This is empty, but we put it in for future-proofing.
+divsToKeep :: [String]
+divsToKeep = []
+
+runStyleToContainers :: RunStyle -> [Container Inline]
+runStyleToContainers rPr =
+ let formatters = mapMaybe id
+ [ if isBold rPr then (Just Strong) else Nothing
+ , if isItalic rPr then (Just Emph) else Nothing
+ , if isSmallCaps rPr then (Just SmallCaps) else Nothing
+ , if isStrike rPr then (Just Strikeout) else Nothing
+ , if isSuperScript rPr then (Just Superscript) else Nothing
+ , if isSubScript rPr then (Just Subscript) else Nothing
+ , rStyle rPr >>=
+ (\s -> if s `elem` spansToKeep then Just s else Nothing) >>=
+ (\s -> Just $ Span ("", [s], []))
+ , underline rPr >>= (\f -> Just $ Span ("", [], [("underline", f)]))
+ ]
+ in
+ map Container formatters
+
+
+divAttrToContainers :: [String] -> [(String, String)] -> [Container Block]
+divAttrToContainers [] [] = []
+divAttrToContainers (c:cs) _ | isJust (isHeaderClass c) =
+ let n = fromJust (isHeaderClass c)
+ in
+ [(Container $ \blks ->
+ Header n ("", delete ("Heading" ++ show n) cs, []) (blksToInlines blks))]
+divAttrToContainers (c:_) _ | c `elem` codeDivs =
+ [Container $ \blks -> CodeBlock ("", [], []) (concatMap blkToCode blks)]
+divAttrToContainers (c:cs) kvs | c `elem` listParagraphDivs =
+ let kvs' = filter (\(k,_) -> k /= "indent") kvs
+ in
+ (Container $ Div ("", [c], [])) : (divAttrToContainers cs kvs')
+divAttrToContainers (c:cs) kvs | c `elem` blockQuoteDivs =
+ (Container BlockQuote) : (divAttrToContainers (cs \\ blockQuoteDivs) kvs)
+divAttrToContainers (c:cs) kvs | c `elem` divsToKeep =
+ (Container $ Div ("", [c], [])) : (divAttrToContainers cs kvs)
+divAttrToContainers (_:cs) kvs = divAttrToContainers cs kvs
+divAttrToContainers [] (kv:kvs) | fst kv == "indent" =
+ (Container BlockQuote) : divAttrToContainers [] kvs
+divAttrToContainers [] (_:kvs) =
+ divAttrToContainers [] kvs
+
+
+parStyleToContainers :: ParagraphStyle -> [Container Block]
+parStyleToContainers pPr =
+ let classes = pStyle pPr
+ kvs = case indent pPr of
+ Just n -> [("indent", show n)]
+ Nothing -> []
+ in
+ divAttrToContainers classes kvs
+
strToInlines :: String -> [Inline]
strToInlines = toList . text
@@ -144,103 +183,42 @@ runElemToString (Tab) = ['\t']
runElemsToString :: [RunElem] -> String
runElemsToString = concatMap runElemToString
---- We use this instead of the more general
---- Text.Pandoc.Shared.normalize for reasons of efficiency. For
---- whatever reason, `normalize` makes a run take almost twice as
---- long. (It does more, but this does what we need)
-inlineNormalize :: [Inline] -> [Inline]
-inlineNormalize [] = []
-inlineNormalize (Str "" : ils) = inlineNormalize ils
-inlineNormalize ((Str s) : (Str s') : l) =
- inlineNormalize (Str (s++s') : l)
-inlineNormalize ((Emph ils) : (Emph ils') : l) =
- inlineNormalize $ (Emph $ inlineNormalize (ils ++ ils')) : l
-inlineNormalize ((Emph ils) : l) =
- Emph (inlineNormalize ils) : (inlineNormalize l)
-inlineNormalize ((Strong ils) : (Strong ils') : l) =
- inlineNormalize $ (Strong $ inlineNormalize (ils ++ ils')) : l
-inlineNormalize ((Strong ils) : l) =
- Strong (inlineNormalize ils) : (inlineNormalize l)
-inlineNormalize ((Strikeout ils) : (Strikeout ils') : l) =
- inlineNormalize $ (Strikeout $ inlineNormalize (ils ++ ils')) : l
-inlineNormalize ((Strikeout ils) : l) =
- Strikeout (inlineNormalize ils) : (inlineNormalize l)
-inlineNormalize ((Superscript ils) : (Superscript ils') : l) =
- inlineNormalize $ (Superscript $ inlineNormalize (ils ++ ils')) : l
-inlineNormalize ((Superscript ils) : l) =
- Superscript (inlineNormalize ils) : (inlineNormalize l)
-inlineNormalize ((Subscript ils) : (Subscript ils') : l) =
- inlineNormalize $ (Subscript $ inlineNormalize (ils ++ ils')) : l
-inlineNormalize ((Subscript ils) : l) =
- Subscript (inlineNormalize ils) : (inlineNormalize l)
-inlineNormalize ((Space : Space : l)) =
- inlineNormalize $ (Space : l)
-inlineNormalize ((Quoted qt ils) : l) =
- Quoted qt (inlineNormalize ils) : inlineNormalize l
-inlineNormalize ((Cite cits ils) : l) =
- let
- f :: Citation -> Citation
- f (Citation s pref suff mode num hash) =
- Citation s (inlineNormalize pref) (inlineNormalize suff) mode num hash
- in
- Cite (map f cits) (inlineNormalize ils) : (inlineNormalize l)
-inlineNormalize ((Link ils s) : l) =
- Link (inlineNormalize ils) s : (inlineNormalize l)
-inlineNormalize ((Image ils s) : l) =
- Image (inlineNormalize ils) s : (inlineNormalize l)
-inlineNormalize ((Note blks) : l) =
- Note (map blockNormalize blks) : (inlineNormalize l)
-inlineNormalize ((Span attr ils) : l) =
- Span attr (inlineNormalize ils) : (inlineNormalize l)
-inlineNormalize (il : l) = il : (inlineNormalize l)
-
-stripSpaces :: [Inline] -> [Inline]
-stripSpaces ils =
- reverse $ dropWhile (Space ==) $ reverse $ dropWhile (Space ==) ils
-
-blockNormalize :: Block -> Block
-blockNormalize (Plain ils) = Plain $ stripSpaces $ inlineNormalize ils
-blockNormalize (Para ils) = Para $ stripSpaces $ inlineNormalize ils
-blockNormalize (Header n attr ils) =
- Header n attr $ stripSpaces $ inlineNormalize ils
-blockNormalize (Table ils align width hdr cells) =
- Table (stripSpaces $ inlineNormalize ils) align width hdr cells
-blockNormalize (DefinitionList pairs) =
- DefinitionList $ map (\(ils, blklsts) -> (stripSpaces (inlineNormalize ils), (map (map blockNormalize) blklsts))) pairs
-blockNormalize (BlockQuote blks) = BlockQuote (map blockNormalize blks)
-blockNormalize (OrderedList attr blkslst) =
- OrderedList attr $ map (\blks -> map blockNormalize blks) blkslst
-blockNormalize (BulletList blkslst) =
- BulletList $ map (\blks -> map blockNormalize blks) blkslst
-blockNormalize (Div attr blks) = Div attr (map blockNormalize blks)
-blockNormalize blk = blk
+
+inlineCodeContainer :: Container Inline -> Bool
+inlineCodeContainer (Container f) = case f [] of
+ Span (_, classes, _) _ -> (not . null) (classes `intersect` codeSpans)
+ _ -> False
+inlineCodeContainer _ = False
+
+-- blockCodeContainer :: Container Block -> Bool
+-- blockCodeContainer (Container f) = case f [] of
+-- Div (ident, classes, kvs) _ -> (not . null) (classes `intersect` codeDivs)
+-- _ -> False
+-- blockCodeContainer _ = False
runToInlines :: ReaderOptions -> Docx -> Run -> [Inline]
runToInlines _ _ (Run rs runElems)
- | isJust (rStyle rs) && (fromJust (rStyle rs)) `elem` codeSpans =
- case runStyleToSpanAttr rs == ("", [], []) of
- True -> [Str (runElemsToString runElems)]
- False -> [Span (runStyleToSpanAttr rs) [Str (runElemsToString runElems)]]
- | otherwise = case runStyleToSpanAttr rs == ("", [], []) of
- True -> concatMap runElemToInlines runElems
- False -> [Span (runStyleToSpanAttr rs) (concatMap runElemToInlines runElems)]
+ | any inlineCodeContainer (runStyleToContainers rs) =
+ rebuild (runStyleToContainers rs) $ [Str $ runElemsToString runElems]
+ | otherwise =
+ rebuild (runStyleToContainers rs) (concatMap runElemToInlines runElems)
runToInlines opts docx@(Docx _ notes _ _ _ ) (Footnote fnId) =
case (getFootNote fnId notes) of
Just bodyParts ->
- [Note [Div ("", ["footnote"], []) (map (bodyPartToBlock opts docx) bodyParts)]]
+ [Note (concatMap (bodyPartToBlocks opts docx) bodyParts)]
Nothing ->
- [Note [Div ("", ["footnote"], []) []]]
+ [Note []]
runToInlines opts docx@(Docx _ notes _ _ _) (Endnote fnId) =
case (getEndNote fnId notes) of
Just bodyParts ->
- [Note [Div ("", ["endnote"], []) (map (bodyPartToBlock opts docx) bodyParts)]]
+ [Note (concatMap (bodyPartToBlocks opts docx) bodyParts)]
Nothing ->
- [Note [Div ("", ["endnote"], []) []]]
+ [Note []]
parPartToInlines :: ReaderOptions -> Docx -> ParPart -> [Inline]
parPartToInlines opts docx (PlainRun r) = runToInlines opts docx r
-parPartToInlines _ _ (BookMark _ anchor) =
- [Span (anchor, ["anchor"], []) []]
+parPartToInlines _ _ (BookMark _ anchor) | anchor `elem` dummyAnchors = []
+parPartToInlines _ _ (BookMark _ anchor) = [Span (anchor, ["anchor"], []) []]
parPartToInlines _ (Docx _ _ _ rels _) (Drawing relid) =
case lookupRelationship relid rels of
Just target -> [Image [] (combine "word" target, "")]
@@ -276,7 +254,6 @@ makeHeaderAnchors h@(Header n (_, classes, kvs) ils) =
_ -> h
makeHeaderAnchors blk = blk
-
parPartsToInlines :: ReaderOptions -> Docx -> [ParPart] -> [Inline]
parPartsToInlines opts docx parparts =
--
@@ -284,23 +261,32 @@ parPartsToInlines opts docx parparts =
-- not mandatory.
--
(if False -- TODO depend on option
- then bottomUp (makeImagesSelfContained docx)
+ then walk (makeImagesSelfContained docx)
else id) $
- bottomUp spanTrim $
- bottomUp spanCorrect $
- bottomUp spanReduce $
- concatMap (parPartToInlines opts docx) parparts
+ -- bottomUp spanTrim $
+ -- bottomUp spanCorrect $
+ -- bottomUp spanReduce $
+ reduceList $ concatMap (parPartToInlines opts docx) parparts
cellToBlocks :: ReaderOptions -> Docx -> Cell -> [Block]
-cellToBlocks opts docx (Cell bps) = map (bodyPartToBlock opts docx) bps
+cellToBlocks opts docx (Cell bps) = concatMap (bodyPartToBlocks opts docx) bps
rowToBlocksList :: ReaderOptions -> Docx -> Row -> [[Block]]
rowToBlocksList opts docx (Row cells) = map (cellToBlocks opts docx) cells
-bodyPartToBlock :: ReaderOptions -> Docx -> BodyPart -> Block
-bodyPartToBlock opts docx (Paragraph pPr parparts) =
- Div (parStyleToDivAttr pPr) [Para (parPartsToInlines opts docx parparts)]
-bodyPartToBlock opts docx@(Docx _ _ numbering _ _) (ListItem pPr numId lvl parparts) =
+bodyPartToBlocks :: ReaderOptions -> Docx -> BodyPart -> [Block]
+bodyPartToBlocks opts docx (Paragraph pPr parparts) =
+ case parPartsToInlines opts docx parparts of
+ [] ->
+ []
+ _ ->
+ let parContents = parPartsToInlines opts docx parparts
+ trimmedContents = reverse $ dropWhile (Space ==) $ reverse $ dropWhile (Space ==) parContents
+ in
+ rebuild
+ (parStyleToContainers pPr)
+ [Para trimmedContents]
+bodyPartToBlocks opts docx@(Docx _ _ numbering _ _) (ListItem pPr numId lvl parparts) =
let
kvs = case lookupLevel numId lvl numbering of
Just (_, fmt, txt, Just start) -> [ ("level", lvl)
@@ -317,12 +303,12 @@ bodyPartToBlock opts docx@(Docx _ _ numbering _ _) (ListItem pPr numId lvl parpa
]
Nothing -> []
in
- Div
- ("", ["list-item"], kvs)
- [bodyPartToBlock opts docx (Paragraph pPr parparts)]
-bodyPartToBlock _ _ (Tbl _ _ _ []) =
- Para []
-bodyPartToBlock opts docx (Tbl cap _ look (r:rs)) =
+ [Div
+ ("", ["list-item"], kvs)
+ (bodyPartToBlocks opts docx (Paragraph pPr parparts))]
+bodyPartToBlocks _ _ (Tbl _ _ _ []) =
+ [Para []]
+bodyPartToBlocks opts docx (Tbl cap _ look (r:rs)) =
let caption = strToInlines cap
(hdr, rows) = case firstRowFormatting look of
True -> (Just r, rs)
@@ -344,7 +330,8 @@ bodyPartToBlock opts docx (Tbl cap _ look (r:rs)) =
alignments = take size (repeat AlignDefault)
widths = take size (repeat 0) :: [Double]
in
- Table caption alignments widths hdrCells cells
+ [Table caption alignments widths hdrCells cells]
+
makeImagesSelfContained :: Docx -> Inline -> Inline
makeImagesSelfContained (Docx _ _ _ _ media) i@(Image alt (uri, title)) =
@@ -360,127 +347,19 @@ makeImagesSelfContained _ inline = inline
bodyToBlocks :: ReaderOptions -> Docx -> Body -> [Block]
bodyToBlocks opts docx (Body bps) =
- bottomUp removeEmptyPars $
- map blockNormalize $
- bottomUp spanRemove $
- bottomUp divRemove $
map (makeHeaderAnchors) $
- bottomUp divCorrect $
- bottomUp divReduce $
- bottomUp divCorrectPreReduce $
bottomUp blocksToDefinitions $
blocksToBullets $
- map (bodyPartToBlock opts docx) bps
+ concatMap (bodyPartToBlocks opts docx) bps
docxToBlocks :: ReaderOptions -> Docx -> [Block]
docxToBlocks opts d@(Docx (Document _ body) _ _ _ _) = bodyToBlocks opts d body
-spanReduce :: [Inline] -> [Inline]
-spanReduce [] = []
-spanReduce ((Span (id1, classes1, kvs1) ils1) : ils)
- | (id1, classes1, kvs1) == ("", [], []) = ils1 ++ (spanReduce ils)
-spanReduce (s1@(Span (id1, classes1, kvs1) ils1) :
- s2@(Span (id2, classes2, kvs2) ils2) :
- ils) =
- let classes' = classes1 `intersect` classes2
- kvs' = kvs1 `intersect` kvs2
- classes1' = classes1 \\ classes'
- kvs1' = kvs1 \\ kvs'
- classes2' = classes2 \\ classes'
- kvs2' = kvs2 \\ kvs'
- in
- case null classes' && null kvs' of
- True -> s1 : (spanReduce (s2 : ils))
- False -> let attr' = ("", classes', kvs')
- attr1' = (id1, classes1', kvs1')
- attr2' = (id2, classes2', kvs2')
- in
- spanReduce (Span attr' [(Span attr1' ils1), (Span attr2' ils2)] :
- ils)
-spanReduce (il:ils) = il : (spanReduce ils)
ilToCode :: Inline -> String
ilToCode (Str s) = s
ilToCode _ = ""
-spanRemove' :: Inline -> [Inline]
-spanRemove' s@(Span (ident, classes, _) [])
- -- "_GoBack" is automatically inserted. We don't want to keep it.
- | classes == ["anchor"] && not (ident `elem` dummyAnchors) = [s]
-spanRemove' (Span (_, _, kvs) ils) =
- case lookup "underline" kvs of
- Just val -> [Span ("", [], [("underline", val)]) ils]
- Nothing -> ils
-spanRemove' il = [il]
-
-spanRemove :: [Inline] -> [Inline]
-spanRemove = concatMap spanRemove'
-
-spanTrim' :: Inline -> [Inline]
-spanTrim' il@(Span _ []) = [il]
-spanTrim' il@(Span attr (il':[]))
- | il' == Space = [Span attr [], Space]
- | otherwise = [il]
-spanTrim' (Span attr ils)
- | head ils == Space && last ils == Space =
- [Space, Span attr (init $ tail ils), Space]
- | head ils == Space = [Space, Span attr (tail ils)]
- | last ils == Space = [Span attr (init ils), Space]
-spanTrim' il = [il]
-
-spanTrim :: [Inline] -> [Inline]
-spanTrim = concatMap spanTrim'
-
-spanCorrect' :: Inline -> [Inline]
-spanCorrect' (Span ("", [], []) ils) = ils
-spanCorrect' (Span (ident, classes, kvs) ils)
- | "emph" `elem` classes =
- [Emph $ spanCorrect' $ Span (ident, (delete "emph" classes), kvs) ils]
- | "strong" `elem` classes =
- [Strong $ spanCorrect' $ Span (ident, (delete "strong" classes), kvs) ils]
- | "smallcaps" `elem` classes =
- [SmallCaps $ spanCorrect' $ Span (ident, (delete "smallcaps" classes), kvs) ils]
- | "strike" `elem` classes =
- [Strikeout $ spanCorrect' $ Span (ident, (delete "strike" classes), kvs) ils]
- | "superscript" `elem` classes =
- [Superscript $ spanCorrect' $ Span (ident, (delete "superscript" classes), kvs) ils]
- | "subscript" `elem` classes =
- [Subscript $ spanCorrect' $ Span (ident, (delete "subscript" classes), kvs) ils]
- | (not . null) (codeSpans `intersect` classes) =
- [Code (ident, (classes \\ codeSpans), kvs) (init $ unlines $ map ilToCode ils)]
- | otherwise =
- [Span (ident, classes, kvs) ils]
-spanCorrect' il = [il]
-
-spanCorrect :: [Inline] -> [Inline]
-spanCorrect = concatMap spanCorrect'
-
-removeEmptyPars :: [Block] -> [Block]
-removeEmptyPars blks = filter (\b -> b /= (Para [])) blks
-
-divReduce :: [Block] -> [Block]
-divReduce [] = []
-divReduce ((Div (id1, classes1, kvs1) blks1) : blks)
- | (id1, classes1, kvs1) == ("", [], []) = blks1 ++ (divReduce blks)
-divReduce (d1@(Div (id1, classes1, kvs1) blks1) :
- d2@(Div (id2, classes2, kvs2) blks2) :
- blks) =
- let classes' = classes1 `intersect` classes2
- kvs' = kvs1 `intersect` kvs2
- classes1' = classes1 \\ classes'
- kvs1' = kvs1 \\ kvs'
- classes2' = classes2 \\ classes'
- kvs2' = kvs2 \\ kvs'
- in
- case null classes' && null kvs' of
- True -> d1 : (divReduce (d2 : blks))
- False -> let attr' = ("", classes', kvs')
- attr1' = (id1, classes1', kvs1')
- attr2' = (id2, classes2', kvs2')
- in
- divReduce (Div attr' [(Div attr1' blks1), (Div attr2' blks2)] :
- blks)
-divReduce (blk:blks) = blk : (divReduce blks)
isHeaderClass :: String -> Maybe Int
isHeaderClass s | "Heading" `isPrefixOf` s =
@@ -490,27 +369,12 @@ isHeaderClass s | "Heading" `isPrefixOf` s =
_ -> Nothing
isHeaderClass _ = Nothing
-findHeaderClass :: [String] -> Maybe Int
-findHeaderClass ss = case mapMaybe id $ map isHeaderClass ss of
- [] -> Nothing
- n : _ -> Just n
blksToInlines :: [Block] -> [Inline]
blksToInlines (Para ils : _) = ils
blksToInlines (Plain ils : _) = ils
blksToInlines _ = []
-divCorrectPreReduce' :: Block -> [Block]
-divCorrectPreReduce' (Div (ident, classes, kvs) blks)
- | isJust $ findHeaderClass classes =
- let n = fromJust $ findHeaderClass classes
- in
- [Header n (ident, delete ("Heading" ++ (show n)) classes, kvs) (blksToInlines blks)]
- | otherwise = [Div (ident, classes, kvs) blks]
-divCorrectPreReduce' blk = [blk]
-
-divCorrectPreReduce :: [Block] -> [Block]
-divCorrectPreReduce = concatMap divCorrectPreReduce'
blkToCode :: Block -> String
blkToCode (Para []) = ""
@@ -520,29 +384,3 @@ blkToCode (Para ((Span (_, classes, _) ils'): ils))
(init $ unlines $ map ilToCode ils') ++ (blkToCode (Para ils))
blkToCode _ = ""
-divRemove' :: Block -> [Block]
-divRemove' (Div (_, _, kvs) blks) =
- case lookup "indent" kvs of
- Just val -> [Div ("", [], [("indent", val)]) blks]
- Nothing -> blks
-divRemove' blk = [blk]
-
-divRemove :: [Block] -> [Block]
-divRemove = concatMap divRemove'
-
-divCorrect' :: Block -> [Block]
-divCorrect' b@(Div (ident, classes, kvs) blks)
- | (not . null) (blockQuoteDivs `intersect` classes) =
- [BlockQuote [Div (ident, classes \\ blockQuoteDivs, kvs) blks]]
- | (not . null) (codeDivs `intersect` classes) =
- [CodeBlock (ident, (classes \\ codeDivs), kvs) (init $ unlines $ map blkToCode blks)]
- | otherwise =
- case lookup "indent" kvs of
- Just "0" -> [Div (ident, classes, filter (\kv -> fst kv /= "indent") kvs) blks]
- Just _ ->
- [BlockQuote [Div (ident, classes, filter (\kv -> fst kv /= "indent") kvs) blks]]
- Nothing -> [b]
-divCorrect' blk = [blk]
-
-divCorrect :: [Block] -> [Block]
-divCorrect = concatMap divCorrect'
diff --git a/src/Text/Pandoc/Readers/Docx/Lists.hs b/src/Text/Pandoc/Readers/Docx/Lists.hs
index 68559d98b..1e37d0076 100644
--- a/src/Text/Pandoc/Readers/Docx/Lists.hs
+++ b/src/Text/Pandoc/Readers/Docx/Lists.hs
@@ -29,9 +29,12 @@ Functions for converting flat docx paragraphs into nested lists.
-}
module Text.Pandoc.Readers.Docx.Lists ( blocksToBullets
- , blocksToDefinitions) where
+ , blocksToDefinitions
+ , listParagraphDivs
+ ) where
import Text.Pandoc.JSON
+import Text.Pandoc.Generic (bottomUp)
import Text.Pandoc.Shared (trim)
import Control.Monad
import Data.List
@@ -159,10 +162,9 @@ flatToBullets elems = flatToBullets' (-1) elems
blocksToBullets :: [Block] -> [Block]
blocksToBullets blks =
- -- bottomUp removeListItemDivs $
+ bottomUp removeListDivs $
flatToBullets $ (handleListParagraphs blks)
-
plainParaInlines :: Block -> [Inline]
plainParaInlines (Plain ils) = ils
plainParaInlines (Para ils) = ils
@@ -199,6 +201,23 @@ blocksToDefinitions' [] acc (b:blks) =
blocksToDefinitions' defAcc acc (b:blks) =
blocksToDefinitions' [] (b : (DefinitionList (reverse defAcc)) : acc) blks
+removeListDivs' :: Block -> [Block]
+removeListDivs' (Div (ident, classes, kvs) blks)
+ | "list-item" `elem` classes =
+ case delete "list-item" classes of
+ [] -> blks
+ classes' -> [Div (ident, classes', kvs) $ blks]
+removeListDivs' (Div (ident, classes, kvs) blks)
+ | not $ null $ listParagraphDivs `intersect` classes =
+ case classes \\ listParagraphDivs of
+ [] -> blks
+ classes' -> [Div (ident, classes', kvs) blks]
+removeListDivs' blk = [blk]
+
+removeListDivs :: [Block] -> [Block]
+removeListDivs = concatMap removeListDivs'
+
+
blocksToDefinitions :: [Block] -> [Block]
blocksToDefinitions = blocksToDefinitions' [] []
diff --git a/src/Text/Pandoc/Readers/Docx/Reducible.hs b/src/Text/Pandoc/Readers/Docx/Reducible.hs
new file mode 100644
index 000000000..1ed31ebd0
--- /dev/null
+++ b/src/Text/Pandoc/Readers/Docx/Reducible.hs
@@ -0,0 +1,150 @@
+{-# LANGUAGE OverloadedStrings #-}
+
+module Text.Pandoc.Readers.Docx.Reducible ((<++>),
+ (<+++>),
+ Reducible,
+ Container(..),
+ container,
+ innards,
+ reduceList,
+ reduceListB,
+ rebuild)
+ where
+
+import Text.Pandoc.Builder
+import Data.List ((\\), intersect)
+
+data Container a = Container ([a] -> a) | NullContainer
+
+instance (Eq a) => Eq (Container a) where
+ (Container x) == (Container y) = ((x []) == (y []))
+ NullContainer == NullContainer = True
+ _ == _ = False
+
+instance (Show a) => Show (Container a) where
+ show (Container x) = "Container {" ++
+ (reverse $ drop 3 $ reverse $ show $ x []) ++
+ "}"
+ show (NullContainer) = "NullContainer"
+
+class Reducible a where
+ (<++>) :: a -> a -> [a]
+ container :: a -> Container a
+ innards :: a -> [a]
+ isSpace :: a -> Bool
+
+(<+++>) :: (Reducible a) => Many a -> Many a -> Many a
+mr <+++> ms = fromList $ reduceList $ toList mr ++ toList ms
+
+reduceListB :: (Reducible a) => Many a -> Many a
+reduceListB = fromList . reduceList . toList
+
+reduceList' :: (Reducible a) => [a] -> [a] -> [a]
+reduceList' acc [] = acc
+reduceList' [] (x:xs) = reduceList' [x] xs
+reduceList' as (x:xs) = reduceList' (init as ++ (last as <++> x) ) xs
+
+reduceList :: (Reducible a) => [a] -> [a]
+reduceList = reduceList' []
+
+combineReducibles :: (Reducible a, Eq a) => a -> a -> [a]
+combineReducibles r s =
+ let (conts, rs) = topLevelContainers r
+ (conts', ss) = topLevelContainers s
+ shared = conts `intersect` conts'
+ remaining = conts \\ shared
+ remaining' = conts' \\ shared
+ in
+ case null shared of
+ True -> case (not . null) rs && isSpace (last rs) of
+ True -> rebuild conts (init rs) ++ [last rs, s]
+ False -> [r,s]
+ False -> rebuild
+ shared $
+ reduceList $
+ (rebuild remaining rs) ++ (rebuild remaining' ss)
+
+instance Reducible Inline where
+ s1@(Span (id1, classes1, kvs1) ils1) <++> s2@(Span (id2, classes2, kvs2) ils2) =
+ let classes' = classes1 `intersect` classes2
+ kvs' = kvs1 `intersect` kvs2
+ classes1' = classes1 \\ classes'
+ kvs1' = kvs1 \\ kvs'
+ classes2' = classes2 \\ classes'
+ kvs2' = kvs2 \\ kvs'
+ in
+ case null classes' && null kvs' of
+ True -> [s1,s2]
+ False -> let attr' = ("", classes', kvs')
+ attr1' = (id1, classes1', kvs1')
+ attr2' = (id2, classes2', kvs2')
+ s1' = case null classes1' && null kvs1' of
+ True -> ils1
+ False -> [Span attr1' ils1]
+ s2' = case null classes2' && null kvs2' of
+ True -> ils2
+ False -> [Span attr2' ils2]
+ in
+ [Span attr' $ reduceList $ s1' ++ s2']
+
+ (Str x) <++> (Str y) = [Str (x++y)]
+ il <++> il' = combineReducibles il il'
+
+ container (Emph _) = Container Emph
+ container (Strong _) = Container Strong
+ container (Strikeout _) = Container Strikeout
+ container (Subscript _) = Container Subscript
+ container (Superscript _) = Container Superscript
+ container (Quoted qt _) = Container $ Quoted qt
+ container (Cite cs _) = Container $ Cite cs
+ container (Span attr _) = Container $ Span attr
+ container _ = NullContainer
+
+ innards (Emph ils) = ils
+ innards (Strong ils) = ils
+ innards (Strikeout ils) = ils
+ innards (Subscript ils) = ils
+ innards (Superscript ils) = ils
+ innards (Quoted _ ils) = ils
+ innards (Cite _ ils) = ils
+ innards (Span _ ils) = ils
+ innards _ = []
+
+ isSpace Space = True
+ isSpace _ = False
+
+instance Reducible Block where
+ (Div (ident, classes, kvs) blks) <++> blk | "list-item" `elem` classes =
+ [Div (ident, classes, kvs) (reduceList blks), blk]
+
+ blk <++> blk' = combineReducibles blk blk'
+
+ container (BlockQuote _) = Container BlockQuote
+ container (Div attr _) = Container $ Div attr
+ container _ = NullContainer
+
+ innards (BlockQuote bs) = bs
+ innards (Div _ bs) = bs
+ innards _ = []
+
+ isSpace _ = False
+
+
+topLevelContainers' :: (Reducible a) => [a] -> ([Container a], [a])
+topLevelContainers' (r : []) = case container r of
+ NullContainer -> ([], [r])
+ _ ->
+ let (conts, inns) = topLevelContainers' (innards r)
+ in
+ ((container r) : conts, inns)
+topLevelContainers' rs = ([], rs)
+
+topLevelContainers :: (Reducible a) => a -> ([Container a], [a])
+topLevelContainers il = topLevelContainers' [il]
+
+rebuild :: [Container a] -> [a] -> [a]
+rebuild [] xs = xs
+rebuild ((Container f) : cs) xs = rebuild cs $ [f xs]
+rebuild (NullContainer : cs) xs = rebuild cs $ xs
+
+
diff --git a/tests/Tests/Readers/Docx.hs b/tests/Tests/Readers/Docx.hs
index ffb079eee..a42dc31e9 100644
--- a/tests/Tests/Readers/Docx.hs
+++ b/tests/Tests/Readers/Docx.hs
@@ -82,6 +82,10 @@ tests = [ testGroup "inlines"
"normalizing inlines deep inside blocks"
"docx.deep_normalize.docx"
"docx.deep_normalize.native"
+ , testCompare
+ "move trailing spaces outside of formatting"
+ "docx.trailing_spaces_in_formatting.docx"
+ "docx.trailing_spaces_in_formatting.native"
]
, testGroup "blocks"
[ testCompare
diff --git a/tests/docx.trailing_spaces_in_formatting.docx b/tests/docx.trailing_spaces_in_formatting.docx
new file mode 100644
index 000000000..ebe7404a9
--- /dev/null
+++ b/tests/docx.trailing_spaces_in_formatting.docx
Binary files differ
diff --git a/tests/docx.trailing_spaces_in_formatting.native b/tests/docx.trailing_spaces_in_formatting.native
new file mode 100644
index 000000000..46ea9bca8
--- /dev/null
+++ b/tests/docx.trailing_spaces_in_formatting.native
@@ -0,0 +1 @@
+[Para [Str "Turn",Space,Str "my",Space,Emph [Str "formatting"],Space,Str "off",Space,Str "after",Space,Str "the",Space,Str "spaces."]]