From 9c7e0dc84b2384347099827999f0e2f7be4f7e51 Mon Sep 17 00:00:00 2001 From: Jesse Rosenthal Date: Sat, 21 Jun 2014 17:53:45 -0400 Subject: Implement new normalization. There were some problems with the old str normalization. This fixes those problems. Also, since it drills down on its own, it only needs to be mapped over the blocks, not walked over the tree. --- src/Text/Pandoc/Readers/Docx.hs | 68 ++++++++++++++++++++++++++++++++++------- 1 file changed, 57 insertions(+), 11 deletions(-) (limited to 'src') diff --git a/src/Text/Pandoc/Readers/Docx.hs b/src/Text/Pandoc/Readers/Docx.hs index 479a88161..299adf5a8 100644 --- a/src/Text/Pandoc/Readers/Docx.hs +++ b/src/Text/Pandoc/Readers/Docx.hs @@ -148,25 +148,71 @@ runElemsToString = concatMap runElemToString --- Text.Pandoc.Shared.normalize for reasons of efficiency. For --- whatever reason, `normalize` makes a run take almost twice as --- long. (It does more, but this does what we need) -strNormalize :: [Inline] -> [Inline] -strNormalize [] = [] -strNormalize (Str "" : ils) = strNormalize ils -strNormalize ((Str s) : (Str s') : l) = strNormalize ((Str (s++s')) : l) -strNormalize (il:ils) = il : (strNormalize ils) +inlineNormalize :: [Inline] -> [Inline] +inlineNormalize [] = [] +inlineNormalize (Str "" : ils) = inlineNormalize ils +inlineNormalize ((Str s) : (Str s') : l) = + inlineNormalize (Str (s++s') : l) +inlineNormalize ((Emph ils) : (Emph ils') : l) = + inlineNormalize $ (Emph $ inlineNormalize (ils ++ ils')) : l +inlineNormalize ((Emph ils) : l) = + Emph (inlineNormalize ils) : (inlineNormalize l) +inlineNormalize ((Strong ils) : (Strong ils') : l) = + inlineNormalize $ (Strong $ inlineNormalize (ils ++ ils')) : l +inlineNormalize ((Strong ils) : l) = + Strong (inlineNormalize ils) : (inlineNormalize l) +inlineNormalize ((Strikeout ils) : (Strikeout ils') : l) = + inlineNormalize $ (Strikeout $ inlineNormalize (ils ++ ils')) : l +inlineNormalize ((Strikeout ils) : l) = + Strikeout (inlineNormalize ils) : (inlineNormalize l) +inlineNormalize ((Superscript ils) : (Superscript ils') : l) = + inlineNormalize $ (Superscript $ inlineNormalize (ils ++ ils')) : l +inlineNormalize ((Superscript ils) : l) = + Superscript (inlineNormalize ils) : (inlineNormalize l) +inlineNormalize ((Subscript ils) : (Subscript ils') : l) = + inlineNormalize $ (Subscript $ inlineNormalize (ils ++ ils')) : l +inlineNormalize ((Subscript ils) : l) = + Subscript (inlineNormalize ils) : (inlineNormalize l) +inlineNormalize ((Space : Space : l)) = + inlineNormalize $ (Space : l) +inlineNormalize ((Quoted qt ils) : l) = + Quoted qt (inlineNormalize ils) : inlineNormalize l +inlineNormalize ((Cite cits ils) : l) = + let + f :: Citation -> Citation + f (Citation s pref suff mode num hash) = + Citation s (inlineNormalize pref) (inlineNormalize suff) mode num hash + in + Cite (map f cits) (inlineNormalize ils) : (inlineNormalize l) +inlineNormalize ((Link ils s) : l) = + Link (inlineNormalize ils) s : (inlineNormalize l) +inlineNormalize ((Image ils s) : l) = + Image (inlineNormalize ils) s : (inlineNormalize l) +inlineNormalize ((Note blks) : l) = + Note (map blockNormalize blks) : (inlineNormalize l) +inlineNormalize ((Span attr ils) : l) = + Span attr (inlineNormalize ils) : (inlineNormalize l) +inlineNormalize (il : l) = il : (inlineNormalize l) stripSpaces :: [Inline] -> [Inline] stripSpaces ils = reverse $ dropWhile (Space ==) $ reverse $ dropWhile (Space ==) ils blockNormalize :: Block -> Block -blockNormalize (Plain ils) = Plain $ bottomUp strNormalize $ stripSpaces ils -blockNormalize (Para ils) = Para $ bottomUp strNormalize $ stripSpaces ils +blockNormalize (Plain ils) = Plain $ stripSpaces $ inlineNormalize ils +blockNormalize (Para ils) = Para $ stripSpaces $ inlineNormalize ils blockNormalize (Header n attr ils) = - Header n attr $ bottomUp strNormalize $ stripSpaces ils + Header n attr $ stripSpaces $ inlineNormalize ils blockNormalize (Table ils align width hdr cells) = - Table (bottomUp strNormalize $ stripSpaces ils) align width hdr cells + Table (stripSpaces $ inlineNormalize ils) align width hdr cells blockNormalize (DefinitionList pairs) = - DefinitionList $ map (\(ils, blklsts) -> (bottomUp strNormalize (stripSpaces ils), blklsts)) pairs + DefinitionList $ map (\(ils, blklsts) -> (stripSpaces (inlineNormalize ils), (map (map blockNormalize) blklsts))) pairs +blockNormalize (BlockQuote blks) = BlockQuote (map blockNormalize blks) +blockNormalize (OrderedList attr blkslst) = + OrderedList attr $ map (\blks -> map blockNormalize blks) blkslst +blockNormalize (BulletList blkslst) = + BulletList $ map (\blks -> map blockNormalize blks) blkslst +blockNormalize (Div attr blks) = Div attr (map blockNormalize blks) blockNormalize blk = blk runToInlines :: ReaderOptions -> Docx -> Run -> [Inline] @@ -315,7 +361,7 @@ makeImagesSelfContained _ inline = inline bodyToBlocks :: ReaderOptions -> Docx -> Body -> [Block] bodyToBlocks opts docx (Body bps) = bottomUp removeEmptyPars $ - bottomUp blockNormalize $ + map blockNormalize $ bottomUp spanRemove $ bottomUp divRemove $ map (makeHeaderAnchors) $ -- cgit v1.2.3 From 8e5bd9d851aa0f60462015f61e3980b134e3c131 Mon Sep 17 00:00:00 2001 From: Jesse Rosenthal Date: Sun, 22 Jun 2014 01:47:11 -0400 Subject: Docx reader: Fix spacing in formatting. The normalizing tests revealed a problem with unformatted spaces, brought about by `spanTrim`. This fixes by not trimming the spaces out of spans until they are in their final form. --- src/Text/Pandoc/Readers/Docx.hs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'src') diff --git a/src/Text/Pandoc/Readers/Docx.hs b/src/Text/Pandoc/Readers/Docx.hs index 299adf5a8..09c2330fb 100644 --- a/src/Text/Pandoc/Readers/Docx.hs +++ b/src/Text/Pandoc/Readers/Docx.hs @@ -286,8 +286,8 @@ parPartsToInlines opts docx parparts = (if False -- TODO depend on option then bottomUp (makeImagesSelfContained docx) else id) $ - bottomUp spanCorrect $ bottomUp spanTrim $ + bottomUp spanCorrect $ bottomUp spanReduce $ concatMap (parPartToInlines opts docx) parparts -- cgit v1.2.3