diff options
author | John MacFarlane <jgm@berkeley.edu> | 2014-06-21 23:14:03 -0700 |
---|---|---|
committer | John MacFarlane <jgm@berkeley.edu> | 2014-06-21 23:14:03 -0700 |
commit | cab16024fcba447f64b0254f54f3e0b4dd93bb99 (patch) | |
tree | 8318fa059ee7adb1103d0de5792a116b30b23502 | |
parent | 08fe16e9720a3a191caf095d48e0a6c454039bf9 (diff) | |
parent | ed43513087b514a5240fde04784dbf8709182513 (diff) | |
download | pandoc-cab16024fcba447f64b0254f54f3e0b4dd93bb99.tar.gz |
Merge pull request #1363 from jkr/newNormalize
Improve normalization
-rw-r--r-- | src/Text/Pandoc/Readers/Docx.hs | 70 |
-rw-r--r-- | tests/Tests/Readers/Docx.hs | 8 |
-rw-r--r-- | tests/docx.deep_normalize.docx | bin | 0 -> 29246 bytes |
-rw-r--r-- | tests/docx.deep_normalize.native | 6 |
-rw-r--r-- | tests/docx.normalize.docx | bin | 0 -> 25994 bytes |
-rw-r--r-- | tests/docx.normalize.native | 2 |
6 files changed, 74 insertions, 12 deletions
diff --git a/src/Text/Pandoc/Readers/Docx.hs b/src/Text/Pandoc/Readers/Docx.hs index 479a88161..09c2330fb 100644 --- a/src/Text/Pandoc/Readers/Docx.hs +++ b/src/Text/Pandoc/Readers/Docx.hs @@ -148,25 +148,71 @@ runElemsToString = concatMap runElemToString --- Text.Pandoc.Shared.normalize for reasons of efficiency. For --- whatever reason, `normalize` makes a run take almost twice as --- long. (It does more, but this does what we need) -strNormalize :: [Inline] -> [Inline] -strNormalize [] = [] -strNormalize (Str "" : ils) = strNormalize ils -strNormalize ((Str s) : (Str s') : l) = strNormalize ((Str (s++s')) : l) -strNormalize (il:ils) = il : (strNormalize ils) +inlineNormalize :: [Inline] -> [Inline] +inlineNormalize [] = [] +inlineNormalize (Str "" : ils) = inlineNormalize ils +inlineNormalize ((Str s) : (Str s') : l) = + inlineNormalize (Str (s++s') : l) +inlineNormalize ((Emph ils) : (Emph ils') : l) = + inlineNormalize $ (Emph $ inlineNormalize (ils ++ ils')) : l +inlineNormalize ((Emph ils) : l) = + Emph (inlineNormalize ils) : (inlineNormalize l) +inlineNormalize ((Strong ils) : (Strong ils') : l) = + inlineNormalize $ (Strong $ inlineNormalize (ils ++ ils')) : l +inlineNormalize ((Strong ils) : l) = + Strong (inlineNormalize ils) : (inlineNormalize l) +inlineNormalize ((Strikeout ils) : (Strikeout ils') : l) = + inlineNormalize $ (Strikeout $ inlineNormalize (ils ++ ils')) : l +inlineNormalize ((Strikeout ils) : l) = + Strikeout (inlineNormalize ils) : (inlineNormalize l) +inlineNormalize ((Superscript ils) : (Superscript ils') : l) = + inlineNormalize $ (Superscript $ inlineNormalize (ils ++ ils')) : l +inlineNormalize ((Superscript ils) : l) = + Superscript (inlineNormalize ils) : (inlineNormalize l) +inlineNormalize ((Subscript ils) : (Subscript ils') : l) = + inlineNormalize $ (Subscript $ inlineNormalize (ils ++ ils')) : l +inlineNormalize ((Subscript ils) : l) = + Subscript (inlineNormalize ils) : (inlineNormalize l) +inlineNormalize ((Space : Space : 
l)) = + inlineNormalize $ (Space : l) +inlineNormalize ((Quoted qt ils) : l) = + Quoted qt (inlineNormalize ils) : inlineNormalize l +inlineNormalize ((Cite cits ils) : l) = + let + f :: Citation -> Citation + f (Citation s pref suff mode num hash) = + Citation s (inlineNormalize pref) (inlineNormalize suff) mode num hash + in + Cite (map f cits) (inlineNormalize ils) : (inlineNormalize l) +inlineNormalize ((Link ils s) : l) = + Link (inlineNormalize ils) s : (inlineNormalize l) +inlineNormalize ((Image ils s) : l) = + Image (inlineNormalize ils) s : (inlineNormalize l) +inlineNormalize ((Note blks) : l) = + Note (map blockNormalize blks) : (inlineNormalize l) +inlineNormalize ((Span attr ils) : l) = + Span attr (inlineNormalize ils) : (inlineNormalize l) +inlineNormalize (il : l) = il : (inlineNormalize l) stripSpaces :: [Inline] -> [Inline] stripSpaces ils = reverse $ dropWhile (Space ==) $ reverse $ dropWhile (Space ==) ils blockNormalize :: Block -> Block -blockNormalize (Plain ils) = Plain $ bottomUp strNormalize $ stripSpaces ils -blockNormalize (Para ils) = Para $ bottomUp strNormalize $ stripSpaces ils +blockNormalize (Plain ils) = Plain $ stripSpaces $ inlineNormalize ils +blockNormalize (Para ils) = Para $ stripSpaces $ inlineNormalize ils blockNormalize (Header n attr ils) = - Header n attr $ bottomUp strNormalize $ stripSpaces ils + Header n attr $ stripSpaces $ inlineNormalize ils blockNormalize (Table ils align width hdr cells) = - Table (bottomUp strNormalize $ stripSpaces ils) align width hdr cells + Table (stripSpaces $ inlineNormalize ils) align width hdr cells blockNormalize (DefinitionList pairs) = - DefinitionList $ map (\(ils, blklsts) -> (bottomUp strNormalize (stripSpaces ils), blklsts)) pairs + DefinitionList $ map (\(ils, blklsts) -> (stripSpaces (inlineNormalize ils), (map (map blockNormalize) blklsts))) pairs +blockNormalize (BlockQuote blks) = BlockQuote (map blockNormalize blks) +blockNormalize (OrderedList attr blkslst) = + 
OrderedList attr $ map (\blks -> map blockNormalize blks) blkslst +blockNormalize (BulletList blkslst) = + BulletList $ map (\blks -> map blockNormalize blks) blkslst +blockNormalize (Div attr blks) = Div attr (map blockNormalize blks) blockNormalize blk = blk runToInlines :: ReaderOptions -> Docx -> Run -> [Inline] @@ -240,8 +286,8 @@ parPartsToInlines opts docx parparts = (if False -- TODO depend on option then bottomUp (makeImagesSelfContained docx) else id) $ - bottomUp spanCorrect $ bottomUp spanTrim $ + bottomUp spanCorrect $ bottomUp spanReduce $ concatMap (parPartToInlines opts docx) parparts @@ -315,7 +361,7 @@ makeImagesSelfContained _ inline = inline bodyToBlocks :: ReaderOptions -> Docx -> Body -> [Block] bodyToBlocks opts docx (Body bps) = bottomUp removeEmptyPars $ - bottomUp blockNormalize $ + map blockNormalize $ bottomUp spanRemove $ bottomUp divRemove $ map (makeHeaderAnchors) $ diff --git a/tests/Tests/Readers/Docx.hs b/tests/Tests/Readers/Docx.hs index e8fa33241..ffb079eee 100644 --- a/tests/Tests/Readers/Docx.hs +++ b/tests/Tests/Readers/Docx.hs @@ -74,6 +74,14 @@ tests = [ testGroup "inlines" "literal tabs" "docx.tabs.docx" "docx.tabs.native" + , testCompare + "normalizing inlines" + "docx.normalize.docx" + "docx.normalize.native" + , testCompare + "normalizing inlines deep inside blocks" + "docx.deep_normalize.docx" + "docx.deep_normalize.native" ] , testGroup "blocks" [ testCompare diff --git a/tests/docx.deep_normalize.docx b/tests/docx.deep_normalize.docx Binary files differ new file mode 100644 index 000000000..7626d59ce --- /dev/null +++ b/tests/docx.deep_normalize.docx diff --git a/tests/docx.deep_normalize.native b/tests/docx.deep_normalize.native new file mode 100644 index 000000000..9b2089ec8 --- /dev/null +++ b/tests/docx.deep_normalize.native @@ -0,0 +1,6 @@ +[OrderedList (1,Decimal,OneParen) + [[Para [Str "This",Space,Str "is",Space,Str "at",Space,Str "the",Space,Str "first",Space,Str "level"] + ,OrderedList 
(1,LowerAlpha,DefaultDelim) + [[Para [Str "This",Space,Str "is",Space,Str "at",Space,Str "the",Space,Str "second",Space,Str "level"] + ,OrderedList (1,LowerRoman,DefaultDelim) + [[Para [Str "This",Space,Str "is",Space,Emph [Str "at",Space,Strong [Str "the",Space,Str "third",Space,Str "level"],Str ",",Space,Str "and",Space,Str "I",Space,Str "want",Space,Str "to"],Space,Str "test",Space,Str "normalization",Space,Str "here."]]]]]]]] diff --git a/tests/docx.normalize.docx b/tests/docx.normalize.docx Binary files differ new file mode 100644 index 000000000..5e4370a47 --- /dev/null +++ b/tests/docx.normalize.docx diff --git a/tests/docx.normalize.native b/tests/docx.normalize.native new file mode 100644 index 000000000..aeba672c4 --- /dev/null +++ b/tests/docx.normalize.native @@ -0,0 +1,2 @@ +[Para [Str "These",Space,Str "are",Space,Str "different",Space,Str "fonts."] +,Para [Strong [Str "These",Space,Emph [Str "are",Space,Strikeout [Str "different"]],Space,Str "fonts."]]] |