aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJohn MacFarlane <jgm@berkeley.edu>2014-06-21 23:14:03 -0700
committerJohn MacFarlane <jgm@berkeley.edu>2014-06-21 23:14:03 -0700
commitcab16024fcba447f64b0254f54f3e0b4dd93bb99 (patch)
tree8318fa059ee7adb1103d0de5792a116b30b23502
parent08fe16e9720a3a191caf095d48e0a6c454039bf9 (diff)
parented43513087b514a5240fde04784dbf8709182513 (diff)
downloadpandoc-cab16024fcba447f64b0254f54f3e0b4dd93bb99.tar.gz
Merge pull request #1363 from jkr/newNormalize
Improve normalization
-rw-r--r--src/Text/Pandoc/Readers/Docx.hs70
-rw-r--r--tests/Tests/Readers/Docx.hs8
-rw-r--r--tests/docx.deep_normalize.docxbin0 -> 29246 bytes
-rw-r--r--tests/docx.deep_normalize.native6
-rw-r--r--tests/docx.normalize.docxbin0 -> 25994 bytes
-rw-r--r--tests/docx.normalize.native2
6 files changed, 74 insertions, 12 deletions
diff --git a/src/Text/Pandoc/Readers/Docx.hs b/src/Text/Pandoc/Readers/Docx.hs
index 479a88161..09c2330fb 100644
--- a/src/Text/Pandoc/Readers/Docx.hs
+++ b/src/Text/Pandoc/Readers/Docx.hs
@@ -148,25 +148,71 @@ runElemsToString = concatMap runElemToString
--- Text.Pandoc.Shared.normalize for reasons of efficiency. For
--- whatever reason, `normalize` makes a run take almost twice as
--- long. (It does more, but this does what we need)
-strNormalize :: [Inline] -> [Inline]
-strNormalize [] = []
-strNormalize (Str "" : ils) = strNormalize ils
-strNormalize ((Str s) : (Str s') : l) = strNormalize ((Str (s++s')) : l)
-strNormalize (il:ils) = il : (strNormalize ils)
+inlineNormalize :: [Inline] -> [Inline]
+inlineNormalize [] = []
+inlineNormalize (Str "" : ils) = inlineNormalize ils
+inlineNormalize ((Str s) : (Str s') : l) =
+ inlineNormalize (Str (s++s') : l)
+inlineNormalize ((Emph ils) : (Emph ils') : l) =
+ inlineNormalize $ (Emph $ inlineNormalize (ils ++ ils')) : l
+inlineNormalize ((Emph ils) : l) =
+ Emph (inlineNormalize ils) : (inlineNormalize l)
+inlineNormalize ((Strong ils) : (Strong ils') : l) =
+ inlineNormalize $ (Strong $ inlineNormalize (ils ++ ils')) : l
+inlineNormalize ((Strong ils) : l) =
+ Strong (inlineNormalize ils) : (inlineNormalize l)
+inlineNormalize ((Strikeout ils) : (Strikeout ils') : l) =
+ inlineNormalize $ (Strikeout $ inlineNormalize (ils ++ ils')) : l
+inlineNormalize ((Strikeout ils) : l) =
+ Strikeout (inlineNormalize ils) : (inlineNormalize l)
+inlineNormalize ((Superscript ils) : (Superscript ils') : l) =
+ inlineNormalize $ (Superscript $ inlineNormalize (ils ++ ils')) : l
+inlineNormalize ((Superscript ils) : l) =
+ Superscript (inlineNormalize ils) : (inlineNormalize l)
+inlineNormalize ((Subscript ils) : (Subscript ils') : l) =
+ inlineNormalize $ (Subscript $ inlineNormalize (ils ++ ils')) : l
+inlineNormalize ((Subscript ils) : l) =
+ Subscript (inlineNormalize ils) : (inlineNormalize l)
+inlineNormalize ((Space : Space : l)) =
+ inlineNormalize $ (Space : l)
+inlineNormalize ((Quoted qt ils) : l) =
+ Quoted qt (inlineNormalize ils) : inlineNormalize l
+inlineNormalize ((Cite cits ils) : l) =
+ let
+ f :: Citation -> Citation
+ f (Citation s pref suff mode num hash) =
+ Citation s (inlineNormalize pref) (inlineNormalize suff) mode num hash
+ in
+ Cite (map f cits) (inlineNormalize ils) : (inlineNormalize l)
+inlineNormalize ((Link ils s) : l) =
+ Link (inlineNormalize ils) s : (inlineNormalize l)
+inlineNormalize ((Image ils s) : l) =
+ Image (inlineNormalize ils) s : (inlineNormalize l)
+inlineNormalize ((Note blks) : l) =
+ Note (map blockNormalize blks) : (inlineNormalize l)
+inlineNormalize ((Span attr ils) : l) =
+ Span attr (inlineNormalize ils) : (inlineNormalize l)
+inlineNormalize (il : l) = il : (inlineNormalize l)
stripSpaces :: [Inline] -> [Inline]
stripSpaces ils =
reverse $ dropWhile (Space ==) $ reverse $ dropWhile (Space ==) ils
blockNormalize :: Block -> Block
-blockNormalize (Plain ils) = Plain $ bottomUp strNormalize $ stripSpaces ils
-blockNormalize (Para ils) = Para $ bottomUp strNormalize $ stripSpaces ils
+blockNormalize (Plain ils) = Plain $ stripSpaces $ inlineNormalize ils
+blockNormalize (Para ils) = Para $ stripSpaces $ inlineNormalize ils
blockNormalize (Header n attr ils) =
- Header n attr $ bottomUp strNormalize $ stripSpaces ils
+ Header n attr $ stripSpaces $ inlineNormalize ils
blockNormalize (Table ils align width hdr cells) =
- Table (bottomUp strNormalize $ stripSpaces ils) align width hdr cells
+ Table (stripSpaces $ inlineNormalize ils) align width hdr cells
blockNormalize (DefinitionList pairs) =
- DefinitionList $ map (\(ils, blklsts) -> (bottomUp strNormalize (stripSpaces ils), blklsts)) pairs
+ DefinitionList $ map (\(ils, blklsts) -> (stripSpaces (inlineNormalize ils), (map (map blockNormalize) blklsts))) pairs
+blockNormalize (BlockQuote blks) = BlockQuote (map blockNormalize blks)
+blockNormalize (OrderedList attr blkslst) =
+ OrderedList attr $ map (\blks -> map blockNormalize blks) blkslst
+blockNormalize (BulletList blkslst) =
+ BulletList $ map (\blks -> map blockNormalize blks) blkslst
+blockNormalize (Div attr blks) = Div attr (map blockNormalize blks)
blockNormalize blk = blk
runToInlines :: ReaderOptions -> Docx -> Run -> [Inline]
@@ -240,8 +286,8 @@ parPartsToInlines opts docx parparts =
(if False -- TODO depend on option
then bottomUp (makeImagesSelfContained docx)
else id) $
- bottomUp spanCorrect $
bottomUp spanTrim $
+ bottomUp spanCorrect $
bottomUp spanReduce $
concatMap (parPartToInlines opts docx) parparts
@@ -315,7 +361,7 @@ makeImagesSelfContained _ inline = inline
bodyToBlocks :: ReaderOptions -> Docx -> Body -> [Block]
bodyToBlocks opts docx (Body bps) =
bottomUp removeEmptyPars $
- bottomUp blockNormalize $
+ map blockNormalize $
bottomUp spanRemove $
bottomUp divRemove $
map (makeHeaderAnchors) $
diff --git a/tests/Tests/Readers/Docx.hs b/tests/Tests/Readers/Docx.hs
index e8fa33241..ffb079eee 100644
--- a/tests/Tests/Readers/Docx.hs
+++ b/tests/Tests/Readers/Docx.hs
@@ -74,6 +74,14 @@ tests = [ testGroup "inlines"
"literal tabs"
"docx.tabs.docx"
"docx.tabs.native"
+ , testCompare
+ "normalizing inlines"
+ "docx.normalize.docx"
+ "docx.normalize.native"
+ , testCompare
+ "normalizing inlines deep inside blocks"
+ "docx.deep_normalize.docx"
+ "docx.deep_normalize.native"
]
, testGroup "blocks"
[ testCompare
diff --git a/tests/docx.deep_normalize.docx b/tests/docx.deep_normalize.docx
new file mode 100644
index 000000000..7626d59ce
--- /dev/null
+++ b/tests/docx.deep_normalize.docx
Binary files differ
diff --git a/tests/docx.deep_normalize.native b/tests/docx.deep_normalize.native
new file mode 100644
index 000000000..9b2089ec8
--- /dev/null
+++ b/tests/docx.deep_normalize.native
@@ -0,0 +1,6 @@
+[OrderedList (1,Decimal,OneParen)
+ [[Para [Str "This",Space,Str "is",Space,Str "at",Space,Str "the",Space,Str "first",Space,Str "level"]
+ ,OrderedList (1,LowerAlpha,DefaultDelim)
+ [[Para [Str "This",Space,Str "is",Space,Str "at",Space,Str "the",Space,Str "second",Space,Str "level"]
+ ,OrderedList (1,LowerRoman,DefaultDelim)
+ [[Para [Str "This",Space,Str "is",Space,Emph [Str "at",Space,Strong [Str "the",Space,Str "third",Space,Str "level"],Str ",",Space,Str "and",Space,Str "I",Space,Str "want",Space,Str "to"],Space,Str "test",Space,Str "normalization",Space,Str "here."]]]]]]]]
diff --git a/tests/docx.normalize.docx b/tests/docx.normalize.docx
new file mode 100644
index 000000000..5e4370a47
--- /dev/null
+++ b/tests/docx.normalize.docx
Binary files differ
diff --git a/tests/docx.normalize.native b/tests/docx.normalize.native
new file mode 100644
index 000000000..aeba672c4
--- /dev/null
+++ b/tests/docx.normalize.native
@@ -0,0 +1,2 @@
+[Para [Str "These",Space,Str "are",Space,Str "different",Space,Str "fonts."]
+,Para [Strong [Str "These",Space,Emph [Str "are",Space,Strikeout [Str "different"]],Space,Str "fonts."]]]