Make HTML reader more forgiving of bad HTML.

* Skip spaces after <b>, <emph>, etc.
* Convert Plain elements into Para when they're in a list item with Para, Pre, BlockQuote, CodeBlock.

An example of HTML that pandoc handles better now:

~~~~
<h4> Testing html to markdown </h4>
<ul>
<li>
<b> An item in a list </b>
<p>
An introductory sentence.
<pre>
Some preformatted text at this stage comes next. But alas! much havoc is wrought by Pandoc.
</pre>
</ul>
~~~~

Thanks to Dirk Laurie for reporting the issues.
author: John MacFarlane <jgm@berkeley.edu> 2011-07-10 16:54:46 -0700
committer: John MacFarlane <jgm@berkeley.edu> 2011-07-10 16:54:46 -0700
commit: 4134dad500f6cb360a9ceb957e0c5ec2ec59b2a8 (patch)
tree: 7740f80af4dc5cb4b195097a05abd405abbb2f6a /src/Text/Pandoc
parent: b5411c06aa5c4909cf10647e6ba0fe186cfa41f6 (diff)
download: pandoc-4134dad500f6cb360a9ceb957e0c5ec2ec59b2a8.tar.gz
1 files changed, 16 insertions, 4 deletions
diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs
index 18e3113d3..ba25d8ad8 100644
--- a/src/Text/Pandoc/Readers/HTML.hs
+++ b/src/Text/Pandoc/Readers/HTML.hs
@@ -107,7 +107,7 @@ pBulletList = try $ do
   -- treat it as a list item, though it's not valid xhtml...
   skipMany nonItem
   items <- manyTill (pInTags "li" block >>~ skipMany nonItem) (pCloses "ul")
-  return [BulletList items]
+  return [BulletList $ map fixPlains items]
 
 pOrderedList :: TagParser [Block]
 pOrderedList = try $ do
@@ -138,7 +138,7 @@ pOrderedList = try $ do
   -- treat it as a list item, though it's not valid xhtml...
   skipMany nonItem
   items <- manyTill (pInTags "li" block >>~ skipMany nonItem) (pCloses "ol")
-  return [OrderedList (start, style, DefaultDelim) items]
+  return [OrderedList (start, style, DefaultDelim) $ map fixPlains items]
 
 pDefinitionList :: TagParser [Block]
 pDefinitionList = try $ do
@@ -154,7 +154,19 @@ pDefListItem = try $ do
   defs  <- many1 (try $ skipMany nonItem >> pInTags "dd" block)
   skipMany nonItem
   let term = intercalate [LineBreak] terms
-  return (term, defs)
+  return (term, map fixPlains defs)
+
+fixPlains :: [Block] -> [Block]  -- normalizes the blocks of one list item / definition
+fixPlains bs = if any isParaish bs  -- does the list contain any paragraph-like block?
+                  then map plainToPara bs  -- yes: promote every Plain to Para
+                  else bs  -- no: return the blocks unchanged
+  where isParaish (Para _) = True
+        isParaish (CodeBlock _ _) = True
+        isParaish (Header _ _) = True
+        isParaish (BlockQuote _) = True
+        isParaish _        = False  -- every other constructor, Plain included
+        plainToPara (Plain xs) = Para xs  -- same inlines, rewrapped as a paragraph
+        plainToPara x = x  -- non-Plain blocks pass through untouched
 
 pRawTag :: TagParser String
 pRawTag = do
@@ -358,7 +370,7 @@ pInlinesInTags :: String -> ([Inline] -> Inline)
                -> TagParser [Inline]
 pInlinesInTags tagtype f = do
   contents <- pInTags tagtype inline
-  return [f contents]
+  return [f $ normalizeSpaces contents]
 
 pInTags :: String -> TagParser [a]
         -> TagParser [a]
author	John MacFarlane <jgm@berkeley.edu>	2011-07-10 16:54:46 -0700
committer	John MacFarlane <jgm@berkeley.edu>	2011-07-10 16:54:46 -0700
commit	4134dad500f6cb360a9ceb957e0c5ec2ec59b2a8 (patch)
tree	7740f80af4dc5cb4b195097a05abd405abbb2f6a /src/Text/Pandoc
parent	b5411c06aa5c4909cf10647e6ba0fe186cfa41f6 (diff)
download	pandoc-4134dad500f6cb360a9ceb957e0c5ec2ec59b2a8.tar.gz