aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
author: John MacFarlane <jgm@berkeley.edu> 2011-07-10 16:54:46 -0700
committer: John MacFarlane <jgm@berkeley.edu> 2011-07-10 16:54:46 -0700
commit: 4134dad500f6cb360a9ceb957e0c5ec2ec59b2a8 (patch)
tree: 7740f80af4dc5cb4b195097a05abd405abbb2f6a /src
parent: b5411c06aa5c4909cf10647e6ba0fe186cfa41f6 (diff)
download: pandoc-4134dad500f6cb360a9ceb957e0c5ec2ec59b2a8.tar.gz
Make HTML reader more forgiving of bad HTML.

* Skip spaces after <b>, <emph>, etc.
* Convert Plain elements into Para when they're in a list item with Para, Pre, BlockQuote, CodeBlock.

An example of HTML that pandoc handles better now:

~~~~
<h4> Testing html to markdown </h4>
<ul>
<li>
<b> An item in a list </b>
<p>
An introductory sentence.
<pre>
Some preformatted text at this stage comes next.
But alas! much havoc is wrought by Pandoc.
</pre>
</ul>
~~~~

Thanks to Dirk Laurie for reporting the issues.

Diffstat (limited to 'src')
-rw-r--r-- src/Text/Pandoc/Readers/HTML.hs 20
1 files changed, 16 insertions, 4 deletions
diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs
index 18e3113d3..ba25d8ad8 100644
--- a/src/Text/Pandoc/Readers/HTML.hs
+++ b/src/Text/Pandoc/Readers/HTML.hs
@@ -107,7 +107,7 @@ pBulletList = try $ do
-- treat it as a list item, though it's not valid xhtml...
skipMany nonItem
items <- manyTill (pInTags "li" block >>~ skipMany nonItem) (pCloses "ul")
- return [BulletList items]
+ return [BulletList $ map fixPlains items]
pOrderedList :: TagParser [Block]
pOrderedList = try $ do
@@ -138,7 +138,7 @@ pOrderedList = try $ do
-- treat it as a list item, though it's not valid xhtml...
skipMany nonItem
items <- manyTill (pInTags "li" block >>~ skipMany nonItem) (pCloses "ol")
- return [OrderedList (start, style, DefaultDelim) items]
+ return [OrderedList (start, style, DefaultDelim) $ map fixPlains items]
pDefinitionList :: TagParser [Block]
pDefinitionList = try $ do
@@ -154,7 +154,19 @@ pDefListItem = try $ do
defs <- many1 (try $ skipMany nonItem >> pInTags "dd" block)
skipMany nonItem
let term = intercalate [LineBreak] terms
- return (term, defs)
+ return (term, map fixPlains defs)
+
+fixPlains :: [Block] -> [Block]
+fixPlains bs = if any isParaish bs
+ then map plainToPara bs
+ else bs
+ where isParaish (Para _) = True
+ isParaish (CodeBlock _ _) = True
+ isParaish (Header _ _) = True
+ isParaish (BlockQuote _) = True
+ isParaish _ = False
+ plainToPara (Plain xs) = Para xs
+ plainToPara x = x
pRawTag :: TagParser String
pRawTag = do
@@ -358,7 +370,7 @@ pInlinesInTags :: String -> ([Inline] -> Inline)
-> TagParser [Inline]
pInlinesInTags tagtype f = do
contents <- pInTags tagtype inline
- return [f contents]
+ return [f $ normalizeSpaces contents]
pInTags :: String -> TagParser [a]
-> TagParser [a]