From 4134dad500f6cb360a9ceb957e0c5ec2ec59b2a8 Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Sun, 10 Jul 2011 16:54:46 -0700 Subject: Make HTML reader more forgiving of bad HTML. * Skip spaces after inline tags such as <b>, <em>, etc. * Convert Plain elements into Para when they're in a list item with Para, Pre, BlockQuote, CodeBlock. An example of HTML that pandoc handles better now: ~~~~

Testing html to markdown

  • An item in a list

    An introductory sentence.

    Some preformatted text
    at this stage comes next.
    
    But alas! much havoc
    is wrought by Pandoc.
    
~~~~ Thanks to Dirk Laurie for reporting the issues. --- src/Text/Pandoc/Readers/HTML.hs | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) (limited to 'src') diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs index 18e3113d3..ba25d8ad8 100644 --- a/src/Text/Pandoc/Readers/HTML.hs +++ b/src/Text/Pandoc/Readers/HTML.hs @@ -107,7 +107,7 @@ pBulletList = try $ do -- treat it as a list item, though it's not valid xhtml... skipMany nonItem items <- manyTill (pInTags "li" block >>~ skipMany nonItem) (pCloses "ul") - return [BulletList items] + return [BulletList $ map fixPlains items] pOrderedList :: TagParser [Block] pOrderedList = try $ do @@ -138,7 +138,7 @@ pOrderedList = try $ do -- treat it as a list item, though it's not valid xhtml... skipMany nonItem items <- manyTill (pInTags "li" block >>~ skipMany nonItem) (pCloses "ol") - return [OrderedList (start, style, DefaultDelim) items] + return [OrderedList (start, style, DefaultDelim) $ map fixPlains items] pDefinitionList :: TagParser [Block] pDefinitionList = try $ do @@ -154,7 +154,19 @@ pDefListItem = try $ do defs <- many1 (try $ skipMany nonItem >> pInTags "dd" block) skipMany nonItem let term = intercalate [LineBreak] terms - return (term, defs) + return (term, map fixPlains defs) + +fixPlains :: [Block] -> [Block] +fixPlains bs = if any isParaish bs + then map plainToPara bs + else bs + where isParaish (Para _) = True + isParaish (CodeBlock _ _) = True + isParaish (Header _ _) = True + isParaish (BlockQuote _) = True + isParaish _ = False + plainToPara (Plain xs) = Para xs + plainToPara x = x pRawTag :: TagParser String pRawTag = do @@ -358,7 +370,7 @@ pInlinesInTags :: String -> ([Inline] -> Inline) -> TagParser [Inline] pInlinesInTags tagtype f = do contents <- pInTags tagtype inline - return [f contents] + return [f $ normalizeSpaces contents] pInTags :: String -> TagParser [a] -> TagParser [a] -- cgit v1.2.3