From 5222572033e12948de2786122532f3c589145fe1 Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Sat, 26 Nov 2016 22:28:28 +0100 Subject: HTML reader: improved table parsing. We now check explicitly for non-1 rowspan or colspan attributes, and fail when we encounter them. Previously we checked that each row had the same number of cells, but that could be true even with rowspans/colspans. And there are cases where it isn't true in tables that we can handle fine -- e.g. when a tr element is empty. So now we just pad rows with empty cells when needed. Closes #3027. --- src/Text/Pandoc/Readers/HTML.hs | 35 ++++++++++++++++++++++++----------- 1 file changed, 24 insertions(+), 11 deletions(-) (limited to 'src/Text/Pandoc/Readers') diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs index e2fc97fbf..d4360e521 100644 --- a/src/Text/Pandoc/Readers/HTML.hs +++ b/src/Text/Pandoc/Readers/HTML.hs @@ -435,17 +435,20 @@ pTable = try $ do rowsLs <- many pTBody rows' <- pOptInTag "tfoot" $ many pTr TagClose _ <- pSatisfy (~== TagClose "table") - let rows = (concat rowsLs) ++ rows' + let rows'' = (concat rowsLs) ++ rows' -- fail on empty table - guard $ not $ null head' && null rows + guard $ not $ null head' && null rows'' let isSinglePlain x = case B.toList x of [] -> True [Plain _] -> True _ -> False - let isSimple = all isSinglePlain $ concat (head':rows) - let cols = length $ if null head' then head rows else head' - -- fail if there are colspans or rowspans - guard $ all (\r -> length r == cols) rows + let isSimple = all isSinglePlain $ concat (head':rows'') + let cols = length $ if null head' then head rows'' else head' + -- add empty cells to short rows + let addEmpties r = case length r - cols of + n | n > 1 -> r ++ replicate n [] + | otherwise -> r + let rows = addEmpties rows'' let aligns = replicate cols AlignDefault let widths = if null widths' then if isSimple @@ -471,10 +474,17 @@ pColgroup = try $ do skipMany pBlank manyTill pCol (pCloses "colgroup" <|> eof) <* skipMany pBlank +noColOrRowSpans :: Tag String -> Bool +noColOrRowSpans t = isNullOrOne "colspan" && isNullOrOne "rowspan" + where isNullOrOne x = case fromAttrib x t of + "" -> True + "1" -> True + _ -> False + pCell :: String -> TagParser [Blocks] pCell celltype = try $ do skipMany pBlank - res <- pInTags celltype block + res <- pInTags' celltype noColOrRowSpans block skipMany pBlank return [res] @@ -695,10 +705,13 @@ pInlinesInTags :: String -> (Inlines -> Inlines) -> TagParser Inlines pInlinesInTags tagtype f = extractSpaces f <$> pInTags tagtype inline -pInTags :: (Monoid a) => String -> TagParser a - -> TagParser a -pInTags tagtype parser = try $ do - pSatisfy (~== TagOpen tagtype []) +pInTags :: (Monoid a) => String -> TagParser a -> TagParser a +pInTags tagtype parser = pInTags' tagtype (const True) parser + +pInTags' :: (Monoid a) => String -> (Tag String -> Bool) -> TagParser a + -> TagParser a +pInTags' tagtype tagtest parser = try $ do + pSatisfy (\t -> t ~== TagOpen tagtype [] && tagtest t) mconcat <$> manyTill parser (pCloses tagtype <|> eof) -- parses p, preceeded by an optional opening tag -- cgit v1.2.3