From 07919e1b2270a906019575e4ce85590d6754d41c Mon Sep 17 00:00:00 2001 From: Albert Krewinkel Date: Thu, 26 Nov 2020 07:22:01 +0100 Subject: HTML reader: improve support for table headers, footer, attributes - `` elements are no longer added to the table body but used as table footer. - Separate `` elements are no longer combined into one. - Attributes on ``, ``, ``/``, and `` elements are preserved. --- src/Text/Pandoc/Readers/HTML/Table.hs | 238 ++++++++++++++++++++++++++-------- 1 file changed, 183 insertions(+), 55 deletions(-) (limited to 'src/Text/Pandoc/Readers/HTML/Table.hs') diff --git a/src/Text/Pandoc/Readers/HTML/Table.hs b/src/Text/Pandoc/Readers/HTML/Table.hs index e6d0a9097..e40d90221 100644 --- a/src/Text/Pandoc/Readers/HTML/Table.hs +++ b/src/Text/Pandoc/Readers/HTML/Table.hs @@ -1,3 +1,4 @@ +{-# LANGUAGE LambdaCase #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE ViewPatterns #-} {- | @@ -12,17 +13,19 @@ HTML table parser. -} -module Text.Pandoc.Readers.HTML.Table (pTable') where +module Text.Pandoc.Readers.HTML.Table (pTable) where -import Control.Monad (guard) +import Control.Applicative ((<|>)) import Data.Maybe (fromMaybe) import Data.Text (Text) import Text.HTML.TagSoup import Text.Pandoc.Builder (Blocks) +import Text.Pandoc.CSS (cssAttributes) import Text.Pandoc.Definition import Text.Pandoc.Class.PandocMonad (PandocMonad (..)) import Text.Pandoc.Parsing - ( (<|>), eof, many, many1, manyTill, option, optional, skipMany, try) + ( eof, lookAhead, many, many1, manyTill, option, optional + , optionMaybe, skipMany, try) import Text.Pandoc.Readers.HTML.Parsing import Text.Pandoc.Readers.HTML.Types (TagParser) import Text.Pandoc.Shared (onlySimpleTableCells, safeRead) @@ -57,58 +60,183 @@ pColgroup = try $ do skipMany pBlank manyTill pCol (pCloses "colgroup" <|> eof) <* skipMany pBlank --- | Parses a simple HTML table -pTable' :: PandocMonad m - => TagParser m Blocks -- ^ Caption parser - -> (Text -> TagParser m [Cell]) -- ^ Table cell parser - -> TagParser m Blocks -pTable' block pCell = try $ do - TagOpen _ attribs' <- pSatisfy (matchTagOpen "table" []) - let attribs = toAttr attribs' +pCell :: PandocMonad m + => TagParser m Blocks + -> Text + -> TagParser m [Cell] +pCell block celltype = try $ do + skipMany pBlank + TagOpen _ attribs <- lookAhead $ pSatisfy (matchTagOpen celltype []) + let cssAttribs = maybe [] cssAttributes $ lookup "style" attribs + let align = case lookup "align" attribs <|> + lookup "text-align" cssAttribs of + Just "left" -> AlignLeft + Just "right" -> AlignRight + Just "center" -> AlignCenter + _ -> AlignDefault + let rowspan = RowSpan . fromMaybe 1 $ + safeRead =<< lookup "rowspan" attribs + let colspan = ColSpan . fromMaybe 1 $ + safeRead =<< lookup "colspan" attribs + res <- pInTags celltype block + skipMany pBlank + let handledAttribs = ["align", "colspan", "rowspan", "text-align"] + attribs' = foldr go [] attribs + go kv@(k, _) acc = case k of + "style" -> case filter ((/= "text-align") . fst) cssAttribs of + [] -> acc + cs -> ("style", toStyleString cs) : acc + -- drop attrib if it's already handled + _ | k `elem` handledAttribs -> acc + _ -> kv : acc + return [B.cellWith (toAttr attribs') align rowspan colspan res] + +-- | Create a style attribute string from a list of CSS attributes +toStyleString :: [(Text, Text)] -> Text +toStyleString = T.intercalate "; " . map (\(k, v) -> k <> ": " <> v) + +data RowType + = HeaderCells + | AllCells + +-- | Parses a table row +pRow :: PandocMonad m + => TagParser m Blocks + -> RowType + -> TagParser m [B.Row] +pRow block rowType = try $ do + skipMany pBlank + case rowType of + HeaderCells -> do + maybeCells <- optionMaybe (pInTags "tr" (pCell block "th")) + return $ case maybeCells of + Nothing -> [] + Just cells -> [Row nullAttr cells] + AllCells -> do + cells <- pInTags "tr" (pCell block "td" <|> pCell block "th") + return [Row nullAttr cells] + +-- | Parses a table head +pTableHead :: PandocMonad m + => TagParser m Blocks + -> TagParser m TableHead +pTableHead block = try $ do + skipMany pBlank + (attribs, rows) <- pInTagWithAttribs ClosingTagOptional "thead" + (option [] $ pRow block AllCells) + <|> pInTagWithAttribs TagsOmittable "thead" + (pRow block HeaderCells) + let cells = concatMap (\(Row _ cs) -> cs) rows + if null cells + then TableHead nullAttr <$> + pInTag TagsOmittable "tbody" (pRow block HeaderCells) + else return $ TableHead (toAttr attribs) [Row nullAttr cells] + +-- | Parses a table foot +pTableFoot :: PandocMonad m + => TagParser m Blocks + -> TagParser m TableFoot +pTableFoot block = try $ do + skipMany pBlank + TagOpen _ attribs <- pSatisfy (matchTagOpen "tfoot" []) <* skipMany pBlank + rows <- mconcat <$> many (pRow block AllCells <* skipMany pBlank) + optional $ pSatisfy (matchTagClose "tfoot") + return $ TableFoot (toAttr attribs) rows + +-- | Parses a table body +pTableBody :: PandocMonad m + => TagParser m Blocks + -> TagParser m TableBody +pTableBody block = do skipMany pBlank - caption <- option mempty $ pInTags "caption" block <* skipMany pBlank - widths' <- (mconcat <$> many1 pColgroup) <|> many pCol - let pTh = option [] $ pInTags "tr" (pCell "th") - pTr = try $ skipMany pBlank - *> pInTags "tr" (pCell "td" <|> pCell "th") - pTBody = pInTag TagsOmittable "tbody" $ many1 pTr - head'' <- pInTag ClosingTagOptional "thead" (option [] pTr) - <|> pInTag TagsOmittable "thead" pTh - head' <- pInTag TagsOmittable "tbody" - (if null head'' then pTh else return head'') - topfoot <- option [] $ pInTag TagsRequired "tfoot" $ many pTr - rowsLs <- many pTBody - bottomfoot <- option [] $ pInTag ClosingTagOptional "tfoot" $ many pTr + (attribs, rows) <- pInTagWithAttribs TagsOmittable "tbody" + (mconcat <$> many1 (pRow block AllCells)) + return $ TableBody (toAttr attribs) 0 [] rows + + +-- | Parses a simple HTML table +pTable :: PandocMonad m + => TagParser m Blocks -- ^ Caption and cell contents parser + -> TagParser m Blocks +pTable block = try $ do + TagOpen _ attribs <- pSatisfy (matchTagOpen "table" []) <* skipMany pBlank + caption <- option mempty $ pInTags "caption" block <* skipMany pBlank + widths <- ((mconcat <$> many1 pColgroup) <|> many pCol) <* skipMany pBlank + thead <- pTableHead block <* skipMany pBlank + topfoot <- optionMaybe (pTableFoot block) <* skipMany pBlank + tbodies <- many (pTableBody block) <* skipMany pBlank + botfoot <- optionMaybe (pTableFoot block) <* skipMany pBlank TagClose _ <- pSatisfy (matchTagClose "table") - let rows = concat rowsLs <> topfoot <> bottomfoot - rows''' = map (map cellContents) rows + let tfoot = fromMaybe (TableFoot nullAttr []) $ topfoot <|> botfoot + case normalize widths thead tbodies tfoot of + Left err -> fail err + Right (colspecs, thead', tbodies', tfoot') -> return $ + B.tableWith (toAttr attribs) + (B.simpleCaption caption) + colspecs + thead' + tbodies' + tfoot' +data TableType + = SimpleTable + | NormalTable + +tableType :: [[Cell]] -> TableType +tableType cells = + if onlySimpleTableCells $ map (map cellContents) cells + then SimpleTable + else NormalTable + where + cellContents :: Cell -> [Block] + cellContents (Cell _ _ _ _ bs) = bs + +normalize :: [ColWidth] -> TableHead -> [TableBody] -> TableFoot + -> Either String ([ColSpec], TableHead, [TableBody], TableFoot) +normalize widths head' bodies foot = do + let rows = headRows head' <> concatMap bodyRows bodies <> footRows foot + let rowLength = length . rowCells + let ncols = maximum (map rowLength rows) + let tblType = tableType (map rowCells rows) -- fail on empty table - guard $ not $ null head' && null rows''' - let isSimple = onlySimpleTableCells $ - map cellContents head' : rows''' - let cols = if null head' - then maximum (map length rows''') - else length head' - let aligns = case rows of - (cs:_) -> take cols $ - concatMap cellAligns cs ++ repeat AlignDefault - _ -> replicate cols AlignDefault - let widths = if null widths' - then if isSimple - then replicate cols ColWidthDefault - else replicate cols (ColWidth (1.0 / fromIntegral cols)) - else widths' - let toRow = Row nullAttr - toHeaderRow l = [toRow l | not (null l)] - return $ B.tableWith attribs - (B.simpleCaption caption) - (zip aligns widths) - (TableHead nullAttr $ toHeaderRow head') - [TableBody nullAttr 0 [] $ map toRow rows] - (TableFoot nullAttr []) - -cellContents :: Cell -> [Block] -cellContents (Cell _ _ _ _ bs) = bs - -cellAligns :: Cell -> [Alignment] -cellAligns (Cell _ align _ (ColSpan cs) _) = replicate cs align + if null rows + then Left "empty table" + else Right + ( zip (calculateAlignments ncols bodies) + (normalizeColWidths ncols tblType widths) + , head' + , bodies + , foot + ) + +normalizeColWidths :: Int -> TableType -> [ColWidth] -> [ColWidth] +normalizeColWidths ncols tblType = \case + [] -> case tblType of + SimpleTable -> replicate ncols ColWidthDefault + NormalTable -> replicate ncols (ColWidth $ 1 / fromIntegral ncols) + widths -> widths + +calculateAlignments :: Int -> [TableBody] -> [Alignment] +calculateAlignments cols tbodies = + case cells of + cs:_ -> take cols $ concatMap cellAligns cs ++ repeat AlignDefault + _ -> replicate cols AlignDefault + where + cells :: [[Cell]] + cells = concatMap bodyRowCells tbodies + cellAligns :: Cell -> [Alignment] + cellAligns (Cell _ align _ (ColSpan cs) _) = replicate cs align + +bodyRowCells :: TableBody -> [[Cell]] +bodyRowCells = map rowCells . bodyRows + +headRows :: TableHead -> [B.Row] +headRows (TableHead _ rows) = rows + +bodyRows :: TableBody -> [B.Row] +bodyRows (TableBody _ _ headerRows bodyRows') = headerRows <> bodyRows' + +footRows :: TableFoot -> [B.Row] +footRows (TableFoot _ rows) = rows + +rowCells :: B.Row -> [Cell] +rowCells (Row _ cells) = cells -- cgit v1.2.3