aboutsummaryrefslogtreecommitdiff
path: root/src/Text/Pandoc/Readers/HTML/Table.hs
diff options
context:
space:
mode:
authorAlbert Krewinkel <albert@zeitkraut.de>2020-11-26 07:22:01 +0100
committerAlbert Krewinkel <albert@zeitkraut.de>2020-11-26 07:22:01 +0100
commit07919e1b2270a906019575e4ce85590d6754d41c (patch)
treec04aa488cb17b00ac13b72a67c3ad4e0e8c3efd6 /src/Text/Pandoc/Readers/HTML/Table.hs
parent3e01ae405f9bf5f40e1b8e519029825aa4880602 (diff)
downloadpandoc-07919e1b2270a906019575e4ce85590d6754d41c.tar.gz
HTML reader: improve support for table headers, footer, attributes
- `<tfoot>` elements are no longer added to the table body but used as table footer. - Separate `<tbody>` elements are no longer combined into one. - Attributes on `<thead>`, `<tbody>`, `<th>`/`<td>`, and `<tfoot>` elements are preserved.
Diffstat (limited to 'src/Text/Pandoc/Readers/HTML/Table.hs')
-rw-r--r--src/Text/Pandoc/Readers/HTML/Table.hs238
1 files changed, 183 insertions, 55 deletions
diff --git a/src/Text/Pandoc/Readers/HTML/Table.hs b/src/Text/Pandoc/Readers/HTML/Table.hs
index e6d0a9097..e40d90221 100644
--- a/src/Text/Pandoc/Readers/HTML/Table.hs
+++ b/src/Text/Pandoc/Readers/HTML/Table.hs
@@ -1,3 +1,4 @@
+{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE ViewPatterns #-}
{- |
@@ -12,17 +13,19 @@
HTML table parser.
-}
-module Text.Pandoc.Readers.HTML.Table (pTable') where
+module Text.Pandoc.Readers.HTML.Table (pTable) where
-import Control.Monad (guard)
+import Control.Applicative ((<|>))
import Data.Maybe (fromMaybe)
import Data.Text (Text)
import Text.HTML.TagSoup
import Text.Pandoc.Builder (Blocks)
+import Text.Pandoc.CSS (cssAttributes)
import Text.Pandoc.Definition
import Text.Pandoc.Class.PandocMonad (PandocMonad (..))
import Text.Pandoc.Parsing
- ( (<|>), eof, many, many1, manyTill, option, optional, skipMany, try)
+ ( eof, lookAhead, many, many1, manyTill, option, optional
+ , optionMaybe, skipMany, try)
import Text.Pandoc.Readers.HTML.Parsing
import Text.Pandoc.Readers.HTML.Types (TagParser)
import Text.Pandoc.Shared (onlySimpleTableCells, safeRead)
@@ -57,58 +60,183 @@ pColgroup = try $ do
skipMany pBlank
manyTill pCol (pCloses "colgroup" <|> eof) <* skipMany pBlank
--- | Parses a simple HTML table
-pTable' :: PandocMonad m
- => TagParser m Blocks -- ^ Caption parser
- -> (Text -> TagParser m [Cell]) -- ^ Table cell parser
- -> TagParser m Blocks
-pTable' block pCell = try $ do
- TagOpen _ attribs' <- pSatisfy (matchTagOpen "table" [])
- let attribs = toAttr attribs'
+pCell :: PandocMonad m
+ => TagParser m Blocks
+ -> Text
+ -> TagParser m [Cell]
+pCell block celltype = try $ do
+ skipMany pBlank
+ TagOpen _ attribs <- lookAhead $ pSatisfy (matchTagOpen celltype [])
+ let cssAttribs = maybe [] cssAttributes $ lookup "style" attribs
+ let align = case lookup "align" attribs <|>
+ lookup "text-align" cssAttribs of
+ Just "left" -> AlignLeft
+ Just "right" -> AlignRight
+ Just "center" -> AlignCenter
+ _ -> AlignDefault
+ let rowspan = RowSpan . fromMaybe 1 $
+ safeRead =<< lookup "rowspan" attribs
+ let colspan = ColSpan . fromMaybe 1 $
+ safeRead =<< lookup "colspan" attribs
+ res <- pInTags celltype block
+ skipMany pBlank
+ let handledAttribs = ["align", "colspan", "rowspan", "text-align"]
+ attribs' = foldr go [] attribs
+ go kv@(k, _) acc = case k of
+ "style" -> case filter ((/= "text-align") . fst) cssAttribs of
+ [] -> acc
+ cs -> ("style", toStyleString cs) : acc
+ -- drop attrib if it's already handled
+ _ | k `elem` handledAttribs -> acc
+ _ -> kv : acc
+ return [B.cellWith (toAttr attribs') align rowspan colspan res]
+
+-- | Create a style attribute string from a list of CSS attributes
+toStyleString :: [(Text, Text)] -> Text
+toStyleString = T.intercalate "; " . map (\(k, v) -> k <> ": " <> v)
+
+data RowType
+ = HeaderCells
+ | AllCells
+
+-- | Parses a table row
+pRow :: PandocMonad m
+ => TagParser m Blocks
+ -> RowType
+ -> TagParser m [B.Row]
+pRow block rowType = try $ do
+ skipMany pBlank
+ case rowType of
+ HeaderCells -> do
+ maybeCells <- optionMaybe (pInTags "tr" (pCell block "th"))
+ return $ case maybeCells of
+ Nothing -> []
+ Just cells -> [Row nullAttr cells]
+ AllCells -> do
+ cells <- pInTags "tr" (pCell block "td" <|> pCell block "th")
+ return [Row nullAttr cells]
+
+-- | Parses a table head
+pTableHead :: PandocMonad m
+ => TagParser m Blocks
+ -> TagParser m TableHead
+pTableHead block = try $ do
+ skipMany pBlank
+ (attribs, rows) <- pInTagWithAttribs ClosingTagOptional "thead"
+ (option [] $ pRow block AllCells)
+ <|> pInTagWithAttribs TagsOmittable "thead"
+ (pRow block HeaderCells)
+ let cells = concatMap (\(Row _ cs) -> cs) rows
+ if null cells
+ then TableHead nullAttr <$>
+ pInTag TagsOmittable "tbody" (pRow block HeaderCells)
+ else return $ TableHead (toAttr attribs) [Row nullAttr cells]
+
+-- | Parses a table foot
+pTableFoot :: PandocMonad m
+ => TagParser m Blocks
+ -> TagParser m TableFoot
+pTableFoot block = try $ do
+ skipMany pBlank
+ TagOpen _ attribs <- pSatisfy (matchTagOpen "tfoot" []) <* skipMany pBlank
+ rows <- mconcat <$> many (pRow block AllCells <* skipMany pBlank)
+ optional $ pSatisfy (matchTagClose "tfoot")
+ return $ TableFoot (toAttr attribs) rows
+
+-- | Parses a table body
+pTableBody :: PandocMonad m
+ => TagParser m Blocks
+ -> TagParser m TableBody
+pTableBody block = do
skipMany pBlank
- caption <- option mempty $ pInTags "caption" block <* skipMany pBlank
- widths' <- (mconcat <$> many1 pColgroup) <|> many pCol
- let pTh = option [] $ pInTags "tr" (pCell "th")
- pTr = try $ skipMany pBlank
- *> pInTags "tr" (pCell "td" <|> pCell "th")
- pTBody = pInTag TagsOmittable "tbody" $ many1 pTr
- head'' <- pInTag ClosingTagOptional "thead" (option [] pTr)
- <|> pInTag TagsOmittable "thead" pTh
- head' <- pInTag TagsOmittable "tbody"
- (if null head'' then pTh else return head'')
- topfoot <- option [] $ pInTag TagsRequired "tfoot" $ many pTr
- rowsLs <- many pTBody
- bottomfoot <- option [] $ pInTag ClosingTagOptional "tfoot" $ many pTr
+ (attribs, rows) <- pInTagWithAttribs TagsOmittable "tbody"
+ (mconcat <$> many1 (pRow block AllCells))
+ return $ TableBody (toAttr attribs) 0 [] rows
+
+
+-- | Parses a simple HTML table
+pTable :: PandocMonad m
+ => TagParser m Blocks -- ^ Caption and cell contents parser
+ -> TagParser m Blocks
+pTable block = try $ do
+ TagOpen _ attribs <- pSatisfy (matchTagOpen "table" []) <* skipMany pBlank
+ caption <- option mempty $ pInTags "caption" block <* skipMany pBlank
+ widths <- ((mconcat <$> many1 pColgroup) <|> many pCol) <* skipMany pBlank
+ thead <- pTableHead block <* skipMany pBlank
+ topfoot <- optionMaybe (pTableFoot block) <* skipMany pBlank
+ tbodies <- many (pTableBody block) <* skipMany pBlank
+ botfoot <- optionMaybe (pTableFoot block) <* skipMany pBlank
TagClose _ <- pSatisfy (matchTagClose "table")
- let rows = concat rowsLs <> topfoot <> bottomfoot
- rows''' = map (map cellContents) rows
+ let tfoot = fromMaybe (TableFoot nullAttr []) $ topfoot <|> botfoot
+ case normalize widths thead tbodies tfoot of
+ Left err -> fail err
+ Right (colspecs, thead', tbodies', tfoot') -> return $
+ B.tableWith (toAttr attribs)
+ (B.simpleCaption caption)
+ colspecs
+ thead'
+ tbodies'
+ tfoot'
+data TableType
+ = SimpleTable
+ | NormalTable
+
+tableType :: [[Cell]] -> TableType
+tableType cells =
+ if onlySimpleTableCells $ map (map cellContents) cells
+ then SimpleTable
+ else NormalTable
+ where
+ cellContents :: Cell -> [Block]
+ cellContents (Cell _ _ _ _ bs) = bs
+
+normalize :: [ColWidth] -> TableHead -> [TableBody] -> TableFoot
+ -> Either String ([ColSpec], TableHead, [TableBody], TableFoot)
+normalize widths head' bodies foot = do
+ let rows = headRows head' <> concatMap bodyRows bodies <> footRows foot
+ let rowLength = length . rowCells
+ let ncols = maximum (map rowLength rows)
+ let tblType = tableType (map rowCells rows)
-- fail on empty table
- guard $ not $ null head' && null rows'''
- let isSimple = onlySimpleTableCells $
- map cellContents head' : rows'''
- let cols = if null head'
- then maximum (map length rows''')
- else length head'
- let aligns = case rows of
- (cs:_) -> take cols $
- concatMap cellAligns cs ++ repeat AlignDefault
- _ -> replicate cols AlignDefault
- let widths = if null widths'
- then if isSimple
- then replicate cols ColWidthDefault
- else replicate cols (ColWidth (1.0 / fromIntegral cols))
- else widths'
- let toRow = Row nullAttr
- toHeaderRow l = [toRow l | not (null l)]
- return $ B.tableWith attribs
- (B.simpleCaption caption)
- (zip aligns widths)
- (TableHead nullAttr $ toHeaderRow head')
- [TableBody nullAttr 0 [] $ map toRow rows]
- (TableFoot nullAttr [])
-
-cellContents :: Cell -> [Block]
-cellContents (Cell _ _ _ _ bs) = bs
-
-cellAligns :: Cell -> [Alignment]
-cellAligns (Cell _ align _ (ColSpan cs) _) = replicate cs align
+ if null rows
+ then Left "empty table"
+ else Right
+ ( zip (calculateAlignments ncols bodies)
+ (normalizeColWidths ncols tblType widths)
+ , head'
+ , bodies
+ , foot
+ )
+
+normalizeColWidths :: Int -> TableType -> [ColWidth] -> [ColWidth]
+normalizeColWidths ncols tblType = \case
+ [] -> case tblType of
+ SimpleTable -> replicate ncols ColWidthDefault
+ NormalTable -> replicate ncols (ColWidth $ 1 / fromIntegral ncols)
+ widths -> widths
+
+calculateAlignments :: Int -> [TableBody] -> [Alignment]
+calculateAlignments cols tbodies =
+ case cells of
+ cs:_ -> take cols $ concatMap cellAligns cs ++ repeat AlignDefault
+ _ -> replicate cols AlignDefault
+ where
+ cells :: [[Cell]]
+ cells = concatMap bodyRowCells tbodies
+ cellAligns :: Cell -> [Alignment]
+ cellAligns (Cell _ align _ (ColSpan cs) _) = replicate cs align
+
+bodyRowCells :: TableBody -> [[Cell]]
+bodyRowCells = map rowCells . bodyRows
+
+headRows :: TableHead -> [B.Row]
+headRows (TableHead _ rows) = rows
+
+bodyRows :: TableBody -> [B.Row]
+bodyRows (TableBody _ _ headerRows bodyRows') = headerRows <> bodyRows'
+
+footRows :: TableFoot -> [B.Row]
+footRows (TableFoot _ rows) = rows
+
+rowCells :: B.Row -> [Cell]
+rowCells (Row _ cells) = cells