aboutsummaryrefslogtreecommitdiff
path: root/src/Text/Pandoc/Readers/HTML
diff options
context:
space:
mode:
authorAlbert Krewinkel <albert@zeitkraut.de>2020-11-26 07:22:01 +0100
committerAlbert Krewinkel <albert@zeitkraut.de>2020-11-26 07:22:01 +0100
commit07919e1b2270a906019575e4ce85590d6754d41c (patch)
treec04aa488cb17b00ac13b72a67c3ad4e0e8c3efd6 /src/Text/Pandoc/Readers/HTML
parent3e01ae405f9bf5f40e1b8e519029825aa4880602 (diff)
downloadpandoc-07919e1b2270a906019575e4ce85590d6754d41c.tar.gz
HTML reader: improve support for table headers, footer, attributes
- `<tfoot>` elements are no longer added to the table body but used as table footer. - Separate `<tbody>` elements are no longer combined into one. - Attributes on `<thead>`, `<tbody>`, `<th>`/`<td>`, and `<tfoot>` elements are preserved.
Diffstat (limited to 'src/Text/Pandoc/Readers/HTML')
-rw-r--r--src/Text/Pandoc/Readers/HTML/Parsing.hs47
-rw-r--r--src/Text/Pandoc/Readers/HTML/Table.hs238
2 files changed, 219 insertions, 66 deletions
diff --git a/src/Text/Pandoc/Readers/HTML/Parsing.hs b/src/Text/Pandoc/Readers/HTML/Parsing.hs
index 8788a933e..2d58319da 100644
--- a/src/Text/Pandoc/Readers/HTML/Parsing.hs
+++ b/src/Text/Pandoc/Readers/HTML/Parsing.hs
@@ -1,3 +1,4 @@
+{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE OverloadedStrings #-}
{- |
Module : Text.Pandoc.Readers.HTML.Parsing
@@ -15,6 +16,7 @@ module Text.Pandoc.Readers.HTML.Parsing
, pInTags
, pInTags'
, pInTag
+ , pInTagWithAttribs
, pAny
, pCloses
, pSatisfy
@@ -22,6 +24,7 @@ module Text.Pandoc.Readers.HTML.Parsing
, matchTagClose
, matchTagOpen
, isSpace
+ , maybeFromAttrib
, toAttr
, toStringAttr
)
@@ -31,11 +34,11 @@ import Control.Monad (guard, void, mzero)
import Data.Maybe (fromMaybe)
import Data.Text (Text)
import Text.HTML.TagSoup
- ( Tag (..), (~==), isTagText, isTagPosition, isTagOpen, isTagClose )
+ ( Attribute, Tag (..), isTagText, isTagPosition, isTagOpen, isTagClose, (~==) )
import Text.Pandoc.Class.PandocMonad (PandocMonad (..))
import Text.Pandoc.Definition (Attr)
import Text.Pandoc.Parsing
- ( (<|>), eof, getPosition, lookAhead, manyTill, newPos, optional
+ ( (<|>), eof, getPosition, lookAhead, manyTill, newPos, option, optional
, skipMany, setPosition, token, try)
import Text.Pandoc.Readers.HTML.TagCategories
import Text.Pandoc.Readers.HTML.Types
@@ -60,25 +63,41 @@ pInTags' :: (PandocMonad m, Monoid a)
-> TagParser m a
-> TagParser m a
pInTags' tagtype tagtest parser = try $ do
- pSatisfy (\t -> t ~== TagOpen tagtype [] && tagtest t)
+ pSatisfy $ \t -> matchTagOpen tagtype [] t && tagtest t
mconcat <$> manyTill parser (pCloses tagtype <|> eof)
--- parses p, preceded by an opening tag (optional if tagsOptional)
--- and followed by a closing tag (optional if tagsOptional)
-pInTag :: PandocMonad m => TagOmission -> Text -> TagParser m a -> TagParser m a
-pInTag tagOmission tagtype p = try $ do
- skipMany pBlank
+pInTag :: PandocMonad m
+ => TagOmission -- ^ Whether some tags can be omitted
+ -> Text -- ^ @tagtype@ Tag name
+ -> TagParser m a -- ^ @p@ Content parser
+ -> TagParser m a
+pInTag tagOmission tagtype = fmap snd . pInTagWithAttribs tagOmission tagtype
+
+-- | Returns the contents of a tag together with its attributes; parses
+-- @p@, preceded by an opening tag (optional if TagsOmittable) and
+-- followed by a closing tag (optional unless TagsRequired).
+pInTagWithAttribs :: PandocMonad m
+ => TagOmission -- ^ Whether some tags can be omitted
+ -> Text -- ^ @tagtype@ Tag name
+ -> TagParser m a -- ^ @p@ Content parser
+ -> TagParser m ([Attribute Text], a)
+pInTagWithAttribs tagOmission tagtype p = try $ do
let openingOptional = tagOmission == TagsOmittable
let closingOptional = tagOmission /= TagsRequired
- (if openingOptional then optional else void) $
- pSatisfy (matchTagOpen tagtype [])
+ skipMany pBlank
+ attribs <- (if openingOptional then option [] else id)
+ (getAttribs <$> pSatisfy (matchTagOpen tagtype []))
skipMany pBlank
x <- p
skipMany pBlank
(if closingOptional then optional else void) $
pSatisfy (matchTagClose tagtype)
skipMany pBlank
- return x
+ return (attribs, x)
+ where
+ getAttribs = \case
+ TagOpen _ attribs -> attribs
+ _ -> []
pCloses :: PandocMonad m => Text -> TagParser m ()
pCloses tagtype = try $ do
@@ -183,6 +202,12 @@ toStringAttr = map go
-> (x',y)
_ -> (x,y)
+-- Unlike fromAttrib from tagsoup, this distinguishes
+-- between a missing attribute and an attribute with empty content.
+maybeFromAttrib :: Text -> Tag Text -> Maybe Text
+maybeFromAttrib name (TagOpen _ attrs) = lookup name attrs
+maybeFromAttrib _ _ = Nothing
+
mkAttr :: [(Text, Text)] -> Attr
mkAttr attr = (attribsId, attribsClasses, attribsKV)
where attribsId = fromMaybe "" $ lookup "id" attr
diff --git a/src/Text/Pandoc/Readers/HTML/Table.hs b/src/Text/Pandoc/Readers/HTML/Table.hs
index e6d0a9097..e40d90221 100644
--- a/src/Text/Pandoc/Readers/HTML/Table.hs
+++ b/src/Text/Pandoc/Readers/HTML/Table.hs
@@ -1,3 +1,4 @@
+{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE ViewPatterns #-}
{- |
@@ -12,17 +13,19 @@
HTML table parser.
-}
-module Text.Pandoc.Readers.HTML.Table (pTable') where
+module Text.Pandoc.Readers.HTML.Table (pTable) where
-import Control.Monad (guard)
+import Control.Applicative ((<|>))
import Data.Maybe (fromMaybe)
import Data.Text (Text)
import Text.HTML.TagSoup
import Text.Pandoc.Builder (Blocks)
+import Text.Pandoc.CSS (cssAttributes)
import Text.Pandoc.Definition
import Text.Pandoc.Class.PandocMonad (PandocMonad (..))
import Text.Pandoc.Parsing
- ( (<|>), eof, many, many1, manyTill, option, optional, skipMany, try)
+ ( eof, lookAhead, many, many1, manyTill, option, optional
+ , optionMaybe, skipMany, try)
import Text.Pandoc.Readers.HTML.Parsing
import Text.Pandoc.Readers.HTML.Types (TagParser)
import Text.Pandoc.Shared (onlySimpleTableCells, safeRead)
@@ -57,58 +60,183 @@ pColgroup = try $ do
skipMany pBlank
manyTill pCol (pCloses "colgroup" <|> eof) <* skipMany pBlank
--- | Parses a simple HTML table
-pTable' :: PandocMonad m
- => TagParser m Blocks -- ^ Caption parser
- -> (Text -> TagParser m [Cell]) -- ^ Table cell parser
- -> TagParser m Blocks
-pTable' block pCell = try $ do
- TagOpen _ attribs' <- pSatisfy (matchTagOpen "table" [])
- let attribs = toAttr attribs'
+pCell :: PandocMonad m
+ => TagParser m Blocks
+ -> Text
+ -> TagParser m [Cell]
+pCell block celltype = try $ do
+ skipMany pBlank
+ TagOpen _ attribs <- lookAhead $ pSatisfy (matchTagOpen celltype [])
+ let cssAttribs = maybe [] cssAttributes $ lookup "style" attribs
+ let align = case lookup "align" attribs <|>
+ lookup "text-align" cssAttribs of
+ Just "left" -> AlignLeft
+ Just "right" -> AlignRight
+ Just "center" -> AlignCenter
+ _ -> AlignDefault
+ let rowspan = RowSpan . fromMaybe 1 $
+ safeRead =<< lookup "rowspan" attribs
+ let colspan = ColSpan . fromMaybe 1 $
+ safeRead =<< lookup "colspan" attribs
+ res <- pInTags celltype block
+ skipMany pBlank
+ let handledAttribs = ["align", "colspan", "rowspan", "text-align"]
+ attribs' = foldr go [] attribs
+ go kv@(k, _) acc = case k of
+ "style" -> case filter ((/= "text-align") . fst) cssAttribs of
+ [] -> acc
+ cs -> ("style", toStyleString cs) : acc
+ -- drop attrib if it's already handled
+ _ | k `elem` handledAttribs -> acc
+ _ -> kv : acc
+ return [B.cellWith (toAttr attribs') align rowspan colspan res]
+
+-- | Create a style attribute string from a list of CSS attributes
+toStyleString :: [(Text, Text)] -> Text
+toStyleString = T.intercalate "; " . map (\(k, v) -> k <> ": " <> v)
+
+data RowType
+ = HeaderCells
+ | AllCells
+
+-- | Parses a table row
+pRow :: PandocMonad m
+ => TagParser m Blocks
+ -> RowType
+ -> TagParser m [B.Row]
+pRow block rowType = try $ do
+ skipMany pBlank
+ case rowType of
+ HeaderCells -> do
+ maybeCells <- optionMaybe (pInTags "tr" (pCell block "th"))
+ return $ case maybeCells of
+ Nothing -> []
+ Just cells -> [Row nullAttr cells]
+ AllCells -> do
+ cells <- pInTags "tr" (pCell block "td" <|> pCell block "th")
+ return [Row nullAttr cells]
+
+-- | Parses a table head
+pTableHead :: PandocMonad m
+ => TagParser m Blocks
+ -> TagParser m TableHead
+pTableHead block = try $ do
+ skipMany pBlank
+ (attribs, rows) <- pInTagWithAttribs ClosingTagOptional "thead"
+ (option [] $ pRow block AllCells)
+ <|> pInTagWithAttribs TagsOmittable "thead"
+ (pRow block HeaderCells)
+ let cells = concatMap (\(Row _ cs) -> cs) rows
+ if null cells
+ then TableHead nullAttr <$>
+ pInTag TagsOmittable "tbody" (pRow block HeaderCells)
+ else return $ TableHead (toAttr attribs) [Row nullAttr cells]
+
+-- | Parses a table foot
+pTableFoot :: PandocMonad m
+ => TagParser m Blocks
+ -> TagParser m TableFoot
+pTableFoot block = try $ do
+ skipMany pBlank
+ TagOpen _ attribs <- pSatisfy (matchTagOpen "tfoot" []) <* skipMany pBlank
+ rows <- mconcat <$> many (pRow block AllCells <* skipMany pBlank)
+ optional $ pSatisfy (matchTagClose "tfoot")
+ return $ TableFoot (toAttr attribs) rows
+
+-- | Parses a table body
+pTableBody :: PandocMonad m
+ => TagParser m Blocks
+ -> TagParser m TableBody
+pTableBody block = do
skipMany pBlank
- caption <- option mempty $ pInTags "caption" block <* skipMany pBlank
- widths' <- (mconcat <$> many1 pColgroup) <|> many pCol
- let pTh = option [] $ pInTags "tr" (pCell "th")
- pTr = try $ skipMany pBlank
- *> pInTags "tr" (pCell "td" <|> pCell "th")
- pTBody = pInTag TagsOmittable "tbody" $ many1 pTr
- head'' <- pInTag ClosingTagOptional "thead" (option [] pTr)
- <|> pInTag TagsOmittable "thead" pTh
- head' <- pInTag TagsOmittable "tbody"
- (if null head'' then pTh else return head'')
- topfoot <- option [] $ pInTag TagsRequired "tfoot" $ many pTr
- rowsLs <- many pTBody
- bottomfoot <- option [] $ pInTag ClosingTagOptional "tfoot" $ many pTr
+ (attribs, rows) <- pInTagWithAttribs TagsOmittable "tbody"
+ (mconcat <$> many1 (pRow block AllCells))
+ return $ TableBody (toAttr attribs) 0 [] rows
+
+
+-- | Parses a simple HTML table
+pTable :: PandocMonad m
+ => TagParser m Blocks -- ^ Caption and cell contents parser
+ -> TagParser m Blocks
+pTable block = try $ do
+ TagOpen _ attribs <- pSatisfy (matchTagOpen "table" []) <* skipMany pBlank
+ caption <- option mempty $ pInTags "caption" block <* skipMany pBlank
+ widths <- ((mconcat <$> many1 pColgroup) <|> many pCol) <* skipMany pBlank
+ thead <- pTableHead block <* skipMany pBlank
+ topfoot <- optionMaybe (pTableFoot block) <* skipMany pBlank
+ tbodies <- many (pTableBody block) <* skipMany pBlank
+ botfoot <- optionMaybe (pTableFoot block) <* skipMany pBlank
TagClose _ <- pSatisfy (matchTagClose "table")
- let rows = concat rowsLs <> topfoot <> bottomfoot
- rows''' = map (map cellContents) rows
+ let tfoot = fromMaybe (TableFoot nullAttr []) $ topfoot <|> botfoot
+ case normalize widths thead tbodies tfoot of
+ Left err -> fail err
+ Right (colspecs, thead', tbodies', tfoot') -> return $
+ B.tableWith (toAttr attribs)
+ (B.simpleCaption caption)
+ colspecs
+ thead'
+ tbodies'
+ tfoot'
+data TableType
+ = SimpleTable
+ | NormalTable
+
+tableType :: [[Cell]] -> TableType
+tableType cells =
+ if onlySimpleTableCells $ map (map cellContents) cells
+ then SimpleTable
+ else NormalTable
+ where
+ cellContents :: Cell -> [Block]
+ cellContents (Cell _ _ _ _ bs) = bs
+
+normalize :: [ColWidth] -> TableHead -> [TableBody] -> TableFoot
+ -> Either String ([ColSpec], TableHead, [TableBody], TableFoot)
+normalize widths head' bodies foot = do
+ let rows = headRows head' <> concatMap bodyRows bodies <> footRows foot
+ let rowLength = length . rowCells
+ let ncols = maximum (map rowLength rows)
+ let tblType = tableType (map rowCells rows)
-- fail on empty table
- guard $ not $ null head' && null rows'''
- let isSimple = onlySimpleTableCells $
- map cellContents head' : rows'''
- let cols = if null head'
- then maximum (map length rows''')
- else length head'
- let aligns = case rows of
- (cs:_) -> take cols $
- concatMap cellAligns cs ++ repeat AlignDefault
- _ -> replicate cols AlignDefault
- let widths = if null widths'
- then if isSimple
- then replicate cols ColWidthDefault
- else replicate cols (ColWidth (1.0 / fromIntegral cols))
- else widths'
- let toRow = Row nullAttr
- toHeaderRow l = [toRow l | not (null l)]
- return $ B.tableWith attribs
- (B.simpleCaption caption)
- (zip aligns widths)
- (TableHead nullAttr $ toHeaderRow head')
- [TableBody nullAttr 0 [] $ map toRow rows]
- (TableFoot nullAttr [])
-
-cellContents :: Cell -> [Block]
-cellContents (Cell _ _ _ _ bs) = bs
-
-cellAligns :: Cell -> [Alignment]
-cellAligns (Cell _ align _ (ColSpan cs) _) = replicate cs align
+ if null rows
+ then Left "empty table"
+ else Right
+ ( zip (calculateAlignments ncols bodies)
+ (normalizeColWidths ncols tblType widths)
+ , head'
+ , bodies
+ , foot
+ )
+
+normalizeColWidths :: Int -> TableType -> [ColWidth] -> [ColWidth]
+normalizeColWidths ncols tblType = \case
+ [] -> case tblType of
+ SimpleTable -> replicate ncols ColWidthDefault
+ NormalTable -> replicate ncols (ColWidth $ 1 / fromIntegral ncols)
+ widths -> widths
+
+calculateAlignments :: Int -> [TableBody] -> [Alignment]
+calculateAlignments cols tbodies =
+ case cells of
+ cs:_ -> take cols $ concatMap cellAligns cs ++ repeat AlignDefault
+ _ -> replicate cols AlignDefault
+ where
+ cells :: [[Cell]]
+ cells = concatMap bodyRowCells tbodies
+ cellAligns :: Cell -> [Alignment]
+ cellAligns (Cell _ align _ (ColSpan cs) _) = replicate cs align
+
+bodyRowCells :: TableBody -> [[Cell]]
+bodyRowCells = map rowCells . bodyRows
+
+headRows :: TableHead -> [B.Row]
+headRows (TableHead _ rows) = rows
+
+bodyRows :: TableBody -> [B.Row]
+bodyRows (TableBody _ _ headerRows bodyRows') = headerRows <> bodyRows'
+
+footRows :: TableFoot -> [B.Row]
+footRows (TableFoot _ rows) = rows
+
+rowCells :: B.Row -> [Cell]
+rowCells (Row _ cells) = cells