From 07919e1b2270a906019575e4ce85590d6754d41c Mon Sep 17 00:00:00 2001
From: Albert Krewinkel <albert@zeitkraut.de>
Date: Thu, 26 Nov 2020 07:22:01 +0100
Subject: HTML reader: improve support for table headers, footer, attributes

- `<tfoot>` elements are no longer added to the table body but used as
  table footer.
- Separate `<tbody>` elements are no longer combined into one.
- Attributes on `<thead>`, `<tbody>`, `<th>`/`<td>`, and `<tfoot>`
  elements are preserved.
---
 src/Text/Pandoc/Readers/HTML/Parsing.hs |  47 +++++--
 src/Text/Pandoc/Readers/HTML/Table.hs   | 238 ++++++++++++++++++++++++--------
 2 files changed, 219 insertions(+), 66 deletions(-)

(limited to 'src/Text/Pandoc/Readers/HTML')

diff --git a/src/Text/Pandoc/Readers/HTML/Parsing.hs b/src/Text/Pandoc/Readers/HTML/Parsing.hs
index 8788a933e..2d58319da 100644
--- a/src/Text/Pandoc/Readers/HTML/Parsing.hs
+++ b/src/Text/Pandoc/Readers/HTML/Parsing.hs
@@ -1,3 +1,4 @@
+{-# LANGUAGE LambdaCase            #-}
 {-# LANGUAGE OverloadedStrings     #-}
 {- |
    Module      : Text.Pandoc.Readers.HTML.Parsing
@@ -15,6 +16,7 @@ module Text.Pandoc.Readers.HTML.Parsing
   , pInTags
   , pInTags'
   , pInTag
+  , pInTagWithAttribs
   , pAny
   , pCloses
   , pSatisfy
@@ -22,6 +24,7 @@ module Text.Pandoc.Readers.HTML.Parsing
   , matchTagClose
   , matchTagOpen
   , isSpace
+  , maybeFromAttrib
   , toAttr
   , toStringAttr
   )
@@ -31,11 +34,11 @@ import Control.Monad (guard, void, mzero)
 import Data.Maybe (fromMaybe)
 import Data.Text (Text)
 import Text.HTML.TagSoup
-  ( Tag (..), (~==), isTagText, isTagPosition, isTagOpen, isTagClose )
+  ( Attribute, Tag (..), isTagText, isTagPosition, isTagOpen, isTagClose, (~==) )
 import Text.Pandoc.Class.PandocMonad (PandocMonad (..))
 import Text.Pandoc.Definition (Attr)
 import Text.Pandoc.Parsing
-  ( (<|>), eof, getPosition, lookAhead, manyTill, newPos, optional
+  ( (<|>), eof, getPosition, lookAhead, manyTill, newPos, option, optional
   , skipMany, setPosition, token, try)
 import Text.Pandoc.Readers.HTML.TagCategories
 import Text.Pandoc.Readers.HTML.Types
@@ -60,25 +63,41 @@ pInTags' :: (PandocMonad m, Monoid a)
          -> TagParser m a
          -> TagParser m a
 pInTags' tagtype tagtest parser = try $ do
-  pSatisfy (\t -> t ~== TagOpen tagtype [] && tagtest t)
+  pSatisfy $ \t -> matchTagOpen tagtype [] t && tagtest t
   mconcat <$> manyTill parser (pCloses tagtype <|> eof)
 
--- parses p, preceded by an opening tag (optional if tagsOptional)
--- and followed by a closing tag (optional if tagsOptional)
-pInTag :: PandocMonad m => TagOmission -> Text -> TagParser m a -> TagParser m a
-pInTag tagOmission tagtype p = try $ do
-  skipMany pBlank
+pInTag :: PandocMonad m
+       => TagOmission    -- ^ Whether some tags can be omitted
+       -> Text           -- ^ @tagtype@ Tag name
+       -> TagParser m a  -- ^ @p@ Content parser
+       -> TagParser m a
+pInTag tagOmission tagtype = fmap snd . pInTagWithAttribs tagOmission tagtype
+
+-- | Returns the contents of a tag together with its attributes; parses
+-- @p@, preceded by an opening tag (optional if TagsOmittable) and
+-- followed by a closing tag (optional unless TagsRequired).
+pInTagWithAttribs :: PandocMonad m
+                  => TagOmission    -- ^ Whether some tags can be omitted
+                  -> Text           -- ^ @tagtype@ Tag name
+                  -> TagParser m a  -- ^ @p@ Content parser
+                  -> TagParser m ([Attribute Text], a)
+pInTagWithAttribs tagOmission tagtype p = try $ do
   let openingOptional = tagOmission == TagsOmittable
   let closingOptional = tagOmission /= TagsRequired
-  (if openingOptional then optional else void) $
-    pSatisfy (matchTagOpen tagtype [])
+  skipMany pBlank
+  attribs <- (if openingOptional then option [] else id)
+             (getAttribs <$> pSatisfy (matchTagOpen tagtype []))
   skipMany pBlank
   x <- p
   skipMany pBlank
   (if closingOptional then optional else void) $
     pSatisfy (matchTagClose tagtype)
   skipMany pBlank
-  return x
+  return (attribs, x)
+  where
+    getAttribs = \case
+      TagOpen _ attribs -> attribs
+      _                 -> []
 
 pCloses :: PandocMonad m => Text -> TagParser m ()
 pCloses tagtype = try $ do
@@ -183,6 +202,12 @@ toStringAttr = map go
          -> (x',y)
        _ -> (x,y)
 
+-- Unlike fromAttrib from tagsoup, this distinguishes
+-- between a missing attribute and an attribute with empty content.
+maybeFromAttrib :: Text -> Tag Text -> Maybe Text
+maybeFromAttrib name (TagOpen _ attrs) = lookup name attrs
+maybeFromAttrib _ _ = Nothing
+
 mkAttr :: [(Text, Text)] -> Attr
 mkAttr attr = (attribsId, attribsClasses, attribsKV)
   where attribsId = fromMaybe "" $ lookup "id" attr
diff --git a/src/Text/Pandoc/Readers/HTML/Table.hs b/src/Text/Pandoc/Readers/HTML/Table.hs
index e6d0a9097..e40d90221 100644
--- a/src/Text/Pandoc/Readers/HTML/Table.hs
+++ b/src/Text/Pandoc/Readers/HTML/Table.hs
@@ -1,3 +1,4 @@
+{-# LANGUAGE LambdaCase #-}
 {-# LANGUAGE OverloadedStrings #-}
 {-# LANGUAGE ViewPatterns #-}
 {- |
@@ -12,17 +13,19 @@
 
 HTML table parser.
 -}
-module Text.Pandoc.Readers.HTML.Table (pTable') where
+module Text.Pandoc.Readers.HTML.Table (pTable) where
 
-import Control.Monad (guard)
+import Control.Applicative ((<|>))
 import Data.Maybe (fromMaybe)
 import Data.Text (Text)
 import Text.HTML.TagSoup
 import Text.Pandoc.Builder (Blocks)
+import Text.Pandoc.CSS (cssAttributes)
 import Text.Pandoc.Definition
 import Text.Pandoc.Class.PandocMonad (PandocMonad (..))
 import Text.Pandoc.Parsing
-  ( (<|>), eof, many, many1, manyTill, option, optional, skipMany, try)
+  ( eof, lookAhead, many, many1, manyTill, option, optional
+  , optionMaybe, skipMany, try)
 import Text.Pandoc.Readers.HTML.Parsing
 import Text.Pandoc.Readers.HTML.Types (TagParser)
 import Text.Pandoc.Shared (onlySimpleTableCells, safeRead)
@@ -57,58 +60,183 @@ pColgroup = try $ do
   skipMany pBlank
   manyTill pCol (pCloses "colgroup" <|> eof) <* skipMany pBlank
 
--- | Parses a simple HTML table
-pTable' :: PandocMonad m
-        => TagParser m Blocks           -- ^ Caption parser
-        -> (Text -> TagParser m [Cell]) -- ^ Table cell parser
-        -> TagParser m Blocks
-pTable' block pCell = try $ do
-  TagOpen _ attribs' <- pSatisfy (matchTagOpen "table" [])
-  let attribs = toAttr attribs'
+pCell :: PandocMonad m
+      => TagParser m Blocks
+      -> Text
+      -> TagParser m [Cell]
+pCell block celltype = try $ do
+  skipMany pBlank
+  TagOpen _ attribs <- lookAhead $ pSatisfy (matchTagOpen celltype [])
+  let cssAttribs = maybe [] cssAttributes $ lookup "style" attribs
+  let align = case lookup "align" attribs <|>
+                   lookup "text-align" cssAttribs of
+                Just "left"   -> AlignLeft
+                Just "right"  -> AlignRight
+                Just "center" -> AlignCenter
+                _             -> AlignDefault
+  let rowspan = RowSpan . fromMaybe 1 $
+                safeRead =<< lookup "rowspan" attribs
+  let colspan = ColSpan . fromMaybe 1 $
+                safeRead =<< lookup "colspan" attribs
+  res <- pInTags celltype block
+  skipMany pBlank
+  let handledAttribs = ["align", "colspan", "rowspan", "text-align"]
+      attribs' = foldr go [] attribs
+      go kv@(k, _) acc = case k of
+        "style" -> case filter ((/= "text-align") . fst) cssAttribs of
+                     [] -> acc
+                     cs -> ("style", toStyleString cs) : acc
+        -- drop attrib if it's already handled
+        _ | k `elem` handledAttribs -> acc
+        _ -> kv : acc
+  return [B.cellWith (toAttr attribs') align rowspan colspan res]
+
+-- | Create a style attribute string from a list of CSS attributes
+toStyleString :: [(Text, Text)] -> Text
+toStyleString = T.intercalate "; " . map (\(k, v) -> k <> ": " <> v)
+
+data RowType
+  = HeaderCells
+  | AllCells
+
+-- | Parses a table row
+pRow :: PandocMonad m
+     => TagParser m Blocks
+     -> RowType
+     -> TagParser m [B.Row]
+pRow block rowType = try $ do
+  skipMany pBlank
+  case rowType of
+    HeaderCells -> do
+      maybeCells <- optionMaybe (pInTags "tr" (pCell block "th"))
+      return $ case maybeCells of
+        Nothing    -> []
+        Just cells -> [Row nullAttr cells]
+    AllCells    -> do
+      cells <- pInTags "tr" (pCell block "td" <|> pCell block "th")
+      return [Row nullAttr cells]
+
+-- | Parses a table head
+pTableHead :: PandocMonad m
+           => TagParser m Blocks
+           -> TagParser m TableHead
+pTableHead block = try $ do
+  skipMany pBlank
+  (attribs, rows) <-  pInTagWithAttribs ClosingTagOptional "thead"
+                                        (option [] $ pRow block AllCells)
+                  <|> pInTagWithAttribs TagsOmittable "thead"
+                                        (pRow block HeaderCells)
+  let cells = concatMap (\(Row _ cs) -> cs) rows
+  if null cells
+    then TableHead nullAttr <$>
+         pInTag TagsOmittable "tbody" (pRow block HeaderCells)
+    else return $ TableHead (toAttr attribs) [Row nullAttr cells]
+
+-- | Parses a table foot
+pTableFoot :: PandocMonad m
+           => TagParser m Blocks
+           -> TagParser m TableFoot
+pTableFoot block = try $ do
+  skipMany pBlank
+  TagOpen _ attribs <- pSatisfy (matchTagOpen "tfoot" []) <* skipMany pBlank
+  rows <- mconcat <$> many (pRow block AllCells <* skipMany pBlank)
+  optional $ pSatisfy (matchTagClose "tfoot")
+  return $ TableFoot (toAttr attribs) rows
+
+-- | Parses a table body
+pTableBody :: PandocMonad m
+           => TagParser m Blocks
+           -> TagParser m TableBody
+pTableBody block = do
   skipMany pBlank
-  caption <- option mempty $ pInTags "caption" block <* skipMany pBlank
-  widths' <- (mconcat <$> many1 pColgroup) <|> many pCol
-  let pTh = option [] $ pInTags "tr" (pCell "th")
-      pTr = try $ skipMany pBlank
-               *> pInTags "tr" (pCell "td" <|> pCell "th")
-      pTBody = pInTag TagsOmittable "tbody" $ many1 pTr
-  head'' <- pInTag ClosingTagOptional "thead" (option [] pTr)
-        <|> pInTag TagsOmittable "thead" pTh
-  head'  <- pInTag TagsOmittable "tbody"
-               (if null head'' then pTh else return head'')
-  topfoot <- option [] $ pInTag TagsRequired "tfoot" $ many pTr
-  rowsLs <- many pTBody
-  bottomfoot <- option [] $ pInTag ClosingTagOptional "tfoot" $ many pTr
+  (attribs, rows) <- pInTagWithAttribs TagsOmittable "tbody"
+                     (mconcat <$> many1 (pRow block AllCells))
+  return $ TableBody (toAttr attribs) 0 [] rows
+
+
+-- | Parses a simple HTML table
+pTable :: PandocMonad m
+       => TagParser m Blocks -- ^ Caption and cell contents parser
+       -> TagParser m Blocks
+pTable block = try $ do
+  TagOpen _ attribs <- pSatisfy (matchTagOpen "table" [])  <* skipMany pBlank
+  caption <- option mempty $ pInTags "caption" block       <* skipMany pBlank
+  widths  <- ((mconcat <$> many1 pColgroup) <|> many pCol) <* skipMany pBlank
+  thead   <- pTableHead block               <* skipMany pBlank
+  topfoot <- optionMaybe (pTableFoot block) <* skipMany pBlank
+  tbodies <- many (pTableBody block)        <* skipMany pBlank
+  botfoot <- optionMaybe (pTableFoot block) <* skipMany pBlank
   TagClose _ <- pSatisfy (matchTagClose "table")
-  let rows = concat rowsLs <> topfoot <> bottomfoot
-      rows''' = map (map cellContents) rows
+  let tfoot = fromMaybe (TableFoot nullAttr []) $ topfoot <|> botfoot
+  case normalize widths thead tbodies tfoot of
+    Left err -> fail err
+    Right (colspecs, thead', tbodies', tfoot') -> return $
+      B.tableWith (toAttr attribs)
+                  (B.simpleCaption caption)
+                  colspecs
+                  thead'
+                  tbodies'
+                  tfoot'
+data TableType
+  = SimpleTable
+  | NormalTable
+
+tableType :: [[Cell]] -> TableType
+tableType cells =
+  if onlySimpleTableCells $ map (map cellContents) cells
+  then SimpleTable
+  else NormalTable
+  where
+    cellContents :: Cell -> [Block]
+    cellContents (Cell _ _ _ _ bs) = bs
+
+normalize :: [ColWidth] -> TableHead -> [TableBody] -> TableFoot
+          -> Either String ([ColSpec], TableHead, [TableBody], TableFoot)
+normalize widths head' bodies foot = do
+  let rows = headRows head' <> concatMap bodyRows bodies <> footRows foot
+  let rowLength = length . rowCells
+  let ncols = maximum (map rowLength rows)
+  let tblType = tableType (map rowCells rows)
   -- fail on empty table
-  guard $ not $ null head' && null rows'''
-  let isSimple = onlySimpleTableCells $
-                 map cellContents head' : rows'''
-  let cols = if null head'
-                then maximum (map length rows''')
-                else length head'
-  let aligns = case rows of
-                 (cs:_) -> take cols $
-                           concatMap cellAligns cs ++ repeat AlignDefault
-                 _      -> replicate cols AlignDefault
-  let widths = if null widths'
-                  then if isSimple
-                       then replicate cols ColWidthDefault
-                       else replicate cols (ColWidth (1.0 / fromIntegral cols))
-                  else widths'
-  let toRow = Row nullAttr
-      toHeaderRow l = [toRow l | not (null l)]
-  return $ B.tableWith attribs
-                   (B.simpleCaption caption)
-                   (zip aligns widths)
-                   (TableHead nullAttr $ toHeaderRow head')
-                   [TableBody nullAttr 0 [] $ map toRow rows]
-                   (TableFoot nullAttr [])
-
-cellContents :: Cell -> [Block]
-cellContents (Cell _ _ _ _ bs) = bs
-
-cellAligns :: Cell -> [Alignment]
-cellAligns (Cell _ align _ (ColSpan cs) _) = replicate cs align
+  if null rows
+    then Left "empty table"
+    else Right
+         ( zip (calculateAlignments ncols bodies)
+               (normalizeColWidths ncols tblType widths)
+         , head'
+         , bodies
+         , foot
+         )
+
+normalizeColWidths :: Int -> TableType -> [ColWidth] -> [ColWidth]
+normalizeColWidths ncols tblType = \case
+  [] -> case tblType of
+          SimpleTable -> replicate ncols ColWidthDefault
+          NormalTable -> replicate ncols (ColWidth $ 1 / fromIntegral ncols)
+  widths -> widths
+
+calculateAlignments :: Int -> [TableBody] -> [Alignment]
+calculateAlignments cols tbodies =
+  case cells of
+    cs:_ -> take cols $ concatMap cellAligns cs ++ repeat AlignDefault
+    _    -> replicate cols AlignDefault
+  where
+    cells :: [[Cell]]
+    cells = concatMap bodyRowCells tbodies
+    cellAligns :: Cell -> [Alignment]
+    cellAligns (Cell _ align _ (ColSpan cs) _) = replicate cs align
+
+bodyRowCells :: TableBody -> [[Cell]]
+bodyRowCells = map rowCells . bodyRows
+
+headRows :: TableHead -> [B.Row]
+headRows (TableHead _ rows) = rows
+
+bodyRows :: TableBody -> [B.Row]
+bodyRows (TableBody _ _ headerRows bodyRows') = headerRows <> bodyRows'
+
+footRows :: TableFoot -> [B.Row]
+footRows (TableFoot _ rows) = rows
+
+rowCells :: B.Row -> [Cell]
+rowCells (Row _ cells) = cells
-- 
cgit v1.2.3