From 41237fcc0ed890795f065c0a7f204fa5597defee Mon Sep 17 00:00:00 2001 From: Albert Krewinkel Date: Mon, 23 Nov 2020 12:32:37 +0100 Subject: HTML reader: extract table parsing into separate module --- src/Text/Pandoc/Readers/HTML/Table.hs | 111 ++++++++++++++++++++++++++++++++++ 1 file changed, 111 insertions(+) create mode 100644 src/Text/Pandoc/Readers/HTML/Table.hs (limited to 'src/Text/Pandoc/Readers/HTML/Table.hs') diff --git a/src/Text/Pandoc/Readers/HTML/Table.hs b/src/Text/Pandoc/Readers/HTML/Table.hs new file mode 100644 index 000000000..bad39bd2d --- /dev/null +++ b/src/Text/Pandoc/Readers/HTML/Table.hs @@ -0,0 +1,111 @@ +{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE ViewPatterns #-} +{- | + Module : Text.Pandoc.Readers.HTML.Table + Copyright : © 2006-2020 John MacFarlane, + 2020 Albert Krewinkel + License : GNU GPL, version 2 or above + + Maintainer : Albert Krewinkel + Stability : alpha + Portability : portable + +HTML table parser. +-} +module Text.Pandoc.Readers.HTML.Table (pTable') where + +import Control.Monad (guard) +import Data.Maybe (fromMaybe) +import Data.Text (Text) +import Text.HTML.TagSoup +import Text.Pandoc.Builder (Blocks, Inlines) +import Text.Pandoc.Definition +import Text.Pandoc.Class.PandocMonad (PandocMonad (..)) +import Text.Pandoc.Parsing + ( (<|>), eof, many, many1, manyTill, option, optional, skipMany, try) +import Text.Pandoc.Readers.HTML.Parsing +import Text.Pandoc.Readers.HTML.Types (TagParser) +import Text.Pandoc.Shared (onlySimpleTableCells, safeRead) +import qualified Data.Text as T +import qualified Text.Pandoc.Builder as B + +-- | Parses a @@ element, returning the column's width. Defaults to +-- @'ColWidthDefault'@ if the width is not set or cannot be determined. +pCol :: PandocMonad m => TagParser m ColWidth +pCol = try $ do + TagOpen _ attribs' <- pSatisfy (matchTagOpen "col" []) + let attribs = toStringAttr attribs' + skipMany pBlank + optional $ pSatisfy (matchTagClose "col") + skipMany pBlank + let width = case lookup "width" attribs of + Nothing -> case lookup "style" attribs of + Just (T.stripPrefix "width:" -> Just xs) | T.any (== '%') xs -> + fromMaybe 0.0 $ safeRead (T.filter + (`notElem` (" \t\r\n%'\";" :: [Char])) xs) + _ -> 0.0 + Just (T.unsnoc -> Just (xs, '%')) -> + fromMaybe 0.0 $ safeRead xs + _ -> 0.0 + if width > 0.0 + then return $ ColWidth $ width / 100.0 + else return ColWidthDefault + +pColgroup :: PandocMonad m => TagParser m [ColWidth] +pColgroup = try $ do + pSatisfy (matchTagOpen "colgroup" []) + skipMany pBlank + manyTill pCol (pCloses "colgroup" <|> eof) <* skipMany pBlank + +-- | Parses a simple HTML table +pTable' :: PandocMonad m + => TagParser m Inlines -- ^ Caption parser + -> (Text -> TagParser m [(Alignment, Blocks)]) -- ^ Table cell parser + -> TagParser m Blocks +pTable' inline pCell = try $ do + TagOpen _ attribs' <- pSatisfy (matchTagOpen "table" []) + let attribs = toAttr attribs' + skipMany pBlank + caption <- option mempty $ pInTags "caption" inline <* skipMany pBlank + widths' <- (mconcat <$> many1 pColgroup) <|> many pCol + let pTh = option [] $ pInTags "tr" (pCell "th") + pTr = try $ skipMany pBlank >> + pInTags "tr" (pCell "td" <|> pCell "th") + pTBody = pInTag True "tbody" $ many1 pTr + head'' <- pInTag False "thead" (option [] pTr) <|> pInTag True "thead" pTh + head' <- map snd <$> + pInTag True "tbody" + (if null head'' then pTh else return head'') + topfoot <- option [] $ pInTag False "tfoot" $ many pTr + rowsLs <- many pTBody + bottomfoot <- option [] $ pInTag False "tfoot" $ many pTr + TagClose _ <- pSatisfy (matchTagClose "table") + let rows'' = concat rowsLs <> topfoot <> bottomfoot + let rows''' = map (map snd) rows'' + -- fail on empty table + guard $ not $ null head' && null rows''' + let isSimple = onlySimpleTableCells $ fmap B.toList <$> head':rows''' + let cols = if null head' + then maximum (map length rows''') + else length head' + -- add empty cells to short rows + let addEmpties r = case cols - length r of + n | n > 0 -> r <> replicate n mempty + | otherwise -> r + let rows = map addEmpties rows''' + let aligns = case rows'' of + (cs:_) -> take cols $ map fst cs ++ repeat AlignDefault + _ -> replicate cols AlignDefault + let widths = if null widths' + then if isSimple + then replicate cols ColWidthDefault + else replicate cols (ColWidth (1.0 / fromIntegral cols)) + else widths' + let toRow = Row nullAttr . map B.simpleCell + toHeaderRow l = [toRow l | not (null l)] + return $ B.tableWith attribs + (B.simpleCaption $ B.plain caption) + (zip aligns widths) + (TableHead nullAttr $ toHeaderRow head') + [TableBody nullAttr 0 [] $ map toRow rows] + (TableFoot nullAttr []) -- cgit v1.2.3