aboutsummaryrefslogtreecommitdiff
path: root/src/Text/Pandoc/Readers/HTML/Table.hs
diff options
context:
space:
mode:
authorAlbert Krewinkel <albert@zeitkraut.de>2020-11-23 12:32:37 +0100
committerAlbert Krewinkel <albert@zeitkraut.de>2020-11-24 14:17:35 +0100
commit41237fcc0ed890795f065c0a7f204fa5597defee (patch)
treeb44cc82baf4194ac8fc6e922308840472a6a68c2 /src/Text/Pandoc/Readers/HTML/Table.hs
parent2f110265ff4a1dc429d58e7401d68968e42b6db1 (diff)
downloadpandoc-41237fcc0ed890795f065c0a7f204fa5597defee.tar.gz
HTML reader: extract table parsing into separate module
Diffstat (limited to 'src/Text/Pandoc/Readers/HTML/Table.hs')
-rw-r--r--src/Text/Pandoc/Readers/HTML/Table.hs111
1 files changed, 111 insertions, 0 deletions
diff --git a/src/Text/Pandoc/Readers/HTML/Table.hs b/src/Text/Pandoc/Readers/HTML/Table.hs
new file mode 100644
index 000000000..bad39bd2d
--- /dev/null
+++ b/src/Text/Pandoc/Readers/HTML/Table.hs
@@ -0,0 +1,111 @@
+{-# LANGUAGE OverloadedStrings #-}
+{-# LANGUAGE ViewPatterns #-}
+{- |
+ Module : Text.Pandoc.Readers.HTML.Table
+ Copyright : © 2006-2020 John MacFarlane,
+ 2020 Albert Krewinkel
+ License : GNU GPL, version 2 or above
+
+ Maintainer : Albert Krewinkel <albert@zeitkraut.de>
+ Stability : alpha
+ Portability : portable
+
+HTML table parser.
+-}
+module Text.Pandoc.Readers.HTML.Table (pTable') where
+
+import Control.Monad (guard)
+import Data.Maybe (fromMaybe)
+import Data.Text (Text)
+import Text.HTML.TagSoup
+import Text.Pandoc.Builder (Blocks, Inlines)
+import Text.Pandoc.Definition
+import Text.Pandoc.Class.PandocMonad (PandocMonad (..))
+import Text.Pandoc.Parsing
+ ( (<|>), eof, many, many1, manyTill, option, optional, skipMany, try)
+import Text.Pandoc.Readers.HTML.Parsing
+import Text.Pandoc.Readers.HTML.Types (TagParser)
+import Text.Pandoc.Shared (onlySimpleTableCells, safeRead)
+import qualified Data.Text as T
+import qualified Text.Pandoc.Builder as B
+
+-- | Parses a @<col>@ element, returning the column's width. Defaults to
+-- @'ColWidthDefault'@ if the width is not set or cannot be determined.
+pCol :: PandocMonad m => TagParser m ColWidth
+pCol = try $ do
+ TagOpen _ attribs' <- pSatisfy (matchTagOpen "col" [])
+ let attribs = toStringAttr attribs'
+ skipMany pBlank
+ optional $ pSatisfy (matchTagClose "col")
+ skipMany pBlank
+ let width = case lookup "width" attribs of
+ Nothing -> case lookup "style" attribs of
+ Just (T.stripPrefix "width:" -> Just xs) | T.any (== '%') xs ->
+ fromMaybe 0.0 $ safeRead (T.filter
+ (`notElem` (" \t\r\n%'\";" :: [Char])) xs)
+ _ -> 0.0
+ Just (T.unsnoc -> Just (xs, '%')) ->
+ fromMaybe 0.0 $ safeRead xs
+ _ -> 0.0
+ if width > 0.0
+ then return $ ColWidth $ width / 100.0
+ else return ColWidthDefault
+
+pColgroup :: PandocMonad m => TagParser m [ColWidth]
+pColgroup = try $ do
+ pSatisfy (matchTagOpen "colgroup" [])
+ skipMany pBlank
+ manyTill pCol (pCloses "colgroup" <|> eof) <* skipMany pBlank
+
+-- | Parses a simple HTML table
+pTable' :: PandocMonad m
+ => TagParser m Inlines -- ^ Caption parser
+ -> (Text -> TagParser m [(Alignment, Blocks)]) -- ^ Table cell parser
+ -> TagParser m Blocks
+pTable' inline pCell = try $ do
+ TagOpen _ attribs' <- pSatisfy (matchTagOpen "table" [])
+ let attribs = toAttr attribs'
+ skipMany pBlank
+ caption <- option mempty $ pInTags "caption" inline <* skipMany pBlank
+ widths' <- (mconcat <$> many1 pColgroup) <|> many pCol
+ let pTh = option [] $ pInTags "tr" (pCell "th")
+ pTr = try $ skipMany pBlank >>
+ pInTags "tr" (pCell "td" <|> pCell "th")
+ pTBody = pInTag True "tbody" $ many1 pTr
+ head'' <- pInTag False "thead" (option [] pTr) <|> pInTag True "thead" pTh
+ head' <- map snd <$>
+ pInTag True "tbody"
+ (if null head'' then pTh else return head'')
+ topfoot <- option [] $ pInTag False "tfoot" $ many pTr
+ rowsLs <- many pTBody
+ bottomfoot <- option [] $ pInTag False "tfoot" $ many pTr
+ TagClose _ <- pSatisfy (matchTagClose "table")
+ let rows'' = concat rowsLs <> topfoot <> bottomfoot
+ let rows''' = map (map snd) rows''
+ -- fail on empty table
+ guard $ not $ null head' && null rows'''
+ let isSimple = onlySimpleTableCells $ fmap B.toList <$> head':rows'''
+ let cols = if null head'
+ then maximum (map length rows''')
+ else length head'
+ -- add empty cells to short rows
+ let addEmpties r = case cols - length r of
+ n | n > 0 -> r <> replicate n mempty
+ | otherwise -> r
+ let rows = map addEmpties rows'''
+ let aligns = case rows'' of
+ (cs:_) -> take cols $ map fst cs ++ repeat AlignDefault
+ _ -> replicate cols AlignDefault
+ let widths = if null widths'
+ then if isSimple
+ then replicate cols ColWidthDefault
+ else replicate cols (ColWidth (1.0 / fromIntegral cols))
+ else widths'
+ let toRow = Row nullAttr . map B.simpleCell
+ toHeaderRow l = [toRow l | not (null l)]
+ return $ B.tableWith attribs
+ (B.simpleCaption $ B.plain caption)
+ (zip aligns widths)
+ (TableHead nullAttr $ toHeaderRow head')
+ [TableBody nullAttr 0 [] $ map toRow rows]
+ (TableFoot nullAttr [])