aboutsummaryrefslogtreecommitdiff
path: root/src/Text/Pandoc/Readers/HTML
diff options
context:
space:
mode:
authorAlbert Krewinkel <albert@zeitkraut.de>2020-11-23 12:32:37 +0100
committerAlbert Krewinkel <albert@zeitkraut.de>2020-11-24 14:17:35 +0100
commit41237fcc0ed890795f065c0a7f204fa5597defee (patch)
treeb44cc82baf4194ac8fc6e922308840472a6a68c2 /src/Text/Pandoc/Readers/HTML
parent2f110265ff4a1dc429d58e7401d68968e42b6db1 (diff)
downloadpandoc-41237fcc0ed890795f065c0a7f204fa5597defee.tar.gz
HTML reader: extract table parsing into separate module
Diffstat (limited to 'src/Text/Pandoc/Readers/HTML')
-rw-r--r--src/Text/Pandoc/Readers/HTML/Parsing.hs26
-rw-r--r--src/Text/Pandoc/Readers/HTML/Table.hs111
2 files changed, 137 insertions, 0 deletions
diff --git a/src/Text/Pandoc/Readers/HTML/Parsing.hs b/src/Text/Pandoc/Readers/HTML/Parsing.hs
index 7fda066b5..e68e43b25 100644
--- a/src/Text/Pandoc/Readers/HTML/Parsing.hs
+++ b/src/Text/Pandoc/Readers/HTML/Parsing.hs
@@ -21,19 +21,25 @@ module Text.Pandoc.Readers.HTML.Parsing
, matchTagClose
, matchTagOpen
, isSpace
+ , toAttr
+ , toStringAttr
)
where
import Control.Monad (guard, void, mzero)
+import Data.Maybe (fromMaybe)
import Data.Text (Text)
import Text.HTML.TagSoup
+ ( Tag (..), (~==), isTagText, isTagPosition, isTagOpen, isTagClose )
import Text.Pandoc.Class.PandocMonad (PandocMonad (..))
+import Text.Pandoc.Definition (Attr)
import Text.Pandoc.Parsing
( (<|>), eof, getPosition, lookAhead, manyTill, newPos, optional
, skipMany, setPosition, token, try)
import Text.Pandoc.Readers.HTML.TagCategories
import Text.Pandoc.Readers.HTML.Types
import Text.Pandoc.Shared (tshow)
+import Text.Pandoc.XML (html5Attributes, html4Attributes, rdfaAttributes)
import qualified Data.Set as Set
import qualified Data.Text as T
@@ -154,3 +160,23 @@ t1 `closes` t2 |
t2 `Set.notMember` blockTags &&
t2 `Set.notMember` eitherBlockOrInline = True
_ `closes` _ = False
+
+toStringAttr :: [(Text, Text)] -> [(Text, Text)]
+toStringAttr = map go
+ where
+ go (x,y) =
+ case T.stripPrefix "data-" x of
+ Just x' | x' `Set.notMember` (html5Attributes <>
+ html4Attributes <> rdfaAttributes)
+ -> (x',y)
+ _ -> (x,y)
+
+mkAttr :: [(Text, Text)] -> Attr
+mkAttr attr = (attribsId, attribsClasses, attribsKV)
+ where attribsId = fromMaybe "" $ lookup "id" attr
+ attribsClasses = T.words (fromMaybe "" $ lookup "class" attr) <> epubTypes
+ attribsKV = filter (\(k,_) -> k /= "class" && k /= "id") attr
+ epubTypes = T.words $ fromMaybe "" $ lookup "epub:type" attr
+
+toAttr :: [(Text, Text)] -> Attr
+toAttr = mkAttr . toStringAttr
diff --git a/src/Text/Pandoc/Readers/HTML/Table.hs b/src/Text/Pandoc/Readers/HTML/Table.hs
new file mode 100644
index 000000000..bad39bd2d
--- /dev/null
+++ b/src/Text/Pandoc/Readers/HTML/Table.hs
@@ -0,0 +1,111 @@
+{-# LANGUAGE OverloadedStrings #-}
+{-# LANGUAGE ViewPatterns #-}
+{- |
+ Module : Text.Pandoc.Readers.HTML.Table
+ Copyright : © 2006-2020 John MacFarlane,
+ 2020 Albert Krewinkel
+ License : GNU GPL, version 2 or above
+
+ Maintainer : Albert Krewinkel <albert@zeitkraut.de>
+ Stability : alpha
+ Portability : portable
+
+HTML table parser.
+-}
+module Text.Pandoc.Readers.HTML.Table (pTable') where
+
+import Control.Monad (guard)
+import Data.Maybe (fromMaybe)
+import Data.Text (Text)
+import Text.HTML.TagSoup
+import Text.Pandoc.Builder (Blocks, Inlines)
+import Text.Pandoc.Definition
+import Text.Pandoc.Class.PandocMonad (PandocMonad (..))
+import Text.Pandoc.Parsing
+ ( (<|>), eof, many, many1, manyTill, option, optional, skipMany, try)
+import Text.Pandoc.Readers.HTML.Parsing
+import Text.Pandoc.Readers.HTML.Types (TagParser)
+import Text.Pandoc.Shared (onlySimpleTableCells, safeRead)
+import qualified Data.Text as T
+import qualified Text.Pandoc.Builder as B
+
+-- | Parses a @<col>@ element, returning the column's width. Defaults to
+-- @'ColWidthDefault'@ if the width is not set or cannot be determined.
+pCol :: PandocMonad m => TagParser m ColWidth
+pCol = try $ do
+ TagOpen _ attribs' <- pSatisfy (matchTagOpen "col" [])
+ let attribs = toStringAttr attribs'
+ skipMany pBlank
+ optional $ pSatisfy (matchTagClose "col")
+ skipMany pBlank
+ let width = case lookup "width" attribs of
+ Nothing -> case lookup "style" attribs of
+ Just (T.stripPrefix "width:" -> Just xs) | T.any (== '%') xs ->
+ fromMaybe 0.0 $ safeRead (T.filter
+ (`notElem` (" \t\r\n%'\";" :: [Char])) xs)
+ _ -> 0.0
+ Just (T.unsnoc -> Just (xs, '%')) ->
+ fromMaybe 0.0 $ safeRead xs
+ _ -> 0.0
+ if width > 0.0
+ then return $ ColWidth $ width / 100.0
+ else return ColWidthDefault
+
+pColgroup :: PandocMonad m => TagParser m [ColWidth]
+pColgroup = try $ do
+ pSatisfy (matchTagOpen "colgroup" [])
+ skipMany pBlank
+ manyTill pCol (pCloses "colgroup" <|> eof) <* skipMany pBlank
+
+-- | Parses a simple HTML table
+pTable' :: PandocMonad m
+ => TagParser m Inlines -- ^ Caption parser
+ -> (Text -> TagParser m [(Alignment, Blocks)]) -- ^ Table cell parser
+ -> TagParser m Blocks
+pTable' inline pCell = try $ do
+ TagOpen _ attribs' <- pSatisfy (matchTagOpen "table" [])
+ let attribs = toAttr attribs'
+ skipMany pBlank
+ caption <- option mempty $ pInTags "caption" inline <* skipMany pBlank
+ widths' <- (mconcat <$> many1 pColgroup) <|> many pCol
+ let pTh = option [] $ pInTags "tr" (pCell "th")
+ pTr = try $ skipMany pBlank >>
+ pInTags "tr" (pCell "td" <|> pCell "th")
+ pTBody = pInTag True "tbody" $ many1 pTr
+ head'' <- pInTag False "thead" (option [] pTr) <|> pInTag True "thead" pTh
+ head' <- map snd <$>
+ pInTag True "tbody"
+ (if null head'' then pTh else return head'')
+ topfoot <- option [] $ pInTag False "tfoot" $ many pTr
+ rowsLs <- many pTBody
+ bottomfoot <- option [] $ pInTag False "tfoot" $ many pTr
+ TagClose _ <- pSatisfy (matchTagClose "table")
+ let rows'' = concat rowsLs <> topfoot <> bottomfoot
+ let rows''' = map (map snd) rows''
+ -- fail on empty table
+ guard $ not $ null head' && null rows'''
+ let isSimple = onlySimpleTableCells $ fmap B.toList <$> head':rows'''
+ let cols = if null head'
+ then maximum (map length rows''')
+ else length head'
+ -- add empty cells to short rows
+ let addEmpties r = case cols - length r of
+ n | n > 0 -> r <> replicate n mempty
+ | otherwise -> r
+ let rows = map addEmpties rows'''
+ let aligns = case rows'' of
+ (cs:_) -> take cols $ map fst cs ++ repeat AlignDefault
+ _ -> replicate cols AlignDefault
+ let widths = if null widths'
+ then if isSimple
+ then replicate cols ColWidthDefault
+ else replicate cols (ColWidth (1.0 / fromIntegral cols))
+ else widths'
+ let toRow = Row nullAttr . map B.simpleCell
+ toHeaderRow l = [toRow l | not (null l)]
+ return $ B.tableWith attribs
+ (B.simpleCaption $ B.plain caption)
+ (zip aligns widths)
+ (TableHead nullAttr $ toHeaderRow head')
+ [TableBody nullAttr 0 [] $ map toRow rows]
+ (TableFoot nullAttr [])