aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorAlbert Krewinkel <albert@zeitkraut.de>2020-11-23 12:32:37 +0100
committerAlbert Krewinkel <albert@zeitkraut.de>2020-11-24 14:17:35 +0100
commit41237fcc0ed890795f065c0a7f204fa5597defee (patch)
treeb44cc82baf4194ac8fc6e922308840472a6a68c2 /src
parent2f110265ff4a1dc429d58e7401d68968e42b6db1 (diff)
downloadpandoc-41237fcc0ed890795f065c0a7f204fa5597defee.tar.gz
HTML reader: extract table parsing into separate module
Diffstat (limited to 'src')
-rw-r--r--src/Text/Pandoc/Readers/HTML.hs98
-rw-r--r--src/Text/Pandoc/Readers/HTML/Parsing.hs26
-rw-r--r--src/Text/Pandoc/Readers/HTML/Table.hs111
3 files changed, 140 insertions, 95 deletions
diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs
index 9e84bedab..e9fefb9c0 100644
--- a/src/Text/Pandoc/Readers/HTML.hs
+++ b/src/Text/Pandoc/Readers/HTML.hs
@@ -50,10 +50,10 @@ import Text.Pandoc.CSS (foldOrElse, pickStyleAttrProps)
import qualified Text.Pandoc.UTF8 as UTF8
import Text.Pandoc.Definition
import Text.Pandoc.Readers.HTML.Parsing
+import Text.Pandoc.Readers.HTML.Table (pTable')
import Text.Pandoc.Readers.HTML.TagCategories
import Text.Pandoc.Readers.HTML.Types
import Text.Pandoc.Readers.LaTeX (rawLaTeXInline)
-import Text.Pandoc.XML (html5Attributes, html4Attributes, rdfaAttributes)
import Text.Pandoc.Error
import Text.Pandoc.Logging
import Text.Pandoc.Options (
@@ -64,7 +64,7 @@ import Text.Pandoc.Options (
import Text.Pandoc.Parsing hiding ((<|>))
import Text.Pandoc.Shared (addMetaField, blocksToInlines', crFilter, escapeURI,
extractSpaces, htmlSpanLikeElements, elemText, splitTextBy,
- onlySimpleTableCells, safeRead, tshow)
+ safeRead, tshow)
import Text.Pandoc.Walk
import Text.Parsec.Error
import Text.TeXMath (readMathML, writeTeX)
@@ -474,79 +474,7 @@ pHrule = do
return B.horizontalRule
pTable :: PandocMonad m => TagParser m Blocks
-pTable = try $ do
- TagOpen _ attribs' <- pSatisfy (matchTagOpen "table" [])
- let attribs = toAttr attribs'
- skipMany pBlank
- caption <- option mempty $ pInTags "caption" inline <* skipMany pBlank
- widths' <- (mconcat <$> many1 pColgroup) <|> many pCol
- let pTh = option [] $ pInTags "tr" (pCell "th")
- pTr = try $ skipMany pBlank >>
- pInTags "tr" (pCell "td" <|> pCell "th")
- pTBody = pInTag True "tbody" $ many1 pTr
- head'' <- pInTag False "thead" (option [] pTr) <|> pInTag True "thead" pTh
- head' <- map snd <$>
- pInTag True "tbody"
- (if null head'' then pTh else return head'')
- topfoot <- option [] $ pInTag False "tfoot" $ many pTr
- rowsLs <- many pTBody
- bottomfoot <- option [] $ pInTag False "tfoot" $ many pTr
- TagClose _ <- pSatisfy (matchTagClose "table")
- let rows'' = concat rowsLs <> topfoot <> bottomfoot
- let rows''' = map (map snd) rows''
- -- fail on empty table
- guard $ not $ null head' && null rows'''
- let isSimple = onlySimpleTableCells $ fmap B.toList <$> head':rows'''
- let cols = if null head'
- then maximum (map length rows''')
- else length head'
- -- add empty cells to short rows
- let addEmpties r = case cols - length r of
- n | n > 0 -> r <> replicate n mempty
- | otherwise -> r
- let rows = map addEmpties rows'''
- let aligns = case rows'' of
- (cs:_) -> take cols $ map fst cs ++ repeat AlignDefault
- _ -> replicate cols AlignDefault
- let widths = if null widths'
- then if isSimple
- then replicate cols ColWidthDefault
- else replicate cols (ColWidth (1.0 / fromIntegral cols))
- else widths'
- let toRow = Row nullAttr . map B.simpleCell
- toHeaderRow l = [toRow l | not (null l)]
- return $ B.tableWith attribs
- (B.simpleCaption $ B.plain caption)
- (zip aligns widths)
- (TableHead nullAttr $ toHeaderRow head')
- [TableBody nullAttr 0 [] $ map toRow rows]
- (TableFoot nullAttr [])
-
-pCol :: PandocMonad m => TagParser m ColWidth
-pCol = try $ do
- TagOpen _ attribs' <- pSatisfy (matchTagOpen "col" [])
- let attribs = toStringAttr attribs'
- skipMany pBlank
- optional $ pSatisfy (matchTagClose "col")
- skipMany pBlank
- let width = case lookup "width" attribs of
- Nothing -> case lookup "style" attribs of
- Just (T.stripPrefix "width:" -> Just xs) | T.any (== '%') xs ->
- fromMaybe 0.0 $ safeRead (T.filter
- (`notElem` (" \t\r\n%'\";" :: [Char])) xs)
- _ -> 0.0
- Just (T.unsnoc -> Just (xs, '%')) ->
- fromMaybe 0.0 $ safeRead xs
- _ -> 0.0
- if width > 0.0
- then return $ ColWidth $ width / 100.0
- else return ColWidthDefault
-
-pColgroup :: PandocMonad m => TagParser m [ColWidth]
-pColgroup = try $ do
- pSatisfy (matchTagOpen "colgroup" [])
- skipMany pBlank
- manyTill pCol (pCloses "colgroup" <|> eof) <* skipMany pBlank
+pTable = pTable' inline pCell
noColOrRowSpans :: Tag Text -> Bool
noColOrRowSpans t = isNullOrOne "colspan" && isNullOrOne "rowspan"
@@ -847,16 +775,6 @@ pRawHtmlInline = do
mathMLToTeXMath :: Text -> Either Text Text
mathMLToTeXMath s = writeTeX <$> readMathML s
-toStringAttr :: [(Text, Text)] -> [(Text, Text)]
-toStringAttr = map go
- where
- go (x,y) =
- case T.stripPrefix "data-" x of
- Just x' | x' `Set.notMember` (html5Attributes <>
- html4Attributes <> rdfaAttributes)
- -> (x',y)
- _ -> (x,y)
-
pScriptMath :: PandocMonad m => TagParser m Inlines
pScriptMath = try $ do
TagOpen _ attr' <- pSatisfy $ tagOpen (=="script") (const True)
@@ -1151,16 +1069,6 @@ htmlTag f = try $ do
handleTag tagname
_ -> mzero
-mkAttr :: [(Text, Text)] -> Attr
-mkAttr attr = (attribsId, attribsClasses, attribsKV)
- where attribsId = fromMaybe "" $ lookup "id" attr
- attribsClasses = T.words (fromMaybe "" $ lookup "class" attr) <> epubTypes
- attribsKV = filter (\(k,_) -> k /= "class" && k /= "id") attr
- epubTypes = T.words $ fromMaybe "" $ lookup "epub:type" attr
-
-toAttr :: [(Text, Text)] -> Attr
-toAttr = mkAttr . toStringAttr
-
-- Strip namespace prefixes
stripPrefixes :: [Tag Text] -> [Tag Text]
stripPrefixes = map stripPrefix
diff --git a/src/Text/Pandoc/Readers/HTML/Parsing.hs b/src/Text/Pandoc/Readers/HTML/Parsing.hs
index 7fda066b5..e68e43b25 100644
--- a/src/Text/Pandoc/Readers/HTML/Parsing.hs
+++ b/src/Text/Pandoc/Readers/HTML/Parsing.hs
@@ -21,19 +21,25 @@ module Text.Pandoc.Readers.HTML.Parsing
, matchTagClose
, matchTagOpen
, isSpace
+ , toAttr
+ , toStringAttr
)
where
import Control.Monad (guard, void, mzero)
+import Data.Maybe (fromMaybe)
import Data.Text (Text)
import Text.HTML.TagSoup
+ ( Tag (..), (~==), isTagText, isTagPosition, isTagOpen, isTagClose )
import Text.Pandoc.Class.PandocMonad (PandocMonad (..))
+import Text.Pandoc.Definition (Attr)
import Text.Pandoc.Parsing
( (<|>), eof, getPosition, lookAhead, manyTill, newPos, optional
, skipMany, setPosition, token, try)
import Text.Pandoc.Readers.HTML.TagCategories
import Text.Pandoc.Readers.HTML.Types
import Text.Pandoc.Shared (tshow)
+import Text.Pandoc.XML (html5Attributes, html4Attributes, rdfaAttributes)
import qualified Data.Set as Set
import qualified Data.Text as T
@@ -154,3 +160,23 @@ t1 `closes` t2 |
t2 `Set.notMember` blockTags &&
t2 `Set.notMember` eitherBlockOrInline = True
_ `closes` _ = False
+
+toStringAttr :: [(Text, Text)] -> [(Text, Text)]
+toStringAttr = map go
+ where
+ go (x,y) =
+ case T.stripPrefix "data-" x of
+ Just x' | x' `Set.notMember` (html5Attributes <>
+ html4Attributes <> rdfaAttributes)
+ -> (x',y)
+ _ -> (x,y)
+
+mkAttr :: [(Text, Text)] -> Attr
+mkAttr attr = (attribsId, attribsClasses, attribsKV)
+ where attribsId = fromMaybe "" $ lookup "id" attr
+ attribsClasses = T.words (fromMaybe "" $ lookup "class" attr) <> epubTypes
+ attribsKV = filter (\(k,_) -> k /= "class" && k /= "id") attr
+ epubTypes = T.words $ fromMaybe "" $ lookup "epub:type" attr
+
+toAttr :: [(Text, Text)] -> Attr
+toAttr = mkAttr . toStringAttr
diff --git a/src/Text/Pandoc/Readers/HTML/Table.hs b/src/Text/Pandoc/Readers/HTML/Table.hs
new file mode 100644
index 000000000..bad39bd2d
--- /dev/null
+++ b/src/Text/Pandoc/Readers/HTML/Table.hs
@@ -0,0 +1,111 @@
+{-# LANGUAGE OverloadedStrings #-}
+{-# LANGUAGE ViewPatterns #-}
+{- |
+ Module : Text.Pandoc.Readers.HTML.Table
+ Copyright : © 2006-2020 John MacFarlane,
+ 2020 Albert Krewinkel
+ License : GNU GPL, version 2 or above
+
+ Maintainer : Albert Krewinkel <albert@zeitkraut.de>
+ Stability : alpha
+ Portability : portable
+
+HTML table parser.
+-}
+module Text.Pandoc.Readers.HTML.Table (pTable') where
+
+import Control.Monad (guard)
+import Data.Maybe (fromMaybe)
+import Data.Text (Text)
+import Text.HTML.TagSoup
+import Text.Pandoc.Builder (Blocks, Inlines)
+import Text.Pandoc.Definition
+import Text.Pandoc.Class.PandocMonad (PandocMonad (..))
+import Text.Pandoc.Parsing
+ ( (<|>), eof, many, many1, manyTill, option, optional, skipMany, try)
+import Text.Pandoc.Readers.HTML.Parsing
+import Text.Pandoc.Readers.HTML.Types (TagParser)
+import Text.Pandoc.Shared (onlySimpleTableCells, safeRead)
+import qualified Data.Text as T
+import qualified Text.Pandoc.Builder as B
+
+-- | Parses a @<col>@ element, returning the column's width. Defaults to
+-- @'ColWidthDefault'@ if the width is not set or cannot be determined.
+pCol :: PandocMonad m => TagParser m ColWidth
+pCol = try $ do
+ TagOpen _ attribs' <- pSatisfy (matchTagOpen "col" [])
+ let attribs = toStringAttr attribs'
+ skipMany pBlank
+ optional $ pSatisfy (matchTagClose "col")
+ skipMany pBlank
+ let width = case lookup "width" attribs of
+ Nothing -> case lookup "style" attribs of
+ Just (T.stripPrefix "width:" -> Just xs) | T.any (== '%') xs ->
+ fromMaybe 0.0 $ safeRead (T.filter
+ (`notElem` (" \t\r\n%'\";" :: [Char])) xs)
+ _ -> 0.0
+ Just (T.unsnoc -> Just (xs, '%')) ->
+ fromMaybe 0.0 $ safeRead xs
+ _ -> 0.0
+ if width > 0.0
+ then return $ ColWidth $ width / 100.0
+ else return ColWidthDefault
+
+pColgroup :: PandocMonad m => TagParser m [ColWidth]
+pColgroup = try $ do
+ pSatisfy (matchTagOpen "colgroup" [])
+ skipMany pBlank
+ manyTill pCol (pCloses "colgroup" <|> eof) <* skipMany pBlank
+
+-- | Parses a simple HTML table
+pTable' :: PandocMonad m
+ => TagParser m Inlines -- ^ Caption parser
+ -> (Text -> TagParser m [(Alignment, Blocks)]) -- ^ Table cell parser
+ -> TagParser m Blocks
+pTable' inline pCell = try $ do
+ TagOpen _ attribs' <- pSatisfy (matchTagOpen "table" [])
+ let attribs = toAttr attribs'
+ skipMany pBlank
+ caption <- option mempty $ pInTags "caption" inline <* skipMany pBlank
+ widths' <- (mconcat <$> many1 pColgroup) <|> many pCol
+ let pTh = option [] $ pInTags "tr" (pCell "th")
+ pTr = try $ skipMany pBlank >>
+ pInTags "tr" (pCell "td" <|> pCell "th")
+ pTBody = pInTag True "tbody" $ many1 pTr
+ head'' <- pInTag False "thead" (option [] pTr) <|> pInTag True "thead" pTh
+ head' <- map snd <$>
+ pInTag True "tbody"
+ (if null head'' then pTh else return head'')
+ topfoot <- option [] $ pInTag False "tfoot" $ many pTr
+ rowsLs <- many pTBody
+ bottomfoot <- option [] $ pInTag False "tfoot" $ many pTr
+ TagClose _ <- pSatisfy (matchTagClose "table")
+ let rows'' = concat rowsLs <> topfoot <> bottomfoot
+ let rows''' = map (map snd) rows''
+ -- fail on empty table
+ guard $ not $ null head' && null rows'''
+ let isSimple = onlySimpleTableCells $ fmap B.toList <$> head':rows'''
+ let cols = if null head'
+ then maximum (map length rows''')
+ else length head'
+ -- add empty cells to short rows
+ let addEmpties r = case cols - length r of
+ n | n > 0 -> r <> replicate n mempty
+ | otherwise -> r
+ let rows = map addEmpties rows'''
+ let aligns = case rows'' of
+ (cs:_) -> take cols $ map fst cs ++ repeat AlignDefault
+ _ -> replicate cols AlignDefault
+ let widths = if null widths'
+ then if isSimple
+ then replicate cols ColWidthDefault
+ else replicate cols (ColWidth (1.0 / fromIntegral cols))
+ else widths'
+ let toRow = Row nullAttr . map B.simpleCell
+ toHeaderRow l = [toRow l | not (null l)]
+ return $ B.tableWith attribs
+ (B.simpleCaption $ B.plain caption)
+ (zip aligns widths)
+ (TableHead nullAttr $ toHeaderRow head')
+ [TableBody nullAttr 0 [] $ map toRow rows]
+ (TableFoot nullAttr [])