HTML reader: extract table parsing into separate module

author: Albert Krewinkel <albert@zeitkraut.de> 2020-11-23 12:32:37 +0100
committer: Albert Krewinkel <albert@zeitkraut.de> 2020-11-24 14:17:35 +0100
commit: 41237fcc0ed890795f065c0a7f204fa5597defee (patch)
tree: b44cc82baf4194ac8fc6e922308840472a6a68c2 /src
parent: 2f110265ff4a1dc429d58e7401d68968e42b6db1 (diff)
download: pandoc-41237fcc0ed890795f065c0a7f204fa5597defee.tar.gz
3 files changed, 140 insertions, 95 deletions
diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs
index 9e84bedab..e9fefb9c0 100644
--- a/src/Text/Pandoc/Readers/HTML.hs
+++ b/src/Text/Pandoc/Readers/HTML.hs
@@ -50,10 +50,10 @@ import Text.Pandoc.CSS (foldOrElse, pickStyleAttrProps)
 import qualified Text.Pandoc.UTF8 as UTF8
 import Text.Pandoc.Definition
 import Text.Pandoc.Readers.HTML.Parsing
+import Text.Pandoc.Readers.HTML.Table (pTable')
 import Text.Pandoc.Readers.HTML.TagCategories
 import Text.Pandoc.Readers.HTML.Types
 import Text.Pandoc.Readers.LaTeX (rawLaTeXInline)
-import Text.Pandoc.XML (html5Attributes, html4Attributes, rdfaAttributes)
 import Text.Pandoc.Error
 import Text.Pandoc.Logging
 import Text.Pandoc.Options (
@@ -64,7 +64,7 @@ import Text.Pandoc.Options (
 import Text.Pandoc.Parsing hiding ((<|>))
 import Text.Pandoc.Shared (addMetaField, blocksToInlines', crFilter, escapeURI,
                            extractSpaces, htmlSpanLikeElements, elemText, splitTextBy,
-                           onlySimpleTableCells, safeRead, tshow)
+                           safeRead, tshow)
 import Text.Pandoc.Walk
 import Text.Parsec.Error
 import Text.TeXMath (readMathML, writeTeX)
@@ -474,79 +474,7 @@ pHrule = do
   return B.horizontalRule
 
 pTable :: PandocMonad m => TagParser m Blocks
-pTable = try $ do
-  TagOpen _ attribs' <- pSatisfy (matchTagOpen "table" [])
-  let attribs = toAttr attribs'
-  skipMany pBlank
-  caption <- option mempty $ pInTags "caption" inline <* skipMany pBlank
-  widths' <- (mconcat <$> many1 pColgroup) <|> many pCol
-  let pTh = option [] $ pInTags "tr" (pCell "th")
-      pTr = try $ skipMany pBlank >>
-                  pInTags "tr" (pCell "td" <|> pCell "th")
-      pTBody = pInTag True "tbody" $ many1 pTr
-  head'' <- pInTag False "thead" (option [] pTr) <|> pInTag True "thead" pTh
-  head'  <- map snd <$>
-             pInTag True "tbody"
-               (if null head'' then pTh else return head'')
-  topfoot <- option [] $ pInTag False "tfoot" $ many pTr
-  rowsLs <- many pTBody
-  bottomfoot <- option [] $ pInTag False "tfoot" $ many pTr
-  TagClose _ <- pSatisfy (matchTagClose "table")
-  let rows'' = concat rowsLs <> topfoot <> bottomfoot
-  let rows''' = map (map snd) rows''
-  -- fail on empty table
-  guard $ not $ null head' && null rows'''
-  let isSimple = onlySimpleTableCells $ fmap B.toList <$> head':rows'''
-  let cols = if null head'
-                then maximum (map length rows''')
-                else length head'
-  -- add empty cells to short rows
-  let addEmpties r = case cols - length r of
-                           n | n > 0 -> r <> replicate n mempty
-                             | otherwise -> r
-  let rows = map addEmpties rows'''
-  let aligns = case rows'' of
-                    (cs:_) -> take cols $ map fst cs ++ repeat AlignDefault
-                    _      -> replicate cols AlignDefault
-  let widths = if null widths'
-                  then if isSimple
-                       then replicate cols ColWidthDefault
-                       else replicate cols (ColWidth (1.0 / fromIntegral cols))
-                  else widths'
-  let toRow = Row nullAttr . map B.simpleCell
-      toHeaderRow l = [toRow l | not (null l)]
-  return $ B.tableWith attribs
-                   (B.simpleCaption $ B.plain caption)
-                   (zip aligns widths)
-                   (TableHead nullAttr $ toHeaderRow head')
-                   [TableBody nullAttr 0 [] $ map toRow rows]
-                   (TableFoot nullAttr [])
-
-pCol :: PandocMonad m => TagParser m ColWidth
-pCol = try $ do
-  TagOpen _ attribs' <- pSatisfy (matchTagOpen "col" [])
-  let attribs = toStringAttr attribs'
-  skipMany pBlank
-  optional $ pSatisfy (matchTagClose "col")
-  skipMany pBlank
-  let width = case lookup "width" attribs of
-                Nothing -> case lookup "style" attribs of
-                  Just (T.stripPrefix "width:" -> Just xs) | T.any (== '%') xs ->
-                    fromMaybe 0.0 $ safeRead (T.filter
-                      (`notElem` (" \t\r\n%'\";" :: [Char])) xs)
-                  _ -> 0.0
-                Just (T.unsnoc -> Just (xs, '%')) ->
-                  fromMaybe 0.0 $ safeRead xs
-                _ -> 0.0
-  if width > 0.0
-    then return $ ColWidth $ width / 100.0
-    else return ColWidthDefault
-
-pColgroup :: PandocMonad m => TagParser m [ColWidth]
-pColgroup = try $ do
-  pSatisfy (matchTagOpen "colgroup" [])
-  skipMany pBlank
-  manyTill pCol (pCloses "colgroup" <|> eof) <* skipMany pBlank
+pTable = pTable' inline pCell
 
 noColOrRowSpans :: Tag Text -> Bool
 noColOrRowSpans t = isNullOrOne "colspan" && isNullOrOne "rowspan"
@@ -847,16 +775,6 @@ pRawHtmlInline = do
 mathMLToTeXMath :: Text -> Either Text Text
 mathMLToTeXMath s = writeTeX <$> readMathML s
 
-toStringAttr :: [(Text, Text)] -> [(Text, Text)]
-toStringAttr = map go
-  where
-   go (x,y) =
-     case T.stripPrefix "data-" x of
-       Just x' | x' `Set.notMember` (html5Attributes <>
-                                     html4Attributes <> rdfaAttributes)
-         -> (x',y)
-       _ -> (x,y)
-
 pScriptMath :: PandocMonad m => TagParser m Inlines
 pScriptMath = try $ do
   TagOpen _ attr' <- pSatisfy $ tagOpen (=="script") (const True)
@@ -1151,16 +1069,6 @@ htmlTag f = try $ do
          handleTag tagname
        _ -> mzero
 
-mkAttr :: [(Text, Text)] -> Attr
-mkAttr attr = (attribsId, attribsClasses, attribsKV)
-  where attribsId = fromMaybe "" $ lookup "id" attr
-        attribsClasses = T.words (fromMaybe "" $ lookup "class" attr) <> epubTypes
-        attribsKV = filter (\(k,_) -> k /= "class" && k /= "id") attr
-        epubTypes = T.words $ fromMaybe "" $ lookup "epub:type" attr
-
-toAttr :: [(Text, Text)] -> Attr
-toAttr = mkAttr . toStringAttr
-
 -- Strip namespace prefixes
 stripPrefixes :: [Tag Text] -> [Tag Text]
 stripPrefixes = map stripPrefix
diff --git a/src/Text/Pandoc/Readers/HTML/Parsing.hs b/src/Text/Pandoc/Readers/HTML/Parsing.hs
index 7fda066b5..e68e43b25 100644
--- a/src/Text/Pandoc/Readers/HTML/Parsing.hs
+++ b/src/Text/Pandoc/Readers/HTML/Parsing.hs
@@ -21,19 +21,25 @@ module Text.Pandoc.Readers.HTML.Parsing
   , matchTagClose
   , matchTagOpen
   , isSpace
+  , toAttr
+  , toStringAttr
   )
 where
 
 import Control.Monad (guard, void, mzero)
+import Data.Maybe (fromMaybe)
 import Data.Text (Text)
 import Text.HTML.TagSoup
+  ( Tag (..), (~==), isTagText, isTagPosition, isTagOpen, isTagClose )
 import Text.Pandoc.Class.PandocMonad (PandocMonad (..))
+import Text.Pandoc.Definition (Attr)
 import Text.Pandoc.Parsing
   ( (<|>), eof, getPosition, lookAhead, manyTill, newPos, optional
   , skipMany, setPosition, token, try)
 import Text.Pandoc.Readers.HTML.TagCategories
 import Text.Pandoc.Readers.HTML.Types
 import Text.Pandoc.Shared (tshow)
+import Text.Pandoc.XML (html5Attributes, html4Attributes, rdfaAttributes)
 import qualified Data.Set as Set
 import qualified Data.Text as T
 
@@ -154,3 +160,23 @@ t1 `closes` t2 |
    t2 `Set.notMember` blockTags &&
    t2 `Set.notMember` eitherBlockOrInline = True
 _ `closes` _ = False
+
+toStringAttr :: [(Text, Text)] -> [(Text, Text)]
+toStringAttr = map go
+  where
+   go (x,y) =
+     case T.stripPrefix "data-" x of
+       Just x' | x' `Set.notMember` (html5Attributes <>
+                                     html4Attributes <> rdfaAttributes)
+         -> (x',y)
+       _ -> (x,y)
+
+mkAttr :: [(Text, Text)] -> Attr
+mkAttr attr = (attribsId, attribsClasses, attribsKV)
+  where attribsId = fromMaybe "" $ lookup "id" attr
+        attribsClasses = T.words (fromMaybe "" $ lookup "class" attr) <> epubTypes
+        attribsKV = filter (\(k,_) -> k /= "class" && k /= "id") attr
+        epubTypes = T.words $ fromMaybe "" $ lookup "epub:type" attr
+
+toAttr :: [(Text, Text)] -> Attr
+toAttr = mkAttr . toStringAttr
diff --git a/src/Text/Pandoc/Readers/HTML/Table.hs b/src/Text/Pandoc/Readers/HTML/Table.hs
new file mode 100644
index 000000000..bad39bd2d
--- /dev/null
+++ b/src/Text/Pandoc/Readers/HTML/Table.hs
@@ -0,0 +1,111 @@
+{-# LANGUAGE OverloadedStrings #-}
+{-# LANGUAGE ViewPatterns #-}
+{- |
+   Module      : Text.Pandoc.Readers.HTML.Table
+   Copyright   : © 2006-2020 John MacFarlane,
+                   2020 Albert Krewinkel
+   License     : GNU GPL, version 2 or above
+
+   Maintainer  : Albert Krewinkel <albert@zeitkraut.de>
+   Stability   : alpha
+   Portability : portable
+
+HTML table parser.
+-}
+module Text.Pandoc.Readers.HTML.Table (pTable') where
+
+import Control.Monad (guard)
+import Data.Maybe (fromMaybe)
+import Data.Text (Text)
+import Text.HTML.TagSoup
+import Text.Pandoc.Builder (Blocks, Inlines)
+import Text.Pandoc.Definition
+import Text.Pandoc.Class.PandocMonad (PandocMonad (..))
+import Text.Pandoc.Parsing
+  ( (<|>), eof, many, many1, manyTill, option, optional, skipMany, try)
+import Text.Pandoc.Readers.HTML.Parsing
+import Text.Pandoc.Readers.HTML.Types (TagParser)
+import Text.Pandoc.Shared (onlySimpleTableCells, safeRead)
+import qualified Data.Text as T
+import qualified Text.Pandoc.Builder as B
+
+-- | Parses a @<col>@ element, returning the column's width. Defaults to
+-- @'ColWidthDefault'@ if the width is not set or cannot be determined.
+pCol :: PandocMonad m => TagParser m ColWidth
+pCol = try $ do
+  TagOpen _ attribs' <- pSatisfy (matchTagOpen "col" [])
+  let attribs = toStringAttr attribs'
+  skipMany pBlank
+  optional $ pSatisfy (matchTagClose "col")
+  skipMany pBlank
+  let width = case lookup "width" attribs of
+                Nothing -> case lookup "style" attribs of
+                  Just (T.stripPrefix "width:" -> Just xs) | T.any (== '%') xs ->
+                    fromMaybe 0.0 $ safeRead (T.filter
+                      (`notElem` (" \t\r\n%'\";" :: [Char])) xs)
+                  _ -> 0.0
+                Just (T.unsnoc -> Just (xs, '%')) ->
+                  fromMaybe 0.0 $ safeRead xs
+                _ -> 0.0
+  if width > 0.0
+    then return $ ColWidth $ width / 100.0
+    else return ColWidthDefault
+
+pColgroup :: PandocMonad m => TagParser m [ColWidth]
+pColgroup = try $ do
+  pSatisfy (matchTagOpen "colgroup" [])
+  skipMany pBlank
+  manyTill pCol (pCloses "colgroup" <|> eof) <* skipMany pBlank
+
+-- | Parses a simple HTML table
+pTable' :: PandocMonad m
+        => TagParser m Inlines                         -- ^ Caption parser
+        -> (Text -> TagParser m [(Alignment, Blocks)]) -- ^ Table cell parser
+        -> TagParser m Blocks
+pTable' inline pCell = try $ do
+  TagOpen _ attribs' <- pSatisfy (matchTagOpen "table" [])
+  let attribs = toAttr attribs'
+  skipMany pBlank
+  caption <- option mempty $ pInTags "caption" inline <* skipMany pBlank
+  widths' <- (mconcat <$> many1 pColgroup) <|> many pCol
+  let pTh = option [] $ pInTags "tr" (pCell "th")
+      pTr = try $ skipMany pBlank >>
+                  pInTags "tr" (pCell "td" <|> pCell "th")
+      pTBody = pInTag True "tbody" $ many1 pTr
+  head'' <- pInTag False "thead" (option [] pTr) <|> pInTag True "thead" pTh
+  head'  <- map snd <$>
+             pInTag True "tbody"
+               (if null head'' then pTh else return head'')
+  topfoot <- option [] $ pInTag False "tfoot" $ many pTr
+  rowsLs <- many pTBody
+  bottomfoot <- option [] $ pInTag False "tfoot" $ many pTr
+  TagClose _ <- pSatisfy (matchTagClose "table")
+  let rows'' = concat rowsLs <> topfoot <> bottomfoot
+  let rows''' = map (map snd) rows''
+  -- fail on empty table
+  guard $ not $ null head' && null rows'''
+  let isSimple = onlySimpleTableCells $ fmap B.toList <$> head':rows'''
+  let cols = if null head'
+                then maximum (map length rows''')
+                else length head'
+  -- add empty cells to short rows
+  let addEmpties r = case cols - length r of
+                           n | n > 0 -> r <> replicate n mempty
+                             | otherwise -> r
+  let rows = map addEmpties rows'''
+  let aligns = case rows'' of
+                    (cs:_) -> take cols $ map fst cs ++ repeat AlignDefault
+                    _      -> replicate cols AlignDefault
+  let widths = if null widths'
+                  then if isSimple
+                       then replicate cols ColWidthDefault
+                       else replicate cols (ColWidth (1.0 / fromIntegral cols))
+                  else widths'
+  let toRow = Row nullAttr . map B.simpleCell
+      toHeaderRow l = [toRow l | not (null l)]
+  return $ B.tableWith attribs
+                   (B.simpleCaption $ B.plain caption)
+                   (zip aligns widths)
+                   (TableHead nullAttr $ toHeaderRow head')
+                   [TableBody nullAttr 0 [] $ map toRow rows]
+                   (TableFoot nullAttr [])
author	Albert Krewinkel <albert@zeitkraut.de>	2020-11-23 12:32:37 +0100
committer	Albert Krewinkel <albert@zeitkraut.de>	2020-11-24 14:17:35 +0100
commit	41237fcc0ed890795f065c0a7f204fa5597defee (patch)
tree	b44cc82baf4194ac8fc6e922308840472a6a68c2 /src
parent	2f110265ff4a1dc429d58e7401d68968e42b6db1 (diff)
download	pandoc-41237fcc0ed890795f065c0a7f204fa5597defee.tar.gz