aboutsummaryrefslogtreecommitdiff
path: root/src/Text/Pandoc/Readers/HTML
diff options
context:
space:
mode:
Diffstat (limited to 'src/Text/Pandoc/Readers/HTML')
-rw-r--r--src/Text/Pandoc/Readers/HTML/Parsing.hs26
-rw-r--r--src/Text/Pandoc/Readers/HTML/Table.hs57
-rw-r--r--src/Text/Pandoc/Readers/HTML/TagCategories.hs2
-rw-r--r--src/Text/Pandoc/Readers/HTML/Types.hs2
4 files changed, 58 insertions, 29 deletions
diff --git a/src/Text/Pandoc/Readers/HTML/Parsing.hs b/src/Text/Pandoc/Readers/HTML/Parsing.hs
index 2d58319da..bd8d7c96c 100644
--- a/src/Text/Pandoc/Readers/HTML/Parsing.hs
+++ b/src/Text/Pandoc/Readers/HTML/Parsing.hs
@@ -2,7 +2,7 @@
{-# LANGUAGE OverloadedStrings #-}
{- |
Module : Text.Pandoc.Readers.HTML.Parsing
- Copyright : Copyright (C) 2006-2020 John MacFarlane
+ Copyright : Copyright (C) 2006-2021 John MacFarlane
License : GNU GPL, version 2 or above
Maintainer : John MacFarlane <jgm@berkeley.edu>
@@ -161,10 +161,12 @@ _ `closes` "html" = False
"li" `closes` "li" = True
"th" `closes` t | t `elem` ["th","td"] = True
"td" `closes` t | t `elem` ["th","td"] = True
-"tr" `closes` t | t `elem` ["th","td","tr"] = True
+"tr" `closes` t | t `elem` ["th","td","tr","colgroup"] = True
"dd" `closes` t | t `elem` ["dt", "dd"] = True
"dt" `closes` t | t `elem` ["dt","dd"] = True
"rt" `closes` t | t `elem` ["rb", "rt", "rtc"] = True
+"col" `closes` "col" = True
+"colgroup" `closes` "col" = True
"optgroup" `closes` "optgroup" = True
"optgroup" `closes` "option" = True
"option" `closes` "option" = True
@@ -193,14 +195,20 @@ t1 `closes` t2 |
_ `closes` _ = False
toStringAttr :: [(Text, Text)] -> [(Text, Text)]
-toStringAttr = map go
+toStringAttr = foldr go []
where
- go (x,y) =
- case T.stripPrefix "data-" x of
- Just x' | x' `Set.notMember` (html5Attributes <>
- html4Attributes <> rdfaAttributes)
- -> (x',y)
- _ -> (x,y)
+ go :: (Text, Text) -> [(Text, Text)] -> [(Text, Text)]
+ -- treat xml:lang as lang
+ go ("xml:lang",y) ats = go ("lang",y) ats
+ -- prevent duplicate attributes
+ go (x,y) ats
+ | any (\(x',_) -> x == x') ats = ats
+ | otherwise =
+ case T.stripPrefix "data-" x of
+ Just x' | x' `Set.notMember` (html5Attributes <>
+ html4Attributes <> rdfaAttributes)
+ -> go (x',y) ats
+ _ -> (x,y):ats
-- Unlike fromAttrib from tagsoup, this distinguishes
-- between a missing attribute and an attribute with empty content.
diff --git a/src/Text/Pandoc/Readers/HTML/Table.hs b/src/Text/Pandoc/Readers/HTML/Table.hs
index 5a783988f..6e62e12f5 100644
--- a/src/Text/Pandoc/Readers/HTML/Table.hs
+++ b/src/Text/Pandoc/Readers/HTML/Table.hs
@@ -3,8 +3,8 @@
{-# LANGUAGE ViewPatterns #-}
{- |
Module : Text.Pandoc.Readers.HTML.Table
- Copyright : © 2006-2020 John MacFarlane,
- 2020 Albert Krewinkel
+ Copyright : © 2006-2021 John MacFarlane,
+ 2020-2021 Albert Krewinkel
License : GNU GPL, version 2 or above
Maintainer : Albert Krewinkel <albert@zeitkraut.de>
@@ -17,6 +17,8 @@ module Text.Pandoc.Readers.HTML.Table (pTable) where
import Control.Applicative ((<|>))
import Data.Maybe (fromMaybe)
+import Data.Either (lefts, rights)
+import Data.List.NonEmpty (nonEmpty)
import Data.Text (Text)
import Text.HTML.TagSoup
import Text.Pandoc.Builder (Blocks)
@@ -32,34 +34,51 @@ import Text.Pandoc.Shared (onlySimpleTableCells, safeRead)
import qualified Data.Text as T
import qualified Text.Pandoc.Builder as B
--- | Parses a @<col>@ element, returning the column's width. Defaults to
--- @'ColWidthDefault'@ if the width is not set or cannot be determined.
-pCol :: PandocMonad m => TagParser m ColWidth
+-- | Parses a @<col>@ element, returning the column's width.
+-- An Either value is used: Left i means a "relative length" with
+-- integral value i (see https://www.w3.org/TR/html4/types.html#h-6.6);
+-- Right w means a regular width. Defaults to @'Right ColWidthDefault'@
+-- if the width is not set or cannot be determined.
+pCol :: PandocMonad m => TagParser m (Either Int ColWidth)
pCol = try $ do
TagOpen _ attribs' <- pSatisfy (matchTagOpen "col" [])
let attribs = toStringAttr attribs'
skipMany pBlank
optional $ pSatisfy (matchTagClose "col")
skipMany pBlank
- let width = case lookup "width" attribs of
+ return $ case lookup "width" attribs of
Nothing -> case lookup "style" attribs of
Just (T.stripPrefix "width:" -> Just xs) | T.any (== '%') xs ->
- fromMaybe 0.0 $ safeRead (T.filter
- (`notElem` (" \t\r\n%'\";" :: [Char])) xs)
- _ -> 0.0
+ maybe (Right ColWidthDefault) (Right . ColWidth . (/ 100.0))
+ $ safeRead (T.filter
+ (`notElem` (" \t\r\n%'\";" :: [Char])) xs)
+ _ -> Right ColWidthDefault
+ Just (T.unsnoc -> Just (xs, '*')) ->
+ maybe (Left 1) Left $ safeRead xs
Just (T.unsnoc -> Just (xs, '%')) ->
- fromMaybe 0.0 $ safeRead xs
- _ -> 0.0
- if width > 0.0
- then return $ ColWidth $ width / 100.0
- else return ColWidthDefault
+ maybe (Right ColWidthDefault)
+ (Right . ColWidth . (/ 100.0)) $ safeRead xs
+ _ -> Right ColWidthDefault
-pColgroup :: PandocMonad m => TagParser m [ColWidth]
+pColgroup :: PandocMonad m => TagParser m [Either Int ColWidth]
pColgroup = try $ do
pSatisfy (matchTagOpen "colgroup" [])
skipMany pBlank
manyTill pCol (pCloses "colgroup" <|> eof) <* skipMany pBlank
+resolveRelativeLengths :: [Either Int ColWidth] -> [ColWidth]
+resolveRelativeLengths ws =
+ let remaining = 1 - sum (map getColWidth $ rights ws)
+ relatives = sum $ lefts ws
+ relUnit = remaining / fromIntegral relatives
+ toColWidth (Right x) = x
+ toColWidth (Left i) = ColWidth (fromIntegral i * relUnit)
+ in map toColWidth ws
+
+getColWidth :: ColWidth -> Double
+getColWidth ColWidthDefault = 0
+getColWidth (ColWidth w) = w
+
data CellType
= HeaderCell
| BodyCell
@@ -181,7 +200,8 @@ pTable :: PandocMonad m
pTable block = try $ do
TagOpen _ attribs <- pSatisfy (matchTagOpen "table" []) <* skipMany pBlank
caption <- option mempty $ pInTags "caption" block <* skipMany pBlank
- widths <- ((mconcat <$> many1 pColgroup) <|> many pCol) <* skipMany pBlank
+ widths <- resolveRelativeLengths <$>
+ ((mconcat <$> many1 pColgroup) <|> many pCol) <* skipMany pBlank
thead <- pTableHead block <* skipMany pBlank
topfoot <- optionMaybe (pTableFoot block) <* skipMany pBlank
tbodies <- many (pTableBody block) <* skipMany pBlank
@@ -214,8 +234,9 @@ normalize :: [ColWidth] -> TableHead -> [TableBody] -> TableFoot
-> Either String ([ColSpec], TableHead, [TableBody], TableFoot)
normalize widths head' bodies foot = do
let rows = headRows head' <> concatMap bodyRows bodies <> footRows foot
- let rowLength = length . rowCells
- let ncols = maximum (map rowLength rows)
+ let cellWidth (Cell _ _ _ (ColSpan cs) _) = cs
+ let rowLength = foldr (\cell acc -> cellWidth cell + acc) 0 . rowCells
+ let ncols = maybe 0 maximum $ nonEmpty $ map rowLength rows
let tblType = tableType (map rowCells rows)
-- fail on empty table
if null rows
diff --git a/src/Text/Pandoc/Readers/HTML/TagCategories.hs b/src/Text/Pandoc/Readers/HTML/TagCategories.hs
index 4f82a1831..b7bd40fee 100644
--- a/src/Text/Pandoc/Readers/HTML/TagCategories.hs
+++ b/src/Text/Pandoc/Readers/HTML/TagCategories.hs
@@ -1,7 +1,7 @@
{-# LANGUAGE OverloadedStrings #-}
{- |
Module : Text.Pandoc.Readers.HTML.TagCategories
- Copyright : Copyright (C) 2006-2020 John MacFarlane
+ Copyright : Copyright (C) 2006-2021 John MacFarlane
License : GNU GPL, version 2 or above
Maintainer : John MacFarlane <jgm@berkeley.edu>
diff --git a/src/Text/Pandoc/Readers/HTML/Types.hs b/src/Text/Pandoc/Readers/HTML/Types.hs
index a94eeb828..12c519ad6 100644
--- a/src/Text/Pandoc/Readers/HTML/Types.hs
+++ b/src/Text/Pandoc/Readers/HTML/Types.hs
@@ -2,7 +2,7 @@
{-# LANGUAGE MultiParamTypeClasses #-}
{- |
Module : Text.Pandoc.Readers.HTML.Types
- Copyright : Copyright (C) 2006-2020 John MacFarlane
+ Copyright : Copyright (C) 2006-2021 John MacFarlane
License : GNU GPL, version 2 or above
Maintainer : John MacFarlane <jgm@berkeley.edu>