diff options
Diffstat (limited to 'src/Text/Pandoc/Readers/HTML')
-rw-r--r-- | src/Text/Pandoc/Readers/HTML/Parsing.hs | 26 | ||||
-rw-r--r-- | src/Text/Pandoc/Readers/HTML/Table.hs | 57 | ||||
-rw-r--r-- | src/Text/Pandoc/Readers/HTML/TagCategories.hs | 2 | ||||
-rw-r--r-- | src/Text/Pandoc/Readers/HTML/Types.hs | 2 |
4 files changed, 58 insertions, 29 deletions
diff --git a/src/Text/Pandoc/Readers/HTML/Parsing.hs b/src/Text/Pandoc/Readers/HTML/Parsing.hs index 2d58319da..bd8d7c96c 100644 --- a/src/Text/Pandoc/Readers/HTML/Parsing.hs +++ b/src/Text/Pandoc/Readers/HTML/Parsing.hs @@ -2,7 +2,7 @@ {-# LANGUAGE OverloadedStrings #-} {- | Module : Text.Pandoc.Readers.HTML.Parsing - Copyright : Copyright (C) 2006-2020 John MacFarlane + Copyright : Copyright (C) 2006-2021 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -161,10 +161,12 @@ _ `closes` "html" = False "li" `closes` "li" = True "th" `closes` t | t `elem` ["th","td"] = True "td" `closes` t | t `elem` ["th","td"] = True -"tr" `closes` t | t `elem` ["th","td","tr"] = True +"tr" `closes` t | t `elem` ["th","td","tr","colgroup"] = True "dd" `closes` t | t `elem` ["dt", "dd"] = True "dt" `closes` t | t `elem` ["dt","dd"] = True "rt" `closes` t | t `elem` ["rb", "rt", "rtc"] = True +"col" `closes` "col" = True +"colgroup" `closes` "col" = True "optgroup" `closes` "optgroup" = True "optgroup" `closes` "option" = True "option" `closes` "option" = True @@ -193,14 +195,20 @@ t1 `closes` t2 | _ `closes` _ = False toStringAttr :: [(Text, Text)] -> [(Text, Text)] -toStringAttr = map go +toStringAttr = foldr go [] where - go (x,y) = - case T.stripPrefix "data-" x of - Just x' | x' `Set.notMember` (html5Attributes <> - html4Attributes <> rdfaAttributes) - -> (x',y) - _ -> (x,y) + go :: (Text, Text) -> [(Text, Text)] -> [(Text, Text)] + -- treat xml:lang as lang + go ("xml:lang",y) ats = go ("lang",y) ats + -- prevent duplicate attributes + go (x,y) ats + | any (\(x',_) -> x == x') ats = ats + | otherwise = + case T.stripPrefix "data-" x of + Just x' | x' `Set.notMember` (html5Attributes <> + html4Attributes <> rdfaAttributes) + -> go (x',y) ats + _ -> (x,y):ats -- Unlike fromAttrib from tagsoup, this distinguishes -- between a missing attribute and an attribute with empty content. diff --git a/src/Text/Pandoc/Readers/HTML/Table.hs b/src/Text/Pandoc/Readers/HTML/Table.hs index 5a783988f..6e62e12f5 100644 --- a/src/Text/Pandoc/Readers/HTML/Table.hs +++ b/src/Text/Pandoc/Readers/HTML/Table.hs @@ -3,8 +3,8 @@ {-# LANGUAGE ViewPatterns #-} {- | Module : Text.Pandoc.Readers.HTML.Table - Copyright : © 2006-2020 John MacFarlane, - 2020 Albert Krewinkel + Copyright : © 2006-2021 John MacFarlane, + 2020-2021 Albert Krewinkel License : GNU GPL, version 2 or above Maintainer : Albert Krewinkel <albert@zeitkraut.de> @@ -17,6 +17,8 @@ module Text.Pandoc.Readers.HTML.Table (pTable) where import Control.Applicative ((<|>)) import Data.Maybe (fromMaybe) +import Data.Either (lefts, rights) +import Data.List.NonEmpty (nonEmpty) import Data.Text (Text) import Text.HTML.TagSoup import Text.Pandoc.Builder (Blocks) @@ -32,34 +34,51 @@ import Text.Pandoc.Shared (onlySimpleTableCells, safeRead) import qualified Data.Text as T import qualified Text.Pandoc.Builder as B --- | Parses a @<col>@ element, returning the column's width. Defaults to --- @'ColWidthDefault'@ if the width is not set or cannot be determined. -pCol :: PandocMonad m => TagParser m ColWidth +-- | Parses a @<col>@ element, returning the column's width. +-- An Either value is used: Left i means a "relative length" with +-- integral value i (see https://www.w3.org/TR/html4/types.html#h-6.6); +-- Right w means a regular width. Defaults to @'Right ColWidthDefault'@ +-- if the width is not set or cannot be determined. +pCol :: PandocMonad m => TagParser m (Either Int ColWidth) pCol = try $ do TagOpen _ attribs' <- pSatisfy (matchTagOpen "col" []) let attribs = toStringAttr attribs' skipMany pBlank optional $ pSatisfy (matchTagClose "col") skipMany pBlank - let width = case lookup "width" attribs of + return $ case lookup "width" attribs of Nothing -> case lookup "style" attribs of Just (T.stripPrefix "width:" -> Just xs) | T.any (== '%') xs -> - fromMaybe 0.0 $ safeRead (T.filter - (`notElem` (" \t\r\n%'\";" :: [Char])) xs) - _ -> 0.0 + maybe (Right ColWidthDefault) (Right . ColWidth . (/ 100.0)) + $ safeRead (T.filter + (`notElem` (" \t\r\n%'\";" :: [Char])) xs) + _ -> Right ColWidthDefault + Just (T.unsnoc -> Just (xs, '*')) -> + maybe (Left 1) Left $ safeRead xs Just (T.unsnoc -> Just (xs, '%')) -> - fromMaybe 0.0 $ safeRead xs - _ -> 0.0 - if width > 0.0 - then return $ ColWidth $ width / 100.0 - else return ColWidthDefault + maybe (Right ColWidthDefault) + (Right . ColWidth . (/ 100.0)) $ safeRead xs + _ -> Right ColWidthDefault -pColgroup :: PandocMonad m => TagParser m [ColWidth] +pColgroup :: PandocMonad m => TagParser m [Either Int ColWidth] pColgroup = try $ do pSatisfy (matchTagOpen "colgroup" []) skipMany pBlank manyTill pCol (pCloses "colgroup" <|> eof) <* skipMany pBlank +resolveRelativeLengths :: [Either Int ColWidth] -> [ColWidth] +resolveRelativeLengths ws = + let remaining = 1 - sum (map getColWidth $ rights ws) + relatives = sum $ lefts ws + relUnit = remaining / fromIntegral relatives + toColWidth (Right x) = x + toColWidth (Left i) = ColWidth (fromIntegral i * relUnit) + in map toColWidth ws + +getColWidth :: ColWidth -> Double +getColWidth ColWidthDefault = 0 +getColWidth (ColWidth w) = w + data CellType = HeaderCell | BodyCell @@ -181,7 +200,8 @@ pTable :: PandocMonad m pTable block = try $ do TagOpen _ attribs <- pSatisfy (matchTagOpen "table" []) <* skipMany pBlank caption <- option mempty $ pInTags "caption" block <* skipMany pBlank - widths <- ((mconcat <$> many1 pColgroup) <|> many pCol) <* skipMany pBlank + widths <- resolveRelativeLengths <$> + ((mconcat <$> many1 pColgroup) <|> many pCol) <* skipMany pBlank thead <- pTableHead block <* skipMany pBlank topfoot <- optionMaybe (pTableFoot block) <* skipMany pBlank tbodies <- many (pTableBody block) <* skipMany pBlank @@ -214,8 +234,9 @@ normalize :: [ColWidth] -> TableHead -> [TableBody] -> TableFoot -> Either String ([ColSpec], TableHead, [TableBody], TableFoot) normalize widths head' bodies foot = do let rows = headRows head' <> concatMap bodyRows bodies <> footRows foot - let rowLength = length . rowCells - let ncols = maximum (map rowLength rows) + let cellWidth (Cell _ _ _ (ColSpan cs) _) = cs + let rowLength = foldr (\cell acc -> cellWidth cell + acc) 0 . rowCells + let ncols = maybe 0 maximum $ nonEmpty $ map rowLength rows let tblType = tableType (map rowCells rows) -- fail on empty table if null rows diff --git a/src/Text/Pandoc/Readers/HTML/TagCategories.hs b/src/Text/Pandoc/Readers/HTML/TagCategories.hs index 4f82a1831..b7bd40fee 100644 --- a/src/Text/Pandoc/Readers/HTML/TagCategories.hs +++ b/src/Text/Pandoc/Readers/HTML/TagCategories.hs @@ -1,7 +1,7 @@ {-# LANGUAGE OverloadedStrings #-} {- | Module : Text.Pandoc.Readers.HTML.TagCategories - Copyright : Copyright (C) 2006-2020 John MacFarlane + Copyright : Copyright (C) 2006-2021 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> diff --git a/src/Text/Pandoc/Readers/HTML/Types.hs b/src/Text/Pandoc/Readers/HTML/Types.hs index a94eeb828..12c519ad6 100644 --- a/src/Text/Pandoc/Readers/HTML/Types.hs +++ b/src/Text/Pandoc/Readers/HTML/Types.hs @@ -2,7 +2,7 @@ {-# LANGUAGE MultiParamTypeClasses #-} {- | Module : Text.Pandoc.Readers.HTML.Types - Copyright : Copyright (C) 2006-2020 John MacFarlane + Copyright : Copyright (C) 2006-2021 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> |