diff options
author | Albert Krewinkel <albert@zeitkraut.de> | 2020-11-26 07:22:01 +0100 |
---|---|---|
committer | Albert Krewinkel <albert@zeitkraut.de> | 2020-11-26 07:22:01 +0100 |
commit | 07919e1b2270a906019575e4ce85590d6754d41c (patch) | |
tree | c04aa488cb17b00ac13b72a67c3ad4e0e8c3efd6 /src | |
parent | 3e01ae405f9bf5f40e1b8e519029825aa4880602 (diff) | |
download | pandoc-07919e1b2270a906019575e4ce85590d6754d41c.tar.gz |
HTML reader: improve support for table headers, footer, attributes
- `<tfoot>` elements are no longer added to the table body but used as
table footer.
- Separate `<tbody>` elements are no longer combined into one.
- Attributes on `<thead>`, `<tbody>`, `<th>`/`<td>`, and `<tfoot>`
elements are preserved.
Diffstat (limited to 'src')
-rw-r--r-- | src/Text/Pandoc/CSS.hs | 37 | ||||
-rw-r--r-- | src/Text/Pandoc/Readers/HTML.hs | 40 | ||||
-rw-r--r-- | src/Text/Pandoc/Readers/HTML/Parsing.hs | 47 | ||||
-rw-r--r-- | src/Text/Pandoc/Readers/HTML/Table.hs | 238 |
4 files changed, 244 insertions, 118 deletions
diff --git a/src/Text/Pandoc/CSS.hs b/src/Text/Pandoc/CSS.hs index 80251850b..d98c85147 100644 --- a/src/Text/Pandoc/CSS.hs +++ b/src/Text/Pandoc/CSS.hs @@ -11,41 +11,46 @@ Portability : portable Tools for working with CSS. -} -module Text.Pandoc.CSS ( pickStyleAttrProps - , pickStylesToKVs - ) +module Text.Pandoc.CSS + ( cssAttributes + , pickStyleAttrProps + , pickStylesToKVs + ) where -import qualified Data.Text as T import Data.Maybe (mapMaybe, listToMaybe) +import Data.Text (Text, pack) import Text.Pandoc.Shared (trim) import Text.Parsec import Text.Parsec.Text -ruleParser :: Parser (T.Text, T.Text) +ruleParser :: Parser (Text, Text) ruleParser = do p <- many1 (noneOf ":") <* char ':' v <- many1 (noneOf ":;") <* optional (char ';') <* spaces - return (trim $ T.pack p, trim $ T.pack v) + return (trim $ pack p, trim $ pack v) -styleAttrParser :: Parser [(T.Text, T.Text)] +styleAttrParser :: Parser [(Text, Text)] styleAttrParser = many1 ruleParser -eitherToMaybe :: Either a b -> Maybe b -eitherToMaybe (Right x) = Just x -eitherToMaybe _ = Nothing +-- | Parses a style string, returning the CSS attributes. +-- Returns an empty list on failure. +cssAttributes :: Text -> [(Text, Text)] +cssAttributes styleString = + -- Use Data.Either.fromRight once GHC 8.0 is no longer supported + case parse styleAttrParser "" styleString of + Left _ -> [] + Right x -> x -- | takes a list of keys/properties and a CSS string and -- returns the corresponding key-value-pairs. -pickStylesToKVs :: [T.Text] -> T.Text -> [(T.Text, T.Text)] +pickStylesToKVs :: [Text] -> Text -> [(Text, Text)] pickStylesToKVs props styleAttr = - case parse styleAttrParser "" styleAttr of - Left _ -> [] - Right styles -> filter (\s -> fst s `elem` props) styles + filter (\s -> fst s `elem` props) $ cssAttributes styleAttr -- | takes a list of key/property synonyms and a CSS string and maybe -- returns the value of the first match (in order of the supplied list) -pickStyleAttrProps :: [T.Text] -> T.Text -> Maybe T.Text +pickStyleAttrProps :: [Text] -> Text -> Maybe Text pickStyleAttrProps lookupProps styleAttr = do - styles <- eitherToMaybe $ parse styleAttrParser "" styleAttr + styles <- either (const Nothing) Just $ parse styleAttrParser "" styleAttr listToMaybe $ mapMaybe (`lookup` styles) lookupProps diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs index fa996d2f0..eb78979a3 100644 --- a/src/Text/Pandoc/Readers/HTML.hs +++ b/src/Text/Pandoc/Readers/HTML.hs @@ -26,7 +26,7 @@ module Text.Pandoc.Readers.HTML ( readHtml import Control.Applicative ((<|>)) import Control.Arrow (first) -import Control.Monad (guard, mplus, msum, mzero, unless, void) +import Control.Monad (guard, msum, mzero, unless, void) import Control.Monad.Except (throwError) import Control.Monad.Reader (ask, asks, lift, local, runReaderT) import Data.ByteString.Base64 (encode) @@ -50,7 +50,7 @@ import Text.Pandoc.CSS (pickStyleAttrProps) import qualified Text.Pandoc.UTF8 as UTF8 import Text.Pandoc.Definition import Text.Pandoc.Readers.HTML.Parsing -import Text.Pandoc.Readers.HTML.Table (pTable') +import Text.Pandoc.Readers.HTML.Table (pTable) import Text.Pandoc.Readers.HTML.TagCategories import Text.Pandoc.Readers.HTML.Types import Text.Pandoc.Readers.LaTeX (rawLaTeXInline) @@ -63,8 +63,7 @@ import Text.Pandoc.Options ( extensionEnabled) import Text.Pandoc.Parsing hiding ((<|>)) import Text.Pandoc.Shared (addMetaField, blocksToInlines', crFilter, escapeURI, - extractSpaces, htmlSpanLikeElements, elemText, splitTextBy, - safeRead, tshow) + extractSpaces, htmlSpanLikeElements, safeRead, tshow) import Text.Pandoc.Walk import Text.Parsec.Error import Text.TeXMath (readMathML, writeTeX) @@ -159,7 +158,7 @@ block = do , pCodeBlock , pList , pHrule - , pTable + , pTable block , pHtml , pHead , pBody @@ -464,31 +463,6 @@ pHrule = do pSelfClosing (=="hr") (const True) return B.horizontalRule -pTable :: PandocMonad m => TagParser m Blocks -pTable = pTable' block pCell - -pCell :: PandocMonad m => Text -> TagParser m [Cell] -pCell celltype = try $ do - skipMany pBlank - tag <- lookAhead $ pSatisfy (\t -> t ~== TagOpen celltype []) - let extractAlign' [] = "" - extractAlign' ("text-align":x:_) = x - extractAlign' (_:xs) = extractAlign' xs - let extractAlign = extractAlign' . splitTextBy (`elemText` " \t;:") - let align = case maybeFromAttrib "align" tag `mplus` - (extractAlign <$> maybeFromAttrib "style" tag) of - Just "left" -> AlignLeft - Just "right" -> AlignRight - Just "center" -> AlignCenter - _ -> AlignDefault - let rowspan = RowSpan . fromMaybe 1 $ - safeRead =<< maybeFromAttrib "rowspan" tag - let colspan = ColSpan . fromMaybe 1 $ - safeRead =<< maybeFromAttrib "colspan" tag - res <- pInTags celltype block - skipMany pBlank - return [B.cell align rowspan colspan res] - pBlockQuote :: PandocMonad m => TagParser m Blocks pBlockQuote = do contents <- pInTags "blockquote" block @@ -653,12 +627,6 @@ pLineBreak = do pSelfClosing (=="br") (const True) return B.linebreak --- Unlike fromAttrib from tagsoup, this distinguishes --- between a missing attribute and an attribute with empty content. -maybeFromAttrib :: Text -> Tag Text -> Maybe Text -maybeFromAttrib name (TagOpen _ attrs) = lookup name attrs -maybeFromAttrib _ _ = Nothing - pLink :: PandocMonad m => TagParser m Inlines pLink = try $ do tag <- pSatisfy $ tagOpenLit "a" (const True) diff --git a/src/Text/Pandoc/Readers/HTML/Parsing.hs b/src/Text/Pandoc/Readers/HTML/Parsing.hs index 8788a933e..2d58319da 100644 --- a/src/Text/Pandoc/Readers/HTML/Parsing.hs +++ b/src/Text/Pandoc/Readers/HTML/Parsing.hs @@ -1,3 +1,4 @@ +{-# LANGUAGE LambdaCase #-} {-# LANGUAGE OverloadedStrings #-} {- | Module : Text.Pandoc.Readers.HTML.Parsing @@ -15,6 +16,7 @@ module Text.Pandoc.Readers.HTML.Parsing , pInTags , pInTags' , pInTag + , pInTagWithAttribs , pAny , pCloses , pSatisfy @@ -22,6 +24,7 @@ module Text.Pandoc.Readers.HTML.Parsing , matchTagClose , matchTagOpen , isSpace + , maybeFromAttrib , toAttr , toStringAttr ) @@ -31,11 +34,11 @@ import Control.Monad (guard, void, mzero) import Data.Maybe (fromMaybe) import Data.Text (Text) import Text.HTML.TagSoup - ( Tag (..), (~==), isTagText, isTagPosition, isTagOpen, isTagClose ) + ( Attribute, Tag (..), isTagText, isTagPosition, isTagOpen, isTagClose, (~==) ) import Text.Pandoc.Class.PandocMonad (PandocMonad (..)) import Text.Pandoc.Definition (Attr) import Text.Pandoc.Parsing - ( (<|>), eof, getPosition, lookAhead, manyTill, newPos, optional + ( (<|>), eof, getPosition, lookAhead, manyTill, newPos, option, optional , skipMany, setPosition, token, try) import Text.Pandoc.Readers.HTML.TagCategories import Text.Pandoc.Readers.HTML.Types @@ -60,25 +63,41 @@ pInTags' :: (PandocMonad m, Monoid a) -> TagParser m a -> TagParser m a pInTags' tagtype tagtest parser = try $ do - pSatisfy (\t -> t ~== TagOpen tagtype [] && tagtest t) + pSatisfy $ \t -> matchTagOpen tagtype [] t && tagtest t mconcat <$> manyTill parser (pCloses tagtype <|> eof) --- parses p, preceded by an opening tag (optional if tagsOptional) --- and followed by a closing tag (optional if tagsOptional) -pInTag :: PandocMonad m => TagOmission -> Text -> TagParser m a -> TagParser m a -pInTag tagOmission tagtype p = try $ do - skipMany pBlank +pInTag :: PandocMonad m + => TagOmission -- ^ Whether some tags can be omitted + -> Text -- ^ @tagtype@ Tag name + -> TagParser m a -- ^ @p@ Content parser + -> TagParser m a +pInTag tagOmission tagtype = fmap snd . pInTagWithAttribs tagOmission tagtype + +-- | Returns the contents of a tag together with its attributes; parses +-- @p@, preceded by an opening tag (optional if TagsOmittable) and +-- followed by a closing tag (optional unless TagsRequired). +pInTagWithAttribs :: PandocMonad m + => TagOmission -- ^ Whether some tags can be omitted + -> Text -- ^ @tagtype@ Tag name + -> TagParser m a -- ^ @p@ Content parser + -> TagParser m ([Attribute Text], a) +pInTagWithAttribs tagOmission tagtype p = try $ do let openingOptional = tagOmission == TagsOmittable let closingOptional = tagOmission /= TagsRequired - (if openingOptional then optional else void) $ - pSatisfy (matchTagOpen tagtype []) + skipMany pBlank + attribs <- (if openingOptional then option [] else id) + (getAttribs <$> pSatisfy (matchTagOpen tagtype [])) skipMany pBlank x <- p skipMany pBlank (if closingOptional then optional else void) $ pSatisfy (matchTagClose tagtype) skipMany pBlank - return x + return (attribs, x) + where + getAttribs = \case + TagOpen _ attribs -> attribs + _ -> [] pCloses :: PandocMonad m => Text -> TagParser m () pCloses tagtype = try $ do @@ -183,6 +202,12 @@ toStringAttr = map go -> (x',y) _ -> (x,y) +-- Unlike fromAttrib from tagsoup, this distinguishes +-- between a missing attribute and an attribute with empty content. +maybeFromAttrib :: Text -> Tag Text -> Maybe Text +maybeFromAttrib name (TagOpen _ attrs) = lookup name attrs +maybeFromAttrib _ _ = Nothing + mkAttr :: [(Text, Text)] -> Attr mkAttr attr = (attribsId, attribsClasses, attribsKV) where attribsId = fromMaybe "" $ lookup "id" attr diff --git a/src/Text/Pandoc/Readers/HTML/Table.hs b/src/Text/Pandoc/Readers/HTML/Table.hs index e6d0a9097..e40d90221 100644 --- a/src/Text/Pandoc/Readers/HTML/Table.hs +++ b/src/Text/Pandoc/Readers/HTML/Table.hs @@ -1,3 +1,4 @@ +{-# LANGUAGE LambdaCase #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE ViewPatterns #-} {- | @@ -12,17 +13,19 @@ HTML table parser. -} -module Text.Pandoc.Readers.HTML.Table (pTable') where +module Text.Pandoc.Readers.HTML.Table (pTable) where -import Control.Monad (guard) +import Control.Applicative ((<|>)) import Data.Maybe (fromMaybe) import Data.Text (Text) import Text.HTML.TagSoup import Text.Pandoc.Builder (Blocks) +import Text.Pandoc.CSS (cssAttributes) import Text.Pandoc.Definition import Text.Pandoc.Class.PandocMonad (PandocMonad (..)) import Text.Pandoc.Parsing - ( (<|>), eof, many, many1, manyTill, option, optional, skipMany, try) + ( eof, lookAhead, many, many1, manyTill, option, optional + , optionMaybe, skipMany, try) import Text.Pandoc.Readers.HTML.Parsing import Text.Pandoc.Readers.HTML.Types (TagParser) import Text.Pandoc.Shared (onlySimpleTableCells, safeRead) @@ -57,58 +60,183 @@ pColgroup = try $ do skipMany pBlank manyTill pCol (pCloses "colgroup" <|> eof) <* skipMany pBlank --- | Parses a simple HTML table -pTable' :: PandocMonad m - => TagParser m Blocks -- ^ Caption parser - -> (Text -> TagParser m [Cell]) -- ^ Table cell parser - -> TagParser m Blocks -pTable' block pCell = try $ do - TagOpen _ attribs' <- pSatisfy (matchTagOpen "table" []) - let attribs = toAttr attribs' +pCell :: PandocMonad m + => TagParser m Blocks + -> Text + -> TagParser m [Cell] +pCell block celltype = try $ do + skipMany pBlank + TagOpen _ attribs <- lookAhead $ pSatisfy (matchTagOpen celltype []) + let cssAttribs = maybe [] cssAttributes $ lookup "style" attribs + let align = case lookup "align" attribs <|> + lookup "text-align" cssAttribs of + Just "left" -> AlignLeft + Just "right" -> AlignRight + Just "center" -> AlignCenter + _ -> AlignDefault + let rowspan = RowSpan . fromMaybe 1 $ + safeRead =<< lookup "rowspan" attribs + let colspan = ColSpan . fromMaybe 1 $ + safeRead =<< lookup "colspan" attribs + res <- pInTags celltype block + skipMany pBlank + let handledAttribs = ["align", "colspan", "rowspan", "text-align"] + attribs' = foldr go [] attribs + go kv@(k, _) acc = case k of + "style" -> case filter ((/= "text-align") . fst) cssAttribs of + [] -> acc + cs -> ("style", toStyleString cs) : acc + -- drop attrib if it's already handled + _ | k `elem` handledAttribs -> acc + _ -> kv : acc + return [B.cellWith (toAttr attribs') align rowspan colspan res] + +-- | Create a style attribute string from a list of CSS attributes +toStyleString :: [(Text, Text)] -> Text +toStyleString = T.intercalate "; " . map (\(k, v) -> k <> ": " <> v) + +data RowType + = HeaderCells + | AllCells + +-- | Parses a table row +pRow :: PandocMonad m + => TagParser m Blocks + -> RowType + -> TagParser m [B.Row] +pRow block rowType = try $ do + skipMany pBlank + case rowType of + HeaderCells -> do + maybeCells <- optionMaybe (pInTags "tr" (pCell block "th")) + return $ case maybeCells of + Nothing -> [] + Just cells -> [Row nullAttr cells] + AllCells -> do + cells <- pInTags "tr" (pCell block "td" <|> pCell block "th") + return [Row nullAttr cells] + +-- | Parses a table head +pTableHead :: PandocMonad m + => TagParser m Blocks + -> TagParser m TableHead +pTableHead block = try $ do + skipMany pBlank + (attribs, rows) <- pInTagWithAttribs ClosingTagOptional "thead" + (option [] $ pRow block AllCells) + <|> pInTagWithAttribs TagsOmittable "thead" + (pRow block HeaderCells) + let cells = concatMap (\(Row _ cs) -> cs) rows + if null cells + then TableHead nullAttr <$> + pInTag TagsOmittable "tbody" (pRow block HeaderCells) + else return $ TableHead (toAttr attribs) [Row nullAttr cells] + +-- | Parses a table foot +pTableFoot :: PandocMonad m + => TagParser m Blocks + -> TagParser m TableFoot +pTableFoot block = try $ do + skipMany pBlank + TagOpen _ attribs <- pSatisfy (matchTagOpen "tfoot" []) <* skipMany pBlank + rows <- mconcat <$> many (pRow block AllCells <* skipMany pBlank) + optional $ pSatisfy (matchTagClose "tfoot") + return $ TableFoot (toAttr attribs) rows + +-- | Parses a table body +pTableBody :: PandocMonad m + => TagParser m Blocks + -> TagParser m TableBody +pTableBody block = do skipMany pBlank - caption <- option mempty $ pInTags "caption" block <* skipMany pBlank - widths' <- (mconcat <$> many1 pColgroup) <|> many pCol - let pTh = option [] $ pInTags "tr" (pCell "th") - pTr = try $ skipMany pBlank - *> pInTags "tr" (pCell "td" <|> pCell "th") - pTBody = pInTag TagsOmittable "tbody" $ many1 pTr - head'' <- pInTag ClosingTagOptional "thead" (option [] pTr) - <|> pInTag TagsOmittable "thead" pTh - head' <- pInTag TagsOmittable "tbody" - (if null head'' then pTh else return head'') - topfoot <- option [] $ pInTag TagsRequired "tfoot" $ many pTr - rowsLs <- many pTBody - bottomfoot <- option [] $ pInTag ClosingTagOptional "tfoot" $ many pTr + (attribs, rows) <- pInTagWithAttribs TagsOmittable "tbody" + (mconcat <$> many1 (pRow block AllCells)) + return $ TableBody (toAttr attribs) 0 [] rows + + +-- | Parses a simple HTML table +pTable :: PandocMonad m + => TagParser m Blocks -- ^ Caption and cell contents parser + -> TagParser m Blocks +pTable block = try $ do + TagOpen _ attribs <- pSatisfy (matchTagOpen "table" []) <* skipMany pBlank + caption <- option mempty $ pInTags "caption" block <* skipMany pBlank + widths <- ((mconcat <$> many1 pColgroup) <|> many pCol) <* skipMany pBlank + thead <- pTableHead block <* skipMany pBlank + topfoot <- optionMaybe (pTableFoot block) <* skipMany pBlank + tbodies <- many (pTableBody block) <* skipMany pBlank + botfoot <- optionMaybe (pTableFoot block) <* skipMany pBlank TagClose _ <- pSatisfy (matchTagClose "table") - let rows = concat rowsLs <> topfoot <> bottomfoot - rows''' = map (map cellContents) rows + let tfoot = fromMaybe (TableFoot nullAttr []) $ topfoot <|> botfoot + case normalize widths thead tbodies tfoot of + Left err -> fail err + Right (colspecs, thead', tbodies', tfoot') -> return $ + B.tableWith (toAttr attribs) + (B.simpleCaption caption) + colspecs + thead' + tbodies' + tfoot' +data TableType + = SimpleTable + | NormalTable + +tableType :: [[Cell]] -> TableType +tableType cells = + if onlySimpleTableCells $ map (map cellContents) cells + then SimpleTable + else NormalTable + where + cellContents :: Cell -> [Block] + cellContents (Cell _ _ _ _ bs) = bs + +normalize :: [ColWidth] -> TableHead -> [TableBody] -> TableFoot + -> Either String ([ColSpec], TableHead, [TableBody], TableFoot) +normalize widths head' bodies foot = do + let rows = headRows head' <> concatMap bodyRows bodies <> footRows foot + let rowLength = length . rowCells + let ncols = maximum (map rowLength rows) + let tblType = tableType (map rowCells rows) -- fail on empty table - guard $ not $ null head' && null rows''' - let isSimple = onlySimpleTableCells $ - map cellContents head' : rows''' - let cols = if null head' - then maximum (map length rows''') - else length head' - let aligns = case rows of - (cs:_) -> take cols $ - concatMap cellAligns cs ++ repeat AlignDefault - _ -> replicate cols AlignDefault - let widths = if null widths' - then if isSimple - then replicate cols ColWidthDefault - else replicate cols (ColWidth (1.0 / fromIntegral cols)) - else widths' - let toRow = Row nullAttr - toHeaderRow l = [toRow l | not (null l)] - return $ B.tableWith attribs - (B.simpleCaption caption) - (zip aligns widths) - (TableHead nullAttr $ toHeaderRow head') - [TableBody nullAttr 0 [] $ map toRow rows] - (TableFoot nullAttr []) - -cellContents :: Cell -> [Block] -cellContents (Cell _ _ _ _ bs) = bs - -cellAligns :: Cell -> [Alignment] -cellAligns (Cell _ align _ (ColSpan cs) _) = replicate cs align + if null rows + then Left "empty table" + else Right + ( zip (calculateAlignments ncols bodies) + (normalizeColWidths ncols tblType widths) + , head' + , bodies + , foot + ) + +normalizeColWidths :: Int -> TableType -> [ColWidth] -> [ColWidth] +normalizeColWidths ncols tblType = \case + [] -> case tblType of + SimpleTable -> replicate ncols ColWidthDefault + NormalTable -> replicate ncols (ColWidth $ 1 / fromIntegral ncols) + widths -> widths + +calculateAlignments :: Int -> [TableBody] -> [Alignment] +calculateAlignments cols tbodies = + case cells of + cs:_ -> take cols $ concatMap cellAligns cs ++ repeat AlignDefault + _ -> replicate cols AlignDefault + where + cells :: [[Cell]] + cells = concatMap bodyRowCells tbodies + cellAligns :: Cell -> [Alignment] + cellAligns (Cell _ align _ (ColSpan cs) _) = replicate cs align + +bodyRowCells :: TableBody -> [[Cell]] +bodyRowCells = map rowCells . bodyRows + +headRows :: TableHead -> [B.Row] +headRows (TableHead _ rows) = rows + +bodyRows :: TableBody -> [B.Row] +bodyRows (TableBody _ _ headerRows bodyRows') = headerRows <> bodyRows' + +footRows :: TableFoot -> [B.Row] +footRows (TableFoot _ rows) = rows + +rowCells :: B.Row -> [Cell] +rowCells (Row _ cells) = cells |