diff options
Diffstat (limited to 'src/Text')
| -rw-r--r-- | src/Text/Pandoc/Readers/HTML.hs | 98 | ||||
| -rw-r--r-- | src/Text/Pandoc/Readers/HTML/Parsing.hs | 26 | ||||
| -rw-r--r-- | src/Text/Pandoc/Readers/HTML/Table.hs | 111 | 
3 files changed, 140 insertions, 95 deletions
| diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs index 9e84bedab..e9fefb9c0 100644 --- a/src/Text/Pandoc/Readers/HTML.hs +++ b/src/Text/Pandoc/Readers/HTML.hs @@ -50,10 +50,10 @@ import Text.Pandoc.CSS (foldOrElse, pickStyleAttrProps)  import qualified Text.Pandoc.UTF8 as UTF8  import Text.Pandoc.Definition  import Text.Pandoc.Readers.HTML.Parsing +import Text.Pandoc.Readers.HTML.Table (pTable')  import Text.Pandoc.Readers.HTML.TagCategories  import Text.Pandoc.Readers.HTML.Types  import Text.Pandoc.Readers.LaTeX (rawLaTeXInline) -import Text.Pandoc.XML (html5Attributes, html4Attributes, rdfaAttributes)  import Text.Pandoc.Error  import Text.Pandoc.Logging  import Text.Pandoc.Options ( @@ -64,7 +64,7 @@ import Text.Pandoc.Options (  import Text.Pandoc.Parsing hiding ((<|>))  import Text.Pandoc.Shared (addMetaField, blocksToInlines', crFilter, escapeURI,                             extractSpaces, htmlSpanLikeElements, elemText, splitTextBy, -                           onlySimpleTableCells, safeRead, tshow) +                           safeRead, tshow)  import Text.Pandoc.Walk  import Text.Parsec.Error  import Text.TeXMath (readMathML, writeTeX) @@ -474,79 +474,7 @@ pHrule = do    return B.horizontalRule  pTable :: PandocMonad m => TagParser m Blocks -pTable = try $ do -  TagOpen _ attribs' <- pSatisfy (matchTagOpen "table" []) -  let attribs = toAttr attribs' -  skipMany pBlank -  caption <- option mempty $ pInTags "caption" inline <* skipMany pBlank -  widths' <- (mconcat <$> many1 pColgroup) <|> many pCol -  let pTh = option [] $ pInTags "tr" (pCell "th") -      pTr = try $ skipMany pBlank >> -                  pInTags "tr" (pCell "td" <|> pCell "th") -      pTBody = pInTag True "tbody" $ many1 pTr -  head'' <- pInTag False "thead" (option [] pTr) <|> pInTag True "thead" pTh -  head'  <- map snd <$> -             pInTag True "tbody" -               (if null head'' then pTh else return head'') -  topfoot <- option [] $ pInTag False "tfoot" $ many pTr -  rowsLs <- many pTBody -  bottomfoot <- option [] $ pInTag False "tfoot" $ many pTr -  TagClose _ <- pSatisfy (matchTagClose "table") -  let rows'' = concat rowsLs <> topfoot <> bottomfoot -  let rows''' = map (map snd) rows'' -  -- fail on empty table -  guard $ not $ null head' && null rows''' -  let isSimple = onlySimpleTableCells $ fmap B.toList <$> head':rows''' -  let cols = if null head' -                then maximum (map length rows''') -                else length head' -  -- add empty cells to short rows -  let addEmpties r = case cols - length r of -                           n | n > 0 -> r <> replicate n mempty -                             | otherwise -> r -  let rows = map addEmpties rows''' -  let aligns = case rows'' of -                    (cs:_) -> take cols $ map fst cs ++ repeat AlignDefault -                    _      -> replicate cols AlignDefault -  let widths = if null widths' -                  then if isSimple -                       then replicate cols ColWidthDefault -                       else replicate cols (ColWidth (1.0 / fromIntegral cols)) -                  else widths' -  let toRow = Row nullAttr . map B.simpleCell -      toHeaderRow l = [toRow l | not (null l)] -  return $ B.tableWith attribs -                   (B.simpleCaption $ B.plain caption) -                   (zip aligns widths) -                   (TableHead nullAttr $ toHeaderRow head') -                   [TableBody nullAttr 0 [] $ map toRow rows] -                   (TableFoot nullAttr []) - -pCol :: PandocMonad m => TagParser m ColWidth -pCol = try $ do -  TagOpen _ attribs' <- pSatisfy (matchTagOpen "col" []) -  let attribs = toStringAttr attribs' -  skipMany pBlank -  optional $ pSatisfy (matchTagClose "col") -  skipMany pBlank -  let width = case lookup "width" attribs of -                Nothing -> case lookup "style" attribs of -                  Just (T.stripPrefix "width:" -> Just xs) | T.any (== '%') xs -> -                    fromMaybe 0.0 $ safeRead (T.filter -                      (`notElem` (" \t\r\n%'\";" :: [Char])) xs) -                  _ -> 0.0 -                Just (T.unsnoc -> Just (xs, '%')) -> -                  fromMaybe 0.0 $ safeRead xs -                _ -> 0.0 -  if width > 0.0 -    then return $ ColWidth $ width / 100.0 -    else return ColWidthDefault - -pColgroup :: PandocMonad m => TagParser m [ColWidth] -pColgroup = try $ do -  pSatisfy (matchTagOpen "colgroup" []) -  skipMany pBlank -  manyTill pCol (pCloses "colgroup" <|> eof) <* skipMany pBlank +pTable = pTable' inline pCell  noColOrRowSpans :: Tag Text -> Bool  noColOrRowSpans t = isNullOrOne "colspan" && isNullOrOne "rowspan" @@ -847,16 +775,6 @@ pRawHtmlInline = do  mathMLToTeXMath :: Text -> Either Text Text  mathMLToTeXMath s = writeTeX <$> readMathML s -toStringAttr :: [(Text, Text)] -> [(Text, Text)] -toStringAttr = map go -  where -   go (x,y) = -     case T.stripPrefix "data-" x of -       Just x' | x' `Set.notMember` (html5Attributes <> -                                     html4Attributes <> rdfaAttributes) -         -> (x',y) -       _ -> (x,y) -  pScriptMath :: PandocMonad m => TagParser m Inlines  pScriptMath = try $ do    TagOpen _ attr' <- pSatisfy $ tagOpen (=="script") (const True) @@ -1151,16 +1069,6 @@ htmlTag f = try $ do           handleTag tagname         _ -> mzero -mkAttr :: [(Text, Text)] -> Attr -mkAttr attr = (attribsId, attribsClasses, attribsKV) -  where attribsId = fromMaybe "" $ lookup "id" attr -        attribsClasses = T.words (fromMaybe "" $ lookup "class" attr) <> epubTypes -        attribsKV = filter (\(k,_) -> k /= "class" && k /= "id") attr -        epubTypes = T.words $ fromMaybe "" $ lookup "epub:type" attr - -toAttr :: [(Text, Text)] -> Attr -toAttr = mkAttr . toStringAttr -  -- Strip namespace prefixes  stripPrefixes :: [Tag Text] -> [Tag Text]  stripPrefixes = map stripPrefix diff --git a/src/Text/Pandoc/Readers/HTML/Parsing.hs b/src/Text/Pandoc/Readers/HTML/Parsing.hs index 7fda066b5..e68e43b25 100644 --- a/src/Text/Pandoc/Readers/HTML/Parsing.hs +++ b/src/Text/Pandoc/Readers/HTML/Parsing.hs @@ -21,19 +21,25 @@ module Text.Pandoc.Readers.HTML.Parsing    , matchTagClose    , matchTagOpen    , isSpace +  , toAttr +  , toStringAttr    )  where  import Control.Monad (guard, void, mzero) +import Data.Maybe (fromMaybe)  import Data.Text (Text)  import Text.HTML.TagSoup +  ( Tag (..), (~==), isTagText, isTagPosition, isTagOpen, isTagClose )  import Text.Pandoc.Class.PandocMonad (PandocMonad (..)) +import Text.Pandoc.Definition (Attr)  import Text.Pandoc.Parsing    ( (<|>), eof, getPosition, lookAhead, manyTill, newPos, optional    , skipMany, setPosition, token, try)  import Text.Pandoc.Readers.HTML.TagCategories  import Text.Pandoc.Readers.HTML.Types  import Text.Pandoc.Shared (tshow) +import Text.Pandoc.XML (html5Attributes, html4Attributes, rdfaAttributes)  import qualified Data.Set as Set  import qualified Data.Text as T @@ -154,3 +160,23 @@ t1 `closes` t2 |     t2 `Set.notMember` blockTags &&     t2 `Set.notMember` eitherBlockOrInline = True  _ `closes` _ = False + +toStringAttr :: [(Text, Text)] -> [(Text, Text)] +toStringAttr = map go +  where +   go (x,y) = +     case T.stripPrefix "data-" x of +       Just x' | x' `Set.notMember` (html5Attributes <> +                                     html4Attributes <> rdfaAttributes) +         -> (x',y) +       _ -> (x,y) + +mkAttr :: [(Text, Text)] -> Attr +mkAttr attr = (attribsId, attribsClasses, attribsKV) +  where attribsId = fromMaybe "" $ lookup "id" attr +        attribsClasses = T.words (fromMaybe "" $ lookup "class" attr) <> epubTypes +        attribsKV = filter (\(k,_) -> k /= "class" && k /= "id") attr +        epubTypes = T.words $ fromMaybe "" $ lookup "epub:type" attr + +toAttr :: [(Text, Text)] -> Attr +toAttr = mkAttr . toStringAttr diff --git a/src/Text/Pandoc/Readers/HTML/Table.hs b/src/Text/Pandoc/Readers/HTML/Table.hs new file mode 100644 index 000000000..bad39bd2d --- /dev/null +++ b/src/Text/Pandoc/Readers/HTML/Table.hs @@ -0,0 +1,111 @@ +{-# LANGUAGE OverloadedStrings #-} +{-# LANGUAGE ViewPatterns #-} +{- | +   Module      : Text.Pandoc.Readers.HTML.Table +   Copyright   : © 2006-2020 John MacFarlane, +                   2020 Albert Krewinkel +   License     : GNU GPL, version 2 or above + +   Maintainer  : Albert Krewinkel <albert@zeitkraut.de> +   Stability   : alpha +   Portability : portable + +HTML table parser. +-} +module Text.Pandoc.Readers.HTML.Table (pTable') where + +import Control.Monad (guard) +import Data.Maybe (fromMaybe) +import Data.Text (Text) +import Text.HTML.TagSoup +import Text.Pandoc.Builder (Blocks, Inlines) +import Text.Pandoc.Definition +import Text.Pandoc.Class.PandocMonad (PandocMonad (..)) +import Text.Pandoc.Parsing +  ( (<|>), eof, many, many1, manyTill, option, optional, skipMany, try) +import Text.Pandoc.Readers.HTML.Parsing +import Text.Pandoc.Readers.HTML.Types (TagParser) +import Text.Pandoc.Shared (onlySimpleTableCells, safeRead) +import qualified Data.Text as T +import qualified Text.Pandoc.Builder as B + +-- | Parses a @<col>@ element, returning the column's width. Defaults to +-- @'ColWidthDefault'@ if the width is not set or cannot be determined. +pCol :: PandocMonad m => TagParser m ColWidth +pCol = try $ do +  TagOpen _ attribs' <- pSatisfy (matchTagOpen "col" []) +  let attribs = toStringAttr attribs' +  skipMany pBlank +  optional $ pSatisfy (matchTagClose "col") +  skipMany pBlank +  let width = case lookup "width" attribs of +                Nothing -> case lookup "style" attribs of +                  Just (T.stripPrefix "width:" -> Just xs) | T.any (== '%') xs -> +                    fromMaybe 0.0 $ safeRead (T.filter +                      (`notElem` (" \t\r\n%'\";" :: [Char])) xs) +                  _ -> 0.0 +                Just (T.unsnoc -> Just (xs, '%')) -> +                  fromMaybe 0.0 $ safeRead xs +                _ -> 0.0 +  if width > 0.0 +    then return $ ColWidth $ width / 100.0 +    else return ColWidthDefault + +pColgroup :: PandocMonad m => TagParser m [ColWidth] +pColgroup = try $ do +  pSatisfy (matchTagOpen "colgroup" []) +  skipMany pBlank +  manyTill pCol (pCloses "colgroup" <|> eof) <* skipMany pBlank + +-- | Parses a simple HTML table +pTable' :: PandocMonad m +        => TagParser m Inlines                         -- ^ Caption parser +        -> (Text -> TagParser m [(Alignment, Blocks)]) -- ^ Table cell parser +        -> TagParser m Blocks +pTable' inline pCell = try $ do +  TagOpen _ attribs' <- pSatisfy (matchTagOpen "table" []) +  let attribs = toAttr attribs' +  skipMany pBlank +  caption <- option mempty $ pInTags "caption" inline <* skipMany pBlank +  widths' <- (mconcat <$> many1 pColgroup) <|> many pCol +  let pTh = option [] $ pInTags "tr" (pCell "th") +      pTr = try $ skipMany pBlank >> +                  pInTags "tr" (pCell "td" <|> pCell "th") +      pTBody = pInTag True "tbody" $ many1 pTr +  head'' <- pInTag False "thead" (option [] pTr) <|> pInTag True "thead" pTh +  head'  <- map snd <$> +             pInTag True "tbody" +               (if null head'' then pTh else return head'') +  topfoot <- option [] $ pInTag False "tfoot" $ many pTr +  rowsLs <- many pTBody +  bottomfoot <- option [] $ pInTag False "tfoot" $ many pTr +  TagClose _ <- pSatisfy (matchTagClose "table") +  let rows'' = concat rowsLs <> topfoot <> bottomfoot +  let rows''' = map (map snd) rows'' +  -- fail on empty table +  guard $ not $ null head' && null rows''' +  let isSimple = onlySimpleTableCells $ fmap B.toList <$> head':rows''' +  let cols = if null head' +                then maximum (map length rows''') +                else length head' +  -- add empty cells to short rows +  let addEmpties r = case cols - length r of +                           n | n > 0 -> r <> replicate n mempty +                             | otherwise -> r +  let rows = map addEmpties rows''' +  let aligns = case rows'' of +                    (cs:_) -> take cols $ map fst cs ++ repeat AlignDefault +                    _      -> replicate cols AlignDefault +  let widths = if null widths' +                  then if isSimple +                       then replicate cols ColWidthDefault +                       else replicate cols (ColWidth (1.0 / fromIntegral cols)) +                  else widths' +  let toRow = Row nullAttr . map B.simpleCell +      toHeaderRow l = [toRow l | not (null l)] +  return $ B.tableWith attribs +                   (B.simpleCaption $ B.plain caption) +                   (zip aligns widths) +                   (TableHead nullAttr $ toHeaderRow head') +                   [TableBody nullAttr 0 [] $ map toRow rows] +                   (TableFoot nullAttr []) | 
