diff options
Diffstat (limited to 'src/Text')
| -rw-r--r-- | src/Text/Pandoc/CSS.hs | 37 | ||||
| -rw-r--r-- | src/Text/Pandoc/Readers/HTML.hs | 40 | ||||
| -rw-r--r-- | src/Text/Pandoc/Readers/HTML/Parsing.hs | 47 | ||||
| -rw-r--r-- | src/Text/Pandoc/Readers/HTML/Table.hs | 238 | 
4 files changed, 244 insertions, 118 deletions
| diff --git a/src/Text/Pandoc/CSS.hs b/src/Text/Pandoc/CSS.hs index 80251850b..d98c85147 100644 --- a/src/Text/Pandoc/CSS.hs +++ b/src/Text/Pandoc/CSS.hs @@ -11,41 +11,46 @@ Portability : portable  Tools for working with CSS.  -} -module Text.Pandoc.CSS ( pickStyleAttrProps -                       , pickStylesToKVs -                       ) +module Text.Pandoc.CSS +  ( cssAttributes +  , pickStyleAttrProps +  , pickStylesToKVs +  )  where -import qualified Data.Text as T  import Data.Maybe (mapMaybe, listToMaybe) +import Data.Text (Text, pack)  import Text.Pandoc.Shared (trim)  import Text.Parsec  import Text.Parsec.Text -ruleParser :: Parser (T.Text, T.Text) +ruleParser :: Parser (Text, Text)  ruleParser = do      p <- many1 (noneOf ":")  <* char ':'      v <- many1 (noneOf ":;") <* optional (char ';') <* spaces -    return (trim $ T.pack p, trim $ T.pack v) +    return (trim $ pack p, trim $ pack v) -styleAttrParser :: Parser [(T.Text, T.Text)] +styleAttrParser :: Parser [(Text, Text)]  styleAttrParser = many1 ruleParser -eitherToMaybe :: Either a b -> Maybe b -eitherToMaybe (Right x) = Just x -eitherToMaybe _         = Nothing +-- | Parses a style string, returning the CSS attributes. +-- Returns an empty list on failure. +cssAttributes :: Text -> [(Text, Text)] +cssAttributes styleString = +  -- Use Data.Either.fromRight once GHC 8.0 is no longer supported +  case parse styleAttrParser "" styleString of +    Left _  -> [] +    Right x -> x  -- | takes a list of keys/properties and a CSS string and  -- returns the corresponding key-value-pairs. -pickStylesToKVs :: [T.Text] -> T.Text -> [(T.Text, T.Text)] +pickStylesToKVs :: [Text] -> Text -> [(Text, Text)]  pickStylesToKVs props styleAttr = -  case parse styleAttrParser "" styleAttr of -    Left _       -> [] -    Right styles -> filter (\s -> fst s `elem` props) styles +  filter (\s -> fst s `elem` props) $ cssAttributes styleAttr  -- | takes a list of key/property synonyms and a CSS string and maybe  -- returns the value of the first match (in order of the supplied list) -pickStyleAttrProps :: [T.Text] -> T.Text -> Maybe T.Text +pickStyleAttrProps :: [Text] -> Text -> Maybe Text  pickStyleAttrProps lookupProps styleAttr = do -    styles <- eitherToMaybe $ parse styleAttrParser "" styleAttr +    styles <- either (const Nothing) Just $ parse styleAttrParser "" styleAttr      listToMaybe $ mapMaybe (`lookup` styles) lookupProps diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs index fa996d2f0..eb78979a3 100644 --- a/src/Text/Pandoc/Readers/HTML.hs +++ b/src/Text/Pandoc/Readers/HTML.hs @@ -26,7 +26,7 @@ module Text.Pandoc.Readers.HTML ( readHtml  import Control.Applicative ((<|>))  import Control.Arrow (first) -import Control.Monad (guard, mplus, msum, mzero, unless, void) +import Control.Monad (guard, msum, mzero, unless, void)  import Control.Monad.Except (throwError)  import Control.Monad.Reader (ask, asks, lift, local, runReaderT)  import Data.ByteString.Base64 (encode) @@ -50,7 +50,7 @@ import Text.Pandoc.CSS (pickStyleAttrProps)  import qualified Text.Pandoc.UTF8 as UTF8  import Text.Pandoc.Definition  import Text.Pandoc.Readers.HTML.Parsing -import Text.Pandoc.Readers.HTML.Table (pTable') +import Text.Pandoc.Readers.HTML.Table (pTable)  import Text.Pandoc.Readers.HTML.TagCategories  import Text.Pandoc.Readers.HTML.Types  import Text.Pandoc.Readers.LaTeX (rawLaTeXInline) @@ -63,8 +63,7 @@ import Text.Pandoc.Options (      extensionEnabled)  import Text.Pandoc.Parsing hiding ((<|>))  import Text.Pandoc.Shared (addMetaField, blocksToInlines', crFilter, escapeURI, -                           extractSpaces, htmlSpanLikeElements, elemText, splitTextBy, -                           safeRead, tshow) +                           extractSpaces, htmlSpanLikeElements, safeRead, tshow)  import Text.Pandoc.Walk  import Text.Parsec.Error  import Text.TeXMath (readMathML, writeTeX) @@ -159,7 +158,7 @@ block = do              , pCodeBlock              , pList              , pHrule -            , pTable +            , pTable block              , pHtml              , pHead              , pBody @@ -464,31 +463,6 @@ pHrule = do    pSelfClosing (=="hr") (const True)    return B.horizontalRule -pTable :: PandocMonad m => TagParser m Blocks -pTable = pTable' block pCell - -pCell :: PandocMonad m => Text -> TagParser m [Cell] -pCell celltype = try $ do -  skipMany pBlank -  tag <- lookAhead $ pSatisfy (\t -> t ~== TagOpen celltype []) -  let extractAlign' []                 = "" -      extractAlign' ("text-align":x:_) = x -      extractAlign' (_:xs)             = extractAlign' xs -  let extractAlign = extractAlign' . splitTextBy (`elemText` " \t;:") -  let align = case maybeFromAttrib "align" tag `mplus` -                   (extractAlign <$> maybeFromAttrib "style" tag) of -                   Just "left"   -> AlignLeft -                   Just "right"  -> AlignRight -                   Just "center" -> AlignCenter -                   _             -> AlignDefault -  let rowspan = RowSpan . fromMaybe 1 $ -                safeRead =<< maybeFromAttrib "rowspan" tag -  let colspan = ColSpan . fromMaybe 1 $ -                safeRead =<< maybeFromAttrib "colspan" tag -  res <- pInTags celltype block -  skipMany pBlank -  return [B.cell align rowspan colspan res] -  pBlockQuote :: PandocMonad m => TagParser m Blocks  pBlockQuote = do    contents <- pInTags "blockquote" block @@ -653,12 +627,6 @@ pLineBreak = do    pSelfClosing (=="br") (const True)    return B.linebreak --- Unlike fromAttrib from tagsoup, this distinguishes --- between a missing attribute and an attribute with empty content. -maybeFromAttrib :: Text -> Tag Text -> Maybe Text -maybeFromAttrib name (TagOpen _ attrs) = lookup name attrs -maybeFromAttrib _ _ = Nothing -  pLink :: PandocMonad m => TagParser m Inlines  pLink = try $ do    tag <- pSatisfy $ tagOpenLit "a" (const True) diff --git a/src/Text/Pandoc/Readers/HTML/Parsing.hs b/src/Text/Pandoc/Readers/HTML/Parsing.hs index 8788a933e..2d58319da 100644 --- a/src/Text/Pandoc/Readers/HTML/Parsing.hs +++ b/src/Text/Pandoc/Readers/HTML/Parsing.hs @@ -1,3 +1,4 @@ +{-# LANGUAGE LambdaCase            #-}  {-# LANGUAGE OverloadedStrings     #-}  {- |     Module      : Text.Pandoc.Readers.HTML.Parsing @@ -15,6 +16,7 @@ module Text.Pandoc.Readers.HTML.Parsing    , pInTags    , pInTags'    , pInTag +  , pInTagWithAttribs    , pAny    , pCloses    , pSatisfy @@ -22,6 +24,7 @@ module Text.Pandoc.Readers.HTML.Parsing    , matchTagClose    , matchTagOpen    , isSpace +  , maybeFromAttrib    , toAttr    , toStringAttr    ) @@ -31,11 +34,11 @@ import Control.Monad (guard, void, mzero)  import Data.Maybe (fromMaybe)  import Data.Text (Text)  import Text.HTML.TagSoup -  ( Tag (..), (~==), isTagText, isTagPosition, isTagOpen, isTagClose ) +  ( Attribute, Tag (..), isTagText, isTagPosition, isTagOpen, isTagClose, (~==) )  import Text.Pandoc.Class.PandocMonad (PandocMonad (..))  import Text.Pandoc.Definition (Attr)  import Text.Pandoc.Parsing -  ( (<|>), eof, getPosition, lookAhead, manyTill, newPos, optional +  ( (<|>), eof, getPosition, lookAhead, manyTill, newPos, option, optional    , skipMany, setPosition, token, try)  import Text.Pandoc.Readers.HTML.TagCategories  import Text.Pandoc.Readers.HTML.Types @@ -60,25 +63,41 @@ pInTags' :: (PandocMonad m, Monoid a)           -> TagParser m a           -> TagParser m a  pInTags' tagtype tagtest parser = try $ do -  pSatisfy (\t -> t ~== TagOpen tagtype [] && tagtest t) +  pSatisfy $ \t -> matchTagOpen tagtype [] t && tagtest t    mconcat <$> manyTill parser (pCloses tagtype <|> eof) --- parses p, preceded by an opening tag (optional if tagsOptional) --- and followed by a closing tag (optional if tagsOptional) -pInTag :: PandocMonad m => TagOmission -> Text -> TagParser m a -> TagParser m a -pInTag tagOmission tagtype p = try $ do -  skipMany pBlank +pInTag :: PandocMonad m +       => TagOmission    -- ^ Whether some tags can be omitted +       -> Text           -- ^ @tagtype@ Tag name +       -> TagParser m a  -- ^ @p@ Content parser +       -> TagParser m a +pInTag tagOmission tagtype = fmap snd . pInTagWithAttribs tagOmission tagtype + +-- | Returns the contents of a tag together with its attributes; parses +-- @p@, preceded by an opening tag (optional if TagsOmittable) and +-- followed by a closing tag (optional unless TagsRequired). +pInTagWithAttribs :: PandocMonad m +                  => TagOmission    -- ^ Whether some tags can be omitted +                  -> Text           -- ^ @tagtype@ Tag name +                  -> TagParser m a  -- ^ @p@ Content parser +                  -> TagParser m ([Attribute Text], a) +pInTagWithAttribs tagOmission tagtype p = try $ do    let openingOptional = tagOmission == TagsOmittable    let closingOptional = tagOmission /= TagsRequired -  (if openingOptional then optional else void) $ -    pSatisfy (matchTagOpen tagtype []) +  skipMany pBlank +  attribs <- (if openingOptional then option [] else id) +             (getAttribs <$> pSatisfy (matchTagOpen tagtype []))    skipMany pBlank    x <- p    skipMany pBlank    (if closingOptional then optional else void) $      pSatisfy (matchTagClose tagtype)    skipMany pBlank -  return x +  return (attribs, x) +  where +    getAttribs = \case +      TagOpen _ attribs -> attribs +      _                 -> []  pCloses :: PandocMonad m => Text -> TagParser m ()  pCloses tagtype = try $ do @@ -183,6 +202,12 @@ toStringAttr = map go           -> (x',y)         _ -> (x,y) +-- Unlike fromAttrib from tagsoup, this distinguishes +-- between a missing attribute and an attribute with empty content. +maybeFromAttrib :: Text -> Tag Text -> Maybe Text +maybeFromAttrib name (TagOpen _ attrs) = lookup name attrs +maybeFromAttrib _ _ = Nothing +  mkAttr :: [(Text, Text)] -> Attr  mkAttr attr = (attribsId, attribsClasses, attribsKV)    where attribsId = fromMaybe "" $ lookup "id" attr diff --git a/src/Text/Pandoc/Readers/HTML/Table.hs b/src/Text/Pandoc/Readers/HTML/Table.hs index e6d0a9097..e40d90221 100644 --- a/src/Text/Pandoc/Readers/HTML/Table.hs +++ b/src/Text/Pandoc/Readers/HTML/Table.hs @@ -1,3 +1,4 @@ +{-# LANGUAGE LambdaCase #-}  {-# LANGUAGE OverloadedStrings #-}  {-# LANGUAGE ViewPatterns #-}  {- | @@ -12,17 +13,19 @@  HTML table parser.  -} -module Text.Pandoc.Readers.HTML.Table (pTable') where +module Text.Pandoc.Readers.HTML.Table (pTable) where -import Control.Monad (guard) +import Control.Applicative ((<|>))  import Data.Maybe (fromMaybe)  import Data.Text (Text)  import Text.HTML.TagSoup  import Text.Pandoc.Builder (Blocks) +import Text.Pandoc.CSS (cssAttributes)  import Text.Pandoc.Definition  import Text.Pandoc.Class.PandocMonad (PandocMonad (..))  import Text.Pandoc.Parsing -  ( (<|>), eof, many, many1, manyTill, option, optional, skipMany, try) +  ( eof, lookAhead, many, many1, manyTill, option, optional +  , optionMaybe, skipMany, try)  import Text.Pandoc.Readers.HTML.Parsing  import Text.Pandoc.Readers.HTML.Types (TagParser)  import Text.Pandoc.Shared (onlySimpleTableCells, safeRead) @@ -57,58 +60,183 @@ pColgroup = try $ do    skipMany pBlank    manyTill pCol (pCloses "colgroup" <|> eof) <* skipMany pBlank --- | Parses a simple HTML table -pTable' :: PandocMonad m -        => TagParser m Blocks           -- ^ Caption parser -        -> (Text -> TagParser m [Cell]) -- ^ Table cell parser -        -> TagParser m Blocks -pTable' block pCell = try $ do -  TagOpen _ attribs' <- pSatisfy (matchTagOpen "table" []) -  let attribs = toAttr attribs' +pCell :: PandocMonad m +      => TagParser m Blocks +      -> Text +      -> TagParser m [Cell] +pCell block celltype = try $ do +  skipMany pBlank +  TagOpen _ attribs <- lookAhead $ pSatisfy (matchTagOpen celltype []) +  let cssAttribs = maybe [] cssAttributes $ lookup "style" attribs +  let align = case lookup "align" attribs <|> +                   lookup "text-align" cssAttribs of +                Just "left"   -> AlignLeft +                Just "right"  -> AlignRight +                Just "center" -> AlignCenter +                _             -> AlignDefault +  let rowspan = RowSpan . fromMaybe 1 $ +                safeRead =<< lookup "rowspan" attribs +  let colspan = ColSpan . fromMaybe 1 $ +                safeRead =<< lookup "colspan" attribs +  res <- pInTags celltype block +  skipMany pBlank +  let handledAttribs = ["align", "colspan", "rowspan", "text-align"] +      attribs' = foldr go [] attribs +      go kv@(k, _) acc = case k of +        "style" -> case filter ((/= "text-align") . fst) cssAttribs of +                     [] -> acc +                     cs -> ("style", toStyleString cs) : acc +        -- drop attrib if it's already handled +        _ | k `elem` handledAttribs -> acc +        _ -> kv : acc +  return [B.cellWith (toAttr attribs') align rowspan colspan res] + +-- | Create a style attribute string from a list of CSS attributes +toStyleString :: [(Text, Text)] -> Text +toStyleString = T.intercalate "; " . map (\(k, v) -> k <> ": " <> v) + +data RowType +  = HeaderCells +  | AllCells + +-- | Parses a table row +pRow :: PandocMonad m +     => TagParser m Blocks +     -> RowType +     -> TagParser m [B.Row] +pRow block rowType = try $ do +  skipMany pBlank +  case rowType of +    HeaderCells -> do +      maybeCells <- optionMaybe (pInTags "tr" (pCell block "th")) +      return $ case maybeCells of +        Nothing    -> [] +        Just cells -> [Row nullAttr cells] +    AllCells    -> do +      cells <- pInTags "tr" (pCell block "td" <|> pCell block "th") +      return [Row nullAttr cells] + +-- | Parses a table head +pTableHead :: PandocMonad m +           => TagParser m Blocks +           -> TagParser m TableHead +pTableHead block = try $ do +  skipMany pBlank +  (attribs, rows) <-  pInTagWithAttribs ClosingTagOptional "thead" +                                        (option [] $ pRow block AllCells) +                  <|> pInTagWithAttribs TagsOmittable "thead" +                                        (pRow block HeaderCells) +  let cells = concatMap (\(Row _ cs) -> cs) rows +  if null cells +    then TableHead nullAttr <$> +         pInTag TagsOmittable "tbody" (pRow block HeaderCells) +    else return $ TableHead (toAttr attribs) [Row nullAttr cells] + +-- | Parses a table foot +pTableFoot :: PandocMonad m +           => TagParser m Blocks +           -> TagParser m TableFoot +pTableFoot block = try $ do +  skipMany pBlank +  TagOpen _ attribs <- pSatisfy (matchTagOpen "tfoot" []) <* skipMany pBlank +  rows <- mconcat <$> many (pRow block AllCells <* skipMany pBlank) +  optional $ pSatisfy (matchTagClose "tfoot") +  return $ TableFoot (toAttr attribs) rows + +-- | Parses a table body +pTableBody :: PandocMonad m +           => TagParser m Blocks +           -> TagParser m TableBody +pTableBody block = do    skipMany pBlank -  caption <- option mempty $ pInTags "caption" block <* skipMany pBlank -  widths' <- (mconcat <$> many1 pColgroup) <|> many pCol -  let pTh = option [] $ pInTags "tr" (pCell "th") -      pTr = try $ skipMany pBlank -               *> pInTags "tr" (pCell "td" <|> pCell "th") -      pTBody = pInTag TagsOmittable "tbody" $ many1 pTr -  head'' <- pInTag ClosingTagOptional "thead" (option [] pTr) -        <|> pInTag TagsOmittable "thead" pTh -  head'  <- pInTag TagsOmittable "tbody" -               (if null head'' then pTh else return head'') -  topfoot <- option [] $ pInTag TagsRequired "tfoot" $ many pTr -  rowsLs <- many pTBody -  bottomfoot <- option [] $ pInTag ClosingTagOptional "tfoot" $ many pTr +  (attribs, rows) <- pInTagWithAttribs TagsOmittable "tbody" +                     (mconcat <$> many1 (pRow block AllCells)) +  return $ TableBody (toAttr attribs) 0 [] rows + + +-- | Parses a simple HTML table +pTable :: PandocMonad m +       => TagParser m Blocks -- ^ Caption and cell contents parser +       -> TagParser m Blocks +pTable block = try $ do +  TagOpen _ attribs <- pSatisfy (matchTagOpen "table" [])  <* skipMany pBlank +  caption <- option mempty $ pInTags "caption" block       <* skipMany pBlank +  widths  <- ((mconcat <$> many1 pColgroup) <|> many pCol) <* skipMany pBlank +  thead   <- pTableHead block               <* skipMany pBlank +  topfoot <- optionMaybe (pTableFoot block) <* skipMany pBlank +  tbodies <- many (pTableBody block)        <* skipMany pBlank +  botfoot <- optionMaybe (pTableFoot block) <* skipMany pBlank    TagClose _ <- pSatisfy (matchTagClose "table") -  let rows = concat rowsLs <> topfoot <> bottomfoot -      rows''' = map (map cellContents) rows +  let tfoot = fromMaybe (TableFoot nullAttr []) $ topfoot <|> botfoot +  case normalize widths thead tbodies tfoot of +    Left err -> fail err +    Right (colspecs, thead', tbodies', tfoot') -> return $ +      B.tableWith (toAttr attribs) +                  (B.simpleCaption caption) +                  colspecs +                  thead' +                  tbodies' +                  tfoot' +data TableType +  = SimpleTable +  | NormalTable + +tableType :: [[Cell]] -> TableType +tableType cells = +  if onlySimpleTableCells $ map (map cellContents) cells +  then SimpleTable +  else NormalTable +  where +    cellContents :: Cell -> [Block] +    cellContents (Cell _ _ _ _ bs) = bs + +normalize :: [ColWidth] -> TableHead -> [TableBody] -> TableFoot +          -> Either String ([ColSpec], TableHead, [TableBody], TableFoot) +normalize widths head' bodies foot = do +  let rows = headRows head' <> concatMap bodyRows bodies <> footRows foot +  let rowLength = length . rowCells +  let ncols = maximum (map rowLength rows) +  let tblType = tableType (map rowCells rows)    -- fail on empty table -  guard $ not $ null head' && null rows''' -  let isSimple = onlySimpleTableCells $ -                 map cellContents head' : rows''' -  let cols = if null head' -                then maximum (map length rows''') -                else length head' -  let aligns = case rows of -                 (cs:_) -> take cols $ -                           concatMap cellAligns cs ++ repeat AlignDefault -                 _      -> replicate cols AlignDefault -  let widths = if null widths' -                  then if isSimple -                       then replicate cols ColWidthDefault -                       else replicate cols (ColWidth (1.0 / fromIntegral cols)) -                  else widths' -  let toRow = Row nullAttr -      toHeaderRow l = [toRow l | not (null l)] -  return $ B.tableWith attribs -                   (B.simpleCaption caption) -                   (zip aligns widths) -                   (TableHead nullAttr $ toHeaderRow head') -                   [TableBody nullAttr 0 [] $ map toRow rows] -                   (TableFoot nullAttr []) - -cellContents :: Cell -> [Block] -cellContents (Cell _ _ _ _ bs) = bs - -cellAligns :: Cell -> [Alignment] -cellAligns (Cell _ align _ (ColSpan cs) _) = replicate cs align +  if null rows +    then Left "empty table" +    else Right +         ( zip (calculateAlignments ncols bodies) +               (normalizeColWidths ncols tblType widths) +         , head' +         , bodies +         , foot +         ) + +normalizeColWidths :: Int -> TableType -> [ColWidth] -> [ColWidth] +normalizeColWidths ncols tblType = \case +  [] -> case tblType of +          SimpleTable -> replicate ncols ColWidthDefault +          NormalTable -> replicate ncols (ColWidth $ 1 / fromIntegral ncols) +  widths -> widths + +calculateAlignments :: Int -> [TableBody] -> [Alignment] +calculateAlignments cols tbodies = +  case cells of +    cs:_ -> take cols $ concatMap cellAligns cs ++ repeat AlignDefault +    _    -> replicate cols AlignDefault +  where +    cells :: [[Cell]] +    cells = concatMap bodyRowCells tbodies +    cellAligns :: Cell -> [Alignment] +    cellAligns (Cell _ align _ (ColSpan cs) _) = replicate cs align + +bodyRowCells :: TableBody -> [[Cell]] +bodyRowCells = map rowCells . bodyRows + +headRows :: TableHead -> [B.Row] +headRows (TableHead _ rows) = rows + +bodyRows :: TableBody -> [B.Row] +bodyRows (TableBody _ _ headerRows bodyRows') = headerRows <> bodyRows' + +footRows :: TableFoot -> [B.Row] +footRows (TableFoot _ rows) = rows + +rowCells :: B.Row -> [Cell] +rowCells (Row _ cells) = cells | 
