From f9258371dd20e0a9569c04923188a91f6c2e489e Mon Sep 17 00:00:00 2001 From: Albert Krewinkel Date: Sun, 22 Nov 2020 22:30:47 +0100 Subject: HTML reader: extract submodules Reducing module size should reduce memory use during compilation. This is preparatory work to tackle support for more table features. --- src/Text/Pandoc/Readers/HTML/Parsing.hs | 156 ++++++++++++++++++++++++++ src/Text/Pandoc/Readers/HTML/TagCategories.hs | 78 +++++++++++++ src/Text/Pandoc/Readers/HTML/Types.hs | 97 ++++++++++++++++ 3 files changed, 331 insertions(+) create mode 100644 src/Text/Pandoc/Readers/HTML/Parsing.hs create mode 100644 src/Text/Pandoc/Readers/HTML/TagCategories.hs create mode 100644 src/Text/Pandoc/Readers/HTML/Types.hs (limited to 'src/Text/Pandoc/Readers/HTML') diff --git a/src/Text/Pandoc/Readers/HTML/Parsing.hs b/src/Text/Pandoc/Readers/HTML/Parsing.hs new file mode 100644 index 000000000..7fda066b5 --- /dev/null +++ b/src/Text/Pandoc/Readers/HTML/Parsing.hs @@ -0,0 +1,156 @@ +{-# LANGUAGE OverloadedStrings #-} +{- | + Module : Text.Pandoc.Readers.HTML.Parsing + Copyright : Copyright (C) 2006-2020 John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : John MacFarlane + Stability : alpha + Portability : portable + +Parsing functions and utilities. +-} +module Text.Pandoc.Readers.HTML.Parsing + ( pInTags + , pInTags' + , pInTag + , pAny + , pCloses + , pSatisfy + , pBlank + , matchTagClose + , matchTagOpen + , isSpace + ) +where + +import Control.Monad (guard, void, mzero) +import Data.Text (Text) +import Text.HTML.TagSoup +import Text.Pandoc.Class.PandocMonad (PandocMonad (..)) +import Text.Pandoc.Parsing + ( (<|>), eof, getPosition, lookAhead, manyTill, newPos, optional + , skipMany, setPosition, token, try) +import Text.Pandoc.Readers.HTML.TagCategories +import Text.Pandoc.Readers.HTML.Types +import Text.Pandoc.Shared (tshow) +import qualified Data.Set as Set +import qualified Data.Text as T + +pInTags :: (PandocMonad m, Monoid a) => Text -> TagParser m a -> TagParser m a +pInTags tagtype parser = pInTags' tagtype (const True) parser + +pInTags' :: (PandocMonad m, Monoid a) + => Text + -> (Tag Text -> Bool) + -> TagParser m a + -> TagParser m a +pInTags' tagtype tagtest parser = try $ do + pSatisfy (\t -> t ~== TagOpen tagtype [] && tagtest t) + mconcat <$> manyTill parser (pCloses tagtype <|> eof) + +-- parses p, preceded by an opening tag (optional if tagsOptional) +-- and followed by a closing tag (optional if tagsOptional) +pInTag :: PandocMonad m => Bool -> Text -> TagParser m a -> TagParser m a +pInTag tagsOptional tagtype p = try $ do + skipMany pBlank + (if tagsOptional then optional else void) $ pSatisfy (matchTagOpen tagtype []) + skipMany pBlank + x <- p + skipMany pBlank + (if tagsOptional then optional else void) $ pSatisfy (matchTagClose tagtype) + skipMany pBlank + return x + +pCloses :: PandocMonad m => Text -> TagParser m () +pCloses tagtype = try $ do + t <- lookAhead $ pSatisfy $ \tag -> isTagClose tag || isTagOpen tag + case t of + (TagClose t') | t' == tagtype -> void pAny + (TagOpen t' _) | t' `closes` tagtype -> return () + (TagClose "ul") | tagtype == "li" -> return () + (TagClose "ol") | tagtype == "li" -> return () + (TagClose "dl") | tagtype == "dd" -> return () + (TagClose "table") | tagtype == "td" -> return () + (TagClose "table") | tagtype == "th" -> return () + (TagClose "table") | tagtype == "tr" -> return () + (TagClose "td") | tagtype `Set.member` blockHtmlTags -> return () + (TagClose "th") | tagtype `Set.member` blockHtmlTags -> return () + (TagClose t') | tagtype == "p" && t' `Set.member` blockHtmlTags + -> return () -- see #3794 + _ -> mzero + +pBlank :: PandocMonad m => TagParser m () +pBlank = try $ do + (TagText str) <- pSatisfy isTagText + guard $ T.all isSpace str + +pLocation :: PandocMonad m => TagParser m () +pLocation = do + (TagPosition r c) <- pSat isTagPosition + setPosition $ newPos "input" r c + +pSat :: PandocMonad m => (Tag Text -> Bool) -> TagParser m (Tag Text) +pSat f = do + pos <- getPosition + token tshow (const pos) (\x -> if f x then Just x else Nothing) + +pSatisfy :: PandocMonad m => (Tag Text -> Bool) -> TagParser m (Tag Text) +pSatisfy f = try $ optional pLocation >> pSat f + +matchTagClose :: Text -> (Tag Text -> Bool) +matchTagClose t = (~== TagClose t) + +matchTagOpen :: Text -> [(Text, Text)] -> (Tag Text -> Bool) +matchTagOpen t as = (~== TagOpen t as) + +pAny :: PandocMonad m => TagParser m (Tag Text) +pAny = pSatisfy (const True) + +isSpace :: Char -> Bool +isSpace ' ' = True +isSpace '\t' = True +isSpace '\n' = True +isSpace '\r' = True +isSpace _ = False + +-- taken from HXT and extended +-- See http://www.w3.org/TR/html5/syntax.html sec 8.1.2.4 optional tags +closes :: Text -> Text -> Bool +_ `closes` "body" = False +_ `closes` "html" = False +"body" `closes` "head" = True +"a" `closes` "a" = True +"li" `closes` "li" = True +"th" `closes` t | t `elem` ["th","td"] = True +"td" `closes` t | t `elem` ["th","td"] = True +"tr" `closes` t | t `elem` ["th","td","tr"] = True +"dd" `closes` t | t `elem` ["dt", "dd"] = True +"dt" `closes` t | t `elem` ["dt","dd"] = True +"rt" `closes` t | t `elem` ["rb", "rt", "rtc"] = True +"optgroup" `closes` "optgroup" = True +"optgroup" `closes` "option" = True +"option" `closes` "option" = True +-- https://html.spec.whatwg.org/multipage/syntax.html#optional-tags +x `closes` "p" | x `elem` ["address", "article", "aside", "blockquote", + "dir", "div", "dl", "fieldset", "footer", "form", "h1", "h2", "h3", "h4", + "h5", "h6", "header", "hr", "main", "menu", "nav", "ol", "p", "pre", "section", + "table", "ul"] = True +_ `closes` "meta" = True +"form" `closes` "form" = True +"label" `closes` "label" = True +"map" `closes` "map" = True +"object" `closes` "object" = True +_ `closes` t | t `elem` ["option","style","script","textarea","title"] = True +t `closes` "select" | t /= "option" = True +"thead" `closes` "colgroup" = True +"tfoot" `closes` t | t `elem` ["thead","colgroup"] = True +"tbody" `closes` t | t `elem` ["tbody","tfoot","thead","colgroup"] = True +t `closes` t2 | + t `elem` ["h1","h2","h3","h4","h5","h6","dl","ol","ul","table","div","main","p"] && + t2 `elem` ["h1","h2","h3","h4","h5","h6","p" ] = True -- not "div" or "main" +t1 `closes` t2 | + t1 `Set.member` blockTags && + t2 `Set.notMember` blockTags && + t2 `Set.notMember` eitherBlockOrInline = True +_ `closes` _ = False diff --git a/src/Text/Pandoc/Readers/HTML/TagCategories.hs b/src/Text/Pandoc/Readers/HTML/TagCategories.hs new file mode 100644 index 000000000..4f82a1831 --- /dev/null +++ b/src/Text/Pandoc/Readers/HTML/TagCategories.hs @@ -0,0 +1,78 @@ +{-# LANGUAGE OverloadedStrings #-} +{- | + Module : Text.Pandoc.Readers.HTML.TagCategories + Copyright : Copyright (C) 2006-2020 John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : John MacFarlane + Stability : alpha + Portability : portable + +Categories of tags. +-} +module Text.Pandoc.Readers.HTML.TagCategories + ( blockHtmlTags + , blockDocBookTags + , eitherBlockOrInline + , epubTags + , blockTags + , sectioningContent + , groupingContent + ) +where + +import Data.Set (Set, fromList, unions) +import Data.Text (Text) + +eitherBlockOrInline :: Set Text +eitherBlockOrInline = fromList + ["audio", "applet", "button", "iframe", "embed", + "del", "ins", "progress", "map", "area", "noscript", "script", + "object", "svg", "video", "source"] + +blockHtmlTags :: Set Text +blockHtmlTags = fromList + ["?xml", "!DOCTYPE", "address", "article", "aside", + "blockquote", "body", "canvas", + "caption", "center", "col", "colgroup", "dd", "details", + "dir", "div", "dl", "dt", "fieldset", "figcaption", "figure", + "footer", "form", "h1", "h2", "h3", "h4", + "h5", "h6", "head", "header", "hgroup", "hr", "html", + "isindex", "main", "menu", "meta", "noframes", "nav", + "ol", "output", "p", "pre", + "section", "summary", "table", "tbody", "textarea", + "thead", "tfoot", "ul", "dd", + "dt", "frameset", "li", "tbody", "td", "tfoot", + "th", "thead", "tr", "script", "style"] + +-- We want to allow raw docbook in markdown documents, so we +-- include docbook block tags here too. +blockDocBookTags :: Set Text +blockDocBookTags = fromList + ["calloutlist", "bibliolist", "glosslist", "itemizedlist", + "orderedlist", "segmentedlist", "simplelist", + "variablelist", "caution", "important", "note", "tip", + "warning", "address", "literallayout", "programlisting", + "programlistingco", "screen", "screenco", "screenshot", + "synopsis", "example", "informalexample", "figure", + "informalfigure", "table", "informaltable", "para", + "simpara", "formalpara", "equation", "informalequation", + "figure", "screenshot", "mediaobject", "qandaset", + "procedure", "task", "cmdsynopsis", "funcsynopsis", + "classsynopsis", "blockquote", "epigraph", "msgset", + "sidebar", "title"] + +epubTags :: Set Text +epubTags = fromList ["case", "switch", "default"] + +blockTags :: Set Text +blockTags = unions [blockHtmlTags, blockDocBookTags, epubTags] + +sectioningContent :: [Text] +sectioningContent = ["article", "aside", "nav", "section"] + + +groupingContent :: [Text] +groupingContent = ["p", "hr", "pre", "blockquote", "ol" + , "ul", "li", "dl", "dt", "dt", "dd" + , "figure", "figcaption", "div", "main"] diff --git a/src/Text/Pandoc/Readers/HTML/Types.hs b/src/Text/Pandoc/Readers/HTML/Types.hs new file mode 100644 index 000000000..a94eeb828 --- /dev/null +++ b/src/Text/Pandoc/Readers/HTML/Types.hs @@ -0,0 +1,97 @@ +{-# LANGUAGE FlexibleInstances #-} +{-# LANGUAGE MultiParamTypeClasses #-} +{- | + Module : Text.Pandoc.Readers.HTML.Types + Copyright : Copyright (C) 2006-2020 John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : John MacFarlane + Stability : alpha + Portability : portable + +Types for pandoc's HTML reader. +-} +module Text.Pandoc.Readers.HTML.Types + ( TagParser + , HTMLParser + , HTMLState (..) + , HTMLLocal (..) + ) +where + +import Control.Monad.Reader (ReaderT, asks, local) +import Data.Default (Default (def)) +import Data.Map (Map) +import Data.Set (Set) +import Data.Text (Text) +import Network.URI (URI) +import Text.HTML.TagSoup +import Text.Pandoc.Builder (Blocks, HasMeta (..)) +import Text.Pandoc.Class.PandocMonad (PandocMonad (..)) +import Text.Pandoc.Logging (LogMessage) +import Text.Pandoc.Options (ReaderOptions) +import Text.Pandoc.Parsing + ( HasIdentifierList (..), HasLastStrPosition (..), HasLogMessages (..) + , HasMacros (..), HasQuoteContext (..), HasReaderOptions (..) + , ParserT, ParserState, QuoteContext (NoQuote) + ) +import Text.Pandoc.Readers.LaTeX.Types (Macro) + +-- | HTML parser type +type HTMLParser m s = ParserT s HTMLState (ReaderT HTMLLocal m) + +-- | HTML parser, expecting @Tag Text@ as tokens. +type TagParser m = HTMLParser m [Tag Text] + +-- | Global HTML parser state +data HTMLState = HTMLState + { parserState :: ParserState + , noteTable :: [(Text, Blocks)] + , baseHref :: Maybe URI + , identifiers :: Set Text + , logMessages :: [LogMessage] + , macros :: Map Text Macro + , readerOpts :: ReaderOptions + } + +-- | Local HTML parser state +data HTMLLocal = HTMLLocal + { quoteContext :: QuoteContext + , inChapter :: Bool -- ^ Set if in chapter section + , inPlain :: Bool -- ^ Set if in pPlain + } + + +-- Instances + +instance HasMacros HTMLState where + extractMacros = macros + updateMacros f st = st{ macros = f $ macros st } + +instance HasIdentifierList HTMLState where + extractIdentifierList = identifiers + updateIdentifierList f s = s{ identifiers = f (identifiers s) } + +instance HasLogMessages HTMLState where + addLogMessage m s = s{ logMessages = m : logMessages s } + getLogMessages = reverse . logMessages + +-- This signature should be more general +-- MonadReader HTMLLocal m => HasQuoteContext st m +instance PandocMonad m => HasQuoteContext HTMLState (ReaderT HTMLLocal m) where + getQuoteContext = asks quoteContext + withQuoteContext q = local (\s -> s{quoteContext = q}) + +instance HasReaderOptions HTMLState where + extractReaderOptions = extractReaderOptions . parserState + +instance HasMeta HTMLState where + setMeta s b st = st {parserState = setMeta s b $ parserState st} + deleteMeta s st = st {parserState = deleteMeta s $ parserState st} + +instance Default HTMLLocal where + def = HTMLLocal NoQuote False False + +instance HasLastStrPosition HTMLState where + setLastStrPos s st = st {parserState = setLastStrPos s (parserState st)} + getLastStrPos = getLastStrPos . parserState -- cgit v1.2.3