diff options
author | Albert Krewinkel <albert@zeitkraut.de> | 2020-11-22 22:30:47 +0100 |
---|---|---|
committer | Albert Krewinkel <albert+github@zeitkraut.de> | 2020-11-23 10:12:20 +0100 |
commit | f9258371dd20e0a9569c04923188a91f6c2e489e (patch) | |
tree | 7f0e5339cc0b0aec1d893c3ea2cc2a335ea41549 /src/Text/Pandoc/Readers/HTML/Parsing.hs | |
parent | 75c881e2d97df015857a18ec46df0ecc17347778 (diff) | |
download | pandoc-f9258371dd20e0a9569c04923188a91f6c2e489e.tar.gz |
HTML reader: extract submodules
Reducing module size should reduce memory use during compilation.
This is preparatory work to tackle support for more table features.
Diffstat (limited to 'src/Text/Pandoc/Readers/HTML/Parsing.hs')
-rw-r--r-- | src/Text/Pandoc/Readers/HTML/Parsing.hs | 156 |
1 files changed, 156 insertions, 0 deletions
diff --git a/src/Text/Pandoc/Readers/HTML/Parsing.hs b/src/Text/Pandoc/Readers/HTML/Parsing.hs new file mode 100644 index 000000000..7fda066b5 --- /dev/null +++ b/src/Text/Pandoc/Readers/HTML/Parsing.hs @@ -0,0 +1,156 @@ +{-# LANGUAGE OverloadedStrings #-} +{- | + Module : Text.Pandoc.Readers.HTML.Parsing + Copyright : Copyright (C) 2006-2020 John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : John MacFarlane <jgm@berkeley.edu> + Stability : alpha + Portability : portable + +Parsing functions and utilities. +-} +module Text.Pandoc.Readers.HTML.Parsing + ( pInTags + , pInTags' + , pInTag + , pAny + , pCloses + , pSatisfy + , pBlank + , matchTagClose + , matchTagOpen + , isSpace + ) +where + +import Control.Monad (guard, void, mzero) +import Data.Text (Text) +import Text.HTML.TagSoup +import Text.Pandoc.Class.PandocMonad (PandocMonad (..)) +import Text.Pandoc.Parsing + ( (<|>), eof, getPosition, lookAhead, manyTill, newPos, optional + , skipMany, setPosition, token, try) +import Text.Pandoc.Readers.HTML.TagCategories +import Text.Pandoc.Readers.HTML.Types +import Text.Pandoc.Shared (tshow) +import qualified Data.Set as Set +import qualified Data.Text as T + +pInTags :: (PandocMonad m, Monoid a) => Text -> TagParser m a -> TagParser m a +pInTags tagtype parser = pInTags' tagtype (const True) parser + +pInTags' :: (PandocMonad m, Monoid a) + => Text + -> (Tag Text -> Bool) + -> TagParser m a + -> TagParser m a +pInTags' tagtype tagtest parser = try $ do + pSatisfy (\t -> t ~== TagOpen tagtype [] && tagtest t) + mconcat <$> manyTill parser (pCloses tagtype <|> eof) + +-- parses p, preceded by an opening tag (optional if tagsOptional) +-- and followed by a closing tag (optional if tagsOptional) +pInTag :: PandocMonad m => Bool -> Text -> TagParser m a -> TagParser m a +pInTag tagsOptional tagtype p = try $ do + skipMany pBlank + (if tagsOptional then optional else void) $ pSatisfy (matchTagOpen tagtype []) + skipMany pBlank + x <- p + skipMany pBlank + (if tagsOptional then optional else void) $ pSatisfy (matchTagClose tagtype) + skipMany pBlank + return x + +pCloses :: PandocMonad m => Text -> TagParser m () +pCloses tagtype = try $ do + t <- lookAhead $ pSatisfy $ \tag -> isTagClose tag || isTagOpen tag + case t of + (TagClose t') | t' == tagtype -> void pAny + (TagOpen t' _) | t' `closes` tagtype -> return () + (TagClose "ul") | tagtype == "li" -> return () + (TagClose "ol") | tagtype == "li" -> return () + (TagClose "dl") | tagtype == "dd" -> return () + (TagClose "table") | tagtype == "td" -> return () + (TagClose "table") | tagtype == "th" -> return () + (TagClose "table") | tagtype == "tr" -> return () + (TagClose "td") | tagtype `Set.member` blockHtmlTags -> return () + (TagClose "th") | tagtype `Set.member` blockHtmlTags -> return () + (TagClose t') | tagtype == "p" && t' `Set.member` blockHtmlTags + -> return () -- see #3794 + _ -> mzero + +pBlank :: PandocMonad m => TagParser m () +pBlank = try $ do + (TagText str) <- pSatisfy isTagText + guard $ T.all isSpace str + +pLocation :: PandocMonad m => TagParser m () +pLocation = do + (TagPosition r c) <- pSat isTagPosition + setPosition $ newPos "input" r c + +pSat :: PandocMonad m => (Tag Text -> Bool) -> TagParser m (Tag Text) +pSat f = do + pos <- getPosition + token tshow (const pos) (\x -> if f x then Just x else Nothing) + +pSatisfy :: PandocMonad m => (Tag Text -> Bool) -> TagParser m (Tag Text) +pSatisfy f = try $ optional pLocation >> pSat f + +matchTagClose :: Text -> (Tag Text -> Bool) +matchTagClose t = (~== TagClose t) + +matchTagOpen :: Text -> [(Text, Text)] -> (Tag Text -> Bool) +matchTagOpen t as = (~== TagOpen t as) + +pAny :: PandocMonad m => TagParser m (Tag Text) +pAny = pSatisfy (const True) + +isSpace :: Char -> Bool +isSpace ' ' = True +isSpace '\t' = True +isSpace '\n' = True +isSpace '\r' = True +isSpace _ = False + +-- taken from HXT and extended +-- See http://www.w3.org/TR/html5/syntax.html sec 8.1.2.4 optional tags +closes :: Text -> Text -> Bool +_ `closes` "body" = False +_ `closes` "html" = False +"body" `closes` "head" = True +"a" `closes` "a" = True +"li" `closes` "li" = True +"th" `closes` t | t `elem` ["th","td"] = True +"td" `closes` t | t `elem` ["th","td"] = True +"tr" `closes` t | t `elem` ["th","td","tr"] = True +"dd" `closes` t | t `elem` ["dt", "dd"] = True +"dt" `closes` t | t `elem` ["dt","dd"] = True +"rt" `closes` t | t `elem` ["rb", "rt", "rtc"] = True +"optgroup" `closes` "optgroup" = True +"optgroup" `closes` "option" = True +"option" `closes` "option" = True +-- https://html.spec.whatwg.org/multipage/syntax.html#optional-tags +x `closes` "p" | x `elem` ["address", "article", "aside", "blockquote", + "dir", "div", "dl", "fieldset", "footer", "form", "h1", "h2", "h3", "h4", + "h5", "h6", "header", "hr", "main", "menu", "nav", "ol", "p", "pre", "section", + "table", "ul"] = True +_ `closes` "meta" = True +"form" `closes` "form" = True +"label" `closes` "label" = True +"map" `closes` "map" = True +"object" `closes` "object" = True +_ `closes` t | t `elem` ["option","style","script","textarea","title"] = True +t `closes` "select" | t /= "option" = True +"thead" `closes` "colgroup" = True +"tfoot" `closes` t | t `elem` ["thead","colgroup"] = True +"tbody" `closes` t | t `elem` ["tbody","tfoot","thead","colgroup"] = True +t `closes` t2 | + t `elem` ["h1","h2","h3","h4","h5","h6","dl","ol","ul","table","div","main","p"] && + t2 `elem` ["h1","h2","h3","h4","h5","h6","p" ] = True -- not "div" or "main" +t1 `closes` t2 | + t1 `Set.member` blockTags && + t2 `Set.notMember` blockTags && + t2 `Set.notMember` eitherBlockOrInline = True +_ `closes` _ = False |