From 49b738de4e9babe727a8e48f22a90eacbdd63847 Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Sun, 11 Jun 2017 11:55:21 +0200 Subject: Rewrote HTML reader to use Text throughout. - Export new NamedTag class from HTML reader. - Effect on memory usage is modest (< 10%). --- src/Text/Pandoc/Readers/HTML.hs | 331 +++++++++++++++++++++++----------------- 1 file changed, 194 insertions(+), 137 deletions(-) (limited to 'src/Text/Pandoc/Readers') diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs index 3bccf89fb..94f933c4d 100644 --- a/src/Text/Pandoc/Readers/HTML.hs +++ b/src/Text/Pandoc/Readers/HTML.hs @@ -1,5 +1,5 @@ {-# LANGUAGE FlexibleContexts, FlexibleInstances, MultiParamTypeClasses, -ViewPatterns#-} +ViewPatterns, OverloadedStrings #-} {- Copyright (C) 2006-2017 John MacFarlane @@ -34,6 +34,7 @@ module Text.Pandoc.Readers.HTML ( readHtml , htmlInBalanced , isInlineTag , isBlockTag + , NamedTag(..) , isTextTag , isCommentTag ) where @@ -43,7 +44,7 @@ import Text.HTML.TagSoup.Match import Text.Pandoc.Definition import qualified Text.Pandoc.Builder as B import Text.Pandoc.Builder (Blocks, Inlines, trimInlines, HasMeta(..)) -import Text.Pandoc.Shared ( extractSpaces, renderTags', addMetaField +import Text.Pandoc.Shared ( extractSpaces, addMetaField , escapeURI, safeRead ) import Text.Pandoc.Options (ReaderOptions(readerExtensions), extensionEnabled, Extension (Ext_epub_html_exts, @@ -53,13 +54,14 @@ import Text.Pandoc.Parsing hiding ((<|>)) import Text.Pandoc.Walk import qualified Data.Map as M import Data.Maybe ( fromMaybe, isJust) -import Data.List ( intercalate, isInfixOf, isPrefixOf ) +import Data.List ( intercalate, isPrefixOf ) import Data.Char ( isDigit, isLetter, isAlphaNum ) import Control.Monad ( guard, mzero, void, unless ) import Control.Arrow ((***)) import Control.Applicative ( (<|>) ) import Data.Monoid (First (..)) -import Data.Text (Text, unpack) +import Data.Text (Text) +import qualified Data.Text as T import Text.TeXMath (readMathML, writeTeX) import Data.Default (Default (..), def) import Control.Monad.Reader (ask, asks, local, ReaderT, runReaderT, lift) @@ -80,7 +82,7 @@ readHtml :: PandocMonad m readHtml opts inp = do let tags = stripPrefixes . canonicalizeTags $ parseTagsOptions parseOptions{ optTagPosition = True } - (unpack inp) + inp parseDoc = do blocks <- (fixPlains False) . mconcat <$> manyTill block eof meta <- stateMeta . parserState <$> getState @@ -130,7 +132,7 @@ setInPlain = local (\s -> s {inPlain = True}) type HTMLParser m s = ParserT s HTMLState (ReaderT HTMLLocal m) -type TagParser m = HTMLParser m [Tag String] +type TagParser m = HTMLParser m [Tag Text] pBody :: PandocMonad m => TagParser m Blocks pBody = pInTags "body" block @@ -140,12 +142,12 @@ pHead = pInTags "head" $ pTitle <|> pMetaTag <|> pBaseTag <|> (mempty <$ pAnyTag where pTitle = pInTags "title" inline >>= setTitle . trimInlines setTitle t = mempty <$ (updateState $ B.setMeta "title" t) pMetaTag = do - mt <- pSatisfy (~== TagOpen "meta" []) - let name = fromAttrib "name" mt + mt <- pSatisfy (matchTagOpen "meta" []) + let name = T.unpack $ fromAttrib "name" mt if null name then return mempty else do - let content = fromAttrib "content" mt + let content = T.unpack $ fromAttrib "content" mt updateState $ \s -> let ps = parserState s in s{ parserState = ps{ @@ -153,9 +155,9 @@ pHead = pInTags "head" $ pTitle <|> pMetaTag <|> pBaseTag <|> (mempty <$ pAnyTag (stateMeta ps) } } return mempty pBaseTag = do - bt <- pSatisfy (~== TagOpen "base" []) + bt <- pSatisfy (matchTagOpen "base" []) updateState $ \st -> st{ baseHref = - parseURIReference $ fromAttrib "href" bt } + parseURIReference $ T.unpack $ fromAttrib "href" bt } return mempty block :: PandocMonad m => TagParser m Blocks @@ -195,29 +197,31 @@ eSwitch :: (PandocMonad m, Monoid a) -> TagParser m a eSwitch constructor parser = try $ do guardEnabled Ext_epub_html_exts - pSatisfy (~== TagOpen "switch" []) + pSatisfy (matchTagOpen "switch" []) cases <- getFirst . mconcat <$> manyTill (First <$> (eCase <* skipMany pBlank) ) - (lookAhead $ try $ pSatisfy (~== TagOpen "default" [])) + (lookAhead $ try $ pSatisfy (matchTagOpen "default" [])) skipMany pBlank fallback <- pInTags "default" (skipMany pBlank *> parser <* skipMany pBlank) skipMany pBlank - pSatisfy (~== TagClose "switch") + pSatisfy (matchTagClose "switch") return $ maybe fallback constructor cases eCase :: PandocMonad m => TagParser m (Maybe Inlines) eCase = do skipMany pBlank - TagOpen _ attr <- lookAhead $ pSatisfy $ (~== TagOpen "case" []) + TagOpen _ attr' <- lookAhead $ pSatisfy $ (matchTagOpen "case" []) + let attr = toStringAttr attr' case (flip lookup namespaces) =<< lookup "required-namespace" attr of Just p -> Just <$> (pInTags "case" (skipMany pBlank *> p <* skipMany pBlank)) - Nothing -> Nothing <$ manyTill pAnyTag (pSatisfy (~== TagClose "case")) + Nothing -> Nothing <$ manyTill pAnyTag (pSatisfy (matchTagClose "case")) eFootnote :: PandocMonad m => TagParser m () eFootnote = try $ do let notes = ["footnote", "rearnote"] guardEnabled Ext_epub_html_exts - (TagOpen tag attr) <- lookAhead $ pAnyTag + (TagOpen tag attr') <- lookAhead $ pAnyTag + let attr = toStringAttr attr' guard (maybe False (flip elem notes) (lookup "type" attr)) let ident = fromMaybe "" (lookup "id" attr) content <- pInTags tag block @@ -229,7 +233,8 @@ addNote uid cont = updateState (\s -> s {noteTable = (uid, cont) : (noteTable s) eNoteref :: PandocMonad m => TagParser m Inlines eNoteref = try $ do guardEnabled Ext_epub_html_exts - TagOpen tag attr <- lookAhead $ pAnyTag + TagOpen tag attr' <- lookAhead $ pAnyTag + let attr = toStringAttr attr' guard (maybe False (== "noteref") (lookup "type" attr)) let ident = maybe "" (dropWhile (== '#')) (lookup "href" attr) guard (not (null ident)) @@ -249,10 +254,10 @@ pList = pBulletList <|> pOrderedList <|> pDefinitionList pBulletList :: PandocMonad m => TagParser m Blocks pBulletList = try $ do - pSatisfy (~== TagOpen "ul" []) + pSatisfy (matchTagOpen "ul" []) let nonItem = pSatisfy (\t -> not (tagOpen (`elem` ["li","ol","ul","dl"]) (const True) t) && - not (t ~== TagClose "ul")) + not (matchTagClose "ul" t)) -- note: if they have an
    or
      not in scope of a
    • , -- treat it as a list item, though it's not valid xhtml... skipMany nonItem @@ -261,7 +266,8 @@ pBulletList = try $ do pListItem :: PandocMonad m => TagParser m a -> TagParser m Blocks pListItem nonItem = do - TagOpen _ attr <- lookAhead $ pSatisfy (~== TagOpen "li" []) + TagOpen _ attr' <- lookAhead $ pSatisfy (matchTagOpen "li" []) + let attr = toStringAttr attr' let addId ident bs = case B.toList bs of (Plain ils:xs) -> B.fromList (Plain [Span (ident, [], []) ils] : xs) @@ -287,7 +293,8 @@ parseTypeAttr _ = DefaultStyle pOrderedList :: PandocMonad m => TagParser m Blocks pOrderedList = try $ do - TagOpen _ attribs <- pSatisfy (~== TagOpen "ol" []) + TagOpen _ attribs' <- pSatisfy (matchTagOpen "ol" []) + let attribs = toStringAttr attribs' let (start, style) = (sta', sty') where sta = fromMaybe "1" $ lookup "start" attribs @@ -309,7 +316,7 @@ pOrderedList = try $ do ] let nonItem = pSatisfy (\t -> not (tagOpen (`elem` ["li","ol","ul","dl"]) (const True) t) && - not (t ~== TagClose "ol")) + not (matchTagClose "ol" t)) -- note: if they have an
        or
          not in scope of a
        • , -- treat it as a list item, though it's not valid xhtml... skipMany nonItem @@ -318,14 +325,14 @@ pOrderedList = try $ do pDefinitionList :: PandocMonad m => TagParser m Blocks pDefinitionList = try $ do - pSatisfy (~== TagOpen "dl" []) + pSatisfy (matchTagOpen "dl" []) items <- manyTill pDefListItem (pCloses "dl") return $ B.definitionList items pDefListItem :: PandocMonad m => TagParser m (Inlines, [Blocks]) pDefListItem = try $ do - let nonItem = pSatisfy (\t -> not (t ~== TagOpen "dt" []) && - not (t ~== TagOpen "dd" []) && not (t ~== TagClose "dl")) + let nonItem = pSatisfy (\t -> not (matchTagOpen "dt" [] t) && + not (matchTagOpen "dd" [] t) && not (matchTagClose "dl" t)) terms <- many1 (try $ skipMany nonItem >> pInTags "dt" inline) defs <- many1 (try $ skipMany nonItem >> pInTags "dd" block) skipMany nonItem @@ -348,12 +355,12 @@ fixPlains inList bs = if any isParaish bs' plainToPara x = x bs' = B.toList bs -pRawTag :: PandocMonad m => TagParser m String +pRawTag :: PandocMonad m => TagParser m Text pRawTag = do tag <- pAnyTag let ignorable x = x `elem` ["html","head","body","!DOCTYPE","?xml"] if tagOpen ignorable (const True) tag || tagClose ignorable tag - then return [] + then return mempty else return $ renderTags' [tag] pDiv :: PandocMonad m => TagParser m Blocks @@ -362,7 +369,8 @@ pDiv = try $ do let isDivLike "div" = True isDivLike "section" = True isDivLike _ = False - TagOpen tag attr <- lookAhead $ pSatisfy $ tagOpen isDivLike (const True) + TagOpen tag attr' <- lookAhead $ pSatisfy $ tagOpen isDivLike (const True) + let attr = toStringAttr attr' contents <- pInTags tag block let (ident, classes, kvs) = mkAttr attr let classes' = if tag == "section" @@ -372,7 +380,7 @@ pDiv = try $ do pRawHtmlBlock :: PandocMonad m => TagParser m Blocks pRawHtmlBlock = do - raw <- pHtmlBlock "script" <|> pHtmlBlock "style" <|> pRawTag + raw <- T.unpack <$> (pHtmlBlock "script" <|> pHtmlBlock "style" <|> pRawTag) exts <- getOption readerExtensions if extensionEnabled Ext_raw_html exts && not (null raw) then return $ B.rawBlock "html" raw @@ -387,33 +395,35 @@ ignore raw = do logMessage $ SkippedContent raw pos return mempty -pHtmlBlock :: PandocMonad m => String -> TagParser m String +pHtmlBlock :: PandocMonad m => Text -> TagParser m Text pHtmlBlock t = try $ do - open <- pSatisfy (~== TagOpen t []) - contents <- manyTill pAnyTag (pSatisfy (~== TagClose t)) - return $ renderTags' $ [open] ++ contents ++ [TagClose t] + open <- pSatisfy (matchTagOpen t []) + contents <- manyTill pAnyTag (pSatisfy (matchTagClose t)) + return $ renderTags' $ [open] <> contents <> [TagClose t] -- Sets chapter context eSection :: PandocMonad m => TagParser m Blocks eSection = try $ do - let matchChapter as = maybe False (isInfixOf "chapter") (lookup "type" as) + let matchChapter as = maybe False (T.isInfixOf "chapter") (lookup "type" as) let sectTag = tagOpen (`elem` sectioningContent) matchChapter TagOpen tag _ <- lookAhead $ pSatisfy sectTag setInChapter (pInTags tag block) -headerLevel :: PandocMonad m => String -> TagParser m Int +headerLevel :: PandocMonad m => Text -> TagParser m Int headerLevel tagtype = do - let level = read (drop 1 tagtype) - (try $ do - guardEnabled Ext_epub_html_exts - asks inChapter >>= guard - return (level - 1)) - <|> - return level + case safeRead (T.unpack (T.drop 1 tagtype)) of + Just level -> + (try $ do + guardEnabled Ext_epub_html_exts + asks inChapter >>= guard + return (level - 1)) + <|> + return level + Nothing -> fail "Could not retrieve header level" eTitlePage :: PandocMonad m => TagParser m () eTitlePage = try $ do - let isTitlePage as = maybe False (isInfixOf "titlepage") (lookup "type" as) + let isTitlePage as = maybe False (T.isInfixOf "titlepage") (lookup "type" as) let groupTag = tagOpen (\x -> x `elem` groupingContent || x == "section") isTitlePage TagOpen tag _ <- lookAhead $ pSatisfy groupTag @@ -421,19 +431,21 @@ eTitlePage = try $ do pHeader :: PandocMonad m => TagParser m Blocks pHeader = try $ do - TagOpen tagtype attr <- pSatisfy $ + TagOpen tagtype attr' <- pSatisfy $ tagOpen (`elem` ["h1","h2","h3","h4","h5","h6"]) (const True) - let bodyTitle = TagOpen tagtype attr ~== TagOpen "h1" [("class","title")] + let attr = toStringAttr attr' + let bodyTitle = TagOpen tagtype attr' ~== TagOpen ("h1" :: Text) + [("class","title")] level <- headerLevel tagtype contents <- trimInlines . mconcat <$> manyTill inline (pCloses tagtype <|> eof) let ident = fromMaybe "" $ lookup "id" attr let classes = maybe [] words $ lookup "class" attr let keyvals = [(k,v) | (k,v) <- attr, k /= "class", k /= "id"] - attr' <- registerHeader (ident, classes, keyvals) contents + attr'' <- registerHeader (ident, classes, keyvals) contents return $ if bodyTitle then mempty -- skip a representation of the title in the body - else B.headerWith attr' level contents + else B.headerWith attr'' level contents pHrule :: PandocMonad m => TagParser m Blocks pHrule = do @@ -442,7 +454,7 @@ pHrule = do pTable :: PandocMonad m => TagParser m Blocks pTable = try $ do - TagOpen _ _ <- pSatisfy (~== TagOpen "table" []) + TagOpen _ _ <- pSatisfy (matchTagOpen "table" []) skipMany pBlank caption <- option mempty $ pInTags "caption" inline <* skipMany pBlank widths' <- (mconcat <$> many1 pColgroup) <|> many pCol @@ -456,8 +468,8 @@ pTable = try $ do else return head'' rowsLs <- many pTBody rows' <- pOptInTag "tfoot" $ many pTr - TagClose _ <- pSatisfy (~== TagClose "table") - let rows'' = (concat rowsLs) ++ rows' + TagClose _ <- pSatisfy (matchTagClose "table") + let rows'' = (concat rowsLs) <> rows' -- fail on empty table guard $ not $ null head' && null rows'' let isSinglePlain x = case B.toList x of @@ -468,7 +480,7 @@ pTable = try $ do let cols = length $ if null head' then head rows'' else head' -- add empty cells to short rows let addEmpties r = case cols - length r of - n | n > 0 -> r ++ replicate n mempty + n | n > 0 -> r <> replicate n mempty | otherwise -> r let rows = map addEmpties rows'' let aligns = replicate cols AlignDefault @@ -481,15 +493,16 @@ pTable = try $ do pCol :: PandocMonad m => TagParser m Double pCol = try $ do - TagOpen _ attribs <- pSatisfy (~== TagOpen "col" []) + TagOpen _ attribs' <- pSatisfy (matchTagOpen "col" []) + let attribs = toStringAttr attribs' skipMany pBlank - optional $ pSatisfy (~== TagClose "col") + optional $ pSatisfy (matchTagClose "col") skipMany pBlank return $ case lookup "width" attribs of Nothing -> case lookup "style" attribs of Just ('w':'i':'d':'t':'h':':':xs) | '%' `elem` xs -> fromMaybe 0.0 $ safeRead ('0':'.':filter - (`notElem` " \t\r\n%'\";") xs) + (`notElem` (" \t\r\n%'\";" :: [Char])) xs) _ -> 0.0 Just x | not (null x) && last x == '%' -> fromMaybe 0.0 $ safeRead ('0':'.':init x) @@ -497,18 +510,18 @@ pCol = try $ do pColgroup :: PandocMonad m => TagParser m [Double] pColgroup = try $ do - pSatisfy (~== TagOpen "colgroup" []) + pSatisfy (matchTagOpen "colgroup" []) skipMany pBlank manyTill pCol (pCloses "colgroup" <|> eof) <* skipMany pBlank -noColOrRowSpans :: Tag String -> Bool +noColOrRowSpans :: Tag Text -> Bool noColOrRowSpans t = isNullOrOne "colspan" && isNullOrOne "rowspan" where isNullOrOne x = case fromAttrib x t of "" -> True "1" -> True _ -> False -pCell :: PandocMonad m => String -> TagParser m [Blocks] +pCell :: PandocMonad m => Text -> TagParser m [Blocks] pCell celltype = try $ do skipMany pBlank res <- pInTags' celltype noColOrRowSpans block @@ -534,7 +547,8 @@ pPara = do pCodeBlock :: PandocMonad m => TagParser m Blocks pCodeBlock = try $ do - TagOpen _ attr <- pSatisfy (~== TagOpen "pre" []) + TagOpen _ attr' <- pSatisfy (matchTagOpen "pre" []) + let attr = toStringAttr attr' contents <- manyTill pAnyTag (pCloses "pre" <|> eof) let rawText = concatMap tagToString contents -- drop leading newline if any @@ -547,8 +561,8 @@ pCodeBlock = try $ do _ -> result' return $ B.codeBlockWith (mkAttr attr) result -tagToString :: Tag String -> String -tagToString (TagText s) = s +tagToString :: Tag Text -> String +tagToString (TagText s) = T.unpack s tagToString (TagOpen "br" _) = "\n" tagToString _ = "" @@ -577,20 +591,20 @@ pLocation = do (TagPosition r c) <- pSat isTagPosition setPosition $ newPos "input" r c -pSat :: PandocMonad m => (Tag String -> Bool) -> TagParser m (Tag String) +pSat :: PandocMonad m => (Tag Text -> Bool) -> TagParser m (Tag Text) pSat f = do pos <- getPosition token show (const pos) (\x -> if f x then Just x else Nothing) -pSatisfy :: PandocMonad m => (Tag String -> Bool) -> TagParser m (Tag String) +pSatisfy :: PandocMonad m => (Tag Text -> Bool) -> TagParser m (Tag Text) pSatisfy f = try $ optional pLocation >> pSat f -pAnyTag :: PandocMonad m => TagParser m (Tag String) +pAnyTag :: PandocMonad m => TagParser m (Tag Text) pAnyTag = pSatisfy (const True) pSelfClosing :: PandocMonad m - => (String -> Bool) -> ([Attribute String] -> Bool) - -> TagParser m (Tag String) + => (Text -> Bool) -> ([Attribute Text] -> Bool) + -> TagParser m (Tag Text) pSelfClosing f g = do open <- pSatisfy (tagOpen f g) optional $ pSatisfy (tagClose f) @@ -628,7 +642,7 @@ pStrikeout = do pInlinesInTags "s" B.strikeout <|> pInlinesInTags "strike" B.strikeout <|> pInlinesInTags "del" B.strikeout <|> - try (do pSatisfy (~== TagOpen "span" [("class","strikeout")]) + try (do pSatisfy (matchTagOpen "span" [("class","strikeout")]) contents <- mconcat <$> manyTill inline (pCloses "span") return $ B.strikeout contents) @@ -639,17 +653,19 @@ pLineBreak = do -- Unlike fromAttrib from tagsoup, this distinguishes -- between a missing attribute and an attribute with empty content. -maybeFromAttrib :: String -> Tag String -> Maybe String -maybeFromAttrib name (TagOpen _ attrs) = lookup name attrs +maybeFromAttrib :: String -> Tag Text -> Maybe String +maybeFromAttrib name (TagOpen _ attrs) = + T.unpack <$> lookup (T.pack name) attrs maybeFromAttrib _ _ = Nothing pLink :: PandocMonad m => TagParser m Inlines pLink = try $ do tag <- pSatisfy $ tagOpenLit "a" (const True) - let title = fromAttrib "title" tag + let title = T.unpack $ fromAttrib "title" tag -- take id from id attribute if present, otherwise name - let uid = maybe (fromAttrib "name" tag) id $ maybeFromAttrib "id" tag - let cls = words $ fromAttrib "class" tag + let uid = maybe (T.unpack $ fromAttrib "name" tag) id $ + maybeFromAttrib "id" tag + let cls = words $ T.unpack $ fromAttrib "class" tag lab <- trimInlines . mconcat <$> manyTill inline (pCloses "a") -- check for href; if href, then a link, otherwise a span case maybeFromAttrib "href" tag of @@ -667,30 +683,33 @@ pImage :: PandocMonad m => TagParser m Inlines pImage = do tag <- pSelfClosing (=="img") (isJust . lookup "src") mbBaseHref <- baseHref <$> getState - let url' = fromAttrib "src" tag + let url' = T.unpack $ fromAttrib "src" tag let url = case (parseURIReference url', mbBaseHref) of (Just rel, Just bs) -> show (rel `nonStrictRelativeTo` bs) _ -> url' - let title = fromAttrib "title" tag - let alt = fromAttrib "alt" tag - let uid = fromAttrib "id" tag - let cls = words $ fromAttrib "class" tag + let title = T.unpack $ fromAttrib "title" tag + let alt = T.unpack $ fromAttrib "alt" tag + let uid = T.unpack $ fromAttrib "id" tag + let cls = words $ T.unpack $ fromAttrib "class" tag let getAtt k = case fromAttrib k tag of "" -> [] - v -> [(k, v)] + v -> [(T.unpack k, T.unpack v)] let kvs = concat $ map getAtt ["width", "height", "sizes", "srcset"] return $ B.imageWith (uid, cls, kvs) (escapeURI url) title (B.text alt) pCode :: PandocMonad m => TagParser m Inlines pCode = try $ do - (TagOpen open attr) <- pSatisfy $ tagOpen (`elem` ["code","tt"]) (const True) + (TagOpen open attr') <- pSatisfy $ tagOpen (`elem` ["code","tt"]) (const True) + let attr = toStringAttr attr' result <- manyTill pAnyTag (pCloses open) - return $ B.codeWith (mkAttr attr) $ intercalate " " $ lines $ innerText result + return $ B.codeWith (mkAttr attr) $ intercalate " " $ lines $ T.unpack $ + innerText result pSpan :: PandocMonad m => TagParser m Inlines pSpan = try $ do guardEnabled Ext_native_spans - TagOpen _ attr <- lookAhead $ pSatisfy $ tagOpen (=="span") (const True) + TagOpen _ attr' <- lookAhead $ pSatisfy $ tagOpen (=="span") (const True) + let attr = toStringAttr attr' contents <- pInTags "span" inline let isSmallCaps = fontVariant == "small-caps" || "smallcaps" `elem` classes where styleAttr = fromMaybe "" $ lookup "style" attr @@ -708,7 +727,7 @@ pRawHtmlInline = do then pSatisfy (not . isBlockTag) else pSatisfy isInlineTag exts <- getOption readerExtensions - let raw = renderTags' [result] + let raw = T.unpack $ renderTags' [result] if extensionEnabled Ext_raw_html exts then return $ B.rawInline "html" raw else ignore raw @@ -716,32 +735,38 @@ pRawHtmlInline = do mathMLToTeXMath :: String -> Either String String mathMLToTeXMath s = writeTeX <$> readMathML s +toStringAttr :: [(Text, Text)] -> [(String, String)] +toStringAttr = map go + where go (x,y) = (T.unpack x, T.unpack y) + pMath :: PandocMonad m => Bool -> TagParser m Inlines pMath inCase = try $ do - open@(TagOpen _ attr) <- pSatisfy $ tagOpen (=="math") (const True) + open@(TagOpen _ attr') <- pSatisfy $ tagOpen (=="math") (const True) -- we'll assume math tags are MathML unless specially marked -- otherwise... + let attr = toStringAttr attr' unless inCase $ guard (maybe True (== mathMLNamespace) (lookup "xmlns" attr)) - contents <- manyTill pAnyTag (pSatisfy (~== TagClose "math")) - case mathMLToTeXMath (renderTags $ [open] ++ contents ++ [TagClose "math"]) of + contents <- manyTill pAnyTag (pSatisfy (matchTagClose "math")) + case mathMLToTeXMath (T.unpack $ renderTags $ + [open] <> contents <> [TagClose "math"]) of Left _ -> return $ B.spanWith ("",["math"],attr) $ B.text $ - innerText contents + T.unpack $ innerText contents Right [] -> return mempty Right x -> return $ case lookup "display" attr of Just "block" -> B.displayMath x _ -> B.math x -pInlinesInTags :: PandocMonad m => String -> (Inlines -> Inlines) +pInlinesInTags :: PandocMonad m => Text -> (Inlines -> Inlines) -> TagParser m Inlines pInlinesInTags tagtype f = extractSpaces f <$> pInTags tagtype inline -pInTags :: (PandocMonad m, Monoid a) => String -> TagParser m a -> TagParser m a +pInTags :: (PandocMonad m, Monoid a) => Text -> TagParser m a -> TagParser m a pInTags tagtype parser = pInTags' tagtype (const True) parser pInTags' :: (PandocMonad m, Monoid a) - => String - -> (Tag String -> Bool) + => Text + -> (Tag Text -> Bool) -> TagParser m a -> TagParser m a pInTags' tagtype tagtest parser = try $ do @@ -750,18 +775,18 @@ pInTags' tagtype tagtest parser = try $ do -- parses p, preceeded by an optional opening tag -- and followed by an optional closing tags -pOptInTag :: PandocMonad m => String -> TagParser m a -> TagParser m a +pOptInTag :: PandocMonad m => Text -> TagParser m a -> TagParser m a pOptInTag tagtype p = try $ do skipMany pBlank - optional $ pSatisfy (~== TagOpen tagtype []) + optional $ pSatisfy (matchTagOpen tagtype []) skipMany pBlank x <- p skipMany pBlank - optional $ pSatisfy (~== TagClose tagtype) + optional $ pSatisfy (matchTagClose tagtype) skipMany pBlank return x -pCloses :: PandocMonad m => String -> TagParser m () +pCloses :: PandocMonad m => Text -> TagParser m () pCloses tagtype = try $ do t <- lookAhead $ pSatisfy $ \tag -> isTagClose tag || isTagOpen tag case t of @@ -782,15 +807,15 @@ pTagText = try $ do parsed <- lift $ lift $ flip runReaderT qu $ runParserT (many pTagContents) st "text" str case parsed of - Left _ -> throwError $ PandocParseError $ "Could not parse `" ++ str ++ "'" + Left _ -> throwError $ PandocParseError $ "Could not parse `" <> T.unpack str <> "'" Right result -> return $ mconcat result pBlank :: PandocMonad m => TagParser m () pBlank = try $ do (TagText str) <- pSatisfy isTagText - guard $ all isSpace str + guard $ T.all isSpace str -type InlinesParser m = HTMLParser m String +type InlinesParser m = HTMLParser m Text pTagContents :: PandocMonad m => InlinesParser m Inlines pTagContents = @@ -871,13 +896,13 @@ pSpace = many1 (satisfy isSpace) >>= \xs -> -- Constants -- -eitherBlockOrInline :: Set.Set String +eitherBlockOrInline :: Set.Set Text eitherBlockOrInline = Set.fromList ["audio", "applet", "button", "iframe", "embed", "del", "ins", "progress", "map", "area", "noscript", "script", "object", "svg", "video", "source"] -blockHtmlTags :: Set.Set String +blockHtmlTags :: Set.Set Text blockHtmlTags = Set.fromList ["?xml", "!DOCTYPE", "address", "article", "aside", "blockquote", "body", "canvas", @@ -893,7 +918,7 @@ blockHtmlTags = Set.fromList -- We want to allow raw docbook in markdown documents, so we -- include docbook block tags here too. -blockDocBookTags :: Set.Set String +blockDocBookTags :: Set.Set Text blockDocBookTags = Set.fromList ["calloutlist", "bibliolist", "glosslist", "itemizedlist", "orderedlist", "segmentedlist", "simplelist", @@ -908,37 +933,52 @@ blockDocBookTags = Set.fromList "classsynopsis", "blockquote", "epigraph", "msgset", "sidebar", "title"] -epubTags :: Set.Set String +epubTags :: Set.Set Text epubTags = Set.fromList ["case", "switch", "default"] -blockTags :: Set.Set String +blockTags :: Set.Set Text blockTags = Set.unions [blockHtmlTags, blockDocBookTags, epubTags] -isInlineTag :: Tag String -> Bool -isInlineTag t = tagOpen isInlineTagName (const True) t || - tagClose isInlineTagName t || - tagComment (const True) t - where isInlineTagName x = x `Set.notMember` blockTags - -isBlockTag :: Tag String -> Bool -isBlockTag t = tagOpen isBlockTagName (const True) t || - tagClose isBlockTagName t || - tagComment (const True) t - where isBlockTagName ('?':_) = True - isBlockTagName ('!':_) = True - isBlockTagName x = x `Set.member` blockTags - || x `Set.member` - eitherBlockOrInline - -isTextTag :: Tag String -> Bool +class NamedTag a where + getTagName :: a -> Maybe Text + +instance NamedTag (Tag Text) where + getTagName (TagOpen t _) = Just t + getTagName (TagClose t) = Just t + getTagName _ = Nothing + +instance NamedTag (Tag String) where + getTagName (TagOpen t _) = Just (T.pack t) + getTagName (TagClose t) = Just (T.pack t) + getTagName _ = Nothing + +isInlineTag :: NamedTag (Tag a) => Tag a -> Bool +isInlineTag t = isInlineTagName || isCommentTag t + where isInlineTagName = case getTagName t of + Just x -> x + `Set.notMember` blockTags + Nothing -> False + +isBlockTag :: NamedTag (Tag a) => Tag a -> Bool +isBlockTag t = isBlockTagName || isTagComment t + where isBlockTagName = + case getTagName t of + Just x + | "?" `T.isPrefixOf` x -> True + | "!" `T.isPrefixOf` x -> True + | otherwise -> x `Set.member` blockTags + || x `Set.member` eitherBlockOrInline + Nothing -> False + +isTextTag :: Tag a -> Bool isTextTag = tagText (const True) -isCommentTag :: Tag String -> Bool +isCommentTag :: Tag a -> Bool isCommentTag = tagComment (const True) -- taken from HXT and extended -- See http://www.w3.org/TR/html5/syntax.html sec 8.1.2.4 optional tags -closes :: String -> String -> Bool +closes :: Text -> Text -> Bool _ `closes` "body" = False _ `closes` "html" = False "body" `closes` "head" = True @@ -1000,8 +1040,11 @@ htmlInBalanced f = try $ do let cs = ec - sc lscontents <- unlines <$> count ls anyLine cscontents <- count cs anyChar - (_,closetag) <- htmlTag (~== TagClose tn) - return (lscontents ++ cscontents ++ closetag) + closetag <- do + x <- many (satisfy (/='>')) + char '>' + return (x <> ">") + return (lscontents <> cscontents <> closetag) _ -> mzero _ -> mzero @@ -1019,7 +1062,7 @@ htmlInBalanced' tagname ts = fromMaybe [] $ go 0 ts go n (t:ts') = (t :) <$> go n ts' go _ [] = mzero -hasTagWarning :: [Tag String] -> Bool +hasTagWarning :: [Tag a] -> Bool hasTagWarning (TagWarning _:_) = True hasTagWarning _ = False @@ -1047,47 +1090,48 @@ htmlTag f = try $ do -- basic sanity check, since the parser is very forgiving -- and finds tags in stuff like x should NOT be a tag either. -- tagsoup will parse it as TagOpen "https:" [("example.org","")] guard $ last tagname /= ':' rendered <- manyTill anyChar (char '>') - return (next, rendered ++ ">") + return (next, rendered <> ">") case next of TagComment s | "") + return (next, "") | otherwise -> fail "bogus comment mode, HTML5 parse error" TagOpen tagname attr -> do guard $ all (isName . fst) attr handleTag tagname - TagClose tagname -> handleTag tagname + TagClose tagname -> + handleTag tagname _ -> mzero mkAttr :: [(String, String)] -> Attr mkAttr attr = (attribsId, attribsClasses, attribsKV) where attribsId = fromMaybe "" $ lookup "id" attr - attribsClasses = (words $ fromMaybe "" $ lookup "class" attr) ++ epubTypes + attribsClasses = (words $ fromMaybe "" $ lookup "class" attr) <> epubTypes attribsKV = filter (\(k,_) -> k /= "class" && k /= "id") attr epubTypes = words $ fromMaybe "" $ lookup "epub:type" attr -- Strip namespace prefixes -stripPrefixes :: [Tag String] -> [Tag String] +stripPrefixes :: [Tag Text] -> [Tag Text] stripPrefixes = map stripPrefix -stripPrefix :: Tag String -> Tag String +stripPrefix :: Tag Text -> Tag Text stripPrefix (TagOpen s as) = TagOpen (stripPrefix' s) (map (stripPrefix' *** id) as) stripPrefix (TagClose s) = TagClose (stripPrefix' s) stripPrefix x = x -stripPrefix' :: String -> String +stripPrefix' :: Text -> Text stripPrefix' s = - case span (/= ':') s of - (_, "") -> s - (_, (_:ts)) -> ts + if T.null t then s else T.drop 1 t + where (_, t) = T.span (/= ':') s isSpace :: Char -> Bool isSpace ' ' = True @@ -1130,19 +1174,32 @@ instance HasLastStrPosition HTMLState where setLastStrPos s st = st {parserState = setLastStrPos s (parserState st)} getLastStrPos = getLastStrPos . parserState +-- For now we need a special verison here; the one in Shared has String type +renderTags' :: [Tag Text] -> Text +renderTags' = renderTagsOptions + renderOptions{ optMinimize = matchTags ["hr", "br", "img", + "meta", "link"] + , optRawTag = matchTags ["script", "style"] } + where matchTags = \tags -> flip elem tags . T.toLower + -- EPUB Specific -- -- -sectioningContent :: [String] +sectioningContent :: [Text] sectioningContent = ["article", "aside", "nav", "section"] -groupingContent :: [String] +groupingContent :: [Text] groupingContent = ["p", "hr", "pre", "blockquote", "ol" , "ul", "li", "dl", "dt", "dt", "dd" , "figure", "figcaption", "div", "main"] +matchTagClose :: Text -> (Tag Text -> Bool) +matchTagClose t = (~== TagClose t) + +matchTagOpen :: Text -> [(Text, Text)] -> (Tag Text -> Bool) +matchTagOpen t as = (~== TagOpen t as) {- @@ -1150,7 +1207,7 @@ types :: [(String, ([String], Int))] types = -- Document divisions map (\s -> (s, (["section", "body"], 0))) ["volume", "part", "chapter", "division"] - ++ -- Document section and components + <> -- Document section and components [ ("abstract", ([], 0))] -} -- cgit v1.2.3