diff options
Diffstat (limited to 'src/Text/Pandoc/Readers/HTML.hs')
-rw-r--r-- | src/Text/Pandoc/Readers/HTML.hs | 411 |
1 files changed, 222 insertions, 189 deletions
diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs index eb78979a3..fdf4f28e0 100644 --- a/src/Text/Pandoc/Readers/HTML.hs +++ b/src/Text/Pandoc/Readers/HTML.hs @@ -5,7 +5,7 @@ {-# LANGUAGE ViewPatterns #-} {- | Module : Text.Pandoc.Readers.HTML - Copyright : Copyright (C) 2006-2020 John MacFarlane + Copyright : Copyright (C) 2006-2021 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane <jgm@berkeley.edu> @@ -19,21 +19,20 @@ module Text.Pandoc.Readers.HTML ( readHtml , htmlInBalanced , isInlineTag , isBlockTag - , NamedTag(..) , isTextTag , isCommentTag ) where import Control.Applicative ((<|>)) -import Control.Arrow (first) import Control.Monad (guard, msum, mzero, unless, void) -import Control.Monad.Except (throwError) +import Control.Monad.Except (throwError, catchError) import Control.Monad.Reader (ask, asks, lift, local, runReaderT) import Data.ByteString.Base64 (encode) import Data.Char (isAlphaNum, isLetter) import Data.Default (Default (..), def) import Data.Foldable (for_) import Data.List.Split (splitWhen) +import Data.List (foldl') import qualified Data.Map as M import Data.Maybe (fromMaybe, isJust, isNothing) import Data.Monoid (First (..)) @@ -62,21 +61,22 @@ import Text.Pandoc.Options ( ReaderOptions (readerExtensions, readerStripComments), extensionEnabled) import Text.Pandoc.Parsing hiding ((<|>)) -import Text.Pandoc.Shared (addMetaField, blocksToInlines', crFilter, escapeURI, - extractSpaces, htmlSpanLikeElements, safeRead, tshow) +import Text.Pandoc.Shared ( + addMetaField, blocksToInlines', escapeURI, extractSpaces, + htmlSpanLikeElements, renderTags', safeRead, tshow) import Text.Pandoc.Walk import Text.Parsec.Error import Text.TeXMath (readMathML, writeTeX) -- | Convert HTML-formatted string to 'Pandoc' document. -readHtml :: PandocMonad m +readHtml :: (PandocMonad m, ToSources a) => ReaderOptions -- ^ Reader options - -> Text -- ^ String to parse (assumes @'\n'@ line endings) + -> a -- ^ Input to parse -> m Pandoc readHtml opts inp = do - let tags = stripPrefixes . canonicalizeTags $ + let tags = stripPrefixes $ canonicalizeTags $ parseTagsOptions parseOptions{ optTagPosition = True } - (crFilter inp) + (sourcesToText $ toSources inp) parseDoc = do blocks <- fixPlains False . mconcat <$> manyTill block eof meta <- stateMeta . parserState <$> getState @@ -95,6 +95,15 @@ readHtml opts inp = do Right doc -> return doc Left err -> throwError $ PandocParseError $ T.pack $ getError err +-- Strip namespace prefixes on tags (not attributes) +stripPrefixes :: [Tag Text] -> [Tag Text] +stripPrefixes = map stripPrefix + +stripPrefix :: Tag Text -> Tag Text +stripPrefix (TagOpen s as) = TagOpen (T.takeWhileEnd (/=':') s) as +stripPrefix (TagClose s) = TagClose (T.takeWhileEnd (/=':') s) +stripPrefix x = x + replaceNotes :: PandocMonad m => [Block] -> TagParser m [Block] replaceNotes bs = do st <- getState @@ -112,14 +121,18 @@ setInPlain :: PandocMonad m => HTMLParser m s a -> HTMLParser m s a setInPlain = local (\s -> s {inPlain = True}) pHtml :: PandocMonad m => TagParser m Blocks -pHtml = try $ do +pHtml = do (TagOpen "html" attr) <- lookAhead pAny - for_ (lookup "lang" attr) $ + for_ (lookup "lang" attr <|> lookup "xml:lang" attr) $ updateState . B.setMeta "lang" . B.text pInTags "html" block pBody :: PandocMonad m => TagParser m Blocks -pBody = pInTags "body" block +pBody = do + (TagOpen "body" attr) <- lookAhead pAny + for_ (lookup "lang" attr <|> lookup "xml:lang" attr) $ + updateState . B.setMeta "lang" . B.text + pInTags "body" block pHead :: PandocMonad m => TagParser m Blocks pHead = pInTags "head" $ pTitle <|> pMetaTag <|> pBaseTag <|> (mempty <$ pAny) @@ -145,32 +158,65 @@ pHead = pInTags "head" $ pTitle <|> pMetaTag <|> pBaseTag <|> (mempty <$ pAny) return mempty block :: PandocMonad m => TagParser m Blocks -block = do - res <- choice - [ eSection - , eSwitch B.para block - , mempty <$ eFootnote - , mempty <$ eTOC - , mempty <$ eTitlePage - , pPara - , pHeader - , pBlockQuote - , pCodeBlock - , pList - , pHrule - , pTable block - , pHtml - , pHead - , pBody - , pLineBlock - , pDiv - , pPlain - , pFigure - , pIframe - , pRawHtmlBlock - ] - trace (T.take 60 $ tshow $ B.toList res) - return res +block = ((do + tag <- lookAhead (pSatisfy isBlockTag) + exts <- getOption readerExtensions + case tag of + TagOpen name attr -> + let type' = fromMaybe "" $ + lookup "type" attr <|> lookup "epub:type" attr + epubExts = extensionEnabled Ext_epub_html_exts exts + in + case name of + _ | name `elem` sectioningContent + , epubExts + , "chapter" `T.isInfixOf` type' + -> eSection + _ | epubExts + , type' `elem` ["footnote", "rearnote"] + -> mempty <$ eFootnote + _ | epubExts + , type' == "toc" + -> mempty <$ eTOC + _ | "titlepage" `T.isInfixOf` type' + , name `elem` ("section" : groupingContent) + -> mempty <$ eTitlePage + "p" -> pPara + "h1" -> pHeader + "h2" -> pHeader + "h3" -> pHeader + "h4" -> pHeader + "h5" -> pHeader + "h6" -> pHeader + "blockquote" -> pBlockQuote + "pre" -> pCodeBlock + "ul" -> pBulletList + "ol" -> pOrderedList + "dl" -> pDefinitionList + "table" -> pTable block + "hr" -> pHrule + "html" -> pHtml + "head" -> pHead + "body" -> pBody + "div" + | extensionEnabled Ext_line_blocks exts + , Just "line-block" <- lookup "class" attr + -> pLineBlock + | otherwise + -> pDiv + "section" -> pDiv + "header" -> pDiv + "main" -> pDiv + "figure" -> pFigure + "iframe" -> pIframe + "style" -> pRawHtmlBlock + "textarea" -> pRawHtmlBlock + "switch" + | epubExts + -> eSwitch B.para block + _ -> mzero + _ -> mzero) <|> pPlain <|> pRawHtmlBlock) >>= \res -> + res <$ trace (T.take 60 $ tshow $ B.toList res) namespaces :: PandocMonad m => [(Text, TagParser m Inlines)] namespaces = [(mathMLNamespace, pMath True)] @@ -243,9 +289,6 @@ eTOC = try $ do guard $ (lookup "type" attr <|> lookup "epub:type" attr) == Just "toc" void (pInTags tag block) -pList :: PandocMonad m => TagParser m Blocks -pList = pBulletList <|> pOrderedList <|> pDefinitionList - pBulletList :: PandocMonad m => TagParser m Blocks pBulletList = try $ do pSatisfy (matchTagOpen "ul" []) @@ -319,7 +362,10 @@ pDefListItem = try $ do terms <- many1 (try $ skipMany nonItem >> pInTags "dt" inline) defs <- many1 (try $ skipMany nonItem >> pInTags "dd" block) skipMany nonItem - let term = foldl1 (\x y -> x <> B.linebreak <> y) $ map trimInlines terms + let term = foldl' (\x y -> if null x + then trimInlines y + else x <> B.linebreak <> trimInlines y) + mempty terms return (term, map (fixPlains True) defs) fixPlains :: Bool -> Blocks -> Blocks @@ -356,13 +402,16 @@ pLineBlock = try $ do B.toList ils return $ B.lineBlock lns +isDivLike :: Text -> Bool +isDivLike "div" = True +isDivLike "section" = True +isDivLike "header" = True +isDivLike "main" = True +isDivLike _ = False + pDiv :: PandocMonad m => TagParser m Blocks pDiv = try $ do guardEnabled Ext_native_divs - let isDivLike "div" = True - isDivLike "section" = True - isDivLike "main" = True - isDivLike _ = False TagOpen tag attr' <- lookAhead $ pSatisfy $ tagOpen isDivLike (const True) let (ident, classes, kvs) = toAttr attr' contents <- pInTags tag block @@ -380,11 +429,17 @@ pIframe = try $ do tag <- pSatisfy (tagOpen (=="iframe") (isJust . lookup "src")) pCloses "iframe" <|> eof url <- canonicalizeUrl $ fromAttrib "src" tag - (bs, _) <- openURL url - let inp = UTF8.toText bs - opts <- readerOpts <$> getState - Pandoc _ contents <- readHtml opts inp - return $ B.divWith ("",["iframe"],[]) $ B.fromList contents + if T.null url + then ignore $ renderTags' [tag, TagClose "iframe"] + else catchError + (do (bs, _) <- openURL url + let inp = UTF8.toText bs + opts <- readerOpts <$> getState + Pandoc _ contents <- readHtml opts inp + return $ B.divWith ("",["iframe"],[]) $ B.fromList contents) + (\e -> do + logMessage $ CouldNotFetchResource url (renderError e) + ignore $ renderTags' [tag, TagClose "iframe"]) pRawHtmlBlock :: PandocMonad m => TagParser m Blocks pRawHtmlBlock = do @@ -446,17 +501,13 @@ pHeader = try $ do tagOpen (`elem` ["h1","h2","h3","h4","h5","h6"]) (const True) let attr = toStringAttr attr' - let bodyTitle = TagOpen tagtype attr' ~== TagOpen ("h1" :: Text) - [("class","title")] level <- headerLevel tagtype contents <- trimInlines . mconcat <$> manyTill inline (pCloses tagtype <|> eof) let ident = fromMaybe "" $ lookup "id" attr let classes = maybe [] T.words $ lookup "class" attr let keyvals = [(k,v) | (k,v) <- attr, k /= "class", k /= "id"] attr'' <- registerHeader (ident, classes, keyvals) contents - return $ if bodyTitle - then mempty -- skip a representation of the title in the body - else B.headerWith attr'' level contents + return $ B.headerWith attr'' level contents pHrule :: PandocMonad m => TagParser m Blocks pHrule = do @@ -506,7 +557,18 @@ pFigure = try $ do pCodeBlock :: PandocMonad m => TagParser m Blocks pCodeBlock = try $ do TagOpen _ attr' <- pSatisfy (matchTagOpen "pre" []) - let attr = toAttr attr' + -- if the `pre` has no attributes, try if it is followed by a `code` + -- element and use those attributes if possible. + attr <- case attr' of + _:_ -> pure (toAttr attr') + [] -> option nullAttr $ do + TagOpen _ codeAttr <- pSatisfy (matchTagOpen "code" []) + pure $ toAttr + [ (k, v') | (k, v) <- codeAttr + -- strip language from class + , let v' = if k == "class" + then fromMaybe v (T.stripPrefix "language-" v) + else v ] contents <- manyTill pAny (pCloses "pre" <|> eof) let rawText = T.concat $ map tagToText contents -- drop leading newline if any @@ -525,31 +587,47 @@ tagToText (TagOpen "br" _) = "\n" tagToText _ = "" inline :: PandocMonad m => TagParser m Inlines -inline = choice - [ eNoteref - , eSwitch id inline - , pTagText - , pQ - , pEmph - , pStrong - , pSuperscript - , pSubscript - , pSpanLike - , pSmall - , pStrikeout - , pUnderline - , pLineBreak - , pLink - , pImage - , pSvg - , pBdo - , pCode - , pCodeWithClass [("samp","sample"),("var","variable")] - , pSpan - , pMath False - , pScriptMath - , pRawHtmlInline - ] +inline = pTagText <|> do + tag <- lookAhead (pSatisfy isInlineTag) + exts <- getOption readerExtensions + case tag of + TagOpen name attr -> + case name of + "a" | extensionEnabled Ext_epub_html_exts exts + , Just "noteref" <- lookup "type" attr <|> lookup "epub:type" attr + , Just ('#',_) <- lookup "href" attr >>= T.uncons + -> eNoteref + | otherwise -> pLink + "switch" -> eSwitch id inline + "q" -> pQ + "em" -> pEmph + "i" -> pEmph + "strong" -> pStrong + "b" -> pStrong + "sup" -> pSuperscript + "sub" -> pSubscript + "small" -> pSmall + "s" -> pStrikeout + "strike" -> pStrikeout + "del" -> pStrikeout + "u" -> pUnderline + "ins" -> pUnderline + "br" -> pLineBreak + "img" -> pImage + "svg" -> pSvg + "bdo" -> pBdo + "code" -> pCode + "samp" -> pCodeWithClass "samp" "sample" + "var" -> pCodeWithClass "var" "variable" + "span" -> pSpan + "math" -> pMath False + "script" + | Just x <- lookup "type" attr + , "math/tex" `T.isPrefixOf` x -> pScriptMath + _ | name `elem` htmlSpanLikeElements -> pSpanLike + _ -> pRawHtmlInline + TagText _ -> pTagText + _ -> pRawHtmlInline pSelfClosing :: PandocMonad m => (Text -> Bool) -> ([Attribute Text] -> Bool) @@ -560,27 +638,25 @@ pSelfClosing f g = do return open pQ :: PandocMonad m => TagParser m Inlines -pQ = choice $ map try [citedQuote, normalQuote] - where citedQuote = do - tag <- pSatisfy $ tagOpenLit "q" (any ((=="cite") . fst)) - - url <- canonicalizeUrl $ fromAttrib "cite" tag - let uid = fromMaybe (fromAttrib "name" tag) $ - maybeFromAttrib "id" tag - let cls = T.words $ fromAttrib "class" tag - - makeQuote $ B.spanWith (uid, cls, [("cite", escapeURI url)]) - normalQuote = do - pSatisfy $ tagOpenLit "q" (const True) - makeQuote id - makeQuote wrapper = do - ctx <- asks quoteContext - let (constructor, innerContext) = case ctx of - InDoubleQuote -> (B.singleQuoted, InSingleQuote) - _ -> (B.doubleQuoted, InDoubleQuote) - - content <- withQuoteContext innerContext (mconcat <$> manyTill inline (pCloses "q")) - return $ extractSpaces (constructor . wrapper) content +pQ = do + TagOpen _ attrs <- pSatisfy $ tagOpenLit "q" (const True) + case lookup "cite" attrs of + Just url -> do + let uid = fromMaybe mempty $ + lookup "name" attrs <> lookup "id" attrs + let cls = maybe [] T.words $ lookup "class" attrs + url' <- canonicalizeUrl url + makeQuote $ B.spanWith (uid, cls, [("cite", escapeURI url')]) + Nothing -> makeQuote id + where + makeQuote wrapper = do + ctx <- asks quoteContext + let (constructor, innerContext) = case ctx of + InDoubleQuote -> (B.singleQuoted, InSingleQuote) + _ -> (B.doubleQuoted, InDoubleQuote) + content <- withQuoteContext innerContext + (mconcat <$> manyTill inline (pCloses "q")) + return $ extractSpaces (constructor . wrapper) content pEmph :: PandocMonad m => TagParser m Inlines pEmph = pInlinesInTags "em" B.emph <|> pInlinesInTags "i" B.emph @@ -646,17 +722,12 @@ pLink = try $ do pImage :: PandocMonad m => TagParser m Inlines pImage = do - tag <- pSelfClosing (=="img") (isJust . lookup "src") + tag@(TagOpen _ attr') <- pSelfClosing (=="img") (isJust . lookup "src") url <- canonicalizeUrl $ fromAttrib "src" tag let title = fromAttrib "title" tag let alt = fromAttrib "alt" tag - let uid = fromAttrib "id" tag - let cls = T.words $ fromAttrib "class" tag - let getAtt k = case fromAttrib k tag of - "" -> [] - v -> [(k, v)] - let kvs = concatMap getAtt ["width", "height", "sizes", "srcset"] - return $ B.imageWith (uid, cls, kvs) (escapeURI url) title (B.text alt) + let attr = toAttr $ filter (\(k,_) -> k /= "alt" && k /= "title" && k /= "src") attr' + return $ B.imageWith attr (escapeURI url) title (B.text alt) pSvg :: PandocMonad m => TagParser m Inlines pSvg = do @@ -671,13 +742,12 @@ pSvg = do UTF8.toText (encode $ UTF8.fromText rawText) return $ B.imageWith (ident,cls,[]) svgData mempty mempty -pCodeWithClass :: PandocMonad m => [(T.Text,Text)] -> TagParser m Inlines -pCodeWithClass elemToClass = try $ do - let tagTest = flip elem . fmap fst $ elemToClass - TagOpen open attr' <- pSatisfy $ tagOpen tagTest (const True) +pCodeWithClass :: PandocMonad m => Text -> Text -> TagParser m Inlines +pCodeWithClass name class' = try $ do + TagOpen open attr' <- pSatisfy $ tagOpen (== name) (const True) result <- manyTill pAny (pCloses open) let (ids,cs,kvs) = toAttr attr' - cs' = maybe cs (:cs) . lookup open $ elemToClass + cs' = class' : cs return . B.codeWith (ids,cs',kvs) . T.unwords . T.lines . innerText $ result @@ -764,17 +834,19 @@ pInlinesInTags tagtype f = extractSpaces f <$> pInTags tagtype inline pTagText :: PandocMonad m => TagParser m Inlines pTagText = try $ do + pos <- getPosition (TagText str) <- pSatisfy isTagText st <- getState qu <- ask parsed <- lift $ lift $ - flip runReaderT qu $ runParserT (many pTagContents) st "text" str + flip runReaderT qu $ runParserT (many pTagContents) st "text" + (Sources [(pos, str)]) case parsed of Left _ -> throwError $ PandocParseError $ "Could not parse `" <> str <> "'" Right result -> return $ mconcat result -type InlinesParser m = HTMLParser m Text +type InlinesParser m = HTMLParser m Sources pTagContents :: PandocMonad m => InlinesParser m Inlines pTagContents = @@ -868,27 +940,23 @@ pSpace = many1 (satisfy isSpace) >>= \xs -> then return B.softbreak else return B.space -class NamedTag a where - getTagName :: a -> Maybe Text - -instance NamedTag (Tag Text) where - getTagName (TagOpen t _) = Just t - getTagName (TagClose t) = Just t - getTagName _ = Nothing - -instance NamedTag (Tag String) where - getTagName (TagOpen t _) = Just (T.pack t) - getTagName (TagClose t) = Just (T.pack t) - getTagName _ = Nothing - -isInlineTag :: NamedTag (Tag a) => Tag a -> Bool -isInlineTag t = - isCommentTag t || case getTagName t of - Nothing -> False - Just x -> x `Set.notMember` blockTags || - T.take 1 x == "?" -- processing instr. - -isBlockTag :: NamedTag (Tag a) => Tag a -> Bool +getTagName :: Tag Text -> Maybe Text +getTagName (TagOpen t _) = Just t +getTagName (TagClose t) = Just t +getTagName _ = Nothing + +isInlineTag :: Tag Text -> Bool +isInlineTag t = isCommentTag t || case t of + TagOpen "script" _ -> "math/tex" `T.isPrefixOf` fromAttrib "type" t + TagClose "script" -> True + TagOpen name _ -> isInlineTagName name + TagClose name -> isInlineTagName name + _ -> False + where isInlineTagName x = + x `Set.notMember` blockTags || + T.take 1 x == "?" -- processing instr. + +isBlockTag :: Tag Text -> Bool isBlockTag t = isBlockTagName || isTagComment t where isBlockTagName = case getTagName t of @@ -899,10 +967,10 @@ isBlockTag t = isBlockTagName || isTagComment t || x `Set.member` eitherBlockOrInline Nothing -> False -isTextTag :: Tag a -> Bool +isTextTag :: Tag Text -> Bool isTextTag = tagText (const True) -isCommentTag :: Tag a -> Bool +isCommentTag :: Tag Text -> Bool isCommentTag = tagComment (const True) --- parsers for use in markdown, textile readers @@ -910,13 +978,14 @@ isCommentTag = tagComment (const True) -- | Matches a stretch of HTML in balanced tags. htmlInBalanced :: Monad m => (Tag Text -> Bool) - -> ParserT Text st m Text + -> ParserT Sources st m Text htmlInBalanced f = try $ do lookAhead (char '<') - inp <- getInput - let ts = canonicalizeTags $ - parseTagsOptions parseOptions{ optTagWarning = True, - optTagPosition = True } inp + sources <- getInput + let ts = canonicalizeTags + $ parseTagsOptions parseOptions{ optTagWarning = True, + optTagPosition = True } + $ sourcesToText sources case ts of (TagPosition sr sc : t@(TagOpen tn _) : rest) -> do guard $ f t @@ -951,22 +1020,24 @@ htmlInBalanced' tagname ts = fromMaybe [] $ go 0 ts go n (t:ts') = (t :) <$> go n ts' go _ [] = mzero -hasTagWarning :: [Tag a] -> Bool +hasTagWarning :: [Tag Text] -> Bool hasTagWarning (TagWarning _:_) = True hasTagWarning _ = False -- | Matches a tag meeting a certain condition. htmlTag :: (HasReaderOptions st, Monad m) => (Tag Text -> Bool) - -> ParserT Text st m (Tag Text, Text) + -> ParserT Sources st m (Tag Text, Text) htmlTag f = try $ do lookAhead (char '<') startpos <- getPosition - inp <- getInput + sources <- getInput + let inp = sourcesToText sources let ts = canonicalizeTags $ parseTagsOptions parseOptions{ optTagWarning = False , optTagPosition = True } - (inp <> " ") -- add space to ensure that + (inp <> " ") + -- add space to ensure that -- we get a TagPosition after the tag (next, ln, col) <- case ts of (TagPosition{} : next : TagPosition ln col : _) @@ -1024,21 +1095,6 @@ htmlTag f = try $ do handleTag tagname _ -> mzero --- Strip namespace prefixes -stripPrefixes :: [Tag Text] -> [Tag Text] -stripPrefixes = map stripPrefix - -stripPrefix :: Tag Text -> Tag Text -stripPrefix (TagOpen s as) = - TagOpen (stripPrefix' s) (map (first stripPrefix') as) -stripPrefix (TagClose s) = TagClose (stripPrefix' s) -stripPrefix x = x - -stripPrefix' :: Text -> Text -stripPrefix' s = - if T.null t then s else T.drop 1 t - where (_, t) = T.span (/= ':') s - -- Utilities -- | Adjusts a url according to the document's base URL. @@ -1048,26 +1104,3 @@ canonicalizeUrl url = do return $ case (parseURIReference (T.unpack url), mbBaseHref) of (Just rel, Just bs) -> tshow (rel `nonStrictRelativeTo` bs) _ -> url - --- For now we need a special version here; the one in Shared has String type -renderTags' :: [Tag Text] -> Text -renderTags' = renderTagsOptions - renderOptions{ optMinimize = matchTags ["hr", "br", "img", - "meta", "link"] - , optRawTag = matchTags ["script", "style"] } - where matchTags tags = flip elem tags . T.toLower - - --- EPUB Specific --- --- -{- - -types :: [(String, ([String], Int))] -types = -- Document divisions - map (\s -> (s, (["section", "body"], 0))) - ["volume", "part", "chapter", "division"] - <> -- Document section and components - [ - ("abstract", ([], 0))] --} |