{-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE FlexibleInstances #-} {-# LANGUAGE LambdaCase #-} {-# LANGUAGE OverloadedStrings #-} {-# LANGUAGE ViewPatterns #-} {- | Module : Text.Pandoc.Readers.HTML Copyright : Copyright (C) 2006-2021 John MacFarlane License : GNU GPL, version 2 or above Maintainer : John MacFarlane Stability : alpha Portability : portable Conversion of HTML to 'Pandoc' document. -} module Text.Pandoc.Readers.HTML ( readHtml , htmlTag , htmlInBalanced , isInlineTag , isBlockTag , isTextTag , isCommentTag ) where import Control.Applicative ((<|>)) import Control.Monad (guard, msum, mzero, unless, void) import Control.Monad.Except (throwError, catchError) import Control.Monad.Reader (ask, asks, lift, local, runReaderT) import Data.ByteString.Base64 (encode) import Data.Char (isAlphaNum, isLetter) import Data.Default (Default (..), def) import Data.Foldable (for_) import Data.List.Split (splitWhen) import Data.List (foldl') import qualified Data.Map as M import Data.Maybe (fromMaybe, isJust, isNothing) import Data.Monoid (First (..)) import qualified Data.Set as Set import Data.Text (Text) import qualified Data.Text as T import Network.URI (nonStrictRelativeTo, parseURIReference) import Text.HTML.TagSoup import Text.HTML.TagSoup.Match import Text.Pandoc.Builder (Blocks, Inlines, trimInlines) import qualified Text.Pandoc.Builder as B import Text.Pandoc.Class.PandocMonad (PandocMonad (..)) import Text.Pandoc.CSS (pickStyleAttrProps) import qualified Text.Pandoc.UTF8 as UTF8 import Text.Pandoc.Definition import Text.Pandoc.Readers.HTML.Parsing import Text.Pandoc.Readers.HTML.Table (pTable) import Text.Pandoc.Readers.HTML.TagCategories import Text.Pandoc.Readers.HTML.Types import Text.Pandoc.Readers.LaTeX (rawLaTeXInline) import Text.Pandoc.Error import Text.Pandoc.Logging import Text.Pandoc.Options ( Extension (Ext_epub_html_exts, Ext_empty_paragraphs, Ext_native_divs, Ext_native_spans, Ext_raw_html, Ext_line_blocks, Ext_raw_tex), ReaderOptions (readerExtensions, readerStripComments), extensionEnabled) import Text.Pandoc.Parsing hiding ((<|>)) import Text.Pandoc.Shared ( addMetaField, blocksToInlines', escapeURI, extractSpaces, htmlSpanLikeElements, renderTags', safeRead, tshow) import Text.Pandoc.Walk import Text.Parsec.Error import Text.TeXMath (readMathML, writeTeX) -- | Convert HTML-formatted string to 'Pandoc' document. readHtml :: (PandocMonad m, ToSources a) => ReaderOptions -- ^ Reader options -> a -- ^ Input to parse -> m Pandoc readHtml opts inp = do let tags = stripPrefixes $ canonicalizeTags $ parseTagsOptions parseOptions{ optTagPosition = True } (sourcesToText $ toSources inp) parseDoc = do blocks <- fixPlains False . mconcat <$> manyTill block eof meta <- stateMeta . parserState <$> getState bs' <- replaceNotes (B.toList blocks) reportLogMessages return $ Pandoc meta bs' getError (errorMessages -> ms) = case ms of [] -> "" (m:_) -> messageString m result <- flip runReaderT def $ runParserT parseDoc (HTMLState def{ stateOptions = opts } [] Nothing Set.empty [] M.empty opts) "source" tags case result of Right doc -> return doc Left err -> throwError $ PandocParseError $ T.pack $ getError err -- Strip namespace prefixes on tags (not attributes) stripPrefixes :: [Tag Text] -> [Tag Text] stripPrefixes = map stripPrefix stripPrefix :: Tag Text -> Tag Text stripPrefix (TagOpen s as) = TagOpen (T.takeWhileEnd (/=':') s) as stripPrefix (TagClose s) = TagClose (T.takeWhileEnd (/=':') s) stripPrefix x = x replaceNotes :: PandocMonad m => [Block] -> TagParser m [Block] replaceNotes bs = do st <- getState return $ walk (replaceNotes' (noteTable st)) bs replaceNotes' :: [(Text, Blocks)] -> Inline -> Inline replaceNotes' noteTbl (RawInline (Format "noteref") ref) = maybe (Str "") (Note . B.toList) $ lookup ref noteTbl replaceNotes' _ x = x setInChapter :: PandocMonad m => HTMLParser m s a -> HTMLParser m s a setInChapter = local (\s -> s {inChapter = True}) setInPlain :: PandocMonad m => HTMLParser m s a -> HTMLParser m s a setInPlain = local (\s -> s {inPlain = True}) pHtml :: PandocMonad m => TagParser m Blocks pHtml = do (TagOpen "html" attr) <- lookAhead pAny for_ (lookup "lang" attr <|> lookup "xml:lang" attr) $ updateState . B.setMeta "lang" . B.text pInTags "html" block pBody :: PandocMonad m => TagParser m Blocks pBody = do (TagOpen "body" attr) <- lookAhead pAny for_ (lookup "lang" attr <|> lookup "xml:lang" attr) $ updateState . B.setMeta "lang" . B.text pInTags "body" block pHead :: PandocMonad m => TagParser m Blocks pHead = pInTags "head" $ pTitle <|> pMetaTag <|> pBaseTag <|> (mempty <$ pAny) where pTitle = pInTags "title" inline >>= setTitle . trimInlines setTitle t = mempty <$ updateState (B.setMeta "title" t) pMetaTag = do mt <- pSatisfy (matchTagOpen "meta" []) let name = fromAttrib "name" mt if T.null name then return mempty else do let content = fromAttrib "content" mt updateState $ \s -> let ps = parserState s in s{ parserState = ps{ stateMeta = addMetaField name (B.text content) (stateMeta ps) } } return mempty pBaseTag = do bt <- pSatisfy (matchTagOpen "base" []) updateState $ \st -> st{ baseHref = parseURIReference $ T.unpack $ fromAttrib "href" bt } return mempty block :: PandocMonad m => TagParser m Blocks block = ((do tag <- lookAhead (pSatisfy isBlockTag) exts <- getOption readerExtensions case tag of TagOpen name attr -> let type' = fromMaybe "" $ lookup "type" attr <|> lookup "epub:type" attr epubExts = extensionEnabled Ext_epub_html_exts exts in case name of _ | name `elem` sectioningContent , epubExts , "chapter" `T.isInfixOf` type' -> eSection _ | epubExts , type' `elem` ["footnote", "rearnote"] -> mempty <$ eFootnote _ | epubExts , type' == "toc" -> mempty <$ eTOC _ | "titlepage" `T.isInfixOf` type' , name `elem` ("section" : groupingContent) -> mempty <$ eTitlePage "p" -> pPara "h1" -> pHeader "h2" -> pHeader "h3" -> pHeader "h4" -> pHeader "h5" -> pHeader "h6" -> pHeader "blockquote" -> pBlockQuote "pre" -> pCodeBlock "ul" -> pBulletList "ol" -> pOrderedList "dl" -> pDefinitionList "table" -> pTable block "hr" -> pHrule "html" -> pHtml "head" -> pHead "body" -> pBody "div" | extensionEnabled Ext_line_blocks exts , Just "line-block" <- lookup "class" attr -> pLineBlock | otherwise -> pDiv "section" -> pDiv "main" -> pDiv "figure" -> pFigure "iframe" -> pIframe "style" -> pRawHtmlBlock "textarea" -> pRawHtmlBlock "switch" | epubExts -> eSwitch B.para block _ -> mzero _ -> mzero) <|> pPlain <|> pRawHtmlBlock) >>= \res -> res <$ trace (T.take 60 $ tshow $ B.toList res) namespaces :: PandocMonad m => [(Text, TagParser m Inlines)] namespaces = [(mathMLNamespace, pMath True)] mathMLNamespace :: Text mathMLNamespace = "http://www.w3.org/1998/Math/MathML" eSwitch :: (PandocMonad m, Monoid a) => (Inlines -> a) -> TagParser m a -> TagParser m a eSwitch constructor parser = try $ do guardEnabled Ext_epub_html_exts pSatisfy (matchTagOpen "switch" []) cases <- getFirst . mconcat <$> manyTill (First <$> (eCase <* skipMany pBlank) ) (lookAhead $ try $ pSatisfy (matchTagOpen "default" [])) skipMany pBlank fallback <- pInTags "default" (skipMany pBlank *> parser <* skipMany pBlank) skipMany pBlank pSatisfy (matchTagClose "switch") return $ maybe fallback constructor cases eCase :: PandocMonad m => TagParser m (Maybe Inlines) eCase = do skipMany pBlank TagOpen _ attr' <- lookAhead $ pSatisfy (matchTagOpen "case" []) let attr = toStringAttr attr' case flip lookup namespaces =<< lookup "required-namespace" attr of Just p -> Just <$> pInTags "case" (skipMany pBlank *> p <* skipMany pBlank) Nothing -> Nothing <$ manyTill pAny (pSatisfy (matchTagClose "case")) eFootnote :: PandocMonad m => TagParser m () eFootnote = try $ do let notes = ["footnote", "rearnote"] guardEnabled Ext_epub_html_exts (TagOpen tag attr') <- lookAhead pAny let attr = toStringAttr attr' guard $ maybe False (`elem` notes) (lookup "type" attr <|> lookup "epub:type" attr) let ident = fromMaybe "" (lookup "id" attr) content <- pInTags tag block addNote ident content addNote :: PandocMonad m => Text -> Blocks -> TagParser m () addNote uid cont = updateState (\s -> s {noteTable = (uid, cont) : noteTable s}) eNoteref :: PandocMonad m => TagParser m Inlines eNoteref = try $ do guardEnabled Ext_epub_html_exts TagOpen tag attr <- pSatisfy (\case TagOpen _ as -> (lookup "type" as <|> lookup "epub:type" as) == Just "noteref" _ -> False) ident <- case lookup "href" attr >>= T.uncons of Just ('#', rest) -> return rest _ -> mzero _ <- manyTill pAny (pSatisfy (\case TagClose t -> t == tag _ -> False)) return $ B.rawInline "noteref" ident -- Strip TOC if there is one, better to generate again eTOC :: PandocMonad m => TagParser m () eTOC = try $ do guardEnabled Ext_epub_html_exts (TagOpen tag attr) <- lookAhead pAny guard $ (lookup "type" attr <|> lookup "epub:type" attr) == Just "toc" void (pInTags tag block) pBulletList :: PandocMonad m => TagParser m Blocks pBulletList = try $ do pSatisfy (matchTagOpen "ul" []) let nonItem = pSatisfy (\t -> not (tagOpen (`elem` ["li","ol","ul","dl"]) (const True) t) && not (matchTagClose "ul" t)) -- note: if they have an
    or