diff options
Diffstat (limited to 'src/Text/Pandoc/Readers/HTML.hs')
-rw-r--r-- | src/Text/Pandoc/Readers/HTML.hs | 1136 |
1 files changed, 0 insertions, 1136 deletions
diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs deleted file mode 100644 index f02f1a1d4..000000000 --- a/src/Text/Pandoc/Readers/HTML.hs +++ /dev/null @@ -1,1136 +0,0 @@ -{-# LANGUAGE FlexibleContexts, FlexibleInstances, MultiParamTypeClasses, -ViewPatterns#-} -{- -Copyright (C) 2006-2015 John MacFarlane <jgm@berkeley.edu> - -This program is free software; you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation; either version 2 of the License, or -(at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA --} - -{- | - Module : Text.Pandoc.Readers.HTML - Copyright : Copyright (C) 2006-2015 John MacFarlane - License : GNU GPL, version 2 or above - - Maintainer : John MacFarlane <jgm@berkeley.edu> - Stability : alpha - Portability : portable - -Conversion of HTML to 'Pandoc' document. --} -module Text.Pandoc.Readers.HTML ( readHtml - , htmlTag - , htmlInBalanced - , isInlineTag - , isBlockTag - , isTextTag - , isCommentTag - ) where - -import Text.HTML.TagSoup -import Text.HTML.TagSoup.Match -import Text.Pandoc.Definition -import qualified Text.Pandoc.Builder as B -import Text.Pandoc.Builder (Blocks, Inlines, trimInlines, HasMeta(..)) -import Text.Pandoc.Shared ( extractSpaces, renderTags', addMetaField - , escapeURI, safeRead ) -import Text.Pandoc.Options (ReaderOptions(readerExtensions), extensionEnabled, - Extension (Ext_epub_html_exts, - Ext_raw_html, Ext_native_divs, Ext_native_spans)) -import Text.Pandoc.Logging -import Text.Pandoc.Parsing hiding ((<|>)) -import Text.Pandoc.Walk -import qualified Data.Map as M -import Data.Maybe ( fromMaybe, isJust) -import Data.List ( intercalate, isInfixOf, isPrefixOf ) -import Data.Char ( isDigit ) -import Control.Monad ( guard, mzero, void, unless ) -import Control.Arrow ((***)) -import Control.Applicative ( (<|>) ) -import Data.Monoid (First (..)) -import Text.TeXMath (readMathML, writeTeX) -import Data.Default (Default (..), def) -import Control.Monad.Reader (ask, asks, local, ReaderT, runReaderT, lift) -import Network.URI (URI, parseURIReference, nonStrictRelativeTo) -import Text.Pandoc.CSS (foldOrElse, pickStyleAttrProps) -import Data.Monoid ((<>)) -import Text.Parsec.Error -import qualified Data.Set as Set -import Text.Pandoc.Error -import Text.Pandoc.Class (PandocMonad, report) -import Control.Monad.Except (throwError) - --- | Convert HTML-formatted string to 'Pandoc' document. -readHtml :: PandocMonad m - => ReaderOptions -- ^ Reader options - -> String -- ^ String to parse (assumes @'\n'@ line endings) - -> m Pandoc -readHtml opts inp = do - let tags = stripPrefixes . canonicalizeTags $ - parseTagsOptions parseOptions{ optTagPosition = True } inp - parseDoc = do - blocks <- (fixPlains False) . mconcat <$> manyTill block eof - meta <- stateMeta . parserState <$> getState - bs' <- replaceNotes (B.toList blocks) - return $ Pandoc meta bs' - getError (errorMessages -> ms) = case ms of - [] -> "" - (m:_) -> messageString m - result <- flip runReaderT def $ - runParserT parseDoc - (HTMLState def{ stateOptions = opts } [] Nothing Set.empty M.empty) - "source" tags - case result of - Right doc -> return doc - Left err -> throwError $ PandocParseError $ getError err - -replaceNotes :: PandocMonad m => [Block] -> TagParser m [Block] -replaceNotes = walkM replaceNotes' - -replaceNotes' :: PandocMonad m => Inline -> TagParser m Inline -replaceNotes' (RawInline (Format "noteref") ref) = maybe (Str "") (Note . B.toList) . lookup ref <$> getNotes - where - getNotes = noteTable <$> getState -replaceNotes' x = return x - -data HTMLState = - HTMLState - { parserState :: ParserState, - noteTable :: [(String, Blocks)], - baseHref :: Maybe URI, - identifiers :: Set.Set String, - headerMap :: M.Map Inlines String - } - -data HTMLLocal = HTMLLocal { quoteContext :: QuoteContext - , inChapter :: Bool -- ^ Set if in chapter section - , inPlain :: Bool -- ^ Set if in pPlain - } - -setInChapter :: PandocMonad m => HTMLParser m s a -> HTMLParser m s a -setInChapter = local (\s -> s {inChapter = True}) - -setInPlain :: PandocMonad m => HTMLParser m s a -> HTMLParser m s a -setInPlain = local (\s -> s {inPlain = True}) - -type HTMLParser m s = ParserT s HTMLState (ReaderT HTMLLocal m) - -type TagParser m = HTMLParser m [Tag String] - -pBody :: PandocMonad m => TagParser m Blocks -pBody = pInTags "body" block - -pHead :: PandocMonad m => TagParser m Blocks -pHead = pInTags "head" $ pTitle <|> pMetaTag <|> pBaseTag <|> (mempty <$ pAnyTag) - where pTitle = pInTags "title" inline >>= setTitle . trimInlines - setTitle t = mempty <$ (updateState $ B.setMeta "title" t) - pMetaTag = do - mt <- pSatisfy (~== TagOpen "meta" []) - let name = fromAttrib "name" mt - if null name - then return mempty - else do - let content = fromAttrib "content" mt - updateState $ \s -> - let ps = parserState s in - s{ parserState = ps{ - stateMeta = addMetaField name (B.text content) - (stateMeta ps) } } - return mempty - pBaseTag = do - bt <- pSatisfy (~== TagOpen "base" []) - updateState $ \st -> st{ baseHref = - parseURIReference $ fromAttrib "href" bt } - return mempty - -block :: PandocMonad m => TagParser m Blocks -block = do - pos <- getPosition - res <- choice - [ eSection - , eSwitch B.para block - , mempty <$ eFootnote - , mempty <$ eTOC - , mempty <$ eTitlePage - , pPara - , pHeader - , pBlockQuote - , pCodeBlock - , pList - , pHrule - , pTable - , pHead - , pBody - , pDiv - , pPlain - , pRawHtmlBlock - ] - report $ ParsingTrace (take 60 $ show $ B.toList res) pos - return res - -namespaces :: PandocMonad m => [(String, TagParser m Inlines)] -namespaces = [(mathMLNamespace, pMath True)] - -mathMLNamespace :: String -mathMLNamespace = "http://www.w3.org/1998/Math/MathML" - -eSwitch :: (PandocMonad m, Monoid a) - => (Inlines -> a) - -> TagParser m a - -> TagParser m a -eSwitch constructor parser = try $ do - guardEnabled Ext_epub_html_exts - pSatisfy (~== TagOpen "switch" []) - cases <- getFirst . mconcat <$> - manyTill (First <$> (eCase <* skipMany pBlank) ) - (lookAhead $ try $ pSatisfy (~== TagOpen "default" [])) - skipMany pBlank - fallback <- pInTags "default" (skipMany pBlank *> parser <* skipMany pBlank) - skipMany pBlank - pSatisfy (~== TagClose "switch") - return $ maybe fallback constructor cases - -eCase :: PandocMonad m => TagParser m (Maybe Inlines) -eCase = do - skipMany pBlank - TagOpen _ attr <- lookAhead $ pSatisfy $ (~== TagOpen "case" []) - case (flip lookup namespaces) =<< lookup "required-namespace" attr of - Just p -> Just <$> (pInTags "case" (skipMany pBlank *> p <* skipMany pBlank)) - Nothing -> Nothing <$ manyTill pAnyTag (pSatisfy (~== TagClose "case")) - -eFootnote :: PandocMonad m => TagParser m () -eFootnote = try $ do - let notes = ["footnote", "rearnote"] - guardEnabled Ext_epub_html_exts - (TagOpen tag attr) <- lookAhead $ pAnyTag - guard (maybe False (flip elem notes) (lookup "type" attr)) - let ident = fromMaybe "" (lookup "id" attr) - content <- pInTags tag block - addNote ident content - -addNote :: PandocMonad m => String -> Blocks -> TagParser m () -addNote uid cont = updateState (\s -> s {noteTable = (uid, cont) : (noteTable s)}) - -eNoteref :: PandocMonad m => TagParser m Inlines -eNoteref = try $ do - guardEnabled Ext_epub_html_exts - TagOpen tag attr <- lookAhead $ pAnyTag - guard (maybe False (== "noteref") (lookup "type" attr)) - let ident = maybe "" (dropWhile (== '#')) (lookup "href" attr) - guard (not (null ident)) - pInTags tag block - return $ B.rawInline "noteref" ident - --- Strip TOC if there is one, better to generate again -eTOC :: PandocMonad m => TagParser m () -eTOC = try $ do - guardEnabled Ext_epub_html_exts - (TagOpen tag attr) <- lookAhead $ pAnyTag - guard (maybe False (== "toc") (lookup "type" attr)) - void (pInTags tag block) - -pList :: PandocMonad m => TagParser m Blocks -pList = pBulletList <|> pOrderedList <|> pDefinitionList - -pBulletList :: PandocMonad m => TagParser m Blocks -pBulletList = try $ do - pSatisfy (~== TagOpen "ul" []) - let nonItem = pSatisfy (\t -> - not (tagOpen (`elem` ["li","ol","ul","dl"]) (const True) t) && - not (t ~== TagClose "ul")) - -- note: if they have an <ol> or <ul> not in scope of a <li>, - -- treat it as a list item, though it's not valid xhtml... - skipMany nonItem - items <- manyTill (pListItem nonItem) (pCloses "ul") - return $ B.bulletList $ map (fixPlains True) items - -pListItem :: PandocMonad m => TagParser m a -> TagParser m Blocks -pListItem nonItem = do - TagOpen _ attr <- lookAhead $ pSatisfy (~== TagOpen "li" []) - let liDiv = maybe mempty (\x -> B.divWith (x, [], []) mempty) (lookup "id" attr) - (liDiv <>) <$> pInTags "li" block <* skipMany nonItem - -parseListStyleType :: String -> ListNumberStyle -parseListStyleType "lower-roman" = LowerRoman -parseListStyleType "upper-roman" = UpperRoman -parseListStyleType "lower-alpha" = LowerAlpha -parseListStyleType "upper-alpha" = UpperAlpha -parseListStyleType "decimal" = Decimal -parseListStyleType _ = DefaultStyle - -parseTypeAttr :: String -> ListNumberStyle -parseTypeAttr "i" = LowerRoman -parseTypeAttr "I" = UpperRoman -parseTypeAttr "a" = LowerAlpha -parseTypeAttr "A" = UpperAlpha -parseTypeAttr "1" = Decimal -parseTypeAttr _ = DefaultStyle - -pOrderedList :: PandocMonad m => TagParser m Blocks -pOrderedList = try $ do - TagOpen _ attribs <- pSatisfy (~== TagOpen "ol" []) - let (start, style) = (sta', sty') - where sta = fromMaybe "1" $ - lookup "start" attribs - sta' = if all isDigit sta - then read sta - else 1 - - pickListStyle = pickStyleAttrProps ["list-style-type", "list-style"] - - typeAttr = fromMaybe "" $ lookup "type" attribs - classAttr = fromMaybe "" $ lookup "class" attribs - styleAttr = fromMaybe "" $ lookup "style" attribs - listStyle = fromMaybe "" $ pickListStyle styleAttr - - sty' = foldOrElse DefaultStyle - [ parseTypeAttr typeAttr - , parseListStyleType classAttr - , parseListStyleType listStyle - ] - let nonItem = pSatisfy (\t -> - not (tagOpen (`elem` ["li","ol","ul","dl"]) (const True) t) && - not (t ~== TagClose "ol")) - -- note: if they have an <ol> or <ul> not in scope of a <li>, - -- treat it as a list item, though it's not valid xhtml... - skipMany nonItem - items <- manyTill (pListItem nonItem) (pCloses "ol") - return $ B.orderedListWith (start, style, DefaultDelim) $ map (fixPlains True) items - -pDefinitionList :: PandocMonad m => TagParser m Blocks -pDefinitionList = try $ do - pSatisfy (~== TagOpen "dl" []) - items <- manyTill pDefListItem (pCloses "dl") - return $ B.definitionList items - -pDefListItem :: PandocMonad m => TagParser m (Inlines, [Blocks]) -pDefListItem = try $ do - let nonItem = pSatisfy (\t -> not (t ~== TagOpen "dt" []) && - not (t ~== TagOpen "dd" []) && not (t ~== TagClose "dl")) - terms <- many1 (try $ skipMany nonItem >> pInTags "dt" inline) - defs <- many1 (try $ skipMany nonItem >> pInTags "dd" block) - skipMany nonItem - let term = foldl1 (\x y -> x <> B.linebreak <> y) terms - return (term, map (fixPlains True) defs) - -fixPlains :: Bool -> Blocks -> Blocks -fixPlains inList bs = if any isParaish bs' - then B.fromList $ map plainToPara bs' - else bs - where isParaish (Para _) = True - isParaish (CodeBlock _ _) = True - isParaish (Header _ _ _) = True - isParaish (BlockQuote _) = True - isParaish (BulletList _) = not inList - isParaish (OrderedList _ _) = not inList - isParaish (DefinitionList _) = not inList - isParaish _ = False - plainToPara (Plain xs) = Para xs - plainToPara x = x - bs' = B.toList bs - -pRawTag :: PandocMonad m => TagParser m String -pRawTag = do - tag <- pAnyTag - let ignorable x = x `elem` ["html","head","body","!DOCTYPE","?xml"] - if tagOpen ignorable (const True) tag || tagClose ignorable tag - then return [] - else return $ renderTags' [tag] - -pDiv :: PandocMonad m => TagParser m Blocks -pDiv = try $ do - guardEnabled Ext_native_divs - let isDivLike "div" = True - isDivLike "section" = True - isDivLike _ = False - TagOpen tag attr <- lookAhead $ pSatisfy $ tagOpen isDivLike (const True) - contents <- pInTags tag block - let (ident, classes, kvs) = mkAttr attr - let classes' = if tag == "section" - then "section":classes - else classes - return $ B.divWith (ident, classes', kvs) contents - -pRawHtmlBlock :: PandocMonad m => TagParser m Blocks -pRawHtmlBlock = do - raw <- pHtmlBlock "script" <|> pHtmlBlock "style" <|> pRawTag - exts <- getOption readerExtensions - if extensionEnabled Ext_raw_html exts && not (null raw) - then return $ B.rawBlock "html" raw - else ignore raw - -ignore :: (Monoid a, PandocMonad m) => String -> TagParser m a -ignore raw = do - pos <- getPosition - -- raw can be null for tags like <!DOCTYPE>; see paRawTag - -- in this case we don't want a warning: - unless (null raw) $ - report $ SkippedContent raw pos - return mempty - -pHtmlBlock :: PandocMonad m => String -> TagParser m String -pHtmlBlock t = try $ do - open <- pSatisfy (~== TagOpen t []) - contents <- manyTill pAnyTag (pSatisfy (~== TagClose t)) - return $ renderTags' $ [open] ++ contents ++ [TagClose t] - --- Sets chapter context -eSection :: PandocMonad m => TagParser m Blocks -eSection = try $ do - let matchChapter as = maybe False (isInfixOf "chapter") (lookup "type" as) - let sectTag = tagOpen (`elem` sectioningContent) matchChapter - TagOpen tag _ <- lookAhead $ pSatisfy sectTag - setInChapter (pInTags tag block) - -headerLevel :: PandocMonad m => String -> TagParser m Int -headerLevel tagtype = do - let level = read (drop 1 tagtype) - (try $ do - guardEnabled Ext_epub_html_exts - asks inChapter >>= guard - return (level - 1)) - <|> - return level - -eTitlePage :: PandocMonad m => TagParser m () -eTitlePage = try $ do - let isTitlePage as = maybe False (isInfixOf "titlepage") (lookup "type" as) - let groupTag = tagOpen (\x -> x `elem` groupingContent || x == "section") - isTitlePage - TagOpen tag _ <- lookAhead $ pSatisfy groupTag - () <$ pInTags tag block - -pHeader :: PandocMonad m => TagParser m Blocks -pHeader = try $ do - TagOpen tagtype attr <- pSatisfy $ - tagOpen (`elem` ["h1","h2","h3","h4","h5","h6"]) - (const True) - let bodyTitle = TagOpen tagtype attr ~== TagOpen "h1" [("class","title")] - level <- headerLevel tagtype - contents <- trimInlines . mconcat <$> manyTill inline (pCloses tagtype <|> eof) - let ident = fromMaybe "" $ lookup "id" attr - let classes = maybe [] words $ lookup "class" attr - let keyvals = [(k,v) | (k,v) <- attr, k /= "class", k /= "id"] - attr' <- registerHeader (ident, classes, keyvals) contents - return $ if bodyTitle - then mempty -- skip a representation of the title in the body - else B.headerWith attr' level contents - -pHrule :: PandocMonad m => TagParser m Blocks -pHrule = do - pSelfClosing (=="hr") (const True) - return B.horizontalRule - -pTable :: PandocMonad m => TagParser m Blocks -pTable = try $ do - TagOpen _ _ <- pSatisfy (~== TagOpen "table" []) - skipMany pBlank - caption <- option mempty $ pInTags "caption" inline <* skipMany pBlank - widths' <- (mconcat <$> many1 pColgroup) <|> many pCol - let pTh = option [] $ pInTags "tr" (pCell "th") - pTr = try $ skipMany pBlank >> pInTags "tr" (pCell "td" <|> pCell "th") - pTBody = do pOptInTag "tbody" $ many1 pTr - head'' <- pOptInTag "thead" pTh - head' <- pOptInTag "tbody" $ do - if null head'' - then pTh - else return head'' - rowsLs <- many pTBody - rows' <- pOptInTag "tfoot" $ many pTr - TagClose _ <- pSatisfy (~== TagClose "table") - let rows'' = (concat rowsLs) ++ rows' - -- fail on empty table - guard $ not $ null head' && null rows'' - let isSinglePlain x = case B.toList x of - [] -> True - [Plain _] -> True - _ -> False - let isSimple = all isSinglePlain $ concat (head':rows'') - let cols = length $ if null head' then head rows'' else head' - -- add empty cells to short rows - let addEmpties r = case cols - length r of - n | n > 0 -> r ++ replicate n mempty - | otherwise -> r - let rows = map addEmpties rows'' - let aligns = replicate cols AlignDefault - let widths = if null widths' - then if isSimple - then replicate cols 0 - else replicate cols (1.0 / fromIntegral cols) - else widths' - return $ B.table caption (zip aligns widths) head' rows - -pCol :: PandocMonad m => TagParser m Double -pCol = try $ do - TagOpen _ attribs <- pSatisfy (~== TagOpen "col" []) - skipMany pBlank - optional $ pSatisfy (~== TagClose "col") - skipMany pBlank - return $ case lookup "width" attribs of - Nothing -> case lookup "style" attribs of - Just ('w':'i':'d':'t':'h':':':xs) | '%' `elem` xs -> - fromMaybe 0.0 $ safeRead ('0':'.':filter - (`notElem` " \t\r\n%'\";") xs) - _ -> 0.0 - Just x | not (null x) && last x == '%' -> - fromMaybe 0.0 $ safeRead ('0':'.':init x) - _ -> 0.0 - -pColgroup :: PandocMonad m => TagParser m [Double] -pColgroup = try $ do - pSatisfy (~== TagOpen "colgroup" []) - skipMany pBlank - manyTill pCol (pCloses "colgroup" <|> eof) <* skipMany pBlank - -noColOrRowSpans :: Tag String -> Bool -noColOrRowSpans t = isNullOrOne "colspan" && isNullOrOne "rowspan" - where isNullOrOne x = case fromAttrib x t of - "" -> True - "1" -> True - _ -> False - -pCell :: PandocMonad m => String -> TagParser m [Blocks] -pCell celltype = try $ do - skipMany pBlank - res <- pInTags' celltype noColOrRowSpans block - skipMany pBlank - return [res] - -pBlockQuote :: PandocMonad m => TagParser m Blocks -pBlockQuote = do - contents <- pInTags "blockquote" block - return $ B.blockQuote $ fixPlains False contents - -pPlain :: PandocMonad m => TagParser m Blocks -pPlain = do - contents <- setInPlain $ trimInlines . mconcat <$> many1 inline - if B.isNull contents - then return mempty - else return $ B.plain contents - -pPara :: PandocMonad m => TagParser m Blocks -pPara = do - contents <- trimInlines <$> pInTags "p" inline - return $ B.para contents - -pCodeBlock :: PandocMonad m => TagParser m Blocks -pCodeBlock = try $ do - TagOpen _ attr <- pSatisfy (~== TagOpen "pre" []) - contents <- manyTill pAnyTag (pCloses "pre" <|> eof) - let rawText = concatMap tagToString contents - -- drop leading newline if any - let result' = case rawText of - '\n':xs -> xs - _ -> rawText - -- drop trailing newline if any - let result = case reverse result' of - '\n':_ -> init result' - _ -> result' - return $ B.codeBlockWith (mkAttr attr) result - -tagToString :: Tag String -> String -tagToString (TagText s) = s -tagToString (TagOpen "br" _) = "\n" -tagToString _ = "" - -inline :: PandocMonad m => TagParser m Inlines -inline = choice - [ eNoteref - , eSwitch id inline - , pTagText - , pQ - , pEmph - , pStrong - , pSuperscript - , pSubscript - , pStrikeout - , pLineBreak - , pLink - , pImage - , pCode - , pSpan - , pMath False - , pRawHtmlInline - ] - -pLocation :: PandocMonad m => TagParser m () -pLocation = do - (TagPosition r c) <- pSat isTagPosition - setPosition $ newPos "input" r c - -pSat :: PandocMonad m => (Tag String -> Bool) -> TagParser m (Tag String) -pSat f = do - pos <- getPosition - token show (const pos) (\x -> if f x then Just x else Nothing) - -pSatisfy :: PandocMonad m => (Tag String -> Bool) -> TagParser m (Tag String) -pSatisfy f = try $ optional pLocation >> pSat f - -pAnyTag :: PandocMonad m => TagParser m (Tag String) -pAnyTag = pSatisfy (const True) - -pSelfClosing :: PandocMonad m - => (String -> Bool) -> ([Attribute String] -> Bool) - -> TagParser m (Tag String) -pSelfClosing f g = do - open <- pSatisfy (tagOpen f g) - optional $ pSatisfy (tagClose f) - return open - -pQ :: PandocMonad m => TagParser m Inlines -pQ = do - context <- asks quoteContext - let quoteType = case context of - InDoubleQuote -> SingleQuote - _ -> DoubleQuote - let innerQuoteContext = if quoteType == SingleQuote - then InSingleQuote - else InDoubleQuote - let constructor = case quoteType of - SingleQuote -> B.singleQuoted - DoubleQuote -> B.doubleQuoted - withQuoteContext innerQuoteContext $ - pInlinesInTags "q" constructor - -pEmph :: PandocMonad m => TagParser m Inlines -pEmph = pInlinesInTags "em" B.emph <|> pInlinesInTags "i" B.emph - -pStrong :: PandocMonad m => TagParser m Inlines -pStrong = pInlinesInTags "strong" B.strong <|> pInlinesInTags "b" B.strong - -pSuperscript :: PandocMonad m => TagParser m Inlines -pSuperscript = pInlinesInTags "sup" B.superscript - -pSubscript :: PandocMonad m => TagParser m Inlines -pSubscript = pInlinesInTags "sub" B.subscript - -pStrikeout :: PandocMonad m => TagParser m Inlines -pStrikeout = do - pInlinesInTags "s" B.strikeout <|> - pInlinesInTags "strike" B.strikeout <|> - pInlinesInTags "del" B.strikeout <|> - try (do pSatisfy (~== TagOpen "span" [("class","strikeout")]) - contents <- mconcat <$> manyTill inline (pCloses "span") - return $ B.strikeout contents) - -pLineBreak :: PandocMonad m => TagParser m Inlines -pLineBreak = do - pSelfClosing (=="br") (const True) - return B.linebreak - --- Unlike fromAttrib from tagsoup, this distinguishes --- between a missing attribute and an attribute with empty content. -maybeFromAttrib :: String -> Tag String -> Maybe String -maybeFromAttrib name (TagOpen _ attrs) = lookup name attrs -maybeFromAttrib _ _ = Nothing - -pLink :: PandocMonad m => TagParser m Inlines -pLink = try $ do - tag <- pSatisfy $ tagOpenLit "a" (const True) - let title = fromAttrib "title" tag - -- take id from id attribute if present, otherwise name - let uid = maybe (fromAttrib "name" tag) id $ maybeFromAttrib "id" tag - let cls = words $ fromAttrib "class" tag - lab <- trimInlines . mconcat <$> manyTill inline (pCloses "a") - -- check for href; if href, then a link, otherwise a span - case maybeFromAttrib "href" tag of - Nothing -> - return $ B.spanWith (uid, cls, []) lab - Just url' -> do - mbBaseHref <- baseHref <$> getState - let url = case (parseURIReference url', mbBaseHref) of - (Just rel, Just bs) -> - show (rel `nonStrictRelativeTo` bs) - _ -> url' - return $ B.linkWith (uid, cls, []) (escapeURI url) title lab - -pImage :: PandocMonad m => TagParser m Inlines -pImage = do - tag <- pSelfClosing (=="img") (isJust . lookup "src") - mbBaseHref <- baseHref <$> getState - let url' = fromAttrib "src" tag - let url = case (parseURIReference url', mbBaseHref) of - (Just rel, Just bs) -> show (rel `nonStrictRelativeTo` bs) - _ -> url' - let title = fromAttrib "title" tag - let alt = fromAttrib "alt" tag - let uid = fromAttrib "id" tag - let cls = words $ fromAttrib "class" tag - let getAtt k = case fromAttrib k tag of - "" -> [] - v -> [(k, v)] - let kvs = concat $ map getAtt ["width", "height", "sizes", "srcset"] - return $ B.imageWith (uid, cls, kvs) (escapeURI url) title (B.text alt) - -pCode :: PandocMonad m => TagParser m Inlines -pCode = try $ do - (TagOpen open attr) <- pSatisfy $ tagOpen (`elem` ["code","tt"]) (const True) - result <- manyTill pAnyTag (pCloses open) - return $ B.codeWith (mkAttr attr) $ intercalate " " $ lines $ innerText result - -pSpan :: PandocMonad m => TagParser m Inlines -pSpan = try $ do - guardEnabled Ext_native_spans - TagOpen _ attr <- lookAhead $ pSatisfy $ tagOpen (=="span") (const True) - contents <- pInTags "span" inline - let isSmallCaps = fontVariant == "small-caps" - where styleAttr = fromMaybe "" $ lookup "style" attr - fontVariant = fromMaybe "" $ pickStyleAttrProps ["font-variant"] styleAttr - let tag = if isSmallCaps then B.smallcaps else B.spanWith (mkAttr attr) - return $ tag contents - -pRawHtmlInline :: PandocMonad m => TagParser m Inlines -pRawHtmlInline = do - inplain <- asks inPlain - result <- pSatisfy (tagComment (const True)) - <|> if inplain - then pSatisfy (not . isBlockTag) - else pSatisfy isInlineTag - exts <- getOption readerExtensions - let raw = renderTags' [result] - if extensionEnabled Ext_raw_html exts - then return $ B.rawInline "html" raw - else ignore raw - -mathMLToTeXMath :: String -> Either String String -mathMLToTeXMath s = writeTeX <$> readMathML s - -pMath :: PandocMonad m => Bool -> TagParser m Inlines -pMath inCase = try $ do - open@(TagOpen _ attr) <- pSatisfy $ tagOpen (=="math") (const True) - -- we'll assume math tags are MathML unless specially marked - -- otherwise... - unless inCase $ - guard (maybe True (== mathMLNamespace) (lookup "xmlns" attr)) - contents <- manyTill pAnyTag (pSatisfy (~== TagClose "math")) - case mathMLToTeXMath (renderTags $ [open] ++ contents ++ [TagClose "math"]) of - Left _ -> return $ B.spanWith ("",["math"],attr) $ B.text $ - innerText contents - Right [] -> return mempty - Right x -> return $ case lookup "display" attr of - Just "block" -> B.displayMath x - _ -> B.math x - -pInlinesInTags :: PandocMonad m => String -> (Inlines -> Inlines) - -> TagParser m Inlines -pInlinesInTags tagtype f = extractSpaces f <$> pInTags tagtype inline - -pInTags :: (PandocMonad m, Monoid a) => String -> TagParser m a -> TagParser m a -pInTags tagtype parser = pInTags' tagtype (const True) parser - -pInTags' :: (PandocMonad m, Monoid a) - => String - -> (Tag String -> Bool) - -> TagParser m a - -> TagParser m a -pInTags' tagtype tagtest parser = try $ do - pSatisfy (\t -> t ~== TagOpen tagtype [] && tagtest t) - mconcat <$> manyTill parser (pCloses tagtype <|> eof) - --- parses p, preceeded by an optional opening tag --- and followed by an optional closing tags -pOptInTag :: PandocMonad m => String -> TagParser m a -> TagParser m a -pOptInTag tagtype p = try $ do - skipMany pBlank - optional $ pSatisfy (~== TagOpen tagtype []) - skipMany pBlank - x <- p - skipMany pBlank - optional $ pSatisfy (~== TagClose tagtype) - skipMany pBlank - return x - -pCloses :: PandocMonad m => String -> TagParser m () -pCloses tagtype = try $ do - t <- lookAhead $ pSatisfy $ \tag -> isTagClose tag || isTagOpen tag - case t of - (TagClose t') | t' == tagtype -> pAnyTag >> return () - (TagOpen t' _) | t' `closes` tagtype -> return () - (TagClose "ul") | tagtype == "li" -> return () - (TagClose "ol") | tagtype == "li" -> return () - (TagClose "dl") | tagtype == "dd" -> return () - (TagClose "table") | tagtype == "td" -> return () - (TagClose "table") | tagtype == "tr" -> return () - _ -> mzero - -pTagText :: PandocMonad m => TagParser m Inlines -pTagText = try $ do - (TagText str) <- pSatisfy isTagText - st <- getState - qu <- ask - parsed <- lift $ lift $ - flip runReaderT qu $ runParserT (many pTagContents) st "text" str - case parsed of - Left _ -> throwError $ PandocParseError $ "Could not parse `" ++ str ++ "'" - Right result -> return $ mconcat result - -pBlank :: PandocMonad m => TagParser m () -pBlank = try $ do - (TagText str) <- pSatisfy isTagText - guard $ all isSpace str - -type InlinesParser m = HTMLParser m String - -pTagContents :: PandocMonad m => InlinesParser m Inlines -pTagContents = - B.displayMath <$> mathDisplay - <|> B.math <$> mathInline - <|> pStr - <|> pSpace - <|> smartPunctuation pTagContents - <|> pSymbol - <|> pBad - -pStr :: PandocMonad m => InlinesParser m Inlines -pStr = do - result <- many1 $ satisfy $ \c -> - not (isSpace c) && not (isSpecial c) && not (isBad c) - updateLastStrPos - return $ B.str result - -isSpecial :: Char -> Bool -isSpecial '"' = True -isSpecial '\'' = True -isSpecial '.' = True -isSpecial '-' = True -isSpecial '$' = True -isSpecial '\8216' = True -isSpecial '\8217' = True -isSpecial '\8220' = True -isSpecial '\8221' = True -isSpecial _ = False - -pSymbol :: PandocMonad m => InlinesParser m Inlines -pSymbol = satisfy isSpecial >>= return . B.str . (:[]) - -isBad :: Char -> Bool -isBad c = c >= '\128' && c <= '\159' -- not allowed in HTML - -pBad :: PandocMonad m => InlinesParser m Inlines -pBad = do - c <- satisfy isBad - let c' = case c of - '\128' -> '\8364' - '\130' -> '\8218' - '\131' -> '\402' - '\132' -> '\8222' - '\133' -> '\8230' - '\134' -> '\8224' - '\135' -> '\8225' - '\136' -> '\710' - '\137' -> '\8240' - '\138' -> '\352' - '\139' -> '\8249' - '\140' -> '\338' - '\142' -> '\381' - '\145' -> '\8216' - '\146' -> '\8217' - '\147' -> '\8220' - '\148' -> '\8221' - '\149' -> '\8226' - '\150' -> '\8211' - '\151' -> '\8212' - '\152' -> '\732' - '\153' -> '\8482' - '\154' -> '\353' - '\155' -> '\8250' - '\156' -> '\339' - '\158' -> '\382' - '\159' -> '\376' - _ -> '?' - return $ B.str [c'] - -pSpace :: PandocMonad m => InlinesParser m Inlines -pSpace = many1 (satisfy isSpace) >>= \xs -> - if '\n' `elem` xs - then return B.softbreak - else return B.space - --- --- Constants --- - -eitherBlockOrInline :: [String] -eitherBlockOrInline = ["audio", "applet", "button", "iframe", "embed", - "del", "ins", - "progress", "map", "area", "noscript", "script", - "object", "svg", "video", "source"] - -{- -inlineHtmlTags :: [[Char]] -inlineHtmlTags = ["a", "abbr", "acronym", "b", "basefont", "bdo", "big", - "br", "cite", "code", "dfn", "em", "font", "i", "img", - "input", "kbd", "label", "q", "s", "samp", "select", - "small", "span", "strike", "strong", "sub", "sup", - "textarea", "tt", "u", "var"] --} - -blockHtmlTags :: [String] -blockHtmlTags = ["?xml", "!DOCTYPE", "address", "article", "aside", - "blockquote", "body", "button", "canvas", - "caption", "center", "col", "colgroup", "dd", "dir", "div", - "dl", "dt", "fieldset", "figcaption", "figure", - "footer", "form", "h1", "h2", "h3", "h4", - "h5", "h6", "head", "header", "hgroup", "hr", "html", - "isindex", "menu", "noframes", "ol", "output", "p", "pre", - "section", "table", "tbody", "textarea", - "thead", "tfoot", "ul", "dd", - "dt", "frameset", "li", "tbody", "td", "tfoot", - "th", "thead", "tr", "script", "style"] - --- We want to allow raw docbook in markdown documents, so we --- include docbook block tags here too. -blockDocBookTags :: [String] -blockDocBookTags = ["calloutlist", "bibliolist", "glosslist", "itemizedlist", - "orderedlist", "segmentedlist", "simplelist", - "variablelist", "caution", "important", "note", "tip", - "warning", "address", "literallayout", "programlisting", - "programlistingco", "screen", "screenco", "screenshot", - "synopsis", "example", "informalexample", "figure", - "informalfigure", "table", "informaltable", "para", - "simpara", "formalpara", "equation", "informalequation", - "figure", "screenshot", "mediaobject", "qandaset", - "procedure", "task", "cmdsynopsis", "funcsynopsis", - "classsynopsis", "blockquote", "epigraph", "msgset", - "sidebar", "title"] - -epubTags :: [String] -epubTags = ["case", "switch", "default"] - -blockTags :: [String] -blockTags = blockHtmlTags ++ blockDocBookTags ++ epubTags - -isInlineTag :: Tag String -> Bool -isInlineTag t = tagOpen isInlineTagName (const True) t || - tagClose isInlineTagName t || - tagComment (const True) t - where isInlineTagName x = x `notElem` blockTags - -isBlockTag :: Tag String -> Bool -isBlockTag t = tagOpen isBlockTagName (const True) t || - tagClose isBlockTagName t || - tagComment (const True) t - where isBlockTagName ('?':_) = True - isBlockTagName ('!':_) = True - isBlockTagName x = x `elem` blockTags - || x `elem` eitherBlockOrInline - -isTextTag :: Tag String -> Bool -isTextTag = tagText (const True) - -isCommentTag :: Tag String -> Bool -isCommentTag = tagComment (const True) - --- taken from HXT and extended --- See http://www.w3.org/TR/html5/syntax.html sec 8.1.2.4 optional tags -closes :: String -> String -> Bool -_ `closes` "body" = False -_ `closes` "html" = False -"body" `closes` "head" = True -"a" `closes` "a" = True -"li" `closes` "li" = True -"th" `closes` t | t `elem` ["th","td"] = True -"tr" `closes` t | t `elem` ["th","td","tr"] = True -"dd" `closes` t | t `elem` ["dt", "dd"] = True -"dt" `closes` t | t `elem` ["dt","dd"] = True -"rt" `closes` t | t `elem` ["rb", "rt", "rtc"] = True -"optgroup" `closes` "optgroup" = True -"optgroup" `closes` "option" = True -"option" `closes` "option" = True --- http://www.w3.org/TR/html-markup/p.html -x `closes` "p" | x `elem` ["address", "article", "aside", "blockquote", - "dir", "div", "dl", "fieldset", "footer", "form", "h1", "h2", "h3", "h4", - "h5", "h6", "header", "hr", "menu", "nav", "ol", "p", "pre", "section", - "table", "ul"] = True -"meta" `closes` "meta" = True -"form" `closes` "form" = True -"label" `closes` "label" = True -"map" `closes` "map" = True -"object" `closes` "object" = True -_ `closes` t | t `elem` ["option","style","script","textarea","title"] = True -t `closes` "select" | t /= "option" = True -"thead" `closes` t | t `elem` ["colgroup"] = True -"tfoot" `closes` t | t `elem` ["thead","colgroup"] = True -"tbody" `closes` t | t `elem` ["tbody","tfoot","thead","colgroup"] = True -t `closes` t2 | - t `elem` ["h1","h2","h3","h4","h5","h6","dl","ol","ul","table","div","p"] && - t2 `elem` ["h1","h2","h3","h4","h5","h6","p" ] = True -- not "div" -t1 `closes` t2 | - t1 `elem` blockTags && - t2 `notElem` (blockTags ++ eitherBlockOrInline) = True -_ `closes` _ = False - ---- parsers for use in markdown, textile readers - --- | Matches a stretch of HTML in balanced tags. -htmlInBalanced :: (Monad m) - => (Tag String -> Bool) - -> ParserT String st m String -htmlInBalanced f = try $ do - lookAhead (char '<') - inp <- getInput - let ts = canonicalizeTags $ - parseTagsOptions parseOptions{ optTagWarning = True, - optTagPosition = True } inp - case ts of - (TagPosition sr sc : t@(TagOpen tn _) : rest) -> do - guard $ f t - guard $ not $ hasTagWarning (t : take 1 rest) - case htmlInBalanced' tn (t:rest) of - [] -> mzero - xs -> case reverse xs of - (TagClose _ : TagPosition er ec : _) -> do - let ls = er - sr - let cs = ec - sc - lscontents <- unlines <$> count ls anyLine - cscontents <- count cs anyChar - (_,closetag) <- htmlTag (~== TagClose tn) - return (lscontents ++ cscontents ++ closetag) - _ -> mzero - _ -> mzero - -htmlInBalanced' :: String - -> [Tag String] - -> [Tag String] -htmlInBalanced' tagname ts = fromMaybe [] $ go 0 ts - where go :: Int -> [Tag String] -> Maybe [Tag String] - go n (t@(TagOpen tn' _):rest) | tn' == tagname = - (t :) <$> go (n + 1) rest - go 1 (t@(TagClose tn'):_) | tn' == tagname = - return [t] - go n (t@(TagClose tn'):rest) | tn' == tagname = - (t :) <$> go (n - 1) rest - go n (t:ts') = (t :) <$> go n ts' - go _ [] = mzero - -hasTagWarning :: [Tag String] -> Bool -hasTagWarning (TagWarning _:_) = True -hasTagWarning _ = False - --- | Matches a tag meeting a certain condition. -htmlTag :: Monad m - => (Tag String -> Bool) - -> ParserT [Char] st m (Tag String, String) -htmlTag f = try $ do - lookAhead (char '<') - inp <- getInput - let (next : _) = canonicalizeTags $ parseTagsOptions - parseOptions{ optTagWarning = False } inp - guard $ f next - let handleTag tagname = do - -- <www.boe.es/buscar/act.php?id=BOE-A-1996-8930#a66> - -- should NOT be parsed as an HTML tag, see #2277 - guard $ not ('.' `elem` tagname) - -- <https://example.org> should NOT be a tag either. - -- tagsoup will parse it as TagOpen "https:" [("example.org","")] - guard $ not (null tagname) - guard $ last tagname /= ':' - rendered <- manyTill anyChar (char '>') - return (next, rendered ++ ">") - case next of - TagComment s - | "<!--" `isPrefixOf` inp -> do - count (length s + 4) anyChar - skipMany (satisfy (/='>')) - char '>' - return (next, "<!--" ++ s ++ "-->") - | otherwise -> fail "bogus comment mode, HTML5 parse error" - TagOpen tagname _attr -> handleTag tagname - TagClose tagname -> handleTag tagname - _ -> mzero - -mkAttr :: [(String, String)] -> Attr -mkAttr attr = (attribsId, attribsClasses, attribsKV) - where attribsId = fromMaybe "" $ lookup "id" attr - attribsClasses = (words $ fromMaybe "" $ lookup "class" attr) ++ epubTypes - attribsKV = filter (\(k,_) -> k /= "class" && k /= "id") attr - epubTypes = words $ fromMaybe "" $ lookup "epub:type" attr - --- Strip namespace prefixes -stripPrefixes :: [Tag String] -> [Tag String] -stripPrefixes = map stripPrefix - -stripPrefix :: Tag String -> Tag String -stripPrefix (TagOpen s as) = - TagOpen (stripPrefix' s) (map (stripPrefix' *** id) as) -stripPrefix (TagClose s) = TagClose (stripPrefix' s) -stripPrefix x = x - -stripPrefix' :: String -> String -stripPrefix' s = - case span (/= ':') s of - (_, "") -> s - (_, (_:ts)) -> ts - -isSpace :: Char -> Bool -isSpace ' ' = True -isSpace '\t' = True -isSpace '\n' = True -isSpace '\r' = True -isSpace _ = False - --- Instances - -instance HasIdentifierList HTMLState where - extractIdentifierList = identifiers - updateIdentifierList f s = s{ identifiers = f (identifiers s) } - -instance HasHeaderMap HTMLState where - extractHeaderMap = headerMap - updateHeaderMap f s = s{ headerMap = f (headerMap s) } - --- This signature should be more general --- MonadReader HTMLLocal m => HasQuoteContext st m -instance PandocMonad m => HasQuoteContext HTMLState (ReaderT HTMLLocal m) where - getQuoteContext = asks quoteContext - withQuoteContext q = local (\s -> s{quoteContext = q}) - -instance HasReaderOptions HTMLState where - extractReaderOptions = extractReaderOptions . parserState - -instance HasMeta HTMLState where - setMeta s b st = st {parserState = setMeta s b $ parserState st} - deleteMeta s st = st {parserState = deleteMeta s $ parserState st} - -instance Default HTMLLocal where - def = HTMLLocal NoQuote False False - -instance HasLastStrPosition HTMLState where - setLastStrPos s st = st {parserState = setLastStrPos s (parserState st)} - getLastStrPos = getLastStrPos . parserState - - --- EPUB Specific --- --- -sectioningContent :: [String] -sectioningContent = ["article", "aside", "nav", "section"] - - -groupingContent :: [String] -groupingContent = ["p", "hr", "pre", "blockquote", "ol" - , "ul", "li", "dl", "dt", "dt", "dd" - , "figure", "figcaption", "div", "main"] - - -{- - -types :: [(String, ([String], Int))] -types = -- Document divisions - map (\s -> (s, (["section", "body"], 0))) - ["volume", "part", "chapter", "division"] - ++ -- Document section and components - [ - ("abstract", ([], 0))] --} |