aboutsummaryrefslogtreecommitdiff
path: root/src/Text/Pandoc/Readers/HTML.hs
diff options
context:
space:
mode:
Diffstat (limited to 'src/Text/Pandoc/Readers/HTML.hs')
-rw-r--r--src/Text/Pandoc/Readers/HTML.hs1136
1 files changed, 0 insertions, 1136 deletions
diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs
deleted file mode 100644
index f02f1a1d4..000000000
--- a/src/Text/Pandoc/Readers/HTML.hs
+++ /dev/null
@@ -1,1136 +0,0 @@
-{-# LANGUAGE FlexibleContexts, FlexibleInstances, MultiParamTypeClasses,
-ViewPatterns#-}
-{-
-Copyright (C) 2006-2015 John MacFarlane <jgm@berkeley.edu>
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-GNU General Public License for more details.
-
-You should have received a copy of the GNU General Public License
-along with this program; if not, write to the Free Software
-Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
--}
-
-{- |
- Module : Text.Pandoc.Readers.HTML
- Copyright : Copyright (C) 2006-2015 John MacFarlane
- License : GNU GPL, version 2 or above
-
- Maintainer : John MacFarlane <jgm@berkeley.edu>
- Stability : alpha
- Portability : portable
-
-Conversion of HTML to 'Pandoc' document.
--}
-module Text.Pandoc.Readers.HTML ( readHtml
- , htmlTag
- , htmlInBalanced
- , isInlineTag
- , isBlockTag
- , isTextTag
- , isCommentTag
- ) where
-
-import Text.HTML.TagSoup
-import Text.HTML.TagSoup.Match
-import Text.Pandoc.Definition
-import qualified Text.Pandoc.Builder as B
-import Text.Pandoc.Builder (Blocks, Inlines, trimInlines, HasMeta(..))
-import Text.Pandoc.Shared ( extractSpaces, renderTags', addMetaField
- , escapeURI, safeRead )
-import Text.Pandoc.Options (ReaderOptions(readerExtensions), extensionEnabled,
- Extension (Ext_epub_html_exts,
- Ext_raw_html, Ext_native_divs, Ext_native_spans))
-import Text.Pandoc.Logging
-import Text.Pandoc.Parsing hiding ((<|>))
-import Text.Pandoc.Walk
-import qualified Data.Map as M
-import Data.Maybe ( fromMaybe, isJust)
-import Data.List ( intercalate, isInfixOf, isPrefixOf )
-import Data.Char ( isDigit )
-import Control.Monad ( guard, mzero, void, unless )
-import Control.Arrow ((***))
-import Control.Applicative ( (<|>) )
-import Data.Monoid (First (..))
-import Text.TeXMath (readMathML, writeTeX)
-import Data.Default (Default (..), def)
-import Control.Monad.Reader (ask, asks, local, ReaderT, runReaderT, lift)
-import Network.URI (URI, parseURIReference, nonStrictRelativeTo)
-import Text.Pandoc.CSS (foldOrElse, pickStyleAttrProps)
-import Data.Monoid ((<>))
-import Text.Parsec.Error
-import qualified Data.Set as Set
-import Text.Pandoc.Error
-import Text.Pandoc.Class (PandocMonad, report)
-import Control.Monad.Except (throwError)
-
--- | Convert HTML-formatted string to 'Pandoc' document.
-readHtml :: PandocMonad m
- => ReaderOptions -- ^ Reader options
- -> String -- ^ String to parse (assumes @'\n'@ line endings)
- -> m Pandoc
-readHtml opts inp = do
- let tags = stripPrefixes . canonicalizeTags $
- parseTagsOptions parseOptions{ optTagPosition = True } inp
- parseDoc = do
- blocks <- (fixPlains False) . mconcat <$> manyTill block eof
- meta <- stateMeta . parserState <$> getState
- bs' <- replaceNotes (B.toList blocks)
- return $ Pandoc meta bs'
- getError (errorMessages -> ms) = case ms of
- [] -> ""
- (m:_) -> messageString m
- result <- flip runReaderT def $
- runParserT parseDoc
- (HTMLState def{ stateOptions = opts } [] Nothing Set.empty M.empty)
- "source" tags
- case result of
- Right doc -> return doc
- Left err -> throwError $ PandocParseError $ getError err
-
-replaceNotes :: PandocMonad m => [Block] -> TagParser m [Block]
-replaceNotes = walkM replaceNotes'
-
-replaceNotes' :: PandocMonad m => Inline -> TagParser m Inline
-replaceNotes' (RawInline (Format "noteref") ref) = maybe (Str "") (Note . B.toList) . lookup ref <$> getNotes
- where
- getNotes = noteTable <$> getState
-replaceNotes' x = return x
-
-data HTMLState =
- HTMLState
- { parserState :: ParserState,
- noteTable :: [(String, Blocks)],
- baseHref :: Maybe URI,
- identifiers :: Set.Set String,
- headerMap :: M.Map Inlines String
- }
-
-data HTMLLocal = HTMLLocal { quoteContext :: QuoteContext
- , inChapter :: Bool -- ^ Set if in chapter section
- , inPlain :: Bool -- ^ Set if in pPlain
- }
-
-setInChapter :: PandocMonad m => HTMLParser m s a -> HTMLParser m s a
-setInChapter = local (\s -> s {inChapter = True})
-
-setInPlain :: PandocMonad m => HTMLParser m s a -> HTMLParser m s a
-setInPlain = local (\s -> s {inPlain = True})
-
-type HTMLParser m s = ParserT s HTMLState (ReaderT HTMLLocal m)
-
-type TagParser m = HTMLParser m [Tag String]
-
-pBody :: PandocMonad m => TagParser m Blocks
-pBody = pInTags "body" block
-
-pHead :: PandocMonad m => TagParser m Blocks
-pHead = pInTags "head" $ pTitle <|> pMetaTag <|> pBaseTag <|> (mempty <$ pAnyTag)
- where pTitle = pInTags "title" inline >>= setTitle . trimInlines
- setTitle t = mempty <$ (updateState $ B.setMeta "title" t)
- pMetaTag = do
- mt <- pSatisfy (~== TagOpen "meta" [])
- let name = fromAttrib "name" mt
- if null name
- then return mempty
- else do
- let content = fromAttrib "content" mt
- updateState $ \s ->
- let ps = parserState s in
- s{ parserState = ps{
- stateMeta = addMetaField name (B.text content)
- (stateMeta ps) } }
- return mempty
- pBaseTag = do
- bt <- pSatisfy (~== TagOpen "base" [])
- updateState $ \st -> st{ baseHref =
- parseURIReference $ fromAttrib "href" bt }
- return mempty
-
-block :: PandocMonad m => TagParser m Blocks
-block = do
- pos <- getPosition
- res <- choice
- [ eSection
- , eSwitch B.para block
- , mempty <$ eFootnote
- , mempty <$ eTOC
- , mempty <$ eTitlePage
- , pPara
- , pHeader
- , pBlockQuote
- , pCodeBlock
- , pList
- , pHrule
- , pTable
- , pHead
- , pBody
- , pDiv
- , pPlain
- , pRawHtmlBlock
- ]
- report $ ParsingTrace (take 60 $ show $ B.toList res) pos
- return res
-
-namespaces :: PandocMonad m => [(String, TagParser m Inlines)]
-namespaces = [(mathMLNamespace, pMath True)]
-
-mathMLNamespace :: String
-mathMLNamespace = "http://www.w3.org/1998/Math/MathML"
-
-eSwitch :: (PandocMonad m, Monoid a)
- => (Inlines -> a)
- -> TagParser m a
- -> TagParser m a
-eSwitch constructor parser = try $ do
- guardEnabled Ext_epub_html_exts
- pSatisfy (~== TagOpen "switch" [])
- cases <- getFirst . mconcat <$>
- manyTill (First <$> (eCase <* skipMany pBlank) )
- (lookAhead $ try $ pSatisfy (~== TagOpen "default" []))
- skipMany pBlank
- fallback <- pInTags "default" (skipMany pBlank *> parser <* skipMany pBlank)
- skipMany pBlank
- pSatisfy (~== TagClose "switch")
- return $ maybe fallback constructor cases
-
-eCase :: PandocMonad m => TagParser m (Maybe Inlines)
-eCase = do
- skipMany pBlank
- TagOpen _ attr <- lookAhead $ pSatisfy $ (~== TagOpen "case" [])
- case (flip lookup namespaces) =<< lookup "required-namespace" attr of
- Just p -> Just <$> (pInTags "case" (skipMany pBlank *> p <* skipMany pBlank))
- Nothing -> Nothing <$ manyTill pAnyTag (pSatisfy (~== TagClose "case"))
-
-eFootnote :: PandocMonad m => TagParser m ()
-eFootnote = try $ do
- let notes = ["footnote", "rearnote"]
- guardEnabled Ext_epub_html_exts
- (TagOpen tag attr) <- lookAhead $ pAnyTag
- guard (maybe False (flip elem notes) (lookup "type" attr))
- let ident = fromMaybe "" (lookup "id" attr)
- content <- pInTags tag block
- addNote ident content
-
-addNote :: PandocMonad m => String -> Blocks -> TagParser m ()
-addNote uid cont = updateState (\s -> s {noteTable = (uid, cont) : (noteTable s)})
-
-eNoteref :: PandocMonad m => TagParser m Inlines
-eNoteref = try $ do
- guardEnabled Ext_epub_html_exts
- TagOpen tag attr <- lookAhead $ pAnyTag
- guard (maybe False (== "noteref") (lookup "type" attr))
- let ident = maybe "" (dropWhile (== '#')) (lookup "href" attr)
- guard (not (null ident))
- pInTags tag block
- return $ B.rawInline "noteref" ident
-
--- Strip TOC if there is one, better to generate again
-eTOC :: PandocMonad m => TagParser m ()
-eTOC = try $ do
- guardEnabled Ext_epub_html_exts
- (TagOpen tag attr) <- lookAhead $ pAnyTag
- guard (maybe False (== "toc") (lookup "type" attr))
- void (pInTags tag block)
-
-pList :: PandocMonad m => TagParser m Blocks
-pList = pBulletList <|> pOrderedList <|> pDefinitionList
-
-pBulletList :: PandocMonad m => TagParser m Blocks
-pBulletList = try $ do
- pSatisfy (~== TagOpen "ul" [])
- let nonItem = pSatisfy (\t ->
- not (tagOpen (`elem` ["li","ol","ul","dl"]) (const True) t) &&
- not (t ~== TagClose "ul"))
- -- note: if they have an <ol> or <ul> not in scope of a <li>,
- -- treat it as a list item, though it's not valid xhtml...
- skipMany nonItem
- items <- manyTill (pListItem nonItem) (pCloses "ul")
- return $ B.bulletList $ map (fixPlains True) items
-
-pListItem :: PandocMonad m => TagParser m a -> TagParser m Blocks
-pListItem nonItem = do
- TagOpen _ attr <- lookAhead $ pSatisfy (~== TagOpen "li" [])
- let liDiv = maybe mempty (\x -> B.divWith (x, [], []) mempty) (lookup "id" attr)
- (liDiv <>) <$> pInTags "li" block <* skipMany nonItem
-
-parseListStyleType :: String -> ListNumberStyle
-parseListStyleType "lower-roman" = LowerRoman
-parseListStyleType "upper-roman" = UpperRoman
-parseListStyleType "lower-alpha" = LowerAlpha
-parseListStyleType "upper-alpha" = UpperAlpha
-parseListStyleType "decimal" = Decimal
-parseListStyleType _ = DefaultStyle
-
-parseTypeAttr :: String -> ListNumberStyle
-parseTypeAttr "i" = LowerRoman
-parseTypeAttr "I" = UpperRoman
-parseTypeAttr "a" = LowerAlpha
-parseTypeAttr "A" = UpperAlpha
-parseTypeAttr "1" = Decimal
-parseTypeAttr _ = DefaultStyle
-
-pOrderedList :: PandocMonad m => TagParser m Blocks
-pOrderedList = try $ do
- TagOpen _ attribs <- pSatisfy (~== TagOpen "ol" [])
- let (start, style) = (sta', sty')
- where sta = fromMaybe "1" $
- lookup "start" attribs
- sta' = if all isDigit sta
- then read sta
- else 1
-
- pickListStyle = pickStyleAttrProps ["list-style-type", "list-style"]
-
- typeAttr = fromMaybe "" $ lookup "type" attribs
- classAttr = fromMaybe "" $ lookup "class" attribs
- styleAttr = fromMaybe "" $ lookup "style" attribs
- listStyle = fromMaybe "" $ pickListStyle styleAttr
-
- sty' = foldOrElse DefaultStyle
- [ parseTypeAttr typeAttr
- , parseListStyleType classAttr
- , parseListStyleType listStyle
- ]
- let nonItem = pSatisfy (\t ->
- not (tagOpen (`elem` ["li","ol","ul","dl"]) (const True) t) &&
- not (t ~== TagClose "ol"))
- -- note: if they have an <ol> or <ul> not in scope of a <li>,
- -- treat it as a list item, though it's not valid xhtml...
- skipMany nonItem
- items <- manyTill (pListItem nonItem) (pCloses "ol")
- return $ B.orderedListWith (start, style, DefaultDelim) $ map (fixPlains True) items
-
-pDefinitionList :: PandocMonad m => TagParser m Blocks
-pDefinitionList = try $ do
- pSatisfy (~== TagOpen "dl" [])
- items <- manyTill pDefListItem (pCloses "dl")
- return $ B.definitionList items
-
-pDefListItem :: PandocMonad m => TagParser m (Inlines, [Blocks])
-pDefListItem = try $ do
- let nonItem = pSatisfy (\t -> not (t ~== TagOpen "dt" []) &&
- not (t ~== TagOpen "dd" []) && not (t ~== TagClose "dl"))
- terms <- many1 (try $ skipMany nonItem >> pInTags "dt" inline)
- defs <- many1 (try $ skipMany nonItem >> pInTags "dd" block)
- skipMany nonItem
- let term = foldl1 (\x y -> x <> B.linebreak <> y) terms
- return (term, map (fixPlains True) defs)
-
-fixPlains :: Bool -> Blocks -> Blocks
-fixPlains inList bs = if any isParaish bs'
- then B.fromList $ map plainToPara bs'
- else bs
- where isParaish (Para _) = True
- isParaish (CodeBlock _ _) = True
- isParaish (Header _ _ _) = True
- isParaish (BlockQuote _) = True
- isParaish (BulletList _) = not inList
- isParaish (OrderedList _ _) = not inList
- isParaish (DefinitionList _) = not inList
- isParaish _ = False
- plainToPara (Plain xs) = Para xs
- plainToPara x = x
- bs' = B.toList bs
-
-pRawTag :: PandocMonad m => TagParser m String
-pRawTag = do
- tag <- pAnyTag
- let ignorable x = x `elem` ["html","head","body","!DOCTYPE","?xml"]
- if tagOpen ignorable (const True) tag || tagClose ignorable tag
- then return []
- else return $ renderTags' [tag]
-
-pDiv :: PandocMonad m => TagParser m Blocks
-pDiv = try $ do
- guardEnabled Ext_native_divs
- let isDivLike "div" = True
- isDivLike "section" = True
- isDivLike _ = False
- TagOpen tag attr <- lookAhead $ pSatisfy $ tagOpen isDivLike (const True)
- contents <- pInTags tag block
- let (ident, classes, kvs) = mkAttr attr
- let classes' = if tag == "section"
- then "section":classes
- else classes
- return $ B.divWith (ident, classes', kvs) contents
-
-pRawHtmlBlock :: PandocMonad m => TagParser m Blocks
-pRawHtmlBlock = do
- raw <- pHtmlBlock "script" <|> pHtmlBlock "style" <|> pRawTag
- exts <- getOption readerExtensions
- if extensionEnabled Ext_raw_html exts && not (null raw)
- then return $ B.rawBlock "html" raw
- else ignore raw
-
-ignore :: (Monoid a, PandocMonad m) => String -> TagParser m a
-ignore raw = do
- pos <- getPosition
- -- raw can be null for tags like <!DOCTYPE>; see paRawTag
- -- in this case we don't want a warning:
- unless (null raw) $
- report $ SkippedContent raw pos
- return mempty
-
-pHtmlBlock :: PandocMonad m => String -> TagParser m String
-pHtmlBlock t = try $ do
- open <- pSatisfy (~== TagOpen t [])
- contents <- manyTill pAnyTag (pSatisfy (~== TagClose t))
- return $ renderTags' $ [open] ++ contents ++ [TagClose t]
-
--- Sets chapter context
-eSection :: PandocMonad m => TagParser m Blocks
-eSection = try $ do
- let matchChapter as = maybe False (isInfixOf "chapter") (lookup "type" as)
- let sectTag = tagOpen (`elem` sectioningContent) matchChapter
- TagOpen tag _ <- lookAhead $ pSatisfy sectTag
- setInChapter (pInTags tag block)
-
-headerLevel :: PandocMonad m => String -> TagParser m Int
-headerLevel tagtype = do
- let level = read (drop 1 tagtype)
- (try $ do
- guardEnabled Ext_epub_html_exts
- asks inChapter >>= guard
- return (level - 1))
- <|>
- return level
-
-eTitlePage :: PandocMonad m => TagParser m ()
-eTitlePage = try $ do
- let isTitlePage as = maybe False (isInfixOf "titlepage") (lookup "type" as)
- let groupTag = tagOpen (\x -> x `elem` groupingContent || x == "section")
- isTitlePage
- TagOpen tag _ <- lookAhead $ pSatisfy groupTag
- () <$ pInTags tag block
-
-pHeader :: PandocMonad m => TagParser m Blocks
-pHeader = try $ do
- TagOpen tagtype attr <- pSatisfy $
- tagOpen (`elem` ["h1","h2","h3","h4","h5","h6"])
- (const True)
- let bodyTitle = TagOpen tagtype attr ~== TagOpen "h1" [("class","title")]
- level <- headerLevel tagtype
- contents <- trimInlines . mconcat <$> manyTill inline (pCloses tagtype <|> eof)
- let ident = fromMaybe "" $ lookup "id" attr
- let classes = maybe [] words $ lookup "class" attr
- let keyvals = [(k,v) | (k,v) <- attr, k /= "class", k /= "id"]
- attr' <- registerHeader (ident, classes, keyvals) contents
- return $ if bodyTitle
- then mempty -- skip a representation of the title in the body
- else B.headerWith attr' level contents
-
-pHrule :: PandocMonad m => TagParser m Blocks
-pHrule = do
- pSelfClosing (=="hr") (const True)
- return B.horizontalRule
-
-pTable :: PandocMonad m => TagParser m Blocks
-pTable = try $ do
- TagOpen _ _ <- pSatisfy (~== TagOpen "table" [])
- skipMany pBlank
- caption <- option mempty $ pInTags "caption" inline <* skipMany pBlank
- widths' <- (mconcat <$> many1 pColgroup) <|> many pCol
- let pTh = option [] $ pInTags "tr" (pCell "th")
- pTr = try $ skipMany pBlank >> pInTags "tr" (pCell "td" <|> pCell "th")
- pTBody = do pOptInTag "tbody" $ many1 pTr
- head'' <- pOptInTag "thead" pTh
- head' <- pOptInTag "tbody" $ do
- if null head''
- then pTh
- else return head''
- rowsLs <- many pTBody
- rows' <- pOptInTag "tfoot" $ many pTr
- TagClose _ <- pSatisfy (~== TagClose "table")
- let rows'' = (concat rowsLs) ++ rows'
- -- fail on empty table
- guard $ not $ null head' && null rows''
- let isSinglePlain x = case B.toList x of
- [] -> True
- [Plain _] -> True
- _ -> False
- let isSimple = all isSinglePlain $ concat (head':rows'')
- let cols = length $ if null head' then head rows'' else head'
- -- add empty cells to short rows
- let addEmpties r = case cols - length r of
- n | n > 0 -> r ++ replicate n mempty
- | otherwise -> r
- let rows = map addEmpties rows''
- let aligns = replicate cols AlignDefault
- let widths = if null widths'
- then if isSimple
- then replicate cols 0
- else replicate cols (1.0 / fromIntegral cols)
- else widths'
- return $ B.table caption (zip aligns widths) head' rows
-
-pCol :: PandocMonad m => TagParser m Double
-pCol = try $ do
- TagOpen _ attribs <- pSatisfy (~== TagOpen "col" [])
- skipMany pBlank
- optional $ pSatisfy (~== TagClose "col")
- skipMany pBlank
- return $ case lookup "width" attribs of
- Nothing -> case lookup "style" attribs of
- Just ('w':'i':'d':'t':'h':':':xs) | '%' `elem` xs ->
- fromMaybe 0.0 $ safeRead ('0':'.':filter
- (`notElem` " \t\r\n%'\";") xs)
- _ -> 0.0
- Just x | not (null x) && last x == '%' ->
- fromMaybe 0.0 $ safeRead ('0':'.':init x)
- _ -> 0.0
-
-pColgroup :: PandocMonad m => TagParser m [Double]
-pColgroup = try $ do
- pSatisfy (~== TagOpen "colgroup" [])
- skipMany pBlank
- manyTill pCol (pCloses "colgroup" <|> eof) <* skipMany pBlank
-
-noColOrRowSpans :: Tag String -> Bool
-noColOrRowSpans t = isNullOrOne "colspan" && isNullOrOne "rowspan"
- where isNullOrOne x = case fromAttrib x t of
- "" -> True
- "1" -> True
- _ -> False
-
-pCell :: PandocMonad m => String -> TagParser m [Blocks]
-pCell celltype = try $ do
- skipMany pBlank
- res <- pInTags' celltype noColOrRowSpans block
- skipMany pBlank
- return [res]
-
-pBlockQuote :: PandocMonad m => TagParser m Blocks
-pBlockQuote = do
- contents <- pInTags "blockquote" block
- return $ B.blockQuote $ fixPlains False contents
-
-pPlain :: PandocMonad m => TagParser m Blocks
-pPlain = do
- contents <- setInPlain $ trimInlines . mconcat <$> many1 inline
- if B.isNull contents
- then return mempty
- else return $ B.plain contents
-
-pPara :: PandocMonad m => TagParser m Blocks
-pPara = do
- contents <- trimInlines <$> pInTags "p" inline
- return $ B.para contents
-
-pCodeBlock :: PandocMonad m => TagParser m Blocks
-pCodeBlock = try $ do
- TagOpen _ attr <- pSatisfy (~== TagOpen "pre" [])
- contents <- manyTill pAnyTag (pCloses "pre" <|> eof)
- let rawText = concatMap tagToString contents
- -- drop leading newline if any
- let result' = case rawText of
- '\n':xs -> xs
- _ -> rawText
- -- drop trailing newline if any
- let result = case reverse result' of
- '\n':_ -> init result'
- _ -> result'
- return $ B.codeBlockWith (mkAttr attr) result
-
-tagToString :: Tag String -> String
-tagToString (TagText s) = s
-tagToString (TagOpen "br" _) = "\n"
-tagToString _ = ""
-
-inline :: PandocMonad m => TagParser m Inlines
-inline = choice
- [ eNoteref
- , eSwitch id inline
- , pTagText
- , pQ
- , pEmph
- , pStrong
- , pSuperscript
- , pSubscript
- , pStrikeout
- , pLineBreak
- , pLink
- , pImage
- , pCode
- , pSpan
- , pMath False
- , pRawHtmlInline
- ]
-
-pLocation :: PandocMonad m => TagParser m ()
-pLocation = do
- (TagPosition r c) <- pSat isTagPosition
- setPosition $ newPos "input" r c
-
-pSat :: PandocMonad m => (Tag String -> Bool) -> TagParser m (Tag String)
-pSat f = do
- pos <- getPosition
- token show (const pos) (\x -> if f x then Just x else Nothing)
-
-pSatisfy :: PandocMonad m => (Tag String -> Bool) -> TagParser m (Tag String)
-pSatisfy f = try $ optional pLocation >> pSat f
-
-pAnyTag :: PandocMonad m => TagParser m (Tag String)
-pAnyTag = pSatisfy (const True)
-
-pSelfClosing :: PandocMonad m
- => (String -> Bool) -> ([Attribute String] -> Bool)
- -> TagParser m (Tag String)
-pSelfClosing f g = do
- open <- pSatisfy (tagOpen f g)
- optional $ pSatisfy (tagClose f)
- return open
-
-pQ :: PandocMonad m => TagParser m Inlines
-pQ = do
- context <- asks quoteContext
- let quoteType = case context of
- InDoubleQuote -> SingleQuote
- _ -> DoubleQuote
- let innerQuoteContext = if quoteType == SingleQuote
- then InSingleQuote
- else InDoubleQuote
- let constructor = case quoteType of
- SingleQuote -> B.singleQuoted
- DoubleQuote -> B.doubleQuoted
- withQuoteContext innerQuoteContext $
- pInlinesInTags "q" constructor
-
-pEmph :: PandocMonad m => TagParser m Inlines
-pEmph = pInlinesInTags "em" B.emph <|> pInlinesInTags "i" B.emph
-
-pStrong :: PandocMonad m => TagParser m Inlines
-pStrong = pInlinesInTags "strong" B.strong <|> pInlinesInTags "b" B.strong
-
-pSuperscript :: PandocMonad m => TagParser m Inlines
-pSuperscript = pInlinesInTags "sup" B.superscript
-
-pSubscript :: PandocMonad m => TagParser m Inlines
-pSubscript = pInlinesInTags "sub" B.subscript
-
-pStrikeout :: PandocMonad m => TagParser m Inlines
-pStrikeout = do
- pInlinesInTags "s" B.strikeout <|>
- pInlinesInTags "strike" B.strikeout <|>
- pInlinesInTags "del" B.strikeout <|>
- try (do pSatisfy (~== TagOpen "span" [("class","strikeout")])
- contents <- mconcat <$> manyTill inline (pCloses "span")
- return $ B.strikeout contents)
-
-pLineBreak :: PandocMonad m => TagParser m Inlines
-pLineBreak = do
- pSelfClosing (=="br") (const True)
- return B.linebreak
-
--- Unlike fromAttrib from tagsoup, this distinguishes
--- between a missing attribute and an attribute with empty content.
-maybeFromAttrib :: String -> Tag String -> Maybe String
-maybeFromAttrib name (TagOpen _ attrs) = lookup name attrs
-maybeFromAttrib _ _ = Nothing
-
-pLink :: PandocMonad m => TagParser m Inlines
-pLink = try $ do
- tag <- pSatisfy $ tagOpenLit "a" (const True)
- let title = fromAttrib "title" tag
- -- take id from id attribute if present, otherwise name
- let uid = maybe (fromAttrib "name" tag) id $ maybeFromAttrib "id" tag
- let cls = words $ fromAttrib "class" tag
- lab <- trimInlines . mconcat <$> manyTill inline (pCloses "a")
- -- check for href; if href, then a link, otherwise a span
- case maybeFromAttrib "href" tag of
- Nothing ->
- return $ B.spanWith (uid, cls, []) lab
- Just url' -> do
- mbBaseHref <- baseHref <$> getState
- let url = case (parseURIReference url', mbBaseHref) of
- (Just rel, Just bs) ->
- show (rel `nonStrictRelativeTo` bs)
- _ -> url'
- return $ B.linkWith (uid, cls, []) (escapeURI url) title lab
-
-pImage :: PandocMonad m => TagParser m Inlines
-pImage = do
- tag <- pSelfClosing (=="img") (isJust . lookup "src")
- mbBaseHref <- baseHref <$> getState
- let url' = fromAttrib "src" tag
- let url = case (parseURIReference url', mbBaseHref) of
- (Just rel, Just bs) -> show (rel `nonStrictRelativeTo` bs)
- _ -> url'
- let title = fromAttrib "title" tag
- let alt = fromAttrib "alt" tag
- let uid = fromAttrib "id" tag
- let cls = words $ fromAttrib "class" tag
- let getAtt k = case fromAttrib k tag of
- "" -> []
- v -> [(k, v)]
- let kvs = concat $ map getAtt ["width", "height", "sizes", "srcset"]
- return $ B.imageWith (uid, cls, kvs) (escapeURI url) title (B.text alt)
-
-pCode :: PandocMonad m => TagParser m Inlines
-pCode = try $ do
- (TagOpen open attr) <- pSatisfy $ tagOpen (`elem` ["code","tt"]) (const True)
- result <- manyTill pAnyTag (pCloses open)
- return $ B.codeWith (mkAttr attr) $ intercalate " " $ lines $ innerText result
-
-pSpan :: PandocMonad m => TagParser m Inlines
-pSpan = try $ do
- guardEnabled Ext_native_spans
- TagOpen _ attr <- lookAhead $ pSatisfy $ tagOpen (=="span") (const True)
- contents <- pInTags "span" inline
- let isSmallCaps = fontVariant == "small-caps"
- where styleAttr = fromMaybe "" $ lookup "style" attr
- fontVariant = fromMaybe "" $ pickStyleAttrProps ["font-variant"] styleAttr
- let tag = if isSmallCaps then B.smallcaps else B.spanWith (mkAttr attr)
- return $ tag contents
-
-pRawHtmlInline :: PandocMonad m => TagParser m Inlines
-pRawHtmlInline = do
- inplain <- asks inPlain
- result <- pSatisfy (tagComment (const True))
- <|> if inplain
- then pSatisfy (not . isBlockTag)
- else pSatisfy isInlineTag
- exts <- getOption readerExtensions
- let raw = renderTags' [result]
- if extensionEnabled Ext_raw_html exts
- then return $ B.rawInline "html" raw
- else ignore raw
-
-mathMLToTeXMath :: String -> Either String String
-mathMLToTeXMath s = writeTeX <$> readMathML s
-
-pMath :: PandocMonad m => Bool -> TagParser m Inlines
-pMath inCase = try $ do
- open@(TagOpen _ attr) <- pSatisfy $ tagOpen (=="math") (const True)
- -- we'll assume math tags are MathML unless specially marked
- -- otherwise...
- unless inCase $
- guard (maybe True (== mathMLNamespace) (lookup "xmlns" attr))
- contents <- manyTill pAnyTag (pSatisfy (~== TagClose "math"))
- case mathMLToTeXMath (renderTags $ [open] ++ contents ++ [TagClose "math"]) of
- Left _ -> return $ B.spanWith ("",["math"],attr) $ B.text $
- innerText contents
- Right [] -> return mempty
- Right x -> return $ case lookup "display" attr of
- Just "block" -> B.displayMath x
- _ -> B.math x
-
-pInlinesInTags :: PandocMonad m => String -> (Inlines -> Inlines)
- -> TagParser m Inlines
-pInlinesInTags tagtype f = extractSpaces f <$> pInTags tagtype inline
-
-pInTags :: (PandocMonad m, Monoid a) => String -> TagParser m a -> TagParser m a
-pInTags tagtype parser = pInTags' tagtype (const True) parser
-
-pInTags' :: (PandocMonad m, Monoid a)
- => String
- -> (Tag String -> Bool)
- -> TagParser m a
- -> TagParser m a
-pInTags' tagtype tagtest parser = try $ do
- pSatisfy (\t -> t ~== TagOpen tagtype [] && tagtest t)
- mconcat <$> manyTill parser (pCloses tagtype <|> eof)
-
--- parses p, preceeded by an optional opening tag
--- and followed by an optional closing tags
-pOptInTag :: PandocMonad m => String -> TagParser m a -> TagParser m a
-pOptInTag tagtype p = try $ do
- skipMany pBlank
- optional $ pSatisfy (~== TagOpen tagtype [])
- skipMany pBlank
- x <- p
- skipMany pBlank
- optional $ pSatisfy (~== TagClose tagtype)
- skipMany pBlank
- return x
-
-pCloses :: PandocMonad m => String -> TagParser m ()
-pCloses tagtype = try $ do
- t <- lookAhead $ pSatisfy $ \tag -> isTagClose tag || isTagOpen tag
- case t of
- (TagClose t') | t' == tagtype -> pAnyTag >> return ()
- (TagOpen t' _) | t' `closes` tagtype -> return ()
- (TagClose "ul") | tagtype == "li" -> return ()
- (TagClose "ol") | tagtype == "li" -> return ()
- (TagClose "dl") | tagtype == "dd" -> return ()
- (TagClose "table") | tagtype == "td" -> return ()
- (TagClose "table") | tagtype == "tr" -> return ()
- _ -> mzero
-
-pTagText :: PandocMonad m => TagParser m Inlines
-pTagText = try $ do
- (TagText str) <- pSatisfy isTagText
- st <- getState
- qu <- ask
- parsed <- lift $ lift $
- flip runReaderT qu $ runParserT (many pTagContents) st "text" str
- case parsed of
- Left _ -> throwError $ PandocParseError $ "Could not parse `" ++ str ++ "'"
- Right result -> return $ mconcat result
-
-pBlank :: PandocMonad m => TagParser m ()
-pBlank = try $ do
- (TagText str) <- pSatisfy isTagText
- guard $ all isSpace str
-
-type InlinesParser m = HTMLParser m String
-
-pTagContents :: PandocMonad m => InlinesParser m Inlines
-pTagContents =
- B.displayMath <$> mathDisplay
- <|> B.math <$> mathInline
- <|> pStr
- <|> pSpace
- <|> smartPunctuation pTagContents
- <|> pSymbol
- <|> pBad
-
-pStr :: PandocMonad m => InlinesParser m Inlines
-pStr = do
- result <- many1 $ satisfy $ \c ->
- not (isSpace c) && not (isSpecial c) && not (isBad c)
- updateLastStrPos
- return $ B.str result
-
-isSpecial :: Char -> Bool
-isSpecial '"' = True
-isSpecial '\'' = True
-isSpecial '.' = True
-isSpecial '-' = True
-isSpecial '$' = True
-isSpecial '\8216' = True
-isSpecial '\8217' = True
-isSpecial '\8220' = True
-isSpecial '\8221' = True
-isSpecial _ = False
-
-pSymbol :: PandocMonad m => InlinesParser m Inlines
-pSymbol = satisfy isSpecial >>= return . B.str . (:[])
-
-isBad :: Char -> Bool
-isBad c = c >= '\128' && c <= '\159' -- not allowed in HTML
-
-pBad :: PandocMonad m => InlinesParser m Inlines
-pBad = do
- c <- satisfy isBad
- let c' = case c of
- '\128' -> '\8364'
- '\130' -> '\8218'
- '\131' -> '\402'
- '\132' -> '\8222'
- '\133' -> '\8230'
- '\134' -> '\8224'
- '\135' -> '\8225'
- '\136' -> '\710'
- '\137' -> '\8240'
- '\138' -> '\352'
- '\139' -> '\8249'
- '\140' -> '\338'
- '\142' -> '\381'
- '\145' -> '\8216'
- '\146' -> '\8217'
- '\147' -> '\8220'
- '\148' -> '\8221'
- '\149' -> '\8226'
- '\150' -> '\8211'
- '\151' -> '\8212'
- '\152' -> '\732'
- '\153' -> '\8482'
- '\154' -> '\353'
- '\155' -> '\8250'
- '\156' -> '\339'
- '\158' -> '\382'
- '\159' -> '\376'
- _ -> '?'
- return $ B.str [c']
-
-pSpace :: PandocMonad m => InlinesParser m Inlines
-pSpace = many1 (satisfy isSpace) >>= \xs ->
- if '\n' `elem` xs
- then return B.softbreak
- else return B.space
-
---
--- Constants
---
-
-eitherBlockOrInline :: [String]
-eitherBlockOrInline = ["audio", "applet", "button", "iframe", "embed",
- "del", "ins",
- "progress", "map", "area", "noscript", "script",
- "object", "svg", "video", "source"]
-
-{-
-inlineHtmlTags :: [[Char]]
-inlineHtmlTags = ["a", "abbr", "acronym", "b", "basefont", "bdo", "big",
- "br", "cite", "code", "dfn", "em", "font", "i", "img",
- "input", "kbd", "label", "q", "s", "samp", "select",
- "small", "span", "strike", "strong", "sub", "sup",
- "textarea", "tt", "u", "var"]
--}
-
-blockHtmlTags :: [String]
-blockHtmlTags = ["?xml", "!DOCTYPE", "address", "article", "aside",
- "blockquote", "body", "button", "canvas",
- "caption", "center", "col", "colgroup", "dd", "dir", "div",
- "dl", "dt", "fieldset", "figcaption", "figure",
- "footer", "form", "h1", "h2", "h3", "h4",
- "h5", "h6", "head", "header", "hgroup", "hr", "html",
- "isindex", "menu", "noframes", "ol", "output", "p", "pre",
- "section", "table", "tbody", "textarea",
- "thead", "tfoot", "ul", "dd",
- "dt", "frameset", "li", "tbody", "td", "tfoot",
- "th", "thead", "tr", "script", "style"]
-
--- We want to allow raw docbook in markdown documents, so we
--- include docbook block tags here too.
-blockDocBookTags :: [String]
-blockDocBookTags = ["calloutlist", "bibliolist", "glosslist", "itemizedlist",
- "orderedlist", "segmentedlist", "simplelist",
- "variablelist", "caution", "important", "note", "tip",
- "warning", "address", "literallayout", "programlisting",
- "programlistingco", "screen", "screenco", "screenshot",
- "synopsis", "example", "informalexample", "figure",
- "informalfigure", "table", "informaltable", "para",
- "simpara", "formalpara", "equation", "informalequation",
- "figure", "screenshot", "mediaobject", "qandaset",
- "procedure", "task", "cmdsynopsis", "funcsynopsis",
- "classsynopsis", "blockquote", "epigraph", "msgset",
- "sidebar", "title"]
-
-epubTags :: [String]
-epubTags = ["case", "switch", "default"]
-
-blockTags :: [String]
-blockTags = blockHtmlTags ++ blockDocBookTags ++ epubTags
-
-isInlineTag :: Tag String -> Bool
-isInlineTag t = tagOpen isInlineTagName (const True) t ||
- tagClose isInlineTagName t ||
- tagComment (const True) t
- where isInlineTagName x = x `notElem` blockTags
-
-isBlockTag :: Tag String -> Bool
-isBlockTag t = tagOpen isBlockTagName (const True) t ||
- tagClose isBlockTagName t ||
- tagComment (const True) t
- where isBlockTagName ('?':_) = True
- isBlockTagName ('!':_) = True
- isBlockTagName x = x `elem` blockTags
- || x `elem` eitherBlockOrInline
-
-isTextTag :: Tag String -> Bool
-isTextTag = tagText (const True)
-
-isCommentTag :: Tag String -> Bool
-isCommentTag = tagComment (const True)
-
--- taken from HXT and extended
--- See http://www.w3.org/TR/html5/syntax.html sec 8.1.2.4 optional tags
-closes :: String -> String -> Bool
-_ `closes` "body" = False
-_ `closes` "html" = False
-"body" `closes` "head" = True
-"a" `closes` "a" = True
-"li" `closes` "li" = True
-"th" `closes` t | t `elem` ["th","td"] = True
-"tr" `closes` t | t `elem` ["th","td","tr"] = True
-"dd" `closes` t | t `elem` ["dt", "dd"] = True
-"dt" `closes` t | t `elem` ["dt","dd"] = True
-"rt" `closes` t | t `elem` ["rb", "rt", "rtc"] = True
-"optgroup" `closes` "optgroup" = True
-"optgroup" `closes` "option" = True
-"option" `closes` "option" = True
--- http://www.w3.org/TR/html-markup/p.html
-x `closes` "p" | x `elem` ["address", "article", "aside", "blockquote",
- "dir", "div", "dl", "fieldset", "footer", "form", "h1", "h2", "h3", "h4",
- "h5", "h6", "header", "hr", "menu", "nav", "ol", "p", "pre", "section",
- "table", "ul"] = True
-"meta" `closes` "meta" = True
-"form" `closes` "form" = True
-"label" `closes` "label" = True
-"map" `closes` "map" = True
-"object" `closes` "object" = True
-_ `closes` t | t `elem` ["option","style","script","textarea","title"] = True
-t `closes` "select" | t /= "option" = True
-"thead" `closes` t | t `elem` ["colgroup"] = True
-"tfoot" `closes` t | t `elem` ["thead","colgroup"] = True
-"tbody" `closes` t | t `elem` ["tbody","tfoot","thead","colgroup"] = True
-t `closes` t2 |
- t `elem` ["h1","h2","h3","h4","h5","h6","dl","ol","ul","table","div","p"] &&
- t2 `elem` ["h1","h2","h3","h4","h5","h6","p" ] = True -- not "div"
-t1 `closes` t2 |
- t1 `elem` blockTags &&
- t2 `notElem` (blockTags ++ eitherBlockOrInline) = True
-_ `closes` _ = False
-
---- parsers for use in markdown, textile readers
-
--- | Matches a stretch of HTML in balanced tags.
-htmlInBalanced :: (Monad m)
- => (Tag String -> Bool)
- -> ParserT String st m String
-htmlInBalanced f = try $ do
- lookAhead (char '<')
- inp <- getInput
- let ts = canonicalizeTags $
- parseTagsOptions parseOptions{ optTagWarning = True,
- optTagPosition = True } inp
- case ts of
- (TagPosition sr sc : t@(TagOpen tn _) : rest) -> do
- guard $ f t
- guard $ not $ hasTagWarning (t : take 1 rest)
- case htmlInBalanced' tn (t:rest) of
- [] -> mzero
- xs -> case reverse xs of
- (TagClose _ : TagPosition er ec : _) -> do
- let ls = er - sr
- let cs = ec - sc
- lscontents <- unlines <$> count ls anyLine
- cscontents <- count cs anyChar
- (_,closetag) <- htmlTag (~== TagClose tn)
- return (lscontents ++ cscontents ++ closetag)
- _ -> mzero
- _ -> mzero
-
-htmlInBalanced' :: String
- -> [Tag String]
- -> [Tag String]
-htmlInBalanced' tagname ts = fromMaybe [] $ go 0 ts
- where go :: Int -> [Tag String] -> Maybe [Tag String]
- go n (t@(TagOpen tn' _):rest) | tn' == tagname =
- (t :) <$> go (n + 1) rest
- go 1 (t@(TagClose tn'):_) | tn' == tagname =
- return [t]
- go n (t@(TagClose tn'):rest) | tn' == tagname =
- (t :) <$> go (n - 1) rest
- go n (t:ts') = (t :) <$> go n ts'
- go _ [] = mzero
-
-hasTagWarning :: [Tag String] -> Bool
-hasTagWarning (TagWarning _:_) = True
-hasTagWarning _ = False
-
--- | Matches a tag meeting a certain condition.
-htmlTag :: Monad m
- => (Tag String -> Bool)
- -> ParserT [Char] st m (Tag String, String)
-htmlTag f = try $ do
- lookAhead (char '<')
- inp <- getInput
- let (next : _) = canonicalizeTags $ parseTagsOptions
- parseOptions{ optTagWarning = False } inp
- guard $ f next
- let handleTag tagname = do
- -- <www.boe.es/buscar/act.php?id=BOE-A-1996-8930#a66>
- -- should NOT be parsed as an HTML tag, see #2277
- guard $ not ('.' `elem` tagname)
- -- <https://example.org> should NOT be a tag either.
- -- tagsoup will parse it as TagOpen "https:" [("example.org","")]
- guard $ not (null tagname)
- guard $ last tagname /= ':'
- rendered <- manyTill anyChar (char '>')
- return (next, rendered ++ ">")
- case next of
- TagComment s
- | "<!--" `isPrefixOf` inp -> do
- count (length s + 4) anyChar
- skipMany (satisfy (/='>'))
- char '>'
- return (next, "<!--" ++ s ++ "-->")
- | otherwise -> fail "bogus comment mode, HTML5 parse error"
- TagOpen tagname _attr -> handleTag tagname
- TagClose tagname -> handleTag tagname
- _ -> mzero
-
-mkAttr :: [(String, String)] -> Attr
-mkAttr attr = (attribsId, attribsClasses, attribsKV)
- where attribsId = fromMaybe "" $ lookup "id" attr
- attribsClasses = (words $ fromMaybe "" $ lookup "class" attr) ++ epubTypes
- attribsKV = filter (\(k,_) -> k /= "class" && k /= "id") attr
- epubTypes = words $ fromMaybe "" $ lookup "epub:type" attr
-
--- Strip namespace prefixes
-stripPrefixes :: [Tag String] -> [Tag String]
-stripPrefixes = map stripPrefix
-
-stripPrefix :: Tag String -> Tag String
-stripPrefix (TagOpen s as) =
- TagOpen (stripPrefix' s) (map (stripPrefix' *** id) as)
-stripPrefix (TagClose s) = TagClose (stripPrefix' s)
-stripPrefix x = x
-
-stripPrefix' :: String -> String
-stripPrefix' s =
- case span (/= ':') s of
- (_, "") -> s
- (_, (_:ts)) -> ts
-
-isSpace :: Char -> Bool
-isSpace ' ' = True
-isSpace '\t' = True
-isSpace '\n' = True
-isSpace '\r' = True
-isSpace _ = False
-
--- Instances
-
-instance HasIdentifierList HTMLState where
- extractIdentifierList = identifiers
- updateIdentifierList f s = s{ identifiers = f (identifiers s) }
-
-instance HasHeaderMap HTMLState where
- extractHeaderMap = headerMap
- updateHeaderMap f s = s{ headerMap = f (headerMap s) }
-
--- This signature should be more general
--- MonadReader HTMLLocal m => HasQuoteContext st m
-instance PandocMonad m => HasQuoteContext HTMLState (ReaderT HTMLLocal m) where
- getQuoteContext = asks quoteContext
- withQuoteContext q = local (\s -> s{quoteContext = q})
-
-instance HasReaderOptions HTMLState where
- extractReaderOptions = extractReaderOptions . parserState
-
-instance HasMeta HTMLState where
- setMeta s b st = st {parserState = setMeta s b $ parserState st}
- deleteMeta s st = st {parserState = deleteMeta s $ parserState st}
-
-instance Default HTMLLocal where
- def = HTMLLocal NoQuote False False
-
-instance HasLastStrPosition HTMLState where
- setLastStrPos s st = st {parserState = setLastStrPos s (parserState st)}
- getLastStrPos = getLastStrPos . parserState
-
-
--- EPUB Specific
---
---
-sectioningContent :: [String]
-sectioningContent = ["article", "aside", "nav", "section"]
-
-
-groupingContent :: [String]
-groupingContent = ["p", "hr", "pre", "blockquote", "ol"
- , "ul", "li", "dl", "dt", "dt", "dd"
- , "figure", "figcaption", "div", "main"]
-
-
-{-
-
-types :: [(String, ([String], Int))]
-types = -- Document divisions
- map (\s -> (s, (["section", "body"], 0)))
- ["volume", "part", "chapter", "division"]
- ++ -- Document section and components
- [
- ("abstract", ([], 0))]
--}