aboutsummaryrefslogtreecommitdiff
path: root/src/Text/Pandoc/Readers/HTML.hs
diff options
context:
space:
mode:
Diffstat (limited to 'src/Text/Pandoc/Readers/HTML.hs')
-rw-r--r--src/Text/Pandoc/Readers/HTML.hs343
1 files changed, 214 insertions, 129 deletions
diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs
index f6657a4d1..552e8a251 100644
--- a/src/Text/Pandoc/Readers/HTML.hs
+++ b/src/Text/Pandoc/Readers/HTML.hs
@@ -1,5 +1,5 @@
{-
-Copyright (C) 2006-2010 John MacFarlane <jgm@berkeley.edu>
+Copyright (C) 2006-2014 John MacFarlane <jgm@berkeley.edu>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
@@ -18,7 +18,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
{- |
Module : Text.Pandoc.Readers.HTML
- Copyright : Copyright (C) 2006-2010 John MacFarlane
+ Copyright : Copyright (C) 2006-2014 John MacFarlane
License : GNU GPL, version 2 or above
Maintainer : John MacFarlane <jgm@berkeley.edu>
@@ -40,6 +40,7 @@ import Text.HTML.TagSoup
import Text.HTML.TagSoup.Match
import Text.Pandoc.Definition
import qualified Text.Pandoc.Builder as B
+import Text.Pandoc.Builder (Blocks, Inlines, trimInlines)
import Text.Pandoc.Shared
import Text.Pandoc.Options
import Text.Pandoc.Parsing
@@ -47,7 +48,10 @@ import Data.Maybe ( fromMaybe, isJust )
import Data.List ( intercalate )
import Data.Char ( isDigit )
import Control.Monad ( liftM, guard, when, mzero )
-import Control.Applicative ( (<$>), (<$) )
+import Control.Applicative ( (<$>), (<$), (<*) )
+import Data.Monoid
+import Text.Printf (printf)
+import Debug.Trace (trace)
isSpace :: Char -> Bool
isSpace ' ' = True
@@ -66,39 +70,56 @@ readHtml opts inp =
where tags = canonicalizeTags $
parseTagsOptions parseOptions{ optTagPosition = True } inp
parseDoc = do
- blocks <- (fixPlains False . concat) <$> manyTill block eof
+ blocks <- (fixPlains False) . mconcat <$> manyTill block eof
meta <- stateMeta <$> getState
- return $ Pandoc meta blocks
+ return $ Pandoc meta (B.toList blocks)
type TagParser = Parser [Tag String] ParserState
-pBody :: TagParser [Block]
+pBody :: TagParser Blocks
pBody = pInTags "body" block
-pHead :: TagParser [Block]
-pHead = pInTags "head" $ pTitle <|> ([] <$ pAnyTag)
- where pTitle = pInTags "title" inline >>= setTitle . normalizeSpaces
- setTitle t = [] <$ (updateState $ B.setMeta "title" (B.fromList t))
-
-block :: TagParser [Block]
-block = choice
+pHead :: TagParser Blocks
+pHead = pInTags "head" $ pTitle <|> pMetaTag <|> (mempty <$ pAnyTag)
+ where pTitle = pInTags "title" inline >>= setTitle . trimInlines
+ setTitle t = mempty <$ (updateState $ B.setMeta "title" t)
+ pMetaTag = do
+ mt <- pSatisfy (~== TagOpen "meta" [])
+ let name = fromAttrib "name" mt
+ if null name
+ then return mempty
+ else do
+ let content = fromAttrib "content" mt
+ updateState $ B.setMeta name (B.text content)
+ return mempty
+
+block :: TagParser Blocks
+block = do
+ tr <- getOption readerTrace
+ pos <- getPosition
+ res <- choice
[ pPara
, pHeader
, pBlockQuote
, pCodeBlock
, pList
, pHrule
- , pSimpleTable
+ , pTable
, pHead
, pBody
, pPlain
+ , pDiv
, pRawHtmlBlock
]
+ when tr $ trace (printf "line %d: %s" (sourceLine pos)
+ (take 60 $ show $ B.toList res)) (return ())
+ return res
-pList :: TagParser [Block]
+
+pList :: TagParser Blocks
pList = pBulletList <|> pOrderedList <|> pDefinitionList
-pBulletList :: TagParser [Block]
+pBulletList :: TagParser Blocks
pBulletList = try $ do
pSatisfy (~== TagOpen "ul" [])
let nonItem = pSatisfy (\t ->
@@ -108,9 +129,9 @@ pBulletList = try $ do
-- treat it as a list item, though it's not valid xhtml...
skipMany nonItem
items <- manyTill (pInTags "li" block >>~ skipMany nonItem) (pCloses "ul")
- return [BulletList $ map (fixPlains True) items]
+ return $ B.bulletList $ map (fixPlains True) items
-pOrderedList :: TagParser [Block]
+pOrderedList :: TagParser Blocks
pOrderedList = try $ do
TagOpen _ attribs <- pSatisfy (~== TagOpen "ol" [])
let (start, style) = (sta', sty')
@@ -136,27 +157,27 @@ pOrderedList = try $ do
-- treat it as a list item, though it's not valid xhtml...
skipMany nonItem
items <- manyTill (pInTags "li" block >>~ skipMany nonItem) (pCloses "ol")
- return [OrderedList (start, style, DefaultDelim) $ map (fixPlains True) items]
+ return $ B.orderedListWith (start, style, DefaultDelim) $ map (fixPlains True) items
-pDefinitionList :: TagParser [Block]
+pDefinitionList :: TagParser Blocks
pDefinitionList = try $ do
pSatisfy (~== TagOpen "dl" [])
items <- manyTill pDefListItem (pCloses "dl")
- return [DefinitionList items]
+ return $ B.definitionList items
-pDefListItem :: TagParser ([Inline],[[Block]])
+pDefListItem :: TagParser (Inlines, [Blocks])
pDefListItem = try $ do
let nonItem = pSatisfy (\t -> not (t ~== TagOpen "dt" []) &&
not (t ~== TagOpen "dd" []) && not (t ~== TagClose "dl"))
terms <- many1 (try $ skipMany nonItem >> pInTags "dt" inline)
defs <- many1 (try $ skipMany nonItem >> pInTags "dd" block)
skipMany nonItem
- let term = intercalate [LineBreak] terms
+ let term = foldl1 (\x y -> x <> B.linebreak <> y) terms
return (term, map (fixPlains True) defs)
-fixPlains :: Bool -> [Block] -> [Block]
-fixPlains inList bs = if any isParaish bs
- then map plainToPara bs
+fixPlains :: Bool -> Blocks -> Blocks
+fixPlains inList bs = if any isParaish bs'
+ then B.fromList $ map plainToPara bs'
else bs
where isParaish (Para _) = True
isParaish (CodeBlock _ _) = True
@@ -168,6 +189,7 @@ fixPlains inList bs = if any isParaish bs
isParaish _ = False
plainToPara (Plain xs) = Para xs
plainToPara x = x
+ bs' = B.toList bs
pRawTag :: TagParser String
pRawTag = do
@@ -177,13 +199,20 @@ pRawTag = do
then return []
else return $ renderTags' [tag]
-pRawHtmlBlock :: TagParser [Block]
+pDiv :: TagParser Blocks
+pDiv = try $ do
+ getOption readerParseRaw >>= guard
+ TagOpen _ attr <- lookAhead $ pSatisfy $ tagOpen (=="div") (const True)
+ contents <- pInTags "div" block
+ return $ B.divWith (mkAttr attr) contents
+
+pRawHtmlBlock :: TagParser Blocks
pRawHtmlBlock = do
raw <- pHtmlBlock "script" <|> pHtmlBlock "style" <|> pRawTag
parseRaw <- getOption readerParseRaw
if parseRaw && not (null raw)
- then return [RawBlock "html" raw]
- else return []
+ then return $ B.rawBlock "html" raw
+ else return mempty
pHtmlBlock :: String -> TagParser String
pHtmlBlock t = try $ do
@@ -191,70 +220,96 @@ pHtmlBlock t = try $ do
contents <- manyTill pAnyTag (pSatisfy (~== TagClose t))
return $ renderTags' $ [open] ++ contents ++ [TagClose t]
-pHeader :: TagParser [Block]
+pHeader :: TagParser Blocks
pHeader = try $ do
TagOpen tagtype attr <- pSatisfy $
tagOpen (`elem` ["h1","h2","h3","h4","h5","h6"])
(const True)
let bodyTitle = TagOpen tagtype attr ~== TagOpen "h1" [("class","title")]
let level = read (drop 1 tagtype)
- contents <- liftM concat $ manyTill inline (pCloses tagtype <|> eof)
- let ident = maybe "" id $ lookup "id" attr
+ contents <- trimInlines . mconcat <$> manyTill inline (pCloses tagtype <|> eof)
+ let ident = fromMaybe "" $ lookup "id" attr
let classes = maybe [] words $ lookup "class" attr
let keyvals = [(k,v) | (k,v) <- attr, k /= "class", k /= "id"]
return $ if bodyTitle
- then [] -- skip a representation of the title in the body
- else [Header level (ident, classes, keyvals) $
- normalizeSpaces contents]
+ then mempty -- skip a representation of the title in the body
+ else B.headerWith (ident, classes, keyvals) level contents
-pHrule :: TagParser [Block]
+pHrule :: TagParser Blocks
pHrule = do
pSelfClosing (=="hr") (const True)
- return [HorizontalRule]
+ return B.horizontalRule
-pSimpleTable :: TagParser [Block]
-pSimpleTable = try $ do
+pTable :: TagParser Blocks
+pTable = try $ do
TagOpen _ _ <- pSatisfy (~== TagOpen "table" [])
skipMany pBlank
- caption <- option [] $ pInTags "caption" inline >>~ skipMany pBlank
- skipMany $ (pInTags "col" block >> skipMany pBlank) <|>
- (pInTags "colgroup" block >> skipMany pBlank)
+ caption <- option mempty $ pInTags "caption" inline >>~ skipMany pBlank
+ -- TODO actually read these and take width information from them
+ widths' <- pColgroup <|> many pCol
head' <- option [] $ pOptInTag "thead" $ pInTags "tr" (pCell "th")
skipMany pBlank
rows <- pOptInTag "tbody"
$ many1 $ try $ skipMany pBlank >> pInTags "tr" (pCell "td")
skipMany pBlank
TagClose _ <- pSatisfy (~== TagClose "table")
- let cols = maximum $ map length rows
- let aligns = replicate cols AlignLeft
- let widths = replicate cols 0
- return [Table caption aligns widths head' rows]
+ let isSinglePlain x = case B.toList x of
+ [Plain _] -> True
+ _ -> False
+ let isSimple = all isSinglePlain $ concat (head':rows)
+ let cols = length $ if null head' then head rows else head'
+ -- fail if there are colspans or rowspans
+ guard $ all (\r -> length r == cols) rows
+ let aligns = replicate cols AlignDefault
+ let widths = if null widths'
+ then if isSimple
+ then replicate cols 0
+ else replicate cols (1.0 / fromIntegral cols)
+ else widths'
+ return $ B.table caption (zip aligns widths) head' rows
+
+pCol :: TagParser Double
+pCol = try $ do
+ TagOpen _ attribs <- pSatisfy (~== TagOpen "col" [])
+ skipMany pBlank
+ optional $ pSatisfy (~== TagClose "col")
+ skipMany pBlank
+ return $ case lookup "width" attribs of
+ Just x | not (null x) && last x == '%' ->
+ fromMaybe 0.0 $ safeRead ('0':'.':init x)
+ _ -> 0.0
+
+pColgroup :: TagParser [Double]
+pColgroup = try $ do
+ pSatisfy (~== TagOpen "colgroup" [])
+ skipMany pBlank
+ manyTill pCol (pCloses "colgroup" <|> eof) <* skipMany pBlank
-pCell :: String -> TagParser [TableCell]
+pCell :: String -> TagParser [Blocks]
pCell celltype = try $ do
skipMany pBlank
- res <- pInTags celltype pPlain
+ res <- pInTags celltype block
skipMany pBlank
return [res]
-pBlockQuote :: TagParser [Block]
+pBlockQuote :: TagParser Blocks
pBlockQuote = do
contents <- pInTags "blockquote" block
- return [BlockQuote $ fixPlains False contents]
+ return $ B.blockQuote $ fixPlains False contents
-pPlain :: TagParser [Block]
+pPlain :: TagParser Blocks
pPlain = do
- contents <- liftM (normalizeSpaces . concat) $ many1 inline
- if null contents
- then return []
- else return [Plain contents]
+ contents <- trimInlines . mconcat <$> many1 inline
+ if B.isNull contents
+ then return mempty
+ else return $ B.plain contents
-pPara :: TagParser [Block]
+pPara :: TagParser Blocks
pPara = do
- contents <- pInTags "p" inline
- return [Para $ normalizeSpaces contents]
+ contents <- trimInlines <$> pInTags "p" inline
+ return $ B.para contents
-pCodeBlock :: TagParser [Block]
+pCodeBlock :: TagParser Blocks
pCodeBlock = try $ do
TagOpen _ attr <- pSatisfy (~== TagOpen "pre" [])
contents <- manyTill pAnyTag (pCloses "pre" <|> eof)
@@ -267,13 +322,9 @@ pCodeBlock = try $ do
let result = case reverse result' of
'\n':_ -> init result'
_ -> result'
- let attribsId = fromMaybe "" $ lookup "id" attr
- let attribsClasses = words $ fromMaybe "" $ lookup "class" attr
- let attribsKV = filter (\(k,_) -> k /= "class" && k /= "id") attr
- let attribs = (attribsId, attribsClasses, attribsKV)
- return [CodeBlock attribs result]
+ return $ B.codeBlockWith (mkAttr attr) result
-inline :: TagParser [Inline]
+inline :: TagParser Inlines
inline = choice
[ pTagText
, pQ
@@ -286,6 +337,7 @@ inline = choice
, pLink
, pImage
, pCode
+ , pSpan
, pRawHtmlInline
]
@@ -312,7 +364,7 @@ pSelfClosing f g = do
optional $ pSatisfy (tagClose f)
return open
-pQ :: TagParser [Inline]
+pQ :: TagParser Inlines
pQ = do
quoteContext <- stateQuoteContext `fmap` getState
let quoteType = case quoteContext of
@@ -321,79 +373,84 @@ pQ = do
let innerQuoteContext = if quoteType == SingleQuote
then InSingleQuote
else InDoubleQuote
- withQuoteContext innerQuoteContext $ pInlinesInTags "q" (Quoted quoteType)
+ let constructor = case quoteType of
+ SingleQuote -> B.singleQuoted
+ DoubleQuote -> B.doubleQuoted
+ withQuoteContext innerQuoteContext $
+ pInlinesInTags "q" constructor
-pEmph :: TagParser [Inline]
-pEmph = pInlinesInTags "em" Emph <|> pInlinesInTags "i" Emph
+pEmph :: TagParser Inlines
+pEmph = pInlinesInTags "em" B.emph <|> pInlinesInTags "i" B.emph
-pStrong :: TagParser [Inline]
-pStrong = pInlinesInTags "strong" Strong <|> pInlinesInTags "b" Strong
+pStrong :: TagParser Inlines
+pStrong = pInlinesInTags "strong" B.strong <|> pInlinesInTags "b" B.strong
-pSuperscript :: TagParser [Inline]
-pSuperscript = pInlinesInTags "sup" Superscript
+pSuperscript :: TagParser Inlines
+pSuperscript = pInlinesInTags "sup" B.superscript
-pSubscript :: TagParser [Inline]
-pSubscript = pInlinesInTags "sub" Subscript
+pSubscript :: TagParser Inlines
+pSubscript = pInlinesInTags "sub" B.subscript
-pStrikeout :: TagParser [Inline]
+pStrikeout :: TagParser Inlines
pStrikeout = do
- pInlinesInTags "s" Strikeout <|>
- pInlinesInTags "strike" Strikeout <|>
- pInlinesInTags "del" Strikeout <|>
+ pInlinesInTags "s" B.strikeout <|>
+ pInlinesInTags "strike" B.strikeout <|>
+ pInlinesInTags "del" B.strikeout <|>
try (do pSatisfy (~== TagOpen "span" [("class","strikeout")])
- contents <- liftM concat $ manyTill inline (pCloses "span")
- return [Strikeout contents])
+ contents <- mconcat <$> manyTill inline (pCloses "span")
+ return $ B.strikeout contents)
-pLineBreak :: TagParser [Inline]
+pLineBreak :: TagParser Inlines
pLineBreak = do
pSelfClosing (=="br") (const True)
- return [LineBreak]
+ return B.linebreak
-pLink :: TagParser [Inline]
+pLink :: TagParser Inlines
pLink = try $ do
tag <- pSatisfy (tagOpenLit "a" (isJust . lookup "href"))
let url = fromAttrib "href" tag
let title = fromAttrib "title" tag
- lab <- liftM concat $ manyTill inline (pCloses "a")
- return [Link (normalizeSpaces lab) (escapeURI url, title)]
+ lab <- trimInlines . mconcat <$> manyTill inline (pCloses "a")
+ return $ B.link (escapeURI url) title lab
-pImage :: TagParser [Inline]
+pImage :: TagParser Inlines
pImage = do
tag <- pSelfClosing (=="img") (isJust . lookup "src")
let url = fromAttrib "src" tag
let title = fromAttrib "title" tag
let alt = fromAttrib "alt" tag
- return [Image (B.toList $ B.text alt) (escapeURI url, title)]
+ return $ B.image (escapeURI url) title (B.text alt)
-pCode :: TagParser [Inline]
+pCode :: TagParser Inlines
pCode = try $ do
(TagOpen open attr) <- pSatisfy $ tagOpen (`elem` ["code","tt"]) (const True)
result <- manyTill pAnyTag (pCloses open)
- let ident = fromMaybe "" $ lookup "id" attr
- let classes = words $ fromMaybe [] $ lookup "class" attr
- let rest = filter (\(x,_) -> x /= "id" && x /= "class") attr
- return [Code (ident,classes,rest)
- $ intercalate " " $ lines $ innerText result]
+ return $ B.codeWith (mkAttr attr) $ intercalate " " $ lines $ innerText result
-pRawHtmlInline :: TagParser [Inline]
+pSpan :: TagParser Inlines
+pSpan = try $ do
+ getOption readerParseRaw >>= guard
+ TagOpen _ attr <- lookAhead $ pSatisfy $ tagOpen (=="span") (const True)
+ contents <- pInTags "span" inline
+ return $ B.spanWith (mkAttr attr) contents
+
+pRawHtmlInline :: TagParser Inlines
pRawHtmlInline = do
result <- pSatisfy (tagComment (const True)) <|> pSatisfy isInlineTag
parseRaw <- getOption readerParseRaw
if parseRaw
- then return [RawInline "html" $ renderTags' [result]]
- else return []
+ then return $ B.rawInline "html" $ renderTags' [result]
+ else return mempty
-pInlinesInTags :: String -> ([Inline] -> Inline)
- -> TagParser [Inline]
-pInlinesInTags tagtype f = do
- contents <- pInTags tagtype inline
- return [f $ normalizeSpaces contents]
+pInlinesInTags :: String -> (Inlines -> Inlines)
+ -> TagParser Inlines
+pInlinesInTags tagtype f = extractSpaces f <$> pInTags tagtype inline
-pInTags :: String -> TagParser [a]
- -> TagParser [a]
+pInTags :: (Monoid a) => String -> TagParser a
+ -> TagParser a
pInTags tagtype parser = try $ do
pSatisfy (~== TagOpen tagtype [])
- liftM concat $ manyTill parser (pCloses tagtype <|> eof)
+ mconcat <$> manyTill parser (pCloses tagtype <|> eof)
pOptInTag :: String -> TagParser a
-> TagParser a
@@ -409,56 +466,65 @@ pCloses :: String -> TagParser ()
pCloses tagtype = try $ do
t <- lookAhead $ pSatisfy $ \tag -> isTagClose tag || isTagOpen tag
case t of
- (TagClose t') | t' == tagtype -> pAnyTag >> return ()
+ (TagClose t') | t' == tagtype -> pAnyTag >> return ()
(TagOpen t' _) | t' `closes` tagtype -> return ()
(TagClose "ul") | tagtype == "li" -> return ()
(TagClose "ol") | tagtype == "li" -> return ()
(TagClose "dl") | tagtype == "li" -> return ()
+ (TagClose "table") | tagtype == "td" -> return ()
+ (TagClose "table") | tagtype == "tr" -> return ()
_ -> mzero
-pTagText :: TagParser [Inline]
+pTagText :: TagParser Inlines
pTagText = try $ do
(TagText str) <- pSatisfy isTagText
st <- getState
case runParser (many pTagContents) st "text" str of
Left _ -> fail $ "Could not parse `" ++ str ++ "'"
- Right result -> return result
+ Right result -> return $ mconcat result
pBlank :: TagParser ()
pBlank = try $ do
(TagText str) <- pSatisfy isTagText
guard $ all isSpace str
-pTagContents :: Parser [Char] ParserState Inline
+pTagContents :: Parser [Char] ParserState Inlines
pTagContents =
- pStr <|> pSpace <|> smartPunctuation pTagContents <|> pSymbol <|> pBad
-
-pStr :: Parser [Char] ParserState Inline
+ B.displayMath <$> mathDisplay
+ <|> B.math <$> mathInline
+ <|> pStr
+ <|> pSpace
+ <|> smartPunctuation pTagContents
+ <|> pSymbol
+ <|> pBad
+
+pStr :: Parser [Char] ParserState Inlines
pStr = do
result <- many1 $ satisfy $ \c ->
not (isSpace c) && not (isSpecial c) && not (isBad c)
pos <- getPosition
updateState $ \s -> s{ stateLastStrPos = Just pos }
- return $ Str result
+ return $ B.str result
isSpecial :: Char -> Bool
isSpecial '"' = True
isSpecial '\'' = True
isSpecial '.' = True
isSpecial '-' = True
+isSpecial '$' = True
isSpecial '\8216' = True
isSpecial '\8217' = True
isSpecial '\8220' = True
isSpecial '\8221' = True
isSpecial _ = False
-pSymbol :: Parser [Char] ParserState Inline
-pSymbol = satisfy isSpecial >>= return . Str . (:[])
+pSymbol :: Parser [Char] ParserState Inlines
+pSymbol = satisfy isSpecial >>= return . B.str . (:[])
isBad :: Char -> Bool
isBad c = c >= '\128' && c <= '\159' -- not allowed in HTML
-pBad :: Parser [Char] ParserState Inline
+pBad :: Parser [Char] ParserState Inlines
pBad = do
c <- satisfy isBad
let c' = case c of
@@ -490,10 +556,10 @@ pBad = do
'\158' -> '\382'
'\159' -> '\376'
_ -> '?'
- return $ Str [c']
+ return $ B.str [c']
-pSpace :: Parser [Char] ParserState Inline
-pSpace = many1 (satisfy isSpace) >> return Space
+pSpace :: Parser [Char] ParserState Inlines
+pSpace = many1 (satisfy isSpace) >> return B.space
--
-- Constants
@@ -521,7 +587,7 @@ blockHtmlTags = ["address", "article", "aside", "blockquote", "body", "button",
"noframes", "noscript", "object", "ol", "output", "p", "pre", "progress",
"section", "table", "tbody", "textarea", "thead", "tfoot", "ul", "dd",
"dt", "frameset", "li", "tbody", "td", "tfoot",
- "th", "thead", "tr", "script", "style", "video"]
+ "th", "thead", "tr", "script", "style", "svg", "video"]
-- We want to allow raw docbook in markdown documents, so we
-- include docbook block tags here too.
@@ -543,15 +609,19 @@ blockTags :: [String]
blockTags = blockHtmlTags ++ blockDocBookTags
isInlineTag :: Tag String -> Bool
-isInlineTag t = tagOpen (`notElem` blockTags) (const True) t ||
- tagClose (`notElem` blockTags) t ||
+isInlineTag t = tagOpen isInlineTagName (const True) t ||
+ tagClose isInlineTagName t ||
tagComment (const True) t
+ where isInlineTagName x = x `notElem` blockTags
isBlockTag :: Tag String -> Bool
-isBlockTag t = tagOpen (`elem` blocktags) (const True) t ||
- tagClose (`elem` blocktags) t ||
+isBlockTag t = tagOpen isBlockTagName (const True) t ||
+ tagClose isBlockTagName t ||
tagComment (const True) t
- where blocktags = blockTags ++ eitherBlockOrInline
+ where isBlockTagName ('?':_) = True
+ isBlockTagName ('!':_) = True
+ isBlockTagName x = x `elem` blockTags
+ || x `elem` eitherBlockOrInline
isTextTag :: Tag String -> Bool
isTextTag = tagText (const True)
@@ -560,7 +630,7 @@ isCommentTag :: Tag String -> Bool
isCommentTag = tagComment (const True)
-- taken from HXT and extended
-
+-- See http://www.w3.org/TR/html5/syntax.html sec 8.1.2.4 optional tags
closes :: String -> String -> Bool
_ `closes` "body" = False
_ `closes` "html" = False
@@ -568,11 +638,18 @@ _ `closes` "html" = False
"li" `closes` "li" = True
"th" `closes` t | t `elem` ["th","td"] = True
"tr" `closes` t | t `elem` ["th","td","tr"] = True
+"dd" `closes` t | t `elem` ["dt", "dd"] = True
"dt" `closes` t | t `elem` ["dt","dd"] = True
-"hr" `closes` "p" = True
-"p" `closes` "p" = True
+"rt" `closes` t | t `elem` ["rb", "rt", "rtc"] = True
+"optgroup" `closes` "optgroup" = True
+"optgroup" `closes` "option" = True
+"option" `closes` "option" = True
+-- http://www.w3.org/TR/html-markup/p.html
+x `closes` "p" | x `elem` ["address", "article", "aside", "blockquote",
+ "dir", "div", "dl", "fieldset", "footer", "form", "h1", "h2", "h3", "h4",
+ "h5", "h6", "header", "hr", "menu", "nav", "ol", "p", "pre", "section",
+ "table", "ul"] = True
"meta" `closes` "meta" = True
-"colgroup" `closes` "colgroup" = True
"form" `closes` "form" = True
"label" `closes` "label" = True
"map" `closes` "map" = True
@@ -620,3 +697,11 @@ htmlTag f = try $ do
_ -> do
rendered <- manyTill anyChar (char '>')
return (next, rendered ++ ">")
+
+mkAttr :: [(String, String)] -> Attr
+mkAttr attr = (attribsId, attribsClasses, attribsKV)
+ where attribsId = fromMaybe "" $ lookup "id" attr
+ attribsClasses = words $ fromMaybe "" $ lookup "class" attr
+ attribsKV = filter (\(k,_) -> k /= "class" && k /= "id") attr
+
+