1 files changed, 214 insertions, 129 deletions
diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs
index f6657a4d1..552e8a251 100644
--- a/src/Text/Pandoc/Readers/HTML.hs
+++ b/src/Text/Pandoc/Readers/HTML.hs
@@ -1,5 +1,5 @@
 {-
-Copyright (C) 2006-2010 John MacFarlane <jgm@berkeley.edu>
+Copyright (C) 2006-2014 John MacFarlane <jgm@berkeley.edu>
 
 This program is free software; you can redistribute it and/or modify
 it under the terms of the GNU General Public License as published by
@@ -18,7 +18,7 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 
 {- |
    Module      : Text.Pandoc.Readers.HTML
-   Copyright   : Copyright (C) 2006-2010 John MacFarlane
+   Copyright   : Copyright (C) 2006-2014 John MacFarlane
    License     : GNU GPL, version 2 or above
 
    Maintainer  : John MacFarlane <jgm@berkeley.edu>
@@ -40,6 +40,7 @@ import Text.HTML.TagSoup
 import Text.HTML.TagSoup.Match
 import Text.Pandoc.Definition
 import qualified Text.Pandoc.Builder as B
+import Text.Pandoc.Builder (Blocks, Inlines, trimInlines)
 import Text.Pandoc.Shared
 import Text.Pandoc.Options
 import Text.Pandoc.Parsing
@@ -47,7 +48,10 @@ import Data.Maybe ( fromMaybe, isJust )
 import Data.List ( intercalate )
 import Data.Char ( isDigit )
 import Control.Monad ( liftM, guard, when, mzero )
-import Control.Applicative ( (<$>), (<$) )
+import Control.Applicative ( (<$>), (<$), (<*) )
+import Data.Monoid
+import Text.Printf (printf)
+import Debug.Trace (trace)
 
 isSpace :: Char -> Bool
 isSpace ' '  = True
@@ -66,39 +70,56 @@ readHtml opts inp =
     where tags = canonicalizeTags $
                    parseTagsOptions parseOptions{ optTagPosition = True } inp
           parseDoc = do
-             blocks <- (fixPlains False . concat) <$> manyTill block eof
+             blocks <- (fixPlains False) . mconcat <$> manyTill block eof
              meta <- stateMeta <$> getState
-             return $ Pandoc meta blocks
+             return $ Pandoc meta (B.toList blocks)
 
 type TagParser = Parser [Tag String] ParserState
 
-pBody :: TagParser [Block]
+pBody :: TagParser Blocks
 pBody = pInTags "body" block
 
-pHead :: TagParser [Block]
-pHead = pInTags "head" $ pTitle <|> ([] <$ pAnyTag)
-  where pTitle = pInTags "title" inline >>= setTitle . normalizeSpaces
-        setTitle t = [] <$ (updateState $ B.setMeta "title" (B.fromList t))
-
-block :: TagParser [Block]
-block = choice
+pHead :: TagParser Blocks
+pHead = pInTags "head" $ pTitle <|> pMetaTag <|> (mempty <$ pAnyTag)
+  where pTitle = pInTags "title" inline >>= setTitle . trimInlines
+        setTitle t = mempty <$ (updateState $ B.setMeta "title" t)
+        pMetaTag = do
+          mt <- pSatisfy (~== TagOpen "meta" [])
+          let name = fromAttrib "name" mt
+          if null name
+             then return mempty
+             else do
+               let content = fromAttrib "content" mt
+               updateState $ B.setMeta name (B.text content)
+               return mempty
+
+block :: TagParser Blocks
+block = do
+  tr <- getOption readerTrace
+  pos <- getPosition
+  res <- choice
             [ pPara
             , pHeader
             , pBlockQuote
             , pCodeBlock
             , pList
             , pHrule
-            , pSimpleTable
+            , pTable
             , pHead
             , pBody
             , pPlain
+            , pDiv
             , pRawHtmlBlock
             ]
+  when tr $ trace (printf "line %d: %s" (sourceLine pos)
+             (take 60 $ show $ B.toList res)) (return ())
+  return res
 
-pList :: TagParser [Block]
+
+pList :: TagParser Blocks
 pList = pBulletList <|> pOrderedList <|> pDefinitionList
 
-pBulletList :: TagParser [Block]
+pBulletList :: TagParser Blocks
 pBulletList = try $ do
   pSatisfy (~== TagOpen "ul" [])
   let nonItem = pSatisfy (\t ->
@@ -108,9 +129,9 @@ pBulletList = try $ do
   -- treat it as a list item, though it's not valid xhtml...
   skipMany nonItem
   items <- manyTill (pInTags "li" block >>~ skipMany nonItem) (pCloses "ul")
-  return [BulletList $ map (fixPlains True) items]
+  return $ B.bulletList $ map (fixPlains True) items
 
-pOrderedList :: TagParser [Block]
+pOrderedList :: TagParser Blocks
 pOrderedList = try $ do
   TagOpen _ attribs <- pSatisfy (~== TagOpen "ol" [])
   let (start, style) = (sta', sty')
@@ -136,27 +157,27 @@ pOrderedList = try $ do
   -- treat it as a list item, though it's not valid xhtml...
   skipMany nonItem
   items <- manyTill (pInTags "li" block >>~ skipMany nonItem) (pCloses "ol")
-  return [OrderedList (start, style, DefaultDelim) $ map (fixPlains True) items]
+  return $ B.orderedListWith (start, style, DefaultDelim) $ map (fixPlains True) items
 
-pDefinitionList :: TagParser [Block]
+pDefinitionList :: TagParser Blocks
 pDefinitionList = try $ do
   pSatisfy (~== TagOpen "dl" [])
   items <- manyTill pDefListItem (pCloses "dl")
-  return [DefinitionList items]
+  return $ B.definitionList items
 
-pDefListItem :: TagParser ([Inline],[[Block]])
+pDefListItem :: TagParser (Inlines, [Blocks])
 pDefListItem = try $ do
   let nonItem = pSatisfy (\t -> not (t ~== TagOpen "dt" []) &&
                   not (t ~== TagOpen "dd" []) && not (t ~== TagClose "dl"))
   terms <- many1 (try $ skipMany nonItem >> pInTags "dt" inline)
   defs  <- many1 (try $ skipMany nonItem >> pInTags "dd" block)
   skipMany nonItem
-  let term = intercalate [LineBreak] terms
+  let term = foldl1 (\x y ->  x <> B.linebreak <> y) terms
   return (term, map (fixPlains True) defs)
 
-fixPlains :: Bool -> [Block] -> [Block]
-fixPlains inList bs = if any isParaish bs
-                         then map plainToPara bs
+fixPlains :: Bool -> Blocks -> Blocks
+fixPlains inList bs = if any isParaish bs'
+                         then B.fromList $ map plainToPara bs'
                          else bs
   where isParaish (Para _) = True
         isParaish (CodeBlock _ _) = True
@@ -168,6 +189,7 @@ fixPlains inList bs = if any isParaish bs
         isParaish _        = False
         plainToPara (Plain xs) = Para xs
         plainToPara x = x
+        bs' = B.toList bs
 
 pRawTag :: TagParser String
 pRawTag = do
@@ -177,13 +199,20 @@ pRawTag = do
      then return []
      else return $ renderTags' [tag]
 
-pRawHtmlBlock :: TagParser [Block]
+pDiv :: TagParser Blocks
+pDiv = try $ do
+  getOption readerParseRaw >>= guard
+  TagOpen _ attr <- lookAhead $ pSatisfy $ tagOpen (=="div") (const True)
+  contents <- pInTags "div" block
+  return $ B.divWith (mkAttr attr) contents
+
+pRawHtmlBlock :: TagParser Blocks
 pRawHtmlBlock = do
   raw <- pHtmlBlock "script" <|> pHtmlBlock "style" <|> pRawTag
   parseRaw <- getOption readerParseRaw
   if parseRaw && not (null raw)
-     then return [RawBlock "html" raw]
-     else return []
+     then return $ B.rawBlock "html" raw
+     else return mempty
 
 pHtmlBlock :: String -> TagParser String
 pHtmlBlock t = try $ do
@@ -191,70 +220,96 @@ pHtmlBlock t = try $ do
   contents <- manyTill pAnyTag (pSatisfy (~== TagClose t))
   return $ renderTags' $ [open] ++ contents ++ [TagClose t]
 
-pHeader :: TagParser [Block]
+pHeader :: TagParser Blocks
 pHeader = try $ do
   TagOpen tagtype attr <- pSatisfy $
                            tagOpen (`elem` ["h1","h2","h3","h4","h5","h6"])
                            (const True)
   let bodyTitle = TagOpen tagtype attr ~== TagOpen "h1" [("class","title")]
   let level = read (drop 1 tagtype)
-  contents <- liftM concat $ manyTill inline (pCloses tagtype <|> eof)
-  let ident = maybe "" id $ lookup "id" attr
+  contents <- trimInlines . mconcat <$> manyTill inline (pCloses tagtype <|> eof)
+  let ident = fromMaybe "" $ lookup "id" attr
   let classes = maybe [] words $ lookup "class" attr
   let keyvals = [(k,v) | (k,v) <- attr, k /= "class", k /= "id"]
   return $ if bodyTitle
-              then []  -- skip a representation of the title in the body
-              else [Header level (ident, classes, keyvals) $
-                    normalizeSpaces contents]
+              then mempty  -- skip a representation of the title in the body
+              else B.headerWith (ident, classes, keyvals) level contents
 
-pHrule :: TagParser [Block]
+pHrule :: TagParser Blocks
 pHrule = do
   pSelfClosing (=="hr") (const True)
-  return [HorizontalRule]
+  return B.horizontalRule
 
-pSimpleTable :: TagParser [Block]
-pSimpleTable = try $ do
+pTable :: TagParser Blocks
+pTable = try $ do
   TagOpen _ _ <- pSatisfy (~== TagOpen "table" [])
   skipMany pBlank
-  caption <- option [] $ pInTags "caption" inline >>~ skipMany pBlank
-  skipMany $ (pInTags "col" block >> skipMany pBlank) <|>
-             (pInTags "colgroup" block >> skipMany pBlank)
+  caption <- option mempty $ pInTags "caption" inline >>~ skipMany pBlank
+  -- TODO actually read these and take width information from them
+  widths' <- pColgroup <|> many pCol
   head' <- option [] $ pOptInTag "thead" $ pInTags "tr" (pCell "th")
   skipMany pBlank
   rows <- pOptInTag "tbody"
           $ many1 $ try $ skipMany pBlank >> pInTags "tr" (pCell "td")
   skipMany pBlank
   TagClose _ <- pSatisfy (~== TagClose "table")
-  let cols = maximum $ map length rows
-  let aligns = replicate cols AlignLeft
-  let widths = replicate cols 0
-  return [Table caption aligns widths head' rows]
+  let isSinglePlain x = case B.toList x of
+                             [Plain _] -> True
+                             _         -> False
+  let isSimple = all isSinglePlain $ concat (head':rows)
+  let cols = length $ if null head' then head rows else head'
+  -- fail if there are colspans or rowspans
+  guard $ all (\r -> length r == cols) rows
+  let aligns = replicate cols AlignDefault
+  let widths = if null widths'
+                  then if isSimple
+                       then replicate cols 0
+                       else replicate cols (1.0 / fromIntegral cols)
+                  else widths'
+  return $ B.table caption (zip aligns widths) head' rows
+
+pCol :: TagParser Double
+pCol = try $ do
+  TagOpen _ attribs <- pSatisfy (~== TagOpen "col" [])
+  skipMany pBlank
+  optional $ pSatisfy (~== TagClose "col")
+  skipMany pBlank
+  return $ case lookup "width" attribs of
+           Just x | not (null x) && last x == '%' ->
+             fromMaybe 0.0 $ safeRead ('0':'.':init x)
+           _ -> 0.0
+
+pColgroup :: TagParser [Double]
+pColgroup = try $ do
+  pSatisfy (~== TagOpen "colgroup" [])
+  skipMany pBlank
+  manyTill pCol (pCloses "colgroup" <|> eof) <* skipMany pBlank
 
-pCell :: String -> TagParser [TableCell]
+pCell :: String -> TagParser [Blocks]
 pCell celltype = try $ do
   skipMany pBlank
-  res <- pInTags celltype pPlain
+  res <- pInTags celltype block
   skipMany pBlank
   return [res]
 
-pBlockQuote :: TagParser [Block]
+pBlockQuote :: TagParser Blocks
 pBlockQuote = do
   contents <- pInTags "blockquote" block
-  return [BlockQuote $ fixPlains False contents]
+  return $ B.blockQuote $ fixPlains False contents
 
-pPlain :: TagParser [Block]
+pPlain :: TagParser Blocks
 pPlain = do
-  contents <- liftM (normalizeSpaces . concat) $ many1 inline
-  if null contents
-     then return []
-     else return [Plain contents]
+  contents <- trimInlines . mconcat <$> many1 inline
+  if B.isNull contents
+     then return mempty
+     else return $ B.plain contents
 
-pPara :: TagParser [Block]
+pPara :: TagParser Blocks
 pPara = do
-  contents <- pInTags "p" inline
-  return [Para $ normalizeSpaces contents]
+  contents <- trimInlines <$> pInTags "p" inline
+  return $ B.para contents
 
-pCodeBlock :: TagParser [Block]
+pCodeBlock :: TagParser Blocks
 pCodeBlock = try $ do
   TagOpen _ attr <- pSatisfy (~== TagOpen "pre" [])
   contents <- manyTill pAnyTag (pCloses "pre" <|> eof)
@@ -267,13 +322,9 @@ pCodeBlock = try $ do
   let result = case reverse result' of
                     '\n':_   -> init result'
                     _        -> result'
-  let attribsId = fromMaybe "" $ lookup "id" attr
-  let attribsClasses = words $ fromMaybe "" $ lookup "class" attr
-  let attribsKV = filter (\(k,_) -> k /= "class" && k /= "id") attr
-  let attribs = (attribsId, attribsClasses, attribsKV)
-  return [CodeBlock attribs result]
+  return $ B.codeBlockWith (mkAttr attr) result
 
-inline :: TagParser [Inline]
+inline :: TagParser Inlines
 inline = choice
            [ pTagText
            , pQ
@@ -286,6 +337,7 @@ inline = choice
            , pLink
            , pImage
            , pCode
+           , pSpan
            , pRawHtmlInline
            ]
 
@@ -312,7 +364,7 @@ pSelfClosing f g = do
   optional $ pSatisfy (tagClose f)
   return open
 
-pQ :: TagParser [Inline]
+pQ :: TagParser Inlines
 pQ = do
   quoteContext <- stateQuoteContext `fmap` getState
   let quoteType = case quoteContext of
@@ -321,79 +373,84 @@ pQ = do
   let innerQuoteContext = if quoteType == SingleQuote
                              then InSingleQuote
                              else InDoubleQuote
-  withQuoteContext innerQuoteContext $ pInlinesInTags "q" (Quoted quoteType)
+  let constructor = case quoteType of
+                            SingleQuote -> B.singleQuoted
+                            DoubleQuote -> B.doubleQuoted
+  withQuoteContext innerQuoteContext $
+    pInlinesInTags "q" constructor
 
-pEmph :: TagParser [Inline]
-pEmph = pInlinesInTags "em" Emph <|> pInlinesInTags "i" Emph
+pEmph :: TagParser Inlines
+pEmph = pInlinesInTags "em" B.emph <|> pInlinesInTags "i" B.emph
 
-pStrong :: TagParser [Inline]
-pStrong = pInlinesInTags "strong" Strong <|> pInlinesInTags "b" Strong
+pStrong :: TagParser Inlines
+pStrong = pInlinesInTags "strong" B.strong <|> pInlinesInTags "b" B.strong
 
-pSuperscript :: TagParser [Inline]
-pSuperscript = pInlinesInTags "sup" Superscript
+pSuperscript :: TagParser Inlines
+pSuperscript = pInlinesInTags "sup" B.superscript
 
-pSubscript :: TagParser [Inline]
-pSubscript = pInlinesInTags "sub" Subscript
+pSubscript :: TagParser Inlines
+pSubscript = pInlinesInTags "sub" B.subscript
 
-pStrikeout :: TagParser [Inline]
+pStrikeout :: TagParser Inlines
 pStrikeout = do
-  pInlinesInTags "s" Strikeout <|>
-    pInlinesInTags "strike" Strikeout <|>
-    pInlinesInTags "del" Strikeout <|>
+  pInlinesInTags "s" B.strikeout <|>
+    pInlinesInTags "strike" B.strikeout <|>
+    pInlinesInTags "del" B.strikeout <|>
     try (do pSatisfy (~== TagOpen "span" [("class","strikeout")])
-            contents <- liftM concat $ manyTill inline (pCloses "span")
-            return [Strikeout contents])
+            contents <- mconcat <$> manyTill inline (pCloses "span")
+            return $ B.strikeout contents)
 
-pLineBreak :: TagParser [Inline]
+pLineBreak :: TagParser Inlines
 pLineBreak = do
   pSelfClosing (=="br") (const True)
-  return [LineBreak]
+  return B.linebreak
 
-pLink :: TagParser [Inline]
+pLink :: TagParser Inlines
 pLink = try $ do
   tag <- pSatisfy (tagOpenLit "a" (isJust . lookup "href"))
   let url = fromAttrib "href" tag
   let title = fromAttrib "title" tag
-  lab <- liftM concat $ manyTill inline (pCloses "a")
-  return [Link (normalizeSpaces lab) (escapeURI url, title)]
+  lab <- trimInlines . mconcat <$> manyTill inline (pCloses "a")
+  return $ B.link (escapeURI url) title lab
 
-pImage :: TagParser [Inline]
+pImage :: TagParser Inlines
 pImage = do
   tag <- pSelfClosing (=="img") (isJust . lookup "src")
   let url = fromAttrib "src" tag
   let title = fromAttrib "title" tag
   let alt = fromAttrib "alt" tag
-  return [Image (B.toList $ B.text alt) (escapeURI url, title)]
+  return $ B.image (escapeURI url) title (B.text alt)
 
-pCode :: TagParser [Inline]
+pCode :: TagParser Inlines
 pCode = try $ do
   (TagOpen open attr) <- pSatisfy $ tagOpen (`elem` ["code","tt"]) (const True)
   result <- manyTill pAnyTag (pCloses open)
-  let ident = fromMaybe "" $ lookup "id" attr
-  let classes = words $ fromMaybe [] $ lookup "class" attr
-  let rest = filter (\(x,_) -> x /= "id" && x /= "class") attr
-  return [Code (ident,classes,rest)
-         $ intercalate " " $ lines $ innerText result]
+  return $ B.codeWith (mkAttr attr) $ intercalate " " $ lines $ innerText result
 
-pRawHtmlInline :: TagParser [Inline]
+pSpan :: TagParser Inlines
+pSpan = try $ do
+  getOption readerParseRaw >>= guard
+  TagOpen _ attr <- lookAhead $ pSatisfy $ tagOpen (=="span") (const True)
+  contents <- pInTags "span" inline
+  return $ B.spanWith (mkAttr attr) contents
+
+pRawHtmlInline :: TagParser Inlines
 pRawHtmlInline = do
   result <- pSatisfy (tagComment (const True)) <|> pSatisfy isInlineTag
   parseRaw <- getOption readerParseRaw
   if parseRaw
-     then return [RawInline "html" $ renderTags' [result]]
-     else return []
+     then return $ B.rawInline "html" $ renderTags' [result]
+     else return mempty
 
-pInlinesInTags :: String -> ([Inline] -> Inline)
-               -> TagParser [Inline]
-pInlinesInTags tagtype f = do
-  contents <- pInTags tagtype inline
-  return [f $ normalizeSpaces contents]
+pInlinesInTags :: String -> (Inlines -> Inlines)
+               -> TagParser Inlines
+pInlinesInTags tagtype f = extractSpaces f <$> pInTags tagtype inline
 
-pInTags :: String -> TagParser [a]
-        -> TagParser [a]
+pInTags :: (Monoid a) => String -> TagParser a
+        -> TagParser a
 pInTags tagtype parser = try $ do
   pSatisfy (~== TagOpen tagtype [])
-  liftM concat $ manyTill parser (pCloses tagtype <|> eof)
+  mconcat <$> manyTill parser (pCloses tagtype <|> eof)
 
 pOptInTag :: String -> TagParser a
           -> TagParser a
@@ -409,56 +466,65 @@ pCloses :: String -> TagParser ()
 pCloses tagtype = try $ do
   t <- lookAhead $ pSatisfy $ \tag -> isTagClose tag || isTagOpen tag
   case t of
-       (TagClose t')  | t' == tagtype -> pAnyTag >> return ()
+       (TagClose t') | t' == tagtype -> pAnyTag >> return ()
        (TagOpen t' _) | t' `closes` tagtype -> return ()
        (TagClose "ul") | tagtype == "li" -> return ()
        (TagClose "ol") | tagtype == "li" -> return ()
        (TagClose "dl") | tagtype == "li" -> return ()
+       (TagClose "table") | tagtype == "td" -> return ()
+       (TagClose "table") | tagtype == "tr" -> return ()
        _ -> mzero
 
-pTagText :: TagParser [Inline]
+pTagText :: TagParser Inlines
 pTagText = try $ do
   (TagText str) <- pSatisfy isTagText
   st <- getState
   case runParser (many pTagContents) st "text" str of
        Left _        -> fail $ "Could not parse `" ++ str ++ "'"
-       Right result  -> return result
+       Right result  -> return $ mconcat result
 
 pBlank :: TagParser ()
 pBlank = try $ do
   (TagText str) <- pSatisfy isTagText
   guard $ all isSpace str
 
-pTagContents :: Parser [Char] ParserState Inline
+pTagContents :: Parser [Char] ParserState Inlines
 pTagContents =
-  pStr <|> pSpace <|> smartPunctuation pTagContents <|> pSymbol <|> pBad
-
-pStr :: Parser [Char] ParserState Inline
+      B.displayMath <$> mathDisplay
+  <|> B.math        <$> mathInline
+  <|> pStr
+  <|> pSpace
+  <|> smartPunctuation pTagContents
+  <|> pSymbol
+  <|> pBad
+
+pStr :: Parser [Char] ParserState Inlines
 pStr = do
   result <- many1 $ satisfy $ \c ->
                      not (isSpace c) && not (isSpecial c) && not (isBad c)
   pos <- getPosition
   updateState $ \s -> s{ stateLastStrPos = Just pos }
-  return $ Str result
+  return $ B.str result
 
 isSpecial :: Char -> Bool
 isSpecial '"' = True
 isSpecial '\'' = True
 isSpecial '.' = True
 isSpecial '-' = True
+isSpecial '$' = True
 isSpecial '\8216' = True
 isSpecial '\8217' = True
 isSpecial '\8220' = True
 isSpecial '\8221' = True
 isSpecial _ = False
 
-pSymbol :: Parser [Char] ParserState Inline
-pSymbol = satisfy isSpecial >>= return . Str . (:[])
+pSymbol :: Parser [Char] ParserState Inlines
+pSymbol = satisfy isSpecial >>= return . B.str . (:[])
 
 isBad :: Char -> Bool
 isBad c = c >= '\128' && c <= '\159' -- not allowed in HTML
 
-pBad :: Parser [Char] ParserState Inline
+pBad :: Parser [Char] ParserState Inlines
 pBad = do
   c <- satisfy isBad
   let c' = case c of
@@ -490,10 +556,10 @@ pBad = do
                 '\158' -> '\382'
                 '\159' -> '\376'
                 _      -> '?'
-  return $ Str [c']
+  return $ B.str [c']
 
-pSpace :: Parser [Char] ParserState Inline
-pSpace = many1 (satisfy isSpace) >> return Space
+pSpace :: Parser [Char] ParserState Inlines
+pSpace = many1 (satisfy isSpace) >> return B.space
 
 --
 -- Constants
@@ -521,7 +587,7 @@ blockHtmlTags = ["address", "article", "aside", "blockquote", "body", "button",
                  "noframes", "noscript", "object", "ol", "output", "p", "pre", "progress",
                  "section", "table", "tbody", "textarea", "thead", "tfoot", "ul", "dd",
                  "dt", "frameset", "li", "tbody", "td", "tfoot",
-                 "th", "thead", "tr", "script", "style", "video"]
+                 "th", "thead", "tr", "script", "style", "svg", "video"]
 
 -- We want to allow raw docbook in markdown documents, so we
 -- include docbook block tags here too.
@@ -543,15 +609,19 @@ blockTags :: [String]
 blockTags = blockHtmlTags ++ blockDocBookTags
 
 isInlineTag :: Tag String -> Bool
-isInlineTag t = tagOpen (`notElem` blockTags) (const True) t ||
-                tagClose (`notElem` blockTags) t ||
+isInlineTag t = tagOpen isInlineTagName (const True) t ||
+                tagClose isInlineTagName t ||
                 tagComment (const True) t
+                 where isInlineTagName x = x `notElem` blockTags
 
 isBlockTag :: Tag String -> Bool
-isBlockTag t = tagOpen (`elem` blocktags) (const True) t ||
-               tagClose (`elem` blocktags) t ||
+isBlockTag t = tagOpen isBlockTagName (const True) t ||
+               tagClose isBlockTagName t ||
                tagComment (const True) t
-                 where blocktags = blockTags ++ eitherBlockOrInline
+                 where isBlockTagName ('?':_) = True
+                       isBlockTagName ('!':_) = True
+                       isBlockTagName x       = x `elem` blockTags
+                                             || x `elem` eitherBlockOrInline
 
 isTextTag :: Tag String -> Bool
 isTextTag = tagText (const True)
@@ -560,7 +630,7 @@ isCommentTag :: Tag String -> Bool
 isCommentTag = tagComment (const True)
 
 -- taken from HXT and extended
-
+-- See http://www.w3.org/TR/html5/syntax.html sec 8.1.2.4 optional tags
 closes :: String -> String -> Bool
 _ `closes` "body" = False
 _ `closes` "html" = False
@@ -568,11 +638,18 @@ _ `closes` "html" = False
 "li" `closes` "li" = True
 "th" `closes` t | t `elem` ["th","td"] = True
 "tr" `closes` t | t `elem` ["th","td","tr"] = True
+"dd" `closes` t | t `elem` ["dt", "dd"] = True
 "dt" `closes` t | t `elem` ["dt","dd"] = True
-"hr" `closes` "p" = True
-"p" `closes` "p" = True
+"rt" `closes` t | t `elem` ["rb", "rt", "rtc"] = True
+"optgroup" `closes` "optgroup" = True
+"optgroup" `closes` "option" = True
+"option" `closes` "option" = True
+-- http://www.w3.org/TR/html-markup/p.html
+x `closes` "p" | x `elem` ["address", "article", "aside", "blockquote",
+   "dir", "div", "dl", "fieldset", "footer", "form", "h1", "h2", "h3", "h4",
+   "h5", "h6", "header", "hr", "menu", "nav", "ol", "p", "pre", "section",
+   "table", "ul"] = True
 "meta" `closes` "meta" = True
-"colgroup" `closes` "colgroup" = True
 "form" `closes` "form" = True
 "label" `closes` "label" = True
 "map" `closes` "map" = True
@@ -620,3 +697,11 @@ htmlTag f = try $ do
        _            -> do
           rendered <- manyTill anyChar (char '>')
           return (next, rendered ++ ">")
+
+mkAttr :: [(String, String)] -> Attr
+mkAttr attr = (attribsId, attribsClasses, attribsKV)
+  where attribsId = fromMaybe "" $ lookup "id" attr
+        attribsClasses = words $ fromMaybe "" $ lookup "class" attr
+        attribsKV = filter (\(k,_) -> k /= "class" && k /= "id") attr
+
+