1 files changed, 160 insertions, 202 deletions
diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs
index 1742667b8..1eb5d7b4a 100644
--- a/src/Text/Pandoc/Readers/HTML.hs
+++ b/src/Text/Pandoc/Readers/HTML.hs
@@ -41,12 +41,12 @@ module Text.Pandoc.Readers.HTML (
                                 ) where
 
 import Text.ParserCombinators.Parsec
-import Text.Pandoc.ParserCombinators
 import Text.Pandoc.Definition
 import Text.Pandoc.Shared 
-import Text.Pandoc.Entities ( characterEntity, decodeEntities )
+import Text.Pandoc.CharacterReferences ( characterReference, 
+                                         decodeCharacterReferences )
 import Data.Maybe ( fromMaybe )
-import Data.List ( intersect, takeWhile, dropWhile )
+import Data.List ( takeWhile, dropWhile )
 import Data.Char ( toUpper, toLower, isAlphaNum )
 
 -- | Convert HTML-formatted string to 'Pandoc' document.
@@ -55,10 +55,6 @@ readHtml :: ParserState   -- ^ Parser state
          -> Pandoc
 readHtml = readWith parseHtml
 
--- for testing 
-testString :: String -> IO ()
-testString = testStringWith parseHtml
-
 --
 -- Constants
 --
@@ -74,26 +70,18 @@ inlineHtmlTags = ["a", "abbr", "acronym", "b", "basefont", "bdo", "big",
 --
 
 -- | Read blocks until end tag.
-blocksTilEnd tag = try (do
-  blocks <- manyTill (do {b <- block; spaces; return b}) (htmlEndTag tag)
-  return $ filter (/= Null) blocks)
+blocksTilEnd tag = do
+  blocks <- manyTill (block >>~ spaces) (htmlEndTag tag)
+  return $ filter (/= Null) blocks
 
 -- | Read inlines until end tag.
-inlinesTilEnd tag = try (do
-  inlines <- manyTill inline (htmlEndTag tag)
-  return inlines)
+inlinesTilEnd tag = manyTill inline (htmlEndTag tag)
 
 -- | Parse blocks between open and close tag.
-blocksIn tag = try $ do
-  htmlTag tag
-  spaces
-  blocksTilEnd tag
+blocksIn tag = try $ htmlTag tag >> spaces >> blocksTilEnd tag
 
 -- | Parse inlines between open and close tag.
-inlinesIn tag = try $ do
-  htmlTag tag
-  spaces
-  inlinesTilEnd tag
+inlinesIn tag = try $ htmlTag tag >> spaces >> inlinesTilEnd tag
 
 -- | Extract type from a tag:  e.g. @br@ from @\<br\>@
 extractTagType :: String -> String
@@ -103,19 +91,19 @@ extractTagType ('<':rest) =
 extractTagType _ = ""
 
 -- | Parse any HTML tag (closing or opening) and return text of tag
-anyHtmlTag = try (do
+anyHtmlTag = try $ do
   char '<'
   spaces
   tag <- many1 alphaNum
   attribs <- htmlAttributes
   spaces
   ender <- option "" (string "/")
-  let ender' = if (null ender) then "" else " /"
+  let ender' = if null ender then "" else " /"
   spaces
   char '>'
-  return ("<" ++ tag ++ attribs ++ ender' ++ ">"))
+  return $ "<" ++ tag ++ attribs ++ ender' ++ ">"
 
-anyHtmlEndTag = try (do
+anyHtmlEndTag = try $ do
   char '<'   
   spaces
   char '/'
@@ -123,19 +111,19 @@ anyHtmlEndTag = try (do
   tagType <- many1 alphaNum
   spaces
   char '>'
-  return ("</" ++ tagType ++ ">"))
+  return $ "</" ++ tagType ++ ">"
 
 htmlTag :: String -> GenParser Char st (String, [(String, String)])
-htmlTag tag = try (do
+htmlTag tag = try $ do
   char '<'
   spaces
   stringAnyCase tag
   attribs <- many htmlAttribute
   spaces
-  option "" (string "/")
+  optional (string "/")
   spaces
   char '>'
-  return (tag, (map (\(name, content, raw) -> (name, content)) attribs)))
+  return (tag, (map (\(name, content, raw) -> (name, content)) attribs))
 
 -- parses a quoted html attribute value
 quoted quoteChar = do
@@ -145,20 +133,20 @@ quoted quoteChar = do
 
 htmlAttributes = do
   attrList <- many htmlAttribute
-  return (concatMap (\(name, content, raw) -> raw) attrList)
+  return $ concatMap (\(name, content, raw) -> raw) attrList
 
 htmlAttribute = htmlRegularAttribute <|> htmlMinimizedAttribute
 
--- minimized boolean attribute (no = and value)
-htmlMinimizedAttribute = try (do
+-- minimized boolean attribute
+htmlMinimizedAttribute = try $ do
   many1 space
   name <- many1 (choice [letter, oneOf ".-_:"])
   spaces
   notFollowedBy (char '=')
   let content = name
-  return (name, content, (" " ++ name)))
+  return (name, content, (" " ++ name))
 
-htmlRegularAttribute = try (do
+htmlRegularAttribute = try $ do
   many1 space
   name <- many1 (choice [letter, oneOf ".-_:"])
   spaces
@@ -170,10 +158,10 @@ htmlRegularAttribute = try (do
                                      a <- many (alphaNum <|> (oneOf "-._:"))
                                      return (a,"")) ]
   return (name, content,
-          (" " ++ name ++ "=" ++ quoteStr ++ content ++ quoteStr)))
+          (" " ++ name ++ "=" ++ quoteStr ++ content ++ quoteStr))
 
 -- | Parse an end tag of type 'tag'
-htmlEndTag tag = try (do
+htmlEndTag tag = try $ do
   char '<'   
   spaces
   char '/'
@@ -181,87 +169,83 @@ htmlEndTag tag = try (do
   stringAnyCase tag
   spaces
   char '>'
-  return ("</" ++ tag ++ ">"))
+  return $ "</" ++ tag ++ ">"
 
 -- | Returns @True@ if the tag is an inline tag.
 isInline tag = (extractTagType tag) `elem` inlineHtmlTags
 
-anyHtmlBlockTag = try (do
-  tag <- choice [anyHtmlTag, anyHtmlEndTag]
-  if isInline tag then fail "inline tag" else return tag)
+anyHtmlBlockTag = try $ do
+  tag <- anyHtmlTag <|> anyHtmlEndTag
+  if isInline tag then fail "inline tag" else return tag
 
-anyHtmlInlineTag = try (do
-  tag <- choice [ anyHtmlTag, anyHtmlEndTag ]
-  if isInline tag then return tag else fail "not an inline tag")
+anyHtmlInlineTag = try $ do
+  tag <- anyHtmlTag <|> anyHtmlEndTag
+  if isInline tag then return tag else fail "not an inline tag"
 
 -- | Parses material between script tags.
 -- Scripts must be treated differently, because they can contain '<>' etc.
-htmlScript = try (do
+htmlScript = try $ do
   open <- string "<script"
   rest <- manyTill anyChar (htmlEndTag "script")
-  return (open ++ rest ++ "</script>"))
+  return $ open ++ rest ++ "</script>"
 
 htmlBlockElement = choice [ htmlScript, htmlComment, xmlDec, definition ]
 
-rawHtmlBlock = try (do
-  notFollowedBy' (choice [htmlTag "/body", htmlTag "/html"])
+rawHtmlBlock = try $ do
+  notFollowedBy' (htmlTag "/body" <|> htmlTag "/html")
   body <- htmlBlockElement <|> anyHtmlBlockTag
-  sp <- (many space)
+  sp <- many space
   state <- getState
-  if stateParseRaw state then return (RawHtml (body ++ sp)) else return Null)
+  if stateParseRaw state then return (RawHtml (body ++ sp)) else return Null
 
 -- | Parses an HTML comment.
-htmlComment = try (do
+htmlComment = try $ do
   string "<!--"
   comment <- manyTill anyChar (try (string "-->"))
-  return ("<!--" ++ comment ++ "-->"))
+  return $ "<!--" ++ comment ++ "-->"
 
 --
 -- parsing documents
 --
 
-xmlDec = try (do
+xmlDec = try $ do
   string "<?"
   rest <- manyTill anyChar (char '>')
-  return ("<?" ++ rest ++ ">"))
+  return $ "<?" ++ rest ++ ">"
 
-definition = try (do
+definition = try $ do
   string "<!"
   rest <- manyTill anyChar (char '>')
-  return ("<!" ++ rest ++ ">"))
+  return $ "<!" ++ rest ++ ">"
 
-nonTitleNonHead = try (do
-  notFollowedBy' (htmlTag "title")
-  notFollowedBy' (htmlTag "/head")
-  result <- choice [do {rawHtmlBlock; return ' '}, anyChar]
-  return result)
+nonTitleNonHead = try $ notFollowedBy' (htmlTag "title" <|> htmlTag "/head") >>
+  ((rawHtmlBlock >> return ' ') <|> anyChar)
 
-parseTitle = try (do
-  (tag, attribs) <- htmlTag "title"
+parseTitle = try $ do
+  (tag, _) <- htmlTag "title"
   contents <- inlinesTilEnd tag
   spaces
-  return contents)
+  return contents
 
 -- parse header and return meta-information (for now, just title)
-parseHead = try (do
+parseHead = try $ do
   htmlTag "head"
   spaces
   skipMany nonTitleNonHead
   contents <- option [] parseTitle
   skipMany nonTitleNonHead
   htmlTag "/head"
-  return (contents, [], ""))
+  return (contents, [], "")
 
-skipHtmlTag tag = option ("",[]) (htmlTag tag)
+skipHtmlTag tag = optional (htmlTag tag)
 
 -- h1 class="title" representation of title in body
-bodyTitle = try (do
+bodyTitle = try $ do
   (tag, attribs) <- htmlTag "h1"  
   cl <- case (extractAttribute "class" attribs) of
-          Just "title" -> do {return ""}
+          Just "title" -> return ""
           otherwise    -> fail "not title"
   inlinesTilEnd "h1"
-  return "")
 
 parseHtml = do
   sepEndBy (choice [xmlDec, definition, htmlComment]) spaces
@@ -271,27 +255,30 @@ parseHtml = do
   spaces
   skipHtmlTag "body"
   spaces
-  option "" bodyTitle  -- skip title in body, because it's represented in meta
+  optional bodyTitle  -- skip title in body, because it's represented in meta
   blocks <- parseBlocks
   spaces
-  option "" (htmlEndTag "body")
+  optional (htmlEndTag "body")
   spaces
-  option "" (htmlEndTag "html")
+  optional (htmlEndTag "html")
   many anyChar -- ignore anything after </html>
   eof
-  return (Pandoc (Meta title authors date) blocks)
+  return $ Pandoc (Meta title authors date) blocks
 
 --
 -- parsing blocks
 --
 
-parseBlocks = do
-  spaces
-  result <- sepEndBy block spaces
-  return $ filter (/= Null) result
+parseBlocks = spaces >> sepEndBy block spaces >>= (return . filter (/= Null))
 
-block = choice [ codeBlock, header, hrule, list, blockQuote, para, plain, 
-                 rawHtmlBlock ] <?> "block"
+block = choice [ codeBlock
+               , header
+               , hrule
+               , list
+               , blockQuote
+               , para
+               , plain
+               , rawHtmlBlock ] <?> "block"
 
 --
 -- header blocks
@@ -299,53 +286,49 @@ block = choice [ codeBlock, header, hrule, list, blockQuote, para, plain,
 
 header = choice (map headerLevel (enumFromTo 1 5)) <?> "header"
 
-headerLevel n = try (do
+headerLevel n = try $ do
     let level = "h" ++ show n
     (tag, attribs) <- htmlTag level
     contents <- inlinesTilEnd level
-    return (Header n (normalizeSpaces contents)))
+    return $ Header n (normalizeSpaces contents)
 
 --
 -- hrule block
 --
 
-hrule = try (do
+hrule = try  $ do
   (tag, attribs) <- htmlTag "hr"
   state <- getState
-  if (not (null attribs)) && (stateParseRaw state)
-     then -- in this case we want to parse it as raw html
-          unexpected "attributes in hr"
-     else return HorizontalRule)
+  if not (null attribs) && stateParseRaw state
+     then unexpected "attributes in hr" -- parse as raw in this case
+     else return HorizontalRule
 
 --
 -- code blocks
 --
 
-codeBlock = choice [ preCodeBlock, bareCodeBlock ] <?> "code block"
+codeBlock = preCodeBlock <|> bareCodeBlock <?> "code block"
 
-preCodeBlock = try (do
+preCodeBlock = try $ do
     htmlTag "pre" 
     spaces
-    htmlTag "code"
-    result <- manyTill anyChar (htmlEndTag "code")
+    result <- bareCodeBlock
     spaces
     htmlEndTag "pre"
-    return (CodeBlock (stripTrailingNewlines (decodeEntities result))))
+    return result
 
-bareCodeBlock = try (do
+bareCodeBlock = try $ do
     htmlTag "code"
     result <- manyTill anyChar (htmlEndTag "code")
-    return (CodeBlock (stripTrailingNewlines (decodeEntities result))))
+    return $ CodeBlock $ stripTrailingNewlines $ 
+             decodeCharacterReferences result
 
 --
 -- block quotes
 --
 
-blockQuote = try (do
-    tag <- htmlTag "blockquote"
-    spaces
-    blocks <- blocksTilEnd "blockquote"
-    return (BlockQuote blocks))
+blockQuote = try $ htmlTag "blockquote" >> spaces >> 
+                   blocksTilEnd "blockquote" >>= (return . BlockQuote)
 
 --
 -- list blocks
@@ -354,119 +337,105 @@ blockQuote = try (do
 list = choice [ bulletList, orderedList, definitionList ] <?> "list"
 
 orderedList = try $ do
-    (_, attribs) <- htmlTag "ol"
-    (start, style) <- option (1, DefaultStyle) $
-                             do failIfStrict
-                                let sta = fromMaybe "1" $ 
-                                          lookup "start" attribs
-                                let sty = fromMaybe (fromMaybe "" $
-                                          lookup "style" attribs) $
-                                          lookup "class" attribs
-                                let sty' = case sty of
-                                            "lower-roman"  -> LowerRoman
-                                            "upper-roman"  -> UpperRoman
-                                            "lower-alpha"  -> LowerAlpha
-                                            "upper-alpha"  -> UpperAlpha
-                                            "decimal"      -> Decimal
-                                            _              -> DefaultStyle
-                                return (read sta, sty')
-    spaces
-    items <- sepEndBy1 (blocksIn "li") spaces
-    htmlEndTag "ol"
-    return (OrderedList (start, style, DefaultDelim) items)
+  (_, attribs) <- htmlTag "ol"
+  (start, style) <- option (1, DefaultStyle) $
+                           do failIfStrict
+                              let sta = fromMaybe "1" $ 
+                                        lookup "start" attribs
+                              let sty = fromMaybe (fromMaybe "" $
+                                        lookup "style" attribs) $
+                                        lookup "class" attribs
+                              let sty' = case sty of
+                                          "lower-roman"  -> LowerRoman
+                                          "upper-roman"  -> UpperRoman
+                                          "lower-alpha"  -> LowerAlpha
+                                          "upper-alpha"  -> UpperAlpha
+                                          "decimal"      -> Decimal
+                                          _              -> DefaultStyle
+                              return (read sta, sty')
+  spaces
+  items <- sepEndBy1 (blocksIn "li") spaces
+  htmlEndTag "ol"
+  return $ OrderedList (start, style, DefaultDelim) items
 
 bulletList = try $ do
-    htmlTag "ul"
-    spaces
-    items <- sepEndBy1 (blocksIn "li") spaces
-    htmlEndTag "ul"
-    return (BulletList items)
+  htmlTag "ul"
+  spaces
+  items <- sepEndBy1 (blocksIn "li") spaces
+  htmlEndTag "ul"
+  return $ BulletList items
 
 definitionList = try $ do
-    failIfStrict  -- def lists not part of standard markdown
-    tag <- htmlTag "dl"
-    spaces
-    items <- sepEndBy1 definitionListItem spaces
-    htmlEndTag "dl"
-    return (DefinitionList items)
+  failIfStrict  -- def lists not part of standard markdown
+  tag <- htmlTag "dl"
+  spaces
+  items <- sepEndBy1 definitionListItem spaces
+  htmlEndTag "dl"
+  return $ DefinitionList items
 
 definitionListItem = try $ do
-    terms <- sepEndBy1 (inlinesIn "dt") spaces
-    defs <- sepEndBy1 (blocksIn "dd") spaces
-    let term = joinWithSep [LineBreak] terms
-    return (term, concat defs)
+  terms <- sepEndBy1 (inlinesIn "dt") spaces
+  defs <- sepEndBy1 (blocksIn "dd") spaces
+  let term = joinWithSep [LineBreak] terms
+  return (term, concat defs)
 
 --
 -- paragraph block
 --
 
-para = try (do 
-  tag <- htmlTag "p"
-  result <- inlinesTilEnd "p"
-  return (Para (normalizeSpaces result)))
+para = htmlTag "p" >> inlinesTilEnd "p" >>= return . Para . normalizeSpaces
 
 -- 
 -- plain block
 --
 
-plain = do
-    result <- many1 inline
-    return (Plain (normalizeSpaces result))
+plain = many1 inline >>= return . Plain . normalizeSpaces
 
 -- 
 -- inline
 --
 
-inline = choice [ text, special ] <?> "inline"
-
-text =  choice [ entity, strong, emph, superscript, subscript,
-                 strikeout, spanStrikeout, code, str,
-                 linebreak, whitespace ] <?> "text"
-
-special = choice [ link, image, rawHtmlInline ] <?> 
-                 "link, inline html, or image"
-
-entity = do
-  ent <- characterEntity
-  return $ Str [ent]
-
-code = try (do 
+inline = choice [ charRef
+                , strong
+                , emph
+                , superscript
+                , subscript
+                , strikeout
+                , spanStrikeout
+                , code
+                , str
+                , linebreak
+                , whitespace
+                , link
+                , image
+                , rawHtmlInline ] <?> "inline"
+
+code = try $ do 
   htmlTag "code"
   result <- manyTill anyChar (htmlEndTag "code")
   -- remove internal line breaks, leading and trailing space,
-  -- and decode entities
-  let result' = decodeEntities $ removeLeadingTrailingSpace $ 
-                joinWithSep " " $ lines result 
-  return (Code result'))
+  -- and decode character references
+  return $ Code $ decodeCharacterReferences $ removeLeadingTrailingSpace $ 
+                  joinWithSep " " $ lines result 
 
 rawHtmlInline = do
-  result <- choice [htmlScript, anyHtmlInlineTag]
+  result <- htmlScript <|> anyHtmlInlineTag
   state <- getState
   if stateParseRaw state then return (HtmlInline result) else return (Str "")
 
-betweenTags tag = try (do
-  htmlTag tag
-  result <- inlinesTilEnd tag
-  return (normalizeSpaces result))
+betweenTags tag = try $ htmlTag tag >> inlinesTilEnd tag >>= 
+                        return . normalizeSpaces
 
-emph = try (do 
-  result <- choice [betweenTags "em", betweenTags "it"]
-  return (Emph result))
+emph = (betweenTags "em" <|> betweenTags "it") >>= return . Emph
 
-superscript = try $ do
-  failIfStrict -- strict markdown has no superscript, so treat as raw HTML
-  result <- betweenTags "sup"
-  return (Superscript result)  
+strong = (betweenTags "b" <|> betweenTags "strong") >>= return . Strong
 
-subscript = try $ do
-  failIfStrict -- strict markdown has no subscript, so treat as raw HTML
-  result <- betweenTags "sub"
-  return (Subscript result)  
+superscript = failIfStrict >> betweenTags "sup" >>= return . Superscript
 
-strikeout = try $ do
-  failIfStrict -- strict markdown has no strikeout, so treat as raw HTML
-  result <- choice [betweenTags "s", betweenTags "strike"]
-  return (Strikeout result)
+subscript = failIfStrict >> betweenTags "sub" >>= return . Subscript
+
+strikeout = failIfStrict >> (betweenTags "s" <|> betweenTags "strike") >>=
+            return . Strikeout
 
 spanStrikeout = try $ do
   failIfStrict -- strict markdown has no strikeout, so treat as raw HTML
@@ -474,25 +443,14 @@ spanStrikeout = try $ do
   result <- case (extractAttribute "class" attributes) of
               Just "strikeout" -> inlinesTilEnd "span"
               _                -> fail "not a strikeout"
-  return (Strikeout result)
+  return $ Strikeout result
 
-strong = try (do 
-  result <- choice [betweenTags "b", betweenTags "strong"]
-  return (Strong result))
-
-whitespace = do
-  many1 space
-  return Space
+whitespace = many1 space >> return Space
 
 -- hard line break
-linebreak = do
-  htmlTag "br"
-  option ' ' newline
-  return LineBreak 
+linebreak = htmlTag "br" >> optional newline >> return LineBreak
 
-str = do 
-  result <- many1 (noneOf "<& \t\n")
-  return (Str result)
+str = many1 (noneOf "<& \t\n") >>= return . Str
 
 --
 -- links and images
@@ -501,27 +459,27 @@ str = do
 -- extract contents of attribute (attribute names are case-insensitive)
 extractAttribute name [] = Nothing
 extractAttribute name ((attrName, contents):rest) = 
-  let name' = map toLower name 
-      attrName' = map toLower attrName in
-  if (attrName' == name')
-     then Just (decodeEntities contents)
-     else extractAttribute name rest
+  let name'     = map toLower name 
+      attrName' = map toLower attrName
+  in  if attrName' == name'
+         then Just (decodeCharacterReferences contents)
+         else extractAttribute name rest
 
 link = try $ do
   (tag, attributes) <- htmlTag "a"  
   url <- case (extractAttribute "href" attributes) of
-           Just url -> do {return url}
+           Just url -> return url
            Nothing  -> fail "no href"
-  let title = fromMaybe "" (extractAttribute "title" attributes)
+  let title = fromMaybe "" $ extractAttribute "title" attributes
   label <- inlinesTilEnd "a"
   return $ Link (normalizeSpaces label) (url, title)
 
 image = try $ do
   (tag, attributes) <- htmlTag "img" 
   url <- case (extractAttribute "src" attributes) of
-           Just url -> do {return url}
+           Just url -> return url
            Nothing  -> fail "no src"
-  let title = fromMaybe "" (extractAttribute "title" attributes)
+  let title = fromMaybe "" $ extractAttribute "title" attributes
   let alt = fromMaybe "" (extractAttribute "alt" attributes)
   return $ Image [Str alt] (url, title)