1 files changed, 44 insertions, 65 deletions
diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs
index 536bddd39..e5c310ffc 100644
--- a/src/Text/Pandoc/Readers/HTML.hs
+++ b/src/Text/Pandoc/Readers/HTML.hs
@@ -19,10 +19,10 @@ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 {- |
    Module      : Text.Pandoc.Readers.HTML
    Copyright   : Copyright (C) 2006-2010 John MacFarlane
-   License     : GNU GPL, version 2 or above 
+   License     : GNU GPL, version 2 or above
 
    Maintainer  : John MacFarlane <jgm@berkeley.edu>
-   Stability   : alpha 
+   Stability   : alpha
    Portability : portable
 
 Conversion of HTML to 'Pandoc' document.
@@ -36,18 +36,17 @@ module Text.Pandoc.Readers.HTML ( readHtml
                                 , isCommentTag
                                 ) where
 
-import Text.ParserCombinators.Parsec
-import Text.ParserCombinators.Parsec.Pos
 import Text.HTML.TagSoup
 import Text.HTML.TagSoup.Match
 import Text.Pandoc.Definition
 import Text.Pandoc.Builder (text, toList)
 import Text.Pandoc.Shared
+import Text.Pandoc.Options
 import Text.Pandoc.Parsing
 import Data.Maybe ( fromMaybe, isJust )
 import Data.List ( intercalate )
-import Data.Char ( isDigit, toLower )
-import Control.Monad ( liftM, guard, when )
+import Data.Char ( isDigit )
+import Control.Monad ( liftM, guard, when, mzero )
 
 isSpace :: Char -> Bool
 isSpace ' '  = True
@@ -56,11 +55,11 @@ isSpace '\n' = True
 isSpace _    = False
 
 -- | Convert HTML-formatted string to 'Pandoc' document.
-readHtml :: ParserState   -- ^ Parser state
+readHtml :: ReaderOptions -- ^ Reader options
          -> String        -- ^ String to parse (assumes @'\n'@ line endings)
          -> Pandoc
-readHtml st inp = Pandoc meta blocks
-  where blocks  = readWith parseBody st rest
+readHtml opts inp = Pandoc meta blocks
+  where blocks  = readWith parseBody def{ stateOptions = opts } rest
         tags    = canonicalizeTags $
                    parseTagsOptions parseOptions{ optTagPosition = True } inp
         hasHeader = any (~== TagOpen "head" []) tags
@@ -68,7 +67,7 @@ readHtml st inp = Pandoc meta blocks
                           then parseHeader tags
                           else (Meta [] [] [], tags)
 
-type TagParser = GenParser (Tag String) ParserState
+type TagParser = Parser [Tag String] ParserState
 
 -- TODO - fix this - not every header has a title tag
 parseHeader :: [Tag String] -> (Meta, [Tag String])
@@ -96,18 +95,6 @@ block = choice
             , pRawHtmlBlock
             ]
 
--- repeated in SelfContained -- consolidate eventually
-renderTags' :: [Tag String] -> String
-renderTags' = renderTagsOptions
-               renderOptions{ optMinimize = \x ->
-                                    let y = map toLower x
-                                    in  y == "hr" || y == "br" ||
-                                        y == "img" || y == "meta" ||
-                                        y == "link"
-                            , optRawTag = \x ->
-                                    let y = map toLower x
-                                    in  y == "script" || y == "style" }
-
 pList :: TagParser [Block]
 pList = pBulletList <|> pOrderedList <|> pDefinitionList
 
@@ -126,25 +113,22 @@ pBulletList = try $ do
 pOrderedList :: TagParser [Block]
 pOrderedList = try $ do
   TagOpen _ attribs <- pSatisfy (~== TagOpen "ol" [])
-  st <- getState
-  let (start, style) =  if stateStrict st
-                           then (1, DefaultStyle) 
-                           else (sta', sty')
-                              where sta = fromMaybe "1" $
-                                          lookup "start" attribs
-                                    sta' = if all isDigit sta
-                                              then read sta
-                                              else 1
-                                    sty = fromMaybe (fromMaybe "" $
-                                          lookup "style" attribs) $
-                                          lookup "class" attribs
-                                    sty' = case sty of
-                                            "lower-roman"  -> LowerRoman
-                                            "upper-roman"  -> UpperRoman
-                                            "lower-alpha"  -> LowerAlpha
-                                            "upper-alpha"  -> UpperAlpha
-                                            "decimal"      -> Decimal
-                                            _              -> DefaultStyle
+  let (start, style) = (sta', sty')
+                       where sta = fromMaybe "1" $
+                                   lookup "start" attribs
+                             sta' = if all isDigit sta
+                                       then read sta
+                                       else 1
+                             sty = fromMaybe (fromMaybe "" $
+                                   lookup "style" attribs) $
+                                   lookup "class" attribs
+                             sty' = case sty of
+                                     "lower-roman"  -> LowerRoman
+                                     "upper-roman"  -> UpperRoman
+                                     "lower-alpha"  -> LowerAlpha
+                                     "upper-alpha"  -> UpperAlpha
+                                     "decimal"      -> Decimal
+                                     _              -> DefaultStyle
   let nonItem = pSatisfy (\t ->
                   not (tagOpen (`elem` ["li","ol","ul","dl"]) (const True) t) &&
                   not (t ~== TagClose "ol"))
@@ -196,8 +180,8 @@ pRawTag = do
 pRawHtmlBlock :: TagParser [Block]
 pRawHtmlBlock = do
   raw <- pHtmlBlock "script" <|> pHtmlBlock "style" <|> pRawTag
-  state <- getState
-  if stateParseRaw state && not (null raw)
+  parseRaw <- getOption readerParseRaw
+  if parseRaw && not (null raw)
      then return [RawBlock "html" raw]
      else return []
 
@@ -235,7 +219,7 @@ pSimpleTable = try $ do
   rows <- pOptInTag "tbody"
           $ many1 $ try $ skipMany pBlank >> pInTags "tr" (pCell "td")
   skipMany pBlank
-  TagClose _ <- pSatisfy (~== TagClose "table") 
+  TagClose _ <- pSatisfy (~== TagClose "table")
   let cols = maximum $ map length rows
   let aligns = replicate cols AlignLeft
   let widths = replicate cols 0
@@ -281,10 +265,7 @@ pCodeBlock = try $ do
   let attribsId = fromMaybe "" $ lookup "id" attr
   let attribsClasses = words $ fromMaybe "" $ lookup "class" attr
   let attribsKV = filter (\(k,_) -> k /= "class" && k /= "id") attr
-  st <- getState
-  let attribs = if stateStrict st
-                   then ("",[],[])
-                   else (attribsId, attribsClasses, attribsKV)
+  let attribs = (attribsId, attribsClasses, attribsKV)
   return [CodeBlock attribs result]
 
 inline :: TagParser [Inline]
@@ -310,7 +291,7 @@ pLocation = do
 pSat :: (Tag String -> Bool) -> TagParser (Tag String)
 pSat f = do
   pos <- getPosition
-  token show (const pos) (\x -> if f x then Just x else Nothing) 
+  token show (const pos) (\x -> if f x then Just x else Nothing)
 
 pSatisfy :: (Tag String -> Bool) -> TagParser (Tag String)
 pSatisfy f = try $ optional pLocation >> pSat f
@@ -332,14 +313,13 @@ pStrong :: TagParser [Inline]
 pStrong = pInlinesInTags "strong" Strong <|> pInlinesInTags "b" Strong
 
 pSuperscript :: TagParser [Inline]
-pSuperscript = failIfStrict >> pInlinesInTags "sup" Superscript
+pSuperscript = pInlinesInTags "sup" Superscript
 
 pSubscript :: TagParser [Inline]
-pSubscript = failIfStrict >> pInlinesInTags "sub" Subscript
+pSubscript = pInlinesInTags "sub" Subscript
 
 pStrikeout :: TagParser [Inline]
 pStrikeout = do
-  failIfStrict
   pInlinesInTags "s" Strikeout <|>
     pInlinesInTags "strike" Strikeout <|>
     pInlinesInTags "del" Strikeout <|>
@@ -381,8 +361,8 @@ pCode = try $ do
 pRawHtmlInline :: TagParser [Inline]
 pRawHtmlInline = do
   result <- pSatisfy (tagComment (const True)) <|> pSatisfy isInlineTag
-  state <- getState
-  if stateParseRaw state
+  parseRaw <- getOption readerParseRaw
+  if parseRaw
      then return [RawInline "html" $ renderTags' [result]]
      else return []
 
@@ -417,7 +397,7 @@ pCloses tagtype = try $ do
        (TagClose "ul") | tagtype == "li" -> return ()
        (TagClose "ol") | tagtype == "li" -> return ()
        (TagClose "dl") | tagtype == "li" -> return ()
-       _ -> pzero
+       _ -> mzero
 
 pTagText :: TagParser [Inline]
 pTagText = try $ do
@@ -432,11 +412,11 @@ pBlank = try $ do
   (TagText str) <- pSatisfy isTagText
   guard $ all isSpace str
 
-pTagContents :: GenParser Char ParserState Inline
+pTagContents :: Parser [Char] ParserState Inline
 pTagContents =
   pStr <|> pSpace <|> smartPunctuation pTagContents <|> pSymbol <|> pBad
 
-pStr :: GenParser Char ParserState Inline
+pStr :: Parser [Char] ParserState Inline
 pStr = do
   result <- many1 $ satisfy $ \c ->
                      not (isSpace c) && not (isSpecial c) && not (isBad c)
@@ -455,13 +435,13 @@ isSpecial '\8220' = True
 isSpecial '\8221' = True
 isSpecial _ = False
 
-pSymbol :: GenParser Char ParserState Inline
+pSymbol :: Parser [Char] ParserState Inline
 pSymbol = satisfy isSpecial >>= return . Str . (:[])
 
 isBad :: Char -> Bool
 isBad c = c >= '\128' && c <= '\159' -- not allowed in HTML
 
-pBad :: GenParser Char ParserState Inline
+pBad :: Parser [Char] ParserState Inline
 pBad = do
   c <- satisfy isBad
   let c' = case c of
@@ -495,7 +475,7 @@ pBad = do
                 _      -> '?'
   return $ Str [c']
 
-pSpace :: GenParser Char ParserState Inline
+pSpace :: Parser [Char] ParserState Inline
 pSpace = many1 (satisfy isSpace) >> return Space
 
 --
@@ -593,20 +573,19 @@ _ `closes` _ = False
 --- parsers for use in markdown, textile readers
 
 -- | Matches a stretch of HTML in balanced tags.
-htmlInBalanced :: (Tag String -> Bool) -> GenParser Char ParserState String
+htmlInBalanced :: (Tag String -> Bool) -> Parser [Char] ParserState String
 htmlInBalanced f = try $ do
   (TagOpen t _, tag) <- htmlTag f
   guard $ '/' `notElem` tag      -- not a self-closing tag
-  let nonTagChunk = many1 $ satisfy (/= '<')
   let stopper = htmlTag (~== TagClose t)
   let anytag = liftM snd $ htmlTag (const True)
   contents <- many $ notFollowedBy' stopper >>
-                     (nonTagChunk <|> htmlInBalanced (const True) <|> anytag)
+                     (htmlInBalanced f <|> anytag <|> count 1 anyChar)
   endtag <- liftM snd stopper
   return $ tag ++ concat contents ++ endtag
 
 -- | Matches a tag meeting a certain condition.
-htmlTag :: (Tag String -> Bool) -> GenParser Char ParserState (Tag String, String)
+htmlTag :: (Tag String -> Bool) -> Parser [Char] ParserState (Tag String, String)
 htmlTag f = try $ do
   lookAhead (char '<')
   (next : _) <- getInput >>= return . canonicalizeTags . parseTags
@@ -617,7 +596,7 @@ htmlTag f = try $ do
           count (length s + 4) anyChar
           skipMany (satisfy (/='>'))
           char '>'
-          return (next, "<!--" ++ s ++ "-->") 
+          return (next, "<!--" ++ s ++ "-->")
        _            -> do
           rendered <- manyTill anyChar (char '>')
           return (next, rendered ++ ">")