diff options
Diffstat (limited to 'src/Text/Pandoc/Readers/HTML.hs')
-rw-r--r-- | src/Text/Pandoc/Readers/HTML.hs | 34 |
1 files changed, 20 insertions, 14 deletions
diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs index 0c017b2e4..d76524e14 100644 --- a/src/Text/Pandoc/Readers/HTML.hs +++ b/src/Text/Pandoc/Readers/HTML.hs @@ -36,8 +36,6 @@ module Text.Pandoc.Readers.HTML ( readHtml , isCommentTag ) where -import Text.ParserCombinators.Parsec -import Text.ParserCombinators.Parsec.Pos import Text.HTML.TagSoup import Text.HTML.TagSoup.Match import Text.Pandoc.Definition @@ -46,8 +44,14 @@ import Text.Pandoc.Shared import Text.Pandoc.Parsing import Data.Maybe ( fromMaybe, isJust ) import Data.List ( intercalate ) -import Data.Char ( isSpace, isDigit, toLower ) -import Control.Monad ( liftM, guard, when ) +import Data.Char ( isDigit, toLower ) +import Control.Monad ( liftM, guard, when, mzero ) + +isSpace :: Char -> Bool +isSpace ' ' = True +isSpace '\t' = True +isSpace '\n' = True +isSpace _ = False -- | Convert HTML-formatted string to 'Pandoc' document. readHtml :: ParserState -- ^ Parser state @@ -62,7 +66,7 @@ readHtml st inp = Pandoc meta blocks then parseHeader tags else (Meta [] [] [], tags) -type TagParser = GenParser (Tag String) ParserState +type TagParser = Parser [Tag String] ParserState -- TODO - fix this - not every header has a title tag parseHeader :: [Tag String] -> (Meta, [Tag String]) @@ -222,6 +226,8 @@ pSimpleTable :: TagParser [Block] pSimpleTable = try $ do TagOpen _ _ <- pSatisfy (~== TagOpen "table" []) skipMany pBlank + caption <- option [] $ pInTags "caption" inline >>~ skipMany pBlank + skipMany $ pInTags "col" block >> skipMany pBlank head' <- option [] $ pOptInTag "thead" $ pInTags "tr" (pCell "th") skipMany pBlank rows <- pOptInTag "tbody" @@ -231,7 +237,7 @@ pSimpleTable = try $ do let cols = maximum $ map length rows let aligns = replicate cols AlignLeft let widths = replicate cols 0 - return [Table [] aligns widths head' rows] + return [Table caption aligns widths head' rows] pCell :: String -> TagParser [TableCell] pCell celltype = try $ do @@ -409,7 +415,7 @@ pCloses tagtype = try $ do (TagClose "ul") | tagtype == "li" -> return () (TagClose "ol") | tagtype == "li" -> return () (TagClose "dl") | tagtype == "li" -> return () - _ -> pzero + _ -> mzero pTagText :: TagParser [Inline] pTagText = try $ do @@ -424,11 +430,11 @@ pBlank = try $ do (TagText str) <- pSatisfy isTagText guard $ all isSpace str -pTagContents :: GenParser Char ParserState Inline +pTagContents :: Parser [Char] ParserState Inline pTagContents = pStr <|> pSpace <|> smartPunctuation pTagContents <|> pSymbol <|> pBad -pStr :: GenParser Char ParserState Inline +pStr :: Parser [Char] ParserState Inline pStr = do result <- many1 $ satisfy $ \c -> not (isSpace c) && not (isSpecial c) && not (isBad c) @@ -447,13 +453,13 @@ isSpecial '\8220' = True isSpecial '\8221' = True isSpecial _ = False -pSymbol :: GenParser Char ParserState Inline +pSymbol :: Parser [Char] ParserState Inline pSymbol = satisfy isSpecial >>= return . Str . (:[]) isBad :: Char -> Bool isBad c = c >= '\128' && c <= '\159' -- not allowed in HTML -pBad :: GenParser Char ParserState Inline +pBad :: Parser [Char] ParserState Inline pBad = do c <- satisfy isBad let c' = case c of @@ -487,7 +493,7 @@ pBad = do _ -> '?' return $ Str [c'] -pSpace :: GenParser Char ParserState Inline +pSpace :: Parser [Char] ParserState Inline pSpace = many1 (satisfy isSpace) >> return Space -- @@ -585,7 +591,7 @@ _ `closes` _ = False --- parsers for use in markdown, textile readers -- | Matches a stretch of HTML in balanced tags. -htmlInBalanced :: (Tag String -> Bool) -> GenParser Char ParserState String +htmlInBalanced :: (Tag String -> Bool) -> Parser [Char] ParserState String htmlInBalanced f = try $ do (TagOpen t _, tag) <- htmlTag f guard $ '/' `notElem` tag -- not a self-closing tag @@ -598,7 +604,7 @@ htmlInBalanced f = try $ do return $ tag ++ concat contents ++ endtag -- | Matches a tag meeting a certain condition. -htmlTag :: (Tag String -> Bool) -> GenParser Char ParserState (Tag String, String) +htmlTag :: (Tag String -> Bool) -> Parser [Char] ParserState (Tag String, String) htmlTag f = try $ do lookAhead (char '<') (next : _) <- getInput >>= return . canonicalizeTags . parseTags |