diff options
Diffstat (limited to 'src/Text/Pandoc/Readers/HTML.hs')
-rw-r--r-- | src/Text/Pandoc/Readers/HTML.hs | 362 |
1 files changed, 160 insertions, 202 deletions
diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs index 1742667b8..1eb5d7b4a 100644 --- a/src/Text/Pandoc/Readers/HTML.hs +++ b/src/Text/Pandoc/Readers/HTML.hs @@ -41,12 +41,12 @@ module Text.Pandoc.Readers.HTML ( ) where import Text.ParserCombinators.Parsec -import Text.Pandoc.ParserCombinators import Text.Pandoc.Definition import Text.Pandoc.Shared -import Text.Pandoc.Entities ( characterEntity, decodeEntities ) +import Text.Pandoc.CharacterReferences ( characterReference, + decodeCharacterReferences ) import Data.Maybe ( fromMaybe ) -import Data.List ( intersect, takeWhile, dropWhile ) +import Data.List ( takeWhile, dropWhile ) import Data.Char ( toUpper, toLower, isAlphaNum ) -- | Convert HTML-formatted string to 'Pandoc' document. @@ -55,10 +55,6 @@ readHtml :: ParserState -- ^ Parser state -> Pandoc readHtml = readWith parseHtml --- for testing -testString :: String -> IO () -testString = testStringWith parseHtml - -- -- Constants -- @@ -74,26 +70,18 @@ inlineHtmlTags = ["a", "abbr", "acronym", "b", "basefont", "bdo", "big", -- -- | Read blocks until end tag. -blocksTilEnd tag = try (do - blocks <- manyTill (do {b <- block; spaces; return b}) (htmlEndTag tag) - return $ filter (/= Null) blocks) +blocksTilEnd tag = do + blocks <- manyTill (block >>~ spaces) (htmlEndTag tag) + return $ filter (/= Null) blocks -- | Read inlines until end tag. -inlinesTilEnd tag = try (do - inlines <- manyTill inline (htmlEndTag tag) - return inlines) +inlinesTilEnd tag = manyTill inline (htmlEndTag tag) -- | Parse blocks between open and close tag. -blocksIn tag = try $ do - htmlTag tag - spaces - blocksTilEnd tag +blocksIn tag = try $ htmlTag tag >> spaces >> blocksTilEnd tag -- | Parse inlines between open and close tag. -inlinesIn tag = try $ do - htmlTag tag - spaces - inlinesTilEnd tag +inlinesIn tag = try $ htmlTag tag >> spaces >> inlinesTilEnd tag -- | Extract type from a tag: e.g. @br@ from @\<br\>@ extractTagType :: String -> String @@ -103,19 +91,19 @@ extractTagType ('<':rest) = extractTagType _ = "" -- | Parse any HTML tag (closing or opening) and return text of tag -anyHtmlTag = try (do +anyHtmlTag = try $ do char '<' spaces tag <- many1 alphaNum attribs <- htmlAttributes spaces ender <- option "" (string "/") - let ender' = if (null ender) then "" else " /" + let ender' = if null ender then "" else " /" spaces char '>' - return ("<" ++ tag ++ attribs ++ ender' ++ ">")) + return $ "<" ++ tag ++ attribs ++ ender' ++ ">" -anyHtmlEndTag = try (do +anyHtmlEndTag = try $ do char '<' spaces char '/' @@ -123,19 +111,19 @@ anyHtmlEndTag = try (do tagType <- many1 alphaNum spaces char '>' - return ("</" ++ tagType ++ ">")) + return $ "</" ++ tagType ++ ">" htmlTag :: String -> GenParser Char st (String, [(String, String)]) -htmlTag tag = try (do +htmlTag tag = try $ do char '<' spaces stringAnyCase tag attribs <- many htmlAttribute spaces - option "" (string "/") + optional (string "/") spaces char '>' - return (tag, (map (\(name, content, raw) -> (name, content)) attribs))) + return (tag, (map (\(name, content, raw) -> (name, content)) attribs)) -- parses a quoted html attribute value quoted quoteChar = do @@ -145,20 +133,20 @@ quoted quoteChar = do htmlAttributes = do attrList <- many htmlAttribute - return (concatMap (\(name, content, raw) -> raw) attrList) + return $ concatMap (\(name, content, raw) -> raw) attrList htmlAttribute = htmlRegularAttribute <|> htmlMinimizedAttribute --- minimized boolean attribute (no = and value) -htmlMinimizedAttribute = try (do +-- minimized boolean attribute +htmlMinimizedAttribute = try $ do many1 space name <- many1 (choice [letter, oneOf ".-_:"]) spaces notFollowedBy (char '=') let content = name - return (name, content, (" " ++ name))) + return (name, content, (" " ++ name)) -htmlRegularAttribute = try (do +htmlRegularAttribute = try $ do many1 space name <- many1 (choice [letter, oneOf ".-_:"]) spaces @@ -170,10 +158,10 @@ htmlRegularAttribute = try (do a <- many (alphaNum <|> (oneOf "-._:")) return (a,"")) ] return (name, content, - (" " ++ name ++ "=" ++ quoteStr ++ content ++ quoteStr))) + (" " ++ name ++ "=" ++ quoteStr ++ content ++ quoteStr)) -- | Parse an end tag of type 'tag' -htmlEndTag tag = try (do +htmlEndTag tag = try $ do char '<' spaces char '/' @@ -181,87 +169,83 @@ htmlEndTag tag = try (do stringAnyCase tag spaces char '>' - return ("</" ++ tag ++ ">")) + return $ "</" ++ tag ++ ">" -- | Returns @True@ if the tag is an inline tag. isInline tag = (extractTagType tag) `elem` inlineHtmlTags -anyHtmlBlockTag = try (do - tag <- choice [anyHtmlTag, anyHtmlEndTag] - if isInline tag then fail "inline tag" else return tag) +anyHtmlBlockTag = try $ do + tag <- anyHtmlTag <|> anyHtmlEndTag + if isInline tag then fail "inline tag" else return tag -anyHtmlInlineTag = try (do - tag <- choice [ anyHtmlTag, anyHtmlEndTag ] - if isInline tag then return tag else fail "not an inline tag") +anyHtmlInlineTag = try $ do + tag <- anyHtmlTag <|> anyHtmlEndTag + if isInline tag then return tag else fail "not an inline tag" -- | Parses material between script tags. -- Scripts must be treated differently, because they can contain '<>' etc. -htmlScript = try (do +htmlScript = try $ do open <- string "<script" rest <- manyTill anyChar (htmlEndTag "script") - return (open ++ rest ++ "</script>")) + return $ open ++ rest ++ "</script>" htmlBlockElement = choice [ htmlScript, htmlComment, xmlDec, definition ] -rawHtmlBlock = try (do - notFollowedBy' (choice [htmlTag "/body", htmlTag "/html"]) +rawHtmlBlock = try $ do + notFollowedBy' (htmlTag "/body" <|> htmlTag "/html") body <- htmlBlockElement <|> anyHtmlBlockTag - sp <- (many space) + sp <- many space state <- getState - if stateParseRaw state then return (RawHtml (body ++ sp)) else return Null) + if stateParseRaw state then return (RawHtml (body ++ sp)) else return Null -- | Parses an HTML comment. -htmlComment = try (do +htmlComment = try $ do string "<!--" comment <- manyTill anyChar (try (string "-->")) - return ("<!--" ++ comment ++ "-->")) + return $ "<!--" ++ comment ++ "-->" -- -- parsing documents -- -xmlDec = try (do +xmlDec = try $ do string "<?" rest <- manyTill anyChar (char '>') - return ("<?" ++ rest ++ ">")) + return $ "<?" ++ rest ++ ">" -definition = try (do +definition = try $ do string "<!" rest <- manyTill anyChar (char '>') - return ("<!" ++ rest ++ ">")) + return $ "<!" ++ rest ++ ">" -nonTitleNonHead = try (do - notFollowedBy' (htmlTag "title") - notFollowedBy' (htmlTag "/head") - result <- choice [do {rawHtmlBlock; return ' '}, anyChar] - return result) +nonTitleNonHead = try $ notFollowedBy' (htmlTag "title" <|> htmlTag "/head") >> + ((rawHtmlBlock >> return ' ') <|> anyChar) -parseTitle = try (do - (tag, attribs) <- htmlTag "title" +parseTitle = try $ do + (tag, _) <- htmlTag "title" contents <- inlinesTilEnd tag spaces - return contents) + return contents -- parse header and return meta-information (for now, just title) -parseHead = try (do +parseHead = try $ do htmlTag "head" spaces skipMany nonTitleNonHead contents <- option [] parseTitle skipMany nonTitleNonHead htmlTag "/head" - return (contents, [], "")) + return (contents, [], "") -skipHtmlTag tag = option ("",[]) (htmlTag tag) +skipHtmlTag tag = optional (htmlTag tag) -- h1 class="title" representation of title in body -bodyTitle = try (do +bodyTitle = try $ do (tag, attribs) <- htmlTag "h1" cl <- case (extractAttribute "class" attribs) of - Just "title" -> do {return ""} + Just "title" -> return "" otherwise -> fail "not title" inlinesTilEnd "h1" - return "") parseHtml = do sepEndBy (choice [xmlDec, definition, htmlComment]) spaces @@ -271,27 +255,30 @@ parseHtml = do spaces skipHtmlTag "body" spaces - option "" bodyTitle -- skip title in body, because it's represented in meta + optional bodyTitle -- skip title in body, because it's represented in meta blocks <- parseBlocks spaces - option "" (htmlEndTag "body") + optional (htmlEndTag "body") spaces - option "" (htmlEndTag "html") + optional (htmlEndTag "html") many anyChar -- ignore anything after </html> eof - return (Pandoc (Meta title authors date) blocks) + return $ Pandoc (Meta title authors date) blocks -- -- parsing blocks -- -parseBlocks = do - spaces - result <- sepEndBy block spaces - return $ filter (/= Null) result +parseBlocks = spaces >> sepEndBy block spaces >>= (return . filter (/= Null)) -block = choice [ codeBlock, header, hrule, list, blockQuote, para, plain, - rawHtmlBlock ] <?> "block" +block = choice [ codeBlock + , header + , hrule + , list + , blockQuote + , para + , plain + , rawHtmlBlock ] <?> "block" -- -- header blocks @@ -299,53 +286,49 @@ block = choice [ codeBlock, header, hrule, list, blockQuote, para, plain, header = choice (map headerLevel (enumFromTo 1 5)) <?> "header" -headerLevel n = try (do +headerLevel n = try $ do let level = "h" ++ show n (tag, attribs) <- htmlTag level contents <- inlinesTilEnd level - return (Header n (normalizeSpaces contents))) + return $ Header n (normalizeSpaces contents) -- -- hrule block -- -hrule = try (do +hrule = try $ do (tag, attribs) <- htmlTag "hr" state <- getState - if (not (null attribs)) && (stateParseRaw state) - then -- in this case we want to parse it as raw html - unexpected "attributes in hr" - else return HorizontalRule) + if not (null attribs) && stateParseRaw state + then unexpected "attributes in hr" -- parse as raw in this case + else return HorizontalRule -- -- code blocks -- -codeBlock = choice [ preCodeBlock, bareCodeBlock ] <?> "code block" +codeBlock = preCodeBlock <|> bareCodeBlock <?> "code block" -preCodeBlock = try (do +preCodeBlock = try $ do htmlTag "pre" spaces - htmlTag "code" - result <- manyTill anyChar (htmlEndTag "code") + result <- bareCodeBlock spaces htmlEndTag "pre" - return (CodeBlock (stripTrailingNewlines (decodeEntities result)))) + return result -bareCodeBlock = try (do +bareCodeBlock = try $ do htmlTag "code" result <- manyTill anyChar (htmlEndTag "code") - return (CodeBlock (stripTrailingNewlines (decodeEntities result)))) + return $ CodeBlock $ stripTrailingNewlines $ + decodeCharacterReferences result -- -- block quotes -- -blockQuote = try (do - tag <- htmlTag "blockquote" - spaces - blocks <- blocksTilEnd "blockquote" - return (BlockQuote blocks)) +blockQuote = try $ htmlTag "blockquote" >> spaces >> + blocksTilEnd "blockquote" >>= (return . BlockQuote) -- -- list blocks @@ -354,119 +337,105 @@ blockQuote = try (do list = choice [ bulletList, orderedList, definitionList ] <?> "list" orderedList = try $ do - (_, attribs) <- htmlTag "ol" - (start, style) <- option (1, DefaultStyle) $ - do failIfStrict - let sta = fromMaybe "1" $ - lookup "start" attribs - let sty = fromMaybe (fromMaybe "" $ - lookup "style" attribs) $ - lookup "class" attribs - let sty' = case sty of - "lower-roman" -> LowerRoman - "upper-roman" -> UpperRoman - "lower-alpha" -> LowerAlpha - "upper-alpha" -> UpperAlpha - "decimal" -> Decimal - _ -> DefaultStyle - return (read sta, sty') - spaces - items <- sepEndBy1 (blocksIn "li") spaces - htmlEndTag "ol" - return (OrderedList (start, style, DefaultDelim) items) + (_, attribs) <- htmlTag "ol" + (start, style) <- option (1, DefaultStyle) $ + do failIfStrict + let sta = fromMaybe "1" $ + lookup "start" attribs + let sty = fromMaybe (fromMaybe "" $ + lookup "style" attribs) $ + lookup "class" attribs + let sty' = case sty of + "lower-roman" -> LowerRoman + "upper-roman" -> UpperRoman + "lower-alpha" -> LowerAlpha + "upper-alpha" -> UpperAlpha + "decimal" -> Decimal + _ -> DefaultStyle + return (read sta, sty') + spaces + items <- sepEndBy1 (blocksIn "li") spaces + htmlEndTag "ol" + return $ OrderedList (start, style, DefaultDelim) items bulletList = try $ do - htmlTag "ul" - spaces - items <- sepEndBy1 (blocksIn "li") spaces - htmlEndTag "ul" - return (BulletList items) + htmlTag "ul" + spaces + items <- sepEndBy1 (blocksIn "li") spaces + htmlEndTag "ul" + return $ BulletList items definitionList = try $ do - failIfStrict -- def lists not part of standard markdown - tag <- htmlTag "dl" - spaces - items <- sepEndBy1 definitionListItem spaces - htmlEndTag "dl" - return (DefinitionList items) + failIfStrict -- def lists not part of standard markdown + tag <- htmlTag "dl" + spaces + items <- sepEndBy1 definitionListItem spaces + htmlEndTag "dl" + return $ DefinitionList items definitionListItem = try $ do - terms <- sepEndBy1 (inlinesIn "dt") spaces - defs <- sepEndBy1 (blocksIn "dd") spaces - let term = joinWithSep [LineBreak] terms - return (term, concat defs) + terms <- sepEndBy1 (inlinesIn "dt") spaces + defs <- sepEndBy1 (blocksIn "dd") spaces + let term = joinWithSep [LineBreak] terms + return (term, concat defs) -- -- paragraph block -- -para = try (do - tag <- htmlTag "p" - result <- inlinesTilEnd "p" - return (Para (normalizeSpaces result))) +para = htmlTag "p" >> inlinesTilEnd "p" >>= return . Para . normalizeSpaces -- -- plain block -- -plain = do - result <- many1 inline - return (Plain (normalizeSpaces result)) +plain = many1 inline >>= return . Plain . normalizeSpaces -- -- inline -- -inline = choice [ text, special ] <?> "inline" - -text = choice [ entity, strong, emph, superscript, subscript, - strikeout, spanStrikeout, code, str, - linebreak, whitespace ] <?> "text" - -special = choice [ link, image, rawHtmlInline ] <?> - "link, inline html, or image" - -entity = do - ent <- characterEntity - return $ Str [ent] - -code = try (do +inline = choice [ charRef + , strong + , emph + , superscript + , subscript + , strikeout + , spanStrikeout + , code + , str + , linebreak + , whitespace + , link + , image + , rawHtmlInline ] <?> "inline" + +code = try $ do htmlTag "code" result <- manyTill anyChar (htmlEndTag "code") -- remove internal line breaks, leading and trailing space, - -- and decode entities - let result' = decodeEntities $ removeLeadingTrailingSpace $ - joinWithSep " " $ lines result - return (Code result')) + -- and decode character references + return $ Code $ decodeCharacterReferences $ removeLeadingTrailingSpace $ + joinWithSep " " $ lines result rawHtmlInline = do - result <- choice [htmlScript, anyHtmlInlineTag] + result <- htmlScript <|> anyHtmlInlineTag state <- getState if stateParseRaw state then return (HtmlInline result) else return (Str "") -betweenTags tag = try (do - htmlTag tag - result <- inlinesTilEnd tag - return (normalizeSpaces result)) +betweenTags tag = try $ htmlTag tag >> inlinesTilEnd tag >>= + return . normalizeSpaces -emph = try (do - result <- choice [betweenTags "em", betweenTags "it"] - return (Emph result)) +emph = (betweenTags "em" <|> betweenTags "it") >>= return . Emph -superscript = try $ do - failIfStrict -- strict markdown has no superscript, so treat as raw HTML - result <- betweenTags "sup" - return (Superscript result) +strong = (betweenTags "b" <|> betweenTags "strong") >>= return . Strong -subscript = try $ do - failIfStrict -- strict markdown has no subscript, so treat as raw HTML - result <- betweenTags "sub" - return (Subscript result) +superscript = failIfStrict >> betweenTags "sup" >>= return . Superscript -strikeout = try $ do - failIfStrict -- strict markdown has no strikeout, so treat as raw HTML - result <- choice [betweenTags "s", betweenTags "strike"] - return (Strikeout result) +subscript = failIfStrict >> betweenTags "sub" >>= return . Subscript + +strikeout = failIfStrict >> (betweenTags "s" <|> betweenTags "strike") >>= + return . Strikeout spanStrikeout = try $ do failIfStrict -- strict markdown has no strikeout, so treat as raw HTML @@ -474,25 +443,14 @@ spanStrikeout = try $ do result <- case (extractAttribute "class" attributes) of Just "strikeout" -> inlinesTilEnd "span" _ -> fail "not a strikeout" - return (Strikeout result) + return $ Strikeout result -strong = try (do - result <- choice [betweenTags "b", betweenTags "strong"] - return (Strong result)) - -whitespace = do - many1 space - return Space +whitespace = many1 space >> return Space -- hard line break -linebreak = do - htmlTag "br" - option ' ' newline - return LineBreak +linebreak = htmlTag "br" >> optional newline >> return LineBreak -str = do - result <- many1 (noneOf "<& \t\n") - return (Str result) +str = many1 (noneOf "<& \t\n") >>= return . Str -- -- links and images @@ -501,27 +459,27 @@ str = do -- extract contents of attribute (attribute names are case-insensitive) extractAttribute name [] = Nothing extractAttribute name ((attrName, contents):rest) = - let name' = map toLower name - attrName' = map toLower attrName in - if (attrName' == name') - then Just (decodeEntities contents) - else extractAttribute name rest + let name' = map toLower name + attrName' = map toLower attrName + in if attrName' == name' + then Just (decodeCharacterReferences contents) + else extractAttribute name rest link = try $ do (tag, attributes) <- htmlTag "a" url <- case (extractAttribute "href" attributes) of - Just url -> do {return url} + Just url -> return url Nothing -> fail "no href" - let title = fromMaybe "" (extractAttribute "title" attributes) + let title = fromMaybe "" $ extractAttribute "title" attributes label <- inlinesTilEnd "a" return $ Link (normalizeSpaces label) (url, title) image = try $ do (tag, attributes) <- htmlTag "img" url <- case (extractAttribute "src" attributes) of - Just url -> do {return url} + Just url -> return url Nothing -> fail "no src" - let title = fromMaybe "" (extractAttribute "title" attributes) + let title = fromMaybe "" $ extractAttribute "title" attributes let alt = fromMaybe "" (extractAttribute "alt" attributes) return $ Image [Str alt] (url, title) |