aboutsummaryrefslogtreecommitdiff
path: root/src/Text/Pandoc/Readers/HTML.hs
diff options
context:
space:
mode:
Diffstat (limited to 'src/Text/Pandoc/Readers/HTML.hs')
-rw-r--r--src/Text/Pandoc/Readers/HTML.hs362
1 files changed, 160 insertions, 202 deletions
diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs
index 1742667b8..1eb5d7b4a 100644
--- a/src/Text/Pandoc/Readers/HTML.hs
+++ b/src/Text/Pandoc/Readers/HTML.hs
@@ -41,12 +41,12 @@ module Text.Pandoc.Readers.HTML (
) where
import Text.ParserCombinators.Parsec
-import Text.Pandoc.ParserCombinators
import Text.Pandoc.Definition
import Text.Pandoc.Shared
-import Text.Pandoc.Entities ( characterEntity, decodeEntities )
+import Text.Pandoc.CharacterReferences ( characterReference,
+ decodeCharacterReferences )
import Data.Maybe ( fromMaybe )
-import Data.List ( intersect, takeWhile, dropWhile )
+import Data.List ( takeWhile, dropWhile )
import Data.Char ( toUpper, toLower, isAlphaNum )
-- | Convert HTML-formatted string to 'Pandoc' document.
@@ -55,10 +55,6 @@ readHtml :: ParserState -- ^ Parser state
-> Pandoc
readHtml = readWith parseHtml
--- for testing
-testString :: String -> IO ()
-testString = testStringWith parseHtml
-
--
-- Constants
--
@@ -74,26 +70,18 @@ inlineHtmlTags = ["a", "abbr", "acronym", "b", "basefont", "bdo", "big",
--
-- | Read blocks until end tag.
-blocksTilEnd tag = try (do
- blocks <- manyTill (do {b <- block; spaces; return b}) (htmlEndTag tag)
- return $ filter (/= Null) blocks)
+blocksTilEnd tag = do
+ blocks <- manyTill (block >>~ spaces) (htmlEndTag tag)
+ return $ filter (/= Null) blocks
-- | Read inlines until end tag.
-inlinesTilEnd tag = try (do
- inlines <- manyTill inline (htmlEndTag tag)
- return inlines)
+inlinesTilEnd tag = manyTill inline (htmlEndTag tag)
-- | Parse blocks between open and close tag.
-blocksIn tag = try $ do
- htmlTag tag
- spaces
- blocksTilEnd tag
+blocksIn tag = try $ htmlTag tag >> spaces >> blocksTilEnd tag
-- | Parse inlines between open and close tag.
-inlinesIn tag = try $ do
- htmlTag tag
- spaces
- inlinesTilEnd tag
+inlinesIn tag = try $ htmlTag tag >> spaces >> inlinesTilEnd tag
-- | Extract type from a tag: e.g. @br@ from @\<br\>@
extractTagType :: String -> String
@@ -103,19 +91,19 @@ extractTagType ('<':rest) =
extractTagType _ = ""
-- | Parse any HTML tag (closing or opening) and return text of tag
-anyHtmlTag = try (do
+anyHtmlTag = try $ do
char '<'
spaces
tag <- many1 alphaNum
attribs <- htmlAttributes
spaces
ender <- option "" (string "/")
- let ender' = if (null ender) then "" else " /"
+ let ender' = if null ender then "" else " /"
spaces
char '>'
- return ("<" ++ tag ++ attribs ++ ender' ++ ">"))
+ return $ "<" ++ tag ++ attribs ++ ender' ++ ">"
-anyHtmlEndTag = try (do
+anyHtmlEndTag = try $ do
char '<'
spaces
char '/'
@@ -123,19 +111,19 @@ anyHtmlEndTag = try (do
tagType <- many1 alphaNum
spaces
char '>'
- return ("</" ++ tagType ++ ">"))
+ return $ "</" ++ tagType ++ ">"
htmlTag :: String -> GenParser Char st (String, [(String, String)])
-htmlTag tag = try (do
+htmlTag tag = try $ do
char '<'
spaces
stringAnyCase tag
attribs <- many htmlAttribute
spaces
- option "" (string "/")
+ optional (string "/")
spaces
char '>'
- return (tag, (map (\(name, content, raw) -> (name, content)) attribs)))
+ return (tag, (map (\(name, content, raw) -> (name, content)) attribs))
-- parses a quoted html attribute value
quoted quoteChar = do
@@ -145,20 +133,20 @@ quoted quoteChar = do
htmlAttributes = do
attrList <- many htmlAttribute
- return (concatMap (\(name, content, raw) -> raw) attrList)
+ return $ concatMap (\(name, content, raw) -> raw) attrList
htmlAttribute = htmlRegularAttribute <|> htmlMinimizedAttribute
--- minimized boolean attribute (no = and value)
-htmlMinimizedAttribute = try (do
+-- minimized boolean attribute
+htmlMinimizedAttribute = try $ do
many1 space
name <- many1 (choice [letter, oneOf ".-_:"])
spaces
notFollowedBy (char '=')
let content = name
- return (name, content, (" " ++ name)))
+ return (name, content, (" " ++ name))
-htmlRegularAttribute = try (do
+htmlRegularAttribute = try $ do
many1 space
name <- many1 (choice [letter, oneOf ".-_:"])
spaces
@@ -170,10 +158,10 @@ htmlRegularAttribute = try (do
a <- many (alphaNum <|> (oneOf "-._:"))
return (a,"")) ]
return (name, content,
- (" " ++ name ++ "=" ++ quoteStr ++ content ++ quoteStr)))
+ (" " ++ name ++ "=" ++ quoteStr ++ content ++ quoteStr))
-- | Parse an end tag of type 'tag'
-htmlEndTag tag = try (do
+htmlEndTag tag = try $ do
char '<'
spaces
char '/'
@@ -181,87 +169,83 @@ htmlEndTag tag = try (do
stringAnyCase tag
spaces
char '>'
- return ("</" ++ tag ++ ">"))
+ return $ "</" ++ tag ++ ">"
-- | Returns @True@ if the tag is an inline tag.
isInline tag = (extractTagType tag) `elem` inlineHtmlTags
-anyHtmlBlockTag = try (do
- tag <- choice [anyHtmlTag, anyHtmlEndTag]
- if isInline tag then fail "inline tag" else return tag)
+anyHtmlBlockTag = try $ do
+ tag <- anyHtmlTag <|> anyHtmlEndTag
+ if isInline tag then fail "inline tag" else return tag
-anyHtmlInlineTag = try (do
- tag <- choice [ anyHtmlTag, anyHtmlEndTag ]
- if isInline tag then return tag else fail "not an inline tag")
+anyHtmlInlineTag = try $ do
+ tag <- anyHtmlTag <|> anyHtmlEndTag
+ if isInline tag then return tag else fail "not an inline tag"
-- | Parses material between script tags.
-- Scripts must be treated differently, because they can contain '<>' etc.
-htmlScript = try (do
+htmlScript = try $ do
open <- string "<script"
rest <- manyTill anyChar (htmlEndTag "script")
- return (open ++ rest ++ "</script>"))
+ return $ open ++ rest ++ "</script>"
htmlBlockElement = choice [ htmlScript, htmlComment, xmlDec, definition ]
-rawHtmlBlock = try (do
- notFollowedBy' (choice [htmlTag "/body", htmlTag "/html"])
+rawHtmlBlock = try $ do
+ notFollowedBy' (htmlTag "/body" <|> htmlTag "/html")
body <- htmlBlockElement <|> anyHtmlBlockTag
- sp <- (many space)
+ sp <- many space
state <- getState
- if stateParseRaw state then return (RawHtml (body ++ sp)) else return Null)
+ if stateParseRaw state then return (RawHtml (body ++ sp)) else return Null
-- | Parses an HTML comment.
-htmlComment = try (do
+htmlComment = try $ do
string "<!--"
comment <- manyTill anyChar (try (string "-->"))
- return ("<!--" ++ comment ++ "-->"))
+ return $ "<!--" ++ comment ++ "-->"
--
-- parsing documents
--
-xmlDec = try (do
+xmlDec = try $ do
string "<?"
rest <- manyTill anyChar (char '>')
- return ("<?" ++ rest ++ ">"))
+ return $ "<?" ++ rest ++ ">"
-definition = try (do
+definition = try $ do
string "<!"
rest <- manyTill anyChar (char '>')
- return ("<!" ++ rest ++ ">"))
+ return $ "<!" ++ rest ++ ">"
-nonTitleNonHead = try (do
- notFollowedBy' (htmlTag "title")
- notFollowedBy' (htmlTag "/head")
- result <- choice [do {rawHtmlBlock; return ' '}, anyChar]
- return result)
+nonTitleNonHead = try $ notFollowedBy' (htmlTag "title" <|> htmlTag "/head") >>
+ ((rawHtmlBlock >> return ' ') <|> anyChar)
-parseTitle = try (do
- (tag, attribs) <- htmlTag "title"
+parseTitle = try $ do
+ (tag, _) <- htmlTag "title"
contents <- inlinesTilEnd tag
spaces
- return contents)
+ return contents
-- parse header and return meta-information (for now, just title)
-parseHead = try (do
+parseHead = try $ do
htmlTag "head"
spaces
skipMany nonTitleNonHead
contents <- option [] parseTitle
skipMany nonTitleNonHead
htmlTag "/head"
- return (contents, [], ""))
+ return (contents, [], "")
-skipHtmlTag tag = option ("",[]) (htmlTag tag)
+skipHtmlTag tag = optional (htmlTag tag)
-- h1 class="title" representation of title in body
-bodyTitle = try (do
+bodyTitle = try $ do
(tag, attribs) <- htmlTag "h1"
cl <- case (extractAttribute "class" attribs) of
- Just "title" -> do {return ""}
+ Just "title" -> return ""
otherwise -> fail "not title"
inlinesTilEnd "h1"
- return "")
parseHtml = do
sepEndBy (choice [xmlDec, definition, htmlComment]) spaces
@@ -271,27 +255,30 @@ parseHtml = do
spaces
skipHtmlTag "body"
spaces
- option "" bodyTitle -- skip title in body, because it's represented in meta
+ optional bodyTitle -- skip title in body, because it's represented in meta
blocks <- parseBlocks
spaces
- option "" (htmlEndTag "body")
+ optional (htmlEndTag "body")
spaces
- option "" (htmlEndTag "html")
+ optional (htmlEndTag "html")
many anyChar -- ignore anything after </html>
eof
- return (Pandoc (Meta title authors date) blocks)
+ return $ Pandoc (Meta title authors date) blocks
--
-- parsing blocks
--
-parseBlocks = do
- spaces
- result <- sepEndBy block spaces
- return $ filter (/= Null) result
+parseBlocks = spaces >> sepEndBy block spaces >>= (return . filter (/= Null))
-block = choice [ codeBlock, header, hrule, list, blockQuote, para, plain,
- rawHtmlBlock ] <?> "block"
+block = choice [ codeBlock
+ , header
+ , hrule
+ , list
+ , blockQuote
+ , para
+ , plain
+ , rawHtmlBlock ] <?> "block"
--
-- header blocks
@@ -299,53 +286,49 @@ block = choice [ codeBlock, header, hrule, list, blockQuote, para, plain,
header = choice (map headerLevel (enumFromTo 1 5)) <?> "header"
-headerLevel n = try (do
+headerLevel n = try $ do
let level = "h" ++ show n
(tag, attribs) <- htmlTag level
contents <- inlinesTilEnd level
- return (Header n (normalizeSpaces contents)))
+ return $ Header n (normalizeSpaces contents)
--
-- hrule block
--
-hrule = try (do
+hrule = try $ do
(tag, attribs) <- htmlTag "hr"
state <- getState
- if (not (null attribs)) && (stateParseRaw state)
- then -- in this case we want to parse it as raw html
- unexpected "attributes in hr"
- else return HorizontalRule)
+ if not (null attribs) && stateParseRaw state
+ then unexpected "attributes in hr" -- parse as raw in this case
+ else return HorizontalRule
--
-- code blocks
--
-codeBlock = choice [ preCodeBlock, bareCodeBlock ] <?> "code block"
+codeBlock = preCodeBlock <|> bareCodeBlock <?> "code block"
-preCodeBlock = try (do
+preCodeBlock = try $ do
htmlTag "pre"
spaces
- htmlTag "code"
- result <- manyTill anyChar (htmlEndTag "code")
+ result <- bareCodeBlock
spaces
htmlEndTag "pre"
- return (CodeBlock (stripTrailingNewlines (decodeEntities result))))
+ return result
-bareCodeBlock = try (do
+bareCodeBlock = try $ do
htmlTag "code"
result <- manyTill anyChar (htmlEndTag "code")
- return (CodeBlock (stripTrailingNewlines (decodeEntities result))))
+ return $ CodeBlock $ stripTrailingNewlines $
+ decodeCharacterReferences result
--
-- block quotes
--
-blockQuote = try (do
- tag <- htmlTag "blockquote"
- spaces
- blocks <- blocksTilEnd "blockquote"
- return (BlockQuote blocks))
+blockQuote = try $ htmlTag "blockquote" >> spaces >>
+ blocksTilEnd "blockquote" >>= (return . BlockQuote)
--
-- list blocks
@@ -354,119 +337,105 @@ blockQuote = try (do
list = choice [ bulletList, orderedList, definitionList ] <?> "list"
orderedList = try $ do
- (_, attribs) <- htmlTag "ol"
- (start, style) <- option (1, DefaultStyle) $
- do failIfStrict
- let sta = fromMaybe "1" $
- lookup "start" attribs
- let sty = fromMaybe (fromMaybe "" $
- lookup "style" attribs) $
- lookup "class" attribs
- let sty' = case sty of
- "lower-roman" -> LowerRoman
- "upper-roman" -> UpperRoman
- "lower-alpha" -> LowerAlpha
- "upper-alpha" -> UpperAlpha
- "decimal" -> Decimal
- _ -> DefaultStyle
- return (read sta, sty')
- spaces
- items <- sepEndBy1 (blocksIn "li") spaces
- htmlEndTag "ol"
- return (OrderedList (start, style, DefaultDelim) items)
+ (_, attribs) <- htmlTag "ol"
+ (start, style) <- option (1, DefaultStyle) $
+ do failIfStrict
+ let sta = fromMaybe "1" $
+ lookup "start" attribs
+ let sty = fromMaybe (fromMaybe "" $
+ lookup "style" attribs) $
+ lookup "class" attribs
+ let sty' = case sty of
+ "lower-roman" -> LowerRoman
+ "upper-roman" -> UpperRoman
+ "lower-alpha" -> LowerAlpha
+ "upper-alpha" -> UpperAlpha
+ "decimal" -> Decimal
+ _ -> DefaultStyle
+ return (read sta, sty')
+ spaces
+ items <- sepEndBy1 (blocksIn "li") spaces
+ htmlEndTag "ol"
+ return $ OrderedList (start, style, DefaultDelim) items
bulletList = try $ do
- htmlTag "ul"
- spaces
- items <- sepEndBy1 (blocksIn "li") spaces
- htmlEndTag "ul"
- return (BulletList items)
+ htmlTag "ul"
+ spaces
+ items <- sepEndBy1 (blocksIn "li") spaces
+ htmlEndTag "ul"
+ return $ BulletList items
definitionList = try $ do
- failIfStrict -- def lists not part of standard markdown
- tag <- htmlTag "dl"
- spaces
- items <- sepEndBy1 definitionListItem spaces
- htmlEndTag "dl"
- return (DefinitionList items)
+ failIfStrict -- def lists not part of standard markdown
+ tag <- htmlTag "dl"
+ spaces
+ items <- sepEndBy1 definitionListItem spaces
+ htmlEndTag "dl"
+ return $ DefinitionList items
definitionListItem = try $ do
- terms <- sepEndBy1 (inlinesIn "dt") spaces
- defs <- sepEndBy1 (blocksIn "dd") spaces
- let term = joinWithSep [LineBreak] terms
- return (term, concat defs)
+ terms <- sepEndBy1 (inlinesIn "dt") spaces
+ defs <- sepEndBy1 (blocksIn "dd") spaces
+ let term = joinWithSep [LineBreak] terms
+ return (term, concat defs)
--
-- paragraph block
--
-para = try (do
- tag <- htmlTag "p"
- result <- inlinesTilEnd "p"
- return (Para (normalizeSpaces result)))
+para = htmlTag "p" >> inlinesTilEnd "p" >>= return . Para . normalizeSpaces
--
-- plain block
--
-plain = do
- result <- many1 inline
- return (Plain (normalizeSpaces result))
+plain = many1 inline >>= return . Plain . normalizeSpaces
--
-- inline
--
-inline = choice [ text, special ] <?> "inline"
-
-text = choice [ entity, strong, emph, superscript, subscript,
- strikeout, spanStrikeout, code, str,
- linebreak, whitespace ] <?> "text"
-
-special = choice [ link, image, rawHtmlInline ] <?>
- "link, inline html, or image"
-
-entity = do
- ent <- characterEntity
- return $ Str [ent]
-
-code = try (do
+inline = choice [ charRef
+ , strong
+ , emph
+ , superscript
+ , subscript
+ , strikeout
+ , spanStrikeout
+ , code
+ , str
+ , linebreak
+ , whitespace
+ , link
+ , image
+ , rawHtmlInline ] <?> "inline"
+
+code = try $ do
htmlTag "code"
result <- manyTill anyChar (htmlEndTag "code")
-- remove internal line breaks, leading and trailing space,
- -- and decode entities
- let result' = decodeEntities $ removeLeadingTrailingSpace $
- joinWithSep " " $ lines result
- return (Code result'))
+ -- and decode character references
+ return $ Code $ decodeCharacterReferences $ removeLeadingTrailingSpace $
+ joinWithSep " " $ lines result
rawHtmlInline = do
- result <- choice [htmlScript, anyHtmlInlineTag]
+ result <- htmlScript <|> anyHtmlInlineTag
state <- getState
if stateParseRaw state then return (HtmlInline result) else return (Str "")
-betweenTags tag = try (do
- htmlTag tag
- result <- inlinesTilEnd tag
- return (normalizeSpaces result))
+betweenTags tag = try $ htmlTag tag >> inlinesTilEnd tag >>=
+ return . normalizeSpaces
-emph = try (do
- result <- choice [betweenTags "em", betweenTags "it"]
- return (Emph result))
+emph = (betweenTags "em" <|> betweenTags "it") >>= return . Emph
-superscript = try $ do
- failIfStrict -- strict markdown has no superscript, so treat as raw HTML
- result <- betweenTags "sup"
- return (Superscript result)
+strong = (betweenTags "b" <|> betweenTags "strong") >>= return . Strong
-subscript = try $ do
- failIfStrict -- strict markdown has no subscript, so treat as raw HTML
- result <- betweenTags "sub"
- return (Subscript result)
+superscript = failIfStrict >> betweenTags "sup" >>= return . Superscript
-strikeout = try $ do
- failIfStrict -- strict markdown has no strikeout, so treat as raw HTML
- result <- choice [betweenTags "s", betweenTags "strike"]
- return (Strikeout result)
+subscript = failIfStrict >> betweenTags "sub" >>= return . Subscript
+
+strikeout = failIfStrict >> (betweenTags "s" <|> betweenTags "strike") >>=
+ return . Strikeout
spanStrikeout = try $ do
failIfStrict -- strict markdown has no strikeout, so treat as raw HTML
@@ -474,25 +443,14 @@ spanStrikeout = try $ do
result <- case (extractAttribute "class" attributes) of
Just "strikeout" -> inlinesTilEnd "span"
_ -> fail "not a strikeout"
- return (Strikeout result)
+ return $ Strikeout result
-strong = try (do
- result <- choice [betweenTags "b", betweenTags "strong"]
- return (Strong result))
-
-whitespace = do
- many1 space
- return Space
+whitespace = many1 space >> return Space
-- hard line break
-linebreak = do
- htmlTag "br"
- option ' ' newline
- return LineBreak
+linebreak = htmlTag "br" >> optional newline >> return LineBreak
-str = do
- result <- many1 (noneOf "<& \t\n")
- return (Str result)
+str = many1 (noneOf "<& \t\n") >>= return . Str
--
-- links and images
@@ -501,27 +459,27 @@ str = do
-- extract contents of attribute (attribute names are case-insensitive)
extractAttribute name [] = Nothing
extractAttribute name ((attrName, contents):rest) =
- let name' = map toLower name
- attrName' = map toLower attrName in
- if (attrName' == name')
- then Just (decodeEntities contents)
- else extractAttribute name rest
+ let name' = map toLower name
+ attrName' = map toLower attrName
+ in if attrName' == name'
+ then Just (decodeCharacterReferences contents)
+ else extractAttribute name rest
link = try $ do
(tag, attributes) <- htmlTag "a"
url <- case (extractAttribute "href" attributes) of
- Just url -> do {return url}
+ Just url -> return url
Nothing -> fail "no href"
- let title = fromMaybe "" (extractAttribute "title" attributes)
+ let title = fromMaybe "" $ extractAttribute "title" attributes
label <- inlinesTilEnd "a"
return $ Link (normalizeSpaces label) (url, title)
image = try $ do
(tag, attributes) <- htmlTag "img"
url <- case (extractAttribute "src" attributes) of
- Just url -> do {return url}
+ Just url -> return url
Nothing -> fail "no src"
- let title = fromMaybe "" (extractAttribute "title" attributes)
+ let title = fromMaybe "" $ extractAttribute "title" attributes
let alt = fromMaybe "" (extractAttribute "alt" attributes)
return $ Image [Str alt] (url, title)