From 4ea1b2bdc0a86f135bae4ae95cfc3d45a9416604 Mon Sep 17 00:00:00 2001 From: fiddlosopher Date: Sat, 30 Dec 2006 22:51:49 +0000 Subject: Merged 'strict' branch from r324. This adds a '--strict' option to pandoc, which forces it to stay as close as possible to official Markdown syntax. git-svn-id: https://pandoc.googlecode.com/svn/trunk@347 788f1e2b-df1e-0410-8736-df70ead52e1b --- src/Text/Pandoc/Readers/HTML.hs | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) (limited to 'src/Text/Pandoc/Readers/HTML.hs') diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs index 2bf75654c..9beaaacff 100644 --- a/src/Text/Pandoc/Readers/HTML.hs +++ b/src/Text/Pandoc/Readers/HTML.hs @@ -32,7 +32,12 @@ module Text.Pandoc.Readers.HTML ( rawHtmlInline, rawHtmlBlock, anyHtmlBlockTag, - anyHtmlInlineTag + anyHtmlInlineTag, + anyHtmlTag, + anyHtmlEndTag, + htmlEndTag, + extractTagType, + htmlBlockElement ) where import Text.Regex ( matchRegex, mkRegex ) @@ -78,17 +83,18 @@ inlinesTilEnd tag = try (do inlines <- manyTill inline (htmlEndTag tag) return inlines) --- extract type from a tag: e.g. br from
<br>, < br >, </br>, etc. +-- | Extract type from a tag: e.g. 'br' from '<br>
' extractTagType tag = case (matchRegex (mkRegex "<[[:space:]]*/?([A-Za-z0-9]+)") tag) of Just [match] -> (map toLower match) Nothing -> "" +-- | Parse any HTML tag (closing or opening) and return text of tag anyHtmlTag = try (do char '<' spaces tag <- many1 alphaNum - attribs <- htmlAttributes + attribs <- htmlAttributes spaces ender <- option "" (string "/") let ender' = if (null ender) then "" else " /" @@ -150,9 +156,10 @@ htmlRegularAttribute = try (do (do a <- many (alphaNum <|> (oneOf "-._:")) return (a,"")) ] - return (name, content, + return (name, content, (" " ++ name ++ "=" ++ quoteStr ++ content ++ quoteStr))) +-- | Parse an end tag of type 'tag' htmlEndTag tag = try (do char '<' spaces @@ -174,20 +181,23 @@ anyHtmlInlineTag = try (do tag <- choice [ anyHtmlTag, anyHtmlEndTag ] if isInline tag then return tag else fail "not an inline tag") --- scripts must be treated differently, because they can contain <> etc. +-- | Parses material between script tags. +-- Scripts must be treated differently, because they can contain '<>' etc. htmlScript = try (do open <- string "")) +htmlBlockElement = choice [ htmlScript, htmlComment, xmlDec, definition ] + rawHtmlBlock = try (do notFollowedBy' (choice [htmlTag "/body", htmlTag "/html"]) - body <- choice [htmlScript, anyHtmlBlockTag, htmlComment, xmlDec, - definition] + body <- htmlBlockElement <|> anyHtmlBlockTag sp <- (many space) state <- getState if stateParseRaw state then return (RawHtml (body ++ sp)) else return Null) +-- | Parses an HTML comment. htmlComment = try (do string "")) -- cgit v1.2.3