aboutsummaryrefslogtreecommitdiff
path: root/src/Text/Pandoc/Readers/HTML.hs
diff options
context:
space:
mode:
authorfiddlosopher <fiddlosopher@788f1e2b-df1e-0410-8736-df70ead52e1b>2006-12-20 06:50:14 +0000
committerfiddlosopher <fiddlosopher@788f1e2b-df1e-0410-8736-df70ead52e1b>2006-12-20 06:50:14 +0000
commitdc9c6450f3b16592d0ee865feafc17b670e4ad14 (patch)
treedc29955e1ea518d6652af3d12876863b19819f6d /src/Text/Pandoc/Readers/HTML.hs
parent42d29838960f9aed3a08a4d76fc7e9c3941680a8 (diff)
downloadpandoc-dc9c6450f3b16592d0ee865feafc17b670e4ad14.tar.gz
+ Added module data for haddock.
+ Reformatted code consistently. git-svn-id: https://pandoc.googlecode.com/svn/trunk@252 788f1e2b-df1e-0410-8736-df70ead52e1b
Diffstat (limited to 'src/Text/Pandoc/Readers/HTML.hs')
-rw-r--r--src/Text/Pandoc/Readers/HTML.hs85
1 files changed, 47 insertions, 38 deletions
diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs
index f9a738e94..c157f3b0e 100644
--- a/src/Text/Pandoc/Readers/HTML.hs
+++ b/src/Text/Pandoc/Readers/HTML.hs
@@ -1,4 +1,14 @@
--- | Converts HTML to 'Pandoc' document.
+{- |
+ Module : Text.Pandoc.Readers.HTML
+ Copyright : Copyright (C) 2006 John MacFarlane
+ License : GNU GPL, version 2 or above
+
+ Maintainer : John MacFarlane <jgm at berkeley dot edu>
+ Stability : unstable
+ Portability : portable
+
+Conversion of HTML to 'Pandoc' document.
+-}
module Text.Pandoc.Readers.HTML (
readHtml,
rawHtmlInline,
@@ -30,10 +40,11 @@ testString = testStringWith parseHtml
-- Constants
--
-inlineHtmlTags = ["a", "abbr", "acronym", "b", "basefont", "bdo", "big", "br", "cite",
- "code", "dfn", "em", "font", "i", "img", "input", "kbd", "label", "q",
- "s", "samp", "select", "small", "span", "strike", "strong", "sub",
- "sup", "textarea", "tt", "u", "var"]
+inlineHtmlTags = ["a", "abbr", "acronym", "b", "basefont", "bdo", "big",
+ "br", "cite", "code", "dfn", "em", "font", "i", "img",
+ "input", "kbd", "label", "q", "s", "samp", "select",
+ "small", "span", "strike", "strong", "sub", "sup",
+ "textarea", "tt", "u", "var"]
--
-- HTML utility functions
@@ -50,9 +61,10 @@ inlinesTilEnd tag = try (do
return inlines)
-- extract type from a tag: e.g. br from <br>, < br >, </br>, etc.
-extractTagType tag = case (matchRegex (mkRegex "<[[:space:]]*/?([A-Za-z0-9]+)") tag) of
- Just [match] -> (map toLower match)
- Nothing -> ""
+extractTagType tag =
+ case (matchRegex (mkRegex "<[[:space:]]*/?([A-Za-z0-9]+)") tag) of
+ Just [match] -> (map toLower match)
+ Nothing -> ""
anyHtmlTag = try (do
char '<'
@@ -90,7 +102,8 @@ htmlTag tag = try (do
-- parses a quoted html attribute value
quoted quoteChar = do
- result <- between (char quoteChar) (char quoteChar) (many (noneOf [quoteChar]))
+ result <- between (char quoteChar) (char quoteChar)
+ (many (noneOf [quoteChar]))
return (result, [quoteChar])
htmlAttributes = do
@@ -116,9 +129,11 @@ htmlRegularAttribute = try (do
spaces
(content, quoteStr) <- choice [ (quoted '\''),
(quoted '"'),
- (do{ a <- (many (alphaNum <|> (oneOf "-._:")));
- return (a,"")} ) ]
- return (name, content, (" " ++ name ++ "=" ++ quoteStr ++ content ++ quoteStr)))
+ (do
+ a <- many (alphaNum <|> (oneOf "-._:"))
+ return (a,"")) ]
+ return (name, content,
+ (" " ++ name ++ "=" ++ quoteStr ++ content ++ quoteStr)))
htmlEndTag tag = try (do
char '<'
@@ -135,17 +150,11 @@ isInline tag = (extractTagType tag) `elem` inlineHtmlTags
anyHtmlBlockTag = try (do
tag <- choice [anyHtmlTag, anyHtmlEndTag]
- if isInline tag then
- fail "inline tag"
- else
- return tag)
+ if isInline tag then fail "inline tag" else return tag)
anyHtmlInlineTag = try (do
tag <- choice [ anyHtmlTag, anyHtmlEndTag ]
- if isInline tag then
- return tag
- else
- fail "not an inline tag")
+ if isInline tag then return tag else fail "not an inline tag")
-- scripts must be treated differently, because they can contain <> etc.
htmlScript = try (do
@@ -155,13 +164,11 @@ htmlScript = try (do
rawHtmlBlock = try (do
notFollowedBy' (choice [htmlTag "/body", htmlTag "/html"])
- body <- choice [htmlScript, anyHtmlBlockTag, htmlComment, xmlDec, definition]
+ body <- choice [htmlScript, anyHtmlBlockTag, htmlComment, xmlDec,
+ definition]
sp <- (many space)
state <- getState
- if stateParseRaw state then
- return (RawHtml (body ++ sp))
- else
- return Null)
+ if stateParseRaw state then return (RawHtml (body ++ sp)) else return Null)
htmlComment = try (do
string "<!--"
@@ -266,10 +273,10 @@ headerLevel n = try (do
hrule = try (do
(tag, attribs) <- htmlTag "hr"
state <- getState
- if (not (null attribs)) && (stateParseRaw state) then
- unexpected "attributes in hr" -- in this case we want to parse it as raw html
- else
- return HorizontalRule)
+ if (not (null attribs)) && (stateParseRaw state)
+ then -- in this case we want to parse it as raw html
+ unexpected "attributes in hr"
+ else return HorizontalRule)
--
-- code blocks
@@ -352,29 +359,31 @@ inline = choice [ text, special ] <?> "inline"
text = choice [ entity, strong, emph, code, str, linebreak, whitespace ] <?> "text"
-special = choice [ link, image, rawHtmlInline ] <?> "link, inline html, or image"
+special = choice [ link, image, rawHtmlInline ] <?>
+ "link, inline html, or image"
entity = try (do
char '&'
- body <- choice [(many1 letter),
- (try (do{ char '#'; num <- many1 digit; return ("#" ++ num)}))]
+ body <- choice [(many1 letter), (try (do
+ char '#'
+ num <- many1 digit
+ return ("#" ++ num)))]
char ';'
return (Str [fromMaybe '?' (htmlEntityToChar ("&" ++ body ++ ";"))]))
code = try (do
htmlTag "code"
result <- manyTill anyChar (htmlEndTag "code")
- -- remove internal line breaks, leading and trailing space, and decode entities
- let result' = decodeEntities $ removeLeadingTrailingSpace $ joinWithSep " " $ lines result
+ -- remove internal line breaks, leading and trailing space,
+ -- and decode entities
+ let result' = decodeEntities $ removeLeadingTrailingSpace $
+ joinWithSep " " $ lines result
return (Code result'))
rawHtmlInline = do
result <- choice [htmlScript, anyHtmlInlineTag]
state <- getState
- if stateParseRaw state then
- return (HtmlInline result)
- else
- return (Str "")
+ if stateParseRaw state then return (HtmlInline result) else return (Str "")
betweenTags tag = try (do
htmlTag tag