diff options
Diffstat (limited to 'src/Text/Pandoc/Readers/HTML.hs')
-rw-r--r-- | src/Text/Pandoc/Readers/HTML.hs | 434 |
1 files changed, 434 insertions, 0 deletions
diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs new file mode 100644 index 000000000..054d9eb72 --- /dev/null +++ b/src/Text/Pandoc/Readers/HTML.hs @@ -0,0 +1,434 @@ +-- | Converts HTML to 'Pandoc' document. +module Text.Pandoc.Readers.HTML ( + readHtml, + rawHtmlInline, + rawHtmlBlock, + anyHtmlBlockTag, + anyHtmlInlineTag + ) where + +import Text.Regex ( matchRegex, mkRegex ) +import Text.ParserCombinators.Parsec +import Text.ParserCombinators.Pandoc +import Text.Pandoc.Definition +import Text.Pandoc.Shared +import Text.Pandoc.HtmlEntities ( decodeEntities, htmlEntityToChar ) +import Maybe ( fromMaybe ) +import Char ( toUpper, toLower ) + +-- | Convert HTML-formatted string to 'Pandoc' document. +readHtml :: ParserState -- ^ Parser state + -> String -- ^ String to parse + -> Pandoc +readHtml = readWith parseHtml + +-- for testing +testString :: String -> IO () +testString = testStringWith parseHtml + +-- +-- Constants +-- + +inlineHtmlTags = ["a", "abbr", "acronym", "b", "basefont", "bdo", "big", "br", "cite", + "code", "dfn", "em", "font", "i", "img", "input", "kbd", "label", "q", + "s", "samp", "select", "small", "span", "strike", "strong", "sub", + "sup", "textarea", "tt", "u", "var"] + +-- +-- HTML utility functions +-- + +-- | Read blocks until end tag. +blocksTilEnd tag = try (do + blocks <- manyTill (do {b <- block; spaces; return b}) (htmlEndTag tag) + return blocks) + +-- | Read inlines until end tag. +inlinesTilEnd tag = try (do + inlines <- manyTill inline (htmlEndTag tag) + return inlines) + +-- extract type from a tag: e.g. br from <br>, < br >, </br>, etc. +extractTagType tag = case (matchRegex (mkRegex "<[[:space:]]*/?([A-Za-z0-9]+)") tag) of + Just [match] -> (map toLower match) + Nothing -> "" + +anyHtmlTag = try (do + char '<' + spaces + tag <- many1 alphaNum + attribs <- htmlAttributes + spaces + ender <- option "" (string "/") + let ender' = if (null ender) then "" else " /" + spaces + char '>' + return ("<" ++ tag ++ attribs ++ ender' ++ ">")) + +anyHtmlEndTag = try (do + char '<' + spaces + char '/' + spaces + tagType <- many1 alphaNum + spaces + char '>' + return ("</" ++ tagType ++ ">")) + +htmlTag :: String -> GenParser Char st (String, [(String, String)]) +htmlTag tag = try (do + char '<' + spaces + stringAnyCase tag + attribs <- many htmlAttribute + spaces + option "" (string "/") + spaces + char '>' + return (tag, (map (\(name, content, raw) -> (name, content)) attribs))) + +-- parses a quoted html attribute value +quoted quoteChar = do + result <- between (char quoteChar) (char quoteChar) (many (noneOf [quoteChar])) + return (result, [quoteChar]) + +htmlAttributes = do + attrList <- many htmlAttribute + return (concatMap (\(name, content, raw) -> raw) attrList) + +htmlAttribute = htmlRegularAttribute <|> htmlMinimizedAttribute + +-- minimized boolean attribute (no = and value) +htmlMinimizedAttribute = try (do + spaces + name <- many1 (choice [letter, oneOf ".-_:"]) + spaces + notFollowedBy (char '=') + let content = name + return (name, content, (" " ++ name))) + +htmlRegularAttribute = try (do + spaces + name <- many1 (choice [letter, oneOf ".-_:"]) + spaces + char '=' + spaces + (content, quoteStr) <- choice [ (quoted '\''), + (quoted '"'), + (do{ a <- (many (alphaNum <|> (oneOf "-._:"))); + return (a,"")} ) ] + return (name, content, (" " ++ name ++ "=" ++ quoteStr ++ content ++ quoteStr))) + +htmlEndTag tag = try (do + char '<' + spaces + char '/' + spaces + stringAnyCase tag + spaces + char '>' + return ("</" ++ tag ++ ">")) + +-- | Returns @True@ if the tag is an inline tag. +isInline tag = (extractTagType tag) `elem` inlineHtmlTags + +anyHtmlBlockTag = try (do + tag <- choice [anyHtmlTag, anyHtmlEndTag] + if isInline tag then + fail "inline tag" + else + return tag) + +anyHtmlInlineTag = try (do + tag <- choice [ anyHtmlTag, anyHtmlEndTag ] + if isInline tag then + return tag + else + fail "not an inline tag") + +-- scripts must be treated differently, because they can contain <> etc. +htmlScript = try (do + open <- string "<script" + rest <- manyTill anyChar (htmlEndTag "script") + return (open ++ rest ++ "</script>")) + +rawHtmlBlock = do + notFollowedBy (do {choice [htmlTag "/body", htmlTag "/html"]; return ' '}) + body <- choice [htmlScript, anyHtmlBlockTag, htmlComment, xmlDec, definition] + sp <- (many space) + state <- getState + if stateParseRaw state then + return (RawHtml (body ++ sp)) + else + return Null + +htmlComment = try (do + string "<!--" + comment <- manyTill anyChar (try (string "-->")) + return ("<!--" ++ comment ++ "-->")) + +-- +-- parsing documents +-- + +xmlDec = try (do + string "<?" + rest <- manyTill anyChar (char '>') + return ("<?" ++ rest ++ ">")) + +definition = try (do + string "<!" + rest <- manyTill anyChar (char '>') + return ("<!" ++ rest ++ ">")) + +nonTitleNonHead = try (do + notFollowedBy' (htmlTag "title") + notFollowedBy' (htmlTag "/head") + result <- choice [do {rawHtmlBlock; return ' '}, anyChar] + return result) + +parseTitle = try (do + (tag, attribs) <- htmlTag "title" + contents <- inlinesTilEnd tag + spaces + return contents) + +-- parse header and return meta-information (for now, just title) +parseHead = try (do + htmlTag "head" + spaces + skipMany nonTitleNonHead + contents <- option [] parseTitle + skipMany nonTitleNonHead + htmlTag "/head" + return (contents, [], "")) + +skipHtmlTag tag = option ("",[]) (htmlTag tag) + +-- h1 class="title" representation of title in body +bodyTitle = try (do + (tag, attribs) <- htmlTag "h1" + cl <- case (extractAttribute "class" attribs) of + Just "title" -> do {return ""} + otherwise -> fail "not title" + inlinesTilEnd "h1" + return "") + +parseHtml = do + sepEndBy (choice [xmlDec, definition, htmlComment]) spaces + skipHtmlTag "html" + spaces + (title, authors, date) <- option ([], [], "") parseHead + spaces + skipHtmlTag "body" + spaces + option "" bodyTitle -- skip title in body, because it's represented in meta + blocks <- parseBlocks + spaces + option "" (htmlEndTag "body") + spaces + option "" (htmlEndTag "html") + many anyChar -- ignore anything after </html> + eof + state <- getState + let keyBlocks = stateKeyBlocks state + return (Pandoc (Meta title authors date) (blocks ++ (reverse keyBlocks))) + +-- +-- parsing blocks +-- + +parseBlocks = do + spaces + result <- sepEndBy block spaces + return result + +block = choice [ codeBlock, header, hrule, list, blockQuote, para, plain, + rawHtmlBlock ] <?> "block" + +-- +-- header blocks +-- + +header = choice (map headerLevel (enumFromTo 1 5)) <?> "header" + +headerLevel n = try (do + let level = "h" ++ show n + (tag, attribs) <- htmlTag level + contents <- inlinesTilEnd level + return (Header n (normalizeSpaces contents))) + +-- +-- hrule block +-- + +hrule = try (do + (tag, attribs) <- htmlTag "hr" + state <- getState + if (not (null attribs)) && (stateParseRaw state) then + unexpected "attributes in hr" -- in this case we want to parse it as raw html + else + return HorizontalRule) + +-- +-- code blocks +-- + +codeBlock = choice [ preCodeBlock, bareCodeBlock ] <?> "code block" + +preCodeBlock = try (do + htmlTag "pre" + spaces + htmlTag "code" + result <- manyTill anyChar (htmlEndTag "code") + spaces + htmlEndTag "pre" + return (CodeBlock (decodeEntities result))) + +bareCodeBlock = try (do + htmlTag "code" + result <- manyTill anyChar (htmlEndTag "code") + return (CodeBlock (decodeEntities result))) + +-- +-- block quotes +-- + +blockQuote = try (do + tag <- htmlTag "blockquote" + spaces + blocks <- blocksTilEnd "blockquote" + return (BlockQuote blocks)) + +-- +-- list blocks +-- + +list = choice [ bulletList, orderedList ] <?> "list" + +orderedList = try (do + tag <- htmlTag "ol" + spaces + items <- sepEndBy1 listItem spaces + htmlEndTag "ol" + return (OrderedList items)) + +bulletList = try (do + tag <- htmlTag "ul" + spaces + items <- sepEndBy1 listItem spaces + htmlEndTag "ul" + return (BulletList items)) + +listItem = try (do + tag <- htmlTag "li" + spaces + blocks <- blocksTilEnd "li" + return blocks) + +-- +-- paragraph block +-- + +para = try (do + tag <- htmlTag "p" + result <- inlinesTilEnd "p" + return (Para (normalizeSpaces result))) + +-- +-- plain block +-- + +plain = do + result <- many1 inline + return (Plain (normalizeSpaces result)) + +-- +-- inline +-- + +inline = choice [ text, special ] <?> "inline" + +text = choice [ entity, strong, emph, code, str, linebreak, whitespace ] <?> "text" + +special = choice [ link, image, rawHtmlInline ] <?> "link, inline html, or image" + +entity = try (do + char '&' + body <- choice [(many1 letter), + (try (do{ char '#'; num <- many1 digit; return ("#" ++ num)}))] + char ';' + return (Str [fromMaybe '?' (htmlEntityToChar ("&" ++ body ++ ";"))])) + +code = try (do + htmlTag "code" + result <- manyTill anyChar (htmlEndTag "code") + -- remove internal line breaks, leading and trailing space, and decode entities + let result' = decodeEntities $ removeLeadingTrailingSpace $ joinWithSep " " $ lines result + return (Code result')) + +rawHtmlInline = do + result <- choice [htmlScript, anyHtmlInlineTag] + state <- getState + if stateParseRaw state then + return (HtmlInline result) + else + return (Str "") + +betweenTags tag = try (do + htmlTag tag + result <- inlinesTilEnd tag + return (normalizeSpaces result)) + +emph = try (do + result <- choice [betweenTags "em", betweenTags "it"] + return (Emph result)) + +strong = try (do + result <- choice [betweenTags "b", betweenTags "strong"] + return (Strong result)) + +whitespace = do + many1 space + return Space + +-- hard line break +linebreak = do + htmlTag "br" + return LineBreak + +str = do + result <- many1 (noneOf "<& \t\n") + return (Str (decodeEntities result)) + +-- +-- links and images +-- + +-- extract contents of attribute (attribute names are case-insensitive) +extractAttribute name [] = Nothing +extractAttribute name ((attrName, contents):rest) = + let name' = map toLower name + attrName' = map toLower attrName in + if (attrName' == name') then Just contents else extractAttribute name rest + +link = try (do + (tag, attributes) <- htmlTag "a" + url <- case (extractAttribute "href" attributes) of + Just url -> do {return url} + Nothing -> fail "no href" + let title = fromMaybe "" (extractAttribute "title" attributes) + label <- inlinesTilEnd "a" + ref <- generateReference url title + return (Link (normalizeSpaces label) ref)) + +image = try (do + (tag, attributes) <- htmlTag "img" + url <- case (extractAttribute "src" attributes) of + Just url -> do {return url} + Nothing -> fail "no src" + let title = fromMaybe "" (extractAttribute "title" attributes) + let alt = fromMaybe "" (extractAttribute "alt" attributes) + ref <- generateReference url title + return (Image [Str alt] ref)) |