{- |
Module : Text.Pandoc.Readers.HTML
Copyright : Copyright (C) 2006 John MacFarlane
License : GNU GPL, version 2 or above
Maintainer : John MacFarlane
Stability : unstable
Portability : portable
Conversion of HTML to 'Pandoc' document.
-}
module Text.Pandoc.Readers.HTML (
readHtml,
rawHtmlInline,
rawHtmlBlock,
anyHtmlBlockTag,
anyHtmlInlineTag
) where
import Text.Regex ( matchRegex, mkRegex )
import Text.ParserCombinators.Parsec
import Text.ParserCombinators.Pandoc
import Text.Pandoc.Definition
import Text.Pandoc.Shared
import Text.Pandoc.HtmlEntities ( decodeEntities, htmlEntityToChar )
import Maybe ( fromMaybe )
import Char ( toUpper, toLower )
-- | Convert HTML-formatted string to 'Pandoc' document.
readHtml :: ParserState -- ^ Parser state
-> String -- ^ String to parse
-> Pandoc
readHtml = readWith parseHtml
-- for testing
testString :: String -> IO ()
testString = testStringWith parseHtml
--
-- Constants
--
inlineHtmlTags = ["a", "abbr", "acronym", "b", "basefont", "bdo", "big",
"br", "cite", "code", "dfn", "em", "font", "i", "img",
"input", "kbd", "label", "q", "s", "samp", "select",
"small", "span", "strike", "strong", "sub", "sup",
"textarea", "tt", "u", "var"]
--
-- HTML utility functions
--
-- | Read blocks until end tag.
blocksTilEnd tag = try (do
blocks <- manyTill (do {b <- block; spaces; return b}) (htmlEndTag tag)
return blocks)
-- | Read inlines until end tag.
inlinesTilEnd tag = try (do
inlines <- manyTill inline (htmlEndTag tag)
return inlines)
-- extract type from a tag: e.g. br from
, < br >, , etc.
extractTagType tag =
case (matchRegex (mkRegex "<[[:space:]]*/?([A-Za-z0-9]+)") tag) of
Just [match] -> (map toLower match)
Nothing -> ""
anyHtmlTag = try (do
char '<'
spaces
tag <- many1 alphaNum
attribs <- htmlAttributes
spaces
ender <- option "" (string "/")
let ender' = if (null ender) then "" else " /"
spaces
char '>'
return ("<" ++ tag ++ attribs ++ ender' ++ ">"))
anyHtmlEndTag = try (do
char '<'
spaces
char '/'
spaces
tagType <- many1 alphaNum
spaces
char '>'
return ("" ++ tagType ++ ">"))
htmlTag :: String -> GenParser Char st (String, [(String, String)])
htmlTag tag = try (do
char '<'
spaces
stringAnyCase tag
attribs <- many htmlAttribute
spaces
option "" (string "/")
spaces
char '>'
return (tag, (map (\(name, content, raw) -> (name, content)) attribs)))
-- parses a quoted html attribute value
quoted quoteChar = do
result <- between (char quoteChar) (char quoteChar)
(many (noneOf [quoteChar]))
return (result, [quoteChar])
htmlAttributes = do
attrList <- many htmlAttribute
return (concatMap (\(name, content, raw) -> raw) attrList)
htmlAttribute = htmlRegularAttribute <|> htmlMinimizedAttribute
-- minimized boolean attribute (no = and value)
htmlMinimizedAttribute = try (do
spaces
name <- many1 (choice [letter, oneOf ".-_:"])
spaces
notFollowedBy (char '=')
let content = name
return (name, content, (" " ++ name)))
htmlRegularAttribute = try (do
spaces
name <- many1 (choice [letter, oneOf ".-_:"])
spaces
char '='
spaces
(content, quoteStr) <- choice [ (quoted '\''),
(quoted '"'),
(do
a <- many (alphaNum <|> (oneOf "-._:"))
return (a,"")) ]
return (name, content,
(" " ++ name ++ "=" ++ quoteStr ++ content ++ quoteStr)))
htmlEndTag tag = try (do
char '<'
spaces
char '/'
spaces
stringAnyCase tag
spaces
char '>'
return ("" ++ tag ++ ">"))
-- | Returns @True@ if the tag is an inline tag.
isInline tag = (extractTagType tag) `elem` inlineHtmlTags
anyHtmlBlockTag = try (do
tag <- choice [anyHtmlTag, anyHtmlEndTag]
if isInline tag then fail "inline tag" else return tag)
anyHtmlInlineTag = try (do
tag <- choice [ anyHtmlTag, anyHtmlEndTag ]
if isInline tag then return tag else fail "not an inline tag")
-- scripts must be treated differently, because they can contain <> etc.
htmlScript = try (do
open <- string "