-------------------------------------------------------------------------------- -- | Provides utilities to manipulate HTML pages module Hakyll.Web.Html ( -- * Generic withTags , withTagList -- * Headers , demoteHeaders -- * Url manipulation , getUrls , withUrls , toUrl , toSiteRoot , isExternal -- * Stripping/escaping , stripTags , escapeHtml ) where -------------------------------------------------------------------------------- import Data.Char (digitToInt, intToDigit, isDigit, toLower) import Data.List (isPrefixOf) import qualified Data.Set as S import System.FilePath.Posix (joinPath, splitPath, takeDirectory) import Text.Blaze.Html (toHtml) import Text.Blaze.Html.Renderer.String (renderHtml) import qualified Text.HTML.TagSoup as TS import Network.URI (isUnreserved, escapeURIString) -------------------------------------------------------------------------------- -- | Map over all tags in the document withTags :: (TS.Tag String -> TS.Tag String) -> String -> String withTags = withTagList . map -- | Map over all tags (as list) in the document withTagList :: ([TS.Tag String] -> [TS.Tag String]) -> String -> String withTagList f = renderTags' . f . parseTags' -------------------------------------------------------------------------------- -- | Map every @h1@ to an @h2@, @h2@ to @h3@, etc. demoteHeaders :: String -> String demoteHeaders = withTags $ \tag -> case tag of TS.TagOpen t a -> TS.TagOpen (demote t) a TS.TagClose t -> TS.TagClose (demote t) t -> t where demote t@['h', n] | isDigit n = ['h', intToDigit (min 6 $ digitToInt n + 1)] | otherwise = t demote t = t -------------------------------------------------------------------------------- isUrlAttribute :: String -> Bool isUrlAttribute = (`elem` ["src", "href", "data", "poster"]) -------------------------------------------------------------------------------- getUrls :: [TS.Tag String] -> [String] getUrls tags = [v | TS.TagOpen _ as <- tags, (k, v) <- as, isUrlAttribute k] -------------------------------------------------------------------------------- -- | Apply a function to each URL on a webpage withUrls :: (String -> String) -> String -> String withUrls f = withTags tag where tag (TS.TagOpen s a) = TS.TagOpen s $ map attr a tag x = x attr (k, v) = (k, if isUrlAttribute k then f v else v) -------------------------------------------------------------------------------- -- | Customized TagSoup renderer. The default TagSoup renderer escape CSS -- within style tags, and doesn't properly minimize. renderTags' :: [TS.Tag String] -> String renderTags' = TS.renderTagsOptions TS.RenderOptions { TS.optRawTag = (`elem` ["script", "style"]) . map toLower , TS.optMinimize = (`S.member` minimize) . map toLower , TS.optEscape = id } where -- A list of elements which must be minimized minimize = S.fromList [ "area", "br", "col", "embed", "hr", "img", "input", "meta", "link" , "param" ] -------------------------------------------------------------------------------- -- | Customized TagSoup parser: do not decode any entities. parseTags' :: String -> [TS.Tag String] parseTags' = TS.parseTagsOptions (TS.parseOptions :: TS.ParseOptions String) { TS.optEntityData = \(str, b) -> [TS.TagText $ "&" ++ str ++ [';' | b]] , TS.optEntityAttrib = \(str, b) -> ("&" ++ str ++ [';' | b], []) } -------------------------------------------------------------------------------- -- | Convert a filepath to an URL starting from the site root -- -- Example: -- -- > toUrl "foo/bar.html" -- -- Result: -- -- > "/foo/bar.html" -- -- This also sanitizes the URL, e.g. converting spaces into '%20' toUrl :: FilePath -> String toUrl url = case url of ('/' : xs) -> '/' : sanitize xs xs -> '/' : sanitize xs where -- Everything but unreserved characters should be escaped as we are -- sanitising the path therefore reserved characters which have a -- meaning in URI does not appear. Special casing for `/`, because it has -- a special meaning in FilePath as well as in URI. sanitize = escapeURIString (\c -> c == '/' || isUnreserved c) -------------------------------------------------------------------------------- -- | Get the relative url to the site root, for a given (absolute) url toSiteRoot :: String -> String toSiteRoot = emptyException . joinPath . map parent . filter relevant . splitPath . takeDirectory where parent = const ".." emptyException [] = "." emptyException x = x relevant "." = False relevant "/" = False relevant "./" = False relevant _ = True -------------------------------------------------------------------------------- -- | Check if an URL links to an external HTTP(S) source isExternal :: String -> Bool isExternal url = any (flip isPrefixOf url) ["http://", "https://", "//"] -------------------------------------------------------------------------------- -- | Strip all HTML tags from a string -- -- Example: -- -- > stripTags "
foo
" -- -- Result: -- -- > "foo" -- -- This also works for incomplete tags -- -- Example: -- -- > stripTags "foo
"foo" stripTags :: String -> String stripTags [] = [] stripTags ('<' : xs) = stripTags $ drop 1 $ dropWhile (/= '>') xs stripTags (x : xs) = x : stripTags xs -------------------------------------------------------------------------------- -- | HTML-escape a string -- -- Example: -- -- > escapeHtml "Me & Dean" -- -- Result: -- -- > "Me & Dean" escapeHtml :: String -> String escapeHtml = renderHtml . toHtml