diff options
Diffstat (limited to 'src/Text/Pandoc/SelfContained.hs')
-rw-r--r-- | src/Text/Pandoc/SelfContained.hs | 181 |
1 files changed, 0 insertions, 181 deletions
diff --git a/src/Text/Pandoc/SelfContained.hs b/src/Text/Pandoc/SelfContained.hs deleted file mode 100644 index 85b298a85..000000000 --- a/src/Text/Pandoc/SelfContained.hs +++ /dev/null @@ -1,181 +0,0 @@ -{-# LANGUAGE OverloadedStrings #-} -{- -Copyright (C) 2011-2016 John MacFarlane <jgm@berkeley.edu> - -This program is free software; you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation; either version 2 of the License, or -(at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA --} - -{- | - Module : Text.Pandoc.SelfContained - Copyright : Copyright (C) 2011-2016 John MacFarlane - License : GNU GPL, version 2 or above - - Maintainer : John MacFarlane <jgm@berkeley.edu> - Stability : alpha - Portability : portable - -Functions for converting an HTML file into one that can be viewed -offline, by incorporating linked images, CSS, and scripts into -the HTML using data URIs. --} -module Text.Pandoc.SelfContained ( makeSelfContained ) where -import Text.HTML.TagSoup -import Network.URI (isURI, escapeURIString, URI(..), parseURI) -import Data.ByteString.Base64 -import qualified Data.ByteString.Char8 as B -import Data.ByteString (ByteString) -import System.FilePath (takeExtension, takeDirectory, (</>)) -import Data.Char (toLower, isAscii, isAlphaNum) -import Codec.Compression.GZip as Gzip -import qualified Data.ByteString.Lazy as L -import Control.Monad.Trans (MonadIO(..)) -import Text.Pandoc.Shared (renderTags', err, warn, trim) -import Text.Pandoc.MediaBag (MediaBag) -import Text.Pandoc.MIME (MimeType) -import Text.Pandoc.UTF8 (toString) -import Text.Pandoc.Options (WriterOptions(..)) -import Data.List (isPrefixOf) -import Control.Applicative ((<|>)) -import Text.Parsec (runParserT, ParsecT) -import qualified Text.Parsec as P -import Control.Monad.Trans (lift) -import Text.Pandoc.Class (fetchItem, runIO, setMediaBag) - -isOk :: Char -> Bool -isOk c = isAscii c && isAlphaNum c - -makeDataURI :: String -> ByteString -> String -makeDataURI mime raw = - if textual - then "data:" ++ mime' ++ "," ++ escapeURIString isOk (toString raw) - else "data:" ++ mime' ++ ";base64," ++ toString (encode raw) - where textual = "text/" `Data.List.isPrefixOf` mime - mime' = if textual && ';' `notElem` mime - then mime ++ ";charset=utf-8" - else mime -- mime type already has charset - -convertTag :: MediaBag -> Maybe String -> Tag String -> IO (Tag String) -convertTag media sourceURL t@(TagOpen tagname as) - | tagname `elem` - ["img", "embed", "video", "input", "audio", "source", "track"] = do - as' <- mapM processAttribute as - return $ TagOpen tagname as' - where processAttribute (x,y) = - if x == "src" || x == "href" || x == "poster" - then do - enc <- getDataURI media sourceURL (fromAttrib "type" t) y - return (x, enc) - else return (x,y) -convertTag media sourceURL t@(TagOpen "script" as) = - case fromAttrib "src" t of - [] -> return t - src -> do - enc <- getDataURI media sourceURL (fromAttrib "type" t) src - return $ TagOpen "script" (("src",enc) : [(x,y) | (x,y) <- as, x /= "src"]) -convertTag media sourceURL t@(TagOpen "link" as) = - case fromAttrib "href" t of - [] -> return t - src -> do - enc <- getDataURI media sourceURL (fromAttrib "type" t) src - return $ TagOpen "link" (("href",enc) : [(x,y) | (x,y) <- as, x /= "href"]) -convertTag _ _ t = return t - -cssURLs :: MediaBag -> Maybe String -> FilePath -> ByteString - -> IO ByteString -cssURLs media sourceURL d orig = do - res <- runParserT (parseCSSUrls media sourceURL d) () "css" orig - case res of - Left e -> warn ("Could not parse CSS: " ++ show e) >> return orig - Right bs -> return bs - -parseCSSUrls :: MediaBag -> Maybe String -> FilePath - -> ParsecT ByteString () IO ByteString -parseCSSUrls media sourceURL d = B.concat <$> P.many - (pCSSWhite <|> pCSSComment <|> pCSSUrl media sourceURL d <|> pCSSOther) - --- Note: some whitespace in CSS is significant, so we can't collapse it! -pCSSWhite :: ParsecT ByteString () IO ByteString -pCSSWhite = B.singleton <$> P.space <* P.spaces - -pCSSComment :: ParsecT ByteString () IO ByteString -pCSSComment = P.try $ do - P.string "/*" - P.manyTill P.anyChar (P.try (P.string "*/")) - return B.empty - -pCSSOther :: ParsecT ByteString () IO ByteString -pCSSOther = do - (B.pack <$> P.many1 (P.noneOf "u/ \n\r\t")) <|> - (B.singleton <$> P.char 'u') <|> - (B.singleton <$> P.char '/') - -pCSSUrl :: MediaBag -> Maybe String -> FilePath - -> ParsecT ByteString () IO ByteString -pCSSUrl media sourceURL d = P.try $ do - P.string "url(" - P.spaces - quote <- P.option Nothing (Just <$> P.oneOf "\"'") - url <- P.manyTill P.anyChar (maybe (P.lookAhead (P.char ')')) P.char quote) - P.spaces - P.char ')' - let fallback = B.pack ("url(" ++ maybe "" (:[]) quote ++ trim url ++ - maybe "" (:[]) quote ++ ")") - case trim url of - '#':_ -> return fallback - 'd':'a':'t':'a':':':_ -> return fallback - u -> do let url' = if isURI u then u else d </> u - enc <- lift $ getDataURI media sourceURL "" url' - return (B.pack $ "url(" ++ enc ++ ")") - - -getDataURI :: MediaBag -> Maybe String -> MimeType -> String - -> IO String -getDataURI _ _ _ src@('d':'a':'t':'a':':':_) = return src -- already data: uri -getDataURI media sourceURL mimetype src = do - let ext = map toLower $ takeExtension src - fetchResult <- runIO $ do setMediaBag media - fetchItem sourceURL src - (raw, respMime) <- case fetchResult of - Left msg -> err 67 $ "Could not fetch " ++ src ++ - "\n" ++ show msg - Right x -> return x - let raw' = if ext == ".gz" - then B.concat $ L.toChunks $ Gzip.decompress $ L.fromChunks - $ [raw] - else raw - let mime = case (mimetype, respMime) of - ("",Nothing) -> error - $ "Could not determine mime type for `" ++ src ++ "'" - (x, Nothing) -> x - (_, Just x ) -> x - let cssSourceURL = case parseURI src of - Just u - | uriScheme u `elem` ["http:","https:"] -> - Just $ show u{ uriPath = "", - uriQuery = "", - uriFragment = "" } - _ -> Nothing - result <- if mime == "text/css" - then cssURLs media cssSourceURL (takeDirectory src) raw' - else return raw' - return $ makeDataURI mime result - --- | Convert HTML into self-contained HTML, incorporating images, --- scripts, and CSS using data: URIs. -makeSelfContained :: MonadIO m => WriterOptions -> MediaBag -> String -> m String -makeSelfContained opts mediabag inp = liftIO $ do - let tags = parseTags inp - out' <- mapM (convertTag mediabag (writerSourceURL opts)) tags - return $ renderTags' out' |