aboutsummaryrefslogtreecommitdiff
path: root/src/Text/Pandoc/SelfContained.hs
diff options
context:
space:
mode:
Diffstat (limited to 'src/Text/Pandoc/SelfContained.hs')
-rw-r--r--src/Text/Pandoc/SelfContained.hs279
1 files changed, 279 insertions, 0 deletions
diff --git a/src/Text/Pandoc/SelfContained.hs b/src/Text/Pandoc/SelfContained.hs
new file mode 100644
index 000000000..116083fba
--- /dev/null
+++ b/src/Text/Pandoc/SelfContained.hs
@@ -0,0 +1,279 @@
+{-# LANGUAGE OverloadedStrings #-}
+{-
+Copyright (C) 2011 John MacFarlane <jgm@berkeley.edu>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+-}
+
+{- |
+ Module : Text.Pandoc.SelfContained
+ Copyright : Copyright (C) 2011 John MacFarlane
+ License : GNU GPL, version 2 or above
+
+ Maintainer : John MacFarlane <jgm@berkeley.edu>
+ Stability : alpha
+ Portability : portable
+
+Functions for converting an HTML file into one that can be viewed
+offline, by incorporating linked images, CSS, and scripts into
+the HTML using data URIs.
+-}
+module Text.Pandoc.SelfContained ( makeSelfContained ) where
+import Text.HTML.TagSoup
+import Network.URI (isAbsoluteURI, parseURI, escapeURIString)
+import Network.HTTP
+import Data.ByteString.Base64
+import qualified Data.ByteString.Char8 as B
+import Data.ByteString (ByteString)
+import Data.ByteString.UTF8 (toString)
+import System.FilePath (takeExtension, dropExtension, takeDirectory, (</>))
+import Data.Char (toLower, isAscii, isAlphaNum)
+import Codec.Compression.GZip as Gzip
+import qualified Data.ByteString.Lazy as L
+import Text.Pandoc.Shared (findDataFile)
+import System.Directory (doesFileExist)
+
+getItem :: Maybe FilePath -> String -> IO ByteString
+getItem userdata f =
+ if isAbsoluteURI f
+ then openURL f
+ else do
+ exists <- doesFileExist f
+ if exists
+ then B.readFile f
+ else do
+ res <- findDataFile userdata f
+ exists' <- doesFileExist res
+ if exists'
+ then B.readFile res
+ else error $ "Could not find `" ++ f ++ "'"
+
+openURL :: String -> IO ByteString
+openURL u = getResponseBody =<< simpleHTTP (getReq u)
+ where getReq v = case parseURI v of
+ Nothing -> error $ "Could not parse URI: " ++ v
+ Just u' -> mkRequest GET u'
+
+mimeTypeFor :: String -> String
+mimeTypeFor s = case lookup s mimetypes of
+ Nothing -> error $ "Could not find mime type for " ++ s
+ Just x -> x
+ where mimetypes = [ -- taken from MissingH
+ (".a", "application/octet-stream"),
+ (".ai", "application/postscript"),
+ (".aif", "audio/x-aiff"),
+ (".aifc", "audio/x-aiff"),
+ (".aiff", "audio/x-aiff"),
+ (".au", "audio/basic"),
+ (".avi", "video/x-msvideo"),
+ (".bat", "text/plain"),
+ (".bcpio", "application/x-bcpio"),
+ (".bin", "application/octet-stream"),
+ (".bmp", "image/x-ms-bmp"),
+ (".c", "text/plain"),
+ (".cdf", "application/x-netcdf"),
+ (".cpio", "application/x-cpio"),
+ (".csh", "application/x-csh"),
+ (".css", "text/css"),
+ (".dll", "application/octet-stream"),
+ (".doc", "application/msword"),
+ (".dot", "application/msword"),
+ (".dvi", "application/x-dvi"),
+ (".eml", "message/rfc822"),
+ (".eps", "application/postscript"),
+ (".etx", "text/x-setext"),
+ (".exe", "application/octet-stream"),
+ (".gif", "image/gif"),
+ (".gtar", "application/x-gtar"),
+ (".h", "text/plain"),
+ (".hdf", "application/x-hdf"),
+ (".htm", "text/html"),
+ (".html", "text/html"),
+ (".ief", "image/ief"),
+ (".jpe", "image/jpeg"),
+ (".jpeg", "image/jpeg"),
+ (".jpg", "image/jpeg"),
+ (".js", "application/x-javascript"),
+ (".ksh", "text/plain"),
+ (".latex", "application/x-latex"),
+ (".m1v", "video/mpeg"),
+ (".man", "application/x-troff-man"),
+ (".me", "application/x-troff-me"),
+ (".mht", "message/rfc822"),
+ (".mhtml", "message/rfc822"),
+ (".mif", "application/x-mif"),
+ (".mov", "video/quicktime"),
+ (".movie", "video/x-sgi-movie"),
+ (".mp2", "audio/mpeg"),
+ (".mp3", "audio/mpeg"),
+ (".mpa", "video/mpeg"),
+ (".mpe", "video/mpeg"),
+ (".mpeg", "video/mpeg"),
+ (".mpg", "video/mpeg"),
+ (".ms", "application/x-troff-ms"),
+ (".nc", "application/x-netcdf"),
+ (".nws", "message/rfc822"),
+ (".o", "application/octet-stream"),
+ (".obj", "application/octet-stream"),
+ (".oda", "application/oda"),
+ (".p12", "application/x-pkcs12"),
+ (".p7c", "application/pkcs7-mime"),
+ (".pbm", "image/x-portable-bitmap"),
+ (".pdf", "application/pdf"),
+ (".pfx", "application/x-pkcs12"),
+ (".pgm", "image/x-portable-graymap"),
+ (".pl", "text/plain"),
+ (".png", "image/png"),
+ (".pnm", "image/x-portable-anymap"),
+ (".pot", "application/vnd.ms-powerpoint"),
+ (".ppa", "application/vnd.ms-powerpoint"),
+ (".ppm", "image/x-portable-pixmap"),
+ (".pps", "application/vnd.ms-powerpoint"),
+ (".ppt", "application/vnd.ms-powerpoint"),
+ (".ps", "application/postscript"),
+ (".pwz", "application/vnd.ms-powerpoint"),
+ (".py", "text/x-python"),
+ (".pyc", "application/x-python-code"),
+ (".pyo", "application/x-python-code"),
+ (".qt", "video/quicktime"),
+ (".ra", "audio/x-pn-realaudio"),
+ (".ram", "application/x-pn-realaudio"),
+ (".ras", "image/x-cmu-raster"),
+ (".rdf", "application/xml"),
+ (".rgb", "image/x-rgb"),
+ (".roff", "application/x-troff"),
+ (".rtx", "text/richtext"),
+ (".sgm", "text/x-sgml"),
+ (".sgml", "text/x-sgml"),
+ (".sh", "application/x-sh"),
+ (".shar", "application/x-shar"),
+ (".snd", "audio/basic"),
+ (".so", "application/octet-stream"),
+ (".src", "application/x-wais-source"),
+ (".sv4cpio", "application/x-sv4cpio"),
+ (".sv4crc", "application/x-sv4crc"),
+ (".swf", "application/x-shockwave-flash"),
+ (".t", "application/x-troff"),
+ (".tar", "application/x-tar"),
+ (".tcl", "application/x-tcl"),
+ (".tex", "application/x-tex"),
+ (".texi", "application/x-texinfo"),
+ (".texinfo", "application/x-texinfo"),
+ (".tif", "image/tiff"),
+ (".tiff", "image/tiff"),
+ (".tr", "application/x-troff"),
+ (".tsv", "text/tab-separated-values"),
+ (".txt", "text/plain"),
+ (".ustar", "application/x-ustar"),
+ (".vcf", "text/x-vcard"),
+ (".wav", "audio/x-wav"),
+ (".wiz", "application/msword"),
+ (".xbm", "image/x-xbitmap"),
+ (".xlb", "application/vnd.ms-excel"),
+ (".xls", "application/vnd.ms-excel"),
+ (".xml", "text/xml"),
+ (".xpm", "image/x-xpixmap"),
+ (".xsl", "application/xml"),
+ (".xwd", "image/x-xwindowdump"),
+ (".zip", "application/zip"),
+ (".jpg", "image/jpg"),
+ (".mid", "audio/midi"),
+ (".midi", "audio/midi"),
+ (".pct", "image/pict"),
+ (".pic", "image/pict"),
+ (".pict", "image/pict"),
+ (".rtf", "application/rtf"),
+ (".xul", "text/xul")
+ ]
+
+isOk :: Char -> Bool
+isOk c = isAscii c && isAlphaNum c
+
+convertTag :: Maybe FilePath -> Tag String -> IO (Tag String)
+convertTag userdata t@(TagOpen "img" as) =
+ case fromAttrib "src" t of
+ [] -> return t
+ src -> do
+ (raw, mime) <- getRaw userdata t src
+ let enc = "data:" ++ mime ++ ";base64," ++ toString (encode raw)
+ return $ TagOpen "img" (("src",enc) : [(x,y) | (x,y) <- as, x /= "src"])
+convertTag userdata t@(TagOpen "script" as) =
+ case fromAttrib "src" t of
+ [] -> return t
+ src -> do
+ (raw, mime) <- getRaw userdata t src
+ let enc = "data:" ++ mime ++ "," ++ escapeURIString isOk (toString raw)
+ return $ TagOpen "script" (("src",enc) : [(x,y) | (x,y) <- as, x /= "src"])
+convertTag userdata t@(TagOpen "link" as) =
+ case fromAttrib "href" t of
+ [] -> return t
+ src -> do
+ (raw, mime) <- getRaw userdata t src
+ let enc = "data:" ++ mime ++ "," ++ escapeURIString isOk (toString raw)
+ return $ TagOpen "link" (("href",enc) : [(x,y) | (x,y) <- as, x /= "href"])
+convertTag _ t = return t
+
+cssImports :: Maybe FilePath -> FilePath -> ByteString -> IO ByteString
+cssImports userdata d orig =
+ case B.breakSubstring "@import" orig of
+ (x,y) | B.null y -> return orig
+ | otherwise -> do
+ rest <- handleImport userdata d (B.drop 7 y)
+ >>= cssImports userdata d
+ return $ x `B.append` rest
+
+-- @import url("blah");
+-- @import url(blah);
+-- @import "blah";
+handleImport :: Maybe FilePath -> FilePath -> ByteString -> IO ByteString
+handleImport userdata d x =
+ fmap (`B.append` rest) (getItem userdata $ d </> url)
+ where lparenOrQuote c = c == '(' || c == '"'
+ rparenOrQuote c = c == ')' || c == '"'
+ url = toString
+ $ B.takeWhile (not . rparenOrQuote)
+ $ B.dropWhile lparenOrQuote
+ $ B.dropWhile (not . lparenOrQuote) x
+ rest = B.drop 1 $ B.dropWhile (/= ';') x
+
+getRaw :: Maybe FilePath -> Tag String -> String -> IO (ByteString, String)
+getRaw userdata t src = do
+ let ext = map toLower $ takeExtension src
+ let (ext',decomp) = if ext == ".gz"
+ then (takeExtension $ dropExtension src, B.concat . L.toChunks . Gzip.decompress . L.fromChunks . (:[]))
+ else (ext, id)
+ let mime = case fromAttrib "type" t of
+ [] -> mimeTypeFor ext'
+ x -> x
+ raw <- getItem userdata src
+ result <- if mime == "text/css"
+ then cssImports userdata (takeDirectory src) $ decomp raw
+ else return $ decomp raw
+ return (result, mime)
+
+-- | Convert HTML into self-contained HTML, incorporating images,
+-- scripts, and CSS using data: URIs. Items specified using absolute
+-- URLs will be downloaded; those specified using relative URLs will
+-- be sought first relative to the working directory, then relative
+-- to the user data directory (if the first parameter is 'Just'
+-- a directory), and finally relative to pandoc's default data
+-- directory.
+makeSelfContained :: Maybe FilePath -> String -> IO String
+makeSelfContained userdata inp = do
+ let tags = parseTags inp
+ out' <- mapM (convertTag userdata) tags
+ return $ renderTagsOptions renderOptions{ optMinimize = (\t -> t == "br"
+ || t == "img" || t == "meta" || t == "link" ) } out'
+