From 10b23e85b7131eb9a653296884d71a3d6b02841d Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Mon, 21 Nov 2011 15:09:42 -0800 Subject: Changed Offline module to SelfContained, offline to makeSelfContained. --- src/Text/Pandoc/Offline.hs | 272 -------------------------------------- src/Text/Pandoc/SelfContained.hs | 279 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 279 insertions(+), 272 deletions(-) delete mode 100644 src/Text/Pandoc/Offline.hs create mode 100644 src/Text/Pandoc/SelfContained.hs (limited to 'src') diff --git a/src/Text/Pandoc/Offline.hs b/src/Text/Pandoc/Offline.hs deleted file mode 100644 index 99b461141..000000000 --- a/src/Text/Pandoc/Offline.hs +++ /dev/null @@ -1,272 +0,0 @@ -{-# LANGUAGE OverloadedStrings #-} -{- -Copyright (C) 2011 John MacFarlane - -This program is free software; you can redistribute it and/or modify -it under the terms of the GNU General Public License as published by -the Free Software Foundation; either version 2 of the License, or -(at your option) any later version. - -This program is distributed in the hope that it will be useful, -but WITHOUT ANY WARRANTY; without even the implied warranty of -MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -GNU General Public License for more details. - -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA --} - -{- | - Module : Text.Pandoc.Offline - Copyright : Copyright (C) 2011 John MacFarlane - License : GNU GPL, version 2 or above - - Maintainer : John MacFarlane - Stability : alpha - Portability : portable - -Functions for converting an HTML file into one that can be viewed -offline, by incorporating linked images, CSS, and scripts into -the HTML using data URIs. --} -module Text.Pandoc.Offline ( offline ) where -import Text.HTML.TagSoup -import Network.URI (isAbsoluteURI, parseURI, escapeURIString) -import Network.HTTP -import Data.ByteString.Base64 -import qualified Data.ByteString.Char8 as B -import Data.ByteString (ByteString) -import Data.ByteString.UTF8 (toString) -import System.FilePath (takeExtension, dropExtension, takeDirectory, ()) -import Data.Char (toLower, isAscii, isAlphaNum) -import Codec.Compression.GZip as Gzip -import qualified Data.ByteString.Lazy as L -import Text.Pandoc.Shared (findDataFile) -import System.Directory (doesFileExist) - -getItem :: Maybe FilePath -> String -> IO ByteString -getItem userdata f = - if isAbsoluteURI f - then openURL f - else do - exists <- doesFileExist f - if exists - then B.readFile f - else do - res <- findDataFile userdata f - exists' <- doesFileExist res - if exists' - then B.readFile res - else B.readFile f -- will throw error - -openURL :: String -> IO ByteString -openURL u = getResponseBody =<< simpleHTTP (getReq u) - where getReq v = case parseURI v of - Nothing -> error $ "Could not parse URI: " ++ v - Just u' -> mkRequest GET u' - -mimeTypeFor :: String -> String -mimeTypeFor s = case lookup s mimetypes of - Nothing -> error $ "Could not find mime type for " ++ s - Just x -> x - where mimetypes = [ -- taken from MissingH - (".a", "application/octet-stream"), - (".ai", "application/postscript"), - (".aif", "audio/x-aiff"), - (".aifc", "audio/x-aiff"), - (".aiff", "audio/x-aiff"), - (".au", "audio/basic"), - (".avi", "video/x-msvideo"), - (".bat", "text/plain"), - (".bcpio", "application/x-bcpio"), - (".bin", "application/octet-stream"), - (".bmp", "image/x-ms-bmp"), - (".c", "text/plain"), - (".cdf", "application/x-netcdf"), - (".cpio", "application/x-cpio"), - (".csh", "application/x-csh"), - (".css", "text/css"), - (".dll", "application/octet-stream"), - (".doc", "application/msword"), - (".dot", "application/msword"), - (".dvi", "application/x-dvi"), - (".eml", "message/rfc822"), - (".eps", "application/postscript"), - (".etx", "text/x-setext"), - (".exe", "application/octet-stream"), - (".gif", "image/gif"), - (".gtar", "application/x-gtar"), - (".h", "text/plain"), - (".hdf", "application/x-hdf"), - (".htm", "text/html"), - (".html", "text/html"), - (".ief", "image/ief"), - (".jpe", "image/jpeg"), - (".jpeg", "image/jpeg"), - (".jpg", "image/jpeg"), - (".js", "application/x-javascript"), - (".ksh", "text/plain"), - (".latex", "application/x-latex"), - (".m1v", "video/mpeg"), - (".man", "application/x-troff-man"), - (".me", "application/x-troff-me"), - (".mht", "message/rfc822"), - (".mhtml", "message/rfc822"), - (".mif", "application/x-mif"), - (".mov", "video/quicktime"), - (".movie", "video/x-sgi-movie"), - (".mp2", "audio/mpeg"), - (".mp3", "audio/mpeg"), - (".mpa", "video/mpeg"), - (".mpe", "video/mpeg"), - (".mpeg", "video/mpeg"), - (".mpg", "video/mpeg"), - (".ms", "application/x-troff-ms"), - (".nc", "application/x-netcdf"), - (".nws", "message/rfc822"), - (".o", "application/octet-stream"), - (".obj", "application/octet-stream"), - (".oda", "application/oda"), - (".p12", "application/x-pkcs12"), - (".p7c", "application/pkcs7-mime"), - (".pbm", "image/x-portable-bitmap"), - (".pdf", "application/pdf"), - (".pfx", "application/x-pkcs12"), - (".pgm", "image/x-portable-graymap"), - (".pl", "text/plain"), - (".png", "image/png"), - (".pnm", "image/x-portable-anymap"), - (".pot", "application/vnd.ms-powerpoint"), - (".ppa", "application/vnd.ms-powerpoint"), - (".ppm", "image/x-portable-pixmap"), - (".pps", "application/vnd.ms-powerpoint"), - (".ppt", "application/vnd.ms-powerpoint"), - (".ps", "application/postscript"), - (".pwz", "application/vnd.ms-powerpoint"), - (".py", "text/x-python"), - (".pyc", "application/x-python-code"), - (".pyo", "application/x-python-code"), - (".qt", "video/quicktime"), - (".ra", "audio/x-pn-realaudio"), - (".ram", "application/x-pn-realaudio"), - (".ras", "image/x-cmu-raster"), - (".rdf", "application/xml"), - (".rgb", "image/x-rgb"), - (".roff", "application/x-troff"), - (".rtx", "text/richtext"), - (".sgm", "text/x-sgml"), - (".sgml", "text/x-sgml"), - (".sh", "application/x-sh"), - (".shar", "application/x-shar"), - (".snd", "audio/basic"), - (".so", "application/octet-stream"), - (".src", "application/x-wais-source"), - (".sv4cpio", "application/x-sv4cpio"), - (".sv4crc", "application/x-sv4crc"), - (".swf", "application/x-shockwave-flash"), - (".t", "application/x-troff"), - (".tar", "application/x-tar"), - (".tcl", "application/x-tcl"), - (".tex", "application/x-tex"), - (".texi", "application/x-texinfo"), - (".texinfo", "application/x-texinfo"), - (".tif", "image/tiff"), - (".tiff", "image/tiff"), - (".tr", "application/x-troff"), - (".tsv", "text/tab-separated-values"), - (".txt", "text/plain"), - (".ustar", "application/x-ustar"), - (".vcf", "text/x-vcard"), - (".wav", "audio/x-wav"), - (".wiz", "application/msword"), - (".xbm", "image/x-xbitmap"), - (".xlb", "application/vnd.ms-excel"), - (".xls", "application/vnd.ms-excel"), - (".xml", "text/xml"), - (".xpm", "image/x-xpixmap"), - (".xsl", "application/xml"), - (".xwd", "image/x-xwindowdump"), - (".zip", "application/zip"), - (".jpg", "image/jpg"), - (".mid", "audio/midi"), - (".midi", "audio/midi"), - (".pct", "image/pict"), - (".pic", "image/pict"), - (".pict", "image/pict"), - (".rtf", "application/rtf"), - (".xul", "text/xul") - ] - -isOk :: Char -> Bool -isOk c = isAscii c && isAlphaNum c - -convertTag :: Maybe FilePath -> Tag String -> IO (Tag String) -convertTag userdata t@(TagOpen "img" as) = - case fromAttrib "src" t of - [] -> return t - src -> do - (raw, mime) <- getRaw userdata t src - let enc = "data:" ++ mime ++ ";base64," ++ toString (encode raw) - return $ TagOpen "img" (("src",enc) : [(x,y) | (x,y) <- as, x /= "src"]) -convertTag userdata t@(TagOpen "script" as) = - case fromAttrib "src" t of - [] -> return t - src -> do - (raw, mime) <- getRaw userdata t src - let enc = "data:" ++ mime ++ "," ++ escapeURIString isOk (toString raw) - return $ TagOpen "script" (("src",enc) : [(x,y) | (x,y) <- as, x /= "src"]) -convertTag userdata t@(TagOpen "link" as) = - case fromAttrib "href" t of - [] -> return t - src -> do - (raw, mime) <- getRaw userdata t src - let enc = "data:" ++ mime ++ "," ++ escapeURIString isOk (toString raw) - return $ TagOpen "link" (("href",enc) : [(x,y) | (x,y) <- as, x /= "href"]) -convertTag _ t = return t - -cssImports :: Maybe FilePath -> FilePath -> ByteString -> IO ByteString -cssImports userdata d orig = - case B.breakSubstring "@import" orig of - (x,y) | B.null y -> return orig - | otherwise -> do - rest <- handleImport userdata d (B.drop 7 y) - >>= cssImports userdata d - return $ x `B.append` rest - --- @import url("blah"); --- @import url(blah); --- @import "blah"; -handleImport :: Maybe FilePath -> FilePath -> ByteString -> IO ByteString -handleImport userdata d x = - fmap (`B.append` rest) (getItem userdata $ d url) - where lparenOrQuote c = c == '(' || c == '"' - rparenOrQuote c = c == ')' || c == '"' - url = toString - $ B.takeWhile (not . rparenOrQuote) - $ B.dropWhile lparenOrQuote - $ B.dropWhile (not . lparenOrQuote) x - rest = B.drop 1 $ B.dropWhile (/= ';') x - -getRaw :: Maybe FilePath -> Tag String -> String -> IO (ByteString, String) -getRaw userdata t src = do - let ext = map toLower $ takeExtension src - let (ext',decomp) = if ext == ".gz" - then (takeExtension $ dropExtension src, B.concat . L.toChunks . Gzip.decompress . L.fromChunks . (:[])) - else (ext, id) - let mime = case fromAttrib "type" t of - [] -> mimeTypeFor ext' - x -> x - raw <- getItem userdata src - result <- if mime == "text/css" - then cssImports userdata (takeDirectory src) $ decomp raw - else return $ decomp raw - return (result, mime) - -offline :: Maybe FilePath -> String -> IO String -offline userdata inp = do - let tags = parseTags inp - out' <- mapM (convertTag userdata) tags - return $ renderTagsOptions renderOptions{ optMinimize = (\t -> t == "br" - || t == "img" || t == "meta" || t == "link" ) } out' - diff --git a/src/Text/Pandoc/SelfContained.hs b/src/Text/Pandoc/SelfContained.hs new file mode 100644 index 000000000..116083fba --- /dev/null +++ b/src/Text/Pandoc/SelfContained.hs @@ -0,0 +1,279 @@ +{-# LANGUAGE OverloadedStrings #-} +{- +Copyright (C) 2011 John MacFarlane + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.SelfContained + Copyright : Copyright (C) 2011 John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : John MacFarlane + Stability : alpha + Portability : portable + +Functions for converting an HTML file into one that can be viewed +offline, by incorporating linked images, CSS, and scripts into +the HTML using data URIs. +-} +module Text.Pandoc.SelfContained ( makeSelfContained ) where +import Text.HTML.TagSoup +import Network.URI (isAbsoluteURI, parseURI, escapeURIString) +import Network.HTTP +import Data.ByteString.Base64 +import qualified Data.ByteString.Char8 as B +import Data.ByteString (ByteString) +import Data.ByteString.UTF8 (toString) +import System.FilePath (takeExtension, dropExtension, takeDirectory, ()) +import Data.Char (toLower, isAscii, isAlphaNum) +import Codec.Compression.GZip as Gzip +import qualified Data.ByteString.Lazy as L +import Text.Pandoc.Shared (findDataFile) +import System.Directory (doesFileExist) + +getItem :: Maybe FilePath -> String -> IO ByteString +getItem userdata f = + if isAbsoluteURI f + then openURL f + else do + exists <- doesFileExist f + if exists + then B.readFile f + else do + res <- findDataFile userdata f + exists' <- doesFileExist res + if exists' + then B.readFile res + else error $ "Could not find `" ++ f ++ "'" + +openURL :: String -> IO ByteString +openURL u = getResponseBody =<< simpleHTTP (getReq u) + where getReq v = case parseURI v of + Nothing -> error $ "Could not parse URI: " ++ v + Just u' -> mkRequest GET u' + +mimeTypeFor :: String -> String +mimeTypeFor s = case lookup s mimetypes of + Nothing -> error $ "Could not find mime type for " ++ s + Just x -> x + where mimetypes = [ -- taken from MissingH + (".a", "application/octet-stream"), + (".ai", "application/postscript"), + (".aif", "audio/x-aiff"), + (".aifc", "audio/x-aiff"), + (".aiff", "audio/x-aiff"), + (".au", "audio/basic"), + (".avi", "video/x-msvideo"), + (".bat", "text/plain"), + (".bcpio", "application/x-bcpio"), + (".bin", "application/octet-stream"), + (".bmp", "image/x-ms-bmp"), + (".c", "text/plain"), + (".cdf", "application/x-netcdf"), + (".cpio", "application/x-cpio"), + (".csh", "application/x-csh"), + (".css", "text/css"), + (".dll", "application/octet-stream"), + (".doc", "application/msword"), + (".dot", "application/msword"), + (".dvi", "application/x-dvi"), + (".eml", "message/rfc822"), + (".eps", "application/postscript"), + (".etx", "text/x-setext"), + (".exe", "application/octet-stream"), + (".gif", "image/gif"), + (".gtar", "application/x-gtar"), + (".h", "text/plain"), + (".hdf", "application/x-hdf"), + (".htm", "text/html"), + (".html", "text/html"), + (".ief", "image/ief"), + (".jpe", "image/jpeg"), + (".jpeg", "image/jpeg"), + (".jpg", "image/jpeg"), + (".js", "application/x-javascript"), + (".ksh", "text/plain"), + (".latex", "application/x-latex"), + (".m1v", "video/mpeg"), + (".man", "application/x-troff-man"), + (".me", "application/x-troff-me"), + (".mht", "message/rfc822"), + (".mhtml", "message/rfc822"), + (".mif", "application/x-mif"), + (".mov", "video/quicktime"), + (".movie", "video/x-sgi-movie"), + (".mp2", "audio/mpeg"), + (".mp3", "audio/mpeg"), + (".mpa", "video/mpeg"), + (".mpe", "video/mpeg"), + (".mpeg", "video/mpeg"), + (".mpg", "video/mpeg"), + (".ms", "application/x-troff-ms"), + (".nc", "application/x-netcdf"), + (".nws", "message/rfc822"), + (".o", "application/octet-stream"), + (".obj", "application/octet-stream"), + (".oda", "application/oda"), + (".p12", "application/x-pkcs12"), + (".p7c", "application/pkcs7-mime"), + (".pbm", "image/x-portable-bitmap"), + (".pdf", "application/pdf"), + (".pfx", "application/x-pkcs12"), + (".pgm", "image/x-portable-graymap"), + (".pl", "text/plain"), + (".png", "image/png"), + (".pnm", "image/x-portable-anymap"), + (".pot", "application/vnd.ms-powerpoint"), + (".ppa", "application/vnd.ms-powerpoint"), + (".ppm", "image/x-portable-pixmap"), + (".pps", "application/vnd.ms-powerpoint"), + (".ppt", "application/vnd.ms-powerpoint"), + (".ps", "application/postscript"), + (".pwz", "application/vnd.ms-powerpoint"), + (".py", "text/x-python"), + (".pyc", "application/x-python-code"), + (".pyo", "application/x-python-code"), + (".qt", "video/quicktime"), + (".ra", "audio/x-pn-realaudio"), + (".ram", "application/x-pn-realaudio"), + (".ras", "image/x-cmu-raster"), + (".rdf", "application/xml"), + (".rgb", "image/x-rgb"), + (".roff", "application/x-troff"), + (".rtx", "text/richtext"), + (".sgm", "text/x-sgml"), + (".sgml", "text/x-sgml"), + (".sh", "application/x-sh"), + (".shar", "application/x-shar"), + (".snd", "audio/basic"), + (".so", "application/octet-stream"), + (".src", "application/x-wais-source"), + (".sv4cpio", "application/x-sv4cpio"), + (".sv4crc", "application/x-sv4crc"), + (".swf", "application/x-shockwave-flash"), + (".t", "application/x-troff"), + (".tar", "application/x-tar"), + (".tcl", "application/x-tcl"), + (".tex", "application/x-tex"), + (".texi", "application/x-texinfo"), + (".texinfo", "application/x-texinfo"), + (".tif", "image/tiff"), + (".tiff", "image/tiff"), + (".tr", "application/x-troff"), + (".tsv", "text/tab-separated-values"), + (".txt", "text/plain"), + (".ustar", "application/x-ustar"), + (".vcf", "text/x-vcard"), + (".wav", "audio/x-wav"), + (".wiz", "application/msword"), + (".xbm", "image/x-xbitmap"), + (".xlb", "application/vnd.ms-excel"), + (".xls", "application/vnd.ms-excel"), + (".xml", "text/xml"), + (".xpm", "image/x-xpixmap"), + (".xsl", "application/xml"), + (".xwd", "image/x-xwindowdump"), + (".zip", "application/zip"), + (".jpg", "image/jpg"), + (".mid", "audio/midi"), + (".midi", "audio/midi"), + (".pct", "image/pict"), + (".pic", "image/pict"), + (".pict", "image/pict"), + (".rtf", "application/rtf"), + (".xul", "text/xul") + ] + +isOk :: Char -> Bool +isOk c = isAscii c && isAlphaNum c + +convertTag :: Maybe FilePath -> Tag String -> IO (Tag String) +convertTag userdata t@(TagOpen "img" as) = + case fromAttrib "src" t of + [] -> return t + src -> do + (raw, mime) <- getRaw userdata t src + let enc = "data:" ++ mime ++ ";base64," ++ toString (encode raw) + return $ TagOpen "img" (("src",enc) : [(x,y) | (x,y) <- as, x /= "src"]) +convertTag userdata t@(TagOpen "script" as) = + case fromAttrib "src" t of + [] -> return t + src -> do + (raw, mime) <- getRaw userdata t src + let enc = "data:" ++ mime ++ "," ++ escapeURIString isOk (toString raw) + return $ TagOpen "script" (("src",enc) : [(x,y) | (x,y) <- as, x /= "src"]) +convertTag userdata t@(TagOpen "link" as) = + case fromAttrib "href" t of + [] -> return t + src -> do + (raw, mime) <- getRaw userdata t src + let enc = "data:" ++ mime ++ "," ++ escapeURIString isOk (toString raw) + return $ TagOpen "link" (("href",enc) : [(x,y) | (x,y) <- as, x /= "href"]) +convertTag _ t = return t + +cssImports :: Maybe FilePath -> FilePath -> ByteString -> IO ByteString +cssImports userdata d orig = + case B.breakSubstring "@import" orig of + (x,y) | B.null y -> return orig + | otherwise -> do + rest <- handleImport userdata d (B.drop 7 y) + >>= cssImports userdata d + return $ x `B.append` rest + +-- @import url("blah"); +-- @import url(blah); +-- @import "blah"; +handleImport :: Maybe FilePath -> FilePath -> ByteString -> IO ByteString +handleImport userdata d x = + fmap (`B.append` rest) (getItem userdata $ d url) + where lparenOrQuote c = c == '(' || c == '"' + rparenOrQuote c = c == ')' || c == '"' + url = toString + $ B.takeWhile (not . rparenOrQuote) + $ B.dropWhile lparenOrQuote + $ B.dropWhile (not . lparenOrQuote) x + rest = B.drop 1 $ B.dropWhile (/= ';') x + +getRaw :: Maybe FilePath -> Tag String -> String -> IO (ByteString, String) +getRaw userdata t src = do + let ext = map toLower $ takeExtension src + let (ext',decomp) = if ext == ".gz" + then (takeExtension $ dropExtension src, B.concat . L.toChunks . Gzip.decompress . L.fromChunks . (:[])) + else (ext, id) + let mime = case fromAttrib "type" t of + [] -> mimeTypeFor ext' + x -> x + raw <- getItem userdata src + result <- if mime == "text/css" + then cssImports userdata (takeDirectory src) $ decomp raw + else return $ decomp raw + return (result, mime) + +-- | Convert HTML into self-contained HTML, incorporating images, +-- scripts, and CSS using data: URIs. Items specified using absolute +-- URLs will be downloaded; those specified using relative URLs will +-- be sought first relative to the working directory, then relative +-- to the user data directory (if the first parameter is 'Just' +-- a directory), and finally relative to pandoc's default data +-- directory. +makeSelfContained :: Maybe FilePath -> String -> IO String +makeSelfContained userdata inp = do + let tags = parseTags inp + out' <- mapM (convertTag userdata) tags + return $ renderTagsOptions renderOptions{ optMinimize = (\t -> t == "br" + || t == "img" || t == "meta" || t == "link" ) } out' + -- cgit v1.2.3