aboutsummaryrefslogtreecommitdiff
path: root/src/Text/Pandoc
diff options
context:
space:
mode:
authorJohn MacFarlane <jgm@berkeley.edu>2016-12-11 23:10:46 +0100
committerJohn MacFarlane <jgm@berkeley.edu>2017-01-25 17:07:41 +0100
commit4cb124d147790814cf2055afdfd17e500cece559 (patch)
tree030f28050ba086fef6fc1cb8cfd6b297a8e83962 /src/Text/Pandoc
parentbe140ab496034f3b585d70859b652d4452ec3e03 (diff)
downloadpandoc-4cb124d147790814cf2055afdfd17e500cece559.tar.gz
Add openURL and readFileStrict to PandocMonad.
Removed fetchItem and fetchItem'. Provide fetchItem in PandocMonad (it uses openURL and readFileStrict). TODO: - PandocPure instance for openURL. - Fix places where fetchItem is used so that we trap the exception instead of checking for a Left value. (At least in the places where we want a warning rather than a failure.)
Diffstat (limited to 'src/Text/Pandoc')
-rw-r--r--src/Text/Pandoc/Class.hs136
1 files changed, 98 insertions, 38 deletions
diff --git a/src/Text/Pandoc/Class.hs b/src/Text/Pandoc/Class.hs
index 7af9b8bdd..9604d7c18 100644
--- a/src/Text/Pandoc/Class.hs
+++ b/src/Text/Pandoc/Class.hs
@@ -46,6 +46,7 @@ module Text.Pandoc.Class ( PandocMonad(..)
, getMediaBag
, setMediaBag
, insertMedia
+ , fetchItem
, getInputFiles
, getOutputFile
, PandocIO(..)
@@ -64,27 +65,28 @@ import qualified System.Random as IO (newStdGen)
import Codec.Archive.Zip (Archive, fromArchive, emptyArchive)
import Data.Unique (hashUnique)
import qualified Data.Unique as IO (newUnique)
-import qualified Text.Pandoc.Shared as IO ( fetchItem
- , fetchItem'
- , readDataFile
- , warn)
+import qualified Text.Pandoc.Shared as IO ( readDataFile
+ , warn
+ , openURL )
import Text.Pandoc.Compat.Time (UTCTime)
import Text.Pandoc.Parsing (ParserT, SourcePos)
import qualified Text.Pandoc.Compat.Time as IO (getCurrentTime)
+import Text.Pandoc.MIME (MimeType, getMimeType)
import Data.Time.Clock.POSIX ( utcTimeToPOSIXSeconds
, posixSecondsToUTCTime
, POSIXTime )
import Data.Time.LocalTime (TimeZone, ZonedTime, utcToZonedTime, utc)
+import Network.URI ( escapeURIString, nonStrictRelativeTo,
+ unEscapeString, parseURIReference, isAllowedInURI,
+ parseURI, URI(..) )
import qualified Data.Time.LocalTime as IO (getCurrentTimeZone)
-import Text.Pandoc.MIME (MimeType, getMimeType)
-import Text.Pandoc.MediaBag (MediaBag)
+import Text.Pandoc.MediaBag (MediaBag, lookupMedia)
import qualified Text.Pandoc.MediaBag as MB
import qualified Data.ByteString as B
import qualified Data.ByteString.Lazy as BL
-import qualified Control.Exception as E
import qualified System.Environment as IO (lookupEnv)
import System.FilePath.Glob (match, compile)
-import System.FilePath ((</>))
+import System.FilePath ((</>), takeExtension, dropExtension)
import qualified System.FilePath.Glob as IO (glob)
import qualified System.Directory as IO (getModificationTime)
import Control.Monad as M (fail)
@@ -106,17 +108,12 @@ class (Functor m, Applicative m, Monad m, MonadError PandocError m)
getCurrentTimeZone :: m TimeZone
newStdGen :: m StdGen
newUniqueHash :: m Int
+ openURL :: String -> m (B.ByteString, Maybe MimeType)
readFileLazy :: FilePath -> m BL.ByteString
+ readFileStrict :: FilePath -> m B.ByteString
readDataFile :: Maybe FilePath
-> FilePath
-> m B.ByteString
- fetchItem :: Maybe String
- -> String
- -> m (Either E.SomeException (B.ByteString, Maybe MimeType))
- fetchItem' :: MediaBag
- -> Maybe String
- -> String
- -> m (Either E.SomeException (B.ByteString, Maybe MimeType))
glob :: String -> m [FilePath]
getModificationTime :: FilePath -> m UTCTime
getCommonState :: m CommonState
@@ -213,19 +210,28 @@ instance PandocMonad PandocIO where
getCurrentTimeZone = liftIO IO.getCurrentTimeZone
newStdGen = liftIO IO.newStdGen
newUniqueHash = hashUnique <$> (liftIO IO.newUnique)
+ openURL u = do
+ eitherRes <- liftIO $ (tryIOError $ IO.openURL u)
+ case eitherRes of
+ Right (Right res) -> return res
+ Right (Left _) -> throwError $ PandocFileReadError u
+ Left _ -> throwError $ PandocFileReadError u
readFileLazy s = do
eitherBS <- liftIO (tryIOError $ BL.readFile s)
case eitherBS of
Right bs -> return bs
Left _ -> throwError $ PandocFileReadError s
+ readFileStrict s = do
+ eitherBS <- liftIO (tryIOError $ B.readFile s)
+ case eitherBS of
+ Right bs -> return bs
+ Left _ -> throwError $ PandocFileReadError s
-- TODO: Make this more sensitive to the different sorts of failure
readDataFile mfp fname = do
eitherBS <- liftIO (tryIOError $ IO.readDataFile mfp fname)
case eitherBS of
Right bs -> return bs
Left _ -> throwError $ PandocFileReadError fname
- fetchItem ms s = liftIO $ IO.fetchItem ms s
- fetchItem' mb ms s = liftIO $ IO.fetchItem' mb ms s
glob = liftIO . IO.glob
getModificationTime fp = do
eitherMtime <- liftIO (tryIOError $ IO.getModificationTime fp)
@@ -235,6 +241,64 @@ instance PandocMonad PandocIO where
getCommonState = PandocIO $ lift get
putCommonState x = PandocIO $ lift $ put x
+
+-- | Specialized version of parseURIReference that disallows
+-- single-letter schemes. Reason: these are usually windows absolute
+-- paths.
+parseURIReference' :: String -> Maybe URI
+parseURIReference' s =
+ case parseURIReference s of
+ Just u
+ | length (uriScheme u) > 2 -> Just u
+ | null (uriScheme u) -> Just u -- protocol-relative
+ _ -> Nothing
+
+-- | Fetch an image or other item from the local filesystem or the net.
+-- Returns raw content and maybe mime type.
+fetchItem :: PandocMonad m
+ => Maybe String
+ -> String
+ -> m (B.ByteString, Maybe MimeType)
+fetchItem sourceURL s = do
+ mediabag <- getMediaBag
+ case lookupMedia s mediabag of
+ Just (mime, bs) -> return $ (BL.toStrict bs, Just mime)
+ Nothing ->
+ case (sourceURL >>= parseURIReference' .
+ ensureEscaped, ensureEscaped s) of
+ (Just u, s') -> -- try fetching from relative path at source
+ case parseURIReference' s' of
+ Just u' -> openURL $ show $ u' `nonStrictRelativeTo` u
+ Nothing -> openURL s' -- will throw error
+ (Nothing, s'@('/':'/':_)) -> -- protocol-relative URI
+ case parseURIReference' s' of
+ Just u' -> openURL $ show $ u' `nonStrictRelativeTo` httpcolon
+ Nothing -> openURL s' -- will throw error
+ (Nothing, s') ->
+ case parseURI s' of -- requires absolute URI
+ -- We don't want to treat C:/ as a scheme:
+ Just u' | length (uriScheme u') > 2 -> openURL (show u')
+ Just u' | uriScheme u' == "file:" ->
+ readLocalFile $ dropWhile (=='/') (uriPath u')
+ _ -> readLocalFile fp -- get from local file system
+ where readLocalFile f = do
+ cont <- readFileStrict f
+ return (cont, mime)
+ httpcolon = URI{ uriScheme = "http:",
+ uriAuthority = Nothing,
+ uriPath = "",
+ uriQuery = "",
+ uriFragment = "" }
+ dropFragmentAndQuery = takeWhile (\c -> c /= '?' && c /= '#')
+ fp = unEscapeString $ dropFragmentAndQuery s
+ mime = case takeExtension fp of
+ ".gz" -> getMimeType $ dropExtension fp
+ ".svgz" -> getMimeType $ dropExtension fp ++ ".svg"
+ x -> getMimeType x
+ ensureEscaped = escapeURIString isAllowedInURI . map convertSlash
+ convertSlash '\\' = '/'
+ convertSlash x = x
+
data PureState = PureState { stStdGen :: StdGen
, stWord8Store :: [Word8] -- should be
-- inifinite,
@@ -332,33 +396,29 @@ instance PandocMonad PandocPure where
modifyPureState $ \st -> st { stUniqStore = us }
return u
_ -> M.fail "uniq store ran out of elements"
+ openURL _ = undefined -- TODO
readFileLazy fp = do
fps <- getsPureState stFiles
case infoFileContents <$> getFileInfo fp fps of
Just bs -> return (BL.fromStrict bs)
Nothing -> throwError $ PandocFileReadError fp
+ readFileStrict fp = do
+ fps <- getsPureState stFiles
+ case infoFileContents <$> getFileInfo fp fps of
+ Just bs -> return bs
+ Nothing -> throwError $ PandocFileReadError fp
readDataFile Nothing "reference.docx" = do
(B.concat . BL.toChunks . fromArchive) <$> getsPureState stReferenceDocx
readDataFile Nothing "reference.odt" = do
(B.concat . BL.toChunks . fromArchive) <$> getsPureState stReferenceODT
readDataFile Nothing fname = do
let fname' = if fname == "MANUAL.txt" then fname else "data" </> fname
- BL.toStrict <$> (readFileLazy fname')
+ readFileStrict fname'
readDataFile (Just userDir) fname = do
userDirFiles <- getsPureState stUserDataDir
case infoFileContents <$> (getFileInfo (userDir </> fname) userDirFiles) of
Just bs -> return bs
Nothing -> readDataFile Nothing fname
- fetchItem _ fp = do
- fps <- getsPureState stFiles
- case infoFileContents <$> (getFileInfo fp fps) of
- Just bs -> return (Right (bs, getMimeType fp))
- Nothing -> return (Left $ E.toException $ PandocFileReadError fp)
-
- fetchItem' media sourceUrl nm = do
- case MB.lookupMedia nm media of
- Nothing -> fetchItem sourceUrl nm
- Just (mime, bs) -> return (Right (B.concat $ BL.toChunks bs, Just mime))
glob s = do
fontFiles <- getsPureState stFontFiles
@@ -379,10 +439,10 @@ instance PandocMonad m => PandocMonad (ParserT s st m) where
getCurrentTimeZone = lift getCurrentTimeZone
newStdGen = lift newStdGen
newUniqueHash = lift newUniqueHash
+ openURL = lift . openURL
readFileLazy = lift . readFileLazy
+ readFileStrict = lift . readFileStrict
readDataFile mbuserdir = lift . readDataFile mbuserdir
- fetchItem media = lift . fetchItem media
- fetchItem' media sourceUrl = lift . fetchItem' media sourceUrl
glob = lift . glob
getModificationTime = lift . getModificationTime
getCommonState = lift getCommonState
@@ -394,10 +454,10 @@ instance PandocMonad m => PandocMonad (ReaderT r m) where
getCurrentTimeZone = lift getCurrentTimeZone
newStdGen = lift newStdGen
newUniqueHash = lift newUniqueHash
+ openURL = lift . openURL
readFileLazy = lift . readFileLazy
+ readFileStrict = lift . readFileStrict
readDataFile mbuserdir = lift . readDataFile mbuserdir
- fetchItem media = lift . fetchItem media
- fetchItem' media sourceUrl = lift . fetchItem' media sourceUrl
glob = lift . glob
getModificationTime = lift . getModificationTime
getCommonState = lift getCommonState
@@ -409,10 +469,10 @@ instance (PandocMonad m, Monoid w) => PandocMonad (WriterT w m) where
getCurrentTimeZone = lift getCurrentTimeZone
newStdGen = lift newStdGen
newUniqueHash = lift newUniqueHash
+ openURL = lift . openURL
readFileLazy = lift . readFileLazy
+ readFileStrict = lift . readFileStrict
readDataFile mbuserdir = lift . readDataFile mbuserdir
- fetchItem media = lift . fetchItem media
- fetchItem' media sourceUrl = lift . fetchItem' media sourceUrl
glob = lift . glob
getModificationTime = lift . getModificationTime
getCommonState = lift getCommonState
@@ -424,10 +484,10 @@ instance (PandocMonad m, Monoid w) => PandocMonad (RWST r w st m) where
getCurrentTimeZone = lift getCurrentTimeZone
newStdGen = lift newStdGen
newUniqueHash = lift newUniqueHash
+ openURL = lift . openURL
readFileLazy = lift . readFileLazy
+ readFileStrict = lift . readFileStrict
readDataFile mbuserdir = lift . readDataFile mbuserdir
- fetchItem media = lift . fetchItem media
- fetchItem' media sourceUrl = lift . fetchItem' media sourceUrl
glob = lift . glob
getModificationTime = lift . getModificationTime
getCommonState = lift getCommonState
@@ -439,10 +499,10 @@ instance PandocMonad m => PandocMonad (StateT st m) where
getCurrentTimeZone = lift getCurrentTimeZone
newStdGen = lift newStdGen
newUniqueHash = lift newUniqueHash
+ openURL = lift . openURL
readFileLazy = lift . readFileLazy
+ readFileStrict = lift . readFileStrict
readDataFile mbuserdir = lift . readDataFile mbuserdir
- fetchItem media = lift . fetchItem media
- fetchItem' media sourceUrl = lift . fetchItem' media sourceUrl
glob = lift . glob
getModificationTime = lift . getModificationTime
getCommonState = lift getCommonState