From 116f03a70a8525565850f2779af2b6b348676267 Mon Sep 17 00:00:00 2001 From: Matthew Pickering Date: Fri, 8 Aug 2014 19:58:23 +0100 Subject: EPUB Reader: Removed incorrectly set reader flag --- src/Text/Pandoc/Readers/EPUB.hs | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'src/Text') diff --git a/src/Text/Pandoc/Readers/EPUB.hs b/src/Text/Pandoc/Readers/EPUB.hs index 968b815c0..8d6f322bf 100644 --- a/src/Text/Pandoc/Readers/EPUB.hs +++ b/src/Text/Pandoc/Readers/EPUB.hs @@ -51,7 +51,7 @@ runEPUB = either error id . runExcept -- are of the form "filename#id" -- archiveToEPUB :: (MonadError String m) => ReaderOptions -> Archive -> m (Pandoc, MediaBag) -archiveToEPUB os archive = do +archiveToEPUB (setEPUBOptions -> os) archive = do (root, content) <- getManifest archive meta <- parseMeta content (cover, items) <- parseManifest content @@ -65,9 +65,6 @@ archiveToEPUB os archive = do let mediaBag = fetchImages (M.elems items) root archive ast return $ (ast, mediaBag) where - rs = readerExtensions os - os' = os {readerExtensions = foldr S.insert rs [Ext_epub_html_exts, Ext_raw_html]} - os'' = os' {readerParseRaw = True} parseSpineElem :: MonadError String m => FilePath -> (FilePath, MIME) -> m Pandoc parseSpineElem (normalise -> r) (normalise -> path, mime) = do when (readerTrace os) (traceM path) @@ -78,13 +75,20 @@ archiveToEPUB os archive = do mimeToReader "application/xhtml+xml" r path = do fname <- findEntryByPathE (r path) archive return $ fixInternalReferences (r path) . - readHtml os'' . + readHtml os . UTF8.toStringLazy $ fromEntry fname mimeToReader s _ path | s `elem` imageMimes = return $ imageToPandoc path | otherwise = return $ mempty +setEPUBOptions :: ReaderOptions -> ReaderOptions +setEPUBOptions os = os'' + where + rs = readerExtensions os + os' = os {readerExtensions = foldr S.insert rs [Ext_epub_html_exts]} + os'' = os' {readerParseRaw = True} + fetchImages :: [(FilePath, MIME)] -> FilePath -> Archive -- cgit v1.2.3 From 2d956677eff4ce5635e37f389f1d2efd6f34615c Mon Sep 17 00:00:00 2001 From: Matthew Pickering Date: Fri, 8 Aug 2014 20:10:58 +0100 Subject: Shared: Added collapseFilePath function This function removes intermediate "." and ".." from a path. --- src/Text/Pandoc/Shared.hs | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) (limited to 'src/Text') diff --git a/src/Text/Pandoc/Shared.hs b/src/Text/Pandoc/Shared.hs index 51da34e79..a91ca9115 100644 --- a/src/Text/Pandoc/Shared.hs +++ b/src/Text/Pandoc/Shared.hs @@ -80,6 +80,7 @@ module Text.Pandoc.Shared ( fetchItem, fetchItem', openURL, + collapseFilePath, -- * Error handling err, warn, @@ -105,6 +106,7 @@ import Network.URI ( escapeURIString, isURI, nonStrictRelativeTo, unEscapeString, parseURIReference, isAllowedInURI ) import qualified Data.Set as Set import System.Directory +import System.FilePath (joinPath, splitDirectories) import Text.Pandoc.MIME (getMimeType) import System.FilePath ( (), takeExtension, dropExtension) import Data.Generics (Typeable, Data) @@ -530,7 +532,7 @@ stringify = query go . walk deNote deNote x = x -- | Bring all regular text in a pandoc structure to uppercase. --- +-- -- This function correctly handles cases where a lowercase character doesn't -- match to a single uppercase character – e.g. “Straße” would be converted -- to “STRASSE”, not “STRAßE”. @@ -854,6 +856,29 @@ warn msg = do name <- getProgName UTF8.hPutStrLn stderr $ name ++ ": " ++ msg +-- | Remove intermediate "." and ".." directories from a path. +-- +-- @ +-- collapseFilePath "./foo" == "foo" +-- collapseFilePath "/bar/../baz" == "/baz" +-- collapseFilePath "/../baz" == "/../baz" +-- collapseFilePath "parent/foo/baz/../bar" == "parent/foo/bar" +-- collapseFilePath "parent/foo/baz/../../bar" == "parent/bar" +-- collapseFilePath "parent/foo/.." == "parent" +-- collapseFilePath "/parent/foo/../../bar" == "/bar" +-- @ +collapseFilePath :: FilePath -> FilePath +collapseFilePath = joinPath . reverse . foldl go [] . splitDirectories + where + go rs "." = rs + go r@(p:rs) ".." = case p of + ".." -> ("..":r) + "/" -> ("..":r) + _ -> rs + go _ "/" = ["/"] + go rs x = x:rs + + -- -- Safe read -- -- cgit v1.2.3 From 8c551f6f43772420d5e6db8108e5a7a7786b7fc4 Mon Sep 17 00:00:00 2001 From: Matthew Pickering Date: Fri, 8 Aug 2014 22:10:32 +0100 Subject: EPUB Reader: Use collapseFilePath --- src/Text/Pandoc/Readers/EPUB.hs | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) (limited to 'src/Text') diff --git a/src/Text/Pandoc/Readers/EPUB.hs b/src/Text/Pandoc/Readers/EPUB.hs index 8d6f322bf..c17f00989 100644 --- a/src/Text/Pandoc/Readers/EPUB.hs +++ b/src/Text/Pandoc/Readers/EPUB.hs @@ -15,7 +15,7 @@ import Text.Pandoc.Generic(bottomUp) import Text.Pandoc.Readers.HTML (readHtml) import Text.Pandoc.Options ( ReaderOptions(..), readerExtensions, Extension(..) , readerTrace) -import Text.Pandoc.Shared (escapeURI) +import Text.Pandoc.Shared (escapeURI, collapseFilePath) import Text.Pandoc.MediaBag (MediaBag, insertMedia) import Text.Pandoc.Compat.Except (MonadError, throwError, runExcept, Except) import qualified Text.Pandoc.Builder as B @@ -23,7 +23,7 @@ import Codec.Archive.Zip ( Archive (..), toArchive, fromEntry , findEntryByPath, Entry) import qualified Data.ByteString.Lazy as BL (ByteString) import System.FilePath ( takeFileName, (), dropFileName, normalise - , joinPath, dropFileName, splitDirectories + , dropFileName , splitFileName ) import qualified Text.Pandoc.UTF8 as UTF8 (toStringLazy) import Control.Applicative ((<$>)) @@ -109,20 +109,9 @@ iq _ = [] -- Remove relative paths renameImages :: FilePath -> Inline -> Inline -renameImages root (Image a (url, b)) = Image a (collapse (root url), b) +renameImages root (Image a (url, b)) = Image a (collapseFilePath (root url), b) renameImages _ x = x -collapse :: FilePath -> FilePath -collapse = joinPath . reverse . foldl go [] . splitDirectories - where - go rs "." = rs - go r@(p:rs) ".." = case p of - ".." -> ("..":r) - "/" -> ("..":r) - _ -> rs - go _ "/" = ["/"] - go rs x = x:rs - imageToPandoc :: FilePath -> Pandoc imageToPandoc s = B.doc . B.para $ B.image s "" mempty -- cgit v1.2.3 From 40ae8efddc243f75941328b66fce94e0c0ff3e16 Mon Sep 17 00:00:00 2001 From: Matthew Pickering Date: Fri, 8 Aug 2014 22:28:08 +0100 Subject: EPUB Reader: Fixed regressions in image extraction Before the images were relative to the position of the package file. The collapse function changed this so that they were then absolute in the archive but the fetchImages function wasn't updated to recognise this. --- src/Text/Pandoc/Readers/EPUB.hs | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'src/Text') diff --git a/src/Text/Pandoc/Readers/EPUB.hs b/src/Text/Pandoc/Readers/EPUB.hs index c17f00989..2967bd09a 100644 --- a/src/Text/Pandoc/Readers/EPUB.hs +++ b/src/Text/Pandoc/Readers/EPUB.hs @@ -62,7 +62,7 @@ archiveToEPUB (setEPUBOptions -> os) archive = do foldM' (\a b -> ((a <>) . bottomUp (prependHash escapedSpine)) `liftM` parseSpineElem root b) mempty spine let ast = coverDoc <> (Pandoc meta bs) - let mediaBag = fetchImages (M.elems items) root archive ast + let mediaBag = fetchImages (M.elems items) archive ast return $ (ast, mediaBag) where parseSpineElem :: MonadError String m => FilePath -> (FilePath, MIME) -> m Pandoc @@ -89,19 +89,19 @@ setEPUBOptions os = os'' os' = os {readerExtensions = foldr S.insert rs [Ext_epub_html_exts]} os'' = os' {readerParseRaw = True} +-- paths should be absolute when this function is called +-- renameImages should do this fetchImages :: [(FilePath, MIME)] - -> FilePath -> Archive -> Pandoc -> MediaBag -fetchImages mimes root arc (query iq -> links) = +fetchImages mimes arc (query iq -> links) = foldr (uncurry3 insertMedia) mempty (mapMaybe getEntry links) where - getEntry (normalise -> l) = - let mediaPos = normalise (root l) in - (l , lookup mediaPos mimes, ) . fromEntry - <$> findEntryByPath mediaPos arc + getEntry link = + (link , lookup link mimes, ) . fromEntry + <$> findEntryByPath link arc iq :: Inline -> [FilePath] iq (Image _ (url, _)) = [url] -- cgit v1.2.3 From cfd8c0214c3f369d0f8c6f033325c343b78c7659 Mon Sep 17 00:00:00 2001 From: Matthew Pickering Date: Fri, 8 Aug 2014 23:04:03 +0100 Subject: EPUB Reader: Improved robustness of image extraction We now maintain the invariant that when fetchImages is called, all images have absolute paths. This patch fixes several bugs relating to this as there are three places where images can be introduced. (1) During the HTML parse (2) As spine elements (3) As a cover image For (1), the paths are corrected by the transformation renameImages For (2) and (3), we need to append the "root" to the path we parse from the spine --- src/Text/Pandoc/Readers/EPUB.hs | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) (limited to 'src/Text') diff --git a/src/Text/Pandoc/Readers/EPUB.hs b/src/Text/Pandoc/Readers/EPUB.hs index 2967bd09a..8073f9ad2 100644 --- a/src/Text/Pandoc/Readers/EPUB.hs +++ b/src/Text/Pandoc/Readers/EPUB.hs @@ -52,10 +52,12 @@ runEPUB = either error id . runExcept -- archiveToEPUB :: (MonadError String m) => ReaderOptions -> Archive -> m (Pandoc, MediaBag) archiveToEPUB (setEPUBOptions -> os) archive = do + -- root is path to folder with manifest file in (root, content) <- getManifest archive meta <- parseMeta content (cover, items) <- parseManifest content - let coverDoc = fromMaybe mempty (imageToPandoc <$> cover) + -- No need to collapse here as the image path is from the manifest file + let coverDoc = fromMaybe mempty (imageToPandoc . (root ) <$> cover) spine <- parseSpine items content let escapedSpine = map (escapeURI . takeFileName . fst) spine Pandoc _ bs <- @@ -68,17 +70,17 @@ archiveToEPUB (setEPUBOptions -> os) archive = do parseSpineElem :: MonadError String m => FilePath -> (FilePath, MIME) -> m Pandoc parseSpineElem (normalise -> r) (normalise -> path, mime) = do when (readerTrace os) (traceM path) - doc <- mimeToReader mime r path + doc <- mimeToReader mime (r path) let docSpan = B.doc $ B.para $ B.spanWith (takeFileName path, [], []) mempty return $ docSpan <> doc - mimeToReader :: MonadError String m => MIME -> FilePath -> FilePath -> m Pandoc - mimeToReader "application/xhtml+xml" r path = do - fname <- findEntryByPathE (r path) archive - return $ fixInternalReferences (r path) . + mimeToReader :: MonadError String m => MIME -> FilePath -> m Pandoc + mimeToReader "application/xhtml+xml" (normalise -> path) = do + fname <- findEntryByPathE path archive + return $ fixInternalReferences path . readHtml os . UTF8.toStringLazy $ fromEntry fname - mimeToReader s _ path + mimeToReader s path | s `elem` imageMimes = return $ imageToPandoc path | otherwise = return $ mempty -- cgit v1.2.3