From 089745af61638bc1e202ac35494059f3157910af Mon Sep 17 00:00:00 2001 From: Matthew Pickering Date: Wed, 30 Jul 2014 00:47:54 +0100 Subject: EPUB Reader: Added EPUB reader --- src/Text/Pandoc/Readers/EPUB.hs | 248 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 248 insertions(+) create mode 100644 src/Text/Pandoc/Readers/EPUB.hs (limited to 'src/Text/Pandoc/Readers/EPUB.hs') diff --git a/src/Text/Pandoc/Readers/EPUB.hs b/src/Text/Pandoc/Readers/EPUB.hs new file mode 100644 index 000000000..f926a5864 --- /dev/null +++ b/src/Text/Pandoc/Readers/EPUB.hs @@ -0,0 +1,248 @@ +{-# LANGUAGE + ViewPatterns + , StandaloneDeriving + , FlexibleContexts #-} + +module Text.Pandoc.Readers.EPUB + (readEPUB) + where + +import Text.XML.Light +import Text.Pandoc.Definition hiding (Attr) +import Text.Pandoc.Walk (walk) +import Text.Pandoc.Generic(bottomUp) +import Text.Pandoc.Readers.HTML (readHtml) +import Text.Pandoc.Options (ReaderOptions(..), readerExtensions, Extension(..) ) +import Text.Pandoc.Shared (escapeURI) +import Text.Pandoc.Compat.Except (MonadError, throwError, runExcept, Except) +import qualified Text.Pandoc.Builder as B +import Codec.Archive.Zip ( Archive (..), toArchive, fromEntry + , findEntryByPath, Entry) +import qualified Data.ByteString.Lazy as BL (ByteString) +import System.FilePath (takeFileName, (), dropFileName) +import qualified Text.Pandoc.UTF8 as UTF8 (toStringLazy) +import Control.Applicative ((<$>)) +import Control.Monad (guard, liftM) +import Data.Monoid (mempty, (<>)) +import Data.List (isPrefixOf, isInfixOf) +import Data.Maybe (mapMaybe, fromMaybe) +import qualified Data.Map as M (Map, lookup, fromList) +import qualified Data.Set as S (insert) +import Control.DeepSeq.Generics (deepseq, NFData) + +type MIME = String + +type Items = M.Map String (FilePath, MIME) + +readEPUB :: ReaderOptions -> BL.ByteString -> Pandoc +readEPUB opts bytes = runEPUB (archiveToEPUB opts $ toArchive bytes) + +runEPUB :: Except String a -> a +runEPUB = either error id . runExcept + +-- Note that internal reference are aggresively normalised so that all ids +-- are of the form "filename#id" +-- +-- For now all paths are stripped from images +archiveToEPUB :: (MonadError String m) => ReaderOptions -> Archive -> m Pandoc +archiveToEPUB os archive = do + (root, content) <- getManifest archive + meta <- parseMeta content + (cover, items) <- parseManifest content + let coverDoc = fromMaybe mempty (imageToPandoc . takeFileName<$> cover) + spine <- parseSpine items content + let escapedSpine = map (escapeURI . takeFileName . fst) spine + --traceShow escapedSpine (return ()) + Pandoc _ bs <- + foldM' (\a b -> ((a <>) . bottomUp (prependHash escapedSpine)) + `liftM` parseSpineElem root b) mempty spine + return $ coverDoc <> (Pandoc meta bs) + where + rs = readerExtensions os + os' = os {readerExtensions = foldr S.insert rs [Ext_epub_html_exts, Ext_raw_html]} + os'' = os' {readerParseRaw = True} + parseSpineElem :: MonadError String m => FilePath -> (FilePath, MIME) -> m Pandoc + parseSpineElem r (path, mime) = do + --traceShow path (return ()) + doc <- mimeToReader mime (if r /= "./" then r path else path) + let docSpan = B.doc $ B.para $ B.spanWith (takeFileName path, [], []) mempty + return $ docSpan <> fixInternalReferences (takeFileName path) doc + mimeToReader :: MonadError String m => MIME -> FilePath -> m Pandoc + mimeToReader "application/xhtml+xml" path = do + fname <- findEntryByPathE path archive + return $ readHtml os'' . UTF8.toStringLazy $ fromEntry fname + mimeToReader s path + | s `elem` imageMimes = return $ imageToPandoc path + | otherwise = return $ mempty + + +imageToPandoc :: FilePath -> Pandoc +imageToPandoc s = B.doc . B.para $ B.image s "" mempty + +imageMimes :: [String] +imageMimes = ["image/gif", "image/jpeg", "image/png"] + +type CoverImage = FilePath + +parseManifest :: (MonadError String m) => Element -> m (Maybe CoverImage, Items) +parseManifest content = do + manifest <- findElementE (dfName "manifest") content + let items = findChildren (dfName "item") manifest + r <- mapM parseItem items + let cover = findAttr (emptyName "href") =<< filterChild findCover manifest + return (cover, (M.fromList r)) + where + findCover e = maybe False (isInfixOf "cover-image") + (findAttr (emptyName "properties") e) + parseItem e = do + uid <- findAttrE (emptyName "id") e + href <- findAttrE (emptyName "href") e + mime <- findAttrE (emptyName "media-type") e + return (uid, (href, mime)) + +parseSpine :: MonadError String m => Items -> Element -> m [(FilePath, MIME)] +parseSpine is e = do + spine <- findElementE (dfName "spine") e + let itemRefs = findChildren (dfName "itemref") spine + mapM (mkE "parseSpine" . (flip M.lookup is)) $ mapMaybe parseItemRef itemRefs + where + parseItemRef ref = do + let linear = maybe True (== "yes") (findAttr (emptyName "linear") ref) + guard linear + findAttr (emptyName "idref") ref + +parseMeta :: MonadError String m => Element -> m Meta +parseMeta content = do + meta <- findElementE (dfName "metadata") content + let dcspace (QName _ (Just "http://purl.org/dc/elements/1.1/") (Just "dc")) = True + dcspace _ = False + let dcs = filterChildrenName dcspace meta + let r = foldr parseMetaItem nullMeta dcs + return r + +-- http://www.idpf.org/epub/30/spec/epub30-publications.html#sec-metadata-elem +parseMetaItem :: Element -> Meta -> Meta +parseMetaItem e@(stripNamespace . elName -> field) meta = + B.setMeta (renameMeta field) (B.str $ strContent e) meta + +renameMeta :: String -> String +renameMeta "creator" = "author" +renameMeta s = s + +getManifest :: MonadError String m => Archive -> m (String, Element) +getManifest archive = do + metaEntry <- findEntryByPathE ("META-INF" "container.xml") archive + docElem <- (parseXMLDocE . UTF8.toStringLazy . fromEntry) metaEntry + let namespaces = mapMaybe attrToNSPair (elAttribs docElem) + ns <- mkE "xmlns not in namespaces" (lookup "xmlns" namespaces) + as <- liftM ((map attrToPair) . elAttribs) + (findElementE (QName "rootfile" (Just ns) Nothing) docElem) + root <- mkE "Root not found" (lookup "full-path" as) + let rootdir = dropFileName root + --mime <- lookup "media-type" as + manifest <- findEntryByPathE root archive + liftM ((,) rootdir) (parseXMLDocE . UTF8.toStringLazy . fromEntry $ manifest) + +-- Fixup + +fixInternalReferences :: String -> Pandoc -> Pandoc +fixInternalReferences s = + (walk normalisePath) . (walk stripImage) . (walk $ fixBlockIRs s') . (walk $ fixInlineIRs s') + where + s' = escapeURI s + +fixInlineIRs :: String -> Inline -> Inline +fixInlineIRs s (Span as v) = + Span (fixAttrs s as) v +fixInlineIRs s (Code as code) = + Code (fixAttrs s as) code +fixInlineIRs s (Link t ('#':url, tit)) = + Link t (addHash s url, tit) +fixInlineIRs _ v = v + +normalisePath :: Inline -> Inline +normalisePath (Link t (url, tit)) = + let (path, uid) = span (/= '#') url in + Link t (takeFileName path ++ uid, tit) +normalisePath s = s + +prependHash :: [String] -> Inline -> Inline +prependHash ps l@(Link is (url, tit)) + | or [s `isPrefixOf` url | s <- ps] = + Link is ('#':url, tit) + | otherwise = l +prependHash _ i = i + +fixBlockIRs :: String -> Block -> Block +fixBlockIRs s (Div as b) = + Div (fixAttrs s as) b +fixBlockIRs s (Header i as b) = + Header i (fixAttrs s as) b +fixBlockIRs s (CodeBlock as code) = + CodeBlock (fixAttrs s as) code +fixBlockIRs _ b = b + +fixAttrs :: FilePath -> B.Attr -> B.Attr +fixAttrs s (ident, cs, kvs) = (addHash s ident, filter (not . null) cs, removeEPUBAttrs kvs) + +addHash :: String -> String -> String +addHash _ "" = "" +addHash s ident = s ++ "#" ++ ident + +removeEPUBAttrs :: [(String, String)] -> [(String, String)] +removeEPUBAttrs kvs = filter (not . isEPUBAttr) kvs + +isEPUBAttr :: (String, String) -> Bool +isEPUBAttr (k, _) = "epub:" `isPrefixOf` k + +-- Remove relative paths +stripImage :: Inline -> Inline +stripImage (Image alt (url, tit)) = Image alt (takeFileName url, tit) +stripImage i = i + + +-- Utility + +-- Strict version of foldM +foldM' :: (Monad m, NFData a) => (a -> b -> m a) -> a -> [b] -> m a +foldM' _ z [] = return z +foldM' f z (x:xs) = do + z' <- f z x + z' `deepseq` foldM' f z' xs + +stripNamespace :: QName -> String +stripNamespace (QName v _ _) = v + +attrToNSPair :: Attr -> Maybe (String, String) +attrToNSPair (Attr (QName "xmlns" _ _) val) = Just ("xmlns", val) +attrToNSPair _ = Nothing + +attrToPair :: Attr -> (String, String) +attrToPair (Attr (QName name _ _) val) = (name, val) + +defaultNameSpace :: Maybe String +defaultNameSpace = Just "http://www.idpf.org/2007/opf" + +dfName :: String -> QName +dfName s = QName s defaultNameSpace Nothing + +emptyName :: String -> QName +emptyName s = QName s Nothing Nothing + +-- Convert Maybe interface to Either + +findAttrE :: MonadError String m => QName -> Element -> m String +findAttrE q e = mkE "findAttr" $ findAttr q e + +findEntryByPathE :: MonadError String m => FilePath -> Archive -> m Entry +findEntryByPathE path a = mkE ("No entry on path: " ++ path) $ findEntryByPath path a + +parseXMLDocE :: MonadError String m => String -> m Element +parseXMLDocE doc = mkE "Unable to parse XML doc" $ parseXMLDoc doc + +findElementE :: MonadError String m => QName -> Element -> m Element +findElementE e x = mkE ("Unable to find element: " ++ show e) $ findElement e x + +mkE :: MonadError String m => String -> Maybe a -> m a +mkE s = maybe (throwError s) return + -- cgit v1.2.3 From cd9a5d90cbf93925db5bb9e9060ef40d05b4bfc8 Mon Sep 17 00:00:00 2001 From: Matthew Pickering Date: Thu, 31 Jul 2014 15:35:40 +0100 Subject: EPUB Reader: Now uses the new MediaBag for images --- src/Text/Pandoc/Readers/EPUB.hs | 65 ++++++++++++++++++++++++++++------------- 1 file changed, 45 insertions(+), 20 deletions(-) (limited to 'src/Text/Pandoc/Readers/EPUB.hs') diff --git a/src/Text/Pandoc/Readers/EPUB.hs b/src/Text/Pandoc/Readers/EPUB.hs index f926a5864..ca65a8f0f 100644 --- a/src/Text/Pandoc/Readers/EPUB.hs +++ b/src/Text/Pandoc/Readers/EPUB.hs @@ -1,6 +1,7 @@ {-# LANGUAGE ViewPatterns , StandaloneDeriving + , TupleSections , FlexibleContexts #-} module Text.Pandoc.Readers.EPUB @@ -9,32 +10,36 @@ module Text.Pandoc.Readers.EPUB import Text.XML.Light import Text.Pandoc.Definition hiding (Attr) -import Text.Pandoc.Walk (walk) +import Text.Pandoc.Walk (walk, query) import Text.Pandoc.Generic(bottomUp) import Text.Pandoc.Readers.HTML (readHtml) -import Text.Pandoc.Options (ReaderOptions(..), readerExtensions, Extension(..) ) +import Text.Pandoc.Options ( ReaderOptions(..), readerExtensions, Extension(..) + , readerTrace) import Text.Pandoc.Shared (escapeURI) +import Text.Pandoc.MediaBag (MediaBag, insertMedia) import Text.Pandoc.Compat.Except (MonadError, throwError, runExcept, Except) import qualified Text.Pandoc.Builder as B import Codec.Archive.Zip ( Archive (..), toArchive, fromEntry , findEntryByPath, Entry) import qualified Data.ByteString.Lazy as BL (ByteString) -import System.FilePath (takeFileName, (), dropFileName) +import System.FilePath (takeFileName, (), dropFileName, normalise) import qualified Text.Pandoc.UTF8 as UTF8 (toStringLazy) import Control.Applicative ((<$>)) -import Control.Monad (guard, liftM) +import Control.Monad (guard, liftM, when) import Data.Monoid (mempty, (<>)) import Data.List (isPrefixOf, isInfixOf) import Data.Maybe (mapMaybe, fromMaybe) -import qualified Data.Map as M (Map, lookup, fromList) +import qualified Data.Map as M (Map, lookup, fromList, elems) import qualified Data.Set as S (insert) import Control.DeepSeq.Generics (deepseq, NFData) +import Debug.Trace (trace) + type MIME = String type Items = M.Map String (FilePath, MIME) -readEPUB :: ReaderOptions -> BL.ByteString -> Pandoc +readEPUB :: ReaderOptions -> BL.ByteString -> (Pandoc, MediaBag) readEPUB opts bytes = runEPUB (archiveToEPUB opts $ toArchive bytes) runEPUB :: Except String a -> a @@ -44,27 +49,28 @@ runEPUB = either error id . runExcept -- are of the form "filename#id" -- -- For now all paths are stripped from images -archiveToEPUB :: (MonadError String m) => ReaderOptions -> Archive -> m Pandoc +archiveToEPUB :: (MonadError String m) => ReaderOptions -> Archive -> m (Pandoc, MediaBag) archiveToEPUB os archive = do (root, content) <- getManifest archive meta <- parseMeta content (cover, items) <- parseManifest content - let coverDoc = fromMaybe mempty (imageToPandoc . takeFileName<$> cover) + let coverDoc = fromMaybe mempty (imageToPandoc . takeFileName <$> cover) spine <- parseSpine items content let escapedSpine = map (escapeURI . takeFileName . fst) spine - --traceShow escapedSpine (return ()) Pandoc _ bs <- foldM' (\a b -> ((a <>) . bottomUp (prependHash escapedSpine)) `liftM` parseSpineElem root b) mempty spine - return $ coverDoc <> (Pandoc meta bs) + let ast = coverDoc <> (Pandoc meta bs) + let mediaBag = fetchImages (M.elems items) root archive ast + return $ (ast, mediaBag) where rs = readerExtensions os os' = os {readerExtensions = foldr S.insert rs [Ext_epub_html_exts, Ext_raw_html]} os'' = os' {readerParseRaw = True} parseSpineElem :: MonadError String m => FilePath -> (FilePath, MIME) -> m Pandoc parseSpineElem r (path, mime) = do - --traceShow path (return ()) - doc <- mimeToReader mime (if r /= "./" then r path else path) + when (readerTrace os) (traceM path) + doc <- mimeToReader mime (normalise (r path)) let docSpan = B.doc $ B.para $ B.spanWith (takeFileName path, [], []) mempty return $ docSpan <> fixInternalReferences (takeFileName path) doc mimeToReader :: MonadError String m => MIME -> FilePath -> m Pandoc @@ -75,6 +81,23 @@ archiveToEPUB os archive = do | s `elem` imageMimes = return $ imageToPandoc path | otherwise = return $ mempty +fetchImages :: [(FilePath, MIME)] + -> FilePath + -> Archive + -> Pandoc + -> MediaBag +fetchImages mimes root a (query iq -> links) = + foldr (uncurry3 insertMedia) mempty + (mapMaybe getEntry links) + where + getEntry l = let mediaPos = normalise (root l) in + (l , lookup mediaPos mimes, ) . fromEntry + <$> findEntryByPath mediaPos a + +iq :: Inline -> [FilePath] +iq (Image _ (url, _)) = [url] +iq _ = [] + imageToPandoc :: FilePath -> Pandoc imageToPandoc s = B.doc . B.para $ B.image s "" mempty @@ -147,7 +170,7 @@ getManifest archive = do fixInternalReferences :: String -> Pandoc -> Pandoc fixInternalReferences s = - (walk normalisePath) . (walk stripImage) . (walk $ fixBlockIRs s') . (walk $ fixInlineIRs s') + (walk normalisePath) . (walk $ fixBlockIRs s') . (walk $ fixInlineIRs s') where s' = escapeURI s @@ -195,13 +218,7 @@ removeEPUBAttrs kvs = filter (not . isEPUBAttr) kvs isEPUBAttr :: (String, String) -> Bool isEPUBAttr (k, _) = "epub:" `isPrefixOf` k --- Remove relative paths -stripImage :: Inline -> Inline -stripImage (Image alt (url, tit)) = Image alt (takeFileName url, tit) -stripImage i = i - - --- Utility +-- Library -- Strict version of foldM foldM' :: (Monad m, NFData a) => (a -> b -> m a) -> a -> [b] -> m a @@ -210,6 +227,14 @@ foldM' f z (x:xs) = do z' <- f z x z' `deepseq` foldM' f z' xs +traceM :: Monad m => String -> m () +traceM = flip trace (return ()) + +uncurry3 :: (a -> b -> c -> d) -> (a, b, c) -> d +uncurry3 f (a, b, c) = f a b c + +-- Utility + stripNamespace :: QName -> String stripNamespace (QName v _ _) = v -- cgit v1.2.3