aboutsummaryrefslogtreecommitdiff
path: root/src/Text/Pandoc/Readers/EPUB.hs
diff options
context:
space:
mode:
Diffstat (limited to 'src/Text/Pandoc/Readers/EPUB.hs')
-rw-r--r--src/Text/Pandoc/Readers/EPUB.hs277
1 files changed, 277 insertions, 0 deletions
diff --git a/src/Text/Pandoc/Readers/EPUB.hs b/src/Text/Pandoc/Readers/EPUB.hs
new file mode 100644
index 000000000..69edb9761
--- /dev/null
+++ b/src/Text/Pandoc/Readers/EPUB.hs
@@ -0,0 +1,277 @@
+{-# LANGUAGE
+ ViewPatterns
+ , StandaloneDeriving
+ , TupleSections
+ , FlexibleContexts #-}
+
+module Text.Pandoc.Readers.EPUB
+ (readEPUB)
+ where
+
+import Text.XML.Light
+import Text.Pandoc.Definition hiding (Attr)
+import Text.Pandoc.Readers.HTML (readHtml)
+import Text.Pandoc.Walk (walk, query)
+import Text.Pandoc.Options ( ReaderOptions(..))
+import Text.Pandoc.Extensions (enableExtension, Extension(Ext_raw_html))
+import Text.Pandoc.Shared (escapeURI, collapseFilePath, addMetaField)
+import Network.URI (unEscapeString)
+import Control.Monad.Except (throwError)
+import Text.Pandoc.MIME (MimeType)
+import qualified Text.Pandoc.Builder as B
+import Codec.Archive.Zip ( Archive (..), toArchiveOrFail, fromEntry
+ , findEntryByPath, Entry)
+import qualified Data.ByteString.Lazy as BL (ByteString)
+import System.FilePath ( takeFileName, (</>), dropFileName, normalise
+ , dropFileName
+ , splitFileName )
+import qualified Text.Pandoc.UTF8 as UTF8 (toStringLazy)
+import Control.Monad (guard, liftM)
+import Data.List (isPrefixOf, isInfixOf)
+import Data.Maybe (mapMaybe, fromMaybe)
+import qualified Data.Map as M (Map, lookup, fromList, elems)
+import Data.Monoid ((<>))
+import Control.DeepSeq (deepseq, NFData)
+import Text.Pandoc.Error
+import Text.Pandoc.Class (PandocMonad, insertMedia)
+
+type Items = M.Map String (FilePath, MimeType)
+
+readEPUB :: PandocMonad m => ReaderOptions -> BL.ByteString -> m Pandoc
+readEPUB opts bytes = case toArchiveOrFail bytes of
+ Right archive -> archiveToEPUB opts $ archive
+ Left _ -> throwError $ PandocParseError "Couldn't extract ePub file"
+
+-- runEPUB :: Except PandocError a -> Either PandocError a
+-- runEPUB = runExcept
+
+-- Note that internal reference are aggresively normalised so that all ids
+-- are of the form "filename#id"
+--
+archiveToEPUB :: (PandocMonad m) => ReaderOptions -> Archive -> m Pandoc
+archiveToEPUB os archive = do
+ -- root is path to folder with manifest file in
+ (root, content) <- getManifest archive
+ meta <- parseMeta content
+ (cover, items) <- parseManifest content
+ -- No need to collapse here as the image path is from the manifest file
+ let coverDoc = fromMaybe mempty (imageToPandoc <$> cover)
+ spine <- parseSpine items content
+ let escapedSpine = map (escapeURI . takeFileName . fst) spine
+ Pandoc _ bs <-
+ foldM' (\a b -> ((a <>) . walk (prependHash escapedSpine))
+ `liftM` parseSpineElem root b) mempty spine
+ let ast = coverDoc <> (Pandoc meta bs)
+ fetchImages (M.elems items) root archive ast
+ return ast
+ where
+ os' = os {readerExtensions = enableExtension Ext_raw_html (readerExtensions os)}
+ parseSpineElem :: PandocMonad m => FilePath -> (FilePath, MimeType) -> m Pandoc
+ parseSpineElem (normalise -> r) (normalise -> path, mime) = do
+ doc <- mimeToReader mime r path
+ let docSpan = B.doc $ B.para $ B.spanWith (takeFileName path, [], []) mempty
+ return $ docSpan <> doc
+ mimeToReader :: PandocMonad m => MimeType -> FilePath -> FilePath -> m Pandoc
+ mimeToReader "application/xhtml+xml" (unEscapeString -> root)
+ (unEscapeString -> path) = do
+ fname <- findEntryByPathE (root </> path) archive
+ html <- readHtml os' . UTF8.toStringLazy $ fromEntry fname
+ return $ fixInternalReferences path html
+ mimeToReader s _ (unEscapeString -> path)
+ | s `elem` imageMimes = return $ imageToPandoc path
+ | otherwise = return $ mempty
+
+-- paths should be absolute when this function is called
+-- renameImages should do this
+fetchImages :: PandocMonad m
+ => [(FilePath, MimeType)]
+ -> FilePath -- ^ Root
+ -> Archive
+ -> Pandoc
+ -> m ()
+fetchImages mimes root arc (query iq -> links) =
+ mapM_ (uncurry3 insertMedia) (mapMaybe getEntry links)
+ where
+ getEntry link =
+ let abslink = normalise (root </> link) in
+ (link , lookup link mimes, ) . fromEntry
+ <$> findEntryByPath abslink arc
+
+iq :: Inline -> [FilePath]
+iq (Image _ _ (url, _)) = [url]
+iq _ = []
+
+-- Remove relative paths
+renameImages :: FilePath -> Inline -> Inline
+renameImages root img@(Image attr a (url, b))
+ | "data:" `isPrefixOf` url = img
+ | otherwise = Image attr a (collapseFilePath (root </> url), b)
+renameImages _ x = x
+
+imageToPandoc :: FilePath -> Pandoc
+imageToPandoc s = B.doc . B.para $ B.image s "" mempty
+
+imageMimes :: [MimeType]
+imageMimes = ["image/gif", "image/jpeg", "image/png"]
+
+type CoverImage = FilePath
+
+parseManifest :: (PandocMonad m) => Element -> m (Maybe CoverImage, Items)
+parseManifest content = do
+ manifest <- findElementE (dfName "manifest") content
+ let items = findChildren (dfName "item") manifest
+ r <- mapM parseItem items
+ let cover = findAttr (emptyName "href") =<< filterChild findCover manifest
+ return (cover, (M.fromList r))
+ where
+ findCover e = maybe False (isInfixOf "cover-image")
+ (findAttr (emptyName "properties") e)
+ parseItem e = do
+ uid <- findAttrE (emptyName "id") e
+ href <- findAttrE (emptyName "href") e
+ mime <- findAttrE (emptyName "media-type") e
+ return (uid, (href, mime))
+
+parseSpine :: PandocMonad m => Items -> Element -> m [(FilePath, MimeType)]
+parseSpine is e = do
+ spine <- findElementE (dfName "spine") e
+ let itemRefs = findChildren (dfName "itemref") spine
+ mapM (mkE "parseSpine" . (flip M.lookup is)) $ mapMaybe parseItemRef itemRefs
+ where
+ parseItemRef ref = do
+ let linear = maybe True (== "yes") (findAttr (emptyName "linear") ref)
+ guard linear
+ findAttr (emptyName "idref") ref
+
+parseMeta :: PandocMonad m => Element -> m Meta
+parseMeta content = do
+ meta <- findElementE (dfName "metadata") content
+ let dcspace (QName _ (Just "http://purl.org/dc/elements/1.1/") (Just "dc")) = True
+ dcspace _ = False
+ let dcs = filterChildrenName dcspace meta
+ let r = foldr parseMetaItem nullMeta dcs
+ return r
+
+-- http://www.idpf.org/epub/30/spec/epub30-publications.html#sec-metadata-elem
+parseMetaItem :: Element -> Meta -> Meta
+parseMetaItem e@(stripNamespace . elName -> field) meta =
+ addMetaField (renameMeta field) (B.str $ strContent e) meta
+
+renameMeta :: String -> String
+renameMeta "creator" = "author"
+renameMeta s = s
+
+getManifest :: PandocMonad m => Archive -> m (String, Element)
+getManifest archive = do
+ metaEntry <- findEntryByPathE ("META-INF" </> "container.xml") archive
+ docElem <- (parseXMLDocE . UTF8.toStringLazy . fromEntry) metaEntry
+ let namespaces = mapMaybe attrToNSPair (elAttribs docElem)
+ ns <- mkE "xmlns not in namespaces" (lookup "xmlns" namespaces)
+ as <- liftM ((map attrToPair) . elAttribs)
+ (findElementE (QName "rootfile" (Just ns) Nothing) docElem)
+ manifestFile <- mkE "Root not found" (lookup "full-path" as)
+ let rootdir = dropFileName manifestFile
+ --mime <- lookup "media-type" as
+ manifest <- findEntryByPathE manifestFile archive
+ liftM ((,) rootdir) (parseXMLDocE . UTF8.toStringLazy . fromEntry $ manifest)
+
+-- Fixup
+
+fixInternalReferences :: FilePath -> Pandoc -> Pandoc
+fixInternalReferences pathToFile =
+ (walk $ renameImages root)
+ . (walk $ fixBlockIRs filename)
+ . (walk $ fixInlineIRs filename)
+ where
+ (root, escapeURI -> filename) = splitFileName pathToFile
+
+fixInlineIRs :: String -> Inline -> Inline
+fixInlineIRs s (Span as v) =
+ Span (fixAttrs s as) v
+fixInlineIRs s (Code as code) =
+ Code (fixAttrs s as) code
+fixInlineIRs s (Link as is ('#':url, tit)) =
+ Link (fixAttrs s as) is (addHash s url, tit)
+fixInlineIRs s (Link as is t) =
+ Link (fixAttrs s as) is t
+fixInlineIRs _ v = v
+
+prependHash :: [String] -> Inline -> Inline
+prependHash ps l@(Link attr is (url, tit))
+ | or [s `isPrefixOf` url | s <- ps] =
+ Link attr is ('#':url, tit)
+ | otherwise = l
+prependHash _ i = i
+
+fixBlockIRs :: String -> Block -> Block
+fixBlockIRs s (Div as b) =
+ Div (fixAttrs s as) b
+fixBlockIRs s (Header i as b) =
+ Header i (fixAttrs s as) b
+fixBlockIRs s (CodeBlock as code) =
+ CodeBlock (fixAttrs s as) code
+fixBlockIRs _ b = b
+
+fixAttrs :: FilePath -> B.Attr -> B.Attr
+fixAttrs s (ident, cs, kvs) = (addHash s ident, filter (not . null) cs, removeEPUBAttrs kvs)
+
+addHash :: String -> String -> String
+addHash _ "" = ""
+addHash s ident = takeFileName s ++ "#" ++ ident
+
+removeEPUBAttrs :: [(String, String)] -> [(String, String)]
+removeEPUBAttrs kvs = filter (not . isEPUBAttr) kvs
+
+isEPUBAttr :: (String, String) -> Bool
+isEPUBAttr (k, _) = "epub:" `isPrefixOf` k
+
+-- Library
+
+-- Strict version of foldM
+foldM' :: (Monad m, NFData a) => (a -> b -> m a) -> a -> [b] -> m a
+foldM' _ z [] = return z
+foldM' f z (x:xs) = do
+ z' <- f z x
+ z' `deepseq` foldM' f z' xs
+
+uncurry3 :: (a -> b -> c -> d) -> (a, b, c) -> d
+uncurry3 f (a, b, c) = f a b c
+
+-- Utility
+
+stripNamespace :: QName -> String
+stripNamespace (QName v _ _) = v
+
+attrToNSPair :: Attr -> Maybe (String, String)
+attrToNSPair (Attr (QName "xmlns" _ _) val) = Just ("xmlns", val)
+attrToNSPair _ = Nothing
+
+attrToPair :: Attr -> (String, String)
+attrToPair (Attr (QName name _ _) val) = (name, val)
+
+defaultNameSpace :: Maybe String
+defaultNameSpace = Just "http://www.idpf.org/2007/opf"
+
+dfName :: String -> QName
+dfName s = QName s defaultNameSpace Nothing
+
+emptyName :: String -> QName
+emptyName s = QName s Nothing Nothing
+
+-- Convert Maybe interface to Either
+
+findAttrE :: PandocMonad m => QName -> Element -> m String
+findAttrE q e = mkE "findAttr" $ findAttr q e
+
+findEntryByPathE :: PandocMonad m => FilePath -> Archive -> m Entry
+findEntryByPathE (normalise -> path) a =
+ mkE ("No entry on path: " ++ path) $ findEntryByPath path a
+
+parseXMLDocE :: PandocMonad m => String -> m Element
+parseXMLDocE doc = mkE "Unable to parse XML doc" $ parseXMLDoc doc
+
+findElementE :: PandocMonad m => QName -> Element -> m Element
+findElementE e x = mkE ("Unable to find element: " ++ show e) $ findElement e x
+
+mkE :: PandocMonad m => String -> Maybe a -> m a
+mkE s = maybe (throwError . PandocParseError $ s) return