From 089745af61638bc1e202ac35494059f3157910af Mon Sep 17 00:00:00 2001 From: Matthew Pickering Date: Wed, 30 Jul 2014 00:47:54 +0100 Subject: EPUB Reader: Added EPUB reader --- src/Text/Pandoc/Readers/EPUB.hs | 248 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 248 insertions(+) create mode 100644 src/Text/Pandoc/Readers/EPUB.hs (limited to 'src/Text') diff --git a/src/Text/Pandoc/Readers/EPUB.hs b/src/Text/Pandoc/Readers/EPUB.hs new file mode 100644 index 000000000..f926a5864 --- /dev/null +++ b/src/Text/Pandoc/Readers/EPUB.hs @@ -0,0 +1,248 @@ +{-# LANGUAGE + ViewPatterns + , StandaloneDeriving + , FlexibleContexts #-} + +module Text.Pandoc.Readers.EPUB + (readEPUB) + where + +import Text.XML.Light +import Text.Pandoc.Definition hiding (Attr) +import Text.Pandoc.Walk (walk) +import Text.Pandoc.Generic(bottomUp) +import Text.Pandoc.Readers.HTML (readHtml) +import Text.Pandoc.Options (ReaderOptions(..), readerExtensions, Extension(..) ) +import Text.Pandoc.Shared (escapeURI) +import Text.Pandoc.Compat.Except (MonadError, throwError, runExcept, Except) +import qualified Text.Pandoc.Builder as B +import Codec.Archive.Zip ( Archive (..), toArchive, fromEntry + , findEntryByPath, Entry) +import qualified Data.ByteString.Lazy as BL (ByteString) +import System.FilePath (takeFileName, (), dropFileName) +import qualified Text.Pandoc.UTF8 as UTF8 (toStringLazy) +import Control.Applicative ((<$>)) +import Control.Monad (guard, liftM) +import Data.Monoid (mempty, (<>)) +import Data.List (isPrefixOf, isInfixOf) +import Data.Maybe (mapMaybe, fromMaybe) +import qualified Data.Map as M (Map, lookup, fromList) +import qualified Data.Set as S (insert) +import Control.DeepSeq.Generics (deepseq, NFData) + +type MIME = String + +type Items = M.Map String (FilePath, MIME) + +readEPUB :: ReaderOptions -> BL.ByteString -> Pandoc +readEPUB opts bytes = runEPUB (archiveToEPUB opts $ toArchive bytes) + +runEPUB :: Except String a -> a +runEPUB = either error id . runExcept + +-- Note that internal reference are aggresively normalised so that all ids +-- are of the form "filename#id" +-- +-- For now all paths are stripped from images +archiveToEPUB :: (MonadError String m) => ReaderOptions -> Archive -> m Pandoc +archiveToEPUB os archive = do + (root, content) <- getManifest archive + meta <- parseMeta content + (cover, items) <- parseManifest content + let coverDoc = fromMaybe mempty (imageToPandoc . takeFileName<$> cover) + spine <- parseSpine items content + let escapedSpine = map (escapeURI . takeFileName . fst) spine + --traceShow escapedSpine (return ()) + Pandoc _ bs <- + foldM' (\a b -> ((a <>) . bottomUp (prependHash escapedSpine)) + `liftM` parseSpineElem root b) mempty spine + return $ coverDoc <> (Pandoc meta bs) + where + rs = readerExtensions os + os' = os {readerExtensions = foldr S.insert rs [Ext_epub_html_exts, Ext_raw_html]} + os'' = os' {readerParseRaw = True} + parseSpineElem :: MonadError String m => FilePath -> (FilePath, MIME) -> m Pandoc + parseSpineElem r (path, mime) = do + --traceShow path (return ()) + doc <- mimeToReader mime (if r /= "./" then r path else path) + let docSpan = B.doc $ B.para $ B.spanWith (takeFileName path, [], []) mempty + return $ docSpan <> fixInternalReferences (takeFileName path) doc + mimeToReader :: MonadError String m => MIME -> FilePath -> m Pandoc + mimeToReader "application/xhtml+xml" path = do + fname <- findEntryByPathE path archive + return $ readHtml os'' . UTF8.toStringLazy $ fromEntry fname + mimeToReader s path + | s `elem` imageMimes = return $ imageToPandoc path + | otherwise = return $ mempty + + +imageToPandoc :: FilePath -> Pandoc +imageToPandoc s = B.doc . B.para $ B.image s "" mempty + +imageMimes :: [String] +imageMimes = ["image/gif", "image/jpeg", "image/png"] + +type CoverImage = FilePath + +parseManifest :: (MonadError String m) => Element -> m (Maybe CoverImage, Items) +parseManifest content = do + manifest <- findElementE (dfName "manifest") content + let items = findChildren (dfName "item") manifest + r <- mapM parseItem items + let cover = findAttr (emptyName "href") =<< filterChild findCover manifest + return (cover, (M.fromList r)) + where + findCover e = maybe False (isInfixOf "cover-image") + (findAttr (emptyName "properties") e) + parseItem e = do + uid <- findAttrE (emptyName "id") e + href <- findAttrE (emptyName "href") e + mime <- findAttrE (emptyName "media-type") e + return (uid, (href, mime)) + +parseSpine :: MonadError String m => Items -> Element -> m [(FilePath, MIME)] +parseSpine is e = do + spine <- findElementE (dfName "spine") e + let itemRefs = findChildren (dfName "itemref") spine + mapM (mkE "parseSpine" . (flip M.lookup is)) $ mapMaybe parseItemRef itemRefs + where + parseItemRef ref = do + let linear = maybe True (== "yes") (findAttr (emptyName "linear") ref) + guard linear + findAttr (emptyName "idref") ref + +parseMeta :: MonadError String m => Element -> m Meta +parseMeta content = do + meta <- findElementE (dfName "metadata") content + let dcspace (QName _ (Just "http://purl.org/dc/elements/1.1/") (Just "dc")) = True + dcspace _ = False + let dcs = filterChildrenName dcspace meta + let r = foldr parseMetaItem nullMeta dcs + return r + +-- http://www.idpf.org/epub/30/spec/epub30-publications.html#sec-metadata-elem +parseMetaItem :: Element -> Meta -> Meta +parseMetaItem e@(stripNamespace . elName -> field) meta = + B.setMeta (renameMeta field) (B.str $ strContent e) meta + +renameMeta :: String -> String +renameMeta "creator" = "author" +renameMeta s = s + +getManifest :: MonadError String m => Archive -> m (String, Element) +getManifest archive = do + metaEntry <- findEntryByPathE ("META-INF" "container.xml") archive + docElem <- (parseXMLDocE . UTF8.toStringLazy . fromEntry) metaEntry + let namespaces = mapMaybe attrToNSPair (elAttribs docElem) + ns <- mkE "xmlns not in namespaces" (lookup "xmlns" namespaces) + as <- liftM ((map attrToPair) . elAttribs) + (findElementE (QName "rootfile" (Just ns) Nothing) docElem) + root <- mkE "Root not found" (lookup "full-path" as) + let rootdir = dropFileName root + --mime <- lookup "media-type" as + manifest <- findEntryByPathE root archive + liftM ((,) rootdir) (parseXMLDocE . UTF8.toStringLazy . fromEntry $ manifest) + +-- Fixup + +fixInternalReferences :: String -> Pandoc -> Pandoc +fixInternalReferences s = + (walk normalisePath) . (walk stripImage) . (walk $ fixBlockIRs s') . (walk $ fixInlineIRs s') + where + s' = escapeURI s + +fixInlineIRs :: String -> Inline -> Inline +fixInlineIRs s (Span as v) = + Span (fixAttrs s as) v +fixInlineIRs s (Code as code) = + Code (fixAttrs s as) code +fixInlineIRs s (Link t ('#':url, tit)) = + Link t (addHash s url, tit) +fixInlineIRs _ v = v + +normalisePath :: Inline -> Inline +normalisePath (Link t (url, tit)) = + let (path, uid) = span (/= '#') url in + Link t (takeFileName path ++ uid, tit) +normalisePath s = s + +prependHash :: [String] -> Inline -> Inline +prependHash ps l@(Link is (url, tit)) + | or [s `isPrefixOf` url | s <- ps] = + Link is ('#':url, tit) + | otherwise = l +prependHash _ i = i + +fixBlockIRs :: String -> Block -> Block +fixBlockIRs s (Div as b) = + Div (fixAttrs s as) b +fixBlockIRs s (Header i as b) = + Header i (fixAttrs s as) b +fixBlockIRs s (CodeBlock as code) = + CodeBlock (fixAttrs s as) code +fixBlockIRs _ b = b + +fixAttrs :: FilePath -> B.Attr -> B.Attr +fixAttrs s (ident, cs, kvs) = (addHash s ident, filter (not . null) cs, removeEPUBAttrs kvs) + +addHash :: String -> String -> String +addHash _ "" = "" +addHash s ident = s ++ "#" ++ ident + +removeEPUBAttrs :: [(String, String)] -> [(String, String)] +removeEPUBAttrs kvs = filter (not . isEPUBAttr) kvs + +isEPUBAttr :: (String, String) -> Bool +isEPUBAttr (k, _) = "epub:" `isPrefixOf` k + +-- Remove relative paths +stripImage :: Inline -> Inline +stripImage (Image alt (url, tit)) = Image alt (takeFileName url, tit) +stripImage i = i + + +-- Utility + +-- Strict version of foldM +foldM' :: (Monad m, NFData a) => (a -> b -> m a) -> a -> [b] -> m a +foldM' _ z [] = return z +foldM' f z (x:xs) = do + z' <- f z x + z' `deepseq` foldM' f z' xs + +stripNamespace :: QName -> String +stripNamespace (QName v _ _) = v + +attrToNSPair :: Attr -> Maybe (String, String) +attrToNSPair (Attr (QName "xmlns" _ _) val) = Just ("xmlns", val) +attrToNSPair _ = Nothing + +attrToPair :: Attr -> (String, String) +attrToPair (Attr (QName name _ _) val) = (name, val) + +defaultNameSpace :: Maybe String +defaultNameSpace = Just "http://www.idpf.org/2007/opf" + +dfName :: String -> QName +dfName s = QName s defaultNameSpace Nothing + +emptyName :: String -> QName +emptyName s = QName s Nothing Nothing + +-- Convert Maybe interface to Either + +findAttrE :: MonadError String m => QName -> Element -> m String +findAttrE q e = mkE "findAttr" $ findAttr q e + +findEntryByPathE :: MonadError String m => FilePath -> Archive -> m Entry +findEntryByPathE path a = mkE ("No entry on path: " ++ path) $ findEntryByPath path a + +parseXMLDocE :: MonadError String m => String -> m Element +parseXMLDocE doc = mkE "Unable to parse XML doc" $ parseXMLDoc doc + +findElementE :: MonadError String m => QName -> Element -> m Element +findElementE e x = mkE ("Unable to find element: " ++ show e) $ findElement e x + +mkE :: MonadError String m => String -> Maybe a -> m a +mkE s = maybe (throwError s) return + -- cgit v1.2.3 From b57e554b59236eab7c9298b4b5bf6d32e5d7cda8 Mon Sep 17 00:00:00 2001 From: Matthew Pickering Date: Wed, 30 Jul 2014 00:48:40 +0100 Subject: Except Compat: Updated to export more module functions --- src/Text/Pandoc/Compat/Except.hs | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'src/Text') diff --git a/src/Text/Pandoc/Compat/Except.hs b/src/Text/Pandoc/Compat/Except.hs index 7f5648e7a..9ce7c0d36 100644 --- a/src/Text/Pandoc/Compat/Except.hs +++ b/src/Text/Pandoc/Compat/Except.hs @@ -1,7 +1,10 @@ {-# LANGUAGE CPP #-} module Text.Pandoc.Compat.Except ( ExceptT + , Except , Error(..) , runExceptT + , runExcept + , MonadError , throwError , catchError ) where @@ -18,10 +21,17 @@ class Error a where #else import Control.Monad.Error +import Control.Monad.Identity (Identity, runIdentity) + type ExceptT = ErrorT -runExceptT :: ExceptT e m a -> m (Either e a) +type Except s a = ErrorT s Identity a + +runExceptT :: ExceptT e m a -> m (Either e a) runExceptT = runErrorT + +runExcept :: ExceptT e Identity a -> Either e a +runExcept = runIdentity . runExceptT #endif -- cgit v1.2.3 From 002ae95d7a72313d979e9da4666cf61ebd009154 Mon Sep 17 00:00:00 2001 From: Matthew Pickering Date: Wed, 30 Jul 2014 00:49:46 +0100 Subject: Options: Added option to turn on epub html extensions --- src/Text/Pandoc/Options.hs | 1 + 1 file changed, 1 insertion(+) (limited to 'src/Text') diff --git a/src/Text/Pandoc/Options.hs b/src/Text/Pandoc/Options.hs index 85a6a3096..bb213bac0 100644 --- a/src/Text/Pandoc/Options.hs +++ b/src/Text/Pandoc/Options.hs @@ -106,6 +106,7 @@ data Extension = | Ext_mmd_header_identifiers -- ^ Multimarkdown style header identifiers [myid] | Ext_implicit_header_references -- ^ Implicit reference links for headers | Ext_line_blocks -- ^ RST style line blocks + | Ext_epub_html_exts -- ^ Recognise the EPUB extended version of HTML deriving (Show, Read, Enum, Eq, Ord, Bounded) pandocExtensions :: Set Extension -- cgit v1.2.3 From 266e1977e03383f806867d9d3af86b5d55717830 Mon Sep 17 00:00:00 2001 From: Matthew Pickering Date: Wed, 30 Jul 2014 00:54:05 +0100 Subject: HTML Reader: Extended HTML Reader to recognise EPUB specific elements --- src/Text/Pandoc/Readers/HTML.hs | 206 ++++++++++++++++++++++++++++++++++------ 1 file changed, 178 insertions(+), 28 deletions(-) (limited to 'src/Text') diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs index 597156a5e..ef061df09 100644 --- a/src/Text/Pandoc/Readers/HTML.hs +++ b/src/Text/Pandoc/Readers/HTML.hs @@ -41,48 +41,64 @@ import Text.HTML.TagSoup import Text.HTML.TagSoup.Match import Text.Pandoc.Definition import qualified Text.Pandoc.Builder as B -import Text.Pandoc.Builder (HasMeta (..), Blocks, Inlines, trimInlines) -import Text.Pandoc.Shared -import Text.Pandoc.Options -import Text.Pandoc.Parsing -import Data.Maybe ( fromMaybe, isJust ) -import Data.List ( intercalate ) +import Text.Pandoc.Builder (Blocks, Inlines, trimInlines, HasMeta(..)) +import Text.Pandoc.Shared ( extractSpaces, renderTags' + , escapeURI, safeRead ) +import Text.Pandoc.Options (ReaderOptions(readerParseRaw, readerTrace) + , Extension (Ext_epub_html_exts)) +import Text.Pandoc.Parsing hiding ((<|>)) +import Text.Pandoc.Walk +import Data.Maybe ( fromMaybe, isJust) +import Data.List ( intercalate, isInfixOf ) import Data.Char ( isDigit ) -import Control.Monad ( liftM, guard, when, mzero ) -import Control.Applicative ( (<$>), (<$), (<*) ) -import Data.Monoid +import Control.Monad ( liftM, guard, when, mzero, void, unless ) +import Control.Arrow ((***)) +import Control.Applicative ( (<$>), (<$), (<*), (*>), (<|>)) +import Data.Monoid (mconcat, Monoid, mempty, (<>), First (..)) import Text.Printf (printf) import Debug.Trace (trace) -import Data.Default (Default (..)) -import Control.Monad.Reader (Reader, runReader, asks, local, ask) +import Text.TeXMath (readMathML, writeTeXMath) +import Data.Default (Default (..), def) +import Control.Monad.Reader (Reader,ask, asks, local, runReader) -isSpace :: Char -> Bool -isSpace ' ' = True -isSpace '\t' = True -isSpace '\n' = True -isSpace _ = False -- | Convert HTML-formatted string to 'Pandoc' document. readHtml :: ReaderOptions -- ^ Reader options -> String -- ^ String to parse (assumes @'\n'@ line endings) -> Pandoc readHtml opts inp = - case flip runReader def $ runParserT parseDoc (HTMLState def{ stateOptions = opts } ) "source" tags of + case flip runReader def $ runParserT parseDoc (HTMLState def{ stateOptions = opts } []) "source" tags of Left err' -> error $ "\nError at " ++ show err' Right result -> result - where tags = canonicalizeTags $ + where tags = stripPrefixes . canonicalizeTags $ parseTagsOptions parseOptions{ optTagPosition = True } inp parseDoc = do blocks <- (fixPlains False) . mconcat <$> manyTill block eof meta <- stateMeta . parserState <$> getState - return $ Pandoc meta (B.toList blocks) + bs' <- replaceNotes (B.toList blocks) + return $ Pandoc meta bs' + +replaceNotes :: [Block] -> TagParser [Block] +replaceNotes = walkM replaceNotes' + +replaceNotes' :: Inline -> TagParser Inline +replaceNotes' (RawInline (Format "noteref") ref) = maybe (Str "") (Note . B.toList) . lookup ref <$> getNotes + where + getNotes = noteTable <$> getState +replaceNotes' x = return x data HTMLState = HTMLState - { parserState :: ParserState + { parserState :: ParserState, + noteTable :: [(String, Blocks)] } -data HTMLLocal = HTMLLocal { quoteContext :: QuoteContext } +data HTMLLocal = HTMLLocal { quoteContext :: QuoteContext + , inChapter :: Bool -- ^ Set if in chapter section + } + +setInChapter :: HTMLParser s a -> HTMLParser s a +setInChapter = local (\s -> s {inChapter = True}) type HTMLParser s = ParserT s HTMLState (Reader HTMLLocal) @@ -110,7 +126,11 @@ block = do tr <- getOption readerTrace pos <- getPosition res <- choice - [ pPara + [ eSwitch + , eSection + , mempty <$ eFootnote + , mempty <$ eTOC + , pPara , pHeader , pBlockQuote , pCodeBlock @@ -127,6 +147,64 @@ block = do (take 60 $ show $ B.toList res)) (return ()) return res +namespaces :: [(String, TagParser Blocks)] +namespaces = [(mathMLNamespace, B.para <$> pMath True)] + +mathMLNamespace :: String +mathMLNamespace = "http://www.w3.org/1998/Math/MathML" + +eSwitch :: TagParser Blocks +eSwitch = try $ do + guardEnabled Ext_epub_html_exts + pSatisfy (~== TagOpen "switch" []) + cases <- getFirst . mconcat <$> + manyTill (First <$> (eCase <* skipMany pBlank) ) + (lookAhead $ try $ pSatisfy (~== TagOpen "default" [])) + skipMany pBlank + fallback <- pInTags "default" ( skipMany pBlank *> block <* skipMany pBlank ) + skipMany pBlank + pSatisfy (~== TagClose "switch") + return (fromMaybe fallback cases) + +eCase :: TagParser (Maybe Blocks) +eCase = do + skipMany pBlank + TagOpen _ attr <- lookAhead $ pSatisfy $ (~== TagOpen "case" []) + case (flip lookup namespaces) =<< lookup "required-namespace" attr of + Just p -> Just <$> (pInTags "case" (skipMany pBlank *> p <* skipMany pBlank)) + Nothing -> Nothing <$ manyTill pAnyTag (pSatisfy (~== TagClose "case")) + +eFootnote :: TagParser () +eFootnote = try $ do + let notes = ["footnote", "rearnote"] + guardEnabled Ext_epub_html_exts + (TagOpen tag attr) <- lookAhead $ pAnyTag + guard (maybe False (flip elem notes) (lookup "type" attr)) + let ident = fromMaybe "" (lookup "id" attr) + content <- pInTags tag block + addNote ident content + +addNote :: String -> Blocks -> TagParser () +addNote uid cont = updateState (\s -> s {noteTable = (uid, cont) : (noteTable s)}) + +eNoteref :: TagParser Inlines +eNoteref = try $ do + guardEnabled Ext_epub_html_exts + TagOpen tag attr <- lookAhead $ pAnyTag + guard (maybe False (== "noteref") (lookup "type" attr)) + let ident = maybe "" (dropWhile (== '#')) (lookup "href" attr) + guard (not (null ident)) + pInTags tag block + return $ B.rawInline "noteref" ident + +-- Strip TOC if there is one, better to generate again +eTOC :: TagParser () +eTOC = try $ do + guardEnabled Ext_epub_html_exts + (TagOpen tag attr) <- lookAhead $ pAnyTag + guard (maybe False (== "toc") (lookup "type" attr)) + void (pInTags tag block) + pList :: TagParser Blocks pList = pBulletList <|> pOrderedList <|> pDefinitionList @@ -230,13 +308,35 @@ pHtmlBlock t = try $ do contents <- manyTill pAnyTag (pSatisfy (~== TagClose t)) return $ renderTags' $ [open] ++ contents ++ [TagClose t] +-- Sets chapter context +eSection :: TagParser Blocks +eSection = try $ do + let matchChapter as = maybe False (isInfixOf "chapter") (lookup "type" as) + let sectTag = tagOpen (`elem` sectioningContent) matchChapter + TagOpen tag _ <- lookAhead $ pSatisfy sectTag + setInChapter (pInTags tag block) + +headerLevel :: String -> TagParser Int +headerLevel tagtype = do + let level = read (drop 1 tagtype) + (try $ do + guardEnabled Ext_epub_html_exts + asks inChapter >>= guard + return (level - 1)) + <|> + return level + + + + + pHeader :: TagParser Blocks pHeader = try $ do TagOpen tagtype attr <- pSatisfy $ tagOpen (`elem` ["h1","h2","h3","h4","h5","h6"]) (const True) let bodyTitle = TagOpen tagtype attr ~== TagOpen "h1" [("class","title")] - let level = read (drop 1 tagtype) + level <- headerLevel tagtype contents <- trimInlines . mconcat <$> manyTill inline (pCloses tagtype <|> eof) let ident = fromMaybe "" $ lookup "id" attr let classes = maybe [] words $ lookup "class" attr @@ -336,7 +436,8 @@ pCodeBlock = try $ do inline :: TagParser Inlines inline = choice - [ pTagText + [ eNoteref + , pTagText , pQ , pEmph , pStrong @@ -348,6 +449,7 @@ inline = choice , pImage , pCode , pSpan + , pMath False , pRawHtmlInline ] @@ -620,8 +722,11 @@ blockDocBookTags = ["calloutlist", "bibliolist", "glosslist", "itemizedlist", "classsynopsis", "blockquote", "epigraph", "msgset", "sidebar", "title"] +epubTags :: [String] +epubTags = ["case", "switch", "default"] + blockTags :: [String] -blockTags = blockHtmlTags ++ blockDocBookTags +blockTags = blockHtmlTags ++ blockDocBookTags ++ epubTags isInlineTag :: Tag String -> Bool isInlineTag t = tagOpen isInlineTagName (const True) t || @@ -720,9 +825,32 @@ htmlTag f = try $ do mkAttr :: [(String, String)] -> Attr mkAttr attr = (attribsId, attribsClasses, attribsKV) where attribsId = fromMaybe "" $ lookup "id" attr - attribsClasses = words $ fromMaybe "" $ lookup "class" attr + attribsClasses = (words $ fromMaybe "" $ lookup "class" attr) ++ epubTypes attribsKV = filter (\(k,_) -> k /= "class" && k /= "id") attr + epubTypes = words $ fromMaybe "" $ lookup "epub:type" attr + +-- Strip namespace prefixes +stripPrefixes :: [Tag String] -> [Tag String] +stripPrefixes = map stripPrefix +stripPrefix :: Tag String -> Tag String +stripPrefix (TagOpen s as) = + TagOpen (stripPrefix' s) (map (stripPrefix' *** id) as) +stripPrefix (TagClose s) = TagClose (stripPrefix' s) +stripPrefix x = x + +stripPrefix' :: String -> String +stripPrefix' s = + case span (/= ':') s of + (_, "") -> s + (_, (_:ts)) -> ts + +isSpace :: Char -> Bool +isSpace ' ' = True +isSpace '\t' = True +isSpace '\n' = True +isSpace '\r' = True +isSpace _ = False -- Instances @@ -736,17 +864,39 @@ instance HasReaderOptions HTMLState where extractReaderOptions = extractReaderOptions . parserState instance Default HTMLState where - def = HTMLState def + def = HTMLState def [] instance HasMeta HTMLState where setMeta s b st = st {parserState = setMeta s b $ parserState st} deleteMeta s st = st {parserState = deleteMeta s $ parserState st} instance Default HTMLLocal where - def = HTMLLocal NoQuote + def = HTMLLocal NoQuote False instance HasLastStrPosition HTMLState where setLastStrPos s st = st {parserState = setLastStrPos s (parserState st)} getLastStrPos = getLastStrPos . parserState +-- EPUB Specific +-- +-- +sectioningContent :: [String] +sectioningContent = ["article", "aside", "nav", "section"] + +{- +groupingContent :: [String] +groupingContent = ["p", "hr", "pre", "blockquote", "ol" + , "ul", "li", "dl", "dt", "dt", "dd" + , "figure", "figcaption", "div", "main"] + + + +types :: [(String, ([String], Int))] +types = -- Document divisions + map (\s -> (s, (["section", "body"], 0))) + ["volume", "part", "chapter", "division"] + ++ -- Document section and components + [ + ("abstract", ([], 0))] +-} -- cgit v1.2.3 From 7c1f8673972eac79bfbccced0406a8fa4b3a319b Mon Sep 17 00:00:00 2001 From: Matthew Pickering Date: Wed, 30 Jul 2014 00:55:31 +0100 Subject: HTML Reader: Added support for anchors on links and list items --- src/Text/Pandoc/Readers/HTML.hs | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) (limited to 'src/Text') diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs index ef061df09..34aa381d2 100644 --- a/src/Text/Pandoc/Readers/HTML.hs +++ b/src/Text/Pandoc/Readers/HTML.hs @@ -217,9 +217,15 @@ pBulletList = try $ do -- note: if they have an
    or
      not in scope of a
    • , -- treat it as a list item, though it's not valid xhtml... skipMany nonItem - items <- manyTill (pInTags "li" block <* skipMany nonItem) (pCloses "ul") + items <- manyTill (pListItem nonItem) (pCloses "ul") return $ B.bulletList $ map (fixPlains True) items +pListItem :: TagParser a -> TagParser Blocks +pListItem nonItem = do + TagOpen _ attr <- lookAhead $ pSatisfy (~== TagOpen "li" []) + let liDiv = maybe mempty (\x -> B.divWith (x, [], []) mempty) (lookup "id" attr) + (liDiv <>) <$> pInTags "li" block <* skipMany nonItem + pOrderedList :: TagParser Blocks pOrderedList = try $ do TagOpen _ attribs <- pSatisfy (~== TagOpen "ol" []) @@ -245,7 +251,7 @@ pOrderedList = try $ do -- note: if they have an
        or
          not in scope of a
        • , -- treat it as a list item, though it's not valid xhtml... skipMany nonItem - items <- manyTill (pInTags "li" block <* skipMany nonItem) (pCloses "ol") + items <- manyTill (pListItem nonItem) (pCloses "ol") return $ B.orderedListWith (start, style, DefaultDelim) $ map (fixPlains True) items pDefinitionList :: TagParser Blocks @@ -518,12 +524,24 @@ pLineBreak = do return B.linebreak pLink :: TagParser Inlines -pLink = try $ do +pLink = pRelLink <|> pAnchor + +pAnchor :: TagParser Inlines +pAnchor = try $ do + tag <- pSatisfy (tagOpenLit "a" (isJust . lookup "id")) + return $ B.spanWith (fromAttrib "id" tag , [], []) mempty + +pRelLink :: TagParser Inlines +pRelLink = try $ do tag <- pSatisfy (tagOpenLit "a" (isJust . lookup "href")) let url = fromAttrib "href" tag let title = fromAttrib "title" tag + let uid = fromAttrib "id" tag + let spanC = case uid of + [] -> id + s -> B.spanWith (s, [], []) lab <- trimInlines . mconcat <$> manyTill inline (pCloses "a") - return $ B.link (escapeURI url) title lab + return $ spanC $ B.link (escapeURI url) title lab pImage :: TagParser Inlines pImage = do -- cgit v1.2.3 From 51051e9953f97c35178eded5d3959394a184baf4 Mon Sep 17 00:00:00 2001 From: Matthew Pickering Date: Wed, 30 Jul 2014 00:56:04 +0100 Subject: HTML Reader: Added ability to read MathML formatted blocks --- src/Text/Pandoc/Readers/HTML.hs | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'src/Text') diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs index 34aa381d2..2e8b56124 100644 --- a/src/Text/Pandoc/Readers/HTML.hs +++ b/src/Text/Pandoc/Readers/HTML.hs @@ -571,6 +571,22 @@ pRawHtmlInline = do then return $ B.rawInline "html" $ renderTags' [result] else return mempty +mathMLToTeXMath :: String -> Either String String +mathMLToTeXMath s = writeTeXMath <$> readMathML s + +pMath :: Bool -> TagParser Inlines +pMath inCase = try $ do + open@(TagOpen _ attr) <- pSatisfy $ tagOpen (=="math") (const True) + unless (inCase) (guard (maybe False (== mathMLNamespace) (lookup "xmlns" attr))) + contents <- manyTill pAnyTag (pSatisfy (~== TagClose "math")) + let math = mathMLToTeXMath $ + (renderTags $ [open] ++ contents ++ [TagClose "math"]) + let constructor = + maybe B.math (\x -> if (x == "inline") then B.math else B.displayMath) + (lookup "display" attr) + return $ either (const mempty) + (\x -> if null x then mempty else constructor x) math + pInlinesInTags :: String -> (Inlines -> Inlines) -> TagParser Inlines pInlinesInTags tagtype f = extractSpaces f <$> pInTags tagtype inline -- cgit v1.2.3 From 8460ea417f3bcd4599f1ba4877dd4c6e2a373ed1 Mon Sep 17 00:00:00 2001 From: Matthew Pickering Date: Wed, 30 Jul 2014 00:57:20 +0100 Subject: EPUB Reader: Integrated into program --- pandoc.cabal | 6 ++++-- pandoc.hs | 1 + src/Text/Pandoc.hs | 3 +++ 3 files changed, 8 insertions(+), 2 deletions(-) (limited to 'src/Text') diff --git a/pandoc.cabal b/pandoc.cabal index 47bdb6587..991398f24 100644 --- a/pandoc.cabal +++ b/pandoc.cabal @@ -231,7 +231,7 @@ Library xml >= 1.3.12 && < 1.4, random >= 1 && < 1.1, extensible-exceptions >= 0.1 && < 0.2, - pandoc-types >= 1.12.3.3 && < 1.13, + pandoc-types >= 1.12.3.4 && < 1.13, aeson >= 0.7 && < 0.8, tagsoup >= 0.13.1 && < 0.14, base64-bytestring >= 0.1 && < 1.1, @@ -248,7 +248,8 @@ Library binary >= 0.5 && < 0.8, SHA >= 1.6 && < 1.7, haddock-library >= 1.1 && < 1.2, - old-time + old-time, + deepseq-generics >= 0.1 && < 0.2 if flag(https) Build-Depends: http-client >= 0.3.2 && < 0.4, http-client-tls >= 0.2 && < 0.3, @@ -287,6 +288,7 @@ Library Text.Pandoc.Readers.Native, Text.Pandoc.Readers.Haddock, Text.Pandoc.Readers.Docx, + Text.Pandoc.Readers.EPUB, Text.Pandoc.Writers.Native, Text.Pandoc.Writers.Docbook, Text.Pandoc.Writers.OPML, diff --git a/pandoc.hs b/pandoc.hs index 2563afc9c..de41b38cb 100644 --- a/pandoc.hs +++ b/pandoc.hs @@ -891,6 +891,7 @@ defaultReaderName fallback (x:xs) = ".json" -> "json" ".docx" -> "docx" ".t2t" -> "t2t" + ".epub" -> "epub" _ -> defaultReaderName fallback xs -- Returns True if extension of first source is .lhs diff --git a/src/Text/Pandoc.hs b/src/Text/Pandoc.hs index 77eb3e82f..589a6af98 100644 --- a/src/Text/Pandoc.hs +++ b/src/Text/Pandoc.hs @@ -79,6 +79,7 @@ module Text.Pandoc , readJSON , readTxt2Tags , readTxt2TagsNoMacros + , readEPUB -- * Writers: converting /from/ Pandoc format , Writer (..) , writeNative @@ -134,6 +135,7 @@ import Text.Pandoc.Readers.Native import Text.Pandoc.Readers.Haddock import Text.Pandoc.Readers.Docx import Text.Pandoc.Readers.Txt2Tags +import Text.Pandoc.Readers.EPUB import Text.Pandoc.Writers.Native import Text.Pandoc.Writers.Markdown import Text.Pandoc.Writers.RST @@ -233,6 +235,7 @@ readers = [ ("native" , StringReader $ \_ s -> return $ readNative s) ,("haddock" , mkStringReader readHaddock) ,("docx" , mkBSReader readDocx) ,("t2t" , mkStringReader readTxt2TagsNoMacros) + ,("epub" , mkBSReader readEPUB) ] data Writer = PureStringWriter (WriterOptions -> Pandoc -> String) -- cgit v1.2.3 From cd9a5d90cbf93925db5bb9e9060ef40d05b4bfc8 Mon Sep 17 00:00:00 2001 From: Matthew Pickering Date: Thu, 31 Jul 2014 15:35:40 +0100 Subject: EPUB Reader: Now uses the new MediaBag for images --- src/Text/Pandoc/Readers/EPUB.hs | 65 ++++++++++++++++++++++++++++------------- 1 file changed, 45 insertions(+), 20 deletions(-) (limited to 'src/Text') diff --git a/src/Text/Pandoc/Readers/EPUB.hs b/src/Text/Pandoc/Readers/EPUB.hs index f926a5864..ca65a8f0f 100644 --- a/src/Text/Pandoc/Readers/EPUB.hs +++ b/src/Text/Pandoc/Readers/EPUB.hs @@ -1,6 +1,7 @@ {-# LANGUAGE ViewPatterns , StandaloneDeriving + , TupleSections , FlexibleContexts #-} module Text.Pandoc.Readers.EPUB @@ -9,32 +10,36 @@ module Text.Pandoc.Readers.EPUB import Text.XML.Light import Text.Pandoc.Definition hiding (Attr) -import Text.Pandoc.Walk (walk) +import Text.Pandoc.Walk (walk, query) import Text.Pandoc.Generic(bottomUp) import Text.Pandoc.Readers.HTML (readHtml) -import Text.Pandoc.Options (ReaderOptions(..), readerExtensions, Extension(..) ) +import Text.Pandoc.Options ( ReaderOptions(..), readerExtensions, Extension(..) + , readerTrace) import Text.Pandoc.Shared (escapeURI) +import Text.Pandoc.MediaBag (MediaBag, insertMedia) import Text.Pandoc.Compat.Except (MonadError, throwError, runExcept, Except) import qualified Text.Pandoc.Builder as B import Codec.Archive.Zip ( Archive (..), toArchive, fromEntry , findEntryByPath, Entry) import qualified Data.ByteString.Lazy as BL (ByteString) -import System.FilePath (takeFileName, (), dropFileName) +import System.FilePath (takeFileName, (), dropFileName, normalise) import qualified Text.Pandoc.UTF8 as UTF8 (toStringLazy) import Control.Applicative ((<$>)) -import Control.Monad (guard, liftM) +import Control.Monad (guard, liftM, when) import Data.Monoid (mempty, (<>)) import Data.List (isPrefixOf, isInfixOf) import Data.Maybe (mapMaybe, fromMaybe) -import qualified Data.Map as M (Map, lookup, fromList) +import qualified Data.Map as M (Map, lookup, fromList, elems) import qualified Data.Set as S (insert) import Control.DeepSeq.Generics (deepseq, NFData) +import Debug.Trace (trace) + type MIME = String type Items = M.Map String (FilePath, MIME) -readEPUB :: ReaderOptions -> BL.ByteString -> Pandoc +readEPUB :: ReaderOptions -> BL.ByteString -> (Pandoc, MediaBag) readEPUB opts bytes = runEPUB (archiveToEPUB opts $ toArchive bytes) runEPUB :: Except String a -> a @@ -44,27 +49,28 @@ runEPUB = either error id . runExcept -- are of the form "filename#id" -- -- For now all paths are stripped from images -archiveToEPUB :: (MonadError String m) => ReaderOptions -> Archive -> m Pandoc +archiveToEPUB :: (MonadError String m) => ReaderOptions -> Archive -> m (Pandoc, MediaBag) archiveToEPUB os archive = do (root, content) <- getManifest archive meta <- parseMeta content (cover, items) <- parseManifest content - let coverDoc = fromMaybe mempty (imageToPandoc . takeFileName<$> cover) + let coverDoc = fromMaybe mempty (imageToPandoc . takeFileName <$> cover) spine <- parseSpine items content let escapedSpine = map (escapeURI . takeFileName . fst) spine - --traceShow escapedSpine (return ()) Pandoc _ bs <- foldM' (\a b -> ((a <>) . bottomUp (prependHash escapedSpine)) `liftM` parseSpineElem root b) mempty spine - return $ coverDoc <> (Pandoc meta bs) + let ast = coverDoc <> (Pandoc meta bs) + let mediaBag = fetchImages (M.elems items) root archive ast + return $ (ast, mediaBag) where rs = readerExtensions os os' = os {readerExtensions = foldr S.insert rs [Ext_epub_html_exts, Ext_raw_html]} os'' = os' {readerParseRaw = True} parseSpineElem :: MonadError String m => FilePath -> (FilePath, MIME) -> m Pandoc parseSpineElem r (path, mime) = do - --traceShow path (return ()) - doc <- mimeToReader mime (if r /= "./" then r path else path) + when (readerTrace os) (traceM path) + doc <- mimeToReader mime (normalise (r path)) let docSpan = B.doc $ B.para $ B.spanWith (takeFileName path, [], []) mempty return $ docSpan <> fixInternalReferences (takeFileName path) doc mimeToReader :: MonadError String m => MIME -> FilePath -> m Pandoc @@ -75,6 +81,23 @@ archiveToEPUB os archive = do | s `elem` imageMimes = return $ imageToPandoc path | otherwise = return $ mempty +fetchImages :: [(FilePath, MIME)] + -> FilePath + -> Archive + -> Pandoc + -> MediaBag +fetchImages mimes root a (query iq -> links) = + foldr (uncurry3 insertMedia) mempty + (mapMaybe getEntry links) + where + getEntry l = let mediaPos = normalise (root l) in + (l , lookup mediaPos mimes, ) . fromEntry + <$> findEntryByPath mediaPos a + +iq :: Inline -> [FilePath] +iq (Image _ (url, _)) = [url] +iq _ = [] + imageToPandoc :: FilePath -> Pandoc imageToPandoc s = B.doc . B.para $ B.image s "" mempty @@ -147,7 +170,7 @@ getManifest archive = do fixInternalReferences :: String -> Pandoc -> Pandoc fixInternalReferences s = - (walk normalisePath) . (walk stripImage) . (walk $ fixBlockIRs s') . (walk $ fixInlineIRs s') + (walk normalisePath) . (walk $ fixBlockIRs s') . (walk $ fixInlineIRs s') where s' = escapeURI s @@ -195,13 +218,7 @@ removeEPUBAttrs kvs = filter (not . isEPUBAttr) kvs isEPUBAttr :: (String, String) -> Bool isEPUBAttr (k, _) = "epub:" `isPrefixOf` k --- Remove relative paths -stripImage :: Inline -> Inline -stripImage (Image alt (url, tit)) = Image alt (takeFileName url, tit) -stripImage i = i - - --- Utility +-- Library -- Strict version of foldM foldM' :: (Monad m, NFData a) => (a -> b -> m a) -> a -> [b] -> m a @@ -210,6 +227,14 @@ foldM' f z (x:xs) = do z' <- f z x z' `deepseq` foldM' f z' xs +traceM :: Monad m => String -> m () +traceM = flip trace (return ()) + +uncurry3 :: (a -> b -> c -> d) -> (a, b, c) -> d +uncurry3 f (a, b, c) = f a b c + +-- Utility + stripNamespace :: QName -> String stripNamespace (QName v _ _) = v -- cgit v1.2.3