aboutsummaryrefslogtreecommitdiff
path: root/src/Text/Pandoc
diff options
context:
space:
mode:
authorJohn MacFarlane <jgm@berkeley.edu>2014-08-04 07:36:18 -0700
committerJohn MacFarlane <jgm@berkeley.edu>2014-08-04 07:36:18 -0700
commit4630cff2a6c116f1d474f459e6e759f5ce7f2003 (patch)
tree5c52982b8f0615fadb69a0a105af9e1e60d51f25 /src/Text/Pandoc
parent81335df9a51740631e75614c1279634f937d650a (diff)
parentcd9a5d90cbf93925db5bb9e9060ef40d05b4bfc8 (diff)
downloadpandoc-4630cff2a6c116f1d474f459e6e759f5ce7f2003.tar.gz
Merge branch 'epubend' of https://github.com/mpickering/pandoc into mpickering-epubend
Conflicts: pandoc.cabal
Diffstat (limited to 'src/Text/Pandoc')
-rw-r--r--src/Text/Pandoc/Compat/Except.hs12
-rw-r--r--src/Text/Pandoc/Options.hs1
-rw-r--r--src/Text/Pandoc/Readers/EPUB.hs273
-rw-r--r--src/Text/Pandoc/Readers/HTML.hs248
4 files changed, 501 insertions, 33 deletions
diff --git a/src/Text/Pandoc/Compat/Except.hs b/src/Text/Pandoc/Compat/Except.hs
index 7f5648e7a..9ce7c0d36 100644
--- a/src/Text/Pandoc/Compat/Except.hs
+++ b/src/Text/Pandoc/Compat/Except.hs
@@ -1,7 +1,10 @@
{-# LANGUAGE CPP #-}
module Text.Pandoc.Compat.Except ( ExceptT
+ , Except
, Error(..)
, runExceptT
+ , runExcept
+ , MonadError
, throwError
, catchError )
where
@@ -18,10 +21,17 @@ class Error a where
#else
import Control.Monad.Error
+import Control.Monad.Identity (Identity, runIdentity)
+
type ExceptT = ErrorT
-runExceptT :: ExceptT e m a -> m (Either e a)
+type Except s a = ErrorT s Identity a
+
+runExceptT :: ExceptT e m a -> m (Either e a)
runExceptT = runErrorT
+
+runExcept :: ExceptT e Identity a -> Either e a
+runExcept = runIdentity . runExceptT
#endif
diff --git a/src/Text/Pandoc/Options.hs b/src/Text/Pandoc/Options.hs
index 85a6a3096..bb213bac0 100644
--- a/src/Text/Pandoc/Options.hs
+++ b/src/Text/Pandoc/Options.hs
@@ -106,6 +106,7 @@ data Extension =
| Ext_mmd_header_identifiers -- ^ Multimarkdown style header identifiers [myid]
| Ext_implicit_header_references -- ^ Implicit reference links for headers
| Ext_line_blocks -- ^ RST style line blocks
+ | Ext_epub_html_exts -- ^ Recognise the EPUB extended version of HTML
deriving (Show, Read, Enum, Eq, Ord, Bounded)
pandocExtensions :: Set Extension
diff --git a/src/Text/Pandoc/Readers/EPUB.hs b/src/Text/Pandoc/Readers/EPUB.hs
new file mode 100644
index 000000000..ca65a8f0f
--- /dev/null
+++ b/src/Text/Pandoc/Readers/EPUB.hs
@@ -0,0 +1,273 @@
+{-# LANGUAGE
+ ViewPatterns
+ , StandaloneDeriving
+ , TupleSections
+ , FlexibleContexts #-}
+
+module Text.Pandoc.Readers.EPUB
+ (readEPUB)
+ where
+
+import Text.XML.Light
+import Text.Pandoc.Definition hiding (Attr)
+import Text.Pandoc.Walk (walk, query)
+import Text.Pandoc.Generic(bottomUp)
+import Text.Pandoc.Readers.HTML (readHtml)
+import Text.Pandoc.Options ( ReaderOptions(..), readerExtensions, Extension(..)
+ , readerTrace)
+import Text.Pandoc.Shared (escapeURI)
+import Text.Pandoc.MediaBag (MediaBag, insertMedia)
+import Text.Pandoc.Compat.Except (MonadError, throwError, runExcept, Except)
+import qualified Text.Pandoc.Builder as B
+import Codec.Archive.Zip ( Archive (..), toArchive, fromEntry
+ , findEntryByPath, Entry)
+import qualified Data.ByteString.Lazy as BL (ByteString)
+import System.FilePath (takeFileName, (</>), dropFileName, normalise)
+import qualified Text.Pandoc.UTF8 as UTF8 (toStringLazy)
+import Control.Applicative ((<$>))
+import Control.Monad (guard, liftM, when)
+import Data.Monoid (mempty, (<>))
+import Data.List (isPrefixOf, isInfixOf)
+import Data.Maybe (mapMaybe, fromMaybe)
+import qualified Data.Map as M (Map, lookup, fromList, elems)
+import qualified Data.Set as S (insert)
+import Control.DeepSeq.Generics (deepseq, NFData)
+
+import Debug.Trace (trace)
+
+type MIME = String
+
+type Items = M.Map String (FilePath, MIME)
+
+readEPUB :: ReaderOptions -> BL.ByteString -> (Pandoc, MediaBag)
+readEPUB opts bytes = runEPUB (archiveToEPUB opts $ toArchive bytes)
+
+runEPUB :: Except String a -> a
+runEPUB = either error id . runExcept
+
+-- Note that internal reference are aggresively normalised so that all ids
+-- are of the form "filename#id"
+--
+-- For now all paths are stripped from images
+archiveToEPUB :: (MonadError String m) => ReaderOptions -> Archive -> m (Pandoc, MediaBag)
+archiveToEPUB os archive = do
+ (root, content) <- getManifest archive
+ meta <- parseMeta content
+ (cover, items) <- parseManifest content
+ let coverDoc = fromMaybe mempty (imageToPandoc . takeFileName <$> cover)
+ spine <- parseSpine items content
+ let escapedSpine = map (escapeURI . takeFileName . fst) spine
+ Pandoc _ bs <-
+ foldM' (\a b -> ((a <>) . bottomUp (prependHash escapedSpine))
+ `liftM` parseSpineElem root b) mempty spine
+ let ast = coverDoc <> (Pandoc meta bs)
+ let mediaBag = fetchImages (M.elems items) root archive ast
+ return $ (ast, mediaBag)
+ where
+ rs = readerExtensions os
+ os' = os {readerExtensions = foldr S.insert rs [Ext_epub_html_exts, Ext_raw_html]}
+ os'' = os' {readerParseRaw = True}
+ parseSpineElem :: MonadError String m => FilePath -> (FilePath, MIME) -> m Pandoc
+ parseSpineElem r (path, mime) = do
+ when (readerTrace os) (traceM path)
+ doc <- mimeToReader mime (normalise (r </> path))
+ let docSpan = B.doc $ B.para $ B.spanWith (takeFileName path, [], []) mempty
+ return $ docSpan <> fixInternalReferences (takeFileName path) doc
+ mimeToReader :: MonadError String m => MIME -> FilePath -> m Pandoc
+ mimeToReader "application/xhtml+xml" path = do
+ fname <- findEntryByPathE path archive
+ return $ readHtml os'' . UTF8.toStringLazy $ fromEntry fname
+ mimeToReader s path
+ | s `elem` imageMimes = return $ imageToPandoc path
+ | otherwise = return $ mempty
+
+fetchImages :: [(FilePath, MIME)]
+ -> FilePath
+ -> Archive
+ -> Pandoc
+ -> MediaBag
+fetchImages mimes root a (query iq -> links) =
+ foldr (uncurry3 insertMedia) mempty
+ (mapMaybe getEntry links)
+ where
+ getEntry l = let mediaPos = normalise (root </> l) in
+ (l , lookup mediaPos mimes, ) . fromEntry
+ <$> findEntryByPath mediaPos a
+
+iq :: Inline -> [FilePath]
+iq (Image _ (url, _)) = [url]
+iq _ = []
+
+
+imageToPandoc :: FilePath -> Pandoc
+imageToPandoc s = B.doc . B.para $ B.image s "" mempty
+
+imageMimes :: [String]
+imageMimes = ["image/gif", "image/jpeg", "image/png"]
+
+type CoverImage = FilePath
+
+parseManifest :: (MonadError String m) => Element -> m (Maybe CoverImage, Items)
+parseManifest content = do
+ manifest <- findElementE (dfName "manifest") content
+ let items = findChildren (dfName "item") manifest
+ r <- mapM parseItem items
+ let cover = findAttr (emptyName "href") =<< filterChild findCover manifest
+ return (cover, (M.fromList r))
+ where
+ findCover e = maybe False (isInfixOf "cover-image")
+ (findAttr (emptyName "properties") e)
+ parseItem e = do
+ uid <- findAttrE (emptyName "id") e
+ href <- findAttrE (emptyName "href") e
+ mime <- findAttrE (emptyName "media-type") e
+ return (uid, (href, mime))
+
+parseSpine :: MonadError String m => Items -> Element -> m [(FilePath, MIME)]
+parseSpine is e = do
+ spine <- findElementE (dfName "spine") e
+ let itemRefs = findChildren (dfName "itemref") spine
+ mapM (mkE "parseSpine" . (flip M.lookup is)) $ mapMaybe parseItemRef itemRefs
+ where
+ parseItemRef ref = do
+ let linear = maybe True (== "yes") (findAttr (emptyName "linear") ref)
+ guard linear
+ findAttr (emptyName "idref") ref
+
+parseMeta :: MonadError String m => Element -> m Meta
+parseMeta content = do
+ meta <- findElementE (dfName "metadata") content
+ let dcspace (QName _ (Just "http://purl.org/dc/elements/1.1/") (Just "dc")) = True
+ dcspace _ = False
+ let dcs = filterChildrenName dcspace meta
+ let r = foldr parseMetaItem nullMeta dcs
+ return r
+
+-- http://www.idpf.org/epub/30/spec/epub30-publications.html#sec-metadata-elem
+parseMetaItem :: Element -> Meta -> Meta
+parseMetaItem e@(stripNamespace . elName -> field) meta =
+ B.setMeta (renameMeta field) (B.str $ strContent e) meta
+
+renameMeta :: String -> String
+renameMeta "creator" = "author"
+renameMeta s = s
+
+getManifest :: MonadError String m => Archive -> m (String, Element)
+getManifest archive = do
+ metaEntry <- findEntryByPathE ("META-INF" </> "container.xml") archive
+ docElem <- (parseXMLDocE . UTF8.toStringLazy . fromEntry) metaEntry
+ let namespaces = mapMaybe attrToNSPair (elAttribs docElem)
+ ns <- mkE "xmlns not in namespaces" (lookup "xmlns" namespaces)
+ as <- liftM ((map attrToPair) . elAttribs)
+ (findElementE (QName "rootfile" (Just ns) Nothing) docElem)
+ root <- mkE "Root not found" (lookup "full-path" as)
+ let rootdir = dropFileName root
+ --mime <- lookup "media-type" as
+ manifest <- findEntryByPathE root archive
+ liftM ((,) rootdir) (parseXMLDocE . UTF8.toStringLazy . fromEntry $ manifest)
+
+-- Fixup
+
+fixInternalReferences :: String -> Pandoc -> Pandoc
+fixInternalReferences s =
+ (walk normalisePath) . (walk $ fixBlockIRs s') . (walk $ fixInlineIRs s')
+ where
+ s' = escapeURI s
+
+fixInlineIRs :: String -> Inline -> Inline
+fixInlineIRs s (Span as v) =
+ Span (fixAttrs s as) v
+fixInlineIRs s (Code as code) =
+ Code (fixAttrs s as) code
+fixInlineIRs s (Link t ('#':url, tit)) =
+ Link t (addHash s url, tit)
+fixInlineIRs _ v = v
+
+normalisePath :: Inline -> Inline
+normalisePath (Link t (url, tit)) =
+ let (path, uid) = span (/= '#') url in
+ Link t (takeFileName path ++ uid, tit)
+normalisePath s = s
+
+prependHash :: [String] -> Inline -> Inline
+prependHash ps l@(Link is (url, tit))
+ | or [s `isPrefixOf` url | s <- ps] =
+ Link is ('#':url, tit)
+ | otherwise = l
+prependHash _ i = i
+
+fixBlockIRs :: String -> Block -> Block
+fixBlockIRs s (Div as b) =
+ Div (fixAttrs s as) b
+fixBlockIRs s (Header i as b) =
+ Header i (fixAttrs s as) b
+fixBlockIRs s (CodeBlock as code) =
+ CodeBlock (fixAttrs s as) code
+fixBlockIRs _ b = b
+
+fixAttrs :: FilePath -> B.Attr -> B.Attr
+fixAttrs s (ident, cs, kvs) = (addHash s ident, filter (not . null) cs, removeEPUBAttrs kvs)
+
+addHash :: String -> String -> String
+addHash _ "" = ""
+addHash s ident = s ++ "#" ++ ident
+
+removeEPUBAttrs :: [(String, String)] -> [(String, String)]
+removeEPUBAttrs kvs = filter (not . isEPUBAttr) kvs
+
+isEPUBAttr :: (String, String) -> Bool
+isEPUBAttr (k, _) = "epub:" `isPrefixOf` k
+
+-- Library
+
+-- Strict version of foldM
+foldM' :: (Monad m, NFData a) => (a -> b -> m a) -> a -> [b] -> m a
+foldM' _ z [] = return z
+foldM' f z (x:xs) = do
+ z' <- f z x
+ z' `deepseq` foldM' f z' xs
+
+traceM :: Monad m => String -> m ()
+traceM = flip trace (return ())
+
+uncurry3 :: (a -> b -> c -> d) -> (a, b, c) -> d
+uncurry3 f (a, b, c) = f a b c
+
+-- Utility
+
+stripNamespace :: QName -> String
+stripNamespace (QName v _ _) = v
+
+attrToNSPair :: Attr -> Maybe (String, String)
+attrToNSPair (Attr (QName "xmlns" _ _) val) = Just ("xmlns", val)
+attrToNSPair _ = Nothing
+
+attrToPair :: Attr -> (String, String)
+attrToPair (Attr (QName name _ _) val) = (name, val)
+
+defaultNameSpace :: Maybe String
+defaultNameSpace = Just "http://www.idpf.org/2007/opf"
+
+dfName :: String -> QName
+dfName s = QName s defaultNameSpace Nothing
+
+emptyName :: String -> QName
+emptyName s = QName s Nothing Nothing
+
+-- Convert Maybe interface to Either
+
+findAttrE :: MonadError String m => QName -> Element -> m String
+findAttrE q e = mkE "findAttr" $ findAttr q e
+
+findEntryByPathE :: MonadError String m => FilePath -> Archive -> m Entry
+findEntryByPathE path a = mkE ("No entry on path: " ++ path) $ findEntryByPath path a
+
+parseXMLDocE :: MonadError String m => String -> m Element
+parseXMLDocE doc = mkE "Unable to parse XML doc" $ parseXMLDoc doc
+
+findElementE :: MonadError String m => QName -> Element -> m Element
+findElementE e x = mkE ("Unable to find element: " ++ show e) $ findElement e x
+
+mkE :: MonadError String m => String -> Maybe a -> m a
+mkE s = maybe (throwError s) return
+
diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs
index 597156a5e..2e8b56124 100644
--- a/src/Text/Pandoc/Readers/HTML.hs
+++ b/src/Text/Pandoc/Readers/HTML.hs
@@ -41,48 +41,64 @@ import Text.HTML.TagSoup
import Text.HTML.TagSoup.Match
import Text.Pandoc.Definition
import qualified Text.Pandoc.Builder as B
-import Text.Pandoc.Builder (HasMeta (..), Blocks, Inlines, trimInlines)
-import Text.Pandoc.Shared
-import Text.Pandoc.Options
-import Text.Pandoc.Parsing
-import Data.Maybe ( fromMaybe, isJust )
-import Data.List ( intercalate )
+import Text.Pandoc.Builder (Blocks, Inlines, trimInlines, HasMeta(..))
+import Text.Pandoc.Shared ( extractSpaces, renderTags'
+ , escapeURI, safeRead )
+import Text.Pandoc.Options (ReaderOptions(readerParseRaw, readerTrace)
+ , Extension (Ext_epub_html_exts))
+import Text.Pandoc.Parsing hiding ((<|>))
+import Text.Pandoc.Walk
+import Data.Maybe ( fromMaybe, isJust)
+import Data.List ( intercalate, isInfixOf )
import Data.Char ( isDigit )
-import Control.Monad ( liftM, guard, when, mzero )
-import Control.Applicative ( (<$>), (<$), (<*) )
-import Data.Monoid
+import Control.Monad ( liftM, guard, when, mzero, void, unless )
+import Control.Arrow ((***))
+import Control.Applicative ( (<$>), (<$), (<*), (*>), (<|>))
+import Data.Monoid (mconcat, Monoid, mempty, (<>), First (..))
import Text.Printf (printf)
import Debug.Trace (trace)
-import Data.Default (Default (..))
-import Control.Monad.Reader (Reader, runReader, asks, local, ask)
+import Text.TeXMath (readMathML, writeTeXMath)
+import Data.Default (Default (..), def)
+import Control.Monad.Reader (Reader,ask, asks, local, runReader)
-isSpace :: Char -> Bool
-isSpace ' ' = True
-isSpace '\t' = True
-isSpace '\n' = True
-isSpace _ = False
-- | Convert HTML-formatted string to 'Pandoc' document.
readHtml :: ReaderOptions -- ^ Reader options
-> String -- ^ String to parse (assumes @'\n'@ line endings)
-> Pandoc
readHtml opts inp =
- case flip runReader def $ runParserT parseDoc (HTMLState def{ stateOptions = opts } ) "source" tags of
+ case flip runReader def $ runParserT parseDoc (HTMLState def{ stateOptions = opts } []) "source" tags of
Left err' -> error $ "\nError at " ++ show err'
Right result -> result
- where tags = canonicalizeTags $
+ where tags = stripPrefixes . canonicalizeTags $
parseTagsOptions parseOptions{ optTagPosition = True } inp
parseDoc = do
blocks <- (fixPlains False) . mconcat <$> manyTill block eof
meta <- stateMeta . parserState <$> getState
- return $ Pandoc meta (B.toList blocks)
+ bs' <- replaceNotes (B.toList blocks)
+ return $ Pandoc meta bs'
+
+replaceNotes :: [Block] -> TagParser [Block]
+replaceNotes = walkM replaceNotes'
+
+replaceNotes' :: Inline -> TagParser Inline
+replaceNotes' (RawInline (Format "noteref") ref) = maybe (Str "") (Note . B.toList) . lookup ref <$> getNotes
+ where
+ getNotes = noteTable <$> getState
+replaceNotes' x = return x
data HTMLState =
HTMLState
- { parserState :: ParserState
+ { parserState :: ParserState,
+ noteTable :: [(String, Blocks)]
}
-data HTMLLocal = HTMLLocal { quoteContext :: QuoteContext }
+data HTMLLocal = HTMLLocal { quoteContext :: QuoteContext
+ , inChapter :: Bool -- ^ Set if in chapter section
+ }
+
+setInChapter :: HTMLParser s a -> HTMLParser s a
+setInChapter = local (\s -> s {inChapter = True})
type HTMLParser s = ParserT s HTMLState (Reader HTMLLocal)
@@ -110,7 +126,11 @@ block = do
tr <- getOption readerTrace
pos <- getPosition
res <- choice
- [ pPara
+ [ eSwitch
+ , eSection
+ , mempty <$ eFootnote
+ , mempty <$ eTOC
+ , pPara
, pHeader
, pBlockQuote
, pCodeBlock
@@ -127,6 +147,64 @@ block = do
(take 60 $ show $ B.toList res)) (return ())
return res
+namespaces :: [(String, TagParser Blocks)]
+namespaces = [(mathMLNamespace, B.para <$> pMath True)]
+
+mathMLNamespace :: String
+mathMLNamespace = "http://www.w3.org/1998/Math/MathML"
+
+eSwitch :: TagParser Blocks
+eSwitch = try $ do
+ guardEnabled Ext_epub_html_exts
+ pSatisfy (~== TagOpen "switch" [])
+ cases <- getFirst . mconcat <$>
+ manyTill (First <$> (eCase <* skipMany pBlank) )
+ (lookAhead $ try $ pSatisfy (~== TagOpen "default" []))
+ skipMany pBlank
+ fallback <- pInTags "default" ( skipMany pBlank *> block <* skipMany pBlank )
+ skipMany pBlank
+ pSatisfy (~== TagClose "switch")
+ return (fromMaybe fallback cases)
+
+eCase :: TagParser (Maybe Blocks)
+eCase = do
+ skipMany pBlank
+ TagOpen _ attr <- lookAhead $ pSatisfy $ (~== TagOpen "case" [])
+ case (flip lookup namespaces) =<< lookup "required-namespace" attr of
+ Just p -> Just <$> (pInTags "case" (skipMany pBlank *> p <* skipMany pBlank))
+ Nothing -> Nothing <$ manyTill pAnyTag (pSatisfy (~== TagClose "case"))
+
+eFootnote :: TagParser ()
+eFootnote = try $ do
+ let notes = ["footnote", "rearnote"]
+ guardEnabled Ext_epub_html_exts
+ (TagOpen tag attr) <- lookAhead $ pAnyTag
+ guard (maybe False (flip elem notes) (lookup "type" attr))
+ let ident = fromMaybe "" (lookup "id" attr)
+ content <- pInTags tag block
+ addNote ident content
+
+addNote :: String -> Blocks -> TagParser ()
+addNote uid cont = updateState (\s -> s {noteTable = (uid, cont) : (noteTable s)})
+
+eNoteref :: TagParser Inlines
+eNoteref = try $ do
+ guardEnabled Ext_epub_html_exts
+ TagOpen tag attr <- lookAhead $ pAnyTag
+ guard (maybe False (== "noteref") (lookup "type" attr))
+ let ident = maybe "" (dropWhile (== '#')) (lookup "href" attr)
+ guard (not (null ident))
+ pInTags tag block
+ return $ B.rawInline "noteref" ident
+
+-- Strip TOC if there is one, better to generate again
+eTOC :: TagParser ()
+eTOC = try $ do
+ guardEnabled Ext_epub_html_exts
+ (TagOpen tag attr) <- lookAhead $ pAnyTag
+ guard (maybe False (== "toc") (lookup "type" attr))
+ void (pInTags tag block)
+
pList :: TagParser Blocks
pList = pBulletList <|> pOrderedList <|> pDefinitionList
@@ -139,9 +217,15 @@ pBulletList = try $ do
-- note: if they have an <ol> or <ul> not in scope of a <li>,
-- treat it as a list item, though it's not valid xhtml...
skipMany nonItem
- items <- manyTill (pInTags "li" block <* skipMany nonItem) (pCloses "ul")
+ items <- manyTill (pListItem nonItem) (pCloses "ul")
return $ B.bulletList $ map (fixPlains True) items
+pListItem :: TagParser a -> TagParser Blocks
+pListItem nonItem = do
+ TagOpen _ attr <- lookAhead $ pSatisfy (~== TagOpen "li" [])
+ let liDiv = maybe mempty (\x -> B.divWith (x, [], []) mempty) (lookup "id" attr)
+ (liDiv <>) <$> pInTags "li" block <* skipMany nonItem
+
pOrderedList :: TagParser Blocks
pOrderedList = try $ do
TagOpen _ attribs <- pSatisfy (~== TagOpen "ol" [])
@@ -167,7 +251,7 @@ pOrderedList = try $ do
-- note: if they have an <ol> or <ul> not in scope of a <li>,
-- treat it as a list item, though it's not valid xhtml...
skipMany nonItem
- items <- manyTill (pInTags "li" block <* skipMany nonItem) (pCloses "ol")
+ items <- manyTill (pListItem nonItem) (pCloses "ol")
return $ B.orderedListWith (start, style, DefaultDelim) $ map (fixPlains True) items
pDefinitionList :: TagParser Blocks
@@ -230,13 +314,35 @@ pHtmlBlock t = try $ do
contents <- manyTill pAnyTag (pSatisfy (~== TagClose t))
return $ renderTags' $ [open] ++ contents ++ [TagClose t]
+-- Sets chapter context
+eSection :: TagParser Blocks
+eSection = try $ do
+ let matchChapter as = maybe False (isInfixOf "chapter") (lookup "type" as)
+ let sectTag = tagOpen (`elem` sectioningContent) matchChapter
+ TagOpen tag _ <- lookAhead $ pSatisfy sectTag
+ setInChapter (pInTags tag block)
+
+headerLevel :: String -> TagParser Int
+headerLevel tagtype = do
+ let level = read (drop 1 tagtype)
+ (try $ do
+ guardEnabled Ext_epub_html_exts
+ asks inChapter >>= guard
+ return (level - 1))
+ <|>
+ return level
+
+
+
+
+
pHeader :: TagParser Blocks
pHeader = try $ do
TagOpen tagtype attr <- pSatisfy $
tagOpen (`elem` ["h1","h2","h3","h4","h5","h6"])
(const True)
let bodyTitle = TagOpen tagtype attr ~== TagOpen "h1" [("class","title")]
- let level = read (drop 1 tagtype)
+ level <- headerLevel tagtype
contents <- trimInlines . mconcat <$> manyTill inline (pCloses tagtype <|> eof)
let ident = fromMaybe "" $ lookup "id" attr
let classes = maybe [] words $ lookup "class" attr
@@ -336,7 +442,8 @@ pCodeBlock = try $ do
inline :: TagParser Inlines
inline = choice
- [ pTagText
+ [ eNoteref
+ , pTagText
, pQ
, pEmph
, pStrong
@@ -348,6 +455,7 @@ inline = choice
, pImage
, pCode
, pSpan
+ , pMath False
, pRawHtmlInline
]
@@ -416,12 +524,24 @@ pLineBreak = do
return B.linebreak
pLink :: TagParser Inlines
-pLink = try $ do
+pLink = pRelLink <|> pAnchor
+
+pAnchor :: TagParser Inlines
+pAnchor = try $ do
+ tag <- pSatisfy (tagOpenLit "a" (isJust . lookup "id"))
+ return $ B.spanWith (fromAttrib "id" tag , [], []) mempty
+
+pRelLink :: TagParser Inlines
+pRelLink = try $ do
tag <- pSatisfy (tagOpenLit "a" (isJust . lookup "href"))
let url = fromAttrib "href" tag
let title = fromAttrib "title" tag
+ let uid = fromAttrib "id" tag
+ let spanC = case uid of
+ [] -> id
+ s -> B.spanWith (s, [], [])
lab <- trimInlines . mconcat <$> manyTill inline (pCloses "a")
- return $ B.link (escapeURI url) title lab
+ return $ spanC $ B.link (escapeURI url) title lab
pImage :: TagParser Inlines
pImage = do
@@ -451,6 +571,22 @@ pRawHtmlInline = do
then return $ B.rawInline "html" $ renderTags' [result]
else return mempty
+mathMLToTeXMath :: String -> Either String String
+mathMLToTeXMath s = writeTeXMath <$> readMathML s
+
+pMath :: Bool -> TagParser Inlines
+pMath inCase = try $ do
+ open@(TagOpen _ attr) <- pSatisfy $ tagOpen (=="math") (const True)
+ unless (inCase) (guard (maybe False (== mathMLNamespace) (lookup "xmlns" attr)))
+ contents <- manyTill pAnyTag (pSatisfy (~== TagClose "math"))
+ let math = mathMLToTeXMath $
+ (renderTags $ [open] ++ contents ++ [TagClose "math"])
+ let constructor =
+ maybe B.math (\x -> if (x == "inline") then B.math else B.displayMath)
+ (lookup "display" attr)
+ return $ either (const mempty)
+ (\x -> if null x then mempty else constructor x) math
+
pInlinesInTags :: String -> (Inlines -> Inlines)
-> TagParser Inlines
pInlinesInTags tagtype f = extractSpaces f <$> pInTags tagtype inline
@@ -620,8 +756,11 @@ blockDocBookTags = ["calloutlist", "bibliolist", "glosslist", "itemizedlist",
"classsynopsis", "blockquote", "epigraph", "msgset",
"sidebar", "title"]
+epubTags :: [String]
+epubTags = ["case", "switch", "default"]
+
blockTags :: [String]
-blockTags = blockHtmlTags ++ blockDocBookTags
+blockTags = blockHtmlTags ++ blockDocBookTags ++ epubTags
isInlineTag :: Tag String -> Bool
isInlineTag t = tagOpen isInlineTagName (const True) t ||
@@ -720,9 +859,32 @@ htmlTag f = try $ do
mkAttr :: [(String, String)] -> Attr
mkAttr attr = (attribsId, attribsClasses, attribsKV)
where attribsId = fromMaybe "" $ lookup "id" attr
- attribsClasses = words $ fromMaybe "" $ lookup "class" attr
+ attribsClasses = (words $ fromMaybe "" $ lookup "class" attr) ++ epubTypes
attribsKV = filter (\(k,_) -> k /= "class" && k /= "id") attr
+ epubTypes = words $ fromMaybe "" $ lookup "epub:type" attr
+-- Strip namespace prefixes
+stripPrefixes :: [Tag String] -> [Tag String]
+stripPrefixes = map stripPrefix
+
+stripPrefix :: Tag String -> Tag String
+stripPrefix (TagOpen s as) =
+ TagOpen (stripPrefix' s) (map (stripPrefix' *** id) as)
+stripPrefix (TagClose s) = TagClose (stripPrefix' s)
+stripPrefix x = x
+
+stripPrefix' :: String -> String
+stripPrefix' s =
+ case span (/= ':') s of
+ (_, "") -> s
+ (_, (_:ts)) -> ts
+
+isSpace :: Char -> Bool
+isSpace ' ' = True
+isSpace '\t' = True
+isSpace '\n' = True
+isSpace '\r' = True
+isSpace _ = False
-- Instances
@@ -736,17 +898,39 @@ instance HasReaderOptions HTMLState where
extractReaderOptions = extractReaderOptions . parserState
instance Default HTMLState where
- def = HTMLState def
+ def = HTMLState def []
instance HasMeta HTMLState where
setMeta s b st = st {parserState = setMeta s b $ parserState st}
deleteMeta s st = st {parserState = deleteMeta s $ parserState st}
instance Default HTMLLocal where
- def = HTMLLocal NoQuote
+ def = HTMLLocal NoQuote False
instance HasLastStrPosition HTMLState where
setLastStrPos s st = st {parserState = setLastStrPos s (parserState st)}
getLastStrPos = getLastStrPos . parserState
+-- EPUB Specific
+--
+--
+sectioningContent :: [String]
+sectioningContent = ["article", "aside", "nav", "section"]
+
+{-
+groupingContent :: [String]
+groupingContent = ["p", "hr", "pre", "blockquote", "ol"
+ , "ul", "li", "dl", "dt", "dt", "dd"
+ , "figure", "figcaption", "div", "main"]
+
+
+
+types :: [(String, ([String], Int))]
+types = -- Document divisions
+ map (\s -> (s, (["section", "body"], 0)))
+ ["volume", "part", "chapter", "division"]
+ ++ -- Document section and components
+ [
+ ("abstract", ([], 0))]
+-}