From 266e1977e03383f806867d9d3af86b5d55717830 Mon Sep 17 00:00:00 2001 From: Matthew Pickering Date: Wed, 30 Jul 2014 00:54:05 +0100 Subject: HTML Reader: Extended HTML Reader to recognise EPUB specific elements --- src/Text/Pandoc/Readers/HTML.hs | 206 ++++++++++++++++++++++++++++++++++------ 1 file changed, 178 insertions(+), 28 deletions(-) (limited to 'src/Text/Pandoc') diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs index 597156a5e..ef061df09 100644 --- a/src/Text/Pandoc/Readers/HTML.hs +++ b/src/Text/Pandoc/Readers/HTML.hs @@ -41,48 +41,64 @@ import Text.HTML.TagSoup import Text.HTML.TagSoup.Match import Text.Pandoc.Definition import qualified Text.Pandoc.Builder as B -import Text.Pandoc.Builder (HasMeta (..), Blocks, Inlines, trimInlines) -import Text.Pandoc.Shared -import Text.Pandoc.Options -import Text.Pandoc.Parsing -import Data.Maybe ( fromMaybe, isJust ) -import Data.List ( intercalate ) +import Text.Pandoc.Builder (Blocks, Inlines, trimInlines, HasMeta(..)) +import Text.Pandoc.Shared ( extractSpaces, renderTags' + , escapeURI, safeRead ) +import Text.Pandoc.Options (ReaderOptions(readerParseRaw, readerTrace) + , Extension (Ext_epub_html_exts)) +import Text.Pandoc.Parsing hiding ((<|>)) +import Text.Pandoc.Walk +import Data.Maybe ( fromMaybe, isJust) +import Data.List ( intercalate, isInfixOf ) import Data.Char ( isDigit ) -import Control.Monad ( liftM, guard, when, mzero ) -import Control.Applicative ( (<$>), (<$), (<*) ) -import Data.Monoid +import Control.Monad ( liftM, guard, when, mzero, void, unless ) +import Control.Arrow ((***)) +import Control.Applicative ( (<$>), (<$), (<*), (*>), (<|>)) +import Data.Monoid (mconcat, Monoid, mempty, (<>), First (..)) import Text.Printf (printf) import Debug.Trace (trace) -import Data.Default (Default (..)) -import Control.Monad.Reader (Reader, runReader, asks, local, ask) +import Text.TeXMath (readMathML, writeTeXMath) +import Data.Default (Default (..), def) +import Control.Monad.Reader (Reader,ask, asks, local, runReader) -isSpace :: Char -> Bool -isSpace ' ' = True -isSpace '\t' = True -isSpace '\n' = True -isSpace _ = False -- | Convert HTML-formatted string to 'Pandoc' document. readHtml :: ReaderOptions -- ^ Reader options -> String -- ^ String to parse (assumes @'\n'@ line endings) -> Pandoc readHtml opts inp = - case flip runReader def $ runParserT parseDoc (HTMLState def{ stateOptions = opts } ) "source" tags of + case flip runReader def $ runParserT parseDoc (HTMLState def{ stateOptions = opts } []) "source" tags of Left err' -> error $ "\nError at " ++ show err' Right result -> result - where tags = canonicalizeTags $ + where tags = stripPrefixes . canonicalizeTags $ parseTagsOptions parseOptions{ optTagPosition = True } inp parseDoc = do blocks <- (fixPlains False) . mconcat <$> manyTill block eof meta <- stateMeta . parserState <$> getState - return $ Pandoc meta (B.toList blocks) + bs' <- replaceNotes (B.toList blocks) + return $ Pandoc meta bs' + +replaceNotes :: [Block] -> TagParser [Block] +replaceNotes = walkM replaceNotes' + +replaceNotes' :: Inline -> TagParser Inline +replaceNotes' (RawInline (Format "noteref") ref) = maybe (Str "") (Note . B.toList) . lookup ref <$> getNotes + where + getNotes = noteTable <$> getState +replaceNotes' x = return x data HTMLState = HTMLState - { parserState :: ParserState + { parserState :: ParserState, + noteTable :: [(String, Blocks)] } -data HTMLLocal = HTMLLocal { quoteContext :: QuoteContext } +data HTMLLocal = HTMLLocal { quoteContext :: QuoteContext + , inChapter :: Bool -- ^ Set if in chapter section + } + +setInChapter :: HTMLParser s a -> HTMLParser s a +setInChapter = local (\s -> s {inChapter = True}) type HTMLParser s = ParserT s HTMLState (Reader HTMLLocal) @@ -110,7 +126,11 @@ block = do tr <- getOption readerTrace pos <- getPosition res <- choice - [ pPara + [ eSwitch + , eSection + , mempty <$ eFootnote + , mempty <$ eTOC + , pPara , pHeader , pBlockQuote , pCodeBlock @@ -127,6 +147,64 @@ block = do (take 60 $ show $ B.toList res)) (return ()) return res +namespaces :: [(String, TagParser Blocks)] +namespaces = [(mathMLNamespace, B.para <$> pMath True)] + +mathMLNamespace :: String +mathMLNamespace = "http://www.w3.org/1998/Math/MathML" + +eSwitch :: TagParser Blocks +eSwitch = try $ do + guardEnabled Ext_epub_html_exts + pSatisfy (~== TagOpen "switch" []) + cases <- getFirst . mconcat <$> + manyTill (First <$> (eCase <* skipMany pBlank) ) + (lookAhead $ try $ pSatisfy (~== TagOpen "default" [])) + skipMany pBlank + fallback <- pInTags "default" ( skipMany pBlank *> block <* skipMany pBlank ) + skipMany pBlank + pSatisfy (~== TagClose "switch") + return (fromMaybe fallback cases) + +eCase :: TagParser (Maybe Blocks) +eCase = do + skipMany pBlank + TagOpen _ attr <- lookAhead $ pSatisfy $ (~== TagOpen "case" []) + case (flip lookup namespaces) =<< lookup "required-namespace" attr of + Just p -> Just <$> (pInTags "case" (skipMany pBlank *> p <* skipMany pBlank)) + Nothing -> Nothing <$ manyTill pAnyTag (pSatisfy (~== TagClose "case")) + +eFootnote :: TagParser () +eFootnote = try $ do + let notes = ["footnote", "rearnote"] + guardEnabled Ext_epub_html_exts + (TagOpen tag attr) <- lookAhead $ pAnyTag + guard (maybe False (flip elem notes) (lookup "type" attr)) + let ident = fromMaybe "" (lookup "id" attr) + content <- pInTags tag block + addNote ident content + +addNote :: String -> Blocks -> TagParser () +addNote uid cont = updateState (\s -> s {noteTable = (uid, cont) : (noteTable s)}) + +eNoteref :: TagParser Inlines +eNoteref = try $ do + guardEnabled Ext_epub_html_exts + TagOpen tag attr <- lookAhead $ pAnyTag + guard (maybe False (== "noteref") (lookup "type" attr)) + let ident = maybe "" (dropWhile (== '#')) (lookup "href" attr) + guard (not (null ident)) + pInTags tag block + return $ B.rawInline "noteref" ident + +-- Strip TOC if there is one, better to generate again +eTOC :: TagParser () +eTOC = try $ do + guardEnabled Ext_epub_html_exts + (TagOpen tag attr) <- lookAhead $ pAnyTag + guard (maybe False (== "toc") (lookup "type" attr)) + void (pInTags tag block) + pList :: TagParser Blocks pList = pBulletList <|> pOrderedList <|> pDefinitionList @@ -230,13 +308,35 @@ pHtmlBlock t = try $ do contents <- manyTill pAnyTag (pSatisfy (~== TagClose t)) return $ renderTags' $ [open] ++ contents ++ [TagClose t] +-- Sets chapter context +eSection :: TagParser Blocks +eSection = try $ do + let matchChapter as = maybe False (isInfixOf "chapter") (lookup "type" as) + let sectTag = tagOpen (`elem` sectioningContent) matchChapter + TagOpen tag _ <- lookAhead $ pSatisfy sectTag + setInChapter (pInTags tag block) + +headerLevel :: String -> TagParser Int +headerLevel tagtype = do + let level = read (drop 1 tagtype) + (try $ do + guardEnabled Ext_epub_html_exts + asks inChapter >>= guard + return (level - 1)) + <|> + return level + + + + + pHeader :: TagParser Blocks pHeader = try $ do TagOpen tagtype attr <- pSatisfy $ tagOpen (`elem` ["h1","h2","h3","h4","h5","h6"]) (const True) let bodyTitle = TagOpen tagtype attr ~== TagOpen "h1" [("class","title")] - let level = read (drop 1 tagtype) + level <- headerLevel tagtype contents <- trimInlines . mconcat <$> manyTill inline (pCloses tagtype <|> eof) let ident = fromMaybe "" $ lookup "id" attr let classes = maybe [] words $ lookup "class" attr @@ -336,7 +436,8 @@ pCodeBlock = try $ do inline :: TagParser Inlines inline = choice - [ pTagText + [ eNoteref + , pTagText , pQ , pEmph , pStrong @@ -348,6 +449,7 @@ inline = choice , pImage , pCode , pSpan + , pMath False , pRawHtmlInline ] @@ -620,8 +722,11 @@ blockDocBookTags = ["calloutlist", "bibliolist", "glosslist", "itemizedlist", "classsynopsis", "blockquote", "epigraph", "msgset", "sidebar", "title"] +epubTags :: [String] +epubTags = ["case", "switch", "default"] + blockTags :: [String] -blockTags = blockHtmlTags ++ blockDocBookTags +blockTags = blockHtmlTags ++ blockDocBookTags ++ epubTags isInlineTag :: Tag String -> Bool isInlineTag t = tagOpen isInlineTagName (const True) t || @@ -720,9 +825,32 @@ htmlTag f = try $ do mkAttr :: [(String, String)] -> Attr mkAttr attr = (attribsId, attribsClasses, attribsKV) where attribsId = fromMaybe "" $ lookup "id" attr - attribsClasses = words $ fromMaybe "" $ lookup "class" attr + attribsClasses = (words $ fromMaybe "" $ lookup "class" attr) ++ epubTypes attribsKV = filter (\(k,_) -> k /= "class" && k /= "id") attr + epubTypes = words $ fromMaybe "" $ lookup "epub:type" attr + +-- Strip namespace prefixes +stripPrefixes :: [Tag String] -> [Tag String] +stripPrefixes = map stripPrefix +stripPrefix :: Tag String -> Tag String +stripPrefix (TagOpen s as) = + TagOpen (stripPrefix' s) (map (stripPrefix' *** id) as) +stripPrefix (TagClose s) = TagClose (stripPrefix' s) +stripPrefix x = x + +stripPrefix' :: String -> String +stripPrefix' s = + case span (/= ':') s of + (_, "") -> s + (_, (_:ts)) -> ts + +isSpace :: Char -> Bool +isSpace ' ' = True +isSpace '\t' = True +isSpace '\n' = True +isSpace '\r' = True +isSpace _ = False -- Instances @@ -736,17 +864,39 @@ instance HasReaderOptions HTMLState where extractReaderOptions = extractReaderOptions . parserState instance Default HTMLState where - def = HTMLState def + def = HTMLState def [] instance HasMeta HTMLState where setMeta s b st = st {parserState = setMeta s b $ parserState st} deleteMeta s st = st {parserState = deleteMeta s $ parserState st} instance Default HTMLLocal where - def = HTMLLocal NoQuote + def = HTMLLocal NoQuote False instance HasLastStrPosition HTMLState where setLastStrPos s st = st {parserState = setLastStrPos s (parserState st)} getLastStrPos = getLastStrPos . parserState +-- EPUB Specific +-- +-- +sectioningContent :: [String] +sectioningContent = ["article", "aside", "nav", "section"] + +{- +groupingContent :: [String] +groupingContent = ["p", "hr", "pre", "blockquote", "ol" + , "ul", "li", "dl", "dt", "dt", "dd" + , "figure", "figcaption", "div", "main"] + + + +types :: [(String, ([String], Int))] +types = -- Document divisions + map (\s -> (s, (["section", "body"], 0))) + ["volume", "part", "chapter", "division"] + ++ -- Document section and components + [ + ("abstract", ([], 0))] +-} -- cgit v1.2.3