HTML Reader: Extended HTML Reader to recognise EPUB specific elements

author: Matthew Pickering <matthewtpickering@gmail.com> 2014-07-30 00:54:05 +0100
committer: Matthew Pickering <matthewtpickering@gmail.com> 2014-07-31 21:39:49 +0100
commit: 266e1977e03383f806867d9d3af86b5d55717830 (patch)
tree: 81592a158bf6e949748b915356ec7d50434bdc11 /src/Text/Pandoc
parent: 002ae95d7a72313d979e9da4666cf61ebd009154 (diff)
download: pandoc-266e1977e03383f806867d9d3af86b5d55717830.tar.gz
1 files changed, 178 insertions, 28 deletions
diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs
index 597156a5e..ef061df09 100644
--- a/src/Text/Pandoc/Readers/HTML.hs
+++ b/src/Text/Pandoc/Readers/HTML.hs
@@ -41,48 +41,64 @@ import Text.HTML.TagSoup
 import Text.HTML.TagSoup.Match
 import Text.Pandoc.Definition
 import qualified Text.Pandoc.Builder as B
-import Text.Pandoc.Builder (HasMeta (..), Blocks, Inlines, trimInlines)
-import Text.Pandoc.Shared
-import Text.Pandoc.Options
-import Text.Pandoc.Parsing
-import Data.Maybe ( fromMaybe, isJust )
-import Data.List ( intercalate )
+import Text.Pandoc.Builder (Blocks, Inlines, trimInlines, HasMeta(..))
+import Text.Pandoc.Shared ( extractSpaces, renderTags'
+                          , escapeURI, safeRead )
+import Text.Pandoc.Options (ReaderOptions(readerParseRaw, readerTrace)
+                           , Extension (Ext_epub_html_exts))
+import Text.Pandoc.Parsing hiding ((<|>))
+import Text.Pandoc.Walk
+import Data.Maybe ( fromMaybe, isJust)
+import Data.List ( intercalate, isInfixOf )
 import Data.Char ( isDigit )
-import Control.Monad ( liftM, guard, when, mzero )
-import Control.Applicative ( (<$>), (<$), (<*) )
-import Data.Monoid
+import Control.Monad ( liftM, guard, when, mzero, void, unless )
+import Control.Arrow ((***))
+import Control.Applicative ( (<$>), (<$), (<*), (*>), (<|>))
+import Data.Monoid (mconcat, Monoid, mempty, (<>), First (..))
 import Text.Printf (printf)
 import Debug.Trace (trace)
-import Data.Default (Default (..))
-import Control.Monad.Reader (Reader, runReader, asks, local, ask)
+import Text.TeXMath (readMathML, writeTeXMath)
+import Data.Default (Default (..), def)
+import Control.Monad.Reader (Reader,ask, asks, local, runReader)
 
-isSpace :: Char -> Bool
-isSpace ' '  = True
-isSpace '\t' = True
-isSpace '\n' = True
-isSpace _    = False
 
 -- | Convert HTML-formatted string to 'Pandoc' document.
 readHtml :: ReaderOptions -- ^ Reader options
          -> String        -- ^ String to parse (assumes @'\n'@ line endings)
          -> Pandoc
 readHtml opts inp =
-  case flip runReader def $ runParserT parseDoc (HTMLState def{ stateOptions = opts } )  "source" tags of
+  case flip runReader def $ runParserT parseDoc (HTMLState def{ stateOptions = opts } [])  "source" tags of
           Left err'    -> error $ "\nError at " ++ show  err'
           Right result -> result
-    where tags = canonicalizeTags $
+    where tags = stripPrefixes . canonicalizeTags $
                    parseTagsOptions parseOptions{ optTagPosition = True } inp
           parseDoc = do
              blocks <- (fixPlains False) . mconcat <$> manyTill block eof
              meta <- stateMeta . parserState <$> getState
-             return $ Pandoc meta (B.toList blocks)
+             bs' <- replaceNotes (B.toList blocks)
+             return $ Pandoc meta bs'
+
+replaceNotes :: [Block] -> TagParser [Block]
+replaceNotes = walkM replaceNotes'
+
+replaceNotes' :: Inline -> TagParser Inline
+replaceNotes' (RawInline (Format "noteref") ref) = maybe (Str "") (Note . B.toList) . lookup ref <$> getNotes
+  where
+    getNotes = noteTable <$> getState
+replaceNotes' x = return x
 
 data HTMLState =
   HTMLState
-  {  parserState :: ParserState
+  {  parserState :: ParserState,
+     noteTable :: [(String, Blocks)]
   }
 
-data HTMLLocal = HTMLLocal { quoteContext :: QuoteContext }
+data HTMLLocal = HTMLLocal { quoteContext :: QuoteContext
+                           , inChapter :: Bool -- ^ Set if in chapter section
+                           }
+
+setInChapter :: HTMLParser s a -> HTMLParser s a
+setInChapter = local (\s -> s {inChapter = True})
 
 type HTMLParser s = ParserT s HTMLState (Reader HTMLLocal)
 
@@ -110,7 +126,11 @@ block = do
   tr <- getOption readerTrace
   pos <- getPosition
   res <- choice
-            [ pPara
+            [ eSwitch
+            , eSection
+            , mempty <$ eFootnote
+            , mempty <$ eTOC
+            , pPara
             , pHeader
             , pBlockQuote
             , pCodeBlock
@@ -127,6 +147,64 @@ block = do
              (take 60 $ show $ B.toList res)) (return ())
   return res
 
+namespaces :: [(String, TagParser Blocks)]
+namespaces = [(mathMLNamespace, B.para <$> pMath True)]
+
+mathMLNamespace :: String
+mathMLNamespace = "http://www.w3.org/1998/Math/MathML"
+
+eSwitch :: TagParser Blocks
+eSwitch = try $ do
+  guardEnabled Ext_epub_html_exts
+  pSatisfy (~== TagOpen "switch" [])
+  cases <- getFirst . mconcat <$>
+            manyTill (First <$> (eCase <* skipMany pBlank) )
+              (lookAhead $ try $ pSatisfy (~== TagOpen "default" []))
+  skipMany pBlank
+  fallback <- pInTags "default" ( skipMany pBlank *> block <* skipMany pBlank )
+  skipMany pBlank
+  pSatisfy (~== TagClose "switch")
+  return (fromMaybe fallback cases)
+
+eCase :: TagParser (Maybe Blocks)
+eCase = do
+  skipMany pBlank
+  TagOpen _ attr <- lookAhead $ pSatisfy $ (~== TagOpen "case" [])
+  case (flip lookup namespaces) =<< lookup "required-namespace" attr of
+    Just p -> Just <$> (pInTags "case" (skipMany pBlank *> p <* skipMany pBlank))
+    Nothing -> Nothing <$ manyTill pAnyTag (pSatisfy (~== TagClose "case"))
+
+eFootnote :: TagParser ()
+eFootnote = try $ do
+  let notes = ["footnote", "rearnote"]
+  guardEnabled Ext_epub_html_exts
+  (TagOpen tag attr) <- lookAhead $ pAnyTag
+  guard (maybe False (flip elem notes) (lookup "type" attr))
+  let ident = fromMaybe "" (lookup "id" attr)
+  content <- pInTags tag block
+  addNote ident content
+
+addNote :: String -> Blocks -> TagParser ()
+addNote uid cont = updateState (\s -> s {noteTable = (uid, cont) : (noteTable s)})
+
+eNoteref :: TagParser Inlines
+eNoteref = try $ do
+  guardEnabled Ext_epub_html_exts
+  TagOpen tag attr <- lookAhead $ pAnyTag
+  guard (maybe False (== "noteref") (lookup "type" attr))
+  let ident = maybe "" (dropWhile (== '#')) (lookup "href" attr)
+  guard (not (null ident))
+  pInTags tag block
+  return $ B.rawInline "noteref" ident
+
+-- Strip TOC if there is one, better to generate again
+eTOC :: TagParser ()
+eTOC = try $ do
+  guardEnabled Ext_epub_html_exts
+  (TagOpen tag attr) <- lookAhead $ pAnyTag
+  guard (maybe False (== "toc") (lookup "type" attr))
+  void (pInTags tag block)
+
 pList :: TagParser Blocks
 pList = pBulletList <|> pOrderedList <|> pDefinitionList
 
@@ -230,13 +308,35 @@ pHtmlBlock t = try $ do
   contents <- manyTill pAnyTag (pSatisfy (~== TagClose t))
   return $ renderTags' $ [open] ++ contents ++ [TagClose t]
 
+-- Sets chapter context
+eSection :: TagParser Blocks
+eSection = try $ do
+  let matchChapter as = maybe False (isInfixOf "chapter") (lookup "type" as)
+  let sectTag = tagOpen (`elem` sectioningContent) matchChapter
+  TagOpen tag _ <- lookAhead $ pSatisfy sectTag
+  setInChapter (pInTags tag block)
+
+headerLevel :: String -> TagParser Int
+headerLevel tagtype = do
+  let level = read (drop 1 tagtype)
+  (try $ do
+    guardEnabled Ext_epub_html_exts
+    asks inChapter >>= guard
+    return (level - 1))
+    <|>
+      return level
+
+
+
+
+
 pHeader :: TagParser Blocks
 pHeader = try $ do
   TagOpen tagtype attr <- pSatisfy $
                            tagOpen (`elem` ["h1","h2","h3","h4","h5","h6"])
                            (const True)
   let bodyTitle = TagOpen tagtype attr ~== TagOpen "h1" [("class","title")]
-  let level = read (drop 1 tagtype)
+  level <- headerLevel tagtype
   contents <- trimInlines . mconcat <$> manyTill inline (pCloses tagtype <|> eof)
   let ident = fromMaybe "" $ lookup "id" attr
   let classes = maybe [] words $ lookup "class" attr
@@ -336,7 +436,8 @@ pCodeBlock = try $ do
 
 inline :: TagParser Inlines
 inline = choice
-           [ pTagText
+           [ eNoteref
+           , pTagText
            , pQ
            , pEmph
            , pStrong
@@ -348,6 +449,7 @@ inline = choice
            , pImage
            , pCode
            , pSpan
+           , pMath False
            , pRawHtmlInline
            ]
 
@@ -620,8 +722,11 @@ blockDocBookTags = ["calloutlist", "bibliolist", "glosslist", "itemizedlist",
                     "classsynopsis", "blockquote", "epigraph", "msgset",
                     "sidebar", "title"]
 
+epubTags :: [String]
+epubTags = ["case", "switch", "default"]
+
 blockTags :: [String]
-blockTags = blockHtmlTags ++ blockDocBookTags
+blockTags = blockHtmlTags ++ blockDocBookTags ++ epubTags
 
 isInlineTag :: Tag String -> Bool
 isInlineTag t = tagOpen isInlineTagName (const True) t ||
@@ -720,9 +825,32 @@ htmlTag f = try $ do
 mkAttr :: [(String, String)] -> Attr
 mkAttr attr = (attribsId, attribsClasses, attribsKV)
   where attribsId = fromMaybe "" $ lookup "id" attr
-        attribsClasses = words $ fromMaybe "" $ lookup "class" attr
+        attribsClasses = (words $ fromMaybe "" $ lookup "class" attr) ++ epubTypes
         attribsKV = filter (\(k,_) -> k /= "class" && k /= "id") attr
+        epubTypes = words $ fromMaybe "" $ lookup "epub:type" attr
+
+-- Strip namespace prefixes
+stripPrefixes :: [Tag String] -> [Tag String]
+stripPrefixes = map stripPrefix
 
+stripPrefix :: Tag String -> Tag String
+stripPrefix (TagOpen s as) =
+    TagOpen (stripPrefix' s) (map (stripPrefix' *** id) as)
+stripPrefix (TagClose s) = TagClose (stripPrefix' s)
+stripPrefix x = x
+
+stripPrefix' :: String -> String
+stripPrefix' s =
+  case span (/= ':') s of
+    (_, "") -> s
+    (_, (_:ts)) -> ts
+
+isSpace :: Char -> Bool
+isSpace ' '  = True
+isSpace '\t' = True
+isSpace '\n' = True
+isSpace '\r' = True
+isSpace _    = False
 
 -- Instances
 
@@ -736,17 +864,39 @@ instance HasReaderOptions HTMLState where
     extractReaderOptions = extractReaderOptions . parserState
 
 instance Default HTMLState where
-  def = HTMLState def
+  def = HTMLState def []
 
 instance HasMeta HTMLState where
   setMeta s b st = st {parserState = setMeta s b $ parserState st}
   deleteMeta s st = st {parserState = deleteMeta s $ parserState st}
 
 instance Default HTMLLocal where
-  def = HTMLLocal NoQuote
+  def = HTMLLocal NoQuote False
 
 instance HasLastStrPosition HTMLState where
   setLastStrPos s st = st {parserState = setLastStrPos s (parserState st)}
   getLastStrPos = getLastStrPos . parserState
 
 
+-- EPUB Specific
+--
+--
+sectioningContent :: [String]
+sectioningContent = ["article", "aside", "nav", "section"]
+
+{-
+groupingContent :: [String]
+groupingContent = ["p", "hr", "pre", "blockquote", "ol"
+                  , "ul", "li", "dl", "dt", "dt", "dd"
+                  , "figure", "figcaption", "div", "main"]
+
+
+
+types :: [(String, ([String], Int))]
+types =  -- Document divisions
+   map (\s -> (s, (["section", "body"], 0)))
+    ["volume", "part", "chapter", "division"]
+  ++ -- Document section and components
+  [
+    ("abstract",  ([], 0))]
+-}
author	Matthew Pickering <matthewtpickering@gmail.com>	2014-07-30 00:54:05 +0100
committer	Matthew Pickering <matthewtpickering@gmail.com>	2014-07-31 21:39:49 +0100
commit	266e1977e03383f806867d9d3af86b5d55717830 (patch)
tree	81592a158bf6e949748b915356ec7d50434bdc11 /src/Text/Pandoc
parent	002ae95d7a72313d979e9da4666cf61ebd009154 (diff)
download	pandoc-266e1977e03383f806867d9d3af86b5d55717830.tar.gz