module Text.Pandoc.Readers.DocBook ( readDocBook ) where import Text.Pandoc.Parsing (ParserState(..)) import Text.Pandoc.Definition import Text.Pandoc.Builder import Text.XML.Light import Data.Monoid import Data.Char (isSpace) import Control.Monad.State import Control.Applicative ((<$>)) type DB = State DBState data DBState = DBState{ dbSectionLevel :: Int , dbDocTitle :: Inlines , dbDocAuthors :: [Inlines] , dbDocDate :: Inlines } deriving Show readDocBook :: ParserState -> String -> Pandoc readDocBook st inp = setTitle (dbDocTitle st') $ setAuthors (dbDocAuthors st') $ setDate (dbDocDate st') $ doc $ mconcat bs where (bs, st') = runState (mapM parseBlock $ parseXML inp) DBState{ dbSectionLevel = 0 , dbDocTitle = mempty , dbDocAuthors = [] , dbDocDate = mempty } parseBlock :: Content -> DB Blocks parseBlock (Text (CData CDataRaw _ _)) = return mempty -- DOCTYPE parseBlock (Text (CData _ s _)) = if all isSpace s then return mempty else return $ plain $ text s parseBlock (Elem e) = case qName (elName e) of "para" -> para <$> getInlines e "blockquote" -> blockQuote <$> innerBlocks "sect1" -> sect 1 "sect2" -> sect 2 "sect3" -> sect 3 "sect4" -> sect 4 "sect5" -> sect 5 "sect6" -> sect 6 "section" -> gets dbSectionLevel >>= sect . (+1) "articleinfo" -> getTitle >> getAuthors >> getDate >> return mempty "title" -> return mempty -- processed by sect "?xml" -> return mempty _ -> innerBlocks where innerBlocks = mconcat <$> (mapM parseBlock $ elContent e) getInlines e' = (trimInlines . mconcat) <$> (mapM parseInline $ elContent e') isTitle e' = qName (elName e') == "title" skipWhite (Text (CData _ s _):xs) | all isSpace s = skipWhite xs | otherwise = xs skipWhite xs = xs getTitle = case findChild (unqual "title") e of Just t -> do tit <- getInlines t modify $ \st -> st{dbDocTitle = tit} Nothing -> return () getAuthors = do auths <- mapM getInlines $ findChildren (unqual "author") e modify $ \st -> st{dbDocAuthors = auths} getDate = case findChild (unqual "date") e of Just t -> do dat <- getInlines t modify $ \st -> st{dbDocDate = dat} Nothing -> return () sect n = case skipWhite (elContent e) of ((Elem t):body) | isTitle t -> do h <- header n <$> (getInlines t) modify $ \st -> st{ dbSectionLevel = n } b <- mconcat <$> (mapM parseBlock body) modify $ \st -> st{ dbSectionLevel = n - 1 } return $ h <> b body -> do let h = header n mempty modify $ \st -> st{ dbSectionLevel = n } b <- mconcat <$> (mapM parseBlock body) modify $ \st -> st{ dbSectionLevel = n - 1 } return $ h <> b parseBlock (CRef _) = return mempty parseInline :: Content -> DB Inlines parseInline (Text (CData _ s _)) = return $ text s parseInline (Elem e) = case qName (elName e) of "subscript" -> subscript <$> innerInlines "superscript" -> superscript <$> innerInlines "emphasis" -> case lookupAttrBy (\attr -> qName attr == "role") (elAttribs e) of Just "strong" -> strong <$> innerInlines _ -> emph <$> innerInlines "footnote" -> (note . mconcat) <$> (mapM parseBlock $ elContent e) _ -> innerInlines where innerInlines = (trimInlines . mconcat) <$> (mapM parseInline $ elContent e) parseInline (CRef _) = return mempty