Add FB2 reader (#4539)

author: Alexander <ilabdsf@gmail.com> 2018-04-26 22:33:18 +0300
committer: John MacFarlane <jgm@berkeley.edu> 2018-04-26 12:33:18 -0700
commit: 1927bc9aac0e822bd6179323e00fe38bee5a2cf3 (patch)
tree: 508b3527a8c09dfe72a790f82270691622af407d /src
parent: 5f0d407279e453107b28ee71fa0e45c5cc93090c (diff)
download: pandoc-1927bc9aac0e822bd6179323e00fe38bee5a2cf3.tar.gz
3 files changed, 406 insertions, 0 deletions
diff --git a/src/Text/Pandoc/App.hs b/src/Text/Pandoc/App.hs
index 99277d434..9a3e00c9f 100644
--- a/src/Text/Pandoc/App.hs
+++ b/src/Text/Pandoc/App.hs
@@ -738,6 +738,7 @@ defaultReaderName fallback (x:xs) =
     ".odt"      -> "odt"
     ".pdf"      -> "pdf"  -- so we get an "unknown reader" error
     ".doc"      -> "doc"  -- so we get an "unknown reader" error
+    ".fb2"      -> "fb2"
     _           -> defaultReaderName fallback xs
 
 -- Determine default writer based on output file extension
diff --git a/src/Text/Pandoc/Readers.hs b/src/Text/Pandoc/Readers.hs
index 680e7e0b0..7b7f92b35 100644
--- a/src/Text/Pandoc/Readers.hs
+++ b/src/Text/Pandoc/Readers.hs
@@ -65,6 +65,7 @@ module Text.Pandoc.Readers
   , readTxt2Tags
   , readEPUB
   , readMuse
+  , readFB2
   -- * Miscellaneous
   , getReader
   , getDefaultExtensions
@@ -86,6 +87,7 @@ import Text.Pandoc.Readers.Creole
 import Text.Pandoc.Readers.DocBook
 import Text.Pandoc.Readers.Docx
 import Text.Pandoc.Readers.EPUB
+import Text.Pandoc.Readers.FB2
 import Text.Pandoc.Readers.Haddock
 import Text.Pandoc.Readers.HTML (readHtml)
 import Text.Pandoc.Readers.JATS (readJATS)
@@ -143,6 +145,7 @@ readers = [ ("native"       , TextReader readNative)
            ,("t2t"          , TextReader readTxt2Tags)
            ,("epub"         , ByteStringReader readEPUB)
            ,("muse"         , TextReader readMuse)
+           ,("fb2"          , TextReader readFB2)
            ]
 
 -- | Retrieve reader, extensions based on formatSpec (format+extensions).
diff --git a/src/Text/Pandoc/Readers/FB2.hs b/src/Text/Pandoc/Readers/FB2.hs
new file mode 100644
index 000000000..99b71922f
--- /dev/null
+++ b/src/Text/Pandoc/Readers/FB2.hs
@@ -0,0 +1,402 @@
+{-# LANGUAGE FlexibleContexts #-}
+{-# LANGUAGE TupleSections #-}
+{-
+Copyright (C) 2018 Alexander Krotov <ilabdsf@gmail.com>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+-}
+
+{- |
+   Module      : Text.Pandoc.Readers.FB2
+   Copyright   : Copyright (C) 2018 Alexander Krotov
+   License     : GNU GPL, version 2 or above
+
+   Maintainer  : Alexander Krotov <ilabdsf@gmail.com>
+   Stability   : alpha
+   Portability : portable
+
+Conversion of FB2 to 'Pandoc' document.
+-}
+
+{-
+
+TODO:
+ - Tables
+ - Named styles
+ - Parse ID attribute for all elements that have it
+
+-}
+
+module Text.Pandoc.Readers.FB2 ( readFB2 ) where
+import Prelude
+import Control.Monad.Except (throwError)
+import Control.Monad.State.Strict
+import Data.ByteString.Lazy.Char8 ( pack )
+import Data.ByteString.Base64.Lazy
+import Data.Char (isSpace, toUpper)
+import Data.List (dropWhileEnd, intersperse)
+import Data.List.Split (splitOn)
+import Data.Text (Text)
+import Data.Default
+import Data.Maybe
+import Text.HTML.TagSoup.Entity (lookupEntity)
+import Text.Pandoc.Builder
+import Text.Pandoc.Class (PandocMonad, insertMedia)
+import Text.Pandoc.Error
+import Text.Pandoc.Options
+import Text.Pandoc.Shared (crFilter)
+import Text.XML.Light
+
+type FB2 m = StateT FB2State m
+
+data FB2State = FB2State{ fb2SectionLevel :: Int
+                        , fb2Meta :: Meta
+                        , fb2Authors :: [String]
+                        } deriving Show
+
+instance Default FB2State where
+  def = FB2State{ fb2SectionLevel = 1
+                , fb2Meta = mempty
+                , fb2Authors = []
+                }
+
+instance HasMeta FB2State where
+  setMeta field v s = s {fb2Meta = setMeta field v (fb2Meta s)}
+  deleteMeta field s = s {fb2Meta = deleteMeta field (fb2Meta s)}
+
+readFB2 :: PandocMonad m => ReaderOptions -> Text -> m Pandoc
+readFB2 _ inp  = do
+  (bs, st) <- runStateT (mapM parseBlock $ parseXML (crFilter inp)) def
+  let authors = if null $ fb2Authors st
+                then id
+                else setMeta "author" (map text $ reverse $ fb2Authors st)
+  pure $ Pandoc (authors $ fb2Meta st) (toList . mconcat $ bs)
+
+-- * Utility functions
+
+trim :: String -> String
+trim = dropWhileEnd isSpace . dropWhile isSpace
+
+removeHash :: String -> String
+removeHash ('#':xs) = xs
+removeHash xs = xs
+
+convertEntity :: String -> String
+convertEntity e = fromMaybe (map toUpper e) (lookupEntity e)
+
+parseInline :: PandocMonad m => Content -> FB2 m Inlines
+parseInline (Elem e) =
+  case qName $ elName e of
+    "strong" -> strong <$> parseStyleType e
+    "emphasis" -> emph <$> parseStyleType e
+    "style" -> parseNamedStyle e
+    "a" -> parseLinkType e
+    "strikethrough" -> strikeout <$> parseStyleType e
+    "sub" -> subscript <$> parseStyleType e
+    "sup" -> superscript <$> parseStyleType e
+    "code" -> pure $ code $ strContent e
+    "image" -> parseInlineImageElement e
+    name -> throwError $ PandocParseError ("Couldn't parse FB2 file: unexpected element " ++ name ++ ".")
+parseInline (Text x) = pure $ text $ cdData x
+parseInline (CRef r) = pure $ str $ convertEntity r
+
+parseSubtitle :: PandocMonad m => Element -> FB2 m Blocks
+parseSubtitle e = headerWith ("", ["unnumbered"], []) <$> gets fb2SectionLevel <*> parsePType e
+
+-- * Root element parser
+
+parseBlock :: PandocMonad m => Content -> FB2 m Blocks
+parseBlock (Elem e) =
+  case qName $ elName e of
+    "?xml"  -> pure mempty
+    "FictionBook" -> mconcat <$> mapM parseFictionBookChild (elChildren e)
+    name -> throwError $ PandocParseError ("Couldn't parse FB2 file: unexpected element " ++ name ++ ".")
+parseBlock _ = pure mempty
+
+-- | Parse a child of @\<FictionBook>@ element.
+parseFictionBookChild :: PandocMonad m => Element -> FB2 m Blocks
+parseFictionBookChild e =
+  case qName $ elName e of
+    "stylesheet" -> pure mempty -- stylesheet is ignored
+    "description" -> mempty <$ mapM_ parseDescriptionChild (elChildren e)
+    "body" -> mconcat <$> mapM parseBodyChild (elChildren e)
+    "binary" -> mempty <$ parseBinaryElement e
+    name -> throwError $ PandocParseError ("Couldn't parse FB2 file: unexpected element " ++ name ++ "in FictionBook.")
+
+-- | Parse a child of @\<description>@ element.
+parseDescriptionChild :: PandocMonad m => Element -> FB2 m ()
+parseDescriptionChild e =
+  case qName $ elName e of
+    "title-info" -> mapM_ parseTitleInfoChild (elChildren e)
+    "src-title-info" -> pure () -- ignore
+    "document-info" -> pure ()
+    "publish-info" -> pure ()
+    "custom-info" -> pure ()
+    "output" -> pure ()
+    name -> throwError $ PandocParseError ("Couldn't parse FB2 file: unexpected element " ++ name ++ "in description.")
+
+-- | Parse a child of @\<body>@ element.
+parseBodyChild :: PandocMonad m => Element -> FB2 m Blocks
+parseBodyChild e =
+  case qName $ elName e of
+    "image" -> parseImageElement e
+    "title" -> header <$> gets fb2SectionLevel <*> parseTitleType (elContent e)
+    "epigraph" -> parseEpigraph e
+    "section" -> parseSection e
+    name -> throwError $ PandocParseError ("Couldn't parse FB2 file: unexpected element " ++ name ++ " in body.")
+
+-- | Parse a @\<binary>@ element.
+parseBinaryElement :: PandocMonad m => Element -> FB2 m ()
+parseBinaryElement e =
+  case (findAttr (QName "id" Nothing Nothing) e, findAttr (QName "content-type" Nothing Nothing) e) of
+    (Nothing, _) -> throwError $ PandocParseError "<binary> element must have an \"id\" attribute"
+    (Just _, Nothing) -> throwError $ PandocParseError "<binary> element must have a \"content-type\" attribute"
+    (Just filename, contentType) -> insertMedia filename contentType (decodeLenient (pack (strContent e)))
+
+-- * Type parsers
+
+-- | Parse @authorType@
+parseAuthor :: PandocMonad m => Element -> FB2 m String
+parseAuthor e = unwords <$> mapM parseAuthorChild (elChildren e)
+
+parseAuthorChild :: PandocMonad m => Element -> FB2 m String
+parseAuthorChild e =
+  case qName $ elName e of
+    "first-name" -> pure $ strContent e
+    "middle-name" -> pure $ strContent e
+    "last-name" -> pure $ strContent e
+    "nickname" -> pure $ strContent e
+    "home-page" -> pure $ strContent e
+    "email" -> pure $ strContent e
+    name -> throwError $ PandocParseError ("Couldn't parse FB2 file: unexpected element " ++ name ++ " in author.")
+
+-- | Parse @titleType@
+parseTitle :: PandocMonad m => Element -> FB2 m Blocks
+parseTitle e = header <$> gets fb2SectionLevel <*> parseTitleType (elContent e)
+
+parseTitleType :: PandocMonad m => [Content] -> FB2 m Inlines
+parseTitleType c = mconcat . intersperse linebreak . catMaybes <$> mapM parseTitleContent c
+
+parseTitleContent :: PandocMonad m => Content -> FB2 m (Maybe Inlines)
+parseTitleContent (Elem e) =
+  case qName $ elName e of
+    "p" -> Just <$> parsePType e
+    "empty-line" -> pure $ Just mempty
+    _ -> pure mempty
+parseTitleContent _ = pure Nothing
+
+-- | Parse @imageType@
+parseImageElement :: PandocMonad m => Element -> FB2 m Blocks
+parseImageElement e =
+  case href of
+    Just src -> pure $ para $ imageWith (imgId, [], []) (removeHash src) title alt
+    Nothing -> throwError $ PandocParseError "Couldn't parse FB2 file: image without href."
+  where alt = maybe mempty str $ findAttr (QName "alt" Nothing Nothing) e
+        title = fromMaybe "" $ findAttr (QName "title" Nothing Nothing) e
+        imgId = fromMaybe "" $ findAttr (QName "id" Nothing Nothing) e
+        href = findAttr (QName "href" (Just "http://www.w3.org/1999/xlink") Nothing) e
+
+-- | Parse @pType@
+parsePType :: PandocMonad m => Element -> FB2 m Inlines
+parsePType = parseStyleType -- TODO add support for optional "id" and "style" attributes
+
+-- | Parse @citeType@
+parseCite :: PandocMonad m => Element -> FB2 m Blocks
+parseCite e = blockQuote . mconcat <$> mapM parseCiteChild (elChildren e)
+
+-- | Parse @citeType@ child
+parseCiteChild :: PandocMonad m => Element -> FB2 m Blocks
+parseCiteChild e =
+  case qName $ elName e of
+    "p" -> para <$> parsePType e
+    "poem" -> parsePoem e
+    "empty-line" -> pure horizontalRule
+    "subtitle" -> parseSubtitle e
+    "table" -> parseTable e
+    "text-author" -> para <$> parsePType e
+    name -> throwError $ PandocParseError ("Couldn't parse FB2 file: unexpected element " ++ name ++ " in cite.")
+
+-- | Parse @poemType@
+parsePoem :: PandocMonad m => Element -> FB2 m Blocks
+parsePoem e = mconcat <$> mapM parsePoemChild (elChildren e)
+
+parsePoemChild :: PandocMonad m => Element -> FB2 m Blocks
+parsePoemChild e =
+  case qName $ elName e of
+    "title" -> parseTitle e
+    "subtitle" -> parseSubtitle e
+    "epigraph" -> parseEpigraph e
+    "stanza" -> parseStanza e
+    "text-author" -> para <$> parsePType e
+    "date" -> pure $ para $ text $ strContent e
+    name -> throwError $ PandocParseError ("Couldn't parse FB2 file: unexpected element " ++ name ++ " in poem.")
+
+parseStanza :: PandocMonad m => Element -> FB2 m Blocks
+parseStanza e = fromList . joinLineBlocks . toList . mconcat <$> mapM parseStanzaChild (elChildren e)
+
+joinLineBlocks :: [Block] -> [Block]
+joinLineBlocks (LineBlock xs:LineBlock ys:zs) = joinLineBlocks (LineBlock (xs ++ ys) : zs)
+joinLineBlocks (x:xs) = x:joinLineBlocks xs
+joinLineBlocks [] = []
+
+parseStanzaChild :: PandocMonad m => Element -> FB2 m Blocks
+parseStanzaChild e =
+  case qName $ elName e of
+    "title" -> parseTitle e
+    "subtitle" -> parseSubtitle e
+    "v" -> lineBlock . (:[]) <$> parsePType e
+    name -> throwError $ PandocParseError ("Couldn't parse FB2 file: unexpected element " ++ name ++ " in stanza.")
+
+-- | Parse @epigraphType@
+parseEpigraph :: PandocMonad m => Element -> FB2 m Blocks
+parseEpigraph e =
+  divWith (divId, ["epigraph"], []) . mconcat <$> mapM parseEpigraphChild (elChildren e)
+  where divId = fromMaybe "" $ findAttr (QName "id" Nothing Nothing) e
+
+parseEpigraphChild :: PandocMonad m => Element -> FB2 m Blocks
+parseEpigraphChild e =
+  case qName $ elName e of
+    "p" -> para <$> parsePType e
+    "poem" -> parsePoem e
+    "cite" -> parseCite e
+    "empty-line" -> pure horizontalRule
+    "text-author" -> para <$> parsePType e
+    name -> throwError $ PandocParseError ("Couldn't parse FB2 file: unexpected element " ++ name ++ " in epigraph.")
+
+-- | Parse @annotationType@
+parseAnnotation :: PandocMonad m => Element -> FB2 m Blocks
+parseAnnotation e = mconcat <$> mapM parseAnnotationChild (elChildren e)
+
+parseAnnotationChild :: PandocMonad m => Element -> FB2 m Blocks
+parseAnnotationChild e =
+  case qName $ elName e of
+    "p" -> para <$> parsePType e
+    "poem" -> parsePoem e
+    "cite" -> parseCite e
+    "subtitle" -> parseSubtitle e
+    "table" -> parseTable e
+    "empty-line" -> pure horizontalRule
+    name -> throwError $ PandocParseError ("Couldn't parse FB2 file: unexpected element " ++ name ++ " in annotation.")
+
+-- | Parse @sectionType@
+parseSection :: PandocMonad m => Element -> FB2 m Blocks
+parseSection e = do
+  n <- gets fb2SectionLevel
+  modify $ \st -> st{ fb2SectionLevel = n + 1 }
+  let sectionId = fromMaybe "" $ findAttr (QName "id" Nothing Nothing) e
+  bs <- divWith (sectionId, ["section"], []) . mconcat <$> mapM parseSectionChild (elChildren e)
+  modify $ \st -> st{ fb2SectionLevel = n }
+  pure bs
+
+parseSectionChild :: PandocMonad m => Element -> FB2 m Blocks
+parseSectionChild e =
+  case qName $ elName e of
+    "title" -> parseBodyChild e
+    "epigraph" -> parseEpigraph e
+    "image" -> parseImageElement e
+    "annotation" -> parseAnnotation e
+    "poem" -> parsePoem e
+    "cite" -> parseCite e
+    "empty-line" -> pure horizontalRule
+    "table" -> parseTable e
+    "subtitle" -> parseSubtitle e
+    "p" -> para <$> parsePType e
+    "section" -> parseSection e
+    name -> throwError $ PandocParseError ("Couldn't parse FB2 file: unexpected element " ++ name ++ " in section.")
+
+-- | parse @styleType@
+parseStyleType :: PandocMonad m => Element -> FB2 m Inlines
+parseStyleType e = mconcat <$> mapM parseInline (elContent e)
+
+-- | Parse @namedStyleType@
+parseNamedStyle :: PandocMonad m => Element -> FB2 m Inlines
+parseNamedStyle e = do
+  content <- mconcat <$> mapM parseNamedStyleChild (elContent e)
+  let lang = maybeToList $ ("lang",) <$> findAttr (QName "lang" Nothing (Just "xml")) e
+  case findAttr (QName "name" Nothing Nothing) e of
+    Just name -> pure $ spanWith ("", [name], lang) content
+    Nothing -> throwError $ PandocParseError "Couldn't parse FB2 file: link without required name."
+
+parseNamedStyleChild :: PandocMonad m => Content -> FB2 m Inlines
+parseNamedStyleChild (Elem e) =
+  case qName (elName e) of
+    "strong" -> strong <$> parseStyleType e
+    "emphasis" -> emph <$> parseStyleType e
+    "style" -> parseNamedStyle e
+    "a" -> parseLinkType e
+    "strikethrough" -> strikeout <$> parseStyleType e
+    "sub" -> subscript <$> parseStyleType e
+    "sup" -> superscript <$> parseStyleType e
+    "code" -> pure $ code $ strContent e
+    "image" -> parseInlineImageElement e
+    name -> throwError $ PandocParseError ("Couldn't parse FB2 file: unexpected element " ++ name ++ ".")
+parseNamedStyleChild x = parseInline x
+
+-- | Parse @linkType@
+parseLinkType :: PandocMonad m => Element -> FB2 m Inlines
+parseLinkType e = do
+  content <- mconcat <$> mapM parseStyleLinkType (elContent e)
+  case findAttr (QName "href" (Just "http://www.w3.org/1999/xlink") Nothing) e of
+    Just href -> pure $ link href "" content
+    Nothing -> throwError $ PandocParseError "Couldn't parse FB2 file: link without required href."
+
+-- | Parse @styleLinkType@
+parseStyleLinkType :: PandocMonad m => Content -> FB2 m Inlines
+parseStyleLinkType x@(Elem e) =
+  case qName (elName e) of
+    "a" -> throwError $ PandocParseError "Couldn't parse FB2 file: links cannot be nested."
+    _ -> parseInline x
+parseStyleLinkType x = parseInline x
+
+-- | Parse @tableType@
+parseTable :: PandocMonad m => Element -> FB2 m Blocks
+parseTable _ = pure mempty -- TODO: tables are not supported yet
+
+-- | Parse @title-infoType@
+parseTitleInfoChild :: PandocMonad m => Element -> FB2 m ()
+parseTitleInfoChild e =
+  case qName (elName e) of
+    "genre" -> pure ()
+    "author" -> parseAuthor e >>= \author -> modify (\st -> st {fb2Authors = author:fb2Authors st})
+    "book-title" -> modify (setMeta "title" (text $ strContent e))
+    "annotation" -> parseAnnotation e >>= modify . setMeta "abstract"
+    "keywords" -> modify (setMeta "keywords" (map (MetaString . trim) $ splitOn "," $ strContent e))
+    "date" -> modify (setMeta "date" (text $ strContent e))
+    "coverpage" -> parseCoverPage e
+    "lang" -> pure ()
+    "src-lang" -> pure ()
+    "translator" -> pure ()
+    "sequence" -> pure ()
+    name -> throwError $ PandocParseError ("Couldn't parse FB2 file: unexpected element " ++ name ++ " in title-info.")
+
+parseCoverPage :: PandocMonad m => Element -> FB2 m ()
+parseCoverPage e =
+  case findChild (QName "image" (Just "http://www.gribuser.ru/xml/fictionbook/2.0") Nothing) e of
+    Just img -> case href of
+                  Just src -> modify (setMeta "cover-image" (MetaString $ removeHash src))
+                  Nothing -> pure ()
+                where href = findAttr (QName "href" (Just "http://www.w3.org/1999/xlink") Nothing) img
+    Nothing -> pure ()
+
+-- | Parse @inlineImageType@ element
+parseInlineImageElement :: PandocMonad m
+                        => Element
+                        -> FB2 m Inlines
+parseInlineImageElement e =
+  case href of
+    Just src -> pure $ imageWith ("", [], []) (removeHash src) "" alt
+    Nothing -> throwError $ PandocParseError "Couldn't parse FB2 file: inline image without href."
+  where alt = maybe mempty str $ findAttr (QName "alt" Nothing Nothing) e
+        href = findAttr (QName "href" (Just "http://www.w3.org/1999/xlink") Nothing) e
author	Alexander <ilabdsf@gmail.com>	2018-04-26 22:33:18 +0300
committer	John MacFarlane <jgm@berkeley.edu>	2018-04-26 12:33:18 -0700
commit	1927bc9aac0e822bd6179323e00fe38bee5a2cf3 (patch)
tree	508b3527a8c09dfe72a790f82270691622af407d /src
parent	5f0d407279e453107b28ee71fa0e45c5cc93090c (diff)
download	pandoc-1927bc9aac0e822bd6179323e00fe38bee5a2cf3.tar.gz