diff options
Diffstat (limited to 'src/Text/Pandoc/Readers/FB2.hs')
-rw-r--r-- | src/Text/Pandoc/Readers/FB2.hs | 123 |
1 files changed, 63 insertions, 60 deletions
diff --git a/src/Text/Pandoc/Readers/FB2.hs b/src/Text/Pandoc/Readers/FB2.hs index 0b25b9fed..6eed3c104 100644 --- a/src/Text/Pandoc/Readers/FB2.hs +++ b/src/Text/Pandoc/Readers/FB2.hs @@ -1,5 +1,6 @@ {-# LANGUAGE FlexibleContexts #-} {-# LANGUAGE TupleSections #-} +{-# LANGUAGE OverloadedStrings #-} {- | Module : Text.Pandoc.Readers.FB2 Copyright : Copyright (C) 2018-2019 Alexander Krotov @@ -27,12 +28,11 @@ import Control.Monad.Except (throwError) import Control.Monad.State.Strict import Data.ByteString.Lazy.Char8 ( pack ) import Data.ByteString.Base64.Lazy -import Data.Char (isSpace, toUpper) import Data.Functor -import Data.List (dropWhileEnd, intersperse) -import Data.List.Split (splitOn) +import Data.List (intersperse) import qualified Data.Map as M import Data.Text (Text) +import qualified Data.Text as T import Data.Default import Data.Maybe import Text.HTML.TagSoup.Entity (lookupEntity) @@ -48,8 +48,8 @@ type FB2 m = StateT FB2State m data FB2State = FB2State{ fb2SectionLevel :: Int , fb2Meta :: Meta - , fb2Authors :: [String] - , fb2Notes :: M.Map String Blocks + , fb2Authors :: [Text] + , fb2Notes :: M.Map Text Blocks } deriving Show instance Default FB2State where @@ -76,19 +76,20 @@ readFB2 _ inp = -- * Utility functions -trim :: String -> String -trim = dropWhileEnd isSpace . dropWhile isSpace +trim :: Text -> Text +trim = T.strip -removeHash :: String -> String -removeHash ('#':xs) = xs -removeHash xs = xs +removeHash :: Text -> Text +removeHash t = case T.uncons t of + Just ('#', xs) -> xs + _ -> t -convertEntity :: String -> String -convertEntity e = fromMaybe (map toUpper e) (lookupEntity e) +convertEntity :: String -> Text +convertEntity e = maybe (T.toUpper $ T.pack e) T.pack $ lookupEntity e parseInline :: PandocMonad m => Content -> FB2 m Inlines parseInline (Elem e) = - case qName $ elName e of + case T.pack $ qName $ elName e of "strong" -> strong <$> parseStyleType e "emphasis" -> emph <$> parseStyleType e "style" -> parseNamedStyle e @@ -96,12 +97,12 @@ parseInline (Elem e) = "strikethrough" -> strikeout <$> parseStyleType e "sub" -> subscript <$> parseStyleType e "sup" -> superscript <$> parseStyleType e - "code" -> pure $ code $ strContent e + "code" -> pure $ code $ T.pack $ strContent e "image" -> parseInlineImageElement e name -> do report $ IgnoredElement name pure mempty -parseInline (Text x) = pure $ text $ cdData x +parseInline (Text x) = pure $ text $ T.pack $ cdData x parseInline (CRef r) = pure $ str $ convertEntity r parseSubtitle :: PandocMonad m => Element -> FB2 m Blocks @@ -111,7 +112,7 @@ parseSubtitle e = headerWith ("", ["unnumbered"], []) <$> gets fb2SectionLevel < parseRootElement :: PandocMonad m => Element -> FB2 m Blocks parseRootElement e = - case qName $ elName e of + case T.pack $ qName $ elName e of "FictionBook" -> do -- Parse notes before parsing the rest of the content. case filterChild isNotesBody e of @@ -144,7 +145,7 @@ parseNote e = Just sectionId -> do content <- mconcat <$> mapM parseSectionChild (dropTitle $ elChildren e) oldNotes <- gets fb2Notes - modify $ \s -> s { fb2Notes = M.insert ("#" ++ sectionId) content oldNotes } + modify $ \s -> s { fb2Notes = M.insert ("#" <> T.pack sectionId) content oldNotes } pure () where isTitle x = qName (elName x) == "title" @@ -156,7 +157,7 @@ parseNote e = -- | Parse a child of @\<FictionBook>@ element. parseFictionBookChild :: PandocMonad m => Element -> FB2 m Blocks parseFictionBookChild e = - case qName $ elName e of + case T.pack $ qName $ elName e of "stylesheet" -> pure mempty -- stylesheet is ignored "description" -> mempty <$ mapM_ parseDescriptionChild (elChildren e) "body" -> if isNotesBody e @@ -168,7 +169,7 @@ parseFictionBookChild e = -- | Parse a child of @\<description>@ element. parseDescriptionChild :: PandocMonad m => Element -> FB2 m () parseDescriptionChild e = - case qName $ elName e of + case T.pack $ qName $ elName e of "title-info" -> mapM_ parseTitleInfoChild (elChildren e) "src-title-info" -> pure () -- ignore "document-info" -> pure () @@ -176,13 +177,13 @@ parseDescriptionChild e = "custom-info" -> pure () "output" -> pure () name -> do - report $ IgnoredElement $ name ++ " in description" + report $ IgnoredElement $ name <> " in description" pure mempty -- | Parse a child of @\<body>@ element. parseBodyChild :: PandocMonad m => Element -> FB2 m Blocks parseBodyChild e = - case qName $ elName e of + case T.pack $ qName $ elName e of "image" -> parseImageElement e "title" -> header <$> gets fb2SectionLevel <*> parseTitleType (elContent e) "epigraph" -> parseEpigraph e @@ -196,25 +197,25 @@ parseBinaryElement e = (Nothing, _) -> report $ IgnoredElement "binary without id attribute" (Just _, Nothing) -> report $ IgnoredElement "binary without content-type attribute" - (Just filename, contentType) -> insertMedia filename contentType (decodeLenient (pack (strContent e))) + (Just filename, contentType) -> insertMedia filename (T.pack <$> contentType) (decodeLenient (pack (strContent e))) -- * Type parsers -- | Parse @authorType@ -parseAuthor :: PandocMonad m => Element -> FB2 m String -parseAuthor e = unwords . catMaybes <$> mapM parseAuthorChild (elChildren e) +parseAuthor :: PandocMonad m => Element -> FB2 m Text +parseAuthor e = T.unwords . catMaybes <$> mapM parseAuthorChild (elChildren e) -parseAuthorChild :: PandocMonad m => Element -> FB2 m (Maybe String) +parseAuthorChild :: PandocMonad m => Element -> FB2 m (Maybe Text) parseAuthorChild e = - case qName $ elName e of - "first-name" -> pure $ Just $ strContent e - "middle-name" -> pure $ Just $ strContent e - "last-name" -> pure $ Just $ strContent e - "nickname" -> pure $ Just $ strContent e - "home-page" -> pure $ Just $ strContent e - "email" -> pure $ Just $ strContent e + case T.pack $ qName $ elName e of + "first-name" -> pure $ Just $ T.pack $ strContent e + "middle-name" -> pure $ Just $ T.pack $ strContent e + "last-name" -> pure $ Just $ T.pack $ strContent e + "nickname" -> pure $ Just $ T.pack $ strContent e + "home-page" -> pure $ Just $ T.pack $ strContent e + "email" -> pure $ Just $ T.pack $ strContent e name -> do - report $ IgnoredElement $ name ++ " in author" + report $ IgnoredElement $ name <> " in author" pure Nothing -- | Parse @titleType@ @@ -236,13 +237,13 @@ parseTitleContent _ = pure Nothing parseImageElement :: PandocMonad m => Element -> FB2 m Blocks parseImageElement e = case href of - Just src -> pure $ para $ imageWith (imgId, [], []) (removeHash src) title alt + Just src -> pure $ para $ imageWith (imgId, [], []) (removeHash $ T.pack src) title alt Nothing -> do report $ IgnoredElement " image without href" pure mempty - where alt = maybe mempty str $ findAttr (unqual "alt") e - title = fromMaybe "" $ findAttr (unqual "title") e - imgId = fromMaybe "" $ findAttr (unqual "id") e + where alt = maybe mempty (str . T.pack) $ findAttr (unqual "alt") e + title = maybe "" T.pack $ findAttr (unqual "title") e + imgId = maybe "" T.pack $ findAttr (unqual "id") e href = findAttr (QName "href" (Just "http://www.w3.org/1999/xlink") Nothing) e -- | Parse @pType@ @@ -256,7 +257,7 @@ parseCite e = blockQuote . mconcat <$> mapM parseCiteChild (elChildren e) -- | Parse @citeType@ child parseCiteChild :: PandocMonad m => Element -> FB2 m Blocks parseCiteChild e = - case qName $ elName e of + case T.pack $ qName $ elName e of "p" -> para <$> parsePType e "poem" -> parsePoem e "empty-line" -> pure horizontalRule @@ -271,13 +272,13 @@ parsePoem e = mconcat <$> mapM parsePoemChild (elChildren e) parsePoemChild :: PandocMonad m => Element -> FB2 m Blocks parsePoemChild e = - case qName $ elName e of + case T.pack $ qName $ elName e of "title" -> parseTitle e "subtitle" -> parseSubtitle e "epigraph" -> parseEpigraph e "stanza" -> parseStanza e "text-author" -> para <$> parsePType e - "date" -> pure $ para $ text $ strContent e + "date" -> pure $ para $ text $ T.pack $ strContent e name -> report (UnexpectedXmlElement name "poem") $> mempty parseStanza :: PandocMonad m => Element -> FB2 m Blocks @@ -290,7 +291,7 @@ joinLineBlocks [] = [] parseStanzaChild :: PandocMonad m => Element -> FB2 m Blocks parseStanzaChild e = - case qName $ elName e of + case T.pack $ qName $ elName e of "title" -> parseTitle e "subtitle" -> parseSubtitle e "v" -> lineBlock . (:[]) <$> parsePType e @@ -300,11 +301,11 @@ parseStanzaChild e = parseEpigraph :: PandocMonad m => Element -> FB2 m Blocks parseEpigraph e = divWith (divId, ["epigraph"], []) . mconcat <$> mapM parseEpigraphChild (elChildren e) - where divId = fromMaybe "" $ findAttr (unqual "id") e + where divId = maybe "" T.pack $ findAttr (unqual "id") e parseEpigraphChild :: PandocMonad m => Element -> FB2 m Blocks parseEpigraphChild e = - case qName $ elName e of + case T.pack $ qName $ elName e of "p" -> para <$> parsePType e "poem" -> parsePoem e "cite" -> parseCite e @@ -318,7 +319,7 @@ parseAnnotation e = mconcat <$> mapM parseAnnotationChild (elChildren e) parseAnnotationChild :: PandocMonad m => Element -> FB2 m Blocks parseAnnotationChild e = - case qName $ elName e of + case T.pack $ qName $ elName e of "p" -> para <$> parsePType e "poem" -> parsePoem e "cite" -> parseCite e @@ -332,14 +333,14 @@ parseSection :: PandocMonad m => Element -> FB2 m Blocks parseSection e = do n <- gets fb2SectionLevel modify $ \st -> st{ fb2SectionLevel = n + 1 } - let sectionId = fromMaybe "" $ findAttr (unqual "id") e + let sectionId = maybe "" T.pack $ findAttr (unqual "id") e bs <- divWith (sectionId, ["section"], []) . mconcat <$> mapM parseSectionChild (elChildren e) modify $ \st -> st{ fb2SectionLevel = n } pure bs parseSectionChild :: PandocMonad m => Element -> FB2 m Blocks parseSectionChild e = - case qName $ elName e of + case T.pack $ qName $ elName e of "title" -> parseBodyChild e "epigraph" -> parseEpigraph e "image" -> parseImageElement e @@ -361,16 +362,16 @@ parseStyleType e = mconcat <$> mapM parseInline (elContent e) parseNamedStyle :: PandocMonad m => Element -> FB2 m Inlines parseNamedStyle e = do content <- mconcat <$> mapM parseNamedStyleChild (elContent e) - let lang = maybeToList $ ("lang",) <$> findAttr (QName "lang" Nothing (Just "xml")) e + let lang = maybeToList $ ("lang",) . T.pack <$> findAttr (QName "lang" Nothing (Just "xml")) e case findAttr (unqual "name") e of - Just name -> pure $ spanWith ("", [name], lang) content + Just name -> pure $ spanWith ("", [T.pack name], lang) content Nothing -> do report $ IgnoredElement "link without required name" pure mempty parseNamedStyleChild :: PandocMonad m => Content -> FB2 m Inlines parseNamedStyleChild (Elem e) = - case qName (elName e) of + case T.pack $ qName (elName e) of "strong" -> strong <$> parseStyleType e "emphasis" -> emph <$> parseStyleType e "style" -> parseNamedStyle e @@ -378,10 +379,10 @@ parseNamedStyleChild (Elem e) = "strikethrough" -> strikeout <$> parseStyleType e "sub" -> subscript <$> parseStyleType e "sup" -> superscript <$> parseStyleType e - "code" -> pure $ code $ strContent e + "code" -> pure $ code $ T.pack $ strContent e "image" -> parseInlineImageElement e name -> do - report $ IgnoredElement $ name ++ " in style" + report $ IgnoredElement $ name <> " in style" pure mempty parseNamedStyleChild x = parseInline x @@ -390,7 +391,7 @@ parseLinkType :: PandocMonad m => Element -> FB2 m Inlines parseLinkType e = do content <- mconcat <$> mapM parseStyleLinkType (elContent e) notes <- gets fb2Notes - case findAttr (QName "href" (Just "http://www.w3.org/1999/xlink") Nothing) e of + case T.pack <$> findAttr (QName "href" (Just "http://www.w3.org/1999/xlink") Nothing) e of Just href -> case findAttr (QName "type" Nothing Nothing) e of Just "note" -> case M.lookup href notes of Nothing -> pure $ link href "" content @@ -417,19 +418,21 @@ parseTable _ = pure mempty -- TODO: tables are not supported yet -- | Parse @title-infoType@ parseTitleInfoChild :: PandocMonad m => Element -> FB2 m () parseTitleInfoChild e = - case qName (elName e) of + case T.pack $ qName (elName e) of "genre" -> pure () "author" -> parseAuthor e >>= \author -> modify (\st -> st {fb2Authors = author:fb2Authors st}) - "book-title" -> modify (setMeta "title" (text $ strContent e)) + "book-title" -> modify (setMeta "title" (text $ T.pack $ strContent e)) "annotation" -> parseAnnotation e >>= modify . setMeta "abstract" - "keywords" -> modify (setMeta "keywords" (map (MetaString . trim) $ splitOn "," $ strContent e)) - "date" -> modify (setMeta "date" (text $ strContent e)) + "keywords" -> modify (setMeta "keywords" (map (MetaString . trim) $ T.splitOn "," + $ T.pack + $ strContent e)) + "date" -> modify (setMeta "date" (text $ T.pack $ strContent e)) "coverpage" -> parseCoverPage e "lang" -> pure () "src-lang" -> pure () "translator" -> pure () "sequence" -> pure () - name -> report $ IgnoredElement $ name ++ " in title-info" + name -> report $ IgnoredElement $ name <> " in title-info" parseCoverPage :: PandocMonad m => Element -> FB2 m () parseCoverPage e = @@ -437,7 +440,7 @@ parseCoverPage e = Just img -> case href of Just src -> modify (setMeta "cover-image" (MetaString $ removeHash src)) Nothing -> pure () - where href = findAttr (QName "href" (Just "http://www.w3.org/1999/xlink") Nothing) img + where href = T.pack <$> findAttr (QName "href" (Just "http://www.w3.org/1999/xlink") Nothing) img Nothing -> pure () -- | Parse @inlineImageType@ element @@ -450,5 +453,5 @@ parseInlineImageElement e = Nothing -> do report $ IgnoredElement "inline image without href" pure mempty - where alt = maybe mempty str $ findAttr (unqual "alt") e - href = findAttr (QName "href" (Just "http://www.w3.org/1999/xlink") Nothing) e + where alt = maybe mempty (str . T.pack) $ findAttr (unqual "alt") e + href = T.pack <$> findAttr (QName "href" (Just "http://www.w3.org/1999/xlink") Nothing) e |