diff options
author | John MacFarlane <jgm@berkeley.edu> | 2021-02-18 21:24:31 -0800 |
---|---|---|
committer | John MacFarlane <jgm@berkeley.edu> | 2021-02-18 21:24:31 -0800 |
commit | 98d26c234579a06446a5bef1992ed77bac48a4ac (patch) | |
tree | 72713ec4feaedaf8d38744d97d219a9781857ff8 /src/Text/Pandoc | |
parent | ef642e2bbc1f46056fc27560ceba791f27f2daa6 (diff) | |
download | pandoc-98d26c234579a06446a5bef1992ed77bac48a4ac.tar.gz |
DocBook, JATS, OPML readers: performance optimization.
With the new XML parser, we can avoid the expensive tree
normalization step we used to do.
This gives a significant speed boost in docbook and JATS
parsing (e.g. 9.7 to 6 ms).
Diffstat (limited to 'src/Text/Pandoc')
-rw-r--r-- | src/Text/Pandoc/Readers/DocBook.hs | 26 | ||||
-rw-r--r-- | src/Text/Pandoc/Readers/JATS.hs | 24 | ||||
-rw-r--r-- | src/Text/Pandoc/Readers/OPML.hs | 22 |
3 files changed, 8 insertions, 64 deletions
diff --git a/src/Text/Pandoc/Readers/DocBook.hs b/src/Text/Pandoc/Readers/DocBook.hs index e201b54fe..d38b07864 100644 --- a/src/Text/Pandoc/Readers/DocBook.hs +++ b/src/Text/Pandoc/Readers/DocBook.hs @@ -12,7 +12,7 @@ Conversion of DocBook XML to 'Pandoc' document. -} module Text.Pandoc.Readers.DocBook ( readDocBook ) where import Control.Monad.State.Strict -import Data.Char (isSpace, toUpper, isLetter) +import Data.Char (isSpace, isLetter) import Data.Default import Data.Either (rights) import Data.Foldable (asum) @@ -540,8 +540,9 @@ instance Default DBState where readDocBook :: PandocMonad m => ReaderOptions -> Text -> m Pandoc readDocBook _ inp = do - tree <- either (throwError . PandocXMLError "") (return . normalizeTree) $ - parseXMLContents (TL.fromStrict . handleInstructions $ crFilter inp) + tree <- either (throwError . PandocXMLError "") return $ + parseXMLContents + (TL.fromStrict . handleInstructions $ crFilter inp) (bs, st') <- flip runStateT (def{ dbContent = tree }) $ mapM parseBlock tree return $ Pandoc (dbMeta st') (toList . mconcat $ bs) @@ -571,25 +572,6 @@ getFigure e = do modify $ \st -> st{ dbFigureTitle = mempty, dbFigureId = mempty } return res --- normalize input, consolidating adjacent Text and CRef elements -normalizeTree :: [Content] -> [Content] -normalizeTree = everywhere (mkT go) - where go :: [Content] -> [Content] - go (Text (CData CDataRaw _ _):xs) = xs - go (Text (CData CDataText s1 z):Text (CData CDataText s2 _):xs) = - Text (CData CDataText (s1 <> s2) z):xs - go (Text (CData CDataText s1 z):CRef r:xs) = - Text (CData CDataText (s1 <> convertEntity r) z):xs - go (CRef r:Text (CData CDataText s1 z):xs) = - Text (CData CDataText (convertEntity r <> s1) z):xs - go (CRef r1:CRef r2:xs) = - Text (CData CDataText (convertEntity r1 <> - convertEntity r2) Nothing):xs - go xs = xs - -convertEntity :: Text -> Text -convertEntity e = maybe (T.map toUpper e) T.pack (lookupEntity $ T.unpack e) - -- convenience function to get an attribute value, defaulting to "" attrValue :: Text -> Element -> Text attrValue attr elt = diff --git a/src/Text/Pandoc/Readers/JATS.hs b/src/Text/Pandoc/Readers/JATS.hs index 5353f2001..602f3b4f2 100644 --- a/src/Text/Pandoc/Readers/JATS.hs +++ b/src/Text/Pandoc/Readers/JATS.hs @@ -54,30 +54,11 @@ instance Default JATSState where readJATS :: PandocMonad m => ReaderOptions -> Text -> m Pandoc readJATS _ inp = do - tree <- either (throwError . PandocXMLError "") - (return . normalizeTree) $ + tree <- either (throwError . PandocXMLError "") return $ parseXMLContents (TL.fromStrict $ crFilter inp) (bs, st') <- flip runStateT (def{ jatsContent = tree }) $ mapM parseBlock tree return $ Pandoc (jatsMeta st') (toList . mconcat $ bs) --- normalize input, consolidating adjacent Text and CRef elements -normalizeTree :: [Content] -> [Content] -normalizeTree = everywhere (mkT go) - where go :: [Content] -> [Content] - go (Text (CData CDataRaw _ _):xs) = xs - go (Text (CData CDataText s1 z):Text (CData CDataText s2 _):xs) = - Text (CData CDataText (s1 <> s2) z):xs - go (Text (CData CDataText s1 z):CRef r:xs) = - Text (CData CDataText (s1 <> convertEntity r) z):xs - go (CRef r:Text (CData CDataText s1 z):xs) = - Text (CData CDataText (convertEntity r <> s1) z):xs - go (CRef r1:CRef r2:xs) = - Text (CData CDataText (convertEntity r1 <> convertEntity r2) Nothing):xs - go xs = xs - -convertEntity :: Text -> Text -convertEntity e = maybe (T.toUpper e) T.pack (lookupEntity $ T.unpack e) - -- convenience function to get an attribute value, defaulting to "" attrValue :: Text -> Element -> Text attrValue attr = @@ -454,7 +435,8 @@ elementToStr x = x parseInline :: PandocMonad m => Content -> JATS m Inlines parseInline (Text (CData _ s _)) = return $ text s -parseInline (CRef ref) = return . text . convertEntity $ ref +parseInline (CRef ref) = return $ maybe (text $ T.toUpper ref) (text . T.pack) + $ lookupEntity (T.unpack ref) parseInline (Elem e) = case qName (elName e) of "italic" -> innerInlines emph diff --git a/src/Text/Pandoc/Readers/OPML.hs b/src/Text/Pandoc/Readers/OPML.hs index 184d5a63f..5f2ddb876 100644 --- a/src/Text/Pandoc/Readers/OPML.hs +++ b/src/Text/Pandoc/Readers/OPML.hs @@ -14,12 +14,10 @@ Conversion of OPML to 'Pandoc' document. module Text.Pandoc.Readers.OPML ( readOPML ) where import Control.Monad.State.Strict import Data.Default -import Data.Generics import Data.Maybe (fromMaybe) import Data.Text (Text) import qualified Data.Text as T import qualified Data.Text.Lazy as TL -import Text.HTML.TagSoup.Entity (lookupEntity) import Text.Pandoc.Builder import Text.Pandoc.Class.PandocMonad (PandocMonad) import Text.Pandoc.Options @@ -53,7 +51,7 @@ readOPML opts inp = do (bs, st') <- runStateT (case parseXMLContents (TL.fromStrict (crFilter inp)) of Left msg -> throwError $ PandocXMLError "" msg - Right ns -> mapM parseBlock $ normalizeTree ns) + Right ns -> mapM parseBlock ns) def{ opmlOptions = opts } return $ setTitle (opmlDocTitle st') $ @@ -61,24 +59,6 @@ readOPML opts inp = do setDate (opmlDocDate st') $ doc $ mconcat bs --- normalize input, consolidating adjacent Text and CRef elements -normalizeTree :: [Content] -> [Content] -normalizeTree = everywhere (mkT go) - where go :: [Content] -> [Content] - go (Text (CData CDataRaw _ _):xs) = xs - go (Text (CData CDataText s1 z):Text (CData CDataText s2 _):xs) = - Text (CData CDataText (s1 <> s2) z):xs - go (Text (CData CDataText s1 z):CRef r:xs) = - Text (CData CDataText (s1 <> convertEntity r) z):xs - go (CRef r:Text (CData CDataText s1 z):xs) = - Text (CData CDataText (convertEntity r <> s1) z):xs - go (CRef r1:CRef r2:xs) = - Text (CData CDataText (convertEntity r1 <> convertEntity r2) Nothing):xs - go xs = xs - -convertEntity :: Text -> Text -convertEntity e = maybe (T.toUpper e) T.pack (lookupEntity (T.unpack e)) - -- convenience function to get an attribute value, defaulting to "" attrValue :: Text -> Element -> Text attrValue attr elt = |