DocBook, JATS, OPML readers: performance optimization.

With the new XML parser, we can avoid the expensive tree normalization step we used to do. This gives a significant speed boost in docbook and JATS parsing (e.g. 9.7 to 6 ms).
author: John MacFarlane <jgm@berkeley.edu> 2021-02-18 21:24:31 -0800
committer: John MacFarlane <jgm@berkeley.edu> 2021-02-18 21:24:31 -0800
commit: 98d26c234579a06446a5bef1992ed77bac48a4ac (patch)
tree: 72713ec4feaedaf8d38744d97d219a9781857ff8 /src
parent: ef642e2bbc1f46056fc27560ceba791f27f2daa6 (diff)
download: pandoc-98d26c234579a06446a5bef1992ed77bac48a4ac.tar.gz
3 files changed, 8 insertions, 64 deletions
diff --git a/src/Text/Pandoc/Readers/DocBook.hs b/src/Text/Pandoc/Readers/DocBook.hs
index e201b54fe..d38b07864 100644
--- a/src/Text/Pandoc/Readers/DocBook.hs
+++ b/src/Text/Pandoc/Readers/DocBook.hs
@@ -12,7 +12,7 @@ Conversion of DocBook XML to 'Pandoc' document.
 -}
 module Text.Pandoc.Readers.DocBook ( readDocBook ) where
 import Control.Monad.State.Strict
-import Data.Char (isSpace, toUpper, isLetter)
+import Data.Char (isSpace, isLetter)
 import Data.Default
 import Data.Either (rights)
 import Data.Foldable (asum)
@@ -540,8 +540,9 @@ instance Default DBState where
 
 readDocBook :: PandocMonad m => ReaderOptions -> Text -> m Pandoc
 readDocBook _ inp = do
-  tree <- either (throwError . PandocXMLError "") (return . normalizeTree) $
-            parseXMLContents (TL.fromStrict . handleInstructions $ crFilter inp)
+  tree <- either (throwError . PandocXMLError "") return $
+            parseXMLContents
+              (TL.fromStrict . handleInstructions $ crFilter inp)
   (bs, st') <- flip runStateT (def{ dbContent = tree }) $ mapM parseBlock tree
   return $ Pandoc (dbMeta st') (toList . mconcat $ bs)
 
@@ -571,25 +572,6 @@ getFigure e = do
   modify $ \st -> st{ dbFigureTitle = mempty, dbFigureId = mempty }
   return res
 
--- normalize input, consolidating adjacent Text and CRef elements
-normalizeTree :: [Content] -> [Content]
-normalizeTree = everywhere (mkT go)
-  where go :: [Content] -> [Content]
-        go (Text (CData CDataRaw _ _):xs) = xs
-        go (Text (CData CDataText s1 z):Text (CData CDataText s2 _):xs) =
-           Text (CData CDataText (s1 <> s2) z):xs
-        go (Text (CData CDataText s1 z):CRef r:xs) =
-           Text (CData CDataText (s1 <> convertEntity r) z):xs
-        go (CRef r:Text (CData CDataText s1 z):xs) =
-             Text (CData CDataText (convertEntity r <> s1) z):xs
-        go (CRef r1:CRef r2:xs) =
-             Text (CData CDataText (convertEntity r1 <>
-                                    convertEntity r2) Nothing):xs
-        go xs = xs
-
-convertEntity :: Text -> Text
-convertEntity e = maybe (T.map toUpper e) T.pack (lookupEntity $ T.unpack e)
-
 -- convenience function to get an attribute value, defaulting to ""
 attrValue :: Text -> Element -> Text
 attrValue attr elt =
diff --git a/src/Text/Pandoc/Readers/JATS.hs b/src/Text/Pandoc/Readers/JATS.hs
index 5353f2001..602f3b4f2 100644
--- a/src/Text/Pandoc/Readers/JATS.hs
+++ b/src/Text/Pandoc/Readers/JATS.hs
@@ -54,30 +54,11 @@ instance Default JATSState where
 
 readJATS :: PandocMonad m => ReaderOptions -> Text -> m Pandoc
 readJATS _ inp = do
-  tree <- either (throwError . PandocXMLError "")
-                 (return . normalizeTree) $
+  tree <- either (throwError . PandocXMLError "") return $
             parseXMLContents (TL.fromStrict $ crFilter inp)
   (bs, st') <- flip runStateT (def{ jatsContent = tree }) $ mapM parseBlock tree
   return $ Pandoc (jatsMeta st') (toList . mconcat $ bs)
 
--- normalize input, consolidating adjacent Text and CRef elements
-normalizeTree :: [Content] -> [Content]
-normalizeTree = everywhere (mkT go)
-  where go :: [Content] -> [Content]
-        go (Text (CData CDataRaw _ _):xs) = xs
-        go (Text (CData CDataText s1 z):Text (CData CDataText s2 _):xs) =
-           Text (CData CDataText (s1 <> s2) z):xs
-        go (Text (CData CDataText s1 z):CRef r:xs) =
-           Text (CData CDataText (s1 <> convertEntity r) z):xs
-        go (CRef r:Text (CData CDataText s1 z):xs) =
-             Text (CData CDataText (convertEntity r <> s1) z):xs
-        go (CRef r1:CRef r2:xs) =
-             Text (CData CDataText (convertEntity r1 <> convertEntity r2) Nothing):xs
-        go xs = xs
-
-convertEntity :: Text -> Text
-convertEntity e = maybe (T.toUpper e) T.pack (lookupEntity $ T.unpack e)
-
 -- convenience function to get an attribute value, defaulting to ""
 attrValue :: Text -> Element -> Text
 attrValue attr =
@@ -454,7 +435,8 @@ elementToStr x = x
 
 parseInline :: PandocMonad m => Content -> JATS m Inlines
 parseInline (Text (CData _ s _)) = return $ text s
-parseInline (CRef ref) = return . text . convertEntity $ ref
+parseInline (CRef ref) = return $ maybe (text $ T.toUpper ref) (text . T.pack)
+                                $ lookupEntity (T.unpack ref)
 parseInline (Elem e) =
   case qName (elName e) of
         "italic" -> innerInlines emph
diff --git a/src/Text/Pandoc/Readers/OPML.hs b/src/Text/Pandoc/Readers/OPML.hs
index 184d5a63f..5f2ddb876 100644
--- a/src/Text/Pandoc/Readers/OPML.hs
+++ b/src/Text/Pandoc/Readers/OPML.hs
@@ -14,12 +14,10 @@ Conversion of OPML to 'Pandoc' document.
 module Text.Pandoc.Readers.OPML ( readOPML ) where
 import Control.Monad.State.Strict
 import Data.Default
-import Data.Generics
 import Data.Maybe (fromMaybe)
 import Data.Text (Text)
 import qualified Data.Text as T
 import qualified Data.Text.Lazy as TL
-import Text.HTML.TagSoup.Entity (lookupEntity)
 import Text.Pandoc.Builder
 import Text.Pandoc.Class.PandocMonad (PandocMonad)
 import Text.Pandoc.Options
@@ -53,7 +51,7 @@ readOPML opts inp  = do
   (bs, st') <- runStateT
                  (case parseXMLContents (TL.fromStrict (crFilter inp)) of
                      Left msg -> throwError $ PandocXMLError "" msg
-                     Right ns -> mapM parseBlock $ normalizeTree ns)
+                     Right ns -> mapM parseBlock ns)
                  def{ opmlOptions = opts }
   return $
     setTitle (opmlDocTitle st') $
@@ -61,24 +59,6 @@ readOPML opts inp  = do
     setDate (opmlDocDate st') $
     doc $ mconcat bs
 
--- normalize input, consolidating adjacent Text and CRef elements
-normalizeTree :: [Content] -> [Content]
-normalizeTree = everywhere (mkT go)
-  where go :: [Content] -> [Content]
-        go (Text (CData CDataRaw _ _):xs) = xs
-        go (Text (CData CDataText s1 z):Text (CData CDataText s2 _):xs) =
-           Text (CData CDataText (s1 <> s2) z):xs
-        go (Text (CData CDataText s1 z):CRef r:xs) =
-           Text (CData CDataText (s1 <> convertEntity r) z):xs
-        go (CRef r:Text (CData CDataText s1 z):xs) =
-             Text (CData CDataText (convertEntity r <> s1) z):xs
-        go (CRef r1:CRef r2:xs) =
-             Text (CData CDataText (convertEntity r1 <> convertEntity r2) Nothing):xs
-        go xs = xs
-
-convertEntity :: Text -> Text
-convertEntity e = maybe (T.toUpper e) T.pack (lookupEntity (T.unpack e))
-
 -- convenience function to get an attribute value, defaulting to ""
 attrValue :: Text -> Element -> Text
 attrValue attr elt =
author	John MacFarlane <jgm@berkeley.edu>	2021-02-18 21:24:31 -0800
committer	John MacFarlane <jgm@berkeley.edu>	2021-02-18 21:24:31 -0800
commit	98d26c234579a06446a5bef1992ed77bac48a4ac (patch)
tree	72713ec4feaedaf8d38744d97d219a9781857ff8 /src
parent	ef642e2bbc1f46056fc27560ceba791f27f2daa6 (diff)
download	pandoc-98d26c234579a06446a5bef1992ed77bac48a4ac.tar.gz