aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorJohn MacFarlane <jgm@berkeley.edu>2021-02-18 21:24:31 -0800
committerJohn MacFarlane <jgm@berkeley.edu>2021-02-18 21:24:31 -0800
commit98d26c234579a06446a5bef1992ed77bac48a4ac (patch)
tree72713ec4feaedaf8d38744d97d219a9781857ff8 /src
parentef642e2bbc1f46056fc27560ceba791f27f2daa6 (diff)
downloadpandoc-98d26c234579a06446a5bef1992ed77bac48a4ac.tar.gz
DocBook, JATS, OPML readers: performance optimization.
With the new XML parser, we can avoid the expensive tree normalization step we used to do. This gives a significant speed boost in docbook and JATS parsing (e.g. 9.7 to 6 ms).
Diffstat (limited to 'src')
-rw-r--r--src/Text/Pandoc/Readers/DocBook.hs26
-rw-r--r--src/Text/Pandoc/Readers/JATS.hs24
-rw-r--r--src/Text/Pandoc/Readers/OPML.hs22
3 files changed, 8 insertions, 64 deletions
diff --git a/src/Text/Pandoc/Readers/DocBook.hs b/src/Text/Pandoc/Readers/DocBook.hs
index e201b54fe..d38b07864 100644
--- a/src/Text/Pandoc/Readers/DocBook.hs
+++ b/src/Text/Pandoc/Readers/DocBook.hs
@@ -12,7 +12,7 @@ Conversion of DocBook XML to 'Pandoc' document.
-}
module Text.Pandoc.Readers.DocBook ( readDocBook ) where
import Control.Monad.State.Strict
-import Data.Char (isSpace, toUpper, isLetter)
+import Data.Char (isSpace, isLetter)
import Data.Default
import Data.Either (rights)
import Data.Foldable (asum)
@@ -540,8 +540,9 @@ instance Default DBState where
readDocBook :: PandocMonad m => ReaderOptions -> Text -> m Pandoc
readDocBook _ inp = do
- tree <- either (throwError . PandocXMLError "") (return . normalizeTree) $
- parseXMLContents (TL.fromStrict . handleInstructions $ crFilter inp)
+ tree <- either (throwError . PandocXMLError "") return $
+ parseXMLContents
+ (TL.fromStrict . handleInstructions $ crFilter inp)
(bs, st') <- flip runStateT (def{ dbContent = tree }) $ mapM parseBlock tree
return $ Pandoc (dbMeta st') (toList . mconcat $ bs)
@@ -571,25 +572,6 @@ getFigure e = do
modify $ \st -> st{ dbFigureTitle = mempty, dbFigureId = mempty }
return res
--- normalize input, consolidating adjacent Text and CRef elements
-normalizeTree :: [Content] -> [Content]
-normalizeTree = everywhere (mkT go)
- where go :: [Content] -> [Content]
- go (Text (CData CDataRaw _ _):xs) = xs
- go (Text (CData CDataText s1 z):Text (CData CDataText s2 _):xs) =
- Text (CData CDataText (s1 <> s2) z):xs
- go (Text (CData CDataText s1 z):CRef r:xs) =
- Text (CData CDataText (s1 <> convertEntity r) z):xs
- go (CRef r:Text (CData CDataText s1 z):xs) =
- Text (CData CDataText (convertEntity r <> s1) z):xs
- go (CRef r1:CRef r2:xs) =
- Text (CData CDataText (convertEntity r1 <>
- convertEntity r2) Nothing):xs
- go xs = xs
-
-convertEntity :: Text -> Text
-convertEntity e = maybe (T.map toUpper e) T.pack (lookupEntity $ T.unpack e)
-
-- convenience function to get an attribute value, defaulting to ""
attrValue :: Text -> Element -> Text
attrValue attr elt =
diff --git a/src/Text/Pandoc/Readers/JATS.hs b/src/Text/Pandoc/Readers/JATS.hs
index 5353f2001..602f3b4f2 100644
--- a/src/Text/Pandoc/Readers/JATS.hs
+++ b/src/Text/Pandoc/Readers/JATS.hs
@@ -54,30 +54,11 @@ instance Default JATSState where
readJATS :: PandocMonad m => ReaderOptions -> Text -> m Pandoc
readJATS _ inp = do
- tree <- either (throwError . PandocXMLError "")
- (return . normalizeTree) $
+ tree <- either (throwError . PandocXMLError "") return $
parseXMLContents (TL.fromStrict $ crFilter inp)
(bs, st') <- flip runStateT (def{ jatsContent = tree }) $ mapM parseBlock tree
return $ Pandoc (jatsMeta st') (toList . mconcat $ bs)
--- normalize input, consolidating adjacent Text and CRef elements
-normalizeTree :: [Content] -> [Content]
-normalizeTree = everywhere (mkT go)
- where go :: [Content] -> [Content]
- go (Text (CData CDataRaw _ _):xs) = xs
- go (Text (CData CDataText s1 z):Text (CData CDataText s2 _):xs) =
- Text (CData CDataText (s1 <> s2) z):xs
- go (Text (CData CDataText s1 z):CRef r:xs) =
- Text (CData CDataText (s1 <> convertEntity r) z):xs
- go (CRef r:Text (CData CDataText s1 z):xs) =
- Text (CData CDataText (convertEntity r <> s1) z):xs
- go (CRef r1:CRef r2:xs) =
- Text (CData CDataText (convertEntity r1 <> convertEntity r2) Nothing):xs
- go xs = xs
-
-convertEntity :: Text -> Text
-convertEntity e = maybe (T.toUpper e) T.pack (lookupEntity $ T.unpack e)
-
-- convenience function to get an attribute value, defaulting to ""
attrValue :: Text -> Element -> Text
attrValue attr =
@@ -454,7 +435,8 @@ elementToStr x = x
parseInline :: PandocMonad m => Content -> JATS m Inlines
parseInline (Text (CData _ s _)) = return $ text s
-parseInline (CRef ref) = return . text . convertEntity $ ref
+parseInline (CRef ref) = return $ maybe (text $ T.toUpper ref) (text . T.pack)
+ $ lookupEntity (T.unpack ref)
parseInline (Elem e) =
case qName (elName e) of
"italic" -> innerInlines emph
diff --git a/src/Text/Pandoc/Readers/OPML.hs b/src/Text/Pandoc/Readers/OPML.hs
index 184d5a63f..5f2ddb876 100644
--- a/src/Text/Pandoc/Readers/OPML.hs
+++ b/src/Text/Pandoc/Readers/OPML.hs
@@ -14,12 +14,10 @@ Conversion of OPML to 'Pandoc' document.
module Text.Pandoc.Readers.OPML ( readOPML ) where
import Control.Monad.State.Strict
import Data.Default
-import Data.Generics
import Data.Maybe (fromMaybe)
import Data.Text (Text)
import qualified Data.Text as T
import qualified Data.Text.Lazy as TL
-import Text.HTML.TagSoup.Entity (lookupEntity)
import Text.Pandoc.Builder
import Text.Pandoc.Class.PandocMonad (PandocMonad)
import Text.Pandoc.Options
@@ -53,7 +51,7 @@ readOPML opts inp = do
(bs, st') <- runStateT
(case parseXMLContents (TL.fromStrict (crFilter inp)) of
Left msg -> throwError $ PandocXMLError "" msg
- Right ns -> mapM parseBlock $ normalizeTree ns)
+ Right ns -> mapM parseBlock ns)
def{ opmlOptions = opts }
return $
setTitle (opmlDocTitle st') $
@@ -61,24 +59,6 @@ readOPML opts inp = do
setDate (opmlDocDate st') $
doc $ mconcat bs
--- normalize input, consolidating adjacent Text and CRef elements
-normalizeTree :: [Content] -> [Content]
-normalizeTree = everywhere (mkT go)
- where go :: [Content] -> [Content]
- go (Text (CData CDataRaw _ _):xs) = xs
- go (Text (CData CDataText s1 z):Text (CData CDataText s2 _):xs) =
- Text (CData CDataText (s1 <> s2) z):xs
- go (Text (CData CDataText s1 z):CRef r:xs) =
- Text (CData CDataText (s1 <> convertEntity r) z):xs
- go (CRef r:Text (CData CDataText s1 z):xs) =
- Text (CData CDataText (convertEntity r <> s1) z):xs
- go (CRef r1:CRef r2:xs) =
- Text (CData CDataText (convertEntity r1 <> convertEntity r2) Nothing):xs
- go xs = xs
-
-convertEntity :: Text -> Text
-convertEntity e = maybe (T.toUpper e) T.pack (lookupEntity (T.unpack e))
-
-- convenience function to get an attribute value, defaulting to ""
attrValue :: Text -> Element -> Text
attrValue attr elt =