aboutsummaryrefslogtreecommitdiff
path: root/src/Text/Pandoc/Writers
diff options
context:
space:
mode:
authorJohn MacFarlane <jgm@berkeley.edu>2021-02-08 23:35:19 -0800
committerJohn MacFarlane <jgm@berkeley.edu>2021-02-10 22:04:11 -0800
commit8ca191604dcd13af27c11d2da225da646ebce6fc (patch)
tree9663e0b951ecfce7efd08efd79dcd4b957601b85 /src/Text/Pandoc/Writers
parent9994ad977d03e97baadf680793c58a66ba7e77e9 (diff)
downloadpandoc-8ca191604dcd13af27c11d2da225da646ebce6fc.tar.gz
Add new unexported module T.P.XMLParser.
This exports functions that uses xml-conduit's parser to produce an xml-light Element or [Content]. This allows existing pandoc code to use a better parser without much modification. The new parser is used in all places where xml-light's parser was previously used. Benchmarks show a significant performance improvement in parsing XML-based formats (especially ODT and FB2). Note that the xml-light types use String, so the conversion from xml-conduit types involves a lot of extra allocation. It would be desirable to avoid that in the future by gradually switching to using xml-conduit directly. This can be done module by module. The new parser also reports errors, which we report when possible. A new constructor PandocXMLError has been added to PandocError in T.P.Error [API change]. Closes #7091, which was the main stimulus. These changes revealed the need for some changes in the tests. The docbook-reader.docbook test lacked definitions for the entities it used; these have been added. And the docx golden tests have been updated, because the new parser does not preserve the order of attributes. Add entity defs to docbook-reader.docbook. Update golden tests for docx.
Diffstat (limited to 'src/Text/Pandoc/Writers')
-rw-r--r--src/Text/Pandoc/Writers/EPUB.hs13
-rw-r--r--src/Text/Pandoc/Writers/FB2.hs11
-rw-r--r--src/Text/Pandoc/Writers/ODT.hs29
-rw-r--r--src/Text/Pandoc/Writers/OOXML.hs9
-rw-r--r--src/Text/Pandoc/Writers/Powerpoint/Output.hs4
5 files changed, 42 insertions, 24 deletions
diff --git a/src/Text/Pandoc/Writers/EPUB.hs b/src/Text/Pandoc/Writers/EPUB.hs
index 1f16f6772..e99fa2567 100644
--- a/src/Text/Pandoc/Writers/EPUB.hs
+++ b/src/Text/Pandoc/Writers/EPUB.hs
@@ -55,8 +55,9 @@ import Text.Pandoc.Walk (query, walk, walkM)
import Text.Pandoc.Writers.HTML (writeHtmlStringForEPUB)
import Text.Printf (printf)
import Text.XML.Light (Attr (..), Element (..), Node (..), QName (..),
- add_attrs, lookupAttr, node, onlyElems, parseXML,
+ add_attrs, lookupAttr, node, onlyElems,
ppElement, showElement, strContent, unode, unqual)
+import Text.Pandoc.XMLParser (parseXMLContents)
import Text.Pandoc.XML (escapeStringForXML)
import Text.DocTemplates (FromContext(lookupContext), Context(..),
ToContext(toVal), Val(..))
@@ -160,7 +161,12 @@ mkEntry path content = do
getEPUBMetadata :: PandocMonad m => WriterOptions -> Meta -> E m EPUBMetadata
getEPUBMetadata opts meta = do
let md = metadataFromMeta opts meta
- let elts = maybe [] (onlyElems . parseXML) $ writerEpubMetadata opts
+ elts <- case writerEpubMetadata opts of
+ Nothing -> return []
+ Just t -> case parseXMLContents (TL.fromStrict t) of
+ Left msg -> throwError $
+ PandocXMLError "epub metadata" msg
+ Right ns -> return (onlyElems ns)
let md' = foldr addMetadataFromXML md elts
let addIdentifier m =
if null (epubIdentifier m)
@@ -836,7 +842,8 @@ pandocToEPUB version opts doc = do
: case subs of
[] -> []
(_:_) -> [unode "ol" ! [("class","toc")] $ subs]
- where titElements = parseXML titRendered
+ where titElements = either (const []) id $
+ parseXMLContents (TL.fromStrict titRendered)
titRendered = case P.runPure
(writeHtmlStringForEPUB version
opts{ writerTemplate = Nothing
diff --git a/src/Text/Pandoc/Writers/FB2.hs b/src/Text/Pandoc/Writers/FB2.hs
index 25b1f28d1..9334d6e9a 100644
--- a/src/Text/Pandoc/Writers/FB2.hs
+++ b/src/Text/Pandoc/Writers/FB2.hs
@@ -19,7 +19,7 @@ FictionBook is an XML-based e-book format. For more information see:
module Text.Pandoc.Writers.FB2 (writeFB2) where
import Control.Monad (zipWithM)
-import Control.Monad.Except (catchError)
+import Control.Monad.Except (catchError, throwError)
import Control.Monad.State.Strict (StateT, evalStateT, get, gets, lift, liftM, modify)
import Data.ByteString.Base64 (encode)
import Data.Char (isAscii, isControl, isSpace)
@@ -27,16 +27,18 @@ import Data.Either (lefts, rights)
import Data.List (intercalate)
import Data.Text (Text, pack)
import qualified Data.Text as T
+import qualified Data.Text.Lazy as TL
import qualified Data.Text.Encoding as TE
import Network.HTTP (urlEncode)
import Text.XML.Light
import qualified Text.XML.Light as X
import qualified Text.XML.Light.Cursor as XC
-import qualified Text.XML.Light.Input as XI
+import Text.Pandoc.XMLParser (parseXMLContents)
import Text.Pandoc.Class.PandocMonad (PandocMonad, report)
import qualified Text.Pandoc.Class.PandocMonad as P
import Text.Pandoc.Definition
+import Text.Pandoc.Error (PandocError(..))
import Text.Pandoc.Logging
import Text.Pandoc.Options (HTMLMathMethod (..), WriterOptions (..), def)
import Text.Pandoc.Shared (capitalize, isURI, orderedListMarkers,
@@ -307,7 +309,10 @@ blockToXml (CodeBlock _ s) = return . spaceBeforeAfter .
map (el "p" . el "code" . T.unpack) . T.lines $ s
blockToXml (RawBlock f str) =
if f == Format "fb2"
- then return $ XI.parseXML str
+ then
+ case parseXMLContents (TL.fromStrict str) of
+ Left msg -> throwError $ PandocXMLError "" msg
+ Right nds -> return nds
else return []
blockToXml (Div _ bs) = cMapM blockToXml bs
blockToXml (BlockQuote bs) = list . el "cite" <$> cMapM blockToXml bs
diff --git a/src/Text/Pandoc/Writers/ODT.hs b/src/Text/Pandoc/Writers/ODT.hs
index 05dfad5eb..a32ff618c 100644
--- a/src/Text/Pandoc/Writers/ODT.hs
+++ b/src/Text/Pandoc/Writers/ODT.hs
@@ -13,7 +13,7 @@ Conversion of 'Pandoc' documents to ODT.
-}
module Text.Pandoc.Writers.ODT ( writeODT ) where
import Codec.Archive.Zip
-import Control.Monad.Except (catchError)
+import Control.Monad.Except (catchError, throwError)
import Control.Monad.State.Strict
import qualified Data.ByteString.Lazy as B
import Data.Generics (everywhere', mkT)
@@ -27,6 +27,7 @@ import Text.Pandoc.BCP47 (Lang (..), getLang, renderLang)
import Text.Pandoc.Class.PandocMonad (PandocMonad, report, toLang)
import qualified Text.Pandoc.Class.PandocMonad as P
import Text.Pandoc.Definition
+import Text.Pandoc.Error (PandocError(..))
import Text.Pandoc.ImageSize
import Text.Pandoc.Logging
import Text.Pandoc.MIME (extensionFromMimeType, getMimeType)
@@ -35,10 +36,11 @@ import Text.DocLayout
import Text.Pandoc.Shared (stringify, pandocVersion, tshow)
import Text.Pandoc.Writers.Shared (lookupMetaString, lookupMetaBlocks,
fixDisplayMath)
-import Text.Pandoc.UTF8 (fromStringLazy, fromTextLazy, toStringLazy)
+import Text.Pandoc.UTF8 (fromStringLazy, fromTextLazy, toTextLazy)
import Text.Pandoc.Walk
import Text.Pandoc.Writers.OpenDocument (writeOpenDocument)
import Text.Pandoc.XML
+import Text.Pandoc.XMLParser (parseXMLElement)
import Text.TeXMath
import Text.XML.Light
@@ -172,17 +174,18 @@ updateStyleWithLang :: PandocMonad m => Maybe Lang -> Archive -> O m Archive
updateStyleWithLang Nothing arch = return arch
updateStyleWithLang (Just lang) arch = do
epochtime <- floor `fmap` lift P.getPOSIXTime
- return arch{ zEntries = [if eRelativePath e == "styles.xml"
- then case parseXMLDoc
- (toStringLazy (fromEntry e)) of
- Nothing -> e
- Just d ->
- toEntry "styles.xml" epochtime
- ( fromStringLazy
- . ppTopElement
- . addLang lang $ d )
- else e
- | e <- zEntries arch] }
+ entries <- mapM (\e -> if eRelativePath e == "styles.xml"
+ then case parseXMLElement
+ (toTextLazy (fromEntry e)) of
+ Left msg -> throwError $
+ PandocXMLError "styles.xml" msg
+ Right d -> return $
+ toEntry "styles.xml" epochtime
+ ( fromStringLazy
+ . ppTopElement
+ . addLang lang $ d )
+ else return e) (zEntries arch)
+ return arch{ zEntries = entries }
addLang :: Lang -> Element -> Element
addLang lang = everywhere' (mkT updateLangAttr)
diff --git a/src/Text/Pandoc/Writers/OOXML.hs b/src/Text/Pandoc/Writers/OOXML.hs
index 3ac007f4e..8f60e70d5 100644
--- a/src/Text/Pandoc/Writers/OOXML.hs
+++ b/src/Text/Pandoc/Writers/OOXML.hs
@@ -35,6 +35,7 @@ import qualified Data.Text as T
import Text.Pandoc.Class.PandocMonad (PandocMonad)
import qualified Text.Pandoc.UTF8 as UTF8
import Text.XML.Light as XML
+import Text.Pandoc.XMLParser (parseXMLElement)
mknode :: Node t => String -> [(String,String)] -> t -> Element
mknode s attrs =
@@ -62,10 +63,10 @@ parseXml refArchive distArchive relpath =
findEntryByPath relpath distArchive of
Nothing -> throwError $ PandocSomeError $
T.pack relpath <> " missing in reference file"
- Just e -> case parseXMLDoc . UTF8.toStringLazy . fromEntry $ e of
- Nothing -> throwError $ PandocSomeError $
- T.pack relpath <> " corrupt in reference file"
- Just d -> return d
+ Just e -> case parseXMLElement . UTF8.toTextLazy . fromEntry $ e of
+ Left msg ->
+ throwError $ PandocXMLError (T.pack relpath) msg
+ Right d -> return d
-- Copied from Util
diff --git a/src/Text/Pandoc/Writers/Powerpoint/Output.hs b/src/Text/Pandoc/Writers/Powerpoint/Output.hs
index 8554db622..cd092969b 100644
--- a/src/Text/Pandoc/Writers/Powerpoint/Output.hs
+++ b/src/Text/Pandoc/Writers/Powerpoint/Output.hs
@@ -29,6 +29,7 @@ import Data.Time.Clock (UTCTime)
import Data.Time.Clock.POSIX (utcTimeToPOSIXSeconds, posixSecondsToUTCTime)
import System.FilePath.Posix (splitDirectories, splitExtension, takeExtension)
import Text.XML.Light
+import Text.Pandoc.XMLParser (parseXMLElement)
import Text.Pandoc.Definition
import qualified Text.Pandoc.UTF8 as UTF8
import Text.Pandoc.Class.PandocMonad (PandocMonad)
@@ -77,7 +78,8 @@ getPresentationSize :: Archive -> Archive -> Maybe (Integer, Integer)
getPresentationSize refArchive distArchive = do
entry <- findEntryByPath "ppt/presentation.xml" refArchive `mplus`
findEntryByPath "ppt/presentation.xml" distArchive
- presElement <- parseXMLDoc $ UTF8.toStringLazy $ fromEntry entry
+ presElement <- either (const Nothing) return $
+ parseXMLElement $ UTF8.toTextLazy $ fromEntry entry
let ns = elemToNameSpaces presElement
sldSize <- findChild (elemName ns "p" "sldSz") presElement
cxS <- findAttr (QName "cx" Nothing Nothing) sldSize