diff options
author | John MacFarlane <jgm@berkeley.edu> | 2021-02-08 23:35:19 -0800 |
---|---|---|
committer | John MacFarlane <jgm@berkeley.edu> | 2021-02-10 22:04:11 -0800 |
commit | 8ca191604dcd13af27c11d2da225da646ebce6fc (patch) | |
tree | 9663e0b951ecfce7efd08efd79dcd4b957601b85 /src/Text/Pandoc/Writers | |
parent | 9994ad977d03e97baadf680793c58a66ba7e77e9 (diff) | |
download | pandoc-8ca191604dcd13af27c11d2da225da646ebce6fc.tar.gz |
Add new unexported module T.P.XMLParser.
This exports functions that uses xml-conduit's parser to
produce an xml-light Element or [Content]. This allows
existing pandoc code to use a better parser without
much modification.
The new parser is used in all places where xml-light's
parser was previously used. Benchmarks show a significant
performance improvement in parsing XML-based formats
(especially ODT and FB2).
Note that the xml-light types use String, so the
conversion from xml-conduit types involves a lot
of extra allocation. It would be desirable to
avoid that in the future by gradually switching
to using xml-conduit directly. This can be done
module by module.
The new parser also reports errors, which we report
when possible.
A new constructor PandocXMLError has been added to
PandocError in T.P.Error [API change].
Closes #7091, which was the main stimulus.
These changes revealed the need for some changes
in the tests. The docbook-reader.docbook test
lacked definitions for the entities it used; these
have been added. And the docx golden tests have been
updated, because the new parser does not preserve
the order of attributes.
Add entity defs to docbook-reader.docbook.
Update golden tests for docx.
Diffstat (limited to 'src/Text/Pandoc/Writers')
-rw-r--r-- | src/Text/Pandoc/Writers/EPUB.hs | 13 | ||||
-rw-r--r-- | src/Text/Pandoc/Writers/FB2.hs | 11 | ||||
-rw-r--r-- | src/Text/Pandoc/Writers/ODT.hs | 29 | ||||
-rw-r--r-- | src/Text/Pandoc/Writers/OOXML.hs | 9 | ||||
-rw-r--r-- | src/Text/Pandoc/Writers/Powerpoint/Output.hs | 4 |
5 files changed, 42 insertions, 24 deletions
diff --git a/src/Text/Pandoc/Writers/EPUB.hs b/src/Text/Pandoc/Writers/EPUB.hs index 1f16f6772..e99fa2567 100644 --- a/src/Text/Pandoc/Writers/EPUB.hs +++ b/src/Text/Pandoc/Writers/EPUB.hs @@ -55,8 +55,9 @@ import Text.Pandoc.Walk (query, walk, walkM) import Text.Pandoc.Writers.HTML (writeHtmlStringForEPUB) import Text.Printf (printf) import Text.XML.Light (Attr (..), Element (..), Node (..), QName (..), - add_attrs, lookupAttr, node, onlyElems, parseXML, + add_attrs, lookupAttr, node, onlyElems, ppElement, showElement, strContent, unode, unqual) +import Text.Pandoc.XMLParser (parseXMLContents) import Text.Pandoc.XML (escapeStringForXML) import Text.DocTemplates (FromContext(lookupContext), Context(..), ToContext(toVal), Val(..)) @@ -160,7 +161,12 @@ mkEntry path content = do getEPUBMetadata :: PandocMonad m => WriterOptions -> Meta -> E m EPUBMetadata getEPUBMetadata opts meta = do let md = metadataFromMeta opts meta - let elts = maybe [] (onlyElems . parseXML) $ writerEpubMetadata opts + elts <- case writerEpubMetadata opts of + Nothing -> return [] + Just t -> case parseXMLContents (TL.fromStrict t) of + Left msg -> throwError $ + PandocXMLError "epub metadata" msg + Right ns -> return (onlyElems ns) let md' = foldr addMetadataFromXML md elts let addIdentifier m = if null (epubIdentifier m) @@ -836,7 +842,8 @@ pandocToEPUB version opts doc = do : case subs of [] -> [] (_:_) -> [unode "ol" ! [("class","toc")] $ subs] - where titElements = parseXML titRendered + where titElements = either (const []) id $ + parseXMLContents (TL.fromStrict titRendered) titRendered = case P.runPure (writeHtmlStringForEPUB version opts{ writerTemplate = Nothing diff --git a/src/Text/Pandoc/Writers/FB2.hs b/src/Text/Pandoc/Writers/FB2.hs index 25b1f28d1..9334d6e9a 100644 --- a/src/Text/Pandoc/Writers/FB2.hs +++ b/src/Text/Pandoc/Writers/FB2.hs @@ -19,7 +19,7 @@ FictionBook is an XML-based e-book format. For more information see: module Text.Pandoc.Writers.FB2 (writeFB2) where import Control.Monad (zipWithM) -import Control.Monad.Except (catchError) +import Control.Monad.Except (catchError, throwError) import Control.Monad.State.Strict (StateT, evalStateT, get, gets, lift, liftM, modify) import Data.ByteString.Base64 (encode) import Data.Char (isAscii, isControl, isSpace) @@ -27,16 +27,18 @@ import Data.Either (lefts, rights) import Data.List (intercalate) import Data.Text (Text, pack) import qualified Data.Text as T +import qualified Data.Text.Lazy as TL import qualified Data.Text.Encoding as TE import Network.HTTP (urlEncode) import Text.XML.Light import qualified Text.XML.Light as X import qualified Text.XML.Light.Cursor as XC -import qualified Text.XML.Light.Input as XI +import Text.Pandoc.XMLParser (parseXMLContents) import Text.Pandoc.Class.PandocMonad (PandocMonad, report) import qualified Text.Pandoc.Class.PandocMonad as P import Text.Pandoc.Definition +import Text.Pandoc.Error (PandocError(..)) import Text.Pandoc.Logging import Text.Pandoc.Options (HTMLMathMethod (..), WriterOptions (..), def) import Text.Pandoc.Shared (capitalize, isURI, orderedListMarkers, @@ -307,7 +309,10 @@ blockToXml (CodeBlock _ s) = return . spaceBeforeAfter . map (el "p" . el "code" . T.unpack) . T.lines $ s blockToXml (RawBlock f str) = if f == Format "fb2" - then return $ XI.parseXML str + then + case parseXMLContents (TL.fromStrict str) of + Left msg -> throwError $ PandocXMLError "" msg + Right nds -> return nds else return [] blockToXml (Div _ bs) = cMapM blockToXml bs blockToXml (BlockQuote bs) = list . el "cite" <$> cMapM blockToXml bs diff --git a/src/Text/Pandoc/Writers/ODT.hs b/src/Text/Pandoc/Writers/ODT.hs index 05dfad5eb..a32ff618c 100644 --- a/src/Text/Pandoc/Writers/ODT.hs +++ b/src/Text/Pandoc/Writers/ODT.hs @@ -13,7 +13,7 @@ Conversion of 'Pandoc' documents to ODT. -} module Text.Pandoc.Writers.ODT ( writeODT ) where import Codec.Archive.Zip -import Control.Monad.Except (catchError) +import Control.Monad.Except (catchError, throwError) import Control.Monad.State.Strict import qualified Data.ByteString.Lazy as B import Data.Generics (everywhere', mkT) @@ -27,6 +27,7 @@ import Text.Pandoc.BCP47 (Lang (..), getLang, renderLang) import Text.Pandoc.Class.PandocMonad (PandocMonad, report, toLang) import qualified Text.Pandoc.Class.PandocMonad as P import Text.Pandoc.Definition +import Text.Pandoc.Error (PandocError(..)) import Text.Pandoc.ImageSize import Text.Pandoc.Logging import Text.Pandoc.MIME (extensionFromMimeType, getMimeType) @@ -35,10 +36,11 @@ import Text.DocLayout import Text.Pandoc.Shared (stringify, pandocVersion, tshow) import Text.Pandoc.Writers.Shared (lookupMetaString, lookupMetaBlocks, fixDisplayMath) -import Text.Pandoc.UTF8 (fromStringLazy, fromTextLazy, toStringLazy) +import Text.Pandoc.UTF8 (fromStringLazy, fromTextLazy, toTextLazy) import Text.Pandoc.Walk import Text.Pandoc.Writers.OpenDocument (writeOpenDocument) import Text.Pandoc.XML +import Text.Pandoc.XMLParser (parseXMLElement) import Text.TeXMath import Text.XML.Light @@ -172,17 +174,18 @@ updateStyleWithLang :: PandocMonad m => Maybe Lang -> Archive -> O m Archive updateStyleWithLang Nothing arch = return arch updateStyleWithLang (Just lang) arch = do epochtime <- floor `fmap` lift P.getPOSIXTime - return arch{ zEntries = [if eRelativePath e == "styles.xml" - then case parseXMLDoc - (toStringLazy (fromEntry e)) of - Nothing -> e - Just d -> - toEntry "styles.xml" epochtime - ( fromStringLazy - . ppTopElement - . addLang lang $ d ) - else e - | e <- zEntries arch] } + entries <- mapM (\e -> if eRelativePath e == "styles.xml" + then case parseXMLElement + (toTextLazy (fromEntry e)) of + Left msg -> throwError $ + PandocXMLError "styles.xml" msg + Right d -> return $ + toEntry "styles.xml" epochtime + ( fromStringLazy + . ppTopElement + . addLang lang $ d ) + else return e) (zEntries arch) + return arch{ zEntries = entries } addLang :: Lang -> Element -> Element addLang lang = everywhere' (mkT updateLangAttr) diff --git a/src/Text/Pandoc/Writers/OOXML.hs b/src/Text/Pandoc/Writers/OOXML.hs index 3ac007f4e..8f60e70d5 100644 --- a/src/Text/Pandoc/Writers/OOXML.hs +++ b/src/Text/Pandoc/Writers/OOXML.hs @@ -35,6 +35,7 @@ import qualified Data.Text as T import Text.Pandoc.Class.PandocMonad (PandocMonad) import qualified Text.Pandoc.UTF8 as UTF8 import Text.XML.Light as XML +import Text.Pandoc.XMLParser (parseXMLElement) mknode :: Node t => String -> [(String,String)] -> t -> Element mknode s attrs = @@ -62,10 +63,10 @@ parseXml refArchive distArchive relpath = findEntryByPath relpath distArchive of Nothing -> throwError $ PandocSomeError $ T.pack relpath <> " missing in reference file" - Just e -> case parseXMLDoc . UTF8.toStringLazy . fromEntry $ e of - Nothing -> throwError $ PandocSomeError $ - T.pack relpath <> " corrupt in reference file" - Just d -> return d + Just e -> case parseXMLElement . UTF8.toTextLazy . fromEntry $ e of + Left msg -> + throwError $ PandocXMLError (T.pack relpath) msg + Right d -> return d -- Copied from Util diff --git a/src/Text/Pandoc/Writers/Powerpoint/Output.hs b/src/Text/Pandoc/Writers/Powerpoint/Output.hs index 8554db622..cd092969b 100644 --- a/src/Text/Pandoc/Writers/Powerpoint/Output.hs +++ b/src/Text/Pandoc/Writers/Powerpoint/Output.hs @@ -29,6 +29,7 @@ import Data.Time.Clock (UTCTime) import Data.Time.Clock.POSIX (utcTimeToPOSIXSeconds, posixSecondsToUTCTime) import System.FilePath.Posix (splitDirectories, splitExtension, takeExtension) import Text.XML.Light +import Text.Pandoc.XMLParser (parseXMLElement) import Text.Pandoc.Definition import qualified Text.Pandoc.UTF8 as UTF8 import Text.Pandoc.Class.PandocMonad (PandocMonad) @@ -77,7 +78,8 @@ getPresentationSize :: Archive -> Archive -> Maybe (Integer, Integer) getPresentationSize refArchive distArchive = do entry <- findEntryByPath "ppt/presentation.xml" refArchive `mplus` findEntryByPath "ppt/presentation.xml" distArchive - presElement <- parseXMLDoc $ UTF8.toStringLazy $ fromEntry entry + presElement <- either (const Nothing) return $ + parseXMLElement $ UTF8.toTextLazy $ fromEntry entry let ns = elemToNameSpaces presElement sldSize <- findChild (elemName ns "p" "sldSz") presElement cxS <- findAttr (QName "cx" Nothing Nothing) sldSize |