aboutsummaryrefslogtreecommitdiff
path: root/src/Text
diff options
context:
space:
mode:
authorJohn MacFarlane <jgm@berkeley.edu>2021-02-08 23:35:19 -0800
committerJohn MacFarlane <jgm@berkeley.edu>2021-02-10 22:04:11 -0800
commit8ca191604dcd13af27c11d2da225da646ebce6fc (patch)
tree9663e0b951ecfce7efd08efd79dcd4b957601b85 /src/Text
parent9994ad977d03e97baadf680793c58a66ba7e77e9 (diff)
downloadpandoc-8ca191604dcd13af27c11d2da225da646ebce6fc.tar.gz
Add new unexported module T.P.XMLParser.
This exports functions that uses xml-conduit's parser to produce an xml-light Element or [Content]. This allows existing pandoc code to use a better parser without much modification. The new parser is used in all places where xml-light's parser was previously used. Benchmarks show a significant performance improvement in parsing XML-based formats (especially ODT and FB2). Note that the xml-light types use String, so the conversion from xml-conduit types involves a lot of extra allocation. It would be desirable to avoid that in the future by gradually switching to using xml-conduit directly. This can be done module by module. The new parser also reports errors, which we report when possible. A new constructor PandocXMLError has been added to PandocError in T.P.Error [API change]. Closes #7091, which was the main stimulus. These changes revealed the need for some changes in the tests. The docbook-reader.docbook test lacked definitions for the entities it used; these have been added. And the docx golden tests have been updated, because the new parser does not preserve the order of attributes. Add entity defs to docbook-reader.docbook. Update golden tests for docx.
Diffstat (limited to 'src/Text')
-rw-r--r--src/Text/Pandoc/Error.hs3
-rw-r--r--src/Text/Pandoc/ImageSize.hs5
-rw-r--r--src/Text/Pandoc/Readers/DocBook.hs52
-rw-r--r--src/Text/Pandoc/Readers/Docx/Parse.hs21
-rw-r--r--src/Text/Pandoc/Readers/Docx/Parse/Styles.hs28
-rw-r--r--src/Text/Pandoc/Readers/EPUB.hs17
-rw-r--r--src/Text/Pandoc/Readers/FB2.hs10
-rw-r--r--src/Text/Pandoc/Readers/JATS.hs9
-rw-r--r--src/Text/Pandoc/Readers/OPML.hs10
-rw-r--r--src/Text/Pandoc/Readers/Odt.hs24
-rw-r--r--src/Text/Pandoc/Writers/EPUB.hs13
-rw-r--r--src/Text/Pandoc/Writers/FB2.hs11
-rw-r--r--src/Text/Pandoc/Writers/ODT.hs29
-rw-r--r--src/Text/Pandoc/Writers/OOXML.hs9
-rw-r--r--src/Text/Pandoc/Writers/Powerpoint/Output.hs4
-rw-r--r--src/Text/Pandoc/XMLParser.hs66
16 files changed, 224 insertions, 87 deletions
diff --git a/src/Text/Pandoc/Error.hs b/src/Text/Pandoc/Error.hs
index 204cf15ca..831405f42 100644
--- a/src/Text/Pandoc/Error.hs
+++ b/src/Text/Pandoc/Error.hs
@@ -48,6 +48,7 @@ data PandocError = PandocIOError Text IOError
| PandocFailOnWarningError
| PandocPDFProgramNotFoundError Text
| PandocPDFError Text
+ | PandocXMLError Text Text
| PandocFilterError Text Text
| PandocLuaError Text
| PandocCouldNotFindDataFileError Text
@@ -103,6 +104,8 @@ handleError (Left e) =
PandocPDFProgramNotFoundError pdfprog -> err 47 $
pdfprog <> " not found. Please select a different --pdf-engine or install " <> pdfprog
PandocPDFError logmsg -> err 43 $ "Error producing PDF.\n" <> logmsg
+ PandocXMLError fp logmsg -> err 44 $ "Invalid XML" <>
+ (if T.null fp then "" else " in " <> fp) <> ":\n" <> logmsg
PandocFilterError filtername msg -> err 83 $ "Error running filter " <>
filtername <> ":\n" <> msg
PandocLuaError msg -> err 84 $ "Error running Lua:\n" <> msg
diff --git a/src/Text/Pandoc/ImageSize.hs b/src/Text/Pandoc/ImageSize.hs
index e19958f6a..e0a1af8e8 100644
--- a/src/Text/Pandoc/ImageSize.hs
+++ b/src/Text/Pandoc/ImageSize.hs
@@ -45,7 +45,9 @@ import Text.Pandoc.Definition
import Text.Pandoc.Options
import qualified Text.Pandoc.UTF8 as UTF8
import qualified Text.XML.Light as Xml
+import Text.Pandoc.XMLParser (parseXMLElement)
import qualified Data.Text as T
+import qualified Data.Text.Lazy as TL
import qualified Data.Text.Encoding as TE
import Control.Applicative
import qualified Data.Attoparsec.ByteString.Char8 as A
@@ -327,7 +329,8 @@ getSize img =
svgSize :: WriterOptions -> ByteString -> Maybe ImageSize
svgSize opts img = do
- doc <- Xml.parseXMLDoc $ UTF8.toString img
+ doc <- either (const mzero) return $ parseXMLElement
+ $ TL.fromStrict $ UTF8.toText img
let viewboxSize = do
vb <- Xml.findAttrBy (== Xml.QName "viewBox" Nothing Nothing) doc
[_,_,w,h] <- mapM safeRead (T.words (T.pack vb))
diff --git a/src/Text/Pandoc/Readers/DocBook.hs b/src/Text/Pandoc/Readers/DocBook.hs
index ada3e98ec..ad0108843 100644
--- a/src/Text/Pandoc/Readers/DocBook.hs
+++ b/src/Text/Pandoc/Readers/DocBook.hs
@@ -12,7 +12,7 @@ Conversion of DocBook XML to 'Pandoc' document.
-}
module Text.Pandoc.Readers.DocBook ( readDocBook ) where
import Control.Monad.State.Strict
-import Data.Char (isSpace, toUpper)
+import Data.Char (isSpace, toUpper, isLetter)
import Data.Default
import Data.Either (rights)
import Data.Foldable (asum)
@@ -21,7 +21,10 @@ import Data.List (intersperse,elemIndex)
import Data.Maybe (fromMaybe,mapMaybe)
import Data.Text (Text)
import qualified Data.Text as T
+import qualified Data.Text.Lazy as TL
+import Control.Monad.Except (throwError)
import Text.HTML.TagSoup.Entity (lookupEntity)
+import Text.Pandoc.Error (PandocError(..))
import Text.Pandoc.Builder
import Text.Pandoc.Class.PandocMonad (PandocMonad, report)
import Text.Pandoc.Options
@@ -29,6 +32,7 @@ import Text.Pandoc.Logging (LogMessage(..))
import Text.Pandoc.Shared (crFilter, safeRead, extractSpaces)
import Text.TeXMath (readMathML, writeTeX)
import Text.XML.Light
+import Text.Pandoc.XMLParser (parseXMLContents)
{-
@@ -537,22 +541,25 @@ instance Default DBState where
readDocBook :: PandocMonad m => ReaderOptions -> Text -> m Pandoc
readDocBook _ inp = do
- let tree = normalizeTree . parseXML . handleInstructions $ crFilter inp
+ tree <- either (throwError . PandocXMLError "") (return . normalizeTree) $
+ parseXMLContents (TL.fromStrict . handleInstructions $ crFilter inp)
(bs, st') <- flip runStateT (def{ dbContent = tree }) $ mapM parseBlock tree
return $ Pandoc (dbMeta st') (toList . mconcat $ bs)
--- We treat <?asciidoc-br?> specially (issue #1236), converting it
--- to <br/>, since xml-light doesn't parse the instruction correctly.
--- Other xml instructions are simply removed from the input stream.
+-- We treat certain processing instructions by converting them to tags
+-- beginning "pi-".
handleInstructions :: Text -> Text
-handleInstructions = T.pack . handleInstructions' . T.unpack
-
-handleInstructions' :: String -> String
-handleInstructions' ('<':'?':'a':'s':'c':'i':'i':'d':'o':'c':'-':'b':'r':'?':'>':xs) = '<':'b':'r':'/':'>': handleInstructions' xs
-handleInstructions' xs = case break (=='<') xs of
- (ys, []) -> ys
- ([], '<':zs) -> '<' : handleInstructions' zs
- (ys, zs) -> ys ++ handleInstructions' zs
+handleInstructions t =
+ let (x,y) = T.breakOn "<?" t
+ in if T.null y
+ then x
+ else
+ let (w,z) = T.breakOn "?>" y
+ in (if T.takeWhile (\c -> isLetter c || c == '-')
+ (T.drop 2 w) `elem` ["asciidoc-br", "dbfo"]
+ then x <> "<pi-" <> T.drop 2 w <> "/>"
+ else x <> w <> T.take 2 z) <>
+ handleInstructions (T.drop 2 z)
getFigure :: PandocMonad m => Element -> DB m Blocks
getFigure e = do
@@ -892,7 +899,11 @@ parseBlock (Elem e) =
"subtitle" -> return mempty -- handled in parent element
_ -> skip >> getBlocks e
where skip = do
- lift $ report $ IgnoredElement $ T.pack $ qName (elName e)
+ let qn = T.pack $ qName $ elName e
+ let name = if "pi-" `T.isPrefixOf` qn
+ then "<?" <> qn <> "?>"
+ else qn
+ lift $ report $ IgnoredElement name
return mempty
codeBlockWithLang = do
@@ -964,7 +975,7 @@ parseBlock (Elem e) =
cs -> map toAlignment cs
let parseWidth s = safeRead (T.filter (\x -> (x >= '0' && x <= '9')
|| x == '.') s)
- let textWidth = case filterChild (named "?dbfo") e of
+ let textWidth = case filterChild (named "pi-dbfo") e of
Just d -> case attrValue "table-width" d of
"" -> 1.0
w -> fromMaybe 100.0 (parseWidth w) / 100.0
@@ -1165,12 +1176,15 @@ parseInline (Elem e) =
"title" -> return mempty
"affiliation" -> skip
-- Note: this isn't a real docbook tag; it's what we convert
- -- <?asciidor-br?> to in handleInstructions, above. A kludge to
- -- work around xml-light's inability to parse an instruction.
- "br" -> return linebreak
+ -- <?asciidor-br?> to in handleInstructions, above.
+ "pi-asciidoc-br" -> return linebreak
_ -> skip >> innerInlines id
where skip = do
- lift $ report $ IgnoredElement $ T.pack $ qName (elName e)
+ let qn = T.pack $ qName $ elName e
+ let name = if "pi-" `T.isPrefixOf` qn
+ then "<?" <> qn <> "?>"
+ else qn
+ lift $ report $ IgnoredElement name
return mempty
innerInlines f = extractSpaces f . mconcat <$>
diff --git a/src/Text/Pandoc/Readers/Docx/Parse.hs b/src/Text/Pandoc/Readers/Docx/Parse.hs
index fdcffcc3f..056dab6c2 100644
--- a/src/Text/Pandoc/Readers/Docx/Parse.hs
+++ b/src/Text/Pandoc/Readers/Docx/Parse.hs
@@ -74,6 +74,7 @@ import Text.TeXMath.Readers.OMML (readOMML)
import Text.TeXMath.Unicode.Fonts (Font (..), getUnicode, textToFont)
import Text.XML.Light
import qualified Text.XML.Light.Cursor as XMLC
+import Text.Pandoc.XMLParser (parseXMLElement)
data ReaderEnv = ReaderEnv { envNotes :: Notes
, envComments :: Comments
@@ -343,10 +344,16 @@ archiveToDocxWithWarnings archive = do
Right doc -> Right (Docx doc, stateWarnings st)
Left e -> Left e
+parseXMLFromEntry :: Entry -> Maybe Element
+parseXMLFromEntry entry =
+ case parseXMLElement (UTF8.toTextLazy (fromEntry entry)) of
+ Left _ -> Nothing
+ Right el -> Just el
+
getDocumentXmlPath :: Archive -> Maybe FilePath
getDocumentXmlPath zf = do
entry <- findEntryByPath "_rels/.rels" zf
- relsElem <- (parseXMLDoc . UTF8.toStringLazy . fromEntry) entry
+ relsElem <- parseXMLFromEntry entry
let rels = filterChildrenName (\n -> qName n == "Relationship") relsElem
rel <- find (\e -> findAttr (QName "Type" Nothing Nothing) e ==
Just "http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument")
@@ -362,7 +369,7 @@ archiveToDocument :: Archive -> D Document
archiveToDocument zf = do
docPath <- asks envDocXmlPath
entry <- maybeToD $ findEntryByPath docPath zf
- docElem <- maybeToD $ (parseXMLDoc . UTF8.toStringLazy . fromEntry) entry
+ docElem <- maybeToD $ parseXMLFromEntry entry
let namespaces = elemToNameSpaces docElem
bodyElem <- maybeToD $ findChildByName namespaces "w" "body" docElem
let bodyElem' = fromMaybe bodyElem (walkDocument namespaces bodyElem)
@@ -401,9 +408,9 @@ constructBogusParStyleData stName = ParStyle
archiveToNotes :: Archive -> Notes
archiveToNotes zf =
let fnElem = findEntryByPath "word/footnotes.xml" zf
- >>= (parseXMLDoc . UTF8.toStringLazy . fromEntry)
+ >>= parseXMLFromEntry
enElem = findEntryByPath "word/endnotes.xml" zf
- >>= (parseXMLDoc . UTF8.toStringLazy . fromEntry)
+ >>= parseXMLFromEntry
fn_namespaces = maybe [] elemToNameSpaces fnElem
en_namespaces = maybe [] elemToNameSpaces enElem
ns = unionBy (\x y -> fst x == fst y) fn_namespaces en_namespaces
@@ -415,7 +422,7 @@ archiveToNotes zf =
archiveToComments :: Archive -> Comments
archiveToComments zf =
let cmtsElem = findEntryByPath "word/comments.xml" zf
- >>= (parseXMLDoc . UTF8.toStringLazy . fromEntry)
+ >>= parseXMLFromEntry
cmts_namespaces = maybe [] elemToNameSpaces cmtsElem
cmts = elemToComments cmts_namespaces <$> (cmtsElem >>= walkDocument cmts_namespaces)
in
@@ -445,7 +452,7 @@ filePathToRelationships :: Archive -> FilePath -> FilePath -> [Relationship]
filePathToRelationships ar docXmlPath fp
| Just relType <- filePathToRelType fp docXmlPath
, Just entry <- findEntryByPath fp ar
- , Just relElems <- (parseXMLDoc . UTF8.toStringLazy . fromEntry) entry =
+ , Just relElems <- parseXMLFromEntry entry =
mapMaybe (relElemToRelationship relType) $ elChildren relElems
filePathToRelationships _ _ _ = []
@@ -527,7 +534,7 @@ archiveToNumbering' zf =
case findEntryByPath "word/numbering.xml" zf of
Nothing -> Just $ Numbering [] [] []
Just entry -> do
- numberingElem <- (parseXMLDoc . UTF8.toStringLazy . fromEntry) entry
+ numberingElem <- parseXMLFromEntry entry
let namespaces = elemToNameSpaces numberingElem
numElems = findChildrenByName namespaces "w" "num" numberingElem
absNumElems = findChildrenByName namespaces "w" "abstractNum" numberingElem
diff --git a/src/Text/Pandoc/Readers/Docx/Parse/Styles.hs b/src/Text/Pandoc/Readers/Docx/Parse/Styles.hs
index 236167187..edade8654 100644
--- a/src/Text/Pandoc/Readers/Docx/Parse/Styles.hs
+++ b/src/Text/Pandoc/Readers/Docx/Parse/Styles.hs
@@ -53,6 +53,7 @@ import Data.Coerce
import Text.Pandoc.Readers.Docx.Util
import qualified Text.Pandoc.UTF8 as UTF8
import Text.XML.Light
+import Text.Pandoc.XMLParser (parseXMLElement)
newtype CharStyleId = CharStyleId T.Text
deriving (Show, Eq, Ord, IsString, FromStyleId)
@@ -135,19 +136,22 @@ defaultRunStyle = RunStyle { isBold = Nothing
, rParentStyle = Nothing
}
-archiveToStyles' :: (Ord k1, Ord k2, ElemToStyle a1, ElemToStyle a2) =>
- (a1 -> k1) -> (a2 -> k2) -> Archive -> (M.Map k1 a1, M.Map k2 a2)
+archiveToStyles'
+ :: (Ord k1, Ord k2, ElemToStyle a1, ElemToStyle a2)
+ => (a1 -> k1) -> (a2 -> k2) -> Archive -> (M.Map k1 a1, M.Map k2 a2)
archiveToStyles' conv1 conv2 zf =
- let stylesElem = findEntryByPath "word/styles.xml" zf >>=
- (parseXMLDoc . UTF8.toStringLazy . fromEntry)
- in
- case stylesElem of
- Nothing -> (M.empty, M.empty)
- Just styElem ->
- let namespaces = elemToNameSpaces styElem
- in
- ( M.fromList $ map (\r -> (conv1 r, r)) $ buildBasedOnList namespaces styElem Nothing,
- M.fromList $ map (\p -> (conv2 p, p)) $ buildBasedOnList namespaces styElem Nothing)
+ case findEntryByPath "word/styles.xml" zf of
+ Nothing -> (M.empty, M.empty)
+ Just entry ->
+ case parseXMLElement . UTF8.toTextLazy . fromEntry $ entry of
+ Left _ -> (M.empty, M.empty)
+ Right styElem ->
+ let namespaces = elemToNameSpaces styElem
+ in
+ ( M.fromList $ map (\r -> (conv1 r, r)) $
+ buildBasedOnList namespaces styElem Nothing,
+ M.fromList $ map (\p -> (conv2 p, p)) $
+ buildBasedOnList namespaces styElem Nothing)
isBasedOnStyle :: (ElemToStyle a, FromStyleId (StyleId a)) => NameSpaces -> Element -> Maybe a -> Bool
isBasedOnStyle ns element parentStyle
diff --git a/src/Text/Pandoc/Readers/EPUB.hs b/src/Text/Pandoc/Readers/EPUB.hs
index 5e3326e6d..369c4f0c9 100644
--- a/src/Text/Pandoc/Readers/EPUB.hs
+++ b/src/Text/Pandoc/Readers/EPUB.hs
@@ -17,7 +17,7 @@ module Text.Pandoc.Readers.EPUB
(readEPUB)
where
-import Codec.Archive.Zip (Archive (..), Entry, findEntryByPath, fromEntry,
+import Codec.Archive.Zip (Archive (..), Entry(..), findEntryByPath, fromEntry,
toArchiveOrFail)
import Control.DeepSeq (NFData, deepseq)
import Control.Monad (guard, liftM, liftM2, mplus)
@@ -41,9 +41,10 @@ import Text.Pandoc.MIME (MimeType)
import Text.Pandoc.Options (ReaderOptions (..))
import Text.Pandoc.Readers.HTML (readHtml)
import Text.Pandoc.Shared (addMetaField, collapseFilePath, escapeURI)
-import qualified Text.Pandoc.UTF8 as UTF8 (toStringLazy)
+import qualified Text.Pandoc.UTF8 as UTF8 (toTextLazy)
import Text.Pandoc.Walk (query, walk)
import Text.XML.Light
+import Text.Pandoc.XMLParser (parseXMLElement)
type Items = M.Map String (FilePath, MimeType)
@@ -181,7 +182,7 @@ renameMeta s = T.pack s
getManifest :: PandocMonad m => Archive -> m (String, Element)
getManifest archive = do
metaEntry <- findEntryByPathE ("META-INF" </> "container.xml") archive
- docElem <- (parseXMLDocE . UTF8.toStringLazy . fromEntry) metaEntry
+ docElem <- parseXMLDocE metaEntry
let namespaces = mapMaybe attrToNSPair (elAttribs docElem)
ns <- mkE "xmlns not in namespaces" (lookup "xmlns" namespaces)
as <- fmap (map attrToPair . elAttribs)
@@ -190,7 +191,7 @@ getManifest archive = do
let rootdir = dropFileName manifestFile
--mime <- lookup "media-type" as
manifest <- findEntryByPathE manifestFile archive
- (rootdir,) <$> (parseXMLDocE . UTF8.toStringLazy . fromEntry $ manifest)
+ (rootdir,) <$> parseXMLDocE manifest
-- Fixup
@@ -284,8 +285,12 @@ findEntryByPathE :: PandocMonad m => FilePath -> Archive -> m Entry
findEntryByPathE (normalise . unEscapeString -> path) a =
mkE ("No entry on path: " ++ path) $ findEntryByPath path a
-parseXMLDocE :: PandocMonad m => String -> m Element
-parseXMLDocE doc = mkE "Unable to parse XML doc" $ parseXMLDoc doc
+parseXMLDocE :: PandocMonad m => Entry -> m Element
+parseXMLDocE entry =
+ either (throwError . PandocXMLError fp) return $ parseXMLElement doc
+ where
+ doc = UTF8.toTextLazy . fromEntry $ entry
+ fp = T.pack $ eRelativePath entry
findElementE :: PandocMonad m => QName -> Element -> m Element
findElementE e x = mkE ("Unable to find element: " ++ show e) $ findElement e x
diff --git a/src/Text/Pandoc/Readers/FB2.hs b/src/Text/Pandoc/Readers/FB2.hs
index b0d2f092b..b804eab4f 100644
--- a/src/Text/Pandoc/Readers/FB2.hs
+++ b/src/Text/Pandoc/Readers/FB2.hs
@@ -32,6 +32,7 @@ import Data.List (intersperse)
import qualified Data.Map as M
import Data.Text (Text)
import qualified Data.Text as T
+import qualified Data.Text.Lazy as TL
import Data.Default
import Data.Maybe
import Text.HTML.TagSoup.Entity (lookupEntity)
@@ -42,6 +43,7 @@ import Text.Pandoc.Logging
import Text.Pandoc.Options
import Text.Pandoc.Shared (crFilter)
import Text.XML.Light
+import Text.Pandoc.XMLParser (parseXMLElement)
type FB2 m = StateT FB2State m
@@ -64,10 +66,10 @@ instance HasMeta FB2State where
readFB2 :: PandocMonad m => ReaderOptions -> Text -> m Pandoc
readFB2 _ inp =
- case parseXMLDoc $ crFilter inp of
- Nothing -> throwError $ PandocParseError "Not an XML document"
- Just e -> do
- (bs, st) <- runStateT (parseRootElement e) def
+ case parseXMLElement $ TL.fromStrict $ crFilter inp of
+ Left msg -> throwError $ PandocXMLError "" msg
+ Right el -> do
+ (bs, st) <- runStateT (parseRootElement el) def
let authors = if null $ fb2Authors st
then id
else setMeta "author" (map text $ reverse $ fb2Authors st)
diff --git a/src/Text/Pandoc/Readers/JATS.hs b/src/Text/Pandoc/Readers/JATS.hs
index c638da519..dfd343b7a 100644
--- a/src/Text/Pandoc/Readers/JATS.hs
+++ b/src/Text/Pandoc/Readers/JATS.hs
@@ -14,6 +14,8 @@ Conversion of JATS XML to 'Pandoc' document.
module Text.Pandoc.Readers.JATS ( readJATS ) where
import Control.Monad.State.Strict
+import Control.Monad.Except (throwError)
+import Text.Pandoc.Error (PandocError(..))
import Data.Char (isDigit, isSpace, toUpper)
import Data.Default
import Data.Generics
@@ -22,6 +24,7 @@ import qualified Data.Map as Map
import Data.Maybe (maybeToList, fromMaybe)
import Data.Text (Text)
import qualified Data.Text as T
+import qualified Data.Text.Lazy as TL
import Text.HTML.TagSoup.Entity (lookupEntity)
import Text.Pandoc.Builder
import Text.Pandoc.Class.PandocMonad (PandocMonad)
@@ -29,6 +32,7 @@ import Text.Pandoc.Options
import Text.Pandoc.Shared (crFilter, safeRead, extractSpaces)
import Text.TeXMath (readMathML, writeTeX)
import Text.XML.Light
+import Text.Pandoc.XMLParser (parseXMLContents)
import qualified Data.Set as S (fromList, member)
import Data.Set ((\\))
@@ -51,8 +55,9 @@ instance Default JATSState where
readJATS :: PandocMonad m => ReaderOptions -> Text -> m Pandoc
readJATS _ inp = do
- let tree = normalizeTree . parseXML
- $ T.unpack $ crFilter inp
+ tree <- either (throwError . PandocXMLError "")
+ (return . normalizeTree) $
+ parseXMLContents (TL.fromStrict $ crFilter inp)
(bs, st') <- flip runStateT (def{ jatsContent = tree }) $ mapM parseBlock tree
return $ Pandoc (jatsMeta st') (toList . mconcat $ bs)
diff --git a/src/Text/Pandoc/Readers/OPML.hs b/src/Text/Pandoc/Readers/OPML.hs
index 5b8996025..bdadc4dd9 100644
--- a/src/Text/Pandoc/Readers/OPML.hs
+++ b/src/Text/Pandoc/Readers/OPML.hs
@@ -19,14 +19,18 @@ import Data.Generics
import Data.Maybe (fromMaybe)
import Data.Text (Text)
import qualified Data.Text as T
+import qualified Data.Text.Lazy as TL
import Text.HTML.TagSoup.Entity (lookupEntity)
import Text.Pandoc.Builder
import Text.Pandoc.Class.PandocMonad (PandocMonad)
import Text.Pandoc.Options
+import Text.Pandoc.Error (PandocError(..))
import Text.Pandoc.Readers.HTML (readHtml)
import Text.Pandoc.Readers.Markdown (readMarkdown)
import Text.Pandoc.Shared (crFilter, blocksToInlines')
import Text.XML.Light
+import Text.Pandoc.XMLParser (parseXMLContents)
+import Control.Monad.Except (throwError)
type OPML m = StateT OPMLState m
@@ -49,8 +53,10 @@ instance Default OPMLState where
readOPML :: PandocMonad m => ReaderOptions -> Text -> m Pandoc
readOPML opts inp = do
(bs, st') <- runStateT
- (mapM parseBlock $ normalizeTree $
- parseXML (T.unpack (crFilter inp))) def{ opmlOptions = opts }
+ (case parseXMLContents (TL.fromStrict (crFilter inp)) of
+ Left msg -> throwError $ PandocXMLError "" msg
+ Right ns -> mapM parseBlock $ normalizeTree ns)
+ def{ opmlOptions = opts }
return $
setTitle (opmlDocTitle st') $
setAuthors (opmlDocAuthors st') $
diff --git a/src/Text/Pandoc/Readers/Odt.hs b/src/Text/Pandoc/Readers/Odt.hs
index 9943d3147..85308deb1 100644
--- a/src/Text/Pandoc/Readers/Odt.hs
+++ b/src/Text/Pandoc/Readers/Odt.hs
@@ -15,6 +15,7 @@ module Text.Pandoc.Readers.Odt ( readOdt ) where
import Codec.Archive.Zip
import qualified Text.XML.Light as XML
+import Text.Pandoc.XMLParser (parseXMLElement)
import qualified Data.ByteString.Lazy as B
@@ -66,18 +67,18 @@ bytesToOdt bytes = case toArchiveOrFail bytes of
--
archiveToOdt :: Archive -> Either PandocError (Pandoc, MediaBag)
-archiveToOdt archive = either (Left. PandocParseError) Right $ do
- let onFailure msg Nothing = Left msg
+archiveToOdt archive = do
+ let onFailure msg Nothing = Left $ PandocParseError msg
onFailure _ (Just x) = Right x
contentEntry <- onFailure "Could not find content.xml"
(findEntryByPath "content.xml" archive)
stylesEntry <- onFailure "Could not find styles.xml"
(findEntryByPath "styles.xml" archive)
- contentElem <- onFailure "Could not find content element"
- (entryToXmlElem contentEntry)
- stylesElem <- onFailure "Could not find styles element"
- (entryToXmlElem stylesEntry)
- styles <- either (\_ -> Left "Could not read styles") Right
+ contentElem <- entryToXmlElem contentEntry
+ stylesElem <- entryToXmlElem stylesEntry
+ styles <- either
+ (\_ -> Left $ PandocParseError "Could not read styles")
+ Right
(chooseMax (readStylesAt stylesElem ) (readStylesAt contentElem))
let filePathIsOdtMedia :: FilePath -> Bool
filePathIsOdtMedia fp =
@@ -85,10 +86,13 @@ archiveToOdt archive = either (Left. PandocParseError) Right $ do
in (dir == "Pictures/") || (dir /= "./" && name == "content.xml")
let media = filteredFilesFromArchive archive filePathIsOdtMedia
let startState = readerState styles media
- either (\_ -> Left "Could not convert opendocument") Right
+ either (\_ -> Left $ PandocParseError "Could not convert opendocument") Right
(runConverter' read_body startState contentElem)
--
-entryToXmlElem :: Entry -> Maybe XML.Element
-entryToXmlElem = XML.parseXMLDoc . UTF8.toStringLazy . fromEntry
+entryToXmlElem :: Entry -> Either PandocError XML.Element
+entryToXmlElem entry =
+ case parseXMLElement . UTF8.toTextLazy . fromEntry $ entry of
+ Right x -> Right x
+ Left msg -> Left $ PandocXMLError (T.pack $ eRelativePath entry) msg
diff --git a/src/Text/Pandoc/Writers/EPUB.hs b/src/Text/Pandoc/Writers/EPUB.hs
index 1f16f6772..e99fa2567 100644
--- a/src/Text/Pandoc/Writers/EPUB.hs
+++ b/src/Text/Pandoc/Writers/EPUB.hs
@@ -55,8 +55,9 @@ import Text.Pandoc.Walk (query, walk, walkM)
import Text.Pandoc.Writers.HTML (writeHtmlStringForEPUB)
import Text.Printf (printf)
import Text.XML.Light (Attr (..), Element (..), Node (..), QName (..),
- add_attrs, lookupAttr, node, onlyElems, parseXML,
+ add_attrs, lookupAttr, node, onlyElems,
ppElement, showElement, strContent, unode, unqual)
+import Text.Pandoc.XMLParser (parseXMLContents)
import Text.Pandoc.XML (escapeStringForXML)
import Text.DocTemplates (FromContext(lookupContext), Context(..),
ToContext(toVal), Val(..))
@@ -160,7 +161,12 @@ mkEntry path content = do
getEPUBMetadata :: PandocMonad m => WriterOptions -> Meta -> E m EPUBMetadata
getEPUBMetadata opts meta = do
let md = metadataFromMeta opts meta
- let elts = maybe [] (onlyElems . parseXML) $ writerEpubMetadata opts
+ elts <- case writerEpubMetadata opts of
+ Nothing -> return []
+ Just t -> case parseXMLContents (TL.fromStrict t) of
+ Left msg -> throwError $
+ PandocXMLError "epub metadata" msg
+ Right ns -> return (onlyElems ns)
let md' = foldr addMetadataFromXML md elts
let addIdentifier m =
if null (epubIdentifier m)
@@ -836,7 +842,8 @@ pandocToEPUB version opts doc = do
: case subs of
[] -> []
(_:_) -> [unode "ol" ! [("class","toc")] $ subs]
- where titElements = parseXML titRendered
+ where titElements = either (const []) id $
+ parseXMLContents (TL.fromStrict titRendered)
titRendered = case P.runPure
(writeHtmlStringForEPUB version
opts{ writerTemplate = Nothing
diff --git a/src/Text/Pandoc/Writers/FB2.hs b/src/Text/Pandoc/Writers/FB2.hs
index 25b1f28d1..9334d6e9a 100644
--- a/src/Text/Pandoc/Writers/FB2.hs
+++ b/src/Text/Pandoc/Writers/FB2.hs
@@ -19,7 +19,7 @@ FictionBook is an XML-based e-book format. For more information see:
module Text.Pandoc.Writers.FB2 (writeFB2) where
import Control.Monad (zipWithM)
-import Control.Monad.Except (catchError)
+import Control.Monad.Except (catchError, throwError)
import Control.Monad.State.Strict (StateT, evalStateT, get, gets, lift, liftM, modify)
import Data.ByteString.Base64 (encode)
import Data.Char (isAscii, isControl, isSpace)
@@ -27,16 +27,18 @@ import Data.Either (lefts, rights)
import Data.List (intercalate)
import Data.Text (Text, pack)
import qualified Data.Text as T
+import qualified Data.Text.Lazy as TL
import qualified Data.Text.Encoding as TE
import Network.HTTP (urlEncode)
import Text.XML.Light
import qualified Text.XML.Light as X
import qualified Text.XML.Light.Cursor as XC
-import qualified Text.XML.Light.Input as XI
+import Text.Pandoc.XMLParser (parseXMLContents)
import Text.Pandoc.Class.PandocMonad (PandocMonad, report)
import qualified Text.Pandoc.Class.PandocMonad as P
import Text.Pandoc.Definition
+import Text.Pandoc.Error (PandocError(..))
import Text.Pandoc.Logging
import Text.Pandoc.Options (HTMLMathMethod (..), WriterOptions (..), def)
import Text.Pandoc.Shared (capitalize, isURI, orderedListMarkers,
@@ -307,7 +309,10 @@ blockToXml (CodeBlock _ s) = return . spaceBeforeAfter .
map (el "p" . el "code" . T.unpack) . T.lines $ s
blockToXml (RawBlock f str) =
if f == Format "fb2"
- then return $ XI.parseXML str
+ then
+ case parseXMLContents (TL.fromStrict str) of
+ Left msg -> throwError $ PandocXMLError "" msg
+ Right nds -> return nds
else return []
blockToXml (Div _ bs) = cMapM blockToXml bs
blockToXml (BlockQuote bs) = list . el "cite" <$> cMapM blockToXml bs
diff --git a/src/Text/Pandoc/Writers/ODT.hs b/src/Text/Pandoc/Writers/ODT.hs
index 05dfad5eb..a32ff618c 100644
--- a/src/Text/Pandoc/Writers/ODT.hs
+++ b/src/Text/Pandoc/Writers/ODT.hs
@@ -13,7 +13,7 @@ Conversion of 'Pandoc' documents to ODT.
-}
module Text.Pandoc.Writers.ODT ( writeODT ) where
import Codec.Archive.Zip
-import Control.Monad.Except (catchError)
+import Control.Monad.Except (catchError, throwError)
import Control.Monad.State.Strict
import qualified Data.ByteString.Lazy as B
import Data.Generics (everywhere', mkT)
@@ -27,6 +27,7 @@ import Text.Pandoc.BCP47 (Lang (..), getLang, renderLang)
import Text.Pandoc.Class.PandocMonad (PandocMonad, report, toLang)
import qualified Text.Pandoc.Class.PandocMonad as P
import Text.Pandoc.Definition
+import Text.Pandoc.Error (PandocError(..))
import Text.Pandoc.ImageSize
import Text.Pandoc.Logging
import Text.Pandoc.MIME (extensionFromMimeType, getMimeType)
@@ -35,10 +36,11 @@ import Text.DocLayout
import Text.Pandoc.Shared (stringify, pandocVersion, tshow)
import Text.Pandoc.Writers.Shared (lookupMetaString, lookupMetaBlocks,
fixDisplayMath)
-import Text.Pandoc.UTF8 (fromStringLazy, fromTextLazy, toStringLazy)
+import Text.Pandoc.UTF8 (fromStringLazy, fromTextLazy, toTextLazy)
import Text.Pandoc.Walk
import Text.Pandoc.Writers.OpenDocument (writeOpenDocument)
import Text.Pandoc.XML
+import Text.Pandoc.XMLParser (parseXMLElement)
import Text.TeXMath
import Text.XML.Light
@@ -172,17 +174,18 @@ updateStyleWithLang :: PandocMonad m => Maybe Lang -> Archive -> O m Archive
updateStyleWithLang Nothing arch = return arch
updateStyleWithLang (Just lang) arch = do
epochtime <- floor `fmap` lift P.getPOSIXTime
- return arch{ zEntries = [if eRelativePath e == "styles.xml"
- then case parseXMLDoc
- (toStringLazy (fromEntry e)) of
- Nothing -> e
- Just d ->
- toEntry "styles.xml" epochtime
- ( fromStringLazy
- . ppTopElement
- . addLang lang $ d )
- else e
- | e <- zEntries arch] }
+ entries <- mapM (\e -> if eRelativePath e == "styles.xml"
+ then case parseXMLElement
+ (toTextLazy (fromEntry e)) of
+ Left msg -> throwError $
+ PandocXMLError "styles.xml" msg
+ Right d -> return $
+ toEntry "styles.xml" epochtime
+ ( fromStringLazy
+ . ppTopElement
+ . addLang lang $ d )
+ else return e) (zEntries arch)
+ return arch{ zEntries = entries }
addLang :: Lang -> Element -> Element
addLang lang = everywhere' (mkT updateLangAttr)
diff --git a/src/Text/Pandoc/Writers/OOXML.hs b/src/Text/Pandoc/Writers/OOXML.hs
index 3ac007f4e..8f60e70d5 100644
--- a/src/Text/Pandoc/Writers/OOXML.hs
+++ b/src/Text/Pandoc/Writers/OOXML.hs
@@ -35,6 +35,7 @@ import qualified Data.Text as T
import Text.Pandoc.Class.PandocMonad (PandocMonad)
import qualified Text.Pandoc.UTF8 as UTF8
import Text.XML.Light as XML
+import Text.Pandoc.XMLParser (parseXMLElement)
mknode :: Node t => String -> [(String,String)] -> t -> Element
mknode s attrs =
@@ -62,10 +63,10 @@ parseXml refArchive distArchive relpath =
findEntryByPath relpath distArchive of
Nothing -> throwError $ PandocSomeError $
T.pack relpath <> " missing in reference file"
- Just e -> case parseXMLDoc . UTF8.toStringLazy . fromEntry $ e of
- Nothing -> throwError $ PandocSomeError $
- T.pack relpath <> " corrupt in reference file"
- Just d -> return d
+ Just e -> case parseXMLElement . UTF8.toTextLazy . fromEntry $ e of
+ Left msg ->
+ throwError $ PandocXMLError (T.pack relpath) msg
+ Right d -> return d
-- Copied from Util
diff --git a/src/Text/Pandoc/Writers/Powerpoint/Output.hs b/src/Text/Pandoc/Writers/Powerpoint/Output.hs
index 8554db622..cd092969b 100644
--- a/src/Text/Pandoc/Writers/Powerpoint/Output.hs
+++ b/src/Text/Pandoc/Writers/Powerpoint/Output.hs
@@ -29,6 +29,7 @@ import Data.Time.Clock (UTCTime)
import Data.Time.Clock.POSIX (utcTimeToPOSIXSeconds, posixSecondsToUTCTime)
import System.FilePath.Posix (splitDirectories, splitExtension, takeExtension)
import Text.XML.Light
+import Text.Pandoc.XMLParser (parseXMLElement)
import Text.Pandoc.Definition
import qualified Text.Pandoc.UTF8 as UTF8
import Text.Pandoc.Class.PandocMonad (PandocMonad)
@@ -77,7 +78,8 @@ getPresentationSize :: Archive -> Archive -> Maybe (Integer, Integer)
getPresentationSize refArchive distArchive = do
entry <- findEntryByPath "ppt/presentation.xml" refArchive `mplus`
findEntryByPath "ppt/presentation.xml" distArchive
- presElement <- parseXMLDoc $ UTF8.toStringLazy $ fromEntry entry
+ presElement <- either (const Nothing) return $
+ parseXMLElement $ UTF8.toTextLazy $ fromEntry entry
let ns = elemToNameSpaces presElement
sldSize <- findChild (elemName ns "p" "sldSz") presElement
cxS <- findAttr (QName "cx" Nothing Nothing) sldSize
diff --git a/src/Text/Pandoc/XMLParser.hs b/src/Text/Pandoc/XMLParser.hs
new file mode 100644
index 000000000..8ad22a66a
--- /dev/null
+++ b/src/Text/Pandoc/XMLParser.hs
@@ -0,0 +1,66 @@
+{-# LANGUAGE OverloadedStrings #-}
+{- |
+ Module : Text.Pandoc.XMLParser
+ Copyright : Copyright (C) 2021 John MacFarlane
+ License : GNU GPL, version 2 or above
+
+ Maintainer : John MacFarlane <jgm@berkeley.edu>
+ Stability : alpha
+ Portability : portable
+
+Bridge to allow using xml-conduit's parser with xml-light's types.
+-}
+module Text.Pandoc.XMLParser
+ ( parseXMLElement
+ , parseXMLContents
+ , module Text.XML.Light.Types
+ ) where
+
+import qualified Control.Exception as E
+import qualified Text.XML as Conduit
+import Text.XML.Unresolved (InvalidEventStream(..))
+import qualified Text.XML.Light as Light
+import Text.XML.Light.Types
+import qualified Data.Text as T
+import qualified Data.Text.Lazy as TL
+import qualified Data.Map as M
+import Data.Maybe (mapMaybe)
+
+-- Drop in replacement for parseXMLDoc in xml-light.
+parseXMLElement :: TL.Text -> Either T.Text Light.Element
+parseXMLElement t =
+ elementToElement . Conduit.documentRoot <$>
+ either (Left . T.pack . E.displayException) Right
+ (Conduit.parseText Conduit.def{ Conduit.psRetainNamespaces = True } t)
+
+parseXMLContents :: TL.Text -> Either T.Text [Light.Content]
+parseXMLContents t =
+ case Conduit.parseText Conduit.def{ Conduit.psRetainNamespaces = True } t of
+ Left e ->
+ case E.fromException e of
+ Just (ContentAfterRoot _) ->
+ elContent <$> parseXMLElement ("<wrapper>" <> t <> "</wrapper>")
+ _ -> Left . T.pack . E.displayException $ e
+ Right x -> Right [Light.Elem . elementToElement . Conduit.documentRoot $ x]
+
+elementToElement :: Conduit.Element -> Light.Element
+elementToElement (Conduit.Element name attribMap nodes) =
+ Light.Element (nameToQname name) attrs (mapMaybe nodeToContent nodes) Nothing
+ where
+ attrs = map (\(n,v) -> Light.Attr (nameToQname n) (T.unpack v)) $
+ M.toList attribMap
+ nameToQname (Conduit.Name localName mbns mbpref) =
+ case mbpref of
+ Nothing | "xmlns:" `T.isPrefixOf` localName ->
+ Light.QName (T.unpack $ T.drop 6 localName) (T.unpack <$> mbns)
+ (Just "xmlns")
+ _ -> Light.QName (T.unpack localName) (T.unpack <$> mbns)
+ (T.unpack <$> mbpref)
+
+nodeToContent :: Conduit.Node -> Maybe Light.Content
+nodeToContent (Conduit.NodeElement el) =
+ Just (Light.Elem (elementToElement el))
+nodeToContent (Conduit.NodeContent t) =
+ Just (Light.Text (Light.CData Light.CDataText (T.unpack t) Nothing))
+nodeToContent _ = Nothing
+