aboutsummaryrefslogtreecommitdiff
path: root/src/Text/Pandoc/Readers
diff options
context:
space:
mode:
authorJohn MacFarlane <jgm@berkeley.edu>2021-02-08 23:35:19 -0800
committerJohn MacFarlane <jgm@berkeley.edu>2021-02-10 22:04:11 -0800
commit8ca191604dcd13af27c11d2da225da646ebce6fc (patch)
tree9663e0b951ecfce7efd08efd79dcd4b957601b85 /src/Text/Pandoc/Readers
parent9994ad977d03e97baadf680793c58a66ba7e77e9 (diff)
downloadpandoc-8ca191604dcd13af27c11d2da225da646ebce6fc.tar.gz
Add new unexported module T.P.XMLParser.
This exports functions that uses xml-conduit's parser to produce an xml-light Element or [Content]. This allows existing pandoc code to use a better parser without much modification. The new parser is used in all places where xml-light's parser was previously used. Benchmarks show a significant performance improvement in parsing XML-based formats (especially ODT and FB2). Note that the xml-light types use String, so the conversion from xml-conduit types involves a lot of extra allocation. It would be desirable to avoid that in the future by gradually switching to using xml-conduit directly. This can be done module by module. The new parser also reports errors, which we report when possible. A new constructor PandocXMLError has been added to PandocError in T.P.Error [API change]. Closes #7091, which was the main stimulus. These changes revealed the need for some changes in the tests. The docbook-reader.docbook test lacked definitions for the entities it used; these have been added. And the docx golden tests have been updated, because the new parser does not preserve the order of attributes. Add entity defs to docbook-reader.docbook. Update golden tests for docx.
Diffstat (limited to 'src/Text/Pandoc/Readers')
-rw-r--r--src/Text/Pandoc/Readers/DocBook.hs52
-rw-r--r--src/Text/Pandoc/Readers/Docx/Parse.hs21
-rw-r--r--src/Text/Pandoc/Readers/Docx/Parse/Styles.hs28
-rw-r--r--src/Text/Pandoc/Readers/EPUB.hs17
-rw-r--r--src/Text/Pandoc/Readers/FB2.hs10
-rw-r--r--src/Text/Pandoc/Readers/JATS.hs9
-rw-r--r--src/Text/Pandoc/Readers/OPML.hs10
-rw-r--r--src/Text/Pandoc/Readers/Odt.hs24
8 files changed, 109 insertions, 62 deletions
diff --git a/src/Text/Pandoc/Readers/DocBook.hs b/src/Text/Pandoc/Readers/DocBook.hs
index ada3e98ec..ad0108843 100644
--- a/src/Text/Pandoc/Readers/DocBook.hs
+++ b/src/Text/Pandoc/Readers/DocBook.hs
@@ -12,7 +12,7 @@ Conversion of DocBook XML to 'Pandoc' document.
-}
module Text.Pandoc.Readers.DocBook ( readDocBook ) where
import Control.Monad.State.Strict
-import Data.Char (isSpace, toUpper)
+import Data.Char (isSpace, toUpper, isLetter)
import Data.Default
import Data.Either (rights)
import Data.Foldable (asum)
@@ -21,7 +21,10 @@ import Data.List (intersperse,elemIndex)
import Data.Maybe (fromMaybe,mapMaybe)
import Data.Text (Text)
import qualified Data.Text as T
+import qualified Data.Text.Lazy as TL
+import Control.Monad.Except (throwError)
import Text.HTML.TagSoup.Entity (lookupEntity)
+import Text.Pandoc.Error (PandocError(..))
import Text.Pandoc.Builder
import Text.Pandoc.Class.PandocMonad (PandocMonad, report)
import Text.Pandoc.Options
@@ -29,6 +32,7 @@ import Text.Pandoc.Logging (LogMessage(..))
import Text.Pandoc.Shared (crFilter, safeRead, extractSpaces)
import Text.TeXMath (readMathML, writeTeX)
import Text.XML.Light
+import Text.Pandoc.XMLParser (parseXMLContents)
{-
@@ -537,22 +541,25 @@ instance Default DBState where
readDocBook :: PandocMonad m => ReaderOptions -> Text -> m Pandoc
readDocBook _ inp = do
- let tree = normalizeTree . parseXML . handleInstructions $ crFilter inp
+ tree <- either (throwError . PandocXMLError "") (return . normalizeTree) $
+ parseXMLContents (TL.fromStrict . handleInstructions $ crFilter inp)
(bs, st') <- flip runStateT (def{ dbContent = tree }) $ mapM parseBlock tree
return $ Pandoc (dbMeta st') (toList . mconcat $ bs)
--- We treat <?asciidoc-br?> specially (issue #1236), converting it
--- to <br/>, since xml-light doesn't parse the instruction correctly.
--- Other xml instructions are simply removed from the input stream.
+-- We treat certain processing instructions by converting them to tags
+-- beginning "pi-".
handleInstructions :: Text -> Text
-handleInstructions = T.pack . handleInstructions' . T.unpack
-
-handleInstructions' :: String -> String
-handleInstructions' ('<':'?':'a':'s':'c':'i':'i':'d':'o':'c':'-':'b':'r':'?':'>':xs) = '<':'b':'r':'/':'>': handleInstructions' xs
-handleInstructions' xs = case break (=='<') xs of
- (ys, []) -> ys
- ([], '<':zs) -> '<' : handleInstructions' zs
- (ys, zs) -> ys ++ handleInstructions' zs
+handleInstructions t =
+ let (x,y) = T.breakOn "<?" t
+ in if T.null y
+ then x
+ else
+ let (w,z) = T.breakOn "?>" y
+ in (if T.takeWhile (\c -> isLetter c || c == '-')
+ (T.drop 2 w) `elem` ["asciidoc-br", "dbfo"]
+ then x <> "<pi-" <> T.drop 2 w <> "/>"
+ else x <> w <> T.take 2 z) <>
+ handleInstructions (T.drop 2 z)
getFigure :: PandocMonad m => Element -> DB m Blocks
getFigure e = do
@@ -892,7 +899,11 @@ parseBlock (Elem e) =
"subtitle" -> return mempty -- handled in parent element
_ -> skip >> getBlocks e
where skip = do
- lift $ report $ IgnoredElement $ T.pack $ qName (elName e)
+ let qn = T.pack $ qName $ elName e
+ let name = if "pi-" `T.isPrefixOf` qn
+ then "<?" <> qn <> "?>"
+ else qn
+ lift $ report $ IgnoredElement name
return mempty
codeBlockWithLang = do
@@ -964,7 +975,7 @@ parseBlock (Elem e) =
cs -> map toAlignment cs
let parseWidth s = safeRead (T.filter (\x -> (x >= '0' && x <= '9')
|| x == '.') s)
- let textWidth = case filterChild (named "?dbfo") e of
+ let textWidth = case filterChild (named "pi-dbfo") e of
Just d -> case attrValue "table-width" d of
"" -> 1.0
w -> fromMaybe 100.0 (parseWidth w) / 100.0
@@ -1165,12 +1176,15 @@ parseInline (Elem e) =
"title" -> return mempty
"affiliation" -> skip
-- Note: this isn't a real docbook tag; it's what we convert
- -- <?asciidor-br?> to in handleInstructions, above. A kludge to
- -- work around xml-light's inability to parse an instruction.
- "br" -> return linebreak
+ -- <?asciidor-br?> to in handleInstructions, above.
+ "pi-asciidoc-br" -> return linebreak
_ -> skip >> innerInlines id
where skip = do
- lift $ report $ IgnoredElement $ T.pack $ qName (elName e)
+ let qn = T.pack $ qName $ elName e
+ let name = if "pi-" `T.isPrefixOf` qn
+ then "<?" <> qn <> "?>"
+ else qn
+ lift $ report $ IgnoredElement name
return mempty
innerInlines f = extractSpaces f . mconcat <$>
diff --git a/src/Text/Pandoc/Readers/Docx/Parse.hs b/src/Text/Pandoc/Readers/Docx/Parse.hs
index fdcffcc3f..056dab6c2 100644
--- a/src/Text/Pandoc/Readers/Docx/Parse.hs
+++ b/src/Text/Pandoc/Readers/Docx/Parse.hs
@@ -74,6 +74,7 @@ import Text.TeXMath.Readers.OMML (readOMML)
import Text.TeXMath.Unicode.Fonts (Font (..), getUnicode, textToFont)
import Text.XML.Light
import qualified Text.XML.Light.Cursor as XMLC
+import Text.Pandoc.XMLParser (parseXMLElement)
data ReaderEnv = ReaderEnv { envNotes :: Notes
, envComments :: Comments
@@ -343,10 +344,16 @@ archiveToDocxWithWarnings archive = do
Right doc -> Right (Docx doc, stateWarnings st)
Left e -> Left e
+parseXMLFromEntry :: Entry -> Maybe Element
+parseXMLFromEntry entry =
+ case parseXMLElement (UTF8.toTextLazy (fromEntry entry)) of
+ Left _ -> Nothing
+ Right el -> Just el
+
getDocumentXmlPath :: Archive -> Maybe FilePath
getDocumentXmlPath zf = do
entry <- findEntryByPath "_rels/.rels" zf
- relsElem <- (parseXMLDoc . UTF8.toStringLazy . fromEntry) entry
+ relsElem <- parseXMLFromEntry entry
let rels = filterChildrenName (\n -> qName n == "Relationship") relsElem
rel <- find (\e -> findAttr (QName "Type" Nothing Nothing) e ==
Just "http://schemas.openxmlformats.org/officeDocument/2006/relationships/officeDocument")
@@ -362,7 +369,7 @@ archiveToDocument :: Archive -> D Document
archiveToDocument zf = do
docPath <- asks envDocXmlPath
entry <- maybeToD $ findEntryByPath docPath zf
- docElem <- maybeToD $ (parseXMLDoc . UTF8.toStringLazy . fromEntry) entry
+ docElem <- maybeToD $ parseXMLFromEntry entry
let namespaces = elemToNameSpaces docElem
bodyElem <- maybeToD $ findChildByName namespaces "w" "body" docElem
let bodyElem' = fromMaybe bodyElem (walkDocument namespaces bodyElem)
@@ -401,9 +408,9 @@ constructBogusParStyleData stName = ParStyle
archiveToNotes :: Archive -> Notes
archiveToNotes zf =
let fnElem = findEntryByPath "word/footnotes.xml" zf
- >>= (parseXMLDoc . UTF8.toStringLazy . fromEntry)
+ >>= parseXMLFromEntry
enElem = findEntryByPath "word/endnotes.xml" zf
- >>= (parseXMLDoc . UTF8.toStringLazy . fromEntry)
+ >>= parseXMLFromEntry
fn_namespaces = maybe [] elemToNameSpaces fnElem
en_namespaces = maybe [] elemToNameSpaces enElem
ns = unionBy (\x y -> fst x == fst y) fn_namespaces en_namespaces
@@ -415,7 +422,7 @@ archiveToNotes zf =
archiveToComments :: Archive -> Comments
archiveToComments zf =
let cmtsElem = findEntryByPath "word/comments.xml" zf
- >>= (parseXMLDoc . UTF8.toStringLazy . fromEntry)
+ >>= parseXMLFromEntry
cmts_namespaces = maybe [] elemToNameSpaces cmtsElem
cmts = elemToComments cmts_namespaces <$> (cmtsElem >>= walkDocument cmts_namespaces)
in
@@ -445,7 +452,7 @@ filePathToRelationships :: Archive -> FilePath -> FilePath -> [Relationship]
filePathToRelationships ar docXmlPath fp
| Just relType <- filePathToRelType fp docXmlPath
, Just entry <- findEntryByPath fp ar
- , Just relElems <- (parseXMLDoc . UTF8.toStringLazy . fromEntry) entry =
+ , Just relElems <- parseXMLFromEntry entry =
mapMaybe (relElemToRelationship relType) $ elChildren relElems
filePathToRelationships _ _ _ = []
@@ -527,7 +534,7 @@ archiveToNumbering' zf =
case findEntryByPath "word/numbering.xml" zf of
Nothing -> Just $ Numbering [] [] []
Just entry -> do
- numberingElem <- (parseXMLDoc . UTF8.toStringLazy . fromEntry) entry
+ numberingElem <- parseXMLFromEntry entry
let namespaces = elemToNameSpaces numberingElem
numElems = findChildrenByName namespaces "w" "num" numberingElem
absNumElems = findChildrenByName namespaces "w" "abstractNum" numberingElem
diff --git a/src/Text/Pandoc/Readers/Docx/Parse/Styles.hs b/src/Text/Pandoc/Readers/Docx/Parse/Styles.hs
index 236167187..edade8654 100644
--- a/src/Text/Pandoc/Readers/Docx/Parse/Styles.hs
+++ b/src/Text/Pandoc/Readers/Docx/Parse/Styles.hs
@@ -53,6 +53,7 @@ import Data.Coerce
import Text.Pandoc.Readers.Docx.Util
import qualified Text.Pandoc.UTF8 as UTF8
import Text.XML.Light
+import Text.Pandoc.XMLParser (parseXMLElement)
newtype CharStyleId = CharStyleId T.Text
deriving (Show, Eq, Ord, IsString, FromStyleId)
@@ -135,19 +136,22 @@ defaultRunStyle = RunStyle { isBold = Nothing
, rParentStyle = Nothing
}
-archiveToStyles' :: (Ord k1, Ord k2, ElemToStyle a1, ElemToStyle a2) =>
- (a1 -> k1) -> (a2 -> k2) -> Archive -> (M.Map k1 a1, M.Map k2 a2)
+archiveToStyles'
+ :: (Ord k1, Ord k2, ElemToStyle a1, ElemToStyle a2)
+ => (a1 -> k1) -> (a2 -> k2) -> Archive -> (M.Map k1 a1, M.Map k2 a2)
archiveToStyles' conv1 conv2 zf =
- let stylesElem = findEntryByPath "word/styles.xml" zf >>=
- (parseXMLDoc . UTF8.toStringLazy . fromEntry)
- in
- case stylesElem of
- Nothing -> (M.empty, M.empty)
- Just styElem ->
- let namespaces = elemToNameSpaces styElem
- in
- ( M.fromList $ map (\r -> (conv1 r, r)) $ buildBasedOnList namespaces styElem Nothing,
- M.fromList $ map (\p -> (conv2 p, p)) $ buildBasedOnList namespaces styElem Nothing)
+ case findEntryByPath "word/styles.xml" zf of
+ Nothing -> (M.empty, M.empty)
+ Just entry ->
+ case parseXMLElement . UTF8.toTextLazy . fromEntry $ entry of
+ Left _ -> (M.empty, M.empty)
+ Right styElem ->
+ let namespaces = elemToNameSpaces styElem
+ in
+ ( M.fromList $ map (\r -> (conv1 r, r)) $
+ buildBasedOnList namespaces styElem Nothing,
+ M.fromList $ map (\p -> (conv2 p, p)) $
+ buildBasedOnList namespaces styElem Nothing)
isBasedOnStyle :: (ElemToStyle a, FromStyleId (StyleId a)) => NameSpaces -> Element -> Maybe a -> Bool
isBasedOnStyle ns element parentStyle
diff --git a/src/Text/Pandoc/Readers/EPUB.hs b/src/Text/Pandoc/Readers/EPUB.hs
index 5e3326e6d..369c4f0c9 100644
--- a/src/Text/Pandoc/Readers/EPUB.hs
+++ b/src/Text/Pandoc/Readers/EPUB.hs
@@ -17,7 +17,7 @@ module Text.Pandoc.Readers.EPUB
(readEPUB)
where
-import Codec.Archive.Zip (Archive (..), Entry, findEntryByPath, fromEntry,
+import Codec.Archive.Zip (Archive (..), Entry(..), findEntryByPath, fromEntry,
toArchiveOrFail)
import Control.DeepSeq (NFData, deepseq)
import Control.Monad (guard, liftM, liftM2, mplus)
@@ -41,9 +41,10 @@ import Text.Pandoc.MIME (MimeType)
import Text.Pandoc.Options (ReaderOptions (..))
import Text.Pandoc.Readers.HTML (readHtml)
import Text.Pandoc.Shared (addMetaField, collapseFilePath, escapeURI)
-import qualified Text.Pandoc.UTF8 as UTF8 (toStringLazy)
+import qualified Text.Pandoc.UTF8 as UTF8 (toTextLazy)
import Text.Pandoc.Walk (query, walk)
import Text.XML.Light
+import Text.Pandoc.XMLParser (parseXMLElement)
type Items = M.Map String (FilePath, MimeType)
@@ -181,7 +182,7 @@ renameMeta s = T.pack s
getManifest :: PandocMonad m => Archive -> m (String, Element)
getManifest archive = do
metaEntry <- findEntryByPathE ("META-INF" </> "container.xml") archive
- docElem <- (parseXMLDocE . UTF8.toStringLazy . fromEntry) metaEntry
+ docElem <- parseXMLDocE metaEntry
let namespaces = mapMaybe attrToNSPair (elAttribs docElem)
ns <- mkE "xmlns not in namespaces" (lookup "xmlns" namespaces)
as <- fmap (map attrToPair . elAttribs)
@@ -190,7 +191,7 @@ getManifest archive = do
let rootdir = dropFileName manifestFile
--mime <- lookup "media-type" as
manifest <- findEntryByPathE manifestFile archive
- (rootdir,) <$> (parseXMLDocE . UTF8.toStringLazy . fromEntry $ manifest)
+ (rootdir,) <$> parseXMLDocE manifest
-- Fixup
@@ -284,8 +285,12 @@ findEntryByPathE :: PandocMonad m => FilePath -> Archive -> m Entry
findEntryByPathE (normalise . unEscapeString -> path) a =
mkE ("No entry on path: " ++ path) $ findEntryByPath path a
-parseXMLDocE :: PandocMonad m => String -> m Element
-parseXMLDocE doc = mkE "Unable to parse XML doc" $ parseXMLDoc doc
+parseXMLDocE :: PandocMonad m => Entry -> m Element
+parseXMLDocE entry =
+ either (throwError . PandocXMLError fp) return $ parseXMLElement doc
+ where
+ doc = UTF8.toTextLazy . fromEntry $ entry
+ fp = T.pack $ eRelativePath entry
findElementE :: PandocMonad m => QName -> Element -> m Element
findElementE e x = mkE ("Unable to find element: " ++ show e) $ findElement e x
diff --git a/src/Text/Pandoc/Readers/FB2.hs b/src/Text/Pandoc/Readers/FB2.hs
index b0d2f092b..b804eab4f 100644
--- a/src/Text/Pandoc/Readers/FB2.hs
+++ b/src/Text/Pandoc/Readers/FB2.hs
@@ -32,6 +32,7 @@ import Data.List (intersperse)
import qualified Data.Map as M
import Data.Text (Text)
import qualified Data.Text as T
+import qualified Data.Text.Lazy as TL
import Data.Default
import Data.Maybe
import Text.HTML.TagSoup.Entity (lookupEntity)
@@ -42,6 +43,7 @@ import Text.Pandoc.Logging
import Text.Pandoc.Options
import Text.Pandoc.Shared (crFilter)
import Text.XML.Light
+import Text.Pandoc.XMLParser (parseXMLElement)
type FB2 m = StateT FB2State m
@@ -64,10 +66,10 @@ instance HasMeta FB2State where
readFB2 :: PandocMonad m => ReaderOptions -> Text -> m Pandoc
readFB2 _ inp =
- case parseXMLDoc $ crFilter inp of
- Nothing -> throwError $ PandocParseError "Not an XML document"
- Just e -> do
- (bs, st) <- runStateT (parseRootElement e) def
+ case parseXMLElement $ TL.fromStrict $ crFilter inp of
+ Left msg -> throwError $ PandocXMLError "" msg
+ Right el -> do
+ (bs, st) <- runStateT (parseRootElement el) def
let authors = if null $ fb2Authors st
then id
else setMeta "author" (map text $ reverse $ fb2Authors st)
diff --git a/src/Text/Pandoc/Readers/JATS.hs b/src/Text/Pandoc/Readers/JATS.hs
index c638da519..dfd343b7a 100644
--- a/src/Text/Pandoc/Readers/JATS.hs
+++ b/src/Text/Pandoc/Readers/JATS.hs
@@ -14,6 +14,8 @@ Conversion of JATS XML to 'Pandoc' document.
module Text.Pandoc.Readers.JATS ( readJATS ) where
import Control.Monad.State.Strict
+import Control.Monad.Except (throwError)
+import Text.Pandoc.Error (PandocError(..))
import Data.Char (isDigit, isSpace, toUpper)
import Data.Default
import Data.Generics
@@ -22,6 +24,7 @@ import qualified Data.Map as Map
import Data.Maybe (maybeToList, fromMaybe)
import Data.Text (Text)
import qualified Data.Text as T
+import qualified Data.Text.Lazy as TL
import Text.HTML.TagSoup.Entity (lookupEntity)
import Text.Pandoc.Builder
import Text.Pandoc.Class.PandocMonad (PandocMonad)
@@ -29,6 +32,7 @@ import Text.Pandoc.Options
import Text.Pandoc.Shared (crFilter, safeRead, extractSpaces)
import Text.TeXMath (readMathML, writeTeX)
import Text.XML.Light
+import Text.Pandoc.XMLParser (parseXMLContents)
import qualified Data.Set as S (fromList, member)
import Data.Set ((\\))
@@ -51,8 +55,9 @@ instance Default JATSState where
readJATS :: PandocMonad m => ReaderOptions -> Text -> m Pandoc
readJATS _ inp = do
- let tree = normalizeTree . parseXML
- $ T.unpack $ crFilter inp
+ tree <- either (throwError . PandocXMLError "")
+ (return . normalizeTree) $
+ parseXMLContents (TL.fromStrict $ crFilter inp)
(bs, st') <- flip runStateT (def{ jatsContent = tree }) $ mapM parseBlock tree
return $ Pandoc (jatsMeta st') (toList . mconcat $ bs)
diff --git a/src/Text/Pandoc/Readers/OPML.hs b/src/Text/Pandoc/Readers/OPML.hs
index 5b8996025..bdadc4dd9 100644
--- a/src/Text/Pandoc/Readers/OPML.hs
+++ b/src/Text/Pandoc/Readers/OPML.hs
@@ -19,14 +19,18 @@ import Data.Generics
import Data.Maybe (fromMaybe)
import Data.Text (Text)
import qualified Data.Text as T
+import qualified Data.Text.Lazy as TL
import Text.HTML.TagSoup.Entity (lookupEntity)
import Text.Pandoc.Builder
import Text.Pandoc.Class.PandocMonad (PandocMonad)
import Text.Pandoc.Options
+import Text.Pandoc.Error (PandocError(..))
import Text.Pandoc.Readers.HTML (readHtml)
import Text.Pandoc.Readers.Markdown (readMarkdown)
import Text.Pandoc.Shared (crFilter, blocksToInlines')
import Text.XML.Light
+import Text.Pandoc.XMLParser (parseXMLContents)
+import Control.Monad.Except (throwError)
type OPML m = StateT OPMLState m
@@ -49,8 +53,10 @@ instance Default OPMLState where
readOPML :: PandocMonad m => ReaderOptions -> Text -> m Pandoc
readOPML opts inp = do
(bs, st') <- runStateT
- (mapM parseBlock $ normalizeTree $
- parseXML (T.unpack (crFilter inp))) def{ opmlOptions = opts }
+ (case parseXMLContents (TL.fromStrict (crFilter inp)) of
+ Left msg -> throwError $ PandocXMLError "" msg
+ Right ns -> mapM parseBlock $ normalizeTree ns)
+ def{ opmlOptions = opts }
return $
setTitle (opmlDocTitle st') $
setAuthors (opmlDocAuthors st') $
diff --git a/src/Text/Pandoc/Readers/Odt.hs b/src/Text/Pandoc/Readers/Odt.hs
index 9943d3147..85308deb1 100644
--- a/src/Text/Pandoc/Readers/Odt.hs
+++ b/src/Text/Pandoc/Readers/Odt.hs
@@ -15,6 +15,7 @@ module Text.Pandoc.Readers.Odt ( readOdt ) where
import Codec.Archive.Zip
import qualified Text.XML.Light as XML
+import Text.Pandoc.XMLParser (parseXMLElement)
import qualified Data.ByteString.Lazy as B
@@ -66,18 +67,18 @@ bytesToOdt bytes = case toArchiveOrFail bytes of
--
archiveToOdt :: Archive -> Either PandocError (Pandoc, MediaBag)
-archiveToOdt archive = either (Left. PandocParseError) Right $ do
- let onFailure msg Nothing = Left msg
+archiveToOdt archive = do
+ let onFailure msg Nothing = Left $ PandocParseError msg
onFailure _ (Just x) = Right x
contentEntry <- onFailure "Could not find content.xml"
(findEntryByPath "content.xml" archive)
stylesEntry <- onFailure "Could not find styles.xml"
(findEntryByPath "styles.xml" archive)
- contentElem <- onFailure "Could not find content element"
- (entryToXmlElem contentEntry)
- stylesElem <- onFailure "Could not find styles element"
- (entryToXmlElem stylesEntry)
- styles <- either (\_ -> Left "Could not read styles") Right
+ contentElem <- entryToXmlElem contentEntry
+ stylesElem <- entryToXmlElem stylesEntry
+ styles <- either
+ (\_ -> Left $ PandocParseError "Could not read styles")
+ Right
(chooseMax (readStylesAt stylesElem ) (readStylesAt contentElem))
let filePathIsOdtMedia :: FilePath -> Bool
filePathIsOdtMedia fp =
@@ -85,10 +86,13 @@ archiveToOdt archive = either (Left. PandocParseError) Right $ do
in (dir == "Pictures/") || (dir /= "./" && name == "content.xml")
let media = filteredFilesFromArchive archive filePathIsOdtMedia
let startState = readerState styles media
- either (\_ -> Left "Could not convert opendocument") Right
+ either (\_ -> Left $ PandocParseError "Could not convert opendocument") Right
(runConverter' read_body startState contentElem)
--
-entryToXmlElem :: Entry -> Maybe XML.Element
-entryToXmlElem = XML.parseXMLDoc . UTF8.toStringLazy . fromEntry
+entryToXmlElem :: Entry -> Either PandocError XML.Element
+entryToXmlElem entry =
+ case parseXMLElement . UTF8.toTextLazy . fromEntry $ entry of
+ Right x -> Right x
+ Left msg -> Left $ PandocXMLError (T.pack $ eRelativePath entry) msg