aboutsummaryrefslogtreecommitdiff
path: root/src/Text/Pandoc/Readers/OPML.hs
diff options
context:
space:
mode:
authorJohn MacFarlane <jgm@berkeley.edu>2021-02-08 23:35:19 -0800
committerJohn MacFarlane <jgm@berkeley.edu>2021-02-10 22:04:11 -0800
commit8ca191604dcd13af27c11d2da225da646ebce6fc (patch)
tree9663e0b951ecfce7efd08efd79dcd4b957601b85 /src/Text/Pandoc/Readers/OPML.hs
parent9994ad977d03e97baadf680793c58a66ba7e77e9 (diff)
downloadpandoc-8ca191604dcd13af27c11d2da225da646ebce6fc.tar.gz
Add new unexported module T.P.XMLParser.
This exports functions that uses xml-conduit's parser to produce an xml-light Element or [Content]. This allows existing pandoc code to use a better parser without much modification. The new parser is used in all places where xml-light's parser was previously used. Benchmarks show a significant performance improvement in parsing XML-based formats (especially ODT and FB2). Note that the xml-light types use String, so the conversion from xml-conduit types involves a lot of extra allocation. It would be desirable to avoid that in the future by gradually switching to using xml-conduit directly. This can be done module by module. The new parser also reports errors, which we report when possible. A new constructor PandocXMLError has been added to PandocError in T.P.Error [API change]. Closes #7091, which was the main stimulus. These changes revealed the need for some changes in the tests. The docbook-reader.docbook test lacked definitions for the entities it used; these have been added. And the docx golden tests have been updated, because the new parser does not preserve the order of attributes. Add entity defs to docbook-reader.docbook. Update golden tests for docx.
Diffstat (limited to 'src/Text/Pandoc/Readers/OPML.hs')
-rw-r--r--src/Text/Pandoc/Readers/OPML.hs10
1 files changed, 8 insertions, 2 deletions
diff --git a/src/Text/Pandoc/Readers/OPML.hs b/src/Text/Pandoc/Readers/OPML.hs
index 5b8996025..bdadc4dd9 100644
--- a/src/Text/Pandoc/Readers/OPML.hs
+++ b/src/Text/Pandoc/Readers/OPML.hs
@@ -19,14 +19,18 @@ import Data.Generics
import Data.Maybe (fromMaybe)
import Data.Text (Text)
import qualified Data.Text as T
+import qualified Data.Text.Lazy as TL
import Text.HTML.TagSoup.Entity (lookupEntity)
import Text.Pandoc.Builder
import Text.Pandoc.Class.PandocMonad (PandocMonad)
import Text.Pandoc.Options
+import Text.Pandoc.Error (PandocError(..))
import Text.Pandoc.Readers.HTML (readHtml)
import Text.Pandoc.Readers.Markdown (readMarkdown)
import Text.Pandoc.Shared (crFilter, blocksToInlines')
import Text.XML.Light
+import Text.Pandoc.XMLParser (parseXMLContents)
+import Control.Monad.Except (throwError)
type OPML m = StateT OPMLState m
@@ -49,8 +53,10 @@ instance Default OPMLState where
readOPML :: PandocMonad m => ReaderOptions -> Text -> m Pandoc
readOPML opts inp = do
(bs, st') <- runStateT
- (mapM parseBlock $ normalizeTree $
- parseXML (T.unpack (crFilter inp))) def{ opmlOptions = opts }
+ (case parseXMLContents (TL.fromStrict (crFilter inp)) of
+ Left msg -> throwError $ PandocXMLError "" msg
+ Right ns -> mapM parseBlock $ normalizeTree ns)
+ def{ opmlOptions = opts }
return $
setTitle (opmlDocTitle st') $
setAuthors (opmlDocAuthors st') $