aboutsummaryrefslogtreecommitdiff
path: root/src/Text/Pandoc/Readers/Odt
diff options
context:
space:
mode:
authorJohn MacFarlane <jgm@berkeley.edu>2021-02-14 22:29:21 -0800
committerJohn MacFarlane <jgm@berkeley.edu>2021-02-16 16:55:20 -0800
commit967e7f5fb990b29de48b37be1db40fb149a8cf55 (patch)
treeb9f903a5f2af14f20e769903e80659b9bffd59ff /src/Text/Pandoc/Readers/Odt
parentb5b576184c3c1668aad0c904e186136b81a0dd54 (diff)
downloadpandoc-967e7f5fb990b29de48b37be1db40fb149a8cf55.tar.gz
Rename Text.Pandoc.XMLParser -> Text.Pandoc.XML.Light...
..and add new definitions isomorphic to xml-light's, but with Text instead of String. This allows us to keep most of the code in existing readers that use xml-light, but avoid lots of unnecessary allocation. We also add versions of the functions from xml-light's Text.XML.Light.Output and Text.XML.Light.Proc that operate on our modified XML types, and functions that convert xml-light types to our types (since some of our dependencies, like texmath, use xml-light). Update golden tests for docx and pptx. OOXML test: Use `showContent` instead of `ppContent` in `displayDiff`. Docx: Do a manual traversal to unwrap sdt and smartTag. This is faster, and needed to pass the tests. Benchmarks: A = prior to 8ca191604dcd13af27c11d2da225da646ebce6fc (Feb 8) B = as of 8ca191604dcd13af27c11d2da225da646ebce6fc (Feb 8) C = this commit | Reader | A | B | C | | ------- | ----- | ------ | ----- | | docbook | 18 ms | 12 ms | 10 ms | | opml | 65 ms | 62 ms | 35 ms | | jats | 15 ms | 11 ms | 9 ms | | docx | 72 ms | 69 ms | 44 ms | | odt | 78 ms | 41 ms | 28 ms | | epub | 64 ms | 61 ms | 56 ms | | fb2 | 14 ms | 5 ms | 4 ms |
Diffstat (limited to 'src/Text/Pandoc/Readers/Odt')
-rw-r--r--src/Text/Pandoc/Readers/Odt/ContentReader.hs13
-rw-r--r--src/Text/Pandoc/Readers/Odt/Generic/Namespaces.hs3
-rw-r--r--src/Text/Pandoc/Readers/Odt/Generic/Utils.hs33
-rw-r--r--src/Text/Pandoc/Readers/Odt/Generic/XMLConverter.hs23
-rw-r--r--src/Text/Pandoc/Readers/Odt/Namespaces.hs11
-rw-r--r--src/Text/Pandoc/Readers/Odt/StyleReader.hs23
6 files changed, 48 insertions, 58 deletions
diff --git a/src/Text/Pandoc/Readers/Odt/ContentReader.hs b/src/Text/Pandoc/Readers/Odt/ContentReader.hs
index 43c44e7e9..df90880fa 100644
--- a/src/Text/Pandoc/Readers/Odt/ContentReader.hs
+++ b/src/Text/Pandoc/Readers/Odt/ContentReader.hs
@@ -29,14 +29,14 @@ import Control.Monad ((<=<))
import qualified Data.ByteString.Lazy as B
import Data.Foldable (fold)
-import Data.List (find, stripPrefix)
+import Data.List (find)
import qualified Data.Map as M
import qualified Data.Text as T
import Data.Maybe
import Data.Semigroup (First(..), Option(..))
import Text.TeXMath (readMathML, writeTeX)
-import qualified Text.XML.Light as XML
+import qualified Text.Pandoc.XML.Light as XML
import Text.Pandoc.Builder hiding (underline)
import Text.Pandoc.MediaBag (MediaBag, insertMedia)
@@ -557,7 +557,7 @@ read_plain_text = fst ^&&& read_plain_text' >>% recover
>>?% mappend
--
extractText :: XML.Content -> Fallible T.Text
- extractText (XML.Text cData) = succeedWith (T.pack $ XML.cdData cData)
+ extractText (XML.Text cData) = succeedWith (XML.cdData cData)
extractText _ = failEmpty
read_text_seq :: InlineMatcher
@@ -777,14 +777,14 @@ read_frame_img =
"" -> returnV mempty -< ()
src' -> do
let exts = extensionsFromList [Ext_auto_identifiers]
- resource <- lookupResource -< src'
+ resource <- lookupResource -< T.unpack src'
_ <- updateMediaWithResource -< resource
w <- findAttrText' NsSVG "width" -< ()
h <- findAttrText' NsSVG "height" -< ()
titleNodes <- matchChildContent' [ read_frame_title ] -< ()
alt <- matchChildContent [] read_plain_text -< ()
arr (firstMatch . uncurry4 imageWith) -<
- (image_attributes w h, T.pack src', inlineListToIdentifier exts (toList titleNodes), alt)
+ (image_attributes w h, src', inlineListToIdentifier exts (toList titleNodes), alt)
read_frame_title :: InlineMatcher
read_frame_title = matchingElement NsSVG "title" (matchChildContent [] read_plain_text)
@@ -804,7 +804,8 @@ read_frame_mathml =
case fold src of
"" -> returnV mempty -< ()
src' -> do
- let path = fromMaybe src' (stripPrefix "./" src') ++ "/content.xml"
+ let path = T.unpack $
+ fromMaybe src' (T.stripPrefix "./" src') <> "/content.xml"
(_, mathml) <- lookupResource -< path
case readMathML (UTF8.toText $ B.toStrict mathml) of
Left _ -> returnV mempty -< ()
diff --git a/src/Text/Pandoc/Readers/Odt/Generic/Namespaces.hs b/src/Text/Pandoc/Readers/Odt/Generic/Namespaces.hs
index 77174c793..78a7fc0b2 100644
--- a/src/Text/Pandoc/Readers/Odt/Generic/Namespaces.hs
+++ b/src/Text/Pandoc/Readers/Odt/Generic/Namespaces.hs
@@ -14,9 +14,10 @@ typesafe Haskell namespace identifiers and unsafe "real world" namespaces.
module Text.Pandoc.Readers.Odt.Generic.Namespaces where
import qualified Data.Map as M
+import Data.Text (Text)
--
-type NameSpaceIRI = String
+type NameSpaceIRI = Text
--
type NameSpaceIRIs nsID = M.Map nsID NameSpaceIRI
diff --git a/src/Text/Pandoc/Readers/Odt/Generic/Utils.hs b/src/Text/Pandoc/Readers/Odt/Generic/Utils.hs
index 6dc56a0d9..edefe3c70 100644
--- a/src/Text/Pandoc/Readers/Odt/Generic/Utils.hs
+++ b/src/Text/Pandoc/Readers/Odt/Generic/Utils.hs
@@ -20,7 +20,6 @@ module Text.Pandoc.Readers.Odt.Generic.Utils
, reverseComposition
, tryToRead
, Lookupable(..)
-, readLookupables
, readLookupable
, readPercent
, findBy
@@ -30,11 +29,11 @@ module Text.Pandoc.Readers.Odt.Generic.Utils
import Control.Category (Category, (<<<), (>>>))
import qualified Control.Category as Cat (id)
-import Control.Monad (msum)
-
+import Data.Char (isSpace)
import qualified Data.Foldable as F (Foldable, foldr)
import Data.Maybe
-
+import Data.Text (Text)
+import qualified Data.Text as T
-- | Equivalent to
-- > foldr (.) id
@@ -76,8 +75,8 @@ swing = flip.(.flip id)
-- (nobody wants that) while the latter returns "to much" for simple purposes.
-- This function instead applies 'reads' and returns the first match (if any)
-- in a 'Maybe'.
-tryToRead :: (Read r) => String -> Maybe r
-tryToRead = reads >>> listToMaybe >>> fmap fst
+tryToRead :: (Read r) => Text -> Maybe r
+tryToRead = (reads . T.unpack) >>> listToMaybe >>> fmap fst
-- | A version of 'reads' that requires a '%' sign after the number
readPercent :: ReadS Int
@@ -88,26 +87,12 @@ readPercent s = [ (i,s') | (i , r ) <- reads s
-- | Data that can be looked up.
-- This is mostly a utility to read data with kind *.
class Lookupable a where
- lookupTable :: [(String, a)]
-
--- | The idea is to use this function as if there was a declaration like
---
--- > instance (Lookupable a) => (Read a) where
--- > readsPrec _ = readLookupables
--- .
--- But including this code in this form would need UndecideableInstances.
--- That is a bad idea. Luckily 'readLookupable' (without the s at the end)
--- can be used directly in almost any case.
-readLookupables :: (Lookupable a) => String -> [(a,String)]
-readLookupables s = [ (a,rest) | (word,rest) <- lex s,
- a <- maybeToList (lookup word lookupTable)
- ]
+ lookupTable :: [(Text, a)]
-- | Very similar to a simple 'lookup' in the 'lookupTable', but with a lexer.
-readLookupable :: (Lookupable a) => String -> Maybe a
-readLookupable s = msum
- $ map ((`lookup` lookupTable).fst)
- $ lex s
+readLookupable :: (Lookupable a) => Text -> Maybe a
+readLookupable s =
+ lookup (T.takeWhile (not . isSpace) $ T.dropWhile isSpace s) lookupTable
uncurry3 :: (a->b->c -> z) -> (a,b,c ) -> z
uncurry4 :: (a->b->c->d -> z) -> (a,b,c,d ) -> z
diff --git a/src/Text/Pandoc/Readers/Odt/Generic/XMLConverter.hs b/src/Text/Pandoc/Readers/Odt/Generic/XMLConverter.hs
index 00c636a0d..0d921e23b 100644
--- a/src/Text/Pandoc/Readers/Odt/Generic/XMLConverter.hs
+++ b/src/Text/Pandoc/Readers/Odt/Generic/XMLConverter.hs
@@ -1,3 +1,4 @@
+{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE TupleSections #-}
{-# LANGUAGE GADTs #-}
{-# LANGUAGE LambdaCase #-}
@@ -60,11 +61,11 @@ import Control.Arrow
import Data.Bool ( bool )
import Data.Either ( rights )
import qualified Data.Map as M
-import qualified Data.Text as T
+import Data.Text (Text)
import Data.Default
import Data.Maybe
-import qualified Text.XML.Light as XML
+import qualified Text.Pandoc.XML.Light as XML
import Text.Pandoc.Readers.Odt.Arrows.State
import Text.Pandoc.Readers.Odt.Arrows.Utils
@@ -78,13 +79,13 @@ import Text.Pandoc.Readers.Odt.Generic.Fallible
--------------------------------------------------------------------------------
--
-type ElementName = String
-type AttributeName = String
-type AttributeValue = String
-type TextAttributeValue = T.Text
+type ElementName = Text
+type AttributeName = Text
+type AttributeValue = Text
+type TextAttributeValue = Text
--
-type NameSpacePrefix = String
+type NameSpacePrefix = Text
--
type NameSpacePrefixes nsID = M.Map nsID NameSpacePrefix
@@ -461,7 +462,7 @@ lookupDefaultingAttr :: (NameSpaceID nsID, Lookupable a, Default a)
lookupDefaultingAttr nsID attrName
= lookupAttrWithDefault nsID attrName def
--- | Return value as a (Maybe String)
+-- | Return value as a (Maybe Text)
findAttr' :: (NameSpaceID nsID)
=> nsID -> AttributeName
-> XMLConverter nsID extraState x (Maybe AttributeValue)
@@ -477,7 +478,6 @@ findAttrText' nsID attrName
= qualifyName nsID attrName
&&& getCurrentElement
>>% XML.findAttr
- >>^ fmap T.pack
-- | Return value as string or fail
findAttr :: (NameSpaceID nsID)
@@ -492,7 +492,6 @@ findAttrText :: (NameSpaceID nsID)
-> FallibleXMLConverter nsID extraState x TextAttributeValue
findAttrText nsID attrName
= findAttr' nsID attrName
- >>^ fmap T.pack
>>> maybeToChoice
-- | Return value as string or return provided default value
@@ -511,7 +510,7 @@ findAttrTextWithDefault :: (NameSpaceID nsID)
-> XMLConverter nsID extraState x TextAttributeValue
findAttrTextWithDefault nsID attrName deflt
= findAttr' nsID attrName
- >>^ maybe deflt T.pack
+ >>^ fromMaybe deflt
-- | Read and return value or fail
readAttr :: (NameSpaceID nsID, Read attrValue)
@@ -748,7 +747,7 @@ matchContent lookups fallback
-- Internals
--------------------------------------------------------------------------------
-stringToBool' :: String -> Maybe Bool
+stringToBool' :: Text -> Maybe Bool
stringToBool' val | val `elem` trueValues = Just True
| val `elem` falseValues = Just False
| otherwise = Nothing
diff --git a/src/Text/Pandoc/Readers/Odt/Namespaces.hs b/src/Text/Pandoc/Readers/Odt/Namespaces.hs
index 3a24a1162..70741c28d 100644
--- a/src/Text/Pandoc/Readers/Odt/Namespaces.hs
+++ b/src/Text/Pandoc/Readers/Odt/Namespaces.hs
@@ -1,3 +1,4 @@
+{-# LANGUAGE OverloadedStrings #-}
{- |
Module : Text.Pandoc.Reader.Odt.Namespaces
Copyright : Copyright (C) 2015 Martin Linnemann
@@ -13,10 +14,10 @@ Namespaces used in odt files.
module Text.Pandoc.Readers.Odt.Namespaces ( Namespace (..)
) where
-import Data.List (isPrefixOf)
import qualified Data.Map as M (empty, insert)
import Data.Maybe (fromMaybe, listToMaybe)
-
+import Data.Text (Text)
+import qualified Data.Text as T
import Text.Pandoc.Readers.Odt.Generic.Namespaces
@@ -30,7 +31,7 @@ instance NameSpaceID Namespace where
findID :: NameSpaceIRI -> Maybe Namespace
-findID iri = listToMaybe [nsID | (iri',nsID) <- nsIDs, iri' `isPrefixOf` iri]
+findID iri = listToMaybe [nsID | (iri',nsID) <- nsIDs, iri' `T.isPrefixOf` iri]
nsIDmap :: NameSpaceIRIs Namespace
nsIDmap = foldr (uncurry $ flip M.insert) M.empty nsIDs
@@ -54,12 +55,12 @@ data Namespace = -- Open Document core
-- Core XML (basically only for the 'id'-attribute)
| NsXML
-- Fallback
- | NsOther String
+ | NsOther Text
deriving ( Eq, Ord, Show )
-- | Not the actual iri's, but large prefixes of them - this way there are
-- less versioning problems and the like.
-nsIDs :: [(String,Namespace)]
+nsIDs :: [(Text, Namespace)]
nsIDs = [
("urn:oasis:names:tc:opendocument:xmlns:animation" , NsAnim ),
("urn:oasis:names:tc:opendocument:xmlns:chart" , NsChart ),
diff --git a/src/Text/Pandoc/Readers/Odt/StyleReader.hs b/src/Text/Pandoc/Readers/Odt/StyleReader.hs
index 46a777df1..5e10f896c 100644
--- a/src/Text/Pandoc/Readers/Odt/StyleReader.hs
+++ b/src/Text/Pandoc/Readers/Odt/StyleReader.hs
@@ -2,6 +2,7 @@
{-# LANGUAGE Arrows #-}
{-# LANGUAGE RecordWildCards #-}
{-# LANGUAGE TupleSections #-}
+{-# LANGUAGE OverloadedStrings #-}
{- |
Module : Text.Pandoc.Readers.Odt.StyleReader
Copyright : Copyright (C) 2015 Martin Linnemann
@@ -46,11 +47,13 @@ import qualified Data.Foldable as F
import Data.List (unfoldr)
import qualified Data.Map as M
import Data.Maybe
+import Data.Text (Text)
+import qualified Data.Text as T
import qualified Data.Set as S
-import qualified Text.XML.Light as XML
+import qualified Text.Pandoc.XML.Light as XML
-import Text.Pandoc.Shared (safeRead)
+import Text.Pandoc.Shared (safeRead, tshow)
import Text.Pandoc.Readers.Odt.Arrows.Utils
@@ -90,7 +93,7 @@ instance Default FontPitch where
--
-- Thus, we want
-type FontFaceName = String
+type FontFaceName = Text
type FontPitches = M.Map FontFaceName FontPitch
@@ -151,7 +154,7 @@ findPitch = ( lookupAttr NsStyle "font-pitch"
-- Definitions of main data
--------------------------------------------------------------------------------
-type StyleName = String
+type StyleName = Text
-- | There are two types of styles: named styles with a style family and an
-- optional style parent, and default styles for each style family,
@@ -355,8 +358,8 @@ getListLevelStyle level ListStyle{..} =
-- \^ simpler, but in general less efficient
data ListLevelStyle = ListLevelStyle { listLevelType :: ListLevelType
- , listItemPrefix :: Maybe String
- , listItemSuffix :: Maybe String
+ , listItemPrefix :: Maybe Text
+ , listItemSuffix :: Maybe Text
, listItemFormat :: ListItemNumberFormat
, listItemStart :: Int
}
@@ -366,9 +369,9 @@ instance Show ListLevelStyle where
show ListLevelStyle{..} = "<LLS|"
++ show listLevelType
++ "|"
- ++ maybeToString listItemPrefix
+ ++ maybeToString (T.unpack <$> listItemPrefix)
++ show listItemFormat
- ++ maybeToString listItemSuffix
+ ++ maybeToString (T.unpack <$> listItemSuffix)
++ ">"
where maybeToString = fromMaybe ""
@@ -471,7 +474,7 @@ readTextProperties =
)
where isFontEmphasised = [("normal",False),("italic",True),("oblique",True)]
isFontBold = ("normal",False):("bold",True)
- :map ((,True).show) ([100,200..900]::[Int])
+ :map ((,True) . tshow) ([100,200..900]::[Int])
readUnderlineMode :: StyleReaderSafe _x (Maybe UnderlineMode)
readUnderlineMode = readLineMode "text-underline-mode"
@@ -481,7 +484,7 @@ readStrikeThroughMode :: StyleReaderSafe _x (Maybe UnderlineMode)
readStrikeThroughMode = readLineMode "text-line-through-mode"
"text-line-through-style"
-readLineMode :: String -> String -> StyleReaderSafe _x (Maybe UnderlineMode)
+readLineMode :: Text -> Text -> StyleReaderSafe _x (Maybe UnderlineMode)
readLineMode modeAttr styleAttr = proc x -> do
isUL <- searchAttr NsStyle styleAttr False isLinePresent -< x
mode <- lookupAttr' NsStyle modeAttr -< x