aboutsummaryrefslogtreecommitdiff
path: root/src/Text/Pandoc/Readers/Docx/Parse
diff options
context:
space:
mode:
authorJohn MacFarlane <jgm@berkeley.edu>2021-02-14 22:29:21 -0800
committerJohn MacFarlane <jgm@berkeley.edu>2021-02-16 16:55:20 -0800
commit967e7f5fb990b29de48b37be1db40fb149a8cf55 (patch)
treeb9f903a5f2af14f20e769903e80659b9bffd59ff /src/Text/Pandoc/Readers/Docx/Parse
parentb5b576184c3c1668aad0c904e186136b81a0dd54 (diff)
downloadpandoc-967e7f5fb990b29de48b37be1db40fb149a8cf55.tar.gz
Rename Text.Pandoc.XMLParser -> Text.Pandoc.XML.Light...
..and add new definitions isomorphic to xml-light's, but with Text instead of String. This allows us to keep most of the code in existing readers that use xml-light, but avoid lots of unnecessary allocation. We also add versions of the functions from xml-light's Text.XML.Light.Output and Text.XML.Light.Proc that operate on our modified XML types, and functions that convert xml-light types to our types (since some of our dependencies, like texmath, use xml-light). Update golden tests for docx and pptx. OOXML test: Use `showContent` instead of `ppContent` in `displayDiff`. Docx: Do a manual traversal to unwrap sdt and smartTag. This is faster, and needed to pass the tests. Benchmarks: A = prior to 8ca191604dcd13af27c11d2da225da646ebce6fc (Feb 8) B = as of 8ca191604dcd13af27c11d2da225da646ebce6fc (Feb 8) C = this commit | Reader | A | B | C | | ------- | ----- | ------ | ----- | | docbook | 18 ms | 12 ms | 10 ms | | opml | 65 ms | 62 ms | 35 ms | | jats | 15 ms | 11 ms | 9 ms | | docx | 72 ms | 69 ms | 44 ms | | odt | 78 ms | 41 ms | 28 ms | | epub | 64 ms | 61 ms | 56 ms | | fb2 | 14 ms | 5 ms | 4 ms |
Diffstat (limited to 'src/Text/Pandoc/Readers/Docx/Parse')
-rw-r--r--src/Text/Pandoc/Readers/Docx/Parse/Styles.hs31
1 files changed, 17 insertions, 14 deletions
diff --git a/src/Text/Pandoc/Readers/Docx/Parse/Styles.hs b/src/Text/Pandoc/Readers/Docx/Parse/Styles.hs
index edade8654..0d7271d6a 100644
--- a/src/Text/Pandoc/Readers/Docx/Parse/Styles.hs
+++ b/src/Text/Pandoc/Readers/Docx/Parse/Styles.hs
@@ -48,12 +48,13 @@ import Data.Function (on)
import Data.String (IsString(..))
import qualified Data.Map as M
import qualified Data.Text as T
+import qualified Data.Text.Read
+import Data.Text (Text)
import Data.Maybe
import Data.Coerce
import Text.Pandoc.Readers.Docx.Util
import qualified Text.Pandoc.UTF8 as UTF8
-import Text.XML.Light
-import Text.Pandoc.XMLParser (parseXMLElement)
+import Text.Pandoc.XML.Light
newtype CharStyleId = CharStyleId T.Text
deriving (Show, Eq, Ord, IsString, FromStyleId)
@@ -109,7 +110,7 @@ data RunStyle = RunStyle { isBold :: Maybe Bool
, isRTL :: Maybe Bool
, isForceCTL :: Maybe Bool
, rVertAlign :: Maybe VertAlign
- , rUnderline :: Maybe String
+ , rUnderline :: Maybe Text
, rParentStyle :: Maybe CharStyle
}
deriving Show
@@ -159,7 +160,7 @@ isBasedOnStyle ns element parentStyle
, Just styleType <- findAttrByName ns "w" "type" element
, styleType == cStyleType parentStyle
, Just basedOnVal <- findChildByName ns "w" "basedOn" element >>=
- findAttrTextByName ns "w" "val"
+ findAttrByName ns "w" "val"
, Just ps <- parentStyle = basedOnVal == fromStyleId (getStyleId ps)
| isElem ns "w" "style" element
, Just styleType <- findAttrByName ns "w" "type" element
@@ -169,7 +170,7 @@ isBasedOnStyle ns element parentStyle
| otherwise = False
class HasStyleId a => ElemToStyle a where
- cStyleType :: Maybe a -> String
+ cStyleType :: Maybe a -> Text
elemToStyle :: NameSpaces -> Element -> Maybe a -> Maybe a
class FromStyleId (StyleId a) => HasStyleId a where
@@ -226,8 +227,10 @@ buildBasedOnList ns element rootStyle =
stys -> stys ++
concatMap (buildBasedOnList ns element . Just) stys
-stringToInteger :: String -> Maybe Integer
-stringToInteger s = listToMaybe $ map fst (reads s :: [(Integer, String)])
+stringToInteger :: Text -> Maybe Integer
+stringToInteger s = case Data.Text.Read.decimal s of
+ Right (x,_) -> Just x
+ Left _ -> Nothing
checkOnOff :: NameSpaces -> Element -> QName -> Maybe Bool
checkOnOff ns rPr tag
@@ -247,7 +250,7 @@ checkOnOff _ _ _ = Nothing
elemToCharStyle :: NameSpaces
-> Element -> Maybe CharStyle -> Maybe CharStyle
elemToCharStyle ns element parentStyle
- = CharStyle <$> (CharStyleId <$> findAttrTextByName ns "w" "styleId" element)
+ = CharStyle <$> (CharStyleId <$> findAttrByName ns "w" "styleId" element)
<*> getElementStyleName ns element
<*> Just (elemToRunStyle ns element parentStyle)
@@ -281,7 +284,7 @@ elemToRunStyle _ _ _ = defaultRunStyle
getHeaderLevel :: NameSpaces -> Element -> Maybe (ParaStyleName, Int)
getHeaderLevel ns element
| Just styleName <- getElementStyleName ns element
- , Just n <- stringToInteger . T.unpack =<<
+ , Just n <- stringToInteger =<<
(T.stripPrefix "heading " . T.toLower $
fromStyleName styleName)
, n > 0 = Just (styleName, fromInteger n)
@@ -289,8 +292,8 @@ getHeaderLevel _ _ = Nothing
getElementStyleName :: Coercible T.Text a => NameSpaces -> Element -> Maybe a
getElementStyleName ns el = coerce <$>
- ((findChildByName ns "w" "name" el >>= findAttrTextByName ns "w" "val")
- <|> findAttrTextByName ns "w" "styleId" el)
+ ((findChildByName ns "w" "name" el >>= findAttrByName ns "w" "val")
+ <|> findAttrByName ns "w" "styleId" el)
getNumInfo :: NameSpaces -> Element -> Maybe (T.Text, T.Text)
getNumInfo ns element = do
@@ -298,15 +301,15 @@ getNumInfo ns element = do
findChildByName ns "w" "numPr"
lvl = fromMaybe "0" (numPr >>=
findChildByName ns "w" "ilvl" >>=
- findAttrTextByName ns "w" "val")
+ findAttrByName ns "w" "val")
numId <- numPr >>=
findChildByName ns "w" "numId" >>=
- findAttrTextByName ns "w" "val"
+ findAttrByName ns "w" "val"
return (numId, lvl)
elemToParStyleData :: NameSpaces -> Element -> Maybe ParStyle -> Maybe ParStyle
elemToParStyleData ns element parentStyle
- | Just styleId <- findAttrTextByName ns "w" "styleId" element
+ | Just styleId <- findAttrByName ns "w" "styleId" element
, Just styleName <- getElementStyleName ns element
= Just $ ParStyle
{