diff options
author | John MacFarlane <jgm@berkeley.edu> | 2021-02-14 22:29:21 -0800 |
---|---|---|
committer | John MacFarlane <jgm@berkeley.edu> | 2021-02-16 16:55:20 -0800 |
commit | 967e7f5fb990b29de48b37be1db40fb149a8cf55 (patch) | |
tree | b9f903a5f2af14f20e769903e80659b9bffd59ff /src/Text/Pandoc/Readers/Docx | |
parent | b5b576184c3c1668aad0c904e186136b81a0dd54 (diff) | |
download | pandoc-967e7f5fb990b29de48b37be1db40fb149a8cf55.tar.gz |
Rename Text.Pandoc.XMLParser -> Text.Pandoc.XML.Light...
...and add new definitions isomorphic to xml-light's, but with
Text instead of String. This allows us to keep most of the code in
existing readers that use xml-light, but avoid lots of unnecessary
allocation.
We also add versions of the functions from xml-light's
Text.XML.Light.Output and Text.XML.Light.Proc that operate
on our modified XML types, and functions that convert
xml-light types to our types (since some of our dependencies,
like texmath, use xml-light).
Update golden tests for docx and pptx.
OOXML test: Use `showContent` instead of `ppContent` in `displayDiff`.
Docx: Do a manual traversal to unwrap sdt and smartTag.
This is faster, and needed to pass the tests.
Benchmarks:
A = prior to 8ca191604dcd13af27c11d2da225da646ebce6fc (Feb 8)
B = as of 8ca191604dcd13af27c11d2da225da646ebce6fc (Feb 8)
C = this commit
| Reader | A | B | C |
| ------- | ----- | ------ | ----- |
| docbook | 18 ms | 12 ms | 10 ms |
| opml | 65 ms | 62 ms | 35 ms |
| jats | 15 ms | 11 ms | 9 ms |
| docx | 72 ms | 69 ms | 44 ms |
| odt | 78 ms | 41 ms | 28 ms |
| epub | 64 ms | 61 ms | 56 ms |
| fb2 | 14 ms | 5 ms | 4 ms |
Diffstat (limited to 'src/Text/Pandoc/Readers/Docx')
-rw-r--r-- | src/Text/Pandoc/Readers/Docx/Parse.hs | 163 | ||||
-rw-r--r-- | src/Text/Pandoc/Readers/Docx/Parse/Styles.hs | 31 | ||||
-rw-r--r-- | src/Text/Pandoc/Readers/Docx/Util.hs | 27 |
3 files changed, 103 insertions, 118 deletions
diff --git a/src/Text/Pandoc/Readers/Docx/Parse.hs b/src/Text/Pandoc/Readers/Docx/Parse.hs index 056dab6c2..c76f3c171 100644 --- a/src/Text/Pandoc/Readers/Docx/Parse.hs +++ b/src/Text/Pandoc/Readers/Docx/Parse.hs @@ -63,6 +63,7 @@ import Data.Char (chr, ord, readLitChar) import Data.List import qualified Data.Map as M import qualified Data.Text as T +import Data.Text (Text) import Data.Maybe import System.FilePath import Text.Pandoc.Readers.Docx.Util @@ -72,9 +73,7 @@ import qualified Text.Pandoc.UTF8 as UTF8 import Text.TeXMath (Exp) import Text.TeXMath.Readers.OMML (readOMML) import Text.TeXMath.Unicode.Fonts (Font (..), getUnicode, textToFont) -import Text.XML.Light -import qualified Text.XML.Light.Cursor as XMLC -import Text.Pandoc.XMLParser (parseXMLElement) +import Text.Pandoc.XML.Light data ReaderEnv = ReaderEnv { envNotes :: Notes , envComments :: Comments @@ -128,37 +127,23 @@ mapD f xs = in concatMapM handler xs -unwrap :: NameSpaces -> Content -> [Content] -unwrap ns (Elem element) +unwrapElement :: NameSpaces -> Element -> [Element] +unwrapElement ns element | isElem ns "w" "sdt" element , Just sdtContent <- findChildByName ns "w" "sdtContent" element - = concatMap (unwrap ns . Elem) (elChildren sdtContent) + = concatMap (unwrapElement ns) (elChildren sdtContent) | isElem ns "w" "smartTag" element - = concatMap (unwrap ns . 
Elem) (elChildren element) -unwrap _ content = [content] + = concatMap (unwrapElement ns) (elChildren element) + | otherwise + = [element{ elContent = concatMap (unwrapContent ns) (elContent element) }] -unwrapChild :: NameSpaces -> Content -> Content -unwrapChild ns (Elem element) = - Elem $ element { elContent = concatMap (unwrap ns) (elContent element) } -unwrapChild _ content = content +unwrapContent :: NameSpaces -> Content -> [Content] +unwrapContent ns (Elem element) = map Elem $ unwrapElement ns element +unwrapContent _ content = [content] -walkDocument' :: NameSpaces -> XMLC.Cursor -> XMLC.Cursor -walkDocument' ns cur = - let modifiedCur = XMLC.modifyContent (unwrapChild ns) cur - in - case XMLC.nextDF modifiedCur of - Just cur' -> walkDocument' ns cur' - Nothing -> XMLC.root modifiedCur - -walkDocument :: NameSpaces -> Element -> Maybe Element +walkDocument :: NameSpaces -> Element -> Element walkDocument ns element = - let cur = XMLC.fromContent (Elem element) - cur' = walkDocument' ns cur - in - case XMLC.toTree cur' of - Elem element' -> Just element' - _ -> Nothing - + element{ elContent = concatMap (unwrapContent ns) (elContent element) } newtype Docx = Docx Document deriving Show @@ -361,9 +346,9 @@ getDocumentXmlPath zf = do fp <- findAttr (QName "Target" Nothing Nothing) rel -- sometimes there will be a leading slash, which windows seems to -- have trouble with. 
- return $ case fp of + return $ case T.unpack fp of '/' : fp' -> fp' - _ -> fp + fp' -> fp' archiveToDocument :: Archive -> D Document archiveToDocument zf = do @@ -372,7 +357,7 @@ archiveToDocument zf = do docElem <- maybeToD $ parseXMLFromEntry entry let namespaces = elemToNameSpaces docElem bodyElem <- maybeToD $ findChildByName namespaces "w" "body" docElem - let bodyElem' = fromMaybe bodyElem (walkDocument namespaces bodyElem) + let bodyElem' = walkDocument namespaces bodyElem body <- elemToBody namespaces bodyElem' return $ Document namespaces body @@ -414,8 +399,8 @@ archiveToNotes zf = fn_namespaces = maybe [] elemToNameSpaces fnElem en_namespaces = maybe [] elemToNameSpaces enElem ns = unionBy (\x y -> fst x == fst y) fn_namespaces en_namespaces - fn = fnElem >>= walkDocument ns >>= elemToNotes ns "footnote" - en = enElem >>= walkDocument ns >>= elemToNotes ns "endnote" + fn = fnElem >>= elemToNotes ns "footnote" . walkDocument ns + en = enElem >>= elemToNotes ns "endnote" . walkDocument ns in Notes ns fn en @@ -424,7 +409,8 @@ archiveToComments zf = let cmtsElem = findEntryByPath "word/comments.xml" zf >>= parseXMLFromEntry cmts_namespaces = maybe [] elemToNameSpaces cmtsElem - cmts = elemToComments cmts_namespaces <$> (cmtsElem >>= walkDocument cmts_namespaces) + cmts = elemToComments cmts_namespaces . 
walkDocument cmts_namespaces <$> + cmtsElem in case cmts of Just c -> Comments cmts_namespaces c @@ -443,8 +429,8 @@ filePathToRelType path docXmlPath = relElemToRelationship :: DocumentLocation -> Element -> Maybe Relationship relElemToRelationship relType element | qName (elName element) == "Relationship" = do - relId <- findAttrText (QName "Id" Nothing Nothing) element - target <- findAttrText (QName "Target" Nothing Nothing) element + relId <- findAttr (QName "Id" Nothing Nothing) element + target <- findAttr (QName "Target" Nothing Nothing) element return $ Relationship relType relId target relElemToRelationship _ _ = Nothing @@ -485,10 +471,10 @@ lookupLevel numId ilvl (Numbering _ numbs absNumbs) = do loElemToLevelOverride :: NameSpaces -> Element -> Maybe LevelOverride loElemToLevelOverride ns element | isElem ns "w" "lvlOverride" element = do - ilvl <- findAttrTextByName ns "w" "ilvl" element + ilvl <- findAttrByName ns "w" "ilvl" element let startOverride = findChildByName ns "w" "startOverride" element >>= findAttrByName ns "w" "val" - >>= (\s -> listToMaybe (map fst (reads s :: [(Integer, String)]))) + >>= stringToInteger lvl = findChildByName ns "w" "lvl" element >>= levelElemToLevel ns return $ LevelOverride ilvl startOverride lvl @@ -497,9 +483,9 @@ loElemToLevelOverride _ _ = Nothing numElemToNum :: NameSpaces -> Element -> Maybe Numb numElemToNum ns element | isElem ns "w" "num" element = do - numId <- findAttrTextByName ns "w" "numId" element + numId <- findAttrByName ns "w" "numId" element absNumId <- findChildByName ns "w" "abstractNumId" element - >>= findAttrTextByName ns "w" "val" + >>= findAttrByName ns "w" "val" let lvlOverrides = mapMaybe (loElemToLevelOverride ns) (findChildrenByName ns "w" "lvlOverride" element) @@ -509,7 +495,7 @@ numElemToNum _ _ = Nothing absNumElemToAbsNum :: NameSpaces -> Element -> Maybe AbstractNumb absNumElemToAbsNum ns element | isElem ns "w" "abstractNum" element = do - absNumId <- findAttrTextByName ns "w" 
"abstractNumId" element + absNumId <- findAttrByName ns "w" "abstractNumId" element let levelElems = findChildrenByName ns "w" "lvl" element levels = mapMaybe (levelElemToLevel ns) levelElems return $ AbstractNumb absNumId levels @@ -518,14 +504,14 @@ absNumElemToAbsNum _ _ = Nothing levelElemToLevel :: NameSpaces -> Element -> Maybe Level levelElemToLevel ns element | isElem ns "w" "lvl" element = do - ilvl <- findAttrTextByName ns "w" "ilvl" element + ilvl <- findAttrByName ns "w" "ilvl" element fmt <- findChildByName ns "w" "numFmt" element - >>= findAttrTextByName ns "w" "val" + >>= findAttrByName ns "w" "val" txt <- findChildByName ns "w" "lvlText" element - >>= findAttrTextByName ns "w" "val" + >>= findAttrByName ns "w" "val" let start = findChildByName ns "w" "start" element >>= findAttrByName ns "w" "val" - >>= (\s -> listToMaybe (map fst (reads s :: [(Integer, String)]))) + >>= stringToInteger return (Level ilvl fmt txt start) levelElemToLevel _ _ = Nothing @@ -546,11 +532,11 @@ archiveToNumbering :: Archive -> Numbering archiveToNumbering archive = fromMaybe (Numbering [] [] []) (archiveToNumbering' archive) -elemToNotes :: NameSpaces -> String -> Element -> Maybe (M.Map T.Text Element) +elemToNotes :: NameSpaces -> Text -> Element -> Maybe (M.Map T.Text Element) elemToNotes ns notetype element | isElem ns "w" (notetype <> "s") element = let pairs = mapMaybe - (\e -> findAttrTextByName ns "w" "id" e >>= + (\e -> findAttrByName ns "w" "id" e >>= (\a -> Just (a, e))) (findChildrenByName ns "w" notetype element) in @@ -562,7 +548,7 @@ elemToComments :: NameSpaces -> Element -> M.Map T.Text Element elemToComments ns element | isElem ns "w" "comments" element = let pairs = mapMaybe - (\e -> findAttrTextByName ns "w" "id" e >>= + (\e -> findAttrByName ns "w" "id" e >>= (\a -> Just (a, e))) (findChildrenByName ns "w" "comment" element) in @@ -622,12 +608,12 @@ elemToParIndentation ns element | isElem ns "w" "ind" element = stringToInteger , hangingParIndent = 
findAttrByName ns "w" "hanging" element >>= - stringToInteger} + stringToInteger } elemToParIndentation _ _ = Nothing -testBitMask :: String -> Int -> Bool +testBitMask :: Text -> Int -> Bool testBitMask bitMaskS n = - case (reads ("0x" ++ bitMaskS) :: [(Int, String)]) of + case (reads ("0x" ++ T.unpack bitMaskS) :: [(Int, String)]) of [] -> False ((n', _) : _) -> (n' .|. n) /= 0 @@ -642,7 +628,7 @@ elemToBodyPart ns element | isElem ns "w" "p" element , (c:_) <- findChildrenByName ns "m" "oMathPara" element = do - expsLst <- eitherToD $ readOMML $ T.pack $ showElement c + expsLst <- eitherToD $ readOMML $ showElement c return $ OMathPara expsLst elemToBodyPart ns element | isElem ns "w" "p" element @@ -666,7 +652,7 @@ elemToBodyPart ns element | isElem ns "w" "tbl" element = do let caption' = findChildByName ns "w" "tblPr" element >>= findChildByName ns "w" "tblCaption" - >>= findAttrTextByName ns "w" "val" + >>= findAttrByName ns "w" "val" caption = fromMaybe "" caption' grid' = case findChildByName ns "w" "tblGrid" element of Just g -> elemToTblGrid ns g @@ -705,8 +691,8 @@ getTitleAndAlt :: NameSpaces -> Element -> (T.Text, T.Text) getTitleAndAlt ns element = let mbDocPr = findChildByName ns "wp" "inline" element >>= findChildByName ns "wp" "docPr" - title = fromMaybe "" (mbDocPr >>= findAttrTextByName ns "" "title") - alt = fromMaybe "" (mbDocPr >>= findAttrTextByName ns "" "descr") + title = fromMaybe "" (mbDocPr >>= findAttrByName ns "" "title") + alt = fromMaybe "" (mbDocPr >>= findAttrByName ns "" "descr") in (title, alt) elemToParPart :: NameSpaces -> Element -> D ParPart @@ -718,7 +704,7 @@ elemToParPart ns element = let (title, alt) = getTitleAndAlt ns drawingElem a_ns = "http://schemas.openxmlformats.org/drawingml/2006/main" drawing = findElement (QName "blip" (Just a_ns) (Just "a")) picElem - >>= findAttrTextByName ns "r" "embed" + >>= findAttrByName ns "r" "embed" in case drawing of Just s -> expandDrawingId s >>= (\(fp, bs) -> return $ Drawing fp 
title alt bs $ elemToExtent drawingElem) @@ -728,7 +714,7 @@ elemToParPart ns element | isElem ns "w" "r" element , Just _ <- findChildByName ns "w" "pict" element = let drawing = findElement (elemName ns "v" "imagedata") element - >>= findAttrTextByName ns "r" "id" + >>= findAttrByName ns "r" "id" in case drawing of -- Todo: check out title and attr for deprecated format. @@ -797,7 +783,7 @@ elemToParPart ns element fldCharState <- gets stateFldCharState case fldCharState of FldCharOpen -> do - info <- eitherToD $ parseFieldInfo $ T.pack $ strContent instrText + info <- eitherToD $ parseFieldInfo $ strContent instrText modify $ \st -> st{stateFldCharState = FldCharFieldInfo info} return NullParPart _ -> return NullParPart @@ -818,48 +804,48 @@ elemToParPart ns element return $ ChangedRuns change runs elemToParPart ns element | isElem ns "w" "bookmarkStart" element - , Just bmId <- findAttrTextByName ns "w" "id" element - , Just bmName <- findAttrTextByName ns "w" "name" element = + , Just bmId <- findAttrByName ns "w" "id" element + , Just bmName <- findAttrByName ns "w" "name" element = return $ BookMark bmId bmName elemToParPart ns element | isElem ns "w" "hyperlink" element - , Just relId <- findAttrTextByName ns "r" "id" element = do + , Just relId <- findAttrByName ns "r" "id" element = do location <- asks envLocation runs <- mapD (elemToRun ns) (elChildren element) rels <- asks envRelationships case lookupRelationship location relId rels of Just target -> - case findAttrTextByName ns "w" "anchor" element of + case findAttrByName ns "w" "anchor" element of Just anchor -> return $ ExternalHyperLink (target <> "#" <> anchor) runs Nothing -> return $ ExternalHyperLink target runs Nothing -> return $ ExternalHyperLink "" runs elemToParPart ns element | isElem ns "w" "hyperlink" element - , Just anchor <- findAttrTextByName ns "w" "anchor" element = do + , Just anchor <- findAttrByName ns "w" "anchor" element = do runs <- mapD (elemToRun ns) (elChildren element) 
return $ InternalHyperLink anchor runs elemToParPart ns element | isElem ns "w" "commentRangeStart" element - , Just cmtId <- findAttrTextByName ns "w" "id" element = do + , Just cmtId <- findAttrByName ns "w" "id" element = do (Comments _ commentMap) <- asks envComments case M.lookup cmtId commentMap of Just cmtElem -> elemToCommentStart ns cmtElem Nothing -> throwError WrongElem elemToParPart ns element | isElem ns "w" "commentRangeEnd" element - , Just cmtId <- findAttrTextByName ns "w" "id" element = + , Just cmtId <- findAttrByName ns "w" "id" element = return $ CommentEnd cmtId elemToParPart ns element | isElem ns "m" "oMath" element = - fmap PlainOMath (eitherToD $ readOMML $ T.pack $ showElement element) + fmap PlainOMath (eitherToD $ readOMML $ showElement element) elemToParPart _ _ = throwError WrongElem elemToCommentStart :: NameSpaces -> Element -> D ParPart elemToCommentStart ns element | isElem ns "w" "comment" element - , Just cmtId <- findAttrTextByName ns "w" "id" element - , Just cmtAuthor <- findAttrTextByName ns "w" "author" element - , cmtDate <- findAttrTextByName ns "w" "date" element = do + , Just cmtId <- findAttrByName ns "w" "id" element + , Just cmtAuthor <- findAttrByName ns "w" "author" element + , cmtDate <- findAttrByName ns "w" "date" element = do bps <- mapD (elemToBodyPart ns) (elChildren element) return $ CommentStart cmtId cmtAuthor cmtDate bps elemToCommentStart _ _ = throwError WrongElem @@ -878,7 +864,7 @@ elemToExtent drawingElem = where wp_ns = "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing" getDim at = findElement (QName "extent" (Just wp_ns) (Just "wp")) drawingElem - >>= findAttr (QName at Nothing Nothing) >>= safeRead . 
T.pack + >>= findAttr (QName at Nothing Nothing) >>= safeRead childElemToRun :: NameSpaces -> Element -> D Run @@ -889,7 +875,7 @@ childElemToRun ns element = let (title, alt) = getTitleAndAlt ns element a_ns = "http://schemas.openxmlformats.org/drawingml/2006/main" drawing = findElement (QName "blip" (Just a_ns) (Just "a")) picElem - >>= findAttrText (QName "embed" (lookup "r" ns) (Just "r")) + >>= findAttr (QName "embed" (lookup "r" ns) (Just "r")) in case drawing of Just s -> expandDrawingId s >>= @@ -902,7 +888,7 @@ childElemToRun ns element = return InlineChart childElemToRun ns element | isElem ns "w" "footnoteReference" element - , Just fnId <- findAttrTextByName ns "w" "id" element = do + , Just fnId <- findAttrByName ns "w" "id" element = do notes <- asks envNotes case lookupFootnote fnId notes of Just e -> do bps <- local (\r -> r {envLocation=InFootnote}) $ mapD (elemToBodyPart ns) (elChildren e) @@ -910,7 +896,7 @@ childElemToRun ns element Nothing -> return $ Footnote [] childElemToRun ns element | isElem ns "w" "endnoteReference" element - , Just enId <- findAttrTextByName ns "w" "id" element = do + , Just enId <- findAttrByName ns "w" "id" element = do notes <- asks envNotes case lookupEndnote enId notes of Just e -> do bps <- local (\r -> r {envLocation=InEndnote}) $ mapD (elemToBodyPart ns) (elChildren e) @@ -963,15 +949,15 @@ getParStyleField _ _ = Nothing getTrackedChange :: NameSpaces -> Element -> Maybe TrackedChange getTrackedChange ns element | isElem ns "w" "ins" element || isElem ns "w" "moveTo" element - , Just cId <- findAttrTextByName ns "w" "id" element - , Just cAuthor <- findAttrTextByName ns "w" "author" element - , mcDate <- findAttrTextByName ns "w" "date" element = + , Just cId <- findAttrByName ns "w" "id" element + , Just cAuthor <- findAttrByName ns "w" "author" element + , mcDate <- findAttrByName ns "w" "date" element = Just $ TrackedChange Insertion (ChangeInfo cId cAuthor mcDate) getTrackedChange ns element | isElem ns "w" 
"del" element || isElem ns "w" "moveFrom" element - , Just cId <- findAttrTextByName ns "w" "id" element - , Just cAuthor <- findAttrTextByName ns "w" "author" element - , mcDate <- findAttrTextByName ns "w" "date" element = + , Just cId <- findAttrByName ns "w" "id" element + , Just cAuthor <- findAttrByName ns "w" "author" element + , mcDate <- findAttrByName ns "w" "date" element = Just $ TrackedChange Deletion (ChangeInfo cId cAuthor mcDate) getTrackedChange _ _ = Nothing @@ -980,7 +966,7 @@ elemToParagraphStyle ns element sty | Just pPr <- findChildByName ns "w" "pPr" element = let style = mapMaybe - (fmap ParaStyleId . findAttrTextByName ns "w" "val") + (fmap ParaStyleId . findAttrByName ns "w" "val") (findChildrenByName ns "w" "pStyle" pPr) in ParagraphStyle {pStyle = mapMaybe (`M.lookup` sty) style @@ -1012,7 +998,7 @@ elemToRunStyleD ns element charStyles <- asks envCharStyles let parentSty = findChildByName ns "w" "rStyle" rPr >>= - findAttrTextByName ns "w" "val" >>= + findAttrByName ns "w" "val" >>= flip M.lookup charStyles . CharStyleId return $ elemToRunStyle ns element parentSty elemToRunStyleD _ _ = return defaultRunStyle @@ -1022,7 +1008,7 @@ elemToRunElem ns element | isElem ns "w" "t" element || isElem ns "w" "delText" element || isElem ns "m" "t" element = do - let str = T.pack $ strContent element + let str = strContent element font <- asks envFont case font of Nothing -> return $ TextRun str @@ -1044,14 +1030,14 @@ getSymChar :: NameSpaces -> Element -> RunElem getSymChar ns element | Just s <- lowerFromPrivate <$> getCodepoint , Just font <- getFont = - case readLitChar ("\\x" ++ s) of + case readLitChar ("\\x" ++ T.unpack s) of [(char, _)] -> TextRun . maybe "" T.singleton $ getUnicode font char _ -> TextRun "" where getCodepoint = findAttrByName ns "w" "char" element - getFont = textToFont . 
T.pack =<< findAttrByName ns "w" "font" element - lowerFromPrivate ('F':xs) = '0':xs - lowerFromPrivate xs = xs + getFont = textToFont =<< findAttrByName ns "w" "font" element + lowerFromPrivate t | "F" `T.isPrefixOf` t = "0" <> T.drop 1 t + | otherwise = t getSymChar _ _ = TextRun "" elemToRunElems :: NameSpaces -> Element -> D [RunElem] @@ -1061,8 +1047,9 @@ elemToRunElems ns element let qualName = elemName ns "w" let font = do fontElem <- findElement (qualName "rFonts") element - textToFont . T.pack =<< - foldr ((<|>) . (flip findAttr fontElem . qualName)) Nothing ["ascii", "hAnsi"] + textToFont =<< + foldr ((<|>) . (flip findAttr fontElem . qualName)) + Nothing ["ascii", "hAnsi"] local (setFont font) (mapD (elemToRunElem ns) (elChildren element)) elemToRunElems _ _ = throwError WrongElem diff --git a/src/Text/Pandoc/Readers/Docx/Parse/Styles.hs b/src/Text/Pandoc/Readers/Docx/Parse/Styles.hs index edade8654..0d7271d6a 100644 --- a/src/Text/Pandoc/Readers/Docx/Parse/Styles.hs +++ b/src/Text/Pandoc/Readers/Docx/Parse/Styles.hs @@ -48,12 +48,13 @@ import Data.Function (on) import Data.String (IsString(..)) import qualified Data.Map as M import qualified Data.Text as T +import qualified Data.Text.Read +import Data.Text (Text) import Data.Maybe import Data.Coerce import Text.Pandoc.Readers.Docx.Util import qualified Text.Pandoc.UTF8 as UTF8 -import Text.XML.Light -import Text.Pandoc.XMLParser (parseXMLElement) +import Text.Pandoc.XML.Light newtype CharStyleId = CharStyleId T.Text deriving (Show, Eq, Ord, IsString, FromStyleId) @@ -109,7 +110,7 @@ data RunStyle = RunStyle { isBold :: Maybe Bool , isRTL :: Maybe Bool , isForceCTL :: Maybe Bool , rVertAlign :: Maybe VertAlign - , rUnderline :: Maybe String + , rUnderline :: Maybe Text , rParentStyle :: Maybe CharStyle } deriving Show @@ -159,7 +160,7 @@ isBasedOnStyle ns element parentStyle , Just styleType <- findAttrByName ns "w" "type" element , styleType == cStyleType parentStyle , Just basedOnVal <- findChildByName 
ns "w" "basedOn" element >>= - findAttrTextByName ns "w" "val" + findAttrByName ns "w" "val" , Just ps <- parentStyle = basedOnVal == fromStyleId (getStyleId ps) | isElem ns "w" "style" element , Just styleType <- findAttrByName ns "w" "type" element @@ -169,7 +170,7 @@ isBasedOnStyle ns element parentStyle | otherwise = False class HasStyleId a => ElemToStyle a where - cStyleType :: Maybe a -> String + cStyleType :: Maybe a -> Text elemToStyle :: NameSpaces -> Element -> Maybe a -> Maybe a class FromStyleId (StyleId a) => HasStyleId a where @@ -226,8 +227,10 @@ buildBasedOnList ns element rootStyle = stys -> stys ++ concatMap (buildBasedOnList ns element . Just) stys -stringToInteger :: String -> Maybe Integer -stringToInteger s = listToMaybe $ map fst (reads s :: [(Integer, String)]) +stringToInteger :: Text -> Maybe Integer +stringToInteger s = case Data.Text.Read.decimal s of + Right (x,_) -> Just x + Left _ -> Nothing checkOnOff :: NameSpaces -> Element -> QName -> Maybe Bool checkOnOff ns rPr tag @@ -247,7 +250,7 @@ checkOnOff _ _ _ = Nothing elemToCharStyle :: NameSpaces -> Element -> Maybe CharStyle -> Maybe CharStyle elemToCharStyle ns element parentStyle - = CharStyle <$> (CharStyleId <$> findAttrTextByName ns "w" "styleId" element) + = CharStyle <$> (CharStyleId <$> findAttrByName ns "w" "styleId" element) <*> getElementStyleName ns element <*> Just (elemToRunStyle ns element parentStyle) @@ -281,7 +284,7 @@ elemToRunStyle _ _ _ = defaultRunStyle getHeaderLevel :: NameSpaces -> Element -> Maybe (ParaStyleName, Int) getHeaderLevel ns element | Just styleName <- getElementStyleName ns element - , Just n <- stringToInteger . T.unpack =<< + , Just n <- stringToInteger =<< (T.stripPrefix "heading " . 
T.toLower $ fromStyleName styleName) , n > 0 = Just (styleName, fromInteger n) @@ -289,8 +292,8 @@ getHeaderLevel _ _ = Nothing getElementStyleName :: Coercible T.Text a => NameSpaces -> Element -> Maybe a getElementStyleName ns el = coerce <$> - ((findChildByName ns "w" "name" el >>= findAttrTextByName ns "w" "val") - <|> findAttrTextByName ns "w" "styleId" el) + ((findChildByName ns "w" "name" el >>= findAttrByName ns "w" "val") + <|> findAttrByName ns "w" "styleId" el) getNumInfo :: NameSpaces -> Element -> Maybe (T.Text, T.Text) getNumInfo ns element = do @@ -298,15 +301,15 @@ getNumInfo ns element = do findChildByName ns "w" "numPr" lvl = fromMaybe "0" (numPr >>= findChildByName ns "w" "ilvl" >>= - findAttrTextByName ns "w" "val") + findAttrByName ns "w" "val") numId <- numPr >>= findChildByName ns "w" "numId" >>= - findAttrTextByName ns "w" "val" + findAttrByName ns "w" "val" return (numId, lvl) elemToParStyleData :: NameSpaces -> Element -> Maybe ParStyle -> Maybe ParStyle elemToParStyleData ns element parentStyle - | Just styleId <- findAttrTextByName ns "w" "styleId" element + | Just styleId <- findAttrByName ns "w" "styleId" element , Just styleName <- getElementStyleName ns element = Just $ ParStyle { diff --git a/src/Text/Pandoc/Readers/Docx/Util.hs b/src/Text/Pandoc/Readers/Docx/Util.hs index f9c9a8e26..21df03d9e 100644 --- a/src/Text/Pandoc/Readers/Docx/Util.hs +++ b/src/Text/Pandoc/Readers/Docx/Util.hs @@ -1,3 +1,4 @@ +{-# LANGUAGE OverloadedStrings #-} {- | Module : Text.Pandoc.Readers.Docx.StyleMaps Copyright : © 2014-2020 Jesse Rosenthal <jrosenthal@jhu.edu>, @@ -18,51 +19,45 @@ module Text.Pandoc.Readers.Docx.Util ( , elemToNameSpaces , findChildByName , findChildrenByName - , findAttrText , findAttrByName - , findAttrTextByName ) where import Data.Maybe (mapMaybe) import qualified Data.Text as T -import Text.XML.Light +import Data.Text (Text) +import Text.Pandoc.XML.Light -type NameSpaces = [(String, String)] +type NameSpaces = [(Text, Text)] 
elemToNameSpaces :: Element -> NameSpaces elemToNameSpaces = mapMaybe attrToNSPair . elAttribs -attrToNSPair :: Attr -> Maybe (String, String) +attrToNSPair :: Attr -> Maybe (Text, Text) attrToNSPair (Attr (QName s _ (Just "xmlns")) val) = Just (s, val) attrToNSPair _ = Nothing -elemName :: NameSpaces -> String -> String -> QName +elemName :: NameSpaces -> Text -> Text -> QName elemName ns prefix name = - QName name (lookup prefix ns) (if null prefix then Nothing else Just prefix) + QName name (lookup prefix ns) (if T.null prefix then Nothing else Just prefix) -isElem :: NameSpaces -> String -> String -> Element -> Bool +isElem :: NameSpaces -> Text -> Text -> Element -> Bool isElem ns prefix name element = let ns' = ns ++ elemToNameSpaces element in qName (elName element) == name && qURI (elName element) == lookup prefix ns' -findChildByName :: NameSpaces -> String -> String -> Element -> Maybe Element +findChildByName :: NameSpaces -> Text -> Text -> Element -> Maybe Element findChildByName ns pref name el = let ns' = ns ++ elemToNameSpaces el in findChild (elemName ns' pref name) el -findChildrenByName :: NameSpaces -> String -> String -> Element -> [Element] +findChildrenByName :: NameSpaces -> Text -> Text -> Element -> [Element] findChildrenByName ns pref name el = let ns' = ns ++ elemToNameSpaces el in findChildren (elemName ns' pref name) el -findAttrText :: QName -> Element -> Maybe T.Text -findAttrText x = fmap T.pack . findAttr x - -findAttrByName :: NameSpaces -> String -> String -> Element -> Maybe String +findAttrByName :: NameSpaces -> Text -> Text -> Element -> Maybe Text findAttrByName ns pref name el = let ns' = ns ++ elemToNameSpaces el in findAttr (elemName ns' pref name) el -findAttrTextByName :: NameSpaces -> String -> String -> Element -> Maybe T.Text -findAttrTextByName a b c = fmap T.pack . findAttrByName a b c |