diff options
author | John MacFarlane <jgm@berkeley.edu> | 2021-02-14 22:29:21 -0800 |
---|---|---|
committer | John MacFarlane <jgm@berkeley.edu> | 2021-02-16 16:55:20 -0800 |
commit | 967e7f5fb990b29de48b37be1db40fb149a8cf55 (patch) | |
tree | b9f903a5f2af14f20e769903e80659b9bffd59ff /src/Text/Pandoc/Readers/DocBook.hs | |
parent | b5b576184c3c1668aad0c904e186136b81a0dd54 (diff) | |
download | pandoc-967e7f5fb990b29de48b37be1db40fb149a8cf55.tar.gz |
Rename Text.Pandoc.XMLParser -> Text.Pandoc.XML.Light...
..and add new definitions isomorphic to xml-light's, but with
Text instead of String. This allows us to keep most of the code in
existing readers that use xml-light, but avoid lots of unnecessary
allocation.
We also add versions of the functions from xml-light's
Text.XML.Light.Output and Text.XML.Light.Proc that operate
on our modified XML types, and functions that convert
xml-light types to our types (since some of our dependencies,
like texmath, use xml-light).
Update golden tests for docx and pptx.
OOXML test: Use `showContent` instead of `ppContent` in `displayDiff`.
Docx: Do a manual traversal to unwrap sdt and smartTag.
This is faster, and needed to pass the tests.
Benchmarks:
A = prior to 8ca191604dcd13af27c11d2da225da646ebce6fc (Feb 8)
B = as of 8ca191604dcd13af27c11d2da225da646ebce6fc (Feb 8)
C = this commit
| Reader | A | B | C |
| ------- | ----- | ------ | ----- |
| docbook | 18 ms | 12 ms | 10 ms |
| opml | 65 ms | 62 ms | 35 ms |
| jats | 15 ms | 11 ms | 9 ms |
| docx | 72 ms | 69 ms | 44 ms |
| odt | 78 ms | 41 ms | 28 ms |
| epub | 64 ms | 61 ms | 56 ms |
| fb2 | 14 ms | 5 ms | 4 ms |
Diffstat (limited to 'src/Text/Pandoc/Readers/DocBook.hs')
-rw-r--r-- | src/Text/Pandoc/Readers/DocBook.hs | 76 |
1 files changed, 38 insertions, 38 deletions
diff --git a/src/Text/Pandoc/Readers/DocBook.hs b/src/Text/Pandoc/Readers/DocBook.hs index ad0108843..e201b54fe 100644 --- a/src/Text/Pandoc/Readers/DocBook.hs +++ b/src/Text/Pandoc/Readers/DocBook.hs @@ -31,8 +31,7 @@ import Text.Pandoc.Options import Text.Pandoc.Logging (LogMessage(..)) import Text.Pandoc.Shared (crFilter, safeRead, extractSpaces) import Text.TeXMath (readMathML, writeTeX) -import Text.XML.Light -import Text.Pandoc.XMLParser (parseXMLContents) +import Text.Pandoc.XML.Light {- @@ -578,26 +577,27 @@ normalizeTree = everywhere (mkT go) where go :: [Content] -> [Content] go (Text (CData CDataRaw _ _):xs) = xs go (Text (CData CDataText s1 z):Text (CData CDataText s2 _):xs) = - Text (CData CDataText (s1 ++ s2) z):xs + Text (CData CDataText (s1 <> s2) z):xs go (Text (CData CDataText s1 z):CRef r:xs) = - Text (CData CDataText (s1 ++ convertEntity r) z):xs + Text (CData CDataText (s1 <> convertEntity r) z):xs go (CRef r:Text (CData CDataText s1 z):xs) = - Text (CData CDataText (convertEntity r ++ s1) z):xs + Text (CData CDataText (convertEntity r <> s1) z):xs go (CRef r1:CRef r2:xs) = - Text (CData CDataText (convertEntity r1 ++ convertEntity r2) Nothing):xs + Text (CData CDataText (convertEntity r1 <> + convertEntity r2) Nothing):xs go xs = xs -convertEntity :: String -> String -convertEntity e = Data.Maybe.fromMaybe (map toUpper e) (lookupEntity e) +convertEntity :: Text -> Text +convertEntity e = maybe (T.map toUpper e) T.pack (lookupEntity $ T.unpack e) -- convenience function to get an attribute value, defaulting to "" -attrValue :: String -> Element -> Text +attrValue :: Text -> Element -> Text attrValue attr elt = - maybe "" T.pack (lookupAttrBy (\x -> qName x == attr) (elAttribs elt)) + fromMaybe "" (lookupAttrBy (\x -> qName x == attr) (elAttribs elt)) -- convenience function named :: Text -> Element -> Bool -named s e = qName (elName e) == T.unpack s +named s e = qName (elName e) == s -- @@ -634,7 +634,7 @@ isBlockElement :: Content -> Bool isBlockElement (Elem e) = qName (elName e) `elem` blockTags isBlockElement _ = False -blockTags :: [String] +blockTags :: [Text] blockTags = [ "abstract" , "ackno" @@ -721,7 +721,7 @@ blockTags = , "variablelist" ] ++ admonitionTags -admonitionTags :: [String] +admonitionTags :: [Text] admonitionTags = ["important","caution","note","tip","warning"] -- Trim leading and trailing newline characters @@ -779,10 +779,10 @@ getBlocks e = mconcat <$> parseBlock :: PandocMonad m => Content -> DB m Blocks parseBlock (Text (CData CDataRaw _ _)) = return mempty -- DOCTYPE -parseBlock (Text (CData _ s _)) = if all isSpace s +parseBlock (Text (CData _ s _)) = if T.all isSpace s then return mempty - else return $ plain $ trimInlines $ text $ T.pack s -parseBlock (CRef x) = return $ plain $ str $ T.toUpper $ T.pack x + else return $ plain $ trimInlines $ text s +parseBlock (CRef x) = return $ plain $ str $ T.toUpper x parseBlock (Elem e) = case qName (elName e) of "toc" -> skip -- skip TOC, since in pandoc it's autogenerated @@ -837,7 +837,7 @@ parseBlock (Elem e) = "refsect2" -> sect 2 "refsect3" -> sect 3 "refsection" -> gets dbSectionLevel >>= sect . (+1) - l | l `elem` admonitionTags -> parseAdmonition $ T.pack l + l | l `elem` admonitionTags -> parseAdmonition l "area" -> skip "areaset" -> skip "areaspec" -> skip @@ -899,7 +899,7 @@ parseBlock (Elem e) = "subtitle" -> return mempty -- handled in parent element _ -> skip >> getBlocks e where skip = do - let qn = T.pack $ qName $ elName e + let qn = qName $ elName e let name = if "pi-" `T.isPrefixOf` qn then "<?" <> qn <> "?>" else qn @@ -911,7 +911,7 @@ parseBlock (Elem e) = "" -> [] x -> [x] return $ codeBlockWith (attrValue "id" e, classes', []) - $ trimNl $ T.pack $ strContentRecursive e + $ trimNl $ strContentRecursive e parseBlockquote = do attrib <- case filterChild (named "attribution") e of Nothing -> return mempty @@ -965,7 +965,7 @@ parseBlock (Elem e) = w <- findAttr (unqual "colwidth") c n <- safeRead $ "0" <> T.filter (\x -> (x >= '0' && x <= '9') - || x == '.') (T.pack w) + || x == '.') w if n > 0 then Just n else Nothing let numrows = case bodyrows of [] -> 0 @@ -1048,12 +1048,12 @@ parseMixed container conts = do x <- parseMixed container rs return $ p <> b <> x -parseRow :: PandocMonad m => [String] -> Element -> DB m [Cell] +parseRow :: PandocMonad m => [Text] -> Element -> DB m [Cell] parseRow cn = do let isEntry x = named "entry" x || named "td" x || named "th" x mapM (parseEntry cn) . filterChildren isEntry -parseEntry :: PandocMonad m => [String] -> Element -> DB m Cell +parseEntry :: PandocMonad m => [Text] -> Element -> DB m Cell parseEntry cn el = do let colDistance sa ea = do let iStrt = elemIndex sa cn @@ -1075,7 +1075,7 @@ getInlines :: PandocMonad m => Element -> DB m Inlines getInlines e' = trimInlines . mconcat <$> mapM parseInline (elContent e') -strContentRecursive :: Element -> String +strContentRecursive :: Element -> Text strContentRecursive = strContent . (\e' -> e'{ elContent = map elementToStr $ elContent e' }) @@ -1084,9 +1084,9 @@ elementToStr (Elem e') = Text $ CData CDataText (strContentRecursive e') Nothing elementToStr x = x parseInline :: PandocMonad m => Content -> DB m Inlines -parseInline (Text (CData _ s _)) = return $ text $ T.pack s +parseInline (Text (CData _ s _)) = return $ text s parseInline (CRef ref) = - return $ text $ maybe (T.toUpper $ T.pack ref) T.pack $ lookupEntity ref + return $ text $ maybe (T.toUpper ref) T.pack $ lookupEntity (T.unpack ref) parseInline (Elem e) = case qName (elName e) of "anchor" -> do @@ -1138,7 +1138,7 @@ parseInline (Elem e) = "userinput" -> codeWithLang "systemitem" -> codeWithLang "varargs" -> return $ code "(...)" - "keycap" -> return (str $ T.pack $ strContent e) + "keycap" -> return (str $ strContent e) "keycombo" -> keycombo <$> mapM parseInline (elContent e) "menuchoice" -> menuchoice <$> @@ -1150,17 +1150,17 @@ parseInline (Elem e) = let title = case attrValue "endterm" e of "" -> maybe "???" xrefTitleByElem (findElementById linkend content) - endterm -> maybe "???" (T.pack . strContent) + endterm -> maybe "???" strContent (findElementById endterm content) return $ link ("#" <> linkend) "" (text title) - "email" -> return $ link ("mailto:" <> T.pack (strContent e)) "" - $ str $ T.pack $ strContent e - "uri" -> return $ link (T.pack $ strContent e) "" $ str $ T.pack $ strContent e + "email" -> return $ link ("mailto:" <> strContent e) "" + $ str $ strContent e + "uri" -> return $ link (strContent e) "" $ str $ strContent e "ulink" -> innerInlines (link (attrValue "url" e) "") "link" -> do ils <- innerInlines id let href = case findAttr (QName "href" (Just "http://www.w3.org/1999/xlink") Nothing) e of - Just h -> T.pack h + Just h -> h _ -> "#" <> attrValue "linkend" e let ils' = if ils == mempty then str href else ils let attr = (attrValue "id" e, T.words $ attrValue "role" e, []) @@ -1180,7 +1180,7 @@ parseInline (Elem e) = "pi-asciidoc-br" -> return linebreak _ -> skip >> innerInlines id where skip = do - let qn = T.pack $ qName $ elName e + let qn = qName $ elName e let name = if "pi-" `T.isPrefixOf` qn then "<?" <> qn <> "?>" else qn @@ -1193,7 +1193,7 @@ parseInline (Elem e) = let classes' = case attrValue "language" e of "" -> [] l -> [l] - return $ codeWith (attrValue "id" e,classes',[]) $ T.pack $ strContentRecursive e + return $ codeWith (attrValue "id" e,classes',[]) $ strContentRecursive e simpleList = mconcat . intersperse (str "," <> space) <$> mapM getInlines (filterChildren (named "member") e) segmentedList = do @@ -1234,10 +1234,10 @@ parseInline (Elem e) = "sect5" -> descendantContent "title" el "cmdsynopsis" -> descendantContent "command" el "funcsynopsis" -> descendantContent "function" el - _ -> T.pack $ qName (elName el) ++ "_title" + _ -> qName (elName el) <> "_title" where xrefLabel = attrValue "xreflabel" el - descendantContent name = maybe "???" (T.pack . strContent) + descendantContent name = maybe "???" strContent . filterElementName (\n -> qName n == name) -- | Extract a math equation from an element @@ -1258,7 +1258,7 @@ equation e constructor = mathMLEquations :: [Text] mathMLEquations = map writeTeX $ rights $ readMath (\x -> qName (elName x) == "math" && qPrefix (elName x) == Just "mml") - (readMathML . T.pack . showElement) + (readMathML . showElement) latexEquations :: [Text] latexEquations = readMath (\x -> qName (elName x) == "mathphrase") @@ -1272,8 +1272,8 @@ equation e constructor = -- | Get the actual text stored in a CData block. 'showContent' -- returns the text still surrounded by the [[CDATA]] tags. showVerbatimCData :: Content -> Text -showVerbatimCData (Text (CData _ d _)) = T.pack d -showVerbatimCData c = T.pack $ showContent c +showVerbatimCData (Text (CData _ d _)) = d +showVerbatimCData c = showContent c -- | Set the prefix of a name to 'Nothing' |