aboutsummaryrefslogtreecommitdiff
path: root/src/Text/Pandoc/Readers/JATS.hs
diff options
context:
space:
mode:
authorJohn MacFarlane <jgm@berkeley.edu>2021-02-14 22:29:21 -0800
committerJohn MacFarlane <jgm@berkeley.edu>2021-02-16 16:55:20 -0800
commit967e7f5fb990b29de48b37be1db40fb149a8cf55 (patch)
treeb9f903a5f2af14f20e769903e80659b9bffd59ff /src/Text/Pandoc/Readers/JATS.hs
parentb5b576184c3c1668aad0c904e186136b81a0dd54 (diff)
downloadpandoc-967e7f5fb990b29de48b37be1db40fb149a8cf55.tar.gz
Rename Text.Pandoc.XMLParser -> Text.Pandoc.XML.Light...
..and add new definitions isomorphic to xml-light's, but with Text instead of String. This allows us to keep most of the code in existing readers that use xml-light, but avoid lots of unnecessary allocation. We also add versions of the functions from xml-light's Text.XML.Light.Output and Text.XML.Light.Proc that operate on our modified XML types, and functions that convert xml-light types to our types (since some of our dependencies, like texmath, use xml-light). Update golden tests for docx and pptx. OOXML test: Use `showContent` instead of `ppContent` in `displayDiff`. Docx: Do a manual traversal to unwrap sdt and smartTag. This is faster, and needed to pass the tests. Benchmarks: A = prior to 8ca191604dcd13af27c11d2da225da646ebce6fc (Feb 8) B = as of 8ca191604dcd13af27c11d2da225da646ebce6fc (Feb 8) C = this commit | Reader | A | B | C | | ------- | ----- | ------ | ----- | | docbook | 18 ms | 12 ms | 10 ms | | opml | 65 ms | 62 ms | 35 ms | | jats | 15 ms | 11 ms | 9 ms | | docx | 72 ms | 69 ms | 44 ms | | odt | 78 ms | 41 ms | 28 ms | | epub | 64 ms | 61 ms | 56 ms | | fb2 | 14 ms | 5 ms | 4 ms |
Diffstat (limited to 'src/Text/Pandoc/Readers/JATS.hs')
-rw-r--r--src/Text/Pandoc/Readers/JATS.hs58
1 files changed, 25 insertions, 33 deletions
diff --git a/src/Text/Pandoc/Readers/JATS.hs b/src/Text/Pandoc/Readers/JATS.hs
index dfd343b7a..5353f2001 100644
--- a/src/Text/Pandoc/Readers/JATS.hs
+++ b/src/Text/Pandoc/Readers/JATS.hs
@@ -16,7 +16,7 @@ module Text.Pandoc.Readers.JATS ( readJATS ) where
import Control.Monad.State.Strict
import Control.Monad.Except (throwError)
import Text.Pandoc.Error (PandocError(..))
-import Data.Char (isDigit, isSpace, toUpper)
+import Data.Char (isDigit, isSpace)
import Data.Default
import Data.Generics
import Data.List (foldl', intersperse)
@@ -31,8 +31,7 @@ import Text.Pandoc.Class.PandocMonad (PandocMonad)
import Text.Pandoc.Options
import Text.Pandoc.Shared (crFilter, safeRead, extractSpaces)
import Text.TeXMath (readMathML, writeTeX)
-import Text.XML.Light
-import Text.Pandoc.XMLParser (parseXMLContents)
+import Text.Pandoc.XML.Light
import qualified Data.Set as S (fromList, member)
import Data.Set ((\\))
@@ -67,29 +66,29 @@ normalizeTree = everywhere (mkT go)
where go :: [Content] -> [Content]
go (Text (CData CDataRaw _ _):xs) = xs
go (Text (CData CDataText s1 z):Text (CData CDataText s2 _):xs) =
- Text (CData CDataText (s1 ++ s2) z):xs
+ Text (CData CDataText (s1 <> s2) z):xs
go (Text (CData CDataText s1 z):CRef r:xs) =
- Text (CData CDataText (s1 ++ convertEntity r) z):xs
+ Text (CData CDataText (s1 <> convertEntity r) z):xs
go (CRef r:Text (CData CDataText s1 z):xs) =
- Text (CData CDataText (convertEntity r ++ s1) z):xs
+ Text (CData CDataText (convertEntity r <> s1) z):xs
go (CRef r1:CRef r2:xs) =
- Text (CData CDataText (convertEntity r1 ++ convertEntity r2) Nothing):xs
+ Text (CData CDataText (convertEntity r1 <> convertEntity r2) Nothing):xs
go xs = xs
-convertEntity :: String -> String
-convertEntity e = Data.Maybe.fromMaybe (map toUpper e) (lookupEntity e)
+convertEntity :: Text -> Text
+convertEntity e = maybe (T.toUpper e) T.pack (lookupEntity $ T.unpack e)
-- convenience function to get an attribute value, defaulting to ""
-attrValue :: String -> Element -> Text
+attrValue :: Text -> Element -> Text
attrValue attr =
fromMaybe "" . maybeAttrValue attr
-maybeAttrValue :: String -> Element -> Maybe Text
+maybeAttrValue :: Text -> Element -> Maybe Text
maybeAttrValue attr elt =
- T.pack <$> lookupAttrBy (\x -> qName x == attr) (elAttribs elt)
+ lookupAttrBy (\x -> qName x == attr) (elAttribs elt)
-- convenience function
-named :: String -> Element -> Bool
+named :: Text -> Element -> Bool
named s e = qName (elName e) == s
--
@@ -155,10 +154,10 @@ getBlocks e = mconcat <$>
parseBlock :: PandocMonad m => Content -> JATS m Blocks
parseBlock (Text (CData CDataRaw _ _)) = return mempty -- DOCTYPE
-parseBlock (Text (CData _ s _)) = if all isSpace s
+parseBlock (Text (CData _ s _)) = if T.all isSpace s
then return mempty
- else return $ plain $ trimInlines $ text $ T.pack s
-parseBlock (CRef x) = return $ plain $ str $ T.toUpper $ T.pack x
+ else return $ plain $ trimInlines $ text s
+parseBlock (CRef x) = return $ plain $ str $ T.toUpper x
parseBlock (Elem e) =
case qName (elName e) of
"p" -> parseMixed para (elContent e)
@@ -207,7 +206,7 @@ parseBlock (Elem e) =
"" -> []
x -> [x]
return $ codeBlockWith (attrValue "id" e, classes', [])
- $ trimNl $ textContentRecursive e
+ $ trimNl $ strContentRecursive e
parseBlockquote = do
attrib <- case filterChild (named "attribution") e of
Nothing -> return mempty
@@ -271,7 +270,7 @@ parseBlock (Elem e) =
Just "center" -> AlignCenter
_ -> AlignDefault
let toWidth c = do
- w <- findAttrText (unqual "colwidth") c
+ w <- findAttr (unqual "colwidth") c
n <- safeRead $ "0" <> T.filter (\x -> isDigit x || x == '.') w
if n > 0 then Just n else Nothing
let numrows = foldl' max 0 $ map length bodyrows
@@ -442,16 +441,10 @@ parseRef e = do
Nothing -> return $ Map.insert "id" (toMetaValue refId) mempty
-- TODO handle mixed-citation
-findAttrText :: QName -> Element -> Maybe Text
-findAttrText x = fmap T.pack . findAttr x
-
textContent :: Element -> Text
-textContent = T.pack . strContent
-
-textContentRecursive :: Element -> Text
-textContentRecursive = T.pack . strContentRecursive
+textContent = strContent
-strContentRecursive :: Element -> String
+strContentRecursive :: Element -> Text
strContentRecursive = strContent .
(\e' -> e'{ elContent = map elementToStr $ elContent e' })
@@ -460,9 +453,8 @@ elementToStr (Elem e') = Text $ CData CDataText (strContentRecursive e') Nothing
elementToStr x = x
parseInline :: PandocMonad m => Content -> JATS m Inlines
-parseInline (Text (CData _ s _)) = return $ text $ T.pack s
-parseInline (CRef ref) =
- return . text . maybe (T.toUpper $ T.pack ref) T.pack $ lookupEntity ref
+parseInline (Text (CData _ s _)) = return $ text s
+parseInline (CRef ref) = return . text . convertEntity $ ref
parseInline (Elem e) =
case qName (elName e) of
"italic" -> innerInlines emph
@@ -507,9 +499,9 @@ parseInline (Elem e) =
else linkWith attr ("#" <> rid) "" ils
"ext-link" -> do
ils <- innerInlines id
- let title = fromMaybe "" $ findAttrText (QName "title" (Just "http://www.w3.org/1999/xlink") Nothing) e
+ let title = fromMaybe "" $ findAttr (QName "title" (Just "http://www.w3.org/1999/xlink") Nothing) e
let href = case findAttr (QName "href" (Just "http://www.w3.org/1999/xlink") Nothing) e of
- Just h -> T.pack h
+ Just h -> h
_ -> "#" <> attrValue "rid" e
let ils' = if ils == mempty then str href else ils
let attr = (attrValue "id" e, [], [])
@@ -529,7 +521,7 @@ parseInline (Elem e) =
where innerInlines f = extractSpaces f . mconcat <$>
mapM parseInline (elContent e)
mathML x =
- case readMathML . T.pack . showElement $ everywhere (mkT removePrefix) x of
+ case readMathML . showElement $ everywhere (mkT removePrefix) x of
Left _ -> mempty
Right m -> writeTeX m
formula constructor = do
@@ -547,4 +539,4 @@ parseInline (Elem e) =
let classes' = case attrValue "language" e of
"" -> []
l -> [l]
- return $ codeWith (attrValue "id" e,classes',[]) $ textContentRecursive e
+ return $ codeWith (attrValue "id" e,classes',[]) $ strContentRecursive e