aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorJohn MacFarlane <jgm@berkeley.edu>2017-12-21 23:16:03 -0700
committerGitHub <noreply@github.com>2017-12-21 23:16:03 -0700
commitaf048816555046d83f2cc4813d61e0430321476e (patch)
tree2a36019c921f30506611ffa417b777744efa1c58 /src
parent32f9dbbae5e3e1cce43d372db5564da378947388 (diff)
parentd85357139748ea657f030ab314c39e70f56764f4 (diff)
downloadpandoc-af048816555046d83f2cc4813d61e0430321476e.tar.gz
Merge pull request #4177 from stencila/jats-xml-reader
Add Basic JATS reader based on DocBook reader
Diffstat (limited to 'src')
-rw-r--r--src/Text/Pandoc/Readers.hs5
-rw-r--r--src/Text/Pandoc/Readers/JATS.hs404
-rw-r--r--src/Text/Pandoc/Writers/JATS.hs47
3 files changed, 437 insertions, 19 deletions
diff --git a/src/Text/Pandoc/Readers.hs b/src/Text/Pandoc/Readers.hs
index d954256c8..a8448952e 100644
--- a/src/Text/Pandoc/Readers.hs
+++ b/src/Text/Pandoc/Readers.hs
@@ -52,6 +52,7 @@ module Text.Pandoc.Readers
, readOrg
, readLaTeX
, readHtml
+ , readJATS
, readTextile
, readDocBook
, readOPML
@@ -84,7 +85,8 @@ import Text.Pandoc.Readers.DocBook
import Text.Pandoc.Readers.Docx
import Text.Pandoc.Readers.EPUB
import Text.Pandoc.Readers.Haddock
-import Text.Pandoc.Readers.HTML
+import Text.Pandoc.Readers.HTML (readHtml)
+import Text.Pandoc.Readers.JATS (readJATS)
import Text.Pandoc.Readers.LaTeX
import Text.Pandoc.Readers.Markdown
import Text.Pandoc.Readers.MediaWiki
@@ -129,6 +131,7 @@ readers = [ ("native" , TextReader readNative)
,("org" , TextReader readOrg)
,("textile" , TextReader readTextile) -- TODO : textile+lhs
,("html" , TextReader readHtml)
+ ,("jats" , TextReader readJATS)
,("latex" , TextReader readLaTeX)
,("haddock" , TextReader readHaddock)
,("twiki" , TextReader readTWiki)
diff --git a/src/Text/Pandoc/Readers/JATS.hs b/src/Text/Pandoc/Readers/JATS.hs
new file mode 100644
index 000000000..851fbec35
--- /dev/null
+++ b/src/Text/Pandoc/Readers/JATS.hs
@@ -0,0 +1,404 @@
+{-# LANGUAGE ExplicitForAll, TupleSections #-}
+module Text.Pandoc.Readers.JATS ( readJATS ) where
+import Control.Monad.State.Strict
+import Data.Char (isDigit, isSpace, toUpper)
+import Data.Default
+import Data.Generics
+import Data.List (intersperse)
+import Data.Maybe (maybeToList, fromMaybe)
+import Data.Text (Text)
+import qualified Data.Text as T
+import Text.HTML.TagSoup.Entity (lookupEntity)
+import Text.Pandoc.Builder
+import Text.Pandoc.Class (PandocMonad)
+import Text.Pandoc.Options
+import Text.Pandoc.Shared (underlineSpan, crFilter, safeRead)
+import Text.TeXMath (readMathML, writeTeX)
+import Text.XML.Light
+import qualified Data.Set as S (fromList, member)
+import Data.Set ((\\))
+
+type JATS m = StateT JATSState m
+
+data JATSState = JATSState{ jatsSectionLevel :: Int
+ , jatsQuoteType :: QuoteType
+ , jatsMeta :: Meta
+ , jatsAcceptsMeta :: Bool
+ , jatsBook :: Bool
+ , jatsFigureTitle :: Inlines
+ , jatsContent :: [Content]
+ } deriving Show
+
+instance Default JATSState where
+ def = JATSState{ jatsSectionLevel = 0
+ , jatsQuoteType = DoubleQuote
+ , jatsMeta = mempty
+ , jatsAcceptsMeta = False
+ , jatsBook = False
+ , jatsFigureTitle = mempty
+ , jatsContent = [] }
+
+
+readJATS :: PandocMonad m => ReaderOptions -> Text -> m Pandoc
+readJATS _ inp = do
+ let tree = normalizeTree . parseXML
+ $ T.unpack $ crFilter inp
+ (bs, st') <- flip runStateT (def{ jatsContent = tree }) $ mapM parseBlock tree
+ return $ Pandoc (jatsMeta st') (toList . mconcat $ bs)
+
+-- normalize input, consolidating adjacent Text and CRef elements
+normalizeTree :: [Content] -> [Content]
+normalizeTree = everywhere (mkT go)
+ where go :: [Content] -> [Content]
+ go (Text (CData CDataRaw _ _):xs) = xs
+ go (Text (CData CDataText s1 z):Text (CData CDataText s2 _):xs) =
+ Text (CData CDataText (s1 ++ s2) z):xs
+ go (Text (CData CDataText s1 z):CRef r:xs) =
+ Text (CData CDataText (s1 ++ convertEntity r) z):xs
+ go (CRef r:Text (CData CDataText s1 z):xs) =
+ Text (CData CDataText (convertEntity r ++ s1) z):xs
+ go (CRef r1:CRef r2:xs) =
+ Text (CData CDataText (convertEntity r1 ++ convertEntity r2) Nothing):xs
+ go xs = xs
+
+convertEntity :: String -> String
+convertEntity e = Data.Maybe.fromMaybe (map toUpper e) (lookupEntity e)
+
+-- convenience function to get an attribute value, defaulting to ""
+attrValue :: String -> Element -> String
+attrValue attr =
+ fromMaybe "" . maybeAttrValue attr
+
+maybeAttrValue :: String -> Element -> Maybe String
+maybeAttrValue attr elt =
+ lookupAttrBy (\x -> qName x == attr) (elAttribs elt)
+
+-- convenience function
+named :: String -> Element -> Bool
+named s e = qName (elName e) == s
+
+--
+
+acceptingMetadata :: PandocMonad m => JATS m a -> JATS m a
+acceptingMetadata p = do
+ modify (\s -> s { jatsAcceptsMeta = True } )
+ res <- p
+ modify (\s -> s { jatsAcceptsMeta = False })
+ return res
+
+checkInMeta :: (PandocMonad m, Monoid a) => JATS m () -> JATS m a
+checkInMeta p = do
+ accepts <- jatsAcceptsMeta <$> get
+ when accepts p
+ return mempty
+
+addMeta :: PandocMonad m => ToMetaValue a => String -> a -> JATS m ()
+addMeta field val = modify (setMeta field val)
+
+instance HasMeta JATSState where
+ setMeta field v s = s {jatsMeta = setMeta field v (jatsMeta s)}
+ deleteMeta field s = s {jatsMeta = deleteMeta field (jatsMeta s)}
+
+isBlockElement :: Content -> Bool
+isBlockElement (Elem e) = qName (elName e) `S.member` blocktags
+ where blocktags = S.fromList (paragraphLevel ++ lists ++ mathML ++ other) \\ S.fromList inlinetags
+ paragraphLevel = ["address", "array", "boxed-text", "chem-struct-wrap",
+ "code", "fig", "fig-group", "graphic", "media", "preformat",
+ "supplementary-material", "table-wrap", "table-wrap-group",
+ "alternatives", "disp-formula", "disp-formula-group"]
+ lists = ["def-list", "list"]
+ mathML = ["tex-math", "mml:math"]
+ other = ["p", "related-article", "related-object", "ack", "disp-quote",
+ "speech", "statement", "verse-group", "x"]
+ inlinetags = ["email", "ext-link", "uri", "inline-supplementary-material",
+ "related-article", "related-object", "hr", "bold", "fixed-case",
+ "italic", "monospace", "overline", "overline-start", "overline-end",
+ "roman", "sans-serif", "sc", "strike", "underline", "underline-start",
+ "underline-end", "ruby", "alternatives", "inline-graphic", "private-char",
+ "chem-struct", "inline-formula", "tex-math", "mml:math", "abbrev",
+ "milestone-end", "milestone-start", "named-content", "styled-content",
+ "fn", "target", "xref", "sub", "sup", "x", "address", "array",
+ "boxed-text", "chem-struct-wrap", "code", "fig", "fig-group", "graphic",
+ "media", "preformat", "supplementary-material", "table-wrap",
+ "table-wrap-group", "disp-formula", "disp-formula-group",
+ "citation-alternatives", "element-citation", "mixed-citation",
+ "nlm-citation", "award-id", "funding-source", "open-access",
+ "def-list", "list", "ack", "disp-quote", "speech", "statement",
+ "verse-group"]
+isBlockElement _ = False
+
+-- Trim leading and trailing newline characters
+trimNl :: String -> String
+trimNl = reverse . go . reverse . go
+ where go ('\n':xs) = xs
+ go xs = xs
+
+-- function that is used by both graphic (in parseBlock)
+-- and inline-graphic (in parseInline)
+getGraphic :: PandocMonad m => Element -> JATS m Inlines
+getGraphic e = do
+ let atVal a = attrValue a e
+ attr = (atVal "id", words $ atVal "role", [])
+ imageUrl = atVal "href"
+ captionOrLabel = case filterChild (\x -> named "caption" x
+ || named "label" x) e of
+ Nothing -> return mempty
+ Just z -> mconcat <$>
+ mapM parseInline (elContent z)
+ figTitle <- gets jatsFigureTitle
+ let (caption, title) = if isNull figTitle
+ then (captionOrLabel, atVal "title")
+ else (return figTitle, "fig:")
+ fmap (imageWith attr imageUrl title) caption
+
+getBlocks :: PandocMonad m => Element -> JATS m Blocks
+getBlocks e = mconcat <$>
+ mapM parseBlock (elContent e)
+
+
+parseBlock :: PandocMonad m => Content -> JATS m Blocks
+parseBlock (Text (CData CDataRaw _ _)) = return mempty -- DOCTYPE
+parseBlock (Text (CData _ s _)) = if all isSpace s
+ then return mempty
+ else return $ plain $ trimInlines $ text s
+parseBlock (CRef x) = return $ plain $ str $ map toUpper x
+parseBlock (Elem e) =
+ case qName (elName e) of
+ "p" -> parseMixed para (elContent e)
+ "code" -> codeBlockWithLang
+ "preformat" -> codeBlockWithLang
+ "disp-quote" -> parseBlockquote
+ "list" -> case attrValue "list-type" e of
+ "bullet" -> bulletList <$> listitems
+ listType -> do
+ let start = fromMaybe 1 $
+ (strContent <$> (filterElement (named "list-item") e
+ >>= filterElement (named "lable")))
+ >>= safeRead
+ orderedListWith (start, parseListStyleType listType, DefaultDelim)
+ <$> listitems
+ "def-list" -> definitionList <$> deflistitems
+ "sec" -> gets jatsSectionLevel >>= sect . (+1)
+ "title" -> return mempty
+ "title-group" -> checkInMeta getTitle
+ "graphic" -> para <$> getGraphic e
+ "journal-meta" -> metaBlock
+ "article-meta" -> metaBlock
+ "custom-meta" -> metaBlock
+ "table" -> parseTable
+ "fig" -> divWith (attrValue "id" e, ["fig"], []) <$> getBlocks e
+ "table-wrap" -> divWith (attrValue "id" e, ["table-wrap"], []) <$> getBlocks e
+ "caption" -> divWith (attrValue "id" e, ["caption"], []) <$> sect 6
+ "ref-list" -> divWith ("refs", [], []) <$> getBlocks e
+ "ref" -> divWith ("ref-" <> attrValue "id" e, [], []) <$> getBlocks e
+ "?xml" -> return mempty
+ _ -> getBlocks e
+ where parseMixed container conts = do
+ let (ils,rest) = break isBlockElement conts
+ ils' <- (trimInlines . mconcat) <$> mapM parseInline ils
+ let p = if ils' == mempty then mempty else container ils'
+ case rest of
+ [] -> return p
+ (r:rs) -> do
+ b <- parseBlock r
+ x <- parseMixed container rs
+ return $ p <> b <> x
+ codeBlockWithLang = do
+ let classes' = case attrValue "language" e of
+ "" -> []
+ x -> [x]
+ return $ codeBlockWith (attrValue "id" e, classes', [])
+ $ trimNl $ strContentRecursive e
+ parseBlockquote = do
+ attrib <- case filterChild (named "attribution") e of
+ Nothing -> return mempty
+ Just z -> (para . (str "— " <>) . mconcat)
+ <$>
+ mapM parseInline (elContent z)
+ contents <- getBlocks e
+ return $ blockQuote (contents <> attrib)
+ parseListStyleType "roman-lower" = LowerRoman
+ parseListStyleType "roman-upper" = UpperRoman
+ parseListStyleType "alpha-lower" = LowerAlpha
+ parseListStyleType "alpha-upper" = UpperAlpha
+ parseListStyleType _ = DefaultStyle
+ listitems = mapM getBlocks $ filterChildren (named "list-item") e
+ deflistitems = mapM parseVarListEntry $ filterChildren
+ (named "def-item") e
+ parseVarListEntry e' = do
+ let terms = filterChildren (named "term") e'
+ let items = filterChildren (named "def") e'
+ terms' <- mapM getInlines terms
+ items' <- mapM getBlocks items
+ return (mconcat $ intersperse (str "; ") terms', items')
+ getTitle = do
+ tit <- case filterChild (named "article-title") e of
+ Just s -> getInlines s
+ Nothing -> return mempty
+ subtit <- case filterChild (named "subtitle") e of
+ Just s -> (text ": " <>) <$>
+ getInlines s
+ Nothing -> return mempty
+ addMeta "title" (tit <> subtit)
+
+ parseTable = do
+ let isCaption x = named "title" x || named "caption" x
+ caption <- case filterChild isCaption e of
+ Just t -> getInlines t
+ Nothing -> return mempty
+ let e' = fromMaybe e $ filterChild (named "tgroup") e
+ let isColspec x = named "colspec" x || named "col" x
+ let colspecs = case filterChild (named "colgroup") e' of
+ Just c -> filterChildren isColspec c
+ _ -> filterChildren isColspec e'
+ let isRow x = named "row" x || named "tr" x
+ headrows <- case filterChild (named "thead") e' of
+ Just h -> case filterChild isRow h of
+ Just x -> parseRow x
+ Nothing -> return []
+ Nothing -> return []
+ bodyrows <- case filterChild (named "tbody") e' of
+ Just b -> mapM parseRow
+ $ filterChildren isRow b
+ Nothing -> mapM parseRow
+ $ filterChildren isRow e'
+ let toAlignment c = case findAttr (unqual "align") c of
+ Just "left" -> AlignLeft
+ Just "right" -> AlignRight
+ Just "center" -> AlignCenter
+ _ -> AlignDefault
+ let toWidth c = case findAttr (unqual "colwidth") c of
+ Just w -> fromMaybe 0
+ $ safeRead $ '0': filter (\x ->
+ isDigit x || x == '.') w
+ Nothing -> 0 :: Double
+ let numrows = case bodyrows of
+ [] -> 0
+ xs -> maximum $ map length xs
+ let aligns = case colspecs of
+ [] -> replicate numrows AlignDefault
+ cs -> map toAlignment cs
+ let widths = case colspecs of
+ [] -> replicate numrows 0
+ cs -> let ws = map toWidth cs
+ tot = sum ws
+ in if all (> 0) ws
+ then map (/ tot) ws
+ else replicate numrows 0
+ let headrows' = if null headrows
+ then replicate numrows mempty
+ else headrows
+ return $ table caption (zip aligns widths)
+ headrows' bodyrows
+ isEntry x = named "entry" x || named "td" x || named "th" x
+ parseRow = mapM (parseMixed plain . elContent) . filterChildren isEntry
+ sect n = do isbook <- gets jatsBook
+ let n' = if isbook || n == 0 then n + 1 else n
+ headerText <- case filterChild (named "title") e `mplus`
+ (filterChild (named "info") e >>=
+ filterChild (named "title")) of
+ Just t -> getInlines t
+ Nothing -> return mempty
+ oldN <- gets jatsSectionLevel
+ modify $ \st -> st{ jatsSectionLevel = n }
+ b <- getBlocks e
+ let ident = attrValue "id" e
+ modify $ \st -> st{ jatsSectionLevel = oldN }
+ return $ headerWith (ident,[],[]) n' headerText <> b
+-- lineItems = mapM getInlines $ filterChildren (named "line") e
+ metaBlock = acceptingMetadata (getBlocks e) >> return mempty
+
+getInlines :: PandocMonad m => Element -> JATS m Inlines
+getInlines e' = (trimInlines . mconcat) <$>
+ mapM parseInline (elContent e')
+
+strContentRecursive :: Element -> String
+strContentRecursive = strContent .
+ (\e' -> e'{ elContent = map elementToStr $ elContent e' })
+
+elementToStr :: Content -> Content
+elementToStr (Elem e') = Text $ CData CDataText (strContentRecursive e') Nothing
+elementToStr x = x
+
+parseInline :: PandocMonad m => Content -> JATS m Inlines
+parseInline (Text (CData _ s _)) = return $ text s
+parseInline (CRef ref) =
+ return $ maybe (text $ map toUpper ref) text $ lookupEntity ref
+parseInline (Elem e) =
+ case qName (elName e) of
+ "italic" -> emph <$> innerInlines
+ "bold" -> strong <$> innerInlines
+ "strike" -> strikeout <$> innerInlines
+ "sub" -> subscript <$> innerInlines
+ "sup" -> superscript <$> innerInlines
+ "underline" -> underlineSpan <$> innerInlines
+ "break" -> return linebreak
+ "sc" -> smallcaps <$> innerInlines
+
+ "code" -> codeWithLang
+ "monospace" -> codeWithLang
+
+ "inline-graphic" -> getGraphic e
+ "disp-quote" -> do
+ qt <- gets jatsQuoteType
+ let qt' = if qt == SingleQuote then DoubleQuote else SingleQuote
+ modify $ \st -> st{ jatsQuoteType = qt' }
+ contents <- innerInlines
+ modify $ \st -> st{ jatsQuoteType = qt }
+ return $ if qt == SingleQuote
+ then singleQuoted contents
+ else doubleQuoted contents
+
+ "xref" -> do
+ ils <- innerInlines
+ let rid = attrValue "rid" e
+ let refType = ("ref-type",) <$> maybeAttrValue "ref-type" e
+ let attr = (attrValue "id" e, [], maybeToList refType)
+ return $ linkWith attr ('#' : rid) "" ils
+ "ext-link" -> do
+ ils <- innerInlines
+ let title = fromMaybe "" $ findAttr (QName "title" (Just "http://www.w3.org/1999/xlink") Nothing) e
+ let href = case findAttr (QName "href" (Just "http://www.w3.org/1999/xlink") Nothing) e of
+ Just h -> h
+ _ -> '#' : attrValue "rid" e
+ let ils' = if ils == mempty then str href else ils
+ let attr = (attrValue "id" e, [], [])
+ return $ linkWith attr href title ils'
+
+ "disp-formula" -> formula displayMath
+ "inline-formula" -> formula math
+ "math" | qPrefix (elName e) == Just "mml" -> return . math $ mathML e
+ "tex-math" -> return . math $ strContent e
+
+ "email" -> return $ link ("mailto:" ++ strContent e) ""
+ $ str $ strContent e
+ "uri" -> return $ link (strContent e) "" $ str $ strContent e
+ "fn" -> (note . mconcat) <$>
+ mapM parseBlock (elContent e)
+ -- Note: this isn't a real docbook tag; it's what we convert
+ -- <?asciidor-br?> to in handleInstructions, above. A kludge to
+ -- work around xml-light's inability to parse an instruction.
+ _ -> innerInlines
+ where innerInlines = (trimInlines . mconcat) <$>
+ mapM parseInline (elContent e)
+ mathML x =
+ case readMathML . showElement $ everywhere (mkT removePrefix) x of
+ Left _ -> mempty
+ Right m -> writeTeX m
+ formula constructor = do
+ let whereToLook = fromMaybe e $ filterElement (named "alternatives") e
+ texMaths = map strContent $
+ filterChildren (named "tex-math") whereToLook
+ mathMLs = map mathML $
+ filterChildren isMathML whereToLook
+ return . mconcat . take 1 . map constructor $ texMaths ++ mathMLs
+
+ isMathML x = qName (elName x) == "math" &&
+ qPrefix (elName x) == Just "mml"
+ removePrefix elname = elname { qPrefix = Nothing }
+ codeWithLang = do
+ let classes' = case attrValue "language" e of
+ "" -> []
+ l -> [l]
+ return $ codeWith (attrValue "id" e,classes',[]) $ strContentRecursive e
+
diff --git a/src/Text/Pandoc/Writers/JATS.hs b/src/Text/Pandoc/Writers/JATS.hs
index 901bcb646..e9e380a6c 100644
--- a/src/Text/Pandoc/Writers/JATS.hs
+++ b/src/Text/Pandoc/Writers/JATS.hs
@@ -170,6 +170,28 @@ imageMimeType src kvs =
((drop 1 . dropWhile (/='/')) <$> mbMT)
in (maintype, subtype)
+languageFor :: [String] -> String
+languageFor classes =
+ case langs of
+ (l:_) -> escapeStringForXML l
+ [] -> ""
+ where isLang l = map toLower l `elem` map (map toLower) languages
+ langsFrom s = if isLang s
+ then [s]
+ else languagesByExtension . map toLower $ s
+ langs = concatMap langsFrom classes
+
+codeAttr :: Attr -> (String, [(String, String)])
+codeAttr (ident,classes,kvs) = (lang, attr)
+ where
+ attr = [("id",ident) | not (null ident)] ++
+ [("language",lang) | not (null lang)] ++
+ [(k,v) | (k,v) <- kvs, k `elem` ["code-type",
+ "code-version", "executable",
+ "language-version", "orientation",
+ "platforms", "position", "specific-use"]]
+ lang = languageFor classes
+
-- | Convert a Pandoc block element to JATS.
blockToJATS :: PandocMonad m => WriterOptions -> Block -> JATS m Doc
blockToJATS _ Null = return empty
@@ -233,23 +255,10 @@ blockToJATS opts (LineBlock lns) =
blockToJATS opts $ linesToPara lns
blockToJATS opts (BlockQuote blocks) =
inTagsIndented "disp-quote" <$> blocksToJATS opts blocks
-blockToJATS _ (CodeBlock (ident,classes,kvs) str) = return $
+blockToJATS _ (CodeBlock a str) = return $
inTags False tag attr (flush (text (escapeStringForXML str)))
- where attr = [("id",ident) | not (null ident)] ++
- [("language",lang) | not (null lang)] ++
- [(k,v) | (k,v) <- kvs, k `elem` ["code-type",
- "code-version", "executable",
- "language-version", "orientation",
- "platforms", "position", "specific-use"]]
- tag = if null lang then "preformat" else "code"
- lang = case langs of
- (l:_) -> escapeStringForXML l
- [] -> ""
- isLang l = map toLower l `elem` map (map toLower) languages
- langsFrom s = if isLang s
- then [s]
- else languagesByExtension . map toLower $ s
- langs = concatMap langsFrom classes
+ where (lang, attr) = codeAttr a
+ tag = if null lang then "preformat" else "code"
blockToJATS _ (BulletList []) = return empty
blockToJATS opts (BulletList lst) =
inTags True "list" [("list-type", "bullet")] <$>
@@ -349,8 +358,10 @@ inlineToJATS opts (Quoted SingleQuote lst) = do
inlineToJATS opts (Quoted DoubleQuote lst) = do
contents <- inlinesToJATS opts lst
return $ char '“' <> contents <> char '”'
-inlineToJATS _ (Code _ str) =
- return $ inTagsSimple "monospace" $ text (escapeStringForXML str)
+inlineToJATS _ (Code a str) =
+ return $ inTags False tag attr $ text (escapeStringForXML str)
+ where (lang, attr) = codeAttr a
+ tag = if null lang then "monospace" else "code"
inlineToJATS _ il@(RawInline f x)
| f == "jats" = return $ text x
| otherwise = do