aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--src/Text/Pandoc/Writers/JATS.hs27
-rw-r--r--src/Text/Pandoc/Writers/JATS/References.hs5
-rw-r--r--src/Text/Pandoc/Writers/JATS/Table.hs4
-rw-r--r--src/Text/Pandoc/XML.hs30
-rw-r--r--test/Tests/Writers/JATS.hs205
5 files changed, 162 insertions, 109 deletions
diff --git a/src/Text/Pandoc/Writers/JATS.hs b/src/Text/Pandoc/Writers/JATS.hs
index a9369db7a..26f94cb03 100644
--- a/src/Text/Pandoc/Writers/JATS.hs
+++ b/src/Text/Pandoc/Writers/JATS.hs
@@ -239,7 +239,7 @@ languageFor classes =
codeAttr :: Attr -> (Text, [(Text, Text)])
codeAttr (ident,classes,kvs) = (lang, attr)
where
- attr = [("id",ident) | not (T.null ident)] ++
+ attr = [("id", escapeNCName ident) | not (T.null ident)] ++
[("language",lang) | not (T.null lang)] ++
[(k,v) | (k,v) <- kvs, k `elem` ["code-type",
"code-version", "executable",
@@ -251,7 +251,8 @@ codeAttr (ident,classes,kvs) = (lang, attr)
blockToJATS :: PandocMonad m => WriterOptions -> Block -> JATS m (Doc Text)
blockToJATS _ Null = return empty
blockToJATS opts (Div (id',"section":_,kvs) (Header _lvl _ ils : xs)) = do
- let idAttr = [("id", writerIdentifierPrefix opts <> id') | not (T.null id')]
+ let idAttr = [ ("id", writerIdentifierPrefix opts <> escapeNCName id')
+ | not (T.null id')]
let otherAttrs = ["sec-type", "specific-use"]
let attribs = idAttr ++ [(k,v) | (k,v) <- kvs, k `elem` otherAttrs]
title' <- inlinesToJATS opts ils
@@ -260,7 +261,7 @@ blockToJATS opts (Div (id',"section":_,kvs) (Header _lvl _ ils : xs)) = do
inTagsSimple "title" title' $$ contents
-- Bibliography reference:
blockToJATS opts (Div (ident,_,_) [Para lst]) | "ref-" `T.isPrefixOf` ident =
- inTags True "ref" [("id", ident)] .
+ inTags True "ref" [("id", escapeNCName ident)] .
inTagsSimple "mixed-citation" <$>
inlinesToJATS opts lst
blockToJATS opts (Div ("refs",_,_) xs) = do
@@ -271,14 +272,14 @@ blockToJATS opts (Div ("refs",_,_) xs) = do
return $ inTagsIndented "ref-list" contents
blockToJATS opts (Div (ident,[cls],kvs) bs) | cls `elem` ["fig", "caption", "table-wrap"] = do
contents <- blocksToJATS opts bs
- let attr = [("id", ident) | not (T.null ident)] ++
+ let attr = [("id", escapeNCName ident) | not (T.null ident)] ++
[("xml:lang",l) | ("lang",l) <- kvs] ++
[(k,v) | (k,v) <- kvs, k `elem` ["specific-use",
"content-type", "orientation", "position"]]
return $ inTags True cls attr contents
blockToJATS opts (Div (ident,_,kvs) bs) = do
contents <- blocksToJATS opts bs
- let attr = [("id", ident) | not (T.null ident)] ++
+ let attr = [("id", escapeNCName ident) | not (T.null ident)] ++
[("xml:lang",l) | ("lang",l) <- kvs] ++
[(k,v) | (k,v) <- kvs, k `elem` ["specific-use",
"content-type", "orientation", "position"]]
@@ -296,7 +297,7 @@ blockToJATS opts (Para [Image (ident,_,kvs) txt
let capt = if null txt
then empty
else inTagsSimple "caption" $ inTagsSimple "p" alt
- let attr = [("id", ident) | not (T.null ident)] ++
+ let attr = [("id", escapeNCName ident) | not (T.null ident)] ++
[(k,v) | (k,v) <- kvs, k `elem` ["fig-type", "orientation",
"position", "specific-use"]]
let graphicattr = [("mimetype",maintype),
@@ -307,7 +308,7 @@ blockToJATS opts (Para [Image (ident,_,kvs) txt
capt $$ selfClosingTag "graphic" graphicattr
blockToJATS _ (Para [Image (ident,_,kvs) _ (src, tit)]) = do
let (maintype, subtype) = imageMimeType src kvs
- let attr = [("id", ident) | not (T.null ident)] ++
+ let attr = [("id", escapeNCName ident) | not (T.null ident)] ++
[("mimetype", maintype),
("mime-subtype", subtype),
("xlink:href", src)] ++
@@ -434,7 +435,7 @@ inlineToJATS opts (Note contents) = do
let notenum = case notes of
(n, _):_ -> n + 1
[] -> 1
- thenote <- inTags True "fn" [("id","fn" <> tshow notenum)]
+ thenote <- inTags True "fn" [("id", "fn" <> tshow notenum)]
<$> wrappedBlocksToJATS (not . isPara) opts
(walk demoteHeaderAndRefs contents)
modify $ \st -> st{ jatsNotes = (notenum, thenote) : notes }
@@ -447,7 +448,7 @@ inlineToJATS opts (Cite _ lst) =
inlineToJATS opts (Span ("",_,[]) ils) = inlinesToJATS opts ils
inlineToJATS opts (Span (ident,_,kvs) ils) = do
contents <- inlinesToJATS opts ils
- let attr = [("id",ident) | not (T.null ident)] ++
+ let attr = [("id", escapeNCName ident) | not (T.null ident)] ++
[("xml:lang",l) | ("lang",l) <- kvs] ++
[(k,v) | (k,v) <- kvs
, k `elem` ["content-type", "rationale",
@@ -488,9 +489,9 @@ inlineToJATS _ (Link _attr [Str t] (T.stripPrefix "mailto:" -> Just email, _))
return $ inTagsSimple "email" $ literal (escapeStringForXML email)
inlineToJATS opts (Link (ident,_,kvs) txt (T.uncons -> Just ('#', src), _)) = do
let attr = mconcat
- [ [("id", ident) | not (T.null ident)]
+ [ [("id", escapeNCName ident) | not (T.null ident)]
, [("alt", stringify txt) | not (null txt)]
- , [("rid", src)]
+ , [("rid", escapeNCName src)]
, [(k,v) | (k,v) <- kvs, k `elem` ["ref-type", "specific-use"]]
, [("ref-type", "bibr") | "ref-" `T.isPrefixOf` src]
]
@@ -500,7 +501,7 @@ inlineToJATS opts (Link (ident,_,kvs) txt (T.uncons -> Just ('#', src), _)) = do
contents <- inlinesToJATS opts txt
return $ inTags False "xref" attr contents
inlineToJATS opts (Link (ident,_,kvs) txt (src, tit)) = do
- let attr = [("id", ident) | not (T.null ident)] ++
+ let attr = [("id", escapeNCName ident) | not (T.null ident)] ++
[("ext-link-type", "uri"),
("xlink:href", src)] ++
[("xlink:title", tit) | not (T.null tit)] ++
@@ -518,7 +519,7 @@ inlineToJATS _ (Image (ident,_,kvs) _ (src, tit)) = do
let subtype = fromMaybe "" $
lookup "mime-subtype" kvs `mplus`
(T.drop 1 . T.dropWhile (/='/') <$> mbMT)
- let attr = [("id", ident) | not (T.null ident)] ++
+ let attr = [("id", escapeNCName ident) | not (T.null ident)] ++
[("mimetype", maintype),
("mime-subtype", subtype),
("xlink:href", src)] ++
diff --git a/src/Text/Pandoc/Writers/JATS/References.hs b/src/Text/Pandoc/Writers/JATS/References.hs
index 903144128..5b19fd034 100644
--- a/src/Text/Pandoc/Writers/JATS/References.hs
+++ b/src/Text/Pandoc/Writers/JATS/References.hs
@@ -29,7 +29,7 @@ import Text.Pandoc.Builder (Inlines)
import Text.Pandoc.Options (WriterOptions)
import Text.Pandoc.Shared (tshow)
import Text.Pandoc.Writers.JATS.Types
-import Text.Pandoc.XML (escapeStringForXML, inTags)
+import Text.Pandoc.XML (escapeNCName, escapeStringForXML, inTags)
import qualified Data.Text as T
referencesToJATS :: PandocMonad m
@@ -46,7 +46,8 @@ referenceToJATS :: PandocMonad m
referenceToJATS _opts ref = do
let refType = referenceType ref
let pubType = [("publication-type", refType) | not (T.null refType)]
- let wrap = inTags True "ref" [("id", "ref-" <> unItemId (referenceId ref))]
+ let ident = escapeNCName $ "ref-" <> unItemId (referenceId ref)
+ let wrap = inTags True "ref" [("id", ident)]
. inTags True "element-citation" pubType
return . wrap . vcat $
[ authors
diff --git a/src/Text/Pandoc/Writers/JATS/Table.hs b/src/Text/Pandoc/Writers/JATS/Table.hs
index 465480f59..2e34900d2 100644
--- a/src/Text/Pandoc/Writers/JATS/Table.hs
+++ b/src/Text/Pandoc/Writers/JATS/Table.hs
@@ -24,7 +24,7 @@ import Text.Pandoc.Definition
import Text.Pandoc.Options (WriterOptions)
import Text.Pandoc.Shared (tshow)
import Text.Pandoc.Writers.JATS.Types
-import Text.Pandoc.XML (inTags, inTagsIndented, selfClosingTag)
+import Text.Pandoc.XML (escapeNCName, inTags, inTagsIndented, selfClosingTag)
import qualified Data.Text as T
import qualified Text.Pandoc.Writers.AnnotatedTable as Ann
@@ -216,7 +216,7 @@ cellToJats opts celltype (Ann.Cell (colspec :| _) _colNum cell) =
toAttribs :: Attr -> [Text] -> [(Text, Text)]
toAttribs (ident, _classes, kvs) knownAttribs =
- (if T.null ident then id else (("id", ident) :)) $
+ (if T.null ident then id else (("id", escapeNCName ident) :)) $
filter ((`elem` knownAttribs) . fst) kvs
tableCellToJats :: PandocMonad m
diff --git a/src/Text/Pandoc/XML.hs b/src/Text/Pandoc/XML.hs
index 6dbbce1d2..79b4768ec 100644
--- a/src/Text/Pandoc/XML.hs
+++ b/src/Text/Pandoc/XML.hs
@@ -13,6 +13,7 @@ Functions for escaping and formatting XML.
-}
module Text.Pandoc.XML ( escapeCharForXML,
escapeStringForXML,
+ escapeNCName,
inTags,
selfClosingTag,
inTagsSimple,
@@ -24,7 +25,7 @@ module Text.Pandoc.XML ( escapeCharForXML,
html5Attributes,
rdfaAttributes ) where
-import Data.Char (isAscii, isSpace, ord)
+import Data.Char (isAscii, isSpace, ord, isLetter, isDigit)
import Data.Text (Text)
import qualified Data.Text as T
import Text.HTML.TagSoup.Entity (lookupEntity, htmlEntities)
@@ -119,8 +120,33 @@ html5EntityMap = foldr go mempty htmlEntities
where ent' = T.takeWhile (/=';') (T.pack ent)
_ -> entmap
+-- | Converts a string into an NCName, i.e., an XML name without colons.
+-- Disallowed characters are escaped using @ux%x@, where @%x@ is the
+-- hexadecimal unicode identifier of the escaped character.
+escapeNCName :: Text -> Text
+escapeNCName t = case T.uncons t of
+ Nothing -> T.empty
+ Just (c, cs) -> escapeStartChar c <> T.concatMap escapeNCNameChar cs
+ where
+ escapeStartChar :: Char -> Text
+ escapeStartChar c = if isLetter c || c == '_'
+ then T.singleton c
+ else escapeChar c
--- Unescapes XML entities
+ escapeNCNameChar :: Char -> Text
+ escapeNCNameChar c = if isNCNameChar c
+ then T.singleton c
+ else escapeChar c
+
+ isNCNameChar :: Char -> Bool
+ isNCNameChar c = isLetter c || c `elem` ("_-.·" :: String) || isDigit c
+ || '\x0300' <= c && c <= '\x036f'
+ || '\x203f' <= c && c <= '\x2040'
+
+ escapeChar :: Char -> Text
+ escapeChar = T.pack . printf "U%04X" . ord
+
+-- | Unescapes XML entities
fromEntities :: Text -> Text
fromEntities t
= let (x, y) = T.break (== '&') t
diff --git a/test/Tests/Writers/JATS.hs b/test/Tests/Writers/JATS.hs
index 2f501c890..23c1686dc 100644
--- a/test/Tests/Writers/JATS.hs
+++ b/test/Tests/Writers/JATS.hs
@@ -1,21 +1,21 @@
{-# LANGUAGE OverloadedStrings #-}
module Tests.Writers.JATS (tests) where
-import Data.Text (unpack)
+import Data.Text (Text)
import Test.Tasty
import Tests.Helpers
import Text.Pandoc
import Text.Pandoc.Arbitrary ()
import Text.Pandoc.Builder
+import qualified Data.Text as T
-jats :: (ToPandoc a) => a -> String
-jats = unpack
- . purely (writeJATS def{ writerWrapText = WrapNone })
- . toPandoc
+jats :: (ToPandoc a) => a -> Text
+jats = purely (writeJATS def{ writerWrapText = WrapNone })
+ . toPandoc
-jatsArticleAuthoring :: (ToPandoc a) => a -> String
-jatsArticleAuthoring = unpack
- . purely (writeJatsArticleAuthoring def{ writerWrapText = WrapNone })
+jatsArticleAuthoring :: (ToPandoc a) => a -> Text
+jatsArticleAuthoring =
+ purely (writeJatsArticleAuthoring def{ writerWrapText = WrapNone })
. toPandoc
{-
@@ -32,89 +32,114 @@ which is in turn shorthand for
infix 4 =:
(=:) :: (ToString a, ToPandoc a)
- => String -> (a, String) -> TestTree
+ => String -> (a, Text) -> TestTree
(=:) = test jats
tests :: [TestTree]
-tests = [ testGroup "inline code"
- [ "basic" =: code "@&" =?> "<p><monospace>@&amp;</monospace></p>"
- , "lang" =: codeWith ("", ["c"], []) "@&" =?> "<p><code language=\"c\">@&amp;</code></p>"
- ]
- , testGroup "block code"
- [ "basic" =: codeBlock "@&" =?> "<preformat>@&amp;</preformat>"
- , "lang" =: codeBlockWith ("", ["c"], []) "@&" =?> "<code language=\"c\">@&amp;</code>"
- ]
- , testGroup "images"
- [ "basic" =:
- image "/url" "title" mempty
- =?> "<graphic mimetype=\"image\" mime-subtype=\"\" xlink:href=\"/url\" xlink:title=\"title\" />"
- ]
- , testGroup "inlines"
- [ "Emphasis" =: emph "emphasized"
- =?> "<p><italic>emphasized</italic></p>"
+tests =
+ [ testGroup "inline code"
+ [ "basic" =: code "@&" =?> "<p><monospace>@&amp;</monospace></p>"
+ , "lang" =: codeWith ("", ["c"], []) "@&" =?> "<p><code language=\"c\">@&amp;</code></p>"
+ ]
+ , testGroup "block code"
+ [ "basic" =: codeBlock "@&" =?> "<preformat>@&amp;</preformat>"
+ , "lang" =: codeBlockWith ("", ["c"], []) "@&" =?> "<code language=\"c\">@&amp;</code>"
+ ]
+ , testGroup "images"
+ [ "basic" =:
+ image "/url" "title" mempty
+ =?> "<graphic mimetype=\"image\" mime-subtype=\"\" xlink:href=\"/url\" xlink:title=\"title\" />"
+ ]
+ , testGroup "inlines"
+ [ "Emphasis" =: emph "emphasized"
+ =?> "<p><italic>emphasized</italic></p>"
+
+ , test jatsArticleAuthoring "footnote in articleauthoring tag set"
+ ("test" <> note (para "footnote") =?>
+ unlines [ "<p>test<fn>"
+ , " <p>footnote</p>"
+ , "</fn></p>"
+ ])
+ ]
+ , "bullet list" =: bulletList [ plain $ text "first"
+ , plain $ text "second"
+ , plain $ text "third"
+ ]
+ =?> "<list list-type=\"bullet\">\n\
+ \ <list-item>\n\
+ \ <p>first</p>\n\
+ \ </list-item>\n\
+ \ <list-item>\n\
+ \ <p>second</p>\n\
+ \ </list-item>\n\
+ \ <list-item>\n\
+ \ <p>third</p>\n\
+ \ </list-item>\n\
+ \</list>"
+ , testGroup "definition lists"
+ [ "with internal link" =: definitionList [(link "#go" "" (str "testing"),
+ [plain (text "hi there")])] =?>
+ "<def-list>\n\
+ \ <def-item>\n\
+ \ <term><xref alt=\"testing\" rid=\"go\">testing</xref></term>\n\
+ \ <def>\n\
+ \ <p>hi there</p>\n\
+ \ </def>\n\
+ \ </def-item>\n\
+ \</def-list>"
+ ]
+ , testGroup "math"
+ [ "escape |" =: para (math "\\sigma|_{\\{x\\}}") =?>
+ "<p><inline-formula><alternatives>\n\
+ \<tex-math><![CDATA[\\sigma|_{\\{x\\}}]]></tex-math>\n\
+ \<mml:math display=\"inline\" xmlns:mml=\"http://www.w3.org/1998/Math/MathML\"><mml:mrow><mml:mi>σ</mml:mi><mml:msub><mml:mo stretchy=\"false\" form=\"prefix\">|</mml:mo><mml:mrow><mml:mo stretchy=\"false\" form=\"prefix\">{</mml:mo><mml:mi>x</mml:mi><mml:mo stretchy=\"false\" form=\"postfix\">}</mml:mo></mml:mrow></mml:msub></mml:mrow></mml:math></alternatives></inline-formula></p>"
+ ]
+ , testGroup "headers"
+ [ "unnumbered header" =:
+ headerWith ("foo",["unnumbered"],[]) 1
+ (text "Header 1" <> note (plain $ text "note")) =?>
+ "<sec id=\"foo\">\n\
+ \ <title>Header 1<xref ref-type=\"fn\" rid=\"fn1\">1</xref></title>\n\
+ \</sec>"
+ , "unnumbered sub header" =:
+ headerWith ("foo",["unnumbered"],[]) 1
+ (text "Header")
+ <> headerWith ("foo",["unnumbered"],[]) 2
+ (text "Sub-Header") =?>
+ "<sec id=\"foo\">\n\
+ \ <title>Header</title>\n\
+ \ <sec id=\"foo\">\n\
+ \ <title>Sub-Header</title>\n\
+ \ </sec>\n\
+ \</sec>"
+ , "containing image" =:
+ header 1 (image "imgs/foo.jpg" "" (text "Alt text")) =?>
+ "<sec>\n\
+ \ <title><inline-graphic mimetype=\"image\" mime-subtype=\"jpeg\" xlink:href=\"imgs/foo.jpg\" /></title>\n\
+ \</sec>"
+ ]
+
+ , testGroup "ids"
+ [ "non-ASCII in header ID" =:
+ headerWith ("smørbrød",[],[]) 1 (text "smørbrød") =?>
+ T.unlines [ "<sec id=\"smørbrød\">"
+ , " <title>smørbrød</title>"
+ , "</sec>"
+ ]
+
+ , "disallowed symbol in header id" =:
+ headerWith ("i/o",[],[]) 1 (text "I/O") =?>
+ T.unlines [ "<sec id=\"iU002Fo\">"
+ , " <title>I/O</title>"
+ , "</sec>"
+ ]
+
+ , "disallowed symbols in internal link target" =:
+ link "#foo:bar" "" "baz" =?>
+ "<p><xref alt=\"baz\" rid=\"fooU003Abar\">baz</xref></p>"
- , test jatsArticleAuthoring "footnote in articleauthoring tag set"
- ("test" <> note (para "footnote") =?>
- unlines [ "<p>test<fn>"
- , " <p>footnote</p>"
- , "</fn></p>"
- ])
- ]
- , "bullet list" =: bulletList [ plain $ text "first"
- , plain $ text "second"
- , plain $ text "third"
- ]
- =?> "<list list-type=\"bullet\">\n\
- \ <list-item>\n\
- \ <p>first</p>\n\
- \ </list-item>\n\
- \ <list-item>\n\
- \ <p>second</p>\n\
- \ </list-item>\n\
- \ <list-item>\n\
- \ <p>third</p>\n\
- \ </list-item>\n\
- \</list>"
- , testGroup "definition lists"
- [ "with internal link" =: definitionList [(link "#go" "" (str "testing"),
- [plain (text "hi there")])] =?>
- "<def-list>\n\
- \ <def-item>\n\
- \ <term><xref alt=\"testing\" rid=\"go\">testing</xref></term>\n\
- \ <def>\n\
- \ <p>hi there</p>\n\
- \ </def>\n\
- \ </def-item>\n\
- \</def-list>"
- ]
- , testGroup "math"
- [ "escape |" =: para (math "\\sigma|_{\\{x\\}}") =?>
- "<p><inline-formula><alternatives>\n\
- \<tex-math><![CDATA[\\sigma|_{\\{x\\}}]]></tex-math>\n\
- \<mml:math display=\"inline\" xmlns:mml=\"http://www.w3.org/1998/Math/MathML\"><mml:mrow><mml:mi>σ</mml:mi><mml:msub><mml:mo stretchy=\"false\" form=\"prefix\">|</mml:mo><mml:mrow><mml:mo stretchy=\"false\" form=\"prefix\">{</mml:mo><mml:mi>x</mml:mi><mml:mo stretchy=\"false\" form=\"postfix\">}</mml:mo></mml:mrow></mml:msub></mml:mrow></mml:math></alternatives></inline-formula></p>"
- ]
- , testGroup "headers"
- [ "unnumbered header" =:
- headerWith ("foo",["unnumbered"],[]) 1
- (text "Header 1" <> note (plain $ text "note")) =?>
- "<sec id=\"foo\">\n\
- \ <title>Header 1<xref ref-type=\"fn\" rid=\"fn1\">1</xref></title>\n\
- \</sec>"
- , "unnumbered sub header" =:
- headerWith ("foo",["unnumbered"],[]) 1
- (text "Header")
- <> headerWith ("foo",["unnumbered"],[]) 2
- (text "Sub-Header") =?>
- "<sec id=\"foo\">\n\
- \ <title>Header</title>\n\
- \ <sec id=\"foo\">\n\
- \ <title>Sub-Header</title>\n\
- \ </sec>\n\
- \</sec>"
- , "containing image" =:
- header 1 (image "imgs/foo.jpg" "" (text "Alt text")) =?>
- "<sec>\n\
- \ <title><inline-graphic mimetype=\"image\" mime-subtype=\"jpeg\" xlink:href=\"imgs/foo.jpg\" /></title>\n\
- \</sec>"
- ]
- ]
+ , "code id starting with a number" =:
+ codeWith ("7y",[],[]) "print 5" =?>
+ "<p><monospace id=\"U0037y\">print 5</monospace></p>"
+ ]
+ ]