diff options
-rw-r--r-- | src/Text/Pandoc/Writers/JATS.hs | 27 | ||||
-rw-r--r-- | src/Text/Pandoc/Writers/JATS/References.hs | 5 | ||||
-rw-r--r-- | src/Text/Pandoc/Writers/JATS/Table.hs | 4 | ||||
-rw-r--r-- | src/Text/Pandoc/XML.hs | 30 | ||||
-rw-r--r-- | test/Tests/Writers/JATS.hs | 205 |
5 files changed, 162 insertions, 109 deletions
diff --git a/src/Text/Pandoc/Writers/JATS.hs b/src/Text/Pandoc/Writers/JATS.hs index a9369db7a..26f94cb03 100644 --- a/src/Text/Pandoc/Writers/JATS.hs +++ b/src/Text/Pandoc/Writers/JATS.hs @@ -239,7 +239,7 @@ languageFor classes = codeAttr :: Attr -> (Text, [(Text, Text)]) codeAttr (ident,classes,kvs) = (lang, attr) where - attr = [("id",ident) | not (T.null ident)] ++ + attr = [("id", escapeNCName ident) | not (T.null ident)] ++ [("language",lang) | not (T.null lang)] ++ [(k,v) | (k,v) <- kvs, k `elem` ["code-type", "code-version", "executable", @@ -251,7 +251,8 @@ codeAttr (ident,classes,kvs) = (lang, attr) blockToJATS :: PandocMonad m => WriterOptions -> Block -> JATS m (Doc Text) blockToJATS _ Null = return empty blockToJATS opts (Div (id',"section":_,kvs) (Header _lvl _ ils : xs)) = do - let idAttr = [("id", writerIdentifierPrefix opts <> id') | not (T.null id')] + let idAttr = [ ("id", writerIdentifierPrefix opts <> escapeNCName id') + | not (T.null id')] let otherAttrs = ["sec-type", "specific-use"] let attribs = idAttr ++ [(k,v) | (k,v) <- kvs, k `elem` otherAttrs] title' <- inlinesToJATS opts ils @@ -260,7 +261,7 @@ blockToJATS opts (Div (id',"section":_,kvs) (Header _lvl _ ils : xs)) = do inTagsSimple "title" title' $$ contents -- Bibliography reference: blockToJATS opts (Div (ident,_,_) [Para lst]) | "ref-" `T.isPrefixOf` ident = - inTags True "ref" [("id", ident)] . + inTags True "ref" [("id", escapeNCName ident)] . inTagsSimple "mixed-citation" <$> inlinesToJATS opts lst blockToJATS opts (Div ("refs",_,_) xs) = do @@ -271,14 +272,14 @@ blockToJATS opts (Div ("refs",_,_) xs) = do return $ inTagsIndented "ref-list" contents blockToJATS opts (Div (ident,[cls],kvs) bs) | cls `elem` ["fig", "caption", "table-wrap"] = do contents <- blocksToJATS opts bs - let attr = [("id", ident) | not (T.null ident)] ++ + let attr = [("id", escapeNCName ident) | not (T.null ident)] ++ [("xml:lang",l) | ("lang",l) <- kvs] ++ [(k,v) | (k,v) <- kvs, k `elem` ["specific-use", "content-type", "orientation", "position"]] return $ inTags True cls attr contents blockToJATS opts (Div (ident,_,kvs) bs) = do contents <- blocksToJATS opts bs - let attr = [("id", ident) | not (T.null ident)] ++ + let attr = [("id", escapeNCName ident) | not (T.null ident)] ++ [("xml:lang",l) | ("lang",l) <- kvs] ++ [(k,v) | (k,v) <- kvs, k `elem` ["specific-use", "content-type", "orientation", "position"]] @@ -296,7 +297,7 @@ blockToJATS opts (Para [Image (ident,_,kvs) txt let capt = if null txt then empty else inTagsSimple "caption" $ inTagsSimple "p" alt - let attr = [("id", ident) | not (T.null ident)] ++ + let attr = [("id", escapeNCName ident) | not (T.null ident)] ++ [(k,v) | (k,v) <- kvs, k `elem` ["fig-type", "orientation", "position", "specific-use"]] let graphicattr = [("mimetype",maintype), @@ -307,7 +308,7 @@ blockToJATS opts (Para [Image (ident,_,kvs) txt capt $$ selfClosingTag "graphic" graphicattr blockToJATS _ (Para [Image (ident,_,kvs) _ (src, tit)]) = do let (maintype, subtype) = imageMimeType src kvs - let attr = [("id", ident) | not (T.null ident)] ++ + let attr = [("id", escapeNCName ident) | not (T.null ident)] ++ [("mimetype", maintype), ("mime-subtype", subtype), ("xlink:href", src)] ++ @@ -434,7 +435,7 @@ inlineToJATS opts (Note contents) = do let notenum = case notes of (n, _):_ -> n + 1 [] -> 1 - thenote <- inTags True "fn" [("id","fn" <> tshow notenum)] + thenote <- inTags True "fn" [("id", "fn" <> tshow notenum)] <$> wrappedBlocksToJATS (not . isPara) opts (walk demoteHeaderAndRefs contents) modify $ \st -> st{ jatsNotes = (notenum, thenote) : notes } @@ -447,7 +448,7 @@ inlineToJATS opts (Cite _ lst) = inlineToJATS opts (Span ("",_,[]) ils) = inlinesToJATS opts ils inlineToJATS opts (Span (ident,_,kvs) ils) = do contents <- inlinesToJATS opts ils - let attr = [("id",ident) | not (T.null ident)] ++ + let attr = [("id", escapeNCName ident) | not (T.null ident)] ++ [("xml:lang",l) | ("lang",l) <- kvs] ++ [(k,v) | (k,v) <- kvs , k `elem` ["content-type", "rationale", @@ -488,9 +489,9 @@ inlineToJATS _ (Link _attr [Str t] (T.stripPrefix "mailto:" -> Just email, _)) return $ inTagsSimple "email" $ literal (escapeStringForXML email) inlineToJATS opts (Link (ident,_,kvs) txt (T.uncons -> Just ('#', src), _)) = do let attr = mconcat - [ [("id", ident) | not (T.null ident)] + [ [("id", escapeNCName ident) | not (T.null ident)] , [("alt", stringify txt) | not (null txt)] - , [("rid", src)] + , [("rid", escapeNCName src)] , [(k,v) | (k,v) <- kvs, k `elem` ["ref-type", "specific-use"]] , [("ref-type", "bibr") | "ref-" `T.isPrefixOf` src] ] @@ -500,7 +501,7 @@ inlineToJATS opts (Link (ident,_,kvs) txt (T.uncons -> Just ('#', src), _)) = do contents <- inlinesToJATS opts txt return $ inTags False "xref" attr contents inlineToJATS opts (Link (ident,_,kvs) txt (src, tit)) = do - let attr = [("id", ident) | not (T.null ident)] ++ + let attr = [("id", escapeNCName ident) | not (T.null ident)] ++ [("ext-link-type", "uri"), ("xlink:href", src)] ++ [("xlink:title", tit) | not (T.null tit)] ++ @@ -518,7 +519,7 @@ inlineToJATS _ (Image (ident,_,kvs) _ (src, tit)) = do let subtype = fromMaybe "" $ lookup "mime-subtype" kvs `mplus` (T.drop 1 . T.dropWhile (/='/') <$> mbMT) - let attr = [("id", ident) | not (T.null ident)] ++ + let attr = [("id", escapeNCName ident) | not (T.null ident)] ++ [("mimetype", maintype), ("mime-subtype", subtype), ("xlink:href", src)] ++ diff --git a/src/Text/Pandoc/Writers/JATS/References.hs b/src/Text/Pandoc/Writers/JATS/References.hs index 903144128..5b19fd034 100644 --- a/src/Text/Pandoc/Writers/JATS/References.hs +++ b/src/Text/Pandoc/Writers/JATS/References.hs @@ -29,7 +29,7 @@ import Text.Pandoc.Builder (Inlines) import Text.Pandoc.Options (WriterOptions) import Text.Pandoc.Shared (tshow) import Text.Pandoc.Writers.JATS.Types -import Text.Pandoc.XML (escapeStringForXML, inTags) +import Text.Pandoc.XML (escapeNCName, escapeStringForXML, inTags) import qualified Data.Text as T referencesToJATS :: PandocMonad m @@ -46,7 +46,8 @@ referenceToJATS :: PandocMonad m referenceToJATS _opts ref = do let refType = referenceType ref let pubType = [("publication-type", refType) | not (T.null refType)] - let wrap = inTags True "ref" [("id", "ref-" <> unItemId (referenceId ref))] + let ident = escapeNCName $ "ref-" <> unItemId (referenceId ref) + let wrap = inTags True "ref" [("id", ident)] . inTags True "element-citation" pubType return . wrap . vcat $ [ authors diff --git a/src/Text/Pandoc/Writers/JATS/Table.hs b/src/Text/Pandoc/Writers/JATS/Table.hs index 465480f59..2e34900d2 100644 --- a/src/Text/Pandoc/Writers/JATS/Table.hs +++ b/src/Text/Pandoc/Writers/JATS/Table.hs @@ -24,7 +24,7 @@ import Text.Pandoc.Definition import Text.Pandoc.Options (WriterOptions) import Text.Pandoc.Shared (tshow) import Text.Pandoc.Writers.JATS.Types -import Text.Pandoc.XML (inTags, inTagsIndented, selfClosingTag) +import Text.Pandoc.XML (escapeNCName, inTags, inTagsIndented, selfClosingTag) import qualified Data.Text as T import qualified Text.Pandoc.Writers.AnnotatedTable as Ann @@ -216,7 +216,7 @@ cellToJats opts celltype (Ann.Cell (colspec :| _) _colNum cell) = toAttribs :: Attr -> [Text] -> [(Text, Text)] toAttribs (ident, _classes, kvs) knownAttribs = - (if T.null ident then id else (("id", ident) :)) $ + (if T.null ident then id else (("id", escapeNCName ident) :)) $ filter ((`elem` knownAttribs) . fst) kvs tableCellToJats :: PandocMonad m diff --git a/src/Text/Pandoc/XML.hs b/src/Text/Pandoc/XML.hs index 6dbbce1d2..79b4768ec 100644 --- a/src/Text/Pandoc/XML.hs +++ b/src/Text/Pandoc/XML.hs @@ -13,6 +13,7 @@ Functions for escaping and formatting XML. -} module Text.Pandoc.XML ( escapeCharForXML, escapeStringForXML, + escapeNCName, inTags, selfClosingTag, inTagsSimple, @@ -24,7 +25,7 @@ module Text.Pandoc.XML ( escapeCharForXML, html5Attributes, rdfaAttributes ) where -import Data.Char (isAscii, isSpace, ord) +import Data.Char (isAscii, isSpace, ord, isLetter, isDigit) import Data.Text (Text) import qualified Data.Text as T import Text.HTML.TagSoup.Entity (lookupEntity, htmlEntities) @@ -119,8 +120,33 @@ html5EntityMap = foldr go mempty htmlEntities where ent' = T.takeWhile (/=';') (T.pack ent) _ -> entmap +-- | Converts a string into an NCName, i.e., an XML name without colons. +-- Disallowed characters are escaped using @ux%x@, where @%x@ is the +-- hexadecimal unicode identifier of the escaped character. +escapeNCName :: Text -> Text +escapeNCName t = case T.uncons t of + Nothing -> T.empty + Just (c, cs) -> escapeStartChar c <> T.concatMap escapeNCNameChar cs + where + escapeStartChar :: Char -> Text + escapeStartChar c = if isLetter c || c == '_' + then T.singleton c + else escapeChar c --- Unescapes XML entities + escapeNCNameChar :: Char -> Text + escapeNCNameChar c = if isNCNameChar c + then T.singleton c + else escapeChar c + + isNCNameChar :: Char -> Bool + isNCNameChar c = isLetter c || c `elem` ("_-.·" :: String) || isDigit c + || '\x0300' <= c && c <= '\x036f' + || '\x203f' <= c && c <= '\x2040' + + escapeChar :: Char -> Text + escapeChar = T.pack . printf "U%04X" . ord + +-- | Unescapes XML entities fromEntities :: Text -> Text fromEntities t = let (x, y) = T.break (== '&') t diff --git a/test/Tests/Writers/JATS.hs b/test/Tests/Writers/JATS.hs index 2f501c890..23c1686dc 100644 --- a/test/Tests/Writers/JATS.hs +++ b/test/Tests/Writers/JATS.hs @@ -1,21 +1,21 @@ {-# LANGUAGE OverloadedStrings #-} module Tests.Writers.JATS (tests) where -import Data.Text (unpack) +import Data.Text (Text) import Test.Tasty import Tests.Helpers import Text.Pandoc import Text.Pandoc.Arbitrary () import Text.Pandoc.Builder +import qualified Data.Text as T -jats :: (ToPandoc a) => a -> String -jats = unpack - . purely (writeJATS def{ writerWrapText = WrapNone }) - . toPandoc +jats :: (ToPandoc a) => a -> Text +jats = purely (writeJATS def{ writerWrapText = WrapNone }) + . toPandoc -jatsArticleAuthoring :: (ToPandoc a) => a -> String -jatsArticleAuthoring = unpack - . purely (writeJatsArticleAuthoring def{ writerWrapText = WrapNone }) +jatsArticleAuthoring :: (ToPandoc a) => a -> Text +jatsArticleAuthoring = + purely (writeJatsArticleAuthoring def{ writerWrapText = WrapNone }) . toPandoc {- @@ -32,89 +32,114 @@ which is in turn shorthand for infix 4 =: (=:) :: (ToString a, ToPandoc a) - => String -> (a, String) -> TestTree + => String -> (a, Text) -> TestTree (=:) = test jats tests :: [TestTree] -tests = [ testGroup "inline code" - [ "basic" =: code "@&" =?> "<p><monospace>@&</monospace></p>" - , "lang" =: codeWith ("", ["c"], []) "@&" =?> "<p><code language=\"c\">@&</code></p>" - ] - , testGroup "block code" - [ "basic" =: codeBlock "@&" =?> "<preformat>@&</preformat>" - , "lang" =: codeBlockWith ("", ["c"], []) "@&" =?> "<code language=\"c\">@&</code>" - ] - , testGroup "images" - [ "basic" =: - image "/url" "title" mempty - =?> "<graphic mimetype=\"image\" mime-subtype=\"\" xlink:href=\"/url\" xlink:title=\"title\" />" - ] - , testGroup "inlines" - [ "Emphasis" =: emph "emphasized" - =?> "<p><italic>emphasized</italic></p>" +tests = + [ testGroup "inline code" + [ "basic" =: code "@&" =?> "<p><monospace>@&</monospace></p>" + , "lang" =: codeWith ("", ["c"], []) "@&" =?> "<p><code language=\"c\">@&</code></p>" + ] + , testGroup "block code" + [ "basic" =: codeBlock "@&" =?> "<preformat>@&</preformat>" + , "lang" =: codeBlockWith ("", ["c"], []) "@&" =?> "<code language=\"c\">@&</code>" + ] + , testGroup "images" + [ "basic" =: + image "/url" "title" mempty + =?> "<graphic mimetype=\"image\" mime-subtype=\"\" xlink:href=\"/url\" xlink:title=\"title\" />" + ] + , testGroup "inlines" + [ "Emphasis" =: emph "emphasized" + =?> "<p><italic>emphasized</italic></p>" + + , test jatsArticleAuthoring "footnote in articleauthoring tag set" + ("test" <> note (para "footnote") =?> + unlines [ "<p>test<fn>" + , " <p>footnote</p>" + , "</fn></p>" + ]) + ] + , "bullet list" =: bulletList [ plain $ text "first" + , plain $ text "second" + , plain $ text "third" + ] + =?> "<list list-type=\"bullet\">\n\ + \ <list-item>\n\ + \ <p>first</p>\n\ + \ </list-item>\n\ + \ <list-item>\n\ + \ <p>second</p>\n\ + \ </list-item>\n\ + \ <list-item>\n\ + \ <p>third</p>\n\ + \ </list-item>\n\ + \</list>" + , testGroup "definition lists" + [ "with internal link" =: definitionList [(link "#go" "" (str "testing"), + [plain (text "hi there")])] =?> + "<def-list>\n\ + \ <def-item>\n\ + \ <term><xref alt=\"testing\" rid=\"go\">testing</xref></term>\n\ + \ <def>\n\ + \ <p>hi there</p>\n\ + \ </def>\n\ + \ </def-item>\n\ + \</def-list>" + ] + , testGroup "math" + [ "escape |" =: para (math "\\sigma|_{\\{x\\}}") =?> + "<p><inline-formula><alternatives>\n\ + \<tex-math><![CDATA[\\sigma|_{\\{x\\}}]]></tex-math>\n\ + \<mml:math display=\"inline\" xmlns:mml=\"http://www.w3.org/1998/Math/MathML\"><mml:mrow><mml:mi>σ</mml:mi><mml:msub><mml:mo stretchy=\"false\" form=\"prefix\">|</mml:mo><mml:mrow><mml:mo stretchy=\"false\" form=\"prefix\">{</mml:mo><mml:mi>x</mml:mi><mml:mo stretchy=\"false\" form=\"postfix\">}</mml:mo></mml:mrow></mml:msub></mml:mrow></mml:math></alternatives></inline-formula></p>" + ] + , testGroup "headers" + [ "unnumbered header" =: + headerWith ("foo",["unnumbered"],[]) 1 + (text "Header 1" <> note (plain $ text "note")) =?> + "<sec id=\"foo\">\n\ + \ <title>Header 1<xref ref-type=\"fn\" rid=\"fn1\">1</xref></title>\n\ + \</sec>" + , "unnumbered sub header" =: + headerWith ("foo",["unnumbered"],[]) 1 + (text "Header") + <> headerWith ("foo",["unnumbered"],[]) 2 + (text "Sub-Header") =?> + "<sec id=\"foo\">\n\ + \ <title>Header</title>\n\ + \ <sec id=\"foo\">\n\ + \ <title>Sub-Header</title>\n\ + \ </sec>\n\ + \</sec>" + , "containing image" =: + header 1 (image "imgs/foo.jpg" "" (text "Alt text")) =?> + "<sec>\n\ + \ <title><inline-graphic mimetype=\"image\" mime-subtype=\"jpeg\" xlink:href=\"imgs/foo.jpg\" /></title>\n\ + \</sec>" + ] + + , testGroup "ids" + [ "non-ASCII in header ID" =: + headerWith ("smørbrød",[],[]) 1 (text "smørbrød") =?> + T.unlines [ "<sec id=\"smørbrød\">" + , " <title>smørbrød</title>" + , "</sec>" + ] + + , "disallowed symbol in header id" =: + headerWith ("i/o",[],[]) 1 (text "I/O") =?> + T.unlines [ "<sec id=\"iU002Fo\">" + , " <title>I/O</title>" + , "</sec>" + ] + + , "disallowed symbols in internal link target" =: + link "#foo:bar" "" "baz" =?> + "<p><xref alt=\"baz\" rid=\"fooU003Abar\">baz</xref></p>" - , test jatsArticleAuthoring "footnote in articleauthoring tag set" - ("test" <> note (para "footnote") =?> - unlines [ "<p>test<fn>" - , " <p>footnote</p>" - , "</fn></p>" - ]) - ] - , "bullet list" =: bulletList [ plain $ text "first" - , plain $ text "second" - , plain $ text "third" - ] - =?> "<list list-type=\"bullet\">\n\ - \ <list-item>\n\ - \ <p>first</p>\n\ - \ </list-item>\n\ - \ <list-item>\n\ - \ <p>second</p>\n\ - \ </list-item>\n\ - \ <list-item>\n\ - \ <p>third</p>\n\ - \ </list-item>\n\ - \</list>" - , testGroup "definition lists" - [ "with internal link" =: definitionList [(link "#go" "" (str "testing"), - [plain (text "hi there")])] =?> - "<def-list>\n\ - \ <def-item>\n\ - \ <term><xref alt=\"testing\" rid=\"go\">testing</xref></term>\n\ - \ <def>\n\ - \ <p>hi there</p>\n\ - \ </def>\n\ - \ </def-item>\n\ - \</def-list>" - ] - , testGroup "math" - [ "escape |" =: para (math "\\sigma|_{\\{x\\}}") =?> - "<p><inline-formula><alternatives>\n\ - \<tex-math><![CDATA[\\sigma|_{\\{x\\}}]]></tex-math>\n\ - \<mml:math display=\"inline\" xmlns:mml=\"http://www.w3.org/1998/Math/MathML\"><mml:mrow><mml:mi>σ</mml:mi><mml:msub><mml:mo stretchy=\"false\" form=\"prefix\">|</mml:mo><mml:mrow><mml:mo stretchy=\"false\" form=\"prefix\">{</mml:mo><mml:mi>x</mml:mi><mml:mo stretchy=\"false\" form=\"postfix\">}</mml:mo></mml:mrow></mml:msub></mml:mrow></mml:math></alternatives></inline-formula></p>" - ] - , testGroup "headers" - [ "unnumbered header" =: - headerWith ("foo",["unnumbered"],[]) 1 - (text "Header 1" <> note (plain $ text "note")) =?> - "<sec id=\"foo\">\n\ - \ <title>Header 1<xref ref-type=\"fn\" rid=\"fn1\">1</xref></title>\n\ - \</sec>" - , "unnumbered sub header" =: - headerWith ("foo",["unnumbered"],[]) 1 - (text "Header") - <> headerWith ("foo",["unnumbered"],[]) 2 - (text "Sub-Header") =?> - "<sec id=\"foo\">\n\ - \ <title>Header</title>\n\ - \ <sec id=\"foo\">\n\ - \ <title>Sub-Header</title>\n\ - \ </sec>\n\ - \</sec>" - , "containing image" =: - header 1 (image "imgs/foo.jpg" "" (text "Alt text")) =?> - "<sec>\n\ - \ <title><inline-graphic mimetype=\"image\" mime-subtype=\"jpeg\" xlink:href=\"imgs/foo.jpg\" /></title>\n\ - \</sec>" - ] - ] + , "code id starting with a number" =: + codeWith ("7y",[],[]) "print 5" =?> + "<p><monospace id=\"U0037y\">print 5</monospace></p>" + ] + ] |