aboutsummaryrefslogtreecommitdiff
path: root/src/Text/Pandoc/XML.hs
diff options
context:
space:
mode:
authorAlbert Krewinkel <albert@zeitkraut.de>2021-04-05 21:45:52 +0200
committerAlbert Krewinkel <albert@zeitkraut.de>2021-04-05 21:55:54 +0200
commit038261ea529bc4516d7cee501db70020938dbf2b (patch)
tree75d8d53a14b19df2cf3951ae7e669c1f0b9ea520 /src/Text/Pandoc/XML.hs
parent65a9d3a8786c23f79de9dcdf56ab7efb789726ff (diff)
downloadpandoc-038261ea529bc4516d7cee501db70020938dbf2b.tar.gz
JATS writer: escape disallows chars in identifiers
XML identifiers must start with an underscore or letter, and can contain only a limited set of punctuation characters. Any IDs not adhering to these rules are rewritten by writing the offending characters as Uxxxx, where `xxxx` is the character's hex code.
Diffstat (limited to 'src/Text/Pandoc/XML.hs')
-rw-r--r--src/Text/Pandoc/XML.hs30
1 files changed, 28 insertions, 2 deletions
diff --git a/src/Text/Pandoc/XML.hs b/src/Text/Pandoc/XML.hs
index 6dbbce1d2..79b4768ec 100644
--- a/src/Text/Pandoc/XML.hs
+++ b/src/Text/Pandoc/XML.hs
@@ -13,6 +13,7 @@ Functions for escaping and formatting XML.
-}
module Text.Pandoc.XML ( escapeCharForXML,
escapeStringForXML,
+ escapeNCName,
inTags,
selfClosingTag,
inTagsSimple,
@@ -24,7 +25,7 @@ module Text.Pandoc.XML ( escapeCharForXML,
html5Attributes,
rdfaAttributes ) where
-import Data.Char (isAscii, isSpace, ord)
+import Data.Char (isAscii, isSpace, ord, isLetter, isDigit)
import Data.Text (Text)
import qualified Data.Text as T
import Text.HTML.TagSoup.Entity (lookupEntity, htmlEntities)
@@ -119,8 +120,33 @@ html5EntityMap = foldr go mempty htmlEntities
where ent' = T.takeWhile (/=';') (T.pack ent)
_ -> entmap
+-- | Converts a string into an NCName, i.e., an XML name without colons.
+-- Disallowed characters are escaped using @ux%x@, where @%x@ is the
+-- hexadecimal unicode identifier of the escaped character.
+escapeNCName :: Text -> Text
+escapeNCName t = case T.uncons t of
+ Nothing -> T.empty
+ Just (c, cs) -> escapeStartChar c <> T.concatMap escapeNCNameChar cs
+ where
+ escapeStartChar :: Char -> Text
+ escapeStartChar c = if isLetter c || c == '_'
+ then T.singleton c
+ else escapeChar c
--- Unescapes XML entities
+ escapeNCNameChar :: Char -> Text
+ escapeNCNameChar c = if isNCNameChar c
+ then T.singleton c
+ else escapeChar c
+
+ isNCNameChar :: Char -> Bool
+ isNCNameChar c = isLetter c || c `elem` ("_-.ยท" :: String) || isDigit c
+ || '\x0300' <= c && c <= '\x036f'
+ || '\x203f' <= c && c <= '\x2040'
+
+ escapeChar :: Char -> Text
+ escapeChar = T.pack . printf "U%04X" . ord
+
+-- | Unescapes XML entities
fromEntities :: Text -> Text
fromEntities t
= let (x, y) = T.break (== '&') t