aboutsummaryrefslogtreecommitdiff
path: root/src/Text/Pandoc/XML.hs
diff options
context:
space:
mode:
Diffstat (limited to 'src/Text/Pandoc/XML.hs')
-rw-r--r--src/Text/Pandoc/XML.hs62
1 files changed, 42 insertions, 20 deletions
diff --git a/src/Text/Pandoc/XML.hs b/src/Text/Pandoc/XML.hs
index 4b71d7b69..79b4768ec 100644
--- a/src/Text/Pandoc/XML.hs
+++ b/src/Text/Pandoc/XML.hs
@@ -1,9 +1,8 @@
{-# LANGUAGE LambdaCase #-}
{-# LANGUAGE OverloadedStrings #-}
-{-# LANGUAGE ViewPatterns #-}
{- |
Module : Text.Pandoc.XML
- Copyright : Copyright (C) 2006-2020 John MacFarlane
+ Copyright : Copyright (C) 2006-2021 John MacFarlane
License : GNU GPL, version 2 or above
Maintainer : John MacFarlane <jgm@berkeley.edu>
@@ -14,6 +13,7 @@ Functions for escaping and formatting XML.
-}
module Text.Pandoc.XML ( escapeCharForXML,
escapeStringForXML,
+ escapeNCName,
inTags,
selfClosingTag,
inTagsSimple,
@@ -25,7 +25,7 @@ module Text.Pandoc.XML ( escapeCharForXML,
html5Attributes,
rdfaAttributes ) where
-import Data.Char (isAscii, isSpace, ord)
+import Data.Char (isAscii, isSpace, ord, isLetter, isDigit)
import Data.Text (Text)
import qualified Data.Text as T
import Text.HTML.TagSoup.Entity (lookupEntity, htmlEntities)
@@ -120,26 +120,48 @@ html5EntityMap = foldr go mempty htmlEntities
where ent' = T.takeWhile (/=';') (T.pack ent)
_ -> entmap
+-- | Converts a string into an NCName, i.e., an XML name without colons.
+-- Disallowed characters are escaped using @ux%x@, where @%x@ is the
+-- hexadecimal unicode identifier of the escaped character.
+escapeNCName :: Text -> Text
+escapeNCName t = case T.uncons t of
+ Nothing -> T.empty
+ Just (c, cs) -> escapeStartChar c <> T.concatMap escapeNCNameChar cs
+ where
+ escapeStartChar :: Char -> Text
+ escapeStartChar c = if isLetter c || c == '_'
+ then T.singleton c
+ else escapeChar c
--- Unescapes XML entities
+ escapeNCNameChar :: Char -> Text
+ escapeNCNameChar c = if isNCNameChar c
+ then T.singleton c
+ else escapeChar c
+
+ isNCNameChar :: Char -> Bool
+ isNCNameChar c = isLetter c || c `elem` ("_-.ยท" :: String) || isDigit c
+ || '\x0300' <= c && c <= '\x036f'
+ || '\x203f' <= c && c <= '\x2040'
+
+ escapeChar :: Char -> Text
+ escapeChar = T.pack . printf "U%04X" . ord
+
+-- | Unescapes XML entities
fromEntities :: Text -> Text
-fromEntities = T.pack . fromEntities'
+fromEntities t
+ = let (x, y) = T.break (== '&') t
+ in if T.null y
+ then t
+ else x <>
+ let (ent, rest) = T.break (\c -> isSpace c || c == ';') y
+ rest' = case T.uncons rest of
+ Just (';',ys) -> ys
+ _ -> rest
+ ent' = T.drop 1 ent <> ";"
+ in case T.pack <$> lookupEntity (T.unpack ent') of
+ Just c -> c <> fromEntities rest'
+ Nothing -> ent <> fromEntities rest
-fromEntities' :: Text -> String
-fromEntities' (T.uncons -> Just ('&', xs)) =
- case lookupEntity $ T.unpack ent' of
- Just c -> c <> fromEntities' rest
- Nothing -> "&" <> fromEntities' xs
- where (ent, rest) = case T.break (\c -> isSpace c || c == ';') xs of
- (zs,T.uncons -> Just (';',ys)) -> (zs,ys)
- (zs, ys) -> (zs,ys)
- ent'
- | Just ys <- T.stripPrefix "#X" ent = "#x" <> ys -- workaround tagsoup bug
- | Just ('#', _) <- T.uncons ent = ent
- | otherwise = ent <> ";"
-fromEntities' t = case T.uncons t of
- Just (x, xs) -> x : fromEntities' xs
- Nothing -> ""
html5Attributes :: Set.Set Text
html5Attributes = Set.fromList