1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
|
{-# LANGUAGE NoImplicitPrelude #-}
{- |
Module : Text.Pandoc.XML
Copyright : Copyright (C) 2006-2019 John MacFarlane
License : GNU GPL, version 2 or above
Maintainer : John MacFarlane <jgm@berkeley.edu>
Stability : alpha
Portability : portable
Functions for escaping and formatting XML.
-}
module Text.Pandoc.XML ( escapeCharForXML,
escapeStringForXML,
inTags,
selfClosingTag,
inTagsSimple,
inTagsIndented,
toEntities,
toHtml5Entities,
fromEntities ) where
import Prelude
import Data.Char (isAscii, isSpace, ord)
import Data.Text (Text)
import qualified Data.Text as T
import Text.HTML.TagSoup.Entity (lookupEntity, htmlEntities)
import Text.Pandoc.Pretty
import qualified Data.Map as M
-- | Escape one character as needed for XML.
escapeCharForXML :: Char -> String
escapeCharForXML x = case x of
'&' -> "&"
'<' -> "<"
'>' -> ">"
'"' -> """
c -> [c]
-- | Escape string as needed for XML. Entity references are not preserved.
escapeStringForXML :: String -> String
escapeStringForXML = concatMap escapeCharForXML . filter isLegalXMLChar
where isLegalXMLChar c = c == '\t' || c == '\n' || c == '\r' ||
(c >= '\x20' && c <= '\xD7FF') ||
(c >= '\xE000' && c <= '\xFFFD') ||
(c >= '\x10000' && c <= '\x10FFFF')
-- see https://www.w3.org/TR/xml/#charsets
-- | Escape newline characters as
escapeNls :: String -> String
escapeNls (x:xs)
| x == '\n' = " " ++ escapeNls xs
| otherwise = x : escapeNls xs
escapeNls [] = []
-- | Return a text object with a string of formatted XML attributes.
attributeList :: [(String, String)] -> Doc
attributeList = hcat . map
(\(a, b) -> text (' ' : escapeStringForXML a ++ "=\"" ++
escapeNls (escapeStringForXML b) ++ "\""))
-- | Put the supplied contents between start and end tags of tagType,
-- with specified attributes and (if specified) indentation.
inTags:: Bool -> String -> [(String, String)] -> Doc -> Doc
inTags isIndented tagType attribs contents =
let openTag = char '<' <> text tagType <> attributeList attribs <>
char '>'
closeTag = text "</" <> text tagType <> char '>'
in if isIndented
then openTag $$ nest 2 contents $$ closeTag
else openTag <> contents <> closeTag
-- | Return a self-closing tag of tagType with specified attributes
selfClosingTag :: String -> [(String, String)] -> Doc
selfClosingTag tagType attribs =
char '<' <> text tagType <> attributeList attribs <> text " />"
-- | Put the supplied contents between start and end tags of tagType.
inTagsSimple :: String -> Doc -> Doc
inTagsSimple tagType = inTags False tagType []
-- | Put the supplied contents in indented block btw start and end tags.
inTagsIndented :: String -> Doc -> Doc
inTagsIndented tagType = inTags True tagType []
-- | Escape all non-ascii characters using numerical entities.
toEntities :: Text -> Text
toEntities = T.concatMap go
where go c | isAscii c = T.singleton c
| otherwise = T.pack ("&#" ++ show (ord c) ++ ";")
-- | Escape all non-ascii characters using HTML5 entities, falling
-- back to numerical entities.
toHtml5Entities :: Text -> Text
toHtml5Entities = T.concatMap go
where go c | isAscii c = T.singleton c
| otherwise =
case M.lookup c html5EntityMap of
Just t -> T.singleton '&' <> t <> T.singleton ';'
Nothing -> T.pack ("&#" ++ show (ord c) ++ ";")
html5EntityMap :: M.Map Char Text
html5EntityMap = foldr go mempty htmlEntities
where go (ent, s) entmap =
case s of
[c] -> M.insertWith
(\new old -> if T.length new > T.length old
then old
else new) c ent' entmap
where ent' = T.takeWhile (/=';') (T.pack ent)
_ -> entmap
-- Unescapes XML entities
fromEntities :: String -> String
fromEntities ('&':xs) =
case lookupEntity ent' of
Just c -> c ++ fromEntities rest
Nothing -> '&' : fromEntities xs
where (ent, rest) = case break (\c -> isSpace c || c == ';') xs of
(zs,';':ys) -> (zs,ys)
(zs, ys) -> (zs,ys)
ent' = case ent of
'#':'X':ys -> '#':'x':ys -- workaround tagsoup bug
'#':_ -> ent
_ -> ent ++ ";"
fromEntities (x:xs) = x : fromEntities xs
fromEntities [] = []
|