aboutsummaryrefslogtreecommitdiff
path: root/src/Text/Pandoc/XML.hs
blob: 21f6d4d46c1ed254682bfc43ba9beae7a262d3dd (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
{-# LANGUAGE NoImplicitPrelude #-}
{-# LANGUAGE OverloadedStrings #-}
{-# LANGUAGE ViewPatterns      #-}
{- |
   Module      : Text.Pandoc.XML
   Copyright   : Copyright (C) 2006-2019 John MacFarlane
   License     : GNU GPL, version 2 or above

   Maintainer  : John MacFarlane <jgm@berkeley.edu>
   Stability   : alpha
   Portability : portable

Functions for escaping and formatting XML.
-}
module Text.Pandoc.XML ( escapeCharForXML,
                         escapeStringForXML,
                         inTags,
                         selfClosingTag,
                         inTagsSimple,
                         inTagsIndented,
                         toEntities,
                         toHtml5Entities,
                         fromEntities ) where

import Prelude
import Data.Char (isAscii, isSpace, ord)
import Data.Text (Text)
import qualified Data.Text as T
import Text.HTML.TagSoup.Entity (lookupEntity, htmlEntities)
import Text.DocLayout
import Text.Printf (printf)
import qualified Data.Map as M
import Data.String

-- | Escape one character as needed for XML.
escapeCharForXML :: Char -> Text
escapeCharForXML x = case x of
                       '&' -> "&amp;"
                       '<' -> "&lt;"
                       '>' -> "&gt;"
                       '"' -> "&quot;"
                       c   -> T.singleton c

-- | Escape string as needed for XML.  Entity references are not preserved.
escapeStringForXML :: Text -> Text
escapeStringForXML = T.concatMap escapeCharForXML . T.filter isLegalXMLChar
  where isLegalXMLChar c = c == '\t' || c == '\n' || c == '\r' ||
                           (c >= '\x20' && c <= '\xD7FF') ||
                           (c >= '\xE000' && c <= '\xFFFD') ||
                           (c >= '\x10000' && c <= '\x10FFFF')
  -- see https://www.w3.org/TR/xml/#charsets

-- | Escape newline characters as &#10;
escapeNls :: Text -> Text
escapeNls = T.concatMap $ \x -> case x of
  '\n' -> "&#10;"
  c    -> T.singleton c

-- | Return a text object with a string of formatted XML attributes.
attributeList :: (HasChars a, IsString a) => [(Text, Text)] -> Doc a
attributeList = hcat . map
  (\(a, b) -> text (T.unpack $ " " <> escapeStringForXML a <> "=\"" <>
  escapeNls (escapeStringForXML b) <> "\""))

-- | Put the supplied contents between start and end tags of tagType,
--   with specified attributes and (if specified) indentation.
inTags :: (HasChars a, IsString a)
      => Bool -> Text -> [(Text, Text)] -> Doc a -> Doc a
inTags isIndented tagType attribs contents =
  let openTag = char '<' <> text (T.unpack tagType) <> attributeList attribs <>
                char '>'
      closeTag  = text "</" <> text (T.unpack tagType) <> char '>'
  in  if isIndented
         then openTag $$ nest 2 contents $$ closeTag
         else openTag <> contents <> closeTag

-- | Return a self-closing tag of tagType with specified attributes
selfClosingTag :: (HasChars a, IsString a)
               => Text -> [(Text, Text)] -> Doc a
selfClosingTag tagType attribs =
  char '<' <> text (T.unpack tagType) <> attributeList attribs <> text " />"

-- | Put the supplied contents between start and end tags of tagType.
inTagsSimple :: (HasChars a, IsString a)
             => Text -> Doc a -> Doc a
inTagsSimple tagType = inTags False tagType []

-- | Put the supplied contents in indented block btw start and end tags.
inTagsIndented :: (HasChars a, IsString a)
               => Text -> Doc a -> Doc a
inTagsIndented tagType = inTags True tagType []

-- | Escape all non-ascii characters using numerical entities.
toEntities :: Text -> Text
toEntities = T.concatMap go
  where go c | isAscii c = T.singleton c
             | otherwise = T.pack (printf "&#x%X;" (ord c))

-- | Escape all non-ascii characters using HTML5 entities, falling
-- back to numerical entities.
toHtml5Entities :: Text -> Text
toHtml5Entities = T.concatMap go
  where go c | isAscii c = T.singleton c
             | otherwise =
                 case M.lookup c html5EntityMap of
                   Just t  -> T.singleton '&' <> t <> T.singleton ';'
                   Nothing -> T.pack ("&#" ++ show (ord c) ++ ";")

html5EntityMap :: M.Map Char Text
html5EntityMap = foldr go mempty htmlEntities
  where go (ent, s) entmap =
         case s of
           [c] -> M.insertWith
                   (\new old -> if T.length new > T.length old
                                   then old
                                   else new) c ent' entmap
             where ent' = T.takeWhile (/=';') (T.pack ent)
           _   -> entmap


-- Unescapes XML entities
fromEntities :: Text -> Text
fromEntities = T.pack . fromEntities'

fromEntities' :: Text -> String
fromEntities' (T.uncons -> Just ('&', xs)) =
  case lookupEntity $ T.unpack ent' of
        Just c  -> c <> fromEntities' rest
        Nothing -> "&" <> fromEntities' xs
    where (ent, rest) = case T.break (\c -> isSpace c || c == ';') xs of
                          (zs,T.uncons -> Just (';',ys)) -> (zs,ys)
                          (zs, ys) -> (zs,ys)
          ent'
            | Just ys <- T.stripPrefix "#X" ent = "#x" <> ys  -- workaround tagsoup bug
            | Just ('#', _) <- T.uncons ent     = ent
            | otherwise                         = ent <> ";"
fromEntities' t = case T.uncons t of
  Just (x, xs) -> x : fromEntities' xs
  Nothing      -> ""