-- path: root/src/Text/Pandoc/Entities.hs (blob 696f943a671bbc317e910030dcbe19b5b3202cb3)
{-
Copyright (C) 2006 John MacFarlane <jgm at berkeley dot edu>

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
-}

{- |
   Module      : Text.Pandoc.Entities
   Copyright   : Copyright (C) 2006 John MacFarlane
   License     : GNU GPL, version 2 or above 

   Maintainer  : John MacFarlane <jgm at berkeley dot edu>
   Stability   : alpha
   Portability : portable

Functions for encoding unicode characters as entity references,
and vice versa.
-}
module Text.Pandoc.Entities (
                     charToEntity,
                     charToNumericalEntity,
                     encodeEntities,
                     decodeEntities,
                     escapeSGMLChar,
                     stringToSGML,
                     characterEntity
                    ) where
import Data.Char ( chr, ord )
import Text.ParserCombinators.Parsec
import Data.Maybe ( fromMaybe )

-- | Returns a string containing an entity reference for the character.
-- The first name in 'entityTable' that maps to the character is used;
-- a character with no named entity gets a numerical reference from
-- 'charToNumericalEntity' instead.
charToEntity :: Char -> String
charToEntity ch =
  -- The comprehension is lazy, so matching on its first element stops
  -- scanning the table at the first hit and needs neither 'length' nor
  -- the partial 'head'.
  case [name | (name, c) <- entityTable, c == ch] of
    (name:_) -> name
    []       -> charToNumericalEntity ch

-- | Builds a decimal numerical entity reference for the character:
-- an ampersand and a hash sign, the character's code point as given by
-- 'ord', and a closing semicolon.
charToNumericalEntity :: Char -> String
charToNumericalEntity ch = concat ["&#", show (ord ch), ";"]

-- | Parse any SGML character entity.  The alternatives are tried in
-- order: a named entity, a hexadecimal numerical reference, then a
-- decimal numerical reference.  The result is the character produced
-- by whichever alternative succeeds.
characterEntity :: GenParser Char st Char
characterEntity =
  choice [namedEntity, hexEntity, decimalEntity] <?> "SGML entity"

-- | Parse a named SGML entity: an ampersand, one or more alphanumeric
-- characters, and a semicolon.  The name is looked up in 'entityTable';
-- a well-formed name that is not in the table yields a question mark.
namedEntity :: GenParser Char st Char
namedEntity = try $ do
  name <- between (char '&') (char ';') (many1 alphaNum)
  return $ fromMaybe '?' (lookup ('&' : name ++ ";") entityTable)
   
-- | Parse SGML hexadecimal entity: an ampersand and a hash sign, an
-- upper- or lower-case x, one or more hexadecimal digits, and a
-- semicolon.  The parser fails without consuming input when the number
-- is larger than the greatest 'Char' code point, so such text is left
-- for other parsers instead of raising a runtime error.
hexEntity :: GenParser Char st Char
hexEntity = try $ do
  _ <- string "&#"
  _ <- oneOf "Xx"
  digits <- many1 (oneOf "0123456789ABCDEFabcdef")
  _ <- char ';'
  -- Read as Integer so a long digit string cannot overflow, and check
  -- the range before converting because 'chr' raises an error for
  -- arguments above the largest code point.  'read' is safe here: the
  -- digits were validated by the parser above.
  let code = read ('0':'x':digits) :: Integer
  if code <= toInteger (ord maxBound)
     then return (chr (fromInteger code))
     else fail "character reference out of range"

-- | Parse SGML decimal entity: an ampersand and a hash sign, one or
-- more decimal digits, and a semicolon.  The parser fails without
-- consuming input when the number is larger than the greatest 'Char'
-- code point, so such text is left for other parsers instead of raising
-- a runtime error.
decimalEntity :: GenParser Char st Char
decimalEntity = try $ do
  _ <- string "&#"
  digits <- many1 digit
  _ <- char ';'
  -- Read as Integer so a long digit string cannot overflow, and check
  -- the range before converting because 'chr' raises an error for
  -- arguments above the largest code point.  'read' is safe here: the
  -- digits were validated by the parser above.
  let code = read digits :: Integer
  if code <= toInteger (ord maxBound)
     then return (chr (fromInteger code))
     else fail "character reference out of range"

-- | Escape one character as needed for SGML.  The ampersand, the angle
-- brackets, and the double quote become entity references; every other
-- character is returned as a one-character string.
escapeSGMLChar :: Char -> String
escapeSGMLChar '&' = "&amp;"
escapeSGMLChar '<' = "&lt;"
escapeSGMLChar '>' = "&gt;"
escapeSGMLChar '"' = "&quot;"
escapeSGMLChar c   = [c]

-- | True if the character needs to be escaped, i.e. it is an ampersand,
-- an angle bracket, or a double quote.
needsEscaping :: Char -> Bool
needsEscaping = flip elem "&<>\""

-- | Escape string as needed for SGML.  Entity references are not
-- preserved: an ampersand that starts an existing reference is escaped
-- like any other ampersand.
encodeEntities :: String -> String
encodeEntities = foldr step ""
  where
    -- Characters flagged by 'needsEscaping' are replaced by their
    -- escaped form; all others are copied through unchanged.
    step c rest
      | needsEscaping c = escapeSGMLChar c ++ rest
      | otherwise       = c : rest

-- | Convert entities in a string to characters.  Any character that does
-- not start an entity recognised by 'characterEntity' is kept as is.
-- The input string is also passed to 'parse' as the source name.
decodeEntities :: String -> String
decodeEntities str = either parseFailure id (parse decoder str str)
  where
    -- Each step decodes one entity or, failing that, takes one character.
    decoder = many (characterEntity <|> anyChar)
    parseFailure err = error $ "\nError: " ++ show err

-- | Escape string for SGML, preserving entity references.  Special
-- characters are escaped with 'escapeSGMLChar' unless they begin
-- something that 'characterEntity' accepts, in which case the text up to
-- and including the next semicolon is copied unchanged.
stringToSGML :: String -> String
stringToSGML str =
  case parse (many chunk) str str of
    Right pieces -> concat pieces
    Left err     -> error $ "\nError: " ++ show err
  where
    chunk     = plain <|> escaped <|> reference
    -- A run of characters that need no escaping is copied verbatim.
    plain     = many1 (satisfy (not . needsEscaping))
    -- A character that does not begin an entity is escaped on its own.
    escaped   = notFollowedBy characterEntity >> fmap escapeSGMLChar anyChar
    -- Reached only when the two parsers above fail: copy everything
    -- through the next semicolon.
    reference = fmap (++ ";") (manyTill anyChar (char ';'))

-- | Association list of named entity references (written with their
-- leading ampersand and trailing semicolon) and the characters they
-- denote, in ascending code-point order.
entityTable :: [(String, Char)]
entityTable =
  -- Characters with special meaning in markup (34-62).
  [ ("&quot;", chr 34)
  , ("&amp;", chr 38)
  , ("&lt;", chr 60)
  , ("&gt;", chr 62)
  -- Latin-1 symbols and accented letters (160-255).
  , ("&nbsp;", chr 160)
  , ("&iexcl;", chr 161)
  , ("&cent;", chr 162)
  , ("&pound;", chr 163)
  , ("&curren;", chr 164)
  , ("&yen;", chr 165)
  , ("&brvbar;", chr 166)
  , ("&sect;", chr 167)
  , ("&uml;", chr 168)
  , ("&copy;", chr 169)
  , ("&ordf;", chr 170)
  , ("&laquo;", chr 171)
  , ("&not;", chr 172)
  , ("&shy;", chr 173)
  , ("&reg;", chr 174)
  , ("&macr;", chr 175)
  , ("&deg;", chr 176)
  , ("&plusmn;", chr 177)
  , ("&sup2;", chr 178)
  , ("&sup3;", chr 179)
  , ("&acute;", chr 180)
  , ("&micro;", chr 181)
  , ("&para;", chr 182)
  , ("&middot;", chr 183)
  , ("&cedil;", chr 184)
  , ("&sup1;", chr 185)
  , ("&ordm;", chr 186)
  , ("&raquo;", chr 187)
  , ("&frac14;", chr 188)
  , ("&frac12;", chr 189)
  , ("&frac34;", chr 190)
  , ("&iquest;", chr 191)
  , ("&Agrave;", chr 192)
  , ("&Aacute;", chr 193)
  , ("&Acirc;", chr 194)
  , ("&Atilde;", chr 195)
  , ("&Auml;", chr 196)
  , ("&Aring;", chr 197)
  , ("&AElig;", chr 198)
  , ("&Ccedil;", chr 199)
  , ("&Egrave;", chr 200)
  , ("&Eacute;", chr 201)
  , ("&Ecirc;", chr 202)
  , ("&Euml;", chr 203)
  , ("&Igrave;", chr 204)
  , ("&Iacute;", chr 205)
  , ("&Icirc;", chr 206)
  , ("&Iuml;", chr 207)
  , ("&ETH;", chr 208)
  , ("&Ntilde;", chr 209)
  , ("&Ograve;", chr 210)
  , ("&Oacute;", chr 211)
  , ("&Ocirc;", chr 212)
  , ("&Otilde;", chr 213)
  , ("&Ouml;", chr 214)
  , ("&times;", chr 215)
  , ("&Oslash;", chr 216)
  , ("&Ugrave;", chr 217)
  , ("&Uacute;", chr 218)
  , ("&Ucirc;", chr 219)
  , ("&Uuml;", chr 220)
  , ("&Yacute;", chr 221)
  , ("&THORN;", chr 222)
  , ("&szlig;", chr 223)
  , ("&agrave;", chr 224)
  , ("&aacute;", chr 225)
  , ("&acirc;", chr 226)
  , ("&atilde;", chr 227)
  , ("&auml;", chr 228)
  , ("&aring;", chr 229)
  , ("&aelig;", chr 230)
  , ("&ccedil;", chr 231)
  , ("&egrave;", chr 232)
  , ("&eacute;", chr 233)
  , ("&ecirc;", chr 234)
  , ("&euml;", chr 235)
  , ("&igrave;", chr 236)
  , ("&iacute;", chr 237)
  , ("&icirc;", chr 238)
  , ("&iuml;", chr 239)
  , ("&eth;", chr 240)
  , ("&ntilde;", chr 241)
  , ("&ograve;", chr 242)
  , ("&oacute;", chr 243)
  , ("&ocirc;", chr 244)
  , ("&otilde;", chr 245)
  , ("&ouml;", chr 246)
  , ("&divide;", chr 247)
  , ("&oslash;", chr 248)
  , ("&ugrave;", chr 249)
  , ("&uacute;", chr 250)
  , ("&ucirc;", chr 251)
  , ("&uuml;", chr 252)
  , ("&yacute;", chr 253)
  , ("&thorn;", chr 254)
  , ("&yuml;", chr 255)
  -- Additional Latin letters and modifier marks (338-732).
  , ("&OElig;", chr 338)
  , ("&oelig;", chr 339)
  , ("&Scaron;", chr 352)
  , ("&scaron;", chr 353)
  , ("&Yuml;", chr 376)
  , ("&fnof;", chr 402)
  , ("&circ;", chr 710)
  , ("&tilde;", chr 732)
  -- Greek letters and symbols (913-982).
  , ("&Alpha;", chr 913)
  , ("&Beta;", chr 914)
  , ("&Gamma;", chr 915)
  , ("&Delta;", chr 916)
  , ("&Epsilon;", chr 917)
  , ("&Zeta;", chr 918)
  , ("&Eta;", chr 919)
  , ("&Theta;", chr 920)
  , ("&Iota;", chr 921)
  , ("&Kappa;", chr 922)
  , ("&Lambda;", chr 923)
  , ("&Mu;", chr 924)
  , ("&Nu;", chr 925)
  , ("&Xi;", chr 926)
  , ("&Omicron;", chr 927)
  , ("&Pi;", chr 928)
  , ("&Rho;", chr 929)
  , ("&Sigma;", chr 931)
  , ("&Tau;", chr 932)
  , ("&Upsilon;", chr 933)
  , ("&Phi;", chr 934)
  , ("&Chi;", chr 935)
  , ("&Psi;", chr 936)
  , ("&Omega;", chr 937)
  , ("&alpha;", chr 945)
  , ("&beta;", chr 946)
  , ("&gamma;", chr 947)
  , ("&delta;", chr 948)
  , ("&epsilon;", chr 949)
  , ("&zeta;", chr 950)
  , ("&eta;", chr 951)
  , ("&theta;", chr 952)
  , ("&iota;", chr 953)
  , ("&kappa;", chr 954)
  , ("&lambda;", chr 955)
  , ("&mu;", chr 956)
  , ("&nu;", chr 957)
  , ("&xi;", chr 958)
  , ("&omicron;", chr 959)
  , ("&pi;", chr 960)
  , ("&rho;", chr 961)
  , ("&sigmaf;", chr 962)
  , ("&sigma;", chr 963)
  , ("&tau;", chr 964)
  , ("&upsilon;", chr 965)
  , ("&phi;", chr 966)
  , ("&chi;", chr 967)
  , ("&psi;", chr 968)
  , ("&omega;", chr 969)
  , ("&thetasym;", chr 977)
  , ("&upsih;", chr 978)
  , ("&piv;", chr 982)
  -- Spaces, directional marks, and punctuation (8194-8260).
  , ("&ensp;", chr 8194)
  , ("&emsp;", chr 8195)
  , ("&thinsp;", chr 8201)
  , ("&zwnj;", chr 8204)
  , ("&zwj;", chr 8205)
  , ("&lrm;", chr 8206)
  , ("&rlm;", chr 8207)
  , ("&ndash;", chr 8211)
  , ("&mdash;", chr 8212)
  , ("&lsquo;", chr 8216)
  , ("&rsquo;", chr 8217)
  , ("&sbquo;", chr 8218)
  , ("&ldquo;", chr 8220)
  , ("&rdquo;", chr 8221)
  , ("&bdquo;", chr 8222)
  , ("&dagger;", chr 8224)
  , ("&Dagger;", chr 8225)
  , ("&bull;", chr 8226)
  , ("&hellip;", chr 8230)
  , ("&permil;", chr 8240)
  , ("&prime;", chr 8242)
  , ("&Prime;", chr 8243)
  , ("&lsaquo;", chr 8249)
  , ("&rsaquo;", chr 8250)
  , ("&oline;", chr 8254)
  , ("&frasl;", chr 8260)
  -- Currency and letterlike symbols (8364-8501).
  , ("&euro;", chr 8364)
  , ("&image;", chr 8465)
  , ("&weierp;", chr 8472)
  , ("&real;", chr 8476)
  , ("&trade;", chr 8482)
  , ("&alefsym;", chr 8501)
  -- Arrows (8592-8660).
  , ("&larr;", chr 8592)
  , ("&uarr;", chr 8593)
  , ("&rarr;", chr 8594)
  , ("&darr;", chr 8595)
  , ("&harr;", chr 8596)
  , ("&crarr;", chr 8629)
  , ("&lArr;", chr 8656)
  , ("&uArr;", chr 8657)
  , ("&rArr;", chr 8658)
  , ("&dArr;", chr 8659)
  , ("&hArr;", chr 8660)
  -- Mathematical operators (8704-8901).
  , ("&forall;", chr 8704)
  , ("&part;", chr 8706)
  , ("&exist;", chr 8707)
  , ("&empty;", chr 8709)
  , ("&nabla;", chr 8711)
  , ("&isin;", chr 8712)
  , ("&notin;", chr 8713)
  , ("&ni;", chr 8715)
  , ("&prod;", chr 8719)
  , ("&sum;", chr 8721)
  , ("&minus;", chr 8722)
  , ("&lowast;", chr 8727)
  , ("&radic;", chr 8730)
  , ("&prop;", chr 8733)
  , ("&infin;", chr 8734)
  , ("&ang;", chr 8736)
  , ("&and;", chr 8743)
  , ("&or;", chr 8744)
  , ("&cap;", chr 8745)
  , ("&cup;", chr 8746)
  , ("&int;", chr 8747)
  , ("&there4;", chr 8756)
  , ("&sim;", chr 8764)
  , ("&cong;", chr 8773)
  , ("&asymp;", chr 8776)
  , ("&ne;", chr 8800)
  , ("&equiv;", chr 8801)
  , ("&le;", chr 8804)
  , ("&ge;", chr 8805)
  , ("&sub;", chr 8834)
  , ("&sup;", chr 8835)
  , ("&nsub;", chr 8836)
  , ("&sube;", chr 8838)
  , ("&supe;", chr 8839)
  , ("&oplus;", chr 8853)
  , ("&otimes;", chr 8855)
  , ("&perp;", chr 8869)
  , ("&sdot;", chr 8901)
  -- Brackets, lozenge, and card suits (8968-9830).
  , ("&lceil;", chr 8968)
  , ("&rceil;", chr 8969)
  , ("&lfloor;", chr 8970)
  , ("&rfloor;", chr 8971)
  , ("&lang;", chr 9001)
  , ("&rang;", chr 9002)
  , ("&loz;", chr 9674)
  , ("&spades;", chr 9824)
  , ("&clubs;", chr 9827)
  , ("&hearts;", chr 9829)
  , ("&diams;", chr 9830)
  ]