aboutsummaryrefslogtreecommitdiff
path: root/src/Text/Pandoc/Entities.hs
diff options
context:
space:
mode:
author: fiddlosopher <fiddlosopher@788f1e2b-df1e-0410-8736-df70ead52e1b> 2007-01-02 00:40:12 +0000
committer: fiddlosopher <fiddlosopher@788f1e2b-df1e-0410-8736-df70ead52e1b> 2007-01-02 00:40:12 +0000
commit: d4454536f021fc51b2a7211749560d97a1975de1 (patch)
tree: 1eb346633c25777799c639fe4e4a76ffb939ddf5 /src/Text/Pandoc/Entities.hs
parent: 4e5745134a8867d68a4678285272869d1c0ebce3 (diff)
download: pandoc-d4454536f021fc51b2a7211749560d97a1975de1.tar.gz
Change 'HtmlEntities' module to 'Entities'. Adjusted calling
code accordingly. git-svn-id: https://pandoc.googlecode.com/svn/trunk@395 788f1e2b-df1e-0410-8736-df70ead52e1b
Diffstat (limited to 'src/Text/Pandoc/Entities.hs')
-rw-r--r--src/Text/Pandoc/Entities.hs338
1 files changed, 338 insertions, 0 deletions
diff --git a/src/Text/Pandoc/Entities.hs b/src/Text/Pandoc/Entities.hs
new file mode 100644
index 000000000..c5dfbf134
--- /dev/null
+++ b/src/Text/Pandoc/Entities.hs
@@ -0,0 +1,338 @@
+{-
+Copyright (C) 2006 John MacFarlane <jgm at berkeley dot edu>
+
+This program is free software; you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation; either version 2 of the License, or
+(at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+-}
+
+{- |
+ Module : Text.Pandoc.Entities
+ Copyright : Copyright (C) 2006 John MacFarlane
+ License : GNU GPL, version 2 or above
+
+ Maintainer : John MacFarlane <jgm at berkeley dot edu>
+ Stability : alpha
+ Portability : portable
+
+Functions for encoding unicode characters as entity references,
+and vice versa.
+-}
+module Text.Pandoc.Entities (
+ entityToChar,
+ charToEntity,
+ decodeEntities,
+ encodeEntities
+ ) where
+import Data.Char ( chr, ord )
+import Text.Regex ( mkRegex, matchRegexAll )
+import Maybe ( fromMaybe )
+
+-- Regexes for entity references. Neither pattern is anchored, so each one
+-- matches an entity reference anywhere inside the subject string.
+-- Decimal numeric reference such as "&#169;"; the single capture group
+-- holds the digits. Hexadecimal references ("&#xA9;") are not matched.
+decimalCodedEntity = mkRegex "&#([0-9]+);"
+-- Any entity reference: either a decimal numeric reference ("&#169;") or an
+-- alphanumeric name between '&' and ';' ("&copy;").
+characterEntity = mkRegex "&#[0-9]+;|&[A-Za-z0-9]+;"
+
+-- | Decode the entity references in a string to unicode characters.
+-- The string is scanned left to right for substrings matching
+-- 'characterEntity'; each match that 'entityToChar' can resolve is replaced
+-- by the corresponding character, and any match it cannot resolve is left
+-- in the output unchanged.
+decodeEntities :: String -> String
+decodeEntities input = maybe input expand (matchRegexAll characterEntity input)
+  where
+    -- Keep the text before the match, substitute the match itself, then
+    -- continue with whatever follows it.
+    expand (prefix, ref, suffix, _) =
+      prefix ++ maybe ref (: []) (entityToChar ref) ++ decodeEntities suffix
+
+-- | Replace every character whose code point is 127 or above with an entity
+-- reference (as produced by 'charToEntity'); characters below 127 are
+-- copied through unchanged.
+encodeEntities :: String -> String
+encodeEntities = concatMap encodeChar
+  where
+    encodeChar c
+      | ord c < 127 = [c]
+      | otherwise   = charToEntity c
+
+-- | If the string is a valid entity reference, returns @Just@ the character,
+-- otherwise @Nothing@.
+--
+-- Named references are looked up in 'entityTable'. Decimal numeric
+-- references (@&#NNN;@) are accepted only when the whole string is the
+-- reference and the number is a valid code point; anything else, including
+-- out-of-range numbers such as @&#1114112;@, yields @Nothing@ instead of
+-- raising an error from 'chr'.
+entityToChar :: String -> Maybe Char
+entityToChar entity =
+  case lookup entity entityTable of
+    Just ch -> Just ch
+    Nothing ->
+      case matchRegexAll decimalCodedEntity entity of
+        -- The regex is unanchored, so require empty text before and after
+        -- the match: the argument must be exactly one reference.
+        Just ("", _, "", [digits]) -> decimalToChar digits
+        _                          -> Nothing
+  where
+    -- 'digits' is guaranteed by the regex to be a non-empty run of [0-9], so
+    -- 'read' cannot fail; reading as Integer avoids silent Int overflow
+    -- before the range check.
+    decimalToChar digits
+      | code <= toInteger (ord maxBound) = Just (chr (fromInteger code))
+      | otherwise                        = Nothing
+      where code = read digits :: Integer
+
+-- | Returns a string containing an entity reference for the character.
+-- The first name listed for the character in 'entityTable' is used when
+-- there is one; otherwise a decimal numeric reference (@&#NNN;@) built from
+-- the character's code point is returned.
+charToEntity :: Char -> String
+charToEntity char =
+  -- The comprehension is lazy, so matching on its head stops scanning the
+  -- table at the first hit and needs no partial 'head'.
+  case [name | (name, ch) <- entityTable, ch == char] of
+    (name:_) -> name
+    []       -> "&#" ++ show (ord char) ++ ";"
+
+-- | Association list pairing each named entity reference (written in full,
+-- with the leading @&@ and trailing @;@) with the character it denotes,
+-- given by code point via 'chr'. Entries are listed in ascending code-point
+-- order. The table is searched linearly: by name in 'entityToChar' and by
+-- character in 'charToEntity'.
+entityTable :: [(String, Char)]
+entityTable = [
+ ("&quot;", chr 34),
+ ("&amp;", chr 38),
+ ("&lt;", chr 60),
+ ("&gt;", chr 62),
+ ("&nbsp;", chr 160),
+ ("&iexcl;", chr 161),
+ ("&cent;", chr 162),
+ ("&pound;", chr 163),
+ ("&curren;", chr 164),
+ ("&yen;", chr 165),
+ ("&brvbar;", chr 166),
+ ("&sect;", chr 167),
+ ("&uml;", chr 168),
+ ("&copy;", chr 169),
+ ("&ordf;", chr 170),
+ ("&laquo;", chr 171),
+ ("&not;", chr 172),
+ ("&shy;", chr 173),
+ ("&reg;", chr 174),
+ ("&macr;", chr 175),
+ ("&deg;", chr 176),
+ ("&plusmn;", chr 177),
+ ("&sup2;", chr 178),
+ ("&sup3;", chr 179),
+ ("&acute;", chr 180),
+ ("&micro;", chr 181),
+ ("&para;", chr 182),
+ ("&middot;", chr 183),
+ ("&cedil;", chr 184),
+ ("&sup1;", chr 185),
+ ("&ordm;", chr 186),
+ ("&raquo;", chr 187),
+ ("&frac14;", chr 188),
+ ("&frac12;", chr 189),
+ ("&frac34;", chr 190),
+ ("&iquest;", chr 191),
+ ("&Agrave;", chr 192),
+ ("&Aacute;", chr 193),
+ ("&Acirc;", chr 194),
+ ("&Atilde;", chr 195),
+ ("&Auml;", chr 196),
+ ("&Aring;", chr 197),
+ ("&AElig;", chr 198),
+ ("&Ccedil;", chr 199),
+ ("&Egrave;", chr 200),
+ ("&Eacute;", chr 201),
+ ("&Ecirc;", chr 202),
+ ("&Euml;", chr 203),
+ ("&Igrave;", chr 204),
+ ("&Iacute;", chr 205),
+ ("&Icirc;", chr 206),
+ ("&Iuml;", chr 207),
+ ("&ETH;", chr 208),
+ ("&Ntilde;", chr 209),
+ ("&Ograve;", chr 210),
+ ("&Oacute;", chr 211),
+ ("&Ocirc;", chr 212),
+ ("&Otilde;", chr 213),
+ ("&Ouml;", chr 214),
+ ("&times;", chr 215),
+ ("&Oslash;", chr 216),
+ ("&Ugrave;", chr 217),
+ ("&Uacute;", chr 218),
+ ("&Ucirc;", chr 219),
+ ("&Uuml;", chr 220),
+ ("&Yacute;", chr 221),
+ ("&THORN;", chr 222),
+ ("&szlig;", chr 223),
+ ("&agrave;", chr 224),
+ ("&aacute;", chr 225),
+ ("&acirc;", chr 226),
+ ("&atilde;", chr 227),
+ ("&auml;", chr 228),
+ ("&aring;", chr 229),
+ ("&aelig;", chr 230),
+ ("&ccedil;", chr 231),
+ ("&egrave;", chr 232),
+ ("&eacute;", chr 233),
+ ("&ecirc;", chr 234),
+ ("&euml;", chr 235),
+ ("&igrave;", chr 236),
+ ("&iacute;", chr 237),
+ ("&icirc;", chr 238),
+ ("&iuml;", chr 239),
+ ("&eth;", chr 240),
+ ("&ntilde;", chr 241),
+ ("&ograve;", chr 242),
+ ("&oacute;", chr 243),
+ ("&ocirc;", chr 244),
+ ("&otilde;", chr 245),
+ ("&ouml;", chr 246),
+ ("&divide;", chr 247),
+ ("&oslash;", chr 248),
+ ("&ugrave;", chr 249),
+ ("&uacute;", chr 250),
+ ("&ucirc;", chr 251),
+ ("&uuml;", chr 252),
+ ("&yacute;", chr 253),
+ ("&thorn;", chr 254),
+ ("&yuml;", chr 255),
+ ("&OElig;", chr 338),
+ ("&oelig;", chr 339),
+ ("&Scaron;", chr 352),
+ ("&scaron;", chr 353),
+ ("&Yuml;", chr 376),
+ ("&fnof;", chr 402),
+ ("&circ;", chr 710),
+ ("&tilde;", chr 732),
+ ("&Alpha;", chr 913),
+ ("&Beta;", chr 914),
+ ("&Gamma;", chr 915),
+ ("&Delta;", chr 916),
+ ("&Epsilon;", chr 917),
+ ("&Zeta;", chr 918),
+ ("&Eta;", chr 919),
+ ("&Theta;", chr 920),
+ ("&Iota;", chr 921),
+ ("&Kappa;", chr 922),
+ ("&Lambda;", chr 923),
+ ("&Mu;", chr 924),
+ ("&Nu;", chr 925),
+ ("&Xi;", chr 926),
+ ("&Omicron;", chr 927),
+ ("&Pi;", chr 928),
+ ("&Rho;", chr 929),
+ ("&Sigma;", chr 931),
+ ("&Tau;", chr 932),
+ ("&Upsilon;", chr 933),
+ ("&Phi;", chr 934),
+ ("&Chi;", chr 935),
+ ("&Psi;", chr 936),
+ ("&Omega;", chr 937),
+ ("&alpha;", chr 945),
+ ("&beta;", chr 946),
+ ("&gamma;", chr 947),
+ ("&delta;", chr 948),
+ ("&epsilon;", chr 949),
+ ("&zeta;", chr 950),
+ ("&eta;", chr 951),
+ ("&theta;", chr 952),
+ ("&iota;", chr 953),
+ ("&kappa;", chr 954),
+ ("&lambda;", chr 955),
+ ("&mu;", chr 956),
+ ("&nu;", chr 957),
+ ("&xi;", chr 958),
+ ("&omicron;", chr 959),
+ ("&pi;", chr 960),
+ ("&rho;", chr 961),
+ ("&sigmaf;", chr 962),
+ ("&sigma;", chr 963),
+ ("&tau;", chr 964),
+ ("&upsilon;", chr 965),
+ ("&phi;", chr 966),
+ ("&chi;", chr 967),
+ ("&psi;", chr 968),
+ ("&omega;", chr 969),
+ ("&thetasym;", chr 977),
+ ("&upsih;", chr 978),
+ ("&piv;", chr 982),
+ ("&ensp;", chr 8194),
+ ("&emsp;", chr 8195),
+ ("&thinsp;", chr 8201),
+ ("&zwnj;", chr 8204),
+ ("&zwj;", chr 8205),
+ ("&lrm;", chr 8206),
+ ("&rlm;", chr 8207),
+ ("&ndash;", chr 8211),
+ ("&mdash;", chr 8212),
+ ("&lsquo;", chr 8216),
+ ("&rsquo;", chr 8217),
+ ("&sbquo;", chr 8218),
+ ("&ldquo;", chr 8220),
+ ("&rdquo;", chr 8221),
+ ("&bdquo;", chr 8222),
+ ("&dagger;", chr 8224),
+ ("&Dagger;", chr 8225),
+ ("&bull;", chr 8226),
+ ("&hellip;", chr 8230),
+ ("&permil;", chr 8240),
+ ("&prime;", chr 8242),
+ ("&Prime;", chr 8243),
+ ("&lsaquo;", chr 8249),
+ ("&rsaquo;", chr 8250),
+ ("&oline;", chr 8254),
+ ("&frasl;", chr 8260),
+ ("&euro;", chr 8364),
+ ("&image;", chr 8465),
+ ("&weierp;", chr 8472),
+ ("&real;", chr 8476),
+ ("&trade;", chr 8482),
+ ("&alefsym;", chr 8501),
+ ("&larr;", chr 8592),
+ ("&uarr;", chr 8593),
+ ("&rarr;", chr 8594),
+ ("&darr;", chr 8595),
+ ("&harr;", chr 8596),
+ ("&crarr;", chr 8629),
+ ("&lArr;", chr 8656),
+ ("&uArr;", chr 8657),
+ ("&rArr;", chr 8658),
+ ("&dArr;", chr 8659),
+ ("&hArr;", chr 8660),
+ ("&forall;", chr 8704),
+ ("&part;", chr 8706),
+ ("&exist;", chr 8707),
+ ("&empty;", chr 8709),
+ ("&nabla;", chr 8711),
+ ("&isin;", chr 8712),
+ ("&notin;", chr 8713),
+ ("&ni;", chr 8715),
+ ("&prod;", chr 8719),
+ ("&sum;", chr 8721),
+ ("&minus;", chr 8722),
+ ("&lowast;", chr 8727),
+ ("&radic;", chr 8730),
+ ("&prop;", chr 8733),
+ ("&infin;", chr 8734),
+ ("&ang;", chr 8736),
+ ("&and;", chr 8743),
+ ("&or;", chr 8744),
+ ("&cap;", chr 8745),
+ ("&cup;", chr 8746),
+ ("&int;", chr 8747),
+ ("&there4;", chr 8756),
+ ("&sim;", chr 8764),
+ ("&cong;", chr 8773),
+ ("&asymp;", chr 8776),
+ ("&ne;", chr 8800),
+ ("&equiv;", chr 8801),
+ ("&le;", chr 8804),
+ ("&ge;", chr 8805),
+ ("&sub;", chr 8834),
+ ("&sup;", chr 8835),
+ ("&nsub;", chr 8836),
+ ("&sube;", chr 8838),
+ ("&supe;", chr 8839),
+ ("&oplus;", chr 8853),
+ ("&otimes;", chr 8855),
+ ("&perp;", chr 8869),
+ ("&sdot;", chr 8901),
+ ("&lceil;", chr 8968),
+ ("&rceil;", chr 8969),
+ ("&lfloor;", chr 8970),
+ ("&rfloor;", chr 8971),
+ ("&lang;", chr 9001),
+ ("&rang;", chr 9002),
+ ("&loz;", chr 9674),
+ ("&spades;", chr 9824),
+ ("&clubs;", chr 9827),
+ ("&hearts;", chr 9829),
+ ("&diams;", chr 9830)
+ ]