diff options
Diffstat (limited to 'src/Text/Pandoc/CharacterReferences.hs')
-rw-r--r-- | src/Text/Pandoc/CharacterReferences.hs | 335 |
1 files changed, 335 insertions, 0 deletions
{-
Copyright (C) 2006-7 John MacFarlane <jgm@berkeley.edu>

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-}

{- |
   Module      : Text.Pandoc.CharacterReferences
   Copyright   : Copyright (C) 2006-7 John MacFarlane
   License     : GNU GPL, version 2 or above

   Maintainer  : John MacFarlane <jgm@berkeley.edu>
   Stability   : alpha
   Portability : portable

Functions for parsing character references.
-}
module Text.Pandoc.CharacterReferences (
                     characterReference,
                     decodeCharacterReferences,
                    ) where
import Data.Char ( chr )
import Text.ParserCombinators.Parsec
import qualified Data.Map as Map

-- | Parse one character reference and return the character it denotes.
-- The named form is tried first, then the hexadecimal numeric form, then
-- the decimal numeric form. Every alternative backtracks on failure, so
-- a failed parse consumes no input.
characterReference :: GenParser Char st Char
characterReference = characterEntity <|>
                     hexadecimalCharacterReference <|>
                     decimalCharacterReference <?>
                     "character entity"

-- | Parse a named character entity: an ampersand, one or more
-- alphanumeric characters, and a closing semicolon.
--
-- The parser fails (without consuming input) when the name is not in
-- 'entityTable', so that unknown names are left for the caller to treat
-- as literal text instead of being silently replaced.
characterEntity :: GenParser Char st Char
characterEntity = try $ do
  _ <- char '&'
  name <- many1 alphaNum
  _ <- char ';'
  -- The table is keyed on the full reference, delimiters included.
  let entity = "&" ++ name ++ ";"
  case Map.lookup entity entityTable of
    Just c  -> return c
    Nothing -> fail ("unknown character entity: " ++ entity)

-- | Parse a hexadecimal numeric character reference.
-- The reference consists of an ampersand, a hash sign, the letter x in
-- either case, one or more hexadecimal digits, and a semicolon. The
-- parser fails without consuming input when the value does not fit in
-- a 'Char', instead of crashing inside 'chr'.
hexadecimalCharacterReference :: GenParser Char st Char
hexadecimalCharacterReference = try $ do
  _ <- string "&#"
  _ <- oneOf "Xx"
  digits <- many1 hexDigit
  _ <- char ';'
  -- 'read' cannot fail here: the parser only accepted hex digits.
  codePointToChar (read ("0x" ++ digits))

-- | Parse a decimal numeric character reference: an ampersand, a hash
-- sign, one or more decimal digits, and a semicolon. The parser fails
-- without consuming input when the value does not fit in a 'Char'.
decimalCharacterReference :: GenParser Char st Char
decimalCharacterReference = try $ do
  _ <- string "&#"
  digits <- many1 digit
  _ <- char ';'
  -- 'read' cannot fail here: the parser only accepted decimal digits.
  codePointToChar (read digits)

-- | Turn a non-negative code point into a 'Char'. The value is an
-- 'Integer' so that arbitrarily long digit strings cannot wrap around,
-- and it is checked against the largest 'Char' because 'chr' raises an
-- error for out-of-range arguments.
codePointToChar :: Integer -> GenParser Char st Char
codePointToChar n
  | n <= toInteger (fromEnum (maxBound :: Char)) = return (chr (fromInteger n))
  | otherwise = fail ("character reference out of range: " ++ show n)

-- | Convert the character references in a string to characters.
-- Any character that 'characterReference' does not accept as the start
-- of a reference is copied to the result unchanged.
decodeCharacterReferences :: String -> String
decodeCharacterReferences str =
  -- The second argument of 'parse' is only a source name for error
  -- messages, so a short fixed label is used rather than the input.
  case parse (many (characterReference <|> anyChar)) "input" str of
    Left err     -> error $ "\nError: " ++ show err
    Right result -> result

-- | Lookup table from a complete named reference (leading ampersand and
-- trailing semicolon included) to the character it stands for.
entityTable :: Map.Map String Char
entityTable = Map.fromList entityTableList

-- | The named character references of HTML 4 paired with their
-- characters. The keys are spelled with the ampersand and semicolon
-- because lookups are made with the full reference text.
entityTableList :: [(String, Char)]
entityTableList = [
  ("&quot;", chr 34),
  ("&amp;", chr 38),
  ("&lt;", chr 60),
  ("&gt;", chr 62),
  ("&nbsp;", chr 160),
  ("&iexcl;", chr 161),
  ("&cent;", chr 162),
  ("&pound;", chr 163),
  ("&curren;", chr 164),
  ("&yen;", chr 165),
  ("&brvbar;", chr 166),
  ("&sect;", chr 167),
  ("&uml;", chr 168),
  ("&copy;", chr 169),
  ("&ordf;", chr 170),
  ("&laquo;", chr 171),
  ("&not;", chr 172),
  ("&shy;", chr 173),
  ("&reg;", chr 174),
  ("&macr;", chr 175),
  ("&deg;", chr 176),
  ("&plusmn;", chr 177),
  ("&sup2;", chr 178),
  ("&sup3;", chr 179),
  ("&acute;", chr 180),
  ("&micro;", chr 181),
  ("&para;", chr 182),
  ("&middot;", chr 183),
  ("&cedil;", chr 184),
  ("&sup1;", chr 185),
  ("&ordm;", chr 186),
  ("&raquo;", chr 187),
  ("&frac14;", chr 188),
  ("&frac12;", chr 189),
  ("&frac34;", chr 190),
  ("&iquest;", chr 191),
  ("&Agrave;", chr 192),
  ("&Aacute;", chr 193),
  ("&Acirc;", chr 194),
  ("&Atilde;", chr 195),
  ("&Auml;", chr 196),
  ("&Aring;", chr 197),
  ("&AElig;", chr 198),
  ("&Ccedil;", chr 199),
  ("&Egrave;", chr 200),
  ("&Eacute;", chr 201),
  ("&Ecirc;", chr 202),
  ("&Euml;", chr 203),
  ("&Igrave;", chr 204),
  ("&Iacute;", chr 205),
  ("&Icirc;", chr 206),
  ("&Iuml;", chr 207),
  ("&ETH;", chr 208),
  ("&Ntilde;", chr 209),
  ("&Ograve;", chr 210),
  ("&Oacute;", chr 211),
  ("&Ocirc;", chr 212),
  ("&Otilde;", chr 213),
  ("&Ouml;", chr 214),
  ("&times;", chr 215),
  ("&Oslash;", chr 216),
  ("&Ugrave;", chr 217),
  ("&Uacute;", chr 218),
  ("&Ucirc;", chr 219),
  ("&Uuml;", chr 220),
  ("&Yacute;", chr 221),
  ("&THORN;", chr 222),
  ("&szlig;", chr 223),
  ("&agrave;", chr 224),
  ("&aacute;", chr 225),
  ("&acirc;", chr 226),
  ("&atilde;", chr 227),
  ("&auml;", chr 228),
  ("&aring;", chr 229),
  ("&aelig;", chr 230),
  ("&ccedil;", chr 231),
  ("&egrave;", chr 232),
  ("&eacute;", chr 233),
  ("&ecirc;", chr 234),
  ("&euml;", chr 235),
  ("&igrave;", chr 236),
  ("&iacute;", chr 237),
  ("&icirc;", chr 238),
  ("&iuml;", chr 239),
  ("&eth;", chr 240),
  ("&ntilde;", chr 241),
  ("&ograve;", chr 242),
  ("&oacute;", chr 243),
  ("&ocirc;", chr 244),
  ("&otilde;", chr 245),
  ("&ouml;", chr 246),
  ("&divide;", chr 247),
  ("&oslash;", chr 248),
  ("&ugrave;", chr 249),
  ("&uacute;", chr 250),
  ("&ucirc;", chr 251),
  ("&uuml;", chr 252),
  ("&yacute;", chr 253),
  ("&thorn;", chr 254),
  ("&yuml;", chr 255),
  ("&OElig;", chr 338),
  ("&oelig;", chr 339),
  ("&Scaron;", chr 352),
  ("&scaron;", chr 353),
  ("&Yuml;", chr 376),
  ("&fnof;", chr 402),
  ("&circ;", chr 710),
  ("&tilde;", chr 732),
  ("&Alpha;", chr 913),
  ("&Beta;", chr 914),
  ("&Gamma;", chr 915),
  ("&Delta;", chr 916),
  ("&Epsilon;", chr 917),
  ("&Zeta;", chr 918),
  ("&Eta;", chr 919),
  ("&Theta;", chr 920),
  ("&Iota;", chr 921),
  ("&Kappa;", chr 922),
  ("&Lambda;", chr 923),
  ("&Mu;", chr 924),
  ("&Nu;", chr 925),
  ("&Xi;", chr 926),
  ("&Omicron;", chr 927),
  ("&Pi;", chr 928),
  ("&Rho;", chr 929),
  ("&Sigma;", chr 931),
  ("&Tau;", chr 932),
  ("&Upsilon;", chr 933),
  ("&Phi;", chr 934),
  ("&Chi;", chr 935),
  ("&Psi;", chr 936),
  ("&Omega;", chr 937),
  ("&alpha;", chr 945),
  ("&beta;", chr 946),
  ("&gamma;", chr 947),
  ("&delta;", chr 948),
  ("&epsilon;", chr 949),
  ("&zeta;", chr 950),
  ("&eta;", chr 951),
  ("&theta;", chr 952),
  ("&iota;", chr 953),
  ("&kappa;", chr 954),
  ("&lambda;", chr 955),
  ("&mu;", chr 956),
  ("&nu;", chr 957),
  ("&xi;", chr 958),
  ("&omicron;", chr 959),
  ("&pi;", chr 960),
  ("&rho;", chr 961),
  ("&sigmaf;", chr 962),
  ("&sigma;", chr 963),
  ("&tau;", chr 964),
  ("&upsilon;", chr 965),
  ("&phi;", chr 966),
  ("&chi;", chr 967),
  ("&psi;", chr 968),
  ("&omega;", chr 969),
  ("&thetasym;", chr 977),
  ("&upsih;", chr 978),
  ("&piv;", chr 982),
  ("&ensp;", chr 8194),
  ("&emsp;", chr 8195),
  ("&thinsp;", chr 8201),
  ("&zwnj;", chr 8204),
  ("&zwj;", chr 8205),
  ("&lrm;", chr 8206),
  ("&rlm;", chr 8207),
  ("&ndash;", chr 8211),
  ("&mdash;", chr 8212),
  ("&lsquo;", chr 8216),
  ("&rsquo;", chr 8217),
  ("&sbquo;", chr 8218),
  ("&ldquo;", chr 8220),
  ("&rdquo;", chr 8221),
  ("&bdquo;", chr 8222),
  ("&dagger;", chr 8224),
  ("&Dagger;", chr 8225),
  ("&bull;", chr 8226),
  ("&hellip;", chr 8230),
  ("&permil;", chr 8240),
  ("&prime;", chr 8242),
  ("&Prime;", chr 8243),
  ("&lsaquo;", chr 8249),
  ("&rsaquo;", chr 8250),
  ("&oline;", chr 8254),
  ("&frasl;", chr 8260),
  ("&euro;", chr 8364),
  ("&image;", chr 8465),
  ("&weierp;", chr 8472),
  ("&real;", chr 8476),
  ("&trade;", chr 8482),
  ("&alefsym;", chr 8501),
  ("&larr;", chr 8592),
  ("&uarr;", chr 8593),
  ("&rarr;", chr 8594),
  ("&darr;", chr 8595),
  ("&harr;", chr 8596),
  ("&crarr;", chr 8629),
  ("&lArr;", chr 8656),
  ("&uArr;", chr 8657),
  ("&rArr;", chr 8658),
  ("&dArr;", chr 8659),
  ("&hArr;", chr 8660),
  ("&forall;", chr 8704),
  ("&part;", chr 8706),
  ("&exist;", chr 8707),
  ("&empty;", chr 8709),
  ("&nabla;", chr 8711),
  ("&isin;", chr 8712),
  ("&notin;", chr 8713),
  ("&ni;", chr 8715),
  ("&prod;", chr 8719),
  ("&sum;", chr 8721),
  ("&minus;", chr 8722),
  ("&lowast;", chr 8727),
  ("&radic;", chr 8730),
  ("&prop;", chr 8733),
  ("&infin;", chr 8734),
  ("&ang;", chr 8736),
  ("&and;", chr 8743),
  ("&or;", chr 8744),
  ("&cap;", chr 8745),
  ("&cup;", chr 8746),
  ("&int;", chr 8747),
  ("&there4;", chr 8756),
  ("&sim;", chr 8764),
  ("&cong;", chr 8773),
  ("&asymp;", chr 8776),
  ("&ne;", chr 8800),
  ("&equiv;", chr 8801),
  ("&le;", chr 8804),
  ("&ge;", chr 8805),
  ("&sub;", chr 8834),
  ("&sup;", chr 8835),
  ("&nsub;", chr 8836),
  ("&sube;", chr 8838),
  ("&supe;", chr 8839),
  ("&oplus;", chr 8853),
  ("&otimes;", chr 8855),
  ("&perp;", chr 8869),
  ("&sdot;", chr 8901),
  ("&lceil;", chr 8968),
  ("&rceil;", chr 8969),
  ("&lfloor;", chr 8970),
  ("&rfloor;", chr 8971),
  ("&lang;", chr 9001),
  ("&rang;", chr 9002),
  ("&loz;", chr 9674),
  ("&spades;", chr 9824),
  ("&clubs;", chr 9827),
  ("&hearts;", chr 9829),
  ("&diams;", chr 9830)
  ]