From d4454536f021fc51b2a7211749560d97a1975de1 Mon Sep 17 00:00:00 2001 From: fiddlosopher Date: Tue, 2 Jan 2007 00:40:12 +0000 Subject: Change 'HtmlEntities' module to 'Entities'. Adjusted calling code accordingly. git-svn-id: https://pandoc.googlecode.com/svn/trunk@395 788f1e2b-df1e-0410-8736-df70ead52e1b --- src/Text/Pandoc/Entities.hs | 338 ++++++++++++++++++++++++++++++++++++ src/Text/Pandoc/HtmlEntities.hs | 338 ------------------------------------ src/Text/Pandoc/Readers/HTML.hs | 4 +- src/Text/Pandoc/Readers/Markdown.hs | 2 +- src/Text/Pandoc/Shared.hs | 2 +- src/Text/Pandoc/Writers/Docbook.hs | 2 +- 6 files changed, 343 insertions(+), 343 deletions(-) create mode 100644 src/Text/Pandoc/Entities.hs delete mode 100644 src/Text/Pandoc/HtmlEntities.hs (limited to 'src/Text') diff --git a/src/Text/Pandoc/Entities.hs b/src/Text/Pandoc/Entities.hs new file mode 100644 index 000000000..c5dfbf134 --- /dev/null +++ b/src/Text/Pandoc/Entities.hs @@ -0,0 +1,338 @@ +{- +Copyright (C) 2006 John MacFarlane + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.Entities + Copyright : Copyright (C) 2006 John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : John MacFarlane + Stability : alpha + Portability : portable + +Functions for encoding unicode characters as entity references, +and vice versa. 
+-}
+module Text.Pandoc.Entities (
+                     entityToChar,
+                     charToEntity,
+                     decodeEntities,
+                     encodeEntities
+                    ) where
+import Data.Char ( chr, ord )
+import Text.Regex ( Regex, mkRegex, matchRegexAll )
+import Maybe ( fromMaybe )
+
+-- | Matches a decimal numeric character reference (ampersand, hash sign,
+-- one or more digits, semicolon); the single capture group holds the digits.
+decimalCodedEntity :: Regex
+decimalCodedEntity = mkRegex "&#([0-9]+);"
+
+-- | Matches either a decimal numeric reference or an alphanumeric named
+-- reference (ampersand, name, semicolon).
+characterEntity :: Regex
+characterEntity = mkRegex "&#[0-9]+;|&[A-Za-z0-9]+;"
+
+-- | Return a string with all entity references decoded to unicode characters
+-- where possible.  References that 'entityToChar' cannot decode (unknown
+-- names, code points outside the 'Char' range) are copied to the output
+-- unchanged.
+decodeEntities :: String -> String
+decodeEntities str =
+  case matchRegexAll characterEntity str of
+    Nothing -> str
+    Just (before, match, rest, _) ->
+      before ++ maybe match (: []) (entityToChar match) ++ decodeEntities rest
+
+-- | Returns a string with characters replaced with entity references where
+-- possible.  Characters whose code point is below 127 are passed through
+-- untouched; every other character is rendered with 'charToEntity'.
+encodeEntities :: String -> String
+encodeEntities = concatMap encodeChar
+  where encodeChar c
+          | ord c < 127 = [c]
+          | otherwise   = charToEntity c
+
+-- | If the string is a valid entity reference, returns @Just@ the character,
+-- otherwise @Nothing@.  Named references are looked up in 'entityTable';
+-- decimal numeric references are accepted only when the reference is the
+-- whole string and its value is a valid 'Char' code point.
+entityToChar :: String -> Maybe Char
+entityToChar entity =
+  case lookup entity entityTable of
+    Just ch -> Just ch
+    Nothing ->
+      case matchRegexAll decimalCodedEntity entity of
+        -- Empty prefix and suffix: the whole string must be the reference.
+        Just ("", _, "", [digits]) -> codePointToChar (read digits)
+        _                          -> Nothing
+  where
+    -- 'read' cannot fail here because the regex admits digits only.  Reading
+    -- as 'Integer' keeps long digit runs from wrapping around before the
+    -- range check, and the check keeps 'chr' from throwing.
+    codePointToChar :: Integer -> Maybe Char
+    codePointToChar n
+      | n <= toInteger (ord maxBound) = Just (chr (fromInteger n))
+      | otherwise                     = Nothing
+
+-- | Returns a string containing an entity reference for the character.
+-- The first name listed for the character in 'entityTable' is used; when
+-- the table has no entry, a decimal numeric reference is built from the
+-- character's code point.
+charToEntity :: Char -> String
+charToEntity char =
+  case [ name | (name, character) <- entityTable, character == char ] of
+    (name:_) -> name
+    []       -> "&#" ++ show (ord char) ++ ";"
+
+-- | Association list pairing each named entity reference, written in full
+-- with its leading ampersand and trailing semicolon, with the character it
+-- denotes.  The names are those of the HTML 4.01 character entity set.
+entityTable :: [(String, Char)]
+entityTable =  [
+	("&quot;", chr 34),
+	("&amp;", chr 38),
+	("&lt;", chr 60),
+	("&gt;", chr 62),
+	("&nbsp;", chr 160),
+	("&iexcl;", chr 161),
+	("&cent;", chr 162),
+	("&pound;", chr 163),
+	("&curren;", chr 164),
+	("&yen;", chr 165),
+	("&brvbar;", chr 166),
+	("&sect;", chr 167),
+	("&uml;", chr 168),
+	("&copy;", chr 169),
+	("&ordf;", chr 170),
+	("&laquo;", chr 171),
+	("&not;", chr 172),
+	("&shy;", chr 173),
+	("&reg;", chr 174),
+	("&macr;", chr 175),
+	("&deg;", chr 176),
+	("&plusmn;", chr 177),
+	("&sup2;", chr 178),
+	("&sup3;", chr 179),
+	("&acute;", chr 180),
+	("&micro;", chr 181),
+	("&para;", chr 182),
+	("&middot;", chr 183),
+	("&cedil;", chr 184),
+	("&sup1;", chr 185),
+	("&ordm;", chr 186),
+	("&raquo;", chr 187),
+	("&frac14;", chr 188),
+	("&frac12;", chr 189),
+	("&frac34;", chr 190),
+	("&iquest;", chr 191),
+	("&Agrave;", chr 192),
+	("&Aacute;", chr 193),
+	("&Acirc;", chr 194),
+	("&Atilde;", chr 195),
+	("&Auml;", chr 196),
+	("&Aring;", chr 197),
+	("&AElig;", chr 198),
+	("&Ccedil;", chr 199),
+	("&Egrave;", chr 200),
+	("&Eacute;", chr 201),
+	("&Ecirc;", chr 202),
+	("&Euml;", chr 203),
+	("&Igrave;", chr 204),
+	("&Iacute;", chr 205),
+	("&Icirc;", chr 206),
+	("&Iuml;", chr 207),
+	("&ETH;", chr 208),
+	("&Ntilde;", chr 209),
+	("&Ograve;", chr 210),
+	("&Oacute;", chr 211),
+	("&Ocirc;", chr 212),
+	("&Otilde;", chr 213),
+	("&Ouml;", chr 214),
+	("&times;", chr 215),
+	("&Oslash;", chr 216),
+	("&Ugrave;", chr 217),
+	("&Uacute;", chr 218),
+	("&Ucirc;", chr 219),
+	("&Uuml;", chr 220),
+	("&Yacute;", chr 221),
+	("&THORN;", chr 222),
+	("&szlig;", chr 223),
+	("&agrave;", chr 224),
+	("&aacute;", chr 225),
+	("&acirc;", chr 226),
+	("&atilde;", chr 227),
+	("&auml;", chr 228),
+	("&aring;", chr 229),
+	("&aelig;", chr 230),
+	("&ccedil;", chr 231),
+	("&egrave;", chr 232),
+	("&eacute;", chr 233),
+	("&ecirc;", chr 234),
+	("&euml;", chr 235),
+	("&igrave;", chr 236),
+	("&iacute;", chr 237),
+	("&icirc;", chr 238),
+	("&iuml;", chr 239),
+	("&eth;", chr 240),
+	("&ntilde;", chr 241),
+	("&ograve;", chr 242),
+	("&oacute;", chr 243),
+	("&ocirc;", chr 244),
+	("&otilde;", chr 245),
+	("&ouml;", chr 246),
+	("&divide;", chr 247),
+	("&oslash;", chr 248),
+	("&ugrave;", chr 249),
+	("&uacute;", chr 250),
+	("&ucirc;", chr 251),
+	("&uuml;", chr 252),
+	("&yacute;", chr 253),
+	("&thorn;", chr 254),
+	("&yuml;", chr 255),
+	("&OElig;", chr 338),
+	("&oelig;", chr 339),
+	("&Scaron;", chr 352),
+	("&scaron;", chr 353),
+	("&Yuml;", chr 376),
+	("&fnof;", chr 402),
+	("&circ;", chr 710),
+	("&tilde;", chr 732),
+	("&Alpha;", chr 913),
+	("&Beta;", chr 914),
+	("&Gamma;", chr 915),
+	("&Delta;", chr 916),
+	("&Epsilon;", chr 917),
+	("&Zeta;", chr 918),
+	("&Eta;", chr 919),
+	("&Theta;", chr 920),
+	("&Iota;", chr 921),
+	("&Kappa;", chr 922),
+	("&Lambda;", chr 923),
+	("&Mu;", chr 924),
+	("&Nu;", chr 925),
+	("&Xi;", chr 926),
+	("&Omicron;", chr 927),
+	("&Pi;", chr 928),
+	("&Rho;", chr 929),
+	("&Sigma;", chr 931),
+	("&Tau;", chr 932),
+	("&Upsilon;", chr 933),
+	("&Phi;", chr 934),
+	("&Chi;", chr 935),
+	("&Psi;", chr 936),
+	("&Omega;", chr 937),
+	("&alpha;", chr 945),
+	("&beta;", chr 946),
+	("&gamma;", chr 947),
+	("&delta;", chr 948),
+	("&epsilon;", chr 949),
+	("&zeta;", chr 950),
+	("&eta;", chr 951),
+	("&theta;", chr 952),
+	("&iota;", chr 953),
+	("&kappa;", chr 954),
+	("&lambda;", chr 955),
+	("&mu;", chr 956),
+	("&nu;", chr 957),
+	("&xi;", chr 958),
+	("&omicron;", chr 959),
+	("&pi;", chr 960),
+	("&rho;", chr 961),
+	("&sigmaf;", chr 962),
+	("&sigma;", chr 963),
+	("&tau;", chr 964),
+	("&upsilon;", chr 965),
+	("&phi;", chr 966),
+	("&chi;", chr 967),
+	("&psi;", chr 968),
+	("&omega;", chr 969),
+	("&thetasym;", chr 977),
+	("&upsih;", chr 978),
+	("&piv;", chr 982),
+	("&ensp;", chr 8194),
+	("&emsp;", chr 8195),
+	("&thinsp;", chr 8201),
+	("&zwnj;", chr 8204),
+	("&zwj;", chr 8205),
+	("&lrm;", chr 8206),
+	("&rlm;", chr 8207),
+	("&ndash;", chr 8211),
+	("&mdash;", chr 8212),
+	("&lsquo;", chr 8216),
+	("&rsquo;", chr 8217),
+	("&sbquo;", chr 8218),
+	("&ldquo;", chr 8220),
+	("&rdquo;", chr 8221),
+	("&bdquo;", chr 8222),
+	("&dagger;", chr 8224),
+	("&Dagger;", chr 8225),
+	("&bull;", chr 8226),
+	("&hellip;", chr 8230),
+	("&permil;", chr 8240),
+	("&prime;", chr 8242),
+	("&Prime;", chr 8243),
+	("&lsaquo;", chr 8249),
+	("&rsaquo;", chr 8250),
+	("&oline;", chr 8254),
+	("&frasl;", chr 8260),
+	("&euro;", chr 8364),
+	("&image;", chr 8465),
+	("&weierp;", chr 8472),
+	("&real;", chr 8476),
+	("&trade;", chr 8482),
+	("&alefsym;", chr 8501),
+	("&larr;", chr 8592),
+	("&uarr;", chr 8593),
+	("&rarr;", chr 8594),
+	("&darr;", chr 8595),
+	("&harr;", chr 8596),
+	("&crarr;", chr 8629),
+	("&lArr;", chr 8656),
+	("&uArr;", chr 8657),
+	("&rArr;", chr 8658),
+	("&dArr;", chr 8659),
+	("&hArr;", chr 8660),
+	("&forall;", chr 8704),
+	("&part;", chr 8706),
+	("&exist;", chr 8707),
+	("&empty;", chr 8709),
+	("&nabla;", chr 8711),
+	("&isin;", chr 8712),
+	("&notin;", chr 8713),
+	("&ni;", chr 8715),
+	("&prod;", chr 8719),
+	("&sum;", chr 8721),
+	("&minus;", chr 8722),
+	("&lowast;", chr 8727),
+	("&radic;", chr 8730),
+	("&prop;", chr 8733),
+	("&infin;", chr 8734),
+	("&ang;", chr 8736),
+	("&and;", chr 8743),
+	("&or;", chr 8744),
+	("&cap;", chr 8745),
+	("&cup;", chr 8746),
+	("&int;", chr 8747),
+	("&there4;", chr 8756),
+	("&sim;", chr 8764),
+	("&cong;", chr 8773),
+	("&asymp;", chr 8776),
+	("&ne;", chr 8800),
+	("&equiv;", chr 8801),
+	("&le;", chr 8804),
+	("&ge;", chr 8805),
+	("&sub;", chr 8834),
+	("&sup;", chr 8835),
+	("&nsub;", chr 8836),
+	("&sube;", chr 8838),
+	("&supe;", chr 8839),
+	("&oplus;", chr 8853),
+	("&otimes;", chr 8855),
+	("&perp;", chr 8869),
+	("&sdot;", chr 8901),
+	("&lceil;", chr 8968),
+	("&rceil;", chr 8969),
+	("&lfloor;", chr 8970),
+	("&rfloor;", chr 8971),
+	("&lang;", chr 9001),
+	("&rang;", chr 9002),
+	("&loz;", chr 9674),
+	("&spades;", chr 9824),
+	("&clubs;", chr 9827),
+	("&hearts;", chr 9829),
+	("&diams;", chr 9830)
+	]
diff --git a/src/Text/Pandoc/HtmlEntities.hs b/src/Text/Pandoc/HtmlEntities.hs
deleted file mode 100644
index 157588262..000000000
--- a/src/Text/Pandoc/HtmlEntities.hs
+++ /dev/null
@@ -1,338 +0,0 @@
-{-
-Copyright (C) 2006 John MacFarlane
-
-This program is free software; you can redistribute it and/or modify
-it under the terms of the GNU General Public License as published by
-the Free Software Foundation; either version 2 of the License, or
-(at your option) any later version.
-
-This program is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-GNU General Public License for more details.
- -You should have received a copy of the GNU General Public License -along with this program; if not, write to the Free Software -Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA --} - -{- | - Module : Text.Pandoc.HtmlEntities - Copyright : Copyright (C) 2006 John MacFarlane - License : GNU GPL, version 2 or above - - Maintainer : John MacFarlane - Stability : alpha - Portability : portable - -Functions for encoding unicode characters as HTML entity references, -and vice versa. --} -module Text.Pandoc.HtmlEntities ( - htmlEntityToChar, - charToHtmlEntity, - decodeEntities, - encodeEntities - ) where -import Data.Char ( chr, ord ) -import Text.Regex ( mkRegex, matchRegexAll ) -import Maybe ( fromMaybe ) - --- regexs for entities -decimalCodedEntity = mkRegex "&#([0-9]+);" -characterEntity = mkRegex "&#[0-9]+;|&[A-Za-z0-9]+;" - --- | Return a string with all entity references decoded to unicode characters --- where possible. -decodeEntities :: String -> String -decodeEntities str = - case (matchRegexAll characterEntity str) of - Nothing -> str - Just (before, match, rest, _) -> before ++ replacement ++ - (decodeEntities rest) - where replacement = case (htmlEntityToChar match) of - Just ch -> [ch] - Nothing -> match - --- | Returns a string with characters replaced with entity references where --- possible. -encodeEntities :: String -> String -encodeEntities [] = [] -encodeEntities (c:cs) = if ord c < 127 - then c:(encodeEntities cs) - else (charToHtmlEntity c) ++ (encodeEntities cs) - --- | If the string is a valid entity reference, returns @Just@ the character, --- otherwise @Nothing@. -htmlEntityToChar :: String -> Maybe Char -htmlEntityToChar entity = - case (lookup entity htmlEntityTable) of - Just ch -> Just ch - Nothing -> case (matchRegexAll decimalCodedEntity entity) of - Just (_, _, _, [sub]) -> Just (chr (read sub)) - Nothing -> Nothing - --- | Returns a string containing an entity reference for the character. 
-charToHtmlEntity :: Char -> String -charToHtmlEntity char = - let matches = filter (\(entity, character) -> (character == char)) - htmlEntityTable in - if (length matches) == 0 - then "&#" ++ show (ord char) ++ ";" - else fst (head matches) - -htmlEntityTable :: [(String, Char)] -htmlEntityTable = [ - ("&quot;", chr 34), - ("&amp;", chr 38), - ("&lt;", chr 60), - ("&gt;", chr 62), - ("&nbsp;", chr 160), - ("&iexcl;", chr 161), - ("&cent;", chr 162), - ("&pound;", chr 163), - ("&curren;", chr 164), - ("&yen;", chr 165), - ("&brvbar;", chr 166), - ("&sect;", chr 167), - ("&uml;", chr 168), - ("&copy;", chr 169), - ("&ordf;", chr 170), - ("&laquo;", chr 171), - ("&not;", chr 172), - ("&shy;", chr 173), - ("&reg;", chr 174), - ("&macr;", chr 175), - ("&deg;", chr 176), - ("&plusmn;", chr 177), - ("&sup2;", chr 178), - ("&sup3;", chr 179), - ("&acute;", chr 180), - ("&micro;", chr 181), - ("&para;", chr 182), - ("&middot;", chr 183), - ("&cedil;", chr 184), - ("&sup1;", chr 185), - ("&ordm;", chr 186), - ("&raquo;", chr 187), - ("&frac14;", chr 188), - ("&frac12;", chr 189), - ("&frac34;", chr 190), - ("&iquest;", chr 191), - ("&Agrave;", chr 192), - ("&Aacute;", chr 193), - ("&Acirc;", chr 194), - ("&Atilde;", chr 195), - ("&Auml;", chr 196), - ("&Aring;", chr 197), - ("&AElig;", chr 198), - ("&Ccedil;", chr 199), - ("&Egrave;", chr 200), - ("&Eacute;", chr 201), - ("&Ecirc;", chr 202), - ("&Euml;", chr 203), - ("&Igrave;", chr 204), - ("&Iacute;", chr 205), - ("&Icirc;", chr 206), - ("&Iuml;", chr 207), - ("&ETH;", chr 208), - ("&Ntilde;", chr 209), - ("&Ograve;", chr 210), - ("&Oacute;", chr 211), - ("&Ocirc;", chr 212), - ("&Otilde;", chr 213), - ("&Ouml;", chr 214), - ("&times;", chr 215), - ("&Oslash;", chr 216), - ("&Ugrave;", chr 217), - ("&Uacute;", chr 218), - ("&Ucirc;", chr 219), - ("&Uuml;", chr 220), - ("&Yacute;", chr 221), - ("&THORN;", chr 222), - ("&szlig;", chr 223), - ("&agrave;", chr 224), - ("&aacute;", chr 225), - ("&acirc;", chr 226), - ("&atilde;", chr 227), - ("&auml;", chr 228), - ("&aring;", chr 229), - ("&aelig;", chr 230), - ("&ccedil;", chr 231), - ("&egrave;", chr 232), - ("&eacute;", chr 233), - ("&ecirc;", chr 234), - ("&euml;", chr 235), - ("&igrave;", chr 236), - ("&iacute;", chr 237), - ("&icirc;", chr 238), - ("&iuml;", chr 239), - ("&eth;", chr 240), - ("&ntilde;", chr 241), - ("&ograve;", chr 242), - ("&oacute;", chr 243), - ("&ocirc;", chr 244), - ("&otilde;", chr 245), - ("&ouml;", chr 246), - ("&divide;", chr 247), - ("&oslash;", chr 248), - ("&ugrave;", chr 249), - ("&uacute;", 
chr 250), - ("&ucirc;", chr 251), - ("&uuml;", chr 252), - ("&yacute;", chr 253), - ("&thorn;", chr 254), - ("&yuml;", chr 255), - ("&OElig;", chr 338), - ("&oelig;", chr 339), - ("&Scaron;", chr 352), - ("&scaron;", chr 353), - ("&Yuml;", chr 376), - ("&fnof;", chr 402), - ("&circ;", chr 710), - ("&tilde;", chr 732), - ("&Alpha;", chr 913), - ("&Beta;", chr 914), - ("&Gamma;", chr 915), - ("&Delta;", chr 916), - ("&Epsilon;", chr 917), - ("&Zeta;", chr 918), - ("&Eta;", chr 919), - ("&Theta;", chr 920), - ("&Iota;", chr 921), - ("&Kappa;", chr 922), - ("&Lambda;", chr 923), - ("&Mu;", chr 924), - ("&Nu;", chr 925), - ("&Xi;", chr 926), - ("&Omicron;", chr 927), - ("&Pi;", chr 928), - ("&Rho;", chr 929), - ("&Sigma;", chr 931), - ("&Tau;", chr 932), - ("&Upsilon;", chr 933), - ("&Phi;", chr 934), - ("&Chi;", chr 935), - ("&Psi;", chr 936), - ("&Omega;", chr 937), - ("&alpha;", chr 945), - ("&beta;", chr 946), - ("&gamma;", chr 947), - ("&delta;", chr 948), - ("&epsilon;", chr 949), - ("&zeta;", chr 950), - ("&eta;", chr 951), - ("&theta;", chr 952), - ("&iota;", chr 953), - ("&kappa;", chr 954), - ("&lambda;", chr 955), - ("&mu;", chr 956), - ("&nu;", chr 957), - ("&xi;", chr 958), - ("&omicron;", chr 959), - ("&pi;", chr 960), - ("&rho;", chr 961), - ("&sigmaf;", chr 962), - ("&sigma;", chr 963), - ("&tau;", chr 964), - ("&upsilon;", chr 965), - ("&phi;", chr 966), - ("&chi;", chr 967), - ("&psi;", chr 968), - ("&omega;", chr 969), - ("&thetasym;", chr 977), - ("&upsih;", chr 978), - ("&piv;", chr 982), - ("&ensp;", chr 8194), - ("&emsp;", chr 8195), - ("&thinsp;", chr 8201), - ("&zwnj;", chr 8204), - ("&zwj;", chr 8205), - ("&lrm;", chr 8206), - ("&rlm;", chr 8207), - ("&ndash;", chr 8211), - ("&mdash;", chr 8212), - ("&lsquo;", chr 8216), - ("&rsquo;", chr 8217), - ("&sbquo;", chr 8218), - ("&ldquo;", chr 8220), - ("&rdquo;", chr 8221), - ("&bdquo;", chr 8222), - ("&dagger;", chr 8224), - ("&Dagger;", chr 8225), - ("&bull;", chr 8226), - ("&hellip;", chr 8230), - ("&permil;", chr 8240), - ("&prime;", chr 8242), - ("&Prime;", chr 8243), - ("&lsaquo;", chr 8249), - ("&rsaquo;", chr 8250), - ("&oline;", chr 8254), - ("&frasl;", chr 8260), - ("&euro;", chr 8364), - ("&image;", chr 8465), - ("&weierp;", chr 8472), - ("&real;", chr 8476), - ("&trade;", chr 8482), - ("&alefsym;", chr 8501), - ("&larr;", chr 8592), - ("&uarr;", chr 8593), - ("&rarr;", chr 8594), - ("&darr;", chr 8595), - ("&harr;", chr 8596), - ("&crarr;", chr 8629), - ("&lArr;", chr 8656), - ("&uArr;", chr 8657), - ("&rArr;", chr 8658), - ("&dArr;", chr 8659), - ("&hArr;", chr 8660), - 
("&forall;", chr 8704), - ("&part;", chr 8706), - ("&exist;", chr 8707), - ("&empty;", chr 8709), - ("&nabla;", chr 8711), - ("&isin;", chr 8712), - ("&notin;", chr 8713), - ("&ni;", chr 8715), - ("&prod;", chr 8719), - ("&sum;", chr 8721), - ("&minus;", chr 8722), - ("&lowast;", chr 8727), - ("&radic;", chr 8730), - ("&prop;", chr 8733), - ("&infin;", chr 8734), - ("&ang;", chr 8736), - ("&and;", chr 8743), - ("&or;", chr 8744), - ("&cap;", chr 8745), - ("&cup;", chr 8746), - ("&int;", chr 8747), - ("&there4;", chr 8756), - ("&sim;", chr 8764), - ("&cong;", chr 8773), - ("&asymp;", chr 8776), - ("&ne;", chr 8800), - ("&equiv;", chr 8801), - ("&le;", chr 8804), - ("&ge;", chr 8805), - ("&sub;", chr 8834), - ("&sup;", chr 8835), - ("&nsub;", chr 8836), - ("&sube;", chr 8838), - ("&supe;", chr 8839), - ("&oplus;", chr 8853), - ("&otimes;", chr 8855), - ("&perp;", chr 8869), - ("&sdot;", chr 8901), - ("&lceil;", chr 8968), - ("&rceil;", chr 8969), - ("&lfloor;", chr 8970), - ("&rfloor;", chr 8971), - ("&lang;", chr 9001), - ("&rang;", chr 9002), - ("&loz;", chr 9674), - ("&spades;", chr 9824), - ("&clubs;", chr 9827), - ("&hearts;", chr 9829), - ("&diams;", chr 9830) - ] diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs index 9beaaacff..79bdab76a 100644 --- a/src/Text/Pandoc/Readers/HTML.hs +++ b/src/Text/Pandoc/Readers/HTML.hs @@ -45,7 +45,7 @@ import Text.ParserCombinators.Parsec import Text.ParserCombinators.Pandoc import Text.Pandoc.Definition import Text.Pandoc.Shared -import Text.Pandoc.HtmlEntities ( decodeEntities, htmlEntityToChar ) +import Text.Pandoc.Entities ( decodeEntities, entityToChar ) import Maybe ( fromMaybe ) import Char ( toUpper, toLower ) @@ -397,7 +397,7 @@ entity = try (do num <- many1 digit return ("#" ++ num)))] char ';' - return (Str [fromMaybe '?' 
(entityToChar ("&" ++ body ++ ";"))])) code = try (do htmlTag "code" diff --git a/src/Text/Pandoc/Readers/Markdown.hs b/src/Text/Pandoc/Readers/Markdown.hs index 9913b60f9..30d6a11df 100644 --- a/src/Text/Pandoc/Readers/Markdown.hs +++ b/src/Text/Pandoc/Readers/Markdown.hs @@ -41,7 +41,7 @@ import Text.Pandoc.Readers.HTML ( rawHtmlBlock, anyHtmlTag, anyHtmlEndTag, htmlEndTag, extractTagType, htmlBlockElement ) -import Text.Pandoc.HtmlEntities ( decodeEntities ) +import Text.Pandoc.Entities ( decodeEntities ) import Text.Regex ( matchRegex, mkRegex ) import Text.ParserCombinators.Parsec diff --git a/src/Text/Pandoc/Shared.hs b/src/Text/Pandoc/Shared.hs index eb6c7be78..adc2621f3 100644 --- a/src/Text/Pandoc/Shared.hs +++ b/src/Text/Pandoc/Shared.hs @@ -66,7 +66,7 @@ module Text.Pandoc.Shared ( ) where import Text.Pandoc.Definition import Text.ParserCombinators.Parsec -import Text.Pandoc.HtmlEntities ( decodeEntities ) +import Text.Pandoc.Entities ( decodeEntities ) import Text.Regex ( matchRegexAll, mkRegex, subRegex, Regex ) import Char ( toLower ) import List ( find, groupBy ) diff --git a/src/Text/Pandoc/Writers/Docbook.hs b/src/Text/Pandoc/Writers/Docbook.hs index 61f24807b..8ccf84f30 100644 --- a/src/Text/Pandoc/Writers/Docbook.hs +++ b/src/Text/Pandoc/Writers/Docbook.hs @@ -33,7 +33,7 @@ module Text.Pandoc.Writers.Docbook ( import Text.Pandoc.Definition import Text.Pandoc.Shared import Text.Pandoc.Writers.HTML ( stringToSmartHtml, stringToHtml ) -import Text.Pandoc.HtmlEntities ( encodeEntities ) +import Text.Pandoc.Entities ( encodeEntities ) import Text.Html ( stringToHtmlString ) import Text.Regex ( mkRegex, matchRegex ) import Data.Char ( toLower, ord ) -- cgit v1.2.3