aboutsummaryrefslogtreecommitdiff
path: root/src/Text/Pandoc/Entities.hs
diff options
context:
space:
mode:
authorfiddlosopher <fiddlosopher@788f1e2b-df1e-0410-8736-df70ead52e1b>2007-01-27 03:04:40 +0000
committerfiddlosopher <fiddlosopher@788f1e2b-df1e-0410-8736-df70ead52e1b>2007-01-27 03:04:40 +0000
commitd06417125dd4d8cb177abd2d472c0c1cad4c49be (patch)
treea3a6d7991f01e4184b678a5fcce023947d015451 /src/Text/Pandoc/Entities.hs
parentf2de08864ee9870147839389554b9353631d21f8 (diff)
downloadpandoc-d06417125dd4d8cb177abd2d472c0c1cad4c49be.tar.gz
Changes in entity handling:
+ Entities are parsed (and Unicode characters returned) in both Markdown and HTML readers. + Parsers characterEntity, namedEntity, decimalEntity, hexEntity added to Entities.hs; these parse a string and return a Unicode character. + Changed 'entity' parser in HTML reader to use the 'characterEntity' parser from Entities.hs. + Added new 'entity' parser to Markdown reader, and added '&' as a special character. Adjusted test suite accordingly since now we get 'Str "AT",Str "&",Str "T"' instead of 'Str "AT&T"'. + stringToSGML moved to Entities.hs. escapeSGML removed as redundant, given encodeEntities. + stringToSGML, encodeEntities, and specialCharToEntity are given a boolean parameter that causes only numerical entities to be used. This is used in the DocBook writer. The HTML writer uses named entities where possible, but not all DocBook consumers know about the named entities without special instructions, so it seems safer to use numerical entities there. + decodeEntities is rewritten in a way that avoids Text.Regex, using the new parsers. + charToEntity and charToNumericalEntity added to Entities.hs. + Moved specialCharToEntity from Shared.hs to Entities.hs. + Removed unneeded 'decodeEntities' from 'str' parser in HTML and Markdown readers. + Removed sgmlHexEntity, sgmlDecimalEntity, sgmlNamedEntity, and sgmlCharacterEntity from Shared.hs. + Modified DocBook writer so that it doesn't rely on Text.Regex for detecting "mailto" links. git-svn-id: https://pandoc.googlecode.com/svn/trunk@515 788f1e2b-df1e-0410-8736-df70ead52e1b
Diffstat (limited to 'src/Text/Pandoc/Entities.hs')
-rw-r--r--src/Text/Pandoc/Entities.hs137
1 files changed, 92 insertions, 45 deletions
diff --git a/src/Text/Pandoc/Entities.hs b/src/Text/Pandoc/Entities.hs
index ea5676b79..3e68db35c 100644
--- a/src/Text/Pandoc/Entities.hs
+++ b/src/Text/Pandoc/Entities.hs
@@ -29,62 +29,109 @@ Functions for encoding unicode characters as entity references,
and vice versa.
-}
module Text.Pandoc.Entities (
- entityToChar,
charToEntity,
- decodeEntities,
+ charToNumericalEntity,
+ specialCharToEntity,
encodeEntities,
+ decodeEntities,
+ stringToSGML,
characterEntity
) where
import Data.Char ( chr, ord )
-import Text.Regex ( mkRegex, matchRegexAll, Regex )
-import Maybe ( fromMaybe )
+import Text.ParserCombinators.Parsec
+import Data.Maybe ( fromMaybe )
--- | Regular expression for numerical coded entity.
-numericalEntity :: Text.Regex.Regex
-numericalEntity = mkRegex "&#([0-9]+|[xX][0-9A-Fa-f]+);"
+-- | Entity reference for a character: the first entry of 'entityTable'
+-- whose character matches, or a numerical reference when the table has
+-- no entry for it.
+charToEntity :: Char -> String
+charToEntity ch =
+  case [name | (name, c) <- entityTable, c == ch] of
+    (name:_) -> name
+    []       -> charToNumericalEntity ch
--- | Regular expression for character entity.
-characterEntity :: Text.Regex.Regex
-characterEntity = mkRegex "&#[0-9]+;|&#[xX][0-9A-Fa-f]+;|&[A-Za-z0-9]+;"
+-- | Decimal numerical entity reference for a character: an ampersand and
+-- hash sign, the character's code point in decimal, and a semicolon.
+charToNumericalEntity :: Char -> String
+charToNumericalEntity ch = concat ["&#", show (ord ch), ";"]
--- | Return a string with all entity references decoded to unicode characters
--- where possible.
-decodeEntities :: String -> String
-decodeEntities str =
- case (matchRegexAll characterEntity str) of
- Nothing -> str
- Just (before, match, rest, _) -> before ++ replacement ++
- (decodeEntities rest)
- where replacement = case (entityToChar match) of
- Just ch -> [ch]
- Nothing -> match
+-- | Escape a single character for SGML.  The ampersand, the two angle
+-- brackets, the double quote, and every character with a code point above
+-- 127 are turned into entity references; any other character is returned
+-- unchanged as a one-character string.
+specialCharToEntity :: Bool   -- ^ Use numerical entities only.
+                    -> Char   -- ^ Character to convert.
+                    -> [Char]
+specialCharToEntity numericalEntities c
+  | not needsEscape   = [c]
+  | numericalEntities = charToNumericalEntity c
+  | otherwise         = charToEntity c
+  where
+    needsEscape = ord c > 127 || c `elem` "&<>\""
--- | Returns a string with characters replaced with entity references where
--- possible.
-encodeEntities :: String -> String
-encodeEntities [] = []
-encodeEntities (c:cs) = if ord c < 128
- then c:(encodeEntities cs)
- else (charToEntity c) ++ (encodeEntities cs)
+-- | Parse any SGML character entity and return the character it denotes.
+-- Named references are tried first, then hexadecimal, then decimal ones.
+characterEntity :: GenParser Char st Char
+characterEntity =
+  choice [namedEntity, hexEntity, decimalEntity] <?> "SGML entity"
--- | If the string is a valid entity reference, returns @Just@ the character,
--- otherwise @Nothing@.
-entityToChar :: String -> Maybe Char
-entityToChar entity =
- case (lookup entity entityTable) of
- Just ch -> Just ch
- Nothing -> case (matchRegexAll numericalEntity entity) of
- Just (_, _, _, [sub]) -> Just (chr (read ('0':sub)))
- Nothing -> Nothing
+-- | Parse an SGML named entity: an ampersand, one or more alphanumeric
+-- characters, and a semicolon.  The full reference text is looked up in
+-- 'entityTable'; a name that is not in the table yields @\'?\'@.
+namedEntity :: GenParser Char st Char
+namedEntity = try $ do
+  name <- between (char '&') (char ';') (many1 alphaNum)
+  return $ fromMaybe '?' (lookup ("&" ++ name ++ ";") entityTable)
+
+-- | Parse an SGML hexadecimal entity: an ampersand and hash sign, an @x@
+-- or @X@, one or more hexadecimal digits, and a semicolon.  Returns the
+-- character with that code point.
+--
+-- The parser fails without consuming input when the code point is larger
+-- than @maxBound :: Char@, so an out-of-range reference is left for the
+-- caller to treat as ordinary text instead of crashing in 'chr'.
+hexEntity :: GenParser Char st Char
+hexEntity = try $ do
+  _ <- string "&#"
+  _ <- oneOf "Xx"
+  digits <- many1 hexDigit
+  _ <- char ';'
+  -- Read as Integer: a long digit string would silently wrap around as Int.
+  let code = read ("0x" ++ digits) :: Integer
+  if code <= toInteger (ord maxBound)
+     then return (chr (fromInteger code))
+     else fail "hexadecimal character reference out of range"
--- | Returns a string containing an entity reference for the character.
-charToEntity :: Char -> String
-charToEntity char =
- let matches = filter (\(entity, character) -> (character == char))
- entityTable in
- if (length matches) == 0
- then "&#" ++ show (ord char) ++ ";"
- else fst (head matches)
+-- | Parse an SGML decimal entity: an ampersand and hash sign, one or more
+-- decimal digits, and a semicolon.  Returns the character with that code
+-- point.
+--
+-- The parser fails without consuming input when the code point is larger
+-- than @maxBound :: Char@, so an out-of-range reference is left for the
+-- caller to treat as ordinary text instead of crashing in 'chr'.
+decimalEntity :: GenParser Char st Char
+decimalEntity = try $ do
+  _ <- string "&#"
+  digits <- many1 digit
+  _ <- char ';'
+  -- Read as Integer: a long digit string would silently wrap around as Int.
+  let code = read digits :: Integer
+  if code <= toInteger (ord maxBound)
+     then return (chr (fromInteger code))
+     else fail "decimal character reference out of range"
+
+-- | Escape a string for SGML by passing every character through
+-- 'specialCharToEntity'.  Entity references already present in the input
+-- are not preserved: their leading ampersand is escaped like any other.
+encodeEntities :: Bool   -- ^ Use only numerical entities.
+               -> String -- ^ String to convert.
+               -> String
+encodeEntities numericalEntities str =
+  concat [specialCharToEntity numericalEntities c | c <- str]
+
+-- | Replace every character of a string, including plain ASCII ones, by
+-- its decimal numerical entity reference.  Entity references already
+-- present in the input are not preserved.
+encodeEntitiesNumerical :: String -> String
+encodeEntitiesNumerical str =
+  concat ["&#" ++ show (ord c) ++ ";" | c <- str]
+
+-- | Replace each character entity reference in a string by the character
+-- that 'characterEntity' returns for it; all other characters are copied
+-- through unchanged.  The input string is also passed to 'parse' as the
+-- source name, so it appears in the message if the parse ever fails.
+decodeEntities :: String -> String
+decodeEntities str =
+  either (\err -> error $ "\nError: " ++ show err) id
+         (parse (many (characterEntity <|> anyChar)) str str)
+
+-- | Escape a string for SGML while preserving entity references.  Text
+-- that 'characterEntity' recognises is copied verbatim; every other
+-- character goes through 'specialCharToEntity'.
+stringToSGML :: Bool   -- ^ Use only numerical entities.
+             -> String -- ^ String to convert.
+             -> String
+stringToSGML numericalEntities str =
+  case parse (many (escaped <|> reference)) str str of
+    Right pieces -> concat pieces
+    Left err     -> error $ "\nError: " ++ show err
+  where
+    -- One character that does not begin an entity reference, escaped.
+    escaped = do
+      notFollowedBy characterEntity
+      fmap (specialCharToEntity numericalEntities) anyChar
+    -- Only tried after 'escaped' fails: copy everything up to and
+    -- including the next semicolon unchanged.
+    reference = do
+      body <- manyTill anyChar (char ';')
+      return (body ++ ";")
entityTable :: [(String, Char)]
entityTable = [