From d06417125dd4d8cb177abd2d472c0c1cad4c49be Mon Sep 17 00:00:00 2001 From: fiddlosopher Date: Sat, 27 Jan 2007 03:04:40 +0000 Subject: Changes in entity handling: + Entities are parsed (and unicode characters returned) in both Markdown and HTML readers. + Parsers characterEntity, namedEntity, decimalEntity, hexEntity added to Entities.hs; these parse a string and return a unicode character. + Changed 'entity' parser in HTML reader to use the 'characterEntity' parser from Entities.hs. + Added new 'entity' parser to Markdown reader, and added '&' as a special character. Adjusted test suite accordingly since now we get 'Str "AT",Str "&",Str "T"' instead of 'Str "AT&T".. + stringToSGML moved to Entities.hs. escapeSGML removed as redundant, given encodeEntities. + stringToSGML, encodeEntities, and specialCharToEntity are given a boolean parameter that causes only numerical entities to be used. This is used in the docbook writer. The HTML writer uses named entities where possible, but not all docbook-consumers know about the named entities without special instructions, so it seems safer to use numerical entities there. + decodeEntities is rewritten in a way that avoids Text.Regex, using the new parsers. + charToEntity and charToNumericalEntity added to Entities.hs. + Moved specialCharToEntity from Shared.hs to Entities.hs. + Removed unneeded 'decodeEntities' from 'str' parser in HTML and Markdown readers. + Removed sgmlHexEntity, sgmlDecimalEntity, sgmlNamedEntity, and sgmlCharacterEntity from Shared.hs. + Modified Docbook writer so that it doesn't rely on Text.Regex for detecting "mailto" links. git-svn-id: https://pandoc.googlecode.com/svn/trunk@515 788f1e2b-df1e-0410-8736-df70ead52e1b --- src/Text/Pandoc/Entities.hs | 137 +++++++++++++++++++++++++++++--------------- 1 file changed, 92 insertions(+), 45 deletions(-) (limited to 'src/Text/Pandoc/Entities.hs') diff --git a/src/Text/Pandoc/Entities.hs b/src/Text/Pandoc/Entities.hs index ea5676b79..3e68db35c 100644 --- a/src/Text/Pandoc/Entities.hs +++ b/src/Text/Pandoc/Entities.hs @@ -29,62 +29,109 @@ Functions for encoding unicode characters as entity references, and vice versa. -} module Text.Pandoc.Entities ( - entityToChar, charToEntity, - decodeEntities, + charToNumericalEntity, + specialCharToEntity, encodeEntities, + decodeEntities, + stringToSGML, characterEntity ) where import Data.Char ( chr, ord ) -import Text.Regex ( mkRegex, matchRegexAll, Regex ) -import Maybe ( fromMaybe ) +import Text.ParserCombinators.Parsec +import Data.Maybe ( fromMaybe ) --- | Regular expression for numerical coded entity. -numericalEntity :: Text.Regex.Regex -numericalEntity = mkRegex "&#([0-9]+|[xX][0-9A-Fa-f]+);" +-- | Returns a string containing an entity reference for the character. +charToEntity :: Char -> String +charToEntity char = + let matches = filter (\(entity, character) -> (character == char)) + entityTable in + if (length matches) == 0 + then charToNumericalEntity char + else fst (head matches) --- | Regular expression for character entity. -characterEntity :: Text.Regex.Regex -characterEntity = mkRegex "&#[0-9]+;|&#[xX][0-9A-Fa-f]+;|&[A-Za-z0-9]+;" +-- | Returns a string containing a numerical entity reference for the char. +charToNumericalEntity :: Char -> String +charToNumericalEntity ch = "&#" ++ show (ord ch) ++ ";" --- | Return a string with all entity references decoded to unicode characters --- where possible. -decodeEntities :: String -> String -decodeEntities str = - case (matchRegexAll characterEntity str) of - Nothing -> str - Just (before, match, rest, _) -> before ++ replacement ++ - (decodeEntities rest) - where replacement = case (entityToChar match) of - Just ch -> [ch] - Nothing -> match +-- | Escape special character to SGML entity. +specialCharToEntity :: Bool -- ^ Use numerical entities only. + -> Char -- ^ Character to convert. + -> [Char] +specialCharToEntity numericalEntities c = + if (c `elem` "&<>\"") || (ord c > 127) + then if numericalEntities + then charToNumericalEntity c + else charToEntity c + else [c] --- | Returns a string with characters replaced with entity references where --- possible. -encodeEntities :: String -> String -encodeEntities [] = [] -encodeEntities (c:cs) = if ord c < 128 - then c:(encodeEntities cs) - else (charToEntity c) ++ (encodeEntities cs) +-- | Parse SGML character entity. +characterEntity :: GenParser Char st Char +characterEntity = namedEntity <|> hexEntity <|> decimalEntity "SGML entity" --- | If the string is a valid entity reference, returns @Just@ the character, --- otherwise @Nothing@. -entityToChar :: String -> Maybe Char -entityToChar entity = - case (lookup entity entityTable) of - Just ch -> Just ch - Nothing -> case (matchRegexAll numericalEntity entity) of - Just (_, _, _, [sub]) -> Just (chr (read ('0':sub))) - Nothing -> Nothing +-- | Parse SGML character entity. +namedEntity :: GenParser Char st Char +namedEntity = try $ do + st <- char '&' + body <- many1 alphaNum + end <- char ';' + let entity = "&" ++ body ++ ";" + return $ case (lookup entity entityTable) of + Just ch -> ch + Nothing -> '?' + +-- | Parse SGML hexadecimal entity. +hexEntity :: GenParser Char st Char +hexEntity = try $ do + st <- string "&#" + hex <- oneOf "Xx" + body <- many1 (oneOf "0123456789ABCDEFabcdef") + end <- char ';' + return $ chr $ read ('0':'x':body) --- | Returns a string containing an entity reference for the character. -charToEntity :: Char -> String -charToEntity char = - let matches = filter (\(entity, character) -> (character == char)) - entityTable in - if (length matches) == 0 - then "&#" ++ show (ord char) ++ ";" - else fst (head matches) +-- | Parse SGML decimal entity. +decimalEntity :: GenParser Char st Char +decimalEntity = try $ do + st <- string "&#" + body <- many1 digit + end <- char ';' + return $ chr $ read body + +-- | Escape string as needed for SGML. Entity references are not preserved. +encodeEntities :: Bool -- ^ Use only numerical entities. + -> String -- ^ String to convert. + -> String +encodeEntities numericalEntities = + concatMap (specialCharToEntity numericalEntities) + +-- | Escape string as needed for SGML, using only numerical entities. +-- Entity references are not preserved. +encodeEntitiesNumerical :: String -> String +encodeEntitiesNumerical = + concatMap (\c -> "&#" ++ show (ord c) ++ ";") + +-- | Convert entities in a string to characters. +decodeEntities :: String -> String +decodeEntities str = + case parse (many (characterEntity <|> anyChar)) str str of + Left err -> error $ "\nError: " ++ show err + Right result -> result + +-- | Escape string for SGML, preserving entity references. +stringToSGML :: Bool -- ^ Use only numerical entities. + -> String -- ^ String to convert. + -> String +stringToSGML numericalEntities str = + let nonentity = do + notFollowedBy characterEntity + c <- anyChar + return $ specialCharToEntity numericalEntities c + entity = do + ent <- manyTill anyChar (char ';') + return (ent ++ ";") in + case parse (many (nonentity <|> entity)) str str of + Left err -> error $ "\nError: " ++ show err + Right result -> concat result entityTable :: [(String, Char)] entityTable = [ -- cgit v1.2.3