diff options
author | fiddlosopher <fiddlosopher@788f1e2b-df1e-0410-8736-df70ead52e1b> | 2007-01-27 03:04:40 +0000 |
---|---|---|
committer | fiddlosopher <fiddlosopher@788f1e2b-df1e-0410-8736-df70ead52e1b> | 2007-01-27 03:04:40 +0000 |
commit | d06417125dd4d8cb177abd2d472c0c1cad4c49be (patch) | |
tree | a3a6d7991f01e4184b678a5fcce023947d015451 /src/Text/Pandoc/Entities.hs | |
parent | f2de08864ee9870147839389554b9353631d21f8 (diff) | |
download | pandoc-d06417125dd4d8cb177abd2d472c0c1cad4c49be.tar.gz |
Changes in entity handling:
+ Entities are parsed (and unicode characters returned) in both
Markdown and HTML readers.
+ Parsers characterEntity, namedEntity, decimalEntity, hexEntity added
to Entities.hs; these parse a string and return a unicode character.
+ Changed 'entity' parser in HTML reader to use the 'characterEntity'
parser from Entities.hs.
+ Added new 'entity' parser to Markdown reader, and added '&' as a
special character. Adjusted test suite accordingly since now we
get 'Str "AT",Str "&",Str "T"' instead of 'Str "AT&T"..
+ stringToSGML moved to Entities.hs. escapeSGML removed as redundant,
given encodeEntities.
+ stringToSGML, encodeEntities, and specialCharToEntity are given a
boolean parameter that causes only numerical entities to be used.
This is used in the docbook writer. The HTML writer uses named
entities where possible, but not all docbook-consumers know about
the named entities without special instructions, so it seems safer
to use numerical entities there.
+ decodeEntities is rewritten in a way that avoids Text.Regex, using
the new parsers.
+ charToEntity and charToNumericalEntity added to Entities.hs.
+ Moved specialCharToEntity from Shared.hs to Entities.hs.
+ Removed unneeded 'decodeEntities' from 'str' parser in HTML and
Markdown readers.
+ Removed sgmlHexEntity, sgmlDecimalEntity, sgmlNamedEntity, and
sgmlCharacterEntity from Shared.hs.
+ Modified Docbook writer so that it doesn't rely on Text.Regex for
detecting "mailto" links.
git-svn-id: https://pandoc.googlecode.com/svn/trunk@515 788f1e2b-df1e-0410-8736-df70ead52e1b
Diffstat (limited to 'src/Text/Pandoc/Entities.hs')
-rw-r--r-- | src/Text/Pandoc/Entities.hs | 137 |
1 files changed, 92 insertions, 45 deletions
diff --git a/src/Text/Pandoc/Entities.hs b/src/Text/Pandoc/Entities.hs index ea5676b79..3e68db35c 100644 --- a/src/Text/Pandoc/Entities.hs +++ b/src/Text/Pandoc/Entities.hs @@ -29,62 +29,109 @@ Functions for encoding unicode characters as entity references, and vice versa. -} module Text.Pandoc.Entities ( - entityToChar, charToEntity, - decodeEntities, + charToNumericalEntity, + specialCharToEntity, encodeEntities, + decodeEntities, + stringToSGML, characterEntity ) where import Data.Char ( chr, ord ) -import Text.Regex ( mkRegex, matchRegexAll, Regex ) -import Maybe ( fromMaybe ) +import Text.ParserCombinators.Parsec +import Data.Maybe ( fromMaybe ) --- | Regular expression for numerical coded entity. -numericalEntity :: Text.Regex.Regex -numericalEntity = mkRegex "&#([0-9]+|[xX][0-9A-Fa-f]+);" +-- | Returns a string containing an entity reference for the character. +charToEntity :: Char -> String +charToEntity char = + let matches = filter (\(entity, character) -> (character == char)) + entityTable in + if (length matches) == 0 + then charToNumericalEntity char + else fst (head matches) --- | Regular expression for character entity. -characterEntity :: Text.Regex.Regex -characterEntity = mkRegex "&#[0-9]+;|&#[xX][0-9A-Fa-f]+;|&[A-Za-z0-9]+;" +-- | Returns a string containing a numerical entity reference for the char. +charToNumericalEntity :: Char -> String +charToNumericalEntity ch = "&#" ++ show (ord ch) ++ ";" --- | Return a string with all entity references decoded to unicode characters --- where possible. -decodeEntities :: String -> String -decodeEntities str = - case (matchRegexAll characterEntity str) of - Nothing -> str - Just (before, match, rest, _) -> before ++ replacement ++ - (decodeEntities rest) - where replacement = case (entityToChar match) of - Just ch -> [ch] - Nothing -> match +-- | Escape special character to SGML entity. +specialCharToEntity :: Bool -- ^ Use numerical entities only. + -> Char -- ^ Character to convert. + -> [Char] +specialCharToEntity numericalEntities c = + if (c `elem` "&<>\"") || (ord c > 127) + then if numericalEntities + then charToNumericalEntity c + else charToEntity c + else [c] --- | Returns a string with characters replaced with entity references where --- possible. -encodeEntities :: String -> String -encodeEntities [] = [] -encodeEntities (c:cs) = if ord c < 128 - then c:(encodeEntities cs) - else (charToEntity c) ++ (encodeEntities cs) +-- | Parse SGML character entity. +characterEntity :: GenParser Char st Char +characterEntity = namedEntity <|> hexEntity <|> decimalEntity <?> "SGML entity" --- | If the string is a valid entity reference, returns @Just@ the character, --- otherwise @Nothing@. -entityToChar :: String -> Maybe Char -entityToChar entity = - case (lookup entity entityTable) of - Just ch -> Just ch - Nothing -> case (matchRegexAll numericalEntity entity) of - Just (_, _, _, [sub]) -> Just (chr (read ('0':sub))) - Nothing -> Nothing +-- | Parse SGML character entity. +namedEntity :: GenParser Char st Char +namedEntity = try $ do + st <- char '&' + body <- many1 alphaNum + end <- char ';' + let entity = "&" ++ body ++ ";" + return $ case (lookup entity entityTable) of + Just ch -> ch + Nothing -> '?' + +-- | Parse SGML hexadecimal entity. +hexEntity :: GenParser Char st Char +hexEntity = try $ do + st <- string "&#" + hex <- oneOf "Xx" + body <- many1 (oneOf "0123456789ABCDEFabcdef") + end <- char ';' + return $ chr $ read ('0':'x':body) --- | Returns a string containing an entity reference for the character. -charToEntity :: Char -> String -charToEntity char = - let matches = filter (\(entity, character) -> (character == char)) - entityTable in - if (length matches) == 0 - then "&#" ++ show (ord char) ++ ";" - else fst (head matches) +-- | Parse SGML decimal entity. +decimalEntity :: GenParser Char st Char +decimalEntity = try $ do + st <- string "&#" + body <- many1 digit + end <- char ';' + return $ chr $ read body + +-- | Escape string as needed for SGML. Entity references are not preserved. +encodeEntities :: Bool -- ^ Use only numerical entities. + -> String -- ^ String to convert. + -> String +encodeEntities numericalEntities = + concatMap (specialCharToEntity numericalEntities) + +-- | Escape string as needed for SGML, using only numerical entities. +-- Entity references are not preserved. +encodeEntitiesNumerical :: String -> String +encodeEntitiesNumerical = + concatMap (\c -> "&#" ++ show (ord c) ++ ";") + +-- | Convert entities in a string to characters. +decodeEntities :: String -> String +decodeEntities str = + case parse (many (characterEntity <|> anyChar)) str str of + Left err -> error $ "\nError: " ++ show err + Right result -> result + +-- | Escape string for SGML, preserving entity references. +stringToSGML :: Bool -- ^ Use only numerical entities. + -> String -- ^ String to convert. + -> String +stringToSGML numericalEntities str = + let nonentity = do + notFollowedBy characterEntity + c <- anyChar + return $ specialCharToEntity numericalEntities c + entity = do + ent <- manyTill anyChar (char ';') + return (ent ++ ";") in + case parse (many (nonentity <|> entity)) str str of + Left err -> error $ "\nError: " ++ show err + Right result -> concat result entityTable :: [(String, Char)] entityTable = [ |