diff options
author | fiddlosopher <fiddlosopher@788f1e2b-df1e-0410-8736-df70ead52e1b> | 2007-01-27 03:04:40 +0000 |
---|---|---|
committer | fiddlosopher <fiddlosopher@788f1e2b-df1e-0410-8736-df70ead52e1b> | 2007-01-27 03:04:40 +0000 |
commit | d06417125dd4d8cb177abd2d472c0c1cad4c49be (patch) | |
tree | a3a6d7991f01e4184b678a5fcce023947d015451 /src/Text/Pandoc/Shared.hs | |
parent | f2de08864ee9870147839389554b9353631d21f8 (diff) | |
download | pandoc-d06417125dd4d8cb177abd2d472c0c1cad4c49be.tar.gz |
Changes in entity handling:
+ Entities are parsed (and Unicode characters returned) in both
Markdown and HTML readers.
+ Parsers characterEntity, namedEntity, decimalEntity, hexEntity added
to Entities.hs; these parse a string and return a Unicode character.
+ Changed 'entity' parser in HTML reader to use the 'characterEntity'
parser from Entities.hs.
+ Added new 'entity' parser to Markdown reader, and added '&' as a
special character. Adjusted test suite accordingly since now we
get 'Str "AT",Str "&",Str "T"' instead of 'Str "AT&T"'.
+ stringToSGML moved to Entities.hs. escapeSGML removed as redundant,
given encodeEntities.
+ stringToSGML, encodeEntities, and specialCharToEntity are given a
boolean parameter that causes only numerical entities to be used.
This is used in the docbook writer. The HTML writer uses named
entities where possible, but not all docbook-consumers know about
the named entities without special instructions, so it seems safer
to use numerical entities there.
+ decodeEntities is rewritten in a way that avoids Text.Regex, using
the new parsers.
+ charToEntity and charToNumericalEntity added to Entities.hs.
+ Moved specialCharToEntity from Shared.hs to Entities.hs.
+ Removed unneeded 'decodeEntities' from 'str' parser in HTML and
Markdown readers.
+ Removed sgmlHexEntity, sgmlDecimalEntity, sgmlNamedEntity, and
sgmlCharacterEntity from Shared.hs.
+ Modified Docbook writer so that it doesn't rely on Text.Regex for
detecting "mailto" links.
git-svn-id: https://pandoc.googlecode.com/svn/trunk@515 788f1e2b-df1e-0410-8736-df70ead52e1b
Diffstat (limited to 'src/Text/Pandoc/Shared.hs')
-rw-r--r-- | src/Text/Pandoc/Shared.hs | 58 |
1 files changed, 3 insertions, 55 deletions
diff --git a/src/Text/Pandoc/Shared.hs b/src/Text/Pandoc/Shared.hs index b82357d7a..f63ca4ce4 100644 --- a/src/Text/Pandoc/Shared.hs +++ b/src/Text/Pandoc/Shared.hs @@ -65,8 +65,6 @@ module Text.Pandoc.Shared ( replaceReferenceLinks, replaceRefLinksBlockList, -- * SGML - escapeSGML, - stringToSGML, inTags, selfClosingTag, inTagsSimple, @@ -74,7 +72,7 @@ module Text.Pandoc.Shared ( ) where import Text.Pandoc.Definition import Text.ParserCombinators.Parsec as Parsec -import Text.Pandoc.Entities ( decodeEntities, charToEntity ) +import Text.Pandoc.Entities ( decodeEntities, encodeEntities, stringToSGML ) import Text.PrettyPrint.HughesPJ as PP ( text, char, (<>), ($$), nest, Doc, isEmpty ) import Data.Char ( toLower, ord ) @@ -523,61 +521,11 @@ replaceRefLinksInline keytable (Quoted t lst) = Quoted t (map (replaceRefLinksInline keytable) lst) replaceRefLinksInline keytable other = other --- | Parse SGML character entity. -sgmlCharacterEntity :: GenParser Char st [Char] -sgmlCharacterEntity = sgmlNamedEntity <|> sgmlDecimalEntity <|> - sgmlHexEntity <?> "SGML entity" - --- | Parse SGML character entity. -sgmlNamedEntity :: GenParser Char st [Char] -sgmlNamedEntity = try $ do - st <- Parsec.char '&' - body <- many1 alphaNum - end <- Parsec.char ';' - return $ (st:body) ++ [end] - --- | Parse SGML decimal entity. -sgmlDecimalEntity :: GenParser Char st [Char] -sgmlDecimalEntity = try $ do - st <- string "&#" - body <- many1 (oneOf "0123456789") - end <- Parsec.char ';' - return $ st ++ body ++ [end] - --- | Parse SGML hexadecimal entity. -sgmlHexEntity :: GenParser Char st [Char] -sgmlHexEntity = try $ do - st <- string "&#" - hex <- oneOf "Xx" - body <- many1 (oneOf "0123456789ABCDEFabcdef") - end <- Parsec.char ';' - return $ st ++ (hex:body) ++ [end] - --- | Escape special character to SGML entity. 
-specialCharToEntity :: Char -> [Char] -specialCharToEntity c = if (c `elem` "&<>\"") || (ord c > 127) - then charToEntity c - else [c] - --- | Escape string, preserving character entities. -stringToSGML :: String -> String -stringToSGML str = - let segment = sgmlCharacterEntity <|> - (do{c <- anyChar; - return $ specialCharToEntity c}) - sgmlString = (do{segs <- many segment; return $ concat segs}) in - case parse sgmlString str str of - Left err -> error $ "\nError:\n" ++ show err - Right result -> result - --- | Escape string as needed for SGML. Entity references are not preserved. -escapeSGML :: String -> String -escapeSGML = concatMap specialCharToEntity - -- | Return a text object with a string of formatted SGML attributes. attributeList :: [(String, String)] -> Doc attributeList = text . concatMap - (\(a, b) -> " " ++ stringToSGML a ++ "=\"" ++ stringToSGML b ++ "\"") + (\(a, b) -> " " ++ stringToSGML True a ++ "=\"" ++ + stringToSGML True b ++ "\"") -- | Put the supplied contents between start and end tags of tagType, -- with specified attributes and (if specified) indentation. |