Changes in entity handling:

+ Entities are parsed (and unicode characters returned) in both Markdown and HTML readers. + Parsers characterEntity, namedEntity, decimalEntity, hexEntity added to Entities.hs; these parse a string and return a unicode character. + Changed 'entity' parser in HTML reader to use the 'characterEntity' parser from Entities.hs. + Added new 'entity' parser to Markdown reader, and added '&' as a special character. Adjusted test suite accordingly since now we get 'Str "AT",Str "&",Str "T"' instead of 'Str "AT&T"'. + stringToSGML moved to Entities.hs. escapeSGML removed as redundant, given encodeEntities. + stringToSGML, encodeEntities, and specialCharToEntity are given a boolean parameter that causes only numerical entities to be used. This is used in the docbook writer. The HTML writer uses named entities where possible, but not all docbook-consumers know about the named entities without special instructions, so it seems safer to use numerical entities there. + decodeEntities is rewritten in a way that avoids Text.Regex, using the new parsers. + charToEntity and charToNumericalEntity added to Entities.hs. + Moved specialCharToEntity from Shared.hs to Entities.hs. + Removed unneeded 'decodeEntities' from 'str' parser in HTML and Markdown readers. + Removed sgmlHexEntity, sgmlDecimalEntity, sgmlNamedEntity, and sgmlCharacterEntity from Shared.hs. + Modified Docbook writer so that it doesn't rely on Text.Regex for detecting "mailto" links. git-svn-id: https://pandoc.googlecode.com/svn/trunk@515 788f1e2b-df1e-0410-8736-df70ead52e1b
author: fiddlosopher <fiddlosopher@788f1e2b-df1e-0410-8736-df70ead52e1b> 2007-01-27 03:04:40 +0000
committer: fiddlosopher <fiddlosopher@788f1e2b-df1e-0410-8736-df70ead52e1b> 2007-01-27 03:04:40 +0000
commit: d06417125dd4d8cb177abd2d472c0c1cad4c49be (patch)
tree: a3a6d7991f01e4184b678a5fcce023947d015451 /src/Text/Pandoc/Readers
parent: f2de08864ee9870147839389554b9353631d21f8 (diff)
download: pandoc-d06417125dd4d8cb177abd2d472c0c1cad4c49be.tar.gz
2 files changed, 14 insertions, 14 deletions
diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs
index 135a90ea8..fc06b657e 100644
--- a/src/Text/Pandoc/Readers/HTML.hs
+++ b/src/Text/Pandoc/Readers/HTML.hs
@@ -44,7 +44,7 @@ import Text.ParserCombinators.Parsec
 import Text.ParserCombinators.Pandoc
 import Text.Pandoc.Definition
 import Text.Pandoc.Shared 
-import Text.Pandoc.Entities ( decodeEntities, entityToChar )
+import Text.Pandoc.Entities ( characterEntity, decodeEntities )
 import Maybe ( fromMaybe )
 import Data.List ( intersect, takeWhile, dropWhile )
 import Data.Char ( toUpper, toLower, isAlphaNum )
@@ -391,14 +391,9 @@ text =  choice [ entity, strong, emph, code, str, linebreak, whitespace ] <?> "t
 special = choice [ link, image, rawHtmlInline ] <?> 
                  "link, inline html, or image"
 
-entity = try (do
-  char '&'
-  body <- choice [(many1 letter), (try (do
-                                          char '#'
-                                          num <- many1 digit
-                                          return ("#" ++ num)))]
-  char ';'
-  return (Str [fromMaybe '?' (entityToChar ("&" ++ body ++ ";"))]))
+entity = do
+  ent <- characterEntity
+  return $ Str [ent]
 
 code = try (do 
   htmlTag "code"
@@ -439,7 +434,7 @@ linebreak = do
 
 str = do 
   result <- many1 (noneOf "<& \t\n")
-  return (Str (decodeEntities result))
+  return (Str result)
 
 --
 -- links and images
diff --git a/src/Text/Pandoc/Readers/Markdown.hs b/src/Text/Pandoc/Readers/Markdown.hs
index 3aa0a6f12..9b3f047e9 100644
--- a/src/Text/Pandoc/Readers/Markdown.hs
+++ b/src/Text/Pandoc/Readers/Markdown.hs
@@ -42,7 +42,7 @@ import Text.Pandoc.Readers.HTML ( rawHtmlBlock,
                                   anyHtmlTag, anyHtmlEndTag,
                                   htmlEndTag, extractTagType,
                                   htmlBlockElement )
-import Text.Pandoc.Entities ( decodeEntities )
+import Text.Pandoc.Entities ( characterEntity )
 import Text.ParserCombinators.Parsec
 
 -- | Read markdown from an input string and return a Pandoc document.
@@ -88,12 +88,13 @@ blockQuoteChar = '>'
 hyphenChar = '-'
 ellipsesChar = '.'
 listColSepChar = '|'
+entityStart = '&'
 
 -- treat these as potentially non-text when parsing inline:
 specialChars = [escapeChar, labelStart, labelEnd, emphStart, emphEnd,
                 emphStartAlt, emphEndAlt, codeStart, codeEnd, autoLinkEnd,
                 autoLinkStart, mathStart, mathEnd, imageStart, noteStart,
-                hyphenChar, ellipsesChar] ++ quoteChars
+                hyphenChar, ellipsesChar, entityStart] ++ quoteChars
 
 --
 -- auxiliary functions
@@ -674,7 +675,7 @@ text = choice [ escapedChar, math, strong, emph, smartPunctuation,
                 code, ltSign, symbol,
                 str, linebreak, tabchar, whitespace, endline ] <?> "text"
 
-inline = choice [ rawLaTeXInline', escapedChar, special, text ] <?> "inline"
+inline = choice [ rawLaTeXInline', escapedChar, entity, special, text ] <?> "inline"
 
 special = choice [ noteRef, inlineNote, link, referenceLink, rawHtmlInline', 
                    autoLink, image ] <?> "link, inline html, note, or image"
@@ -827,9 +828,13 @@ linebreak = try (do
 
 nonEndline = noneOf endLineChars
 
+entity = do
+  ent <- characterEntity
+  return $ Str [ent]
+
 str = do 
   result <- many1 ((noneOf (specialChars ++ spaceChars ++ endLineChars))) 
-  return (Str (decodeEntities result))
+  return (Str result)
 
 -- an endline character that can be treated as a space, not a structural break
 endline = try (do
author	fiddlosopher <fiddlosopher@788f1e2b-df1e-0410-8736-df70ead52e1b>	2007-01-27 03:04:40 +0000
committer	fiddlosopher <fiddlosopher@788f1e2b-df1e-0410-8736-df70ead52e1b>	2007-01-27 03:04:40 +0000
commit	d06417125dd4d8cb177abd2d472c0c1cad4c49be (patch)
tree	a3a6d7991f01e4184b678a5fcce023947d015451 /src/Text/Pandoc/Readers
parent	f2de08864ee9870147839389554b9353631d21f8 (diff)
download	pandoc-d06417125dd4d8cb177abd2d472c0c1cad4c49be.tar.gz