aboutsummaryrefslogtreecommitdiff
path: root/src/Text/Pandoc/Readers
diff options
context:
space:
mode:
authorfiddlosopher <fiddlosopher@788f1e2b-df1e-0410-8736-df70ead52e1b>2007-01-27 03:04:40 +0000
committerfiddlosopher <fiddlosopher@788f1e2b-df1e-0410-8736-df70ead52e1b>2007-01-27 03:04:40 +0000
commitd06417125dd4d8cb177abd2d472c0c1cad4c49be (patch)
treea3a6d7991f01e4184b678a5fcce023947d015451 /src/Text/Pandoc/Readers
parentf2de08864ee9870147839389554b9353631d21f8 (diff)
downloadpandoc-d06417125dd4d8cb177abd2d472c0c1cad4c49be.tar.gz
Changes in entity handling:
+ Entities are parsed (and unicode characters returned) in both Markdown and HTML readers. + Parsers characterEntity, namedEntity, decimalEntity, hexEntity added to Entities.hs; these parse a string and return a unicode character. + Changed 'entity' parser in HTML reader to use the 'characterEntity' parser from Entities.hs. + Added new 'entity' parser to Markdown reader, and added '&' as a special character. Adjusted test suite accordingly since now we get 'Str "AT",Str "&",Str "T"' instead of 'Str "AT&T".. + stringToSGML moved to Entities.hs. escapeSGML removed as redundant, given encodeEntities. + stringToSGML, encodeEntities, and specialCharToEntity are given a boolean parameter that causes only numerical entities to be used. This is used in the docbook writer. The HTML writer uses named entities where possible, but not all docbook-consumers know about the named entities without special instructions, so it seems safer to use numerical entities there. + decodeEntities is rewritten in a way that avoids Text.Regex, using the new parsers. + charToEntity and charToNumericalEntity added to Entities.hs. + Moved specialCharToEntity from Shared.hs to Entities.hs. + Removed unneeded 'decodeEntities' from 'str' parser in HTML and Markdown readers. + Removed sgmlHexEntity, sgmlDecimalEntity, sgmlNamedEntity, and sgmlCharacterEntity from Shared.hs. + Modified Docbook writer so that it doesn't rely on Text.Regex for detecting "mailto" links. git-svn-id: https://pandoc.googlecode.com/svn/trunk@515 788f1e2b-df1e-0410-8736-df70ead52e1b
Diffstat (limited to 'src/Text/Pandoc/Readers')
-rw-r--r--src/Text/Pandoc/Readers/HTML.hs15
-rw-r--r--src/Text/Pandoc/Readers/Markdown.hs13
2 files changed, 14 insertions, 14 deletions
diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs
index 135a90ea8..fc06b657e 100644
--- a/src/Text/Pandoc/Readers/HTML.hs
+++ b/src/Text/Pandoc/Readers/HTML.hs
@@ -44,7 +44,7 @@ import Text.ParserCombinators.Parsec
import Text.ParserCombinators.Pandoc
import Text.Pandoc.Definition
import Text.Pandoc.Shared
-import Text.Pandoc.Entities ( decodeEntities, entityToChar )
+import Text.Pandoc.Entities ( characterEntity, decodeEntities )
import Maybe ( fromMaybe )
import Data.List ( intersect, takeWhile, dropWhile )
import Data.Char ( toUpper, toLower, isAlphaNum )
@@ -391,14 +391,9 @@ text = choice [ entity, strong, emph, code, str, linebreak, whitespace ] <?> "t
special = choice [ link, image, rawHtmlInline ] <?>
"link, inline html, or image"
-entity = try (do
- char '&'
- body <- choice [(many1 letter), (try (do
- char '#'
- num <- many1 digit
- return ("#" ++ num)))]
- char ';'
- return (Str [fromMaybe '?' (entityToChar ("&" ++ body ++ ";"))]))
+entity = do
+ ent <- characterEntity
+ return $ Str [ent]
code = try (do
htmlTag "code"
@@ -439,7 +434,7 @@ linebreak = do
str = do
result <- many1 (noneOf "<& \t\n")
- return (Str (decodeEntities result))
+ return (Str result)
--
-- links and images
diff --git a/src/Text/Pandoc/Readers/Markdown.hs b/src/Text/Pandoc/Readers/Markdown.hs
index 3aa0a6f12..9b3f047e9 100644
--- a/src/Text/Pandoc/Readers/Markdown.hs
+++ b/src/Text/Pandoc/Readers/Markdown.hs
@@ -42,7 +42,7 @@ import Text.Pandoc.Readers.HTML ( rawHtmlBlock,
anyHtmlTag, anyHtmlEndTag,
htmlEndTag, extractTagType,
htmlBlockElement )
-import Text.Pandoc.Entities ( decodeEntities )
+import Text.Pandoc.Entities ( characterEntity )
import Text.ParserCombinators.Parsec
-- | Read markdown from an input string and return a Pandoc document.
@@ -88,12 +88,13 @@ blockQuoteChar = '>'
hyphenChar = '-'
ellipsesChar = '.'
listColSepChar = '|'
+entityStart = '&'
-- treat these as potentially non-text when parsing inline:
specialChars = [escapeChar, labelStart, labelEnd, emphStart, emphEnd,
emphStartAlt, emphEndAlt, codeStart, codeEnd, autoLinkEnd,
autoLinkStart, mathStart, mathEnd, imageStart, noteStart,
- hyphenChar, ellipsesChar] ++ quoteChars
+ hyphenChar, ellipsesChar, entityStart] ++ quoteChars
--
-- auxiliary functions
@@ -674,7 +675,7 @@ text = choice [ escapedChar, math, strong, emph, smartPunctuation,
code, ltSign, symbol,
str, linebreak, tabchar, whitespace, endline ] <?> "text"
-inline = choice [ rawLaTeXInline', escapedChar, special, text ] <?> "inline"
+inline = choice [ rawLaTeXInline', escapedChar, entity, special, text ] <?> "inline"
special = choice [ noteRef, inlineNote, link, referenceLink, rawHtmlInline',
autoLink, image ] <?> "link, inline html, note, or image"
@@ -827,9 +828,13 @@ linebreak = try (do
nonEndline = noneOf endLineChars
+entity = do
+ ent <- characterEntity
+ return $ Str [ent]
+
str = do
result <- many1 ((noneOf (specialChars ++ spaceChars ++ endLineChars)))
- return (Str (decodeEntities result))
+ return (Str result)
-- an endline character that can be treated as a space, not a structural break
endline = try (do