diff options
author | fiddlosopher <fiddlosopher@788f1e2b-df1e-0410-8736-df70ead52e1b> | 2007-01-27 03:04:40 +0000 |
---|---|---|
committer | fiddlosopher <fiddlosopher@788f1e2b-df1e-0410-8736-df70ead52e1b> | 2007-01-27 03:04:40 +0000 |
commit | d06417125dd4d8cb177abd2d472c0c1cad4c49be (patch) | |
tree | a3a6d7991f01e4184b678a5fcce023947d015451 /src/Text/Pandoc/Readers | |
parent | f2de08864ee9870147839389554b9353631d21f8 (diff) | |
download | pandoc-d06417125dd4d8cb177abd2d472c0c1cad4c49be.tar.gz |
Changes in entity handling:
+ Entities are parsed (and Unicode characters returned) in both
Markdown and HTML readers.
+ Parsers characterEntity, namedEntity, decimalEntity, hexEntity added
to Entities.hs; these parse a string and return a Unicode character.
+ Changed 'entity' parser in HTML reader to use the 'characterEntity'
parser from Entities.hs.
+ Added new 'entity' parser to Markdown reader, and added '&' as a
special character. Adjusted test suite accordingly since now we
get 'Str "AT",Str "&",Str "T"' instead of 'Str "AT&T"'.
+ stringToSGML moved to Entities.hs. escapeSGML removed as redundant,
given encodeEntities.
+ stringToSGML, encodeEntities, and specialCharToEntity are given a
boolean parameter that causes only numerical entities to be used.
This is used in the docbook writer. The HTML writer uses named
entities where possible, but not all docbook-consumers know about
the named entities without special instructions, so it seems safer
to use numerical entities there.
+ decodeEntities is rewritten in a way that avoids Text.Regex, using
the new parsers.
+ charToEntity and charToNumericalEntity added to Entities.hs.
+ Moved specialCharToEntity from Shared.hs to Entities.hs.
+ Removed unneeded 'decodeEntities' from 'str' parser in HTML and
Markdown readers.
+ Removed sgmlHexEntity, sgmlDecimalEntity, sgmlNamedEntity, and
sgmlCharacterEntity from Shared.hs.
+ Modified Docbook writer so that it doesn't rely on Text.Regex for
detecting "mailto" links.
git-svn-id: https://pandoc.googlecode.com/svn/trunk@515 788f1e2b-df1e-0410-8736-df70ead52e1b
Diffstat (limited to 'src/Text/Pandoc/Readers')
-rw-r--r-- | src/Text/Pandoc/Readers/HTML.hs | 15 | ||||
-rw-r--r-- | src/Text/Pandoc/Readers/Markdown.hs | 13 |
2 files changed, 14 insertions, 14 deletions
diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs index 135a90ea8..fc06b657e 100644 --- a/src/Text/Pandoc/Readers/HTML.hs +++ b/src/Text/Pandoc/Readers/HTML.hs @@ -44,7 +44,7 @@ import Text.ParserCombinators.Parsec import Text.ParserCombinators.Pandoc import Text.Pandoc.Definition import Text.Pandoc.Shared -import Text.Pandoc.Entities ( decodeEntities, entityToChar ) +import Text.Pandoc.Entities ( characterEntity, decodeEntities ) import Maybe ( fromMaybe ) import Data.List ( intersect, takeWhile, dropWhile ) import Data.Char ( toUpper, toLower, isAlphaNum ) @@ -391,14 +391,9 @@ text = choice [ entity, strong, emph, code, str, linebreak, whitespace ] <?> "t special = choice [ link, image, rawHtmlInline ] <?> "link, inline html, or image" -entity = try (do - char '&' - body <- choice [(many1 letter), (try (do - char '#' - num <- many1 digit - return ("#" ++ num)))] - char ';' - return (Str [fromMaybe '?' (entityToChar ("&" ++ body ++ ";"))])) +entity = do + ent <- characterEntity + return $ Str [ent] code = try (do htmlTag "code" @@ -439,7 +434,7 @@ linebreak = do str = do result <- many1 (noneOf "<& \t\n") - return (Str (decodeEntities result)) + return (Str result) -- -- links and images diff --git a/src/Text/Pandoc/Readers/Markdown.hs b/src/Text/Pandoc/Readers/Markdown.hs index 3aa0a6f12..9b3f047e9 100644 --- a/src/Text/Pandoc/Readers/Markdown.hs +++ b/src/Text/Pandoc/Readers/Markdown.hs @@ -42,7 +42,7 @@ import Text.Pandoc.Readers.HTML ( rawHtmlBlock, anyHtmlTag, anyHtmlEndTag, htmlEndTag, extractTagType, htmlBlockElement ) -import Text.Pandoc.Entities ( decodeEntities ) +import Text.Pandoc.Entities ( characterEntity ) import Text.ParserCombinators.Parsec -- | Read markdown from an input string and return a Pandoc document. @@ -88,12 +88,13 @@ blockQuoteChar = '>' hyphenChar = '-' ellipsesChar = '.' 
listColSepChar = '|' +entityStart = '&' -- treat these as potentially non-text when parsing inline: specialChars = [escapeChar, labelStart, labelEnd, emphStart, emphEnd, emphStartAlt, emphEndAlt, codeStart, codeEnd, autoLinkEnd, autoLinkStart, mathStart, mathEnd, imageStart, noteStart, - hyphenChar, ellipsesChar] ++ quoteChars + hyphenChar, ellipsesChar, entityStart] ++ quoteChars -- -- auxiliary functions @@ -674,7 +675,7 @@ text = choice [ escapedChar, math, strong, emph, smartPunctuation, code, ltSign, symbol, str, linebreak, tabchar, whitespace, endline ] <?> "text" -inline = choice [ rawLaTeXInline', escapedChar, special, text ] <?> "inline" +inline = choice [ rawLaTeXInline', escapedChar, entity, special, text ] <?> "inline" special = choice [ noteRef, inlineNote, link, referenceLink, rawHtmlInline', autoLink, image ] <?> "link, inline html, note, or image" @@ -827,9 +828,13 @@ linebreak = try (do nonEndline = noneOf endLineChars +entity = do + ent <- characterEntity + return $ Str [ent] + str = do result <- many1 ((noneOf (specialChars ++ spaceChars ++ endLineChars))) - return (Str (decodeEntities result)) + return (Str result) -- an endline character that can be treated as a space, not a structural break endline = try (do |