diff options
author | fiddlosopher <fiddlosopher@788f1e2b-df1e-0410-8736-df70ead52e1b> | 2007-01-28 00:04:43 +0000 |
---|---|---|
committer | fiddlosopher <fiddlosopher@788f1e2b-df1e-0410-8736-df70ead52e1b> | 2007-01-28 00:04:43 +0000 |
commit | dc6925542c6aa60078c370e7e356b42ea216b1b7 (patch) | |
tree | 2fdb83d7f353da6687377ea4f16a5ee8f059d1a2 /src/Text/Pandoc/Readers | |
parent | 21484713c6745e56d92aecba620be44de8d32770 (diff) | |
download | pandoc-dc6925542c6aa60078c370e7e356b42ea216b1b7.tar.gz |
+ Simplified entity handling by removing stringToSGML from Entities.hs.
It is no longer needed now that all entities are processed in the markdown
and HTML readers. All calls to stringToSGML have been replaced by calls
to encodeEntities.
+ Since inTag's attribute handling already encodes entities,
calls to encodeEntities are no longer needed for attribute values, so
they've been removed.
+ The HTML and Markdown readers now call decodeEntities on all raw
strings (e.g. authors, dates, link titles), to ensure that no unprocessed
entities are included in the native representation of the document.
(In the HTML reader, most of this work is done by a change in
extractAttribute.)
+ The result is a small speed improvement (around 5% on my benchmark)
and cleaner code.
git-svn-id: https://pandoc.googlecode.com/svn/trunk@519 788f1e2b-df1e-0410-8736-df70ead52e1b
Diffstat (limited to 'src/Text/Pandoc/Readers')
-rw-r--r-- | src/Text/Pandoc/Readers/HTML.hs | 4 | ||||
-rw-r--r-- | src/Text/Pandoc/Readers/Markdown.hs | 8 |
2 files changed, 7 insertions, 5 deletions
diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs index fc06b657e..3fcb33698 100644 --- a/src/Text/Pandoc/Readers/HTML.hs +++ b/src/Text/Pandoc/Readers/HTML.hs @@ -445,7 +445,9 @@ extractAttribute name [] = Nothing extractAttribute name ((attrName, contents):rest) = let name' = map toLower name attrName' = map toLower attrName in - if (attrName' == name') then Just contents else extractAttribute name rest + if (attrName' == name') + then Just (decodeEntities contents) + else extractAttribute name rest link = try (do (tag, attributes) <- htmlTag "a" diff --git a/src/Text/Pandoc/Readers/Markdown.hs b/src/Text/Pandoc/Readers/Markdown.hs index 0f1ef348d..a7456426f 100644 --- a/src/Text/Pandoc/Readers/Markdown.hs +++ b/src/Text/Pandoc/Readers/Markdown.hs @@ -42,7 +42,7 @@ import Text.Pandoc.Readers.HTML ( rawHtmlBlock, anyHtmlTag, anyHtmlEndTag, htmlEndTag, extractTagType, htmlBlockElement ) -import Text.Pandoc.Entities ( characterEntity ) +import Text.Pandoc.Entities ( characterEntity, decodeEntities ) import Text.ParserCombinators.Parsec -- | Read markdown from an input string and return a Pandoc document. @@ -144,14 +144,14 @@ authorsLine = try (do skipSpaces authors <- sepEndBy (many1 (noneOf ",;\n")) (oneOf ",;") newline - return (map removeLeadingTrailingSpace authors)) + return (map (decodeEntities . removeLeadingTrailingSpace) authors)) dateLine = try (do char '%' skipSpaces date <- many (noneOf "\n") newline - return (removeTrailingSpace date)) + return (decodeEntities $ removeTrailingSpace date)) titleBlock = try (do failIfStrict @@ -894,7 +894,7 @@ titleWith startChar endChar = try (do char endChar skipSpaces notFollowedBy (noneOf ")\n"))) - return tit) + return $ decodeEntities tit) title = choice [ titleWith '(' ')', titleWith '"' '"', |