about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: fiddlosopher <fiddlosopher@788f1e2b-df1e-0410-8736-df70ead52e1b> 2007-01-27 22:13:11 +0000
committer: fiddlosopher <fiddlosopher@788f1e2b-df1e-0410-8736-df70ead52e1b> 2007-01-27 22:13:11 +0000
commit: 141affdb5140478464bf3c7331f6be4cf9454dd6 (patch)
tree: cf2db2443f46fd355bf62c0b0d4ea703d698fb1d
parent: d06417125dd4d8cb177abd2d472c0c1cad4c49be (diff)
download: pandoc-141affdb5140478464bf3c7331f6be4cf9454dd6.tar.gz
More changes in entity handling: Instead of using entities for characters
above 127 (i.e. non-ASCII characters) in HTML and Docbook output, we now just
use Unicode. After all, we're declaring UTF-8 content in the header. This
makes the HTML and Docbook files produced by pandoc much more readable and
editable.

Changes to Entities.hs:
+ Removed specialCharToEntity
+ Added escapeSGMLChar (which just escapes the basic four, <>&")
+ Modified encodeEntities and stringToSGML to use escapeSGMLChar
+ Removed encodeEntitiesNumerical
+ Rewrote encodeEntities for better performance
+ Rewrote stringToSGML for better performance

git-svn-id: https://pandoc.googlecode.com/svn/trunk@516 788f1e2b-df1e-0410-8736-df70ead52e1b
-rw-r--r--  src/Text/Pandoc/Entities.hs          61
-rw-r--r--  src/Text/Pandoc/Shared.hs             4
-rw-r--r--  src/Text/Pandoc/Writers/Docbook.hs   20
-rw-r--r--  src/Text/Pandoc/Writers/HTML.hs      18
-rw-r--r--  tests/writer.docbook                 60
-rw-r--r--  tests/writer.html                    10
6 files changed, 86 insertions, 87 deletions
diff --git a/src/Text/Pandoc/Entities.hs b/src/Text/Pandoc/Entities.hs
index 3e68db35c..696f943a6 100644
--- a/src/Text/Pandoc/Entities.hs
+++ b/src/Text/Pandoc/Entities.hs
@@ -31,9 +31,9 @@ and vice versa.
module Text.Pandoc.Entities (
charToEntity,
charToNumericalEntity,
- specialCharToEntity,
encodeEntities,
decodeEntities,
+ escapeSGMLChar,
stringToSGML,
characterEntity
) where
@@ -54,17 +54,6 @@ charToEntity char =
charToNumericalEntity :: Char -> String
charToNumericalEntity ch = "&#" ++ show (ord ch) ++ ";"
--- | Escape special character to SGML entity.
-specialCharToEntity :: Bool -- ^ Use numerical entities only.
- -> Char -- ^ Character to convert.
- -> [Char]
-specialCharToEntity numericalEntities c =
- if (c `elem` "&<>\"") || (ord c > 127)
- then if numericalEntities
- then charToNumericalEntity c
- else charToEntity c
- else [c]
-
-- | Parse SGML character entity.
characterEntity :: GenParser Char st Char
characterEntity = namedEntity <|> hexEntity <|> decimalEntity <?> "SGML entity"
@@ -97,18 +86,27 @@ decimalEntity = try $ do
end <- char ';'
return $ chr $ read body
--- | Escape string as needed for SGML. Entity references are not preserved.
-encodeEntities :: Bool -- ^ Use only numerical entities.
- -> String -- ^ String to convert.
- -> String
-encodeEntities numericalEntities =
- concatMap (specialCharToEntity numericalEntities)
+-- | Escape one character as needed for SGML.
+escapeSGMLChar :: Char -> String
+escapeSGMLChar x =
+ case x of
+ '&' -> "&amp;"
+ '<' -> "&lt;"
+ '>' -> "&gt;"
+ '"' -> "&quot;"
+ c -> [c]
--- | Escape string as needed for SGML, using only numerical entities.
--- Entity references are not preserved.
-encodeEntitiesNumerical :: String -> String
-encodeEntitiesNumerical =
- concatMap (\c -> "&#" ++ show (ord c) ++ ";")
+-- | True if the character needs to be escaped.
+needsEscaping :: Char -> Bool
+needsEscaping c = c `elem` "&<>\""
+
+-- | Escape string as needed for SGML. Entity references are not preserved.
+encodeEntities :: String -> String
+encodeEntities "" = ""
+encodeEntities str =
+ case break needsEscaping str of
+ (okay, "") -> okay
+ (okay, (c:cs)) -> okay ++ escapeSGMLChar c ++ encodeEntities cs
-- | Convert entities in a string to characters.
decodeEntities :: String -> String
@@ -118,18 +116,19 @@ decodeEntities str =
Right result -> result
-- | Escape string for SGML, preserving entity references.
-stringToSGML :: Bool -- ^ Use only numerical entities.
- -> String -- ^ String to convert.
- -> String
-stringToSGML numericalEntities str =
- let nonentity = do
+stringToSGML :: String -> String
+stringToSGML str =
+ let regular = do
+ str <- many1 (satisfy (not . needsEscaping))
+ return str
+ special = do
notFollowedBy characterEntity
c <- anyChar
- return $ specialCharToEntity numericalEntities c
+ return $ escapeSGMLChar c
entity = do
ent <- manyTill anyChar (char ';')
- return (ent ++ ";") in
- case parse (many (nonentity <|> entity)) str str of
+ return (ent ++ ";") in
+ case parse (many (regular <|> special <|> entity)) str str of
Left err -> error $ "\nError: " ++ show err
Right result -> concat result
diff --git a/src/Text/Pandoc/Shared.hs b/src/Text/Pandoc/Shared.hs
index f63ca4ce4..02f8782b2 100644
--- a/src/Text/Pandoc/Shared.hs
+++ b/src/Text/Pandoc/Shared.hs
@@ -524,8 +524,8 @@ replaceRefLinksInline keytable other = other
-- | Return a text object with a string of formatted SGML attributes.
attributeList :: [(String, String)] -> Doc
attributeList = text . concatMap
- (\(a, b) -> " " ++ stringToSGML True a ++ "=\"" ++
- stringToSGML True b ++ "\"")
+ (\(a, b) -> " " ++ stringToSGML a ++ "=\"" ++
+ stringToSGML b ++ "\"")
-- | Put the supplied contents between start and end tags of tagType,
-- with specified attributes and (if specified) indentation.
diff --git a/src/Text/Pandoc/Writers/Docbook.hs b/src/Text/Pandoc/Writers/Docbook.hs
index 0c83d0ea0..1e0690c22 100644
--- a/src/Text/Pandoc/Writers/Docbook.hs
+++ b/src/Text/Pandoc/Writers/Docbook.hs
@@ -64,8 +64,8 @@ authorToDocbook name = inTagsIndented "author" $
then -- last name first
let (lastname, rest) = break (==',') name
firstname = removeLeadingSpace rest in
- inTagsSimple "firstname" (text $ stringToSGML True firstname) <>
- inTagsSimple "surname" (text $ stringToSGML True lastname)
+ inTagsSimple "firstname" (text $ stringToSGML firstname) <>
+ inTagsSimple "surname" (text $ stringToSGML lastname)
else -- last name last
let namewords = words name
lengthname = length namewords
@@ -73,8 +73,8 @@ authorToDocbook name = inTagsIndented "author" $
0 -> ("","")
1 -> ("", name)
n -> (joinWithSep " " (take (n-1) namewords), last namewords) in
- inTagsSimple "firstname" (text $ stringToSGML True firstname) $$
- inTagsSimple "surname" (text $ stringToSGML True lastname)
+ inTagsSimple "firstname" (text $ stringToSGML firstname) $$
+ inTagsSimple "surname" (text $ stringToSGML lastname)
-- | Convert Pandoc document to string in Docbook format.
writeDocbook :: WriterOptions -> Pandoc -> String
@@ -86,7 +86,7 @@ writeDocbook opts (Pandoc (Meta title authors date) blocks) =
then inTagsIndented "articleinfo" $
(inTagsSimple "title" (wrap opts title)) $$
(vcat (map authorToDocbook authors)) $$
- (inTagsSimple "date" (text $ stringToSGML True date))
+ (inTagsSimple "date" (text $ stringToSGML date))
else empty
blocks' = replaceReferenceLinks blocks
(noteBlocks, blocks'') = partition isNoteBlock blocks'
@@ -141,7 +141,7 @@ blockToDocbook opts (Para lst) =
blockToDocbook opts (BlockQuote blocks) =
inTagsIndented "blockquote" (blocksToDocbook opts blocks)
blockToDocbook opts (CodeBlock str) =
- text "<screen>\n" <> text (encodeEntities True str) <> text "\n</screen>"
+ text "<screen>\n" <> text (encodeEntities str) <> text "\n</screen>"
blockToDocbook opts (BulletList lst) =
inTagsIndented "itemizedlist" $ listItemsToDocbook opts lst
blockToDocbook opts (OrderedList lst) =
@@ -198,7 +198,7 @@ inlinesToDocbook opts lst = hcat (map (inlineToDocbook opts) lst)
-- | Convert an inline element to Docbook.
inlineToDocbook :: WriterOptions -> Inline -> Doc
-inlineToDocbook opts (Str str) = text $ stringToSGML True str
+inlineToDocbook opts (Str str) = text $ stringToSGML str
inlineToDocbook opts (Emph lst) =
inTagsSimple "emphasis" (inlinesToDocbook opts lst)
inlineToDocbook opts (Strong lst) =
@@ -211,7 +211,7 @@ inlineToDocbook opts Ellipses = text "&#8230;"
inlineToDocbook opts EmDash = text "&#8212;"
inlineToDocbook opts EnDash = text "&#8211;"
inlineToDocbook opts (Code str) =
- inTagsSimple "literal" $ text (encodeEntities True str)
+ inTagsSimple "literal" $ text (encodeEntities str)
inlineToDocbook opts (TeX str) = inlineToDocbook opts (Code str)
inlineToDocbook opts (HtmlInline str) = empty
inlineToDocbook opts LineBreak =
@@ -219,7 +219,7 @@ inlineToDocbook opts LineBreak =
inlineToDocbook opts Space = char ' '
inlineToDocbook opts (Link txt (Src src tit)) =
if isPrefixOf "mailto:" src
- then inTagsSimple "email" $ text (encodeEntities True $ drop 7 src)
+ then inTagsSimple "email" $ text (encodeEntities $ drop 7 src)
else inTags False "ulink" [("url", src)] $ inlinesToDocbook opts txt
inlineToDocbook opts (Link text (Ref ref)) = empty -- shouldn't occur
inlineToDocbook opts (Image alt (Src src tit)) =
@@ -227,7 +227,7 @@ inlineToDocbook opts (Image alt (Src src tit)) =
then empty
else inTagsIndented "objectinfo" $
inTagsIndented "title"
- (text $ stringToSGML True tit) in
+ (text $ stringToSGML tit) in
inTagsIndented "inlinemediaobject" $
inTagsIndented "imageobject" $
titleDoc $$ selfClosingTag "imagedata" [("fileref", src)]
diff --git a/src/Text/Pandoc/Writers/HTML.hs b/src/Text/Pandoc/Writers/HTML.hs
index 7c89d6352..196aafad3 100644
--- a/src/Text/Pandoc/Writers/HTML.hs
+++ b/src/Text/Pandoc/Writers/HTML.hs
@@ -127,11 +127,11 @@ htmlHeader opts (Meta title authors date) =
then empty
else selfClosingTag "meta" [("name", "author"),
("content",
- joinWithSep ", " (map (stringToSGML False) authors))]
+ joinWithSep ", " (map stringToSGML authors))]
datetext = if (date == "")
then empty
else selfClosingTag "meta" [("name", "date"),
- ("content", stringToSGML False date)] in
+ ("content", stringToSGML date)] in
text (writerHeader opts) $$ authortext $$ datetext $$ titletext $$
text "</head>\n<body>"
@@ -168,7 +168,7 @@ blockToHtml opts (Note ref lst) =
(text "&#8617;")
blockToHtml opts (Key _ _) = empty
blockToHtml opts (CodeBlock str) =
- text "<pre><code>" <> text (encodeEntities False str) <> text "\n</code></pre>"
+ text "<pre><code>" <> text (encodeEntities str) <> text "\n</code></pre>"
blockToHtml opts (RawHtml str) = text str
blockToHtml opts (BulletList lst) =
let attribs = if (writerIncremental opts)
@@ -234,7 +234,7 @@ inlineToHtml opts (Emph lst) =
inlineToHtml opts (Strong lst) =
inTagsSimple "strong" (inlineListToHtml opts lst)
inlineToHtml opts (Code str) =
- inTagsSimple "code" $ text (encodeEntities False str)
+ inTagsSimple "code" $ text (encodeEntities str)
inlineToHtml opts (Quoted SingleQuote lst) =
text "&lsquo;" <> (inlineListToHtml opts lst) <> text "&rsquo;"
inlineToHtml opts (Quoted DoubleQuote lst) =
@@ -243,16 +243,16 @@ inlineToHtml opts EmDash = text "&mdash;"
inlineToHtml opts EnDash = text "&ndash;"
inlineToHtml opts Ellipses = text "&hellip;"
inlineToHtml opts Apostrophe = text "&rsquo;"
-inlineToHtml opts (Str str) = text $ stringToSGML False str
-inlineToHtml opts (TeX str) = text $ encodeEntities False str
+inlineToHtml opts (Str str) = text $ stringToSGML str
+inlineToHtml opts (TeX str) = text $ encodeEntities str
inlineToHtml opts (HtmlInline str) = text str
inlineToHtml opts (LineBreak) = selfClosingTag "br" []
inlineToHtml opts Space = space
inlineToHtml opts (Link txt (Src src tit)) =
- let title = stringToSGML False tit in
+ let title = stringToSGML tit in
if (isPrefixOf "mailto:" src)
then obfuscateLink opts txt src
- else inTags False "a" ([("href", encodeEntities False src)] ++
+ else inTags False "a" ([("href", encodeEntities src)] ++
if null tit then [] else [("title", title)])
(inlineListToHtml opts txt)
inlineToHtml opts (Link txt (Ref ref)) =
@@ -260,7 +260,7 @@ inlineToHtml opts (Link txt (Ref ref)) =
(inlineListToHtml opts ref) <> char ']'
-- this is what markdown does, for better or worse
inlineToHtml opts (Image alt (Src source tit)) =
- let title = stringToSGML False tit
+ let title = stringToSGML tit
alternate = render $ inlineListToHtml opts alt in
selfClosingTag "img" $ [("src", source)] ++
(if null alternate then [] else [("alt", alternate)]) ++
diff --git a/tests/writer.docbook b/tests/writer.docbook
index 600c52d6a..675a115cc 100644
--- a/tests/writer.docbook
+++ b/tests/writer.docbook
@@ -89,7 +89,7 @@
</para>
<screen>
sub status {
- print &#34;working&#34;;
+ print &quot;working&quot;;
}
</screen>
<para>
@@ -122,7 +122,7 @@ sub status {
</blockquote>
</blockquote>
<para>
- This should not be a block quote: 2 &#62; 1.
+ This should not be a block quote: 2 &gt; 1.
</para>
<para>
Box-style:
@@ -133,7 +133,7 @@ sub status {
</para>
<screen>
sub status {
- print &#34;working&#34;;
+ print &quot;working&quot;;
}
</screen>
</blockquote>
@@ -177,7 +177,7 @@ sub status {
---- (should be four hyphens)
sub status {
- print &#34;working&#34;;
+ print &quot;working&quot;;
}
this code block is indented by one tab
@@ -188,7 +188,7 @@ this code block is indented by one tab
<screen>
this code block is indented by two tabs
-These should not be escaped: \$ \\ \&#62; \[ \{
+These should not be escaped: \$ \\ \&gt; \[ \{
</screen>
</section>
<section>
@@ -577,9 +577,9 @@ These should not be escaped: \$ \\ \&#62; \[ \{
word.
</para>
<para>
- This is code: <literal>&#62;</literal>, <literal>$</literal>,
+ This is code: <literal>&gt;</literal>, <literal>$</literal>,
<literal>\</literal>, <literal>\$</literal>,
- <literal>&#60;html&#62;</literal>.
+ <literal>&lt;html&gt;</literal>.
</para>
</section>
<section>
@@ -602,7 +602,7 @@ These should not be escaped: \$ \\ \&#62; \[ \{
</para>
<para>
Here is some quoted <quote><literal>code</literal></quote> and a
- <quote><ulink url="http://example.com/?foo=1&#38;bar=2">quoted link</ulink></quote>.
+ <quote><ulink url="http://example.com/?foo=1&amp;bar=2">quoted link</ulink></quote>.
</para>
<para>
Some dashes: one&#8212;two&#8212;three&#8212;four&#8212;five.
@@ -691,9 +691,9 @@ These should not be escaped: \$ \\ \&#62; \[ \{
</para>
<para>
<literal>\begin{tabular}{|l|l|}\hline
-Animal &#38; Number \\ \hline
-Dog &#38; 2 \\
-Cat &#38; 1 \\ \hline
+Animal &amp; Number \\ \hline
+Dog &amp; 2 \\
+Cat &amp; 1 \\ \hline
\end{tabular}</literal>
</para>
</section>
@@ -705,44 +705,44 @@ Cat &#38; 1 \\ \hline
<itemizedlist>
<listitem>
<para>
- I hat: &#206;
+ I hat: Î
</para>
</listitem>
<listitem>
<para>
- o umlaut: &#246;
+ o umlaut: ö
</para>
</listitem>
<listitem>
<para>
- section: &#167;
+ section: §
</para>
</listitem>
<listitem>
<para>
- set membership: &#8712;
+ set membership: ∈
</para>
</listitem>
<listitem>
<para>
- copyright: &#169;
+ copyright: ©
</para>
</listitem>
</itemizedlist>
<para>
- AT&#38;T has an ampersand in their name.
+ AT&amp;T has an ampersand in their name.
</para>
<para>
- AT&#38;T is another way to write it.
+ AT&amp;T is another way to write it.
</para>
<para>
- This &#38; that.
+ This &amp; that.
</para>
<para>
- 4 &#60; 5.
+ 4 &lt; 5.
</para>
<para>
- 6 &#62; 5.
+ 6 &gt; 5.
</para>
<para>
Backslash: \
@@ -775,7 +775,7 @@ Cat &#38; 1 \\ \hline
Right paren: )
</para>
<para>
- Greater-than: &#62;
+ Greater-than: &gt;
</para>
<para>
Hash: #
@@ -868,25 +868,25 @@ Cat &#38; 1 \\ \hline
<title>With ampersands</title>
<para>
Here's a
- <ulink url="http://example.com/?foo=1&#38;bar=2">link with an ampersand in the URL</ulink>.
+ <ulink url="http://example.com/?foo=1&amp;bar=2">link with an ampersand in the URL</ulink>.
</para>
<para>
Here's a link with an amersand in the link text:
- <ulink url="http://att.com/">AT&#38;T</ulink>.
+ <ulink url="http://att.com/">AT&amp;T</ulink>.
</para>
<para>
- Here's an <ulink url="/script?foo=1&#38;bar=2">inline link</ulink>.
+ Here's an <ulink url="/script?foo=1&amp;bar=2">inline link</ulink>.
</para>
<para>
Here's an
- <ulink url="/script?foo=1&#38;bar=2">inline link in pointy braces</ulink>.
+ <ulink url="/script?foo=1&amp;bar=2">inline link in pointy braces</ulink>.
</para>
</section>
<section>
<title>Autolinks</title>
<para>
With an ampersand:
- <ulink url="http://example.com/?foo=1&#38;bar=2">http://example.com/?foo=1&#38;bar=2</ulink>
+ <ulink url="http://example.com/?foo=1&amp;bar=2">http://example.com/?foo=1&amp;bar=2</ulink>
</para>
<itemizedlist>
<listitem>
@@ -916,10 +916,10 @@ Cat &#38; 1 \\ \hline
</blockquote>
<para>
Auto-links should not occur here:
- <literal>&#60;http://example.com/&#62;</literal>
+ <literal>&lt;http://example.com/&gt;</literal>
</para>
<screen>
-or here: &#60;http://example.com/&#62;
+or here: &lt;http://example.com/&gt;
</screen>
</section>
</section>
@@ -970,7 +970,7 @@ or here: &#60;http://example.com/&#62;
footnote (as with list items).
</para>
<screen>
- { &#60;code&#62; }
+ { &lt;code&gt; }
</screen>
<para>
If you want, you can indent every line, but you can also be lazy
diff --git a/tests/writer.html b/tests/writer.html
index 3213bb9ce..356e4cb3e 100644
--- a/tests/writer.html
+++ b/tests/writer.html
@@ -557,11 +557,11 @@ Cat &amp; 1 \\ \hline
Here is some unicode:
</p>
<ul>
- <li>I hat: &Icirc;</li>
- <li>o umlaut: &ouml;</li>
- <li>section: &sect;</li>
- <li>set membership: &isin;</li>
- <li>copyright: &copy;</li>
+ <li>I hat: Î</li>
+ <li>o umlaut: ö</li>
+ <li>section: §</li>
+ <li>set membership: ∈</li>
+ <li>copyright: ©</li>
</ul>
<p>
AT&amp;T has an ampersand in their name.