diff options
author | fiddlosopher <fiddlosopher@788f1e2b-df1e-0410-8736-df70ead52e1b> | 2007-01-27 22:13:11 +0000 |
---|---|---|
committer | fiddlosopher <fiddlosopher@788f1e2b-df1e-0410-8736-df70ead52e1b> | 2007-01-27 22:13:11 +0000 |
commit | 141affdb5140478464bf3c7331f6be4cf9454dd6 (patch) | |
tree | cf2db2443f46fd355bf62c0b0d4ea703d698fb1d | |
parent | d06417125dd4d8cb177abd2d472c0c1cad4c49be (diff) | |
download | pandoc-141affdb5140478464bf3c7331f6be4cf9454dd6.tar.gz |
More changes in entity handling: Instead of using entities for characters
above 128 in HTML and Docbook output, we now just use unicode. After all,
we're declaring UTF-8 content in the header. This makes the HTML and
docbook files produced by pandoc much more readable and editable.
Changes to Entities.hs:
+ Removed specialCharToEntity
+ Added escapeSGMLChar (which just escapes the basic four, <>&")
+ Modified encodeEntities and stringToSGML to use escapeSGMLChar
+ Removed encodeEntitiesNumerical
+ Rewrote encodeEntities for better performance
+ Rewrote stringToSGML for better performance
git-svn-id: https://pandoc.googlecode.com/svn/trunk@516 788f1e2b-df1e-0410-8736-df70ead52e1b
-rw-r--r-- | src/Text/Pandoc/Entities.hs | 61 | ||||
-rw-r--r-- | src/Text/Pandoc/Shared.hs | 4 | ||||
-rw-r--r-- | src/Text/Pandoc/Writers/Docbook.hs | 20 | ||||
-rw-r--r-- | src/Text/Pandoc/Writers/HTML.hs | 18 | ||||
-rw-r--r-- | tests/writer.docbook | 60 | ||||
-rw-r--r-- | tests/writer.html | 10 |
6 files changed, 86 insertions, 87 deletions
diff --git a/src/Text/Pandoc/Entities.hs b/src/Text/Pandoc/Entities.hs index 3e68db35c..696f943a6 100644 --- a/src/Text/Pandoc/Entities.hs +++ b/src/Text/Pandoc/Entities.hs @@ -31,9 +31,9 @@ and vice versa. module Text.Pandoc.Entities ( charToEntity, charToNumericalEntity, - specialCharToEntity, encodeEntities, decodeEntities, + escapeSGMLChar, stringToSGML, characterEntity ) where @@ -54,17 +54,6 @@ charToEntity char = charToNumericalEntity :: Char -> String charToNumericalEntity ch = "&#" ++ show (ord ch) ++ ";" --- | Escape special character to SGML entity. -specialCharToEntity :: Bool -- ^ Use numerical entities only. - -> Char -- ^ Character to convert. - -> [Char] -specialCharToEntity numericalEntities c = - if (c `elem` "&<>\"") || (ord c > 127) - then if numericalEntities - then charToNumericalEntity c - else charToEntity c - else [c] - -- | Parse SGML character entity. characterEntity :: GenParser Char st Char characterEntity = namedEntity <|> hexEntity <|> decimalEntity <?> "SGML entity" @@ -97,18 +86,27 @@ decimalEntity = try $ do end <- char ';' return $ chr $ read body --- | Escape string as needed for SGML. Entity references are not preserved. -encodeEntities :: Bool -- ^ Use only numerical entities. - -> String -- ^ String to convert. - -> String -encodeEntities numericalEntities = - concatMap (specialCharToEntity numericalEntities) +-- | Escape one character as needed for SGML. +escapeSGMLChar :: Char -> String +escapeSGMLChar x = + case x of + '&' -> "&" + '<' -> "<" + '>' -> ">" + '"' -> """ + c -> [c] --- | Escape string as needed for SGML, using only numerical entities. --- Entity references are not preserved. -encodeEntitiesNumerical :: String -> String -encodeEntitiesNumerical = - concatMap (\c -> "&#" ++ show (ord c) ++ ";") +-- | True if the character needs to be escaped. +needsEscaping :: Char -> Bool +needsEscaping c = c `elem` "&<>\"" + +-- | Escape string as needed for SGML. Entity references are not preserved. +encodeEntities :: String -> String +encodeEntities "" = "" +encodeEntities str = + case break needsEscaping str of + (okay, "") -> okay + (okay, (c:cs)) -> okay ++ escapeSGMLChar c ++ encodeEntities cs -- | Convert entities in a string to characters. decodeEntities :: String -> String @@ -118,18 +116,19 @@ decodeEntities str = Right result -> result -- | Escape string for SGML, preserving entity references. -stringToSGML :: Bool -- ^ Use only numerical entities. - -> String -- ^ String to convert. - -> String -stringToSGML numericalEntities str = - let nonentity = do +stringToSGML :: String -> String +stringToSGML str = + let regular = do + str <- many1 (satisfy (not . needsEscaping)) + return str + special = do notFollowedBy characterEntity c <- anyChar - return $ specialCharToEntity numericalEntities c + return $ escapeSGMLChar c entity = do ent <- manyTill anyChar (char ';') - return (ent ++ ";") in - case parse (many (nonentity <|> entity)) str str of + return (ent ++ ";") in + case parse (many (regular <|> special <|> entity)) str str of Left err -> error $ "\nError: " ++ show err Right result -> concat result diff --git a/src/Text/Pandoc/Shared.hs b/src/Text/Pandoc/Shared.hs index f63ca4ce4..02f8782b2 100644 --- a/src/Text/Pandoc/Shared.hs +++ b/src/Text/Pandoc/Shared.hs @@ -524,8 +524,8 @@ replaceRefLinksInline keytable other = other -- | Return a text object with a string of formatted SGML attributes. attributeList :: [(String, String)] -> Doc attributeList = text . concatMap - (\(a, b) -> " " ++ stringToSGML True a ++ "=\"" ++ - stringToSGML True b ++ "\"") + (\(a, b) -> " " ++ stringToSGML a ++ "=\"" ++ + stringToSGML b ++ "\"") -- | Put the supplied contents between start and end tags of tagType, -- with specified attributes and (if specified) indentation. diff --git a/src/Text/Pandoc/Writers/Docbook.hs b/src/Text/Pandoc/Writers/Docbook.hs index 0c83d0ea0..1e0690c22 100644 --- a/src/Text/Pandoc/Writers/Docbook.hs +++ b/src/Text/Pandoc/Writers/Docbook.hs @@ -64,8 +64,8 @@ authorToDocbook name = inTagsIndented "author" $ then -- last name first let (lastname, rest) = break (==',') name firstname = removeLeadingSpace rest in - inTagsSimple "firstname" (text $ stringToSGML True firstname) <> - inTagsSimple "surname" (text $ stringToSGML True lastname) + inTagsSimple "firstname" (text $ stringToSGML firstname) <> + inTagsSimple "surname" (text $ stringToSGML lastname) else -- last name last let namewords = words name lengthname = length namewords @@ -73,8 +73,8 @@ authorToDocbook name = inTagsIndented "author" $ 0 -> ("","") 1 -> ("", name) n -> (joinWithSep " " (take (n-1) namewords), last namewords) in - inTagsSimple "firstname" (text $ stringToSGML True firstname) $$ - inTagsSimple "surname" (text $ stringToSGML True lastname) + inTagsSimple "firstname" (text $ stringToSGML firstname) $$ + inTagsSimple "surname" (text $ stringToSGML lastname) -- | Convert Pandoc document to string in Docbook format. writeDocbook :: WriterOptions -> Pandoc -> String @@ -86,7 +86,7 @@ writeDocbook opts (Pandoc (Meta title authors date) blocks) = then inTagsIndented "articleinfo" $ (inTagsSimple "title" (wrap opts title)) $$ (vcat (map authorToDocbook authors)) $$ - (inTagsSimple "date" (text $ stringToSGML True date)) + (inTagsSimple "date" (text $ stringToSGML date)) else empty blocks' = replaceReferenceLinks blocks (noteBlocks, blocks'') = partition isNoteBlock blocks' @@ -141,7 +141,7 @@ blockToDocbook opts (Para lst) = blockToDocbook opts (BlockQuote blocks) = inTagsIndented "blockquote" (blocksToDocbook opts blocks) blockToDocbook opts (CodeBlock str) = - text "<screen>\n" <> text (encodeEntities True str) <> text "\n</screen>" + text "<screen>\n" <> text (encodeEntities str) <> text "\n</screen>" blockToDocbook opts (BulletList lst) = inTagsIndented "itemizedlist" $ listItemsToDocbook opts lst blockToDocbook opts (OrderedList lst) = @@ -198,7 +198,7 @@ inlinesToDocbook opts lst = hcat (map (inlineToDocbook opts) lst) -- | Convert an inline element to Docbook. inlineToDocbook :: WriterOptions -> Inline -> Doc -inlineToDocbook opts (Str str) = text $ stringToSGML True str +inlineToDocbook opts (Str str) = text $ stringToSGML str inlineToDocbook opts (Emph lst) = inTagsSimple "emphasis" (inlinesToDocbook opts lst) inlineToDocbook opts (Strong lst) = @@ -211,7 +211,7 @@ inlineToDocbook opts Ellipses = text "…" inlineToDocbook opts EmDash = text "—" inlineToDocbook opts EnDash = text "–" inlineToDocbook opts (Code str) = - inTagsSimple "literal" $ text (encodeEntities True str) + inTagsSimple "literal" $ text (encodeEntities str) inlineToDocbook opts (TeX str) = inlineToDocbook opts (Code str) inlineToDocbook opts (HtmlInline str) = empty inlineToDocbook opts LineBreak = @@ -219,7 +219,7 @@ inlineToDocbook opts LineBreak = inlineToDocbook opts Space = char ' ' inlineToDocbook opts (Link txt (Src src tit)) = if isPrefixOf "mailto:" src - then inTagsSimple "email" $ text (encodeEntities True $ drop 7 src) + then inTagsSimple "email" $ text (encodeEntities $ drop 7 src) else inTags False "ulink" [("url", src)] $ inlinesToDocbook opts txt inlineToDocbook opts (Link text (Ref ref)) = empty -- shouldn't occur inlineToDocbook opts (Image alt (Src src tit)) = @@ -227,7 +227,7 @@ inlineToDocbook opts (Image alt (Src src tit)) = then empty else inTagsIndented "objectinfo" $ inTagsIndented "title" - (text $ stringToSGML True tit) in + (text $ stringToSGML tit) in inTagsIndented "inlinemediaobject" $ inTagsIndented "imageobject" $ titleDoc $$ selfClosingTag "imagedata" [("fileref", src)] diff --git a/src/Text/Pandoc/Writers/HTML.hs b/src/Text/Pandoc/Writers/HTML.hs index 7c89d6352..196aafad3 100644 --- a/src/Text/Pandoc/Writers/HTML.hs +++ b/src/Text/Pandoc/Writers/HTML.hs @@ -127,11 +127,11 @@ htmlHeader opts (Meta title authors date) = then empty else selfClosingTag "meta" [("name", "author"), ("content", - joinWithSep ", " (map (stringToSGML False) authors))] + joinWithSep ", " (map stringToSGML authors))] datetext = if (date == "") then empty else selfClosingTag "meta" [("name", "date"), - ("content", stringToSGML False date)] in + ("content", stringToSGML date)] in text (writerHeader opts) $$ authortext $$ datetext $$ titletext $$ text "</head>\n<body>" @@ -168,7 +168,7 @@ blockToHtml opts (Note ref lst) = (text "↩") blockToHtml opts (Key _ _) = empty blockToHtml opts (CodeBlock str) = - text "<pre><code>" <> text (encodeEntities False str) <> text "\n</code></pre>" + text "<pre><code>" <> text (encodeEntities str) <> text "\n</code></pre>" blockToHtml opts (RawHtml str) = text str blockToHtml opts (BulletList lst) = let attribs = if (writerIncremental opts) @@ -234,7 +234,7 @@ inlineToHtml opts (Emph lst) = inlineToHtml opts (Strong lst) = inTagsSimple "strong" (inlineListToHtml opts lst) inlineToHtml opts (Code str) = - inTagsSimple "code" $ text (encodeEntities False str) + inTagsSimple "code" $ text (encodeEntities str) inlineToHtml opts (Quoted SingleQuote lst) = text "‘" <> (inlineListToHtml opts lst) <> text "’" inlineToHtml opts (Quoted DoubleQuote lst) = @@ -243,16 +243,16 @@ inlineToHtml opts EmDash = text "—" inlineToHtml opts EnDash = text "–" inlineToHtml opts Ellipses = text "…" inlineToHtml opts Apostrophe = text "’" -inlineToHtml opts (Str str) = text $ stringToSGML False str -inlineToHtml opts (TeX str) = text $ encodeEntities False str +inlineToHtml opts (Str str) = text $ stringToSGML str +inlineToHtml opts (TeX str) = text $ encodeEntities str inlineToHtml opts (HtmlInline str) = text str inlineToHtml opts (LineBreak) = selfClosingTag "br" [] inlineToHtml opts Space = space inlineToHtml opts (Link txt (Src src tit)) = - let title = stringToSGML False tit in + let title = stringToSGML tit in if (isPrefixOf "mailto:" src) then obfuscateLink opts txt src - else inTags False "a" ([("href", encodeEntities False src)] ++ + else inTags False "a" ([("href", encodeEntities src)] ++ if null tit then [] else [("title", title)]) (inlineListToHtml opts txt) inlineToHtml opts (Link txt (Ref ref)) = @@ -260,7 +260,7 @@ inlineToHtml opts (Link txt (Ref ref)) = (inlineListToHtml opts ref) <> char ']' -- this is what markdown does, for better or worse inlineToHtml opts (Image alt (Src source tit)) = - let title = stringToSGML False tit + let title = stringToSGML tit alternate = render $ inlineListToHtml opts alt in selfClosingTag "img" $ [("src", source)] ++ (if null alternate then [] else [("alt", alternate)]) ++ diff --git a/tests/writer.docbook b/tests/writer.docbook index 600c52d6a..675a115cc 100644 --- a/tests/writer.docbook +++ b/tests/writer.docbook @@ -89,7 +89,7 @@ </para> <screen> sub status { - print "working"; + print "working"; } </screen> <para> @@ -122,7 +122,7 @@ sub status { </blockquote> </blockquote> <para> - This should not be a block quote: 2 > 1. + This should not be a block quote: 2 > 1. </para> <para> Box-style: @@ -133,7 +133,7 @@ sub status { </para> <screen> sub status { - print "working"; + print "working"; } </screen> </blockquote> @@ -177,7 +177,7 @@ sub status { ---- (should be four hyphens) sub status { - print "working"; + print "working"; } this code block is indented by one tab @@ -188,7 +188,7 @@ this code block is indented by one tab <screen> this code block is indented by two tabs -These should not be escaped: \$ \\ \> \[ \{ +These should not be escaped: \$ \\ \> \[ \{ </screen> </section> <section> @@ -577,9 +577,9 @@ These should not be escaped: \$ \\ \> \[ \{ word. </para> <para> - This is code: <literal>></literal>, <literal>$</literal>, + This is code: <literal>></literal>, <literal>$</literal>, <literal>\</literal>, <literal>\$</literal>, - <literal><html></literal>. + <literal><html></literal>. </para> </section> <section> @@ -602,7 +602,7 @@ These should not be escaped: \$ \\ \> \[ \{ </para> <para> Here is some quoted <quote><literal>code</literal></quote> and a - <quote><ulink url="http://example.com/?foo=1&bar=2">quoted link</ulink></quote>. + <quote><ulink url="http://example.com/?foo=1&bar=2">quoted link</ulink></quote>. </para> <para> Some dashes: one—two—three—four—five. @@ -691,9 +691,9 @@ These should not be escaped: \$ \\ \> \[ \{ </para> <para> <literal>\begin{tabular}{|l|l|}\hline -Animal & Number \\ \hline -Dog & 2 \\ -Cat & 1 \\ \hline +Animal & Number \\ \hline +Dog & 2 \\ +Cat & 1 \\ \hline \end{tabular}</literal> </para> </section> @@ -705,44 +705,44 @@ Cat & 1 \\ \hline <itemizedlist> <listitem> <para> - I hat: Î + I hat: Î </para> </listitem> <listitem> <para> - o umlaut: ö + o umlaut: ö </para> </listitem> <listitem> <para> - section: § + section: § </para> </listitem> <listitem> <para> - set membership: ∈ + set membership: ∈ </para> </listitem> <listitem> <para> - copyright: © + copyright: © </para> </listitem> </itemizedlist> <para> - AT&T has an ampersand in their name. + AT&T has an ampersand in their name. </para> <para> - AT&T is another way to write it. + AT&T is another way to write it. </para> <para> - This & that. + This & that. </para> <para> - 4 < 5. + 4 < 5. </para> <para> - 6 > 5. + 6 > 5. </para> <para> Backslash: \ @@ -775,7 +775,7 @@ Cat & 1 \\ \hline Right paren: ) </para> <para> - Greater-than: > + Greater-than: > </para> <para> Hash: # @@ -868,25 +868,25 @@ Cat & 1 \\ \hline <title>With ampersands</title> <para> Here's a - <ulink url="http://example.com/?foo=1&bar=2">link with an ampersand in the URL</ulink>. + <ulink url="http://example.com/?foo=1&bar=2">link with an ampersand in the URL</ulink>. </para> <para> Here's a link with an amersand in the link text: - <ulink url="http://att.com/">AT&T</ulink>. + <ulink url="http://att.com/">AT&T</ulink>. </para> <para> - Here's an <ulink url="/script?foo=1&bar=2">inline link</ulink>. + Here's an <ulink url="/script?foo=1&bar=2">inline link</ulink>. </para> <para> Here's an - <ulink url="/script?foo=1&bar=2">inline link in pointy braces</ulink>. + <ulink url="/script?foo=1&bar=2">inline link in pointy braces</ulink>. </para> </section> <section> <title>Autolinks</title> <para> With an ampersand: - <ulink url="http://example.com/?foo=1&bar=2">http://example.com/?foo=1&bar=2</ulink> + <ulink url="http://example.com/?foo=1&bar=2">http://example.com/?foo=1&bar=2</ulink> </para> <itemizedlist> <listitem> @@ -916,10 +916,10 @@ Cat & 1 \\ \hline </blockquote> <para> Auto-links should not occur here: - <literal><http://example.com/></literal> + <literal><http://example.com/></literal> </para> <screen> -or here: <http://example.com/> +or here: <http://example.com/> </screen> </section> </section> @@ -970,7 +970,7 @@ or here: <http://example.com/> footnote (as with list items). </para> <screen> - { <code> } + { <code> } </screen> <para> If you want, you can indent every line, but you can also be lazy diff --git a/tests/writer.html b/tests/writer.html index 3213bb9ce..356e4cb3e 100644 --- a/tests/writer.html +++ b/tests/writer.html @@ -557,11 +557,11 @@ Cat & 1 \\ \hline Here is some unicode: </p> <ul> - <li>I hat: Î</li> - <li>o umlaut: ö</li> - <li>section: §</li> - <li>set membership: ∈</li> - <li>copyright: ©</li> + <li>I hat: Î</li> + <li>o umlaut: ö</li> + <li>section: §</li> + <li>set membership: ∈</li> + <li>copyright: ©</li> </ul> <p> AT&T has an ampersand in their name. |