From 141affdb5140478464bf3c7331f6be4cf9454dd6 Mon Sep 17 00:00:00 2001 From: fiddlosopher Date: Sat, 27 Jan 2007 22:13:11 +0000 Subject: More changes in entity handling: Instead of using entities for characters above 128 in HTML and Docbook output, we now just use unicode. After all, we're declaring UTF-8 content in the header. This makes the HTML and docbook files produced by pandoc much more readable and editable. Changes to Entities.hs: + Removed specialCharToEntity + Added escapeSGMLChar (which just escapes the basic four, <>&") + Modified encodeEntities and stringToSGML to use escapeSGMLChar + Removed encodeEntitiesNumerical + Rewrote encodeEntities for better performance + Rewrote stringToSGML for better performance git-svn-id: https://pandoc.googlecode.com/svn/trunk@516 788f1e2b-df1e-0410-8736-df70ead52e1b --- src/Text/Pandoc/Entities.hs | 61 +++++++++++++++++++------------------- src/Text/Pandoc/Shared.hs | 4 +-- src/Text/Pandoc/Writers/Docbook.hs | 20 ++++++------- src/Text/Pandoc/Writers/HTML.hs | 18 +++++------ tests/writer.docbook | 60 ++++++++++++++++++------------------- tests/writer.html | 10 +++---- 6 files changed, 86 insertions(+), 87 deletions(-) diff --git a/src/Text/Pandoc/Entities.hs b/src/Text/Pandoc/Entities.hs index 3e68db35c..696f943a6 100644 --- a/src/Text/Pandoc/Entities.hs +++ b/src/Text/Pandoc/Entities.hs @@ -31,9 +31,9 @@ and vice versa. module Text.Pandoc.Entities ( charToEntity, charToNumericalEntity, - specialCharToEntity, encodeEntities, decodeEntities, + escapeSGMLChar, stringToSGML, characterEntity ) where @@ -54,17 +54,6 @@ charToEntity char = charToNumericalEntity :: Char -> String charToNumericalEntity ch = "&#" ++ show (ord ch) ++ ";" --- | Escape special character to SGML entity. -specialCharToEntity :: Bool -- ^ Use numerical entities only. - -> Char -- ^ Character to convert. - -> [Char] -specialCharToEntity numericalEntities c = - if (c `elem` "&<>\"") || (ord c > 127) - then if numericalEntities - then charToNumericalEntity c - else charToEntity c - else [c] - -- | Parse SGML character entity. characterEntity :: GenParser Char st Char characterEntity = namedEntity <|> hexEntity <|> decimalEntity "SGML entity" @@ -97,18 +86,27 @@ decimalEntity = try $ do end <- char ';' return $ chr $ read body --- | Escape string as needed for SGML. Entity references are not preserved. -encodeEntities :: Bool -- ^ Use only numerical entities. - -> String -- ^ String to convert. - -> String -encodeEntities numericalEntities = - concatMap (specialCharToEntity numericalEntities) +-- | Escape one character as needed for SGML. +escapeSGMLChar :: Char -> String +escapeSGMLChar x = + case x of + '&' -> "&" + '<' -> "<" + '>' -> ">" + '"' -> """ + c -> [c] --- | Escape string as needed for SGML, using only numerical entities. --- Entity references are not preserved. -encodeEntitiesNumerical :: String -> String -encodeEntitiesNumerical = - concatMap (\c -> "&#" ++ show (ord c) ++ ";") +-- | True if the character needs to be escaped. +needsEscaping :: Char -> Bool +needsEscaping c = c `elem` "&<>\"" + +-- | Escape string as needed for SGML. Entity references are not preserved. +encodeEntities :: String -> String +encodeEntities "" = "" +encodeEntities str = + case break needsEscaping str of + (okay, "") -> okay + (okay, (c:cs)) -> okay ++ escapeSGMLChar c ++ encodeEntities cs -- | Convert entities in a string to characters. decodeEntities :: String -> String @@ -118,18 +116,19 @@ decodeEntities str = Right result -> result -- | Escape string for SGML, preserving entity references. -stringToSGML :: Bool -- ^ Use only numerical entities. - -> String -- ^ String to convert. - -> String -stringToSGML numericalEntities str = - let nonentity = do +stringToSGML :: String -> String +stringToSGML str = + let regular = do + str <- many1 (satisfy (not . needsEscaping)) + return str + special = do notFollowedBy characterEntity c <- anyChar - return $ specialCharToEntity numericalEntities c + return $ escapeSGMLChar c entity = do ent <- manyTill anyChar (char ';') - return (ent ++ ";") in - case parse (many (nonentity <|> entity)) str str of + return (ent ++ ";") in + case parse (many (regular <|> special <|> entity)) str str of Left err -> error $ "\nError: " ++ show err Right result -> concat result diff --git a/src/Text/Pandoc/Shared.hs b/src/Text/Pandoc/Shared.hs index f63ca4ce4..02f8782b2 100644 --- a/src/Text/Pandoc/Shared.hs +++ b/src/Text/Pandoc/Shared.hs @@ -524,8 +524,8 @@ replaceRefLinksInline keytable other = other -- | Return a text object with a string of formatted SGML attributes. attributeList :: [(String, String)] -> Doc attributeList = text . concatMap - (\(a, b) -> " " ++ stringToSGML True a ++ "=\"" ++ - stringToSGML True b ++ "\"") + (\(a, b) -> " " ++ stringToSGML a ++ "=\"" ++ + stringToSGML b ++ "\"") -- | Put the supplied contents between start and end tags of tagType, -- with specified attributes and (if specified) indentation. diff --git a/src/Text/Pandoc/Writers/Docbook.hs b/src/Text/Pandoc/Writers/Docbook.hs index 0c83d0ea0..1e0690c22 100644 --- a/src/Text/Pandoc/Writers/Docbook.hs +++ b/src/Text/Pandoc/Writers/Docbook.hs @@ -64,8 +64,8 @@ authorToDocbook name = inTagsIndented "author" $ then -- last name first let (lastname, rest) = break (==',') name firstname = removeLeadingSpace rest in - inTagsSimple "firstname" (text $ stringToSGML True firstname) <> - inTagsSimple "surname" (text $ stringToSGML True lastname) + inTagsSimple "firstname" (text $ stringToSGML firstname) <> + inTagsSimple "surname" (text $ stringToSGML lastname) else -- last name last let namewords = words name lengthname = length namewords @@ -73,8 +73,8 @@ authorToDocbook name = inTagsIndented "author" $ 0 -> ("","") 1 -> ("", name) n -> (joinWithSep " " (take (n-1) namewords), last namewords) in - inTagsSimple "firstname" (text $ stringToSGML True firstname) $$ - inTagsSimple "surname" (text $ stringToSGML True lastname) + inTagsSimple "firstname" (text $ stringToSGML firstname) $$ + inTagsSimple "surname" (text $ stringToSGML lastname) -- | Convert Pandoc document to string in Docbook format. writeDocbook :: WriterOptions -> Pandoc -> String @@ -86,7 +86,7 @@ writeDocbook opts (Pandoc (Meta title authors date) blocks) = then inTagsIndented "articleinfo" $ (inTagsSimple "title" (wrap opts title)) $$ (vcat (map authorToDocbook authors)) $$ - (inTagsSimple "date" (text $ stringToSGML True date)) + (inTagsSimple "date" (text $ stringToSGML date)) else empty blocks' = replaceReferenceLinks blocks (noteBlocks, blocks'') = partition isNoteBlock blocks' @@ -141,7 +141,7 @@ blockToDocbook opts (Para lst) = blockToDocbook opts (BlockQuote blocks) = inTagsIndented "blockquote" (blocksToDocbook opts blocks) blockToDocbook opts (CodeBlock str) = - text "\n" <> text (encodeEntities True str) <> text "\n" + text "\n" <> text (encodeEntities str) <> text "\n" blockToDocbook opts (BulletList lst) = inTagsIndented "itemizedlist" $ listItemsToDocbook opts lst blockToDocbook opts (OrderedList lst) = @@ -198,7 +198,7 @@ inlinesToDocbook opts lst = hcat (map (inlineToDocbook opts) lst) -- | Convert an inline element to Docbook. inlineToDocbook :: WriterOptions -> Inline -> Doc -inlineToDocbook opts (Str str) = text $ stringToSGML True str +inlineToDocbook opts (Str str) = text $ stringToSGML str inlineToDocbook opts (Emph lst) = inTagsSimple "emphasis" (inlinesToDocbook opts lst) inlineToDocbook opts (Strong lst) = @@ -211,7 +211,7 @@ inlineToDocbook opts Ellipses = text "…" inlineToDocbook opts EmDash = text "—" inlineToDocbook opts EnDash = text "–" inlineToDocbook opts (Code str) = - inTagsSimple "literal" $ text (encodeEntities True str) + inTagsSimple "literal" $ text (encodeEntities str) inlineToDocbook opts (TeX str) = inlineToDocbook opts (Code str) inlineToDocbook opts (HtmlInline str) = empty inlineToDocbook opts LineBreak = @@ -219,7 +219,7 @@ inlineToDocbook opts LineBreak = inlineToDocbook opts Space = char ' ' inlineToDocbook opts (Link txt (Src src tit)) = if isPrefixOf "mailto:" src - then inTagsSimple "email" $ text (encodeEntities True $ drop 7 src) + then inTagsSimple "email" $ text (encodeEntities $ drop 7 src) else inTags False "ulink" [("url", src)] $ inlinesToDocbook opts txt inlineToDocbook opts (Link text (Ref ref)) = empty -- shouldn't occur inlineToDocbook opts (Image alt (Src src tit)) = @@ -227,7 +227,7 @@ inlineToDocbook opts (Image alt (Src src tit)) = then empty else inTagsIndented "objectinfo" $ inTagsIndented "title" - (text $ stringToSGML True tit) in + (text $ stringToSGML tit) in inTagsIndented "inlinemediaobject" $ inTagsIndented "imageobject" $ titleDoc $$ selfClosingTag "imagedata" [("fileref", src)] diff --git a/src/Text/Pandoc/Writers/HTML.hs b/src/Text/Pandoc/Writers/HTML.hs index 7c89d6352..196aafad3 100644 --- a/src/Text/Pandoc/Writers/HTML.hs +++ b/src/Text/Pandoc/Writers/HTML.hs @@ -127,11 +127,11 @@ htmlHeader opts (Meta title authors date) = then empty else selfClosingTag "meta" [("name", "author"), ("content", - joinWithSep ", " (map (stringToSGML False) authors))] + joinWithSep ", " (map stringToSGML authors))] datetext = if (date == "") then empty else selfClosingTag "meta" [("name", "date"), - ("content", stringToSGML False date)] in + ("content", stringToSGML date)] in text (writerHeader opts) $$ authortext $$ datetext $$ titletext $$ text "\n" @@ -168,7 +168,7 @@ blockToHtml opts (Note ref lst) = (text "↩") blockToHtml opts (Key _ _) = empty blockToHtml opts (CodeBlock str) = - text "
" <> text (encodeEntities False str) <> text "\n
" + text "
" <> text (encodeEntities str) <> text "\n
" blockToHtml opts (RawHtml str) = text str blockToHtml opts (BulletList lst) = let attribs = if (writerIncremental opts) @@ -234,7 +234,7 @@ inlineToHtml opts (Emph lst) = inlineToHtml opts (Strong lst) = inTagsSimple "strong" (inlineListToHtml opts lst) inlineToHtml opts (Code str) = - inTagsSimple "code" $ text (encodeEntities False str) + inTagsSimple "code" $ text (encodeEntities str) inlineToHtml opts (Quoted SingleQuote lst) = text "‘" <> (inlineListToHtml opts lst) <> text "’" inlineToHtml opts (Quoted DoubleQuote lst) = @@ -243,16 +243,16 @@ inlineToHtml opts EmDash = text "—" inlineToHtml opts EnDash = text "–" inlineToHtml opts Ellipses = text "…" inlineToHtml opts Apostrophe = text "’" -inlineToHtml opts (Str str) = text $ stringToSGML False str -inlineToHtml opts (TeX str) = text $ encodeEntities False str +inlineToHtml opts (Str str) = text $ stringToSGML str +inlineToHtml opts (TeX str) = text $ encodeEntities str inlineToHtml opts (HtmlInline str) = text str inlineToHtml opts (LineBreak) = selfClosingTag "br" [] inlineToHtml opts Space = space inlineToHtml opts (Link txt (Src src tit)) = - let title = stringToSGML False tit in + let title = stringToSGML tit in if (isPrefixOf "mailto:" src) then obfuscateLink opts txt src - else inTags False "a" ([("href", encodeEntities False src)] ++ + else inTags False "a" ([("href", encodeEntities src)] ++ if null tit then [] else [("title", title)]) (inlineListToHtml opts txt) inlineToHtml opts (Link txt (Ref ref)) = @@ -260,7 +260,7 @@ inlineToHtml opts (Link txt (Ref ref)) = (inlineListToHtml opts ref) <> char ']' -- this is what markdown does, for better or worse inlineToHtml opts (Image alt (Src source tit)) = - let title = stringToSGML False tit + let title = stringToSGML tit alternate = render $ inlineListToHtml opts alt in selfClosingTag "img" $ [("src", source)] ++ (if null alternate then [] else [("alt", alternate)]) ++ diff --git a/tests/writer.docbook b/tests/writer.docbook index 600c52d6a..675a115cc 100644 --- a/tests/writer.docbook +++ b/tests/writer.docbook @@ -89,7 +89,7 @@ sub status { - print "working"; + print "working"; } @@ -122,7 +122,7 @@ sub status { - This should not be a block quote: 2 > 1. + This should not be a block quote: 2 > 1. Box-style: @@ -133,7 +133,7 @@ sub status { sub status { - print "working"; + print "working"; } @@ -177,7 +177,7 @@ sub status { ---- (should be four hyphens) sub status { - print "working"; + print "working"; } this code block is indented by one tab @@ -188,7 +188,7 @@ this code block is indented by one tab this code block is indented by two tabs -These should not be escaped: \$ \\ \> \[ \{ +These should not be escaped: \$ \\ \> \[ \{
@@ -577,9 +577,9 @@ These should not be escaped: \$ \\ \> \[ \{ word. - This is code: >, $, + This is code: >, $, \, \$, - <html>. + <html>.
@@ -602,7 +602,7 @@ These should not be escaped: \$ \\ \> \[ \{ Here is some quoted code and a - quoted link. + quoted link. Some dashes: one—two—three—four—five. @@ -691,9 +691,9 @@ These should not be escaped: \$ \\ \> \[ \{ \begin{tabular}{|l|l|}\hline -Animal & Number \\ \hline -Dog & 2 \\ -Cat & 1 \\ \hline +Animal & Number \\ \hline +Dog & 2 \\ +Cat & 1 \\ \hline \end{tabular}
@@ -705,44 +705,44 @@ Cat & 1 \\ \hline - I hat: Î + I hat: Î - o umlaut: ö + o umlaut: ö - section: § + section: § - set membership: ∈ + set membership: ∈ - copyright: © + copyright: © - AT&T has an ampersand in their name. + AT&T has an ampersand in their name. - AT&T is another way to write it. + AT&T is another way to write it. - This & that. + This & that. - 4 < 5. + 4 < 5. - 6 > 5. + 6 > 5. Backslash: \ @@ -775,7 +775,7 @@ Cat & 1 \\ \hline Right paren: ) - Greater-than: > + Greater-than: > Hash: # @@ -868,25 +868,25 @@ Cat & 1 \\ \hline With ampersands Here's a - link with an ampersand in the URL. + link with an ampersand in the URL. Here's a link with an amersand in the link text: - AT&T. + AT&T. - Here's an inline link. + Here's an inline link. Here's an - inline link in pointy braces. + inline link in pointy braces.
Autolinks With an ampersand: - http://example.com/?foo=1&bar=2 + http://example.com/?foo=1&bar=2 @@ -916,10 +916,10 @@ Cat & 1 \\ \hline Auto-links should not occur here: - <http://example.com/> + <http://example.com/> -or here: <http://example.com/> +or here: <http://example.com/>
@@ -970,7 +970,7 @@ or here: <http://example.com/> footnote (as with list items).
- { <code> } + { <code> } If you want, you can indent every line, but you can also be lazy diff --git a/tests/writer.html b/tests/writer.html index 3213bb9ce..356e4cb3e 100644 --- a/tests/writer.html +++ b/tests/writer.html @@ -557,11 +557,11 @@ Cat & 1 \\ \hline Here is some unicode:

    -
  • I hat: Î
  • -
  • o umlaut: ö
  • -
  • section: §
  • -
  • set membership: ∈
  • -
  • copyright: ©
  • +
  • I hat: Î
  • +
  • o umlaut: ö
  • +
  • section: §
  • +
  • set membership: ∈
  • +
  • copyright: ©

AT&T has an ampersand in their name. -- cgit v1.2.3