diff options
author | fiddlosopher <fiddlosopher@788f1e2b-df1e-0410-8736-df70ead52e1b> | 2007-01-27 22:13:11 +0000 |
---|---|---|
committer | fiddlosopher <fiddlosopher@788f1e2b-df1e-0410-8736-df70ead52e1b> | 2007-01-27 22:13:11 +0000 |
commit | 141affdb5140478464bf3c7331f6be4cf9454dd6 (patch) | |
tree | cf2db2443f46fd355bf62c0b0d4ea703d698fb1d /src | |
parent | d06417125dd4d8cb177abd2d472c0c1cad4c49be (diff) | |
download | pandoc-141affdb5140478464bf3c7331f6be4cf9454dd6.tar.gz |
More changes in entity handling: Instead of using entities for characters
above 128 in HTML and Docbook output, we now just use unicode. After all,
we're declaring UTF-8 content in the header. This makes the HTML and
docbook files produced by pandoc much more readable and editable.
Changes to Entities.hs:
+ Removed specialCharToEntity
+ Added escapeSGMLChar (which just escapes the basic four, <>&")
+ Modified encodeEntities and stringToSGML to use escapeSGMLChar
+ Removed encodeEntitiesNumerical
+ Rewrote encodeEntities for better performance
+ Rewrote stringToSGML for better performance
git-svn-id: https://pandoc.googlecode.com/svn/trunk@516 788f1e2b-df1e-0410-8736-df70ead52e1b
Diffstat (limited to 'src')
-rw-r--r-- | src/Text/Pandoc/Entities.hs | 61 | ||||
-rw-r--r-- | src/Text/Pandoc/Shared.hs | 4 | ||||
-rw-r--r-- | src/Text/Pandoc/Writers/Docbook.hs | 20 | ||||
-rw-r--r-- | src/Text/Pandoc/Writers/HTML.hs | 18 |
4 files changed, 51 insertions, 52 deletions
diff --git a/src/Text/Pandoc/Entities.hs b/src/Text/Pandoc/Entities.hs index 3e68db35c..696f943a6 100644 --- a/src/Text/Pandoc/Entities.hs +++ b/src/Text/Pandoc/Entities.hs @@ -31,9 +31,9 @@ and vice versa. module Text.Pandoc.Entities ( charToEntity, charToNumericalEntity, - specialCharToEntity, encodeEntities, decodeEntities, + escapeSGMLChar, stringToSGML, characterEntity ) where @@ -54,17 +54,6 @@ charToEntity char = charToNumericalEntity :: Char -> String charToNumericalEntity ch = "&#" ++ show (ord ch) ++ ";" --- | Escape special character to SGML entity. -specialCharToEntity :: Bool -- ^ Use numerical entities only. - -> Char -- ^ Character to convert. - -> [Char] -specialCharToEntity numericalEntities c = - if (c `elem` "&<>\"") || (ord c > 127) - then if numericalEntities - then charToNumericalEntity c - else charToEntity c - else [c] - -- | Parse SGML character entity. characterEntity :: GenParser Char st Char characterEntity = namedEntity <|> hexEntity <|> decimalEntity <?> "SGML entity" @@ -97,18 +86,27 @@ decimalEntity = try $ do end <- char ';' return $ chr $ read body --- | Escape string as needed for SGML. Entity references are not preserved. -encodeEntities :: Bool -- ^ Use only numerical entities. - -> String -- ^ String to convert. - -> String -encodeEntities numericalEntities = - concatMap (specialCharToEntity numericalEntities) +-- | Escape one character as needed for SGML. +escapeSGMLChar :: Char -> String +escapeSGMLChar x = + case x of + '&' -> "&" + '<' -> "<" + '>' -> ">" + '"' -> """ + c -> [c] --- | Escape string as needed for SGML, using only numerical entities. --- Entity references are not preserved. -encodeEntitiesNumerical :: String -> String -encodeEntitiesNumerical = - concatMap (\c -> "&#" ++ show (ord c) ++ ";") +-- | True if the character needs to be escaped. +needsEscaping :: Char -> Bool +needsEscaping c = c `elem` "&<>\"" + +-- | Escape string as needed for SGML. Entity references are not preserved. +encodeEntities :: String -> String +encodeEntities "" = "" +encodeEntities str = + case break needsEscaping str of + (okay, "") -> okay + (okay, (c:cs)) -> okay ++ escapeSGMLChar c ++ encodeEntities cs -- | Convert entities in a string to characters. decodeEntities :: String -> String @@ -118,18 +116,19 @@ decodeEntities str = Right result -> result -- | Escape string for SGML, preserving entity references. -stringToSGML :: Bool -- ^ Use only numerical entities. - -> String -- ^ String to convert. - -> String -stringToSGML numericalEntities str = - let nonentity = do +stringToSGML :: String -> String +stringToSGML str = + let regular = do + str <- many1 (satisfy (not . needsEscaping)) + return str + special = do notFollowedBy characterEntity c <- anyChar - return $ specialCharToEntity numericalEntities c + return $ escapeSGMLChar c entity = do ent <- manyTill anyChar (char ';') - return (ent ++ ";") in - case parse (many (nonentity <|> entity)) str str of + return (ent ++ ";") in + case parse (many (regular <|> special <|> entity)) str str of Left err -> error $ "\nError: " ++ show err Right result -> concat result diff --git a/src/Text/Pandoc/Shared.hs b/src/Text/Pandoc/Shared.hs index f63ca4ce4..02f8782b2 100644 --- a/src/Text/Pandoc/Shared.hs +++ b/src/Text/Pandoc/Shared.hs @@ -524,8 +524,8 @@ replaceRefLinksInline keytable other = other -- | Return a text object with a string of formatted SGML attributes. attributeList :: [(String, String)] -> Doc attributeList = text . concatMap - (\(a, b) -> " " ++ stringToSGML True a ++ "=\"" ++ - stringToSGML True b ++ "\"") + (\(a, b) -> " " ++ stringToSGML a ++ "=\"" ++ + stringToSGML b ++ "\"") -- | Put the supplied contents between start and end tags of tagType, -- with specified attributes and (if specified) indentation. diff --git a/src/Text/Pandoc/Writers/Docbook.hs b/src/Text/Pandoc/Writers/Docbook.hs index 0c83d0ea0..1e0690c22 100644 --- a/src/Text/Pandoc/Writers/Docbook.hs +++ b/src/Text/Pandoc/Writers/Docbook.hs @@ -64,8 +64,8 @@ authorToDocbook name = inTagsIndented "author" $ then -- last name first let (lastname, rest) = break (==',') name firstname = removeLeadingSpace rest in - inTagsSimple "firstname" (text $ stringToSGML True firstname) <> - inTagsSimple "surname" (text $ stringToSGML True lastname) + inTagsSimple "firstname" (text $ stringToSGML firstname) <> + inTagsSimple "surname" (text $ stringToSGML lastname) else -- last name last let namewords = words name lengthname = length namewords @@ -73,8 +73,8 @@ authorToDocbook name = inTagsIndented "author" $ 0 -> ("","") 1 -> ("", name) n -> (joinWithSep " " (take (n-1) namewords), last namewords) in - inTagsSimple "firstname" (text $ stringToSGML True firstname) $$ - inTagsSimple "surname" (text $ stringToSGML True lastname) + inTagsSimple "firstname" (text $ stringToSGML firstname) $$ + inTagsSimple "surname" (text $ stringToSGML lastname) -- | Convert Pandoc document to string in Docbook format. writeDocbook :: WriterOptions -> Pandoc -> String @@ -86,7 +86,7 @@ writeDocbook opts (Pandoc (Meta title authors date) blocks) = then inTagsIndented "articleinfo" $ (inTagsSimple "title" (wrap opts title)) $$ (vcat (map authorToDocbook authors)) $$ - (inTagsSimple "date" (text $ stringToSGML True date)) + (inTagsSimple "date" (text $ stringToSGML date)) else empty blocks' = replaceReferenceLinks blocks (noteBlocks, blocks'') = partition isNoteBlock blocks' @@ -141,7 +141,7 @@ blockToDocbook opts (Para lst) = blockToDocbook opts (BlockQuote blocks) = inTagsIndented "blockquote" (blocksToDocbook opts blocks) blockToDocbook opts (CodeBlock str) = - text "<screen>\n" <> text (encodeEntities True str) <> text "\n</screen>" + text "<screen>\n" <> text (encodeEntities str) <> text "\n</screen>" blockToDocbook opts (BulletList lst) = inTagsIndented "itemizedlist" $ listItemsToDocbook opts lst blockToDocbook opts (OrderedList lst) = @@ -198,7 +198,7 @@ inlinesToDocbook opts lst = hcat (map (inlineToDocbook opts) lst) -- | Convert an inline element to Docbook. inlineToDocbook :: WriterOptions -> Inline -> Doc -inlineToDocbook opts (Str str) = text $ stringToSGML True str +inlineToDocbook opts (Str str) = text $ stringToSGML str inlineToDocbook opts (Emph lst) = inTagsSimple "emphasis" (inlinesToDocbook opts lst) inlineToDocbook opts (Strong lst) = @@ -211,7 +211,7 @@ inlineToDocbook opts Ellipses = text "…" inlineToDocbook opts EmDash = text "—" inlineToDocbook opts EnDash = text "–" inlineToDocbook opts (Code str) = - inTagsSimple "literal" $ text (encodeEntities True str) + inTagsSimple "literal" $ text (encodeEntities str) inlineToDocbook opts (TeX str) = inlineToDocbook opts (Code str) inlineToDocbook opts (HtmlInline str) = empty inlineToDocbook opts LineBreak = @@ -219,7 +219,7 @@ inlineToDocbook opts LineBreak = inlineToDocbook opts Space = char ' ' inlineToDocbook opts (Link txt (Src src tit)) = if isPrefixOf "mailto:" src - then inTagsSimple "email" $ text (encodeEntities True $ drop 7 src) + then inTagsSimple "email" $ text (encodeEntities $ drop 7 src) else inTags False "ulink" [("url", src)] $ inlinesToDocbook opts txt inlineToDocbook opts (Link text (Ref ref)) = empty -- shouldn't occur inlineToDocbook opts (Image alt (Src src tit)) = @@ -227,7 +227,7 @@ inlineToDocbook opts (Image alt (Src src tit)) = then empty else inTagsIndented "objectinfo" $ inTagsIndented "title" - (text $ stringToSGML True tit) in + (text $ stringToSGML tit) in inTagsIndented "inlinemediaobject" $ inTagsIndented "imageobject" $ titleDoc $$ selfClosingTag "imagedata" [("fileref", src)] diff --git a/src/Text/Pandoc/Writers/HTML.hs b/src/Text/Pandoc/Writers/HTML.hs index 7c89d6352..196aafad3 100644 --- a/src/Text/Pandoc/Writers/HTML.hs +++ b/src/Text/Pandoc/Writers/HTML.hs @@ -127,11 +127,11 @@ htmlHeader opts (Meta title authors date) = then empty else selfClosingTag "meta" [("name", "author"), ("content", - joinWithSep ", " (map (stringToSGML False) authors))] + joinWithSep ", " (map stringToSGML authors))] datetext = if (date == "") then empty else selfClosingTag "meta" [("name", "date"), - ("content", stringToSGML False date)] in + ("content", stringToSGML date)] in text (writerHeader opts) $$ authortext $$ datetext $$ titletext $$ text "</head>\n<body>" @@ -168,7 +168,7 @@ blockToHtml opts (Note ref lst) = (text "↩") blockToHtml opts (Key _ _) = empty blockToHtml opts (CodeBlock str) = - text "<pre><code>" <> text (encodeEntities False str) <> text "\n</code></pre>" + text "<pre><code>" <> text (encodeEntities str) <> text "\n</code></pre>" blockToHtml opts (RawHtml str) = text str blockToHtml opts (BulletList lst) = let attribs = if (writerIncremental opts) @@ -234,7 +234,7 @@ inlineToHtml opts (Emph lst) = inlineToHtml opts (Strong lst) = inTagsSimple "strong" (inlineListToHtml opts lst) inlineToHtml opts (Code str) = - inTagsSimple "code" $ text (encodeEntities False str) + inTagsSimple "code" $ text (encodeEntities str) inlineToHtml opts (Quoted SingleQuote lst) = text "‘" <> (inlineListToHtml opts lst) <> text "’" inlineToHtml opts (Quoted DoubleQuote lst) = @@ -243,16 +243,16 @@ inlineToHtml opts EmDash = text "—" inlineToHtml opts EnDash = text "–" inlineToHtml opts Ellipses = text "…" inlineToHtml opts Apostrophe = text "’" -inlineToHtml opts (Str str) = text $ stringToSGML False str -inlineToHtml opts (TeX str) = text $ encodeEntities False str +inlineToHtml opts (Str str) = text $ stringToSGML str +inlineToHtml opts (TeX str) = text $ encodeEntities str inlineToHtml opts (HtmlInline str) = text str inlineToHtml opts (LineBreak) = selfClosingTag "br" [] inlineToHtml opts Space = space inlineToHtml opts (Link txt (Src src tit)) = - let title = stringToSGML False tit in + let title = stringToSGML tit in if (isPrefixOf "mailto:" src) then obfuscateLink opts txt src - else inTags False "a" ([("href", encodeEntities False src)] ++ + else inTags False "a" ([("href", encodeEntities src)] ++ if null tit then [] else [("title", title)]) (inlineListToHtml opts txt) inlineToHtml opts (Link txt (Ref ref)) = @@ -260,7 +260,7 @@ inlineToHtml opts (Link txt (Ref ref)) = (inlineListToHtml opts ref) <> char ']' -- this is what markdown does, for better or worse inlineToHtml opts (Image alt (Src source tit)) = - let title = stringToSGML False tit + let title = stringToSGML tit alternate = render $ inlineListToHtml opts alt in selfClosingTag "img" $ [("src", source)] ++ (if null alternate then [] else [("alt", alternate)]) ++ |