aboutsummaryrefslogtreecommitdiff
path: root/src/Text/Pandoc
diff options
context:
space:
mode:
Diffstat (limited to 'src/Text/Pandoc')
-rw-r--r--src/Text/Pandoc/Entities.hs137
-rw-r--r--src/Text/Pandoc/Readers/HTML.hs15
-rw-r--r--src/Text/Pandoc/Readers/Markdown.hs13
-rw-r--r--src/Text/Pandoc/Shared.hs58
-rw-r--r--src/Text/Pandoc/Writers/Docbook.hs30
-rw-r--r--src/Text/Pandoc/Writers/HTML.hs20
-rw-r--r--src/Text/Pandoc/Writers/Markdown.hs1
7 files changed, 133 insertions, 141 deletions
diff --git a/src/Text/Pandoc/Entities.hs b/src/Text/Pandoc/Entities.hs
index ea5676b79..3e68db35c 100644
--- a/src/Text/Pandoc/Entities.hs
+++ b/src/Text/Pandoc/Entities.hs
@@ -29,62 +29,109 @@ Functions for encoding unicode characters as entity references,
and vice versa.
-}
module Text.Pandoc.Entities (
- entityToChar,
charToEntity,
- decodeEntities,
+ charToNumericalEntity,
+ specialCharToEntity,
encodeEntities,
+ decodeEntities,
+ stringToSGML,
characterEntity
) where
import Data.Char ( chr, ord )
-import Text.Regex ( mkRegex, matchRegexAll, Regex )
-import Maybe ( fromMaybe )
+import Text.ParserCombinators.Parsec
+import Data.Maybe ( fromMaybe )
--- | Regular expression for numerical coded entity.
-numericalEntity :: Text.Regex.Regex
-numericalEntity = mkRegex "&#([0-9]+|[xX][0-9A-Fa-f]+);"
+-- | Returns a string containing an entity reference for the character.
+charToEntity :: Char -> String
+charToEntity char =
+ let matches = filter (\(entity, character) -> (character == char))
+ entityTable in
+ if (length matches) == 0
+ then charToNumericalEntity char
+ else fst (head matches)
--- | Regular expression for character entity.
-characterEntity :: Text.Regex.Regex
-characterEntity = mkRegex "&#[0-9]+;|&#[xX][0-9A-Fa-f]+;|&[A-Za-z0-9]+;"
+-- | Returns a string containing a numerical entity reference for the char.
+charToNumericalEntity :: Char -> String
+charToNumericalEntity ch = "&#" ++ show (ord ch) ++ ";"
--- | Return a string with all entity references decoded to unicode characters
--- where possible.
-decodeEntities :: String -> String
-decodeEntities str =
- case (matchRegexAll characterEntity str) of
- Nothing -> str
- Just (before, match, rest, _) -> before ++ replacement ++
- (decodeEntities rest)
- where replacement = case (entityToChar match) of
- Just ch -> [ch]
- Nothing -> match
+-- | Escape special character to SGML entity.
+specialCharToEntity :: Bool -- ^ Use numerical entities only.
+ -> Char -- ^ Character to convert.
+ -> [Char]
+specialCharToEntity numericalEntities c =
+ if (c `elem` "&<>\"") || (ord c > 127)
+ then if numericalEntities
+ then charToNumericalEntity c
+ else charToEntity c
+ else [c]
--- | Returns a string with characters replaced with entity references where
--- possible.
-encodeEntities :: String -> String
-encodeEntities [] = []
-encodeEntities (c:cs) = if ord c < 128
- then c:(encodeEntities cs)
- else (charToEntity c) ++ (encodeEntities cs)
+-- | Parse SGML character entity.
+characterEntity :: GenParser Char st Char
+characterEntity = namedEntity <|> hexEntity <|> decimalEntity <?> "SGML entity"
--- | If the string is a valid entity reference, returns @Just@ the character,
--- otherwise @Nothing@.
-entityToChar :: String -> Maybe Char
-entityToChar entity =
- case (lookup entity entityTable) of
- Just ch -> Just ch
- Nothing -> case (matchRegexAll numericalEntity entity) of
- Just (_, _, _, [sub]) -> Just (chr (read ('0':sub)))
- Nothing -> Nothing
+-- | Parse SGML character entity.
+namedEntity :: GenParser Char st Char
+namedEntity = try $ do
+ st <- char '&'
+ body <- many1 alphaNum
+ end <- char ';'
+ let entity = "&" ++ body ++ ";"
+ return $ case (lookup entity entityTable) of
+ Just ch -> ch
+ Nothing -> '?'
+
+-- | Parse SGML hexadecimal entity.
+hexEntity :: GenParser Char st Char
+hexEntity = try $ do
+ st <- string "&#"
+ hex <- oneOf "Xx"
+ body <- many1 (oneOf "0123456789ABCDEFabcdef")
+ end <- char ';'
+ return $ chr $ read ('0':'x':body)
--- | Returns a string containing an entity reference for the character.
-charToEntity :: Char -> String
-charToEntity char =
- let matches = filter (\(entity, character) -> (character == char))
- entityTable in
- if (length matches) == 0
- then "&#" ++ show (ord char) ++ ";"
- else fst (head matches)
+-- | Parse SGML decimal entity.
+decimalEntity :: GenParser Char st Char
+decimalEntity = try $ do
+ st <- string "&#"
+ body <- many1 digit
+ end <- char ';'
+ return $ chr $ read body
+
+-- | Escape string as needed for SGML. Entity references are not preserved.
+encodeEntities :: Bool -- ^ Use only numerical entities.
+ -> String -- ^ String to convert.
+ -> String
+encodeEntities numericalEntities =
+ concatMap (specialCharToEntity numericalEntities)
+
+-- | Escape string as needed for SGML, using only numerical entities.
+-- Entity references are not preserved.
+encodeEntitiesNumerical :: String -> String
+encodeEntitiesNumerical =
+ concatMap (\c -> "&#" ++ show (ord c) ++ ";")
+
+-- | Convert entities in a string to characters.
+decodeEntities :: String -> String
+decodeEntities str =
+ case parse (many (characterEntity <|> anyChar)) str str of
+ Left err -> error $ "\nError: " ++ show err
+ Right result -> result
+
+-- | Escape string for SGML, preserving entity references.
+stringToSGML :: Bool -- ^ Use only numerical entities.
+ -> String -- ^ String to convert.
+ -> String
+stringToSGML numericalEntities str =
+ let nonentity = do
+ notFollowedBy characterEntity
+ c <- anyChar
+ return $ specialCharToEntity numericalEntities c
+ entity = do
+ ent <- manyTill anyChar (char ';')
+ return (ent ++ ";") in
+ case parse (many (nonentity <|> entity)) str str of
+ Left err -> error $ "\nError: " ++ show err
+ Right result -> concat result
entityTable :: [(String, Char)]
entityTable = [
diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs
index 135a90ea8..fc06b657e 100644
--- a/src/Text/Pandoc/Readers/HTML.hs
+++ b/src/Text/Pandoc/Readers/HTML.hs
@@ -44,7 +44,7 @@ import Text.ParserCombinators.Parsec
import Text.ParserCombinators.Pandoc
import Text.Pandoc.Definition
import Text.Pandoc.Shared
-import Text.Pandoc.Entities ( decodeEntities, entityToChar )
+import Text.Pandoc.Entities ( characterEntity, decodeEntities )
import Maybe ( fromMaybe )
import Data.List ( intersect, takeWhile, dropWhile )
import Data.Char ( toUpper, toLower, isAlphaNum )
@@ -391,14 +391,9 @@ text = choice [ entity, strong, emph, code, str, linebreak, whitespace ] <?> "t
special = choice [ link, image, rawHtmlInline ] <?>
"link, inline html, or image"
-entity = try (do
- char '&'
- body <- choice [(many1 letter), (try (do
- char '#'
- num <- many1 digit
- return ("#" ++ num)))]
- char ';'
- return (Str [fromMaybe '?' (entityToChar ("&" ++ body ++ ";"))]))
+entity = do
+ ent <- characterEntity
+ return $ Str [ent]
code = try (do
htmlTag "code"
@@ -439,7 +434,7 @@ linebreak = do
str = do
result <- many1 (noneOf "<& \t\n")
- return (Str (decodeEntities result))
+ return (Str result)
--
-- links and images
diff --git a/src/Text/Pandoc/Readers/Markdown.hs b/src/Text/Pandoc/Readers/Markdown.hs
index 3aa0a6f12..9b3f047e9 100644
--- a/src/Text/Pandoc/Readers/Markdown.hs
+++ b/src/Text/Pandoc/Readers/Markdown.hs
@@ -42,7 +42,7 @@ import Text.Pandoc.Readers.HTML ( rawHtmlBlock,
anyHtmlTag, anyHtmlEndTag,
htmlEndTag, extractTagType,
htmlBlockElement )
-import Text.Pandoc.Entities ( decodeEntities )
+import Text.Pandoc.Entities ( characterEntity )
import Text.ParserCombinators.Parsec
-- | Read markdown from an input string and return a Pandoc document.
@@ -88,12 +88,13 @@ blockQuoteChar = '>'
hyphenChar = '-'
ellipsesChar = '.'
listColSepChar = '|'
+entityStart = '&'
-- treat these as potentially non-text when parsing inline:
specialChars = [escapeChar, labelStart, labelEnd, emphStart, emphEnd,
emphStartAlt, emphEndAlt, codeStart, codeEnd, autoLinkEnd,
autoLinkStart, mathStart, mathEnd, imageStart, noteStart,
- hyphenChar, ellipsesChar] ++ quoteChars
+ hyphenChar, ellipsesChar, entityStart] ++ quoteChars
--
-- auxiliary functions
@@ -674,7 +675,7 @@ text = choice [ escapedChar, math, strong, emph, smartPunctuation,
code, ltSign, symbol,
str, linebreak, tabchar, whitespace, endline ] <?> "text"
-inline = choice [ rawLaTeXInline', escapedChar, special, text ] <?> "inline"
+inline = choice [ rawLaTeXInline', escapedChar, entity, special, text ] <?> "inline"
special = choice [ noteRef, inlineNote, link, referenceLink, rawHtmlInline',
autoLink, image ] <?> "link, inline html, note, or image"
@@ -827,9 +828,13 @@ linebreak = try (do
nonEndline = noneOf endLineChars
+entity = do
+ ent <- characterEntity
+ return $ Str [ent]
+
str = do
result <- many1 ((noneOf (specialChars ++ spaceChars ++ endLineChars)))
- return (Str (decodeEntities result))
+ return (Str result)
-- an endline character that can be treated as a space, not a structural break
endline = try (do
diff --git a/src/Text/Pandoc/Shared.hs b/src/Text/Pandoc/Shared.hs
index b82357d7a..f63ca4ce4 100644
--- a/src/Text/Pandoc/Shared.hs
+++ b/src/Text/Pandoc/Shared.hs
@@ -65,8 +65,6 @@ module Text.Pandoc.Shared (
replaceReferenceLinks,
replaceRefLinksBlockList,
-- * SGML
- escapeSGML,
- stringToSGML,
inTags,
selfClosingTag,
inTagsSimple,
@@ -74,7 +72,7 @@ module Text.Pandoc.Shared (
) where
import Text.Pandoc.Definition
import Text.ParserCombinators.Parsec as Parsec
-import Text.Pandoc.Entities ( decodeEntities, charToEntity )
+import Text.Pandoc.Entities ( decodeEntities, encodeEntities, stringToSGML )
import Text.PrettyPrint.HughesPJ as PP ( text, char, (<>),
($$), nest, Doc, isEmpty )
import Data.Char ( toLower, ord )
@@ -523,61 +521,11 @@ replaceRefLinksInline keytable (Quoted t lst) =
Quoted t (map (replaceRefLinksInline keytable) lst)
replaceRefLinksInline keytable other = other
--- | Parse SGML character entity.
-sgmlCharacterEntity :: GenParser Char st [Char]
-sgmlCharacterEntity = sgmlNamedEntity <|> sgmlDecimalEntity <|>
- sgmlHexEntity <?> "SGML entity"
-
--- | Parse SGML character entity.
-sgmlNamedEntity :: GenParser Char st [Char]
-sgmlNamedEntity = try $ do
- st <- Parsec.char '&'
- body <- many1 alphaNum
- end <- Parsec.char ';'
- return $ (st:body) ++ [end]
-
--- | Parse SGML decimal entity.
-sgmlDecimalEntity :: GenParser Char st [Char]
-sgmlDecimalEntity = try $ do
- st <- string "&#"
- body <- many1 (oneOf "0123456789")
- end <- Parsec.char ';'
- return $ st ++ body ++ [end]
-
--- | Parse SGML hexadecimal entity.
-sgmlHexEntity :: GenParser Char st [Char]
-sgmlHexEntity = try $ do
- st <- string "&#"
- hex <- oneOf "Xx"
- body <- many1 (oneOf "0123456789ABCDEFabcdef")
- end <- Parsec.char ';'
- return $ st ++ (hex:body) ++ [end]
-
--- | Escape special character to SGML entity.
-specialCharToEntity :: Char -> [Char]
-specialCharToEntity c = if (c `elem` "&<>\"") || (ord c > 127)
- then charToEntity c
- else [c]
-
--- | Escape string, preserving character entities.
-stringToSGML :: String -> String
-stringToSGML str =
- let segment = sgmlCharacterEntity <|>
- (do{c <- anyChar;
- return $ specialCharToEntity c})
- sgmlString = (do{segs <- many segment; return $ concat segs}) in
- case parse sgmlString str str of
- Left err -> error $ "\nError:\n" ++ show err
- Right result -> result
-
--- | Escape string as needed for SGML. Entity references are not preserved.
-escapeSGML :: String -> String
-escapeSGML = concatMap specialCharToEntity
-
-- | Return a text object with a string of formatted SGML attributes.
attributeList :: [(String, String)] -> Doc
attributeList = text . concatMap
- (\(a, b) -> " " ++ stringToSGML a ++ "=\"" ++ stringToSGML b ++ "\"")
+ (\(a, b) -> " " ++ stringToSGML True a ++ "=\"" ++
+ stringToSGML True b ++ "\"")
-- | Put the supplied contents between start and end tags of tagType,
-- with specified attributes and (if specified) indentation.
diff --git a/src/Text/Pandoc/Writers/Docbook.hs b/src/Text/Pandoc/Writers/Docbook.hs
index ec3801a9a..0c83d0ea0 100644
--- a/src/Text/Pandoc/Writers/Docbook.hs
+++ b/src/Text/Pandoc/Writers/Docbook.hs
@@ -32,10 +32,9 @@ module Text.Pandoc.Writers.Docbook (
) where
import Text.Pandoc.Definition
import Text.Pandoc.Shared
-import Text.Pandoc.Entities ( encodeEntities )
-import Text.Regex ( mkRegex, matchRegex )
+import Text.Pandoc.Entities ( encodeEntities, stringToSGML )
import Data.Char ( toLower, ord )
-import Data.List ( isPrefixOf, partition )
+import Data.List ( isPrefixOf, partition, drop )
import Text.PrettyPrint.HughesPJ hiding ( Str )
-- | Data structure for defining hierarchical Pandoc documents
@@ -65,8 +64,8 @@ authorToDocbook name = inTagsIndented "author" $
then -- last name first
let (lastname, rest) = break (==',') name
firstname = removeLeadingSpace rest in
- inTagsSimple "firstname" (text $ stringToSGML firstname) <>
- inTagsSimple "surname" (text $ stringToSGML lastname)
+ inTagsSimple "firstname" (text $ stringToSGML True firstname) <>
+ inTagsSimple "surname" (text $ stringToSGML True lastname)
else -- last name last
let namewords = words name
lengthname = length namewords
@@ -74,8 +73,8 @@ authorToDocbook name = inTagsIndented "author" $
0 -> ("","")
1 -> ("", name)
n -> (joinWithSep " " (take (n-1) namewords), last namewords) in
- inTagsSimple "firstname" (text $ stringToSGML firstname) $$
- inTagsSimple "surname" (text $ stringToSGML lastname)
+ inTagsSimple "firstname" (text $ stringToSGML True firstname) $$
+ inTagsSimple "surname" (text $ stringToSGML True lastname)
-- | Convert Pandoc document to string in Docbook format.
writeDocbook :: WriterOptions -> Pandoc -> String
@@ -87,7 +86,7 @@ writeDocbook opts (Pandoc (Meta title authors date) blocks) =
then inTagsIndented "articleinfo" $
(inTagsSimple "title" (wrap opts title)) $$
(vcat (map authorToDocbook authors)) $$
- (inTagsSimple "date" (text $ stringToSGML date))
+ (inTagsSimple "date" (text $ stringToSGML True date))
else empty
blocks' = replaceReferenceLinks blocks
(noteBlocks, blocks'') = partition isNoteBlock blocks'
@@ -142,7 +141,7 @@ blockToDocbook opts (Para lst) =
blockToDocbook opts (BlockQuote blocks) =
inTagsIndented "blockquote" (blocksToDocbook opts blocks)
blockToDocbook opts (CodeBlock str) =
- text "<screen>\n" <> text (escapeSGML str) <> text "\n</screen>"
+ text "<screen>\n" <> text (encodeEntities True str) <> text "\n</screen>"
blockToDocbook opts (BulletList lst) =
inTagsIndented "itemizedlist" $ listItemsToDocbook opts lst
blockToDocbook opts (OrderedList lst) =
@@ -199,7 +198,7 @@ inlinesToDocbook opts lst = hcat (map (inlineToDocbook opts) lst)
-- | Convert an inline element to Docbook.
inlineToDocbook :: WriterOptions -> Inline -> Doc
-inlineToDocbook opts (Str str) = text $ stringToSGML str
+inlineToDocbook opts (Str str) = text $ stringToSGML True str
inlineToDocbook opts (Emph lst) =
inTagsSimple "emphasis" (inlinesToDocbook opts lst)
inlineToDocbook opts (Strong lst) =
@@ -212,24 +211,23 @@ inlineToDocbook opts Ellipses = text "&#8230;"
inlineToDocbook opts EmDash = text "&#8212;"
inlineToDocbook opts EnDash = text "&#8211;"
inlineToDocbook opts (Code str) =
- inTagsSimple "literal" $ text (escapeSGML str)
+ inTagsSimple "literal" $ text (encodeEntities True str)
inlineToDocbook opts (TeX str) = inlineToDocbook opts (Code str)
inlineToDocbook opts (HtmlInline str) = empty
inlineToDocbook opts LineBreak =
text $ "<literallayout></literallayout>"
inlineToDocbook opts Space = char ' '
inlineToDocbook opts (Link txt (Src src tit)) =
- case (matchRegex (mkRegex "mailto:(.*)") src) of
- Just [addr] -> inTagsSimple "email" $ text (escapeSGML addr)
- Nothing -> inTags False "ulink" [("url", src)] $
- inlinesToDocbook opts txt
+ if isPrefixOf "mailto:" src
+ then inTagsSimple "email" $ text (encodeEntities True $ drop 7 src)
+ else inTags False "ulink" [("url", src)] $ inlinesToDocbook opts txt
inlineToDocbook opts (Link text (Ref ref)) = empty -- shouldn't occur
inlineToDocbook opts (Image alt (Src src tit)) =
let titleDoc = if null tit
then empty
else inTagsIndented "objectinfo" $
inTagsIndented "title"
- (text $ stringToSGML tit) in
+ (text $ stringToSGML True tit) in
inTagsIndented "inlinemediaobject" $
inTagsIndented "imageobject" $
titleDoc $$ selfClosingTag "imagedata" [("fileref", src)]
diff --git a/src/Text/Pandoc/Writers/HTML.hs b/src/Text/Pandoc/Writers/HTML.hs
index 5465e125d..7c89d6352 100644
--- a/src/Text/Pandoc/Writers/HTML.hs
+++ b/src/Text/Pandoc/Writers/HTML.hs
@@ -32,7 +32,7 @@ module Text.Pandoc.Writers.HTML (
) where
import Text.Pandoc.Definition
import Text.Pandoc.Shared
-import Text.Pandoc.Entities ( encodeEntities )
+import Text.Pandoc.Entities ( encodeEntities, stringToSGML )
import Text.Regex ( mkRegex, matchRegex )
import Numeric ( showHex )
import Data.Char ( ord, toLower )
@@ -127,11 +127,11 @@ htmlHeader opts (Meta title authors date) =
then empty
else selfClosingTag "meta" [("name", "author"),
("content",
- joinWithSep ", " (map stringToSGML authors))]
+ joinWithSep ", " (map (stringToSGML False) authors))]
datetext = if (date == "")
then empty
else selfClosingTag "meta" [("name", "date"),
- ("content", stringToSGML date)] in
+ ("content", stringToSGML False date)] in
text (writerHeader opts) $$ authortext $$ datetext $$ titletext $$
text "</head>\n<body>"
@@ -168,7 +168,7 @@ blockToHtml opts (Note ref lst) =
(text "&#8617;")
blockToHtml opts (Key _ _) = empty
blockToHtml opts (CodeBlock str) =
- text "<pre><code>" <> text (escapeSGML str) <> text "\n</code></pre>"
+ text "<pre><code>" <> text (encodeEntities False str) <> text "\n</code></pre>"
blockToHtml opts (RawHtml str) = text str
blockToHtml opts (BulletList lst) =
let attribs = if (writerIncremental opts)
@@ -234,7 +234,7 @@ inlineToHtml opts (Emph lst) =
inlineToHtml opts (Strong lst) =
inTagsSimple "strong" (inlineListToHtml opts lst)
inlineToHtml opts (Code str) =
- inTagsSimple "code" $ text (escapeSGML str)
+ inTagsSimple "code" $ text (encodeEntities False str)
inlineToHtml opts (Quoted SingleQuote lst) =
text "&lsquo;" <> (inlineListToHtml opts lst) <> text "&rsquo;"
inlineToHtml opts (Quoted DoubleQuote lst) =
@@ -243,16 +243,16 @@ inlineToHtml opts EmDash = text "&mdash;"
inlineToHtml opts EnDash = text "&ndash;"
inlineToHtml opts Ellipses = text "&hellip;"
inlineToHtml opts Apostrophe = text "&rsquo;"
-inlineToHtml opts (Str str) = text $ stringToSGML str
-inlineToHtml opts (TeX str) = text $ escapeSGML str
+inlineToHtml opts (Str str) = text $ stringToSGML False str
+inlineToHtml opts (TeX str) = text $ encodeEntities False str
inlineToHtml opts (HtmlInline str) = text str
inlineToHtml opts (LineBreak) = selfClosingTag "br" []
inlineToHtml opts Space = space
inlineToHtml opts (Link txt (Src src tit)) =
- let title = stringToSGML tit in
+ let title = stringToSGML False tit in
if (isPrefixOf "mailto:" src)
then obfuscateLink opts txt src
- else inTags False "a" ([("href", escapeSGML src)] ++
+ else inTags False "a" ([("href", encodeEntities False src)] ++
if null tit then [] else [("title", title)])
(inlineListToHtml opts txt)
inlineToHtml opts (Link txt (Ref ref)) =
@@ -260,7 +260,7 @@ inlineToHtml opts (Link txt (Ref ref)) =
(inlineListToHtml opts ref) <> char ']'
-- this is what markdown does, for better or worse
inlineToHtml opts (Image alt (Src source tit)) =
- let title = stringToSGML tit
+ let title = stringToSGML False tit
alternate = render $ inlineListToHtml opts alt in
selfClosingTag "img" $ [("src", source)] ++
(if null alternate then [] else [("alt", alternate)]) ++
diff --git a/src/Text/Pandoc/Writers/Markdown.hs b/src/Text/Pandoc/Writers/Markdown.hs
index 5eff079f8..46c47bf74 100644
--- a/src/Text/Pandoc/Writers/Markdown.hs
+++ b/src/Text/Pandoc/Writers/Markdown.hs
@@ -32,7 +32,6 @@ Markdown: <http://daringfireball.net/projects/markdown/>
module Text.Pandoc.Writers.Markdown (
writeMarkdown
) where
-import Text.Regex ( matchRegex, mkRegex )
import Text.Pandoc.Definition
import Text.Pandoc.Shared
import Data.List ( group, isPrefixOf, drop )