diff options
Diffstat (limited to 'src/Text')
-rw-r--r-- | src/Text/Pandoc/Entities.hs | 137 | ||||
-rw-r--r-- | src/Text/Pandoc/Readers/HTML.hs | 15 | ||||
-rw-r--r-- | src/Text/Pandoc/Readers/Markdown.hs | 13 | ||||
-rw-r--r-- | src/Text/Pandoc/Shared.hs | 58 | ||||
-rw-r--r-- | src/Text/Pandoc/Writers/Docbook.hs | 30 | ||||
-rw-r--r-- | src/Text/Pandoc/Writers/HTML.hs | 20 | ||||
-rw-r--r-- | src/Text/Pandoc/Writers/Markdown.hs | 1 |
7 files changed, 133 insertions, 141 deletions
diff --git a/src/Text/Pandoc/Entities.hs b/src/Text/Pandoc/Entities.hs index ea5676b79..3e68db35c 100644 --- a/src/Text/Pandoc/Entities.hs +++ b/src/Text/Pandoc/Entities.hs @@ -29,62 +29,109 @@ Functions for encoding unicode characters as entity references, and vice versa. -} module Text.Pandoc.Entities ( - entityToChar, charToEntity, - decodeEntities, + charToNumericalEntity, + specialCharToEntity, encodeEntities, + decodeEntities, + stringToSGML, characterEntity ) where import Data.Char ( chr, ord ) -import Text.Regex ( mkRegex, matchRegexAll, Regex ) -import Maybe ( fromMaybe ) +import Text.ParserCombinators.Parsec +import Data.Maybe ( fromMaybe ) --- | Regular expression for numerical coded entity. -numericalEntity :: Text.Regex.Regex -numericalEntity = mkRegex "&#([0-9]+|[xX][0-9A-Fa-f]+);" +-- | Returns a string containing an entity reference for the character. +charToEntity :: Char -> String +charToEntity char = + let matches = filter (\(entity, character) -> (character == char)) + entityTable in + if (length matches) == 0 + then charToNumericalEntity char + else fst (head matches) --- | Regular expression for character entity. -characterEntity :: Text.Regex.Regex -characterEntity = mkRegex "&#[0-9]+;|&#[xX][0-9A-Fa-f]+;|&[A-Za-z0-9]+;" +-- | Returns a string containing a numerical entity reference for the char. +charToNumericalEntity :: Char -> String +charToNumericalEntity ch = "&#" ++ show (ord ch) ++ ";" --- | Return a string with all entity references decoded to unicode characters --- where possible. -decodeEntities :: String -> String -decodeEntities str = - case (matchRegexAll characterEntity str) of - Nothing -> str - Just (before, match, rest, _) -> before ++ replacement ++ - (decodeEntities rest) - where replacement = case (entityToChar match) of - Just ch -> [ch] - Nothing -> match +-- | Escape special character to SGML entity. +specialCharToEntity :: Bool -- ^ Use numerical entities only. + -> Char -- ^ Character to convert. + -> [Char] +specialCharToEntity numericalEntities c = + if (c `elem` "&<>\"") || (ord c > 127) + then if numericalEntities + then charToNumericalEntity c + else charToEntity c + else [c] --- | Returns a string with characters replaced with entity references where --- possible. -encodeEntities :: String -> String -encodeEntities [] = [] -encodeEntities (c:cs) = if ord c < 128 - then c:(encodeEntities cs) - else (charToEntity c) ++ (encodeEntities cs) +-- | Parse SGML character entity. +characterEntity :: GenParser Char st Char +characterEntity = namedEntity <|> hexEntity <|> decimalEntity <?> "SGML entity" --- | If the string is a valid entity reference, returns @Just@ the character, --- otherwise @Nothing@. -entityToChar :: String -> Maybe Char -entityToChar entity = - case (lookup entity entityTable) of - Just ch -> Just ch - Nothing -> case (matchRegexAll numericalEntity entity) of - Just (_, _, _, [sub]) -> Just (chr (read ('0':sub))) - Nothing -> Nothing +-- | Parse SGML character entity. +namedEntity :: GenParser Char st Char +namedEntity = try $ do + st <- char '&' + body <- many1 alphaNum + end <- char ';' + let entity = "&" ++ body ++ ";" + return $ case (lookup entity entityTable) of + Just ch -> ch + Nothing -> '?' + +-- | Parse SGML hexadecimal entity. +hexEntity :: GenParser Char st Char +hexEntity = try $ do + st <- string "&#" + hex <- oneOf "Xx" + body <- many1 (oneOf "0123456789ABCDEFabcdef") + end <- char ';' + return $ chr $ read ('0':'x':body) --- | Returns a string containing an entity reference for the character. -charToEntity :: Char -> String -charToEntity char = - let matches = filter (\(entity, character) -> (character == char)) - entityTable in - if (length matches) == 0 - then "&#" ++ show (ord char) ++ ";" - else fst (head matches) +-- | Parse SGML decimal entity. +decimalEntity :: GenParser Char st Char +decimalEntity = try $ do + st <- string "&#" + body <- many1 digit + end <- char ';' + return $ chr $ read body + +-- | Escape string as needed for SGML. Entity references are not preserved. +encodeEntities :: Bool -- ^ Use only numerical entities. + -> String -- ^ String to convert. + -> String +encodeEntities numericalEntities = + concatMap (specialCharToEntity numericalEntities) + +-- | Escape string as needed for SGML, using only numerical entities. +-- Entity references are not preserved. +encodeEntitiesNumerical :: String -> String +encodeEntitiesNumerical = + concatMap (\c -> "&#" ++ show (ord c) ++ ";") + +-- | Convert entities in a string to characters. +decodeEntities :: String -> String +decodeEntities str = + case parse (many (characterEntity <|> anyChar)) str str of + Left err -> error $ "\nError: " ++ show err + Right result -> result + +-- | Escape string for SGML, preserving entity references. +stringToSGML :: Bool -- ^ Use only numerical entities. + -> String -- ^ String to convert. + -> String +stringToSGML numericalEntities str = + let nonentity = do + notFollowedBy characterEntity + c <- anyChar + return $ specialCharToEntity numericalEntities c + entity = do + ent <- manyTill anyChar (char ';') + return (ent ++ ";") in + case parse (many (nonentity <|> entity)) str str of + Left err -> error $ "\nError: " ++ show err + Right result -> concat result entityTable :: [(String, Char)] entityTable = [ diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs index 135a90ea8..fc06b657e 100644 --- a/src/Text/Pandoc/Readers/HTML.hs +++ b/src/Text/Pandoc/Readers/HTML.hs @@ -44,7 +44,7 @@ import Text.ParserCombinators.Parsec import Text.ParserCombinators.Pandoc import Text.Pandoc.Definition import Text.Pandoc.Shared -import Text.Pandoc.Entities ( decodeEntities, entityToChar ) +import Text.Pandoc.Entities ( characterEntity, decodeEntities ) import Maybe ( fromMaybe ) import Data.List ( intersect, takeWhile, dropWhile ) import Data.Char ( toUpper, toLower, isAlphaNum ) @@ -391,14 +391,9 @@ text = choice [ entity, strong, emph, code, str, linebreak, whitespace ] <?> "t special = choice [ link, image, rawHtmlInline ] <?> "link, inline html, or image" -entity = try (do - char '&' - body <- choice [(many1 letter), (try (do - char '#' - num <- many1 digit - return ("#" ++ num)))] - char ';' - return (Str [fromMaybe '?' (entityToChar ("&" ++ body ++ ";"))])) +entity = do + ent <- characterEntity + return $ Str [ent] code = try (do htmlTag "code" @@ -439,7 +434,7 @@ linebreak = do str = do result <- many1 (noneOf "<& \t\n") - return (Str (decodeEntities result)) + return (Str result) -- -- links and images diff --git a/src/Text/Pandoc/Readers/Markdown.hs b/src/Text/Pandoc/Readers/Markdown.hs index 3aa0a6f12..9b3f047e9 100644 --- a/src/Text/Pandoc/Readers/Markdown.hs +++ b/src/Text/Pandoc/Readers/Markdown.hs @@ -42,7 +42,7 @@ import Text.Pandoc.Readers.HTML ( rawHtmlBlock, anyHtmlTag, anyHtmlEndTag, htmlEndTag, extractTagType, htmlBlockElement ) -import Text.Pandoc.Entities ( decodeEntities ) +import Text.Pandoc.Entities ( characterEntity ) import Text.ParserCombinators.Parsec -- | Read markdown from an input string and return a Pandoc document. @@ -88,12 +88,13 @@ blockQuoteChar = '>' hyphenChar = '-' ellipsesChar = '.' listColSepChar = '|' +entityStart = '&' -- treat these as potentially non-text when parsing inline: specialChars = [escapeChar, labelStart, labelEnd, emphStart, emphEnd, emphStartAlt, emphEndAlt, codeStart, codeEnd, autoLinkEnd, autoLinkStart, mathStart, mathEnd, imageStart, noteStart, - hyphenChar, ellipsesChar] ++ quoteChars + hyphenChar, ellipsesChar, entityStart] ++ quoteChars -- -- auxiliary functions @@ -674,7 +675,7 @@ text = choice [ escapedChar, math, strong, emph, smartPunctuation, code, ltSign, symbol, str, linebreak, tabchar, whitespace, endline ] <?> "text" -inline = choice [ rawLaTeXInline', escapedChar, special, text ] <?> "inline" +inline = choice [ rawLaTeXInline', escapedChar, entity, special, text ] <?> "inline" special = choice [ noteRef, inlineNote, link, referenceLink, rawHtmlInline', autoLink, image ] <?> "link, inline html, note, or image" @@ -827,9 +828,13 @@ linebreak = try (do nonEndline = noneOf endLineChars +entity = do + ent <- characterEntity + return $ Str [ent] + str = do result <- many1 ((noneOf (specialChars ++ spaceChars ++ endLineChars))) - return (Str (decodeEntities result)) + return (Str result) -- an endline character that can be treated as a space, not a structural break endline = try (do diff --git a/src/Text/Pandoc/Shared.hs b/src/Text/Pandoc/Shared.hs index b82357d7a..f63ca4ce4 100644 --- a/src/Text/Pandoc/Shared.hs +++ b/src/Text/Pandoc/Shared.hs @@ -65,8 +65,6 @@ module Text.Pandoc.Shared ( replaceReferenceLinks, replaceRefLinksBlockList, -- * SGML - escapeSGML, - stringToSGML, inTags, selfClosingTag, inTagsSimple, @@ -74,7 +72,7 @@ module Text.Pandoc.Shared ( ) where import Text.Pandoc.Definition import Text.ParserCombinators.Parsec as Parsec -import Text.Pandoc.Entities ( decodeEntities, charToEntity ) +import Text.Pandoc.Entities ( decodeEntities, encodeEntities, stringToSGML ) import Text.PrettyPrint.HughesPJ as PP ( text, char, (<>), ($$), nest, Doc, isEmpty ) import Data.Char ( toLower, ord ) @@ -523,61 +521,11 @@ replaceRefLinksInline keytable (Quoted t lst) = Quoted t (map (replaceRefLinksInline keytable) lst) replaceRefLinksInline keytable other = other --- | Parse SGML character entity. -sgmlCharacterEntity :: GenParser Char st [Char] -sgmlCharacterEntity = sgmlNamedEntity <|> sgmlDecimalEntity <|> - sgmlHexEntity <?> "SGML entity" - --- | Parse SGML character entity. -sgmlNamedEntity :: GenParser Char st [Char] -sgmlNamedEntity = try $ do - st <- Parsec.char '&' - body <- many1 alphaNum - end <- Parsec.char ';' - return $ (st:body) ++ [end] - --- | Parse SGML decimal entity. -sgmlDecimalEntity :: GenParser Char st [Char] -sgmlDecimalEntity = try $ do - st <- string "&#" - body <- many1 (oneOf "0123456789") - end <- Parsec.char ';' - return $ st ++ body ++ [end] - --- | Parse SGML hexadecimal entity. -sgmlHexEntity :: GenParser Char st [Char] -sgmlHexEntity = try $ do - st <- string "&#" - hex <- oneOf "Xx" - body <- many1 (oneOf "0123456789ABCDEFabcdef") - end <- Parsec.char ';' - return $ st ++ (hex:body) ++ [end] - --- | Escape special character to SGML entity. -specialCharToEntity :: Char -> [Char] -specialCharToEntity c = if (c `elem` "&<>\"") || (ord c > 127) - then charToEntity c - else [c] - --- | Escape string, preserving character entities. -stringToSGML :: String -> String -stringToSGML str = - let segment = sgmlCharacterEntity <|> - (do{c <- anyChar; - return $ specialCharToEntity c}) - sgmlString = (do{segs <- many segment; return $ concat segs}) in - case parse sgmlString str str of - Left err -> error $ "\nError:\n" ++ show err - Right result -> result - --- | Escape string as needed for SGML. Entity references are not preserved. -escapeSGML :: String -> String -escapeSGML = concatMap specialCharToEntity - -- | Return a text object with a string of formatted SGML attributes. attributeList :: [(String, String)] -> Doc attributeList = text . concatMap - (\(a, b) -> " " ++ stringToSGML a ++ "=\"" ++ stringToSGML b ++ "\"") + (\(a, b) -> " " ++ stringToSGML True a ++ "=\"" ++ + stringToSGML True b ++ "\"") -- | Put the supplied contents between start and end tags of tagType, -- with specified attributes and (if specified) indentation. diff --git a/src/Text/Pandoc/Writers/Docbook.hs b/src/Text/Pandoc/Writers/Docbook.hs index ec3801a9a..0c83d0ea0 100644 --- a/src/Text/Pandoc/Writers/Docbook.hs +++ b/src/Text/Pandoc/Writers/Docbook.hs @@ -32,10 +32,9 @@ module Text.Pandoc.Writers.Docbook ( ) where import Text.Pandoc.Definition import Text.Pandoc.Shared -import Text.Pandoc.Entities ( encodeEntities ) -import Text.Regex ( mkRegex, matchRegex ) +import Text.Pandoc.Entities ( encodeEntities, stringToSGML ) import Data.Char ( toLower, ord ) -import Data.List ( isPrefixOf, partition ) +import Data.List ( isPrefixOf, partition, drop ) import Text.PrettyPrint.HughesPJ hiding ( Str ) -- | Data structure for defining hierarchical Pandoc documents @@ -65,8 +64,8 @@ authorToDocbook name = inTagsIndented "author" $ then -- last name first let (lastname, rest) = break (==',') name firstname = removeLeadingSpace rest in - inTagsSimple "firstname" (text $ stringToSGML firstname) <> - inTagsSimple "surname" (text $ stringToSGML lastname) + inTagsSimple "firstname" (text $ stringToSGML True firstname) <> + inTagsSimple "surname" (text $ stringToSGML True lastname) else -- last name last let namewords = words name lengthname = length namewords @@ -74,8 +73,8 @@ authorToDocbook name = inTagsIndented "author" $ 0 -> ("","") 1 -> ("", name) n -> (joinWithSep " " (take (n-1) namewords), last namewords) in - inTagsSimple "firstname" (text $ stringToSGML firstname) $$ - inTagsSimple "surname" (text $ stringToSGML lastname) + inTagsSimple "firstname" (text $ stringToSGML True firstname) $$ + inTagsSimple "surname" (text $ stringToSGML True lastname) -- | Convert Pandoc document to string in Docbook format. writeDocbook :: WriterOptions -> Pandoc -> String @@ -87,7 +86,7 @@ writeDocbook opts (Pandoc (Meta title authors date) blocks) = then inTagsIndented "articleinfo" $ (inTagsSimple "title" (wrap opts title)) $$ (vcat (map authorToDocbook authors)) $$ - (inTagsSimple "date" (text $ stringToSGML date)) + (inTagsSimple "date" (text $ stringToSGML True date)) else empty blocks' = replaceReferenceLinks blocks (noteBlocks, blocks'') = partition isNoteBlock blocks' @@ -142,7 +141,7 @@ blockToDocbook opts (Para lst) = blockToDocbook opts (BlockQuote blocks) = inTagsIndented "blockquote" (blocksToDocbook opts blocks) blockToDocbook opts (CodeBlock str) = - text "<screen>\n" <> text (escapeSGML str) <> text "\n</screen>" + text "<screen>\n" <> text (encodeEntities True str) <> text "\n</screen>" blockToDocbook opts (BulletList lst) = inTagsIndented "itemizedlist" $ listItemsToDocbook opts lst blockToDocbook opts (OrderedList lst) = @@ -199,7 +198,7 @@ inlinesToDocbook opts lst = hcat (map (inlineToDocbook opts) lst) -- | Convert an inline element to Docbook. inlineToDocbook :: WriterOptions -> Inline -> Doc -inlineToDocbook opts (Str str) = text $ stringToSGML str +inlineToDocbook opts (Str str) = text $ stringToSGML True str inlineToDocbook opts (Emph lst) = inTagsSimple "emphasis" (inlinesToDocbook opts lst) inlineToDocbook opts (Strong lst) = @@ -212,24 +211,23 @@ inlineToDocbook opts Ellipses = text "…" inlineToDocbook opts EmDash = text "—" inlineToDocbook opts EnDash = text "–" inlineToDocbook opts (Code str) = - inTagsSimple "literal" $ text (escapeSGML str) + inTagsSimple "literal" $ text (encodeEntities True str) inlineToDocbook opts (TeX str) = inlineToDocbook opts (Code str) inlineToDocbook opts (HtmlInline str) = empty inlineToDocbook opts LineBreak = text $ "<literallayout></literallayout>" inlineToDocbook opts Space = char ' ' inlineToDocbook opts (Link txt (Src src tit)) = - case (matchRegex (mkRegex "mailto:(.*)") src) of - Just [addr] -> inTagsSimple "email" $ text (escapeSGML addr) - Nothing -> inTags False "ulink" [("url", src)] $ - inlinesToDocbook opts txt + if isPrefixOf "mailto:" src + then inTagsSimple "email" $ text (encodeEntities True $ drop 7 src) + else inTags False "ulink" [("url", src)] $ inlinesToDocbook opts txt inlineToDocbook opts (Link text (Ref ref)) = empty -- shouldn't occur inlineToDocbook opts (Image alt (Src src tit)) = let titleDoc = if null tit then empty else inTagsIndented "objectinfo" $ inTagsIndented "title" - (text $ stringToSGML tit) in + (text $ stringToSGML True tit) in inTagsIndented "inlinemediaobject" $ inTagsIndented "imageobject" $ titleDoc $$ selfClosingTag "imagedata" [("fileref", src)] diff --git a/src/Text/Pandoc/Writers/HTML.hs b/src/Text/Pandoc/Writers/HTML.hs index 5465e125d..7c89d6352 100644 --- a/src/Text/Pandoc/Writers/HTML.hs +++ b/src/Text/Pandoc/Writers/HTML.hs @@ -32,7 +32,7 @@ module Text.Pandoc.Writers.HTML ( ) where import Text.Pandoc.Definition import Text.Pandoc.Shared -import Text.Pandoc.Entities ( encodeEntities ) +import Text.Pandoc.Entities ( encodeEntities, stringToSGML ) import Text.Regex ( mkRegex, matchRegex ) import Numeric ( showHex ) import Data.Char ( ord, toLower ) @@ -127,11 +127,11 @@ htmlHeader opts (Meta title authors date) = then empty else selfClosingTag "meta" [("name", "author"), ("content", - joinWithSep ", " (map stringToSGML authors))] + joinWithSep ", " (map (stringToSGML False) authors))] datetext = if (date == "") then empty else selfClosingTag "meta" [("name", "date"), - ("content", stringToSGML date)] in + ("content", stringToSGML False date)] in text (writerHeader opts) $$ authortext $$ datetext $$ titletext $$ text "</head>\n<body>" @@ -168,7 +168,7 @@ blockToHtml opts (Note ref lst) = (text "↩") blockToHtml opts (Key _ _) = empty blockToHtml opts (CodeBlock str) = - text "<pre><code>" <> text (escapeSGML str) <> text "\n</code></pre>" + text "<pre><code>" <> text (encodeEntities False str) <> text "\n</code></pre>" blockToHtml opts (RawHtml str) = text str blockToHtml opts (BulletList lst) = let attribs = if (writerIncremental opts) @@ -234,7 +234,7 @@ inlineToHtml opts (Emph lst) = inlineToHtml opts (Strong lst) = inTagsSimple "strong" (inlineListToHtml opts lst) inlineToHtml opts (Code str) = - inTagsSimple "code" $ text (escapeSGML str) + inTagsSimple "code" $ text (encodeEntities False str) inlineToHtml opts (Quoted SingleQuote lst) = text "‘" <> (inlineListToHtml opts lst) <> text "’" inlineToHtml opts (Quoted DoubleQuote lst) = @@ -243,16 +243,16 @@ inlineToHtml opts EmDash = text "—" inlineToHtml opts EnDash = text "–" inlineToHtml opts Ellipses = text "…" inlineToHtml opts Apostrophe = text "’" -inlineToHtml opts (Str str) = text $ stringToSGML str -inlineToHtml opts (TeX str) = text $ escapeSGML str +inlineToHtml opts (Str str) = text $ stringToSGML False str +inlineToHtml opts (TeX str) = text $ encodeEntities False str inlineToHtml opts (HtmlInline str) = text str inlineToHtml opts (LineBreak) = selfClosingTag "br" [] inlineToHtml opts Space = space inlineToHtml opts (Link txt (Src src tit)) = - let title = stringToSGML tit in + let title = stringToSGML False tit in if (isPrefixOf "mailto:" src) then obfuscateLink opts txt src - else inTags False "a" ([("href", escapeSGML src)] ++ + else inTags False "a" ([("href", encodeEntities False src)] ++ if null tit then [] else [("title", title)]) (inlineListToHtml opts txt) inlineToHtml opts (Link txt (Ref ref)) = @@ -260,7 +260,7 @@ inlineToHtml opts (Link txt (Ref ref)) = (inlineListToHtml opts ref) <> char ']' -- this is what markdown does, for better or worse inlineToHtml opts (Image alt (Src source tit)) = - let title = stringToSGML tit + let title = stringToSGML False tit alternate = render $ inlineListToHtml opts alt in selfClosingTag "img" $ [("src", source)] ++ (if null alternate then [] else [("alt", alternate)]) ++ diff --git a/src/Text/Pandoc/Writers/Markdown.hs b/src/Text/Pandoc/Writers/Markdown.hs index 5eff079f8..46c47bf74 100644 --- a/src/Text/Pandoc/Writers/Markdown.hs +++ b/src/Text/Pandoc/Writers/Markdown.hs @@ -32,7 +32,6 @@ Markdown: <http://daringfireball.net/projects/markdown/> module Text.Pandoc.Writers.Markdown ( writeMarkdown ) where -import Text.Regex ( matchRegex, mkRegex ) import Text.Pandoc.Definition import Text.Pandoc.Shared import Data.List ( group, isPrefixOf, drop ) |