From 9e658673006ca8c934bb75b224fdc0e7144b4030 Mon Sep 17 00:00:00 2001 From: John MacFarlane Date: Tue, 23 Mar 2010 13:51:52 -0700 Subject: Better definition of stringToURI. Now it escapes all characters that aren't allowed in a URI. %, ?, /, and other characters that are allowed in a URI are left alone. Unicode high characters are UTF-8 encoded. --- src/Text/Pandoc/Shared.hs | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) (limited to 'src/Text/Pandoc') diff --git a/src/Text/Pandoc/Shared.hs b/src/Text/Pandoc/Shared.hs index 42e3345c8..0e1ace858 100644 --- a/src/Text/Pandoc/Shared.hs +++ b/src/Text/Pandoc/Shared.hs @@ -115,10 +115,11 @@ import Text.ParserCombinators.Parsec import Text.PrettyPrint.HughesPJ ( Doc, fsep, ($$), (<>), empty, isEmpty, text, nest ) import qualified Text.PrettyPrint.HughesPJ as PP import Text.Pandoc.CharacterReferences ( characterReference ) -import Data.Char ( toLower, toUpper, ord, chr, isLower, isUpper, isAlpha, +import Data.Char ( toLower, toUpper, ord, isLower, isUpper, isAlpha, isPunctuation ) import Data.List ( find, isPrefixOf, intercalate ) -import Network.URI ( parseURI, URI (..), isAllowedInURI ) +import Network.URI ( parseURI, URI (..), isAllowedInURI, escapeURIString ) +import Codec.Binary.UTF8.String ( encodeString ) import System.Directory import System.FilePath ( (</>) ) -- Note: ghc >= 6.12 (base >=4.2) supports unicode through iconv @@ -131,10 +132,6 @@ import System.IO.UTF8 import Data.Generics import qualified Control.Monad.State as S import Control.Monad (join) -import Data.ByteString (unpack) -import Data.Word (Word8) -import Data.ByteString.UTF8 (fromString) -import Text.Printf (printf) import Paths_pandoc (getDataFileName) -- @@ -234,15 +231,10 @@ toRomanNumeral x = _ | x >= 1 -> "I" ++ toRomanNumeral (x - 1) _ -> "" --- | Escape unicode characters in a URI. This means converting --- them to UTF-8, then URI-encoding the octets. 
We leave everything --- else the same, assuming that the user has already escaped --- special characters like & and %. +-- | Escape unicode characters in a URI. Characters that are +-- already valid in a URI, including % and ?, are left alone. stringToURI :: String -> String -stringToURI = concatMap encodeOctet . unpack . fromString - where encodeOctet :: Word8 -> String - encodeOctet x | x > 127 = printf "%%%2x" x - encodeOctet x = [chr (fromIntegral x)] +stringToURI = escapeURIString isAllowedInURI . encodeString -- | Wrap inlines to line length. wrapped :: Monad m => ([Inline] -> m Doc) -> [Inline] -> m Doc -- cgit v1.2.3