aboutsummaryrefslogtreecommitdiff
path: root/src/Text/Pandoc/Shared.hs
diff options
context:
space:
mode:
authorJohn MacFarlane <jgm@berkeley.edu>2010-03-22 19:29:37 -0700
committerJohn MacFarlane <jgm@berkeley.edu>2010-03-23 00:33:50 -0700
commit71eac37ac555b41325c334f662facc5308f43b64 (patch)
treeb560d3d95d78cea5c6606639c1915de9dbe88f80 /src/Text/Pandoc/Shared.hs
parent7689cacb5d0147e79b9080607bb35a0dd262b96a (diff)
downloadpandoc-71eac37ac555b41325c334f662facc5308f43b64.tar.gz
Fixed treatment of unicode characters in URIs.
* Added stringToURI to Shared. This is used in the HTML writer for all URIs. It properly URI-encodes high characters (> 127), leaving everything else (including symbols and spaces) the same. * Modified unsanitaryURI to allow UTF8 characters in a URI. (First, we convert the URI to URI-encoded octets, then we pass through parseURIReference.) This resolves gitit Issue #99. Previously '[abc](http://gitit.net/测试)' would not be rendered as a link when --sanitize was selected.
Diffstat (limited to 'src/Text/Pandoc/Shared.hs')
-rw-r--r--src/Text/Pandoc/Shared.hs18
1 files changed, 17 insertions, 1 deletions
diff --git a/src/Text/Pandoc/Shared.hs b/src/Text/Pandoc/Shared.hs
index f093ddbee..2c1d3ab71 100644
--- a/src/Text/Pandoc/Shared.hs
+++ b/src/Text/Pandoc/Shared.hs
@@ -43,6 +43,7 @@ module Text.Pandoc.Shared (
stripFirstAndLast,
camelCaseToHyphenated,
toRomanNumeral,
+ stringToURI,
wrapped,
wrapIfNeeded,
wrappedTeX,
@@ -114,7 +115,7 @@ import Text.ParserCombinators.Parsec
import Text.PrettyPrint.HughesPJ ( Doc, fsep, ($$), (<>), empty, isEmpty, text, nest )
import qualified Text.PrettyPrint.HughesPJ as PP
import Text.Pandoc.CharacterReferences ( characterReference )
-import Data.Char ( toLower, toUpper, ord, isLower, isUpper, isAlpha,
+import Data.Char ( toLower, toUpper, ord, chr, isLower, isUpper, isAlpha,
isPunctuation )
import Data.List ( find, isPrefixOf, intercalate )
import Network.URI ( parseURI, URI (..), isAllowedInURI )
@@ -130,7 +131,12 @@ import System.IO.UTF8
import Data.Generics
import qualified Control.Monad.State as S
import Control.Monad (join)
+import Data.ByteString (unpack)
+import Data.Word (Word8)
+import Data.ByteString.UTF8 (fromString)
+import Text.Printf (printf)
import Paths_pandoc (getDataFileName)
+
--
-- List processing
--
@@ -228,6 +234,16 @@ toRomanNumeral x =
_ | x >= 1 -> "I" ++ toRomanNumeral (x - 1)
_ -> ""
+-- | Escape unicode characters in a URI. This means converting
+-- them to UTF-8, then URI-encoding the octets. We leave everything
+-- else the same, assuming that the user has already escaped
+-- special characters like & and %.
+stringToURI :: String -> String
+stringToURI = concatMap encodeOctet . unpack . fromString
+ where encodeOctet :: Word8 -> String
+ encodeOctet x | x > 127 = printf "%%%2x" x
+ encodeOctet x = [chr (fromIntegral x)]
+
-- | Wrap inlines to line length.
wrapped :: Monad m => ([Inline] -> m Doc) -> [Inline] -> m Doc
wrapped listWriter sect = (mapM listWriter $ splitBy Space sect) >>=