diff options
author | John MacFarlane <jgm@berkeley.edu> | 2010-03-22 19:29:37 -0700 |
---|---|---|
committer | John MacFarlane <jgm@berkeley.edu> | 2010-03-23 00:33:50 -0700 |
commit | 71eac37ac555b41325c334f662facc5308f43b64 (patch) | |
tree | b560d3d95d78cea5c6606639c1915de9dbe88f80 /src/Text/Pandoc/Shared.hs | |
parent | 7689cacb5d0147e79b9080607bb35a0dd262b96a (diff) | |
download | pandoc-71eac37ac555b41325c334f662facc5308f43b64.tar.gz |
Fixed treatment of unicode characters in URIs.
* Added stringToURI to Shared. This is used in the HTML
writer for all URIs. It properly URI-encodes high
characters (> 127), leaving everything else (including
symbols and spaces) the same.
* Modified unsanitaryURI to allow UTF8 characters in a URI.
(First, we convert the URI to URI-encoded octets, then we
pass through parseURIReference.)
This resolves gitit Issue #99. Previously
'[abc](http://gitit.net/测试)' would not be rendered as
a link when --sanitize was selected.
Diffstat (limited to 'src/Text/Pandoc/Shared.hs')
-rw-r--r-- | src/Text/Pandoc/Shared.hs | 18 |
1 files changed, 17 insertions, 1 deletions
diff --git a/src/Text/Pandoc/Shared.hs b/src/Text/Pandoc/Shared.hs index f093ddbee..2c1d3ab71 100644 --- a/src/Text/Pandoc/Shared.hs +++ b/src/Text/Pandoc/Shared.hs @@ -43,6 +43,7 @@ module Text.Pandoc.Shared ( stripFirstAndLast, camelCaseToHyphenated, toRomanNumeral, + stringToURI, wrapped, wrapIfNeeded, wrappedTeX, @@ -114,7 +115,7 @@ import Text.ParserCombinators.Parsec import Text.PrettyPrint.HughesPJ ( Doc, fsep, ($$), (<>), empty, isEmpty, text, nest ) import qualified Text.PrettyPrint.HughesPJ as PP import Text.Pandoc.CharacterReferences ( characterReference ) -import Data.Char ( toLower, toUpper, ord, isLower, isUpper, isAlpha, +import Data.Char ( toLower, toUpper, ord, chr, isLower, isUpper, isAlpha, isPunctuation ) import Data.List ( find, isPrefixOf, intercalate ) import Network.URI ( parseURI, URI (..), isAllowedInURI ) @@ -130,7 +131,12 @@ import System.IO.UTF8 import Data.Generics import qualified Control.Monad.State as S import Control.Monad (join) +import Data.ByteString (unpack) +import Data.Word (Word8) +import Data.ByteString.UTF8 (fromString) +import Text.Printf (printf) import Paths_pandoc (getDataFileName) + -- -- List processing -- @@ -228,6 +234,16 @@ toRomanNumeral x = _ | x >= 1 -> "I" ++ toRomanNumeral (x - 1) _ -> "" +-- | Escape unicode characters in a URI. This means converting +-- them to UTF-8, then URI-encoding the octets. We leave everything +-- else the same, assuming that the user has already escaped +-- special characters like & and %. +stringToURI :: String -> String +stringToURI = concatMap encodeOctet . unpack . fromString + where encodeOctet :: Word8 -> String + encodeOctet x | x > 127 = printf "%%%2x" x + encodeOctet x = [chr (fromIntegral x)] + -- | Wrap inlines to line length. wrapped :: Monad m => ([Inline] -> m Doc) -> [Inline] -> m Doc wrapped listWriter sect = (mapM listWriter $ splitBy Space sect) >>= |