diff options
-rw-r--r-- | src/Text/Pandoc/Readers/HTML.hs | 2 | ||||
-rw-r--r-- | src/Text/Pandoc/Shared.hs | 18 | ||||
-rw-r--r-- | src/Text/Pandoc/Writers/HTML.hs | 32 |
3 files changed, 38 insertions, 14 deletions
diff --git a/src/Text/Pandoc/Readers/HTML.hs b/src/Text/Pandoc/Readers/HTML.hs index 2e5473992..317c002e9 100644 --- a/src/Text/Pandoc/Readers/HTML.hs +++ b/src/Text/Pandoc/Readers/HTML.hs @@ -182,7 +182,7 @@ unsanitaryURI u = "ldaps:", "magnet:", "mms:", "msnim:", "notes:", "rsync:", "secondlife:", "skype:", "ssh:", "sftp:", "smb:", "sms:", "snews:", "webcal:", "ymsgr:"] - in case parseURIReference u of + in case parseURIReference (stringToURI u) of Just p -> (map toLower $ uriScheme p) `notElem` safeURISchemes Nothing -> True diff --git a/src/Text/Pandoc/Shared.hs b/src/Text/Pandoc/Shared.hs index f093ddbee..2c1d3ab71 100644 --- a/src/Text/Pandoc/Shared.hs +++ b/src/Text/Pandoc/Shared.hs @@ -43,6 +43,7 @@ module Text.Pandoc.Shared ( stripFirstAndLast, camelCaseToHyphenated, toRomanNumeral, + stringToURI, wrapped, wrapIfNeeded, wrappedTeX, @@ -114,7 +115,7 @@ import Text.ParserCombinators.Parsec import Text.PrettyPrint.HughesPJ ( Doc, fsep, ($$), (<>), empty, isEmpty, text, nest ) import qualified Text.PrettyPrint.HughesPJ as PP import Text.Pandoc.CharacterReferences ( characterReference ) -import Data.Char ( toLower, toUpper, ord, isLower, isUpper, isAlpha, +import Data.Char ( toLower, toUpper, ord, chr, isLower, isUpper, isAlpha, isPunctuation ) import Data.List ( find, isPrefixOf, intercalate ) import Network.URI ( parseURI, URI (..), isAllowedInURI ) @@ -130,7 +131,12 @@ import System.IO.UTF8 import Data.Generics import qualified Control.Monad.State as S import Control.Monad (join) +import Data.ByteString (unpack) +import Data.Word (Word8) +import Data.ByteString.UTF8 (fromString) +import Text.Printf (printf) import Paths_pandoc (getDataFileName) + -- -- List processing -- @@ -228,6 +234,16 @@ toRomanNumeral x = _ | x >= 1 -> "I" ++ toRomanNumeral (x - 1) _ -> "" +-- | Escape unicode characters in a URI. This means converting +-- them to UTF-8, then URI-encoding the octets. We leave everything +-- else the same, assuming that the user has already escaped +-- special characters like & and %. +stringToURI :: String -> String +stringToURI = concatMap encodeOctet . unpack . fromString + where encodeOctet :: Word8 -> String + encodeOctet x | x > 127 = printf "%%%2x" x + encodeOctet x = [chr (fromIntegral x)] + -- | Wrap inlines to line length. wrapped :: Monad m => ([Inline] -> m Doc) -> [Inline] -> m Doc wrapped listWriter sect = (mapM listWriter $ splitBy Space sect) >>= diff --git a/src/Text/Pandoc/Writers/HTML.hs b/src/Text/Pandoc/Writers/HTML.hs index d33dcff27..81f403f95 100644 --- a/src/Text/Pandoc/Writers/HTML.hs +++ b/src/Text/Pandoc/Writers/HTML.hs @@ -67,6 +67,14 @@ renderFragment opts = if writerWrapText opts stringToHtml :: String -> Html stringToHtml = primHtml . escapeStringForXML +-- Note: href and src, unmodified, incorrectly escape high +-- characters in URIs using entities. So we use these replacements: +href' :: String -> HtmlAttr +href' = href . stringToURI + +src' :: String -> HtmlAttr +src' = src . stringToURI + -- | Convert Pandoc document to Html string. writeHtmlString :: WriterOptions -> Pandoc -> String writeHtmlString opts d = @@ -112,13 +120,13 @@ pandocToHtml opts (Pandoc (Meta title' authors' date') blocks) = do then case writerHTMLMathMethod opts of LaTeXMathML (Just url) -> script ! - [src url, thetype "text/javascript"] $ noHtml + [src' url, thetype "text/javascript"] $ noHtml MathML (Just url) -> script ! - [src url, thetype "text/javascript"] $ noHtml + [src' url, thetype "text/javascript"] $ noHtml JsMath (Just url) -> script ! - [src url, thetype "text/javascript"] $ noHtml + [src' url, thetype "text/javascript"] $ noHtml _ -> case lookup "mathml-script" (writerVariables opts) of Just s -> script ! [thetype "text/javascript"] << @@ -188,7 +196,7 @@ elementToListItem opts (Sec _ num id' headerText subsecs) = do let subList = if null subHeads then noHtml else unordList subHeads - return $ Just $ (anchor ! [href ("#" ++ writerIdentifierPrefix opts ++ id')] $ txt) +++ subList + return $ Just $ (anchor ! [href' ("#" ++ writerIdentifierPrefix opts ++ id')] $ txt) +++ subList -- | Convert an Element to Html. elementToHtml :: WriterOptions -> Element -> State WriterState Html @@ -222,7 +230,7 @@ parseMailto _ = Nothing -- | Obfuscate a "mailto:" link. obfuscateLink :: WriterOptions -> String -> String -> Html obfuscateLink opts txt s | writerEmailObfuscation opts == NoObfuscation = - anchor ! [href s] << txt + anchor ! [href' s] << txt obfuscateLink opts txt s = let meth = writerEmailObfuscation opts s' = map toLower s @@ -249,7 +257,7 @@ obfuscateLink opts txt s = linkText ++ "+'<\\/'+'a'+'>');\n// -->\n")) +++ noscript (primHtml $ obfuscateString altText) _ -> error $ "Unknown obfuscation method: " ++ show meth - _ -> anchor ! [href s] $ stringToHtml txt -- malformed email + _ -> anchor ! [href' s] $ stringToHtml txt -- malformed email -- | Obfuscate character as entity. obfuscateChar :: Char -> String @@ -312,7 +320,7 @@ blockToHtml opts (Header level lst) = do stringToHtml " " +++ contents else contents let contents'' = if writerTableOfContents opts - then anchor ! [href $ "#" ++ writerIdentifierPrefix opts ++ "TOC"] $ contents' + then anchor ! [href' $ "#" ++ writerIdentifierPrefix opts ++ "TOC"] $ contents' else contents' return $ case level of 1 -> h1 contents'' @@ -452,7 +460,7 @@ inlineToHtml opts inline = then thespan ! [theclass "math"] $ primHtml str else thediv ! [theclass "math"] $ primHtml str MimeTeX url -> - return $ image ! [src (url ++ "?" ++ str), + return $ image ! [src' (url ++ "?" ++ str), alt str, title str] GladTeX -> return $ primHtml $ "<EQ>" ++ str ++ "</EQ>" @@ -484,13 +492,13 @@ inlineToHtml opts inline = return $ obfuscateLink opts (show linkText) s (Link txt (s,tit)) -> do linkText <- inlineListToHtml opts txt - return $ anchor ! ([href s] ++ + return $ anchor ! ([href' s] ++ if null tit then [] else [title tit]) $ linkText (Image txt (s,tit)) -> do alternate <- inlineListToHtml opts txt let alternate' = renderFragment opts alternate - let attributes = [src s] ++ + let attributes = [src' s] ++ (if null tit then [] else [title tit]) ++ @@ -508,7 +516,7 @@ inlineToHtml opts inline = -- push contents onto front of notes put $ st {stNotes = (htmlContents:notes)} return $ sup << - anchor ! [href ("#" ++ writerIdentifierPrefix opts ++ "fn" ++ ref), + anchor ! [href' ("#" ++ writerIdentifierPrefix opts ++ "fn" ++ ref), theclass "footnoteRef", prefixedId opts ("fnref" ++ ref)] << ref (Cite _ il) -> inlineListToHtml opts il @@ -517,7 +525,7 @@ blockListToNote :: WriterOptions -> String -> [Block] -> State WriterState Html blockListToNote opts ref blocks = -- If last block is Para or Plain, include the backlink at the end of -- that block. Otherwise, insert a new Plain block with the backlink. - let backlink = [HtmlInline $ " <a href=\"#" ++ writerIdentifierPrefix opts ++ "fnref" ++ ref ++ + let backlink = [HtmlInline $ " <a href=\"#" ++ stringToURI (writerIdentifierPrefix opts ++ "fnref" ++ ref) ++ "\" class=\"footnoteBackLink\"" ++ " title=\"Jump back to footnote " ++ ref ++ "\">↩</a>"] blocks' = if null blocks |