diff options
Diffstat (limited to 'src/Text')
-rw-r--r-- | src/Text/Pandoc/Readers/Docx.hs | 3 | ||||
-rw-r--r-- | src/Text/Pandoc/SelfContained.hs | 93 | ||||
-rw-r--r-- | src/Text/Pandoc/Shared.hs | 43 | ||||
-rw-r--r-- | src/Text/Pandoc/Writers/EPUB.hs | 3 | ||||
-rw-r--r-- | src/Text/Pandoc/Writers/FB2.hs | 11 | ||||
-rw-r--r-- | src/Text/Pandoc/Writers/Markdown.hs | 10 |
6 files changed, 73 insertions, 90 deletions
diff --git a/src/Text/Pandoc/Readers/Docx.hs b/src/Text/Pandoc/Readers/Docx.hs index 7a89c0b04..86ce62ced 100644 --- a/src/Text/Pandoc/Readers/Docx.hs +++ b/src/Text/Pandoc/Readers/Docx.hs @@ -219,7 +219,8 @@ runStyleToContainers rPr = , if isStrike rPr then (Just Strikeout) else Nothing , if isSuperScript rPr then (Just Superscript) else Nothing , if isSubScript rPr then (Just Subscript) else Nothing - , rUnderline rPr >>= (\f -> Just $ Span ("", [], [("underline", f)])) + , rUnderline rPr >>= + (\f -> if f == "single" then (Just Emph) else Nothing) ] in classContainers ++ formatters diff --git a/src/Text/Pandoc/SelfContained.hs b/src/Text/Pandoc/SelfContained.hs index adb2c0014..1a4e037cf 100644 --- a/src/Text/Pandoc/SelfContained.hs +++ b/src/Text/Pandoc/SelfContained.hs @@ -32,56 +32,54 @@ the HTML using data URIs. -} module Text.Pandoc.SelfContained ( makeSelfContained ) where import Text.HTML.TagSoup -import Network.URI (isURI, escapeURIString) +import Network.URI (isURI, escapeURIString, URI(..), parseURI) import Data.ByteString.Base64 import qualified Data.ByteString.Char8 as B -import qualified Data.ByteString as BS import Data.ByteString (ByteString) -import System.FilePath (takeExtension, dropExtension, takeDirectory, (</>)) +import System.FilePath (takeExtension, takeDirectory, (</>)) import Data.Char (toLower, isAscii, isAlphaNum) import Codec.Compression.GZip as Gzip import qualified Data.ByteString.Lazy as L -import Text.Pandoc.Shared (renderTags', openURL, readDataFile, err) -import Text.Pandoc.MediaBag (MediaBag, lookupMedia) +import Text.Pandoc.Shared (renderTags', err, fetchItem') +import Text.Pandoc.MediaBag (MediaBag) import Text.Pandoc.UTF8 (toString, fromString) -import Text.Pandoc.MIME (getMimeType) -import System.Directory (doesFileExist) +import Text.Pandoc.Options (WriterOptions(..)) isOk :: Char -> Bool isOk c = isAscii c && isAlphaNum c -convertTag :: MediaBag -> Maybe FilePath -> Tag String -> IO (Tag String) -convertTag media userdata t@(TagOpen tagname as) +convertTag :: MediaBag -> Maybe String -> Tag String -> IO (Tag String) +convertTag media sourceURL t@(TagOpen tagname as) | tagname `elem` ["img", "embed", "video", "input", "audio", "source"] = do as' <- mapM processAttribute as return $ TagOpen tagname as' where processAttribute (x,y) = if x == "src" || x == "href" || x == "poster" then do - (raw, mime) <- getRaw media userdata (fromAttrib "type" t) y + (raw, mime) <- getRaw media sourceURL (fromAttrib "type" t) y let enc = "data:" ++ mime ++ ";base64," ++ toString (encode raw) return (x, enc) else return (x,y) -convertTag media userdata t@(TagOpen "script" as) = +convertTag media sourceURL t@(TagOpen "script" as) = case fromAttrib "src" t of [] -> return t src -> do - (raw, mime) <- getRaw media userdata (fromAttrib "type" t) src + (raw, mime) <- getRaw media sourceURL (fromAttrib "type" t) src let enc = "data:" ++ mime ++ "," ++ escapeURIString isOk (toString raw) return $ TagOpen "script" (("src",enc) : [(x,y) | (x,y) <- as, x /= "src"]) -convertTag media userdata t@(TagOpen "link" as) = +convertTag media sourceURL t@(TagOpen "link" as) = case fromAttrib "href" t of [] -> return t src -> do - (raw, mime) <- getRaw media userdata (fromAttrib "type" t) src + (raw, mime) <- getRaw media sourceURL (fromAttrib "type" t) src let enc = "data:" ++ mime ++ "," ++ escapeURIString isOk (toString raw) return $ TagOpen "link" (("href",enc) : [(x,y) | (x,y) <- as, x /= "href"]) convertTag _ _ t = return t -- NOTE: This is really crude, it doesn't respect CSS comments. -cssURLs :: MediaBag -> Maybe FilePath -> FilePath -> ByteString +cssURLs :: MediaBag -> Maybe String -> FilePath -> ByteString -> IO ByteString -cssURLs media userdata d orig = +cssURLs media sourceURL d orig = case B.breakSubstring "url(" orig of (x,y) | B.null y -> return orig | otherwise -> do @@ -94,43 +92,21 @@ cssURLs media userdata d orig = let url' = if isURI url then url else d </> url - (raw, mime) <- getRaw media userdata "" url' - rest <- cssURLs media userdata d v + (raw, mime) <- getRaw media sourceURL "" url' + rest <- cssURLs media sourceURL d v let enc = "data:" `B.append` fromString mime `B.append` ";base64," `B.append` (encode raw) return $ x `B.append` "url(" `B.append` enc `B.append` rest -getItem :: MediaBag -> Maybe FilePath -> String - -> IO (ByteString, Maybe String) -getItem media userdata f = - if isURI f - then openURL f >>= either handleErr return - else do - -- strip off trailing query or fragment part, if relative URL. - -- this is needed for things like cmunrm.eot?#iefix, - -- which is used to get old versions of IE to work with web fonts. - let f' = takeWhile (\c -> c /= '?' && c /= '#') f - let mbMime = case takeExtension f' of - ".gz" -> getMimeType $ dropExtension f' - x -> getMimeType x - exists <- doesFileExist f' - if exists - then do - cont <- B.readFile f' - return (cont, mbMime) - else case lookupMedia f media of - Just (mime,bs) -> return (BS.concat $ L.toChunks bs, - Just mime) - Nothing -> do - cont <- readDataFile userdata f' - return (cont, mbMime) - where handleErr e = err 61 $ "Failed to retrieve " ++ f ++ "\n" ++ show e - -getRaw :: MediaBag -> Maybe FilePath -> String -> String +getRaw :: MediaBag -> Maybe String -> String -> String -> IO (ByteString, String) -getRaw media userdata mimetype src = do +getRaw media sourceURL mimetype src = do let ext = map toLower $ takeExtension src - (raw, respMime) <- getItem media userdata src + fetchResult <- fetchItem' media sourceURL src + (raw, respMime) <- case fetchResult of + Left msg -> err 67 $ "Could not fetch " ++ src ++ + "\n" ++ show msg + Right x -> return x let raw' = if ext == ".gz" then B.concat $ L.toChunks $ Gzip.decompress $ L.fromChunks $ [raw] @@ -140,21 +116,22 @@ getRaw media userdata mimetype src = do $ "Could not determine mime type for `" ++ src ++ "'" (x, Nothing) -> x (_, Just x ) -> x + let cssSourceURL = case parseURI src of + Just u + | uriScheme u `elem` ["http:","https:"] -> + Just $ show u{ uriPath = "", + uriQuery = "", + uriFragment = "" } + _ -> Nothing result <- if mime == "text/css" - then cssURLs media userdata (takeDirectory src) raw' + then cssURLs media cssSourceURL (takeDirectory src) raw' else return raw' return (result, mime) -- | Convert HTML into self-contained HTML, incorporating images, --- scripts, and CSS using data: URIs. Items specified using absolute --- URLs will be downloaded; those specified using relative URLs will --- be sought first relative to the working directory, then in the --- media bag, then relative --- to the user data directory (if the first parameter is 'Just' --- a directory), and finally relative to pandoc's default data --- directory. -makeSelfContained :: MediaBag -> Maybe FilePath -> String -> IO String -makeSelfContained media userdata inp = do +-- scripts, and CSS using data: URIs. +makeSelfContained :: WriterOptions -> String -> IO String +makeSelfContained opts inp = do let tags = parseTags inp - out' <- mapM (convertTag media userdata) tags + out' <- mapM (convertTag (writerMediaBag opts) (writerSourceURL opts)) tags return $ renderTags' out' diff --git a/src/Text/Pandoc/Shared.hs b/src/Text/Pandoc/Shared.hs index d5769c1ab..f0e5bbe5d 100644 --- a/src/Text/Pandoc/Shared.hs +++ b/src/Text/Pandoc/Shared.hs @@ -59,6 +59,7 @@ module Text.Pandoc.Shared ( normalizeBlocks, removeFormatting, stringify, + capitalize, compactify, compactify', compactify'DL, @@ -101,7 +102,7 @@ import Data.Char ( toLower, isLower, isUpper, isAlpha, import Data.List ( find, isPrefixOf, intercalate ) import qualified Data.Map as M import Network.URI ( escapeURIString, isURI, nonStrictRelativeTo, - unEscapeString, parseURIReference ) + unEscapeString, parseURIReference, isAllowedInURI ) import qualified Data.Set as Set import System.Directory import Text.Pandoc.MIME (getMimeType) @@ -122,6 +123,7 @@ import qualified Data.ByteString.Char8 as B8 import Text.Pandoc.Compat.Monoid import Data.ByteString.Base64 (decodeLenient) import Data.Sequence (ViewR(..), ViewL(..), viewl, viewr) +import qualified Data.Text as T (toUpper, pack, unpack) #ifdef EMBED_DATA_FILES import Text.Pandoc.Data (dataFiles) @@ -527,6 +529,17 @@ stringify = query go . walk deNote deNote (Note _) = Str "" deNote x = x +-- | Bring all regular text in a pandoc structure to uppercase. +-- +-- This function correctly handles cases where a lowercase character doesn't +-- match to a single uppercase character – e.g. “Straße” would be converted +-- to “STRASSE”, not “STRAßE”. +capitalize :: Walkable Inline a => a -> a +capitalize = walk go + where go :: Inline -> Inline + go (Str s) = Str (T.unpack $ T.toUpper $ T.pack s) + go x = x + -- | Change final list item from @Para@ to @Plain@ if the list contains -- no other @Para@ blocks. compactify :: [[Block]] -- ^ List of list items (each a list of blocks) @@ -766,21 +779,23 @@ readDataFileUTF8 userDir fname = -- Returns raw content and maybe mime type. fetchItem :: Maybe String -> String -> IO (Either E.SomeException (BS.ByteString, Maybe String)) -fetchItem sourceURL s - | isURI s = openURL s - | otherwise = - case sourceURL >>= parseURIReference of - Just u -> case parseURIReference s of - Just s' -> openURL $ show $ - s' `nonStrictRelativeTo` u - Nothing -> openURL $ show u ++ "/" ++ s - Nothing -> E.try readLocalFile +fetchItem sourceURL s = + case (sourceURL >>= parseURIReference . ensureEscaped, ensureEscaped s) of + (_, s') | isURI s' -> openURL s' + (Just u, s') -> -- try fetching from relative path at source + case parseURIReference s' of + Just u' -> openURL $ show $ u' `nonStrictRelativeTo` u + Nothing -> openURL s' -- will throw error + (Nothing, _) -> E.try readLocalFile -- get from local file system where readLocalFile = do - let mime = case takeExtension s of - ".gz" -> getMimeType $ dropExtension s - x -> getMimeType x - cont <- BS.readFile $ unEscapeString s + cont <- BS.readFile fp return (cont, mime) + dropFragmentAndQuery = takeWhile (\c -> c /= '?' && c /= '#') + fp = unEscapeString $ dropFragmentAndQuery s + mime = case takeExtension fp of + ".gz" -> getMimeType $ dropExtension fp + x -> getMimeType x + ensureEscaped = escapeURIString isAllowedInURI -- | Like 'fetchItem', but also looks for items in a 'MediaBag'. fetchItem' :: MediaBag -> Maybe String -> String diff --git a/src/Text/Pandoc/Writers/EPUB.hs b/src/Text/Pandoc/Writers/EPUB.hs index 770b6f244..34a6dcb2f 100644 --- a/src/Text/Pandoc/Writers/EPUB.hs +++ b/src/Text/Pandoc/Writers/EPUB.hs @@ -61,7 +61,6 @@ import Text.Pandoc.MIME (getMimeType) import qualified Control.Exception as E import Text.Blaze.Html.Renderer.Utf8 (renderHtml) import Text.HTML.TagSoup -import Data.Monoid -- A Chapter includes a list of blocks and maybe a section -- number offset. Note, some chapters are unnumbered. The section @@ -794,7 +793,7 @@ transformInline opts mediaRef (Image lab (src,tit)) = do return $ Image lab (newsrc, tit) transformInline opts _ (x@(Math _ _)) | WebTeX _ <- writerHTMLMathMethod opts = do - raw <- makeSelfContained mempty Nothing $ writeHtmlInline opts x + raw <- makeSelfContained opts $ writeHtmlInline opts x return $ RawInline (Format "html") raw transformInline opts mediaRef (RawInline fmt raw) | fmt == Format "html" = do diff --git a/src/Text/Pandoc/Writers/FB2.hs b/src/Text/Pandoc/Writers/FB2.hs index 803617f95..7a9bff4fe 100644 --- a/src/Text/Pandoc/Writers/FB2.hs +++ b/src/Text/Pandoc/Writers/FB2.hs @@ -28,7 +28,7 @@ module Text.Pandoc.Writers.FB2 (writeFB2) where import Control.Monad.State (StateT, evalStateT, get, modify) import Control.Monad.State (liftM, liftM2, liftIO) import Data.ByteString.Base64 (encode) -import Data.Char (toUpper, toLower, isSpace, isAscii, isControl) +import Data.Char (toLower, isSpace, isAscii, isControl) import Data.List (intersperse, intercalate, isPrefixOf) import Data.Either (lefts, rights) import Network.Browser (browse, request, setAllowRedirects, setOutHandler) @@ -44,8 +44,7 @@ import qualified Text.XML.Light.Cursor as XC import Text.Pandoc.Definition import Text.Pandoc.Options (WriterOptions(..), HTMLMathMethod(..), def) -import Text.Pandoc.Shared (orderedListMarkers, isHeaderBlock) -import Text.Pandoc.Walk +import Text.Pandoc.Shared (orderedListMarkers, isHeaderBlock, capitalize) -- | Data to be written at the end of the document: -- (foot)notes, URLs, references, images. @@ -421,10 +420,6 @@ indent = indentBlock indentLines ins = let lns = split isLineBreak ins :: [[Inline]] in intercalate [LineBreak] $ map ((Str spacer):) lns -capitalize :: Inline -> Inline -capitalize (Str xs) = Str $ map toUpper xs -capitalize x = x - -- | Convert a Pandoc's Inline element to FictionBook XML representation. toXml :: Inline -> FBM [Content] toXml (Str s) = return [txt s] @@ -434,7 +429,7 @@ toXml (Strong ss) = list `liftM` wrap "strong" ss toXml (Strikeout ss) = list `liftM` wrap "strikethrough" ss toXml (Superscript ss) = list `liftM` wrap "sup" ss toXml (Subscript ss) = list `liftM` wrap "sub" ss -toXml (SmallCaps ss) = cMapM toXml $ walk capitalize ss +toXml (SmallCaps ss) = cMapM toXml $ capitalize ss toXml (Quoted SingleQuote ss) = do -- FIXME: should be language-specific inner <- cMapM toXml ss return $ [txt "‘"] ++ inner ++ [txt "’"] diff --git a/src/Text/Pandoc/Writers/Markdown.hs b/src/Text/Pandoc/Writers/Markdown.hs index 897e425c6..a859267cc 100644 --- a/src/Text/Pandoc/Writers/Markdown.hs +++ b/src/Text/Pandoc/Writers/Markdown.hs @@ -39,7 +39,7 @@ import Text.Pandoc.Writers.Shared import Text.Pandoc.Options import Text.Pandoc.Parsing hiding (blankline, blanklines, char, space) import Data.List ( group, isPrefixOf, find, intersperse, transpose, sortBy ) -import Data.Char ( isSpace, isPunctuation, toUpper ) +import Data.Char ( isSpace, isPunctuation ) import Data.Ord ( comparing ) import Text.Pandoc.Pretty import Control.Monad.State @@ -672,10 +672,6 @@ escapeSpaces (Str s) = Str $ substitute " " "\\ " s escapeSpaces Space = Str "\\ " escapeSpaces x = x -toCaps :: Inline -> Inline -toCaps (Str s) = Str (map toUpper s) -toCaps x = x - -- | Convert Pandoc inline element to markdown. inlineToMarkdown :: WriterOptions -> Inline -> State WriterState Doc inlineToMarkdown opts (Span attrs ils) = do @@ -693,7 +689,7 @@ inlineToMarkdown opts (Emph lst) = do inlineToMarkdown opts (Strong lst) = do plain <- gets stPlain if plain - then inlineListToMarkdown opts $ walk toCaps lst + then inlineListToMarkdown opts $ capitalize lst else do contents <- inlineListToMarkdown opts lst return $ "**" <> contents <> "**" @@ -716,7 +712,7 @@ inlineToMarkdown opts (Subscript lst) = do inlineToMarkdown opts (SmallCaps lst) = do plain <- gets stPlain if plain - then inlineListToMarkdown opts $ walk toCaps lst + then inlineListToMarkdown opts $ capitalize lst else do contents <- inlineListToMarkdown opts lst return $ tagWithAttrs "span" |