diff options
-rw-r--r-- | README | 5 | ||||
-rw-r--r-- | pandoc.cabal | 5 | ||||
-rw-r--r-- | pandoc.hs | 159 | ||||
-rw-r--r-- | src/Text/Pandoc.hs | 3 | ||||
-rw-r--r-- | src/Text/Pandoc/MediaBag.hs | 107 | ||||
-rw-r--r-- | src/Text/Pandoc/Options.hs | 6 | ||||
-rw-r--r-- | src/Text/Pandoc/Readers/Docx.hs | 9 | ||||
-rw-r--r-- | src/Text/Pandoc/SelfContained.hs | 26 | ||||
-rw-r--r-- | src/Text/Pandoc/Shared.hs | 22 | ||||
-rw-r--r-- | src/Text/Pandoc/Writers/EPUB.hs | 3 | ||||
-rw-r--r-- | tests/Tests/Readers/Docx.hs | 58 | ||||
-rw-r--r-- | tests/docx.image.docx | bin | 109656 -> 36942 bytes | |||
-rw-r--r-- | tests/docx.image1.jpeg | bin | 46626 -> 0 bytes | |||
-rw-r--r-- | tests/docx.image_no_embed.native | 4 | ||||
-rw-r--r-- | tests/docx.inline_formatting.native | 2 |
15 files changed, 273 insertions, 136 deletions
@@ -1774,7 +1774,10 @@ legal (though ugly) pipe table: orange|3.09 The cells of pipe tables cannot contain block elements like paragraphs -and lists, and cannot span multiple lines. +and lists, and cannot span multiple lines. Note also that in LaTeX/PDF +output, the cells produced by pipe tables will not wrap, since there +is no information available about relative widths. If you want content +to wrap within cells, use multiline or grid tables. [the same as in PHP markdown extra]: http://michelf.ca/projects/php-markdown/extra/#table diff --git a/pandoc.cabal b/pandoc.cabal index 377a3b6c4..1c74933ab 100644 --- a/pandoc.cabal +++ b/pandoc.cabal @@ -273,6 +273,7 @@ Library Text.Pandoc.Options, Text.Pandoc.Pretty, Text.Pandoc.Shared, + Text.Pandoc.MediaBag, Text.Pandoc.Readers.HTML, Text.Pandoc.Readers.LaTeX, Text.Pandoc.Readers.Markdown, @@ -385,7 +386,6 @@ Test-Suite test-pandoc syb >= 0.1 && < 0.5, pandoc, pandoc-types >= 1.12.3.3 && < 1.13, - base64-bytestring >= 0.1 && < 1.1, bytestring >= 0.9 && < 0.11, text >= 0.11 && < 1.2, directory >= 1 && < 1.3, @@ -399,7 +399,8 @@ Test-Suite test-pandoc QuickCheck >= 2.4 && < 2.8, HUnit >= 1.2 && < 1.3, containers >= 0.1 && < 0.6, - ansi-terminal >= 0.5 && < 0.7 + ansi-terminal >= 0.5 && < 0.7, + zip-archive >= 0.2.3.2 && < 0.3 Other-Modules: Tests.Old Tests.Helpers Tests.Arbitrary @@ -1,4 +1,4 @@ -{-# LANGUAGE CPP #-} +{-# LANGUAGE CPP, TupleSections #-} {- Copyright (C) 2006-2014 John MacFarlane <jgm@berkeley.edu> @@ -38,26 +38,26 @@ import Text.Pandoc.Readers.LaTeX (handleIncludes) import Text.Pandoc.Shared ( tabFilter, readDataFileUTF8, readDataFile, safeRead, headerShift, normalize, err, warn, openURL ) +import Text.Pandoc.MediaBag ( mediaDirectory, extractMediaBag, MediaBag ) import Text.Pandoc.XML ( toEntities ) import Text.Pandoc.SelfContained ( makeSelfContained ) import Text.Pandoc.Process (pipeProcess) import Text.Highlighting.Kate ( languages, Style, tango, pygments, espresso, zenburn, kate, haddock, monochrome ) -import System.Environment ( getArgs, getProgName ) +import System.Environment ( getArgs, getProgName, getEnvironment ) import System.Exit ( exitWith, ExitCode (..) ) import System.FilePath import System.Console.GetOpt import Data.Char ( toLower ) import Data.List ( intercalate, isPrefixOf, isSuffixOf, sort ) import System.Directory ( getAppUserDataDirectory, findExecutable, - doesFileExist, Permissions(..), getPermissions, - createDirectoryIfMissing ) + doesFileExist, Permissions(..), getPermissions ) import System.IO ( stdout, stderr ) import System.IO.Error ( isDoesNotExistError ) import qualified Control.Exception as E import Control.Exception.Extensible ( throwIO ) import qualified Text.Pandoc.UTF8 as UTF8 -import Control.Monad (when, unless, liftM) +import Control.Monad (when, unless, liftM, (>=>)) import Data.Foldable (foldrM) import Network.URI (parseURI, isURI, URI(..)) import qualified Data.ByteString.Lazy as B @@ -69,6 +69,9 @@ import qualified Data.Yaml as Yaml import qualified Data.Text as T import Control.Applicative ((<$>)) import Text.Pandoc.Readers.Txt2Tags (getT2TMeta) +import Data.Monoid + +type Transform = Pandoc -> Pandoc copyrightMessage :: String copyrightMessage = "\nCopyright (C) 2006-2014 John MacFarlane\n" ++ @@ -100,7 +103,10 @@ isTextFormat s = takeWhile (`notElem` "+-") s `notElem` ["odt","docx","epub","ep externalFilter :: FilePath -> [String] -> Pandoc -> IO Pandoc externalFilter f args' d = do - mbexe <- if '/' `elem` f -- don't check PATH if filter name it has a path + mbPath <- lookup "PATH" <$> getEnvironment + mbexe <- if '/' `elem` f || mbPath == Nothing + -- don't check PATH if filter name has a path, or + -- if the PATH is not set then return Nothing else findExecutable f (f', args'') <- case mbexe of @@ -140,7 +146,7 @@ data Opt = Opt , optWriter :: String -- ^ Writer format , optParseRaw :: Bool -- ^ Parse unconvertable HTML and TeX , optTableOfContents :: Bool -- ^ Include table of contents - , optTransforms :: [Pandoc -> Pandoc] -- ^ Doc transforms to apply + , optTransforms :: [Transform] -- ^ Doc transforms to apply , optTemplate :: Maybe FilePath -- ^ Custom template , optVariables :: [(String,String)] -- ^ Template variables to set , optMetadata :: M.Map String MetaValue -- ^ Metadata fields to set @@ -933,6 +939,31 @@ defaultWriterName x = ['.',y] | y `elem` ['1'..'9'] -> "man" _ -> "html" +-- Transformations of a Pandoc document post-parsing: + +extractMedia :: MediaBag -> FilePath -> Pandoc -> IO Pandoc +extractMedia media dir d = + case [fp | (fp, _, _) <- mediaDirectory media] of + [] -> return d + fps -> do + extractMediaBag True dir media + return $ walk (adjustImagePath dir fps) d + +adjustImagePath :: FilePath -> [FilePath] -> Inline -> Inline +adjustImagePath dir paths (Image lab (src, tit)) + | src `elem` paths = Image lab (dir ++ "/" ++ src, tit) +adjustImagePath _ _ x = x + +adjustMetadata :: M.Map String MetaValue -> Pandoc -> IO Pandoc +adjustMetadata metadata d = return $ M.foldWithKey setMeta d metadata + +applyTransforms :: [Transform] -> Pandoc -> IO Pandoc +applyTransforms transforms d = return $ foldr ($) d transforms + +applyFilters :: [FilePath] -> [String] -> Pandoc -> IO Pandoc +applyFilters filters args d = + foldrM ($) d $ map (flip externalFilter args) filters + main :: IO () main = do @@ -1025,7 +1056,6 @@ main = do all (\f -> takeBaseName f /= "pandoc-citeproc") filters -> "pandoc-citeproc" : filters _ -> filters - let plugins = map externalFilter filters' let sources = if ignoreArgs then [] else args @@ -1142,6 +1172,40 @@ main = do , readerTrackChanges = trackChanges } + when (not (isTextFormat writerName') && outputFile == "-") $ + err 5 $ "Cannot write " ++ writerName' ++ " output to stdout.\n" ++ + "Specify an output file using the -o option." + + let readSources [] = mapM readSource ["-"] + readSources srcs = mapM readSource srcs + readSource "-" = UTF8.getContents + readSource src = case parseURI src of + Just u | uriScheme u `elem` ["http:","https:"] -> + readURI src + _ -> UTF8.readFile src + readURI src = do + res <- openURL src + case res of + Left e -> throwIO e + Right (bs,_) -> return $ UTF8.toString bs + + let readFiles [] = error "Cannot read archive from stdin" + readFiles (x:_) = B.readFile x + + let convertTabs = tabFilter (if (preserveTabs || readerName' == "t2t") then 0 else tabStop) + + let handleIncludes' = if readerName' == "latex" || readerName' == "latex+lhs" + then handleIncludes + else return + + (doc, media) <- + case reader of + StringReader r-> (, mempty) <$> + ( readSources >=> + handleIncludes' . convertTabs . intercalate "\n" >=> + r readerOpts ) sources + ByteStringReader r -> readFiles sources >>= r readerOpts + let writerOptions = def { writerStandalone = standalone', writerTemplate = templ, writerVariables = variables'', @@ -1177,70 +1241,15 @@ main = do writerEpubChapterLevel = epubChapterLevel, writerTOCDepth = epubTOCDepth, writerReferenceODT = referenceODT, - writerReferenceDocx = referenceDocx + writerReferenceDocx = referenceDocx, + writerMediaBag = media } - when (not (isTextFormat writerName') && outputFile == "-") $ - err 5 $ "Cannot write " ++ writerName' ++ " output to stdout.\n" ++ - "Specify an output file using the -o option." - - let readSources [] = mapM readSource ["-"] - readSources srcs = mapM readSource srcs - readSource "-" = UTF8.getContents - readSource src = case parseURI src of - Just u | uriScheme u `elem` ["http:","https:"] -> - readURI src - _ -> UTF8.readFile src - readURI src = do - res <- openURL src - case res of - Left e -> throwIO e - Right (bs,_) -> return $ UTF8.toString bs - - let readFiles [] = error "Cannot read archive from stdin" - readFiles (x:_) = B.readFile x - - let convertTabs = tabFilter (if (preserveTabs || readerName' == "t2t") then 0 else tabStop) - - let handleIncludes' = if readerName' == "latex" || readerName' == "latex+lhs" - then handleIncludes - else return - - let writeMedia :: FilePath -> (FilePath, B.ByteString) -> IO () - writeMedia dir (subpath, bs) = do - -- we join and split to convert a/b/c to a\b\c on Windows; - -- in zip containers all paths use / - let fullpath = dir </> joinPath (splitPath subpath) - createDirectoryIfMissing True $ takeDirectory fullpath - warn $ "extracting " ++ fullpath - B.writeFile fullpath bs - - let adjustImagePath :: FilePath -> [FilePath] -> Inline -> Inline - adjustImagePath dir paths (Image lab (src, tit)) - | src `elem` paths = Image lab (dir ++ "/" ++ src, tit) - adjustImagePath _ _ x = x - (doc, media) <- - case reader of - StringReader r-> do - inp <- readSources sources >>= - handleIncludes' . convertTabs . intercalate "\n" - d <- r readerOpts inp - return (d, M.empty) - ByteStringReader r -> do - (d, media) <- readFiles sources >>= r readerOpts - d' <- case mbExtractMedia of - Just dir | not (M.null media) -> do - mapM_ (writeMedia dir) $ M.toList media - return $ walk (adjustImagePath dir (M.keys media)) d - _ -> return d - return (d', media) - - let writerOptions' = writerOptions{ writerMediaBag = media } - - let doc0 = M.foldWithKey setMeta doc metadata - let doc1 = foldr ($) doc0 transforms - doc2 <- foldrM ($) doc1 $ map ($ [writerName']) plugins + doc' <- (maybe return (extractMedia media) mbExtractMedia >=> + adjustMetadata metadata >=> + applyTransforms transforms >=> + applyFilters filters' [writerName']) doc let writeBinary :: B.ByteString -> IO () writeBinary = B.writeFile (UTF8.encodePath outputFile) @@ -1250,8 +1259,8 @@ main = do writerFn f = UTF8.writeFile f case writer of - IOStringWriter f -> f writerOptions' doc2 >>= writerFn outputFile - IOByteStringWriter f -> f writerOptions' doc2 >>= writeBinary + IOStringWriter f -> f writerOptions doc' >>= writerFn outputFile + IOByteStringWriter f -> f writerOptions doc' >>= writeBinary PureStringWriter f | pdfOutput -> do -- make sure writer is latex or beamer @@ -1265,14 +1274,14 @@ main = do err 41 $ latexEngine ++ " not found. " ++ latexEngine ++ " is needed for pdf output." - res <- makePDF latexEngine f writerOptions' doc2 + res <- makePDF latexEngine f writerOptions doc' case res of Right pdf -> writeBinary pdf Left err' -> do B.hPutStr stderr $ err' B.hPut stderr $ B.pack [10] err 43 "Error producing PDF from TeX source" - | otherwise -> selfcontain (f writerOptions' doc2 ++ + | otherwise -> selfcontain (f writerOptions doc' ++ ['\n' | not standalone']) >>= writerFn outputFile . handleEntities where htmlFormat = writerName' `elem` @@ -1280,8 +1289,8 @@ main = do "s5","slidy","slideous","dzslides","revealjs"] selfcontain = if selfContained && htmlFormat then makeSelfContained - (writerMediaBag writerOptions') - (writerUserDataDir writerOptions') + (writerMediaBag writerOptions) + (writerUserDataDir writerOptions) else return handleEntities = if htmlFormat && ascii then toEntities diff --git a/src/Text/Pandoc.hs b/src/Text/Pandoc.hs index 11553383c..77eb3e82f 100644 --- a/src/Text/Pandoc.hs +++ b/src/Text/Pandoc.hs @@ -160,7 +160,8 @@ import Text.Pandoc.Writers.Haddock import Text.Pandoc.Writers.Custom import Text.Pandoc.Templates import Text.Pandoc.Options -import Text.Pandoc.Shared (safeRead, warn, MediaBag) +import Text.Pandoc.Shared (safeRead, warn) +import Text.Pandoc.MediaBag (MediaBag) import Data.Aeson import qualified Data.ByteString.Lazy as BL import Data.List (intercalate) diff --git a/src/Text/Pandoc/MediaBag.hs b/src/Text/Pandoc/MediaBag.hs new file mode 100644 index 000000000..667089f55 --- /dev/null +++ b/src/Text/Pandoc/MediaBag.hs @@ -0,0 +1,107 @@ +{-# LANGUAGE GeneralizedNewtypeDeriving #-} +{- +Copyright (C) 2014 John MacFarlane <jgm@berkeley.edu> + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +-} + +{- | + Module : Text.Pandoc.MediaBag + Copyright : Copyright (C) 2014 John MacFarlane + License : GNU GPL, version 2 or above + + Maintainer : John MacFarlane <jgm@berkeley.edu> + Stability : alpha + Portability : portable + +Definition of a MediaBag object to hold binary resources, and an +interface for interacting with it. +-} +module Text.Pandoc.MediaBag ( + MediaBag, + lookupMedia, + insertMedia, + mediaDirectory, + extractMediaBag + ) where +import System.FilePath +import System.Directory (createDirectoryIfMissing) +import qualified Data.Map as M +import qualified Data.ByteString.Lazy as BL +import Data.Monoid (Monoid) +import Control.Monad (when, MonadPlus(..)) +import Text.Pandoc.MIME (getMimeType) +import qualified Text.Pandoc.UTF8 as UTF8 +import Data.Maybe (fromMaybe) +import System.IO (stderr) + +-- | A container for a collection of binary resources, with names and +-- mime types. Note that a 'MediaBag' is a Monoid, so 'mempty' +-- can be used for an empty 'MediaBag', and '<>' can be used to append +-- two 'MediaBag's. +newtype MediaBag = MediaBag (M.Map String (String, BL.ByteString)) + deriving (Monoid) + +instance Show MediaBag where + show bag = "MediaBag " ++ show (mediaDirectory bag) + +-- | Insert a media item into a 'MediaBag', replacing any existing +-- value with the same name. +insertMedia :: FilePath -- ^ relative path and canonical name of resource + -> Maybe String -- ^ mime type (Nothing = determine from extension) + -> BL.ByteString -- ^ contents of resource + -> MediaBag + -> MediaBag +insertMedia fp mbMime contents (MediaBag mediamap) = + MediaBag (M.insert fp (mime, contents) mediamap) + where mime = fromMaybe "application/octet-stream" (mbMime `mplus` fallback) + fallback = case takeExtension fp of + ".gz" -> getMimeType $ dropExtension fp + _ -> getMimeType fp + +-- | Lookup a media item in a 'MediaBag', returning mime type and contents. +lookupMedia :: FilePath + -> MediaBag + -> Maybe (String, BL.ByteString) +lookupMedia fp (MediaBag mediamap) = M.lookup fp mediamap + +-- | Get a list of the file paths stored in a 'MediaBag', with +-- their corresponding mime types and the lengths in bytes of the contents. +mediaDirectory :: MediaBag -> [(String, String, Int)] +mediaDirectory (MediaBag mediamap) = + M.foldWithKey (\fp (mime,contents) -> + ((fp, mime, fromIntegral $ BL.length contents):)) [] mediamap + +-- | Extract contents of MediaBag to a given directory. Print informational +-- messages if 'verbose' is true. +extractMediaBag :: Bool + -> FilePath + -> MediaBag + -> IO () +extractMediaBag verbose dir (MediaBag mediamap) = do + sequence_ $ M.foldWithKey + (\fp (_ ,contents) -> + ((writeMedia verbose dir (fp, contents)):)) [] mediamap + +writeMedia :: Bool -> FilePath -> (FilePath, BL.ByteString) -> IO () +writeMedia verbose dir (subpath, bs) = do + -- we join and split to convert a/b/c to a\b\c on Windows; + -- in zip containers all paths use / + let fullpath = dir </> joinPath (splitPath subpath) + createDirectoryIfMissing True $ takeDirectory fullpath + when verbose $ UTF8.hPutStrLn stderr $ "pandoc: extracting " ++ fullpath + BL.writeFile fullpath bs + + diff --git a/src/Text/Pandoc/Options.hs b/src/Text/Pandoc/Options.hs index 2de1a9e35..85a6a3096 100644 --- a/src/Text/Pandoc/Options.hs +++ b/src/Text/Pandoc/Options.hs @@ -49,8 +49,8 @@ import Data.Set (Set) import qualified Data.Set as Set import Data.Default import Text.Pandoc.Highlighting (Style, pygments) -import Text.Pandoc.Shared (MediaBag) -import qualified Data.Map as M +import Text.Pandoc.MediaBag (MediaBag) +import Data.Monoid -- | Individually selectable syntax extensions. data Extension = @@ -359,7 +359,7 @@ instance Default WriterOptions where , writerTOCDepth = 3 , writerReferenceODT = Nothing , writerReferenceDocx = Nothing - , writerMediaBag = M.empty + , writerMediaBag = mempty } -- | Returns True if the given extension is enabled. diff --git a/src/Text/Pandoc/Readers/Docx.hs b/src/Text/Pandoc/Readers/Docx.hs index f5fb6565a..86ce62ced 100644 --- a/src/Text/Pandoc/Readers/Docx.hs +++ b/src/Text/Pandoc/Readers/Docx.hs @@ -84,8 +84,10 @@ import Text.Pandoc.Readers.Docx.Lists import Text.Pandoc.Readers.Docx.Reducible import Text.Pandoc.Readers.Docx.TexChar import Text.Pandoc.Shared +import Text.Pandoc.MediaBag (insertMedia, MediaBag) import Data.Maybe (mapMaybe, fromMaybe) import Data.List (delete, isPrefixOf, (\\), intercalate, intersect) +import Data.Monoid import qualified Data.ByteString.Lazy as B import qualified Data.Map as M import Control.Monad.Reader @@ -108,7 +110,7 @@ data DState = DState { docxAnchorMap :: M.Map String String defaultDState :: DState defaultDState = DState { docxAnchorMap = M.empty - , docxMediaBag = M.empty + , docxMediaBag = mempty , docxInHeaderBlock = False , docxInTexSubscript = False} @@ -217,7 +219,8 @@ runStyleToContainers rPr = , if isStrike rPr then (Just Strikeout) else Nothing , if isSuperScript rPr then (Just Superscript) else Nothing , if isSubScript rPr then (Just Subscript) else Nothing - , rUnderline rPr >>= (\f -> Just $ Span ("", [], [("underline", f)])) + , rUnderline rPr >>= + (\f -> if f == "single" then (Just Emph) else Nothing) ] in classContainers ++ formatters @@ -369,7 +372,7 @@ parPartToInlines (BookMark _ anchor) = return [Span (newAnchor, ["anchor"], []) []] parPartToInlines (Drawing fp bs) = do mediaBag <- gets docxMediaBag - modify $ \s -> s { docxMediaBag = M.insert fp bs mediaBag} + modify $ \s -> s { docxMediaBag = insertMedia fp Nothing bs mediaBag } return [Image [] (fp, "")] parPartToInlines (InternalHyperLink anchor runs) = do ils <- concatMapM runToInlines runs diff --git a/src/Text/Pandoc/SelfContained.hs b/src/Text/Pandoc/SelfContained.hs index 777da3551..adb2c0014 100644 --- a/src/Text/Pandoc/SelfContained.hs +++ b/src/Text/Pandoc/SelfContained.hs @@ -41,11 +41,11 @@ import System.FilePath (takeExtension, dropExtension, takeDirectory, (</>)) import Data.Char (toLower, isAscii, isAlphaNum) import Codec.Compression.GZip as Gzip import qualified Data.ByteString.Lazy as L -import Text.Pandoc.Shared (renderTags', openURL, readDataFile, err, MediaBag) +import Text.Pandoc.Shared (renderTags', openURL, readDataFile, err) +import Text.Pandoc.MediaBag (MediaBag, lookupMedia) import Text.Pandoc.UTF8 (toString, fromString) import Text.Pandoc.MIME (getMimeType) import System.Directory (doesFileExist) -import qualified Data.Map as M isOk :: Char -> Bool isOk c = isAscii c && isAlphaNum c @@ -110,16 +110,20 @@ getItem media userdata f = -- this is needed for things like cmunrm.eot?#iefix, -- which is used to get old versions of IE to work with web fonts. let f' = takeWhile (\c -> c /= '?' && c /= '#') f - let mime = case takeExtension f' of - ".gz" -> getMimeType $ dropExtension f' - x -> getMimeType x + let mbMime = case takeExtension f' of + ".gz" -> getMimeType $ dropExtension f' + x -> getMimeType x exists <- doesFileExist f' - cont <- if exists - then B.readFile f' - else case M.lookup f media of - Just bs -> return $ BS.concat $ L.toChunks bs - Nothing -> readDataFile userdata f' - return (cont, mime) + if exists + then do + cont <- B.readFile f' + return (cont, mbMime) + else case lookupMedia f media of + Just (mime,bs) -> return (BS.concat $ L.toChunks bs, + Just mime) + Nothing -> do + cont <- readDataFile userdata f' + return (cont, mbMime) where handleErr e = err 61 $ "Failed to retrieve " ++ f ++ "\n" ++ show e getRaw :: MediaBag -> Maybe FilePath -> String -> String diff --git a/src/Text/Pandoc/Shared.hs b/src/Text/Pandoc/Shared.hs index 81aa6cf5a..d5769c1ab 100644 --- a/src/Text/Pandoc/Shared.hs +++ b/src/Text/Pandoc/Shared.hs @@ -48,8 +48,6 @@ module Text.Pandoc.Shared ( toRomanNumeral, escapeURI, tabFilter, - -- * Media Handling - MediaBag, -- * Date/time normalizeDate, -- * Pandoc block and inline list processing @@ -92,6 +90,7 @@ module Text.Pandoc.Shared ( import Text.Pandoc.Definition import Text.Pandoc.Walk +import Text.Pandoc.MediaBag (MediaBag, lookupMedia) import Text.Pandoc.Builder (Inlines, Blocks, ToMetaValue(..)) import qualified Text.Pandoc.Builder as B import qualified Text.Pandoc.UTF8 as UTF8 @@ -106,7 +105,7 @@ import Network.URI ( escapeURIString, isURI, nonStrictRelativeTo, import qualified Data.Set as Set import System.Directory import Text.Pandoc.MIME (getMimeType) -import System.FilePath ( (</>), takeExtension, dropExtension ) +import System.FilePath ( (</>), takeExtension, dropExtension) import Data.Generics (Typeable, Data) import qualified Control.Monad.State as S import qualified Control.Exception as E @@ -120,7 +119,6 @@ import Text.HTML.TagSoup (renderTagsOptions, RenderOptions(..), Tag(..), renderOptions) import qualified Data.ByteString as BS import qualified Data.ByteString.Char8 as B8 -import qualified Data.ByteString.Lazy as BL import Text.Pandoc.Compat.Monoid import Data.ByteString.Base64 (decodeLenient) import Data.Sequence (ViewR(..), ViewL(..), viewl, viewr) @@ -287,14 +285,6 @@ tabFilter tabStop = x : go (spsToNextStop - 1) xs in go tabStop ---- ---- Media handling ---- - --- | A map of media paths to their binary representations. - -type MediaBag = M.Map String BL.ByteString - -- -- Date/time -- @@ -796,13 +786,9 @@ fetchItem sourceURL s fetchItem' :: MediaBag -> Maybe String -> String -> IO (Either E.SomeException (BS.ByteString, Maybe String)) fetchItem' media sourceURL s = do - case M.lookup s media of + case lookupMedia s media of Nothing -> fetchItem sourceURL s - Just bs -> do - let mime = case takeExtension s of - ".gz" -> getMimeType $ dropExtension s - x -> getMimeType x - return $ Right (BS.concat $ toChunks bs, mime) + Just (mime, bs) -> return $ Right (BS.concat $ toChunks bs, Just mime) -- | Read from a URL and return raw data and maybe mime type. openURL :: String -> IO (Either E.SomeException (BS.ByteString, Maybe String)) diff --git a/src/Text/Pandoc/Writers/EPUB.hs b/src/Text/Pandoc/Writers/EPUB.hs index 1f222b8b8..770b6f244 100644 --- a/src/Text/Pandoc/Writers/EPUB.hs +++ b/src/Text/Pandoc/Writers/EPUB.hs @@ -61,6 +61,7 @@ import Text.Pandoc.MIME (getMimeType) import qualified Control.Exception as E import Text.Blaze.Html.Renderer.Utf8 (renderHtml) import Text.HTML.TagSoup +import Data.Monoid -- A Chapter includes a list of blocks and maybe a section -- number offset. Note, some chapters are unnumbered. The section @@ -793,7 +794,7 @@ transformInline opts mediaRef (Image lab (src,tit)) = do return $ Image lab (newsrc, tit) transformInline opts _ (x@(Math _ _)) | WebTeX _ <- writerHTMLMathMethod opts = do - raw <- makeSelfContained M.empty Nothing $ writeHtmlInline opts x + raw <- makeSelfContained mempty Nothing $ writeHtmlInline opts x return $ RawInline (Format "html") raw transformInline opts mediaRef (RawInline fmt raw) | fmt == Format "html" = do diff --git a/tests/Tests/Readers/Docx.hs b/tests/Tests/Readers/Docx.hs index 494669fd5..efc520dba 100644 --- a/tests/Tests/Readers/Docx.hs +++ b/tests/Tests/Readers/Docx.hs @@ -5,13 +5,15 @@ import Text.Pandoc.Readers.Native import Text.Pandoc.Definition import Tests.Helpers import Test.Framework -import qualified Data.ByteString as BS +import Test.HUnit (assertBool) +import Test.Framework.Providers.HUnit import qualified Data.ByteString.Lazy as B -import qualified Data.ByteString.Char8 as B8 -import qualified Data.ByteString.Base64 as B64 import Text.Pandoc.Readers.Docx import Text.Pandoc.Writers.Native (writeNative) import qualified Data.Map as M +import Text.Pandoc.MediaBag (MediaBag, lookupMedia, mediaDirectory) +import Codec.Archive.Zip +import System.FilePath (combine) -- We define a wrapper around pandoc that doesn't normalize in the -- tests. Since we do our own normalization, we want to make sure @@ -55,22 +57,44 @@ testCompareWithOpts opts name docxFile nativeFile = testCompare :: String -> FilePath -> FilePath -> Test testCompare = testCompareWithOpts def -testCompareMediaIO :: String -> FilePath -> FilePath -> FilePath -> IO Test -testCompareMediaIO name docxFile mediaPath mediaFile = do +getMedia :: FilePath -> FilePath -> IO (Maybe B.ByteString) +getMedia archivePath mediaPath = do + zf <- B.readFile archivePath >>= return . toArchive + return $ findEntryByPath (combine "word" mediaPath) zf >>= (Just . fromEntry) + +compareMediaPathIO :: FilePath -> MediaBag -> FilePath -> IO Bool +compareMediaPathIO mediaPath mediaBag docxPath = do + docxMedia <- getMedia docxPath mediaPath + let mbBS = case lookupMedia mediaPath mediaBag of + Just (_, bs) -> bs + Nothing -> error ("couldn't find " ++ + mediaPath ++ + " in media bag") + docxBS = case docxMedia of + Just bs -> bs + Nothing -> error ("couldn't find " ++ + mediaPath ++ + " in media bag") + return $ mbBS == docxBS + +compareMediaBagIO :: FilePath -> IO Bool +compareMediaBagIO docxFile = do df <- B.readFile docxFile - mf <- B.readFile mediaFile let (_, mb) = readDocx def df - dBytes = case M.lookup mediaPath mb of - Just bs -> bs - Nothing -> error "Media file not found" - d64 = B8.unpack $ B64.encode $ BS.concat $ B.toChunks dBytes - m64 = B8.unpack $ B64.encode $ BS.concat $ B.toChunks mf - return $ test id name (d64, m64) + bools <- mapM + (\(fp, _, _) -> compareMediaPathIO fp mb docxFile) + (mediaDirectory mb) + return $ and bools -testCompareMedia :: String -> FilePath -> FilePath -> FilePath -> Test -testCompareMedia name docxFile mediaPath mediaFile = - buildTest $ testCompareMediaIO name docxFile mediaPath mediaFile +testMediaBagIO :: String -> FilePath -> IO Test +testMediaBagIO name docxFile = do + outcome <- compareMediaBagIO docxFile + return $ testCase name (assertBool + ("Media didn't match media bag in file " ++ docxFile) + outcome) +testMediaBag :: String -> FilePath -> Test +testMediaBag name docxFile = buildTest $ testMediaBagIO name docxFile tests :: [Test] tests = [ testGroup "inlines" @@ -185,11 +209,9 @@ tests = [ testGroup "inlines" "docx.track_changes_deletion_all.native" ] , testGroup "media" - [ testCompareMedia + [ testMediaBag "image extraction" "docx.image.docx" - "media/image1.jpeg" - "docx.image1.jpeg" ] , testGroup "metadata" [ testCompareWithOpts def{readerStandalone=True} diff --git a/tests/docx.image.docx b/tests/docx.image.docx Binary files differindex 060f2b204..06e4efd1a 100644 --- a/tests/docx.image.docx +++ b/tests/docx.image.docx diff --git a/tests/docx.image1.jpeg b/tests/docx.image1.jpeg Binary files differdeleted file mode 100644 index 423dff48b..000000000 --- a/tests/docx.image1.jpeg +++ /dev/null diff --git a/tests/docx.image_no_embed.native b/tests/docx.image_no_embed.native index aa0f65d27..95c73610e 100644 --- a/tests/docx.image_no_embed.native +++ b/tests/docx.image_no_embed.native @@ -1,2 +1,2 @@ -[Header 2 ("an-image",[],[]) [Str "An",Space,Str "image"] -,Para [Image [] ("media/image1.jpeg","")]] +[Para [Str "An",Space,Str "image:"] +,Para [Image [] ("media/image1.jpg","")]] diff --git a/tests/docx.inline_formatting.native b/tests/docx.inline_formatting.native index dc8a3d19a..22d8f79e8 100644 --- a/tests/docx.inline_formatting.native +++ b/tests/docx.inline_formatting.native @@ -1,5 +1,5 @@ [Para [Str "Regular",Space,Str "text",Space,Emph [Str "italics"],Space,Strong [Str "bold",Space,Emph [Str "bold",Space,Str "italics"]],Str "."] ,Para [Str "This",Space,Str "is",Space,SmallCaps [Str "Small",Space,Str "Caps"],Str ",",Space,Str "and",Space,Str "this",Space,Str "is",Space,Strikeout [Str "strikethrough"],Str "."] -,Para [Str "Some",Space,Str "people",Space,Str "use",Space,Span ("",[],[("underline","single")]) [Str "single",Space,Str "underlines",Space,Str "for",Space,Emph [Str "emphasis"]],Str "."] +,Para [Str "Some",Space,Str "people",Space,Str "use",Space,Emph [Str "single",Space,Str "underlines",Space,Str "for",Space,Emph [Str "emphasis"]],Str "."] ,Para [Str "Above",Space,Str "the",Space,Str "line",Space,Str "is",Space,Superscript [Str "superscript"],Space,Str "and",Space,Str "below",Space,Str "the",Space,Str "line",Space,Str "is",Space,Subscript [Str "subscript"],Str "."] ,Para [Str "A",Space,Str "line",LineBreak,Str "break."]] |