diff options
-rw-r--r-- | README | 138 | ||||
-rw-r--r-- | benchmark/benchmark-pandoc.hs | 36 | ||||
-rw-r--r-- | pandoc.cabal | 17 | ||||
-rw-r--r-- | pandoc.hs | 153 | ||||
-rw-r--r-- | src/Text/Pandoc/Readers/Docx.hs | 3 | ||||
-rw-r--r-- | src/Text/Pandoc/SelfContained.hs | 93 | ||||
-rw-r--r-- | src/Text/Pandoc/Shared.hs | 43 | ||||
-rw-r--r-- | src/Text/Pandoc/Writers/EPUB.hs | 3 | ||||
-rw-r--r-- | src/Text/Pandoc/Writers/FB2.hs | 11 | ||||
-rw-r--r-- | src/Text/Pandoc/Writers/Markdown.hs | 10 | ||||
-rw-r--r-- | tests/Tests/Readers/Docx.hs | 59 | ||||
-rw-r--r-- | tests/Tests/Writers/Plain.hs | 21 | ||||
-rw-r--r-- | tests/docx.image.docx | bin | 109656 -> 36942 bytes | |||
-rw-r--r-- | tests/docx.image1.jpeg | bin | 46626 -> 0 bytes | |||
-rw-r--r-- | tests/docx.image_no_embed.native | 4 | ||||
-rw-r--r-- | tests/docx.inline_formatting.native | 2 | ||||
-rw-r--r-- | tests/test-pandoc.hs | 2 |
17 files changed, 318 insertions, 277 deletions
@@ -938,7 +938,7 @@ A paragraph is one or more lines of text followed by one or more blank line. Newlines are treated as spaces, so you can reflow your paragraphs as you like. If you need a hard line break, put two or more spaces at the end of a line. -**Extension: `escaped_line_breaks`** +#### Extension: `escaped_line_breaks` #### A backslash followed by a newline is also a hard line break. Note: in multiline and grid table cells, this is the only way @@ -979,7 +979,7 @@ As with setext-style headers, the header text can contain formatting: # A level-one header with a [link](/url) and *emphasis* -**Extension: `blank_before_header`** +#### Extension: `blank_before_header` #### Standard markdown syntax does not require a blank line before a header. Pandoc does require this (except, of course, at the beginning of the @@ -993,7 +993,7 @@ wrapping). Consider, for example: ### Header identifiers in HTML, LaTeX, and ConTeXt ### -**Extension: `header_attributes`** +#### Extension: `header_attributes` #### Headers can be assigned attributes using this syntax at the end of the line containing the header text: @@ -1029,7 +1029,7 @@ is just the same as # My header {.unnumbered} -**Extension: `auto_identifiers`** +#### Extension: `auto_identifiers` #### A header without an explicitly specified identifier will be automatically assigned a unique identifier based on the header text. @@ -1078,7 +1078,7 @@ and the identifier will be attached to the enclosing `<div>` sections to be manipulated using javascript or treated differently in CSS. -**Extension: `implicit_header_references`** +#### Extension: `implicit_header_references` #### Pandoc behaves as if reference links have been defined for each header. So, instead of @@ -1137,7 +1137,7 @@ other block quotes. That is, block quotes can be nested: > > > A block quote within a block quote. -**Extension: `blank_before_blockquote`** +#### Extension: `blank_before_blockquote` #### Standard markdown syntax does not require a blank line before a block quote. Pandoc does require this (except, of course, at the beginning of the @@ -1171,7 +1171,7 @@ Note: blank lines in the verbatim text need not begin with four spaces. ### Fenced code blocks ### -**Extension: `fenced_code_blocks`** +#### Extension: `fenced_code_blocks` #### In addition to standard indented code blocks, Pandoc supports *fenced* code blocks. These begin with a row of three or more @@ -1197,7 +1197,7 @@ row of tildes or backticks at the start and end: ~~~~~~~~~~ ~~~~~~~~~~~~~~~~ -**Extension: `fenced_code_attributes`** +#### Extension: `fenced_code_attributes` #### Optionally, you may attach attributes to the code block using this syntax: @@ -1246,7 +1246,7 @@ To set the highlighting style, use `--highlight-style`. Line blocks ----------- -**Extension: `line_blocks`** +#### Extension: `line_blocks` #### A line block is a sequence of lines beginning with a vertical bar (`|`) followed by a space. The division into lines will be preserved in @@ -1388,7 +1388,7 @@ and this one: 7. two 1. three -**Extension: `fancy_lists`** +#### Extension: `fancy_lists` #### Unlike standard markdown, Pandoc allows ordered list items to be marked with uppercase and lowercase letters and roman numerals, in addition to @@ -1419,7 +1419,7 @@ ordered list marker in place of a numeral: #. one #. two -**Extension: `startnum`** +#### Extension: `startnum` #### Pandoc also pays attention to the type of list marker used, and to the starting number, and both of these are preserved where possible in the @@ -1451,7 +1451,7 @@ If default list markers are desired, use `#.`: ### Definition lists ### -**Extension: `definition_lists`** +#### Extension: `definition_lists` #### Pandoc supports definition lists, using the syntax of [PHP Markdown Extra] with some extensions.[^3] @@ -1512,7 +1512,7 @@ hard wrapping, can be activated with `compact_definition_lists`: see ### Numbered example lists ### -**Extension: `example_lists`** +#### Extension: `example_lists` #### The special list marker `@` can be used for sequentially numbered examples. The first list item with a `@` marker will be numbered '1', @@ -1618,9 +1618,14 @@ Four kinds of tables may be used. The first three kinds presuppose the use of a fixed-width font, such as Courier. The fourth kind can be used with proportionally spaced fonts, as it does not require lining up columns. -### Simple tables +#### Extension: `table_captions` #### -**Extension: `simple_tables`, `table_captions`** +A caption may optionally be provided with all 4 kinds of tables (as +illustrated in the examples below). A caption is a paragraph beginning +with the string `Table:` (or just `:`), which will be stripped off. +It may appear either before or after the table. + +#### Extension: `simple_tables` #### Simple tables look like this: @@ -1649,10 +1654,7 @@ to the dashed line below it:[^4] [Markdown discussion list](http://six.pairlist.net/pipermail/markdown-discuss/2005-March/001097.html). The table must end with a blank line, or a line of dashes followed by -a blank line. A caption may optionally be provided (as illustrated in -the example above). A caption is a paragraph beginning with the string -`Table:` (or just `:`), which will be stripped off. It may appear either -before or after the table. +a blank line. The column headers may be omitted, provided a dashed line is used to end the table. For example: @@ -1667,9 +1669,7 @@ When headers are omitted, column alignments are determined on the basis of the first line of the table body. So, in the tables above, the columns would be right, left, center, and right aligned, respectively. -### Multiline tables - -**Extension: `multiline_tables`, `table_captions`** +#### Extension: `multiline_tables` #### Multiline tables allow headers and table rows to span multiple lines of text (but cells that span multiple columns or rows of the table are @@ -1719,9 +1719,7 @@ It is possible for a multiline table to have just one row, but the row should be followed by a blank line (and then the row of dashes that ends the table), or the table may be interpreted as a simple table. -### Grid tables - -**Extension: `grid_tables`, `table_captions`** +#### Extension: `grid_tables` #### Grid tables look like this: @@ -1745,9 +1743,7 @@ columns or rows. Grid tables can be created easily using [Emacs table mode]. [Emacs table mode]: http://table.sourceforge.net/ -### Pipe tables - -**Extension: `pipe_tables`, `table_captions`** +#### Extension: `pipe_tables` #### Pipe tables look like this: @@ -1757,7 +1753,7 @@ Pipe tables look like this: | 123 | 123 | 123 | 123 | | 1 | 1 | 1 | 1 | - : Demonstration of simple table syntax. + : Demonstration of pipe table syntax. The syntax is [the same as in PHP markdown extra]. The beginning and ending pipe characters are optional, but pipes are required between all @@ -1796,10 +1792,10 @@ The difference is that `+` is used instead of `|`. Other orgtbl features are not supported. In particular, to get non-default column alignment, you'll need to add colons as above. -Title block ------------ +Metadata blocks +--------------- -**Extension: `pandoc_title_block`** +#### Extension: `pandoc_title_block` #### If the file begins with a title block @@ -1875,10 +1871,7 @@ will also have "Pandoc User Manuals" in the footer. will also have "Version 4.0" in the header. -YAML metadata block -------------------- - -**Extension: `yaml_metadata_block`** +#### Extension: `yaml_metadata_block` #### A YAML metadata block is a valid YAML object, delimited by a line of three hyphens (`---`) at the top and a line of three hyphens (`---`) or three dots @@ -1946,7 +1939,7 @@ custom template. For example: Backslash escapes ----------------- -**Extension: `all_symbols_escapable`** +#### Extension: `all_symbols_escapable` #### Except inside a code block or inline code, any punctuation or space character preceded by a backslash will be treated literally, even if it @@ -1985,7 +1978,7 @@ Backslash escapes do not work in verbatim contexts. Smart punctuation ----------------- -**Extension** +#### Extension #### If the `--smart` option is specified, pandoc will produce typographically correct output, converting straight quotes to curly quotes, `---` to @@ -2014,7 +2007,7 @@ will not trigger emphasis: This is * not emphasized *, and \*neither is this\*. -**Extension: `intraword_underscores`** +#### Extension: `intraword_underscores` #### Because `_` is sometimes used inside words and identifiers, pandoc does not interpret a `_` surrounded by alphanumeric @@ -2026,7 +2019,7 @@ just part of a word, use `*`: ### Strikeout ### -**Extension: `strikeout`** +#### Extension: `strikeout` #### To strikeout a section of text with a horizontal line, begin and end it with `~~`. Thus, for example, @@ -2036,7 +2029,7 @@ with `~~`. Thus, for example, ### Superscripts and subscripts ### -**Extension: `superscript`, `subscript`** +#### Extension: `superscript`, `subscript` #### Superscripts may be written by surrounding the superscripted text by `^` characters; subscripts may be written by surrounding the subscripted @@ -2074,7 +2067,7 @@ work in verbatim contexts: This is a backslash followed by an asterisk: `\*`. -**Extension: `inline_code_attributes`** +#### Extension: `inline_code_attributes` #### Attributes can be attached to verbatim text, just as with [fenced code blocks](#fenced-code-blocks): @@ -2093,7 +2086,7 @@ colon.) This will work in all output formats that support small caps. Math ---- -**Extension: `tex_math_dollars`** +#### Extension: `tex_math_dollars` #### Anything between two `$` characters will be treated as TeX math. The opening `$` must have a character immediately to its right, while the @@ -2195,7 +2188,7 @@ HTML, Slidy, DZSlides, S5, EPUB Raw HTML -------- -**Extension: `raw_html`** +#### Extension: `raw_html` #### Markdown allows you to insert raw HTML (or DocBook) anywhere in a document (except verbatim contexts, where `<`, `>`, and `&` are interpreted @@ -2207,7 +2200,7 @@ The raw HTML is passed through unchanged in HTML, S5, Slidy, Slideous, DZSlides, EPUB, Markdown, and Textile output, and suppressed in other formats. -**Extension: `markdown_in_html_blocks`** +#### Extension: `markdown_in_html_blocks` #### Standard markdown allows you to include HTML "blocks": blocks of HTML between balanced tags that are separated from the surrounding text @@ -2248,7 +2241,7 @@ from being interpreted as markdown. Raw TeX ------- -**Extension: `raw_tex`** +#### Extension: `raw_tex` #### In addition to raw HTML, pandoc allows raw LaTeX, TeX, and ConTeXt to be included in a document. Inline TeX commands will be preserved and passed @@ -2275,7 +2268,7 @@ and ConTeXt. LaTeX macros ------------ -**Extension: `latex_macros`** +#### Extension: `latex_macros` #### For output formats other than LaTeX, pandoc will parse LaTeX `\newcommand` and `\renewcommand` definitions and apply the resulting macros to all LaTeX @@ -2367,7 +2360,7 @@ not in most other implementations: > > [quote]: /foo -### Internal links +### Internal links ### To link to another section of the same document, use the automatically generated identifier (see [Header identifiers in HTML, LaTeX, and @@ -2397,9 +2390,7 @@ The link text will be used as the image's alt text: [movie reel]: movie.gif -### Pictures with captions ### - -**Extension: `implicit_figures`** +#### Extension: `implicit_figures` #### An image occurring by itself in a paragraph will be rendered as a figure with a caption.[^5] (In LaTeX, a figure environment will be @@ -2423,7 +2414,7 @@ nonbreaking space after the image: Footnotes --------- -**Extension: `footnotes`** +#### Extension: `footnotes` #### Pandoc's markdown allows footnotes, using the following syntax: @@ -2454,7 +2445,7 @@ The footnotes themselves need not be placed at the end of the document. They may appear anywhere except inside other block elements (lists, block quotes, tables, etc.). -**Extension: `inline_notes`** +#### Extension: `inline_notes` #### Inline footnotes are also allowed (though, unlike regular notes, they cannot contain multiple paragraphs). The syntax is as follows: @@ -2469,7 +2460,7 @@ Inline and regular footnotes may be mixed freely. Citations --------- -**Extension: `citations`** +#### Extension: `citations` #### Using an external filter, `pandoc-citeproc`, pandoc can automatically generate citations and a bibliography in a number of styles. Basic usage is @@ -2591,37 +2582,44 @@ in pandoc, but may be enabled by adding `+EXTENSION` to the format name, where `EXTENSION` is the name of the extension. Thus, for example, `markdown+hard_line_breaks` is markdown with hard line breaks. -**Extension: `lists_without_preceding_blankline`**\ +#### Extension: `lists_without_preceding_blankline` #### + Allow a list to occur right after a paragraph, with no intervening blank space. -**Extension: `hard_line_breaks`**\ +#### Extension: `hard_line_breaks` #### + Causes all newlines within a paragraph to be interpreted as hard line breaks instead of spaces. -**Extension: `ignore_line_breaks`**\ +#### Extension: `ignore_line_breaks` #### + Causes newlines within a paragraph to be ignored, rather than being treated as spaces or as hard line breaks. This option is intended for use with East Asian languages where spaces are not used between words, but text is divided into lines for readability. -**Extension: `tex_math_single_backslash`**\ +#### Extension: `tex_math_single_backslash` #### + Causes anything between `\(` and `\)` to be interpreted as inline TeX math, and anything between `\[` and `\]` to be interpreted as display TeX math. Note: a drawback of this extension is that it precludes escaping `(` and `[`. -**Extension: `tex_math_double_backslash`**\ +#### Extension: `tex_math_double_backslash` #### + Causes anything between `\\(` and `\\)` to be interpreted as inline TeX math, and anything between `\\[` and `\\]` to be interpreted as display TeX math. -**Extension: `markdown_attribute`**\ +#### Extension: `markdown_attribute` #### + By default, pandoc interprets material inside block-level tags as markdown. This extension changes the behavior so that markdown is only parsed inside block-level tags if the tags have the attribute `markdown=1`. -**Extension: `mmd_title_block`**\ +#### Extension: `mmd_title_block` #### + Enables a [MultiMarkdown] style title block at the top of the document, for example: @@ -2637,7 +2635,8 @@ See the MultiMarkdown documentation for details. If `pandoc_title_block` or [MultiMarkdown]: http://fletcherpenney.net/multimarkdown/ -**Extension: `abbreviations`**\ +#### Extension: `abbreviations` #### + Parses PHP Markdown Extra abbreviation keys, like *[HTML]: Hyper Text Markup Language @@ -2646,25 +2645,30 @@ Note that the pandoc document model does not support abbreviations, so if this extension is enabled, abbreviation keys are simply skipped (as opposed to being parsed as paragraphs). -**Extension: `autolink_bare_uris`**\ +#### Extension: `autolink_bare_uris` #### + Makes all absolute URIs into links, even when not surrounded by pointy braces `<...>`. -**Extension: `ascii_identifiers`**\ +#### Extension: `ascii_identifiers` #### + Causes the identifiers produced by `auto_identifiers` to be pure ASCII. Accents are stripped off of accented latin letters, and non-latin letters are omitted. -**Extension: `link_attributes`**\ +#### Extension: `link_attributes` #### + Parses multimarkdown style key-value attributes on link and image references. Note that pandoc's internal document model provides nowhere to put these, so they are presently just ignored. -**Extension: `mmd_header_identifiers`**\ +#### Extension: `mmd_header_identifiers` #### + Parses multimarkdown style header identifiers (in square brackets, after the header but before any trailing `#`s in an ATX header). -**Extension: `compact_definition_lists`**\ +#### Extension: `compact_definition_lists` #### + Activates the definition list syntax of pandoc 1.12.x and earlier. This syntax differs from the one described [above](#definition-lists) in several respects: diff --git a/benchmark/benchmark-pandoc.hs b/benchmark/benchmark-pandoc.hs index 9238b09d7..bf67eaa4d 100644 --- a/benchmark/benchmark-pandoc.hs +++ b/benchmark/benchmark-pandoc.hs @@ -20,19 +20,18 @@ import Criterion.Main import Criterion.Config import System.Environment (getArgs) import Data.Monoid +import Data.Maybe (mapMaybe) +import Debug.Trace (trace) readerBench :: Pandoc -> (String, ReaderOptions -> String -> IO Pandoc) - -> Benchmark -readerBench doc (name, reader) = - let writer = case lookup name writers of - Just (PureStringWriter w) -> w - _ -> error $ "Could not find writer for " ++ name - inp = writer def{ writerWrapText = True } doc - -- we compute the length to force full evaluation - getLength (Pandoc (Meta _) d) = length d - in bench (name ++ " reader") $ whnfIO $ getLength `fmap` - (reader def{ readerSmart = True }) inp + -> Maybe Benchmark +readerBench doc (name, reader) = case lookup name writers of + Just (PureStringWriter writer) -> + let inp = writer def{ writerWrapText = True} doc + in return $ bench (name ++ " reader") $ nfIO $ + (reader def{ readerSmart = True }) inp + _ -> trace ("\nCould not find writer for " ++ name ++ "\n") Nothing writerBench :: Pandoc -> (String, WriterOptions -> Pandoc -> String) @@ -43,13 +42,16 @@ writerBench doc (name, writer) = bench (name ++ " writer") $ nf main :: IO () main = do args <- getArgs - (conf,_) <- parseArgs defaultConfig{ cfgSamples = Last $ Just 20 } defaultOptions args - inp <- readFile "README" - inp2 <- readFile "tests/testsuite.txt" + (conf,_) <- parseArgs defaultConfig{ cfgSamples = Last $ Just 20 } + defaultOptions args + inp <- readFile "tests/testsuite.txt" let opts = def{ readerSmart = True } - let doc = readMarkdown opts $ inp ++ unlines (drop 3 $ lines inp2) - let readerBs = map (readerBench doc) - $ filter (\(n,_) -> n /="haddock") readers + let doc = readMarkdown opts inp + let readers' = [(n,r) | (n, StringReader r) <- readers] + let readerBs = mapMaybe (readerBench doc) + $ filter (\(n,_) -> n /="haddock") readers' let writers' = [(n,w) | (n, PureStringWriter w) <- writers] + let writerBs = map (writerBench doc) + $ writers' defaultMainWith conf (return ()) $ - map (writerBench doc) writers' ++ readerBs + writerBs ++ readerBs diff --git a/pandoc.cabal b/pandoc.cabal index 991398f24..d7bc5479e 100644 --- a/pandoc.cabal +++ b/pandoc.cabal @@ -227,12 +227,12 @@ Library old-locale >= 1 && < 1.1, time >= 1.2 && < 1.5, HTTP >= 4000.0.5 && < 4000.3, - texmath >= 0.6.6.3 && < 0.7, + texmath >= 0.7 && < 0.8, xml >= 1.3.12 && < 1.4, random >= 1 && < 1.1, extensible-exceptions >= 0.1 && < 0.2, - pandoc-types >= 1.12.3.4 && < 1.13, - aeson >= 0.7 && < 0.8, + pandoc-types >= 1.12.4 && < 1.13, + aeson >= 0.7 && < 0.9, tagsoup >= 0.13.1 && < 0.14, base64-bytestring >= 0.1 && < 1.1, zlib >= 0.5 && < 0.6, @@ -342,7 +342,7 @@ Library Executable pandoc Build-Depends: pandoc, - pandoc-types >= 1.12.3.3 && < 1.13, + pandoc-types >= 1.12.4 && < 1.13, base >= 4.2 && <5, directory >= 1 && < 1.3, filepath >= 1.1 && < 1.4, @@ -351,7 +351,7 @@ Executable pandoc bytestring >= 0.9 && < 0.11, extensible-exceptions >= 0.1 && < 0.2, highlighting-kate >= 0.5.8.5 && < 0.6, - aeson >= 0.7.0.5 && < 0.8, + aeson >= 0.7.0.5 && < 0.9, yaml >= 0.8.8.2 && < 0.9, containers >= 0.1 && < 0.6, HTTP >= 4000.0.5 && < 4000.3 @@ -387,8 +387,7 @@ Test-Suite test-pandoc Build-Depends: base >= 4.2 && < 5, syb >= 0.1 && < 0.5, pandoc, - pandoc-types >= 1.12.3.3 && < 1.13, - base64-bytestring >= 0.1 && < 1.1, + pandoc-types >= 1.12.4 && < 1.13, bytestring >= 0.9 && < 0.11, text >= 0.11 && < 1.2, directory >= 1 && < 1.3, @@ -402,7 +401,8 @@ Test-Suite test-pandoc QuickCheck >= 2.4 && < 2.8, HUnit >= 1.2 && < 1.3, containers >= 0.1 && < 0.6, - ansi-terminal >= 0.5 && < 0.7 + ansi-terminal >= 0.5 && < 0.7, + zip-archive >= 0.2.3.2 && < 0.3 Other-Modules: Tests.Old Tests.Helpers Tests.Arbitrary @@ -418,6 +418,7 @@ Test-Suite test-pandoc Tests.Writers.ConTeXt Tests.Writers.HTML Tests.Writers.Markdown + Tests.Writers.Plain Tests.Writers.AsciiDoc Tests.Writers.LaTeX Ghc-Options: -rtsopts -Wall -fno-warn-unused-do-bind @@ -38,13 +38,13 @@ import Text.Pandoc.Readers.LaTeX (handleIncludes) import Text.Pandoc.Shared ( tabFilter, readDataFileUTF8, readDataFile, safeRead, headerShift, normalize, err, warn, openURL ) -import Text.Pandoc.MediaBag ( mediaDirectory, extractMediaBag ) +import Text.Pandoc.MediaBag ( mediaDirectory, extractMediaBag, MediaBag ) import Text.Pandoc.XML ( toEntities ) import Text.Pandoc.SelfContained ( makeSelfContained ) import Text.Pandoc.Process (pipeProcess) import Text.Highlighting.Kate ( languages, Style, tango, pygments, espresso, zenburn, kate, haddock, monochrome ) -import System.Environment ( getArgs, getProgName ) +import System.Environment ( getArgs, getProgName, getEnvironment ) import System.Exit ( exitWith, ExitCode (..) ) import System.FilePath import System.Console.GetOpt @@ -71,6 +71,8 @@ import Control.Applicative ((<$>)) import Text.Pandoc.Readers.Txt2Tags (getT2TMeta) import Data.Monoid +type Transform = Pandoc -> Pandoc + copyrightMessage :: String copyrightMessage = "\nCopyright (C) 2006-2014 John MacFarlane\n" ++ "Web: http://johnmacfarlane.net/pandoc\n" ++ @@ -101,7 +103,10 @@ isTextFormat s = takeWhile (`notElem` "+-") s `notElem` ["odt","docx","epub","ep externalFilter :: FilePath -> [String] -> Pandoc -> IO Pandoc externalFilter f args' d = do - mbexe <- if '/' `elem` f -- don't check PATH if filter name it has a path + mbPath <- lookup "PATH" <$> getEnvironment + mbexe <- if '/' `elem` f || mbPath == Nothing + -- don't check PATH if filter name has a path, or + -- if the PATH is not set then return Nothing else findExecutable f (f', args'') <- case mbexe of @@ -141,7 +146,7 @@ data Opt = Opt , optWriter :: String -- ^ Writer format , optParseRaw :: Bool -- ^ Parse unconvertable HTML and TeX , optTableOfContents :: Bool -- ^ Include table of contents - , optTransforms :: [Pandoc -> Pandoc] -- ^ Doc transforms to apply + , optTransforms :: [Transform] -- ^ Doc transforms to apply , optTemplate :: Maybe FilePath -- ^ Custom template , optVariables :: [(String,String)] -- ^ Template variables to set , optMetadata :: M.Map String MetaValue -- ^ Metadata fields to set @@ -935,6 +940,31 @@ defaultWriterName x = ['.',y] | y `elem` ['1'..'9'] -> "man" _ -> "html" +-- Transformations of a Pandoc document post-parsing: + +extractMedia :: MediaBag -> FilePath -> Pandoc -> IO Pandoc +extractMedia media dir d = + case [fp | (fp, _, _) <- mediaDirectory media] of + [] -> return d + fps -> do + extractMediaBag True dir media + return $ walk (adjustImagePath dir fps) d + +adjustImagePath :: FilePath -> [FilePath] -> Inline -> Inline +adjustImagePath dir paths (Image lab (src, tit)) + | src `elem` paths = Image lab (dir ++ "/" ++ src, tit) +adjustImagePath _ _ x = x + +adjustMetadata :: M.Map String MetaValue -> Pandoc -> IO Pandoc +adjustMetadata metadata d = return $ M.foldWithKey setMeta d metadata + +applyTransforms :: [Transform] -> Pandoc -> IO Pandoc +applyTransforms transforms d = return $ foldr ($) d transforms + +applyFilters :: [FilePath] -> [String] -> Pandoc -> IO Pandoc +applyFilters filters args d = + foldrM ($) d $ map (flip externalFilter args) filters + main :: IO () main = do @@ -1027,7 +1057,6 @@ main = do all (\f -> takeBaseName f /= "pandoc-citeproc") filters -> "pandoc-citeproc" : filters _ -> filters - let plugins = map externalFilter filters' let sources = if ignoreArgs then [] else args @@ -1120,6 +1149,7 @@ main = do $ lines dztempl return $ ("dzslides-core", dzcore) : variables' else return variables' + let sourceURL = case sources of [] -> Nothing (x:_) -> case parseURI x of @@ -1144,6 +1174,40 @@ main = do , readerTrackChanges = trackChanges } + when (not (isTextFormat writerName') && outputFile == "-") $ + err 5 $ "Cannot write " ++ writerName' ++ " output to stdout.\n" ++ + "Specify an output file using the -o option." + + let readSources [] = mapM readSource ["-"] + readSources srcs = mapM readSource srcs + readSource "-" = UTF8.getContents + readSource src = case parseURI src of + Just u | uriScheme u `elem` ["http:","https:"] -> + readURI src + _ -> UTF8.readFile src + readURI src = do + res <- openURL src + case res of + Left e -> throwIO e + Right (bs,_) -> return $ UTF8.toString bs + + let readFiles [] = error "Cannot read archive from stdin" + readFiles (x:_) = B.readFile x + + let convertTabs = tabFilter (if (preserveTabs || readerName' == "t2t") then 0 else tabStop) + + let handleIncludes' = if readerName' == "latex" || readerName' == "latex+lhs" + then handleIncludes + else return + + (doc, media) <- + case reader of + StringReader r-> (, mempty) <$> + ( readSources >=> + handleIncludes' . convertTabs . intercalate "\n" >=> + r readerOpts ) sources + ByteStringReader r -> readFiles sources >>= r readerOpts + let writerOptions = def { writerStandalone = standalone', writerTemplate = templ, writerVariables = variables'', @@ -1179,70 +1243,15 @@ main = do writerEpubChapterLevel = epubChapterLevel, writerTOCDepth = epubTOCDepth, writerReferenceODT = referenceODT, - writerReferenceDocx = referenceDocx + writerReferenceDocx = referenceDocx, + writerMediaBag = media } - when (not (isTextFormat writerName') && outputFile == "-") $ - err 5 $ "Cannot write " ++ writerName' ++ " output to stdout.\n" ++ - "Specify an output file using the -o option." - - let readSources [] = mapM readSource ["-"] - readSources srcs = mapM readSource srcs - readSource "-" = UTF8.getContents - readSource src = case parseURI src of - Just u | uriScheme u `elem` ["http:","https:"] -> - readURI src - _ -> UTF8.readFile src - readURI src = do - res <- openURL src - case res of - Left e -> throwIO e - Right (bs,_) -> return $ UTF8.toString bs - - let readFiles [] = error "Cannot read archive from stdin" - readFiles (x:_) = B.readFile x - - let convertTabs = tabFilter (if (preserveTabs || readerName' == "t2t") then 0 else tabStop) - - let handleIncludes' = if readerName' == "latex" || readerName' == "latex+lhs" - then handleIncludes - else return - - let adjustImagePath :: FilePath -> [FilePath] -> Inline -> Inline - adjustImagePath dir paths (Image lab (src, tit)) - | src `elem` paths = Image lab (dir ++ "/" ++ src, tit) - adjustImagePath _ _ x = x - - (doc, media) <- - case reader of - StringReader r-> (, mempty) <$> - ( readSources >=> - handleIncludes' . convertTabs . intercalate "\n" >=> - r readerOpts ) sources - ByteStringReader r -> readFiles sources >>= r readerOpts - - let writerOptions' = writerOptions{ writerMediaBag = media } - - let extractMedia d = do - case mbExtractMedia of - Just dir -> do - case [fp | (fp, _, _) <- mediaDirectory media] of - [] -> return d - fps -> do - extractMediaBag True dir media - return $ walk (adjustImagePath dir fps) d - _ -> return d - - let adjustMetadata d = return $ M.foldWithKey setMeta d metadata - - let applyTransforms d = return $ foldr ($) d transforms - - let applyPlugins d = foldrM ($) d $ map ($ [writerName']) plugins - doc' <- (extractMedia >=> - adjustMetadata >=> - applyTransforms >=> - applyPlugins) doc + doc' <- (maybe return (extractMedia media) mbExtractMedia >=> + adjustMetadata metadata >=> + applyTransforms transforms >=> + applyFilters filters' [writerName']) doc let writeBinary :: B.ByteString -> IO () writeBinary = B.writeFile (UTF8.encodePath outputFile) @@ -1252,8 +1261,8 @@ main = do writerFn f = UTF8.writeFile f case writer of - IOStringWriter f -> f writerOptions' doc' >>= writerFn outputFile - IOByteStringWriter f -> f writerOptions' doc' >>= writeBinary + IOStringWriter f -> f writerOptions doc' >>= writerFn outputFile + IOByteStringWriter f -> f writerOptions doc' >>= writeBinary PureStringWriter f | pdfOutput -> do -- make sure writer is latex or beamer @@ -1267,23 +1276,21 @@ main = do err 41 $ latexEngine ++ " not found. " ++ latexEngine ++ " is needed for pdf output." - res <- makePDF latexEngine f writerOptions' doc' + res <- makePDF latexEngine f writerOptions doc' case res of Right pdf -> writeBinary pdf Left err' -> do B.hPutStr stderr $ err' B.hPut stderr $ B.pack [10] err 43 "Error producing PDF from TeX source" - | otherwise -> selfcontain (f writerOptions' doc' ++ + | otherwise -> selfcontain (f writerOptions doc' ++ ['\n' | not standalone']) >>= writerFn outputFile . handleEntities where htmlFormat = writerName' `elem` ["html","html+lhs","html5","html5+lhs", "s5","slidy","slideous","dzslides","revealjs"] selfcontain = if selfContained && htmlFormat - then makeSelfContained - (writerMediaBag writerOptions') - (writerUserDataDir writerOptions') + then makeSelfContained writerOptions else return handleEntities = if htmlFormat && ascii then toEntities diff --git a/src/Text/Pandoc/Readers/Docx.hs b/src/Text/Pandoc/Readers/Docx.hs index 7a89c0b04..86ce62ced 100644 --- a/src/Text/Pandoc/Readers/Docx.hs +++ b/src/Text/Pandoc/Readers/Docx.hs @@ -219,7 +219,8 @@ runStyleToContainers rPr = , if isStrike rPr then (Just Strikeout) else Nothing , if isSuperScript rPr then (Just Superscript) else Nothing , if isSubScript rPr then (Just Subscript) else Nothing - , rUnderline rPr >>= (\f -> Just $ Span ("", [], [("underline", f)])) + , rUnderline rPr >>= + (\f -> if f == "single" then (Just Emph) else Nothing) ] in classContainers ++ formatters diff --git a/src/Text/Pandoc/SelfContained.hs b/src/Text/Pandoc/SelfContained.hs index adb2c0014..1a4e037cf 100644 --- a/src/Text/Pandoc/SelfContained.hs +++ b/src/Text/Pandoc/SelfContained.hs @@ -32,56 +32,54 @@ the HTML using data URIs. -} module Text.Pandoc.SelfContained ( makeSelfContained ) where import Text.HTML.TagSoup -import Network.URI (isURI, escapeURIString) +import Network.URI (isURI, escapeURIString, URI(..), parseURI) import Data.ByteString.Base64 import qualified Data.ByteString.Char8 as B -import qualified Data.ByteString as BS import Data.ByteString (ByteString) -import System.FilePath (takeExtension, dropExtension, takeDirectory, (</>)) +import System.FilePath (takeExtension, takeDirectory, (</>)) import Data.Char (toLower, isAscii, isAlphaNum) import Codec.Compression.GZip as Gzip import qualified Data.ByteString.Lazy as L -import Text.Pandoc.Shared (renderTags', openURL, readDataFile, err) -import Text.Pandoc.MediaBag (MediaBag, lookupMedia) +import Text.Pandoc.Shared (renderTags', err, fetchItem') +import Text.Pandoc.MediaBag (MediaBag) import Text.Pandoc.UTF8 (toString, fromString) -import Text.Pandoc.MIME (getMimeType) -import System.Directory (doesFileExist) +import Text.Pandoc.Options (WriterOptions(..)) isOk :: Char -> Bool isOk c = isAscii c && isAlphaNum c -convertTag :: MediaBag -> Maybe FilePath -> Tag String -> IO (Tag String) -convertTag media userdata t@(TagOpen tagname as) +convertTag :: MediaBag -> Maybe String -> Tag String -> IO (Tag String) +convertTag media sourceURL t@(TagOpen tagname as) | tagname `elem` ["img", "embed", "video", "input", "audio", "source"] = do as' <- mapM processAttribute as return $ TagOpen tagname as' where processAttribute (x,y) = if x == "src" || x == "href" || x == "poster" then do - (raw, mime) <- getRaw media userdata (fromAttrib "type" t) y + (raw, mime) <- getRaw media sourceURL (fromAttrib "type" t) y let enc = "data:" ++ mime ++ ";base64," ++ toString (encode raw) return (x, enc) else return (x,y) -convertTag media userdata t@(TagOpen "script" as) = +convertTag media sourceURL t@(TagOpen "script" as) = case fromAttrib "src" t of [] -> return t src -> do - (raw, mime) <- getRaw media userdata (fromAttrib "type" t) src + (raw, mime) <- getRaw media sourceURL (fromAttrib "type" t) src let enc = "data:" ++ mime ++ "," ++ escapeURIString isOk (toString raw) return $ TagOpen "script" (("src",enc) : [(x,y) | (x,y) <- as, x /= "src"]) -convertTag media userdata t@(TagOpen "link" as) = +convertTag media sourceURL t@(TagOpen "link" as) = case fromAttrib "href" t of [] -> return t src -> do - (raw, mime) <- getRaw media userdata (fromAttrib "type" t) src + (raw, mime) <- getRaw media sourceURL (fromAttrib "type" t) src let enc = "data:" ++ mime ++ "," ++ escapeURIString isOk (toString raw) return $ TagOpen "link" (("href",enc) : [(x,y) | (x,y) <- as, x /= "href"]) convertTag _ _ t = return t -- NOTE: This is really crude, it doesn't respect CSS comments. -cssURLs :: MediaBag -> Maybe FilePath -> FilePath -> ByteString +cssURLs :: MediaBag -> Maybe String -> FilePath -> ByteString -> IO ByteString -cssURLs media userdata d orig = +cssURLs media sourceURL d orig = case B.breakSubstring "url(" orig of (x,y) | B.null y -> return orig | otherwise -> do @@ -94,43 +92,21 @@ cssURLs media userdata d orig = let url' = if isURI url then url else d </> url - (raw, mime) <- getRaw media userdata "" url' - rest <- cssURLs media userdata d v + (raw, mime) <- getRaw media sourceURL "" url' + rest <- cssURLs media sourceURL d v let enc = "data:" `B.append` fromString mime `B.append` ";base64," `B.append` (encode raw) return $ x `B.append` "url(" `B.append` enc `B.append` rest -getItem :: MediaBag -> Maybe FilePath -> String - -> IO (ByteString, Maybe String) -getItem media userdata f = - if isURI f - then openURL f >>= either handleErr return - else do - -- strip off trailing query or fragment part, if relative URL. - -- this is needed for things like cmunrm.eot?#iefix, - -- which is used to get old versions of IE to work with web fonts. - let f' = takeWhile (\c -> c /= '?' && c /= '#') f - let mbMime = case takeExtension f' of - ".gz" -> getMimeType $ dropExtension f' - x -> getMimeType x - exists <- doesFileExist f' - if exists - then do - cont <- B.readFile f' - return (cont, mbMime) - else case lookupMedia f media of - Just (mime,bs) -> return (BS.concat $ L.toChunks bs, - Just mime) - Nothing -> do - cont <- readDataFile userdata f' - return (cont, mbMime) - where handleErr e = err 61 $ "Failed to retrieve " ++ f ++ "\n" ++ show e - -getRaw :: MediaBag -> Maybe FilePath -> String -> String +getRaw :: MediaBag -> Maybe String -> String -> String -> IO (ByteString, String) -getRaw media userdata mimetype src = do +getRaw media sourceURL mimetype src = do let ext = map toLower $ takeExtension src - (raw, respMime) <- getItem media userdata src + fetchResult <- fetchItem' media sourceURL src + (raw, respMime) <- case fetchResult of + Left msg -> err 67 $ "Could not fetch " ++ src ++ + "\n" ++ show msg + Right x -> return x let raw' = if ext == ".gz" then B.concat $ L.toChunks $ Gzip.decompress $ L.fromChunks $ [raw] @@ -140,21 +116,22 @@ getRaw media userdata mimetype src = do $ "Could not determine mime type for `" ++ src ++ "'" (x, Nothing) -> x (_, Just x ) -> x + let cssSourceURL = case parseURI src of + Just u + | uriScheme u `elem` ["http:","https:"] -> + Just $ show u{ uriPath = "", + uriQuery = "", + uriFragment = "" } + _ -> Nothing result <- if mime == "text/css" - then cssURLs media userdata (takeDirectory src) raw' + then cssURLs media cssSourceURL (takeDirectory src) raw' else return raw' return (result, mime) -- | Convert HTML into self-contained HTML, incorporating images, --- scripts, and CSS using data: URIs. Items specified using absolute --- URLs will be downloaded; those specified using relative URLs will --- be sought first relative to the working directory, then in the --- media bag, then relative --- to the user data directory (if the first parameter is 'Just' --- a directory), and finally relative to pandoc's default data --- directory. -makeSelfContained :: MediaBag -> Maybe FilePath -> String -> IO String -makeSelfContained media userdata inp = do +-- scripts, and CSS using data: URIs. +makeSelfContained :: WriterOptions -> String -> IO String +makeSelfContained opts inp = do let tags = parseTags inp - out' <- mapM (convertTag media userdata) tags + out' <- mapM (convertTag (writerMediaBag opts) (writerSourceURL opts)) tags return $ renderTags' out' diff --git a/src/Text/Pandoc/Shared.hs b/src/Text/Pandoc/Shared.hs index d5769c1ab..f0e5bbe5d 100644 --- a/src/Text/Pandoc/Shared.hs +++ b/src/Text/Pandoc/Shared.hs @@ -59,6 +59,7 @@ module Text.Pandoc.Shared ( normalizeBlocks, removeFormatting, stringify, + capitalize, compactify, compactify', compactify'DL, @@ -101,7 +102,7 @@ import Data.Char ( toLower, isLower, isUpper, isAlpha, import Data.List ( find, isPrefixOf, intercalate ) import qualified Data.Map as M import Network.URI ( escapeURIString, isURI, nonStrictRelativeTo, - unEscapeString, parseURIReference ) + unEscapeString, parseURIReference, isAllowedInURI ) import qualified Data.Set as Set import System.Directory import Text.Pandoc.MIME (getMimeType) @@ -122,6 +123,7 @@ import qualified Data.ByteString.Char8 as B8 import Text.Pandoc.Compat.Monoid import Data.ByteString.Base64 (decodeLenient) import Data.Sequence (ViewR(..), ViewL(..), viewl, viewr) +import qualified Data.Text as T (toUpper, pack, unpack) #ifdef EMBED_DATA_FILES import Text.Pandoc.Data (dataFiles) @@ -527,6 +529,17 @@ stringify = query go . walk deNote deNote (Note _) = Str "" deNote x = x +-- | Bring all regular text in a pandoc structure to uppercase. +-- +-- This function correctly handles cases where a lowercase character doesn't +-- match to a single uppercase character – e.g. “Straße” would be converted +-- to “STRASSE”, not “STRAßE”. +capitalize :: Walkable Inline a => a -> a +capitalize = walk go + where go :: Inline -> Inline + go (Str s) = Str (T.unpack $ T.toUpper $ T.pack s) + go x = x + -- | Change final list item from @Para@ to @Plain@ if the list contains -- no other @Para@ blocks. compactify :: [[Block]] -- ^ List of list items (each a list of blocks) @@ -766,21 +779,23 @@ readDataFileUTF8 userDir fname = -- Returns raw content and maybe mime type. fetchItem :: Maybe String -> String -> IO (Either E.SomeException (BS.ByteString, Maybe String)) -fetchItem sourceURL s - | isURI s = openURL s - | otherwise = - case sourceURL >>= parseURIReference of - Just u -> case parseURIReference s of - Just s' -> openURL $ show $ - s' `nonStrictRelativeTo` u - Nothing -> openURL $ show u ++ "/" ++ s - Nothing -> E.try readLocalFile +fetchItem sourceURL s = + case (sourceURL >>= parseURIReference . ensureEscaped, ensureEscaped s) of + (_, s') | isURI s' -> openURL s' + (Just u, s') -> -- try fetching from relative path at source + case parseURIReference s' of + Just u' -> openURL $ show $ u' `nonStrictRelativeTo` u + Nothing -> openURL s' -- will throw error + (Nothing, _) -> E.try readLocalFile -- get from local file system where readLocalFile = do - let mime = case takeExtension s of - ".gz" -> getMimeType $ dropExtension s - x -> getMimeType x - cont <- BS.readFile $ unEscapeString s + cont <- BS.readFile fp return (cont, mime) + dropFragmentAndQuery = takeWhile (\c -> c /= '?' && c /= '#') + fp = unEscapeString $ dropFragmentAndQuery s + mime = case takeExtension fp of + ".gz" -> getMimeType $ dropExtension fp + x -> getMimeType x + ensureEscaped = escapeURIString isAllowedInURI -- | Like 'fetchItem', but also looks for items in a 'MediaBag'. fetchItem' :: MediaBag -> Maybe String -> String diff --git a/src/Text/Pandoc/Writers/EPUB.hs b/src/Text/Pandoc/Writers/EPUB.hs index 770b6f244..34a6dcb2f 100644 --- a/src/Text/Pandoc/Writers/EPUB.hs +++ b/src/Text/Pandoc/Writers/EPUB.hs @@ -61,7 +61,6 @@ import Text.Pandoc.MIME (getMimeType) import qualified Control.Exception as E import Text.Blaze.Html.Renderer.Utf8 (renderHtml) import Text.HTML.TagSoup -import Data.Monoid -- A Chapter includes a list of blocks and maybe a section -- number offset. Note, some chapters are unnumbered. The section @@ -794,7 +793,7 @@ transformInline opts mediaRef (Image lab (src,tit)) = do return $ Image lab (newsrc, tit) transformInline opts _ (x@(Math _ _)) | WebTeX _ <- writerHTMLMathMethod opts = do - raw <- makeSelfContained mempty Nothing $ writeHtmlInline opts x + raw <- makeSelfContained opts $ writeHtmlInline opts x return $ RawInline (Format "html") raw transformInline opts mediaRef (RawInline fmt raw) | fmt == Format "html" = do diff --git a/src/Text/Pandoc/Writers/FB2.hs b/src/Text/Pandoc/Writers/FB2.hs index 803617f95..7a9bff4fe 100644 --- a/src/Text/Pandoc/Writers/FB2.hs +++ b/src/Text/Pandoc/Writers/FB2.hs @@ -28,7 +28,7 @@ module Text.Pandoc.Writers.FB2 (writeFB2) where import Control.Monad.State (StateT, evalStateT, get, modify) import Control.Monad.State (liftM, liftM2, liftIO) import Data.ByteString.Base64 (encode) -import Data.Char (toUpper, toLower, isSpace, isAscii, isControl) +import Data.Char (toLower, isSpace, isAscii, isControl) import Data.List (intersperse, intercalate, isPrefixOf) import Data.Either (lefts, rights) import Network.Browser (browse, request, setAllowRedirects, setOutHandler) @@ -44,8 +44,7 @@ import qualified Text.XML.Light.Cursor as XC import Text.Pandoc.Definition import Text.Pandoc.Options (WriterOptions(..), HTMLMathMethod(..), def) -import Text.Pandoc.Shared (orderedListMarkers, isHeaderBlock) -import Text.Pandoc.Walk +import Text.Pandoc.Shared (orderedListMarkers, isHeaderBlock, capitalize) -- | Data to be written at the end of the document: -- (foot)notes, URLs, references, images. @@ -421,10 +420,6 @@ indent = indentBlock indentLines ins = let lns = split isLineBreak ins :: [[Inline]] in intercalate [LineBreak] $ map ((Str spacer):) lns -capitalize :: Inline -> Inline -capitalize (Str xs) = Str $ map toUpper xs -capitalize x = x - -- | Convert a Pandoc's Inline element to FictionBook XML representation. toXml :: Inline -> FBM [Content] toXml (Str s) = return [txt s] @@ -434,7 +429,7 @@ toXml (Strong ss) = list `liftM` wrap "strong" ss toXml (Strikeout ss) = list `liftM` wrap "strikethrough" ss toXml (Superscript ss) = list `liftM` wrap "sup" ss toXml (Subscript ss) = list `liftM` wrap "sub" ss -toXml (SmallCaps ss) = cMapM toXml $ walk capitalize ss +toXml (SmallCaps ss) = cMapM toXml $ capitalize ss toXml (Quoted SingleQuote ss) = do -- FIXME: should be language-specific inner <- cMapM toXml ss return $ [txt "‘"] ++ inner ++ [txt "’"] diff --git a/src/Text/Pandoc/Writers/Markdown.hs b/src/Text/Pandoc/Writers/Markdown.hs index 897e425c6..a859267cc 100644 --- a/src/Text/Pandoc/Writers/Markdown.hs +++ b/src/Text/Pandoc/Writers/Markdown.hs @@ -39,7 +39,7 @@ import Text.Pandoc.Writers.Shared import Text.Pandoc.Options import Text.Pandoc.Parsing hiding (blankline, blanklines, char, space) import Data.List ( group, isPrefixOf, find, intersperse, transpose, sortBy ) -import Data.Char ( isSpace, isPunctuation, toUpper ) +import Data.Char ( isSpace, isPunctuation ) import Data.Ord ( comparing ) import Text.Pandoc.Pretty import Control.Monad.State @@ -672,10 +672,6 @@ escapeSpaces (Str s) = Str $ substitute " " "\\ " s escapeSpaces Space = Str "\\ " escapeSpaces x = x -toCaps :: Inline -> Inline -toCaps (Str s) = Str (map toUpper s) -toCaps x = x - -- | Convert Pandoc inline element to markdown. inlineToMarkdown :: WriterOptions -> Inline -> State WriterState Doc inlineToMarkdown opts (Span attrs ils) = do @@ -693,7 +689,7 @@ inlineToMarkdown opts (Emph lst) = do inlineToMarkdown opts (Strong lst) = do plain <- gets stPlain if plain - then inlineListToMarkdown opts $ walk toCaps lst + then inlineListToMarkdown opts $ capitalize lst else do contents <- inlineListToMarkdown opts lst return $ "**" <> contents <> "**" @@ -716,7 +712,7 @@ inlineToMarkdown opts (Subscript lst) = do inlineToMarkdown opts (SmallCaps lst) = do plain <- gets stPlain if plain - then inlineListToMarkdown opts $ walk toCaps lst + then inlineListToMarkdown opts $ capitalize lst else do contents <- inlineListToMarkdown opts lst return $ tagWithAttrs "span" diff --git a/tests/Tests/Readers/Docx.hs b/tests/Tests/Readers/Docx.hs index 85a02debd..efc520dba 100644 --- a/tests/Tests/Readers/Docx.hs +++ b/tests/Tests/Readers/Docx.hs @@ -5,14 +5,15 @@ import Text.Pandoc.Readers.Native import Text.Pandoc.Definition import Tests.Helpers import Test.Framework -import qualified Data.ByteString as BS +import Test.HUnit (assertBool) +import Test.Framework.Providers.HUnit import qualified Data.ByteString.Lazy as B -import qualified Data.ByteString.Char8 as B8 -import qualified Data.ByteString.Base64 as B64 import Text.Pandoc.Readers.Docx import Text.Pandoc.Writers.Native (writeNative) import qualified Data.Map as M -import Text.Pandoc.MediaBag (lookupMedia) +import Text.Pandoc.MediaBag (MediaBag, lookupMedia, mediaDirectory) +import Codec.Archive.Zip +import System.FilePath (combine) -- We define a wrapper around pandoc that doesn't normalize in the -- tests. Since we do our own normalization, we want to make sure @@ -56,22 +57,44 @@ testCompareWithOpts opts name docxFile nativeFile = testCompare :: String -> FilePath -> FilePath -> Test testCompare = testCompareWithOpts def -testCompareMediaIO :: String -> FilePath -> FilePath -> FilePath -> IO Test -testCompareMediaIO name docxFile mediaPath mediaFile = do +getMedia :: FilePath -> FilePath -> IO (Maybe B.ByteString) +getMedia archivePath mediaPath = do + zf <- B.readFile archivePath >>= return . toArchive + return $ findEntryByPath (combine "word" mediaPath) zf >>= (Just . fromEntry) + +compareMediaPathIO :: FilePath -> MediaBag -> FilePath -> IO Bool +compareMediaPathIO mediaPath mediaBag docxPath = do + docxMedia <- getMedia docxPath mediaPath + let mbBS = case lookupMedia mediaPath mediaBag of + Just (_, bs) -> bs + Nothing -> error ("couldn't find " ++ + mediaPath ++ + " in media bag") + docxBS = case docxMedia of + Just bs -> bs + Nothing -> error ("couldn't find " ++ + mediaPath ++ + " in media bag") + return $ mbBS == docxBS + +compareMediaBagIO :: FilePath -> IO Bool +compareMediaBagIO docxFile = do df <- B.readFile docxFile - mf <- B.readFile mediaFile let (_, mb) = readDocx def df - dBytes = case lookupMedia mediaPath mb of - Just (_,bs) -> bs - Nothing -> error "Media file not found" - d64 = B8.unpack $ B64.encode $ BS.concat $ B.toChunks dBytes - m64 = B8.unpack $ B64.encode $ BS.concat $ B.toChunks mf - return $ test id name (d64, m64) + bools <- mapM + (\(fp, _, _) -> compareMediaPathIO fp mb docxFile) + (mediaDirectory mb) + return $ and bools -testCompareMedia :: String -> FilePath -> FilePath -> FilePath -> Test -testCompareMedia name docxFile mediaPath mediaFile = - buildTest $ testCompareMediaIO name docxFile mediaPath mediaFile +testMediaBagIO :: String -> FilePath -> IO Test +testMediaBagIO name docxFile = do + outcome <- compareMediaBagIO docxFile + return $ testCase name (assertBool + ("Media didn't match media bag in file " ++ docxFile) + outcome) +testMediaBag :: String -> FilePath -> Test +testMediaBag name docxFile = buildTest $ testMediaBagIO name docxFile tests :: [Test] tests = [ testGroup "inlines" @@ -186,11 +209,9 @@ tests = [ testGroup "inlines" "docx.track_changes_deletion_all.native" ] , testGroup "media" - [ testCompareMedia + [ testMediaBag "image extraction" "docx.image.docx" - "media/image1.jpeg" - "docx.image1.jpeg" ] , testGroup "metadata" [ testCompareWithOpts def{readerStandalone=True} diff --git a/tests/Tests/Writers/Plain.hs b/tests/Tests/Writers/Plain.hs new file mode 100644 index 000000000..f8f1d3d90 --- /dev/null +++ b/tests/Tests/Writers/Plain.hs @@ -0,0 +1,21 @@ +{-# LANGUAGE OverloadedStrings #-} +module Tests.Writers.Plain (tests) where + +import Test.Framework +import Text.Pandoc.Builder +import Text.Pandoc +import Tests.Helpers +import Tests.Arbitrary() + + +infix 4 =: +(=:) :: (ToString a, ToPandoc a) + => String -> (a, String) -> Test +(=:) = test (writePlain def . toPandoc) + + +tests :: [Test] +tests = [ "strongly emphasized text to uppercase" + =: strong "Straße" + =?> "STRASSE" + ] diff --git a/tests/docx.image.docx b/tests/docx.image.docx Binary files differindex 060f2b204..06e4efd1a 100644 --- a/tests/docx.image.docx +++ b/tests/docx.image.docx diff --git a/tests/docx.image1.jpeg b/tests/docx.image1.jpeg Binary files differdeleted file mode 100644 index 423dff48b..000000000 --- a/tests/docx.image1.jpeg +++ /dev/null diff --git a/tests/docx.image_no_embed.native b/tests/docx.image_no_embed.native index aa0f65d27..95c73610e 100644 --- a/tests/docx.image_no_embed.native +++ b/tests/docx.image_no_embed.native @@ -1,2 +1,2 @@ -[Header 2 ("an-image",[],[]) [Str "An",Space,Str "image"] -,Para [Image [] ("media/image1.jpeg","")]] +[Para [Str "An",Space,Str "image:"] +,Para [Image [] ("media/image1.jpg","")]] diff --git a/tests/docx.inline_formatting.native b/tests/docx.inline_formatting.native index dc8a3d19a..22d8f79e8 100644 --- a/tests/docx.inline_formatting.native +++ b/tests/docx.inline_formatting.native @@ -1,5 +1,5 @@ [Para [Str "Regular",Space,Str "text",Space,Emph [Str "italics"],Space,Strong [Str "bold",Space,Emph [Str "bold",Space,Str "italics"]],Str "."] ,Para [Str "This",Space,Str "is",Space,SmallCaps [Str "Small",Space,Str "Caps"],Str ",",Space,Str "and",Space,Str "this",Space,Str "is",Space,Strikeout [Str "strikethrough"],Str "."] -,Para [Str "Some",Space,Str "people",Space,Str "use",Space,Span ("",[],[("underline","single")]) [Str "single",Space,Str "underlines",Space,Str "for",Space,Emph [Str "emphasis"]],Str "."] +,Para [Str "Some",Space,Str "people",Space,Str "use",Space,Emph [Str "single",Space,Str "underlines",Space,Str "for",Space,Emph [Str "emphasis"]],Str "."] ,Para [Str "Above",Space,Str "the",Space,Str "line",Space,Str "is",Space,Superscript [Str "superscript"],Space,Str "and",Space,Str "below",Space,Str "the",Space,Str "line",Space,Str "is",Space,Subscript [Str "subscript"],Str "."] ,Para [Str "A",Space,Str "line",LineBreak,Str "break."]] diff --git a/tests/test-pandoc.hs b/tests/test-pandoc.hs index 1dab8e6f1..e6924f6b2 100644 --- a/tests/test-pandoc.hs +++ b/tests/test-pandoc.hs @@ -17,6 +17,7 @@ import qualified Tests.Writers.HTML import qualified Tests.Writers.Docbook import qualified Tests.Writers.Native import qualified Tests.Writers.Markdown +import qualified Tests.Writers.Plain import qualified Tests.Writers.AsciiDoc import qualified Tests.Shared import qualified Tests.Walk @@ -33,6 +34,7 @@ tests = [ testGroup "Old" Tests.Old.tests , testGroup "HTML" Tests.Writers.HTML.tests , testGroup "Docbook" Tests.Writers.Docbook.tests , testGroup "Markdown" Tests.Writers.Markdown.tests + , testGroup "Plain" Tests.Writers.Plain.tests , testGroup "AsciiDoc" Tests.Writers.AsciiDoc.tests ] , testGroup "Readers" |