aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Makefile24
-rw-r--r--README251
-rw-r--r--debian/changelog2
-rw-r--r--man/man1/html2markdown.161
-rw-r--r--man/man1/latex2markdown.134
-rw-r--r--man/man1/markdown2html.135
-rw-r--r--man/man1/markdown2latex.135
-rw-r--r--man/man1/markdown2pdf.186
-rw-r--r--man/man1/markdown2rst.11
-rw-r--r--man/man1/markdown2rtf.11
-rw-r--r--man/man1/markdown2s5.11
-rw-r--r--man/man1/pandoc.1126
-rw-r--r--man/man1/rst2markdown.11
-rw-r--r--man/man1/web2markdown.182
-rw-r--r--src/Main.hs142
-rw-r--r--src/wrappers/checkin.sh7
-rw-r--r--src/wrappers/common.sh27
-rw-r--r--src/wrappers/getopts.sh12
-rw-r--r--src/wrappers/html2markdown.in134
-rw-r--r--src/wrappers/latex2markdown.in14
-rw-r--r--src/wrappers/markdown2html.in12
-rw-r--r--src/wrappers/markdown2latex.in12
-rw-r--r--src/wrappers/markdown2pdf.in68
-rw-r--r--src/wrappers/postopts.sh17
-rw-r--r--src/wrappers/singlearg.sh7
-rw-r--r--src/wrappers/testwrapper.in141
-rw-r--r--src/wrappers/web2markdown.in173
27 files changed, 710 insertions, 796 deletions
diff --git a/Makefile b/Makefile
index aa3f63583..315bc9e45 100644
--- a/Makefile
+++ b/Makefile
@@ -23,15 +23,16 @@ EXECSBASE := $(shell sed -ne 's/^[Ee]xecutable:[[:space:]]*//p' $(CABAL).in)
#-------------------------------------------------------------------------------
# Install targets
#-------------------------------------------------------------------------------
+WRAPPERS := web2markdown markdown2pdf
+SYMLINKS := markdown2html markdown2latex markdown2s5 markdown2rst \
+ markdown2rtf html2markdown latex2markdown rst2markdown
+PROGS := $(EXECS) $(WRAPPERS)
# Add .exe extensions if we're running Windows/Cygwin.
EXTENSION := $(shell uname | tr '[:upper:]' '[:lower:]' | \
sed -ne 's/^cygwin.*$$/\.exe/p')
EXECS := $(addsuffix $(EXTENSION),$(EXECSBASE))
# First entry in Cabal's executable stanza is the main executable.
MAIN := $(firstword $(EXECS))
-WRAPPERS := html2markdown latex2markdown markdown2html \
- markdown2latex markdown2pdf
-PROGS := $(EXECS) $(WRAPPERS)
DOCS := README.html README BUGS
#-------------------------------------------------------------------------------
@@ -92,6 +93,12 @@ all: build-program
templates: $(SRCDIR)/templates
$(MAKE) -C $(SRCDIR)/templates
+.PHONY: symlinks
+cleanup_files+=$(SYMLINKS)
+symlinks: $(SYMLINKS)
+$(SYMLINKS): $(MAIN)
+ ln -sf ./$(MAIN) $@
+
define generate-shell-script
echo "Generating $@..."; \
awk ' \
@@ -131,7 +138,7 @@ build: configure
$(BUILDCMD) build
.PHONY: build-exec
-build-exec: $(PROGS)
+build-exec: $(PROGS) $(SYMLINKS)
cleanup_files+=$(EXECS)
$(EXECS): build
for f in $@; do \
@@ -191,8 +198,9 @@ install-exec: build-exec
fi; \
$(INSTALL_PROGRAM) $$f $(BINPATH)/; \
done
+ cd $(BINPATH); for f in $(SYMLINKS); do ln -sf $(MAIN) $$f; done
uninstall-exec:
- -for f in $(notdir $(PROGS)); do rm -f $(BINPATH)/$$f; done
+ -for f in $(notdir $(PROGS) $(SYMLINKS)); do rm -f $(BINPATH)/$$f; done ;
# Program + user documents installation.
.PHONY: install-program uninstall-program
@@ -277,15 +285,11 @@ osx-dmg: ../$(osx_dmg_name)
-rm -f $(osx_dmg_name)
mv $(osx_udzo_name) ../$(osx_dmg_name)
-.PHONY: test test-markdown test-wrapper
+.PHONY: test test-markdown
test: $(MAIN)
@cd $(TESTDIR) && perl runtests.pl -s $(PWD)/$(MAIN)
test-markdown: $(MAIN)
@cd $(TESTDIR)/MarkdownTest_1.0.3 && perl MarkdownTest.pl -s $(PWD)/$(MAIN) -tidy
-cleanup_files+=testwrapper
-test-wrappers: testwrapper
- @echo "Running $<..."
- @sh testwrapper
# Stolen and slightly improved from a GPLed Makefile. Credits to John Meacham.
src_all:=$(shell find $(SRCDIR) -type f -name '*hs' | egrep -v '^\./(_darcs|lib|test)/')
diff --git a/README b/README
index 88cc77d8f..82537eb6a 100644
--- a/README
+++ b/README
@@ -20,7 +20,7 @@ or output format requires only adding a reader or writer.
[reStructuredText]: http://docutils.sourceforge.net/docs/ref/rst/introduction.html
[S5]: http://meyerweb.com/eric/tools/s5/
[HTML]: http://www.w3.org/TR/html40/
-[LaTeX]: http://www.latex-project.org/
+[LaTeX]: http://www.latex-project.org/
[RTF]: http://en.wikipedia.org/wiki/Rich_Text_Format
[Haskell]: http://www.haskell.org/
@@ -30,9 +30,53 @@ any kind. (See COPYRIGHT for full copyright and warranty notices.)
Recai Oktaş (roktas at debian dot org) deserves credit for the build
system, the debian package, and the robust wrapper scripts.
-[GPL]: http://www.gnu.org/copyleft/gpl.html
+[GPL]: http://www.gnu.org/copyleft/gpl.html "GNU General Public License"
-# Using Pandoc
+Requirements
+============
+
+The `pandoc` program itself does not depend on any external libraries
+or programs. The convenience programs `markdown2html`, `markdown2latex`,
+`markdown2rst`, `markdown2rtf`, `markdown2s5`, `html2markdown`,
+`latex2markdown`, and `rst2markdown` are implemented as symbolic links to
+`pandoc`.
+
+The wrapper script `web2markdown` requires
+
+ - `html2markdown` (included with Pandoc)
+ - a POSIX-compliant shell (installed by default on all Linux and Unix
+ systems, including Mac OS X, and in [Cygwin] for Windows),
+ - [HTML Tidy]
+ - `iconv` (for character encoding conversion). (If `iconv` is absent,
+ `web2markdown` will still work, but it will treat everything as UTF-8.)
+
+[Cygwin]: http://www.cygwin.com/
+[HTML Tidy]: http://tidy.sourceforge.net/
+[`iconv`]: http://www.gnu.org/software/libiconv/
+
+The wrapper script `markdown2pdf` requires
+
+ - `markdown2latex` (included with Pandoc)
+ - a POSIX-compliant shell
+ - `pdflatex`, which should be part of any [LaTeX] distribution
+ - the [unicode] and [fancyvrb] LaTeX packages, which are included
+ in many LaTeX distributions. The [unicode] package allows LaTeX to
+ process UTF-8 characters. [fancyvrb] allows code blocks and verbatim
+ text to be used within footnotes. If your installation of LaTeX
+ does not include these packages, you will get an error (complaining
+ about missing `ucs.sty` or `fancyvrb.sty`) when you try to compile
+ a LaTeX file produced by Pandoc, or when you use the `markdown2pdf`
+ script (described below). If this happens, install the [unicode] and
+[fancyvrb] packages from [CTAN]. (Get the zip file from CTAN
+ and unpack it into `~/texmf/tex/latex/`. You may also need to run
+ `mktexlsr` or `texhash` before the files can be found by TeX.)
+
+[CTAN]: http://www.ctan.org "Comprehensive TeX Archive Network"
+[unicode]: http://www.ctan.org/tex-archive/macros/latex/contrib/unicode/
+[fancyvrb]: http://www.ctan.org/tex-archive/macros/latex/contrib/fancyvrb/
+
+Using Pandoc
+============
If you run `pandoc` without arguments, it will accept input from
STDIN. If you run it with file names as arguments, it will take input
@@ -66,10 +110,14 @@ a subset of reStructuredText syntax. For example, it doesn't handle
tables, definition lists, option lists, or footnotes. It handles only the
constructs expressible in unextended markdown. But for simple documents
it should be adequate. The `latex` and `html` readers are also limited
-in what they can do.
+in what they can do. Because the `html` reader is picky about the HTML
+it parses, it is recommended that you pipe HTML through [HTML Tidy] before
+sending it to `pandoc`, or use the `web2markdown` script described below.
+
+By default, `pandoc` writes its output to STDOUT. If you want to
+write to a file, use the `-o` option or shell redirection:
-`pandoc` writes its output to STDOUT. If you want to write to a file,
-use redirection:
+ pandoc -o hello.html hello.txt
pandoc hello.txt > hello.html
@@ -77,13 +125,14 @@ Note that you can specify multiple input files on the command line.
`pandoc` will concatenate them all (with blank lines between them)
before parsing:
- pandoc -s chapter1.txt chapter2.txt chapter3.txt references.txt > book.html
+ pandoc -s chapter1.txt chapter2.txt references.txt > book.html
(The `-s` option here tells `pandoc` to produce a standalone HTML file,
with a proper header, rather than a fragment. For more details on this
and many other command-line options, see below.)
-# Character encodings
+Character encodings
+-------------------
Unfortunately, due to limitations in GHC, `pandoc` does not automatically
detect the system's local character encoding. Hence, all input and
@@ -97,92 +146,65 @@ will convert `source.txt` from the local encoding to UTF-8, then
convert it to HTML, then convert back to the local encoding,
putting the output in `output.html`.
-[`iconv`]: http://www.gnu.org/software/libiconv/
-
The shell scripts (described below) automatically convert the input
from the local encoding to UTF-8 before running them through `pandoc`,
then convert the output back to the local encoding.
-## LaTeX and UTF-8
-
-LaTeX sources produced by Pandoc use `ucs.sty`, which is included in many
-LaTeX distributions. This allows LaTeX to process UTF-8 characters.
-If your installation of LaTeX does not include `ucs.sty`, you will get an
-error when you try to compile a LaTeX file produced by Pandoc, or when
-you use the `markdown2pdf` script (described below). If this happens,
-install the [unicode] package from [CTAN]. (Get the `unicode.zip`
-file from CTAN, unpack it, and copy the whole `unicode` directory into
-`~/texmf/tex/latex/`. You may also need to run `mktexlsr` or `texhash`
-before the files can be found by TeX.)
+Convenience programs and wrapper scripts
+========================================
-[CTAN]: http://www.ctan.org
-[unicode]: http://www.ctan.org/tex-archive/macros/latex/contrib/unicode/
+For convenience, eight variant programs are included with Pandoc:
+`markdown2html` (which is equivalent to `pandoc -w html`),
+`markdown2latex` (equivalent to `pandoc -w latex`), `markdown2rst`
+(equivalent to `pandoc -w rst`), `markdown2rtf` (equivalent to
+`pandoc -w rtf`), `markdown2s5` (equivalent to `pandoc -w s5`),
+`html2markdown` (equivalent to `pandoc -r html -w markdown`),
+`latex2markdown` (equivalent to `pandoc -r latex -w markdown`), and
+`rst2markdown` (equivalent to `pandoc -r rst -w markdown`). These
+programs take an appropriately restricted subset of `pandoc`'s
+options. (Run them with the `-h` flag for a full list of allowed
+options.)
-# The shell scripts
+Like `pandoc`, all of these programs produce fragments by default.
+If you want to produce a standalone file, complete with a header
+and footer appropriate to the format, use the `-s` option:
-Five shell scripts have been included that make it easy to run
-`pandoc` without worrying about character encodings, and without
-remembering all the command-line options:
+ markdown2latex -s sample.txt > sample.tex
-- `markdown2html` converts markdown-formatted text to HTML
-- `markdown2latex` converts markdown-formatted text to LaTeX
-- `markdown2pdf` produces a PDF file from markdown-formatted
- text, using `pdflatex`.
-- `html2markdown` converts HTML to markdown-formatted text
-- `latex2markdown` converts LaTeX to markdown-formatted text
+Two shell scripts have also been included:
-All of the scripts use `iconv` (if available) to convert to and from
-the local character encoding. All of the scripts presuppose that
-`pandoc` is in the path, and some have additional requirements. (For
-example, `html2markdown` uses `tidy`, and `markdown2pdf` uses
-`pdflatex`.)
+1. `markdown2pdf` produces a PDF file from markdown-formatted
+ text, using `markdown2latex` and `pdflatex`. The default
+ behavior of `markdown2pdf` is to create a file with the same
+ base name as the first argument and the extension `pdf`; thus,
+ for example,
-When no arguments are specified, text will be read from standard
-input. Arguments specify input files (limited to one in the case of
-`latex2markdown` and `html2markdown`; the other scripts accept any number
-of arguments). `html2markdown` may take a URL as argument instead of
-a filename; in this case, `curl`, `wget`, or an available text-based
-browser will be used to fetch the contents of the URL. (The `-n` option
-inhibits this behavior; the `-g` option allows the user to specify a
-custom command that will be used to fetch from a URL.)
+ markdown2pdf sample.txt endnotes.txt
-With the exception of `markdown2pdf`, the scripts write to standard output.
-Output can be sent to a file using shell output redirection:
+ will produce `sample.pdf`. (If `sample.pdf` exists already,
+ it will be backed up before being overwritten.) An output file
+ name can be specified explicitly using the `-o` option:
- latex2markdown sample.tex > sample.txt
+ markdown2pdf -o "My Book.pdf" chap1.txt chap2.txt chap3.txt
-The default behavior of `markdown2pdf` is to create a file with the same
-base name as the first argument and the extension `pdf`; thus, for example,
+ If no input file is specified, input will be taken from STDIN.
- markdown2pdf sample.txt endnotes.txt
+2. `web2markdown` grabs a web page from a file or URL and converts
+ it to markdown-formatted text, using `tidy` and `html2markdown`.
+ Unless input is from STDIN, an attempt is made to determine the
+ character encoding of the page from the "Content-type" meta tag.
+ If this is not present, UTF-8 is assumed. Alternatively, a character
+ encoding may be specified explicitly using the `-e` option.
-will produce `sample.pdf`. (If `sample.pdf` exists already, it will be
-backed up before being overwritten.) An output file name can be specified
-explicitly using the `-o` option:
+ `web2markdown` searches for an available program (`wget`, `curl`,
+ or a text-mode browser) to fetch the contents of a URL.
+ Optionally, the `-g` option may be used to specify the command
+ to be used:
- markdown2pdf -o "My Book.pdf" chap1.txt chap2.txt chap3.txt
+ web2markdown -g 'wget --user=foo --password=bar' mysite.com
-Options specific to the scripts, like `-o`, `-g`, and `-n`, must
-be specified *before* any command-line arguments (file names or URLs).
-Any options specified *after* the command-line arguments will be
-passed directly to `pandoc`. For example,
-
- markdown2html tusks.txt -S -T Elephants
-
-will convert `tusks.txt` to `tusks.html` using smart quotes, ellipses,
-and dashes, with "Elephants" as the page title prefix. (For a
-complete list of `pandoc` options, see below.) When there are no
-command-line arguments (because input is from STDIN), `pandoc`
-options must be preceded by ` -- `:
-
- cat tusks.txt | markdown2html -- -S -T Elephants
-
-The ` -- ` separator may optionally be used when there are command-line
-arguments:
-
- markdown2html -- tusks.txt -S -T Elephants
-
-# Command-line options
+Command-line options
+====================
Various command-line options can be used to customize the output.
For a complete list, type
@@ -207,9 +229,11 @@ specified.)
complete with appropriate document headers. By default, `pandoc`
produces a fragment.
-`--custom-header` can be used to specify a custom document header. To
-see the headers used by default, use the `-D` option: for example,
-`pandoc -D html` prints the default HTML header.
+`-o` or `--output-file` can be used to specify an output file.
+
+`-C` or `--custom-header` can be used to specify a custom document
+header. To see the headers used by default, use the `-D` option:
+for example, `pandoc -D html` prints the default HTML header.
`-c` or `--css` allows the user to specify a custom stylesheet that
will be linked to in HTML and S5 output.
@@ -253,15 +277,38 @@ is for lists to be displayed all at once.
`-N` or `--number-sections` causes sections to be numbered in LaTeX
output. By default, sections are not numbered.
-# Pandoc's markdown vs. standard markdown
+`-d` or `--debug` causes a debugging message to be written to STDERR.
+The format of the message is as follows:
+
+ OUTPUT=foo
+ INPUT=bar
+ INPUT=Foo Baz
+
+Here `OUTPUT=` is followed by the name of the output file specified
+using `-o`, if any. If no output file was specified, `OUTPUT=`
+will appear with nothing following it. Lines beginning `INPUT=`
+specify input files. If there are no input files, no `INPUT=` lines
+will be printed. The `-d` option forces output to be written to
+STDOUT, even if an output file was specified using the `-o` option.
+(This option is provided to make it easier to write wrappers for
+`pandoc`.)
+
+`-v` or `--version` prints the version number to STDERR.
+
+`-h` or `--help` prints a usage message to STDERR.
+
+Pandoc's markdown vs. standard markdown
+=======================================
In parsing markdown, Pandoc departs from and extends [standard markdown]
in a few respects. (To run Pandoc on the official
markdown test suite, type `make test-markdown`.)
[standard markdown]: http://daringfireball.net/projects/markdown/syntax
+ "Markdown syntax description"
-## Section Headings
+Section Headings
+----------------
Pandoc creates an invisible anchor in front of every HTML section
heading. The ID of this anchor is derived from the section heading
@@ -281,7 +328,8 @@ example, just insert:
[Back to Aristotle](#Aristotle's_De_Anima)
-## Lists
+Lists
+-----
Pandoc behaves differently from standard markdown on some "edge
cases" involving lists. Consider this source:
@@ -332,7 +380,8 @@ the example above:
B) Fie
C) Third
-## Literal quotes in titles
+Literal quotes in titles
+------------------------
Standard markdown allows unescaped literal quotes in titles, as
in
@@ -343,7 +392,8 @@ Pandoc requires all quotes within titles to be escaped:
[foo]: "bar \"embedded\" baz"
-## Reference links
+Reference links
+---------------
Pandoc allows implicit reference links in either of two styles:
@@ -357,7 +407,8 @@ will appear as regular bracketed text. Note: even `[link][]` will
appear as `[link]` if there's no reference for `link`. If you want
`[link][]`, use a backslash escape: `\[link]\[]`.
-## Footnotes
+Footnotes
+---------
Pandoc's markdown allows footnotes, using the following syntax:
@@ -394,7 +445,8 @@ they cannot contain multiple paragraphs). The syntax is as follows:
Inline and regular footnotes may be mixed freely.
-## Embedded HTML
+Embedded HTML
+-------------
Pandoc treats embedded HTML in markdown a bit differently than
Markdown 1.0. While Markdown 1.0 leaves HTML blocks exactly as they
@@ -427,7 +479,8 @@ markdown with HTML block elements. For example, one can surround
a block of markdown text with `<div>` tags without preventing it
from being interpreted as markdown.
-## Title blocks
+Title blocks
+------------
If the file begins with a title block
@@ -460,7 +513,8 @@ If a title prefix is specified with `-T` and no title block appears
in the document, the title prefix will be used by itself as the
HTML title.
-## Box-style blockquotes
+Box-style blockquotes
+---------------------
Pandoc supports emacs-style boxquote block quotes, in addition to
standard markdown (email-style) boxquotes:
@@ -469,7 +523,8 @@ standard markdown (email-style) boxquotes:
| They look like this.
`----
-## Inline LaTeX
+Inline LaTeX
+------------
Anything between two $ characters will be parsed as LaTeX math. The
opening $ must have a character immediately to its right, while the
@@ -501,7 +556,8 @@ You can also use LaTeX environments. For example,
Note, however, that material between the begin and end tags will
be interpreted as raw LaTeX, not as markdown.
-## Custom headers
+Custom headers
+--------------
When run with the "standalone" option (`-s`), `pandoc` creates a
standalone file, complete with an appropriate header. To see the
@@ -516,13 +572,14 @@ it and specify it on the command line as follows:
pandoc --header=MyHeaderFile
-# Producing S5 with Pandoc
+Producing S5 with Pandoc
+========================
-Producing an [S5] slide show with Pandoc is easy. A title page is
-constructed automatically from the document's title block (see above).
-Each section (with a level-one header) produces a single slide. (Note
-that if the section is too big, the slide will not fit on the page; S5
-is not smart enough to produce multiple pages.)
+Producing an [S5] web-based slide show with Pandoc is easy. A title
+page is constructed automatically from the document's title block (see
+above). Each section (with a level-one header) produces a single slide.
+(Note that if the section is too big, the slide will not fit on the page;
+S5 is not smart enough to produce multiple pages.)
Here's the markdown source for a simple slide show, `eating.txt`:
diff --git a/debian/changelog b/debian/changelog
index caf376ce4..a3c8b8b99 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -14,6 +14,8 @@ pandoc (0.22) unstable; urgency=low
* Refactored template processing (fillTemplates.pl).
+ * Modified wrapper scripts to make them more robust.
+
* Modified wrapper scripts to make them more robust and portable.
To avoid code duplication and ensure consistency, wrappers are
generated via a templating system from templates in src/wrappers.
diff --git a/man/man1/html2markdown.1 b/man/man1/html2markdown.1
index 6cdba595c..7b82576d6 100644
--- a/man/man1/html2markdown.1
+++ b/man/man1/html2markdown.1
@@ -1,60 +1 @@
-.TH HTML2MARKDOWN 1 "November 21, 2006" Pandoc "User Manuals"
-.SH NAME
-html2markdown \- converts HTML to markdown-formatted text
-.SH SYNOPSIS
-\fBhtml2markdown\fR [\fIoptions\fR] [\fIinput\-file\fR or \fIURL\fR]
-[\fB\-\-\fR] [\fIpandoc\-opts\fR]
-.SH DESCRIPTION
-\fBhtml2markdown\fR converts \fIinput\-file\fR or \fIURL\fR (or text
-from STDIN) from HTML to markdown\-formatted plain text.
-If a URL is specified, \fBhtml2markdown\fR uses an available program
-(e.g. wget, w3m, lynx or curl) to fetch its contents. Output is sent
-to STDOUT.
-.PP
-\fBhtml2markdown\fR is a wrapper for \fBpandoc\fR.
-.SH OPTIONS
-.TP
-.B \-h
-Show usage message.
-.TP
-.B \-e \fIencoding\fR
-Assume the character encoding \fIencoding\fR in reading the HTML.
-(Note: \fIencoding\fR will be passed to \fBiconv\fR; a list of
-available encodings may be obtained using `\fBiconv \-l\fR'.)
-If the \fB\-e\fR option is not specified, the encoding will be
-determined as follows: If input is from STDIN, the local encoding
-will be assumed. Otherwise, \fBhtml2markdown\fR will try to
-extract the character encoding from the "Content-type" meta tag.
-If no character encoding is specified in this way, UTF-8 will be
-assumed for a URL argument, and the local encoding will be assumed
-for a file argument.
-.TP
-.B \-g \fIcommand\fR
-Use \fIcommand\fR to fetch the contents of a URL. (By default,
-\fBhtml2markdown\fR searches for an available program or text-based
-browser to fetch the contents of a URL.) For example:
-.IP
-html2markdown \-g 'wget \-\-user=foo \-\-password=bar' mysite.com
-.TP
-.B \-n
-Disable automatic fetching of contents when URLs are specified as
-arguments.
-.TP
-.I pandoc\-opts
-Any options appearing after \fIinput\-file\fR or \fIURL\fR on the
-command line will be passed directly to \fBpandoc\fR. If no
-\fIinput-file\fR or \fIURL\fR is specified, these options must
-be preceded by ` \fB\-\-\fR '. (In other cases, ` \fB\-\-\fR ' is
-optional.) See \fBpandoc\fR(1) for a list of options that may be used.
-Example:
-.IP
-html2markdown input.txt \-\- \-R
-.SH "SEE ALSO"
-\fBpandoc\fR(1),
-\fBmarkdown2html\fR(1),
-\fBmarkdown2latex\fR(1),
-\fBlatex2markdown\fR(1),
-\fBmarkdown2pdf\fR(1),
-\fBiconv\fR(1)
-.SH AUTHOR
-John MacFarlane and Recai Oktas
+.so man1/pandoc.1
diff --git a/man/man1/latex2markdown.1 b/man/man1/latex2markdown.1
index 6e7e9e033..7b82576d6 100644
--- a/man/man1/latex2markdown.1
+++ b/man/man1/latex2markdown.1
@@ -1,33 +1 @@
-.TH LATEX2MARKDOWN 1 "November 21, 2006" Pandoc "User Manuals"
-.SH NAME
-latex2markdown \- converts LaTeX to markdown\-formatted text
-.SH SYNOPSIS
-\fBlatex2markdown\fR [\fIoptions\fR] [\fIinput\-file\fR]
-[\fB\-\-\fR] [\fIpandoc\-opts\fR]
-.SH DESCRIPTION
-\fBlatex2markdown\fR converts \fIinput\-file\fR
-(or text from STDIN) from LaTeX to markdown\-formatted plain text.
-Output is sent to STDOUT.
-.PP
-\fBlatex2markdown\fR is a wrapper for \fBpandoc\fR.
-.SH OPTIONS
-.TP
-.B \-h
-Show usage message.
-.TP
-.I pandoc\-opts
-Any options appearing after \fIinput\-file\fR on the command line
-will be passed directly to \fBpandoc\fR. If no \fIinput-file\fR
-is specified, these options must be preceded by ` \fB\-\-\fR '.
-(In other cases, ` \fB\-\-\fR ' is optional.) See \fBpandoc\fR(1)
-for a list of options that may be used. Example:
-.IP
-latex2markdown input.txt \-\- \-R
-.SH "SEE ALSO"
-\fBpandoc\fR(1),
-\fBmarkdown2html\fR(1),
-\fBhtml2markdown\fR(1),
-\fBmarkdown2latex\fR(1),
-\fBmarkdown2pdf\fR(1)
-.SH AUTHOR
-John MacFarlane and Recai Oktas
+.so man1/pandoc.1
diff --git a/man/man1/markdown2html.1 b/man/man1/markdown2html.1
index 33d063321..7b82576d6 100644
--- a/man/man1/markdown2html.1
+++ b/man/man1/markdown2html.1
@@ -1,34 +1 @@
-.TH MARKDOWN2HTML 1 "November 21, 2006" Pandoc "User Manuals"
-.SH NAME
-markdown2html \- converts markdown\-formatted text to HTML
-.SH SYNOPSIS
-\fBmarkdown2html\fR [\fIoptions\fR] [\fIinput\-file\fR]...
-[\fB\-\-\fR] [\fIpandoc\-opts\fR]
-.SH DESCRIPTION
-\fBmarkdown2html\fR converts \fIinput\-file\fR
-(or text from STDIN) from markdown\-formatted plain text to HTML.
-If multiple files are specified, they will be combined to make a single
-HTML document. Output is sent to STDOUT.
-.PP
-\fBmarkdown2html\fR is a wrapper for \fBpandoc\fR.
-.SH OPTIONS
-.TP
-.B \-h
-Show usage message.
-.TP
-.I pandoc\-opts
-Any options appearing after \fIinput\-file\fR... on the command line
-will be passed directly to \fBpandoc\fR. If no \fIinput-file\fR
-is specified, these options must be preceded by ` \fB\-\-\fR '.
-(In other cases, ` \fB\-\-\fR ' is optional.) See \fBpandoc\fR(1)
-for a list of options that may be used. Example:
-.IP
-markdown2html input.txt \-\- \-\-css=main.css \-S
-.SH "SEE ALSO"
-\fBpandoc\fR(1),
-\fBhtml2markdown\fR(1),
-\fBmarkdown2latex\fR(1),
-\fBlatex2markdown\fR(1),
-\fBmarkdown2pdf\fR(1)
-.SH AUTHOR
-John MacFarlane and Recai Oktas
+.so man1/pandoc.1
diff --git a/man/man1/markdown2latex.1 b/man/man1/markdown2latex.1
index 3039192d1..7b82576d6 100644
--- a/man/man1/markdown2latex.1
+++ b/man/man1/markdown2latex.1
@@ -1,34 +1 @@
-.TH MARKDOWN2LATEX 1 "November 21, 2006" Pandoc "User Manuals"
-.SH NAME
-markdown2latex \- converts markdown-formatted text to LaTeX
-.SH SYNOPSIS
-\fBmarkdown2latex\fR [\fIoptions\fR] [\fIinput\-file\fR]...
-[\fB\-\-\fR] [\fIpandoc\-opts\fR]
-.SH DESCRIPTION
-\fBmarkdown2latex\fR converts \fIinput\-file\fR (or text from STDIN)
-from markdown\-formatted plain text to LaTeX. If multiple files are
-specified, they will be combined to make a single LaTeX document.
-Output is sent to STDOUT.
-.PP
-\fBmarkdown2latex\fR is a wrapper for \fBpandoc\fR.
-.SH OPTIONS
-.TP
-.B \-h
-Show usage message.
-.TP
-.I pandoc\-opts
-Any options appearing after \fIinput\-file\fR... on the command line
-will be passed directly to \fBpandoc\fR. If no \fIinput-file\fR
-is specified, these options must be preceded by ` \fB\-\-\fR '.
-(In other cases, ` \fB\-\-\fR ' is optional.) See \fBpandoc\fR(1)
-for a list of options that may be used. Example:
-.IP
-markdown2latex input.txt \-\- \-\-custom\-header=letterhead.tex
-.SH "SEE ALSO"
-\fBpandoc\fR(1),
-\fBmarkdown2html\fR(1),
-\fBhtml2markdown\fR(1),
-\fBlatex2markdown\fR(1),
-\fBmarkdown2pdf\fR(1)
-.SH AUTHOR
-John MacFarlane and Recai Oktas
+.so man1/pandoc.1
diff --git a/man/man1/markdown2pdf.1 b/man/man1/markdown2pdf.1
index 99aa50a28..c15131a42 100644
--- a/man/man1/markdown2pdf.1
+++ b/man/man1/markdown2pdf.1
@@ -1,43 +1,71 @@
-.TH MARKDOWN2PDF 1 "November 21, 2006" Pandoc "User Manuals"
+.TH MARKDOWN2PDF 1 "December 15, 2006" Pandoc "User Manuals"
.SH NAME
markdown2pdf \- converts markdown-formatted text to PDF, using pdflatex
.SH SYNOPSIS
-\fBmarkdown2pdf\fR [\fIoptions\fR] [\fB\-o\fR \fIoutput-file\fR]
-[\fIinput-file\fR]... [\fB\-\-\fR] [\fIpandoc\-opts\fR]
+\fBmarkdown2pdf\fR [\fIoptions\fR] [\fIinput-file\fR]...
.SH DESCRIPTION
-\fBmarkdown2pdf\fR converts \fIinput\-file\fR (or text from STDIN) from
-markdown\-formatted plain text to PDF, using \fBpdflatex\fR. If no output
-filename is specified, the name of the output file is derived from the
-input file; thus, for example, if the input file is \fIhello.txt\fR,
-the output file will be \fIhello.pdf\fR. If the input is read from STDIN
-and no output filename is specified, the output file will be named
-\fIstdin.pdf\fR. If multiple input files are specified, they will be
-concatenated before conversion, and the name of the output file will be
-derived from the first input file.
+\fBmarkdown2pdf\fR converts \fIinput\-file\fR (or text from standard
+input) from markdown\-formatted plain text to PDF, using \fBpdflatex\fR.
+If no output filename is specified, the name of the output file is
+derived from the input file; thus, for example, if the input file
+is \fIhello.txt\fR, the output file will be \fIhello.pdf\fR. If
+the input is read from STDIN and no output filename is
+specified, the output file will be named \fIstdin.pdf\fR. If
+multiple input files are specified, they will be concatenated before
+conversion, and the name of the output file will be derived from
+the first input file.
.PP
-\fBmarkdown2pdf\fR is a wrapper for \fBpandoc\fR.
+Input is assumed to be in the UTF\-8 character encoding. If your
+local character encoding is not UTF\-8, you should pipe input and
+output through \fBiconv\fR:
+.IP
+.B iconv \-t utf\-8 input.txt | pandoc | iconv \-f utf\-8
+.PP
+\fBmarkdown2pdf\fR assumes that the 'unicode' package
+is in latex's search path. If this package is not included in your
+latex setup, it can be obtained from <http://ctan.org>.
+.PP
+\fBmarkdown2pdf\fR is a wrapper around \fBmarkdown2latex\fR.
.SH OPTIONS
.TP
-.B \-h
-Show usage message.
+.B \-o FILE, \-\-output=FILE
+Write output to \fIFILE\fR.
.TP
-.B \-o \fIoutput-file\fR
-Specify name of output (PDF) file.
+.B \-p, \-\-preserve-tabs
+Preserve tabs instead of converting them to spaces.
.TP
-.I pandoc\-opts
-Any options appearing after \fIinput\-file\fR... on the command line
-will be passed directly to \fBpandoc\fR. If no \fIinput-file\fR
-is specified, these options must be preceded by ` \fB\-\-\fR '.
-(In other cases, ` \fB\-\-\fR ' is optional.) See \fBpandoc\fR(1)
-for a list of options that may be used. Example:
-.IP
-markdown2pdf input.txt \-\- \-\-custom\-header=letterhead.tex
+.B \-\-tab-stop=\fITABSTOP\fB
+Specify tab stop (default is 4).
+.TP
+.B \-R, \-\-parse-raw
+Parse untranslatable LaTeX environments as raw LaTeX,
+instead of ignoring them.
+.TP
+.B \-N, \-\-number-sections
+Number section headings in LaTeX output. (Default is not to number them.)
+.TP
+.B \-H \fIFILE\fB, \-\-include-in-header=\fIFILE\fB
+Include (LaTeX) contents of \fIFILE\fR at the end of the header. Implies
+\fB\-s\fR.
+.TP
+.B \-B \fIFILE\fB, \-\-include-before-body=\fIFILE\fB
+Include (LaTeX) contents of \fIFILE\fR at the beginning of the document body.
+.TP
+.B \-A \fIFILE\fB, \-\-include-after-body=\fIFILE\fB
+Include (LaTeX) contents of \fIFILE\fR at the end of the document body.
+.TP
+.B \-C \fIFILE\fB, \-\-custom-header=\fIFILE\fB
+Use contents of \fIFILE\fR
+as the LaTeX document header (overriding the default header, which can be
+printed using '\fBpandoc \-D latex\fR'). Implies \fB\-s\fR.
+.TP
+.B \-v, \-\-version
+Print version.
+.TP
+.B \-h, \-\-help
+Show usage message.
.SH "SEE ALSO"
\fBpandoc\fR(1),
-\fBmarkdown2html\fR(1),
-\fBhtml2markdown\fR(1),
-\fBmarkdown2latex\fR(1),
-\fBlatex2markdown\fR(1),
\fBpdflatex\fR(1)
.SH AUTHOR
John MacFarlane and Recai Oktas
diff --git a/man/man1/markdown2rst.1 b/man/man1/markdown2rst.1
new file mode 100644
index 000000000..7b82576d6
--- /dev/null
+++ b/man/man1/markdown2rst.1
@@ -0,0 +1 @@
+.so man1/pandoc.1
diff --git a/man/man1/markdown2rtf.1 b/man/man1/markdown2rtf.1
new file mode 100644
index 000000000..7b82576d6
--- /dev/null
+++ b/man/man1/markdown2rtf.1
@@ -0,0 +1 @@
+.so man1/pandoc.1
diff --git a/man/man1/markdown2s5.1 b/man/man1/markdown2s5.1
new file mode 100644
index 000000000..7b82576d6
--- /dev/null
+++ b/man/man1/markdown2s5.1
@@ -0,0 +1 @@
+.so man1/pandoc.1
diff --git a/man/man1/pandoc.1 b/man/man1/pandoc.1
index 5f632de90..82c9ae321 100644
--- a/man/man1/pandoc.1
+++ b/man/man1/pandoc.1
@@ -1,18 +1,23 @@
-.TH PANDOC 1 "November 21, 2006" Pandoc "User Manuals"
+.TH PANDOC 1 "December 15, 2006" Pandoc "User Manuals"
.SH NAME
-pandoc \- general markup converter
+pandoc, markdown2html, markdown2latex, markdown2rst, markdown2rtf,
+markdown2s5, html2markdown, latex2markdown, rst2markdown \- general
+markup converter
.SH SYNOPSIS
\fBpandoc\fR [\fIoptions\fR] [\fIinput\-file\fR]...
.SH DESCRIPTION
-\fIPandoc\fR converts files from one markup format to another. It can
+\fBPandoc\fR converts files from one markup format to another. It can
read markdown and (subsets of) reStructuredText, HTML, and LaTeX, and
it can write markdown, reStructuredText, HTML, LaTeX, RTF, and S5 HTML
slide shows.
.PP
-If no \fIinput\-file\fR is specified, input is read from STDIN. Otherwise,
-the \fIinput\-files\fR are concatenated (with a blank line between each)
-and used as input. Output goes to STDOUT. If you want output to a file,
-use shell redirection:
+If no \fIinput\-file\fR is specified, input is read from STDIN.
+Otherwise, the \fIinput\-files\fR are concatenated (with a blank
+line between each) and used as input. Output goes to standard
+output. If you want output to a file, use the \fB\-o\fR option or
+shell redirection:
+.IP
+.B pandoc \-o output.html input.txt
.IP
.B pandoc input.txt > output.html
.PP
@@ -25,6 +30,19 @@ formats can be specified using command\-line options. For example,
converts \fIchap1.tex\fR from LaTeX to markdown\-formatted plain text.
See below for a detailed list of command\-line options.
.PP
+For convenience, eight variant programs are available:
+\fBmarkdown2html\fR (same as \fBpandoc \-w html\fR),
+\fBmarkdown2latex\fR (same as \fBpandoc \-w latex\fR),
+\fBmarkdown2rst\fR (same as \fBpandoc \-w rst\fR),
+\fBmarkdown2rtf\fR (same as \fBpandoc \-w rtf\fR),
+\fBmarkdown2s5\fR (same as \fBpandoc \-w s5\fR),
+\fBhtml2markdown\fR (same as \fBpandoc \-r html \-w markdown\fR),
+\fBlatex2markdown\fR (same as \fBpandoc \-r latex \-w markdown\fR),
+and \fBrst2markdown\fR (same as \fBpandoc \-r rst \-w markdown\fR).
+These programs take an appropriately restricted subset of \fBpandoc\fR's
+options. (Run them with the \fB\-h\fR flag for a full list of allowed
+options.)
+.PP
\fIPandoc\fR uses the UTF\-8 character encoding for both input and output.
If your local character encoding is not UTF\-8, you should pipe input
and output through \fBiconv\fR:
@@ -33,61 +51,58 @@ and output through \fBiconv\fR:
.SH OPTIONS
.TP
-.B \-v, \-\-version
-Print version.
-.TP
-.B \-h, \-\-help
-Show usage message.
-.TP
-.B \-f FORMAT, \-r FORMAT, \-\-from=FORMAT, \-\-read=FORMAT
+.B \-f \fIFORMAT\fB, \-r \fIFORMAT\fB, \-\-from=\fIFORMAT\fB, \-\-read=\fIFORMAT\fB
Specify input format.
.I FORMAT
can be
-.I native
+.B native
(native Haskell),
-.I markdown
+.B markdown
(markdown or plain text),
-.I rst
+.B rst
(reStructuredText),
-.I html
+.B html
(HTML),
or
-.I latex
+.B latex
(LaTeX).
.TP
-.B \-t FORMAT, \-w FORMAT, \-\-to=FORMAT, \-\-write=FORMAT
+.B \-t \fIFORMAT\fB, \-w \fIFORMAT\fB, \-\-to=\fIFORMAT\fB, \-\-write=\fIFORMAT\fB
Specify output format.
.I FORMAT
can be
-.I native
+.B native
(native Haskell),
-.I markdown
+.B markdown
(markdown or plain text),
-.I rst
+.B rst
(reStructuredText),
-.I html
+.B html
(HTML),
-.I latex
+.B latex
(LaTeX),
-.I s5
+.B s5
(S5 HTML and javascript slide show),
or
-.I rtf
+.B rtf
(rich text format).
.TP
.B \-s, \-\-standalone
Produce output with an appropriate header and footer (e.g. a
standalone HTML, LaTeX, or RTF file, not a fragment).
.TP
+.B \-o \fIFILE\fB, \-\-output=\fIFILE\fB
+Write output to \fIFILE\fR instead of STDOUT.
+.TP
.B \-p, \-\-preserve-tabs
Preserve tabs instead of converting them to spaces.
.TP
-.B \-\-tab-stop=TABSTOP
+.B \-\-tab-stop=\fITABSTOP\fB
Specify tab stop (default is 4).
.TP
.B \-R, \-\-parse-raw
-Parse untranslatable HTML codes and LaTeX environments as raw HTML or
-LaTeX, instead of ignoring them.
+Parse untranslatable HTML codes and LaTeX environments as raw HTML
+or LaTeX, instead of ignoring them.
.TP
.B \-S, \-\-smartypants
Use smart quotes, dashes, and ellipses in HTML output.
@@ -99,41 +114,50 @@ Use ASCIIMathML to display embedded LaTeX math in HTML output.
Make list items in S5 display incrementally (one by one).
.TP
.B \-N, \-\-number-sections
-Number section headings in LaTeX output. (Default is not to number them.)
+Number section headings in LaTeX output. (Default is not to number
+them.)
.TP
-.B \-c CSS, \-\-css=CSS
+.B \-c \fICSS\fB, \-\-css=\fICSS\fB
Link to a CSS style sheet.
.I CSS
is the pathname of the style sheet.
.TP
-.B \-H FILENAME, \-\-include-in-header=FILENAME
-Include contents of \fIFILENAME\fR at the end of the header. Implies
+.B \-H \fIFILE\fB, \-\-include-in-header=\fIFILE\fB
+Include contents of \fIFILE\fR at the end of the header. Implies
\fB\-s\fR.
.TP
-.B \-B FILENAME, \-\-include-before-body=FILENAME
-Include contents of \fIFILENAME\fR at the beginning of the document body.
+.B \-B \fIFILE\fB, \-\-include-before-body=\fIFILE\fB
+Include contents of \fIFILE\fR at the beginning of the document
+body.
.TP
-.B \-A FILENAME, \-\-include-after-body=FILENAME
-Include contents of \fIFILENAME\fR at the end of the document body.
+.B \-A \fIFILE\fB, \-\-include-after-body=\fIFILE\fB
+Include contents of \fIFILE\fR at the end of the document body.
.TP
-.B \-\-custom-header=FILENAME
-Use contents of \fIFILENAME\fR
-as the document header (overriding the default header, which can be
-printed by using the \fB\-D\fR option). Implies
-\fB-s\fR.
+.B \-C \fIFILE\fB, \-\-custom-header=\fIFILE\fB
+Use contents of \fIFILE\fR as the document header (overriding the
+default header, which can be printed by using the \fB\-D\fR option).
+Implies \fB\-s\fR.
.TP
-.B \-D FORMAT, \-\-print-default-header=FORMAT
-Print the default header for \fIFORMAT\fR
-(\fIhtml, s5, latex, markdown, rst, rtf\fR).
+.B \-D \fIFORMAT\fB, \-\-print-default-header=\fIFORMAT\fB
+Print the default header for \fIFORMAT\fR (\fIhtml, s5, latex,
+markdown, rst, rtf\fR).
.TP
-.B \-T STRING, \-\-title-prefix=STRING
+.B \-T \fISTRING\fB, \-\-title-prefix=\fISTRING\fB
Specify \fISTRING\fR as a prefix to the HTML window title.
+.TP
+.B \-d, \-\-debug
+Print debugging information (names of input and output files) to
+STDERR. Write output to STDOUT, even if an output file was specified
+using the \fB\-o\fR option.
+.TP
+.B \-v, \-\-version
+Print version.
+.TP
+.B \-h, \-\-help
+Show usage message.
.SH "SEE ALSO"
-\fBmarkdown2html\fR(1),
-\fBhtml2markdown\fR(1),
-\fBmarkdown2latex\fR(1),
-\fBlatex2markdown\fR(1),
+\fBweb2markdown\fR(1),
\fBmarkdown2pdf\fR(1),
\fBiconv\fR(1)
diff --git a/man/man1/rst2markdown.1 b/man/man1/rst2markdown.1
new file mode 100644
index 000000000..7b82576d6
--- /dev/null
+++ b/man/man1/rst2markdown.1
@@ -0,0 +1 @@
+.so man1/pandoc.1
diff --git a/man/man1/web2markdown.1 b/man/man1/web2markdown.1
new file mode 100644
index 000000000..a570cfc97
--- /dev/null
+++ b/man/man1/web2markdown.1
@@ -0,0 +1,82 @@
+.TH WEB2MARKDOWN 1 "December 15, 2006" Pandoc "User Manuals"
+.SH NAME
+web2markdown \- converts HTML to markdown-formatted text
+.SH SYNOPSIS
+\fBweb2markdown\fR [\fIoptions\fR] [\fIinput\-file\fR or \fIURL\fR]
+.SH DESCRIPTION
+\fBweb2markdown\fR converts \fIinput\-file\fR or \fIURL\fR (or text
+from STDIN) from HTML to markdown\-formatted plain text.
+If a URL is specified, \fBweb2markdown\fR uses an available program
+(e.g. wget, w3m, lynx or curl) to fetch its contents. Output is sent
+to STDOUT unless an output file is specified using the \fB\-o\fR
+option.
+.PP
+\fBweb2markdown\fR uses the character encoding specified in the
+"Content-type" meta tag. If this is not present, or if input comes
+from STDIN, UTF-8 is assumed. A character encoding may be specified
+explicitly using the \fB\-e\fR option.
+.PP
+\fBweb2markdown\fR is a wrapper for \fBhtml2markdown\fR.
+.SH OPTIONS
+.TP
+.B \-s, \-\-standalone
+Include title, author, and date information (if present) at the
+top of markdown output.
+.TP
+.B \-o \fIFILE\fB, \-\-output=\fIFILE\fB
+Write output to \fIFILE\fR instead of STDOUT.
+.TP
+.B \-p, \-\-preserve-tabs
+Preserve tabs instead of converting them to spaces.
+.TP
+.B \-\-tab-stop=\fITABSTOP\fB
+Specify tab stop (default is 4).
+.TP
+.B \-R, \-\-parse-raw
+Parse untranslatable HTML codes as raw HTML.
+.TP
+.B \-H \fIFILE\fB, \-\-include-in-header=\fIFILE\fB
+Include contents of \fIFILE\fR at the end of the header. Implies
+\fB\-s\fR.
+.TP
+.B \-B \fIFILE\fB, \-\-include-before-body=\fIFILE\fB
+Include contents of \fIFILE\fR at the beginning of the document body.
+.TP
+.B \-A \fIFILE\fB, \-\-include-after-body=\fIFILE\fB
+Include contents of \fIFILE\fR at the end of the document body.
+.TP
+.B \-C \fIFILE\fB, \-\-custom-header=\fIFILE\fB
+Use contents of \fIFILE\fR
+as the document header (overriding the default header, which can be
+printed using '\fBpandoc \-D markdown\fR'). Implies
+\fB\-s\fR.
+.TP
+.B \-v, \-\-version
+Print version.
+.TP
+.B \-h, \-\-help
+Show usage message.
+.TP
+.B \-e \fIencoding\fB, \-\-encoding=\fIencoding\fB
+Assume the character encoding \fIencoding\fR in reading HTML.
+(Note: \fIencoding\fR will be passed to \fBiconv\fR; a list of
+available encodings may be obtained using `\fBiconv \-l\fR'.)
+If the \fB\-e\fR option is not specified and input is not from
+STDIN, \fBweb2markdown\fR will try to extract the character encoding
+from the "Content-type" meta tag. If no character encoding is
+specified in this way, or if input is from STDIN, UTF-8 will be
+assumed.
+.TP
+.B \-g \fIcommand\fB, \-\-grabber=\fIcommand\fB
+Use \fIcommand\fR to fetch the contents of a URL. (By default,
+\fBweb2markdown\fR searches for an available program or text-based
+browser to fetch the contents of a URL.) For example:
+.IP
+web2markdown \-g 'wget \-\-user=foo \-\-password=bar' mysite.com
+
+.SH "SEE ALSO"
+\fBpandoc\fR(1),
+\fBhtml2markdown\fR(1),
+\fBiconv\fR(1)
+.SH AUTHOR
+John MacFarlane and Recai Oktas
diff --git a/src/Main.hs b/src/Main.hs
index 542e521f6..0f8567517 100644
--- a/src/Main.hs
+++ b/src/Main.hs
@@ -45,6 +45,7 @@ import Text.Pandoc.Writers.DefaultHeaders ( defaultHtmlHeader,
defaultRTFHeader, defaultS5Header, defaultLaTeXHeader )
import Text.Pandoc.Definition
import Text.Pandoc.Shared
+import Text.Regex ( mkRegex, splitRegex )
import System ( exitWith, getArgs, getProgName )
import System.Exit
import System.Console.GetOpt
@@ -57,6 +58,9 @@ import Control.Monad ( (>>=) )
version :: String
version = "0.3"
+copyrightMessage :: String
+copyrightMessage = "\nCopyright (C) 2006 John MacFarlane\nWeb: http://sophos.berkeley.edu/macfarlane/pandoc\nThis is free software; see the source for copying conditions. There is no\nwarranty, not even for merchantability or fitness for a particular purpose."
+
-- | Association list of formats and readers.
readers :: [(String, ParserState -> String -> Pandoc)]
readers = [("native" , readPandoc)
@@ -101,10 +105,13 @@ data Opt = Opt
, optCustomHeader :: String -- ^ Custom header to use, or "DEFAULT"
, optDefaultHeader :: String -- ^ Default header
, optTitlePrefix :: String -- ^ Optional prefix for HTML title
+ , optOutputFile :: String -- ^ Name of output file
, optNumberSections :: Bool -- ^ If @True@, number sections in LaTeX
, optIncremental :: Bool -- ^ If @True@, incremental lists in S5
, optSmart :: Bool -- ^ If @True@, use smart typography
, optASCIIMathML :: Bool -- ^ If @True@, use ASCIIMathML in HTML
+ , optShowUsage :: Bool -- ^ If @True@, show usage message
+ , optDebug :: Bool -- ^ If @True@, output debug messages
}
-- | Defaults for command-line options.
@@ -123,32 +130,20 @@ startOpt = Opt
, optCustomHeader = "DEFAULT"
, optDefaultHeader = defaultHtmlHeader
, optTitlePrefix = ""
+ , optOutputFile = "" -- null for stdout
, optNumberSections = False
, optIncremental = False
, optSmart = False
, optASCIIMathML = False
+ , optShowUsage = False
+ , optDebug = False
}
-- | A list of functions, each transforming the options data structure in response
-- to a command-line option.
-options :: [OptDescr (Opt -> IO Opt)]
-options =
- [ Option "v" ["version"]
- (NoArg
- (\_ -> do
- hPutStrLn stderr ("Version " ++ version)
- exitWith ExitSuccess))
- "Print version"
-
- , Option "h" ["help"]
- (NoArg
- (\_ -> do
- prg <- getProgName
- hPutStrLn stderr (usageInfo (prg ++ " [OPTIONS] [FILES] - convert FILES from one markup format to another\nIf no OPTIONS specified, converts from markdown to html.\nIf no FILES specified, input is read from STDIN.\nOptions:") options)
- exitWith ExitSuccess))
- "Show help"
-
- , Option "fr" ["from","read"]
+allOptions :: [OptDescr (Opt -> IO Opt)]
+allOptions =
+ [ Option "fr" ["from","read"]
(ReqArg
(\arg opt -> case (lookup (map toLower arg) readers) of
Just reader -> return opt { optReader = reader }
@@ -172,6 +167,13 @@ options =
(\opt -> return opt { optStandalone = True }))
"Include needed header and footer on output"
+ , Option "o" ["output"]
+ (ReqArg
+ (\arg opt -> do
+ return opt { optOutputFile = arg })
+ "FILENAME")
+ "Name of output file"
+
, Option "p" ["preserve-tabs"]
(NoArg
(\opt -> return opt { optPreserveTabs = True }))
@@ -241,7 +243,7 @@ options =
"FILENAME")
"File to include after document body"
- , Option "" ["custom-header"]
+ , Option "C" ["custom-header"]
(ReqArg
(\arg opt -> do
text <- readFile arg
@@ -263,18 +265,87 @@ options =
let header = case (lookup arg writers) of
Just (writer, head) -> head
Nothing -> error ("Unknown reader: " ++ arg)
- hPutStrLn stdout header
+ hPutStr stdout header
exitWith ExitSuccess)
"FORMAT")
"Print default header for FORMAT"
+
+ , Option "d" ["debug"]
+ (NoArg
+ (\opt -> return opt { optDebug = True }))
+ "Print debug messages to stderr, output to stdout"
+
+ , Option "v" ["version"]
+ (NoArg
+ (\_ -> do
+ prg <- getProgName
+ hPutStrLn stderr (prg ++ " " ++ version ++
+ copyrightMessage)
+ exitWith $ ExitFailure 2))
+ "Print version"
+
+ , Option "h" ["help"]
+ (NoArg
+ (\opt -> return opt { optShowUsage = True }))
+ "Show help"
]
+
+-- parse name of calling program and return default reader and writer descriptions
+parseProgName name =
+ case (splitRegex (mkRegex "2") (map toLower name)) of
+ [from, to] -> (from, to)
+ _ -> ("markdown", "html")
+
+-- set default options based on reader and writer descriptions; start is starting options
+setDefaultOpts from to start =
+ case ((lookup from readers), (lookup to writers)) of
+ (Just reader, Just (writer, header)) -> start {optReader = reader,
+ optWriter = writer,
+ optDefaultHeader = header}
+ _ -> start
+
+-- True if single-letter option is in option list
+inOptList :: [Char] -> OptDescr (Opt -> IO Opt) -> Bool
+inOptList list desc =
+ let (Option letters _ _ _) = desc in
+ any (\x -> x `elem` list) letters
+
+-- Reformat usage message so it doesn't wrap illegibly
+reformatUsageInfo = gsub " *--" " --" .
+ gsub "(-[A-Za-z0-9]) *--" "\\1, --" .
+ gsub " *([^- ])" "\n\t\\1"
+
main = do
+ name <- getProgName
+ let (from, to) = parseProgName name
+
+ let irrelevantOptions = if not ('2' `elem` name)
+ then ""
+ else "frtwD" ++
+ (if (to /= "html" && to /= "s5") then "SmcT" else "") ++
+ (if (to /= "latex") then "N" else "") ++
+ (if (to /= "s5") then "i" else "") ++
+ (if (from /= "html" && from /= "latex") then "R" else "")
+
+ let options = filter (not . inOptList irrelevantOptions) allOptions
+
+ let defaultOpts = setDefaultOpts from to startOpt
+
args <- getArgs
- let (actions, sources, errors) = getOpt RequireOrder options args
+ let (actions, sources, errors) = getOpt Permute options args
+
+ if (not (null errors))
+ then do
+ mapM (\e -> hPutStrLn stderr e) errors
+ hPutStrLn stderr (reformatUsageInfo $
+ usageInfo (name ++ " [OPTIONS] [FILES]") options)
+ exitWith $ ExitFailure 2
+ else
+ return ()
-- thread option data structure through all supplied option actions
- opts <- foldl (>>=) (return startOpt) actions
+ opts <- foldl (>>=) (return defaultOpts) actions
let Opt { optPreserveTabs = preserveTabs
, optTabStop = tabStop
@@ -289,12 +360,31 @@ main = do
, optCustomHeader = customHeader
, optDefaultHeader = defaultHeader
, optTitlePrefix = titlePrefix
+ , optOutputFile = outputFile
, optNumberSections = numberSections
, optIncremental = incremental
, optSmart = smart
, optASCIIMathML = asciiMathML
+ , optShowUsage = showUsage
+ , optDebug = debug
} = opts
+ if showUsage
+ then do
+ hPutStr stderr (reformatUsageInfo $ usageInfo (name ++ " [OPTIONS] [FILES]") options)
+ exitWith $ ExitFailure 2
+ else return ()
+
+ output <- if ((null outputFile) || debug)
+ then return stdout
+ else openFile outputFile WriteMode
+
+ if debug
+ then do
+ hPutStrLn stderr ("OUTPUT=" ++ outputFile)
+ hPutStr stderr $ concatMap (\s -> "INPUT=" ++ s ++ "\n") sources
+ else return ()
+
let writingS5 = (defaultHeader == defaultS5Header)
let tabFilter = if preserveTabs then id else (tabsToSpaces tabStop)
let addBlank str = str ++ "\n\n"
@@ -323,13 +413,13 @@ main = do
writerIncludeBefore = includeBefore,
writerIncludeAfter = includeAfter }
- (readSources sources) >>= (putStr . encodeUTF8 . (writer writerOptions) .
+ (readSources sources) >>= (hPutStr output . encodeUTF8 .
+ (writer writerOptions) .
(reader startParserState) . filter .
- decodeUTF8 . (joinWithSep "\n"))
+ decodeUTF8 . (joinWithSep "\n")) >> hClose output
where
readSources [] = mapM readSource ["-"]
readSources sources = mapM readSource sources
- readSource "-" = getContents
+ readSource "-" = getContents
readSource source = readFile source
-
diff --git a/src/wrappers/checkin.sh b/src/wrappers/checkin.sh
deleted file mode 100644
index c9c564a23..000000000
--- a/src/wrappers/checkin.sh
+++ /dev/null
@@ -1,7 +0,0 @@
-# Check if input files exist.
-for f; do
- if [ -n "$f" ] && ! [ -f "$f" ]; then
- err "File '$f' not found."
- exit 1
- fi
-done
diff --git a/src/wrappers/common.sh b/src/wrappers/common.sh
index 99a83be50..3481affff 100644
--- a/src/wrappers/common.sh
+++ b/src/wrappers/common.sh
@@ -8,22 +8,6 @@ WRAPPEE_ARGS=
err () { echo "$*" | fold -s -w ${COLUMNS:-110} >&2; }
errn () { printf "$*" | fold -s -w ${COLUMNS:-110} >&2; }
-usage () {
- synopsis="$@"
- err "Usage: $THIS $synopsis"
- err "See $THIS(1) man file for details."
-}
-
-runpandoc () {
- if [ -n "$WRAPPEE_ARGS" ]; then
- # Unpack arguments that will be passed to pandoc.
- oldifs="$IFS"; IFS="$NEWLINE"; set -- $WRAPPEE_ARGS "$@"; IFS="$oldifs"
- case "$1" in --) shift;; esac # tolerate the existence of a leading '--'
- fi
-
- pandoc "$@"
-}
-
# Portable which(1).
pathfind () {
oldifs="$IFS"; IFS=':'
@@ -37,17 +21,6 @@ pathfind () {
return 1
}
-HAVE_ICONV=
-if pathfind iconv; then
- HAVE_ICONV=1
- alias to_utf8='iconv -t utf-8'
- alias from_utf8='iconv -f utf-8'
-else
- err "Warning: iconv not present. Assuming UTF-8 character encoding."
- alias to_utf8='cat'
- alias from_utf8='cat'
-fi
-
for p in pandoc $REQUIRED; do
pathfind $p || {
err "You need '$p' to use this program!"
diff --git a/src/wrappers/getopts.sh b/src/wrappers/getopts.sh
deleted file mode 100644
index 263263c07..000000000
--- a/src/wrappers/getopts.sh
+++ /dev/null
@@ -1,12 +0,0 @@
-if [ -z "$SYNOPSIS" ]; then
- SYNOPSIS="[-h] [input_file]"
- [ -n "$THIS_NARG" ] || SYNOPSIS="${SYNOPSIS}..."
-fi
-
-while getopts h opt; do
- case $opt in
- h|?) usage "$SYNOPSIS"; exit 2 ;;
- esac
-done
-
-shift $(($OPTIND - 1))
diff --git a/src/wrappers/html2markdown.in b/src/wrappers/html2markdown.in
deleted file mode 100644
index 0fece3ccd..000000000
--- a/src/wrappers/html2markdown.in
+++ /dev/null
@@ -1,134 +0,0 @@
-#!/bin/sh -e
-# converts html to markdown
-# uses an available program to fetch URL and tidy to normalize it first
-
-REQUIRED=tidy
-
-### common.sh
-
-grab_url_with () {
- url="${1:?internal error: grab_url_with: url required}"
-
- shift
- cmdline="$@"
-
- prog=
- prog_opts=
- if [ -n "$cmdline" ]; then
- eval "set -- $cmdline"
- prog=$1
- shift
- prog_opts="$@"
- fi
-
- if [ -z "$prog" ]; then
- # Locate a sensible web grabber (note the order).
- for p in wget lynx w3m curl links w3c; do
- if pathfind $p; then
- prog=$p
- break
- fi
- done
-
- [ -n "$prog" ] || {
- errn "$THIS: Couldn't find a program to fetch the file from URL "
- err "(e.g. wget, w3m, lynx, w3c, or curl)."
- return 1
- }
- else
- pathfind "$prog" || {
- err "$THIS: No such web grabber '$prog' found; aborting."
- return 1
- }
- fi
-
- # Setup proper base options for known grabbers.
- base_opts=
- case "$prog" in
- wget) base_opts="-O-" ;;
- lynx) base_opts="-source" ;;
- w3m) base_opts="-dump_source" ;;
- curl) base_opts="" ;;
- links) base_opts="-source" ;;
- w3c) base_opts="-n -get" ;;
- *) err "$THIS: unhandled web grabber '$prog'; hope it succeeds."
- esac
-
- err "$THIS: invoking '$prog $base_opts $prog_opts $url'..."
- eval "set -- $base_opts $prog_opts"
- $prog "$@" "$url"
-}
-
-encoding=
-grabber=
-nograb=
-while getopts e:g:nh opt; do
- case $opt in
- e) encoding="$OPTARG" ;;
- g) grabber="$OPTARG" ;;
- n) nograb=1 ;;
- h|?)
- usage "[-e encoding] [-g grabber_command] [-n] [-h] [input_file|url]"
- exit 2 ;;
- esac
-done
-
-shift $(($OPTIND - 1))
-
-### postopts.sh
-
-### singlearg.sh
-
-inurl=
-if [ -n "$1" ] && ! [ -f "$1" ]; then
- if [ -n "$nograb" ]; then
- err "'$1' not found; refusing to treat input as URL."
- exit 1
- fi
- # Treat given argument as an URL.
- inurl="$1"
-fi
-
-if [ -n "$inurl" ]; then
- err "Attempting to fetch file from '$inurl'..."
-
- ### tempdir.sh
-
- grabber_out=$THIS_TEMPDIR/grabber.out
- grabber_log=$THIS_TEMPDIR/grabber.log
- if ! grab_url_with "$inurl" "$grabber" 1>$grabber_out \
- 2>$grabber_log; then
- errn "grab_url_with failed"
- if [ -f $grabber_log ]; then
- err " with the following error log."
- err
- cat >&2 $grabber_log
- else
- err .
- fi
- exit 1
- fi
-
- set -- $grabber_out
-fi
-
-if [ -z "$encoding" ] && [ "x$@" != "x" ]; then
- # Try to determine character encoding unless not specified
- # and input is STDIN.
- encoding=$(
- head "$@" |
- LC_ALL=C tr 'A-Z' 'a-z' |
- sed -ne '/<meta .*content-type.*charset=/ {
- s/.*charset=["'\'']*\([-a-zA-Z0-9]*\).*["'\'']*/\1/p
- }'
- )
-fi
-
-if [ -n "$encoding" ] && [ -n "$HAVE_ICONV" ]; then
- alias to_utf8='iconv -f "$encoding" -t utf-8'
-elif [ -n "$inurl" ]; then # assume web pages are UTF-8
- alias to_utf8='cat'
-fi # else just use local encoding
-
-to_utf8 "$@" | tidy -utf8 2>/dev/null |
-runpandoc -r html -w markdown -s | from_utf8
diff --git a/src/wrappers/latex2markdown.in b/src/wrappers/latex2markdown.in
deleted file mode 100644
index e8cde8a97..000000000
--- a/src/wrappers/latex2markdown.in
+++ /dev/null
@@ -1,14 +0,0 @@
-#!/bin/sh -e
-# runs pandoc to convert latex to markdown
-
-### common.sh
-
-### getopts.sh
-
-### postopts.sh
-
-### singlearg.sh
-
-### checkin.sh
-
-to_utf8 "$@" | runpandoc -r latex -w markdown -s | from_utf8
diff --git a/src/wrappers/markdown2html.in b/src/wrappers/markdown2html.in
deleted file mode 100644
index e255398d2..000000000
--- a/src/wrappers/markdown2html.in
+++ /dev/null
@@ -1,12 +0,0 @@
-#!/bin/sh -e
-# converts markdown to HTML
-
-### common.sh
-
-### getopts.sh
-
-### postopts.sh
-
-### checkin.sh
-
-to_utf8 "$@" | runpandoc | from_utf8
diff --git a/src/wrappers/markdown2latex.in b/src/wrappers/markdown2latex.in
deleted file mode 100644
index c532b2f99..000000000
--- a/src/wrappers/markdown2latex.in
+++ /dev/null
@@ -1,12 +0,0 @@
-#!/bin/sh -e
-# converts markdown to latex
-
-### common.sh
-
-### getopts.sh
-
-### postopts.sh
-
-### checkin.sh
-
-to_utf8 "$@" | runpandoc -w latex -s | from_utf8
diff --git a/src/wrappers/markdown2pdf.in b/src/wrappers/markdown2pdf.in
index 838767224..c222c1cbd 100644
--- a/src/wrappers/markdown2pdf.in
+++ b/src/wrappers/markdown2pdf.in
@@ -1,64 +1,54 @@
#!/bin/sh -e
-# converts markdown to latex, then uses latex to make a PDF
-REQUIRED=pdflatex
+REQUIRED="markdown2latex pdflatex"
### common.sh
-outfile=
-while getopts o:h opt; do
- case $opt in
- o) outfile="$OPTARG" ;;
- h|?) usage "[-o output_file] [-h] [input_file]..."; exit 2 ;;
- esac
-done
-
-shift $(($OPTIND - 1))
-
-### postopts.sh
+### tempdir.sh
-### checkin.sh
+texname=output
+logfile=$THIS_TEMPDIR/log
-if [ -z "$outfile" ]; then
- if [ -n "$1" ]; then
- outfile="${1%.*}"
- else
- outfile="stdin" # input is STDIN, since no argument given
- fi
+if ! markdown2latex -s -d "$@" >$THIS_TEMPDIR/$texname.tex 2>$logfile; then
+ [ -f $logfile ] && sed -e 's/markdown2latex/markdown2pdf/g' \
+ -e '/^INPUT=/d' -e '/^OUTPUT=/d' $logfile >&2
+ exit 1
fi
-case "$outfile" in
-*.*) ;; # skip appending extension if one is already present
-*) outfile="${outfile%.*}.pdf";;
-esac
-### tempdir.sh
-
-# We should use a filename without white spaces for pdflatex.
-TEXNAME=$THIS
+outfile="$(sed -ne 's/^OUTPUT=//p' $logfile)"
+IFS="$NEWLINE"
+set -- $(sed -ne 's/^INPUT=//p' $logfile)
+firstinfilebase="${1%.*}"
+defaultdest="${firstinfilebase:-stdin}.pdf"
+destname="${outfile:-$defaultdest}"
-to_utf8 "$@" | runpandoc -w latex -s >$THIS_TEMPDIR/$TEXNAME.tex
(
cd $THIS_TEMPDIR
- if ! pdflatex -interaction=batchmode $TEXNAME.tex >/dev/null 2>&1; then
+ if ! pdflatex -interaction=batchmode $texname.tex >/dev/null 2>&1; then
err "LaTeX errors:"
- from_utf8 $TEXNAME.log | sed -ne '/^!/,/^ *$/p' >&2
- if grep -q "File \`ucs.sty' not found" $TEXNAME.log; then
- err "Please install the 'unicode' package from ctan.org."
+ sed -ne '/^!/,/^ *$/p' $texname.log >&2
+ if grep -q "File \`ucs.sty' not found" $texname.log; then
+ err "Please install the 'unicode' package from CTAN:"
+ err "http://www.ctan.org/tex-archive/macros/latex/contrib/unicode/"
+ fi
+ if grep -q "File \`fancyvrb.sty' not found" $texname.log; then
+ err "Please install the 'fancyvrb' package from CTAN:"
+ err "http://www.ctan.org/tex-archive/macros/latex/contrib/fancyvrb/"
fi
exit 1
fi
-)
+) || exit $?
is_target_exists=
-if [ -f "$outfile" ]; then
+if [ -f "$destname" ]; then
is_target_exists=1
- mv -f "$outfile" "$outfile~"
+ mv "$destname" "$destname~"
fi
-mv -f $THIS_TEMPDIR/$TEXNAME.pdf "$outfile"
+mv -f $THIS_TEMPDIR/$texname.pdf "$destname"
-errn "Created '$outfile'"
+errn "Created $destname"
[ -z "$is_target_exists" ] || {
- errn " (previous file has been backed up as '$outfile~')"
+ errn " (previous file has been backed up as $destname~)"
}
err .
diff --git a/src/wrappers/postopts.sh b/src/wrappers/postopts.sh
deleted file mode 100644
index e0d015f41..000000000
--- a/src/wrappers/postopts.sh
+++ /dev/null
@@ -1,17 +0,0 @@
-# Parse wrapper and wrappee (pandoc) arguments by taking
-# into account that they may have space or tab characters.
-pick="WRAPPER_ARGS"
-while [ $# -gt 0 ]; do
- if [ "$pick" = "WRAPPER_ARGS" ]; then
- case "$1" in
- -*) pick="WRAPPEE_ARGS" ;;
- esac
- fi
- # Pack args with NEWLINE to preserve spaces,
- # and put them into the picked variable.
- eval "$pick=\"\$${pick}${NEWLINE}${1}\""
- shift
-done
-
-# Unpack filename arguments. Now "$@" will hold the filenames.
-oldifs="$IFS"; IFS="$NEWLINE"; set -- $WRAPPER_ARGS; IFS="$oldifs"
diff --git a/src/wrappers/singlearg.sh b/src/wrappers/singlearg.sh
deleted file mode 100644
index f742d1383..000000000
--- a/src/wrappers/singlearg.sh
+++ /dev/null
@@ -1,7 +0,0 @@
-# Ensure to work with a single argument.
-if [ $# -gt 1 ]; then
- first_arg="$1"
- shift
- err "Warning: extra arguments '$@' will be ignored."
- set -- $first_arg
-fi
diff --git a/src/wrappers/testwrapper.in b/src/wrappers/testwrapper.in
deleted file mode 100644
index e025c87e7..000000000
--- a/src/wrappers/testwrapper.in
+++ /dev/null
@@ -1,141 +0,0 @@
-#!/bin/sh
-
-THIS=$1
-
-ASH="ash -s"
-BASH="bash --posix -s"
-DASH="dash -s"
-KSH="ksh -s"
-POSH="posh -s"
-ZSH="zsh -s"
-
-ERROR=""
-
-wrapper () {
- $SH -- "$@" <<-'EOF'
-### common.sh
-
-outfile=
-while getopts o: opt; do
- case $opt in
- o) outfile="$OPTARG" ;;
- esac
-done
-
-shift $(($OPTIND - 1))
-
-### postopts.sh
-
-echo "Options passed to wrapper:"
-[ -z "$outfile" ] || echo "|$outfile|"
-
-echo "Arguments passed to wrapper:"
-for arg; do
- echo "|$arg|"
-done
-
-pandoc () {
- echo "Arguments passed to wrappee:"
- for arg; do
- echo "|$arg|"
- done
-}
-runpandoc
-EOF
-}
-
-# Portable which(1).
-pathfind () {
- oldifs="$IFS"; IFS=':'
- for _p in $PATH; do
- if [ -x "$_p/$*" ] && [ -f "$_p/$*" ]; then
- IFS="$oldifs"
- return 0
- fi
- done
- IFS="$oldifs"
- return 1
-}
-
-check_results () {
- if [ "$1" = "$2" ]; then
- echo >&2 ok
- return 0
- else
- echo >&2 failed
- sed "s/^/\t/" >&2 <<EOF
-Command line: '$3'
-===> Expected:
-$2
-<=== Got:
-$1
-EOF
- return 1
- fi
-}
-
-for SH in "$BASH" "$DASH" "$KSH" "$ZSH"; do
- CMD=${SH%% *}
- echo >&2 " Testing with $CMD..."
- if pathfind "$CMD"; then
- if [ "$CMD" = "zsh" ]; then
- # Zsh needs to be called as 'sh' to enable POSIX mode.
- ln -s $(which zsh) ./sh
- SH="./sh ${SH#* }"
- trap 'err=$?; rm -f ./sh; exit $err' 0 1 2 3 13 15
- fi
-
- set -e
-
- # Test 1
- printf >&2 " test case 1... "
- actual=$(wrapper -o "output file" "foo bar" -A "quux baz" -B)
- expected=$(cat <<'EOF'
-Options passed to wrapper:
-|output file|
-Arguments passed to wrapper:
-|foo bar|
-Arguments passed to wrappee:
-|-A|
-|quux baz|
-|-B|
-EOF
-)
- check_results "$actual" "$expected" \
- 'wrapper -o "output file" "foo bar" -A "quux baz" -B'
-
- # Test 2
- printf >&2 " test case 2... "
- actual=$(wrapper -- -A "foo bar")
- expected=$(cat <<'EOF'
-Options passed to wrapper:
-Arguments passed to wrapper:
-Arguments passed to wrappee:
-|-A|
-|foo bar|
-EOF
-)
- check_results "$actual" "$expected" 'wrapper -- -A "foo bar"'
-
- # Test 3 (Test 1 with a redundant '--')
- printf >&2 " test case 3... "
- actual=$(wrapper -o "output file" "foo bar" -- -A "quux baz" -B)
- expected=$(cat <<'EOF'
-Options passed to wrapper:
-|output file|
-Arguments passed to wrapper:
-|foo bar|
-Arguments passed to wrappee:
-|-A|
-|quux baz|
-|-B|
-EOF
-)
- check_results "$actual" "$expected" \
- 'wrapper -o "output file" "foo bar" -- -A "quux baz" -B'
- else
- echo >&2 "Warning: cannot verify correctness with $CMD; shell not available"
- fi
-done
-
-exit 0
diff --git a/src/wrappers/web2markdown.in b/src/wrappers/web2markdown.in
new file mode 100644
index 000000000..64ff3db9b
--- /dev/null
+++ b/src/wrappers/web2markdown.in
@@ -0,0 +1,173 @@
+#!/bin/sh -e
+# converts HTML from a URL, file, or stdin to markdown
+# uses an available program to fetch URL and tidy to normalize it first
+
+REQUIRED="tidy html2markdown"
+
+### common.sh
+
+# grab_url_with URL [COMMAND]: fetch URL and write its contents to stdout.
+# COMMAND (program plus options, parsed with eval) selects the grabber; when
+# it is empty the first grabber found is used.  Returns 1 if none is usable.
+grab_url_with () {
+  url="${1:?internal error: grab_url_with: url required}"
+  shift
+  cmdline="$*"
+  # Split COMMAND into program and options.  The options stay in "$@" so
+  # that arguments containing whitespace reach the grabber intact.
+  prog=
+  set --
+  if [ -n "$cmdline" ]; then
+    eval "set -- $cmdline"
+    prog=$1
+    shift
+  fi
+  prog_opts="$*"  # flattened copy, used only in the log message below
+  if [ -z "$prog" ]; then
+    # Locate a sensible web grabber (note the order).
+    for p in wget lynx w3m curl links w3c; do
+      if pathfind $p; then
+        prog=$p
+        break
+      fi
+    done
+    [ -n "$prog" ] || {
+      errn "$THIS: Couldn't find a program to fetch the file from URL "
+      err "(e.g. wget, w3m, lynx, w3c, or curl)."
+      return 1
+    }
+  else
+    pathfind "$prog" || {
+      err "$THIS: No such web grabber '$prog' found; aborting."
+      return 1
+    }
+  fi
+
+  # Setup proper base options for known grabbers.
+  base_opts=
+  case "$prog" in
+    wget)  base_opts="-O-" ;;
+    lynx)  base_opts="-source" ;;
+    w3m)   base_opts="-dump_source" ;;
+    curl)  base_opts="" ;;
+    links) base_opts="-source" ;;
+    w3c)   base_opts="-n -get" ;;
+    *)     err "$THIS: unhandled web grabber '$prog'; hope it succeeds." ;;
+  esac
+  err "$THIS: invoking '$prog $base_opts $prog_opts $url'..."
+  set -- $base_opts "$@"  # base_opts holds only plain words; splitting is safe
+  $prog "$@" "$url"
+}
+
+add_option () {  # append $1 to $options as a new NEWLINE-separated entry
+  options="${options}${NEWLINE}${1}"
+}
+
+# State filled in by the option parser below:
+options=   # newline-separated options to hand on to html2markdown
+argument=  # input file or URL (empty: read standard input)
+encoding=  # input character encoding, from -e/--encoding
+grabber=   # command used to fetch a URL, from -g/--grabber
+
+# Parse command-line arguments.  "$1" is quoted wherever it is stored so
+# that values containing whitespace or glob characters are kept unchanged.
+while [ $# -gt 0 ]; do
+  case "$1" in
+    -h|--help)
+      html2markdown -h 2>&1 | sed -e 's/html2markdown/web2markdown/' 1>&2
+      err " -e ENCODING, --encoding=ENCODING"
+      err " Specify character encoding of input"
+      err " -g COMMAND, --grabber=COMMAND"
+      err " Specify command to be used to grab contents of URL"
+      exit 0 ;;
+    -v|--version)
+      html2markdown -v
+      exit 0 ;;
+    -e)
+      shift
+      encoding=$1 ;;
+    --encoding=*)
+      encoding=${1#*=} ;;  # everything after the first '='
+    -g)
+      shift
+      grabber=$1 ;;
+    --grabber=*)
+      grabber=${1#*=} ;;   # everything after the first '='
+    # Options that take a value: pass on both the option and its value.
+    -o|--output|-b|--tab-stop|-H|--include-in-header| \
+    -A|--include-after-body|-B|--include-before-body| \
+    -C|--custom-header|-T|--title-prefix)
+      add_option "$1"
+      shift
+      add_option "$1" ;;
+    -*) add_option "$1" ;;
+    # The first non-option argument is the input; any others are ignored.
+    *)
+      if [ -z "$argument" ]; then
+        argument=$1
+      else
+        err "Warning: extra argument '$1' will be ignored."
+      fi ;;
+  esac
+  shift
+done
+
+# Unpack options. Now "$@" will hold the html2markdown options.
+# Split on newlines only, with globbing off so option values containing
+# '*', '?' or '[' are not replaced by matching file names.
+oldifs="$IFS"; IFS="$NEWLINE"; set -f; set -- $options; set +f; IFS="$oldifs"
+
+# An argument that is not an existing regular file is treated as a URL.
+inurl=
+[ -z "$argument" ] || [ -f "$argument" ] || inurl="$argument"
+
+if [ -n "$inurl" ]; then
+  err "Attempting to fetch file from '$inurl'..."
+
+  ### tempdir.sh
+
+  # Save the page and the grabber's stderr as files under $THIS_TEMPDIR.
+  grabber_out=$THIS_TEMPDIR/grabber.out
+  grabber_log=$THIS_TEMPDIR/grabber.log
+  grab_url_with "$inurl" "$grabber" 1>$grabber_out 2>$grabber_log || {
+    errn "grab_url_with failed"
+    if [ ! -f $grabber_log ]; then
+      err .
+    else
+      err " with the following error log."
+      err
+      cat >&2 $grabber_log
+    fi
+    exit 1
+  }
+  argument="$grabber_out"  # from here on, work on the downloaded copy
+fi
+
+# Try to determine character encoding if not specified and input is not
+# STDIN: use the charset= value of a content-type <meta> tag among the lines
+# printed by head; the text is lowercased first so letter case is ignored.
+if [ -z "$encoding" ] && [ -n "$argument" ]; then
+  encoding=$(
+    head "$argument" | LC_ALL=C tr 'A-Z' 'a-z' |
+      sed -n -e '/<meta .*content-type.*charset=/ {
+        s/.*charset=["'\'']*\([-a-zA-Z0-9]*\).*["'\'']*/\1/p
+      }'
+  )
+fi
+
+# to_utf8 [FILE]: emit FILE (or stdin) as UTF-8, using iconv when an input
+# encoding is known and iconv is installed; otherwise pass it through as is.
+if [ -n "$encoding" ] && pathfind iconv; then
+  to_utf8 () { iconv -f "$encoding" -t utf-8 "$@"; }
+else # assume UTF-8
+  to_utf8 () { cat "$@"; }
+fi
+# Both inputs go through to_utf8 so that -e also applies to standard input.
+if [ -z "$argument" ]; then
+  to_utf8 | tidy -utf8 2>/dev/null | html2markdown "$@"
+elif [ -f "$argument" ]; then
+  to_utf8 "$argument" | tidy -utf8 2>/dev/null | html2markdown "$@"
+else
+  err "File '$argument' not found."
+  exit 1
+fi