From 3491420b53b03dbc24b6001e4f379fd2fbdbea8d Mon Sep 17 00:00:00 2001 From: fiddlosopher Date: Fri, 29 Dec 2006 18:50:13 +0000 Subject: + Changed 'web2markdown' to 'html2markdown'. git-svn-id: https://pandoc.googlecode.com/svn/trunk@309 788f1e2b-df1e-0410-8736-df70ead52e1b --- Makefile | 2 +- README | 18 ++--- man/man1/html2markdown.1 | 81 +++++++++++++++++++ man/man1/pandoc.1 | 4 +- man/man1/web2markdown.1 | 81 ------------------- src/wrappers/html2markdown.in | 176 ++++++++++++++++++++++++++++++++++++++++++ src/wrappers/web2markdown.in | 176 ------------------------------------------ web/demos.sh | 2 +- web/index.txt | 2 +- 9 files changed, 271 insertions(+), 271 deletions(-) create mode 100644 man/man1/html2markdown.1 delete mode 100644 man/man1/web2markdown.1 create mode 100644 src/wrappers/html2markdown.in delete mode 100644 src/wrappers/web2markdown.in diff --git a/Makefile b/Makefile index 0985ea38d..10c67c0b1 100644 --- a/Makefile +++ b/Makefile @@ -25,7 +25,7 @@ EXECSBASE := $(shell sed -ne 's/^[Ee]xecutable:[[:space:]]*//p' $(CABAL).in) #------------------------------------------------------------------------------- # Install targets #------------------------------------------------------------------------------- -WRAPPERS := web2markdown markdown2pdf +WRAPPERS := html2markdown markdown2pdf # Add .exe extensions if we're running Windows/Cygwin. EXTENSION := $(shell uname | tr '[:upper:]' '[:lower:]' | \ sed -ne 's/^cygwin.*$$/\.exe/p') diff --git a/README b/README index 6c3210d9a..51a690ad5 100644 --- a/README +++ b/README @@ -38,14 +38,14 @@ Requirements The `pandoc` program itself does not depend on any external libraries or programs. -The wrapper script `web2markdown` requires +The wrapper script `html2markdown` requires - `pandoc` (which must be in the PATH) - a POSIX-compliant shell (installed by default on all linux and unix systems, including Mac OS X, and in [Cygwin] for Windows), - `HTML Tidy` - `iconv` (for character encoding conversion). (If `iconv` is absent, - `web2markdown` will still work, but it will treat everything as UTF-8.) + `html2markdown` will still work, but it will treat everything as UTF-8.) [Cygwin]: http://www.cygwin.com/ [HTML Tidy]: http://tidy.sourceforge.net/ @@ -117,7 +117,7 @@ But for simple documents it should be adequate. The `latex` and `html` readers are also limited in what they can do. Because the `html` reader is picky about the HTML it parses, it is recommended that you pipe HTML through [HTML Tidy] before sending it to `pandoc`, or use the -`web2markdown` script described below. +`html2markdown` script described below. If you don't specify a reader or writer explicitly, `pandoc` will try to determine the input and output format from the extensions of @@ -151,10 +151,10 @@ The shell scripts (described below) automatically convert the input from the local encoding to UTF-8 before running them through `pandoc`, then convert the output back to the local encoding. -`markdown2pdf` and `web2markdown` -================================= +`markdown2pdf` and `html2markdown` +================================== -Two shell scripts, `markdown2pdf` and `web2markdown`, are included in +Two shell scripts, `markdown2pdf` and `html2markdown`, are included in the standard Pandoc installation. (They are not included in the Windows binary package, as they require a POSIX shell, but they may be used in Windows under Cygwin.) @@ -175,19 +175,19 @@ in Windows under Cygwin.) If no input file is specified, input will be taken from STDIN. -2. `web2markdown` grabs a web page from a file or URL and converts +2. `html2markdown` grabs a web page from a file or URL and converts it to markdown-formatted text, using `tidy` and `pandoc`. Unless input is from STDIN, an attempt is made to determine the character encoding of the page from the "Content-type" meta tag. If this is not present, UTF-8 is assumed. Alternatively, a character encoding may be specified explicitly using the `-e` option. - `web2markdown` searches for an available program (`wget`, `curl`, + `html2markdown` searches for an available program (`wget`, `curl`, or a text-mode browser) to fetch the contents of a URL. Optionally, the `-g` command may be used to specify the command to be used: - web2markdown -g 'wget --user=foo --password=bar' mysite.com + html2markdown -g 'wget --user=foo --password=bar' mysite.com Command-line options ==================== diff --git a/man/man1/html2markdown.1 b/man/man1/html2markdown.1 new file mode 100644 index 000000000..413feb115 --- /dev/null +++ b/man/man1/html2markdown.1 @@ -0,0 +1,81 @@ +.TH HTML2MARKDOWN 1 "December 15, 2006" Pandoc "User Manuals" +.SH NAME +html2markdown \- converts HTML to markdown-formatted text +.SH SYNOPSIS +\fBhtml2markdown\fR [\fIoptions\fR] [\fIinput\-file\fR or \fIURL\fR] +.SH DESCRIPTION +\fBhtml2markdown\fR converts \fIinput\-file\fR or \fIURL\fR (or text +from STDIN) from HTML to markdown\-formatted plain text. +If a URL is specified, \fBhtml2markdown\fR uses an available program +(e.g. wget, w3m, lynx or curl) to fetch its contents. Output is sent +to STDOUT unless an output file is specified using the \fB\-o\fR +option. +.PP +\fBhtml2markdown\fR uses the character encoding specified in the +"Content-type" meta tag. If this is not present, or if input comes +from STDIN, UTF-8 is assumed. A character encoding may be specified +explicitly using the \fB\-e\fR option. +.PP +\fBhtml2markdown\fR is a wrapper for \fBpandoc\fR. +.SH OPTIONS +.TP +.B \-s, \-\-standalone +Include title, author, and date information (if present) at the +top of markdown output. +.TP +.B \-o FILE, \-\-output=FILE +Write output to \fIFILE\fR instead of STDOUT. +.TP +.B \-p, \-\-preserve-tabs +Preserve tabs instead of converting them to spaces. +.TP +.B \-\-tab-stop=\fITABSTOP\fB +Specify tab stop (default is 4). +.TP +.B \-R, \-\-parse-raw +Parse untranslatable HTML codes as raw HTML. +.TP +.B \-H \fIFILE\fB, \-\-include-in-header=\fIFILE\fB +Include contents of \fIFILE\fR at the end of the header. Implies +\fB\-s\fR. +.TP +.B \-B \fIFILE\fB, \-\-include-before-body=\fIFILE\fB +Include contents of \fIFILE\fR at the beginning of the document body. +.TP +.B \-A \fIFILE\fB, \-\-include-after-body=\fIFILE\fB +Include contents of \fIFILE\fR at the end of the document body. +.TP +.B \-C \fIFILE\fB, \-\-custom-header=\fIFILE\fB +Use contents of \fIFILE\fR +as the document header (overriding the default header, which can be +printed using '\fBpandoc \-D markdown\fR'). Implies +\fB-s\fR. +.TP +.B \-v, \-\-version +Print version. +.TP +.B \-h, \-\-help +Show usage message. +.TP +.B \-e \fIencoding\fR +Assume the character encoding \fIencoding\fR in reading HTML. +(Note: \fIencoding\fR will be passed to \fBiconv\fR; a list of +available encodings may be obtained using `\fBiconv \-l\fR'.) +If the \fB\-e\fR option is not specified and input is not from +STDIN, \fBhtml2markdown\fR will try to extract the character encoding +from the "Content-type" meta tag. If no character encoding is +specified in this way, or if input is from STDIN, UTF-8 will be +assumed. +.TP +.B \-g \fIcommand\fR +Use \fIcommand\fR to fetch the contents of a URL. (By default, +\fBhtml2markdown\fR searches for an available program or text-based +browser to fetch the contents of a URL.) For example: +.IP +html2markdown \-g 'wget \-\-user=foo \-\-password=bar' mysite.com + +.SH "SEE ALSO" +\fBpandoc\fR(1), +\fBiconv\fR(1) +.SH AUTHOR +John MacFarlane and Recai Oktas diff --git a/man/man1/pandoc.1 b/man/man1/pandoc.1 index f6280f463..a955e9e8a 100644 --- a/man/man1/pandoc.1 +++ b/man/man1/pandoc.1 @@ -41,7 +41,7 @@ and output through \fBiconv\fR: .PP \fIPandoc\fR's HTML parser is not very forgiving. If your input is HTML, consider running it through \fBtidy\fR(1) before passing it -to Pandoc. Or use \fBweb2markdown\fR(1), a wrapper around \fBpandoc\fR. +to Pandoc. Or use \fBhtml2markdown\fR(1), a wrapper around \fBpandoc\fR. .SH OPTIONS .TP @@ -151,7 +151,7 @@ Print version. Show usage message. .SH "SEE ALSO" -\fBweb2markdown\fR(1), +\fBhtml2markdown\fR(1), \fBmarkdown2pdf\fR(1). The .I README diff --git a/man/man1/web2markdown.1 b/man/man1/web2markdown.1 deleted file mode 100644 index 242b50671..000000000 --- a/man/man1/web2markdown.1 +++ /dev/null @@ -1,81 +0,0 @@ -.TH WEB2MARKDOWN 1 "December 15, 2006" Pandoc "User Manuals" -.SH NAME -web2markdown \- converts HTML to markdown-formatted text -.SH SYNOPSIS -\fBweb2markdown\fR [\fIoptions\fR] [\fIinput\-file\fR or \fIURL\fR] -.SH DESCRIPTION -\fBweb2markdown\fR converts \fIinput\-file\fR or \fIURL\fR (or text -from STDIN) from HTML to markdown\-formatted plain text. -If a URL is specified, \fBweb2markdown\fR uses an available program -(e.g. wget, w3m, lynx or curl) to fetch its contents. Output is sent -to STDOUT unless an output file is specified using the \fB\-o\fR -option. -.PP -\fBweb2markdown\fR uses the character encoding specified in the -"Content-type" meta tag. If this is not present, or if input comes -from STDIN, UTF-8 is assumed. A character encoding may be specified -explicitly using the \fB\-e\fR option. -.PP -\fBweb2markdown\fR is a wrapper for \fBpandoc\fR. -.SH OPTIONS -.TP -.B \-s, \-\-standalone -Include title, author, and date information (if present) at the -top of markdown output. -.TP -.B \-o FILE, \-\-output=FILE -Write output to \fIFILE\fR instead of STDOUT. -.TP -.B \-p, \-\-preserve-tabs -Preserve tabs instead of converting them to spaces. -.TP -.B \-\-tab-stop=\fITABSTOP\fB -Specify tab stop (default is 4). -.TP -.B \-R, \-\-parse-raw -Parse untranslatable HTML codes as raw HTML. -.TP -.B \-H \fIFILE\fB, \-\-include-in-header=\fIFILE\fB -Include contents of \fIFILE\fR at the end of the header. Implies -\fB\-s\fR. -.TP -.B \-B \fIFILE\fB, \-\-include-before-body=\fIFILE\fB -Include contents of \fIFILE\fR at the beginning of the document body. -.TP -.B \-A \fIFILE\fB, \-\-include-after-body=\fIFILE\fB -Include contents of \fIFILE\fR at the end of the document body. -.TP -.B \-C \fIFILE\fB, \-\-custom-header=\fIFILE\fB -Use contents of \fIFILE\fR -as the document header (overriding the default header, which can be -printed using '\fBpandoc \-D markdown\fR'). Implies -\fB-s\fR. -.TP -.B \-v, \-\-version -Print version. -.TP -.B \-h, \-\-help -Show usage message. -.TP -.B \-e \fIencoding\fR -Assume the character encoding \fIencoding\fR in reading HTML. -(Note: \fIencoding\fR will be passed to \fBiconv\fR; a list of -available encodings may be obtained using `\fBiconv \-l\fR'.) -If the \fB\-e\fR option is not specified and input is not from -STDIN, \fBweb2markdown\fR will try to extract the character encoding -from the "Content-type" meta tag. If no character encoding is -specified in this way, or if input is from STDIN, UTF-8 will be -assumed. -.TP -.B \-g \fIcommand\fR -Use \fIcommand\fR to fetch the contents of a URL. (By default, -\fBweb2markdown\fR searches for an available program or text-based -browser to fetch the contents of a URL.) For example: -.IP -web2markdown \-g 'wget \-\-user=foo \-\-password=bar' mysite.com - -.SH "SEE ALSO" -\fBpandoc\fR(1), -\fBiconv\fR(1) -.SH AUTHOR -John MacFarlane and Recai Oktas diff --git a/src/wrappers/html2markdown.in b/src/wrappers/html2markdown.in new file mode 100644 index 000000000..740d69588 --- /dev/null +++ b/src/wrappers/html2markdown.in @@ -0,0 +1,176 @@ +#!/bin/sh -e +# converts HTML from a URL, file, or stdin to markdown +# uses an available program to fetch URL and tidy to normalize it first + +REQUIRED="tidy" + +### common.sh + +grab_url_with () { + url="${1:?internal error: grab_url_with: url required}" + + shift + cmdline="$@" + + prog= + prog_opts= + if [ -n "$cmdline" ]; then + eval "set -- $cmdline" + prog=$1 + shift + prog_opts="$@" + fi + + if [ -z "$prog" ]; then + # Locate a sensible web grabber (note the order). + for p in wget lynx w3m curl links w3c; do + if pathfind $p; then + prog=$p + break + fi + done + + [ -n "$prog" ] || { + errn "$THIS: Couldn't find a program to fetch the file from URL " + err "(e.g. wget, w3m, lynx, w3c, or curl)." + return 1 + } + else + pathfind "$prog" || { + err "$THIS: No such web grabber '$prog' found; aborting." + return 1 + } + fi + + # Setup proper base options for known grabbers. + base_opts= + case "$prog" in + wget) base_opts="-O-" ;; + lynx) base_opts="-source" ;; + w3m) base_opts="-dump_source" ;; + curl) base_opts="" ;; + links) base_opts="-source" ;; + w3c) base_opts="-n -get" ;; + *) err "$THIS: unhandled web grabber '$prog'; hope it succeeds." + esac + + err "$THIS: invoking '$prog $base_opts $prog_opts $url'..." + eval "set -- $base_opts $prog_opts" + $prog "$@" "$url" +} + +add_option () { + options="$options$NEWLINE$1" +} + +options= +argument= +encoding= +grabber= + +# Parse command-line arguments +while [ $# -gt 0 ]; do + case "$1" in + -h|--help) + pandoc -h 2>&1 | sed -e 's/pandoc/html2markdown/' \ + -e '/^[[:space:]]*\(-f\|-t\|-S\|-N\|-m\|-i\|-c\|-T\|-D\|-d\)/,/./d'\ + 1>&2 + err " -e ENCODING, --encoding=ENCODING" + err " Specify character encoding of input" + err " -g COMMAND, --grabber=COMMAND" + err " Specify command to be used to grab contents of URL" + exit 0 ;; + -v|--version) + pandoc -v 2>&1 | sed -e 's/pandoc/html2markdown/' 1>&2 + exit 0 ;; + -e) + shift + encoding=$1 ;; + --encoding=*) + wholeopt=$1 + # extract encoding from after = + encoding=${wholeopt#*=} ;; + -g) + shift + grabber=$1 ;; + --grabber=*) + wholeopt=$1 + # extract encoding from after = + grabber=${wholeopt#*=} ;; + -o|--output|-b|--tab-stop|-H|--include-in-header| \ + -A|--include-after-body|-C|-B|--include-before-body| \ + -C|--custom-header|-T|--title-prefix) + add_option $1 + shift + add_option $1 ;; + -*) add_option $1 ;; + *) + if [ -z "$argument" ]; then + argument=$1 + else + err "Warning: extra argument '$1' will be ignored." + fi ;; + esac + shift +done + +# Unpack options. Now "$@" will hold the pandoc options. +oldifs="$IFS"; IFS="$NEWLINE"; set -- $options; IFS="$oldifs" + +inurl= +if [ -n "$argument" ] && ! [ -f "$argument" ]; then + # Treat given argument as an URL. + inurl="$argument" +fi + +if [ -n "$inurl" ]; then + err "Attempting to fetch file from '$inurl'..." + + ### tempdir.sh + + grabber_out=$THIS_TEMPDIR/grabber.out + grabber_log=$THIS_TEMPDIR/grabber.log + if ! grab_url_with "$inurl" "$grabber" 1>$grabber_out 2>$grabber_log; then + errn "grab_url_with failed" + if [ -f $grabber_log ]; then + err " with the following error log." + err + cat >&2 $grabber_log + else + err . + fi + exit 1 + fi + + argument="$grabber_out" +fi + +if [ -z "$encoding" ] && [ "x$argument" != "x" ]; then + # Try to determine character encoding if not specified + # and input is not STDIN. + encoding=$( + head "$argument" | + LC_ALL=C tr 'A-Z' 'a-z' | + sed -ne '//dev/null | pandoc -r html -w markdown "$@" +else + if [ -f "$argument" ]; then + to_utf8 "$argument" | + tidy -utf8 2>/dev/null | pandoc -r html -w markdown "$@" + else + err "File '$argument' not found." + exit 1 + fi +fi diff --git a/src/wrappers/web2markdown.in b/src/wrappers/web2markdown.in deleted file mode 100644 index 89e884c3d..000000000 --- a/src/wrappers/web2markdown.in +++ /dev/null @@ -1,176 +0,0 @@ -#!/bin/sh -e -# converts HTML from a URL, file, or stdin to markdown -# uses an available program to fetch URL and tidy to normalize it first - -REQUIRED="tidy" - -### common.sh - -grab_url_with () { - url="${1:?internal error: grab_url_with: url required}" - - shift - cmdline="$@" - - prog= - prog_opts= - if [ -n "$cmdline" ]; then - eval "set -- $cmdline" - prog=$1 - shift - prog_opts="$@" - fi - - if [ -z "$prog" ]; then - # Locate a sensible web grabber (note the order). - for p in wget lynx w3m curl links w3c; do - if pathfind $p; then - prog=$p - break - fi - done - - [ -n "$prog" ] || { - errn "$THIS: Couldn't find a program to fetch the file from URL " - err "(e.g. wget, w3m, lynx, w3c, or curl)." - return 1 - } - else - pathfind "$prog" || { - err "$THIS: No such web grabber '$prog' found; aborting." - return 1 - } - fi - - # Setup proper base options for known grabbers. - base_opts= - case "$prog" in - wget) base_opts="-O-" ;; - lynx) base_opts="-source" ;; - w3m) base_opts="-dump_source" ;; - curl) base_opts="" ;; - links) base_opts="-source" ;; - w3c) base_opts="-n -get" ;; - *) err "$THIS: unhandled web grabber '$prog'; hope it succeeds." - esac - - err "$THIS: invoking '$prog $base_opts $prog_opts $url'..." - eval "set -- $base_opts $prog_opts" - $prog "$@" "$url" -} - -add_option () { - options="$options$NEWLINE$1" -} - -options= -argument= -encoding= -grabber= - -# Parse command-line arguments -while [ $# -gt 0 ]; do - case "$1" in - -h|--help) - pandoc -h 2>&1 | sed -e 's/pandoc/web2markdown/' \ - -e '/^[[:space:]]*\(-f\|-t\|-S\|-N\|-m\|-i\|-c\|-T\|-D\|-d\)/,/./d'\ - 1>&2 - err " -e ENCODING, --encoding=ENCODING" - err " Specify character encoding of input" - err " -g COMMAND, --grabber=COMMAND" - err " Specify command to be used to grab contents of URL" - exit 0 ;; - -v|--version) - pandoc -v 2>&1 | sed -e 's/pandoc/web2markdown/' 1>&2 - exit 0 ;; - -e) - shift - encoding=$1 ;; - --encoding=*) - wholeopt=$1 - # extract encoding from after = - encoding=${wholeopt#*=} ;; - -g) - shift - grabber=$1 ;; - --grabber=*) - wholeopt=$1 - # extract encoding from after = - grabber=${wholeopt#*=} ;; - -o|--output|-b|--tab-stop|-H|--include-in-header| \ - -A|--include-after-body|-C|-B|--include-before-body| \ - -C|--custom-header|-T|--title-prefix) - add_option $1 - shift - add_option $1 ;; - -*) add_option $1 ;; - *) - if [ -z "$argument" ]; then - argument=$1 - else - err "Warning: extra argument '$1' will be ignored." - fi ;; - esac - shift -done - -# Unpack options. Now "$@" will hold the pandoc options. -oldifs="$IFS"; IFS="$NEWLINE"; set -- $options; IFS="$oldifs" - -inurl= -if [ -n "$argument" ] && ! [ -f "$argument" ]; then - # Treat given argument as an URL. - inurl="$argument" -fi - -if [ -n "$inurl" ]; then - err "Attempting to fetch file from '$inurl'..." - - ### tempdir.sh - - grabber_out=$THIS_TEMPDIR/grabber.out - grabber_log=$THIS_TEMPDIR/grabber.log - if ! grab_url_with "$inurl" "$grabber" 1>$grabber_out 2>$grabber_log; then - errn "grab_url_with failed" - if [ -f $grabber_log ]; then - err " with the following error log." - err - cat >&2 $grabber_log - else - err . - fi - exit 1 - fi - - argument="$grabber_out" -fi - -if [ -z "$encoding" ] && [ "x$argument" != "x" ]; then - # Try to determine character encoding if not specified - # and input is not STDIN. - encoding=$( - head "$argument" | - LC_ALL=C tr 'A-Z' 'a-z' | - sed -ne '//dev/null | pandoc -r html -w markdown "$@" -else - if [ -f "$argument" ]; then - to_utf8 "$argument" | - tidy -utf8 2>/dev/null | pandoc -r html -w markdown "$@" - else - err "File '$argument' not found." - exit 1 - fi -fi diff --git a/web/demos.sh b/web/demos.sh index 6c6a2a698..bd87151d5 100644 --- a/web/demos.sh +++ b/web/demos.sh @@ -14,7 +14,7 @@ pandoc -s README.tex -o demo0.txt pandoc -s -w rst README -o demo0.txt pandoc -s README -o demo0.rtf pandoc -s -m -i -w s5 S5DEMO -o demo0.html -web2markdown http://www.gnu.org/software/make/ -o demo0.txt +html2markdown http://www.gnu.org/software/make/ -o demo0.txt markdown2pdf README -o demo0.pdf markdown2pdf -C myheader.tex README -o demo0.pdf' diff --git a/web/index.txt b/web/index.txt index 9fb86a4d9..024133487 100644 --- a/web/index.txt +++ b/web/index.txt @@ -35,7 +35,7 @@ you should extract from the zip archive and put somewhere in your PATH). See the included file `README-WINDOWS.txt` for instructions on using the program. Note: If you use [Cygwin], we recommend that you compile Pandoc from source. This will give you access to the -wrapper scripts `markdown2pdf` and `web2markdown`, which are not +wrapper scripts `markdown2pdf` and `html2markdown`, which are not included in the Windows binary package. [`@TARBALL_NAME@`]: http://pandoc.googlecode.com/files/@TARBALL_NAME@ -- cgit v1.2.3