aboutsummaryrefslogtreecommitdiff
path: root/src/wrappers/web2markdown.in
diff options
context:
space:
mode:
author fiddlosopher <fiddlosopher@788f1e2b-df1e-0410-8736-df70ead52e1b> 2006-12-22 20:16:03 +0000
committer fiddlosopher <fiddlosopher@788f1e2b-df1e-0410-8736-df70ead52e1b> 2006-12-22 20:16:03 +0000
commit d829c4820adbe7a7634f1c1d825d0d206512e6e7 (patch)
tree 2de3d3459e6f2788b3a9aede93add68503f5a588 /src/wrappers/web2markdown.in
parent cfaf0c178c422e00706eb04daea88d21a7fe9429 (diff)
download pandoc-d829c4820adbe7a7634f1c1d825d0d206512e6e7.tar.gz
Merged changes from branches/wrappers since r177.
Summary of main changes:
+ Added -o/--output and -d/--debug options to pandoc.
+ Modified pandoc to behave differently depending on the name of the program. For example, if the program name is 'html2latex', the default reader will be html and the default writer latex.
+ Removed most of the old wrappers, replacing them with symlinks to pandoc.
+ Rewrote markdown2pdf and created a new wrapper web2markdown, with the functionality of the old html2markdown script. These new scripts exploit pandoc's -d option to avoid having to do complex command-line parsing.
+ Revised man pages and documentation appropriately.
git-svn-id: https://pandoc.googlecode.com/svn/trunk@279 788f1e2b-df1e-0410-8736-df70ead52e1b
Diffstat (limited to 'src/wrappers/web2markdown.in')
-rw-r--r-- src/wrappers/web2markdown.in 173
1 files changed, 173 insertions, 0 deletions
diff --git a/src/wrappers/web2markdown.in b/src/wrappers/web2markdown.in
new file mode 100644
index 000000000..64ff3db9b
--- /dev/null
+++ b/src/wrappers/web2markdown.in
@@ -0,0 +1,173 @@
+#!/bin/sh -e
+# converts HTML from a URL, file, or stdin to markdown
+# uses an available program to fetch URL and tidy to normalize it first
+
+REQUIRED="tidy html2markdown"
+
+### common.sh
+
+grab_url_with () {
+ url="${1:?internal error: grab_url_with: url required}"
+
+ shift
+ cmdline="$@"
+
+ prog=
+ prog_opts=
+ if [ -n "$cmdline" ]; then
+ eval "set -- $cmdline"
+ prog=$1
+ shift
+ prog_opts="$@"
+ fi
+
+ if [ -z "$prog" ]; then
+ # Locate a sensible web grabber (note the order).
+ for p in wget lynx w3m curl links w3c; do
+ if pathfind $p; then
+ prog=$p
+ break
+ fi
+ done
+
+ [ -n "$prog" ] || {
+ errn "$THIS: Couldn't find a program to fetch the file from URL "
+ err "(e.g. wget, w3m, lynx, w3c, or curl)."
+ return 1
+ }
+ else
+ pathfind "$prog" || {
+ err "$THIS: No such web grabber '$prog' found; aborting."
+ return 1
+ }
+ fi
+
+ # Setup proper base options for known grabbers.
+ base_opts=
+ case "$prog" in
+ wget) base_opts="-O-" ;;
+ lynx) base_opts="-source" ;;
+ w3m) base_opts="-dump_source" ;;
+ curl) base_opts="" ;;
+ links) base_opts="-source" ;;
+ w3c) base_opts="-n -get" ;;
+ *) err "$THIS: unhandled web grabber '$prog'; hope it succeeds."
+ esac
+
+ err "$THIS: invoking '$prog $base_opts $prog_opts $url'..."
+ eval "set -- $base_opts $prog_opts"
+ $prog "$@" "$url"
+}
+
+# add_option OPT
+# Appends $NEWLINE followed by OPT to the global string $options, which
+# accumulates the options collected during command-line parsing.
+add_option () {
+  options="${options}${NEWLINE}${1}"
+}
+
+options=
+argument=
+encoding=
+grabber=
+
+# Parse command-line arguments
+while [ $# -gt 0 ]; do
+ case "$1" in
+ -h|--help)
+ html2markdown -h 2>&1 | sed -e 's/html2markdown/web2markdown/' 1>&2
+ err " -e ENCODING, --encoding=ENCODING"
+ err " Specify character encoding of input"
+ err " -g COMMAND, --grabber=COMMAND"
+ err " Specify command to be used to grab contents of URL"
+ exit 0 ;;
+ -v|--version)
+ html2markdown -v
+ exit 0 ;;
+ -e)
+ shift
+ encoding=$1 ;;
+ --encoding=*)
+ wholeopt=$1
+ # extract encoding from after =
+ encoding=${wholeopt#*=} ;;
+ -g)
+ shift
+ grabber=$1 ;;
+ --grabber=*)
+ wholeopt=$1
+ # extract encoding from after =
+ grabber=${wholeopt#*=} ;;
+ -o|--output|-b|--tab-stop|-H|--include-in-header| \
+ -A|--include-after-body|-C|-B|--include-before-body| \
+ -C|--custom-header|-T|--title-prefix)
+ add_option $1
+ shift
+ add_option $1 ;;
+ -*) add_option $1 ;;
+ *)
+ if [ -z "$argument" ]; then
+ argument=$1
+ else
+ err "Warning: extra argument '$1' will be ignored."
+ fi ;;
+ esac
+ shift
+done
+
+# Unpack options.  Now "$@" will hold the html2markdown options.
+# The list is split on $NEWLINE only.  Pathname expansion is switched off
+# during the split so that an option containing '*', '?' or '[' is passed
+# on literally instead of being expanded against the current directory.
+oldifs="$IFS"; IFS="$NEWLINE"; set -f; set -- $options; set +f; IFS="$oldifs"
+
+inurl=
+if [ -n "$argument" ] && ! [ -f "$argument" ]; then
+  # Treat given argument as an URL.
+  inurl="$argument"
+fi
+
+if [ -n "$inurl" ]; then
+  err "Attempting to fetch file from '$inurl'..."
+
+  # NOTE(review): the marker line below looks like a build-time include
+  # that provides THIS_TEMPDIR; it is kept exactly as found -- confirm
+  # against the build rules before touching it.
+ ### tempdir.sh
+
+  # Quoted throughout so that a temporary directory whose path contains
+  # whitespace still works.
+  grabber_out="$THIS_TEMPDIR/grabber.out"
+  grabber_log="$THIS_TEMPDIR/grabber.log"
+  if ! grab_url_with "$inurl" "$grabber" 1>"$grabber_out" 2>"$grabber_log"; then
+    errn "grab_url_with failed"
+    if [ -f "$grabber_log" ]; then
+      err " with the following error log."
+      err
+      cat >&2 "$grabber_log"
+    else
+      err .
+    fi
+    exit 1
+  fi
+
+  # From here on the downloaded copy is the input file.
+  argument="$grabber_out"
+fi
+
+if [ -z "$encoding" ] && [ "x$argument" != "x" ]; then
+  # Try to determine character encoding if not specified
+  # and input is not STDIN.
+  # Only the first lines of the file (the default of 'head') are scanned,
+  # lower-cased, for a <meta ... content-type ... charset=NAME> declaration.
+  # The name may contain '_', '.' and ':' (e.g. shift_jis), and sed quits
+  # after the first hit so that $encoding never holds more than one name.
+  encoding=$(
+    head "$argument" |
+    LC_ALL=C tr 'A-Z' 'a-z' |
+    sed -ne '/<meta .*content-type.*charset=/ {
+      s/.*charset=["'\'']*\([-a-zA-Z0-9_.:]*\).*["'\'']*/\1/p
+      q
+    }'
+  )
+fi
+
+# to_utf8 [FILE...]
+# Writes the given files (or stdin) to stdout, converted to UTF-8.
+# Defined as a function rather than an alias: bash does not expand aliases
+# in non-interactive scripts unless it runs in POSIX mode, whereas a
+# function behaves the same in every shell.
+if [ -n "$encoding" ] && pathfind iconv; then
+  to_utf8 () { iconv -f "$encoding" -t utf-8 "$@"; }
+else # assume UTF-8
+  to_utf8 () { cat "$@"; }
+fi
+
+if [ -z "$argument" ]; then
+ tidy -utf8 2>/dev/null | html2markdown "$@"
+else
+ if [ -f "$argument" ]; then
+ to_utf8 "$argument" | tidy -utf8 2>/dev/null | html2markdown "$@"
+ else
+ err "File '$argument' not found."
+ exit 1
+ fi
+fi