aboutsummaryrefslogtreecommitdiff
path: root/src/wrappers/html2markdown.in
diff options
context:
space:
mode:
Diffstat (limited to 'src/wrappers/html2markdown.in')
-rw-r--r-- src/wrappers/html2markdown.in 176
1 files changed, 176 insertions, 0 deletions
diff --git a/src/wrappers/html2markdown.in b/src/wrappers/html2markdown.in
new file mode 100644
index 000000000..740d69588
--- /dev/null
+++ b/src/wrappers/html2markdown.in
@@ -0,0 +1,176 @@
+#!/bin/sh -e
+# Converts HTML from a URL, file, or stdin to markdown.
+# Uses an available program to fetch the URL, and tidy to normalize the HTML first.
+
+REQUIRED="tidy"
+
+### common.sh
+
+grab_url_with () {
+ url="${1:?internal error: grab_url_with: url required}"
+
+ shift
+ cmdline="$@"
+
+ prog=
+ prog_opts=
+ if [ -n "$cmdline" ]; then
+ eval "set -- $cmdline"
+ prog=$1
+ shift
+ prog_opts="$@"
+ fi
+
+ if [ -z "$prog" ]; then
+ # Locate a sensible web grabber (note the order).
+ for p in wget lynx w3m curl links w3c; do
+ if pathfind $p; then
+ prog=$p
+ break
+ fi
+ done
+
+ [ -n "$prog" ] || {
+ errn "$THIS: Couldn't find a program to fetch the file from URL "
+ err "(e.g. wget, w3m, lynx, w3c, or curl)."
+ return 1
+ }
+ else
+ pathfind "$prog" || {
+ err "$THIS: No such web grabber '$prog' found; aborting."
+ return 1
+ }
+ fi
+
+ # Setup proper base options for known grabbers.
+ base_opts=
+ case "$prog" in
+ wget) base_opts="-O-" ;;
+ lynx) base_opts="-source" ;;
+ w3m) base_opts="-dump_source" ;;
+ curl) base_opts="" ;;
+ links) base_opts="-source" ;;
+ w3c) base_opts="-n -get" ;;
+ *) err "$THIS: unhandled web grabber '$prog'; hope it succeeds."
+ esac
+
+ err "$THIS: invoking '$prog $base_opts $prog_opts $url'..."
+ eval "set -- $base_opts $prog_opts"
+ $prog "$@" "$url"
+}
+
+# Record one word for pandoc by appending it to the $NEWLINE-separated
+# list kept in $options.
+#   $1 - the word to append
+add_option () {
+    options="${options}${NEWLINE}${1}"
+}
+
+options=
+argument=
+encoding=
+grabber=
+
+# Parse command-line arguments.
+# -e/--encoding and -g/--grabber are consumed here; every other option is
+# recorded with add_option for pandoc, and the first non-option word
+# becomes the input file or URL.
+while [ $# -gt 0 ]; do
+    case "$1" in
+    -h|--help)
+        # Reuse pandoc's help text, renamed, with some pandoc options cut.
+        pandoc -h 2>&1 | sed -e 's/pandoc/html2markdown/' \
+            -e '/^[[:space:]]*\(-f\|-t\|-S\|-N\|-m\|-i\|-c\|-T\|-D\|-d\)/,/./d' \
+            1>&2
+        err "  -e ENCODING, --encoding=ENCODING"
+        err "                   Specify character encoding of input"
+        err "  -g COMMAND, --grabber=COMMAND"
+        err "                   Specify command to be used to grab contents of URL"
+        exit 0 ;;
+    -v|--version)
+        pandoc -v 2>&1 | sed -e 's/pandoc/html2markdown/' 1>&2
+        exit 0 ;;
+    -e)
+        # Fail with a clear message instead of dying on a later shift.
+        if [ $# -lt 2 ]; then
+            err "Option '$1' requires an argument."
+            exit 1
+        fi
+        shift
+        encoding=$1 ;;
+    --encoding=*)
+        wholeopt=$1
+        # extract encoding from after =
+        encoding=${wholeopt#*=} ;;
+    -g)
+        if [ $# -lt 2 ]; then
+            err "Option '$1' requires an argument."
+            exit 1
+        fi
+        shift
+        grabber=$1 ;;
+    --grabber=*)
+        wholeopt=$1
+        # extract grabber command from after =
+        grabber=${wholeopt#*=} ;;
+    -o|--output|-b|--tab-stop|-H|--include-in-header| \
+    -A|--include-after-body|-B|--include-before-body| \
+    -C|--custom-header|-T|--title-prefix)
+        # Pass-through options that take a value in the next word.
+        if [ $# -lt 2 ]; then
+            err "Option '$1' requires an argument."
+            exit 1
+        fi
+        # Quoted so that a value containing spaces stays one word.
+        add_option "$1"
+        shift
+        add_option "$1" ;;
+    -*) add_option "$1" ;;
+    *)
+        if [ -z "$argument" ]; then
+            argument=$1
+        else
+            err "Warning:  extra argument '$1' will be ignored."
+        fi ;;
+    esac
+    shift
+done
+
+# Unpack options.  Now "$@" will hold the pandoc options.
+# Split on $NEWLINE only, with pathname expansion switched off so that an
+# option value containing *, ? or [ is passed through literally.
+oldifs="$IFS"; IFS="$NEWLINE"
+set -f; set -- $options; set +f
+IFS="$oldifs"
+
+inurl=
+if [ -n "$argument" ] && ! [ -f "$argument" ]; then
+    # Treat given argument as an URL.
+    inurl="$argument"
+fi
+
+# When the argument is a URL, download it into the temporary directory and
+# continue with the downloaded copy as the input file.
+if [ -n "$inurl" ]; then
+    err "Attempting to fetch file from '$inurl'..."
+
+    ### tempdir.sh
+
+    # NOTE(review): THIS_TEMPDIR is not set in this file; presumably the
+    # tempdir.sh snippet above provides it -- confirm.
+    grabber_out=$THIS_TEMPDIR/grabber.out
+    grabber_log=$THIS_TEMPDIR/grabber.log
+    # Paths are quoted so a temporary directory containing whitespace works.
+    if ! grab_url_with "$inurl" "$grabber" 1>"$grabber_out" 2>"$grabber_log"; then
+        errn "grab_url_with failed"
+        if [ -f "$grabber_log" ]; then
+            err " with the following error log."
+            err
+            cat >&2 "$grabber_log"
+        else
+            err .
+        fi
+        exit 1
+    fi
+
+    argument="$grabber_out"
+fi
+
+if [ -z "$encoding" ] && [ "x$argument" != "x" ]; then
+    # Try to determine character encoding if not specified
+    # and input is not STDIN.
+    # Only the lines printed by head are examined.  They are lower-cased,
+    # then the charset of the first <meta ... content-type ... charset=...>
+    # line is taken.  Underscore is accepted so that names such as
+    # shift_jis are not cut short, and sed quits after the first match so
+    # the result is never more than one line.
+    encoding=$(
+        head "$argument" |
+        LC_ALL=C tr 'A-Z' 'a-z' |
+        sed -ne '/<meta .*content-type.*charset=/ {
+            s/.*charset=["'\'']*\([-_a-zA-Z0-9]*\).*["'\'']*/\1/p
+            q
+        }'
+    )
+fi
+
+if [ -n "$encoding" ] && pathfind iconv; then
+ alias to_utf8='iconv -f "$encoding" -t utf-8'
+else # assume UTF-8
+ alias to_utf8='cat'
+fi
+
+if [ -z "$argument" ]; then
+ tidy -utf8 2>/dev/null | pandoc -r html -w markdown "$@"
+else
+ if [ -f "$argument" ]; then
+ to_utf8 "$argument" |
+ tidy -utf8 2>/dev/null | pandoc -r html -w markdown "$@"
+ else
+ err "File '$argument' not found."
+ exit 1
+ fi
+fi