diff options
Diffstat (limited to 'src/wrappers/html2markdown.in')
-rw-r--r-- | src/wrappers/html2markdown.in | 176 |
1 files changed, 176 insertions, 0 deletions
#!/bin/sh -e
# html2markdown (reconstructed from the diff that adds
# src/wrappers/html2markdown.in as a new file)
#
# converts HTML from a URL, file, or stdin to markdown
# uses an available program to fetch URL and tidy to normalize it first
#
# Usage: html2markdown [-e ENCODING] [-g COMMAND] [pandoc options] [FILE|URL]
#
#   -e ENCODING, --encoding=ENCODING  character encoding of the input
#   -g COMMAND,  --grabber=COMMAND    command used to fetch a URL
#   any other option is collected and handed to pandoc unchanged.
#
# NOTE(review): err, errn, pathfind, NEWLINE, THIS and THIS_TEMPDIR are not
# defined in this file; presumably they come from the "### common.sh" and
# "### tempdir.sh" include markers below -- confirm against the build.

# The -e on the shebang is lost when the script is started as
# "sh html2markdown", so request errexit explicitly as well.
set -e

REQUIRED="tidy"

### common.sh

# grab_url_with URL [COMMAND]
#
# Fetch URL and write its contents to stdout using COMMAND (a program name
# optionally followed by options, given as one shell-quoted string).  When
# COMMAND is empty, the first available program from a fixed list is used.
# Progress/diagnostic messages go through err/errn.
# Returns 1 when no usable program is found; otherwise returns the status
# of the fetch program.
# Globals written: url, cmdline, prog, prog_opts, base_opts, p
# (plain sh has no 'local').
grab_url_with () {
    url="${1:?internal error: grab_url_with: url required}"

    shift
    # Everything after the URL is joined into one command-line string.
    cmdline="$*"

    prog=
    prog_opts=
    if [ -n "$cmdline" ]; then
        # NOTE(review): eval is what lets the caller pass quoted options in
        # a single string (-g 'curl -H "A: b"'), but it also runs any shell
        # syntax embedded in that string.  Only use with trusted input.
        eval "set -- $cmdline"
        if [ $# -gt 0 ]; then
            prog=$1
            shift
        fi
        # Flattened copy, used only for the log message below; the real
        # options stay in "$@" so their quoting survives.
        prog_opts="$*"
    else
        # Drop the (empty) grabber argument so "$@" holds no options.
        set --
    fi

    if [ -z "$prog" ]; then
        # Locate a sensible web grabber (note the order).
        for p in wget lynx w3m curl links w3c; do
            if pathfind "$p"; then
                prog=$p
                break
            fi
        done

        [ -n "$prog" ] || {
            errn "$THIS: Couldn't find a program to fetch the file from URL "
            err "(e.g. wget, w3m, lynx, w3c, or curl)."
            return 1
        }
    else
        pathfind "$prog" || {
            err "$THIS: No such web grabber '$prog' found; aborting."
            return 1
        }
    fi

    # Setup proper base options for known grabbers.
    base_opts=
    case "$prog" in
        wget)  base_opts="-O-" ;;
        lynx)  base_opts="-source" ;;
        w3m)   base_opts="-dump_source" ;;
        curl)  base_opts="" ;;
        links) base_opts="-source" ;;
        w3c)   base_opts="-n -get" ;;
        *)     err "$THIS: unhandled web grabber '$prog'; hope it succeeds." ;;
    esac

    err "$THIS: invoking '$prog $base_opts $prog_opts $url'..."
    # base_opts only ever holds the fixed, space-separated literals from the
    # table above, so splitting it unquoted is intentional.  The user's own
    # options are kept as separate words in "$@".
    set -- $base_opts "$@"
    "$prog" "$@" "$url"
}

# add_option WORD
#
# Append WORD to the newline-separated list kept in $options.
add_option () {
    options="$options$NEWLINE$1"
}

# need_value OPTION [VALUE ...]
#
# Abort with a message unless OPTION is followed by at least one more word.
# Call as: need_value "$@"
need_value () {
    [ $# -ge 2 ] || {
        err "Option '$1' requires an argument."
        exit 1
    }
}

options=
argument=
encoding=
grabber=

# Parse command-line arguments
while [ $# -gt 0 ]; do
    case "$1" in
    -h|--help)
        # Reuse pandoc's help text, drop the options listed in the sed
        # range expression, then append the two wrapper-specific options.
        pandoc -h 2>&1 | sed -e 's/pandoc/html2markdown/' \
            -e '/^[[:space:]]*\(-f\|-t\|-S\|-N\|-m\|-i\|-c\|-T\|-D\|-d\)/,/./d' \
            1>&2
        err " -e ENCODING, --encoding=ENCODING"
        err " Specify character encoding of input"
        err " -g COMMAND, --grabber=COMMAND"
        err " Specify command to be used to grab contents of URL"
        exit 0 ;;
    -v|--version)
        pandoc -v 2>&1 | sed -e 's/pandoc/html2markdown/' 1>&2
        exit 0 ;;
    -e)
        need_value "$@"
        shift
        encoding=$1 ;;
    --encoding=*)
        # extract encoding from after =
        encoding=${1#*=} ;;
    -g)
        need_value "$@"
        shift
        grabber=$1 ;;
    --grabber=*)
        # extract grabber command from after =
        grabber=${1#*=} ;;
    -o|--output|-b|--tab-stop|-H|--include-in-header| \
    -A|--include-after-body|-B|--include-before-body| \
    -C|--custom-header|-T|--title-prefix)
        # Options that take a value: forward both words to pandoc.
        need_value "$@"
        add_option "$1"
        shift
        add_option "$1" ;;
    -*)
        add_option "$1" ;;
    *)
        if [ -z "$argument" ]; then
            argument=$1
        else
            err "Warning: extra argument '$1' will be ignored."
        fi ;;
    esac
    shift
done

# Unpack options.  Now "$@" will hold the pandoc options.
# Split on newlines only, with globbing off so a value such as '*' is not
# expanded to filenames.
oldifs="$IFS"
IFS="$NEWLINE"
set -f
set -- $options
set +f
IFS="$oldifs"

inurl=
if [ -n "$argument" ] && ! [ -f "$argument" ]; then
    # Treat given argument as an URL.
    inurl="$argument"
fi

if [ -n "$inurl" ]; then
    err "Attempting to fetch file from '$inurl'..."

    ### tempdir.sh

    grabber_out=$THIS_TEMPDIR/grabber.out
    grabber_log=$THIS_TEMPDIR/grabber.log
    if ! grab_url_with "$inurl" "$grabber" 1>"$grabber_out" 2>"$grabber_log"; then
        errn "grab_url_with failed"
        if [ -f "$grabber_log" ]; then
            err " with the following error log."
            err
            cat >&2 "$grabber_log"
        else
            err .
        fi
        exit 1
    fi

    # From here on the downloaded copy is the input file.
    argument="$grabber_out"
fi

if [ -z "$encoding" ] && [ -n "$argument" ]; then
    # Try to determine character encoding if not specified
    # and input is not STDIN.  Only the part of the file printed by 'head'
    # is searched, lower-cased, for a <meta ... content-type ... charset=>
    # declaration.
    encoding=$(
        head "$argument" |
        LC_ALL=C tr 'A-Z' 'a-z' |
        sed -ne '/<meta .*content-type.*charset=/ {
            s/.*charset=["'\'']*\([-a-zA-Z0-9]*\).*["'\'']*/\1/p
        }'
    )
fi

# to_utf8 [FILE...]: emit the input (files, or stdin when none) as UTF-8.
# A function is used rather than an alias: aliases are not expanded by
# every shell in non-interactive mode and depend on parse order.
if [ -n "$encoding" ] && pathfind iconv; then
    to_utf8 () { iconv -f "$encoding" -t utf-8 "$@"; }
else # assume UTF-8
    to_utf8 () { cat "$@"; }
fi

if [ -z "$argument" ]; then
    # Input on stdin: still honour an explicit -e/--encoding.
    to_utf8 | tidy -utf8 2>/dev/null | pandoc -r html -w markdown "$@"
elif [ -f "$argument" ]; then
    to_utf8 "$argument" |
        tidy -utf8 2>/dev/null | pandoc -r html -w markdown "$@"
else
    err "File '$argument' not found."
    exit 1
fi