#!/bin/sh -e
# converts HTML from a URL, file, or stdin to markdown
# uses an available program to fetch URL and tidy to normalize it first
REQUIRED="tidy html2markdown"
### common.sh
# grab_url_with URL [COMMAND]
#
# Fetch URL with a web grabber and let the grabber write to stdout.
#
# Arguments:
#   URL      address to fetch (required; the shell aborts with an error
#            message if it is missing or empty).
#   COMMAND  optional grabber command line: a program name optionally
#            followed by options, written with shell quoting.  When empty,
#            the first of wget, lynx, w3m, curl, links, w3c for which
#            pathfind succeeds is used.
# Globals written (POSIX sh has no 'local'):
#   url, cmdline, prog, prog_opts, base_opts, p
# Uses pathfind, err, errn and $THIS, which are defined outside this function.
# Returns:
#   1 if no usable grabber is found; otherwise the grabber's exit status.
grab_url_with () {
    url="${1:?internal error: grab_url_with: url required}"
    shift
    cmdline="$*"
    prog=
    prog_opts=
    # From here on "$@" holds only the grabber's own options.
    set --
    if [ -n "$cmdline" ]; then
        # eval is used on purpose so that quoting inside COMMAND is honoured
        # (e.g. --header='A: b' stays one word).  NOTE: this executes any
        # shell syntax contained in COMMAND; only pass trusted input.
        eval "set -- $cmdline"
        # A COMMAND consisting only of whitespace yields no words at all;
        # shifting then would be an error.
        if [ $# -gt 0 ]; then
            prog=$1
            shift
        fi
        # Flattened copy, used only for the log message below.
        prog_opts="$*"
    fi
    if [ -z "$prog" ]; then
        # Locate a sensible web grabber (note the order).
        for p in wget lynx w3m curl links w3c; do
            if pathfind "$p"; then
                prog=$p
                break
            fi
        done
        if [ -z "$prog" ]; then
            errn "$THIS:  Couldn't find a program to fetch the file from URL "
            err "(e.g. wget, w3m, lynx, w3c, or curl)."
            return 1
        fi
    elif ! pathfind "$prog"; then
        err "$THIS:  No such web grabber '$prog' found; aborting."
        return 1
    fi
    # Setup proper base options for known grabbers.
    base_opts=
    case "$prog" in
        wget)  base_opts="-O-" ;;
        lynx)  base_opts="-source" ;;
        w3m)   base_opts="-dump_source" ;;
        curl)  base_opts="" ;;
        links) base_opts="-source" ;;
        w3c)   base_opts="-n -get" ;;
        *)     err "$THIS:  unhandled web grabber '$prog'; hope it succeeds." ;;
    esac
    err "$THIS: invoking '$prog $base_opts $prog_opts $url'..."
    # base_opts only ever holds the fixed words from the table above, so it
    # is split unquoted on purpose.  The user's options are taken from "$@"
    # as already parsed, instead of being re-evaluated from a flattened
    # string, which would split or re-expand them a second time.
    set -- $base_opts "$@"
    "$prog" "$@" "$url"
}
# add_option OPTION
# Append OPTION to the global list $options, preceded by $NEWLINE as the
# separator between entries.
add_option () {
    options="${options}${NEWLINE}${1}"
}
# State filled in by the argument loop below:
#   options   newline-separated list of options collected via add_option
#             (later unpacked and handed to html2markdown)
#   argument  first non-option argument (input file name or URL)
#   encoding  value of -e / --encoding
#   grabber   value of -g / --grabber
options=
argument=
encoding=
grabber=
# Parse command-line arguments
while [ $# -gt 0 ]; do
    case "$1" in
        -h|--help)
            html2markdown -h 2>&1 | sed -e 's/html2markdown/web2markdown/' 1>&2
            err "  -e ENCODING, --encoding=ENCODING"
            err "        Specify character encoding of input"
            err "  -g COMMAND, --grabber=COMMAND"
            err "        Specify command to be used to grab contents of URL"
            exit 0 ;;
        -v|--version)
            html2markdown -v
            exit 0 ;;
        -e)
            # Fail with a clear message instead of shifting past the end.
            if [ $# -lt 2 ]; then
                err "Option '$1' requires an argument."
                exit 1
            fi
            shift
            encoding=$1 ;;
        --encoding=*)
            wholeopt=$1
            # extract encoding from after =
            encoding=${wholeopt#*=} ;;
        -g)
            if [ $# -lt 2 ]; then
                err "Option '$1' requires an argument."
                exit 1
            fi
            shift
            grabber=$1 ;;
        --grabber=*)
            wholeopt=$1
            # extract grabber command from after =
            grabber=${wholeopt#*=} ;;
        -o|--output|-b|--tab-stop|-H|--include-in-header| \
        -A|--include-after-body|-C|--custom-header| \
        -B|--include-before-body|-T|--title-prefix)
            # Pass-through options that take a value: store the option and
            # its value as two separate entries.
            if [ $# -lt 2 ]; then
                err "Option '$1' requires an argument."
                exit 1
            fi
            # Quoted so that a value containing whitespace (e.g. a title
            # prefix of several words) is stored whole, not just its first
            # word.
            add_option "$1"
            shift
            add_option "$1" ;;
        -*) add_option "$1" ;;
        *)
            if [ -z "$argument" ]; then
                argument=$1
            else
                err "Warning: extra argument '$1' will be ignored."
            fi ;;
    esac
    shift
done
# Unpack options.  Now "$@" will hold the html2markdown options.
# $options is split on $NEWLINE only.  Pathname expansion is switched off
# for the split (unless it already was) so that an option value containing
# glob characters such as '*' is kept literally instead of being replaced by
# matching file names.
oldifs="$IFS"
IFS="$NEWLINE"
case $- in
    *f*) set -- $options ;;
    *)   set -f; set -- $options; set +f ;;
esac
IFS="$oldifs"
# An argument that is not an existing regular file is treated as a URL.
inurl=
if [ -n "$argument" ] && ! [ -f "$argument" ]; then
    # Treat given argument as an URL.
    inurl="$argument"
fi
if [ -n "$inurl" ]; then
    err "Attempting to fetch file from '$inurl'..."
    # NOTE(review): the marker line below is presumably replaced at build
    # time by code that sets THIS_TEMPDIR -- confirm; it must stay as is.
    ### tempdir.sh
    # Paths are quoted everywhere below so a temporary directory whose name
    # contains whitespace does not break the redirections or the tests.
    grabber_out="$THIS_TEMPDIR/grabber.out"
    grabber_log="$THIS_TEMPDIR/grabber.log"
    if ! grab_url_with "$inurl" "$grabber" 1>"$grabber_out" 2>"$grabber_log"; then
        errn "grab_url_with failed"
        if [ -f "$grabber_log" ]; then
            err " with the following error log."
            err
            cat >&2 "$grabber_log"
        else
            err .
        fi
        exit 1
    fi
    # From here on the downloaded copy is processed like a local file.
    argument="$grabber_out"
fi
# Guess the input encoding when none was given, then run the input through
# tidy and html2markdown.
# NOTE(review): this section looks truncated in this copy.  The sed script
# that starts below has no closing quote, the command substitution is never
# closed, and the text jumps straight to a "... | html2markdown" pipeline
# followed by an 'else' branch.  The helper 'to_utf8' used further down is
# also not defined in the visible part of the file.  Restore the missing
# lines from the upstream source before relying on this section.
if [ -z "$encoding" ] && [ "x$argument" != "x" ]; then
    # Try to determine character encoding if not specified
    # and input is not STDIN.
    encoding=$(
        head "$argument" |
        LC_ALL=C tr 'A-Z' 'a-z' |
        sed -ne '//dev/null | html2markdown "$@"
else
    if [ -f "$argument" ]; then
        to_utf8 "$argument" | tidy -utf8 2>/dev/null | html2markdown "$@"
    else
        err "File '$argument' not found."
        exit 1
    fi
fi