diff options
Diffstat (limited to 'html2markdown')
| -rwxr-xr-x | html2markdown | 221 | 
1 files changed, 0 insertions, 221 deletions
#!/bin/sh
# html2markdown - converts HTML from a URL, file, or stdin to markdown.
#
# Uses an available program to fetch a URL, and falls back to tidy to
# normalize the HTML when pandoc cannot parse it directly.
#
# All command-line arguments are handed to pandoc; the non-option
# arguments reported back by `pandoc --dump-args` are then interpreted
# here as:
#   -e ENC | --encoding=ENC   character encoding of the input
#   -g CMD | --grabber=CMD    program used to fetch a URL
#   ARG                       input file, or a URL when ARG is not a file

# Abort on unhandled errors.  Set here rather than on the shebang line so
# that it still applies when the script is started as `sh html2markdown`.
set -e

REQUIRED="tidy"
SYNOPSIS="converts HTML from a URL, file, or STDIN to markdown-formatted text."

THIS=${0##*/}

NEWLINE='
'

# err MESSAGE...
# Write MESSAGE followed by a newline to stderr, wrapped to the terminal
# width ($COLUMNS, default 110).
err () {
    printf '%s\n' "$*" | fold -s -w "${COLUMNS:-110}" >&2
}

# errn MESSAGE...
# Like err, but without a trailing newline.  MESSAGE is passed as data,
# never as the printf format, so '%' and '\' are printed literally.
errn () {
    printf '%s' "$*" | fold -s -w "${COLUMNS:-110}" >&2
}

# usage NAME DESCRIPTION
# Print a short description and a pointer to the man page on stderr.
usage () {
    err "$1 - $2" # short description
    err "See the $1(1) man page for usage."
}

# pathfind PROGRAM
# Portable which(1): returns 0 when an executable regular file named
# PROGRAM exists in a directory listed in $PATH, 1 otherwise.
pathfind () {
    # A private name is used for the saved IFS so that callers that keep
    # their own copy in `oldifs` are not clobbered.
    _pathfind_ifs="$IFS"; IFS=':'
    for _p in $PATH; do
        if [ -x "$_p/$1" ] && [ -f "$_p/$1" ]; then
            IFS="$_pathfind_ifs"
            return 0
        fi
    done
    IFS="$_pathfind_ifs"
    return 1
}

for p in pandoc $REQUIRED; do
    pathfind "$p" || {
        err "You need '$p' to use this program!"
        exit 1
    }
done

# Let pandoc validate the command line.  On failure, relay its message
# (minus the usage summary) and exit with pandoc's status.
CONF=$(pandoc --dump-args "$@" 2>&1) || {
    errcode=$?
    printf '%s\n' "$CONF" | sed -e '/^pandoc \[OPTIONS\] \[FILES\]/,$d' >&2
    [ "$errcode" -eq 2 ] && usage "$THIS" "$SYNOPSIS"
    exit "$errcode"
}

# The first line of the dump is not needed here and is dropped; the
# remaining lines are the arguments to interpret, one per line.
ARGS=$(printf '%s\n' "$CONF" | sed -e '1d')


# grab_url_with URL [COMMAND]
# Fetch URL and write its contents to stdout.  COMMAND, when non-empty, is
# a grabber program optionally followed by options; otherwise the first of
# wget, lynx, w3m, curl, links, w3c found in $PATH is used.
# Returns 1 when no usable grabber exists, else the grabber's exit status.
grab_url_with () {
    url="${1:?internal error: grab_url_with: url required}"

    shift
    cmdline="$*"

    prog=
    prog_opts=
    if [ -n "$cmdline" ]; then
        # NOTE: eval re-splits the user-supplied grabber command line with
        # shell quoting rules; the value comes from this script's own
        # command line and must not be fed from untrusted input.
        eval "set -- $cmdline"
        prog=$1
        shift
        prog_opts="$*"
    fi

    if [ -z "$prog" ]; then
        # Locate a sensible web grabber (note the order).
        for _candidate in wget lynx w3m curl links w3c; do
            if pathfind "$_candidate"; then
                prog=$_candidate
                break
            fi
        done

        [ -n "$prog" ] || {
            errn "$THIS:  Couldn't find a program to fetch the file from URL "
            err "(e.g. wget, w3m, lynx, w3c, or curl)."
            return 1
        }
    else
        pathfind "$prog" || {
            err "$THIS:  No such web grabber '$prog' found; aborting."
            return 1
        }
    fi

    # Setup proper base options for known grabbers.
    base_opts=
    case "$prog" in
    wget)  base_opts="-O-" ;;
    lynx)  base_opts="-source" ;;
    w3m)   base_opts="-dump_source" ;;
    curl)  base_opts="" ;;
    links) base_opts="-source" ;;
    w3c)   base_opts="-n -get" ;;
    *)     err "$THIS:  unhandled web grabber '$prog'; hope it succeeds." ;;
    esac

    err "$THIS: invoking '$prog $base_opts $prog_opts $url'..."
    eval "set -- $base_opts $prog_opts"
    "$prog" "$@" "$url"
}

# parse_arguments ARG...
# Parse the script-specific arguments into the globals `encoding`,
# `grabber` and `argument` (the first non-option argument; any further
# ones are ignored with a warning).  Returns 1 when an option that needs a
# value is the last argument.
parse_arguments () {
    while [ $# -gt 0 ]; do
        case "$1" in
            --encoding=*)
                # extract encoding from after =
                encoding="${1#*=}" ;;
            -e|--encoding|-encoding)
                [ $# -ge 2 ] || {
                    err "$THIS: option '$1' requires an argument."
                    return 1
                }
                shift
                encoding="$1" ;;
            --grabber=*)
                # extract grabber from after =
                # NOTE(review): this form wraps the value in double quotes,
                # so grab_url_with treats it as a single word, whereas the
                # `-g CMD` form below lets CMD carry options.  Preserved
                # as-is; confirm which behaviour is intended.
                grabber="\"${1#*=}\"" ;;
            -g|--grabber|-grabber)
                [ $# -ge 2 ] || {
                    err "$THIS: option '$1' requires an argument."
                    return 1
                }
                shift
                grabber="$1" ;;
            *)
                if [ -z "$argument" ]; then
                    argument="$1"
                else
                    err "Warning:  extra argument '$1' will be ignored."
                fi ;;
        esac
        shift
    done
}

argument=
encoding=
grabber=

# Split ARGS on newlines only, with pathname expansion disabled so that
# URLs containing '?', '*' or '[' are passed through untouched.
oldifs="$IFS"
IFS=$NEWLINE
set -f
parse_arguments $ARGS || exit 1
set +f
IFS="$oldifs"

inurl=
if [ -n "$argument" ] && ! [ -f "$argument" ]; then
    # Treat given argument as an URL.
    inurl="$argument"
fi

# As a security measure refuse to proceed if mktemp is not available.
pathfind mktemp || { err "Couldn't find 'mktemp'; aborting."; exit 1; }

# Avoid issues with /tmp directory on Windows/Cygwin
cygwin=$(uname | sed -ne '/^CYGWIN/p')
if [ -n "$cygwin" ]; then
    TMPDIR=.
    export TMPDIR
fi

THIS_TEMPDIR=
THIS_TEMPDIR="$(mktemp -d -t "$THIS.XXXXXXXX")" || exit 1
readonly THIS_TEMPDIR

# Remove the temporary directory on exit and on HUP/INT/QUIT/PIPE/TERM,
# preserving the exit status.
trap 'exitcode=$?
      [ -z "$THIS_TEMPDIR" ] || rm -rf "$THIS_TEMPDIR"
      exit $exitcode' 0 1 2 3 13 15

if [ -n "$inurl" ]; then
    err "Attempting to fetch file from '$inurl'..."

    grabber_out="$THIS_TEMPDIR/grabber.out"
    grabber_log="$THIS_TEMPDIR/grabber.log"
    if ! grab_url_with "$inurl" "$grabber" 1>"$grabber_out" 2>"$grabber_log"; then
        errn "grab_url_with failed"
        if [ -f "$grabber_log" ]; then
            err " with the following error log."
            err
            cat >&2 "$grabber_log"
        else
            err .
        fi
        exit 1
    fi

    argument="$grabber_out"
fi

if [ -z "$encoding" ] && [ "x$argument" != "x" ]; then
    # Try to determine character encoding if not specified
    # and input is not STDIN.
    encoding=$(
        head <"$argument" |
        LC_ALL=C tr 'A-Z' 'a-z' |
        sed -ne '/<meta .*content-type.*charset=/ {
            s/.*charset=["'\'']*\([-a-zA-Z0-9]*\).*["'\'']*/\1/p
        }'
    )
fi

# to_utf8: filter stdin to stdout, converting to UTF-8 when an encoding is
# known and iconv is available.  Defined as a function rather than an
# alias because aliases are not expanded by every non-interactive shell.
if [ -n "$encoding" ] && pathfind iconv; then
    to_utf8 () { iconv -f "$encoding" -t utf-8; }
else # assume UTF-8
    to_utf8 () { cat; }
fi

htmlinput="$THIS_TEMPDIR/htmlinput"

if [ -z "$argument" ]; then
    to_utf8 >"$htmlinput"                  # read from STDIN
elif [ -f "$argument" ]; then
    to_utf8 <"$argument" >"$htmlinput"     # read from file
else
    err "File '$argument' not found."
    exit 1
fi

if ! pandoc --ignore-args -r html -w markdown "$@" <"$htmlinput"; then
    err "Failed to parse HTML.  Trying again with tidy..."
    tidy -q -asxhtml -utf8 "$htmlinput" | \
        pandoc --ignore-args -r html -w markdown "$@"
fi
