#!/bin/sh -e
# converts HTML from a URL, file, or stdin to markdown
# uses an available program to fetch URL and tidy to normalize it first
#
# Usage: html2markdown [-e ENCODING] [-g COMMAND] [pandoc options] [URL|FILE]
#
# Helpers that are used but not defined in this file (err, errn, pathfind,
# and the variables THIS, NEWLINE, THIS_TEMPDIR) are presumably supplied by
# the snippets named on the '### ...' marker lines -- TODO confirm against
# the build; those marker lines are therefore kept verbatim.

REQUIRED="tidy"

### common.sh

# Fetch a URL and write its raw contents to stdout.
#
# Arguments:
#   $1    - URL to fetch (required).
#   $2... - optional grabber command line, e.g. 'curl -s -H "X: a b"'.
#           It is re-parsed with shell quoting rules, so quoted words stay
#           single arguments.  When empty, the first available program out
#           of wget, lynx, w3m, curl, links, w3c is used.
# Outputs:  fetched document on stdout; progress/diagnostics through err.
# Returns:  1 if no grabber can be found, else the grabber's exit status.
# Note:     variables are global on purpose (POSIX sh has no 'local').
grab_url_with () {
    url="${1:?internal error: grab_url_with: url required}"
    shift

    cmdline="$*"
    prog=

    if [ -n "$cmdline" ]; then
        # SECURITY: the grabber string comes straight from the user's
        # -g/--grabber option and is evaluated by the shell so that its own
        # quoting is honoured.  Never feed it untrusted input.
        eval "set -- $cmdline"
        if [ $# -gt 0 ]; then
            prog=$1
            shift
        fi
    else
        # Drop the (empty) grabber argument so "$@" holds no options.
        set --
    fi
    # From here on "$@" holds exactly the user-supplied grabber options.

    if [ -z "$prog" ]; then
        # Locate a sensible web grabber (note the order).
        for p in wget lynx w3m curl links w3c; do
            if pathfind $p; then
                prog=$p
                break
            fi
        done

        [ -n "$prog" ] || {
            errn "$THIS: Couldn't find a program to fetch the file from URL "
            err "(e.g. wget, w3m, lynx, w3c, or curl)."
            return 1
        }
    else
        pathfind "$prog" || {
            err "$THIS: No such web grabber '$prog' found; aborting."
            return 1
        }
    fi

    # Setup proper base options for known grabbers.
    base_opts=
    case "$prog" in
    wget)  base_opts="-O-" ;;
    lynx)  base_opts="-source" ;;
    w3m)   base_opts="-dump_source" ;;
    curl)  base_opts="" ;;
    links) base_opts="-source" ;;
    w3c)   base_opts="-n -get" ;;
    *)     err "$THIS: unhandled web grabber '$prog'; hope it succeeds."
    esac

    prog_opts="$*"   # flattened copy, used for the log message only
    err "$THIS: invoking '$prog $base_opts $prog_opts $url'..."

    # base_opts is one of the fixed literals above, so splitting it on
    # whitespace is intentional; the user's options keep their word
    # boundaries because they are passed on as "$@", not re-evaluated.
    set -- $base_opts "$@"
    "$prog" "$@" "$url"
}

# Append one word ($1) to the newline-separated list of pandoc options.
add_option () {
    options="$options$NEWLINE$1"
}

# Exit with a clear message unless option $1 is followed by a value.
# $2 is the number of command-line words left, counting the option itself.
require_value () {
    if [ "$2" -lt 2 ]; then
        err "Option '$1' requires an argument."
        exit 1
    fi
}

options=
argument=
encoding=
grabber=

# Parse command-line arguments
while [ $# -gt 0 ]; do
    case "$1" in
    -h|--help)
        pandoc -h 2>&1 | sed -e 's/pandoc/html2markdown/' \
            -e '/^[[:space:]]*\(-f\|-t\|-S\|-N\|-m\|-i\|-c\|-T\|-D\|-d\)/,/./d' \
            1>&2
        err " -e ENCODING, --encoding=ENCODING"
        err " Specify character encoding of input"
        err " -g COMMAND, --grabber=COMMAND"
        err " Specify command to be used to grab contents of URL"
        exit 0
        ;;
    -v|--version)
        pandoc -v 2>&1 | sed -e 's/pandoc/html2markdown/' 1>&2
        exit 0
        ;;
    -e)
        require_value "$1" $#
        shift
        encoding=$1
        ;;
    --encoding=*)
        # extract encoding from after =
        encoding=${1#*=}
        ;;
    -g)
        require_value "$1" $#
        shift
        grabber=$1
        ;;
    --grabber=*)
        # extract grabber command from after =
        grabber=${1#*=}
        ;;
    -o|--output|-b|--tab-stop|-H|--include-in-header| \
    -A|--include-after-body|-C|-B|--include-before-body| \
    --custom-header|-T|--title-prefix)
        # pandoc options that take a separate value: forward both words.
        # Quoted so a value such as "My Title" is kept in one piece.
        require_value "$1" $#
        add_option "$1"
        shift
        add_option "$1"
        ;;
    -*)
        add_option "$1"
        ;;
    *)
        if [ -z "$argument" ]; then
            argument=$1
        else
            err "Warning: extra argument '$1' will be ignored."
        fi
        ;;
    esac
    shift
done

# Unpack options.  Now "$@" will hold the pandoc options.
# Globbing is switched off while splitting so that an option value containing
# '*' or '?' is not expanded against the current directory.
oldifs="$IFS"; IFS="$NEWLINE"
set -f
set -- $options
set +f
IFS="$oldifs"

inurl=
if [ -n "$argument" ] && ! [ -f "$argument" ]; then
    # Treat given argument as an URL.
    inurl="$argument"
fi

if [ -n "$inurl" ]; then
    err "Attempting to fetch file from '$inurl'..."

### tempdir.sh

    grabber_out=$THIS_TEMPDIR/grabber.out
    grabber_log=$THIS_TEMPDIR/grabber.log
    if ! grab_url_with "$inurl" "$grabber" 1>"$grabber_out" 2>"$grabber_log"; then
        errn "grab_url_with failed"
        if [ -f "$grabber_log" ]; then
            err " with the following error log."
            err
            cat >&2 "$grabber_log"
        else
            err .
        fi
        exit 1
    fi

    # Continue as if the downloaded copy had been given as a local file.
    argument="$grabber_out"
fi

if [ -z "$encoding" ] && [ "x$argument" != "x" ]; then
    # Try to determine character encoding if not specified
    # and input is not STDIN.
    # NOTE(review): the sed script was lost from the reviewed copy of this
    # file (text between '<' and '>' had been stripped).  This version takes
    # the charset from the first <meta ... charset=...> line among the first
    # lines of the (lower-cased) input -- confirm against the upstream source.
    encoding=$(
        head "$argument" |
        LC_ALL=C tr 'A-Z' 'a-z' |
        sed -ne '/<meta .*charset=/ {
            s/.*charset=["'\'']*\([-a-z0-9_]*\).*/\1/p
            q
        }'
    )
fi

# NOTE(review): the definition of to_utf8 was in the lost span as well; it is
# re-created here as "convert from $encoding with iconv when possible,
# otherwise pass the file through unchanged" -- confirm against upstream.
if [ -n "$encoding" ] && pathfind iconv; then
    to_utf8 () { iconv -f "$encoding" -t utf-8 "$@"; }
else # assume UTF-8
    to_utf8 () { cat "$@"; }
fi

if [ -z "$argument" ]; then
    # No file or URL given: convert standard input.
    tidy -utf8 2>/dev/null | pandoc -r html -w markdown "$@"
elif [ -f "$argument" ]; then
    to_utf8 "$argument" |
    tidy -utf8 2>/dev/null | pandoc -r html -w markdown "$@"
else
    err "File '$argument' not found."
    exit 1
fi