diff options
Diffstat (limited to 'src/wrappers/html2markdown.in')
-rw-r--r-- | src/wrappers/html2markdown.in | 162 |
1 files changed, 162 insertions, 0 deletions
#!/bin/sh -e
# converts HTML from a URL, file, or stdin to markdown
# uses an available program to fetch URL and tidy to normalize it first
#
# NOTE(review): err, errn, pathfind, $THIS, $ARGS, $NEWLINE and $THIS_TEMPDIR
# are used below but never defined in this file; presumably they are provided
# by whatever is substituted for the "### common.sh" and "### tempdir.sh"
# marker lines -- confirm against the build before moving those markers.

# Re-assert errexit: the "-e" on the shebang line is not seen when the script
# is started as "sh html2markdown".
set -e

REQUIRED="tidy"
SYNOPSIS="converts HTML from a URL, file, or STDIN to markdown-formatted text."

### common.sh

# Fetch a URL and write the fetched document to stdout.
#   $1     - URL to fetch (required; the shell aborts with an
#            "internal error" message if it is missing or empty)
#   $2...  - optional grabber command line (program followed by options).
#            When empty, the first of wget, lynx, w3m, curl, links, w3c
#            that pathfind accepts is used.
# Diagnostics are emitted through err/errn.
# Returns 1 when no usable grabber is found; otherwise returns the exit
# status of the grabber itself.
grab_url_with () {
    url="${1:?internal error: grab_url_with: url required}"

    shift
    # "$*" rather than "$@": the remaining words are deliberately flattened
    # into one string, which eval re-splits below.
    cmdline="$*"

    prog=
    prog_opts=
    if [ -n "$cmdline" ]; then
        # NOTE(review): eval interprets shell metacharacters in the
        # user-supplied grabber string.  It is what splits "prog opts" into
        # words here, so it is kept, but never pass untrusted input.
        eval "set -- $cmdline"
        prog=$1
        shift
        prog_opts="$*"
    fi

    if [ -z "$prog" ]; then
        # Locate a sensible web grabber (note the order).
        for p in wget lynx w3m curl links w3c; do
            if pathfind "$p"; then
                prog=$p
                break
            fi
        done

        [ -n "$prog" ] || {
            errn "$THIS: Couldn't find a program to fetch the file from URL "
            err "(e.g. wget, w3m, lynx, w3c, or curl)."
            return 1
        }
    else
        pathfind "$prog" || {
            err "$THIS: No such web grabber '$prog' found; aborting."
            return 1
        }
    fi

    # Setup proper base options for known grabbers.
    base_opts=
    case "$prog" in
        wget)  base_opts="-O-" ;;
        lynx)  base_opts="-source" ;;
        w3m)   base_opts="-dump_source" ;;
        curl)  base_opts="" ;;
        links) base_opts="-source" ;;
        w3c)   base_opts="-n -get" ;;
        *)     err "$THIS: unhandled web grabber '$prog'; hope it succeeds." ;;
    esac

    err "$THIS: invoking '$prog $base_opts $prog_opts $url'..."
    # Rebuild the positional parameters from the option strings so that each
    # option reaches the grabber as its own word.
    eval "set -- $base_opts $prog_opts"
    "$prog" "$@" "$url"
}

# Parse command-line arguments.
# Sets the globals "encoding", "grabber" and "argument" (the first word that
# is not one of the recognised options).  Further unrecognised words are
# reported through err and ignored.  Exits with status 1 when -e/-g style
# options are given without a value.
parse_arguments () {
    while [ $# -gt 0 ]; do
        case "$1" in
            --encoding=*)
                wholeopt="$1"
                # extract encoding from after =
                encoding="${wholeopt#*=}" ;;
            -e|--encoding|-encoding)
                # Without this check the trailing "shift" below fails on an
                # empty argument list and "set -e" aborts without a message.
                if [ $# -lt 2 ]; then
                    err "$THIS: option '$1' requires an argument."
                    exit 1
                fi
                shift
                encoding="$1" ;;
            --grabber=*)
                wholeopt="$1"
                # extract grabber from after =
                # NOTE(review): this form wraps the value in literal double
                # quotes, so grab_url_with's eval keeps it as ONE word, while
                # the "-g VALUE" form below is stored unquoted and is split
                # into program + options.  Confirm which one is intended.
                grabber="\"${wholeopt#*=}\"" ;;
            -g|--grabber|-grabber)
                if [ $# -lt 2 ]; then
                    err "$THIS: option '$1' requires an argument."
                    exit 1
                fi
                shift
                grabber="$1" ;;
            *)
                if [ -z "$argument" ]; then
                    argument="$1"
                else
                    err "Warning: extra argument '$1' will be ignored."
                fi ;;
        esac
        shift
    done
}

argument=
encoding=
grabber=

# $ARGS is intentionally left unquoted: with IFS set to $NEWLINE it is split
# into one word per line before being handed to parse_arguments.
oldifs="$IFS"
IFS=$NEWLINE
parse_arguments $ARGS
IFS="$oldifs"

inurl=
if [ -n "$argument" ] && ! [ -f "$argument" ]; then
    # Treat given argument as an URL.
    inurl="$argument"
fi

### tempdir.sh

if [ -n "$inurl" ]; then
    err "Attempting to fetch file from '$inurl'..."

    grabber_out=$THIS_TEMPDIR/grabber.out
    grabber_log=$THIS_TEMPDIR/grabber.log
    if ! grab_url_with "$inurl" "$grabber" 1>"$grabber_out" 2>"$grabber_log"; then
        errn "grab_url_with failed"
        if [ -f "$grabber_log" ]; then
            err " with the following error log."
            err
            cat "$grabber_log" >&2
        else
            err .
        fi
        exit 1
    fi

    # From here on the downloaded copy is treated like a local input file.
    argument="$grabber_out"
fi

if [ -z "$encoding" ] && [ "x$argument" != "x" ]; then
    # Try to determine character encoding if not specified
    # and input is not STDIN.
    encoding=$(
        head "$argument" |
        LC_ALL=C tr 'A-Z' 'a-z' |
        sed -ne '/<meta .*content-type.*charset=/ {
            s/.*charset=["'\'']*\([-a-zA-Z0-9]*\).*["'\'']*/\1/p
        }'
    )
fi

# to_utf8 [FILE]: copy FILE (or stdin) to stdout, converting to UTF-8 when an
# encoding is known and iconv is available.  Defined as a function rather than
# an alias because bash does not expand aliases in non-interactive shells
# unless it runs in POSIX mode or has expand_aliases set.
if [ -n "$encoding" ] && pathfind iconv; then
    to_utf8 () { iconv -f "$encoding" -t utf-8 "$@"; }
else # assume UTF-8
    to_utf8 () { cat "$@"; }
fi

htmlinput=$THIS_TEMPDIR/htmlinput

if [ -z "$argument" ]; then
    to_utf8 > "$htmlinput"               # read from STDIN
elif [ -f "$argument" ]; then
    to_utf8 "$argument" > "$htmlinput"   # read from file
else
    err "File '$argument' not found."
    exit 1
fi

# First try the HTML as-is; only if pandoc rejects it, normalise it with tidy
# and try once more.
if ! pandoc --ignore-args -r html -w markdown "$@" < "$htmlinput"; then
    err "Failed to parse HTML. Trying again with tidy..."
    tidy -q -asxhtml -utf8 "$htmlinput" | \
        pandoc --ignore-args -r html -w markdown "$@"
fi