diff options
Diffstat (limited to 'src/wrappers/html2markdown.in')
-rw-r--r-- | src/wrappers/html2markdown.in | 134 |
1 files changed, 0 insertions, 134 deletions
#!/bin/sh -e
# Provenance: removed side of
#   diff --git a/src/wrappers/html2markdown.in b/src/wrappers/html2markdown.in
#   deleted file mode 100644, index 0fece3ccd..000000000 (@@ -1,134 +0,0 @@)
#
# converts html to markdown
# uses an available program to fetch URL and tidy to normalize it first
#
# Usage: [-e encoding] [-g grabber_command] [-n] [-h] [input_file|url]
#
# NOTE(review): err, errn, pathfind, usage, runpandoc, from_utf8, THIS,
# THIS_TEMPDIR and HAVE_ICONV are not defined in this file; presumably they
# are supplied where the '### <name>.sh' marker lines are expanded -- confirm
# against the build that processes this .in template.

# Keep errexit even when the script is started as 'sh <file>', in which case
# the '-e' on the shebang line is not seen.
set -e

REQUIRED=tidy

### common.sh

# Fetch a URL with a web grabber program.
# Arguments:
#   $1    - URL to fetch (required).
#   $2... - optional grabber command line (program followed by its options);
#           when empty, the first available of wget, lynx, w3m, curl, links,
#           w3c is used.
# Globals: writes url, cmdline, prog, prog_opts, base_opts, p (no 'local' in
#          POSIX sh; kept global as in the original).
# Returns: 1 when no usable grabber is found, otherwise the exit status of
#          the grabber invocation.
grab_url_with () {
  url="${1:?internal error: grab_url_with: url required}"

  shift
  # "$*" rather than "$@": joining in an assignment is only well defined
  # for "$*".
  cmdline="$*"

  prog=
  prog_opts=
  if [ -n "$cmdline" ]; then
    # NOTE(review): eval re-parses the user-supplied -g value so that quoted
    # options survive word splitting; it also executes any shell syntax in
    # that value. Acceptable only because the value comes from the invoking
    # user's own command line.
    eval "set -- $cmdline"
    prog=$1
    shift
    prog_opts="$*"
  fi

  if [ -z "$prog" ]; then
    # Locate a sensible web grabber (note the order).
    for p in wget lynx w3m curl links w3c; do
      if pathfind "$p"; then
        prog=$p
        break
      fi
    done

    [ -n "$prog" ] || {
      errn "$THIS: Couldn't find a program to fetch the file from URL "
      err "(e.g. wget, w3m, lynx, w3c, or curl)."
      return 1
    }
  else
    pathfind "$prog" || {
      err "$THIS: No such web grabber '$prog' found; aborting."
      return 1
    }
  fi

  # Setup proper base options for known grabbers.
  base_opts=
  case "$prog" in
    wget)  base_opts="-O-" ;;
    lynx)  base_opts="-source" ;;
    w3m)   base_opts="-dump_source" ;;
    curl)  base_opts="" ;;
    links) base_opts="-source" ;;
    w3c)   base_opts="-n -get" ;;
    *)     err "$THIS: unhandled web grabber '$prog'; hope it succeeds." ;;
  esac

  err "$THIS: invoking '$prog $base_opts $prog_opts $url'..."
  # Same eval caveat as above: options are re-split by the shell parser.
  eval "set -- $base_opts $prog_opts"
  $prog "$@" "$url"
}

encoding=
grabber=
nograb=
while getopts e:g:nh opt; do
  case $opt in
    e) encoding="$OPTARG" ;;
    g) grabber="$OPTARG" ;;
    n) nograb=1 ;;
    # '\?' is the literal value getopts reports for an unknown option; an
    # unquoted '?' would be a glob matching any single character.
    h|\?)
      usage "[-e encoding] [-g grabber_command] [-n] [-h] [input_file|url]"
      exit 2 ;;
  esac
done

shift $((OPTIND - 1))

### postopts.sh

### singlearg.sh

# A non-empty argument that is not an existing regular file is treated as a
# URL, unless -n was given.
inurl=
if [ -n "$1" ] && ! [ -f "$1" ]; then
  if [ -n "$nograb" ]; then
    err "'$1' not found; refusing to treat input as URL."
    exit 1
  fi
  # Treat given argument as an URL.
  inurl="$1"
fi

if [ -n "$inurl" ]; then
  err "Attempting to fetch file from '$inurl'..."

  ### tempdir.sh

  grabber_out="$THIS_TEMPDIR/grabber.out"
  grabber_log="$THIS_TEMPDIR/grabber.log"
  if ! grab_url_with "$inurl" "$grabber" 1>"$grabber_out" \
      2>"$grabber_log"; then
    errn "grab_url_with failed"
    if [ -f "$grabber_log" ]; then
      err " with the following error log."
      err
      cat >&2 "$grabber_log"
    else
      err .
    fi
    exit 1
  fi

  # From here on the fetched copy is the input file.
  set -- "$grabber_out"
fi

# "$*" is a single word for any number of arguments, so this test cannot
# fail with "too many arguments" the way an expansion of "$@" inside
# [ ... ] can.
if [ -z "$encoding" ] && [ -n "$*" ]; then
  # Try to determine character encoding unless not specified
  # and input is STDIN.
  encoding=$(
    head "$@" |
    LC_ALL=C tr 'A-Z' 'a-z' |
    sed -ne '/<meta .*content-type.*charset=/ {
      s/.*charset=["'\'']*\([-a-zA-Z0-9]*\).*["'\'']*/\1/p
    }'
  )
fi

# NOTE(review): when neither branch is taken, to_utf8 must already exist
# (presumably from the included common code) -- confirm.
if [ -n "$encoding" ] && [ -n "$HAVE_ICONV" ]; then
  alias to_utf8='iconv -f "$encoding" -t utf-8'
elif [ -n "$inurl" ]; then # assume web pages are UTF-8
  alias to_utf8='cat'
fi # else just use local encoding

# tidy's diagnostics are discarded on purpose; only its normalized output
# is passed on.
to_utf8 "$@" | tidy -utf8 2>/dev/null |
runpandoc -r html -w markdown -s | from_utf8