diff options
Diffstat (limited to 'wrappers/html2markdown.in')
-rw-r--r-- | wrappers/html2markdown.in | 162 |
1 files changed, 0 insertions, 162 deletions
diff --git a/wrappers/html2markdown.in b/wrappers/html2markdown.in deleted file mode 100644 index 0f4297128..000000000 --- a/wrappers/html2markdown.in +++ /dev/null @@ -1,162 +0,0 @@ -#!/bin/sh -e -# converts HTML from a URL, file, or stdin to markdown -# uses an available program to fetch URL and tidy to normalize it first - -REQUIRED="tidy" -SYNOPSIS="converts HTML from a URL, file, or STDIN to markdown-formatted text." - -### common.sh - -grab_url_with () { - url="${1:?internal error: grab_url_with: url required}" - - shift - cmdline="$@" - - prog= - prog_opts= - if [ -n "$cmdline" ]; then - eval "set -- $cmdline" - prog=$1 - shift - prog_opts="$@" - fi - - if [ -z "$prog" ]; then - # Locate a sensible web grabber (note the order). - for p in wget lynx w3m curl links w3c; do - if pathfind $p; then - prog=$p - break - fi - done - - [ -n "$prog" ] || { - errn "$THIS: Couldn't find a program to fetch the file from URL " - err "(e.g. wget, w3m, lynx, w3c, or curl)." - return 1 - } - else - pathfind "$prog" || { - err "$THIS: No such web grabber '$prog' found; aborting." - return 1 - } - fi - - # Setup proper base options for known grabbers. - base_opts= - case "$prog" in - wget) base_opts="-O-" ;; - lynx) base_opts="-source" ;; - w3m) base_opts="-dump_source" ;; - curl) base_opts="" ;; - links) base_opts="-source" ;; - w3c) base_opts="-n -get" ;; - *) err "$THIS: unhandled web grabber '$prog'; hope it succeeds." - esac - - err "$THIS: invoking '$prog $base_opts $prog_opts $url'..." - eval "set -- $base_opts $prog_opts" - $prog "$@" "$url" -} - -# Parse command-line arguments -parse_arguments () { - while [ $# -gt 0 ]; do - case "$1" in - --encoding=*) - wholeopt="$1" - # extract encoding from after = - encoding="${wholeopt#*=}" ;; - -e|--encoding|-encoding) - shift - encoding="$1" ;; - --grabber=*) - wholeopt="$1" - # extract encoding from after = - grabber="\"${wholeopt#*=}\"" ;; - -g|--grabber|-grabber) - shift - grabber="$1" ;; - *) - if [ -z "$argument" ]; then - argument="$1" - else - err "Warning: extra argument '$1' will be ignored." - fi ;; - esac - shift - done -} - -argument= -encoding= -grabber= - -oldifs="$IFS" -IFS=$NEWLINE -parse_arguments $ARGS -IFS="$oldifs" - -inurl= -if [ -n "$argument" ] && ! [ -f "$argument" ]; then - # Treat given argument as an URL. - inurl="$argument" -fi - -### tempdir.sh - -if [ -n "$inurl" ]; then - err "Attempting to fetch file from '$inurl'..." - - grabber_out=$THIS_TEMPDIR/grabber.out - grabber_log=$THIS_TEMPDIR/grabber.log - if ! grab_url_with "$inurl" "$grabber" 1>$grabber_out 2>$grabber_log; then - errn "grab_url_with failed" - if [ -f $grabber_log ]; then - err " with the following error log." - err - cat >&2 $grabber_log - else - err . - fi - exit 1 - fi - - argument="$grabber_out" -fi - -if [ -z "$encoding" ] && [ "x$argument" != "x" ]; then - # Try to determine character encoding if not specified - # and input is not STDIN. - encoding=$( - head "$argument" | - LC_ALL=C tr 'A-Z' 'a-z' | - sed -ne '/<meta .*content-type.*charset=/ { - s/.*charset=["'\'']*\([-a-zA-Z0-9]*\).*["'\'']*/\1/p - }' - ) -fi - -if [ -n "$encoding" ] && pathfind iconv; then - alias to_utf8='iconv -f "$encoding" -t utf-8' -else # assume UTF-8 - alias to_utf8='cat' -fi - -htmlinput=$THIS_TEMPDIR/htmlinput - -if [ -z "$argument" ]; then - to_utf8 > $htmlinput # read from STDIN -elif [ -f "$argument" ]; then - to_utf8 "$argument" > $htmlinput # read from file -else - err "File '$argument' not found." - exit 1 -fi - -if ! cat $htmlinput | pandoc --ignore-args -r html -w markdown "$@" ; then - err "Failed to parse HTML. Trying again with tidy..." - tidy -q -asxhtml -utf8 $htmlinput | \ - pandoc --ignore-args -r html -w markdown "$@" -fi |