Diffstat (limited to 'src/wrappers/html2markdown.in')
-rw-r--r--  src/wrappers/html2markdown.in | 134
1 file changed, 0 insertions, 134 deletions
diff --git a/src/wrappers/html2markdown.in b/src/wrappers/html2markdown.in
deleted file mode 100644
index 0fece3ccd..000000000
--- a/src/wrappers/html2markdown.in
+++ /dev/null
@@ -1,134 +0,0 @@
-#!/bin/sh -e
-# converts HTML to markdown
-# uses an available program to fetch a URL, and tidy to normalize the HTML first
-
-REQUIRED=tidy
-
-### common.sh
-
-grab_url_with () {
-    url="${1:?internal error: grab_url_with: url required}"
-
-    shift
-    cmdline="$@"
-
-    prog=
-    prog_opts=
-    if [ -n "$cmdline" ]; then
-        eval "set -- $cmdline"
-        prog=$1
-        shift
-        prog_opts="$@"
-    fi
-
-    if [ -z "$prog" ]; then
-        # Locate a sensible web grabber (note the order).
-        for p in wget lynx w3m curl links w3c; do
-            if pathfind $p; then
-                prog=$p
-                break
-            fi
-        done
-
-        [ -n "$prog" ] || {
-            errn "$THIS: Couldn't find a program to fetch the file from URL "
-            err "(e.g. wget, w3m, lynx, w3c, or curl)."
-            return 1
-        }
-    else
-        pathfind "$prog" || {
-            err "$THIS: No such web grabber '$prog' found; aborting."
-            return 1
-        }
-    fi
-
-    # Set up proper base options for known grabbers.
-    base_opts=
-    case "$prog" in
-        wget)  base_opts="-O-" ;;
-        lynx)  base_opts="-source" ;;
-        w3m)   base_opts="-dump_source" ;;
-        curl)  base_opts="" ;;
-        links) base_opts="-source" ;;
-        w3c)   base_opts="-n -get" ;;
-        *)     err "$THIS: unhandled web grabber '$prog'; hope it succeeds."
-    esac
-
-    err "$THIS: invoking '$prog $base_opts $prog_opts $url'..."
-    eval "set -- $base_opts $prog_opts"
-    $prog "$@" "$url"
-}
-
-encoding=
-grabber=
-nograb=
-while getopts e:g:nh opt; do
-    case $opt in
-        e) encoding="$OPTARG" ;;
-        g) grabber="$OPTARG" ;;
-        n) nograb=1 ;;
-        h|?)
-            usage "[-e encoding] [-g grabber_command] [-n] [-h] [input_file|url]"
-            exit 2 ;;
-    esac
-done
-
-shift $(($OPTIND - 1))
-
-### postopts.sh
-
-### singlearg.sh
-
-inurl=
-if [ -n "$1" ] && ! [ -f "$1" ]; then
- if [ -n "$nograb" ]; then
- err "'$1' not found; refusing to treat input as URL."
- exit 1
- fi
- # Treat given argument as an URL.
- inurl="$1"
-fi
-
-if [ -n "$inurl" ]; then
- err "Attempting to fetch file from '$inurl'..."
-
- ### tempdir.sh
-
- grabber_out=$THIS_TEMPDIR/grabber.out
- grabber_log=$THIS_TEMPDIR/grabber.log
- if ! grab_url_with "$inurl" "$grabber" 1>$grabber_out \
- 2>$grabber_log; then
- errn "grab_url_with failed"
- if [ -f $grabber_log ]; then
- err " with the following error log."
- err
- cat >&2 $grabber_log
- else
- err .
- fi
- exit 1
- fi
-
- set -- $grabber_out
-fi
-
-if [ -z "$encoding" ] && [ "x$@" != "x" ]; then
- # Try to determine character encoding unless not specified
- # and input is STDIN.
- encoding=$(
- head "$@" |
- LC_ALL=C tr 'A-Z' 'a-z' |
- sed -ne '/<meta .*content-type.*charset=/ {
- s/.*charset=["'\'']*\([-a-zA-Z0-9]*\).*["'\'']*/\1/p
- }'
- )
-fi
-
-if [ -n "$encoding" ] && [ -n "$HAVE_ICONV" ]; then
- alias to_utf8='iconv -f "$encoding" -t utf-8'
-elif [ -n "$inurl" ]; then # assume web pages are UTF-8
- alias to_utf8='cat'
-fi # else just use local encoding
-
-to_utf8 "$@" | tidy -utf8 2>/dev/null |
-runpandoc -r html -w markdown -s | from_utf8
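
For reference, a minimal usage sketch of the removed wrapper, assuming it was installed as html2markdown with the helpers from common.sh available; the file names and the example URL are placeholders, and the option letters come from the getopts loop above:

    # convert a local HTML file to markdown on stdout
    html2markdown page.html > page.md

    # fetch a page with an explicit grabber command (-g), then convert it
    html2markdown -g 'curl -s' http://example.com/ > example.md

    # declare the input encoding (-e) instead of relying on <meta> charset detection
    html2markdown -e ISO-8859-1 legacy.html > legacy.md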
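
The core pipeline the wrapper drives (grab, tidy, pandoc) can also be run by hand; a rough sketch assuming wget, tidy, and pandoc are on PATH, with the URL again a placeholder and the -r/-w spellings taken from the runpandoc call above:

    wget -O- http://example.com/ |
    tidy -utf8 2>/dev/null |
    pandoc -r html -w markdown -s > example.md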