#!/bin/sh -e
# Converts HTML to markdown.
# Uses an available program to fetch the URL, and tidy to normalize the HTML first.
REQUIRED=tidy
### common.sh
# grab_url_with URL [GRABBER_CMDLINE...]
#
# Fetches URL with a command-line web grabber and lets the grabber write the
# document to stdout.
#
# Arguments:
#   $1     URL to fetch (required; the shell aborts with an error if empty).
#   $2...  Optional grabber command line.  The words are joined with spaces
#          and re-parsed by the shell, so the first word is the program and
#          the rest are extra options; shell quoting inside the string is
#          honoured (e.g. "wget --header='X: a b'").  When empty, the first
#          available program from a fixed list is used.
# Globals written (no 'local' in POSIX sh): url cmdline prog prog_opts
#   base_opts p
# Uses pathfind, err, errn and $THIS, which are defined outside this function.
# Returns: 1 when no usable grabber is found, otherwise the grabber's status.
grab_url_with () {
    url="${1:?internal error: grab_url_with: url required}"
    shift
    cmdline="$*"
    prog=
    prog_opts=
    if [ -n "$cmdline" ]; then
        # NOTE: eval re-parses the caller-supplied command line so that
        # quoting inside it is respected.  It executes whatever the string
        # contains, so it must never be fed untrusted text.
        eval "set -- $cmdline"
        if [ $# -gt 0 ]; then
            prog=$1
            shift
        fi
        # Flattened copy for the log message only; the real options stay in
        # the positional parameters so their word boundaries survive.
        prog_opts="$*"
    else
        # Drop empty placeholder arguments (e.g. a caller passing "").
        set --
    fi

    if [ -z "$prog" ]; then
        # Locate a sensible web grabber (note the order).
        for p in wget lynx w3m curl links w3c; do
            if pathfind "$p"; then
                prog=$p
                break
            fi
        done
        [ -n "$prog" ] || {
            errn "$THIS: Couldn't find a program to fetch the file from URL "
            err "(e.g. wget, w3m, lynx, w3c, or curl)."
            return 1
        }
    else
        pathfind "$prog" || {
            err "$THIS: No such web grabber '$prog' found; aborting."
            return 1
        }
    fi

    # Setup proper base options for known grabbers.
    base_opts=
    case "$prog" in
        wget)  base_opts="-O-" ;;
        lynx)  base_opts="-source" ;;
        w3m)   base_opts="-dump_source" ;;
        curl)  base_opts="" ;;
        links) base_opts="-source" ;;
        w3c)   base_opts="-n -get" ;;
        *)     err "$THIS: unhandled web grabber '$prog'; hope it succeeds." ;;
    esac

    err "$THIS: invoking '$prog $base_opts $prog_opts $url'..."
    # base_opts only ever holds the literal words assigned above, so plain
    # word splitting is intended here; the user options are passed through
    # untouched instead of being re-evaluated a second time.
    # shellcheck disable=SC2086
    set -- $base_opts "$@"
    "$prog" "$@" "$url"
}
# Option state; each variable stays empty unless its flag is given:
#   -e ENCODING  character encoding of the input
#   -g COMMAND   command line of the web grabber to use
#   -n           refuse to treat the argument as a URL
nograb=
grabber=
encoding=
while getopts e:g:nh opt; do
    case "$opt" in
        n) nograb=1 ;;
        g) grabber=$OPTARG ;;
        e) encoding=$OPTARG ;;
        h|?)
            # -h, and any other single-character value (the '?' glob, which
            # covers what getopts reports for an invalid option), print the
            # usage line and exit with status 2.  This arm must stay last.
            usage "[-e encoding] [-g grabber_command] [-n] [-h] [input_file|url]"
            exit 2
            ;;
    esac
done
# Discard the parsed options so that "$1" is the input file or URL.
shift "$((OPTIND - 1))"
### postopts.sh
### singlearg.sh
# Classify the first argument: inurl stays empty when there is no argument or
# it names an existing regular file; otherwise the argument is taken as a URL,
# unless URL handling was disabled via nograb, which is a fatal error.
inurl=
if [ -z "$1" ] || [ -f "$1" ]; then
    :  # nothing to fetch
elif [ -z "$nograb" ]; then
    # Treat given argument as an URL.
    inurl=$1
else
    err "'$1' not found; refusing to treat input as URL."
    exit 1
fi
# When the argument was classified as a URL, download it into the temporary
# directory and continue with the downloaded copy as the only positional
# argument.  On failure the grabber's error log is replayed on stderr and the
# script exits with status 1.
if [ -n "$inurl" ]; then
    err "Attempting to fetch file from '$inurl'..."

# NOTE(review): the next line looks like a placeholder that a build step
# replaces with the contents of tempdir.sh (presumably what sets
# THIS_TEMPDIR) -- keep it verbatim at column 0; confirm against the build.
### tempdir.sh

    # Every use of these paths is quoted: the temporary directory may
    # contain whitespace or glob characters.
    grabber_out="$THIS_TEMPDIR/grabber.out"
    grabber_log="$THIS_TEMPDIR/grabber.log"
    # stdout of the grabber is the document; stderr is kept for the report.
    if ! grab_url_with "$inurl" "$grabber" 1>"$grabber_out" \
            2>"$grabber_log"; then
        errn "grab_url_with failed"
        if [ -f "$grabber_log" ]; then
            err " with the following error log."
            err
            cat >&2 "$grabber_log"
        else
            err .
        fi
        exit 1
    fi

    # From here on the downloaded file is the input.
    set -- "$grabber_out"
fi
if [ -z "$encoding" ] && [ "x$@" != "x" ]; then
# Try to determine character encoding unless not specified
# and input is STDIN.
encoding=$(
head "$@" |
LC_ALL=C tr 'A-Z' 'a-z' |
sed -ne '//dev/null |
runpandoc -r html -w markdown -s | from_utf8