#!/bin/sh
# html2markdown
#
# Converts HTML from a URL, file, or stdin to markdown.
# Uses an available program to fetch the URL and tidy to normalize it first.
#
# Provenance (metadata of the patch this script was carried in):
#   From 2ba5ff94833c9fee0fb2799e5440d8b235f9410e Mon Sep 17 00:00:00 2001
#   From: fiddlosopher
#   Date: Sat, 9 Aug 2008 23:45:14 +0000
#   Subject: Include shell scripts themselves in repo, rather than
#            generating from wrappers.
#     + Removed wrappers directory
#     + Removed wrappers Makefile target
#     + Added hsmarkdown, html2markdown, and markdown2pdf
#   git-svn-id: https://pandoc.googlecode.com/svn/trunk@1387 788f1e2b-df1e-0410-8736-df70ead52e1b
#   html2markdown: new file, mode 100755, index 000000000..0649e0478
#   (1 file changed, 221 insertions(+)) -- cgit v1.2.3

# Abort on unhandled errors.  Set here rather than on the shebang line so
# that it still applies when the script is run as "sh html2markdown".
set -e

REQUIRED="tidy"
SYNOPSIS="converts HTML from a URL, file, or STDIN to markdown-formatted text."

THIS=${0##*/}

NEWLINE='
'

# err MESSAGE...
#   Print MESSAGE plus a newline to stderr, wrapped to the terminal width.
err () { printf '%s\n' "$*" | fold -s -w "${COLUMNS:-110}" >&2; }

# errn MESSAGE...
#   Like err, but without the trailing newline.  MESSAGE is passed as data,
#   never as the printf format, so '%' and '\' in it are printed literally.
errn () { printf '%s' "$*" | fold -s -w "${COLUMNS:-110}" >&2; }

# usage NAME SYNOPSIS
#   Print a short description and a pointer to the man page on stderr.
usage () {
    err "$1 - $2" # short description
    err "See the $1(1) man page for usage."
}

# pathfind PROGRAM
#   Portable which(1): return 0 if PROGRAM is an executable regular file in
#   some directory of $PATH, 1 otherwise.  Prints nothing.
pathfind () {
    # Use a private name for the saved IFS so the top-level "oldifs" is
    # never clobbered by a call to this function.
    _pf_ifs="$IFS"; IFS=':'
    for _p in $PATH; do
        if [ -x "$_p/$1" ] && [ -f "$_p/$1" ]; then
            IFS="$_pf_ifs"
            return 0
        fi
    done
    IFS="$_pf_ifs"
    return 1
}

for p in pandoc $REQUIRED; do
    pathfind "$p" || {
        err "You need '$p' to use this program!"
        exit 1
    }
done

# Ask pandoc to parse the command line.  On failure, relay its message
# (minus the generic usage text) and exit with pandoc's status.
CONF=$(pandoc --dump-args "$@" 2>&1) || {
    errcode=$?
    printf '%s\n' "$CONF" | sed -e '/^pandoc \[OPTIONS\] \[FILES\]/,$d' >&2
    [ $errcode -eq 2 ] && usage "$THIS" "$SYNOPSIS"
    exit $errcode
}

# First line of the dump is kept in OUTPUT; the remaining lines are the
# arguments this script parses itself.
OUTPUT=$(printf '%s\n' "$CONF" | sed -ne '1p')
ARGS=$(printf '%s\n' "$CONF" | sed -e '1d')


# grab_url_with URL [COMMANDLINE]
#   Fetch URL and write its contents to stdout.
#   COMMANDLINE, if non-empty, is a shell-quoted "program [options]" string;
#   otherwise the first available program of a built-in list is used.
#   Returns 1 if no usable program is found, else the program's status.
grab_url_with () {
    url="${1:?internal error: grab_url_with: url required}"

    shift
    cmdline="$*"

    prog=
    prog_opts=
    if [ -n "$cmdline" ]; then
        # NOTE: cmdline comes from the user's --grabber option and is
        # evaluated by the shell on purpose, so it can carry options.
        eval "set -- $cmdline"
        prog=$1
        shift
        prog_opts="$*"
    fi

    if [ -z "$prog" ]; then
        # Locate a sensible web grabber (note the order).
        for p in wget lynx w3m curl links w3c; do
            if pathfind "$p"; then
                prog=$p
                break
            fi
        done

        [ -n "$prog" ] || {
            errn "$THIS: Couldn't find a program to fetch the file from URL "
            err "(e.g. wget, w3m, lynx, w3c, or curl)."
            return 1
        }
    else
        pathfind "$prog" || {
            err "$THIS: No such web grabber '$prog' found; aborting."
            return 1
        }
    fi

    # Setup proper base options for known grabbers.
    base_opts=
    case "$prog" in
        wget)  base_opts="-O-" ;;
        lynx)  base_opts="-source" ;;
        w3m)   base_opts="-dump_source" ;;
        curl)  base_opts="" ;;
        links) base_opts="-source" ;;
        w3c)   base_opts="-n -get" ;;
        *)     err "$THIS: unhandled web grabber '$prog'; hope it succeeds."
    esac

    err "$THIS: invoking '$prog $base_opts $prog_opts $url'..."
    eval "set -- $base_opts $prog_opts"
    "$prog" "$@" "$url"
}

# parse_arguments ARG...
#   Parse command-line arguments into the globals "encoding", "grabber" and
#   "argument" (the first non-option argument; later ones are ignored with
#   a warning).
parse_arguments () {
    while [ $# -gt 0 ]; do
        case "$1" in
            --encoding=*)
                # extract encoding from after =
                encoding="${1#*=}" ;;
            -e|--encoding|-encoding)
                # Fail with a clear message instead of a bare "shift" error
                # when the option is the last argument.
                [ $# -gt 1 ] || {
                    err "$THIS: option '$1' requires an argument."
                    exit 1
                }
                shift
                encoding="$1" ;;
            --grabber=*)
                # extract grabber from after =
                # NOTE(review): this form wraps the value in literal quotes,
                # so grab_url_with sees it as a single word, whereas the
                # "-g VALUE" form below is split into program and options.
                # Behaviour kept as-is; confirm which one is intended.
                grabber="\"${1#*=}\"" ;;
            -g|--grabber|-grabber)
                [ $# -gt 1 ] || {
                    err "$THIS: option '$1' requires an argument."
                    exit 1
                }
                shift
                grabber="$1" ;;
            *)
                if [ -z "$argument" ]; then
                    argument="$1"
                else
                    err "Warning: extra argument '$1' will be ignored."
                fi ;;
        esac
        shift
    done
}

argument=
encoding=
grabber=

# ARGS holds one argument per line: split on newlines only (deliberately
# unquoted expansion).
oldifs="$IFS"
IFS=$NEWLINE
parse_arguments $ARGS
IFS="$oldifs"

inurl=
if [ -n "$argument" ] && ! [ -f "$argument" ]; then
    # Treat given argument as a URL.
    inurl="$argument"
fi

# As a security measure refuse to proceed if mktemp is not available.
pathfind mktemp || { err "Couldn't find 'mktemp'; aborting."; exit 1; }

# Avoid issues with /tmp directory on Windows/Cygwin
cygwin=
cygwin=$(uname | sed -ne '/^CYGWIN/p')
if [ -n "$cygwin" ]; then
    TMPDIR=.
    export TMPDIR
fi

# Install the cleanup handler before creating the directory so a signal
# arriving right after mktemp cannot leak it; the -z guard makes the
# handler a no-op while THIS_TEMPDIR is still empty.
THIS_TEMPDIR=
trap 'exitcode=$?
      [ -z "$THIS_TEMPDIR" ] || rm -rf "$THIS_TEMPDIR"
      exit $exitcode' 0 1 2 3 13 15

THIS_TEMPDIR="$(mktemp -d -t "$THIS.XXXXXXXX")" || exit 1
readonly THIS_TEMPDIR

if [ -n "$inurl" ]; then
    err "Attempting to fetch file from '$inurl'..."

    grabber_out=$THIS_TEMPDIR/grabber.out
    grabber_log=$THIS_TEMPDIR/grabber.log
    if ! grab_url_with "$inurl" "$grabber" 1>"$grabber_out" 2>"$grabber_log"; then
        errn "grab_url_with failed"
        if [ -f "$grabber_log" ]; then
            err " with the following error log."
            err
            cat >&2 "$grabber_log"
        else
            err .
        fi
        exit 1
    fi

    # From here on, treat the downloaded copy as the input file.
    argument="$grabber_out"
fi

# NOTE(review): in the copy of this script that was reviewed, the text
# between "sed -ne '/" below and the first "> $htmlinput" redirection was
# lost (it looks like everything from a "<" to the next ">" was stripped).
# The sed program, the to_utf8 definition and the htmlinput assignment are
# therefore a reconstruction of what the surviving lines require, not a
# verbatim copy -- verify against the upstream html2markdown before use.
if [ -z "$encoding" ] && [ "x$argument" != "x" ]; then
    # Try to determine character encoding if not specified
    # and input is not STDIN.
    encoding=$(
        head "$argument" |
        LC_ALL=C tr 'A-Z' 'a-z' |
        sed -ne '/<meta .*content-type.*charset=/ {
            s/.*charset=["'\'']*\([-a-zA-Z0-9]*\).*["'\'']*/\1/p
        }'
    )
fi

# to_utf8 [FILE]
#   Copy FILE (or stdin) to stdout, converting from $encoding to UTF-8 when
#   an encoding is known and iconv is available; otherwise assume UTF-8 and
#   pass the data through unchanged.
to_utf8 () {
    if [ -n "$encoding" ] && pathfind iconv; then
        iconv -f "$encoding" -t utf-8 "$@"
    else
        cat "$@"
    fi
}

htmlinput=$THIS_TEMPDIR/htmlinput

if [ -z "$argument" ]; then
    to_utf8 > "$htmlinput"               # read from STDIN
elif [ -f "$argument" ]; then
    to_utf8 "$argument" > "$htmlinput"   # read from file
else
    err "File '$argument' not found."
    exit 1
fi

# First try the HTML as-is; only if pandoc rejects it, normalize it with
# tidy and try once more.
if ! pandoc --ignore-args -r html -w markdown "$@" < "$htmlinput"; then
    err "Failed to parse HTML. Trying again with tidy..."
    tidy -q -asxhtml -utf8 "$htmlinput" | \
        pandoc --ignore-args -r html -w markdown "$@"
fi