From 997ea5ea1d02e31bb8a9b03e3db26684cc81ac59 Mon Sep 17 00:00:00 2001 From: fiddlosopher Date: Sat, 6 Feb 2010 18:55:28 +0000 Subject: Removed html2markdown and hsmarkdown. html2markdown is no longer needed, since you can pass URI arguments to pandoc and directly convert web pages. (Note, however, that pandoc assumes the pages are UTF8. html2markdown made an attempt to guess the encoding and convert them.) hsmarkdown is pointless -- a large executable that could be replaced by 'pandoc --strict'. git-svn-id: https://pandoc.googlecode.com/svn/trunk@1834 788f1e2b-df1e-0410-8736-df70ead52e1b --- html2markdown | 221 ---------------------------------------------------------- 1 file changed, 221 deletions(-) delete mode 100755 html2markdown (limited to 'html2markdown') diff --git a/html2markdown b/html2markdown deleted file mode 100755 index 0649e0478..000000000 --- a/html2markdown +++ /dev/null @@ -1,221 +0,0 @@ -#!/bin/sh -e -# converts HTML from a URL, file, or stdin to markdown -# uses an available program to fetch URL and tidy to normalize it first - -REQUIRED="tidy" -SYNOPSIS="converts HTML from a URL, file, or STDIN to markdown-formatted text." - -THIS=${0##*/} - -NEWLINE=' -' - -err () { echo "$*" | fold -s -w ${COLUMNS:-110} >&2; } -errn () { printf "$*" | fold -s -w ${COLUMNS:-110} >&2; } - -usage () { - err "$1 - $2" # short description - err "See the $1(1) man page for usage." -} - -# Portable which(1). -pathfind () { - oldifs="$IFS"; IFS=':' - for _p in $PATH; do - if [ -x "$_p/$*" ] && [ -f "$_p/$*" ]; then - IFS="$oldifs" - return 0 - fi - done - IFS="$oldifs" - return 1 -} - -for p in pandoc $REQUIRED; do - pathfind $p || { - err "You need '$p' to use this program!" - exit 1 - } -done - -CONF=$(pandoc --dump-args "$@" 2>&1) || { - errcode=$? - echo "$CONF" | sed -e '/^pandoc \[OPTIONS\] \[FILES\]/,$d' >&2 - [ $errcode -eq 2 ] && usage "$THIS" "$SYNOPSIS" - exit $errcode -} - -OUTPUT=$(echo "$CONF" | sed -ne '1p') -ARGS=$(echo "$CONF" | sed -e '1d') - - -grab_url_with () { - url="${1:?internal error: grab_url_with: url required}" - - shift - cmdline="$@" - - prog= - prog_opts= - if [ -n "$cmdline" ]; then - eval "set -- $cmdline" - prog=$1 - shift - prog_opts="$@" - fi - - if [ -z "$prog" ]; then - # Locate a sensible web grabber (note the order). - for p in wget lynx w3m curl links w3c; do - if pathfind $p; then - prog=$p - break - fi - done - - [ -n "$prog" ] || { - errn "$THIS: Couldn't find a program to fetch the file from URL " - err "(e.g. wget, w3m, lynx, w3c, or curl)." - return 1 - } - else - pathfind "$prog" || { - err "$THIS: No such web grabber '$prog' found; aborting." - return 1 - } - fi - - # Setup proper base options for known grabbers. - base_opts= - case "$prog" in - wget) base_opts="-O-" ;; - lynx) base_opts="-source" ;; - w3m) base_opts="-dump_source" ;; - curl) base_opts="" ;; - links) base_opts="-source" ;; - w3c) base_opts="-n -get" ;; - *) err "$THIS: unhandled web grabber '$prog'; hope it succeeds." - esac - - err "$THIS: invoking '$prog $base_opts $prog_opts $url'..." - eval "set -- $base_opts $prog_opts" - $prog "$@" "$url" -} - -# Parse command-line arguments -parse_arguments () { - while [ $# -gt 0 ]; do - case "$1" in - --encoding=*) - wholeopt="$1" - # extract encoding from after = - encoding="${wholeopt#*=}" ;; - -e|--encoding|-encoding) - shift - encoding="$1" ;; - --grabber=*) - wholeopt="$1" - # extract encoding from after = - grabber="\"${wholeopt#*=}\"" ;; - -g|--grabber|-grabber) - shift - grabber="$1" ;; - *) - if [ -z "$argument" ]; then - argument="$1" - else - err "Warning: extra argument '$1' will be ignored." - fi ;; - esac - shift - done -} - -argument= -encoding= -grabber= - -oldifs="$IFS" -IFS=$NEWLINE -parse_arguments $ARGS -IFS="$oldifs" - -inurl= -if [ -n "$argument" ] && ! [ -f "$argument" ]; then - # Treat given argument as an URL. - inurl="$argument" -fi - -# As a security measure refuse to proceed if mktemp is not available. -pathfind mktemp || { err "Couldn't find 'mktemp'; aborting."; exit 1; } - -# Avoid issues with /tmp directory on Windows/Cygwin -cygwin= -cygwin=$(uname | sed -ne '/^CYGWIN/p') -if [ -n "$cygwin" ]; then - TMPDIR=. - export TMPDIR -fi - -THIS_TEMPDIR= -THIS_TEMPDIR="$(mktemp -d -t $THIS.XXXXXXXX)" || exit 1 -readonly THIS_TEMPDIR - -trap 'exitcode=$? - [ -z "$THIS_TEMPDIR" ] || rm -rf "$THIS_TEMPDIR" - exit $exitcode' 0 1 2 3 13 15 - -if [ -n "$inurl" ]; then - err "Attempting to fetch file from '$inurl'..." - - grabber_out=$THIS_TEMPDIR/grabber.out - grabber_log=$THIS_TEMPDIR/grabber.log - if ! grab_url_with "$inurl" "$grabber" 1>$grabber_out 2>$grabber_log; then - errn "grab_url_with failed" - if [ -f $grabber_log ]; then - err " with the following error log." - err - cat >&2 $grabber_log - else - err . - fi - exit 1 - fi - - argument="$grabber_out" -fi - -if [ -z "$encoding" ] && [ "x$argument" != "x" ]; then - # Try to determine character encoding if not specified - # and input is not STDIN. - encoding=$( - head "$argument" | - LC_ALL=C tr 'A-Z' 'a-z' | - sed -ne '/ $htmlinput # read from STDIN -elif [ -f "$argument" ]; then - to_utf8 "$argument" > $htmlinput # read from file -else - err "File '$argument' not found." - exit 1 -fi - -if ! cat $htmlinput | pandoc --ignore-args -r html -w markdown "$@" ; then - err "Failed to parse HTML. Trying again with tidy..." - tidy -q -asxhtml -utf8 $htmlinput | \ - pandoc --ignore-args -r html -w markdown "$@" -fi -- cgit v1.2.3