#!/bin/sh
# web2markdown: converts HTML from a URL, file, or stdin to markdown.
# Uses an available program to fetch the URL and tidy to normalize it first.
#
# Usage: web2markdown [-e ENCODING] [-g COMMAND] [html2markdown options]
#                     [URL-or-file]
# With no URL/file argument, HTML is read from stdin.
#
# Provenance (recovered from a flattened cgit v1.2.3 patch view):
#   commit:  d829c4820adbe7a7634f1c1d825d0d206512e6e7
#   From:    fiddlosopher
#   Date:    Fri, 22 Dec 2006 20:16:03 +0000
#   Subject: Merged changes from branches/wrappers since r177.
#   Summary of main changes:
#     + Added -o/--output and -d/--debug options to pandoc.
#     + Modified pandoc to behave differently depending on the name of the
#       program. For example, if the program name is 'html2latex', the
#       default reader will be html and the default writer latex.
#     + Removed most of the old wrappers, replacing them with symlinks to
#       pandoc.
#     + Rewrote markdown2pdf and created a new wrapper web2markdown, with
#       the functionality of the old html2markdown script. These new
#       scripts exploit pandoc's -d option to avoid having to do complex
#       command-line parsing.
#     + Revised man pages and documentation appropriately.
#   git-svn-id: https://pandoc.googlecode.com/svn/trunk@279 788f1e2b-df1e-0410-8736-df70ead52e1b
#   File:    src/wrappers/web2markdown.in (new file, 173 insertions,
#            blob 64ff3db9b)

# Abort on the first unhandled error.  Set here rather than on the shebang
# line so that it also applies when the script is started as "sh <script>".
set -e

REQUIRED="tidy html2markdown"

# NOTE(review): the marker line below looks like a build-time template
# placeholder.  This script uses err, errn, pathfind, $THIS and $NEWLINE
# without defining them, so they are presumably provided by whatever is
# substituted here -- confirm against the build rules.
### common.sh

# grab_url_with URL [GRABBER-COMMAND...]
#
# Fetch URL and write its contents to stdout.
#   $1    - URL to fetch (required).
#   $2... - optional grabber command line (program plus options).  It is
#           joined and re-parsed with eval, so it may contain shell quoting.
#           When empty, the first available program among wget, lynx, w3m,
#           curl, links and w3c is used.
# Progress and error messages go through err/errn.
# Returns 1 when no usable grabber is found; otherwise the exit status of
# the grabber itself.
# Globals written: url, cmdline, prog, prog_opts, base_opts, p.
grab_url_with () {
    url="${1:?internal error: grab_url_with: url required}"

    shift
    cmdline="$*"

    prog=
    if [ -n "$cmdline" ]; then
        # SECURITY NOTE: this evaluates caller-supplied text as shell code.
        # It is what allows quoted grabber options, but the grabber string
        # must come from a trusted source (here: the -g/--grabber option).
        eval "set -- $cmdline"
        if [ $# -gt 0 ]; then
            prog=$1
            shift
        fi
    else
        set --
    fi
    # From here on "$@" holds the grabber options as separate words; they
    # are passed on as-is instead of being flattened and eval'ed a second
    # time, which would lose their quoting.  prog_opts is for logging only.
    prog_opts="$*"

    if [ -z "$prog" ]; then
        # Locate a sensible web grabber (note the order).
        for p in wget lynx w3m curl links w3c; do
            if pathfind "$p"; then
                prog=$p
                break
            fi
        done

        [ -n "$prog" ] || {
            errn "$THIS: Couldn't find a program to fetch the file from URL "
            err "(e.g. wget, w3m, lynx, w3c, or curl)."
            return 1
        }
    else
        pathfind "$prog" || {
            err "$THIS: No such web grabber '$prog' found; aborting."
            return 1
        }
    fi

    # Setup proper base options for known grabbers.
    base_opts=
    case "$prog" in
        wget)  base_opts="-O-" ;;
        lynx)  base_opts="-source" ;;
        w3m)   base_opts="-dump_source" ;;
        curl)  base_opts="" ;;
        links) base_opts="-source" ;;
        w3c)   base_opts="-n -get" ;;
        *)     err "$THIS: unhandled web grabber '$prog'; hope it succeeds." ;;
    esac

    err "$THIS: invoking '$prog $base_opts $prog_opts $url'..."
    # $base_opts is deliberately unquoted: it is a fixed, space-separated
    # list of plain words taken from the table above.
    "$prog" $base_opts "$@" "$url"
}

# add_option WORD
# Append WORD to the newline-separated list in $options.  The list is later
# unpacked into "$@" for html2markdown.  WORD must not contain a newline.
add_option () {
    options="$options$NEWLINE$1"
}

# need_arg OPTION ARGC
# Exit with an error unless at least one word follows OPTION, i.e. unless
# ARGC (the caller's $#) is at least 2.
need_arg () {
    [ "$2" -ge 2 ] || {
        err "$THIS: option '$1' requires an argument."
        exit 1
    }
}

options=
argument=
encoding=
grabber=

# Parse command-line arguments
while [ $# -gt 0 ]; do
    case "$1" in
    -h|--help)
        html2markdown -h 2>&1 | sed -e 's/html2markdown/web2markdown/' 1>&2
        # NOTE(review): the exact spacing of these help lines was lost when
        # the source was flattened; confirm the alignment against
        # html2markdown's own help output.
        err "  -e ENCODING, --encoding=ENCODING"
        err "      Specify character encoding of input"
        err "  -g COMMAND, --grabber=COMMAND"
        err "      Specify command to be used to grab contents of URL"
        exit 0 ;;
    -v|--version)
        html2markdown -v
        exit 0 ;;
    -e)
        need_arg "$1" $#
        shift
        encoding=$1 ;;
    --encoding=*)
        wholeopt=$1
        # extract encoding from after =
        encoding=${wholeopt#*=} ;;
    -g)
        need_arg "$1" $#
        shift
        grabber=$1 ;;
    --grabber=*)
        wholeopt=$1
        # extract grabber command from after =
        grabber=${wholeopt#*=} ;;
    -o|--output|-b|--tab-stop|-H|--include-in-header| \
    -A|--include-after-body|-B|--include-before-body| \
    -C|--custom-header|-T|--title-prefix)
        # Options that take a value: pass both words through untouched.
        need_arg "$1" $#
        add_option "$1"
        shift
        add_option "$1" ;;
    -*) add_option "$1" ;;
    *)
        if [ -z "$argument" ]; then
            argument=$1
        else
            err "Warning: extra argument '$1' will be ignored."
        fi ;;
    esac
    shift
done

# Unpack options.  Now "$@" will hold the html2markdown options.
# Split on newlines only and with globbing off, so option values that
# contain spaces or wildcard characters survive unchanged.
oldifs="$IFS"; IFS="$NEWLINE"
set -f
set -- $options
set +f
IFS="$oldifs"

inurl=
if [ -n "$argument" ] && ! [ -f "$argument" ]; then
    # Treat given argument as an URL.
    inurl="$argument"
fi

if [ -n "$inurl" ]; then
    err "Attempting to fetch file from '$inurl'..."

    # NOTE(review): the marker line below looks like a second build-time
    # placeholder; $THIS_TEMPDIR is used right after it without being set
    # in this file, so it is presumably defined by the substituted text.
    ### tempdir.sh

    grabber_out="$THIS_TEMPDIR/grabber.out"
    grabber_log="$THIS_TEMPDIR/grabber.log"
    if ! grab_url_with "$inurl" "$grabber" 1>"$grabber_out" 2>"$grabber_log"; then
        errn "grab_url_with failed"
        if [ -f "$grabber_log" ]; then
            err " with the following error log."
            err
            cat "$grabber_log" >&2
        else
            err .
        fi
        exit 1
    fi

    argument="$grabber_out"
fi

# ---------------------------------------------------------------------------
# NOTE(review): in the recovered source everything between "sed -ne '/" and
# "/dev/null | html2markdown" was stripped (apparently eaten as an HTML tag).
# The charset detection, the to_utf8 helper and the opening of the stdin
# branch below are a reconstruction based on the surrounding visible code
# (the "to_utf8 FILE | tidy -utf8 ..." call in the file branch).  Verify
# against the upstream file before relying on the details.
# ---------------------------------------------------------------------------

if [ -z "$encoding" ] && [ "x$argument" != "x" ]; then
    # Try to determine character encoding if not specified
    # and input is not STDIN.
    # Only the first lines of the file are inspected; they are lower-cased
    # so the match is case-insensitive, and the first charset= value found
    # in a content-type <meta> element wins.
    encoding=$(
        head "$argument" |
        LC_ALL=C tr 'A-Z' 'a-z' |
        sed -ne '/<meta .*content-type.*charset=/ {
            s/.*charset=["'\'']*\([-a-z0-9_]*\).*/\1/p
            q
        }'
    )
fi

# to_utf8 FILE...
# Write FILE to stdout converted from $encoding to UTF-8.  Falls back to a
# plain copy (i.e. assumes the input already is UTF-8) when no encoding is
# known or iconv is not available.
to_utf8 () {
    if [ -n "$encoding" ] && pathfind iconv; then
        iconv -f "$encoding" -t utf-8 "$@"
    else
        cat "$@"
    fi
}

if [ -z "$argument" ]; then
    # No file or URL given: convert stdin.  tidy's diagnostics are
    # discarded on purpose; only its normalized output is wanted.
    tidy -utf8 2>/dev/null | html2markdown "$@"
else
    if [ -f "$argument" ]; then
        to_utf8 "$argument" | tidy -utf8 2>/dev/null | html2markdown "$@"
    else
        err "File '$argument' not found."
        exit 1
    fi
fi