diff options
Diffstat (limited to 'html2markdown')
-rw-r--r-- | html2markdown | 39 |
1 files changed, 39 insertions, 0 deletions
#!/bin/sh
# html2markdown - convert HTML to markdown.
#
# Usage: html2markdown [FILE|URL]
#
#   (no argument)  read HTML from standard input
#   FILE           an existing regular file is read directly
#   URL            anything else is treated as a URL and fetched with the
#                  first available of: wget, lynx, w3m, curl, links, w3c
#
# The HTML is first normalized by 'tidy' and then converted by 'pandoc';
# the resulting markdown is written to standard output.
# Exits with status 1 if pandoc, tidy or (for URLs) a fetch program is missing.

# Set here rather than on the shebang line so it also applies when the
# script is started as "sh html2markdown".
set -e

# Print a message on stderr and abort.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Succeed if the named program can be found by the shell.
have() {
  command -v "$1" >/dev/null 2>&1
}

# Normalize HTML with tidy and convert it to markdown with pandoc.
# With an argument, tidy reads that file; without one it reads stdin.
# tidy's diagnostics on stderr are deliberately discarded; only the
# cleaned-up HTML is passed on to pandoc.
to_markdown() {
  tidy -utf8 "$@" 2>/dev/null | pandoc -r html -w markdown -s
}

for tool in pandoc tidy; do
  have "$tool" || die "You need '$tool' to use this program!"
done

if [ "$#" -eq 0 ] || [ -z "$1" ]; then
  # No input named: convert whatever arrives on stdin.
  to_markdown
elif [ -f "$1" ]; then
  to_markdown "$1"
else
  # Treat the given argument as a URL. Locate a sensible text based
  # browser (note the order: the first one found wins).
  url=$1
  fetcher=
  for candidate in wget lynx w3m curl links w3c; do
    if have "$candidate"; then
      fetcher=$candidate
      break
    fi
  done

  # Build the fetch command as positional parameters so that every
  # option and the URL stay separate, correctly quoted words.
  case "$fetcher" in
    wget)  set -- wget -O- ;;
    lynx)  set -- lynx -source ;;
    w3m)   set -- w3m -dump_source ;;
    curl)  set -- curl ;;
    links) set -- links -source ;;
    w3c)   set -- w3c -n -get ;;
    *)
      die "Needs a program to fetch the URL (e.g. wget, lynx, w3m, curl, links or w3c)."
      ;;
  esac

  # Fetch and feed to pandoc. The fetcher's stderr (progress output and
  # error messages) is intentionally dropped to keep the output clean.
  "$@" "$url" 2>/dev/null | to_markdown
fi