diff options
author | fiddlosopher <fiddlosopher@788f1e2b-df1e-0410-8736-df70ead52e1b> | 2007-10-02 02:08:52 +0000 |
---|---|---|
committer | fiddlosopher <fiddlosopher@788f1e2b-df1e-0410-8736-df70ead52e1b> | 2007-10-02 02:08:52 +0000 |
commit | a1ad3b4e5fde28e74787dd4af609ea7aaf5b8005 (patch) | |
tree | d20ced325f4d78f86af8c8bb2ce60ba974728290 | |
parent | 5f64258a4e077c6f2a8f0dfe429e01a9401af016 (diff) | |
download | pandoc-a1ad3b4e5fde28e74787dd4af609ea7aaf5b8005.tar.gz |
Modified html2markdown. Previously, html2markdown piped all input
through HTML Tidy before passing it to pandoc. This caused problems
on certain sites (e.g. daringfireball.net/projects/markdown) which have
well-formed XHTML that causes tidy to choke. The solution is to try
pandoc on the original HTML first, and run the input through tidy only
if that fails.
This means that a temp file is now always used, even when input comes
from a local file or standard input.
git-svn-id: https://pandoc.googlecode.com/svn/trunk@1039 788f1e2b-df1e-0410-8736-df70ead52e1b
-rw-r--r-- | src/wrappers/html2markdown.in | 25 |
1 file changed, 15 insertions, 10 deletions
```diff
diff --git a/src/wrappers/html2markdown.in b/src/wrappers/html2markdown.in
index ad026c24e..0f4297128 100644
--- a/src/wrappers/html2markdown.in
+++ b/src/wrappers/html2markdown.in
@@ -104,11 +104,11 @@ if [ -n "$argument" ] && ! [ -f "$argument" ]; then
     inurl="$argument"
 fi
 
+### tempdir.sh
+
 if [ -n "$inurl" ]; then
     err "Attempting to fetch file from '$inurl'..."
 
-    ### tempdir.sh
-
     grabber_out=$THIS_TEMPDIR/grabber.out
     grabber_log=$THIS_TEMPDIR/grabber.log
     if ! grab_url_with "$inurl" "$grabber" 1>$grabber_out 2>$grabber_log; then
@@ -144,14 +144,19 @@ else # assume UTF-8
     alias to_utf8='cat'
 fi
 
+htmlinput=$THIS_TEMPDIR/htmlinput
+
 if [ -z "$argument" ]; then
-    tidy -asxhtml -utf8 2>/dev/null | pandoc --ignore-args -r html -w markdown "$@"
+    to_utf8 > $htmlinput # read from STDIN
+elif [ -f "$argument" ]; then
+    to_utf8 "$argument" > $htmlinput # read from file
 else
-    if [ -f "$argument" ]; then
-        to_utf8 "$argument" |
-        tidy -asxhtml -utf8 2>/dev/null | pandoc --ignore-args -r html -w markdown "$@"
-    else
-        err "File '$argument' not found."
-        exit 1
-    fi
+    err "File '$argument' not found."
+    exit 1
+fi
+
+if ! cat $htmlinput | pandoc --ignore-args -r html -w markdown "$@" ; then
+    err "Failed to parse HTML. Trying again with tidy..."
+    tidy -q -asxhtml -utf8 $htmlinput | \
+        pandoc --ignore-args -r html -w markdown "$@"
 fi
```