aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author fiddlosopher <fiddlosopher@788f1e2b-df1e-0410-8736-df70ead52e1b> 2007-10-02 02:08:52 +0000
committer fiddlosopher <fiddlosopher@788f1e2b-df1e-0410-8736-df70ead52e1b> 2007-10-02 02:08:52 +0000
commit a1ad3b4e5fde28e74787dd4af609ea7aaf5b8005 (patch)
tree d20ced325f4d78f86af8c8bb2ce60ba974728290
parent 5f64258a4e077c6f2a8f0dfe429e01a9401af016 (diff)
download pandoc-a1ad3b4e5fde28e74787dd4af609ea7aaf5b8005.tar.gz
Modified html2markdown. Previously, html2markdown piped all input
through HTML Tidy before passing it to pandoc. This caused problems on certain sites (e.g. daringfireball.com/markdown) whose well-formed XHTML causes tidy to choke. The solution is to try pandoc on the original HTML first, and to run it through tidy only if that fails. This means that a temp file is now always used, even when input comes from a local file or standard input.

git-svn-id: https://pandoc.googlecode.com/svn/trunk@1039 788f1e2b-df1e-0410-8736-df70ead52e1b
-rw-r--r-- src/wrappers/html2markdown.in 25
1 files changed, 15 insertions, 10 deletions
diff --git a/src/wrappers/html2markdown.in b/src/wrappers/html2markdown.in
index ad026c24e..0f4297128 100644
--- a/src/wrappers/html2markdown.in
+++ b/src/wrappers/html2markdown.in
@@ -104,11 +104,11 @@ if [ -n "$argument" ] && ! [ -f "$argument" ]; then
inurl="$argument"
fi
+### tempdir.sh
+
if [ -n "$inurl" ]; then
err "Attempting to fetch file from '$inurl'..."
- ### tempdir.sh
-
grabber_out=$THIS_TEMPDIR/grabber.out
grabber_log=$THIS_TEMPDIR/grabber.log
if ! grab_url_with "$inurl" "$grabber" 1>$grabber_out 2>$grabber_log; then
@@ -144,14 +144,19 @@ else # assume UTF-8
alias to_utf8='cat'
fi
+htmlinput=$THIS_TEMPDIR/htmlinput
+
if [ -z "$argument" ]; then
- tidy -asxhtml -utf8 2>/dev/null | pandoc --ignore-args -r html -w markdown "$@"
+ to_utf8 > $htmlinput # read from STDIN
+elif [ -f "$argument" ]; then
+ to_utf8 "$argument" > $htmlinput # read from file
else
- if [ -f "$argument" ]; then
- to_utf8 "$argument" |
- tidy -asxhtml -utf8 2>/dev/null | pandoc --ignore-args -r html -w markdown "$@"
- else
- err "File '$argument' not found."
- exit 1
- fi
+ err "File '$argument' not found."
+ exit 1
+fi
+
+if ! cat $htmlinput | pandoc --ignore-args -r html -w markdown "$@" ; then
+ err "Failed to parse HTML. Trying again with tidy..."
+ tidy -q -asxhtml -utf8 $htmlinput | \
+ pandoc --ignore-args -r html -w markdown "$@"
fi