aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
Diffstat (limited to 'src')
-rw-r--r--src/Main.hs142
-rw-r--r--src/wrappers/checkin.sh7
-rw-r--r--src/wrappers/common.sh27
-rw-r--r--src/wrappers/getopts.sh12
-rw-r--r--src/wrappers/html2markdown.in134
-rw-r--r--src/wrappers/latex2markdown.in14
-rw-r--r--src/wrappers/markdown2html.in12
-rw-r--r--src/wrappers/markdown2latex.in12
-rw-r--r--src/wrappers/markdown2pdf.in68
-rw-r--r--src/wrappers/postopts.sh17
-rw-r--r--src/wrappers/singlearg.sh7
-rw-r--r--src/wrappers/testwrapper.in141
-rw-r--r--src/wrappers/web2markdown.in173
13 files changed, 318 insertions, 448 deletions
diff --git a/src/Main.hs b/src/Main.hs
index 542e521f6..0f8567517 100644
--- a/src/Main.hs
+++ b/src/Main.hs
@@ -45,6 +45,7 @@ import Text.Pandoc.Writers.DefaultHeaders ( defaultHtmlHeader,
defaultRTFHeader, defaultS5Header, defaultLaTeXHeader )
import Text.Pandoc.Definition
import Text.Pandoc.Shared
+import Text.Regex ( mkRegex, splitRegex )
import System ( exitWith, getArgs, getProgName )
import System.Exit
import System.Console.GetOpt
@@ -57,6 +58,9 @@ import Control.Monad ( (>>=) )
version :: String
version = "0.3"
+copyrightMessage :: String
+copyrightMessage = "\nCopyright (C) 2006 John MacFarlane\nWeb: http://sophos.berkeley.edu/macfarlane/pandoc\nThis is free software; see the source for copying conditions. There is no\nwarranty, not even for merchantability or fitness for a particular purpose."
+
-- | Association list of formats and readers.
readers :: [(String, ParserState -> String -> Pandoc)]
readers = [("native" , readPandoc)
@@ -101,10 +105,13 @@ data Opt = Opt
, optCustomHeader :: String -- ^ Custom header to use, or "DEFAULT"
, optDefaultHeader :: String -- ^ Default header
, optTitlePrefix :: String -- ^ Optional prefix for HTML title
+ , optOutputFile :: String -- ^ Name of output file
, optNumberSections :: Bool -- ^ If @True@, number sections in LaTeX
, optIncremental :: Bool -- ^ If @True@, incremental lists in S5
, optSmart :: Bool -- ^ If @True@, use smart typography
, optASCIIMathML :: Bool -- ^ If @True@, use ASCIIMathML in HTML
+ , optShowUsage :: Bool -- ^ If @True@, show usage message
+ , optDebug :: Bool -- ^ If @True@, output debug messages
}
-- | Defaults for command-line options.
@@ -123,32 +130,20 @@ startOpt = Opt
, optCustomHeader = "DEFAULT"
, optDefaultHeader = defaultHtmlHeader
, optTitlePrefix = ""
+ , optOutputFile = "" -- null for stdout
, optNumberSections = False
, optIncremental = False
, optSmart = False
, optASCIIMathML = False
+ , optShowUsage = False
+ , optDebug = False
}
-- | A list of functions, each transforming the options data structure in response
-- to a command-line option.
-options :: [OptDescr (Opt -> IO Opt)]
-options =
- [ Option "v" ["version"]
- (NoArg
- (\_ -> do
- hPutStrLn stderr ("Version " ++ version)
- exitWith ExitSuccess))
- "Print version"
-
- , Option "h" ["help"]
- (NoArg
- (\_ -> do
- prg <- getProgName
- hPutStrLn stderr (usageInfo (prg ++ " [OPTIONS] [FILES] - convert FILES from one markup format to another\nIf no OPTIONS specified, converts from markdown to html.\nIf no FILES specified, input is read from STDIN.\nOptions:") options)
- exitWith ExitSuccess))
- "Show help"
-
- , Option "fr" ["from","read"]
+allOptions :: [OptDescr (Opt -> IO Opt)]
+allOptions =
+ [ Option "fr" ["from","read"]
(ReqArg
(\arg opt -> case (lookup (map toLower arg) readers) of
Just reader -> return opt { optReader = reader }
@@ -172,6 +167,13 @@ options =
(\opt -> return opt { optStandalone = True }))
"Include needed header and footer on output"
+ , Option "o" ["output"]
+ (ReqArg
+ (\arg opt -> do
+ return opt { optOutputFile = arg })
+ "FILENAME")
+ "Name of output file"
+
, Option "p" ["preserve-tabs"]
(NoArg
(\opt -> return opt { optPreserveTabs = True }))
@@ -241,7 +243,7 @@ options =
"FILENAME")
"File to include after document body"
- , Option "" ["custom-header"]
+ , Option "C" ["custom-header"]
(ReqArg
(\arg opt -> do
text <- readFile arg
@@ -263,18 +265,87 @@ options =
let header = case (lookup arg writers) of
Just (writer, head) -> head
Nothing -> error ("Unknown reader: " ++ arg)
- hPutStrLn stdout header
+ hPutStr stdout header
exitWith ExitSuccess)
"FORMAT")
"Print default header for FORMAT"
+
+ , Option "d" ["debug"]
+ (NoArg
+ (\opt -> return opt { optDebug = True }))
+ "Print debug messages to stderr, output to stdout"
+
+ , Option "v" ["version"]
+ (NoArg
+ (\_ -> do
+ prg <- getProgName
+ hPutStrLn stderr (prg ++ " " ++ version ++
+ copyrightMessage)
+ exitWith $ ExitFailure 2))
+ "Print version"
+
+ , Option "h" ["help"]
+ (NoArg
+ (\opt -> return opt { optShowUsage = True }))
+ "Show help"
]
+
+-- parse name of calling program and return default reader and writer descriptions
+parseProgName name =
+ case (splitRegex (mkRegex "2") (map toLower name)) of
+ [from, to] -> (from, to)
+ _ -> ("markdown", "html")
+
+-- set default options based on reader and writer descriptions; start is starting options
+setDefaultOpts from to start =
+ case ((lookup from readers), (lookup to writers)) of
+ (Just reader, Just (writer, header)) -> start {optReader = reader,
+ optWriter = writer,
+ optDefaultHeader = header}
+ _ -> start
+
+-- True if single-letter option is in option list
+inOptList :: [Char] -> OptDescr (Opt -> IO Opt) -> Bool
+inOptList list desc =
+ let (Option letters _ _ _) = desc in
+ any (\x -> x `elem` list) letters
+
+-- Reformat usage message so it doesn't wrap illegibly
+reformatUsageInfo = gsub " *--" " --" .
+ gsub "(-[A-Za-z0-9]) *--" "\\1, --" .
+ gsub " *([^- ])" "\n\t\\1"
+
main = do
+ name <- getProgName
+ let (from, to) = parseProgName name
+
+ let irrelevantOptions = if not ('2' `elem` name)
+ then ""
+ else "frtwD" ++
+ (if (to /= "html" && to /= "s5") then "SmcT" else "") ++
+ (if (to /= "latex") then "N" else "") ++
+ (if (to /= "s5") then "i" else "") ++
+ (if (from /= "html" && from /= "latex") then "R" else "")
+
+ let options = filter (not . inOptList irrelevantOptions) allOptions
+
+ let defaultOpts = setDefaultOpts from to startOpt
+
args <- getArgs
- let (actions, sources, errors) = getOpt RequireOrder options args
+ let (actions, sources, errors) = getOpt Permute options args
+
+ if (not (null errors))
+ then do
+ mapM (\e -> hPutStrLn stderr e) errors
+ hPutStrLn stderr (reformatUsageInfo $
+ usageInfo (name ++ " [OPTIONS] [FILES]") options)
+ exitWith $ ExitFailure 2
+ else
+ return ()
-- thread option data structure through all supplied option actions
- opts <- foldl (>>=) (return startOpt) actions
+ opts <- foldl (>>=) (return defaultOpts) actions
let Opt { optPreserveTabs = preserveTabs
, optTabStop = tabStop
@@ -289,12 +360,31 @@ main = do
, optCustomHeader = customHeader
, optDefaultHeader = defaultHeader
, optTitlePrefix = titlePrefix
+ , optOutputFile = outputFile
, optNumberSections = numberSections
, optIncremental = incremental
, optSmart = smart
, optASCIIMathML = asciiMathML
+ , optShowUsage = showUsage
+ , optDebug = debug
} = opts
+ if showUsage
+ then do
+ hPutStr stderr (reformatUsageInfo $ usageInfo (name ++ " [OPTIONS] [FILES]") options)
+ exitWith $ ExitFailure 2
+ else return ()
+
+ output <- if ((null outputFile) || debug)
+ then return stdout
+ else openFile outputFile WriteMode
+
+ if debug
+ then do
+ hPutStrLn stderr ("OUTPUT=" ++ outputFile)
+ hPutStr stderr $ concatMap (\s -> "INPUT=" ++ s ++ "\n") sources
+ else return ()
+
let writingS5 = (defaultHeader == defaultS5Header)
let tabFilter = if preserveTabs then id else (tabsToSpaces tabStop)
let addBlank str = str ++ "\n\n"
@@ -323,13 +413,13 @@ main = do
writerIncludeBefore = includeBefore,
writerIncludeAfter = includeAfter }
- (readSources sources) >>= (putStr . encodeUTF8 . (writer writerOptions) .
+ (readSources sources) >>= (hPutStr output . encodeUTF8 .
+ (writer writerOptions) .
(reader startParserState) . filter .
- decodeUTF8 . (joinWithSep "\n"))
+ decodeUTF8 . (joinWithSep "\n")) >> hClose output
where
readSources [] = mapM readSource ["-"]
readSources sources = mapM readSource sources
- readSource "-" = getContents
+ readSource "-" = getContents
readSource source = readFile source
-
diff --git a/src/wrappers/checkin.sh b/src/wrappers/checkin.sh
deleted file mode 100644
index c9c564a23..000000000
--- a/src/wrappers/checkin.sh
+++ /dev/null
@@ -1,7 +0,0 @@
-# Check if input files exist.
-for f; do
- if [ -n "$f" ] && ! [ -f "$f" ]; then
- err "File '$f' not found."
- exit 1
- fi
-done
diff --git a/src/wrappers/common.sh b/src/wrappers/common.sh
index 99a83be50..3481affff 100644
--- a/src/wrappers/common.sh
+++ b/src/wrappers/common.sh
@@ -8,22 +8,6 @@ WRAPPEE_ARGS=
err () { echo "$*" | fold -s -w ${COLUMNS:-110} >&2; }
errn () { printf "$*" | fold -s -w ${COLUMNS:-110} >&2; }
-usage () {
- synopsis="$@"
- err "Usage: $THIS $synopsis"
- err "See $THIS(1) man file for details."
-}
-
-runpandoc () {
- if [ -n "$WRAPPEE_ARGS" ]; then
- # Unpack arguments that will be passed to pandoc.
- oldifs="$IFS"; IFS="$NEWLINE"; set -- $WRAPPEE_ARGS "$@"; IFS="$oldifs"
- case "$1" in --) shift;; esac # tolerate the existence of a leading '--'
- fi
-
- pandoc "$@"
-}
-
# Portable which(1).
pathfind () {
oldifs="$IFS"; IFS=':'
@@ -37,17 +21,6 @@ pathfind () {
return 1
}
-HAVE_ICONV=
-if pathfind iconv; then
- HAVE_ICONV=1
- alias to_utf8='iconv -t utf-8'
- alias from_utf8='iconv -f utf-8'
-else
- err "Warning: iconv not present. Assuming UTF-8 character encoding."
- alias to_utf8='cat'
- alias from_utf8='cat'
-fi
-
for p in pandoc $REQUIRED; do
pathfind $p || {
err "You need '$p' to use this program!"
diff --git a/src/wrappers/getopts.sh b/src/wrappers/getopts.sh
deleted file mode 100644
index 263263c07..000000000
--- a/src/wrappers/getopts.sh
+++ /dev/null
@@ -1,12 +0,0 @@
-if [ -z "$SYNOPSIS" ]; then
- SYNOPSIS="[-h] [input_file]"
- [ -n "$THIS_NARG" ] || SYNOPSIS="${SYNOPSIS}..."
-fi
-
-while getopts h opt; do
- case $opt in
- h|?) usage "$SYNOPSIS"; exit 2 ;;
- esac
-done
-
-shift $(($OPTIND - 1))
diff --git a/src/wrappers/html2markdown.in b/src/wrappers/html2markdown.in
deleted file mode 100644
index 0fece3ccd..000000000
--- a/src/wrappers/html2markdown.in
+++ /dev/null
@@ -1,134 +0,0 @@
-#!/bin/sh -e
-# converts html to markdown
-# uses an available program to fetch URL and tidy to normalize it first
-
-REQUIRED=tidy
-
-### common.sh
-
-grab_url_with () {
- url="${1:?internal error: grab_url_with: url required}"
-
- shift
- cmdline="$@"
-
- prog=
- prog_opts=
- if [ -n "$cmdline" ]; then
- eval "set -- $cmdline"
- prog=$1
- shift
- prog_opts="$@"
- fi
-
- if [ -z "$prog" ]; then
- # Locate a sensible web grabber (note the order).
- for p in wget lynx w3m curl links w3c; do
- if pathfind $p; then
- prog=$p
- break
- fi
- done
-
- [ -n "$prog" ] || {
- errn "$THIS: Couldn't find a program to fetch the file from URL "
- err "(e.g. wget, w3m, lynx, w3c, or curl)."
- return 1
- }
- else
- pathfind "$prog" || {
- err "$THIS: No such web grabber '$prog' found; aborting."
- return 1
- }
- fi
-
- # Setup proper base options for known grabbers.
- base_opts=
- case "$prog" in
- wget) base_opts="-O-" ;;
- lynx) base_opts="-source" ;;
- w3m) base_opts="-dump_source" ;;
- curl) base_opts="" ;;
- links) base_opts="-source" ;;
- w3c) base_opts="-n -get" ;;
- *) err "$THIS: unhandled web grabber '$prog'; hope it succeeds."
- esac
-
- err "$THIS: invoking '$prog $base_opts $prog_opts $url'..."
- eval "set -- $base_opts $prog_opts"
- $prog "$@" "$url"
-}
-
-encoding=
-grabber=
-nograb=
-while getopts e:g:nh opt; do
- case $opt in
- e) encoding="$OPTARG" ;;
- g) grabber="$OPTARG" ;;
- n) nograb=1 ;;
- h|?)
- usage "[-e encoding] [-g grabber_command] [-n] [-h] [input_file|url]"
- exit 2 ;;
- esac
-done
-
-shift $(($OPTIND - 1))
-
-### postopts.sh
-
-### singlearg.sh
-
-inurl=
-if [ -n "$1" ] && ! [ -f "$1" ]; then
- if [ -n "$nograb" ]; then
- err "'$1' not found; refusing to treat input as URL."
- exit 1
- fi
- # Treat given argument as an URL.
- inurl="$1"
-fi
-
-if [ -n "$inurl" ]; then
- err "Attempting to fetch file from '$inurl'..."
-
- ### tempdir.sh
-
- grabber_out=$THIS_TEMPDIR/grabber.out
- grabber_log=$THIS_TEMPDIR/grabber.log
- if ! grab_url_with "$inurl" "$grabber" 1>$grabber_out \
- 2>$grabber_log; then
- errn "grab_url_with failed"
- if [ -f $grabber_log ]; then
- err " with the following error log."
- err
- cat >&2 $grabber_log
- else
- err .
- fi
- exit 1
- fi
-
- set -- $grabber_out
-fi
-
-if [ -z "$encoding" ] && [ "x$@" != "x" ]; then
- # Try to determine character encoding unless not specified
- # and input is STDIN.
- encoding=$(
- head "$@" |
- LC_ALL=C tr 'A-Z' 'a-z' |
- sed -ne '/<meta .*content-type.*charset=/ {
- s/.*charset=["'\'']*\([-a-zA-Z0-9]*\).*["'\'']*/\1/p
- }'
- )
-fi
-
-if [ -n "$encoding" ] && [ -n "$HAVE_ICONV" ]; then
- alias to_utf8='iconv -f "$encoding" -t utf-8'
-elif [ -n "$inurl" ]; then # assume web pages are UTF-8
- alias to_utf8='cat'
-fi # else just use local encoding
-
-to_utf8 "$@" | tidy -utf8 2>/dev/null |
-runpandoc -r html -w markdown -s | from_utf8
diff --git a/src/wrappers/latex2markdown.in b/src/wrappers/latex2markdown.in
deleted file mode 100644
index e8cde8a97..000000000
--- a/src/wrappers/latex2markdown.in
+++ /dev/null
@@ -1,14 +0,0 @@
-#!/bin/sh -e
-# runs pandoc to convert latex to markdown
-
-### common.sh
-
-### getopts.sh
-
-### postopts.sh
-
-### singlearg.sh
-
-### checkin.sh
-
-to_utf8 "$@" | runpandoc -r latex -w markdown -s | from_utf8
diff --git a/src/wrappers/markdown2html.in b/src/wrappers/markdown2html.in
deleted file mode 100644
index e255398d2..000000000
--- a/src/wrappers/markdown2html.in
+++ /dev/null
@@ -1,12 +0,0 @@
-#!/bin/sh -e
-# converts markdown to HTML
-
-### common.sh
-
-### getopts.sh
-
-### postopts.sh
-
-### checkin.sh
-
-to_utf8 "$@" | runpandoc | from_utf8
diff --git a/src/wrappers/markdown2latex.in b/src/wrappers/markdown2latex.in
deleted file mode 100644
index c532b2f99..000000000
--- a/src/wrappers/markdown2latex.in
+++ /dev/null
@@ -1,12 +0,0 @@
-#!/bin/sh -e
-# converts markdown to latex
-
-### common.sh
-
-### getopts.sh
-
-### postopts.sh
-
-### checkin.sh
-
-to_utf8 "$@" | runpandoc -w latex -s | from_utf8
diff --git a/src/wrappers/markdown2pdf.in b/src/wrappers/markdown2pdf.in
index 838767224..c222c1cbd 100644
--- a/src/wrappers/markdown2pdf.in
+++ b/src/wrappers/markdown2pdf.in
@@ -1,64 +1,54 @@
#!/bin/sh -e
-# converts markdown to latex, then uses latex to make a PDF
-REQUIRED=pdflatex
+REQUIRED="markdown2latex pdflatex"
### common.sh
-outfile=
-while getopts o:h opt; do
- case $opt in
- o) outfile="$OPTARG" ;;
- h|?) usage "[-o output_file] [-h] [input_file]..."; exit 2 ;;
- esac
-done
-
-shift $(($OPTIND - 1))
-
-### postopts.sh
+### tempdir.sh
-### checkin.sh
+texname=output
+logfile=$THIS_TEMPDIR/log
-if [ -z "$outfile" ]; then
- if [ -n "$1" ]; then
- outfile="${1%.*}"
- else
- outfile="stdin" # input is STDIN, since no argument given
- fi
+if ! markdown2latex -s -d "$@" >$THIS_TEMPDIR/$texname.tex 2>$logfile; then
+ [ -f $logfile ] && sed -e 's/markdown2latex/markdown2pdf/g' \
+ -e '/^INPUT=/d' -e '/^OUTPUT=/d' $logfile >&2
+ exit 1
fi
-case "$outfile" in
-*.*) ;; # skip appending extension if one is already present
-*) outfile="${outfile%.*}.pdf";;
-esac
-### tempdir.sh
-
-# We should use a filename without white spaces for pdflatex.
-TEXNAME=$THIS
+outfile="$(sed -ne 's/^OUTPUT=//p' $logfile)"
+IFS="$NEWLINE"
+set -- $(sed -ne 's/^INPUT=//p' $logfile)
+firstinfilebase="${1%.*}"
+defaultdest="${firstinfilebase:-stdin}.pdf"
+destname="${outfile:-$defaultdest}"
-to_utf8 "$@" | runpandoc -w latex -s >$THIS_TEMPDIR/$TEXNAME.tex
(
cd $THIS_TEMPDIR
- if ! pdflatex -interaction=batchmode $TEXNAME.tex >/dev/null 2>&1; then
+ if ! pdflatex -interaction=batchmode $texname.tex >/dev/null 2>&1; then
err "LaTeX errors:"
- from_utf8 $TEXNAME.log | sed -ne '/^!/,/^ *$/p' >&2
- if grep -q "File \`ucs.sty' not found" $TEXNAME.log; then
- err "Please install the 'unicode' package from ctan.org."
+ sed -ne '/^!/,/^ *$/p' $texname.log >&2
+ if grep -q "File \`ucs.sty' not found" $texname.log; then
+ err "Please install the 'unicode' package from CTAN:"
+ err "http://www.ctan.org/tex-archive/macros/latex/contrib/unicode/"
+ fi
+ if grep -q "File \`fancyvrb.sty' not found" $texname.log; then
+ err "Please install the 'fancyvrb' package from CTAN:"
+ err "http://www.ctan.org/tex-archive/macros/latex/contrib/fancyvrb/"
fi
exit 1
fi
-)
+) || exit $?
is_target_exists=
-if [ -f "$outfile" ]; then
+if [ -f "$destname" ]; then
is_target_exists=1
- mv -f "$outfile" "$outfile~"
+ mv "$destname" "$destname~"
fi
-mv -f $THIS_TEMPDIR/$TEXNAME.pdf "$outfile"
+mv -f $THIS_TEMPDIR/$texname.pdf "$destname"
-errn "Created '$outfile'"
+errn "Created $destname"
[ -z "$is_target_exists" ] || {
- errn " (previous file has been backed up as '$outfile~')"
+ errn " (previous file has been backed up as $destname~)"
}
err .
diff --git a/src/wrappers/postopts.sh b/src/wrappers/postopts.sh
deleted file mode 100644
index e0d015f41..000000000
--- a/src/wrappers/postopts.sh
+++ /dev/null
@@ -1,17 +0,0 @@
-# Parse wrapper and wrappee (pandoc) arguments by taking
-# into account that they may have space or tab characters.
-pick="WRAPPER_ARGS"
-while [ $# -gt 0 ]; do
- if [ "$pick" = "WRAPPER_ARGS" ]; then
- case "$1" in
- -*) pick="WRAPPEE_ARGS" ;;
- esac
- fi
- # Pack args with NEWLINE to preserve spaces,
- # and put them into the picked variable.
- eval "$pick=\"\$${pick}${NEWLINE}${1}\""
- shift
-done
-
-# Unpack filename arguments. Now "$@" will hold the filenames.
-oldifs="$IFS"; IFS="$NEWLINE"; set -- $WRAPPER_ARGS; IFS="$oldifs"
diff --git a/src/wrappers/singlearg.sh b/src/wrappers/singlearg.sh
deleted file mode 100644
index f742d1383..000000000
--- a/src/wrappers/singlearg.sh
+++ /dev/null
@@ -1,7 +0,0 @@
-# Ensure to work with a single argument.
-if [ $# -gt 1 ]; then
- first_arg="$1"
- shift
- err "Warning: extra arguments '$@' will be ignored."
- set -- $first_arg
-fi
diff --git a/src/wrappers/testwrapper.in b/src/wrappers/testwrapper.in
deleted file mode 100644
index e025c87e7..000000000
--- a/src/wrappers/testwrapper.in
+++ /dev/null
@@ -1,141 +0,0 @@
-#!/bin/sh
-
-THIS=$1
-
-ASH="ash -s"
-BASH="bash --posix -s"
-DASH="dash -s"
-KSH="ksh -s"
-POSH="posh -s"
-ZSH="zsh -s"
-
-ERROR=""
-
-wrapper () {
- $SH -- "$@" <<-'EOF'
-### common.sh
-
-outfile=
-while getopts o: opt; do
- case $opt in
- o) outfile="$OPTARG" ;;
- esac
-done
-
-shift $(($OPTIND - 1))
-
-### postopts.sh
-
-echo "Options passed to wrapper:"
-[ -z "$outfile" ] || echo "|$outfile|"
-
-echo "Arguments passed to wrapper:"
-for arg; do
- echo "|$arg|"
-done
-
-pandoc () {
- echo "Arguments passed to wrappee:"
- for arg; do
- echo "|$arg|"
- done
-}
-runpandoc
-EOF
-}
-
-# Portable which(1).
-pathfind () {
- oldifs="$IFS"; IFS=':'
- for _p in $PATH; do
- if [ -x "$_p/$*" ] && [ -f "$_p/$*" ]; then
- IFS="$oldifs"
- return 0
- fi
- done
- IFS="$oldifs"
- return 1
-}
-
-check_results () {
- if [ "$1" = "$2" ]; then
- echo >&2 ok
- return 0
- else
- echo >&2 failed
- sed "s/^/\t/" >&2 <<EOF
-Command line: '$3'
-===> Expected:
-$2
-<=== Got:
-$1
-EOF
- return 1
- fi
-}
-
-for SH in "$BASH" "$DASH" "$KSH" "$ZSH"; do
- CMD=${SH%% *}
- echo >&2 " Testing with $CMD..."
- if pathfind "$CMD"; then
- if [ "$CMD" = "zsh" ]; then
- # Zsh needs to be called as 'sh' to enable POSIX mode.
- ln -s $(which zsh) ./sh
- SH="./sh ${SH#* }"
- trap 'err=$?; rm -f ./sh; exit $err' 0 1 2 3 13 15
- fi
-
- set -e
-
- # Test 1
- printf >&2 " test case 1... "
- actual=$(wrapper -o "output file" "foo bar" -A "quux baz" -B)
- expected=$(cat <<'EOF'
-Options passed to wrapper:
-|output file|
-Arguments passed to wrapper:
-|foo bar|
-Arguments passed to wrappee:
-|-A|
-|quux baz|
-|-B|
-EOF
-)
- check_results "$actual" "$expected" \
- 'wrapper -o "output file" "foo bar" -A "quux baz" -B'
-
- # Test 2
- printf >&2 " test case 2... "
- actual=$(wrapper -- -A "foo bar")
- expected=$(cat <<'EOF'
-Options passed to wrapper:
-Arguments passed to wrapper:
-Arguments passed to wrappee:
-|-A|
-|foo bar|
-EOF
-)
- check_results "$actual" "$expected" 'wrapper -- -A "foo bar"'
-
- # Test 3 (Test 1 with a redundant '--')
- printf >&2 " test case 3... "
- actual=$(wrapper -o "output file" "foo bar" -- -A "quux baz" -B)
- expected=$(cat <<'EOF'
-Options passed to wrapper:
-|output file|
-Arguments passed to wrapper:
-|foo bar|
-Arguments passed to wrappee:
-|-A|
-|quux baz|
-|-B|
-EOF
-)
- check_results "$actual" "$expected" \
- 'wrapper -o "output file" "foo bar" -- -A "quux baz" -B'
- else
- echo >&2 "Warning: cannot verify correctness with $CMD; shell not available"
- fi
-done
-
-exit 0
diff --git a/src/wrappers/web2markdown.in b/src/wrappers/web2markdown.in
new file mode 100644
index 000000000..64ff3db9b
--- /dev/null
+++ b/src/wrappers/web2markdown.in
@@ -0,0 +1,173 @@
+#!/bin/sh -e
+# converts HTML from a URL, file, or stdin to markdown
+# uses an available program to fetch URL and tidy to normalize it first
+
+REQUIRED="tidy html2markdown"
+
+### common.sh
+
+grab_url_with () {
+ url="${1:?internal error: grab_url_with: url required}"
+
+ shift
+ cmdline="$@"
+
+ prog=
+ prog_opts=
+ if [ -n "$cmdline" ]; then
+ eval "set -- $cmdline"
+ prog=$1
+ shift
+ prog_opts="$@"
+ fi
+
+ if [ -z "$prog" ]; then
+ # Locate a sensible web grabber (note the order).
+ for p in wget lynx w3m curl links w3c; do
+ if pathfind $p; then
+ prog=$p
+ break
+ fi
+ done
+
+ [ -n "$prog" ] || {
+ errn "$THIS: Couldn't find a program to fetch the file from URL "
+ err "(e.g. wget, w3m, lynx, w3c, or curl)."
+ return 1
+ }
+ else
+ pathfind "$prog" || {
+ err "$THIS: No such web grabber '$prog' found; aborting."
+ return 1
+ }
+ fi
+
+ # Setup proper base options for known grabbers.
+ base_opts=
+ case "$prog" in
+ wget) base_opts="-O-" ;;
+ lynx) base_opts="-source" ;;
+ w3m) base_opts="-dump_source" ;;
+ curl) base_opts="" ;;
+ links) base_opts="-source" ;;
+ w3c) base_opts="-n -get" ;;
+ *) err "$THIS: unhandled web grabber '$prog'; hope it succeeds."
+ esac
+
+ err "$THIS: invoking '$prog $base_opts $prog_opts $url'..."
+ eval "set -- $base_opts $prog_opts"
+ $prog "$@" "$url"
+}
+
+add_option () {
+ options="$options$NEWLINE$1"
+}
+
+options=
+argument=
+encoding=
+grabber=
+
+# Parse command-line arguments
+while [ $# -gt 0 ]; do
+ case "$1" in
+ -h|--help)
+ html2markdown -h 2>&1 | sed -e 's/html2markdown/web2markdown/' 1>&2
+ err " -e ENCODING, --encoding=ENCODING"
+ err " Specify character encoding of input"
+ err " -g COMMAND, --grabber=COMMAND"
+ err " Specify command to be used to grab contents of URL"
+ exit 0 ;;
+ -v|--version)
+ html2markdown -v
+ exit 0 ;;
+ -e)
+ shift
+ encoding=$1 ;;
+ --encoding=*)
+ wholeopt=$1
+ # extract encoding from after =
+ encoding=${wholeopt#*=} ;;
+ -g)
+ shift
+ grabber=$1 ;;
+ --grabber=*)
+ wholeopt=$1
+ # extract encoding from after =
+ grabber=${wholeopt#*=} ;;
+ -o|--output|-b|--tab-stop|-H|--include-in-header| \
+ -A|--include-after-body|-C|-B|--include-before-body| \
+ -C|--custom-header|-T|--title-prefix)
+ add_option $1
+ shift
+ add_option $1 ;;
+ -*) add_option $1 ;;
+ *)
+ if [ -z "$argument" ]; then
+ argument=$1
+ else
+ err "Warning: extra argument '$1' will be ignored."
+ fi ;;
+ esac
+ shift
+done
+
+# Unpack options. Now "$@" will hold the html2markdown options.
+oldifs="$IFS"; IFS="$NEWLINE"; set -- $options; IFS="$oldifs"
+
+inurl=
+if [ -n "$argument" ] && ! [ -f "$argument" ]; then
+ # Treat given argument as an URL.
+ inurl="$argument"
+fi
+
+if [ -n "$inurl" ]; then
+ err "Attempting to fetch file from '$inurl'..."
+
+ ### tempdir.sh
+
+ grabber_out=$THIS_TEMPDIR/grabber.out
+ grabber_log=$THIS_TEMPDIR/grabber.log
+ if ! grab_url_with "$inurl" "$grabber" 1>$grabber_out 2>$grabber_log; then
+ errn "grab_url_with failed"
+ if [ -f $grabber_log ]; then
+ err " with the following error log."
+ err
+ cat >&2 $grabber_log
+ else
+ err .
+ fi
+ exit 1
+ fi
+
+ argument="$grabber_out"
+fi
+
+if [ -z "$encoding" ] && [ "x$argument" != "x" ]; then
+ # Try to determine character encoding if not specified
+ # and input is not STDIN.
+ encoding=$(
+ head "$argument" |
+ LC_ALL=C tr 'A-Z' 'a-z' |
+ sed -ne '/<meta .*content-type.*charset=/ {
+ s/.*charset=["'\'']*\([-a-zA-Z0-9]*\).*["'\'']*/\1/p
+ }'
+ )
+fi
+
+if [ -n "$encoding" ] && pathfind iconv; then
+ alias to_utf8='iconv -f "$encoding" -t utf-8'
+else # assume UTF-8
+ alias to_utf8='cat'
+fi
+
+if [ -z "$argument" ]; then
+ tidy -utf8 2>/dev/null | html2markdown "$@"
+else
+ if [ -f "$argument" ]; then
+ to_utf8 "$argument" | tidy -utf8 2>/dev/null | html2markdown "$@"
+ else
+ err "File '$argument' not found."
+ exit 1
+ fi
+fi