aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorfiddlosopher <fiddlosopher@788f1e2b-df1e-0410-8736-df70ead52e1b>2006-12-22 20:16:03 +0000
committerfiddlosopher <fiddlosopher@788f1e2b-df1e-0410-8736-df70ead52e1b>2006-12-22 20:16:03 +0000
commitd829c4820adbe7a7634f1c1d825d0d206512e6e7 (patch)
tree2de3d3459e6f2788b3a9aede93add68503f5a588 /src
parentcfaf0c178c422e00706eb04daea88d21a7fe9429 (diff)
downloadpandoc-d829c4820adbe7a7634f1c1d825d0d206512e6e7.tar.gz
Merged changes from branches/wrappers since r177.
Summary of main changes: + Added -o/--output and -d/--debug options to pandoc. + Modified pandoc to behave differently depending on the name of the program. For example, if the program name is 'html2latex', the default reader will be html and the default writer latex. + Removed most of the old wrappers, replacing them with symlinks to pandoc. + Rewrote markdown2pdf and created a new wrapper web2markdown, with the functionality of the old html2markdown script. These new scripts exploit pandoc's -d option to avoid having to do complex command-line parsing. + Revised man pages and documentation appropriately. git-svn-id: https://pandoc.googlecode.com/svn/trunk@279 788f1e2b-df1e-0410-8736-df70ead52e1b
Diffstat (limited to 'src')
-rw-r--r--src/Main.hs142
-rw-r--r--src/wrappers/checkin.sh7
-rw-r--r--src/wrappers/common.sh27
-rw-r--r--src/wrappers/getopts.sh12
-rw-r--r--src/wrappers/html2markdown.in134
-rw-r--r--src/wrappers/latex2markdown.in14
-rw-r--r--src/wrappers/markdown2html.in12
-rw-r--r--src/wrappers/markdown2latex.in12
-rw-r--r--src/wrappers/markdown2pdf.in68
-rw-r--r--src/wrappers/postopts.sh17
-rw-r--r--src/wrappers/singlearg.sh7
-rw-r--r--src/wrappers/testwrapper.in141
-rw-r--r--src/wrappers/web2markdown.in173
13 files changed, 318 insertions, 448 deletions
diff --git a/src/Main.hs b/src/Main.hs
index 542e521f6..0f8567517 100644
--- a/src/Main.hs
+++ b/src/Main.hs
@@ -45,6 +45,7 @@ import Text.Pandoc.Writers.DefaultHeaders ( defaultHtmlHeader,
defaultRTFHeader, defaultS5Header, defaultLaTeXHeader )
import Text.Pandoc.Definition
import Text.Pandoc.Shared
+import Text.Regex ( mkRegex, splitRegex )
import System ( exitWith, getArgs, getProgName )
import System.Exit
import System.Console.GetOpt
@@ -57,6 +58,9 @@ import Control.Monad ( (>>=) )
version :: String
version = "0.3"
+copyrightMessage :: String
+copyrightMessage = "\nCopyright (C) 2006 John MacFarlane\nWeb: http://sophos.berkeley.edu/macfarlane/pandoc\nThis is free software; see the source for copying conditions. There is no\nwarranty, not even for merchantability or fitness for a particular purpose."
+
-- | Association list of formats and readers.
readers :: [(String, ParserState -> String -> Pandoc)]
readers = [("native" , readPandoc)
@@ -101,10 +105,13 @@ data Opt = Opt
, optCustomHeader :: String -- ^ Custom header to use, or "DEFAULT"
, optDefaultHeader :: String -- ^ Default header
, optTitlePrefix :: String -- ^ Optional prefix for HTML title
+ , optOutputFile :: String -- ^ Name of output file
, optNumberSections :: Bool -- ^ If @True@, number sections in LaTeX
, optIncremental :: Bool -- ^ If @True@, incremental lists in S5
, optSmart :: Bool -- ^ If @True@, use smart typography
, optASCIIMathML :: Bool -- ^ If @True@, use ASCIIMathML in HTML
+ , optShowUsage :: Bool -- ^ If @True@, show usage message
+ , optDebug :: Bool -- ^ If @True@, output debug messages
}
-- | Defaults for command-line options.
@@ -123,32 +130,20 @@ startOpt = Opt
, optCustomHeader = "DEFAULT"
, optDefaultHeader = defaultHtmlHeader
, optTitlePrefix = ""
+ , optOutputFile = "" -- null for stdout
, optNumberSections = False
, optIncremental = False
, optSmart = False
, optASCIIMathML = False
+ , optShowUsage = False
+ , optDebug = False
}
-- | A list of functions, each transforming the options data structure in response
-- to a command-line option.
-options :: [OptDescr (Opt -> IO Opt)]
-options =
- [ Option "v" ["version"]
- (NoArg
- (\_ -> do
- hPutStrLn stderr ("Version " ++ version)
- exitWith ExitSuccess))
- "Print version"
-
- , Option "h" ["help"]
- (NoArg
- (\_ -> do
- prg <- getProgName
- hPutStrLn stderr (usageInfo (prg ++ " [OPTIONS] [FILES] - convert FILES from one markup format to another\nIf no OPTIONS specified, converts from markdown to html.\nIf no FILES specified, input is read from STDIN.\nOptions:") options)
- exitWith ExitSuccess))
- "Show help"
-
- , Option "fr" ["from","read"]
+allOptions :: [OptDescr (Opt -> IO Opt)]
+allOptions =
+ [ Option "fr" ["from","read"]
(ReqArg
(\arg opt -> case (lookup (map toLower arg) readers) of
Just reader -> return opt { optReader = reader }
@@ -172,6 +167,13 @@ options =
(\opt -> return opt { optStandalone = True }))
"Include needed header and footer on output"
+ , Option "o" ["output"]
+ (ReqArg
+ (\arg opt -> do
+ return opt { optOutputFile = arg })
+ "FILENAME")
+ "Name of output file"
+
, Option "p" ["preserve-tabs"]
(NoArg
(\opt -> return opt { optPreserveTabs = True }))
@@ -241,7 +243,7 @@ options =
"FILENAME")
"File to include after document body"
- , Option "" ["custom-header"]
+ , Option "C" ["custom-header"]
(ReqArg
(\arg opt -> do
text <- readFile arg
@@ -263,18 +265,87 @@ options =
let header = case (lookup arg writers) of
Just (writer, head) -> head
Nothing -> error ("Unknown reader: " ++ arg)
- hPutStrLn stdout header
+ hPutStr stdout header
exitWith ExitSuccess)
"FORMAT")
"Print default header for FORMAT"
+
+ , Option "d" ["debug"]
+ (NoArg
+ (\opt -> return opt { optDebug = True }))
+ "Print debug messages to stderr, output to stdout"
+
+ , Option "v" ["version"]
+ (NoArg
+ (\_ -> do
+ prg <- getProgName
+ hPutStrLn stderr (prg ++ " " ++ version ++
+ copyrightMessage)
+ exitWith $ ExitFailure 2))
+ "Print version"
+
+ , Option "h" ["help"]
+ (NoArg
+ (\opt -> return opt { optShowUsage = True }))
+ "Show help"
]
+
+-- parse name of calling program and return default reader and writer descriptions
+parseProgName name =
+ case (splitRegex (mkRegex "2") (map toLower name)) of
+ [from, to] -> (from, to)
+ _ -> ("markdown", "html")
+
+-- set default options based on reader and writer descriptions; start is starting options
+setDefaultOpts from to start =
+ case ((lookup from readers), (lookup to writers)) of
+ (Just reader, Just (writer, header)) -> start {optReader = reader,
+ optWriter = writer,
+ optDefaultHeader = header}
+ _ -> start
+
+-- True if single-letter option is in option list
+inOptList :: [Char] -> OptDescr (Opt -> IO Opt) -> Bool
+inOptList list desc =
+ let (Option letters _ _ _) = desc in
+ any (\x -> x `elem` list) letters
+
+-- Reformat usage message so it doesn't wrap illegibly
+reformatUsageInfo = gsub " *--" " --" .
+ gsub "(-[A-Za-z0-9]) *--" "\\1, --" .
+ gsub " *([^- ])" "\n\t\\1"
+
main = do
+ name <- getProgName
+ let (from, to) = parseProgName name
+
+ let irrelevantOptions = if not ('2' `elem` name)
+ then ""
+ else "frtwD" ++
+ (if (to /= "html" && to /= "s5") then "SmcT" else "") ++
+ (if (to /= "latex") then "N" else "") ++
+ (if (to /= "s5") then "i" else "") ++
+ (if (from /= "html" && from /= "latex") then "R" else "")
+
+ let options = filter (not . inOptList irrelevantOptions) allOptions
+
+ let defaultOpts = setDefaultOpts from to startOpt
+
args <- getArgs
- let (actions, sources, errors) = getOpt RequireOrder options args
+ let (actions, sources, errors) = getOpt Permute options args
+
+ if (not (null errors))
+ then do
+ mapM (\e -> hPutStrLn stderr e) errors
+ hPutStrLn stderr (reformatUsageInfo $
+ usageInfo (name ++ " [OPTIONS] [FILES]") options)
+ exitWith $ ExitFailure 2
+ else
+ return ()
-- thread option data structure through all supplied option actions
- opts <- foldl (>>=) (return startOpt) actions
+ opts <- foldl (>>=) (return defaultOpts) actions
let Opt { optPreserveTabs = preserveTabs
, optTabStop = tabStop
@@ -289,12 +360,31 @@ main = do
, optCustomHeader = customHeader
, optDefaultHeader = defaultHeader
, optTitlePrefix = titlePrefix
+ , optOutputFile = outputFile
, optNumberSections = numberSections
, optIncremental = incremental
, optSmart = smart
, optASCIIMathML = asciiMathML
+ , optShowUsage = showUsage
+ , optDebug = debug
} = opts
+ if showUsage
+ then do
+ hPutStr stderr (reformatUsageInfo $ usageInfo (name ++ " [OPTIONS] [FILES]") options)
+ exitWith $ ExitFailure 2
+ else return ()
+
+ output <- if ((null outputFile) || debug)
+ then return stdout
+ else openFile outputFile WriteMode
+
+ if debug
+ then do
+ hPutStrLn stderr ("OUTPUT=" ++ outputFile)
+ hPutStr stderr $ concatMap (\s -> "INPUT=" ++ s ++ "\n") sources
+ else return ()
+
let writingS5 = (defaultHeader == defaultS5Header)
let tabFilter = if preserveTabs then id else (tabsToSpaces tabStop)
let addBlank str = str ++ "\n\n"
@@ -323,13 +413,13 @@ main = do
writerIncludeBefore = includeBefore,
writerIncludeAfter = includeAfter }
- (readSources sources) >>= (putStr . encodeUTF8 . (writer writerOptions) .
+ (readSources sources) >>= (hPutStr output . encodeUTF8 .
+ (writer writerOptions) .
(reader startParserState) . filter .
- decodeUTF8 . (joinWithSep "\n"))
+ decodeUTF8 . (joinWithSep "\n")) >> hClose output
where
readSources [] = mapM readSource ["-"]
readSources sources = mapM readSource sources
- readSource "-" = getContents
+ readSource "-" = getContents
readSource source = readFile source
-
diff --git a/src/wrappers/checkin.sh b/src/wrappers/checkin.sh
deleted file mode 100644
index c9c564a23..000000000
--- a/src/wrappers/checkin.sh
+++ /dev/null
@@ -1,7 +0,0 @@
-# Check if input files exist.
-for f; do
- if [ -n "$f" ] && ! [ -f "$f" ]; then
- err "File '$f' not found."
- exit 1
- fi
-done
diff --git a/src/wrappers/common.sh b/src/wrappers/common.sh
index 99a83be50..3481affff 100644
--- a/src/wrappers/common.sh
+++ b/src/wrappers/common.sh
@@ -8,22 +8,6 @@ WRAPPEE_ARGS=
err () { echo "$*" | fold -s -w ${COLUMNS:-110} >&2; }
errn () { printf "$*" | fold -s -w ${COLUMNS:-110} >&2; }
-usage () {
- synopsis="$@"
- err "Usage: $THIS $synopsis"
- err "See $THIS(1) man file for details."
-}
-
-runpandoc () {
- if [ -n "$WRAPPEE_ARGS" ]; then
- # Unpack arguments that will be passed to pandoc.
- oldifs="$IFS"; IFS="$NEWLINE"; set -- $WRAPPEE_ARGS "$@"; IFS="$oldifs"
- case "$1" in --) shift;; esac # tolerate the existence of a leading '--'
- fi
-
- pandoc "$@"
-}
-
# Portable which(1).
pathfind () {
oldifs="$IFS"; IFS=':'
@@ -37,17 +21,6 @@ pathfind () {
return 1
}
-HAVE_ICONV=
-if pathfind iconv; then
- HAVE_ICONV=1
- alias to_utf8='iconv -t utf-8'
- alias from_utf8='iconv -f utf-8'
-else
- err "Warning: iconv not present. Assuming UTF-8 character encoding."
- alias to_utf8='cat'
- alias from_utf8='cat'
-fi
-
for p in pandoc $REQUIRED; do
pathfind $p || {
err "You need '$p' to use this program!"
diff --git a/src/wrappers/getopts.sh b/src/wrappers/getopts.sh
deleted file mode 100644
index 263263c07..000000000
--- a/src/wrappers/getopts.sh
+++ /dev/null
@@ -1,12 +0,0 @@
-if [ -z "$SYNOPSIS" ]; then
- SYNOPSIS="[-h] [input_file]"
- [ -n "$THIS_NARG" ] || SYNOPSIS="${SYNOPSIS}..."
-fi
-
-while getopts h opt; do
- case $opt in
- h|?) usage "$SYNOPSIS"; exit 2 ;;
- esac
-done
-
-shift $(($OPTIND - 1))
diff --git a/src/wrappers/html2markdown.in b/src/wrappers/html2markdown.in
deleted file mode 100644
index 0fece3ccd..000000000
--- a/src/wrappers/html2markdown.in
+++ /dev/null
@@ -1,134 +0,0 @@
-#!/bin/sh -e
-# converts html to markdown
-# uses an available program to fetch URL and tidy to normalize it first
-
-REQUIRED=tidy
-
-### common.sh
-
-grab_url_with () {
- url="${1:?internal error: grab_url_with: url required}"
-
- shift
- cmdline="$@"
-
- prog=
- prog_opts=
- if [ -n "$cmdline" ]; then
- eval "set -- $cmdline"
- prog=$1
- shift
- prog_opts="$@"
- fi
-
- if [ -z "$prog" ]; then
- # Locate a sensible web grabber (note the order).
- for p in wget lynx w3m curl links w3c; do
- if pathfind $p; then
- prog=$p
- break
- fi
- done
-
- [ -n "$prog" ] || {
- errn "$THIS: Couldn't find a program to fetch the file from URL "
- err "(e.g. wget, w3m, lynx, w3c, or curl)."
- return 1
- }
- else
- pathfind "$prog" || {
- err "$THIS: No such web grabber '$prog' found; aborting."
- return 1
- }
- fi
-
- # Setup proper base options for known grabbers.
- base_opts=
- case "$prog" in
- wget) base_opts="-O-" ;;
- lynx) base_opts="-source" ;;
- w3m) base_opts="-dump_source" ;;
- curl) base_opts="" ;;
- links) base_opts="-source" ;;
- w3c) base_opts="-n -get" ;;
- *) err "$THIS: unhandled web grabber '$prog'; hope it succeeds."
- esac
-
- err "$THIS: invoking '$prog $base_opts $prog_opts $url'..."
- eval "set -- $base_opts $prog_opts"
- $prog "$@" "$url"
-}
-
-encoding=
-grabber=
-nograb=
-while getopts e:g:nh opt; do
- case $opt in
- e) encoding="$OPTARG" ;;
- g) grabber="$OPTARG" ;;
- n) nograb=1 ;;
- h|?)
- usage "[-e encoding] [-g grabber_command] [-n] [-h] [input_file|url]"
- exit 2 ;;
- esac
-done
-
-shift $(($OPTIND - 1))
-
-### postopts.sh
-
-### singlearg.sh
-
-inurl=
-if [ -n "$1" ] && ! [ -f "$1" ]; then
- if [ -n "$nograb" ]; then
- err "'$1' not found; refusing to treat input as URL."
- exit 1
- fi
- # Treat given argument as an URL.
- inurl="$1"
-fi
-
-if [ -n "$inurl" ]; then
- err "Attempting to fetch file from '$inurl'..."
-
- ### tempdir.sh
-
- grabber_out=$THIS_TEMPDIR/grabber.out
- grabber_log=$THIS_TEMPDIR/grabber.log
- if ! grab_url_with "$inurl" "$grabber" 1>$grabber_out \
- 2>$grabber_log; then
- errn "grab_url_with failed"
- if [ -f $grabber_log ]; then
- err " with the following error log."
- err
- cat >&2 $grabber_log
- else
- err .
- fi
- exit 1
- fi
-
- set -- $grabber_out
-fi
-
-if [ -z "$encoding" ] && [ "x$@" != "x" ]; then
- # Try to determine character encoding unless not specified
- # and input is STDIN.
- encoding=$(
- head "$@" |
- LC_ALL=C tr 'A-Z' 'a-z' |
- sed -ne '/<meta .*content-type.*charset=/ {
- s/.*charset=["'\'']*\([-a-zA-Z0-9]*\).*["'\'']*/\1/p
- }'
- )
-fi
-
-if [ -n "$encoding" ] && [ -n "$HAVE_ICONV" ]; then
- alias to_utf8='iconv -f "$encoding" -t utf-8'
-elif [ -n "$inurl" ]; then # assume web pages are UTF-8
- alias to_utf8='cat'
-fi # else just use local encoding
-
-to_utf8 "$@" | tidy -utf8 2>/dev/null |
-runpandoc -r html -w markdown -s | from_utf8
diff --git a/src/wrappers/latex2markdown.in b/src/wrappers/latex2markdown.in
deleted file mode 100644
index e8cde8a97..000000000
--- a/src/wrappers/latex2markdown.in
+++ /dev/null
@@ -1,14 +0,0 @@
-#!/bin/sh -e
-# runs pandoc to convert latex to markdown
-
-### common.sh
-
-### getopts.sh
-
-### postopts.sh
-
-### singlearg.sh
-
-### checkin.sh
-
-to_utf8 "$@" | runpandoc -r latex -w markdown -s | from_utf8
diff --git a/src/wrappers/markdown2html.in b/src/wrappers/markdown2html.in
deleted file mode 100644
index e255398d2..000000000
--- a/src/wrappers/markdown2html.in
+++ /dev/null
@@ -1,12 +0,0 @@
-#!/bin/sh -e
-# converts markdown to HTML
-
-### common.sh
-
-### getopts.sh
-
-### postopts.sh
-
-### checkin.sh
-
-to_utf8 "$@" | runpandoc | from_utf8
diff --git a/src/wrappers/markdown2latex.in b/src/wrappers/markdown2latex.in
deleted file mode 100644
index c532b2f99..000000000
--- a/src/wrappers/markdown2latex.in
+++ /dev/null
@@ -1,12 +0,0 @@
-#!/bin/sh -e
-# converts markdown to latex
-
-### common.sh
-
-### getopts.sh
-
-### postopts.sh
-
-### checkin.sh
-
-to_utf8 "$@" | runpandoc -w latex -s | from_utf8
diff --git a/src/wrappers/markdown2pdf.in b/src/wrappers/markdown2pdf.in
index 838767224..c222c1cbd 100644
--- a/src/wrappers/markdown2pdf.in
+++ b/src/wrappers/markdown2pdf.in
@@ -1,64 +1,54 @@
#!/bin/sh -e
-# converts markdown to latex, then uses latex to make a PDF
-REQUIRED=pdflatex
+REQUIRED="markdown2latex pdflatex"
### common.sh
-outfile=
-while getopts o:h opt; do
- case $opt in
- o) outfile="$OPTARG" ;;
- h|?) usage "[-o output_file] [-h] [input_file]..."; exit 2 ;;
- esac
-done
-
-shift $(($OPTIND - 1))
-
-### postopts.sh
+### tempdir.sh
-### checkin.sh
+texname=output
+logfile=$THIS_TEMPDIR/log
-if [ -z "$outfile" ]; then
- if [ -n "$1" ]; then
- outfile="${1%.*}"
- else
- outfile="stdin" # input is STDIN, since no argument given
- fi
+if ! markdown2latex -s -d "$@" >$THIS_TEMPDIR/$texname.tex 2>$logfile; then
+ [ -f $logfile ] && sed -e 's/markdown2latex/markdown2pdf/g' \
+ -e '/^INPUT=/d' -e '/^OUTPUT=/d' $logfile >&2
+ exit 1
fi
-case "$outfile" in
-*.*) ;; # skip appending extension if one is already present
-*) outfile="${outfile%.*}.pdf";;
-esac
-### tempdir.sh
-
-# We should use a filename without white spaces for pdflatex.
-TEXNAME=$THIS
+outfile="$(sed -ne 's/^OUTPUT=//p' $logfile)"
+IFS="$NEWLINE"
+set -- $(sed -ne 's/^INPUT=//p' $logfile)
+firstinfilebase="${1%.*}"
+defaultdest="${firstinfilebase:-stdin}.pdf"
+destname="${outfile:-$defaultdest}"
-to_utf8 "$@" | runpandoc -w latex -s >$THIS_TEMPDIR/$TEXNAME.tex
(
cd $THIS_TEMPDIR
- if ! pdflatex -interaction=batchmode $TEXNAME.tex >/dev/null 2>&1; then
+ if ! pdflatex -interaction=batchmode $texname.tex >/dev/null 2>&1; then
err "LaTeX errors:"
- from_utf8 $TEXNAME.log | sed -ne '/^!/,/^ *$/p' >&2
- if grep -q "File \`ucs.sty' not found" $TEXNAME.log; then
- err "Please install the 'unicode' package from ctan.org."
+ sed -ne '/^!/,/^ *$/p' $texname.log >&2
+ if grep -q "File \`ucs.sty' not found" $texname.log; then
+ err "Please install the 'unicode' package from CTAN:"
+ err "http://www.ctan.org/tex-archive/macros/latex/contrib/unicode/"
+ fi
+ if grep -q "File \`fancyvrb.sty' not found" $texname.log; then
+ err "Please install the 'fancyvrb' package from CTAN:"
+ err "http://www.ctan.org/tex-archive/macros/latex/contrib/fancyvrb/"
fi
exit 1
fi
-)
+) || exit $?
is_target_exists=
-if [ -f "$outfile" ]; then
+if [ -f "$destname" ]; then
is_target_exists=1
- mv -f "$outfile" "$outfile~"
+ mv "$destname" "$destname~"
fi
-mv -f $THIS_TEMPDIR/$TEXNAME.pdf "$outfile"
+mv -f $THIS_TEMPDIR/$texname.pdf "$destname"
-errn "Created '$outfile'"
+errn "Created $destname"
[ -z "$is_target_exists" ] || {
- errn " (previous file has been backed up as '$outfile~')"
+ errn " (previous file has been backed up as $destname~)"
}
err .
diff --git a/src/wrappers/postopts.sh b/src/wrappers/postopts.sh
deleted file mode 100644
index e0d015f41..000000000
--- a/src/wrappers/postopts.sh
+++ /dev/null
@@ -1,17 +0,0 @@
-# Parse wrapper and wrappee (pandoc) arguments by taking
-# into account that they may have space or tab characters.
-pick="WRAPPER_ARGS"
-while [ $# -gt 0 ]; do
- if [ "$pick" = "WRAPPER_ARGS" ]; then
- case "$1" in
- -*) pick="WRAPPEE_ARGS" ;;
- esac
- fi
- # Pack args with NEWLINE to preserve spaces,
- # and put them into the picked variable.
- eval "$pick=\"\$${pick}${NEWLINE}${1}\""
- shift
-done
-
-# Unpack filename arguments. Now "$@" will hold the filenames.
-oldifs="$IFS"; IFS="$NEWLINE"; set -- $WRAPPER_ARGS; IFS="$oldifs"
diff --git a/src/wrappers/singlearg.sh b/src/wrappers/singlearg.sh
deleted file mode 100644
index f742d1383..000000000
--- a/src/wrappers/singlearg.sh
+++ /dev/null
@@ -1,7 +0,0 @@
-# Ensure to work with a single argument.
-if [ $# -gt 1 ]; then
- first_arg="$1"
- shift
- err "Warning: extra arguments '$@' will be ignored."
- set -- $first_arg
-fi
diff --git a/src/wrappers/testwrapper.in b/src/wrappers/testwrapper.in
deleted file mode 100644
index e025c87e7..000000000
--- a/src/wrappers/testwrapper.in
+++ /dev/null
@@ -1,141 +0,0 @@
-#!/bin/sh
-
-THIS=$1
-
-ASH="ash -s"
-BASH="bash --posix -s"
-DASH="dash -s"
-KSH="ksh -s"
-POSH="posh -s"
-ZSH="zsh -s"
-
-ERROR=""
-
-wrapper () {
- $SH -- "$@" <<-'EOF'
-### common.sh
-
-outfile=
-while getopts o: opt; do
- case $opt in
- o) outfile="$OPTARG" ;;
- esac
-done
-
-shift $(($OPTIND - 1))
-
-### postopts.sh
-
-echo "Options passed to wrapper:"
-[ -z "$outfile" ] || echo "|$outfile|"
-
-echo "Arguments passed to wrapper:"
-for arg; do
- echo "|$arg|"
-done
-
-pandoc () {
- echo "Arguments passed to wrappee:"
- for arg; do
- echo "|$arg|"
- done
-}
-runpandoc
-EOF
-}
-
-# Portable which(1).
-pathfind () {
- oldifs="$IFS"; IFS=':'
- for _p in $PATH; do
- if [ -x "$_p/$*" ] && [ -f "$_p/$*" ]; then
- IFS="$oldifs"
- return 0
- fi
- done
- IFS="$oldifs"
- return 1
-}
-
-check_results () {
- if [ "$1" = "$2" ]; then
- echo >&2 ok
- return 0
- else
- echo >&2 failed
- sed "s/^/\t/" >&2 <<EOF
-Command line: '$3'
-===> Expected:
-$2
-<=== Got:
-$1
-EOF
- return 1
- fi
-}
-
-for SH in "$BASH" "$DASH" "$KSH" "$ZSH"; do
- CMD=${SH%% *}
- echo >&2 " Testing with $CMD..."
- if pathfind "$CMD"; then
- if [ "$CMD" = "zsh" ]; then
- # Zsh needs to be called as 'sh' to enable POSIX mode.
- ln -s $(which zsh) ./sh
- SH="./sh ${SH#* }"
- trap 'err=$?; rm -f ./sh; exit $err' 0 1 2 3 13 15
- fi
-
- set -e
-
- # Test 1
- printf >&2 " test case 1... "
- actual=$(wrapper -o "output file" "foo bar" -A "quux baz" -B)
- expected=$(cat <<'EOF'
-Options passed to wrapper:
-|output file|
-Arguments passed to wrapper:
-|foo bar|
-Arguments passed to wrappee:
-|-A|
-|quux baz|
-|-B|
-EOF
-)
- check_results "$actual" "$expected" \
- 'wrapper -o "output file" "foo bar" -A "quux baz" -B'
-
- # Test 2
- printf >&2 " test case 2... "
- actual=$(wrapper -- -A "foo bar")
- expected=$(cat <<'EOF'
-Options passed to wrapper:
-Arguments passed to wrapper:
-Arguments passed to wrappee:
-|-A|
-|foo bar|
-EOF
-)
- check_results "$actual" "$expected" 'wrapper -- -A "foo bar"'
-
- # Test 3 (Test 1 with a redundant '--')
- printf >&2 " test case 3... "
- actual=$(wrapper -o "output file" "foo bar" -- -A "quux baz" -B)
- expected=$(cat <<'EOF'
-Options passed to wrapper:
-|output file|
-Arguments passed to wrapper:
-|foo bar|
-Arguments passed to wrappee:
-|-A|
-|quux baz|
-|-B|
-EOF
-)
- check_results "$actual" "$expected" \
- 'wrapper -o "output file" "foo bar" -- -A "quux baz" -B'
- else
- echo >&2 "Warning: cannot verify correctness with $CMD; shell not available"
- fi
-done
-
-exit 0
diff --git a/src/wrappers/web2markdown.in b/src/wrappers/web2markdown.in
new file mode 100644
index 000000000..64ff3db9b
--- /dev/null
+++ b/src/wrappers/web2markdown.in
@@ -0,0 +1,173 @@
+#!/bin/sh -e
+# converts HTML from a URL, file, or stdin to markdown
+# uses an available program to fetch URL and tidy to normalize it first
+
+REQUIRED="tidy html2markdown"
+
+### common.sh
+
+grab_url_with () {
+ url="${1:?internal error: grab_url_with: url required}"
+
+ shift
+ cmdline="$@"
+
+ prog=
+ prog_opts=
+ if [ -n "$cmdline" ]; then
+ eval "set -- $cmdline"
+ prog=$1
+ shift
+ prog_opts="$@"
+ fi
+
+ if [ -z "$prog" ]; then
+ # Locate a sensible web grabber (note the order).
+ for p in wget lynx w3m curl links w3c; do
+ if pathfind $p; then
+ prog=$p
+ break
+ fi
+ done
+
+ [ -n "$prog" ] || {
+ errn "$THIS: Couldn't find a program to fetch the file from URL "
+ err "(e.g. wget, w3m, lynx, w3c, or curl)."
+ return 1
+ }
+ else
+ pathfind "$prog" || {
+ err "$THIS: No such web grabber '$prog' found; aborting."
+ return 1
+ }
+ fi
+
+ # Setup proper base options for known grabbers.
+ base_opts=
+ case "$prog" in
+ wget) base_opts="-O-" ;;
+ lynx) base_opts="-source" ;;
+ w3m) base_opts="-dump_source" ;;
+ curl) base_opts="" ;;
+ links) base_opts="-source" ;;
+ w3c) base_opts="-n -get" ;;
+ *) err "$THIS: unhandled web grabber '$prog'; hope it succeeds."
+ esac
+
+ err "$THIS: invoking '$prog $base_opts $prog_opts $url'..."
+ eval "set -- $base_opts $prog_opts"
+ $prog "$@" "$url"
+}
+
+add_option () {
+ options="$options$NEWLINE$1"
+}
+
+options=
+argument=
+encoding=
+grabber=
+
+# Parse command-line arguments
+while [ $# -gt 0 ]; do
+ case "$1" in
+ -h|--help)
+ html2markdown -h 2>&1 | sed -e 's/html2markdown/web2markdown/' 1>&2
+ err " -e ENCODING, --encoding=ENCODING"
+ err " Specify character encoding of input"
+ err " -g COMMAND, --grabber=COMMAND"
+ err " Specify command to be used to grab contents of URL"
+ exit 0 ;;
+ -v|--version)
+ html2markdown -v
+ exit 0 ;;
+ -e)
+ shift
+ encoding=$1 ;;
+ --encoding=*)
+ wholeopt=$1
+ # extract encoding from after =
+ encoding=${wholeopt#*=} ;;
+ -g)
+ shift
+ grabber=$1 ;;
+ --grabber=*)
+ wholeopt=$1
+ # extract encoding from after =
+ grabber=${wholeopt#*=} ;;
+ -o|--output|-b|--tab-stop|-H|--include-in-header| \
+ -A|--include-after-body|-C|-B|--include-before-body| \
+ -C|--custom-header|-T|--title-prefix)
+ add_option $1
+ shift
+ add_option $1 ;;
+ -*) add_option $1 ;;
+ *)
+ if [ -z "$argument" ]; then
+ argument=$1
+ else
+ err "Warning: extra argument '$1' will be ignored."
+ fi ;;
+ esac
+ shift
+done
+
+# Unpack options. Now "$@" will hold the html2markdown options.
+oldifs="$IFS"; IFS="$NEWLINE"; set -- $options; IFS="$oldifs"
+
+inurl=
+if [ -n "$argument" ] && ! [ -f "$argument" ]; then
+ # Treat given argument as an URL.
+ inurl="$argument"
+fi
+
+if [ -n "$inurl" ]; then
+ err "Attempting to fetch file from '$inurl'..."
+
+ ### tempdir.sh
+
+ grabber_out=$THIS_TEMPDIR/grabber.out
+ grabber_log=$THIS_TEMPDIR/grabber.log
+ if ! grab_url_with "$inurl" "$grabber" 1>$grabber_out 2>$grabber_log; then
+ errn "grab_url_with failed"
+ if [ -f $grabber_log ]; then
+ err " with the following error log."
+ err
+ cat >&2 $grabber_log
+ else
+ err .
+ fi
+ exit 1
+ fi
+
+ argument="$grabber_out"
+fi
+
+if [ -z "$encoding" ] && [ "x$argument" != "x" ]; then
+ # Try to determine character encoding if not specified
+ # and input is not STDIN.
+ encoding=$(
+ head "$argument" |
+ LC_ALL=C tr 'A-Z' 'a-z' |
+ sed -ne '/<meta .*content-type.*charset=/ {
+ s/.*charset=["'\'']*\([-a-zA-Z0-9]*\).*["'\'']*/\1/p
+ }'
+ )
+fi
+
+if [ -n "$encoding" ] && pathfind iconv; then
+ alias to_utf8='iconv -f "$encoding" -t utf-8'
+else # assume UTF-8
+ alias to_utf8='cat'
+fi
+
+if [ -z "$argument" ]; then
+ tidy -utf8 2>/dev/null | html2markdown "$@"
+else
+ if [ -f "$argument" ]; then
+ to_utf8 "$argument" | tidy -utf8 2>/dev/null | html2markdown "$@"
+ else
+ err "File '$argument' not found."
+ exit 1
+ fi
+fi